diff options
author | Linus Torvalds | 2018-10-26 14:36:21 -0700 |
---|---|---|
committer | Linus Torvalds | 2018-10-26 14:36:21 -0700 |
commit | 685f7e4f161425b137056abe35ba8ef7b669d83d (patch) | |
tree | 550dd1f5dc9e852cfeec26bf5e3ce9dd060c8a33 /arch/powerpc/kernel | |
parent | c7a2c49ea6c9eebbe44ff2c08b663b2905ee2c13 (diff) | |
parent | 58cfbac25b1fd2b76f94566aed28a3662b0ff8c6 (diff) |
Merge tag 'powerpc-4.20-1' of git://git.kernel.org/pub/scm/linux/kernel/git/powerpc/linux
Pull powerpc updates from Michael Ellerman:
"Notable changes:
- A large series to rewrite our SLB miss handling, replacing a lot of
fairly complicated asm with much fewer lines of C.
- Following on from that, we now maintain a cache of SLB entries for
each process and preload them on context switch. Leading to a 27%
speedup for our context switch benchmark on Power9.
- Improvements to our handling of SLB multi-hit errors. We now print
more debug information when they occur, and try to continue running
by flushing the SLB and reloading, rather than treating them as
fatal.
- Enable THP migration on 64-bit Book3S machines (eg. Power7/8/9).
- Add support for physical memory up to 2PB in the linear mapping on
64-bit Book3S. We only support up to 512TB as regular system
memory, otherwise the percpu allocator runs out of vmalloc space.
- Add stack protector support for 32 and 64-bit, with a per-task
canary.
- Add support for PTRACE_SYSEMU and PTRACE_SYSEMU_SINGLESTEP.
- Support recognising "big cores" on Power9, where two SMT4 cores are
presented to us as a single SMT8 core.
- A large series to cleanup some of our ioremap handling and PTE
flags.
- Add a driver for the PAPR SCM (storage class memory) interface,
allowing guests to operate on SCM devices (acked by Dan).
- Changes to our ftrace code to handle very large kernels, where we
need to use a trampoline to get to ftrace_caller().
And many other smaller enhancements and cleanups.
Thanks to: Alan Modra, Alistair Popple, Aneesh Kumar K.V, Anton
Blanchard, Aravinda Prasad, Bartlomiej Zolnierkiewicz, Benjamin
Herrenschmidt, Breno Leitao, Cédric Le Goater, Christophe Leroy,
Christophe Lombard, Dan Carpenter, Daniel Axtens, Finn Thain, Gautham
R. Shenoy, Gustavo Romero, Haren Myneni, Hari Bathini, Jia Hongtao,
Joel Stanley, John Allen, Laurent Dufour, Madhavan Srinivasan, Mahesh
Salgaonkar, Mark Hairgrove, Masahiro Yamada, Michael Bringmann,
Michael Neuling, Michal Suchanek, Murilo Opsfelder Araujo, Nathan
Fontenot, Naveen N. Rao, Nicholas Piggin, Nick Desaulniers, Oliver
O'Halloran, Paul Mackerras, Petr Vorel, Rashmica Gupta, Reza Arbab,
Rob Herring, Sam Bobroff, Samuel Mendoza-Jonas, Scott Wood, Stan
Johnson, Stephen Rothwell, Stewart Smith, Suraj Jitindar Singh, Tyrel
Datwyler, Vaibhav Jain, Vasant Hegde, YueHaibing, zhong jiang"
* tag 'powerpc-4.20-1' of git://git.kernel.org/pub/scm/linux/kernel/git/powerpc/linux: (221 commits)
Revert "selftests/powerpc: Fix out-of-tree build errors"
powerpc/msi: Fix compile error on mpc83xx
powerpc: Fix stack protector crashes on CPU hotplug
powerpc/traps: restore recoverability of machine_check interrupts
powerpc/64/module: REL32 relocation range check
powerpc/64s/radix: Fix radix__flush_tlb_collapsed_pmd double flushing pmd
selftests/powerpc: Add a test of wild bctr
powerpc/mm: Fix page table dump to work on Radix
powerpc/mm/radix: Display if mappings are exec or not
powerpc/mm/radix: Simplify split mapping logic
powerpc/mm/radix: Remove the retry in the split mapping logic
powerpc/mm/radix: Fix small page at boundary when splitting
powerpc/mm/radix: Fix overuse of small pages in splitting logic
powerpc/mm/radix: Fix off-by-one in split mapping logic
powerpc/ftrace: Handle large kernel configs
powerpc/mm: Fix WARN_ON with THP NUMA migration
selftests/powerpc: Fix out-of-tree build errors
powerpc/time: no steal_time when CONFIG_PPC_SPLPAR is not selected
powerpc/time: Only set CONFIG_ARCH_HAS_SCALED_CPUTIME on PPC64
powerpc/time: isolate scaled cputime accounting in dedicated functions.
...
Diffstat (limited to 'arch/powerpc/kernel')
45 files changed, 1380 insertions, 825 deletions
diff --git a/arch/powerpc/kernel/Makefile b/arch/powerpc/kernel/Makefile index 3b66f2c19c84..53d4b8d5b54d 100644 --- a/arch/powerpc/kernel/Makefile +++ b/arch/powerpc/kernel/Makefile @@ -5,7 +5,8 @@ CFLAGS_ptrace.o += -DUTS_MACHINE='"$(UTS_MACHINE)"' -subdir-ccflags-$(CONFIG_PPC_WERROR) := -Werror +# Disable clang warning for using setjmp without setjmp.h header +CFLAGS_crash.o += $(call cc-disable-warning, builtin-requires-header) ifdef CONFIG_PPC64 CFLAGS_prom_init.o += $(NO_MINIMAL_TOC) @@ -20,12 +21,14 @@ CFLAGS_prom_init.o += $(DISABLE_LATENT_ENTROPY_PLUGIN) CFLAGS_btext.o += $(DISABLE_LATENT_ENTROPY_PLUGIN) CFLAGS_prom.o += $(DISABLE_LATENT_ENTROPY_PLUGIN) +CFLAGS_prom_init.o += $(call cc-option, -fno-stack-protector) + ifdef CONFIG_FUNCTION_TRACER # Do not trace early boot code -CFLAGS_REMOVE_cputable.o = -mno-sched-epilog $(CC_FLAGS_FTRACE) -CFLAGS_REMOVE_prom_init.o = -mno-sched-epilog $(CC_FLAGS_FTRACE) -CFLAGS_REMOVE_btext.o = -mno-sched-epilog $(CC_FLAGS_FTRACE) -CFLAGS_REMOVE_prom.o = -mno-sched-epilog $(CC_FLAGS_FTRACE) +CFLAGS_REMOVE_cputable.o = $(CC_FLAGS_FTRACE) +CFLAGS_REMOVE_prom_init.o = $(CC_FLAGS_FTRACE) +CFLAGS_REMOVE_btext.o = $(CC_FLAGS_FTRACE) +CFLAGS_REMOVE_prom.o = $(CC_FLAGS_FTRACE) endif obj-y := cputable.o ptrace.o syscalls.o \ diff --git a/arch/powerpc/kernel/asm-offsets.c b/arch/powerpc/kernel/asm-offsets.c index d68b9ef38328..9ffc72ded73a 100644 --- a/arch/powerpc/kernel/asm-offsets.c +++ b/arch/powerpc/kernel/asm-offsets.c @@ -79,11 +79,16 @@ int main(void) { OFFSET(THREAD, task_struct, thread); OFFSET(MM, task_struct, mm); +#ifdef CONFIG_STACKPROTECTOR + OFFSET(TASK_CANARY, task_struct, stack_canary); +#ifdef CONFIG_PPC64 + OFFSET(PACA_CANARY, paca_struct, canary); +#endif +#endif OFFSET(MMCONTEXTID, mm_struct, context.id); #ifdef CONFIG_PPC64 DEFINE(SIGSEGV, SIGSEGV); DEFINE(NMI_MASK, NMI_MASK); - OFFSET(TASKTHREADPPR, task_struct, thread.ppr); #else OFFSET(THREAD_INFO, task_struct, stack); DEFINE(THREAD_INFO_GAP, _ALIGN_UP(sizeof(struct thread_info), 16)); @@ -173,7 +178,6 @@ int main(void) OFFSET(PACAKSAVE, paca_struct, kstack); OFFSET(PACACURRENT, paca_struct, __current); OFFSET(PACASAVEDMSR, paca_struct, saved_msr); - OFFSET(PACASTABRR, paca_struct, stab_rr); OFFSET(PACAR1, paca_struct, saved_r1); OFFSET(PACATOC, paca_struct, kernel_toc); OFFSET(PACAKBASE, paca_struct, kernelbase); @@ -212,6 +216,7 @@ int main(void) #ifdef CONFIG_PPC_BOOK3S_64 OFFSET(PACASLBCACHE, paca_struct, slb_cache); OFFSET(PACASLBCACHEPTR, paca_struct, slb_cache_ptr); + OFFSET(PACASTABRR, paca_struct, stab_rr); OFFSET(PACAVMALLOCSLLP, paca_struct, vmalloc_sllp); #ifdef CONFIG_PPC_MM_SLICES OFFSET(MMUPSIZESLLP, mmu_psize_def, sllp); @@ -274,11 +279,6 @@ int main(void) /* Interrupt register frame */ DEFINE(INT_FRAME_SIZE, STACK_INT_FRAME_SIZE); DEFINE(SWITCH_FRAME_SIZE, STACK_FRAME_OVERHEAD + sizeof(struct pt_regs)); -#ifdef CONFIG_PPC64 - /* Create extra stack space for SRR0 and SRR1 when calling prom/rtas. */ - DEFINE(PROM_FRAME_SIZE, STACK_FRAME_OVERHEAD + sizeof(struct pt_regs) + 16); - DEFINE(RTAS_FRAME_SIZE, STACK_FRAME_OVERHEAD + sizeof(struct pt_regs) + 16); -#endif /* CONFIG_PPC64 */ STACK_PT_REGS_OFFSET(GPR0, gpr[0]); STACK_PT_REGS_OFFSET(GPR1, gpr[1]); STACK_PT_REGS_OFFSET(GPR2, gpr[2]); @@ -322,10 +322,7 @@ int main(void) STACK_PT_REGS_OFFSET(_ESR, dsisr); #else /* CONFIG_PPC64 */ STACK_PT_REGS_OFFSET(SOFTE, softe); - - /* These _only_ to be used with {PROM,RTAS}_FRAME_SIZE!!! */ - DEFINE(_SRR0, STACK_FRAME_OVERHEAD+sizeof(struct pt_regs)); - DEFINE(_SRR1, STACK_FRAME_OVERHEAD+sizeof(struct pt_regs)+8); + STACK_PT_REGS_OFFSET(_PPR, ppr); #endif /* CONFIG_PPC64 */ #if defined(CONFIG_PPC32) diff --git a/arch/powerpc/kernel/btext.c b/arch/powerpc/kernel/btext.c index b2072d5bbf2b..b4241ed1456e 100644 --- a/arch/powerpc/kernel/btext.c +++ b/arch/powerpc/kernel/btext.c @@ -163,7 +163,7 @@ void btext_map(void) offset = ((unsigned long) dispDeviceBase) - base; size = dispDeviceRowBytes * dispDeviceRect[3] + offset + dispDeviceRect[0]; - vbase = __ioremap(base, size, pgprot_val(pgprot_noncached_wc(__pgprot(0)))); + vbase = ioremap_wc(base, size); if (!vbase) return; logicalDisplayBase = vbase + offset; diff --git a/arch/powerpc/kernel/cacheinfo.c b/arch/powerpc/kernel/cacheinfo.c index a8f20e5928e1..be57bd07596d 100644 --- a/arch/powerpc/kernel/cacheinfo.c +++ b/arch/powerpc/kernel/cacheinfo.c @@ -20,6 +20,8 @@ #include <linux/percpu.h> #include <linux/slab.h> #include <asm/prom.h> +#include <asm/cputhreads.h> +#include <asm/smp.h> #include "cacheinfo.h" @@ -627,17 +629,48 @@ static ssize_t level_show(struct kobject *k, struct kobj_attribute *attr, char * static struct kobj_attribute cache_level_attr = __ATTR(level, 0444, level_show, NULL); +static unsigned int index_dir_to_cpu(struct cache_index_dir *index) +{ + struct kobject *index_dir_kobj = &index->kobj; + struct kobject *cache_dir_kobj = index_dir_kobj->parent; + struct kobject *cpu_dev_kobj = cache_dir_kobj->parent; + struct device *dev = kobj_to_dev(cpu_dev_kobj); + + return dev->id; +} + +/* + * On big-core systems, each core has two groups of CPUs each of which + * has its own L1-cache. The thread-siblings which share l1-cache with + * @cpu can be obtained via cpu_smallcore_mask(). + */ +static const struct cpumask *get_big_core_shared_cpu_map(int cpu, struct cache *cache) +{ + if (cache->level == 1) + return cpu_smallcore_mask(cpu); + + return &cache->shared_cpu_map; +} + static ssize_t shared_cpu_map_show(struct kobject *k, struct kobj_attribute *attr, char *buf) { struct cache_index_dir *index; struct cache *cache; - int ret; + const struct cpumask *mask; + int ret, cpu; index = kobj_to_cache_index_dir(k); cache = index->cache; + if (has_big_cores) { + cpu = index_dir_to_cpu(index); + mask = get_big_core_shared_cpu_map(cpu, cache); + } else { + mask = &cache->shared_cpu_map; + } + ret = scnprintf(buf, PAGE_SIZE - 1, "%*pb\n", - cpumask_pr_args(&cache->shared_cpu_map)); + cpumask_pr_args(mask)); buf[ret++] = '\n'; buf[ret] = '\0'; return ret; diff --git a/arch/powerpc/kernel/crash_dump.c b/arch/powerpc/kernel/crash_dump.c index d10ad258d41a..bbdc4706c159 100644 --- a/arch/powerpc/kernel/crash_dump.c +++ b/arch/powerpc/kernel/crash_dump.c @@ -110,7 +110,7 @@ ssize_t copy_oldmem_page(unsigned long pfn, char *buf, vaddr = __va(paddr); csize = copy_oldmem_vaddr(vaddr, buf, csize, offset, userbuf); } else { - vaddr = __ioremap(paddr, PAGE_SIZE, 0); + vaddr = ioremap_cache(paddr, PAGE_SIZE); csize = copy_oldmem_vaddr(vaddr, buf, csize, offset, userbuf); iounmap(vaddr); } diff --git a/arch/powerpc/kernel/eeh.c b/arch/powerpc/kernel/eeh.c index 6ebba3e48b01..6cae6b56ffd6 100644 --- a/arch/powerpc/kernel/eeh.c +++ b/arch/powerpc/kernel/eeh.c @@ -169,6 +169,11 @@ static size_t eeh_dump_dev_log(struct eeh_dev *edev, char *buf, size_t len) int n = 0, l = 0; char buffer[128]; + if (!pdn) { + pr_warn("EEH: Note: No error log for absent device.\n"); + return 0; + } + n += scnprintf(buf+n, len-n, "%04x:%02x:%02x.%01x\n", pdn->phb->global_number, pdn->busno, PCI_SLOT(pdn->devfn), PCI_FUNC(pdn->devfn)); @@ -399,7 +404,7 @@ static int eeh_phb_check_failure(struct eeh_pe *pe) } /* Isolate the PHB and send event */ - eeh_pe_state_mark(phb_pe, EEH_PE_ISOLATED); + eeh_pe_mark_isolated(phb_pe); eeh_serialize_unlock(flags); pr_err("EEH: PHB#%x failure detected, location: %s\n", @@ -558,7 +563,7 @@ int eeh_dev_check_failure(struct eeh_dev *edev) * with other functions on this device, and functions under * bridges. */ - eeh_pe_state_mark(pe, EEH_PE_ISOLATED); + eeh_pe_mark_isolated(pe); eeh_serialize_unlock(flags); /* Most EEH events are due to device driver bugs. Having @@ -676,7 +681,7 @@ int eeh_pci_enable(struct eeh_pe *pe, int function) /* Check if the request is finished successfully */ if (active_flag) { - rc = eeh_ops->wait_state(pe, PCI_BUS_RESET_WAIT_MSEC); + rc = eeh_wait_state(pe, PCI_BUS_RESET_WAIT_MSEC); if (rc < 0) return rc; @@ -825,7 +830,8 @@ int pcibios_set_pcie_reset_state(struct pci_dev *dev, enum pcie_reset_state stat eeh_pe_state_clear(pe, EEH_PE_ISOLATED); break; case pcie_hot_reset: - eeh_pe_state_mark_with_cfg(pe, EEH_PE_ISOLATED); + eeh_pe_mark_isolated(pe); + eeh_pe_state_clear(pe, EEH_PE_CFG_BLOCKED); eeh_ops->set_option(pe, EEH_OPT_FREEZE_PE); eeh_pe_dev_traverse(pe, eeh_disable_and_save_dev_state, dev); if (!(pe->type & EEH_PE_VF)) @@ -833,7 +839,8 @@ int pcibios_set_pcie_reset_state(struct pci_dev *dev, enum pcie_reset_state stat eeh_ops->reset(pe, EEH_RESET_HOT); break; case pcie_warm_reset: - eeh_pe_state_mark_with_cfg(pe, EEH_PE_ISOLATED); + eeh_pe_mark_isolated(pe); + eeh_pe_state_clear(pe, EEH_PE_CFG_BLOCKED); eeh_ops->set_option(pe, EEH_OPT_FREEZE_PE); eeh_pe_dev_traverse(pe, eeh_disable_and_save_dev_state, dev); if (!(pe->type & EEH_PE_VF)) @@ -913,16 +920,15 @@ int eeh_pe_reset_full(struct eeh_pe *pe) break; /* Wait until the PE is in a functioning state */ - state = eeh_ops->wait_state(pe, PCI_BUS_RESET_WAIT_MSEC); - if (eeh_state_active(state)) - break; - + state = eeh_wait_state(pe, PCI_BUS_RESET_WAIT_MSEC); if (state < 0) { pr_warn("%s: Unrecoverable slot failure on PHB#%x-PE#%x", __func__, pe->phb->global_number, pe->addr); ret = -ENOTRECOVERABLE; break; } + if (eeh_state_active(state)) + break; /* Set error in case this is our last attempt */ ret = -EIO; @@ -1036,6 +1042,11 @@ void eeh_probe_devices(void) pdn = hose->pci_data; traverse_pci_dn(pdn, eeh_ops->probe, NULL); } + if (eeh_enabled()) + pr_info("EEH: PCI Enhanced I/O Error Handling Enabled\n"); + else + pr_info("EEH: No capable adapters found\n"); + } /** @@ -1079,18 +1090,7 @@ static int eeh_init(void) eeh_dev_phb_init_dynamic(hose); /* Initialize EEH event */ - ret = eeh_event_init(); - if (ret) - return ret; - - eeh_probe_devices(); - - if (eeh_enabled()) - pr_info("EEH: PCI Enhanced I/O Error Handling Enabled\n"); - else if (!eeh_has_flag(EEH_POSTPONED_PROBE)) - pr_info("EEH: No capable adapters found\n"); - - return ret; + return eeh_event_init(); } core_initcall_sync(eeh_init); diff --git a/arch/powerpc/kernel/eeh_dev.c b/arch/powerpc/kernel/eeh_dev.c index a34e6912c15e..d8c90f3284b5 100644 --- a/arch/powerpc/kernel/eeh_dev.c +++ b/arch/powerpc/kernel/eeh_dev.c @@ -60,8 +60,6 @@ struct eeh_dev *eeh_dev_init(struct pci_dn *pdn) /* Associate EEH device with OF node */ pdn->edev = edev; edev->pdn = pdn; - INIT_LIST_HEAD(&edev->list); - INIT_LIST_HEAD(&edev->rmv_list); return edev; } diff --git a/arch/powerpc/kernel/eeh_driver.c b/arch/powerpc/kernel/eeh_driver.c index 67619b4b3f96..9446248eb6b8 100644 --- a/arch/powerpc/kernel/eeh_driver.c +++ b/arch/powerpc/kernel/eeh_driver.c @@ -35,8 +35,8 @@ #include <asm/rtas.h> struct eeh_rmv_data { - struct list_head edev_list; - int removed; + struct list_head removed_vf_list; + int removed_dev_count; }; static int eeh_result_priority(enum pci_ers_result result) @@ -281,6 +281,10 @@ static void eeh_pe_report_edev(struct eeh_dev *edev, eeh_report_fn fn, struct pci_driver *driver; enum pci_ers_result new_result; + if (!edev->pdev) { + eeh_edev_info(edev, "no device"); + return; + } device_lock(&edev->pdev->dev); if (eeh_edev_actionable(edev)) { driver = eeh_pcid_get(edev->pdev); @@ -400,7 +404,7 @@ static void *eeh_dev_restore_state(struct eeh_dev *edev, void *userdata) * EEH device is created. */ if (edev->pe && (edev->pe->state & EEH_PE_CFG_RESTRICTED)) { - if (list_is_last(&edev->list, &edev->pe->edevs)) + if (list_is_last(&edev->entry, &edev->pe->edevs)) eeh_pe_restore_bars(edev->pe); return NULL; @@ -465,10 +469,9 @@ static enum pci_ers_result eeh_report_failure(struct eeh_dev *edev, return rc; } -static void *eeh_add_virt_device(void *data, void *userdata) +static void *eeh_add_virt_device(struct eeh_dev *edev) { struct pci_driver *driver; - struct eeh_dev *edev = (struct eeh_dev *)data; struct pci_dev *dev = eeh_dev_to_pci_dev(edev); struct pci_dn *pdn = eeh_dev_to_pdn(edev); @@ -499,7 +502,6 @@ static void *eeh_rmv_device(struct eeh_dev *edev, void *userdata) struct pci_driver *driver; struct pci_dev *dev = eeh_dev_to_pci_dev(edev); struct eeh_rmv_data *rmv_data = (struct eeh_rmv_data *)userdata; - int *removed = rmv_data ? &rmv_data->removed : NULL; /* * Actually, we should remove the PCI bridges as well. @@ -521,7 +523,7 @@ static void *eeh_rmv_device(struct eeh_dev *edev, void *userdata) if (eeh_dev_removed(edev)) return NULL; - if (removed) { + if (rmv_data) { if (eeh_pe_passed(edev->pe)) return NULL; driver = eeh_pcid_get(dev); @@ -539,10 +541,9 @@ static void *eeh_rmv_device(struct eeh_dev *edev, void *userdata) /* Remove it from PCI subsystem */ pr_debug("EEH: Removing %s without EEH sensitive driver\n", pci_name(dev)); - edev->bus = dev->bus; edev->mode |= EEH_DEV_DISCONNECTED; - if (removed) - (*removed)++; + if (rmv_data) + rmv_data->removed_dev_count++; if (edev->physfn) { #ifdef CONFIG_PCI_IOV @@ -558,7 +559,7 @@ static void *eeh_rmv_device(struct eeh_dev *edev, void *userdata) pdn->pe_number = IODA_INVALID_PE; #endif if (rmv_data) - list_add(&edev->rmv_list, &rmv_data->edev_list); + list_add(&edev->rmv_entry, &rmv_data->removed_vf_list); } else { pci_lock_rescan_remove(); pci_stop_and_remove_bus_device(dev); @@ -727,7 +728,7 @@ static int eeh_reset_device(struct eeh_pe *pe, struct pci_bus *bus, * the device up before the scripts have taken it down, * potentially weird things happen. */ - if (!driver_eeh_aware || rmv_data->removed) { + if (!driver_eeh_aware || rmv_data->removed_dev_count) { pr_info("EEH: Sleep 5s ahead of %s hotplug\n", (driver_eeh_aware ? "partial" : "complete")); ssleep(5); @@ -737,10 +738,10 @@ static int eeh_reset_device(struct eeh_pe *pe, struct pci_bus *bus, * PE. We should disconnect it so the binding can be * rebuilt when adding PCI devices. */ - edev = list_first_entry(&pe->edevs, struct eeh_dev, list); + edev = list_first_entry(&pe->edevs, struct eeh_dev, entry); eeh_pe_traverse(pe, eeh_pe_detach_dev, NULL); if (pe->type & EEH_PE_VF) { - eeh_add_virt_device(edev, NULL); + eeh_add_virt_device(edev); } else { if (!driver_eeh_aware) eeh_pe_state_clear(pe, EEH_PE_PRI_BUS); @@ -789,7 +790,8 @@ void eeh_handle_normal_event(struct eeh_pe *pe) struct eeh_pe *tmp_pe; int rc = 0; enum pci_ers_result result = PCI_ERS_RESULT_NONE; - struct eeh_rmv_data rmv_data = {LIST_HEAD_INIT(rmv_data.edev_list), 0}; + struct eeh_rmv_data rmv_data = + {LIST_HEAD_INIT(rmv_data.removed_vf_list), 0}; bus = eeh_pe_bus_get(pe); if (!bus) { @@ -806,10 +808,8 @@ void eeh_handle_normal_event(struct eeh_pe *pe) pr_err("EEH: PHB#%x-PE#%x has failed %d times in the last hour and has been permanently disabled.\n", pe->phb->global_number, pe->addr, pe->freeze_count); - goto hard_fail; + result = PCI_ERS_RESULT_DISCONNECT; } - pr_warn("EEH: This PCI device has failed %d times in the last hour and will be permanently disabled after %d failures.\n", - pe->freeze_count, eeh_max_freezes); /* Walk the various device drivers attached to this slot through * a reset sequence, giving each an opportunity to do what it needs @@ -821,31 +821,39 @@ void eeh_handle_normal_event(struct eeh_pe *pe) * the error. Override the result if necessary to have partially * hotplug for this case. */ - pr_info("EEH: Notify device drivers to shutdown\n"); - eeh_set_channel_state(pe, pci_channel_io_frozen); - eeh_set_irq_state(pe, false); - eeh_pe_report("error_detected(IO frozen)", pe, eeh_report_error, - &result); - if ((pe->type & EEH_PE_PHB) && - result != PCI_ERS_RESULT_NONE && - result != PCI_ERS_RESULT_NEED_RESET) - result = PCI_ERS_RESULT_NEED_RESET; + if (result != PCI_ERS_RESULT_DISCONNECT) { + pr_warn("EEH: This PCI device has failed %d times in the last hour and will be permanently disabled after %d failures.\n", + pe->freeze_count, eeh_max_freezes); + pr_info("EEH: Notify device drivers to shutdown\n"); + eeh_set_channel_state(pe, pci_channel_io_frozen); + eeh_set_irq_state(pe, false); + eeh_pe_report("error_detected(IO frozen)", pe, + eeh_report_error, &result); + if ((pe->type & EEH_PE_PHB) && + result != PCI_ERS_RESULT_NONE && + result != PCI_ERS_RESULT_NEED_RESET) + result = PCI_ERS_RESULT_NEED_RESET; + } /* Get the current PCI slot state. This can take a long time, * sometimes over 300 seconds for certain systems. */ - rc = eeh_ops->wait_state(pe, MAX_WAIT_FOR_RECOVERY*1000); - if (rc < 0 || rc == EEH_STATE_NOT_SUPPORT) { - pr_warn("EEH: Permanent failure\n"); - goto hard_fail; + if (result != PCI_ERS_RESULT_DISCONNECT) { + rc = eeh_wait_state(pe, MAX_WAIT_FOR_RECOVERY*1000); + if (rc < 0 || rc == EEH_STATE_NOT_SUPPORT) { + pr_warn("EEH: Permanent failure\n"); + result = PCI_ERS_RESULT_DISCONNECT; + } } /* Since rtas may enable MMIO when posting the error log, * don't post the error log until after all dev drivers * have been informed. */ - pr_info("EEH: Collect temporary log\n"); - eeh_slot_error_detail(pe, EEH_LOG_TEMP); + if (result != PCI_ERS_RESULT_DISCONNECT) { + pr_info("EEH: Collect temporary log\n"); + eeh_slot_error_detail(pe, EEH_LOG_TEMP); + } /* If all device drivers were EEH-unaware, then shut * down all of the device drivers, and hope they @@ -857,7 +865,7 @@ void eeh_handle_normal_event(struct eeh_pe *pe) if (rc) { pr_warn("%s: Unable to reset, err=%d\n", __func__, rc); - goto hard_fail; + result = PCI_ERS_RESULT_DISCONNECT; } } @@ -866,9 +874,9 @@ void eeh_handle_normal_event(struct eeh_pe *pe) pr_info("EEH: Enable I/O for affected devices\n"); rc = eeh_pci_enable(pe, EEH_OPT_THAW_MMIO); - if (rc < 0) - goto hard_fail; - if (rc) { + if (rc < 0) { + result = PCI_ERS_RESULT_DISCONNECT; + } else if (rc) { result = PCI_ERS_RESULT_NEED_RESET; } else { pr_info("EEH: Notify device drivers to resume I/O\n"); @@ -882,9 +890,9 @@ void eeh_handle_normal_event(struct eeh_pe *pe) pr_info("EEH: Enabled DMA for affected devices\n"); rc = eeh_pci_enable(pe, EEH_OPT_THAW_DMA); - if (rc < 0) - goto hard_fail; - if (rc) { + if (rc < 0) { + result = PCI_ERS_RESULT_DISCONNECT; + } else if (rc) { result = PCI_ERS_RESULT_NEED_RESET; } else { /* @@ -897,12 +905,6 @@ void eeh_handle_normal_event(struct eeh_pe *pe) } } - /* If any device has a hard failure, then shut off everything. */ - if (result == PCI_ERS_RESULT_DISCONNECT) { - pr_warn("EEH: Device driver gave up\n"); - goto hard_fail; - } - /* If any device called out for a reset, then reset the slot */ if (result == PCI_ERS_RESULT_NEED_RESET) { pr_info("EEH: Reset without hotplug activity\n"); @@ -910,88 +912,81 @@ void eeh_handle_normal_event(struct eeh_pe *pe) if (rc) { pr_warn("%s: Cannot reset, err=%d\n", __func__, rc); - goto hard_fail; + result = PCI_ERS_RESULT_DISCONNECT; + } else { + result = PCI_ERS_RESULT_NONE; + eeh_set_channel_state(pe, pci_channel_io_normal); + eeh_set_irq_state(pe, true); + eeh_pe_report("slot_reset", pe, eeh_report_reset, + &result); } - - pr_info("EEH: Notify device drivers " - "the completion of reset\n"); - result = PCI_ERS_RESULT_NONE; - eeh_set_channel_state(pe, pci_channel_io_normal); - eeh_set_irq_state(pe, true); - eeh_pe_report("slot_reset", pe, eeh_report_reset, &result); - } - - /* All devices should claim they have recovered by now. */ - if ((result != PCI_ERS_RESULT_RECOVERED) && - (result != PCI_ERS_RESULT_NONE)) { - pr_warn("EEH: Not recovered\n"); - goto hard_fail; - } - - /* - * For those hot removed VFs, we should add back them after PF get - * recovered properly. - */ - list_for_each_entry_safe(edev, tmp, &rmv_data.edev_list, rmv_list) { - eeh_add_virt_device(edev, NULL); - list_del(&edev->rmv_list); } - /* Tell all device drivers that they can resume operations */ - pr_info("EEH: Notify device driver to resume\n"); - eeh_set_channel_state(pe, pci_channel_io_normal); - eeh_set_irq_state(pe, true); - eeh_pe_report("resume", pe, eeh_report_resume, NULL); - eeh_for_each_pe(pe, tmp_pe) { - eeh_pe_for_each_dev(tmp_pe, edev, tmp) { - edev->mode &= ~EEH_DEV_NO_HANDLER; - edev->in_error = false; + if ((result == PCI_ERS_RESULT_RECOVERED) || + (result == PCI_ERS_RESULT_NONE)) { + /* + * For those hot removed VFs, we should add back them after PF + * get recovered properly. + */ + list_for_each_entry_safe(edev, tmp, &rmv_data.removed_vf_list, + rmv_entry) { + eeh_add_virt_device(edev); + list_del(&edev->rmv_entry); } - } - pr_info("EEH: Recovery successful.\n"); - goto final; + /* Tell all device drivers that they can resume operations */ + pr_info("EEH: Notify device driver to resume\n"); + eeh_set_channel_state(pe, pci_channel_io_normal); + eeh_set_irq_state(pe, true); + eeh_pe_report("resume", pe, eeh_report_resume, NULL); + eeh_for_each_pe(pe, tmp_pe) { + eeh_pe_for_each_dev(tmp_pe, edev, tmp) { + edev->mode &= ~EEH_DEV_NO_HANDLER; + edev->in_error = false; + } + } -hard_fail: - /* - * About 90% of all real-life EEH failures in the field - * are due to poorly seated PCI cards. Only 10% or so are - * due to actual, failed cards. - */ - pr_err("EEH: Unable to recover from failure from PHB#%x-PE#%x.\n" - "Please try reseating or replacing it\n", - pe->phb->global_number, pe->addr); + pr_info("EEH: Recovery successful.\n"); + } else { + /* + * About 90% of all real-life EEH failures in the field + * are due to poorly seated PCI cards. Only 10% or so are + * due to actual, failed cards. + */ + pr_err("EEH: Unable to recover from failure from PHB#%x-PE#%x.\n" + "Please try reseating or replacing it\n", + pe->phb->global_number, pe->addr); - eeh_slot_error_detail(pe, EEH_LOG_PERM); + eeh_slot_error_detail(pe, EEH_LOG_PERM); - /* Notify all devices that they're about to go down. */ - eeh_set_channel_state(pe, pci_channel_io_perm_failure); - eeh_set_irq_state(pe, false); - eeh_pe_report("error_detected(permanent failure)", pe, - eeh_report_failure, NULL); + /* Notify all devices that they're about to go down. */ + eeh_set_channel_state(pe, pci_channel_io_perm_failure); + eeh_set_irq_state(pe, false); + eeh_pe_report("error_detected(permanent failure)", pe, + eeh_report_failure, NULL); - /* Mark the PE to be removed permanently */ - eeh_pe_state_mark(pe, EEH_PE_REMOVED); + /* Mark the PE to be removed permanently */ + eeh_pe_state_mark(pe, EEH_PE_REMOVED); - /* - * Shut down the device drivers for good. We mark - * all removed devices correctly to avoid access - * the their PCI config any more. - */ - if (pe->type & EEH_PE_VF) { - eeh_pe_dev_traverse(pe, eeh_rmv_device, NULL); - eeh_pe_dev_mode_mark(pe, EEH_DEV_REMOVED); - } else { - eeh_pe_state_clear(pe, EEH_PE_PRI_BUS); - eeh_pe_dev_mode_mark(pe, EEH_DEV_REMOVED); + /* + * Shut down the device drivers for good. We mark + * all removed devices correctly to avoid access + * the their PCI config any more. + */ + if (pe->type & EEH_PE_VF) { + eeh_pe_dev_traverse(pe, eeh_rmv_device, NULL); + eeh_pe_dev_mode_mark(pe, EEH_DEV_REMOVED); + } else { + eeh_pe_state_clear(pe, EEH_PE_PRI_BUS); + eeh_pe_dev_mode_mark(pe, EEH_DEV_REMOVED); - pci_lock_rescan_remove(); - pci_hp_remove_devices(bus); - pci_unlock_rescan_remove(); - /* The passed PE should no longer be used */ - return; + pci_lock_rescan_remove(); + pci_hp_remove_devices(bus); + pci_unlock_rescan_remove(); + /* The passed PE should no longer be used */ + return; + } } -final: eeh_pe_state_clear(pe, EEH_PE_RECOVERING); } @@ -1026,7 +1021,7 @@ void eeh_handle_special_event(void) phb_pe = eeh_phb_pe_get(hose); if (!phb_pe) continue; - eeh_pe_state_mark(phb_pe, EEH_PE_ISOLATED); + eeh_pe_mark_isolated(phb_pe); } eeh_serialize_unlock(flags); @@ -1041,11 +1036,9 @@ void eeh_handle_special_event(void) /* Purge all events of the PHB */ eeh_remove_event(pe, true); - if (rc == EEH_NEXT_ERR_DEAD_PHB) - eeh_pe_state_mark(pe, EEH_PE_ISOLATED); - else - eeh_pe_state_mark(pe, - EEH_PE_ISOLATED | EEH_PE_RECOVERING); + if (rc != EEH_NEXT_ERR_DEAD_PHB) + eeh_pe_state_mark(pe, EEH_PE_RECOVERING); + eeh_pe_mark_isolated(pe); eeh_serialize_unlock(flags); diff --git a/arch/powerpc/kernel/eeh_pe.c b/arch/powerpc/kernel/eeh_pe.c index 1b238ecc553e..6fa2032e0594 100644 --- a/arch/powerpc/kernel/eeh_pe.c +++ b/arch/powerpc/kernel/eeh_pe.c @@ -75,7 +75,6 @@ static struct eeh_pe *eeh_pe_alloc(struct pci_controller *phb, int type) pe->type = type; pe->phb = phb; INIT_LIST_HEAD(&pe->child_list); - INIT_LIST_HEAD(&pe->child); INIT_LIST_HEAD(&pe->edevs); pe->data = (void *)pe + ALIGN(sizeof(struct eeh_pe), @@ -110,6 +109,57 @@ int eeh_phb_pe_create(struct pci_controller *phb) } /** + * eeh_wait_state - Wait for PE state + * @pe: EEH PE + * @max_wait: maximal period in millisecond + * + * Wait for the state of associated PE. It might take some time + * to retrieve the PE's state. + */ +int eeh_wait_state(struct eeh_pe *pe, int max_wait) +{ + int ret; + int mwait; + + /* + * According to PAPR, the state of PE might be temporarily + * unavailable. Under the circumstance, we have to wait + * for indicated time determined by firmware. The maximal + * wait time is 5 minutes, which is acquired from the original + * EEH implementation. Also, the original implementation + * also defined the minimal wait time as 1 second. + */ +#define EEH_STATE_MIN_WAIT_TIME (1000) +#define EEH_STATE_MAX_WAIT_TIME (300 * 1000) + + while (1) { + ret = eeh_ops->get_state(pe, &mwait); + + if (ret != EEH_STATE_UNAVAILABLE) + return ret; + + if (max_wait <= 0) { + pr_warn("%s: Timeout when getting PE's state (%d)\n", + __func__, max_wait); + return EEH_STATE_NOT_SUPPORT; + } + + if (mwait < EEH_STATE_MIN_WAIT_TIME) { + pr_warn("%s: Firmware returned bad wait value %d\n", + __func__, mwait); + mwait = EEH_STATE_MIN_WAIT_TIME; + } else if (mwait > EEH_STATE_MAX_WAIT_TIME) { + pr_warn("%s: Firmware returned too long wait value %d\n", + __func__, mwait); + mwait = EEH_STATE_MAX_WAIT_TIME; + } + + msleep(min(mwait, max_wait)); + max_wait -= mwait; + } +} + +/** * eeh_phb_pe_get - Retrieve PHB PE based on the given PHB * @phb: PCI controller * @@ -360,7 +410,7 @@ int eeh_add_to_parent_pe(struct eeh_dev *edev) edev->pe = pe; /* Put the edev to PE */ - list_add_tail(&edev->list, &pe->edevs); + list_add_tail(&edev->entry, &pe->edevs); pr_debug("EEH: Add %04x:%02x:%02x.%01x to Bus PE#%x\n", pdn->phb->global_number, pdn->busno, @@ -369,7 +419,7 @@ int eeh_add_to_parent_pe(struct eeh_dev *edev) pe->addr); return 0; } else if (pe && (pe->type & EEH_PE_INVALID)) { - list_add_tail(&edev->list, &pe->edevs); + list_add_tail(&edev->entry, &pe->edevs); edev->pe = pe; /* * We're running to here because of PCI hotplug caused by @@ -379,7 +429,7 @@ int eeh_add_to_parent_pe(struct eeh_dev *edev) while (parent) { if (!(parent->type & EEH_PE_INVALID)) break; - parent->type &= ~(EEH_PE_INVALID | EEH_PE_KEEP); + parent->type &= ~EEH_PE_INVALID; parent = parent->parent; } @@ -429,7 +479,7 @@ int eeh_add_to_parent_pe(struct eeh_dev *edev) * link the EEH device accordingly. */ list_add_tail(&pe->child, &parent->child_list); - list_add_tail(&edev->list, &pe->edevs); + list_add_tail(&edev->entry, &pe->edevs); edev->pe = pe; pr_debug("EEH: Add %04x:%02x:%02x.%01x to " "Device PE#%x, Parent PE#%x\n", @@ -457,7 +507,8 @@ int eeh_rmv_from_parent_pe(struct eeh_dev *edev) int cnt; struct pci_dn *pdn = eeh_dev_to_pdn(edev); - if (!edev->pe) { + pe = eeh_dev_to_pe(edev); + if (!pe) { pr_debug("%s: No PE found for device %04x:%02x:%02x.%01x\n", __func__, pdn->phb->global_number, pdn->busno, @@ -467,9 +518,8 @@ int eeh_rmv_from_parent_pe(struct eeh_dev *edev) } /* Remove the EEH device */ - pe = eeh_dev_to_pe(edev); edev->pe = NULL; - list_del(&edev->list); + list_del(&edev->entry); /* * Check if the parent PE includes any EEH devices. @@ -541,56 +591,50 @@ void eeh_pe_update_time_stamp(struct eeh_pe *pe) } /** - * __eeh_pe_state_mark - Mark the state for the PE - * @data: EEH PE - * @flag: state + * eeh_pe_state_mark - Mark specified state for PE and its associated device + * @pe: EEH PE * - * The function is used to mark the indicated state for the given - * PE. Also, the associated PCI devices will be put into IO frozen - * state as well. + * EEH error affects the current PE and its child PEs. The function + * is used to mark appropriate state for the affected PEs and the + * associated devices. */ -static void *__eeh_pe_state_mark(struct eeh_pe *pe, void *flag) +void eeh_pe_state_mark(struct eeh_pe *root, int state) { - int state = *((int *)flag); - struct eeh_dev *edev, *tmp; - struct pci_dev *pdev; - - /* Keep the state of permanently removed PE intact */ - if (pe->state & EEH_PE_REMOVED) - return NULL; - - pe->state |= state; - - /* Offline PCI devices if applicable */ - if (!(state & EEH_PE_ISOLATED)) - return NULL; - - eeh_pe_for_each_dev(pe, edev, tmp) { - pdev = eeh_dev_to_pci_dev(edev); - if (pdev) - pdev->error_state = pci_channel_io_frozen; - } - - /* Block PCI config access if required */ - if (pe->state & EEH_PE_CFG_RESTRICTED) - pe->state |= EEH_PE_CFG_BLOCKED; + struct eeh_pe *pe; - return NULL; + eeh_for_each_pe(root, pe) + if (!(pe->state & EEH_PE_REMOVED)) + pe->state |= state; } +EXPORT_SYMBOL_GPL(eeh_pe_state_mark); /** - * eeh_pe_state_mark - Mark specified state for PE and its associated device + * eeh_pe_mark_isolated * @pe: EEH PE * - * EEH error affects the current PE and its child PEs. The function - * is used to mark appropriate state for the affected PEs and the - * associated devices. + * Record that a PE has been isolated by marking the PE and it's children as + * EEH_PE_ISOLATED (and EEH_PE_CFG_BLOCKED, if required) and their PCI devices + * as pci_channel_io_frozen. */ -void eeh_pe_state_mark(struct eeh_pe *pe, int state) +void eeh_pe_mark_isolated(struct eeh_pe *root) { - eeh_pe_traverse(pe, __eeh_pe_state_mark, &state); + struct eeh_pe *pe; + struct eeh_dev *edev; + struct pci_dev *pdev; + + eeh_pe_state_mark(root, EEH_PE_ISOLATED); + eeh_for_each_pe(root, pe) { + list_for_each_entry(edev, &pe->edevs, entry) { + pdev = eeh_dev_to_pci_dev(edev); + if (pdev) + pdev->error_state = pci_channel_io_frozen; + } + /* Block PCI config access if required */ + if (pe->state & EEH_PE_CFG_RESTRICTED) + pe->state |= EEH_PE_CFG_BLOCKED; + } } -EXPORT_SYMBOL_GPL(eeh_pe_state_mark); +EXPORT_SYMBOL_GPL(eeh_pe_mark_isolated); static void *__eeh_pe_dev_mode_mark(struct eeh_dev *edev, void *flag) { @@ -671,28 +715,6 @@ void eeh_pe_state_clear(struct eeh_pe *pe, int state) eeh_pe_traverse(pe, __eeh_pe_state_clear, &state); } -/** - * eeh_pe_state_mark_with_cfg - Mark PE state with unblocked config space - * @pe: PE - * @state: PE state to be set - * - * Set specified flag to PE and its child PEs. The PCI config space - * of some PEs is blocked automatically when EEH_PE_ISOLATED is set, - * which isn't needed in some situations. The function allows to set - * the specified flag to indicated PEs without blocking their PCI - * config space. - */ -void eeh_pe_state_mark_with_cfg(struct eeh_pe *pe, int state) -{ - eeh_pe_traverse(pe, __eeh_pe_state_mark, &state); - if (!(state & EEH_PE_ISOLATED)) - return; - - /* Clear EEH_PE_CFG_BLOCKED, which might be set just now */ - state = EEH_PE_CFG_BLOCKED; - eeh_pe_traverse(pe, __eeh_pe_state_clear, &state); -} - /* * Some PCI bridges (e.g. PLX bridges) have primary/secondary * buses assigned explicitly by firmware, and we probably have @@ -945,7 +967,7 @@ struct pci_bus *eeh_pe_bus_get(struct eeh_pe *pe) return pe->bus; /* Retrieve the parent PCI bus of first (top) PCI device */ - edev = list_first_entry_or_null(&pe->edevs, struct eeh_dev, list); + edev = list_first_entry_or_null(&pe->edevs, struct eeh_dev, entry); pdev = eeh_dev_to_pci_dev(edev); if (pdev) return pdev->bus; diff --git a/arch/powerpc/kernel/entry_32.S b/arch/powerpc/kernel/entry_32.S index e58c3f467db5..77decded1175 100644 --- a/arch/powerpc/kernel/entry_32.S +++ b/arch/powerpc/kernel/entry_32.S @@ -794,7 +794,7 @@ END_FTR_SECTION_IFSET(CPU_FTR_601) lis r10,MSR_KERNEL@h ori r10,r10,MSR_KERNEL@l bl transfer_to_handler_full - .long nonrecoverable_exception + .long unrecoverable_exception .long ret_from_except #endif @@ -1297,7 +1297,7 @@ END_FTR_SECTION_IFSET(CPU_FTR_601) rlwinm r3,r3,0,0,30 stw r3,_TRAP(r1) 4: addi r3,r1,STACK_FRAME_OVERHEAD - bl nonrecoverable_exception + bl unrecoverable_exception /* shouldn't return */ b 4b diff --git a/arch/powerpc/kernel/entry_64.S b/arch/powerpc/kernel/entry_64.S index 2206912ea4f0..7b1693adff2a 100644 --- a/arch/powerpc/kernel/entry_64.S +++ b/arch/powerpc/kernel/entry_64.S @@ -171,7 +171,7 @@ system_call: /* label this so stack traces look sane */ * based on caller's run-mode / personality. */ ld r11,SYS_CALL_TABLE@toc(2) - andi. r10,r10,_TIF_32BIT + andis. r10,r10,_TIF_32BIT@h beq 15f addi r11,r11,8 /* use 32-bit syscall entries */ clrldi r3,r3,32 @@ -386,10 +386,9 @@ END_FTR_SECTION_IFSET(CPU_FTR_HAS_PPR) 4: /* Anything else left to do? */ BEGIN_FTR_SECTION - lis r3,INIT_PPR@highest /* Set thread.ppr = 3 */ - ld r10,PACACURRENT(r13) + lis r3,DEFAULT_PPR@highest /* Set default PPR */ sldi r3,r3,32 /* bits 11-13 are used for ppr */ - std r3,TASKTHREADPPR(r10) + std r3,_PPR(r1) END_FTR_SECTION_IFSET(CPU_FTR_HAS_PPR) andi. r0,r9,(_TIF_SYSCALL_DOTRACE|_TIF_SINGLESTEP) @@ -624,6 +623,10 @@ _GLOBAL(_switch) addi r6,r4,-THREAD /* Convert THREAD to 'current' */ std r6,PACACURRENT(r13) /* Set new 'current' */ +#if defined(CONFIG_STACKPROTECTOR) + ld r6, TASK_CANARY(r6) + std r6, PACA_CANARY(r13) +#endif ld r8,KSP(r4) /* new stack pointer */ #ifdef CONFIG_PPC_BOOK3S_64 @@ -672,7 +675,9 @@ END_MMU_FTR_SECTION_IFSET(MMU_FTR_1T_SEGMENT) isync slbie r6 +BEGIN_FTR_SECTION slbie r6 /* Workaround POWER5 < DD2.1 issue */ +END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_207S) slbmte r7,r0 isync 2: @@ -936,12 +941,6 @@ fast_exception_return: andi. r0,r3,MSR_RI beq- .Lunrecov_restore - /* Load PPR from thread struct before we clear MSR:RI */ -BEGIN_FTR_SECTION - ld r2,PACACURRENT(r13) - ld r2,TASKTHREADPPR(r2) -END_FTR_SECTION_IFSET(CPU_FTR_HAS_PPR) - /* * Clear RI before restoring r13. If we are returning to * userspace and we take an exception after restoring r13, @@ -962,7 +961,9 @@ END_FTR_SECTION_IFSET(CPU_FTR_HAS_PPR) andi. r0,r3,MSR_PR beq 1f BEGIN_FTR_SECTION - mtspr SPRN_PPR,r2 /* Restore PPR */ + /* Restore PPR */ + ld r2,_PPR(r1) + mtspr SPRN_PPR,r2 END_FTR_SECTION_IFSET(CPU_FTR_HAS_PPR) ACCOUNT_CPU_USER_EXIT(r13, r2, r4) REST_GPR(13, r1) @@ -1118,7 +1119,7 @@ _ASM_NOKPROBE_SYMBOL(fast_exception_return); _GLOBAL(enter_rtas) mflr r0 std r0,16(r1) - stdu r1,-RTAS_FRAME_SIZE(r1) /* Save SP and create stack space. */ + stdu r1,-SWITCH_FRAME_SIZE(r1) /* Save SP and create stack space. */ /* Because RTAS is running in 32b mode, it clobbers the high order half * of all registers that it saves. We therefore save those registers @@ -1250,7 +1251,7 @@ rtas_restore_regs: ld r8,_DSISR(r1) mtdsisr r8 - addi r1,r1,RTAS_FRAME_SIZE /* Unstack our frame */ + addi r1,r1,SWITCH_FRAME_SIZE /* Unstack our frame */ ld r0,16(r1) /* get return address */ mtlr r0 @@ -1261,7 +1262,7 @@ rtas_restore_regs: _GLOBAL(enter_prom) mflr r0 std r0,16(r1) - stdu r1,-PROM_FRAME_SIZE(r1) /* Save SP and create stack space */ + stdu r1,-SWITCH_FRAME_SIZE(r1) /* Save SP and create stack space */ /* Because PROM is running in 32b mode, it clobbers the high order half * of all registers that it saves. We therefore save those registers @@ -1318,8 +1319,8 @@ _GLOBAL(enter_prom) REST_10GPRS(22, r1) ld r4,_CCR(r1) mtcr r4 - - addi r1,r1,PROM_FRAME_SIZE + + addi r1,r1,SWITCH_FRAME_SIZE ld r0,16(r1) mtlr r0 blr diff --git a/arch/powerpc/kernel/exceptions-64s.S b/arch/powerpc/kernel/exceptions-64s.S index 2d8fc8c9da7a..89d32bb79d5e 100644 --- a/arch/powerpc/kernel/exceptions-64s.S +++ b/arch/powerpc/kernel/exceptions-64s.S @@ -244,14 +244,13 @@ EXC_REAL_BEGIN(machine_check, 0x200, 0x100) SET_SCRATCH0(r13) /* save r13 */ EXCEPTION_PROLOG_0(PACA_EXMC) BEGIN_FTR_SECTION - b machine_check_powernv_early + b machine_check_common_early FTR_SECTION_ELSE b machine_check_pSeries_0 ALT_FTR_SECTION_END_IFSET(CPU_FTR_HVMODE) EXC_REAL_END(machine_check, 0x200, 0x100) EXC_VIRT_NONE(0x4200, 0x100) -TRAMP_REAL_BEGIN(machine_check_powernv_early) -BEGIN_FTR_SECTION +TRAMP_REAL_BEGIN(machine_check_common_early) EXCEPTION_PROLOG_1(PACA_EXMC, NOTEST, 0x200) /* * Register contents: @@ -305,7 +304,9 @@ BEGIN_FTR_SECTION /* Save r9 through r13 from EXMC save area to stack frame. */ EXCEPTION_PROLOG_COMMON_2(PACA_EXMC) mfmsr r11 /* get MSR value */ +BEGIN_FTR_SECTION ori r11,r11,MSR_ME /* turn on ME bit */ +END_FTR_SECTION_IFSET(CPU_FTR_HVMODE) ori r11,r11,MSR_RI /* turn on RI bit */ LOAD_HANDLER(r12, machine_check_handle_early) 1: mtspr SPRN_SRR0,r12 @@ -324,13 +325,15 @@ BEGIN_FTR_SECTION andc r11,r11,r10 /* Turn off MSR_ME */ b 1b b . /* prevent speculative execution */ -END_FTR_SECTION_IFSET(CPU_FTR_HVMODE) TRAMP_REAL_BEGIN(machine_check_pSeries) .globl machine_check_fwnmi machine_check_fwnmi: SET_SCRATCH0(r13) /* save r13 */ EXCEPTION_PROLOG_0(PACA_EXMC) +BEGIN_FTR_SECTION + b machine_check_common_early +END_FTR_SECTION_IFCLR(CPU_FTR_HVMODE) machine_check_pSeries_0: EXCEPTION_PROLOG_1(PACA_EXMC, KVMTEST_PR, 0x200) /* @@ -440,6 +443,9 @@ EXC_COMMON_BEGIN(machine_check_handle_early) bl machine_check_early std r3,RESULT(r1) /* Save result */ ld r12,_MSR(r1) +BEGIN_FTR_SECTION + b 4f +END_FTR_SECTION_IFCLR(CPU_FTR_HVMODE) #ifdef CONFIG_PPC_P7_NAP /* @@ -463,11 +469,12 @@ EXC_COMMON_BEGIN(machine_check_handle_early) */ rldicl. r11,r12,4,63 /* See if MC hit while in HV mode. */ beq 5f - andi. r11,r12,MSR_PR /* See if coming from user. */ +4: andi. r11,r12,MSR_PR /* See if coming from user. */ bne 9f /* continue in V mode if we are. */ 5: #ifdef CONFIG_KVM_BOOK3S_64_HANDLER +BEGIN_FTR_SECTION /* * We are coming from kernel context. Check if we are coming from * guest. if yes, then we can continue. We will fall through @@ -476,6 +483,7 @@ EXC_COMMON_BEGIN(machine_check_handle_early) lbz r11,HSTATE_IN_GUEST(r13) cmpwi r11,0 /* Check if coming from guest */ bne 9f /* continue if we are. */ +END_FTR_SECTION_IFSET(CPU_FTR_HVMODE) #endif /* * At this point we are not sure about what context we come from. @@ -510,6 +518,7 @@ EXC_COMMON_BEGIN(machine_check_handle_early) cmpdi r3,0 /* see if we handled MCE successfully */ beq 1b /* if !handled then panic */ +BEGIN_FTR_SECTION /* * Return from MC interrupt. * Queue up the MCE event so that we can log it later, while @@ -518,10 +527,24 @@ EXC_COMMON_BEGIN(machine_check_handle_early) bl machine_check_queue_event MACHINE_CHECK_HANDLER_WINDUP RFI_TO_USER_OR_KERNEL +FTR_SECTION_ELSE + /* + * pSeries: Return from MC interrupt. Before that stay on emergency + * stack and call machine_check_exception to log the MCE event. + */ + LOAD_HANDLER(r10,mce_return) + mtspr SPRN_SRR0,r10 + ld r10,PACAKMSR(r13) + mtspr SPRN_SRR1,r10 + RFI_TO_KERNEL + b . +ALT_FTR_SECTION_END_IFSET(CPU_FTR_HVMODE) 9: /* Deliver the machine check to host kernel in V mode. */ MACHINE_CHECK_HANDLER_WINDUP - b machine_check_pSeries + SET_SCRATCH0(r13) /* save r13 */ + EXCEPTION_PROLOG_0(PACA_EXMC) + b machine_check_pSeries_0 EXC_COMMON_BEGIN(unrecover_mce) /* Invoke machine_check_exception to print MCE event and panic. */ @@ -535,6 +558,13 @@ EXC_COMMON_BEGIN(unrecover_mce) bl unrecoverable_exception b 1b +EXC_COMMON_BEGIN(mce_return) + /* Invoke machine_check_exception to print MCE event and return. */ + addi r3,r1,STACK_FRAME_OVERHEAD + bl machine_check_exception + MACHINE_CHECK_HANDLER_WINDUP + RFI_TO_KERNEL + b . EXC_REAL(data_access, 0x300, 0x80) EXC_VIRT(data_access, 0x4300, 0x80, 0x300) @@ -566,28 +596,36 @@ ALT_MMU_FTR_SECTION_END_IFCLR(MMU_FTR_TYPE_RADIX) EXC_REAL_BEGIN(data_access_slb, 0x380, 0x80) - SET_SCRATCH0(r13) - EXCEPTION_PROLOG_0(PACA_EXSLB) - EXCEPTION_PROLOG_1(PACA_EXSLB, KVMTEST_PR, 0x380) - mr r12,r3 /* save r3 */ - mfspr r3,SPRN_DAR - mfspr r11,SPRN_SRR1 - crset 4*cr6+eq - BRANCH_TO_COMMON(r10, slb_miss_common) +EXCEPTION_PROLOG(PACA_EXSLB, data_access_slb_common, EXC_STD, KVMTEST_PR, 0x380); EXC_REAL_END(data_access_slb, 0x380, 0x80) EXC_VIRT_BEGIN(data_access_slb, 0x4380, 0x80) - SET_SCRATCH0(r13) - EXCEPTION_PROLOG_0(PACA_EXSLB) - EXCEPTION_PROLOG_1(PACA_EXSLB, NOTEST, 0x380) - mr r12,r3 /* save r3 */ - mfspr r3,SPRN_DAR - mfspr r11,SPRN_SRR1 - crset 4*cr6+eq - BRANCH_TO_COMMON(r10, slb_miss_common) +EXCEPTION_RELON_PROLOG(PACA_EXSLB, data_access_slb_common, EXC_STD, NOTEST, 0x380); EXC_VIRT_END(data_access_slb, 0x4380, 0x80) + TRAMP_KVM_SKIP(PACA_EXSLB, 0x380) +EXC_COMMON_BEGIN(data_access_slb_common) + mfspr r10,SPRN_DAR + std r10,PACA_EXSLB+EX_DAR(r13) + EXCEPTION_PROLOG_COMMON(0x380, PACA_EXSLB) + ld r4,PACA_EXSLB+EX_DAR(r13) + std r4,_DAR(r1) + addi r3,r1,STACK_FRAME_OVERHEAD + bl do_slb_fault + cmpdi r3,0 + bne- 1f + b fast_exception_return +1: /* Error case */ + std r3,RESULT(r1) + bl save_nvgprs + RECONCILE_IRQ_STATE(r10, r11) + ld r4,_DAR(r1) + ld r5,RESULT(r1) + addi r3,r1,STACK_FRAME_OVERHEAD + bl do_bad_slb_fault + b ret_from_except + EXC_REAL(instruction_access, 0x400, 0x80) EXC_VIRT(instruction_access, 0x4400, 0x80, 0x400) @@ -610,160 +648,34 @@ ALT_MMU_FTR_SECTION_END_IFCLR(MMU_FTR_TYPE_RADIX) EXC_REAL_BEGIN(instruction_access_slb, 0x480, 0x80) - SET_SCRATCH0(r13) - EXCEPTION_PROLOG_0(PACA_EXSLB) - EXCEPTION_PROLOG_1(PACA_EXSLB, KVMTEST_PR, 0x480) - mr r12,r3 /* save r3 */ - mfspr r3,SPRN_SRR0 /* SRR0 is faulting address */ - mfspr r11,SPRN_SRR1 - crclr 4*cr6+eq - BRANCH_TO_COMMON(r10, slb_miss_common) +EXCEPTION_PROLOG(PACA_EXSLB, instruction_access_slb_common, EXC_STD, KVMTEST_PR, 0x480); EXC_REAL_END(instruction_access_slb, 0x480, 0x80) EXC_VIRT_BEGIN(instruction_access_slb, 0x4480, 0x80) - SET_SCRATCH0(r13) - EXCEPTION_PROLOG_0(PACA_EXSLB) - EXCEPTION_PROLOG_1(PACA_EXSLB, NOTEST, 0x480) - mr r12,r3 /* save r3 */ - mfspr r3,SPRN_SRR0 /* SRR0 is faulting address */ - mfspr r11,SPRN_SRR1 - crclr 4*cr6+eq - BRANCH_TO_COMMON(r10, slb_miss_common) +EXCEPTION_RELON_PROLOG(PACA_EXSLB, instruction_access_slb_common, EXC_STD, NOTEST, 0x480); EXC_VIRT_END(instruction_access_slb, 0x4480, 0x80) -TRAMP_KVM(PACA_EXSLB, 0x480) - - -/* - * This handler is used by the 0x380 and 0x480 SLB miss interrupts, as well as - * the virtual mode 0x4380 and 0x4480 interrupts if AIL is enabled. - */ -EXC_COMMON_BEGIN(slb_miss_common) - /* - * r13 points to the PACA, r9 contains the saved CR, - * r12 contains the saved r3, - * r11 contain the saved SRR1, SRR0 is still ready for return - * r3 has the faulting address - * r9 - r13 are saved in paca->exslb. - * cr6.eq is set for a D-SLB miss, clear for a I-SLB miss - * We assume we aren't going to take any exceptions during this - * procedure. - */ - mflr r10 - stw r9,PACA_EXSLB+EX_CCR(r13) /* save CR in exc. frame */ - std r10,PACA_EXSLB+EX_LR(r13) /* save LR */ - - andi. r9,r11,MSR_PR // Check for exception from userspace - cmpdi cr4,r9,MSR_PR // And save the result in CR4 for later - - /* - * Test MSR_RI before calling slb_allocate_realmode, because the - * MSR in r11 gets clobbered. However we still want to allocate - * SLB in case MSR_RI=0, to minimise the risk of getting stuck in - * recursive SLB faults. So use cr5 for this, which is preserved. - */ - andi. r11,r11,MSR_RI /* check for unrecoverable exception */ - cmpdi cr5,r11,MSR_RI - - crset 4*cr0+eq -#ifdef CONFIG_PPC_BOOK3S_64 -BEGIN_MMU_FTR_SECTION - bl slb_allocate -END_MMU_FTR_SECTION_IFCLR(MMU_FTR_TYPE_RADIX) -#endif - - ld r10,PACA_EXSLB+EX_LR(r13) - lwz r9,PACA_EXSLB+EX_CCR(r13) /* get saved CR */ - mtlr r10 - - /* - * Large address, check whether we have to allocate new contexts. - */ - beq- 8f - bne- cr5,2f /* if unrecoverable exception, oops */ - - /* All done -- return from exception. */ - - bne cr4,1f /* returning to kernel */ - - mtcrf 0x80,r9 - mtcrf 0x08,r9 /* MSR[PR] indication is in cr4 */ - mtcrf 0x04,r9 /* MSR[RI] indication is in cr5 */ - mtcrf 0x02,r9 /* I/D indication is in cr6 */ - mtcrf 0x01,r9 /* slb_allocate uses cr0 and cr7 */ - - RESTORE_CTR(r9, PACA_EXSLB) - RESTORE_PPR_PACA(PACA_EXSLB, r9) - mr r3,r12 - ld r9,PACA_EXSLB+EX_R9(r13) - ld r10,PACA_EXSLB+EX_R10(r13) - ld r11,PACA_EXSLB+EX_R11(r13) - ld r12,PACA_EXSLB+EX_R12(r13) - ld r13,PACA_EXSLB+EX_R13(r13) - RFI_TO_USER - b . /* prevent speculative execution */ -1: - mtcrf 0x80,r9 - mtcrf 0x08,r9 /* MSR[PR] indication is in cr4 */ - mtcrf 0x04,r9 /* MSR[RI] indication is in cr5 */ - mtcrf 0x02,r9 /* I/D indication is in cr6 */ - mtcrf 0x01,r9 /* slb_allocate uses cr0 and cr7 */ - - RESTORE_CTR(r9, PACA_EXSLB) - RESTORE_PPR_PACA(PACA_EXSLB, r9) - mr r3,r12 - ld r9,PACA_EXSLB+EX_R9(r13) - ld r10,PACA_EXSLB+EX_R10(r13) - ld r11,PACA_EXSLB+EX_R11(r13) - ld r12,PACA_EXSLB+EX_R12(r13) - ld r13,PACA_EXSLB+EX_R13(r13) - RFI_TO_KERNEL - b . /* prevent speculative execution */ - - -2: std r3,PACA_EXSLB+EX_DAR(r13) - mr r3,r12 - mfspr r11,SPRN_SRR0 - mfspr r12,SPRN_SRR1 - LOAD_HANDLER(r10,unrecov_slb) - mtspr SPRN_SRR0,r10 - ld r10,PACAKMSR(r13) - mtspr SPRN_SRR1,r10 - RFI_TO_KERNEL - b . - -8: std r3,PACA_EXSLB+EX_DAR(r13) - mr r3,r12 - mfspr r11,SPRN_SRR0 - mfspr r12,SPRN_SRR1 - LOAD_HANDLER(r10, large_addr_slb) - mtspr SPRN_SRR0,r10 - ld r10,PACAKMSR(r13) - mtspr SPRN_SRR1,r10 - RFI_TO_KERNEL - b . +TRAMP_KVM(PACA_EXSLB, 0x480) -EXC_COMMON_BEGIN(unrecov_slb) - EXCEPTION_PROLOG_COMMON(0x4100, PACA_EXSLB) - RECONCILE_IRQ_STATE(r10, r11) +EXC_COMMON_BEGIN(instruction_access_slb_common) + EXCEPTION_PROLOG_COMMON(0x480, PACA_EXSLB) + ld r4,_NIP(r1) + addi r3,r1,STACK_FRAME_OVERHEAD + bl do_slb_fault + cmpdi r3,0 + bne- 1f + b fast_exception_return +1: /* Error case */ + std r3,RESULT(r1) bl save_nvgprs -1: addi r3,r1,STACK_FRAME_OVERHEAD - bl unrecoverable_exception - b 1b - -EXC_COMMON_BEGIN(large_addr_slb) - EXCEPTION_PROLOG_COMMON(0x380, PACA_EXSLB) RECONCILE_IRQ_STATE(r10, r11) - ld r3, PACA_EXSLB+EX_DAR(r13) - std r3, _DAR(r1) - beq cr6, 2f - li r10, 0x481 /* fix trap number for I-SLB miss */ - std r10, _TRAP(r1) -2: bl save_nvgprs - addi r3, r1, STACK_FRAME_OVERHEAD - bl slb_miss_large_addr + ld r4,_NIP(r1) + ld r5,RESULT(r1) + addi r3,r1,STACK_FRAME_OVERHEAD + bl do_bad_slb_fault b ret_from_except + EXC_REAL_BEGIN(hardware_interrupt, 0x500, 0x100) .globl hardware_interrupt_hv; hardware_interrupt_hv: diff --git a/arch/powerpc/kernel/fadump.c b/arch/powerpc/kernel/fadump.c index a711d22339ea..761b28b1427d 100644 --- a/arch/powerpc/kernel/fadump.c +++ b/arch/powerpc/kernel/fadump.c @@ -1444,8 +1444,8 @@ static ssize_t fadump_register_store(struct kobject *kobj, break; case 1: if (fw_dump.dump_registered == 1) { - ret = -EEXIST; - goto unlock_out; + /* Un-register Firmware-assisted dump */ + fadump_unregister_dump(&fdm); } /* Register Firmware-assisted dump */ ret = register_fadump(); diff --git a/arch/powerpc/kernel/head_8xx.S b/arch/powerpc/kernel/head_8xx.S index 6582f824d620..134a573a9f2d 100644 --- a/arch/powerpc/kernel/head_8xx.S +++ b/arch/powerpc/kernel/head_8xx.S @@ -642,7 +642,7 @@ DTLBMissIMMR: mtspr SPRN_MD_TWC, r10 mfspr r10, SPRN_IMMR /* Get current IMMR */ rlwinm r10, r10, 0, 0xfff80000 /* Get 512 kbytes boundary */ - ori r10, r10, 0xf0 | MD_SPS16K | _PAGE_PRIVILEGED | _PAGE_DIRTY | \ + ori r10, r10, 0xf0 | MD_SPS16K | _PAGE_SH | _PAGE_DIRTY | \ _PAGE_PRESENT | _PAGE_NO_CACHE mtspr SPRN_MD_RPN, r10 /* Update TLB entry */ @@ -660,7 +660,7 @@ DTLBMissLinear: li r11, MD_PS8MEG | MD_SVALID | M_APG2 mtspr SPRN_MD_TWC, r11 rlwinm r10, r10, 0, 0x0f800000 /* 8xx supports max 256Mb RAM */ - ori r10, r10, 0xf0 | MD_SPS16K | _PAGE_PRIVILEGED | _PAGE_DIRTY | \ + ori r10, r10, 0xf0 | MD_SPS16K | _PAGE_SH | _PAGE_DIRTY | \ _PAGE_PRESENT mtspr SPRN_MD_RPN, r10 /* Update TLB entry */ @@ -679,7 +679,7 @@ ITLBMissLinear: li r11, MI_PS8MEG | MI_SVALID | M_APG2 mtspr SPRN_MI_TWC, r11 rlwinm r10, r10, 0, 0x0f800000 /* 8xx supports max 256Mb RAM */ - ori r10, r10, 0xf0 | MI_SPS16K | _PAGE_PRIVILEGED | _PAGE_DIRTY | \ + ori r10, r10, 0xf0 | MI_SPS16K | _PAGE_SH | _PAGE_DIRTY | \ _PAGE_PRESENT mtspr SPRN_MI_RPN, r10 /* Update TLB entry */ diff --git a/arch/powerpc/kernel/io-workarounds.c b/arch/powerpc/kernel/io-workarounds.c index aa9f1b8261db..7e89d02a84e1 100644 --- a/arch/powerpc/kernel/io-workarounds.c +++ b/arch/powerpc/kernel/io-workarounds.c @@ -153,10 +153,10 @@ static const struct ppc_pci_io iowa_pci_io = { #ifdef CONFIG_PPC_INDIRECT_MMIO static void __iomem *iowa_ioremap(phys_addr_t addr, unsigned long size, - unsigned long flags, void *caller) + pgprot_t prot, void *caller) { struct iowa_bus *bus; - void __iomem *res = __ioremap_caller(addr, size, flags, caller); + void __iomem *res = __ioremap_caller(addr, size, prot, caller); int busno; bus = iowa_pci_find(0, (unsigned long)addr); diff --git a/arch/powerpc/kernel/iommu.c b/arch/powerpc/kernel/iommu.c index 19b4c628f3be..f0dc680e659a 100644 --- a/arch/powerpc/kernel/iommu.c +++ b/arch/powerpc/kernel/iommu.c @@ -785,9 +785,9 @@ dma_addr_t iommu_map_page(struct device *dev, struct iommu_table *tbl, vaddr = page_address(page) + offset; uaddr = (unsigned long)vaddr; - npages = iommu_num_pages(uaddr, size, IOMMU_PAGE_SIZE(tbl)); if (tbl) { + npages = iommu_num_pages(uaddr, size, IOMMU_PAGE_SIZE(tbl)); align = 0; if (tbl->it_page_shift < PAGE_SHIFT && size >= PAGE_SIZE && ((unsigned long)vaddr & ~PAGE_MASK) == 0) diff --git a/arch/powerpc/kernel/isa-bridge.c b/arch/powerpc/kernel/isa-bridge.c index 1df6c74aa731..fda3ae48480c 100644 --- a/arch/powerpc/kernel/isa-bridge.c +++ b/arch/powerpc/kernel/isa-bridge.c @@ -110,14 +110,14 @@ static void pci_process_ISA_OF_ranges(struct device_node *isa_node, size = 0x10000; __ioremap_at(phb_io_base_phys, (void *)ISA_IO_BASE, - size, pgprot_val(pgprot_noncached(__pgprot(0)))); + size, pgprot_noncached(PAGE_KERNEL)); return; inval_range: printk(KERN_ERR "no ISA IO ranges or unexpected isa range, " "mapping 64k\n"); __ioremap_at(phb_io_base_phys, (void *)ISA_IO_BASE, - 0x10000, pgprot_val(pgprot_noncached(__pgprot(0)))); + 0x10000, pgprot_noncached(PAGE_KERNEL)); } @@ -253,7 +253,7 @@ void __init isa_bridge_init_non_pci(struct device_node *np) */ isa_io_base = ISA_IO_BASE; __ioremap_at(pbase, (void *)ISA_IO_BASE, - size, pgprot_val(pgprot_noncached(__pgprot(0)))); + size, pgprot_noncached(PAGE_KERNEL)); pr_debug("ISA: Non-PCI bridge is %pOF\n", np); } diff --git a/arch/powerpc/kernel/kgdb.c b/arch/powerpc/kernel/kgdb.c index 35e240a0a408..59c578f865aa 100644 --- a/arch/powerpc/kernel/kgdb.c +++ b/arch/powerpc/kernel/kgdb.c @@ -24,6 +24,7 @@ #include <asm/processor.h> #include <asm/machdep.h> #include <asm/debug.h> +#include <asm/code-patching.h> #include <linux/slab.h> /* @@ -144,7 +145,7 @@ static int kgdb_handle_breakpoint(struct pt_regs *regs) if (kgdb_handle_exception(1, SIGTRAP, 0, regs) != 0) return 0; - if (*(u32 *) (regs->nip) == *(u32 *) (&arch_kgdb_ops.gdb_bpt_instr)) + if (*(u32 *)regs->nip == BREAK_INSTR) regs->nip += BREAK_INSTR_SIZE; return 1; @@ -441,16 +442,42 @@ int kgdb_arch_handle_exception(int vector, int signo, int err_code, return -1; } +int kgdb_arch_set_breakpoint(struct kgdb_bkpt *bpt) +{ + int err; + unsigned int instr; + unsigned int *addr = (unsigned int *)bpt->bpt_addr; + + err = probe_kernel_address(addr, instr); + if (err) + return err; + + err = patch_instruction(addr, BREAK_INSTR); + if (err) + return -EFAULT; + + *(unsigned int *)bpt->saved_instr = instr; + + return 0; +} + +int kgdb_arch_remove_breakpoint(struct kgdb_bkpt *bpt) +{ + int err; + unsigned int instr = *(unsigned int *)bpt->saved_instr; + unsigned int *addr = (unsigned int *)bpt->bpt_addr; + + err = patch_instruction(addr, instr); + if (err) + return -EFAULT; + + return 0; +} + /* * Global data */ -struct kgdb_arch arch_kgdb_ops = { -#ifdef __LITTLE_ENDIAN__ - .gdb_bpt_instr = {0x08, 0x10, 0x82, 0x7d}, -#else - .gdb_bpt_instr = {0x7d, 0x82, 0x10, 0x08}, -#endif -}; +struct kgdb_arch arch_kgdb_ops; static int kgdb_not_implemented(struct pt_regs *regs) { diff --git a/arch/powerpc/kernel/mce.c b/arch/powerpc/kernel/mce.c index efdd16a79075..bd933a75f0bc 100644 --- a/arch/powerpc/kernel/mce.c +++ b/arch/powerpc/kernel/mce.c @@ -488,10 +488,11 @@ long machine_check_early(struct pt_regs *regs) { long handled = 0; - __this_cpu_inc(irq_stat.mce_exceptions); - - if (cur_cpu_spec && cur_cpu_spec->machine_check_early) - handled = cur_cpu_spec->machine_check_early(regs); + /* + * See if platform is capable of handling machine check. + */ + if (ppc_md.machine_check_early) + handled = ppc_md.machine_check_early(regs); return handled; } diff --git a/arch/powerpc/kernel/mce_power.c b/arch/powerpc/kernel/mce_power.c index 3497c8329c1d..6b800eec31f2 100644 --- a/arch/powerpc/kernel/mce_power.c +++ b/arch/powerpc/kernel/mce_power.c @@ -60,7 +60,7 @@ static unsigned long addr_to_pfn(struct pt_regs *regs, unsigned long addr) /* flush SLBs and reload */ #ifdef CONFIG_PPC_BOOK3S_64 -static void flush_and_reload_slb(void) +void flush_and_reload_slb(void) { /* Invalidate all SLBs */ slb_flush_all_realmode(); @@ -89,6 +89,13 @@ static void flush_and_reload_slb(void) static void flush_erat(void) { +#ifdef CONFIG_PPC_BOOK3S_64 + if (!early_cpu_has_feature(CPU_FTR_ARCH_300)) { + flush_and_reload_slb(); + return; + } +#endif + /* PPC_INVALIDATE_ERAT can only be used on ISA v3 and newer */ asm volatile(PPC_INVALIDATE_ERAT : : :"memory"); } diff --git a/arch/powerpc/kernel/module.c b/arch/powerpc/kernel/module.c index 77371c9ef3d8..2d861a36662e 100644 --- a/arch/powerpc/kernel/module.c +++ b/arch/powerpc/kernel/module.c @@ -74,6 +74,14 @@ int module_finalize(const Elf_Ehdr *hdr, (void *)sect->sh_addr + sect->sh_size); #endif /* CONFIG_PPC64 */ +#ifdef PPC64_ELF_ABI_v1 + sect = find_section(hdr, sechdrs, ".opd"); + if (sect != NULL) { + me->arch.start_opd = sect->sh_addr; + me->arch.end_opd = sect->sh_addr + sect->sh_size; + } +#endif /* PPC64_ELF_ABI_v1 */ + #ifdef CONFIG_PPC_BARRIER_NOSPEC sect = find_section(hdr, sechdrs, "__spec_barrier_fixup"); if (sect != NULL) diff --git a/arch/powerpc/kernel/module_64.c b/arch/powerpc/kernel/module_64.c index b8d61e019d06..8661eea78503 100644 --- a/arch/powerpc/kernel/module_64.c +++ b/arch/powerpc/kernel/module_64.c @@ -360,11 +360,6 @@ int module_frob_arch_sections(Elf64_Ehdr *hdr, else if (strcmp(secstrings+sechdrs[i].sh_name,"__versions")==0) dedotify_versions((void *)hdr + sechdrs[i].sh_offset, sechdrs[i].sh_size); - else if (!strcmp(secstrings + sechdrs[i].sh_name, ".opd")) { - me->arch.start_opd = sechdrs[i].sh_addr; - me->arch.end_opd = sechdrs[i].sh_addr + - sechdrs[i].sh_size; - } /* We don't handle .init for the moment: rename to _init */ while ((p = strstr(secstrings + sechdrs[i].sh_name, ".init"))) @@ -685,7 +680,14 @@ int apply_relocate_add(Elf64_Shdr *sechdrs, case R_PPC64_REL32: /* 32 bits relative (used by relative exception tables) */ - *(u32 *)location = value - (unsigned long)location; + /* Convert value to relative */ + value -= (unsigned long)location; + if (value + 0x80000000 > 0xffffffff) { + pr_err("%s: REL32 %li out of range!\n", + me->name, (long int)value); + return -ENOEXEC; + } + *(u32 *)location = value; break; case R_PPC64_TOCSAVE: diff --git a/arch/powerpc/kernel/pci_32.c b/arch/powerpc/kernel/pci_32.c index d63b488d34d7..4da8ed576229 100644 --- a/arch/powerpc/kernel/pci_32.c +++ b/arch/powerpc/kernel/pci_32.c @@ -17,7 +17,6 @@ #include <linux/of.h> #include <linux/slab.h> #include <linux/export.h> -#include <linux/syscalls.h> #include <asm/processor.h> #include <asm/io.h> diff --git a/arch/powerpc/kernel/pci_64.c b/arch/powerpc/kernel/pci_64.c index dff28f903512..9d8c10d55407 100644 --- a/arch/powerpc/kernel/pci_64.c +++ b/arch/powerpc/kernel/pci_64.c @@ -159,7 +159,7 @@ static int pcibios_map_phb_io_space(struct pci_controller *hose) /* Establish the mapping */ if (__ioremap_at(phys_page, area->addr, size_page, - pgprot_val(pgprot_noncached(__pgprot(0)))) == NULL) + pgprot_noncached(PAGE_KERNEL)) == NULL) return -ENOMEM; /* Fixup hose IO resource */ diff --git a/arch/powerpc/kernel/process.c b/arch/powerpc/kernel/process.c index 5d983d8bac27..4d5322cfad25 100644 --- a/arch/powerpc/kernel/process.c +++ b/arch/powerpc/kernel/process.c @@ -43,6 +43,7 @@ #include <linux/uaccess.h> #include <linux/elf-randomize.h> #include <linux/pkeys.h> +#include <linux/seq_buf.h> #include <asm/pgtable.h> #include <asm/io.h> @@ -65,6 +66,7 @@ #include <asm/livepatch.h> #include <asm/cpu_has_feature.h> #include <asm/asm-prototypes.h> +#include <asm/stacktrace.h> #include <linux/kprobes.h> #include <linux/kdebug.h> @@ -102,24 +104,18 @@ static void check_if_tm_restore_required(struct task_struct *tsk) } } -static inline bool msr_tm_active(unsigned long msr) -{ - return MSR_TM_ACTIVE(msr); -} - static bool tm_active_with_fp(struct task_struct *tsk) { - return msr_tm_active(tsk->thread.regs->msr) && + return MSR_TM_ACTIVE(tsk->thread.regs->msr) && (tsk->thread.ckpt_regs.msr & MSR_FP); } static bool tm_active_with_altivec(struct task_struct *tsk) { - return msr_tm_active(tsk->thread.regs->msr) && + return MSR_TM_ACTIVE(tsk->thread.regs->msr) && (tsk->thread.ckpt_regs.msr & MSR_VEC); } #else -static inline bool msr_tm_active(unsigned long msr) { return false; } static inline void check_if_tm_restore_required(struct task_struct *tsk) { } static inline bool tm_active_with_fp(struct task_struct *tsk) { return false; } static inline bool tm_active_with_altivec(struct task_struct *tsk) { return false; } @@ -247,7 +243,8 @@ void enable_kernel_fp(void) * giveup as this would save to the 'live' structure not the * checkpointed structure. */ - if(!msr_tm_active(cpumsr) && msr_tm_active(current->thread.regs->msr)) + if (!MSR_TM_ACTIVE(cpumsr) && + MSR_TM_ACTIVE(current->thread.regs->msr)) return; __giveup_fpu(current); } @@ -311,7 +308,8 @@ void enable_kernel_altivec(void) * giveup as this would save to the 'live' structure not the * checkpointed structure. */ - if(!msr_tm_active(cpumsr) && msr_tm_active(current->thread.regs->msr)) + if (!MSR_TM_ACTIVE(cpumsr) && + MSR_TM_ACTIVE(current->thread.regs->msr)) return; __giveup_altivec(current); } @@ -397,7 +395,8 @@ void enable_kernel_vsx(void) * giveup as this would save to the 'live' structure not the * checkpointed structure. */ - if(!msr_tm_active(cpumsr) && msr_tm_active(current->thread.regs->msr)) + if (!MSR_TM_ACTIVE(cpumsr) && + MSR_TM_ACTIVE(current->thread.regs->msr)) return; __giveup_vsx(current); } @@ -530,7 +529,7 @@ void restore_math(struct pt_regs *regs) { unsigned long msr; - if (!msr_tm_active(regs->msr) && + if (!MSR_TM_ACTIVE(regs->msr) && !current->thread.load_fp && !loadvec(current->thread)) return; @@ -1252,17 +1251,16 @@ struct task_struct *__switch_to(struct task_struct *prev, return last; } -static int instructions_to_print = 16; +#define NR_INSN_TO_PRINT 16 static void show_instructions(struct pt_regs *regs) { int i; - unsigned long pc = regs->nip - (instructions_to_print * 3 / 4 * - sizeof(int)); + unsigned long pc = regs->nip - (NR_INSN_TO_PRINT * 3 / 4 * sizeof(int)); printk("Instruction dump:"); - for (i = 0; i < instructions_to_print; i++) { + for (i = 0; i < NR_INSN_TO_PRINT; i++) { int instr; if (!(i % 8)) @@ -1277,7 +1275,7 @@ static void show_instructions(struct pt_regs *regs) #endif if (!__kernel_text_address(pc) || - probe_kernel_address((unsigned int __user *)pc, instr)) { + probe_kernel_address((const void *)pc, instr)) { pr_cont("XXXXXXXX "); } else { if (regs->nip == pc) @@ -1295,43 +1293,43 @@ static void show_instructions(struct pt_regs *regs) void show_user_instructions(struct pt_regs *regs) { unsigned long pc; - int i; + int n = NR_INSN_TO_PRINT; + struct seq_buf s; + char buf[96]; /* enough for 8 times 9 + 2 chars */ - pc = regs->nip - (instructions_to_print * 3 / 4 * sizeof(int)); + pc = regs->nip - (NR_INSN_TO_PRINT * 3 / 4 * sizeof(int)); /* * Make sure the NIP points at userspace, not kernel text/data or * elsewhere. */ - if (!__access_ok(pc, instructions_to_print * sizeof(int), USER_DS)) { + if (!__access_ok(pc, NR_INSN_TO_PRINT * sizeof(int), USER_DS)) { pr_info("%s[%d]: Bad NIP, not dumping instructions.\n", current->comm, current->pid); return; } - pr_info("%s[%d]: code: ", current->comm, current->pid); + seq_buf_init(&s, buf, sizeof(buf)); - for (i = 0; i < instructions_to_print; i++) { - int instr; + while (n) { + int i; - if (!(i % 8) && (i > 0)) { - pr_cont("\n"); - pr_info("%s[%d]: code: ", current->comm, current->pid); - } + seq_buf_clear(&s); - if (probe_kernel_address((unsigned int __user *)pc, instr)) { - pr_cont("XXXXXXXX "); - } else { - if (regs->nip == pc) - pr_cont("<%08x> ", instr); - else - pr_cont("%08x ", instr); + for (i = 0; i < 8 && n; i++, n--, pc += sizeof(int)) { + int instr; + + if (probe_kernel_address((const void *)pc, instr)) { + seq_buf_printf(&s, "XXXXXXXX "); + continue; + } + seq_buf_printf(&s, regs->nip == pc ? "<%08x> " : "%08x ", instr); } - pc += sizeof(int); + if (!seq_buf_has_overflowed(&s)) + pr_info("%s[%d]: code: %s\n", current->comm, + current->pid, s.buffer); } - - pr_cont("\n"); } struct regbit { @@ -1485,6 +1483,15 @@ void flush_thread(void) #endif /* CONFIG_HAVE_HW_BREAKPOINT */ } +#ifdef CONFIG_PPC_BOOK3S_64 +void arch_setup_new_exec(void) +{ + if (radix_enabled()) + return; + hash__setup_new_exec(); +} +#endif + int set_thread_uses_vas(void) { #ifdef CONFIG_PPC_BOOK3S_64 @@ -1705,7 +1712,7 @@ int copy_thread(unsigned long clone_flags, unsigned long usp, p->thread.dscr = mfspr(SPRN_DSCR); } if (cpu_has_feature(CPU_FTR_HAS_PPR)) - p->thread.ppr = INIT_PPR; + childregs->ppr = DEFAULT_PPR; p->thread.tidr = 0; #endif @@ -1713,6 +1720,8 @@ int copy_thread(unsigned long clone_flags, unsigned long usp, return 0; } +void preload_new_slb_context(unsigned long start, unsigned long sp); + /* * Set up a thread for executing a new program */ @@ -1720,6 +1729,10 @@ void start_thread(struct pt_regs *regs, unsigned long start, unsigned long sp) { #ifdef CONFIG_PPC64 unsigned long load_addr = regs->gpr[2]; /* saved by ELF_PLAT_INIT */ + +#ifdef CONFIG_PPC_BOOK3S_64 + preload_new_slb_context(start, sp); +#endif #endif /* @@ -1810,6 +1823,7 @@ void start_thread(struct pt_regs *regs, unsigned long start, unsigned long sp) #ifdef CONFIG_VSX current->thread.used_vsr = 0; #endif + current->thread.load_slb = 0; current->thread.load_fp = 0; memset(¤t->thread.fp_state, 0, sizeof(current->thread.fp_state)); current->thread.fp_save_area = NULL; diff --git a/arch/powerpc/kernel/prom_init.c b/arch/powerpc/kernel/prom_init.c index 9b38a2e5dd35..f33ff4163a51 100644 --- a/arch/powerpc/kernel/prom_init.c +++ b/arch/powerpc/kernel/prom_init.c @@ -43,11 +43,13 @@ #include <asm/btext.h> #include <asm/sections.h> #include <asm/machdep.h> -#include <asm/opal.h> #include <asm/asm-prototypes.h> #include <linux/linux_logo.h> +/* All of prom_init bss lives here */ +#define __prombss __section(.bss.prominit) + /* * Eventually bump that one up */ @@ -87,7 +89,7 @@ #define OF_WORKAROUNDS 0 #else #define OF_WORKAROUNDS of_workarounds -int of_workarounds; +static int of_workarounds __prombss; #endif #define OF_WA_CLAIM 1 /* do phys/virt claim separately, then map */ @@ -148,29 +150,31 @@ extern void copy_and_flush(unsigned long dest, unsigned long src, unsigned long size, unsigned long offset); /* prom structure */ -static struct prom_t __initdata prom; +static struct prom_t __prombss prom; -static unsigned long prom_entry __initdata; +static unsigned long __prombss prom_entry; #define PROM_SCRATCH_SIZE 256 -static char __initdata of_stdout_device[256]; -static char __initdata prom_scratch[PROM_SCRATCH_SIZE]; +static char __prombss of_stdout_device[256]; +static char __prombss prom_scratch[PROM_SCRATCH_SIZE]; -static unsigned long __initdata dt_header_start; -static unsigned long __initdata dt_struct_start, dt_struct_end; -static unsigned long __initdata dt_string_start, dt_string_end; +static unsigned long __prombss dt_header_start; +static unsigned long __prombss dt_struct_start, dt_struct_end; +static unsigned long __prombss dt_string_start, dt_string_end; -static unsigned long __initdata prom_initrd_start, prom_initrd_end; +static unsigned long __prombss prom_initrd_start, prom_initrd_end; #ifdef CONFIG_PPC64 -static int __initdata prom_iommu_force_on; -static int __initdata prom_iommu_off; -static unsigned long __initdata prom_tce_alloc_start; -static unsigned long __initdata prom_tce_alloc_end; +static int __prombss prom_iommu_force_on; +static int __prombss prom_iommu_off; +static unsigned long __prombss prom_tce_alloc_start; +static unsigned long __prombss prom_tce_alloc_end; #endif -static bool prom_radix_disable __initdata = !IS_ENABLED(CONFIG_PPC_RADIX_MMU_DEFAULT); +#ifdef CONFIG_PPC_PSERIES +static bool __prombss prom_radix_disable; +#endif struct platform_support { bool hash_mmu; @@ -188,26 +192,25 @@ struct platform_support { #define PLATFORM_LPAR 0x0001 #define PLATFORM_POWERMAC 0x0400 #define PLATFORM_GENERIC 0x0500 -#define PLATFORM_OPAL 0x0600 -static int __initdata of_platform; +static int __prombss of_platform; -static char __initdata prom_cmd_line[COMMAND_LINE_SIZE]; +static char __prombss prom_cmd_line[COMMAND_LINE_SIZE]; -static unsigned long __initdata prom_memory_limit; +static unsigned long __prombss prom_memory_limit; -static unsigned long __initdata alloc_top; -static unsigned long __initdata alloc_top_high; -static unsigned long __initdata alloc_bottom; -static unsigned long __initdata rmo_top; -static unsigned long __initdata ram_top; +static unsigned long __prombss alloc_top; +static unsigned long __prombss alloc_top_high; +static unsigned long __prombss alloc_bottom; +static unsigned long __prombss rmo_top; +static unsigned long __prombss ram_top; -static struct mem_map_entry __initdata mem_reserve_map[MEM_RESERVE_MAP_SIZE]; -static int __initdata mem_reserve_cnt; +static struct mem_map_entry __prombss mem_reserve_map[MEM_RESERVE_MAP_SIZE]; +static int __prombss mem_reserve_cnt; -static cell_t __initdata regbuf[1024]; +static cell_t __prombss regbuf[1024]; -static bool rtas_has_query_cpu_stopped; +static bool __prombss rtas_has_query_cpu_stopped; /* @@ -522,8 +525,8 @@ static void add_string(char **str, const char *q) static char *tohex(unsigned int x) { - static char digits[] = "0123456789abcdef"; - static char result[9]; + static const char digits[] __initconst = "0123456789abcdef"; + static char result[9] __prombss; int i; result[8] = 0; @@ -664,6 +667,8 @@ static void __init early_cmdline_parse(void) #endif } +#ifdef CONFIG_PPC_PSERIES + prom_radix_disable = !IS_ENABLED(CONFIG_PPC_RADIX_MMU_DEFAULT); opt = strstr(prom_cmd_line, "disable_radix"); if (opt) { opt += 13; @@ -679,9 +684,10 @@ static void __init early_cmdline_parse(void) } if (prom_radix_disable) prom_debug("Radix disabled from cmdline\n"); +#endif /* CONFIG_PPC_PSERIES */ } -#if defined(CONFIG_PPC_PSERIES) || defined(CONFIG_PPC_POWERNV) +#ifdef CONFIG_PPC_PSERIES /* * The architecture vector has an array of PVR mask/value pairs, * followed by # option vectors - 1, followed by the option vectors. @@ -782,7 +788,7 @@ struct ibm_arch_vec { struct option_vector6 vec6; } __packed; -struct ibm_arch_vec __cacheline_aligned ibm_architecture_vec = { +static const struct ibm_arch_vec ibm_architecture_vec_template __initconst = { .pvrs = { { .mask = cpu_to_be32(0xfffe0000), /* POWER5/POWER5+ */ @@ -920,9 +926,11 @@ struct ibm_arch_vec __cacheline_aligned ibm_architecture_vec = { }, }; +static struct ibm_arch_vec __prombss ibm_architecture_vec ____cacheline_aligned; + /* Old method - ELF header with PT_NOTE sections only works on BE */ #ifdef __BIG_ENDIAN__ -static struct fake_elf { +static const struct fake_elf { Elf32_Ehdr elfhdr; Elf32_Phdr phdr[2]; struct chrpnote { @@ -955,7 +963,7 @@ static struct fake_elf { u32 ignore_me; } rpadesc; } rpanote; -} fake_elf = { +} fake_elf __initconst = { .elfhdr = { .e_ident = { 0x7f, 'E', 'L', 'F', ELFCLASS32, ELFDATA2MSB, EV_CURRENT }, @@ -1129,14 +1137,21 @@ static void __init prom_check_platform_support(void) }; int prop_len = prom_getproplen(prom.chosen, "ibm,arch-vec-5-platform-support"); + + /* First copy the architecture vec template */ + ibm_architecture_vec = ibm_architecture_vec_template; + if (prop_len > 1) { int i; - u8 vec[prop_len]; + u8 vec[8]; prom_debug("Found ibm,arch-vec-5-platform-support, len: %d\n", prop_len); + if (prop_len > sizeof(vec)) + prom_printf("WARNING: ibm,arch-vec-5-platform-support longer than expected (len: %d)\n", + prop_len); prom_getprop(prom.chosen, "ibm,arch-vec-5-platform-support", &vec, sizeof(vec)); - for (i = 0; i < prop_len; i += 2) { + for (i = 0; i < sizeof(vec); i += 2) { prom_debug("%d: index = 0x%x val = 0x%x\n", i / 2 , vec[i] , vec[i + 1]); @@ -1225,7 +1240,7 @@ static void __init prom_send_capabilities(void) } #endif /* __BIG_ENDIAN__ */ } -#endif /* #if defined(CONFIG_PPC_PSERIES) || defined(CONFIG_PPC_POWERNV) */ +#endif /* CONFIG_PPC_PSERIES */ /* * Memory allocation strategy... our layout is normally: @@ -1562,88 +1577,6 @@ static void __init prom_close_stdin(void) } } -#ifdef CONFIG_PPC_POWERNV - -#ifdef CONFIG_PPC_EARLY_DEBUG_OPAL -static u64 __initdata prom_opal_base; -static u64 __initdata prom_opal_entry; -#endif - -/* - * Allocate room for and instantiate OPAL - */ -static void __init prom_instantiate_opal(void) -{ - phandle opal_node; - ihandle opal_inst; - u64 base, entry; - u64 size = 0, align = 0x10000; - __be64 val64; - u32 rets[2]; - - prom_debug("prom_instantiate_opal: start...\n"); - - opal_node = call_prom("finddevice", 1, 1, ADDR("/ibm,opal")); - prom_debug("opal_node: %x\n", opal_node); - if (!PHANDLE_VALID(opal_node)) - return; - - val64 = 0; - prom_getprop(opal_node, "opal-runtime-size", &val64, sizeof(val64)); - size = be64_to_cpu(val64); - if (size == 0) - return; - val64 = 0; - prom_getprop(opal_node, "opal-runtime-alignment", &val64,sizeof(val64)); - align = be64_to_cpu(val64); - - base = alloc_down(size, align, 0); - if (base == 0) { - prom_printf("OPAL allocation failed !\n"); - return; - } - - opal_inst = call_prom("open", 1, 1, ADDR("/ibm,opal")); - if (!IHANDLE_VALID(opal_inst)) { - prom_printf("opening opal package failed (%x)\n", opal_inst); - return; - } - - prom_printf("instantiating opal at 0x%llx...", base); - - if (call_prom_ret("call-method", 4, 3, rets, - ADDR("load-opal-runtime"), - opal_inst, - base >> 32, base & 0xffffffff) != 0 - || (rets[0] == 0 && rets[1] == 0)) { - prom_printf(" failed\n"); - return; - } - entry = (((u64)rets[0]) << 32) | rets[1]; - - prom_printf(" done\n"); - - reserve_mem(base, size); - - prom_debug("opal base = 0x%llx\n", base); - prom_debug("opal align = 0x%llx\n", align); - prom_debug("opal entry = 0x%llx\n", entry); - prom_debug("opal size = 0x%llx\n", size); - - prom_setprop(opal_node, "/ibm,opal", "opal-base-address", - &base, sizeof(base)); - prom_setprop(opal_node, "/ibm,opal", "opal-entry-address", - &entry, sizeof(entry)); - -#ifdef CONFIG_PPC_EARLY_DEBUG_OPAL - prom_opal_base = base; - prom_opal_entry = entry; -#endif - prom_debug("prom_instantiate_opal: end...\n"); -} - -#endif /* CONFIG_PPC_POWERNV */ - /* * Allocate room for and instantiate RTAS */ @@ -2150,10 +2083,6 @@ static int __init prom_find_machine_type(void) } } #ifdef CONFIG_PPC64 - /* Try to detect OPAL */ - if (PHANDLE_VALID(call_prom("finddevice", 1, 1, ADDR("/ibm,opal")))) - return PLATFORM_OPAL; - /* Try to figure out if it's an IBM pSeries or any other * PAPR compliant platform. We assume it is if : * - /device_type is "chrp" (please, do NOT use that for future @@ -2202,7 +2131,7 @@ static void __init prom_check_displays(void) ihandle ih; int i; - static unsigned char default_colors[] = { + static const unsigned char default_colors[] __initconst = { 0x00, 0x00, 0x00, 0x00, 0x00, 0xaa, 0x00, 0xaa, 0x00, @@ -2398,7 +2327,7 @@ static void __init scan_dt_build_struct(phandle node, unsigned long *mem_start, char *namep, *prev_name, *sstart, *p, *ep, *lp, *path; unsigned long soff; unsigned char *valp; - static char pname[MAX_PROPERTY_NAME]; + static char pname[MAX_PROPERTY_NAME] __prombss; int l, room, has_phandle = 0; dt_push_token(OF_DT_BEGIN_NODE, mem_start, mem_end); @@ -2481,14 +2410,11 @@ static void __init scan_dt_build_struct(phandle node, unsigned long *mem_start, has_phandle = 1; } - /* Add a "linux,phandle" property if no "phandle" property already - * existed (can happen with OPAL) - */ + /* Add a "phandle" property if none already exist */ if (!has_phandle) { - soff = dt_find_string("linux,phandle"); + soff = dt_find_string("phandle"); if (soff == 0) - prom_printf("WARNING: Can't find string index for" - " <linux-phandle> node %s\n", path); + prom_printf("WARNING: Can't find string index for <phandle> node %s\n", path); else { dt_push_token(OF_DT_PROP, mem_start, mem_end); dt_push_token(4, mem_start, mem_end); @@ -2548,9 +2474,9 @@ static void __init flatten_device_tree(void) dt_string_start = mem_start; mem_start += 4; /* hole */ - /* Add "linux,phandle" in there, we'll need it */ + /* Add "phandle" in there, we'll need it */ namep = make_room(&mem_start, &mem_end, 16, 1); - strcpy(namep, "linux,phandle"); + strcpy(namep, "phandle"); mem_start = (unsigned long)namep + strlen(namep) + 1; /* Build string array */ @@ -3172,7 +3098,7 @@ unsigned long __init prom_init(unsigned long r3, unsigned long r4, */ early_cmdline_parse(); -#if defined(CONFIG_PPC_PSERIES) || defined(CONFIG_PPC_POWERNV) +#ifdef CONFIG_PPC_PSERIES /* * On pSeries, inform the firmware about our capabilities */ @@ -3216,15 +3142,9 @@ unsigned long __init prom_init(unsigned long r3, unsigned long r4, * On non-powermacs, try to instantiate RTAS. PowerMacs don't * have a usable RTAS implementation. */ - if (of_platform != PLATFORM_POWERMAC && - of_platform != PLATFORM_OPAL) + if (of_platform != PLATFORM_POWERMAC) prom_instantiate_rtas(); -#ifdef CONFIG_PPC_POWERNV - if (of_platform == PLATFORM_OPAL) - prom_instantiate_opal(); -#endif /* CONFIG_PPC_POWERNV */ - #ifdef CONFIG_PPC64 /* instantiate sml */ prom_instantiate_sml(); @@ -3237,8 +3157,7 @@ unsigned long __init prom_init(unsigned long r3, unsigned long r4, * * (This must be done after instanciating RTAS) */ - if (of_platform != PLATFORM_POWERMAC && - of_platform != PLATFORM_OPAL) + if (of_platform != PLATFORM_POWERMAC) prom_hold_cpus(); /* @@ -3282,11 +3201,9 @@ unsigned long __init prom_init(unsigned long r3, unsigned long r4, /* * in case stdin is USB and still active on IBM machines... * Unfortunately quiesce crashes on some powermacs if we have - * closed stdin already (in particular the powerbook 101). It - * appears that the OPAL version of OFW doesn't like it either. + * closed stdin already (in particular the powerbook 101). */ - if (of_platform != PLATFORM_POWERMAC && - of_platform != PLATFORM_OPAL) + if (of_platform != PLATFORM_POWERMAC) prom_close_stdin(); /* @@ -3304,10 +3221,8 @@ unsigned long __init prom_init(unsigned long r3, unsigned long r4, hdr = dt_header_start; /* Don't print anything after quiesce under OPAL, it crashes OFW */ - if (of_platform != PLATFORM_OPAL) { - prom_printf("Booting Linux via __start() @ 0x%lx ...\n", kbase); - prom_debug("->dt_header_start=0x%lx\n", hdr); - } + prom_printf("Booting Linux via __start() @ 0x%lx ...\n", kbase); + prom_debug("->dt_header_start=0x%lx\n", hdr); #ifdef CONFIG_PPC32 reloc_got2(-offset); @@ -3315,13 +3230,7 @@ unsigned long __init prom_init(unsigned long r3, unsigned long r4, unreloc_toc(); #endif -#ifdef CONFIG_PPC_EARLY_DEBUG_OPAL - /* OPAL early debug gets the OPAL base & entry in r8 and r9 */ - __start(hdr, kbase, 0, 0, 0, - prom_opal_base, prom_opal_entry); -#else __start(hdr, kbase, 0, 0, 0, 0, 0); -#endif return 0; } diff --git a/arch/powerpc/kernel/prom_init_check.sh b/arch/powerpc/kernel/prom_init_check.sh index acb6b9226352..667df97d2595 100644 --- a/arch/powerpc/kernel/prom_init_check.sh +++ b/arch/powerpc/kernel/prom_init_check.sh @@ -28,6 +28,18 @@ OBJ="$2" ERROR=0 +function check_section() +{ + file=$1 + section=$2 + size=$(objdump -h -j $section $file 2>/dev/null | awk "\$2 == \"$section\" {print \$3}") + size=${size:-0} + if [ $size -ne 0 ]; then + ERROR=1 + echo "Error: Section $section not empty in prom_init.c" >&2 + fi +} + for UNDEF in $($NM -u $OBJ | awk '{print $2}') do # On 64-bit nm gives us the function descriptors, which have @@ -66,4 +78,8 @@ do fi done +check_section $OBJ .data +check_section $OBJ .bss +check_section $OBJ .init.data + exit $ERROR diff --git a/arch/powerpc/kernel/ptrace.c b/arch/powerpc/kernel/ptrace.c index 9667666eb18e..afb819f4ca68 100644 --- a/arch/powerpc/kernel/ptrace.c +++ b/arch/powerpc/kernel/ptrace.c @@ -297,7 +297,7 @@ int ptrace_get_reg(struct task_struct *task, int regno, unsigned long *data) } #endif - if (regno < (sizeof(struct pt_regs) / sizeof(unsigned long))) { + if (regno < (sizeof(struct user_pt_regs) / sizeof(unsigned long))) { *data = ((unsigned long *)task->thread.regs)[regno]; return 0; } @@ -360,10 +360,10 @@ static int gpr_get(struct task_struct *target, const struct user_regset *regset, ret = user_regset_copyout(&pos, &count, &kbuf, &ubuf, &target->thread.regs->orig_gpr3, offsetof(struct pt_regs, orig_gpr3), - sizeof(struct pt_regs)); + sizeof(struct user_pt_regs)); if (!ret) ret = user_regset_copyout_zero(&pos, &count, &kbuf, &ubuf, - sizeof(struct pt_regs), -1); + sizeof(struct user_pt_regs), -1); return ret; } @@ -853,10 +853,10 @@ static int tm_cgpr_get(struct task_struct *target, ret = user_regset_copyout(&pos, &count, &kbuf, &ubuf, &target->thread.ckpt_regs.orig_gpr3, offsetof(struct pt_regs, orig_gpr3), - sizeof(struct pt_regs)); + sizeof(struct user_pt_regs)); if (!ret) ret = user_regset_copyout_zero(&pos, &count, &kbuf, &ubuf, - sizeof(struct pt_regs), -1); + sizeof(struct user_pt_regs), -1); return ret; } @@ -1609,7 +1609,7 @@ static int ppr_get(struct task_struct *target, void *kbuf, void __user *ubuf) { return user_regset_copyout(&pos, &count, &kbuf, &ubuf, - &target->thread.ppr, 0, sizeof(u64)); + &target->thread.regs->ppr, 0, sizeof(u64)); } static int ppr_set(struct task_struct *target, @@ -1618,7 +1618,7 @@ static int ppr_set(struct task_struct *target, const void *kbuf, const void __user *ubuf) { return user_regset_copyin(&pos, &count, &kbuf, &ubuf, - &target->thread.ppr, 0, sizeof(u64)); + &target->thread.regs->ppr, 0, sizeof(u64)); } static int dscr_get(struct task_struct *target, @@ -2508,6 +2508,7 @@ void ptrace_disable(struct task_struct *child) { /* make sure the single step bit is not set. */ user_disable_single_step(child); + clear_tsk_thread_flag(child, TIF_SYSCALL_EMU); } #ifdef CONFIG_PPC_ADV_DEBUG_REGS @@ -3130,7 +3131,7 @@ long arch_ptrace(struct task_struct *child, long request, case PTRACE_GETREGS: /* Get all pt_regs from the child. */ return copy_regset_to_user(child, &user_ppc_native_view, REGSET_GPR, - 0, sizeof(struct pt_regs), + 0, sizeof(struct user_pt_regs), datavp); #ifdef CONFIG_PPC64 @@ -3139,7 +3140,7 @@ long arch_ptrace(struct task_struct *child, long request, case PTRACE_SETREGS: /* Set all gp regs in the child. */ return copy_regset_from_user(child, &user_ppc_native_view, REGSET_GPR, - 0, sizeof(struct pt_regs), + 0, sizeof(struct user_pt_regs), datavp); case PTRACE_GETFPREGS: /* Get the child FPU state (FPR0...31 + FPSCR) */ @@ -3264,6 +3265,16 @@ long do_syscall_trace_enter(struct pt_regs *regs) { user_exit(); + if (test_thread_flag(TIF_SYSCALL_EMU)) { + ptrace_report_syscall(regs); + /* + * Returning -1 will skip the syscall execution. We want to + * avoid clobbering any register also, thus, not 'gotoing' + * skip label. + */ + return -1; + } + /* * The tracer may decide to abort the syscall, if so tracehook * will return !0. Note that the tracer may also just change @@ -3324,3 +3335,42 @@ void do_syscall_trace_leave(struct pt_regs *regs) user_enter(); } + +void __init pt_regs_check(void) +{ + BUILD_BUG_ON(offsetof(struct pt_regs, gpr) != + offsetof(struct user_pt_regs, gpr)); + BUILD_BUG_ON(offsetof(struct pt_regs, nip) != + offsetof(struct user_pt_regs, nip)); + BUILD_BUG_ON(offsetof(struct pt_regs, msr) != + offsetof(struct user_pt_regs, msr)); + BUILD_BUG_ON(offsetof(struct pt_regs, msr) != + offsetof(struct user_pt_regs, msr)); + BUILD_BUG_ON(offsetof(struct pt_regs, orig_gpr3) != + offsetof(struct user_pt_regs, orig_gpr3)); + BUILD_BUG_ON(offsetof(struct pt_regs, ctr) != + offsetof(struct user_pt_regs, ctr)); + BUILD_BUG_ON(offsetof(struct pt_regs, link) != + offsetof(struct user_pt_regs, link)); + BUILD_BUG_ON(offsetof(struct pt_regs, xer) != + offsetof(struct user_pt_regs, xer)); + BUILD_BUG_ON(offsetof(struct pt_regs, ccr) != + offsetof(struct user_pt_regs, ccr)); +#ifdef __powerpc64__ + BUILD_BUG_ON(offsetof(struct pt_regs, softe) != + offsetof(struct user_pt_regs, softe)); +#else + BUILD_BUG_ON(offsetof(struct pt_regs, mq) != + offsetof(struct user_pt_regs, mq)); +#endif + BUILD_BUG_ON(offsetof(struct pt_regs, trap) != + offsetof(struct user_pt_regs, trap)); + BUILD_BUG_ON(offsetof(struct pt_regs, dar) != + offsetof(struct user_pt_regs, dar)); + BUILD_BUG_ON(offsetof(struct pt_regs, dsisr) != + offsetof(struct user_pt_regs, dsisr)); + BUILD_BUG_ON(offsetof(struct pt_regs, result) != + offsetof(struct user_pt_regs, result)); + + BUILD_BUG_ON(sizeof(struct user_pt_regs) > sizeof(struct pt_regs)); +} diff --git a/arch/powerpc/kernel/rtas.c b/arch/powerpc/kernel/rtas.c index 8afd146bc9c7..de35bd8f047f 100644 --- a/arch/powerpc/kernel/rtas.c +++ b/arch/powerpc/kernel/rtas.c @@ -981,7 +981,15 @@ int rtas_ibm_suspend_me(u64 handle) goto out; } - stop_topology_update(); + cpu_hotplug_disable(); + + /* Check if we raced with a CPU-Offline Operation */ + if (unlikely(!cpumask_equal(cpu_present_mask, cpu_online_mask))) { + pr_err("%s: Raced against a concurrent CPU-Offline\n", + __func__); + atomic_set(&data.error, -EBUSY); + goto out_hotplug_enable; + } /* Call function on all CPUs. One of us will make the * rtas call @@ -994,7 +1002,8 @@ int rtas_ibm_suspend_me(u64 handle) if (atomic_read(&data.error) != 0) printk(KERN_ERR "Error doing global join\n"); - start_topology_update(); +out_hotplug_enable: + cpu_hotplug_enable(); /* Take down CPUs not online prior to suspend */ cpuret = rtas_offline_cpus_mask(offline_mask); diff --git a/arch/powerpc/kernel/rtasd.c b/arch/powerpc/kernel/rtasd.c index 44d66c33d59d..38cadae4ca4f 100644 --- a/arch/powerpc/kernel/rtasd.c +++ b/arch/powerpc/kernel/rtasd.c @@ -91,6 +91,8 @@ static char *rtas_event_type(int type) return "Dump Notification Event"; case RTAS_TYPE_PRRN: return "Platform Resource Reassignment Event"; + case RTAS_TYPE_HOTPLUG: + return "Hotplug Event"; } return rtas_type[0]; @@ -150,8 +152,10 @@ static void printk_log_rtas(char *buf, int len) } else { struct rtas_error_log *errlog = (struct rtas_error_log *)buf; - printk(RTAS_DEBUG "event: %d, Type: %s, Severity: %d\n", - error_log_cnt, rtas_event_type(rtas_error_type(errlog)), + printk(RTAS_DEBUG "event: %d, Type: %s (%d), Severity: %d\n", + error_log_cnt, + rtas_event_type(rtas_error_type(errlog)), + rtas_error_type(errlog), rtas_error_severity(errlog)); } } @@ -274,27 +278,16 @@ void pSeries_log_error(char *buf, unsigned int err_type, int fatal) } #ifdef CONFIG_PPC_PSERIES -static s32 prrn_update_scope; - -static void prrn_work_fn(struct work_struct *work) +static void handle_prrn_event(s32 scope) { /* * For PRRN, we must pass the negative of the scope value in * the RTAS event. */ - pseries_devicetree_update(-prrn_update_scope); + pseries_devicetree_update(-scope); numa_update_cpu_topology(false); } -static DECLARE_WORK(prrn_work, prrn_work_fn); - -static void prrn_schedule_update(u32 scope) -{ - flush_work(&prrn_work); - prrn_update_scope = scope; - schedule_work(&prrn_work); -} - static void handle_rtas_event(const struct rtas_error_log *log) { if (rtas_error_type(log) != RTAS_TYPE_PRRN || !prrn_is_enabled()) @@ -303,7 +296,7 @@ static void handle_rtas_event(const struct rtas_error_log *log) /* For PRRN Events the extended log length is used to denote * the scope for calling rtas update-nodes. */ - prrn_schedule_update(rtas_error_extended_log_length(log)); + handle_prrn_event(rtas_error_extended_log_length(log)); } #else diff --git a/arch/powerpc/kernel/setup-common.c b/arch/powerpc/kernel/setup-common.c index 93fa0c99681e..9ca9db707bcb 100644 --- a/arch/powerpc/kernel/setup-common.c +++ b/arch/powerpc/kernel/setup-common.c @@ -33,6 +33,7 @@ #include <linux/serial_8250.h> #include <linux/percpu.h> #include <linux/memblock.h> +#include <linux/bootmem.h> #include <linux/of_platform.h> #include <linux/hugetlb.h> #include <asm/debugfs.h> @@ -966,6 +967,8 @@ void __init setup_arch(char **cmdline_p) initmem_init(); + early_memtest(min_low_pfn << PAGE_SHIFT, max_low_pfn << PAGE_SHIFT); + #ifdef CONFIG_DUMMY_CONSOLE conswitchp = &dummy_con; #endif diff --git a/arch/powerpc/kernel/setup_64.c b/arch/powerpc/kernel/setup_64.c index 6a501b25dd85..faf00222b324 100644 --- a/arch/powerpc/kernel/setup_64.c +++ b/arch/powerpc/kernel/setup_64.c @@ -243,13 +243,19 @@ static void cpu_ready_for_interrupts(void) } /* - * Fixup HFSCR:TM based on CPU features. The bit is set by our - * early asm init because at that point we haven't updated our - * CPU features from firmware and device-tree. Here we have, - * so let's do it. + * Set HFSCR:TM based on CPU features: + * In the special case of TM no suspend (P9N DD2.1), Linux is + * told TM is off via the dt-ftrs but told to (partially) use + * it via OPAL_REINIT_CPUS_TM_SUSPEND_DISABLED. So HFSCR[TM] + * will be off from dt-ftrs but we need to turn it on for the + * no suspend case. */ - if (cpu_has_feature(CPU_FTR_HVMODE) && !cpu_has_feature(CPU_FTR_TM_COMP)) - mtspr(SPRN_HFSCR, mfspr(SPRN_HFSCR) & ~HFSCR_TM); + if (cpu_has_feature(CPU_FTR_HVMODE)) { + if (cpu_has_feature(CPU_FTR_TM_COMP)) + mtspr(SPRN_HFSCR, mfspr(SPRN_HFSCR) | HFSCR_TM); + else + mtspr(SPRN_HFSCR, mfspr(SPRN_HFSCR) & ~HFSCR_TM); + } /* Set IR and DR in PACA MSR */ get_paca()->kernel_msr = MSR_KERNEL; diff --git a/arch/powerpc/kernel/smp.c b/arch/powerpc/kernel/smp.c index 61c1fadbc644..3f15edf25a0d 100644 --- a/arch/powerpc/kernel/smp.c +++ b/arch/powerpc/kernel/smp.c @@ -34,6 +34,8 @@ #include <linux/topology.h> #include <linux/profile.h> #include <linux/processor.h> +#include <linux/random.h> +#include <linux/stackprotector.h> #include <asm/ptrace.h> #include <linux/atomic.h> @@ -74,14 +76,32 @@ static DEFINE_PER_CPU(int, cpu_state) = { 0 }; #endif struct thread_info *secondary_ti; +bool has_big_cores; DEFINE_PER_CPU(cpumask_var_t, cpu_sibling_map); +DEFINE_PER_CPU(cpumask_var_t, cpu_smallcore_map); DEFINE_PER_CPU(cpumask_var_t, cpu_l2_cache_map); DEFINE_PER_CPU(cpumask_var_t, cpu_core_map); EXPORT_PER_CPU_SYMBOL(cpu_sibling_map); EXPORT_PER_CPU_SYMBOL(cpu_l2_cache_map); EXPORT_PER_CPU_SYMBOL(cpu_core_map); +EXPORT_SYMBOL_GPL(has_big_cores); + +#define MAX_THREAD_LIST_SIZE 8 +#define THREAD_GROUP_SHARE_L1 1 +struct thread_groups { + unsigned int property; + unsigned int nr_groups; + unsigned int threads_per_group; + unsigned int thread_list[MAX_THREAD_LIST_SIZE]; +}; + +/* + * On big-cores system, cpu_l1_cache_map for each CPU corresponds to + * the set its siblings that share the L1-cache. + */ +DEFINE_PER_CPU(cpumask_var_t, cpu_l1_cache_map); /* SMP operations for this machine */ struct smp_ops_t *smp_ops; @@ -674,6 +694,185 @@ static void set_cpus_unrelated(int i, int j, } #endif +/* + * parse_thread_groups: Parses the "ibm,thread-groups" device tree + * property for the CPU device node @dn and stores + * the parsed output in the thread_groups + * structure @tg if the ibm,thread-groups[0] + * matches @property. + * + * @dn: The device node of the CPU device. + * @tg: Pointer to a thread group structure into which the parsed + * output of "ibm,thread-groups" is stored. + * @property: The property of the thread-group that the caller is + * interested in. + * + * ibm,thread-groups[0..N-1] array defines which group of threads in + * the CPU-device node can be grouped together based on the property. + * + * ibm,thread-groups[0] tells us the property based on which the + * threads are being grouped together. If this value is 1, it implies + * that the threads in the same group share L1, translation cache. + * + * ibm,thread-groups[1] tells us how many such thread groups exist. + * + * ibm,thread-groups[2] tells us the number of threads in each such + * group. + * + * ibm,thread-groups[3..N-1] is the list of threads identified by + * "ibm,ppc-interrupt-server#s" arranged as per their membership in + * the grouping. + * + * Example: If ibm,thread-groups = [1,2,4,5,6,7,8,9,10,11,12] it + * implies that there are 2 groups of 4 threads each, where each group + * of threads share L1, translation cache. + * + * The "ibm,ppc-interrupt-server#s" of the first group is {5,6,7,8} + * and the "ibm,ppc-interrupt-server#s" of the second group is {9, 10, + * 11, 12} structure + * + * Returns 0 on success, -EINVAL if the property does not exist, + * -ENODATA if property does not have a value, and -EOVERFLOW if the + * property data isn't large enough. + */ +static int parse_thread_groups(struct device_node *dn, + struct thread_groups *tg, + unsigned int property) +{ + int i; + u32 thread_group_array[3 + MAX_THREAD_LIST_SIZE]; + u32 *thread_list; + size_t total_threads; + int ret; + + ret = of_property_read_u32_array(dn, "ibm,thread-groups", + thread_group_array, 3); + if (ret) + return ret; + + tg->property = thread_group_array[0]; + tg->nr_groups = thread_group_array[1]; + tg->threads_per_group = thread_group_array[2]; + if (tg->property != property || + tg->nr_groups < 1 || + tg->threads_per_group < 1) + return -ENODATA; + + total_threads = tg->nr_groups * tg->threads_per_group; + + ret = of_property_read_u32_array(dn, "ibm,thread-groups", + thread_group_array, + 3 + total_threads); + if (ret) + return ret; + + thread_list = &thread_group_array[3]; + + for (i = 0 ; i < total_threads; i++) + tg->thread_list[i] = thread_list[i]; + + return 0; +} + +/* + * get_cpu_thread_group_start : Searches the thread group in tg->thread_list + * that @cpu belongs to. + * + * @cpu : The logical CPU whose thread group is being searched. + * @tg : The thread-group structure of the CPU node which @cpu belongs + * to. + * + * Returns the index to tg->thread_list that points to the the start + * of the thread_group that @cpu belongs to. + * + * Returns -1 if cpu doesn't belong to any of the groups pointed to by + * tg->thread_list. + */ +static int get_cpu_thread_group_start(int cpu, struct thread_groups *tg) +{ + int hw_cpu_id = get_hard_smp_processor_id(cpu); + int i, j; + + for (i = 0; i < tg->nr_groups; i++) { + int group_start = i * tg->threads_per_group; + + for (j = 0; j < tg->threads_per_group; j++) { + int idx = group_start + j; + + if (tg->thread_list[idx] == hw_cpu_id) + return group_start; + } + } + + return -1; +} + +static int init_cpu_l1_cache_map(int cpu) + +{ + struct device_node *dn = of_get_cpu_node(cpu, NULL); + struct thread_groups tg = {.property = 0, + .nr_groups = 0, + .threads_per_group = 0}; + int first_thread = cpu_first_thread_sibling(cpu); + int i, cpu_group_start = -1, err = 0; + + if (!dn) + return -ENODATA; + + err = parse_thread_groups(dn, &tg, THREAD_GROUP_SHARE_L1); + if (err) + goto out; + + zalloc_cpumask_var_node(&per_cpu(cpu_l1_cache_map, cpu), + GFP_KERNEL, + cpu_to_node(cpu)); + + cpu_group_start = get_cpu_thread_group_start(cpu, &tg); + + if (unlikely(cpu_group_start == -1)) { + WARN_ON_ONCE(1); + err = -ENODATA; + goto out; + } + + for (i = first_thread; i < first_thread + threads_per_core; i++) { + int i_group_start = get_cpu_thread_group_start(i, &tg); + + if (unlikely(i_group_start == -1)) { + WARN_ON_ONCE(1); + err = -ENODATA; + goto out; + } + + if (i_group_start == cpu_group_start) + cpumask_set_cpu(i, per_cpu(cpu_l1_cache_map, cpu)); + } + +out: + of_node_put(dn); + return err; +} + +static int init_big_cores(void) +{ + int cpu; + + for_each_possible_cpu(cpu) { + int err = init_cpu_l1_cache_map(cpu); + + if (err) + return err; + + zalloc_cpumask_var_node(&per_cpu(cpu_smallcore_map, cpu), + GFP_KERNEL, + cpu_to_node(cpu)); + } + + has_big_cores = true; + return 0; +} + void __init smp_prepare_cpus(unsigned int max_cpus) { unsigned int cpu; @@ -712,6 +911,12 @@ void __init smp_prepare_cpus(unsigned int max_cpus) cpumask_set_cpu(boot_cpuid, cpu_l2_cache_mask(boot_cpuid)); cpumask_set_cpu(boot_cpuid, cpu_core_mask(boot_cpuid)); + init_big_cores(); + if (has_big_cores) { + cpumask_set_cpu(boot_cpuid, + cpu_smallcore_mask(boot_cpuid)); + } + if (smp_ops && smp_ops->probe) smp_ops->probe(); } @@ -995,10 +1200,28 @@ static void remove_cpu_from_masks(int cpu) set_cpus_unrelated(cpu, i, cpu_core_mask); set_cpus_unrelated(cpu, i, cpu_l2_cache_mask); set_cpus_unrelated(cpu, i, cpu_sibling_mask); + if (has_big_cores) + set_cpus_unrelated(cpu, i, cpu_smallcore_mask); } } #endif +static inline void add_cpu_to_smallcore_masks(int cpu) +{ + struct cpumask *this_l1_cache_map = per_cpu(cpu_l1_cache_map, cpu); + int i, first_thread = cpu_first_thread_sibling(cpu); + + if (!has_big_cores) + return; + + cpumask_set_cpu(cpu, cpu_smallcore_mask(cpu)); + + for (i = first_thread; i < first_thread + threads_per_core; i++) { + if (cpu_online(i) && cpumask_test_cpu(i, this_l1_cache_map)) + set_cpus_related(i, cpu, cpu_smallcore_mask); + } +} + static void add_cpu_to_masks(int cpu) { int first_thread = cpu_first_thread_sibling(cpu); @@ -1015,6 +1238,7 @@ static void add_cpu_to_masks(int cpu) if (cpu_online(i)) set_cpus_related(i, cpu, cpu_sibling_mask); + add_cpu_to_smallcore_masks(cpu); /* * Copy the thread sibling mask into the cache sibling mask * and mark any CPUs that share an L2 with this CPU. @@ -1044,6 +1268,7 @@ static bool shared_caches; void start_secondary(void *unused) { unsigned int cpu = smp_processor_id(); + struct cpumask *(*sibling_mask)(int) = cpu_sibling_mask; mmgrab(&init_mm); current->active_mm = &init_mm; @@ -1069,11 +1294,13 @@ void start_secondary(void *unused) /* Update topology CPU masks */ add_cpu_to_masks(cpu); + if (has_big_cores) + sibling_mask = cpu_smallcore_mask; /* * Check for any shared caches. Note that this must be done on a * per-core basis because one core in the pair might be disabled. */ - if (!cpumask_equal(cpu_l2_cache_mask(cpu), cpu_sibling_mask(cpu))) + if (!cpumask_equal(cpu_l2_cache_mask(cpu), sibling_mask(cpu))) shared_caches = true; set_numa_node(numa_cpu_lookup_table[cpu]); @@ -1083,6 +1310,8 @@ void start_secondary(void *unused) notify_cpu_starting(cpu); set_cpu_online(cpu, true); + boot_init_stack_canary(); + local_irq_enable(); /* We can enable ftrace for secondary cpus now */ @@ -1140,6 +1369,13 @@ static const struct cpumask *shared_cache_mask(int cpu) return cpu_l2_cache_mask(cpu); } +#ifdef CONFIG_SCHED_SMT +static const struct cpumask *smallcore_smt_mask(int cpu) +{ + return cpu_smallcore_mask(cpu); +} +#endif + static struct sched_domain_topology_level power9_topology[] = { #ifdef CONFIG_SCHED_SMT { cpu_smt_mask, powerpc_smt_flags, SD_INIT_NAME(SMT) }, @@ -1167,6 +1403,13 @@ void __init smp_cpus_done(unsigned int max_cpus) shared_proc_topology_init(); dump_numa_cpu_topology(); +#ifdef CONFIG_SCHED_SMT + if (has_big_cores) { + pr_info("Using small cores at SMT level\n"); + power9_topology[0].mask = smallcore_smt_mask; + powerpc_topology[0].mask = smallcore_smt_mask; + } +#endif /* * If any CPU detects that it's sharing a cache with another CPU then * use the deeper topology that is aware of this sharing. diff --git a/arch/powerpc/kernel/swsusp_asm64.S b/arch/powerpc/kernel/swsusp_asm64.S index f83bf6f72cb0..185216becb8b 100644 --- a/arch/powerpc/kernel/swsusp_asm64.S +++ b/arch/powerpc/kernel/swsusp_asm64.S @@ -262,7 +262,7 @@ END_FW_FTR_SECTION_IFCLR(FW_FEATURE_LPAR) addi r1,r1,-128 #ifdef CONFIG_PPC_BOOK3S_64 - bl slb_flush_and_rebolt + bl slb_flush_and_restore_bolted #endif bl do_after_copyback addi r1,r1,128 diff --git a/arch/powerpc/kernel/time.c b/arch/powerpc/kernel/time.c index 70f145e02487..3646affae963 100644 --- a/arch/powerpc/kernel/time.c +++ b/arch/powerpc/kernel/time.c @@ -111,6 +111,7 @@ struct clock_event_device decrementer_clockevent = { .rating = 200, .irq = 0, .set_next_event = decrementer_set_next_event, + .set_state_oneshot_stopped = decrementer_shutdown, .set_state_shutdown = decrementer_shutdown, .tick_resume = decrementer_shutdown, .features = CLOCK_EVT_FEAT_ONESHOT | @@ -175,7 +176,7 @@ static void calc_cputime_factors(void) * Read the SPURR on systems that have it, otherwise the PURR, * or if that doesn't exist return the timebase value passed in. */ -static unsigned long read_spurr(unsigned long tb) +static inline unsigned long read_spurr(unsigned long tb) { if (cpu_has_feature(CPU_FTR_SPURR)) return mfspr(SPRN_SPURR); @@ -281,26 +282,17 @@ static inline u64 calculate_stolen_time(u64 stop_tb) * Account time for a transition between system, hard irq * or soft irq state. */ -static unsigned long vtime_delta(struct task_struct *tsk, - unsigned long *stime_scaled, - unsigned long *steal_time) +static unsigned long vtime_delta_scaled(struct cpu_accounting_data *acct, + unsigned long now, unsigned long stime) { - unsigned long now, nowscaled, deltascaled; - unsigned long stime; + unsigned long stime_scaled = 0; +#ifdef CONFIG_ARCH_HAS_SCALED_CPUTIME + unsigned long nowscaled, deltascaled; unsigned long utime, utime_scaled; - struct cpu_accounting_data *acct = get_accounting(tsk); - WARN_ON_ONCE(!irqs_disabled()); - - now = mftb(); nowscaled = read_spurr(now); - stime = now - acct->starttime; - acct->starttime = now; deltascaled = nowscaled - acct->startspurr; acct->startspurr = nowscaled; - - *steal_time = calculate_stolen_time(now); - utime = acct->utime - acct->utime_sspurr; acct->utime_sspurr = acct->utime; @@ -314,17 +306,38 @@ static unsigned long vtime_delta(struct task_struct *tsk, * the user ticks get saved up in paca->user_time_scaled to be * used by account_process_tick. */ - *stime_scaled = stime; + stime_scaled = stime; utime_scaled = utime; if (deltascaled != stime + utime) { if (utime) { - *stime_scaled = deltascaled * stime / (stime + utime); - utime_scaled = deltascaled - *stime_scaled; + stime_scaled = deltascaled * stime / (stime + utime); + utime_scaled = deltascaled - stime_scaled; } else { - *stime_scaled = deltascaled; + stime_scaled = deltascaled; } } acct->utime_scaled += utime_scaled; +#endif + + return stime_scaled; +} + +static unsigned long vtime_delta(struct task_struct *tsk, + unsigned long *stime_scaled, + unsigned long *steal_time) +{ + unsigned long now, stime; + struct cpu_accounting_data *acct = get_accounting(tsk); + + WARN_ON_ONCE(!irqs_disabled()); + + now = mftb(); + stime = now - acct->starttime; + acct->starttime = now; + + *stime_scaled = vtime_delta_scaled(acct, now, stime); + + *steal_time = calculate_stolen_time(now); return stime; } @@ -341,7 +354,9 @@ void vtime_account_system(struct task_struct *tsk) if ((tsk->flags & PF_VCPU) && !irq_count()) { acct->gtime += stime; +#ifdef CONFIG_ARCH_HAS_SCALED_CPUTIME acct->utime_scaled += stime_scaled; +#endif } else { if (hardirq_count()) acct->hardirq_time += stime; @@ -350,7 +365,9 @@ void vtime_account_system(struct task_struct *tsk) else acct->stime += stime; +#ifdef CONFIG_ARCH_HAS_SCALED_CPUTIME acct->stime_scaled += stime_scaled; +#endif } } EXPORT_SYMBOL_GPL(vtime_account_system); @@ -364,6 +381,21 @@ void vtime_account_idle(struct task_struct *tsk) acct->idle_time += stime + steal_time; } +static void vtime_flush_scaled(struct task_struct *tsk, + struct cpu_accounting_data *acct) +{ +#ifdef CONFIG_ARCH_HAS_SCALED_CPUTIME + if (acct->utime_scaled) + tsk->utimescaled += cputime_to_nsecs(acct->utime_scaled); + if (acct->stime_scaled) + tsk->stimescaled += cputime_to_nsecs(acct->stime_scaled); + + acct->utime_scaled = 0; + acct->utime_sspurr = 0; + acct->stime_scaled = 0; +#endif +} + /* * Account the whole cputime accumulated in the paca * Must be called with interrupts disabled. @@ -378,14 +410,13 @@ void vtime_flush(struct task_struct *tsk) if (acct->utime) account_user_time(tsk, cputime_to_nsecs(acct->utime)); - if (acct->utime_scaled) - tsk->utimescaled += cputime_to_nsecs(acct->utime_scaled); - if (acct->gtime) account_guest_time(tsk, cputime_to_nsecs(acct->gtime)); - if (acct->steal_time) + if (IS_ENABLED(CONFIG_PPC_SPLPAR) && acct->steal_time) { account_steal_time(cputime_to_nsecs(acct->steal_time)); + acct->steal_time = 0; + } if (acct->idle_time) account_idle_time(cputime_to_nsecs(acct->idle_time)); @@ -393,8 +424,6 @@ void vtime_flush(struct task_struct *tsk) if (acct->stime) account_system_index_time(tsk, cputime_to_nsecs(acct->stime), CPUTIME_SYSTEM); - if (acct->stime_scaled) - tsk->stimescaled += cputime_to_nsecs(acct->stime_scaled); if (acct->hardirq_time) account_system_index_time(tsk, cputime_to_nsecs(acct->hardirq_time), @@ -403,14 +432,12 @@ void vtime_flush(struct task_struct *tsk) account_system_index_time(tsk, cputime_to_nsecs(acct->softirq_time), CPUTIME_SOFTIRQ); + vtime_flush_scaled(tsk, acct); + acct->utime = 0; - acct->utime_scaled = 0; - acct->utime_sspurr = 0; acct->gtime = 0; - acct->steal_time = 0; acct->idle_time = 0; acct->stime = 0; - acct->stime_scaled = 0; acct->hardirq_time = 0; acct->softirq_time = 0; } @@ -984,10 +1011,14 @@ static void register_decrementer_clockevent(int cpu) *dec = decrementer_clockevent; dec->cpumask = cpumask_of(cpu); + clockevents_config_and_register(dec, ppc_tb_freq, 2, decrementer_max); + printk_once(KERN_DEBUG "clockevent: %s mult[%x] shift[%d] cpu[%d]\n", dec->name, dec->mult, dec->shift, cpu); - clockevents_register_device(dec); + /* Set values for KVM, see kvm_emulate_dec() */ + decrementer_clockevent.mult = dec->mult; + decrementer_clockevent.shift = dec->shift; } static void enable_large_decrementer(void) @@ -1035,18 +1066,7 @@ static void __init set_decrementer_max(void) static void __init init_decrementer_clockevent(void) { - int cpu = smp_processor_id(); - - clockevents_calc_mult_shift(&decrementer_clockevent, ppc_tb_freq, 4); - - decrementer_clockevent.max_delta_ns = - clockevent_delta2ns(decrementer_max, &decrementer_clockevent); - decrementer_clockevent.max_delta_ticks = decrementer_max; - decrementer_clockevent.min_delta_ns = - clockevent_delta2ns(2, &decrementer_clockevent); - decrementer_clockevent.min_delta_ticks = 2; - - register_decrementer_clockevent(cpu); + register_decrementer_clockevent(smp_processor_id()); } void secondary_cpu_time_init(void) diff --git a/arch/powerpc/kernel/tm.S b/arch/powerpc/kernel/tm.S index 7716374786bd..9fabdce255cd 100644 --- a/arch/powerpc/kernel/tm.S +++ b/arch/powerpc/kernel/tm.S @@ -92,13 +92,14 @@ _GLOBAL(tm_abort) blr EXPORT_SYMBOL_GPL(tm_abort); -/* void tm_reclaim(struct thread_struct *thread, +/* + * void tm_reclaim(struct thread_struct *thread, * uint8_t cause) * * - Performs a full reclaim. This destroys outstanding - * transactions and updates thread->regs.tm_ckpt_* with the - * original checkpointed state. Note that thread->regs is - * unchanged. + * transactions and updates thread.ckpt_regs, thread.ckfp_state and + * thread.ckvr_state with the original checkpointed state. Note that + * thread->regs is unchanged. * * Purpose is to both abort transactions of, and preserve the state of, * a transactions at a context switch. We preserve/restore both sets of process @@ -163,15 +164,16 @@ _GLOBAL(tm_reclaim) */ TRECLAIM(R4) /* Cause in r4 */ - /* ******************** GPRs ******************** */ - /* Stash the checkpointed r13 away in the scratch SPR and get the real - * paca + /* + * ******************** GPRs ******************** + * Stash the checkpointed r13 in the scratch SPR and get the real paca. */ SET_SCRATCH0(r13) GET_PACA(r13) - /* Stash the checkpointed r1 away in paca tm_scratch and get the real - * stack pointer back + /* + * Stash the checkpointed r1 away in paca->tm_scratch and get the real + * stack pointer back into r1. */ std r1, PACATMSCRATCH(r13) ld r1, PACAR1(r13) @@ -209,14 +211,15 @@ _GLOBAL(tm_reclaim) addi r7, r12, PT_CKPT_REGS /* Thread's ckpt_regs */ - /* Make r7 look like an exception frame so that we - * can use the neat GPRx(n) macros. r7 is NOT a pt_regs ptr! + /* + * Make r7 look like an exception frame so that we can use the neat + * GPRx(n) macros. r7 is NOT a pt_regs ptr! */ subi r7, r7, STACK_FRAME_OVERHEAD /* Sync the userland GPRs 2-12, 14-31 to thread->regs: */ SAVE_GPR(0, r7) /* user r0 */ - SAVE_GPR(2, r7) /* user r2 */ + SAVE_GPR(2, r7) /* user r2 */ SAVE_4GPRS(3, r7) /* user r3-r6 */ SAVE_GPR(8, r7) /* user r8 */ SAVE_GPR(9, r7) /* user r9 */ @@ -237,7 +240,8 @@ _GLOBAL(tm_reclaim) /* ******************** NIP ******************** */ mfspr r3, SPRN_TFHAR std r3, _NIP(r7) /* Returns to failhandler */ - /* The checkpointed NIP is ignored when rescheduling/rechkpting, + /* + * The checkpointed NIP is ignored when rescheduling/rechkpting, * but is used in signal return to 'wind back' to the abort handler. */ @@ -260,12 +264,13 @@ _GLOBAL(tm_reclaim) std r3, THREAD_TM_TAR(r12) std r4, THREAD_TM_DSCR(r12) - /* MSR and flags: We don't change CRs, and we don't need to alter - * MSR. + /* + * MSR and flags: We don't change CRs, and we don't need to alter MSR. */ - /* ******************** FPR/VR/VSRs ************ + /* + * ******************** FPR/VR/VSRs ************ * After reclaiming, capture the checkpointed FPRs/VRs. * * We enabled VEC/FP/VSX in the msr above, so we can execute these @@ -275,7 +280,7 @@ _GLOBAL(tm_reclaim) /* Altivec (VEC/VMX/VR)*/ addi r7, r3, THREAD_CKVRSTATE - SAVE_32VRS(0, r6, r7) /* r6 scratch, r7 transact vr state */ + SAVE_32VRS(0, r6, r7) /* r6 scratch, r7 ckvr_state */ mfvscr v0 li r6, VRSTATE_VSCR stvx v0, r7, r6 @@ -286,12 +291,13 @@ _GLOBAL(tm_reclaim) /* Floating Point (FP) */ addi r7, r3, THREAD_CKFPSTATE - SAVE_32FPRS_VSRS(0, R6, R7) /* r6 scratch, r7 transact fp state */ + SAVE_32FPRS_VSRS(0, R6, R7) /* r6 scratch, r7 ckfp_state */ mffs fr0 stfd fr0,FPSTATE_FPSCR(r7) - /* TM regs, incl TEXASR -- these live in thread_struct. Note they've + /* + * TM regs, incl TEXASR -- these live in thread_struct. Note they've * been updated by the treclaim, to explain to userland the failure * cause (aborted). */ @@ -327,7 +333,7 @@ _GLOBAL(tm_reclaim) blr - /* + /* * void __tm_recheckpoint(struct thread_struct *thread) * - Restore the checkpointed register state saved by tm_reclaim * when we switch_to a process. @@ -343,7 +349,8 @@ _GLOBAL(__tm_recheckpoint) std r2, STK_GOT(r1) stdu r1, -TM_FRAME_SIZE(r1) - /* We've a struct pt_regs at [r1+STACK_FRAME_OVERHEAD]. + /* + * We've a struct pt_regs at [r1+STACK_FRAME_OVERHEAD]. * This is used for backing up the NVGPRs: */ SAVE_NVGPRS(r1) @@ -352,8 +359,9 @@ _GLOBAL(__tm_recheckpoint) addi r7, r3, PT_CKPT_REGS /* Thread's ckpt_regs */ - /* Make r7 look like an exception frame so that we - * can use the neat GPRx(n) macros. r7 is now NOT a pt_regs ptr! + /* + * Make r7 look like an exception frame so that we can use the neat + * GPRx(n) macros. r7 is now NOT a pt_regs ptr! */ subi r7, r7, STACK_FRAME_OVERHEAD @@ -421,14 +429,15 @@ restore_gprs: REST_NVGPRS(r7) /* GPR14-31 */ - /* Load up PPR and DSCR here so we don't run with user values for long - */ + /* Load up PPR and DSCR here so we don't run with user values for long */ mtspr SPRN_DSCR, r5 mtspr SPRN_PPR, r6 - /* Do final sanity check on TEXASR to make sure FS is set. Do this + /* + * Do final sanity check on TEXASR to make sure FS is set. Do this * here before we load up the userspace r1 so any bugs we hit will get - * a call chain */ + * a call chain. + */ mfspr r5, SPRN_TEXASR srdi r5, r5, 16 li r6, (TEXASR_FS)@h @@ -436,8 +445,9 @@ restore_gprs: 1: tdeqi r6, 0 EMIT_BUG_ENTRY 1b,__FILE__,__LINE__,0 - /* Do final sanity check on MSR to make sure we are not transactional - * or suspended + /* + * Do final sanity check on MSR to make sure we are not transactional + * or suspended. */ mfmsr r6 li r5, (MSR_TS_MASK)@higher @@ -453,8 +463,8 @@ restore_gprs: REST_GPR(6, r7) /* - * Store r1 and r5 on the stack so that we can access them - * after we clear MSR RI. + * Store r1 and r5 on the stack so that we can access them after we + * clear MSR RI. */ REST_GPR(5, r7) @@ -484,7 +494,8 @@ restore_gprs: HMT_MEDIUM - /* Our transactional state has now changed. + /* + * Our transactional state has now changed. * * Now just get out of here. Transactional (current) state will be * updated once restore is called on the return path in the _switch-ed diff --git a/arch/powerpc/kernel/trace/Makefile b/arch/powerpc/kernel/trace/Makefile index d22d8bafb643..b1725ad3e13d 100644 --- a/arch/powerpc/kernel/trace/Makefile +++ b/arch/powerpc/kernel/trace/Makefile @@ -3,11 +3,9 @@ # Makefile for the powerpc trace subsystem # -subdir-ccflags-$(CONFIG_PPC_WERROR) := -Werror - ifdef CONFIG_FUNCTION_TRACER # do not trace tracer code -CFLAGS_REMOVE_ftrace.o = -mno-sched-epilog $(CC_FLAGS_FTRACE) +CFLAGS_REMOVE_ftrace.o = $(CC_FLAGS_FTRACE) endif obj32-$(CONFIG_FUNCTION_TRACER) += ftrace_32.o diff --git a/arch/powerpc/kernel/trace/ftrace.c b/arch/powerpc/kernel/trace/ftrace.c index 4bfbb54dee51..4bf051d3e21e 100644 --- a/arch/powerpc/kernel/trace/ftrace.c +++ b/arch/powerpc/kernel/trace/ftrace.c @@ -30,6 +30,16 @@ #ifdef CONFIG_DYNAMIC_FTRACE + +/* + * We generally only have a single long_branch tramp and at most 2 or 3 plt + * tramps generated. But, we don't use the plt tramps currently. We also allot + * 2 tramps after .text and .init.text. So, we only end up with around 3 usable + * tramps in total. Set aside 8 just to be sure. + */ +#define NUM_FTRACE_TRAMPS 8 +static unsigned long ftrace_tramps[NUM_FTRACE_TRAMPS]; + static unsigned int ftrace_call_replace(unsigned long ip, unsigned long addr, int link) { @@ -85,13 +95,16 @@ static int test_24bit_addr(unsigned long ip, unsigned long addr) return create_branch((unsigned int *)ip, addr, 0); } -#ifdef CONFIG_MODULES - static int is_bl_op(unsigned int op) { return (op & 0xfc000003) == 0x48000001; } +static int is_b_op(unsigned int op) +{ + return (op & 0xfc000003) == 0x48000000; +} + static unsigned long find_bl_target(unsigned long ip, unsigned int op) { static int offset; @@ -104,6 +117,7 @@ static unsigned long find_bl_target(unsigned long ip, unsigned int op) return ip + (long)offset; } +#ifdef CONFIG_MODULES #ifdef CONFIG_PPC64 static int __ftrace_make_nop(struct module *mod, @@ -270,6 +284,146 @@ __ftrace_make_nop(struct module *mod, #endif /* PPC64 */ #endif /* CONFIG_MODULES */ +static unsigned long find_ftrace_tramp(unsigned long ip) +{ + int i; + + /* + * We have the compiler generated long_branch tramps at the end + * and we prefer those + */ + for (i = NUM_FTRACE_TRAMPS - 1; i >= 0; i--) + if (!ftrace_tramps[i]) + continue; + else if (create_branch((void *)ip, ftrace_tramps[i], 0)) + return ftrace_tramps[i]; + + return 0; +} + +static int add_ftrace_tramp(unsigned long tramp) +{ + int i; + + for (i = 0; i < NUM_FTRACE_TRAMPS; i++) + if (!ftrace_tramps[i]) { + ftrace_tramps[i] = tramp; + return 0; + } + + return -1; +} + +/* + * If this is a compiler generated long_branch trampoline (essentially, a + * trampoline that has a branch to _mcount()), we re-write the branch to + * instead go to ftrace_[regs_]caller() and note down the location of this + * trampoline. + */ +static int setup_mcount_compiler_tramp(unsigned long tramp) +{ + int i, op; + unsigned long ptr; + static unsigned long ftrace_plt_tramps[NUM_FTRACE_TRAMPS]; + + /* Is this a known long jump tramp? */ + for (i = 0; i < NUM_FTRACE_TRAMPS; i++) + if (!ftrace_tramps[i]) + break; + else if (ftrace_tramps[i] == tramp) + return 0; + + /* Is this a known plt tramp? */ + for (i = 0; i < NUM_FTRACE_TRAMPS; i++) + if (!ftrace_plt_tramps[i]) + break; + else if (ftrace_plt_tramps[i] == tramp) + return -1; + + /* New trampoline -- read where this goes */ + if (probe_kernel_read(&op, (void *)tramp, sizeof(int))) { + pr_debug("Fetching opcode failed.\n"); + return -1; + } + + /* Is this a 24 bit branch? */ + if (!is_b_op(op)) { + pr_debug("Trampoline is not a long branch tramp.\n"); + return -1; + } + + /* lets find where the pointer goes */ + ptr = find_bl_target(tramp, op); + + if (ptr != ppc_global_function_entry((void *)_mcount)) { + pr_debug("Trampoline target %p is not _mcount\n", (void *)ptr); + return -1; + } + + /* Let's re-write the tramp to go to ftrace_[regs_]caller */ +#ifdef CONFIG_DYNAMIC_FTRACE_WITH_REGS + ptr = ppc_global_function_entry((void *)ftrace_regs_caller); +#else + ptr = ppc_global_function_entry((void *)ftrace_caller); +#endif + if (!create_branch((void *)tramp, ptr, 0)) { + pr_debug("%ps is not reachable from existing mcount tramp\n", + (void *)ptr); + return -1; + } + + if (patch_branch((unsigned int *)tramp, ptr, 0)) { + pr_debug("REL24 out of range!\n"); + return -1; + } + + if (add_ftrace_tramp(tramp)) { + pr_debug("No tramp locations left\n"); + return -1; + } + + return 0; +} + +static int __ftrace_make_nop_kernel(struct dyn_ftrace *rec, unsigned long addr) +{ + unsigned long tramp, ip = rec->ip; + unsigned int op; + + /* Read where this goes */ + if (probe_kernel_read(&op, (void *)ip, sizeof(int))) { + pr_err("Fetching opcode failed.\n"); + return -EFAULT; + } + + /* Make sure that that this is still a 24bit jump */ + if (!is_bl_op(op)) { + pr_err("Not expected bl: opcode is %x\n", op); + return -EINVAL; + } + + /* Let's find where the pointer goes */ + tramp = find_bl_target(ip, op); + + pr_devel("ip:%lx jumps to %lx", ip, tramp); + + if (setup_mcount_compiler_tramp(tramp)) { + /* Are other trampolines reachable? */ + if (!find_ftrace_tramp(ip)) { + pr_err("No ftrace trampolines reachable from %ps\n", + (void *)ip); + return -EINVAL; + } + } + + if (patch_instruction((unsigned int *)ip, PPC_INST_NOP)) { + pr_err("Patching NOP failed.\n"); + return -EPERM; + } + + return 0; +} + int ftrace_make_nop(struct module *mod, struct dyn_ftrace *rec, unsigned long addr) { @@ -286,7 +440,8 @@ int ftrace_make_nop(struct module *mod, old = ftrace_call_replace(ip, addr, 1); new = PPC_INST_NOP; return ftrace_modify_code(ip, old, new); - } + } else if (core_kernel_text(ip)) + return __ftrace_make_nop_kernel(rec, addr); #ifdef CONFIG_MODULES /* @@ -456,6 +611,53 @@ __ftrace_make_call(struct dyn_ftrace *rec, unsigned long addr) #endif /* CONFIG_PPC64 */ #endif /* CONFIG_MODULES */ +static int __ftrace_make_call_kernel(struct dyn_ftrace *rec, unsigned long addr) +{ + unsigned int op; + void *ip = (void *)rec->ip; + unsigned long tramp, entry, ptr; + + /* Make sure we're being asked to patch branch to a known ftrace addr */ + entry = ppc_global_function_entry((void *)ftrace_caller); + ptr = ppc_global_function_entry((void *)addr); + + if (ptr != entry) { +#ifdef CONFIG_DYNAMIC_FTRACE_WITH_REGS + entry = ppc_global_function_entry((void *)ftrace_regs_caller); + if (ptr != entry) { +#endif + pr_err("Unknown ftrace addr to patch: %ps\n", (void *)ptr); + return -EINVAL; +#ifdef CONFIG_DYNAMIC_FTRACE_WITH_REGS + } +#endif + } + + /* Make sure we have a nop */ + if (probe_kernel_read(&op, ip, sizeof(op))) { + pr_err("Unable to read ftrace location %p\n", ip); + return -EFAULT; + } + + if (op != PPC_INST_NOP) { + pr_err("Unexpected call sequence at %p: %x\n", ip, op); + return -EINVAL; + } + + tramp = find_ftrace_tramp((unsigned long)ip); + if (!tramp) { + pr_err("No ftrace trampolines reachable from %ps\n", ip); + return -EINVAL; + } + + if (patch_branch(ip, tramp, BRANCH_SET_LINK)) { + pr_err("Error patching branch to ftrace tramp!\n"); + return -EINVAL; + } + + return 0; +} + int ftrace_make_call(struct dyn_ftrace *rec, unsigned long addr) { unsigned long ip = rec->ip; @@ -471,7 +673,8 @@ int ftrace_make_call(struct dyn_ftrace *rec, unsigned long addr) old = PPC_INST_NOP; new = ftrace_call_replace(ip, addr, 1); return ftrace_modify_code(ip, old, new); - } + } else if (core_kernel_text(ip)) + return __ftrace_make_call_kernel(rec, addr); #ifdef CONFIG_MODULES /* @@ -603,6 +806,12 @@ int ftrace_modify_call(struct dyn_ftrace *rec, unsigned long old_addr, old = ftrace_call_replace(ip, old_addr, 1); new = ftrace_call_replace(ip, addr, 1); return ftrace_modify_code(ip, old, new); + } else if (core_kernel_text(ip)) { + /* + * We always patch out of range locations to go to the regs + * variant, so there is nothing to do here + */ + return 0; } #ifdef CONFIG_MODULES @@ -654,10 +863,54 @@ void arch_ftrace_update_code(int command) ftrace_modify_all_code(command); } +#ifdef CONFIG_PPC64 +#define PACATOC offsetof(struct paca_struct, kernel_toc) + +#define PPC_LO(v) ((v) & 0xffff) +#define PPC_HI(v) (((v) >> 16) & 0xffff) +#define PPC_HA(v) PPC_HI ((v) + 0x8000) + +extern unsigned int ftrace_tramp_text[], ftrace_tramp_init[]; + +int __init ftrace_dyn_arch_init(void) +{ + int i; + unsigned int *tramp[] = { ftrace_tramp_text, ftrace_tramp_init }; + u32 stub_insns[] = { + 0xe98d0000 | PACATOC, /* ld r12,PACATOC(r13) */ + 0x3d8c0000, /* addis r12,r12,<high> */ + 0x398c0000, /* addi r12,r12,<low> */ + 0x7d8903a6, /* mtctr r12 */ + 0x4e800420, /* bctr */ + }; +#ifdef CONFIG_DYNAMIC_FTRACE_WITH_REGS + unsigned long addr = ppc_global_function_entry((void *)ftrace_regs_caller); +#else + unsigned long addr = ppc_global_function_entry((void *)ftrace_caller); +#endif + long reladdr = addr - kernel_toc_addr(); + + if (reladdr > 0x7FFFFFFF || reladdr < -(0x80000000L)) { + pr_err("Address of %ps out of range of kernel_toc.\n", + (void *)addr); + return -1; + } + + for (i = 0; i < 2; i++) { + memcpy(tramp[i], stub_insns, sizeof(stub_insns)); + tramp[i][1] |= PPC_HA(reladdr); + tramp[i][2] |= PPC_LO(reladdr); + add_ftrace_tramp((unsigned long)tramp[i]); + } + + return 0; +} +#else int __init ftrace_dyn_arch_init(void) { return 0; } +#endif #endif /* CONFIG_DYNAMIC_FTRACE */ #ifdef CONFIG_FUNCTION_GRAPH_TRACER diff --git a/arch/powerpc/kernel/trace/ftrace_64.S b/arch/powerpc/kernel/trace/ftrace_64.S index e25f77c10a72..1782af2d1496 100644 --- a/arch/powerpc/kernel/trace/ftrace_64.S +++ b/arch/powerpc/kernel/trace/ftrace_64.S @@ -14,6 +14,18 @@ #include <asm/ppc-opcode.h> #include <asm/export.h> +.pushsection ".tramp.ftrace.text","aw",@progbits; +.globl ftrace_tramp_text +ftrace_tramp_text: + .space 64 +.popsection + +.pushsection ".tramp.ftrace.init","aw",@progbits; +.globl ftrace_tramp_init +ftrace_tramp_init: + .space 64 +.popsection + _GLOBAL(mcount) _GLOBAL(_mcount) EXPORT_SYMBOL(_mcount) diff --git a/arch/powerpc/kernel/traps.c b/arch/powerpc/kernel/traps.c index ab1bd06d7c44..9a86572db1ef 100644 --- a/arch/powerpc/kernel/traps.c +++ b/arch/powerpc/kernel/traps.c @@ -247,8 +247,6 @@ static void oops_end(unsigned long flags, struct pt_regs *regs, mdelay(MSEC_PER_SEC); } - if (in_interrupt()) - panic("Fatal exception in interrupt"); if (panic_on_oops) panic("Fatal exception"); do_exit(signr); @@ -535,10 +533,10 @@ int machine_check_e500mc(struct pt_regs *regs) printk("Caused by (from MCSR=%lx): ", reason); if (reason & MCSR_MCP) - printk("Machine Check Signal\n"); + pr_cont("Machine Check Signal\n"); if (reason & MCSR_ICPERR) { - printk("Instruction Cache Parity Error\n"); + pr_cont("Instruction Cache Parity Error\n"); /* * This is recoverable by invalidating the i-cache. @@ -556,7 +554,7 @@ int machine_check_e500mc(struct pt_regs *regs) } if (reason & MCSR_DCPERR_MC) { - printk("Data Cache Parity Error\n"); + pr_cont("Data Cache Parity Error\n"); /* * In write shadow mode we auto-recover from the error, but it @@ -575,38 +573,38 @@ int machine_check_e500mc(struct pt_regs *regs) } if (reason & MCSR_L2MMU_MHIT) { - printk("Hit on multiple TLB entries\n"); + pr_cont("Hit on multiple TLB entries\n"); recoverable = 0; } if (reason & MCSR_NMI) - printk("Non-maskable interrupt\n"); + pr_cont("Non-maskable interrupt\n"); if (reason & MCSR_IF) { - printk("Instruction Fetch Error Report\n"); + pr_cont("Instruction Fetch Error Report\n"); recoverable = 0; } if (reason & MCSR_LD) { - printk("Load Error Report\n"); + pr_cont("Load Error Report\n"); recoverable = 0; } if (reason & MCSR_ST) { - printk("Store Error Report\n"); + pr_cont("Store Error Report\n"); recoverable = 0; } if (reason & MCSR_LDG) { - printk("Guarded Load Error Report\n"); + pr_cont("Guarded Load Error Report\n"); recoverable = 0; } if (reason & MCSR_TLBSYNC) - printk("Simultaneous tlbsync operations\n"); + pr_cont("Simultaneous tlbsync operations\n"); if (reason & MCSR_BSL2_ERR) { - printk("Level 2 Cache Error\n"); + pr_cont("Level 2 Cache Error\n"); recoverable = 0; } @@ -616,7 +614,7 @@ int machine_check_e500mc(struct pt_regs *regs) addr = mfspr(SPRN_MCAR); addr |= (u64)mfspr(SPRN_MCARU) << 32; - printk("Machine Check %s Address: %#llx\n", + pr_cont("Machine Check %s Address: %#llx\n", reason & MCSR_MEA ? "Effective" : "Physical", addr); } @@ -640,29 +638,29 @@ int machine_check_e500(struct pt_regs *regs) printk("Caused by (from MCSR=%lx): ", reason); if (reason & MCSR_MCP) - printk("Machine Check Signal\n"); + pr_cont("Machine Check Signal\n"); if (reason & MCSR_ICPERR) - printk("Instruction Cache Parity Error\n"); + pr_cont("Instruction Cache Parity Error\n"); if (reason & MCSR_DCP_PERR) - printk("Data Cache Push Parity Error\n"); + pr_cont("Data Cache Push Parity Error\n"); if (reason & MCSR_DCPERR) - printk("Data Cache Parity Error\n"); + pr_cont("Data Cache Parity Error\n"); if (reason & MCSR_BUS_IAERR) - printk("Bus - Instruction Address Error\n"); + pr_cont("Bus - Instruction Address Error\n"); if (reason & MCSR_BUS_RAERR) - printk("Bus - Read Address Error\n"); + pr_cont("Bus - Read Address Error\n"); if (reason & MCSR_BUS_WAERR) - printk("Bus - Write Address Error\n"); + pr_cont("Bus - Write Address Error\n"); if (reason & MCSR_BUS_IBERR) - printk("Bus - Instruction Data Error\n"); + pr_cont("Bus - Instruction Data Error\n"); if (reason & MCSR_BUS_RBERR) - printk("Bus - Read Data Bus Error\n"); + pr_cont("Bus - Read Data Bus Error\n"); if (reason & MCSR_BUS_WBERR) - printk("Bus - Write Data Bus Error\n"); + pr_cont("Bus - Write Data Bus Error\n"); if (reason & MCSR_BUS_IPERR) - printk("Bus - Instruction Parity Error\n"); + pr_cont("Bus - Instruction Parity Error\n"); if (reason & MCSR_BUS_RPERR) - printk("Bus - Read Parity Error\n"); + pr_cont("Bus - Read Parity Error\n"); return 0; } @@ -680,19 +678,19 @@ int machine_check_e200(struct pt_regs *regs) printk("Caused by (from MCSR=%lx): ", reason); if (reason & MCSR_MCP) - printk("Machine Check Signal\n"); + pr_cont("Machine Check Signal\n"); if (reason & MCSR_CP_PERR) - printk("Cache Push Parity Error\n"); + pr_cont("Cache Push Parity Error\n"); if (reason & MCSR_CPERR) - printk("Cache Parity Error\n"); + pr_cont("Cache Parity Error\n"); if (reason & MCSR_EXCP_ERR) - printk("ISI, ITLB, or Bus Error on first instruction fetch for an exception handler\n"); + pr_cont("ISI, ITLB, or Bus Error on first instruction fetch for an exception handler\n"); if (reason & MCSR_BUS_IRERR) - printk("Bus - Read Bus Error on instruction fetch\n"); + pr_cont("Bus - Read Bus Error on instruction fetch\n"); if (reason & MCSR_BUS_DRERR) - printk("Bus - Read Bus Error on data load\n"); + pr_cont("Bus - Read Bus Error on data load\n"); if (reason & MCSR_BUS_WRERR) - printk("Bus - Write Bus Error on buffered store or cache line push\n"); + pr_cont("Bus - Write Bus Error on buffered store or cache line push\n"); return 0; } @@ -705,30 +703,30 @@ int machine_check_generic(struct pt_regs *regs) printk("Caused by (from SRR1=%lx): ", reason); switch (reason & 0x601F0000) { case 0x80000: - printk("Machine check signal\n"); + pr_cont("Machine check signal\n"); break; case 0: /* for 601 */ case 0x40000: case 0x140000: /* 7450 MSS error and TEA */ - printk("Transfer error ack signal\n"); + pr_cont("Transfer error ack signal\n"); break; case 0x20000: - printk("Data parity error signal\n"); + pr_cont("Data parity error signal\n"); break; case 0x10000: - printk("Address parity error signal\n"); + pr_cont("Address parity error signal\n"); break; case 0x20000000: - printk("L1 Data Cache error\n"); + pr_cont("L1 Data Cache error\n"); break; case 0x40000000: - printk("L1 Instruction Cache error\n"); + pr_cont("L1 Instruction Cache error\n"); break; case 0x00100000: - printk("L2 data cache parity error\n"); + pr_cont("L2 data cache parity error\n"); break; default: - printk("Unknown values in msr\n"); + pr_cont("Unknown values in msr\n"); } return 0; } @@ -741,9 +739,7 @@ void machine_check_exception(struct pt_regs *regs) if (!nested) nmi_enter(); - /* 64s accounts the mce in machine_check_early when in HVMODE */ - if (!IS_ENABLED(CONFIG_PPC_BOOK3S_64) || !cpu_has_feature(CPU_FTR_HVMODE)) - __this_cpu_inc(irq_stat.mce_exceptions); + __this_cpu_inc(irq_stat.mce_exceptions); add_taint(TAINT_MACHINE_CHECK, LOCKDEP_NOW_UNRELIABLE); @@ -767,12 +763,17 @@ void machine_check_exception(struct pt_regs *regs) if (check_io_access(regs)) goto bail; - die("Machine check", regs, SIGBUS); - /* Must die if the interrupt is not recoverable */ if (!(regs->msr & MSR_RI)) nmi_panic(regs, "Unrecoverable Machine check"); + if (!nested) + nmi_exit(); + + die("Machine check", regs, SIGBUS); + + return; + bail: if (!nested) nmi_exit(); @@ -1433,7 +1434,7 @@ void program_check_exception(struct pt_regs *regs) goto bail; } else { printk(KERN_EMERG "Unexpected TM Bad Thing exception " - "at %lx (msr 0x%x)\n", regs->nip, reason); + "at %lx (msr 0x%lx)\n", regs->nip, regs->msr); die("Unrecoverable exception", regs, SIGABRT); } } @@ -1547,14 +1548,6 @@ void StackOverflow(struct pt_regs *regs) panic("kernel stack overflow"); } -void nonrecoverable_exception(struct pt_regs *regs) -{ - printk(KERN_ERR "Non-recoverable exception at PC=%lx MSR=%lx\n", - regs->nip, regs->msr); - debugger(regs); - die("nonrecoverable exception", regs, SIGKILL); -} - void kernel_fp_unavailable_exception(struct pt_regs *regs) { enum ctx_state prev_state = exception_enter(); @@ -1750,16 +1743,20 @@ void fp_unavailable_tm(struct pt_regs *regs) * checkpointed FP registers need to be loaded. */ tm_reclaim_current(TM_CAUSE_FAC_UNAV); - /* Reclaim didn't save out any FPRs to transact_fprs. */ + + /* + * Reclaim initially saved out bogus (lazy) FPRs to ckfp_state, and + * then it was overwrite by the thr->fp_state by tm_reclaim_thread(). + * + * At this point, ck{fp,vr}_state contains the exact values we want to + * recheckpoint. + */ /* Enable FP for the task: */ current->thread.load_fp = 1; - /* This loads and recheckpoints the FP registers from - * thread.fpr[]. They will remain in registers after the - * checkpoint so we don't need to reload them after. - * If VMX is in use, the VRs now hold checkpointed values, - * so we don't want to load the VRs from the thread_struct. + /* + * Recheckpoint all the checkpointed ckpt, ck{fp, vr}_state registers. */ tm_recheckpoint(¤t->thread); } @@ -2086,8 +2083,8 @@ void SPEFloatingPointRoundException(struct pt_regs *regs) */ void unrecoverable_exception(struct pt_regs *regs) { - printk(KERN_EMERG "Unrecoverable exception %lx at %lx\n", - regs->trap, regs->nip); + pr_emerg("Unrecoverable exception %lx at %lx (msr=%lx)\n", + regs->trap, regs->nip, regs->msr); die("Unrecoverable exception", regs, SIGABRT); } NOKPROBE_SYMBOL(unrecoverable_exception); diff --git a/arch/powerpc/kernel/vdso32/datapage.S b/arch/powerpc/kernel/vdso32/datapage.S index 3745113fcc65..2a7eb5452aba 100644 --- a/arch/powerpc/kernel/vdso32/datapage.S +++ b/arch/powerpc/kernel/vdso32/datapage.S @@ -37,6 +37,7 @@ data_page_branch: mtlr r0 addi r3, r3, __kernel_datapage_offset-data_page_branch lwz r0,0(r3) + .cfi_restore lr add r3,r0,r3 blr .cfi_endproc diff --git a/arch/powerpc/kernel/vdso32/gettimeofday.S b/arch/powerpc/kernel/vdso32/gettimeofday.S index 769c2624e0a6..1e0bc5955a40 100644 --- a/arch/powerpc/kernel/vdso32/gettimeofday.S +++ b/arch/powerpc/kernel/vdso32/gettimeofday.S @@ -139,6 +139,7 @@ V_FUNCTION_BEGIN(__kernel_clock_gettime) */ 99: li r0,__NR_clock_gettime + .cfi_restore lr sc blr .cfi_endproc diff --git a/arch/powerpc/kernel/vdso64/datapage.S b/arch/powerpc/kernel/vdso64/datapage.S index abf17feffe40..bf9668691511 100644 --- a/arch/powerpc/kernel/vdso64/datapage.S +++ b/arch/powerpc/kernel/vdso64/datapage.S @@ -37,6 +37,7 @@ data_page_branch: mtlr r0 addi r3, r3, __kernel_datapage_offset-data_page_branch lwz r0,0(r3) + .cfi_restore lr add r3,r0,r3 blr .cfi_endproc diff --git a/arch/powerpc/kernel/vdso64/gettimeofday.S b/arch/powerpc/kernel/vdso64/gettimeofday.S index c002adcc694c..a4ed9edfd5f0 100644 --- a/arch/powerpc/kernel/vdso64/gettimeofday.S +++ b/arch/powerpc/kernel/vdso64/gettimeofday.S @@ -169,6 +169,7 @@ V_FUNCTION_BEGIN(__kernel_clock_gettime) */ 99: li r0,__NR_clock_gettime + .cfi_restore lr sc blr .cfi_endproc diff --git a/arch/powerpc/kernel/vmlinux.lds.S b/arch/powerpc/kernel/vmlinux.lds.S index 105a976323aa..434581bcd5b4 100644 --- a/arch/powerpc/kernel/vmlinux.lds.S +++ b/arch/powerpc/kernel/vmlinux.lds.S @@ -4,6 +4,9 @@ #else #define PROVIDE32(x) PROVIDE(x) #endif + +#define BSS_FIRST_SECTIONS *(.bss.prominit) + #include <asm/page.h> #include <asm-generic/vmlinux.lds.h> #include <asm/cache.h> @@ -99,6 +102,9 @@ SECTIONS #endif /* careful! __ftr_alt_* sections need to be close to .text */ *(.text.hot TEXT_MAIN .text.fixup .text.unlikely .fixup __ftr_alt_* .ref.text); +#ifdef CONFIG_PPC64 + *(.tramp.ftrace.text); +#endif SCHED_TEXT CPUIDLE_TEXT LOCK_TEXT @@ -181,7 +187,15 @@ SECTIONS */ . = ALIGN(STRICT_ALIGN_SIZE); __init_begin = .; - INIT_TEXT_SECTION(PAGE_SIZE) :kernel + . = ALIGN(PAGE_SIZE); + .init.text : AT(ADDR(.init.text) - LOAD_OFFSET) { + _sinittext = .; + INIT_TEXT + _einittext = .; +#ifdef CONFIG_PPC64 + *(.tramp.ftrace.init); +#endif + } :kernel /* .exit.text is discarded at runtime, not link time, * to deal with references from __bug_table |