From 5ffd229c02731a91d08ca21e76b503c5bbb5c095 Mon Sep 17 00:00:00 2001 From: Alexey Kardashevskiy Date: Tue, 21 May 2013 13:33:10 +1000 Subject: powerpc/vfio: Implement IOMMU driver for VFIO VFIO implements platform independent stuff such as a PCI driver, BAR access (via read/write on a file descriptor or direct mapping when possible) and IRQ signaling. The platform dependent part includes IOMMU initialization and handling. This implements an IOMMU driver for VFIO which does mapping/unmapping pages for the guest IO and provides information about DMA window (required by a POWER guest). Cc: David Gibson Signed-off-by: Alexey Kardashevskiy Signed-off-by: Paul Mackerras Acked-by: Alex Williamson Signed-off-by: Benjamin Herrenschmidt --- Documentation/vfio.txt | 63 ++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 63 insertions(+) (limited to 'Documentation') diff --git a/Documentation/vfio.txt b/Documentation/vfio.txt index 8eda3635a17d..c55533c0adb3 100644 --- a/Documentation/vfio.txt +++ b/Documentation/vfio.txt @@ -283,6 +283,69 @@ a direct pass through for VFIO_DEVICE_* ioctls. The read/write/mmap interfaces implement the device region access defined by the device's own VFIO_DEVICE_GET_REGION_INFO ioctl. + +PPC64 sPAPR implementation note +------------------------------------------------------------------------------- + +This implementation has some specifics: + +1) Only one IOMMU group per container is supported as an IOMMU group +represents the minimal entity which isolation can be guaranteed for and +groups are allocated statically, one per a Partitionable Endpoint (PE) +(PE is often a PCI domain but not always). + +2) The hardware supports so called DMA windows - the PCI address range +within which DMA transfer is allowed, any attempt to access address space +out of the window leads to the whole PE isolation. + +3) PPC64 guests are paravirtualized but not fully emulated. There is an API +to map/unmap pages for DMA, and it normally maps 1..32 pages per call and +currently there is no way to reduce the number of calls. In order to make things +faster, the map/unmap handling has been implemented in real mode which provides +an excellent performance which has limitations such as inability to do +locked pages accounting in real time. + +So 3 additional ioctls have been added: + + VFIO_IOMMU_SPAPR_TCE_GET_INFO - returns the size and the start + of the DMA window on the PCI bus. + + VFIO_IOMMU_ENABLE - enables the container. The locked pages accounting + is done at this point. This lets user first to know what + the DMA window is and adjust rlimit before doing any real job. + + VFIO_IOMMU_DISABLE - disables the container. + + +The code flow from the example above should be slightly changed: + + ..... + /* Add the group to the container */ + ioctl(group, VFIO_GROUP_SET_CONTAINER, &container); + + /* Enable the IOMMU model we want */ + ioctl(container, VFIO_SET_IOMMU, VFIO_SPAPR_TCE_IOMMU) + + /* Get addition sPAPR IOMMU info */ + vfio_iommu_spapr_tce_info spapr_iommu_info; + ioctl(container, VFIO_IOMMU_SPAPR_TCE_GET_INFO, &spapr_iommu_info); + + if (ioctl(container, VFIO_IOMMU_ENABLE)) + /* Cannot enable container, may be low rlimit */ + + /* Allocate some space and setup a DMA mapping */ + dma_map.vaddr = mmap(0, 1024 * 1024, PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS, 0, 0); + + dma_map.size = 1024 * 1024; + dma_map.iova = 0; /* 1MB starting at 0x0 from device view */ + dma_map.flags = VFIO_DMA_MAP_FLAG_READ | VFIO_DMA_MAP_FLAG_WRITE; + + /* Check here is .iova/.size are within DMA window from spapr_iommu_info */ + + ioctl(container, VFIO_IOMMU_MAP_DMA, &dma_map); + ..... + ------------------------------------------------------------------------------- [1] VFIO was originally an acronym for "Virtual Function I/O" in its -- cgit v1.2.3 From 8c43d2b0ca10cea9eb62d8996fb93f330be88ada Mon Sep 17 00:00:00 2001 From: Joe Liccese Date: Mon, 25 Jun 2012 12:22:09 -0500 Subject: powerpc: Add T4 LAC device tree binding & defs The Interlaken is a narrow, high speed channelized chip-to-chip interface. To facilitate interoperability between a data path device and a look-aside co-processor, the Interlaken Look-Aside protocol is defined for short transaction-related transfers. Although based on the Interlaken protocol, Interlaken Look-Aside is not directly compatible with Interlaken and can be considered a different operation mode. The Interlaken LA controller connects internal platform to Interlaken serial interface. It accepts LA command through software portals, which are system memory mapped 4KB spaces. The LA commands are then translated into the Interlaken control words and data words, which are sent on TX side to TCAM through SerDes lanes. Signed-off-by: Joe Liccese Signed-off-by: Scott Wood --- .../bindings/powerpc/fsl/interlaken-lac.txt | 309 +++++++++++++++++++++ .../boot/dts/fsl/interlaken-lac-portals.dtsi | 156 +++++++++++ arch/powerpc/boot/dts/fsl/interlaken-lac.dtsi | 45 +++ 3 files changed, 510 insertions(+) create mode 100644 Documentation/devicetree/bindings/powerpc/fsl/interlaken-lac.txt create mode 100644 arch/powerpc/boot/dts/fsl/interlaken-lac-portals.dtsi create mode 100644 arch/powerpc/boot/dts/fsl/interlaken-lac.dtsi (limited to 'Documentation') diff --git a/Documentation/devicetree/bindings/powerpc/fsl/interlaken-lac.txt b/Documentation/devicetree/bindings/powerpc/fsl/interlaken-lac.txt new file mode 100644 index 000000000000..641bc13983e1 --- /dev/null +++ b/Documentation/devicetree/bindings/powerpc/fsl/interlaken-lac.txt @@ -0,0 +1,309 @@ +=============================================================================== +Freescale Interlaken Look-Aside Controller Device Bindings +Copyright 2012 Freescale Semiconductor Inc. + +CONTENTS + - Interlaken Look-Aside Controller (LAC) Node + - Example LAC Node + - Interlaken Look-Aside Controller (LAC) Software Portal Node + - Interlaken Look-Aside Controller (LAC) Software Portal Child Nodes + - Example LAC SWP Node with Child Nodes + +============================================================================== +Interlaken Look-Aside Controller (LAC) Node + +DESCRIPTION + +The Interlaken is a narrow, high speed channelized chip-to-chip interface. To +facilitate interoperability between a data path device and a look-aside +co-processor, the Interlaken Look-Aside protocol is defined for short +transaction-related transfers. Although based on the Interlaken protocol, +Interlaken Look-Aside is not directly compatible with Interlaken and can be +considered a different operation mode. + +The Interlaken LA controller connects internal platform to Interlaken serial +interface. It accepts LA command through software portals, which are system +memory mapped 4KB spaces. The LA commands are then translated into the +Interlaken control words and data words, which are sent on TX side to TCAM +through SerDes lanes. + +There are two 4KiB spaces defined within the LAC global register memory map. +There is a full register set at 0x0000-0x0FFF (also known as the "hypervisor" +version), and a subset at 0x1000-0x1FFF. The former is a superset of the +latter, and includes certain registers that should not be accessible to +partitioned software. Separate nodes are used for each region, with a phandle +linking the hypervisor node to the normal operating node. + +PROPERTIES + + - compatible + Usage: required + Value type: + Definition: Must include "fsl,interlaken-lac". This represents only + those LAC CCSR registers not protected in partitioned + software. The version of the device is determined by the LAC + IP Block Revision Register (IPBRR0) at offset 0x0BF8. + + Table of correspondences between IPBRR0 values and example + chips: + Value Device + ----------- ------- + 0x02000100 T4240 + + The Hypervisor node has a different compatible. It must include + "fsl,interlaken-lac-hv". This node represents the protected + LAC register space and is required except inside a partition + where access to the hypervisor node is to be denied. + + - fsl,non-hv-node + Usage: required in "fsl,interlaken-lac-hv" + Value type: + Definition: Points to the non-protected LAC CCSR mapped register space + node. + + - reg + Usage: required + Value type: + Definition: A standard property. The first resource represents the + Interlaken LAC configuration registers. + + - interrupts: + Usage: required in non-hv node only + Value type: + Definition: Interrupt mapping for Interlaken LAC error IRQ. + +EXAMPLE + lac: lac@229000 { + compatible = "fsl,interlaken-lac" + reg = <0x229000 0x1000>; + interrupts = <16 2 1 18>; + }; + + lac-hv@228000 { + compatible = "fsl,interlaken-lac-hv" + reg = <0x228000 0x1000>; + fsl,non-hv-node = <&lac>; + }; + +=============================================================================== +Interlaken Look-Aside Controller (LAC) Software Portal Container Node + +DESCRIPTION +The Interlaken Look-Aside Controller (LAC) utilizes Software Portals to accept +Interlaken Look-Aside (ILA) commands. The Interlaken LAC software portal +memory map occupies 128KB of memory space. The software portal memory space is +intended to be cache-enabled. WIMG for each software space is required to be +0010 if stashing is enabled; otherwise, WIMG can be 0000 or 0010. + +PROPERTIES + + - #address-cells + Usage: required + Value type: + Definition: A standard property. Must have a value of 1. + + - #size-cells + Usage: required + Value type: + Definition: A standard property. Must have a value of 1. + + - compatible + Usage: required + Value type: + Definition: Must include "fsl,interlaken-lac-portals" + + - ranges + Usage: required + Value type: + Definition: A standard property. Specifies the address and length + of the LAC portal memory space. + +=============================================================================== +Interlaken Look-Aside Controller (LAC) Software Portals Child Nodes + +DESCRIPTION +There are up to 24 available software portals with each software portal +requiring 4KB of consecutive memory within the software portal memory mapped +space. + +PROPERTIES + + - compatible + Usage: required + Value type: + Definition: Must include "fsl,interlaken-lac-portal-vX.Y" where X is + the Major version (IP_MJ) found in the LAC IP Block Revision + Register (IPBRR0), at offset 0x0BF8, and Y is the Minor version + (IP_MN). + + Table of correspondences between version values and example chips: + Value Device + ------ ------- + 1.0 T4240 + + - reg + Usage: required + Value type: + Definition: A standard property. The first resource represents the + Interlaken LAC software portal registers. + + - fsl,liodn + Value type: + Definition: The logical I/O device number (LIODN) for this device. The + LIODN is a number expressed by this device and used to perform + look-ups in the IOMMU (PAMU) address table when performing + DMAs. This property is automatically added by u-boot. + +=============================================================================== +EXAMPLE + +lac-portals { + #address-cells = <0x1>; + #size-cells = <0x1>; + compatible = "fsl,interlaken-lac-portals"; + ranges = <0x0 0xf 0xf4400000 0x20000>; + + lportal0: lac-portal@0 { + compatible = "fsl,interlaken-lac-portal-v1.0"; + fsl,liodn = <0x204>; + reg = <0x0 0x1000>; + }; + + lportal1: lac-portal@1000 { + compatible = "fsl,interlaken-lac-portal-v1.0"; + fsl,liodn = <0x205>; + reg = <0x1000 0x1000>; + }; + + lportal2: lac-portal@2000 { + compatible = "fsl,interlaken-lac-portal-v1.0"; + fsl,liodn = <0x206>; + reg = <0x2000 0x1000>; + }; + + lportal3: lac-portal@3000 { + compatible = "fsl,interlaken-lac-portal-v1.0"; + fsl,liodn = <0x207>; + reg = <0x3000 0x1000>; + }; + + lportal4: lac-portal@4000 { + compatible = "fsl,interlaken-lac-portal-v1.0"; + fsl,liodn = <0x208>; + reg = <0x4000 0x1000>; + }; + + lportal5: lac-portal@5000 { + compatible = "fsl,interlaken-lac-portal-v1.0"; + fsl,liodn = <0x209>; + reg = <0x5000 0x1000>; + }; + + lportal6: lac-portal@6000 { + compatible = "fsl,interlaken-lac-portal-v1.0"; + fsl,liodn = <0x20A>; + reg = <0x6000 0x1000>; + }; + + lportal7: lac-portal@7000 { + compatible = "fsl,interlaken-lac-portal-v1.0"; + fsl,liodn = <0x20B>; + reg = <0x7000 0x1000>; + }; + + lportal8: lac-portal@8000 { + compatible = "fsl,interlaken-lac-portal-v1.0"; + fsl,liodn = <0x20C>; + reg = <0x8000 0x1000>; + }; + + lportal9: lac-portal@9000 { + compatible = "fsl,interlaken-lac-portal-v1.0"; + fsl,liodn = <0x20D>; + reg = <0x9000 0x1000>; + }; + + lportal10: lac-portal@A000 { + compatible = "fsl,interlaken-lac-portal-v1.0"; + fsl,liodn = <0x20E>; + reg = <0xA000 0x1000>; + }; + + lportal11: lac-portal@B000 { + compatible = "fsl,interlaken-lac-portal-v1.0"; + fsl,liodn = <0x20F>; + reg = <0xB000 0x1000>; + }; + + lportal12: lac-portal@C000 { + compatible = "fsl,interlaken-lac-portal-v1.0"; + fsl,liodn = <0x210>; + reg = <0xC000 0x1000>; + }; + + lportal13: lac-portal@D000 { + compatible = "fsl,interlaken-lac-portal-v1.0"; + fsl,liodn = <0x211>; + reg = <0xD000 0x1000>; + }; + + lportal14: lac-portal@E000 { + compatible = "fsl,interlaken-lac-portal-v1.0"; + fsl,liodn = <0x212>; + reg = <0xE000 0x1000>; + }; + + lportal15: lac-portal@F000 { + compatible = "fsl,interlaken-lac-portal-v1.0"; + fsl,liodn = <0x213>; + reg = <0xF000 0x1000>; + }; + + lportal16: lac-portal@10000 { + compatible = "fsl,interlaken-lac-portal-v1.0"; + fsl,liodn = <0x214>; + reg = <0x10000 0x1000>; + }; + + lportal17: lac-portal@11000 { + compatible = "fsl,interlaken-lac-portal-v1.0"; + fsl,liodn = <0x215>; + reg = <0x11000 0x1000>; + }; + + lportal8: lac-portal@1200 { + compatible = "fsl,interlaken-lac-portal-v1.0"; + fsl,liodn = <0x216>; + reg = <0x12000 0x1000>; + }; + + lportal19: lac-portal@13000 { + compatible = "fsl,interlaken-lac-portal-v1.0"; + fsl,liodn = <0x217>; + reg = <0x13000 0x1000>; + }; + + lportal20: lac-portal@14000 { + compatible = "fsl,interlaken-lac-portal-v1.0"; + fsl,liodn = <0x218>; + reg = <0x14000 0x1000>; + }; + + lportal21: lac-portal@15000 { + compatible = "fsl,interlaken-lac-portal-v1.0"; + fsl,liodn = <0x219>; + reg = <0x15000 0x1000>; + }; + + lportal22: lac-portal@16000 { + compatible = "fsl,interlaken-lac-portal-v1.0"; + fsl,liodn = <0x21A>; + reg = <0x16000 0x1000>; + }; + + lportal23: lac-portal@17000 { + compatible = "fsl,interlaken-lac-portal-v1.0"; + fsl,liodn = <0x21B>; + reg = <0x17000 0x1000>; + }; +}; diff --git a/arch/powerpc/boot/dts/fsl/interlaken-lac-portals.dtsi b/arch/powerpc/boot/dts/fsl/interlaken-lac-portals.dtsi new file mode 100644 index 000000000000..9cffccf4e07e --- /dev/null +++ b/arch/powerpc/boot/dts/fsl/interlaken-lac-portals.dtsi @@ -0,0 +1,156 @@ +/* T4240 Interlaken LAC Portal device tree stub with 24 portals. + * + * Copyright 2012 Freescale Semiconductor Inc. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Freescale Semiconductor nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * + * ALTERNATIVELY, this software may be distributed under the terms of the + * GNU General Public License ("GPL") as published by the Free Software + * Foundation, either version 2 of that License or (at your option) any + * later version. + * + * THIS SOFTWARE IS PROVIDED BY Freescale Semiconductor "AS IS" AND ANY + * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL Freescale Semiconductor BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#address-cells = <0x1>; +#size-cells = <0x1>; +compatible = "fsl,interlaken-lac-portals"; + +lportal0: lac-portal@0 { + compatible = "fsl,interlaken-lac-portal-v1.0"; + reg = <0x0 0x1000>; +}; + +lportal1: lac-portal@1000 { + compatible = "fsl,interlaken-lac-portal-v1.0"; + reg = <0x1000 0x1000>; +}; + +lportal2: lac-portal@2000 { + compatible = "fsl,interlaken-lac-portal-v1.0"; + reg = <0x2000 0x1000>; +}; + +lportal3: lac-portal@3000 { + compatible = "fsl,interlaken-lac-portal-v1.0"; + reg = <0x3000 0x1000>; +}; + +lportal4: lac-portal@4000 { + compatible = "fsl,interlaken-lac-portal-v1.0"; + reg = <0x4000 0x1000>; +}; + +lportal5: lac-portal@5000 { + compatible = "fsl,interlaken-lac-portal-v1.0"; + reg = <0x5000 0x1000>; +}; + +lportal6: lac-portal@6000 { + compatible = "fsl,interlaken-lac-portal-v1.0"; + reg = <0x6000 0x1000>; +}; + +lportal7: lac-portal@7000 { + compatible = "fsl,interlaken-lac-portal-v1.0"; + reg = <0x7000 0x1000>; +}; + +lportal8: lac-portal@8000 { + compatible = "fsl,interlaken-lac-portal-v1.0"; + reg = <0x8000 0x1000>; +}; + +lportal9: lac-portal@9000 { + compatible = "fsl,interlaken-lac-portal-v1.0"; + reg = <0x9000 0x1000>; +}; + +lportal10: lac-portal@A000 { + compatible = "fsl,interlaken-lac-portal-v1.0"; + reg = <0xA000 0x1000>; +}; + +lportal11: lac-portal@B000 { + compatible = "fsl,interlaken-lac-portal-v1.0"; + reg = <0xB000 0x1000>; +}; + +lportal12: lac-portal@C000 { + compatible = "fsl,interlaken-lac-portal-v1.0"; + reg = <0xC000 0x1000>; +}; + +lportal13: lac-portal@D000 { + compatible = "fsl,interlaken-lac-portal-v1.0"; + reg = <0xD000 0x1000>; +}; + +lportal14: lac-portal@E000 { + compatible = "fsl,interlaken-lac-portal-v1.0"; + reg = <0xE000 0x1000>; +}; + +lportal15: lac-portal@F000 { + compatible = "fsl,interlaken-lac-portal-v1.0"; + reg = <0xF000 0x1000>; +}; + +lportal16: lac-portal@10000 { + compatible = "fsl,interlaken-lac-portal-v1.0"; + reg = <0x10000 0x1000>; +}; + +lportal17: lac-portal@11000 { + compatible = "fsl,interlaken-lac-portal-v1.0"; + reg = <0x11000 0x1000>; +}; + +lportal18: lac-portal@1200 { + compatible = "fsl,interlaken-lac-portal-v1.0"; + reg = <0x12000 0x1000>; +}; + +lportal19: lac-portal@13000 { + compatible = "fsl,interlaken-lac-portal-v1.0"; + reg = <0x13000 0x1000>; +}; + +lportal20: lac-portal@14000 { + compatible = "fsl,interlaken-lac-portal-v1.0"; + reg = <0x14000 0x1000>; +}; + +lportal21: lac-portal@15000 { + compatible = "fsl,interlaken-lac-portal-v1.0"; + reg = <0x15000 0x1000>; +}; + +lportal22: lac-portal@16000 { + compatible = "fsl,interlaken-lac-portal-v1.0"; + reg = <0x16000 0x1000>; +}; + +lportal23: lac-portal@17000 { + compatible = "fsl,interlaken-lac-portal-v1.0"; + reg = <0x17000 0x1000>; +}; diff --git a/arch/powerpc/boot/dts/fsl/interlaken-lac.dtsi b/arch/powerpc/boot/dts/fsl/interlaken-lac.dtsi new file mode 100644 index 000000000000..e8208720ac0e --- /dev/null +++ b/arch/powerpc/boot/dts/fsl/interlaken-lac.dtsi @@ -0,0 +1,45 @@ +/* + * T4 Interlaken Look-aside Controller (LAC) device tree stub + * + * Copyright 2012 Freescale Semiconductor Inc. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Freescale Semiconductor nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * + * ALTERNATIVELY, this software may be distributed under the terms of the + * GNU General Public License ("GPL") as published by the Free Software + * Foundation, either version 2 of that License or (at your option) any + * later version. + * + * THIS SOFTWARE IS PROVIDED BY Freescale Semiconductor "AS IS" AND ANY + * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL Freescale Semiconductor BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +lac: lac@229000 { + compatible = "fsl,interlaken-lac"; + reg = <0x229000 0x1000>; + interrupts = <16 2 1 18>; +}; + +lac-hv@228000 { + compatible = "fsl,interlaken-lac-hv"; + reg = <0x228000 0x1000>; + fsl,non-hv-node = <&lac>; +}; -- cgit v1.2.3 From 330a1eb7775ba876dbd46b9885556e57f705e3d4 Mon Sep 17 00:00:00 2001 From: Michael Ellerman Date: Fri, 28 Jun 2013 18:15:16 +1000 Subject: powerpc/perf: Core EBB support for 64-bit book3s Add support for EBB (Event Based Branches) on 64-bit book3s. See the included documentation for more details. EBBs are a feature which allows the hardware to branch directly to a specified user space address when a PMU event overflows. This can be used by programs for self-monitoring with no kernel involvement in the inner loop. Most of the logic is in the generic book3s code, primarily to avoid a proliferation of PMU callbacks. Signed-off-by: Michael Ellerman Signed-off-by: Benjamin Herrenschmidt --- Documentation/powerpc/00-INDEX | 2 + Documentation/powerpc/pmu-ebb.txt | 137 +++++++++++++++++++++++ arch/powerpc/include/asm/perf_event_server.h | 6 + arch/powerpc/include/asm/processor.h | 3 +- arch/powerpc/include/asm/reg.h | 8 ++ arch/powerpc/include/asm/switch_to.h | 14 +++ arch/powerpc/kernel/process.c | 4 + arch/powerpc/perf/core-book3s.c | 161 ++++++++++++++++++++++++--- 8 files changed, 321 insertions(+), 14 deletions(-) create mode 100644 Documentation/powerpc/pmu-ebb.txt (limited to 'Documentation') diff --git a/Documentation/powerpc/00-INDEX b/Documentation/powerpc/00-INDEX index dd9e92802ec0..05026ce1875e 100644 --- a/Documentation/powerpc/00-INDEX +++ b/Documentation/powerpc/00-INDEX @@ -14,6 +14,8 @@ hvcs.txt - IBM "Hypervisor Virtual Console Server" Installation Guide mpc52xx.txt - Linux 2.6.x on MPC52xx family +pmu-ebb.txt + - Description of the API for using the PMU with Event Based Branches. qe_firmware.txt - describes the layout of firmware binaries for the Freescale QUICC Engine and the code that parses and uploads the microcode therein. diff --git a/Documentation/powerpc/pmu-ebb.txt b/Documentation/powerpc/pmu-ebb.txt new file mode 100644 index 000000000000..73cd163dbfb8 --- /dev/null +++ b/Documentation/powerpc/pmu-ebb.txt @@ -0,0 +1,137 @@ +PMU Event Based Branches +======================== + +Event Based Branches (EBBs) are a feature which allows the hardware to +branch directly to a specified user space address when certain events occur. + +The full specification is available in Power ISA v2.07: + + https://www.power.org/documentation/power-isa-version-2-07/ + +One type of event for which EBBs can be configured is PMU exceptions. This +document describes the API for configuring the Power PMU to generate EBBs, +using the Linux perf_events API. + + +Terminology +----------- + +Throughout this document we will refer to an "EBB event" or "EBB events". This +just refers to a struct perf_event which has set the "EBB" flag in its +attr.config. All events which can be configured on the hardware PMU are +possible "EBB events". + + +Background +---------- + +When a PMU EBB occurs it is delivered to the currently running process. As such +EBBs can only sensibly be used by programs for self-monitoring. + +It is a feature of the perf_events API that events can be created on other +processes, subject to standard permission checks. This is also true of EBB +events, however unless the target process enables EBBs (via mtspr(BESCR)) no +EBBs will ever be delivered. + +This makes it possible for a process to enable EBBs for itself, but not +actually configure any events. At a later time another process can come along +and attach an EBB event to the process, which will then cause EBBs to be +delivered to the first process. It's not clear if this is actually useful. + + +When the PMU is configured for EBBs, all PMU interrupts are delivered to the +user process. This means once an EBB event is scheduled on the PMU, no non-EBB +events can be configured. This means that EBB events can not be run +concurrently with regular 'perf' commands, or any other perf events. + +It is however safe to run 'perf' commands on a process which is using EBBs. The +kernel will in general schedule the EBB event, and perf will be notified that +its events could not run. + +The exclusion between EBB events and regular events is implemented using the +existing "pinned" and "exclusive" attributes of perf_events. This means EBB +events will be given priority over other events, unless they are also pinned. +If an EBB event and a regular event are both pinned, then whichever is enabled +first will be scheduled and the other will be put in error state. See the +section below titled "Enabling an EBB event" for more information. + + +Creating an EBB event +--------------------- + +To request that an event is counted using EBB, the event code should have bit +63 set. + +EBB events must be created with a particular, and restrictive, set of +attributes - this is so that they interoperate correctly with the rest of the +perf_events subsystem. + +An EBB event must be created with the "pinned" and "exclusive" attributes set. +Note that if you are creating a group of EBB events, only the leader can have +these attributes set. + +An EBB event must NOT set any of the "inherit", "sample_period", "freq" or +"enable_on_exec" attributes. + +An EBB event must be attached to a task. This is specified to perf_event_open() +by passing a pid value, typically 0 indicating the current task. + +All events in a group must agree on whether they want EBB. That is all events +must request EBB, or none may request EBB. + +EBB events must specify the PMC they are to be counted on. This ensures +userspace is able to reliably determine which PMC the event is scheduled on. + + +Enabling an EBB event +--------------------- + +Once an EBB event has been successfully opened, it must be enabled with the +perf_events API. This can be achieved either via the ioctl() interface, or the +prctl() interface. + +However, due to the design of the perf_events API, enabling an event does not +guarantee that it has been scheduled on the PMU. To ensure that the EBB event +has been scheduled on the PMU, you must perform a read() on the event. If the +read() returns EOF, then the event has not been scheduled and EBBs are not +enabled. + +This behaviour occurs because the EBB event is pinned and exclusive. When the +EBB event is enabled it will force all other non-pinned events off the PMU. In +this case the enable will be successful. However if there is already an event +pinned on the PMU then the enable will not be successful. + + +Reading an EBB event +-------------------- + +It is possible to read() from an EBB event. However the results are +meaningless. Because interrupts are being delivered to the user process the +kernel is not able to count the event, and so will return a junk value. + + +Closing an EBB event +-------------------- + +When an EBB event is finished with, you can close it using close() as for any +regular event. If this is the last EBB event the PMU will be deconfigured and +no further PMU EBBs will be delivered. + + +EBB Handler +----------- + +The EBB handler is just regular userspace code, however it must be written in +the style of an interrupt handler. When the handler is entered all registers +are live (possibly) and so must be saved somehow before the handler can invoke +other code. + +It's up to the program how to handle this. For C programs a relatively simple +option is to create an interrupt frame on the stack and save registers there. + +Fork +---- + +EBB events are not inherited across fork. If the child process wishes to use +EBBs it should open a new event for itself. Similarly the EBB state in +BESCR/EBBHR/EBBRR is cleared across fork(). diff --git a/arch/powerpc/include/asm/perf_event_server.h b/arch/powerpc/include/asm/perf_event_server.h index f265049dd7d6..2dd7bfc459be 100644 --- a/arch/powerpc/include/asm/perf_event_server.h +++ b/arch/powerpc/include/asm/perf_event_server.h @@ -60,6 +60,7 @@ struct power_pmu { #define PPMU_HAS_SSLOT 0x00000020 /* Has sampled slot in MMCRA */ #define PPMU_HAS_SIER 0x00000040 /* Has SIER */ #define PPMU_BHRB 0x00000080 /* has BHRB feature enabled */ +#define PPMU_EBB 0x00000100 /* supports event based branch */ /* * Values for flags to get_alternatives() @@ -68,6 +69,11 @@ struct power_pmu { #define PPMU_LIMITED_PMC_REQD 2 /* have to put this on a limited PMC */ #define PPMU_ONLY_COUNT_RUN 4 /* only counting in run state */ +/* + * We use the event config bit 63 as a flag to request EBB. + */ +#define EVENT_CONFIG_EBB_SHIFT 63 + extern int register_power_pmu(struct power_pmu *); struct pt_regs; diff --git a/arch/powerpc/include/asm/processor.h b/arch/powerpc/include/asm/processor.h index 3f19df3cc7a3..47a35b08b963 100644 --- a/arch/powerpc/include/asm/processor.h +++ b/arch/powerpc/include/asm/processor.h @@ -287,8 +287,9 @@ struct thread_struct { unsigned long siar; unsigned long sdar; unsigned long sier; - unsigned long mmcr0; unsigned long mmcr2; + unsigned mmcr0; + unsigned used_ebb; #endif }; diff --git a/arch/powerpc/include/asm/reg.h b/arch/powerpc/include/asm/reg.h index 362142b69d5b..5d7d9c2a5473 100644 --- a/arch/powerpc/include/asm/reg.h +++ b/arch/powerpc/include/asm/reg.h @@ -621,6 +621,9 @@ #define MMCR0_PMXE 0x04000000UL /* performance monitor exception enable */ #define MMCR0_FCECE 0x02000000UL /* freeze ctrs on enabled cond or event */ #define MMCR0_TBEE 0x00400000UL /* time base exception enable */ +#define MMCR0_EBE 0x00100000UL /* Event based branch enable */ +#define MMCR0_PMCC 0x000c0000UL /* PMC control */ +#define MMCR0_PMCC_U6 0x00080000UL /* PMC1-6 are R/W by user (PR) */ #define MMCR0_PMC1CE 0x00008000UL /* PMC1 count enable*/ #define MMCR0_PMCjCE 0x00004000UL /* PMCj count enable*/ #define MMCR0_TRIGGER 0x00002000UL /* TRIGGER enable */ @@ -674,6 +677,11 @@ #define SIER_SIAR_VALID 0x0400000 /* SIAR contents valid */ #define SIER_SDAR_VALID 0x0200000 /* SDAR contents valid */ +/* When EBB is enabled, some of MMCR0/MMCR2/SIER are user accessible */ +#define MMCR0_USER_MASK (MMCR0_FC | MMCR0_PMXE | MMCR0_PMAO) +#define MMCR2_USER_MASK 0x4020100804020000UL /* (FC1P|FC2P|FC3P|FC4P|FC5P|FC6P) */ +#define SIER_USER_MASK 0x7fffffUL + #define SPRN_PA6T_MMCR0 795 #define PA6T_MMCR0_EN0 0x0000000000000001UL #define PA6T_MMCR0_EN1 0x0000000000000002UL diff --git a/arch/powerpc/include/asm/switch_to.h b/arch/powerpc/include/asm/switch_to.h index 200d763a0a67..49a13e0ef234 100644 --- a/arch/powerpc/include/asm/switch_to.h +++ b/arch/powerpc/include/asm/switch_to.h @@ -67,4 +67,18 @@ static inline void flush_spe_to_thread(struct task_struct *t) } #endif +static inline void clear_task_ebb(struct task_struct *t) +{ +#ifdef CONFIG_PPC_BOOK3S_64 + /* EBB perf events are not inherited, so clear all EBB state. */ + t->thread.bescr = 0; + t->thread.mmcr2 = 0; + t->thread.mmcr0 = 0; + t->thread.siar = 0; + t->thread.sdar = 0; + t->thread.sier = 0; + t->thread.used_ebb = 0; +#endif +} + #endif /* _ASM_POWERPC_SWITCH_TO_H */ diff --git a/arch/powerpc/kernel/process.c b/arch/powerpc/kernel/process.c index b0f3e3f77e72..f8a76e6207bd 100644 --- a/arch/powerpc/kernel/process.c +++ b/arch/powerpc/kernel/process.c @@ -916,7 +916,11 @@ int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src) flush_altivec_to_thread(src); flush_vsx_to_thread(src); flush_spe_to_thread(src); + *dst = *src; + + clear_task_ebb(dst); + return 0; } diff --git a/arch/powerpc/perf/core-book3s.c b/arch/powerpc/perf/core-book3s.c index c91dc43e04de..a3985aee77fe 100644 --- a/arch/powerpc/perf/core-book3s.c +++ b/arch/powerpc/perf/core-book3s.c @@ -77,6 +77,9 @@ static unsigned int freeze_events_kernel = MMCR0_FCS; #define MMCR0_PMCjCE MMCR0_PMCnCE #define MMCR0_FC56 0 #define MMCR0_PMAO 0 +#define MMCR0_EBE 0 +#define MMCR0_PMCC 0 +#define MMCR0_PMCC_U6 0 #define SPRN_MMCRA SPRN_MMCR2 #define MMCRA_SAMPLE_ENABLE 0 @@ -104,6 +107,15 @@ static inline int siar_valid(struct pt_regs *regs) return 1; } +static bool is_ebb_event(struct perf_event *event) { return false; } +static int ebb_event_check(struct perf_event *event) { return 0; } +static void ebb_event_add(struct perf_event *event) { } +static void ebb_switch_out(unsigned long mmcr0) { } +static unsigned long ebb_switch_in(bool ebb, unsigned long mmcr0) +{ + return mmcr0; +} + static inline void power_pmu_bhrb_enable(struct perf_event *event) {} static inline void power_pmu_bhrb_disable(struct perf_event *event) {} void power_pmu_flush_branch_stack(void) {} @@ -464,6 +476,89 @@ void power_pmu_bhrb_read(struct cpu_hw_events *cpuhw) return; } +static bool is_ebb_event(struct perf_event *event) +{ + /* + * This could be a per-PMU callback, but we'd rather avoid the cost. We + * check that the PMU supports EBB, meaning those that don't can still + * use bit 63 of the event code for something else if they wish. + */ + return (ppmu->flags & PPMU_EBB) && + ((event->attr.config >> EVENT_CONFIG_EBB_SHIFT) & 1); +} + +static int ebb_event_check(struct perf_event *event) +{ + struct perf_event *leader = event->group_leader; + + /* Event and group leader must agree on EBB */ + if (is_ebb_event(leader) != is_ebb_event(event)) + return -EINVAL; + + if (is_ebb_event(event)) { + if (!(event->attach_state & PERF_ATTACH_TASK)) + return -EINVAL; + + if (!leader->attr.pinned || !leader->attr.exclusive) + return -EINVAL; + + if (event->attr.inherit || event->attr.sample_period || + event->attr.enable_on_exec || event->attr.freq) + return -EINVAL; + } + + return 0; +} + +static void ebb_event_add(struct perf_event *event) +{ + if (!is_ebb_event(event) || current->thread.used_ebb) + return; + + /* + * IFF this is the first time we've added an EBB event, set + * PMXE in the user MMCR0 so we can detect when it's cleared by + * userspace. We need this so that we can context switch while + * userspace is in the EBB handler (where PMXE is 0). + */ + current->thread.used_ebb = 1; + current->thread.mmcr0 |= MMCR0_PMXE; +} + +static void ebb_switch_out(unsigned long mmcr0) +{ + if (!(mmcr0 & MMCR0_EBE)) + return; + + current->thread.siar = mfspr(SPRN_SIAR); + current->thread.sier = mfspr(SPRN_SIER); + current->thread.sdar = mfspr(SPRN_SDAR); + current->thread.mmcr0 = mmcr0 & MMCR0_USER_MASK; + current->thread.mmcr2 = mfspr(SPRN_MMCR2) & MMCR2_USER_MASK; +} + +static unsigned long ebb_switch_in(bool ebb, unsigned long mmcr0) +{ + if (!ebb) + goto out; + + /* Enable EBB and read/write to all 6 PMCs for userspace */ + mmcr0 |= MMCR0_EBE | MMCR0_PMCC_U6; + + /* Add any bits from the user reg, FC or PMAO */ + mmcr0 |= current->thread.mmcr0; + + /* Be careful not to set PMXE if userspace had it cleared */ + if (!(current->thread.mmcr0 & MMCR0_PMXE)) + mmcr0 &= ~MMCR0_PMXE; + + mtspr(SPRN_SIAR, current->thread.siar); + mtspr(SPRN_SIER, current->thread.sier); + mtspr(SPRN_SDAR, current->thread.sdar); + mtspr(SPRN_MMCR2, current->thread.mmcr2); +out: + return mmcr0; +} #endif /* CONFIG_PPC64 */ static void perf_event_interrupt(struct pt_regs *regs); @@ -734,6 +829,13 @@ static void power_pmu_read(struct perf_event *event) if (!event->hw.idx) return; + + if (is_ebb_event(event)) { + val = read_pmc(event->hw.idx); + local64_set(&event->hw.prev_count, val); + return; + } + /* * Performance monitor interrupts come even when interrupts * are soft-disabled, as long as interrupts are hard-enabled. @@ -854,7 +956,7 @@ static void write_mmcr0(struct cpu_hw_events *cpuhw, unsigned long mmcr0) static void power_pmu_disable(struct pmu *pmu) { struct cpu_hw_events *cpuhw; - unsigned long flags, val; + unsigned long flags, mmcr0, val; if (!ppmu) return; @@ -871,11 +973,11 @@ static void power_pmu_disable(struct pmu *pmu) } /* - * Set the 'freeze counters' bit, clear PMAO/FC56. + * Set the 'freeze counters' bit, clear EBE/PMCC/PMAO/FC56. */ - val = mfspr(SPRN_MMCR0); + val = mmcr0 = mfspr(SPRN_MMCR0); val |= MMCR0_FC; - val &= ~(MMCR0_PMAO | MMCR0_FC56); + val &= ~(MMCR0_EBE | MMCR0_PMCC | MMCR0_PMAO | MMCR0_FC56); /* * The barrier is to make sure the mtspr has been @@ -896,7 +998,10 @@ static void power_pmu_disable(struct pmu *pmu) cpuhw->disabled = 1; cpuhw->n_added = 0; + + ebb_switch_out(mmcr0); } + local_irq_restore(flags); } @@ -911,15 +1016,15 @@ static void power_pmu_enable(struct pmu *pmu) struct cpu_hw_events *cpuhw; unsigned long flags; long i; - unsigned long val; + unsigned long val, mmcr0; s64 left; unsigned int hwc_index[MAX_HWEVENTS]; int n_lim; int idx; + bool ebb; if (!ppmu) return; - local_irq_save(flags); cpuhw = &__get_cpu_var(cpu_hw_events); @@ -933,6 +1038,13 @@ static void power_pmu_enable(struct pmu *pmu) cpuhw->disabled = 0; + /* + * EBB requires an exclusive group and all events must have the EBB + * flag set, or not set, so we can just check a single event. Also we + * know we have at least one event. + */ + ebb = is_ebb_event(cpuhw->event[0]); + /* * If we didn't change anything, or only removed events, * no need to recalculate MMCR* settings and reset the PMCs. @@ -1008,25 +1120,34 @@ static void power_pmu_enable(struct pmu *pmu) ++n_lim; continue; } - val = 0; - if (event->hw.sample_period) { - left = local64_read(&event->hw.period_left); - if (left < 0x80000000L) - val = 0x80000000L - left; + + if (ebb) + val = local64_read(&event->hw.prev_count); + else { + val = 0; + if (event->hw.sample_period) { + left = local64_read(&event->hw.period_left); + if (left < 0x80000000L) + val = 0x80000000L - left; + } + local64_set(&event->hw.prev_count, val); } - local64_set(&event->hw.prev_count, val); + event->hw.idx = idx; if (event->hw.state & PERF_HES_STOPPED) val = 0; write_pmc(idx, val); + perf_event_update_userpage(event); } cpuhw->n_limited = n_lim; cpuhw->mmcr[0] |= MMCR0_PMXE | MMCR0_FCECE; out_enable: + mmcr0 = ebb_switch_in(ebb, cpuhw->mmcr[0]); + mb(); - write_mmcr0(cpuhw, cpuhw->mmcr[0]); + write_mmcr0(cpuhw, mmcr0); /* * Enable instruction sampling if necessary @@ -1124,6 +1245,8 @@ static int power_pmu_add(struct perf_event *event, int ef_flags) event->hw.config = cpuhw->events[n0]; nocheck: + ebb_event_add(event); + ++cpuhw->n_events; ++cpuhw->n_added; @@ -1484,6 +1607,11 @@ static int power_pmu_event_init(struct perf_event *event) } } + /* Extra checks for EBB */ + err = ebb_event_check(event); + if (err) + return err; + /* * If this is in a group, check if it can go on with all the * other hardware events in the group. We assume the event @@ -1522,6 +1650,13 @@ static int power_pmu_event_init(struct perf_event *event) event->hw.last_period = event->hw.sample_period; local64_set(&event->hw.period_left, event->hw.last_period); + /* + * For EBB events we just context switch the PMC value, we don't do any + * of the sample_period logic. We use hw.prev_count for this. + */ + if (is_ebb_event(event)) + local64_set(&event->hw.prev_count, 0); + /* * See if we need to reserve the PMU. * If no events are currently in use, then we have to take a -- cgit v1.2.3