aboutsummaryrefslogtreecommitdiff
path: root/drivers/iommu
diff options
context:
space:
mode:
authorLinus Torvalds2024-05-14 10:01:29 -0700
committerLinus Torvalds2024-05-14 10:01:29 -0700
commit9776dd36095be19f5a0ad9f07a4fc221d2a0609a (patch)
treebd2bf5c11cf57fc187569670911a51f0b16f2796 /drivers/iommu
parent6bfd2d442af5c373042f196eef1915e1f6ac058a (diff)
parent6ecc2e7932fe8f132d3b671685f9995785f19e9a (diff)
Merge tag 'x86-irq-2024-05-12' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip
Pull x86 interrupt handling updates from Thomas Gleixner: "Add support for posted interrupts on bare metal. Posted interrupts is a virtualization feature which allows to inject interrupts directly into a guest without host interaction. The VT-d interrupt remapping hardware sets the bit which corresponds to the interrupt vector in a vector bitmap which is either used to inject the interrupt directly into the guest via a virtualized APIC or in case that the guest is scheduled out provides a host side notification interrupt which informs the host that an interrupt has been marked pending in the bitmap. This can be utilized on bare metal for scenarios where multiple devices, e.g. NVME storage, raise interrupts with a high frequency. In the default mode these interrupts are handles independently and therefore require a full roundtrip of interrupt entry/exit. Utilizing posted interrupts this roundtrip overhead can be avoided by coalescing these interrupt entries to a single entry for the posted interrupt notification. The notification interrupt then demultiplexes the pending bits in a memory based bitmap and invokes the corresponding device specific handlers. Depending on the usage scenario and device utilization throughput improvements between 10% and 130% have been measured. As this is only relevant for high end servers with multiple device queues per CPU attached and counterproductive for situations where interrupts are arriving at distinct times, the functionality is opt-in via a kernel command line parameter" * tag 'x86-irq-2024-05-12' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: x86/irq: Use existing helper for pending vector check iommu/vt-d: Enable posted mode for device MSIs iommu/vt-d: Make posted MSI an opt-in command line option x86/irq: Extend checks for pending vectors to posted interrupts x86/irq: Factor out common code for checking pending interrupts x86/irq: Install posted MSI notification handler x86/irq: Factor out handler invocation from common_interrupt() x86/irq: Set up per host CPU posted interrupt descriptors x86/irq: Reserve a per CPU IDT vector for posted MSIs x86/irq: Add a Kconfig option for posted MSI x86/irq: Remove bitfields in posted interrupt descriptor x86/irq: Unionize PID.PIR for 64bit access w/o casting KVM: VMX: Move posted interrupt descriptor out of VMX code
Diffstat (limited to 'drivers/iommu')
-rw-r--r--drivers/iommu/intel/irq_remapping.c113
-rw-r--r--drivers/iommu/irq_remapping.c5
2 files changed, 113 insertions, 5 deletions
diff --git a/drivers/iommu/intel/irq_remapping.c b/drivers/iommu/intel/irq_remapping.c
index 566297bc87dd..712ebfc9870c 100644
--- a/drivers/iommu/intel/irq_remapping.c
+++ b/drivers/iommu/intel/irq_remapping.c
@@ -19,6 +19,7 @@
#include <asm/cpu.h>
#include <asm/irq_remapping.h>
#include <asm/pci-direct.h>
+#include <asm/posted_intr.h>
#include "iommu.h"
#include "../irq_remapping.h"
@@ -49,6 +50,7 @@ struct irq_2_iommu {
u16 sub_handle;
u8 irte_mask;
enum irq_mode mode;
+ bool posted_msi;
};
struct intel_ir_data {
@@ -1118,6 +1120,14 @@ static void prepare_irte(struct irte *irte, int vector, unsigned int dest)
irte->redir_hint = 1;
}
+static void prepare_irte_posted(struct irte *irte)
+{
+ memset(irte, 0, sizeof(*irte));
+
+ irte->present = 1;
+ irte->p_pst = 1;
+}
+
struct irq_remap_ops intel_irq_remap_ops = {
.prepare = intel_prepare_irq_remapping,
.enable = intel_enable_irq_remapping,
@@ -1126,6 +1136,47 @@ struct irq_remap_ops intel_irq_remap_ops = {
.enable_faulting = enable_drhd_fault_handling,
};
+#ifdef CONFIG_X86_POSTED_MSI
+
+static phys_addr_t get_pi_desc_addr(struct irq_data *irqd)
+{
+ int cpu = cpumask_first(irq_data_get_effective_affinity_mask(irqd));
+
+ if (WARN_ON(cpu >= nr_cpu_ids))
+ return 0;
+
+ return __pa(per_cpu_ptr(&posted_msi_pi_desc, cpu));
+}
+
+static void intel_ir_reconfigure_irte_posted(struct irq_data *irqd)
+{
+ struct intel_ir_data *ir_data = irqd->chip_data;
+ struct irte *irte = &ir_data->irte_entry;
+ struct irte irte_pi;
+ u64 pid_addr;
+
+ pid_addr = get_pi_desc_addr(irqd);
+
+ if (!pid_addr) {
+ pr_warn("Failed to setup IRQ %d for posted mode", irqd->irq);
+ return;
+ }
+
+ memset(&irte_pi, 0, sizeof(irte_pi));
+
+ /* The shared IRTE already be set up as posted during alloc_irte */
+ dmar_copy_shared_irte(&irte_pi, irte);
+
+ irte_pi.pda_l = (pid_addr >> (32 - PDA_LOW_BIT)) & ~(-1UL << PDA_LOW_BIT);
+ irte_pi.pda_h = (pid_addr >> 32) & ~(-1UL << PDA_HIGH_BIT);
+
+ modify_irte(&ir_data->irq_2_iommu, &irte_pi);
+}
+
+#else
+static inline void intel_ir_reconfigure_irte_posted(struct irq_data *irqd) {}
+#endif
+
static void intel_ir_reconfigure_irte(struct irq_data *irqd, bool force)
{
struct intel_ir_data *ir_data = irqd->chip_data;
@@ -1139,8 +1190,9 @@ static void intel_ir_reconfigure_irte(struct irq_data *irqd, bool force)
irte->vector = cfg->vector;
irte->dest_id = IRTE_DEST(cfg->dest_apicid);
- /* Update the hardware only if the interrupt is in remapped mode. */
- if (force || ir_data->irq_2_iommu.mode == IRQ_REMAPPING)
+ if (ir_data->irq_2_iommu.posted_msi)
+ intel_ir_reconfigure_irte_posted(irqd);
+ else if (force || ir_data->irq_2_iommu.mode == IRQ_REMAPPING)
modify_irte(&ir_data->irq_2_iommu, irte);
}
@@ -1194,7 +1246,7 @@ static int intel_ir_set_vcpu_affinity(struct irq_data *data, void *info)
struct intel_ir_data *ir_data = data->chip_data;
struct vcpu_data *vcpu_pi_info = info;
- /* stop posting interrupts, back to remapping mode */
+ /* stop posting interrupts, back to the default mode */
if (!vcpu_pi_info) {
modify_irte(&ir_data->irq_2_iommu, &ir_data->irte_entry);
} else {
@@ -1233,6 +1285,49 @@ static struct irq_chip intel_ir_chip = {
.irq_set_vcpu_affinity = intel_ir_set_vcpu_affinity,
};
+/*
+ * With posted MSIs, all vectors are multiplexed into a single notification
+ * vector. Devices MSIs are then dispatched in a demux loop where
+ * EOIs can be coalesced as well.
+ *
+ * "INTEL-IR-POST" IRQ chip does not do EOI on ACK, thus the dummy irq_ack()
+ * function. Instead EOI is performed by the posted interrupt notification
+ * handler.
+ *
+ * For the example below, 3 MSIs are coalesced into one CPU notification. Only
+ * one apic_eoi() is needed.
+ *
+ * __sysvec_posted_msi_notification()
+ * irq_enter();
+ * handle_edge_irq()
+ * irq_chip_ack_parent()
+ * dummy(); // No EOI
+ * handle_irq_event()
+ * driver_handler()
+ * handle_edge_irq()
+ * irq_chip_ack_parent()
+ * dummy(); // No EOI
+ * handle_irq_event()
+ * driver_handler()
+ * handle_edge_irq()
+ * irq_chip_ack_parent()
+ * dummy(); // No EOI
+ * handle_irq_event()
+ * driver_handler()
+ * apic_eoi()
+ * irq_exit()
+ */
+
+static void dummy_ack(struct irq_data *d) { }
+
+static struct irq_chip intel_ir_chip_post_msi = {
+ .name = "INTEL-IR-POST",
+ .irq_ack = dummy_ack,
+ .irq_set_affinity = intel_ir_set_affinity,
+ .irq_compose_msi_msg = intel_ir_compose_msi_msg,
+ .irq_set_vcpu_affinity = intel_ir_set_vcpu_affinity,
+};
+
static void fill_msi_msg(struct msi_msg *msg, u32 index, u32 subhandle)
{
memset(msg, 0, sizeof(*msg));
@@ -1274,6 +1369,11 @@ static void intel_irq_remapping_prepare_irte(struct intel_ir_data *data,
break;
case X86_IRQ_ALLOC_TYPE_PCI_MSI:
case X86_IRQ_ALLOC_TYPE_PCI_MSIX:
+ if (posted_msi_supported()) {
+ prepare_irte_posted(irte);
+ data->irq_2_iommu.posted_msi = 1;
+ }
+
set_msi_sid(irte,
pci_real_dma_dev(msi_desc_to_pci_dev(info->desc)));
break;
@@ -1361,7 +1461,12 @@ static int intel_irq_remapping_alloc(struct irq_domain *domain,
irq_data->hwirq = (index << 16) + i;
irq_data->chip_data = ird;
- irq_data->chip = &intel_ir_chip;
+ if (posted_msi_supported() &&
+ ((info->type == X86_IRQ_ALLOC_TYPE_PCI_MSI) ||
+ (info->type == X86_IRQ_ALLOC_TYPE_PCI_MSIX)))
+ irq_data->chip = &intel_ir_chip_post_msi;
+ else
+ irq_data->chip = &intel_ir_chip;
intel_irq_remapping_prepare_irte(ird, irq_cfg, info, index, i);
irq_set_status_flags(virq + i, IRQ_MOVE_PCNTXT);
}
diff --git a/drivers/iommu/irq_remapping.c b/drivers/iommu/irq_remapping.c
index ee59647c2050..056fec6991bc 100644
--- a/drivers/iommu/irq_remapping.c
+++ b/drivers/iommu/irq_remapping.c
@@ -24,6 +24,8 @@ int no_x2apic_optout;
int disable_irq_post = 0;
+bool enable_posted_msi __ro_after_init;
+
static int disable_irq_remap;
static struct irq_remap_ops *remap_ops;
@@ -70,7 +72,8 @@ static __init int setup_irqremap(char *str)
no_x2apic_optout = 1;
else if (!strncmp(str, "nopost", 6))
disable_irq_post = 1;
-
+ else if (IS_ENABLED(CONFIG_X86_POSTED_MSI) && !strncmp(str, "posted_msi", 10))
+ enable_posted_msi = true;
str += strcspn(str, ",");
while (*str == ',')
str++;