diff options
-rw-r--r-- | drivers/net/ethernet/sfc/ef10.c | 14 | ||||
-rw-r--r-- | drivers/net/ethernet/sfc/efx.c | 269 | ||||
-rw-r--r-- | drivers/net/ethernet/sfc/efx.h | 3 | ||||
-rw-r--r-- | drivers/net/ethernet/sfc/ethtool.c | 25 | ||||
-rw-r--r-- | drivers/net/ethernet/sfc/net_driver.h | 64 | ||||
-rw-r--r-- | drivers/net/ethernet/sfc/rx.c | 149 | ||||
-rw-r--r-- | drivers/net/ethernet/sfc/tx.c | 92 |
7 files changed, 572 insertions, 44 deletions
diff --git a/drivers/net/ethernet/sfc/ef10.c b/drivers/net/ethernet/sfc/ef10.c index 0ec13f520e90..ad68eb0cb8fd 100644 --- a/drivers/net/ethernet/sfc/ef10.c +++ b/drivers/net/ethernet/sfc/ef10.c @@ -946,8 +946,10 @@ static int efx_ef10_link_piobufs(struct efx_nic *efx) /* Extra channels, even those with TXQs (PTP), do not require * PIO resources. */ - if (!channel->type->want_pio) + if (!channel->type->want_pio || + channel->channel >= efx->xdp_channel_offset) continue; + efx_for_each_channel_tx_queue(tx_queue, channel) { /* We assign the PIO buffers to queues in * reverse order to allow for the following @@ -1296,8 +1298,9 @@ static int efx_ef10_dimension_resources(struct efx_nic *efx) int rc; channel_vis = max(efx->n_channels, - (efx->n_tx_channels + efx->n_extra_tx_channels) * - EFX_TXQ_TYPES); + ((efx->n_tx_channels + efx->n_extra_tx_channels) * + EFX_TXQ_TYPES) + + efx->n_xdp_channels * efx->xdp_tx_per_channel); #ifdef EFX_USE_PIO /* Try to allocate PIO buffers if wanted and if the full @@ -2434,11 +2437,12 @@ static void efx_ef10_tx_init(struct efx_tx_queue *tx_queue) /* TSOv2 is a limited resource that can only be configured on a limited * number of queues. TSO without checksum offload is not really a thing, * so we only enable it for those queues. - * TSOv2 cannot be used with Hardware timestamping. + * TSOv2 cannot be used with Hardware timestamping, and is never needed + * for XDP tx. 
*/ if (csum_offload && (nic_data->datapath_caps2 & (1 << MC_CMD_GET_CAPABILITIES_V2_OUT_TX_TSO_V2_LBN)) && - !tx_queue->timestamping) { + !tx_queue->timestamping && !tx_queue->xdp_tx) { tso_v2 = true; netif_dbg(efx, hw, efx->net_dev, "Using TSOv2 for channel %u\n", channel->channel); diff --git a/drivers/net/ethernet/sfc/efx.c b/drivers/net/ethernet/sfc/efx.c index 2fef7402233e..0fa9972027db 100644 --- a/drivers/net/ethernet/sfc/efx.c +++ b/drivers/net/ethernet/sfc/efx.c @@ -226,6 +226,10 @@ static void efx_fini_napi_channel(struct efx_channel *channel); static void efx_fini_struct(struct efx_nic *efx); static void efx_start_all(struct efx_nic *efx); static void efx_stop_all(struct efx_nic *efx); +static int efx_xdp_setup_prog(struct efx_nic *efx, struct bpf_prog *prog); +static int efx_xdp(struct net_device *dev, struct netdev_bpf *xdp); +static int efx_xdp_xmit(struct net_device *dev, int n, struct xdp_frame **xdpfs, + u32 flags); #define EFX_ASSERT_RESET_SERIALISED(efx) \ do { \ @@ -340,6 +344,8 @@ static int efx_poll(struct napi_struct *napi, int budget) spent = efx_process_channel(channel, budget); + xdp_do_flush_map(); + if (spent < budget) { if (efx_channel_has_rx_queue(channel) && efx->irq_rx_adaptive && @@ -579,9 +585,14 @@ efx_get_channel_name(struct efx_channel *channel, char *buf, size_t len) int number; number = channel->channel; - if (efx->tx_channel_offset == 0) { + + if (number >= efx->xdp_channel_offset && + !WARN_ON_ONCE(!efx->n_xdp_channels)) { + type = "-xdp"; + number -= efx->xdp_channel_offset; + } else if (efx->tx_channel_offset == 0) { type = ""; - } else if (channel->channel < efx->tx_channel_offset) { + } else if (number < efx->tx_channel_offset) { type = "-rx"; } else { type = "-tx"; @@ -651,7 +662,7 @@ static void efx_start_datapath(struct efx_nic *efx) efx->rx_dma_len = (efx->rx_prefix_size + EFX_MAX_FRAME_LEN(efx->net_dev->mtu) + efx->type->rx_buffer_padding); - rx_buf_len = (sizeof(struct efx_rx_page_state) + + rx_buf_len = 
(sizeof(struct efx_rx_page_state) + XDP_PACKET_HEADROOM + efx->rx_ip_align + efx->rx_dma_len); if (rx_buf_len <= PAGE_SIZE) { efx->rx_scatter = efx->type->always_rx_scatter; @@ -774,6 +785,7 @@ static void efx_stop_datapath(struct efx_nic *efx) efx_for_each_possible_channel_tx_queue(tx_queue, channel) efx_fini_tx_queue(tx_queue); } + efx->xdp_rxq_info_failed = false; } static void efx_remove_channel(struct efx_channel *channel) @@ -798,6 +810,8 @@ static void efx_remove_channels(struct efx_nic *efx) efx_for_each_channel(channel, efx) efx_remove_channel(channel); + + kfree(efx->xdp_tx_queues); } int @@ -1435,6 +1449,101 @@ static unsigned int efx_wanted_parallelism(struct efx_nic *efx) return count; } +static int efx_allocate_msix_channels(struct efx_nic *efx, + unsigned int max_channels, + unsigned int extra_channels, + unsigned int parallelism) +{ + unsigned int n_channels = parallelism; + int vec_count; + int n_xdp_tx; + int n_xdp_ev; + + if (efx_separate_tx_channels) + n_channels *= 2; + n_channels += extra_channels; + + /* To allow XDP transmit to happen from arbitrary NAPI contexts + * we allocate a TX queue per CPU. We share event queues across + * multiple tx queues, assuming tx and ev queues are both + * maximum size. + */ + + n_xdp_tx = num_possible_cpus(); + n_xdp_ev = DIV_ROUND_UP(n_xdp_tx, EFX_TXQ_TYPES); + + /* Check resources. + * We need a channel per event queue, plus a VI per tx queue. + * This may be more pessimistic than it needs to be. 
+ */ + if (n_channels + n_xdp_ev > max_channels) { + netif_err(efx, drv, efx->net_dev, + "Insufficient resources for %d XDP event queues (%d other channels, max %d)\n", + n_xdp_ev, n_channels, max_channels); + efx->n_xdp_channels = 0; + efx->xdp_tx_per_channel = 0; + efx->xdp_tx_queue_count = 0; + } else { + efx->n_xdp_channels = n_xdp_ev; + efx->xdp_tx_per_channel = EFX_TXQ_TYPES; + efx->xdp_tx_queue_count = n_xdp_tx; + n_channels += n_xdp_ev; + netif_dbg(efx, drv, efx->net_dev, + "Allocating %d TX and %d event queues for XDP\n", + n_xdp_tx, n_xdp_ev); + } + + n_channels = min(n_channels, max_channels); + + vec_count = pci_msix_vec_count(efx->pci_dev); + if (vec_count < 0) + return vec_count; + if (vec_count < n_channels) { + netif_err(efx, drv, efx->net_dev, + "WARNING: Insufficient MSI-X vectors available (%d < %u).\n", + vec_count, n_channels); + netif_err(efx, drv, efx->net_dev, + "WARNING: Performance may be reduced.\n"); + n_channels = vec_count; + } + + efx->n_channels = n_channels; + + /* Do not create the PTP TX queue(s) if PTP uses the MC directly. */ + if (extra_channels && !efx_ptp_use_mac_tx_timestamps(efx)) + n_channels--; + + /* Ignore XDP tx channels when creating rx channels. 
*/ + n_channels -= efx->n_xdp_channels; + + if (efx_separate_tx_channels) { + efx->n_tx_channels = + min(max(n_channels / 2, 1U), + efx->max_tx_channels); + efx->tx_channel_offset = + n_channels - efx->n_tx_channels; + efx->n_rx_channels = + max(n_channels - + efx->n_tx_channels, 1U); + } else { + efx->n_tx_channels = min(n_channels, efx->max_tx_channels); + efx->tx_channel_offset = 0; + efx->n_rx_channels = n_channels; + } + + if (efx->n_xdp_channels) + efx->xdp_channel_offset = efx->tx_channel_offset + + efx->n_tx_channels; + else + efx->xdp_channel_offset = efx->n_channels; + + netif_dbg(efx, drv, efx->net_dev, + "Allocating %u RX channels\n", + efx->n_rx_channels); + + return efx->n_channels; +} + /* Probe the number and type of interrupts we are able to obtain, and * the resulting numbers of channels and RX queues. */ @@ -1449,19 +1558,19 @@ static int efx_probe_interrupts(struct efx_nic *efx) ++extra_channels; if (efx->interrupt_mode == EFX_INT_MODE_MSIX) { + unsigned int parallelism = efx_wanted_parallelism(efx); struct msix_entry xentries[EFX_MAX_CHANNELS]; unsigned int n_channels; - n_channels = efx_wanted_parallelism(efx); - if (efx_separate_tx_channels) - n_channels *= 2; - n_channels += extra_channels; - n_channels = min(n_channels, efx->max_channels); - - for (i = 0; i < n_channels; i++) - xentries[i].entry = i; - rc = pci_enable_msix_range(efx->pci_dev, - xentries, 1, n_channels); + rc = efx_allocate_msix_channels(efx, efx->max_channels, + extra_channels, parallelism); + if (rc >= 0) { + n_channels = rc; + for (i = 0; i < n_channels; i++) + xentries[i].entry = i; + rc = pci_enable_msix_range(efx->pci_dev, xentries, 1, + n_channels); + } if (rc < 0) { /* Fall back to single channel MSI */ netif_err(efx, drv, efx->net_dev, @@ -1480,21 +1589,6 @@ static int efx_probe_interrupts(struct efx_nic *efx) } if (rc > 0) { - efx->n_channels = n_channels; - if (n_channels > extra_channels) - n_channels -= extra_channels; - if (efx_separate_tx_channels) { - 
efx->n_tx_channels = min(max(n_channels / 2, - 1U), - efx->max_tx_channels); - efx->n_rx_channels = max(n_channels - - efx->n_tx_channels, - 1U); - } else { - efx->n_tx_channels = min(n_channels, - efx->max_tx_channels); - efx->n_rx_channels = n_channels; - } for (i = 0; i < efx->n_channels; i++) efx_get_channel(efx, i)->irq = xentries[i].vector; @@ -1506,6 +1600,8 @@ static int efx_probe_interrupts(struct efx_nic *efx) efx->n_channels = 1; efx->n_rx_channels = 1; efx->n_tx_channels = 1; + efx->n_xdp_channels = 0; + efx->xdp_channel_offset = efx->n_channels; rc = pci_enable_msi(efx->pci_dev); if (rc == 0) { efx_get_channel(efx, 0)->irq = efx->pci_dev->irq; @@ -1524,12 +1620,14 @@ static int efx_probe_interrupts(struct efx_nic *efx) efx->n_channels = 1 + (efx_separate_tx_channels ? 1 : 0); efx->n_rx_channels = 1; efx->n_tx_channels = 1; + efx->n_xdp_channels = 0; + efx->xdp_channel_offset = efx->n_channels; efx->legacy_irq = efx->pci_dev->irq; } - /* Assign extra channels if possible */ + /* Assign extra channels if possible, before XDP channels */ efx->n_extra_tx_channels = 0; - j = efx->n_channels; + j = efx->xdp_channel_offset; for (i = 0; i < EFX_MAX_EXTRA_CHANNELS; i++) { if (!efx->extra_channel_type[i]) continue; @@ -1724,29 +1822,50 @@ static void efx_remove_interrupts(struct efx_nic *efx) efx->legacy_irq = 0; } -static void efx_set_channels(struct efx_nic *efx) +static int efx_set_channels(struct efx_nic *efx) { struct efx_channel *channel; struct efx_tx_queue *tx_queue; + int xdp_queue_number; efx->tx_channel_offset = efx_separate_tx_channels ? efx->n_channels - efx->n_tx_channels : 0; + if (efx->xdp_tx_queue_count) { + EFX_WARN_ON_PARANOID(efx->xdp_tx_queues); + + /* Allocate array for XDP TX queue lookup. 
*/ + efx->xdp_tx_queues = kcalloc(efx->xdp_tx_queue_count, + sizeof(*efx->xdp_tx_queues), + GFP_KERNEL); + if (!efx->xdp_tx_queues) + return -ENOMEM; + } + /* We need to mark which channels really have RX and TX * queues, and adjust the TX queue numbers if we have separate * RX-only and TX-only channels. */ + xdp_queue_number = 0; efx_for_each_channel(channel, efx) { if (channel->channel < efx->n_rx_channels) channel->rx_queue.core_index = channel->channel; else channel->rx_queue.core_index = -1; - efx_for_each_channel_tx_queue(tx_queue, channel) + efx_for_each_channel_tx_queue(tx_queue, channel) { tx_queue->queue -= (efx->tx_channel_offset * EFX_TXQ_TYPES); + + if (efx_channel_is_xdp_tx(channel) && + xdp_queue_number < efx->xdp_tx_queue_count) { + efx->xdp_tx_queues[xdp_queue_number] = tx_queue; + xdp_queue_number++; + } + } } + return 0; } static int efx_probe_nic(struct efx_nic *efx) @@ -1776,7 +1895,9 @@ static int efx_probe_nic(struct efx_nic *efx) if (rc) goto fail1; - efx_set_channels(efx); + rc = efx_set_channels(efx); + if (rc) + goto fail1; /* dimension_resources can fail with EAGAIN */ rc = efx->type->dimension_resources(efx); @@ -2022,6 +2143,10 @@ static void efx_stop_all(struct efx_nic *efx) static void efx_remove_all(struct efx_nic *efx) { + rtnl_lock(); + efx_xdp_setup_prog(efx, NULL); + rtnl_unlock(); + efx_remove_channels(efx); efx_remove_filters(efx); #ifdef CONFIG_SFC_SRIOV @@ -2082,6 +2207,8 @@ int efx_init_irq_moderation(struct efx_nic *efx, unsigned int tx_usecs, channel->irq_moderation_us = rx_usecs; else if (efx_channel_has_tx_queues(channel)) channel->irq_moderation_us = tx_usecs; + else if (efx_channel_is_xdp_tx(channel)) + channel->irq_moderation_us = tx_usecs; } return 0; @@ -2277,6 +2404,17 @@ static void efx_watchdog(struct net_device *net_dev) efx_schedule_reset(efx, RESET_TYPE_TX_WATCHDOG); } +static unsigned int efx_xdp_max_mtu(struct efx_nic *efx) +{ + /* The maximum MTU that we can fit in a single page, allowing for + * framing, 
overhead and XDP headroom. + */ + int overhead = EFX_MAX_FRAME_LEN(0) + sizeof(struct efx_rx_page_state) + + efx->rx_prefix_size + efx->type->rx_buffer_padding + + efx->rx_ip_align + XDP_PACKET_HEADROOM; + + return PAGE_SIZE - overhead; +} /* Context: process, rtnl_lock() held. */ static int efx_change_mtu(struct net_device *net_dev, int new_mtu) @@ -2288,6 +2426,14 @@ static int efx_change_mtu(struct net_device *net_dev, int new_mtu) if (rc) return rc; + if (rtnl_dereference(efx->xdp_prog) && + new_mtu > efx_xdp_max_mtu(efx)) { + netif_err(efx, drv, efx->net_dev, + "Requested MTU of %d too big for XDP (max: %d)\n", + new_mtu, efx_xdp_max_mtu(efx)); + return -EINVAL; + } + netif_dbg(efx, drv, efx->net_dev, "changing MTU to %d\n", new_mtu); efx_device_detach_sync(efx); @@ -2489,8 +2635,65 @@ static const struct net_device_ops efx_netdev_ops = { #endif .ndo_udp_tunnel_add = efx_udp_tunnel_add, .ndo_udp_tunnel_del = efx_udp_tunnel_del, + .ndo_xdp_xmit = efx_xdp_xmit, + .ndo_bpf = efx_xdp }; +static int efx_xdp_setup_prog(struct efx_nic *efx, struct bpf_prog *prog) +{ + struct bpf_prog *old_prog; + + if (efx->xdp_rxq_info_failed) { + netif_err(efx, drv, efx->net_dev, + "Unable to bind XDP program due to previous failure of rxq_info\n"); + return -EINVAL; + } + + if (prog && efx->net_dev->mtu > efx_xdp_max_mtu(efx)) { + netif_err(efx, drv, efx->net_dev, + "Unable to configure XDP with MTU of %d (max: %d)\n", + efx->net_dev->mtu, efx_xdp_max_mtu(efx)); + return -EINVAL; + } + + old_prog = rtnl_dereference(efx->xdp_prog); + rcu_assign_pointer(efx->xdp_prog, prog); + /* Release the reference that was originally passed by the caller. */ + if (old_prog) + bpf_prog_put(old_prog); + + return 0; +} + +/* Context: process, rtnl_lock() held. 
*/ +static int efx_xdp(struct net_device *dev, struct netdev_bpf *xdp) +{ + struct efx_nic *efx = netdev_priv(dev); + struct bpf_prog *xdp_prog; + + switch (xdp->command) { + case XDP_SETUP_PROG: + return efx_xdp_setup_prog(efx, xdp->prog); + case XDP_QUERY_PROG: + xdp_prog = rtnl_dereference(efx->xdp_prog); + xdp->prog_id = xdp_prog ? xdp_prog->aux->id : 0; + return 0; + default: + return -EINVAL; + } +} + +static int efx_xdp_xmit(struct net_device *dev, int n, struct xdp_frame **xdpfs, + u32 flags) +{ + struct efx_nic *efx = netdev_priv(dev); + + if (!netif_running(dev)) + return -EINVAL; + + return efx_xdp_tx_buffers(efx, n, xdpfs, flags & XDP_XMIT_FLUSH); +} + static void efx_update_name(struct efx_nic *efx) { strcpy(efx->name, efx->net_dev->name); diff --git a/drivers/net/ethernet/sfc/efx.h b/drivers/net/ethernet/sfc/efx.h index 04fed7c06618..45c7ae4114ec 100644 --- a/drivers/net/ethernet/sfc/efx.h +++ b/drivers/net/ethernet/sfc/efx.h @@ -322,4 +322,7 @@ static inline bool efx_rwsem_assert_write_locked(struct rw_semaphore *sem) return true; } +int efx_xdp_tx_buffers(struct efx_nic *efx, int n, struct xdp_frame **xdpfs, + bool flush); + #endif /* EFX_EFX_H */ diff --git a/drivers/net/ethernet/sfc/ethtool.c b/drivers/net/ethernet/sfc/ethtool.c index 86b965875540..8db593fb9699 100644 --- a/drivers/net/ethernet/sfc/ethtool.c +++ b/drivers/net/ethernet/sfc/ethtool.c @@ -83,6 +83,10 @@ static const struct efx_sw_stat_desc efx_sw_stat_desc[] = { EFX_ETHTOOL_UINT_CHANNEL_STAT(rx_frm_trunc), EFX_ETHTOOL_UINT_CHANNEL_STAT(rx_merge_events), EFX_ETHTOOL_UINT_CHANNEL_STAT(rx_merge_packets), + EFX_ETHTOOL_UINT_CHANNEL_STAT(rx_xdp_drops), + EFX_ETHTOOL_UINT_CHANNEL_STAT(rx_xdp_bad_drops), + EFX_ETHTOOL_UINT_CHANNEL_STAT(rx_xdp_tx), + EFX_ETHTOOL_UINT_CHANNEL_STAT(rx_xdp_redirect), }; #define EFX_ETHTOOL_SW_STAT_COUNT ARRAY_SIZE(efx_sw_stat_desc) @@ -399,6 +403,19 @@ static size_t efx_describe_per_queue_stats(struct efx_nic *efx, u8 *strings) } } } + if 
(efx->xdp_tx_queue_count && efx->xdp_tx_queues) { + unsigned short xdp; + + for (xdp = 0; xdp < efx->xdp_tx_queue_count; xdp++) { + n_stats++; + if (strings) { + snprintf(strings, ETH_GSTRING_LEN, + "tx-xdp-cpu-%hu.tx_packets", xdp); + strings += ETH_GSTRING_LEN; + } + } + } + return n_stats; } @@ -509,6 +526,14 @@ static void efx_ethtool_get_stats(struct net_device *net_dev, data++; } } + if (efx->xdp_tx_queue_count && efx->xdp_tx_queues) { + int xdp; + + for (xdp = 0; xdp < efx->xdp_tx_queue_count; xdp++) { + data[0] = efx->xdp_tx_queues[xdp]->tx_packets; + data++; + } + } efx_ptp_update_stats(efx, data); } diff --git a/drivers/net/ethernet/sfc/net_driver.h b/drivers/net/ethernet/sfc/net_driver.h index 284a1b047ac2..04e49eac7327 100644 --- a/drivers/net/ethernet/sfc/net_driver.h +++ b/drivers/net/ethernet/sfc/net_driver.h @@ -27,6 +27,7 @@ #include <linux/i2c.h> #include <linux/mtd/mtd.h> #include <net/busy_poll.h> +#include <net/xdp.h> #include "enum.h" #include "bitfield.h" @@ -136,7 +137,8 @@ struct efx_special_buffer { * struct efx_tx_buffer - buffer state for a TX descriptor * @skb: When @flags & %EFX_TX_BUF_SKB, the associated socket buffer to be * freed when descriptor completes - * @option: When @flags & %EFX_TX_BUF_OPTION, a NIC-specific option descriptor. + * @xdpf: When @flags & %EFX_TX_BUF_XDP, the XDP frame information; its @data + * member is the associated buffer to drop a page reference on. * @dma_addr: DMA address of the fragment. * @flags: Flags for allocation and DMA mapping type * @len: Length of this fragment. @@ -146,7 +148,10 @@ struct efx_special_buffer { * Only valid if @unmap_len != 0. 
*/ struct efx_tx_buffer { - const struct sk_buff *skb; + union { + const struct sk_buff *skb; + struct xdp_frame *xdpf; + }; union { efx_qword_t option; dma_addr_t dma_addr; @@ -160,6 +165,7 @@ struct efx_tx_buffer { #define EFX_TX_BUF_SKB 2 /* buffer is last part of skb */ #define EFX_TX_BUF_MAP_SINGLE 8 /* buffer was mapped with dma_map_single() */ #define EFX_TX_BUF_OPTION 0x10 /* empty buffer for option descriptor */ +#define EFX_TX_BUF_XDP 0x20 /* buffer was sent with XDP */ /** * struct efx_tx_queue - An Efx TX queue @@ -189,6 +195,7 @@ struct efx_tx_buffer { * @piobuf_offset: Buffer offset to be specified in PIO descriptors * @initialised: Has hardware queue been initialised? * @timestamping: Is timestamping enabled for this channel? + * @xdp_tx: Is this an XDP tx queue? * @handle_tso: TSO xmit preparation handler. Sets up the TSO metadata and * may also map tx data, depending on the nature of the TSO implementation. * @read_count: Current read pointer. @@ -250,6 +257,7 @@ struct efx_tx_queue { unsigned int piobuf_offset; bool initialised; bool timestamping; + bool xdp_tx; /* Function pointers used in the fast path. */ int (*handle_tso)(struct efx_tx_queue*, struct sk_buff*, bool *); @@ -363,6 +371,8 @@ struct efx_rx_page_state { * refill was triggered. * @recycle_count: RX buffer recycle counter. * @slow_fill: Timer used to defer efx_nic_generate_fill_event(). + * @xdp_rxq_info: XDP specific RX queue information. + * @xdp_rxq_info_valid: Is xdp_rxq_info valid data?. 
*/ struct efx_rx_queue { struct efx_nic *efx; @@ -394,6 +404,8 @@ struct efx_rx_queue { unsigned int slow_fill_count; /* Statistics to supplement MAC stats */ unsigned long rx_packets; + struct xdp_rxq_info xdp_rxq_info; + bool xdp_rxq_info_valid; }; enum efx_sync_events_state { @@ -441,6 +453,10 @@ enum efx_sync_events_state { * lack of descriptors * @n_rx_merge_events: Number of RX merged completion events * @n_rx_merge_packets: Number of RX packets completed by merged events + * @n_rx_xdp_drops: Count of RX packets intentionally dropped due to XDP + * @n_rx_xdp_bad_drops: Count of RX packets dropped due to XDP errors + * @n_rx_xdp_tx: Count of RX packets retransmitted due to XDP + * @n_rx_xdp_redirect: Count of RX packets redirected to a different NIC by XDP * @rx_pkt_n_frags: Number of fragments in next packet to be delivered by * __efx_rx_packet(), or zero if there is none * @rx_pkt_index: Ring index of first buffer for next packet to be delivered @@ -494,6 +510,10 @@ struct efx_channel { unsigned int n_rx_nodesc_trunc; unsigned int n_rx_merge_events; unsigned int n_rx_merge_packets; + unsigned int n_rx_xdp_drops; + unsigned int n_rx_xdp_bad_drops; + unsigned int n_rx_xdp_tx; + unsigned int n_rx_xdp_redirect; unsigned int rx_pkt_n_frags; unsigned int rx_pkt_index; @@ -818,6 +838,8 @@ struct efx_async_filter_insertion { * @msi_context: Context for each MSI * @extra_channel_types: Types of extra (non-traffic) channels that * should be allocated for this NIC + * @xdp_tx_queue_count: Number of entries in %xdp_tx_queues. + * @xdp_tx_queues: Array of pointers to tx queues used for XDP transmit. * @rxq_entries: Size of receive queues requested by user. * @txq_entries: Size of transmit queues requested by user. * @txq_stop_thresh: TX queue fill level at or above which we stop it. 
@@ -830,6 +852,9 @@ struct efx_async_filter_insertion { * @n_rx_channels: Number of channels used for RX (= number of RX queues) * @n_tx_channels: Number of channels used for TX * @n_extra_tx_channels: Number of extra channels with TX queues + * @n_xdp_channels: Number of channels used for XDP TX + * @xdp_channel_offset: Offset of zeroth channel used for XDP TX. + * @xdp_tx_per_channel: Max number of TX queues on an XDP TX channel. * @rx_ip_align: RX DMA address offset to have IP header aligned in * in accordance with NET_IP_ALIGN * @rx_dma_len: Current maximum RX DMA length @@ -894,6 +919,7 @@ struct efx_async_filter_insertion { * @loopback_mode: Loopback status * @loopback_modes: Supported loopback mode bitmask * @loopback_selftest: Offline self-test private state + * @xdp_prog: Current XDP programme for this interface * @filter_sem: Filter table rw_semaphore, protects existence of @filter_state * @filter_state: Architecture-dependent filter table state * @rps_mutex: Protects RPS state of all channels @@ -919,6 +945,8 @@ struct efx_async_filter_insertion { * @ptp_data: PTP state data * @ptp_warned: has this NIC seen and warned about unexpected PTP events? * @vpd_sn: Serial number read from VPD + * @xdp_rxq_info_failed: Have any of the rx queues failed to initialise their + * xdp_rxq_info structures? * @monitor_work: Hardware monitor workitem * @biu_lock: BIU (bus interface unit) lock * @last_irq_cpu: Last CPU to handle a possible test interrupt. 
This @@ -966,6 +994,9 @@ struct efx_nic { const struct efx_channel_type * extra_channel_type[EFX_MAX_EXTRA_CHANNELS]; + unsigned int xdp_tx_queue_count; + struct efx_tx_queue **xdp_tx_queues; + unsigned rxq_entries; unsigned txq_entries; unsigned int txq_stop_thresh; @@ -984,6 +1015,9 @@ struct efx_nic { unsigned tx_channel_offset; unsigned n_tx_channels; unsigned n_extra_tx_channels; + unsigned int n_xdp_channels; + unsigned int xdp_channel_offset; + unsigned int xdp_tx_per_channel; unsigned int rx_ip_align; unsigned int rx_dma_len; unsigned int rx_buffer_order; @@ -1053,6 +1087,10 @@ struct efx_nic { u64 loopback_modes; void *loopback_selftest; + /* We access loopback_selftest immediately before running XDP, + * so we want them next to each other. + */ + struct bpf_prog __rcu *xdp_prog; struct rw_semaphore filter_sem; void *filter_state; @@ -1082,6 +1120,7 @@ struct efx_nic { bool ptp_warned; char *vpd_sn; + bool xdp_rxq_info_failed; /* The following fields may be written more often */ @@ -1473,10 +1512,24 @@ efx_get_tx_queue(struct efx_nic *efx, unsigned index, unsigned type) return &efx->channel[efx->tx_channel_offset + index]->tx_queue[type]; } +static inline struct efx_channel * +efx_get_xdp_channel(struct efx_nic *efx, unsigned int index) +{ + EFX_WARN_ON_ONCE_PARANOID(index >= efx->n_xdp_channels); + return efx->channel[efx->xdp_channel_offset + index]; +} + +static inline bool efx_channel_is_xdp_tx(struct efx_channel *channel) +{ + return channel->channel - channel->efx->xdp_channel_offset < + channel->efx->n_xdp_channels; +} + static inline bool efx_channel_has_tx_queues(struct efx_channel *channel) { - return channel->type && channel->type->want_txqs && - channel->type->want_txqs(channel); + return efx_channel_is_xdp_tx(channel) || + (channel->type && channel->type->want_txqs && + channel->type->want_txqs(channel)); } static inline struct efx_tx_queue * @@ -1500,7 +1553,8 @@ static inline bool efx_tx_queue_used(struct efx_tx_queue *tx_queue) else \ for 
(_tx_queue = (_channel)->tx_queue; \ _tx_queue < (_channel)->tx_queue + EFX_TXQ_TYPES && \ - efx_tx_queue_used(_tx_queue); \ + (efx_tx_queue_used(_tx_queue) || \ + efx_channel_is_xdp_tx(_channel)); \ _tx_queue++) /* Iterate over all possible TX queues belonging to a channel */ diff --git a/drivers/net/ethernet/sfc/rx.c b/drivers/net/ethernet/sfc/rx.c index 85ec07f5a674..a7d9841105d8 100644 --- a/drivers/net/ethernet/sfc/rx.c +++ b/drivers/net/ethernet/sfc/rx.c @@ -17,6 +17,8 @@ #include <linux/iommu.h> #include <net/ip.h> #include <net/checksum.h> +#include <net/xdp.h> +#include <linux/bpf_trace.h> #include "net_driver.h" #include "efx.h" #include "filter.h" @@ -27,6 +29,9 @@ /* Preferred number of descriptors to fill at once */ #define EFX_RX_PREFERRED_BATCH 8U +/* Maximum rx prefix used by any architecture. */ +#define EFX_MAX_RX_PREFIX_SIZE 16 + /* Number of RX buffers to recycle pages for. When creating the RX page recycle * ring, this number is divided by the number of buffers per page to calculate * the number of pages to store in the RX page recycle ring. @@ -95,7 +100,7 @@ void efx_rx_config_page_split(struct efx_nic *efx) EFX_RX_BUF_ALIGNMENT); efx->rx_bufs_per_page = efx->rx_buffer_order ? 
1 : ((PAGE_SIZE - sizeof(struct efx_rx_page_state)) / - efx->rx_page_buf_step); + (efx->rx_page_buf_step + XDP_PACKET_HEADROOM)); efx->rx_buffer_truesize = (PAGE_SIZE << efx->rx_buffer_order) / efx->rx_bufs_per_page; efx->rx_pages_per_batch = DIV_ROUND_UP(EFX_RX_PREFERRED_BATCH, @@ -185,6 +190,9 @@ static int efx_init_rx_buffers(struct efx_rx_queue *rx_queue, bool atomic) page_offset = sizeof(struct efx_rx_page_state); do { + page_offset += XDP_PACKET_HEADROOM; + dma_addr += XDP_PACKET_HEADROOM; + index = rx_queue->added_count & rx_queue->ptr_mask; rx_buf = efx_rx_buffer(rx_queue, index); rx_buf->dma_addr = dma_addr + efx->rx_ip_align; @@ -635,6 +643,123 @@ static void efx_rx_deliver(struct efx_channel *channel, u8 *eh, netif_receive_skb(skb); } +/** efx_do_xdp: perform XDP processing on a received packet + * + * Returns true if packet should still be delivered. + */ +static bool efx_do_xdp(struct efx_nic *efx, struct efx_channel *channel, + struct efx_rx_buffer *rx_buf, u8 **ehp) +{ + u8 rx_prefix[EFX_MAX_RX_PREFIX_SIZE]; + struct efx_rx_queue *rx_queue; + struct bpf_prog *xdp_prog; + struct xdp_frame *xdpf; + struct xdp_buff xdp; + u32 xdp_act; + s16 offset; + int err; + + rcu_read_lock(); + xdp_prog = rcu_dereference(efx->xdp_prog); + if (!xdp_prog) { + rcu_read_unlock(); + return true; + } + + rx_queue = efx_channel_get_rx_queue(channel); + + if (unlikely(channel->rx_pkt_n_frags > 1)) { + /* We can't do XDP on fragmented packets - drop. */ + rcu_read_unlock(); + efx_free_rx_buffers(rx_queue, rx_buf, + channel->rx_pkt_n_frags); + if (net_ratelimit()) + netif_err(efx, rx_err, efx->net_dev, + "XDP is not possible with multiple receive fragments (%d)\n", + channel->rx_pkt_n_frags); + channel->n_rx_xdp_bad_drops++; + return false; + } + + dma_sync_single_for_cpu(&efx->pci_dev->dev, rx_buf->dma_addr, + rx_buf->len, DMA_FROM_DEVICE); + + /* Save the rx prefix. 
*/ + EFX_WARN_ON_PARANOID(efx->rx_prefix_size > EFX_MAX_RX_PREFIX_SIZE); + memcpy(rx_prefix, *ehp - efx->rx_prefix_size, + efx->rx_prefix_size); + + xdp.data = *ehp; + xdp.data_hard_start = xdp.data - XDP_PACKET_HEADROOM; + + /* No support yet for XDP metadata */ + xdp_set_data_meta_invalid(&xdp); + xdp.data_end = xdp.data + rx_buf->len; + xdp.rxq = &rx_queue->xdp_rxq_info; + + xdp_act = bpf_prog_run_xdp(xdp_prog, &xdp); + rcu_read_unlock(); + + offset = (u8 *)xdp.data - *ehp; + + switch (xdp_act) { + case XDP_PASS: + /* Fix up rx prefix. */ + if (offset) { + *ehp += offset; + rx_buf->page_offset += offset; + rx_buf->len -= offset; + memcpy(*ehp - efx->rx_prefix_size, rx_prefix, + efx->rx_prefix_size); + } + break; + + case XDP_TX: + /* Buffer ownership passes to tx on success. */ + xdpf = convert_to_xdp_frame(&xdp); + err = efx_xdp_tx_buffers(efx, 1, &xdpf, true); + if (unlikely(err != 1)) { + efx_free_rx_buffers(rx_queue, rx_buf, 1); + if (net_ratelimit()) + netif_err(efx, rx_err, efx->net_dev, + "XDP TX failed (%d)\n", err); + channel->n_rx_xdp_bad_drops++; + } else { + channel->n_rx_xdp_tx++; + } + break; + + case XDP_REDIRECT: + err = xdp_do_redirect(efx->net_dev, &xdp, xdp_prog); + if (unlikely(err)) { + efx_free_rx_buffers(rx_queue, rx_buf, 1); + if (net_ratelimit()) + netif_err(efx, rx_err, efx->net_dev, + "XDP redirect failed (%d)\n", err); + channel->n_rx_xdp_bad_drops++; + } else { + channel->n_rx_xdp_redirect++; + } + break; + + default: + bpf_warn_invalid_xdp_action(xdp_act); + efx_free_rx_buffers(rx_queue, rx_buf, 1); + channel->n_rx_xdp_bad_drops++; + break; + + case XDP_ABORTED: + trace_xdp_exception(efx->net_dev, xdp_prog, xdp_act); + /* Fall through */ + case XDP_DROP: + efx_free_rx_buffers(rx_queue, rx_buf, 1); + channel->n_rx_xdp_drops++; + break; + } + + return xdp_act == XDP_PASS; +} + /* Handle a received packet. Second half: Touches packet payload. 
*/ void __efx_rx_packet(struct efx_channel *channel) { @@ -663,6 +788,9 @@ void __efx_rx_packet(struct efx_channel *channel) goto out; } + if (!efx_do_xdp(efx, channel, rx_buf, &eh)) + goto out; + if (unlikely(!(efx->net_dev->features & NETIF_F_RXCSUM))) rx_buf->flags &= ~EFX_RX_PKT_CSUMMED; @@ -731,6 +859,7 @@ void efx_init_rx_queue(struct efx_rx_queue *rx_queue) { struct efx_nic *efx = rx_queue->efx; unsigned int max_fill, trigger, max_trigger; + int rc = 0; netif_dbg(rx_queue->efx, drv, rx_queue->efx->net_dev, "initialising RX queue %d\n", efx_rx_queue_index(rx_queue)); @@ -764,6 +893,19 @@ void efx_init_rx_queue(struct efx_rx_queue *rx_queue) rx_queue->fast_fill_trigger = trigger; rx_queue->refill_enabled = true; + /* Initialise XDP queue information */ + rc = xdp_rxq_info_reg(&rx_queue->xdp_rxq_info, efx->net_dev, + rx_queue->core_index); + + if (rc) { + netif_err(efx, rx_err, efx->net_dev, + "Failure to initialise XDP queue information rc=%d\n", + rc); + efx->xdp_rxq_info_failed = true; + } else { + rx_queue->xdp_rxq_info_valid = true; + } + /* Set up RX descriptor ring */ efx_nic_init_rx(rx_queue); } @@ -805,6 +947,11 @@ void efx_fini_rx_queue(struct efx_rx_queue *rx_queue) } kfree(rx_queue->page_ring); rx_queue->page_ring = NULL; + + if (rx_queue->xdp_rxq_info_valid) + xdp_rxq_info_unreg(&rx_queue->xdp_rxq_info); + + rx_queue->xdp_rxq_info_valid = false; } void efx_remove_rx_queue(struct efx_rx_queue *rx_queue) diff --git a/drivers/net/ethernet/sfc/tx.c b/drivers/net/ethernet/sfc/tx.c index 65e81ec1b314..00c1c4402451 100644 --- a/drivers/net/ethernet/sfc/tx.c +++ b/drivers/net/ethernet/sfc/tx.c @@ -95,6 +95,8 @@ static void efx_dequeue_buffer(struct efx_tx_queue *tx_queue, netif_vdbg(tx_queue->efx, tx_done, tx_queue->efx->net_dev, "TX queue %d transmission id %x complete\n", tx_queue->queue, tx_queue->read_count); + } else if (buffer->flags & EFX_TX_BUF_XDP) { + xdp_return_frame_rx_napi(buffer->xdpf); } buffer->len = 0; @@ -597,6 +599,94 @@ err: return 
NETDEV_TX_OK; } +static void efx_xdp_return_frames(int n, struct xdp_frame **xdpfs) +{ + int i; + + for (i = 0; i < n; i++) + xdp_return_frame_rx_napi(xdpfs[i]); +} + +/* Transmit a packet from an XDP buffer + * + * Returns number of packets sent on success, error code otherwise. + * Runs in NAPI context, either in our poll (for XDP TX) or a different NIC + * (for XDP redirect). + */ +int efx_xdp_tx_buffers(struct efx_nic *efx, int n, struct xdp_frame **xdpfs, + bool flush) +{ + struct efx_tx_buffer *tx_buffer; + struct efx_tx_queue *tx_queue; + struct xdp_frame *xdpf; + dma_addr_t dma_addr; + unsigned int len; + int space; + int cpu; + int i; + + cpu = raw_smp_processor_id(); + + if (!efx->xdp_tx_queue_count || + unlikely(cpu >= efx->xdp_tx_queue_count)) + return -EINVAL; + + tx_queue = efx->xdp_tx_queues[cpu]; + if (unlikely(!tx_queue)) + return -EINVAL; + + if (unlikely(n && !xdpfs)) + return -EINVAL; + + if (!n) + return 0; + + /* Check for available space. We should never need multiple + * descriptors per frame. + */ + space = efx->txq_entries + + tx_queue->read_count - tx_queue->insert_count; + + for (i = 0; i < n; i++) { + xdpf = xdpfs[i]; + + if (i >= space) + break; + + /* We'll want a descriptor for this tx. */ + prefetchw(__efx_tx_queue_get_insert_buffer(tx_queue)); + + len = xdpf->len; + + /* Map for DMA. */ + dma_addr = dma_map_single(&efx->pci_dev->dev, + xdpf->data, len, + DMA_TO_DEVICE); + if (dma_mapping_error(&efx->pci_dev->dev, dma_addr)) + break; + + /* Create descriptor and set up for unmapping DMA. */ + tx_buffer = efx_tx_map_chunk(tx_queue, dma_addr, len); + tx_buffer->xdpf = xdpf; + tx_buffer->flags = EFX_TX_BUF_XDP | + EFX_TX_BUF_MAP_SINGLE; + tx_buffer->dma_offset = 0; + tx_buffer->unmap_len = len; + tx_queue->tx_packets++; + } + + /* Pass mapped frames to hardware. 
*/ + if (flush && i > 0) + efx_nic_push_buffers(tx_queue); + + if (i == 0) + return -EIO; + + efx_xdp_return_frames(n - i, xdpfs + i); + + return i; +} + /* Remove packets from the TX queue * * This removes packets from the TX queue, up to and including the @@ -857,6 +947,8 @@ void efx_init_tx_queue(struct efx_tx_queue *tx_queue) tx_queue->completed_timestamp_major = 0; tx_queue->completed_timestamp_minor = 0; + tx_queue->xdp_tx = efx_channel_is_xdp_tx(tx_queue->channel); + /* Set up default function pointers. These may get replaced by * efx_nic_init_tx() based off NIC/queue capabilities. */ |