diff options
-rw-r--r-- | drivers/net/ethernet/mellanox/mlx5/core/en/reporter.h | 1 | ||||
-rw-r--r-- | drivers/net/ethernet/mellanox/mlx5/core/en/reporter_tx.c | 38 | ||||
-rw-r--r-- | drivers/net/ethernet/mellanox/mlx5/core/en_main.c | 53 |
3 files changed, 55 insertions, 37 deletions
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/reporter.h b/drivers/net/ethernet/mellanox/mlx5/core/en/reporter.h index 1c90059db5f8..e78e92753d73 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en/reporter.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/en/reporter.h @@ -10,5 +10,6 @@ int mlx5e_tx_reporter_create(struct mlx5e_priv *priv); void mlx5e_tx_reporter_destroy(struct mlx5e_priv *priv); void mlx5e_tx_reporter_err_cqe(struct mlx5e_txqsq *sq); +int mlx5e_tx_reporter_timeout(struct mlx5e_txqsq *sq); #endif diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/reporter_tx.c b/drivers/net/ethernet/mellanox/mlx5/core/en/reporter_tx.c index 35568e4820de..0aebfb377cf0 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en/reporter_tx.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en/reporter_tx.c @@ -126,6 +126,44 @@ void mlx5e_tx_reporter_err_cqe(struct mlx5e_txqsq *sq) &err_ctx); } +static int mlx5e_tx_reporter_timeout_recover(struct mlx5e_txqsq *sq) +{ + struct mlx5_eq_comp *eq = sq->cq.mcq.eq; + u32 eqe_count; + int ret; + + netdev_err(sq->channel->netdev, "EQ 0x%x: Cons = 0x%x, irqn = 0x%x\n", + eq->core.eqn, eq->core.cons_index, eq->core.irqn); + + eqe_count = mlx5_eq_poll_irq_disabled(eq); + ret = eqe_count ? true : false; + if (!eqe_count) { + clear_bit(MLX5E_SQ_STATE_ENABLED, &sq->state); + return ret; + } + + netdev_err(sq->channel->netdev, "Recover %d eqes on EQ 0x%x\n", + eqe_count, eq->core.eqn); + sq->channel->stats->eq_rearm++; + return ret; +} + +int mlx5e_tx_reporter_timeout(struct mlx5e_txqsq *sq) +{ + char err_str[MLX5E_TX_REPORTER_PER_SQ_MAX_LEN]; + struct mlx5e_tx_err_ctx err_ctx; + + err_ctx.sq = sq; + err_ctx.recover = mlx5e_tx_reporter_timeout_recover; + sprintf(err_str, + "TX timeout on queue: %d, SQ: 0x%x, CQ: 0x%x, SQ Cons: 0x%x SQ Prod: 0x%x, usecs since last trans: %u\n", + sq->channel->ix, sq->sqn, sq->cq.mcq.cqn, sq->cc, sq->pc, + jiffies_to_usecs(jiffies - sq->txq->trans_start)); + + return devlink_health_report(sq->channel->priv->tx_reporter, err_str, + &err_ctx); +} + /* state lock cannot be grabbed within this function. * It can cause a dead lock or a read-after-free. */ diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c index 93e1f037b019..d81ebba7c04e 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c @@ -4116,31 +4116,13 @@ netdev_features_t mlx5e_features_check(struct sk_buff *skb, return features; } -static bool mlx5e_tx_timeout_eq_recover(struct net_device *dev, - struct mlx5e_txqsq *sq) -{ - struct mlx5_eq_comp *eq = sq->cq.mcq.eq; - u32 eqe_count; - - netdev_err(dev, "EQ 0x%x: Cons = 0x%x, irqn = 0x%x\n", - eq->core.eqn, eq->core.cons_index, eq->core.irqn); - - eqe_count = mlx5_eq_poll_irq_disabled(eq); - if (!eqe_count) - return false; - - netdev_err(dev, "Recover %d eqes on EQ 0x%x\n", eqe_count, eq->core.eqn); - sq->channel->stats->eq_rearm++; - return true; -} - static void mlx5e_tx_timeout_work(struct work_struct *work) { struct mlx5e_priv *priv = container_of(work, struct mlx5e_priv, tx_timeout_work); - struct net_device *dev = priv->netdev; - bool reopen_channels = false; - int i, err; + bool report_failed = false; + int err; + int i; rtnl_lock(); mutex_lock(&priv->state_lock); @@ -4149,31 +4131,22 @@ static void mlx5e_tx_timeout_work(struct work_struct *work) goto unlock; for (i = 0; i < priv->channels.num * priv->channels.params.num_tc; i++) { - struct netdev_queue *dev_queue = netdev_get_tx_queue(dev, i); + struct netdev_queue *dev_queue = + netdev_get_tx_queue(priv->netdev, i); struct mlx5e_txqsq *sq = priv->txq2sq[i]; if (!netif_xmit_stopped(dev_queue)) continue; - netdev_err(dev, - "TX timeout on queue: %d, SQ: 0x%x, CQ: 0x%x, SQ Cons: 0x%x SQ Prod: 0x%x, usecs since last trans: %u\n", - i, sq->sqn, sq->cq.mcq.cqn, sq->cc, sq->pc, - jiffies_to_usecs(jiffies - dev_queue->trans_start)); - - /* If we recover a lost interrupt, most likely TX timeout will - * be resolved, skip reopening channels - */ - if (!mlx5e_tx_timeout_eq_recover(dev, sq)) { - clear_bit(MLX5E_SQ_STATE_ENABLED, &sq->state); - reopen_channels = true; - } + if (mlx5e_tx_reporter_timeout(sq)) + report_failed = true; } - if (!reopen_channels) + if (!report_failed) goto unlock; - mlx5e_close_locked(dev); - err = mlx5e_open_locked(dev); + mlx5e_close_locked(priv->netdev); + err = mlx5e_open_locked(priv->netdev); if (err) netdev_err(priv->netdev, "mlx5e_open_locked failed recovering from a tx_timeout, err(%d).\n", @@ -4189,6 +4162,12 @@ static void mlx5e_tx_timeout(struct net_device *dev) struct mlx5e_priv *priv = netdev_priv(dev); netdev_err(dev, "TX timeout detected\n"); + + if (IS_ERR_OR_NULL(priv->tx_reporter)) { + netdev_err_once(priv->netdev, "tx timeout will not be handled, no valid tx reporter\n"); + return; + } + queue_work(priv->wq, &priv->tx_timeout_work); } |