aboutsummaryrefslogtreecommitdiff
path: root/drivers/net/ethernet/mellanox/mlx5/core/health.c
diff options
context:
space:
mode:
Diffstat (limited to 'drivers/net/ethernet/mellanox/mlx5/core/health.c')
-rw-r--r--drivers/net/ethernet/mellanox/mlx5/core/health.c224
1 files changed, 173 insertions, 51 deletions
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/health.c b/drivers/net/ethernet/mellanox/mlx5/core/health.c
index 292d76f2a904..f5deb642d0d6 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/health.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/health.c
@@ -34,6 +34,7 @@
#include <linux/module.h>
#include <linux/random.h>
#include <linux/vmalloc.h>
+#include <linux/hardirq.h>
#include <linux/mlx5/driver.h>
#include <linux/mlx5/cmd.h>
#include "mlx5_core.h"
@@ -46,39 +47,113 @@ enum {
enum {
MLX5_HEALTH_SYNDR_FW_ERR = 0x1,
MLX5_HEALTH_SYNDR_IRISC_ERR = 0x7,
+ MLX5_HEALTH_SYNDR_HW_UNRECOVERABLE_ERR = 0x8,
MLX5_HEALTH_SYNDR_CRC_ERR = 0x9,
MLX5_HEALTH_SYNDR_FETCH_PCI_ERR = 0xa,
MLX5_HEALTH_SYNDR_HW_FTL_ERR = 0xb,
MLX5_HEALTH_SYNDR_ASYNC_EQ_OVERRUN_ERR = 0xc,
MLX5_HEALTH_SYNDR_EQ_ERR = 0xd,
+ MLX5_HEALTH_SYNDR_EQ_INV = 0xe,
MLX5_HEALTH_SYNDR_FFSER_ERR = 0xf,
+ MLX5_HEALTH_SYNDR_HIGH_TEMP = 0x10
};
-static DEFINE_SPINLOCK(health_lock);
-static LIST_HEAD(health_list);
-static struct work_struct health_work;
+enum {
+ MLX5_NIC_IFC_FULL = 0,
+ MLX5_NIC_IFC_DISABLED = 1,
+ MLX5_NIC_IFC_NO_DRAM_NIC = 2
+};
-static void health_care(struct work_struct *work)
+static u8 get_nic_interface(struct mlx5_core_dev *dev)
{
- struct mlx5_core_health *health, *n;
- struct mlx5_core_dev *dev;
- struct mlx5_priv *priv;
- LIST_HEAD(tlist);
+ return (ioread32be(&dev->iseg->cmdq_addr_l_sz) >> 8) & 3;
+}
+
+static void trigger_cmd_completions(struct mlx5_core_dev *dev)
+{
+ unsigned long flags;
+ u64 vector;
- spin_lock_irq(&health_lock);
- list_splice_init(&health_list, &tlist);
+ /* wait for pending handlers to complete */
+ synchronize_irq(dev->priv.msix_arr[MLX5_EQ_VEC_CMD].vector);
+ spin_lock_irqsave(&dev->cmd.alloc_lock, flags);
+ vector = ~dev->cmd.bitmask & ((1ul << (1 << dev->cmd.log_sz)) - 1);
+ if (!vector)
+ goto no_trig;
+
+ vector |= MLX5_TRIGGERED_CMD_COMP;
+ spin_unlock_irqrestore(&dev->cmd.alloc_lock, flags);
+
+ mlx5_core_dbg(dev, "vector 0x%llx\n", vector);
+ mlx5_cmd_comp_handler(dev, vector);
+ return;
+
+no_trig:
+ spin_unlock_irqrestore(&dev->cmd.alloc_lock, flags);
+}
+
+static int in_fatal(struct mlx5_core_dev *dev)
+{
+ struct mlx5_core_health *health = &dev->priv.health;
+ struct health_buffer __iomem *h = health->health;
- spin_unlock_irq(&health_lock);
+ if (get_nic_interface(dev) == MLX5_NIC_IFC_DISABLED)
+ return 1;
- list_for_each_entry_safe(health, n, &tlist, list) {
- priv = container_of(health, struct mlx5_priv, health);
- dev = container_of(priv, struct mlx5_core_dev, priv);
- mlx5_core_warn(dev, "handling bad device here\n");
- /* nothing yet */
- spin_lock_irq(&health_lock);
- list_del_init(&health->list);
- spin_unlock_irq(&health_lock);
+ if (ioread32be(&h->fw_ver) == 0xffffffff)
+ return 1;
+
+ return 0;
+}
+
+void mlx5_enter_error_state(struct mlx5_core_dev *dev)
+{
+ if (dev->state == MLX5_DEVICE_STATE_INTERNAL_ERROR)
+ return;
+
+ mlx5_core_err(dev, "start\n");
+ if (pci_channel_offline(dev->pdev) || in_fatal(dev))
+ dev->state = MLX5_DEVICE_STATE_INTERNAL_ERROR;
+
+ mlx5_core_event(dev, MLX5_DEV_EVENT_SYS_ERROR, 0);
+ mlx5_core_err(dev, "end\n");
+}
+
+static void mlx5_handle_bad_state(struct mlx5_core_dev *dev)
+{
+ u8 nic_interface = get_nic_interface(dev);
+
+ switch (nic_interface) {
+ case MLX5_NIC_IFC_FULL:
+ mlx5_core_warn(dev, "Expected to see disabled NIC but it is full driver\n");
+ break;
+
+ case MLX5_NIC_IFC_DISABLED:
+ mlx5_core_warn(dev, "starting teardown\n");
+ break;
+
+ case MLX5_NIC_IFC_NO_DRAM_NIC:
+ mlx5_core_warn(dev, "Expected to see disabled NIC but it is no dram nic\n");
+ break;
+ default:
+ mlx5_core_warn(dev, "Expected to see disabled NIC but it is has invalid value %d\n",
+ nic_interface);
}
+
+ mlx5_disable_device(dev);
+}
+
+static void health_care(struct work_struct *work)
+{
+ struct mlx5_core_health *health;
+ struct mlx5_core_dev *dev;
+ struct mlx5_priv *priv;
+
+ health = container_of(work, struct mlx5_core_health, work);
+ priv = container_of(health, struct mlx5_priv, health);
+ dev = container_of(priv, struct mlx5_core_dev, priv);
+ mlx5_core_warn(dev, "handling bad device here\n");
+ mlx5_handle_bad_state(dev);
}
static const char *hsynd_str(u8 synd)
@@ -88,6 +163,8 @@ static const char *hsynd_str(u8 synd)
return "firmware internal error";
case MLX5_HEALTH_SYNDR_IRISC_ERR:
return "irisc not responding";
+ case MLX5_HEALTH_SYNDR_HW_UNRECOVERABLE_ERR:
+ return "unrecoverable hardware error";
case MLX5_HEALTH_SYNDR_CRC_ERR:
return "firmware CRC error";
case MLX5_HEALTH_SYNDR_FETCH_PCI_ERR:
@@ -98,48 +175,81 @@ static const char *hsynd_str(u8 synd)
return "async EQ buffer overrun";
case MLX5_HEALTH_SYNDR_EQ_ERR:
return "EQ error";
+ case MLX5_HEALTH_SYNDR_EQ_INV:
+ return "Invalid EQ refrenced";
case MLX5_HEALTH_SYNDR_FFSER_ERR:
return "FFSER error";
+ case MLX5_HEALTH_SYNDR_HIGH_TEMP:
+ return "High temprature";
default:
return "unrecognized error";
}
}
-static u16 read_be16(__be16 __iomem *p)
+static u16 get_maj(u32 fw)
{
- return swab16(readl((__force u16 __iomem *) p));
+ return fw >> 28;
}
-static u32 read_be32(__be32 __iomem *p)
+static u16 get_min(u32 fw)
{
- return swab32(readl((__force u32 __iomem *) p));
+ return fw >> 16 & 0xfff;
+}
+
+static u16 get_sub(u32 fw)
+{
+ return fw & 0xffff;
}
static void print_health_info(struct mlx5_core_dev *dev)
{
struct mlx5_core_health *health = &dev->priv.health;
struct health_buffer __iomem *h = health->health;
+ char fw_str[18];
+ u32 fw;
int i;
+ /* If the syndrom is 0, the device is OK and no need to print buffer */
+ if (!ioread8(&h->synd))
+ return;
+
for (i = 0; i < ARRAY_SIZE(h->assert_var); i++)
- pr_info("assert_var[%d] 0x%08x\n", i, read_be32(h->assert_var + i));
+ dev_err(&dev->pdev->dev, "assert_var[%d] 0x%08x\n", i, ioread32be(h->assert_var + i));
+
+ dev_err(&dev->pdev->dev, "assert_exit_ptr 0x%08x\n", ioread32be(&h->assert_exit_ptr));
+ dev_err(&dev->pdev->dev, "assert_callra 0x%08x\n", ioread32be(&h->assert_callra));
+ fw = ioread32be(&h->fw_ver);
+ sprintf(fw_str, "%d.%d.%d", get_maj(fw), get_min(fw), get_sub(fw));
+ dev_err(&dev->pdev->dev, "fw_ver %s\n", fw_str);
+ dev_err(&dev->pdev->dev, "hw_id 0x%08x\n", ioread32be(&h->hw_id));
+ dev_err(&dev->pdev->dev, "irisc_index %d\n", ioread8(&h->irisc_index));
+ dev_err(&dev->pdev->dev, "synd 0x%x: %s\n", ioread8(&h->synd), hsynd_str(ioread8(&h->synd)));
+ dev_err(&dev->pdev->dev, "ext_synd 0x%04x\n", ioread16be(&h->ext_synd));
+}
+
+static unsigned long get_next_poll_jiffies(void)
+{
+ unsigned long next;
- pr_info("assert_exit_ptr 0x%08x\n", read_be32(&h->assert_exit_ptr));
- pr_info("assert_callra 0x%08x\n", read_be32(&h->assert_callra));
- pr_info("fw_ver 0x%08x\n", read_be32(&h->fw_ver));
- pr_info("hw_id 0x%08x\n", read_be32(&h->hw_id));
- pr_info("irisc_index %d\n", readb(&h->irisc_index));
- pr_info("synd 0x%x: %s\n", readb(&h->synd), hsynd_str(readb(&h->synd)));
- pr_info("ext_sync 0x%04x\n", read_be16(&h->ext_sync));
+ get_random_bytes(&next, sizeof(next));
+ next %= HZ;
+ next += jiffies + MLX5_HEALTH_POLL_INTERVAL;
+
+ return next;
}
static void poll_health(unsigned long data)
{
struct mlx5_core_dev *dev = (struct mlx5_core_dev *)data;
struct mlx5_core_health *health = &dev->priv.health;
- unsigned long next;
u32 count;
+ if (dev->state == MLX5_DEVICE_STATE_INTERNAL_ERROR) {
+ trigger_cmd_completions(dev);
+ mod_timer(&health->timer, get_next_poll_jiffies());
+ return;
+ }
+
count = ioread32be(health->health_counter);
if (count == health->prev)
++health->miss_counter;
@@ -148,18 +258,16 @@ static void poll_health(unsigned long data)
health->prev = count;
if (health->miss_counter == MAX_MISSES) {
- mlx5_core_err(dev, "device's health compromised\n");
+ dev_err(&dev->pdev->dev, "device's health compromised - reached miss count\n");
print_health_info(dev);
- spin_lock_irq(&health_lock);
- list_add_tail(&health->list, &health_list);
- spin_unlock_irq(&health_lock);
-
- queue_work(mlx5_core_wq, &health_work);
} else {
- get_random_bytes(&next, sizeof(next));
- next %= HZ;
- next += jiffies + MLX5_HEALTH_POLL_INTERVAL;
- mod_timer(&health->timer, next);
+ mod_timer(&health->timer, get_next_poll_jiffies());
+ }
+
+ if (in_fatal(dev) && !health->sick) {
+ health->sick = true;
+ print_health_info(dev);
+ queue_work(health->wq, &health->work);
}
}
@@ -167,7 +275,6 @@ void mlx5_start_health_poll(struct mlx5_core_dev *dev)
{
struct mlx5_core_health *health = &dev->priv.health;
- INIT_LIST_HEAD(&health->list);
init_timer(&health->timer);
health->health = &dev->iseg->health;
health->health_counter = &dev->iseg->health_counter;
@@ -183,18 +290,33 @@ void mlx5_stop_health_poll(struct mlx5_core_dev *dev)
struct mlx5_core_health *health = &dev->priv.health;
del_timer_sync(&health->timer);
-
- spin_lock_irq(&health_lock);
- if (!list_empty(&health->list))
- list_del_init(&health->list);
- spin_unlock_irq(&health_lock);
}
-void mlx5_health_cleanup(void)
+void mlx5_health_cleanup(struct mlx5_core_dev *dev)
{
+ struct mlx5_core_health *health = &dev->priv.health;
+
+ destroy_workqueue(health->wq);
}
-void __init mlx5_health_init(void)
+int mlx5_health_init(struct mlx5_core_dev *dev)
{
- INIT_WORK(&health_work, health_care);
+ struct mlx5_core_health *health;
+ char *name;
+
+ health = &dev->priv.health;
+ name = kmalloc(64, GFP_KERNEL);
+ if (!name)
+ return -ENOMEM;
+
+ strcpy(name, "mlx5_health");
+ strcat(name, dev_name(&dev->pdev->dev));
+ health->wq = create_singlethread_workqueue(name);
+ kfree(name);
+ if (!health->wq)
+ return -ENOMEM;
+
+ INIT_WORK(&health->work, health_care);
+
+ return 0;
}