accel/habanalabs: add pci health check during heartbeat
[ Upstream commit d8b9cea584661b30305cf341bf9f675dc0a25471 ] Currently, upon a heartbeat failure, we don't know whether the failure is due to a firmware hang or due to a bad PCI link. Hence, we read a PCI config space register with a known value (the vendor ID) so that we know which of the two possibilities caused the heartbeat failure. Signed-off-by: Ofir Bitton <obitton@habana.ai> Reviewed-by: Oded Gabbay <ogabbay@kernel.org> Signed-off-by: Oded Gabbay <ogabbay@kernel.org> Signed-off-by: Sasha Levin <sashal@kernel.org>
This commit is contained in:
parent
b7a34e30d4
commit
d7933b92c4
3 changed files with 16 additions and 3 deletions
|
|
@ -870,6 +870,18 @@ static void device_early_fini(struct hl_device *hdev)
|
|||
hdev->asic_funcs->early_fini(hdev);
|
||||
}
|
||||
|
||||
static bool is_pci_link_healthy(struct hl_device *hdev)
|
||||
{
|
||||
u16 vendor_id;
|
||||
|
||||
if (!hdev->pdev)
|
||||
return false;
|
||||
|
||||
pci_read_config_word(hdev->pdev, PCI_VENDOR_ID, &vendor_id);
|
||||
|
||||
return (vendor_id == PCI_VENDOR_ID_HABANALABS);
|
||||
}
|
||||
|
||||
static void hl_device_heartbeat(struct work_struct *work)
|
||||
{
|
||||
struct hl_device *hdev = container_of(work, struct hl_device,
|
||||
|
|
@ -882,7 +894,8 @@ static void hl_device_heartbeat(struct work_struct *work)
|
|||
goto reschedule;
|
||||
|
||||
if (hl_device_operational(hdev, NULL))
|
||||
dev_err(hdev->dev, "Device heartbeat failed!\n");
|
||||
dev_err(hdev->dev, "Device heartbeat failed! PCI link is %s\n",
|
||||
is_pci_link_healthy(hdev) ? "healthy" : "broken");
|
||||
|
||||
hl_device_reset(hdev, HL_DRV_RESET_HARD | HL_DRV_RESET_HEARTBEAT);
|
||||
|
||||
|
|
|
|||
|
|
@ -34,6 +34,8 @@
|
|||
struct hl_device;
|
||||
struct hl_fpriv;
|
||||
|
||||
/* PCI vendor ID of Habana Labs devices */
#define PCI_VENDOR_ID_HABANALABS 0x1da3
|
||||
|
||||
/* Use upper bits of mmap offset to store habana driver specific information.
|
||||
* bits[63:59] - Encode mmap type
|
||||
* bits[45:0] - mmap offset value
|
||||
|
|
|
|||
|
|
@ -54,8 +54,6 @@ module_param(boot_error_status_mask, ulong, 0444);
|
|||
MODULE_PARM_DESC(boot_error_status_mask,
|
||||
"Mask of the error status during device CPU boot (If bitX is cleared then error X is masked. Default all 1's)");
|
||||
|
||||
#define PCI_VENDOR_ID_HABANALABS 0x1da3
|
||||
|
||||
#define PCI_IDS_GOYA 0x0001
|
||||
#define PCI_IDS_GAUDI 0x1000
|
||||
#define PCI_IDS_GAUDI_SEC 0x1010
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue