525.85.05

This commit is contained in:
Andy Ritger
2023-01-19 10:41:59 -08:00
parent dac2350c7f
commit 811073c51e
90 changed files with 1937 additions and 668 deletions

View File

@@ -6611,12 +6611,12 @@ _nvswitch_service_nvlw_fatal_ls10
status[5] = _nvswitch_service_nvlipt_link_fatal_ls10(device, instance);
if (status[0] != NVL_SUCCESS &&
status[1] != NVL_SUCCESS &&
status[2] != NVL_SUCCESS &&
status[3] != NVL_SUCCESS &&
status[4] != NVL_SUCCESS &&
status[5] != NVL_SUCCESS)
if (status[0] != NVL_SUCCESS && status[0] != -NVL_NOT_FOUND &&
status[1] != NVL_SUCCESS && status[1] != -NVL_NOT_FOUND &&
status[2] != NVL_SUCCESS && status[2] != -NVL_NOT_FOUND &&
status[3] != NVL_SUCCESS && status[3] != -NVL_NOT_FOUND &&
status[4] != NVL_SUCCESS && status[4] != -NVL_NOT_FOUND &&
status[5] != NVL_SUCCESS && status[5] != -NVL_NOT_FOUND)
{
return -NVL_MORE_PROCESSING_REQUIRED;
}

View File

@@ -1,5 +1,5 @@
/*
* SPDX-FileCopyrightText: Copyright (c) 2020-2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-FileCopyrightText: Copyright (c) 2020-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: MIT
*
* Permission is hereby granted, free of charge, to any person obtaining a
@@ -496,6 +496,34 @@ nvswitch_corelib_get_rx_detect_ls10
return NVL_SUCCESS;
}
/*
 * @brief Report whether the TLC of a link is still held in reset.
 *
 * Reads the link's NVLIPT_LNK CTRL_CLK_CTRL register and inspects the
 * per-link clock status fields.
 *
 * @param[in] device  nvswitch device that owns the link
 * @param[in] link    link whose clock status is queried (by linkNumber)
 *
 * @returns NV_TRUE if the RX, TX or NCISOC clock status reads _OFF,
 *          NV_FALSE when all three are on.
 */
static NvBool
_nvswitch_is_tlc_in_reset
(
    nvswitch_device *device,
    nvlink_link     *link
)
{
    NvU32  clkCtrl = NVSWITCH_LINK_RD32_LS10(device, link->linkNumber,
                         NVLIPT_LNK, _NVLIPT_LNK, _CTRL_CLK_CTRL);
    NvBool bRxClkOff;
    NvBool bTxClkOff;
    NvBool bNcisocClkOff;

    // TX and/or RX clock off: the link is not powered on.
    bRxClkOff = FLD_TEST_DRF(_NVLIPT_LNK, _CTRL_CLK_CTRL, _RXCLK_STS, _OFF, clkCtrl) ?
                    NV_TRUE : NV_FALSE;
    bTxClkOff = FLD_TEST_DRF(_NVLIPT_LNK, _CTRL_CLK_CTRL, _TXCLK_STS, _OFF, clkCtrl) ?
                    NV_TRUE : NV_FALSE;

    // TX/RX on but NCISOC clock off: DL layer is on while TLC is still off.
    bNcisocClkOff = FLD_TEST_DRF(_NVLIPT_LNK, _CTRL_CLK_CTRL, _NCISOCCLK_STS, _OFF, clkCtrl) ?
                        NV_TRUE : NV_FALSE;

    // Any one of the per-link clocks being off means TLC is in reset.
    return (bRxClkOff || bTxClkOff || bNcisocClkOff) ? NV_TRUE : NV_FALSE;
}
void
nvswitch_reset_persistent_link_hw_state_ls10
(
@@ -509,14 +537,21 @@ nvswitch_reset_persistent_link_hw_state_ls10
return;
}
// SETUPTC called with HW Reset
(void)nvswitch_minion_send_command(device, linkNumber, NV_MINION_NVLINK_DL_CMD_COMMAND_SETUPTC , 0x4);
// clear TLC TP Counters
(void)nvswitch_minion_send_command(device, linkNumber, NV_MINION_NVLINK_DL_CMD_COMMAND_CLR_TLC_MISC_REGS, 0);
// clear DL error counters
(void)nvswitch_minion_send_command(device, linkNumber, NV_MINION_NVLINK_DL_CMD_COMMAND_DLSTAT_CLR_DLERRCNT, 0);
// If TLC is not up then return
if (_nvswitch_is_tlc_in_reset(device, link))
{
return;
}
// SETUPTC called to reset and setup throughput counters
(void)nvswitch_minion_send_command(device, linkNumber, NV_MINION_NVLINK_DL_CMD_COMMAND_SETUPTC , 0x4);
// clear miscellaneous TLC counters and registers
(void)nvswitch_minion_send_command(device, linkNumber, NV_MINION_NVLINK_DL_CMD_COMMAND_CLR_TLC_MISC_REGS, 0);
}
NvlStatus
@@ -1469,3 +1504,82 @@ nvswitch_execute_unilateral_link_shutdown_ls10
return;
}
/*
 * @brief Shut a link down, clear its state and request a TL RESET.
 *
 * The link is first taken down with a unilateral shutdown and its link state
 * is cleared. A RESET is then requested through NVLIPT. If the request fails
 * with link state MINION_REQUEST_FAIL *and* MINION reports the
 * MINION_ALARM_BUSY interrupt subcode, the request is retried (bounded by
 * retry_count); any other failure stops the retries immediately.
 *
 * If the reset ultimately fails, an error is printed and the link is
 * re-registered with the NVLink core library; if that registration fails as
 * well, the link is destroyed.
 *
 * @param[in] device  nvswitch device that owns the link
 * @param[in] link    link to reset
 *
 * @returns NVL_SUCCESS if the reset request succeeded; otherwise the status
 *          returned by nvlink_lib_register_link().
 *          NOTE(review): this means NVL_SUCCESS is returned when the reset
 *          failed but re-registration succeeded -- confirm this is intended.
 */
NvlStatus
nvswitch_reset_and_train_link_ls10
(
    nvswitch_device *device,
    nvlink_link     *link
)
{
    NvlStatus status      = NVL_SUCCESS;
    NvU32     retry_count = 3;
    NvU32     link_state_request;
    NvU32     link_state;
    NvU32     stat_data;
    NvBool    bMinionBusy;

    nvswitch_execute_unilateral_link_shutdown_ls10(link);
    nvswitch_corelib_clear_link_state_ls10(link);

    do
    {
        status = nvswitch_request_tl_link_state_ls10(link,
                     NV_NVLIPT_LNK_CTRL_LINK_STATE_REQUEST_REQUEST_RESET, NV_TRUE);
        if (status == NVL_SUCCESS)
        {
            break;
        }

        // The request failed: find out why before deciding whether to retry.
        link_state_request = NVSWITCH_LINK_RD32_LS10(device, link->linkNumber,
                                 NVLIPT_LNK, _NVLIPT_LNK, _CTRL_LINK_STATE_REQUEST);
        link_state = DRF_VAL(_NVLIPT_LNK, _CTRL_LINK_STATE_REQUEST, _STATUS,
                             link_state_request);

        //
        // Only trust the interrupt subcode when the DL status read succeeded.
        // If the read fails, stat_data is not valid and the failure is treated
        // as "not busy" instead of comparing an uninitialized value.
        //
        bMinionBusy = NV_FALSE;
        if (nvswitch_minion_get_dl_status(device, link->linkNumber,
                NV_NVLSTAT_MN00, 0, &stat_data) == NVL_SUCCESS)
        {
            if (DRF_VAL(_NVLSTAT, _MN00, _LINK_INTR_SUBCODE, stat_data) == MINION_ALARM_BUSY)
            {
                bMinionBusy = NV_TRUE;
            }
        }

        if ((link_state != NV_NVLIPT_LNK_CTRL_LINK_STATE_REQUEST_STATUS_MINION_REQUEST_FAIL) ||
            !bMinionBusy)
        {
            // Not a MINION-busy failure: retrying will not help.
            break;
        }

        //
        // MINION_REQUEST_FAIL together with MINION_ALARM_BUSY: issue the
        // reset request again and consume one of the retries.
        //
        status = nvswitch_request_tl_link_state_ls10(link,
                     NV_NVLIPT_LNK_CTRL_LINK_STATE_REQUEST_REQUEST_RESET, NV_TRUE);
        retry_count--;
    } while (retry_count);

    if (status != NVL_SUCCESS)
    {
        NVSWITCH_PRINT(device, ERROR,
            "%s: NvLink Reset has failed for link %d\n",
            __FUNCTION__, link->linkNumber);

        // Re-register links.
        status = nvlink_lib_register_link(device->nvlink_device, link);
        if (status != NVL_SUCCESS)
        {
            nvswitch_destroy_link(link);
        }

        return status;
    }

    return NVL_SUCCESS;
}

View File

@@ -31,7 +31,6 @@
#include "common_nvswitch.h"
#include "ls10/ls10.h"
#include "ls10/soe_ls10.h"
#include "lr10/soe_lr10.h"
#include "nvswitch/ls10/dev_soe_ip.h"
#include "nvswitch/ls10/dev_soe_ip_addendum.h"
@@ -555,7 +554,7 @@ nvswitch_soe_register_event_callbacks_ls10
device, pFlcn,
RM_SOE_UNIT_THERM,
NULL,
nvswitch_therm_soe_callback_lr10,
nvswitch_therm_soe_callback_ls10,
NULL,
&pSoe->thermEvtDesc);
if (status != NV_OK)

View File

@@ -28,6 +28,7 @@
#include "ls10/therm_ls10.h"
#include "error_nvswitch.h"
#include "soe/soeiftherm.h"
#include "rmflcncmdif_nvswitch.h"
#include "nvswitch/ls10/dev_therm.h"
@@ -356,3 +357,100 @@ nvswitch_monitor_thermal_alert_ls10
return;
}
/*
 * @brief Callback function to receive thermal messages from SOE.
 *
 * Dispatches on the SOE thermal message type and logs the matching SXIDs:
 *  - SLOWDOWN_STATUS: slowdown engaged (TSENSE and/or PMGR source) or
 *    slowdown reverted.
 *  - SHUTDOWN_STATUS: thermal shutdown (TSENSE and/or PMGR source).
 * Any other message type is reported as an error and asserts.
 *
 * @param[in] device   nvswitch device the message belongs to
 * @param[in] pGenMsg  generic falcon message; interpreted as RM_FLCN_MSG_SOE
 * @param[in] pParams  not used by this callback
 * @param[in] seqDesc  not used by this callback
 * @param[in] status   not used by this callback
 */
void
nvswitch_therm_soe_callback_ls10
(
    nvswitch_device *device,
    RM_FLCN_MSG     *pGenMsg,
    void            *pParams,
    NvU32            seqDesc,
    NV_STATUS        status
)
{
    RM_FLCN_MSG_SOE *pSoeMsg = (RM_FLCN_MSG_SOE *)pGenMsg;
    NvU32 tempC;
    NvU32 limitC;

    switch (pSoeMsg->msg.soeTherm.msgType)
    {
        case RM_SOE_THERM_MSG_ID_SLOWDOWN_STATUS:
        {
            RM_SOE_THERM_MSG_SLOWDOWN_STATUS slowdown = pSoeMsg->msg.soeTherm.slowdown;

            if (!slowdown.bSlowdown)
            {
                // REVERT_SLOWDOWN: report the end of the thermal event.
                tempC  = RM_SOE_NV_TEMP_TO_CELSIUS_TRUNCED(slowdown.maxTemperature);
                limitC = RM_SOE_NV_TEMP_TO_CELSIUS_TRUNCED(slowdown.warnThreshold);

                NVSWITCH_PRINT_SXID(device, NVSWITCH_ERR_HW_HOST_THERMAL_EVENT_END,
                    "NVSWITCH Temperature %dC | TSENSE WARN Threshold %dC\n",
                    tempC, limitC);
                NVSWITCH_PRINT_SXID(device, NVSWITCH_ERR_HW_HOST_THERMAL_EVENT_END,
                    "Thermal Slowdown Disengaged | Links Thermal Mode %s\n", (slowdown.bLinksL1Status ? "ON" : "OFF"));
                NVSWITCH_PRINT_SXID(device, NVSWITCH_ERR_HW_HOST_THERMAL_EVENT_END,
                    "Thermal slowdown Disengaged\n");
            }
            else
            {
                // Slowdown engaged: report each source that triggered it.
                if (slowdown.source.bTsense) // TSENSE_THERM_ALERT
                {
                    tempC  = RM_SOE_NV_TEMP_TO_CELSIUS_TRUNCED(slowdown.maxTemperature);
                    limitC = RM_SOE_NV_TEMP_TO_CELSIUS_TRUNCED(slowdown.warnThreshold);

                    NVSWITCH_PRINT_SXID(device, NVSWITCH_ERR_HW_HOST_THERMAL_EVENT_START,
                        "NVSWITCH Temperature %dC | TSENSE WARN Threshold %dC\n",
                        tempC, limitC);
                    NVSWITCH_PRINT_SXID(device, NVSWITCH_ERR_HW_HOST_THERMAL_EVENT_START,
                        "Thermal Slowdown Engaged | Temp higher than WARN Threshold\n");
                }

                // Logged for every engaged slowdown, whatever the source.
                NVSWITCH_PRINT_SXID(device, NVSWITCH_ERR_HW_HOST_THERMAL_EVENT_START,
                    "Thermal Slowdown Engaged | Links Thermal Mode %s\n", (slowdown.bLinksL1Status ? "ON" : "OFF"));

                if (slowdown.source.bPmgr) // PMGR_THERM_ALERT
                {
                    NVSWITCH_PRINT_SXID(device, NVSWITCH_ERR_HW_HOST_THERMAL_EVENT_START,
                        "Thermal Slowdown Engaged | PMGR WARN Threshold reached\n");
                }
            }
            break;
        }

        case RM_SOE_THERM_MSG_ID_SHUTDOWN_STATUS:
        {
            RM_SOE_THERM_MSG_SHUTDOWN_STATUS shutdown = pSoeMsg->msg.soeTherm.shutdown;

            if (shutdown.source.bTsense) // TSENSE_THERM_SHUTDOWN
            {
                tempC  = RM_SOE_NV_TEMP_TO_CELSIUS_TRUNCED(shutdown.maxTemperature);
                limitC = RM_SOE_NV_TEMP_TO_CELSIUS_TRUNCED(shutdown.overtThreshold);

                NVSWITCH_PRINT_SXID(device, NVSWITCH_ERR_HW_HOST_THERMAL_SHUTDOWN,
                    "NVSWITCH Temperature %dC | OVERT Threshold %dC\n",
                    tempC, limitC);
                NVSWITCH_PRINT_SXID(device, NVSWITCH_ERR_HW_HOST_THERMAL_SHUTDOWN,
                    "TSENSE OVERT Threshold reached. Shutting Down\n");
            }

            if (shutdown.source.bPmgr) // PMGR_THERM_SHUTDOWN
            {
                //
                // NOTE(review): this shutdown message is logged under
                // THERMAL_EVENT_START while the TSENSE shutdown path above uses
                // THERMAL_SHUTDOWN -- confirm whether that is intended.
                //
                NVSWITCH_PRINT_SXID(device, NVSWITCH_ERR_HW_HOST_THERMAL_EVENT_START,
                    "PMGR OVERT Threshold reached. Shutting Down\n");
            }
            break;
        }

        default:
        {
            NVSWITCH_PRINT(device, ERROR, "%s Unknown message Id\n", __FUNCTION__);
            NVSWITCH_ASSERT(0);
            break;
        }
    }
}