mirror of
https://github.com/NVIDIA/open-gpu-kernel-modules.git
synced 2026-03-12 00:29:56 +00:00
525.85.05
This commit is contained in:
@@ -6611,12 +6611,12 @@ _nvswitch_service_nvlw_fatal_ls10
|
||||
status[5] = _nvswitch_service_nvlipt_link_fatal_ls10(device, instance);
|
||||
|
||||
|
||||
if (status[0] != NVL_SUCCESS &&
|
||||
status[1] != NVL_SUCCESS &&
|
||||
status[2] != NVL_SUCCESS &&
|
||||
status[3] != NVL_SUCCESS &&
|
||||
status[4] != NVL_SUCCESS &&
|
||||
status[5] != NVL_SUCCESS)
|
||||
if (status[0] != NVL_SUCCESS && status[0] != -NVL_NOT_FOUND &&
|
||||
status[1] != NVL_SUCCESS && status[1] != -NVL_NOT_FOUND &&
|
||||
status[2] != NVL_SUCCESS && status[2] != -NVL_NOT_FOUND &&
|
||||
status[3] != NVL_SUCCESS && status[3] != -NVL_NOT_FOUND &&
|
||||
status[4] != NVL_SUCCESS && status[4] != -NVL_NOT_FOUND &&
|
||||
status[5] != NVL_SUCCESS && status[5] != -NVL_NOT_FOUND)
|
||||
{
|
||||
return -NVL_MORE_PROCESSING_REQUIRED;
|
||||
}
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
/*
|
||||
* SPDX-FileCopyrightText: Copyright (c) 2020-2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
||||
* SPDX-FileCopyrightText: Copyright (c) 2020-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
||||
* SPDX-License-Identifier: MIT
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a
|
||||
@@ -496,6 +496,34 @@ nvswitch_corelib_get_rx_detect_ls10
|
||||
return NVL_SUCCESS;
|
||||
}
|
||||
|
||||
static NvBool
|
||||
_nvswitch_is_tlc_in_reset
|
||||
(
|
||||
nvswitch_device *device,
|
||||
nvlink_link *link
|
||||
)
|
||||
{
|
||||
NvU32 clkStatus;
|
||||
|
||||
clkStatus = NVSWITCH_LINK_RD32_LS10(device, link->linkNumber,
|
||||
NVLIPT_LNK, _NVLIPT_LNK, _CTRL_CLK_CTRL);
|
||||
|
||||
//
|
||||
// TLC is in reset if any of the per-link clocks are off
|
||||
// -- if TX and RX clocks are off then link is not powered on
|
||||
// -- if TX/RX clocks are on but NCISOC clock is off, DL layer
|
||||
// is on but TLC is still off
|
||||
//
|
||||
if (FLD_TEST_DRF(_NVLIPT_LNK, _CTRL_CLK_CTRL, _RXCLK_STS, _OFF, clkStatus) ||
|
||||
FLD_TEST_DRF(_NVLIPT_LNK, _CTRL_CLK_CTRL, _TXCLK_STS, _OFF, clkStatus) ||
|
||||
FLD_TEST_DRF(_NVLIPT_LNK, _CTRL_CLK_CTRL, _NCISOCCLK_STS, _OFF, clkStatus))
|
||||
{
|
||||
return NV_TRUE;
|
||||
}
|
||||
|
||||
return NV_FALSE;
|
||||
}
|
||||
|
||||
void
|
||||
nvswitch_reset_persistent_link_hw_state_ls10
|
||||
(
|
||||
@@ -509,14 +537,21 @@ nvswitch_reset_persistent_link_hw_state_ls10
|
||||
return;
|
||||
}
|
||||
|
||||
// SETUPTC called with HW Reset
|
||||
(void)nvswitch_minion_send_command(device, linkNumber, NV_MINION_NVLINK_DL_CMD_COMMAND_SETUPTC , 0x4);
|
||||
|
||||
// clear TLC TP Counters
|
||||
(void)nvswitch_minion_send_command(device, linkNumber, NV_MINION_NVLINK_DL_CMD_COMMAND_CLR_TLC_MISC_REGS, 0);
|
||||
|
||||
// clear DL error counters
|
||||
(void)nvswitch_minion_send_command(device, linkNumber, NV_MINION_NVLINK_DL_CMD_COMMAND_DLSTAT_CLR_DLERRCNT, 0);
|
||||
|
||||
// If TLC is not up then return
|
||||
if (_nvswitch_is_tlc_in_reset(device, link))
|
||||
{
|
||||
return;
|
||||
}
|
||||
|
||||
// SETUPTC called to reset and setup throughput counters
|
||||
(void)nvswitch_minion_send_command(device, linkNumber, NV_MINION_NVLINK_DL_CMD_COMMAND_SETUPTC , 0x4);
|
||||
|
||||
// clear miscellaneous TLC counters and registers
|
||||
(void)nvswitch_minion_send_command(device, linkNumber, NV_MINION_NVLINK_DL_CMD_COMMAND_CLR_TLC_MISC_REGS, 0);
|
||||
|
||||
}
|
||||
|
||||
NvlStatus
|
||||
@@ -1469,3 +1504,82 @@ nvswitch_execute_unilateral_link_shutdown_ls10
|
||||
return;
|
||||
}
|
||||
|
||||
NvlStatus
|
||||
nvswitch_reset_and_train_link_ls10
|
||||
(
|
||||
nvswitch_device *device,
|
||||
nvlink_link *link
|
||||
)
|
||||
{
|
||||
NvlStatus status = NVL_SUCCESS;
|
||||
NvU32 retry_count = 3;
|
||||
NvU32 link_state_request;
|
||||
NvU32 link_state;
|
||||
NvU32 stat_data;
|
||||
NvU32 link_intr_subcode;
|
||||
|
||||
nvswitch_execute_unilateral_link_shutdown_ls10(link);
|
||||
nvswitch_corelib_clear_link_state_ls10(link);
|
||||
|
||||
do
|
||||
{
|
||||
status = nvswitch_request_tl_link_state_ls10(link,
|
||||
NV_NVLIPT_LNK_CTRL_LINK_STATE_REQUEST_REQUEST_RESET, NV_TRUE);
|
||||
|
||||
if (status == NVL_SUCCESS)
|
||||
{
|
||||
break;
|
||||
}
|
||||
else
|
||||
{
|
||||
|
||||
link_state_request = NVSWITCH_LINK_RD32_LS10(device, link->linkNumber,
|
||||
NVLIPT_LNK , _NVLIPT_LNK , _CTRL_LINK_STATE_REQUEST);
|
||||
|
||||
link_state = DRF_VAL(_NVLIPT_LNK, _CTRL_LINK_STATE_REQUEST, _STATUS,
|
||||
link_state_request);
|
||||
|
||||
if (nvswitch_minion_get_dl_status(device, link->linkNumber,
|
||||
NV_NVLSTAT_MN00, 0, &stat_data) == NVL_SUCCESS)
|
||||
{
|
||||
link_intr_subcode = DRF_VAL(_NVLSTAT, _MN00, _LINK_INTR_SUBCODE, stat_data);
|
||||
}
|
||||
|
||||
if ((link_state == NV_NVLIPT_LNK_CTRL_LINK_STATE_REQUEST_STATUS_MINION_REQUEST_FAIL) &&
|
||||
(link_intr_subcode == MINION_ALARM_BUSY))
|
||||
{
|
||||
|
||||
status = nvswitch_request_tl_link_state_ls10(link,
|
||||
NV_NVLIPT_LNK_CTRL_LINK_STATE_REQUEST_REQUEST_RESET, NV_TRUE);
|
||||
|
||||
//
|
||||
// We retry the shutdown sequence 3 times when we see a MINION_REQUEST_FAIL
|
||||
// or MINION_ALARM_BUSY
|
||||
//
|
||||
retry_count--;
|
||||
}
|
||||
else
|
||||
{
|
||||
break;
|
||||
}
|
||||
}
|
||||
} while(retry_count);
|
||||
|
||||
if (status != NVL_SUCCESS)
|
||||
{
|
||||
NVSWITCH_PRINT(device, ERROR,
|
||||
"%s: NvLink Reset has failed for link %d\n",
|
||||
__FUNCTION__, link->linkNumber);
|
||||
|
||||
// Re-register links.
|
||||
status = nvlink_lib_register_link(device->nvlink_device, link);
|
||||
if (status != NVL_SUCCESS)
|
||||
{
|
||||
nvswitch_destroy_link(link);
|
||||
return status;
|
||||
}
|
||||
return status;
|
||||
}
|
||||
return NVL_SUCCESS;
|
||||
}
|
||||
|
||||
|
||||
@@ -31,7 +31,6 @@
|
||||
#include "common_nvswitch.h"
|
||||
#include "ls10/ls10.h"
|
||||
#include "ls10/soe_ls10.h"
|
||||
#include "lr10/soe_lr10.h"
|
||||
|
||||
#include "nvswitch/ls10/dev_soe_ip.h"
|
||||
#include "nvswitch/ls10/dev_soe_ip_addendum.h"
|
||||
@@ -555,7 +554,7 @@ nvswitch_soe_register_event_callbacks_ls10
|
||||
device, pFlcn,
|
||||
RM_SOE_UNIT_THERM,
|
||||
NULL,
|
||||
nvswitch_therm_soe_callback_lr10,
|
||||
nvswitch_therm_soe_callback_ls10,
|
||||
NULL,
|
||||
&pSoe->thermEvtDesc);
|
||||
if (status != NV_OK)
|
||||
|
||||
@@ -28,6 +28,7 @@
|
||||
#include "ls10/therm_ls10.h"
|
||||
#include "error_nvswitch.h"
|
||||
#include "soe/soeiftherm.h"
|
||||
#include "rmflcncmdif_nvswitch.h"
|
||||
|
||||
#include "nvswitch/ls10/dev_therm.h"
|
||||
|
||||
@@ -356,3 +357,100 @@ nvswitch_monitor_thermal_alert_ls10
|
||||
return;
|
||||
}
|
||||
|
||||
/*
|
||||
* @brief Callback function to recieve thermal messages from SOE.
|
||||
*/
|
||||
void
|
||||
nvswitch_therm_soe_callback_ls10
|
||||
(
|
||||
nvswitch_device *device,
|
||||
RM_FLCN_MSG *pGenMsg,
|
||||
void *pParams,
|
||||
NvU32 seqDesc,
|
||||
NV_STATUS status
|
||||
)
|
||||
{
|
||||
RM_SOE_THERM_MSG_SLOWDOWN_STATUS slowdown_status;
|
||||
RM_SOE_THERM_MSG_SHUTDOWN_STATUS shutdown_status;
|
||||
RM_FLCN_MSG_SOE *pMsg = (RM_FLCN_MSG_SOE *)pGenMsg;
|
||||
NvU32 temperature;
|
||||
NvU32 threshold;
|
||||
|
||||
switch (pMsg->msg.soeTherm.msgType)
|
||||
{
|
||||
case RM_SOE_THERM_MSG_ID_SLOWDOWN_STATUS:
|
||||
{
|
||||
slowdown_status = pMsg->msg.soeTherm.slowdown;
|
||||
if (slowdown_status.bSlowdown)
|
||||
{
|
||||
if (slowdown_status.source.bTsense) // TSENSE_THERM_ALERT
|
||||
{
|
||||
temperature = RM_SOE_NV_TEMP_TO_CELSIUS_TRUNCED(slowdown_status.maxTemperature);
|
||||
threshold = RM_SOE_NV_TEMP_TO_CELSIUS_TRUNCED(slowdown_status.warnThreshold);
|
||||
|
||||
NVSWITCH_PRINT_SXID(device, NVSWITCH_ERR_HW_HOST_THERMAL_EVENT_START,
|
||||
"NVSWITCH Temperature %dC | TSENSE WARN Threshold %dC\n",
|
||||
temperature, threshold);
|
||||
|
||||
NVSWITCH_PRINT_SXID(device, NVSWITCH_ERR_HW_HOST_THERMAL_EVENT_START,
|
||||
"Thermal Slowdown Engaged | Temp higher than WARN Threshold\n");
|
||||
}
|
||||
|
||||
NVSWITCH_PRINT_SXID(device, NVSWITCH_ERR_HW_HOST_THERMAL_EVENT_START,
|
||||
"Thermal Slowdown Engaged | Links Thermal Mode %s\n", (slowdown_status.bLinksL1Status ? "ON" : "OFF"));
|
||||
|
||||
if (slowdown_status.source.bPmgr) // PMGR_THERM_ALERT
|
||||
{
|
||||
NVSWITCH_PRINT_SXID(device, NVSWITCH_ERR_HW_HOST_THERMAL_EVENT_START,
|
||||
"Thermal Slowdown Engaged | PMGR WARN Threshold reached\n");
|
||||
}
|
||||
}
|
||||
else // REVERT_SLOWDOWN
|
||||
{
|
||||
temperature = RM_SOE_NV_TEMP_TO_CELSIUS_TRUNCED(slowdown_status.maxTemperature);
|
||||
threshold = RM_SOE_NV_TEMP_TO_CELSIUS_TRUNCED(slowdown_status.warnThreshold);
|
||||
|
||||
NVSWITCH_PRINT_SXID(device, NVSWITCH_ERR_HW_HOST_THERMAL_EVENT_END,
|
||||
"NVSWITCH Temperature %dC | TSENSE WARN Threshold %dC\n",
|
||||
temperature, threshold);
|
||||
|
||||
NVSWITCH_PRINT_SXID(device, NVSWITCH_ERR_HW_HOST_THERMAL_EVENT_END,
|
||||
"Thermal Slowdown Disengaged | Links Thermal Mode %s\n", (slowdown_status.bLinksL1Status ? "ON" : "OFF"));
|
||||
|
||||
NVSWITCH_PRINT_SXID(device, NVSWITCH_ERR_HW_HOST_THERMAL_EVENT_END,
|
||||
"Thermal slowdown Disengaged\n");
|
||||
}
|
||||
break;
|
||||
}
|
||||
|
||||
case RM_SOE_THERM_MSG_ID_SHUTDOWN_STATUS:
|
||||
{
|
||||
shutdown_status = pMsg->msg.soeTherm.shutdown;
|
||||
if (shutdown_status.source.bTsense) // TSENSE_THERM_SHUTDOWN
|
||||
{
|
||||
temperature = RM_SOE_NV_TEMP_TO_CELSIUS_TRUNCED(shutdown_status.maxTemperature);
|
||||
threshold = RM_SOE_NV_TEMP_TO_CELSIUS_TRUNCED(shutdown_status.overtThreshold);
|
||||
|
||||
NVSWITCH_PRINT_SXID(device, NVSWITCH_ERR_HW_HOST_THERMAL_SHUTDOWN,
|
||||
"NVSWITCH Temperature %dC | OVERT Threshold %dC\n",
|
||||
temperature, threshold);
|
||||
|
||||
NVSWITCH_PRINT_SXID(device, NVSWITCH_ERR_HW_HOST_THERMAL_SHUTDOWN,
|
||||
"TSENSE OVERT Threshold reached. Shutting Down\n");
|
||||
}
|
||||
|
||||
if (shutdown_status.source.bPmgr) // PMGR_THERM_SHUTDOWN
|
||||
{
|
||||
NVSWITCH_PRINT_SXID(device, NVSWITCH_ERR_HW_HOST_THERMAL_EVENT_START,
|
||||
"PMGR OVERT Threshold reached. Shutting Down\n");
|
||||
}
|
||||
break;
|
||||
}
|
||||
default:
|
||||
{
|
||||
NVSWITCH_PRINT(device, ERROR, "%s Unknown message Id\n", __FUNCTION__);
|
||||
NVSWITCH_ASSERT(0);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
Reference in New Issue
Block a user