Files
open-gpu-kernel-modules/src/common/nvswitch/kernel/error_nvswitch.c
Andy Ritger 758b4ee818 525.53
2022-11-10 08:39:33 -08:00

556 lines
16 KiB
C

/*
* SPDX-FileCopyrightText: Copyright (c) 2018-2019 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: MIT
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*/
#include "common_nvswitch.h"
#include "error_nvswitch.h"
#define NVSWITCH_DATE_LEN 64
//
// Error logging
//
//
// Print one logged error entry through NVSWITCH_PRINT_SXID_NO_BBX.
//
// Only entries whose error_src is NVSWITCH_ERROR_SRC_HW are printed; a NULL
// entry or any other source is silently ignored.
//
// device      - device handle passed through to the print macro
// error_count - accepted for the caller's convenience; not used here
// error_entry - entry to print (may be NULL)
//
static void
_nvswitch_dump_error_entry
(
    nvswitch_device *device,
    NvU32 error_count,
    NVSWITCH_ERROR_TYPE *error_entry
)
{
    NvU32 word;
    NvU32 upper_words_nonzero = 0;

    if (error_entry == NULL)
    {
        return;
    }

    if (error_entry->error_src != NVSWITCH_ERROR_SRC_HW)
    {
        return;
    }

    // Summary line: severity plus engine / sub-engine instance.
    NVSWITCH_PRINT_SXID_NO_BBX(device, error_entry->error_type,
        "Severity %d Engine instance %02d Sub-engine instance %02d\n",
        error_entry->severity, error_entry->instance, error_entry->subinstance);

    // First data line: the flags word followed by raw data words 0..7.
    NVSWITCH_PRINT_SXID_NO_BBX(device, error_entry->error_type,
        "Data {0x%08x, 0x%08x, 0x%08x, 0x%08x, 0x%08x, 0x%08x, 0x%08x, 0x%08x, 0x%08x}\n",
        error_entry->data.raw.flags,
        error_entry->data.raw.data[0], error_entry->data.raw.data[1],
        error_entry->data.raw.data[2], error_entry->data.raw.data[3],
        error_entry->data.raw.data[4], error_entry->data.raw.data[5],
        error_entry->data.raw.data[6], error_entry->data.raw.data[7]);

    // Raw data words 8..15 are only printed when at least one is non-zero.
    for (word = 8; word <= 15; word++)
    {
        if (error_entry->data.raw.data[word] != 0)
        {
            upper_words_nonzero = 1;
            break;
        }
    }

    if (upper_words_nonzero)
    {
        NVSWITCH_PRINT_SXID_NO_BBX(device, error_entry->error_type,
            "Data {0x%08x, 0x%08x, 0x%08x, 0x%08x, 0x%08x, 0x%08x, 0x%08x, 0x%08x}\n",
            error_entry->data.raw.data[ 8], error_entry->data.raw.data[ 9],
            error_entry->data.raw.data[10], error_entry->data.raw.data[11],
            error_entry->data.raw.data[12], error_entry->data.raw.data[13],
            error_entry->data.raw.data[14], error_entry->data.raw.data[15]);
    }
}
//
// Construct an error log
//
// Resets every field of *errors.  If error_log_size > 0 a zero-filled
// circular buffer of error_log_size entries is allocated to record errors;
// if error_log_size == 0 no buffer is allocated and the log stays empty.
//
// errors         - log to initialize
// error_log_size - requested capacity, in entries
// overwritable   - stored in the log; consulted by nvswitch_record_error()
//                  when the buffer is full
//
// Returns NVL_SUCCESS when the log has the requested capacity,
// -NVL_BAD_ARGS when errors is NULL, and -NVL_NO_MEM when the buffer could
// not be allocated (the log is then left valid but with zero capacity).
//
NvlStatus
nvswitch_construct_error_log
(
    NVSWITCH_ERROR_LOG_TYPE *errors,
    NvU32 error_log_size,
    NvBool overwritable
)
{
    NVSWITCH_ASSERT(errors != NULL);

    // Do not rely on the assert alone to stop a NULL log from being written.
    if (errors == NULL)
    {
        return -NVL_BAD_ARGS;
    }

    errors->error_start = 0;
    errors->error_count = 0;
    errors->error_total = 0;
    errors->error_log_size = 0;
    errors->error_log = NULL;
    errors->overwritable = overwritable;

    // A zero-sized log is a valid request: nothing to allocate.
    if (error_log_size == 0)
    {
        return NVL_SUCCESS;
    }

    errors->error_log = nvswitch_os_malloc(error_log_size * sizeof(NVSWITCH_ERROR_TYPE));
    if (errors->error_log == NULL)
    {
        // error_log_size stays 0 so later calls treat the log as absent.
        return -NVL_NO_MEM;
    }

    nvswitch_os_memset(errors->error_log, 0, error_log_size * sizeof(NVSWITCH_ERROR_TYPE));
    errors->error_log_size = error_log_size;

    return NVL_SUCCESS;
}
//
// Destroy an error log
//
// Frees the entry buffer (if any) and resets the ring-buffer bookkeeping.
// error_total is deliberately left untouched so the running count of errors
// logged survives the teardown.  A NULL errors pointer is a no-op.
//
// device - not used by this function
// errors - log to tear down (may be NULL)
//
void
nvswitch_destroy_error_log
(
    nvswitch_device *device,
    NVSWITCH_ERROR_LOG_TYPE *errors
)
{
    if (errors != NULL)
    {
        if (errors->error_log != NULL)
        {
            nvswitch_os_free(errors->error_log);
            errors->error_log = NULL;
        }

        errors->error_log_size = 0;
        errors->error_count = 0;
        errors->error_start = 0;
    }
}
//
// Record an error in the specified log
//
// If the log has a buffer, the error is appended to the ring.  When the ring
// is already full, the oldest entry is overwritten if the log is marked
// overwritable; otherwise the new error is dropped and the function returns
// without touching the error totals.  In every other case (including a log
// with no buffer) errors->error_total and device->error_total are
// incremented.
//
// device         - device handle; its error_total is read and incremented
// errors         - log to record into
// error_type     - NVSWITCH_ERR_*
// instance       - stored in the entry's instance field
// subinstance    - stored in the entry's subinstance field
// error_src      - NVSWITCH_ERROR_SRC_*
// severity       - NVSWITCH_ERROR_SEVERITY_*
// error_resolved - stored in the entry's error_resolved field
// data           - optional ancillary data blob (may be NULL)
// data_size      - size of data in bytes; anything beyond the size of the
//                  entry's data field is not copied
// line           - stored in the entry's line field
//
void
nvswitch_record_error
(
    nvswitch_device *device,
    NVSWITCH_ERROR_LOG_TYPE *errors,
    NvU32 error_type, // NVSWITCH_ERR_*
    NvU32 instance,
    NvU32 subinstance,
    NVSWITCH_ERROR_SRC_TYPE error_src, // NVSWITCH_ERROR_SRC_*
    NVSWITCH_ERROR_SEVERITY_TYPE severity, // NVSWITCH_ERROR_SEVERITY_*
    NvBool error_resolved,
    void *data,
    NvU32 data_size,
    NvU32 line
)
{
    NvU32 idx_error;

    NVSWITCH_ASSERT(errors != NULL);

    // Do not rely on the assert alone to stop a NULL log from being used.
    if (errors == NULL)
    {
        return;
    }

    // sizeof does not evaluate its operand; index 0 only names the field.
    NVSWITCH_ASSERT(data_size <= sizeof(errors->error_log[0].data));

    // If no error log has been created, don't log it.
    if ((errors->error_log_size != 0) && (errors->error_log != NULL))
    {
        // Slot just past the newest entry (wraps onto the oldest when full).
        idx_error = (errors->error_start + errors->error_count) % errors->error_log_size;

        if (errors->error_count == errors->error_log_size)
        {
            // Error ring buffer already full.
            if (errors->overwritable)
            {
                // Give up the oldest entry; its slot is reused below.
                errors->error_start = (errors->error_start + 1) % errors->error_log_size;
            }
            else
            {
                // Return: ring buffer full
                return;
            }
        }
        else
        {
            errors->error_count++;
        }

        // Log error info
        errors->error_log[idx_error].error_type = error_type;
        errors->error_log[idx_error].instance = instance;
        errors->error_log[idx_error].subinstance = subinstance;
        errors->error_log[idx_error].error_src = error_src;
        errors->error_log[idx_error].severity = severity;
        errors->error_log[idx_error].error_resolved = error_resolved;
        errors->error_log[idx_error].line = line;

        // Log tracking info
        errors->error_log[idx_error].timer_count = nvswitch_hw_counter_read_counter(device);
        errors->error_log[idx_error].time = nvswitch_os_get_platform_time();
        errors->error_log[idx_error].local_error_num = errors->error_total;
        errors->error_log[idx_error].global_error_num = device->error_total;

        // Copy ancillary data blob
        nvswitch_os_memset(&errors->error_log[idx_error].data, 0, sizeof(errors->error_log[idx_error].data));
        if ((data != NULL) && (data_size > 0))
        {
            //
            // Never copy more than the entry's data field can hold, even if
            // the assert above did not stop an oversized request.
            //
            if (data_size > sizeof(errors->error_log[idx_error].data))
            {
                data_size = (NvU32) sizeof(errors->error_log[idx_error].data);
            }

            nvswitch_os_memcpy(&errors->error_log[idx_error].data, data, data_size);
        }

        _nvswitch_dump_error_entry(device, idx_error, &errors->error_log[idx_error]);
    }

    errors->error_total++;
    device->error_total++;
}
//
// Discard N errors from the specified log
//
// Drops up to error_discard_count of the oldest entries by advancing the
// ring's start index and reducing its entry count.  Requests larger than the
// number of stored entries are clamped.
//
// A NULL log, or a log with no buffer (error_log_size == 0), is a no-op.
// Without this guard the wrap-around modulo below would divide by zero.
//
void
nvswitch_discard_errors
(
    NVSWITCH_ERROR_LOG_TYPE *errors,
    NvU32 error_discard_count
)
{
    if ((errors == NULL) || (errors->error_log_size == 0))
    {
        return;
    }

    error_discard_count = NV_MIN(error_discard_count, errors->error_count);
    errors->error_start = (errors->error_start + error_discard_count) % errors->error_log_size;
    errors->error_count -= error_discard_count;
}
//
// Retrieve an error entry by index.
//
// error_idx 0 is the oldest stored error.  An out-of-range index is not
// reported as a failure; instead *error_entry is filled with a placeholder:
// error_type 0, error_src NVSWITCH_ERROR_SRC_NONE, severity
// NVSWITCH_ERROR_SEVERITY_NONFATAL, error_resolved NV_TRUE, the log's current
// error_total as local_error_num, and (when device is non-NULL) the device's
// error_total and hardware counter value.
//
// device      - may be NULL; only consulted for the placeholder entry
// errors      - log to read from
// error_entry - receives a copy of the entry (may be NULL to skip)
// error_idx   - position relative to the oldest entry
// error_count - receives the number of entries currently in the log
//               (may be NULL to skip)
//
void
nvswitch_get_error
(
    nvswitch_device *device,
    NVSWITCH_ERROR_LOG_TYPE *errors,
    NVSWITCH_ERROR_TYPE *error_entry,
    NvU32 error_idx,
    NvU32 *error_count
)
{
    NVSWITCH_ASSERT(errors != NULL);

    if (error_entry != NULL)
    {
        if (error_idx < errors->error_count)
        {
            // Translate the logical index into a physical ring slot.
            NvU32 slot = (errors->error_start + error_idx) % errors->error_log_size;

            *error_entry = errors->error_log[slot];
        }
        else
        {
            // Index out-of-range: hand back a placeholder entry.
            nvswitch_os_memset(error_entry, 0, sizeof(*error_entry));

            error_entry->error_type = 0;
            error_entry->instance = 0;
            error_entry->subinstance = 0;
            error_entry->local_error_num = errors->error_total;

            if (device == NULL)
            {
                error_entry->global_error_num = 0;
            }
            else
            {
                error_entry->global_error_num = device->error_total;
            }

            error_entry->error_src = NVSWITCH_ERROR_SRC_NONE;
            error_entry->severity = NVSWITCH_ERROR_SEVERITY_NONFATAL;
            error_entry->error_resolved = NV_TRUE;
            error_entry->line = 0;

            if (device == NULL)
            {
                error_entry->timer_count = 0;
            }
            else
            {
                error_entry->timer_count = nvswitch_hw_counter_read_counter(device);
            }

            error_entry->time = nvswitch_os_get_platform_time();
        }
    }

    if (error_count != NULL)
    {
        *error_count = errors->error_count;
    }
}
//
// Retrieve the oldest logged error entry.
//
// The entry is fetched with nvswitch_get_error() at index 0 and, if
// remove_from_list is set, one entry is then discarded from the log.
//
// Note: *error_count is written by the fetch, i.e. before the optional
// removal, so it still counts the entry that is about to be discarded.
//
void
nvswitch_get_next_error
(
    nvswitch_device *device,
    NVSWITCH_ERROR_LOG_TYPE *errors,
    NVSWITCH_ERROR_TYPE *error_entry,
    NvU32 *error_count,
    NvBool remove_from_list
)
{
    // Index 0 is the oldest entry in the log.
    nvswitch_get_error(device, errors, error_entry, 0, error_count);

    if (!remove_from_list)
    {
        return;
    }

    // Drop the entry that was just read.
    nvswitch_discard_errors(errors, 1);
}
//
// Map an NVSWITCH_ERR_HW_* error type onto its hardware-unit category.
//
// Each category covers the half-open range [<base>, <base>_LAST).  The ranges
// are tested in a fixed order and the first match wins, so the order below
// must be preserved.  An error type outside every range is logged, asserted
// on, and reported as NVSWITCH_NVLINK_HW_GENERIC.
//
// NOTE(review): the last four ranges (NVLIPT_LNK, SOE, NPORT_MULTICASTTSTATE,
// NPORT_REDUCTIONTSTATE) return the NVSWITCH_ERR_HW_* range base itself
// rather than an NVSWITCH_NVLINK_HW_* value, unlike every other range.
// Confirm whether that is intended before relying on those return values.
//
NVSWITCH_NVLINK_HW_ERROR
nvswitch_translate_hw_error
(
    NVSWITCH_ERR_TYPE type
)
{
    // Update this assert after adding a new translation entry below
    ct_assert(NVSWITCH_ERR_HW_NPORT_REDUCTIONTSTATE_LAST == (NVSWITCH_ERR_LAST - 1));

    if ((type >= NVSWITCH_ERR_HW_NPORT_INGRESS) &&
        (type < NVSWITCH_ERR_HW_NPORT_INGRESS_LAST))
    {
        return NVSWITCH_NVLINK_HW_INGRESS;
    }

    if ((type >= NVSWITCH_ERR_HW_NPORT_EGRESS) &&
        (type < NVSWITCH_ERR_HW_NPORT_EGRESS_LAST))
    {
        return NVSWITCH_NVLINK_HW_EGRESS;
    }

    if ((type >= NVSWITCH_ERR_HW_NPORT_FSTATE) &&
        (type < NVSWITCH_ERR_HW_NPORT_FSTATE_LAST))
    {
        return NVSWITCH_NVLINK_HW_FSTATE;
    }

    if ((type >= NVSWITCH_ERR_HW_NPORT_TSTATE) &&
        (type < NVSWITCH_ERR_HW_NPORT_TSTATE_LAST))
    {
        return NVSWITCH_NVLINK_HW_TSTATE;
    }

    if ((type >= NVSWITCH_ERR_HW_NPORT_ROUTE) &&
        (type < NVSWITCH_ERR_HW_NPORT_ROUTE_LAST))
    {
        return NVSWITCH_NVLINK_HW_ROUTE;
    }

    if ((type >= NVSWITCH_ERR_HW_NPORT) &&
        (type < NVSWITCH_ERR_HW_NPORT_LAST))
    {
        return NVSWITCH_NVLINK_HW_NPORT;
    }

    if ((type >= NVSWITCH_ERR_HW_NVLCTRL) &&
        (type < NVSWITCH_ERR_HW_NVLCTRL_LAST))
    {
        return NVSWITCH_NVLINK_HW_NVLCTRL;
    }

    if ((type >= NVSWITCH_ERR_HW_NVLIPT) &&
        (type < NVSWITCH_ERR_HW_NVLIPT_LAST))
    {
        return NVSWITCH_NVLINK_HW_NVLIPT;
    }

    if ((type >= NVSWITCH_ERR_HW_NVLTLC) &&
        (type < NVSWITCH_ERR_HW_NVLTLC_LAST))
    {
        return NVSWITCH_NVLINK_HW_NVLTLC;
    }

    if ((type >= NVSWITCH_ERR_HW_DLPL) &&
        (type < NVSWITCH_ERR_HW_DLPL_LAST))
    {
        return NVSWITCH_NVLINK_HW_DLPL;
    }

    if ((type >= NVSWITCH_ERR_HW_AFS) &&
        (type < NVSWITCH_ERR_HW_AFS_LAST))
    {
        return NVSWITCH_NVLINK_HW_AFS;
    }

    if ((type >= NVSWITCH_ERR_HW_HOST) &&
        (type < NVSWITCH_ERR_HW_HOST_LAST))
    {
        return NVSWITCH_NVLINK_HW_HOST;
    }

    if ((type >= NVSWITCH_ERR_HW_MINION) &&
        (type < NVSWITCH_ERR_HW_MINION_LAST))
    {
        return NVSWITCH_NVLINK_HW_MINION;
    }

    if ((type >= NVSWITCH_ERR_HW_NXBAR) &&
        (type < NVSWITCH_ERR_HW_NXBAR_LAST))
    {
        return NVSWITCH_NVLINK_HW_NXBAR;
    }

    if ((type >= NVSWITCH_ERR_HW_NPORT_SOURCETRACK) &&
        (type < NVSWITCH_ERR_HW_NPORT_SOURCETRACK_LAST))
    {
        return NVSWITCH_NVLINK_HW_SOURCETRACK;
    }

    if ((type >= NVSWITCH_ERR_HW_NVLIPT_LNK) &&
        (type < NVSWITCH_ERR_HW_NVLIPT_LNK_LAST))
    {
        return NVSWITCH_ERR_HW_NVLIPT_LNK;
    }

    if ((type >= NVSWITCH_ERR_HW_SOE) &&
        (type < NVSWITCH_ERR_HW_SOE_LAST))
    {
        return NVSWITCH_ERR_HW_SOE;
    }

    if ((type >= NVSWITCH_ERR_HW_NPORT_MULTICASTTSTATE) &&
        (type < NVSWITCH_ERR_HW_NPORT_MULTICASTTSTATE_LAST))
    {
        return NVSWITCH_ERR_HW_NPORT_MULTICASTTSTATE;
    }

    if ((type >= NVSWITCH_ERR_HW_NPORT_REDUCTIONTSTATE) &&
        (type < NVSWITCH_ERR_HW_NPORT_REDUCTIONTSTATE_LAST))
    {
        return NVSWITCH_ERR_HW_NPORT_REDUCTIONTSTATE;
    }

    // No range matched: report it and fall back to the generic category.
    NVSWITCH_PRINT(NULL, ERROR,
        "%s: Undefined error type\n", __FUNCTION__);
    NVSWITCH_ASSERT(0);
    return NVSWITCH_NVLINK_HW_GENERIC;
}
//
// Derive the architectural error class from an entry's severity:
//   FATAL                  -> HW_FATAL
//   NONFATAL, resolved     -> HW_CORRECTABLE
//   NONFATAL, not resolved -> HW_UNCORRECTABLE
//   any other severity     -> GENERIC
//
// error_entry must not be NULL.
//
static NVSWITCH_NVLINK_ARCH_ERROR
_nvswitch_translate_arch_error
(
    NVSWITCH_ERROR_TYPE *error_entry
)
{
    switch (error_entry->severity)
    {
        case NVSWITCH_ERROR_SEVERITY_FATAL:
            return NVSWITCH_NVLINK_ARCH_ERROR_HW_FATAL;

        case NVSWITCH_ERROR_SEVERITY_NONFATAL:
            return error_entry->error_resolved ?
                       NVSWITCH_NVLINK_ARCH_ERROR_HW_CORRECTABLE :
                       NVSWITCH_NVLINK_ARCH_ERROR_HW_UNCORRECTABLE;

        default:
            break;
    }

    return NVSWITCH_NVLINK_ARCH_ERROR_GENERIC;
}
//
// Translate a logged error entry into its architectural and hardware-unit
// error classes.
//
// Both outputs are optional (pass NULL to skip) and are first set to their
// respective *_NONE values.  Only entries with error_src ==
// NVSWITCH_ERROR_SRC_HW are translated; any other source is logged and
// asserted on, leaving the outputs at *_NONE.
//
// error_entry - entry to translate; must not be NULL
// arch_error  - receives the architectural class (may be NULL)
// hw_error    - receives the hardware-unit class (may be NULL)
//
void
nvswitch_translate_error
(
    NVSWITCH_ERROR_TYPE *error_entry,
    NVSWITCH_NVLINK_ARCH_ERROR *arch_error,
    NVSWITCH_NVLINK_HW_ERROR *hw_error
)
{
    NVSWITCH_ASSERT(error_entry != NULL);

    // Start from "no error" so untranslated sources yield defined outputs.
    if (arch_error != NULL)
    {
        *arch_error = NVSWITCH_NVLINK_ARCH_ERROR_NONE;
    }
    if (hw_error != NULL)
    {
        *hw_error = NVSWITCH_NVLINK_HW_ERROR_NONE;
    }

    if (error_entry->error_src != NVSWITCH_ERROR_SRC_HW)
    {
        NVSWITCH_PRINT(NULL, ERROR,
            "%s: Undefined error source\n", __FUNCTION__);
        NVSWITCH_ASSERT(0);
        return;
    }

    if (arch_error != NULL)
    {
        *arch_error = _nvswitch_translate_arch_error(error_entry);
    }
    if (hw_error != NULL)
    {
        *hw_error = nvswitch_translate_hw_error(error_entry->error_type);
    }
}
//
// Control call: copy logged errors of one severity out to the client.
//
// p->errorType selects the fatal or non-fatal log; any other value yields
// -NVL_BAD_ARGS.  p->error[] is cleared, p->nextErrorIndex is set from
// NVSWITCH_ERROR_NEXT_LOCAL_NUMBER() on the selected log, and up to
// NVSWITCH_ERROR_COUNT_SIZE entries are copied out, starting with the first
// entry whose local_error_num is not below the client's p->errorIndex.
//
// When at least one entry is returned, p->errorIndex is advanced to one past
// the local_error_num of the last entry returned, so the client can pass it
// back to continue where it left off.  When nothing is returned,
// p->errorIndex is left as the client supplied it and p->errorCount is 0.
//
// Returns NVL_SUCCESS, including when there is nothing to return.
//
NvlStatus
nvswitch_ctrl_get_errors
(
    nvswitch_device *device,
    NVSWITCH_GET_ERRORS_PARAMS *p
)
{
    NvU32 index = 0;
    NvU32 count = 0;
    NVSWITCH_ERROR_LOG_TYPE *error_log;
    NVSWITCH_ERROR_TYPE error;

    switch (p->errorType)
    {
        case NVSWITCH_ERROR_SEVERITY_FATAL:
            error_log = &device->log_FATAL_ERRORS;
            break;
        case NVSWITCH_ERROR_SEVERITY_NONFATAL:
            error_log = &device->log_NONFATAL_ERRORS;
            break;
        default:
            return -NVL_BAD_ARGS;
    }

    nvswitch_os_memset(p->error, 0, sizeof(NVSWITCH_ERROR) *
                       NVSWITCH_ERROR_COUNT_SIZE);
    p->nextErrorIndex = NVSWITCH_ERROR_NEXT_LOCAL_NUMBER(error_log);
    p->errorCount = 0;

    // Peek at the oldest entry; if the log is empty there is nothing to do.
    nvswitch_get_error(device, error_log, &error, index, &count);
    if (count == 0)
    {
        return NVL_SUCCESS;
    }

    //
    // If the error's local_error_num is smaller than the errorIndex
    // passed in by the client, fast-forward index by the difference.
    // This will skip over errors that were previously read by the client.
    //
    if (error.local_error_num < p->errorIndex)
    {
        //
        // Compare the distance against count BEFORE narrowing it to NvU32.
        // Narrowing first would let a very large client-supplied errorIndex
        // wrap to a small index and replay entries the client already read.
        //
        if ((p->errorIndex - error.local_error_num) >= count)
        {
            // Nothing left after fast-forwarding.
            return NVL_SUCCESS;
        }

        index = (NvU32) (p->errorIndex - error.local_error_num);
    }

    // index < count holds here, so the loop below runs at least once.
    while ((p->errorCount < NVSWITCH_ERROR_COUNT_SIZE) && (index < count))
    {
        // Get the next error to consider from the log
        nvswitch_get_error(device, error_log, &error, index, NULL);

        p->error[p->errorCount].error_value = error.error_type;
        p->error[p->errorCount].error_src = error.error_src;
        p->error[p->errorCount].instance = error.instance;
        p->error[p->errorCount].subinstance = error.subinstance;
        p->error[p->errorCount].time = error.time;
        p->error[p->errorCount].error_resolved = error.error_resolved;

        p->errorCount++;
        index++;
    }

    // Resume point for the client: one past the last entry handed out.
    p->errorIndex = error.local_error_num + 1;

    return NVL_SUCCESS;
}