mirror of
https://github.com/NVIDIA/open-gpu-kernel-modules.git
synced 2026-01-27 03:29:47 +00:00
556 lines
16 KiB
C
556 lines
16 KiB
C
/*
|
|
* SPDX-FileCopyrightText: Copyright (c) 2018-2019 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
|
* SPDX-License-Identifier: MIT
|
|
*
|
|
* Permission is hereby granted, free of charge, to any person obtaining a
|
|
* copy of this software and associated documentation files (the "Software"),
|
|
* to deal in the Software without restriction, including without limitation
|
|
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
|
|
* and/or sell copies of the Software, and to permit persons to whom the
|
|
* Software is furnished to do so, subject to the following conditions:
|
|
*
|
|
* The above copyright notice and this permission notice shall be included in
|
|
* all copies or substantial portions of the Software.
|
|
*
|
|
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
|
|
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
|
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
|
|
* DEALINGS IN THE SOFTWARE.
|
|
*/
|
|
|
|
#include "common_nvswitch.h"
|
|
#include "error_nvswitch.h"
|
|
|
|
#define NVSWITCH_DATE_LEN 64
|
|
|
|
//
|
|
// Error logging
|
|
//
|
|
static void
|
|
_nvswitch_dump_error_entry
|
|
(
|
|
nvswitch_device *device,
|
|
NvU32 error_count,
|
|
NVSWITCH_ERROR_TYPE *error_entry
|
|
)
|
|
{
|
|
if ((error_entry != NULL) &&
|
|
(error_entry->error_src == NVSWITCH_ERROR_SRC_HW))
|
|
{
|
|
NVSWITCH_PRINT_SXID_NO_BBX(device, error_entry->error_type,
|
|
"Severity %d Engine instance %02d Sub-engine instance %02d\n",
|
|
error_entry->severity, error_entry->instance, error_entry->subinstance);
|
|
|
|
NVSWITCH_PRINT_SXID_NO_BBX(device, error_entry->error_type,
|
|
"Data {0x%08x, 0x%08x, 0x%08x, 0x%08x, 0x%08x, 0x%08x, 0x%08x, 0x%08x, 0x%08x}\n",
|
|
error_entry->data.raw.flags,
|
|
error_entry->data.raw.data[0], error_entry->data.raw.data[1],
|
|
error_entry->data.raw.data[2], error_entry->data.raw.data[3],
|
|
error_entry->data.raw.data[4], error_entry->data.raw.data[5],
|
|
error_entry->data.raw.data[6], error_entry->data.raw.data[7]);
|
|
|
|
if ((error_entry->data.raw.data[ 8] != 0) ||
|
|
(error_entry->data.raw.data[ 9] != 0) ||
|
|
(error_entry->data.raw.data[10] != 0) ||
|
|
(error_entry->data.raw.data[11] != 0) ||
|
|
(error_entry->data.raw.data[12] != 0) ||
|
|
(error_entry->data.raw.data[13] != 0) ||
|
|
(error_entry->data.raw.data[14] != 0) ||
|
|
(error_entry->data.raw.data[15] != 0))
|
|
|
|
{
|
|
NVSWITCH_PRINT_SXID_NO_BBX(device, error_entry->error_type,
|
|
"Data {0x%08x, 0x%08x, 0x%08x, 0x%08x, 0x%08x, 0x%08x, 0x%08x, 0x%08x}\n",
|
|
error_entry->data.raw.data[ 8], error_entry->data.raw.data[ 9],
|
|
error_entry->data.raw.data[10], error_entry->data.raw.data[11],
|
|
error_entry->data.raw.data[12], error_entry->data.raw.data[13],
|
|
error_entry->data.raw.data[14], error_entry->data.raw.data[15]);
|
|
}
|
|
}
|
|
}
|
|
|
|
//
|
|
// Construct an error log
|
|
//
|
|
// If error_log_size > 0 a circular buffer is created to record errors
|
|
//
|
|
NvlStatus
|
|
nvswitch_construct_error_log
|
|
(
|
|
NVSWITCH_ERROR_LOG_TYPE *errors,
|
|
NvU32 error_log_size,
|
|
NvBool overwritable
|
|
)
|
|
{
|
|
NvlStatus retval = NVL_SUCCESS;
|
|
|
|
NVSWITCH_ASSERT(errors != NULL);
|
|
|
|
errors->error_start = 0;
|
|
errors->error_count = 0;
|
|
errors->error_total = 0;
|
|
errors->error_log_size = 0;
|
|
errors->error_log = NULL;
|
|
errors->overwritable = overwritable;
|
|
|
|
if (error_log_size > 0)
|
|
{
|
|
errors->error_log = nvswitch_os_malloc(error_log_size * sizeof(NVSWITCH_ERROR_TYPE));
|
|
}
|
|
|
|
if (errors->error_log != NULL)
|
|
{
|
|
errors->error_log_size = error_log_size;
|
|
nvswitch_os_memset(errors->error_log, 0, errors->error_log_size * sizeof(NVSWITCH_ERROR_TYPE));
|
|
}
|
|
|
|
if (error_log_size != errors->error_log_size)
|
|
{
|
|
retval = -NVL_NO_MEM;
|
|
}
|
|
|
|
return retval;
|
|
}
|
|
|
|
//
|
|
// Destroy an error log
|
|
//
|
|
void
|
|
nvswitch_destroy_error_log
|
|
(
|
|
nvswitch_device *device,
|
|
NVSWITCH_ERROR_LOG_TYPE *errors
|
|
)
|
|
{
|
|
if (errors == NULL)
|
|
return;
|
|
|
|
errors->error_start = 0;
|
|
errors->error_count = 0;
|
|
//errors->error_total = 0; // Don't reset total count of errors logged
|
|
errors->error_log_size = 0;
|
|
|
|
if (errors->error_log != NULL)
|
|
{
|
|
nvswitch_os_free(errors->error_log);
|
|
errors->error_log = NULL;
|
|
}
|
|
}
|
|
|
|
void
|
|
nvswitch_record_error
|
|
(
|
|
nvswitch_device *device,
|
|
NVSWITCH_ERROR_LOG_TYPE *errors,
|
|
NvU32 error_type, // NVSWITCH_ERR_*
|
|
NvU32 instance,
|
|
NvU32 subinstance,
|
|
NVSWITCH_ERROR_SRC_TYPE error_src, // NVSWITCH_ERROR_SRC_*
|
|
NVSWITCH_ERROR_SEVERITY_TYPE severity, // NVSWITCH_ERROR_SEVERITY_*
|
|
NvBool error_resolved,
|
|
void *data,
|
|
NvU32 data_size,
|
|
NvU32 line
|
|
)
|
|
{
|
|
NvU32 idx_error;
|
|
|
|
NVSWITCH_ASSERT(errors != NULL);
|
|
NVSWITCH_ASSERT(data_size <= sizeof(errors->error_log[idx_error].data));
|
|
|
|
// If no error log has been created, don't log it.
|
|
if ((errors->error_log_size != 0) && (errors->error_log != NULL))
|
|
{
|
|
idx_error = (errors->error_start + errors->error_count) % errors->error_log_size;
|
|
|
|
if (errors->error_count == errors->error_log_size)
|
|
{
|
|
// Error ring buffer already full.
|
|
if (errors->overwritable)
|
|
{
|
|
errors->error_start = (errors->error_start + 1) % errors->error_log_size;
|
|
}
|
|
else
|
|
{
|
|
// Return: ring buffer full
|
|
return;
|
|
}
|
|
}
|
|
else
|
|
{
|
|
errors->error_count++;
|
|
}
|
|
|
|
// Log error info
|
|
errors->error_log[idx_error].error_type = error_type;
|
|
errors->error_log[idx_error].instance = instance;
|
|
errors->error_log[idx_error].subinstance = subinstance;
|
|
errors->error_log[idx_error].error_src = error_src;
|
|
errors->error_log[idx_error].severity = severity;
|
|
errors->error_log[idx_error].error_resolved = error_resolved;
|
|
errors->error_log[idx_error].line = line;
|
|
|
|
// Log tracking info
|
|
errors->error_log[idx_error].timer_count = nvswitch_hw_counter_read_counter(device);
|
|
errors->error_log[idx_error].time = nvswitch_os_get_platform_time();
|
|
errors->error_log[idx_error].local_error_num = errors->error_total;
|
|
errors->error_log[idx_error].global_error_num = device->error_total;
|
|
|
|
// Copy ancillary data blob
|
|
nvswitch_os_memset(&errors->error_log[idx_error].data, 0, sizeof(errors->error_log[idx_error].data));
|
|
if ((data != NULL) && (data_size > 0))
|
|
{
|
|
nvswitch_os_memcpy(&errors->error_log[idx_error].data, data, data_size);
|
|
}
|
|
|
|
_nvswitch_dump_error_entry(device, idx_error, &errors->error_log[idx_error]);
|
|
}
|
|
errors->error_total++;
|
|
device->error_total++;
|
|
}
|
|
|
|
//
|
|
// Discard N errors from the specified log
|
|
//
|
|
|
|
void
|
|
nvswitch_discard_errors
|
|
(
|
|
NVSWITCH_ERROR_LOG_TYPE *errors,
|
|
NvU32 error_discard_count
|
|
)
|
|
{
|
|
error_discard_count = NV_MIN(error_discard_count, errors->error_count);
|
|
errors->error_start = (errors->error_start+error_discard_count) % errors->error_log_size;
|
|
errors->error_count -= error_discard_count;
|
|
}
|
|
|
|
//
|
|
// Retrieve an error entry by index.
|
|
// 0 = oldest error
|
|
// Out-of-range index does not return an error, but does return an error of type "NO_ERROR"
|
|
// error_count returns how many errors in the error log
|
|
//
|
|
|
|
void
|
|
nvswitch_get_error
|
|
(
|
|
nvswitch_device *device,
|
|
NVSWITCH_ERROR_LOG_TYPE *errors,
|
|
NVSWITCH_ERROR_TYPE *error_entry,
|
|
NvU32 error_idx,
|
|
NvU32 *error_count
|
|
)
|
|
{
|
|
NVSWITCH_ASSERT(errors != NULL);
|
|
|
|
if (error_entry != NULL)
|
|
{
|
|
if (error_idx >= errors->error_count)
|
|
{
|
|
// Index out-of-range
|
|
nvswitch_os_memset(error_entry, 0, sizeof(*error_entry));
|
|
error_entry->error_type = 0;
|
|
error_entry->instance = 0;
|
|
error_entry->subinstance = 0;
|
|
error_entry->local_error_num = errors->error_total;
|
|
error_entry->global_error_num = ((device == NULL) ? 0 : device->error_total);
|
|
error_entry->error_src = NVSWITCH_ERROR_SRC_NONE;
|
|
error_entry->severity = NVSWITCH_ERROR_SEVERITY_NONFATAL;
|
|
error_entry->error_resolved = NV_TRUE;
|
|
error_entry->line = 0;
|
|
error_entry->timer_count =
|
|
((device == NULL) ? 0 : nvswitch_hw_counter_read_counter(device));
|
|
error_entry->time = nvswitch_os_get_platform_time();
|
|
}
|
|
else
|
|
{
|
|
*error_entry = errors->error_log[(errors->error_start + error_idx) % errors->error_log_size];
|
|
}
|
|
}
|
|
|
|
if (error_count)
|
|
{
|
|
*error_count = errors->error_count;
|
|
}
|
|
}
|
|
|
|
|
|
//
|
|
// Retrieve the oldest logged error entry.
|
|
// Optionally remove the error entry after reading
|
|
// error_count returns how many remaining errors in the error log
|
|
//
|
|
|
|
void
|
|
nvswitch_get_next_error
|
|
(
|
|
nvswitch_device *device,
|
|
NVSWITCH_ERROR_LOG_TYPE *errors,
|
|
NVSWITCH_ERROR_TYPE *error_entry,
|
|
NvU32 *error_count,
|
|
NvBool remove_from_list
|
|
)
|
|
{
|
|
nvswitch_get_error(device, errors, error_entry, 0, error_count);
|
|
|
|
// Optionally remove the error from the log
|
|
if (remove_from_list)
|
|
{
|
|
nvswitch_discard_errors(errors, 1);
|
|
}
|
|
}
|
|
|
|
NVSWITCH_NVLINK_HW_ERROR
|
|
nvswitch_translate_hw_error
|
|
(
|
|
NVSWITCH_ERR_TYPE type
|
|
)
|
|
{
|
|
if ((type >= NVSWITCH_ERR_HW_NPORT_INGRESS) &&
|
|
(type < NVSWITCH_ERR_HW_NPORT_INGRESS_LAST))
|
|
{
|
|
return NVSWITCH_NVLINK_HW_INGRESS;
|
|
}
|
|
else if ((type >= NVSWITCH_ERR_HW_NPORT_EGRESS) &&
|
|
(type < NVSWITCH_ERR_HW_NPORT_EGRESS_LAST))
|
|
{
|
|
return NVSWITCH_NVLINK_HW_EGRESS;
|
|
}
|
|
else if ((type >= NVSWITCH_ERR_HW_NPORT_FSTATE) &&
|
|
(type < NVSWITCH_ERR_HW_NPORT_FSTATE_LAST))
|
|
{
|
|
return NVSWITCH_NVLINK_HW_FSTATE;
|
|
}
|
|
else if ((type >= NVSWITCH_ERR_HW_NPORT_TSTATE) &&
|
|
(type < NVSWITCH_ERR_HW_NPORT_TSTATE_LAST))
|
|
{
|
|
return NVSWITCH_NVLINK_HW_TSTATE;
|
|
}
|
|
else if ((type >= NVSWITCH_ERR_HW_NPORT_ROUTE) &&
|
|
(type < NVSWITCH_ERR_HW_NPORT_ROUTE_LAST))
|
|
{
|
|
return NVSWITCH_NVLINK_HW_ROUTE;
|
|
}
|
|
else if ((type >= NVSWITCH_ERR_HW_NPORT) &&
|
|
(type < NVSWITCH_ERR_HW_NPORT_LAST))
|
|
{
|
|
return NVSWITCH_NVLINK_HW_NPORT;
|
|
}
|
|
else if ((type >= NVSWITCH_ERR_HW_NVLCTRL) &&
|
|
(type < NVSWITCH_ERR_HW_NVLCTRL_LAST))
|
|
{
|
|
return NVSWITCH_NVLINK_HW_NVLCTRL;
|
|
}
|
|
else if ((type >= NVSWITCH_ERR_HW_NVLIPT) &&
|
|
(type < NVSWITCH_ERR_HW_NVLIPT_LAST))
|
|
{
|
|
return NVSWITCH_NVLINK_HW_NVLIPT;
|
|
}
|
|
else if ((type >= NVSWITCH_ERR_HW_NVLTLC) &&
|
|
(type < NVSWITCH_ERR_HW_NVLTLC_LAST))
|
|
{
|
|
return NVSWITCH_NVLINK_HW_NVLTLC;
|
|
}
|
|
else if ((type >= NVSWITCH_ERR_HW_DLPL) &&
|
|
(type < NVSWITCH_ERR_HW_DLPL_LAST))
|
|
{
|
|
return NVSWITCH_NVLINK_HW_DLPL;
|
|
}
|
|
else if ((type >= NVSWITCH_ERR_HW_AFS) &&
|
|
(type < NVSWITCH_ERR_HW_AFS_LAST))
|
|
{
|
|
return NVSWITCH_NVLINK_HW_AFS;
|
|
}
|
|
else if ((type >= NVSWITCH_ERR_HW_HOST) &&
|
|
(type < NVSWITCH_ERR_HW_HOST_LAST))
|
|
{
|
|
return NVSWITCH_NVLINK_HW_HOST;
|
|
}
|
|
else if ((type >= NVSWITCH_ERR_HW_MINION) &&
|
|
(type < NVSWITCH_ERR_HW_MINION_LAST))
|
|
{
|
|
return NVSWITCH_NVLINK_HW_MINION;
|
|
}
|
|
else if ((type >= NVSWITCH_ERR_HW_NXBAR) &&
|
|
(type < NVSWITCH_ERR_HW_NXBAR_LAST))
|
|
{
|
|
return NVSWITCH_NVLINK_HW_NXBAR;
|
|
}
|
|
else if ((type >= NVSWITCH_ERR_HW_NPORT_SOURCETRACK) &&
|
|
(type < NVSWITCH_ERR_HW_NPORT_SOURCETRACK_LAST))
|
|
{
|
|
return NVSWITCH_NVLINK_HW_SOURCETRACK;
|
|
}
|
|
else if ((type >= NVSWITCH_ERR_HW_NVLIPT_LNK) &&
|
|
(type < NVSWITCH_ERR_HW_NVLIPT_LNK_LAST))
|
|
{
|
|
return NVSWITCH_ERR_HW_NVLIPT_LNK;
|
|
}
|
|
else if ((type >= NVSWITCH_ERR_HW_SOE) &&
|
|
(type < NVSWITCH_ERR_HW_SOE_LAST))
|
|
{
|
|
return NVSWITCH_ERR_HW_SOE;
|
|
}
|
|
else if ((type >= NVSWITCH_ERR_HW_NPORT_MULTICASTTSTATE) &&
|
|
(type < NVSWITCH_ERR_HW_NPORT_MULTICASTTSTATE_LAST))
|
|
{
|
|
return NVSWITCH_ERR_HW_NPORT_MULTICASTTSTATE;
|
|
}
|
|
else if ((type >= NVSWITCH_ERR_HW_NPORT_REDUCTIONTSTATE) &&
|
|
(type < NVSWITCH_ERR_HW_NPORT_REDUCTIONTSTATE_LAST))
|
|
{
|
|
return NVSWITCH_ERR_HW_NPORT_REDUCTIONTSTATE;
|
|
}
|
|
else
|
|
{
|
|
// Update this assert after adding a new translation entry above
|
|
ct_assert(NVSWITCH_ERR_HW_NPORT_REDUCTIONTSTATE_LAST == (NVSWITCH_ERR_LAST - 1));
|
|
|
|
NVSWITCH_PRINT(NULL, ERROR,
|
|
"%s: Undefined error type\n", __FUNCTION__);
|
|
NVSWITCH_ASSERT(0);
|
|
return NVSWITCH_NVLINK_HW_GENERIC;
|
|
}
|
|
}
|
|
|
|
static NVSWITCH_NVLINK_ARCH_ERROR
|
|
_nvswitch_translate_arch_error
|
|
(
|
|
NVSWITCH_ERROR_TYPE *error_entry
|
|
)
|
|
{
|
|
if (error_entry->severity == NVSWITCH_ERROR_SEVERITY_FATAL)
|
|
{
|
|
return NVSWITCH_NVLINK_ARCH_ERROR_HW_FATAL;
|
|
}
|
|
else if (error_entry->severity == NVSWITCH_ERROR_SEVERITY_NONFATAL)
|
|
{
|
|
if (error_entry->error_resolved)
|
|
{
|
|
return NVSWITCH_NVLINK_ARCH_ERROR_HW_CORRECTABLE;
|
|
}
|
|
else
|
|
{
|
|
return NVSWITCH_NVLINK_ARCH_ERROR_HW_UNCORRECTABLE;
|
|
}
|
|
}
|
|
|
|
return NVSWITCH_NVLINK_ARCH_ERROR_GENERIC;
|
|
}
|
|
|
|
void
|
|
nvswitch_translate_error
|
|
(
|
|
NVSWITCH_ERROR_TYPE *error_entry,
|
|
NVSWITCH_NVLINK_ARCH_ERROR *arch_error,
|
|
NVSWITCH_NVLINK_HW_ERROR *hw_error
|
|
)
|
|
{
|
|
NVSWITCH_ASSERT(error_entry != NULL);
|
|
|
|
if (arch_error)
|
|
{
|
|
*arch_error = NVSWITCH_NVLINK_ARCH_ERROR_NONE;
|
|
}
|
|
|
|
if (hw_error)
|
|
{
|
|
*hw_error = NVSWITCH_NVLINK_HW_ERROR_NONE;
|
|
}
|
|
|
|
if (error_entry->error_src == NVSWITCH_ERROR_SRC_HW)
|
|
{
|
|
if (arch_error)
|
|
{
|
|
*arch_error = _nvswitch_translate_arch_error(error_entry);
|
|
}
|
|
|
|
if (hw_error)
|
|
{
|
|
*hw_error = nvswitch_translate_hw_error(error_entry->error_type);
|
|
}
|
|
}
|
|
else
|
|
{
|
|
NVSWITCH_PRINT(NULL, ERROR,
|
|
"%s: Undefined error source\n", __FUNCTION__);
|
|
NVSWITCH_ASSERT(0);
|
|
}
|
|
}
|
|
|
|
NvlStatus
|
|
nvswitch_ctrl_get_errors
|
|
(
|
|
nvswitch_device *device,
|
|
NVSWITCH_GET_ERRORS_PARAMS *p
|
|
)
|
|
{
|
|
NvU32 index = 0;
|
|
NvU32 count = 0;
|
|
NVSWITCH_ERROR_LOG_TYPE *error_log;
|
|
NVSWITCH_ERROR_TYPE error;
|
|
|
|
switch (p->errorType)
|
|
{
|
|
case NVSWITCH_ERROR_SEVERITY_FATAL:
|
|
error_log = &device->log_FATAL_ERRORS;
|
|
break;
|
|
case NVSWITCH_ERROR_SEVERITY_NONFATAL:
|
|
error_log = &device->log_NONFATAL_ERRORS;
|
|
break;
|
|
default:
|
|
return -NVL_BAD_ARGS;
|
|
}
|
|
|
|
nvswitch_os_memset(p->error, 0, sizeof(NVSWITCH_ERROR) *
|
|
NVSWITCH_ERROR_COUNT_SIZE);
|
|
p->nextErrorIndex = NVSWITCH_ERROR_NEXT_LOCAL_NUMBER(error_log);
|
|
p->errorCount = 0;
|
|
|
|
// If there is nothing to do, return.
|
|
nvswitch_get_error(device, error_log, &error, index, &count);
|
|
if (count == 0)
|
|
{
|
|
return NVL_SUCCESS;
|
|
}
|
|
|
|
//
|
|
// If the error's local_error_num is smaller than the errorIndex
|
|
// passed in by the client, fast-forward index by the difference.
|
|
// This will skip over errors that were previously read by the client.
|
|
//
|
|
if (error.local_error_num < p->errorIndex)
|
|
{
|
|
index = (NvU32) (p->errorIndex - error.local_error_num);
|
|
}
|
|
|
|
// If there is nothing to do after fast-forwarding, return.
|
|
if (index >= count)
|
|
{
|
|
return NVL_SUCCESS;
|
|
}
|
|
|
|
while ((p->errorCount < NVSWITCH_ERROR_COUNT_SIZE) && (index < count))
|
|
{
|
|
// Get the next error to consider from the log
|
|
nvswitch_get_error(device, error_log, &error, index, NULL);
|
|
|
|
p->error[p->errorCount].error_value = error.error_type;
|
|
p->error[p->errorCount].error_src = error.error_src;
|
|
p->error[p->errorCount].instance = error.instance;
|
|
p->error[p->errorCount].subinstance = error.subinstance;
|
|
p->error[p->errorCount].time = error.time;
|
|
p->error[p->errorCount].error_resolved = error.error_resolved;
|
|
p->errorCount++;
|
|
index++;
|
|
}
|
|
|
|
p->errorIndex = error.local_error_num + 1;
|
|
|
|
return NVL_SUCCESS;
|
|
}
|