This commit is contained in:
Andy Ritger
2022-11-10 08:39:33 -08:00
parent 7c345b838b
commit 758b4ee818
1323 changed files with 262135 additions and 60754 deletions

View File

@@ -0,0 +1,67 @@
/*
* SPDX-FileCopyrightText: Copyright (c) 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: MIT
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*/
#ifndef NVLINK_INBAND_DRV_HDR_H
#define NVLINK_INBAND_DRV_HDR_H
/*
* This header file defines the header that should be used by RM and NVSwitch
* driver to sync minions on both the sides before an actual inband message
* transfer is initiated.
*
* Modifying the existing header structure is not allowed. A versioning
* policy must be enforced if such changes are needed in the future.
*
* - Avoid use of enums or bit fields. Always use fixed types.
* - Avoid conditional fields in the structs
* - Avoid nested and complex structs. Keep them simple and flat for ease of
* encoding and decoding.
* - Avoid embedded pointers. Flexible arrays at the end of the struct are allowed.
* - Always use the packed struct to typecast inband messages. More details:
* - Always have reserved flags or fields to CYA given the stable ABI conditions.
*/
/*
 * Dependencies are included before the packing mode is changed, so that any
 * declarations in the included header keep their natural layout regardless of
 * which translation unit pulls it in first.
 */
#include "nvtypes.h"

/* Align to byte boundaries: everything declared below, up to the matching pack(pop), is byte-packed. */
#pragma pack(push, 1)

/*
 * Transfer limits.
 * NOTE(review): units are not stated in this header; presumably
 * NVLINK_INBAND_MAX_XFER_SIZE is a byte count (0x100 == 256) and
 * NVLINK_INBAND_MAX_XFER_AT_ONCE is a number of transfers -- confirm with users.
 */
#define NVLINK_INBAND_MAX_XFER_SIZE 0x100
#define NVLINK_INBAND_MAX_XFER_AT_ONCE 4

/*
 * Driver header type flags, one bit each (bits 0..2).
 * NOTE(review): NVBIT() is not defined in this header; confirm that nvtypes.h
 * (or every includer) provides it before these macros are expanded.
 */
#define NVLINK_INBAND_DRV_HDR_TYPE_START NVBIT(0)
#define NVLINK_INBAND_DRV_HDR_TYPE_MID NVBIT(1)
#define NVLINK_INBAND_DRV_HDR_TYPE_END NVBIT(2)
/* Rest of the bits are reserved for future use and must always be set to zero. */
/*
 * Driver-level in-band header consisting of a single NvU8 member.
 * Declared inside the pack(push, 1) region, so it is byte-packed.
 */
typedef struct
{
NvU8 data; /* NOTE(review): presumably carries the NVLINK_INBAND_DRV_HDR_TYPE_xxx bits, with all other bits zero -- confirm */
} nvlink_inband_drv_hdr_t;
/* Restore the packing mode that was saved by the pack(push, 1) above. */
#pragma pack(pop)
/* Don't add any code after this line */
#endif /* NVLINK_INBAND_DRV_HDR_H */

View File

@@ -0,0 +1,167 @@
/*
* SPDX-FileCopyrightText: Copyright (c) 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: MIT
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*/
#ifndef NVLINK_INBAND_MSG_HDR_H
#define NVLINK_INBAND_MSG_HDR_H
/*
* Messages do not have individual versioning, instead a strict ABI is maintained. When a change is
* required on existing message, instead of modifying corresponding message structure, a completely
* new message type (like INBAND_MSG_TYPE_XXX_V1, INBAND_MSG_TYPE_XXX_V2) and corresponding message
* definition structure needs to be added. Do not modify existing structs in any way.
*
* Messages may contain fields which are debug only and must be used for logging purpose. Such
* fields shouldn't be trusted.
*
* - Avoid use of enums or bitfields. Always use fixed types.
* - Avoid conditional fields in the structs.
* - Avoid nested and complex structs. Keep them simple and flat for ease of encoding and decoding.
* - Avoid embedded pointers. Flexible arrays at the end of the struct are allowed.
* - Always use the packed struct to typecast inband messages. More details:
* - Always have reserved flags or fields to CYA given the stable ABI conditions.
*/
/*
 * Dependencies are included before the packing mode is changed, so that
 * declarations in these headers keep their natural layout regardless of which
 * translation unit pulls them in first.
 */
#include "nvtypes.h"
#include "nvmisc.h"
#include "nvCpuUuid.h"
#include "nvstatus.h"
#include "nvstatuscodes.h"

/* Align to byte boundaries: everything declared below, up to the matching pack(pop), is byte-packed. */
#pragma pack(push, 1)

/* NOTE(review): presumably the upper bound, in bytes, of one encoded in-band message -- confirm. */
#define NVLINK_INBAND_MAX_MSG_SIZE 4096

/* Magic value for nvlink_inband_msg_header_t::magicId. NOTE(review): "FM" presumably means Fabric Manager -- confirm. */
#define NVLINK_INBAND_MSG_MAGIC_ID_FM 0xadbc

/*
 * Nvlink Inband message types, carried in nvlink_inband_msg_header_t::type.
 * Values are part of the stable ABI: never renumber, only append.
 */
#define NVLINK_INBAND_MSG_TYPE_GPU_PROBE_REQ 0
#define NVLINK_INBAND_MSG_TYPE_GPU_PROBE_RSP 1
#define NVLINK_INBAND_MSG_TYPE_MC_TEAM_SETUP_REQ 2
#define NVLINK_INBAND_MSG_TYPE_MC_TEAM_SETUP_RSP 3
#define NVLINK_INBAND_MSG_TYPE_MC_TEAM_RELEASE_REQ 4
/* One greater than the highest message type defined above; not itself a message type. */
#define NVLINK_INBAND_MSG_TYPE_MAX 5
/*
 * Nvlink Inband message packet header.
 *
 * This is the first member of every nvlink_inband_xxx_msg_t wrapper in this
 * file. It is declared inside the pack(push, 1) region, so the fields are laid
 * out back to back with no padding.
 */
typedef struct
{
NvU16 magicId; /* Identifier marking an in-band message. NOTE(review): the only magic defined in this header is NVLINK_INBAND_MSG_MAGIC_ID_FM -- confirm that is the intended value */
NvU64 requestId; /* Unique id of a request; the matching response carries the same id */
NV_STATUS status; /* High level status of the message/request */
NvU16 type; /* Type of encoded message. One of NVLINK_INBAND_MSG_TYPE_xxx */
NvU32 length; /* Length of encoded message. NOTE(review): units and whether the header is included are not stated here -- confirm */
NvU8 reserved[8]; /* For future use. Must be initialized to zero */
} nvlink_inband_msg_header_t;
/* Capability bit for nvlink_inband_gpu_probe_req_t::gpuCapMask. */
#define NVLINK_INBAND_GPU_PROBE_CAPS_SRIOV_ENABLED NVBIT(0)
/* Add more caps as needed in the future */
/*
 * Payload of a GPU probe request (wrapped by nvlink_inband_gpu_probe_req_msg_t).
 * Fields marked "debug only" are for logging and must not be trusted.
 */
typedef struct
{
NvU64 pciInfo; /* Encoded as Domain(63:32):Bus(15:8):Device(7:0). (debug only) */
NvU8 moduleId; /* GPIO based physical/module ID of the GPU. (debug only) */
NvUuid gpuUuid; /* UUID of the GPU. (debug only) */
NvU64 discoveredLinkMask; /* GPU's discovered NVLink mask info. (debug only) */
NvU64 enabledLinkMask; /* GPU's currently enabled NVLink mask info. (debug only) */
NvU32 gpuCapMask; /* GPU capabilities, a bitmask of NVLINK_INBAND_GPU_PROBE_CAPS_xxx */
NvU8 reserved[32]; /* For future use. Must be initialized to zero */
} nvlink_inband_gpu_probe_req_t;
/* Complete GPU probe request message: common header followed by the probe request payload. */
typedef struct
{
nvlink_inband_msg_header_t msgHdr; /* Common header; presumably type is NVLINK_INBAND_MSG_TYPE_GPU_PROBE_REQ -- confirm */
nvlink_inband_gpu_probe_req_t probeReq; /* Request payload */
} nvlink_inband_gpu_probe_req_msg_t;
/* 64-bit capability bits. NOTE(review): presumably reported in nvlink_inband_gpu_probe_rsp_t::fmCaps -- confirm. */
#define NVLINK_INBAND_FM_CAPS_MC_TEAM_SETUP_V1 NVBIT64(0)
#define NVLINK_INBAND_FM_CAPS_MC_TEAM_RELEASE_V1 NVBIT64(1)
/* Payload of a GPU probe response (wrapped by nvlink_inband_gpu_probe_rsp_msg_t). */
typedef struct
{
NvU64 gpuHandle; /* Unique handle assigned by initialization entity for this GPU */
NvU32 gfId; /* GFID which supports NVLink */
NvU64 fmCaps; /* Capabilities of FM, i.e. which features FM supports */
NvUuid clusterUuid; /* Cluster UUID to which this node belongs */
NvU16 fabricPartitionId; /* Partition ID if the GPU belongs to a fabric partition */
NvU64 gpaAddress; /* GPA starting address for the GPU */
NvU64 gpaAddressRange; /* GPU GPA address range */
NvU64 flaAddress; /* FLA starting address for the GPU */
NvU64 flaAddressRange; /* GPU FLA address range */
NvU8 reserved[32]; /* For future use. Must be initialized to zero */
} nvlink_inband_gpu_probe_rsp_t;
/* Complete GPU probe response message: common header followed by the probe response payload. */
typedef struct
{
nvlink_inband_msg_header_t msgHdr; /* Common header; presumably type is NVLINK_INBAND_MSG_TYPE_GPU_PROBE_RSP -- confirm */
nvlink_inband_gpu_probe_rsp_t probeRsp; /* Response payload */
} nvlink_inband_gpu_probe_rsp_msg_t;
/*
 * Payload of a multicast team setup request.
 *
 * Variable-length: the struct ends in a flexible array member, so sizeof()
 * covers only the fixed fields and storage for gpuHandles[] must be allocated
 * in addition by whoever builds the message.
 */
typedef struct
{
NvU64 mcAllocSize; /* Multicast allocation size requested */
NvU32 flags; /* For future use. Must be initialized to zero */
NvU8 reserved[8]; /* For future use. Must be initialized to zero */
NvU16 numGpuHandles; /* Number of GPUs in this team */
NvU64 gpuHandles[]; /* Array of probed handles, should be last. NOTE(review): presumably numGpuHandles entries -- confirm */
} nvlink_inband_mc_team_setup_req_t;
/*
 * Complete multicast team setup request message: common header followed by
 * the variable-length setup request payload.
 *
 * NOTE(review): mcTeamSetupReq ends in a flexible array member. ISO C does not
 * allow such a struct to be a member of another struct; this relies on a
 * compiler extension (accepted by GCC/Clang, diagnosed under -pedantic).
 * The payload must remain the last member.
 */
typedef struct
{
nvlink_inband_msg_header_t msgHdr; /* Common header; presumably type is NVLINK_INBAND_MSG_TYPE_MC_TEAM_SETUP_REQ -- confirm */
nvlink_inband_mc_team_setup_req_t mcTeamSetupReq; /* Request payload (variable length, must stay last) */
} nvlink_inband_mc_team_setup_req_msg_t;
/* Payload of a multicast team setup response (wrapped by nvlink_inband_mc_team_setup_rsp_msg_t). */
typedef struct
{
NvU64 mcTeamHandle; /* Unique handle assigned for this Multicast team */
NvU32 flags; /* For future use. Must be initialized to zero */
NvU8 reserved[8]; /* For future use. Must be initialized to zero */
NvU64 mcAddressBase; /* FLA starting address assigned for the Multicast slot */
NvU64 mcAddressSize; /* Size of FLA assigned to the Multicast slot */
} nvlink_inband_mc_team_setup_rsp_t;
/* Complete multicast team setup response message: common header followed by the setup response payload. */
typedef struct
{
nvlink_inband_msg_header_t msgHdr; /* Common header; presumably type is NVLINK_INBAND_MSG_TYPE_MC_TEAM_SETUP_RSP -- confirm */
nvlink_inband_mc_team_setup_rsp_t mcTeamSetupRsp; /* Response payload */
} nvlink_inband_mc_team_setup_rsp_msg_t;
/*
 * Payload of a multicast team release request (wrapped by
 * nvlink_inband_mc_team_release_req_msg_t).
 * NOTE(review): mcTeamHandle is presumably the handle returned in
 * nvlink_inband_mc_team_setup_rsp_t::mcTeamHandle -- confirm.
 */
typedef struct
{
NvU64 mcTeamHandle; /* Unique handle assigned for the Multicast team */
NvU32 flags; /* For future use. Must be initialized to zero */
NvU8 reserved[8]; /* For future use. Must be initialized to zero */
} nvlink_inband_mc_team_release_req_t;
/* Complete multicast team release request message: common header followed by the release request payload. */
typedef struct
{
nvlink_inband_msg_header_t msgHdr; /* Common header; presumably type is NVLINK_INBAND_MSG_TYPE_MC_TEAM_RELEASE_REQ -- confirm */
nvlink_inband_mc_team_release_req_t mcTeamReleaseReq; /* Request payload */
} nvlink_inband_mc_team_release_req_msg_t;
/* Restore the packing mode that was saved by the pack(push, 1) above. */
#pragma pack(pop)
/* Don't add any code after this line */
#endif /* NVLINK_INBAND_MSG_HDR_H */

View File

@@ -108,7 +108,7 @@ struct nvlink_device
// Device type and status
NvU64 type;
NvBool initialized;
// Training type: ALI or Non-ALI
NvBool enableALI;
@@ -328,7 +328,7 @@ typedef struct nvlink_inband_data nvlink_inband_data;
#define NVLINK_LINKSTATE_CONTAIN 0x19 // TL is in contain mode
#define NVLINK_LINKSTATE_INITTL 0x1A // INITTL
#define NVLINK_LINKSTATE_INITPHASE5 0x1B // INITPHASE5
#define NVLINK_LINKSTATE_ALI 0x1C // ALI
#define NVLINK_LINKSTATE_ALI 0x1C // ALI
#define NVLINK_LINKSTATE_ACTIVE_PENDING 0x1D // Intermediate state for a link going to active
#define NVLINK_LINKSTATE_INVALID 0xFF // Invalid state

View File

@@ -89,7 +89,7 @@ typedef struct
typedef struct
{
NvU16 nodeId;
NvU32 linkIndex;
NvU16 linkIndex;
nvlink_pci_dev_info pciInfo;
} nvlink_endpoint;
@@ -117,7 +117,7 @@ typedef struct
typedef struct
{
NvU16 nodeId;
NvU32 linkIndex;
NvU16 linkIndex;
nvlink_pci_dev_info pciInfo;
NvU8 devUuid[NVLINK_UUID_LEN];
NvU32 devType;
@@ -189,9 +189,9 @@ typedef enum
/* link and sublink state of an nvlink endpoint */
typedef struct
{
NvU32 linkMode;
NvU32 txSubLinkMode;
NvU32 rxSubLinkMode;
NvU8 linkMode;
NvU8 txSubLinkMode;
NvU8 rxSubLinkMode;
} nvlink_link_state;
/*
@@ -354,7 +354,7 @@ typedef struct
*/
typedef struct
{
NvU32 linkIndex;
NvU16 linkIndex;
NvBool initStatus;
} nvlink_link_init_status;
@@ -503,7 +503,7 @@ typedef struct
*/
typedef struct
{
NvU32 linkIndex;
NvU16 linkIndex;
NV_DECLARE_ALIGNED(NvU64 tokenValue, 8);
} nvlink_token_info;
@@ -1111,6 +1111,11 @@ typedef struct
NvU32 endStatesCount;
} nvlink_get_device_link_states;
/*
* Note: Verify that new parameter structs for IOCTLs satisfy
* sizing restrictions for all OSs they could be used in.
*/
#define CTRL_NVLINK_CHECK_VERSION 0x01
#define CTRL_NVLINK_SET_NODE_ID 0x02
#define CTRL_NVLINK_SET_TX_COMMON_MODE 0x03

View File

@@ -1,5 +1,5 @@
/*******************************************************************************
Copyright (c) 2019-2020 NVidia Corporation
Copyright (c) 2019-2022 NVidia Corporation
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to
@@ -41,6 +41,11 @@ nvlink_core_get_intranode_conn
{
nvlink_intranode_conn *tmpConn = NULL;
if ((endpoint == NULL) || (conn == NULL))
{
return;
}
FOR_EACH_CONNECTION(tmpConn, nvlinkLibCtx.nv_intraconn_head, node)
{
if (tmpConn->end0 == endpoint || tmpConn->end1 == endpoint)
@@ -66,6 +71,11 @@ nvlink_core_get_internode_conn
{
nvlink_internode_conn *tmpConn = NULL;
if ((localLink == NULL) || (conn == NULL))
{
return;
}
FOR_EACH_CONNECTION(tmpConn, nvlinkLibCtx.nv_interconn_head, node)
{
if (tmpConn->local_end == localLink)
@@ -93,6 +103,11 @@ nvlink_core_add_intranode_conn
{
nvlink_intranode_conn *conn = NULL;
if ((end0 == NULL) || (end1 == NULL))
{
return NVL_BAD_ARGS;
}
// Don't do anything if we already have an intranode connection
nvlink_core_get_intranode_conn(end0, &conn);
@@ -163,6 +178,11 @@ nvlink_core_add_internode_conn
{
nvlink_internode_conn *conn = NULL;
if ((localLink == NULL) || (remoteEndPoint == NULL))
{
return NVL_BAD_ARGS;
}
// Don't do anything if we already have an internode connection for the local link
nvlink_core_get_internode_conn(localLink, &conn);
if (conn != NULL)
@@ -208,6 +228,9 @@ nvlink_core_remove_intranode_conn
nvlink_intranode_conn *conn
)
{
if (conn == NULL)
return;
// Remove the connection from the list of connections
nvListDel(&conn->node);
@@ -245,6 +268,9 @@ nvlink_core_remove_internode_conn
{
nvlink_internode_conn *conn = NULL;
if (localLink == NULL)
return;
nvlink_core_get_internode_conn(localLink, &conn);
if (conn != NULL)
@@ -269,6 +295,11 @@ nvlink_core_check_intranode_conn_state
NvU64 linkMode
)
{
if (conn == NULL)
{
return NVL_BAD_ARGS;
}
switch (linkMode)
{
case NVLINK_LINKSTATE_OFF:
@@ -485,6 +516,11 @@ nvlink_core_copy_intranode_conn_info
nvlink_conn_info *conn_info
)
{
if ((remote_end == NULL) || (conn_info == NULL))
{
return;
}
// copy the remote device pci information
conn_info->domain = remote_end->dev->pciInfo.domain;
conn_info->bus = remote_end->dev->pciInfo.bus;
@@ -520,6 +556,11 @@ nvlink_core_copy_internode_conn_info
nvlink_conn_info *conn_info
)
{
if ((remote_end == NULL) || (conn_info == NULL))
{
return;
}
// copy the remote device pci information
conn_info->domain = remote_end->pciInfo.domain;
conn_info->bus = remote_end->pciInfo.bus;

View File

@@ -54,6 +54,12 @@ nvlink_core_discover_and_get_remote_end
nvlink_device *dev = NULL;
nvlink_link *link = NULL;
NvU32 linkCount = 0;
if ((end == NULL) || (remote_end == NULL))
{
return;
}
nvlink_link **pLinks = (nvlink_link **)nvlink_malloc(
sizeof(nvlink_link *) * NVLINK_MAX_SYSTEM_LINK_NUM);
if (pLinks == NULL)
@@ -224,12 +230,11 @@ _nvlink_core_discover_topology(void)
isTokenFound = NV_TRUE;
//
// If a token is found mark bInitnegotiateConfigGood as
// True since we can only finish off discovery if
// INITNEGOTIATE has finished in order to get topology info from
// MINION
// If R4 tokens were used for NVLink3.0+, then mark initnegotiate
// passed, since ALT training won't get kicked off without it.
//
if ((end0->version >= NVLINK_DEVICE_VERSION_30))
if ((end0->version >= NVLINK_DEVICE_VERSION_30) &&
((end0->localSid == 0) || (end0->remoteSid == 0)))
{
end0->bInitnegotiateConfigGood = NV_TRUE;
end1->bInitnegotiateConfigGood = NV_TRUE;

View File

@@ -46,7 +46,11 @@ nvlink_core_init_links_from_off_to_swcfg
NvU32 i;
// Sanity check the links array
nvlink_assert(pLinks != NULL);
if (pLinks == NULL)
{
nvlink_assert(0);
return;
}
// Return early if there are no links to initialize
if (numLinks == 0)
@@ -65,6 +69,9 @@ nvlink_core_init_links_from_off_to_swcfg
{
for (i = 0; i < numLinks; i++)
{
if (pLinks[i] == NULL)
continue;
status = pLinks[i]->link_handlers->get_dl_link_mode(pLinks[i], &linkMode);
if ((status != NVL_SUCCESS) ||
(linkMode == NVLINK_LINKSTATE_FAIL) || (linkMode == NVLINK_LINKSTATE_FAULT))
@@ -84,6 +91,9 @@ nvlink_core_init_links_from_off_to_swcfg
{
for (i = 0; i < numLinks; i++)
{
if (pLinks[i] == NULL)
continue;
// If receiver detect has passed for the link, move to next link
if (pLinks[i]->bRxDetected)
continue;
@@ -107,6 +117,9 @@ nvlink_core_init_links_from_off_to_swcfg
{
for (i = 0; i < numLinks; i++)
{
if (pLinks[i] == NULL)
continue;
// In NVLink3.0 and 3.1, RXDET must be called serially - done above (Bug 2546220)
if (!((pLinks[i]->version == NVLINK_DEVICE_VERSION_30) ||
(pLinks[i]->version == NVLINK_DEVICE_VERSION_31)))
@@ -143,6 +156,9 @@ nvlink_core_init_links_from_off_to_swcfg
{
for (i = 0; i < numLinks; i++)
{
if (pLinks[i] == NULL)
continue;
// If receiver detect failed for the link, move to next link
if (!pLinks[i]->bRxDetected || pLinks[i]->bTxCommonModeFail)
continue;
@@ -172,6 +188,9 @@ nvlink_core_init_links_from_off_to_swcfg
{
for (i = 0; i < numLinks; i++)
{
if (pLinks[i] == NULL)
continue;
// If receiver detect failed for the link, move to next link
if (!pLinks[i]->bRxDetected || pLinks[i]->bTxCommonModeFail)
continue;
@@ -190,6 +209,9 @@ nvlink_core_init_links_from_off_to_swcfg
// Put the links in SAFE mode
for (i = 0; i < numLinks; i++)
{
if (pLinks[i] == NULL)
continue;
// If receiver detect failed for the link, move to next link
if (!pLinks[i]->bRxDetected || pLinks[i]->bTxCommonModeFail)
continue;
@@ -243,6 +265,9 @@ nvlink_core_init_links_from_off_to_swcfg
// Poll for links to enter SAFE mode
for (i = 0; i < numLinks; i++)
{
if (pLinks[i] == NULL)
continue;
status = nvlink_core_wait_for_link_init(pLinks[i]);
if (status == NVL_SUCCESS)
{
@@ -275,7 +300,11 @@ nvlink_core_init_links_from_off_to_swcfg_non_ALI
NvU32 i;
// Sanity check the links array
nvlink_assert(pLinks != NULL);
if (pLinks == NULL)
{
nvlink_assert(0);
return;
}
// Return early if there are no links to initialize
if (numLinks == 0)
@@ -294,6 +323,9 @@ nvlink_core_init_links_from_off_to_swcfg_non_ALI
{
for (i = 0; i < numLinks; i++)
{
if (pLinks[i] == NULL)
continue;
status = pLinks[i]->link_handlers->get_dl_link_mode(pLinks[i], &linkMode);
if ((status != NVL_SUCCESS) ||
(linkMode == NVLINK_LINKSTATE_FAIL) || (linkMode == NVLINK_LINKSTATE_FAULT))
@@ -313,6 +345,9 @@ nvlink_core_init_links_from_off_to_swcfg_non_ALI
{
for (i = 0; i < numLinks; i++)
{
if (pLinks[i] == NULL)
continue;
// In NVLink3.0 and 3.1, RXDET must be called serially - done above (Bug 2546220)
if (!((pLinks[i]->version == NVLINK_DEVICE_VERSION_30) ||
(pLinks[i]->version == NVLINK_DEVICE_VERSION_31)))
@@ -348,6 +383,9 @@ nvlink_core_init_links_from_off_to_swcfg_non_ALI
{
for (i = 0; i < numLinks; i++)
{
if (pLinks[i] == NULL)
continue;
// If receiver detect failed for the link, move to next link
if (!pLinks[i]->bRxDetected || pLinks[i]->bTxCommonModeFail)
continue;
@@ -371,6 +409,9 @@ nvlink_core_init_links_from_off_to_swcfg_non_ALI
{
for (i = 0; i < numLinks; i++)
{
if (pLinks[i] == NULL)
continue;
// If receiver detect failed for the link, move to next link
if (!pLinks[i]->bRxDetected || pLinks[i]->bTxCommonModeFail || pLinks[i]->bInitphase5Fails)
continue;
@@ -389,6 +430,9 @@ nvlink_core_init_links_from_off_to_swcfg_non_ALI
// Step 5: Put the links in SAFE mode
for (i = 0; i < numLinks; i++)
{
if (pLinks[i] == NULL)
continue;
// If receiver detect failed for the link, move to next link
if (!pLinks[i]->bRxDetected || pLinks[i]->bTxCommonModeFail || pLinks[i]->bInitphase5Fails)
continue;
@@ -442,6 +486,9 @@ nvlink_core_init_links_from_off_to_swcfg_non_ALI
// Poll for links to enter SAFE mode
for (i = 0; i < numLinks; i++)
{
if (pLinks[i] == NULL)
continue;
status = nvlink_core_wait_for_link_init(pLinks[i]);
if (status == NVL_SUCCESS)
{
@@ -473,7 +520,11 @@ nvlink_core_initphase5
NvU32 i;
// Sanity check the links array
nvlink_assert(links != NULL);
if (links == NULL)
{
nvlink_assert(0);
return NVL_BAD_ARGS;
}
// Return early if link array is empty
if (numLinks == 0)
@@ -489,6 +540,9 @@ nvlink_core_initphase5
NvlStatus status = NVL_SUCCESS;
NvU64 dlLinkMode = 0;
if (links[i] == NULL)
continue;
// INITPHASE5 is supported only for NVLINK version >= 4.0
if (links[i]->version < NVLINK_DEVICE_VERSION_40)
continue;
@@ -562,7 +616,11 @@ nvlink_core_initphase1
NvU32 i;
// Sanity check the links array
nvlink_assert(links != NULL);
if (links == NULL)
{
nvlink_assert(0);
return NVL_BAD_ARGS;
}
// Return early if link array is empty
if (numLinks == 0)
@@ -584,6 +642,9 @@ nvlink_core_initphase1
NvU32 rxSubMode = 0;
NvBool bPhyUnlocked = NV_FALSE;
if (links[i] == NULL)
continue;
// INITPHASE1 is supported only for NVLINK version >= 3.0
if (links[i]->version < NVLINK_DEVICE_VERSION_30)
continue;
@@ -690,7 +751,11 @@ nvlink_core_rx_init_term
NvU32 i;
// Sanity check the links array
nvlink_assert(links != NULL);
if (links == NULL)
{
nvlink_assert(0);
return NVL_BAD_ARGS;
}
// Return early if link array is empty
if (numLinks == 0)
@@ -705,6 +770,9 @@ nvlink_core_rx_init_term
{
NvU64 linkMode = NVLINK_LINKSTATE_OFF;
if (links[i] == NULL)
continue;
if (links[i]->version < NVLINK_DEVICE_VERSION_22)
continue;
@@ -797,7 +865,11 @@ nvlink_core_set_rx_detect
NvU32 i;
// Sanity check the links array
nvlink_assert(links != NULL);
if (links == NULL)
{
nvlink_assert(0);
return NVL_BAD_ARGS;
}
// Return early if link array is empty
if (numLinks == 0)
@@ -813,6 +885,9 @@ nvlink_core_set_rx_detect
NvlStatus status = NVL_SUCCESS;
NvU64 linkMode = NVLINK_LINKSTATE_OFF;
if (links[i] == NULL)
continue;
if (links[i]->version < NVLINK_DEVICE_VERSION_22)
continue;
@@ -933,7 +1008,11 @@ nvlink_core_get_rx_detect
NvU32 i;
// Sanity check the links array
nvlink_assert(links != NULL);
if (links == NULL)
{
nvlink_assert(0);
return NVL_BAD_ARGS;
}
// Return early if link array is empty
if (numLinks == 0)
@@ -948,6 +1027,9 @@ nvlink_core_get_rx_detect
{
NvU64 linkMode = NVLINK_LINKSTATE_OFF;
if (links[i] == NULL)
continue;
// If receiver detect has passed for the link, move to next link
if (links[i]->bRxDetected)
continue;
@@ -1048,7 +1130,11 @@ nvlink_core_enable_common_mode
NvU32 i;
// Sanity check the links array
nvlink_assert(links != NULL);
if (links == NULL)
{
nvlink_assert(0);
return NVL_BAD_ARGS;
}
// Return early if link array is empty
if (numLinks == 0)
@@ -1063,6 +1149,9 @@ nvlink_core_enable_common_mode
{
NvU64 linkMode = NVLINK_LINKSTATE_OFF;
if (links[i] == NULL)
continue;
if (!links[i]->bRxDetected)
{
// link did not pass RXDET, don't do anything
@@ -1157,7 +1246,11 @@ nvlink_core_calibrate_links
NvU32 i;
// Sanity check the links array
nvlink_assert(links != NULL);
if (links == NULL)
{
nvlink_assert(0);
return NVL_BAD_ARGS;
}
// Return early if link array is empty
if (numLinks == 0)
@@ -1173,6 +1266,9 @@ nvlink_core_calibrate_links
NvlStatus status = NVL_SUCCESS;
NvU64 linkMode = NVLINK_LINKSTATE_OFF;
if (links[i] == NULL)
continue;
// If receiver detect failed for the link, move to next link
if (!links[i]->bRxDetected || links[i]->bTxCommonModeFail)
continue;
@@ -1260,7 +1356,11 @@ nvlink_core_disable_common_mode
NvU32 i;
// Sanity check the links array
nvlink_assert(links != NULL);
if (links == NULL)
{
nvlink_assert(0);
return NVL_BAD_ARGS;
}
// Return early if link array is empty
if (numLinks == 0)
@@ -1276,6 +1376,9 @@ nvlink_core_disable_common_mode
NvlStatus status = NVL_SUCCESS;
NvU64 linkMode = NVLINK_LINKSTATE_OFF;
if (links[i] == NULL)
continue;
if (!links[i]->bRxDetected || links[i]->bTxCommonModeFail)
{
// link did not pass RXDET or failed in common mode, don't do anything
@@ -1368,7 +1471,11 @@ nvlink_core_enable_data
NvU32 i;
// Sanity check the links array
nvlink_assert(links != NULL);
if (links == NULL)
{
nvlink_assert(0);
return NVL_BAD_ARGS;
}
// Return early if link array is empty
if (numLinks == 0)
@@ -1384,6 +1491,9 @@ nvlink_core_enable_data
NvlStatus status = NVL_SUCCESS;
NvU64 linkMode = NVLINK_LINKSTATE_OFF;
if (links[i] == NULL)
continue;
// If receiver detect failed for the link, move to next link
if (!links[i]->bRxDetected || links[i]->bTxCommonModeFail)
continue;
@@ -1473,7 +1583,11 @@ nvlink_core_initnegotiate
NvU32 i;
// Sanity check the links array
nvlink_assert(links != NULL);
if (links == NULL)
{
nvlink_assert(0);
return NVL_BAD_ARGS;
}
// Return early if link array is empty
if (numLinks == 0)
@@ -1489,6 +1603,9 @@ nvlink_core_initnegotiate
NvlStatus status = NVL_SUCCESS;
NvU64 linkMode = NVLINK_LINKSTATE_OFF;
if (links[i] == NULL)
continue;
// If receiver detect failed for the link, move to next link
if (!links[i]->bRxDetected || links[i]->bTxCommonModeFail ||
links[i]->bSafeTransitionFail || links[i]->bInitphase5Fails)
@@ -1543,6 +1660,9 @@ nvlink_core_initnegotiate
NvlStatus status = NVL_SUCCESS;
NvU64 linkMode = NVLINK_LINKSTATE_OFF;
if (links[i] == NULL)
continue;
// If receiver detect failed for the link, move to next link
if (!links[i]->bRxDetected || links[i]->bTxCommonModeFail ||
links[i]->bSafeTransitionFail)
@@ -1606,6 +1726,12 @@ nvlink_core_wait_for_link_init
NvlStatus status = NVL_SUCCESS;
NvU64 linkMode = NVLINK_LINKSTATE_OFF;
if (link == NULL)
{
nvlink_assert(0);
return NVL_BAD_ARGS;
}
//
// Check for SW fail flags to exit early
//

View File

@@ -298,6 +298,11 @@ nvlink_core_get_endpoint_state
NvU64 dlState = NVLINK_LINKSTATE_INVALID;
NvU64 tlState = NVLINK_LINKSTATE_INVALID;
if ((link == NULL) || (linkState == NULL))
{
return;
}
//
// This is a best case effort to return the current state of the link
// to user as part of the ioctl call. Typically, this call should succeed
@@ -343,6 +348,11 @@ nvlink_core_get_device_by_devinfo
{
nvlink_device *tmpDev = NULL;
// NOTE(review): both operands below test devInfo, so the second comparison is
// redundant. It was presumably meant to validate a different argument of this
// function (the parameter list is not visible in this hunk) -- confirm and fix.
if ((devInfo == NULL) || (devInfo == NULL))
{
return;
}
FOR_EACH_DEVICE_REGISTERED(tmpDev, nvlinkLibCtx.nv_devicelist_head, node)
{
if ( (tmpDev->nodeId == devInfo->nodeId) &&
@@ -376,6 +386,11 @@ nvlink_core_get_link_by_endpoint
nvlink_device *tmpDev = NULL;
nvlink_link *tmpLink = NULL;
if ((endPoint == NULL) || (link == NULL))
{
return;
}
FOR_EACH_DEVICE_REGISTERED(tmpDev, nvlinkLibCtx.nv_devicelist_head, node)
{
if ((tmpDev->nodeId == endPoint->nodeId) &&
@@ -412,6 +427,11 @@ nvlink_core_copy_endpoint_info
nvlink_endpoint *endPointInfo
)
{
if ((connLink == NULL) || (endPointInfo == NULL))
{
return;
}
nvlink_device *dev = connLink->dev;
endPointInfo->pciInfo.domain = dev->pciInfo.domain;
@@ -435,6 +455,11 @@ nvlink_core_copy_device_info
nvlink_detailed_dev_info *devInfo
)
{
if ((tmpDev == NULL) || (devInfo == NULL))
{
return;
}
devInfo->pciInfo.domain = tmpDev->pciInfo.domain;
devInfo->pciInfo.bus = tmpDev->pciInfo.bus;
devInfo->pciInfo.device = tmpDev->pciInfo.device;
@@ -478,13 +503,20 @@ nvlink_core_link_init_async
NvU32 i;
// Sanity check the links array for non-zero links
nvlink_assert((links != NULL) && (numLinks > 0));
if ((links == NULL) || (numLinks == 0))
{
nvlink_assert(0);
return NVL_BAD_ARGS;
}
for (i = 0; i < numLinks; i++)
{
NvlStatus status = NVL_SUCCESS;
NvU64 linkMode = NVLINK_LINKSTATE_OFF;
if (links[i] == NULL)
continue;
if (!links[i]->bRxDetected || links[i]->bTxCommonModeFail)
{
// link did not pass RXDET or failed in common mode, don't do anything
@@ -540,6 +572,9 @@ nvlink_core_get_link_discovery_token
{
NvU64 token = 0;
if (link == NULL)
return token;
//
// generate a unique token value for discovering connections.
// link->token is the memory address of the allocated link object,
@@ -570,6 +605,11 @@ nvlink_core_write_link_discovery_token
NvlStatus status = NVL_SUCCESS;
NvU64 linkMode = NVLINK_LINKSTATE_OFF;
if (link == NULL)
{
return NVL_BAD_ARGS;
}
// Packet injection can only happen if link is in SWCFG/ACTIVE
status = link->link_handlers->get_dl_link_mode(link, &linkMode);
if (status != NVL_SUCCESS)
@@ -613,6 +653,11 @@ nvlink_core_read_link_discovery_token
NvlStatus status = NVL_SUCCESS;
NvU64 linkMode = NVLINK_LINKSTATE_OFF;
if (link == NULL)
{
return 0;
}
status = link->link_handlers->get_dl_link_mode(link, &linkMode);
if (status != NVL_SUCCESS)
{
@@ -654,6 +699,11 @@ nvlink_core_correlate_conn_by_token
nvlink_link *dstLink = NULL;
NvU64 readToken = 0;
if (srcLink == NULL)
{
return;
}
FOR_EACH_DEVICE_REGISTERED(dev, nvlinkLibCtx.nv_devicelist_head, node)
{
FOR_EACH_LINK_REGISTERED(dstLink, dev, node)

View File

@@ -45,6 +45,11 @@ nvlink_core_check_link_state
NvU64 crntTlLinkMode = NVLINK_LINKSTATE_OFF;
NvlStatus status = NVL_SUCCESS;
if (link == NULL)
{
return NV_FALSE;
}
switch (linkState)
{
case NVLINK_LINKSTATE_OFF:
@@ -129,6 +134,11 @@ nvlink_core_check_tx_sublink_state
NvU64 crntTxSublinkMode = NVLINK_SUBLINK_STATE_TX_OFF;
NvU32 crntTxSublinkSubMode = NVLINK_SUBLINK_SUBSTATE_TX_STABLE;
if (link == NULL)
{
return NV_FALSE;
}
status = link->link_handlers->get_tx_mode(link,
&crntTxSublinkMode,
&crntTxSublinkSubMode);
@@ -194,6 +204,11 @@ nvlink_core_check_rx_sublink_state
NvU64 crntRxSublinkMode = NVLINK_SUBLINK_STATE_RX_OFF;
NvU32 crntRxSublinkSubMode = NVLINK_SUBLINK_SUBSTATE_RX_STABLE;
if (link == NULL)
{
return NV_FALSE;
}
status = link->link_handlers->get_rx_mode(link,
&crntRxSublinkMode,
&crntRxSublinkSubMode);
@@ -258,6 +273,11 @@ nvlink_core_poll_link_state
{
NvU64 currentLinkState = ~0;
if (link == NULL)
{
return NVL_BAD_ARGS;
}
link->link_handlers->get_dl_link_mode(link, &currentLinkState);
while (currentLinkState != linkState)
@@ -316,6 +336,11 @@ nvlink_core_poll_sublink_state
{
NvlStatus status = NVL_SUCCESS;
// NOTE(review): this guard rejects a NULL localTxSubLink or remoteRxSubLink,
// yet the code that follows still tests `if (localTxSubLink)` under the comment
// "if a valid link is specified", which suggests NULL used to mean "skip this
// side". Confirm no caller passes NULL for only one side on purpose.
if ((localTxSubLink == NULL) || (remoteRxSubLink == NULL))
{
return NVL_BAD_ARGS;
}
// check for tx sublink if a valid link is specified
if (localTxSubLink)
{
@@ -369,6 +394,11 @@ nvlink_core_poll_tx_sublink_state
NvU64 currentTxSublinkState = ~0;
NvU32 currentTxSublinkSubState = ~0;
if (link == NULL)
{
return NVL_BAD_ARGS;
}
link->link_handlers->get_tx_mode(link,
&currentTxSublinkState,
&currentTxSublinkSubState);
@@ -427,6 +457,11 @@ nvlink_core_poll_rx_sublink_state
NvU64 currentRxSublinkState = ~0;
NvU32 currentRxSublinkSubState = ~0;
if (link == NULL)
{
return NVL_BAD_ARGS;
}
link->link_handlers->get_rx_mode(link,
&currentRxSublinkState,
&currentRxSublinkSubState);

View File

@@ -45,6 +45,9 @@ nvlink_core_print_link_state
NvU32 txSublinkSubMode = 0;
NvU32 rxSublinkSubMode = 0;
if (link == NULL)
return;
link->link_handlers->get_dl_link_mode(link, &linkMode);
link->link_handlers->get_tx_mode(link, &txSublinkMode, &txSublinkSubMode);
link->link_handlers->get_rx_mode(link, &rxSublinkMode, &rxSublinkSubMode);
@@ -86,6 +89,9 @@ _nvlink_core_print_link
nvlink_link *link
)
{
if (link == NULL)
return;
switch (link->dev->type)
{
case NVLINK_DEVICE_TYPE_GPU:

View File

@@ -61,6 +61,9 @@ nvlink_core_powerdown_intranode_conns_from_active_to_L2
return NVL_ERR_GENERIC;
}
if (conns[0] == NULL)
return NVL_ERR_GENERIC;
// Set the version. Currently, only one version is supported on a chip
version = conns[0]->end0->version;
@@ -71,6 +74,9 @@ nvlink_core_powerdown_intranode_conns_from_active_to_L2
{
for (i = 0; i < connCount; i++)
{
if (conns[i] == NULL)
continue;
status = nvlink_core_check_intranode_conn_state(conns[i], NVLINK_LINKSTATE_HS);
if ((status == NVL_SUCCESS) || (status == NVL_ERR_INVALID_STATE))
{
@@ -98,6 +104,9 @@ nvlink_core_powerdown_intranode_conns_from_active_to_L2
// STEP 0: Disable HeartBeat on the endpoints of all connections
for (i = 0; i < connCount; i++)
{
if (conns[i] == NULL)
continue;
conns[i]->end0->link_handlers->set_dl_link_mode(conns[i]->end0,
NVLINK_LINKSTATE_DISABLE_HEARTBEAT,
flags);
@@ -114,6 +123,9 @@ nvlink_core_powerdown_intranode_conns_from_active_to_L2
// STEP 1: Disable PM on the endpoints of all connections
for (i = 0; i < connCount; i++)
{
if (conns[i] == NULL)
continue;
conns[i]->end0->link_handlers->set_dl_link_mode(conns[i]->end0,
NVLINK_LINKSTATE_DISABLE_PM,
flags);
@@ -132,6 +144,9 @@ nvlink_core_powerdown_intranode_conns_from_active_to_L2
{
for (i = 0; i < connCount; i++)
{
if (conns[i] == NULL)
continue;
status = conns[i]->end0->link_handlers->get_dl_link_mode(conns[i]->end0, &linkMode);
if ((status != NVL_SUCCESS) ||
(linkMode == NVLINK_LINKSTATE_FAIL) || (linkMode == NVLINK_LINKSTATE_FAULT))
@@ -155,6 +170,9 @@ nvlink_core_powerdown_intranode_conns_from_active_to_L2
// Check for each connection, if both the ends and their sublinks are in HS mode
for (i = 0; i < connCount; i++)
{
if (conns[i] == NULL)
continue;
status = nvlink_core_check_intranode_conn_state(conns[i], NVLINK_LINKSTATE_HS);
if (status == NVL_ERR_INVALID_STATE)
{
@@ -186,6 +204,9 @@ nvlink_core_powerdown_intranode_conns_from_active_to_L2
//
for (i = 0; i < connCount; i++)
{
if (conns[i] == NULL)
continue;
// Wait for the end0 to go to SWCFG
status = nvlink_core_poll_link_state(conns[i]->end0,
NVLINK_LINKSTATE_SAFE,
@@ -212,6 +233,9 @@ nvlink_core_powerdown_intranode_conns_from_active_to_L2
// STEP 3: Change sub-link state to SAFE on all endpoints
for (i = 0; i < connCount; i++)
{
if (conns[i] == NULL)
continue;
conns[i]->end0->link_handlers->set_tx_mode(conns[i]->end0,
NVLINK_SUBLINK_STATE_TX_SAFE,
flags);
@@ -228,6 +252,9 @@ nvlink_core_powerdown_intranode_conns_from_active_to_L2
// Poll for all endpoints sub-link state to reach SAFE
for (i = 0; i < connCount; i++)
{
if (conns[i] == NULL)
continue;
// Wait for sublinks to go to SAFE
status = nvlink_core_poll_sublink_state(conns[i]->end0,
NVLINK_SUBLINK_STATE_TX_SAFE,
@@ -261,6 +288,9 @@ nvlink_core_powerdown_intranode_conns_from_active_to_L2
// STEP 4: Save link state on all the endpoints
for (i = 0; i < connCount; i++)
{
if (conns[i] == NULL)
continue;
if (!conns[i]->end0->bStateSaved)
{
conns[i]->end0->link_handlers->set_dl_link_mode(conns[i]->end0,
@@ -281,6 +311,9 @@ nvlink_core_powerdown_intranode_conns_from_active_to_L2
{
for (i = 0; i < connCount; i++)
{
if (conns[i] == NULL)
continue;
status = conns[i]->end0->link_handlers->get_dl_link_mode(conns[i]->end0, &linkMode);
if ((status != NVL_SUCCESS) ||
(linkMode == NVLINK_LINKSTATE_FAIL) || (linkMode == NVLINK_LINKSTATE_FAULT))
@@ -304,6 +337,9 @@ nvlink_core_powerdown_intranode_conns_from_active_to_L2
// STEP 5: Trigger the sleep request on all the endpoints
for (i = 0; i < connCount; i++)
{
if (conns[i] == NULL)
continue;
//
// Send SLEEP request on one end of connection if not in loopback.
// Don't poll, since the transition will happen when both ends get the request
@@ -327,6 +363,9 @@ nvlink_core_powerdown_intranode_conns_from_active_to_L2
// Finally check the connection states
for (i = 0; i < connCount; i++)
{
if (conns[i] == NULL)
continue;
status = nvlink_core_check_intranode_conn_state(conns[i], NVLINK_LINKSTATE_SLEEP);
if (status != NVL_SUCCESS)
{
@@ -392,6 +431,9 @@ nvlink_core_powerdown_intranode_conns_from_active_to_off
for (i = 0; i < connCount; i++)
{
if (conns[i] == NULL)
continue;
// Disable Power Management before moving link out of Active
conns[i]->end0->link_handlers->set_dl_link_mode(conns[i]->end0,
NVLINK_LINKSTATE_DISABLE_PM,
@@ -418,6 +460,9 @@ nvlink_core_powerdown_intranode_conns_from_active_to_off
// Poll for links to reach SWCFG & initiate sublinks to SAFE state
for (i = 0; i < connCount; i++)
{
if (conns[i] == NULL)
continue;
// Wait for the end0 to go to SWCFG
status = nvlink_core_poll_link_state(conns[i]->end0,
NVLINK_LINKSTATE_SAFE,
@@ -430,7 +475,7 @@ nvlink_core_powerdown_intranode_conns_from_active_to_off
// to track Failure
conns[i]->end0->inSWCFG = NV_FALSE;
}
}
else
{
conns[i]->end0->inSWCFG = NV_TRUE;
@@ -448,14 +493,14 @@ nvlink_core_powerdown_intranode_conns_from_active_to_off
// to track Failure
conns[i]->end1->inSWCFG = NV_FALSE;
}
}
else
{
conns[i]->end1->inSWCFG = NV_TRUE;
}
// Change each sublink state to SAFE
if(conns[i]->end0->inSWCFG == NV_TRUE)
if(conns[i]->end0->inSWCFG == NV_TRUE)
{
conns[i]->end0->link_handlers->set_tx_mode(conns[i]->end0,
NVLINK_SUBLINK_STATE_TX_SAFE,
@@ -473,6 +518,9 @@ nvlink_core_powerdown_intranode_conns_from_active_to_off
// Poll for sublinks to reach SAFE state
for (i = 0; i < connCount; i++)
{
if (conns[i] == NULL)
continue;
// Wait for sublinks to go to SAFE
if(conns[i]->end0->inSWCFG == NV_TRUE)
{
@@ -623,6 +671,9 @@ nvlink_core_powerdown_intranode_conns_from_active_to_swcfg
for (i = 0; i < connCount; i++)
{
if (conns[i] == NULL)
continue;
// Disable Power Management before moving link out of Active
conns[i]->end0->link_handlers->set_dl_link_mode(conns[i]->end0,
NVLINK_LINKSTATE_DISABLE_PM,
@@ -651,6 +702,9 @@ nvlink_core_powerdown_intranode_conns_from_active_to_swcfg
//
for (i = 0; i < connCount; i++)
{
if (conns[i] == NULL)
continue;
// Wait for the end0 to go to SWCFG
status = nvlink_core_poll_link_state(conns[i]->end0,
NVLINK_LINKSTATE_SAFE,
@@ -694,6 +748,9 @@ nvlink_core_powerdown_intranode_conns_from_active_to_swcfg
// Wait for sublinks to go to SAFE
for (i = 0; i < connCount; i++)
{
if (conns[i] == NULL)
continue;
status = nvlink_core_poll_sublink_state(conns[i]->end0,
NVLINK_SUBLINK_STATE_TX_SAFE,
NVLINK_SUBLINK_SUBSTATE_TX_STABLE,
@@ -736,6 +793,9 @@ nvlink_core_powerdown_intranode_conns_from_active_to_swcfg
// Update tracking info
for (i = 0; i < connCount; i++)
{
if (conns[i] == NULL)
continue;
NVLINK_PRINT((DBG_MODULE_NVLINK_CORE, NVLINK_DBG_LEVEL_INFO,
"%s: Connection is in SAFE mode. ",
__FUNCTION__));
@@ -779,6 +839,9 @@ nvlink_core_reset_intranode_conns
for (i = 0; i < connCount; i++)
{
if (conns[i] == NULL)
continue;
//
// Reset both ends of this connection.
// This path should enable/init those link endpoints as well.
@@ -825,6 +888,9 @@ _nvlink_core_clear_link_state
nvlink_link *link
)
{
if (link == NULL)
return;
// Receiver Detect needs to happen again
link->bRxDetected = NV_FALSE;
@@ -874,8 +940,8 @@ nvlink_core_powerdown_floorswept_conns_to_off
)
{
NvU32 i,j;
nvlink_intranode_conn **connsToShutdown;
nvlink_intranode_conn **visitedConns;
nvlink_intranode_conn **connsToShutdown = NULL;
nvlink_intranode_conn **visitedConns = NULL;
nvlink_intranode_conn *conn;
NvU32 connCount;
NvU32 numConnsToShutdown;
@@ -926,10 +992,15 @@ nvlink_core_powerdown_floorswept_conns_to_off
if (links[j]->linkNumber >= numLinksPerIoctrl*i &&
links[j]->linkNumber < numLinksPerIoctrl*(i+1))
{
conn = NULL;
nvlink_core_get_intranode_conn(links[j], &(conn));
if (conn == NULL ||
_nvlink_core_check_if_conn_in_array(visitedConns, connCount, conn))
_nvlink_core_check_if_conn_in_array(visitedConns, connCount, conn) ||
(conn->end0 == NULL || conn->end1 == NULL))
{
NVLINK_PRINT((DBG_MODULE_NVLINK_CORE, NVLINK_DBG_LEVEL_ERRORS,
"%s: AC debug -- conn gotten: 0x%x\n",
__FUNCTION__, conn));
continue;
}
else if(nvlink_core_check_intranode_conn_state(conn, NVLINK_LINKSTATE_OFF) ==

View File

@@ -34,14 +34,22 @@ NvlStatus
nvlink_core_train_check_link_ready_ALI
(
nvlink_link **links,
NvU32 linkCount
NvU32 linkCount
)
{
NvU32 i = 0;
NvU32 i = 0;
NvlStatus status = NVL_SUCCESS;
if (links == NULL)
{
return NVL_BAD_ARGS;
}
for (i = 0; i < linkCount; i++)
{
if (links[i] == NULL)
continue;
if (!nvlink_core_check_link_state(links[i], NVLINK_LINKSTATE_ALI))
{
// If the link is not in the active state, update status to an error and continue
@@ -89,6 +97,9 @@ nvlink_core_train_internode_conns_from_swcfg_to_active
for (i = 0; i < connCount; i++)
{
if (conns[i] == NULL)
continue;
// Don't do anything if the link is already at HS.
if ((nvlink_core_check_link_state(conns[i]->local_end, NVLINK_LINKSTATE_HS)) &&
(nvlink_core_check_tx_sublink_state(conns[i]->local_end,
@@ -133,7 +144,7 @@ nvlink_core_train_internode_conns_from_swcfg_to_active
for (i = 0; i < connCount; i++)
{
if (skipConn[i])
if ((conns[i] == NULL) || skipConn[i])
{
continue;
}
@@ -151,6 +162,8 @@ nvlink_core_train_internode_conns_from_swcfg_to_active
for (i = 0; i < connCount; i++)
{
if (conns[i] == NULL)
continue;
// Wait for the link state to change.
status = nvlink_core_poll_link_state(conns[i]->local_end,
@@ -202,6 +215,11 @@ nvlink_core_train_internode_conn_sublink_from_safe_to_hs
{
NvlStatus status = NVL_SUCCESS;
if (conn == NULL)
{
return NVL_BAD_ARGS;
}
// NVLink 3.0 onwards this is handled through INITOPTIMIZE, return error
if (conn->local_end->version >= NVLINK_DEVICE_VERSION_30)
{
@@ -328,6 +346,9 @@ nvlink_core_train_intranode_conns_from_from_L2_to_active
// STEP 1: Reset all endpoints of the links. This clears any link state
for (i = 0; i < connCount; i++)
{
if (conns[i] == NULL)
continue;
conns[i]->end0->link_handlers->set_dl_link_mode(conns[i]->end0,
NVLINK_LINKSTATE_RESET,
flags);
@@ -339,6 +360,9 @@ nvlink_core_train_intranode_conns_from_from_L2_to_active
// STEP 2: NVLink 3 and beyond, we also need to perform INITPHASE1
for (i = 0; i < connCount; i++)
{
if (conns[i] == NULL)
continue;
conns[i]->end0->link_handlers->set_dl_link_mode(conns[i]->end0,
NVLINK_LINKSTATE_INITPHASE1,
flags);
@@ -355,6 +379,9 @@ nvlink_core_train_intranode_conns_from_from_L2_to_active
{
for (i = 0; i < connCount; i++)
{
if (conns[i] == NULL)
continue;
status = conns[i]->end0->link_handlers->get_dl_link_mode(conns[i]->end0, &linkMode);
if ((status != NVL_SUCCESS) ||
(linkMode == NVLINK_LINKSTATE_FAIL) || (linkMode == NVLINK_LINKSTATE_FAULT))
@@ -374,6 +401,9 @@ nvlink_core_train_intranode_conns_from_from_L2_to_active
// Verify that all the endpoints are now in INIT state
for (i = 0; i < connCount; i++)
{
if (conns[i] == NULL)
continue;
status = nvlink_core_check_intranode_conn_state(conns[i], NVLINK_LINKSTATE_OFF);
if (status != NVL_SUCCESS)
{
@@ -389,6 +419,9 @@ nvlink_core_train_intranode_conns_from_from_L2_to_active
// STEP 3: Restore all end point state saved while entering SLEEP state
for (i = 0; i < connCount; i++)
{
if (conns[i] == NULL)
continue;
if (conns[i]->end0->bStateSaved)
{
conns[i]->end0->link_handlers->set_dl_link_mode(conns[i]->end0,
@@ -409,6 +442,9 @@ nvlink_core_train_intranode_conns_from_from_L2_to_active
{
for (i = 0; i < connCount; i++)
{
if (conns[i] == NULL)
continue;
status = conns[i]->end0->link_handlers->get_dl_link_mode(conns[i]->end0, &linkMode);
if ((status != NVL_SUCCESS) ||
(linkMode == NVLINK_LINKSTATE_FAIL) || (linkMode == NVLINK_LINKSTATE_FAULT))
@@ -428,6 +464,9 @@ nvlink_core_train_intranode_conns_from_from_L2_to_active
// STEP 4: Initialize RX Termination on all end points
for (i = 0; i < connCount; i++)
{
if (conns[i] == NULL)
continue;
conns[i]->end0->link_handlers->set_rx_mode(conns[i]->end0,
NVLINK_SUBLINK_STATE_RX_INIT_TERM,
flags);
@@ -441,6 +480,9 @@ nvlink_core_train_intranode_conns_from_from_L2_to_active
{
for (i = 0; i < connCount; i++)
{
if (conns[i] == NULL)
continue;
status = conns[i]->end0->link_handlers->get_dl_link_mode(conns[i]->end0, &linkMode);
if ((status != NVL_SUCCESS) ||
(linkMode == NVLINK_LINKSTATE_FAIL) || (linkMode == NVLINK_LINKSTATE_FAULT))
@@ -460,6 +502,9 @@ nvlink_core_train_intranode_conns_from_from_L2_to_active
// STEP 5: Enable Common mode on Tx's of all endpoints
for (i = 0; i < connCount; i++)
{
if (conns[i] == NULL)
continue;
if (!((conns[i]->end0->tx_sublink_state == NVLINK_SUBLINK_STATE_TX_COMMON_MODE) ||
(conns[i]->end0->tx_sublink_state == NVLINK_SUBLINK_STATE_TX_COMMON_MODE_DISABLE) ||
(conns[i]->end0->tx_sublink_state == NVLINK_SUBLINK_STATE_TX_DATA_READY)))
@@ -483,6 +528,9 @@ nvlink_core_train_intranode_conns_from_from_L2_to_active
{
for (i = 0; i < connCount; i++)
{
if (conns[i] == NULL)
continue;
status = conns[i]->end0->link_handlers->get_dl_link_mode(conns[i]->end0, &linkMode);
if ((status != NVL_SUCCESS) ||
(linkMode == NVLINK_LINKSTATE_FAIL) || (linkMode == NVLINK_LINKSTATE_FAULT))
@@ -502,6 +550,9 @@ nvlink_core_train_intranode_conns_from_from_L2_to_active
// STEP 6: Put all Rx's in RXCAL
for (i = 0; i < connCount; i++)
{
if (conns[i] == NULL)
continue;
if (conns[i]->end0->rx_sublink_state != NVLINK_SUBLINK_STATE_RX_RXCAL)
{
conns[i]->end0->link_handlers->set_rx_mode(conns[i]->end0,
@@ -519,6 +570,9 @@ nvlink_core_train_intranode_conns_from_from_L2_to_active
// STEP 7: Disable Tx common mode
for (i = 0; i < connCount; i++)
{
if (conns[i] == NULL)
continue;
if (!((conns[i]->end0->tx_sublink_state == NVLINK_SUBLINK_STATE_TX_COMMON_MODE_DISABLE) ||
(conns[i]->end0->tx_sublink_state == NVLINK_SUBLINK_STATE_TX_DATA_READY)))
{
@@ -538,6 +592,9 @@ nvlink_core_train_intranode_conns_from_from_L2_to_active
// STEP 8: Set Data Ready and Enable
for (i = 0; i < connCount; i++)
{
if (conns[i] == NULL)
continue;
if (conns[i]->end0->tx_sublink_state != NVLINK_SUBLINK_STATE_TX_DATA_READY)
{
conns[i]->end0->link_handlers->set_tx_mode(conns[i]->end0,
@@ -557,6 +614,9 @@ nvlink_core_train_intranode_conns_from_from_L2_to_active
{
for (i = 0; i < connCount; i++)
{
if (conns[i] == NULL)
continue;
status = conns[i]->end0->link_handlers->get_dl_link_mode(conns[i]->end0, &linkMode);
if ((status != NVL_SUCCESS) ||
(linkMode == NVLINK_LINKSTATE_FAIL) || (linkMode == NVLINK_LINKSTATE_FAULT))
@@ -576,6 +636,9 @@ nvlink_core_train_intranode_conns_from_from_L2_to_active
// STEP 9: Set link mode to SAFE
for (i = 0; i < connCount; i++)
{
if (conns[i] == NULL)
continue;
conns[i]->end0->link_handlers->set_dl_link_mode(conns[i]->end0,
NVLINK_LINKSTATE_SAFE,
flags);
@@ -590,6 +653,9 @@ nvlink_core_train_intranode_conns_from_from_L2_to_active
// Verify all the endpoints link state now reflect SAFE state
for (i = 0; i < connCount; i++)
{
if (conns[i] == NULL)
continue;
status = nvlink_core_poll_link_state(conns[i]->end0,
NVLINK_LINKSTATE_SAFE,
NVLINK_TRANSITION_SAFE_TIMEOUT);
@@ -634,6 +700,9 @@ nvlink_core_train_intranode_conns_from_from_L2_to_active
// STEP 9: Set INITNEGOTIATE
for (i = 0; i < connCount; i++)
{
if (conns[i] == NULL)
continue;
conns[i]->end0->link_handlers->set_dl_link_mode(conns[i]->end0,
NVLINK_LINKSTATE_INITNEGOTIATE,
flags);
@@ -651,6 +720,9 @@ nvlink_core_train_intranode_conns_from_from_L2_to_active
// STEP 8: Set POST_INITNEGOTIATE
for (i = 0; i < connCount; i++)
{
if (conns[i] == NULL)
continue;
conns[i]->end0->link_handlers->set_dl_link_mode(conns[i]->end0,
NVLINK_LINKSTATE_POST_INITNEGOTIATE,
flags);
@@ -684,6 +756,9 @@ nvlink_core_train_intranode_conns_from_from_L2_to_active
for (i = 0; i < connCount; i++)
{
if (conns[i] == NULL)
continue;
// Update the power state transition status of the link
conns[i]->end0->powerStateTransitionStatus = nvlink_power_state_in_L0;
conns[i]->end1->powerStateTransitionStatus = nvlink_power_state_in_L0;
@@ -729,6 +804,9 @@ nvlink_core_train_intranode_conns_from_swcfg_to_active_non_ALI
// Trigger INITOPTIMIZE on both ends of the connection
for (i = 0; i < connCount; i++)
{
if (conns[i] == NULL)
continue;
conns[i]->end0->link_handlers->set_dl_link_mode(conns[i]->end0,
NVLINK_LINKSTATE_INITOPTIMIZE,
flags);
@@ -745,6 +823,9 @@ nvlink_core_train_intranode_conns_from_swcfg_to_active_non_ALI
// Trigger POST_INITOPTIMIZE (Checks INITOPTIMIZE was successful) on both ends of the connection
for (i = 0; i < connCount; i++)
{
if (conns[i] == NULL)
continue;
conns[i]->end0->link_handlers->set_dl_link_mode(conns[i]->end0,
NVLINK_LINKSTATE_POST_INITOPTIMIZE,
flags);
@@ -761,6 +842,9 @@ nvlink_core_train_intranode_conns_from_swcfg_to_active_non_ALI
// Set link modes to ACTIVE
for (i = 0; i < connCount; i++)
{
if (conns[i] == NULL)
continue;
// Some settings required before moving to ACTIVE
_nvlink_core_set_link_pre_active_settings(conns[i]->end0, flags);
_nvlink_core_set_link_pre_active_settings(conns[i]->end1, flags);
@@ -783,6 +867,8 @@ nvlink_core_train_intranode_conns_from_swcfg_to_active_non_ALI
// Verify link mode HS on the endpoints
for (i = 0; i < connCount; i++)
{
if (conns[i] == NULL)
continue;
pollStatus = nvlink_core_poll_link_state(conns[i]->end0,
NVLINK_LINKSTATE_HS,
@@ -881,6 +967,9 @@ nvlink_core_train_intranode_conns_from_swcfg_to_active_ALT
for (i = 0; i < connCount; i++)
{
if (conns[i] == NULL)
continue;
status = conns[i]->end0->link_handlers->get_dl_link_mode(conns[i]->end0, &linkMode);
if (status != NVL_SUCCESS)
{
@@ -903,7 +992,7 @@ nvlink_core_train_intranode_conns_from_swcfg_to_active_ALT
// Trigger INITOPTIMIZE on both ends of the connection
for (i = 0; i < connCount; i++)
{
if (skipConn[i])
if ((conns[i] == NULL) || skipConn[i])
{
continue;
}
@@ -924,7 +1013,7 @@ nvlink_core_train_intranode_conns_from_swcfg_to_active_ALT
// Trigger POST_INITOPTIMIZE (Checks INITOPTIMIZE was successful) on both ends of the connection
for (i = 0; i < connCount; i++)
{
if (skipConn[i])
if ((conns[i] == NULL) || skipConn[i])
{
continue;
}
@@ -945,7 +1034,7 @@ nvlink_core_train_intranode_conns_from_swcfg_to_active_ALT
// Set link modes to ACTIVE
for (i = 0; i < connCount; i++)
{
if (skipConn[i])
if ((conns[i] == NULL) || skipConn[i])
{
continue;
}
@@ -966,7 +1055,7 @@ nvlink_core_train_intranode_conns_from_swcfg_to_active_ALT
// Verify link mode HS on the endpoints
for (i = 0; i < connCount; i++)
{
if (skipConn[i])
if ((conns[i] == NULL) || skipConn[i])
{
continue;
}
@@ -1067,6 +1156,9 @@ nvlink_core_train_intranode_conns_from_swcfg_to_active_legacy
// Enable PRBS generator on both ends of the link
for (i = 0; i < connCount; i++)
{
if (conns[i] == NULL)
continue;
_nvlink_core_set_sublink_pre_hs_settings(conns[i]->end0, flags);
_nvlink_core_set_sublink_pre_hs_settings(conns[i]->end1, flags);
}
@@ -1074,6 +1166,9 @@ nvlink_core_train_intranode_conns_from_swcfg_to_active_legacy
// Put TX sublink on both ends in High Speed
for (i = 0; i < connCount; i++)
{
if (conns[i] == NULL)
continue;
conns[i]->end0->link_handlers->set_tx_mode(conns[i]->end0,
NVLINK_SUBLINK_STATE_TX_HS,
flags);
@@ -1085,6 +1180,9 @@ nvlink_core_train_intranode_conns_from_swcfg_to_active_legacy
// Wait for sublinks to go in High Speed.
for (i = 0; i < connCount; i++)
{
if (conns[i] == NULL)
continue;
pollStatus = nvlink_core_poll_sublink_state(conns[i]->end0,
NVLINK_SUBLINK_STATE_TX_HS,
NVLINK_SUBLINK_SUBSTATE_TX_STABLE,
@@ -1121,6 +1219,9 @@ nvlink_core_train_intranode_conns_from_swcfg_to_active_legacy
// Some settings required before moving to ACTIVE
for (i = 0; i < connCount; i++)
{
if (conns[i] == NULL)
continue;
_nvlink_core_set_link_pre_active_settings(conns[i]->end0, flags);
_nvlink_core_set_link_pre_active_settings(conns[i]->end1, flags);
@@ -1136,6 +1237,9 @@ nvlink_core_train_intranode_conns_from_swcfg_to_active_legacy
// Verify link mode HS on the endpoints
for (i = 0; i < connCount; i++)
{
if (conns[i] == NULL)
continue;
pollStatus = nvlink_core_poll_link_state(conns[i]->end1,
NVLINK_LINKSTATE_HS,
NVLINK_TRANSITION_HS_TIMEOUT);
@@ -1198,6 +1302,9 @@ _nvlink_core_set_sublink_pre_hs_settings
NvU32 flags
)
{
if (link == NULL)
return;
//
// Before training the sublinks to HS, the PROD values must be loaded.
// On Volta/NVSwitch, the PROD values get loaded by UCODE during DLPL Init.
@@ -1225,6 +1332,9 @@ _nvlink_core_set_link_pre_active_settings
NvU32 flags
)
{
if (link == NULL)
return;
// Some settings required before moving to ACTIVE
link->link_handlers->set_dl_link_mode(link, NVLINK_LINKSTATE_PRE_HS, flags);
}
@@ -1243,6 +1353,9 @@ _nvlink_core_set_link_post_active_settings
NvU32 flags
)
{
if (link == NULL)
return;
link->link_handlers->training_complete(link);
link->link_handlers->set_tx_mode(link, NVLINK_SUBLINK_STATE_TX_POST_HS, flags);

View File

@@ -3573,10 +3573,10 @@ nvlink_lib_ctrl_get_device_link_states
//
FOR_EACH_LINK_REGISTERED(endpoint, dev, node)
{
if (numLinks >= NVLINK_MAX_NVLINK_ENDPOINTS)
if (numLinks >= NVLINK_MAX_SYSTEM_LINK_NUM)
{
NVLINK_PRINT((DBG_MODULE_NVLINK_CORE, NVLINK_DBG_LEVEL_ERRORS,
"%s: numLinks >= NVLINK_MAX_NVLINK_ENDPOINTS",
"%s: numLinks >= NVLINK_MAX_SYSTEM_LINK_NUM",
__FUNCTION__));
nvlink_assert(0);