535.86.05

This commit is contained in:
Bernhard Stoeckner
2023-07-18 15:54:53 +02:00
committed by Bernhard Stoeckner
parent 22a077c4fe
commit 10d538dfbd
264 changed files with 67251 additions and 107479 deletions

View File

@@ -439,6 +439,11 @@ NvlStatus nvlink_lib_register_link(nvlink_device *dev, nvlink_link *link);
*/
NvlStatus nvlink_lib_unregister_link(nvlink_link *link);
/*
* Gets the number of registered devices whose type equals deviceType.
* The count is written to *numDevices.
*/
NvlStatus nvlink_lib_return_device_count_by_type(NvU32 deviceType, NvU32 *numDevices);
/************************************************************************************************/
/******************************* NVLink link management functions *******************************/

View File

@@ -46,6 +46,11 @@ NvlStatus nvlink_lib_unload(void);
*/
NvlStatus nvlink_lib_ioctl_ctrl(nvlink_ioctrl_params *ctrl_params);
/*
* Gets the number of registered devices whose type equals deviceType.
* The count is written to *numDevices.
*/
NvlStatus nvlink_lib_return_device_count_by_type(NvU32 deviceType, NvU32 *numDevices);
#ifdef __cplusplus
}
#endif

View File

@@ -587,6 +587,8 @@ typedef enum
nvlink_train_conn_to_off,
nvlink_train_conn_active_to_swcfg,
nvlink_train_conn_swcfg_to_off,
nvlink_train_conn_off_to_active_ali_non_blocking,
nvlink_train_conn_off_to_active_ali_blocking,
/* See enum modification guidelines at the top of this file */
} nvlink_conn_train_type;
@@ -784,7 +786,10 @@ typedef struct
*
* NVLink 3.0 onwards, connection detection is handled by Minion. After INITNEGOTIATE
* completed, this interface needs to be queried to retrieve the local/remote SIDs
* and the local/remote link number of all links associated with a device
* and the local/remote link number of all links associated with a device.
*
* On NVLink 4.0 this needs to be queried after all links in the system have been
* trained.
*
* Parameters:
* devInfo [IN]
@@ -1079,7 +1084,7 @@ typedef struct
/*
* CTRL_NVLINK_GET_DEVICE_LINK_STATES
*
* Returns the link state of all enabled links on a given device.
* Returns the link state of all links on a given device.
*
* Parameters:
* devInfo [IN]
@@ -1096,7 +1101,7 @@ typedef struct
* will show the states as INVALID.
*
* endStatesCount [OUT]
* count of valid entries into the endStates array
* count of total entries in the endStates array
*
*/
typedef struct
@@ -1107,7 +1112,7 @@ typedef struct
/* output parameters */
NvlStatus status;
nvlink_link_state endStates[NVLINK_MAX_NVLINK_ENDPOINTS];
nvlink_link_state endStates[NVLINK_MAX_DEVICE_CONN];
NvU32 endStatesCount;
} nvlink_get_device_link_states;

View File

@@ -1,5 +1,5 @@
/*
* SPDX-FileCopyrightText: Copyright (c) 2019 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-FileCopyrightText: Copyright (c) 2019-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: MIT
*
* Permission is hereby granted, free of charge, to any person obtaining a
@@ -30,6 +30,9 @@ extern "C" {
#include "nvlink_common.h"
#define TOP_LEVEL_LOCKING_DISABLED 1
#define PER_LINK_LOCKING_DISABLED 1
#define NVLINK_FREE(x) nvlink_free((void *)x)
// Memory management functions

View File

@@ -841,7 +841,7 @@ nvlink_core_reset_intranode_conns
{
if (conns[i] == NULL)
continue;
//
// Reset both ends of this connection.
// This path should enable/init those link endpoints as well.

View File

@@ -932,6 +932,53 @@ nvlink_core_train_intranode_conns_from_swcfg_to_active_non_ALI
return status;
}
/**
 * Request ALI training on every link in a list
 *
 * The ali_training link handler is invoked for each non-NULL entry of the
 * array. A failure on one link is logged and does not stop the remaining
 * links from being processed.
 *
 * @param[in]  pLinks    Array of links to train
 * @param[in]  numLinks  Number of links in the array
 *
 * return NVL_SUCCESS if every request succeeded, NVL_ERR_GENERIC if there
 *        are no links to train, else the status of the last link that failed
 */
NvlStatus
nvlink_core_train_intranode_conns_from_off_to_active_ALI
(
    nvlink_link **pLinks,
    NvU32         numLinks
)
{
    NvlStatus lastFailure = NVL_SUCCESS;
    NvU32     idx;

    // Nothing to do without a link array
    if (!((pLinks != NULL) && (numLinks != 0)))
    {
        NVLINK_PRINT((DBG_MODULE_NVLINK_CORE, NVLINK_DBG_LEVEL_ERRORS,
            "%s: No links to train to ACTIVE\n",
            __FUNCTION__));

        return NVL_ERR_GENERIC;
    }

    for (idx = 0; idx < numLinks; ++idx)
    {
        nvlink_link *pLink = pLinks[idx];
        NvlStatus    linkStatus;

        // Empty slots in the array are skipped
        if (pLink == NULL)
        {
            continue;
        }

        linkStatus = pLink->link_handlers->ali_training(pLink);
        if (linkStatus == NVL_SUCCESS)
        {
            continue;
        }

        NVLINK_PRINT((DBG_MODULE_NVLINK_CORE, NVLINK_DBG_LEVEL_ERRORS,
            "%s: failed to send ALI link training on link 0x%x\n",
            __FUNCTION__, pLink->linkNumber));

        // Remember the failure but keep going with the other links
        lastFailure = linkStatus;
    }

    return lastFailure;
}
/**
* Train intranode connections associated with a list of links to HS
* using ALT sequence

View File

@@ -1444,6 +1444,54 @@ nvlink_lib_ctrl_device_read_discovery_tokens
return NVL_SUCCESS;
}
/**
 * Perform peer link discovery for a single link
 *
 * Reads the DL link mode of the link and, when the link is in SAFE, HS or
 * SLEEP, asks the core library to discover its remote end. Failing to find
 * a remote end is only logged; it is not reported as an error.
 *
 * @param[in]  link  Link whose peer is to be discovered
 *
 * return NVL_SUCCESS, or the error returned by the get_dl_link_mode handler
 */
static NvlStatus
_nvlink_lib_ctrl_device_discover_peer_link
(
    nvlink_link *link
)
{
    nvlink_link *peerLink   = NULL;
    NvU64        dlMode     = NVLINK_LINKSTATE_OFF;
    NvlStatus    modeStatus;

    modeStatus = link->link_handlers->get_dl_link_mode(link, &dlMode);
    if (modeStatus != NVL_SUCCESS)
    {
        NVLINK_PRINT((DBG_MODULE_NVLINK_CORE, NVLINK_DBG_LEVEL_ERRORS,
            "%s: Unable to get link mode for %s:%s\n",
            __FUNCTION__, link->dev->deviceName, link->linkName));

        return modeStatus;
    }

    //
    // Discovery is only attempted for a link that succeeded rxDet, i.e. one
    // that is in HS, SAFE or SLEEP mode. Any other mode is left alone.
    //
    if ((dlMode != NVLINK_LINKSTATE_SAFE) &&
        (dlMode != NVLINK_LINKSTATE_HS)   &&
        (dlMode != NVLINK_LINKSTATE_SLEEP))
    {
        return NVL_SUCCESS;
    }

    //
    // Finding the peer is not the important part here. What matters is that
    // the corelib goes through the discovery process and the endpoints cache
    // the remote information in the corelib, so that FM or endpoints can
    // query the corelib for the topology of the system.
    //
    nvlink_core_discover_and_get_remote_end(link, &peerLink, 0);

    if (peerLink == NULL)
    {
        NVLINK_PRINT((DBG_MODULE_NVLINK_CORE, NVLINK_DBG_LEVEL_INFO,
            "%s: link 0x%x: couldn't find link pair! Possible that other device queries need to finish before there is a found connection in the corelib\n",
            __FUNCTION__, link->linkNumber));
    }

    return NVL_SUCCESS;
}
/**
* Read the SIDs for the local and remote device
*
@@ -1557,6 +1605,19 @@ nvlink_lib_ctrl_device_read_sids
for (i = 0; i < numLinks; i++)
{
// ALI specific handling to update corelib structures and verify link status
if (dev->enableALI)
{
status = _nvlink_lib_ctrl_device_discover_peer_link(links[i]);
if (status != NVL_SUCCESS)
{
// Release the per-link locks and free links
nvlink_lib_link_locks_release(links, numLinks);
nvlink_free((void *)links);
return status;
}
}
// Fill-up the local/remote link numbers and SIDs
readParams->sidInfo[numEntries].localLinkSid = links[i]->localSid;
readParams->sidInfo[numEntries].remoteLinkSid = links[i]->remoteSid;
@@ -1685,6 +1746,22 @@ nvlink_lib_ctrl_discover_intranode_conns
continue;
}
// ALI specific handling to update corelib structures and verify link status
if (dev->enableALI)
{
status = _nvlink_lib_ctrl_device_discover_peer_link(link);
if (status != NVL_SUCCESS)
{
// Release the per-link locks
nvlink_lib_link_locks_release(links, numLinks);
// Release the top-level lock
nvlink_lib_top_lock_release();
nvlink_free((void *)links);
return status;
}
}
writeToken = nvlink_core_get_link_discovery_token(link);
if ((link->version < NVLINK_DEVICE_VERSION_30) ||
@@ -2006,6 +2083,7 @@ nvlink_lib_ctrl_train_intranode_conn
nvlink_intranode_conn *conn = NULL;
NvlStatus status = NVL_SUCCESS;
NvU32 count;
NvU32 i;
// make sure that this call is for single node systems
if (trainParams->srcEndPoint.nodeId != trainParams->dstEndPoint.nodeId)
@@ -2174,6 +2252,44 @@ nvlink_lib_ctrl_train_intranode_conn
}
break;
}
case nvlink_train_conn_off_to_active_ali_non_blocking:
case nvlink_train_conn_off_to_active_ali_blocking:
{
if (srcLink->version >= NVLINK_DEVICE_VERSION_40 &&
srcLink->dev->enableALI)
{
status = nvlink_core_train_intranode_conns_from_off_to_active_ALI(initLinks, count);
if (trainParams->trainTo == nvlink_train_conn_off_to_active_ali_blocking)
{
NvU32 timeout = NVLINK_TRANSITION_HS_TIMEOUT;
do
{
nvlink_sleep(1);
status = nvlink_core_train_check_link_ready_ALI(initLinks, count);
if (status == NVL_SUCCESS)
{
break;
}
timeout--;
} while(timeout > 0);
if (status == NVL_SUCCESS)
{
for ( i = 0; i < count; ++i)
{
//
// NVLINK_LINKSTATE_TRAFFIC_SETUP will make sure a request to active completes before
// setting buffer ready so use the internal check to see if the request for ALI completed
//
(void)initLinks[i]->link_handlers->set_dl_link_mode(initLinks[i], NVLINK_LINKSTATE_TRAFFIC_SETUP, 0);
}
}
}
}
break;
}
default:
{
status = NVL_BAD_ARGS;
@@ -2476,6 +2592,45 @@ nvlink_lib_ctrl_train_intranode_conns_parallel
}
break;
}
case nvlink_train_conn_off_to_active_ali_non_blocking:
case nvlink_train_conn_off_to_active_ali_blocking:
{
if (srcLink->version >= NVLINK_DEVICE_VERSION_40 &&
srcLink->dev->enableALI)
{
status = nvlink_core_train_intranode_conns_from_off_to_active_ALI(
initLinks, count);
if (trainParams->trainTo == nvlink_train_conn_off_to_active_ali_blocking)
{
NvU32 timeout = NVLINK_TRANSITION_HS_TIMEOUT;
do
{
nvlink_sleep(1);
status = nvlink_core_train_check_link_ready_ALI(initLinks, count);
if (status == NVL_SUCCESS)
{
break;
}
timeout--;
} while(timeout > 0);
if (status == NVL_SUCCESS)
{
for ( i = 0; i < count; ++i)
{
//
// NVLINK_LINKSTATE_TRAFFIC_SETUP will make sure a request to active completes before
// setting buffer ready so use the internal check to see if the request for ALI completed
//
(void)initLinks[i]->link_handlers->set_dl_link_mode(initLinks[i], NVLINK_LINKSTATE_TRAFFIC_SETUP, 0);
}
}
}
}
break;
}
default:
{
status = NVL_BAD_ARGS;
@@ -3397,6 +3552,8 @@ static NvlStatus nvlink_lib_ctrl_get_link_state
NvU32 numLinks = 0;
NvU32 i = 0;
ct_assert(NVLINK_MAX_SYSTEM_LINK_NUM == NVLINK_MAX_NVLINK_ENDPOINTS);
nvlink_link **links = (nvlink_link **)nvlink_malloc(
sizeof(nvlink_link *) * NVLINK_MAX_SYSTEM_LINK_NUM);
if (links == NULL)
@@ -3531,16 +3688,17 @@ nvlink_lib_ctrl_get_device_link_states
NvlStatus status = NVL_SUCCESS;
NvU32 numLinks = 0;
NvU32 i = 0;
ct_assert(NVLINK_MAX_SYSTEM_LINK_NUM == NVLINK_MAX_NVLINK_ENDPOINTS);
NvU8 linkNumber;
nvlink_link **links = (nvlink_link **)nvlink_malloc(
sizeof(nvlink_link *) * NVLINK_MAX_SYSTEM_LINK_NUM);
sizeof(nvlink_link *) * NVLINK_MAX_DEVICE_CONN);
if (links == NULL)
{
return NVL_NO_MEM;
}
nvlink_memset(params->endStates, 0x0, sizeof(params->endStates));
// Acquire the top-level lock
status = nvlink_lib_top_lock_acquire();
if (status != NVL_SUCCESS)
@@ -3573,10 +3731,10 @@ nvlink_lib_ctrl_get_device_link_states
//
FOR_EACH_LINK_REGISTERED(endpoint, dev, node)
{
if (numLinks >= NVLINK_MAX_SYSTEM_LINK_NUM)
if (numLinks >= NVLINK_MAX_DEVICE_CONN)
{
NVLINK_PRINT((DBG_MODULE_NVLINK_CORE, NVLINK_DBG_LEVEL_ERRORS,
"%s: numLinks >= NVLINK_MAX_SYSTEM_LINK_NUM",
"%s: numLinks >= NVLINK_MAX_DEVICE_CONN",
__FUNCTION__));
nvlink_assert(0);
@@ -3614,16 +3772,20 @@ nvlink_lib_ctrl_get_device_link_states
for (i = 0; i < numLinks; ++i)
{
linkNumber = links[i]->linkNumber;
nvlink_assert(linkNumber < NVLINK_MAX_DEVICE_CONN);
// Get the endpoint states of the link
nvlink_core_get_endpoint_state(links[i], &(params->endStates[i]));
nvlink_core_get_endpoint_state(links[i], &(params->endStates[linkNumber]));
NVLINK_PRINT((DBG_MODULE_NVLINK_CORE, NVLINK_DBG_LEVEL_INFO,
"%s: link 0x%x -- linkMode 0x%x,\n",
__FUNCTION__, i, params->endStates[i].linkMode));
__FUNCTION__, linkNumber, params->endStates[linkNumber].linkMode));
}
params->endStatesCount = numLinks;
// This is done to preserve client behavior that uses endStatesCount to iterate across endStates array
params->endStatesCount = NVLINK_MAX_DEVICE_CONN;
// Release the per-link locks
nvlink_lib_link_locks_release(links, numLinks);

View File

@@ -139,6 +139,12 @@ void nvlink_core_init_links_from_off_to_swcfg_non_ALI(nvlink_link **pLinks,
*/
NvlStatus nvlink_core_initnegotiate(nvlink_link **links, NvU32 numLinks, NvU32 flags);
/*
* Initialize all the endpoints from OFF to ACTIVE state for ALI sequence
* Used for nvlink 4.0+
*
* pLinks is the array of links to train and numLinks is the number of
* entries in that array.
*/
NvlStatus nvlink_core_train_intranode_conns_from_off_to_active_ALI(nvlink_link **pLinks,
NvU32 numLinks);
/************************************************************************************************/
/*************************** NVLink topology discovery functions ********************************/
/************************************************************************************************/
@@ -227,6 +233,12 @@ NvlStatus nvlink_core_train_intranode_conns_from_swcfg_to_active_non_ALI(nvlink_
NvlStatus nvlink_core_train_check_link_ready_ALI(nvlink_link **links,
NvU32 linkCount);
/**
* Initiate ALI training for nvlink 4.0+
*
* NOTE(review): the function with a visible definition is named
* nvlink_core_train_intranode_conns_from_off_to_active_ALI; confirm that a
* definition matching this prototype's name exists, otherwise this
* declaration is unused.
*/
NvlStatus nvlink_core_train_from_off_to_active_ALI(nvlink_link **links,
NvU32 linkCount);
/************************************************************************************************/
/********************************** NVLink shutdown functions ***********************************/

View File

@@ -197,3 +197,48 @@ nvlink_lib_is_registerd_device_with_reduced_config(void)
return NV_FALSE;
}
/*
 * Count the registered devices whose type equals deviceType
 *
 * When the library is not initialized the count is reported as 0. Otherwise
 * the device list is walked under the top-level lock.
 *
 * deviceType  [IN]   Device type to match against each registered device
 * numDevices  [OUT]  Receives the number of matching devices. It is written
 *                    unconditionally on success, so it must be a valid pointer.
 *
 * Returns NVL_SUCCESS, or the lock error if the top-level lock could not be
 * acquired (in which case *numDevices is left untouched).
 */
NvlStatus
nvlink_lib_return_device_count_by_type
(
    NvU32  deviceType,
    NvU32 *numDevices
)
{
    nvlink_device *dev        = NULL;
    NvU32          matchCount = 0;
    NvlStatus      lockResult;

    // No devices can be registered with an uninitialized library
    if (!nvlink_lib_is_initialized())
    {
        *numDevices = matchCount;
        return NVL_SUCCESS;
    }

    // The device list is only walked with the top-level lock held
    lockResult = nvlink_lib_top_lock_acquire();
    if (lockResult != NVL_SUCCESS)
    {
        NVLINK_PRINT((DBG_MODULE_NVLINK_CORE, NVLINK_DBG_LEVEL_ERRORS,
            "%s: Failed to acquire top-level lock\n",
            __FUNCTION__));

        return lockResult;
    }

    // Tally every registered device of the requested type
    FOR_EACH_DEVICE_REGISTERED(dev, nvlinkLibCtx.nv_devicelist_head, node)
    {
        matchCount += (dev->type == deviceType) ? 1 : 0;
    }

    nvlink_lib_top_lock_release();

    *numDevices = matchCount;
    return NVL_SUCCESS;
}

View File

@@ -1,5 +1,5 @@
/*******************************************************************************
Copyright (c) 2020 NVidia Corporation
Copyright (c) 2020-2023 NVidia Corporation
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to
deal in the Software without restriction, including without limitation the
@@ -26,14 +26,18 @@
#include "nvlink_lock.h"
//
// Only enabling locking for testing purposes at the moment.
// Disabled at all other times.
//
#define LOCKING_DISABLED 1
// Only enabling top level locking for linux as required by Bug 4108674.
// Per link locking is still disabled at all times. It will be enabled
// after other locking related clean up is done.
//
static void _sort_links(nvlink_link **, NvU32, NvBool (*)(void *, void *));
static NvBool _compare(void *, void *);
#if defined(NV_LINUX)
#undef TOP_LEVEL_LOCKING_DISABLED
# define TOP_LEVEL_LOCKING_DISABLED 0
#endif /* defined(NV_LINUX) */
/*
* Allocate top level lock. Return NVL_SUCCESS if
* the lock was allocated else return NVL_ERR_GENERIC.
@@ -41,7 +45,7 @@ static NvBool _compare(void *, void *);
NvlStatus
nvlink_lib_top_lock_alloc(void)
{
if (LOCKING_DISABLED)
if (TOP_LEVEL_LOCKING_DISABLED)
{
return NVL_SUCCESS;
}
@@ -82,7 +86,7 @@ nvlink_lib_top_lock_alloc(void)
NvlStatus
nvlink_lib_top_lock_free(void)
{
if (LOCKING_DISABLED)
if (TOP_LEVEL_LOCKING_DISABLED)
{
return NVL_SUCCESS;
}
@@ -115,7 +119,7 @@ nvlink_lib_link_lock_alloc
nvlink_link *link
)
{
if (LOCKING_DISABLED)
if (PER_LINK_LOCKING_DISABLED)
{
return NVL_SUCCESS;
}
@@ -158,7 +162,7 @@ nvlink_lib_link_lock_free
nvlink_link *link
)
{
if (LOCKING_DISABLED)
if (PER_LINK_LOCKING_DISABLED)
{
return NVL_SUCCESS;
}
@@ -188,7 +192,7 @@ nvlink_lib_link_lock_free
NvlStatus
nvlink_lib_top_lock_acquire(void)
{
if (LOCKING_DISABLED)
if (TOP_LEVEL_LOCKING_DISABLED)
{
return NVL_SUCCESS;
}
@@ -209,10 +213,6 @@ nvlink_lib_top_lock_acquire(void)
//
nvlink_acquireLock(nvlinkLibCtx.topLevelLock);
NVLINK_PRINT((DBG_MODULE_NVLINK_CORE, NVLINK_DBG_LEVEL_INFO,
"%s: Acquired top-level lock\n",
__FUNCTION__));
return NVL_SUCCESS;
}
@@ -223,7 +223,7 @@ nvlink_lib_top_lock_acquire(void)
NvlStatus
nvlink_lib_top_lock_release(void)
{
if (LOCKING_DISABLED)
if (TOP_LEVEL_LOCKING_DISABLED)
{
return NVL_SUCCESS;
}
@@ -244,10 +244,6 @@ nvlink_lib_top_lock_release(void)
//
nvlink_releaseLock(nvlinkLibCtx.topLevelLock);
NVLINK_PRINT((DBG_MODULE_NVLINK_CORE, NVLINK_DBG_LEVEL_INFO,
"%s: Released top-level lock\n",
__FUNCTION__));
return NVL_SUCCESS;
}
@@ -265,13 +261,12 @@ nvlink_lib_link_locks_acquire
int numLinks
)
{
if (LOCKING_DISABLED)
if (PER_LINK_LOCKING_DISABLED)
{
return NVL_SUCCESS;
}
int i;
nvlink_link *link_prev = NULL;
// Check if array of links is empty before attempting to acquire.
@@ -328,13 +323,12 @@ nvlink_lib_link_locks_release
int numLinks
)
{
int i;
if (LOCKING_DISABLED)
if (PER_LINK_LOCKING_DISABLED)
{
return NVL_SUCCESS;
}
int i;
nvlink_link *link_prev = NULL;
// Check if array of links is already empty before attempting to release.