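gdrcopy is not initialized: comment out the initGdrCopy() / mscclppGdrCuda* calls, and copy the FIFO tail to the device only every 4th trigger or on a sync trigger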

This commit is contained in:
Saeed Maleki
2023-03-28 04:56:06 +00:00
parent e7cccbf897
commit d9ba953fb0
2 changed files with 11 additions and 8 deletions

View File

@@ -86,7 +86,7 @@ MSCCLPP_API(mscclppResult_t, mscclppCommInitRank, mscclppComm_t* comm, int nrank
mscclppResult_t mscclppCommInitRank(mscclppComm_t* comm, int nranks, const char* ipPortPair, int rank)
{
if (mscclppGdrCopy == NULL) {
- MSCCLPPCHECK(initGdrCopy());
+ // MSCCLPPCHECK(initGdrCopy());
}
mscclppResult_t res = mscclppSuccess;
@@ -162,7 +162,7 @@ MSCCLPP_API(mscclppResult_t, mscclppCommInitRankFromId, mscclppComm_t* comm, int
mscclppResult_t mscclppCommInitRankFromId(mscclppComm_t* comm, int nranks, mscclppUniqueId id, int rank)
{
if (mscclppGdrCopy == NULL) {
- MSCCLPPCHECK(initGdrCopy());
+ // MSCCLPPCHECK(initGdrCopy());
}
mscclppResult_t res = mscclppSuccess;
@@ -213,7 +213,7 @@ mscclppResult_t mscclppCommDestroy(mscclppComm_t comm)
struct mscclppConn* conn = &comm->conns[i];
if (conn->cpuProxyFlagGdrDesc) {
// IB
- MSCCLPPCHECK(mscclppGdrCudaFree(conn->cpuProxyFlagGdrDesc));
+ // MSCCLPPCHECK(mscclppGdrCudaFree(conn->cpuProxyFlagGdrDesc));
} else if (conn->devConn->proxyEpochId) {
// P2P
MSCCLPPCHECK(mscclppCudaFree(conn->devConn->proxyEpochId));
@@ -401,8 +401,8 @@ mscclppResult_t mscclppConnect(mscclppComm_t comm, int remoteRank, int tag, void
// If we couldn't find a matching context, create one
if (proxyState == NULL) {
MSCCLPPCHECK(mscclppCalloc(&proxyState, 1));
- MSCCLPPCHECK(mscclppGdrCudaCalloc(&proxyState->triggerFifo.hostPtr, &proxyState->triggerFifo.devPtr,
- MSCCLPP_PROXY_FIFO_SIZE, &proxyState->triggerFifo.desc));
+ // MSCCLPPCHECK(mscclppGdrCudaCalloc(&proxyState->triggerFifo.hostPtr, &proxyState->triggerFifo.devPtr,
+ // MSCCLPP_PROXY_FIFO_SIZE, &proxyState->triggerFifo.desc));
MSCCLPPCHECK(mscclppCudaHostCalloc(&proxyState->triggerFifo.hostPtr, MSCCLPP_PROXY_FIFO_SIZE));
proxyState->triggerFifo.devPtr = proxyState->triggerFifo.hostPtr;
// MSCCLPPCHECK(
@@ -497,7 +497,8 @@ mscclppResult_t mscclppIbConnectionSetupStart(struct connInfo* connInfo /*output
struct mscclppDevConn* devConn = conn->devConn;
devConn->remoteBuff = NULL;
devConn->remoteFlag = NULL;
- MSCCLPPCHECK(mscclppGdrCudaCalloc(&conn->cpuProxyFlag, &devConn->proxyEpochId, 1, &conn->cpuProxyFlagGdrDesc));
+ // MSCCLPPCHECK(mscclppGdrCudaCalloc(&conn->cpuProxyFlag, &devConn->proxyEpochId, 1, &conn->cpuProxyFlagGdrDesc));
+ MSCCLPPCHECK(mscclppCudaCalloc(&devConn->proxyEpochId, 1));
struct mscclppIbContext* ibCtx = conn->ibCtx;
if (conn->ibQp == NULL) {

View File

@@ -122,7 +122,8 @@ void* mscclppProxyServiceP2P(void* _args)
// Send completion: reset only the high 64 bits
*(volatile uint64_t*)(&fifo[cachedFifoTail % MSCCLPP_PROXY_FIFO_SIZE]) = 0;
cachedFifoTail++;
- PROXYCUDACHECK(cudaMemcpyAsync(fifoTailDevPtr, &cachedFifoTail, sizeof(uint64_t), cudaMemcpyHostToDevice, stream2));
+ if (((cachedFifoTail % 4) == 0) || (trigger.fields.type & mscclppSync))
+   PROXYCUDACHECK(cudaMemcpyAsync(fifoTailDevPtr, &cachedFifoTail, sizeof(uint64_t), cudaMemcpyHostToDevice, stream2));
*fifoTail = cachedFifoTail;
}
@@ -314,7 +315,8 @@ void* mscclppProxyServiceIb(void* _args)
// Send completion: reset only the high 64 bits
*(volatile uint64_t*)(&fifo[cachedFifoTail % MSCCLPP_PROXY_FIFO_SIZE]) = 0;
cachedFifoTail++;
- PROXYCUDACHECK(cudaMemcpyAsync(fifoTailDevPtr, &cachedFifoTail, sizeof(uint64_t), cudaMemcpyHostToDevice, stream));
+ if (((cachedFifoTail % 4) == 0) || (trigger.fields.type & mscclppSync))
+   PROXYCUDACHECK(cudaMemcpyAsync(fifoTailDevPtr, &cachedFifoTail, sizeof(uint64_t), cudaMemcpyHostToDevice, stream));
*fifoTail = cachedFifoTail;
#endif
}