mirror of
https://github.com/microsoft/mscclpp.git
synced 2026-05-12 01:10:22 +00:00
gdrcopy is not initialized
This commit is contained in:
13
src/init.cc
13
src/init.cc
@@ -86,7 +86,7 @@ MSCCLPP_API(mscclppResult_t, mscclppCommInitRank, mscclppComm_t* comm, int nrank
|
||||
mscclppResult_t mscclppCommInitRank(mscclppComm_t* comm, int nranks, const char* ipPortPair, int rank)
|
||||
{
|
||||
if (mscclppGdrCopy == NULL) {
|
||||
MSCCLPPCHECK(initGdrCopy());
|
||||
// MSCCLPPCHECK(initGdrCopy());
|
||||
}
|
||||
|
||||
mscclppResult_t res = mscclppSuccess;
|
||||
@@ -162,7 +162,7 @@ MSCCLPP_API(mscclppResult_t, mscclppCommInitRankFromId, mscclppComm_t* comm, int
|
||||
mscclppResult_t mscclppCommInitRankFromId(mscclppComm_t* comm, int nranks, mscclppUniqueId id, int rank)
|
||||
{
|
||||
if (mscclppGdrCopy == NULL) {
|
||||
MSCCLPPCHECK(initGdrCopy());
|
||||
// MSCCLPPCHECK(initGdrCopy());
|
||||
}
|
||||
|
||||
mscclppResult_t res = mscclppSuccess;
|
||||
@@ -213,7 +213,7 @@ mscclppResult_t mscclppCommDestroy(mscclppComm_t comm)
|
||||
struct mscclppConn* conn = &comm->conns[i];
|
||||
if (conn->cpuProxyFlagGdrDesc) {
|
||||
// IB
|
||||
MSCCLPPCHECK(mscclppGdrCudaFree(conn->cpuProxyFlagGdrDesc));
|
||||
// MSCCLPPCHECK(mscclppGdrCudaFree(conn->cpuProxyFlagGdrDesc));
|
||||
} else if (conn->devConn->proxyEpochId) {
|
||||
// P2P
|
||||
MSCCLPPCHECK(mscclppCudaFree(conn->devConn->proxyEpochId));
|
||||
@@ -401,8 +401,8 @@ mscclppResult_t mscclppConnect(mscclppComm_t comm, int remoteRank, int tag, void
|
||||
// If we couldn't find a matching context, create one
|
||||
if (proxyState == NULL) {
|
||||
MSCCLPPCHECK(mscclppCalloc(&proxyState, 1));
|
||||
MSCCLPPCHECK(mscclppGdrCudaCalloc(&proxyState->triggerFifo.hostPtr, &proxyState->triggerFifo.devPtr,
|
||||
MSCCLPP_PROXY_FIFO_SIZE, &proxyState->triggerFifo.desc));
|
||||
// MSCCLPPCHECK(mscclppGdrCudaCalloc(&proxyState->triggerFifo.hostPtr, &proxyState->triggerFifo.devPtr,
|
||||
// MSCCLPP_PROXY_FIFO_SIZE, &proxyState->triggerFifo.desc));
|
||||
MSCCLPPCHECK(mscclppCudaHostCalloc(&proxyState->triggerFifo.hostPtr, MSCCLPP_PROXY_FIFO_SIZE));
|
||||
proxyState->triggerFifo.devPtr = proxyState->triggerFifo.hostPtr;
|
||||
// MSCCLPPCHECK(
|
||||
@@ -497,7 +497,8 @@ mscclppResult_t mscclppIbConnectionSetupStart(struct connInfo* connInfo /*output
|
||||
struct mscclppDevConn* devConn = conn->devConn;
|
||||
devConn->remoteBuff = NULL;
|
||||
devConn->remoteFlag = NULL;
|
||||
MSCCLPPCHECK(mscclppGdrCudaCalloc(&conn->cpuProxyFlag, &devConn->proxyEpochId, 1, &conn->cpuProxyFlagGdrDesc));
|
||||
// MSCCLPPCHECK(mscclppGdrCudaCalloc(&conn->cpuProxyFlag, &devConn->proxyEpochId, 1, &conn->cpuProxyFlagGdrDesc));
|
||||
MSCCLPPCHECK(mscclppCudaCalloc(&devConn->proxyEpochId, 1));
|
||||
|
||||
struct mscclppIbContext* ibCtx = conn->ibCtx;
|
||||
if (conn->ibQp == NULL) {
|
||||
|
||||
@@ -122,7 +122,8 @@ void* mscclppProxyServiceP2P(void* _args)
|
||||
// Send completion: reset only the high 64 bits
|
||||
*(volatile uint64_t*)(&fifo[cachedFifoTail % MSCCLPP_PROXY_FIFO_SIZE]) = 0;
|
||||
cachedFifoTail++;
|
||||
PROXYCUDACHECK(cudaMemcpyAsync(fifoTailDevPtr, &cachedFifoTail, sizeof(uint64_t), cudaMemcpyHostToDevice, stream2));
|
||||
if (((cachedFifoTail % 4) == 0) || (trigger.fields.type & mscclppSync))
|
||||
PROXYCUDACHECK(cudaMemcpyAsync(fifoTailDevPtr, &cachedFifoTail, sizeof(uint64_t), cudaMemcpyHostToDevice, stream2));
|
||||
*fifoTail = cachedFifoTail;
|
||||
}
|
||||
|
||||
@@ -314,7 +315,8 @@ void* mscclppProxyServiceIb(void* _args)
|
||||
// Send completion: reset only the high 64 bits
|
||||
*(volatile uint64_t*)(&fifo[cachedFifoTail % MSCCLPP_PROXY_FIFO_SIZE]) = 0;
|
||||
cachedFifoTail++;
|
||||
PROXYCUDACHECK(cudaMemcpyAsync(fifoTailDevPtr, &cachedFifoTail, sizeof(uint64_t), cudaMemcpyHostToDevice, stream));
|
||||
if (((cachedFifoTail % 4) == 0) || (trigger.fields.type & mscclppSync))
|
||||
PROXYCUDACHECK(cudaMemcpyAsync(fifoTailDevPtr, &cachedFifoTail, sizeof(uint64_t), cudaMemcpyHostToDevice, stream));
|
||||
*fifoTail = cachedFifoTail;
|
||||
#endif
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user