mirror of
https://github.com/microsoft/mscclpp.git
synced 2026-05-22 22:08:28 +00:00
bind numa node to communicator
This commit is contained in:
@@ -54,6 +54,7 @@ struct mscclppComm
|
||||
int rank; // my rank in the communicator
|
||||
int nRanks; // number of GPUs in communicator
|
||||
int cudaDev; // my cuda device index
|
||||
int numaNode; // my numa node number
|
||||
|
||||
// Flag to ask MSCCLPP kernels to abort
|
||||
volatile uint32_t* abortFlag;
|
||||
|
||||
@@ -373,6 +373,13 @@ void mscclppDefaultLogHandler(const char* msg);
|
||||
*/
|
||||
mscclppResult_t mscclppSetLogHandler(mscclppLogHandler_t handler);
|
||||
|
||||
/* Bind NUMA node for the communicator.
 *
 * Inputs:
 *   comm:     the communicator whose NUMA node is set
 *   numaNode: the NUMA node to be bound
 */
|
||||
mscclppResult_t mscclppNumaBind(mscclppComm_t comm, int numaNode);
|
||||
|
||||
#ifdef __cplusplus
|
||||
} // end extern "C"
|
||||
#endif
|
||||
|
||||
@@ -75,6 +75,7 @@ mscclppResult_t mscclppCommInitRank(mscclppComm_t* comm, int nranks, const char*
|
||||
MSCCLPPCHECKGOTO(mscclppCalloc(&_comm, 1), res, fail);
|
||||
_comm->rank = rank;
|
||||
_comm->nRanks = nranks;
|
||||
_comm->numaNode = -1;
|
||||
// We assume that the user has set the device to the intended one already
|
||||
CUDACHECK(cudaGetDevice(&_comm->cudaDev));
|
||||
|
||||
@@ -547,3 +548,10 @@ mscclppResult_t mscclppSetBootstrapConnTimeout(int timeout)
|
||||
config->setBootstrapConnectionTimeoutConfig(timeout);
|
||||
return mscclppSuccess;
|
||||
}
|
||||
|
||||
MSCCLPP_API(mscclppResult_t, mscclppNumaBind, mscclppComm_t comm, int numaNode);
|
||||
mscclppResult_t mscclppNumaBind(mscclppComm_t comm, int numaNode)
|
||||
{
|
||||
comm->numaNode = numaNode;
|
||||
return mscclppSuccess;
|
||||
}
|
||||
|
||||
19
src/proxy.cc
19
src/proxy.cc
@@ -74,14 +74,19 @@ void* mscclppProxyService(void* _args)
|
||||
|
||||
PROXYCUDACHECK(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking));
|
||||
bool isP2pProxy = (ibCtx == nullptr);
|
||||
if (isP2pProxy) {
|
||||
// TODO(chhwang): find numa node
|
||||
// Current mapping is based on NDv4: GPU [0,1,2,3,4,5,6,7] -> NUMA [1,1,0,0,3,3,2,2]
|
||||
// TODO(saemal): either ask user or detect it automatically
|
||||
NumaBind((comm->cudaDev / 2) ^ 1);
|
||||
p2pStream = args->proxyState->stream;
|
||||
int numaNode = comm->numaNode;
|
||||
if (numaNode != -1) {
|
||||
NumaBind(numaNode);
|
||||
} else {
|
||||
NumaBind(ibCtx->numaNode);
|
||||
if (isP2pProxy) {
|
||||
// TODO(chhwang): find numa node
|
||||
// Current mapping is based on NDv4: GPU [0,1,2,3,4,5,6,7] -> NUMA [1,1,0,0,3,3,2,2]
|
||||
// TODO(saemal): either ask user or detect it automatically
|
||||
NumaBind((comm->cudaDev / 2) ^ 1);
|
||||
p2pStream = args->proxyState->stream;
|
||||
} else {
|
||||
NumaBind(ibCtx->numaNode);
|
||||
}
|
||||
}
|
||||
free(_args); // allocated in mscclppProxyCreate
|
||||
|
||||
|
||||
Reference in New Issue
Block a user