@@ -551,11 +551,6 @@ class NonblockingFuture {
   /// @param future The shared future to move.
   NonblockingFuture(std::shared_future<T>&& future) : future(std::move(future)) {}
 
-  /// Copy constructor.
-  ///
-  /// @param other The @ref NonblockingFuture to copy.
-  NonblockingFuture(const NonblockingFuture& other) = default;
-
   /// Check if the value is ready to be retrieved.
   ///
   /// @return True if the value is ready, false otherwise.
@@ -39,7 +39,7 @@ class MyProxyService {
         semaphores_(semaphores),
         proxy_([&](mscclpp::ProxyTrigger triggerRaw) { return handleTrigger(triggerRaw); }, [&]() { bindThread(); }) {
     int cudaDevice;
-    cudaGetDevice(&cudaDevice);
+    MSCCLPP_CUDATHROW(cudaGetDevice(&cudaDevice));
    deviceNumaNode_ = mscclpp::getDeviceNumaNode(cudaDevice);
  }
 
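Note: the constructor above wires two callbacks into mscclpp::Proxy — a trigger handler that services FIFO entries and a thread-initialization hook. A minimal sketch of the same wiring (ProxyHandlerResult::Continue is assumed from the mscclpp API; the handler body is illustrative, not this file's code):

    // Hedged sketch of the two-callback mscclpp::Proxy constructor used above.
    mscclpp::Proxy proxy(
        [&](mscclpp::ProxyTrigger trigger) {
          // Service one FIFO trigger (decode fields, issue put/signal/flush).
          return mscclpp::ProxyHandlerResult::Continue;  // keep the proxy thread running
        },
        [&]() {
          // Runs once on the proxy thread before the service loop starts,
          // presumably what bindThread() does here (e.g. NUMA binding).
        });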
@@ -18,18 +18,6 @@
 
 static int nranksPerNode = 8;
 
-// Propagate errors up
-
-#define MSCCLPPCHECK(call)                                                                 \
-  do {                                                                                     \
-    mscclppResult_t res = call;                                                            \
-    if (res != mscclppSuccess && res != mscclppInProgress) {                               \
-      /* Print the back trace*/                                                            \
-      printf("Failure at %s:%d -> %s\n", __FILE__, __LINE__, mscclppGetErrorString(res));  \
-      return res;                                                                          \
-    }                                                                                      \
-  } while (0)
-
 // Check CUDA RT calls
 #define CUDACHECK(cmd) \
   do { \
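Note: the CUDACHECK definition is cut off at the hunk boundary. Its body likely follows the same shape as the CUCHECK macro visible later in this diff; a sketch for reference (the exact text is outside the hunk, so treat this as an assumption):

    #define CUDACHECK(cmd)                                                                   \
      do {                                                                                   \
        cudaError_t err = cmd;                                                               \
        if (err != cudaSuccess) {                                                            \
          printf("%s:%d Cuda failure '%s'\n", __FILE__, __LINE__, cudaGetErrorString(err));  \
          exit(EXIT_FAILURE);                                                                \
        }                                                                                    \
      } while (false)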
@@ -54,8 +42,7 @@ template <class T>
 using DeviceHandle = mscclpp::DeviceHandle<T>;
 __constant__ DeviceHandle<mscclpp::SimpleProxyChannel> constProxyChans[16];
 
-__device__ void allgather0(DeviceHandle<mscclpp::SimpleProxyChannel> proxyChan, int rank, int world_size,
-                           int remoteRank, size_t nelemsPerGPU) {
+__device__ void allgather0(DeviceHandle<mscclpp::SimpleProxyChannel> proxyChan, int rank, size_t nelemsPerGPU) {
   // this allgather is really simple and implemented as an alltoall
 
   // this thread's role is a sender role
@@ -70,8 +57,8 @@ __device__ void allgather0(DeviceHandle<mscclpp::SimpleProxyChannel> proxyChan,
   if ((threadIdx.x % 32) == 0) proxyChan.wait();
 }
 
-__device__ void localAllGather(DeviceHandle<mscclpp::SimpleProxyChannel> proxyChan, int rank, int world_size,
-                               int nranksPerNode, int remoteRank, uint64_t offset, uint64_t size) {
+__device__ void localAllGather(DeviceHandle<mscclpp::SimpleProxyChannel> proxyChan, int rank, int nranksPerNode,
+                               int remoteRank, uint64_t offset, uint64_t size) {
   // this allgather algorithm works as follows:
   // Step 1: GPU rank i sends data to GPU rank (i+1) % nranksPerNode
   // and waits for data from GPU rank (i-1) % nranksPerNode
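Note: the comments above describe a local ring schedule. A hedged sketch of the indexing they imply (pure index arithmetic; the channel calls are placeholders, not this file's exact code):

    // Ring allgather indexing implied by the comments above (illustrative).
    int rightNeighbor = (rank + 1) % nranksPerNode;
    int leftNeighbor = (rank - 1 + nranksPerNode) % nranksPerNode;
    for (int step = 1; step < nranksPerNode; ++step) {
      int chunkToSend = (rank - step + 1 + nranksPerNode) % nranksPerNode;  // arrived step-1 hops ago
      int chunkToRecv = (rank - step + nranksPerNode) % nranksPerNode;
      // put(chunkToSend -> rightNeighbor); wait for chunkToRecv from leftNeighbor
    }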
@@ -91,9 +78,9 @@ __device__ void localAllGather(DeviceHandle<mscclpp::SimpleProxyChannel> proxyCh
   }
 }
 
-__device__ void allgather1(DeviceHandle<mscclpp::SimpleProxyChannel> proxyChan, int rank, int world_size,
-                           int nranksPerNode, int remoteRank, size_t nelemsPerGPU) {
-  localAllGather(proxyChan, rank, world_size, nranksPerNode, remoteRank, rank * nelemsPerGPU * sizeof(int),
+__device__ void allgather1(DeviceHandle<mscclpp::SimpleProxyChannel> proxyChan, int rank, int nranksPerNode,
+                           int remoteRank, size_t nelemsPerGPU) {
+  localAllGather(proxyChan, rank, nranksPerNode, remoteRank, rank * nelemsPerGPU * sizeof(int),
                  nelemsPerGPU * sizeof(int));
   if (remoteRank / nranksPerNode == rank / nranksPerNode)
     if ((threadIdx.x % 32) == 0) proxyChan.flush();
@@ -116,7 +103,7 @@ __device__ void allgather2(DeviceHandle<mscclpp::SimpleProxyChannel> proxyChan,
   // Step 1
   // local allgather
   if (remoteRank / nranksPerNode == rank / nranksPerNode) {
-    localAllGather(proxyChan, rank, world_size, nranksPerNode, remoteRank, rank * nelemsPerGPU * sizeof(int),
+    localAllGather(proxyChan, rank, nranksPerNode, remoteRank, rank * nelemsPerGPU * sizeof(int),
                    nelemsPerGPU * sizeof(int));
   }
   // cross-node exchange
@@ -134,7 +121,7 @@ __device__ void allgather2(DeviceHandle<mscclpp::SimpleProxyChannel> proxyChan,
   // local allgather
   int otherNghr = (rank + nranksPerNode) % world_size;
   if (remoteRank / nranksPerNode == rank / nranksPerNode) {
-    localAllGather(proxyChan, rank, world_size, nranksPerNode, remoteRank, otherNghr * nelemsPerGPU * sizeof(int),
+    localAllGather(proxyChan, rank, nranksPerNode, remoteRank, otherNghr * nelemsPerGPU * sizeof(int),
                    (nelemsPerGPU * (pipelineSize - 1)) / pipelineSize * sizeof(int));
   }
 
@@ -152,7 +139,7 @@ __device__ void allgather2(DeviceHandle<mscclpp::SimpleProxyChannel> proxyChan,
   // Step 3
   // local allgather
   if (remoteRank / nranksPerNode == rank / nranksPerNode) {
-    localAllGather(proxyChan, rank, world_size, nranksPerNode, remoteRank,
+    localAllGather(proxyChan, rank, nranksPerNode, remoteRank,
                    (otherNghr * nelemsPerGPU + (nelemsPerGPU * (pipelineSize - 1)) / pipelineSize) * sizeof(int),
                    nelemsPerGPU / pipelineSize * sizeof(int));
   }
@@ -170,9 +157,9 @@ __global__ void kernel(int rank, int world_size, int nranksPerNode, size_t nelem
   DeviceHandle<mscclpp::SimpleProxyChannel> proxyChan = constProxyChans[warpId];
 
   if (kernel == 0)
-    allgather0(proxyChan, rank, world_size, remoteRank, nelemsPerGPU);
+    allgather0(proxyChan, rank, nelemsPerGPU);
   else if (kernel == 1)
-    allgather1(proxyChan, rank, world_size, nranksPerNode, remoteRank, nelemsPerGPU);
+    allgather1(proxyChan, rank, nranksPerNode, remoteRank, nelemsPerGPU);
   else if (kernel == 2)
     allgather2(proxyChan, rank, world_size, nranksPerNode, remoteRank, nelemsPerGPU);
 }
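Note: these kernels all follow a warp-leader pattern — each warp owns one proxy channel (constProxyChans[warpId]) and only lane 0 drives it, as the threadIdx.x % 32 == 0 guards show. A self-contained sketch of that pattern (offsets and sizes are placeholders):

    // Illustrative warp-leader proxy-channel usage, not this file's kernel.
    __global__ void warpLeaderExample(uint64_t dstOffset, uint64_t srcOffset, uint64_t bytes) {
      int warpId = threadIdx.x / 32;
      DeviceHandle<mscclpp::SimpleProxyChannel> chan = constProxyChans[warpId];
      if ((threadIdx.x % 32) == 0) {
        chan.putWithSignal(dstOffset, srcOffset, bytes);  // enqueue copy + signal peer
        chan.wait();   // block lane 0 until the peer's signal arrives
        chan.flush();  // drain outstanding puts before reusing the buffer
      }
    }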
@@ -388,7 +375,6 @@ int main(int argc, const char* argv[]) {
   }
   ip_port = (char*)parsedArgs["ip_port"].c_str();
 
-  int thisNode = rankToNode(rank);
   int cudaNum = rankToLocalRank(rank);
   CUDACHECK(cudaSetDevice(cudaNum));
 
@@ -452,19 +438,19 @@ int main(int argc, const char* argv[]) {
   if (rank == 0) printf("Capturing %d iterations of the kernel in a CUDA graph\n", cudagraphiter);
   cudaGraph_t graph;
   cudaGraphExec_t instance;
-  cudaStreamBeginCapture(stream, cudaStreamCaptureModeGlobal);
+  CUDACHECK(cudaStreamBeginCapture(stream, cudaStreamCaptureModeGlobal));
   for (int i = 0; i < cudagraphiter; ++i) {
     kernel<<<1, 32 * (world_size - 1), 0, stream>>>(rank, world_size, nranksPerNode, nelemsPerGPU, kernelNum);
   }
-  cudaStreamEndCapture(stream, &graph);
-  cudaGraphInstantiate(&instance, graph, NULL, NULL, 0);
+  CUDACHECK(cudaStreamEndCapture(stream, &graph));
+  CUDACHECK(cudaGraphInstantiate(&instance, graph, NULL, NULL, 0));
 
   int cudagraphwarmup = 10;
   if (rank == 0)
     printf("Warming up %d iterations of the CUDA graph with %d iterations of the kernel\n", cudagraphwarmup,
            cudagraphiter);
   for (int i = 0; i < cudagraphwarmup; ++i) {
-    cudaGraphLaunch(instance, stream);
+    CUDACHECK(cudaGraphLaunch(instance, stream));
   }
   CUDACHECK(cudaStreamSynchronize(stream));
 
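Note: the hunk above wraps every step of the standard CUDA graph capture flow in CUDACHECK. For completeness, a condensed sketch of the same lifecycle including the teardown the benchmark omits (myKernel, grid, block, and iters are placeholders):

    cudaGraph_t graph;
    cudaGraphExec_t instance;
    CUDACHECK(cudaStreamBeginCapture(stream, cudaStreamCaptureModeGlobal));
    for (int i = 0; i < iters; ++i) {
      myKernel<<<grid, block, 0, stream>>>(/* args */);
    }
    CUDACHECK(cudaStreamEndCapture(stream, &graph));
    CUDACHECK(cudaGraphInstantiate(&instance, graph, NULL, NULL, 0));
    CUDACHECK(cudaGraphLaunch(instance, stream));  // one launch replays all captured iterations
    CUDACHECK(cudaStreamSynchronize(stream));
    CUDACHECK(cudaGraphExecDestroy(instance));     // teardown, omitted in the test
    CUDACHECK(cudaGraphDestroy(graph));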
@@ -477,7 +463,7 @@ int main(int argc, const char* argv[]) {
   double t0, t1, ms, time_in_us;
   t0 = getTime();
   for (int i = 0; i < cudagraphlaunch; ++i) {
-    cudaGraphLaunch(instance, stream);
+    CUDACHECK(cudaGraphLaunch(instance, stream));
   }
   CUDACHECK(cudaStreamSynchronize(stream));
 
@@ -23,18 +23,6 @@ int nranksPerNode;
 int rank;
 int world_size;
 
-// Propagate errors up
-
-// Check CUDA RT calls
-#define CUCHECK(cmd)                                                                     \
-  do {                                                                                   \
-    cudaError_t err = cmd;                                                               \
-    if (err != cudaSuccess) {                                                            \
-      printf("%s:%d Cuda failure '%s'\n", __FILE__, __LINE__, cudaGetErrorString(err));  \
-      exit(EXIT_FAILURE);                                                                \
-    }                                                                                    \
-  } while (false)
-
 // Measure current time in second.
 static double getTime(void) {
   struct timespec tspec;
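Note: getTime is cut off at the hunk boundary; the body between the declaration here and the return statement in the next hunk is presumably a clock_gettime call. A hedged sketch of the complete helper (the clock source is an assumption; requires <time.h>):

    // Sketch of the full helper; only the first and last lines are visible in this diff.
    static double getTime(void) {
      struct timespec tspec;
      if (clock_gettime(CLOCK_MONOTONIC, &tspec) == -1) {
        printf("clock_gettime failed\n");
        exit(EXIT_FAILURE);
      }
      return (tspec.tv_nsec / 1.0e9) + tspec.tv_sec;
    }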
@@ -45,8 +33,8 @@ static double getTime(void) {
   return (tspec.tv_nsec / 1.0e9) + tspec.tv_sec;
 }
 
-__global__ void kernel(int r, int nranks, mscclpp::FifoDeviceHandle fifo,
-                       mscclpp::Host2DeviceSemaphore::DeviceHandle* handles, int handleIndex) {
+__global__ void kernel(int r, mscclpp::FifoDeviceHandle fifo, mscclpp::Host2DeviceSemaphore::DeviceHandle* handles,
+                       int handleIndex) {
   int tid = threadIdx.x;
   __syncthreads();
   // uint64_t tail;
@@ -75,8 +63,8 @@ void print_usage(const char* prog) {
 
 void initializeAndAllocateAllGatherData(int rank, int world_size, size_t dataSize, size_t nelemsPerGPU, int** data_h,
                                         int** data_d) {
-  CUCHECK(cudaMalloc(data_d, dataSize));
-  CUCHECK(cudaMemset(*data_d, 0, dataSize));
+  MSCCLPP_CUDATHROW(cudaMalloc(data_d, dataSize));
+  MSCCLPP_CUDATHROW(cudaMemset(*data_d, 0, dataSize));
 
   *data_h = new int[nelemsPerGPU * world_size];
   for (size_t i = 0; i < nelemsPerGPU * world_size; i++) {
@@ -87,29 +75,29 @@ void initializeAndAllocateAllGatherData(int rank, int world_size, size_t dataSiz
       (*data_h)[i] = 0;
     }
   }
-  CUCHECK(cudaMemcpy(*data_d, *data_h, dataSize, cudaMemcpyHostToDevice));
+  MSCCLPP_CUDATHROW(cudaMemcpy(*data_d, *data_h, dataSize, cudaMemcpyHostToDevice));
 }
 
 class MyProxyService {
  private:
-  int deviceNumaNode_;
-  mscclpp::Proxy proxy_;
-  int dataSize_;
   std::vector<mscclpp::RegisteredMemory> remoteMemories_;
   mscclpp::RegisteredMemory localMemory_;
   std::vector<std::shared_ptr<mscclpp::Host2HostSemaphore>> hostSemaphores_;
   std::vector<std::shared_ptr<mscclpp::Host2DeviceSemaphore>> deviceSemaphores1_;
   std::vector<std::shared_ptr<mscclpp::Host2DeviceSemaphore>> deviceSemaphores2_;
   std::vector<std::shared_ptr<mscclpp::Connection>> connections_;
+  int dataSize_;
+  mscclpp::Proxy proxy_;
+  int deviceNumaNode_;
 
  public:
   MyProxyService(mscclpp::Communicator& comm, int* data_d, int dataSize)
-      : remoteMemories_(world_size),
+      : dataSize_(dataSize),
+        remoteMemories_(world_size),
         connections_(world_size),
-        dataSize_(dataSize),
         proxy_([&](mscclpp::ProxyTrigger triggerRaw) { return handleTrigger(triggerRaw); }, [&]() { bindThread(); }) {
     int cudaDevice;
-    CUCHECK(cudaGetDevice(&cudaDevice));
+    MSCCLPP_CUDATHROW(cudaGetDevice(&cudaDevice));
     deviceNumaNode_ = mscclpp::getDeviceNumaNode(cudaDevice);
 
     int thisNode = rankToNode(rank);
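Note: the member reordering above is not cosmetic. C++ initializes non-static data members in declaration order, not initializer-list order, so the declarations are presumably rearranged to match the order the constructor actually needs (dataSize_ and the containers before proxy_, whose initializer captures this). A minimal illustration of the pitfall with a hypothetical class:

    // Hypothetical: b_ is declared before a_, so b_(a_) runs first and reads
    // a_ before a_ is initialized, regardless of the initializer-list order.
    struct Bad {
      int b_;
      int a_;
      Bad(int a) : a_(a), b_(a_) {}  // b_ initialized first -> reads garbage
    };
    // Compilers flag the mismatch with -Wreorder; the fix is to declare members
    // in the order they must be initialized, as the diff above does.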
@@ -237,7 +225,7 @@ int main(int argc, char* argv[]) {
   MPI_Comm_free(&shmcomm);
 
   int cudaNum = rankToLocalRank(rank);
-  CUCHECK(cudaSetDevice(cudaNum));
+  MSCCLPP_CUDATHROW(cudaSetDevice(cudaNum));
 
   if (rank == 0) printf("Initializing MSCCL++\n");
   auto bootstrap = std::make_shared<mscclpp::TcpBootstrap>(rank, world_size);
@@ -268,30 +256,30 @@ int main(int argc, char* argv[]) {
   mscclpp::FifoDeviceHandle fifo = proxyService.fifo().deviceHandle();
   if (rank == 0) printf("Testing the correctness of AllGather implementation\n");
   cudaStream_t stream;
-  CUCHECK(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking));
+  MSCCLPP_CUDATHROW(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking));
   mscclpp::Host2DeviceSemaphore::DeviceHandle* deviceHandles1;
   mscclpp::Host2DeviceSemaphore::DeviceHandle* deviceHandles2;
 
-  CUCHECK(cudaMalloc(&deviceHandles1, sizeof(mscclpp::Host2DeviceSemaphore::DeviceHandle) * world_size));
+  MSCCLPP_CUDATHROW(cudaMalloc(&deviceHandles1, sizeof(mscclpp::Host2DeviceSemaphore::DeviceHandle) * world_size));
   for (int i = 0; i < world_size; ++i) {
     if (i == rank) continue;
     auto handle = proxyService.getDeviceHandle1(i);
-    CUCHECK(cudaMemcpy(&deviceHandles1[i], &handle, sizeof(mscclpp::Host2DeviceSemaphore::DeviceHandle),
-                       cudaMemcpyHostToDevice));
+    MSCCLPP_CUDATHROW(cudaMemcpy(&deviceHandles1[i], &handle, sizeof(mscclpp::Host2DeviceSemaphore::DeviceHandle),
+                                 cudaMemcpyHostToDevice));
   }
 
-  CUCHECK(cudaMalloc(&deviceHandles2, sizeof(mscclpp::Host2DeviceSemaphore::DeviceHandle) * world_size));
+  MSCCLPP_CUDATHROW(cudaMalloc(&deviceHandles2, sizeof(mscclpp::Host2DeviceSemaphore::DeviceHandle) * world_size));
   for (int i = 0; i < world_size; ++i) {
     if (i == rank) continue;
     auto handle = proxyService.getDeviceHandle2(i);
-    CUCHECK(cudaMemcpy(&deviceHandles2[i], &handle, sizeof(mscclpp::Host2DeviceSemaphore::DeviceHandle),
-                       cudaMemcpyHostToDevice));
+    MSCCLPP_CUDATHROW(cudaMemcpy(&deviceHandles2[i], &handle, sizeof(mscclpp::Host2DeviceSemaphore::DeviceHandle),
+                                 cudaMemcpyHostToDevice));
   }
 
-  kernel<<<1, world_size, 0, stream>>>(rank, world_size, fifo, deviceHandles1, 1);
-  CUCHECK(cudaStreamSynchronize(stream));
+  kernel<<<1, world_size, 0, stream>>>(rank, fifo, deviceHandles1, 1);
+  MSCCLPP_CUDATHROW(cudaStreamSynchronize(stream));
 
-  CUCHECK(cudaMemcpy(data_h, data_d, dataSize, cudaMemcpyDeviceToHost));
+  MSCCLPP_CUDATHROW(cudaMemcpy(data_h, data_d, dataSize, cudaMemcpyDeviceToHost));
 
   for (size_t i = 0; i < nelemsPerGPU * world_size; i++) {
     int val = i + 1;
@@ -307,14 +295,14 @@ int main(int argc, char* argv[]) {
   double t0, t1, ms, time_in_us;
   int iterwithoutcudagraph = 10;
   if (rank == 0) printf("Running %d iterations of the kernel without CUDA graph\n", iterwithoutcudagraph);
-  CUCHECK(cudaStreamSynchronize(stream));
+  MSCCLPP_CUDATHROW(cudaStreamSynchronize(stream));
   bootstrap->barrier();
   t0 = getTime();
   for (int i = 0; i < iterwithoutcudagraph; ++i) {
-    kernel<<<1, world_size, 0, stream>>>(rank, world_size, fifo, deviceHandles1, 1);
-    kernel<<<1, world_size, 0, stream>>>(rank, world_size, fifo, deviceHandles2, 2);
+    kernel<<<1, world_size, 0, stream>>>(rank, fifo, deviceHandles1, 1);
+    kernel<<<1, world_size, 0, stream>>>(rank, fifo, deviceHandles2, 2);
   }
-  CUCHECK(cudaStreamSynchronize(stream));
+  MSCCLPP_CUDATHROW(cudaStreamSynchronize(stream));
   bootstrap->barrier();
   t1 = getTime();
   ms = (t1 - t0) * 1000.0;
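Note: with ms = (t1 - t0) * 1000.0 measured over iterwithoutcudagraph iterations (two kernel launches each), the per-iteration figure in microseconds presumably follows the usual conversion; the file's exact report line is outside this hunk:

    // Hypothetical conversion consistent with the variables above.
    time_in_us = ms * 1000.0 / iterwithoutcudagraph;
    if (rank == 0) printf("Time per iteration: %f us\n", time_in_us);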
@@ -327,22 +315,22 @@ int main(int argc, char* argv[]) {
   if (rank == 0) printf("Capturing %d iterations of the kernel in a CUDA graph\n", cudagraphiter);
   cudaGraph_t graph;
   cudaGraphExec_t instance;
-  cudaStreamBeginCapture(stream, cudaStreamCaptureModeGlobal);
+  MSCCLPP_CUDATHROW(cudaStreamBeginCapture(stream, cudaStreamCaptureModeGlobal));
   for (int i = 0; i < cudagraphiter; ++i) {
-    kernel<<<1, world_size, 0, stream>>>(rank, world_size, fifo, deviceHandles1, 1);
-    kernel<<<1, world_size, 0, stream>>>(rank, world_size, fifo, deviceHandles2, 2);
+    kernel<<<1, world_size, 0, stream>>>(rank, fifo, deviceHandles1, 1);
+    kernel<<<1, world_size, 0, stream>>>(rank, fifo, deviceHandles2, 2);
   }
-  cudaStreamEndCapture(stream, &graph);
-  cudaGraphInstantiate(&instance, graph, NULL, NULL, 0);
+  MSCCLPP_CUDATHROW(cudaStreamEndCapture(stream, &graph));
+  MSCCLPP_CUDATHROW(cudaGraphInstantiate(&instance, graph, NULL, NULL, 0));
 
   int cudagraphwarmup = 10;
   if (rank == 0)
     printf("Warming up %d iterations of the CUDA graph with %d iterations of the kernel\n", cudagraphwarmup,
            cudagraphiter);
   for (int i = 0; i < cudagraphwarmup; ++i) {
-    cudaGraphLaunch(instance, stream);
+    MSCCLPP_CUDATHROW(cudaGraphLaunch(instance, stream));
   }
-  CUCHECK(cudaStreamSynchronize(stream));
+  MSCCLPP_CUDATHROW(cudaStreamSynchronize(stream));
 
   // measure runtime
   int cudagraphlaunch = 10;
@@ -352,9 +340,9 @@ int main(int argc, char* argv[]) {
   bootstrap->barrier();
   t0 = getTime();
   for (int i = 0; i < cudagraphlaunch; ++i) {
-    cudaGraphLaunch(instance, stream);
+    MSCCLPP_CUDATHROW(cudaGraphLaunch(instance, stream));
   }
-  CUCHECK(cudaStreamSynchronize(stream));
+  MSCCLPP_CUDATHROW(cudaStreamSynchronize(stream));
 
   t1 = getTime();
   ms = (t1 - t0) * 1000.0;
@@ -21,7 +21,7 @@ __constant__ DeviceHandle<mscclpp::ProxyChannel> constRawProxyChan[16];
 
 __constant__ DeviceHandle<mscclpp::SmChannel> constSmChans[8];
 
-__global__ void allgather0(int rank, int worldSize, size_t nelemsPerGPU) {
+__global__ void allgather0(int rank, size_t nelemsPerGPU) {
   int warpId = threadIdx.x / 32;
 
   // Each warp is responsible for one of the remote ranks
@@ -41,9 +41,8 @@ __global__ void allgather0(int rank, int worldSize, size_t nelemsPerGPU) {
   if (threadIdx.x % 32 == 0) proxyChan.wait();
 }
 
-__device__ void localAllGather(DeviceHandle<mscclpp::SimpleProxyChannel> proxyChan, int rank, int worldSize,
-                               int nRanksPerNode, int remoteRank, uint64_t offset, uint64_t size,
-                               bool flushAfterSignal = true) {
+__device__ void localAllGather(DeviceHandle<mscclpp::SimpleProxyChannel> proxyChan, int rank, int nRanksPerNode,
+                               int remoteRank, uint64_t offset, uint64_t size, bool flushAfterSignal = true) {
   // this allgather algorithm works as follows:
   // Step 1: GPU rank i sends data to GPU rank (i+1) % nRanksPerNode
   // and waits for data from GPU rank (i-1) % nRanksPerNode
@@ -112,14 +111,14 @@ __device__ void localAllGatherSm(int rank, int nRanksPerNode, int startRankChunk
   constSmChans[peerIdx].get(offset + offsetForThisBlock, sizeForThisBlock, threadIdx.x, blockDim.x);
 }
 
-__global__ void allgather1(int rank, int worldSize, int nRanksPerNode, size_t nelemsPerGPU) {
+__global__ void allgather1(int rank, int nRanksPerNode, size_t nelemsPerGPU) {
   int warpId = threadIdx.x / 32;
   int remoteRank = (warpId < rank) ? warpId : warpId + 1;
 
   // Each warp is responsible for one of the remote ranks
   DeviceHandle<mscclpp::SimpleProxyChannel> proxyChan = constProxyChans[warpId];
 
-  localAllGather(proxyChan, rank, worldSize, nRanksPerNode, remoteRank, rank * nelemsPerGPU * sizeof(int),
+  localAllGather(proxyChan, rank, nRanksPerNode, remoteRank, rank * nelemsPerGPU * sizeof(int),
                  nelemsPerGPU * sizeof(int));
 }
 
@@ -145,7 +144,7 @@ __global__ void allgather2(int rank, int worldSize, int nRanksPerNode, size_t ne
   // Step 1
   // local allgather
   if (remoteRank / nRanksPerNode == rank / nRanksPerNode) {
-    localAllGather(proxyChan, rank, worldSize, nRanksPerNode, remoteRank, rank * nelemsPerGPU * sizeof(int),
+    localAllGather(proxyChan, rank, nRanksPerNode, remoteRank, rank * nelemsPerGPU * sizeof(int),
                    nelemsPerGPU * sizeof(int), false);
   }
   // cross-node exchange
@@ -170,7 +169,7 @@ __global__ void allgather2(int rank, int worldSize, int nRanksPerNode, size_t ne
   // local allgather
   int otherNghr = (rank + nRanksPerNode) % worldSize;
   if (remoteRank / nRanksPerNode == rank / nRanksPerNode) {
-    localAllGather(proxyChan, rank, worldSize, nRanksPerNode, remoteRank, otherNghr * nelemsPerGPU * sizeof(int),
+    localAllGather(proxyChan, rank, nRanksPerNode, remoteRank, otherNghr * nelemsPerGPU * sizeof(int),
                    (nelemsPerGPU * (pipelineSize - 1)) / pipelineSize * sizeof(int), false);
   }
 
@@ -192,13 +191,13 @@ __global__ void allgather2(int rank, int worldSize, int nRanksPerNode, size_t ne
   // Step 3
   // local allgather
   if (remoteRank / nRanksPerNode == rank / nRanksPerNode) {
-    localAllGather(proxyChan, rank, worldSize, nRanksPerNode, remoteRank,
+    localAllGather(proxyChan, rank, nRanksPerNode, remoteRank,
                    (otherNghr * nelemsPerGPU + (nelemsPerGPU * (pipelineSize - 1)) / pipelineSize) * sizeof(int),
                    nelemsPerGPU / pipelineSize * sizeof(int));
   }
 }
 
-__global__ void allgather3(int rank, int worldSize) {
+__global__ void allgather3() {
   int warpId = threadIdx.x / 32;
 
   // Each warp is responsible for one of the remote ranks
@@ -315,9 +314,9 @@ class AllGatherProxyService : public mscclpp::BaseProxyService {
 
 AllGatherProxyService::AllGatherProxyService(int worldSize, int rank, int cudaDevice)
     : worldSize_(worldSize),
-      sendBytes_(0),
       rank_(rank),
      cudaDevice_(cudaDevice),
+      sendBytes_(0),
      proxy_(
          std::make_shared<mscclpp::Proxy>([&](mscclpp::ProxyTrigger triggerRaw) { return handleTrigger(triggerRaw); },
                                           [&]() {
@@ -382,13 +381,13 @@ void AllGatherTestColl::runColl(const TestArgs& args, cudaStream_t stream) {
     nThreads = 32 * (worldSize - 1);
   }
   if (kernelNum == 0) {
-    allgather0<<<nBlocks, nThreads, 0, stream>>>(rank, worldSize, paramCount_);
+    allgather0<<<nBlocks, nThreads, 0, stream>>>(rank, paramCount_);
   } else if (kernelNum == 1) {
-    allgather1<<<nBlocks, nThreads, 0, stream>>>(rank, worldSize, nRanksPerNode, paramCount_);
+    allgather1<<<nBlocks, nThreads, 0, stream>>>(rank, nRanksPerNode, paramCount_);
   } else if (kernelNum == 2) {
     allgather2<<<nBlocks, nThreads, 0, stream>>>(rank, worldSize, nRanksPerNode, paramCount_);
   } else if (kernelNum == 3) {
-    allgather3<<<nBlocks, nThreads, 0, stream>>>(rank, worldSize);
+    allgather3<<<nBlocks, nThreads, 0, stream>>>();
   } else if (kernelNum == 4) {
     allgather4<<<nBlocks, nThreads, 0, stream>>>(rank, worldSize, nRanksPerNode, paramCount_);
   }
@@ -48,7 +48,7 @@ __forceinline__ __device__ void vectorSum(int* dst, int* src, size_t nElem, int
   size_t nLastInts = nElem % 4;
   int4* dst4 = (int4*)dst;
   int4* src4 = (int4*)src;
-  for (int i = threadIdx.x + blockId * blockDim.x; i < nInt4; i += blockDim.x * nBlocks) {
+  for (size_t i = threadIdx.x + blockId * blockDim.x; i < nInt4; i += blockDim.x * nBlocks) {
     dst4[i].w += src4[i].w;
     dst4[i].x += src4[i].x;
     dst4[i].y += src4[i].y;
@@ -57,7 +57,7 @@ __forceinline__ __device__ void vectorSum(int* dst, int* src, size_t nElem, int
   if (nLastInts > 0) {
     int* dstLast = dst + nInt4 * 4;
     int* srcLast = src + nInt4 * 4;
-    for (int i = threadIdx.x + blockId * blockDim.x; i < nLastInts; i += blockDim.x * nBlocks) {
+    for (size_t i = threadIdx.x + blockId * blockDim.x; i < nLastInts; i += blockDim.x * nBlocks) {
       dstLast[i] += srcLast[i];
     }
   }
@@ -68,7 +68,7 @@ __forceinline__ __device__ void vectorSum(int* dst, int* src, size_t nElem) {
 }
 
 __device__ void vectorSumSingleBlock(int* dst, int* src, size_t nElem) {
-  for (int i = threadIdx.x; i < nElem; i += blockDim.x) {
+  for (size_t i = threadIdx.x; i < nElem; i += blockDim.x) {
     dst[i] += src[i];
   }
 }
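Note: these hunks switch loop counters from int to size_t so the induction variable matches the size_t bound, silencing signed/unsigned comparison warnings and avoiding overflow on very large buffers. The underlying pattern is a grid-stride loop with int4 vectorization plus a scalar tail; a self-contained sketch:

    // Grid-stride dst += src, vectorized over int4 with a scalar tail.
    __global__ void vectorSumSketch(int* dst, const int* src, size_t nElem) {
      size_t nInt4 = nElem / 4;
      int4* dst4 = reinterpret_cast<int4*>(dst);
      const int4* src4 = reinterpret_cast<const int4*>(src);
      for (size_t i = threadIdx.x + blockIdx.x * blockDim.x; i < nInt4;
           i += (size_t)blockDim.x * gridDim.x) {
        int4 v = src4[i];
        dst4[i].x += v.x; dst4[i].y += v.y; dst4[i].z += v.z; dst4[i].w += v.w;
      }
      // Tail: elements not covered by the 16-byte vector loop.
      for (size_t i = nInt4 * 4 + threadIdx.x + blockIdx.x * blockDim.x; i < nElem;
           i += (size_t)blockDim.x * gridDim.x) {
        dst[i] += src[i];
      }
    }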
@@ -277,10 +277,10 @@ __device__ void allGather(int rank, int worldSize, int nRanksPerNode, size_t nel
                    nelemsPerGPU / pipelineSize * sizeof(int));
 }
 
-__device__ void localReduceScatterSm(int* buff, int* scratch, int rank, int nRanksPerNode, int startChunkIndex,
-                                     size_t offsetInChunk, size_t chunkSize, size_t nelems, int nBlocks) {
+__device__ void localReduceScatterSm(int* buff, int rank, int nRanksPerNode, int startChunkIndex, size_t offsetInChunk,
+                                     size_t chunkSize, size_t nelems, int nBlocks) {
   if (nRanksPerNode == 1) return;
-  if (blockIdx.x >= nBlocks) return;
+  if ((int)blockIdx.x >= nBlocks) return;
   const int nPeer = nRanksPerNode - 1;
   DeviceHandle<mscclpp::SmChannel>* smChans = constSmOutOfPlaceGetChans;
 
@@ -299,7 +299,7 @@ __device__ void localReduceScatterSm(int* buff, int* scratch, int rank, int nRan
   reduceScatterDeviceSyncer.sync(nBlocks);
 
   const size_t nInt4 = nelems / 4;
-  for (int idx = threadIdx.x + blockIdx.x * blockDim.x; idx < nInt4; idx += blockDim.x * nBlocks) {
+  for (size_t idx = threadIdx.x + blockIdx.x * blockDim.x; idx < nInt4; idx += blockDim.x * nBlocks) {
     int4 sum = make_int4(0, 0, 0, 0);
 
     for (int peerIdx = 0; peerIdx < nPeer; peerIdx++) {
@@ -316,7 +316,7 @@ __device__ void localReduceScatterSm(int* buff, int* scratch, int rank, int nRan
   }
 
   const size_t nLastInts = nelems % 4;
-  for (int idx = threadIdx.x + blockIdx.x * blockDim.x; idx < nLastInts; idx += blockDim.x * nBlocks) {
+  for (size_t idx = threadIdx.x + blockIdx.x * blockDim.x; idx < nLastInts; idx += blockDim.x * nBlocks) {
     int sum = 0;
     for (int peerIdx = 0; peerIdx < nPeer; peerIdx++) {
       int val = smChans[peerIdx].read<int>(indexOffset + nInt4 * 4 + idx);
@@ -326,10 +326,10 @@ __device__ void localReduceScatterSm(int* buff, int* scratch, int rank, int nRan
   }
 }
 
-__device__ void localReduceScatterSm2(int* buff, int* scratch, int rank, int nRanksPerNode, size_t chunkSize,
-                                      size_t nelems, int nBlocks) {
+__device__ void localReduceScatterSm2(int* buff, int rank, int nRanksPerNode, size_t chunkSize, size_t nelems,
+                                      int nBlocks) {
   if (nRanksPerNode == 1) return;
-  if (blockIdx.x >= nBlocks) return;
+  if ((int)blockIdx.x >= nBlocks) return;
   const int nPeer = nRanksPerNode - 1;
   DeviceHandle<mscclpp::SmChannel>* smChans = constSmOutOfPlaceGetChans;
 
@@ -344,7 +344,7 @@ __device__ void localReduceScatterSm2(int* buff, int* scratch, int rank, int nRa
     smChans[tid].signal();
   }
   const int waitStart = nBlocks * blockDim.x - nPeer;
-  if (tid >= waitStart && tid < nBlocks * blockDim.x) {
+  if (tid >= waitStart && tid < (int)(nBlocks * blockDim.x)) {
     smChans[tid - waitStart].wait();
   }
   reduceScatterDeviceSyncer.sync(nBlocks);
@@ -353,7 +353,7 @@ __device__ void localReduceScatterSm2(int* buff, int* scratch, int rank, int nRa
   for (int index = 0; index < nPeer; ++index) {
     int4 val;
     int peerIdx = (index + localRankIndexInNode) % nPeer;
-    for (int idx = threadIdx.x + blockIdx.x * blockDim.x; idx < nInt4; idx += blockDim.x * nBlocks) {
+    for (size_t idx = threadIdx.x + blockIdx.x * blockDim.x; idx < nInt4; idx += blockDim.x * nBlocks) {
       val = smChans[peerIdx].read<int4>(indexOffset4 + idx);
       buff4[indexOffset4 + idx].w += val.w;
       buff4[indexOffset4 + idx].x += val.x;
@@ -364,7 +364,7 @@ __device__ void localReduceScatterSm2(int* buff, int* scratch, int rank, int nRa
 
   const size_t nLastInts = nelems % 4;
   for (int peerIdx = 0; peerIdx < nPeer; peerIdx++) {
-    for (int idx = threadIdx.x + blockIdx.x * blockDim.x; idx < nLastInts; idx += blockDim.x * nBlocks) {
+    for (size_t idx = threadIdx.x + blockIdx.x * blockDim.x; idx < nLastInts; idx += blockDim.x * nBlocks) {
       int val = smChans[(localRankIndexInNode + peerIdx) % nPeer].read<int>(indexOffset + nInt4 * 4 + idx);
       buff[indexOffset + nInt4 * 4 + idx] += val;
     }
@@ -391,18 +391,18 @@ __device__ void reduceScatterSm(int* buff, int* scratch, int rank, int nRanksPer
   int peerNodeId = peerRank / nRanksPerNode;
   int nBlocksForReduceScatter =
       (int)(nBlocksForReduceScatterRatio * gridDim.x) / (nRanksPerNode - 1) * (nRanksPerNode - 1);
-  int isComm = (threadIdx.x == 0) && (blockIdx.x == nBlocksForReduceScatter);
+  int isComm = (threadIdx.x == 0) && ((int)blockIdx.x == nBlocksForReduceScatter);
   int peer = (peerRank < rank) ? peerRank : peerRank - 1;
   int nBlocksRemain = gridDim.x - nBlocksForReduceScatter;
   DeviceHandle<mscclpp::SimpleProxyChannel>& proxyChan = constDevFstRoundChans[peer];
   if (peerNodeId == rank / nRanksPerNode) {
-    localReduceScatterSm(buff, scratch, rank, nRanksPerNode, 0, 0, chunkSize, chunkSize, gridDim.x);
+    localReduceScatterSm(buff, rank, nRanksPerNode, 0, 0, chunkSize, chunkSize, gridDim.x);
     return;
   }
 
   // step 1: local reduce
   int startChunkIndex = peerNodeId * nRanksPerNode;
-  localReduceScatterSm(buff, scratch, rank, nRanksPerNode, startChunkIndex, 0, chunkSize, chunkSize / pipelineSize,
+  localReduceScatterSm(buff, rank, nRanksPerNode, startChunkIndex, 0, chunkSize, chunkSize / pipelineSize,
                        nBlocksForReduceScatter);
   deviceSyncer.sync(gridDim.x);
 
@@ -412,12 +412,12 @@ __device__ void reduceScatterSm(int* buff, int* scratch, int rank, int nRanksPer
     // opposite side
     proxyChan.putWithSignal(offset, (chunkSize / pipelineSize * sizeof(int)));
   }
-  localReduceScatterSm(buff, scratch, rank, nRanksPerNode, startChunkIndex, chunkSize / pipelineSize, chunkSize,
+  localReduceScatterSm(buff, rank, nRanksPerNode, startChunkIndex, chunkSize / pipelineSize, chunkSize,
                        2 * chunkSize / pipelineSize, nBlocksForReduceScatter);
   if (isComm) {
     proxyChan.wait();
   }
-  if (blockIdx.x >= nBlocksForReduceScatter) {
+  if ((int)blockIdx.x >= nBlocksForReduceScatter) {
     ibDeviceSyncer.sync(nBlocksRemain);
     // reduce data received from peer to related rank
     size_t offset = rank * chunkSize * sizeof(int);
@@ -436,8 +436,7 @@ __device__ void reduceScatterSm(int* buff, int* scratch, int rank, int nRanksPer
     size_t offset = (peerRank * chunkSize + chunkSize / pipelineSize) * sizeof(int);
     proxyChan.putWithSignal(offset, (pipelineSize - 1) * chunkSize / pipelineSize * sizeof(int));
   }
-  localReduceScatterSm(buff, scratch, rank, nRanksPerNode, startChunkIndex, 0, chunkSize, chunkSize,
-                       nBlocksForReduceScatter);
+  localReduceScatterSm(buff, rank, nRanksPerNode, startChunkIndex, 0, chunkSize, chunkSize, nBlocksForReduceScatter);
   if (isComm) {
     proxyChan.wait();
   }
@@ -509,7 +508,7 @@ __device__ void localRingAllGatherSm(int rank, int nRanksPerNode, uint64_t size,
     constSmInPlaceChans[tid].signal();
   }
   int waitStart = nBlocks * blockDim.x - nPeer;
-  if (tid >= waitStart && tid < nBlocks * blockDim.x) {
+  if (tid >= waitStart && tid < (int)(nBlocks * blockDim.x)) {
     constSmInPlaceChans[tid - waitStart].wait();
   }
   allGatherDeviceSyncer.sync(nBlocks);
@@ -631,8 +630,7 @@ __global__ void allreduce0(int* buff, int* scratch, int rank, int worldSize, siz
   }
 }
 
-__global__ void __launch_bounds__(1024)
-    allreduce1(int* buff, int* scratch, int rank, int worldSize, size_t nelems, size_t scratchDataCount) {
+__global__ void __launch_bounds__(1024) allreduce1(int* buff, int* scratch, int rank, int worldSize, size_t nelems) {
   int isComm = (threadIdx.x == 0) && (blockIdx.x == 0);
   int remoteSendRank = (rank + 1) % worldSize;
   int remoteRecvRank = (rank + worldSize - 1) % worldSize;
@@ -849,23 +847,21 @@ __global__ void allreduce2(int* buff, void* scratch, void* putPktBuf, void* getP
 }
 
 __global__ void __launch_bounds__(1024)
-    allreduce3(int* buff, int* scratch, void* result, int rank, int nRanksPerNode, int worldSize, size_t nelems) {
+    allreduce3(int* buff, int* scratch, int rank, int nRanksPerNode, int worldSize, size_t nelems) {
   reduceScatter(buff, scratch, rank, nRanksPerNode, worldSize, nelems);
   if (threadIdx.x == 0 && blockIdx.x == 0) {
     allGather(rank, worldSize, nRanksPerNode, nelems / worldSize);
   }
 }
 
-__global__ void allreduce4(int* buff, int* scratch, void* result, int rank, int nRanksPerNode, int worldSize,
-                           size_t nelems) {
+__global__ void allreduce4(int* buff, int* scratch, int rank, int nRanksPerNode, int worldSize, size_t nelems) {
   reduceScatterSm(buff, scratch, rank, nRanksPerNode, worldSize, nelems);
   deviceSyncer.sync(gridDim.x);
   allGatherSm(rank, worldSize, nRanksPerNode, nelems / worldSize);
 }
 
-__global__ void allreduce5(int* buff, int* scratch, void* result, int rank, int nRanksPerNode, int worldSize,
-                           size_t nelems) {
-  localReduceScatterSm2(buff, scratch, rank, nRanksPerNode, nelems / worldSize, nelems / worldSize, gridDim.x);
+__global__ void allreduce5(int* buff, int rank, int nRanksPerNode, int worldSize, size_t nelems) {
+  localReduceScatterSm2(buff, rank, nRanksPerNode, nelems / worldSize, nelems / worldSize, gridDim.x);
   deviceSyncer.sync(gridDim.x);
   localRingAllGatherSm(rank, nRanksPerNode, nelems / worldSize * sizeof(int), gridDim.x);
 }
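Note: allreduce3/4/5 above all follow the classic decomposition allreduce = reduce-scatter + all-gather. A schematic summary of the shared structure (commentary only, not this file's code):

    // Shared structure of allreduce3/4/5 (schematic):
    //   1. reduceScatter(...): rank r ends up owning the fully reduced
    //      chunk r (nelems / worldSize elements).
    //   2. grid-wide sync so every block sees the reduced chunk.
    //   3. allGather(...): every rank collects all reduced chunks.
    // Data moved per rank is roughly 2 * (worldSize - 1) / worldSize * nelems,
    // which is bandwidth-optimal for large messages.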
@@ -984,20 +980,19 @@ void AllReduceTestColl::runColl(const TestArgs& args, cudaStream_t stream) {
     allreduce0<<<nBlocks, nThreadsPerBlock, 0, stream>>>((int*)inputBuff, (int*)tmpBuff, rank, worldSize, paramCount_,
                                                          scratchDataCount);
   else if (kernelNum == 1)
-    allreduce1<<<nBlocks, nThreadsPerBlock, 0, stream>>>((int*)inputBuff, (int*)tmpBuff, rank, worldSize, paramCount_,
-                                                         scratchDataCount);
+    allreduce1<<<nBlocks, nThreadsPerBlock, 0, stream>>>((int*)inputBuff, (int*)tmpBuff, rank, worldSize, paramCount_);
   else if (kernelNum == 2)
     allreduce2<<<nBlocks, nThreadsPerBlock, 0, stream>>>((int*)inputBuff, tmpBuff, putPacketBuff, getPacketBuff,
                                                          resultBuff, rank, args.nRanksPerNode, worldSize, paramCount_);
   else if (kernelNum == 3)
-    allreduce3<<<nBlocks, nThreadsPerBlock, 0, stream>>>((int*)inputBuff, (int*)tmpBuff, resultBuff, rank,
-                                                         args.nRanksPerNode, worldSize, paramCount_);
+    allreduce3<<<nBlocks, nThreadsPerBlock, 0, stream>>>((int*)inputBuff, (int*)tmpBuff, rank, args.nRanksPerNode,
+                                                         worldSize, paramCount_);
   else if (kernelNum == 4)
-    allreduce4<<<nBlocks, nThreadsPerBlock, 0, stream>>>((int*)inputBuff, (int*)tmpBuff, resultBuff, rank,
-                                                         args.nRanksPerNode, worldSize, paramCount_);
+    allreduce4<<<nBlocks, nThreadsPerBlock, 0, stream>>>((int*)inputBuff, (int*)tmpBuff, rank, args.nRanksPerNode,
+                                                         worldSize, paramCount_);
   else if (kernelNum == 5)
-    allreduce5<<<nBlocks, nThreadsPerBlock, 0, stream>>>((int*)inputBuff, (int*)tmpBuff, resultBuff, rank,
-                                                         args.nRanksPerNode, worldSize, paramCount_);
+    allreduce5<<<nBlocks, nThreadsPerBlock, 0, stream>>>((int*)inputBuff, rank, args.nRanksPerNode, worldSize,
+                                                         paramCount_);
   else if (kernelNum == 6) {
     allreduce6<<<nBlocks, nThreadsPerBlock, 0, stream>>>((int*)inputBuff, (int*)tmpBuff, resultBuff, rank,
                                                          args.nRanksPerNode, worldSize, paramCount_);
@@ -1042,19 +1037,19 @@ void AllReduceTestColl::setupCollTest(size_t size) {
 
 std::vector<KernelRestriction> AllReduceTestColl::getKernelRestrictions() {
   return {// {kernelNum, kernelName, compatibleWithMultiNodes, countDivisorForMultiNodes, alignedBytes}
-          {0, "allreduce0", true, 1, .alignedBytes = 4 * worldSize_},
-          {1, "allreduce1", true, 1, .alignedBytes = 4 * worldSize_},
-          {2, "allreduce2", true, 1, .alignedBytes = 4 * worldSize_},
-          {3, "allreduce3", true, 3, .alignedBytes = 4 * worldSize_},
+          {0, "allreduce0", true, 1, 4 * worldSize_},
+          {1, "allreduce1", true, 1, 4 * worldSize_},
+          {2, "allreduce2", true, 1, 4 * worldSize_},
+          {3, "allreduce3", true, 3, 4 * worldSize_},
           {
              4,
              "allreduce4",
              true,
              3,
-              .alignedBytes = 16 * worldSize_ /*use ulong2 to transfer data*/,
+              16 * worldSize_ /*use ulong2 to transfer data*/,
          },
-          {5, "allreduce5", false, 1, .alignedBytes = 4 * worldSize_},
-          {6, "allreduce6", false, 1, .alignedBytes = 4 * worldSize_}};
+          {5, "allreduce5", false, 1, 4 * worldSize_},
+          {6, "allreduce6", false, 1, 4 * worldSize_}};
 }
 
 class AllReduceTestEngine : public BaseTestEngine {
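Note: dropping `.alignedBytes =` here is presumably a standards-conformance fix — in standard C++ aggregate initialization, designated and positional initializers cannot be mixed in one braced list, so the original form compiled only as a compiler extension. A minimal illustration with a hypothetical aggregate:

    struct Restriction { int num; const char* name; bool multiNode; int divisor; int alignedBytes; };

    Restriction a = {0, "k0", true, 1, 64};                     // OK: all positional
    Restriction b = {.num = 0, .name = "k0", .multiNode = true,
                     .divisor = 1, .alignedBytes = 64};         // OK: all designated
    // Restriction c = {0, "k0", true, 1, .alignedBytes = 64};  // ill-formed:
    // positional and designated initializers cannot be mixed in standard C++.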
@@ -14,7 +14,7 @@ void* localRecvBuff;
 void* localSendBuff;
 
 __device__ void localAlltoall(int rank, int nRanksPerNode, size_t nElements) {
-  int remoteRank = (blockIdx.x < rank) ? blockIdx.x : blockIdx.x + 1;
+  int remoteRank = ((int)blockIdx.x < rank) ? blockIdx.x : blockIdx.x + 1;
   for (int i = 1; i < nRanksPerNode; i++) {
     DeviceHandle<mscclpp::SimpleProxyChannel> proxyChan = constProxyChans[blockIdx.x];
     if (threadIdx.x == 0 && remoteRank % nRanksPerNode == (rank + i) % nRanksPerNode) {
@@ -29,8 +29,8 @@ __device__ void localAlltoall(int rank, int nRanksPerNode, size_t nElements) {
   }
 }
 
-__global__ void alltoall0(int rank, int worldSize, size_t nElements) {
-  int remoteRank = (blockIdx.x < rank) ? blockIdx.x : blockIdx.x + 1;
+__global__ void alltoall0(int rank, size_t nElements) {
+  int remoteRank = ((int)blockIdx.x < rank) ? blockIdx.x : blockIdx.x + 1;
   DeviceHandle<mscclpp::SimpleProxyChannel> proxyChan = constProxyChans[blockIdx.x];
   if (threadIdx.x == 0) {
     proxyChan.putWithSignal(rank * nElements * sizeof(int), remoteRank * nElements * sizeof(int),
@@ -68,7 +68,7 @@ void AllToAllTestColl::runColl(const TestArgs& args, cudaStream_t stream) {
   CUDATHROW(cudaMemcpyAsync((int*)localRecvBuff + paramCount_ * rank, (int*)localSendBuff + paramCount_ * rank,
                             paramCount_ * sizeof(int), cudaMemcpyDeviceToDevice, stream));
   if (kernelNum == 0) {
-    alltoall0<<<worldSize - 1, 32, 0, stream>>>(rank, worldSize, paramCount_);
+    alltoall0<<<worldSize - 1, 32, 0, stream>>>(rank, paramCount_);
   } else if (kernelNum == 1) {
     alltoall1<<<worldSize - 1, 32, 0, stream>>>(rank, nRanksPerNode, paramCount_);
   }
@@ -188,7 +188,7 @@ BaseTestEngine::BaseTestEngine(const TestArgs& args, const std::string& name)
   CUDATHROW(cudaStreamCreateWithFlags(&this->stream_, cudaStreamNonBlocking));
 }
 
-BaseTestEngine::~BaseTestEngine() { cudaStreamDestroy(stream_); }
+BaseTestEngine::~BaseTestEngine() { (void)cudaStreamDestroy(stream_); }
 
 void BaseTestColl::setupCollTest(const TestArgs& args, size_t size) {
   this->worldSize_ = args.totalRanks;
@@ -6,6 +6,7 @@
 #include <cstring>
 #include <iostream>
 #include <mscclpp/concurrency.hpp>
+#include <mscclpp/cuda_utils.hpp>
 #include <mscclpp/semaphore.hpp>
 #include <mscclpp/sm_channel.hpp>
 #include <string>
@@ -35,7 +36,7 @@ inline mscclpp::Transport getTransport(int rank, int peerRank, int nRanksPerNode
 
 __device__ mscclpp::DeviceSyncer deviceSyncer;
 
-__global__ void kernel(int rank, size_t dataSize, size_t dataPerBlock) {
+__global__ void kernel(size_t dataSize, size_t dataPerBlock) {
   size_t startIndex = blockIdx.x * dataPerBlock;
   size_t blockDataSize = min(dataSize - startIndex, dataPerBlock);
   int globalIndex = blockIdx.x * blockDim.x + threadIdx.x;
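Note: mscclpp::DeviceSyncer, declared above as a __device__ global, provides a grid-wide barrier across thread blocks and is used throughout these kernels as deviceSyncer.sync(gridDim.x). A hedged usage sketch:

    // Grid-wide barrier following the usage seen in this diff; zero-initialize
    // the syncer from the host with cudaMemcpyToSymbol before launch, as the
    // setupCollTest hunk below does.
    __device__ mscclpp::DeviceSyncer syncer;

    __global__ void twoPhaseKernel(int* buf) {
      // phase 1: every block writes its part of buf
      syncer.sync(gridDim.x);  // all blocks reach here before any proceeds
      // phase 2: blocks may now read what other blocks wrote
    }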
@@ -63,12 +64,12 @@ class SendRecvTestColl : public BaseTestColl {
   std::vector<KernelRestriction> getKernelRestrictions() override;
 };
 
-void SendRecvTestColl::runColl(const TestArgs& args, cudaStream_t stream) {
+void SendRecvTestColl::runColl(const TestArgs&, cudaStream_t stream) {
   size_t sendBytes = sendCount_ * typeSize_;
   int blockNum = getBlockNum(sendBytes);
   size_t bytesPerBlock = (sendBytes + blockNum - 1) / blockNum;
   if (kernelNum_ == 0) {
-    kernel<<<blockNum, BLOCK_THREADS_NUM, 0, stream>>>(args.rank, sendBytes, bytesPerBlock);
+    kernel<<<blockNum, BLOCK_THREADS_NUM, 0, stream>>>(sendBytes, bytesPerBlock);
   }
 }
 
@@ -87,11 +88,11 @@ std::vector<KernelRestriction> SendRecvTestColl::getKernelRestrictions() {
 void SendRecvTestColl::initData(const TestArgs& args, std::vector<void*> sendBuff, void* expectedBuff) {
   int rank = args.rank;
   if (sendBuff.size() != 1) std::unexpected();
-  CUDATHROW(cudaMemset(sendBuff[0], 0, sendCount_ * typeSize_));
+  MSCCLPP_CUDATHROW(cudaMemset(sendBuff[0], 0, sendCount_ * typeSize_));
 
   // TODO: The type should not limited to int.
   std::vector<int> dataHost(std::max(sendCount_, recvCount_), rank);
-  CUDATHROW(cudaMemcpy(sendBuff[0], dataHost.data(), sendCount_ * typeSize_, cudaMemcpyHostToDevice));
+  MSCCLPP_CUDATHROW(cudaMemcpy(sendBuff[0], dataHost.data(), sendCount_ * typeSize_, cudaMemcpyHostToDevice));
 
   int peerRank = (rank - 1 + args.totalRanks) % args.totalRanks;
   for (size_t i = 0; i < recvCount_; i++) {
@@ -110,7 +111,7 @@ void SendRecvTestColl::setupCollTest(size_t size) {
   expectedCount_ = base;
 
   mscclpp::DeviceSyncer syncer = {};
-  CUDATHROW(cudaMemcpyToSymbol(deviceSyncer, &syncer, sizeof(mscclpp::DeviceSyncer)));
+  MSCCLPP_CUDATHROW(cudaMemcpyToSymbol(deviceSyncer, &syncer, sizeof(mscclpp::DeviceSyncer)));
 }
 
 class SendRecvTestEngine : public BaseTestEngine {
@@ -189,8 +190,8 @@ void SendRecvTestEngine::setupConnections() {
   }
   std::transform(smChannels_.begin(), smChannels_.end(), smChannelHandles.begin(),
                  [](const mscclpp::SmChannel& smChannel) { return smChannel.deviceHandle(); });
-  cudaMemcpyToSymbol(constSmChans, smChannelHandles.data(),
-                     sizeof(DeviceHandle<mscclpp::SmChannel>) * smChannelHandles.size());
+  MSCCLPP_CUDATHROW(cudaMemcpyToSymbol(constSmChans, smChannelHandles.data(),
+                                       sizeof(DeviceHandle<mscclpp::SmChannel>) * smChannelHandles.size()));
 }
 
 std::vector<void*> SendRecvTestEngine::getSendBuff() { return {devicePtrs_[0].get()}; }