mirror of
https://github.com/microsoft/mscclpp.git
synced 2026-05-13 01:36:10 +00:00
Fix the 2 GiB limit in allgather test. (#36)
This commit is contained in:
@@ -47,7 +47,7 @@ static double getTime(void)
|
||||
|
||||
__constant__ mscclppDevConn_t constDevConns[16];
|
||||
|
||||
__device__ void allgather0(mscclppDevConn_t devConn, int rank, int world_size, int remoteRank, int nelemsPerGPU)
|
||||
__device__ void allgather0(mscclppDevConn_t devConn, int rank, int world_size, int remoteRank, size_t nelemsPerGPU)
|
||||
{
|
||||
// this allgather is really simple and implemented as an alltoall
|
||||
|
||||
@@ -63,7 +63,7 @@ __device__ void allgather0(mscclppDevConn_t devConn, int rank, int world_size, i
|
||||
devConn.wait();
|
||||
}
|
||||
|
||||
__device__ void allgather1(mscclppDevConn_t devConn, int rank, int world_size, int remoteRank, int nelemsPerGPU)
|
||||
__device__ void allgather1(mscclppDevConn_t devConn, int rank, int world_size, int remoteRank, size_t nelemsPerGPU)
|
||||
{
|
||||
// this allgather algorithm works as follows:
|
||||
// Step 1: GPU rank i sends data to GPU rank (i+1) % world_size
|
||||
@@ -82,7 +82,7 @@ __device__ void allgather1(mscclppDevConn_t devConn, int rank, int world_size, i
|
||||
devConn.wait();
|
||||
}
|
||||
|
||||
__global__ void kernel(int rank, int world_size, int nelemsPerGPU, int kernel)
|
||||
__global__ void kernel(int rank, int world_size, size_t nelemsPerGPU, int kernel)
|
||||
{
|
||||
// only use a single thread from each warp
|
||||
if (threadIdx.x % 32 != 0)
|
||||
@@ -119,16 +119,16 @@ void print_usage(const char* prog)
|
||||
#endif
|
||||
}
|
||||
|
||||
void initializeAndAllocateAllGatherData(int rank, int world_size, size_t dataSize, int nelemsPerGPU, int** data_h,
|
||||
void initializeAndAllocateAllGatherData(int rank, int world_size, size_t dataSize, size_t nelemsPerGPU, int** data_h,
|
||||
int** data_d)
|
||||
{
|
||||
CUDACHECK(cudaMalloc(data_d, dataSize));
|
||||
CUDACHECK(cudaMemset(*data_d, 0, dataSize));
|
||||
|
||||
*data_h = new int[nelemsPerGPU * world_size];
|
||||
for (int i = 0; i < nelemsPerGPU * world_size; i++) {
|
||||
for (size_t i = 0; i < nelemsPerGPU * world_size; i++) {
|
||||
int val = i + 1;
|
||||
if (i / nelemsPerGPU == rank) {
|
||||
if (i / nelemsPerGPU == (size_t)rank) {
|
||||
(*data_h)[i] = val;
|
||||
} else {
|
||||
(*data_h)[i] = 0;
|
||||
@@ -317,9 +317,9 @@ int main(int argc, const char* argv[])
|
||||
int* data_h;
|
||||
size_t dataSize = 1024 * 1024 * 1024;
|
||||
if (parsedArgs.find("datasize") != parsedArgs.end()) {
|
||||
dataSize = std::stoi(parsedArgs["datasize"]);
|
||||
dataSize = std::stoul(parsedArgs["datasize"]);
|
||||
}
|
||||
int nelemsPerGPU = dataSize / sizeof(int) / world_size;
|
||||
size_t nelemsPerGPU = dataSize / sizeof(int) / world_size;
|
||||
|
||||
if (rank == 0)
|
||||
printf("Initializing data for allgather test\n");
|
||||
@@ -343,10 +343,10 @@ int main(int argc, const char* argv[])
|
||||
CUDACHECK(cudaMemcpy(data_h, data_d, dataSize, cudaMemcpyDeviceToHost));
|
||||
CUDACHECK(cudaDeviceSynchronize());
|
||||
|
||||
for (int i = 0; i < nelemsPerGPU * world_size; i++) {
|
||||
for (size_t i = 0; i < nelemsPerGPU * world_size; i++) {
|
||||
int val = i + 1;
|
||||
if (data_h[i] != val) {
|
||||
printf("oh uh! data_h[%d] (%d) != val (%d)\n", i, data_h[i], val);
|
||||
printf("oh uh! data_h[%ld] (%d) != val (%d)\n", i, data_h[i], val);
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -32,7 +32,7 @@ static double getTime(void)
|
||||
|
||||
__constant__ mscclppDevConn_t constDevConns[16];
|
||||
|
||||
__global__ void kernel(int rank, int world_size, int nelemsPerGPU)
|
||||
__global__ void kernel(int rank, int world_size, size_t nelemsPerGPU)
|
||||
{
|
||||
if (threadIdx.x % 32 != 0)
|
||||
return;
|
||||
@@ -63,7 +63,7 @@ __global__ void kernel(int rank, int world_size, int nelemsPerGPU)
|
||||
continue;
|
||||
|
||||
// Trigger sending data, flag and synchronize after
|
||||
int ibPortion = nelemsPerGPU / 12; // nelemsPerGPU/12;
|
||||
size_t ibPortion = nelemsPerGPU / 12; // nelemsPerGPU/12;
|
||||
if (isIB)
|
||||
devConn.fifo.setTrigger(trig, mscclppFlag | mscclppData | mscclppSync,
|
||||
rank * nelemsPerGPU * sizeof(int) + (nelemsPerGPU - ibPortion) * sizeof(int),
|
||||
@@ -168,7 +168,7 @@ int main(int argc, const char* argv[])
|
||||
int* data_d;
|
||||
uint64_t* flag_d;
|
||||
size_t data_size = 1536 * 1024 * 1024;
|
||||
int nelemsPerGPU = data_size / sizeof(int) / world_size;
|
||||
size_t nelemsPerGPU = data_size / sizeof(int) / world_size;
|
||||
CUDACHECK(cudaMalloc(&data_d, data_size));
|
||||
CUDACHECK(cudaMalloc(&flag_d, sizeof(uint64_t)));
|
||||
CUDACHECK(cudaMemset(data_d, 0, data_size));
|
||||
@@ -176,7 +176,7 @@ int main(int argc, const char* argv[])
|
||||
|
||||
int* data_h = new int[nelemsPerGPU * world_size];
|
||||
for (int i = 0; i < nelemsPerGPU * world_size; i++) {
|
||||
int val = i + 1;
|
||||
size_t val = i + 1;
|
||||
if (i / nelemsPerGPU == rank) {
|
||||
data_h[i] = val;
|
||||
} else {
|
||||
@@ -221,7 +221,7 @@ int main(int argc, const char* argv[])
|
||||
CUDACHECK(cudaMemcpy(data_h, data_d, data_size, cudaMemcpyDeviceToHost));
|
||||
CUDACHECK(cudaDeviceSynchronize());
|
||||
|
||||
for (int i = 0; i < nelemsPerGPU * world_size; i++) {
|
||||
for (size_t i = 0; i < nelemsPerGPU * world_size; i++) {
|
||||
int val = i + 1;
|
||||
if (data_h[i] != val) {
|
||||
printf("oh uh things went wrong! data_h[%d] (%d) != val (%d)\n", i, data_h[i], val);
|
||||
|
||||
Reference in New Issue
Block a user