Fix the 2 GiB limit in allgather test. (#36)

This commit is contained in:
Bin Wang
2023-03-29 19:02:43 +08:00
committed by GitHub
parent 62279b0063
commit 7880be8ee2
2 changed files with 15 additions and 15 deletions

View File

@@ -47,7 +47,7 @@ static double getTime(void)
__constant__ mscclppDevConn_t constDevConns[16];
__device__ void allgather0(mscclppDevConn_t devConn, int rank, int world_size, int remoteRank, int nelemsPerGPU)
__device__ void allgather0(mscclppDevConn_t devConn, int rank, int world_size, int remoteRank, size_t nelemsPerGPU)
{
// this allgather is really simple and implemented as an alltoall
@@ -63,7 +63,7 @@ __device__ void allgather0(mscclppDevConn_t devConn, int rank, int world_size, i
devConn.wait();
}
__device__ void allgather1(mscclppDevConn_t devConn, int rank, int world_size, int remoteRank, int nelemsPerGPU)
__device__ void allgather1(mscclppDevConn_t devConn, int rank, int world_size, int remoteRank, size_t nelemsPerGPU)
{
// this allgather algorithm works as follows:
// Step 1: GPU rank i sends data to GPU rank (i+1) % world_size
@@ -82,7 +82,7 @@ __device__ void allgather1(mscclppDevConn_t devConn, int rank, int world_size, i
devConn.wait();
}
__global__ void kernel(int rank, int world_size, int nelemsPerGPU, int kernel)
__global__ void kernel(int rank, int world_size, size_t nelemsPerGPU, int kernel)
{
// only use a single thread from each warp
if (threadIdx.x % 32 != 0)
@@ -119,16 +119,16 @@ void print_usage(const char* prog)
#endif
}
void initializeAndAllocateAllGatherData(int rank, int world_size, size_t dataSize, int nelemsPerGPU, int** data_h,
void initializeAndAllocateAllGatherData(int rank, int world_size, size_t dataSize, size_t nelemsPerGPU, int** data_h,
int** data_d)
{
CUDACHECK(cudaMalloc(data_d, dataSize));
CUDACHECK(cudaMemset(*data_d, 0, dataSize));
*data_h = new int[nelemsPerGPU * world_size];
for (int i = 0; i < nelemsPerGPU * world_size; i++) {
for (size_t i = 0; i < nelemsPerGPU * world_size; i++) {
int val = i + 1;
if (i / nelemsPerGPU == rank) {
if (i / nelemsPerGPU == (size_t)rank) {
(*data_h)[i] = val;
} else {
(*data_h)[i] = 0;
@@ -317,9 +317,9 @@ int main(int argc, const char* argv[])
int* data_h;
size_t dataSize = 1024 * 1024 * 1024;
if (parsedArgs.find("datasize") != parsedArgs.end()) {
dataSize = std::stoi(parsedArgs["datasize"]);
dataSize = std::stoul(parsedArgs["datasize"]);
}
int nelemsPerGPU = dataSize / sizeof(int) / world_size;
size_t nelemsPerGPU = dataSize / sizeof(int) / world_size;
if (rank == 0)
printf("Initializing data for allgather test\n");
@@ -343,10 +343,10 @@ int main(int argc, const char* argv[])
CUDACHECK(cudaMemcpy(data_h, data_d, dataSize, cudaMemcpyDeviceToHost));
CUDACHECK(cudaDeviceSynchronize());
for (int i = 0; i < nelemsPerGPU * world_size; i++) {
for (size_t i = 0; i < nelemsPerGPU * world_size; i++) {
int val = i + 1;
if (data_h[i] != val) {
printf("oh uh! data_h[%d] (%d) != val (%d)\n", i, data_h[i], val);
printf("oh uh! data_h[%ld] (%d) != val (%d)\n", i, data_h[i], val);
break;
}
}

View File

@@ -32,7 +32,7 @@ static double getTime(void)
__constant__ mscclppDevConn_t constDevConns[16];
__global__ void kernel(int rank, int world_size, int nelemsPerGPU)
__global__ void kernel(int rank, int world_size, size_t nelemsPerGPU)
{
if (threadIdx.x % 32 != 0)
return;
@@ -63,7 +63,7 @@ __global__ void kernel(int rank, int world_size, int nelemsPerGPU)
continue;
// Trigger sending data, flag and synchronize after
int ibPortion = nelemsPerGPU / 12; // nelemsPerGPU/12;
size_t ibPortion = nelemsPerGPU / 12; // nelemsPerGPU/12;
if (isIB)
devConn.fifo.setTrigger(trig, mscclppFlag | mscclppData | mscclppSync,
rank * nelemsPerGPU * sizeof(int) + (nelemsPerGPU - ibPortion) * sizeof(int),
@@ -168,7 +168,7 @@ int main(int argc, const char* argv[])
int* data_d;
uint64_t* flag_d;
size_t data_size = 1536 * 1024 * 1024;
int nelemsPerGPU = data_size / sizeof(int) / world_size;
size_t nelemsPerGPU = data_size / sizeof(int) / world_size;
CUDACHECK(cudaMalloc(&data_d, data_size));
CUDACHECK(cudaMalloc(&flag_d, sizeof(uint64_t)));
CUDACHECK(cudaMemset(data_d, 0, data_size));
@@ -176,7 +176,7 @@ int main(int argc, const char* argv[])
int* data_h = new int[nelemsPerGPU * world_size];
for (int i = 0; i < nelemsPerGPU * world_size; i++) {
int val = i + 1;
size_t val = i + 1;
if (i / nelemsPerGPU == rank) {
data_h[i] = val;
} else {
@@ -221,7 +221,7 @@ int main(int argc, const char* argv[])
CUDACHECK(cudaMemcpy(data_h, data_d, data_size, cudaMemcpyDeviceToHost));
CUDACHECK(cudaDeviceSynchronize());
for (int i = 0; i < nelemsPerGPU * world_size; i++) {
for (size_t i = 0; i < nelemsPerGPU * world_size; i++) {
int val = i + 1;
if (data_h[i] != val) {
printf("oh uh things went wrong! data_h[%d] (%d) != val (%d)\n", i, data_h[i], val);