Fix the 2 GiB limit in allgather test. (#36)

2026-05-13 01:36:10 +00:00 · 2023-03-29 19:02:43 +08:00
parent 62279b0063
commit 7880be8ee2
2 changed files with 15 additions and 15 deletions
--- a/tests/allgather_test.cu
+++ b/tests/allgather_test.cu
@@ -47,7 +47,7 @@ static double getTime(void)

 __constant__ mscclppDevConn_t constDevConns[16];

-__device__ void allgather0(mscclppDevConn_t devConn, int rank, int world_size, int remoteRank, int nelemsPerGPU)
+__device__ void allgather0(mscclppDevConn_t devConn, int rank, int world_size, int remoteRank, size_t nelemsPerGPU)
 {
  // this allgather is really simple and implemented as an alltoall

@@ -63,7 +63,7 @@ __device__ void allgather0(mscclppDevConn_t devConn, int rank, int world_size, i
  devConn.wait();
 }

-__device__ void allgather1(mscclppDevConn_t devConn, int rank, int world_size, int remoteRank, int nelemsPerGPU)
+__device__ void allgather1(mscclppDevConn_t devConn, int rank, int world_size, int remoteRank, size_t nelemsPerGPU)
 {
  // this allgather algorithm works as follows:
  // Step 1: GPU rank i sends data to GPU rank (i+1) % world_size
@@ -82,7 +82,7 @@ __device__ void allgather1(mscclppDevConn_t devConn, int rank, int world_size, i
  devConn.wait();
 }

-__global__ void kernel(int rank, int world_size, int nelemsPerGPU, int kernel)
+__global__ void kernel(int rank, int world_size, size_t nelemsPerGPU, int kernel)
 {
  // only use a single thread from each warp
  if (threadIdx.x % 32 != 0)
@@ -119,16 +119,16 @@ void print_usage(const char* prog)
 #endif
 }

-void initializeAndAllocateAllGatherData(int rank, int world_size, size_t dataSize, int nelemsPerGPU, int** data_h,
+void initializeAndAllocateAllGatherData(int rank, int world_size, size_t dataSize, size_t nelemsPerGPU, int** data_h,
                                        int** data_d)
 {
  CUDACHECK(cudaMalloc(data_d, dataSize));
  CUDACHECK(cudaMemset(*data_d, 0, dataSize));

  *data_h = new int[nelemsPerGPU * world_size];
-  for (int i = 0; i < nelemsPerGPU * world_size; i++) {
+  for (size_t i = 0; i < nelemsPerGPU * world_size; i++) {
    int val = i + 1;
-    if (i / nelemsPerGPU == rank) {
+    if (i / nelemsPerGPU == (size_t)rank) {
      (*data_h)[i] = val;
    } else {
      (*data_h)[i] = 0;
@@ -317,9 +317,9 @@ int main(int argc, const char* argv[])
  int* data_h;
  size_t dataSize = 1024 * 1024 * 1024;
  if (parsedArgs.find("datasize") != parsedArgs.end()) {
-    dataSize = std::stoi(parsedArgs["datasize"]);
+    dataSize = std::stoul(parsedArgs["datasize"]);
  }
-  int nelemsPerGPU = dataSize / sizeof(int) / world_size;
+  size_t nelemsPerGPU = dataSize / sizeof(int) / world_size;

  if (rank == 0)
    printf("Initializing data for allgather test\n");
@@ -343,10 +343,10 @@ int main(int argc, const char* argv[])
  CUDACHECK(cudaMemcpy(data_h, data_d, dataSize, cudaMemcpyDeviceToHost));
  CUDACHECK(cudaDeviceSynchronize());

-  for (int i = 0; i < nelemsPerGPU * world_size; i++) {
+  for (size_t i = 0; i < nelemsPerGPU * world_size; i++) {
    int val = i + 1;
    if (data_h[i] != val) {
-      printf("oh uh! data_h[%d] (%d) != val (%d)\n", i, data_h[i], val);
+      printf("oh uh! data_h[%ld] (%d) != val (%d)\n", i, data_h[i], val);
      break;
    }
  }
--- a/tests/allgather_test2.cu
+++ b/tests/allgather_test2.cu
@@ -32,7 +32,7 @@ static double getTime(void)

 __constant__ mscclppDevConn_t constDevConns[16];

-__global__ void kernel(int rank, int world_size, int nelemsPerGPU)
+__global__ void kernel(int rank, int world_size, size_t nelemsPerGPU)
 {
  if (threadIdx.x % 32 != 0)
    return;
@@ -63,7 +63,7 @@ __global__ void kernel(int rank, int world_size, int nelemsPerGPU)
      continue;

    // Trigger sending data, flag and synchronize after
-    int ibPortion = nelemsPerGPU / 12; // nelemsPerGPU/12;
+    size_t ibPortion = nelemsPerGPU / 12; // nelemsPerGPU/12;
    if (isIB)
      devConn.fifo.setTrigger(trig, mscclppFlag | mscclppData | mscclppSync,
                              rank * nelemsPerGPU * sizeof(int) + (nelemsPerGPU - ibPortion) * sizeof(int),
@@ -168,7 +168,7 @@ int main(int argc, const char* argv[])
  int* data_d;
  uint64_t* flag_d;
  size_t data_size = 1536 * 1024 * 1024;
-  int nelemsPerGPU = data_size / sizeof(int) / world_size;
+  size_t nelemsPerGPU = data_size / sizeof(int) / world_size;
  CUDACHECK(cudaMalloc(&data_d, data_size));
  CUDACHECK(cudaMalloc(&flag_d, sizeof(uint64_t)));
  CUDACHECK(cudaMemset(data_d, 0, data_size));
@@ -176,7 +176,7 @@ int main(int argc, const char* argv[])

  int* data_h = new int[nelemsPerGPU * world_size];
  for (int i = 0; i < nelemsPerGPU * world_size; i++) {
-    int val = i + 1;
+    size_t val = i + 1;
    if (i / nelemsPerGPU == rank) {
      data_h[i] = val;
    } else {
@@ -221,7 +221,7 @@ int main(int argc, const char* argv[])
  CUDACHECK(cudaMemcpy(data_h, data_d, data_size, cudaMemcpyDeviceToHost));
  CUDACHECK(cudaDeviceSynchronize());

-  for (int i = 0; i < nelemsPerGPU * world_size; i++) {
+  for (size_t i = 0; i < nelemsPerGPU * world_size; i++) {
    int val = i + 1;
    if (data_h[i] != val) {
      printf("oh uh things went wrong! data_h[%d] (%d) != val (%d)\n", i, data_h[i], val);