diff --git a/tests/allgather_test.cu b/tests/allgather_test.cu index e9f8331c..721b60d5 100644 --- a/tests/allgather_test.cu +++ b/tests/allgather_test.cu @@ -47,7 +47,7 @@ static double getTime(void) __constant__ mscclppDevConn_t constDevConns[16]; -__device__ void allgather0(mscclppDevConn_t devConn, int rank, int world_size, int remoteRank, int nelemsPerGPU) +__device__ void allgather0(mscclppDevConn_t devConn, int rank, int world_size, int remoteRank, size_t nelemsPerGPU) { // this allgather is really simple and implemented as an alltoall @@ -63,7 +63,7 @@ __device__ void allgather0(mscclppDevConn_t devConn, int rank, int world_size, i devConn.wait(); } -__device__ void allgather1(mscclppDevConn_t devConn, int rank, int world_size, int remoteRank, int nelemsPerGPU) +__device__ void allgather1(mscclppDevConn_t devConn, int rank, int world_size, int remoteRank, size_t nelemsPerGPU) { // this allgather algorithm works as follows: // Step 1: GPU rank i sends data to GPU rank (i+1) % world_size @@ -82,7 +82,7 @@ __device__ void allgather1(mscclppDevConn_t devConn, int rank, int world_size, i devConn.wait(); } -__global__ void kernel(int rank, int world_size, int nelemsPerGPU, int kernel) +__global__ void kernel(int rank, int world_size, size_t nelemsPerGPU, int kernel) { // only use a single thread from each warp if (threadIdx.x % 32 != 0) @@ -119,16 +119,16 @@ void print_usage(const char* prog) #endif } -void initializeAndAllocateAllGatherData(int rank, int world_size, size_t dataSize, int nelemsPerGPU, int** data_h, +void initializeAndAllocateAllGatherData(int rank, int world_size, size_t dataSize, size_t nelemsPerGPU, int** data_h, int** data_d) { CUDACHECK(cudaMalloc(data_d, dataSize)); CUDACHECK(cudaMemset(*data_d, 0, dataSize)); *data_h = new int[nelemsPerGPU * world_size]; - for (int i = 0; i < nelemsPerGPU * world_size; i++) { + for (size_t i = 0; i < nelemsPerGPU * world_size; i++) { int val = i + 1; - if (i / nelemsPerGPU == rank) { + if (i / nelemsPerGPU 
== (size_t)rank) { (*data_h)[i] = val; } else { (*data_h)[i] = 0; @@ -317,9 +317,9 @@ int main(int argc, const char* argv[]) int* data_h; size_t dataSize = 1024 * 1024 * 1024; if (parsedArgs.find("datasize") != parsedArgs.end()) { - dataSize = std::stoi(parsedArgs["datasize"]); + dataSize = std::stoul(parsedArgs["datasize"]); } - int nelemsPerGPU = dataSize / sizeof(int) / world_size; + size_t nelemsPerGPU = dataSize / sizeof(int) / world_size; if (rank == 0) printf("Initializing data for allgather test\n"); @@ -343,10 +343,10 @@ int main(int argc, const char* argv[]) CUDACHECK(cudaMemcpy(data_h, data_d, dataSize, cudaMemcpyDeviceToHost)); CUDACHECK(cudaDeviceSynchronize()); - for (int i = 0; i < nelemsPerGPU * world_size; i++) { + for (size_t i = 0; i < nelemsPerGPU * world_size; i++) { int val = i + 1; if (data_h[i] != val) { - printf("oh uh! data_h[%d] (%d) != val (%d)\n", i, data_h[i], val); + printf("oh uh! data_h[%zu] (%d) != val (%d)\n", i, data_h[i], val); break; } } diff --git a/tests/allgather_test2.cu b/tests/allgather_test2.cu index 0ea6da48..047869fb 100644 --- a/tests/allgather_test2.cu +++ b/tests/allgather_test2.cu @@ -32,7 +32,7 @@ static double getTime(void) __constant__ mscclppDevConn_t constDevConns[16]; -__global__ void kernel(int rank, int world_size, int nelemsPerGPU) +__global__ void kernel(int rank, int world_size, size_t nelemsPerGPU) { if (threadIdx.x % 32 != 0) return; @@ -63,7 +63,7 @@ __global__ void kernel(int rank, int world_size, int nelemsPerGPU) continue; // Trigger sending data, flag and synchronize after - int ibPortion = nelemsPerGPU / 12; // nelemsPerGPU/12; + size_t ibPortion = nelemsPerGPU / 12; // nelemsPerGPU/12; if (isIB) devConn.fifo.setTrigger(trig, mscclppFlag | mscclppData | mscclppSync, rank * nelemsPerGPU * sizeof(int) + (nelemsPerGPU - ibPortion) * sizeof(int), @@ -168,7 +168,7 @@ int main(int argc, const char* argv[]) int* data_d; uint64_t* flag_d; size_t data_size = 1536 * 1024 * 1024; - int nelemsPerGPU = 
data_size / sizeof(int) / world_size; + size_t nelemsPerGPU = data_size / sizeof(int) / world_size; CUDACHECK(cudaMalloc(&data_d, data_size)); CUDACHECK(cudaMalloc(&flag_d, sizeof(uint64_t))); CUDACHECK(cudaMemset(data_d, 0, data_size)); @@ -176,7 +176,7 @@ int main(int argc, const char* argv[]) int* data_h = new int[nelemsPerGPU * world_size]; for (int i = 0; i < nelemsPerGPU * world_size; i++) { - int val = i + 1; + size_t val = i + 1; if (i / nelemsPerGPU == rank) { data_h[i] = val; } else { @@ -221,7 +221,7 @@ int main(int argc, const char* argv[]) CUDACHECK(cudaMemcpy(data_h, data_d, data_size, cudaMemcpyDeviceToHost)); CUDACHECK(cudaDeviceSynchronize()); - for (int i = 0; i < nelemsPerGPU * world_size; i++) { + for (size_t i = 0; i < nelemsPerGPU * world_size; i++) { int val = i + 1; if (data_h[i] != val) { - printf("oh uh things went wrong! data_h[%d] (%d) != val (%d)\n", i, data_h[i], val); + printf("oh uh things went wrong! data_h[%zu] (%d) != val (%d)\n", i, data_h[i], val);