diff --git a/tests/allgather_test.cu b/tests/allgather_test.cu index e9f8331c..721b60d5 100644 --- a/tests/allgather_test.cu +++ b/tests/allgather_test.cu @@ -47,7 +47,7 @@ static double getTime(void) __constant__ mscclppDevConn_t constDevConns[16]; -__device__ void allgather0(mscclppDevConn_t devConn, int rank, int world_size, int remoteRank, int nelemsPerGPU) +__device__ void allgather0(mscclppDevConn_t devConn, int rank, int world_size, int remoteRank, size_t nelemsPerGPU) { // this allgather is really simple and implemented as an alltoall @@ -63,7 +63,7 @@ __device__ void allgather0(mscclppDevConn_t devConn, int rank, int world_size, i devConn.wait(); } -__device__ void allgather1(mscclppDevConn_t devConn, int rank, int world_size, int remoteRank, int nelemsPerGPU) +__device__ void allgather1(mscclppDevConn_t devConn, int rank, int world_size, int remoteRank, size_t nelemsPerGPU) { // this allgather algorithm works as follows: // Step 1: GPU rank i sends data to GPU rank (i+1) % world_size @@ -82,7 +82,7 @@ __device__ void allgather1(mscclppDevConn_t devConn, int rank, int world_size, i devConn.wait(); } -__global__ void kernel(int rank, int world_size, int nelemsPerGPU, int kernel) +__global__ void kernel(int rank, int world_size, size_t nelemsPerGPU, int kernel) { // only use a single thread from each warp if (threadIdx.x % 32 != 0) @@ -119,16 +119,16 @@ void print_usage(const char* prog) #endif } -void initializeAndAllocateAllGatherData(int rank, int world_size, size_t dataSize, int nelemsPerGPU, int** data_h, +void initializeAndAllocateAllGatherData(int rank, int world_size, size_t dataSize, size_t nelemsPerGPU, int** data_h, int** data_d) { CUDACHECK(cudaMalloc(data_d, dataSize)); CUDACHECK(cudaMemset(*data_d, 0, dataSize)); *data_h = new int[nelemsPerGPU * world_size]; - for (int i = 0; i < nelemsPerGPU * world_size; i++) { + for (size_t i = 0; i < nelemsPerGPU * world_size; i++) { int val = i + 1; - if (i / nelemsPerGPU == rank) { + if (i / nelemsPerGPU 
== (size_t)rank) { (*data_h)[i] = val; } else { (*data_h)[i] = 0; @@ -317,9 +317,9 @@ int main(int argc, const char* argv[]) int* data_h; size_t dataSize = 1024 * 1024 * 1024; if (parsedArgs.find("datasize") != parsedArgs.end()) { - dataSize = std::stoi(parsedArgs["datasize"]); + dataSize = std::stoul(parsedArgs["datasize"]); } - int nelemsPerGPU = dataSize / sizeof(int) / world_size; + size_t nelemsPerGPU = dataSize / sizeof(int) / world_size; if (rank == 0) printf("Initializing data for allgather test\n"); @@ -343,10 +343,10 @@ int main(int argc, const char* argv[]) CUDACHECK(cudaMemcpy(data_h, data_d, dataSize, cudaMemcpyDeviceToHost)); CUDACHECK(cudaDeviceSynchronize()); - for (int i = 0; i < nelemsPerGPU * world_size; i++) { + for (size_t i = 0; i < nelemsPerGPU * world_size; i++) { int val = i + 1; if (data_h[i] != val) { - printf("oh uh! data_h[%d] (%d) != val (%d)\n", i, data_h[i], val); + printf("oh uh! data_h[%zu] (%d) != val (%d)\n", i, data_h[i], val); break; } } diff --git a/tests/allgather_test2.cu b/tests/allgather_test2.cu index 0ea6da48..047869fb 100644 --- a/tests/allgather_test2.cu +++ b/tests/allgather_test2.cu @@ -32,7 +32,7 @@ static double getTime(void) __constant__ mscclppDevConn_t constDevConns[16]; -__global__ void kernel(int rank, int world_size, int nelemsPerGPU) +__global__ void kernel(int rank, int world_size, size_t nelemsPerGPU) { if (threadIdx.x % 32 != 0) return; @@ -63,7 +63,7 @@ __global__ void kernel(int rank, int world_size, int nelemsPerGPU) continue; // Trigger sending data, flag and synchronize after - int ibPortion = nelemsPerGPU / 12; // nelemsPerGPU/12; + size_t ibPortion = nelemsPerGPU / 12; // nelemsPerGPU/12; if (isIB) devConn.fifo.setTrigger(trig, mscclppFlag | mscclppData | mscclppSync, rank * nelemsPerGPU * sizeof(int) + (nelemsPerGPU - ibPortion) * sizeof(int), @@ -168,7 +168,7 @@ int main(int argc, const char* argv[]) int* data_d; uint64_t* flag_d; size_t data_size = 1536 * 1024 * 1024; - int nelemsPerGPU = 
data_size / sizeof(int) / world_size; + size_t nelemsPerGPU = data_size / sizeof(int) / world_size; CUDACHECK(cudaMalloc(&data_d, data_size)); CUDACHECK(cudaMalloc(&flag_d, sizeof(uint64_t))); CUDACHECK(cudaMemset(data_d, 0, data_size)); @@ -176,7 +176,7 @@ int main(int argc, const char* argv[]) int* data_h = new int[nelemsPerGPU * world_size]; for (int i = 0; i < nelemsPerGPU * world_size; i++) { - int val = i + 1; + size_t val = i + 1; if (i / nelemsPerGPU == rank) { data_h[i] = val; } else { @@ -221,7 +221,7 @@ int main(int argc, const char* argv[]) CUDACHECK(cudaMemcpy(data_h, data_d, data_size, cudaMemcpyDeviceToHost)); CUDACHECK(cudaDeviceSynchronize()); - for (int i = 0; i < nelemsPerGPU * world_size; i++) { + for (size_t i = 0; i < nelemsPerGPU * world_size; i++) { int val = i + 1; if (data_h[i] != val) { - printf("oh uh things went wrong! data_h[%d] (%d) != val (%d)\n", i, data_h[i], val); + printf("oh uh things went wrong! data_h[%zu] (%d) != val (%d)\n", i, data_h[i], val);