Robust correctness test (#221)

Co-authored-by: Aashaka Shah <aashaka96@gmail.com>
This commit is contained in:
Changho Hwang
2023-11-22 12:06:50 +08:00
committed by GitHub
parent 3431f37067
commit 7bd66a938c
8 changed files with 24 additions and 20 deletions

View File

@@ -127,5 +127,5 @@ jobs:
else
pip3 install -r ./python/requirements_cu12.txt
fi
mpirun -tag-output -x MSCCLPP_HOME=$(System.DefaultWorkingDirectory) -np 8 python3 ./python/benchmark/allreduce_bench.py
mpirun -tag-output -x MSCCLPP_HOME=$(System.DefaultWorkingDirectory) -np 8 python3 ./python/mscclpp_benchmark/allreduce_bench.py
workingDirectory: '$(System.DefaultWorkingDirectory)'

View File

@@ -12,7 +12,7 @@ version = "0.3.0"
[tool.scikit-build]
cmake.minimum-version = "3.25.0"
build-dir = "build/{wheel_tag}"
wheel.packages = ["python/mscclpp"]
wheel.packages = ["python/mscclpp", "python/mscclpp_benchmark"]
wheel.install-dir = "mscclpp"
[tool.scikit-build.cmake.define]

View File

@@ -0,0 +1 @@
from .mscclpp_op import MscclppAllReduce1, MscclppAllReduce2, MscclppAllReduce3, MscclppAllReduce4, MscclppAllReduce5

View File

@@ -231,7 +231,7 @@ extern "C" __global__ void __launch_bounds__(1024, 1)
nelems = nelems / (sizeof(int) / sizeof(TYPE));
// This version of allreduce only works for single nodes
const int nPeers = worldSize - 1;
const int nPkts = nelems / 2;
const size_t nPkts = nelems / 2;
const int nelemsPerRank = nelems / worldSize;
const int nPktsPerRank = nelemsPerRank / 2;
// flag for packets. Initially 1

View File

@@ -11,7 +11,7 @@ from mscclpp import ProxyService
from prettytable import PrettyTable
import netifaces as ni
data_type = cp.float16
data_type = cp.float32
if data_type == cp.float16:
dtype_str = "fp16"
@@ -71,25 +71,28 @@ def human_readable_size(size, decimal_places=1):
return f"{size:.{decimal_places}f} {unit}"
def check_correctness(memory, func):
rand_gen = cp.random.default_rng(seed=MPI.COMM_WORLD.rank)
memory[:] = rand_gen.random(memory.shape).astype(data_type)
cp.cuda.runtime.deviceSynchronize()
output_memory = func(0)
cp.cuda.runtime.deviceSynchronize()
expected = cp.zeros_like(memory)
for i in range(MPI.COMM_WORLD.size):
rand_gen = cp.random.default_rng(seed=i)
expected += rand_gen.random(memory.shape).astype(data_type)
def check_correctness(memory, func, niter=100):
    """Validate an allreduce implementation over multiple iterations.

    Each iteration fills ``memory`` with a rank- and iteration-dependent
    constant (element value ``p * world_size + rank``), runs ``func``, and
    compares the output against the analytically known sum over all ranks.

    Args:
        memory: device buffer (cupy ndarray) used as the allreduce input;
            overwritten each iteration.
        func: callable taking a stream id (0 here); performs the allreduce
            and returns the output buffer.
        niter: number of test iterations.

    Returns:
        The MPI SUM-allreduce of every rank's local pass flag (see NOTE on
        the reduction semantics below).
    """
    ac = True
    for p in range(niter):
        # Deterministic, rank-unique input so the reduced result is exact:
        # every element is p * world_size + rank.
        memory[:] = cp.ones(memory.shape).astype(data_type) * (p * MPI.COMM_WORLD.size + MPI.COMM_WORLD.rank)
        cp.cuda.runtime.deviceSynchronize()
        output_memory = func(0)
        cp.cuda.runtime.deviceSynchronize()
        expected = cp.zeros_like(memory)
        for i in range(MPI.COMM_WORLD.size):
            expected += cp.ones(memory.shape).astype(data_type) * (p * MPI.COMM_WORLD.size + i)
        # BUG FIX 1: the fp16/fp32 branches were byte-identical, so the
        # conditional was dead code — collapsed into one allclose call.
        # BUG FIX 2: `ac` was reassigned (not accumulated) each iteration,
        # so a failure in any iteration but the last was silently lost;
        # now every iteration's result is AND-ed into the flag.
        ac = ac and cp.allclose(output_memory, expected, rtol=1.0e-2, atol=1.0e-4)
        # Looser element-wise check (atol=2) used to locate and report the
        # first offending element for diagnostics.
        is_close = cp.isclose(output_memory, expected, rtol=1.0e-2, atol=2)
        icf = is_close == 0  # boolean mask of elements that failed
        all_close = cp.all(is_close)
        ac = ac and all_close
        if not all_close:
            print(
                f"not close: p={p}, rank={MPI.COMM_WORLD.rank}, output={output_memory[icf][0]}, expected={expected[icf][0]}",
                flush=True,
            )
    # NOTE(review): SUM over per-rank booleans yields a falsy result only
    # when *every* rank failed; MPI.LAND looks intended — confirm upstream.
    ac = MPI.COMM_WORLD.allreduce(ac, op=MPI.SUM)
    if not ac:
        print(output_memory, expected)
    return ac