mirror of
https://github.com/microsoft/mscclpp.git
synced 2026-04-19 22:39:11 +00:00
Robust correctness test (#221)
Co-authored-by: Aashaka Shah <aashaka96@gmail.com>
This commit is contained in:
@@ -127,5 +127,5 @@ jobs:
|
||||
else
|
||||
pip3 install -r ./python/requirements_cu12.txt
|
||||
fi
|
||||
mpirun -tag-output -x MSCCLPP_HOME=$(System.DefaultWorkingDirectory) -np 8 python3 ./python/benchmark/allreduce_bench.py
|
||||
mpirun -tag-output -x MSCCLPP_HOME=$(System.DefaultWorkingDirectory) -np 8 python3 ./python/mscclpp_benchmark/allreduce_bench.py
|
||||
workingDirectory: '$(System.DefaultWorkingDirectory)'
|
||||
|
||||
@@ -12,7 +12,7 @@ version = "0.3.0"
|
||||
[tool.scikit-build]
|
||||
cmake.minimum-version = "3.25.0"
|
||||
build-dir = "build/{wheel_tag}"
|
||||
wheel.packages = ["python/mscclpp"]
|
||||
wheel.packages = ["python/mscclpp", "python/mscclpp_benchmark"]
|
||||
wheel.install-dir = "mscclpp"
|
||||
|
||||
[tool.scikit-build.cmake.define]
|
||||
|
||||
1
python/mscclpp_benchmark/__init__.py
Normal file
1
python/mscclpp_benchmark/__init__.py
Normal file
@@ -0,0 +1 @@
|
||||
from .mscclpp_op import MscclppAllReduce1, MscclppAllReduce2, MscclppAllReduce3, MscclppAllReduce4, MscclppAllReduce5
|
||||
@@ -231,7 +231,7 @@ extern "C" __global__ void __launch_bounds__(1024, 1)
|
||||
nelems = nelems / (sizeof(int) / sizeof(TYPE));
|
||||
// This version of allreduce only works for single nodes
|
||||
const int nPeers = worldSize - 1;
|
||||
const int nPkts = nelems / 2;
|
||||
const size_t nPkts = nelems / 2;
|
||||
const int nelemsPerRank = nelems / worldSize;
|
||||
const int nPktsPerRank = nelemsPerRank / 2;
|
||||
// flag for packets. Initially 1
|
||||
@@ -11,7 +11,7 @@ from mscclpp import ProxyService
|
||||
from prettytable import PrettyTable
|
||||
import netifaces as ni
|
||||
|
||||
data_type = cp.float16
|
||||
data_type = cp.float32
|
||||
|
||||
if data_type == cp.float16:
|
||||
dtype_str = "fp16"
|
||||
@@ -71,25 +71,28 @@ def human_readable_size(size, decimal_places=1):
|
||||
return f"{size:.{decimal_places}f} {unit}"
|
||||
|
||||
|
||||
def check_correctness(memory, func):
|
||||
rand_gen = cp.random.default_rng(seed=MPI.COMM_WORLD.rank)
|
||||
memory[:] = rand_gen.random(memory.shape).astype(data_type)
|
||||
cp.cuda.runtime.deviceSynchronize()
|
||||
output_memory = func(0)
|
||||
cp.cuda.runtime.deviceSynchronize()
|
||||
expected = cp.zeros_like(memory)
|
||||
for i in range(MPI.COMM_WORLD.size):
|
||||
rand_gen = cp.random.default_rng(seed=i)
|
||||
expected += rand_gen.random(memory.shape).astype(data_type)
|
||||
def check_correctness(memory, func, niter=100):
    """Run ``func`` repeatedly and verify its output on every MPI rank.

    On iteration ``p`` each rank fills ``memory`` with the constant
    ``p * world_size + rank``. The expected output is the element-wise sum
    of those constants over all ranks, so it changes every iteration and a
    stale result from a previous iteration is detected as a mismatch.

    Args:
        memory: device array used as the input buffer; overwritten in place
            on every iteration.
        func: callable invoked as ``func(0)`` that returns the output array
            to compare against the expected sum.
        niter: number of fill / run / compare rounds.

    Returns:
        ``True`` only if every iteration matched on every rank, ``False``
        otherwise. Callers that only test the truthiness of the result are
        unaffected by the change from a summed value to a boolean.
    """
    comm = MPI.COMM_WORLD
    world_size = comm.size
    rank = comm.rank

    passed = True
    for p in range(niter):
        memory[:] = cp.ones(memory.shape).astype(data_type) * (p * world_size + rank)
        cp.cuda.runtime.deviceSynchronize()
        output_memory = func(0)
        cp.cuda.runtime.deviceSynchronize()

        # Accumulate in data_type, one rank at a time, so the reference is
        # built with the same element type as the buffer under test.
        expected = cp.zeros_like(memory)
        for i in range(world_size):
            expected += cp.ones(memory.shape).astype(data_type) * (p * world_size + i)

        is_close = cp.isclose(output_memory, expected, rtol=1.0e-2, atol=2)
        if not bool(cp.all(is_close)):
            # Remember the failure; it must not be overwritten by a later
            # iteration that happens to pass.
            passed = False
            mismatch = ~is_close
            print(
                f"not close: p={p}, rank={rank}, output={output_memory[mismatch][0]}, expected={expected[mismatch][0]}",
                flush=True,
            )

    # Logical AND across ranks: a single failing rank fails the whole check.
    # (Summing booleans would only report failure when *every* rank failed.)
    passed = comm.allreduce(passed, op=MPI.LAND)
    if not passed:
        print(output_memory, expected)
    return passed
|
||||
|
||||
|
||||
Reference in New Issue
Block a user