Robust correctness test (#221)

Co-authored-by: Aashaka Shah <aashaka96@gmail.com>
This commit is contained in:
Changho Hwang
2023-11-22 12:06:50 +08:00
committed by GitHub
parent 3431f37067
commit 7bd66a938c
8 changed files with 24 additions and 20 deletions

View File

@@ -127,5 +127,5 @@ jobs:
else
pip3 install -r ./python/requirements_cu12.txt
fi
mpirun -tag-output -x MSCCLPP_HOME=$(System.DefaultWorkingDirectory) -np 8 python3 ./python/benchmark/allreduce_bench.py
mpirun -tag-output -x MSCCLPP_HOME=$(System.DefaultWorkingDirectory) -np 8 python3 ./python/mscclpp_benchmark/allreduce_bench.py
workingDirectory: '$(System.DefaultWorkingDirectory)'

View File

@@ -12,7 +12,7 @@ version = "0.3.0"
[tool.scikit-build]
cmake.minimum-version = "3.25.0"
build-dir = "build/{wheel_tag}"
wheel.packages = ["python/mscclpp"]
wheel.packages = ["python/mscclpp", "python/mscclpp_benchmark"]
wheel.install-dir = "mscclpp"
[tool.scikit-build.cmake.define]

View File

@@ -0,0 +1 @@
from .mscclpp_op import MscclppAllReduce1, MscclppAllReduce2, MscclppAllReduce3, MscclppAllReduce4, MscclppAllReduce5

View File

@@ -231,7 +231,7 @@ extern "C" __global__ void __launch_bounds__(1024, 1)
nelems = nelems / (sizeof(int) / sizeof(TYPE));
// This version of allreduce only works for single nodes
const int nPeers = worldSize - 1;
const int nPkts = nelems / 2;
const size_t nPkts = nelems / 2;
const int nelemsPerRank = nelems / worldSize;
const int nPktsPerRank = nelemsPerRank / 2;
// flag for packets. Initially 1

View File

@@ -11,7 +11,7 @@ from mscclpp import ProxyService
from prettytable import PrettyTable
import netifaces as ni
data_type = cp.float16
data_type = cp.float32
if data_type == cp.float16:
dtype_str = "fp16"
@@ -71,25 +71,28 @@ def human_readable_size(size, decimal_places=1):
return f"{size:.{decimal_places}f} {unit}"
def check_correctness(memory, func):
rand_gen = cp.random.default_rng(seed=MPI.COMM_WORLD.rank)
memory[:] = rand_gen.random(memory.shape).astype(data_type)
cp.cuda.runtime.deviceSynchronize()
output_memory = func(0)
cp.cuda.runtime.deviceSynchronize()
expected = cp.zeros_like(memory)
for i in range(MPI.COMM_WORLD.size):
rand_gen = cp.random.default_rng(seed=i)
expected += rand_gen.random(memory.shape).astype(data_type)
def check_correctness(memory, func, niter=100):
    """Validate an allreduce implementation over multiple iterations.

    Each iteration fills ``memory`` with a rank- and iteration-dependent
    constant (element value ``p * world_size + rank``), runs ``func``, and
    compares the output against the analytically known sum over all ranks.

    Args:
        memory: device buffer (cupy ndarray) used as the allreduce input;
            overwritten each iteration.
        func: callable taking a stream id (0 here); performs the allreduce
            and returns the output buffer.
        niter: number of test iterations.

    Returns:
        The MPI SUM-allreduce of every rank's local pass flag (see NOTE on
        the reduction semantics below).
    """
    ac = True
    for p in range(niter):
        # Deterministic, rank-unique input so the reduced result is exact:
        # every element is p * world_size + rank.
        memory[:] = cp.ones(memory.shape).astype(data_type) * (p * MPI.COMM_WORLD.size + MPI.COMM_WORLD.rank)
        cp.cuda.runtime.deviceSynchronize()
        output_memory = func(0)
        cp.cuda.runtime.deviceSynchronize()
        expected = cp.zeros_like(memory)
        for i in range(MPI.COMM_WORLD.size):
            expected += cp.ones(memory.shape).astype(data_type) * (p * MPI.COMM_WORLD.size + i)
        # BUG FIX 1: the fp16/fp32 branches were byte-identical, so the
        # conditional was dead code — collapsed into one allclose call.
        # BUG FIX 2: `ac` was reassigned (not accumulated) each iteration,
        # so a failure in any iteration but the last was silently lost;
        # now every iteration's result is AND-ed into the flag.
        ac = ac and cp.allclose(output_memory, expected, rtol=1.0e-2, atol=1.0e-4)
        # Looser element-wise check (atol=2) used to locate and report the
        # first offending element for diagnostics.
        is_close = cp.isclose(output_memory, expected, rtol=1.0e-2, atol=2)
        icf = is_close == 0  # boolean mask of elements that failed
        all_close = cp.all(is_close)
        ac = ac and all_close
        if not all_close:
            print(
                f"not close: p={p}, rank={MPI.COMM_WORLD.rank}, output={output_memory[icf][0]}, expected={expected[icf][0]}",
                flush=True,
            )
    # NOTE(review): SUM over per-rank booleans yields a falsy result only
    # when *every* rank failed; MPI.LAND looks intended — confirm upstream.
    ac = MPI.COMM_WORLD.allreduce(ac, op=MPI.SUM)
    if not ac:
        print(output_memory, expected)
    return ac