diff --git a/python/test/executor_test.py b/python/test/executor_test.py index 59bc1661..8a309de5 100644 --- a/python/test/executor_test.py +++ b/python/test/executor_test.py @@ -24,6 +24,8 @@ def parse_dtype(dtype_str): dtype_str = dtype_str.strip().lower() if dtype_str == "float16": return cp.float16 + elif dtype_str in ("bfloat16", "bf16"): + return cp.float16 # same 2-byte size; mscclpp DataType is resolved from dtype_str elif dtype_str == "float32": return cp.float32 elif dtype_str == "int32": @@ -119,15 +121,18 @@ def parse_size(size_str): return int(size_str) -def dtype_to_mscclpp_dtype(dtype): - if dtype == cp.float16: +def dtype_to_mscclpp_dtype(dtype_str): + dtype_str = dtype_str.strip().lower() + if dtype_str == "float16": return DataType.float16 - elif dtype == cp.float32: + elif dtype_str in ("bfloat16", "bf16"): + return DataType.bfloat16 + elif dtype_str == "float32": return DataType.float32 - elif dtype == cp.int32: + elif dtype_str == "int32": return DataType.int32 else: - raise ValueError(f"Unknown data type: {dtype}") + raise ValueError(f"Unknown data type: {dtype_str}") def build_bufs( @@ -205,7 +210,7 @@ def main( result_buf.data.ptr, input_buf.nbytes, result_buf.nbytes, - dtype_to_mscclpp_dtype(dtype), + dtype_to_mscclpp_dtype(dtype_str), execution_plan, stream.ptr, packet_type, @@ -231,7 +236,7 @@ def main( npkit.shutdown() print( f"Rank: {mscclpp_group.my_rank} Execution time: {execution_time} us, " - f"data size: {result_buf.nbytes} bytes data type: {dtype().dtype.name} " + f"data size: {result_buf.nbytes} bytes data type: {dtype_str} " f"packet type: {packet_type}" ) executor = None @@ -243,7 +248,7 @@ if __name__ == "__main__": parser.add_argument("-path", "--execution_plan_path", type=str, required=True) parser.add_argument("--size", type=str, required=True) parser.add_argument("--in_place", action="store_true", help="flag to define an in-place operation") - parser.add_argument("--dtype", type=str, default="float16", help="Choose from float16, float32, int32") + parser.add_argument("--dtype", type=str, default="float16", help="Choose from float16, bfloat16, float32, int32") parser.add_argument("--packet_type", type=str, default="LL16", help="Choose from LL8, LL16") parser.add_argument("--n_iters", type=int, default=10) parser.add_argument("--n_graph_iters", type=int, default=10) diff --git a/python/test/executor_test_verifier.cu b/python/test/executor_test_verifier.cu index cf3cd4a6..e7749197 100644 --- a/python/test/executor_test_verifier.cu +++ b/python/test/executor_test_verifier.cu @@ -4,8 +4,10 @@ #include #if defined(__HIP_PLATFORM_AMD__) +#include #include #else +#include #include #endif @@ -30,6 +32,7 @@ static __device__ unsigned int ranqd1(unsigned int seed) { } \ } +FILL_DATA(bfloat16, __nv_bfloat16) FILL_DATA(float16, __half) FILL_DATA(float32, float) FILL_DATA(int32, int) @@ -48,11 +51,12 @@ FILL_DATA(int32, int) } \ } +TEST_DATA_ALL_GATHER(bfloat16, __nv_bfloat16) TEST_DATA_ALL_GATHER(float16, __half) TEST_DATA_ALL_GATHER(float32, float) TEST_DATA_ALL_GATHER(int32, int) -#define TEST_DATA_ALL_REDUCE(FuncNameType, DataType) \ +#define TEST_DATA_ALL_REDUCE(FuncNameType, DataType, Eps) \ extern "C" __global__ void __launch_bounds__(1024, 1) test_data_all_reduce_##FuncNameType( \ DataType* result_buf, DataType* test_buf, size_t num_elems, int num_ranks, int my_rank, int seq) { \ for (int rank = 0; rank < num_ranks; rank++) { \ @@ -66,15 +70,19 @@ TEST_DATA_ALL_GATHER(int32, int) } \ } \ for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < num_elems; i += blockDim.x * gridDim.x) { \ - assert(abs(float(result_buf[i]) - float(test_buf[i])) < 1e-3 * num_ranks); \ + float expected = float(test_buf[i]); \ + float result = float(result_buf[i]); \ + float tol = Eps * num_ranks * (1.0f + abs(expected)); \ + assert(abs(result - expected) <= tol); \ } \ } -TEST_DATA_ALL_REDUCE(float16, __half) -TEST_DATA_ALL_REDUCE(float32, float) -TEST_DATA_ALL_REDUCE(int32, int) +TEST_DATA_ALL_REDUCE(bfloat16, __nv_bfloat16, 7.8125e-3f) +TEST_DATA_ALL_REDUCE(float16, __half, 9.765625e-4f) +TEST_DATA_ALL_REDUCE(float32, float, 1.1920929e-7f) +TEST_DATA_ALL_REDUCE(int32, int, 0.0f) -#define TEST_DATA_REDUCE_SCATTER(FuncNameType, DataType) \ +#define TEST_DATA_REDUCE_SCATTER(FuncNameType, DataType, Eps) \ extern "C" __global__ void __launch_bounds__(1024, 1) test_data_reduce_scatter_##FuncNameType( \ DataType* result_buf, DataType* test_buf, size_t num_elems, int num_ranks, int my_rank, int seq) { \ int nem_elems_per_rank = num_elems / num_ranks; \ @@ -91,14 +99,18 @@ TEST_DATA_ALL_REDUCE(int32, int) } \ for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < num_elems; i += blockDim.x * gridDim.x) { \ if (i >= offset && i < offset + nem_elems_per_rank) { \ - assert(abs(float(result_buf[i - offset]) - float(test_buf[i])) < 1e-3 * num_ranks); \ + float expected = float(test_buf[i]); \ + float result = float(result_buf[i - offset]); \ + float tol = Eps * num_ranks * (1.0f + abs(expected)); \ + assert(abs(result - expected) <= tol); \ } \ } \ } -TEST_DATA_REDUCE_SCATTER(float16, __half) -TEST_DATA_REDUCE_SCATTER(float32, float) -TEST_DATA_REDUCE_SCATTER(int32, int) +TEST_DATA_REDUCE_SCATTER(bfloat16, __nv_bfloat16, 7.8125e-3f) +TEST_DATA_REDUCE_SCATTER(float16, __half, 9.765625e-4f) +TEST_DATA_REDUCE_SCATTER(float32, float, 1.1920929e-7f) +TEST_DATA_REDUCE_SCATTER(int32, int, 0.0f) #define TEST_DATA_ALL_TO_ALL(FuncNameType, DataType) \ extern "C" __global__ void __launch_bounds__(1024, 1) test_data_all_to_all_##FuncNameType( \ @@ -118,6 +130,7 @@ TEST_DATA_REDUCE_SCATTER(int32, int) } \ } +TEST_DATA_ALL_TO_ALL(bfloat16, __nv_bfloat16) TEST_DATA_ALL_TO_ALL(float16, __half) TEST_DATA_ALL_TO_ALL(float32, float) TEST_DATA_ALL_TO_ALL(int32, int) \ No newline at end of file