Adding Support to bf16 Executor Tests (#801)

This pull request adds support for the `bfloat16` (bf16) data type to the test executor, including both Python and CUDA components. The changes ensure that `bfloat16` is handled consistently across argument parsing, data type conversion, and test kernel implementations. Additionally, the CUDA verification kernels are refactored to use parameterized tolerances for improved numerical accuracy checks. **Support for bfloat16 data type:** * Added handling for `bfloat16`/`bf16` in the Python test executor's argument parsing, data type conversion (`parse_dtype`, `dtype_to_mscclpp_dtype`), and help text. [[1]](diffhunk://#diff-e643968a8622d1603868a8ecf4b2fcd8108be1e404a3420bb7e2a6d51dc23fdcR27-R28) [[2]](diffhunk://#diff-e643968a8622d1603868a8ecf4b2fcd8108be1e404a3420bb7e2a6d51dc23fdcL122-R135) [[3]](diffhunk://#diff-e643968a8622d1603868a8ecf4b2fcd8108be1e404a3420bb7e2a6d51dc23fdcL246-R251) * Updated output to display the correct data type string for `bfloat16`. **CUDA kernel and test improvements:** * Included `bfloat16` headers and defined test data fill and gather kernels for `bfloat16` on both CUDA and HIP platforms. [[1]](diffhunk://#diff-e18b8becff1c3b234733f5ca3250a76ffdc5edddb302c2da098b64b00ba7cf88R8-R11) [[2]](diffhunk://#diff-e18b8becff1c3b234733f5ca3250a76ffdc5edddb302c2da098b64b00ba7cf88R35) [[3]](diffhunk://#diff-e18b8becff1c3b234733f5ca3250a76ffdc5edddb302c2da098b64b00ba7cf88R54-R59) [[4]](diffhunk://#diff-e18b8becff1c3b234733f5ca3250a76ffdc5edddb302c2da098b64b00ba7cf88R133) * Refactored verification kernels (`ALL_REDUCE`, `REDUCE_SCATTER`) to use an explicit tolerance parameter (`Eps`) and added correct tolerances for each data type, including `bfloat16`. [[1]](diffhunk://#diff-e18b8becff1c3b234733f5ca3250a76ffdc5edddb302c2da098b64b00ba7cf88L69-R85) [[2]](diffhunk://#diff-e18b8becff1c3b234733f5ca3250a76ffdc5edddb302c2da098b64b00ba7cf88L94-R113) These changes ensure full support for `bfloat16` in the test executor and improve the accuracy and maintainability of the CUDA test kernels. --------- Co-authored-by: Caio Rocha <caiorocha@microsof.com>
2026-05-25 07:14:40 +00:00 · 2026-05-14 09:56:11 -07:00
parent 0c9b9abfd5
commit 40295df4c4
2 changed files with 36 additions and 18 deletions
--- a/python/test/executor_test.py
+++ b/python/test/executor_test.py
@@ -24,6 +24,8 @@ def parse_dtype(dtype_str):
    dtype_str = dtype_str.strip().lower()
    if dtype_str == "float16":
        return cp.float16
+    elif dtype_str in ("bfloat16", "bf16"):
+        return cp.float16  # same 2-byte size; mscclpp DataType is resolved from dtype_str
    elif dtype_str == "float32":
        return cp.float32
    elif dtype_str == "int32":
@@ -119,15 +121,18 @@ def parse_size(size_str):
        return int(size_str)


-def dtype_to_mscclpp_dtype(dtype):
-    if dtype == cp.float16:
+def dtype_to_mscclpp_dtype(dtype_str):
+    dtype_str = dtype_str.strip().lower()
+    if dtype_str == "float16":
        return DataType.float16
-    elif dtype == cp.float32:
+    elif dtype_str in ("bfloat16", "bf16"):
+        return DataType.bfloat16
+    elif dtype_str == "float32":
        return DataType.float32
-    elif dtype == cp.int32:
+    elif dtype_str == "int32":
        return DataType.int32
    else:
-        raise ValueError(f"Unknown data type: {dtype}")
+        raise ValueError(f"Unknown data type: {dtype_str}")


 def build_bufs(
@@ -205,7 +210,7 @@ def main(
        result_buf.data.ptr,
        input_buf.nbytes,
        result_buf.nbytes,
-        dtype_to_mscclpp_dtype(dtype),
+        dtype_to_mscclpp_dtype(dtype_str),
        execution_plan,
        stream.ptr,
        packet_type,
@@ -231,7 +236,7 @@ def main(
        npkit.shutdown()
    print(
        f"Rank: {mscclpp_group.my_rank} Execution time: {execution_time} us, "
-        f"data size: {result_buf.nbytes} bytes data type: {dtype().dtype.name} "
+        f"data size: {result_buf.nbytes} bytes data type: {dtype_str} "
        f"packet type: {packet_type}"
    )
    executor = None
@@ -243,7 +248,7 @@ if __name__ == "__main__":
    parser.add_argument("-path", "--execution_plan_path", type=str, required=True)
    parser.add_argument("--size", type=str, required=True)
    parser.add_argument("--in_place", action="store_true", help="flag to define an in-place operation")
-    parser.add_argument("--dtype", type=str, default="float16", help="Choose from float16, float32, int32")
+    parser.add_argument("--dtype", type=str, default="float16", help="Choose from float16, bfloat16, float32, int32")
    parser.add_argument("--packet_type", type=str, default="LL16", help="Choose from LL8, LL16")
    parser.add_argument("--n_iters", type=int, default=10)
    parser.add_argument("--n_graph_iters", type=int, default=10)
--- a/python/test/executor_test_verifier.cu
+++ b/python/test/executor_test_verifier.cu
@@ -4,8 +4,10 @@
 #include <assert.h>

 #if defined(__HIP_PLATFORM_AMD__)
+#include <hip/hip_bfloat16.h>
 #include <hip/hip_fp16.h>
 #else
+#include <cuda_bf16.h>
 #include <cuda_fp16.h>
 #endif

@@ -30,6 +32,7 @@ static __device__ unsigned int ranqd1(unsigned int seed) {
    }                                                                                                    \
  }

+FILL_DATA(bfloat16, __nv_bfloat16)
 FILL_DATA(float16, __half)
 FILL_DATA(float32, float)
 FILL_DATA(int32, int)
@@ -48,11 +51,12 @@ FILL_DATA(int32, int)
    }                                                                                                      \
  }

+TEST_DATA_ALL_GATHER(bfloat16, __nv_bfloat16)
 TEST_DATA_ALL_GATHER(float16, __half)
 TEST_DATA_ALL_GATHER(float32, float)
 TEST_DATA_ALL_GATHER(int32, int)

-#define TEST_DATA_ALL_REDUCE(FuncNameType, DataType)                                                       \
+#define TEST_DATA_ALL_REDUCE(FuncNameType, DataType, Eps)                                                  \
  extern "C" __global__ void __launch_bounds__(1024, 1) test_data_all_reduce_##FuncNameType(               \
      DataType* result_buf, DataType* test_buf, size_t num_elems, int num_ranks, int my_rank, int seq) {   \
    for (int rank = 0; rank < num_ranks; rank++) {                                                         \
@@ -66,15 +70,19 @@ TEST_DATA_ALL_GATHER(int32, int)
      }                                                                                                    \
    }                                                                                                      \
    for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < num_elems; i += blockDim.x * gridDim.x) {   \
-      assert(abs(float(result_buf[i]) - float(test_buf[i])) < 1e-3 * num_ranks);                           \
+      float expected = float(test_buf[i]);                                                                 \
+      float result = float(result_buf[i]);                                                                 \
+      float tol = Eps * num_ranks * (1.0f + abs(expected));                                                \
+      assert(abs(result - expected) <= tol);                                                               \
    }                                                                                                      \
  }

-TEST_DATA_ALL_REDUCE(float16, __half)
-TEST_DATA_ALL_REDUCE(float32, float)
-TEST_DATA_ALL_REDUCE(int32, int)
+TEST_DATA_ALL_REDUCE(bfloat16, __nv_bfloat16, 7.8125e-3f)
+TEST_DATA_ALL_REDUCE(float16, __half, 9.765625e-4f)
+TEST_DATA_ALL_REDUCE(float32, float, 1.1920929e-7f)
+TEST_DATA_ALL_REDUCE(int32, int, 0.0f)

-#define TEST_DATA_REDUCE_SCATTER(FuncNameType, DataType)                                                   \
+#define TEST_DATA_REDUCE_SCATTER(FuncNameType, DataType, Eps)                                              \
  extern "C" __global__ void __launch_bounds__(1024, 1) test_data_reduce_scatter_##FuncNameType(           \
      DataType* result_buf, DataType* test_buf, size_t num_elems, int num_ranks, int my_rank, int seq) {   \
    int nem_elems_per_rank = num_elems / num_ranks;                                                        \
@@ -91,14 +99,18 @@ TEST_DATA_ALL_REDUCE(int32, int)
    }                                                                                                      \
    for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < num_elems; i += blockDim.x * gridDim.x) {   \
      if (i >= offset && i < offset + nem_elems_per_rank) {                                                \
-        assert(abs(float(result_buf[i - offset]) - float(test_buf[i])) < 1e-3 * num_ranks);                \
+        float expected = float(test_buf[i]);                                                               \
+        float result = float(result_buf[i - offset]);                                                      \
+        float tol = Eps * num_ranks * (1.0f + abs(expected));                                              \
+        assert(abs(result - expected) <= tol);                                                             \
      }                                                                                                    \
    }                                                                                                      \
  }

-TEST_DATA_REDUCE_SCATTER(float16, __half)
-TEST_DATA_REDUCE_SCATTER(float32, float)
-TEST_DATA_REDUCE_SCATTER(int32, int)
+TEST_DATA_REDUCE_SCATTER(bfloat16, __nv_bfloat16, 7.8125e-3f)
+TEST_DATA_REDUCE_SCATTER(float16, __half, 9.765625e-4f)
+TEST_DATA_REDUCE_SCATTER(float32, float, 1.1920929e-7f)
+TEST_DATA_REDUCE_SCATTER(int32, int, 0.0f)

 #define TEST_DATA_ALL_TO_ALL(FuncNameType, DataType)                                                       \
  extern "C" __global__ void __launch_bounds__(1024, 1) test_data_all_to_all_##FuncNameType(               \
@@ -118,6 +130,7 @@ TEST_DATA_REDUCE_SCATTER(int32, int)
    }                                                                                                      \
  }

+TEST_DATA_ALL_TO_ALL(bfloat16, __nv_bfloat16)
 TEST_DATA_ALL_TO_ALL(float16, __half)
 TEST_DATA_ALL_TO_ALL(float32, float)
 TEST_DATA_ALL_TO_ALL(int32, int)