Files
mscclpp/python/test/executor_test_verifier.cu
Caio Rocha 40295df4c4 Adding Support to bf16 Executor Tests (#801)
This pull request adds support for the `bfloat16` (bf16) data type to
the test executor, including both Python and CUDA components. The
changes ensure that `bfloat16` is handled consistently across argument
parsing, data type conversion, and test kernel implementations.
Additionally, the CUDA verification kernels are refactored to use
parameterized tolerances for improved numerical accuracy checks.

**Support for bfloat16 data type:**

* Added handling for `bfloat16`/`bf16` in the Python test executor's
argument parsing, data type conversion (`parse_dtype`,
`dtype_to_mscclpp_dtype`), and help text.
[[1]](diffhunk://#diff-e643968a8622d1603868a8ecf4b2fcd8108be1e404a3420bb7e2a6d51dc23fdcR27-R28)
[[2]](diffhunk://#diff-e643968a8622d1603868a8ecf4b2fcd8108be1e404a3420bb7e2a6d51dc23fdcL122-R135)
[[3]](diffhunk://#diff-e643968a8622d1603868a8ecf4b2fcd8108be1e404a3420bb7e2a6d51dc23fdcL246-R251)
* Updated output to display the correct data type string for `bfloat16`.

**CUDA kernel and test improvements:**

* Included `bfloat16` headers and defined test data fill and gather
kernels for `bfloat16` on both CUDA and HIP platforms.
[[1]](diffhunk://#diff-e18b8becff1c3b234733f5ca3250a76ffdc5edddb302c2da098b64b00ba7cf88R8-R11)
[[2]](diffhunk://#diff-e18b8becff1c3b234733f5ca3250a76ffdc5edddb302c2da098b64b00ba7cf88R35)
[[3]](diffhunk://#diff-e18b8becff1c3b234733f5ca3250a76ffdc5edddb302c2da098b64b00ba7cf88R54-R59)
[[4]](diffhunk://#diff-e18b8becff1c3b234733f5ca3250a76ffdc5edddb302c2da098b64b00ba7cf88R133)
* Refactored verification kernels (`ALL_REDUCE`, `REDUCE_SCATTER`) to
use an explicit tolerance parameter (`Eps`) and added correct tolerances
for each data type, including `bfloat16`.
[[1]](diffhunk://#diff-e18b8becff1c3b234733f5ca3250a76ffdc5edddb302c2da098b64b00ba7cf88L69-R85)
[[2]](diffhunk://#diff-e18b8becff1c3b234733f5ca3250a76ffdc5edddb302c2da098b64b00ba7cf88L94-R113)

These changes ensure full support for `bfloat16` in the test executor
and improve the accuracy and maintainability of the CUDA test kernels.

---------

Co-authored-by: Caio Rocha <caiorocha@microsof.com>
2026-05-14 09:56:11 -07:00

136 lines
9.7 KiB
Plaintext

// Copyright (c) Microsoft Corporation.
// Licensed under the MIT license.
#include <assert.h>
#if defined(__HIP_PLATFORM_AMD__)
#include <hip/hip_bfloat16.h>
#include <hip/hip_fp16.h>
#else
#include <cuda_bf16.h>
#include <cuda_fp16.h>
#endif
// Numerical Recipes ranqd1, Chapter 7.1, §An Even Quicker Generator, Eq. 7.1.6
// parameters from Knuth and H. W. Lewis
static __device__ unsigned int ranqd1(unsigned int seed) {
const unsigned int a = 1664525;
const unsigned int c = 1013904223;
return a * seed + c;
}
// fill/test kernel pairs must have the same thread block size to
// match their random number series.
#define FILL_DATA(FuncNameType, DataType) \
extern "C" __global__ void __launch_bounds__(1024, 1) \
fill_data_##FuncNameType(DataType* input_buf, size_t num_elems, int rank, int seq) { \
unsigned int seed = (unsigned int)(blockIdx.x * blockDim.x + threadIdx.x + rank + seq); \
for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < num_elems; i += blockDim.x * gridDim.x) { \
seed = ranqd1(seed); \
input_buf[i] = DataType(seed % blockDim.x) / DataType(blockDim.x); \
} \
}
FILL_DATA(bfloat16, __nv_bfloat16)
FILL_DATA(float16, __half)
FILL_DATA(float32, float)
FILL_DATA(int32, int)
#define TEST_DATA_ALL_GATHER(FuncNameType, DataType) \
extern "C" __global__ void __launch_bounds__(1024, 1) test_data_all_gather_##FuncNameType( \
DataType* result_buf, DataType* test_buf, size_t num_elems, int num_ranks, int my_rank, int seq) { \
for (int rank = 0; rank < num_ranks; rank++) { \
size_t rank_offset = rank * num_elems; \
unsigned int seed = (unsigned int)(blockIdx.x * blockDim.x + threadIdx.x + rank + seq); \
for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < num_elems; i += blockDim.x * gridDim.x) { \
seed = ranqd1(seed); \
test_buf[rank_offset + i] = DataType(seed % blockDim.x) / DataType(blockDim.x); \
assert(result_buf[rank_offset + i] == test_buf[rank_offset + i]); \
} \
} \
}
TEST_DATA_ALL_GATHER(bfloat16, __nv_bfloat16)
TEST_DATA_ALL_GATHER(float16, __half)
TEST_DATA_ALL_GATHER(float32, float)
TEST_DATA_ALL_GATHER(int32, int)
#define TEST_DATA_ALL_REDUCE(FuncNameType, DataType, Eps) \
extern "C" __global__ void __launch_bounds__(1024, 1) test_data_all_reduce_##FuncNameType( \
DataType* result_buf, DataType* test_buf, size_t num_elems, int num_ranks, int my_rank, int seq) { \
for (int rank = 0; rank < num_ranks; rank++) { \
unsigned int seed = (unsigned int)(blockIdx.x * blockDim.x + threadIdx.x + rank + seq); \
for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < num_elems; i += blockDim.x * gridDim.x) { \
if (rank == 0) { \
test_buf[i] = 0; \
} \
seed = ranqd1(seed); \
test_buf[i] += DataType(seed % blockDim.x) / DataType(blockDim.x); \
} \
} \
for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < num_elems; i += blockDim.x * gridDim.x) { \
float expected = float(test_buf[i]); \
float result = float(result_buf[i]); \
float tol = Eps * num_ranks * (1.0f + abs(expected)); \
assert(abs(result - expected) <= tol); \
} \
}
TEST_DATA_ALL_REDUCE(bfloat16, __nv_bfloat16, 7.8125e-3f)
TEST_DATA_ALL_REDUCE(float16, __half, 9.765625e-4f)
TEST_DATA_ALL_REDUCE(float32, float, 1.1920929e-7f)
TEST_DATA_ALL_REDUCE(int32, int, 0.0f)
#define TEST_DATA_REDUCE_SCATTER(FuncNameType, DataType, Eps) \
extern "C" __global__ void __launch_bounds__(1024, 1) test_data_reduce_scatter_##FuncNameType( \
DataType* result_buf, DataType* test_buf, size_t num_elems, int num_ranks, int my_rank, int seq) { \
int nem_elems_per_rank = num_elems / num_ranks; \
int offset = nem_elems_per_rank * my_rank; \
for (int rank = 0; rank < num_ranks; rank++) { \
unsigned int seed = (unsigned int)(blockIdx.x * blockDim.x + threadIdx.x + rank + seq); \
for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < num_elems; i += blockDim.x * gridDim.x) { \
if (rank == 0) { \
test_buf[i] = 0; \
} \
seed = ranqd1(seed); \
test_buf[i] += DataType(seed % blockDim.x) / DataType(blockDim.x); \
} \
} \
for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < num_elems; i += blockDim.x * gridDim.x) { \
if (i >= offset && i < offset + nem_elems_per_rank) { \
float expected = float(test_buf[i]); \
float result = float(result_buf[i - offset]); \
float tol = Eps * num_ranks * (1.0f + abs(expected)); \
assert(abs(result - expected) <= tol); \
} \
} \
}
TEST_DATA_REDUCE_SCATTER(bfloat16, __nv_bfloat16, 7.8125e-3f)
TEST_DATA_REDUCE_SCATTER(float16, __half, 9.765625e-4f)
TEST_DATA_REDUCE_SCATTER(float32, float, 1.1920929e-7f)
TEST_DATA_REDUCE_SCATTER(int32, int, 0.0f)
#define TEST_DATA_ALL_TO_ALL(FuncNameType, DataType) \
extern "C" __global__ void __launch_bounds__(1024, 1) test_data_all_to_all_##FuncNameType( \
DataType* result_buf, DataType* test_buf, size_t num_elems, int num_ranks, int my_rank, int seq) { \
int nem_elems_per_rank = num_elems / num_ranks; \
int offset = nem_elems_per_rank * my_rank; \
for (int rank = 0; rank < num_ranks; rank++) { \
size_t rank_offset = rank * nem_elems_per_rank; \
unsigned int seed = (unsigned int)(blockIdx.x * blockDim.x + threadIdx.x + rank + seq); \
for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < num_elems; i += blockDim.x * gridDim.x) { \
seed = ranqd1(seed); \
if (i >= my_rank * nem_elems_per_rank && i < (my_rank + 1) * nem_elems_per_rank) { \
test_buf[rank_offset + i - offset] = DataType(seed % blockDim.x) / DataType(blockDim.x); \
assert(result_buf[rank_offset + i - offset] == test_buf[rank_offset + i - offset]); \
} \
} \
} \
}
TEST_DATA_ALL_TO_ALL(bfloat16, __nv_bfloat16)
TEST_DATA_ALL_TO_ALL(float16, __half)
TEST_DATA_ALL_TO_ALL(float32, float)
TEST_DATA_ALL_TO_ALL(int32, int)