mirror of
https://github.com/microsoft/mscclpp.git
synced 2026-05-12 09:17:06 +00:00
Support E4M3B15 datatype (#765)
## Summary

- **Add `fp8_e4m3b15` datatype**: A software-defined FP8 type with 4 exponent bits, 3 mantissa bits, and bias=15 (max finite value: 0.9375). Implemented entirely in software with no HW dependency, using Triton-style bit manipulation through fp16 as intermediate for efficient conversion.
- **Add mixed-precision accumulation for allreduce**: All allreduce algorithm variants (packet, NVLS packet, fullmesh, RSAG zero-copy, and others) now support a configurable `accumDtype` parameter, enabling FP8 inputs to be reduced in float16 or float32 for higher accuracy.
- **Propagate `accumDtype` through the full API**: The new parameter is threaded from `Algorithm::execute()` → `NativeAlgorithm` → `KernelFunc` → dispatch → CUDA kernels, with `DataType::AUTO` as the default (resolves to input dtype at runtime).
- **Add FP8 accumulation correctness tests**: New `test_fp8_accum.py` validates that higher-precision accumulation produces results at least as accurate as native FP8 accumulation across multiple algorithms and sizes. Skipped on CUDA SM < 89 (pre-Ada/Hopper); runs on HIP/ROCm.
- **Add `test_fp8_accum.py` to CI**: Azure Pipeline `ut.yml` now runs FP8 accumulation tests alongside existing pytests.
- **NCCL shim logging cleanup**: Migrated `printf`-style `WARN`/`INFO` calls to streaming-style logging.
## Key files

| Area | Files |
|------|-------|
| New datatype + vector ops | `include/mscclpp/gpu_data_types.hpp` |
| Accumulation reduce helpers | `src/core/include/reduce_kernel.hpp` |
| Algorithm API (`accumDtype`) | `include/mscclpp/algorithm.hpp`, `src/core/algorithm.cc` |
| Allreduce kernels | `src/ext/collectives/allreduce/*.cu` |
| Dispatch + common | `src/ext/collectives/include/allreduce/common.hpp` |
| Python bindings | `python/csrc/algorithm.cpp`, `python/mscclpp/_core/algorithm.py` |
| Tests | `python/test/test_fp8_accum.py` |
| CI | `.azure-pipelines/templates/ut.yml` |

## Test plan

- [x] CI passes on H100 (CUDA SM 90) — full FP8 E4M3 + E4M3B15 accumulation tests
- [x] CI passes on A100 (CUDA SM 80) — FP8 tests correctly skipped
- [x] CI passes on MI300X (ROCm) — FP8 tests run via HIP
- [x] Existing `test_mscclpp.py` tests continue to pass
- [x] NCCL shim builds and runs correctly with new `accumDtype` defaults

🤖 Generated with [Claude Code](https://claude.com/claude-code)

---------

Co-authored-by: Claude Opus 4.6 <noreply@anthropic.com>
Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
This commit is contained in:
@@ -41,7 +41,9 @@ NativeAlgorithm::NativeAlgorithm(std::string name, std::string collective, InitF
|
||||
CommResult NativeAlgorithm::execute(std::shared_ptr<Communicator> comm, const void* input, void* output,
|
||||
size_t inputSize, size_t outputSize, DataType dtype, ReduceOp op,
|
||||
cudaStream_t stream, std::shared_ptr<Executor>, int nBlocks, int nThreadsPerBlock,
|
||||
bool symmetricMemory, const std::unordered_map<std::string, uintptr_t>& extras) {
|
||||
bool symmetricMemory, const std::unordered_map<std::string, uintptr_t>& extras,
|
||||
DataType accumDtype) {
|
||||
if (accumDtype == DataType::AUTO) accumDtype = dtype;
|
||||
if (!initialized_) {
|
||||
initFunc_(comm);
|
||||
initialized_ = true;
|
||||
@@ -53,7 +55,7 @@ CommResult NativeAlgorithm::execute(std::shared_ptr<Communicator> comm, const vo
|
||||
contexts_[ctxKey] = ctx;
|
||||
}
|
||||
return kernelLaunchFunc_(contexts_[ctxKey], input, output, inputSize, outputSize, dtype, op, stream, nBlocks,
|
||||
nThreadsPerBlock, extras);
|
||||
nThreadsPerBlock, extras, accumDtype);
|
||||
}
|
||||
|
||||
const std::string& NativeAlgorithm::name() const { return name_; }
|
||||
@@ -77,10 +79,7 @@ const CollectiveBufferMode& NativeAlgorithm::bufferMode() const { return bufferM
|
||||
|
||||
Algorithm::Constraint NativeAlgorithm::constraint() const { return constraint_; }
|
||||
|
||||
void NativeAlgorithm::reset() {
|
||||
contexts_.clear();
|
||||
initialized_ = false;
|
||||
}
|
||||
void NativeAlgorithm::reset() { contexts_.clear(); }
|
||||
|
||||
void AlgorithmCollection::registerAlgorithm(const std::string collective, const std::string algoName,
|
||||
std::shared_ptr<Algorithm> algorithm) {
|
||||
@@ -166,7 +165,7 @@ Algorithm::Constraint DslAlgorithm::constraint() const { return constraint_; }
|
||||
CommResult DslAlgorithm::execute(std::shared_ptr<Communicator> comm, const void* input, void* output, size_t inputSize,
|
||||
size_t outputSize, DataType dtype, ReduceOp, cudaStream_t stream,
|
||||
std::shared_ptr<Executor> executor, int, int, bool,
|
||||
const std::unordered_map<std::string, uintptr_t>&) {
|
||||
const std::unordered_map<std::string, uintptr_t>&, DataType) {
|
||||
if (!executor) {
|
||||
THROW(EXEC, Error, ErrorCode::InvalidUsage, "Executor is null in DslAlgorithm::execute");
|
||||
}
|
||||
@@ -192,6 +191,10 @@ CommResult DslAlgorithm::execute(std::shared_ptr<Communicator> comm, const void*
|
||||
plan_, stream);
|
||||
break;
|
||||
#endif
|
||||
case DataType::FLOAT8_E4M3B15:
|
||||
executor->execute(rank, (__fp8_e4m3b15*)input, (__fp8_e4m3b15*)output, inputSize, outputSize,
|
||||
DataType::FLOAT8_E4M3B15, plan_, stream);
|
||||
break;
|
||||
case DataType::INT32:
|
||||
case DataType::UINT32:
|
||||
executor->execute(rank, (int*)input, (int*)output, inputSize, outputSize, DataType::UINT32, plan_, stream);
|
||||
|
||||
Reference in New Issue
Block a user