Prevent using NCCL if graph reduce type is bf16 and arch < AMPERE

2026-03-12 23:10:01 +00:00 · 2026-01-19 09:25:20 +00:00
parent 889c553a34
commit 5c1c0e2bad
1 changed files with 7 additions and 1 deletions
--- a/ggml/src/ggml-cuda/reduce.cu
+++ b/ggml/src/ggml-cuda/reduce.cu
@@ -138,7 +138,13 @@ void ggml_cuda_op_reduce([[maybe_unused]] ggml_backend_cuda_context & ctx, ggml_
    // It does not work at all if not all GPUs participate in the reduce op, and we
    // get suboptimal prompt processing performance when we have more than 2 GPUs.
    // Hence, if enabled, we use NCCL only for the cases where it works and performs well.
-    if (info.have_nccl && dst->type != GGML_TYPE_Q8_0 && nhave == nreduce && (nhave == 2 || dst->ne[1] < 32)) {
+#if __CUDA_ARCH__ >= CC_AMPERE
+    constexpr bool bf16_supported = true;
+#else
+    constexpr bool bf16_supported = false;
+#endif
+    if (info.have_nccl && dst->type != GGML_TYPE_Q8_0 && nhave == nreduce && (nhave == 2 || dst->ne[1] < 32) &&
+       (dst->type != GGML_TYPE_BF16 || bf16_supported)) {
        GGML_ASSERT(info.have_nccl);
        GGML_ASSERT(info.device_count == nreduce);
        auto data_type = dst->type == GGML_TYPE_F32 ? ncclFloat : dst->type == GGML_TYPE_BF16 ? ncclBfloat16 : ncclHalf;