From 5c1c0e2bade18e9f32450f697b3ee85aa9a5e0dc Mon Sep 17 00:00:00 2001
From: Kawrakow
Date: Mon, 19 Jan 2026 09:25:20 +0000
Subject: [PATCH] Prevent using NCCL if graph reduce type is bf16 and arch <
 AMPERE

---
 ggml/src/ggml-cuda/reduce.cu | 11 ++++++++++-
 1 file changed, 10 insertions(+), 1 deletion(-)

diff --git a/ggml/src/ggml-cuda/reduce.cu b/ggml/src/ggml-cuda/reduce.cu
index d2f133d5..c3177be0 100644
--- a/ggml/src/ggml-cuda/reduce.cu
+++ b/ggml/src/ggml-cuda/reduce.cu
@@ -138,7 +138,16 @@ void ggml_cuda_op_reduce([[maybe_unused]] ggml_backend_cuda_context & ctx, ggml_
     // It does not work at all if not all GPUs participate in the reduce op, and we
     // get suboptimal prompt processing performance when we have more than 2 GPUs.
     // Hence, if enabled, we use NCCL only for the cases where it works and performs well.
-    if (info.have_nccl && dst->type != GGML_TYPE_Q8_0 && nhave == nreduce && (nhave == 2 || dst->ne[1] < 32)) {
+    // ncclBfloat16 reductions require Ampere (SM 8.0) or newer on every participating GPU.
+    // NOTE: a "#if __CUDA_ARCH__ >= CC_AMPERE" guard would be wrong here: this is host
+    // code, where __CUDA_ARCH__ is undefined, so the preprocessor check would evaluate
+    // to false and disable NCCL for bf16 unconditionally, on all architectures.
+    // Query the actual compute capability of each device at run time instead.
+    bool bf16_supported = true;
+    for (int id = 0; id < info.device_count; ++id) {
+        bf16_supported = bf16_supported && ggml_cuda_info().devices[id].cc >= CC_AMPERE;
+    }
+    if (info.have_nccl && dst->type != GGML_TYPE_Q8_0 && nhave == nreduce && (nhave == 2 || dst->ne[1] < 32) &&
+        (dst->type != GGML_TYPE_BF16 || bf16_supported)) {
     GGML_ASSERT(info.have_nccl);
     GGML_ASSERT(info.device_count == nreduce);
     auto data_type = dst->type == GGML_TYPE_F32 ? ncclFloat : dst->type == GGML_TYPE_BF16 ? ncclBfloat16 : ncclHalf;