diff --git a/ggml/src/ggml-cuda/reduce.cu b/ggml/src/ggml-cuda/reduce.cu
index d2f133d5..c3177be0 100644
--- a/ggml/src/ggml-cuda/reduce.cu
+++ b/ggml/src/ggml-cuda/reduce.cu
@@ -138,7 +138,13 @@ void ggml_cuda_op_reduce([[maybe_unused]] ggml_backend_cuda_context & ctx, ggml_
     // It does not work at all if not all GPUs participate in the reduce op, and we
     // get suboptimal prompt processing performance when we have more than 2 GPUs.
     // Hence, if enabled, we use NCCL only for the cases where it works and performs well.
-    if (info.have_nccl && dst->type != GGML_TYPE_Q8_0 && nhave == nreduce && (nhave == 2 || dst->ne[1] < 32)) {
+#if __CUDA_ARCH__ >= CC_AMPERE
+    constexpr bool bf16_supported = true;
+#else
+    constexpr bool bf16_supported = false;
+#endif
+    if (info.have_nccl && dst->type != GGML_TYPE_Q8_0 && nhave == nreduce && (nhave == 2 || dst->ne[1] < 32) &&
+        (dst->type != GGML_TYPE_BF16 || bf16_supported)) {
         GGML_ASSERT(info.have_nccl);
         GGML_ASSERT(info.device_count == nreduce);
         auto data_type = dst->type == GGML_TYPE_F32 ? ncclFloat : dst->type == GGML_TYPE_BF16 ? ncclBfloat16 : ncclHalf;
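
Note on the `#if __CUDA_ARCH__ >= CC_AMPERE` guard: `__CUDA_ARCH__` is only defined while nvcc compiles device code, so in a host function such as `ggml_cuda_op_reduce` the preprocessor always takes the `#else` branch and `bf16_supported` is `false` at runtime regardless of the installed GPU. A minimal sketch of a host-side runtime check using the standard CUDA runtime API is below; the helper name `bf16_supported_on` is hypothetical (not part of ggml), and `8` is the Ampere SM major version (`CC_AMPERE / 100`):

```cpp
#include <cuda_runtime.h>

// Hypothetical helper: query the device's compute capability at runtime
// instead of relying on __CUDA_ARCH__, which is undefined during host
// compilation and would make the #if above constant-false in host code.
static bool bf16_supported_on(int device) {
    cudaDeviceProp prop;
    if (cudaGetDeviceProperties(&prop, device) != cudaSuccess) {
        return false; // be conservative if the query fails
    }
    // Native BF16 support starts with Ampere (SM 8.0).
    return prop.major >= 8;
}
```

In the reduce path this would be evaluated once per participating device (e.g. when `info` is populated) rather than being fixed at compile time.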