diff --git a/ggml/src/ggml-cuda/binbcast.cu b/ggml/src/ggml-cuda/binbcast.cu
index 3440ac01..96166ceb 100644
--- a/ggml/src/ggml-cuda/binbcast.cu
+++ b/ggml/src/ggml-cuda/binbcast.cu
@@ -336,7 +336,15 @@ static __global__ void k_add_same(int64_t nelem, const data_t * x, const data_t
     if (i >= nelem) {
         return;
     }
-    z[i] = x[i] + y[i];
+    if constexpr (std::is_same_v<data_t, nv_bfloat16>) {
+#if __CUDA_ARCH__ >= CC_AMPERE
+        z[i] = x[i] + y[i];
+#else
+        z[i] = __float2bfloat16((float)x[i] + (float)y[i]);
+#endif
+    } else {
+        z[i] = x[i] + y[i];
+    }
 }
 
 template
@@ -373,20 +381,6 @@ static __global__ void k_add_same_q8_0(int nelem, const block_q8_0 * x, const fl
     }
 }
 
-//static __global__ void k_add_same_q8_0(const block_q8_0 * x, const block_q8_0 * y, block_q8_0 * z) {
-//    int ib = blockIdx.x;
-//    int iq = threadIdx.x;
-//    float s = (float)x[ib].d * x[ib].qs[iq] + (float)y[ib].d * y[ib].qs[iq];
-//    float as = fabsf(s);
-//    as = warp_reduce_max(as);
-//    float d = as / 127;
-//    float id = d > 0 ? 1/d : 0;
-//    z[ib].qs[iq] = roundf(s * id);
-//    if (threadIdx.x == 0) {
-//        z[ib].d = (half)d;
-//    }
-//}
-
 void ggml_op_add_same_type(ggml_backend_cuda_context & ctx, enum ggml_type type, size_t nelem, const void * x, const void * y, void * z) {
     constexpr int kBlockSize = 256;
diff --git a/ggml/src/ggml-cuda/reduce.cu b/ggml/src/ggml-cuda/reduce.cu
index 23d054b7..c3177be0 100644
--- a/ggml/src/ggml-cuda/reduce.cu
+++ b/ggml/src/ggml-cuda/reduce.cu
@@ -14,7 +14,15 @@ template
 static __global__ void k_add(int nelem, const T * __restrict__ src, T * __restrict__ dst) {
     int i = blockIdx.x*block_size + threadIdx.x;
     if (i >= nelem) return;
-    dst[i] += src[i];
+    if constexpr (std::is_same_v<T, nv_bfloat16>) {
+#if __CUDA_ARCH__ >= CC_AMPERE
+        dst[i] += src[i];
+#else
+        dst[i] = __float2bfloat16((float)src[i] + (float)dst[i]);
+#endif
+    } else {
+        dst[i] += src[i];
+    }
 }
 
 template
@@ -130,7 +138,13 @@ void ggml_cuda_op_reduce([[maybe_unused]] ggml_backend_cuda_context & ctx, ggml_
     // It does not work at all if not all GPUs participate in the reduce op, and we
     // get suboptimal prompt processing performance when we have more than 2 GPUs.
     // Hence, if enabled, we use NCCL only for the cases where it works and performs well.
-    if (info.have_nccl && dst->type != GGML_TYPE_Q8_0 && nhave == nreduce && (nhave == 2 || dst->ne[1] < 32)) {
+#if __CUDA_ARCH__ >= CC_AMPERE
+    constexpr bool bf16_supported = true;
+#else
+    constexpr bool bf16_supported = false;
+#endif
+    if (info.have_nccl && dst->type != GGML_TYPE_Q8_0 && nhave == nreduce && (nhave == 2 || dst->ne[1] < 32) &&
+        (dst->type != GGML_TYPE_BF16 || bf16_supported)) {
         GGML_ASSERT(info.have_nccl);
         GGML_ASSERT(info.device_count == nreduce);
         auto data_type = dst->type == GGML_TYPE_F32 ? ncclFloat : dst->type == GGML_TYPE_BF16 ? ncclBfloat16 : ncclHalf;
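
Not part of the patch: a minimal, self-contained sketch of the compute-capability gating both kernels rely on, assuming native __nv_bfloat16 arithmetic is only exposed for __CUDA_ARCH__ >= 800 (the value CC_AMPERE stands for in this repo); on older targets the sum is widened to fp32 and rounded back with __float2bfloat16. The kernel and parameter names below are illustrative, not taken from the repository.

#include <cuda_bf16.h>
#include <cstdint>
#include <type_traits>

// Illustrative kernel, not the one in binbcast.cu/reduce.cu.
template <typename T>
__global__ void add_bf16_safe(int64_t nelem, const T * x, const T * y, T * z) {
    const int64_t i = (int64_t)blockIdx.x * blockDim.x + threadIdx.x;
    if (i >= nelem) {
        return;
    }
    if constexpr (std::is_same_v<T, __nv_bfloat16>) {
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 800
        // Ampere and newer expose native bf16 operators.
        z[i] = x[i] + y[i];
#else
        // Pre-Ampere: widen to fp32, add, round back to bf16.
        z[i] = __float2bfloat16(__bfloat162float(x[i]) + __bfloat162float(y[i]));
#endif
    } else {
        z[i] = x[i] + y[i];
    }
}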