mirror of
https://github.com/ikawrakow/ik_llama.cpp.git
synced 2026-05-01 03:41:53 +00:00
Fix bf16 additions on CUDA arch < Ampere (#1164)
* Fix bf16 additions on CUDA arch < Ampere * Prevent using NCCL if graph reduce type is bf16 and arch < AMPERE
This commit is contained in:
@@ -336,7 +336,15 @@ static __global__ void k_add_same(int64_t nelem, const data_t * x, const data_t
|
|||||||
if (i >= nelem) {
|
if (i >= nelem) {
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
z[i] = x[i] + y[i];
|
if constexpr (std::is_same_v<data_t, nv_bfloat16>) {
|
||||||
|
#if __CUDA_ARCH__ >= CC_AMPERE
|
||||||
|
z[i] = x[i] + y[i];
|
||||||
|
#else
|
||||||
|
z[i] = __float2bfloat16((float)x[i] + (float)y[i]);
|
||||||
|
#endif
|
||||||
|
} else {
|
||||||
|
z[i] = x[i] + y[i];
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
template <int block_size>
|
template <int block_size>
|
||||||
@@ -373,20 +381,6 @@ static __global__ void k_add_same_q8_0(int nelem, const block_q8_0 * x, const fl
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
//static __global__ void k_add_same_q8_0(const block_q8_0 * x, const block_q8_0 * y, block_q8_0 * z) {
|
|
||||||
// int ib = blockIdx.x;
|
|
||||||
// int iq = threadIdx.x;
|
|
||||||
// float s = (float)x[ib].d * x[ib].qs[iq] + (float)y[ib].d * y[ib].qs[iq];
|
|
||||||
// float as = fabsf(s);
|
|
||||||
// as = warp_reduce_max(as);
|
|
||||||
// float d = as / 127;
|
|
||||||
// float id = d > 0 ? 1/d : 0;
|
|
||||||
// z[ib].qs[iq] = roundf(s * id);
|
|
||||||
// if (threadIdx.x == 0) {
|
|
||||||
// z[ib].d = (half)d;
|
|
||||||
// }
|
|
||||||
//}
|
|
||||||
|
|
||||||
void ggml_op_add_same_type(ggml_backend_cuda_context & ctx, enum ggml_type type, size_t nelem,
|
void ggml_op_add_same_type(ggml_backend_cuda_context & ctx, enum ggml_type type, size_t nelem,
|
||||||
const void * x, const void * y, void * z) {
|
const void * x, const void * y, void * z) {
|
||||||
constexpr int kBlockSize = 256;
|
constexpr int kBlockSize = 256;
|
||||||
|
|||||||
@@ -14,7 +14,15 @@ template <typename T, int block_size>
|
|||||||
static __global__ void k_add(int nelem, const T * __restrict__ src, T * __restrict__ dst) {
|
static __global__ void k_add(int nelem, const T * __restrict__ src, T * __restrict__ dst) {
|
||||||
int i = blockIdx.x*block_size + threadIdx.x;
|
int i = blockIdx.x*block_size + threadIdx.x;
|
||||||
if (i >= nelem) return;
|
if (i >= nelem) return;
|
||||||
dst[i] += src[i];
|
if constexpr (std::is_same_v<T, nv_bfloat16>) {
|
||||||
|
#if __CUDA_ARCH__ >= CC_AMPERE
|
||||||
|
dst[i] += src[i];
|
||||||
|
#else
|
||||||
|
dst[i] = __float2bfloat16((float)src[i] + (float)dst[i]);
|
||||||
|
#endif
|
||||||
|
} else {
|
||||||
|
dst[i] += src[i];
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
template <int block_size>
|
template <int block_size>
|
||||||
@@ -130,7 +138,13 @@ void ggml_cuda_op_reduce([[maybe_unused]] ggml_backend_cuda_context & ctx, ggml_
|
|||||||
// It does not work at all if not all GPUs participate in the reduce op, and we
|
// It does not work at all if not all GPUs participate in the reduce op, and we
|
||||||
// get suboptimal prompt processing performance when we have more than 2 GPUs.
|
// get suboptimal prompt processing performance when we have more than 2 GPUs.
|
||||||
// Hence, if enabled, we use NCCL only for the cases where it works and performs well.
|
// Hence, if enabled, we use NCCL only for the cases where it works and performs well.
|
||||||
if (info.have_nccl && dst->type != GGML_TYPE_Q8_0 && nhave == nreduce && (nhave == 2 || dst->ne[1] < 32)) {
|
#if __CUDA_ARCH__ >= CC_AMPERE
|
||||||
|
constexpr bool bf16_supported = true;
|
||||||
|
#else
|
||||||
|
constexpr bool bf16_supported = false;
|
||||||
|
#endif
|
||||||
|
if (info.have_nccl && dst->type != GGML_TYPE_Q8_0 && nhave == nreduce && (nhave == 2 || dst->ne[1] < 32) &&
|
||||||
|
(dst->type != GGML_TYPE_BF16 || bf16_supported)) {
|
||||||
GGML_ASSERT(info.have_nccl);
|
GGML_ASSERT(info.have_nccl);
|
||||||
GGML_ASSERT(info.device_count == nreduce);
|
GGML_ASSERT(info.device_count == nreduce);
|
||||||
auto data_type = dst->type == GGML_TYPE_F32 ? ncclFloat : dst->type == GGML_TYPE_BF16 ? ncclBfloat16 : ncclHalf;
|
auto data_type = dst->type == GGML_TYPE_F32 ? ncclFloat : dst->type == GGML_TYPE_BF16 ? ncclBfloat16 : ncclHalf;
|
||||||
|
|||||||
Reference in New Issue
Block a user