Merge remote-tracking branch 'origin/main' into ik/adaptive_p_2

Kawrakow committed on 2026-01-19 13:11:04 +00:00
3 changed files with 25 additions and 30 deletions

View File

@@ -336,7 +336,15 @@ static __global__ void k_add_same(int64_t nelem, const data_t * x, const data_t
     if (i >= nelem) {
         return;
     }
-    z[i] = x[i] + y[i];
+    if constexpr (std::is_same_v<data_t, nv_bfloat16>) {
+#if __CUDA_ARCH__ >= CC_AMPERE
+        z[i] = x[i] + y[i];
+#else
+        z[i] = __float2bfloat16((float)x[i] + (float)y[i]);
+#endif
+    } else {
+        z[i] = x[i] + y[i];
+    }
 }
 
 template <int block_size>
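Note: this branch (and the analogous one in k_add in the next file) exists because native __nv_bfloat16 arithmetic is only available on compute capability 8.0 (Ampere) and newer; older architectures go through float and convert back. A minimal standalone sketch of the pattern, with illustrative names and the literal 800 standing in for CC_AMPERE:

#include <cuda_bf16.h>
#include <type_traits>

// Illustrative only: mirrors the bf16 fallback used in the kernels above.
template <typename T>
static __global__ void add_elementwise(int64_t nelem, const T * x, const T * y, T * z) {
    int64_t i = (int64_t)blockIdx.x*blockDim.x + threadIdx.x;
    if (i >= nelem) return;
    if constexpr (std::is_same_v<T, nv_bfloat16>) {
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 800
        z[i] = x[i] + y[i];                                   // native bf16 add (Ampere+)
#else
        z[i] = __float2bfloat16((float)x[i] + (float)y[i]);   // widen to float, add, narrow back
#endif
    } else {
        z[i] = x[i] + y[i];
    }
}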
@@ -373,20 +381,6 @@ static __global__ void k_add_same_q8_0(int nelem, const block_q8_0 * x, const fl
     }
 }
 
-//static __global__ void k_add_same_q8_0(const block_q8_0 * x, const block_q8_0 * y, block_q8_0 * z) {
-//    int ib = blockIdx.x;
-//    int iq = threadIdx.x;
-//    float s = (float)x[ib].d * x[ib].qs[iq] + (float)y[ib].d * y[ib].qs[iq];
-//    float as = fabsf(s);
-//    as = warp_reduce_max(as);
-//    float d = as / 127;
-//    float id = d > 0 ? 1/d : 0;
-//    z[ib].qs[iq] = roundf(s * id);
-//    if (threadIdx.x == 0) {
-//        z[ib].d = (half)d;
-//    }
-//}
-
 void ggml_op_add_same_type(ggml_backend_cuda_context & ctx, enum ggml_type type, size_t nelem,
                            const void * x, const void * y, void * z) {
     constexpr int kBlockSize = 256;
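Note: the deleted commented-out kernel documents how a fused Q8_0 add would re-quantize each block against the new absolute maximum. A scalar CPU sketch of that math, assuming the standard ggml Q8_0 layout (QK8_0 = 32 int8 quants plus a scale); function and parameter names are illustrative:

#include <algorithm>
#include <cmath>
#include <cstdint>

// Scalar reference for adding two Q8_0 blocks (illustrative, not the code used in the tree).
// A block stores QK8_0 quants qs[i] and a scale d; element i has value d*qs[i].
constexpr int QK8_0 = 32;

static void add_q8_0_block(float dx, const int8_t * qx,
                           float dy, const int8_t * qy,
                           float & dz, int8_t * qz) {
    float s[QK8_0];
    float amax = 0.0f;
    for (int i = 0; i < QK8_0; ++i) {
        s[i] = dx*qx[i] + dy*qy[i];               // dequantize both operands and add
        amax = std::max(amax, std::fabs(s[i]));   // track the new absolute maximum
    }
    dz = amax/127.0f;                             // new scale: largest value maps to +/-127
    const float id = dz > 0.0f ? 1.0f/dz : 0.0f;
    for (int i = 0; i < QK8_0; ++i) {
        qz[i] = (int8_t)std::round(s[i]*id);      // re-quantize against the new scale
    }
}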

View File

@@ -14,7 +14,15 @@ template <typename T, int block_size>
 static __global__ void k_add(int nelem, const T * __restrict__ src, T * __restrict__ dst) {
     int i = blockIdx.x*block_size + threadIdx.x;
     if (i >= nelem) return;
-    dst[i] += src[i];
+    if constexpr (std::is_same_v<T, nv_bfloat16>) {
+#if __CUDA_ARCH__ >= CC_AMPERE
+        dst[i] += src[i];
+#else
+        dst[i] = __float2bfloat16((float)src[i] + (float)dst[i]);
+#endif
+    } else {
+        dst[i] += src[i];
+    }
 }
 
 template <int block_size>
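Note: kernels templated on block_size like k_add above are launched with the usual ceil-divide grid calculation. An illustrative launcher pairing with that kernel (the function name and the stream argument are placeholders, not the backend's actual API):

// Illustrative host-side launcher for the block_size-templated k_add shown above.
template <typename T>
static void launch_add(int nelem, const T * src, T * dst, cudaStream_t stream) {
    constexpr int kBlockSize = 256;
    const int num_blocks = (nelem + kBlockSize - 1)/kBlockSize;   // ceil(nelem/kBlockSize)
    k_add<T, kBlockSize><<<num_blocks, kBlockSize, 0, stream>>>(nelem, src, dst);
}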
@@ -130,7 +138,13 @@ void ggml_cuda_op_reduce([[maybe_unused]] ggml_backend_cuda_context & ctx, ggml_
     // It does not work at all if not all GPUs participate in the reduce op, and we
     // get suboptimal prompt processing performance when we have more than 2 GPUs.
     // Hence, if enabled, we use NCCL only for the cases where it works and performs well.
-    if (info.have_nccl && dst->type != GGML_TYPE_Q8_0 && nhave == nreduce && (nhave == 2 || dst->ne[1] < 32)) {
+#if __CUDA_ARCH__ >= CC_AMPERE
+    constexpr bool bf16_supported = true;
+#else
+    constexpr bool bf16_supported = false;
+#endif
+    if (info.have_nccl && dst->type != GGML_TYPE_Q8_0 && nhave == nreduce && (nhave == 2 || dst->ne[1] < 32) &&
+        (dst->type != GGML_TYPE_BF16 || bf16_supported)) {
         GGML_ASSERT(info.have_nccl);
         GGML_ASSERT(info.device_count == nreduce);
         auto data_type = dst->type == GGML_TYPE_F32 ? ncclFloat : dst->type == GGML_TYPE_BF16 ? ncclBfloat16 : ncclHalf;
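Note: the new (dst->type != GGML_TYPE_BF16 || bf16_supported) term keeps bf16 tensors off the NCCL path when native bf16 support is unavailable. An illustrative sketch of the kind of all-reduce this path issues on each participating GPU (buffer, communicator, and stream names are placeholders, not the ones used in this function):

#include <nccl.h>
#include "ggml.h"

// Illustrative all-reduce for one rank: sum nelem elements across the communicator.
static void reduce_tensor(const void * src, void * dst, size_t nelem,
                          ggml_type type, ncclComm_t comm, cudaStream_t stream) {
    const ncclDataType_t data_type =
        type == GGML_TYPE_F32  ? ncclFloat    :
        type == GGML_TYPE_BF16 ? ncclBfloat16 : ncclHalf;
    ncclAllReduce(src, dst, nelem, data_type, ncclSum, comm, stream);
}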

View File

@@ -1150,19 +1150,6 @@ void llama_prep_adaptive_p_impl(
     }
     adapt_p_ctx->cum_orig_prob = iqk_exp_with_thresh(orig_prob.size(), orig_prob.data(), max_logit, max_logit - kDelta);
-    //float max_logit = candidates->data[0].logit;
-    //for (int j = 1; j < int(candidates->size); ++j) {
-    //    max_logit = std::max(max_logit, candidates->data[j].logit);
-    //}
-    //float min_logit = max_logit - kDelta;
-    //float cum_prob = 0.0f;
-    //for (int j = 0; j < int(candidates->size); ++j) {
-    //    float prob = candidates->data[j].logit > min_logit ? expf(candidates->data[j].logit - max_logit) : 0.0f;
-    //    cum_prob += prob;
-    //    orig_prob[j] = prob;
-    //}
-    //adapt_p_ctx->cum_orig_prob = cum_prob;
 
     if (smpl) smpl->t_sample_us += ggml_time_us() - t_start;
 }
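Note: the deleted scalar loop spells out the contract that iqk_exp_with_thresh now fulfills: exponentiate each logit relative to the maximum, zero out anything at or below the threshold max_logit - kDelta, and return the sum. An illustrative reference version (the in-place update of the buffer is an assumption based on the call site; the real routine is presumably vectorized):

#include <cmath>
#include <cstddef>

// Reference semantics for the thresholded exp-sum documented by the removed loop.
// On input values[j] holds a logit; on output it holds exp(logit - max_value),
// or 0 when the logit does not exceed thresh. The sum of the outputs is returned.
static float exp_with_thresh_ref(size_t n, float * values, float max_value, float thresh) {
    float cum = 0.0f;
    for (size_t j = 0; j < n; ++j) {
        const float p = values[j] > thresh ? std::exp(values[j] - max_value) : 0.0f;
        values[j] = p;
        cum      += p;
    }
    return cum;
}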