mirror of https://github.com/ikawrakow/ik_llama.cpp.git (synced 2026-04-30 03:11:51 +00:00)
Additional graph reduce types for split mode graph (#1154)
* WIP: add Q8_0 and BF16 as possible reduce types. Does not work - there is a bug somewhere
* This finally works
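The core of the new Q8_0 reduce type is that partial results stay quantized on each GPU and are summed block-wise, requantizing on the fly with a per-block scale found by a warp reduction. Below is a minimal standalone sketch of that idea, not the repo's code: it assumes QK8_0 == 32, nelem a multiple of 32, one warp covering exactly one quant block, and uses an illustrative q8_block stand-in for block_q8_0.

#include <cuda_fp16.h>

constexpr int kQK8 = 32;                 // elements per Q8_0 block (assumed)

struct q8_block {                        // illustrative stand-in for block_q8_0
    __half d;                            // per-block scale
    int8_t qs[kQK8];                     // quantized values
};

static __device__ float warp_max(float v) {
    // max over the 32 lanes of a warp via shuffle
    for (int mask = 16; mask > 0; mask >>= 1)
        v = fmaxf(v, __shfl_xor_sync(0xffffffffu, v, mask));
    return v;
}

// z = x + y, requantized to Q8_0 on the fly; launch with blockDim.x a multiple of 32
static __global__ void add_q8_0_sketch(int nelem, const q8_block * x, const q8_block * y, q8_block * z) {
    int i = blockIdx.x*blockDim.x + threadIdx.x;
    if (i >= nelem) return;
    int ib = i / kQK8;                   // quant block index
    int iq = i % kQK8;                   // position within the block
    // dequantize both operands and add in fp32
    float sum = (float)x[ib].d * x[ib].qs[iq] + (float)y[ib].d * y[ib].qs[iq];
    // the new block scale comes from the max |sum| over the 32 elements of this block
    float d  = warp_max(fabsf(sum)) / 127.0f;
    float id = d > 0.0f ? 1.0f/d : 0.0f;
    z[ib].qs[iq] = (int8_t)roundf(sum * id);
    if (iq == 0) z[ib].d = (__half)d;    // one lane per block writes the scale
}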
@@ -330,7 +330,92 @@ static __global__ void k_fast_add_2(int64_t ne0, int64_t nelem, const src1_t * x
    z[i] = (dst_t)((float)x[i] + (float)y[i]);
}

template <int block_size, typename data_t>
static __global__ void k_add_same(int64_t nelem, const data_t * x, const data_t * y, data_t * z) {
    int64_t i = block_size*blockIdx.x + threadIdx.x;
    if (i >= nelem) {
        return;
    }
    z[i] = x[i] + y[i];
}

template <int block_size>
static __global__ void k_add_same_q8_0(int nelem, const block_q8_0 * x, const block_q8_0 * y, block_q8_0 * z) {
    int i = blockIdx.x*block_size + threadIdx.x;
    if (i >= nelem) return;
    int ib = i / QK8_0;
    int iq = i % QK8_0;
    float sum = (float)x[ib].d * x[ib].qs[iq] + (float)y[ib].d * y[ib].qs[iq];
    float asum = fabsf(sum);
    float max = warp_reduce_max(asum);
    float d = max / 127;
    float id = d > 0 ? 1/d : 0;
    z[ib].qs[iq] = roundf(sum * id);
    if (threadIdx.x % WARP_SIZE == 0) {
        z[ib].d = (half)d;
    }
}

template <int block_size>
static __global__ void k_add_same_q8_0(int nelem, const block_q8_0 * x, const float * y, block_q8_0 * z) {
    int i = blockIdx.x*block_size + threadIdx.x;
    if (i >= nelem) return;
    int ib = i / QK8_0;
    int iq = i % QK8_0;
    float sum = (float)x[ib].d * x[ib].qs[iq] + y[i];
    float asum = fabsf(sum);
    float max = warp_reduce_max(asum);
    float d = max / 127;
    float id = d > 0 ? 1/d : 0;
    z[ib].qs[iq] = roundf(sum * id);
    if (threadIdx.x % WARP_SIZE == 0) {
        z[ib].d = (half)d;
    }
}

//static __global__ void k_add_same_q8_0(const block_q8_0 * x, const block_q8_0 * y, block_q8_0 * z) {
//    int ib = blockIdx.x;
//    int iq = threadIdx.x;
//    float s = (float)x[ib].d * x[ib].qs[iq] + (float)y[ib].d * y[ib].qs[iq];
//    float as = fabsf(s);
//    as = warp_reduce_max(as);
//    float d = as / 127;
//    float id = d > 0 ? 1/d : 0;
//    z[ib].qs[iq] = roundf(s * id);
//    if (threadIdx.x == 0) {
//        z[ib].d = (half)d;
//    }
//}

void ggml_op_add_same_type(ggml_backend_cuda_context & ctx, enum ggml_type type, size_t nelem,
        const void * x, const void * y, void * z) {
    constexpr int kBlockSize = 256;
    int nblocks = (nelem + kBlockSize - 1)/kBlockSize;
    if (type == GGML_TYPE_F32) {
        k_add_same<kBlockSize><<<nblocks, kBlockSize, 0, ctx.stream()>>>(nelem,
                (const float *)x, (const float *)y, (float *)z);
    } else if (type == GGML_TYPE_F16) {
        k_add_same<kBlockSize><<<nblocks, kBlockSize, 0, ctx.stream()>>>(nelem,
                (const half *)x, (const half *)y, (half *)z);
    } else if (type == GGML_TYPE_BF16) {
        k_add_same<kBlockSize><<<nblocks, kBlockSize, 0, ctx.stream()>>>(nelem,
                (const nv_bfloat16 *)x, (const nv_bfloat16 *)y, (nv_bfloat16 *)z);
    } else if (type == GGML_TYPE_Q8_0) {
        k_add_same_q8_0<kBlockSize><<<nblocks, kBlockSize, 0, ctx.stream()>>>(nelem,
                (const block_q8_0 *)x, (const block_q8_0 *)y, (block_q8_0 *)z);
    } else {
        GGML_ABORT("Unsupported add operation");
    }
}

void ggml_cuda_op_add(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
    if (dst->src[0]->type == dst->src[1]->type && dst->src[0]->type == dst->type &&
        ggml_is_contiguous(dst->src[0]) && ggml_is_contiguous(dst->src[1]) && ggml_is_contiguous(dst) &&
        ggml_are_same_shape(dst->src[0], dst->src[1])) {
        //printf("%s(%s, %s): using fast same\n", __func__, dst->name, ggml_type_name(dst->type));
        ggml_op_add_same_type(ctx, dst->type, ggml_nelements(dst), dst->src[0]->data, dst->src[1]->data, dst->data);
        return;
    }
    if (ggml_nrows(dst->src[1]) == 1 && dst->src[0]->ne[0] == dst->src[1]->ne[0] &&
        dst->type == GGML_TYPE_F32 && dst->src[0]->type == GGML_TYPE_F32 && dst->src[1]->type == GGML_TYPE_F32 &&
        ggml_are_same_shape(dst, dst->src[0]) && ggml_is_contiguous(dst)) {
@@ -361,6 +446,26 @@ void ggml_cuda_op_add(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
            k_fast_add_2<<<nblocks, kBlockSize, 0, ctx.stream()>>>(dst->ne[0], nelem,
                    (const float *)dst->src[0]->data, (const half *)dst->src[1]->data, (half *)dst->data);
        }
    } else if (dst->type == GGML_TYPE_BF16) {
        if (dst->src[0]->type == GGML_TYPE_BF16 && dst->src[1]->type == GGML_TYPE_BF16) {
            k_fast_add_2<<<nblocks, kBlockSize, 0, ctx.stream()>>>(dst->ne[0], nelem,
                    (const nv_bfloat16 *)dst->src[0]->data, (const nv_bfloat16 *)dst->src[1]->data, (nv_bfloat16 *)dst->data);
        }
        else if (dst->src[0]->type == GGML_TYPE_BF16 && dst->src[1]->type == GGML_TYPE_F32) {
            k_fast_add_2<<<nblocks, kBlockSize, 0, ctx.stream()>>>(dst->ne[0], nelem,
                    (const nv_bfloat16 *)dst->src[0]->data, (const float *)dst->src[1]->data, (nv_bfloat16 *)dst->data);
        }
        else if (dst->src[0]->type == GGML_TYPE_F32 && dst->src[1]->type == GGML_TYPE_F32) {
            k_fast_add_2<<<nblocks, kBlockSize, 0, ctx.stream()>>>(dst->ne[0], nelem,
                    (const float *)dst->src[0]->data, (const float *)dst->src[1]->data, (nv_bfloat16 *)dst->data);
        } else {
            k_fast_add_2<<<nblocks, kBlockSize, 0, ctx.stream()>>>(dst->ne[0], nelem,
                    (const float *)dst->src[0]->data, (const nv_bfloat16 *)dst->src[1]->data, (nv_bfloat16 *)dst->data);
        }
    } else if (dst->type == GGML_TYPE_Q8_0) {
        GGML_ASSERT(dst->src[0]->type == GGML_TYPE_Q8_0 && dst->src[1]->type == GGML_TYPE_F32);
        k_add_same_q8_0<kBlockSize><<<nblocks, kBlockSize, 0, ctx.stream()>>>(nelem,
                (const block_q8_0 *)dst->src[0]->data, (const float *)dst->src[1]->data, (block_q8_0 *)dst->data);
    } else {
        if (dst->src[0]->type == GGML_TYPE_F16 && dst->src[1]->type == GGML_TYPE_F16) {
            k_fast_add_2<<<nblocks, kBlockSize, 0, ctx.stream()>>>(dst->ne[0], nelem,
@@ -4,3 +4,6 @@ void ggml_cuda_op_repeat(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
void ggml_cuda_op_add(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
void ggml_cuda_op_mul(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
void ggml_cuda_op_div(ggml_backend_cuda_context & ctx, ggml_tensor * dst);

void ggml_op_add_same_type(ggml_backend_cuda_context & ctx, enum ggml_type type, size_t nelem,
        const void * x, const void * y, void * z);
@@ -43,10 +43,20 @@ static __global__ void fused_norm_f32(const T * x, const float * c, float * dst,

    float2 mean_var = make_float2(0.f, 0.f);

    for (int col = tid; col < ncols; col += block_size) {
        const float xi = (float)x[row*ncols + col];
        mean_var.x += xi;
        mean_var.y += xi * xi;
    if constexpr (std::is_same_v<T, block_q8_0>) {
        static_assert(block_size % QK8_0 == 0);
        auto xr = x + (row*ncols)/QK8_0;
        for (int col = tid; col < ncols; col += block_size) {
            const float xi = (float)xr[col / QK8_0].d * xr[col / QK8_0].qs[col % QK8_0];
            mean_var.x += xi;
            mean_var.y += xi * xi;
        }
    } else {
        for (int col = tid; col < ncols; col += block_size) {
            const float xi = (float)x[row*ncols + col];
            mean_var.x += xi;
            mean_var.y += xi * xi;
        }
    }

    // sum up partial sums
@@ -67,8 +77,16 @@ static __global__ void fused_norm_f32(const T * x, const float * c, float * dst,
    const float var = mean_var.y / ncols - mean * mean;
    const float inv_std = rsqrtf(var + eps);

    for (int col = tid; col < ncols; col += block_size) {
        dst[row*ncols + col] = (T)(((float)x[row*ncols + col] - mean) * inv_std * c[col]);
    if constexpr (std::is_same_v<T, block_q8_0>) {
        static_assert(block_size % QK8_0 == 0);
        auto xr = x + (row*ncols)/QK8_0;
        for (int col = tid; col < ncols; col += block_size) {
            dst[row*ncols + col] = ((float)xr[col/QK8_0].d*xr[col/QK8_0].qs[col%QK8_0] - mean) * inv_std * c[col];
        }
    } else {
        for (int col = tid; col < ncols; col += block_size) {
            dst[row*ncols + col] = ((float)x[row*ncols + col] - mean) * inv_std * c[col];
        }
    }
}
@@ -219,9 +237,23 @@ static __global__ void fused_rms_norm_f32(const src_t * x, const float * y, floa

    float tmp = 0.0f; // partial sum for thread in warp

    for (int col = tid; col < ncols; col += block_size) {
        const float xi = (float)x[row*ncols + col];
        tmp += xi * xi;
    if constexpr (std::is_same_v<src_t, block_q8_0>) {
        static_assert(block_size % QK8_0 == 0);
        auto xr = x + (row*ncols)/QK8_0;
        for (int col = tid; col < ncols; col += block_size) {
            const float xi = (float)xr[col / QK8_0].d * xr[col / QK8_0].qs[col % QK8_0];
            tmp += xi * xi;
        }
    } else if constexpr (std::is_same_v<src_t, nv_bfloat16>) {
        for (int col = tid; col < ncols; col += block_size) {
            const float xi = __bfloat162float(x[row*ncols + col]);
            tmp += xi * xi;
        }
    } else {
        for (int col = tid; col < ncols; col += block_size) {
            const float xi = (float)x[row*ncols + col];
            tmp += xi * xi;
        }
    }

    // sum up partial sums
@@ -241,8 +273,19 @@ static __global__ void fused_rms_norm_f32(const src_t * x, const float * y, floa
    const float mean = tmp / ncols;
    const float scale = rsqrtf(mean + eps);

    for (int col = tid; col < ncols; col += block_size) {
        dst[row*ncols + col] = scale * y[col] * (float)x[row*ncols + col];
    if constexpr (std::is_same_v<src_t, block_q8_0>) {
        auto xr = x + (row*ncols)/QK8_0;
        for (int col = tid; col < ncols; col += block_size) {
            dst[row*ncols + col] = scale * y[col] * (float)xr[col / QK8_0].d * xr[col / QK8_0].qs[col % QK8_0];
        }
    } else if constexpr (std::is_same_v<src_t, nv_bfloat16>) {
        for (int col = tid; col < ncols; col += block_size) {
            dst[row*ncols + col] = scale * y[col] * __bfloat162float(x[row*ncols + col]);
        }
    } else {
        for (int col = tid; col < ncols; col += block_size) {
            dst[row*ncols + col] = scale * y[col] * (float)x[row*ncols + col];
        }
    }
}
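For reference, the arithmetic these fused_rms_norm_f32 specializations perform on a Q8_0 row, written as a plain CPU sketch. Illustrative only, not code from the repo: q8_block_ref stands in for block_q8_0 (scale kept as float instead of fp16), and QK8_0 is assumed to be 32.

#include <cmath>
#include <cstdint>

struct q8_block_ref {        // stand-in for block_q8_0
    float  d;                // per-block scale
    int8_t qs[32];           // quantized values
};

// dst[col] = weight[col] * x[col] / sqrt(mean(x^2) + eps), dequantizing x on the fly
static void rms_norm_q8_0_ref(const q8_block_ref * x, const float * weight, float * dst,
                              int ncols, float eps) {
    double sumsq = 0.0;
    for (int col = 0; col < ncols; ++col) {
        float xi = x[col / 32].d * x[col / 32].qs[col % 32];
        sumsq += (double)xi * xi;
    }
    float scale = 1.0f / std::sqrt((float)(sumsq / ncols) + eps);
    for (int col = 0; col < ncols; ++col) {
        float xi = x[col / 32].d * x[col / 32].qs[col % 32];
        dst[col] = scale * weight[col] * xi;
    }
}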
@@ -496,7 +539,8 @@ void ggml_cuda_op_fused_rms_norm(ggml_backend_cuda_context & ctx, ggml_tensor *
    float * dst_d = (float *)dst->data;
    cudaStream_t stream = ctx.stream();

    GGML_ASSERT(src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16);
    GGML_ASSERT(src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16 || src0->type == GGML_TYPE_BF16 ||
                (ggml_is_contiguous(src0) && src0->type == GGML_TYPE_Q8_0));
    GGML_ASSERT(src1->type == GGML_TYPE_F32);
    GGML_ASSERT( dst->type == GGML_TYPE_F32);
    GGML_ASSERT(src0->ne[0] == src1->ne[0]);
@@ -511,8 +555,12 @@ void ggml_cuda_op_fused_rms_norm(ggml_backend_cuda_context & ctx, ggml_tensor *
        const int64_t nrows = ggml_nrows(src0);
        if (src0->type == GGML_TYPE_F32) {
            fused_rms_norm_f32_cuda(src0_d, src1_d, dst_d, ne00, nrows, eps, is_norm, stream);
        } else {
        } else if (src0->type == GGML_TYPE_F16) {
            fused_rms_norm_f32_cuda((const half *)src0_d, src1_d, dst_d, ne00, nrows, eps, is_norm, stream);
        } else if (src0->type == GGML_TYPE_Q8_0) {
            fused_rms_norm_f32_cuda((const block_q8_0 *)src0_d, src1_d, dst_d, ne00, nrows, eps, is_norm, stream);
        } else {
            fused_rms_norm_f32_cuda((const nv_bfloat16 *)src0_d, src1_d, dst_d, ne00, nrows, eps, is_norm, stream);
        }
    } else {
        if (is_norm) {
@@ -525,6 +573,8 @@ void ggml_cuda_op_fused_rms_norm(ggml_backend_cuda_context & ctx, ggml_tensor *
        auto s03 = src0->nb[3] / ts0;
        if (src0->type == GGML_TYPE_F32) {
            fused_rms_norm_f32_nc_cuda(src0_d, src1_d, dst_d, ne00, src0->ne[1], src0->ne[2], src0->ne[3], s01, s02, s03, eps, stream);
        } else if (src0->type == GGML_TYPE_BF16) {
            fused_rms_norm_f32_nc_cuda((const nv_bfloat16 *)src0_d, src1_d, dst_d, ne00, src0->ne[1], src0->ne[2], src0->ne[3], s01, s02, s03, eps, stream);
        } else {
            fused_rms_norm_f32_nc_cuda((const half *)src0_d, src1_d, dst_d, ne00, src0->ne[1], src0->ne[2], src0->ne[3], s01, s02, s03, eps, stream);
        }
@@ -6,16 +6,35 @@
//

#include "reduce.cuh"
#include "binbcast.cuh"
#include "ggml-common.h"

#include <chrono>

template <typename T, int block_size>
static __global__ void k_add(int nelem, const T * src, T * dst) {
static __global__ void k_add(int nelem, const T * __restrict__ src, T * __restrict__ dst) {
    int i = blockIdx.x*block_size + threadIdx.x;
    if (i >= nelem) return;
    dst[i] += src[i];
}

template <int block_size>
static __global__ void k_add(int nelem, const block_q8_0 * __restrict__ src, block_q8_0 * __restrict__ dst) {
    int i = blockIdx.x*block_size + threadIdx.x;
    if (i >= nelem) return;
    int ib = i / QK8_0;
    int iq = i % QK8_0;
    float x = (float)src[ib].d * src[ib].qs[iq] + (float)dst[ib].d * dst[ib].qs[iq];
    float ax = fabsf(x);
    float max = warp_reduce_max(ax);
    float d = max / 127;
    float id = d > 0 ? 1/d : 0;
    dst[ib].qs[iq] = roundf(x * id);
    if (threadIdx.x % WARP_SIZE == 0) {
        dst[ib].d = (half)d;
    }
}

template <typename T, int block_size>
static __global__ void k_add_sym(int nelem, T * src, T * dst) {
    int i = blockIdx.x*block_size + threadIdx.x;
@@ -68,7 +87,8 @@ void ggml_cuda_op_reduce([[maybe_unused]] ggml_backend_cuda_context & ctx, ggml_
    GGML_ASSERT(op == GGML_OP_ADD);
    int nreduce = dst->op_params[1];
    int nhave = dst->op_params[2];
    GGML_ASSERT(dst->type == GGML_TYPE_F16 || dst->type == GGML_TYPE_F32);
    GGML_ASSERT(dst->type == GGML_TYPE_F16 || dst->type == GGML_TYPE_F32 ||
                dst->type == GGML_TYPE_Q8_0 || dst->type == GGML_TYPE_BF16);
    GGML_ASSERT(ggml_is_contiguous(dst));
    GGML_ASSERT(nhave >=2 && nhave <= nreduce);
    if (dst->op_params[3] == 1) {
@@ -82,10 +102,10 @@ void ggml_cuda_op_reduce([[maybe_unused]] ggml_backend_cuda_context & ctx, ggml_
    // It does not work at all if not all GPUs participate in the reduce op, and we
    // get suboptimal prompt processing performance when we have more than 2 GPUs.
    // Hence, if enabled, we use NCCL only for the cases where it works and performs well.
    if (info.have_nccl && nhave == nreduce && (nhave == 2 || dst->ne[1] < 32)) {
    if (info.have_nccl && dst->type != GGML_TYPE_Q8_0 && nhave == nreduce && (nhave == 2 || dst->ne[1] < 32)) {
        GGML_ASSERT(info.have_nccl);
        GGML_ASSERT(info.device_count == nreduce);
        auto data_type = dst->type == GGML_TYPE_F32 ? ncclFloat : ncclHalf;
        auto data_type = dst->type == GGML_TYPE_F32 ? ncclFloat : dst->type == GGML_TYPE_BF16 ? ncclBfloat16 : ncclHalf;
        ncclGroupStart();
        for (int i = 0; i < nreduce; ++i) {
            ggml_cuda_set_device(i);
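The comment above is the reasoning for gating the NCCL path; the new condition additionally skips it for Q8_0, since NCCL has no data type for quantized blocks. For orientation only, a rough sketch of what such a grouped in-place sum-allreduce looks like with the NCCL API (comms, streams and bufs are hypothetical names; communicators are assumed to be initialized elsewhere, and this is not the repo's actual call sequence):

#include <cuda_runtime.h>
#include <nccl.h>

// one in-place sum-allreduce per device, batched into a single NCCL group
static void allreduce_sketch(int ndev, void ** bufs, size_t count, ncclDataType_t dtype,
                             ncclComm_t * comms, cudaStream_t * streams) {
    ncclGroupStart();
    for (int i = 0; i < ndev; ++i) {
        cudaSetDevice(i);
        // dtype would be ncclFloat, ncclHalf or ncclBfloat16; there is no Q8_0 equivalent
        ncclAllReduce(bufs[i], bufs[i], count, dtype, ncclSum, comms[i], streams[i]);
    }
    ncclGroupEnd();
}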
@@ -198,13 +218,25 @@ void ggml_cuda_op_reduce([[maybe_unused]] ggml_backend_cuda_context & ctx, ggml_
    //
    if (dst->ne[1] >= 32) {
        auto nelem = ggml_nelements(dst);
        auto elem_size = ggml_element_size(dst);
        auto nelem_per_device = (nelem + nhave - 1)/nhave;
        auto required_size = nelem_per_device*elem_size;
        auto tt = ggml_internal_get_type_traits(dst->type);
        GGML_ASSERT(nelem % tt.blck_size == 0);
        auto nblocks = nelem / tt.blck_size;
        auto nblocks_per_device = (nblocks + nhave - 1)/nhave;
        auto nelem_per_device = nblocks_per_device * tt.blck_size;
        auto size_per_device = nblocks_per_device * tt.type_size;
        //size_t nelem_per_device, required_size;
        //if (dst->type == GGML_TYPE_Q8_0) {
        //    GGML_ASSERT(nelem % QK8_0 == 0);
        //    nelem_per_device = QK8_0*((nelem/QK8_0 + nhave - 1)/nhave);
        //    required_size nelem_per_device/QK8_0 * sizeof(ggml_block_q8_0);
        //}
        //auto elem_size = ggml_element_size(dst);
        //auto nelem_per_device = (nelem + nhave - 1)/nhave;
        //auto required_size = nelem_per_device*elem_size;
        for (int ii = 0; ii < nhave; ++ii) {
            int i = idx[ii];
            auto this_ctx = info.all_ctx[i];
            if (!this_ctx->copy_event || !this_ctx->compute_event || required_size > this_ctx->copy_size) {
            if (!this_ctx->copy_event || !this_ctx->compute_event || size_per_device > this_ctx->copy_size) {
                ggml_cuda_set_device(this_ctx->device);
                if (!this_ctx->copy_event) {
                    CUDA_CHECK(cudaEventCreateWithFlags(&this_ctx->copy_event, cudaEventDisableTiming));
@@ -212,12 +244,12 @@ void ggml_cuda_op_reduce([[maybe_unused]] ggml_backend_cuda_context & ctx, ggml_
                if (!this_ctx->compute_event) {
                    CUDA_CHECK(cudaEventCreateWithFlags(&this_ctx->compute_event, cudaEventDisableTiming));
                }
                if (required_size > this_ctx->copy_size) {
                if (size_per_device > this_ctx->copy_size) {
                    if (this_ctx->copy_buffer) {
                        CUDA_CHECK(cudaFree(this_ctx->copy_buffer));
                    }
                    CUDA_CHECK(ggml_cuda_device_malloc(&this_ctx->copy_buffer, required_size, this_ctx->device));
                    this_ctx->copy_size = required_size;
                    CUDA_CHECK(ggml_cuda_device_malloc(&this_ctx->copy_buffer, size_per_device, this_ctx->device));
                    this_ctx->copy_size = size_per_device;
                }
            }
        }
@@ -227,13 +259,14 @@ void ggml_cuda_op_reduce([[maybe_unused]] ggml_backend_cuda_context & ctx, ggml_
            int i = idx[ii];
            int peer = idx[(ii+1)%nhave];
            auto this_nelem = std::min(nelem_per_device, nelem - ichunk*nelem_per_device);
            auto this_size = (this_nelem / tt.blck_size) * tt.type_size;
            ggml_cuda_set_device(info.all_ctx[peer]->device);
            if (stage > 0) {
                CUDA_CHECK(cudaStreamWaitEvent(info.all_ctx[peer]->stream(), info.all_ctx[i]->compute_event, 0));
            }
            CUDA_CHECK(cudaMemcpyPeerAsync(info.all_ctx[i]->copy_buffer, info.all_ctx[i]->device,
                        (const char *)dst->src[peer]->data + ichunk*nelem_per_device*elem_size, info.all_ctx[peer]->device,
                        this_nelem*elem_size, info.all_ctx[peer]->stream()));
                        (const char *)dst->src[peer]->data + ichunk*size_per_device, info.all_ctx[peer]->device,
                        this_size, info.all_ctx[peer]->stream()));
            CUDA_CHECK(cudaEventRecord(info.all_ctx[peer]->copy_event, info.all_ctx[peer]->stream()));
            ichunk = (ichunk + 1)%nhave;
        }
@@ -244,10 +277,19 @@ void ggml_cuda_op_reduce([[maybe_unused]] ggml_backend_cuda_context & ctx, ggml_
            auto this_nelem = std::min(nelem_per_device, nelem - ichunk*nelem_per_device);
            ggml_cuda_set_device(info.all_ctx[i]->device);
            CUDA_CHECK(cudaStreamWaitEvent(info.all_ctx[i]->stream(), info.all_ctx[peer]->copy_event, 0));
            //ggml_op_add_same_type(ctx, dst->type, this_nelem, info.all_ctx[i]->copy_buffer,
            //        (const char *)dst->src[i]->data + ichunk*size_per_device, (char *)dst->src[i]->data + ichunk*size_per_device);
            int num_blocks = (this_nelem + CUDA_REDUCE_BLOCK_SIZE - 1)/CUDA_REDUCE_BLOCK_SIZE;
            if (dst->type == GGML_TYPE_F16) {
                k_add<half, CUDA_REDUCE_BLOCK_SIZE><<<num_blocks, CUDA_REDUCE_BLOCK_SIZE, 0, info.all_ctx[i]->stream()>>>(this_nelem,
                        (const half *)info.all_ctx[i]->copy_buffer, (half *)dst->src[i]->data + ichunk*nelem_per_device);
            } else if (dst->type == GGML_TYPE_Q8_0) {
                k_add<CUDA_REDUCE_BLOCK_SIZE><<<num_blocks, CUDA_REDUCE_BLOCK_SIZE, 0, info.all_ctx[i]->stream()>>>(this_nelem,
                        (const block_q8_0 *)info.all_ctx[i]->copy_buffer, (block_q8_0 *)dst->src[i]->data + ichunk*nelem_per_device/tt.blck_size);
            } else if (dst->type == GGML_TYPE_BF16) {
                k_add<nv_bfloat16, CUDA_REDUCE_BLOCK_SIZE><<<num_blocks, CUDA_REDUCE_BLOCK_SIZE, 0, info.all_ctx[i]->stream()>>>(
                        this_nelem, (const nv_bfloat16 *)info.all_ctx[i]->copy_buffer,
                        (nv_bfloat16 *)dst->src[i]->data + ichunk*nelem_per_device);
            } else {
                k_add<float, CUDA_REDUCE_BLOCK_SIZE><<<num_blocks, CUDA_REDUCE_BLOCK_SIZE, 0, info.all_ctx[i]->stream()>>>(this_nelem,
                        (const float *)info.all_ctx[i]->copy_buffer, (float *)dst->src[i]->data + ichunk*nelem_per_device);
@@ -262,13 +304,14 @@ void ggml_cuda_op_reduce([[maybe_unused]] ggml_backend_cuda_context & ctx, ggml_
            int i = idx[ii];
            int peer = idx[(ii+1)%nhave];
            auto this_nelem = std::min(nelem_per_device, nelem - ichunk*nelem_per_device);
            auto this_size = (this_nelem / tt.blck_size) * tt.type_size;
            ggml_cuda_set_device(info.all_ctx[peer]->device);
            if (stage == 0) {
                CUDA_CHECK(cudaStreamWaitEvent(info.all_ctx[peer]->stream(), info.all_ctx[i]->compute_event, 0));
            }
            CUDA_CHECK(cudaMemcpyPeerAsync((char *)dst->src[i]->data + ichunk*nelem_per_device*elem_size, info.all_ctx[i]->device,
                        (const char *)dst->src[peer]->data + ichunk*nelem_per_device*elem_size, info.all_ctx[peer]->device,
                        this_nelem*elem_size, info.all_ctx[peer]->stream()));
            CUDA_CHECK(cudaMemcpyPeerAsync((char *)dst->src[i]->data + ichunk*size_per_device, info.all_ctx[i]->device,
                        (const char *)dst->src[peer]->data + ichunk*size_per_device, info.all_ctx[peer]->device,
                        this_size, info.all_ctx[peer]->stream()));
            CUDA_CHECK(cudaEventRecord(info.all_ctx[peer]->copy_event, info.all_ctx[peer]->stream()));
            //ggml_cuda_set_device(info.all_ctx[i]->device);
            //CUDA_CHECK(cudaStreamWaitEvent(info.all_ctx[i]->stream(), info.all_ctx[peer]->copy_event, 0));
@@ -351,6 +394,7 @@ void ggml_cuda_op_reduce([[maybe_unused]] ggml_backend_cuda_context & ctx, ggml_
        return;
    }
    if (dst->ne[1] < 32 && ctx.p2p_enabled) {
        GGML_ASSERT(dst->type != GGML_TYPE_Q8_0);
        for (int ii = 0; ii < nhave; ++ii) {
            int i = idx[ii];
            GGML_ASSERT(dst->src[i]->type == dst->type);
@@ -464,6 +508,12 @@ void ggml_cuda_op_reduce([[maybe_unused]] ggml_backend_cuda_context & ctx, ggml_
        CUDA_CHECK(cudaStreamWaitEvent(ctx.stream(), info.all_ctx[i]->copy_event, 0));
        if (dst->type == GGML_TYPE_F16) {
            k_add<half, CUDA_REDUCE_BLOCK_SIZE><<<num_blocks, CUDA_REDUCE_BLOCK_SIZE, 0, ctx.stream()>>>(nelem, (const half *)ptr, (half *)dst->data);
        } else if (dst->type == GGML_TYPE_BF16) {
            k_add<nv_bfloat16, CUDA_REDUCE_BLOCK_SIZE><<<num_blocks, CUDA_REDUCE_BLOCK_SIZE, 0, ctx.stream()>>>(nelem,
                    (const nv_bfloat16*)ptr, (nv_bfloat16 *)dst->data);
        } else if (dst->type == GGML_TYPE_Q8_0) {
            k_add<CUDA_REDUCE_BLOCK_SIZE><<<num_blocks, CUDA_REDUCE_BLOCK_SIZE, 0, ctx.stream()>>>(nelem, (const block_q8_0 *)ptr,
                    (block_q8_0 *)dst->data);
        } else {
            k_add<float, CUDA_REDUCE_BLOCK_SIZE><<<num_blocks, CUDA_REDUCE_BLOCK_SIZE, 0, ctx.stream()>>>(nelem, (const float *)ptr, (float *)dst->data);
        }