From 3468438da8a1b8822de1e7882d326e71f1e0072d Mon Sep 17 00:00:00 2001
From: Iwan Kawrakow
Date: Wed, 26 Feb 2025 16:19:56 +0200
Subject: [PATCH] CUDA: Quantize non-contiguous tensors

---
 ggml/src/ggml-cuda.cu           | 22 ++++++++++----
 ggml/src/ggml-cuda/quantize.cu  | 52 +++++++++++++++++++++++++++++++++
 ggml/src/ggml-cuda/quantize.cuh |  3 ++
 src/llama.cpp                   |  2 +-
 4 files changed, 73 insertions(+), 6 deletions(-)

diff --git a/ggml/src/ggml-cuda.cu b/ggml/src/ggml-cuda.cu
index c305cd89..a180a28e 100644
--- a/ggml/src/ggml-cuda.cu
+++ b/ggml/src/ggml-cuda.cu
@@ -1518,6 +1518,8 @@ static void ggml_cuda_op_mul_mat(
         }
     }
 
+    bool quantization_done = false;
+
     for (int id = 0; id < ggml_backend_cuda_get_device_count(); ++id) {
         if ((!split && id != ctx.device) || dev[id].row_low == dev[id].row_high) {
             continue;
@@ -1561,9 +1563,15 @@ static void ggml_cuda_op_mul_mat(
             }
             dev[id].src1_ddq = dev[id].src1_ddq_alloc.alloc(ctx.pool(id), src_1_ddq_size);
 
-            if (src1_on_device && src1_is_contiguous) {
-                quantize_src1(dev[id].src1_ddf, dev[id].src1_ddq, ne10, ne11, ne12*ne13, src1_padded_col_size, src0->type, stream);
+            if (src1_on_device && (src1_is_contiguous || (src1->ne[1] == 1 && src1->ne[3] == 1 && src1->nb[0] == sizeof(float)))) {
+                if (src1_is_contiguous) {
+                    quantize_src1(dev[id].src1_ddf, dev[id].src1_ddq, ne10, ne11, ne12*ne13, src1_padded_col_size, src0->type, stream);
+                } else {
+                    //printf("Calling quantize_tensor_q8_1_cuda for %s\n", src0->name);
+                    quantize_tensor_q8_1_cuda(src1, dev[id].src1_ddq, src0->type, stream);
+                }
                 CUDA_CHECK(cudaGetLastError());
+                quantization_done = true;
             }
         }
 
@@ -1649,13 +1657,17 @@ static void ggml_cuda_op_mul_mat(
                     }
                 }
             } else if (src1_on_device && !src1_is_contiguous) {
-                CUDA_CHECK(ggml_cuda_cpy_tensor_2d(
-                    src1_ddf_i, src1, i03, i02, src1_col_0, src1_col_0+src1_ncols, stream));
+                if (!quantization_done) {
+                    //printf("Copying %s\n", src1->name);
+                    CUDA_CHECK(ggml_cuda_cpy_tensor_2d(
+                        src1_ddf_i, src1, i03, i02, src1_col_0, src1_col_0+src1_ncols, stream));
+                }
             } else {
                 GGML_ABORT("fatal error");
             }
 
-            if (quantize_src1 && !src1_is_contiguous) {
+            if (quantize_src1 && !src1_is_contiguous && !quantization_done) {
+                //printf("Quantizing %s\n", src1->name);
                 quantize_src1(src1_ddf_i, src1_ddq_i, ne10, src1_ncols, 1, src1_padded_col_size, src0->type, stream);
                 CUDA_CHECK(cudaGetLastError());
             }
diff --git a/ggml/src/ggml-cuda/quantize.cu b/ggml/src/ggml-cuda/quantize.cu
index 65c7e5f1..90dfac92 100644
--- a/ggml/src/ggml-cuda/quantize.cu
+++ b/ggml/src/ggml-cuda/quantize.cu
@@ -37,6 +37,42 @@ static __global__ void quantize_q8_1(const float * __restrict__ x, void * __rest
     reinterpret_cast<half&>(y[ib].ds.y) = sum;
 }
 
+static __global__ void quantize_q8_1(const float * __restrict__ x, void * __restrict__ vy, const int64_t kx, const int64_t kx0_padded, const uint64_t stride) {
+    const int64_t ix0 = (int64_t)blockDim.x*blockIdx.x + threadIdx.x;
+
+    if (ix0 >= kx0_padded) {
+        return;
+    }
+
+    const int64_t ix1 = blockIdx.y;
+
+    const int64_t i_padded = ix1*kx0_padded + ix0;
+
+    block_q8_1 * y = (block_q8_1 *) vy;
+
+    const int64_t ib = i_padded / QK8_1; // block index
+    const int64_t iqs = i_padded % QK8_1; // quant index
+
+    const float xi = ix0 < kx ? x[ix1*stride + ix0] : 0.0f;
+    float amax = fabsf(xi);
+    float sum = xi;
+
+    amax = warp_reduce_max(amax);
+    sum = warp_reduce_sum(sum);
+
+    const float d = amax / 127;
+    const int8_t q = amax == 0.0f ? 0 : roundf(xi / d);
+
+    y[ib].qs[iqs] = q;
+
+    if (iqs > 0) {
+        return;
+    }
+
+    reinterpret_cast<half&>(y[ib].ds.x) = d;
+    reinterpret_cast<half&>(y[ib].ds.y) = sum;
+}
+
 template <mmq_q8_1_ds_layout ds_layout>
 static __global__ void quantize_mmq_q8_1(
     const float * __restrict__ x, void * __restrict__ vy, const int64_t kx0, const int64_t kx1, const int64_t kx0_padded) {
@@ -164,3 +200,19 @@ void quantize_mmq_q8_1_cuda(
             break;
     }
 }
+
+void quantize_tensor_q8_1_cuda(const struct ggml_tensor * src, void * vy, const enum ggml_type type, cudaStream_t stream) {
+    GGML_ASSERT(src->ne[1] == 1 && src->ne[3] == 1);
+    GGML_ASSERT(src->type == GGML_TYPE_F32);
+    const int64_t src_padded_col_size = GGML_PAD(src->ne[0], MATRIX_ROW_PADDING);
+    GGML_ASSERT(src_padded_col_size % QK8_1 == 0);
+    if (src->ne[2] == 1 || ggml_is_contiguous(src)) {
+        quantize_row_q8_1_cuda((const float *)src->data, vy, src->ne[0], 1, 1, src_padded_col_size, type, stream);
+        return;
+    }
+    const int64_t block_num_x = (src_padded_col_size + CUDA_QUANTIZE_BLOCK_SIZE - 1) / CUDA_QUANTIZE_BLOCK_SIZE;
+    const dim3 num_blocks(block_num_x, src->ne[2]*src->ne[3], 1);
+    const dim3 block_size(CUDA_QUANTIZE_BLOCK_SIZE, 1, 1);
+    const uint64_t stride = src->nb[2]/sizeof(float);
+    quantize_q8_1<<<num_blocks, block_size, 0, stream>>>((const float *)src->data, vy, src->ne[0], src_padded_col_size, stride);
+}
diff --git a/ggml/src/ggml-cuda/quantize.cuh b/ggml/src/ggml-cuda/quantize.cuh
index 03bf322b..69622e18 100644
--- a/ggml/src/ggml-cuda/quantize.cuh
+++ b/ggml/src/ggml-cuda/quantize.cuh
@@ -22,3 +22,6 @@ void quantize_row_q8_1_cuda(
 void quantize_mmq_q8_1_cuda(
     const float * x, void * vy, const int64_t kx0, const int64_t kx1, const int64_t channels,
     const int64_t kx0_padded, const ggml_type type_x, cudaStream_t stream);
+
+// For now only applicable for tensors with ne[1] = 1, ne[3] = 1, and useful if ne[2] > 1
+void quantize_tensor_q8_1_cuda(const struct ggml_tensor * src, void * vy, const enum ggml_type type, cudaStream_t stream);
diff --git a/src/llama.cpp b/src/llama.cpp
index 4504451a..ebc7a772 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -13579,7 +13579,7 @@ struct llm_build_context {
                 cb(wk_b, "wk_b", il);
 
                 q_nope = ggml_permute(ctx0, q_nope, 0, 2, 1, 3);
-                if (q_nope->ne[1] <= 32) q_nope = ggml_cont(ctx0, q_nope);
+                //if (q_nope->ne[1] <= 32) q_nope = ggml_cont(ctx0, q_nope);
                 cb(q_nope, "q_nope_perm", il);
 
                 struct ggml_tensor * q_nope2 = ggml_mul_mat(ctx0, wk_b, q_nope);
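Note (not part of the patch): the added quantize_q8_1 overload differs from the contiguous kernel above it only in the load x[ix1*stride + ix0], which walks rows that sit stride floats apart instead of kx floats apart; the output stays dense and padded. The standalone sketch below shows just that technique under simplified, made-up assumptions: blk_q8, QK and quantize_strided are stand-ins for ggml's block_q8_1/QK8_1 (which packs an fp16 {d, sum} pair, not a plain float scale), and the shuffle loop stands in for warp_reduce_max. Build with e.g. `nvcc -o q8 q8.cu`.

// Sketch only: strided Q8-style row quantization, not ggml's real kernel.
#include <cstdio>
#include <cstdint>
#include <cmath>
#include <cuda_runtime.h>

#define QK 32 // quants per block, mirrors QK8_1

struct blk_q8 {
    float  d;      // per-block scale (ggml stores fp16 {d, sum} instead)
    int8_t qs[QK]; // quantized values
};

// One thread per element; because kx_padded is a multiple of QK == warpSize,
// each 32-thread warp owns exactly one output block (or has exited entirely),
// so the full-mask shuffle below is safe.
__global__ void quantize_strided(const float * x, blk_q8 * y,
                                 const int64_t kx, const int64_t kx_padded, const int64_t stride) {
    const int64_t col = (int64_t)blockDim.x*blockIdx.x + threadIdx.x;
    if (col >= kx_padded) {
        return;
    }
    const int64_t row = blockIdx.y;

    // strided load: rows are 'stride' floats apart; threads past the row
    // contribute zero padding
    const float xi = col < kx ? x[row*stride + col] : 0.0f;

    // warp-wide max of |xi|, standing in for ggml's warp_reduce_max
    float amax = fabsf(xi);
    for (int off = 16; off > 0; off >>= 1) {
        amax = fmaxf(amax, __shfl_xor_sync(0xffffffff, amax, off));
    }

    // the destination is dense: block/quant index come from the padded position
    const int64_t i   = row*kx_padded + col;
    const int64_t ib  = i / QK;
    const int64_t iqs = i % QK;

    const float d = amax / 127;
    y[ib].qs[iqs] = amax == 0.0f ? 0 : (int8_t)roundf(xi / d);
    if (iqs == 0) {
        y[ib].d = d;
    }
}

int main() {
    const int64_t kx = 64, rows = 4, stride = 192; // stride > kx: rows not contiguous
    float * x; blk_q8 * y;
    cudaMallocManaged(&x, rows*stride*sizeof(float));
    cudaMallocManaged(&y, rows*(kx/QK)*sizeof(blk_q8));
    for (int64_t r = 0; r < rows; ++r)
        for (int64_t c = 0; c < kx; ++c)
            x[r*stride + c] = (float)(r + 1);

    const dim3 num_blocks((kx + 255)/256, rows, 1); // mirrors the host-side launch shape
    quantize_strided<<<num_blocks, 256>>>(x, y, kx, /*kx_padded=*/kx, stride);
    cudaDeviceSynchronize();
    printf("row 0: d=%g q[0]=%d (expect 1/127 and 127)\n", y[0].d, y[0].qs[0]);
    cudaFree(x); cudaFree(y);
    return 0;
}

The same invariant protects the real kernel: MATRIX_ROW_PADDING keeps src_padded_col_size a multiple of QK8_1, so warp_reduce_max never mixes lanes from two different blocks.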
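A second hedged sketch, host-side this time (assumes ggml.h; the sizes 192 = 128 nope + 64 rope dims and 16 heads are illustrative, not taken from the patch): the llama.cpp hunk comments out the ggml_cont() after ggml_permute, so during single-token decode the permuted q_nope view reaches ggml_cuda_op_mul_mat non-contiguous. The sketch prints its ne/nb layout to show why the new fast-path condition src1->ne[1] == 1 && src1->ne[3] == 1 && src1->nb[0] == sizeof(float) holds, and where stride = src->nb[2]/sizeof(float) comes from.

// Sketch only: layout of a permuted q_nope-like view, with assumed sizes.
#include <stdio.h>
#include "ggml.h"

int main(void) {
    struct ggml_init_params ip = { /*mem_size =*/ 16*1024*1024, NULL, false };
    struct ggml_context * ctx = ggml_init(ip);

    // q: [head_dim, n_head, n_tokens] with n_tokens == 1 (single-token decode)
    struct ggml_tensor * q = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, 192, 16, 1);
    // q_nope: view of the first 128 dims of every head -> rows have gaps
    struct ggml_tensor * q_nope = ggml_view_3d(ctx, q, 128, 16, 1, q->nb[1], q->nb[2], 0);
    // the permute from llm_build_context; with ggml_cont() commented out,
    // this non-contiguous view becomes src1 of the following ggml_mul_mat
    q_nope = ggml_permute(ctx, q_nope, 0, 2, 1, 3);

    // prints: contiguous=0, ne = [128, 1, 16, 1], nb[0] = 4, nb[2] = 768,
    // i.e. dense rows of 128 floats every 192 floats -> stride = nb[2]/4 = 192
    printf("contiguous=%d\n", ggml_is_contiguous(q_nope));
    for (int i = 0; i < 4; ++i) {
        printf("ne[%d]=%lld nb[%d]=%zu\n", i, (long long)q_nope->ne[i], i, q_nope->nb[i]);
    }

    ggml_free(ctx);
    return 0;
}

That layout is exactly what quantize_tensor_q8_1_cuda's strided kernel consumes directly, replacing the old path of a row-by-row ggml_cuda_cpy_tensor_2d copy followed by quantization of the compacted buffer.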