From 9fbe5beef7dd2285de52fa0ca8a4a1e134117b16 Mon Sep 17 00:00:00 2001
From: Iwan Kawrakow
Date: Thu, 20 Mar 2025 15:24:15 +0200
Subject: [PATCH] Create wk_b and wv_b as Q8_0_R8 if the wkv_b type is
 interleaved

---
 ggml/src/ggml.c | 10 ++++++----
 src/llama.cpp   |  3 ++-
 2 files changed, 8 insertions(+), 5 deletions(-)

diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c
index 49e6c45e..a2bdc156 100644
--- a/ggml/src/ggml.c
+++ b/ggml/src/ggml.c
@@ -10128,9 +10128,11 @@ static void ggml_compute_forward_dup_f32(
     }

     // parallelize by rows
+    int n_packed = ggml_packed_rows(dst->type);
+    GGML_ASSERT(dst->ne[1] % n_packed == 0);
     const int nr = ne01;
     // number of rows per thread
-    const int dr = (nr + nth - 1) / nth;
+    const int dr = n_packed*((nr/n_packed + nth - 1) / nth);
     // row range for this thread
     const int ir0 = dr * ith;
     const int ir1 = MIN(ir0 + dr, nr);
@@ -10182,10 +10184,10 @@ static void ggml_compute_forward_dup_f32(
         for (int i03 = 0; i03 < ne03; i03++) {
             for (int i02 = 0; i02 < ne02; i02++) {
                 id += rs * ir0;
-                for (int i01 = ir0; i01 < ir1; i01++) {
+                for (int i01 = ir0; i01 < ir1; i01 += n_packed) {
                     const float * src0_ptr = (float *) ((char *) src0->data + i01*nb01 + i02*nb02 + i03*nb03);
-                    quantize_row_q(src0_ptr, dst_ptr + id, ne00);
-                    id += rs;
+                    quantize_row_q(src0_ptr, dst_ptr + id, ne00*n_packed);
+                    id += rs*n_packed;
                 }
                 id += rs * (ne01 - ir1);
             }
diff --git a/src/llama.cpp b/src/llama.cpp
index 72ee345e..a459cb00 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -8194,7 +8194,8 @@ static bool llm_load_tensors(
                     auto wk_b_f32_t = ggml_cont(ctx, wk_b_f32_tview);
                     wk_b_f32_t->data = (char *)wk_b_f32->data + ggml_nbytes(wk_b_f32);
-                    auto new_type = ggml_is_quantized(wkv_b.type) ? GGML_TYPE_Q8_0 : wkv_b.type;
+                    auto new_type = ggml_is_quantized(wkv_b.type) ?
+                        wkv_b.type >= GGML_TYPE_Q4_0_R8 && wkv_b.type <= GGML_TYPE_Q8_K_R8 ? GGML_TYPE_Q8_0_R8 : GGML_TYPE_Q8_0 : wkv_b.type;
                     auto wk_b = ggml_cast(ctx, wk_b_f32_t, new_type);
                     wk_b->data = (char *)wk_b_f32_t->data + ggml_nbytes(wk_b_f32_t);