From 9fbe5beef7dd2285de52fa0ca8a4a1e134117b16 Mon Sep 17 00:00:00 2001
From: Iwan Kawrakow
Date: Thu, 20 Mar 2025 15:24:15 +0200
Subject: [PATCH] Create wk_b and wv_b as Q8_0_R8 if the wkv_b type is
 interleaved

---
 ggml/src/ggml.c | 10 ++++++----
 src/llama.cpp   |  3 ++-
 2 files changed, 8 insertions(+), 5 deletions(-)

diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c
index 49e6c45e..a2bdc156 100644
--- a/ggml/src/ggml.c
+++ b/ggml/src/ggml.c
@@ -10128,9 +10128,11 @@ static void ggml_compute_forward_dup_f32(
     }

     // parallelize by rows
+    int n_packed = ggml_packed_rows(dst->type);
+    GGML_ASSERT(dst->ne[1] % n_packed == 0);
     const int nr = ne01;
     // number of rows per thread
-    const int dr = (nr + nth - 1) / nth;
+    const int dr = n_packed*((nr/n_packed + nth - 1) / nth);
     // row range for this thread
     const int ir0 = dr * ith;
     const int ir1 = MIN(ir0 + dr, nr);
@@ -10182,10 +10184,10 @@ static void ggml_compute_forward_dup_f32(
         for (int i03 = 0; i03 < ne03; i03++) {
             for (int i02 = 0; i02 < ne02; i02++) {
                 id += rs * ir0;
-                for (int i01 = ir0; i01 < ir1; i01++) {
+                for (int i01 = ir0; i01 < ir1; i01 += n_packed) {
                     const float * src0_ptr = (float *) ((char *) src0->data + i01*nb01 + i02*nb02 + i03*nb03);
-                    quantize_row_q(src0_ptr, dst_ptr + id, ne00);
-                    id += rs;
+                    quantize_row_q(src0_ptr, dst_ptr + id, ne00*n_packed);
+                    id += rs*n_packed;
                 }
                 id += rs * (ne01 - ir1);
             }
diff --git a/src/llama.cpp b/src/llama.cpp
index 72ee345e..a459cb00 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -8194,7 +8194,8 @@ static bool llm_load_tensors(
                     auto wk_b_f32_t = ggml_cont(ctx, wk_b_f32_tview);
                     wk_b_f32_t->data = (char *)wk_b_f32->data + ggml_nbytes(wk_b_f32);
-                    auto new_type = ggml_is_quantized(wkv_b.type) ? GGML_TYPE_Q8_0 : wkv_b.type;
+                    auto new_type = ggml_is_quantized(wkv_b.type) ?
+                        wkv_b.type >= GGML_TYPE_Q4_0_R8 && wkv_b.type <= GGML_TYPE_Q8_K_R8 ? GGML_TYPE_Q8_0_R8 : GGML_TYPE_Q8_0 : wkv_b.type;
                     auto wk_b = ggml_cast(ctx, wk_b_f32_t, new_type);
                     wk_b->data = (char *)wk_b_f32_t->data + ggml_nbytes(wk_b_f32_t);