diff --git a/common/common.cpp b/common/common.cpp
index 44678d7a..f7a6f76f 100644
--- a/common/common.cpp
+++ b/common/common.cpp
@@ -2259,6 +2259,9 @@ static ggml_type kv_cache_type_from_str(const std::string & s) {
     if (s == "q6_0") {
         return GGML_TYPE_Q6_0;
     }
+    if (s == "q8_KV") {
+        return GGML_TYPE_Q8_KV;
+    }

     throw std::runtime_error("Invalid cache type: " + s);
 }
diff --git a/examples/llama-bench/llama-bench.cpp b/examples/llama-bench/llama-bench.cpp
index 95df06dc..0222c213 100644
--- a/examples/llama-bench/llama-bench.cpp
+++ b/examples/llama-bench/llama-bench.cpp
@@ -339,6 +339,9 @@ static ggml_type ggml_type_from_name(const std::string & s) {
     if (s == "q6_0") {
         return GGML_TYPE_Q6_0;
     }
+    if (s == "q8_KV") {
+        return GGML_TYPE_Q8_KV;
+    }

     return GGML_TYPE_COUNT;
 }
diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c
index c5d42e57..02d310d8 100644
--- a/ggml/src/ggml.c
+++ b/ggml/src/ggml.c
@@ -1364,8 +1364,8 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
     },
     [GGML_TYPE_Q8_KV] = {
         .type_name = "q8_KV",
-        .blck_size = 1,
-        .type_size = 1,
+        .blck_size = 32,
+        .type_size = 32,
         .is_quantized = true,
         .to_float = (ggml_to_float_t) dequantize_row_q8_KV,
         .from_float = quantize_row_q8_KV,
@@ -9449,7 +9449,7 @@ static void ggml_compute_forward_dup_f16(
                 float * src0_f32 = (float *) params->wdata + (ne00 + CACHE_LINE_SIZE_F32) * ith;

                 size_t id = 0;
-                size_t rs = nb0 * (ne00 / ggml_blck_size(dst->type));
+                size_t rs = ggml_row_size(dst->type, ne00); //nb0 * (ne00 / ggml_blck_size(dst->type));
                 char * dst_ptr = (char *) dst->data;

                 for (int i03 = 0; i03 < ne03; i03++) {
@@ -9735,7 +9735,7 @@ static void ggml_compute_forward_dup_bf16(
                 float * src0_f32 = (float *) params->wdata + (ne00 + CACHE_LINE_SIZE_F32) * ith;

                 size_t id = 0;
-                size_t rs = nb0 * (ne00 / ggml_blck_size(dst->type));
+                size_t rs = ggml_row_size(dst->type, ne00); //nb0 * (ne00 / ggml_blck_size(dst->type));
                 char * dst_ptr = (char *) dst->data;

                 for (int i03 = 0; i03 < ne03; i03++) {
@@ -10055,7 +10055,7 @@ static void ggml_compute_forward_dup_f32(
                 ggml_from_float_t const quantize_row_q = type_traits[dst->type].from_float;

                 size_t id = 0;
-                size_t rs = nb0 * (ne00 / ggml_blck_size(dst->type));
+                size_t rs = ggml_row_size(dst->type, ne00); //nb0 * (ne00 / ggml_blck_size(dst->type));
                 char * dst_ptr = (char *) dst->data;

                 for (int i03 = 0; i03 < ne03; i03++) {
@@ -14357,7 +14357,7 @@ static void ggml_compute_forward_mul_mat_id(

     char * wdata_src1_end = (src1->type == vec_dot_type) ?
             (char *) params->wdata :
-            (char *) params->wdata + GGML_PAD(ggml_row_size(vec_dot_type, ggml_nelements(src1)), sizeof(int64_t));
+            (char *) params->wdata + GGML_PAD(ggml_row_size(vec_dot_type, src1->ne[0])*ggml_nrows(src1), sizeof(int64_t));

     struct mmid_row_mapping {
         int32_t i1;
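The three ggml_compute_forward_dup_* hunks above replace the hand-rolled row-size expression nb0 * (ne00 / ggml_blck_size(dst->type)) with ggml_row_size(dst->type, ne00). For plain block types the two agree (nb0 is the type size, so both reduce to type_size * ne00 / blck_size), but routing through ggml_row_size() keeps the dup path correct for any type whose row size is not exactly that product, for example a type carrying per-row metadata. The ggml_compute_forward_mul_mat_id hunk applies the same idea at tensor scope: bytes per row times ggml_nrows(src1) instead of treating all of ggml_nelements(src1) as one giant row. Below is a minimal sketch of the identity involved, assuming mainline ggml semantics for plain block types; the traits struct and values are local stand-ins, not the real type_traits table.

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

/* stand-in for one entry of ggml's type_traits table */
struct toy_traits { int64_t blck_size; size_t type_size; };

/* row size of a plain block type: bytes per block times blocks per row */
static size_t toy_row_size(struct toy_traits t, int64_t ne) {
    assert(ne % t.blck_size == 0); /* a row must hold whole blocks */
    return (size_t)(ne / t.blck_size) * t.type_size;
}

int main(void) {
    /* with the patched q8_KV traits (blck_size = 32, type_size = 32),
       a 128-element row occupies 128 bytes under this model */
    struct toy_traits q8_kv = { 32, 32 };
    printf("row bytes: %zu\n", toy_row_size(q8_kv, 128)); /* prints 128 */
    return 0;
}

Note that moving from blck_size = type_size = 1 to 32/32 leaves the bytes-per-element ratio unchanged; what changes is that q8_KV rows are now constrained to whole 32-element blocks, which the ggml_row_size() call sites above respect.
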
diff --git a/src/llama.cpp b/src/llama.cpp
index 298c51eb..ec5d0fb1 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -3180,6 +3180,10 @@ static bool llama_kv_cache_init(
     for (int i = 0; i < (int) n_layer; i++) {
         const uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa(i) + hparams.n_embd_k_s();
         const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa(i) + hparams.n_embd_v_s();
+        const uint32_t n_head = hparams.n_head(i);
+        const uint32_t n_head_kv = hparams.n_head_kv(i);
+        const uint32_t n_embd_head_k= hparams.n_embd_head_k;
+
         struct ggml_context * ctx = offload ? ctx_map.at(model.buft_layer[i].buft) : cache.ctxs.front();

         ggml_tensor * k;
@@ -3201,7 +3205,8 @@ static bool llama_kv_cache_init(
                 const uint32_t kv_lora_rank = hparams.n_lora_kv;
                 LLAMA_LOG_INFO("%s: layer %d: n_embd_head_qk_rope = %d, kv_lora_rank = %d\n", __func__, i, n_embd_head_qk_rope, kv_lora_rank);
 #if MLA_USE_TRANSPOSED_CACHE
-                ggml_tensor * kv = ggml_new_tensor_1d(ctx, cache.type_k, (kv_lora_rank + n_embd_head_qk_rope)*kv_size);
+                ggml_tensor * kv = ggml_new_tensor_2d(ctx, cache.type_k, kv_lora_rank + n_embd_head_qk_rope, kv_size);
+                //ggml_tensor * kv = ggml_new_tensor_1d(ctx, cache.type_k, (kv_lora_rank + n_embd_head_qk_rope)*kv_size);
 #else
                 ggml_tensor * kv = ggml_new_tensor_1d(ctx, cache.type_v, (kv_lora_rank + n_embd_head_qk_rope)*kv_size);
 #endif
@@ -3215,7 +3220,10 @@ static bool llama_kv_cache_init(
             n_mla++;
         }
         else {
-            k = ggml_new_tensor_1d(ctx, type_k, n_embd_k_gqa*kv_size);
+            //printf("Creating cache tensors:\n");
+            //printf("n_embd_k_gqa = %d, kv_size = %d, n_head = %d, n_head_kv = %d, n_embd_head_k = %d\n", (int)n_embd_k_gqa, (int)kv_size, (int)n_head, (int)n_head_kv, (int)n_embd_head_k);
+            //k = ggml_new_tensor_1d(ctx, type_k, n_embd_k_gqa*kv_size);
+            k = ggml_new_tensor_2d(ctx, type_k, n_embd_head_k, n_head_kv*kv_size);
             v = ggml_new_tensor_1d(ctx, type_v, n_embd_v_gqa*kv_size);
             ggml_format_name(k, "cache_k_l%d", i);
             ggml_format_name(v, "cache_v_l%d", i);
@@ -8285,11 +8293,20 @@ static void llm_build_kv_store(
     const int64_t n_embd_k_gqa = hparams.n_embd_k_gqa(il);
     const int64_t n_embd_v_gqa = hparams.n_embd_v_gqa(il);
+    const int64_t n_head = hparams.n_head(il);
+    const int64_t n_head_kv = hparams.n_head_kv(il);
+    const int64_t n_embd_head_k = hparams.n_embd_head_k;
+    const int64_t n_embd_head_v = hparams.n_embd_head_v;
+

     GGML_ASSERT(kv.size == n_ctx);

-    struct ggml_tensor * k_cache_view = ggml_view_1d(ctx, kv.k_l[il], n_tokens*n_embd_k_gqa,
-            (ggml_row_size(kv.k_l[il]->type, n_embd_k_gqa))*kv_head);
-    cb(k_cache_view, "k_cache_view", il);
+    //struct ggml_tensor * k_cache_view = ggml_view_1d(ctx, kv.k_l[il], n_tokens*n_embd_k_gqa,
+    //        (ggml_row_size(kv.k_l[il]->type, n_embd_k_gqa))*kv_head);
+    //cb(k_cache_view, "k_cache_view", il);
+
+    auto k_row_size = ggml_row_size(kv.k_l[il]->type, n_embd_head_k);
+    ggml_tensor * k_cache_view = ggml_view_2d(ctx, kv.k_l[il], n_embd_head_k, n_tokens*n_head_kv,
+            k_row_size, k_row_size*n_head_kv*kv_head);

     // note: storing RoPE-ed version of K in the KV cache
     ggml_build_forward_expand(graph, ggml_cpy(ctx, k_cur, k_cache_view));
@@ -8708,7 +8725,7 @@ static struct ggml_tensor * llm_build_kqv(
     struct ggml_tensor * k =
         ggml_view_3d(ctx, kv.k_l[il],
                 n_embd_head_k, n_kv, n_head_kv,
-                ggml_row_size(kv.k_l[il]->type, n_embd_k_gqa),
+                ggml_row_size(kv.k_l[il]->type, n_embd_head_k)*n_head_kv, //n_embd_k_gqa),
                 ggml_row_size(kv.k_l[il]->type, n_embd_head_k),
                 0);
     cb(k, "k", il);
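With the K cache now allocated as a 2-D tensor of head-sized rows (n_embd_head_k elements each, n_head_kv*kv_size rows in total), the store view in llm_build_kv_store and the 3-D view in llm_build_kqv both step through it per head: each cache slot occupies n_head_kv consecutive rows, which appears to be the point for row-quantized cache types such as q8_KV, since every attention head then lives in its own quantization row. Here is a hedged sketch of the offset arithmetic, assuming ggml's row-major layout where ne[0] is the contiguous dimension; all names are local stand-ins for the variables in the hunks above.

#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

/* byte offset of the first head-row written for cache slot kv_head;
   mirrors k_row_size*n_head_kv*kv_head in llm_build_kv_store */
static size_t k_write_offset(size_t k_row_size, int64_t n_head_kv, int64_t kv_head) {
    return k_row_size * (size_t)(n_head_kv * kv_head);
}

/* byte stride between the same head of two consecutive cache slots;
   mirrors ggml_row_size(type, n_embd_head_k)*n_head_kv in llm_build_kqv */
static size_t k_token_stride(size_t k_row_size, int64_t n_head_kv) {
    return k_row_size * (size_t) n_head_kv;
}

int main(void) {
    /* e.g. 128-byte head rows, 8 KV heads, writing at slot 40 */
    printf("offset = %zu, token stride = %zu\n",
           k_write_offset(128, 8, 40), k_token_stride(128, 8));
    return 0;
}

The transposed MLA cache in the next hunk follows the same per-row pattern, with the ggml_view_2d row stride being the row size of kv_lora_rank + n_embd_head_qk_rope elements.
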
@@ -13509,8 +13526,9 @@ struct llm_build_context {
                 ggml_tensor * kvr = ggml_concat(ctx0, kv_compressed, ggml_permute(ctx0, k_rope, 0, 2, 1, 3), 0);
                 cb(kvr, "kvr", il);

-                ggml_tensor * kv_cache_view = ggml_view_1d(ctx0, kv_self.kv_l[il], n_tokens*(kv_lora_rank + n_embd_head_qk_rope),
-                        ggml_row_size(kv_self.kv_l[il]->type, kv_lora_rank + n_embd_head_qk_rope)*kv_head);
+                auto row_size = ggml_row_size(kv_self.kv_l[il]->type, kv_lora_rank + n_embd_head_qk_rope);
+                ggml_tensor * kv_cache_view = ggml_view_2d(ctx0, kv_self.kv_l[il], kv_self.kv_l[il]->ne[0], n_tokens,
+                        row_size, row_size*kv_head);
                 ggml_build_forward_expand(gf, ggml_cpy(ctx0, kvr, kv_cache_view));

                 ggml_tensor * kv_cache = ggml_view_2d(ctx0, kv_self.kv_l[il], kv_lora_rank + n_embd_head_qk_rope, n_kv,