diff --git a/ggml/src/ggml-cuda/fattn-new-mma.cu b/ggml/src/ggml-cuda/fattn-new-mma.cu
index 615f8633..63a9ca57 100644
--- a/ggml/src/ggml-cuda/fattn-new-mma.cu
+++ b/ggml/src/ggml-cuda/fattn-new-mma.cu
@@ -1804,9 +1804,16 @@ static void launch_fattn_new_mma(
         to_fp16(K_data, K_f16.ptr, 1, ggml_nelements(K), main_stream);
         K_data = (char *) K_f16.ptr;
 
-        nb11 = K->ne[0]*sizeof(half);
-        nb12 = nb11*K->ne[1];
-        nb13 = nb12*K->ne[2];
+        auto bs = ggml_blck_size(K->type);
+        auto ts = ggml_type_size(K->type);
+
+        nb11 = nb11*bs*sizeof(half)/ts;
+        nb12 = nb12*bs*sizeof(half)/ts;
+        nb13 = nb13*bs*sizeof(half)/ts;
+
+        //nb11 = K->ne[0]*sizeof(half);
+        //nb12 = nb11*K->ne[1];
+        //nb13 = nb12*K->ne[2];
     }
 
     if (need_f16_V && V->type != GGML_TYPE_F16) {
@@ -1823,9 +1830,16 @@
         to_fp16(V_data, V_f16.ptr, 1, ggml_nelements(V), main_stream);
         V_data = (char *) V_f16.ptr;
 
-        nb21 = V->ne[0]*sizeof(half);
-        nb22 = nb21*V->ne[1];
-        nb23 = nb22*V->ne[2];
+        auto bs = ggml_blck_size(V->type);
+        auto ts = ggml_type_size(V->type);
+
+        nb21 = nb21*bs*sizeof(half)/ts;
+        nb22 = nb22*bs*sizeof(half)/ts;
+        nb23 = nb23*bs*sizeof(half)/ts;
+
+        //nb21 = V->ne[0]*sizeof(half);
+        //nb22 = nb21*V->ne[1];
+        //nb23 = nb22*V->ne[2];
     }
 
 }
diff --git a/src/llama-build-context.cpp b/src/llama-build-context.cpp
index af4939dc..3c6e552f 100644
--- a/src/llama-build-context.cpp
+++ b/src/llama-build-context.cpp
@@ -1623,7 +1623,6 @@ std::tuple llm_build_context::llm_buil
         ggml_tensor * wv, ggml_tensor * bv, ggml_tensor * q_norm, ggml_tensor * k_norm,
         float attention_scale, int il, bool add_graph_split) const {
     const int64_t n_embd_head_k = hparams.n_embd_head_k;
-    const int64_t n_embd_head_v = hparams.n_embd_head_v;
     const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
     if (wqkv) {
         auto qkv = llm_build_lora_mm(lctx, ctx0, wqkv, cur);