diff --git a/ggml/src/ggml-cuda/fattn-new-mma.cu b/ggml/src/ggml-cuda/fattn-new-mma.cu
index 615f8633..63a9ca57 100644
--- a/ggml/src/ggml-cuda/fattn-new-mma.cu
+++ b/ggml/src/ggml-cuda/fattn-new-mma.cu
@@ -1804,9 +1804,16 @@ static void launch_fattn_new_mma(
         to_fp16(K_data, K_f16.ptr, 1, ggml_nelements(K), main_stream);
         K_data = (char *) K_f16.ptr;
 
-        nb11 = K->ne[0]*sizeof(half);
-        nb12 = nb11*K->ne[1];
-        nb13 = nb12*K->ne[2];
+        auto bs = ggml_blck_size(K->type);
+        auto ts = ggml_type_size(K->type);
+
+        nb11 = nb11*bs*sizeof(half)/ts;
+        nb12 = nb12*bs*sizeof(half)/ts;
+        nb13 = nb13*bs*sizeof(half)/ts;
+
+        //nb11 = K->ne[0]*sizeof(half);
+        //nb12 = nb11*K->ne[1];
+        //nb13 = nb12*K->ne[2];
     }
 
     if (need_f16_V && V->type != GGML_TYPE_F16) {
@@ -1823,9 +1830,16 @@
         to_fp16(V_data, V_f16.ptr, 1, ggml_nelements(V), main_stream);
         V_data = (char *) V_f16.ptr;
 
-        nb21 = V->ne[0]*sizeof(half);
-        nb22 = nb21*V->ne[1];
-        nb23 = nb22*V->ne[2];
+        auto bs = ggml_blck_size(V->type);
+        auto ts = ggml_type_size(V->type);
+
+        nb21 = nb21*bs*sizeof(half)/ts;
+        nb22 = nb22*bs*sizeof(half)/ts;
+        nb23 = nb23*bs*sizeof(half)/ts;
+
+        //nb21 = V->ne[0]*sizeof(half);
+        //nb22 = nb21*V->ne[1];
+        //nb23 = nb22*V->ne[2];
     }
 
 }
diff --git a/src/llama-build-context.cpp b/src/llama-build-context.cpp
index af4939dc..3c6e552f 100644
--- a/src/llama-build-context.cpp
+++ b/src/llama-build-context.cpp
@@ -1623,7 +1623,6 @@ std::tuple llm_build_context::llm_buil
         ggml_tensor * wv, ggml_tensor * bv, ggml_tensor * q_norm, ggml_tensor * k_norm,
         float attention_scale, int il, bool add_graph_split) const {
     const int64_t n_embd_head_k = hparams.n_embd_head_k;
-    const int64_t n_embd_head_v = hparams.n_embd_head_v;
     const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
     if (wqkv) {
         auto qkv = llm_build_lora_mm(lctx, ctx0, wqkv, cur);