diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c index 35ed68d0..950dd136 100644 --- a/ggml/src/ggml.c +++ b/ggml/src/ggml.c @@ -1029,7 +1029,7 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = { .vec_dot = ggml_vec_dot_iq2_bn_q8_K64, .vec_dot_type = GGML_TYPE_Q8_K64, .nrows = 1, - .row_meta_size = 0, + .row_meta_size = 4, }, [GGML_TYPE_IQ2_TN] = { .type_name = "iq2_tn", diff --git a/ggml/src/iqk/iqk_mul_mat.cpp b/ggml/src/iqk/iqk_mul_mat.cpp index b77d08b6..7b2caf01 100644 --- a/ggml/src/iqk/iqk_mul_mat.cpp +++ b/ggml/src/iqk/iqk_mul_mat.cpp @@ -2571,7 +2571,7 @@ IQK_NOINLINE void mul_mat_iq1bn_q8_K64(int n, const void * vx, size_t bx, const } } -struct DequantizeIQ2BN final : public BaseDequantizer<block_iq2_bn> { +struct DequantizeIQ2BN final : public BaseDequantizer<block_iq2_bn, true> { DequantizeIQ2BN(const void * vx, size_t bx) : BaseDequantizer(vx, bx) {} IQK_ALWAYS_INLINE void prepare4(int i, __m256i * val) const { @@ -2671,7 +2671,7 @@ IQK_NOINLINE void mul_mat_iq2bn_q8_K64(int n, const void * vx, size_t bx, const auto vd = q8.scale(iy); auto sumi = _mm_add_epi32(_mm256_castsi256_si128(accd[iy]), _mm256_extractf128_si256(accd[iy], 1)); auto sumf = _mm_fmsub_ps(vd, _mm_cvtepi32_ps(sumi), q8.minus(iy)); - info.store(ix, iy, hsum_float_4(sumf)); + info.store(ix, iy, deq.d*hsum_float_4(sumf)); } } } diff --git a/ggml/src/iqk/iqk_quantize.cpp b/ggml/src/iqk/iqk_quantize.cpp index 26bc5ecb..6a7e2642 100644 --- a/ggml/src/iqk/iqk_quantize.cpp +++ b/ggml/src/iqk/iqk_quantize.cpp @@ -150,10 +150,18 @@ void IQ1BNQuantizer::quantize_one_row_2bn(const float * src, block_iq2_bn * y, i constexpr int Nj = QK_IQ1BN/4; + float max = 0; + for (int j = 0; j < n_per_row; ++j) max = std::max(max, fabsf(src[j])); + + float * dptr = (float *)y; + *dptr = max; + y = (block_iq2_bn *)(dptr + 1); + float thresh = 0.5f*max; + for (int ib = 0; ib < nblock; ++ib) { auto xb = src + QK_IQ1BN*ib; for (int j = 0; j < QK_IQ1BN; ++j) { - L[j] = fabsf(xb[j]) < 1e-6f ? 1 : xb[j] < 0 ? 
0 : 2; + L[j] = fabsf(xb[j]) < thresh ? 1 : xb[j] < 0 ? 0 : 2; } for (int j = 0; j < Nj; ++j) { y[ib].qs[j] = L[j] | (L[j + Nj] << 2) | (L[j + 2*Nj] << 4) | (L[j + 3*Nj] << 6); @@ -255,13 +263,13 @@ void dequantize_row_iq1_bn(const block_iq1_bn * x, float * y, int64_t k) { size_t quantize_iq2_bn(const float * src, void * dst, int64_t nrows, int64_t n_per_row, const float * imatrix) { IQ1BNQuantizer iq1bn; - int nblock = n_per_row/QK_IQ1BN; - block_iq2_bn * y = (block_iq2_bn *)dst; + auto row_size = ggml_row_size(GGML_TYPE_IQ2_BN, n_per_row); + auto qrow = (char *)dst; for (int row = 0; row < nrows; ++row) { - iq1bn.quantize_one_row_2bn(src + row*n_per_row, y, n_per_row, imatrix); - y += nblock; + iq1bn.quantize_one_row_2bn(src + row*n_per_row, (block_iq2_bn *)qrow, n_per_row, imatrix); + qrow += row_size; } - return sizeof(block_iq2_bn)*nblock*nrows; + return nrows*row_size; } void quantize_row_iq2_bn_ref(const float * x, block_iq2_bn * y, int64_t k) { diff --git a/src/llama.cpp b/src/llama.cpp index 4ca1bd11..7a52bc08 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -13329,7 +13329,7 @@ struct llm_build_context { float q_scale; std::memcpy(&q_scale, model.layers[il].wq->op_params, sizeof(float)); // Note: we could save this scale operation by applying the Q scale on the K * Q product further down // (which also uses a scale). This works on the CPU and Metal backends, but produces NaNs on CUDA. 
- Qcur = ggml_scale(ctx0, Qcur, q_scale); + if (fabsf(q_scale-1) > 1e-4f) Qcur = ggml_scale(ctx0, Qcur, q_scale); cb(Qcur, "Qcur", il); if (model.layers[il].bq) { Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq); @@ -13339,7 +13339,7 @@ struct llm_build_context { // B1.K struct ggml_tensor * Kcur = ggml_mul_mat(ctx0, model.layers[il].wk, cur); float k_scale; std::memcpy(&k_scale, model.layers[il].wk->op_params, sizeof(float)); - Kcur = ggml_scale(ctx0, Kcur, k_scale); + if (fabsf(k_scale-1) > 1e-4f) Kcur = ggml_scale(ctx0, Kcur, k_scale); cb(Kcur, "Kcur", il); if (model.layers[il].bk) { Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk); @@ -13351,7 +13351,7 @@ struct llm_build_context { float v_scale; std::memcpy(&v_scale, model.layers[il].wv->op_params, sizeof(float)); cb(Vcur, "Vcur", il); if (model.layers[il].bv) { - Vcur = ggml_scale(ctx0, Vcur, v_scale); + if (fabsf(v_scale-1) > 1e-4f) Vcur = ggml_scale(ctx0, Vcur, v_scale); Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv); cb(Vcur, "Vcur", il); v_scale = 1; @@ -13431,7 +13431,7 @@ struct llm_build_context { cur = ggml_mul_mat(ctx0, model.layers[il].wo, cur_attn); float wo_scale; std::memcpy(&wo_scale, model.layers[il].wo->op_params, sizeof(float)); - cur = ggml_scale(ctx0, cur, wo_scale); + if (fabsf(wo_scale-1) > 1e-4f) cur = ggml_scale(ctx0, cur, wo_scale); cb(cur, "kqv_out", il); } @@ -13460,7 +13460,7 @@ struct llm_build_context { cur = ggml_mul_mat(ctx0, model.layers[il].ffn_gate, cur); float ffn_gate_scale; std::memcpy(&ffn_gate_scale, model.layers[il].ffn_gate->op_params, sizeof(float)); - cur = ggml_scale(ctx0, cur, ffn_gate_scale); + if (fabsf(ffn_gate_scale-1) > 1e-4f) cur = ggml_scale(ctx0, cur, ffn_gate_scale); cb(cur, "ffn_gate", il); @@ -13479,7 +13479,7 @@ struct llm_build_context { cur = ggml_mul_mat(ctx0, model.layers[il].ffn_down, cur); float ffn_down_scale; std::memcpy(&ffn_down_scale, model.layers[il].ffn_down->op_params, sizeof(float)); - cur = ggml_scale(ctx0, cur, ffn_down_scale); + if 
(fabsf(ffn_down_scale-1) > 1e-4f) cur = ggml_scale(ctx0, cur, ffn_down_scale); cb(cur, "ffn_down", il); } cur = ggml_add(ctx0, cur, ffn_inp);