diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c index 35ed68d0..950dd136 100644 --- a/ggml/src/ggml.c +++ b/ggml/src/ggml.c @@ -1029,7 +1029,7 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = { .vec_dot = ggml_vec_dot_iq2_bn_q8_K64, .vec_dot_type = GGML_TYPE_Q8_K64, .nrows = 1, - .row_meta_size = 0, + .row_meta_size = 4, }, [GGML_TYPE_IQ2_TN] = { .type_name = "iq2_tn", diff --git a/ggml/src/iqk/iqk_mul_mat.cpp b/ggml/src/iqk/iqk_mul_mat.cpp index b77d08b6..7b2caf01 100644 --- a/ggml/src/iqk/iqk_mul_mat.cpp +++ b/ggml/src/iqk/iqk_mul_mat.cpp @@ -2571,7 +2571,7 @@ IQK_NOINLINE void mul_mat_iq1bn_q8_K64(int n, const void * vx, size_t bx, const } } -struct DequantizeIQ2BN final : public BaseDequantizer<block_iq2_bn> { +struct DequantizeIQ2BN final : public BaseDequantizer<block_iq2_bn, true> { DequantizeIQ2BN(const void * vx, size_t bx) : BaseDequantizer(vx, bx) {} IQK_ALWAYS_INLINE void prepare4(int i, __m256i * val) const { @@ -2671,7 +2671,7 @@ IQK_NOINLINE void mul_mat_iq2bn_q8_K64(int n, const void * vx, size_t bx, const auto vd = q8.scale(iy); auto sumi = _mm_add_epi32(_mm256_castsi256_si128(accd[iy]), _mm256_extractf128_si256(accd[iy], 1)); auto sumf = _mm_fmsub_ps(vd, _mm_cvtepi32_ps(sumi), q8.minus(iy)); - info.store(ix, iy, hsum_float_4(sumf)); + info.store(ix, iy, deq.d*hsum_float_4(sumf)); } } } diff --git a/ggml/src/iqk/iqk_quantize.cpp b/ggml/src/iqk/iqk_quantize.cpp index 26bc5ecb..6a7e2642 100644 --- a/ggml/src/iqk/iqk_quantize.cpp +++ b/ggml/src/iqk/iqk_quantize.cpp @@ -150,10 +150,18 @@ void IQ1BNQuantizer::quantize_one_row_2bn(const float * src, block_iq2_bn * y, i constexpr int Nj = QK_IQ1BN/4; + float max = 0; + for (int j = 0; j < n_per_row; ++j) max = std::max(max, fabsf(src[j])); + + float * dptr = (float *)y; + *dptr = max; + y = (block_iq2_bn *)(dptr + 1); + float thresh = 0.5f*max; + for (int ib = 0; ib < nblock; ++ib) { auto xb = src + QK_IQ1BN*ib; for (int j = 0; j < QK_IQ1BN; ++j) { - L[j] = fabsf(xb[j]) < 1e-6f ? 1 : xb[j] < 0 ? 
0 : 2; + L[j] = fabsf(xb[j]) < thresh ? 1 : xb[j] < 0 ? 0 : 2; } for (int j = 0; j < Nj; ++j) { y[ib].qs[j] = L[j] | (L[j + Nj] << 2) | (L[j + 2*Nj] << 4) | (L[j + 3*Nj] << 6); @@ -255,13 +263,13 @@ void dequantize_row_iq1_bn(const block_iq1_bn * x, float * y, int64_t k) { size_t quantize_iq2_bn(const float * src, void * dst, int64_t nrows, int64_t n_per_row, const float * imatrix) { IQ1BNQuantizer iq1bn; - int nblock = n_per_row/QK_IQ1BN; - block_iq2_bn * y = (block_iq2_bn *)dst; + auto row_size = ggml_row_size(GGML_TYPE_IQ2_BN, n_per_row); + auto qrow = (char *)dst; for (int row = 0; row < nrows; ++row) { - iq1bn.quantize_one_row_2bn(src + row*n_per_row, y, n_per_row, imatrix); - y += nblock; + iq1bn.quantize_one_row_2bn(src + row*n_per_row, (block_iq2_bn *)qrow, n_per_row, imatrix); + qrow += row_size; } - return sizeof(block_iq2_bn)*nblock*nrows; + return nrows*row_size; } void quantize_row_iq2_bn_ref(const float * x, block_iq2_bn * y, int64_t k) { diff --git a/src/llama.cpp b/src/llama.cpp index 4ca1bd11..7a52bc08 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -13329,7 +13329,7 @@ struct llm_build_context { float q_scale; std::memcpy(&q_scale, model.layers[il].wq->op_params, sizeof(float)); // Note: we could save this scale operation by applying the Q scale on the K * Q product further down // (which also uses a scale). This works on the CPU and Metal backends, but produces NaNs on CUDA. 
- Qcur = ggml_scale(ctx0, Qcur, q_scale); + if (fabsf(q_scale-1) > 1e-4f) Qcur = ggml_scale(ctx0, Qcur, q_scale); cb(Qcur, "Qcur", il); if (model.layers[il].bq) { Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq); @@ -13339,7 +13339,7 @@ struct llm_build_context { // B1.K struct ggml_tensor * Kcur = ggml_mul_mat(ctx0, model.layers[il].wk, cur); float k_scale; std::memcpy(&k_scale, model.layers[il].wk->op_params, sizeof(float)); - Kcur = ggml_scale(ctx0, Kcur, k_scale); + if (fabsf(k_scale-1) > 1e-4f) Kcur = ggml_scale(ctx0, Kcur, k_scale); cb(Kcur, "Kcur", il); if (model.layers[il].bk) { Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk); @@ -13351,7 +13351,7 @@ struct llm_build_context { float v_scale; std::memcpy(&v_scale, model.layers[il].wv->op_params, sizeof(float)); cb(Vcur, "Vcur", il); if (model.layers[il].bv) { - Vcur = ggml_scale(ctx0, Vcur, v_scale); + if (fabsf(v_scale-1) > 1e-4f) Vcur = ggml_scale(ctx0, Vcur, v_scale); Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv); cb(Vcur, "Vcur", il); v_scale = 1; @@ -13431,7 +13431,7 @@ struct llm_build_context { cur = ggml_mul_mat(ctx0, model.layers[il].wo, cur_attn); float wo_scale; std::memcpy(&wo_scale, model.layers[il].wo->op_params, sizeof(float)); - cur = ggml_scale(ctx0, cur, wo_scale); + if (fabsf(wo_scale-1) > 1e-4f) cur = ggml_scale(ctx0, cur, wo_scale); cb(cur, "kqv_out", il); } @@ -13460,7 +13460,7 @@ struct llm_build_context { cur = ggml_mul_mat(ctx0, model.layers[il].ffn_gate, cur); float ffn_gate_scale; std::memcpy(&ffn_gate_scale, model.layers[il].ffn_gate->op_params, sizeof(float)); - cur = ggml_scale(ctx0, cur, ffn_gate_scale); + if (fabsf(ffn_gate_scale-1) > 1e-4f) cur = ggml_scale(ctx0, cur, ffn_gate_scale); cb(cur, "ffn_gate", il); @@ -13479,7 +13479,7 @@ struct llm_build_context { cur = ggml_mul_mat(ctx0, model.layers[il].ffn_down, cur); float ffn_down_scale; std::memcpy(&ffn_down_scale, model.layers[il].ffn_down->op_params, sizeof(float)); - cur = ggml_scale(ctx0, cur, ffn_down_scale); + if 
(fabsf(ffn_down_scale-1) > 1e-4f) cur = ggml_scale(ctx0, cur, ffn_down_scale); cb(cur, "ffn_down", il); } cur = ggml_add(ctx0, cur, ffn_inp);