From b0ba33bec08e88f138c91ebbbd72b0dc6b14fc59 Mon Sep 17 00:00:00 2001 From: Iwan Kawrakow Date: Wed, 5 Feb 2025 19:45:45 +0200 Subject: [PATCH] iq1_m_r4: Zen4 gemm --- examples/quantize/quantize.cpp | 2 + ggml/src/iqk/iqk_mul_mat.cpp | 117 +++++++++++++++++++++++++++++++++ include/llama.h | 1 + src/llama.cpp | 16 +++-- 4 files changed, 132 insertions(+), 4 deletions(-) diff --git a/examples/quantize/quantize.cpp b/examples/quantize/quantize.cpp index 1c847e6b..7bdd8597 100644 --- a/examples/quantize/quantize.cpp +++ b/examples/quantize/quantize.cpp @@ -30,6 +30,7 @@ static const std::vector QUANT_OPTIONS = { { "IQ2_M_R4", LLAMA_FTYPE_MOSTLY_IQ2_M_R4, " 2.7 bpw quantization", }, { "IQ1_S", LLAMA_FTYPE_MOSTLY_IQ1_S, " 1.56 bpw quantization", }, { "IQ1_S_R4", LLAMA_FTYPE_MOSTLY_IQ1_S_R4, " 1.5 bpw quantization", }, + { "IQ1_M_R4", LLAMA_FTYPE_MOSTLY_IQ1_M_R4, " 1.75 bpw quantization", }, { "IQ1_M", LLAMA_FTYPE_MOSTLY_IQ1_M, " 1.75 bpw quantization", }, { "IQ1_BN", LLAMA_FTYPE_MOSTLY_IQ1_BN, " 1.62 bpw quantization (Bitnet)", }, { "IQ2_BN", LLAMA_FTYPE_MOSTLY_IQ2_BN, " 2.00 bpw quantization (Bitnet)", }, @@ -512,6 +513,7 @@ int main(int argc, char ** argv) { params.ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S || params.ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS_R4 || params.ftype == LLAMA_FTYPE_MOSTLY_IQ1_S || params.ftype == LLAMA_FTYPE_MOSTLY_IQ1_S_R4 || + params.ftype == LLAMA_FTYPE_MOSTLY_IQ1_M_R4 || params.ftype == LLAMA_FTYPE_MOSTLY_IQ1_M)) { fprintf(stderr, "\n==========================================================================================================\n"); fprintf(stderr, "Please do not use IQ1_S, IQ1_M, IQ2_S, IQ2_XXS, IQ2_XS or Q2_K_S quantization without an importance matrix\n"); diff --git a/ggml/src/iqk/iqk_mul_mat.cpp b/ggml/src/iqk/iqk_mul_mat.cpp index ea8e8274..081a90ab 100644 --- a/ggml/src/iqk/iqk_mul_mat.cpp +++ b/ggml/src/iqk/iqk_mul_mat.cpp @@ -260,6 +260,7 @@ struct MulMat { case GGML_TYPE_IQ2_S_R4: case GGML_TYPE_IQ3_XXS_R4: case GGML_TYPE_IQ1_S_R4: + case GGML_TYPE_IQ1_M_R4: case GGML_TYPE_IQ3_S_R4: return 4; case GGML_TYPE_IQ4_NL_R4: case GGML_TYPE_Q5_0_R4: @@ -295,6 +296,7 @@ struct MulMat { case GGML_TYPE_IQ3_XXS_R4: case GGML_TYPE_IQ3_S_R4: case GGML_TYPE_IQ1_S_R4: + case GGML_TYPE_IQ1_M_R4: case GGML_TYPE_IQ2_BN_R4: return 4; case GGML_TYPE_IQ4_XS_R4: case GGML_TYPE_Q4_0_R4: @@ -3609,6 +3611,106 @@ static void mul_mat_iq1_s_r4_q8_1(int n, const void * vx, size_t bx, const DataI } } +template +static void mul_mat_iq1_m_r4_q8_1(int n, const void * vx, size_t bx, const DataInfo& info, int nrc_x) { + GGML_ASSERT(nrc_x%4 == 0); + Q8 q8(info); + int nb = n / 32; + GGML_ASSERT(nb%4 == 0); + auto shuffle0 = _mm256_set_epi64x(0x0909090909090909, 0x0808080808080808, 0x0101010101010101, 0x0000000000000000); + auto step = _mm256_set1_epi8(2); +#ifndef HAVE_FANCY_SIMD + auto m1 = _mm256_set1_epi16(1); +#endif + __m256i qx[4]; + __m256 acc[nrc_y] = {}; + auto ms = _mm_set1_epi8(0x08); + float d8[8*nrc_y]; + union { __m256i vec; uint16_t val[16]; } helper; + for (int ix= 0; ix < nrc_x; ix += 4) { + auto dptr = (const ggml_half *)((const char *)vx + ix*bx); + auto d1 = _mm_mul_ps(_mm_set1_ps(0.125f), _mm_cvtph_ps(_mm_loadl_epi64((const __m128i *)dptr))); + auto x = (const block_iq1_m_r4 *)(dptr + 4); + for (int ib = 0; ib < nb/4; ++ib) { + for (int iy = 0; iy < nrc_y; ++iy) { + _mm256_storeu_ps(d8 + 8*iy, _mm256_cvtph_ps(_mm_loadu_si128((const __m128i *)q8.y[iy][ib].d))); + } + for (int k = 0; k < 4; ++k) { + auto qh = (const uint32_t *)x[4*ib+k].qh; + auto idxh = _mm_set_epi32(qh[1] >> 4, qh[1], qh[0] >> 4, qh[0]); + auto scales4 = _mm_set1_epi32(((const uint32_t *)x[4*ib+k].scales)[0]); + // r0s0, r1s0, r2s0, r3s0, r0s1, r1s1, r2s1, r3s1, r0s0, r1s0, r2s0, r3s0, r0s1, r1s1, r2s1, r3s1 + scales4 = _mm_and_si128(_mm_srlv_epi32(scales4, _mm_set_epi32(4, 0, 4, 0)), _mm_set1_epi8(0xf)); + scales4 = _mm_or_si128(_mm_slli_epi16(scales4, 1), _mm_set1_epi8(1)); + // r0s0, r0s0, r1s0, r1s0, r2s0, r2s0, r3s0, r3s0, r0s1, r0s1, r1s1, r1s1, r2s1, r2s1, r3s1, r3s1, + //scales4 = _mm_unpacklo_epi8(scales4, scales4); + scales4 = _mm_cvtepu8_epi16(scales4); + auto scales = MM256_SET_M128I(_mm_unpackhi_epi16(scales4, scales4), _mm_unpacklo_epi16(scales4, scales4)); + + auto signs128 = _mm_or_si128(_mm_cmpeq_epi8(_mm_and_si128(idxh, ms), ms), _mm_set1_epi8(1)); + signs128 = _mm_add_epi8(_mm_set1_epi8(-8), signs128); + auto signs = MM256_SET_M128I(signs128, signs128); + auto idxl = _mm256_cvtepu8_epi16(_mm_loadu_si128((const __m128i *)x[4*ib+k].qs)); + idxh = _mm_and_si128(idxh, _mm_set1_epi8(0x07)); + helper.vec = _mm256_or_si256(idxl, _mm256_slli_epi16(_mm256_cvtepu8_epi16(idxh), 8)); + qx[0] = _mm256_set_epi64x(iq1s_grid_us[helper.val[ 9]], iq1s_grid_us[helper.val[ 8]], + iq1s_grid_us[helper.val[ 1]], iq1s_grid_us[helper.val[ 0]]); + qx[1] = _mm256_set_epi64x(iq1s_grid_us[helper.val[13]], iq1s_grid_us[helper.val[12]], + iq1s_grid_us[helper.val[ 5]], iq1s_grid_us[helper.val[ 4]]); + qx[2] = _mm256_set_epi64x(iq1s_grid_us[helper.val[11]], iq1s_grid_us[helper.val[10]], + iq1s_grid_us[helper.val[ 3]], iq1s_grid_us[helper.val[ 2]]); + qx[3] = _mm256_set_epi64x(iq1s_grid_us[helper.val[15]], iq1s_grid_us[helper.val[14]], + iq1s_grid_us[helper.val[ 7]], iq1s_grid_us[helper.val[ 6]]); + qx[0] = _mm256_add_epi8(_mm256_slli_epi16(qx[0], 3), _mm256_shuffle_epi8(signs, shuffle0)); + auto shuffle = _mm256_add_epi8(shuffle0, step); + qx[2] = _mm256_add_epi8(_mm256_slli_epi16(qx[2], 3), _mm256_shuffle_epi8(signs, shuffle)); + shuffle = _mm256_add_epi8(shuffle, step); + qx[1] = _mm256_add_epi8(_mm256_slli_epi16(qx[1], 3), _mm256_shuffle_epi8(signs, shuffle)); + shuffle = _mm256_add_epi8(shuffle, step); + qx[3] = _mm256_add_epi8(_mm256_slli_epi16(qx[3], 3), _mm256_shuffle_epi8(signs, shuffle)); + auto s0 = _mm256_sign_epi8(qx[0], qx[0]); + auto s1 = _mm256_sign_epi8(qx[1], qx[1]); + auto s2 = _mm256_sign_epi8(qx[2], qx[2]); + auto s3 = _mm256_sign_epi8(qx[3], qx[3]); + for (int iy = 0; iy < nrc_y; ++iy) { + auto y = _mm256_loadu_si256((const __m256i *)q8.y[iy][ib].qs + k); + auto y1 = _mm256_shuffle_epi32(y, 0x44); + auto y2 = _mm256_shuffle_epi32(y, 0xee); +#ifdef HAVE_FANCY_SIMD + // 0,0, 1,1, 0,0, 1,1 as int32_t + auto sumi1 = _mm256_dpbusd_epi32(_mm256_dpbusd_epi32(_mm256_setzero_si256(), + s0, _mm256_sign_epi8(y1, qx[0])), s1, _mm256_sign_epi8(y2, qx[1])); + // 2,2, 3,3, 2,2, 3,3 as int32_t + auto sumi2 = _mm256_dpbusd_epi32(_mm256_dpbusd_epi32(_mm256_setzero_si256(), + s2, _mm256_sign_epi8(y1, qx[2])), s3, _mm256_sign_epi8(y2, qx[3])); + auto sumi = _mm256_packs_epi32(sumi1, sumi2); +#else + // 4 x row 0, 4 x row 1, 4 x row 0, 4 x row 1 + auto sumi1 = _mm256_add_epi16(_mm256_maddubs_epi16(s0, _mm256_sign_epi8(y1, qx[0])), + _mm256_maddubs_epi16(s1, _mm256_sign_epi8(y2, qx[1]))); + // 4 x row 2, 4 x row 3, 4 x row 2, 4 x row 3 + auto sumi2 = _mm256_add_epi16(_mm256_maddubs_epi16(s2, _mm256_sign_epi8(y1, qx[2])), + _mm256_maddubs_epi16(s3, _mm256_sign_epi8(y2, qx[3]))); + // 0,0, 1,1, 0,0, 1,1 as int32_t + sumi1 = _mm256_madd_epi16(m1, sumi1); + // 2,2, 3,3, 2,2, 3,3 as int32_t + sumi2 = _mm256_madd_epi16(m1, sumi2); + // 0,0, 1,1, 2,2, 3,3, 0,0, 1,1, 2,2, 3,3 as int16_t + auto sumi = _mm256_packs_epi32(sumi1, sumi2); +#endif + sumi = _mm256_madd_epi16(scales, sumi); + acc[iy] = _mm256_fmadd_ps(_mm256_set1_ps(d8[8*iy+k+0]), _mm256_cvtepi32_ps(sumi), acc[iy]); + } + } + } + for (int iy = 0; iy < nrc_y; ++iy) { + auto sumf = _mm_add_ps(_mm256_castps256_ps128(acc[iy]), _mm256_extractf128_ps(acc[iy], 1)); + info.store(ix, iy, _mm_mul_ps(d1, sumf)); + acc[iy] = _mm256_setzero_ps(); + } + } +} + #ifdef HAVE_FANCY_SIMD template static void mul_mat_q4_0_r4_q8_1(int n, const void * vx, size_t bx, const DataInfo& info, int nrc_x) { @@ -9078,6 +9180,21 @@ bool MulMat::prepare(int typeA, int typeB, int ne00, MulMat& mm, int Ny) { mm.funcs[7] = mul_mat_iq1_s_r4_q8_1<8>; #ifdef HAVE_FANCY_SIMD mm.func16 = mul_mat_iq1_s_r4_q8_1<16>; +#endif + expected_typeB = GGML_TYPE_Q8_1_X4; + break; + case GGML_TYPE_IQ1_M_R4: + assert (ne00 % QK4_NL == 0); + mm.funcs[0] = mul_mat_iq1_m_r4_q8_1<1>; + mm.funcs[1] = mul_mat_iq1_m_r4_q8_1<2>; + mm.funcs[2] = mul_mat_iq1_m_r4_q8_1<3>; + mm.funcs[3] = mul_mat_iq1_m_r4_q8_1<4>; + mm.funcs[4] = mul_mat_iq1_m_r4_q8_1<5>; + mm.funcs[5] = mul_mat_iq1_m_r4_q8_1<6>; + mm.funcs[6] = mul_mat_iq1_m_r4_q8_1<7>; + mm.funcs[7] = mul_mat_iq1_m_r4_q8_1<8>; +#ifdef HAVE_FANCY_SIMD + mm.func16 = mul_mat_iq1_m_r4_q8_1<16>; #endif expected_typeB = GGML_TYPE_Q8_1_X4; break; diff --git a/include/llama.h b/include/llama.h index 0f6d15ac..3f25b296 100644 --- a/include/llama.h +++ b/include/llama.h @@ -197,6 +197,7 @@ extern "C" { LLAMA_FTYPE_MOSTLY_IQ3_S_R4 = 226, // except 1d tensors LLAMA_FTYPE_MOSTLY_IQ2_M_R4 = 229, // except 1d tensors LLAMA_FTYPE_MOSTLY_IQ4_XS_R4 = 230, // except 1d tensors + LLAMA_FTYPE_MOSTLY_IQ1_M_R4 = 231, // except 1d tensors LLAMA_FTYPE_MOSTLY_Q6_0_R4 = 335, // except 1d tensors LLAMA_FTYPE_MOSTLY_BF16_R16 = 232, // except 1d tensors LLAMA_FTYPE_MOSTLY_IQ2_BN_R4 = 337, // except 1d tensors diff --git a/src/llama.cpp b/src/llama.cpp index 943b945a..117f59be 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -3955,6 +3955,7 @@ struct llama_model_loader { case GGML_TYPE_IQ3_XXS_R4: ftype = LLAMA_FTYPE_MOSTLY_IQ3_XXS_R4; break; case GGML_TYPE_IQ1_S: ftype = LLAMA_FTYPE_MOSTLY_IQ1_S; break; case GGML_TYPE_IQ1_S_R4:ftype = LLAMA_FTYPE_MOSTLY_IQ1_S_R4;break; + case GGML_TYPE_IQ1_M_R4:ftype = LLAMA_FTYPE_MOSTLY_IQ1_M_R4;break; case GGML_TYPE_IQ1_M: ftype = LLAMA_FTYPE_MOSTLY_IQ1_M; break; case GGML_TYPE_IQ1_BN: ftype = LLAMA_FTYPE_MOSTLY_IQ1_BN; break; case GGML_TYPE_IQ2_BN: ftype = LLAMA_FTYPE_MOSTLY_IQ2_BN; break; @@ -4690,6 +4691,7 @@ static std::string llama_model_ftype_name(llama_ftype ftype) { case LLAMA_FTYPE_MOSTLY_IQ3_XXS_R4: return "IQ3_XXS_R4 - 3.0625 bpw"; case LLAMA_FTYPE_MOSTLY_IQ1_S: return "IQ1_S - 1.5625 bpw"; case LLAMA_FTYPE_MOSTLY_IQ1_S_R4: return "IQ1_S_R4 - 1.5 bpw"; + case LLAMA_FTYPE_MOSTLY_IQ1_M_R4: return "IQ1_M_R4 - 1.75 bpw"; case LLAMA_FTYPE_MOSTLY_IQ1_M: return "IQ1_M - 1.75 bpw"; case LLAMA_FTYPE_MOSTLY_IQ4_NL: return "IQ4_NL - 4.5 bpw"; case LLAMA_FTYPE_MOSTLY_IQ4_NL_R4:return "IQ4_NL_R4 - 4.5 bpw"; @@ -15969,7 +15971,8 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n ftype == LLAMA_FTYPE_MOSTLY_IQ1_M || ftype == LLAMA_FTYPE_MOSTLY_IQ2_K || ftype == LLAMA_FTYPE_MOSTLY_IQ3_K || ftype == LLAMA_FTYPE_MOSTLY_IQ2_KS || ftype == LLAMA_FTYPE_MOSTLY_IQ3_K_R4 || ftype == LLAMA_FTYPE_MOSTLY_IQ2_K_R4 || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS_R4 || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS_R4 || - ftype == LLAMA_FTYPE_MOSTLY_IQ2_M_R4 || ftype == LLAMA_FTYPE_MOSTLY_IQ1_S_R4) { + ftype == LLAMA_FTYPE_MOSTLY_IQ2_M_R4 || ftype == LLAMA_FTYPE_MOSTLY_IQ1_S_R4 || + ftype == LLAMA_FTYPE_MOSTLY_IQ1_M_R4) { new_type = !qs.has_output ? GGML_TYPE_IQ4_K : GGML_TYPE_Q5_K; } else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS_R4 || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS_R4) { @@ -15991,7 +15994,7 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ1_S || ftype == LLAMA_FTYPE_MOSTLY_IQ1_M || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS_R4 || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS_R4 || - ftype == LLAMA_FTYPE_MOSTLY_IQ1_S_R4) { + ftype == LLAMA_FTYPE_MOSTLY_IQ1_S_R4 || ftype == LLAMA_FTYPE_MOSTLY_IQ1_M_R4) { new_type = GGML_TYPE_Q2_K; } else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M_R4) { @@ -16068,7 +16071,7 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n new_type = GGML_TYPE_BF16; } } - } else if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_S_R4) { + } else if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_S_R4 || ftype == LLAMA_FTYPE_MOSTLY_IQ1_M_R4) { if (name.find("attn_v.weight") != std::string::npos) { if (qs.model.hparams.n_expert >= 4 || qs.model.hparams.n_gqa() >= 4) new_type = GGML_TYPE_IQ4_K_R4; else if (qs.model.hparams.n_gqa() >= 2) new_type = GGML_TYPE_IQ3_K_R4; @@ -16134,7 +16137,6 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n new_type = GGML_TYPE_Q5_K; } else { if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_S || ftype == LLAMA_FTYPE_MOSTLY_IQ1_M) new_type = GGML_TYPE_IQ2_K; - else if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_S_R4) new_type = GGML_TYPE_IQ2_K_R4; else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || is_iq2_m) new_type = GGML_TYPE_IQ3_S; } } @@ -16580,6 +16582,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s case LLAMA_FTYPE_MOSTLY_IQ3_XXS_R4: default_type = GGML_TYPE_IQ3_XXS_R4; break; case LLAMA_FTYPE_MOSTLY_IQ1_S: default_type = GGML_TYPE_IQ1_S; break; case LLAMA_FTYPE_MOSTLY_IQ1_S_R4:default_type = GGML_TYPE_IQ1_S_R4;break; + case LLAMA_FTYPE_MOSTLY_IQ1_M_R4:default_type = GGML_TYPE_IQ1_M_R4;break; case LLAMA_FTYPE_MOSTLY_IQ1_M: default_type = GGML_TYPE_IQ1_M; break; case LLAMA_FTYPE_MOSTLY_IQ1_BN: default_type = GGML_TYPE_IQ1_BN; break; case LLAMA_FTYPE_MOSTLY_IQ2_BN: default_type = GGML_TYPE_IQ2_BN; break; @@ -16934,6 +16937,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s new_type == GGML_TYPE_IQ2_S_R4|| new_type == GGML_TYPE_IQ1_S || new_type == GGML_TYPE_IQ1_S_R4|| + new_type == GGML_TYPE_IQ1_M_R4|| (new_type == GGML_TYPE_IQ1_M && strcmp(tensor->name, "token_embd.weight") && strcmp(tensor->name, "output.weight")) || (new_type == GGML_TYPE_Q2_K && params->ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S && strcmp(tensor->name, "token_embd.weight") != 0))) { LLAMA_LOG_ERROR("\n\n============================================================\n"); @@ -17057,6 +17061,10 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s if (tensor->ne[1] % 4 != 0) new_type = GGML_TYPE_IQ1_S; else chunk_size_multiplier = 4; } + else if (new_type == GGML_TYPE_IQ1_M_R4) { + if (tensor->ne[1] % 4 != 0) new_type = GGML_TYPE_IQ1_M; + else chunk_size_multiplier = 4; + } else if (new_type == GGML_TYPE_BF16_R16) { if (tensor->ne[1] % 16 != 0) new_type = GGML_TYPE_BF16; else chunk_size_multiplier = 16;