From 6e31b493b31399a159039f24e9981166a39cb351 Mon Sep 17 00:00:00 2001
From: Iwan Kawrakow
Date: Tue, 4 Feb 2025 14:56:25 +0200
Subject: [PATCH] iq1_s_r4: gemm/gemv works on AVX2/Zen4

---
 examples/quantize/quantize.cpp |   2 +
 ggml/src/iqk/iqk_mul_mat.cpp   | 103 +++++++++++++++++++++++++++++++++
 include/llama.h                |   1 +
 src/llama.cpp                  |  41 ++++++++++++-
 4 files changed, 145 insertions(+), 2 deletions(-)

diff --git a/examples/quantize/quantize.cpp b/examples/quantize/quantize.cpp
index 5ffdbc84..1c847e6b 100644
--- a/examples/quantize/quantize.cpp
+++ b/examples/quantize/quantize.cpp
@@ -29,6 +29,7 @@ static const std::vector<struct quant_option> QUANT_OPTIONS = {
     { "IQ2_M",    LLAMA_FTYPE_MOSTLY_IQ2_M,    " 2.7 bpw quantization", },
     { "IQ2_M_R4", LLAMA_FTYPE_MOSTLY_IQ2_M_R4, " 2.7 bpw quantization", },
     { "IQ1_S",    LLAMA_FTYPE_MOSTLY_IQ1_S,    " 1.56 bpw quantization", },
+    { "IQ1_S_R4", LLAMA_FTYPE_MOSTLY_IQ1_S_R4, " 1.5 bpw quantization", },
     { "IQ1_M",    LLAMA_FTYPE_MOSTLY_IQ1_M,    " 1.75 bpw quantization", },
     { "IQ1_BN",   LLAMA_FTYPE_MOSTLY_IQ1_BN,   " 1.62 bpw quantization (Bitnet)", },
     { "IQ2_BN",   LLAMA_FTYPE_MOSTLY_IQ2_BN,   " 2.00 bpw quantization (Bitnet)", },
@@ -510,6 +511,7 @@ int main(int argc, char ** argv) {
         params.ftype == LLAMA_FTYPE_MOSTLY_IQ2_S  || params.ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS_R4 ||
         params.ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S || params.ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS_R4  ||
         params.ftype == LLAMA_FTYPE_MOSTLY_IQ1_S  ||
+        params.ftype == LLAMA_FTYPE_MOSTLY_IQ1_S_R4 ||
         params.ftype == LLAMA_FTYPE_MOSTLY_IQ1_M)) {
         fprintf(stderr, "\n==========================================================================================================\n");
         fprintf(stderr, "Please do not use IQ1_S, IQ1_M, IQ2_S, IQ2_XXS, IQ2_XS or Q2_K_S quantization without an importance matrix\n");
diff --git a/ggml/src/iqk/iqk_mul_mat.cpp b/ggml/src/iqk/iqk_mul_mat.cpp
index 3bff3c41..307a1410 100644
--- a/ggml/src/iqk/iqk_mul_mat.cpp
+++ b/ggml/src/iqk/iqk_mul_mat.cpp
@@ -2745,6 +2745,94 @@ static void mul_mat_q4_0_r4_q8_1_avx2(int n, const void * vx, size_t bx, const D
     }
 }
 
+template <int nrc_y>
+static void mul_mat_iq1_s_r4_q8_1(int n, const void * vx, size_t bx, const DataInfo& info, int nrc_x) {
+    GGML_ASSERT(nrc_x%4 == 0);
+    Q8<nrc_y, block_q8_1_x4> q8(info);
+    int nb = n / 32;
+    GGML_ASSERT(nb%4 == 0);
+    __m256i qx[4];
+    __m256  acc[nrc_y] = {};
+    auto m1 = _mm256_set1_epi16(1);
+    auto m0 = _mm256_set1_epi8(1);
+    auto ms = _mm_set1_epi16(-32768);
+    //auto delta_0 = _mm256_set1_ps(IQ1S_DELTA);
+    float d8[8*nrc_y];
+    for (int ix= 0; ix < nrc_x; ix += 4) {
+        auto dptr = (const ggml_half *)((const char *)vx + ix*bx);
+        auto d1 = _mm_cvtph_ps(_mm_loadl_epi64((const __m128i *)dptr));
+        auto x = (const block_iq1_s_r4 *)(dptr + 4);
+        for (int ib = 0; ib < nb/4; ++ib) {
+            for (int iy = 0; iy < nrc_y; ++iy) {
+                _mm256_storeu_ps(d8 + 8*iy, _mm256_cvtph_ps(_mm_loadu_si128((const __m128i *)q8.y[iy][ib].d)));
+            }
+            for (int k = 0; k < 4; ++k) {
+                const uint64_t * s64 = (const uint64_t *)x[4*ib+k].qh;
+                auto sas = _mm_set1_epi64x(s64[0]);
+                //auto delta = _mm256_or_ps(delta_0, _mm256_castsi256_ps(_mm256_cvtepu16_epi32(_mm_and_si128(sas, _mm_set1_epi16(-32768)))));
+                auto scales4 = _mm_and_si128(_mm_srli_epi16(sas, 12), _mm_set1_epi16(7));
+                scales4 = _mm_or_si128(_mm_slli_epi16(scales4, 1), _mm_set1_epi16(1));
+                auto signs = _mm_or_si128(_mm_cmpeq_epi16(_mm_and_si128(sas, ms), ms), _mm256_castsi256_si128(m1));
+                auto delta4 = _mm_mul_ps(_mm_set1_ps(IQ1S_DELTA), _mm_cvtepi32_ps(_mm_cvtepi16_epi32(_mm_sign_epi16(scales4, signs))));
+                auto delta = _mm256_set_m128(delta4, delta4);
+                scales4 = _mm_unpacklo_epi16(scales4, scales4); // 0,0, 1,1, 2,2, 3,3
+                auto scales = MM256_SET_M128I(scales4, scales4);
+                qx[0] = _mm256_set_epi64x(iq1s_grid[x[4*ib+k].qs[ 9] | ((x[4*ib+k].qh[1] << 2) & 0x0700)],
+                                          iq1s_grid[x[4*ib+k].qs[ 8] | ((x[4*ib+k].qh[0] << 2) & 0x0700)],
+                                          iq1s_grid[x[4*ib+k].qs[ 1] | ((x[4*ib+k].qh[1] << 8) & 0x0700)],
+                                          iq1s_grid[x[4*ib+k].qs[ 0] | ((x[4*ib+k].qh[0] << 8) & 0x0700)]);
+                qx[1] = _mm256_set_epi64x(iq1s_grid[x[4*ib+k].qs[13] | ((x[4*ib+k].qh[1] >> 1) & 0x0700)],
+                                          iq1s_grid[x[4*ib+k].qs[12] | ((x[4*ib+k].qh[0] >> 1) & 0x0700)],
+                                          iq1s_grid[x[4*ib+k].qs[ 5] | ((x[4*ib+k].qh[1] << 5) & 0x0700)],
+                                          iq1s_grid[x[4*ib+k].qs[ 4] | ((x[4*ib+k].qh[0] << 5) & 0x0700)]);
+                qx[2] = _mm256_set_epi64x(iq1s_grid[x[4*ib+k].qs[11] | ((x[4*ib+k].qh[3] << 2) & 0x0700)],
+                                          iq1s_grid[x[4*ib+k].qs[10] | ((x[4*ib+k].qh[2] << 2) & 0x0700)],
+                                          iq1s_grid[x[4*ib+k].qs[ 3] | ((x[4*ib+k].qh[3] << 8) & 0x0700)],
+                                          iq1s_grid[x[4*ib+k].qs[ 2] | ((x[4*ib+k].qh[2] << 8) & 0x0700)]);
+                qx[3] = _mm256_set_epi64x(iq1s_grid[x[4*ib+k].qs[15] | ((x[4*ib+k].qh[3] >> 1) & 0x0700)],
+                                          iq1s_grid[x[4*ib+k].qs[14] | ((x[4*ib+k].qh[2] >> 1) & 0x0700)],
+                                          iq1s_grid[x[4*ib+k].qs[ 7] | ((x[4*ib+k].qh[3] << 5) & 0x0700)],
+                                          iq1s_grid[x[4*ib+k].qs[ 6] | ((x[4*ib+k].qh[2] << 5) & 0x0700)]);
+                for (int iy = 0; iy < nrc_y; ++iy) {
+                    auto y = _mm256_loadu_si256((const __m256i *)q8.y[iy][ib].qs + k);
+#ifdef HAVE_FANCY_SIMD
+                    // 0,0, 1,1, 0,0, 1,1 as int32_t
+                    auto sumi1 = _mm256_dpbusd_epi32(_mm256_dpbusd_epi32(_mm256_setzero_si256(),
+                                    m0, _mm256_sign_epi8(_mm256_shuffle_epi32(y, 0x44), qx[0])),
+                                    m0, _mm256_sign_epi8(_mm256_shuffle_epi32(y, 0xee), qx[1]));
+                    // 2,2, 3,3, 2,2, 3,3 as int32_t
+                    auto sumi2 = _mm256_dpbusd_epi32(_mm256_dpbusd_epi32(_mm256_setzero_si256(),
+                                    m0, _mm256_sign_epi8(_mm256_shuffle_epi32(y, 0x44), qx[2])),
+                                    m0, _mm256_sign_epi8(_mm256_shuffle_epi32(y, 0xee), qx[3]));
+                    auto sumi = _mm256_packs_epi32(sumi1, sumi2);
+#else
+                    // 4 x row 0, 4 x row 1, 4 x row 0, 4 x row 1
+                    auto sumi1 = _mm256_add_epi16(_mm256_maddubs_epi16(m0, _mm256_sign_epi8(_mm256_shuffle_epi32(y, 0x44), qx[0])),
+                                                  _mm256_maddubs_epi16(m0, _mm256_sign_epi8(_mm256_shuffle_epi32(y, 0xee), qx[1])));
+                    // 4 x row 2, 4 x row 3, 4 x row 2, 4 x row 3
+                    auto sumi2 = _mm256_add_epi16(_mm256_maddubs_epi16(m0, _mm256_sign_epi8(_mm256_shuffle_epi32(y, 0x44), qx[2])),
+                                                  _mm256_maddubs_epi16(m0, _mm256_sign_epi8(_mm256_shuffle_epi32(y, 0xee), qx[3])));
+                    // 0,0, 1,1, 0,0, 1,1 as int32_t
+                    sumi1 = _mm256_madd_epi16(m1, sumi1);
+                    // 2,2, 3,3, 2,2, 3,3 as int32_t
+                    sumi2 = _mm256_madd_epi16(m1, sumi2);
+                    // 0,0, 1,1, 2,2, 3,3, 0,0, 1,1, 2,2, 3,3 as int16_t
+                    auto sumi = _mm256_packs_epi32(sumi1, sumi2);
+#endif
+                    sumi = _mm256_madd_epi16(scales, sumi);
+                    acc[iy] = _mm256_fmadd_ps(_mm256_set1_ps(d8[8*iy+k+0]), _mm256_cvtepi32_ps(sumi), acc[iy]);
+                    acc[iy] = _mm256_fmadd_ps(_mm256_set1_ps(d8[8*iy+k+4]), delta, acc[iy]);
+                }
+            }
+        }
+        for (int iy = 0; iy < nrc_y; ++iy) {
+            auto sumf = _mm_add_ps(_mm256_castps256_ps128(acc[iy]), _mm256_extractf128_ps(acc[iy], 1));
+            info.store(ix, iy, _mm_mul_ps(d1, sumf));
+            acc[iy] = _mm256_setzero_ps();
+        }
+    }
+}
+
 #ifdef HAVE_FANCY_SIMD
 template <int nrc_y>
 static void mul_mat_q4_0_r4_q8_1(int n, const void * vx, size_t bx, const DataInfo& info, int nrc_x) {
@@ -8202,6 +8290,21 @@ bool MulMat::prepare(int typeA, int typeB, int ne00, MulMat& mm, int Ny) {
             mm.funcs[7] = mul_mat_q8_0_r4_q8_1<8>;
             expected_typeB = GGML_TYPE_Q8_1_X4;
             break;
+        case GGML_TYPE_IQ1_S_R4:
+            assert (ne00 % QK4_NL == 0);
+            mm.funcs[0] = mul_mat_iq1_s_r4_q8_1<1>;
+            mm.funcs[1] = mul_mat_iq1_s_r4_q8_1<2>;
+            mm.funcs[2] = mul_mat_iq1_s_r4_q8_1<3>;
+            mm.funcs[3] = mul_mat_iq1_s_r4_q8_1<4>;
+            mm.funcs[4] = mul_mat_iq1_s_r4_q8_1<5>;
+            mm.funcs[5] = mul_mat_iq1_s_r4_q8_1<6>;
+            mm.funcs[6] = mul_mat_iq1_s_r4_q8_1<7>;
+            mm.funcs[7] = mul_mat_iq1_s_r4_q8_1<8>;
+#ifdef HAVE_FANCY_SIMD
+            mm.func16 = mul_mat_iq1_s_r4_q8_1<16>;
+#endif
+            expected_typeB = GGML_TYPE_Q8_1_X4;
+            break;
         default:
             return false;
diff --git a/include/llama.h b/include/llama.h
index c21671c6..0f6d15ac 100644
--- a/include/llama.h
+++ b/include/llama.h
@@ -192,6 +192,7 @@ extern "C" {
         LLAMA_FTYPE_MOSTLY_IQ2_XXS_R4    = 219, // except 1d tensors
         LLAMA_FTYPE_MOSTLY_IQ2_XS_R4     = 220, // except 1d tensors
         LLAMA_FTYPE_MOSTLY_IQ3_XXS_R4    = 223, // except 1d tensors
+        LLAMA_FTYPE_MOSTLY_IQ1_S_R4      = 224, // except 1d tensors
         LLAMA_FTYPE_MOSTLY_IQ4_NL_R4     = 225, // except 1d tensors
         LLAMA_FTYPE_MOSTLY_IQ3_S_R4      = 226, // except 1d tensors
         LLAMA_FTYPE_MOSTLY_IQ2_M_R4      = 229, // except 1d tensors
diff --git a/src/llama.cpp b/src/llama.cpp
index 570c056c..0020c77f 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -3954,6 +3954,7 @@ struct llama_model_loader {
             case GGML_TYPE_IQ3_XXS:    ftype = LLAMA_FTYPE_MOSTLY_IQ3_XXS;    break;
             case GGML_TYPE_IQ3_XXS_R4: ftype = LLAMA_FTYPE_MOSTLY_IQ3_XXS_R4; break;
             case GGML_TYPE_IQ1_S:      ftype = LLAMA_FTYPE_MOSTLY_IQ1_S;      break;
+            case GGML_TYPE_IQ1_S_R4:ftype = LLAMA_FTYPE_MOSTLY_IQ1_S_R4;break;
             case GGML_TYPE_IQ1_M:      ftype = LLAMA_FTYPE_MOSTLY_IQ1_M;      break;
             case GGML_TYPE_IQ1_BN:     ftype = LLAMA_FTYPE_MOSTLY_IQ1_BN;     break;
             case GGML_TYPE_IQ2_BN:     ftype = LLAMA_FTYPE_MOSTLY_IQ2_BN;     break;
@@ -4688,6 +4689,7 @@ static std::string llama_model_ftype_name(llama_ftype ftype) {
         case LLAMA_FTYPE_MOSTLY_IQ3_XXS:    return "IQ3_XXS - 3.0625 bpw";
         case LLAMA_FTYPE_MOSTLY_IQ3_XXS_R4: return "IQ3_XXS_R4 - 3.0625 bpw";
         case LLAMA_FTYPE_MOSTLY_IQ1_S:      return "IQ1_S - 1.5625 bpw";
+        case LLAMA_FTYPE_MOSTLY_IQ1_S_R4:   return "IQ1_S_R4 - 1.5 bpw";
         case LLAMA_FTYPE_MOSTLY_IQ1_M:      return "IQ1_M - 1.75 bpw";
         case LLAMA_FTYPE_MOSTLY_IQ4_NL:     return "IQ4_NL - 4.5 bpw";
         case LLAMA_FTYPE_MOSTLY_IQ4_NL_R4:  return "IQ4_NL_R4 - 4.5 bpw";
@@ -15966,7 +15968,8 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
             ftype == LLAMA_FTYPE_MOSTLY_IQ1_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M ||
             ftype == LLAMA_FTYPE_MOSTLY_IQ1_M || ftype == LLAMA_FTYPE_MOSTLY_IQ2_K || ftype == LLAMA_FTYPE_MOSTLY_IQ3_K ||
             ftype == LLAMA_FTYPE_MOSTLY_IQ2_KS || ftype == LLAMA_FTYPE_MOSTLY_IQ3_K_R4 || ftype == LLAMA_FTYPE_MOSTLY_IQ2_K_R4 ||
-            ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS_R4 || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS_R4 || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M_R4) {
+            ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS_R4 || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS_R4 ||
+            ftype == LLAMA_FTYPE_MOSTLY_IQ2_M_R4 || ftype == LLAMA_FTYPE_MOSTLY_IQ1_S_R4) {
             new_type = !qs.has_output ? GGML_TYPE_IQ4_K : GGML_TYPE_Q5_K;
         } else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS_R4 || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS_R4) {
@@ -15987,7 +15990,8 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
         } else {
             if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS ||
                 ftype == LLAMA_FTYPE_MOSTLY_IQ1_S || ftype == LLAMA_FTYPE_MOSTLY_IQ1_M ||
-                ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS_R4 || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS_R4) {
+                ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS_R4 || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS_R4 ||
+                ftype == LLAMA_FTYPE_MOSTLY_IQ1_S_R4) {
                 new_type = GGML_TYPE_Q2_K;
             }
             else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M_R4) {
@@ -16064,6 +16068,32 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
                 new_type = GGML_TYPE_BF16;
             }
         }
+    } else if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_S_R4) {
+        if (name.find("attn_v.weight") != std::string::npos) {
+            if (qs.model.hparams.n_expert >= 4 || qs.model.hparams.n_gqa() >= 4) new_type = GGML_TYPE_IQ4_K_R4;
+            else if (qs.model.hparams.n_gqa() >= 2) new_type = GGML_TYPE_IQ3_K_R4;
+            else new_type = GGML_TYPE_Q2_K_R4;
+            ++qs.i_attention_wv;
+        }
+        else if (qs.model.hparams.n_expert >= 8 && name.find("attn_k") != std::string::npos) {
+            new_type = GGML_TYPE_Q4_K_R4;
+        }
+        else if (qs.model.hparams.n_expert >= 8 && name.find("attn_q") != std::string::npos) {
+            new_type = GGML_TYPE_Q4_K_R4;
+        }
+        else if (name.find("attn_qkv.weight") != std::string::npos) {
+            new_type = GGML_TYPE_IQ2_K_R4;
+        }
+        else if (name.find("ffn_down") != std::string::npos) {
+            auto [i_layer, n_layer] = layer_info(qs.i_ffn_down, qs.n_ffn_down, name.c_str());
+            if (qs.params->ffn_down_type < GGML_TYPE_COUNT) new_type = qs.params->ffn_down_type;
+            else if (i_layer < n_layer/8) {
+                new_type = GGML_TYPE_Q2_K_R4;
+            }
+        }
+        else if (name.find("attn_output.weight") != std::string::npos) {
+            new_type = qs.model.hparams.n_expert >= 4 ? GGML_TYPE_Q5_K_R4 : GGML_TYPE_IQ2_K_R4;
+        }
     } else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ1_S ||
                ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M || ftype == LLAMA_FTYPE_MOSTLY_IQ1_M ||
                ftype == LLAMA_FTYPE_MOSTLY_IQ2_KS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS_R4 || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS_R4 ||
@@ -16095,6 +16125,7 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
             new_type = GGML_TYPE_Q5_K;
         } else {
             if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_S || ftype == LLAMA_FTYPE_MOSTLY_IQ1_M) new_type = GGML_TYPE_IQ2_K;
+            else if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_S_R4) new_type = GGML_TYPE_IQ2_K_R4;
             else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || is_iq2_m) new_type = GGML_TYPE_IQ3_S;
         }
     }
@@ -16539,6 +16570,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
         case LLAMA_FTYPE_MOSTLY_IQ3_XXS:    default_type = GGML_TYPE_IQ3_XXS;    break;
         case LLAMA_FTYPE_MOSTLY_IQ3_XXS_R4: default_type = GGML_TYPE_IQ3_XXS_R4; break;
         case LLAMA_FTYPE_MOSTLY_IQ1_S:      default_type = GGML_TYPE_IQ1_S;      break;
+        case LLAMA_FTYPE_MOSTLY_IQ1_S_R4:default_type = GGML_TYPE_IQ1_S_R4;break;
         case LLAMA_FTYPE_MOSTLY_IQ1_M:      default_type = GGML_TYPE_IQ1_M;      break;
         case LLAMA_FTYPE_MOSTLY_IQ1_BN:     default_type = GGML_TYPE_IQ1_BN;     break;
         case LLAMA_FTYPE_MOSTLY_IQ2_BN:     default_type = GGML_TYPE_IQ2_BN;     break;
@@ -16892,6 +16924,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
                    new_type == GGML_TYPE_IQ2_S   || new_type == GGML_TYPE_IQ2_S_R4||
                    new_type == GGML_TYPE_IQ1_S   ||
+                   new_type == GGML_TYPE_IQ1_S_R4||
                   (new_type == GGML_TYPE_IQ1_M && strcmp(tensor->name, "token_embd.weight") && strcmp(tensor->name, "output.weight")) ||
                   (new_type == GGML_TYPE_Q2_K && params->ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S && strcmp(tensor->name, "token_embd.weight") != 0))) {
                 LLAMA_LOG_ERROR("\n\n============================================================\n");
@@ -17011,6 +17044,10 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
                 if (tensor->ne[1] % 4 != 0) new_type = GGML_TYPE_IQ3_S;
                 else chunk_size_multiplier = 4;
             }
+            else if (new_type == GGML_TYPE_IQ1_S_R4) {
+                if (tensor->ne[1] % 4 != 0) new_type = GGML_TYPE_IQ1_S;
+                else chunk_size_multiplier = 4;
+            }
             else if (new_type == GGML_TYPE_BF16_R16) {
                 if (tensor->ne[1] % 16 != 0) new_type = GGML_TYPE_BF16;
                 else chunk_size_multiplier = 16;
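
A scalar sketch of the dequantization that mul_mat_iq1_s_r4_q8_1 implements may help when reading the AVX2/Zen4 kernel above. It is inferred from how the kernel indexes qs/qh and is not part of the commit: the struct layout, the group-to-column ordering, and the 0.125f value assumed for IQ1S_DELTA are all assumptions, and iq1s_grid is only declared here (ggml's 2048-entry IQ1_S codebook of packed int8 values in {-1, 0, +1}).

    #include <cstdint>
    #include <cstring>

    // Assumed value of ggml's IQ1S_DELTA (the shared per-block offset).
    constexpr float kIq1sDelta = 0.125f;

    // ggml's IQ1_S codebook: 2048 entries, each packing 8 int8 values in {-1, 0, +1}.
    // Declared here for illustration only; it is defined inside ggml.
    extern const uint64_t iq1s_grid[2048];

    // Assumed layout of one IQ1_S_R4 block: 4 interleaved rows x 32 columns.
    // The four f16 row scales d[0..3] are stored once per 4-row group, in front of that group's blocks.
    struct block_iq1_s_r4_sketch {
        uint8_t  qs[16];   // low 8 index bits; qs[4*g + r] belongs to group g (of 8 columns) of row r
        uint16_t qh[4];    // per row r: 4 x 3 high index bits, a 3-bit scale (bits 12..14), delta sign (bit 15)
    };

    // Dequantize one block into y[row][column]; d[] are the row scales already converted to float.
    static void dequant_block_iq1_s_r4_sketch(const block_iq1_s_r4_sketch * x, const float d[4], float y[4][32]) {
        for (int r = 0; r < 4; ++r) {
            const float dl    = d[r] * (2*((x->qh[r] >> 12) & 7) + 1);              // odd scales 1,3,...,15
            const float delta = (x->qh[r] & 0x8000) ? -kIq1sDelta : kIq1sDelta;     // per-row offset for this block
            for (int g = 0; g < 4; ++g) {                                           // 4 groups of 8 columns
                const int idx = x->qs[4*g + r] | (((x->qh[r] >> (3*g)) & 7) << 8);  // 11-bit codebook index
                int8_t grid[8];
                std::memcpy(grid, &iq1s_grid[idx], 8);                              // little-endian unpack of one entry
                for (int j = 0; j < 8; ++j) {
                    y[r][8*g + j] = dl * (grid[j] + delta);
                }
            }
        }
    }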
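Two related notes. First, the kernel never materializes the dequantized weights: because each weight is d*(2s+1)*(g + delta), the dot product with a Q8_1 row splits into d*(2s+1)*sum(g_j*q_j), accumulated with _mm256_sign_epi8 plus _mm256_maddubs_epi16 (or _mm256_dpbusd_epi32 when HAVE_FANCY_SIMD is defined), and d*(2s+1)*delta*sum(q_j), which reuses the per-sub-block sums the Q8_1_X4 layout already stores; those are the two _mm256_fmadd_ps calls per sub-block. Second, to try the new type end to end, something along these lines should work (binary name and file names are placeholders, not taken from the patch):

    ./bin/llama-quantize --imatrix imatrix.dat model-f16.gguf model-iq1_s_r4.gguf IQ1_S_R4

As the check added in quantize.cpp makes explicit, IQ1_S_R4 is not meant to be used without an importance matrix.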