diff --git a/examples/quantize/quantize.cpp b/examples/quantize/quantize.cpp
index 1ace5720..8e0d0969 100644
--- a/examples/quantize/quantize.cpp
+++ b/examples/quantize/quantize.cpp
@@ -44,6 +44,7 @@ static const std::vector<struct quant_option> QUANT_OPTIONS = {
     { "IQ4_NL",   LLAMA_FTYPE_MOSTLY_IQ4_NL,   " 4.50 bpw non-linear quantization", },
     { "IQ4_XS",   LLAMA_FTYPE_MOSTLY_IQ4_XS,   " 4.25 bpw non-linear quantization", },
     { "IQ4_KS",   LLAMA_FTYPE_MOSTLY_IQ4_KS,   " 4.25 bpw non-linear quantization", },
+    { "IQ4_KSS",  LLAMA_FTYPE_MOSTLY_IQ4_KSS,  " 4.0 bpw non-linear quantization",  },
     { "IQ2_K",    LLAMA_FTYPE_MOSTLY_IQ2_K,    " 2.375 bpw non-linear quantization",},
     { "IQ2_KS",   LLAMA_FTYPE_MOSTLY_IQ2_KS,   " 2.1875 bpw non-linear quantization",},
     { "IQ3_K",    LLAMA_FTYPE_MOSTLY_IQ3_K,    " 3.44 bpw non-linear quantization", },
diff --git a/ggml/src/ggml-cuda.cu b/ggml/src/ggml-cuda.cu
index 6648b7f8..e26f36a0 100644
--- a/ggml/src/ggml-cuda.cu
+++ b/ggml/src/ggml-cuda.cu
@@ -2829,6 +2829,7 @@ GGML_CALL static bool ggml_backend_cuda_supports_op(ggml_backend_t backend, cons
                     case GGML_TYPE_IQ4_NL:
                     case GGML_TYPE_IQ4_XS:
                     case GGML_TYPE_IQ4_KS:
+                    case GGML_TYPE_IQ4_KSS:
                     case GGML_TYPE_IQ2_K:
                     case GGML_TYPE_IQ2_KS:
                     case GGML_TYPE_IQ3_K:
diff --git a/ggml/src/ggml-cuda/common.cuh b/ggml/src/ggml-cuda/common.cuh
index a6a9c3d3..a5658a24 100644
--- a/ggml/src/ggml-cuda/common.cuh
+++ b/ggml/src/ggml-cuda/common.cuh
@@ -543,6 +543,13 @@ struct ggml_cuda_type_traits<GGML_TYPE_IQ4_KS> {
     static constexpr int qi = QI4_XS;
 };
 
+template<>
+struct ggml_cuda_type_traits<GGML_TYPE_IQ4_KSS> {
+    static constexpr int qk = QK_K;
+    static constexpr int qr = QR4_XS;
+    static constexpr int qi = QI4_XS;
+};
+
 template<>
 struct ggml_cuda_type_traits<GGML_TYPE_IQ5_K> {
     static constexpr int qk = QK_K;
diff --git a/ggml/src/ggml-cuda/convert.cu b/ggml/src/ggml-cuda/convert.cu
index 1e4421b1..9a0ec9d6 100644
--- a/ggml/src/ggml-cuda/convert.cu
+++ b/ggml/src/ggml-cuda/convert.cu
@@ -638,6 +638,36 @@ static __global__ void dequantize_block_iq4_ks(const void * __restrict__ vx, dst
     }
 }
 
+template<typename dst_t>
+static __global__ void dequantize_block_iq4_kss(const void * __restrict__ vx, dst_t * __restrict__ yy, int64_t n_per_row, int64_t row_size) {
+
+    int64_t ii  = blockIdx.x;
+    int64_t row = (QK_K * ii) / n_per_row;
+    const char * cx = (const char *)vx + row * row_size;
+    float scale = *(const float *)cx;
+    const block_iq4_kss * x = (const block_iq4_kss *)(cx + sizeof(float));
+    const int64_t i   = ii - (row*n_per_row)/QK_K;
+
+    const int64_t tid = threadIdx.x;
+    const int64_t il = tid/8; // 0...3
+    const int64_t ib = tid%8; // 0...7
+    dst_t * y = yy + ii*QK_K + 32*ib + 4*il;
+    const uint32_t * q4 = x[i].qs + 4*ib;
+    uint8_t ls = (q4[0] >> 30) | ((q4[1] >> 28) & 0x0c) | ((q4[2] >> 26) & 0x30) | ((q4[3] >> 24) & 0xc0);
+    const float d = scale * ((ls & 254) - 127);
+    const int8_t * values = iq4k_values + ((ls & 1) << 4);
+    uint32_t aux32[2];
+    aux32[0] = (q4[il] & 0x00007fff) | ((q4[il] << 1) & 0x7fff0000);
+    aux32[0] ^= (aux32[0] << 1);
+    aux32[1] = ((aux32[0] >> 4) & 0x0f0f0f0f);
+    aux32[0] &= 0x0f0f0f0f;
+    const uint8_t * aux8 = (const uint8_t *)aux32;
+    for (int j = 0; j < 4; ++j) {
+        y[j+ 0] = d * values[aux8[j+0]];
+        y[j+16] = d * values[aux8[j+4]];
+    }
+}
+
 template<typename dst_t>
 static __global__ void dequantize_block_iq4_k(const void * __restrict__ vx, dst_t * __restrict__ yy) {
     const int64_t i   = blockIdx.x;
@@ -980,6 +1010,14 @@ static void dequantize_row_iq4_ks_cuda(const void * vx, dst_t * y, const int64_t
     dequantize_block_iq4_ks<<<nb, 32, 0, stream>>>(vx, y, n_per_row, row_size);
 }
 
+template<typename dst_t>
+static void dequantize_row_iq4_kss_cuda(const void * vx, dst_t * y, const int64_t nrows, const int64_t n_per_row, cudaStream_t stream) {
+    const int64_t k = nrows * n_per_row;
+    const int64_t row_size = ggml_row_size(GGML_TYPE_IQ4_KSS, n_per_row);
+    const int nb = (k + QK_K - 1) / QK_K;
+    dequantize_block_iq4_kss<<<nb, 32, 0, stream>>>(vx, y, n_per_row, row_size);
+}
+
 template<typename dst_t>
 static void dequantize_row_iq2_ks_cuda(const void * vx, dst_t * y, const int64_t nrows, const int64_t n_per_row, cudaStream_t stream) {
     const int64_t k = nrows * n_per_row;
@@ -1152,6 +1190,8 @@ to_fp16_cuda_t ggml_get_to_fp16_cuda(ggml_type type) {
             return dequantize_row_iq4_xs_cuda;
         case GGML_TYPE_IQ4_KS:
             return dequantize_row_iq4_ks_cuda;
+        case GGML_TYPE_IQ4_KSS:
+            return dequantize_row_iq4_kss_cuda;
         case GGML_TYPE_IQ2_KS:
             return dequantize_row_iq2_ks_cuda;
         case GGML_TYPE_IQ2_K:
@@ -1225,6 +1265,8 @@ to_fp32_cuda_t ggml_get_to_fp32_cuda(ggml_type type) {
             return dequantize_row_iq4_xs_cuda;
         case GGML_TYPE_IQ4_KS:
             return dequantize_row_iq4_ks_cuda;
+        case GGML_TYPE_IQ4_KSS:
+            return dequantize_row_iq4_kss_cuda;
         case GGML_TYPE_IQ2_KS:
             return dequantize_row_iq2_ks_cuda;
         case GGML_TYPE_IQ2_K:
diff --git a/ggml/src/ggml-cuda/iqk_mmvq.cu b/ggml/src/ggml-cuda/iqk_mmvq.cu
index 9ca219e4..cc7dc409 100644
--- a/ggml/src/ggml-cuda/iqk_mmvq.cu
+++ b/ggml/src/ggml-cuda/iqk_mmvq.cu
@@ -239,6 +239,34 @@ __device__ __forceinline__ float vec_dot_iq4_ks_q8_1(
     return dl * __low2float(bq8_1[ib32].ds) * sumi;
 }
 
+#define VDR_IQ4_KSS_Q8_1_MMVQ 4
+#define VDR_IQ4_KSS_Q8_1_MMQ  4
+
+__device__ __forceinline__ float vec_dot_iq4_kss_q8_1(
+    const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & kbx, const int & iqs) {
+
+    float scale = *(const float *)vbq;
+    const block_iq4_kss * bq4 = (const block_iq4_kss *)((const char *)vbq + sizeof(float)) + kbx;
+    const uint8_t * all_values = (const uint8_t *)iq4k_values;
+
+    // TODO
+    return 0.f;
+
+    //// iqs is 0...28
+    //const int ib32 = iqs/4; // Why iqs/4 ?
+    //const int32_t  * q8 = (const int *)bq8_1[ib32].qs;
+    //const uint32_t * q4 = (const uint32_t *)bq4->qs + 4*ib32;
+    //const float dl = scale * ((bq4->scales[ib32] & 254) - 127);
+    //int v1, v2;
+    //int sumi = 0;
+    //for (int j = 0; j < 4; ++j) {
+    //    get_int_from_table_16_shift(q4[j], bq4->scales[ib32] & 1, all_values, v1, v2);
+    //    sumi = ggml_cuda_dp4a(v1, q8[j+0], sumi);
+    //    sumi = ggml_cuda_dp4a(v2, q8[j+4], sumi);
+    //}
+    //return dl * __low2float(bq8_1[ib32].ds) * sumi;
+}
+
 #define VDR_IQ5_K_Q8_1_MMVQ 4
 #define VDR_IQ5_K_Q8_1_MMQ  4
 
@@ -703,6 +731,13 @@ void mul_mat_vec_iq4_ks_q8_1_cuda(
     iqk_mul_mat_vec_q_cuda<GGML_TYPE_IQ4_KS, VDR_IQ4_KS_Q8_1_MMVQ, vec_dot_iq4_ks_q8_1>(vx, vy, dst, ncols_x, nrows_x, nrows_y, ncols_y, nrows_dst, stream);
 }
 
+void mul_mat_vec_iq4_kss_q8_1_cuda(
+    const void * vx, const void * vy, float * dst,
+    const int ncols_x, const int nrows_x, const int nrows_y, const int ncols_y, const int nrows_dst, cudaStream_t stream) {
+
+    iqk_mul_mat_vec_q_cuda<GGML_TYPE_IQ4_KSS, VDR_IQ4_KSS_Q8_1_MMVQ, vec_dot_iq4_kss_q8_1>(vx, vy, dst, ncols_x, nrows_x, nrows_y, ncols_y, nrows_dst, stream);
+}
+
 void mul_mat_vec_iq2_ks_q8_1_cuda(
     const void * vx, const void * vy, float * dst,
     const int ncols_x, const int nrows_x, const int nrows_y, const int ncols_y, const int nrows_dst, cudaStream_t stream) {
diff --git a/ggml/src/ggml-cuda/iqk_mmvq.cuh b/ggml/src/ggml-cuda/iqk_mmvq.cuh
index 3a93a1b6..0678c026 100644
--- a/ggml/src/ggml-cuda/iqk_mmvq.cuh
+++ b/ggml/src/ggml-cuda/iqk_mmvq.cuh
@@ -32,6 +32,10 @@ void mul_mat_vec_iq4_ks_q8_1_cuda(
     const void * vx, const void * vy, float * dst,
     const int ncols_x, const int nrows_x, const int nrows_y, const int ncols_y, const int nrows_dst, cudaStream_t stream);
 
+void mul_mat_vec_iq4_kss_q8_1_cuda(
+    const void * vx, const void * vy, float * dst,
+    const int ncols_x, const int nrows_x, const int nrows_y, const int ncols_y, const int nrows_dst, cudaStream_t stream);
+
 void mul_mat_vec_iq2_ks_q8_1_cuda(
     const void * vx, const void * vy, float * dst,
     const int ncols_x, const int nrows_x, const int nrows_y, const int ncols_y, const int nrows_dst, cudaStream_t stream);
diff --git a/ggml/src/ggml-cuda/mmvq.cu b/ggml/src/ggml-cuda/mmvq.cu
index e312b266..107caf45 100644
--- a/ggml/src/ggml-cuda/mmvq.cu
+++ b/ggml/src/ggml-cuda/mmvq.cu
@@ -462,6 +462,9 @@ void ggml_cuda_op_mul_mat_vec_q(
         case GGML_TYPE_IQ4_KS:
             mul_mat_vec_iq4_ks_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_padded_row_size, src1_ncols, nrows_dst, stream);
             break;
+        case GGML_TYPE_IQ4_KSS:
+            mul_mat_vec_iq4_kss_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_padded_row_size, src1_ncols, nrows_dst, stream);
+            break;
         case GGML_TYPE_IQ2_KS:
             mul_mat_vec_iq2_ks_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_padded_row_size, src1_ncols, nrows_dst, stream);
             break;
diff --git a/ggml/src/ggml-quants.c b/ggml/src/ggml-quants.c
index a845eaf5..68ec6126 100644
--- a/ggml/src/ggml-quants.c
+++ b/ggml/src/ggml-quants.c
@@ -15197,6 +15197,7 @@ bool ggml_validate_row_data(enum ggml_type type, const void * data, size_t nbyte
         case GGML_TYPE_IQ2_TN: break;
         case GGML_TYPE_IQ1_TN: break;
         case GGML_TYPE_IQ4_KS: break;
+        case GGML_TYPE_IQ4_KSS: break;
         case GGML_TYPE_Q4_0_4_4:
         case GGML_TYPE_Q4_0_4_8:
             {
diff --git a/ggml/src/iqk/iqk_quantize.cpp b/ggml/src/iqk/iqk_quantize.cpp
index 0ee8e286..b0f294a5 100644
--- a/ggml/src/iqk/iqk_quantize.cpp
+++ b/ggml/src/iqk/iqk_quantize.cpp
@@ -2841,12 +2841,16 @@ const uint16_t * scramble_table() {
             uint16_t val = i;
             int non = popcount(val);
             if (non%2) val |= (1 << 15);
+            bool found = false;
             for (int j = 0; j < int(table.size()); ++j) {
                 if ((j ^ (j << 1)) == val) {
-                    //printf("%5d  %5u  %5d\n", i, val, j);
-                    table[i] = j; break;
+                    table[i] = j; found = true; break;
                 }
             }
+            if (!found) {
+                printf("Oops: did not find for %d %u\n", i, val);
+                exit(1);
+            }
         }
     }
     return table.data();
@@ -2892,6 +2896,8 @@ uint16_t prune_iq4ks(uint16_t v, const int8_t * values, const float * x, const f
 void prune_iq4ks_to_iq4kss(int n_per_row, const uint16_t * table, const char * cx, const float * x, char *cy,
         const float * quant_weights, float * weight, float * all_scales) {
     constexpr int kBlockSize = 32;
+    float xv[4], w[4];
+    uint16_t vps[kBlockSize/4];
     const float * dptr_ks = (const float *)cx;
     const float d_ks = *dptr_ks;
     const block_iq4_ks * iq4ks = (const block_iq4_ks *)(dptr_ks + 1);
@@ -2906,6 +2912,7 @@ void prune_iq4ks_to_iq4kss(int n_per_row, const uint16_t * table, const char * c
         float sigma2 = 0;
         for (int j = 0; j < QK_K; ++j) sigma2 += xbl[j]*xbl[j];
         sigma2 *= 2.f/QK_K;
+        const uint16_t * q4 = (const uint16_t *)iq4ks[ibl].qs;
         for (int ib = 0; ib < QK_K/kBlockSize; ++ib) {
             const float * xb = xbl + ib*kBlockSize;
             if (quant_weights) {
@@ -2917,65 +2924,42 @@ void prune_iq4ks_to_iq4kss(int n_per_row, const uint16_t * table, const char * c
             const int8_t * values = iq4k_values + ((iq4ks[ibl].scales[ib] & 1) << 4);
             float dl  = d_ks * ((iq4ks[ibl].scales[ib] & 254) - 127);
             float sumqx = 0, sumq2 = 0;
+            for (int k = 0; k < kBlockSize/4; ++k) {
+                xv[0] = xb[2*k+0]; xv[1] = xb[2*k+kBlockSize/2]; xv[2] = xb[2*k+1]; xv[3] = xb[2*k+1+kBlockSize/2];
+                w[0] = weight[2*k+0]; w[1] = weight[2*k+kBlockSize/2]; w[2] = weight[2*k+1]; w[3] = weight[2*k+1+kBlockSize/2];
+                auto vp = prune_iq4ks(q4[k], values, xv, w, dl);
+                vps[k] = table[vp & 0x7fff];
+                for (int j = 0; j < 4; ++j) {
+                    float q = values[(vp >> 4*j) & 0xf];
+                    sumqx += w[j]*q*xv[j];
+                    sumq2 += w[j]*q*q;
+                }
+            }
             for (int k = 0; k < kBlockSize/8; ++k) {
-                uint16_t vl = 0, vh = 0;
-                for (int j = 0; j < 4; ++j) {
-                    uint8_t ql = iq4ks[ibl].qs[(kBlockSize/2)*ib + (kBlockSize/8)*k + j] & 0xf;
-                    uint8_t qh = iq4ks[ibl].qs[(kBlockSize/2)*ib + (kBlockSize/8)*k + j] >>  4;
-                    vl |= ql << 4*j;
-                    vh |= qh << 4*j;
-                }
-                auto vlp = prune_iq4ks(vl, values, xb + 4*k, weight + 4*k, dl);
-                auto vhp = prune_iq4ks(vh, values, xb + 4*k + kBlockSize/2, weight + 4*k + kBlockSize/2, dl);
-                auto vlp_s = table[vlp & 0x7fff];
-                auto vhp_s = table[vhp & 0x7fff];
-                y[ibl].qs[(kBlockSize/8)*ib + k] = vlp_s | (vhp_s << 15) | (((iq4ks[ibl].scales[ib] >> 2*k) & 3) << 30);
-                if ((vlp_s ^ (vlp_s << 1)) != vlp) {
-                    printf("Oops(l): vl = %u (%d) vlp = %u (%d) vlp_s = %u check = %u\n", vl, popcount(vl), vlp, popcount(vlp), vlp_s, vlp_s ^ (vlp_s << 1));
-                    exit(1);
-                }
-                if ((vhp_s ^ (vhp_s << 1)) != vhp) {
-                    printf("Oops(h): vh = %u (%d) vhp = %u (%d) vhp_s = %u check = %u\n", vh, popcount(vh), vhp, popcount(vhp), vhp_s, vhp_s ^ (vhp_s << 1));
-                    exit(1);
-                }
-                for (int j = 0; j < 4; ++j) {
-                    float ql = values[(vlp >> 4*j) & 0xf];
-                    float qh = values[(vhp >> 4*j) & 0xf];
-                    sumqx += weight[4*k+j]*ql*xb[4*k+j] + weight[4*k+j+16]*qh*xb[4*k+j+16];
-                    sumq2 += weight[4*k+j]*ql*ql + weight[4*k+j+16]*qh*qh;
-                }
-                //for (int j = 0; j < 4; ++j) {
-                //    uint8_t ql = iq4ks[ibl].qs[(kBlockSize/2)*ib + (kBlockSize/8)*k + j] & 0xf;
-                //    uint8_t qh = iq4ks[ibl].qs[(kBlockSize/2)*ib + (kBlockSize/8)*k + j] >>  4;
-                //    uint8_t qlp = (vlp >> 4*j) & 0xf;
-                //    uint8_t qhp = (vhp >> 4*j) & 0xf;
-                //    float diffl = dl*values[ql] - xb[4*k+j];
-                //    float diffh = dl*values[qh] - xb[4*k+j+16];
-                //    float difflp = dl*values[qlp] - xb[4*k+j];
-                //    float diffhp = dl*values[qhp] - xb[4*k+j+16];
-                //    printf("%d %d %d: %u %u %u %u   %g %g   %g %g\n", ib, k, j, ql, qh, qlp, qhp, diffl, diffh, difflp, diffhp);
-                //}
+                y[ibl].qs[(kBlockSize/8)*ib + k] = vps[2*k+0] | (vps[2*k+1] << 15) | (((iq4ks[ibl].scales[ib] >> 2*k) & 3) << 30);
+                //y[ibl].qs[(kBlockSize/8)*ib + k] = vps[2*k+0] | (vps[2*k+1] << 15);
             }
             scales[ib] = sumq2 > 0 ? sumqx/sumq2 : dl;
             max_abs_scale = std::max(max_abs_scale, scales[ib]);
+            q4 += kBlockSize/4;
         }
     }
-    if (!max_abs_scale) return;
-    float d = max_abs_scale/127;
-    *dptr = d;
-    float id = 1/d;
-    for (int ibl = 0; ibl < nblock; ++ibl) {
-        auto scales = all_scales + ibl*(QK_K/kBlockSize);
-        for (int ib = 0; ib < QK_K/kBlockSize; ++ib) {
-            int l = nearest_int(0.5f*(id*scales[ib]+127.f));
-            l = std::max(0, std::min(127, l)) << 1;
-            l |= (iq4ks[ibl].scales[ib] & 1);
-            for (int k = 0; k < 4; ++k) {
-                y[ibl].qs[4*ib+k] &= 0x3fffffff;
-                y[ibl].qs[4*ib+k] |= (((l >> 2*k) & 3) << 30);
-            }
-        }
-    }
+    //if (!max_abs_scale) return;
+    //float d = max_abs_scale/127;
+    //*dptr = d;
+    //float id = 1/d;
+    //for (int ibl = 0; ibl < nblock; ++ibl) {
+    //    auto scales = all_scales + ibl*(QK_K/kBlockSize);
+    //    for (int ib = 0; ib < QK_K/kBlockSize; ++ib) {
+    //        int l = nearest_int(0.5f*(id*scales[ib]+127.f));
+    //        l = std::max(0, std::min(127, l)) << 1;
+    //        l |= (iq4ks[ibl].scales[ib] & 1);
+    //        for (int k = 0; k < 4; ++k) {
+    //            //y[ibl].qs[4*ib+k] &= 0x3fffffff;
+    //            y[ibl].qs[4*ib+k] |= (((l >> 2*k) & 3) << 30);
+    //        }
+    //    }
+    //}
 }
 }
 
@@ -3010,36 +2994,36 @@ void dequantize_row_iq4_kss(const block_iq4_kss * x, float * y, int64_t k) {
     const float * dptr = (const float *)x;
     const float d = *dptr;
     x = (const block_iq4_kss *)(dptr + 1);
-    //uint32_t aux32[4];
-    //const uint8_t * aux8 = (const uint8_t *)aux32;
+    uint32_t aux32[4];
+    const uint8_t * aux8 = (const uint8_t *)aux32;
     for (int ibl = 0; ibl < k/QK_K; ++ibl) {
         auto qs = x[ibl].qs;
         for (int ib = 0; ib < QK_K/32; ++ib) {
-            uint8_t ls = ((qs[0] >> 30) | ((qs[1] >> 28) & 0x0c) | ((qs[2] >> 26) & 0x30) | ((qs[3] >> 24) & 0xc0));
-            const int8_t * values = iq4k_values + ((ls & 1) << 4);
-            const float dl = d * ((ls & 254) - 127);
-            for (int k = 0; k < 4; ++k) {
-                uint16_t vl = qs[k] & 0x7fff;
-                vl ^= (vl << 1);
-                uint16_t vh = (qs[k] >> 15) & 0x7fff;
-                vh ^= (vh << 1);
-                for (int j = 0; j < 4; ++j) {
-                    y[4*k + j +  0] = dl*values[(vl >> 4*j) & 0xf];
-                    y[4*k + j + 16] = dl*values[(vh >> 4*j) & 0xf];
-                }
-            }
-            //int16_t ls = 0;
-            //for (int k = 0; k < 4; ++k) {
-            //    aux32[k] = (qs[k] & 0x00007fff) | ((qs[k] << 1) & 0x7fff0000);
-            //    aux32[k] ^= (aux32[k] << 1);
-            //    ls |= (qs[k] >> 30) << 2*k;
-            //}
+            //uint8_t ls = ((qs[0] >> 30) | ((qs[1] >> 28) & 0x0c) | ((qs[2] >> 26) & 0x30) | ((qs[3] >> 24) & 0xc0));
             //const int8_t * values = iq4k_values + ((ls & 1) << 4);
-            //float dl = d * ((ls & 254) - 127);
-            //for (int j = 0; j < 16; ++j) {
-            //    y[j+ 0] = dl * values[aux8[j] & 0xf];
-            //    y[j+16] = dl * values[aux8[j] >>  4];
+            //const float dl = d * ((ls & 254) - 127);
+            //for (int k = 0; k < 4; ++k) {
+            //    uint16_t vl = qs[k] & 0x7fff;
+            //    vl ^= (vl << 1);
+            //    uint16_t vh = (qs[k] >> 15) & 0x7fff;
+            //    vh ^= (vh << 1);
+            //    for (int j = 0; j < 4; ++j) {
+            //        y[4*k + j +  0] = dl*values[(vl >> 4*j) & 0xf];
+            //        y[4*k + j + 16] = dl*values[(vh >> 4*j) & 0xf];
+            //    }
             //}
+            int16_t ls = 0;
+            for (int k = 0; k < 4; ++k) {
+                aux32[k] = (qs[k] & 0x00007fff) | ((qs[k] << 1) & 0x7fff0000);
+                aux32[k] ^= (aux32[k] << 1);
+                ls |= (qs[k] >> 30) << 2*k;
+            }
+            const int8_t * values = iq4k_values + ((ls & 1) << 4);
+            float dl = d * ((ls & 254) - 127);
+            for (int j = 0; j < 16; ++j) {
+                y[j+ 0] = dl * values[aux8[j] & 0xf];
+                y[j+16] = dl * values[aux8[j] >>  4];
+            }
             y  += 32;
             qs += 4;
         }
diff --git a/include/llama.h b/include/llama.h
index c9387e6b..133c2f0e 100644
--- a/include/llama.h
+++ b/include/llama.h
@@ -180,6 +180,7 @@ extern "C" {
         LLAMA_FTYPE_MOSTLY_IQ4_KS        = 145, // except 1d tensors
         LLAMA_FTYPE_MOSTLY_IQ3_KL        = 146, // except 1d tensors
         LLAMA_FTYPE_MOSTLY_IQ2_KS        = 147, // except 1d tensors
+        LLAMA_FTYPE_MOSTLY_IQ4_KSS       = 148, // except 1d tensors
 
         LLAMA_FTYPE_GUESSED = 1024, // not specified in the model file
     };
diff --git a/src/llama.cpp b/src/llama.cpp
index b356f7bc..d9eec461 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -3795,6 +3795,7 @@ struct llama_model_loader {
                 case GGML_TYPE_IQ4_NL:  ftype = LLAMA_FTYPE_MOSTLY_IQ4_NL;  break;
                 case GGML_TYPE_IQ4_XS:  ftype = LLAMA_FTYPE_MOSTLY_IQ4_XS;  break;
                 case GGML_TYPE_IQ4_KS:  ftype = LLAMA_FTYPE_MOSTLY_IQ4_KS;  break;
+                case GGML_TYPE_IQ4_KSS: ftype = LLAMA_FTYPE_MOSTLY_IQ4_KSS; break;
                 case GGML_TYPE_IQ2_K:   ftype = LLAMA_FTYPE_MOSTLY_IQ2_K;   break;
                 case GGML_TYPE_IQ3_K:   ftype = LLAMA_FTYPE_MOSTLY_IQ3_K;   break;
                 case GGML_TYPE_IQ4_K:   ftype = LLAMA_FTYPE_MOSTLY_IQ4_K;   break;
@@ -4498,6 +4499,7 @@ static std::string llama_model_ftype_name(llama_ftype ftype) {
         case LLAMA_FTYPE_MOSTLY_IQ4_NL:   return "IQ4_NL - 4.5 bpw";
         case LLAMA_FTYPE_MOSTLY_IQ4_XS:   return "IQ4_XS - 4.25 bpw";
         case LLAMA_FTYPE_MOSTLY_IQ4_KS:   return "IQ4_KS - 4.25 bpw";
+        case LLAMA_FTYPE_MOSTLY_IQ4_KSS:  return "IQ4_KSS - 4.0 bpw";
         case LLAMA_FTYPE_MOSTLY_IQ2_K:    return "IQ2_K - 2.375 bpw";
         case LLAMA_FTYPE_MOSTLY_IQ3_K:    return "IQ3_K - 3.4325 bpw";
         case LLAMA_FTYPE_MOSTLY_IQ3_KL:   return "IQ3_KL - 4 bpw";
@@ -15651,7 +15653,8 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
                      ftype == LLAMA_FTYPE_MOSTLY_IQ2_KS) {
                 new_type = !qs.has_output ? GGML_TYPE_IQ4_K : GGML_TYPE_Q5_K;
             }
-            else if ((ftype == LLAMA_FTYPE_MOSTLY_IQ3_S || ftype == LLAMA_FTYPE_MOSTLY_IQ3_M || ftype == LLAMA_FTYPE_MOSTLY_IQ4_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ4_KS) && !qs.has_output) {
+            else if ((ftype == LLAMA_FTYPE_MOSTLY_IQ3_S || ftype == LLAMA_FTYPE_MOSTLY_IQ3_M || ftype == LLAMA_FTYPE_MOSTLY_IQ4_XS ||
+                      ftype == LLAMA_FTYPE_MOSTLY_IQ4_KS || ftype == LLAMA_FTYPE_MOSTLY_IQ4_KSS) && !qs.has_output) {
                 new_type = GGML_TYPE_IQ5_K;
             }
             else if (new_type != GGML_TYPE_Q8_0 && new_type != GGML_TYPE_IQ6_K) {
@@ -15742,7 +15745,8 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
             new_type = qs.i_attention_wv < 2 ? GGML_TYPE_Q5_K : GGML_TYPE_Q4_K;
         }
         else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q5_K;
-        else if ((ftype == LLAMA_FTYPE_MOSTLY_IQ4_NL || ftype == LLAMA_FTYPE_MOSTLY_IQ4_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ4_KS) && qs.model.hparams.n_gqa() >= 2) {
+        else if ((ftype == LLAMA_FTYPE_MOSTLY_IQ4_NL || ftype == LLAMA_FTYPE_MOSTLY_IQ4_XS ||
+                  ftype == LLAMA_FTYPE_MOSTLY_IQ4_KS || ftype == LLAMA_FTYPE_MOSTLY_IQ4_KSS) && qs.model.hparams.n_gqa() >= 2) {
             new_type = GGML_TYPE_IQ5_K;
         }
         else if (ftype == LLAMA_FTYPE_MOSTLY_IQ4_K && qs.model.hparams.n_gqa() >= 2) {
@@ -15822,7 +15826,8 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
                 if (use_more_bits(i_layer, n_layer)) new_type = GGML_TYPE_Q6_K;
             }
         }
-        else if (i_layer < n_layer/8 && (ftype == LLAMA_FTYPE_MOSTLY_IQ4_NL || ftype == LLAMA_FTYPE_MOSTLY_IQ4_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ4_KS) && !qs.has_imatrix) {
+        else if (i_layer < n_layer/8 && (ftype == LLAMA_FTYPE_MOSTLY_IQ4_NL || ftype == LLAMA_FTYPE_MOSTLY_IQ4_XS ||
+                    ftype == LLAMA_FTYPE_MOSTLY_IQ4_KS || ftype == LLAMA_FTYPE_MOSTLY_IQ4_KSS) && !qs.has_imatrix) {
             new_type = GGML_TYPE_Q5_K;
         }
         else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M && use_more_bits(i_layer, n_layer)) new_type = GGML_TYPE_Q6_K;
@@ -15910,7 +15915,7 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
         new_type == GGML_TYPE_IQ1_M   || new_type == GGML_TYPE_IQ4_K   || new_type == GGML_TYPE_IQ2_K  ||
         new_type == GGML_TYPE_IQ5_K   || new_type == GGML_TYPE_IQ3_K   || new_type == GGML_TYPE_IQ2_TN ||
         new_type == GGML_TYPE_IQ6_K   || new_type == GGML_TYPE_IQ1_TN  || new_type == GGML_TYPE_IQ4_KS ||
-        new_type == GGML_TYPE_IQ2_KS) {
+        new_type == GGML_TYPE_IQ2_KS  || new_type == GGML_TYPE_IQ4_KSS) {
         int nx = tensor->ne[0];
         int ny = tensor->ne[1];
         if (nx % QK_K != 0) {
@@ -15942,6 +15947,7 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
             case GGML_TYPE_Q3_K:
             case GGML_TYPE_IQ2_K:
             case GGML_TYPE_IQ3_K:
+            case GGML_TYPE_IQ4_KSS:
             case GGML_TYPE_IQ4_KS:
             case GGML_TYPE_IQ4_XS: new_type = GGML_TYPE_IQ4_NL; break;
             case GGML_TYPE_IQ4_K:
@@ -16055,6 +16061,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
         case LLAMA_FTYPE_MOSTLY_IQ4_NL:  default_type = GGML_TYPE_IQ4_NL;  break;
         case LLAMA_FTYPE_MOSTLY_IQ4_XS:  default_type = GGML_TYPE_IQ4_XS;  break;
         case LLAMA_FTYPE_MOSTLY_IQ4_KS:  default_type = GGML_TYPE_IQ4_KS;  break;
+        case LLAMA_FTYPE_MOSTLY_IQ4_KSS: default_type = GGML_TYPE_IQ4_KSS; break;
         case LLAMA_FTYPE_MOSTLY_IQ2_K:   default_type = GGML_TYPE_IQ2_K;   break;
         case LLAMA_FTYPE_MOSTLY_IQ3_K:   default_type = GGML_TYPE_IQ3_K;   break;
         case LLAMA_FTYPE_MOSTLY_IQ3_KL:  default_type = GGML_TYPE_IQ3_K;   break;