diff --git a/examples/quantize/quantize.cpp b/examples/quantize/quantize.cpp index 3cc19f70..38855a3c 100644 --- a/examples/quantize/quantize.cpp +++ b/examples/quantize/quantize.cpp @@ -46,6 +46,7 @@ static const std::vector QUANT_OPTIONS = { { "IQ4_KS", LLAMA_FTYPE_MOSTLY_IQ4_KS, " 4.25 bpw non-linear quantization", }, { "IQ2_K", LLAMA_FTYPE_MOSTLY_IQ2_K, " 2.375 bpw non-linear quantization",}, { "IQ3_K", LLAMA_FTYPE_MOSTLY_IQ3_K, " 3.44 bpw non-linear quantization", }, + { "IQ3_KS", LLAMA_FTYPE_MOSTLY_IQ3_KS, " 3.25 bpw non-linear quantization", }, { "IQ3_KL", LLAMA_FTYPE_MOSTLY_IQ3_KL, " 4 bpw non-linear quantization mix",}, { "IQ4_K", LLAMA_FTYPE_MOSTLY_IQ4_K, " 4.5 bpw non-linear quantization", }, { "IQ5_K", LLAMA_FTYPE_MOSTLY_IQ5_K, " 5.5 bpw non-linear quantization", }, diff --git a/ggml/include/ggml.h b/ggml/include/ggml.h index 3054dabd..3d2e1e83 100644 --- a/ggml/include/ggml.h +++ b/ggml/include/ggml.h @@ -404,6 +404,7 @@ extern "C" { GGML_TYPE_IQ2_TN = 142, GGML_TYPE_IQ1_TN = 143, GGML_TYPE_IQ4_KS = 144, + GGML_TYPE_IQ3_KS = 145, GGML_TYPE_COUNT, }; @@ -460,6 +461,7 @@ extern "C" { GGML_FTYPE_MOSTLY_IQ2_TN = 135, // except 1d tensors GGML_FTYPE_MOSTLY_IQ1_TN = 136, // except 1d tensors GGML_FTYPE_MOSTLY_IQ4_KS = 137, // except 1d tensors + GGML_FTYPE_MOSTLY_IQ3_KS = 138, // except 1d tensors }; // available tensor operations: diff --git a/ggml/src/ggml-common.h b/ggml/src/ggml-common.h index 7eaf7437..2fa2501f 100644 --- a/ggml/src/ggml-common.h +++ b/ggml/src/ggml-common.h @@ -465,6 +465,13 @@ typedef struct { } block_iq3_k; static_assert(sizeof(block_iq3_k) == sizeof(ggml_half) + 2*sizeof(uint16_t) + QK_K/32 + QK_K/4 + QK_K/8, "wrong iq3_k block size/padding"); +typedef struct { + uint8_t scales[QK_K/32]; + uint8_t qs[QK_K/4]; + uint8_t qh[QK_K/8]; +} block_iq3_ks; +static_assert(sizeof(block_iq3_ks) == QK_K/32 + QK_K/4 + QK_K/8, "wrong iq3_ks block size/padding"); + typedef struct { ggml_half d; uint16_t extra; diff --git a/ggml/src/ggml-cuda.cu 
b/ggml/src/ggml-cuda.cu index 0657252d..fea36395 100644 --- a/ggml/src/ggml-cuda.cu +++ b/ggml/src/ggml-cuda.cu @@ -2829,6 +2829,7 @@ GGML_CALL static bool ggml_backend_cuda_supports_op(ggml_backend_t backend, cons case GGML_TYPE_IQ4_NL: case GGML_TYPE_IQ4_XS: case GGML_TYPE_IQ4_KS: + case GGML_TYPE_IQ3_KS: case GGML_TYPE_IQ2_K: case GGML_TYPE_IQ3_K: case GGML_TYPE_IQ4_K: diff --git a/ggml/src/ggml-cuda/common.cuh b/ggml/src/ggml-cuda/common.cuh index c00cef29..b8557b7d 100644 --- a/ggml/src/ggml-cuda/common.cuh +++ b/ggml/src/ggml-cuda/common.cuh @@ -536,6 +536,13 @@ struct ggml_cuda_type_traits { static constexpr int qi = QI4_XS; }; +template<> +struct ggml_cuda_type_traits<GGML_TYPE_IQ3_KS> { + static constexpr int qk = QK_K; + static constexpr int qr = QR4_XS; + static constexpr int qi = QI4_XS; +}; + template<> struct ggml_cuda_type_traits { static constexpr int qk = QK_K; diff --git a/ggml/src/ggml-cuda/convert.cu b/ggml/src/ggml-cuda/convert.cu index 62dd52a2..e32a9aa0 100644 --- a/ggml/src/ggml-cuda/convert.cu +++ b/ggml/src/ggml-cuda/convert.cu @@ -615,6 +615,37 @@ static __global__ void dequantize_block_iq4_xs(const void * __restrict__ vx, dst } } +template<typename dst_t> +static __global__ void dequantize_block_iq3_ks(const void * __restrict__ vx, dst_t * __restrict__ yy, int64_t n_per_row, int64_t row_size) { + + int64_t ii = blockIdx.x; + int64_t row = (QK_K * ii) / n_per_row; + const char * cx = (const char *)vx + row * row_size; + float scale = *(const float *)cx; + const block_iq3_ks * x = (const block_iq3_ks *)(cx + sizeof(float)); + const int64_t i = ii - (row*n_per_row)/QK_K; + + const int tid = threadIdx.x; + int ib128 = tid/16; // 0 or 1 + int il = tid%16; // 0...15 + dst_t * y = yy + ii*QK_K + 128*ib128 + 2*il; + //uint32_t sc = ((const uint32_t *)x[i].scales)[ib128]; + //uint32_t aux32 = + const float dl1 = scale * ((x[i].scales[4*ib128+0] & 254) - 127); + const float dl2 = scale * ((x[i].scales[4*ib128+1] & 254) - 127); + const float dl3 = scale * ((x[i].scales[4*ib128+2] & 
254) - 127); + const float dl4 = scale * ((x[i].scales[4*ib128+3] & 254) - 127); + const uint8_t * qs = x[i].qs + 32*ib128 + 2*il; + const uint8_t * qh = x[i].qh + 2*il; + for (int j = 0; j < 2; ++j) { + const uint8_t h = qh[j] >> (4*(ib128%2)); + y[j+ 0] = dl1 * iq3nl_values[(((qs[j] >> 0) & 0x03) | ((h & 0x01) << 2)) + ((x[i].scales[4*ib128+0] & 1) << 3)]; + y[j+32] = dl2 * iq3nl_values[(((qs[j] >> 2) & 0x03) | ((h & 0x02) << 1)) + ((x[i].scales[4*ib128+1] & 1) << 3)]; + y[j+64] = dl3 * iq3nl_values[(((qs[j] >> 4) & 0x03) | ((h & 0x04) >> 0)) + ((x[i].scales[4*ib128+2] & 1) << 3)]; + y[j+96] = dl4 * iq3nl_values[(((qs[j] >> 6) & 0x03) | ((h & 0x08) >> 1)) + ((x[i].scales[4*ib128+3] & 1) << 3)]; + } +} + template static __global__ void dequantize_block_iq4_ks(const void * __restrict__ vx, dst_t * __restrict__ yy, int64_t n_per_row, int64_t row_size) { @@ -952,6 +983,14 @@ static void dequantize_row_iq4_ks_cuda(const void * vx, dst_t * y, const int64_t dequantize_block_iq4_ks<<>>(vx, y, n_per_row, row_size); } +template<typename dst_t> +static void dequantize_row_iq3_ks_cuda(const void * vx, dst_t * y, const int64_t nrows, const int64_t n_per_row, cudaStream_t stream) { + const int64_t k = nrows * n_per_row; + const int64_t row_size = ggml_row_size(GGML_TYPE_IQ3_KS, n_per_row); + const int nb = (k + QK_K - 1) / QK_K; + dequantize_block_iq3_ks<<<nb, 32, 0, stream>>>(vx, y, n_per_row, row_size); +} + template static void dequantize_row_iq2_k_cuda(const void * vx, dst_t * y, const int64_t nrows, const int64_t n_per_row, cudaStream_t stream) { const int64_t k = nrows * n_per_row; @@ -1116,6 +1155,8 @@ to_fp16_cuda_t ggml_get_to_fp16_cuda(ggml_type type) { return dequantize_row_iq4_xs_cuda; case GGML_TYPE_IQ4_KS: return dequantize_row_iq4_ks_cuda; + case GGML_TYPE_IQ3_KS: + return dequantize_row_iq3_ks_cuda; case GGML_TYPE_IQ2_K: return dequantize_row_iq2_k_cuda; case GGML_TYPE_IQ3_K: @@ -1187,6 +1228,8 @@ to_fp32_cuda_t ggml_get_to_fp32_cuda(ggml_type type) { return dequantize_row_iq4_xs_cuda; case 
GGML_TYPE_IQ4_KS: return dequantize_row_iq4_ks_cuda; + case GGML_TYPE_IQ3_KS: + return dequantize_row_iq3_ks_cuda; case GGML_TYPE_IQ2_K: return dequantize_row_iq2_k_cuda; case GGML_TYPE_IQ3_K: diff --git a/ggml/src/ggml-cuda/iqk_mmvq.cu b/ggml/src/ggml-cuda/iqk_mmvq.cu index a1f2d28c..7fc5e8f8 100644 --- a/ggml/src/ggml-cuda/iqk_mmvq.cu +++ b/ggml/src/ggml-cuda/iqk_mmvq.cu @@ -539,6 +539,73 @@ __device__ __forceinline__ float vec_dot_iq3_k_q8_1( } +__device__ __forceinline__ float vec_dot_iq3_ks_q8_1( + const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & kbx, const int & iiqs) { + return 0.f; +// const block_iq3_k * bq3 = (const block_iq3_k *) vbq + kbx; +// +// int iqs = iiqs/4; +// const int ib128 = iqs/4; // 0 or 1. 0 works on quants 0...127, 1 on quants 128...255 +// // Each thread processes 8 quants in each of the 4 32-blocks +// const int il8 = iqs%4; // 0...3. 0 works on quants 0...7, 1 on quants 8...15, 2 on 16...23, 3 on 24...31 +// const int shift = 4*(il8/2); +// +// const uint16_t * ql = (const uint16_t *)bq3->qs + 16*ib128 + 4*il8; +// const uint16_t * qh = (const uint16_t *)bq3->qh + 4*il8; +// +// uint32_t aux32; +// const uint8_t * aux8 = (const uint8_t *)&aux32; +// +// const int hshift = 4*(1-ib128); +// const uint16_t sh = bq3->scales_h >> (8*ib128 + il8/2); +// +// const uint8_t extra = bq3->extra >> (8*ib128 + il8/2); +// const uint16_t * values1 = iq3k_table + ((extra << 6) & 0x40); +// const uint16_t * values2 = iq3k_table + ((extra << 5) & 0x40); +// const uint16_t * values3 = iq3k_table + ((extra << 4) & 0x40); +// const uint16_t * values4 = iq3k_table + ((extra << 3) & 0x40); +// +// const int * q8; +// int sumi[4] = {0, 0, 0, 0}; +// int v; +// for (int i = 0; i < 2; ++i) { +// uint32_t vl = ql[2*i+0] | (ql[2*i+1] << 16); +// uint32_t vh = ((qh[2*i+0] | (qh[2*i+1] << 16)) << hshift) >> 2; +// +// q8 = (const int *)bq8_1[4*ib128+0].qs + 2*il8; +// aux32 = (vl & 0x03030303) | (vh & 0x04040404); +// v = 
int_from_table_2(aux8, values1); +// sumi[0] = ggml_cuda_dp4a(v, q8[i], sumi[0]); +// vl >>= 2; vh >>= 1; +// +// q8 += sizeof(block_q8_1)/4; +// aux32 = (vl & 0x03030303) | (vh & 0x04040404); +// v = int_from_table_2(aux8, values2); +// sumi[1] = ggml_cuda_dp4a(v, q8[i], sumi[1]); +// vl >>= 2; vh >>= 1; +// +// q8 += sizeof(block_q8_1)/4; +// aux32 = (vl & 0x03030303) | (vh & 0x04040404); +// v = int_from_table_2(aux8, values3); +// sumi[2] = ggml_cuda_dp4a(v, q8[i], sumi[2]); +// vl >>= 2; vh >>= 1; +// +// q8 += sizeof(block_q8_1)/4; +// aux32 = (vl & 0x03030303) | (vh & 0x04040404); +// v = int_from_table_2(aux8, values4); +// sumi[3] = ggml_cuda_dp4a(v, q8[i], sumi[3]); +// +// } +// const float d = __half2float(bq3->d); +// const uint16_t * sl16 = (const uint16_t *)bq3->scales_l + 2*ib128; +// aux32 = ((((sl16[0] | (sl16[1] << 16)) >> shift) & 0x0f0f0f0f) << 1) | 0x01010101; +// return d * (__low2float(bq8_1[4*ib128+0].ds) * aux8[0] * (sh & 0x01 ? -1 : 1) * sumi[0] + +// __low2float(bq8_1[4*ib128+1].ds) * aux8[1] * (sh & 0x04 ? -1 : 1) * sumi[1] + +// __low2float(bq8_1[4*ib128+2].ds) * aux8[2] * (sh & 0x10 ? -1 : 1) * sumi[2] + +// __low2float(bq8_1[4*ib128+3].ds) * aux8[3] * (sh & 0x40 ? 
-1 : 1) * sumi[3]); + +} + #define VDR_IQ2_TN_Q8_1_MMVQ 1 #define VDR_IQ2_TN_Q8_1_MMQ 4 @@ -631,6 +698,13 @@ void mul_mat_vec_iq3_k_q8_1_cuda( iqk_mul_mat_vec_q_cuda(vx, vy, dst, ncols_x, nrows_x, nrows_y, ncols_y, nrows_dst, stream); } +void mul_mat_vec_iq3_ks_q8_1_cuda( + const void * vx, const void * vy, float * dst, + const int ncols_x, const int nrows_x, const int nrows_y, const int ncols_y, const int nrows_dst, cudaStream_t stream) { + + iqk_mul_mat_vec_q_cuda(vx, vy, dst, ncols_x, nrows_x, nrows_y, ncols_y, nrows_dst, stream); +} + void mul_mat_vec_iq4_k_q8_1_cuda( const void * vx, const void * vy, float * dst, const int ncols_x, const int nrows_x, const int nrows_y, const int ncols_y, const int nrows_dst, cudaStream_t stream) { diff --git a/ggml/src/ggml-cuda/iqk_mmvq.cuh b/ggml/src/ggml-cuda/iqk_mmvq.cuh index 8d76be1d..23873c6d 100644 --- a/ggml/src/ggml-cuda/iqk_mmvq.cuh +++ b/ggml/src/ggml-cuda/iqk_mmvq.cuh @@ -32,3 +32,7 @@ void mul_mat_vec_iq4_ks_q8_1_cuda( const void * vx, const void * vy, float * dst, const int ncols_x, const int nrows_x, const int nrows_y, const int ncols_y, const int nrows_dst, cudaStream_t stream); +void mul_mat_vec_iq3_ks_q8_1_cuda( + const void * vx, const void * vy, float * dst, + const int ncols_x, const int nrows_x, const int nrows_y, const int ncols_y, const int nrows_dst, cudaStream_t stream); + diff --git a/ggml/src/ggml-cuda/mmvq.cu b/ggml/src/ggml-cuda/mmvq.cu index 8e3c4aa4..4175d850 100644 --- a/ggml/src/ggml-cuda/mmvq.cu +++ b/ggml/src/ggml-cuda/mmvq.cu @@ -462,6 +462,9 @@ void ggml_cuda_op_mul_mat_vec_q( case GGML_TYPE_IQ4_KS: mul_mat_vec_iq4_ks_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_padded_row_size, src1_ncols, nrows_dst, stream); break; + case GGML_TYPE_IQ3_KS: + mul_mat_vec_iq3_ks_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_padded_row_size, src1_ncols, nrows_dst, stream); + break; case GGML_TYPE_IQ5_K: mul_mat_vec_iq5_k_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, 
row_diff, src1_padded_row_size, src1_ncols, nrows_dst, stream); break; diff --git a/ggml/src/ggml-quants.c b/ggml/src/ggml-quants.c index 40978ac0..84fc2cc4 100644 --- a/ggml/src/ggml-quants.c +++ b/ggml/src/ggml-quants.c @@ -14947,7 +14947,7 @@ bool ggml_validate_row_data(enum ggml_type type, const void * data, size_t nbyte return false; } - if (type != GGML_TYPE_IQ2_TN && type != GGML_TYPE_IQ1_TN && type != GGML_TYPE_IQ4_KS && nbytes % ggml_type_size(type) != 0) { + if (type != GGML_TYPE_IQ2_TN && type != GGML_TYPE_IQ1_TN && type != GGML_TYPE_IQ4_KS && type != GGML_TYPE_IQ3_KS && nbytes % ggml_type_size(type) != 0) { fprintf(stderr, "%s: invalid size %zu for type %s (type size = %zu)\n", __func__, nbytes, ggml_type_name(type), ggml_type_size(type)); return false; } @@ -15161,12 +15161,13 @@ bool ggml_validate_row_data(enum ggml_type type, const void * data, size_t nbyte case GGML_TYPE_Q6_0: break; case GGML_TYPE_IQ2_K: break; case GGML_TYPE_IQ3_K: break; + case GGML_TYPE_IQ3_KS: break; case GGML_TYPE_IQ4_K: break; + case GGML_TYPE_IQ4_KS: break; case GGML_TYPE_IQ5_K: break; case GGML_TYPE_IQ6_K: break; case GGML_TYPE_IQ2_TN: break; case GGML_TYPE_IQ1_TN: break; - case GGML_TYPE_IQ4_KS: break; case GGML_TYPE_Q4_0_4_4: case GGML_TYPE_Q4_0_4_8: { diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c index 97fa81b1..77af676c 100644 --- a/ggml/src/ggml.c +++ b/ggml/src/ggml.c @@ -1206,6 +1206,19 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = { .nrows = 1, .row_meta_size = 0, }, + [GGML_TYPE_IQ3_KS] = { + .type_name = "iq3_ks", + .blck_size = QK_K, + .type_size = sizeof(block_iq3_ks), + .is_quantized = true, + .to_float = (ggml_to_float_t) dequantize_row_iq3_ks, + .from_float = quantize_row_iq3_ks, + .from_float_ref = (ggml_from_float_t)quantize_row_iq3_ks_ref, + .vec_dot = vec_dot_iq3_ks_q8_k, + .vec_dot_type = GGML_TYPE_Q8_K, + .nrows = 1, + .row_meta_size = 4, + }, [GGML_TYPE_IQ4_K] = { .type_name = "iq4_k", .blck_size = QK_K, @@ -3907,6 +3920,7 @@ enum 
ggml_type ggml_ftype_to_ggml_type(enum ggml_ftype ftype) { case GGML_FTYPE_MOSTLY_IQ4_KS: wtype = GGML_TYPE_IQ4_KS; break; case GGML_FTYPE_MOSTLY_IQ2_K: wtype = GGML_TYPE_IQ2_K; break; case GGML_FTYPE_MOSTLY_IQ3_K: wtype = GGML_TYPE_IQ3_K; break; + case GGML_FTYPE_MOSTLY_IQ3_KS: wtype = GGML_TYPE_IQ3_KS; break; case GGML_FTYPE_MOSTLY_IQ4_K: wtype = GGML_TYPE_IQ4_K; break; case GGML_FTYPE_MOSTLY_IQ5_K: wtype = GGML_TYPE_IQ5_K; break; case GGML_FTYPE_MOSTLY_IQ6_K: wtype = GGML_TYPE_IQ6_K; break; @@ -10407,6 +10421,7 @@ static void ggml_compute_forward_add( case GGML_TYPE_IQ4_KS: case GGML_TYPE_IQ2_K: case GGML_TYPE_IQ3_K: + case GGML_TYPE_IQ3_KS: case GGML_TYPE_IQ4_K: case GGML_TYPE_IQ5_K: case GGML_TYPE_IQ6_K: @@ -10796,6 +10811,7 @@ static void ggml_compute_forward_add1( case GGML_TYPE_IQ4_KS: case GGML_TYPE_IQ2_K: case GGML_TYPE_IQ3_K: + case GGML_TYPE_IQ3_KS: case GGML_TYPE_IQ4_K: case GGML_TYPE_IQ5_K: case GGML_TYPE_IQ6_K: @@ -10935,6 +10951,7 @@ static void ggml_compute_forward_acc( case GGML_TYPE_IQ4_KS: case GGML_TYPE_IQ2_K: case GGML_TYPE_IQ3_K: + case GGML_TYPE_IQ3_KS: case GGML_TYPE_IQ4_K: case GGML_TYPE_IQ5_K: case GGML_TYPE_IQ6_K: @@ -14120,6 +14137,7 @@ static void ggml_compute_forward_out_prod( case GGML_TYPE_IQ4_KS: case GGML_TYPE_IQ2_K: case GGML_TYPE_IQ3_K: + case GGML_TYPE_IQ3_KS: case GGML_TYPE_IQ4_K: case GGML_TYPE_IQ5_K: case GGML_TYPE_IQ6_K: @@ -14499,6 +14517,7 @@ static void ggml_compute_forward_set( case GGML_TYPE_IQ4_KS: case GGML_TYPE_IQ2_K: case GGML_TYPE_IQ3_K: + case GGML_TYPE_IQ3_KS: case GGML_TYPE_IQ4_K: case GGML_TYPE_IQ5_K: case GGML_TYPE_IQ6_K: @@ -14772,6 +14791,7 @@ static void ggml_compute_forward_get_rows( case GGML_TYPE_IQ4_KS: case GGML_TYPE_IQ2_K: case GGML_TYPE_IQ3_K: + case GGML_TYPE_IQ3_KS: case GGML_TYPE_IQ4_K: case GGML_TYPE_IQ5_K: case GGML_TYPE_IQ6_K: @@ -15372,6 +15392,7 @@ static void ggml_compute_forward_clamp( case GGML_TYPE_IQ4_KS: case GGML_TYPE_IQ2_K: case GGML_TYPE_IQ3_K: + case GGML_TYPE_IQ3_KS: case 
GGML_TYPE_IQ4_K: case GGML_TYPE_IQ5_K: case GGML_TYPE_IQ6_K: @@ -22189,6 +22210,7 @@ size_t ggml_quantize_chunk( case GGML_TYPE_IQ4_KS: result = quantize_iq4_ks (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break; case GGML_TYPE_IQ2_K: result = quantize_iq2_k (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break; case GGML_TYPE_IQ3_K: result = quantize_iq3_k (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break; + case GGML_TYPE_IQ3_KS: result = quantize_iq3_ks (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break; case GGML_TYPE_IQ4_K: result = quantize_iq4_k (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break; case GGML_TYPE_IQ5_K: result = quantize_iq5_k (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break; case GGML_TYPE_IQ6_K: result = quantize_iq6_k (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break; diff --git a/ggml/src/iqk/iqk_quantize.cpp b/ggml/src/iqk/iqk_quantize.cpp index 430b629f..f813e4bd 100644 --- a/ggml/src/iqk/iqk_quantize.cpp +++ b/ggml/src/iqk/iqk_quantize.cpp @@ -2168,39 +2168,35 @@ void iqk_quantize_row_q8_K(const float * x, void * vy, int64_t k) { } namespace { -static void quantize_row_iq4_k_impl_bs128(const int super_block_size, const int block_size, - int n_per_row, const float * x, char * cy, +template +static void quantize_row_iqX_ks_impl_T(Row& row, + int n_per_row, const float * x, //char * cy, float * all_scales, float * weight, - const int8_t * values, const float * quant_weights, const int ntry) { - //GGML_ASSERT(super_block_size == 256 && block_size == 128); - - float * dptr = (float *)cy; - block_iq4_ks * y = (block_iq4_ks *)(dptr + 1); - - const int8_t * shifted_values = values + 16; + auto values = Row::values(); + auto shifted_values = Row::shifted_values(); float amax_scale = 0; - for (int ibl = 0; ibl < 
n_per_row/super_block_size; ++ibl) { - memset(&y[ibl], 0, sizeof(block_iq4_ks)); - const float * xbl = x + ibl*super_block_size; - auto scales = all_scales + ibl*(super_block_size/block_size); + for (int ibl = 0; ibl < n_per_row/Row::super_block_size; ++ibl) { + row.set_zero(ibl); + const float * xbl = x + ibl*Row::super_block_size; + auto scales = all_scales + ibl*(Row::super_block_size/Row::block_size); float sigma2 = 0; - for (int j = 0; j < super_block_size; ++j) sigma2 += xbl[j]*xbl[j]; - sigma2 *= 2.f/super_block_size; - for (int ib = 0; ib < super_block_size/block_size; ++ib) { - const float * xb = xbl + ib*block_size; + for (int j = 0; j < Row::super_block_size; ++j) sigma2 += xbl[j]*xbl[j]; + sigma2 *= 2.f/Row::super_block_size; + for (int ib = 0; ib < Row::super_block_size/Row::block_size; ++ib) { + const float * xb = xbl + ib*Row::block_size; if (quant_weights) { - const float * qw = quant_weights + ibl*super_block_size + ib*block_size; - for (int j = 0; j < block_size; ++j) weight[j] = qw[j] * sqrtf(sigma2 + xb[j]*xb[j]); + const float * qw = quant_weights + ibl*Row::super_block_size + ib*Row::block_size; + for (int j = 0; j < Row::block_size; ++j) weight[j] = qw[j] * sqrtf(sigma2 + xb[j]*xb[j]); } else { - for (int j = 0; j < block_size; ++j) weight[j] = xb[j]*xb[j]; + for (int j = 0; j < Row::block_size; ++j) weight[j] = xb[j]*xb[j]; } float amax = 0, max = 0; - for (int j = 0; j < block_size; ++j) { + for (int j = 0; j < Row::block_size; ++j) { float ax = fabsf(xb[j]); if (ax > amax) { amax = ax; max = xb[j]; @@ -2214,14 +2210,14 @@ static void quantize_row_iq4_k_impl_bs128(const int super_block_size, const int float id = 1/d; float sumqx_p = 0, sumq2_p = 0; float sumqx_m = 0, sumq2_m = 0; - for (int j = 0; j < block_size; ++j) { + for (int j = 0; j < Row::block_size; ++j) { float w = weight[j]; float al = id*xb[j]; - int l = best_index_iq4nl(values, al); + int l = row.best_index(values, al); float q = values[l]; sumqx_p += w*q*xb[j]; sumq2_p += 
w*q*q; - l = best_index_iq4nl(values, -al); + l = row.best_index(values, -al); q = values[l]; sumqx_m += w*q*xb[j]; sumq2_m += w*q*q; @@ -2236,14 +2232,14 @@ static void quantize_row_iq4_k_impl_bs128(const int super_block_size, const int id = (itry + values[0])/max; sumqx_p = sumq2_p = 0; sumqx_m = sumq2_m = 0; - for (int j = 0; j < block_size; ++j) { + for (int j = 0; j < Row::block_size; ++j) { float w = weight[j]; float al = id*xb[j]; - int l = best_index_iq4nl(values, al); + int l = row.best_index(values, al); float q = values[l]; sumqx_p += w*q*xb[j]; sumq2_p += w*q*q; - l = best_index_iq4nl(values, -al); + l = row.best_index(values, -al); q = values[l]; sumqx_m += w*q*xb[j]; sumq2_m += w*q*q; @@ -2257,14 +2253,14 @@ static void quantize_row_iq4_k_impl_bs128(const int super_block_size, const int id = (itry + shifted_values[0])/max; sumqx_p = sumq2_p = 0; sumqx_m = sumq2_m = 0; - for (int j = 0; j < block_size; ++j) { + for (int j = 0; j < Row::block_size; ++j) { float w = weight[j]; float al = id*xb[j]; - int l = best_index_iq4nl(shifted_values, al); + int l = row.best_index(shifted_values, al); float q = shifted_values[l]; sumqx_p += w*q*xb[j]; sumq2_p += w*q*q; - l = best_index_iq4nl(shifted_values, -al); + l = row.best_index(shifted_values, -al); q = shifted_values[l]; sumqx_m += w*q*xb[j]; sumq2_m += w*q*q; @@ -2276,58 +2272,108 @@ static void quantize_row_iq4_k_impl_bs128(const int super_block_size, const int d = sumqx_m/sumq2_m; best = d * sumqx_m; is_shifted = true; } } - if (is_shifted) y[ibl].scales[ib] = 0x01; + if (is_shifted) row.set_shifted(ibl, ib); scales[ib] = d; amax_scale = std::max(amax_scale, std::abs(d)); } } - float d = amax_scale/127; - *dptr = d; + float d = row.set_row_scale_from_max(amax_scale); if (!d) return; float id = d ? 
1/d : 0.f; float sumqx = 0, sumq2 = 0; - //float mse = 0; - for (int ibl = 0; ibl < n_per_row/super_block_size; ++ibl) { - const float * xbl = x + ibl*super_block_size; + for (int ibl = 0; ibl < n_per_row/Row::super_block_size; ++ibl) { + const float * xbl = x + ibl*Row::super_block_size; float sigma2 = 0; - for (int j = 0; j < super_block_size; ++j) sigma2 += xbl[j]*xbl[j]; - sigma2 *= 2.f/super_block_size; - auto scales = all_scales + (super_block_size/block_size)*ibl; - for (int ib = 0; ib < super_block_size/block_size; ++ib) { - const int8_t * block_values = y[ibl].scales[ib] & 0x01 ? shifted_values : values; - int l = nearest_int(0.5f*(id*scales[ib]+127.f)); - l = std::max(0, std::min(127, l)) << 1; - //printf("d = %g, id = %g, scales = %g, l = %d, dl = %g\n", d, id, scales[ib], l, d*(l - 127)); - y[ibl].scales[ib] |= l; - l -= 127; + for (int j = 0; j < Row::super_block_size; ++j) sigma2 += xbl[j]*xbl[j]; + sigma2 *= 2.f/Row::super_block_size; + auto scales = all_scales + (Row::super_block_size/Row::block_size)*ibl; + for (int ib = 0; ib < Row::super_block_size/Row::block_size; ++ib) { + int l = row.set_block_scale(ibl, ib, id*scales[ib]); float dl = d * l; float idl = dl ? 
1/dl : 0.f; - const float * xb = xbl + ib*block_size; + const float * xb = xbl + ib*Row::block_size; if (quant_weights) { - const float * qw = quant_weights + ibl*super_block_size + ib*block_size; - for (int j = 0; j < block_size; ++j) weight[j] = qw[j] * sqrtf(sigma2 + xb[j]*xb[j]); + const float * qw = quant_weights + ibl*Row::super_block_size + ib*Row::block_size; + for (int j = 0; j < Row::block_size; ++j) weight[j] = qw[j] * sqrtf(sigma2 + xb[j]*xb[j]); } else { - for (int j = 0; j < block_size; ++j) weight[j] = xb[j]*xb[j]; - } - auto qs = y[ibl].qs + ib*(block_size/2); - for (int j = 0; j < block_size/2; ++j) { - uint8_t i1 = best_index_iq4nl(block_values, idl*xb[j]); - uint8_t i2 = best_index_iq4nl(block_values, idl*xb[j+block_size/2]); - qs[j] = i1 | (i2 << 4); - float w1 = weight[j]; - float w2 = weight[j+block_size/2]; - float q1 = block_values[i1]*l; - float q2 = block_values[i2]*l; - sumqx += w1*q1*xb[j] + w2*q2*xb[j+block_size/2]; - sumq2 += w1*q1*q1 + w2*q2*q2; - //float diff = xb[j] - d*q1; mse += diff*diff; - //diff = xb[j+block_size/2] - d*q2; mse += diff*diff; + for (int j = 0; j < Row::block_size; ++j) weight[j] = xb[j]*xb[j]; } + row.set_final_values(ibl, ib, idl, l, xb, weight, sumqx, sumq2); } } - //printf("rmse = %g\n", sqrt(mse/n_per_row)); - if (sumq2 > 0) *dptr = sumqx/sumq2; + if (sumq2 > 0) row.set_row_scale(sumqx/sumq2); } +template +struct BaseRowBS32 { + constexpr static int super_block_size = QK_K; + constexpr static int block_size = 32; + + inline void set_row(char * c) { + cy = c; dptr = (float *)cy; *dptr = 0.f; + y = (Block *)(dptr + 1); + } + inline void set_zero(int ibl) { std::memset(&y[ibl], 0, sizeof(Block)); } + inline float set_row_scale_from_max(float amax) { + float d = amax/127; + *dptr = d; + return d; + } + inline int set_block_scale(int ibl, int ib, float x) { + int l = nearest_int(0.5f*(x+127.f)); + l = std::max(0, std::min(127, l)) << 1; + y[ibl].scales[ib] |= l; + return l - 127; + } + + char * cy = nullptr; + 
float * dptr = nullptr; + Block * y = nullptr; +}; +struct IQ4KSRow final : public BaseRowBS32 { + static const int8_t * values() { return iq4k_values; } + static const int8_t * shifted_values() { return iq4k_values + 16; } + static inline int best_index(const int8_t * values, float x) { return best_index_iq4nl(values, x); } + inline void set_shifted(int ibl, int ib) { y[ibl].scales[ib] = 0x01; } + inline bool is_shifted(int ibl, int ib) const { return y[ibl].scales[ib] & 0x01; } + inline void set_row_scale(float d) { *dptr = d; } + void set_final_values(int ibl, int ib, float idl, int l, const float * xb, const float * weight, float& sumqx, float& sumq2) { + auto qs = y[ibl].qs + ib*(block_size/2); + auto block_values = is_shifted(ibl, ib) ? shifted_values() : values(); + for (int j = 0; j < block_size/2; ++j) { + uint8_t i1 = best_index(block_values, idl*xb[j]); + uint8_t i2 = best_index(block_values, idl*xb[j+block_size/2]); + qs[j] = i1 | (i2 << 4); + float w1 = weight[j]; + float w2 = weight[j+block_size/2]; + float q1 = block_values[i1]*l; + float q2 = block_values[i2]*l; + sumqx += w1*q1*xb[j] + w2*q2*xb[j+block_size/2]; + sumq2 += w1*q1*q1 + w2*q2*q2; + } + } +}; +struct IQ3KSRow final : public BaseRowBS32 { + static const int8_t * values() { return iq3nl_values; } + static const int8_t * shifted_values() { return iq3nl_values + 8; } + static inline int best_index(const int8_t * values, float x) { return best_index_iq3nl(values, x); } + inline void set_shifted(int ibl, int ib) { y[ibl].scales[ib] = 0x01; } + inline bool is_shifted(int ibl, int ib) const { return y[ibl].scales[ib] & 0x01; } + inline void set_row_scale(float d) { *dptr = d; } + void set_final_values(int ibl, int ib, float idl, int l, const float * xb, const float * weight, float& sumqx, float& sumq2) { + auto qs = y[ibl].qs + 32*(ib/4); + auto qh = y[ibl].qh + 32*(ib/8); + auto block_values = is_shifted(ibl, ib) ? 
shifted_values() : values(); + for (int j = 0; j < block_size; ++j) { + uint8_t i = best_index(block_values, idl*xb[j]); + qs[j] |= ((i & 3) << 2*(ib%4)); + qh[j] |= ((i >> 2) << (ib%8)); + float w = weight[j]; + float q = block_values[i]*l; + sumqx += w*q*xb[j]; + sumq2 += w*q*q; + } + } +}; } void quantize_row_iq4_ks_ref(const float * x, block_iq4_ks * y, int64_t k) { @@ -2339,15 +2385,16 @@ void quantize_row_iq4_ks(const float * x, void * y, int64_t k) { } size_t quantize_iq4_ks(const float * src, void * dst, int64_t nrows, int64_t n_per_row, const float * imatrix) { - //printf("============ %s(%d, %d)\n", __func__, int(nrows), int(n_per_row)); - constexpr int kBlockSize = 32; //128; + constexpr int kBlockSize = 32; GGML_ASSERT(n_per_row%QK_K == 0); auto row_size = ggml_row_size(GGML_TYPE_IQ4_KS, n_per_row); char * qrow = (char *)dst; float weight[kBlockSize]; std::vector all_scales(n_per_row/kBlockSize); + IQ4KSRow iq4ks; for (int64_t row = 0; row < nrows; ++row) { - quantize_row_iq4_k_impl_bs128(QK_K, kBlockSize, n_per_row, src, qrow, all_scales.data(), weight, iq4k_values, imatrix, 7); + iq4ks.set_row(qrow); + quantize_row_iqX_ks_impl_T(iq4ks, n_per_row, src, all_scales.data(), weight, imatrix, 7); src += n_per_row; qrow += row_size; } @@ -2419,3 +2466,106 @@ void vec_dot_iq4_ks_q8_k(int n, float * s, size_t bs, const void * vx, size_t b *s = sumf; } +void quantize_row_iq3_ks_ref(const float * GGML_RESTRICT x, block_iq3_ks * GGML_RESTRICT y, int64_t k) { + quantize_iq3_ks(x, (void *)y, 1, k, nullptr); +} + +void quantize_row_iq3_ks(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k) { + quantize_iq3_ks(x, (void *)y, 1, k, nullptr); +} + +size_t quantize_iq3_ks(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix) { + constexpr int kBlockSize = 32; + GGML_ASSERT(n_per_row%QK_K == 0); + auto row_size = ggml_row_size(GGML_TYPE_IQ3_KS, n_per_row); + char * qrow = (char *)dst; + float 
weight[kBlockSize]; + std::vector<float> all_scales(n_per_row/kBlockSize); + IQ3KSRow iq3ks; + for (int64_t row = 0; row < nrows; ++row) { + iq3ks.set_row(qrow); + quantize_row_iqX_ks_impl_T(iq3ks, n_per_row, src, all_scales.data(), weight, imatrix, 7); + src += n_per_row; + qrow += row_size; + } + return nrows * row_size; +} + +void dequantize_row_iq3_ks(const block_iq3_ks * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k) { + constexpr int kBlockSize = 32; + GGML_ASSERT(k%QK_K == 0); + const float * dptr = (const float *)x; + float d = *dptr; + x = (const block_iq3_ks *)(dptr + 1); + int nblock = k/QK_K; + for (int ibl = 0; ibl < nblock; ++ibl) { + auto qs = x[ibl].qs; + auto qh = x[ibl].qh; + int shift = 0; + for (int ib = 0; ib < QK_K/kBlockSize; ++ib) { + float dl = d * ((int)(x[ibl].scales[ib] & 254) - 127); + const int8_t * values = iq3nl_values + ((x[ibl].scales[ib] & 1) << 3); + for (int j = 0; j < kBlockSize; ++j) { + y[j] = dl * values[((qs[j] >> shift) & 0x3) | (((qh[j] >> (ib%8)) & 1) << 2)]; + } + y += kBlockSize; + shift += 2; + if (shift == 8) { shift = 0; qs += kBlockSize; } + } + } +} + +void vec_dot_iq3_ks_q8_k(int n, float * s, size_t bs, const void * vx, size_t bx, const void * vy, size_t by, int nrc) { + //constexpr int kBlockSize = 32; + GGML_ASSERT(nrc == 1); + GGML_ASSERT(n % QK_K == 0); +#if GGML_USE_IQK_MULMAT + if (iqk_mul_mat(1, 1, n, GGML_TYPE_IQ3_KS, vx, 0, GGML_TYPE_Q8_K, vy, 0, s, 0, 0, 1)) { + return; + } +#endif + GGML_UNUSED(bs); + GGML_UNUSED(bx); + GGML_UNUSED(by); + + const float * dptr = (const float *)vx; + float d = *dptr; + const block_iq3_ks * x = (const block_iq3_ks *)(dptr + 1); + const block_q8_K * y = (const block_q8_K *)vy; + + float sumf = 0; + for (int ibl = 0; ibl < n/QK_K; ++ibl) { + auto qy = y[ibl].qs; + auto qs = x[ibl].qs; + auto qh = x[ibl].qh; + auto scales = x[ibl].scales; + float sumb = 0; + for (int ib128 = 0; ib128 < QK_K/128; ++ib128) { + const int8_t * values1 = iq3nl_values + ((scales[0] & 1) << 3); + 
const int8_t * values2 = iq3nl_values + ((scales[1] & 1) << 3); + const int8_t * values3 = iq3nl_values + ((scales[2] & 1) << 3); + const int8_t * values4 = iq3nl_values + ((scales[3] & 1) << 3); + float ls1 = (scales[0] & 254) - 127; + float ls2 = (scales[1] & 254) - 127; + float ls3 = (scales[2] & 254) - 127; + float ls4 = (scales[3] & 254) - 127; + int sumi1 = 0, sumi2 = 0, sumi3 = 0, sumi4 = 0; + for (int j = 0; j < 32; ++j) { + uint8_t h = qh[j] >> 4*(ib128%2); + sumi1 += qy[j+ 0] * values1[((qs[j] >> 0) & 3) | ((h << 2) & 4)]; + sumi2 += qy[j+32] * values2[((qs[j] >> 2) & 3) | ((h << 1) & 4)]; + sumi3 += qy[j+64] * values3[((qs[j] >> 4) & 3) | ((h >> 0) & 4)]; + sumi4 += qy[j+96] * values4[((qs[j] >> 6) & 3) | ((h >> 1) & 4)]; + } + sumb += ls1*sumi1 + ls2*sumi2 + ls3*sumi3 + ls4*sumi4; + qy += 128; + qs += 32; + qh += 32*(ib128%2); + scales += 4; + } + sumf += y[ibl].d * sumb; + } + *s = sumf * d; +} + + diff --git a/ggml/src/iqk/iqk_quantize.h b/ggml/src/iqk/iqk_quantize.h index a3623963..191a0fd2 100644 --- a/ggml/src/iqk/iqk_quantize.h +++ b/ggml/src/iqk/iqk_quantize.h @@ -61,6 +61,12 @@ size_t quantize_iq4_ks(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst void dequantize_row_iq4_ks(const block_iq4_ks * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k); void vec_dot_iq4_ks_q8_k(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc); +void quantize_row_iq3_ks_ref(const float * GGML_RESTRICT x, block_iq3_ks * GGML_RESTRICT y, int64_t k); +void quantize_row_iq3_ks(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k); +size_t quantize_iq3_ks(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix); +void dequantize_row_iq3_ks(const block_iq3_ks * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k); +void vec_dot_iq3_ks_q8_k(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT 
vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc); + void iqk_quantize_row_q8_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k); #ifdef __cplusplus diff --git a/include/llama.h b/include/llama.h index 9fb4af53..d6a9b10f 100644 --- a/include/llama.h +++ b/include/llama.h @@ -179,6 +179,7 @@ extern "C" { LLAMA_FTYPE_MOSTLY_IQ1_TN = 144, // except 1d tensors LLAMA_FTYPE_MOSTLY_IQ4_KS = 145, // except 1d tensors LLAMA_FTYPE_MOSTLY_IQ3_KL = 146, // except 1d tensors + LLAMA_FTYPE_MOSTLY_IQ3_KS = 147, // except 1d tensors LLAMA_FTYPE_GUESSED = 1024, // not specified in the model file }; diff --git a/src/llama.cpp b/src/llama.cpp index 80104303..32dd81b4 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -3796,6 +3796,7 @@ struct llama_model_loader { case GGML_TYPE_IQ4_KS: ftype = LLAMA_FTYPE_MOSTLY_IQ4_KS; break; case GGML_TYPE_IQ2_K: ftype = LLAMA_FTYPE_MOSTLY_IQ2_K; break; case GGML_TYPE_IQ3_K: ftype = LLAMA_FTYPE_MOSTLY_IQ3_K; break; + case GGML_TYPE_IQ3_KS: ftype = LLAMA_FTYPE_MOSTLY_IQ3_KS; break; case GGML_TYPE_IQ4_K: ftype = LLAMA_FTYPE_MOSTLY_IQ4_K; break; case GGML_TYPE_IQ5_K: ftype = LLAMA_FTYPE_MOSTLY_IQ5_K; break; case GGML_TYPE_IQ6_K: ftype = LLAMA_FTYPE_MOSTLY_IQ6_K; break; @@ -4498,6 +4499,7 @@ static std::string llama_model_ftype_name(llama_ftype ftype) { case LLAMA_FTYPE_MOSTLY_IQ4_KS: return "IQ4_KS - 4.25 bpw"; case LLAMA_FTYPE_MOSTLY_IQ2_K: return "IQ2_K - 2.375 bpw"; case LLAMA_FTYPE_MOSTLY_IQ3_K: return "IQ3_K - 3.4325 bpw"; + case LLAMA_FTYPE_MOSTLY_IQ3_KS: return "IQ3_KS - 3.25 bpw"; case LLAMA_FTYPE_MOSTLY_IQ3_KL: return "IQ3_KL - 4 bpw"; case LLAMA_FTYPE_MOSTLY_IQ4_K: return "IQ4_K - 4.5 bpw"; case LLAMA_FTYPE_MOSTLY_IQ5_K: return "IQ5_K - 5.5 bpw"; @@ -15623,7 +15625,8 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n } else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ1_S 
|| ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M || - ftype == LLAMA_FTYPE_MOSTLY_IQ1_M || ftype == LLAMA_FTYPE_MOSTLY_IQ2_K || ftype == LLAMA_FTYPE_MOSTLY_IQ3_K) { + ftype == LLAMA_FTYPE_MOSTLY_IQ1_M || ftype == LLAMA_FTYPE_MOSTLY_IQ2_K || ftype == LLAMA_FTYPE_MOSTLY_IQ3_K || + ftype == LLAMA_FTYPE_MOSTLY_IQ3_KS) { new_type = !qs.has_output ? GGML_TYPE_IQ4_K : GGML_TYPE_Q5_K; } else if ((ftype == LLAMA_FTYPE_MOSTLY_IQ3_S || ftype == LLAMA_FTYPE_MOSTLY_IQ3_M || ftype == LLAMA_FTYPE_MOSTLY_IQ4_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ4_KS) && !qs.has_output) { @@ -15706,6 +15709,9 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_K && qs.model.hparams.n_gqa() >= 2) { new_type = GGML_TYPE_IQ4_K; } + else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_KS && qs.model.hparams.n_gqa() >= 2) { + new_type = GGML_TYPE_IQ4_K; + } else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_KL) { new_type = qs.model.hparams.n_gqa() >= 2 ? 
GGML_TYPE_IQ5_K : GGML_TYPE_IQ4_K; } @@ -15883,7 +15889,8 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n new_type == GGML_TYPE_IQ3_XXS || new_type == GGML_TYPE_IQ1_S || new_type == GGML_TYPE_IQ3_S || new_type == GGML_TYPE_IQ1_M || new_type == GGML_TYPE_IQ4_K || new_type == GGML_TYPE_IQ2_K || new_type == GGML_TYPE_IQ5_K || new_type == GGML_TYPE_IQ3_K || new_type == GGML_TYPE_IQ2_TN || - new_type == GGML_TYPE_IQ6_K || new_type == GGML_TYPE_IQ1_TN || new_type == GGML_TYPE_IQ4_KS) { + new_type == GGML_TYPE_IQ6_K || new_type == GGML_TYPE_IQ1_TN || new_type == GGML_TYPE_IQ4_KS || + new_type == GGML_TYPE_IQ3_KS) { int nx = tensor->ne[0]; int ny = tensor->ne[1]; if (nx % QK_K != 0) { @@ -15913,6 +15920,7 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n case GGML_TYPE_Q2_K: case GGML_TYPE_Q3_K: case GGML_TYPE_IQ2_K: + case GGML_TYPE_IQ3_KS: case GGML_TYPE_IQ3_K: case GGML_TYPE_IQ4_KS: case GGML_TYPE_IQ4_XS: new_type = GGML_TYPE_IQ4_NL; break; @@ -16028,6 +16036,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s case LLAMA_FTYPE_MOSTLY_IQ4_KS: default_type = GGML_TYPE_IQ4_KS; break; case LLAMA_FTYPE_MOSTLY_IQ2_K: default_type = GGML_TYPE_IQ2_K; break; case LLAMA_FTYPE_MOSTLY_IQ3_K: default_type = GGML_TYPE_IQ3_K; break; + case LLAMA_FTYPE_MOSTLY_IQ3_KS: default_type = GGML_TYPE_IQ3_KS; break; case LLAMA_FTYPE_MOSTLY_IQ3_KL: default_type = GGML_TYPE_IQ3_K; break; case LLAMA_FTYPE_MOSTLY_IQ4_K: default_type = GGML_TYPE_IQ4_K; break; case LLAMA_FTYPE_MOSTLY_IQ5_K: default_type = GGML_TYPE_IQ5_K; break;