diff --git a/examples/quantize/quantize.cpp b/examples/quantize/quantize.cpp
index 3cc19f70..1ace5720 100644
--- a/examples/quantize/quantize.cpp
+++ b/examples/quantize/quantize.cpp
@@ -45,6 +45,7 @@ static const std::vector<struct quant_option> QUANT_OPTIONS = {
     { "IQ4_XS",  LLAMA_FTYPE_MOSTLY_IQ4_XS,  " 4.25 bpw non-linear quantization", },
     { "IQ4_KS",  LLAMA_FTYPE_MOSTLY_IQ4_KS,  " 4.25 bpw non-linear quantization", },
     { "IQ2_K",   LLAMA_FTYPE_MOSTLY_IQ2_K,   " 2.375 bpw non-linear quantization",},
+    { "IQ2_KS",  LLAMA_FTYPE_MOSTLY_IQ2_KS,  " 2.1875 bpw non-linear quantization",},
     { "IQ3_K",   LLAMA_FTYPE_MOSTLY_IQ3_K,   " 3.44 bpw non-linear quantization", },
     { "IQ3_KL",  LLAMA_FTYPE_MOSTLY_IQ3_KL,  " 4 bpw non-linear quantization mix",},
     { "IQ4_K",   LLAMA_FTYPE_MOSTLY_IQ4_K,   " 4.5 bpw non-linear quantization", },
diff --git a/ggml/include/ggml.h b/ggml/include/ggml.h
index 3054dabd..fd7c23b9 100644
--- a/ggml/include/ggml.h
+++ b/ggml/include/ggml.h
@@ -404,6 +404,7 @@ extern "C" {
         GGML_TYPE_IQ2_TN = 142,
         GGML_TYPE_IQ1_TN = 143,
         GGML_TYPE_IQ4_KS = 144,
+        GGML_TYPE_IQ2_KS = 145,
         GGML_TYPE_COUNT,
     };
 
@@ -460,6 +461,7 @@ extern "C" {
         GGML_FTYPE_MOSTLY_IQ2_TN = 135, // except 1d tensors
         GGML_FTYPE_MOSTLY_IQ1_TN = 136, // except 1d tensors
         GGML_FTYPE_MOSTLY_IQ4_KS = 137, // except 1d tensors
+        GGML_FTYPE_MOSTLY_IQ2_KS = 138, // except 1d tensors
     };
 
     // available tensor operations:
diff --git a/ggml/src/ggml-common.h b/ggml/src/ggml-common.h
index 7eaf7437..3a7b8989 100644
--- a/ggml/src/ggml-common.h
+++ b/ggml/src/ggml-common.h
@@ -455,6 +455,13 @@ typedef struct {
 } block_iq2_k;
 static_assert(sizeof(block_iq2_k) == sizeof(ggml_half) + sizeof(uint16_t) + QK_K/32 + QK_K/4, "wrong iq2_k block size/padding");
 
+typedef struct {
+    uint16_t extra;
+    uint8_t  scales[QK_K/64];
+    uint8_t  qs[QK_K/4];
+} block_iq2_ks;
+static_assert(sizeof(block_iq2_ks) == sizeof(uint16_t) + QK_K/64 + QK_K/4, "wrong iq2_ks block size/padding");
+
 typedef struct {
     ggml_half d;
     uint16_t extra;
diff --git a/ggml/src/ggml-cuda.cu b/ggml/src/ggml-cuda.cu
index 0657252d..6648b7f8 100644
--- a/ggml/src/ggml-cuda.cu
+++ b/ggml/src/ggml-cuda.cu
@@ -2830,6 +2830,7 @@ GGML_CALL static bool ggml_backend_cuda_supports_op(ggml_backend_t backend, const ggml_tensor * op) {
         case GGML_TYPE_IQ4_XS:
         case GGML_TYPE_IQ4_KS:
         case GGML_TYPE_IQ2_K:
+        case GGML_TYPE_IQ2_KS:
         case GGML_TYPE_IQ3_K:
         case GGML_TYPE_IQ4_K:
         case GGML_TYPE_IQ5_K:
diff --git a/ggml/src/ggml-cuda/common.cuh b/ggml/src/ggml-cuda/common.cuh
index c00cef29..a6a9c3d3 100644
--- a/ggml/src/ggml-cuda/common.cuh
+++ b/ggml/src/ggml-cuda/common.cuh
@@ -515,6 +515,13 @@ struct ggml_cuda_type_traits<GGML_TYPE_IQ4_KS> {
     static constexpr int qi = QI4_XS;
 };
 
+template<>
+struct ggml_cuda_type_traits<GGML_TYPE_IQ2_KS> {
+    static constexpr int qk = QK_K;
+    static constexpr int qr = QR4_XS;
+    static constexpr int qi = QI4_XS;
+};
+
 template<>
 struct ggml_cuda_type_traits<GGML_TYPE_IQ2_K> {
     static constexpr int qk = QK_K;
diff --git a/ggml/src/ggml-cuda/convert.cu b/ggml/src/ggml-cuda/convert.cu
index 4ec136fa..4c6aa16e 100644
--- a/ggml/src/ggml-cuda/convert.cu
+++ b/ggml/src/ggml-cuda/convert.cu
@@ -743,6 +743,34 @@ static __global__ void dequantize_block_iq2_k(const void * __restrict__ vx, dst_t * __restrict__ yy) {
     }
 }
 
+template<typename dst_t>
+static __global__ void dequantize_block_iq2_ks(const void * __restrict__ vx, dst_t * __restrict__ yy, int64_t n_per_row, int64_t row_size) {
+
+    int64_t ii  = blockIdx.x;
+    int64_t row = (QK_K * ii) / n_per_row;
+    const char * cx = (const char *)vx + row * row_size;
+    const float d = *(const float *)cx;
+    const block_iq2_ks * x = (const block_iq2_ks *)(cx + sizeof(float));
+    const int64_t i = ii - (row*n_per_row)/QK_K;
+    const int tid = threadIdx.x;
+    int ib128 = tid/16; // 0 or 1
+    int il    = tid%16; // 0...15
+    dst_t * y = yy + ii*QK_K + 128*ib128 + 2*il;
+    const int16_t extra = x[i].extra >> 4*ib128;
+    const float dl1 = d * (((x[i].scales[2*ib128+0] & 0xf) | ((extra >> 4) & 0x10)) - 16);
+    const float dl2 = d * (((x[i].scales[2*ib128+0] >>  4) | ((extra >> 5) & 0x10)) - 16);
+    const float dl3 = d * (((x[i].scales[2*ib128+1] & 0xf) | ((extra >> 6) & 0x10)) - 16);
+    const float dl4 = d * (((x[i].scales[2*ib128+1] >>  4) | ((extra >> 7) & 0x10)) - 16);
+    const uint8_t * qs = x[i].qs + 32*ib128 + 2*il;
+    for (int j = 0; j < 2; ++j) {
+        y[j+ 0] = dl1 * iq2nl_values[((qs[j] >> 0) & 0x03) + ((extra << 2) & 4)];
+        y[j+32] = dl2 * iq2nl_values[((qs[j] >> 2) & 0x03) + ((extra << 1) & 4)];
+        y[j+64] = dl3 * iq2nl_values[((qs[j] >> 4) & 0x03) + ((extra >> 0) & 4)];
+        y[j+96] = dl4 * iq2nl_values[((qs[j] >> 6) & 0x03) + ((extra >> 1) & 4)];
+    }
+}
+
 template<typename dst_t>
 static __global__ void dequantize_block_iq3_k(const void * __restrict__ vx, dst_t * __restrict__ yy) {
 
@@ -952,6 +980,14 @@ static void dequantize_row_iq4_ks_cuda(const void * vx, dst_t * y, const int64_t nrows, const int64_t n_per_row, cudaStream_t stream) {
     dequantize_block_iq4_ks<<<nb, 32, 0, stream>>>(vx, y, n_per_row, row_size);
 }
 
+template<typename dst_t>
+static void dequantize_row_iq2_ks_cuda(const void * vx, dst_t * y, const int64_t nrows, const int64_t n_per_row, cudaStream_t stream) {
+    const int64_t k = nrows * n_per_row;
+    const int64_t row_size = ggml_row_size(GGML_TYPE_IQ2_KS, n_per_row);
+    const int nb = (k + QK_K - 1) / QK_K;
+    dequantize_block_iq2_ks<<<nb, 32, 0, stream>>>(vx, y, n_per_row, row_size);
+}
+
 template<typename dst_t>
 static void dequantize_row_iq2_k_cuda(const void * vx, dst_t * y, const int64_t nrows, const int64_t n_per_row, cudaStream_t stream) {
     const int64_t k = nrows * n_per_row;
@@ -1116,6 +1152,8 @@ to_fp16_cuda_t ggml_get_to_fp16_cuda(ggml_type type) {
             return dequantize_row_iq4_xs_cuda;
         case GGML_TYPE_IQ4_KS:
             return dequantize_row_iq4_ks_cuda;
+        case GGML_TYPE_IQ2_KS:
+            return dequantize_row_iq2_ks_cuda;
         case GGML_TYPE_IQ2_K:
             return dequantize_row_iq2_k_cuda;
         case GGML_TYPE_IQ3_K:
@@ -1187,6 +1225,8 @@ to_fp32_cuda_t ggml_get_to_fp32_cuda(ggml_type type) {
             return dequantize_row_iq4_xs_cuda;
         case GGML_TYPE_IQ4_KS:
             return dequantize_row_iq4_ks_cuda;
+        case GGML_TYPE_IQ2_KS:
+            return dequantize_row_iq2_ks_cuda;
         case GGML_TYPE_IQ2_K:
             return dequantize_row_iq2_k_cuda;
         case GGML_TYPE_IQ3_K:
diff --git a/ggml/src/ggml-cuda/iqk_mmvq.cu b/ggml/src/ggml-cuda/iqk_mmvq.cu
index 5561f513..05d4775d 100644
--- a/ggml/src/ggml-cuda/iqk_mmvq.cu
+++ b/ggml/src/ggml-cuda/iqk_mmvq.cu
@@ -217,7 +217,6 @@ __device__ __forceinline__ float vec_dot_iq4_k_q8_1(
 #define VDR_IQ4_KS_Q8_1_MMVQ 4
 #define VDR_IQ4_KS_Q8_1_MMQ  4
 
-// TODO
 __device__ __forceinline__ float vec_dot_iq4_ks_q8_1(
     const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & kbx, const int & iqs) {
 
@@ -240,6 +239,18 @@ __device__ __forceinline__ float vec_dot_iq4_ks_q8_1(
     return dl * __low2float(bq8_1[ib32].ds) * sumi;
 }
 
+#define VDR_IQ2_KS_Q8_1_MMVQ 4
+#define VDR_IQ2_KS_Q8_1_MMQ  4
+
+__device__ __forceinline__ float vec_dot_iq2_ks_q8_1(
+    const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & kbx, const int & iqs) {
+
+    float scale = *(const float *)vbq;
+    const block_iq2_ks * bq2 = (const block_iq2_ks *)((const char *)vbq + sizeof(float)) + kbx;
+    // TODO
+    return 0.f;
+}
+
 #define VDR_IQ5_K_Q8_1_MMVQ 4
 #define VDR_IQ5_K_Q8_1_MMQ  4
 
@@ -645,6 +656,13 @@ void mul_mat_vec_iq4_ks_q8_1_cuda(
     iqk_mul_mat_vec_q_cuda<GGML_TYPE_IQ4_KS, VDR_IQ4_KS_Q8_1_MMVQ, vec_dot_iq4_ks_q8_1>(vx, vy, dst, ncols_x, nrows_x, nrows_y, ncols_y, nrows_dst, stream);
 }
 
+void mul_mat_vec_iq2_ks_q8_1_cuda(
+    const void * vx, const void * vy, float * dst,
+    const int ncols_x, const int nrows_x, const int nrows_y, const int ncols_y, const int nrows_dst, cudaStream_t stream) {
+
+    iqk_mul_mat_vec_q_cuda<GGML_TYPE_IQ2_KS, VDR_IQ2_KS_Q8_1_MMVQ, vec_dot_iq2_ks_q8_1>(vx, vy, dst, ncols_x, nrows_x, nrows_y, ncols_y, nrows_dst, stream);
+}
+
 void mul_mat_vec_iq5_k_q8_1_cuda(
     const void * vx, const void * vy, float * dst,
     const int ncols_x, const int nrows_x, const int nrows_y, const int ncols_y, const int nrows_dst, cudaStream_t stream) {
diff --git a/ggml/src/ggml-cuda/iqk_mmvq.cuh b/ggml/src/ggml-cuda/iqk_mmvq.cuh
index 8d76be1d..3a93a1b6 100644
--- a/ggml/src/ggml-cuda/iqk_mmvq.cuh
+++ b/ggml/src/ggml-cuda/iqk_mmvq.cuh
@@ -32,3 +32,7 @@ void mul_mat_vec_iq4_ks_q8_1_cuda(
     const void * vx, const void * vy, float * dst,
     const int ncols_x, const int nrows_x, const int nrows_y, const int ncols_y, const int nrows_dst, cudaStream_t stream);
 
+void mul_mat_vec_iq2_ks_q8_1_cuda(
+    const void * vx, const void * vy, float * dst,
+    const int ncols_x, const int nrows_x, const int nrows_y, const int ncols_y, const int nrows_dst, cudaStream_t stream);
+
diff --git a/ggml/src/ggml-cuda/mmvq.cu b/ggml/src/ggml-cuda/mmvq.cu
index 8e3c4aa4..e312b266 100644
--- a/ggml/src/ggml-cuda/mmvq.cu
+++ b/ggml/src/ggml-cuda/mmvq.cu
@@ -462,6 +462,9 @@ void ggml_cuda_op_mul_mat_vec_q(
         case GGML_TYPE_IQ4_KS:
             mul_mat_vec_iq4_ks_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_padded_row_size, src1_ncols, nrows_dst, stream);
             break;
+        case GGML_TYPE_IQ2_KS:
+            mul_mat_vec_iq2_ks_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_padded_row_size, src1_ncols, nrows_dst, stream);
+            break;
         case GGML_TYPE_IQ5_K:
             mul_mat_vec_iq5_k_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_padded_row_size, src1_ncols, nrows_dst, stream);
             break;
diff --git a/ggml/src/ggml-quants.c b/ggml/src/ggml-quants.c
index 40978ac0..a845eaf5 100644
--- a/ggml/src/ggml-quants.c
+++ b/ggml/src/ggml-quants.c
@@ -12873,7 +12873,6 @@ static void quantize_row_iq2_xxs_impl(const float * restrict x, void * restrict vy, int64_t n, const float * restrict quant_weights) {
     const int * kmap_q2xs = iq2_data[gindex].map;
     const uint16_t * kneighbors_q2xs = iq2_data[gindex].neighbours;
 
-    GGML_ASSERT(quant_weights   && "missing quantization weights");
     GGML_ASSERT(kgrid_q2xs      && "forgot to call ggml_quantize_init()?");
     GGML_ASSERT(kmap_q2xs       && "forgot to call ggml_quantize_init()?");
     GGML_ASSERT(kneighbors_q2xs && "forgot to call ggml_quantize_init()?");
@@ -12908,8 +12907,12 @@ static void quantize_row_iq2_xxs_impl(const float * restrict x, void * restrict vy, int64_t n, const float * restrict quant_weights) {
         for (int ib = 0; ib < QK_K/32; ++ib) {
             const float * xb = xbl + 32*ib;
-            const float * qw = quant_weights + QK_K*ibl + 32*ib;
-            for (int i = 0; i < 32; ++i) weight[i] = qw[i] * sqrtf(sigma2 + xb[i]*xb[i]);
+            if (quant_weights) {
+                const float * qw = quant_weights + QK_K*ibl + 32*ib;
+                for (int i = 0; i < 32; ++i) weight[i] = qw[i] * sqrtf(sigma2 + xb[i]*xb[i]);
+            } else {
+                for (int i = 0; i < 32; ++i) weight[i] = 0.25f*sigma2 + xb[i]*xb[i];
+            }
             for (int i = 0; i < 32; ++i) waux[i] = sqrtf(weight[i]);
             for (int k = 0; k < 4; ++k) {
                 int nflip = 0;
@@ -13046,7 +13049,6 @@ static void quantize_row_iq2_xs_impl(const float * restrict x, void * restrict vy, int64_t n, const float * restrict quant_weights) {
     const int * kmap_q2xs = iq2_data[gindex].map;
     const uint16_t * kneighbors_q2xs = iq2_data[gindex].neighbours;
 
-    GGML_ASSERT(quant_weights   && "missing quantization weights");
     GGML_ASSERT(kmap_q2xs       && "forgot to call ggml_quantize_init()?");
    GGML_ASSERT(kgrid_q2xs      && "forgot to call ggml_quantize_init()?");
     GGML_ASSERT(kneighbors_q2xs && "forgot to call ggml_quantize_init()?");
@@ -13084,8 +13086,12 @@ static void quantize_row_iq2_xs_impl(const float * restrict x, void * restrict vy, int64_t n, const float * restrict quant_weights) {
         for (int ib = 0; ib < QK_K/16; ++ib) {
             const float * xb = xbl + 16*ib;
-            const float * qw = quant_weights + QK_K*ibl + 16*ib;
-            for (int i = 0; i < 16; ++i) weight[i] = qw[i] * sqrtf(sigma2 + xb[i]*xb[i]);
+            if (quant_weights) {
+                const float * qw = quant_weights + QK_K*ibl + 16*ib;
+                for (int i = 0; i < 16; ++i) weight[i] = qw[i] * sqrtf(sigma2 + xb[i]*xb[i]);
+            } else {
+                for (int i = 0; i < 16; ++i) weight[i] = 0.25f*sigma2 + xb[i]*xb[i];
+            }
             for (int i = 0; i < 16; ++i) waux[i] = sqrtf(weight[i]);
             for (int k = 0; k < 2; ++k) {
                 int nflip = 0;
@@ -13230,6 +13236,17 @@ size_t quantize_iq2_xxs(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
     return nrow * nblock * sizeof(block_iq2_xxs);
 }
 
+void quantize_row_iq2_xxs(const float * restrict x, void * restrict vy, int64_t k) {
+    assert(k % QK_K == 0);
+    block_iq2_xxs * restrict y = vy;
+    quantize_row_iq2_xxs_ref(x, y, k);
+}
+
+void quantize_row_iq2_xxs_ref(const float * restrict x, block_iq2_xxs * restrict y, int64_t k) {
+    assert(k % QK_K == 0);
+    quantize_iq2_xxs(x, y, 1, k, NULL);
+}
+
 size_t quantize_iq2_xs(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
     GGML_ASSERT(n_per_row%QK_K == 0);
     int64_t nblock = n_per_row/QK_K;
@@ -13242,6 +13259,17 @@ size_t quantize_iq2_xs(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
     return nrow * nblock * sizeof(block_iq2_xs);
 }
 
+void quantize_row_iq2_xs(const float * restrict x, void * restrict vy, int64_t k) {
+    assert(k % QK_K == 0);
+    block_iq2_xs * restrict y = vy;
+    quantize_row_iq2_xs_ref(x, y, k);
+}
+
+void quantize_row_iq2_xs_ref(const float * restrict x, block_iq2_xs * restrict y, int64_t k) {
+    assert(k % QK_K == 0);
+    quantize_iq2_xs(x, y, 1, k, NULL);
+}
+
 //
 // ============================================= 3-bit using D4 lattice
 //
@@ -14947,10 +14975,11 @@ bool ggml_validate_row_data(enum ggml_type type, const void * data, size_t nbytes) {
         return false;
     }
 
-    if (type != GGML_TYPE_IQ2_TN && type != GGML_TYPE_IQ1_TN && type != GGML_TYPE_IQ4_KS && nbytes % ggml_type_size(type) != 0) {
-        fprintf(stderr, "%s: invalid size %zu for type %s (type size = %zu)\n", __func__, nbytes, ggml_type_name(type), ggml_type_size(type));
-        return false;
-    }
+    // Who needs this?
+    //if (type != GGML_TYPE_IQ2_TN && type != GGML_TYPE_IQ1_TN && type != GGML_TYPE_IQ4_KS && nbytes % ggml_type_size(type) != 0) {
+    //    fprintf(stderr, "%s: invalid size %zu for type %s (type size = %zu)\n", __func__, nbytes, ggml_type_name(type), ggml_type_size(type));
+    //    return false;
+    //}
 
     const size_t nb = nbytes/ggml_type_size(type);
 
@@ -15160,6 +15189,7 @@ bool ggml_validate_row_data(enum ggml_type type, const void * data, size_t nbytes) {
             } break;
         case GGML_TYPE_Q6_0:  break;
         case GGML_TYPE_IQ2_K: break;
+        case GGML_TYPE_IQ2_KS: break;
         case GGML_TYPE_IQ3_K: break;
         case GGML_TYPE_IQ4_K: break;
         case GGML_TYPE_IQ5_K: break;
diff --git a/ggml/src/ggml-quants.h b/ggml/src/ggml-quants.h
index bad7e9d9..a40a6d37 100644
--- a/ggml/src/ggml-quants.h
+++ b/ggml/src/ggml-quants.h
@@ -35,6 +35,8 @@ void quantize_row_q6_K_ref(const float * GGML_RESTRICT x, block_q6_K * GGML_RESTRICT y, int64_t k);
 void quantize_row_q8_K_ref(const float * GGML_RESTRICT x, block_q8_K * GGML_RESTRICT y, int64_t k);
 void quantize_row_q8_K64_ref(const float * GGML_RESTRICT x, block_q8_K64 * GGML_RESTRICT y, int64_t k);
 
+void quantize_row_iq2_xxs_ref(const float * GGML_RESTRICT x, block_iq2_xxs * GGML_RESTRICT y, int64_t k);
+void quantize_row_iq2_xs_ref (const float * GGML_RESTRICT x, block_iq2_xs * GGML_RESTRICT y, int64_t k);
 void quantize_row_iq3_xxs_ref(const float * GGML_RESTRICT x, block_iq3_xxs * GGML_RESTRICT y, int64_t k);
 void quantize_row_iq4_nl_ref (const float * GGML_RESTRICT x, block_iq4_nl * GGML_RESTRICT y, int64_t k);
 void quantize_row_iq4_xs_ref (const float * GGML_RESTRICT x, block_iq4_xs * GGML_RESTRICT y, int64_t k);
@@ -59,6 +61,8 @@ void quantize_row_q6_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
 void quantize_row_q8_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
 void quantize_row_q8_K64(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
 
+void quantize_row_iq2_xxs(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
+void quantize_row_iq2_xs (const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
 void quantize_row_iq3_xxs(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
 void quantize_row_iq4_nl (const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
 void quantize_row_iq4_xs (const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c
index 97fa81b1..d945a34e 100644
--- a/ggml/src/ggml.c
+++ b/ggml/src/ggml.c
@@ -920,8 +920,8 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
         .type_size                = sizeof(block_iq2_xxs),
         .is_quantized             = true,
         .to_float                 = (ggml_to_float_t) dequantize_row_iq2_xxs,
-        .from_float               = NULL,
-        .from_float_ref           = NULL,
+        .from_float               = quantize_row_iq2_xxs,
+        .from_float_ref           = (ggml_from_float_t)quantize_row_iq2_xxs_ref,
         .vec_dot                  = ggml_vec_dot_iq2_xxs_q8_K,
         .vec_dot_type             = GGML_TYPE_Q8_K,
         .nrows                    = 1,
@@ -933,8 +933,8 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
         .type_size                = sizeof(block_iq2_xs),
         .is_quantized             = true,
         .to_float                 = (ggml_to_float_t) dequantize_row_iq2_xs,
-        .from_float               = NULL,
-        .from_float_ref           = NULL,
+        .from_float               = quantize_row_iq2_xs,
+        .from_float_ref           = (ggml_from_float_t)quantize_row_iq2_xs_ref,
         .vec_dot                  = ggml_vec_dot_iq2_xs_q8_K,
         .vec_dot_type             = GGML_TYPE_Q8_K,
         .nrows                    = 1,
@@ -1193,6 +1193,19 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
         .nrows                    = 1,
         .row_meta_size            = 0,
     },
+    [GGML_TYPE_IQ2_KS] = {
+        .type_name                = "iq2_ks",
+        .blck_size                = QK_K,
+        .type_size                = sizeof(block_iq2_ks),
+        .is_quantized             = true,
+        .to_float                 = (ggml_to_float_t) dequantize_row_iq2_ks,
+        .from_float               = quantize_row_iq2_ks,
+        .from_float_ref           = (ggml_from_float_t)quantize_row_iq2_ks_ref,
+        .vec_dot                  = vec_dot_iq2_ks_q8_k,
+        .vec_dot_type             = GGML_TYPE_Q8_K,
+        .nrows                    = 1,
+        .row_meta_size            = 4,
+    },
     [GGML_TYPE_IQ3_K] = {
         .type_name                = "iq3_k",
         .blck_size                = QK_K,
@@ -3906,6 +3919,7 @@ enum ggml_type ggml_ftype_to_ggml_type(enum ggml_ftype ftype) {
         case GGML_FTYPE_MOSTLY_IQ4_XS: wtype = GGML_TYPE_IQ4_XS; break;
         case GGML_FTYPE_MOSTLY_IQ4_KS: wtype = GGML_TYPE_IQ4_KS; break;
         case GGML_FTYPE_MOSTLY_IQ2_K:  wtype = GGML_TYPE_IQ2_K;  break;
+        case GGML_FTYPE_MOSTLY_IQ2_KS: wtype = GGML_TYPE_IQ2_KS; break;
         case GGML_FTYPE_MOSTLY_IQ3_K:  wtype = GGML_TYPE_IQ3_K;  break;
         case GGML_FTYPE_MOSTLY_IQ4_K:  wtype = GGML_TYPE_IQ4_K;  break;
         case GGML_FTYPE_MOSTLY_IQ5_K:  wtype = GGML_TYPE_IQ5_K;  break;
@@ -10406,6 +10420,7 @@ static void ggml_compute_forward_add(
         case GGML_TYPE_IQ4_XS:
         case GGML_TYPE_IQ4_KS:
         case GGML_TYPE_IQ2_K:
+        case GGML_TYPE_IQ2_KS:
         case GGML_TYPE_IQ3_K:
         case GGML_TYPE_IQ4_K:
         case GGML_TYPE_IQ5_K:
@@ -10795,6 +10810,7 @@ static void ggml_compute_forward_add1(
         case GGML_TYPE_IQ4_XS:
         case GGML_TYPE_IQ4_KS:
         case GGML_TYPE_IQ2_K:
+        case GGML_TYPE_IQ2_KS:
         case GGML_TYPE_IQ3_K:
         case GGML_TYPE_IQ4_K:
         case GGML_TYPE_IQ5_K:
@@ -10934,6 +10950,7 @@ static void ggml_compute_forward_acc(
         case GGML_TYPE_IQ4_XS:
         case GGML_TYPE_IQ4_KS:
         case GGML_TYPE_IQ2_K:
+        case GGML_TYPE_IQ2_KS:
         case GGML_TYPE_IQ3_K:
         case GGML_TYPE_IQ4_K:
         case GGML_TYPE_IQ5_K:
@@ -14119,6 +14136,7 @@ static void ggml_compute_forward_out_prod(
         case GGML_TYPE_IQ4_XS:
         case GGML_TYPE_IQ4_KS:
         case GGML_TYPE_IQ2_K:
+        case GGML_TYPE_IQ2_KS:
         case GGML_TYPE_IQ3_K:
         case GGML_TYPE_IQ4_K:
         case GGML_TYPE_IQ5_K:
@@ -14498,6 +14516,7 @@ static void ggml_compute_forward_set(
         case GGML_TYPE_IQ4_XS:
         case GGML_TYPE_IQ4_KS:
         case GGML_TYPE_IQ2_K:
+        case GGML_TYPE_IQ2_KS:
         case GGML_TYPE_IQ3_K:
         case GGML_TYPE_IQ4_K:
         case GGML_TYPE_IQ5_K:
@@ -14771,6 +14790,7 @@ static void ggml_compute_forward_get_rows(
         case GGML_TYPE_IQ4_XS:
         case GGML_TYPE_IQ4_KS:
         case GGML_TYPE_IQ2_K:
+        case GGML_TYPE_IQ2_KS:
         case GGML_TYPE_IQ3_K:
         case GGML_TYPE_IQ4_K:
         case GGML_TYPE_IQ5_K:
@@ -15371,6 +15391,7 @@ static void ggml_compute_forward_clamp(
         case GGML_TYPE_IQ4_XS:
         case GGML_TYPE_IQ4_KS:
         case GGML_TYPE_IQ2_K:
+        case GGML_TYPE_IQ2_KS:
         case GGML_TYPE_IQ3_K:
         case GGML_TYPE_IQ4_K:
         case GGML_TYPE_IQ5_K:
@@ -22188,6 +22209,7 @@ size_t ggml_quantize_chunk(
         case GGML_TYPE_IQ4_XS:  result = quantize_iq4_xs (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
         case GGML_TYPE_IQ4_KS:  result = quantize_iq4_ks (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
         case GGML_TYPE_IQ2_K:   result = quantize_iq2_k  (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
+        case GGML_TYPE_IQ2_KS:  result = quantize_iq2_ks (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
         case GGML_TYPE_IQ3_K:   result = quantize_iq3_k  (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
         case GGML_TYPE_IQ4_K:   result = quantize_iq4_k  (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
         case GGML_TYPE_IQ5_K:   result = quantize_iq5_k  (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
diff --git a/ggml/src/iqk/iqk_quantize.cpp b/ggml/src/iqk/iqk_quantize.cpp
index 6cc6cef0..c377f6ce 100644
--- a/ggml/src/iqk/iqk_quantize.cpp
+++ b/ggml/src/iqk/iqk_quantize.cpp
@@ -755,6 +755,227 @@ void vec_dot_iq2_k_q8_k(int n, float * GGML_RESTRICT s, size_t bs, const void *
 }
 
+namespace {
+void quantize_row_iq2_ks_impl(const float * x, void * vy, int n_per_row, const float * quant_weights, float * all_scales, float * all_sw, int8_t * all_Ls) {
+
+    constexpr int kBlockSize = 32;
+
+    float * dptr = (float *)vy;
+    *dptr = 0;
+
+    block_iq2_ks * y = (block_iq2_ks *)(dptr + 1);
+
+    float weight[kBlockSize];
+    float sumx[kBlockSize+1], sumw[kBlockSize+1];
+
+    std::array<std::pair<float,int>, kBlockSize> pairs;
+
+    const int8_t * shifted_values = iq2nl_values + 4;
+
+    const int nblock = n_per_row/QK_K;
+
+    for (int ibl = 0; ibl < nblock; ++ibl) {
+
+        memset(&y[ibl], 0, sizeof(block_iq2_ks));
+
+        auto scales = all_scales + ibl*(QK_K/kBlockSize);
+        auto sw = all_sw + ibl*(QK_K/kBlockSize);
+
+        const float * xbl = x + ibl*QK_K;
+        float sumx2 = 0;
+        for (int j = 0; j < QK_K; ++j) sumx2 += xbl[j]*xbl[j];
+        const float sigma2 = 1.5f*sumx2/QK_K;
+
+        uint16_t extra = 0;
+
+        for (int ib = 0; ib < QK_K/kBlockSize; ++ib) {
+            const float * xb = xbl + kBlockSize*ib;
+            if (quant_weights) {
+                const float * qw = quant_weights + ibl*QK_K + ib*kBlockSize;
+                for (int j = 0; j < kBlockSize; ++j) weight[j] = qw[j] * sqrtf(sigma2 + xb[j]*xb[j]);
+            } else {
+                for (int j = 0; j < kBlockSize; ++j) weight[j] = 0.25f*sigma2 + xb[j]*xb[j];
+            }
+            sw[ib] = 0;
+            for (int j = 0; j < kBlockSize; ++j) {
+                sw[ib] += weight[j];
+                pairs[j] = {xb[j], j};
+            }
+            std::sort(pairs.begin(), pairs.end());
+            sumx[0] = sumw[0] = 0;
+            for (int j = 0; j < kBlockSize; ++j) {
+                int jj = pairs[j].second;
+                sumw[j+1] = sumw[j] + weight[jj];
+                sumx[j+1] = sumx[j] + weight[jj]*xb[jj];
+            }
+            float best = 0, d = 0;
+            bool is_shifted = false;
+            float sumqx, sumq2;
+            for (int i1 = 0; i1 < kBlockSize; ++i1) {
+                for (int i2 = i1; i2 < kBlockSize; ++i2) {
+                    for (int i3 = i2; i3 < kBlockSize; ++i3) {
+                        sumqx = (sumx[i1] - sumx[ 0])*iq2nl_values[0] + (sumx[i2] - sumx[i1])*iq2nl_values[1] +
+                                (sumx[i3] - sumx[i2])*iq2nl_values[2] + (sumx[kBlockSize] - sumx[i3])*iq2nl_values[3];
+                        sumq2 = (sumw[i1] - sumw[ 0])*iq2nl_values[0]*iq2nl_values[0] + (sumw[i2] - sumw[i1])*iq2nl_values[1]*iq2nl_values[1] +
+                                (sumw[i3] - sumw[i2])*iq2nl_values[2]*iq2nl_values[2] + (sumw[kBlockSize] - sumw[i3])*iq2nl_values[3]*iq2nl_values[3];
+                        if (sumq2 > 0 && sumqx*sumqx > best*sumq2) {
+                            d = sumqx/sumq2; best = d*sumqx; is_shifted = false;
+                        }
+                        sumqx = (sumx[i1] - sumx[ 0])*shifted_values[0] + (sumx[i2] - sumx[i1])*shifted_values[1] +
+                                (sumx[i3] - sumx[i2])*shifted_values[2] + (sumx[kBlockSize] - sumx[i3])*shifted_values[3];
+                        sumq2 = (sumw[i1] - sumw[ 0])*shifted_values[0]*shifted_values[0] + (sumw[i2] - sumw[i1])*shifted_values[1]*shifted_values[1] +
+                                (sumw[i3] - sumw[i2])*shifted_values[2]*shifted_values[2] + (sumw[kBlockSize] - sumw[i3])*shifted_values[3]*shifted_values[3];
+                        if (sumq2 > 0 && sumqx*sumqx > best*sumq2) {
+                            d = sumqx/sumq2; best = d*sumqx; is_shifted = true;
+                        }
+                        sumqx = (sumx[i1] - sumx[ 0])*iq2nl_values[3] + (sumx[i2] - sumx[i1])*iq2nl_values[2] +
+                                (sumx[i3] - sumx[i2])*iq2nl_values[1] + (sumx[kBlockSize] - sumx[i3])*iq2nl_values[0];
+                        sumq2 = (sumw[i1] - sumw[ 0])*iq2nl_values[3]*iq2nl_values[3] + (sumw[i2] - sumw[i1])*iq2nl_values[2]*iq2nl_values[2] +
+                                (sumw[i3] - sumw[i2])*iq2nl_values[1]*iq2nl_values[1] + (sumw[kBlockSize] - sumw[i3])*iq2nl_values[0]*iq2nl_values[0];
+                        if (sumq2 > 0 && sumqx*sumqx > best*sumq2) {
+                            d = sumqx/sumq2; best = d*sumqx; is_shifted = false;
+                        }
+                        sumqx = (sumx[i1] - sumx[ 0])*shifted_values[3] + (sumx[i2] - sumx[i1])*shifted_values[2] +
+                                (sumx[i3] - sumx[i2])*shifted_values[1] + (sumx[kBlockSize] - sumx[i3])*shifted_values[0];
+                        sumq2 = (sumw[i1] - sumw[ 0])*shifted_values[3]*shifted_values[3] + (sumw[i2] - sumw[i1])*shifted_values[2]*shifted_values[2] +
+                                (sumw[i3] - sumw[i2])*shifted_values[1]*shifted_values[1] + (sumw[kBlockSize] - sumw[i3])*shifted_values[0]*shifted_values[0];
+                        if (sumq2 > 0 && sumqx*sumqx > best*sumq2) {
+                            d = sumqx/sumq2; best = d*sumqx; is_shifted = true;
+                        }
+                    }
+                }
+            }
+            scales[ib] = d;
+            if (is_shifted) extra |= (1 << ib);
+
+        }
+        y[ibl].extra = extra;
+
+    }
+
+    float d = make_qx_quants(nblock*(QK_K/kBlockSize), 16, all_scales, all_Ls, all_sw);
+
+    if (!d) return;
+
+    float sumqx = 0, sumq2 = 0;
+    for (int ibl = 0; ibl < nblock; ++ibl) {
+        auto xbl = x + ibl*QK_K;
+        float sumx2 = 0;
+        for (int j = 0; j < QK_K; ++j) sumx2 += xbl[j]*xbl[j];
+        const float sigma2 = 1.5f*sumx2/QK_K;
+        auto Ls = all_Ls + ibl*(QK_K/kBlockSize);
+        for (int ib = 0; ib < QK_K/kBlockSize; ++ib) {
+            int ls = Ls[ib];
+            y[ibl].scales[ib/2] |= ((ls & 0xf) << 4*(ib%2));
+            y[ibl].extra |= ((ls >> 4) << (8 + ib));
+            ls -= 16;
+            float dl = d * ls;
+            if (dl) {
+                const int8_t * block_values = y[ibl].extra & (1 << ib) ? shifted_values : iq2nl_values;
+                const float * xb = xbl + kBlockSize*ib;
+                if (quant_weights) {
+                    const float * qw = quant_weights + ibl*QK_K + ib*kBlockSize;
+                    for (int j = 0; j < kBlockSize; ++j) weight[j] = qw[j] * sqrtf(sigma2 + xb[j]*xb[j]);
+                } else {
+                    for (int j = 0; j < kBlockSize; ++j) weight[j] = 0.25f*sigma2 + xb[j]*xb[j];
+                }
+                float idl = 1/dl;
+                uint8_t * qs = y[ibl].qs + 32*(ib/4);
+                for (int j = 0; j < 32; ++j) {
+                    const float al = idl*xb[j];
+                    int ibest = best_index_iq2nl(block_values, al);
+                    qs[j] |= (ibest << 2*(ib%4));
+                    float w = weight[j];
+                    float q = block_values[ibest]*ls;
+                    sumqx += w*q*xb[j];
+                    sumq2 += w*q*q;
+                }
+            }
+        }
+    }
+    *dptr = 1.030f*(sumq2 > 0 ? sumqx/sumq2 : d);
+}
+}
+
+void quantize_row_iq2_ks_ref(const float * GGML_RESTRICT x, block_iq2_ks * GGML_RESTRICT y, int64_t k) {
+    assert(k % QK_K == 0);
+    quantize_iq2_ks(x, (void *)y, 1, k, nullptr);
+}
+
+void quantize_row_iq2_ks(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k) {
+    assert(k % QK_K == 0);
+    block_iq2_ks * y = (block_iq2_ks *)vy;
+    quantize_row_iq2_ks_ref(x, y, k);
+}
+
+size_t quantize_iq2_ks(const float * src, void * dst, int64_t nrows, int64_t n_per_row, const float * imatrix) {
+    constexpr int kBlockSize = 32;
+    GGML_ASSERT(n_per_row%QK_K == 0);
+    auto row_size = ggml_row_size(GGML_TYPE_IQ2_KS, n_per_row);
+    int nblock = n_per_row/QK_K;
+    std::vector<float> all_scales(nblock*(QK_K/kBlockSize)), all_sw(nblock*(QK_K/kBlockSize));
+    std::vector<int8_t> all_Ls(nblock*(QK_K/kBlockSize));
+    char * qrow = (char *)dst;
+    for (int64_t row = 0; row < nrows; ++row) {
+        quantize_row_iq2_ks_impl(src, (void *)qrow, n_per_row, imatrix, all_scales.data(), all_sw.data(), all_Ls.data());
+        src += n_per_row;
+        qrow += row_size;
+    }
+    return nrows * row_size;
+}
+
+void dequantize_row_iq2_ks(const block_iq2_ks * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k) {
+    assert(k % QK_K == 0);
+    const int nb = k / QK_K;
+
+    const float * dptr = (const float *)x;
+    const float d = *dptr;
+    x = (const block_iq2_ks *)(dptr + 1);
+
+    for (int i = 0; i < nb; i++) {
+
+        const uint8_t * qs = x[i].qs;
+
+        uint16_t extra = x[i].extra;
+
+        int shift = 0;
+        for (int ib64 = 0; ib64 < QK_K/64; ++ib64) {
+            float dl1 = d * (((x[i].scales[ib64] & 0xf) | ((extra >> 4) & 0x10)) - 16);
+            float dl2 = d * (((x[i].scales[ib64] >>  4) | ((extra >> 5) & 0x10)) - 16);
+            const int8_t * values1 = extra & 1 ? iq2nl_values + 4 : iq2nl_values;
+            const int8_t * values2 = extra & 2 ? iq2nl_values + 4 : iq2nl_values;
+            extra >>= 2;
+            for (int j = 0; j < 32; ++j) {
+                y[j+ 0] = dl1 * values1[(qs[j] >> (shift+0)) & 3];
+                y[j+32] = dl2 * values2[(qs[j] >> (shift+2)) & 3];
+            }
+            y += 64;
+            shift += 4;
+            if (shift == 8) { qs += 32; shift = 0; }
+        }
+
+    }
+
+}
+
+void vec_dot_iq2_ks_q8_k(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
+    assert(n % QK_K == 0);
+    assert(nrc == 1);
+    GGML_UNUSED(nrc);
+    GGML_UNUSED(bx);
+    GGML_UNUSED(by);
+    GGML_UNUSED(bs);
+
+#if GGML_USE_IQK_MULMAT
+    if (iqk_mul_mat(1, 1, n, GGML_TYPE_IQ2_KS, vx, 0, GGML_TYPE_Q8_K, vy, 0, s, 0, 0, 1)) {
+        return;
+    }
+#endif
+
+    GGML_ABORT("not implemented");
+
+}
+
 //
 // ============================================== iq3_k
 //
diff --git a/ggml/src/iqk/iqk_quantize.h b/ggml/src/iqk/iqk_quantize.h
index a3623963..eb562779 100644
--- a/ggml/src/iqk/iqk_quantize.h
+++ b/ggml/src/iqk/iqk_quantize.h
@@ -61,6 +61,12 @@ size_t quantize_iq4_ks(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
 void dequantize_row_iq4_ks(const block_iq4_ks * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
 void vec_dot_iq4_ks_q8_k(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
 
+void quantize_row_iq2_ks_ref(const float * GGML_RESTRICT x, block_iq2_ks * GGML_RESTRICT y, int64_t k);
+void quantize_row_iq2_ks(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
+size_t quantize_iq2_ks(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
+void dequantize_row_iq2_ks(const block_iq2_ks * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
+void vec_dot_iq2_ks_q8_k(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
+
 void iqk_quantize_row_q8_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k);
 
 #ifdef __cplusplus
diff --git a/include/llama.h b/include/llama.h
index 9fb4af53..c9387e6b 100644
--- a/include/llama.h
+++ b/include/llama.h
@@ -179,6 +179,7 @@ extern "C" {
         LLAMA_FTYPE_MOSTLY_IQ1_TN = 144, // except 1d tensors
         LLAMA_FTYPE_MOSTLY_IQ4_KS = 145, // except 1d tensors
         LLAMA_FTYPE_MOSTLY_IQ3_KL = 146, // except 1d tensors
+        LLAMA_FTYPE_MOSTLY_IQ2_KS = 147, // except 1d tensors
 
         LLAMA_FTYPE_GUESSED = 1024, // not specified in the model file
     };
diff --git a/src/llama.cpp b/src/llama.cpp
index c338452b..b356f7bc 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -3783,6 +3783,7 @@ struct llama_model_loader {
             case GGML_TYPE_Q6_K:    ftype = LLAMA_FTYPE_MOSTLY_Q6_K;    break;
             case GGML_TYPE_IQ2_XXS: ftype = LLAMA_FTYPE_MOSTLY_IQ2_XXS; break;
             case GGML_TYPE_IQ2_XS:  ftype = LLAMA_FTYPE_MOSTLY_IQ2_XS;  break;
+            case GGML_TYPE_IQ2_KS:  ftype = LLAMA_FTYPE_MOSTLY_IQ2_KS;  break;
             case GGML_TYPE_IQ2_S:   ftype = LLAMA_FTYPE_MOSTLY_IQ2_S;   break;
             case GGML_TYPE_IQ3_XXS: ftype = LLAMA_FTYPE_MOSTLY_IQ3_XXS; break;
             case GGML_TYPE_IQ1_S:   ftype = LLAMA_FTYPE_MOSTLY_IQ1_S;   break;
@@ -4487,6 +4488,7 @@ static std::string llama_model_ftype_name(llama_ftype ftype) {
         case LLAMA_FTYPE_MOSTLY_Q6_K:    return "Q6_K";
         case LLAMA_FTYPE_MOSTLY_IQ2_XXS: return "IQ2_XXS - 2.0625 bpw";
         case LLAMA_FTYPE_MOSTLY_IQ2_XS:  return "IQ2_XS - 2.3125 bpw";
+        case LLAMA_FTYPE_MOSTLY_IQ2_KS:  return "IQ2_KS - 2.1875 bpw";
         case LLAMA_FTYPE_MOSTLY_IQ2_S:   return "IQ2_S - 2.5 bpw";
         case LLAMA_FTYPE_MOSTLY_IQ2_M:   return "IQ2_M - 2.7 bpw";
         case LLAMA_FTYPE_MOSTLY_IQ3_XS:  return "IQ3_XS - 3.3 bpw";
@@ -15645,7 +15647,8 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type new_type, const ggml_tensor * tensor, llama_ftype ftype) {
     } else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS ||
                ftype == LLAMA_FTYPE_MOSTLY_IQ1_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M ||
-               ftype == LLAMA_FTYPE_MOSTLY_IQ1_M || ftype == LLAMA_FTYPE_MOSTLY_IQ2_K || ftype == LLAMA_FTYPE_MOSTLY_IQ3_K) {
+               ftype == LLAMA_FTYPE_MOSTLY_IQ1_M || ftype == LLAMA_FTYPE_MOSTLY_IQ2_K || ftype == LLAMA_FTYPE_MOSTLY_IQ3_K ||
+               ftype == LLAMA_FTYPE_MOSTLY_IQ2_KS) {
         new_type = !qs.has_output ? GGML_TYPE_IQ4_K : GGML_TYPE_Q5_K;
     } else if ((ftype == LLAMA_FTYPE_MOSTLY_IQ3_S || ftype == LLAMA_FTYPE_MOSTLY_IQ3_M || ftype == LLAMA_FTYPE_MOSTLY_IQ4_XS ||
                 ftype == LLAMA_FTYPE_MOSTLY_IQ4_KS) && !qs.has_output) {
@@ -15681,7 +15684,8 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type new_type, const ggml_tensor * tensor, llama_ftype ftype) {
             }
         }
     } else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ1_S ||
-               ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M || ftype == LLAMA_FTYPE_MOSTLY_IQ1_M) {
+               ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M || ftype == LLAMA_FTYPE_MOSTLY_IQ1_M ||
+               ftype == LLAMA_FTYPE_MOSTLY_IQ2_KS) {
         if (name.find("attn_v.weight") != std::string::npos) {
             if (qs.model.hparams.n_gqa() >= 4 || qs.model.hparams.n_expert >= 4) new_type = GGML_TYPE_IQ4_K;
             else if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) new_type = GGML_TYPE_IQ3_K;
@@ -15905,7 +15909,8 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type new_type, const ggml_tensor * tensor, llama_ftype ftype) {
         new_type == GGML_TYPE_IQ3_XXS || new_type == GGML_TYPE_IQ1_S || new_type == GGML_TYPE_IQ3_S ||
         new_type == GGML_TYPE_IQ1_M || new_type == GGML_TYPE_IQ4_K || new_type == GGML_TYPE_IQ2_K ||
         new_type == GGML_TYPE_IQ5_K || new_type == GGML_TYPE_IQ3_K || new_type == GGML_TYPE_IQ2_TN ||
-        new_type == GGML_TYPE_IQ6_K || new_type == GGML_TYPE_IQ1_TN || new_type == GGML_TYPE_IQ4_KS) {
+        new_type == GGML_TYPE_IQ6_K || new_type == GGML_TYPE_IQ1_TN || new_type == GGML_TYPE_IQ4_KS ||
+        new_type == GGML_TYPE_IQ2_KS) {
         int nx = tensor->ne[0];
         int ny = tensor->ne[1];
         if (nx % QK_K != 0) {
@@ -15925,6 +15930,7 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type new_type, const ggml_tensor * tensor, llama_ftype ftype) {
         switch (new_type) {
             case GGML_TYPE_IQ2_XXS:
             case GGML_TYPE_IQ2_XS:
+            case GGML_TYPE_IQ2_KS:
             case GGML_TYPE_IQ2_S:
             case GGML_TYPE_IQ3_XXS:
             case GGML_TYPE_IQ3_S:
@@ -16036,6 +16042,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const std::string & fname_out, const llama_model_quantize_params * params) {
         case LLAMA_FTYPE_MOSTLY_Q6_K:    default_type = GGML_TYPE_Q6_K;    break;
         case LLAMA_FTYPE_MOSTLY_IQ2_XXS: default_type = GGML_TYPE_IQ2_XXS; break;
         case LLAMA_FTYPE_MOSTLY_IQ2_XS:  default_type = GGML_TYPE_IQ2_XS;  break;
+        case LLAMA_FTYPE_MOSTLY_IQ2_KS:  default_type = GGML_TYPE_IQ2_KS;  break;
        case LLAMA_FTYPE_MOSTLY_IQ2_S:   default_type = GGML_TYPE_IQ2_XS;  break;
        case LLAMA_FTYPE_MOSTLY_IQ2_M:   default_type = GGML_TYPE_IQ2_S;   break;
        case LLAMA_FTYPE_MOSTLY_IQ3_XXS: default_type = GGML_TYPE_IQ3_XXS; break;
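
The 2.1875 bpw advertised for IQ2_KS above follows directly from the new block layout in ggml-common.h plus the 4-byte per-row float super-scale implied by row_meta_size = 4. Below is a small standalone sketch of the storage format and CPU decode path, mirroring dequantize_row_iq2_ks from the patch; it is an illustration, not part of the patch. QK_K = 256 is the default build value, and the iq2nl_values table is referenced but not shown in this diff, so the eight values used here are assumed placeholders (entries 4..7 play the role of the "shifted" set selected via the low bits of extra).

// iq2_ks_sketch.cpp -- illustration only, not part of the patch.
#include <cstdint>
#include <cstdio>

constexpr int QK_K = 256; // default ggml super-block size

struct block_iq2_ks {
    uint16_t extra;           // bits 0..7: per-32 value-shift flags; bits 8..15: 5th scale bits
    uint8_t  scales[QK_K/64]; // two 4-bit block scales per byte
    uint8_t  qs[QK_K/4];      // four 2-bit quant indices per byte
};
static_assert(sizeof(block_iq2_ks) == 2 + QK_K/64 + QK_K/4, "wrong iq2_ks block size");

// Assumed placeholder for the non-linear value table referenced by the patch.
static const int8_t iq2nl_values[8] = {-31, -13, 1, 17, -26, -8, 6, 22};

// Decode one row: a leading float super-scale, then n/QK_K blocks.
static void dequantize_iq2_ks_row(const void * vx, float * y, int n) {
    const float d = *(const float *)vx;
    const block_iq2_ks * x = (const block_iq2_ks *)((const char *)vx + sizeof(float));
    for (int i = 0; i < n/QK_K; ++i) {
        const uint8_t * qs = x[i].qs;
        uint16_t extra = x[i].extra;
        int shift = 0;
        for (int ib64 = 0; ib64 < QK_K/64; ++ib64) { // two 32-weight sub-blocks per pass
            // 4-bit scale nibble plus one high bit taken from extra, biased by 16
            float dl1 = d * (((x[i].scales[ib64] & 0xf) | ((extra >> 4) & 0x10)) - 16);
            float dl2 = d * (((x[i].scales[ib64] >>  4) | ((extra >> 5) & 0x10)) - 16);
            // bits 0/1 of extra pick the plain or shifted half of the value table
            const int8_t * v1 = extra & 1 ? iq2nl_values + 4 : iq2nl_values;
            const int8_t * v2 = extra & 2 ? iq2nl_values + 4 : iq2nl_values;
            extra >>= 2;
            for (int j = 0; j < 32; ++j) {
                y[j+ 0] = dl1 * v1[(qs[j] >> (shift+0)) & 3];
                y[j+32] = dl2 * v2[(qs[j] >> (shift+2)) & 3];
            }
            y += 64; shift += 4;
            if (shift == 8) { qs += 32; shift = 0; }
        }
    }
}

int main() {
    // 70 bytes per 256 weights -> 560/256 = 2.1875 bpw; the row scale is amortized.
    printf("%.4f bpw per block\n", 8.0*sizeof(block_iq2_ks)/QK_K);
    printf("%.6f bpw for a 4096-wide row\n",
           8.0*(sizeof(float) + (4096/QK_K)*sizeof(block_iq2_ks))/4096);
    // Decode a single all-zero block: scale nibbles give d*(0-16), but d = 0 here.
    alignas(4) unsigned char row[sizeof(float) + sizeof(block_iq2_ks)] = {0};
    float out[QK_K];
    dequantize_iq2_ks_row(row, out, QK_K);
    printf("first decoded weight: %g\n", out[0]);
    return 0;
}

With the per-row scale included, a 4096-column tensor comes out at about 2.195 bpw, which is what ggml_row_size(GGML_TYPE_IQ2_KS, n) accounts for through the 4-byte row_meta_size in the type traits.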