diff --git a/examples/quantize/quantize.cpp b/examples/quantize/quantize.cpp index 7716f34e..dee0249f 100644 --- a/examples/quantize/quantize.cpp +++ b/examples/quantize/quantize.cpp @@ -32,6 +32,7 @@ static const std::vector QUANT_OPTIONS = { { "Q2_K", LLAMA_FTYPE_MOSTLY_Q2_K, " 2.63G, +0.6717 ppl @ LLaMA-v1-7B", }, { "Q2_K_S", LLAMA_FTYPE_MOSTLY_Q2_K_S, " 2.16G, +9.0634 ppl @ LLaMA-v1-7B", }, { "IQ3_XXS", LLAMA_FTYPE_MOSTLY_IQ3_XXS, " 3.06 bpw quantization", }, + { "IQ3_KT", LLAMA_FTYPE_MOSTLY_IQ3_KT, " 3.125 bpw quantization", }, { "IQ3_S", LLAMA_FTYPE_MOSTLY_IQ3_S, " 3.44 bpw quantization", }, { "IQ3_M", LLAMA_FTYPE_MOSTLY_IQ3_M, " 3.66 bpw quantization mix", }, { "Q3_K", LLAMA_FTYPE_MOSTLY_Q3_K_M, "alias for Q3_K_M" }, diff --git a/ggml/include/ggml.h b/ggml/include/ggml.h index 276fa8b4..4430ac28 100644 --- a/ggml/include/ggml.h +++ b/ggml/include/ggml.h @@ -407,6 +407,7 @@ extern "C" { GGML_TYPE_IQ2_KS = 145, GGML_TYPE_IQ4_KSS = 146, GGML_TYPE_IQ2_KT = 147, + GGML_TYPE_IQ3_KT = 148, GGML_TYPE_COUNT, }; @@ -466,6 +467,7 @@ extern "C" { GGML_FTYPE_MOSTLY_IQ2_KS = 138, // except 1d tensors GGML_FTYPE_MOSTLY_IQ4_KSS = 139, // except 1d tensors GGML_FTYPE_MOSTLY_IQ2_KT = 140, // except 1d tensors + GGML_FTYPE_MOSTLY_IQ3_KT = 141, // except 1d tensors }; // available tensor operations: diff --git a/ggml/src/ggml-common.h b/ggml/src/ggml-common.h index e050daca..02231960 100644 --- a/ggml/src/ggml-common.h +++ b/ggml/src/ggml-common.h @@ -460,6 +460,13 @@ typedef struct { } block_iq2_kt; static_assert(sizeof(block_iq2_kt) == QK_K/4 + QK_K/64, "wrong iq2_kt block size/padding"); +typedef struct { + uint8_t scales[QK_K/64]; + uint8_t ql[QK_K/4]; + uint8_t qh[QK_K/8]; +} block_iq3_kt; +static_assert(sizeof(block_iq3_kt) == QK_K/4 + QK_K/8 + QK_K/64, "wrong iq3_kt block size/padding"); + typedef struct { ggml_half d; uint16_t extra; diff --git a/ggml/src/ggml-cuda.cu b/ggml/src/ggml-cuda.cu index 00873651..7a582e55 100644 --- a/ggml/src/ggml-cuda.cu +++ b/ggml/src/ggml-cuda.cu @@ -2849,6 +2849,7 @@ GGML_CALL static bool ggml_backend_cuda_supports_op(ggml_backend_t backend, cons case GGML_TYPE_IQ2_K: case GGML_TYPE_IQ2_KS: case GGML_TYPE_IQ2_KT: + case GGML_TYPE_IQ3_KT: case GGML_TYPE_IQ3_K: case GGML_TYPE_IQ4_K: case GGML_TYPE_IQ5_K: diff --git a/ggml/src/ggml-cuda/convert.cu b/ggml/src/ggml-cuda/convert.cu index 96a9e00f..47fc0dbc 100644 --- a/ggml/src/ggml-cuda/convert.cu +++ b/ggml/src/ggml-cuda/convert.cu @@ -371,6 +371,37 @@ static __global__ void dequantize_block_iq2_kt(const void * __restrict__ vx, dst } } +template +static __global__ void dequantize_block_iq3_kt(const void * __restrict__ vx, dst_t * __restrict__ yy, int64_t n_per_row, int64_t row_size) { + + int64_t ii = blockIdx.x; + int64_t row = (QK_K * ii) / n_per_row; + const char * cx = (const char *)vx + row * row_size; + float scale = *(const float *)cx; + const block_iq3_kt * x = (const block_iq3_kt *)(cx + sizeof(float)); + const int64_t i = ii - (row*n_per_row)/QK_K; + + constexpr uint32_t ka = 89226354; + constexpr uint32_t kb = 64248484; + constexpr uint32_t kmask = 0x8fff8fff; + constexpr uint32_t km32 = 0x3b603b60; + + const int64_t tid = threadIdx.x; + const int64_t ib = tid; // 0...31 + dst_t * y = yy + ii*QK_K + 8*ib; + uint32_t idx1 = x[i].ql[2*ib+0] + ((x[i].qh[(2*ib+0)%32] << (8-4*((2*ib+0)/32))) & 0xf00) + 4096; + uint32_t idx2 = x[i].ql[2*ib+1] + ((x[i].qh[(2*ib+1)%32] << (8-4*((2*ib+1)/32))) & 0xf00) + 4096; + const float dl = scale * iq4k_values[((x[i].scales[(ib/4)%4] >> 4*(ib/16)) & 0xf)] * 31.75f * 1.015f; + uint32_t s[2]; + const half * h = (const half *)s; + for (int j = 0; j < 4; ++j) { + idx1 = ka*idx1 + kb; s[0] = (idx1 & kmask) ^ km32; + idx2 = ka*idx2 + kb; s[1] = (idx2 & kmask) ^ km32; + y[j+0] = dl * (float)(h[0] + h[1]); + y[j+4] = dl * (float)(h[2] + h[3]); + } +} + template static __global__ void dequantize_block_iq2_xs(const void * __restrict__ vx, dst_t * __restrict__ yy) { @@ -907,6 +938,13 @@ static void dequantize_row_iq2_kt_cuda(const void * vx, dst_t * y, const int64_t dequantize_block_iq2_kt<<>>(vx, y, n_per_row, ggml_row_size(GGML_TYPE_IQ2_KT, n_per_row)); } +template +static void dequantize_row_iq3_kt_cuda(const void * vx, dst_t * y, const int64_t nrows, const int64_t n_per_row, cudaStream_t stream) { + const int64_t k = nrows * n_per_row; + const int nb = k / QK_K; + dequantize_block_iq3_kt<<>>(vx, y, n_per_row, ggml_row_size(GGML_TYPE_IQ3_KT, n_per_row)); +} + template static void dequantize_row_iq2_xs_cuda(const void * vx, dst_t * y, const int64_t nrows, const int64_t n_per_row, cudaStream_t stream) { const int64_t k = nrows * n_per_row; @@ -1145,6 +1183,8 @@ to_fp16_cuda_t ggml_get_to_fp16_cuda(ggml_type type) { return dequantize_row_iq2_xxs_cuda; case GGML_TYPE_IQ2_KT: return dequantize_row_iq2_kt_cuda; + case GGML_TYPE_IQ3_KT: + return dequantize_row_iq3_kt_cuda; case GGML_TYPE_IQ2_XS: return dequantize_row_iq2_xs_cuda; case GGML_TYPE_IQ2_S: @@ -1218,6 +1258,8 @@ to_fp32_cuda_t ggml_get_to_fp32_cuda(ggml_type type) { return dequantize_row_iq2_xxs_cuda; case GGML_TYPE_IQ2_KT: return dequantize_row_iq2_kt_cuda; + case GGML_TYPE_IQ3_KT: + return dequantize_row_iq3_kt_cuda; case GGML_TYPE_IQ2_XS: return dequantize_row_iq2_xs_cuda; case GGML_TYPE_IQ2_S: diff --git a/ggml/src/ggml-cuda/dmmv.cu b/ggml/src/ggml-cuda/dmmv.cu index 4698a403..b7dc77df 100644 --- a/ggml/src/ggml-cuda/dmmv.cu +++ b/ggml/src/ggml-cuda/dmmv.cu @@ -83,6 +83,10 @@ static __global__ void dequantize_mul_mat_vec_iq2_kt(const void * __restrict__ v } } +static __global__ void dequantize_mul_mat_vec_iq3_kt(const void * __restrict__ vx, const dfloat * __restrict__ yy, float * __restrict__ dst, + const int ncols, int nrows, int64_t row_size) { +} + static __global__ void dequantize_mul_mat_vec_q2_k(const void * __restrict__ vx, const float * __restrict__ yy, float * __restrict__ dst, const int ncols, int nrows) { static_assert(16%K_QUANTS_PER_ITERATION == 0, "16 must be divisible by K_QUANTS_PER_ITERATION"); @@ -639,6 +643,16 @@ static void dequantize_mul_mat_vec_iq2_kt_cuda(const void * vx, const dfloat * y dequantize_mul_mat_vec_iq2_kt<<>>(vx, y, dst, ncols, nrows, row_size); } +static void dequantize_mul_mat_vec_iq3_kt_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) { + GGML_ASSERT(ncols % QK_K == 0); + constexpr int ny = 2; + const int block_num_y = (nrows + ny - 1) / ny; + const dim3 block_nums(block_num_y, 1, 1); + const dim3 block_dims(32, ny, 1); + const int64_t row_size = ggml_row_size(GGML_TYPE_IQ3_KT, ncols); + dequantize_mul_mat_vec_iq3_kt<<>>(vx, y, dst, ncols, nrows, row_size); +} + static void dequantize_mul_mat_vec_q3_K_cuda(const void * vx, const float * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) { GGML_ASSERT(ncols % QK_K == 0); const int ny = 2 / K_QUANTS_PER_ITERATION; @@ -735,6 +749,9 @@ void ggml_cuda_op_dequantize_mul_mat_vec( case GGML_TYPE_IQ2_KT: dequantize_mul_mat_vec_iq2_kt_cuda(src0_dd_i, src1_dfloat, dst_dd_i, ne00, row_diff, stream); break; + case GGML_TYPE_IQ3_KT: + dequantize_mul_mat_vec_iq3_kt_cuda(src0_dd_i, src1_dfloat, dst_dd_i, ne00, row_diff, stream); + break; case GGML_TYPE_Q3_K: dequantize_mul_mat_vec_q3_K_cuda(src0_dd_i, src1_ddf_i, dst_dd_i, ne00, row_diff, stream); break; @@ -768,6 +785,6 @@ bool ggml_cuda_dmmv_type_supported(ggml_type src0_type) { src0_type == GGML_TYPE_Q8_0 || src0_type == GGML_TYPE_Q2_K || src0_type == GGML_TYPE_Q3_K || src0_type == GGML_TYPE_Q4_K || src0_type == GGML_TYPE_Q5_K || src0_type == GGML_TYPE_Q6_K || - src0_type == GGML_TYPE_IQ2_KT || + src0_type == GGML_TYPE_IQ2_KT || src0_type == GGML_TYPE_IQ3_KT || src0_type == GGML_TYPE_F16; } diff --git a/ggml/src/ggml-quants.c b/ggml/src/ggml-quants.c index da02f5a6..81f2a27d 100644 --- a/ggml/src/ggml-quants.c +++ b/ggml/src/ggml-quants.c @@ -15191,6 +15191,7 @@ bool ggml_validate_row_data(enum ggml_type type, const void * data, size_t nbyte case GGML_TYPE_IQ2_K: break; case GGML_TYPE_IQ2_KS: break; case GGML_TYPE_IQ2_KT: break; + case GGML_TYPE_IQ3_KT: break; case GGML_TYPE_IQ3_K: break; case GGML_TYPE_IQ4_K: break; case GGML_TYPE_IQ5_K: break; diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c index 6a53fd21..94db044a 100644 --- a/ggml/src/ggml.c +++ b/ggml/src/ggml.c @@ -1206,6 +1206,19 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = { .nrows = 1, .row_meta_size = 4, }, + [GGML_TYPE_IQ3_KT] = { + .type_name = "iq3_kt", + .blck_size = QK_K, + .type_size = sizeof(block_iq3_kt), + .is_quantized = true, + .to_float = (ggml_to_float_t) dequantize_row_iq3_kt, + .from_float = quantize_row_iq3_kt, + .from_float_ref = (ggml_from_float_t)quantize_row_iq3_kt_ref, + .vec_dot = vec_dot_iq3_kt_q8_k, + .vec_dot_type = GGML_TYPE_Q8_K, + .nrows = 1, + .row_meta_size = 4, + }, [GGML_TYPE_IQ3_K] = { .type_name = "iq3_k", .blck_size = QK_K, @@ -3922,6 +3935,7 @@ enum ggml_type ggml_ftype_to_ggml_type(enum ggml_ftype ftype) { case GGML_FTYPE_MOSTLY_IQ2_K: wtype = GGML_TYPE_IQ2_K; break; case GGML_FTYPE_MOSTLY_IQ2_KS: wtype = GGML_TYPE_IQ2_KS; break; case GGML_FTYPE_MOSTLY_IQ2_KT: wtype = GGML_TYPE_IQ2_KT; break; + case GGML_FTYPE_MOSTLY_IQ3_KT: wtype = GGML_TYPE_IQ3_KT; break; case GGML_FTYPE_MOSTLY_IQ3_K: wtype = GGML_TYPE_IQ3_K; break; case GGML_FTYPE_MOSTLY_IQ4_K: wtype = GGML_TYPE_IQ4_K; break; case GGML_FTYPE_MOSTLY_IQ5_K: wtype = GGML_TYPE_IQ5_K; break; @@ -10446,6 +10460,7 @@ static void ggml_compute_forward_add( case GGML_TYPE_IQ2_K: case GGML_TYPE_IQ2_KS: case GGML_TYPE_IQ2_KT: + case GGML_TYPE_IQ3_KT: case GGML_TYPE_IQ3_K: case GGML_TYPE_IQ4_K: case GGML_TYPE_IQ5_K: @@ -10889,6 +10904,7 @@ static void ggml_compute_forward_add1( case GGML_TYPE_IQ2_K: case GGML_TYPE_IQ2_KS: case GGML_TYPE_IQ2_KT: + case GGML_TYPE_IQ3_KT: case GGML_TYPE_IQ3_K: case GGML_TYPE_IQ4_K: case GGML_TYPE_IQ5_K: @@ -11029,6 +11045,7 @@ static void ggml_compute_forward_acc( case GGML_TYPE_IQ2_K: case GGML_TYPE_IQ2_KS: case GGML_TYPE_IQ2_KT: + case GGML_TYPE_IQ3_KT: case GGML_TYPE_IQ3_K: case GGML_TYPE_IQ4_K: case GGML_TYPE_IQ5_K: @@ -14215,6 +14232,7 @@ static void ggml_compute_forward_out_prod( case GGML_TYPE_IQ2_K: case GGML_TYPE_IQ2_KS: case GGML_TYPE_IQ2_KT: + case GGML_TYPE_IQ3_KT: case GGML_TYPE_IQ3_K: case GGML_TYPE_IQ4_K: case GGML_TYPE_IQ5_K: @@ -14595,6 +14613,7 @@ static void ggml_compute_forward_set( case GGML_TYPE_IQ2_K: case GGML_TYPE_IQ2_KS: case GGML_TYPE_IQ2_KT: + case GGML_TYPE_IQ3_KT: case GGML_TYPE_IQ3_K: case GGML_TYPE_IQ4_K: case GGML_TYPE_IQ5_K: @@ -14869,6 +14888,7 @@ static void ggml_compute_forward_get_rows( case GGML_TYPE_IQ2_K: case GGML_TYPE_IQ2_KS: case GGML_TYPE_IQ2_KT: + case GGML_TYPE_IQ3_KT: case GGML_TYPE_IQ3_K: case GGML_TYPE_IQ4_K: case GGML_TYPE_IQ5_K: @@ -15470,6 +15490,7 @@ static void ggml_compute_forward_clamp( case GGML_TYPE_IQ2_K: case GGML_TYPE_IQ2_KS: case GGML_TYPE_IQ2_KT: + case GGML_TYPE_IQ3_KT: case GGML_TYPE_IQ3_K: case GGML_TYPE_IQ4_K: case GGML_TYPE_IQ5_K: @@ -22297,6 +22318,7 @@ size_t ggml_quantize_chunk( case GGML_TYPE_IQ2_K: result = quantize_iq2_k (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break; case GGML_TYPE_IQ2_KS: result = quantize_iq2_ks (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break; case GGML_TYPE_IQ2_KT: result = quantize_iq2_kt (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break; + case GGML_TYPE_IQ3_KT: result = quantize_iq3_kt (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break; case GGML_TYPE_IQ3_K: result = quantize_iq3_k (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break; case GGML_TYPE_IQ4_K: result = quantize_iq4_k (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break; case GGML_TYPE_IQ5_K: result = quantize_iq5_k (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break; diff --git a/ggml/src/iqk/iqk_quantize.cpp b/ggml/src/iqk/iqk_quantize.cpp index e19a3801..9d7d68ee 100644 --- a/ggml/src/iqk/iqk_quantize.cpp +++ b/ggml/src/iqk/iqk_quantize.cpp @@ -3132,6 +3132,11 @@ static inline float hsum_float_4(__m128 x) { static inline float hsum_float_8(__m256 x) { return hsum_float_4(_mm_add_ps(_mm256_castps256_ps128(x), _mm256_extractf128_ps(x, 1))); } +__m128 hsum_float_4x4(__m128 * accm) { + accm[0] = _mm_add_ps(_mm_unpacklo_ps(accm[0], accm[2]), _mm_unpackhi_ps(accm[0], accm[2])); + accm[1] = _mm_add_ps(_mm_unpacklo_ps(accm[1], accm[3]), _mm_unpackhi_ps(accm[1], accm[3])); + return _mm_add_ps(_mm_unpacklo_ps(accm[0], accm[1]), _mm_unpackhi_ps(accm[0], accm[1])); +} __m256 hsum_float_8x8(__m256 * accm) { for (int i = 0; i < 4; ++i) { accm[i] = _mm256_set_m128(_mm_add_ps(_mm256_castps256_ps128(accm[i+4]), _mm256_extractf128_ps(accm[i+4], 1)), @@ -3141,18 +3146,21 @@ __m256 hsum_float_8x8(__m256 * accm) { return _mm256_add_ps(_mm256_unpacklo_ps(accm[0], accm[1]), _mm256_unpackhi_ps(accm[0], accm[1])); } #endif -class QuantizerIQ2KT { +template +class QuantizerIQKT { + static_assert(group_size == 8 || group_size == 4); + static_assert(block_size >= 8 && block_size%8 == 0); public: constexpr static int kSuperBlockSize = QK_K; - constexpr static int kBlockSize = 32; - constexpr static int kGroupSize = 8; + constexpr static int kBlockSize = block_size; + constexpr static int kGroupSize = group_size; constexpr static int kNg = kBlockSize/kGroupSize; constexpr static int kNblock = kSuperBlockSize/kBlockSize; - constexpr static int kNumVal = 1 << 16; // i.e, 16 bits per group of 8 + constexpr static int kNumVal = 1 << num_bits; // i.e, 16 bits per group of 8 constexpr static float kScale = 31.75f; constexpr static bool kVerbose = false; - QuantizerIQ2KT(); + QuantizerIQKT(); const float * values() const { return m_values.data(); } inline void find_best_match(float d, const float * xb, const float * weight, int * best_idx) const; @@ -3169,8 +3177,6 @@ public: uint32_t s = (x & kmask) ^ km32; float val = GGML_FP16_TO_FP32(s & 65535) + GGML_FP16_TO_FP32(s >> 16); result[k] = scale*val; - //int ival = nearest_int(scale*val); - //result[k] = ival; } } private: @@ -3181,7 +3187,8 @@ private: std::vector> m_in_cluster; }; -QuantizerIQ2KT::QuantizerIQ2KT() { +template +QuantizerIQKT::QuantizerIQKT() { m_values.resize(kNumVal*kGroupSize); float * data = m_values.data(); for (int i = 0; i < kNumVal; ++i) { @@ -3191,30 +3198,34 @@ QuantizerIQ2KT::QuantizerIQ2KT() { // Make 128 clusters. // Note: we get a slightly better result by using 64 clusters // at the expense of almost doubling the quantization time. - m_clusters = cluster_points(m_values, kNumVal/512, 200); + m_clusters = cluster_points(m_values, num_clusters, 200); GGML_ASSERT(!m_clusters.empty()); m_in_cluster = finalize_clusters(m_values, m_clusters); } -float QuantizerIQ2KT::find_best_scale(const float * xb, const float * weight, const int * best_idx) const { -#ifdef __AVX2__ +template +float QuantizerIQKT::find_best_scale(const float * xb, const float * weight, const int * best_idx) const { + float sumqx = 0, sumq2 = 0; +#ifdef z__AVX2__ + // TODO: fix this for kGroupSize != 8 auto vqx = _mm256_setzero_ps(); auto vq2 = _mm256_setzero_ps(); for (int l = 0; l < kNg; ++l) { auto vx = _mm256_loadu_ps(xb+8*l); - auto vw = _mm256_loadu_ps(weight + 8*l); - auto vq = _mm256_loadu_ps(m_values.data() + kGroupSize*best_idx[l]); + auto vw = _mm256_loadu_ps(weight+8*l); + auto vq = kGroupSize == 8 ? _mm256_loadu_ps(m_values.data() + kGroupSize*best_idx[l]) : + _mm256_set_m128(_mm_loadu_ps(m_values.data() + kGroupSize*best_idx[l+1]), + _mm_loadu_ps(m_values.data() + kGroupSize*best_idx[l+0])); auto vqw = _mm256_mul_ps(vq, vw); vqx = _mm256_fmadd_ps(vqw, vx, vqx); vq2 = _mm256_fmadd_ps(vqw, vq, vq2); } - float sumqx = hsum_float_8(vqx); - float sumq2 = hsum_float_8(vq2); + sumqx = hsum_float_8(vqx); + sumq2 = hsum_float_8(vq2); #else - float sumqx = 0, sumq2 = 0; for (int l = 0; l < kNg; ++l) { - auto xl = xb + 8*l; - auto wl = weight + 8*l; + auto xl = xb + kGroupSize*l; + auto wl = weight + kGroupSize*l; auto ql = m_values.data() + kGroupSize*best_idx[l]; for (int k = 0; k < kGroupSize; ++k) { sumqx += wl[k]*ql[k]*xl[k]; @@ -3225,7 +3236,8 @@ float QuantizerIQ2KT::find_best_scale(const float * xb, const float * weight, co return sumq2 > 0 ? sumqx/sumq2 : 0.f; } -void QuantizerIQ2KT::find_best_match(float d, const float * xb, const float * weight, int * best_idx) const { +template +void QuantizerIQKT::find_best_match(float d, const float * xb, const float * weight, int * best_idx) const { if (!d) { std::memset(best_idx, 0, kNg*sizeof(int)); return; @@ -3233,66 +3245,130 @@ void QuantizerIQ2KT::find_best_match(float d, const float * xb, const float * we int ncluster = m_clusters.size()/kGroupSize; float id = 1/d; #ifdef __AVX2__ - __m256 sqx[8]; - __m256i add_idx = _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0); - float sx[8]; - int index[8]; - auto vid = _mm256_set1_ps(id); - for (int l = 0; l < kNg; ++l) { - auto xl = xb + 8*l; - auto wl = weight + 8*l; - auto vx = _mm256_mul_ps(vid, _mm256_loadu_ps(xl)); - auto vw = _mm256_loadu_ps(wl); - auto vbest = _mm256_set1_ps(INFINITY); - auto best_index = _mm256_set1_epi32(-1); - float best = INFINITY; int jbest = -1; - for (int j = 0; j < ncluster; j += 8) { - auto idx = _mm256_add_epi32(_mm256_set1_epi32(j), add_idx); - for (int i = 0; i < 8; ++i) { - auto vq = _mm256_loadu_ps(m_clusters.data() + kGroupSize*(j+i)); - auto vdiff = _mm256_sub_ps(vq, vx); - sqx[i] = _mm256_mul_ps(vw, _mm256_mul_ps(vdiff, vdiff)); + if constexpr (kGroupSize == 8) { + __m256 sqx[8]; + const __m256i add_idx = _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0); + float sx[8]; + int index[8]; + auto vid = _mm256_set1_ps(id); + for (int l = 0; l < kNg; ++l) { + auto xl = xb + 8*l; + auto wl = weight + 8*l; + auto vx = _mm256_mul_ps(vid, _mm256_loadu_ps(xl)); + auto vw = _mm256_loadu_ps(wl); + auto vbest = _mm256_set1_ps(INFINITY); + auto best_index = _mm256_set1_epi32(-1); + float best = INFINITY; int jbest = -1; + for (int j = 0; j < ncluster; j += 8) { + auto idx = _mm256_add_epi32(_mm256_set1_epi32(j), add_idx); + for (int i = 0; i < 8; ++i) { + auto vq = _mm256_loadu_ps(m_clusters.data() + kGroupSize*(j+i)); + auto vdiff = _mm256_sub_ps(vq, vx); + sqx[i] = _mm256_mul_ps(vw, _mm256_mul_ps(vdiff, vdiff)); + } + auto score = hsum_float_8x8(sqx); + auto mask = _mm256_cmp_ps(score, vbest, _CMP_LT_OQ); + best_index = _mm256_or_si256(_mm256_and_si256(_mm256_castps_si256(mask), idx), + _mm256_andnot_si256(_mm256_castps_si256(mask), best_index)); + vbest = _mm256_min_ps(vbest, score); } - auto score = hsum_float_8x8(sqx); - auto mask = _mm256_cmp_ps(score, vbest, _CMP_LT_OQ); - best_index = _mm256_or_si256(_mm256_and_si256(_mm256_castps_si256(mask), idx), - _mm256_andnot_si256(_mm256_castps_si256(mask), best_index)); - vbest = _mm256_min_ps(vbest, score); - } - _mm256_store_ps(sx, vbest); - _mm256_store_si256((__m256i *)index, best_index); - for (int i = 0; i < 8; ++i) { - if (sx[i] < best) { best = sx[i]; jbest = index[i]; } - } - auto& points = m_in_cluster[jbest]; - GGML_ASSERT(!points.empty() && points.size()%8 == 0); - int jbest_cluster = jbest; - vbest = _mm256_set1_ps(INFINITY); - best_index = _mm256_set1_epi32(-1); - best = INFINITY; jbest = -1; - for (int j = 0; j < int(points.size()); j += 8) { - auto idx = _mm256_loadu_si256((const __m256i*)(points.data() + j)); + _mm256_store_ps(sx, vbest); + _mm256_store_si256((__m256i *)index, best_index); for (int i = 0; i < 8; ++i) { - auto vq = _mm256_loadu_ps(m_values.data() + kGroupSize*points[j+i]); - auto vdiff = _mm256_sub_ps(vq, vx); - sqx[i] = _mm256_mul_ps(vw, _mm256_mul_ps(vdiff, vdiff)); + if (sx[i] < best) { best = sx[i]; jbest = index[i]; } } - auto score = hsum_float_8x8(sqx); - auto mask = _mm256_cmp_ps(score, vbest, _CMP_LT_OQ); - best_index = _mm256_or_si256(_mm256_and_si256(_mm256_castps_si256(mask), idx), - _mm256_andnot_si256(_mm256_castps_si256(mask), best_index)); - vbest = _mm256_min_ps(vbest, score); + auto& points = m_in_cluster[jbest]; + GGML_ASSERT(!points.empty() && points.size()%8 == 0); + int jbest_cluster = jbest; + vbest = _mm256_set1_ps(INFINITY); + best_index = _mm256_set1_epi32(-1); + best = INFINITY; jbest = -1; + for (int j = 0; j < int(points.size()); j += 8) { + auto idx = _mm256_loadu_si256((const __m256i*)(points.data() + j)); + for (int i = 0; i < 8; ++i) { + auto vq = _mm256_loadu_ps(m_values.data() + kGroupSize*points[j+i]); + auto vdiff = _mm256_sub_ps(vq, vx); + sqx[i] = _mm256_mul_ps(vw, _mm256_mul_ps(vdiff, vdiff)); + } + auto score = hsum_float_8x8(sqx); + auto mask = _mm256_cmp_ps(score, vbest, _CMP_LT_OQ); + best_index = _mm256_or_si256(_mm256_and_si256(_mm256_castps_si256(mask), idx), + _mm256_andnot_si256(_mm256_castps_si256(mask), best_index)); + vbest = _mm256_min_ps(vbest, score); + } + _mm256_store_ps(sx, vbest); + _mm256_store_si256((__m256i *)index, best_index); + for (int i = 0; i < 8; ++i) { + if (sx[i] < best) { best = sx[i]; jbest = index[i]; } + } + if (jbest < 0) { + fprintf(stderr, "Oops: jbest = %d for cluster %d with %d points\n", jbest, jbest_cluster, int(points.size())); + GGML_ASSERT(false); + } + best_idx[l] = jbest; } - _mm256_store_ps(sx, vbest); - _mm256_store_si256((__m256i *)index, best_index); - for (int i = 0; i < 8; ++i) { - if (sx[i] < best) { best = sx[i]; jbest = index[i]; } + } else { + __m128 sqx[4]; + const __m128i add_idx = _mm_set_epi32(3, 2, 1, 0); + float sx[4]; + int index[4]; + auto vid = _mm_set1_ps(id); + for (int l = 0; l < kNg; ++l) { + auto xl = xb + 4*l; + auto wl = weight + 4*l; + auto vx = _mm_mul_ps(vid, _mm_loadu_ps(xl)); + auto vw = _mm_loadu_ps(wl); + auto vbest = _mm_set1_ps(INFINITY); + auto best_index = _mm_set1_epi32(-1); + float best = INFINITY; int jbest = -1; + for (int j = 0; j < ncluster; j += 4) { + auto idx = _mm_add_epi32(_mm_set1_epi32(j), add_idx); + for (int i = 0; i < 4; ++i) { + auto vq = _mm_loadu_ps(m_clusters.data() + kGroupSize*(j+i)); + auto vdiff = _mm_sub_ps(vq, vx); + sqx[i] = _mm_mul_ps(vw, _mm_mul_ps(vdiff, vdiff)); + } + auto score = hsum_float_4x4(sqx); + auto mask = _mm_cmp_ps(score, vbest, _CMP_LT_OQ); + best_index = _mm_or_si128(_mm_and_si128(_mm_castps_si128(mask), idx), + _mm_andnot_si128(_mm_castps_si128(mask), best_index)); + vbest = _mm_min_ps(vbest, score); + } + _mm_store_ps(sx, vbest); + _mm_store_si128((__m128i *)index, best_index); + for (int i = 0; i < 4; ++i) { + if (sx[i] < best) { best = sx[i]; jbest = index[i]; } + } + auto& points = m_in_cluster[jbest]; + GGML_ASSERT(!points.empty() && points.size()%4 == 0); + int jbest_cluster = jbest; + vbest = _mm_set1_ps(INFINITY); + best_index = _mm_set1_epi32(-1); + best = INFINITY; jbest = -1; + for (int j = 0; j < int(points.size()); j += 4) { + auto idx = _mm_loadu_si128((const __m128i*)(points.data() + j)); + for (int i = 0; i < 4; ++i) { + auto vq = _mm_loadu_ps(m_values.data() + kGroupSize*points[j+i]); + auto vdiff = _mm_sub_ps(vq, vx); + sqx[i] = _mm_mul_ps(vw, _mm_mul_ps(vdiff, vdiff)); + } + auto score = hsum_float_4x4(sqx); + auto mask = _mm_cmp_ps(score, vbest, _CMP_LT_OQ); + best_index = _mm_or_si128(_mm_and_si128(_mm_castps_si128(mask), idx), + _mm_andnot_si128(_mm_castps_si128(mask), best_index)); + vbest = _mm_min_ps(vbest, score); + } + _mm_store_ps(sx, vbest); + _mm_store_si128((__m128i *)index, best_index); + for (int i = 0; i < 4; ++i) { + if (sx[i] < best) { best = sx[i]; jbest = index[i]; } + } + if (jbest < 0) { + fprintf(stderr, "Oops: jbest = %d for cluster %d with %d points\n", jbest, jbest_cluster, int(points.size())); + GGML_ASSERT(false); + } + best_idx[l] = jbest; } - if (jbest < 0) { - fprintf(stderr, "Oops: jbest = %d for cluster %d with %d points\n", jbest, jbest_cluster, int(points.size())); - GGML_ASSERT(false); - } - best_idx[l] = jbest; } #else // TODO @@ -3300,7 +3376,8 @@ void QuantizerIQ2KT::find_best_match(float d, const float * xb, const float * we #endif } -std::vector> QuantizerIQ2KT::finalize_clusters(const std::vector& values, const std::vector& clusters) { +template +std::vector> QuantizerIQKT::finalize_clusters(const std::vector& values, const std::vector& clusters) { int ncluster = clusters.size()/kGroupSize; GGML_ASSERT(ncluster%8 == 0); std::vector> p_in_cluster(ncluster); @@ -3375,7 +3452,8 @@ std::vector> QuantizerIQ2KT::finalize_clusters(const std::vecto return p_in_cluster; } -std::vector QuantizerIQ2KT::cluster_points(const std::vector& points, int ncluster, int niter) { +template +std::vector QuantizerIQKT::cluster_points(const std::vector& points, int ncluster, int niter) { constexpr int ndim = kGroupSize; GGML_ASSERT(points.size() % ndim == 0); int npoint = points.size() / ndim; @@ -3442,12 +3520,15 @@ std::vector QuantizerIQ2KT::cluster_points(const std::vector& poin return result; } +using QuantizerIQ2KT = QuantizerIQKT<32, 8, 16, 128>; + const QuantizerIQ2KT& iq2kt_quantizer() { static std::mutex mutex; std::lock_guard lock(mutex); static QuantizerIQ2KT quantizer; return quantizer; } + void quantize_row_iq2_kt_impl(const float * x, void * vy, int n_per_row, const float * quant_weights, float * all_scales) { constexpr float kSigmaScale = 2.0f; @@ -3640,4 +3721,213 @@ void vec_dot_iq2_kt_q8_k(int n, float * s, size_t bs, const void * vx, size_t bx } +namespace { +using QuantizerIQ3KT = QuantizerIQKT<32, 4, 11, 32>; +const QuantizerIQ3KT& iq3kt_quantizer() { + static std::mutex mutex; + std::lock_guard lock(mutex); + static QuantizerIQ3KT quantizer; + return quantizer; +} + +void quantize_row_iq3_kt_impl(const float * x, void * vy, int n_per_row, const float * quant_weights, float * all_scales) { + + constexpr float kSigmaScale = 2.0f; + + using Q = QuantizerIQ3KT; + + static_assert(Q::kNumVal%8 == 0); + + constexpr int kNumGroups = Q::kSuperBlockSize/Q::kGroupSize; + + float * dptr = (float *)vy; + + block_iq3_kt * y = (block_iq3_kt *)(dptr + 1); + + float weight[Q::kBlockSize]; + int best_idx[Q::kNg]; + + auto& quantizer = iq3kt_quantizer(); + + int nblock = n_per_row / Q::kSuperBlockSize; + + float amax_scale = 0, max_scale = 0; + + for (int ibl = 0; ibl < nblock; ++ibl) { + + memset(&y[ibl], 0, sizeof(block_iq3_kt)); + + const float * xbl = x + ibl*Q::kSuperBlockSize; + float sumx2 = 0; + for (int j = 0; j < Q::kSuperBlockSize; ++j) sumx2 += xbl[j]*xbl[j]; + const float sigma2 = kSigmaScale*sumx2/Q::kSuperBlockSize; + + auto scales = all_scales + ibl*Q::kNblock; + + for (int ib = 0; ib < Q::kNblock; ++ib) { + const float * xb = xbl + Q::kBlockSize*ib; + if (quant_weights) { + const float * qw = quant_weights + ibl*Q::kSuperBlockSize + ib*Q::kBlockSize; + for (int j = 0; j < Q::kBlockSize; ++j) weight[j] = qw[j] * sqrtf(sigma2 + xb[j]*xb[j]); + } else { + for (int j = 0; j < Q::kBlockSize; ++j) weight[j] = 0.25f*sigma2 + xb[j]*xb[j]; + } + float amax = 0; + for (int j = 0; j < Q::kBlockSize; ++j) { + float ax = std::abs(xb[j]); + amax = std::max(amax, ax); + } + float d = amax/96.f; + quantizer.find_best_match(d, xb, weight, best_idx); + scales[ib] = quantizer.find_best_scale(xb, weight, best_idx); + + for (int j = 0; j < Q::kNg; ++j) { + int jj = ib*Q::kNg + j; + y[ibl].ql[jj] = best_idx[j] & 255; + y[ibl].qh[jj%(kNumGroups/2)] |= ((best_idx[j] >> 8) << 4*(jj/(kNumGroups/2))); + } + + float abs_scale = std::abs(scales[ib]); + if (abs_scale > amax_scale) { + amax_scale = abs_scale; + max_scale = scales[ib]; + } + } + + } + + float d = max_scale/iq4k_values[0]; + float id = d ? 1/d : 0.f; + for (int ibl = 0; ibl < nblock; ++ibl) { + auto scales = all_scales + ibl*Q::kNblock; + for (int ib = 0; ib < Q::kNblock/2; ++ib) { + int ls1 = best_index_iq4nl(iq4k_values, id*scales[ib]); + int ls2 = best_index_iq4nl(iq4k_values, id*scales[ib + Q::kNblock/2]); + y[ibl].scales[ib] = ls1 | (ls2 << 4); + } + } + + //d *= 1.05f; + *dptr = d; + + for (int iloop = 0; iloop < 2; ++iloop) { + + d *= 1.05f; + + float sumqx = 0, sumq2 = 0; + for (int ibl = 0; ibl < nblock; ++ibl) { + + std::memset(y[ibl].qh, 0, kNumGroups/2); + const float * xbl = x + ibl*Q::kSuperBlockSize; + float sumx2 = 0; + for (int j = 0; j < Q::kSuperBlockSize; ++j) sumx2 += xbl[j]*xbl[j]; + const float sigma2 = kSigmaScale*sumx2/Q::kSuperBlockSize; + + for (int ib = 0; ib < Q::kNblock; ++ib) { + const float * xb = xbl + Q::kBlockSize*ib; + if (quant_weights) { + const float * qw = quant_weights + ibl*Q::kSuperBlockSize + ib*Q::kBlockSize; + for (int j = 0; j < Q::kBlockSize; ++j) weight[j] = qw[j] * sqrtf(sigma2 + xb[j]*xb[j]); + } else { + for (int j = 0; j < Q::kBlockSize; ++j) weight[j] = 0.25f*sigma2 + xb[j]*xb[j]; + } + int ls = iq4k_values[(y[ibl].scales[ib%(Q::kNblock/2)] >> 4*(ib/(Q::kNblock/2))) & 0xf]; + float dl = d*ls; + quantizer.find_best_match(dl, xb, weight, best_idx); + + for (int j = 0; j < Q::kNg; ++j) { + int jj = ib*Q::kNg + j; + y[ibl].ql[jj] = best_idx[j] & 255; + y[ibl].qh[jj%(kNumGroups/2)] |= ((best_idx[j] >> 8) << 4*(jj/(kNumGroups/2))); + auto xl = xb + Q::kGroupSize*j; + auto wl = weight + Q::kGroupSize*j; + auto ql = quantizer.values() + best_idx[j]*Q::kGroupSize; + for (int k = 0; k < Q::kGroupSize; ++k) { + float q = ql[k]*ls; + sumqx += wl[k]*xl[k]*q; + sumq2 += wl[k]*q*q; + } + } + } + } + if (sumq2 > 0) { + d = sumqx/sumq2; + *dptr = d; + } else { + break; + } + } +} +} + +void quantize_row_iq3_kt_ref(const float * x, block_iq3_kt * y, int64_t k) { + assert(k % QK_K == 0); + quantize_iq3_kt(x, (void *)y, 1, k, nullptr); +} + +void quantize_row_iq3_kt(const float * x, void * vy, int64_t k) { + assert(k % QK_K == 0); + block_iq3_kt * y = (block_iq3_kt *)vy; + quantize_row_iq3_kt_ref(x, y, k); +} + +size_t quantize_iq3_kt(const float * src, void * dst, int64_t nrows, int64_t n_per_row, const float * imatrix) { + GGML_ASSERT(n_per_row%QK_K == 0); + auto row_size = ggml_row_size(GGML_TYPE_IQ3_KT, n_per_row); + std::vector scales(n_per_row/QuantizerIQ3KT::kBlockSize); + char * qrow = (char *)dst; + for (int64_t row = 0; row < nrows; ++row) { + quantize_row_iq3_kt_impl(src, (void *)qrow, n_per_row, imatrix, scales.data()); + src += n_per_row; + qrow += row_size; + } + return nrows * row_size; +} + +void dequantize_row_iq3_kt(const block_iq3_kt * x, float * y, int64_t k) { + using Q = QuantizerIQ3KT; + constexpr int kNumGroups = Q::kSuperBlockSize/Q::kGroupSize; + assert(k % Q::kSuperBlockSize == 0); + const int nb = k / Q::kSuperBlockSize; + const float * dptr = (const float *)x; + const float d = *dptr * Q::kScale; + x = (const block_iq3_kt *)(dptr + 1); + auto& deq = iq3kt_quantizer(); + for (int ibl = 0; ibl < nb; ++ibl) { + auto yl = y + ibl*Q::kSuperBlockSize; + auto yh = yl + Q::kSuperBlockSize/2; + auto qll = x[ibl].ql; + auto qlh = qll + kNumGroups/2; + int jj = 0; + for (int ib = 0; ib < Q::kNblock/2; ++ib) { + float sl = d * iq4k_values[x[ibl].scales[ib] & 0xf]; + float sh = d * iq4k_values[x[ibl].scales[ib] >> 4]; + for (int ig = 0; ig < Q::kNg; ++ig) { + uint16_t ul = qll[jj] | ((x[ibl].qh[jj] << 8) & 0xf00); + uint16_t uh = qlh[jj] | ((x[ibl].qh[jj] << 4) & 0xf00); + deq.set_values(ul, yl, sl); + deq.set_values(uh, yh, sh); + yl += Q::kGroupSize; + yh += Q::kGroupSize; + ++jj; + } + } + } +} + +void vec_dot_iq3_kt_q8_k(int n, float * s, size_t bs, const void * vx, size_t bx, const void * vy, size_t by, int nrc) { + assert(n % QK_K == 0); + assert(nrc == 1); + GGML_UNUSED(nrc); + GGML_UNUSED(bx); + GGML_UNUSED(by); + GGML_UNUSED(bs); + +#if GGML_USE_IQK_MULMAT + if (iqk_mul_mat(1, 1, n, GGML_TYPE_IQ3_KT, vx, 0, GGML_TYPE_Q8_K, vy, 0, s, 0, 0, 1)) { + return; + } +#endif + +} diff --git a/ggml/src/iqk/iqk_quantize.h b/ggml/src/iqk/iqk_quantize.h index 814b613d..0b5e0818 100644 --- a/ggml/src/iqk/iqk_quantize.h +++ b/ggml/src/iqk/iqk_quantize.h @@ -67,6 +67,12 @@ size_t quantize_iq2_kt(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst void dequantize_row_iq2_kt(const block_iq2_kt * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k); void vec_dot_iq2_kt_q8_k(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc); +void quantize_row_iq3_kt_ref(const float * GGML_RESTRICT x, block_iq3_kt * GGML_RESTRICT y, int64_t k); +void quantize_row_iq3_kt(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k); +size_t quantize_iq3_kt(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix); +void dequantize_row_iq3_kt(const block_iq3_kt * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k); +void vec_dot_iq3_kt_q8_k(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc); + void iqk_quantize_row_q8_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k); #ifdef __cplusplus diff --git a/include/llama.h b/include/llama.h index 8012c5b3..f44ebe41 100644 --- a/include/llama.h +++ b/include/llama.h @@ -180,6 +180,7 @@ extern "C" { LLAMA_FTYPE_MOSTLY_IQ2_KS = 147, // except 1d tensors LLAMA_FTYPE_MOSTLY_IQ4_KSS = 148, // except 1d tensors LLAMA_FTYPE_MOSTLY_IQ2_KT = 149, // except 1d tensors + LLAMA_FTYPE_MOSTLY_IQ3_KT = 150, // except 1d tensors LLAMA_FTYPE_GUESSED = 1024, // not specified in the model file }; diff --git a/src/llama.cpp b/src/llama.cpp index 7106ef60..1e3d70b2 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -3845,6 +3845,7 @@ struct llama_model_loader { case GGML_TYPE_IQ2_KT: ftype = LLAMA_FTYPE_MOSTLY_IQ2_KT; break; case GGML_TYPE_IQ2_S: ftype = LLAMA_FTYPE_MOSTLY_IQ2_S; break; case GGML_TYPE_IQ3_XXS: ftype = LLAMA_FTYPE_MOSTLY_IQ3_XXS; break; + case GGML_TYPE_IQ3_KT: ftype = LLAMA_FTYPE_MOSTLY_IQ3_KT; break; case GGML_TYPE_IQ1_S: ftype = LLAMA_FTYPE_MOSTLY_IQ1_S; break; case GGML_TYPE_IQ1_M: ftype = LLAMA_FTYPE_MOSTLY_IQ1_M; break; case GGML_TYPE_IQ1_BN: ftype = LLAMA_FTYPE_MOSTLY_IQ1_BN; break; @@ -4552,6 +4553,7 @@ static std::string llama_model_ftype_name(llama_ftype ftype) { case LLAMA_FTYPE_MOSTLY_IQ2_M: return "IQ2_M - 2.7 bpw"; case LLAMA_FTYPE_MOSTLY_IQ3_XS: return "IQ3_XS - 3.3 bpw"; case LLAMA_FTYPE_MOSTLY_IQ3_XXS: return "IQ3_XXS - 3.0625 bpw"; + case LLAMA_FTYPE_MOSTLY_IQ3_KT: return "IQ3_KT - 3.125 bpw"; case LLAMA_FTYPE_MOSTLY_IQ1_S: return "IQ1_S - 1.5625 bpw"; case LLAMA_FTYPE_MOSTLY_IQ1_M: return "IQ1_M - 1.75 bpw"; case LLAMA_FTYPE_MOSTLY_IQ4_NL: return "IQ4_NL - 4.5 bpw"; @@ -15736,7 +15738,7 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ1_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M || ftype == LLAMA_FTYPE_MOSTLY_IQ1_M || ftype == LLAMA_FTYPE_MOSTLY_IQ2_K || ftype == LLAMA_FTYPE_MOSTLY_IQ3_K || - ftype == LLAMA_FTYPE_MOSTLY_IQ2_KS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_KT) { + ftype == LLAMA_FTYPE_MOSTLY_IQ2_KS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_KT || ftype == LLAMA_FTYPE_MOSTLY_IQ3_KT) { new_type = !qs.has_output ? GGML_TYPE_IQ4_K : GGML_TYPE_Q5_K; } else if ((ftype == LLAMA_FTYPE_MOSTLY_IQ3_S || ftype == LLAMA_FTYPE_MOSTLY_IQ3_M || ftype == LLAMA_FTYPE_MOSTLY_IQ4_XS || @@ -15758,7 +15760,7 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M) { new_type = GGML_TYPE_IQ3_S; } - else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) { + else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ3_KT) { new_type = GGML_TYPE_IQ3_S; } else if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_BN || ftype == LLAMA_FTYPE_MOSTLY_IQ2_BN) { @@ -15816,6 +15818,10 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n new_type = qs.model.hparams.n_gqa() >= 4 ? GGML_TYPE_Q4_K : qs.model.hparams.n_gqa() >= 2 ? GGML_TYPE_IQ3_K : !qs.has_imatrix ? GGML_TYPE_IQ3_S : GGML_TYPE_IQ3_XXS; } + else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_KT) { + new_type = qs.model.hparams.n_gqa() >= 4 ? GGML_TYPE_Q4_K : qs.model.hparams.n_gqa() >= 2 ? GGML_TYPE_IQ3_K + : !qs.has_imatrix ? GGML_TYPE_IQ3_K : GGML_TYPE_IQ3_KT; + } else if ((ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ3_S) && qs.model.hparams.n_gqa() >= 2) { new_type = GGML_TYPE_IQ4_K; } @@ -15889,7 +15895,7 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_S) { if (qs.model.hparams.n_vocab >= 127999 && (qs.model.type == MODEL_8B || qs.model.type == MODEL_70B)) new_type = GGML_TYPE_Q4_K; - } + } } else if (name.find("ffn_down") != std::string::npos) { auto info = layer_info(qs.i_ffn_down, qs.n_ffn_down, name.c_str()); int i_layer = info.first, n_layer = info.second; @@ -15901,6 +15907,9 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS && !qs.has_imatrix) { new_type = i_layer < n_layer/8 ? GGML_TYPE_Q4_K : GGML_TYPE_Q3_K; } + else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_KT && !qs.has_imatrix) { + new_type = i_layer < n_layer/8 ? GGML_TYPE_IQ4_K : GGML_TYPE_IQ3_K; + } else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) { new_type = i_layer < n_layer/16 ? GGML_TYPE_Q5_K : arch != LLM_ARCH_FALCON || use_more_bits(i_layer, n_layer) ? GGML_TYPE_Q4_K @@ -15948,12 +15957,13 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n ftype == LLAMA_FTYPE_MOSTLY_Q3_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_IQ4_NL || ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M || ftype == LLAMA_FTYPE_MOSTLY_IQ3_S || ftype == LLAMA_FTYPE_MOSTLY_IQ3_M || ftype == LLAMA_FTYPE_MOSTLY_IQ4_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ4_K || - ftype == LLAMA_FTYPE_MOSTLY_IQ2_K || ftype == LLAMA_FTYPE_MOSTLY_IQ3_K) { + ftype == LLAMA_FTYPE_MOSTLY_IQ2_K || ftype == LLAMA_FTYPE_MOSTLY_IQ3_K || ftype == LLAMA_FTYPE_MOSTLY_IQ3_KT) { new_type = GGML_TYPE_Q5_K; } } else { if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K ) new_type = GGML_TYPE_Q3_K; else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) new_type = GGML_TYPE_IQ3_S; + else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_KT ) new_type = GGML_TYPE_IQ3_K; else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M ) new_type = GGML_TYPE_Q4_K; else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L ) new_type = GGML_TYPE_Q5_K; else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_M ) new_type = GGML_TYPE_IQ4_K; @@ -16016,8 +16026,8 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n new_type == GGML_TYPE_IQ3_XXS || new_type == GGML_TYPE_IQ1_S || new_type == GGML_TYPE_IQ3_S || new_type == GGML_TYPE_IQ1_M || new_type == GGML_TYPE_IQ4_K || new_type == GGML_TYPE_IQ2_K || new_type == GGML_TYPE_IQ5_K || new_type == GGML_TYPE_IQ3_K || - new_type == GGML_TYPE_IQ6_K || new_type == GGML_TYPE_IQ4_KS || - new_type == GGML_TYPE_IQ2_KS || new_type == GGML_TYPE_IQ4_KSS || new_type == GGML_TYPE_IQ2_KT) { + new_type == GGML_TYPE_IQ6_K || new_type == GGML_TYPE_IQ4_KS || new_type == GGML_TYPE_IQ2_KT || + new_type == GGML_TYPE_IQ2_KS || new_type == GGML_TYPE_IQ4_KSS || new_type == GGML_TYPE_IQ3_KT) { int nx = tensor->ne[0]; int ny = tensor->ne[1]; if (nx % QK_K != 0) { @@ -16041,6 +16051,7 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n case GGML_TYPE_IQ2_KT: case GGML_TYPE_IQ2_S: case GGML_TYPE_IQ3_XXS: + case GGML_TYPE_IQ3_KT: case GGML_TYPE_IQ3_S: case GGML_TYPE_IQ1_S: case GGML_TYPE_IQ1_M: @@ -16154,6 +16165,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s case LLAMA_FTYPE_MOSTLY_IQ2_S: default_type = GGML_TYPE_IQ2_XS; break; case LLAMA_FTYPE_MOSTLY_IQ2_M: default_type = GGML_TYPE_IQ2_S; break; case LLAMA_FTYPE_MOSTLY_IQ3_XXS: default_type = GGML_TYPE_IQ3_XXS; break; + case LLAMA_FTYPE_MOSTLY_IQ3_KT: default_type = GGML_TYPE_IQ3_KT; break; case LLAMA_FTYPE_MOSTLY_IQ1_S: default_type = GGML_TYPE_IQ1_S; break; case LLAMA_FTYPE_MOSTLY_IQ1_M: default_type = GGML_TYPE_IQ1_M; break; case LLAMA_FTYPE_MOSTLY_IQ1_BN: default_type = GGML_TYPE_IQ1_BN; break;