From a4f1ac8da48e93ca6458368ae003553fd44f861d Mon Sep 17 00:00:00 2001 From: Iwan Kawrakow Date: Tue, 5 Nov 2024 18:50:08 +0200 Subject: [PATCH] iq2_kt: quantize / dequantize I now see that I was comparing apples to oranges: iq2_xxs was using a weight of sigma^2/4 + x^2, while the Trellis approach wasn't (weight = 1). Once I use the same weight, iq2_kt is actually slightly worse than iq2_xxs in terms of rmse, so does not look promising at this point. Also, once each group of 8 Trellis values no longer has a constant sum(q^2) that we can precompute, quantization becomes significantly slower (476 seconds for LLaMA-3.1-8B). --- examples/quantize-stats/quantize-stats.cpp | 33 +++- ggml/include/ggml.h | 2 + ggml/src/ggml-common.h | 8 + ggml/src/ggml.c | 22 +++ ggml/src/iqk/iqk_quantize.cpp | 212 +++++++++++++++++++++ ggml/src/iqk/iqk_quantize.h | 6 + 6 files changed, 281 insertions(+), 2 deletions(-) diff --git a/examples/quantize-stats/quantize-stats.cpp b/examples/quantize-stats/quantize-stats.cpp index 281dcd03..99f03495 100644 --- a/examples/quantize-stats/quantize-stats.cpp +++ b/examples/quantize-stats/quantize-stats.cpp @@ -257,6 +257,24 @@ static inline int nearest_int(float fval) { return (i & 0x007fffff) - 0x00400000; } +//static void fast_ht(int n, float * values) { +// constexpr float ksqrt2 = 0.707106781f; +// float scale = 1; +// int h = 1; +// while (h < n) { +// for (int i = 0; i < n; i += 2*h) { +// for (int j = i; j < i + h; ++j) { +// float x = values[j], y = values[j + h]; +// values[j+0] = x + y; +// values[j+h] = x - y; +// } +// } +// h *= 2; +// scale *= ksqrt2; +// } +// for (int i = 0; i < n; ++i) values[i] *= scale; +//} + static const int8_t scale_values[16] = {-127, -104, -83, -65, -49, -35, -22, -10, 1, 13, 25, 38, 53, 69, 89, 113}; //static std::vector make_values(int nval, int n_per_val) { @@ -374,6 +392,7 @@ static void analyze_x(const char * name, int nrows, int n_per_row, const float * float lmse = 0, lmse_q = 0; std::vector scales(n_per_row/kBlockSize); std::vector best_idx(n_per_row/kBlockSize); + //float xtmp[kBlockSize]; while (true) { std::unique_lock lock(mutex); int first = counter; counter += chunk; @@ -395,8 +414,13 @@ static void analyze_x(const char * name, int nrows, int n_per_row, const float * for (int ib = 0; ib < n_per_row/kBlockSize; ++ib) { float best = 0, d = 0; int jbest = -1; auto xb = xr + kBlockSize*ib; + //std::memcpy(xtmp, xb, kBlockSize*sizeof(float)); + //fast_ht(kBlockSize, xtmp); #ifdef __AVX2__ - for (int l = 0; l < kBlockSize/8; ++l) vx[l] = _mm256_loadu_ps(xb+8*l); + for (int l = 0; l < kBlockSize/8; ++l) { + //vx[l] = _mm256_loadu_ps(xtmp+8*l); + vx[l] = _mm256_loadu_ps(xb+8*l); + } auto vbest = _mm256_set1_ps(0.f); auto best_index = _mm256_set1_epi32(-1); for (int j = 0; j < kNumVal; j += 8) { @@ -422,7 +446,8 @@ static void analyze_x(const char * name, int nrows, int n_per_row, const float * } auto qv = codes.data() + kBlockSize*jbest; float sumqx = 0; - for (int k = 0; k < 8; ++k) sumqx += xb[k]*qv[k]; + for (int k = 0; k < kBlockSize; ++k) sumqx += xb[k]*qv[k]; + //for (int k = 0; k < kBlockSize; ++k) sumqx += xtmp[k]*qv[k]; d = sumqx*sumq2i[jbest]; #else for (int j = 0; j < kNumVal; ++j) { @@ -440,6 +465,7 @@ static void analyze_x(const char * name, int nrows, int n_per_row, const float * best_idx[ib] = jbest; for (int k = 0; k < kBlockSize; ++k) { float diff = xb[k] - d*qv[k]; + //float diff = xtmp[k] - d*qv[k]; lmse += diff*diff; } } @@ -458,9 +484,12 @@ static void analyze_x(const char * name, int nrows, int n_per_row, const float * int ls = best_index_scale(scale_values, id*scales[ib]); float dl = d * scale_values[ls]; auto xb = xr + kBlockSize*ib; + //std::memcpy(xtmp, xb, kBlockSize*sizeof(float)); + //fast_ht(kBlockSize, xtmp); auto qv = codes.data() + kBlockSize*best_idx[ib]; for (int k = 0; k < kBlockSize; ++k) { float diff = xb[k] - dl*qv[k]; + //float diff = xtmp[k] - dl*qv[k]; lmse_q += diff*diff; } } diff --git a/ggml/include/ggml.h b/ggml/include/ggml.h index 8980285f..276fa8b4 100644 --- a/ggml/include/ggml.h +++ b/ggml/include/ggml.h @@ -406,6 +406,7 @@ extern "C" { GGML_TYPE_IQ4_KS = 144, GGML_TYPE_IQ2_KS = 145, GGML_TYPE_IQ4_KSS = 146, + GGML_TYPE_IQ2_KT = 147, GGML_TYPE_COUNT, }; @@ -464,6 +465,7 @@ extern "C" { GGML_FTYPE_MOSTLY_IQ4_KS = 137, // except 1d tensors GGML_FTYPE_MOSTLY_IQ2_KS = 138, // except 1d tensors GGML_FTYPE_MOSTLY_IQ4_KSS = 139, // except 1d tensors + GGML_FTYPE_MOSTLY_IQ2_KT = 140, // except 1d tensors }; // available tensor operations: diff --git a/ggml/src/ggml-common.h b/ggml/src/ggml-common.h index f0c1ae68..ecaf2a20 100644 --- a/ggml/src/ggml-common.h +++ b/ggml/src/ggml-common.h @@ -454,6 +454,14 @@ typedef struct { } block_iq2_ks; static_assert(sizeof(block_iq2_ks) == sizeof(uint16_t) + QK_K/64 + QK_K/4, "wrong iq2_ks block size/padding"); +typedef struct { + ggml_half d; + uint8_t scales[QK_K/16]; + uint8_t qh[QK_K/16]; + uint8_t ql[QK_K/8]; +} block_iq2_kt; +static_assert(sizeof(block_iq2_kt) == sizeof(ggml_half) + QK_K/4, "wrong iq2_kt block size/padding"); + typedef struct { ggml_half d; uint16_t extra; diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c index 39218ff4..adc7b29e 100644 --- a/ggml/src/ggml.c +++ b/ggml/src/ggml.c @@ -1193,6 +1193,19 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = { .nrows = 1, .row_meta_size = 2, }, + [GGML_TYPE_IQ2_KT] = { + .type_name = "iq2_kt", + .blck_size = QK_K, + .type_size = sizeof(block_iq2_kt), + .is_quantized = true, + .to_float = (ggml_to_float_t) dequantize_row_iq2_kt, + .from_float = quantize_row_iq2_kt, + .from_float_ref = (ggml_from_float_t)quantize_row_iq2_kt_ref, + .vec_dot = vec_dot_iq2_kt_q8_k, + .vec_dot_type = GGML_TYPE_Q8_K, + .nrows = 1, + .row_meta_size = 0, + }, [GGML_TYPE_IQ3_K] = { .type_name = "iq3_k", .blck_size = QK_K, @@ -3908,6 +3921,7 @@ enum ggml_type ggml_ftype_to_ggml_type(enum ggml_ftype ftype) { case GGML_FTYPE_MOSTLY_IQ4_KSS: wtype = GGML_TYPE_IQ4_KSS; break; case GGML_FTYPE_MOSTLY_IQ2_K: wtype = GGML_TYPE_IQ2_K; break; case GGML_FTYPE_MOSTLY_IQ2_KS: wtype = GGML_TYPE_IQ2_KS; break; + case GGML_FTYPE_MOSTLY_IQ2_KT: wtype = GGML_TYPE_IQ2_KT; break; case GGML_FTYPE_MOSTLY_IQ3_K: wtype = GGML_TYPE_IQ3_K; break; case GGML_FTYPE_MOSTLY_IQ4_K: wtype = GGML_TYPE_IQ4_K; break; case GGML_FTYPE_MOSTLY_IQ5_K: wtype = GGML_TYPE_IQ5_K; break; @@ -10431,6 +10445,7 @@ static void ggml_compute_forward_add( case GGML_TYPE_IQ4_KSS: case GGML_TYPE_IQ2_K: case GGML_TYPE_IQ2_KS: + case GGML_TYPE_IQ2_KT: case GGML_TYPE_IQ3_K: case GGML_TYPE_IQ4_K: case GGML_TYPE_IQ5_K: @@ -10873,6 +10888,7 @@ static void ggml_compute_forward_add1( case GGML_TYPE_IQ4_KSS: case GGML_TYPE_IQ2_K: case GGML_TYPE_IQ2_KS: + case GGML_TYPE_IQ2_KT: case GGML_TYPE_IQ3_K: case GGML_TYPE_IQ4_K: case GGML_TYPE_IQ5_K: @@ -11012,6 +11028,7 @@ static void ggml_compute_forward_acc( case GGML_TYPE_IQ4_KSS: case GGML_TYPE_IQ2_K: case GGML_TYPE_IQ2_KS: + case GGML_TYPE_IQ2_KT: case GGML_TYPE_IQ3_K: case GGML_TYPE_IQ4_K: case GGML_TYPE_IQ5_K: @@ -14197,6 +14214,7 @@ static void ggml_compute_forward_out_prod( case GGML_TYPE_IQ4_KSS: case GGML_TYPE_IQ2_K: case GGML_TYPE_IQ2_KS: + case GGML_TYPE_IQ2_KT: case GGML_TYPE_IQ3_K: case GGML_TYPE_IQ4_K: case GGML_TYPE_IQ5_K: @@ -14576,6 +14594,7 @@ static void ggml_compute_forward_set( case GGML_TYPE_IQ4_KSS: case GGML_TYPE_IQ2_K: case GGML_TYPE_IQ2_KS: + case GGML_TYPE_IQ2_KT: case GGML_TYPE_IQ3_K: case GGML_TYPE_IQ4_K: case GGML_TYPE_IQ5_K: @@ -14849,6 +14868,7 @@ static void ggml_compute_forward_get_rows( case GGML_TYPE_IQ4_KSS: case GGML_TYPE_IQ2_K: case GGML_TYPE_IQ2_KS: + case GGML_TYPE_IQ2_KT: case GGML_TYPE_IQ3_K: case GGML_TYPE_IQ4_K: case GGML_TYPE_IQ5_K: @@ -15449,6 +15469,7 @@ static void ggml_compute_forward_clamp( case GGML_TYPE_IQ4_KSS: case GGML_TYPE_IQ2_K: case GGML_TYPE_IQ2_KS: + case GGML_TYPE_IQ2_KT: case GGML_TYPE_IQ3_K: case GGML_TYPE_IQ4_K: case GGML_TYPE_IQ5_K: @@ -22275,6 +22296,7 @@ size_t ggml_quantize_chunk( case GGML_TYPE_IQ4_KSS: result = quantize_iq4_kss(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break; case GGML_TYPE_IQ2_K: result = quantize_iq2_k (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break; case GGML_TYPE_IQ2_KS: result = quantize_iq2_ks (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break; + case GGML_TYPE_IQ2_KT: result = quantize_iq2_kt (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break; case GGML_TYPE_IQ3_K: result = quantize_iq3_k (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break; case GGML_TYPE_IQ4_K: result = quantize_iq4_k (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break; case GGML_TYPE_IQ5_K: result = quantize_iq5_k (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break; diff --git a/ggml/src/iqk/iqk_quantize.cpp b/ggml/src/iqk/iqk_quantize.cpp index b9d48237..41a15a56 100644 --- a/ggml/src/iqk/iqk_quantize.cpp +++ b/ggml/src/iqk/iqk_quantize.cpp @@ -3119,4 +3119,216 @@ void vec_dot_iq4_kss_q8_k(int n, float * s, size_t bs, const void * vx, size_t b GGML_UNUSED(by); } +// ========================================== iq2_kt ==================================================== + +namespace { +class QuantizerIQ2KT { +public: + constexpr static int kSuperBlockSize = 256; + constexpr static int kBlockSize = 8; + constexpr static int kNblock = kSuperBlockSize/kBlockSize; + constexpr static int kNumVal = 1 << 12; + QuantizerIQ2KT(); + const float * values() const { return m_values.data(); } + static inline void set_values(uint32_t i, float * result) { + constexpr uint32_t ka = 89226354; + constexpr uint32_t kb = 64248484; + constexpr uint32_t kmask = 0x8fff8fff; + constexpr uint32_t km32 = 0x3b603b60; + uint32_t x = i + 4096; + for (int k = 0; k < kBlockSize; ++k) { + x = ka*x + kb; + uint32_t s = (x & kmask) ^ km32; + float val = GGML_FP16_TO_FP32(s & 65535) + GGML_FP16_TO_FP32(s >> 16); + int ival = nearest_int(16.f*val); + result[k] = ival; + } + } +private: + std::vector m_values; +}; +QuantizerIQ2KT::QuantizerIQ2KT() { + m_values.resize(kNumVal*kBlockSize); + float * data = m_values.data(); + for (int i = 0; i < kNumVal; ++i) { + set_values(i, data); + data += kBlockSize; + } +} +const QuantizerIQ2KT& iq2kt_quantizer() { + static std::mutex mutex; + std::lock_guard lock(mutex); + static QuantizerIQ2KT quantizer; + return quantizer; +} +#ifdef __AVX2__ +__m256 hsum_float_8x8(__m256 * accm) { + for (int i = 0; i < 4; ++i) { + accm[i] = _mm256_set_m128(_mm_add_ps(_mm256_castps256_ps128(accm[i+4]), _mm256_extractf128_ps(accm[i+4], 1)), + _mm_add_ps(_mm256_castps256_ps128(accm[i+0]), _mm256_extractf128_ps(accm[i+0], 1))); + } + for (int i = 0; i < 2; ++i) accm[i] = _mm256_add_ps(_mm256_unpacklo_ps(accm[i], accm[i+2]), _mm256_unpackhi_ps(accm[i], accm[i+2])); + return _mm256_add_ps(_mm256_unpacklo_ps(accm[0], accm[1]), _mm256_unpackhi_ps(accm[0], accm[1])); +} +#endif +void quantize_row_iq2_kt_impl(const float * x, void * vy, int n_per_row, const float * quant_weights) { + + static_assert(QuantizerIQ2KT::kNumVal%8 == 0); + + block_iq2_kt * y = (block_iq2_kt *)vy; + + float weight[QuantizerIQ2KT::kBlockSize]; + float scales[QuantizerIQ2KT::kNblock]; + + const int nblock = n_per_row/QuantizerIQ2KT::kSuperBlockSize; + + auto& quantizer = iq2kt_quantizer(); + auto values = quantizer.values(); + +#ifdef __AVX2__ + __m256 sqx[8]; + __m256 sq2[8]; + __m256i add_idx = _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0); + float sx[8]; + int index[8]; +#endif + + for (int ibl = 0; ibl < nblock; ++ibl) { + + memset(&y[ibl], 0, sizeof(block_iq2_kt)); + + const float * xbl = x + ibl*QuantizerIQ2KT::kSuperBlockSize; + float sumx2 = 0; + for (int j = 0; j < QuantizerIQ2KT::kSuperBlockSize; ++j) sumx2 += xbl[j]*xbl[j]; + const float sigma2 = 1.5f*sumx2/QuantizerIQ2KT::kSuperBlockSize; + + float amax_scale = 0, max_scale = 0; + + for (int ib = 0; ib < QuantizerIQ2KT::kNblock; ++ib) { + const float * xb = xbl + QuantizerIQ2KT::kBlockSize*ib; + if (quant_weights) { + const float * qw = quant_weights + ibl*QuantizerIQ2KT::kSuperBlockSize + ib*QuantizerIQ2KT::kBlockSize; + for (int j = 0; j < QuantizerIQ2KT::kBlockSize; ++j) weight[j] = qw[j] * sqrtf(sigma2 + xb[j]*xb[j]); + } else { + for (int j = 0; j < QuantizerIQ2KT::kBlockSize; ++j) weight[j] = 1;//0.25f*sigma2 + xb[j]*xb[j]; + } +#ifdef __AVX2__ + auto vw = _mm256_loadu_ps(weight); + auto vx = _mm256_loadu_ps(xb); + auto vbest = _mm256_set1_ps(0.f); + auto best_index = _mm256_set1_epi32(-1); + for (int j = 0; j < QuantizerIQ2KT::kNumVal; j += 8) { + auto idx = _mm256_add_epi32(_mm256_set1_epi32(j), add_idx); + for (int i = 0; i < 8; ++i) { + auto vq = _mm256_loadu_ps(values + QuantizerIQ2KT::kBlockSize*(j+i)); + auto wqv = _mm256_mul_ps(vq, vw); + sqx[i] = _mm256_mul_ps(wqv, vx); + sq2[i] = _mm256_mul_ps(wqv, vq); + } + auto sumqx = hsum_float_8x8(sqx); + auto sumq2 = hsum_float_8x8(sq2); + //auto score = _mm256_div_ps(_mm256_mul_ps(sumqx, sumqx), sumq2); + auto score = _mm256_mul_ps(_mm256_mul_ps(sumqx, sumqx), _mm256_rcp_ps(sumq2)); + auto mask = _mm256_cmp_ps(score, vbest, _CMP_GT_OQ); + best_index = _mm256_or_si256(_mm256_and_si256(_mm256_castps_si256(mask), idx), + _mm256_andnot_si256(_mm256_castps_si256(mask), best_index)); + vbest = _mm256_max_ps(vbest, score); + } + _mm256_store_ps(sx, vbest); + _mm256_store_si256((__m256i *)index, best_index); + float best = sx[0]; int jbest = index[0]; + for (int j = 1; j < 8; ++j) { + if (sx[j] > best) { best = sx[j]; jbest = index[j]; } + } + auto qv = values + QuantizerIQ2KT::kBlockSize*jbest; + float sumqx = 0, sumq2 = 0; + for (int k = 0; k < QuantizerIQ2KT::kBlockSize; ++k) { + sumqx += weight[k]*qv[k]*xb[k]; + sumq2 += weight[k]*qv[k]*qv[k]; + } + scales[ib] = sumqx/sumq2; + float abs_scale = std::abs(scales[ib]); + if (abs_scale > amax_scale) { + amax_scale = abs_scale; max_scale = scales[ib]; + } + y[ibl].ql[ib] = (jbest & 255); + y[ibl].qh[ib%(QuantizerIQ2KT::kNblock/2)] |= ((jbest >> 8) << 4*(ib/(QuantizerIQ2KT::kNblock/2))); +#else +#endif + } + float d = max_scale/iq4k_values[0]; + y[ibl].d = GGML_FP32_TO_FP16(d); + float id = d ? 1/d : 0.f; + + for (int ib = 0; ib < QuantizerIQ2KT::kNblock; ++ib) { + int ls = best_index_iq4nl(iq4k_values, id*scales[ib]); + y[ibl].scales[ib%(QuantizerIQ2KT::kNblock/2)] |= (ls << 4*(ib/(QuantizerIQ2KT::kNblock/2))); + } + } + +} +} + +void quantize_row_iq2_kt_ref(const float * GGML_RESTRICT x, block_iq2_kt * GGML_RESTRICT y, int64_t k) { + assert(k % QK_K == 0); + quantize_iq2_kt(x, (void *)y, 1, k, nullptr); +} + +void quantize_row_iq2_kt(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k) { + assert(k % QK_K == 0); + block_iq2_kt * y = (block_iq2_kt *)vy; + quantize_row_iq2_kt_ref(x, y, k); +} + +size_t quantize_iq2_kt(const float * src, void * dst, int64_t nrows, int64_t n_per_row, const float * imatrix) { + GGML_ASSERT(n_per_row%QK_K == 0); + auto row_size = ggml_row_size(GGML_TYPE_IQ2_KT, n_per_row); + char * qrow = (char *)dst; + for (int64_t row = 0; row < nrows; ++row) { + quantize_row_iq2_kt_impl(src, (void *)qrow, n_per_row, imatrix); + src += n_per_row; + qrow += row_size; + } + return nrows * row_size; +} + +void dequantize_row_iq2_kt(const block_iq2_kt * x, float * y, int64_t k) { + assert(k % QuantizerIQ2KT::kSuperBlockSize == 0); + const int nb = k / QuantizerIQ2KT::kSuperBlockSize; + auto& deq = iq2kt_quantizer(); + for (int ibl = 0; ibl < nb; ++ibl) { + const float d = GGML_FP16_TO_FP32(x[ibl].d); + auto yl = y + ibl*QuantizerIQ2KT::kSuperBlockSize; + auto yh = yl + QuantizerIQ2KT::kSuperBlockSize/2; + for (int ib = 0; ib < QuantizerIQ2KT::kNblock/2; ++ib) { + uint32_t idx1 = x[ibl].ql[ib] | ((x[ibl].qh[ib] & 0xf) << 8); + uint32_t idx2 = x[ibl].ql[ib+QuantizerIQ2KT::kNblock/2] | ((x[ibl].qh[ib] >> 4) << 8); + deq.set_values(idx1, yl); + deq.set_values(idx2, yh); + float s1 = d * iq4k_values[x[ibl].scales[ib] & 0xf]; + float s2 = d * iq4k_values[x[ibl].scales[ib] >> 4]; + for (int j = 0; j < QuantizerIQ2KT::kBlockSize; ++j) yl[j] *= s1; + for (int j = 0; j < QuantizerIQ2KT::kBlockSize; ++j) yh[j] *= s2; + yl += QuantizerIQ2KT::kBlockSize; + yh += QuantizerIQ2KT::kBlockSize; + } + } +} + +void vec_dot_iq2_kt_q8_k(int n, float * s, size_t bs, const void * vx, size_t bx, const void * vy, size_t by, int nrc) { + assert(n % QK_K == 0); + assert(nrc == 1); + GGML_UNUSED(nrc); + GGML_UNUSED(bx); + GGML_UNUSED(by); + GGML_UNUSED(bs); + +#if GGML_USE_IQK_MULMAT + if (iqk_mul_mat(1, 1, n, GGML_TYPE_IQ2_KT, vx, 0, GGML_TYPE_Q8_K, vy, 0, s, 0, 0, 1)) { + return; + } +#endif + +} + diff --git a/ggml/src/iqk/iqk_quantize.h b/ggml/src/iqk/iqk_quantize.h index 50c425af..814b613d 100644 --- a/ggml/src/iqk/iqk_quantize.h +++ b/ggml/src/iqk/iqk_quantize.h @@ -61,6 +61,12 @@ size_t quantize_iq2_ks(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst void dequantize_row_iq2_ks(const block_iq2_ks * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k); void vec_dot_iq2_ks_q8_k(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc); +void quantize_row_iq2_kt_ref(const float * GGML_RESTRICT x, block_iq2_kt * GGML_RESTRICT y, int64_t k); +void quantize_row_iq2_kt(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k); +size_t quantize_iq2_kt(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix); +void dequantize_row_iq2_kt(const block_iq2_kt * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k); +void vec_dot_iq2_kt_q8_k(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc); + void iqk_quantize_row_q8_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k); #ifdef __cplusplus