diff --git a/examples/quantize/quantize.cpp b/examples/quantize/quantize.cpp index 78fef3f8..916f57ec 100644 --- a/examples/quantize/quantize.cpp +++ b/examples/quantize/quantize.cpp @@ -83,6 +83,7 @@ static const std::vector<struct quant_option> QUANT_OPTIONS = { { "Q6_K", LLAMA_FTYPE_MOSTLY_Q6_K, " 5.15G, +0.0008 ppl @ LLaMA-v1-7B", }, { "Q6_K_R4", LLAMA_FTYPE_MOSTLY_Q6_K_R4, "Q6_K repacked", }, { "Q8_K_R8", LLAMA_FTYPE_MOSTLY_Q8_K_R8, "Q8_K repacked", }, + { "Q8_KV_R8", LLAMA_FTYPE_MOSTLY_Q8_KV_R8, "Q8_KV repacked", }, { "Q8_0", LLAMA_FTYPE_MOSTLY_Q8_0, " 6.70G, +0.0004 ppl @ LLaMA-v1-7B", }, { "Q4_0_4_4", LLAMA_FTYPE_MOSTLY_Q4_0_4_4, " 4.34G, +0.4685 ppl @ Llama-3-8B", }, { "Q4_0_4_8", LLAMA_FTYPE_MOSTLY_Q4_0_4_8, " 4.34G, +0.4685 ppl @ Llama-3-8B", }, diff --git a/ggml/include/ggml.h b/ggml/include/ggml.h index ab9c34a0..d2131a15 100644 --- a/ggml/include/ggml.h +++ b/ggml/include/ggml.h @@ -443,6 +443,7 @@ extern "C" { GGML_TYPE_IQ4_K_R4 = 339, GGML_TYPE_IQ5_K_R4 = 340, GGML_TYPE_IQ4_KS_R4 = 344, + GGML_TYPE_Q8_KV_R8 = 398, GGML_TYPE_Q8_K_R8 = 399, GGML_TYPE_COUNT, }; @@ -529,6 +530,7 @@ extern "C" { GGML_FTYPE_MOSTLY_IQ4_K_R4 = 332, // except 1d tensors GGML_FTYPE_MOSTLY_IQ5_K_R4 = 333, // except 1d tensors GGML_FTYPE_MOSTLY_IQ4_KS_R4 = 337, // except 1d tensors + GGML_FTYPE_MOSTLY_Q8_KV_R8 = 398, // except 1d tensors GGML_FTYPE_MOSTLY_Q8_K_R8 = 399, // except 1d tensors }; diff --git a/ggml/src/ggml-quants.c b/ggml/src/ggml-quants.c index 52678cc0..e8218e76 100644 --- a/ggml/src/ggml-quants.c +++ b/ggml/src/ggml-quants.c @@ -15214,9 +15214,10 @@ bool ggml_validate_row_data(enum ggml_type type, const void * data, size_t nbyte case GGML_TYPE_IQ3_K_R4: break; case GGML_TYPE_IQ4_K_R4: break; case GGML_TYPE_IQ5_K_R4: break; - case GGML_TYPE_IQ4_KS_R4: break; - case GGML_TYPE_Q8_K_R8: break; - case GGML_TYPE_Q8_KV: break; + case GGML_TYPE_IQ4_KS_R4:break; + case GGML_TYPE_Q8_KV_R8: break; + case GGML_TYPE_Q8_K_R8: break; + case GGML_TYPE_Q8_KV: break; case GGML_TYPE_BF16_R16: break; 
case GGML_TYPE_Q4_0_4_4: case GGML_TYPE_Q4_0_4_8: diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c index 02d310d8..0aee8dd4 100644 --- a/ggml/src/ggml.c +++ b/ggml/src/ggml.c @@ -1374,6 +1374,18 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = { .vec_dot_type = GGML_TYPE_Q8_KV, .row_meta_size = 8, }, + [GGML_TYPE_Q8_KV_R8] = { + .type_name = "q8_KV_r8", + .blck_size = 32, + .type_size = 32, + .is_quantized = true, + .to_float = (ggml_to_float_t) dequantize_row_q8_KV_r8, + .from_float = quantize_row_q8_KV_r8, + .from_float_ref = (ggml_from_float_t)quantize_row_q8_KV_r8_ref, + .vec_dot = vec_dot_q8_KV_r8_q8_KV, + .vec_dot_type = GGML_TYPE_Q8_KV, + .row_meta_size = 4, + }, [GGML_TYPE_Q8_K16] = { .type_name = "q8_K16", .blck_size = 64, @@ -4397,6 +4409,7 @@ enum ggml_type ggml_ftype_to_ggml_type(enum ggml_ftype ftype) { case GGML_FTYPE_MOSTLY_Q6_K: wtype = GGML_TYPE_Q6_K; break; case GGML_FTYPE_MOSTLY_Q6_K_R4: wtype = GGML_TYPE_Q6_K_R4; break; case GGML_FTYPE_MOSTLY_Q8_K_R8: wtype = GGML_TYPE_Q8_K_R8; break; + case GGML_FTYPE_MOSTLY_Q8_KV_R8: wtype = GGML_TYPE_Q8_KV_R8; break; case GGML_FTYPE_MOSTLY_IQ2_XXS: wtype = GGML_TYPE_IQ2_XXS; break; case GGML_FTYPE_MOSTLY_IQ2_XXS_R4: wtype = GGML_TYPE_IQ2_XXS_R4;break; case GGML_FTYPE_MOSTLY_IQ2_XS: wtype = GGML_TYPE_IQ2_XS; break; @@ -10949,6 +10962,7 @@ static void ggml_compute_forward_add( case GGML_TYPE_Q6_K: case GGML_TYPE_Q6_K_R4: case GGML_TYPE_Q8_K_R8: + case GGML_TYPE_Q8_KV_R8: case GGML_TYPE_IQ2_XXS: case GGML_TYPE_IQ2_XXS_R4: case GGML_TYPE_IQ2_XS: @@ -11419,6 +11433,7 @@ static void ggml_compute_forward_add1( case GGML_TYPE_Q6_K: case GGML_TYPE_Q6_K_R4: case GGML_TYPE_Q8_K_R8: + case GGML_TYPE_Q8_KV_R8: case GGML_TYPE_IQ2_XXS: case GGML_TYPE_IQ2_XXS_R4: case GGML_TYPE_IQ2_XS: @@ -11586,6 +11601,7 @@ static void ggml_compute_forward_acc( case GGML_TYPE_Q6_K: case GGML_TYPE_Q6_K_R4: case GGML_TYPE_Q8_K_R8: + case GGML_TYPE_Q8_KV_R8: case GGML_TYPE_IQ2_XXS: case GGML_TYPE_IQ2_XXS_R4: case 
GGML_TYPE_IQ2_XS: @@ -14793,6 +14809,7 @@ static void ggml_compute_forward_out_prod( case GGML_TYPE_Q6_K: case GGML_TYPE_Q6_K_R4: case GGML_TYPE_Q8_K_R8: + case GGML_TYPE_Q8_KV_R8: case GGML_TYPE_IQ2_XXS: case GGML_TYPE_IQ2_XXS_R4: case GGML_TYPE_IQ2_XS: @@ -15200,6 +15217,7 @@ static void ggml_compute_forward_set( case GGML_TYPE_Q6_K: case GGML_TYPE_Q6_K_R4: case GGML_TYPE_Q8_K_R8: + case GGML_TYPE_Q8_KV_R8: case GGML_TYPE_IQ2_XXS: case GGML_TYPE_IQ2_XXS_R4: case GGML_TYPE_IQ2_XS: @@ -15502,6 +15520,7 @@ static void ggml_compute_forward_get_rows( case GGML_TYPE_Q6_K: case GGML_TYPE_Q6_K_R4: case GGML_TYPE_Q8_K_R8: + case GGML_TYPE_Q8_KV_R8: case GGML_TYPE_IQ2_XXS: case GGML_TYPE_IQ2_XXS_R4: case GGML_TYPE_IQ2_XS: @@ -16131,6 +16150,7 @@ static void ggml_compute_forward_clamp( case GGML_TYPE_Q6_K: case GGML_TYPE_Q6_K_R4: case GGML_TYPE_Q8_K_R8: + case GGML_TYPE_Q8_KV_R8: case GGML_TYPE_Q8_KR8: case GGML_TYPE_IQ2_XXS: case GGML_TYPE_IQ2_XXS_R4: @@ -22998,6 +23018,7 @@ size_t ggml_quantize_chunk( case GGML_TYPE_Q6_K: result = quantize_q6_K(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break; case GGML_TYPE_Q6_K_R4: result = quantize_q6_k_r4(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break; case GGML_TYPE_Q8_K_R8: result = quantize_q8_k_r8(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break; + case GGML_TYPE_Q8_KV_R8:result = quantize_q8_KV_r8(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break; case GGML_TYPE_IQ2_XXS: result = quantize_iq2_xxs(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break; case GGML_TYPE_IQ2_XXS_R4:result = quantize_iq2_xxs_r4(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break; case GGML_TYPE_IQ2_XS: result = quantize_iq2_xs (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break; diff --git a/ggml/src/iqk/iqk_mul_mat.cpp 
b/ggml/src/iqk/iqk_mul_mat.cpp index 5c5262ae..e040cf91 100644 --- a/ggml/src/iqk/iqk_mul_mat.cpp +++ b/ggml/src/iqk/iqk_mul_mat.cpp @@ -270,6 +270,7 @@ struct MulMat { case GGML_TYPE_Q4_K_R4: case GGML_TYPE_Q5_K_R4: case GGML_TYPE_Q8_KV: + case GGML_TYPE_Q8_KV_R8: case GGML_TYPE_Q8_K_R8: return 8; case GGML_TYPE_Q4_0_R8: case GGML_TYPE_Q8_0_R8: @@ -303,6 +304,7 @@ struct MulMat { case GGML_TYPE_Q4_0_R8: case GGML_TYPE_Q8_0_R8: case GGML_TYPE_Q8_KV: + case GGML_TYPE_Q8_KV_R8: case GGML_TYPE_Q8_K_R8: return 8; case GGML_TYPE_BF16_R16: return 16; default: return 1; @@ -9352,6 +9354,21 @@ bool MulMat::prepare(int typeA, int typeB, int ne00, MulMat& mm, int Ny) { mm.funcs[7] = mul_mat_q8_KV_q8_KV<8>; #ifdef HAVE_FANCY_SIMD mm.func16 = mul_mat_q8_KV_q8_KV<16>; +#endif + expected_typeB = GGML_TYPE_Q8_KV; + break; + case GGML_TYPE_Q8_KV_R8: + assert (ne00 % 32 == 0); + mm.funcs[0] = mul_mat_q8_KV_r8_q8_KV<1>; + mm.funcs[1] = mul_mat_q8_KV_r8_q8_KV<2>; + mm.funcs[2] = mul_mat_q8_KV_r8_q8_KV<3>; + mm.funcs[3] = mul_mat_q8_KV_r8_q8_KV<4>; + mm.funcs[4] = mul_mat_q8_KV_r8_q8_KV<5>; + mm.funcs[5] = mul_mat_q8_KV_r8_q8_KV<6>; + mm.funcs[6] = mul_mat_q8_KV_r8_q8_KV<7>; + mm.funcs[7] = mul_mat_q8_KV_r8_q8_KV<8>; +#ifdef HAVE_FANCY_SIMD + mm.func16 = mul_mat_q8_KV_r8_q8_KV<16>; #endif expected_typeB = GGML_TYPE_Q8_KV; break; @@ -14364,6 +14381,10 @@ bool MulMat::prepare(int typeA, int typeB, int ne00, MulMat& m, int /*Ny*/) { m.func16 = mul_mat_q8_KV_q8_KV<16>; expected_Btype = GGML_TYPE_Q8_KV; break; + case GGML_TYPE_Q8_KV_R8: + SET_MUL_MAT_FUNCTIONS(m, mul_mat_q8_KV_r8_q8_KV); + expected_Btype = GGML_TYPE_Q8_KV; + break; case GGML_TYPE_IQ2_K_R4: SET_MUL_MAT_FUNCTIONS(m, mul_mat_iq2_k_r4_q8_k); expected_Btype = GGML_TYPE_Q8_K; diff --git a/ggml/src/iqk/iqk_quantize.cpp b/ggml/src/iqk/iqk_quantize.cpp index 144741c0..c93961a4 100644 --- a/ggml/src/iqk/iqk_quantize.cpp +++ b/ggml/src/iqk/iqk_quantize.cpp @@ -5509,6 +5509,150 @@ void vec_dot_q8_k_r8_q8_k(int n, float * s, size_t bs, 
const void * vx, size_t b GGML_UNUSED(by); } +// +// ========================================= q8_KV_r8 +// + +void quantize_row_q8_KV_r8_ref(const float * x, void * y, int64_t k) { + quantize_q8_KV_r8(x, y, 8, k/8, nullptr); +} + +void quantize_row_q8_KV_r8(const float * x, void * y, int64_t k) { + quantize_q8_KV_r8(x, y, 8, k/8, nullptr); +} + +static void repack_q8_KV(int nrows, int n_per_row, const char * cx, char * cy, [[maybe_unused]] bool online) { + GGML_ASSERT(nrows%8 == 0); + GGML_ASSERT(n_per_row%16 == 0); + auto row_size_x = ggml_row_size(GGML_TYPE_Q8_KV, n_per_row); + auto row_size_y = ggml_row_size(GGML_TYPE_Q8_KV_R8, n_per_row); + const int8_t * x8[8]; +#ifdef __ARM_NEON + int8x16x2_t m0, m1, m2, m3; +#endif + for (int row = 0; row < nrows; row += 8) { + auto dy = (float *)cy; + auto qy = (int8_t *)(dy + 8); + for (int k = 0; k < 8; ++k) { + auto dx = (const float *)(cx + k*row_size_x); + dy[k] = dx[0]; + x8[k] = (const int8_t *)(dx + 2); + } + for (int ib = 0; ib < n_per_row/16; ++ib) { +#ifdef __AVX2__ +#define MM256_SET_M128I(a, b) _mm256_insertf128_si256(_mm256_castsi128_si256(b), (a), 1) + auto m0 = MM256_SET_M128I(_mm_loadu_si128((const __m128i *)x8[4]+ib), _mm_loadu_si128((const __m128i *)x8[0]+ib)); + auto m1 = MM256_SET_M128I(_mm_loadu_si128((const __m128i *)x8[5]+ib), _mm_loadu_si128((const __m128i *)x8[1]+ib)); + auto m2 = MM256_SET_M128I(_mm_loadu_si128((const __m128i *)x8[6]+ib), _mm_loadu_si128((const __m128i *)x8[2]+ib)); + auto m3 = MM256_SET_M128I(_mm_loadu_si128((const __m128i *)x8[7]+ib), _mm_loadu_si128((const __m128i *)x8[3]+ib)); + auto t0 = _mm256_unpacklo_epi32(m0, m1); + auto t1 = _mm256_unpacklo_epi32(m2, m3); + auto t2 = _mm256_unpackhi_epi32(m0, m1); + auto t3 = _mm256_unpackhi_epi32(m2, m3); + m0 = _mm256_unpacklo_epi64(t0, t1); + m1 = _mm256_unpackhi_epi64(t0, t1); + m2 = _mm256_unpacklo_epi64(t2, t3); + m3 = _mm256_unpackhi_epi64(t2, t3); +#ifdef HAVE_FANCY_SIMD + if (online) { + m0 = _mm256_add_epi8(m0, 
_mm256_set1_epi8(127)); + m1 = _mm256_add_epi8(m1, _mm256_set1_epi8(127)); + m2 = _mm256_add_epi8(m2, _mm256_set1_epi8(127)); + m3 = _mm256_add_epi8(m3, _mm256_set1_epi8(127)); + } +#endif + _mm256_storeu_si256((__m256i *)qy + 4*ib+0, m0); + _mm256_storeu_si256((__m256i *)qy + 4*ib+1, m1); + _mm256_storeu_si256((__m256i *)qy + 4*ib+2, m2); + _mm256_storeu_si256((__m256i *)qy + 4*ib+3, m3); +#elif defined __ARM_NEON + m0.val[0] = vld1q_s8(x8[0]+16*ib); m0.val[1] = vld1q_s8(x8[4]+16*ib); + m1.val[0] = vld1q_s8(x8[1]+16*ib); m1.val[1] = vld1q_s8(x8[5]+16*ib); + m2.val[0] = vld1q_s8(x8[2]+16*ib); m2.val[1] = vld1q_s8(x8[6]+16*ib); + m3.val[0] = vld1q_s8(x8[3]+16*ib); m3.val[1] = vld1q_s8(x8[7]+16*ib); + auto row01 = vtrnq_s32(vreinterpretq_s32_s8(m0.val[0]), vreinterpretq_s32_s8(m1.val[0])); + auto row23 = vtrnq_s32(vreinterpretq_s32_s8(m2.val[0]), vreinterpretq_s32_s8(m3.val[0])); + m0.val[0] = vreinterpretq_s8_s64(vtrn1q_s64(vreinterpretq_s64_s32(row01.val[0]), vreinterpretq_s64_s32(row23.val[0]))); + m1.val[0] = vreinterpretq_s8_s64(vtrn1q_s64(vreinterpretq_s64_s32(row01.val[1]), vreinterpretq_s64_s32(row23.val[1]))); + m2.val[0] = vreinterpretq_s8_s64(vtrn2q_s64(vreinterpretq_s64_s32(row01.val[0]), vreinterpretq_s64_s32(row23.val[0]))); + m3.val[0] = vreinterpretq_s8_s64(vtrn2q_s64(vreinterpretq_s64_s32(row01.val[1]), vreinterpretq_s64_s32(row23.val[1]))); + row01 = vtrnq_s32(vreinterpretq_s32_s8(m0.val[1]), vreinterpretq_s32_s8(m1.val[1])); + row23 = vtrnq_s32(vreinterpretq_s32_s8(m2.val[1]), vreinterpretq_s32_s8(m3.val[1])); + m0.val[1] = vreinterpretq_s8_s64(vtrn1q_s64(vreinterpretq_s64_s32(row01.val[0]), vreinterpretq_s64_s32(row23.val[0]))); + m1.val[1] = vreinterpretq_s8_s64(vtrn1q_s64(vreinterpretq_s64_s32(row01.val[1]), vreinterpretq_s64_s32(row23.val[1]))); + m2.val[1] = vreinterpretq_s8_s64(vtrn2q_s64(vreinterpretq_s64_s32(row01.val[0]), vreinterpretq_s64_s32(row23.val[0]))); + m3.val[1] = 
vreinterpretq_s8_s64(vtrn2q_s64(vreinterpretq_s64_s32(row01.val[1]), vreinterpretq_s64_s32(row23.val[1]))); + vst1q_s8_x2(qy + 0 + 128*ib, m0); + vst1q_s8_x2(qy + 32 + 128*ib, m1); + vst1q_s8_x2(qy + 64 + 128*ib, m2); + vst1q_s8_x2(qy + 96 + 128*ib, m3); +#else + // scalar fallback: interleave the 16-byte chunks of 8 rows, same layout as dequantize_row_q8_KV_r8 + for (int l = 0; l < 2; ++l) { + for (int k = 0; k < 8; ++k) for (int i = 0; i < 4; ++i) { + qy[128*ib + 32*l + 4*k + i +  0] = x8[k][16*ib + 4*l + i + 0]; + qy[128*ib + 32*l + 4*k + i + 64] = x8[k][16*ib + 4*l + i + 8]; + } + } +#endif + + } + cx += 8*row_size_x; + cy += online ? 8*row_size_x : 8*row_size_y; + //So, if we are run-time-repacking (online = true) we don't want to change the stride, so we just leave some unused space at the end of each row + } +} +#ifdef HAVE_FANCY_SIMD +static void modify_q8_KV_r8(int64_t k, char * cy) { + int8_t * q8 = (int8_t *)(cy + 8*sizeof(float)); + for (int j = 0; j < k; ++j) q8[j] += 127; +} +#endif + +size_t quantize_q8_KV_r8(const float * src, void * dst, int64_t nrows, int64_t n_per_row, [[maybe_unused]] const float * imatrix) { + GGML_ASSERT(nrows%8 == 0); + GGML_ASSERT(n_per_row%16 == 0); + char * qcur = (char *)dst; + auto row_size_0 = ggml_row_size(GGML_TYPE_Q8_KV, n_per_row); + auto row_size_1 = ggml_row_size(GGML_TYPE_Q8_KV_R8, n_per_row); + std::vector<char> qtmp(8*row_size_0); + for (int row = 0; row < nrows; row += 8) { + quantize_q8_KV(src, (void *)qtmp.data(), 8, n_per_row, imatrix); + repack_q8_KV(8, n_per_row, qtmp.data(), qcur, false); + qcur += 8*row_size_1; + src += 8*n_per_row; + } + return nrows*row_size_1; +} + +void dequantize_row_q8_KV_r8(const void * vx, float * y, int64_t k) { + auto n_per_row = k/8; + float * y8[8]; + for (int k = 0; k < 8; ++k) y8[k] = y + n_per_row*k; + auto dptr = (const float *)vx; + auto q8 = (const int8_t *)(dptr + 8); + for (int ib = 0; ib < n_per_row/16; ++ib) { + for (int k = 0; k < 8; ++k) { + for (int l = 0; l < 4; ++l) { + for (int i = 0; i < 4; ++i) y8[k][16*ib + 4*l + i] = dptr[k] * q8[128*ib + 32*l + 4*k + i]; + } + } + } +} + 
+void vec_dot_q8_KV_r8_q8_KV(int n, float * s, size_t bs, const void * vx, size_t bx, const void * vy, size_t by, int nrc) { +#if GGML_USE_IQK_MULMAT + if (iqk_mul_mat(1, 1, n, GGML_TYPE_Q8_KV_R8, vx, 0, GGML_TYPE_Q8_KV, vy, 0, s, 0, 0, 1)) { + return; + } +#endif + GGML_ASSERT(n%QK4_NL == 0); + GGML_ASSERT(nrc == 1); + GGML_UNUSED(bs); + GGML_UNUSED(bx); + GGML_UNUSED(by); +} + // // ========================================= bf16_r4 // @@ -6610,8 +6754,9 @@ bool iqk_modify_tensor(struct ggml_tensor * tensor) { { GGML_TYPE_Q4_0_R8, {modify_q4_0_r8, 8} }, #endif #ifdef HAVE_FANCY_SIMD - { GGML_TYPE_Q8_0_R8, {modify_q8_0_r8, 8} }, - { GGML_TYPE_Q8_K_R8, {modify_q8_k_r8, 8} }, + { GGML_TYPE_Q8_0_R8, {modify_q8_0_r8, 8} }, + { GGML_TYPE_Q8_K_R8, {modify_q8_k_r8, 8} }, + { GGML_TYPE_Q8_KV_R8, {modify_q8_KV_r8, 8} }, #endif }; auto it = k_mod_map.find(tensor->type); @@ -6670,6 +6815,7 @@ void iqk_repack_tensor(struct ggml_tensor * tensor) { { GGML_TYPE_Q6_0, { GGML_TYPE_Q6_0_R4, 4, (Repack::repack_func)repack_q6_0} }, { GGML_TYPE_Q8_0, { GGML_TYPE_Q8_0_R8, 8, (Repack::repack_func)repack_q8_0} }, { GGML_TYPE_Q8_K, { GGML_TYPE_Q8_K_R8, 8, (Repack::repack_func)repack_q8_k} }, + { GGML_TYPE_Q8_KV, { GGML_TYPE_Q8_KV_R8, 8, (Repack::repack_func)repack_q8_KV} }, #ifdef __AVX512BF16__ { GGML_TYPE_BF16, { GGML_TYPE_BF16_R16, 16, (Repack::repack_func)repack_bf16}}, { GGML_TYPE_F16, { GGML_TYPE_BF16_R16, 16, (Repack::repack_func)repack_bf16} }, diff --git a/ggml/src/iqk/iqk_quantize.h b/ggml/src/iqk/iqk_quantize.h index 91c7d3a3..76fbac3b 100644 --- a/ggml/src/iqk/iqk_quantize.h +++ b/ggml/src/iqk/iqk_quantize.h @@ -223,6 +223,12 @@ size_t quantize_q8_KV(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, void dequantize_row_q8_KV(const void * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k); void vec_dot_q8_KV_q8_KV(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc); +void 
quantize_row_q8_KV_r8_ref(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k); +void quantize_row_q8_KV_r8(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k); +size_t quantize_q8_KV_r8(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix); +void dequantize_row_q8_KV_r8(const void * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k); +void vec_dot_q8_KV_r8_q8_KV(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc); + void iqk_quantize_row_q8_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k); void quantize_row_q8_K64_ref(const float * GGML_RESTRICT x, block_q8_K64 * GGML_RESTRICT y, int64_t k); void quantize_row_q8_K64(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k); diff --git a/include/llama.h b/include/llama.h index bc759c83..b5ad65e7 100644 --- a/include/llama.h +++ b/include/llama.h @@ -207,6 +207,7 @@ extern "C" { LLAMA_FTYPE_MOSTLY_IQ4_K_R4 = 340, // except 1d tensors LLAMA_FTYPE_MOSTLY_IQ5_K_R4 = 341, // except 1d tensors LLAMA_FTYPE_MOSTLY_IQ4_KS_R4 = 345, // except 1d tensors + LLAMA_FTYPE_MOSTLY_Q8_KV_R8 = 398, // except 1d tensors LLAMA_FTYPE_MOSTLY_Q8_K_R8 = 399, // except 1d tensors LLAMA_FTYPE_GUESSED = 1024, // not specified in the model file diff --git a/src/llama.cpp b/src/llama.cpp index ec5d0fb1..0257a0a3 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -4021,6 +4021,7 @@ struct llama_model_loader { case GGML_TYPE_Q6_K: ftype = LLAMA_FTYPE_MOSTLY_Q6_K; break; case GGML_TYPE_Q6_K_R4: ftype = LLAMA_FTYPE_MOSTLY_Q6_K_R4; break; case GGML_TYPE_Q8_K_R8: ftype = LLAMA_FTYPE_MOSTLY_Q8_K_R8; break; + case GGML_TYPE_Q8_KV_R8: ftype = LLAMA_FTYPE_MOSTLY_Q8_KV_R8; break; case GGML_TYPE_IQ2_XXS: ftype = LLAMA_FTYPE_MOSTLY_IQ2_XXS; break; case GGML_TYPE_IQ2_XXS_R4:ftype = LLAMA_FTYPE_MOSTLY_IQ2_XXS_R4; break; case GGML_TYPE_IQ2_XS: ftype = 
LLAMA_FTYPE_MOSTLY_IQ2_XS; break; @@ -4756,6 +4757,7 @@ static std::string llama_model_ftype_name(llama_ftype ftype) { case LLAMA_FTYPE_MOSTLY_Q6_K: return "Q6_K"; case LLAMA_FTYPE_MOSTLY_Q6_K_R4: return "Q6_K_R4"; case LLAMA_FTYPE_MOSTLY_Q8_K_R8: return "Q8_K_R8"; + case LLAMA_FTYPE_MOSTLY_Q8_KV_R8: return "Q8_KV_R8"; case LLAMA_FTYPE_MOSTLY_IQ2_XXS: return "IQ2_XXS - 2.0625 bpw"; case LLAMA_FTYPE_MOSTLY_IQ2_XXS_R4:return "IQ2_XXS_R4 - 2.0625 bpw"; case LLAMA_FTYPE_MOSTLY_IQ2_XS: return "IQ2_XS - 2.3125 bpw"; @@ -16184,7 +16186,7 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n new_type = GGML_TYPE_IQ5_K; } else if (new_type != GGML_TYPE_Q8_0 && new_type != GGML_TYPE_Q8_0_R8 && new_type != GGML_TYPE_IQ6_K && new_type != GGML_TYPE_Q6_K_R4 && - new_type != GGML_TYPE_Q8_K_R8 && new_type != GGML_TYPE_Q8_KV) { + new_type != GGML_TYPE_Q8_K_R8 && new_type != GGML_TYPE_Q8_KV && new_type != GGML_TYPE_Q8_KV_R8) { new_type = GGML_TYPE_Q6_K; } } @@ -16238,6 +16240,9 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n else if (new_type == GGML_TYPE_Q8_K_R8) { new_type = GGML_TYPE_Q8_0; } + else if (new_type == GGML_TYPE_Q8_KV_R8) { + new_type = GGML_TYPE_Q8_0; + } else if (new_type == GGML_TYPE_IQ2_K_R4) { new_type = GGML_TYPE_IQ2_K; } @@ -16772,6 +16777,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s case LLAMA_FTYPE_MOSTLY_Q6_K: default_type = GGML_TYPE_Q6_K; break; case LLAMA_FTYPE_MOSTLY_Q6_K_R4: default_type = GGML_TYPE_Q6_K_R4; break; case LLAMA_FTYPE_MOSTLY_Q8_K_R8: default_type = GGML_TYPE_Q8_K_R8; break; + case LLAMA_FTYPE_MOSTLY_Q8_KV_R8: default_type = GGML_TYPE_Q8_KV_R8; break; case LLAMA_FTYPE_MOSTLY_IQ2_XXS: default_type = GGML_TYPE_IQ2_XXS; break; case LLAMA_FTYPE_MOSTLY_IQ2_XXS_R4:default_type = GGML_TYPE_IQ2_XXS_R4; break; case LLAMA_FTYPE_MOSTLY_IQ2_XS: default_type = GGML_TYPE_IQ2_XS; break; @@ -17215,6 +17221,10 @@ static void 
llama_model_quantize_internal(const std::string & fname_inp, const s if (tensor->ne[1] % 8 != 0) new_type = GGML_TYPE_Q8_0; else chunk_size_multiplier = 8; } + else if (new_type == GGML_TYPE_Q8_KV_R8) { + if (tensor->ne[1] % 8 != 0) new_type = GGML_TYPE_Q8_0; + else chunk_size_multiplier = 8; + } else if (new_type == GGML_TYPE_IQ2_BN_R4) { if (tensor->ne[1] % 4 != 0) new_type = GGML_TYPE_IQ2_BN; else chunk_size_multiplier = 4;