diff --git a/examples/quantize/quantize.cpp b/examples/quantize/quantize.cpp
index 7ceee208..78fef3f8 100644
--- a/examples/quantize/quantize.cpp
+++ b/examples/quantize/quantize.cpp
@@ -56,6 +56,7 @@ static const std::vector<struct quant_option> QUANT_OPTIONS = {
     { "Q5_0_R4", LLAMA_FTYPE_MOSTLY_Q5_0_R4, " 5.50 bpw quantization", },
     { "Q6_0_R4", LLAMA_FTYPE_MOSTLY_Q6_0_R4, " 6.50 bpw quantization", },
     { "Q8_0_R8", LLAMA_FTYPE_MOSTLY_Q8_0_R8, " 8.50 bpw quantization", },
+    { "Q8_KV",   LLAMA_FTYPE_MOSTLY_Q8_KV,   " 8.00 bpw quantization", },
     { "IQ4_XS",  LLAMA_FTYPE_MOSTLY_IQ4_XS,  " 4.25 bpw non-linear quantization", },
     { "IQ4_KS",  LLAMA_FTYPE_MOSTLY_IQ4_KS,  " 4.25 bpw non-linear quantization", },
     { "IQ4_KS_R4",LLAMA_FTYPE_MOSTLY_IQ4_KS_R4,"IQ4_KS repacked", },
diff --git a/ggml/include/ggml.h b/ggml/include/ggml.h
index 66bcb25a..ab9c34a0 100644
--- a/ggml/include/ggml.h
+++ b/ggml/include/ggml.h
@@ -416,6 +416,7 @@ extern "C" {
         GGML_TYPE_Q8_K32  = 148,
         GGML_TYPE_Q8_KR8  = 149,
         GGML_TYPE_Q8_K128 = 150,
+        GGML_TYPE_Q8_KV   = 151,
 
         GGML_TYPE_Q4_0_R8 = 202,
         GGML_TYPE_Q5_0_R4 = 206,
@@ -501,6 +502,7 @@ extern "C" {
         GGML_FTYPE_MOSTLY_IQ4_KS  = 137, // except 1d tensors
         GGML_FTYPE_MOSTLY_IQ2_KS  = 138, // except 1d tensors
         GGML_FTYPE_MOSTLY_IQ4_KSS = 139, // except 1d tensors
+        GGML_FTYPE_MOSTLY_Q8_KV   = 140, // except 1d tensors
         //
         GGML_FTYPE_MOSTLY_Q4_0_R8 = 202, // except 1d tensors
         GGML_FTYPE_MOSTLY_Q8_0_R8 = 207, // except 1d tensors
diff --git a/ggml/src/ggml-quants.c b/ggml/src/ggml-quants.c
index fe7de167..52678cc0 100644
--- a/ggml/src/ggml-quants.c
+++ b/ggml/src/ggml-quants.c
@@ -15216,6 +15216,7 @@ bool ggml_validate_row_data(enum ggml_type type, const void * data, size_t nbyte
         case GGML_TYPE_IQ5_K_R4:  break;
         case GGML_TYPE_IQ4_KS_R4: break;
         case GGML_TYPE_Q8_K_R8:   break;
+        case GGML_TYPE_Q8_KV:     break;
         case GGML_TYPE_BF16_R16:  break;
         case GGML_TYPE_Q4_0_4_4:
         case GGML_TYPE_Q4_0_4_8:
diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c
index 8ab6b0a9..c5d42e57 100644
--- a/ggml/src/ggml.c
+++ b/ggml/src/ggml.c
@@ -1362,6 +1362,18 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
         .from_float = quantize_row_q8_K128,
         .row_meta_size = 0,
     },
+    [GGML_TYPE_Q8_KV] = {
+        .type_name = "q8_KV",
+        .blck_size = 1,
+        .type_size = 1,
+        .is_quantized = true,
+        .to_float = (ggml_to_float_t) dequantize_row_q8_KV,
+        .from_float = quantize_row_q8_KV,
+        .from_float_ref = (ggml_from_float_t)quantize_row_q8_KV_ref,
+        .vec_dot = vec_dot_q8_KV_q8_KV,
+        .vec_dot_type = GGML_TYPE_Q8_KV,
+        .row_meta_size = 8,
+    },
     [GGML_TYPE_Q8_K16] = {
         .type_name = "q8_K16",
         .blck_size = 64,
@@ -4373,6 +4385,7 @@ enum ggml_type ggml_ftype_to_ggml_type(enum ggml_ftype ftype) {
        case GGML_FTYPE_MOSTLY_Q5_1:  wtype = GGML_TYPE_Q5_1;  break;
         case GGML_FTYPE_MOSTLY_Q6_0:  wtype = GGML_TYPE_Q6_0;  break;
         case GGML_FTYPE_MOSTLY_Q8_0:  wtype = GGML_TYPE_Q8_0;  break;
+        case GGML_FTYPE_MOSTLY_Q8_KV: wtype = GGML_TYPE_Q8_KV; break;
         case GGML_FTYPE_MOSTLY_Q2_K:  wtype = GGML_TYPE_Q2_K;  break;
         case GGML_FTYPE_MOSTLY_Q2_K_R4: wtype = GGML_TYPE_Q2_K_R4; break;
         case GGML_FTYPE_MOSTLY_Q3_K:  wtype = GGML_TYPE_Q3_K;  break;
@@ -14061,7 +14074,7 @@ static void ggml_compute_forward_mul_mat(
     // nb01 >= nb00 - src0 is not transposed
     //   compute by src0 rows
 
-#if GGML_USE_IQK_MULMAT || GGML_USE_LLAMAFILE
+#if GGML_USE_LLAMAFILE
     // broadcast factors
     const int64_t r2 = ne12 / ne02;
     const int64_t r3 = ne13 / ne03;
@@ -14768,6 +14781,7 @@ static void ggml_compute_forward_out_prod(
         case GGML_TYPE_Q5_1:
         case GGML_TYPE_Q6_0:
         case GGML_TYPE_Q8_0:
+        case GGML_TYPE_Q8_KV:
         case GGML_TYPE_Q2_K:
         case GGML_TYPE_Q2_K_R4:
         case GGML_TYPE_Q3_K:
@@ -15473,6 +15487,7 @@ static void ggml_compute_forward_get_rows(
         case GGML_TYPE_Q5_1:
         case GGML_TYPE_Q6_0:
         case GGML_TYPE_Q8_0:
+        case GGML_TYPE_Q8_KV:
         case GGML_TYPE_Q8_1:
         case GGML_TYPE_Q8_0_X4:
         case GGML_TYPE_Q8_1_X4:
@@ -16159,6 +16174,7 @@ static void ggml_compute_forward_clamp(
         case GGML_TYPE_Q8_K:
         case GGML_TYPE_Q8_K64:
         case GGML_TYPE_Q8_K128:
+        case GGML_TYPE_Q8_KV:
         case GGML_TYPE_Q8_K16:
         case GGML_TYPE_Q8_K32:
         case GGML_TYPE_Q4_0_4_4:
@@ -22970,6 +22986,7 @@ size_t ggml_quantize_chunk(
         case GGML_TYPE_Q5_1:   result = quantize_q5_1(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
         case GGML_TYPE_Q6_0:   result = quantize_q6_0(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
         case GGML_TYPE_Q8_0:   result = quantize_q8_0(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
+        case GGML_TYPE_Q8_KV:  result = quantize_q8_KV(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
         case GGML_TYPE_Q2_K:   result = quantize_q2_K(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
         case GGML_TYPE_Q2_K_R4: result = quantize_q2_k_r4(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
         case GGML_TYPE_Q3_K:   result = quantize_q3_K(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
diff --git a/ggml/src/iqk/iqk_mul_mat.cpp b/ggml/src/iqk/iqk_mul_mat.cpp
index 8d6b45da..97e2a5b4 100644
--- a/ggml/src/iqk/iqk_mul_mat.cpp
+++ b/ggml/src/iqk/iqk_mul_mat.cpp
@@ -269,6 +269,7 @@ struct MulMat {
             case GGML_TYPE_IQ4_XS_R8:
             case GGML_TYPE_Q4_K_R4:
             case GGML_TYPE_Q5_K_R4:
+            case GGML_TYPE_Q8_KV:
             case GGML_TYPE_Q8_K_R8: return 8;
             case GGML_TYPE_Q4_0_R8:
             case GGML_TYPE_Q8_0_R8:
@@ -301,6 +302,7 @@ struct MulMat {
             case GGML_TYPE_IQ4_XS_R8:
             case GGML_TYPE_Q4_0_R8:
             case GGML_TYPE_Q8_0_R8:
+            case GGML_TYPE_Q8_KV:
             case GGML_TYPE_Q8_K_R8: return 8;
             case GGML_TYPE_BF16_R16: return 16;
             default: return 1;
@@ -6107,7 +6109,7 @@ static void mul_mat_q6_k_r4_q8_k(int n, const void * vx, size_t bx, const DataIn
 // The HAVE_FANCY_SIMD should only be #if defined(__AVX512_VNNI__ && defined(__AVX512VL__)
 template <int nrc_y>
 static void mul_mat_q8_k_r8_q8_k(int n, const void * vx, size_t bx, const DataInfo& info, int nrc_x) {
-    GGML_ASSERT(nrc_x%4 == 0);
+    GGML_ASSERT(nrc_x%8 == 0);
     Q8<nrc_y, block_q8_K> q8(info);
 #ifndef HAVE_FANCY_SIMD
     auto m1 = _mm256_set1_epi16(1);
@@ -6169,6 +6171,44 @@ static void mul_mat_q8_k_r8_q8_k(int n, const void * vx, size_t bx, const DataIn
     }
 }
 
+template <int nrc_y>
+static void mul_mat_q8_KV_q8_KV(int n, const void * vx, size_t bx, const DataInfo& info, int nrc_x) {
+    GGML_ASSERT(nrc_x%8 == 0);
+    GGML_ASSERT(n%128 == 0);
+    __m256i qx[4];
+    __m256i sx[4];
+    __m256i acc[nrc_y] = {};
+    float dy[nrc_y];
+    const int8_t * q8y[nrc_y];
+    for (int iy = 0; iy < nrc_y; ++iy) {
+        auto dptr = (const float *)info.src1_row(iy);
+        dy[iy] = dptr[0];
+        q8y[iy] = (const int8_t *)(dptr + 2);
+    }
+    for (int ix = 0; ix < nrc_x; ++ix) {
+        auto dx = (const float *)((const char *)vx + ix*bx);
+        auto q8x = (const int8_t *)(dx + 2);
+        for (int i = 0; i < n/128; ++i) {
+            // _mm256_dpbusd_epi32 requires an unsigned left operand, so feed it |x|
+            // and move the sign of x onto y: |x| * sign(x)*y == x*y
+            for (int j = 0; j < 4; ++j) {
+                qx[j] = _mm256_loadu_si256((const __m256i *)q8x + 4*i + j);
+                sx[j] = _mm256_sign_epi8(qx[j], qx[j]);
+            }
+            for (int iy = 0; iy < nrc_y; ++iy) {
+                for (int j = 0; j < 4; ++j) {
+                    acc[iy] = _mm256_dpbusd_epi32(acc[iy], sx[j], _mm256_sign_epi8(_mm256_loadu_si256((const __m256i *)q8y[iy] + 4*i + j), qx[j]));
+                }
+            }
+        }
+        for (int iy = 0; iy < nrc_y; ++iy) {
+            auto sumi = hsum_i32_8(acc[iy]);
+            info.store(ix, iy, dx[0]*dy[iy]*sumi);
+            acc[iy] = _mm256_setzero_si256();
+        }
+    }
+}
+
 #ifdef __AVX512BF16__
 template <int nrc_y>
 static void mul_mat_bf16_r16_bf16(int n, const void * vx, size_t bx, const DataInfo& info, int nrc_x) {
@@ -9114,6 +9154,21 @@ bool MulMat::prepare(int typeA, int typeB, int ne00, MulMat& mm, int Ny) {
 #endif
             expected_typeB = GGML_TYPE_Q8_KR8;
             break;
+        case GGML_TYPE_Q8_KV:
+            assert (ne00 % 32 == 0);
+            mm.funcs[0] = mul_mat_q8_KV_q8_KV<1>;
+            mm.funcs[1] = mul_mat_q8_KV_q8_KV<2>;
+            mm.funcs[2] = mul_mat_q8_KV_q8_KV<3>;
+            mm.funcs[3] = mul_mat_q8_KV_q8_KV<4>;
+            mm.funcs[4] = mul_mat_q8_KV_q8_KV<5>;
+            mm.funcs[5] = mul_mat_q8_KV_q8_KV<6>;
+            mm.funcs[6] = mul_mat_q8_KV_q8_KV<7>;
+            mm.funcs[7] = mul_mat_q8_KV_q8_KV<8>;
+#ifdef HAVE_FANCY_SIMD
+            mm.func16 = mul_mat_q8_KV_q8_KV<16>;
+#endif
+            expected_typeB = GGML_TYPE_Q8_KV;
+            break;
         case GGML_TYPE_IQ4_K_R4:
             assert (ne00 % QK_K == 0);
             mm.funcs[0] = mul_mat_iq4_k_r4_q8_k<1>;
diff --git a/ggml/src/iqk/iqk_quantize.cpp b/ggml/src/iqk/iqk_quantize.cpp
index 24b49d89..e303d2af 100644
--- a/ggml/src/iqk/iqk_quantize.cpp
+++ b/ggml/src/iqk/iqk_quantize.cpp
@@ -2967,6 +2967,100 @@ void iqk_quantize_row_q8_K128(const float * x, void * vy, int64_t k) {
     }
 #endif
 }
+// TODO: merge this with the above template
+void iqk_quantize_row_q8_KV(const float * x, void * vy, int64_t k) {
+    assert(k % kBlockSize == 0);
+    // Row layout: two floats (the scale d and d times the sum of the quants), then k int8 quants
+    auto dptr = (float *)vy;
+    auto q8 = (int8_t *)(dptr + 2);
+#ifdef __AVX2__
+    const __m256 signBit = _mm256_set1_ps(-0.0f);
+    const __m256i perm = _mm256_setr_epi32(0, 4, 1, 5, 2, 6, 3, 7);
+    __m256 maxAbs = _mm256_setzero_ps();
+    for (int ib = 0; ib < k/8; ++ib) {
+        const __m256 v = _mm256_loadu_ps(x + 8*ib);
+        maxAbs = _mm256_max_ps( maxAbs, _mm256_andnot_ps(signBit, v));
+    }
+    const float maxScalar = hmax_f32_8(maxAbs);
+    if (!maxScalar) {
+        dptr[0] = dptr[1] = 0;
+        std::memset(q8, 0, k*sizeof(int8_t));
+        return;
+    }
+    dptr[0] = maxScalar / 127.f;
+    auto mul = _mm256_set1_ps(1/dptr[0]);
+    auto isum = _mm256_setzero_si256();
+    for (int i = 0; i < k/32; i++) {
+        __m256 v0 = _mm256_mul_ps(mul, _mm256_loadu_ps(x + 32*i +  0));
+        __m256 v1 = _mm256_mul_ps(mul, _mm256_loadu_ps(x + 32*i +  8));
+        __m256 v2 = _mm256_mul_ps(mul, _mm256_loadu_ps(x + 32*i + 16));
+        __m256 v3 = _mm256_mul_ps(mul, _mm256_loadu_ps(x + 32*i + 24));
+        v0 = _mm256_round_ps(v0, _MM_ROUND_NEAREST);
+        v1 = _mm256_round_ps(v1, _MM_ROUND_NEAREST);
+        v2 = _mm256_round_ps(v2, _MM_ROUND_NEAREST);
+        v3 = _mm256_round_ps(v3, _MM_ROUND_NEAREST);
+        __m256i i0 = _mm256_cvtps_epi32(v0);
+        __m256i i1 = _mm256_cvtps_epi32(v1);
+        __m256i i2 = _mm256_cvtps_epi32(v2);
+        __m256i i3 = _mm256_cvtps_epi32(v3);
+        isum = _mm256_add_epi32(isum, _mm256_add_epi32(_mm256_add_epi32(i0, i1), _mm256_add_epi32(i2, i3)));
+        i0 = _mm256_packs_epi32( i0, i1 );
+        i2 = _mm256_packs_epi32( i2, i3 );
+        i0 = _mm256_packs_epi16( i0, i2 );
+        i0 = _mm256_permutevar8x32_epi32( i0, perm );
+        _mm256_storeu_si256((__m256i *)q8, i0);
+        q8 += 32;
+    }
+    dptr[1] = dptr[0] * hsum_i32_8(isum);
+#elif defined __ARM_NEON
+    int32x4_t ival[8];
+    auto vmax = vdupq_n_f32(0.f);
+    for (int j = 0; j < k; j += 4) {
+        vmax = vmaxq_f32(vmax, vabsq_f32(vld1q_f32(x + j)));
+    }
+    auto smax = vmaxvq_f32(vmax);
+    if (!smax) {
+        dptr[0] = dptr[1] = 0;
+        std::memset(q8, 0, k*sizeof(int8_t));
+        return;
+    }
+    dptr[0] = smax/127;
+    auto vid = vdupq_n_f32(127/smax);
+    auto isum = vdupq_n_s32(0);
+    for (int ib = 0; ib < k/32; ++ib) {
+        for (int l = 0; l < 8; ++l) {
+            auto val = vld1q_f32(x + 32*ib + 4*l);
+            ival[l] = vcvtnq_s32_f32(vmulq_f32(val, vid));
+            isum = vaddq_s32(isum, ival[l]);
+        }
+        for (int l = 0; l < 4; ++l) {
+            auto i16 = vcombine_s16(vmovn_s32(ival[2*l+0]), vmovn_s32(ival[2*l+1]));
+            vst1_s8(q8, vmovn_s16(i16));
+            q8 += 8;
+        }
+    }
+    dptr[1] = dptr[0] * vaddvq_s32(isum);
+#else
+    float amax = 0;
+    for (int j = 0; j < k; ++j) {
+        float ax = std::abs(x[j]);
+        amax = std::max(amax, ax);
+    }
+    if (!amax) {
+        dptr[0] = dptr[1] = 0;
+        std::memset(q8, 0, k*sizeof(int8_t));
+        return;
+    }
+    dptr[0] = amax/127;
+    float id = 1/dptr[0];
+    int isum = 0;
+    for (int i = 0; i < k; i++) {
+        q8[i] = nearest_int(id*x[i]);
+        isum += q8[i];
+    }
+    dptr[1] = dptr[0]*isum;
+#endif
+}
 }
 
 void quantize_row_q8_K128(const float * x, void * vy, int64_t k) {
@@ -6450,6 +6544,46 @@ void vec_dot_iq1_m_r4_q8_k(int n, float * s, size_t bs, const void * vx, size_t
     GGML_UNUSED(by);
 }
 
+void quantize_row_q8_KV(const float * x, void * vy, int64_t k) {
+    iqk_quantize_row_q8_KV(x, vy, k);
+}
+
+void quantize_row_q8_KV_ref(const float * x, void * y, int64_t k) {
+    quantize_row_q8_KV(x, y, k);
+}
+
+size_t quantize_q8_KV(const float * src, void * dst, int64_t nrows, int64_t n_per_row, const float * imatrix) {
+    (void)imatrix;
+    auto row_size = ggml_row_size(GGML_TYPE_Q8_KV, n_per_row);
+    auto q = (char *)dst;
+    for (int row = 0; row < nrows; ++row) {
+        quantize_row_q8_KV(src, q, n_per_row);
+        src += n_per_row;
+        q += row_size;
+    }
+    return row_size*nrows;
+}
+
+void dequantize_row_q8_KV(const void * x, float * y, int64_t k) {
+    auto dptr = (const float *)x;
+    float d = dptr[0];
+    auto q8 = (const int8_t *)(dptr + 2);
+    for (int j = 0; j < k; ++j) y[j] = d * q8[j];
+}
+
+void vec_dot_q8_KV_q8_KV(int n, float * s, size_t bs, const void * vx, size_t bx, const void * vy, size_t by, int nrc) {
+#if GGML_USE_IQK_MULMAT
+    if (iqk_mul_mat(1, 1, n, GGML_TYPE_Q8_KV, vx, 0, GGML_TYPE_Q8_KV, vy, 0, s, 0, 0, 1)) {
+        return;
+    }
+#endif
+    GGML_ASSERT(n%QK4_NL == 0);
+    GGML_ASSERT(nrc == 1);
+    GGML_UNUSED(bs);
+    GGML_UNUSED(bx);
+    GGML_UNUSED(by);
+}
+
 //================================================
 
 namespace {
diff --git a/ggml/src/iqk/iqk_quantize.h b/ggml/src/iqk/iqk_quantize.h
index 97719361..91c7d3a3 100644
--- a/ggml/src/iqk/iqk_quantize.h
+++ b/ggml/src/iqk/iqk_quantize.h
@@ -217,6 +217,12 @@ size_t quantize_q8_k_r8(const float * GGML_RESTRICT src, void * GGML_RESTRICT ds
 void dequantize_row_q8_k_r8(const block_q8_k_r8 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
 void vec_dot_q8_k_r8_q8_k(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
 
+void quantize_row_q8_KV_ref(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
+void quantize_row_q8_KV(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
+size_t quantize_q8_KV(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
+void dequantize_row_q8_KV(const void * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
+void vec_dot_q8_KV_q8_KV(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
+
 void iqk_quantize_row_q8_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k);
 void quantize_row_q8_K64_ref(const float * GGML_RESTRICT x, block_q8_K64 * GGML_RESTRICT y, int64_t k);
 void quantize_row_q8_K64(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
diff --git a/include/llama.h b/include/llama.h
index 39251d35..bc759c83 100644
--- a/include/llama.h
+++ b/include/llama.h
@@ -180,6 +180,7 @@ extern "C" {
         LLAMA_FTYPE_MOSTLY_IQ3_KL  = 146, // except 1d tensors
         LLAMA_FTYPE_MOSTLY_IQ2_KS  = 147, // except 1d tensors
         LLAMA_FTYPE_MOSTLY_IQ4_KSS = 148, // except 1d tensors
+        LLAMA_FTYPE_MOSTLY_Q8_KV   = 149, // except 1d tensors
         //
         LLAMA_FTYPE_MOSTLY_Q4_0_R8 = 202, // except 1d tensors
         LLAMA_FTYPE_MOSTLY_Q8_0_R8 = 207, // except 1d tensors
diff --git a/src/llama.cpp b/src/llama.cpp
index 8c4a966d..298c51eb 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -4002,6 +4002,7 @@ struct llama_model_loader {
             case GGML_TYPE_Q5_1:  ftype = LLAMA_FTYPE_MOSTLY_Q5_1;  break;
             case GGML_TYPE_Q6_0:  ftype = LLAMA_FTYPE_MOSTLY_Q6_0;  break;
             case GGML_TYPE_Q8_0:  ftype = LLAMA_FTYPE_MOSTLY_Q8_0;  break;
+            case GGML_TYPE_Q8_KV: ftype = LLAMA_FTYPE_MOSTLY_Q8_KV; break;
             case GGML_TYPE_Q2_K:  ftype = LLAMA_FTYPE_MOSTLY_Q2_K;  break;
             case GGML_TYPE_Q3_K:  ftype = LLAMA_FTYPE_MOSTLY_Q3_K_M; break;
             case GGML_TYPE_Q3_K_R4: ftype = LLAMA_FTYPE_MOSTLY_Q3_K_R4; break;
@@ -4730,6 +4731,7 @@ static std::string llama_model_ftype_name(llama_ftype ftype) {
         case LLAMA_FTYPE_MOSTLY_Q5_1:  return "Q5_1";
         case LLAMA_FTYPE_MOSTLY_Q6_0:  return "Q6_0";
         case LLAMA_FTYPE_MOSTLY_Q8_0:  return "Q8_0";
+        case LLAMA_FTYPE_MOSTLY_Q8_KV: return "Q8_KV";
         case LLAMA_FTYPE_MOSTLY_Q2_K:  return "Q2_K - Medium";
         case LLAMA_FTYPE_MOSTLY_Q2_K_R4: return "Q2_K_R4";
         case LLAMA_FTYPE_MOSTLY_Q2_K_S:  return "Q2_K - Small";
@@ -16164,7 +16166,7 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
                 new_type = GGML_TYPE_IQ5_K;
             }
             else if (new_type != GGML_TYPE_Q8_0 && new_type != GGML_TYPE_Q8_0_R8 && new_type != GGML_TYPE_IQ6_K && new_type != GGML_TYPE_Q6_K_R4 &&
-                     new_type != GGML_TYPE_Q8_K_R8) {
+                     new_type != GGML_TYPE_Q8_K_R8 && new_type != GGML_TYPE_Q8_KV) {
                 new_type = GGML_TYPE_Q6_K;
             }
         }
@@ -16728,6 +16730,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
         case LLAMA_FTYPE_MOSTLY_Q5_1:  default_type = GGML_TYPE_Q5_1;  break;
         case LLAMA_FTYPE_MOSTLY_Q6_0:  default_type = GGML_TYPE_Q6_0;  break;
         case LLAMA_FTYPE_MOSTLY_Q8_0:  default_type = GGML_TYPE_Q8_0;  break;
+        case LLAMA_FTYPE_MOSTLY_Q8_KV: default_type = GGML_TYPE_Q8_KV; break;
        case LLAMA_FTYPE_MOSTLY_F16:   default_type = GGML_TYPE_F16;   break;
         case LLAMA_FTYPE_MOSTLY_BF16:  default_type = GGML_TYPE_BF16;  break;
         case LLAMA_FTYPE_MOSTLY_BF16_R16: default_type = GGML_TYPE_BF16_R16; break;
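
Reviewer note, not part of the patch: unlike the existing block-quantized 8-bit types, q8_KV carries per-row rather than per-block metadata (hence row_meta_size = 8). Each row appears to be two floats, the scale d = max|x|/127 and d times the sum of the quants, followed by n int8 values. The sketch below is a minimal scalar model of that layout, with hypothetical names (q8_kv_quantize_row, q8_kv_dot) chosen for illustration only; it shows why the kernel's dot product reduces to d_x * d_y * sum(qx[j]*qy[j]).

#include <algorithm>
#include <cmath>
#include <cstdint>
#include <cstdio>
#include <cstring>
#include <vector>

// One q8_KV row: [ float d | float d*row_sum | int8_t q[n] ] = 8 meta bytes + n quants.
static void q8_kv_quantize_row(const float * x, uint8_t * row, int n) {
    float amax = 0;
    for (int j = 0; j < n; ++j) amax = std::max(amax, std::fabs(x[j]));
    int8_t * q = (int8_t *)(row + 8);
    float meta[2] = {0, 0};
    if (amax > 0) {
        const float d = amax/127, id = 1/d;
        int sum = 0;
        for (int j = 0; j < n; ++j) { q[j] = (int8_t)std::nearbyint(id*x[j]); sum += q[j]; }
        meta[0] = d;      // scale
        meta[1] = d*sum;  // d times the row sum of the quants
    } else {
        std::memset(q, 0, n);
    }
    std::memcpy(row, meta, sizeof(meta));
}

// Dot product of two quantized rows: the per-row scales factor out, so a kernel
// only needs the integer accumulation (done with _mm256_dpbusd_epi32 in the patch).
static float q8_kv_dot(const uint8_t * xrow, const uint8_t * yrow, int n) {
    float dx, dy;
    std::memcpy(&dx, xrow, sizeof(float));
    std::memcpy(&dy, yrow, sizeof(float));
    const int8_t * qx = (const int8_t *)(xrow + 8);
    const int8_t * qy = (const int8_t *)(yrow + 8);
    int64_t sumi = 0;
    for (int j = 0; j < n; ++j) sumi += int(qx[j])*int(qy[j]);
    return dx*dy*(float)sumi;
}

int main() {
    const int n = 128;  // the AVX2 kernel asserts n%128 == 0
    std::vector<float> x(n), y(n);
    for (int j = 0; j < n; ++j) { x[j] = std::sin(0.1f*j); y[j] = std::cos(0.2f*j); }
    std::vector<uint8_t> xr(8 + n), yr(8 + n);
    q8_kv_quantize_row(x.data(), xr.data(), n);
    q8_kv_quantize_row(y.data(), yr.data(), n);
    float exact = 0;
    for (int j = 0; j < n; ++j) exact += x[j]*y[j];
    std::printf("exact %.5f  q8_KV %.5f\n", exact, q8_kv_dot(xr.data(), yr.data(), n));
}

The second stored float is not needed for the plain signed dot product above. Presumably it is kept so that a kernel which biases one operand's quants by a constant, e.g. adds 127 to use unsigned multiply instructions, can remove the bias analytically instead of rescanning the row: sum dx*(qx_j + 127)*dy*qy_j = dx*dy*sum(qx_j*qy_j) + 127*dx*(dy*sum qy_j), where dy*sum(qy_j) is exactly the stored value.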