From 43c74f06daf721ebb4a2987f3c89ab84043aff0a Mon Sep 17 00:00:00 2001
From: Iwan Kawrakow
Date: Wed, 2 Oct 2024 09:25:00 +0300
Subject: [PATCH] Adding q6_0 - basics + AVX2/Zen4 working

---
 examples/quantize/quantize.cpp |   1 +
 ggml/include/ggml.h            |   2 +
 ggml/src/ggml-common.h         |   8 ++
 ggml/src/ggml-quants.c         | 131 +++++++++++++++++++++++++++++++++
 ggml/src/ggml-quants.h         |   5 ++
 ggml/src/ggml.c                |  26 +++++++
 ggml/src/iqk/iqk_mul_mat.cpp   |  29 ++++++--
 include/llama.h                |   1 +
 src/llama.cpp                  |   3 +
 9 files changed, 199 insertions(+), 7 deletions(-)

diff --git a/examples/quantize/quantize.cpp b/examples/quantize/quantize.cpp
index c11b8631..2b240299 100644
--- a/examples/quantize/quantize.cpp
+++ b/examples/quantize/quantize.cpp
@@ -20,6 +20,7 @@ static const std::vector<struct quant_option> QUANT_OPTIONS = {
     { "Q4_1",    LLAMA_FTYPE_MOSTLY_Q4_1,    " 3.90G, +0.1585 ppl @ LLaMA-v1-7B", },
     { "Q5_0",    LLAMA_FTYPE_MOSTLY_Q5_0,    " 4.33G, +0.0683 ppl @ LLaMA-v1-7B", },
     { "Q5_1",    LLAMA_FTYPE_MOSTLY_Q5_1,    " 4.70G, +0.0349 ppl @ LLaMA-v1-7B", },
+    { "Q6_0",    LLAMA_FTYPE_MOSTLY_Q6_0,    " 6.5 bpw quantization", },
     { "IQ2_XXS", LLAMA_FTYPE_MOSTLY_IQ2_XXS, " 2.06 bpw quantization", },
     { "IQ2_XS",  LLAMA_FTYPE_MOSTLY_IQ2_XS,  " 2.31 bpw quantization", },
     { "IQ2_S",   LLAMA_FTYPE_MOSTLY_IQ2_S,   " 2.5  bpw quantization", },
diff --git a/ggml/include/ggml.h b/ggml/include/ggml.h
index 36cc531f..08fe6a3e 100644
--- a/ggml/include/ggml.h
+++ b/ggml/include/ggml.h
@@ -392,6 +392,7 @@ extern "C" {
         GGML_TYPE_Q4_0_4_8 = 32,
         GGML_TYPE_Q4_0_8_8 = 33,
         //
+        GGML_TYPE_Q6_0   = 133,
         GGML_TYPE_IQ1_BN = 134,
         GGML_TYPE_IQ2_BN = 135,
         GGML_TYPE_Q8_K64 = 136,
@@ -447,6 +448,7 @@ extern "C" {
         GGML_FTYPE_MOSTLY_Q4_0_4_8 = 26, // except 1d tensors
         GGML_FTYPE_MOSTLY_Q4_0_8_8 = 27, // except 1d tensors
         //
+        GGML_FTYPE_MOSTLY_Q6_0   = 127, // except 1d tensors
         GGML_FTYPE_MOSTLY_IQ1_BN = 128, // except 1d tensors
         GGML_FTYPE_MOSTLY_IQ2_BN = 129, // except 1d tensors
         GGML_FTYPE_MOSTLY_IQ2_K  = 130, // except 1d tensors
diff --git a/ggml/src/ggml-common.h b/ggml/src/ggml-common.h
index bb0c4864..34377a16 100644
--- a/ggml/src/ggml-common.h
+++ b/ggml/src/ggml-common.h
@@ -187,6 +187,14 @@ typedef struct {
 } block_q5_1;
 static_assert(sizeof(block_q5_1) == 2 * sizeof(ggml_half) + sizeof(uint32_t) + QK5_1 / 2, "wrong q5_1 block size/padding");
 
+#define QK6_0 32
+typedef struct {
+    ggml_half d;           // delta
+    uint8_t   qh[QK6_0/4]; // 5th & 6th bits of the quants
+    uint8_t   qs[QK6_0/2]; // nibbles / quants
+} block_q6_0;
+static_assert(sizeof(block_q6_0) == sizeof(ggml_half) + QK6_0/2 + QK6_0/4, "wrong q6_0 block size/padding");
+
 #define QK8_0 32
 typedef struct {
     ggml_half d; // delta
diff --git a/ggml/src/ggml-quants.c b/ggml/src/ggml-quants.c
index bef2f73e..1de1eb06 100644
--- a/ggml/src/ggml-quants.c
+++ b/ggml/src/ggml-quants.c
@@ -848,6 +848,51 @@ void quantize_row_q5_1(const float * restrict x, void * restrict y, int64_t k) {
     quantize_row_q5_1_ref(x, y, k);
 }
 
+void quantize_row_q6_0_ref(const float * restrict x, block_q6_0 * restrict y, int64_t k) {
+    static const int qk = QK6_0;
+
+    assert(k % qk == 0);
+
+    const int nb = k / qk;
+
+    for (int i = 0; i < nb; i++) {
+        float amax = 0.0f; // absolute max
+        float max  = 0.0f;
+
+        for (int j = 0; j < qk; j++) {
+            const float v = x[i*qk + j];
+            if (amax < fabsf(v)) {
+                amax = fabsf(v);
+                max  = v;
+            }
+        }
+
+        const float d  = max / -32;
+        const float id = d ? 1.0f/d : 0.0f;
+
+        y[i].d = GGML_FP32_TO_FP16(d);
+        memset(y[i].qh, 0, qk/4);
+
+        for (int j = 0; j < qk/2; ++j) {
+            const float x0 = x[i*qk + 0    + j]*id;
+            const float x1 = x[i*qk + qk/2 + j]*id;
+
+            const uint8_t xi0 = MIN(63, (int8_t)(x0 + 32.5f));
+            const uint8_t xi1 = MIN(63, (int8_t)(x1 + 32.5f));
+
+            y[i].qs[j] = (xi0 & 0x0F) | ((xi1 & 0x0F) << 4);
+
+            const uint8_t h = (xi0 >> 4) | ((xi1 >> 4) << 2);
+            y[i].qh[j%(qk/4)] |= (h << 4*(j/(qk/4)));
+        }
+    }
+}
+
+void quantize_row_q6_0(const float * restrict x, void * restrict y, int64_t k) {
+    quantize_row_q6_0_ref(x, y, k);
+}
+
 // reference implementation for deterministic creation of model files
 void quantize_row_q8_0_ref(const float * restrict x, block_q8_0 * restrict y, int64_t k) {
     assert(k % QK8_0 == 0);
@@ -1691,6 +1736,28 @@ void dequantize_row_q5_1(const block_q5_1 * restrict x, float * restrict y, int64_t k) {
     }
 }
 
+void dequantize_row_q6_0(const block_q6_0 * restrict x, float * restrict y, int64_t k) {
+    static const int qk = QK6_0;
+
+    assert(k % qk == 0);
+
+    const int nb = k / qk;
+
+    for (int i = 0; i < nb; i++) {
+        const float d = GGML_FP16_TO_FP32(x[i].d);
+
+        for (int j = 0; j < qk/2; ++j) {
+            const uint8_t h = x[i].qh[j%(qk/4)] >> 4*(j/(qk/4));
+
+            const int32_t x0 = ((x[i].qs[j] & 0x0F) | ((h << 4) & 0x30)) - 32;
+            const int32_t x1 = ((x[i].qs[j] >>   4) | ((h << 2) & 0x30)) - 32;
+
+            y[i*qk + j + 0   ] = x0*d;
+            y[i*qk + j + qk/2] = x1*d;
+        }
+    }
+}
+
 void dequantize_row_q8_0(const block_q8_0 * restrict x, float * restrict y, int64_t k) {
     static const int qk = QK8_0;
@@ -3429,6 +3496,54 @@ size_t quantize_q5_1(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
     return nrow * row_size;
 }
 
+static void quantize_row_q6_0_impl(const float * restrict x, block_q6_0 * restrict y, int64_t n_per_row, const float * quant_weights) {
+    static_assert(QK6_0 == 32, "QK6_0 must be 32");
+
+    float  weight[QK6_0];
+    int8_t L[QK6_0];
+
+    float sigma2 = 0;
+    if (quant_weights) {
+        float sum_x2 = 0;
+        for (int j = 0; j < n_per_row; ++j) sum_x2 += x[j]*x[j];
+        sigma2 = sum_x2/n_per_row;
+    }
+
+    const int64_t nb = n_per_row/QK6_0;
+    for (int ib = 0; ib < nb; ++ib) {
+        const float * xb = x + QK6_0 * ib;
+        if (quant_weights) {
+            const float * qw = quant_weights + QK6_0 * ib;
+            for (int j = 0; j < QK6_0; ++j) weight[j] = qw[j] * sqrtf(sigma2 + xb[j]*xb[j]);
+        } else {
+            for (int j = 0; j < QK6_0; ++j) weight[j] = xb[j]*xb[j];
+        }
+        float d = make_qx_quants(QK6_0, 32, xb, L, 1, weight);
+        y[ib].d = GGML_FP32_TO_FP16(d);
+
+        memset(y[ib].qh, 0, QK6_0/4);
+
+        for (int j = 0; j < 16; ++j) {
+            const uint8_t xi0 = L[j];
+            const uint8_t xi1 = L[j+16];
+            y[ib].qs[j] = (xi0 & 0x0F) | ((xi1 & 0x0F) << 4);
+            const uint8_t h = (xi0 >> 4) | ((xi1 >> 4) << 2);
+            y[ib].qh[j%8] |= (h << 4*(j/8));
+        }
+    }
+}
+
+size_t quantize_q6_0(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
+    size_t row_size = ggml_row_size(GGML_TYPE_Q6_0, n_per_row);
+    char * qrow = (char *)dst;
+    for (int64_t row = 0; row < nrow; ++row) {
+        quantize_row_q6_0_impl(src, (block_q6_0*)qrow, n_per_row, quant_weights);
+        src += n_per_row;
+        qrow += row_size;
+    }
+    return nrow * row_size;
+}
+
 size_t quantize_q8_0(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
     (void)quant_weights; // not used
     const size_t row_size = ggml_row_size(GGML_TYPE_Q8_0, n_per_row);
@@ -5383,6 +5498,21 @@ void ggml_vec_dot_q5_1_q8_1(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
     *s = sumf;
 }
 
+void ggml_vec_dot_q6_0_q8_0(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
+#if GGML_USE_IQK_MULMAT
+#ifdef __AVX2__
+    const enum ggml_type vec_dot_type = GGML_TYPE_Q8_1;
+#else
+    const enum ggml_type vec_dot_type = GGML_TYPE_Q8_0;
+#endif
+    if (iqk_mul_mat(nrc, nrc, n, GGML_TYPE_Q6_0, vx, bx, vec_dot_type, vy, by, s, bs, 0, 1)) {
+        return;
+    }
+#endif
+    // TODO
+    *s = 0;
+}
+
 void ggml_vec_dot_q8_0_q8_0(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
 #if GGML_USE_IQK_MULMAT
     if (iqk_mul_mat(nrc, nrc, n, GGML_TYPE_Q8_0, vx, bx, GGML_TYPE_Q8_0, vy, by, s, bs, 0, 1)) {
@@ -15020,6 +15150,7 @@ bool ggml_validate_row_data(enum ggml_type type, const void * data, size_t nbytes) {
         {
             VALIDATE_ROW_DATA_D_F16_IMPL(block_iq4_nl, data, nb);
         } break;
+        case GGML_TYPE_Q6_0: break;
         case GGML_TYPE_IQ2_K: break;
         case GGML_TYPE_IQ3_K: break;
         case GGML_TYPE_IQ4_K: break;
diff --git a/ggml/src/ggml-quants.h b/ggml/src/ggml-quants.h
index 775aa875..bad7e9d9 100644
--- a/ggml/src/ggml-quants.h
+++ b/ggml/src/ggml-quants.h
@@ -25,6 +25,7 @@ void quantize_row_q5_0_ref(const float * GGML_RESTRICT x, block_q5_0 * GGML_RESTRICT y, int64_t k);
 void quantize_row_q5_1_ref(const float * GGML_RESTRICT x, block_q5_1 * GGML_RESTRICT y, int64_t k);
 void quantize_row_q8_0_ref(const float * GGML_RESTRICT x, block_q8_0 * GGML_RESTRICT y, int64_t k);
 void quantize_row_q8_1_ref(const float * GGML_RESTRICT x, block_q8_1 * GGML_RESTRICT y, int64_t k);
+void quantize_row_q6_0_ref(const float * GGML_RESTRICT x, block_q6_0 * GGML_RESTRICT y, int64_t k);
 
 void quantize_row_q2_K_ref(const float * GGML_RESTRICT x, block_q2_K * GGML_RESTRICT y, int64_t k);
 void quantize_row_q3_K_ref(const float * GGML_RESTRICT x, block_q3_K * GGML_RESTRICT y, int64_t k);
@@ -48,6 +49,7 @@ void quantize_row_q5_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
 void quantize_row_q5_1(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
 void quantize_row_q8_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
 void quantize_row_q8_1(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
+void quantize_row_q6_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
 
 void quantize_row_q2_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
 void quantize_row_q3_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
@@ -72,6 +74,7 @@ void dequantize_row_q5_0(const block_q5_0 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
 void dequantize_row_q5_1(const block_q5_1 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
 void dequantize_row_q8_0(const block_q8_0 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
 //void dequantize_row_q8_1(const block_q8_1 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
+void dequantize_row_q6_0(const block_q6_0 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
 
 void dequantize_row_q2_K(const block_q2_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
 void dequantize_row_q3_K(const block_q3_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
@@ -98,6 +101,7 @@ void ggml_vec_dot_q4_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
 void ggml_vec_dot_q5_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
 void ggml_vec_dot_q5_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
 void ggml_vec_dot_q8_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
+void ggml_vec_dot_q6_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
 
 void ggml_vec_dot_q2_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
 void ggml_vec_dot_q3_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
@@ -140,6 +144,7 @@ size_t quantize_q4_1(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
 size_t quantize_q5_0(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
 size_t quantize_q5_1(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
 size_t quantize_q8_0(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
+size_t quantize_q6_0(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
 
 void iq2xs_init_impl(enum ggml_type type);
 void iq2xs_free_impl(enum ggml_type type);
diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c
index ee83fc43..d31713df 100644
--- a/ggml/src/ggml.c
+++ b/ggml/src/ggml.c
@@ -799,6 +799,23 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
         .nrows = 1,
         .row_meta_size = 0,
     },
+    [GGML_TYPE_Q6_0] = {
+        .type_name      = "q6_0",
+        .blck_size      = QK6_0,
+        .type_size      = sizeof(block_q6_0),
+        .is_quantized   = true,
+        .to_float       = (ggml_to_float_t) dequantize_row_q6_0,
+        .from_float     = quantize_row_q6_0,
+        .from_float_ref = (ggml_from_float_t) quantize_row_q6_0_ref,
+        .vec_dot        = ggml_vec_dot_q6_0_q8_0,
+#if GGML_USE_IQK_MULMAT && defined __AVX2__
+        .vec_dot_type   = GGML_TYPE_Q8_1,
+#else
+        .vec_dot_type   = GGML_TYPE_Q8_0,
+#endif
+        .nrows          = 1,
+        .row_meta_size  = 0,
+    },
     [GGML_TYPE_Q8_0] = {
         .type_name = "q8_0",
         .blck_size = QK8_0,
@@ -3788,6 +3805,7 @@ enum ggml_type ggml_ftype_to_ggml_type(enum ggml_ftype ftype) {
         case GGML_FTYPE_MOSTLY_Q4_1: wtype = GGML_TYPE_Q4_1; break;
         case GGML_FTYPE_MOSTLY_Q5_0: wtype = GGML_TYPE_Q5_0; break;
         case GGML_FTYPE_MOSTLY_Q5_1: wtype = GGML_TYPE_Q5_1; break;
+        case GGML_FTYPE_MOSTLY_Q6_0: wtype = GGML_TYPE_Q6_0; break;
         case GGML_FTYPE_MOSTLY_Q8_0: wtype = GGML_TYPE_Q8_0; break;
         case GGML_FTYPE_MOSTLY_Q2_K: wtype = GGML_TYPE_Q2_K; break;
         case GGML_FTYPE_MOSTLY_Q3_K: wtype = GGML_TYPE_Q3_K; break;
@@ -10237,6 +10255,7 @@ static void ggml_compute_forward_add(
         case GGML_TYPE_Q4_1:
         case GGML_TYPE_Q5_0:
         case GGML_TYPE_Q5_1:
+        case GGML_TYPE_Q6_0:
         case GGML_TYPE_Q8_0:
         case GGML_TYPE_Q2_K:
         case GGML_TYPE_Q3_K:
@@ -10623,6 +10642,7 @@ static void ggml_compute_forward_add1(
         case GGML_TYPE_Q4_1:
         case GGML_TYPE_Q5_0:
         case GGML_TYPE_Q5_1:
+        case GGML_TYPE_Q6_0:
         case GGML_TYPE_Q8_0:
         case GGML_TYPE_Q8_1:
         case GGML_TYPE_Q2_K:
@@ -10760,6 +10780,7 @@ static void ggml_compute_forward_acc(
         case GGML_TYPE_Q4_1:
         case GGML_TYPE_Q5_0:
         case GGML_TYPE_Q5_1:
+        case GGML_TYPE_Q6_0:
         case GGML_TYPE_Q8_0:
         case GGML_TYPE_Q8_1:
         case GGML_TYPE_Q2_K:
@@ -13858,6 +13879,7 @@ static void ggml_compute_forward_out_prod(
         case GGML_TYPE_Q4_1:
         case GGML_TYPE_Q5_0:
         case GGML_TYPE_Q5_1:
+        case GGML_TYPE_Q6_0:
         case GGML_TYPE_Q8_0:
         case GGML_TYPE_Q2_K:
         case GGML_TYPE_Q3_K:
@@ -14234,6 +14256,7 @@ static void ggml_compute_forward_set(
         case GGML_TYPE_Q4_1:
         case GGML_TYPE_Q5_0:
         case GGML_TYPE_Q5_1:
+        case GGML_TYPE_Q6_0:
         case GGML_TYPE_Q8_0:
         case GGML_TYPE_Q8_1:
         case GGML_TYPE_Q2_K:
@@ -14505,6 +14528,7 @@ static void ggml_compute_forward_get_rows(
         case GGML_TYPE_Q4_1:
         case GGML_TYPE_Q5_0:
         case GGML_TYPE_Q5_1:
+        case GGML_TYPE_Q6_0:
         case GGML_TYPE_Q8_0:
         case GGML_TYPE_Q8_1:
         case GGML_TYPE_Q2_K:
@@ -15103,6 +15127,7 @@ static void ggml_compute_forward_clamp(
         case GGML_TYPE_Q4_1:
         case GGML_TYPE_Q5_0:
         case GGML_TYPE_Q5_1:
+        case GGML_TYPE_Q6_0:
         case GGML_TYPE_Q8_0:
         case GGML_TYPE_Q8_1:
         case GGML_TYPE_Q2_K:
@@ -21899,6 +21924,7 @@ size_t ggml_quantize_chunk(
         case GGML_TYPE_Q4_1: result = quantize_q4_1(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
         case GGML_TYPE_Q5_0: result = quantize_q5_0(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
         case GGML_TYPE_Q5_1: result = quantize_q5_1(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
+        case GGML_TYPE_Q6_0: result = quantize_q6_0(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
         case GGML_TYPE_Q8_0: result = quantize_q8_0(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
         case GGML_TYPE_Q2_K: result = quantize_q2_K(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
         case GGML_TYPE_Q3_K: result = quantize_q3_K(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
diff --git a/ggml/src/iqk/iqk_mul_mat.cpp b/ggml/src/iqk/iqk_mul_mat.cpp
index d16f01d9..5f4660ad 100644
--- a/ggml/src/iqk/iqk_mul_mat.cpp
+++ b/ggml/src/iqk/iqk_mul_mat.cpp
@@ -3228,6 +3228,16 @@ struct Q5_1_Dequantizer {
         return _mm256_or_si256(b4.dequant(x->qs), vqh);
     }
 };
+struct Q6_1_Dequantizer {
+    Dequantizer4bit b4;
+    const __m256i mh = _mm256_set1_epi8(0x30);
+    inline __m256i dequant(const block_q6_0 * x) const {
+        uint64_t aux64; std::memcpy(&aux64, x->qh, 8);
+        auto h128 = _mm_set_epi64x(aux64, aux64 << 4);
+        auto h256 = MM256_SET_M128I(_mm_srli_epi16(h128, 2), h128);
+        return _mm256_or_si256(b4.dequant(x->qs), _mm256_and_si256(h256, mh));
+    }
+};
 
 template <typename Q, typename Scales, typename Dequantizer>
 struct Q_Unpacker {
@@ -3332,6 +3342,11 @@ struct Q5_1_Unpacker final : public Q_Unpacker
     Q5_1_Unpacker(const void * vx, size_t bx) : Q_Unpacker(vx, bx) {}
     using Sum4T = Sum4TypeQ81;
     inline static int block_size() { return QK5_1; }
 };
+struct Q6_0_1_Unpacker final : public Q_Unpacker<block_q6_0, ScaleHelperQ_0_1<32>, Q6_1_Dequantizer> {
+    Q6_0_1_Unpacker(const void * vx, size_t bx) : Q_Unpacker(vx, bx) {}
+    using Sum4T = Sum4TypeQ81;
+    inline static int block_size() { return QK6_0; }
+};
 
 // float matrices - we handle f16, bf16 (if native bf16 support is available) and f32, but only to f32 result
@@ -3628,7 +3643,8 @@ template <typename Dequantizer> void MulMat::set_functions(MulMat& m) {
     }
     else if constexpr (std::is_same_v<Dequantizer, Q4_1_Unpacker> || std::is_same_v<Dequantizer, Q5_1_Unpacker> ||
                        std::is_same_v<Dequantizer, Q8_1_Unpacker> || std::is_same_v<Dequantizer, IQ4_NL_Unpacker> ||
-                       std::is_same_v<Dequantizer, Q4_0_1_Unpacker> || std::is_same_v<Dequantizer, Q5_0_1_Unpacker>) {
+                       std::is_same_v<Dequantizer, Q4_0_1_Unpacker> || std::is_same_v<Dequantizer, Q5_0_1_Unpacker> ||
+                       std::is_same_v<Dequantizer, Q6_0_1_Unpacker>) {
         m.funcs[0] = mul_mat_qX_1_q8_1_T<Dequantizer, 1>;
         m.funcs[1] = mul_mat_qX_1_q8_1_T<Dequantizer, 2>;
         m.funcs[2] = mul_mat_qX_1_q8_1_T<Dequantizer, 3>;
@@ -3893,8 +3909,6 @@ bool MulMat::prepare(int typeA, int typeB, int ne00, MulMat& mm, int Ny) {
             break;
         case GGML_TYPE_Q4_0:
            assert (ne00 % QK4_0 == 0);
-            //MulMat::set_functions<Q4_0_Unpacker>(mm);
-            //expected_typeB = GGML_TYPE_Q8_0;
             MulMat::set_functions<Q4_0_1_Unpacker>(mm);
             expected_typeB = GGML_TYPE_Q8_1;
             break;
@@ -3905,8 +3919,6 @@
         case GGML_TYPE_Q5_0:
             assert (ne00 % QK5_0 == 0);
-            //MulMat::set_functions<Q5_0_Unpacker>(mm);
-            //expected_typeB = GGML_TYPE_Q8_0;
             MulMat::set_functions<Q5_0_1_Unpacker>(mm);
             expected_typeB = GGML_TYPE_Q8_1;
             break;
@@ -3915,10 +3927,13 @@
             MulMat::set_functions<Q5_1_Unpacker>(mm);
             expected_typeB = GGML_TYPE_Q8_1;
             break;
+        case GGML_TYPE_Q6_0:
+            assert (ne00 % QK6_0 == 0);
+            MulMat::set_functions<Q6_0_1_Unpacker>(mm);
+            expected_typeB = GGML_TYPE_Q8_1;
+            break;
         case GGML_TYPE_Q8_0:
             assert (ne00 % QK8_0 == 0);
-            //MulMat::set_functions<Q8_0_Unpacker>(mm);
-            //expected_typeB = GGML_TYPE_Q8_0;
             MulMat::set_functions<Q8_0_1_Unpacker>(mm);
             expected_typeB = GGML_TYPE_Q8_1;
             break;
diff --git a/include/llama.h b/include/llama.h
index 02d94b6c..43c0091e 100644
--- a/include/llama.h
+++ b/include/llama.h
@@ -167,6 +167,7 @@ extern "C" {
         LLAMA_FTYPE_MOSTLY_Q4_0_4_8 = 34, // except 1d tensors
         LLAMA_FTYPE_MOSTLY_Q4_0_8_8 = 35, // except 1d tensors
         //
+        LLAMA_FTYPE_MOSTLY_Q6_0   = 135, // except 1d tensors
         LLAMA_FTYPE_MOSTLY_IQ1_BN = 136, // except 1d tensors
         LLAMA_FTYPE_MOSTLY_IQ2_BN = 137, // except 1d tensors
         LLAMA_FTYPE_MOSTLY_IQ2_K  = 138, // except 1d tensors
diff --git a/src/llama.cpp b/src/llama.cpp
index dca03ade..eb982125 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -3774,6 +3774,7 @@ struct llama_model_loader {
             case GGML_TYPE_Q4_1: ftype = LLAMA_FTYPE_MOSTLY_Q4_1; break;
             case GGML_TYPE_Q5_0: ftype = LLAMA_FTYPE_MOSTLY_Q5_0; break;
             case GGML_TYPE_Q5_1: ftype = LLAMA_FTYPE_MOSTLY_Q5_1; break;
+            case GGML_TYPE_Q6_0: ftype = LLAMA_FTYPE_MOSTLY_Q6_0; break;
             case GGML_TYPE_Q8_0: ftype = LLAMA_FTYPE_MOSTLY_Q8_0; break;
             case GGML_TYPE_Q2_K: ftype = LLAMA_FTYPE_MOSTLY_Q2_K; break;
             case GGML_TYPE_Q3_K: ftype = LLAMA_FTYPE_MOSTLY_Q3_K_M; break;
@@ -4471,6 +4472,7 @@ static std::string llama_model_ftype_name(llama_ftype ftype) {
         case LLAMA_FTYPE_MOSTLY_Q4_1: return "Q4_1";
         case LLAMA_FTYPE_MOSTLY_Q5_0: return "Q5_0";
         case LLAMA_FTYPE_MOSTLY_Q5_1: return "Q5_1";
+        case LLAMA_FTYPE_MOSTLY_Q6_0: return "Q6_0";
         case LLAMA_FTYPE_MOSTLY_Q8_0: return "Q8_0";
         case LLAMA_FTYPE_MOSTLY_Q2_K: return "Q2_K - Medium";
         case LLAMA_FTYPE_MOSTLY_Q2_K_S: return "Q2_K - Small";
@@ -15967,6 +15969,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const std::string & fname_out, const llama_model_quantize_params * params) {
         case LLAMA_FTYPE_MOSTLY_Q4_1: default_type = GGML_TYPE_Q4_1; break;
         case LLAMA_FTYPE_MOSTLY_Q5_0: default_type = GGML_TYPE_Q5_0; break;
         case LLAMA_FTYPE_MOSTLY_Q5_1: default_type = GGML_TYPE_Q5_1; break;
+        case LLAMA_FTYPE_MOSTLY_Q6_0: default_type = GGML_TYPE_Q6_0; break;
         case LLAMA_FTYPE_MOSTLY_Q8_0: default_type = GGML_TYPE_Q8_0; break;
         case LLAMA_FTYPE_MOSTLY_F16:  default_type = GGML_TYPE_F16;  break;
         case LLAMA_FTYPE_MOSTLY_BF16: default_type = GGML_TYPE_BF16; break;
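
== Editor's note: the q6_0 bit layout, round-tripped in plain C (not part of the patch) ==

The standalone sketch below mirrors the reference quantize_row_q6_0_ref/dequantize_row_q6_0
pair from the patch so the packing can be read in isolation. It is a simplified sketch under
stated assumptions, not the shipped code: the block scale is kept as a plain float instead of
ggml_half, values are clamped on both sides for safety, and every demo_* name is hypothetical.

#include <math.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

#define QK6_0 32

typedef struct {
    float   d;           // scale (ggml_half in the real block_q6_0)
    uint8_t qh[QK6_0/4]; // two high bits per quant, four quants per byte
    uint8_t qs[QK6_0/2]; // low nibbles: quant j in the low half, quant j+16 in the high half
} demo_block_q6_0;

static void demo_quantize(const float *x, demo_block_q6_0 *y) {
    float amax = 0.0f, max = 0.0f;
    for (int j = 0; j < QK6_0; ++j) {
        if (fabsf(x[j]) > amax) { amax = fabsf(x[j]); max = x[j]; }
    }
    const float d  = max / -32;  // signed quant range is [-32, 31]
    const float id = d ? 1.0f/d : 0.0f;
    y->d = d;
    memset(y->qh, 0, sizeof(y->qh));
    for (int j = 0; j < QK6_0/2; ++j) {
        // shift into the unsigned range [0, 63]; 32 is the zero point
        int xi0 = (int)(x[j          ]*id + 32.5f); if (xi0 < 0) xi0 = 0; if (xi0 > 63) xi0 = 63;
        int xi1 = (int)(x[j + QK6_0/2]*id + 32.5f); if (xi1 < 0) xi1 = 0; if (xi1 > 63) xi1 = 63;
        y->qs[j] = (uint8_t)((xi0 & 0x0F) | ((xi1 & 0x0F) << 4));    // low 4 bits of each quant
        const uint8_t h = (uint8_t)((xi0 >> 4) | ((xi1 >> 4) << 2)); // high 2+2 bits
        y->qh[j % (QK6_0/4)] |= (uint8_t)(h << 4*(j / (QK6_0/4)));   // nibble-packed, 4 quants/byte
    }
}

static void demo_dequantize(const demo_block_q6_0 *y, float *x) {
    for (int j = 0; j < QK6_0/2; ++j) {
        const uint8_t h = y->qh[j % (QK6_0/4)] >> 4*(j / (QK6_0/4));
        const int x0 = ((y->qs[j] & 0x0F) | ((h << 4) & 0x30)) - 32;
        const int x1 = ((y->qs[j] >>   4) | ((h << 2) & 0x30)) - 32;
        x[j          ] = x0 * y->d;
        x[j + QK6_0/2] = x1 * y->d;
    }
}

int main(void) {
    float src[QK6_0], dst[QK6_0];
    for (int j = 0; j < QK6_0; ++j) src[j] = 0.1f*(j - 16); // arbitrary test data
    demo_block_q6_0 blk;
    demo_quantize(src, &blk);
    demo_dequantize(&blk, dst);
    float err = 0.0f;
    for (int j = 0; j < QK6_0; ++j) err = fmaxf(err, fabsf(src[j] - dst[j]));
    printf("max round-trip error: %g (quantization step: %g)\n", err, fabsf(blk.d));
    return 0;
}

A real block stores 32 weights in 2 + 8 + 16 = 26 bytes, i.e. 6.5 bits per weight, which is
where the "6.5 bpw quantization" string registered in QUANT_OPTIONS comes from.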
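
== Editor's note: how the AVX2 dequantizer expands qh (not part of the patch) ==

Q6_1_Dequantizer loads the eight qh bytes once into aux64 and duplicates them so that all 32
quants can pick up their two high bits with a single mask. The aux64 << 4 copy serves quants
0..7 (their bits sit at nibble positions 0..1 and must move up by four), the unshifted copy
serves quants 8..15 (bits already at byte positions 4..5), and the _mm_srli_epi16(h128, 2)
half serves quants 16..31, whose bits sit two positions higher within each nibble. The 16-bit
shifts do smear bits across byte boundaries, but the final mask with 0x30 keeps only bits
4..5 that originated in the same byte. A scalar rendition of what each output byte receives
(q6_0_high_bits is a hypothetical helper, assuming the quant order produced by
Dequantizer4bit matches the scalar reference above):

#include <stdint.h>

// For quant j in [0, 31], return its two high bits already placed at bit
// positions 4..5, ready to be OR-ed onto the low nibble unpacked from qs.
// This is the per-byte effect of the (h256 & mh) term in Q6_1_Dequantizer.
static inline uint8_t q6_0_high_bits(const uint8_t qh[8], int j) {
    const uint8_t h = qh[j % 8] >> (4 * ((j % 16) / 8)); // nibble holding quant j's bits
    return (uint8_t)((j < 16 ? h << 4 : h << 2) & 0x30);
}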