diff --git a/examples/quantize/quantize.cpp b/examples/quantize/quantize.cpp
index 9a08d625..c6153e45 100644
--- a/examples/quantize/quantize.cpp
+++ b/examples/quantize/quantize.cpp
@@ -28,6 +28,7 @@ static const std::vector<struct quant_option> QUANT_OPTIONS = {
     { "IQ1_M",   LLAMA_FTYPE_MOSTLY_IQ1_M,   " 1.75 bpw quantization", },
     { "IQ1_BN",  LLAMA_FTYPE_MOSTLY_IQ1_BN,  " 1.62 bpw quantization (Bitnet)", },
     { "IQ2_BN",  LLAMA_FTYPE_MOSTLY_IQ2_BN,  " 2.00 bpw quantization (Bitnet)", },
+    { "IQ1_TN",  LLAMA_FTYPE_MOSTLY_IQ1_TN,  " 1.69 bpw quantization (TriLM)", },
     { "IQ2_TN",  LLAMA_FTYPE_MOSTLY_IQ2_TN,  " 2.06 bpw quantization (TriLM)", },
     { "Q2_K",    LLAMA_FTYPE_MOSTLY_Q2_K,    " 2.63G, +0.6717 ppl @ LLaMA-v1-7B", },
     { "Q2_K_S",  LLAMA_FTYPE_MOSTLY_Q2_K_S,  " 2.16G, +9.0634 ppl @ LLaMA-v1-7B", },
diff --git a/ggml/include/ggml.h b/ggml/include/ggml.h
index ab6d172d..5b46a70d 100644
--- a/ggml/include/ggml.h
+++ b/ggml/include/ggml.h
@@ -391,15 +391,17 @@ extern "C" {
         GGML_TYPE_Q4_0_4_4 = 31,
         GGML_TYPE_Q4_0_4_8 = 32,
         GGML_TYPE_Q4_0_8_8 = 33,
-        GGML_TYPE_IQ1_BN   = 34,
-        GGML_TYPE_IQ2_BN   = 35,
-        GGML_TYPE_Q8_K64   = 36,
-        GGML_TYPE_IQ2_K    = 37,
-        GGML_TYPE_IQ3_K    = 38,
-        GGML_TYPE_IQ4_K    = 39,
-        GGML_TYPE_IQ5_K    = 40,
-        GGML_TYPE_IQ6_K    = 41,
-        GGML_TYPE_IQ2_TN   = 42,
+        //
+        GGML_TYPE_IQ1_BN   = 134,
+        GGML_TYPE_IQ2_BN   = 135,
+        GGML_TYPE_Q8_K64   = 136,
+        GGML_TYPE_IQ2_K    = 137,
+        GGML_TYPE_IQ3_K    = 138,
+        GGML_TYPE_IQ4_K    = 139,
+        GGML_TYPE_IQ5_K    = 140,
+        GGML_TYPE_IQ6_K    = 141,
+        GGML_TYPE_IQ2_TN   = 142,
+        GGML_TYPE_IQ1_TN   = 143,
         GGML_TYPE_COUNT,
     };

@@ -444,14 +446,16 @@ extern "C" {
         GGML_FTYPE_MOSTLY_Q4_0_4_4 = 25, // except 1d tensors
         GGML_FTYPE_MOSTLY_Q4_0_4_8 = 26, // except 1d tensors
         GGML_FTYPE_MOSTLY_Q4_0_8_8 = 27, // except 1d tensors
-        GGML_FTYPE_MOSTLY_IQ1_BN   = 28, // except 1d tensors
-        GGML_FTYPE_MOSTLY_IQ2_BN   = 29, // except 1d tensors
-        GGML_FTYPE_MOSTLY_IQ2_K    = 30, // except 1d tensors
-        GGML_FTYPE_MOSTLY_IQ3_K    = 31, // except 1d tensors
-        GGML_FTYPE_MOSTLY_IQ4_K    = 32, // except 1d tensors
-        GGML_FTYPE_MOSTLY_IQ5_K    = 33, // except 1d tensors
-        GGML_FTYPE_MOSTLY_IQ6_K    = 34, // except 1d tensors
-        GGML_FTYPE_MOSTLY_IQ2_TN   = 35, // except 1d tensors
+        //
+        GGML_FTYPE_MOSTLY_IQ1_BN   = 128, // except 1d tensors
+        GGML_FTYPE_MOSTLY_IQ2_BN   = 129, // except 1d tensors
+        GGML_FTYPE_MOSTLY_IQ2_K    = 130, // except 1d tensors
+        GGML_FTYPE_MOSTLY_IQ3_K    = 131, // except 1d tensors
+        GGML_FTYPE_MOSTLY_IQ4_K    = 132, // except 1d tensors
+        GGML_FTYPE_MOSTLY_IQ5_K    = 133, // except 1d tensors
+        GGML_FTYPE_MOSTLY_IQ6_K    = 134, // except 1d tensors
+        GGML_FTYPE_MOSTLY_IQ2_TN   = 135, // except 1d tensors
+        GGML_FTYPE_MOSTLY_IQ1_TN   = 136, // except 1d tensors
     };

     // available tensor operations:
diff --git a/ggml/src/ggml-common.h b/ggml/src/ggml-common.h
index 57fdeb82..f1a34f7a 100644
--- a/ggml/src/ggml-common.h
+++ b/ggml/src/ggml-common.h
@@ -435,6 +435,10 @@ static_assert(sizeof(block_iq2_bn) == QK_IQ2BN/4, "wrong iq2_bn block size/paddi
 //
 // TriLM - implemented as 2.0625 bpw
 //
+typedef struct {
+    uint8_t qs[54];
+} block_iq1_tn;
+static_assert(sizeof(block_iq1_tn) == 54, "wrong iq1_tn block size/padding");
 typedef struct {
     ggml_half d;
     uint8_t qs[QK_K/4];
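A quick size note on the new block (an aside, not part of the patch): `block_iq1_tn` packs QK_K = 256 ternary weights into 54 bytes, which is exactly where the 1.6875 bpw figure used below comes from, and the 2 bytes left over after four 13-byte IQ1_BN sub-blocks (13 bytes per 64 weights is the 1.625 bpw Bitnet packing) leave room for the per-row fp16 scale that `quantize_iq1_tn` stores at the start of each row. The constants QK_K = 256, QK_IQ1BN = 64 and sizeof(block_iq1_bn) = 13 are assumptions taken from the surrounding ggml code, not stated in this diff.

```cpp
// Illustrative arithmetic only, assuming QK_K == 256, QK_IQ1BN == 64 and
// sizeof(block_iq1_bn) == 13 (the 1.625 bpw Bitnet packing reused here).
#include <cstdio>

int main() {
    const int QK_K         = 256;  // weights per block_iq1_tn
    const int QK_IQ1BN     = 64;   // weights per block_iq1_bn
    const int iq1_bn_bytes = 13;   // 1.625 bpw Bitnet packing
    const int iq1_tn_bytes = 54;   // sizeof(block_iq1_tn) from ggml-common.h above

    // 4 * 13 = 52 bytes of packed ternary data + 2 bytes of slack for a ggml_half row scale
    std::printf("payload: %d bytes, slack: %d bytes\n",
                (QK_K/QK_IQ1BN)*iq1_bn_bytes,
                iq1_tn_bytes - (QK_K/QK_IQ1BN)*iq1_bn_bytes);
    std::printf("IQ1_TN: %.4f bpw\n", 8.0*iq1_tn_bytes/QK_K);  // -> 1.6875
    return 0;
}
```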
diff --git a/ggml/src/ggml-quants.c b/ggml/src/ggml-quants.c
index 981fb54b..a9a25761 100644
--- a/ggml/src/ggml-quants.c
+++ b/ggml/src/ggml-quants.c
@@ -15015,6 +15015,7 @@ bool ggml_validate_row_data(enum ggml_type type, const void * data, size_t nbyte
         case GGML_TYPE_IQ5_K: break;
         case GGML_TYPE_IQ6_K: break;
         case GGML_TYPE_IQ2_TN: break;
+        case GGML_TYPE_IQ1_TN: break;
         case GGML_TYPE_Q4_0_4_4:
         case GGML_TYPE_Q4_0_4_8:
             {
diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c
index d562002e..7d294f4a 100644
--- a/ggml/src/ggml.c
+++ b/ggml/src/ggml.c
@@ -985,6 +985,18 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
         .vec_dot_type             = GGML_TYPE_Q8_K,
         .nrows                    = 1,
     },
+    [GGML_TYPE_IQ1_TN] = {
+        .type_name                = "iq1_tn",
+        .blck_size                = QK_K,
+        .type_size                = sizeof(block_iq1_tn),
+        .is_quantized             = true,
+        .to_float                 = (ggml_to_float_t) dequantize_row_iq1_tn,
+        .from_float               = quantize_row_iq1_tn,
+        .from_float_ref           = (ggml_from_float_t)quantize_row_iq1_tn_ref,
+        .vec_dot                  = vec_dot_iq1_tn_q8_k,
+        .vec_dot_type             = GGML_TYPE_Q8_K64,
+        .nrows                    = 1,
+    },
     [GGML_TYPE_IQ4_NL] = {
         .type_name                = "iq4_nl",
         .blck_size                = QK4_NL,
@@ -3705,6 +3717,7 @@ enum ggml_type ggml_ftype_to_ggml_type(enum ggml_ftype ftype) {
         case GGML_FTYPE_MOSTLY_IQ1_BN:  wtype = GGML_TYPE_IQ1_BN;  break;
         case GGML_FTYPE_MOSTLY_IQ2_BN:  wtype = GGML_TYPE_IQ2_BN;  break;
         case GGML_FTYPE_MOSTLY_IQ2_TN:  wtype = GGML_TYPE_IQ2_TN;  break;
+        case GGML_FTYPE_MOSTLY_IQ1_TN:  wtype = GGML_TYPE_IQ1_TN;  break;
         case GGML_FTYPE_MOSTLY_IQ4_NL:  wtype = GGML_TYPE_IQ4_NL;  break;
         case GGML_FTYPE_MOSTLY_IQ4_XS:  wtype = GGML_TYPE_IQ4_XS;  break;
         case GGML_FTYPE_MOSTLY_IQ2_K:   wtype = GGML_TYPE_IQ2_K;   break;
@@ -10133,6 +10146,7 @@ static void ggml_compute_forward_add(
         case GGML_TYPE_IQ1_BN:
         case GGML_TYPE_IQ2_BN:
         case GGML_TYPE_IQ2_TN:
+        case GGML_TYPE_IQ1_TN:
         case GGML_TYPE_IQ4_NL:
         case GGML_TYPE_IQ4_XS:
         case GGML_TYPE_IQ2_K:
@@ -10519,6 +10533,7 @@ static void ggml_compute_forward_add1(
         case GGML_TYPE_IQ1_BN:
         case GGML_TYPE_IQ2_BN:
         case GGML_TYPE_IQ2_TN:
+        case GGML_TYPE_IQ1_TN:
         case GGML_TYPE_IQ4_NL:
         case GGML_TYPE_IQ4_XS:
         case GGML_TYPE_IQ2_K:
@@ -10655,6 +10670,7 @@ static void ggml_compute_forward_acc(
         case GGML_TYPE_IQ1_BN:
         case GGML_TYPE_IQ2_BN:
         case GGML_TYPE_IQ2_TN:
+        case GGML_TYPE_IQ1_TN:
         case GGML_TYPE_IQ4_NL:
         case GGML_TYPE_IQ4_XS:
         case GGML_TYPE_IQ2_K:
@@ -13692,6 +13708,7 @@ static void ggml_compute_forward_out_prod(
         case GGML_TYPE_IQ1_BN:
         case GGML_TYPE_IQ2_BN:
         case GGML_TYPE_IQ2_TN:
+        case GGML_TYPE_IQ1_TN:
         case GGML_TYPE_IQ4_NL:
         case GGML_TYPE_IQ4_XS:
         case GGML_TYPE_IQ2_K:
@@ -14068,6 +14085,7 @@ static void ggml_compute_forward_set(
         case GGML_TYPE_IQ1_BN:
         case GGML_TYPE_IQ2_BN:
         case GGML_TYPE_IQ2_TN:
+        case GGML_TYPE_IQ1_TN:
         case GGML_TYPE_IQ4_NL:
         case GGML_TYPE_IQ4_XS:
         case GGML_TYPE_IQ2_K:
@@ -14338,6 +14356,7 @@ static void ggml_compute_forward_get_rows(
         case GGML_TYPE_IQ1_BN:
         case GGML_TYPE_IQ2_BN:
         case GGML_TYPE_IQ2_TN:
+        case GGML_TYPE_IQ1_TN:
         case GGML_TYPE_IQ4_NL:
         case GGML_TYPE_IQ4_XS:
         case GGML_TYPE_IQ2_K:
@@ -14935,6 +14954,7 @@ static void ggml_compute_forward_clamp(
         case GGML_TYPE_IQ1_BN:
         case GGML_TYPE_IQ2_BN:
         case GGML_TYPE_IQ2_TN:
+        case GGML_TYPE_IQ1_TN:
         case GGML_TYPE_IQ4_NL:
         case GGML_TYPE_IQ4_XS:
         case GGML_TYPE_IQ2_K:
@@ -21722,6 +21742,7 @@ size_t ggml_quantize_chunk(
         case GGML_TYPE_IQ1_BN:  result = quantize_iq1_bn (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
         case GGML_TYPE_IQ2_BN:  result = quantize_iq2_bn (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
         case GGML_TYPE_IQ2_TN:  result = quantize_iq2_tn (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
+        case GGML_TYPE_IQ1_TN:  result = quantize_iq1_tn (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
         case GGML_TYPE_IQ4_NL:  result = quantize_iq4_nl (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
         case GGML_TYPE_IQ4_XS:  result = quantize_iq4_xs (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
         case GGML_TYPE_IQ2_K:   result = quantize_iq2_k  (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
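For orientation (not part of the patch): once the type-traits entry and the `ggml_quantize_chunk()` case above are in place, the new type is reachable through ggml's generic entry points. A minimal sketch, assuming a ggml build that contains this patch:

```cpp
// Minimal sketch, assuming a ggml build with this patch applied.
#include "ggml.h"
#include <cstdlib>

// Quantize a single row of n_per_row floats to IQ1_TN and return the buffer (caller frees).
static void * quantize_row_to_iq1_tn(const float * src, int64_t n_per_row) {
    // resolves to (n_per_row/QK_K) * sizeof(block_iq1_tn) via the new type_traits entry
    const size_t row_size = ggml_row_size(GGML_TYPE_IQ1_TN, n_per_row);
    void * buf = std::malloc(row_size);
    // dispatches to quantize_iq1_tn() through the switch added in ggml_quantize_chunk()
    ggml_quantize_chunk(GGML_TYPE_IQ1_TN, src, buf, /*start =*/0, /*nrows =*/1, n_per_row, /*imatrix =*/nullptr);
    return buf;
}
```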
diff --git a/ggml/src/iqk/iqk_mul_mat.cpp b/ggml/src/iqk/iqk_mul_mat.cpp
index 55366ab1..a532d18e 100644
--- a/ggml/src/iqk/iqk_mul_mat.cpp
+++ b/ggml/src/iqk/iqk_mul_mat.cpp
@@ -2131,7 +2131,7 @@ struct DequantizerIQ1BN {
 };

-template <int nrc_y>
+template <int nrc_y, bool is_iq1_tn>
 IQK_NOINLINE void mul_mat_iq1bn_q8_K64(int n, const void * vx, size_t bx, const DataInfo& info, int nrc_x) {
     const int nb = n / QK_IQ1BN;
     Q8_K64<nrc_y> q8(info);
@@ -2143,11 +2143,21 @@ IQK_NOINLINE void mul_mat_iq1bn_q8_K64(int n, const void * vx, size_t bx, const
     const auto m1_16  = _mm256_set1_epi16(1);
 #endif

-    const block_iq1_bn * x = (const block_iq1_bn *)((const char *)vx);
+    //const block_iq1_bn * x = (const block_iq1_bn *)((const char *)vx);
+
+    const block_iq1_bn * x;
+    const char * cx0 = (const char *)vx;
+    float scale;
+    //template > float scale;

     for (int ix = 0; ix < nrc_x; ++ix) {

-        x = (const block_iq1_bn *)((const char *)vx + ix*bx);
+        const char * cx = cx0 + ix*bx;
+        if constexpr (is_iq1_tn) {
+            scale = GGML_FP16_TO_FP32(*(const ggml_half *)cx);
+            cx += sizeof(ggml_half);
+        }
+        x = (const block_iq1_bn *)cx;

         if constexpr (nrc_y == 1) {
             __m256i acc1 = _mm256_setzero_si256(), acc2 = _mm256_setzero_si256();
@@ -2220,7 +2230,11 @@ IQK_NOINLINE void mul_mat_iq1bn_q8_K64(int n, const void * vx, size_t bx, const
             auto vd = q8.scale(iy);
             auto sumi = _mm_add_epi32(_mm256_castsi256_si128(accd[iy]), _mm256_extractf128_si256(accd[iy], 1));
             auto sumf = _mm_mul_ps(vd, _mm_cvtepi32_ps(sumi));
-            info.store(ix, iy, hsum_float_4(sumf));
+            if constexpr (is_iq1_tn) {
+                info.store(ix, iy, scale*hsum_float_4(sumf));
+            } else {
+                info.store(ix, iy, hsum_float_4(sumf));
+            }
         }
     }

@@ -3733,14 +3747,26 @@ bool MulMat::prepare(int typeA, int typeB, int ne00, MulMat& mm, int Ny) {
             break;
         case GGML_TYPE_IQ1_BN:
             assert (ne00 % QK_IQ1BN == 0);
-            mm.funcs[0] = mul_mat_iq1bn_q8_K64<1>;
-            mm.funcs[1] = mul_mat_iq1bn_q8_K64<2>;
-            mm.funcs[2] = mul_mat_iq1bn_q8_K64<3>;
-            mm.funcs[3] = mul_mat_iq1bn_q8_K64<4>;
-            mm.funcs[4] = mul_mat_iq1bn_q8_K64<5>;
-            mm.funcs[5] = mul_mat_iq1bn_q8_K64<6>;
-            mm.funcs[6] = mul_mat_iq1bn_q8_K64<7>;
-            mm.funcs[7] = mul_mat_iq1bn_q8_K64<8>;
+            mm.funcs[0] = mul_mat_iq1bn_q8_K64<1, false>;
+            mm.funcs[1] = mul_mat_iq1bn_q8_K64<2, false>;
+            mm.funcs[2] = mul_mat_iq1bn_q8_K64<3, false>;
+            mm.funcs[3] = mul_mat_iq1bn_q8_K64<4, false>;
+            mm.funcs[4] = mul_mat_iq1bn_q8_K64<5, false>;
+            mm.funcs[5] = mul_mat_iq1bn_q8_K64<6, false>;
+            mm.funcs[6] = mul_mat_iq1bn_q8_K64<7, false>;
+            mm.funcs[7] = mul_mat_iq1bn_q8_K64<8, false>;
+            expected_typeB = GGML_TYPE_Q8_K64;
+            break;
+        case GGML_TYPE_IQ1_TN:
+            assert (ne00 % QK_IQ1BN == 0);
+            mm.funcs[0] = mul_mat_iq1bn_q8_K64<1, true>;
+            mm.funcs[1] = mul_mat_iq1bn_q8_K64<2, true>;
+            mm.funcs[2] = mul_mat_iq1bn_q8_K64<3, true>;
+            mm.funcs[3] = mul_mat_iq1bn_q8_K64<4, true>;
+            mm.funcs[4] = mul_mat_iq1bn_q8_K64<5, true>;
+            mm.funcs[5] = mul_mat_iq1bn_q8_K64<6, true>;
+            mm.funcs[6] = mul_mat_iq1bn_q8_K64<7, true>;
+            mm.funcs[7] = mul_mat_iq1bn_q8_K64<8, true>;
             expected_typeB = GGML_TYPE_Q8_K64;
             break;
         case GGML_TYPE_IQ2_BN:
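The only difference between the IQ1_BN and IQ1_TN instantiations of the kernel above is whether a per-row fp16 scale is read ahead of the packed blocks and folded into the final store. A toy illustration of that `if constexpr` pattern (illustration only; `finish` and its arguments are made up for the example, not part of the patch):

```cpp
// Toy example of the compile-time branch used above: the 'false' instantiation
// compiles to a plain pass-through, the 'true' one applies the row scale.
#include <cstdio>

template <bool is_iq1_tn>
static float finish(float dot, float row_scale) {
    if constexpr (is_iq1_tn) {
        return row_scale * dot;   // IQ1_TN rows start with a ggml_half scale
    } else {
        return dot;               // IQ1_BN rows carry no scale
    }
}

int main() {
    std::printf("%g %g\n", finish<false>(2.0f, 0.25f), finish<true>(2.0f, 0.25f));
    return 0;
}
```

Using a template bool rather than a runtime flag keeps the existing IQ1_BN hot loop free of any per-row branching, since the unused path is discarded at compile time.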
diff --git a/ggml/src/iqk/iqk_quantize.cpp b/ggml/src/iqk/iqk_quantize.cpp
index 0968becf..64815e92 100644
--- a/ggml/src/iqk/iqk_quantize.cpp
+++ b/ggml/src/iqk/iqk_quantize.cpp
@@ -119,6 +119,54 @@ void quantize_row_iq1_bn(const float * x, void * y, int64_t k) {
     quantize_iq1_bn(x, y, 1, k, nullptr);
 }

+void quantize_row_iq1_tn_ref(const float * x, block_iq1_tn * y, int64_t k) {
+    quantize_iq1_tn(x, (void *)y, 1, k, nullptr);
+}
+
+void quantize_row_iq1_tn(const float * x, void * y, int64_t k) {
+    quantize_iq1_tn(x, y, 1, k, nullptr);
+}
+
+size_t quantize_iq1_tn(const float * src, void * dst, int64_t nrows, int64_t n_per_row, const float * imatrix) {
+    GGML_ASSERT(n_per_row >= 2*QK_K); // so we have space for the scale
+    int nblock = n_per_row/QK_IQ1BN;
+    float tmp[QK_IQ1BN];
+    char * qrow = (char *)dst;
+    auto row_size = ggml_row_size(GGML_TYPE_IQ1_TN, n_per_row);
+    IQ1BNQuantizer iq1bn;
+    for (int row = 0; row < nrows; ++row) {
+        float max = fabsf(src[0]);
+        for (int j = 1; j < n_per_row; ++j) max = std::max(max, fabsf(src[j]));
+        if (!(max > 0)) printf("%s: found max = %g?\n", __func__, max);
+        //GGML_ASSERT(max > 0);
+        *(ggml_half *)qrow = GGML_FP32_TO_FP16(max);
+        block_iq1_bn * y = (block_iq1_bn *)(qrow + sizeof(ggml_half));
+        const float * xb = src;
+        for (int ib = 0; ib < nblock; ++ib) {
+            for (int j = 0; j < QK_IQ1BN; ++j) tmp[j] = xb[j] < -0.5f*max ? -1 : xb[j] <= 0.5f*max ? 0 : 1;
+            iq1bn.quantize_one_row_1bn(tmp, y, QK_IQ1BN, imatrix);
+            ++y;
+            xb += QK_IQ1BN;
+        }
+        src += n_per_row;
+        qrow += row_size;
+    }
+    return nrows*row_size;
+}
+
+void dequantize_row_iq1_tn(const block_iq1_tn * x, float * y, int64_t k) {
+    float scale = GGML_FP16_TO_FP32(*(const ggml_half *)x);
+    const block_iq1_bn * iq1bn = (const block_iq1_bn *)((const char *)x + sizeof(ggml_half));
+    dequantize_row_iq1_bn(iq1bn, y, k);
+    for (int j = 0; j < int(k); ++j) y[j] *= scale;
+}
+
+void vec_dot_iq1_tn_q8_k(int n, float * s, size_t bs, const void * vx, size_t bx, const void * vy, size_t by, int nrc) {
+    float scale = GGML_FP16_TO_FP32(*(const ggml_half *)vx);
+    ggml_vec_dot_iq1_bn_q8_K64(n, s, bs, (const void *)((const char *)vx + sizeof(ggml_half)), bx, vy, by, nrc);
+    *s *= scale;
+}
+
 void dequantize_row_iq1_bn(const block_iq1_bn * x, float * y, int64_t k) {
     assert(k%QK_IQ1BN == 0);
     int nblock = k / QK_IQ1BN;
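In plain terms, `quantize_iq1_tn()` above stores one fp16 scale (the row's absolute maximum) at the start of each row and then reuses the IQ1_BN bit packing for the ternary values, which are obtained by thresholding against ±0.5·max; `dequantize_row_iq1_tn()` and `vec_dot_iq1_tn_q8_k()` undo that by reading the scale first and multiplying it back in. A standalone scalar sketch of the value mapping (illustration only; `ternarize_row` is a made-up helper, not part of the patch):

```cpp
// Scalar illustration of the mapping used by quantize_iq1_tn(): snap each weight
// to {-1, 0, +1} relative to the row-wise absolute maximum, and keep the maximum
// as the per-row scale (stored as ggml_half in the real code).
#include <algorithm>
#include <cmath>
#include <cstdint>
#include <cstddef>

static void ternarize_row(const float * x, int8_t * q, float & row_scale, size_t n) {
    float max = 0.0f;
    for (size_t j = 0; j < n; ++j) max = std::max(max, std::fabs(x[j]));
    row_scale = max;
    for (size_t j = 0; j < n; ++j) {
        q[j] = x[j] < -0.5f*max ? -1 : x[j] <= 0.5f*max ? 0 : 1;
    }
}
```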
diff --git a/ggml/src/iqk/iqk_quantize.h b/ggml/src/iqk/iqk_quantize.h
index d7a0748f..e5c16fc9 100644
--- a/ggml/src/iqk/iqk_quantize.h
+++ b/ggml/src/iqk/iqk_quantize.h
@@ -49,6 +49,12 @@ size_t quantize_iq2_tn(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst
 void dequantize_row_iq2_tn(const block_iq2_tn * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
 void vec_dot_iq2_tn_q8_k(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);

+void quantize_row_iq1_tn_ref(const float * GGML_RESTRICT x, block_iq1_tn * GGML_RESTRICT y, int64_t k);
+void quantize_row_iq1_tn(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
+size_t quantize_iq1_tn(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
+void dequantize_row_iq1_tn(const block_iq1_tn * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
+void vec_dot_iq1_tn_q8_k(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
+
 void iqk_quantize_row_q8_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k);

 #ifdef __cplusplus
diff --git a/include/llama.h b/include/llama.h
index a9af4c48..02d94b6c 100644
--- a/include/llama.h
+++ b/include/llama.h
@@ -166,14 +166,16 @@ extern "C" {
         LLAMA_FTYPE_MOSTLY_Q4_0_4_4      = 33, // except 1d tensors
         LLAMA_FTYPE_MOSTLY_Q4_0_4_8      = 34, // except 1d tensors
         LLAMA_FTYPE_MOSTLY_Q4_0_8_8      = 35, // except 1d tensors
-        LLAMA_FTYPE_MOSTLY_IQ1_BN        = 36, // except 1d tensors
-        LLAMA_FTYPE_MOSTLY_IQ2_BN        = 37, // except 1d tensors
-        LLAMA_FTYPE_MOSTLY_IQ2_K         = 38, // except 1d tensors
-        LLAMA_FTYPE_MOSTLY_IQ3_K         = 39, // except 1d tensors
-        LLAMA_FTYPE_MOSTLY_IQ4_K         = 40, // except 1d tensors
-        LLAMA_FTYPE_MOSTLY_IQ5_K         = 41, // except 1d tensors
-        LLAMA_FTYPE_MOSTLY_IQ6_K         = 42, // except 1d tensors
-        LLAMA_FTYPE_MOSTLY_IQ2_TN        = 43, // except 1d tensors
+        //
+        LLAMA_FTYPE_MOSTLY_IQ1_BN        = 136, // except 1d tensors
+        LLAMA_FTYPE_MOSTLY_IQ2_BN        = 137, // except 1d tensors
+        LLAMA_FTYPE_MOSTLY_IQ2_K         = 138, // except 1d tensors
+        LLAMA_FTYPE_MOSTLY_IQ3_K         = 139, // except 1d tensors
+        LLAMA_FTYPE_MOSTLY_IQ4_K         = 140, // except 1d tensors
+        LLAMA_FTYPE_MOSTLY_IQ5_K         = 141, // except 1d tensors
+        LLAMA_FTYPE_MOSTLY_IQ6_K         = 142, // except 1d tensors
+        LLAMA_FTYPE_MOSTLY_IQ2_TN        = 143, // except 1d tensors
+        LLAMA_FTYPE_MOSTLY_IQ1_TN        = 144, // except 1d tensors

         LLAMA_FTYPE_GUESSED = 1024, // not specified in the model file
     };
diff --git a/src/llama.cpp b/src/llama.cpp
index 768aafa7..bb9b6848 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -3788,6 +3788,7 @@ struct llama_model_loader {
             case GGML_TYPE_IQ1_M:   ftype = LLAMA_FTYPE_MOSTLY_IQ1_M;   break;
             case GGML_TYPE_IQ1_BN:  ftype = LLAMA_FTYPE_MOSTLY_IQ1_BN;  break;
             case GGML_TYPE_IQ2_BN:  ftype = LLAMA_FTYPE_MOSTLY_IQ2_BN;  break;
+            case GGML_TYPE_IQ1_TN:  ftype = LLAMA_FTYPE_MOSTLY_IQ1_TN;  break;
             case GGML_TYPE_IQ2_TN:  ftype = LLAMA_FTYPE_MOSTLY_IQ2_TN;  break;
             case GGML_TYPE_IQ4_NL:  ftype = LLAMA_FTYPE_MOSTLY_IQ4_NL;  break;
             case GGML_TYPE_IQ4_XS:  ftype = LLAMA_FTYPE_MOSTLY_IQ4_XS;  break;
@@ -4497,8 +4498,9 @@ static std::string llama_model_ftype_name(llama_ftype ftype) {
         case LLAMA_FTYPE_MOSTLY_IQ5_K:    return "IQ5_K - 5.5 bpw";
         case LLAMA_FTYPE_MOSTLY_IQ6_K:    return "IQ6_K - 6.6 bpw";
         case LLAMA_FTYPE_MOSTLY_IQ1_BN:   return "IQ1_BN - 1.625 bpw Bitnet";
+        case LLAMA_FTYPE_MOSTLY_IQ1_TN:   return "IQ1_TN - 1.6875 bpw TriLM";
         case LLAMA_FTYPE_MOSTLY_IQ2_BN:   return "IQ2_BN - 2.00 bpw Bitnet";
-        case LLAMA_FTYPE_MOSTLY_IQ2_TN:   return "IQT_BN - 2.06 bpw TriLM";
+        case LLAMA_FTYPE_MOSTLY_IQ2_TN:   return "IQ2_TN - 2.06 bpw TriLM";
         case LLAMA_FTYPE_MOSTLY_IQ3_S:    return "IQ3_S - 3.4375 bpw";
         case LLAMA_FTYPE_MOSTLY_IQ3_M:    return "IQ3_S mix - 3.66 bpw";
         case LLAMA_FTYPE_MOSTLY_Q4_0_4_4: return "Q4_0_4_4";
@@ -15644,7 +15646,7 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
     else if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_BN || ftype == LLAMA_FTYPE_MOSTLY_IQ2_BN) {
         new_type = GGML_TYPE_IQ4_NL;
     }
-    else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_TN) {
+    else if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_TN || ftype == LLAMA_FTYPE_MOSTLY_IQ2_TN) {
         new_type = GGML_TYPE_Q4_K;
     }
     else if (new_type == GGML_TYPE_Q4_0_4_4 || new_type == GGML_TYPE_Q4_0_4_8 ||
@@ -15856,7 +15858,7 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
         new_type == GGML_TYPE_IQ3_XXS || new_type == GGML_TYPE_IQ1_S || new_type == GGML_TYPE_IQ3_S ||
         new_type == GGML_TYPE_IQ1_M || new_type == GGML_TYPE_IQ4_K || new_type == GGML_TYPE_IQ2_K ||
         new_type == GGML_TYPE_IQ5_K || new_type == GGML_TYPE_IQ3_K || new_type == GGML_TYPE_IQ2_TN ||
-        new_type == GGML_TYPE_IQ6_K) {
+        new_type == GGML_TYPE_IQ6_K || new_type == GGML_TYPE_IQ1_TN) {
         int nx = tensor->ne[0];
         int ny = tensor->ne[1];
         if (nx % QK_K != 0) {
@@ -15881,6 +15883,7 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
         case GGML_TYPE_IQ3_S:
         case GGML_TYPE_IQ1_S:
         case GGML_TYPE_IQ1_M:
+        case GGML_TYPE_IQ1_TN:
         case GGML_TYPE_IQ2_TN:
         case GGML_TYPE_Q2_K:
         case GGML_TYPE_Q3_K:
@@ -15991,6 +15994,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
         case LLAMA_FTYPE_MOSTLY_IQ1_M:  default_type = GGML_TYPE_IQ1_M;  break;
         case LLAMA_FTYPE_MOSTLY_IQ1_BN: default_type = GGML_TYPE_IQ1_BN; break;
         case LLAMA_FTYPE_MOSTLY_IQ2_BN: default_type = GGML_TYPE_IQ2_BN; break;
+        case LLAMA_FTYPE_MOSTLY_IQ1_TN: default_type = GGML_TYPE_IQ1_TN; break;
         case LLAMA_FTYPE_MOSTLY_IQ2_TN: default_type = GGML_TYPE_IQ2_TN; break;
         case LLAMA_FTYPE_MOSTLY_IQ4_NL: default_type = GGML_TYPE_IQ4_NL; break;
         case LLAMA_FTYPE_MOSTLY_IQ4_XS: default_type = GGML_TYPE_IQ4_XS; break;
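End-to-end usage note (not part of the patch): with the new ftype wired into `llama_model_quantize_internal()` above, producing an IQ1_TN model comes down to passing `LLAMA_FTYPE_MOSTLY_IQ1_TN` to the public quantization API, or selecting `IQ1_TN` in the quantize example whose option table was extended at the top of this diff. A minimal sketch, assuming a build that contains this patch:

```cpp
// Minimal sketch, assuming a llama.cpp-fork build that contains this patch.
#include "llama.h"

int main(int argc, char ** argv) {
    if (argc < 3) return 1;   // usage: <prog> input-f16.gguf output-iq1_tn.gguf
    llama_model_quantize_params params = llama_model_quantize_default_params();
    params.ftype = LLAMA_FTYPE_MOSTLY_IQ1_TN;   // the ftype added by this patch
    // returns 0 on success; routes through llama_model_quantize_internal()
    return llama_model_quantize(argv[1], argv[2], &params);
}
```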