From 5c42877a38ec6f942b126eef2f3d686c43031865 Mon Sep 17 00:00:00 2001
From: Iwan Kawrakow
Date: Thu, 24 Oct 2024 14:15:03 +0200
Subject: [PATCH] Remove iq1_tn and iq2_tn - Part 1

Now that iq1_bn and iq2_bn have per-row scales, there is no reason to also have iq1_tn and iq2_tn.
---
 examples/quantize/quantize.cpp | 2 -
 ggml/include/ggml.h | 4 +-
 ggml/src/ggml-common.h | 17 +-
 ggml/src/ggml-metal.m | 56 +----
 ggml/src/ggml-metal.metal | 269 ----------------------
 ggml/src/ggml-quants.c | 2 -
 ggml/src/ggml.c | 44 ----
 ggml/src/iqk/iqk_mul_mat.cpp | 409 +--------------------------------
 ggml/src/iqk/iqk_quantize.cpp | 133 -----------
 ggml/src/iqk/iqk_quantize.h | 12 -
 include/llama.h | 2 -
 src/llama.cpp | 15 +-
 12 files changed, 20 insertions(+), 945 deletions(-)

diff --git a/examples/quantize/quantize.cpp b/examples/quantize/quantize.cpp index c88033b6..b5907e2b 100644 --- a/examples/quantize/quantize.cpp +++ b/examples/quantize/quantize.cpp @@ -29,8 +29,6 @@ static const std::vector QUANT_OPTIONS = { { "IQ1_M", LLAMA_FTYPE_MOSTLY_IQ1_M, " 1.75 bpw quantization", }, { "IQ1_BN", LLAMA_FTYPE_MOSTLY_IQ1_BN, " 1.62 bpw quantization (Bitnet)", }, { "IQ2_BN", LLAMA_FTYPE_MOSTLY_IQ2_BN, " 2.00 bpw quantization (Bitnet)", }, - { "IQ1_TN", LLAMA_FTYPE_MOSTLY_IQ1_TN, " 1.63 bpw quantization (TriLM)", }, - { "IQ2_TN", LLAMA_FTYPE_MOSTLY_IQ2_TN, " 2.00 bpw quantization (TriLM)", }, { "Q2_K", LLAMA_FTYPE_MOSTLY_Q2_K, " 2.63G, +0.6717 ppl @ LLaMA-v1-7B", }, { "Q2_K_S", LLAMA_FTYPE_MOSTLY_Q2_K_S, " 2.16G, +9.0634 ppl @ LLaMA-v1-7B", }, { "IQ3_XXS", LLAMA_FTYPE_MOSTLY_IQ3_XXS, " 3.06 bpw quantization", }, diff --git a/ggml/include/ggml.h b/ggml/include/ggml.h index a99dc6b5..df074b10 100644 --- a/ggml/include/ggml.h +++ b/ggml/include/ggml.h @@ -401,8 +401,8 @@ extern "C" { GGML_TYPE_IQ4_K = 139, GGML_TYPE_IQ5_K = 140, GGML_TYPE_IQ6_K = 141, - GGML_TYPE_IQ2_TN = 142, - GGML_TYPE_IQ1_TN = 143, + // deprecated: GGML_TYPE_IQ2_TN = 142, + // deprecated: GGML_TYPE_IQ1_TN = 143, GGML_TYPE_IQ4_KS = 144, GGML_TYPE_IQ2_KS = 145, GGML_TYPE_IQ4_KSS = 146, diff --git a/ggml/src/ggml-common.h b/ggml/src/ggml-common.h index f8824b0e..f0c1ae68 100644 --- a/ggml/src/ggml-common.h +++ b/ggml/src/ggml-common.h @@ -389,9 +389,7 @@ typedef struct { static_assert(sizeof(block_iq1_m) == QK_K/8 + QK_K/16 + QK_K/32, "wrong iq1_m block size/padding"); // -// Bitnet - implemented as 1.625 bpw -// The block scale is a waste, but it allows us to plug it in without any additional -// changes to ggml.
+// Bitnet and TriLM - implemented as 1.625 bpw // #define QK_IQ1BN 64 typedef struct { @@ -400,24 +398,13 @@ typedef struct { } block_iq1_bn; static_assert(sizeof(block_iq1_bn) == 13, "wrong iq1_bn block size/padding"); // -// Bitnet - implemented as 2.0 bpw +// Bitnet and TriLM - implemented as 2.0 bpw // #define QK_IQ2BN 64 typedef struct { uint8_t qs[QK_IQ2BN/4]; } block_iq2_bn; static_assert(sizeof(block_iq2_bn) == QK_IQ2BN/4, "wrong iq2_bn block size/padding"); -// -// TriLM - implemented as 2.0625 bpw -// -typedef struct { - uint8_t qs[52]; -} block_iq1_tn; -static_assert(sizeof(block_iq1_tn) == 52, "wrong iq1_tn block size/padding"); -typedef struct { - uint8_t qs[QK_K/4]; -} block_iq2_tn; -static_assert(sizeof(block_iq2_tn) == QK_K/4, "wrong iqt_bn block size/padding"); // Used by IQ1_M quants typedef union { diff --git a/ggml/src/ggml-metal.m b/ggml/src/ggml-metal.m index 9f696383..8d350aa1 100644 --- a/ggml/src/ggml-metal.m +++ b/ggml/src/ggml-metal.m @@ -101,9 +101,7 @@ enum ggml_metal_kernel_type { GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ1_S, GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ1_M, GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ1_BN, - GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ1_TN, GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ2_BN, - GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ2_TN, GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ4_NL, GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ4_XS, GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ4_KS, @@ -145,9 +143,7 @@ enum ggml_metal_kernel_type { GGML_METAL_KERNEL_TYPE_MUL_MV_IQ1_S_F32, GGML_METAL_KERNEL_TYPE_MUL_MV_IQ1_M_F32, GGML_METAL_KERNEL_TYPE_MUL_MV_IQ1_BN_F32, - GGML_METAL_KERNEL_TYPE_MUL_MV_IQ1_TN_F32, GGML_METAL_KERNEL_TYPE_MUL_MV_IQ2_BN_F32, - GGML_METAL_KERNEL_TYPE_MUL_MV_IQ2_TN_F32, GGML_METAL_KERNEL_TYPE_MUL_MV_IQ4_NL_F32, GGML_METAL_KERNEL_TYPE_MUL_MV_IQ4_XS_F32, GGML_METAL_KERNEL_TYPE_MUL_MV_IQ4_KS_F32, @@ -183,9 +179,7 @@ enum ggml_metal_kernel_type { GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ1_S_F32, GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ1_M_F32, GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ1_BN_F32, - GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ1_TN_F32, GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ2_BN_F32, - GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ2_TN_F32, GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ4_NL_F32, GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ4_XS_F32, GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ4_KS_F32, @@ -218,9 +212,7 @@ enum ggml_metal_kernel_type { GGML_METAL_KERNEL_TYPE_MUL_MM_IQ1_S_F32, GGML_METAL_KERNEL_TYPE_MUL_MM_IQ1_M_F32, GGML_METAL_KERNEL_TYPE_MUL_MM_IQ1_BN_F32, - GGML_METAL_KERNEL_TYPE_MUL_MM_IQ1_TN_F32, GGML_METAL_KERNEL_TYPE_MUL_MM_IQ2_BN_F32, - GGML_METAL_KERNEL_TYPE_MUL_MM_IQ2_TN_F32, GGML_METAL_KERNEL_TYPE_MUL_MM_IQ4_NL_F32, GGML_METAL_KERNEL_TYPE_MUL_MM_IQ4_XS_F32, GGML_METAL_KERNEL_TYPE_MUL_MM_IQ4_KS_F32, @@ -253,9 +245,7 @@ enum ggml_metal_kernel_type { GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ1_S_F32, GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ1_M_F32, GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ1_BN_F32, - GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ1_TN_F32, GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ2_BN_F32, - GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ2_TN_F32, GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ4_NL_F32, GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ4_XS_F32, GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ4_KS_F32, @@ -649,9 +639,7 @@ static struct ggml_backend_metal_context * ggml_metal_init(int n_cb) { GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ1_S, get_rows_iq1_s, true); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ1_M, get_rows_iq1_m, true); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ1_BN, get_rows_iq1_bn, true); - 
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ1_TN, get_rows_iq1_tn, true); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ2_BN, get_rows_iq2_bn, true); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ2_TN, get_rows_iq2_tn, true); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ4_NL, get_rows_iq4_nl, true); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ4_XS, get_rows_iq4_xs, true); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ4_KS, get_rows_iq4_ks, true); @@ -693,9 +681,7 @@ static struct ggml_backend_metal_context * ggml_metal_init(int n_cb) { GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_IQ1_S_F32, mul_mv_iq1_s_f32, ctx->support_simdgroup_reduction); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_IQ1_M_F32, mul_mv_iq1_m_f32, ctx->support_simdgroup_reduction); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_IQ1_BN_F32, mul_mv_iq1_bn_f32, ctx->support_simdgroup_reduction); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_IQ1_TN_F32, mul_mv_iq1_tn_f32, ctx->support_simdgroup_reduction); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_IQ2_BN_F32, mul_mv_iq2_bn_f32, ctx->support_simdgroup_reduction); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_IQ2_TN_F32, mul_mv_iq2_tn_f32, ctx->support_simdgroup_reduction); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_IQ4_NL_F32, mul_mv_iq4_nl_f32, ctx->support_simdgroup_reduction); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_IQ4_XS_F32, mul_mv_iq4_xs_f32, ctx->support_simdgroup_reduction); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_IQ4_KS_F32, mul_mv_iq4_ks_f32, ctx->support_simdgroup_reduction); @@ -731,9 +717,7 @@ static struct ggml_backend_metal_context * ggml_metal_init(int n_cb) { GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ1_S_F32, mul_mv_id_iq1_s_f32, ctx->support_simdgroup_reduction); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ1_M_F32, mul_mv_id_iq1_m_f32, ctx->support_simdgroup_reduction); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ1_BN_F32, mul_mv_id_iq1_bn_f32, ctx->support_simdgroup_reduction); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ1_TN_F32, mul_mv_id_iq1_tn_f32, ctx->support_simdgroup_reduction); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ2_BN_F32, mul_mv_id_iq2_bn_f32, ctx->support_simdgroup_reduction); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ2_TN_F32, mul_mv_id_iq2_tn_f32, ctx->support_simdgroup_reduction); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ4_NL_F32, mul_mv_id_iq4_nl_f32, ctx->support_simdgroup_reduction); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ4_XS_F32, mul_mv_id_iq4_xs_f32, ctx->support_simdgroup_reduction); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ4_KS_F32, mul_mv_id_iq4_ks_f32, ctx->support_simdgroup_reduction); @@ -766,9 +750,7 @@ static struct ggml_backend_metal_context * ggml_metal_init(int n_cb) { GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_IQ1_S_F32, mul_mm_iq1_s_f32, ctx->support_simdgroup_mm); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_IQ1_M_F32, mul_mm_iq1_m_f32, ctx->support_simdgroup_mm); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_IQ1_BN_F32, mul_mm_iq1_bn_f32, ctx->support_simdgroup_mm); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_IQ1_TN_F32, mul_mm_iq1_tn_f32, ctx->support_simdgroup_mm); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_IQ2_BN_F32, mul_mm_iq2_bn_f32, ctx->support_simdgroup_mm); - 
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_IQ2_TN_F32, mul_mm_iq2_tn_f32, ctx->support_simdgroup_mm); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_IQ4_NL_F32, mul_mm_iq4_nl_f32, ctx->support_simdgroup_mm); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_IQ4_XS_F32, mul_mm_iq4_xs_f32, ctx->support_simdgroup_mm); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_IQ4_KS_F32, mul_mm_iq4_ks_f32, ctx->support_simdgroup_mm); @@ -801,9 +783,7 @@ static struct ggml_backend_metal_context * ggml_metal_init(int n_cb) { GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ1_S_F32, mul_mm_id_iq1_s_f32, ctx->support_simdgroup_mm); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ1_M_F32, mul_mm_id_iq1_m_f32, ctx->support_simdgroup_mm); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ1_BN_F32, mul_mm_id_iq1_bn_f32, ctx->support_simdgroup_mm); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ1_TN_F32, mul_mm_id_iq1_tn_f32, ctx->support_simdgroup_mm); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ2_BN_F32, mul_mm_id_iq2_bn_f32, ctx->support_simdgroup_mm); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ2_TN_F32, mul_mm_id_iq2_tn_f32, ctx->support_simdgroup_mm); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ4_NL_F32, mul_mm_id_iq4_nl_f32, ctx->support_simdgroup_mm); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ4_XS_F32, mul_mm_id_iq4_xs_f32, ctx->support_simdgroup_mm); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ4_KS_F32, mul_mm_id_iq4_ks_f32, ctx->support_simdgroup_mm); @@ -2001,9 +1981,7 @@ static enum ggml_status ggml_metal_graph_compute( case GGML_TYPE_IQ1_S: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_IQ1_S_F32 ].pipeline; break; case GGML_TYPE_IQ1_M: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_IQ1_M_F32 ].pipeline; break; case GGML_TYPE_IQ1_BN: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_IQ1_BN_F32 ].pipeline; break; - case GGML_TYPE_IQ1_TN: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_IQ1_TN_F32 ].pipeline; break; case GGML_TYPE_IQ2_BN: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_IQ2_BN_F32 ].pipeline; break; - case GGML_TYPE_IQ2_TN: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_IQ2_TN_F32 ].pipeline; break; case GGML_TYPE_IQ4_NL: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_IQ4_NL_F32 ].pipeline; break; case GGML_TYPE_IQ4_XS: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_IQ4_XS_F32 ].pipeline; break; case GGML_TYPE_IQ4_KS: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_IQ4_KS_F32 ].pipeline; break; @@ -2197,24 +2175,12 @@ static enum ggml_status ggml_metal_graph_compute( nth1 = 16; pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_IQ1_BN_F32].pipeline; } break; - case GGML_TYPE_IQ1_TN: - { - nth0 = 4; - nth1 = 16; - pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_IQ1_TN_F32].pipeline; - } break; case GGML_TYPE_IQ2_BN: { nth0 = 4; nth1 = 16; pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_IQ2_BN_F32].pipeline; } break; - case GGML_TYPE_IQ2_TN: - { - nth0 = 4; - nth1 = 16; - pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_IQ2_TN_F32].pipeline; - } break; case GGML_TYPE_IQ4_NL: { nth0 = 4; @@ -2306,8 +2272,7 @@ static enum ggml_status ggml_metal_graph_compute( if (src0t == GGML_TYPE_Q4_0 || src0t == GGML_TYPE_Q4_1 || src0t == GGML_TYPE_Q5_0 || src0t == GGML_TYPE_Q5_1 || src0t == GGML_TYPE_Q8_0 || src0t == GGML_TYPE_Q2_K || src0t == GGML_TYPE_IQ1_S || src0t == GGML_TYPE_IQ1_M || src0t == 
GGML_TYPE_IQ2_S|| - src0t == GGML_TYPE_IQ1_BN|| src0t == GGML_TYPE_IQ2_BN|| src0t == GGML_TYPE_Q6_0 || - src0t == GGML_TYPE_IQ2_TN|| src0t == GGML_TYPE_IQ1_TN) { + src0t == GGML_TYPE_IQ1_BN|| src0t == GGML_TYPE_IQ2_BN|| src0t == GGML_TYPE_Q6_0) { [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 7)/8, ne11, ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)]; } else if (src0t == GGML_TYPE_IQ2_KS || src0t == GGML_TYPE_IQ2_K || src0t == GGML_TYPE_IQ3_K) { @@ -2417,9 +2382,7 @@ static enum ggml_status ggml_metal_graph_compute( case GGML_TYPE_IQ1_S: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ1_S_F32 ].pipeline; break; case GGML_TYPE_IQ1_M: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ1_M_F32 ].pipeline; break; case GGML_TYPE_IQ1_BN: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ1_BN_F32 ].pipeline; break; - case GGML_TYPE_IQ1_TN: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ1_TN_F32 ].pipeline; break; case GGML_TYPE_IQ2_BN: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ2_BN_F32 ].pipeline; break; - case GGML_TYPE_IQ2_TN: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ2_TN_F32 ].pipeline; break; case GGML_TYPE_IQ4_NL: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ4_NL_F32 ].pipeline; break; case GGML_TYPE_IQ4_XS: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ4_XS_F32 ].pipeline; break; case GGML_TYPE_IQ4_KS: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ4_KS_F32 ].pipeline; break; @@ -2601,24 +2564,12 @@ static enum ggml_status ggml_metal_graph_compute( nth1 = 16; pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ1_BN_F32].pipeline; } break; - case GGML_TYPE_IQ1_TN: - { - nth0 = 4; - nth1 = 16; - pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ1_TN_F32].pipeline; - } break; case GGML_TYPE_IQ2_BN: { nth0 = 4; nth1 = 16; pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ2_BN_F32].pipeline; } break; - case GGML_TYPE_IQ2_TN: - { - nth0 = 4; - nth1 = 16; - pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ2_TN_F32].pipeline; - } break; case GGML_TYPE_IQ4_NL: { nth0 = 4; @@ -2721,8 +2672,7 @@ static enum ggml_status ggml_metal_graph_compute( if (src0t == GGML_TYPE_Q4_0 || src0t == GGML_TYPE_Q4_1 || src0t == GGML_TYPE_Q5_0 || src0t == GGML_TYPE_Q5_1 || src0t == GGML_TYPE_Q8_0 || src0t == GGML_TYPE_Q2_K || src0t == GGML_TYPE_IQ1_S || src0t == GGML_TYPE_IQ1_M || src0t == GGML_TYPE_Q6_0 || - src0t == GGML_TYPE_IQ1_BN|| src0t == GGML_TYPE_IQ2_BN|| src0t == GGML_TYPE_IQ2_K|| - src0t == GGML_TYPE_IQ2_TN|| src0t == GGML_TYPE_IQ1_TN) { + src0t == GGML_TYPE_IQ1_BN|| src0t == GGML_TYPE_IQ2_BN|| src0t == GGML_TYPE_IQ2_K) { [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 7)/8, _ne1, tgz) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)]; } else if (src0t == GGML_TYPE_IQ2_KS || src0t == GGML_TYPE_IQ2_K || src0t == GGML_TYPE_IQ3_K) { @@ -2790,9 +2740,7 @@ static enum ggml_status ggml_metal_graph_compute( case GGML_TYPE_IQ1_S: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ1_S ].pipeline; break; case GGML_TYPE_IQ1_M: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ1_M ].pipeline; break; case GGML_TYPE_IQ1_BN: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ1_BN ].pipeline; break; - case GGML_TYPE_IQ1_TN: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ1_TN ].pipeline; break; case GGML_TYPE_IQ2_BN: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ2_BN ].pipeline; break; - case GGML_TYPE_IQ2_TN: 
pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ2_TN ].pipeline; break; case GGML_TYPE_IQ4_NL: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ4_NL ].pipeline; break; case GGML_TYPE_IQ4_XS: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ4_XS ].pipeline; break; case GGML_TYPE_IQ4_KS: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ4_KS ].pipeline; break; diff --git a/ggml/src/ggml-metal.metal b/ggml/src/ggml-metal.metal index 577e399d..bc0ea9f5 100644 --- a/ggml/src/ggml-metal.metal +++ b/ggml/src/ggml-metal.metal @@ -3948,128 +3948,6 @@ kernel void kernel_mul_mv_q2_K_f32( kernel_mul_mv_q2_K_f32_impl(src0, src1, dst, ne00, ne01, ne02, ne10, ne12, ne0, ne1, r2, r3, nullptr, tgpig, tiisg, sgitg); } -void kernel_mul_mv_iq2_tn_f32_impl( - device const void * src0, - device const float * src1, - device float * dst, - int64_t ne00, - int64_t ne01, - int64_t ne02, - int64_t ne10, - int64_t ne12, - int64_t ne0, - int64_t ne1, - uint r2, - uint r3, - threadgroup int8_t * shared_values, - uint3 tgpig, - uint tiisg, - uint sgitg) { - - const int nb = ne00/QK_K; - const int r0 = tgpig.x; - const int r1 = tgpig.y; - const int im = tgpig.z; - - const int row_size = nb*sizeof(block_iq2_tn) + 4; - - const int first_row = (r0 * N_SIMDGROUP + sgitg) * N_DST; - - const uint i12 = im%ne12; - const uint i13 = im/ne12; - - const uint offset0 = ((i12/r2)*ne01 + (i13/r3)*ne01*ne02)*row_size; - - device const char * cx = (device const char *) src0 + first_row*row_size + offset0; - device const float * y = (device const float*) src1 + r1*ne10 + im*ne00*ne1; - - float yl[32]; - float sumf[N_DST]={0.f}, all_sum; - float drow[N_DST]; - - const int ix = tiisg/8; // 0...3 - const int it = tiisg%8; // 0...7 - const int iq = it/4; // 0 or 1 - const int ir = it%4; // 0...3 - - device const float * y4 = y + ix * QK_K + 128 * iq + 8 * ir; - - for (int row = 0; row < N_DST; row++) drow[row] = *((device const float *)(cx + row*row_size)); - - for (int ib = ix; ib < nb; ib += 4) { - - float sumy = 0.f; - for (int i = 0; i < 8; ++i) { - yl[i+ 0] = y4[i+ 0]; sumy += yl[i+ 0]; - yl[i+ 8] = y4[i+32]; sumy += yl[i+ 8]; - yl[i+16] = y4[i+64]; sumy += yl[i+16]; - yl[i+24] = y4[i+96]; sumy += yl[i+24]; - } - - device const block_iq2_tn * x = (device const block_iq2_tn *)(cx + 4); - device const uint16_t * qs = (device const uint16_t *)x[ib].qs + 16 * iq + 4 * ir; - - for (int row = 0; row < N_DST; row++) { - - float4 acc1 = {0.f, 0.f, 0.f, 0.f}; - float4 acc2 = {0.f, 0.f, 0.f, 0.f}; - for (int i = 0; i < 8; i += 2) { - acc1[0] += yl[i+ 0] * (qs[i/2] & 0x0003); - acc2[0] += yl[i+ 1] * (qs[i/2] & 0x0300); - acc1[1] += yl[i+ 8] * (qs[i/2] & 0x000c); - acc2[1] += yl[i+ 9] * (qs[i/2] & 0x0c00); - acc1[2] += yl[i+16] * (qs[i/2] & 0x0030); - acc2[2] += yl[i+17] * (qs[i/2] & 0x3000); - acc1[3] += yl[i+24] * (qs[i/2] & 0x00c0); - acc2[3] += yl[i+25] * (qs[i/2] & 0xc000); - } - sumf[row] += (acc1[0] + 1.f/256.f * acc2[0]) * 1.f/ 1.f + - (acc1[1] + 1.f/256.f * acc2[1]) * 1.f/ 4.f + - (acc1[2] + 1.f/256.f * acc2[2]) * 1.f/16.f + - (acc1[3] + 1.f/256.f * acc2[3]) * 1.f/64.f - sumy; - - qs += row_size/2; - } - - y4 += 4 * QK_K; - } - - for (int row = 0; row < N_DST; ++row) { - all_sum = simd_sum(sumf[row]); - if (tiisg == 0) { - dst[r1*ne0 + im*ne0*ne1 + first_row + row] = drow[row]*all_sum; - } - } -} - -[[host_name("kernel_mul_mv_iq2_tn_f32")]] -kernel void kernel_mul_mv_iq2_tn_f32( - device const void * src0, - device const float * src1, - device float * dst, - constant int64_t & ne00, - constant int64_t & 
ne01, - constant int64_t & ne02, - constant uint64_t & nb00, - constant uint64_t & nb01, - constant uint64_t & nb02, - constant int64_t & ne10, - constant int64_t & ne11, - constant int64_t & ne12, - constant uint64_t & nb10, - constant uint64_t & nb11, - constant uint64_t & nb12, - constant int64_t & ne0, - constant int64_t & ne1, - constant uint & r2, - constant uint & r3, - uint3 tgpig[[threadgroup_position_in_grid]], - uint tiisg[[thread_index_in_simdgroup]], - uint sgitg[[simdgroup_index_in_threadgroup]]) { - - kernel_mul_mv_iq2_tn_f32_impl(src0, src1, dst, ne00, ne01, ne02, ne10, ne12, ne0, ne1, r2, r3, nullptr, tgpig, tiisg, sgitg); -} - void kernel_mul_mv_q3_K_f32_impl( device const void * src0, device const float * src1, @@ -5631,104 +5509,6 @@ void kernel_mul_mv_iq1_bn_f32_impl( } } -void kernel_mul_mv_iq1_tn_f32_impl( - device const void * src0, - device const float * src1, - device float * dst, - int64_t ne00, - int64_t ne01, - int64_t ne02, - int64_t ne10, - int64_t ne12, - int64_t ne0, - int64_t ne1, - uint r2, - uint r3, - threadgroup int8_t * shared_value, - uint3 tgpig, - uint tiisg, - uint sgitg) { - - const int nb = ne00/QK_IQ1BN; - const int r0 = tgpig.x; - const int r1 = tgpig.y; - const int im = tgpig.z; - - // Why are we not passing in src0->nb[0]? - // But because we are not, we need to use this hack - const uint row_size = 2 + sizeof(block_iq1_bn)*(ne00/QK_IQ1BN); - - const int first_row = (r0 * N_SIMDGROUP + sgitg) * N_DST; - - const uint i12 = im%ne12; - const uint i13 = im/ne12; - - const uint offset0 = ((i12/r2)*ne01 + (i13/r3)*(ne01*ne02))*row_size; - device const char * cx = (device const char *) src0 + first_row*row_size + offset0; - device const float * y = (device const float *) src1 + r1*ne10 + im*ne00*ne1; - - float yl[16]; - float sumf[N_DST]={0.f}; - - const int nb32 = nb * (QK_IQ1BN / 32); - - const int ix = tiisg/2; - const int ir = tiisg%2; - - device const float * y4 = (device const float *)y + 32 * ix + 16 * ir; - - const float values[3] = {-1.f, 0.f, 1.f}; - - constexpr uint8_t k_mult[5] = {81, 27, 9, 3, 1}; - - for (int ib32 = ix; ib32 < nb32; ib32 += 16) { - - for (int j = 0; j < 16; ++j) yl[j] = y4[j]; - - const int ibl = ib32 / (QK_IQ1BN / 32); - const int ib = ib32 % (QK_IQ1BN / 32); - const int i16 = 2*ib + ir; - - device const half * dh = (device const half *)cx; - device const block_iq1_bn * xr = (device const block_iq1_bn *)(dh + 1) + ibl; - device const uint8_t * ql = xr->ql + 3*i16; - device const uint8_t * extra = (device const uint8_t *)&xr->extra; - - for (int row = 0; row < N_DST; row++) { - - float acc = 0; - int i = 0; - for (int k = 0; k < 3; ++k) { - uint8_t q = ql[k]; - for (int j = 0; j < 5; ++j) { - uint8_t v = k_mult[j]*q; - v = 3*v >> 8; //(v + (v >> 1)) >> 7; - acc += yl[i++] * values[v]; - } - } - uint8_t v = k_mult[i16]*extra[0]; - v = 3*v >> 8; //(v + (v >> 1)) >> 7; - acc += yl[15] * values[v]; - - sumf[row] += acc * (float)dh[0]; - - extra += row_size; - ql += row_size; - dh += row_size/2; - } - - y4 += 32 * 16; - } - - for (int row = 0; row < N_DST; row += 2) { - half2 r = {(half)sumf[row], (half)sumf[row+1]}; - r = simd_sum(r); - if (tiisg < 2) { - dst[r1*ne0 + im*ne0*ne1 + first_row + row + tiisg] = r[tiisg]; - } - } -} - void kernel_mul_mv_iq2_bn_f32_impl( device const void * src0, device const float * src1, @@ -7034,34 +6814,6 @@ kernel void kernel_mul_mv_iq1_bn_f32( kernel_mul_mv_iq1_bn_f32_impl(src0, src1, dst, ne00, ne01, ne02, ne10, ne12, ne0, ne1, r2, r3, nullptr, tgpig, tiisg, sgitg); } 
-[[host_name("kernel_mul_mv_iq1_tn_f32")]] -kernel void kernel_mul_mv_iq1_tn_f32( - device const void * src0, - device const float * src1, - device float * dst, - constant int64_t & ne00, - constant int64_t & ne01, - constant int64_t & ne02, - constant uint64_t & nb00, - constant uint64_t & nb01, - constant uint64_t & nb02, - constant int64_t & ne10, - constant int64_t & ne11, - constant int64_t & ne12, - constant uint64_t & nb10, - constant uint64_t & nb11, - constant uint64_t & nb12, - constant int64_t & ne0, - constant int64_t & ne1, - constant uint & r2, - constant uint & r3, - uint3 tgpig[[threadgroup_position_in_grid]], - uint tiisg[[thread_index_in_simdgroup]], - uint sgitg[[simdgroup_index_in_threadgroup]]) { - - kernel_mul_mv_iq1_tn_f32_impl(src0, src1, dst, ne00, ne01, ne02, ne10, ne12, ne0, ne1, r2, r3, nullptr, tgpig, tiisg, sgitg); -} - [[host_name("kernel_mul_mv_iq2_bn_f32")]] kernel void kernel_mul_mv_iq2_bn_f32( device const void * src0, @@ -7453,19 +7205,6 @@ void dequantize_q2_K(device const block_q2_K * xb, short il, thread type4x4 & re } } -template -void dequantize_iq2_tn(device const block_iq2_tn * xb, short il, thread type4x4 & reg) { - device const uint8_t * q = (device const uint8_t *)xb->qs + 32*(il/8) + 16*(il&1); - - il = (il/2)%4; - - half coef = il>1 ? (il>2 ? 1/64.h : 1/16.h) : (il>0 ? 1/4.h : 1.h); - uchar mask = il>1 ? (il>2 ? 192 : 48) : (il>0 ? 12 : 3); - for (int i = 0; i < 16; ++i) { - reg[i/4][i%4] = coef * (q[i] & mask) - 1; - } -} - template void dequantize_q3_K(device const block_q3_K *xb, short il, thread type4x4 & reg) { const half d_all = xb->d; @@ -8510,9 +8249,7 @@ template [[host_name("kernel_get_rows_iq3_k")]] kernel get_rows_q_t kernel_get template [[host_name("kernel_get_rows_iq4_k")]] kernel get_rows_q_t kernel_get_rows_q; template [[host_name("kernel_get_rows_iq5_k")]] kernel get_rows_q_t kernel_get_rows_q; template [[host_name("kernel_get_rows_iq6_k")]] kernel get_rows_q_t kernel_get_rows_q; -template [[host_name("kernel_get_rows_iq1_tn")]] kernel get_rows_q_t kernel_get_rows_q2>; template [[host_name("kernel_get_rows_iq1_bn")]] kernel get_rows_q_t kernel_get_rows_q2>; -template [[host_name("kernel_get_rows_iq2_tn")]] kernel get_rows_q_t kernel_get_rows_q2>; template [[host_name("kernel_get_rows_iq2_bn")]] kernel get_rows_q_t kernel_get_rows_q2>; template [[host_name("kernel_get_rows_iq4_ks")]] kernel get_rows_q_t kernel_get_rows_q2>; template [[host_name("kernel_get_rows_iq4_kss")]] kernel get_rows_q_t kernel_get_rows_q2>; @@ -8555,9 +8292,7 @@ template [[host_name("kernel_mul_mm_iq3_k_f32")]] kernel mat_mm_t kernel_mul_m template [[host_name("kernel_mul_mm_iq4_k_f32")]] kernel mat_mm_t kernel_mul_mm>; template [[host_name("kernel_mul_mm_iq5_k_f32")]] kernel mat_mm_t kernel_mul_mm>; template [[host_name("kernel_mul_mm_iq6_k_f32")]] kernel mat_mm_t kernel_mul_mm>; -template [[host_name("kernel_mul_mm_iq1_tn_f32")]] kernel mat_mm_t kernel_mul_mm>; template [[host_name("kernel_mul_mm_iq1_bn_f32")]] kernel mat_mm_t kernel_mul_mm>; -template [[host_name("kernel_mul_mm_iq2_tn_f32")]] kernel mat_mm_t kernel_mul_mm>; template [[host_name("kernel_mul_mm_iq2_bn_f32")]] kernel mat_mm_t kernel_mul_mm>; template [[host_name("kernel_mul_mm_iq4_ks_f32")]] kernel mat_mm_t kernel_mul_mm>; template [[host_name("kernel_mul_mm_iq4_kss_f32")]] kernel mat_mm_t kernel_mul_mm>; @@ -8597,9 +8332,7 @@ template [[host_name("kernel_mul_mm_id_iq3_k_f32")]] kernel mat_mm_id_t kernel template [[host_name("kernel_mul_mm_id_iq4_k_f32")]] kernel mat_mm_id_t 
kernel_mul_mm_id>; template [[host_name("kernel_mul_mm_id_iq5_k_f32")]] kernel mat_mm_id_t kernel_mul_mm_id>; template [[host_name("kernel_mul_mm_id_iq6_k_f32")]] kernel mat_mm_id_t kernel_mul_mm_id>; -template [[host_name("kernel_mul_mm_id_iq1_tn_f32")]] kernel mat_mm_id_t kernel_mul_mm_id>; template [[host_name("kernel_mul_mm_id_iq1_bn_f32")]] kernel mat_mm_id_t kernel_mul_mm_id>; -template [[host_name("kernel_mul_mm_id_iq2_tn_f32")]] kernel mat_mm_id_t kernel_mul_mm_id>; template [[host_name("kernel_mul_mm_id_iq2_bn_f32")]] kernel mat_mm_id_t kernel_mul_mm_id>; template [[host_name("kernel_mul_mm_id_iq4_ks_f32")]] kernel mat_mm_id_t kernel_mul_mm_id>; template [[host_name("kernel_mul_mm_id_iq4_kss_f32")]] kernel mat_mm_id_t kernel_mul_mm_id>; @@ -8809,8 +8542,6 @@ template [[host_name("kernel_mul_mv_id_q6_K_f32")]] kernel kernel_mul_mv_id_t template [[host_name("kernel_mul_mv_id_iq1_s_f32")]] kernel kernel_mul_mv_id_t kernel_mul_mv_id>; template [[host_name("kernel_mul_mv_id_iq1_m_f32")]] kernel kernel_mul_mv_id_t kernel_mul_mv_id>; template [[host_name("kernel_mul_mv_id_iq1_bn_f32")]] kernel kernel_mul_mv_id_t kernel_mul_mv_id>; -template [[host_name("kernel_mul_mv_id_iq1_tn_f32")]] kernel kernel_mul_mv_id_t kernel_mul_mv_id>; -template [[host_name("kernel_mul_mv_id_iq2_tn_f32")]] kernel kernel_mul_mv_id_t kernel_mul_mv_id>; template [[host_name("kernel_mul_mv_id_iq2_bn_f32")]] kernel kernel_mul_mv_id_t kernel_mul_mv_id>; template [[host_name("kernel_mul_mv_id_iq2_xxs_f32")]] kernel kernel_mul_mv_id_t kernel_mul_mv_id>; template [[host_name("kernel_mul_mv_id_iq2_xs_f32")]] kernel kernel_mul_mv_id_t kernel_mul_mv_id>; diff --git a/ggml/src/ggml-quants.c b/ggml/src/ggml-quants.c index 68ec6126..d18b1981 100644 --- a/ggml/src/ggml-quants.c +++ b/ggml/src/ggml-quants.c @@ -15194,8 +15194,6 @@ bool ggml_validate_row_data(enum ggml_type type, const void * data, size_t nbyte case GGML_TYPE_IQ4_K: break; case GGML_TYPE_IQ5_K: break; case GGML_TYPE_IQ6_K: break; - case GGML_TYPE_IQ2_TN: break; - case GGML_TYPE_IQ1_TN: break; case GGML_TYPE_IQ4_KS: break; case GGML_TYPE_IQ4_KSS: break; case GGML_TYPE_Q4_0_4_4: diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c index 1da77b9f..5570b1fc 100644 --- a/ggml/src/ggml.c +++ b/ggml/src/ggml.c @@ -1031,32 +1031,6 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = { .nrows = 1, .row_meta_size = 4, }, - [GGML_TYPE_IQ2_TN] = { - .type_name = "iq2_tn", - .blck_size = QK_K, - .type_size = sizeof(block_iq2_tn), - .is_quantized = true, - .to_float = (ggml_to_float_t) dequantize_row_iq2_tn, - .from_float = quantize_row_iq2_tn, - .from_float_ref = (ggml_from_float_t)quantize_row_iq2_tn_ref, - .vec_dot = vec_dot_iq2_tn_q8_k, - .vec_dot_type = GGML_TYPE_Q8_K, - .nrows = 1, - .row_meta_size = 4, - }, - [GGML_TYPE_IQ1_TN] = { - .type_name = "iq1_tn", - .blck_size = QK_K, - .type_size = sizeof(block_iq1_tn), - .is_quantized = true, - .to_float = (ggml_to_float_t) dequantize_row_iq1_tn, - .from_float = quantize_row_iq1_tn, - .from_float_ref = (ggml_from_float_t)quantize_row_iq1_tn_ref, - .vec_dot = vec_dot_iq1_tn_q8_k, - .vec_dot_type = GGML_TYPE_Q8_K64, - .nrows = 1, - .row_meta_size = 2, - }, [GGML_TYPE_IQ4_NL] = { .type_name = "iq4_nl", .blck_size = QK4_NL, @@ -3926,8 +3900,6 @@ enum ggml_type ggml_ftype_to_ggml_type(enum ggml_ftype ftype) { case GGML_FTYPE_MOSTLY_IQ1_M: wtype = GGML_TYPE_IQ1_M; break; case GGML_FTYPE_MOSTLY_IQ1_BN: wtype = GGML_TYPE_IQ1_BN; break; case GGML_FTYPE_MOSTLY_IQ2_BN: wtype = GGML_TYPE_IQ2_BN; break; - case 
GGML_FTYPE_MOSTLY_IQ2_TN: wtype = GGML_TYPE_IQ2_TN; break; - case GGML_FTYPE_MOSTLY_IQ1_TN: wtype = GGML_TYPE_IQ1_TN; break; case GGML_FTYPE_MOSTLY_IQ4_NL: wtype = GGML_TYPE_IQ4_NL; break; case GGML_FTYPE_MOSTLY_IQ4_XS: wtype = GGML_TYPE_IQ4_XS; break; case GGML_FTYPE_MOSTLY_IQ4_KS: wtype = GGML_TYPE_IQ4_KS; break; @@ -10428,8 +10400,6 @@ static void ggml_compute_forward_add( case GGML_TYPE_IQ1_M: case GGML_TYPE_IQ1_BN: case GGML_TYPE_IQ2_BN: - case GGML_TYPE_IQ2_TN: - case GGML_TYPE_IQ1_TN: case GGML_TYPE_IQ4_NL: case GGML_TYPE_IQ4_XS: case GGML_TYPE_IQ4_KS: @@ -10819,8 +10789,6 @@ static void ggml_compute_forward_add1( case GGML_TYPE_IQ1_M: case GGML_TYPE_IQ1_BN: case GGML_TYPE_IQ2_BN: - case GGML_TYPE_IQ2_TN: - case GGML_TYPE_IQ1_TN: case GGML_TYPE_IQ4_NL: case GGML_TYPE_IQ4_XS: case GGML_TYPE_IQ4_KS: @@ -10960,8 +10928,6 @@ static void ggml_compute_forward_acc( case GGML_TYPE_IQ1_M: case GGML_TYPE_IQ1_BN: case GGML_TYPE_IQ2_BN: - case GGML_TYPE_IQ2_TN: - case GGML_TYPE_IQ1_TN: case GGML_TYPE_IQ4_NL: case GGML_TYPE_IQ4_XS: case GGML_TYPE_IQ4_KS: @@ -14147,8 +14113,6 @@ static void ggml_compute_forward_out_prod( case GGML_TYPE_IQ1_M: case GGML_TYPE_IQ1_BN: case GGML_TYPE_IQ2_BN: - case GGML_TYPE_IQ2_TN: - case GGML_TYPE_IQ1_TN: case GGML_TYPE_IQ4_NL: case GGML_TYPE_IQ4_XS: case GGML_TYPE_IQ4_KS: @@ -14528,8 +14492,6 @@ static void ggml_compute_forward_set( case GGML_TYPE_IQ1_M: case GGML_TYPE_IQ1_BN: case GGML_TYPE_IQ2_BN: - case GGML_TYPE_IQ2_TN: - case GGML_TYPE_IQ1_TN: case GGML_TYPE_IQ4_NL: case GGML_TYPE_IQ4_XS: case GGML_TYPE_IQ4_KS: @@ -14803,8 +14765,6 @@ static void ggml_compute_forward_get_rows( case GGML_TYPE_IQ1_M: case GGML_TYPE_IQ1_BN: case GGML_TYPE_IQ2_BN: - case GGML_TYPE_IQ2_TN: - case GGML_TYPE_IQ1_TN: case GGML_TYPE_IQ4_NL: case GGML_TYPE_IQ4_XS: case GGML_TYPE_IQ4_KS: @@ -15405,8 +15365,6 @@ static void ggml_compute_forward_clamp( case GGML_TYPE_IQ1_M: case GGML_TYPE_IQ1_BN: case GGML_TYPE_IQ2_BN: - case GGML_TYPE_IQ2_TN: - case GGML_TYPE_IQ1_TN: case GGML_TYPE_IQ4_NL: case GGML_TYPE_IQ4_XS: case GGML_TYPE_IQ4_KS: @@ -22224,8 +22182,6 @@ size_t ggml_quantize_chunk( case GGML_TYPE_IQ1_M: result = quantize_iq1_m (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break; case GGML_TYPE_IQ1_BN: result = quantize_iq1_bn (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break; case GGML_TYPE_IQ2_BN: result = quantize_iq2_bn (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break; - case GGML_TYPE_IQ2_TN: result = quantize_iq2_tn (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break; - case GGML_TYPE_IQ1_TN: result = quantize_iq1_tn (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break; case GGML_TYPE_IQ4_NL: result = quantize_iq4_nl (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break; case GGML_TYPE_IQ4_XS: result = quantize_iq4_xs (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break; case GGML_TYPE_IQ4_KS: result = quantize_iq4_ks (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break; diff --git a/ggml/src/iqk/iqk_mul_mat.cpp b/ggml/src/iqk/iqk_mul_mat.cpp index d3567a1e..2701643c 100644 --- a/ggml/src/iqk/iqk_mul_mat.cpp +++ b/ggml/src/iqk/iqk_mul_mat.cpp @@ -755,18 +755,6 @@ struct DequantizerQ2K final : public BaseDequantizer { }; -struct DequantizerIQ2TN final : public BaseDequantizer { - DequantizerIQ2TN(const void * vx, size_t 
bx) : BaseDequantizer(vx, bx) {} - template - inline void new_block(int i, [[maybe_unused]] const Q8& q8, [[maybe_unused]] __m256 * accm, [[maybe_unused]] __m512i * scales) { - new_block(i); - } - inline void new_block(int i) { - bits.prepare(x[i].qs); - } - Q2Bits bits; -}; - struct DequantizerQ3K final : public BaseDequantizer { DequantizerQ3K(const void * vx, size_t bx) : BaseDequantizer(vx, bx) {} template @@ -1319,22 +1307,13 @@ static void mul_mat_qX_K_q8_K_AVX512(int n, const void * vx, size_t bx, const Da deq.new_block(i, q8, accm, scales); for (int iy = 0; iy < nrc_y; ++iy) { - if constexpr (std::is_same_v) { - auto sumi_scales = _mm256_madd_epi16(_mm256_set1_epi16(-1), q8.load_bsums(iy, i)); - auto sumi = _mm512_dpbusd_epi32(_mm512_dpbusd_epi32(_mm512_dpbusd_epi32(_mm512_dpbusd_epi32( - _mm512_inserti32x8(_mm512_setzero_si512(), sumi_scales, 0), - deq.bits.values[0], q8.load_quants64(iy, i, 0)), deq.bits.values[1], q8.load_quants64(iy, i, 1)), - deq.bits.values[2], q8.load_quants64(iy, i, 2)), deq.bits.values[3], q8.load_quants64(iy, i, 3)); - accd[iy] = _mm512_fmadd_ps(_mm512_set1_ps(deq.d*q8.scale(iy, i)), _mm512_cvtepi32_ps(sumi), accd[iy]); - } else { - const __m512i p1 = _mm512_dpbusd_epi32(_mm512_setzero_si512(), deq.bits.values[0], q8.load_quants64(iy, i, 0)); - const __m512i p2 = _mm512_dpbusd_epi32(_mm512_setzero_si512(), deq.bits.values[1], q8.load_quants64(iy, i, 1)); - const __m512i p3 = _mm512_dpbusd_epi32(_mm512_setzero_si512(), deq.bits.values[2], q8.load_quants64(iy, i, 2)); - const __m512i p4 = _mm512_dpbusd_epi32(_mm512_setzero_si512(), deq.bits.values[3], q8.load_quants64(iy, i, 3)); - auto sumi = _mm512_dpwssd_epi32(_mm512_setzero_si512(), scales[0], _mm512_packs_epi32(p1, p2)); - sumi = _mm512_dpwssd_epi32(sumi, scales[1], _mm512_packs_epi32(p3, p4)); - accd[iy] = _mm512_fmadd_ps(_mm512_set1_ps(deq.d*q8.scale(iy, i)), _mm512_cvtepi32_ps(sumi), accd[iy]); - } + const __m512i p1 = _mm512_dpbusd_epi32(_mm512_setzero_si512(), deq.bits.values[0], q8.load_quants64(iy, i, 0)); + const __m512i p2 = _mm512_dpbusd_epi32(_mm512_setzero_si512(), deq.bits.values[1], q8.load_quants64(iy, i, 1)); + const __m512i p3 = _mm512_dpbusd_epi32(_mm512_setzero_si512(), deq.bits.values[2], q8.load_quants64(iy, i, 2)); + const __m512i p4 = _mm512_dpbusd_epi32(_mm512_setzero_si512(), deq.bits.values[3], q8.load_quants64(iy, i, 3)); + auto sumi = _mm512_dpwssd_epi32(_mm512_setzero_si512(), scales[0], _mm512_packs_epi32(p1, p2)); + sumi = _mm512_dpwssd_epi32(sumi, scales[1], _mm512_packs_epi32(p3, p4)); + accd[iy] = _mm512_fmadd_ps(_mm512_set1_ps(deq.d*q8.scale(iy, i)), _mm512_cvtepi32_ps(sumi), accd[iy]); } } @@ -1347,64 +1326,6 @@ static void mul_mat_qX_K_q8_K_AVX512(int n, const void * vx, size_t bx, const Da } } -template -static void mul_mat_iq2tn_q8_K_AVX512(int n, const void * vx, size_t bx, const DataInfo& info, int nrc_x) { - assert(n % QK_K == 0); - const int nb = n / QK_K; - - Q8 q8(info); - - DequantizerIQ2TN deq1(vx, bx), deq2(vx, bx); - - __m512 accd[2*nrc_y]; - - for (int ix = 0; ix < nrc_x; ix += 2) { - - for (int iy = 0; iy < 2*nrc_y; ++iy) accd[iy] = _mm512_setzero_ps(); - - deq1.new_row(ix+0); - deq2.new_row(ix+1); - - for (int i = 0; i < nb; ++i) { - - deq1.new_block(i); - deq2.new_block(i); - //float d = 0.5f*(deq1.d + deq2.d); // The scale is supposed to be per per tensor, so we can use the same scale for both rows - - for (int iy = 0; iy < nrc_y; ++iy) { - auto sumi_scales_256 = _mm256_madd_epi16(_mm256_set1_epi16(-1), q8.load_bsums(iy, i)); - auto sumi_scales_512 
= _mm512_inserti32x8(_mm512_setzero_si512(), sumi_scales_256, 0); - auto q8q = q8.load_quants64(iy, i, 0); - auto sumi_1 = _mm512_dpbusd_epi32(sumi_scales_512, deq1.bits.values[0], q8q); - auto sumi_2 = _mm512_dpbusd_epi32(sumi_scales_512, deq2.bits.values[0], q8q); - q8q = q8.load_quants64(iy, i, 1); - sumi_1 = _mm512_dpbusd_epi32(sumi_1, deq1.bits.values[1], q8q); - sumi_2 = _mm512_dpbusd_epi32(sumi_2, deq2.bits.values[1], q8q); - q8q = q8.load_quants64(iy, i, 2); - sumi_1 = _mm512_dpbusd_epi32(sumi_1, deq1.bits.values[2], q8q); - sumi_2 = _mm512_dpbusd_epi32(sumi_2, deq2.bits.values[2], q8q); - q8q = q8.load_quants64(iy, i, 3); - sumi_1 = _mm512_dpbusd_epi32(sumi_1, deq1.bits.values[3], q8q); - sumi_2 = _mm512_dpbusd_epi32(sumi_2, deq2.bits.values[3], q8q); - // The scale is supposed to be per per tensor, so we can use the same scale - auto vd = _mm512_set1_ps(/*d* */q8.scale(iy, i)); - accd[2*iy+0] = _mm512_fmadd_ps(vd, _mm512_cvtepi32_ps(sumi_1), accd[2*iy+0]); - accd[2*iy+1] = _mm512_fmadd_ps(vd, _mm512_cvtepi32_ps(sumi_2), accd[2*iy+1]); - // Leaving this here just in case ternary models start using per row scales - //accd[2*iy+0] = _mm512_fmadd_ps(_mm512_set1_ps(deq1.d*q8.scale(iy, i)), _mm512_cvtepi32_ps(sumi_1), accd[2*iy+0]); - //accd[2*iy+1] = _mm512_fmadd_ps(_mm512_set1_ps(deq2.d*q8.scale(iy, i)), _mm512_cvtepi32_ps(sumi_2), accd[2*iy+1]); - } - - } - - for (int iy = 0; iy < nrc_y; ++iy) { - info.store(ix+0, iy, deq1.d*_mm512_reduce_add_ps(accd[2*iy+0])); - info.store(ix+1, iy, deq2.d*_mm512_reduce_add_ps(accd[2*iy+1])); - } - - } -} - template static void mul_mat_iqX_k_q8_K_AVX512(int n, const void * vx, size_t bx, const DataInfo& info, int nrc_x) { assert(n % QK_K == 0); @@ -1478,33 +1399,19 @@ static void mul_mat_qX_K_q8_K_AVX512_1(int n, const void * vx, size_t bx, const for (int kx = 0; kx < k_nx; ++kx) deq[kx]->new_block(k_nx*i+kx, q8, &accm, scales+2*kx); - if constexpr (std::is_same_v) { - for (int kx = 0; kx < k_nx; ++kx) { - compute_block_iq2tn(0, k_nx*i+kx, deq[kx]->d, q8, deq[kx]->bits.values, &accd); - } - } else { - for (int kx = 0; kx < k_nx; ++kx) { - compute_block(0, k_nx*i+kx, deq[kx]->d, q8, deq[kx]->bits.values, scales+2*kx, &accd); - } + for (int kx = 0; kx < k_nx; ++kx) { + compute_block(0, k_nx*i+kx, deq[kx]->d, q8, deq[kx]->bits.values, scales+2*kx, &accd); } } if (2*(nb/2) < nb) { int i0 = 2*(nb/2); deq[0]->new_block(i0, q8, &accm, scales); - if constexpr (std::is_same_v) { - compute_block_iq2tn(0, i0, deq[0]->d, q8, deq[0]->bits.values, &accd); - } else { - compute_block(0, i0, deq[0]->d, q8, deq[0]->bits.values, scales, &accd); - } + compute_block(0, i0, deq[0]->d, q8, deq[0]->bits.values, scales, &accd); } - if constexpr (std::is_same_v) { - info.store(ix, 0, _mm512_reduce_add_ps(accd)); - } else { - auto sum256 = _mm256_add_ps(_mm512_castps512_ps256(accd), _mm512_extractf32x8_ps(accd, 1)); - info.store(ix, 0, hsum_float_8(_mm256_add_ps(accm, sum256))); - } + auto sum256 = _mm256_add_ps(_mm512_castps512_ps256(accd), _mm512_extractf32x8_ps(accd, 1)); + info.store(ix, 0, hsum_float_8(_mm256_add_ps(accm, sum256))); } } @@ -2066,90 +1973,6 @@ struct DequantizerQ6K final : public BaseDequantizer { const __m256i mh = _mm256_set1_epi8(0x30); }; -struct DequantizerIQ2TN final : public BaseDequantizer { - DequantizerIQ2TN(const void * vx, size_t bx) : BaseDequantizer(vx, bx) {} - - inline void prepare(int i, int j) { - bits.prepare(x[i].qs, j); - } - - Q2Bits bits; -}; - - -template -IQK_NOINLINE void mul_mat_iq2tn_q8_K(int n, const void * vx, size_t bx, 
const DataInfo& info, int nrc_x) { - assert(n%QK_K == 0); - const int nb = n/QK_K; - - Q8 q8(info); - DequantizerIQ2TN deq1(vx, bx), deq2(vx, bx); - - __m256 accd[nrc_y]; - const auto m1 = _mm256_set1_epi16(1); - - for (int ix = 0; ix < nrc_x; ++ix) { - - deq1.new_row(ix); - deq2.new_row(ix); - - for (int i = 0; i < nb; ++i) { - - if constexpr (nrc_y == 1) { - deq1.prepare(i, 0); - auto sumi1 = _mm256_add_epi16(_mm256_maddubs_epi16(deq1.bits.values[0], q8.load_quants(0, i, 0)), - _mm256_maddubs_epi16(deq1.bits.values[1], q8.load_quants(0, i, 1))); - sumi1 = _mm256_add_epi16(_mm256_add_epi16(_mm256_maddubs_epi16(deq1.bits.values[2], q8.load_quants(0, i, 2)), - _mm256_maddubs_epi16(deq1.bits.values[3], q8.load_quants(0, i, 3))), sumi1); - - deq2.prepare(i, 1); - auto sumi2 = _mm256_add_epi16(_mm256_maddubs_epi16(deq2.bits.values[0], q8.load_quants(0, i, 4)), - _mm256_maddubs_epi16(deq2.bits.values[1], q8.load_quants(0, i, 5))); - sumi2 = _mm256_add_epi16(_mm256_add_epi16(_mm256_maddubs_epi16(deq2.bits.values[2], q8.load_quants(0, i, 6)), - _mm256_maddubs_epi16(deq2.bits.values[3], q8.load_quants(0, i, 7))), sumi2); - auto sumi = _mm256_add_epi16(sumi2, _mm256_sub_epi16(sumi1, q8.load_bsums(0, i))); - auto vd = _mm256_set1_ps(deq1.d*q8.scale(0, i)); - auto sf = _mm256_cvtepi32_ps(_mm256_madd_epi16(m1, sumi)); - accd[0] = i > 0 ? _mm256_fmadd_ps(vd, sf, accd[0]) : _mm256_mul_ps(vd, sf); - } - else { - - deq1.prepare(i, 0); deq2.prepare(i, 1); - for (int iy = 0; iy < nrc_y; ++iy) { - auto vd = _mm256_set1_ps(deq1.d*q8.scale(iy, i)); - auto sumi = _mm256_add_epi16(_mm256_maddubs_epi16(deq1.bits.values[0], q8.load_quants(iy, i, 0)), - _mm256_maddubs_epi16(deq1.bits.values[1], q8.load_quants(iy, i, 1))); - sumi = _mm256_add_epi16(_mm256_add_epi16(_mm256_maddubs_epi16(deq1.bits.values[2], q8.load_quants(iy, i, 2)), - _mm256_maddubs_epi16(deq1.bits.values[3], q8.load_quants(iy, i, 3))), sumi); - sumi = _mm256_add_epi16(_mm256_add_epi16(_mm256_maddubs_epi16(deq2.bits.values[0], q8.load_quants(iy, i, 4)), - _mm256_maddubs_epi16(deq2.bits.values[1], q8.load_quants(iy, i, 5))), sumi); - sumi = _mm256_add_epi16(_mm256_add_epi16(_mm256_maddubs_epi16(deq2.bits.values[2], q8.load_quants(iy, i, 6)), - _mm256_maddubs_epi16(deq2.bits.values[3], q8.load_quants(iy, i, 7))), sumi); - sumi = _mm256_sub_epi16(sumi, q8.load_bsums(iy, i)); - - //auto sumi1 = _mm256_add_epi16(_mm256_maddubs_epi16(deq1.bits.values[0], q8.load_quants(iy, i, 0)), - // _mm256_maddubs_epi16(deq1.bits.values[1], q8.load_quants(iy, i, 1))); - //auto sumi2 = _mm256_add_epi16(_mm256_maddubs_epi16(deq1.bits.values[2], q8.load_quants(iy, i, 2)), - // _mm256_maddubs_epi16(deq1.bits.values[3], q8.load_quants(iy, i, 3))); - //sumi1 = _mm256_add_epi16(_mm256_add_epi16(_mm256_maddubs_epi16(deq2.bits.values[0], q8.load_quants(iy, i, 4)), - // _mm256_maddubs_epi16(deq2.bits.values[1], q8.load_quants(iy, i, 5))), sumi1); - //sumi2 = _mm256_add_epi16(_mm256_add_epi16(_mm256_maddubs_epi16(deq2.bits.values[2], q8.load_quants(iy, i, 6)), - // _mm256_maddubs_epi16(deq2.bits.values[3], q8.load_quants(iy, i, 7))), sumi2); - //auto sumi = _mm256_add_epi16(sumi2, _mm256_sub_epi16(sumi1, q8.load_bsums(iy, i))); - auto sf = _mm256_cvtepi32_ps(_mm256_madd_epi16(m1, sumi)); - accd[iy] = i > 0 ? 
_mm256_fmadd_ps(vd, sf, accd[iy]) : _mm256_mul_ps(vd, sf); - } - } - - } - - for (int iy = 0; iy < nrc_y; ++iy) { - info.store(ix, iy, hsum_float_8(accd[iy])); - } - - } -} - template static void mul_mat_qY_K_q8_K_T(int n, const void * vx, size_t bx, const DataInfo& info, int nrc_x) { assert(n%QK_K == 0); @@ -4071,30 +3894,6 @@ bool MulMat::prepare(int typeA, int typeB, int ne00, MulMat& mm, int Ny) { assert (ne00 % QK_K == 0); MulMat::set_functions(mm); break; - case GGML_TYPE_IQ2_TN: - assert (ne00 % QK_K == 0); -#ifdef HAVE_FANCY_SIMD - //MulMat::set_functions(mm); - mm.funcs[0] = mul_mat_qX_K_q8_K_AVX512_1; - //mm.funcs[0] = mul_mat_iq2tn_q8_K_AVX512<1>; - mm.funcs[1] = mul_mat_iq2tn_q8_K_AVX512<2>; - mm.funcs[2] = mul_mat_iq2tn_q8_K_AVX512<3>; - mm.funcs[3] = mul_mat_iq2tn_q8_K_AVX512<4>; - mm.funcs[4] = mul_mat_iq2tn_q8_K_AVX512<5>; - mm.funcs[5] = mul_mat_iq2tn_q8_K_AVX512<6>; - mm.funcs[6] = mul_mat_iq2tn_q8_K_AVX512<7>; - mm.funcs[7] = mul_mat_iq2tn_q8_K_AVX512<8>; -#else - mm.funcs[0] = mul_mat_iq2tn_q8_K<1>; - mm.funcs[1] = mul_mat_iq2tn_q8_K<2>; - mm.funcs[2] = mul_mat_iq2tn_q8_K<3>; - mm.funcs[3] = mul_mat_iq2tn_q8_K<4>; - mm.funcs[4] = mul_mat_iq2tn_q8_K<5>; - mm.funcs[5] = mul_mat_iq2tn_q8_K<6>; - mm.funcs[6] = mul_mat_iq2tn_q8_K<7>; - mm.funcs[7] = mul_mat_iq2tn_q8_K<8>; -#endif - break; case GGML_TYPE_Q3_K: assert (ne00 % QK_K == 0); MulMat::set_functions(mm); @@ -4179,18 +3978,6 @@ bool MulMat::prepare(int typeA, int typeB, int ne00, MulMat& mm, int Ny) { mm.funcs[7] = mul_mat_iq1bn_q8_K64<8>; expected_typeB = GGML_TYPE_Q8_K64; break; - case GGML_TYPE_IQ1_TN: - assert (ne00 % QK_IQ1BN == 0); - mm.funcs[0] = mul_mat_iq1bn_q8_K64<1>; - mm.funcs[1] = mul_mat_iq1bn_q8_K64<2>; - mm.funcs[2] = mul_mat_iq1bn_q8_K64<3>; - mm.funcs[3] = mul_mat_iq1bn_q8_K64<4>; - mm.funcs[4] = mul_mat_iq1bn_q8_K64<5>; - mm.funcs[5] = mul_mat_iq1bn_q8_K64<6>; - mm.funcs[6] = mul_mat_iq1bn_q8_K64<7>; - mm.funcs[7] = mul_mat_iq1bn_q8_K64<8>; - expected_typeB = GGML_TYPE_Q8_K64; - break; case GGML_TYPE_IQ2_BN: assert (ne00 % QK_IQ1BN == 0); mm.funcs[0] = mul_mat_iq2bn_q8_K64<1>; @@ -5406,156 +5193,6 @@ struct DequantizerIQ3S final : public BaseDequantizer { }; -struct DequantizerIQ2TN final : public BaseDequantizer { - DequantizerIQ2TN(const void * vx, size_t bx, int nrc) : BaseDequantizer(vx, bx, nrc) {} - - constexpr static int num_blocks() { return 16; } - constexpr static bool should_scale_quants() { return true; } - - //template - //inline void process_scales(int i, [[maybe_unused]] const Q8& q8, [[maybe_unused]] float32x4_t * acc) { - // d = GGML_FP16_TO_FP32(x[i].d); - //} - - inline void new_block(int) { } - - template - inline void compute(const Q8& q8, int i, int j, int32x4_t * sumi) { - for (int iy = 0; iy < Q8::nrc_y; ++iy) { - auto q8b_1 = q8.load_quants(iy, i, 4*j+0); - sumi[iy] = ggml_vdotq_s32(ggml_vdotq_s32(sumi[iy], vreinterpretq_s8_u8(bits.b1.val[0]), q8b_1.val[0]), - vreinterpretq_s8_u8(bits.b1.val[1]), q8b_1.val[1]); - - auto q8b_2 = q8.load_quants(iy, i, 4*j+1); - sumi[iy] = ggml_vdotq_s32(ggml_vdotq_s32(sumi[iy], vreinterpretq_s8_u8(bits.b1.val[2]), q8b_2.val[0]), - vreinterpretq_s8_u8(bits.b1.val[3]), q8b_2.val[1]); - - auto q8b_3 = q8.load_quants(iy, i, 4*j+2); - sumi[iy] = ggml_vdotq_s32(ggml_vdotq_s32(sumi[iy], vreinterpretq_s8_u8(bits.b2.val[0]), q8b_3.val[0]), - vreinterpretq_s8_u8(bits.b2.val[1]), q8b_3.val[1]); - - auto q8b_4 = q8.load_quants(iy, i, 4*j+3); - sumi[iy] = ggml_vdotq_s32(ggml_vdotq_s32(sumi[iy], vreinterpretq_s8_u8(bits.b2.val[2]), q8b_4.val[0]), - 
vreinterpretq_s8_u8(bits.b2.val[3]), q8b_4.val[1]); - } - } - template - inline void compute1(const Q8& q8, int i, int j, int32x4_t * sumi) { - auto q8b_1 = q8.load_quants(0, i, 4*j+0); - sumi[0] = ggml_vdotq_s32(ggml_vdotq_s32(sumi[0], vreinterpretq_s8_u8(bits.b1.val[0]), q8b_1.val[0]), - vreinterpretq_s8_u8(bits.b1.val[1]), q8b_1.val[1]); - - auto q8b_2 = q8.load_quants(0, i, 4*j+1); - sumi[1] = ggml_vdotq_s32(ggml_vdotq_s32(sumi[1], vreinterpretq_s8_u8(bits.b1.val[2]), q8b_2.val[0]), - vreinterpretq_s8_u8(bits.b1.val[3]), q8b_2.val[1]); - - q8b_1 = q8.load_quants(0, i, 4*j+2); - sumi[0] = ggml_vdotq_s32(ggml_vdotq_s32(sumi[0], vreinterpretq_s8_u8(bits.b2.val[0]), q8b_1.val[0]), - vreinterpretq_s8_u8(bits.b2.val[1]), q8b_1.val[1]); - - q8b_2 = q8.load_quants(0, i, 4*j+3); - sumi[1] = ggml_vdotq_s32(ggml_vdotq_s32(sumi[1], vreinterpretq_s8_u8(bits.b2.val[2]), q8b_2.val[0]), - vreinterpretq_s8_u8(bits.b2.val[3]), q8b_2.val[1]); - } - - IQK_ALWAYS_INLINE void prepare(int i, int j) { - bits.prepare(x[i].qs+32*j); - auto m1 = vdupq_n_s8(1); - for (int k = 0; k < 4; ++k) { - bits.b1.val[k] = vsubq_s8(bits.b1.val[k], m1); - bits.b2.val[k] = vsubq_s8(bits.b2.val[k], m1); - } - } - - Q2bits bits; -}; - -template -void mul_mat_iq2tn_K_q8_K_T(int n, const void * vx, size_t bx, const DataInfo& info, int nrc_x) { - assert(n % QK_K == 0); - const int nb = n / QK_K; - - Q8 q8(info); - - DequantizerIQ2TN deq(vx, bx, nrc_y); - float32x4_t acc[nrc_y]; - - for (int ix = 0; ix < nrc_x; ++ix) { - - deq.new_row(ix); - - for (int i = 0; i < nb; ++i) { - - int32x4_t sumi[nrc_y]; - for (int iy = 0; iy < nrc_y; ++iy) sumi[iy] = vdupq_n_s32(0); - - deq.new_block(i); - deq.prepare(i, 0); - deq.compute(q8, i, 0, sumi); - deq.prepare(i, 1); - deq.compute(q8, i, 1, sumi); - - if (i > 0) { - for (int iy = 0; iy < nrc_y; ++iy) { - acc[iy] = vmlaq_f32(acc[iy], vcvtq_f32_s32(sumi[iy]), vdupq_n_f32(deq.d*q8.scale(iy, i))); - } - } else { - for (int iy = 0; iy < nrc_y; ++iy) { - acc[iy] = vmulq_f32(vcvtq_f32_s32(sumi[iy]), vdupq_n_f32(deq.d*q8.scale(iy, i))); - } - } - } - - for (int iy = 0; iy < nrc_y; ++iy) { - info.store(ix, iy, vaddvq_f32(acc[iy])); - } - } -} -void mul_mat_iq2tn_K_q8_K_1(int n, const void * vx, size_t bx, const DataInfo& info, int nrc_x) { - assert(n % QK_K == 0); - const int nb = n / QK_K; - - Q8<1, block_q8_K> q8(info); - - DequantizerIQ2TN deq(vx, bx, 1); - - auto m1 = vdup_n_s16(-1); - float32x4_t acc[2]; - - for (int ix = 0; ix < nrc_x; ++ix) { - - deq.new_row(ix); - - for (int i = 0; i < nb; ++i) { - - int32x4_t sumi[2] = {}; - deq.new_block(i); - auto bsums = q8.load_bsums(0, i); - bsums.val[0] = vaddq_s32(bsums.val[0], bsums.val[1]); - sumi[0] = vmlal_s16(sumi[0], vget_low_s16 (bsums.val[0]), m1); - sumi[1] = vmlal_s16(sumi[1], vget_high_s16(bsums.val[0]), m1); - deq.bits.prepare(deq.x[i].qs); - deq.compute1(q8, i, 0, sumi); - deq.bits.prepare(deq.x[i].qs+32); - deq.compute1(q8, i, 1, sumi); - - auto vd = vdupq_n_f32(deq.d*q8.scale(0, i)); - if (i > 0) { - acc[0] = vmlaq_f32(acc[0], vcvtq_f32_s32(sumi[0]), vd); - acc[1] = vmlaq_f32(acc[1], vcvtq_f32_s32(sumi[1]), vd); - } else { - acc[0] = vmulq_f32(vcvtq_f32_s32(sumi[0]), vd); - acc[1] = vmulq_f32(vcvtq_f32_s32(sumi[1]), vd); - } - - } - - acc[0] = vaddq_f32(acc[0], acc[1]); - info.store(ix, 0, vaddvq_f32(acc[0])); - } -} - - template void mul_mat_qX_K_q8_K_T(int n, const void * vx, size_t bx, const DataInfo& info, int nrc_x) { assert(n % QK_K == 0); @@ -6855,17 +6492,6 @@ bool MulMat::prepare(int typeA, int typeB, int ne00, MulMat& m, int 
/*Ny*/) { case GGML_TYPE_Q2_K: MulMat::set_functions(m); break; - case GGML_TYPE_IQ2_TN: - //MulMat::set_functions(m); - m.funcs[0] = mul_mat_iq2tn_K_q8_K_1; - m.funcs[1] = mul_mat_iq2tn_K_q8_K_T<2>; - m.funcs[2] = mul_mat_iq2tn_K_q8_K_T<3>; - m.funcs[3] = mul_mat_iq2tn_K_q8_K_T<4>; - m.funcs[4] = mul_mat_iq2tn_K_q8_K_T<5>; - m.funcs[5] = mul_mat_iq2tn_K_q8_K_T<6>; - m.funcs[6] = mul_mat_iq2tn_K_q8_K_T<7>; - m.funcs[7] = mul_mat_iq2tn_K_q8_K_T<8>; - break; case GGML_TYPE_Q3_K: MulMat::set_functions(m); break; @@ -6931,17 +6557,6 @@ bool MulMat::prepare(int typeA, int typeB, int ne00, MulMat& m, int /*Ny*/) { m.funcs[7] = mul_mat_iq1bn_q8_K64<8>; expected_Btype = GGML_TYPE_Q8_K64; break; - case GGML_TYPE_IQ1_TN: - m.funcs[0] = mul_mat_iq1bn_q8_K64<1>; - m.funcs[1] = mul_mat_iq1bn_q8_K64<2>; - m.funcs[2] = mul_mat_iq1bn_q8_K64<3>; - m.funcs[3] = mul_mat_iq1bn_q8_K64<4>; - m.funcs[4] = mul_mat_iq1bn_q8_K64<5>; - m.funcs[5] = mul_mat_iq1bn_q8_K64<6>; - m.funcs[6] = mul_mat_iq1bn_q8_K64<7>; - m.funcs[7] = mul_mat_iq1bn_q8_K64<8>; - expected_Btype = GGML_TYPE_Q8_K64; - break; case GGML_TYPE_IQ2_BN: m.funcs[0] = mul_mat_iq2bn_q8_K64<1>; m.funcs[1] = mul_mat_iq2bn_q8_K64<2>; diff --git a/ggml/src/iqk/iqk_quantize.cpp b/ggml/src/iqk/iqk_quantize.cpp index 12a303a1..6d014e06 100644 --- a/ggml/src/iqk/iqk_quantize.cpp +++ b/ggml/src/iqk/iqk_quantize.cpp @@ -200,31 +200,6 @@ void quantize_row_iq1_bn(const float * x, void * y, int64_t k) { quantize_iq1_bn(x, y, 1, k, nullptr); } -void quantize_row_iq1_tn_ref(const float * x, block_iq1_tn * y, int64_t k) { - quantize_iq1_tn(x, (void *)y, 1, k, nullptr); -} - -void quantize_row_iq1_tn(const float * x, void * y, int64_t k) { - quantize_iq1_tn(x, y, 1, k, nullptr); -} - -size_t quantize_iq1_tn(const float * src, void * dst, int64_t nrows, int64_t n_per_row, const float * imatrix) { - return quantize_iq1_bn(src, dst, nrows, n_per_row, imatrix); -} - -void dequantize_row_iq1_tn(const block_iq1_tn * x, float * y, int64_t k) { - float scale = GGML_FP16_TO_FP32(*(const ggml_half *)x); - const block_iq1_bn * iq1bn = (const block_iq1_bn *)((const char *)x + sizeof(ggml_half)); - dequantize_row_iq1_bn(iq1bn, y, k); - for (int j = 0; j < int(k); ++j) y[j] *= scale; -} - -void vec_dot_iq1_tn_q8_k(int n, float * s, size_t bs, const void * vx, size_t bx, const void * vy, size_t by, int nrc) { - float scale = GGML_FP16_TO_FP32(*(const ggml_half *)vx); - ggml_vec_dot_iq1_bn_q8_K64(n, s, bs, (const void *)((const char *)vx + sizeof(ggml_half)), bx, vy, by, nrc); - *s *= scale; -} - void dequantize_row_iq1_bn(const block_iq1_bn * x, float * y, int64_t k) { assert(k%QK_IQ1BN == 0); int nblock = k / QK_IQ1BN; @@ -2364,114 +2339,6 @@ size_t quantize_iq6_k(const float * src, void * dst, int64_t nrows, int64_t n_pe return nrows * nblock * sizeof(block_iq6_k); } -// -// ========================== IQ2_TN -// - -void quantize_row_iq2_tn_ref(const float * x, block_iq2_tn * y, int64_t k) { - GGML_ASSERT(k%QK_K == 0); - - int nb = k/QK_K; - - auto quantize = [] (float xmax, float x) { - return x < -0.5f*xmax ? 0 : x < 0.5f*xmax ? 
1 : 2; - }; - int n = k; - float max = x[0]; - for (int j = 1; j < n; ++j) max = std::max(max, fabsf(x[j])); - - *(float *)y = max; - y = (block_iq2_tn *)((float *)y + 1); - - for (int ibl = 0; ibl < nb; ++ibl) { - auto xb = x + QK_K*ibl; - auto qs = y[ibl].qs; - for (int l = 0; l < QK_K/128; ++l) { - for (int j = 0; j < 32; ++j) { - qs[j] = quantize(max, xb[j]) | (quantize(max, xb[j+32]) << 2) | (quantize(max, xb[j+64]) << 4) | (quantize(max, xb[j+96]) << 6); - } - xb += 128; - qs += 32; - } - } -} - -void quantize_row_iq2_tn(const float * x, void * y, int64_t k) { - quantize_row_iq2_tn_ref(x, (block_iq2_tn *)y, k); -} - -size_t quantize_iq2_tn(const float * src, void * dst, int64_t nrows, int64_t n_per_row, const float * /*imatrix*/) { - auto row_size = ggml_row_size(GGML_TYPE_IQ2_TN, n_per_row); - char * qrow = (char *)dst; - for (int row = 0; row < nrows; ++row) { - quantize_row_iq2_tn_ref(src, (block_iq2_tn *)qrow, n_per_row); - qrow += row_size; - src += n_per_row; - } - return row_size*nrows; -} - -void dequantize_row_iq2_tn(const block_iq2_tn * x, float * y, int64_t k) { - GGML_ASSERT(k%QK_K == 0); - const float * dptr = (const float *)x; - float d = *dptr; - x = (const block_iq2_tn *)(dptr + 1); - int nb = k/QK_K; - for (int ibl = 0; ibl < nb; ++ibl) { - auto qs = x[ibl].qs; - for (int l = 0; l < QK_K/128; ++l) { - for (int j = 0; j < 32; ++j) { - y[j+ 0] = d*((qs[j] >> 0) & 3) - d; - y[j+32] = d*((qs[j] >> 2) & 3) - d; - y[j+64] = d*((qs[j] >> 4) & 3) - d; - y[j+96] = d*((qs[j] >> 6) & 3) - d; - } - y += 128; - qs += 32; - } - } -} - -void vec_dot_iq2_tn_q8_k(int n, float * s, size_t bs, const void * vx, size_t bx, const void * vy, size_t by, int nrc) { - GGML_UNUSED(bs); - GGML_UNUSED(bx); - GGML_UNUSED(by); - GGML_UNUSED(nrc); -#if GGML_USE_IQK_MULMAT - if (iqk_mul_mat(1, 1, n, GGML_TYPE_IQ2_TN, vx, 0, GGML_TYPE_Q8_K, vy, 0, s, 0, 0, 1)) { - return; - } -#endif - - const int nb = n / QK_K; - - const float * dptr = (const float *)vx; - const float d = *dptr; - const block_iq2_tn * x = (const block_iq2_tn *)(dptr + 1); - const block_q8_K * y = (const block_q8_K *)vy; - - float sumf = 0; - - for (int i = 0; i < nb; i++) { - auto qs = x[i].qs; - auto q8 = y[i].qs; - int sumi1 = 0, sumi2 = 0, sumi3 = 0,sumi4 = 0; - for (int j = 0; j < QK_K/16; ++j) sumi1 -= y[i].bsums[j]; - for (int l = 0; l < QK_K/128; ++l) { - for (int j = 0; j < 32; ++j) { - sumi1 += q8[j+ 0] * (qs[j] & 0x03); - sumi2 += q8[j+32] * (qs[j] & 0x0c); - sumi3 += q8[j+64] * (qs[j] & 0x30); - sumi4 += q8[j+96] * (qs[j] & 0xc0); - } - q8 += 128; - qs += 32; - } - sumf += d * (sumi1 + 0.25f*sumi2 + 0.0625f*sumi3 + 0.015625f*sumi4); - } - *s = sumf; -} - #ifdef __AVX2__ namespace { inline int hsum_i32_8(const __m256i a) { diff --git a/ggml/src/iqk/iqk_quantize.h b/ggml/src/iqk/iqk_quantize.h index e0dde0d8..50c425af 100644 --- a/ggml/src/iqk/iqk_quantize.h +++ b/ggml/src/iqk/iqk_quantize.h @@ -43,18 +43,6 @@ size_t quantize_iq6_k(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, void dequantize_row_iq6_k(const block_iq6_k * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k); void vec_dot_iq6_k_q8_k(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc); -void quantize_row_iq2_tn_ref(const float * GGML_RESTRICT x, block_iq2_tn * GGML_RESTRICT y, int64_t k); -void quantize_row_iq2_tn(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k); -size_t quantize_iq2_tn(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, 
int64_t nrows, int64_t n_per_row, const float * imatrix); -void dequantize_row_iq2_tn(const block_iq2_tn * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k); -void vec_dot_iq2_tn_q8_k(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc); - -void quantize_row_iq1_tn_ref(const float * GGML_RESTRICT x, block_iq1_tn * GGML_RESTRICT y, int64_t k); -void quantize_row_iq1_tn(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k); -size_t quantize_iq1_tn(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix); -void dequantize_row_iq1_tn(const block_iq1_tn * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k); -void vec_dot_iq1_tn_q8_k(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc); - void quantize_row_iq4_ks_ref(const float * GGML_RESTRICT x, block_iq4_ks * GGML_RESTRICT y, int64_t k); void quantize_row_iq4_ks(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k); size_t quantize_iq4_ks(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix); diff --git a/include/llama.h b/include/llama.h index b2906693..965e5f50 100644 --- a/include/llama.h +++ b/include/llama.h @@ -175,8 +175,6 @@ extern "C" { LLAMA_FTYPE_MOSTLY_IQ4_K = 140, // except 1d tensors LLAMA_FTYPE_MOSTLY_IQ5_K = 141, // except 1d tensors LLAMA_FTYPE_MOSTLY_IQ6_K = 142, // except 1d tensors - LLAMA_FTYPE_MOSTLY_IQ2_TN = 143, // except 1d tensors - LLAMA_FTYPE_MOSTLY_IQ1_TN = 144, // except 1d tensors LLAMA_FTYPE_MOSTLY_IQ4_KS = 145, // except 1d tensors LLAMA_FTYPE_MOSTLY_IQ3_KL = 146, // except 1d tensors LLAMA_FTYPE_MOSTLY_IQ2_KS = 147, // except 1d tensors diff --git a/src/llama.cpp b/src/llama.cpp index 7a52bc08..2da815e2 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -3862,8 +3862,6 @@ struct llama_model_loader { case GGML_TYPE_IQ1_M: ftype = LLAMA_FTYPE_MOSTLY_IQ1_M; break; case GGML_TYPE_IQ1_BN: ftype = LLAMA_FTYPE_MOSTLY_IQ1_BN; break; case GGML_TYPE_IQ2_BN: ftype = LLAMA_FTYPE_MOSTLY_IQ2_BN; break; - case GGML_TYPE_IQ1_TN: ftype = LLAMA_FTYPE_MOSTLY_IQ1_TN; break; - case GGML_TYPE_IQ2_TN: ftype = LLAMA_FTYPE_MOSTLY_IQ2_TN; break; case GGML_TYPE_IQ4_NL: ftype = LLAMA_FTYPE_MOSTLY_IQ4_NL; break; case GGML_TYPE_IQ4_XS: ftype = LLAMA_FTYPE_MOSTLY_IQ4_XS; break; case GGML_TYPE_IQ4_KS: ftype = LLAMA_FTYPE_MOSTLY_IQ4_KS; break; @@ -4579,9 +4577,7 @@ static std::string llama_model_ftype_name(llama_ftype ftype) { case LLAMA_FTYPE_MOSTLY_IQ5_K: return "IQ5_K - 5.5 bpw"; case LLAMA_FTYPE_MOSTLY_IQ6_K: return "IQ6_K - 6.6 bpw"; case LLAMA_FTYPE_MOSTLY_IQ1_BN: return "IQ1_BN - 1.625 bpw Bitnet"; - case LLAMA_FTYPE_MOSTLY_IQ1_TN: return "IQ1_TN - 1.625 bpw TriLM"; case LLAMA_FTYPE_MOSTLY_IQ2_BN: return "IQ2_BN - 2.00 bpw Bitnet"; - case LLAMA_FTYPE_MOSTLY_IQ2_TN: return "IQ2_TN - 2.00 bpw TriLM"; case LLAMA_FTYPE_MOSTLY_IQ3_S: return "IQ3_S - 3.4375 bpw"; case LLAMA_FTYPE_MOSTLY_IQ3_M: return "IQ3_S mix - 3.66 bpw"; case LLAMA_FTYPE_MOSTLY_Q4_0_4_4: return "Q4_0_4_4"; @@ -15903,9 +15899,6 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n else if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_BN || ftype == LLAMA_FTYPE_MOSTLY_IQ2_BN) { new_type = GGML_TYPE_IQ4_NL; } - else if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_TN || ftype == LLAMA_FTYPE_MOSTLY_IQ2_TN) { - new_type = GGML_TYPE_Q4_K; - } else if (new_type == 
GGML_TYPE_Q4_0_4_4 || new_type == GGML_TYPE_Q4_0_4_8 || new_type == GGML_TYPE_Q4_0_8_8) { new_type = GGML_TYPE_Q4_0; @@ -16154,8 +16147,8 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n new_type == GGML_TYPE_IQ2_XS || new_type == GGML_TYPE_IQ2_XXS || new_type == GGML_TYPE_IQ2_S || new_type == GGML_TYPE_IQ3_XXS || new_type == GGML_TYPE_IQ1_S || new_type == GGML_TYPE_IQ3_S || new_type == GGML_TYPE_IQ1_M || new_type == GGML_TYPE_IQ4_K || new_type == GGML_TYPE_IQ2_K || - new_type == GGML_TYPE_IQ5_K || new_type == GGML_TYPE_IQ3_K || new_type == GGML_TYPE_IQ2_TN || - new_type == GGML_TYPE_IQ6_K || new_type == GGML_TYPE_IQ1_TN || new_type == GGML_TYPE_IQ4_KS || + new_type == GGML_TYPE_IQ5_K || new_type == GGML_TYPE_IQ3_K || + new_type == GGML_TYPE_IQ6_K || new_type == GGML_TYPE_IQ4_KS || new_type == GGML_TYPE_IQ2_KS || new_type == GGML_TYPE_IQ4_KSS) { int nx = tensor->ne[0]; int ny = tensor->ne[1]; @@ -16182,8 +16175,6 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n case GGML_TYPE_IQ3_S: case GGML_TYPE_IQ1_S: case GGML_TYPE_IQ1_M: - case GGML_TYPE_IQ1_TN: - case GGML_TYPE_IQ2_TN: case GGML_TYPE_Q2_K: case GGML_TYPE_Q3_K: case GGML_TYPE_IQ2_K: @@ -16297,8 +16288,6 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s case LLAMA_FTYPE_MOSTLY_IQ1_M: default_type = GGML_TYPE_IQ1_M; break; case LLAMA_FTYPE_MOSTLY_IQ1_BN: default_type = GGML_TYPE_IQ1_BN; break; case LLAMA_FTYPE_MOSTLY_IQ2_BN: default_type = GGML_TYPE_IQ2_BN; break; - case LLAMA_FTYPE_MOSTLY_IQ1_TN: default_type = GGML_TYPE_IQ1_TN; break; - case LLAMA_FTYPE_MOSTLY_IQ2_TN: default_type = GGML_TYPE_IQ2_TN; break; case LLAMA_FTYPE_MOSTLY_IQ4_NL: default_type = GGML_TYPE_IQ4_NL; break; case LLAMA_FTYPE_MOSTLY_IQ4_XS: default_type = GGML_TYPE_IQ4_XS; break; case LLAMA_FTYPE_MOSTLY_IQ4_KS: default_type = GGML_TYPE_IQ4_KS; break;
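
For context, the removed iq2_tn reference code above already kept a single float scale ahead of the 2-bit packed ternary data in every row; since iq1_bn/iq2_bn now carry the same kind of per-row scale, the separate TriLM types no longer add anything. Below is a minimal, illustrative C sketch of that row layout, modeled on the removed dequantize_row_iq2_tn; the names block_ternary and dequantize_ternary_row are placeholders for illustration, not ggml API.

/*
 * Illustrative sketch only: mirrors the row layout of the removed
 * dequantize_row_iq2_tn -- one float scale at the start of the row,
 * followed by blocks of 2-bit packed ternary weights.  Type and
 * function names are placeholders, not part of ggml.
 */
#include <stdint.h>

#define QK_K 256

typedef struct {
    uint8_t qs[QK_K/4];            /* 2 bits per weight, 4 weights per byte */
} block_ternary;

/* Dequantize one row of k weights: each y[i] ends up in {-d, 0, +d}. */
static void dequantize_ternary_row(const void * vx, float * y, int64_t k) {
    const float * dptr = (const float *)vx;
    const float d = *dptr;                               /* per-row scale   */
    const block_ternary * x = (const block_ternary *)(dptr + 1);
    for (int64_t ib = 0; ib < k/QK_K; ++ib) {
        const uint8_t * qs = x[ib].qs;
        for (int l = 0; l < QK_K/128; ++l) {             /* 128 weights per pass */
            for (int j = 0; j < 32; ++j) {
                y[j +  0] = d*((qs[j] >> 0) & 3) - d;    /* stored 0,1,2 -> -d,0,+d */
                y[j + 32] = d*((qs[j] >> 2) & 3) - d;
                y[j + 64] = d*((qs[j] >> 4) & 3) - d;
                y[j + 96] = d*((qs[j] >> 6) & 3) - d;
            }
            y  += 128;
            qs += 32;
        }
    }
}

Because the scale lives in the row metadata (the row_meta_size field of the type traits seen in the ggml.c hunk) rather than in every block, iq1_bn/iq2_bn can cover the TriLM use case without the extra per-row half/float that iq1_tn/iq2_tn used to prepend, which is the redundancy this patch removes.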