From 042ea88616364b4d56e2c237e133968247c1c4a3 Mon Sep 17 00:00:00 2001 From: Iwan Kawrakow Date: Sat, 14 Dec 2024 19:06:19 +0200 Subject: [PATCH] Rename bf16_r4 to bf16_r16 We are interleaving 16 rows now. --- examples/quantize/quantize.cpp | 2 +- ggml/include/ggml.h | 4 ++-- ggml/src/ggml-quants.c | 2 +- ggml/src/ggml.c | 12 +++++------ ggml/src/iqk/iqk_mul_mat.cpp | 37 +++++++++++++--------------------- ggml/src/iqk/iqk_quantize.cpp | 4 ++-- ggml/src/iqk/iqk_quantize.h | 4 ++-- include/llama.h | 2 +- src/llama.cpp | 12 +++++------ 9 files changed, 35 insertions(+), 44 deletions(-) diff --git a/examples/quantize/quantize.cpp b/examples/quantize/quantize.cpp index 4401eb1d..5e5dd7c0 100644 --- a/examples/quantize/quantize.cpp +++ b/examples/quantize/quantize.cpp @@ -77,7 +77,7 @@ static const std::vector QUANT_OPTIONS = { { "Q4_0_8_8", LLAMA_FTYPE_MOSTLY_Q4_0_8_8, " 4.34G, +0.4685 ppl @ Llama-3-8B", }, { "F16", LLAMA_FTYPE_MOSTLY_F16, "14.00G, -0.0020 ppl @ Mistral-7B", }, { "BF16", LLAMA_FTYPE_MOSTLY_BF16, "14.00G, -0.0050 ppl @ Mistral-7B", }, - { "BF16_R4", LLAMA_FTYPE_MOSTLY_BF16_R4, "14.00G, -0.0050 ppl @ Mistral-7B", }, + { "BF16_R16", LLAMA_FTYPE_MOSTLY_BF16_R16, "14.00G, -0.0050 ppl @ Mistral-7B", }, { "F32", LLAMA_FTYPE_ALL_F32, "26.00G @ 7B", }, // Note: Ensure COPY comes after F32 to avoid ftype 0 from matching. { "COPY", LLAMA_FTYPE_ALL_F32, "only copy tensors, no quantizing", }, diff --git a/ggml/include/ggml.h b/ggml/include/ggml.h index febb8960..7b7fde0d 100644 --- a/ggml/include/ggml.h +++ b/ggml/include/ggml.h @@ -420,7 +420,7 @@ extern "C" { GGML_TYPE_Q6_K_R4 = 214, GGML_TYPE_IQ4_NL_R4 = 220, GGML_TYPE_IQ4_XS_R4 = 223, - GGML_TYPE_BF16_R4 = 230, + GGML_TYPE_BF16_R16 = 230, GGML_TYPE_Q6_0_R4 = 233, GGML_TYPE_IQ2_BN_R4 = 335, GGML_TYPE_IQ4_K_R4 = 339, @@ -494,7 +494,7 @@ extern "C" { GGML_FTYPE_MOSTLY_Q6_K_R4 = 214, // except 1d tensors GGML_FTYPE_MOSTLY_IQ4_NL_R4 = 219, // except 1d tensors GGML_FTYPE_MOSTLY_IQ4_XS_R4 = 222, // except 1d tensors - GGML_FTYPE_MOSTLY_BF16_R4 = 224, // except 1d tensors + GGML_FTYPE_MOSTLY_BF16_R16 = 224, // except 1d tensors GGML_FTYPE_MOSTLY_Q6_0_R4 = 227, // except 1d tensors GGML_FTYPE_MOSTLY_IQ2_BN_R4 = 329, // except 1d tensors GGML_FTYPE_MOSTLY_IQ4_K_R4 = 332, // except 1d tensors diff --git a/ggml/src/ggml-quants.c b/ggml/src/ggml-quants.c index d76d41d9..0b157295 100644 --- a/ggml/src/ggml-quants.c +++ b/ggml/src/ggml-quants.c @@ -15209,7 +15209,7 @@ bool ggml_validate_row_data(enum ggml_type type, const void * data, size_t nbyte case GGML_TYPE_Q6_K_R4: break; case GGML_TYPE_IQ4_K_R4: break; case GGML_TYPE_Q8_K_R8: break; - case GGML_TYPE_BF16_R4: break; + case GGML_TYPE_BF16_R16: break; case GGML_TYPE_Q4_0_4_4: case GGML_TYPE_Q4_0_4_8: { diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c index 1bdc7e92..51ef6eb2 100644 --- a/ggml/src/ggml.c +++ b/ggml/src/ggml.c @@ -1231,8 +1231,8 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = { .nrows = 1, .row_meta_size = 0, }, - [GGML_TYPE_BF16_R4] = { - .type_name = "bf16_r4", + [GGML_TYPE_BF16_R16] = { + .type_name = "bf16_r16", .blck_size = 1, .type_size = sizeof(ggml_bf16_t), .is_quantized = false, @@ -4123,7 +4123,7 @@ enum ggml_type ggml_ftype_to_ggml_type(enum ggml_ftype ftype) { case GGML_FTYPE_ALL_F32: wtype = GGML_TYPE_F32; break; case GGML_FTYPE_MOSTLY_F16: wtype = GGML_TYPE_F16; break; case GGML_FTYPE_MOSTLY_BF16: wtype = GGML_TYPE_BF16; break; - case GGML_FTYPE_MOSTLY_BF16_R4: wtype = GGML_TYPE_BF16_R4;break; + case GGML_FTYPE_MOSTLY_BF16_R16: wtype = GGML_TYPE_BF16_R16;break; case GGML_FTYPE_MOSTLY_Q4_0: wtype = GGML_TYPE_Q4_0; break; case GGML_FTYPE_MOSTLY_Q4_1: wtype = GGML_TYPE_Q4_1; break; case GGML_FTYPE_MOSTLY_Q5_0: wtype = GGML_TYPE_Q5_0; break; @@ -15762,7 +15762,7 @@ static void ggml_compute_forward_clamp( } break; case GGML_TYPE_F16: case GGML_TYPE_BF16: - case GGML_TYPE_BF16_R4: + case GGML_TYPE_BF16_R16: case GGML_TYPE_Q4_0: case GGML_TYPE_Q4_1: case GGML_TYPE_Q5_0: @@ -22666,9 +22666,9 @@ size_t ggml_quantize_chunk( ggml_fp32_to_bf16_row_ref(src + start, (ggml_bf16_t *)dst + start, n); result = n * elemsize; } break; - case GGML_TYPE_BF16_R4: + case GGML_TYPE_BF16_R16: { - repack_f32_bf16_r4(src + start, (char *) dst + start_row * row_size, nrows, n_per_row); + repack_f32_bf16_r16(src + start, (char *) dst + start_row * row_size, nrows, n_per_row); result = nrows * row_size; } break; case GGML_TYPE_F32: diff --git a/ggml/src/iqk/iqk_mul_mat.cpp b/ggml/src/iqk/iqk_mul_mat.cpp index 9e954e72..8fdf7163 100644 --- a/ggml/src/iqk/iqk_mul_mat.cpp +++ b/ggml/src/iqk/iqk_mul_mat.cpp @@ -184,7 +184,7 @@ struct MulMat { case GGML_TYPE_IQ4_XS_R4: case GGML_TYPE_IQ2_BN_R4: return 4; case GGML_TYPE_Q8_K_R8: return 8; - case GGML_TYPE_BF16_R4: return 16; + case GGML_TYPE_BF16_R16: return 16; default: return 1; } } @@ -3884,7 +3884,7 @@ static void mul_mat_q8_k_r8_q8_k(int n, const void * vx, size_t bx, const DataIn #ifdef __AVX512BF16__ template -static void mul_mat_bf16_r8_bf16(int n, const void * vx, size_t bx, const DataInfo& info, int nrc_x) { +static void mul_mat_bf16_r16_bf16(int n, const void * vx, size_t bx, const DataInfo& info, int nrc_x) { GGML_ASSERT(nrc_x%16 == 0); const ggml_bf16_t * y[nrc_y]; for (int iy = 0; iy < nrc_y; ++iy) y[iy] = (const ggml_bf16_t *)info.src1_row(iy); @@ -5994,24 +5994,16 @@ void set_mul_mat_bf16(MulMat& mm) { mm.funcs[3] = mul_mat_fX_fY_T<4>; mm.funcs[4] = mul_mat_fX_fY_T<5>; } -void set_mul_mat_bf16_r4(MulMat& mm) { +void set_mul_mat_bf16_r16(MulMat& mm) { for (auto& f : mm.funcs) f = nullptr; - mm.funcs[0] = mul_mat_bf16_r8_bf16<1>; - mm.funcs[1] = mul_mat_bf16_r8_bf16<2>; - mm.funcs[2] = mul_mat_bf16_r8_bf16<3>; - mm.funcs[3] = mul_mat_bf16_r8_bf16<4>; - mm.funcs[4] = mul_mat_bf16_r8_bf16<5>; - mm.funcs[5] = mul_mat_bf16_r8_bf16<6>; - mm.funcs[6] = mul_mat_bf16_r8_bf16<7>; - mm.funcs[7] = mul_mat_bf16_r8_bf16<8>; - //mm.funcs[0] = mul_mat_fX_fY_r4<1>; - //mm.funcs[1] = mul_mat_fX_fY_r4<2>; - //mm.funcs[2] = mul_mat_fX_fY_r4<3>; - //mm.funcs[3] = mul_mat_fX_fY_r4<4>; - ////mm.funcs[4] = mul_mat_fX_fY_r4<5>; - ////mm.funcs[5] = mul_mat_fX_fY_r4<6>; - ////mm.funcs[6] = mul_mat_fX_fY_r4<7>; - ////mm.funcs[7] = mul_mat_fX_fY_r4<8>; + mm.funcs[0] = mul_mat_bf16_r16_bf16<1>; + mm.funcs[1] = mul_mat_bf16_r16_bf16<2>; + mm.funcs[2] = mul_mat_bf16_r16_bf16<3>; + mm.funcs[3] = mul_mat_bf16_r16_bf16<4>; + mm.funcs[4] = mul_mat_bf16_r16_bf16<5>; + mm.funcs[5] = mul_mat_bf16_r16_bf16<6>; + mm.funcs[6] = mul_mat_bf16_r16_bf16<7>; + mm.funcs[7] = mul_mat_bf16_r16_bf16<8>; } #endif @@ -6030,12 +6022,11 @@ bool MulMat::prepare(int typeA, int typeB, int ne00, MulMat& mm, int Ny) { return true; } - if (typeA == GGML_TYPE_BF16_R4) { - //printf("%s: %s\n", __func__, ggml_type_name((ggml_type)typeB)); - if (ne00 % 32) return false; + if (typeA == GGML_TYPE_BF16_R16) { + if (ne00 % 16) return false; switch (typeB) { #ifdef __AVX512BF16__ - case GGML_TYPE_BF16: set_mul_mat_bf16_r4(mm); break; + case GGML_TYPE_BF16: set_mul_mat_bf16_r16(mm); break; #endif default: return false; } diff --git a/ggml/src/iqk/iqk_quantize.cpp b/ggml/src/iqk/iqk_quantize.cpp index 3a0a2565..abe81858 100644 --- a/ggml/src/iqk/iqk_quantize.cpp +++ b/ggml/src/iqk/iqk_quantize.cpp @@ -4787,11 +4787,11 @@ void repack_bf16(int nrows, int n_per_row, const T * x, ggml_bf16_t * y) { } } -void repack_f32_bf16_r4(const void * src, void * dst, int64_t nrows, int64_t n_per_row) { +void repack_f32_bf16_r16(const void * src, void * dst, int64_t nrows, int64_t n_per_row) { repack_bf16(nrows, n_per_row, (const float *)src, (ggml_bf16_t *)dst); } -void repack_bf16_bf16_r4(const void * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row) { +void repack_bf16_bf16_r16(const void * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row) { repack_bf16(nrows, n_per_row, (const ggml_bf16_t *)src, (ggml_bf16_t *)dst); } diff --git a/ggml/src/iqk/iqk_quantize.h b/ggml/src/iqk/iqk_quantize.h index 10754f21..e8721a5e 100644 --- a/ggml/src/iqk/iqk_quantize.h +++ b/ggml/src/iqk/iqk_quantize.h @@ -158,8 +158,8 @@ void quantize_row_q8_K16(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, void quantize_row_q8_K32(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k); void quantize_row_q8_KR8(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k); -void repack_f32_bf16_r4 (const void * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row); -void repack_bf16_bf16_r4(const void * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row); +void repack_f32_bf16_r16 (const void * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row); +void repack_bf16_bf16_r16(const void * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row); #ifdef __cplusplus } diff --git a/include/llama.h b/include/llama.h index 10ed9cc5..988ffec7 100644 --- a/include/llama.h +++ b/include/llama.h @@ -191,7 +191,7 @@ extern "C" { LLAMA_FTYPE_MOSTLY_IQ4_NL_R4 = 225, // except 1d tensors LLAMA_FTYPE_MOSTLY_IQ4_XS_R4 = 230, // except 1d tensors LLAMA_FTYPE_MOSTLY_Q6_0_R4 = 335, // except 1d tensors - LLAMA_FTYPE_MOSTLY_BF16_R4 = 232, // except 1d tensors + LLAMA_FTYPE_MOSTLY_BF16_R16 = 232, // except 1d tensors LLAMA_FTYPE_MOSTLY_IQ2_BN_R4 = 337, // except 1d tensors LLAMA_FTYPE_MOSTLY_IQ4_K_R4 = 340, // except 1d tensors LLAMA_FTYPE_MOSTLY_Q8_K_R8 = 399, // except 1d tensors diff --git a/src/llama.cpp b/src/llama.cpp index 8de6f5f3..536b2f97 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -3828,7 +3828,7 @@ struct llama_model_loader { case GGML_TYPE_F32: ftype = LLAMA_FTYPE_ALL_F32; break; case GGML_TYPE_F16: ftype = LLAMA_FTYPE_MOSTLY_F16; break; case GGML_TYPE_BF16: ftype = LLAMA_FTYPE_MOSTLY_BF16; break; - case GGML_TYPE_BF16_R4: ftype = LLAMA_FTYPE_MOSTLY_BF16_R4; break; + case GGML_TYPE_BF16_R16:ftype = LLAMA_FTYPE_MOSTLY_BF16_R16;break; case GGML_TYPE_Q4_0: ftype = LLAMA_FTYPE_MOSTLY_Q4_0; break; case GGML_TYPE_Q4_1: ftype = LLAMA_FTYPE_MOSTLY_Q4_1; break; case GGML_TYPE_Q5_0: ftype = LLAMA_FTYPE_MOSTLY_Q5_0; break; @@ -4541,7 +4541,7 @@ static std::string llama_model_ftype_name(llama_ftype ftype) { case LLAMA_FTYPE_ALL_F32: return "all F32"; case LLAMA_FTYPE_MOSTLY_F16: return "F16"; case LLAMA_FTYPE_MOSTLY_BF16: return "BF16"; - case LLAMA_FTYPE_MOSTLY_BF16_R4: return "BF16_R4"; + case LLAMA_FTYPE_MOSTLY_BF16_R16: return "BF16_R16"; case LLAMA_FTYPE_MOSTLY_Q4_0: return "Q4_0"; case LLAMA_FTYPE_MOSTLY_Q4_1: return "Q4_1"; case LLAMA_FTYPE_MOSTLY_Q5_0: return "Q5_0"; @@ -15835,7 +15835,7 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n else if (new_type == GGML_TYPE_Q8_0_R4) { new_type = GGML_TYPE_Q8_0; } - else if (new_type == GGML_TYPE_BF16_R4) { + else if (new_type == GGML_TYPE_BF16_R16) { new_type = GGML_TYPE_BF16; } } @@ -16233,7 +16233,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s case LLAMA_FTYPE_MOSTLY_Q8_0: default_type = GGML_TYPE_Q8_0; break; case LLAMA_FTYPE_MOSTLY_F16: default_type = GGML_TYPE_F16; break; case LLAMA_FTYPE_MOSTLY_BF16: default_type = GGML_TYPE_BF16; break; - case LLAMA_FTYPE_MOSTLY_BF16_R4: default_type = GGML_TYPE_BF16_R4; break; + case LLAMA_FTYPE_MOSTLY_BF16_R16: default_type = GGML_TYPE_BF16_R16; break; case LLAMA_FTYPE_ALL_F32: default_type = GGML_TYPE_F32; break; // K-quants @@ -16526,7 +16526,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s if (quantize) { new_type = default_type; - if (new_type == GGML_TYPE_BF16_R4 && strcmp(tensor->name, "token_embd.weight") == 0) { + if (new_type == GGML_TYPE_BF16_R16 && strcmp(tensor->name, "token_embd.weight") == 0) { new_type = GGML_TYPE_BF16; } @@ -16689,7 +16689,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s if (tensor->ne[1] % 4 != 0) new_type = GGML_TYPE_IQ4_K; else chunk_size_multiplier = 4; } - else if (new_type == GGML_TYPE_BF16_R4) { + else if (new_type == GGML_TYPE_BF16_R16) { if (tensor->ne[1] % 16 != 0) new_type = GGML_TYPE_BF16; else chunk_size_multiplier = 16; }