On Zen4 repack fp16 models to bf16_r16 when run-time-repacking is requested (#174)

This massively improves performance. As this is opt-in, we do not worry about possible precision loss in the f16 -> bf16 conversion.

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
2026-04-30 19:31:48 +00:00 · 2025-01-21 19:19:38 +02:00
parent 1e44bdf6e5
commit 09d4a8ad90
1 changed file with 3 additions and 1 deletion
--- a/ggml/src/iqk/iqk_quantize.cpp
+++ b/ggml/src/iqk/iqk_quantize.cpp
@@ -5260,6 +5260,7 @@ inline ggml_bf16_t to_bf16(const float& x) {
    helper.f = x;
    return ggml_bf16_t{(uint16_t)(helper.u >> 16)};
 }
 inline ggml_bf16_t to_bf16(const ggml_half& x) { return to_bf16(GGML_FP16_TO_FP32(x)); } // fp16 input: passes GGML_FP16_TO_FP32(x) to another to_bf16 overload (presumably the float one above - confirm)
 inline ggml_bf16_t to_bf16(const ggml_bf16_t& x) { return x; } // bf16 input: returned unchanged
 template <typename T>
 void repack_bf16(int nrows, int n_per_row, const T * x, ggml_bf16_t * y) {
@@ -6082,7 +6083,8 @@ void iqk_repack_tensor(struct ggml_tensor * tensor) {
        { GGML_TYPE_Q8_0,   { GGML_TYPE_Q8_0_R4,   4,  (Repack::repack_func)repack_q8_0}    },
        { GGML_TYPE_Q8_K,   { GGML_TYPE_Q8_K_R8,   8,  (Repack::repack_func)repack_q8_k}    },
 #ifdef __AVX512BF16__
-        { GGML_TYPE_BF16,   { GGML_TYPE_BF16_R16, 16,  (Repack::repack_func)repack_bf16<ggml_bf16_t>}    },
+        { GGML_TYPE_BF16,   { GGML_TYPE_BF16_R16, 16,  (Repack::repack_func)repack_bf16<ggml_bf16_t>}},
        { GGML_TYPE_F16,    { GGML_TYPE_BF16_R16, 16,  (Repack::repack_func)repack_bf16<ggml_half>}  },
 #endif
    };