mirror of
https://github.com/ikawrakow/ik_llama.cpp.git
synced 2026-04-30 19:31:48 +00:00
On Zen4 repack fp16 models to bf16_r16 when run-time-repacking is requested (#174)
This massively improves performance. Because run-time repacking is opt-in, we do not worry about the possible precision loss in the f16 -> bf16 conversion.

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
This commit is contained in:
@@ -5260,6 +5260,7 @@ inline ggml_bf16_t to_bf16(const float& x) {
|
|||||||
helper.f = x;
|
helper.f = x;
|
||||||
return ggml_bf16_t{(uint16_t)(helper.u >> 16)};
|
return ggml_bf16_t{(uint16_t)(helper.u >> 16)};
|
||||||
}
|
}
|
||||||
|
// fp16 -> bf16: widen the half-precision input to fp32 with GGML_FP16_TO_FP32,
// then hand the result to the float overload of to_bf16 for the final narrowing.
inline ggml_bf16_t to_bf16(const ggml_half& x) {
    const float widened = GGML_FP16_TO_FP32(x);
    return to_bf16(widened);
}
|
||||||
// bf16 -> bf16: identity overload; the input is returned unchanged.
inline ggml_bf16_t to_bf16(const ggml_bf16_t& x) {
    return x;
}
|
// bf16 -> bf16: identity overload; the input is returned unchanged.
inline ggml_bf16_t to_bf16(const ggml_bf16_t& x) {
    return x;
}
|
||||||
template <typename T>
|
template <typename T>
|
||||||
void repack_bf16(int nrows, int n_per_row, const T * x, ggml_bf16_t * y) {
|
void repack_bf16(int nrows, int n_per_row, const T * x, ggml_bf16_t * y) {
|
||||||
@@ -6082,7 +6083,8 @@ void iqk_repack_tensor(struct ggml_tensor * tensor) {
|
|||||||
{ GGML_TYPE_Q8_0, { GGML_TYPE_Q8_0_R4, 4, (Repack::repack_func)repack_q8_0} },
|
{ GGML_TYPE_Q8_0, { GGML_TYPE_Q8_0_R4, 4, (Repack::repack_func)repack_q8_0} },
|
||||||
{ GGML_TYPE_Q8_K, { GGML_TYPE_Q8_K_R8, 8, (Repack::repack_func)repack_q8_k} },
|
{ GGML_TYPE_Q8_K, { GGML_TYPE_Q8_K_R8, 8, (Repack::repack_func)repack_q8_k} },
|
||||||
#ifdef __AVX512BF16__
|
#ifdef __AVX512BF16__
|
||||||
{ GGML_TYPE_BF16, { GGML_TYPE_BF16_R16, 16, (Repack::repack_func)repack_bf16<ggml_bf16_t>} },
|
{ GGML_TYPE_BF16, { GGML_TYPE_BF16_R16, 16, (Repack::repack_func)repack_bf16<ggml_bf16_t>}},
|
||||||
|
{ GGML_TYPE_F16, { GGML_TYPE_BF16_R16, 16, (Repack::repack_func)repack_bf16<ggml_half>} },
|
||||||
#endif
|
#endif
|
||||||
};
|
};
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user