* q5_k_r4: WIP

* q5_k_r4: Zen4 and AVX2

On Zen4 we get PP-512(LLaMA-3.1-8B) = 248.3 t/s.
For comparison, Q5_K_S has PP-512 = 190 t/s.

* q5_k_r4: NEON

We get PP-512(LLaMA-3.1-8B) = 96.1 t/s.

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
This commit is contained in:
Kawrakow
2024-12-10 18:13:47 +01:00
committed by GitHub
parent c819fa651b
commit a63a96b5ae
10 changed files with 454 additions and 1 deletions

View File

@@ -413,6 +413,7 @@ extern "C" {
GGML_TYPE_Q5_0_R4 = 206,
GGML_TYPE_Q8_0_R4 = 208,
GGML_TYPE_Q4_K_R4 = 212,
GGML_TYPE_Q5_K_R4 = 213,
GGML_TYPE_Q6_K_R4 = 214,
GGML_TYPE_IQ4_NL_R4 = 220,
GGML_TYPE_IQ4_XS_R4 = 223,
@@ -481,6 +482,7 @@ extern "C" {
GGML_FTYPE_MOSTLY_Q8_0_R4 = 207, // except 1d tensors
GGML_FTYPE_MOSTLY_Q5_0_R4 = 208, // except 1d tensors
GGML_FTYPE_MOSTLY_Q4_K_R4 = 212, // except 1d tensors
GGML_FTYPE_MOSTLY_Q5_K_R4 = 215, // except 1d tensors
GGML_FTYPE_MOSTLY_Q6_K_R4 = 214, // except 1d tensors
GGML_FTYPE_MOSTLY_IQ4_NL_R4 = 219, // except 1d tensors
GGML_FTYPE_MOSTLY_IQ4_XS_R4 = 222, // except 1d tensors