Mirror of https://github.com/ikawrakow/ik_llama.cpp.git, synced 2026-02-27 00:24:11 +00:00.
R4 improvements on ARM_NEON (#125)
* q4_0_r4: 6% faster PP on NEON
* qx_0_r4_q8_0 template. Applied to q4_0_r4 and q5_0_r4. It makes q5_0_r4 PP ~7% faster.
* Apply qx_0_r4_q8_0 template also to q6_0_r4 and iq4_nl_x4
* Simplify
* Minor iq4_xs_r4 improvement on NEON

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
This commit is contained in:
@@ -16569,6 +16569,10 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
                if (tensor->ne[1] % 4 != 0) new_type = GGML_TYPE_Q4_0;
                else chunk_size_multiplier = 4;
            }
            else if (new_type == GGML_TYPE_Q5_0_R4) {
                if (tensor->ne[1] % 4 != 0) new_type = GGML_TYPE_Q5_0;
                else chunk_size_multiplier = 4;
            }
            else if (new_type == GGML_TYPE_Q6_0_R4) {
                if (tensor->ne[1] % 4 != 0) new_type = GGML_TYPE_Q6_0;
                else chunk_size_multiplier = 4;
Reference in New Issue
Block a user