mirror of
https://github.com/ikawrakow/ik_llama.cpp.git
synced 2026-02-28 17:14:17 +00:00
iq1_s_r4: more bits for shared experts
With this mix we arrive at PPL(512) = 9.4140 for DeepSeek-Lite using 1.766 bpw for the repeating layers. On the Ryzen-7950X we get prompt processing PP-512 = 494 t/s and token generation TG-128 = 52 t/s @ 16 threads.
This commit is contained in:
@@ -16078,12 +16078,20 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
|
||||
else if (qs.model.hparams.n_expert >= 8 && name.find("attn_k") != std::string::npos) {
|
||||
new_type = GGML_TYPE_Q4_K_R4;
|
||||
}
|
||||
else if (qs.model.hparams.n_expert >= 8 && (name.find("blk.0.ffn_down") != std::string::npos ||
|
||||
name.find("blk.0.ffn_gate") != std::string::npos ||
|
||||
name.find("blk.0.ffn_up") != std::string::npos)) {
|
||||
new_type = GGML_TYPE_IQ3_K_R4;
|
||||
}
|
||||
else if (qs.model.hparams.n_expert >= 8 && name.find("attn_q") != std::string::npos) {
|
||||
new_type = GGML_TYPE_Q4_K_R4;
|
||||
}
|
||||
else if (name.find("attn_qkv.weight") != std::string::npos) {
|
||||
new_type = GGML_TYPE_IQ2_K_R4;
|
||||
}
|
||||
else if (name.find("_shexp.weight") != std::string::npos) {
|
||||
new_type = GGML_TYPE_IQ4_K_R4;
|
||||
}
|
||||
else if (name.find("ffn_down") != std::string::npos) {
|
||||
auto [i_layer, n_layer] = layer_info(qs.i_ffn_down, qs.n_ffn_down, name.c_str());
|
||||
if (qs.params->ffn_down_type < GGML_TYPE_COUNT) new_type = qs.params->ffn_down_type;
|
||||
|
||||
Reference in New Issue
Block a user