iq1_s_r4: more bits for shared experts

With this quantization mix we arrive at PPL(512) = 9.4140
for DeepSeek-Lite, using 1.766 bpw (bits per weight) for the repeating layers.

On the Ryzen-7950X we get PP-512 = 494 t/s and
TG-128 = 52 t/s @ 16 threads.
This commit is contained in:
Iwan Kawrakow
2025-02-04 19:49:28 +02:00
parent 25ffa8703a
commit 19d384302b

View File

@@ -16078,12 +16078,20 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
else if (qs.model.hparams.n_expert >= 8 && name.find("attn_k") != std::string::npos) {
new_type = GGML_TYPE_Q4_K_R4;
}
else if (qs.model.hparams.n_expert >= 8 && (name.find("blk.0.ffn_down") != std::string::npos ||
name.find("blk.0.ffn_gate") != std::string::npos ||
name.find("blk.0.ffn_up") != std::string::npos)) {
new_type = GGML_TYPE_IQ3_K_R4;
}
else if (qs.model.hparams.n_expert >= 8 && name.find("attn_q") != std::string::npos) {
new_type = GGML_TYPE_Q4_K_R4;
}
else if (name.find("attn_qkv.weight") != std::string::npos) {
new_type = GGML_TYPE_IQ2_K_R4;
}
else if (name.find("_shexp.weight") != std::string::npos) {
new_type = GGML_TYPE_IQ4_K_R4;
}
else if (name.find("ffn_down") != std::string::npos) {
auto [i_layer, n_layer] = layer_info(qs.i_ffn_down, qs.n_ffn_down, name.c_str());
if (qs.params->ffn_down_type < GGML_TYPE_COUNT) new_type = qs.params->ffn_down_type;