iq1_s_r4: more bits for shared experts

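With this mix (IQ4_K_R4 for the shared-expert `_shexp.weight` tensors and, for models with at least 8 experts, IQ3_K_R4 for the `blk.0` ffn_down/ffn_gate/ffn_up tensors) we arrive at PPL(512) = 9.4140 for DeepSeek-Lite using 1.766 bpw for the repeating layers. On the Ryzen-7950X we get PP-512 = 494 t/s and TG-128 = 52 t/s @ 16 threads.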
2026-02-28 17:14:17 +00:00 · 2025-02-04 19:49:28 +02:00
parent 25ffa8703a
commit 19d384302b
1 changed file with 8 additions and 0 deletions
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -16078,12 +16078,20 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
        else if (qs.model.hparams.n_expert >= 8 && name.find("attn_k") != std::string::npos) {
            new_type = GGML_TYPE_Q4_K_R4;
        }
+        else if (qs.model.hparams.n_expert >= 8 && (name.find("blk.0.ffn_down") != std::string::npos ||
+                                                    name.find("blk.0.ffn_gate") != std::string::npos ||
+                                                    name.find("blk.0.ffn_up") != std::string::npos)) {
+            new_type = GGML_TYPE_IQ3_K_R4;
+        }
        else if (qs.model.hparams.n_expert >= 8 && name.find("attn_q") != std::string::npos) {
            new_type = GGML_TYPE_Q4_K_R4;
        }
        else if (name.find("attn_qkv.weight") != std::string::npos) {
            new_type = GGML_TYPE_IQ2_K_R4;
        }
+        else if (name.find("_shexp.weight") != std::string::npos) {
+            new_type = GGML_TYPE_IQ4_K_R4;
+        }
        else if (name.find("ffn_down") != std::string::npos) {
            auto [i_layer, n_layer] = layer_info(qs.i_ffn_down, qs.n_ffn_down, name.c_str());
            if (qs.params->ffn_down_type < GGML_TYPE_COUNT) new_type = qs.params->ffn_down_type;