diff --git a/src/llama.cpp b/src/llama.cpp
index 0020c77f..92ede50c 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -16078,12 +16078,20 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
     else if (qs.model.hparams.n_expert >= 8 && name.find("attn_k") != std::string::npos) {
         new_type = GGML_TYPE_Q4_K_R4;
     }
+    else if (qs.model.hparams.n_expert >= 8 && (name.find("blk.0.ffn_down") != std::string::npos ||
+                                                name.find("blk.0.ffn_gate") != std::string::npos ||
+                                                name.find("blk.0.ffn_up") != std::string::npos)) {
+        new_type = GGML_TYPE_IQ3_K_R4;
+    }
     else if (qs.model.hparams.n_expert >= 8 && name.find("attn_q") != std::string::npos) {
         new_type = GGML_TYPE_Q4_K_R4;
     }
     else if (name.find("attn_qkv.weight") != std::string::npos) {
         new_type = GGML_TYPE_IQ2_K_R4;
     }
+    else if (name.find("_shexp.weight") != std::string::npos) {
+        new_type = GGML_TYPE_IQ4_K_R4;
+    }
     else if (name.find("ffn_down") != std::string::npos) {
         auto [i_layer, n_layer] = layer_info(qs.i_ffn_down, qs.n_ffn_down, name.c_str());
         if (qs.params->ffn_down_type < GGML_TYPE_COUNT) new_type = qs.params->ffn_down_type;