mirror of https://github.com/ikawrakow/ik_llama.cpp.git
Deepseek-Lite (#184)
* Quantization mixes tweaks
* Make iq4_nl_r4 work with row sizes that are not a multiple of 128 ... on Zen4
* Make iq4_nl_r4 work with row sizes that are not a multiple of 128 ... on AVX2
* Make q6_0_r4 work with row sizes that are not a multiple of 128 ... on Zen4
* Make q5_0_r4 work with row sizes that are not a multiple of 128 ... on Zen4 and AVX2
* Make q5_0_r4, q6_0_r4, and iq4_nl_r4 work with row sizes that are not a multiple of 128 also on NEON

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
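The row-size fixes all share one pattern: the interleaved R4 kernels march over a row in 128-element strides (presumably four 32-element quantization blocks per SIMD iteration), so rows whose length is a multiple of 32 but not of 128, as occur in Deepseek-Lite, need a narrower tail loop. Below is a minimal sketch of that structure under those assumptions; process_block128 and process_block32 are hypothetical stand-ins, not the repository's actual Zen4/AVX2/NEON kernels.

#include <cstdint>
#include <cstdio>

// Hypothetical stand-ins for the vectorized per-block workers; the real
// SIMD kernels are architecture-specific and far more involved.
static void process_block128(const float * x) { (void)x; std::printf("128-wide step\n"); }
static void process_block32 (const float * x) { (void)x; std::printf("32-wide tail step\n"); }

// Main loop in 128-element strides, then a 32-element tail for the remainder.
// ggml rows are always a multiple of the quantization block size (32 for
// IQ4_NL, Q5_0 and Q6_0), so no scalar cleanup is needed after the tail loop.
static void process_row(const float * row, int64_t n_per_row) {
    int64_t i = 0;
    for (; i + 128 <= n_per_row; i += 128) process_block128(row + i);
    for (; i +  32 <= n_per_row; i +=  32) process_block32 (row + i);
}

int main() {
    float row[576] = {};      // 576 = 4*128 + 2*32: a multiple of 32, not of 128
    process_row(row, 576);    // four 128-wide steps, two 32-wide tail steps
    return 0;
}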
@@ -16075,7 +16075,10 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
         else new_type = ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || is_iq2_m ? GGML_TYPE_IQ3_S : GGML_TYPE_Q2_K;
         ++qs.i_attention_wv;
     }
-    else if (qs.model.hparams.n_expert == 8 && name.find("attn_k.weight") != std::string::npos) {
+    else if (qs.model.hparams.n_expert >= 8 && name.find("attn_k") != std::string::npos) {
+        new_type = GGML_TYPE_Q4_K;
+    }
+    else if (qs.model.hparams.n_expert >= 8 && name.find("attn_q") != std::string::npos) {
         new_type = GGML_TYPE_Q4_K;
     }
     else if (name.find("attn_qkv.weight") != std::string::npos) {
@@ -16088,7 +16091,7 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
         ++qs.i_ffn_down;
     }
     else if (name.find("attn_output.weight") != std::string::npos) {
-        if (qs.model.hparams.n_expert == 8) {
+        if (qs.model.hparams.n_expert >= 4) {
             new_type = GGML_TYPE_Q5_K;
         } else {
             if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_S || ftype == LLAMA_FTYPE_MOSTLY_IQ1_M) new_type = GGML_TYPE_IQ2_K;
@@ -16188,9 +16191,9 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
             else if (new_type == GGML_TYPE_Q5_K) new_type = GGML_TYPE_Q6_K;
         }
         ++qs.i_attention_wv;
-    } else if (name.find("attn_k.weight") != std::string::npos) {
+    } else if (name.find("attn_k") != std::string::npos) {
         if (qs.params->attn_k_type < GGML_TYPE_COUNT) new_type = qs.params->attn_k_type;
-        else if (qs.model.hparams.n_expert == 8) {
+        else if (qs.model.hparams.n_expert >= 8) {
             // for the 8-expert model, bumping this to Q8_0 trades just ~128MB
             // TODO: explore better strategies
             new_type = GGML_TYPE_Q8_0;
@@ -16201,8 +16204,13 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
         else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS_R4) {
             new_type = GGML_TYPE_IQ2_S;
         }
-    } else if (name.find("attn_q.weight") != std::string::npos) {
+    } else if (name.find("attn_q") != std::string::npos) {
         if (qs.params->attn_q_type < GGML_TYPE_COUNT) new_type = qs.params->attn_q_type;
+        else if (qs.model.hparams.n_expert >= 8) {
+            // for the 8-expert model, bumping this to Q8_0 trades just ~128MB
+            // TODO: explore better strategies
+            new_type = GGML_TYPE_Q8_0;
+        }
         else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS) {
             new_type = GGML_TYPE_IQ3_XXS;
         }
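Two things change in the quantization mix. First, the expert-count checks are relaxed from == 8 to >= 8 (and to >= 4 for attn_output.weight), so MoE models with more than eight experts, such as Deepseek-Lite, now get the same attention-tensor treatment as Mixtral-style 8-expert models. Second, the name patterns drop the ".weight" suffix, so any tensor whose name merely contains "attn_k" or "attn_q" takes these branches. A small self-contained check of the matching semantics; the attn_k_b name is a hypothetical illustration, not a claim about this model's tensor names:

#include <cassert>
#include <string>

int main() {
    const std::string old_pat = "attn_k.weight";
    const std::string new_pat = "attn_k";

    // The classic attention K projection matches both patterns ...
    std::string name = "blk.0.attn_k.weight";
    assert(name.find(old_pat) != std::string::npos);
    assert(name.find(new_pat) != std::string::npos);

    // ... but a name that merely contains "attn_k" (hypothetical example)
    // is only caught by the relaxed pattern.
    name = "blk.0.attn_k_b.weight";
    assert(name.find(old_pat) == std::string::npos);
    assert(name.find(new_pat) != std::string::npos);
    return 0;
}

Because std::string::find does substring matching, the relaxed pattern covers renamed or decomposed attention projections without listing each suffix explicitly.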