mirror of https://github.com/ikawrakow/ik_llama.cpp.git, synced 2026-04-30 11:21:56 +00:00
Mistral3-large
@@ -6265,6 +6265,11 @@ ggml_cgraph * llm_build_context::build_deepseek2() {
     // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
     struct ggml_tensor * KQ_mask = build_inp_KQ_mask();

+    ggml_tensor * inp_attn_scale = nullptr;
+    if (hparams.f_attn_temp_scale != 0.0f) {
+        inp_attn_scale = build_input_scale(n_tokens);
+    }
+
     // whether to use n_tokens as the matrix dimension during multiplication or n_head
     // n_tokens is higher during prompt processing, this allows to optimize for this case
     bool pp_opt = n_tokens >= 128; // Is it a fixed constant or is it somehow relared to n_head? original: n_tokens > n_head;
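
The diff does not show what build_input_scale() returns; presumably it creates a small F32 input tensor holding one scale value per token position, driven by the two hparams loaded further down (f_attn_temp_scale and n_attn_temp_floor_scale). The standalone sketch below assumes the attention-temperature formula used by mainline llama.cpp for models with attention temperature tuning; the actual implementation in ik_llama.cpp may differ.

    // Hypothetical standalone sketch of the per-token attention temperature
    // scale that build_input_scale() presumably provides. The formula is an
    // assumption borrowed from the analogous feature in mainline llama.cpp:
    //   scale(pos) = log(floor((pos + 1) / n_attn_temp_floor_scale) + 1) * f_attn_temp_scale + 1
    #include <cmath>
    #include <cstdint>
    #include <cstdio>
    #include <vector>

    static std::vector<float> make_attn_temp_scales(const std::vector<int32_t> & pos,
                                                    float f_attn_temp_scale,
                                                    uint32_t n_attn_temp_floor_scale) {
        std::vector<float> scales(pos.size(), 1.0f);
        if (f_attn_temp_scale == 0.0f || n_attn_temp_floor_scale == 0) {
            return scales; // feature disabled: multiplying by 1 is a no-op
        }
        for (std::size_t i = 0; i < pos.size(); ++i) {
            scales[i] = std::log(std::floor((pos[i] + 1.0f) / n_attn_temp_floor_scale) + 1.0f)
                        * f_attn_temp_scale + 1.0f;
        }
        return scales;
    }

    int main() {
        const std::vector<int32_t> pos = {0, 4096, 32768, 131072};
        for (float s : make_attn_temp_scales(pos, 0.1f, 8192)) {
            std::printf("%.4f\n", s); // scale grows slowly with token position
        }
        return 0;
    }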
@@ -6455,10 +6460,14 @@ ggml_cgraph * llm_build_context::build_deepseek2() {
         }
         cb(k_rope, "k_rope", il);

-        //auto q = ggml_concat(ctx0, q_nope, q_rope, 0);
         auto q = ggml_concat(ctx0, q_rope, q_nope, 0);
-        q = ggml_permute(ctx0, q, 0, 2, 1, 3);
         cb(q, "q_concat", il);
+        if (inp_attn_scale) {
+            q = ggml_mul(ctx0, q, inp_attn_scale);
+            cb(q, "q_concat_temp_scaled", il);
+        }
+        q = ggml_permute(ctx0, q, 0, 2, 1, 3);
+        cb(q, "q_concat_permuted", il);

         ggml_build_forward_expand(gf, q);

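
Moving the permute after the scaling means the per-token multiplier is applied to q before the tensor is rearranged for the batched matmuls. As a minimal plain-C++ illustration of the broadcast that ggml_mul(q, inp_attn_scale) performs at this step, the sketch below assumes q is laid out as [n_embd_head, n_head, n_tokens] at that point and that inp_attn_scale carries a single value per token; neither shape is shown in this diff.

    // Hypothetical sketch: every element belonging to token t, in every head,
    // is multiplied by scale[t]. The [head_dim, n_head, n_tokens] layout is an
    // assumption; it is flattened here with the head dimension innermost.
    #include <cstddef>
    #include <cstdio>
    #include <vector>

    static void scale_q_per_token(std::vector<float> & q, const std::vector<float> & scale,
                                  int head_dim, int n_head, int n_tokens) {
        for (int t = 0; t < n_tokens; ++t) {
            for (int h = 0; h < n_head; ++h) {
                for (int d = 0; d < head_dim; ++d) {
                    q[((std::size_t) t*n_head + h)*head_dim + d] *= scale[t];
                }
            }
        }
    }

    int main() {
        const int head_dim = 2, n_head = 2, n_tokens = 3;
        std::vector<float> q(head_dim*n_head*n_tokens, 1.0f);
        const std::vector<float> scale = {1.0f, 1.1f, 1.2f}; // one multiplier per token
        scale_q_per_token(q, scale, head_dim, n_head, n_tokens);
        for (float v : q) std::printf("%g ", v); // tokens 1 and 2 are scaled up in every head
        std::printf("\n");
        return 0;
    }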
@@ -6528,9 +6537,12 @@ ggml_cgraph * llm_build_context::build_deepseek2() {
         struct ggml_tensor * q_nope2 = ggml_mul_mat(ctx0, wk_b, q_nope);
         cb(q_nope2, "q_nope2", il);

-        //ggml_tensor * q = ggml_concat(ctx0, q_nope2, ggml_permute(ctx0, q_rope, 0, 2, 1, 3), 0);
         ggml_tensor * q = ggml_concat(ctx0, ggml_permute(ctx0, q_rope, 0, 2, 1, 3), q_nope2, 0);
         cb(q, "q", il);
+        if (inp_attn_scale) {
+            q = ggml_mul(ctx0, q, inp_attn_scale);
+            cb(q, "q_temp_scales", il);
+        }

         if (lctx.cparams.flash_attn && (lctx.cparams.mla_attn == 1 || lctx.cparams.mla_attn == 3)) {
             ggml_tensor * kv_cache_lora = ggml_view_2d(ctx0, kv_self.k_l[il],
@@ -6680,6 +6692,11 @@ ggml_cgraph * llm_build_context::build_deepseek2() {
         struct ggml_tensor * q_states = ggml_concat(ctx0, q_nope, q_rope, 0);
         cb(q_states, "q_states", il);

+        if (inp_attn_scale) {
+            q_states = ggml_mul(ctx0, q_states, inp_attn_scale);
+            cb(q_states, "q_states_temp_scales", il);
+        }
+
         struct ggml_tensor * k_states = ggml_concat(ctx0, k_nope, ggml_repeat(ctx0, k_rope, q_rope), 0);
         cb(k_states, "k_states", il);

@@ -786,6 +786,11 @@ void llm_load_hparams(
            }
            ml.get_key(LLM_KV_ROPE_SCALING_YARN_LOG_MUL, hparams.rope_yarn_log_mul, false);

+           hparams.f_attn_temp_scale = 0;
+           hparams.n_attn_temp_floor_scale = 0;
+           ml.get_key(LLM_KV_ATTENTION_TEMPERATURE_SCALE, hparams.f_attn_temp_scale, false);
+           ml.get_key(LLM_KV_ATTENTION_TEMPERATURE_LENGTH, hparams.n_attn_temp_floor_scale, false);
+
            switch (hparams.n_layer) {
                case 27: model.type = e_model::MODEL_16B; break;
                case 60: model.type = e_model::MODEL_236B; break;
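
The two new hparams default to 0 and are only overwritten when the GGUF metadata actually contains the corresponding keys; the final false argument to ml.get_key makes the lookup optional rather than required. Combined with the f_attn_temp_scale != 0.0f guard in build_deepseek2() above, models whose files lack these keys keep the previous behaviour. Below is a minimal sketch of that optional-lookup pattern, using a hypothetical metadata map in place of the real GGUF reader; the key strings are placeholders, not the actual names behind LLM_KV_ATTENTION_TEMPERATURE_SCALE/LENGTH.

    // Hypothetical stand-in for ml.get_key(..., /*required=*/false): leaves the
    // destination untouched when the key is absent, so the zero defaults survive.
    #include <cstdio>
    #include <map>
    #include <string>

    static bool get_key_opt(const std::map<std::string, float> & meta,
                            const std::string & key, float & out) {
        const auto it = meta.find(key);
        if (it == meta.end()) {
            return false; // key absent: keep the caller's default
        }
        out = it->second;
        return true;
    }

    int main() {
        std::map<std::string, float> meta; // a model file without the new metadata

        float f_attn_temp_scale       = 0.0f; // defaults keep the feature disabled
        float n_attn_temp_floor_scale = 0.0f; // (types simplified for this sketch)

        get_key_opt(meta, "attention.temperature_scale",  f_attn_temp_scale);      // placeholder key name
        get_key_opt(meta, "attention.temperature_length", n_attn_temp_floor_scale); // placeholder key name

        // build_deepseek2() only creates inp_attn_scale when this stays non-zero
        std::printf("attention temperature scaling: %s\n",
                    f_attn_temp_scale != 0.0f ? "enabled" : "disabled");
        return 0;
    }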