mirror of
https://github.com/ikawrakow/ik_llama.cpp.git
synced 2026-04-24 16:39:45 +00:00
Additional graph reduce types for split mode graph (#1154)
* WIP: add Q8_0 and BF16 as possible reduce types Does not work - there is a big somewhere * This finally works
This commit is contained in:
@@ -633,6 +633,27 @@ static ggml_tensor * get_input_tensor_sm_graph(ggml_tensor * input, int id) {
|
||||
return cur;
|
||||
}
|
||||
|
||||
static inline ggml_tensor * do_split_norm(ggml_context * ctx, ggml_tensor * cur, ggml_tensor * the_norm, const llama_hparams & hparams,
|
||||
const llm_build_cb & cb, int id, int il_cb, bool is_norm) {
|
||||
if (the_norm && the_norm->extra) {
|
||||
auto norm = (ggml_split_tensor_t *)the_norm->extra;
|
||||
GGML_ASSERT(norm->splits[id]);
|
||||
//if (cur->type != GGML_TYPE_F16 && cur->type != GGML_TYPE_F32) {
|
||||
// cur = ggml_cast(ctx, cur, GGML_TYPE_F32);
|
||||
//}
|
||||
if (is_norm) {
|
||||
cur = ggml_fused_norm(ctx, cur, norm->splits[id], hparams.f_norm_eps);
|
||||
} else {
|
||||
cur = llm_build_context::llm_build_norm(ctx, cur, hparams, norm->splits[id], NULL, LLM_NORM_RMS, cb, il_cb);
|
||||
}
|
||||
cb(cur, "inp_normed", il_cb);
|
||||
}
|
||||
if (cur->type != GGML_TYPE_F32) {
|
||||
cur = ggml_cast(ctx, cur, GGML_TYPE_F32);
|
||||
}
|
||||
return cur;
|
||||
}
|
||||
|
||||
ggml_tensor * llm_build_context::llm_build_ffn(
|
||||
ggml_context * ctx,
|
||||
llama_context & lctx,
|
||||
@@ -673,19 +694,7 @@ ggml_tensor * llm_build_context::llm_build_ffn(
|
||||
GGML_ASSERT((!split_u && !split_g && !split_d) || (split_u && split_g && split_d));
|
||||
if (!split_u) continue;
|
||||
auto cur = get_input_tensor_sm_graph(input, id);
|
||||
if (ffn_norm && ffn_norm->extra) {
|
||||
auto norm = (ggml_split_tensor_t *)ffn_norm->extra;
|
||||
GGML_ASSERT(norm->splits[id]);
|
||||
if (is_norm) {
|
||||
cur = ggml_fused_norm(ctx, cur, norm->splits[id], lctx.model.hparams.f_norm_eps);
|
||||
} else {
|
||||
cur = llm_build_norm(ctx, cur, lctx.model.hparams, norm->splits[id], NULL, LLM_NORM_RMS, cb, il);
|
||||
}
|
||||
cb(cur, "ffn_inp_normed", il_cb);
|
||||
}
|
||||
else if (cur->type != GGML_TYPE_F32) {
|
||||
cur = ggml_cast(ctx, cur, GGML_TYPE_F32);
|
||||
}
|
||||
cur = do_split_norm(ctx, cur, ffn_norm, lctx.model.hparams, cb, id, il_cb, is_norm);
|
||||
cur = ggml_fused_up_gate(ctx, split_u, split_g, cur, unary_op);
|
||||
cb(cur, "ffn_up_gate", il_cb);
|
||||
cur = llm_build_lora_mm(lctx, ctx, split_d, cur);
|
||||
@@ -694,8 +703,8 @@ ggml_tensor * llm_build_context::llm_build_ffn(
|
||||
// GLM4 and GLM4_MOE seem to have numerical issues with half-precision accumulators
|
||||
ggml_mul_mat_set_prec(cur, GGML_PREC_F32);
|
||||
}
|
||||
if (cur->ne[1] > 32 && lctx.cparams.split_mode_f16) {
|
||||
cur = ggml_cast(ctx, cur, GGML_TYPE_F16);
|
||||
if (cur->ne[1] > 32 && lctx.cparams.reduce_type != GGML_TYPE_F32) {
|
||||
cur = ggml_cast(ctx, cur, lctx.cparams.reduce_type);
|
||||
}
|
||||
if (add_extra && add_extra->op == GGML_OP_REDUCE && add_extra->op_params[3] == 1) {
|
||||
// When the reduce op is turned off via op_params[3] == 1, we need to add each src
|
||||
@@ -1205,8 +1214,8 @@ llm_expert_gating_func_type gating_op,
|
||||
split_down_shexp->splits[id], split_down_b_shexp ? split_down_b_shexp->splits[id] : nullptr, nullptr,
|
||||
nullptr, type_op_shexp, LLM_FFN_PAR, cb, il);
|
||||
cb(shared_out, "ffn_shexp_out", il_cb);
|
||||
if (shared_out->ne[1] > 32 && lctx.cparams.split_mode_f16) {
|
||||
shared_out = ggml_cast(ctx, shared_out, GGML_TYPE_F16);
|
||||
if (shared_out->ne[1] > 32 && lctx.cparams.reduce_type != GGML_TYPE_F32) {
|
||||
shared_out = ggml_cast(ctx, shared_out, lctx.cparams.reduce_type);
|
||||
}
|
||||
results.push_back(shared_out);
|
||||
}
|
||||
@@ -1222,8 +1231,8 @@ llm_expert_gating_func_type gating_op,
|
||||
cb(cur, "ffn_shared_combined", il);
|
||||
}
|
||||
}
|
||||
if (routed_out->ne[1] > 32 && lctx.cparams.split_mode_f16) {
|
||||
auto routed_out_f16 = ggml_cast(ctx, routed_out, GGML_TYPE_F16);
|
||||
if (routed_out->ne[1] > 32 && lctx.cparams.reduce_type != GGML_TYPE_F32) {
|
||||
auto routed_out_f16 = ggml_cast(ctx, routed_out, lctx.cparams.reduce_type);
|
||||
cur = ggml_add(ctx, routed_out_f16, cur);
|
||||
} else {
|
||||
cur = ggml_add(ctx, routed_out, cur);
|
||||
@@ -1269,15 +1278,16 @@ llm_expert_gating_func_type gating_op,
|
||||
if (!split_up_exps->splits[id]) continue;
|
||||
int il_cb = 1000*(id + 1) + il;
|
||||
auto cur = get_input_tensor_sm_graph(input, id);
|
||||
if (ffn_norm) {
|
||||
auto split_ffn_norm = (ggml_split_tensor_t *)ffn_norm->extra;
|
||||
GGML_ASSERT(split_ffn_norm && split_ffn_norm->n_device == split_up_exps->n_device);
|
||||
cur = llm_build_norm(ctx, cur, lctx.model.hparams, split_ffn_norm->splits[id], nullptr, LLM_NORM_RMS, cb, il);
|
||||
cb(cur, "ffn_inp_normed", il_cb);
|
||||
}
|
||||
if (cur->type != GGML_TYPE_F32) {
|
||||
cur = ggml_cast(ctx, cur, GGML_TYPE_F32);
|
||||
}
|
||||
cur = do_split_norm(ctx, cur, ffn_norm, lctx.model.hparams, cb, id, il_cb, false);
|
||||
//if (ffn_norm) {
|
||||
// auto split_ffn_norm = (ggml_split_tensor_t *)ffn_norm->extra;
|
||||
// GGML_ASSERT(split_ffn_norm && split_ffn_norm->n_device == split_up_exps->n_device);
|
||||
// cur = llm_build_norm(ctx, cur, lctx.model.hparams, split_ffn_norm->splits[id], nullptr, LLM_NORM_RMS, cb, il);
|
||||
// cb(cur, "ffn_inp_normed", il_cb);
|
||||
//}
|
||||
//if (cur->type != GGML_TYPE_F32) {
|
||||
// cur = ggml_cast(ctx, cur, GGML_TYPE_F32);
|
||||
//}
|
||||
GGML_ASSERT(!split_gate_inp_b || split_gate_inp_b->splits[id]);
|
||||
GGML_ASSERT(!split_exps_down_b || split_exps_down_b->splits[id]);
|
||||
GGML_ASSERT(!split_exps_gate_b || split_exps_gate_b->splits[id]);
|
||||
@@ -1309,8 +1319,8 @@ llm_expert_gating_func_type gating_op,
|
||||
} else {
|
||||
cur = routed_out;
|
||||
}
|
||||
if (cur->ne[1] > 32 && lctx.cparams.split_mode_f16) {
|
||||
cur = ggml_cast(ctx, cur, GGML_TYPE_F16);
|
||||
if (cur->ne[1] > 32 && lctx.cparams.reduce_type != GGML_TYPE_F32) {
|
||||
cur = ggml_cast(ctx, cur, lctx.cparams.reduce_type);
|
||||
cb(cur, "ffn_out_f16", il_cb);
|
||||
}
|
||||
ggml_build_forward_expand(graph, cur);
|
||||
@@ -9180,7 +9190,7 @@ ggml_tensor * llm_build_context::build_std_attention(ggml_cgraph * gf, ggml_tens
|
||||
if (!model.layers[il].wqkv && !model.layers[il].wqk && cparams.flash_attn &&
|
||||
model.layers[il].wq->extra && model.layers[il].wk->extra && model.layers[il].wv->extra && model.layers[il].wo->extra) {
|
||||
if (kv_self.k_l[il]->extra && kv_self.v_l[il]->extra) {
|
||||
ggml_split_tensor_t * attn_norm = the_attn_norm ? (ggml_split_tensor_t *)the_attn_norm->extra : nullptr;
|
||||
//ggml_split_tensor_t * attn_norm = the_attn_norm ? (ggml_split_tensor_t *)the_attn_norm->extra : nullptr;
|
||||
auto wq = (ggml_split_tensor_t *)model.layers[il].wq->extra;
|
||||
auto wk = (ggml_split_tensor_t *)model.layers[il].wk->extra;
|
||||
auto wv = (ggml_split_tensor_t *)model.layers[il].wv->extra;
|
||||
@@ -9221,16 +9231,17 @@ ggml_tensor * llm_build_context::build_std_attention(ggml_cgraph * gf, ggml_tens
|
||||
(split_wq && split_wk && split_wv && split_wo && split_kl && split_vl));
|
||||
if (!split_wq) continue;
|
||||
auto cur = get_input_tensor_sm_graph(input, id);
|
||||
if (attn_norm) {
|
||||
if (is_norm) {
|
||||
cur = ggml_fused_norm(ctx0, cur, attn_norm->splits[id], lctx.model.hparams.f_norm_eps);
|
||||
} else {
|
||||
cur = llm_build_norm(ctx0, cur, lctx.model.hparams, attn_norm->splits[id], NULL, LLM_NORM_RMS, cb, il);
|
||||
}
|
||||
}
|
||||
if (cur->type != GGML_TYPE_F32) {
|
||||
cur = ggml_cast(ctx0, cur, GGML_TYPE_F32);
|
||||
}
|
||||
cur = do_split_norm(ctx0, cur, the_attn_norm, lctx.model.hparams, cb, id, il_cb, is_norm);
|
||||
//if (attn_norm) {
|
||||
// if (is_norm) {
|
||||
// cur = ggml_fused_norm(ctx0, cur, attn_norm->splits[id], lctx.model.hparams.f_norm_eps);
|
||||
// } else {
|
||||
// cur = llm_build_norm(ctx0, cur, lctx.model.hparams, attn_norm->splits[id], NULL, LLM_NORM_RMS, cb, il);
|
||||
// }
|
||||
//}
|
||||
//if (cur->type != GGML_TYPE_F32) {
|
||||
// cur = ggml_cast(ctx0, cur, GGML_TYPE_F32);
|
||||
//}
|
||||
auto the_q_norm = model.layers[il].attn_q_norm ? model.layers[il].attn_q_norm->extra ?
|
||||
((ggml_split_tensor_t *)model.layers[il].attn_q_norm->extra)->splits[id] : model.layers[il].attn_q_norm : nullptr;
|
||||
auto the_k_norm = model.layers[il].attn_k_norm ? model.layers[il].attn_k_norm->extra ?
|
||||
@@ -9368,8 +9379,8 @@ ggml_tensor * llm_build_context::build_std_attention(ggml_cgraph * gf, ggml_tens
|
||||
cb(cur, "kqv_wo_biased", il_cb);
|
||||
output_bias_added = true;
|
||||
}
|
||||
if (cur->ne[1] > 32 && lctx.cparams.split_mode_f16) {
|
||||
cur = ggml_cast(ctx0, cur, GGML_TYPE_F16);
|
||||
if (cur->ne[1] > 32 && lctx.cparams.reduce_type != GGML_TYPE_F32) {
|
||||
cur = ggml_cast(ctx0, cur, lctx.cparams.reduce_type);
|
||||
}
|
||||
ggml_build_forward_expand(gf, cur);
|
||||
attn[id] = cur;
|
||||
|
||||
@@ -41,11 +41,12 @@ struct llama_cparams {
|
||||
bool graph_reuse;
|
||||
bool k_cache_hadamard;
|
||||
bool split_mode_graph_scheduling;
|
||||
bool split_mode_f16;
|
||||
//bool split_mode_f16;
|
||||
bool scheduler_async;
|
||||
int min_experts;
|
||||
float thresh_experts;
|
||||
|
||||
enum ggml_type reduce_type;
|
||||
enum llama_pooling_type pooling_type;
|
||||
|
||||
ggml_backend_sched_eval_callback cb_eval;
|
||||
|
||||
@@ -4070,6 +4070,7 @@ struct llama_context_params llama_context_default_params() {
|
||||
/*.cb_eval_user_data =*/ nullptr,
|
||||
/*.type_k =*/ GGML_TYPE_F16,
|
||||
/*.type_v =*/ GGML_TYPE_F16,
|
||||
/*.type_reduce =*/ GGML_TYPE_F16,
|
||||
/*.logits_all =*/ false,
|
||||
/*.embeddings =*/ false,
|
||||
/*.offload_kqv =*/ true,
|
||||
@@ -4087,7 +4088,7 @@ struct llama_context_params llama_context_default_params() {
|
||||
/*.only_active_experts =*/ false,
|
||||
/*.k_cache_hadamard =*/ false,
|
||||
/*.split_mode_graph_scheduling =*/ false,
|
||||
/*.split_mode_f16 =*/ true,
|
||||
// /*.split_mode_f16 =*/ true,
|
||||
/*.scheduler_async =*/ false,
|
||||
/*.abort_callback =*/ nullptr,
|
||||
/*.abort_callback_data =*/ nullptr,
|
||||
@@ -4382,6 +4383,8 @@ struct llama_context * llama_new_context_with_model(
|
||||
struct llama_model * model,
|
||||
struct llama_context_params params) {
|
||||
|
||||
printf("===================================== %s: %s\n", __func__, ggml_type_name(params.type_reduce));
|
||||
|
||||
if (!model) {
|
||||
LLAMA_LOG_ERROR("%s: model cannot be NULL\n", __func__);
|
||||
return nullptr;
|
||||
@@ -4452,12 +4455,13 @@ struct llama_context * llama_new_context_with_model(
|
||||
cparams.graph_reuse = params.graph_reuse;
|
||||
cparams.k_cache_hadamard = params.k_cache_hadamard;
|
||||
cparams.split_mode_graph_scheduling = params.split_mode_graph_scheduling;
|
||||
cparams.split_mode_f16 = params.split_mode_f16;
|
||||
//cparams.split_mode_f16 = params.split_mode_f16;
|
||||
cparams.scheduler_async = params.scheduler_async;
|
||||
cparams.min_experts = params.min_experts;
|
||||
cparams.thresh_experts = params.thresh_experts;
|
||||
cparams.cuda_params = params.cuda_params;
|
||||
|
||||
cparams.reduce_type = params.type_reduce;
|
||||
cparams.pooling_type = params.pooling_type;
|
||||
|
||||
cparams.n_ctx = params.n_ctx == 0 ? hparams.n_ctx_train : params.n_ctx;
|
||||
@@ -4527,12 +4531,19 @@ struct llama_context * llama_new_context_with_model(
|
||||
cparams.mla_attn = 0;
|
||||
}
|
||||
if (model->arch == LLM_ARCH_OPENAI_MOE && model->split_mode == LLAMA_SPLIT_MODE_GRAPH) {
|
||||
if (cparams.split_mode_f16) {
|
||||
//if (cparams.split_mode_f16) {
|
||||
// LLAMA_LOG_WARN("=====================================================================\n");
|
||||
// LLAMA_LOG_WARN("GPT-OSS with split mode graph requires f32 precision\n");
|
||||
// LLAMA_LOG_WARN(" => changing cparams.split_mode_f16 to 'false'\n");
|
||||
// LLAMA_LOG_WARN("=====================================================================\n");
|
||||
// cparams.split_mode_f16 = false;
|
||||
//}
|
||||
if (cparams.reduce_type == GGML_TYPE_F16) {
|
||||
LLAMA_LOG_WARN("=====================================================================\n");
|
||||
LLAMA_LOG_WARN("GPT-OSS with split mode graph requires f32 precision\n");
|
||||
LLAMA_LOG_WARN(" => changing cparams.split_mode_f16 to 'false'\n");
|
||||
LLAMA_LOG_WARN("=====================================================================\n");
|
||||
cparams.split_mode_f16 = false;
|
||||
cparams.reduce_type = GGML_TYPE_F32;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -4552,7 +4563,8 @@ struct llama_context * llama_new_context_with_model(
|
||||
LLAMA_LOG_INFO("%s: graph_reuse = %d\n", __func__, cparams.graph_reuse);
|
||||
LLAMA_LOG_INFO("%s: k_cache_hadam = %d\n", __func__, cparams.k_cache_hadamard);
|
||||
LLAMA_LOG_INFO("%s: split_mode_graph_scheduling = %d\n", __func__, cparams.split_mode_graph_scheduling);
|
||||
LLAMA_LOG_INFO("%s: split_mode_f16= %d\n", __func__, cparams.split_mode_f16);
|
||||
//LLAMA_LOG_INFO("%s: split_mode_f16= %d\n", __func__, cparams.split_mode_f16);
|
||||
LLAMA_LOG_INFO("%s: reduce_type = %s\n", __func__, ggml_type_name(cparams.reduce_type));
|
||||
LLAMA_LOG_INFO("%s: sched_async = %d\n", __func__, cparams.scheduler_async);
|
||||
LLAMA_LOG_INFO("%s: ser = %d, %g\n", __func__, cparams.min_experts, cparams.thresh_experts);
|
||||
LLAMA_LOG_INFO("%s: freq_base = %.1f\n", __func__, cparams.rope_freq_base);
|
||||
|
||||
Reference in New Issue
Block a user