mirror of
https://github.com/ikawrakow/ik_llama.cpp.git
synced 2026-04-30 03:11:51 +00:00
Merge ffn_up and ffn_gate experts tensors (#1137)
* WIP - not working * WIP - not working * WIP - GPT-OSS working However, extremely stupid. The only way I could correctly repack the up/gate experts is to copy up and gate into host buffers, repack into another host buffer, copy back into the ffn_up_gate_exps tensor. This is going to be very slow for giant 500 GB models. My attempts to do this via a compute graph on the backend holding the tensors was unsuccessful. For GPT-OSS-20B I see ~6-7% better PP when using the original ik_llama.cpp fused_up_gate CUDA implementation, and ~10% when using the small batch size implementation. Other models are not working yet on CUDA as I need to fix the fused mul-unary implementation. * WIP * WIP - Qwen3-MoE (and hopefully all others) working But when I say here and in the previous commit "working", I mean PP is working. TG is still broken. * WIP: TG seems to be working * Minor * Add command line option to merge experts up/gate * Add merge up/gate command line parameter to llama-bench * Turn off merge_up_gate_exps if split mode graph It is not yet implemented * When no bias, allow merging up/gate with tensor overrides * Arghh, we need to increase the context size again * Cleanup
This commit is contained in:
@@ -910,7 +910,8 @@ ggml_tensor * llm_build_context::llm_build_moe_ffn(
|
||||
bool scale_w,
|
||||
float w_scale,
|
||||
llm_expert_gating_func_type gating_op,
|
||||
const llm_build_cb & cb, int il, ggml_cgraph * graph, bool add_input) {
|
||||
const llm_build_cb & cb, int il, ggml_cgraph * graph, bool add_input,
|
||||
ggml_tensor * up_gate_exps, ggml_tensor * up_gate_exps_b) {
|
||||
|
||||
auto input = cur;
|
||||
|
||||
@@ -1025,6 +1026,19 @@ llm_expert_gating_func_type gating_op,
|
||||
bool can_use_fmoe = type_op == LLM_FFN_SILU || type_op == LLM_FFN_GELU || type_op == LLM_FFN_SWIGLU_OAI_MOE;
|
||||
|
||||
ggml_tensor * par;
|
||||
if (can_use_fmoe && up_gate_exps) {
|
||||
if (up_gate_exps_b) {
|
||||
par = ggml_moe_up_gate_ext(ctx, up_gate_exps, nullptr, cur, selected_experts, up_gate_exps_b, nullptr,
|
||||
type_op == LLM_FFN_SILU ? GGML_UNARY_OP_SILU :
|
||||
type_op == LLM_FFN_GELU ? GGML_UNARY_OP_GELU : GGML_UNARY_OP_SWIGLU_OAI);
|
||||
} else {
|
||||
GGML_ASSERT(type_op != LLM_FFN_SWIGLU_OAI_MOE);
|
||||
par = ggml_moe_up_gate(ctx, up_gate_exps, nullptr, cur, selected_experts,
|
||||
type_op == LLM_FFN_SILU ? GGML_UNARY_OP_SILU : GGML_UNARY_OP_GELU);
|
||||
}
|
||||
} else {
|
||||
GGML_ASSERT(!up_gate_exps && !up_gate_exps_b);
|
||||
|
||||
if (can_use_fmoe && lctx.cparams.fused_moe_up_gate && up_exps->type == gate_exps->type) {
|
||||
if (up_exps_b || gate_exps_b) {
|
||||
par = ggml_moe_up_gate_ext(ctx, up_exps, gate_exps, cur, selected_experts, up_exps_b, gate_exps_b,
|
||||
@@ -1069,6 +1083,7 @@ llm_expert_gating_func_type gating_op,
|
||||
GGML_ABORT("fatal error");
|
||||
}
|
||||
|
||||
}
|
||||
}
|
||||
cb(par, "ffn_moe_gate_par", il);
|
||||
|
||||
@@ -1130,7 +1145,8 @@ ggml_tensor * llm_build_context::llm_build_std_moe_ffn(ggml_context * ctx, llama
|
||||
float w_scale,
|
||||
llm_expert_gating_func_type gating_op,
|
||||
llm_ffn_op_type type_op_shexp,
|
||||
const llm_build_cb & cb, int il, ggml_cgraph * graph, bool add_input) {
|
||||
const llm_build_cb & cb, int il, ggml_cgraph * graph, bool add_input,
|
||||
ggml_tensor * up_gate_exps, ggml_tensor * up_gate_exps_b) {
|
||||
|
||||
auto split_up_exps = (ggml_split_tensor_t *)up_exps->extra;
|
||||
auto split_gate_exps = (ggml_split_tensor_t *)gate_exps->extra;
|
||||
@@ -1164,7 +1180,7 @@ llm_expert_gating_func_type gating_op,
|
||||
the_exp_probs_b,
|
||||
n_expert, n_expert_used,
|
||||
type_op, norm_w, scale_w, w_scale,
|
||||
gating_op, cb, il, graph, false);
|
||||
gating_op, cb, il, graph, false, up_gate_exps, up_gate_exps_b);
|
||||
cb(routed_out, "routed_out", il);
|
||||
if (add_input) {
|
||||
routed_out = ggml_add(ctx, routed_out, input);
|
||||
@@ -4047,7 +4063,8 @@ ggml_cgraph * llm_build_context::build_qwen3moe() {
|
||||
n_expert, n_expert_used,
|
||||
LLM_FFN_SILU, true, false, 0.0f,
|
||||
LLM_EXPERT_GATING_FUNC_SOFTMAX,
|
||||
LLM_FFN_SILU, cb, il, gf, true);
|
||||
LLM_FFN_SILU, cb, il, gf, true,
|
||||
model.layers[il].ffn_up_gate_exps);
|
||||
|
||||
//printf("%s: ffn = %s(%s)\n", __func__, cur->name, ggml_op_name(cur->op));
|
||||
|
||||
@@ -8410,7 +8427,8 @@ ggml_cgraph * llm_build_context::build_openai_moe() {
|
||||
n_expert, n_expert_used,
|
||||
LLM_FFN_SWIGLU_OAI_MOE, false, false, 0.0f,
|
||||
LLM_EXPERT_GATING_FUNC_TYPE_SOFTMAX_WEIGHT,
|
||||
LLM_FFN_SWIGLU_OAI_MOE, cb, il, gf, true);
|
||||
LLM_FFN_SWIGLU_OAI_MOE, cb, il, gf, true,
|
||||
model.layers[il].ffn_up_gate_exps, model.layers[il].ffn_up_gate_exps_b);
|
||||
|
||||
cur = lctx.cvec.apply_to(ctx0, cur, il);
|
||||
cb(cur, "l_out", il);
|
||||
|
||||
@@ -354,7 +354,8 @@ struct llm_build_context {
|
||||
bool scale_w,
|
||||
float w_scale,
|
||||
llm_expert_gating_func_type gating_op,
|
||||
const llm_build_cb & cb, int il, ggml_cgraph * graph = nullptr, bool add_input = false);
|
||||
const llm_build_cb & cb, int il, ggml_cgraph * graph = nullptr, bool add_input = false,
|
||||
ggml_tensor * up_gate_exps = nullptr, ggml_tensor * up_gate_exps_b = nullptr);
|
||||
|
||||
static ggml_tensor * llm_build_moe_ffn(ggml_context * ctx, llama_context & lctx,
|
||||
ggml_tensor * cur,
|
||||
@@ -370,7 +371,8 @@ llm_expert_gating_func_type gating_op,
|
||||
bool scale_w,
|
||||
float w_scale,
|
||||
llm_expert_gating_func_type gating_op,
|
||||
const llm_build_cb & cb, int il, ggml_cgraph * graph = nullptr, bool add_input = false) {
|
||||
const llm_build_cb & cb, int il, ggml_cgraph * graph = nullptr, bool add_input = false,
|
||||
ggml_tensor * up_gate_exps = nullptr, ggml_tensor * up_gate_exps_b = nullptr) {
|
||||
return llm_build_moe_ffn(ctx, lctx, cur,
|
||||
gate_inp, nullptr,
|
||||
up_exps, nullptr,
|
||||
@@ -379,7 +381,7 @@ llm_expert_gating_func_type gating_op,
|
||||
exp_probs_b,
|
||||
n_expert, n_expert_used,
|
||||
type_op, norm_w, scale_w, w_scale,
|
||||
gating_op, cb, il, graph, add_input);
|
||||
gating_op, cb, il, graph, add_input, up_gate_exps, up_gate_exps_b);
|
||||
}
|
||||
|
||||
static ggml_tensor * llm_build_std_moe_ffn(ggml_context * ctx, llama_context & lctx,
|
||||
@@ -401,7 +403,8 @@ llm_expert_gating_func_type gating_op,
|
||||
float w_scale,
|
||||
llm_expert_gating_func_type gating_op,
|
||||
llm_ffn_op_type type_op_shexp,
|
||||
const llm_build_cb & cb, int il, ggml_cgraph * graph, bool add_input = false);
|
||||
const llm_build_cb & cb, int il, ggml_cgraph * graph, bool add_input = false,
|
||||
ggml_tensor * up_gate_exps = nullptr, ggml_tensor * up_gate_exps_b = nullptr);
|
||||
|
||||
static ggml_cgraph * llama_build_graph_defrag(llama_context & lctx, const std::vector<uint32_t> & ids);
|
||||
|
||||
|
||||
@@ -31,6 +31,8 @@ struct create_tensors_helper : public create_tensors_helper_interface {
|
||||
|
||||
bool merge_qkv(const LLM_TN & tn, int i, int bias, bool ignore_attn_scale = false);
|
||||
|
||||
bool merge_up_gate_exps(const LLM_TN & tn, int i, int bias);
|
||||
|
||||
bool create_tensors() override;
|
||||
|
||||
bool create_llama_tensors(const LLM_TN & tn);
|
||||
@@ -141,6 +143,8 @@ struct create_tensors_helper : public create_tensors_helper_interface {
|
||||
ggml_tensor * create_tensor(ggml_context * ctx, const std::string & name, const std::vector<int64_t> & ne, int flags = 0,
|
||||
ggml_context ** actual_ctx = nullptr);
|
||||
|
||||
ggml_context * get_context_for_tensor(ggml_context * ctx, const std::string & name);
|
||||
|
||||
void create_default_embd_output(const LLM_TN & tn, int n_embd, int n_vocab, bool norm_bias);
|
||||
void create_embd_output(const LLM_TN & tn, int n_embd, int n_vocab, bool has_norm = true, bool use_ctx_split = false);
|
||||
|
||||
@@ -195,7 +199,10 @@ create_tensors_helper::create_tensors_helper(llama_model_loader & _ml, llama_mod
|
||||
buft_layer_count[model.buft_layer[i].buft_matrix]++;
|
||||
}
|
||||
|
||||
ctx_size = ggml_tensor_overhead()*(ml.n_tensors + 1); // +1 for models where tok_embd is duplicated as output
|
||||
auto n_tensors = ml.n_tensors;
|
||||
if (ml.merge_qkv) n_tensors += n_layer;
|
||||
if (ml.merge_up_gate_exps) n_tensors += n_layer;
|
||||
ctx_size = ggml_tensor_overhead()*(n_tensors + 1); // +1 for models where tok_embd is duplicated as output
|
||||
ctx_size += ggml_tensor_overhead()*n_layer*3; // for moe merged tensors
|
||||
|
||||
if (model.splits.size() > 1) {
|
||||
@@ -288,9 +295,7 @@ static std::vector<int> create_split(int nr, int granularity, const std::vector<
|
||||
return result;
|
||||
}
|
||||
|
||||
ggml_tensor * create_tensors_helper::create_tensor(ggml_context * ctx, const std::string & name, const std::vector<int64_t> & ne,
|
||||
int flags, ggml_context ** actual_context) {
|
||||
//auto requested_ctx = ctx;
|
||||
ggml_context * create_tensors_helper::get_context_for_tensor(ggml_context * ctx, const std::string & name) {
|
||||
if (ml.tensor_buft_overrides) {
|
||||
for (const auto * overrides = ml.tensor_buft_overrides; overrides->pattern != nullptr; ++overrides) {
|
||||
std::regex pattern(overrides->pattern);
|
||||
@@ -301,6 +306,12 @@ ggml_tensor * create_tensors_helper::create_tensor(ggml_context * ctx, const std
|
||||
}
|
||||
}
|
||||
}
|
||||
return ctx;
|
||||
}
|
||||
|
||||
ggml_tensor * create_tensors_helper::create_tensor(ggml_context * ctx, const std::string & name, const std::vector<int64_t> & ne,
|
||||
int flags, ggml_context ** actual_context) {
|
||||
ctx = get_context_for_tensor(ctx, name);
|
||||
if (actual_context) *actual_context = ctx;
|
||||
auto tensor = ml.create_tensor(ctx, name, ne, flags);
|
||||
if (tensor && ctx == split_ctx) {
|
||||
@@ -1168,9 +1179,14 @@ bool create_tensors_helper::create_qwen3_moe_tensors(const LLM_TN & tn) {
|
||||
// MoE branch
|
||||
const int64_t n_ff_exp = hparams.n_ff_exp ? hparams.n_ff_exp : n_ff / n_expert_used;
|
||||
|
||||
layer.ffn_gate_exps = create_tensor(ffn_ctx, tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert});
|
||||
bool merged = ml.merge_up_gate_exps && merge_up_gate_exps(tn, i, 0);
|
||||
if (merged) {
|
||||
use_mmap_buffer = false;
|
||||
} else {
|
||||
layer.ffn_up_exps = create_tensor(ffn_ctx, tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert});
|
||||
layer.ffn_gate_exps = create_tensor(ffn_ctx, tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert});
|
||||
}
|
||||
layer.ffn_down_exps = create_tensor(ffn_ctx, tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff_exp, n_embd, n_expert});
|
||||
layer.ffn_up_exps = create_tensor(ffn_ctx, tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert});
|
||||
}
|
||||
return use_mmap_buffer;
|
||||
}
|
||||
@@ -2572,9 +2588,18 @@ bool create_tensors_helper::create_openai_moe_tensors(const LLM_TN & tn) {
|
||||
|
||||
ggml_context *ctx_ffn_gate, *ctx_ffn_up, *ctx_ffn_down;
|
||||
layer.ffn_gate_inp = create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), { n_embd, n_expert}, 0);
|
||||
layer.ffn_gate_exps = create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert}, 0, &ctx_ffn_gate);
|
||||
layer.ffn_down_exps = create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff_exp, n_embd, n_expert}, 0, &ctx_ffn_down);
|
||||
layer.ffn_up_exps = create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert}, 0, &ctx_ffn_up);
|
||||
bool merged = ml.merge_up_gate_exps && merge_up_gate_exps(tn, i, 2);
|
||||
use_mmap_buffer &= !merged;
|
||||
if (merged) {
|
||||
ctx_ffn_gate = ctx_ffn_up = ctx_split;
|
||||
} else {
|
||||
layer.ffn_up_exps = create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i),
|
||||
{ n_embd, n_ff_exp, n_expert}, 0, &ctx_ffn_up);
|
||||
layer.ffn_gate_exps = create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i),
|
||||
{ n_embd, n_ff_exp, n_expert}, 0, &ctx_ffn_gate);
|
||||
}
|
||||
layer.ffn_down_exps = create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i),
|
||||
{n_ff_exp, n_embd, n_expert}, 0, &ctx_ffn_down);
|
||||
|
||||
// bias
|
||||
layer.ffn_gate_inp_b = create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE_INP, "bias", i), {n_expert}, 0);
|
||||
@@ -2582,15 +2607,17 @@ bool create_tensors_helper::create_openai_moe_tensors(const LLM_TN & tn) {
|
||||
auto ctx_gate_b = ctx_ffn_gate == ctx_split ? ctx_split : ctx_layer;
|
||||
auto ctx_down_b = ctx_ffn_down == ctx_split ? ctx_split : ctx_layer;
|
||||
auto ctx_up_b = ctx_ffn_up == ctx_split ? ctx_split : ctx_layer;
|
||||
layer.ffn_gate_exps_b = create_tensor(ctx_gate_b, tn(LLM_TENSOR_FFN_GATE_EXPS, "bias", i), {n_ff_exp, n_expert}, 0, &ctx_ffn_gate_b);
|
||||
if (!merged) {
|
||||
layer.ffn_up_exps_b = create_tensor(ctx_up_b, tn(LLM_TENSOR_FFN_UP_EXPS, "bias", i), {n_ff_exp, n_expert}, 0, &ctx_ffn_up_b);
|
||||
layer.ffn_gate_exps_b = create_tensor(ctx_gate_b, tn(LLM_TENSOR_FFN_GATE_EXPS, "bias", i), {n_ff_exp, n_expert}, 0, &ctx_ffn_gate_b);
|
||||
}
|
||||
layer.ffn_down_exps_b = create_tensor(ctx_down_b, tn(LLM_TENSOR_FFN_DOWN_EXPS, "bias", i), { n_embd, n_expert}, 0, &ctx_ffn_down_b);
|
||||
layer.ffn_up_exps_b = create_tensor(ctx_up_b, tn(LLM_TENSOR_FFN_UP_EXPS, "bias", i), {n_ff_exp, n_expert}, 0, &ctx_ffn_up_b);
|
||||
|
||||
if (ctx_ffn_gate_b != ctx_ffn_gate) {
|
||||
if (!merged && ctx_ffn_gate_b != ctx_ffn_gate) {
|
||||
layer.ffn_gate_exps_b_dup = create_tensor(ctx_ffn_gate, tn(LLM_TENSOR_FFN_GATE_EXPS, "bias", i), {n_ff_exp, n_expert},
|
||||
llama_model_loader::TENSOR_DUPLICATED);
|
||||
}
|
||||
if (ctx_ffn_up_b != ctx_ffn_up) {
|
||||
if (!merged && ctx_ffn_up_b != ctx_ffn_up) {
|
||||
layer.ffn_up_exps_b_dup = create_tensor(ctx_ffn_up, tn(LLM_TENSOR_FFN_UP_EXPS, "bias", i), {n_ff_exp, n_expert},
|
||||
llama_model_loader::TENSOR_DUPLICATED);
|
||||
}
|
||||
@@ -2654,6 +2681,71 @@ bool create_tensors_helper::create_smollm3_tensors(const LLM_TN & tn) {
|
||||
return use_mmap_buffer;
|
||||
}
|
||||
|
||||
bool create_tensors_helper::merge_up_gate_exps(const LLM_TN & tn, int i, int bias) {
|
||||
ggml_context * ctx_split = ctx_for_layer_split(i);
|
||||
|
||||
auto & layer = model.layers[i];
|
||||
|
||||
auto u_name = tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i);
|
||||
auto g_name = tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i);
|
||||
auto u_meta = ml.require_tensor_meta(u_name.c_str());
|
||||
auto g_meta = ml.require_tensor_meta(g_name.c_str());
|
||||
|
||||
if (u_meta->type != g_meta->type || u_meta->ne[0] != g_meta->ne[0] || u_meta->ne[2] != g_meta->ne[2]) {
|
||||
printf("%s: not merging because up/fate meta info is different\n", __func__);
|
||||
return false;
|
||||
}
|
||||
|
||||
auto u_ctx = get_context_for_tensor(ctx_split, u_name);
|
||||
auto g_ctx = get_context_for_tensor(ctx_split, g_name);
|
||||
|
||||
if (u_ctx != g_ctx) {
|
||||
printf("%s: not merging because of context\n", __func__);
|
||||
return false;
|
||||
}
|
||||
|
||||
if (bias && (u_ctx != ctx_split || g_ctx != ctx_split)) {
|
||||
printf("%s: not merging because of context\n", __func__);
|
||||
return false;
|
||||
}
|
||||
|
||||
printf("%s: merging up/gate in layer %d\n", __func__, i);
|
||||
|
||||
layer.ffn_up_gate_exps = ggml_new_tensor_3d(u_ctx, u_meta->type, u_meta->ne[0], u_meta->ne[1] + g_meta->ne[1], u_meta->ne[2]);
|
||||
snprintf(layer.ffn_up_gate_exps->name, GGML_MAX_NAME, "blk.%d.ffn_up_gate_exps.weight", i);
|
||||
layer.ffn_up_exps = ml.create_tensor_as_view(u_ctx, layer.ffn_up_gate_exps, u_name.c_str(),
|
||||
{ u_meta->ne[0], u_meta->ne[1], u_meta->ne[2] }, 0);
|
||||
layer.ffn_gate_exps = ml.create_tensor_as_view(u_ctx, layer.ffn_up_gate_exps, g_name.c_str(),
|
||||
{ g_meta->ne[0], g_meta->ne[1], g_meta->ne[2] }, ggml_nbytes(layer.ffn_up_exps) ); //u_meta->ne[1]*u_meta->nb[1] );
|
||||
|
||||
if (!bias) return true;
|
||||
|
||||
auto u_name_b = tn(LLM_TENSOR_FFN_UP_EXPS, "bias", i);
|
||||
auto g_name_b = tn(LLM_TENSOR_FFN_GATE_EXPS, "bias", i);
|
||||
auto u_meta_b = ml.get_tensor_meta(u_name_b.c_str());
|
||||
auto g_meta_b = ml.get_tensor_meta(g_name_b.c_str());
|
||||
if (bias == 2) {
|
||||
GGML_ASSERT(u_meta_b && g_meta_b);
|
||||
GGML_ASSERT(u_meta_b->type == g_meta_b->type);
|
||||
GGML_ASSERT(u_meta_b->ne[1] == g_meta_b->ne[1]);
|
||||
} else {
|
||||
GGML_ASSERT(!u_meta_b && !g_meta_b);
|
||||
return true;
|
||||
}
|
||||
|
||||
GGML_ASSERT(u_meta->ne[1] == u_meta_b->ne[0]);
|
||||
GGML_ASSERT(g_meta->ne[1] == g_meta_b->ne[0]);
|
||||
|
||||
layer.ffn_up_gate_exps_b = ggml_new_tensor_2d(ctx_split, u_meta_b->type, u_meta_b->ne[0] + g_meta_b->ne[0], u_meta->ne[1]);
|
||||
snprintf(layer.ffn_up_gate_exps_b->name, GGML_MAX_NAME, "blk.%d.ffn_up_gate_exps.bias", i);
|
||||
layer.ffn_up_exps_b = ml.create_tensor_as_view(ctx_split, layer.ffn_up_gate_exps_b, u_name_b.c_str(),
|
||||
{ u_meta_b->ne[0], u_meta_b->ne[1] }, 0);
|
||||
layer.ffn_gate_exps_b = ml.create_tensor_as_view(ctx_split, layer.ffn_up_gate_exps_b, g_name_b.c_str(),
|
||||
{ g_meta_b->ne[0], g_meta_b->ne[1] }, ggml_nbytes(layer.ffn_up_exps_b) ); //u_meta->nb[1]);
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
bool create_tensors_helper::merge_qkv(const LLM_TN & tn, int i, int bias, bool ignore_attn_scale) {
|
||||
auto& hparams = model.hparams;
|
||||
const int64_t n_head = hparams.n_head();
|
||||
@@ -2849,11 +2941,18 @@ bool create_tensors_helper::create_tensors() {
|
||||
bool use_mmap_buffer = true;
|
||||
if (ml.merge_qkv && (model.split_mode == LLAMA_SPLIT_MODE_GRAPH || model.split_mode == LLAMA_SPLIT_MODE_ATTN)) {
|
||||
LLAMA_LOG_WARN("\n========================================================\n");
|
||||
LLAMA_LOG_WARN("merge_qkv is not compatible with split model 'graph'\n");
|
||||
LLAMA_LOG_WARN("merge_qkv is not compatible with split mode 'graph'\n");
|
||||
LLAMA_LOG_WARN(" => turning off merge_qkv\n");
|
||||
LLAMA_LOG_WARN("========================================================\n\n");
|
||||
ml.merge_qkv = false;
|
||||
}
|
||||
if (ml.merge_up_gate_exps && (model.split_mode == LLAMA_SPLIT_MODE_GRAPH || model.split_mode == LLAMA_SPLIT_MODE_ATTN)) {
|
||||
LLAMA_LOG_WARN("\n========================================================\n");
|
||||
LLAMA_LOG_WARN("merge_up_gate_exps is not compatible with split mode 'graph'\n");
|
||||
LLAMA_LOG_WARN(" => turning off merge_up_gate_exps\n");
|
||||
LLAMA_LOG_WARN("========================================================\n\n");
|
||||
ml.merge_up_gate_exps = false;
|
||||
}
|
||||
switch (model.arch) {
|
||||
case LLM_ARCH_LLAMA:
|
||||
case LLM_ARCH_REFACT:
|
||||
|
||||
@@ -204,7 +204,7 @@ namespace GGUFMeta {
|
||||
}
|
||||
|
||||
llama_model_loader::llama_model_loader(const std::string & fname, bool use_mmap, bool check_tensors,
|
||||
bool repack_tensors, bool use_thp, bool merge_qkv,
|
||||
bool repack_tensors, bool use_thp, bool merge_qkv, bool merge_up_gate_exps,
|
||||
const llama_model_kv_override * param_overrides_p,
|
||||
const llama_model_tensor_buft_override * param_tensor_buft_overrides_p) {
|
||||
int trace = 0;
|
||||
@@ -497,6 +497,7 @@ llama_model_loader::llama_model_loader(const std::string & fname, bool use_mmap,
|
||||
this->repack_tensors = repack_tensors;
|
||||
this->use_thp = use_thp;
|
||||
this->merge_qkv = merge_qkv;
|
||||
this->merge_up_gate_exps = merge_up_gate_exps;
|
||||
}
|
||||
|
||||
llama_model_loader::~llama_model_loader() {
|
||||
|
||||
@@ -45,6 +45,7 @@ struct llama_model_loader {
|
||||
bool repack_tensors = false;
|
||||
bool use_thp = false;
|
||||
bool merge_qkv = false;
|
||||
bool merge_up_gate_exps = false;
|
||||
|
||||
llama_files files;
|
||||
llama_ftype ftype;
|
||||
@@ -79,7 +80,8 @@ struct llama_model_loader {
|
||||
std::string arch_name;
|
||||
LLM_KV llm_kv = LLM_KV(LLM_ARCH_UNKNOWN);
|
||||
|
||||
llama_model_loader(const std::string & fname, bool use_mmap, bool check_tensors, bool repack_tensors, bool use_thp, bool merge_qkv,
|
||||
llama_model_loader(const std::string & fname, bool use_mmap, bool check_tensors, bool repack_tensors, bool use_thp,
|
||||
bool merge_qkv, bool merge_up_gate_exps,
|
||||
const llama_model_kv_override * param_overrides_p,
|
||||
const llama_model_tensor_buft_override * param_tensor_buft_overrides_p);
|
||||
|
||||
|
||||
@@ -236,6 +236,7 @@ struct llama_layer {
|
||||
struct ggml_tensor * ffn_gate_exps = nullptr;
|
||||
struct ggml_tensor * ffn_down_exps = nullptr;
|
||||
struct ggml_tensor * ffn_up_exps = nullptr;
|
||||
struct ggml_tensor * ffn_up_gate_exps = nullptr;
|
||||
|
||||
llama_split_tensor split_ffn_gate_inp;
|
||||
llama_split_tensor split_ffn_up_exps;
|
||||
@@ -247,6 +248,7 @@ struct llama_layer {
|
||||
struct ggml_tensor * ffn_gate_exps_b = nullptr;
|
||||
struct ggml_tensor * ffn_down_exps_b = nullptr;
|
||||
struct ggml_tensor * ffn_up_exps_b = nullptr;
|
||||
struct ggml_tensor * ffn_up_gate_exps_b = nullptr;
|
||||
struct ggml_tensor * ffn_gate_exps_b_dup = nullptr;
|
||||
struct ggml_tensor * ffn_down_exps_b_dup = nullptr;
|
||||
struct ggml_tensor * ffn_up_exps_b_dup = nullptr;
|
||||
|
||||
@@ -1009,7 +1009,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
|
||||
kv_overrides = v->data();
|
||||
}
|
||||
llama_model_loader ml(fname_inp, use_mmap, /*check_tensors*/ true, /* repack_tensors */ false,
|
||||
/* use_thp */ false, /* merge_qkv */ false, kv_overrides, nullptr);
|
||||
/* use_thp */ false, /* merge_qkv */ false, /* merge_up_gate_exps */ false, kv_overrides, nullptr);
|
||||
ml.init_mappings(false); // no prefetching
|
||||
|
||||
llama_model model;
|
||||
|
||||
@@ -1882,6 +1882,9 @@ static bool llm_load_tensors(
|
||||
}
|
||||
|
||||
use_mmap_buffer = cth->create_tensors();
|
||||
if (!use_mmap_buffer) {
|
||||
ml.use_mmap = false;
|
||||
}
|
||||
|
||||
ml.done_getting_tensors();
|
||||
|
||||
@@ -2104,7 +2107,8 @@ static bool llm_load_tensors(
|
||||
static int llama_model_load(const std::string & fname, llama_model & model, llama_model_params & params) {
|
||||
try {
|
||||
llama_model_loader ml(fname, params.use_mmap, params.check_tensors,
|
||||
params.repack_tensors, params.use_thp, params.merge_qkv, params.kv_overrides, params.tensor_buft_overrides);
|
||||
params.repack_tensors, params.use_thp, params.merge_qkv, params.merge_up_gate_exps,
|
||||
params.kv_overrides, params.tensor_buft_overrides);
|
||||
|
||||
model.hparams.vocab_only = params.vocab_only;
|
||||
|
||||
@@ -4017,6 +4021,7 @@ struct llama_model_params llama_model_default_params() {
|
||||
/*.use_thp =*/ false,
|
||||
/*.validate_quants =*/ false,
|
||||
/*.merge_qkv =*/ false,
|
||||
/*.merge_up_gate_exps =*/ false,
|
||||
};
|
||||
|
||||
#ifdef GGML_USE_METAL
|
||||
@@ -4286,6 +4291,80 @@ void llama_free_model(struct llama_model * model) {
|
||||
delete model;
|
||||
}
|
||||
|
||||
// After the weights have been loaded, interleave the separate up and gate
// expert tensors into the merged ffn_up_gate_exps tensor that
// create_tensors_helper::merge_up_gate_exps() allocated. At load time up and
// gate were placed back-to-back (all up experts, then all gate experts); the
// fused kernels need them interleaved per expert: [up_e0 | gate_e0 | up_e1 |
// gate_e1 | ...]. The repack goes through host memory (backend get -> memcpy
// shuffle -> backend set); the commit message notes this is slow for very
// large models but was the only approach that worked on all backends.
static void llama_repack_up_gate_exps(llama_context & lctx) {
    auto & model = lctx.model;
    // Fast exit: only layers where the merge was set up (merged tensor plus
    // both up/gate views present) need repacking.
    bool needs_repack = false;
    for (auto & l : model.layers) {
        if (l.ffn_up_gate_exps && l.ffn_up_exps && l.ffn_gate_exps) {
            needs_repack = true; break;
        }
    }
    if (!needs_repack) return;

    // Host staging buffers, reused (and grown as needed) across layers.
    std::vector<char> aux_buffer_up, aux_buffer_gate, aux_buffer_up_gate;
    for (int il = 0; il < int(model.layers.size()); ++il) {
        auto & l = model.layers[il];
        if (l.ffn_up_gate_exps && l.ffn_up_exps && l.ffn_gate_exps) {
            // Sanity-check the invariants merge_up_gate_exps established:
            // same type, same n_embd (dim 0) and n_expert (dim 2), and the
            // merged dim 1 is the sum of the up and gate row counts.
            GGML_ASSERT(l.ffn_up_gate_exps->type == l.ffn_up_exps->type && l.ffn_up_gate_exps->type == l.ffn_gate_exps->type);
            GGML_ASSERT(l.ffn_up_gate_exps->ne[0] == l.ffn_up_exps->ne[0] && l.ffn_up_gate_exps->ne[0] == l.ffn_gate_exps->ne[0]);
            GGML_ASSERT(l.ffn_up_gate_exps->ne[2] == l.ffn_up_exps->ne[2] && l.ffn_up_gate_exps->ne[2] == l.ffn_gate_exps->ne[2]);
            GGML_ASSERT(l.ffn_up_gate_exps->ne[1] == l.ffn_up_exps->ne[1] + l.ffn_gate_exps->ne[1]);
            auto nbytes = ggml_nbytes(l.ffn_up_exps);
            GGML_ASSERT(nbytes == ggml_nbytes(l.ffn_gate_exps));
            if (nbytes > aux_buffer_up.size()) {
                aux_buffer_up.resize(nbytes);
            }
            if (nbytes > aux_buffer_gate.size()) {
                aux_buffer_gate.resize(nbytes);
            }
            printf("%s: repacking up/gate experts weight in layer %d\n", __func__, il);
            // Copy both halves out of the backend into host memory.
            ggml_backend_tensor_get(l.ffn_up_exps, aux_buffer_up.data(), 0, nbytes);
            ggml_backend_tensor_get(l.ffn_gate_exps, aux_buffer_gate.data(), 0, nbytes);
            if (aux_buffer_up_gate.size() < 2*nbytes) {
                aux_buffer_up_gate.resize(2*nbytes);
            }
            size_t offset_up_gate = 0;
            size_t offset_up = 0;
            // Bytes of one expert's weight slice (assumes the row dimension
            // is densely packed, i.e. expert stride == ne[1]*nb[1]).
            auto expert_size = l.ffn_up_exps->ne[1]*l.ffn_up_exps->nb[1];
            // Interleave per expert: up slice, then gate slice. The same
            // offset_up indexes both source buffers since they are equal-sized.
            for (int i2 = 0; i2 < (int)l.ffn_up_gate_exps->ne[2]; ++i2) {
                std::memcpy(aux_buffer_up_gate.data() + offset_up_gate, aux_buffer_up.data() + offset_up, expert_size);
                offset_up_gate += expert_size;
                std::memcpy(aux_buffer_up_gate.data() + offset_up_gate, aux_buffer_gate.data() + offset_up, expert_size);
                offset_up_gate += expert_size;
                offset_up += expert_size;
            }
            // Write the interleaved result back over the merged tensor.
            ggml_backend_tensor_set(l.ffn_up_gate_exps, aux_buffer_up_gate.data(), 0, 2*expert_size*l.ffn_up_gate_exps->ne[2]);
            // Same dance for the biases, when the model has them (GPT-OSS).
            if (l.ffn_up_gate_exps_b && l.ffn_up_exps_b && l.ffn_gate_exps_b) {
                nbytes = ggml_nbytes(l.ffn_up_exps_b);
                GGML_ASSERT(nbytes == ggml_nbytes(l.ffn_gate_exps_b));
                if (nbytes > aux_buffer_up.size()) {
                    aux_buffer_up.resize(nbytes);
                }
                if (nbytes > aux_buffer_gate.size()) {
                    aux_buffer_gate.resize(nbytes);
                }
                printf("%s: repacking up/gate experts bias in layer %d\n", __func__, il);
                ggml_backend_tensor_get(l.ffn_up_exps_b, aux_buffer_up.data(), 0, nbytes);
                ggml_backend_tensor_get(l.ffn_gate_exps_b, aux_buffer_gate.data(), 0, nbytes);
                if (aux_buffer_up_gate.size() < 2*nbytes) {
                    aux_buffer_up_gate.resize(2*nbytes);
                }
                offset_up_gate = 0;
                offset_up = 0;
                // One expert's bias row in bytes (biases are 2D: {n_ff, n_expert}).
                expert_size = l.ffn_up_exps_b->nb[1];
                // NOTE(review): this loop relies on ffn_up_gate_exps_b->ne[1]
                // being the expert count — verify against how the merged bias
                // tensor is allocated in merge_up_gate_exps().
                for (int i1 = 0; i1 < (int)l.ffn_up_gate_exps_b->ne[1]; ++i1) {
                    std::memcpy(aux_buffer_up_gate.data() + offset_up_gate, aux_buffer_up.data() + offset_up, expert_size);
                    offset_up_gate += expert_size;
                    std::memcpy(aux_buffer_up_gate.data() + offset_up_gate, aux_buffer_gate.data() + offset_up, expert_size);
                    offset_up_gate += expert_size;
                    offset_up += expert_size;
                }
                ggml_backend_tensor_set(l.ffn_up_gate_exps_b, aux_buffer_up_gate.data(), 0, 2*expert_size*l.ffn_up_gate_exps_b->ne[1]);
            }
        }
    }
}
|
||||
|
||||
struct llama_context * llama_new_context_with_model(
|
||||
struct llama_model * model,
|
||||
struct llama_context_params params) {
|
||||
@@ -4748,6 +4827,8 @@ struct llama_context * llama_new_context_with_model(
|
||||
LLAMA_LOG_INFO("%s: pipeline parallelism enabled (n_copies=%d)\n", __func__, ggml_backend_sched_get_n_copies(ctx->sched));
|
||||
}
|
||||
|
||||
llama_repack_up_gate_exps(*ctx);
|
||||
|
||||
// build worst-case graph
|
||||
int n_tokens = (int)std::min(cparams.n_ctx, cparams.n_ubatch);
|
||||
int n_past = cparams.n_ctx - n_tokens;
|
||||
|
||||
Reference in New Issue
Block a user