WIP GLM4.5 - runs with wrong results

Kawrakow
2025-11-28 14:09:24 +00:00
parent f218e16e17
commit 43f644e482
5 changed files with 335 additions and 82 deletions

View File

@@ -877,12 +877,6 @@ GGML_CALL static void ggml_backend_cuda_split_buffer_set_tensor([[maybe_unused]]
auto extra = (ggml_split_tensor_t *)tensor->extra;
GGML_ASSERT(extra->n_device <= ggml_backend_cuda_get_device_count());
for (int i = 0; i < extra->n_device; ++i) {
auto split = extra->splits[i];
if (!split) continue;
//printf(" Split %d: %p, %p, %s\n", i, (void *)split->data, (void *)split->buffer, split->buffer ? ggml_backend_buffer_name(split->buffer) : "none");
}
if (extra->split_dim < 0) {
GGML_ASSERT(ggml_is_contiguous(tensor));
auto nbytes = ggml_nbytes(tensor);
@@ -904,6 +898,7 @@ GGML_CALL static void ggml_backend_cuda_split_buffer_set_tensor([[maybe_unused]]
//if (tt.row_meta_size > 0) {
// GGML_ABORT("Dim 0 copy is not implemented for tensors with row meta data\n");
//}
std::vector<char> host_buffer;
GGML_ASSERT(ggml_is_contiguous(tensor));
int nrows = ggml_nrows(tensor);
auto bs = tt.blck_size;
@@ -919,17 +914,26 @@ GGML_CALL static void ggml_backend_cuda_split_buffer_set_tensor([[maybe_unused]]
GGML_ASSERT(split->ne[0] % bs == 0);
auto source_offset = tt.row_meta_size + (ne / bs) * ts;
auto chost0 = (const char *)data;
//auto chost = (const char *)data + source_offset;
auto split_row_size = ggml_row_size(split->type, split->ne[0]);
if (host_buffer.size() < nrows*split_row_size) host_buffer.resize(nrows*split_row_size);
for (int ir = 0; ir < nrows; ++ir) {
auto dst = (char *)split->data + ir*split_row_size;
auto dst = host_buffer.data() + ir*split_row_size;
if (tt.row_meta_size > 0) {
CUDA_CHECK(cudaMemcpyAsync(dst, chost0, tt.row_meta_size, cudaMemcpyHostToDevice, cudaStreamPerThread));
memcpy(dst, chost0, tt.row_meta_size);
}
CUDA_CHECK(cudaMemcpyAsync(dst + tt.row_meta_size, chost0 + source_offset,
split_row_size - tt.row_meta_size, cudaMemcpyHostToDevice, cudaStreamPerThread));
memcpy(dst + tt.row_meta_size, chost0 + source_offset, split_row_size - tt.row_meta_size);
chost0 += row_size;
}
CUDA_CHECK(cudaMemcpyAsync(split->data, host_buffer.data(), nrows*split_row_size, cudaMemcpyHostToDevice, cudaStreamPerThread));
//for (int ir = 0; ir < nrows; ++ir) {
// auto dst = (char *)split->data + ir*split_row_size;
// if (tt.row_meta_size > 0) {
// CUDA_CHECK(cudaMemcpyAsync(dst, chost0, tt.row_meta_size, cudaMemcpyHostToDevice, cudaStreamPerThread));
// }
// CUDA_CHECK(cudaMemcpyAsync(dst + tt.row_meta_size, chost0 + source_offset,
// split_row_size - tt.row_meta_size, cudaMemcpyHostToDevice, cudaStreamPerThread));
// chost0 += row_size;
//}
ne += split->ne[0];
}
}
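The hunk above replaces the per-row cudaMemcpyAsync calls with plain memcpy into a host staging buffer, followed by a single bulk upload per split. A minimal standalone sketch of that pattern, assuming each source row consists of an optional metadata prefix plus a payload taken at a source-specific offset; the helper name and parameters are illustrative, not the actual ik_llama.cpp API:

#include <cuda_runtime.h>
#include <cstring>
#include <vector>

// Assemble all destination rows on the host, then push them to the device in
// one transfer instead of issuing one or two tiny async copies per row.
static void stage_and_upload_rows(void * device_dst, const char * src, int nrows,
                                  size_t src_row_size, size_t payload_offset,
                                  size_t meta_size, size_t dst_row_size,
                                  cudaStream_t stream) {
    std::vector<char> host_buffer(size_t(nrows) * dst_row_size);
    for (int ir = 0; ir < nrows; ++ir) {
        char * dst = host_buffer.data() + ir*dst_row_size;
        if (meta_size > 0) {
            std::memcpy(dst, src, meta_size);                                         // row metadata
        }
        std::memcpy(dst + meta_size, src + payload_offset, dst_row_size - meta_size); // row payload
        src += src_row_size;
    }
    cudaMemcpyAsync(device_dst, host_buffer.data(), host_buffer.size(),
                    cudaMemcpyHostToDevice, stream);
    // The staging buffer is local to this sketch, so wait before it goes out of scope.
    cudaStreamSynchronize(stream);
}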

View File

@@ -1054,6 +1054,134 @@ llm_expert_gating_func_type gating_op,
}
ggml_tensor * llm_build_context::llm_build_std_moe_ffn(ggml_context * ctx, llama_context & lctx,
ggml_tensor * ffn_norm,
ggml_tensor * input,
ggml_tensor * gate_inp, ggml_tensor * gate_inp_b,
ggml_tensor * up_exps, ggml_tensor * up_exps_b,
ggml_tensor * gate_exps, ggml_tensor * gate_exps_b,
ggml_tensor * down_exps, ggml_tensor * down_exps_b,
ggml_tensor * exp_probs_b,
ggml_tensor * up_shexp, ggml_tensor * up_b_shexp,
ggml_tensor * gate_shexp, ggml_tensor * gate_b_shexp,
ggml_tensor * down_shexp, ggml_tensor * down_b_shexp,
int64_t n_expert,
int64_t n_expert_used,
llm_ffn_op_type type_op,
bool norm_w,
bool scale_w,
float w_scale,
llm_expert_gating_func_type gating_op,
llm_ffn_op_type type_op_shexp,
const llm_build_cb & cb, int il, ggml_cgraph * graph) {
auto split_up_exps = (ggml_split_tensor_t *)up_exps->extra;
auto split_gate_exps = (ggml_split_tensor_t *)gate_exps->extra;
auto split_down_exps = (ggml_split_tensor_t *)down_exps->extra;
if (!split_up_exps && !split_gate_exps && !split_down_exps) {
auto cur = input;
if (ffn_norm) {
cur = llm_build_norm(ctx, input, lctx.model.hparams, ffn_norm, nullptr, LLM_NORM_RMS, cb, il);
cb(cur, "ffn_inp_normed", il);
}
auto routed_out = llm_build_moe_ffn(ctx, lctx, cur,
gate_inp, gate_inp_b,
up_exps, up_exps_b,
gate_exps, gate_exps_b,
down_exps, down_exps_b,
exp_probs_b,
n_expert, n_expert_used,
type_op, norm_w, scale_w, w_scale,
gating_op, cb, il, graph);
cb(routed_out, "routed_out", il);
if (up_shexp && gate_shexp && down_shexp) {
auto shared_out = llm_build_ffn(ctx, lctx, nullptr, cur,
up_shexp, up_b_shexp, nullptr,
gate_shexp, gate_b_shexp, nullptr,
down_shexp, down_b_shexp, nullptr,
nullptr, type_op_shexp, LLM_FFN_PAR, cb, il);
cb(shared_out, "ffn_shexp_out", il);
cur = ggml_add(ctx, routed_out, shared_out);
cb(cur, "ffn_out", il);
} else {
cur = routed_out;
}
return cur;
}
GGML_ASSERT(split_up_exps && split_gate_exps && split_down_exps);
GGML_ASSERT(split_up_exps->n_device == split_gate_exps->n_device && split_up_exps->n_device == split_down_exps->n_device);
std::vector<ggml_tensor *> results(split_up_exps->n_device);
auto split_up_shexp = up_shexp ? (ggml_split_tensor_t *)up_shexp->extra : nullptr;
auto split_gate_shexp = gate_shexp ? (ggml_split_tensor_t *)gate_shexp->extra : nullptr;
auto split_down_shexp = down_shexp ? (ggml_split_tensor_t *)down_shexp->extra : nullptr;
GGML_ASSERT((!split_up_shexp && !split_gate_shexp && !split_down_shexp) ||
( split_up_shexp && split_gate_shexp && split_down_shexp));
auto split_gate_inp = (ggml_split_tensor_t *)gate_inp->extra;
GGML_ASSERT(split_gate_inp && split_gate_inp->n_device == split_up_exps->n_device);
auto split_exp_probs_b = exp_probs_b ? (ggml_split_tensor_t *)exp_probs_b->extra : nullptr;
GGML_ASSERT(!split_exp_probs_b || split_exp_probs_b->n_device == split_up_exps->n_device);
for (int id = 0; id < split_up_exps->n_device; ++id) {
int il_cb = 1000*(id + 1) + il;
auto cur = input;
if (ffn_norm) {
auto split_ffn_norm = (ggml_split_tensor_t *)ffn_norm->extra;
GGML_ASSERT(split_ffn_norm && split_ffn_norm->n_device == split_up_exps->n_device);
cur = llm_build_norm(ctx, input, lctx.model.hparams, split_ffn_norm->splits[id], nullptr, LLM_NORM_RMS, cb, il);
cb(cur, "ffn_inp_normed", il_cb);
}
else if (cur->type != GGML_TYPE_F32) {
cur = ggml_cast(ctx, cur, GGML_TYPE_F32);
}
auto routed_out = llm_build_moe_ffn(ctx, lctx, cur,
split_gate_inp->splits[id], gate_inp_b,
split_up_exps->splits[id], up_exps_b,
split_gate_exps->splits[id], gate_exps_b,
split_down_exps->splits[id], down_exps_b,
split_exp_probs_b ? split_exp_probs_b->splits[id] : nullptr,
n_expert, n_expert_used,
type_op, norm_w, scale_w, w_scale,
gating_op, cb, il, graph);
cb(routed_out, "routed_out", il_cb);
if (split_up_shexp) {
auto split_up_b_shexp = up_b_shexp ? (ggml_split_tensor_t *)up_b_shexp->extra : nullptr;
auto split_gate_b_shexp = gate_b_shexp ? (ggml_split_tensor_t *)gate_b_shexp->extra : nullptr;
auto split_down_b_shexp = down_b_shexp ? (ggml_split_tensor_t *)down_b_shexp->extra : nullptr;
GGML_ASSERT(!split_up_b_shexp || split_up_b_shexp->n_device == split_up_exps->n_device);
GGML_ASSERT(!split_gate_b_shexp || split_gate_b_shexp->n_device == split_up_exps->n_device);
GGML_ASSERT(!split_down_b_shexp || split_down_b_shexp->n_device == split_up_exps->n_device);
auto shared_out = llm_build_ffn(ctx, lctx, nullptr, cur,
split_up_shexp->splits[id], split_up_b_shexp ? split_up_b_shexp->splits[id] : nullptr, nullptr,
split_gate_shexp->splits[id], split_gate_b_shexp ? split_gate_b_shexp->splits[id] : nullptr, nullptr,
split_down_shexp->splits[id], split_down_b_shexp ? split_down_b_shexp->splits[id] : nullptr, nullptr,
nullptr, type_op_shexp, LLM_FFN_PAR, cb, il);
cb(shared_out, "ffn_shexp_out", il_cb);
cur = ggml_add(ctx, routed_out, shared_out);
cb(cur, "ffn_out", il_cb);
} else {
cur = routed_out;
}
if (cur->ne[1] >= 32) {
cur = ggml_cast(ctx, cur, GGML_TYPE_F16);
cb(cur, "ffn_out_f16", il_cb);
}
results[id] = cur;
}
if (results.size() == 1) return results.front();
auto cur = ggml_add(ctx, results[0], results[1]);
cur->op_params[0] = 0xff;
cb(cur, "ffn_combined", il);
for (int id = 2; id < int(results.size()); ++id) {
cur = ggml_add(ctx, cur, results[id]);
cb(cur, "ffn_combined", il);
}
return cur;
}
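Note on the combination step above: the expert tensors are sliced along their intermediate (FFN) dimension (split dim 1 for ffn_up/gate_exps and dim 0 for ffn_down_exps, per the prepare_split_tensors calls later in this commit), so after its own down projection every device holds a full-width partial activation. That is why the per-device results are reduced with ggml_add rather than concatenated. A toy, ggml-free illustration of the data flow, with purely illustrative names:

#include <cassert>
#include <vector>

// Each device returns a full-width partial output computed from its slice of
// the experts' FFN dimension; the partials are reduced with an element-wise sum.
static std::vector<float> combine_partials(const std::vector<std::vector<float>> & per_device) {
    assert(!per_device.empty());
    std::vector<float> out = per_device[0];
    for (size_t id = 1; id < per_device.size(); ++id) {
        assert(per_device[id].size() == out.size());
        for (size_t i = 0; i < out.size(); ++i) out[i] += per_device[id][i];
    }
    return out;
}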
static ggml_tensor * llm_build_kqv(
struct ggml_context * ctx,
struct llama_context & lctx,
@@ -1437,6 +1565,29 @@ std::tuple<ggml_tensor*, ggml_tensor*, ggml_tensor*> llm_build_context::llm_buil
return {Qcur, Kcur, Vcur};
}
static ggml_tensor * build_output(llama_context & lctx, ggml_context * ctx, ggml_tensor * cur, ggml_tensor * output, const llm_build_cb & cb) {
// lm_head
if (output->extra) {
auto split_output = (ggml_split_tensor_t *)output->extra;
std::vector<ggml_tensor *> o;
o.reserve(split_output->n_device);
for (int id = 0; id < split_output->n_device; ++id) {
auto split = split_output->splits[id];
if (!split) continue;
o.push_back(llm_build_context::llm_build_lora_mm(lctx, ctx, split, cur));
cb(o.back(), "output", id);
}
if (o.size() == 1) return o.front();
cur = ggml_concat(ctx, o[0], o[1], 0);
for (int id = 2; id < int(o.size()); ++id) {
cur = ggml_concat(ctx, cur, o[id], 0);
}
} else {
cur = llm_build_context::llm_build_lora_mm(lctx, ctx, output, cur);
}
return cur;
}
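build_output() handles an lm_head whose rows are sharded across devices: each shard yields logits for its own slice of the vocabulary, and the slices are concatenated along dim 0 to rebuild the full logit vector (unlike the attention/FFN partials, which are summed). A toy, ggml-free illustration with illustrative names:

#include <vector>

// Each device contributes the logits for its contiguous slice of the
// vocabulary; concatenation, not summation, restores the full logit vector.
static std::vector<float> concat_vocab_shards(const std::vector<std::vector<float>> & shards) {
    std::vector<float> logits;
    for (const auto & s : shards) logits.insert(logits.end(), s.begin(), s.end());
    return logits;
}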
ggml_cgraph * llm_build_context::build_llama() {
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false);
@@ -1630,24 +1781,7 @@ ggml_cgraph * llm_build_context::build_llama() {
cb(cur, "result_norm", -1);
// lm_head
if (model.output->extra) {
auto output = (ggml_split_tensor_t *)model.output->extra;
std::vector<ggml_tensor *> o;
o.reserve(output->n_device);
for (int id = 0; id < output->n_device; ++id) {
auto split = output->splits[id];
if (!split) continue;
o.push_back(llm_build_lora_mm(lctx, ctx0, split, cur));
cb(o.back(), "output", id);
}
if (o.size() == 1) cur = o.front();
cur = ggml_concat(ctx0, o[0], o[1], 0);
for (int id = 2; id < int(o.size()); ++id) {
cur = ggml_concat(ctx0, cur, o[id], 0);
}
} else {
cur = llm_build_lora_mm(lctx, ctx0, model.output, cur);
}
cur = build_output(lctx, ctx0, cur, model.output, cb);
// For Granite architecture
if (hparams.f_logit_scale) {
@@ -6469,22 +6603,26 @@ ggml_cgraph * llm_build_context::build_glm4_moe() {
// output token IDs (for last layer cropping)
struct ggml_tensor * inp_out_ids = build_inp_out_ids();
auto rope_cache = cparams.rope_cache && (rope_type == LLAMA_ROPE_TYPE_NEOX || rope_type == LLAMA_ROPE_TYPE_NORM) ?
auto rope_cache = model.split_mode != LLAMA_SPLIT_MODE_GRAPH && cparams.rope_cache && (rope_type == LLAMA_ROPE_TYPE_NEOX || rope_type == LLAMA_ROPE_TYPE_NORM) ?
ggml_rope_cache(ctx0, inp_pos, nullptr, n_embd_head, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
ext_factor, attn_factor, beta_fast, beta_slow) : nullptr;
float kq_scale = 1.0f/sqrtf(float(n_embd_head));
// Only process up to last layer (skip final NextN layer)
// Final layer tensors are loaded but not processed in forward pass
const int n_transformer_layers = n_layer - hparams.nextn_predict_layers;
for (int il = 0; il < n_transformer_layers; ++il) {
struct ggml_tensor * inpSA = inpL;
// Pre-attention norm
cur = llm_build_norm(ctx0, inpL, hparams, model.layers[il].attn_norm, NULL, LLM_NORM_RMS, cb, il);
cb(cur, "attn_norm", il);
// self-attention
{
if (rope_cache == nullptr) {
cur = build_std_attention(gf, inpL, inp_pos, nullptr, KQ_mask, nullptr, kq_scale, 0.0f, 0, il);
} else {
// Pre-attention norm
cur = llm_build_norm(ctx0, inpL, hparams, model.layers[il].attn_norm, NULL, LLM_NORM_RMS, cb, il);
cb(cur, "attn_norm", il);
auto [Qcur, Kcur, Vcur] = llm_build_mul_mat_qkv(gf, cur,
model.layers[il].wqkv, model.layers[il].bqkv,
model.layers[il].wqk, model.layers[il].bqk,
@@ -6526,46 +6664,57 @@ ggml_cgraph * llm_build_context::build_glm4_moe() {
struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
cb(ffn_inp, "ffn_inp", il);
// Post-attention norm
cur = llm_build_norm(ctx0, ffn_inp, hparams, model.layers[il].attn_post_norm, NULL, LLM_NORM_RMS, cb, il);
cb(cur, "post_attn_norm", il);
if ((uint32_t) il < hparams.n_layer_dense_lead) {
// dense FFN
cur = llm_build_ffn(ctx0, lctx, nullptr, cur,
cur = llm_build_ffn(ctx0, lctx, model.layers[il].ffn_norm, ffn_inp,
model.layers[il].ffn_up, NULL, NULL,
model.layers[il].ffn_gate, NULL, NULL,
model.layers[il].ffn_down, NULL, NULL,
NULL,
LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
LLM_FFN_SILU, LLM_FFN_PAR, cb, il, gf);
cb(cur, "ffn_out", il);
} else {
// MoE FFN
struct ggml_tensor * routed_out = llm_build_moe_ffn(ctx0, lctx, cur,
model.layers[il].ffn_gate_inp,
model.layers[il].ffn_up_exps,
model.layers[il].ffn_gate_exps,
model.layers[il].ffn_down_exps,
cur = llm_build_std_moe_ffn(ctx0, lctx, model.layers[il].ffn_norm, ffn_inp,
model.layers[il].ffn_gate_inp, model.layers[il].ffn_gate_inp_b,
model.layers[il].ffn_up_exps, model.layers[il].ffn_up_exps_b,
model.layers[il].ffn_gate_exps, model.layers[il].ffn_gate_exps_b,
model.layers[il].ffn_down_exps, model.layers[il].ffn_down_exps_b,
model.layers[il].ffn_exp_probs_b,
model.layers[il].ffn_up_shexp, nullptr, // we don't have shared expert biases?
model.layers[il].ffn_gate_shexp, nullptr,
model.layers[il].ffn_down_shexp, nullptr,
n_expert, n_expert_used,
LLM_FFN_SILU, hparams.expert_weights_norm,
true, hparams.expert_weights_scale,
(enum llm_expert_gating_func_type) hparams.expert_gating_func,
cb, il, gf);
cb(routed_out, "routed_out", il);
LLM_FFN_SILU, hparams.expert_weights_norm, true, hparams.expert_weights_scale,
(llm_expert_gating_func_type) hparams.expert_gating_func,
LLM_FFN_SILU, cb, il, gf);
{
struct ggml_tensor * shared_out = llm_build_ffn(ctx0, lctx, nullptr, cur,
model.layers[il].ffn_up_shexp, NULL, NULL,
model.layers[il].ffn_gate_shexp, NULL, NULL,
model.layers[il].ffn_down_shexp, NULL, NULL,
NULL,
LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
cb(shared_out, "ffn_shexp_out", il);
//// Post-attention norm
//cur = llm_build_norm(ctx0, ffn_inp, hparams, model.layers[il].ffn_norm, NULL, LLM_NORM_RMS, cb, il);
//cb(cur, "post_attn_norm", il);
//// MoE FFN
//auto routed_out = llm_build_moe_ffn(ctx0, lctx, cur,
// model.layers[il].ffn_gate_inp,
// model.layers[il].ffn_up_exps,
// model.layers[il].ffn_gate_exps,
// model.layers[il].ffn_down_exps,
// model.layers[il].ffn_exp_probs_b,
// n_expert, n_expert_used,
// LLM_FFN_SILU, hparams.expert_weights_norm,
// true, hparams.expert_weights_scale,
// (enum llm_expert_gating_func_type) hparams.expert_gating_func,
// cb, il, gf);
//cb(routed_out, "routed_out", il);
cur = ggml_add(ctx0, routed_out, shared_out);
cb(cur, "ffn_out", il);
}
//auto shared_out = llm_build_ffn(ctx0, lctx, nullptr, cur,
// model.layers[il].ffn_up_shexp, NULL, NULL,
// model.layers[il].ffn_gate_shexp, NULL, NULL,
// model.layers[il].ffn_down_shexp, NULL, NULL,
// NULL,
// LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
//cb(shared_out, "ffn_shexp_out", il);
//cur = ggml_add(ctx0, routed_out, shared_out);
//cb(cur, "ffn_out", il);
}
// residual and context vector
@@ -6584,7 +6733,7 @@ ggml_cgraph * llm_build_context::build_glm4_moe() {
cb(cur, "result_norm", -1);
// lm head
cur = llm_build_lora_mm(lctx, ctx0, model.output, cur);
cur = build_output(lctx, ctx0, cur, model.output, cb);
cb(cur, "result_output", -1);
ggml_build_forward_expand(gf, cur);
@@ -8984,6 +9133,23 @@ ggml_tensor * llm_build_context::build_std_attention(ggml_cgraph * gf, ggml_tens
auto kl = (ggml_split_tensor_t *)kv_self.k_l[il]->extra;
auto vl = (ggml_split_tensor_t *)kv_self.v_l[il]->extra;
GGML_ASSERT(wq->n_device == kl->n_device && wq->n_device == vl->n_device);
ggml_split_tensor_t *bq = nullptr, *bo = nullptr, *bk = nullptr, *bv = nullptr;
if (model.layers[il].bq && model.layers[il].bq->extra) {
bq = (ggml_split_tensor_t *)model.layers[il].bq->extra;
GGML_ASSERT(bq->n_device == wq->n_device);
}
if (model.layers[il].bo && model.layers[il].bo->extra) {
bo = (ggml_split_tensor_t *)model.layers[il].bo->extra;
GGML_ASSERT(bo->n_device == wq->n_device);
}
if (model.layers[il].bk && model.layers[il].bk->extra) {
bk = (ggml_split_tensor_t *)model.layers[il].bk->extra;
GGML_ASSERT(bk->n_device == wq->n_device);
}
if (model.layers[il].bv && model.layers[il].bv->extra) {
bv = (ggml_split_tensor_t *)model.layers[il].bv->extra;
GGML_ASSERT(bv->n_device == wq->n_device);
}
std::vector<ggml_tensor*> attn; attn.reserve(wq->n_device);
for (int id = 0; id < wq->n_device; ++id) {
int il_cb = 1000*(id+1) + il;
@@ -9006,7 +9172,9 @@ ggml_tensor * llm_build_context::build_std_attention(ggml_cgraph * gf, ggml_tens
cur = ggml_cast(ctx0, cur, GGML_TYPE_F32);
}
auto [Qcur, Kcur, Vcur] = llm_build_mul_mat_qkv(gf, cur, nullptr, nullptr, nullptr, nullptr,
split_wq, nullptr, split_wk, nullptr, split_wv, nullptr,
split_wq, bq ? bq->splits[id] : nullptr,
split_wk, bk ? bk->splits[id] : nullptr,
split_wv, bv ? bv->splits[id] : nullptr,
model.layers[il].attn_q_norm, model.layers[il].attn_k_norm, f_attn_scale, il_cb);
auto rope_factors = rope_factors_in;
if (!rope_factors && model.layers[il].rope_freqs && model.layers[il].rope_freqs->extra) {
@@ -9101,11 +9269,14 @@ ggml_tensor * llm_build_context::build_std_attention(ggml_cgraph * gf, ggml_tens
ggml_mul_mat_set_prec(cur, GGML_PREC_F32);
}
cb(cur, "kqv_wo", il_cb);
if (bo) {
cur = ggml_add(ctx0, cur, bo->splits[id]);
cb(cur, "kqv_wo_biased", il_cb);
}
if (cur->ne[1] >= 32) {
cur = ggml_cast(ctx0, cur, GGML_TYPE_F16);
}
ggml_build_forward_expand(gf, cur);
// TODO: wo_b
attn.push_back(cur);
}
if (attn.size() == 1) return attn.front();
@@ -9116,9 +9287,10 @@ ggml_tensor * llm_build_context::build_std_attention(ggml_cgraph * gf, ggml_tens
cur = ggml_add(ctx0, cur, attn[id]);
cb(cur, "combine_attn", il);
}
if (attn.size() > 2) {
cur->op_params[0] = 0xff;
}
// TODO: for more than 2 GPUs, do we need to add another forced graph split?
//if (attn.size() > 2) {
// cur->op_params[0] = 0xff;
//}
return cur;
}
}

View File

@@ -375,6 +375,27 @@ llm_expert_gating_func_type gating_op,
gating_op, cb, il, graph);
}
static ggml_tensor * llm_build_std_moe_ffn(ggml_context * ctx, llama_context & lctx,
ggml_tensor * ffn_norm,
ggml_tensor * input,
ggml_tensor * gate_inp, ggml_tensor * gate_inp_b,
ggml_tensor * up_exps, ggml_tensor * up_exps_b,
ggml_tensor * gate_exps, ggml_tensor * gate_exps_b,
ggml_tensor * down_exps, ggml_tensor * down_exps_b,
ggml_tensor * exp_probs_b,
ggml_tensor * up_shexp, ggml_tensor * up_b_shexp,
ggml_tensor * gate_shexp, ggml_tensor * gate_b_shexp,
ggml_tensor * down_shexp, ggml_tensor * down_b_shexp,
int64_t n_expert,
int64_t n_expert_used,
llm_ffn_op_type type_op,
bool norm_w,
bool scale_w,
float w_scale,
llm_expert_gating_func_type gating_op,
llm_ffn_op_type type_op_shexp,
const llm_build_cb & cb, int il, ggml_cgraph * graph);
static ggml_cgraph * llama_build_graph_defrag(llama_context & lctx, const std::vector<uint32_t> & ids);
static ggml_cgraph * llama_build_graph_k_shift(llama_context & lctx);

View File

@@ -202,7 +202,7 @@ create_tensors_helper::create_tensors_helper(llama_model_loader & _ml, llama_mod
if (model.splits.size() > 1) {
ctx_size += ggml_tensor_overhead()*n_layer*4; // for KV cache
ctx_size *= model.splits.size();
ctx_size *= (model.splits.size() + 1);
}
for (auto & it : buft_layer_count) {
@@ -1852,7 +1852,7 @@ bool create_tensors_helper::create_glm4_moe_tensors(const LLM_TN & tn) {
GGML_ASSERT(hparams.n_expert > 0 && "n_expert must be > 0 for GLM4_MOE MoE layers");
GGML_ASSERT(hparams.n_expert_used > 0 && "n_expert_used must be > 0 for GLM4_MOE MoE layers");
create_embd_output(tn, n_embd, n_vocab);
create_embd_output(tn, n_embd, n_vocab, true, true);
for (int i = 0; i < n_layer; ++i) {
ggml_context * ctx_layer = ctx_for_layer(i);
@@ -1866,7 +1866,7 @@ bool create_tensors_helper::create_glm4_moe_tensors(const LLM_TN & tn) {
auto & layer = model.layers[i];
layer.attn_norm = create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, flags);
layer.attn_norm = create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, flags);
// GLM-style attention with bias terms
if (!flags) {
@@ -1888,7 +1888,10 @@ bool create_tensors_helper::create_glm4_moe_tensors(const LLM_TN & tn) {
layer.attn_k_norm = create_tensor(ctx_layer,
tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), { n_embd_head_k }, llama_model_loader::TENSOR_NOT_REQUIRED | flags);
layer.attn_post_norm = create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_POST_NORM, "weight", i), { n_embd }, flags);
// Why are we adding an additional tensor type?
// attn_post_norm is the exact same thing as ffn_norm
//layer.attn_post_norm = create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_POST_NORM, "weight", i), { n_embd }, flags);
layer.ffn_norm = create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_POST_NORM, "weight", i), { n_embd }, flags);
// Check if this layer uses MoE or dense FFN based on n_layer_dense_lead
// GLM 4.5 uses hybrid architecture: layer 0 is dense, layers 1+ are MoE
@@ -1896,9 +1899,9 @@ bool create_tensors_helper::create_glm4_moe_tensors(const LLM_TN & tn) {
if (use_moe) {
// MoE layers
layer.ffn_gate_inp = create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), { n_embd, n_expert }, flags);
layer.ffn_gate_inp = create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), { n_embd, n_expert }, flags);
// gate bias
layer.ffn_exp_probs_b = create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_EXP_PROBS_B, "bias", i), { n_expert }, flags);
layer.ffn_exp_probs_b = create_tensor(ctx_split, tn(LLM_TENSOR_FFN_EXP_PROBS_B, "bias", i), { n_expert }, flags);
// MoE branch
const int64_t n_ff_exp = hparams.n_ff_exp ? hparams.n_ff_exp : n_ff / n_expert_used;
@@ -2747,9 +2750,9 @@ bool create_tensors_helper::merge_qkv(const LLM_TN & tn, int i, int bias, bool i
layer.wv = create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa});
if (bias) {
auto flags = bias == 1 ? llama_model_loader::TENSOR_NOT_REQUIRED : 0;
layer.bq = create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q, "bias", i), {layer.wq->ne[1]}, flags);
layer.bk = create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K, "bias", i), {layer.wk->ne[1]}, flags);
layer.bv = create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_V, "bias", i), {layer.wv->ne[1]}, flags);
layer.bq = create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q, "bias", i), {layer.wq->ne[1]}, flags);
layer.bk = create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_K, "bias", i), {layer.wk->ne[1]}, flags);
layer.bv = create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_V, "bias", i), {layer.wv->ne[1]}, flags);
}
}
@@ -2801,6 +2804,8 @@ static void prepare_split_tensors(int split_dim, ggml_context * ctx, ggml_tensor
GGML_ASSERT(mem_used.size() >= splits.size());
for (int i = 0; i < split_tensor.ggml.n_device; ++i) {
if (split_tensor.ggml.splits[i]) {
//auto nbytes = ggml_nbytes(split_tensor.ggml.splits[i]);
//printf("mem_used(%s): %8.2f, total: %8.2f\n", split_tensor.ggml.splits[i]->name, nbytes/1024./1024., (mem_used[i] + nbytes)/1024./1024.);
mem_used[i] += ggml_nbytes(split_tensor.ggml.splits[i]);
}
}
@@ -2959,16 +2964,34 @@ bool create_tensors_helper::create_tensors() {
auto split = create_split(layer.wo->ne[0], attn_granularity, model.splits);
prepare_split_tensors(0, ctx_split, layer.wo, layer.split_wo, split, mem_used);
prepare_split_tensors(1, ctx_split, layer.wq, layer.split_wq, split, mem_used);
if (layer.bo) {
prepare_split_tensors(-1, ctx_split, layer.bo, layer.split_bo, split, mem_used);
}
if (layer.bq) {
prepare_split_tensors(0, ctx_split, layer.bq, layer.split_bq, split, mem_used);
}
for (auto & s : split) s /= gqa_ratio;
prepare_split_tensors(1, ctx_split, layer.wk, layer.split_wk, split, mem_used);
prepare_split_tensors(1, ctx_split, layer.wv, layer.split_wv, split, mem_used);
if (layer.bk) {
prepare_split_tensors(0, ctx_split, layer.bk, layer.split_bk, split, mem_used);
}
if (layer.bv) {
prepare_split_tensors(0, ctx_split, layer.bv, layer.split_bv, split, mem_used);
}
}
if (layer.ffn_norm) {
auto split = create_split(ggml_nrows(layer.ffn_norm), -1, model.splits);
prepare_split_tensors(-1, ctx_split, layer.ffn_norm, layer.split_ffn_norm, split, mem_used);
printf("Created splits for %s\n", layer.ffn_norm->name);
auto splits = (ggml_split_tensor_t *)layer.ffn_norm->extra;
if (!splits) {
printf("Oops: null extra?\n"); exit(1);
}
}
if (layer.ffn_down && layer.ffn_up && layer.ffn_gate) {
if (layer.ffn_norm) {
auto split = create_split(ggml_nrows(layer.ffn_norm), -1, model.splits);
prepare_split_tensors(-1, ctx_split, layer.ffn_norm, layer.split_ffn_norm, split, mem_used);
}
int ffn_granularity = 16;
if (ggml_is_quantized(layer.ffn_down->type)) {
auto tt = ggml_internal_get_type_traits(layer.ffn_down->type);
@@ -3002,6 +3025,27 @@ bool create_tensors_helper::create_tensors() {
prepare_split_tensors(0, ctx_split, layer.ffn_down_exps, layer.split_ffn_down_exps, split, mem_used);
prepare_split_tensors(1, ctx_split, layer.ffn_up_exps, layer.split_ffn_up_exps, split, mem_used);
prepare_split_tensors(1, ctx_split, layer.ffn_gate_exps, layer.split_ffn_gate_exps, split, mem_used);
//printf("=== Layer %d routed experts, %s, %s, %s:\n", il, ggml_type_name(layer.ffn_down_exps->type), ggml_type_name(layer.ffn_gate_exps->type), ggml_type_name(layer.ffn_up_exps->type));
//printf("mem_used:"); for (auto mem : mem_used) printf(" %8.2f", mem/1024./1024.);
//printf(" MiB\n");
//printf(" down:");
//for (auto split : layer.split_ffn_down_exps.tensor_splits) printf(" %ldx%ldx%ld", split->ne[0], split->ne[1], split->ne[2]);
//printf("\n");
//printf(" gate:");
//for (auto split : layer.split_ffn_gate_exps.tensor_splits) printf(" %ldx%ldx%ld", split->ne[0], split->ne[1], split->ne[2]);
//printf("\n");
//printf(" up:");
//for (auto split : layer.split_ffn_up_exps.tensor_splits) printf(" %ldx%ldx%ld", split->ne[0], split->ne[1], split->ne[2]);
//printf("\n");
if (layer.ffn_gate_inp) {
auto shared_split = create_split(ggml_nrows(layer.ffn_gate_inp), -1, model.splits);
prepare_split_tensors(-1, ctx_split, layer.ffn_gate_inp, layer.split_ffn_gate_inp, shared_split, mem_used);
}
if (layer.ffn_exp_probs_b) {
auto shared_split = create_split(ggml_nrows(layer.ffn_exp_probs_b), -1, model.splits);
prepare_split_tensors(-1, ctx_split, layer.ffn_exp_probs_b, layer.split_ffn_exp_probs_b, shared_split, mem_used);
}
}
}
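A detail worth spelling out from the attention splits earlier in this file is the "for (auto & s : split) s /= gqa_ratio;" step: the per-device row counts chosen for wq are divided by the GQA ratio so that wk/wv keep whole KV-head groups on the same device as their query heads. A small worked example with made-up head counts chosen so the arithmetic divides evenly:

#include <cstdio>
#include <vector>

int main() {
    const long n_embd_head = 128, n_head = 32, n_head_kv = 8;    // hypothetical numbers
    const long gqa_ratio   = n_head / n_head_kv;                 // 4 query heads per KV head
    std::vector<long> split = { 24*n_embd_head, 8*n_embd_head }; // wq output rows per device
    for (auto & s : split) s /= gqa_ratio;                       // matching wk/wv rows per device
    for (auto s : split) std::printf("%ld\n", s);                // 768 and 256, i.e. 6 and 2 KV heads
    return 0;
}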

View File

@@ -232,6 +232,7 @@ struct llama_layer {
struct ggml_tensor * ffn_down_exps = nullptr;
struct ggml_tensor * ffn_up_exps = nullptr;
llama_split_tensor split_ffn_gate_inp;
llama_split_tensor split_ffn_up_exps;
llama_split_tensor split_ffn_gate_exps;
llama_split_tensor split_ffn_down_exps;
@@ -255,6 +256,11 @@ struct llama_layer {
llama_split_tensor split_ffn_gate_shexp;
llama_split_tensor split_ffn_down_shexp;
llama_split_tensor split_ffn_gate_inp_b;
llama_split_tensor split_ffn_gate_exps_b;
llama_split_tensor split_ffn_down_exps_b;
llama_split_tensor split_ffn_up_exps_b;
// ff bias
struct ggml_tensor * ffn_gate_b = nullptr;
struct ggml_tensor * ffn_down_b = nullptr; // b2
@@ -262,6 +268,12 @@ struct llama_layer {
struct ggml_tensor * ffn_act = nullptr;
struct ggml_tensor * ffn_exp_probs_b = nullptr;
llama_split_tensor split_ffn_gate_b;
llama_split_tensor split_ffn_down_b;
llama_split_tensor split_ffn_up_b;
llama_split_tensor split_ffn_act;
llama_split_tensor split_ffn_exp_probs_b;
// mamba proj
struct ggml_tensor * ssm_in = nullptr;
struct ggml_tensor * ssm_x = nullptr;
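For reference when reading the new split_* members above: this commit never shows the definitions of llama_split_tensor or ggml_split_tensor_t, but the accesses that do appear (split_tensor.ggml.n_device, split_tensor.ggml.splits[i], extra->split_dim, layer.split_ffn_down_exps.tensor_splits) suggest roughly the following shape. This is a reader's reconstruction under those assumptions, not the actual definitions:

#include <vector>
struct ggml_tensor; // from ggml.h

// Reconstructed from usage in this commit only; the real structs may differ.
struct ggml_split_tensor_t {
    int                   n_device  = 0;       // number of devices participating in the split
    int                   split_dim = -1;      // < 0: replicated per device, 0/1: sliced along that dim
    struct ggml_tensor ** splits    = nullptr; // per-device tensor for each slice (nullptr if absent)
};

struct llama_split_tensor {
    ggml_split_tensor_t                ggml;          // attached to the parent tensor via tensor->extra
    std::vector<struct ggml_tensor *>  tensor_splits; // the per-device tensors created at load time
};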