Fix graph parallel when ngl < n_layers (#1241)

* Fix graph parallel when ngl < n_layers

* Fix using ffn_norm

When using graph parallel with ngl < n_layers, the ffn_norm tensor may end up
being split across the GPUs while the ffn tensors stay on the CPU. In that case
we get a crash because we attempt to use the unsplit ffn_norm buffer, which is
invalid. This commit fixes that by using the main-GPU split of ffn_norm when
one exists (see the sketch below).

* Cleanup
Authored by Kawrakow on 2026-02-06 11:48:24 +02:00, committed by GitHub
parent 4d86907b18
commit c5d74f66e2
2 changed files with 68 additions and 62 deletions
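
For illustration only, a minimal sketch of the selection logic the fix applies
(the helper name pick_norm_weight is hypothetical; ggml_tensor,
ggml_split_tensor_t and llama_model are the repository's own types): when a
normalization weight carries split metadata in its extra field, the piece
residing on the main GPU is used; otherwise the tensor is used as-is.

    // Hypothetical helper illustrating the fix: prefer the main-GPU split of a
    // normalization weight when the tensor was split for graph parallelism.
    static ggml_tensor * pick_norm_weight(const llama_model & model, ggml_tensor * ffn_norm) {
        if (ffn_norm && ffn_norm->extra) {
            // The weight was split across devices; the per-device pieces live
            // in the splits[] array of the ggml_split_tensor_t stored in extra.
            auto * split = (ggml_split_tensor_t *) ffn_norm->extra;
            return split->splits[model.main_gpu];
        }
        // Not split (e.g. the layer's ffn stayed on the CPU): use the weight as-is.
        return ffn_norm;
    }

In llm_build_ffn the same check is done inline, right before calling
llm_build_norm, as the first hunk below shows.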


@@ -744,8 +744,12 @@ ggml_tensor * llm_build_context::llm_build_ffn(
}
auto cur = input;
//if (input->op == GGML_OP_REDUCE) {
// if (input->src[lctx.model.main_gpu]) cur = input->src[lctx.model.main_gpu];
//}
if (ffn_norm) {
cur = llm_build_norm(ctx, cur, lctx.model.hparams, ffn_norm, NULL, is_norm ? LLM_NORM : LLM_NORM_RMS, cb, il);
auto the_ffn_norm = ffn_norm->extra ? ((ggml_split_tensor_t *)ffn_norm->extra)->splits[lctx.model.main_gpu] : ffn_norm;
cur = llm_build_norm(ctx, cur, lctx.model.hparams, the_ffn_norm, NULL, is_norm ? LLM_NORM : LLM_NORM_RMS, cb, il);
cb(input, "ffn_norm", il);
}
if (cur->type != GGML_TYPE_F32) {


@@ -570,9 +570,11 @@ bool llama_context::update_cache_copies() {
int n_layer = model.hparams.n_layer - model.hparams.nextn_predict_layers; //cache_copies.size()/2;
if ((int)kv_self.k_l.size() != n_layer) return false;
if (!(kv_self.v_l.empty() || (int)kv_self.v_l.size() == n_layer)) return false;
if ((model.split_mode == LLAMA_SPLIT_MODE_GRAPH || model.split_mode == LLAMA_SPLIT_MODE_ATTN) && model.splits.size() > 1) {
for (int il = 0; il < n_layer; ++il) {
auto kl = (ggml_split_tensor_t *)kv_self.k_l[il]->extra;
if (kl) {
GGML_ASSERT(model.split_mode == LLAMA_SPLIT_MODE_GRAPH || model.split_mode == LLAMA_SPLIT_MODE_ATTN);
GGML_ASSERT(model.splits.size() > 1);
auto vl = !kv_self.v_l.empty() && kv_self.v_l[il] ? (ggml_split_tensor_t *)kv_self.v_l[il]->extra : nullptr;
GGML_ASSERT(kl && (!kv_self.v_l[il] || vl));
if (vl) {
@@ -593,7 +595,6 @@ bool llama_context::update_cache_copies() {
c.cpy->src[1]->data = (char *)vl->splits[id]->data + c.cpy->view_offs;
c.cpy->data = c.cpy->src[1]->data;
}
}
} else {
for (int il = 0; il < n_layer; ++il) {
auto& c = cache_copies[2*il+0];
@@ -611,6 +612,7 @@ bool llama_context::update_cache_copies() {
c.cpy->data = c.cpy->src[1]->data;
}
}
}
return true;
}
@@ -781,6 +783,13 @@ static bool llama_kv_cache_init(
n_mla++;
}
else {
bool split_cache_i = split_cache;
auto K = model.layers[i].wk;
auto V = model.layers[i].wv;
if (split_cache && (!K || !V || !K->extra || !V->extra)) {
ctx = offload ? ctx_map.at(model.buft_layer[i].buft) : cache.ctxs.front();
split_cache_i = false;
}
int n_embd_head_v = hparams.n_embd_head_v;
k = ggml_new_tensor_2d(ctx, type_k, n_embd_head_k, n_head_kv*kv_size);
v = ggml_new_tensor_1d(ctx, type_v, n_embd_v_gqa*kv_size);
@@ -792,10 +801,7 @@ static bool llama_kv_cache_init(
//ggml_format_name(v, "cache_v_l%d", i);
cache.k_l.push_back(k);
cache.v_l.push_back(v);
if (split_cache) {
auto K = model.layers[i].wk;
auto V = model.layers[i].wv;
if (K && V && K->extra && V->extra) {
if (split_cache_i) {
bool use_V_for_K = model.layers[i].attn_k_norm && model.layers[i].attn_k_norm->ne[0] == K->ne[1] ? true : false;
auto extra_K = (const ggml_split_tensor_t *)K->extra;
auto extra_V = (const ggml_split_tensor_t *)V->extra;
@@ -833,10 +839,6 @@ static bool llama_kv_cache_init(
k->extra = (void *)&split_k_l.ggml;
v->extra = (void *)&split_v_l.ggml;
}
//} else {
// printf("Oops: don't have yet K and V for layer %d\n", i);
//}
}
}
}
if (model.arch == LLM_ARCH_DEEPSEEK2 && cparams.mla_attn && n_mla < n_layer && n_mla > 0) {
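
As a reading aid, a condensed restatement of the per-layer check introduced in
llama_kv_cache_init above (the helper name layer_kv_cache_is_split is
hypothetical): a layer's KV cache is only created as a split tensor when that
layer's wk/wv weights were themselves split; otherwise the cache falls back to
a regular per-layer buffer.

    // Hypothetical helper mirroring the per-layer check above: the KV cache of
    // layer il is split across devices only if its wk/wv weights carry split
    // metadata in their extra fields.
    static bool layer_kv_cache_is_split(const llama_model & model, int il, bool split_cache) {
        if (!split_cache) return false;
        auto * K = model.layers[il].wk;
        auto * V = model.layers[il].wv;
        // With ngl < n_layers the layers that were not offloaded stay on the CPU
        // and their weights have no split extra, so their cache must use a
        // single buffer instead of a split one.
        return K && V && K->extra && V->extra;
    }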