mirror of
https://github.com/ikawrakow/ik_llama.cpp.git
synced 2026-02-23 14:44:09 +00:00
Fix graph parallel when ngl < n_layers (#1241)
* Fix graph parallel when ngl < n_layers * Fix using ffn_norm When using graph parallel with ngl < n_layers, the ffn_norm tensor may have ended up being split, while the ffn tensors are on the CPU. In that case we will get a crash because we attempt to use the not-split buffer of ffn_norm, which is invalid. Thi commit fixes that. * Cleanup
This commit is contained in:
124
src/llama.cpp
124
src/llama.cpp
@@ -570,9 +570,11 @@ bool llama_context::update_cache_copies() {
|
||||
int n_layer = model.hparams.n_layer - model.hparams.nextn_predict_layers; //cache_copies.size()/2;
|
||||
if ((int)kv_self.k_l.size() != n_layer) return false;
|
||||
if (!(kv_self.v_l.empty() || (int)kv_self.v_l.size() == n_layer)) return false;
|
||||
if ((model.split_mode == LLAMA_SPLIT_MODE_GRAPH || model.split_mode == LLAMA_SPLIT_MODE_ATTN) && model.splits.size() > 1) {
|
||||
for (int il = 0; il < n_layer; ++il) {
|
||||
auto kl = (ggml_split_tensor_t *)kv_self.k_l[il]->extra;
|
||||
for (int il = 0; il < n_layer; ++il) {
|
||||
auto kl = (ggml_split_tensor_t *)kv_self.k_l[il]->extra;
|
||||
if (kl) {
|
||||
GGML_ASSERT(model.split_mode == LLAMA_SPLIT_MODE_GRAPH || model.split_mode == LLAMA_SPLIT_MODE_ATTN);
|
||||
GGML_ASSERT(model.splits.size() > 1);
|
||||
auto vl = !kv_self.v_l.empty() && kv_self.v_l[il] ? (ggml_split_tensor_t *)kv_self.v_l[il]->extra : nullptr;
|
||||
GGML_ASSERT(kl && (!kv_self.v_l[il] || vl));
|
||||
if (vl) {
|
||||
@@ -593,22 +595,22 @@ bool llama_context::update_cache_copies() {
|
||||
c.cpy->src[1]->data = (char *)vl->splits[id]->data + c.cpy->view_offs;
|
||||
c.cpy->data = c.cpy->src[1]->data;
|
||||
}
|
||||
}
|
||||
} else {
|
||||
for (int il = 0; il < n_layer; ++il) {
|
||||
auto& c = cache_copies[2*il+0];
|
||||
if (!c.cpy || c.cpy->op != GGML_OP_CPY || c.cpy->view_src != kv_self.k_l[il]) return false;
|
||||
c.cpy->view_offs = kv_self.head*c.step;
|
||||
c.cpy->src[1]->data = (char *)kv_self.k_l[il]->data + c.cpy->view_offs;
|
||||
c.cpy->data = c.cpy->src[1]->data;
|
||||
}
|
||||
if (kv_self.v_l.empty()) return true;
|
||||
for (int il = 0; il < n_layer; ++il) {
|
||||
auto& c = cache_copies[2*il+1];
|
||||
if (!c.cpy || c.cpy->op != GGML_OP_CPY || c.cpy->view_src != kv_self.v_l[il]) return false;
|
||||
c.cpy->view_offs = kv_self.head*c.step;
|
||||
c.cpy->src[1]->data = (char *)kv_self.v_l[il]->data + c.cpy->view_offs;
|
||||
c.cpy->data = c.cpy->src[1]->data;
|
||||
} else {
|
||||
for (int il = 0; il < n_layer; ++il) {
|
||||
auto& c = cache_copies[2*il+0];
|
||||
if (!c.cpy || c.cpy->op != GGML_OP_CPY || c.cpy->view_src != kv_self.k_l[il]) return false;
|
||||
c.cpy->view_offs = kv_self.head*c.step;
|
||||
c.cpy->src[1]->data = (char *)kv_self.k_l[il]->data + c.cpy->view_offs;
|
||||
c.cpy->data = c.cpy->src[1]->data;
|
||||
}
|
||||
if (kv_self.v_l.empty()) return true;
|
||||
for (int il = 0; il < n_layer; ++il) {
|
||||
auto& c = cache_copies[2*il+1];
|
||||
if (!c.cpy || c.cpy->op != GGML_OP_CPY || c.cpy->view_src != kv_self.v_l[il]) return false;
|
||||
c.cpy->view_offs = kv_self.head*c.step;
|
||||
c.cpy->src[1]->data = (char *)kv_self.v_l[il]->data + c.cpy->view_offs;
|
||||
c.cpy->data = c.cpy->src[1]->data;
|
||||
}
|
||||
}
|
||||
}
|
||||
return true;
|
||||
@@ -781,6 +783,13 @@ static bool llama_kv_cache_init(
|
||||
n_mla++;
|
||||
}
|
||||
else {
|
||||
bool split_cache_i = split_cache;
|
||||
auto K = model.layers[i].wk;
|
||||
auto V = model.layers[i].wv;
|
||||
if (split_cache && (!K || !V || !K->extra || !V->extra)) {
|
||||
ctx = offload ? ctx_map.at(model.buft_layer[i].buft) : cache.ctxs.front();
|
||||
split_cache_i = false;
|
||||
}
|
||||
int n_embd_head_v = hparams.n_embd_head_v;
|
||||
k = ggml_new_tensor_2d(ctx, type_k, n_embd_head_k, n_head_kv*kv_size);
|
||||
v = ggml_new_tensor_1d(ctx, type_v, n_embd_v_gqa*kv_size);
|
||||
@@ -792,50 +801,43 @@ static bool llama_kv_cache_init(
|
||||
//ggml_format_name(v, "cache_v_l%d", i);
|
||||
cache.k_l.push_back(k);
|
||||
cache.v_l.push_back(v);
|
||||
if (split_cache) {
|
||||
auto K = model.layers[i].wk;
|
||||
auto V = model.layers[i].wv;
|
||||
if (K && V && K->extra && V->extra) {
|
||||
bool use_V_for_K = model.layers[i].attn_k_norm && model.layers[i].attn_k_norm->ne[0] == K->ne[1] ? true : false;
|
||||
auto extra_K = (const ggml_split_tensor_t *)K->extra;
|
||||
auto extra_V = (const ggml_split_tensor_t *)V->extra;
|
||||
auto & split_k_l = cache.split_k_l.emplace_back();
|
||||
auto & split_v_l = cache.split_v_l.emplace_back();
|
||||
split_k_l.tensor_splits.resize(extra_K->n_device, nullptr);
|
||||
split_v_l.tensor_splits.resize(extra_V->n_device, nullptr);
|
||||
for (int is = 0; is < extra_K->n_device; ++is) {
|
||||
auto split = use_V_for_K ? extra_V->splits[is] : extra_K->splits[is];
|
||||
if (!split) continue;
|
||||
int nhead_kv = use_V_for_K ? split->ne[1] / n_embd_head_v : split->ne[1]/n_embd_head_k;
|
||||
if (use_V_for_K) {
|
||||
LLAMA_LOG_DEBUG("K_cache(%d, %d): using %d instead of %ld heads\n",
|
||||
i, is, nhead_kv, extra_K->splits[is]->ne[1]/n_embd_head_k);
|
||||
}
|
||||
split_k_l.tensor_splits[is] = ggml_new_tensor_2d(ctx, type_k, n_embd_head_k, nhead_kv * kv_size);
|
||||
auto split_name = k_name + '.' + std::to_string(is);
|
||||
ggml_set_name(split_k_l.tensor_splits[is], split_name.c_str());
|
||||
mem_split[is] += ggml_nbytes(split_k_l.tensor_splits[is]);
|
||||
if (split_cache_i) {
|
||||
bool use_V_for_K = model.layers[i].attn_k_norm && model.layers[i].attn_k_norm->ne[0] == K->ne[1] ? true : false;
|
||||
auto extra_K = (const ggml_split_tensor_t *)K->extra;
|
||||
auto extra_V = (const ggml_split_tensor_t *)V->extra;
|
||||
auto & split_k_l = cache.split_k_l.emplace_back();
|
||||
auto & split_v_l = cache.split_v_l.emplace_back();
|
||||
split_k_l.tensor_splits.resize(extra_K->n_device, nullptr);
|
||||
split_v_l.tensor_splits.resize(extra_V->n_device, nullptr);
|
||||
for (int is = 0; is < extra_K->n_device; ++is) {
|
||||
auto split = use_V_for_K ? extra_V->splits[is] : extra_K->splits[is];
|
||||
if (!split) continue;
|
||||
int nhead_kv = use_V_for_K ? split->ne[1] / n_embd_head_v : split->ne[1]/n_embd_head_k;
|
||||
if (use_V_for_K) {
|
||||
LLAMA_LOG_DEBUG("K_cache(%d, %d): using %d instead of %ld heads\n",
|
||||
i, is, nhead_kv, extra_K->splits[is]->ne[1]/n_embd_head_k);
|
||||
}
|
||||
split_k_l.ggml.n_device = extra_K->n_device;
|
||||
split_k_l.ggml.split_dim = 0;
|
||||
split_k_l.ggml.splits = split_k_l.tensor_splits.data();
|
||||
for (int is = 0; is < extra_V->n_device; ++is) {
|
||||
auto split = extra_V->splits[is];
|
||||
if (!split) continue;
|
||||
split_v_l.tensor_splits[is] = ggml_new_tensor_1d(ctx, type_v, split->ne[1] * kv_size);
|
||||
auto split_name = v_name + '.' + std::to_string(is);
|
||||
ggml_set_name(split_v_l.tensor_splits[is], split_name.c_str());
|
||||
mem_split[is] += ggml_nbytes(split_v_l.tensor_splits[is]);
|
||||
}
|
||||
split_v_l.ggml.n_device = extra_V->n_device;
|
||||
split_v_l.ggml.split_dim = 0;
|
||||
split_v_l.ggml.splits = split_v_l.tensor_splits.data();
|
||||
k->extra = (void *)&split_k_l.ggml;
|
||||
v->extra = (void *)&split_v_l.ggml;
|
||||
split_k_l.tensor_splits[is] = ggml_new_tensor_2d(ctx, type_k, n_embd_head_k, nhead_kv * kv_size);
|
||||
auto split_name = k_name + '.' + std::to_string(is);
|
||||
ggml_set_name(split_k_l.tensor_splits[is], split_name.c_str());
|
||||
mem_split[is] += ggml_nbytes(split_k_l.tensor_splits[is]);
|
||||
}
|
||||
//} else {
|
||||
// printf("Oops: don't have yet K and V for layer %d\n", i);
|
||||
//}
|
||||
split_k_l.ggml.n_device = extra_K->n_device;
|
||||
split_k_l.ggml.split_dim = 0;
|
||||
split_k_l.ggml.splits = split_k_l.tensor_splits.data();
|
||||
for (int is = 0; is < extra_V->n_device; ++is) {
|
||||
auto split = extra_V->splits[is];
|
||||
if (!split) continue;
|
||||
split_v_l.tensor_splits[is] = ggml_new_tensor_1d(ctx, type_v, split->ne[1] * kv_size);
|
||||
auto split_name = v_name + '.' + std::to_string(is);
|
||||
ggml_set_name(split_v_l.tensor_splits[is], split_name.c_str());
|
||||
mem_split[is] += ggml_nbytes(split_v_l.tensor_splits[is]);
|
||||
}
|
||||
split_v_l.ggml.n_device = extra_V->n_device;
|
||||
split_v_l.ggml.split_dim = 0;
|
||||
split_v_l.ggml.splits = split_v_l.tensor_splits.data();
|
||||
k->extra = (void *)&split_k_l.ggml;
|
||||
v->extra = (void *)&split_v_l.ggml;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user