diff --git a/src/llama-build-context.cpp b/src/llama-build-context.cpp
index a91dfe1e..b4143827 100644
--- a/src/llama-build-context.cpp
+++ b/src/llama-build-context.cpp
@@ -744,8 +744,12 @@ ggml_tensor * llm_build_context::llm_build_ffn(
     }
 
     auto cur = input;
+    //if (input->op == GGML_OP_REDUCE) {
+    //    if (input->src[lctx.model.main_gpu]) cur = input->src[lctx.model.main_gpu];
+    //}
     if (ffn_norm) {
-        cur = llm_build_norm(ctx, cur, lctx.model.hparams, ffn_norm, NULL, is_norm ? LLM_NORM : LLM_NORM_RMS, cb, il);
+        auto the_ffn_norm = ffn_norm->extra ? ((ggml_split_tensor_t *)ffn_norm->extra)->splits[lctx.model.main_gpu] : ffn_norm;
+        cur = llm_build_norm(ctx, cur, lctx.model.hparams, the_ffn_norm, NULL, is_norm ? LLM_NORM : LLM_NORM_RMS, cb, il);
         cb(input, "ffn_norm", il);
     }
     if (cur->type != GGML_TYPE_F32) {
diff --git a/src/llama.cpp b/src/llama.cpp
index 4e84f1c2..f4e7ebf5 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -570,9 +570,11 @@ bool llama_context::update_cache_copies() {
     int n_layer = model.hparams.n_layer - model.hparams.nextn_predict_layers; //cache_copies.size()/2;
     if ((int)kv_self.k_l.size() != n_layer) return false;
     if (!(kv_self.v_l.empty() || (int)kv_self.v_l.size() == n_layer)) return false;
-    if ((model.split_mode == LLAMA_SPLIT_MODE_GRAPH || model.split_mode == LLAMA_SPLIT_MODE_ATTN) && model.splits.size() > 1) {
-        for (int il = 0; il < n_layer; ++il) {
-            auto kl = (ggml_split_tensor_t *)kv_self.k_l[il]->extra;
+    for (int il = 0; il < n_layer; ++il) {
+        auto kl = (ggml_split_tensor_t *)kv_self.k_l[il]->extra;
+        if (kl) {
+            GGML_ASSERT(model.split_mode == LLAMA_SPLIT_MODE_GRAPH || model.split_mode == LLAMA_SPLIT_MODE_ATTN);
+            GGML_ASSERT(model.splits.size() > 1);
             auto vl = !kv_self.v_l.empty() && kv_self.v_l[il] ? (ggml_split_tensor_t *)kv_self.v_l[il]->extra : nullptr;
             GGML_ASSERT(kl && (!kv_self.v_l[il] || vl));
             if (vl) {
@@ -593,22 +595,22 @@
                 c.cpy->src[1]->data = (char *)vl->splits[id]->data + c.cpy->view_offs;
                 c.cpy->data = c.cpy->src[1]->data;
             }
-        }
-    } else {
-        for (int il = 0; il < n_layer; ++il) {
-            auto& c = cache_copies[2*il+0];
-            if (!c.cpy || c.cpy->op != GGML_OP_CPY || c.cpy->view_src != kv_self.k_l[il]) return false;
-            c.cpy->view_offs = kv_self.head*c.step;
-            c.cpy->src[1]->data = (char *)kv_self.k_l[il]->data + c.cpy->view_offs;
-            c.cpy->data = c.cpy->src[1]->data;
-        }
-        if (kv_self.v_l.empty()) return true;
-        for (int il = 0; il < n_layer; ++il) {
-            auto& c = cache_copies[2*il+1];
-            if (!c.cpy || c.cpy->op != GGML_OP_CPY || c.cpy->view_src != kv_self.v_l[il]) return false;
-            c.cpy->view_offs = kv_self.head*c.step;
-            c.cpy->src[1]->data = (char *)kv_self.v_l[il]->data + c.cpy->view_offs;
-            c.cpy->data = c.cpy->src[1]->data;
+        } else {
+            for (int il = 0; il < n_layer; ++il) {
+                auto& c = cache_copies[2*il+0];
+                if (!c.cpy || c.cpy->op != GGML_OP_CPY || c.cpy->view_src != kv_self.k_l[il]) return false;
+                c.cpy->view_offs = kv_self.head*c.step;
+                c.cpy->src[1]->data = (char *)kv_self.k_l[il]->data + c.cpy->view_offs;
+                c.cpy->data = c.cpy->src[1]->data;
+            }
+            if (kv_self.v_l.empty()) return true;
+            for (int il = 0; il < n_layer; ++il) {
+                auto& c = cache_copies[2*il+1];
+                if (!c.cpy || c.cpy->op != GGML_OP_CPY || c.cpy->view_src != kv_self.v_l[il]) return false;
+                c.cpy->view_offs = kv_self.head*c.step;
+                c.cpy->src[1]->data = (char *)kv_self.v_l[il]->data + c.cpy->view_offs;
+                c.cpy->data = c.cpy->src[1]->data;
+            }
         }
     }
     return true;
@@ -781,6 +783,13 @@ static bool llama_kv_cache_init(
             n_mla++;
         }
         else {
+            bool split_cache_i = split_cache;
+            auto K = model.layers[i].wk;
+            auto V = model.layers[i].wv;
+            if (split_cache && (!K || !V || !K->extra || !V->extra)) {
+                ctx = offload ? ctx_map.at(model.buft_layer[i].buft) : cache.ctxs.front();
+                split_cache_i = false;
+            }
             int n_embd_head_v = hparams.n_embd_head_v;
             k = ggml_new_tensor_2d(ctx, type_k, n_embd_head_k, n_head_kv*kv_size);
             v = ggml_new_tensor_1d(ctx, type_v, n_embd_v_gqa*kv_size);
@@ -792,50 +801,43 @@
             //ggml_format_name(v, "cache_v_l%d", i);
             cache.k_l.push_back(k);
             cache.v_l.push_back(v);
-            if (split_cache) {
-                auto K = model.layers[i].wk;
-                auto V = model.layers[i].wv;
-                if (K && V && K->extra && V->extra) {
-                    bool use_V_for_K = model.layers[i].attn_k_norm && model.layers[i].attn_k_norm->ne[0] == K->ne[1] ? true : false;
-                    auto extra_K = (const ggml_split_tensor_t *)K->extra;
-                    auto extra_V = (const ggml_split_tensor_t *)V->extra;
-                    auto & split_k_l = cache.split_k_l.emplace_back();
-                    auto & split_v_l = cache.split_v_l.emplace_back();
-                    split_k_l.tensor_splits.resize(extra_K->n_device, nullptr);
-                    split_v_l.tensor_splits.resize(extra_V->n_device, nullptr);
-                    for (int is = 0; is < extra_K->n_device; ++is) {
-                        auto split = use_V_for_K ? extra_V->splits[is] : extra_K->splits[is];
-                        if (!split) continue;
-                        int nhead_kv = use_V_for_K ? split->ne[1] / n_embd_head_v : split->ne[1]/n_embd_head_k;
-                        if (use_V_for_K) {
-                            LLAMA_LOG_DEBUG("K_cache(%d, %d): using %d instead of %ld heads\n",
-                                i, is, nhead_kv, extra_K->splits[is]->ne[1]/n_embd_head_k);
-                        }
-                        split_k_l.tensor_splits[is] = ggml_new_tensor_2d(ctx, type_k, n_embd_head_k, nhead_kv * kv_size);
-                        auto split_name = k_name + '.' + std::to_string(is);
-                        ggml_set_name(split_k_l.tensor_splits[is], split_name.c_str());
-                        mem_split[is] += ggml_nbytes(split_k_l.tensor_splits[is]);
+            if (split_cache_i) {
+                bool use_V_for_K = model.layers[i].attn_k_norm && model.layers[i].attn_k_norm->ne[0] == K->ne[1] ? true : false;
+                auto extra_K = (const ggml_split_tensor_t *)K->extra;
+                auto extra_V = (const ggml_split_tensor_t *)V->extra;
+                auto & split_k_l = cache.split_k_l.emplace_back();
+                auto & split_v_l = cache.split_v_l.emplace_back();
+                split_k_l.tensor_splits.resize(extra_K->n_device, nullptr);
+                split_v_l.tensor_splits.resize(extra_V->n_device, nullptr);
+                for (int is = 0; is < extra_K->n_device; ++is) {
+                    auto split = use_V_for_K ? extra_V->splits[is] : extra_K->splits[is];
+                    if (!split) continue;
+                    int nhead_kv = use_V_for_K ? split->ne[1] / n_embd_head_v : split->ne[1]/n_embd_head_k;
+                    if (use_V_for_K) {
+                        LLAMA_LOG_DEBUG("K_cache(%d, %d): using %d instead of %ld heads\n",
+                            i, is, nhead_kv, extra_K->splits[is]->ne[1]/n_embd_head_k);
                     }
-                    split_k_l.ggml.n_device = extra_K->n_device;
-                    split_k_l.ggml.split_dim = 0;
-                    split_k_l.ggml.splits = split_k_l.tensor_splits.data();
-                    for (int is = 0; is < extra_V->n_device; ++is) {
-                        auto split = extra_V->splits[is];
-                        if (!split) continue;
-                        split_v_l.tensor_splits[is] = ggml_new_tensor_1d(ctx, type_v, split->ne[1] * kv_size);
-                        auto split_name = v_name + '.' + std::to_string(is);
-                        ggml_set_name(split_v_l.tensor_splits[is], split_name.c_str());
-                        mem_split[is] += ggml_nbytes(split_v_l.tensor_splits[is]);
-                    }
-                    split_v_l.ggml.n_device = extra_V->n_device;
-                    split_v_l.ggml.split_dim = 0;
-                    split_v_l.ggml.splits = split_v_l.tensor_splits.data();
-                    k->extra = (void *)&split_k_l.ggml;
-                    v->extra = (void *)&split_v_l.ggml;
+                    split_k_l.tensor_splits[is] = ggml_new_tensor_2d(ctx, type_k, n_embd_head_k, nhead_kv * kv_size);
+                    auto split_name = k_name + '.' + std::to_string(is);
+                    ggml_set_name(split_k_l.tensor_splits[is], split_name.c_str());
+                    mem_split[is] += ggml_nbytes(split_k_l.tensor_splits[is]);
                 }
-                //} else {
-                //    printf("Oops: don't have yet K and V for layer %d\n", i);
-                //}
+                split_k_l.ggml.n_device = extra_K->n_device;
+                split_k_l.ggml.split_dim = 0;
+                split_k_l.ggml.splits = split_k_l.tensor_splits.data();
+                for (int is = 0; is < extra_V->n_device; ++is) {
+                    auto split = extra_V->splits[is];
+                    if (!split) continue;
+                    split_v_l.tensor_splits[is] = ggml_new_tensor_1d(ctx, type_v, split->ne[1] * kv_size);
+                    auto split_name = v_name + '.' + std::to_string(is);
+                    ggml_set_name(split_v_l.tensor_splits[is], split_name.c_str());
+                    mem_split[is] += ggml_nbytes(split_v_l.tensor_splits[is]);
+                }
+                split_v_l.ggml.n_device = extra_V->n_device;
+                split_v_l.ggml.split_dim = 0;
+                split_v_l.ggml.splits = split_v_l.tensor_splits.data();
+                k->extra = (void *)&split_k_l.ggml;
+                v->extra = (void *)&split_v_l.ggml;
             }
         }
     }
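Note on the mechanism both files rely on: a tensor that has been split across devices carries a ggml_split_tensor_t descriptor in its extra pointer (a null extra means the tensor is not split), and the hunks above only touch the descriptor fields n_device, split_dim and the per-device splits array. The sketch below is a hypothetical, simplified illustration of that lookup pattern, not this fork's actual definition; it mirrors the ffn_norm->extra ? ...->splits[main_gpu] : ffn_norm fallback in llm_build_ffn, with a couple of extra range guards added for clarity.

    // Simplified stand-ins for illustration only; the real ggml_tensor and
    // ggml_split_tensor_t live in this fork's ggml headers and carry more members.
    struct ggml_tensor;

    struct ggml_split_tensor_t {
        int            n_device;   // number of devices the tensor is split across
        int            split_dim;  // dimension the split runs along (0 in the KV-cache hunks)
        ggml_tensor ** splits;     // one entry per device; an entry may be null
    };

    // Hypothetical helper: return the per-device piece when the tensor is split,
    // otherwise fall back to the unsplit tensor itself.
    static ggml_tensor * pick_device_split(ggml_tensor * t, void * extra, int device) {
        if (!extra) {
            return t;  // no descriptor attached: tensor is not split
        }
        auto * desc = (ggml_split_tensor_t *) extra;
        if (device < 0 || device >= desc->n_device || !desc->splits[device]) {
            return t;  // out of range or no piece on this device: use the full tensor
        }
        return desc->splits[device];
    }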