WIP: allocate graph

2026-04-30 19:31:48 +00:00 · 2025-12-03 07:54:53 +00:00
parent b415e734e5
commit dde8028336
2 changed files with 202 additions and 18 deletions
--- a/src/llama-build-context.cpp
+++ b/src/llama-build-context.cpp
@@ -61,6 +61,110 @@ llm_build_context::llm_build_context(
            // all initializations should be done in init()
 }
 struct GraphAllocator {
    size_t cur_offset = 0;
    size_t max_offset = 0;
    std::unordered_map<ggml_tensor *, size_t> tensors;
    std::vector<std::pair<std::size_t,std::size_t>> gaps;
    bool debug = false;
    inline void add(ggml_tensor * t, bool in_place = false) {
        if (t->view_src) {
            auto it = tensors.find(t->view_src);
            GGML_ASSERT(it != tensors.end());
            auto status = tensors.emplace(std::make_pair(t, it->second + t->view_offs));
            GGML_ASSERT(status.second);
            return;
        }
        auto offset = cur_offset;
        auto nbytes = ggml_nbytes(t);
        if (in_place) {
            auto it = tensors.find(t->src[0]);
            GGML_ASSERT(it != tensors.end());
            offset = it->second;
        } else {
            for (auto it = gaps.begin(); it != gaps.end(); ++it) {
                if (nbytes == it->second) {
                    offset = it->first;
                    gaps.erase(it);
                    break;
                }
            }
        }
        auto status = tensors.emplace(std::make_pair(t, offset));
        GGML_ASSERT(status.second);
        if (debug) {
            printf("Added tensor %s at offset %zu. cur_offset = %zu, max_offset = %zu\n", t->name, offset, cur_offset, max_offset);
        }
        if (offset == cur_offset) {
            cur_offset += ggml_nbytes(t);
            max_offset = std::max(max_offset, cur_offset);
        }
    }
    inline void add(ggml_tensor * t, ggml_tensor * prev) {
        GGML_ASSERT(ggml_nbytes(t) == ggml_nbytes(prev));
        auto it = tensors.find(t);
        GGML_ASSERT(it != tensors.end());
        auto status = tensors.emplace(std::make_pair(t, it->second));
        GGML_ASSERT(status.second);
    }
    inline void remove(ggml_tensor * t) {
        if (t->view_src) return;
        auto it = tensors.find(t);
        GGML_ASSERT(it != tensors.end());
        auto nbytes = ggml_nbytes(t);
        if (it->second + nbytes == cur_offset) {
            cur_offset -= nbytes;
            if (debug) {
                printf("Removed tensor %s at end, cur_offset is now %zu\n", t->name, cur_offset);
            }
            return;
        }
        for (int i = 0; i < int(gaps.size()); ++i) {
            auto & gap = gaps[i];
            if (gap.first + gap.second == it->second) {
                gap.second += nbytes;
                if (debug) printf("Added gap for removed tensor %s to existing gap %zu, %zu\n", t->name, gap.first, gap.second);
                if (i + 1 < int(gaps.size()) && gap.first + gap.second == gaps[i+1].first) {
                    if (debug) printf("  merged gaps %d, %d\n", i, i+1);
                    gaps.erase(gaps.begin() + i + 1);
                }
                return;
            }
        }
        gaps.push_back({it->second, nbytes});
        if (debug) {
            auto & gap = gaps.back();
            printf("Created new gap %d: %zu,%zu after removing tensor %s\n", int(gaps.size()), gap.first, gap.second, t->name);
        }
    }
    inline void remove_from(ggml_tensor * t, bool after) {
        auto it = tensors.find(t);
        GGML_ASSERT(it != tensors.end());
        cur_offset = it->second;
        if (after) cur_offset += ggml_nbytes(t);
        for (auto gap = gaps.begin(); gap != gaps.end(); ) {
            if (gap->first >= cur_offset) {
                gap = gaps.erase(gap);
            } else {
                ++gap;
            }
        }
    }
    inline size_t offset(ggml_tensor * t) const {
        auto it = tensors.find(t);
        GGML_ASSERT(it != tensors.end());
        return it->second;
    }
 };
 struct llm_build_context_data {
    std::vector<GraphAllocator> alloc;
 };
 llm_build_context::~llm_build_context() = default;
 void llm_build_context::init() {
    struct ggml_init_params params = {
        /*.mem_size   =*/ buf_compute_meta.size(),
@@ -85,6 +189,12 @@ void llm_build_context::init() {
    lctx.inp_pos_bucket    = nullptr;
    lctx.inp_embd_enc      = nullptr;
    lctx.inp_KQ_mask_cross = nullptr;
    auto & model = lctx.model;
    if (model.split_mode == LLAMA_SPLIT_MODE_GRAPH && model.splits.size() > 1) {
        data = std::make_unique<llm_build_context_data>();
        data->alloc.resize(model.splits.size());
    }
 }
 void llm_build_context::free() {
@@ -1502,7 +1612,7 @@ std::tuple<ggml_tensor*, ggml_tensor*, ggml_tensor*> llm_build_context::llm_buil
            ggml_tensor * wq, ggml_tensor * bq,
            ggml_tensor * wk, ggml_tensor * bk,
            ggml_tensor * wv, ggml_tensor * bv,
-            float attention_scale, int il) const {
+            float attention_scale, int il, GraphAllocator * alloc) const {
    auto Qcur = llm_build_lora_mm(lctx, ctx0, wq, cur);
    cb(Qcur, "Qcur", il);
    auto Kcur = llm_build_lora_mm(lctx, ctx0, wk, cur);
@@ -1512,25 +1622,34 @@ std::tuple<ggml_tensor*, ggml_tensor*, ggml_tensor*> llm_build_context::llm_buil
    ggml_build_forward_expand(gf, Qcur);
    ggml_build_forward_expand(gf, Kcur);
    ggml_build_forward_expand(gf, Vcur);
    if (alloc) {
        alloc->add(Qcur);
        alloc->add(Kcur);
        alloc->add(Vcur);
    }
    if (attention_scale != 0) {
        Qcur = ggml_scale(ctx0, Qcur, attention_scale);
        cb(Qcur, "Qcur", il);
        if (alloc) alloc->add(Qcur, true);
    }
    if (bq) {
        Qcur = ggml_add(ctx0, Qcur, bq);
        cb(Qcur, "Qcur", il);
        ggml_build_forward_expand(gf, Qcur);
        if (alloc) alloc->add(Qcur, true);
    }
    if (bk) {
        Kcur = ggml_add(ctx0, Kcur, bk);
        cb(Kcur, "Kcur", il);
        ggml_build_forward_expand(gf, Kcur);
        if (alloc) alloc->add(Kcur, true);
    }
    if (bv) {
        Vcur = ggml_add(ctx0, Vcur, bv);
        cb(Vcur, "Vcur", il);
        ggml_build_forward_expand(gf, Vcur);
        if (alloc) alloc->add(Vcur, true);
    }
    return {Qcur, Kcur, Vcur};
 }
@@ -1541,15 +1660,17 @@ std::tuple<ggml_tensor*, ggml_tensor*, ggml_tensor*> llm_build_context::llm_buil
            ggml_tensor * wq, ggml_tensor * bq,
            ggml_tensor * wk, ggml_tensor * bk,
            ggml_tensor * wv, ggml_tensor * bv,
-            ggml_tensor * q_norm, ggml_tensor * k_norm, float attention_scale, int il) const {
+            ggml_tensor * q_norm, ggml_tensor * k_norm, float attention_scale, int il, GraphAllocator * alloc) const {
    const int64_t n_embd_head = hparams.n_embd_head_v;
    const int64_t n_embd_gqa  = hparams.n_embd_v_gqa();
    if (wqkv) {
        auto qkv = llm_build_lora_mm(lctx, ctx0, wqkv, cur);
        if (alloc) alloc->add(qkv);
        cb(qkv, "qkv", il);
        if (bqkv) {
            qkv = ggml_add(ctx0, qkv, bqkv);
            cb(qkv, "qkv_b", il);
            if (alloc) alloc->add(qkv, true);
        }
        auto Qcur = ggml_view_3d(ctx0, qkv, n_embd_head, n_head,    n_tokens, n_embd_head*sizeof(float), qkv->nb[1], 0*sizeof(float)*(n_embd));
        auto Kcur = ggml_view_3d(ctx0, qkv, n_embd_head, n_head_kv, n_tokens, n_embd_head*sizeof(float), qkv->nb[1], 1*sizeof(float)*Qcur->ne[0]*Qcur->ne[1]);
@@ -1557,15 +1678,22 @@ std::tuple<ggml_tensor*, ggml_tensor*, ggml_tensor*> llm_build_context::llm_buil
        cb(Qcur, "Qcur", il);
        cb(Kcur, "Kcur", il);
        cb(Vcur, "Vcur", il);
        if (alloc) {
            alloc->add(Qcur);
            alloc->add(Kcur);
            alloc->add(Vcur);
        }
        if (q_norm) {
            Qcur = llm_build_norm(ctx0, Qcur, hparams, q_norm, NULL, LLM_NORM_RMS, cb, il);
            cb(Qcur, "Qcur_normed", il);
            ggml_build_forward_expand(gf, Qcur);
            if (alloc) alloc->add(Qcur, true);
        }
        if (k_norm) {
            Kcur = llm_build_norm(ctx0, Kcur, hparams, k_norm, NULL, LLM_NORM_RMS, cb, il);
            cb(Kcur, "Kcur_normed", il);
            ggml_build_forward_expand(gf, Kcur);
            if (alloc) alloc->add(Kcur, true);
        }
        return {Qcur, Kcur, Vcur};
@@ -1577,49 +1705,63 @@ std::tuple<ggml_tensor*, ggml_tensor*, ggml_tensor*> llm_build_context::llm_buil
    if (wqk) {
        auto qk = llm_build_lora_mm(lctx, ctx0, wqk, cur);
        if (alloc) alloc->add(qk);
        cb(qk, "qkv", il);
        if (bqk) {
            qk = ggml_add(ctx0, qk, bqk);
            cb(qk, "qkv_b", il);
            if (alloc) alloc->add(qk, true);
        }
        auto Vcur = llm_build_lora_mm(lctx, ctx0, wv, cur);
        cb(Vcur, "Vcur", il);
        if (alloc) alloc->add(Vcur);
        if (bv) {
            Vcur = ggml_add(ctx0, Vcur, bv);
            cb(Vcur, "Vcur", il);
            if (alloc) alloc->add(Vcur, true);
        }
        ggml_build_forward_expand(gf, qk);
        ggml_build_forward_expand(gf, Vcur);
        auto Qcur = ggml_view_3d(ctx0, qk, n_embd_head, n_head,    n_tokens, n_embd_head*sizeof(float), qk->nb[1], 0*sizeof(float)*(n_embd));
        auto Kcur = ggml_view_3d(ctx0, qk, n_embd_head, n_head_kv, n_tokens, n_embd_head*sizeof(float), qk->nb[1], 1*sizeof(float)*Qcur->ne[0]*Qcur->ne[1]);
        if (alloc) {
            alloc->add(Qcur);
            alloc->add(Vcur);
        }
        cb(Qcur, "Qcur", il);
        cb(Kcur, "Kcur", il);
        if (q_norm) {
            Qcur = llm_build_norm(ctx0, Qcur, hparams, q_norm, NULL, LLM_NORM_RMS, cb, il);
            cb(Qcur, "Qcur_normed", il);
            ggml_build_forward_expand(gf, Qcur);
            if (alloc) alloc->add(Qcur, true);
        }
        if (k_norm) {
            Kcur = llm_build_norm(ctx0, Kcur, hparams, k_norm, NULL, LLM_NORM_RMS, cb, il);
            cb(Kcur, "Kcur_normed", il);
            ggml_build_forward_expand(gf, Kcur);
            if (alloc) alloc->add(Kcur, true);
        }
        return {Qcur, Kcur, Vcur};
    }
-    auto [Q, K, V] = llm_build_mul_mat_qkv(gf, cur, wq, bq, wk, bk, wv, bv, attention_scale, il);
+    auto [Q, K, V] = llm_build_mul_mat_qkv(gf, cur, wq, bq, wk, bk, wv, bv, attention_scale, il, alloc);
    auto Qcur = ggml_reshape_3d(ctx0, Q, n_embd_head, Q->ne[0]/n_embd_head, n_tokens);
    if (alloc) alloc->add(Qcur);
    if (q_norm) {
        Qcur = llm_build_norm(ctx0, Qcur, hparams, q_norm, NULL, LLM_NORM_RMS, cb, il);
        cb(Qcur, "Qcur_normed", il);
        if (alloc) alloc->add(Qcur, true);
    }
    auto Kcur = ggml_reshape_3d(ctx0, K, n_embd_head, K->ne[0]/n_embd_head, n_tokens);
    if (alloc) alloc->add(Kcur);
    if (k_norm) {
        Kcur = llm_build_norm(ctx0, Kcur, hparams, k_norm, NULL, LLM_NORM_RMS, cb, il);
        cb(Kcur, "Kcur_normed", il);
        if (alloc) alloc->add(Kcur, true);
    }
    auto Vcur = V;
    return {Qcur, Kcur, Vcur};
@@ -9238,6 +9380,8 @@ ggml_tensor * llm_build_context::build_std_attention(ggml_cgraph * gf, ggml_tens
            }
            std::vector<ggml_tensor*> attn; attn.reserve(wq->n_device);
            for (int id = 0; id < wq->n_device; ++id) {
                auto alloc = &data->alloc[id];
                alloc->debug = true;
                int il_cb = 1000*(id+1) + il;
                auto split_wq = wq->splits[id];
                auto split_wk = wk->splits[id];
@@ -9248,14 +9392,20 @@ ggml_tensor * llm_build_context::build_std_attention(ggml_cgraph * gf, ggml_tens
                GGML_ASSERT((!split_wq && !split_wk && !split_wv && !split_wo && !split_kl && !split_vl) ||
                        (split_wq && split_wk && split_wv && split_wo && split_kl && split_vl));
                if (!split_wq) continue;
                alloc->add(input);
                auto cur = input;
                auto offset = alloc->offset(input) + ggml_nbytes(input);
                if (attn_norm) {
                    auto split_norm = attn_norm->splits[id];
                    cur = llm_build_norm(ctx0, cur, hparams, split_norm, NULL, LLM_NORM_RMS, cb, il);
                    cb(cur, "attn_norm", il_cb);
                    alloc->add(cur);
                    offset = alloc->offset(cur);
                }
                else if (cur->type != GGML_TYPE_F32) {
                    cur = ggml_cast(ctx0, cur, GGML_TYPE_F32);
                    alloc->add(cur);
                    offset = alloc->offset(cur);
                }
                auto the_q_norm = model.layers[il].attn_q_norm ? model.layers[il].attn_q_norm->extra ?
                    ((ggml_split_tensor_t *)model.layers[il].attn_q_norm->extra)->splits[id] : model.layers[il].attn_q_norm : nullptr;
@@ -9265,7 +9415,7 @@ ggml_tensor * llm_build_context::build_std_attention(ggml_cgraph * gf, ggml_tens
                        split_wq, bq ? bq->splits[id] : nullptr,
                        split_wk, bk ? bk->splits[id] : nullptr,
                        split_wv, bv ? bv->splits[id] : nullptr,
-                        the_q_norm, the_k_norm, f_attn_scale, il_cb);
+                        the_q_norm, the_k_norm, f_attn_scale, il_cb, alloc);
                auto rope_factors = rope_factors_in;
                if (!rope_factors && model.layers[il].rope_freqs && model.layers[il].rope_freqs->extra) {
                    auto extra = (ggml_split_tensor_t *)model.layers[il].rope_freqs->extra;
@@ -9273,8 +9423,10 @@ ggml_tensor * llm_build_context::build_std_attention(ggml_cgraph * gf, ggml_tens
                }
                Qcur = ggml_rope_ext(ctx0, Qcur, inp_pos, rope_factors, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
                        ext_factor, attn_factor, beta_fast, beta_slow);
                alloc->add(Qcur, true);
                Kcur = ggml_rope_ext(ctx0, Kcur, inp_pos, rope_factors, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
                        ext_factor, attn_factor, beta_fast, beta_slow);
                alloc->add(Kcur, true);
                cb(Qcur, "Qcur", il_cb);
                cb(Kcur, "Kcur", il_cb);
                ggml_build_forward_expand(gf, Qcur);
@@ -9300,19 +9452,26 @@ ggml_tensor * llm_build_context::build_std_attention(ggml_cgraph * gf, ggml_tens
                struct ggml_tensor * v_cache_view = nullptr;
-                if (cparams.flash_attn) {
+                GGML_ASSERT(cparams.flash_attn);
-                    v_cache_view = ggml_view_1d(ctx0, split_vl, n_tokens*split_wv->ne[1],
+                v_cache_view = ggml_view_1d(ctx0, split_vl, n_tokens*split_wv->ne[1],
-                            kv_head*ggml_row_size(split_vl->type, split_wv->ne[1]));
+                        kv_head*ggml_row_size(split_vl->type, split_wv->ne[1]));
-                    lctx.cache_copies[idx+1].step = ggml_row_size(split_vl->type, split_wv->ne[1]);
+                lctx.cache_copies[idx+1].step = ggml_row_size(split_vl->type, split_wv->ne[1]);
-                } else {
+                alloc->remove(Vcur);
-                    // note: the V cache is transposed when not using flash attention
+                alloc->remove(Kcur);
                    v_cache_view = ggml_view_2d(ctx0, split_vl, n_tokens, split_wv->ne[1],
                            (  n_ctx)*ggml_element_size(split_vl),
                            (kv_head)*ggml_element_size(split_vl));
                    lctx.cache_copies[idx+1].step = ggml_element_size(split_vl);
-                    Vcur = ggml_transpose(ctx0, Vcur);
+                //if (cparams.flash_attn) {
-                }
+                //    v_cache_view = ggml_view_1d(ctx0, split_vl, n_tokens*split_wv->ne[1],
                //            kv_head*ggml_row_size(split_vl->type, split_wv->ne[1]));
                //    lctx.cache_copies[idx+1].step = ggml_row_size(split_vl->type, split_wv->ne[1]);
                //} else {
                //    // note: the V cache is transposed when not using flash attention
                //    v_cache_view = ggml_view_2d(ctx0, split_vl, n_tokens, split_wv->ne[1],
                //            (  n_ctx)*ggml_element_size(split_vl),
                //            (kv_head)*ggml_element_size(split_vl));
                //    lctx.cache_copies[idx+1].step = ggml_element_size(split_vl);
                //    Vcur = ggml_transpose(ctx0, Vcur);
                //}
                cb(v_cache_view, "v_cache_view", il_cb);
                lctx.cache_copies[idx+1].cpy  = ggml_cpy(ctx0, Vcur, v_cache_view);
@@ -9320,6 +9479,7 @@ ggml_tensor * llm_build_context::build_std_attention(ggml_cgraph * gf, ggml_tens
                auto q = ggml_permute(ctx0, Qcur, 0, 2, 1, 3);
                cb(q, "q", il_cb);
                alloc->add(q);
                auto k = ggml_view_3d(ctx0, split_kl, n_embd_head_k, n_kv, n_head_kv,
                             ggml_row_size(split_kl->type, n_embd_head_k)*n_head_kv, //n_embd_k_gqa),
@@ -9343,6 +9503,8 @@ ggml_tensor * llm_build_context::build_std_attention(ggml_cgraph * gf, ggml_tens
                if (n_swa > 0) {
                    ((int32_t *)cur->op_params)[4] = n_swa;
                }
                alloc->add(cur);
                alloc->remove(Qcur);
                // Some models produced NaNs/gibberish when FA is computed with f16 precision on CUDA
                if (use_f32_precision || model.arch == LLM_ARCH_PHI2 || model.arch == LLM_ARCH_PHI3 || model.arch == LLM_ARCH_GPTNEOX ||
@@ -9353,8 +9515,10 @@ ggml_tensor * llm_build_context::build_std_attention(ggml_cgraph * gf, ggml_tens
                cur = ggml_reshape_2d(ctx0, cur, split_wo->ne[0], n_tokens);
                cb(cur, "flash_attn_reshaped", il_cb);
                alloc->add(cur);
                cur = llm_build_lora_mm(lctx, ctx0, split_wo, cur);
                alloc->add(cur);
                if (lctx.model.arch == LLM_ARCH_GLM4 || lctx.model.arch == LLM_ARCH_GLM4_MOE) {
                    // GLM4 and GLM4_MOE seem to have numerical issues with half-precision accumulators
                    ggml_mul_mat_set_prec(cur, GGML_PREC_F32);
@@ -9363,22 +9527,34 @@ ggml_tensor * llm_build_context::build_std_attention(ggml_cgraph * gf, ggml_tens
                if (bo) {
                    cur = ggml_add(ctx0, cur, bo->splits[id]);
                    cb(cur, "kqv_wo_biased", il_cb);
                    alloc->add(cur, true);
                }
                if (cur->ne[1] >= 32) {
                    cur = ggml_cast(ctx0, cur, GGML_TYPE_F16);
                    alloc->add(cur);
                }
                ggml_build_forward_expand(gf, cur);
                attn.push_back(cur);
                if (id != model.main_gpu) {
                    data->alloc[id].cur_offset = offset;
                }
            }
            GGML_ASSERT(!attn.empty());
            if (attn.size() == 1) return attn.front();
            auto alloc = &data->alloc[model.main_gpu];
            if (model.main_gpu != 0) alloc->add(attn[0]);
            if (model.main_gpu != 1) alloc->add(attn[1]);
            auto cur = ggml_add(ctx0, attn[0], attn[1]);
            alloc->add(cur, true);
            cb(cur, "combine_attn", il);
            cur->op_params[0] = 0xff;
            for (int id = 2; id < (int)attn.size(); ++id) {
                if (id != model.main_gpu) alloc->add(attn[id]);
                cur = ggml_add(ctx0, cur, attn[id]);
                cb(cur, "combine_attn", il);
                alloc->add(cur, true);
            }
            alloc->remove_from(cur, true);
            return cur;
        }
    }
--- a/src/llama-build-context.h
+++ b/src/llama-build-context.h
@@ -6,6 +6,7 @@
 #include <cstdint>
 #include <functional>
 #include <tuple>
 #include <memory>
 struct llama_model;
 struct llama_context;
@@ -37,6 +38,9 @@ enum llm_norm_type {
    LLM_NORM_RMS,
 };
 struct llm_build_context_data;
 struct GraphAllocator;
 struct llm_build_context {
    const llama_model    & model;
          llama_context  & lctx;
@@ -94,6 +98,8 @@ struct llm_build_context {
    struct ggml_context * ctx0 = nullptr;
    std::unique_ptr<llm_build_context_data> data;
    // TODO: consider making the entire interface noexcept
    llm_build_context(
        llama_context  & lctx,
@@ -102,6 +108,8 @@ struct llm_build_context {
    bool   worst_case,
    bool   warmup);
    ~llm_build_context();
    void init();
    void free();
@@ -148,7 +156,7 @@ struct llm_build_context {
            ggml_tensor * wq, ggml_tensor * bq,
            ggml_tensor * wk, ggml_tensor * bk,
            ggml_tensor * wv, ggml_tensor * bv,
-            float attention_scale, int il) const;
+            float attention_scale, int il, GraphAllocator * alloc = nullptr) const;
    std::tuple<ggml_tensor*, ggml_tensor*, ggml_tensor*> llm_build_mul_mat_qkv(ggml_cgraph * gf, ggml_tensor * cur,
            ggml_tensor * wqkv, ggml_tensor * bqkv,
@@ -156,7 +164,7 @@ struct llm_build_context {
            ggml_tensor * wq, ggml_tensor * bq,
            ggml_tensor * wk, ggml_tensor * bk,
            ggml_tensor * wv, ggml_tensor * bv,
-            ggml_tensor * q_norm, ggml_tensor * k_norm, float attention_scale, int il) const;
+            ggml_tensor * q_norm, ggml_tensor * k_norm, float attention_scale, int il, GraphAllocator * alloc = nullptr) const;
    ggml_cgraph * build_llama();