llama_build_context

Surprisingly small reduction in llama.cpp compile time given
the reduction in LOCs (22k -> 14k)
This commit is contained in:
Iwan Kawrakow
2025-10-10 08:33:28 +03:00
parent 0582186c66
commit 37bf216d21
7 changed files with 8832 additions and 8328 deletions

View File

@@ -19,6 +19,8 @@ add_library(llama
llama-sampling.cpp
llama-mmap.cpp
llama-model-loader.cpp
llama-build-context.h
llama-build-context.cpp
unicode.h
unicode.cpp
unicode-data.cpp

8179
src/llama-build-context.cpp Normal file

File diff suppressed because it is too large. (Load Diff)

357
src/llama-build-context.h Normal file
View File

@@ -0,0 +1,357 @@
#pragma once

#include "llama-impl.h"
#include "llama-hparams.h"

#include <cstdint>
#include <functional>
#include <tuple>
#include <vector>
struct llama_model;
struct llama_context;
struct llama_cparams;
struct llama_batch;
struct llama_kv_cache;
struct ggml_cgraph;
struct ggml_tensor;
// Callback invoked for every tensor added to a build graph: receives the
// tensor, the name to assign to it, and a layer index nl.
// NOTE(review): nl is presumably negative for tensors not tied to a
// specific layer — confirm against the callers.
using llm_build_cb = std::function<void(struct ggml_tensor * cur, const char * name, int nl)>;
// Activation function applied inside the feed-forward network.
enum llm_ffn_op_type {
LLM_FFN_SILU,
LLM_FFN_GELU,
LLM_FFN_RELU,
LLM_FFN_RELU_SQR,
LLM_FFN_SWIGLU,
LLM_FFN_SWIGLU_OAI_MOE,
};
// How the FFN gate branch relates to the up projection.
enum llm_ffn_gate_type {
LLM_FFN_SEQ,
LLM_FFN_PAR, // ffn_gate is parallel to ffn_up
};
// Normalization flavor used by llm_build_norm.
enum llm_norm_type {
LLM_NORM,
LLM_NORM_RMS,
};
// Graph-building context: one instance is constructed per compute-graph
// build. It captures references to the model, the llama_context and the
// batch being processed, copies frequently used hyper-/context-parameters
// into local const fields, and exposes one build_<arch>() method per
// supported model architecture plus shared static helper builders.
struct llm_build_context {
const llama_model & model;
llama_context & lctx;
const llama_hparams & hparams;
const llama_cparams & cparams;
const llama_batch & batch;
const llama_kv_cache & kv_self;
// model dimensions, copied out of hparams for convenience
const int64_t n_embd;
const int64_t n_layer;
const int64_t n_rot;
const int64_t n_ctx; // user-specified context size (can be different from n_ctx_train)
const int64_t n_head;
const int64_t n_head_kv;
const int64_t n_embd_head_k;
const int64_t n_embd_k_gqa;
const int64_t n_embd_head_v;
const int64_t n_embd_v_gqa;
const int64_t n_expert;
const int64_t n_expert_used;
// RoPE / YaRN parameters
const float freq_base;
const float freq_scale;
const float ext_factor;
const float attn_factor;
const float beta_fast;
const float beta_slow;
// normalization epsilons
const float norm_eps;
const float norm_rms_eps;
// sizes of the current build
const int32_t n_tokens;
const int32_t n_kv; // size of KV cache to consider (n_kv <= kv_self.size)
const int32_t n_outputs;
const int32_t n_outputs_enc;
const int32_t kv_head; // index of where we store new KV data in the cache
const int32_t n_ctx_orig;
// attention / MoE options, copied out of cparams
const bool flash_attn;
const int mla_attn;
const int attn_max_batch;
const bool fused_moe_up_gate;
const bool fused_up_gate;
const int min_experts;
const float thresh_experts;
const enum llama_pooling_type pooling_type;
const enum llama_rope_type rope_type;
const llm_build_cb & cb; // invoked for every tensor added to the graph (see llm_build_cb)
std::vector<uint8_t> & buf_compute_meta; // NOTE(review): presumably aliases llama_context::buf_compute_meta — verify
struct ggml_context * ctx0 = nullptr; // graph context; valid between init() and free()
// TODO: consider making the entire interface noexcept
llm_build_context(
llama_context & lctx,
const llama_batch & batch,
const llm_build_cb & cb,
bool worst_case,
bool warmup);
// create / destroy ctx0 (must bracket every graph build)
void init();
void free();
// KV-cache maintenance graphs
ggml_cgraph * build_k_shift();
ggml_cgraph * build_s_copy();
ggml_cgraph * build_defrag(const std::vector<uint32_t> & ids);
// builders for the graph input tensors
ggml_tensor * build_inp_pos();
ggml_tensor * build_input_scale(int n_tokens);
ggml_tensor * build_rope_factors(int il);
ggml_tensor * build_inp_out_ids();
ggml_tensor * build_inp_KQ_mask(bool causal = true);
ggml_tensor * build_inp_KQ_mask_swa(bool causal = true);
ggml_tensor * build_inp_mean();
ggml_tensor * build_inp_cls();
ggml_tensor * build_inp_s_copy();
ggml_tensor * build_inp_s_mask();
ggml_tensor * build_inp_s_seq();
// appends pooling (per pooling_type) to an existing graph
ggml_cgraph * append_pooling(struct ggml_cgraph * gf);
// encoder/decoder helpers (relative position bias, cross attention)
ggml_tensor * llm_build_pos_bucket(bool causal);
ggml_tensor * llm_build_pos_bias(struct ggml_tensor * pos_bucket, struct ggml_tensor * attn_rel_b);
ggml_tensor * llm_build_inp_embd_enc();
ggml_tensor * llm_build_inp_KQ_mask_cross();
// computes Q, K and V projections (with optional biases) of cur
std::tuple<ggml_tensor*, ggml_tensor*, ggml_tensor*> llm_build_mul_mat_qkv(ggml_cgraph * gf, ggml_tensor * cur,
ggml_tensor * wq, ggml_tensor * bq,
ggml_tensor * wk, ggml_tensor * bk,
ggml_tensor * wv, ggml_tensor * bv,
float attention_scale, int il);
// per-architecture forward-graph builders
ggml_cgraph * build_llama();
ggml_cgraph * build_deci();
ggml_cgraph * build_baichuan();
ggml_cgraph * build_xverse();
ggml_cgraph * build_falcon();
ggml_cgraph * build_grok();
ggml_cgraph * build_dbrx();
ggml_cgraph * build_starcoder();
ggml_cgraph * build_refact();
ggml_cgraph * build_bert();
ggml_cgraph * build_bloom();
ggml_cgraph * build_mpt();
ggml_cgraph * build_stablelm();
ggml_cgraph * build_qwen();
ggml_cgraph * build_qwen2();
ggml_cgraph * build_qwen2vl();
ggml_cgraph * build_qwen2moe();
ggml_cgraph * build_qwen3();
ggml_cgraph * build_qwen3moe();
ggml_cgraph * build_phi2();
ggml_cgraph * build_phi3();
ggml_cgraph * build_plamo();
ggml_cgraph * build_gpt2();
ggml_cgraph * build_codeshell();
ggml_cgraph * build_orion();
ggml_cgraph * build_internlm2();
ggml_cgraph * build_minicpm();
ggml_cgraph * build_gemma();
ggml_cgraph * build_gemma2();
ggml_cgraph * build_gemma3();
ggml_cgraph * build_starcoder2();
ggml_cgraph * build_mamba();
ggml_cgraph * build_command_r();
ggml_cgraph * build_olmo();
ggml_cgraph * build_openelm();
ggml_cgraph * build_gptneox();
ggml_cgraph * build_arctic();
ggml_cgraph * build_deepseek2();
ggml_cgraph * build_glm4_moe();
ggml_cgraph * build_bitnet();
ggml_cgraph * build_bitnet_158();
ggml_cgraph * build_cohere2();
ggml_cgraph * build_t5_encoder();
ggml_cgraph * build_t5_decoder();
ggml_cgraph * build_jais();
ggml_cgraph * build_chatglm();
ggml_cgraph * build_glm4();
ggml_cgraph * build_dots1();
ggml_cgraph * build_ernie4_5();
ggml_cgraph * build_ernie4_5_moe();
ggml_cgraph * build_hunyuan_moe();
ggml_cgraph * build_openai_moe();
// shared static graph-building helpers
static ggml_tensor * llm_build_lora_mm(llama_context & lctx, ggml_context * ctx0,
ggml_tensor * w, ggml_tensor * cur);
static ggml_tensor * llm_build_lora_mm_id(llama_context & lctx, ggml_context * ctx0,
ggml_tensor * w, ggml_tensor * cur, ggml_tensor * ids);
static ggml_tensor * llm_build_inp_embd(ggml_context * ctx, llama_context & lctx,
const llama_hparams & hparams,
const llama_batch & batch,
struct ggml_tensor * tok_embd,
const llm_build_cb & cb);
static ggml_tensor * llm_build_norm(ggml_context * ctx, ggml_tensor * cur,
const llama_hparams & hparams,
ggml_tensor * mw,
ggml_tensor * mb,
llm_norm_type type,
const llm_build_cb & cb, int il, float scale_eps = 1);
static void llm_build_kv_store(ggml_context * ctx, const llama_hparams & hparams,
const llama_cparams & cparams,
const llama_kv_cache & kv,
ggml_cgraph * graph,
ggml_tensor * k_cur,
ggml_tensor * v_cur,
int32_t n_tokens,
int32_t kv_head,
const llm_build_cb & cb, int64_t il);
static struct ggml_tensor * llm_build_kv(ggml_context * ctx, llama_context & lctx,
const llama_kv_cache & kv,
ggml_cgraph * graph,
ggml_tensor * wo,
ggml_tensor * wo_b,
ggml_tensor * k_cur,
ggml_tensor * v_cur,
ggml_tensor * q_cur,
ggml_tensor * kq_mask,
int32_t n_tokens,
int32_t kv_head,
int32_t n_kv,
float kq_scale,
const llm_build_cb & cb, int il, ggml_tensor * sinks = nullptr, int n_swa = 0);
static ggml_tensor * llm_build_ffn(ggml_context * ctx, llama_context & lctx,
ggml_tensor * cur,
ggml_tensor * up,
ggml_tensor * up_b,
ggml_tensor * up_s,
ggml_tensor * gate,
ggml_tensor * gate_b,
ggml_tensor * gate_s,
ggml_tensor * down,
ggml_tensor * down_b,
ggml_tensor * down_s,
ggml_tensor * act_scales,
llm_ffn_op_type type_op,
llm_ffn_gate_type type_gate,
const llm_build_cb & cb, int il);
static ggml_tensor * llm_build_moe_ffn(ggml_context * ctx, llama_context & lctx,
ggml_tensor * cur,
ggml_tensor * gate_inp, ggml_tensor * gate_inp_b,
ggml_tensor * up_exps, ggml_tensor * up_exps_b,
ggml_tensor * gate_exps, ggml_tensor * gate_exps_b,
ggml_tensor * down_exps, ggml_tensor * down_exps_b,
ggml_tensor * exp_probs_b,
int64_t n_expert,
int64_t n_expert_used,
llm_ffn_op_type type_op,
bool norm_w,
bool scale_w,
float w_scale,
llm_expert_gating_func_type gating_op,
const llm_build_cb & cb, int il, ggml_cgraph * graph = nullptr);
// convenience overload: identical to the full version above with all
// per-expert bias tensors passed as nullptr
static ggml_tensor * llm_build_moe_ffn(ggml_context * ctx, llama_context & lctx,
ggml_tensor * cur,
ggml_tensor * gate_inp,
ggml_tensor * up_exps,
ggml_tensor * gate_exps,
ggml_tensor * down_exps,
ggml_tensor * exp_probs_b,
int64_t n_expert,
int64_t n_expert_used,
llm_ffn_op_type type_op,
bool norm_w,
bool scale_w,
float w_scale,
llm_expert_gating_func_type gating_op,
const llm_build_cb & cb, int il, ggml_cgraph * graph = nullptr) {
return llm_build_moe_ffn(ctx, lctx, cur,
gate_inp, nullptr,
up_exps, nullptr,
gate_exps, nullptr,
down_exps, nullptr,
exp_probs_b,
n_expert, n_expert_used,
type_op, norm_w, scale_w, w_scale,
gating_op, cb, il, graph);
}
};

205
src/llama-context.h Normal file
View File

@@ -0,0 +1,205 @@
#pragma once

#include "llama-impl.h"
#include "llama-cparams.h"
#include "llama-sampling.h"

#include <map>
#include <set>
#include <unordered_map>
#include <vector>

struct llama_model;
struct llama_kv_cell {
llama_pos pos = -1;
llama_pos delta = 0;
int32_t src = 0; // used by recurrent state models to copy states
std::set<llama_seq_id> seq_id;
bool has_seq_id(const llama_seq_id & id) const {
return seq_id.find(id) != seq_id.end();
}
bool is_empty() const {
return seq_id.empty();
}
bool is_same_seq(const llama_kv_cell & other) const {
return seq_id == other.seq_id;
}
};
// ring-buffer of cached KV data
//
// Owns the per-layer K/V tensors together with the ggml contexts and
// backend buffers backing them. The destructor frees ctxs/bufs, so the
// type must not be copied: the implicitly generated copy operations
// would lead to a double free (Rule of Five) — they are deleted below.
struct llama_kv_cache {
    bool has_shift = false;
    bool do_defrag = false;
    bool do_copy   = false;
    bool recurrent = false; // with recurrent state models, a cell can hold the state for more than one past token
    bool v_trans   = true;  // the value tensor is transposed

    // Note: The value of head isn't only used to optimize searching
    // for a free KV slot. llama_decode_internal also uses it, so it
    // cannot be freely changed after a slot has been allocated.
    uint32_t head = 0;
    uint32_t size = 0;
    uint32_t used = 0; // used cells (i.e. at least one seq_id)

    // computed before each graph build
    uint32_t n = 0;

    ggml_type type_k = GGML_TYPE_F16;
    ggml_type type_v = GGML_TYPE_F16;

    std::vector<llama_kv_cell> cells;

    std::vector<struct ggml_tensor *> k_l; // per layer
    std::vector<struct ggml_tensor *> v_l;

    std::vector<struct ggml_context *> ctxs;
    std::vector<ggml_backend_buffer_t> bufs;

    llama_kv_cache() = default;

    // non-copyable: copying would double-free ctxs/bufs in the destructor
    llama_kv_cache(const llama_kv_cache &) = delete;
    llama_kv_cache & operator=(const llama_kv_cache &) = delete;

    // total size in bytes of all backend buffers holding the cache
    size_t total_size() const {
        size_t size = 0;
        for (ggml_backend_buffer_t buf : bufs) {
            size += ggml_backend_buffer_get_size(buf);
        }
        return size;
    }

    ~llama_kv_cache() {
        for (struct ggml_context * ctx : ctxs) {
            ggml_free(ctx);
        }
        for (ggml_backend_buffer_t buf : bufs) {
            ggml_backend_buffer_free(buf);
        }
    }
};
// A control vector: one direction tensor per layer, added (via
// apply_to) to the hidden state of layers in [layer_start, layer_end].
//
// Owns its ggml contexts and backend buffers; the destructor frees
// them, so copy operations are deleted to prevent a double free
// (Rule of Five).
struct llama_control_vector {
    std::vector<struct ggml_tensor *> tensors; // per layer
    std::vector<struct ggml_context *> ctxs;
    std::vector<ggml_backend_buffer_t> bufs;

    int32_t layer_start = -1;
    int32_t layer_end   = -1;

    llama_control_vector() = default;

    // non-copyable: copying would double-free ctxs/bufs in the destructor
    llama_control_vector(const llama_control_vector &) = delete;
    llama_control_vector & operator=(const llama_control_vector &) = delete;

    // returns the tensor for layer il, or nullptr when il is outside
    // [layer_start, layer_end] or no tensor was loaded for it
    struct ggml_tensor * tensor_for(int il) const {
        if (il < 0 || il < layer_start || il > layer_end || (size_t) il >= tensors.size()) {
            return nullptr;
        }
        return tensors[il];
    }

    // adds the layer's control vector to cur; no-op when there is none
    struct ggml_tensor * apply_to(struct ggml_context * ctx, struct ggml_tensor * cur, int il) const {
        ggml_tensor * layer_dir = tensor_for(il);
        if (layer_dir != nullptr) {
            cur = ggml_add(ctx, cur, layer_dir);
        }
        return cur;
    }

    ~llama_control_vector() {
        for (struct ggml_context * ctx : ctxs) {
            ggml_free(ctx);
        }
        for (ggml_backend_buffer_t buf : bufs) {
            ggml_backend_buffer_free(buf);
        }
    }
};
// All mutable state of one inference context over a (shared, externally
// owned) llama_model: KV cache, sampling state, control vector, output
// buffers, backends/scheduler, and the graph input tensors.
struct llama_context {
llama_context(const llama_model & model);
~llama_context();
const struct llama_model & model; // reference only — the model is owned by the caller
struct llama_cparams cparams;
struct llama_sampling sampling;
struct llama_kv_cache kv_self;
struct llama_control_vector cvec;
std::vector<float> scale_data; // NOTE(review): presumably backing data for inp_scale — verify
// active LoRA adapters; NOTE(review): the float is presumably the per-adapter scale — verify
std::unordered_map<struct llama_lora_adapter *, float> lora_adapters;
std::vector<ggml_backend_t> backends;
#ifdef GGML_USE_METAL
ggml_backend_t backend_metal = nullptr;
#endif
#ifdef GGML_USE_BLAS
ggml_backend_t backend_blas = nullptr;
#endif
ggml_backend_t backend_cpu = nullptr;
bool has_evaluated_once = false;
// timing statistics (microseconds / counters)
int64_t t_start_us;
int64_t t_load_us;
int64_t t_p_eval_us = 0;
int64_t t_eval_us = 0;
int64_t t_compute_start_us = 0;
int64_t n_queued_tokens = 0;
int32_t n_p_eval = 0; // number of tokens in eval calls for the prompt (with batch size > 1)
int32_t n_eval = 0; // number of eval calls
// host buffer for the model output (logits and embeddings)
ggml_backend_buffer_t buf_output = nullptr;
// decode output (2-dimensional array: [n_outputs][n_vocab])
size_t logits_size = 0; // capacity (of floats) for logits
float * logits = nullptr; // points into buf_output; not separately owned
std::vector<int32_t> output_ids; // map batch token positions to ids of the logits and embd buffers
size_t output_size = 0; // capacity (of tokens positions) for the output buffers
int32_t n_outputs = 0; // number of actually-used outputs in the current ubatch or last logical batch
bool logits_all = false;
// embeddings output (2-dimensional array: [n_outputs][n_embd])
// populated only when pooling_type == LLAMA_POOLING_TYPE_NONE
size_t embd_size = 0; // capacity (of floats) for embeddings
float * embd = nullptr;
// sequence embeddings output (map of [n_embd] vectors)
// populated only when pooling_type != LLAMA_POOLING_TYPE_NONE
std::map<llama_seq_id, std::vector<float>> embd_seq;
// whether we are computing encoder output or decoder output
bool is_encoding = false;
// output of the encoder part of the encoder-decoder models
std::vector<float> embd_enc;
std::vector<std::set<llama_seq_id>> seq_ids_enc;
// memory buffers used to evaluate the model
std::vector<uint8_t> buf_compute_meta;
ggml_backend_sched_t sched = nullptr;
ggml_abort_callback abort_callback = nullptr;
void * abort_callback_data = nullptr;
// input tensors
struct ggml_tensor * inp_tokens; // I32 [n_batch]
struct ggml_tensor * inp_embd; // F32 [n_embd, n_batch]
struct ggml_tensor * inp_pos; // I32 [n_batch]
struct ggml_tensor * inp_out_ids; // I32 [n_outputs]
struct ggml_tensor * inp_KQ_mask; // F32 [kv_size, n_batch]
struct ggml_tensor * inp_KQ_mask_swa; // F32 [kv_size, n_batch]
struct ggml_tensor * inp_K_shift; // I32 [kv_size]
struct ggml_tensor * inp_mean; // F32 [n_batch, n_batch]
struct ggml_tensor * inp_cls; // I32 [n_batch]
struct ggml_tensor * inp_s_copy; // I32 [kv_size]
struct ggml_tensor * inp_s_mask; // F32 [1, n_kv]
struct ggml_tensor * inp_s_seq; // I32 [n_kv, n_batch]
struct ggml_tensor * inp_pos_bucket; // I32 [n_batch|n_kv, n_batch]
struct ggml_tensor * inp_embd_enc; // F32 [n_embd, n_outputs_enc]
struct ggml_tensor * inp_KQ_mask_cross; // F32 [n_outputs_enc, n_batch]
struct ggml_tensor * inp_scale = nullptr; // F32 [n_tokens]
};

42
src/llama-cparams.h Normal file
View File

@@ -0,0 +1,42 @@
#pragma once
#include "llama-impl.h"
#include <cstdint>
// Per-context (inference-time) parameters. NOTE(review): presumably
// populated from llama_context_params when the context is created —
// verify against llama_new_context_with_model.
struct llama_cparams {
uint32_t n_ctx; // context size used during inference
uint32_t n_batch;
uint32_t n_ubatch;
uint32_t n_seq_max;
uint32_t n_threads; // number of threads to use for generation
uint32_t n_threads_batch; // number of threads to use for batch processing
// RoPE frequency scaling
float rope_freq_base;
float rope_freq_scale;
uint32_t n_ctx_orig_yarn;
// These hyperparameters are not exposed in GGUF, because all
// existing YaRN models use the same values for them.
float yarn_ext_factor;
float yarn_attn_factor;
float yarn_beta_fast;
float yarn_beta_slow;
float defrag_thold;
// feature flags
bool embeddings;
bool causal_attn;
bool offload_kqv;
bool flash_attn;
int mla_attn;
int attn_max_batch;
bool fused_moe_up_gate;
bool fused_up_gate;
// expert-selection thresholds for MoE models
int min_experts;
float thresh_experts;
enum llama_pooling_type pooling_type;
// per-graph evaluation callback and its user data
ggml_backend_sched_eval_callback cb_eval;
void * cb_eval_user_data;
};

View File

@@ -230,7 +230,6 @@ struct llama_layer {
struct llama_lora_adapter;
struct llama_model {
e_model type = MODEL_UNKNOWN;
llm_arch arch = LLM_ARCH_UNKNOWN;
@@ -301,4 +300,50 @@ struct llama_model {
std::set<llama_lora_adapter *> lora_adapters;
~llama_model();
// Not actually needed, but left in place for now
size_t max_nodes() const { return 65536; }
};
// Low-rank factor pair (a, b) of one LoRA-adapted model tensor.
struct llama_lora_weight {
struct ggml_tensor * a = nullptr;
struct ggml_tensor * b = nullptr;
llama_lora_weight() = default;
llama_lora_weight(struct ggml_tensor * a, struct ggml_tensor * b): a(a), b(b) {}
};
// A loaded LoRA adapter: maps base-model tensor names to their low-rank
// A/B factors. Registers itself with the base model on construction and
// un-registers in the destructor; owns its ggml contexts and backend
// buffers, so copy operations are deleted (a copy would double-free
// ctxs/bufs and corrupt base_model->lora_adapters bookkeeping).
struct llama_lora_adapter {
    llama_model * base_model;
    // map tensor name to lora_a_b
    std::unordered_map<std::string, struct llama_lora_weight> ab_map;
    std::vector<struct ggml_context *> ctxs;
    std::vector<ggml_backend_buffer_t> bufs;

    float alpha = 0.0f; // was uninitialized; zero until set by the loader

    llama_lora_adapter(struct llama_model * base_model): base_model(base_model) {
        base_model->lora_adapters.insert(this);
    }

    // non-copyable: owns ctxs/bufs and is tracked by address in
    // base_model->lora_adapters
    llama_lora_adapter(const llama_lora_adapter &) = delete;
    llama_lora_adapter & operator=(const llama_lora_adapter &) = delete;

    // returns the LoRA weights for tensor w, or nullptr when this
    // adapter does not modify it (single map lookup)
    llama_lora_weight * get_weight(struct ggml_tensor * w) {
        auto pos = ab_map.find(std::string(w->name));
        return pos != ab_map.end() ? &pos->second : nullptr;
    }

    ~llama_lora_adapter() {
        for (struct ggml_context * ctx : ctxs) {
            ggml_free(ctx);
        }
        for (ggml_backend_buffer_t buf : bufs) {
            ggml_backend_buffer_free(buf);
        }
        // un-register from the base model
        auto pos = base_model->lora_adapters.find(this);
        if (pos != base_model->lora_adapters.end()) {
            base_model->lora_adapters.erase(pos);
        }
    }
};

File diff suppressed because it is too large. (Load Diff)