llama_build_context

Surprisingly small reduction in llama.cpp compile time given
the reduction in LOCs (22k -> 14k)
This commit is contained in:
Iwan Kawrakow
2025-10-10 08:33:28 +03:00
parent 0582186c66
commit 37bf216d21
7 changed files with 8832 additions and 8328 deletions

View File

@@ -19,6 +19,8 @@ add_library(llama
llama-sampling.cpp
llama-mmap.cpp
llama-model-loader.cpp
llama-build-context.h
llama-build-context.cpp
unicode.h
unicode.cpp
unicode-data.cpp

8179
src/llama-build-context.cpp Normal file

File diff suppressed because it is too large. (Load Diff)

357
src/llama-build-context.h Normal file
View File

@@ -0,0 +1,357 @@
#pragma once

#include "llama-impl.h"
#include "llama-hparams.h"

#include <cstdint>
#include <functional>
#include <tuple>
#include <vector>
struct llama_model;
struct llama_context;
struct llama_cparams;
struct llama_batch;
struct llama_kv_cache;
struct ggml_cgraph;
struct ggml_tensor;
// Callback invoked for every tensor added to a build graph: receives the
// tensor, the name to assign to it, and a layer index nl.
// NOTE(review): nl is presumably negative for tensors not tied to a
// specific layer — confirm against the callers.
using llm_build_cb = std::function<void(struct ggml_tensor * cur, const char * name, int nl)>;
// Activation function applied inside the feed-forward network.
enum llm_ffn_op_type {
LLM_FFN_SILU,
LLM_FFN_GELU,
LLM_FFN_RELU,
LLM_FFN_RELU_SQR,
LLM_FFN_SWIGLU,
LLM_FFN_SWIGLU_OAI_MOE,
};
// How the FFN gate branch relates to the up projection.
enum llm_ffn_gate_type {
LLM_FFN_SEQ,
LLM_FFN_PAR, // ffn_gate is parallel to ffn_up
};
// Normalization flavor used by llm_build_norm.
enum llm_norm_type {
LLM_NORM,
LLM_NORM_RMS,
};
// Graph-building context: one instance is constructed per compute-graph
// build. It captures references to the model, the llama_context and the
// batch being processed, copies frequently used hyper-/context-parameters
// into local const fields, and exposes one build_<arch>() method per
// supported model architecture plus shared static helper builders.
struct llm_build_context {
const llama_model & model;
llama_context & lctx;
const llama_hparams & hparams;
const llama_cparams & cparams;
const llama_batch & batch;
const llama_kv_cache & kv_self;
// model dimensions, copied out of hparams for convenience
const int64_t n_embd;
const int64_t n_layer;
const int64_t n_rot;
const int64_t n_ctx; // user-specified context size (can be different from n_ctx_train)
const int64_t n_head;
const int64_t n_head_kv;
const int64_t n_embd_head_k;
const int64_t n_embd_k_gqa;
const int64_t n_embd_head_v;
const int64_t n_embd_v_gqa;
const int64_t n_expert;
const int64_t n_expert_used;
// RoPE / YaRN parameters
const float freq_base;
const float freq_scale;
const float ext_factor;
const float attn_factor;
const float beta_fast;
const float beta_slow;
// normalization epsilons
const float norm_eps;
const float norm_rms_eps;
// sizes of the current build
const int32_t n_tokens;
const int32_t n_kv; // size of KV cache to consider (n_kv <= kv_self.size)
const int32_t n_outputs;
const int32_t n_outputs_enc;
const int32_t kv_head; // index of where we store new KV data in the cache
const int32_t n_ctx_orig;
// attention / MoE options, copied out of cparams
const bool flash_attn;
const int mla_attn;
const int attn_max_batch;
const bool fused_moe_up_gate;
const bool fused_up_gate;
const int min_experts;
const float thresh_experts;
const enum llama_pooling_type pooling_type;
const enum llama_rope_type rope_type;
const llm_build_cb & cb; // invoked for every tensor added to the graph (see llm_build_cb)
std::vector<uint8_t> & buf_compute_meta; // NOTE(review): presumably aliases llama_context::buf_compute_meta — verify
struct ggml_context * ctx0 = nullptr; // graph context; valid between init() and free()
// TODO: consider making the entire interface noexcept
llm_build_context(
llama_context & lctx,
const llama_batch & batch,
const llm_build_cb & cb,
bool worst_case,
bool warmup);
// create / destroy ctx0 (must bracket every graph build)
void init();
void free();
// KV-cache maintenance graphs
ggml_cgraph * build_k_shift();
ggml_cgraph * build_s_copy();
ggml_cgraph * build_defrag(const std::vector<uint32_t> & ids);
// builders for the graph input tensors
ggml_tensor * build_inp_pos();
ggml_tensor * build_input_scale(int n_tokens);
ggml_tensor * build_rope_factors(int il);
ggml_tensor * build_inp_out_ids();
ggml_tensor * build_inp_KQ_mask(bool causal = true);
ggml_tensor * build_inp_KQ_mask_swa(bool causal = true);
ggml_tensor * build_inp_mean();
ggml_tensor * build_inp_cls();
ggml_tensor * build_inp_s_copy();
ggml_tensor * build_inp_s_mask();
ggml_tensor * build_inp_s_seq();
// appends pooling (per pooling_type) to an existing graph
ggml_cgraph * append_pooling(struct ggml_cgraph * gf);
// encoder/decoder helpers (relative position bias, cross attention)
ggml_tensor * llm_build_pos_bucket(bool causal);
ggml_tensor * llm_build_pos_bias(struct ggml_tensor * pos_bucket, struct ggml_tensor * attn_rel_b);
ggml_tensor * llm_build_inp_embd_enc();
ggml_tensor * llm_build_inp_KQ_mask_cross();
// computes Q, K and V projections (with optional biases) of cur
std::tuple<ggml_tensor*, ggml_tensor*, ggml_tensor*> llm_build_mul_mat_qkv(ggml_cgraph * gf, ggml_tensor * cur,
ggml_tensor * wq, ggml_tensor * bq,
ggml_tensor * wk, ggml_tensor * bk,
ggml_tensor * wv, ggml_tensor * bv,
float attention_scale, int il);
// per-architecture forward-graph builders
ggml_cgraph * build_llama();
ggml_cgraph * build_deci();
ggml_cgraph * build_baichuan();
ggml_cgraph * build_xverse();
ggml_cgraph * build_falcon();
ggml_cgraph * build_grok();
ggml_cgraph * build_dbrx();
ggml_cgraph * build_starcoder();
ggml_cgraph * build_refact();
ggml_cgraph * build_bert();
ggml_cgraph * build_bloom();
ggml_cgraph * build_mpt();
ggml_cgraph * build_stablelm();
ggml_cgraph * build_qwen();
ggml_cgraph * build_qwen2();
ggml_cgraph * build_qwen2vl();
ggml_cgraph * build_qwen2moe();
ggml_cgraph * build_qwen3();
ggml_cgraph * build_qwen3moe();
ggml_cgraph * build_phi2();
ggml_cgraph * build_phi3();
ggml_cgraph * build_plamo();
ggml_cgraph * build_gpt2();
ggml_cgraph * build_codeshell();
ggml_cgraph * build_orion();
ggml_cgraph * build_internlm2();
ggml_cgraph * build_minicpm();
ggml_cgraph * build_gemma();
ggml_cgraph * build_gemma2();
ggml_cgraph * build_gemma3();
ggml_cgraph * build_starcoder2();
ggml_cgraph * build_mamba();
ggml_cgraph * build_command_r();
ggml_cgraph * build_olmo();
ggml_cgraph * build_openelm();
ggml_cgraph * build_gptneox();
ggml_cgraph * build_arctic();
ggml_cgraph * build_deepseek2();
ggml_cgraph * build_glm4_moe();
ggml_cgraph * build_bitnet();
ggml_cgraph * build_bitnet_158();
ggml_cgraph * build_cohere2();
ggml_cgraph * build_t5_encoder();
ggml_cgraph * build_t5_decoder();
ggml_cgraph * build_jais();
ggml_cgraph * build_chatglm();
ggml_cgraph * build_glm4();
ggml_cgraph * build_dots1();
ggml_cgraph * build_ernie4_5();
ggml_cgraph * build_ernie4_5_moe();
ggml_cgraph * build_hunyuan_moe();
ggml_cgraph * build_openai_moe();
// shared static graph-building helpers
static ggml_tensor * llm_build_lora_mm(llama_context & lctx, ggml_context * ctx0,
ggml_tensor * w, ggml_tensor * cur);
static ggml_tensor * llm_build_lora_mm_id(llama_context & lctx, ggml_context * ctx0,
ggml_tensor * w, ggml_tensor * cur, ggml_tensor * ids);
static ggml_tensor * llm_build_inp_embd(ggml_context * ctx, llama_context & lctx,
const llama_hparams & hparams,
const llama_batch & batch,
struct ggml_tensor * tok_embd,
const llm_build_cb & cb);
static ggml_tensor * llm_build_norm(ggml_context * ctx, ggml_tensor * cur,
const llama_hparams & hparams,
ggml_tensor * mw,
ggml_tensor * mb,
llm_norm_type type,
const llm_build_cb & cb, int il, float scale_eps = 1);
static void llm_build_kv_store(ggml_context * ctx, const llama_hparams & hparams,
const llama_cparams & cparams,
const llama_kv_cache & kv,
ggml_cgraph * graph,
ggml_tensor * k_cur,
ggml_tensor * v_cur,
int32_t n_tokens,
int32_t kv_head,
const llm_build_cb & cb, int64_t il);
static struct ggml_tensor * llm_build_kv(ggml_context * ctx, llama_context & lctx,
const llama_kv_cache & kv,
ggml_cgraph * graph,
ggml_tensor * wo,
ggml_tensor * wo_b,
ggml_tensor * k_cur,
ggml_tensor * v_cur,
ggml_tensor * q_cur,
ggml_tensor * kq_mask,
int32_t n_tokens,
int32_t kv_head,
int32_t n_kv,
float kq_scale,
const llm_build_cb & cb, int il, ggml_tensor * sinks = nullptr, int n_swa = 0);
static ggml_tensor * llm_build_ffn(ggml_context * ctx, llama_context & lctx,
ggml_tensor * cur,
ggml_tensor * up,
ggml_tensor * up_b,
ggml_tensor * up_s,
ggml_tensor * gate,
ggml_tensor * gate_b,
ggml_tensor * gate_s,
ggml_tensor * down,
ggml_tensor * down_b,
ggml_tensor * down_s,
ggml_tensor * act_scales,
llm_ffn_op_type type_op,
llm_ffn_gate_type type_gate,
const llm_build_cb & cb, int il);
static ggml_tensor * llm_build_moe_ffn(ggml_context * ctx, llama_context & lctx,
ggml_tensor * cur,
ggml_tensor * gate_inp, ggml_tensor * gate_inp_b,
ggml_tensor * up_exps, ggml_tensor * up_exps_b,
ggml_tensor * gate_exps, ggml_tensor * gate_exps_b,
ggml_tensor * down_exps, ggml_tensor * down_exps_b,
ggml_tensor * exp_probs_b,
int64_t n_expert,
int64_t n_expert_used,
llm_ffn_op_type type_op,
bool norm_w,
bool scale_w,
float w_scale,
llm_expert_gating_func_type gating_op,
const llm_build_cb & cb, int il, ggml_cgraph * graph = nullptr);
// convenience overload: identical to the full version above with all
// per-expert bias tensors passed as nullptr
static ggml_tensor * llm_build_moe_ffn(ggml_context * ctx, llama_context & lctx,
ggml_tensor * cur,
ggml_tensor * gate_inp,
ggml_tensor * up_exps,
ggml_tensor * gate_exps,
ggml_tensor * down_exps,
ggml_tensor * exp_probs_b,
int64_t n_expert,
int64_t n_expert_used,
llm_ffn_op_type type_op,
bool norm_w,
bool scale_w,
float w_scale,
llm_expert_gating_func_type gating_op,
const llm_build_cb & cb, int il, ggml_cgraph * graph = nullptr) {
return llm_build_moe_ffn(ctx, lctx, cur,
gate_inp, nullptr,
up_exps, nullptr,
gate_exps, nullptr,
down_exps, nullptr,
exp_probs_b,
n_expert, n_expert_used,
type_op, norm_w, scale_w, w_scale,
gating_op, cb, il, graph);
}
};

205
src/llama-context.h Normal file
View File

@@ -0,0 +1,205 @@
#pragma once

#include "llama-impl.h"
#include "llama-cparams.h"
#include "llama-sampling.h"

#include <map>
#include <set>
#include <unordered_map>
#include <vector>

struct llama_model;
struct llama_kv_cell {
llama_pos pos = -1;
llama_pos delta = 0;
int32_t src = 0; // used by recurrent state models to copy states
std::set<llama_seq_id> seq_id;
bool has_seq_id(const llama_seq_id & id) const {
return seq_id.find(id) != seq_id.end();
}
bool is_empty() const {
return seq_id.empty();
}
bool is_same_seq(const llama_kv_cell & other) const {
return seq_id == other.seq_id;
}
};
// ring-buffer of cached KV data
//
// Owns the per-layer K/V tensors together with the ggml contexts and
// backend buffers backing them. The destructor frees ctxs/bufs, so the
// type must not be copied: the implicitly generated copy operations
// would lead to a double free (Rule of Five) — they are deleted below.
struct llama_kv_cache {
    bool has_shift = false;
    bool do_defrag = false;
    bool do_copy   = false;
    bool recurrent = false; // with recurrent state models, a cell can hold the state for more than one past token
    bool v_trans   = true;  // the value tensor is transposed

    // Note: The value of head isn't only used to optimize searching
    // for a free KV slot. llama_decode_internal also uses it, so it
    // cannot be freely changed after a slot has been allocated.
    uint32_t head = 0;
    uint32_t size = 0;
    uint32_t used = 0; // used cells (i.e. at least one seq_id)

    // computed before each graph build
    uint32_t n = 0;

    ggml_type type_k = GGML_TYPE_F16;
    ggml_type type_v = GGML_TYPE_F16;

    std::vector<llama_kv_cell> cells;

    std::vector<struct ggml_tensor *> k_l; // per layer
    std::vector<struct ggml_tensor *> v_l;

    std::vector<struct ggml_context *> ctxs;
    std::vector<ggml_backend_buffer_t> bufs;

    llama_kv_cache() = default;

    // non-copyable: copying would double-free ctxs/bufs in the destructor
    llama_kv_cache(const llama_kv_cache &) = delete;
    llama_kv_cache & operator=(const llama_kv_cache &) = delete;

    // total size in bytes of all backend buffers holding the cache
    size_t total_size() const {
        size_t size = 0;
        for (ggml_backend_buffer_t buf : bufs) {
            size += ggml_backend_buffer_get_size(buf);
        }
        return size;
    }

    ~llama_kv_cache() {
        for (struct ggml_context * ctx : ctxs) {
            ggml_free(ctx);
        }
        for (ggml_backend_buffer_t buf : bufs) {
            ggml_backend_buffer_free(buf);
        }
    }
};
// A control vector: one direction tensor per layer, added (via
// apply_to) to the hidden state of layers in [layer_start, layer_end].
//
// Owns its ggml contexts and backend buffers; the destructor frees
// them, so copy operations are deleted to prevent a double free
// (Rule of Five).
struct llama_control_vector {
    std::vector<struct ggml_tensor *> tensors; // per layer
    std::vector<struct ggml_context *> ctxs;
    std::vector<ggml_backend_buffer_t> bufs;

    int32_t layer_start = -1;
    int32_t layer_end   = -1;

    llama_control_vector() = default;

    // non-copyable: copying would double-free ctxs/bufs in the destructor
    llama_control_vector(const llama_control_vector &) = delete;
    llama_control_vector & operator=(const llama_control_vector &) = delete;

    // returns the tensor for layer il, or nullptr when il is outside
    // [layer_start, layer_end] or no tensor was loaded for it
    struct ggml_tensor * tensor_for(int il) const {
        if (il < 0 || il < layer_start || il > layer_end || (size_t) il >= tensors.size()) {
            return nullptr;
        }
        return tensors[il];
    }

    // adds the layer's control vector to cur; no-op when there is none
    struct ggml_tensor * apply_to(struct ggml_context * ctx, struct ggml_tensor * cur, int il) const {
        ggml_tensor * layer_dir = tensor_for(il);
        if (layer_dir != nullptr) {
            cur = ggml_add(ctx, cur, layer_dir);
        }
        return cur;
    }

    ~llama_control_vector() {
        for (struct ggml_context * ctx : ctxs) {
            ggml_free(ctx);
        }
        for (ggml_backend_buffer_t buf : bufs) {
            ggml_backend_buffer_free(buf);
        }
    }
};
// All mutable state of one inference context over a (shared, externally
// owned) llama_model: KV cache, sampling state, control vector, output
// buffers, backends/scheduler, and the graph input tensors.
struct llama_context {
llama_context(const llama_model & model);
~llama_context();
const struct llama_model & model; // reference only — the model is owned by the caller
struct llama_cparams cparams;
struct llama_sampling sampling;
struct llama_kv_cache kv_self;
struct llama_control_vector cvec;
std::vector<float> scale_data; // NOTE(review): presumably backing data for inp_scale — verify
// active LoRA adapters; NOTE(review): the float is presumably the per-adapter scale — verify
std::unordered_map<struct llama_lora_adapter *, float> lora_adapters;
std::vector<ggml_backend_t> backends;
#ifdef GGML_USE_METAL
ggml_backend_t backend_metal = nullptr;
#endif
#ifdef GGML_USE_BLAS
ggml_backend_t backend_blas = nullptr;
#endif
ggml_backend_t backend_cpu = nullptr;
bool has_evaluated_once = false;
// timing statistics (microseconds / counters)
int64_t t_start_us;
int64_t t_load_us;
int64_t t_p_eval_us = 0;
int64_t t_eval_us = 0;
int64_t t_compute_start_us = 0;
int64_t n_queued_tokens = 0;
int32_t n_p_eval = 0; // number of tokens in eval calls for the prompt (with batch size > 1)
int32_t n_eval = 0; // number of eval calls
// host buffer for the model output (logits and embeddings)
ggml_backend_buffer_t buf_output = nullptr;
// decode output (2-dimensional array: [n_outputs][n_vocab])
size_t logits_size = 0; // capacity (of floats) for logits
float * logits = nullptr; // points into buf_output; not separately owned
std::vector<int32_t> output_ids; // map batch token positions to ids of the logits and embd buffers
size_t output_size = 0; // capacity (of tokens positions) for the output buffers
int32_t n_outputs = 0; // number of actually-used outputs in the current ubatch or last logical batch
bool logits_all = false;
// embeddings output (2-dimensional array: [n_outputs][n_embd])
// populated only when pooling_type == LLAMA_POOLING_TYPE_NONE
size_t embd_size = 0; // capacity (of floats) for embeddings
float * embd = nullptr;
// sequence embeddings output (map of [n_embd] vectors)
// populated only when pooling_type != LLAMA_POOLING_TYPE_NONE
std::map<llama_seq_id, std::vector<float>> embd_seq;
// whether we are computing encoder output or decoder output
bool is_encoding = false;
// output of the encoder part of the encoder-decoder models
std::vector<float> embd_enc;
std::vector<std::set<llama_seq_id>> seq_ids_enc;
// memory buffers used to evaluate the model
std::vector<uint8_t> buf_compute_meta;
ggml_backend_sched_t sched = nullptr;
ggml_abort_callback abort_callback = nullptr;
void * abort_callback_data = nullptr;
// input tensors
struct ggml_tensor * inp_tokens; // I32 [n_batch]
struct ggml_tensor * inp_embd; // F32 [n_embd, n_batch]
struct ggml_tensor * inp_pos; // I32 [n_batch]
struct ggml_tensor * inp_out_ids; // I32 [n_outputs]
struct ggml_tensor * inp_KQ_mask; // F32 [kv_size, n_batch]
struct ggml_tensor * inp_KQ_mask_swa; // F32 [kv_size, n_batch]
struct ggml_tensor * inp_K_shift; // I32 [kv_size]
struct ggml_tensor * inp_mean; // F32 [n_batch, n_batch]
struct ggml_tensor * inp_cls; // I32 [n_batch]
struct ggml_tensor * inp_s_copy; // I32 [kv_size]
struct ggml_tensor * inp_s_mask; // F32 [1, n_kv]
struct ggml_tensor * inp_s_seq; // I32 [n_kv, n_batch]
struct ggml_tensor * inp_pos_bucket; // I32 [n_batch|n_kv, n_batch]
struct ggml_tensor * inp_embd_enc; // F32 [n_embd, n_outputs_enc]
struct ggml_tensor * inp_KQ_mask_cross; // F32 [n_outputs_enc, n_batch]
struct ggml_tensor * inp_scale = nullptr; // F32 [n_tokens]
};

42
src/llama-cparams.h Normal file
View File

@@ -0,0 +1,42 @@
#pragma once
#include "llama-impl.h"
#include <cstdint>
// Per-context (inference-time) parameters. NOTE(review): presumably
// populated from llama_context_params when the context is created —
// verify against llama_new_context_with_model.
struct llama_cparams {
uint32_t n_ctx; // context size used during inference
uint32_t n_batch;
uint32_t n_ubatch;
uint32_t n_seq_max;
uint32_t n_threads; // number of threads to use for generation
uint32_t n_threads_batch; // number of threads to use for batch processing
// RoPE frequency scaling
float rope_freq_base;
float rope_freq_scale;
uint32_t n_ctx_orig_yarn;
// These hyperparameters are not exposed in GGUF, because all
// existing YaRN models use the same values for them.
float yarn_ext_factor;
float yarn_attn_factor;
float yarn_beta_fast;
float yarn_beta_slow;
float defrag_thold;
// feature flags
bool embeddings;
bool causal_attn;
bool offload_kqv;
bool flash_attn;
int mla_attn;
int attn_max_batch;
bool fused_moe_up_gate;
bool fused_up_gate;
// expert-selection thresholds for MoE models
int min_experts;
float thresh_experts;
enum llama_pooling_type pooling_type;
// per-graph evaluation callback and its user data
ggml_backend_sched_eval_callback cb_eval;
void * cb_eval_user_data;
};

View File

@@ -230,7 +230,6 @@ struct llama_layer {
struct llama_lora_adapter;
struct llama_model {
e_model type = MODEL_UNKNOWN;
llm_arch arch = LLM_ARCH_UNKNOWN;
@@ -301,4 +300,50 @@ struct llama_model {
std::set<llama_lora_adapter *> lora_adapters;
~llama_model();
// Not actually needed, but left in place for now
size_t max_nodes() const { return 65536; }
};
// Low-rank factor pair (a, b) of one LoRA-adapted model tensor.
struct llama_lora_weight {
struct ggml_tensor * a = nullptr;
struct ggml_tensor * b = nullptr;
llama_lora_weight() = default;
llama_lora_weight(struct ggml_tensor * a, struct ggml_tensor * b): a(a), b(b) {}
};
// A loaded LoRA adapter: maps base-model tensor names to their low-rank
// A/B factors. Registers itself with the base model on construction and
// un-registers in the destructor; owns its ggml contexts and backend
// buffers, so copy operations are deleted (a copy would double-free
// ctxs/bufs and corrupt base_model->lora_adapters bookkeeping).
struct llama_lora_adapter {
    llama_model * base_model;
    // map tensor name to lora_a_b
    std::unordered_map<std::string, struct llama_lora_weight> ab_map;
    std::vector<struct ggml_context *> ctxs;
    std::vector<ggml_backend_buffer_t> bufs;

    float alpha = 0.0f; // was uninitialized; zero until set by the loader

    llama_lora_adapter(struct llama_model * base_model): base_model(base_model) {
        base_model->lora_adapters.insert(this);
    }

    // non-copyable: owns ctxs/bufs and is tracked by address in
    // base_model->lora_adapters
    llama_lora_adapter(const llama_lora_adapter &) = delete;
    llama_lora_adapter & operator=(const llama_lora_adapter &) = delete;

    // returns the LoRA weights for tensor w, or nullptr when this
    // adapter does not modify it (single map lookup)
    llama_lora_weight * get_weight(struct ggml_tensor * w) {
        auto pos = ab_map.find(std::string(w->name));
        return pos != ab_map.end() ? &pos->second : nullptr;
    }

    ~llama_lora_adapter() {
        for (struct ggml_context * ctx : ctxs) {
            ggml_free(ctx);
        }
        for (ggml_backend_buffer_t buf : bufs) {
            ggml_backend_buffer_free(buf);
        }
        // un-register from the base model
        auto pos = base_model->lora_adapters.find(this);
        if (pos != base_model->lora_adapters.end()) {
            base_model->lora_adapters.erase(pos);
        }
    }
};

File diff suppressed because it is too large. (Load Diff)