mirror of
https://github.com/ikawrakow/ik_llama.cpp.git
synced 2026-04-20 22:49:31 +00:00
llama_build_context
Surprisingly small reduction in llama.cpp compile time given the reduction in LOCs (22k -> 14k)
This commit is contained in:
@@ -19,6 +19,8 @@ add_library(llama
|
||||
llama-sampling.cpp
|
||||
llama-mmap.cpp
|
||||
llama-model-loader.cpp
|
||||
llama-build-context.h
|
||||
llama-build-context.cpp
|
||||
unicode.h
|
||||
unicode.cpp
|
||||
unicode-data.cpp
|
||||
|
||||
8179
src/llama-build-context.cpp
Normal file
8179
src/llama-build-context.cpp
Normal file
File diff suppressed because it is too large
Load Diff
357
src/llama-build-context.h
Normal file
357
src/llama-build-context.h
Normal file
@@ -0,0 +1,357 @@
|
||||
#pragma once
|
||||
|
||||
#include "llama-impl.h"
|
||||
#include "llama-hparams.h"
|
||||
|
||||
#include <cstdint>
|
||||
#include <functional>
|
||||
#include <tuple>
|
||||
|
||||
struct llama_model;
|
||||
struct llama_context;
|
||||
struct llama_cparams;
|
||||
struct llama_batch;
|
||||
struct llama_kv_cache;
|
||||
|
||||
struct ggml_cgraph;
|
||||
struct ggml_tensor;
|
||||
|
||||
// Callback type receiving a tensor, a name string and an int `nl`; it is the type of the `cb` argument taken by the llm_build_context constructor and by the static llm_build_* helpers.
using llm_build_cb = std::function<void(struct ggml_tensor * cur, const char * name, int nl)>;
|
||||
|
||||
// Operation selector passed as `type_op` to llm_build_ffn / llm_build_moe_ffn.
// Enumerators are implicitly numbered from 0 in declaration order.
enum llm_ffn_op_type {
    LLM_FFN_SILU,
    LLM_FFN_GELU,
    LLM_FFN_RELU,
    LLM_FFN_RELU_SQR,
    LLM_FFN_SWIGLU,
    LLM_FFN_SWIGLU_OAI_MOE,
};
|
||||
|
||||
// Gate arrangement selector passed as `type_gate` to llm_build_ffn.
enum llm_ffn_gate_type {
    LLM_FFN_SEQ,
    LLM_FFN_PAR, // ffn_gate is parallel to ffn_up
};
|
||||
|
||||
// Normalization selector passed as `type` to llm_build_norm.
enum llm_norm_type {
    LLM_NORM,
    LLM_NORM_RMS,
};
|
||||
|
||||
struct llm_build_context {
|
||||
const llama_model & model;
|
||||
llama_context & lctx;
|
||||
const llama_hparams & hparams;
|
||||
const llama_cparams & cparams;
|
||||
const llama_batch & batch;
|
||||
const llama_kv_cache & kv_self;
|
||||
|
||||
const int64_t n_embd;
|
||||
const int64_t n_layer;
|
||||
const int64_t n_rot;
|
||||
const int64_t n_ctx; // user-specified context size (can be different from n_ctx_train)
|
||||
const int64_t n_head;
|
||||
const int64_t n_head_kv;
|
||||
const int64_t n_embd_head_k;
|
||||
const int64_t n_embd_k_gqa;
|
||||
const int64_t n_embd_head_v;
|
||||
const int64_t n_embd_v_gqa;
|
||||
const int64_t n_expert;
|
||||
const int64_t n_expert_used;
|
||||
|
||||
const float freq_base;
|
||||
const float freq_scale;
|
||||
const float ext_factor;
|
||||
const float attn_factor;
|
||||
const float beta_fast;
|
||||
const float beta_slow;
|
||||
const float norm_eps;
|
||||
const float norm_rms_eps;
|
||||
|
||||
const int32_t n_tokens;
|
||||
const int32_t n_kv; // size of KV cache to consider (n_kv <= kv_self.size)
|
||||
const int32_t n_outputs;
|
||||
const int32_t n_outputs_enc;
|
||||
const int32_t kv_head; // index of where we store new KV data in the cache
|
||||
const int32_t n_ctx_orig;
|
||||
|
||||
const bool flash_attn;
|
||||
const int mla_attn;
|
||||
const int attn_max_batch;
|
||||
const bool fused_moe_up_gate;
|
||||
const bool fused_up_gate;
|
||||
const int min_experts;
|
||||
const float thresh_experts;
|
||||
|
||||
const enum llama_pooling_type pooling_type;
|
||||
const enum llama_rope_type rope_type;
|
||||
|
||||
const llm_build_cb & cb;
|
||||
|
||||
std::vector<uint8_t> & buf_compute_meta;
|
||||
|
||||
struct ggml_context * ctx0 = nullptr;
|
||||
|
||||
// TODO: consider making the entire interface noexcept
|
||||
llm_build_context(
|
||||
llama_context & lctx,
|
||||
const llama_batch & batch,
|
||||
const llm_build_cb & cb,
|
||||
bool worst_case,
|
||||
bool warmup);
|
||||
|
||||
void init();
|
||||
|
||||
void free();
|
||||
|
||||
ggml_cgraph * build_k_shift();
|
||||
|
||||
ggml_cgraph * build_s_copy();
|
||||
|
||||
ggml_cgraph * build_defrag(const std::vector<uint32_t> & ids);
|
||||
|
||||
ggml_tensor * build_inp_pos();
|
||||
|
||||
ggml_tensor * build_input_scale(int n_tokens);
|
||||
|
||||
ggml_tensor * build_rope_factors(int il);
|
||||
|
||||
ggml_tensor * build_inp_out_ids();
|
||||
|
||||
ggml_tensor * build_inp_KQ_mask(bool causal = true);
|
||||
|
||||
ggml_tensor * build_inp_KQ_mask_swa(bool causal = true);
|
||||
|
||||
ggml_tensor * build_inp_mean();
|
||||
|
||||
ggml_tensor * build_inp_cls();
|
||||
|
||||
ggml_tensor * build_inp_s_copy();
|
||||
|
||||
ggml_tensor * build_inp_s_mask();
|
||||
|
||||
ggml_tensor * build_inp_s_seq();
|
||||
|
||||
ggml_cgraph * append_pooling(struct ggml_cgraph * gf);
|
||||
|
||||
ggml_tensor * llm_build_pos_bucket(bool causal);
|
||||
|
||||
ggml_tensor * llm_build_pos_bias(struct ggml_tensor * pos_bucket, struct ggml_tensor * attn_rel_b);
|
||||
|
||||
ggml_tensor * llm_build_inp_embd_enc();
|
||||
|
||||
ggml_tensor * llm_build_inp_KQ_mask_cross();
|
||||
|
||||
std::tuple<ggml_tensor*, ggml_tensor*, ggml_tensor*> llm_build_mul_mat_qkv(ggml_cgraph * gf, ggml_tensor * cur,
|
||||
ggml_tensor * wq, ggml_tensor * bq,
|
||||
ggml_tensor * wk, ggml_tensor * bk,
|
||||
ggml_tensor * wv, ggml_tensor * bv,
|
||||
float attention_scale, int il);
|
||||
|
||||
ggml_cgraph * build_llama();
|
||||
|
||||
ggml_cgraph * build_deci();
|
||||
|
||||
ggml_cgraph * build_baichuan();
|
||||
|
||||
ggml_cgraph * build_xverse();
|
||||
|
||||
ggml_cgraph * build_falcon();
|
||||
|
||||
ggml_cgraph * build_grok();
|
||||
|
||||
ggml_cgraph * build_dbrx();
|
||||
|
||||
ggml_cgraph * build_starcoder();
|
||||
|
||||
ggml_cgraph * build_refact();
|
||||
|
||||
ggml_cgraph * build_bert();
|
||||
|
||||
ggml_cgraph * build_bloom();
|
||||
|
||||
ggml_cgraph * build_mpt();
|
||||
|
||||
ggml_cgraph * build_stablelm();
|
||||
|
||||
ggml_cgraph * build_qwen();
|
||||
|
||||
ggml_cgraph * build_qwen2();
|
||||
|
||||
ggml_cgraph * build_qwen2vl();
|
||||
|
||||
ggml_cgraph * build_qwen2moe();
|
||||
|
||||
ggml_cgraph * build_qwen3();
|
||||
|
||||
ggml_cgraph * build_qwen3moe();
|
||||
|
||||
ggml_cgraph * build_phi2();
|
||||
|
||||
ggml_cgraph * build_phi3();
|
||||
|
||||
ggml_cgraph * build_plamo();
|
||||
|
||||
ggml_cgraph * build_gpt2();
|
||||
|
||||
ggml_cgraph * build_codeshell();
|
||||
|
||||
ggml_cgraph * build_orion();
|
||||
|
||||
ggml_cgraph * build_internlm2();
|
||||
|
||||
ggml_cgraph * build_minicpm();
|
||||
|
||||
ggml_cgraph * build_gemma();
|
||||
|
||||
ggml_cgraph * build_gemma2();
|
||||
|
||||
ggml_cgraph * build_gemma3();
|
||||
|
||||
ggml_cgraph * build_starcoder2();
|
||||
|
||||
ggml_cgraph * build_mamba();
|
||||
|
||||
ggml_cgraph * build_command_r();
|
||||
|
||||
ggml_cgraph * build_olmo();
|
||||
|
||||
ggml_cgraph * build_openelm();
|
||||
|
||||
ggml_cgraph * build_gptneox();
|
||||
|
||||
ggml_cgraph * build_arctic();
|
||||
|
||||
ggml_cgraph * build_deepseek2();
|
||||
|
||||
ggml_cgraph * build_glm4_moe();
|
||||
|
||||
ggml_cgraph * build_bitnet();
|
||||
|
||||
ggml_cgraph * build_bitnet_158();
|
||||
|
||||
ggml_cgraph * build_cohere2();
|
||||
|
||||
ggml_cgraph * build_t5_encoder();
|
||||
|
||||
ggml_cgraph * build_t5_decoder();
|
||||
|
||||
ggml_cgraph * build_jais();
|
||||
|
||||
ggml_cgraph * build_chatglm();
|
||||
|
||||
ggml_cgraph * build_glm4();
|
||||
|
||||
ggml_cgraph * build_dots1();
|
||||
|
||||
ggml_cgraph * build_ernie4_5();
|
||||
|
||||
ggml_cgraph * build_ernie4_5_moe();
|
||||
|
||||
ggml_cgraph * build_hunyuan_moe();
|
||||
|
||||
ggml_cgraph * build_openai_moe();
|
||||
|
||||
//
|
||||
static ggml_tensor * llm_build_lora_mm(llama_context & lctx, ggml_context * ctx0,
|
||||
ggml_tensor * w, ggml_tensor * cur);
|
||||
|
||||
static ggml_tensor * llm_build_lora_mm_id(llama_context & lctx, ggml_context * ctx0,
|
||||
ggml_tensor * w, ggml_tensor * cur, ggml_tensor * ids);
|
||||
|
||||
static ggml_tensor * llm_build_inp_embd(ggml_context * ctx, llama_context & lctx,
|
||||
const llama_hparams & hparams,
|
||||
const llama_batch & batch,
|
||||
struct ggml_tensor * tok_embd,
|
||||
const llm_build_cb & cb);
|
||||
|
||||
static ggml_tensor * llm_build_norm(ggml_context * ctx, ggml_tensor * cur,
|
||||
const llama_hparams & hparams,
|
||||
ggml_tensor * mw,
|
||||
ggml_tensor * mb,
|
||||
llm_norm_type type,
|
||||
const llm_build_cb & cb, int il, float scale_eps = 1);
|
||||
|
||||
static void llm_build_kv_store(ggml_context * ctx, const llama_hparams & hparams,
|
||||
const llama_cparams & cparams,
|
||||
const llama_kv_cache & kv,
|
||||
ggml_cgraph * graph,
|
||||
ggml_tensor * k_cur,
|
||||
ggml_tensor * v_cur,
|
||||
int32_t n_tokens,
|
||||
int32_t kv_head,
|
||||
const llm_build_cb & cb, int64_t il);
|
||||
|
||||
static struct ggml_tensor * llm_build_kv(ggml_context * ctx, llama_context & lctx,
|
||||
const llama_kv_cache & kv,
|
||||
ggml_cgraph * graph,
|
||||
ggml_tensor * wo,
|
||||
ggml_tensor * wo_b,
|
||||
ggml_tensor * k_cur,
|
||||
ggml_tensor * v_cur,
|
||||
ggml_tensor * q_cur,
|
||||
ggml_tensor * kq_mask,
|
||||
int32_t n_tokens,
|
||||
int32_t kv_head,
|
||||
int32_t n_kv,
|
||||
float kq_scale,
|
||||
const llm_build_cb & cb, int il, ggml_tensor * sinks = nullptr, int n_swa = 0);
|
||||
|
||||
static ggml_tensor * llm_build_ffn(ggml_context * ctx, llama_context & lctx,
|
||||
ggml_tensor * cur,
|
||||
ggml_tensor * up,
|
||||
ggml_tensor * up_b,
|
||||
ggml_tensor * up_s,
|
||||
ggml_tensor * gate,
|
||||
ggml_tensor * gate_b,
|
||||
ggml_tensor * gate_s,
|
||||
ggml_tensor * down,
|
||||
ggml_tensor * down_b,
|
||||
ggml_tensor * down_s,
|
||||
ggml_tensor * act_scales,
|
||||
llm_ffn_op_type type_op,
|
||||
llm_ffn_gate_type type_gate,
|
||||
const llm_build_cb & cb, int il);
|
||||
|
||||
static ggml_tensor * llm_build_moe_ffn(ggml_context * ctx, llama_context & lctx,
|
||||
ggml_tensor * cur,
|
||||
ggml_tensor * gate_inp, ggml_tensor * gate_inp_b,
|
||||
ggml_tensor * up_exps, ggml_tensor * up_exps_b,
|
||||
ggml_tensor * gate_exps, ggml_tensor * gate_exps_b,
|
||||
ggml_tensor * down_exps, ggml_tensor * down_exps_b,
|
||||
ggml_tensor * exp_probs_b,
|
||||
int64_t n_expert,
|
||||
int64_t n_expert_used,
|
||||
llm_ffn_op_type type_op,
|
||||
bool norm_w,
|
||||
bool scale_w,
|
||||
float w_scale,
|
||||
llm_expert_gating_func_type gating_op,
|
||||
const llm_build_cb & cb, int il, ggml_cgraph * graph = nullptr);
|
||||
|
||||
static ggml_tensor * llm_build_moe_ffn(ggml_context * ctx, llama_context & lctx,
|
||||
ggml_tensor * cur,
|
||||
ggml_tensor * gate_inp,
|
||||
ggml_tensor * up_exps,
|
||||
ggml_tensor * gate_exps,
|
||||
ggml_tensor * down_exps,
|
||||
ggml_tensor * exp_probs_b,
|
||||
int64_t n_expert,
|
||||
int64_t n_expert_used,
|
||||
llm_ffn_op_type type_op,
|
||||
bool norm_w,
|
||||
bool scale_w,
|
||||
float w_scale,
|
||||
llm_expert_gating_func_type gating_op,
|
||||
const llm_build_cb & cb, int il, ggml_cgraph * graph = nullptr) {
|
||||
return llm_build_moe_ffn(ctx, lctx, cur,
|
||||
gate_inp, nullptr,
|
||||
up_exps, nullptr,
|
||||
gate_exps, nullptr,
|
||||
down_exps, nullptr,
|
||||
exp_probs_b,
|
||||
n_expert, n_expert_used,
|
||||
type_op, norm_w, scale_w, w_scale,
|
||||
gating_op, cb, il, graph);
|
||||
}
|
||||
|
||||
};
|
||||
205
src/llama-context.h
Normal file
205
src/llama-context.h
Normal file
@@ -0,0 +1,205 @@
|
||||
#pragma once
|
||||
|
||||
#include "llama-impl.h"
|
||||
#include "llama-cparams.h"
|
||||
#include "llama-sampling.h"
|
||||
|
||||
struct llama_model;
|
||||
|
||||
#include <vector>
|
||||
#include <map>
|
||||
#include <set>
|
||||
|
||||
struct llama_kv_cell {
|
||||
llama_pos pos = -1;
|
||||
llama_pos delta = 0;
|
||||
int32_t src = 0; // used by recurrent state models to copy states
|
||||
|
||||
std::set<llama_seq_id> seq_id;
|
||||
|
||||
bool has_seq_id(const llama_seq_id & id) const {
|
||||
return seq_id.find(id) != seq_id.end();
|
||||
}
|
||||
|
||||
bool is_empty() const {
|
||||
return seq_id.empty();
|
||||
}
|
||||
|
||||
bool is_same_seq(const llama_kv_cell & other) const {
|
||||
return seq_id == other.seq_id;
|
||||
}
|
||||
};
|
||||
|
||||
// ring-buffer of cached KV data
|
||||
struct llama_kv_cache {
|
||||
bool has_shift = false;
|
||||
bool do_defrag = false;
|
||||
bool do_copy = false;
|
||||
bool recurrent = false; // with recurrent state models, a cell can hold the state for more than one past token
|
||||
bool v_trans = true; // the value tensor is transposed
|
||||
|
||||
// Note: The value of head isn't only used to optimize searching
|
||||
// for a free KV slot. llama_decode_internal also uses it, so it
|
||||
// cannot be freely changed after a slot has been allocated.
|
||||
uint32_t head = 0;
|
||||
uint32_t size = 0;
|
||||
uint32_t used = 0; // used cells (i.e. at least one seq_id)
|
||||
|
||||
// computed before each graph build
|
||||
uint32_t n = 0;
|
||||
|
||||
ggml_type type_k = GGML_TYPE_F16;
|
||||
ggml_type type_v = GGML_TYPE_F16;
|
||||
|
||||
std::vector<llama_kv_cell> cells;
|
||||
|
||||
std::vector<struct ggml_tensor *> k_l; // per layer
|
||||
std::vector<struct ggml_tensor *> v_l;
|
||||
|
||||
std::vector<struct ggml_context *> ctxs;
|
||||
std::vector<ggml_backend_buffer_t> bufs;
|
||||
|
||||
size_t total_size() const {
|
||||
size_t size = 0;
|
||||
for (ggml_backend_buffer_t buf : bufs) {
|
||||
size += ggml_backend_buffer_get_size(buf);
|
||||
}
|
||||
return size;
|
||||
}
|
||||
|
||||
~llama_kv_cache() {
|
||||
for (struct ggml_context * ctx : ctxs) {
|
||||
ggml_free(ctx);
|
||||
}
|
||||
for (ggml_backend_buffer_t buf : bufs) {
|
||||
ggml_backend_buffer_free(buf);
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
struct llama_control_vector {
|
||||
std::vector<struct ggml_tensor *> tensors; // per layer
|
||||
std::vector<struct ggml_context *> ctxs;
|
||||
std::vector<ggml_backend_buffer_t> bufs;
|
||||
|
||||
int32_t layer_start = -1;
|
||||
int32_t layer_end = -1;
|
||||
|
||||
struct ggml_tensor * tensor_for(int il) const {
|
||||
if (il < 0 || il < layer_start || il > layer_end || (size_t) il >= tensors.size()) {
|
||||
return nullptr;
|
||||
}
|
||||
return tensors[il];
|
||||
}
|
||||
|
||||
struct ggml_tensor * apply_to(struct ggml_context * ctx, struct ggml_tensor * cur, int il) const {
|
||||
ggml_tensor * layer_dir = tensor_for(il);
|
||||
if (layer_dir != nullptr) {
|
||||
cur = ggml_add(ctx, cur, layer_dir);
|
||||
}
|
||||
return cur;
|
||||
}
|
||||
|
||||
~llama_control_vector() {
|
||||
for (struct ggml_context * ctx : ctxs) {
|
||||
ggml_free(ctx);
|
||||
}
|
||||
for (ggml_backend_buffer_t buf : bufs) {
|
||||
ggml_backend_buffer_free(buf);
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
struct llama_context {
|
||||
|
||||
llama_context(const llama_model & model);
|
||||
|
||||
~llama_context();
|
||||
|
||||
const struct llama_model & model;
|
||||
|
||||
struct llama_cparams cparams;
|
||||
struct llama_sampling sampling;
|
||||
struct llama_kv_cache kv_self;
|
||||
struct llama_control_vector cvec;
|
||||
|
||||
std::vector<float> scale_data;
|
||||
|
||||
std::unordered_map<struct llama_lora_adapter *, float> lora_adapters;
|
||||
|
||||
std::vector<ggml_backend_t> backends;
|
||||
#ifdef GGML_USE_METAL
|
||||
ggml_backend_t backend_metal = nullptr;
|
||||
#endif
|
||||
#ifdef GGML_USE_BLAS
|
||||
ggml_backend_t backend_blas = nullptr;
|
||||
#endif
|
||||
ggml_backend_t backend_cpu = nullptr;
|
||||
|
||||
bool has_evaluated_once = false;
|
||||
|
||||
int64_t t_start_us;
|
||||
int64_t t_load_us;
|
||||
int64_t t_p_eval_us = 0;
|
||||
int64_t t_eval_us = 0;
|
||||
|
||||
int64_t t_compute_start_us = 0;
|
||||
int64_t n_queued_tokens = 0;
|
||||
|
||||
int32_t n_p_eval = 0; // number of tokens in eval calls for the prompt (with batch size > 1)
|
||||
int32_t n_eval = 0; // number of eval calls
|
||||
|
||||
// host buffer for the model output (logits and embeddings)
|
||||
ggml_backend_buffer_t buf_output = nullptr;
|
||||
|
||||
// decode output (2-dimensional array: [n_outputs][n_vocab])
|
||||
size_t logits_size = 0; // capacity (of floats) for logits
|
||||
float * logits = nullptr;
|
||||
|
||||
std::vector<int32_t> output_ids; // map batch token positions to ids of the logits and embd buffers
|
||||
size_t output_size = 0; // capacity (of tokens positions) for the output buffers
|
||||
int32_t n_outputs = 0; // number of actually-used outputs in the current ubatch or last logical batch
|
||||
|
||||
bool logits_all = false;
|
||||
|
||||
// embeddings output (2-dimensional array: [n_outputs][n_embd])
|
||||
// populated only when pooling_type == LLAMA_POOLING_TYPE_NONE
|
||||
size_t embd_size = 0; // capacity (of floats) for embeddings
|
||||
float * embd = nullptr;
|
||||
|
||||
// sequence embeddings output (map of [n_embd] vectors)
|
||||
// populated only when pooling_type != LLAMA_POOLING_TYPE_NONE
|
||||
std::map<llama_seq_id, std::vector<float>> embd_seq;
|
||||
|
||||
// whether we are computing encoder output or decoder output
|
||||
bool is_encoding = false;
|
||||
|
||||
// output of the encoder part of the encoder-decoder models
|
||||
std::vector<float> embd_enc;
|
||||
std::vector<std::set<llama_seq_id>> seq_ids_enc;
|
||||
|
||||
// memory buffers used to evaluate the model
|
||||
std::vector<uint8_t> buf_compute_meta;
|
||||
ggml_backend_sched_t sched = nullptr;
|
||||
|
||||
ggml_abort_callback abort_callback = nullptr;
|
||||
void * abort_callback_data = nullptr;
|
||||
|
||||
// input tensors
|
||||
struct ggml_tensor * inp_tokens; // I32 [n_batch]
|
||||
struct ggml_tensor * inp_embd; // F32 [n_embd, n_batch]
|
||||
struct ggml_tensor * inp_pos; // I32 [n_batch]
|
||||
struct ggml_tensor * inp_out_ids; // I32 [n_outputs]
|
||||
struct ggml_tensor * inp_KQ_mask; // F32 [kv_size, n_batch]
|
||||
struct ggml_tensor * inp_KQ_mask_swa; // F32 [kv_size, n_batch]
|
||||
struct ggml_tensor * inp_K_shift; // I32 [kv_size]
|
||||
struct ggml_tensor * inp_mean; // F32 [n_batch, n_batch]
|
||||
struct ggml_tensor * inp_cls; // I32 [n_batch]
|
||||
struct ggml_tensor * inp_s_copy; // I32 [kv_size]
|
||||
struct ggml_tensor * inp_s_mask; // F32 [1, n_kv]
|
||||
struct ggml_tensor * inp_s_seq; // I32 [n_kv, n_batch]
|
||||
struct ggml_tensor * inp_pos_bucket; // I32 [n_batch|n_kv, n_batch]
|
||||
struct ggml_tensor * inp_embd_enc; // F32 [n_embd, n_outputs_enc]
|
||||
struct ggml_tensor * inp_KQ_mask_cross; // F32 [n_outputs_enc, n_batch]
|
||||
struct ggml_tensor * inp_scale = nullptr; // F32 [n_tokens]
|
||||
};
|
||||
42
src/llama-cparams.h
Normal file
42
src/llama-cparams.h
Normal file
@@ -0,0 +1,42 @@
|
||||
#pragma once
|
||||
|
||||
#include "llama-impl.h"
|
||||
|
||||
#include <cstdint>
|
||||
|
||||
struct llama_cparams {
|
||||
uint32_t n_ctx; // context size used during inference
|
||||
uint32_t n_batch;
|
||||
uint32_t n_ubatch;
|
||||
uint32_t n_seq_max;
|
||||
uint32_t n_threads; // number of threads to use for generation
|
||||
uint32_t n_threads_batch; // number of threads to use for batch processing
|
||||
|
||||
float rope_freq_base;
|
||||
float rope_freq_scale;
|
||||
|
||||
uint32_t n_ctx_orig_yarn;
|
||||
// These hyperparameters are not exposed in GGUF, because all
|
||||
// existing YaRN models use the same values for them.
|
||||
float yarn_ext_factor;
|
||||
float yarn_attn_factor;
|
||||
float yarn_beta_fast;
|
||||
float yarn_beta_slow;
|
||||
float defrag_thold;
|
||||
|
||||
bool embeddings;
|
||||
bool causal_attn;
|
||||
bool offload_kqv;
|
||||
bool flash_attn;
|
||||
int mla_attn;
|
||||
int attn_max_batch;
|
||||
bool fused_moe_up_gate;
|
||||
bool fused_up_gate;
|
||||
int min_experts;
|
||||
float thresh_experts;
|
||||
|
||||
enum llama_pooling_type pooling_type;
|
||||
|
||||
ggml_backend_sched_eval_callback cb_eval;
|
||||
void * cb_eval_user_data;
|
||||
};
|
||||
@@ -230,7 +230,6 @@ struct llama_layer {
|
||||
|
||||
struct llama_lora_adapter;
|
||||
|
||||
|
||||
struct llama_model {
|
||||
e_model type = MODEL_UNKNOWN;
|
||||
llm_arch arch = LLM_ARCH_UNKNOWN;
|
||||
@@ -301,4 +300,50 @@ struct llama_model {
|
||||
std::set<llama_lora_adapter *> lora_adapters;
|
||||
|
||||
~llama_model();
|
||||
|
||||
// Not actually needed, but left in place for now
|
||||
size_t max_nodes() const { return 65536; }
|
||||
};
|
||||
|
||||
// Pair of tensor pointers `a` and `b`; both are null after default
// construction. The struct does not free the tensors it points to.
struct llama_lora_weight {
    struct ggml_tensor * a = nullptr;
    struct ggml_tensor * b = nullptr;

    llama_lora_weight() = default;

    // Stores the two pointers as given.
    llama_lora_weight(struct ggml_tensor * a, struct ggml_tensor * b) : a(a), b(b) {}
};
|
||||
|
||||
struct llama_lora_adapter {
|
||||
llama_model * base_model;
|
||||
// map tensor name to lora_a_b
|
||||
std::unordered_map<std::string, struct llama_lora_weight> ab_map;
|
||||
std::vector<struct ggml_context *> ctxs;
|
||||
std::vector<ggml_backend_buffer_t> bufs;
|
||||
|
||||
float alpha;
|
||||
|
||||
llama_lora_adapter(struct llama_model * base_model): base_model(base_model) {
|
||||
base_model->lora_adapters.insert(this);
|
||||
}
|
||||
|
||||
llama_lora_weight * get_weight(struct ggml_tensor * w) {
|
||||
std::string name(w->name);
|
||||
auto pos = ab_map.find(name);
|
||||
if (ab_map.find(name) != ab_map.end()) {
|
||||
return &pos->second;
|
||||
}
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
~llama_lora_adapter() {
|
||||
for (struct ggml_context * ctx : ctxs) {
|
||||
ggml_free(ctx);
|
||||
}
|
||||
for (ggml_backend_buffer_t buf : bufs) {
|
||||
ggml_backend_buffer_free(buf);
|
||||
}
|
||||
auto pos = base_model->lora_adapters.find(this);
|
||||
if (pos != base_model->lora_adapters.end()) {
|
||||
base_model->lora_adapters.erase(pos);
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
8328
src/llama.cpp
8328
src/llama.cpp
File diff suppressed because it is too large
Load Diff
Reference in New Issue
Block a user