mirror of
https://github.com/ikawrakow/ik_llama.cpp.git
synced 2026-03-01 17:40:25 +00:00
the graph is still not being computed in parallel. Why? Because the scheduler creates graph splits where the result of the computation on one GPU becomes an input for the other split. Hence, to trigger the computation on the second GPU one needs to wait for the computation on the first GPU to finish, even thiough the two can be done in parallel up to the sunchronization point. So, all that is left to do is to trick the scheduler to create to splits that can be done in parallel, and then have a graph split where the results get combined.
472 lines
13 KiB
C++
472 lines
13 KiB
C++
#pragma once
|
|
|
|
#include "llama-impl.h"
|
|
#include "llama-arch.h"
|
|
#include "llama-mmap.h"
|
|
#include "llama-vocab.h"
|
|
#include "llama-hparams.h"
|
|
|
|
#include "ggml-backend.h"
|
|
|
|
#include <vector>
|
|
#include <unordered_map>
|
|
#include <set>
|
|
|
|
// available llama models
|
|
enum e_model {
|
|
MODEL_UNKNOWN,
|
|
MODEL_14M,
|
|
MODEL_17M,
|
|
MODEL_22M,
|
|
MODEL_33M,
|
|
MODEL_60M,
|
|
MODEL_70M,
|
|
MODEL_80M,
|
|
MODEL_109M,
|
|
MODEL_137M,
|
|
MODEL_140M,
|
|
MODEL_160M,
|
|
MODEL_190M,
|
|
MODEL_220M,
|
|
MODEL_250M,
|
|
MODEL_256M,
|
|
MODEL_270M,
|
|
MODEL_335M,
|
|
MODEL_350M,
|
|
MODEL_360M,
|
|
MODEL_410M,
|
|
MODEL_450M,
|
|
MODEL_475M,
|
|
MODEL_558M,
|
|
MODEL_700M,
|
|
MODEL_770M,
|
|
MODEL_780M,
|
|
MODEL_950M,
|
|
MODEL_0_3B,
|
|
MODEL_0_5B,
|
|
MODEL_0_6B,
|
|
MODEL_1B,
|
|
MODEL_1_2B,
|
|
MODEL_1_3B,
|
|
MODEL_1_4B,
|
|
MODEL_1_5B,
|
|
MODEL_1_6B,
|
|
MODEL_1_7B,
|
|
MODEL_1_8B,
|
|
MODEL_2B,
|
|
MODEL_2_6B,
|
|
MODEL_2_8B,
|
|
MODEL_2_9B,
|
|
MODEL_3B,
|
|
MODEL_4B,
|
|
MODEL_6B,
|
|
MODEL_6_9B,
|
|
MODEL_7B,
|
|
MODEL_8B,
|
|
MODEL_9B,
|
|
MODEL_11B,
|
|
MODEL_12B,
|
|
MODEL_13B,
|
|
MODEL_14B,
|
|
MODEL_15B,
|
|
MODEL_16B,
|
|
MODEL_20B,
|
|
MODEL_27B,
|
|
MODEL_30B,
|
|
MODEL_32B,
|
|
MODEL_34B,
|
|
MODEL_35B,
|
|
MODEL_36B,
|
|
MODEL_40B,
|
|
MODEL_65B,
|
|
MODEL_70B,
|
|
MODEL_120B,
|
|
MODEL_142B,
|
|
MODEL_236B,
|
|
MODEL_290B,
|
|
MODEL_314B,
|
|
MODEL_405B,
|
|
MODEL_671B,
|
|
MODEL_SMALL,
|
|
MODEL_MEDIUM,
|
|
MODEL_LARGE,
|
|
MODEL_XL,
|
|
MODEL_A1_7B,
|
|
MODEL_A2_7B,
|
|
MODEL_8x7B,
|
|
MODEL_8x22B,
|
|
MODEL_16x12B,
|
|
MODEL_16x3_8B,
|
|
MODEL_10B_128x3_66B,
|
|
MODEL_57B_A14B,
|
|
MODEL_17B_16E,
|
|
MODEL_17B_128E,
|
|
MODEL_A13B,
|
|
MODEL_7B_A1B,
|
|
MODEL_8B_A1B,
|
|
MODEL_16B_A1B,
|
|
MODEL_21B_A3B, // Ernie MoE small
|
|
MODEL_30B_A3B,
|
|
MODEL_80B_A13B,
|
|
MODEL_100B_A6B,
|
|
MODEL_106B_A12B,
|
|
MODEL_230B_A10B, // Minimax M2
|
|
MODEL_235B_A22B,
|
|
MODEL_300B_A47B, // Ernie MoE big
|
|
MODEL_355B_A32B,
|
|
MODEL_E2B,
|
|
MODEL_E4B,
|
|
};
|
|
|
|
|
|
struct llama_layer_nextn {
|
|
struct ggml_tensor * eh_proj = nullptr;
|
|
struct ggml_tensor * embed_tokens = nullptr;
|
|
struct ggml_tensor * enorm = nullptr;
|
|
struct ggml_tensor * hnorm = nullptr;
|
|
struct ggml_tensor * shared_head_head = nullptr;
|
|
struct ggml_tensor * shared_head_norm = nullptr;
|
|
};
|
|
|
|
// TODO: separate into "llama_layer_enc" and "llama_layer_dec"
|
|
struct llama_layer {
|
|
// normalization
|
|
struct ggml_tensor * attn_norm = nullptr;
|
|
struct ggml_tensor * attn_norm_b = nullptr;
|
|
struct ggml_tensor * attn_norm_2 = nullptr;
|
|
struct ggml_tensor * attn_norm_2_b = nullptr;
|
|
struct ggml_tensor * attn_q_norm = nullptr;
|
|
struct ggml_tensor * attn_q_norm_b = nullptr;
|
|
struct ggml_tensor * attn_k_norm = nullptr;
|
|
struct ggml_tensor * attn_k_norm_b = nullptr;
|
|
struct ggml_tensor * attn_out_norm = nullptr;
|
|
struct ggml_tensor * attn_out_norm_b = nullptr;
|
|
struct ggml_tensor * attn_q_a_norm = nullptr;
|
|
struct ggml_tensor * attn_kv_a_norm = nullptr;
|
|
struct ggml_tensor * attn_sub_norm = nullptr;
|
|
struct ggml_tensor * attn_post_norm = nullptr;
|
|
struct ggml_tensor * ffn_sub_norm = nullptr;
|
|
struct ggml_tensor * attn_norm_cross = nullptr;
|
|
struct ggml_tensor * attn_norm_enc = nullptr;
|
|
|
|
// attention
|
|
struct ggml_tensor * wq = nullptr;
|
|
struct ggml_tensor * wk = nullptr;
|
|
struct ggml_tensor * wv = nullptr;
|
|
struct ggml_tensor * wo = nullptr;
|
|
struct ggml_tensor * wqkv = nullptr;
|
|
struct ggml_tensor * wqk = nullptr;
|
|
struct ggml_tensor * wkv = nullptr;
|
|
struct ggml_tensor * wq_a = nullptr;
|
|
struct ggml_tensor * wq_b = nullptr;
|
|
struct ggml_tensor * wkv_a_mqa = nullptr;
|
|
struct ggml_tensor * wkq_a_mqa = nullptr;
|
|
struct ggml_tensor * wkv_b = nullptr;
|
|
struct ggml_tensor * wk_b = nullptr;
|
|
struct ggml_tensor * wv_b = nullptr;
|
|
struct ggml_tensor * wq_cross = nullptr;
|
|
struct ggml_tensor * wk_cross = nullptr;
|
|
struct ggml_tensor * wv_cross = nullptr;
|
|
struct ggml_tensor * wo_cross = nullptr;
|
|
struct ggml_tensor * wq_enc = nullptr;
|
|
struct ggml_tensor * wk_enc = nullptr;
|
|
struct ggml_tensor * wv_enc = nullptr;
|
|
struct ggml_tensor * wo_enc = nullptr;
|
|
struct ggml_tensor * attn_sinks = nullptr;
|
|
|
|
// attention bias
|
|
struct ggml_tensor * bq = nullptr;
|
|
struct ggml_tensor * bk = nullptr;
|
|
struct ggml_tensor * bv = nullptr;
|
|
struct ggml_tensor * bo = nullptr;
|
|
struct ggml_tensor * bqkv = nullptr;
|
|
struct ggml_tensor * bqk = nullptr;
|
|
struct ggml_tensor * bkv = nullptr;
|
|
|
|
llama_split_tensor split_attn_norm;
|
|
llama_split_tensor split_wq;
|
|
llama_split_tensor split_wk;
|
|
llama_split_tensor split_wv;
|
|
llama_split_tensor split_wo;
|
|
llama_split_tensor split_wqkv;
|
|
llama_split_tensor split_wqk;
|
|
llama_split_tensor split_wkv;
|
|
llama_split_tensor split_bq;
|
|
llama_split_tensor split_bk;
|
|
llama_split_tensor split_bv;
|
|
llama_split_tensor split_bo;
|
|
llama_split_tensor split_bqkv;
|
|
llama_split_tensor split_bqk;
|
|
llama_split_tensor split_bkv;
|
|
|
|
// relative position bias
|
|
struct ggml_tensor * attn_rel_b = nullptr;
|
|
struct ggml_tensor * attn_rel_b_enc = nullptr;
|
|
struct ggml_tensor * attn_rel_b_cross = nullptr;
|
|
|
|
// normalization
|
|
struct ggml_tensor * ffn_norm = nullptr;
|
|
struct ggml_tensor * ffn_norm_b = nullptr;
|
|
struct ggml_tensor * ffn_post_norm = nullptr;
|
|
struct ggml_tensor * layer_out_norm = nullptr;
|
|
struct ggml_tensor * layer_out_norm_b = nullptr;
|
|
struct ggml_tensor * ffn_norm_exps = nullptr;
|
|
struct ggml_tensor * ffn_norm_enc = nullptr;
|
|
|
|
// ff
|
|
struct ggml_tensor * ffn_gate = nullptr; // w1
|
|
struct ggml_tensor * ffn_down = nullptr; // w2
|
|
struct ggml_tensor * ffn_up = nullptr; // w3
|
|
struct ggml_tensor * ffn_gate_enc = nullptr;
|
|
struct ggml_tensor * ffn_down_enc = nullptr;
|
|
struct ggml_tensor * ffn_up_enc = nullptr;
|
|
|
|
llama_split_tensor split_ffn_up;
|
|
llama_split_tensor split_ffn_gate;
|
|
llama_split_tensor split_ffn_down;
|
|
llama_split_tensor split_ffn_norm;
|
|
|
|
// ff MoE
|
|
struct ggml_tensor * ffn_gate_inp = nullptr;
|
|
struct ggml_tensor * ffn_gate_exps = nullptr;
|
|
struct ggml_tensor * ffn_down_exps = nullptr;
|
|
struct ggml_tensor * ffn_up_exps = nullptr;
|
|
|
|
llama_split_tensor split_ffn_up_exps;
|
|
llama_split_tensor split_ffn_gate_exps;
|
|
llama_split_tensor split_ffn_down_exps;
|
|
|
|
// ff MoE bias
|
|
struct ggml_tensor * ffn_gate_inp_b = nullptr;
|
|
struct ggml_tensor * ffn_gate_exps_b = nullptr;
|
|
struct ggml_tensor * ffn_down_exps_b = nullptr;
|
|
struct ggml_tensor * ffn_up_exps_b = nullptr;
|
|
struct ggml_tensor * ffn_gate_exps_b_dup = nullptr;
|
|
struct ggml_tensor * ffn_down_exps_b_dup = nullptr;
|
|
struct ggml_tensor * ffn_up_exps_b_dup = nullptr;
|
|
|
|
// ff shared expert (shexp)
|
|
struct ggml_tensor * ffn_gate_inp_shexp = nullptr;
|
|
struct ggml_tensor * ffn_gate_shexp = nullptr;
|
|
struct ggml_tensor * ffn_down_shexp = nullptr;
|
|
struct ggml_tensor * ffn_up_shexp = nullptr;
|
|
|
|
llama_split_tensor split_ffn_up_shexp;
|
|
llama_split_tensor split_ffn_gate_shexp;
|
|
llama_split_tensor split_ffn_down_shexp;
|
|
|
|
// ff bias
|
|
struct ggml_tensor * ffn_gate_b = nullptr;
|
|
struct ggml_tensor * ffn_down_b = nullptr; // b2
|
|
struct ggml_tensor * ffn_up_b = nullptr; // b3
|
|
struct ggml_tensor * ffn_act = nullptr;
|
|
struct ggml_tensor * ffn_exp_probs_b = nullptr;
|
|
|
|
// mamba proj
|
|
struct ggml_tensor * ssm_in = nullptr;
|
|
struct ggml_tensor * ssm_x = nullptr;
|
|
struct ggml_tensor * ssm_dt = nullptr;
|
|
struct ggml_tensor * ssm_out = nullptr;
|
|
|
|
// mamba
|
|
struct ggml_tensor * ssm_conv1d = nullptr;
|
|
struct ggml_tensor * ssm_a = nullptr;
|
|
struct ggml_tensor * ssm_d = nullptr;
|
|
|
|
// mamba bias
|
|
struct ggml_tensor * ssm_conv1d_b = nullptr;
|
|
struct ggml_tensor * ssm_dt_b = nullptr;
|
|
|
|
// long rope factors
|
|
struct ggml_tensor * rope_long = nullptr;
|
|
struct ggml_tensor * rope_short = nullptr;
|
|
struct ggml_tensor * rope_freqs = nullptr;
|
|
|
|
llama_split_tensor split_rope_freqs;
|
|
|
|
// bitnet scale
|
|
struct ggml_tensor * wq_scale = nullptr;
|
|
struct ggml_tensor * wk_scale = nullptr;
|
|
struct ggml_tensor * wv_scale = nullptr;
|
|
struct ggml_tensor * wo_scale = nullptr;
|
|
struct ggml_tensor * ffn_gate_scale = nullptr;
|
|
struct ggml_tensor * ffn_up_scale = nullptr;
|
|
struct ggml_tensor * ffn_down_scale = nullptr;
|
|
|
|
struct llama_layer_nextn nextn;
|
|
|
|
std::unique_ptr<ggml_tensor> computed_wk_b;
|
|
std::unique_ptr<ggml_tensor> computed_wv_b;
|
|
std::unique_ptr<ggml_tensor> computed_wkv_b;
|
|
};
|
|
|
|
struct llama_lora_adapter;
|
|
|
|
struct rpc_device {
|
|
std::string endpoint;
|
|
uint32_t device;
|
|
};
|
|
|
|
struct llama_model {
|
|
e_model type = MODEL_UNKNOWN;
|
|
llm_arch arch = LLM_ARCH_UNKNOWN;
|
|
llama_ftype ftype = LLAMA_FTYPE_ALL_F32;
|
|
|
|
std::string name = "n/a";
|
|
|
|
llama_hparams hparams = {};
|
|
llama_vocab vocab;
|
|
|
|
struct ggml_tensor * tok_embd;
|
|
struct ggml_tensor * type_embd;
|
|
struct ggml_tensor * pos_embd;
|
|
struct ggml_tensor * tok_norm;
|
|
struct ggml_tensor * tok_norm_b;
|
|
|
|
struct ggml_tensor * output_norm;
|
|
struct ggml_tensor * output_norm_b;
|
|
struct ggml_tensor * output;
|
|
struct ggml_tensor * output_b;
|
|
struct ggml_tensor * output_norm_enc;
|
|
|
|
llama_split_tensor split_output;
|
|
|
|
std::vector<llama_layer> layers;
|
|
|
|
llama_split_mode split_mode;
|
|
int main_gpu;
|
|
int n_gpu_layers;
|
|
|
|
std::vector<rpc_device> rpc_servers;
|
|
std::vector<int32_t> devices;
|
|
|
|
// gguf metadata
|
|
std::unordered_map<std::string, std::string> gguf_kv;
|
|
|
|
// layer -> buffer type mapping
|
|
struct layer_buft {
|
|
layer_buft() : buft_matrix(nullptr), buft(nullptr) {}
|
|
layer_buft(ggml_backend_buffer_type_t matrix) : buft_matrix(matrix), buft(matrix) {}
|
|
layer_buft(ggml_backend_buffer_type_t matrix, ggml_backend_buffer_type_t other) : buft_matrix(matrix), buft(other) {}
|
|
|
|
ggml_backend_buffer_type_t buft_matrix; // matrices only - used by split buffers and backends that support only matrix multiplication
|
|
ggml_backend_buffer_type_t buft; // everything else
|
|
};
|
|
|
|
layer_buft buft_input;
|
|
layer_buft buft_output;
|
|
std::vector<layer_buft> buft_layer;
|
|
|
|
// contexts where the model tensors metadata is stored
|
|
std::vector<struct ggml_context *> ctxs;
|
|
|
|
// the model memory buffers for the tensor data
|
|
std::vector<ggml_backend_buffer_t> bufs;
|
|
|
|
// model memory mapped files
|
|
llama_mmaps mappings;
|
|
|
|
// objects representing data potentially being locked in memory
|
|
llama_mlocks mlock_bufs;
|
|
llama_mlocks mlock_mmaps;
|
|
|
|
// for quantize-stats only
|
|
std::vector<std::pair<std::string, struct ggml_tensor *>> tensors_by_name;
|
|
|
|
int64_t t_load_us = 0;
|
|
int64_t t_start_us = 0;
|
|
|
|
// keep track of loaded lora adapters
|
|
std::set<llama_lora_adapter *> lora_adapters;
|
|
|
|
bool tensor_overrides;
|
|
|
|
~llama_model();
|
|
|
|
// Not actually needed, but left in place for now
|
|
size_t max_nodes() const { return 65536; }
|
|
|
|
bool has_tensor_overrides() const {
|
|
return tensor_overrides;
|
|
}
|
|
|
|
void set_tensor_overrides(const llama_model_params& params);
|
|
|
|
int device_count() const;
|
|
ggml_backend_buffer_type_t default_buffer_type_offload(int device) const;
|
|
|
|
std::vector<float> splits;
|
|
};
|
|
|
|
struct llama_lora_weight {
|
|
struct ggml_tensor * a = nullptr;
|
|
struct ggml_tensor * b = nullptr;
|
|
llama_lora_weight() = default;
|
|
llama_lora_weight(struct ggml_tensor * a, struct ggml_tensor * b): a(a), b(b) {}
|
|
};
|
|
|
|
struct llama_lora_adapter {
|
|
llama_model * base_model;
|
|
// map tensor name to lora_a_b
|
|
std::unordered_map<std::string, struct llama_lora_weight> ab_map;
|
|
std::vector<struct ggml_context *> ctxs;
|
|
std::vector<ggml_backend_buffer_t> bufs;
|
|
|
|
float alpha;
|
|
|
|
llama_lora_adapter(struct llama_model * base_model): base_model(base_model) {
|
|
base_model->lora_adapters.insert(this);
|
|
}
|
|
|
|
llama_lora_weight * get_weight(struct ggml_tensor * w) {
|
|
std::string name(w->name);
|
|
auto pos = ab_map.find(name);
|
|
if (ab_map.find(name) != ab_map.end()) {
|
|
return &pos->second;
|
|
}
|
|
return nullptr;
|
|
}
|
|
|
|
~llama_lora_adapter() {
|
|
for (struct ggml_context * ctx : ctxs) {
|
|
ggml_free(ctx);
|
|
}
|
|
for (ggml_backend_buffer_t buf : bufs) {
|
|
ggml_backend_buffer_free(buf);
|
|
}
|
|
auto pos = base_model->lora_adapters.find(this);
|
|
if (pos != base_model->lora_adapters.end()) {
|
|
base_model->lora_adapters.erase(pos);
|
|
}
|
|
}
|
|
};
|
|
|
|
// helper to handle gguf constants
|
|
// usage:
|
|
//
|
|
// const auto tn = LLM_TN(LLM_ARCH_LLAMA);
|
|
//
|
|
// std::string name = tn(LLM_TENSOR_OUTPUT); -> "output"
|
|
// std::string name = tn(LLM_TENSOR_TOKEN_EMBD, "bias"); -> "token_embd.bias"
|
|
// std::string name = tn(LLM_TENSOR_ATTN_NORM, "weight", 3); -> "blk.3.attn_norm.weight"
|
|
//
|
|
struct LLM_TN {
|
|
LLM_TN(llm_arch arch) : arch(arch) {}
|
|
|
|
llm_arch arch;
|
|
|
|
std::string operator()(llm_tensor tensor) const;
|
|
|
|
std::string operator()(llm_tensor tensor, const std::string & suffix) const;
|
|
|
|
std::string operator()(llm_tensor tensor, int bid) const;
|
|
|
|
std::string operator()(llm_tensor tensor, const std::string & suffix, int bid) const;
|
|
|
|
std::string operator()(llm_tensor tensor, const std::string & suffix, int bid, int xid) const;
|
|
};
|
|
|
|
std::string llama_model_ftype_name(llama_ftype ftype);
|
|
|
|
const char * llama_model_type_name(e_model type);
|