Building the src folder now takes 6 seconds

This commit is contained in:
Iwan Kawrakow
2025-10-11 10:13:07 +03:00
parent ca73a21a0e
commit 4b71c16a75
6 changed files with 2460 additions and 2340 deletions

View File

@@ -19,6 +19,7 @@ add_library(llama
llama-sampling.cpp
llama-mmap.cpp
llama-model-loader.cpp
llama-load-tensors.cpp
llama-build-context.h
llama-build-context.cpp
llama-model.h

View File

@@ -4,6 +4,7 @@
#include <cstdint>
#include <array>
#include <cmath>
#define LLAMA_MAX_LAYERS 512

2318
src/llama-load-tensors.cpp Normal file

File diff suppressed because it is too large Load Diff

View File

@@ -1,6 +1,7 @@
#include "llama-model-loader.h"
#include "llama-impl.h"
#include "llama-mmap.h"
#include "llama-model.h"
#include "ggml.h"
//#include "ggml-backend.h"
@@ -20,6 +21,7 @@
#include <map>
#include <array>
#include <future>
#include <regex>
#if defined(_WIN32)
#define WIN32_LEAN_AND_MEAN
@@ -1080,3 +1082,4 @@ template bool llama_model_loader::get_key_or_arr<std::array<int, 4>>(enum llm_kv
template bool llama_model_loader::get_key_or_arr<std::array<uint32_t, 512>>(enum llm_kv kid, std::array<uint32_t, 512> & result, uint32_t n, bool required);
template std::enable_if<std::is_integral<unsigned int>::value, bool>::type llama_model_loader::get_arr_n<unsigned int>(enum llm_kv, unsigned int&, bool);

View File

@@ -10,6 +10,7 @@
#include <stdexcept>
#include <unordered_map>
#include <vector>
#include <map>
enum llama_fver {
GGUF_FILE_VERSION_V1 = 1,
@@ -29,6 +30,8 @@ static const char * llama_file_version_name(llama_fver version) {
using llama_buf_map = std::unordered_map<uint32_t, ggml_backend_buffer_t>;
struct llama_layer;
struct llama_model_loader {
int n_kv = 0;
int n_tensors = 0;
@@ -171,3 +174,127 @@ struct llama_model_loader {
void llm_load_arch(llama_model_loader & ml, llama_model & model);
void llm_load_hparams(llama_model_loader & ml, llama_model & model);
struct LLM_TN;
// Helper that factors per-architecture tensor loading out of llama_model_loader.
// One load_*_tensors() member exists per supported model architecture; each one
// creates/registers that architecture's weight tensors via create_tensor() using
// the naming scheme carried by LLM_TN. Definitions live in llama-load-tensors.cpp.
//
// NOTE(review): "load_lensor_helper" looks like a typo for "load_tensor_helper".
// Renaming requires touching the (out-of-view) definitions and all call sites,
// so it is only flagged here, not fixed.
struct load_lensor_helper {
// Binds the helper to a loader (tensor source) and the model being populated.
load_lensor_helper(llama_model_loader & ml, llama_model & model);
// --- Per-architecture loaders. Each returns true on success (presumably; the
// --- definitions are not visible here — TODO confirm the failure contract).
bool load_llama_tensors(const LLM_TN & tn);
bool load_deci_tensors(const LLM_TN & tn);
bool load_llama4_tensors(const LLM_TN & tn);
bool load_grok_tensors(const LLM_TN & tn);
bool load_dbrx_tensors(const LLM_TN & tn);
// with_ffn_norm=false skips the FFN norm tensors for baichuan-like variants
// that lack them (assumption from the parameter name — verify in definition).
bool load_baichuan_tensors(const LLM_TN & tn, bool with_ffn_norm = true);
bool load_falcon_tensors(const LLM_TN & tn);
bool load_starcoder_tensors(const LLM_TN & tn);
bool load_bert_tensors(const LLM_TN & tn);
bool load_jina_bert2_tensors(const LLM_TN & tn);
bool load_bloom_tensors(const LLM_TN & tn);
bool load_mpt_tensors(const LLM_TN & tn);
bool load_stablelm_tensors(const LLM_TN & tn);
bool load_qwen_tensors(const LLM_TN & tn);
bool load_qwen2_tensors(const LLM_TN & tn);
bool load_qwen2_moe_tensors(const LLM_TN & tn);
bool load_qwen3_tensors(const LLM_TN & tn);
bool load_qwen3_moe_tensors(const LLM_TN & tn);
bool load_phi2_tensors(const LLM_TN & tn);
bool load_phi3_tensors(const LLM_TN & tn);
bool load_gpt2_tensors(const LLM_TN & tn);
bool load_codeshell_tensors(const LLM_TN & tn);
bool load_orion_tensors(const LLM_TN & tn);
bool load_internlm_tensors(const LLM_TN & tn);
// version distinguishes Gemma generations sharing one loader (presumably 1/2/3
// — confirm against the definition's switch).
bool load_gemma_tensors(const LLM_TN & tn, int version);
bool load_starcoder2_tensors(const LLM_TN & tn);
bool load_mamba_tensors(const LLM_TN & tn);
bool load_xverse_tensors(const LLM_TN & tn);
bool load_command_r_tensors(const LLM_TN & tn);
bool load_olmo_tensors(const LLM_TN & tn);
bool load_openelm_tensors(const LLM_TN & tn);
bool load_gptneox_tensors(const LLM_TN & tn);
// NOTE(review): "arctix" is likely a typo for "arctic" (Snowflake Arctic);
// rename together with the definition if confirmed.
bool load_arctix_tensors(const LLM_TN & tn);
bool load_deepseek2_tensors(const LLM_TN & tn);
bool load_glm4_tensors(const LLM_TN & tn);
bool load_glm4_moe_tensors(const LLM_TN & tn);
bool load_bitnet_tensors(const LLM_TN & tn);
bool load_bitnet2_tensors(const LLM_TN & tn);
bool load_t5_tensors(const LLM_TN & tn);
bool load_tsencoder_tensors(const LLM_TN & tn);
bool load_jais_tensors(const LLM_TN & tn);
bool load_chatglm_tensors(const LLM_TN & tn);
bool load_cohere2_tensors(const LLM_TN & tn);
bool load_dots1_tensors(const LLM_TN & tn);
bool load_ernie45_tensors(const LLM_TN & tn);
bool load_hunyuan_tensors(const LLM_TN & tn);
bool load_openai_moe_tensors(const LLM_TN & tn);
// Entry point: dispatches to the architecture-specific loader above
// (presumably keyed on model.arch — confirm in llama-load-tensors.cpp).
bool load_tensors();
// Non-owning references; the loader and model must outlive this helper.
llama_model_loader & ml;
llama_model & model;
// Creates a tensor named `name` with dimensions `ne` in context `ctx`;
// `flags` semantics are defined at the (out-of-view) implementation.
ggml_tensor * create_tensor(ggml_context * ctx, const std::string & name, const std::vector<int64_t> & ne, int flags = 0);
// Shared routines for the token-embedding / output head tensors common to
// most architectures; `norm_bias` / `has_norm` toggle the norm tensors.
void load_default_embd_output(const LLM_TN & tn, int n_embd, int n_vocab, bool norm_bias);
void load_embd_output(const LLM_TN & tn, int n_embd, int n_vocab, bool has_norm = true);
// Standard attention / FFN tensor sets for one layer `i`; ctx_split is the
// context used for row-split (multi-GPU) tensors — TODO confirm.
void load_std_attn(int i, const LLM_TN & tn, llama_layer & layer, int n_embd, int n_embd_gqa, ggml_context * ctx_split);
void load_std_ffn(int i, const LLM_TN & tn, llama_layer & layer, int n_ff, int n_embd, ggml_context * ctx_split);
// Map layer index -> ggml context (plain vs. split placement).
inline ggml_context * ctx_for_layer(int i) const;
inline ggml_context * ctx_for_layer_split(int i) const;
// How many layers land in each backend buffer type, and the ggml context
// created for each buffer type.
std::map<ggml_backend_buffer_type_t, int> buft_layer_count;
std::map<ggml_backend_buffer_type_t, ggml_context *> ctx_map;
// Size reserved per ggml context (uninitialized here; set during setup).
size_t ctx_size;
// Dedicated contexts for input-side, output-side, and split output tensors.
ggml_context * ctx_input;
ggml_context * ctx_output;
ggml_context * ctx_output_split;
};

File diff suppressed because it is too large Load Diff