Refactor file llama.cpp (#823)

* llama_model and llama_hparams
* llama_build_context
  Surprisingly small reduction in llama.cpp compile time given the reduction in LOCs (22k -> 14k)
* LLM_TN
  llama.cpp compilation: 50 s -> 33 s
* llama_quantize
* arch names
* All graph building is now in llm-build-context.cpp
* hparams loading
  llama.cpp is now just 9300 LOC, but still takes 32 seconds to compile.
* We are now at 6 seconds to build the src folder
* load -> create
  We are not actually loading the tensors, but just creating them.

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
2026-03-14 15:57:37 +00:00 · 2025-10-11 11:35:20 +03:00
parent f649e36a61
commit 335a1f9b71
16 changed files with 16361 additions and 15826 deletions
--- a/src/llama-model-loader.cpp
+++ b/src/llama-model-loader.cpp
@@ -1,6 +1,7 @@
 #include "llama-model-loader.h"
 #include "llama-impl.h"
 #include "llama-mmap.h"
+#include "llama-model.h"
 #include "ggml.h"
 //#include "ggml-backend.h"

@@ -20,6 +21,7 @@
 #include <map>
 #include <array>
 #include <future>
+#include <regex>

 #if defined(_WIN32)
    #define WIN32_LEAN_AND_MEAN
@@ -1080,3 +1082,4 @@ template bool llama_model_loader::get_key_or_arr<std::array<int, 4>>(enum llm_kv
 template bool llama_model_loader::get_key_or_arr<std::array<uint32_t, 512>>(enum llm_kv kid, std::array<uint32_t, 512> & result, uint32_t n, bool required);

 template std::enable_if<std::is_integral<unsigned int>::value, bool>::type llama_model_loader::get_arr_n<unsigned int>(enum llm_kv, unsigned int&, bool);
+