Give the user the option to override where model weights are stored (#232)

* Give the user the option to override where model weights are stored

* Fix ggml_nbytes() problem and cleanup

For a tensor with zero elements, ggml_nbytes() was returning
uint64_t::max, which caused graph allocation to fail.
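
A minimal sketch of the failure mode (this is not the actual ggml source; it only
mimics the "nbytes += (ne[i] - 1)*nb[i]" accumulation pattern, and the guard shown
is one possible fix, not necessarily the one applied here):

/* Sketch of how a zero-sized dimension wraps around in unsigned arithmetic. */
#include <stdint.h>
#include <stdio.h>

#define DIMS 4

static size_t nbytes_buggy(const int64_t ne[DIMS], const size_t nb[DIMS], size_t type_size) {
    size_t nbytes = type_size;
    for (int i = 0; i < DIMS; ++i) {
        nbytes += (size_t)(ne[i] - 1) * nb[i]; // ne[i] == 0 -> (size_t)-1, wraparound
    }
    return nbytes;
}

static size_t nbytes_guarded(const int64_t ne[DIMS], const size_t nb[DIMS], size_t type_size) {
    for (int i = 0; i < DIMS; ++i) {
        if (ne[i] <= 0) {
            return 0; // an empty tensor occupies no bytes
        }
    }
    return nbytes_buggy(ne, nb, type_size);
}

int main(void) {
    // a non-contiguous view with a zero-sized second dimension
    const int64_t ne[DIMS] = {4, 0, 1, 1};
    const size_t  nb[DIMS] = {4, 64, 64, 64};
    printf("buggy:   %zu\n", nbytes_buggy(ne, nb, 4));   // ~SIZE_MAX
    printf("guarded: %zu\n", nbytes_guarded(ne, nb, 4)); // 0
    return 0;
}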

* Add timing info to CUDA graph evaluation

* Add more timing info
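
For reference, a generic sketch of how a CUDA graph launch can be timed with CUDA
events (this is not the instrumentation added by this PR; the function and variable
names are illustrative, and error checking is omitted):

#include <cuda_runtime.h>
#include <stdio.h>

/* Times a single launch of an already-instantiated CUDA graph on the given
 * stream and prints the elapsed GPU time in milliseconds. */
static void time_graph_launch(cudaGraphExec_t graph_exec, cudaStream_t stream) {
    cudaEvent_t start, stop;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);

    cudaEventRecord(start, stream);
    cudaGraphLaunch(graph_exec, stream);   // evaluate the captured graph
    cudaEventRecord(stop, stream);
    cudaEventSynchronize(stop);

    float ms = 0.0f;
    cudaEventElapsedTime(&ms, start, stop);
    printf("CUDA graph evaluation: %.3f ms\n", ms);

    cudaEventDestroy(start);
    cudaEventDestroy(stop);
}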

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
Author: Kawrakow
Date:   2025-02-25 17:55:58 +02:00 (committed by GitHub)
Commit: 85c6152e85 (parent 6ae06d2c5c)
9 changed files with 848 additions and 621 deletions


@@ -305,6 +305,11 @@ extern "C" {
         };
     };
 
+    struct llama_model_tensor_buft_override {
+        const char * pattern;
+        ggml_backend_buffer_type_t buft;
+    };
+
     struct llama_model_params {
         int32_t n_gpu_layers; // number of layers to store in VRAM
         enum llama_split_mode split_mode; // how to split the model across multiple GPUs
@@ -332,6 +337,8 @@ extern "C" {
         // override key-value pairs of the model meta data
         const struct llama_model_kv_override * kv_overrides;
 
+        const struct llama_model_tensor_buft_override * tensor_buft_overrides;
+
         // Keep the booleans together to avoid misalignment during copy-by-value.
         bool vocab_only; // only load the vocabulary, no weights
         bool use_mmap;   // use mmap if possible
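
For reference, a hedged usage sketch of the new field. The override struct and the
tensor_buft_overrides member come from the diff above; the NULL-pattern sentinel,
the regex-style matching of patterns against tensor names, and the model path are
assumptions for illustration, not taken from this hunk:

#include "llama.h"
#include "ggml-backend.h"
#include <stddef.h>

int main(void) {
    // keep tensors whose names match the pattern in host (CPU) memory instead of VRAM;
    // the pattern below is illustrative and assumes regex matching against tensor names
    struct llama_model_tensor_buft_override overrides[] = {
        { "\\.ffn_.*_exps\\.", ggml_backend_cpu_buffer_type() },
        { NULL,                NULL                           }, // assumed sentinel
    };

    struct llama_model_params params = llama_model_default_params();
    params.n_gpu_layers          = 99;
    params.tensor_buft_overrides = overrides;

    struct llama_model * model = llama_load_model_from_file("model.gguf", params);
    if (model == NULL) {
        return 1;
    }
    llama_free_model(model);
    return 0;
}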