Give the user the option to override where model weights are stored (#232)

* Give the user the option to override where model weights are stored

* Fix ggml_nbytes() problem and cleanup

For a tensor with zero elements, ggml_nbytes() was returning
uint64_t::max, which caused graph allocation to fail.
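
A minimal sketch of the failure mode (this is not the actual ggml source; it only
mimics the "nbytes += (ne[i] - 1)*nb[i]" accumulation pattern, and the guard shown
is one possible fix, not necessarily the one applied here):

/* Sketch of how a zero-sized dimension wraps around in unsigned arithmetic. */
#include <stdint.h>
#include <stdio.h>

#define DIMS 4

static size_t nbytes_buggy(const int64_t ne[DIMS], const size_t nb[DIMS], size_t type_size) {
    size_t nbytes = type_size;
    for (int i = 0; i < DIMS; ++i) {
        nbytes += (size_t)(ne[i] - 1) * nb[i]; // ne[i] == 0 -> (size_t)-1, wraparound
    }
    return nbytes;
}

static size_t nbytes_guarded(const int64_t ne[DIMS], const size_t nb[DIMS], size_t type_size) {
    for (int i = 0; i < DIMS; ++i) {
        if (ne[i] <= 0) {
            return 0; // an empty tensor occupies no bytes
        }
    }
    return nbytes_buggy(ne, nb, type_size);
}

int main(void) {
    // a non-contiguous view with a zero-sized second dimension
    const int64_t ne[DIMS] = {4, 0, 1, 1};
    const size_t  nb[DIMS] = {4, 64, 64, 64};
    printf("buggy:   %zu\n", nbytes_buggy(ne, nb, 4));   // ~SIZE_MAX
    printf("guarded: %zu\n", nbytes_guarded(ne, nb, 4)); // 0
    return 0;
}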

* Add timing info to CUDA graph evaluation

* Add more timing info
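
For reference, a generic sketch of how a CUDA graph launch can be timed with CUDA
events (this is not the instrumentation added by this PR; the function and variable
names are illustrative, and error checking is omitted):

#include <cuda_runtime.h>
#include <stdio.h>

/* Times a single launch of an already-instantiated CUDA graph on the given
 * stream and prints the elapsed GPU time in milliseconds. */
static void time_graph_launch(cudaGraphExec_t graph_exec, cudaStream_t stream) {
    cudaEvent_t start, stop;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);

    cudaEventRecord(start, stream);
    cudaGraphLaunch(graph_exec, stream);   // evaluate the captured graph
    cudaEventRecord(stop, stream);
    cudaEventSynchronize(stop);

    float ms = 0.0f;
    cudaEventElapsedTime(&ms, start, stop);
    printf("CUDA graph evaluation: %.3f ms\n", ms);

    cudaEventDestroy(start);
    cudaEventDestroy(stop);
}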

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
Author: Kawrakow
Date:   2025-02-25 17:55:58 +02:00 (committed by GitHub)
Commit: 85c6152e85 (parent 6ae06d2c5c)
9 changed files with 848 additions and 621 deletions


@@ -305,6 +305,11 @@ extern "C" {
         };
     };
 
+    struct llama_model_tensor_buft_override {
+        const char * pattern;
+        ggml_backend_buffer_type_t buft;
+    };
+
     struct llama_model_params {
         int32_t n_gpu_layers; // number of layers to store in VRAM
         enum llama_split_mode split_mode; // how to split the model across multiple GPUs
@@ -332,6 +337,8 @@ extern "C" {
         // override key-value pairs of the model meta data
         const struct llama_model_kv_override * kv_overrides;
 
+        const struct llama_model_tensor_buft_override * tensor_buft_overrides;
+
         // Keep the booleans together to avoid misalignment during copy-by-value.
         bool vocab_only; // only load the vocabulary, no weights
         bool use_mmap;   // use mmap if possible
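
For reference, a hedged usage sketch of the new field. The override struct and the
tensor_buft_overrides member come from the diff above; the NULL-pattern sentinel,
the regex-style matching of patterns against tensor names, and the model path are
assumptions for illustration, not taken from this hunk:

#include "llama.h"
#include "ggml-backend.h"
#include <stddef.h>

int main(void) {
    // keep tensors whose names match the pattern in host (CPU) memory instead of VRAM;
    // the pattern below is illustrative and assumes regex matching against tensor names
    struct llama_model_tensor_buft_override overrides[] = {
        { "\\.ffn_.*_exps\\.", ggml_backend_cpu_buffer_type() },
        { NULL,                NULL                           }, // assumed sentinel
    };

    struct llama_model_params params = llama_model_default_params();
    params.n_gpu_layers          = 99;
    params.tensor_buft_overrides = overrides;

    struct llama_model * model = llama_load_model_from_file("model.gguf", params);
    if (model == NULL) {
        return 1;
    }
    llama_free_model(model);
    return 0;
}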