Mirror of https://github.com/ikawrakow/ik_llama.cpp.git
Give the user the option to override where model weights are stored (#232)
* Give the user the option to override where model weights are stored

* Fix ggml_nbytes() problem and cleanup

  For a tensor with zero elements, ggml_nbytes() was returning uint64_t::max, and this was causing graph allocation failure.

* Add timing info to CUDA graph evaluation

* Add more timing info

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
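The ggml_nbytes() failure described above is an unsigned-wraparound problem. The standalone C snippet below is only an illustration of that arithmetic (it is not code from ggml or from this commit, and it assumes the usual formulation where per-dimension terms of the form (ne[i] - 1) * nb[i] are accumulated): with zero elements, the subtraction wraps to a value near UINT64_MAX, which then makes graph allocation fail.

```c
/* Illustration only, not ggml code: why a zero-element dimension blows up a
 * size computation of the form (ne - 1) * nb once the arithmetic ends up in
 * unsigned 64-bit integers. */
#include <stdint.h>
#include <stdio.h>

int main(void) {
    int64_t  ne = 0;   // number of elements along one dimension (zero-element tensor)
    uint64_t nb = 32;  // byte stride for that dimension

    // (0 - 1) == -1, which becomes UINT64_MAX when converted to uint64_t
    uint64_t bytes = (uint64_t)(ne - 1) * nb;

    printf("computed size: %llu bytes\n", (unsigned long long) bytes);  // ~1.8e19, clearly bogus
    return 0;
}
```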
@@ -305,6 +305,11 @@ extern "C" {
         };
     };
 
+    struct llama_model_tensor_buft_override {
+        const char * pattern;
+        ggml_backend_buffer_type_t buft;
+    };
+
     struct llama_model_params {
         int32_t n_gpu_layers; // number of layers to store in VRAM
         enum llama_split_mode split_mode; // how to split the model across multiple GPUs
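Each override entry pairs a tensor-name pattern with a ggml backend buffer type; during model load, tensors whose names match a pattern are allocated from that buffer type instead of the default device placement. Below is a hedged sketch of the consumer side, i.e. how a loader could walk such a list. The helper name pick_buft, the substring matching via strstr, and the NULL-pattern terminator are all assumptions made for illustration; the commit's actual matching and termination rules may differ.

```c
// Sketch only: a hypothetical helper showing how an override list could be
// consulted when choosing where a tensor's weights live. Not the commit's code.
#include <string.h>
#include "llama.h"

static ggml_backend_buffer_type_t
pick_buft(const char                                    * tensor_name,
          const struct llama_model_tensor_buft_override * overrides,
          ggml_backend_buffer_type_t                       default_buft) {
    if (overrides != NULL) {
        // assumed convention: the list ends at an entry with a NULL pattern
        for (const struct llama_model_tensor_buft_override * ov = overrides; ov->pattern != NULL; ++ov) {
            // substring match as a stand-in for whatever matching the real code performs
            if (strstr(tensor_name, ov->pattern) != NULL) {
                return ov->buft;
            }
        }
    }
    return default_buft;  // no override matched: fall back to the normal placement
}
```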
@@ -332,6 +337,8 @@ extern "C" {
         // override key-value pairs of the model meta data
         const struct llama_model_kv_override * kv_overrides;
 
+        const struct llama_model_tensor_buft_override * tensor_buft_overrides;
+
         // Keep the booleans together to avoid misalignment during copy-by-value.
         bool vocab_only;    // only load the vocabulary, no weights
         bool use_mmap;      // use mmap if possible
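On the caller side, the new tensor_buft_overrides field is set before the model is loaded. The sketch below uses the existing llama.cpp C API (llama_backend_init, llama_model_default_params, llama_load_model_from_file, llama_free_model) plus the standard ggml accessor ggml_backend_cpu_buffer_type; the regex-style pattern string and the {NULL, NULL} terminator are assumptions carried over from how the analogous mainline llama.cpp feature is used, and may differ in this commit.

```c
// Sketch only: keep MoE expert weights in host memory while offloading the
// rest, by passing an override list through the new tensor_buft_overrides field.
#include <stdio.h>
#include "llama.h"
#include "ggml-backend.h"

int main(void) {
    llama_backend_init();

    struct llama_model_tensor_buft_override overrides[] = {
        { "\\.ffn_.*_exps\\.", ggml_backend_cpu_buffer_type() },  // experts stay on the CPU (pattern syntax assumed)
        { NULL,                NULL                           },  // assumed end-of-list sentinel
    };

    struct llama_model_params mparams = llama_model_default_params();
    mparams.n_gpu_layers          = 99;         // offload everything that is not overridden
    mparams.tensor_buft_overrides = overrides;  // field added by this commit

    struct llama_model * model = llama_load_model_from_file("model.gguf", mparams);
    if (model == NULL) {
        fprintf(stderr, "failed to load model\n");
        return 1;
    }

    llama_free_model(model);
    llama_backend_free();
    return 0;
}
```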