Add a command-line option to set the maximum extra VRAM that the scheduler can use

2026-04-24 08:29:29 +00:00 · 2025-12-16 06:35:06 +00:00
parent 5235c8b3e4
commit ec2ba592b5
6 changed files with 24 additions and 5 deletions
--- a/include/llama.h
+++ b/include/llama.h
@@ -404,6 +404,7 @@ extern "C" {
        uint32_t n_seq_max;         // max number of sequences (i.e. distinct states for recurrent models)
        uint32_t n_threads;         // number of threads to use for generation
        uint32_t n_threads_batch;   // number of threads to use for batch processing
+        int32_t  max_extra_alloc;   // Max. additional VRAM the scheduler is allowed to allocate

        enum llama_rope_scaling_type rope_scaling_type; // RoPE scaling type, from `enum llama_rope_scaling_type`
        enum llama_pooling_type      pooling_type;      // whether to pool (sum) embedding results by sequence id