Command-line option to set the maximum extra VRAM that the scheduler can use

This commit is contained in:
Iwan Kawrakow
2025-12-16 06:35:06 +00:00
parent 5235c8b3e4
commit ec2ba592b5
6 changed files with 24 additions and 5 deletions

View File

@@ -404,6 +404,7 @@ extern "C" {
uint32_t n_seq_max; // max number of sequences (i.e. distinct states for recurrent models)
uint32_t n_threads; // number of threads to use for generation
uint32_t n_threads_batch; // number of threads to use for batch processing
int32_t max_extra_alloc; // Max. additional VRAM the scheduler is allowed to allocate
enum llama_rope_scaling_type rope_scaling_type; // RoPE scaling type, from `enum llama_rope_scaling_type`
enum llama_pooling_type pooling_type; // whether to pool (sum) embedding results by sequence id