Honor mmap setting when using tensor overrides

2026-02-24 23:24:13 +00:00 · 2025-03-19 17:05:04 +02:00
parent 22c84a126f
commit 1b62d0fae3
1 changed file with 1 addition and 1 deletion
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -8015,7 +8015,7 @@ static bool llm_load_tensors(
        // only the mmap region containing the tensors in the model is mapped to the backend buffer
        // this is important for metal with apple silicon: if the entire model could be mapped to a metal buffer, then we could just use metal for all layers
        // this allows using partial offloading when the model size exceeds the metal buffer size, but not the RAM size
-        if (ml.use_mmap && use_mmap_buffer && buft == llama_default_buffer_type_cpu(true)) {
+        if (ml.use_mmap && use_mmap_buffer && (buft == llama_default_buffer_type_cpu(true) || buft == ggml_backend_cpu_buffer_type())) {
            for (uint32_t idx = 0; idx < ml.files.size(); idx++) {
                void * addr = nullptr;
                size_t first, last;