Fix CUDA after latest changes

Author: Iwan Kawrakow
Date:   2025-08-13 17:23:13 +03:00
Parent: d2985c6a64
Commit: 0fe47c57eb

5 changed files with 17 additions and 3 deletions


@@ -493,7 +493,7 @@ static __device__ __forceinline__ void flash_attn_ext_f16_iter(
(V_h2 + k_VKQ_0*stride_V, tile_V, nbatch_V2, stride_V);
} else {
constexpr bool use_cp_async = nstages == 1;
-if constexpr (ncols2 > 1 || mask_h2) {
+if (ncols2 > 1 || mask_h2) {
flash_attn_ext_f16_load_mask<ncols1, nwarps, c::nbatch_fa, use_cp_async>(mask_h2 + k_VKQ_0/2, tile_mask, stride_mask);
}
}
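
The change above swaps a compile-time branch for a runtime one. A minimal sketch (plain C++, hypothetical names) of why this is the fix: an if constexpr condition must be a constant expression, and while ncols2 is a template parameter, mask_h2 is a runtime pointer, so the combined condition cannot be evaluated at compile time.

#include <cstdio>

// Hypothetical reduction of the kernel branch: ncols2 is a compile-time
// template parameter, mask_h2 is a runtime pointer argument.
template <int ncols2>
void maybe_load_mask(const void * mask_h2) {
    // if constexpr (ncols2 > 1 || mask_h2) { ... }
    // ^ ill-formed: the condition reads the runtime value mask_h2,
    //   so it is not a constant expression.
    if (ncols2 > 1 || mask_h2) {  // the fix: an ordinary runtime branch
        std::printf("mask loaded\n");
    }
}

int main() {
    maybe_load_mask<1>(nullptr);  // condition false: no mask load
    maybe_load_mask<2>(nullptr);  // true via ncols2 > 1
}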


@@ -524,7 +524,7 @@ void ggml_cuda_flash_attn_ext(ggml_backend_cuda_context & ctx, ggml_tensor * dst
// Hence, we use it only for DeepSeek with MLA enabled, where head sizes are 576, 512,
// so no other implementation works.
//
-if (new_mma_available(cc) && (Q->ne[0] == 576 || (Q->ne[0] == 64) && Q->ne[1] >= 128)) {
+if (new_mma_available(cc) && Q->ne[0] == 576) {
ggml_cuda_flash_attn_ext_mma_new(ctx, dst);
return;
}
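
For reference, a sketch of the dispatch change above, using only the shapes visible in the diff: the removed condition also sent head size 64 with at least 128 query rows down the new mma path; after the fix, only head size 576 (the DeepSeek-MLA case named in the comment) takes it, and every other shape falls back to the existing kernels.

#include <cstdint>

// Hedged sketch of the guard, not the actual dispatch code: ne0 is the
// Q/K head size (Q->ne[0]), ne1 the number of query rows (Q->ne[1]).
bool takes_new_mma_path(bool mma_available, int64_t ne0, int64_t ne1) {
    // before: mma_available && (ne0 == 576 || (ne0 == 64 && ne1 >= 128))
    (void) ne1;  // no longer consulted
    return mma_available && ne0 == 576;
}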


@@ -217,3 +217,4 @@ struct no_init {
struct gguf_context;
std::string gguf_kv_to_str(const gguf_context * ctx_gguf, int i);
+ggml_backend_buffer_type_t llama_default_buffer_type_cpu(bool host_buffer);
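
This new declaration pairs with the final hunk below, which drops the static qualifier from the function's definition: a static function has internal linkage and cannot be named from another translation unit, so exporting it through a header requires both changes. A minimal sketch with hypothetical names:

// util.h -- the declaration promises external linkage
int answer(void);

// util.cpp -- the definition must not be static, or the linker cannot
// resolve calls made from other translation units
// static int answer(void) { return 42; }   // internal linkage: breaks such calls
int answer(void) { return 42; }

// main.cpp
// #include "util.h"
// int main() { return answer(); }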


@@ -2,6 +2,19 @@
#include "llama-impl.h"
#include "llama-mmap.h"
#include "ggml.h"
+//#include "ggml-backend.h"
+#ifdef GGML_USE_CUDA
+# include "ggml-cuda.h"
+#elif defined(GGML_USE_VULKAN)
+# include "ggml-vulkan.h"
+#elif defined(GGML_USE_SYCL)
+# include "ggml-sycl.h"
+#elif defined(GGML_USE_KOMPUTE)
+# include "ggml-kompute.h"
+#elif defined(GGML_USE_CANN)
+# include "ggml-cann.h"
+#endif
#include <set>
#include <map>


@@ -1705,7 +1705,7 @@ static std::string llama_token_to_piece(const struct llama_model * model, llama_
return piece;
}
-static ggml_backend_buffer_type_t llama_default_buffer_type_cpu(bool host_buffer) {
+ggml_backend_buffer_type_t llama_default_buffer_type_cpu(bool host_buffer) {
ggml_backend_buffer_type_t buft = nullptr;
#if defined(GGML_USE_CUDA)
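
The hunk is cut off here. For context, a hedged sketch of the pattern this function follows in mainline llama.cpp (the body in this fork may differ): each compiled-in backend can supply a pinned host buffer type, and the plain CPU buffer type is the fallback.

ggml_backend_buffer_type_t llama_default_buffer_type_cpu(bool host_buffer) {
    ggml_backend_buffer_type_t buft = nullptr;
#if defined(GGML_USE_CUDA)
    // pinned host memory speeds up host<->device transfers, so it is only
    // worth using when data is expected to move to or from the GPU
    if (host_buffer) {
        buft = ggml_backend_cuda_host_buffer_type();
    }
#endif
    if (buft == nullptr) {
        buft = ggml_backend_cpu_buffer_type();  // default: ordinary CPU buffers
    }
    return buft;
}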