diff --git a/ggml/src/ggml-cuda/fattn-new-mma.cu b/ggml/src/ggml-cuda/fattn-new-mma.cu
index 27a411ea..c37a618f 100644
--- a/ggml/src/ggml-cuda/fattn-new-mma.cu
+++ b/ggml/src/ggml-cuda/fattn-new-mma.cu
@@ -493,7 +493,7 @@ static __device__ __forceinline__ void flash_attn_ext_f16_iter(
                 (V_h2 + k_VKQ_0*stride_V, tile_V, nbatch_V2, stride_V);
         } else {
             constexpr bool use_cp_async = nstages == 1;
-            if constexpr (ncols2 > 1 || mask_h2) {
+            if (ncols2 > 1 || mask_h2) {
                 flash_attn_ext_f16_load_mask(mask_h2 + k_VKQ_0/2, tile_mask, stride_mask);
             }
         }
diff --git a/ggml/src/ggml-cuda/fattn.cu b/ggml/src/ggml-cuda/fattn.cu
index dfc31c77..ffcaf219 100644
--- a/ggml/src/ggml-cuda/fattn.cu
+++ b/ggml/src/ggml-cuda/fattn.cu
@@ -524,7 +524,7 @@ void ggml_cuda_flash_attn_ext(ggml_backend_cuda_context & ctx, ggml_tensor * dst
     // Hence, we use it only for DeepSeek with MLA enabled, where head sizes are 576, 512,
     // so no other implementation works.
     //
-    if (new_mma_available(cc) && (Q->ne[0] == 576 || (Q->ne[0] == 64) && Q->ne[1] >= 128)) {
+    if (new_mma_available(cc) && Q->ne[0] == 576) {
        ggml_cuda_flash_attn_ext_mma_new(ctx, dst);
        return;
    }
diff --git a/src/llama-impl.h b/src/llama-impl.h
index befdc559..cd4e0730 100644
--- a/src/llama-impl.h
+++ b/src/llama-impl.h
@@ -217,3 +217,4 @@ struct no_init {
 struct gguf_context;
 
 std::string gguf_kv_to_str(const gguf_context * ctx_gguf, int i);
+ggml_backend_buffer_type_t llama_default_buffer_type_cpu(bool host_buffer);
diff --git a/src/llama-model-loader.cpp b/src/llama-model-loader.cpp
index 2662a9aa..57d20285 100644
--- a/src/llama-model-loader.cpp
+++ b/src/llama-model-loader.cpp
@@ -2,6 +2,19 @@
 #include "llama-impl.h"
 #include "llama-mmap.h"
 #include "ggml.h"
+//#include "ggml-backend.h"
+
+#ifdef GGML_USE_CUDA
+# include "ggml-cuda.h"
+#elif defined(GGML_USE_VULKAN)
+# include "ggml-vulkan.h"
+#elif defined(GGML_USE_SYCL)
+# include "ggml-sycl.h"
+#elif defined(GGML_USE_KOMPUTE)
+# include "ggml-kompute.h"
+#elif defined(GGML_USE_CANN)
+# include "ggml-cann.h"
+#endif
 
 #include
 #include
diff --git a/src/llama.cpp b/src/llama.cpp
index 65a1e89e..c5fa1026 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -1705,7 +1705,7 @@ static std::string llama_token_to_piece(const struct llama_model * model, llama_
     return piece;
 }
 
-static ggml_backend_buffer_type_t llama_default_buffer_type_cpu(bool host_buffer) {
+ggml_backend_buffer_type_t llama_default_buffer_type_cpu(bool host_buffer) {
     ggml_backend_buffer_type_t buft = nullptr;
 
 #if defined(GGML_USE_CUDA)