diff --git a/ggml/src/ggml-cuda/fattn-new-mma.cu b/ggml/src/ggml-cuda/fattn-new-mma.cu
index 27a411ea..c37a618f 100644
--- a/ggml/src/ggml-cuda/fattn-new-mma.cu
+++ b/ggml/src/ggml-cuda/fattn-new-mma.cu
@@ -493,7 +493,7 @@ static __device__ __forceinline__ void flash_attn_ext_f16_iter(
                 (V_h2 + k_VKQ_0*stride_V, tile_V, nbatch_V2, stride_V);
         } else {
             constexpr bool use_cp_async = nstages == 1;
-            if constexpr (ncols2 > 1 || mask_h2) {
+            if (ncols2 > 1 || mask_h2) {
                 flash_attn_ext_f16_load_mask(mask_h2 + k_VKQ_0/2, tile_mask, stride_mask);
             }
         }
diff --git a/ggml/src/ggml-cuda/fattn.cu b/ggml/src/ggml-cuda/fattn.cu
index dfc31c77..ffcaf219 100644
--- a/ggml/src/ggml-cuda/fattn.cu
+++ b/ggml/src/ggml-cuda/fattn.cu
@@ -524,7 +524,7 @@ void ggml_cuda_flash_attn_ext(ggml_backend_cuda_context & ctx, ggml_tensor * dst
     // Hence, we use it only for DeepSeek with MLA enabled, where head sizes are 576, 512,
     // so no other implementation works.
     //
-    if (new_mma_available(cc) && (Q->ne[0] == 576 || (Q->ne[0] == 64) && Q->ne[1] >= 128)) {
+    if (new_mma_available(cc) && Q->ne[0] == 576) {
        ggml_cuda_flash_attn_ext_mma_new(ctx, dst);
        return;
    }
diff --git a/src/llama-impl.h b/src/llama-impl.h
index befdc559..cd4e0730 100644
--- a/src/llama-impl.h
+++ b/src/llama-impl.h
@@ -217,3 +217,4 @@ struct no_init {
 struct gguf_context;
 
 std::string gguf_kv_to_str(const gguf_context * ctx_gguf, int i);
+ggml_backend_buffer_type_t llama_default_buffer_type_cpu(bool host_buffer);
diff --git a/src/llama-model-loader.cpp b/src/llama-model-loader.cpp
index 2662a9aa..57d20285 100644
--- a/src/llama-model-loader.cpp
+++ b/src/llama-model-loader.cpp
@@ -2,6 +2,19 @@
 #include "llama-impl.h"
 #include "llama-mmap.h"
 #include "ggml.h"
+//#include "ggml-backend.h"
+
+#ifdef GGML_USE_CUDA
+# include "ggml-cuda.h"
+#elif defined(GGML_USE_VULKAN)
+# include "ggml-vulkan.h"
+#elif defined(GGML_USE_SYCL)
+# include "ggml-sycl.h"
+#elif defined(GGML_USE_KOMPUTE)
+# include "ggml-kompute.h"
+#elif defined(GGML_USE_CANN)
+# include "ggml-cann.h"
+#endif
 
 #include
 #include
diff --git a/src/llama.cpp b/src/llama.cpp
index 65a1e89e..c5fa1026 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -1705,7 +1705,7 @@ static std::string llama_token_to_piece(const struct llama_model * model, llama_
     return piece;
 }
 
-static ggml_backend_buffer_type_t llama_default_buffer_type_cpu(bool host_buffer) {
+ggml_backend_buffer_type_t llama_default_buffer_type_cpu(bool host_buffer) {
     ggml_backend_buffer_type_t buft = nullptr;
 
 #if defined(GGML_USE_CUDA)