Fix CUDA after latest changes

Author: Iwan Kawrakow
Date:   2025-08-13 17:23:13 +03:00
Parent: d2985c6a64
Commit: 0fe47c57eb

5 changed files with 17 additions and 3 deletions


@@ -493,7 +493,7 @@ static __device__ __forceinline__ void flash_attn_ext_f16_iter(
(V_h2 + k_VKQ_0*stride_V, tile_V, nbatch_V2, stride_V);
} else {
constexpr bool use_cp_async = nstages == 1;
-if constexpr (ncols2 > 1 || mask_h2) {
+if (ncols2 > 1 || mask_h2) {
flash_attn_ext_f16_load_mask<ncols1, nwarps, c::nbatch_fa, use_cp_async>(mask_h2 + k_VKQ_0/2, tile_mask, stride_mask);
}
}
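
The change above swaps a compile-time branch for a runtime one. A minimal sketch (plain C++, hypothetical names) of why this is the fix: an if constexpr condition must be a constant expression, and while ncols2 is a template parameter, mask_h2 is a runtime pointer, so the combined condition cannot be evaluated at compile time.

#include <cstdio>

// Hypothetical reduction of the kernel branch: ncols2 is a compile-time
// template parameter, mask_h2 is a runtime pointer argument.
template <int ncols2>
void maybe_load_mask(const void * mask_h2) {
    // if constexpr (ncols2 > 1 || mask_h2) { ... }
    // ^ ill-formed: the condition reads the runtime value mask_h2,
    //   so it is not a constant expression.
    if (ncols2 > 1 || mask_h2) {  // the fix: an ordinary runtime branch
        std::printf("mask loaded\n");
    }
}

int main() {
    maybe_load_mask<1>(nullptr);  // condition false: no mask load
    maybe_load_mask<2>(nullptr);  // true via ncols2 > 1
}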


@@ -524,7 +524,7 @@ void ggml_cuda_flash_attn_ext(ggml_backend_cuda_context & ctx, ggml_tensor * dst
// Hence, we use it only for DeepSeek with MLA enabled, where head sizes are 576, 512,
// so no other implementation works.
//
-if (new_mma_available(cc) && (Q->ne[0] == 576 || (Q->ne[0] == 64) && Q->ne[1] >= 128)) {
+if (new_mma_available(cc) && Q->ne[0] == 576) {
ggml_cuda_flash_attn_ext_mma_new(ctx, dst);
return;
}
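
For reference, a sketch of the dispatch change above, using only the shapes visible in the diff: the removed condition also sent head size 64 with at least 128 query rows down the new mma path; after the fix, only head size 576 (the DeepSeek-MLA case named in the comment) takes it, and every other shape falls back to the existing kernels.

#include <cstdint>

// Hedged sketch of the guard, not the actual dispatch code: ne0 is the
// Q/K head size (Q->ne[0]), ne1 the number of query rows (Q->ne[1]).
bool takes_new_mma_path(bool mma_available, int64_t ne0, int64_t ne1) {
    // before: mma_available && (ne0 == 576 || (ne0 == 64 && ne1 >= 128))
    (void) ne1;  // no longer consulted
    return mma_available && ne0 == 576;
}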


@@ -217,3 +217,4 @@ struct no_init {
struct gguf_context;
std::string gguf_kv_to_str(const gguf_context * ctx_gguf, int i);
+ggml_backend_buffer_type_t llama_default_buffer_type_cpu(bool host_buffer);
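
This new declaration pairs with the final hunk below, which drops the static qualifier from the function's definition: a static function has internal linkage and cannot be named from another translation unit, so exporting it through a header requires both changes. A minimal sketch with hypothetical names:

// util.h -- the declaration promises external linkage
int answer(void);

// util.cpp -- the definition must not be static, or the linker cannot
// resolve calls made from other translation units
// static int answer(void) { return 42; }   // internal linkage: breaks such calls
int answer(void) { return 42; }

// main.cpp
// #include "util.h"
// int main() { return answer(); }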


@@ -2,6 +2,19 @@
#include "llama-impl.h"
#include "llama-mmap.h"
#include "ggml.h"
+//#include "ggml-backend.h"
+#ifdef GGML_USE_CUDA
+# include "ggml-cuda.h"
+#elif defined(GGML_USE_VULKAN)
+# include "ggml-vulkan.h"
+#elif defined(GGML_USE_SYCL)
+# include "ggml-sycl.h"
+#elif defined(GGML_USE_KOMPUTE)
+# include "ggml-kompute.h"
+#elif defined(GGML_USE_CANN)
+# include "ggml-cann.h"
+#endif
#include <set>
#include <map>


@@ -1705,7 +1705,7 @@ static std::string llama_token_to_piece(const struct llama_model * model, llama_
return piece;
}
-static ggml_backend_buffer_type_t llama_default_buffer_type_cpu(bool host_buffer) {
+ggml_backend_buffer_type_t llama_default_buffer_type_cpu(bool host_buffer) {
ggml_backend_buffer_type_t buft = nullptr;
#if defined(GGML_USE_CUDA)
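
The hunk is cut off here. For context, a hedged sketch of the pattern this function follows in mainline llama.cpp (the body in this fork may differ): each compiled-in backend can supply a pinned host buffer type, and the plain CPU buffer type is the fallback.

ggml_backend_buffer_type_t llama_default_buffer_type_cpu(bool host_buffer) {
    ggml_backend_buffer_type_t buft = nullptr;
#if defined(GGML_USE_CUDA)
    // pinned host memory speeds up host<->device transfers, so it is only
    // worth using when data is expected to move to or from the GPU
    if (host_buffer) {
        buft = ggml_backend_cuda_host_buffer_type();
    }
#endif
    if (buft == nullptr) {
        buft = ggml_backend_cpu_buffer_type();  // default: ordinary CPU buffers
    }
    return buft;
}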