mirror of
https://github.com/ikawrakow/ik_llama.cpp.git
synced 2026-02-24 07:04:11 +00:00
Fix CUDA after latest changes
This commit is contained in:
@@ -493,7 +493,7 @@ static __device__ __forceinline__ void flash_attn_ext_f16_iter(
|
||||
(V_h2 + k_VKQ_0*stride_V, tile_V, nbatch_V2, stride_V);
|
||||
} else {
|
||||
constexpr bool use_cp_async = nstages == 1;
|
||||
-if constexpr (ncols2 > 1 || mask_h2) {
|
||||
+if (ncols2 > 1 || mask_h2) {
|
||||
flash_attn_ext_f16_load_mask<ncols1, nwarps, c::nbatch_fa, use_cp_async>(mask_h2 + k_VKQ_0/2, tile_mask, stride_mask);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -524,7 +524,7 @@ void ggml_cuda_flash_attn_ext(ggml_backend_cuda_context & ctx, ggml_tensor * dst
|
||||
// Hence, we use it only for DeepSeek with MLA enabled, where head sizes are 576, 512,
|
||||
// so no other implementation works.
|
||||
//
|
||||
-if (new_mma_available(cc) && (Q->ne[0] == 576 || (Q->ne[0] == 64) && Q->ne[1] >= 128)) {
|
||||
+if (new_mma_available(cc) && Q->ne[0] == 576) {
|
||||
ggml_cuda_flash_attn_ext_mma_new(ctx, dst);
|
||||
return;
|
||||
}
|
||||
|
||||
@@ -217,3 +217,4 @@ struct no_init {
|
||||
struct gguf_context;
|
||||
std::string gguf_kv_to_str(const gguf_context * ctx_gguf, int i);
|
||||
|
||||
ggml_backend_buffer_type_t llama_default_buffer_type_cpu(bool host_buffer);
|
||||
|
||||
@@ -2,6 +2,19 @@
|
||||
#include "llama-impl.h"
|
||||
#include "llama-mmap.h"
|
||||
#include "ggml.h"
|
||||
//#include "ggml-backend.h"
|
||||
|
||||
#ifdef GGML_USE_CUDA
|
||||
# include "ggml-cuda.h"
|
||||
#elif defined(GGML_USE_VULKAN)
|
||||
# include "ggml-vulkan.h"
|
||||
#elif defined(GGML_USE_SYCL)
|
||||
# include "ggml-sycl.h"
|
||||
#elif defined(GGML_USE_KOMPUTE)
|
||||
# include "ggml-kompute.h"
|
||||
#elif defined(GGML_USE_CANN)
|
||||
# include "ggml-cann.h"
|
||||
#endif
|
||||
|
||||
#include <set>
|
||||
#include <map>
|
||||
|
||||
@@ -1705,7 +1705,7 @@ static std::string llama_token_to_piece(const struct llama_model * model, llama_
|
||||
return piece;
|
||||
}
|
||||
|
||||
-static ggml_backend_buffer_type_t llama_default_buffer_type_cpu(bool host_buffer) {
|
||||
+ggml_backend_buffer_type_t llama_default_buffer_type_cpu(bool host_buffer) {
|
||||
ggml_backend_buffer_type_t buft = nullptr;
|
||||
|
||||
#if defined(GGML_USE_CUDA)
|
||||
|
||||
Reference in New Issue
Block a user