Fix race in the CUDA DeepSeek FA kernel

Reference: https://github.com/ggml-org/llama.cpp/pull/13438
2026-04-30 11:21:56 +00:00 · 2025-05-11 08:03:10 +03:00
parent a2d24c97e5
commit 2f32589b8e
1 changed file with 2 additions and 0 deletions
--- a/ggml/src/ggml-cuda/fattn-new-mma.cu
+++ b/ggml/src/ggml-cuda/fattn-new-mma.cu
@@ -898,6 +898,8 @@ static __device__ __forceinline__ void flash_attn_ext_f16_process_tile(
            KQ_crs += __shfl_xor_sync(0xFFFFFFFF, KQ_crs, offset, WARP_SIZE);
        }
        __syncthreads();
        // Write back combined meta data:
 #pragma unroll
        for (int imeta = 0; imeta < nmeta; ++imeta) {