Fix race in the CUDA DeepSeek FA kernel (#406)

Reference: https://github.com/ggml-org/llama.cpp/pull/13438
Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
2026-02-23 06:34:13 +00:00 · 2025-05-11 08:12:47 +03:00
parent a961f41762
commit 0abcf0749e
1 changed file with 2 additions and 0 deletions
--- a/ggml/src/ggml-cuda/fattn-new-mma.cu
+++ b/ggml/src/ggml-cuda/fattn-new-mma.cu
@@ -898,6 +898,8 @@ static __device__ __forceinline__ void flash_attn_ext_f16_process_tile(
            KQ_crs += __shfl_xor_sync(0xFFFFFFFF, KQ_crs, offset, WARP_SIZE);
        }

+        __syncthreads();
+
        // Write back combined meta data:
 #pragma unroll
        for (int imeta = 0; imeta < nmeta; ++imeta) {