Fix race in the CUDA DeepSeek FA kernel (#406)

Reference: https://github.com/ggml-org/llama.cpp/pull/13438

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
This commit is contained in:
Kawrakow
2025-05-11 08:12:47 +03:00
committed by GitHub
parent a961f41762
commit 0abcf0749e

View File

@@ -898,6 +898,8 @@ static __device__ __forceinline__ void flash_attn_ext_f16_process_tile(
KQ_crs += __shfl_xor_sync(0xFFFFFFFF, KQ_crs, offset, WARP_SIZE);
}
__syncthreads();
// Write back combined meta data:
#pragma unroll
for (int imeta = 0; imeta < nmeta; ++imeta) {