Mirror of https://github.com/ikawrakow/ik_llama.cpp.git (synced 2026-04-30 11:21:56 +00:00)
Fix race condition in the CUDA DeepSeek FlashAttention (FA) kernel
Reference: https://github.com/ggml-org/llama.cpp/pull/13438
This commit is contained in:
@@ -898,6 +898,8 @@ static __device__ __forceinline__ void flash_attn_ext_f16_process_tile(
             KQ_crs += __shfl_xor_sync(0xFFFFFFFF, KQ_crs, offset, WARP_SIZE);
         }
 
+        __syncthreads();
+
         // Write back combined meta data:
 #pragma unroll
         for (int imeta = 0; imeta < nmeta; ++imeta) {
Reference in New Issue
Block a user