diff --git a/include/ck_tile/ops/fmha/pipeline/block_fmha_fwd_appendkv_pipeline.hpp b/include/ck_tile/ops/fmha/pipeline/block_fmha_fwd_appendkv_pipeline.hpp index d825b81cc4..194ff3ec59 100644 --- a/include/ck_tile/ops/fmha/pipeline/block_fmha_fwd_appendkv_pipeline.hpp +++ b/include/ck_tile/ops/fmha/pipeline/block_fmha_fwd_appendkv_pipeline.hpp @@ -185,6 +185,8 @@ struct BlockFmhaFwdAppendKVPipeline rotary_sin_block_window_tmp.get_window_origin(), Policy::template MakeRotaryCosSinTileDistribution()); + // We assume that each thread owns contiguous elements on the head dimension. And we will + // use the distribution to enable/disable threads in order to override knew_tile content if constexpr(RotaryEnum == BlockRotaryEmbeddingEnum::INTERLEAVED) { auto rotary_cos_tile = load_tile(rotary_cos_window);