diff --git a/include/ck_tile/ops/flatmm/pipeline/mixed_prec_flatmm_pipeline_agmem_bgmem_creg_v1.hpp b/include/ck_tile/ops/flatmm/pipeline/mixed_prec_flatmm_pipeline_agmem_bgmem_creg_v1.hpp index 36724c25a6..2908845d3c 100644 --- a/include/ck_tile/ops/flatmm/pipeline/mixed_prec_flatmm_pipeline_agmem_bgmem_creg_v1.hpp +++ b/include/ck_tile/ops/flatmm/pipeline/mixed_prec_flatmm_pipeline_agmem_bgmem_creg_v1.hpp @@ -679,6 +679,7 @@ struct F16xMXF4FlatmmPipelineAGmemBGmemCRegV1 async_load_tile(lds_tile_a, dram_tile_a); }; auto prefill_lds_a_stage2 = [&](auto lds_tile_a) { + async_load_fence(); // data has been stored in lds, no need more operation. static_assert(std::is_same_v, "buffer_load_lds don't support element func fot A before mfma");