Optimize gemm2 atomic_add pattern: use a vector size of 2 in TileEncodingPattern for kFFN_gemm2

This commit is contained in:
Feng Shijie
2025-08-11 08:38:23 +00:00
parent 87aed564dc
commit cc9c7b9e58

View File

@@ -693,13 +693,14 @@ struct MoeFlatmmKernel
"Currently, the CShuffle EpiloguePipeline only supports the Row Major "
"Output layout");
using TileEncodingPattern =
TileDistributionEncodingPattern2D<kBlockSize,
MPerIterationShuffle,
NPerIterationShuffle,
EpiloguePipeline::GetVectorSizeC(),
tile_distribution_pattern::thread_raked,
EpiProblem::kNumWaveGroups>;
using TileEncodingPattern = TileDistributionEncodingPattern2D<
kBlockSize,
MPerIterationShuffle,
NPerIterationShuffle,
kind == MoeFlatmmKind::kFFN_gemm2 ? 2 : EpiloguePipeline::GetVectorSizeC(),
tile_distribution_pattern::thread_raked,
EpiProblem::kNumWaveGroups>;
constexpr auto dram_tile_distribution =
TileEncodingPattern::Make2DStaticTileDistribution();