Add permuteN optimization when NRepeat % 2 == 0 on flatmm

This commit is contained in:
Feng Shijie
2025-07-27 11:57:38 +00:00
parent bfb9f4002f
commit 5473f06461
5 changed files with 228 additions and 104 deletions

View File

@@ -751,7 +751,7 @@ struct FlatmmKernel
CK_TILE_DEVICE void operator()(FlatmmKernelArgs<ScaleM, ScaleN, DsDataType::size()> kargs,
int partition_idx = blockIdx.x) const
{
const auto [iM, iN] = TilePartitioner{kargs.M, kargs.N}.GetOutputTileIndex(blockIdx.x);
const auto [iM, iN] = TilePartitioner{kargs.M, kargs.N}.GetOutputTileIndex(partition_idx);
const index_t i_m = __builtin_amdgcn_readfirstlane(iM * TilePartitioner::MPerBlock);
const index_t i_n = __builtin_amdgcn_readfirstlane(iN * TilePartitioner::NPerBlock);