Shuffle fix for gfx950 (#3491)

* solve compiler issue

* solve the gfx950 mfma shuffle regression

* refactor Jenkinsfile to handle the arch name better

* [CK TILE] set divisor to the number of threads along the K dimension

* fix the compiler error

* solve degradation

* Finish the multiplies fix

* fix the scales

* solve compilation error

* solve the composes

* solve the error of tile sweeper

* fix the test and example

* fix for gfx950

---------

Co-authored-by: Max Podkorytov <4273004+tenpercent@users.noreply.github.com>
Co-authored-by: illsilin_amdeng <Illia.Silin@amd.com>
Co-authored-by: Cong Ma <congma13@amd.com>
This commit is contained in:
Thomas Ning
2026-01-14 01:21:29 +08:00
committed by GitHub
parent 9908a87c31
commit 00c46785a8
33 changed files with 161 additions and 152 deletions

View File

@@ -42,7 +42,8 @@ struct AtomicKernelShape
static constexpr index_t Repeat_M = Block_M * RepeatInWarp_M / (WarpPerBlock_M * Warp_M);
static constexpr index_t Repeat_N = Block_N * RepeatInWarp_N / (WarpPerBlock_N * Warp_N);
static constexpr index_t WaveNum = reduce_on_sequence(BlockWaves{}, multiplies{}, number<1>{});
static constexpr index_t WaveNum =
reduce_on_sequence(BlockWaves{}, multiplies<>{}, number<1>{});
static constexpr index_t BlockSize = get_warp_size() * WaveNum;
};

View File

@@ -42,7 +42,8 @@ struct TileCopyShape
static constexpr index_t Repeat_M = Block_M / (WarpPerBlock_M * Warp_M);
static constexpr index_t Repeat_N = Block_N / (WarpPerBlock_N * Warp_N);
static constexpr index_t WaveNum = reduce_on_sequence(BlockWaves{}, multiplies{}, number<1>{});
static constexpr index_t WaveNum =
reduce_on_sequence(BlockWaves{}, multiplies<>{}, number<1>{});
static constexpr index_t BlockSize = get_warp_size() * WaveNum;
static constexpr index_t WaveGroupSize = WaveNum / WaveGroups;