Shuffle fix for gfx950 (#3491)

* solve compiler issue
* solve the gfx950 mfma shuffle regression
* refactor jenkinsfile to handle arch name better
* [CK TILE] set divisor to count of threads along k dimension
* fix the compiler error
* solve degradation
* Finish the multiplies fix
* fix the scales
* solve compilation error
* solve the composes
* solve the error of tile sweeper
* fix the test and example
* fix for gfx950

---------

Co-authored-by: Max Podkorytov <4273004+tenpercent@users.noreply.github.com>
Co-authored-by: illsilin_amdeng <Illia.Silin@amd.com>
Co-authored-by: Cong Ma <congma13@amd.com>
2026-04-19 22:39:03 +00:00 · 2026-01-14 01:21:29 +08:00
parent 9908a87c31
commit 00c46785a8
33 changed files with 161 additions and 152 deletions
--- a/test/ck_tile/atomic_add_op/test_atomic.hpp
+++ b/test/ck_tile/atomic_add_op/test_atomic.hpp
@@ -42,7 +42,8 @@ struct AtomicKernelShape
    static constexpr index_t Repeat_M = Block_M * RepeatInWarp_M / (WarpPerBlock_M * Warp_M);
    static constexpr index_t Repeat_N = Block_N * RepeatInWarp_N / (WarpPerBlock_N * Warp_N);

-    static constexpr index_t WaveNum = reduce_on_sequence(BlockWaves{}, multiplies{}, number<1>{});
+    static constexpr index_t WaveNum =
+        reduce_on_sequence(BlockWaves{}, multiplies<>{}, number<1>{});

    static constexpr index_t BlockSize = get_warp_size() * WaveNum;
 };
--- a/test/ck_tile/memory_copy/test_copy.hpp
+++ b/test/ck_tile/memory_copy/test_copy.hpp
@@ -42,7 +42,8 @@ struct TileCopyShape
    static constexpr index_t Repeat_M = Block_M / (WarpPerBlock_M * Warp_M);
    static constexpr index_t Repeat_N = Block_N / (WarpPerBlock_N * Warp_N);

-    static constexpr index_t WaveNum = reduce_on_sequence(BlockWaves{}, multiplies{}, number<1>{});
+    static constexpr index_t WaveNum =
+        reduce_on_sequence(BlockWaves{}, multiplies<>{}, number<1>{});

    static constexpr index_t BlockSize     = get_warp_size() * WaveNum;
    static constexpr index_t WaveGroupSize = WaveNum / WaveGroups;