Modify the a_thread_offset computation, since the A data load differs from the B data load.

2026-06-29 03:07:02 +00:00 · 2024-12-25 23:26:17 +08:00
parent 1fcd332967
commit f728087c61
1 changed files with 4 additions and 2 deletions
--- a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v3_multi_d_ab_scale.hpp
+++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v3_multi_d_ab_scale.hpp
@@ -1368,8 +1368,10 @@ struct GridwiseGemmMultiD_ABScale_xdl_cshuffle_v3
            make_tuple(Number<ScaleSliceSizeN>{}, Number<ScaleSliceSizeK>{}));

        constexpr index_t MWaves = MPerBlock / (MXdlPerWave * MPerXdl);
-        auto a_thread_offset =
-            get_thread_local_1d_id() % MPerXdl + (get_thread_local_1d_id() / 64) % MWaves * MPerXdl;
+        // auto a_thread_offset =
+        //     get_thread_local_1d_id() % MPerXdl + (get_thread_local_1d_id() / 64) % MWaves * MPerXdl;
+
+        auto a_thread_offset = get_thread_local_1d_id() % MPerXdl + (get_thread_local_1d_id() / 128) * MPerXdl;
        
        auto a_scale_thread_copy =
            ThreadwiseTensorSliceTransfer_v2<AScaleType,