Modify the a_thread_offset computation, since the A data load is different from B's.

This commit is contained in:
mtgu0705
2024-12-25 23:26:17 +08:00
parent 1fcd332967
commit f728087c61

View File

@@ -1368,8 +1368,10 @@ struct GridwiseGemmMultiD_ABScale_xdl_cshuffle_v3
make_tuple(Number<ScaleSliceSizeN>{}, Number<ScaleSliceSizeK>{}));
constexpr index_t MWaves = MPerBlock / (MXdlPerWave * MPerXdl);
auto a_thread_offset =
get_thread_local_1d_id() % MPerXdl + (get_thread_local_1d_id() / 64) % MWaves * MPerXdl;
// auto a_thread_offset =
// get_thread_local_1d_id() % MPerXdl + (get_thread_local_1d_id() / 64) % MWaves * MPerXdl;
auto a_thread_offset = get_thread_local_1d_id() % MPerXdl + (get_thread_local_1d_id() / 128) * MPerXdl;
auto a_scale_thread_copy =
ThreadwiseTensorSliceTransfer_v2<AScaleType,