From fdec0370c794841b6534bb115bc3f5ed363a0d57 Mon Sep 17 00:00:00 2001
From: zjing14
Date: Wed, 20 Apr 2022 22:10:35 -0500
Subject: [PATCH] removed unused lds loads (#196)

[ROCm/composable_kernel commit: 860e291c3061611ebeb742675f8d6bc52f7cbf84]
---
 .../gpu/block/blockwise_gemm_xdlops.hpp | 20 ++++++++++---------
 1 file changed, 11 insertions(+), 9 deletions(-)

diff --git a/include/ck/tensor_operation/gpu/block/blockwise_gemm_xdlops.hpp b/include/ck/tensor_operation/gpu/block/blockwise_gemm_xdlops.hpp
index 064a763374..8fe4beecba 100644
--- a/include/ck/tensor_operation/gpu/block/blockwise_gemm_xdlops.hpp
+++ b/include/ck/tensor_operation/gpu/block/blockwise_gemm_xdlops.hpp
@@ -39,6 +39,8 @@ struct BlockwiseGemmXdlops_k0mk1_k0nk1_m0n0m1n1m2m3m4n2_v1
     static constexpr auto xdlops_gemm = XdlopsGemm{};
+    static constexpr index_t KPerThread = KPerBlock / xdlops_gemm.K0PerXdlops;
+
     static constexpr index_t MWaves = MPerBlock / (MRepeat * MPerXDL);
     static constexpr index_t NWaves = NPerBlock / (NRepeat * NPerXDL);
@@ -71,7 +73,7 @@ struct BlockwiseGemmXdlops_k0mk1_k0nk1_m0n0m1n1m2m3m4n2_v1
         const auto xdlops_a_idx = xdlops_gemm.CalculateAThreadOriginDataIndex();
-        return make_tuple(0, waveId_m, xdlops_a_idx[I1], Number{} * xdlops_a_idx[I0]);
+        return make_tuple(0, waveId_m, xdlops_a_idx[I1], KPerThread * xdlops_a_idx[I0]);
     }
     __device__ static auto CalculateBThreadOriginDataIndex()
@@ -82,7 +84,7 @@ struct BlockwiseGemmXdlops_k0mk1_k0nk1_m0n0m1n1m2m3m4n2_v1
         const auto xdlops_b_idx = xdlops_gemm.CalculateBThreadOriginDataIndex();
-        return make_tuple(0, waveId_n, xdlops_b_idx[I1], Number{} * xdlops_b_idx[I0]);
+        return make_tuple(0, waveId_n, xdlops_b_idx[I1], KPerThread * xdlops_b_idx[I0]);
     }
     template
@@ -273,7 +275,7 @@ struct BlockwiseGemmXdlops_k0mk1_k0nk1_m0n0m1n1m2m3m4n2_v1
                                make_tuple(I0, I0, I0, I0),
                                b_thread_buf);
-            static_for<0, KPerBlock, KPack * xdlops_gemm.K0PerXdlops>{}([&](auto k) {
+            static_for<0, KPerThread, KPack>{}([&](auto k) {
                 vector_type a_thread_vec;
                 vector_type b_thread_vec;
@@ -300,13 +302,13 @@ struct BlockwiseGemmXdlops_k0mk1_k0nk1_m0n0m1n1m2m3m4n2_v1
     }
     private:
-    // A[M0, M1, M2, KPerBlock]
+    // A[M0, M1, M2, KPerThread]
     static constexpr auto a_thread_desc_ =
-        make_naive_tensor_descriptor_packed(make_tuple(I1, I1, I1, Number<KPerBlock>{}));
+        make_naive_tensor_descriptor_packed(make_tuple(I1, I1, I1, Number<KPerThread>{}));
-    // B[N0, N1, N2, KPerBlock]
+    // B[N0, N1, N2, KPerThread]
     static constexpr auto b_thread_desc_ =
-        make_naive_tensor_descriptor_packed(make_tuple(I1, I1, I1, Number<KPerBlock>{}));
+        make_naive_tensor_descriptor_packed(make_tuple(I1, I1, I1, Number<KPerThread>{}));
     // C[M, N, NumRegXdlops]
     static constexpr auto c_thread_desc_ = make_naive_tensor_descriptor_packed(
@@ -316,7 +318,7 @@ struct BlockwiseGemmXdlops_k0mk1_k0nk1_m0n0m1n1m2m3m4n2_v1
         FloatAB,
         decltype(a_block_desc_m0_m1_m2_k),
         decltype(a_thread_desc_),
-        Sequence<1, 1, 1, KPerBlock>,
+        Sequence<1, 1, 1, KPerThread>,
         Sequence<0, 1, 2, 3>,
         3,
         A_K1,
@@ -326,7 +328,7 @@ struct BlockwiseGemmXdlops_k0mk1_k0nk1_m0n0m1n1m2m3m4n2_v1
         FloatAB,
         decltype(b_block_desc_n0_n1_n2_k),
         decltype(b_thread_desc_),
-        Sequence<1, 1, 1, KPerBlock>,
+        Sequence<1, 1, 1, KPerThread>,
         Sequence<0, 1, 2, 3>,
         3,
         B_K1,