diff --git a/include/ck/tensor_operation/gpu/block/blockwise_gemm_mx_pipeline_xdlops_base.hpp b/include/ck/tensor_operation/gpu/block/blockwise_gemm_mx_pipeline_xdlops_base.hpp index 552c57e46f..0bbbc8d5f9 100644 --- a/include/ck/tensor_operation/gpu/block/blockwise_gemm_mx_pipeline_xdlops_base.hpp +++ b/include/ck/tensor_operation/gpu/block/blockwise_gemm_mx_pipeline_xdlops_base.hpp @@ -52,7 +52,6 @@ struct BlockwiseGemmXdlops_mx_pipeline_base static constexpr index_t B_K1 = BTileDesc{}.GetLength(Number < BTileDesc{}.GetNumOfDimension() == 4 ? 3 : 2 > {}); - static constexpr auto xdlops_gemm = XdlopsGemm{}; diff --git a/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_b_preshuflle_v1_mx.hpp b/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_b_preshuflle_v1_mx.hpp index 0cd5a09def..80c52afe05 100644 --- a/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_b_preshuflle_v1_mx.hpp +++ b/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_b_preshuflle_v1_mx.hpp @@ -144,6 +144,8 @@ struct BlockwiseGemmXdlops_pipeline_bpreshuffle_v1_mx{}([&](auto m0) { - static_for<0, xdlops_gemm.K1PerXdlops / KThreadChunk, 1>{}([&](auto chunk) { + static_for<0, xdlops_gemm.K1PerXdlops / APackedSize / KThreadChunk, 1>{}([&](auto chunk) { constexpr auto a_k_step_chunk = k_step + chunk * KThreadChunk * xdlops_gemm.mfma_instr.num_input_blks; a_thread_copy_.Run(a_block_desc_m0_m1_m2_k, @@ -489,10 +491,10 @@ struct BlockwiseGemmXdlops_pipeline_bpreshuffle_v1_mx::type; + xdlops_gemm.K1PerXdlops / APackedSize>::type; using mfma_input_type_b = typename vector_type::type; + xdlops_gemm.K1PerXdlops / BPackedSize>::type; constexpr index_t c_offset = c_thread_desc_.CalculateOffset(make_tuple(m0, n0, 0)); @@ -516,7 +518,7 @@ struct BlockwiseGemmXdlops_pipeline_bpreshuffle_v1_mx{}([&](auto m0) { - static_for<0, xdlops_gemm.K1PerXdlops / KThreadChunk, 1>{}( + static_for<0, xdlops_gemm.K1PerXdlops / APackedSize / KThreadChunk, 1>{}( [&](auto chunk) { constexpr auto a_k_step_chunk = k_step + chunk * KThreadChunk * @@ -645,9 +647,9 @@ struct BlockwiseGemmXdlops_pipeline_bpreshuffle_v1_mx::type; + typename vector_type::type; using mfma_input_type_b = - typename vector_type::type; + typename vector_type::type; constexpr index_t c_offset = c_thread_desc_.CalculateOffset(make_tuple(m0, n0, 0)); @@ -671,7 +673,7 @@ struct BlockwiseGemmXdlops_pipeline_bpreshuffle_v1_mx{}([&](auto m0) { - static_for<0, xdlops_gemm.K1PerXdlops / KThreadChunk, 1>{}([&](auto chunk) { + static_for<0, xdlops_gemm.K1PerXdlops / APackedSize / KThreadChunk, 1>{}([&](auto chunk) { constexpr auto a_k_step_chunk = k_step + chunk * KThreadChunk * xdlops_gemm.mfma_instr.num_input_blks; a_thread_copy_.Run(a_block_desc_m0_m1_m2_k, @@ -717,9 +719,9 @@ struct BlockwiseGemmXdlops_pipeline_bpreshuffle_v1_mx::type; + typename vector_type::type; using mfma_input_type_b = - typename vector_type::type; + typename vector_type::type; constexpr index_t c_offset = c_thread_desc_.CalculateOffset(make_tuple(m0, n0, 0)); @@ -770,9 +772,9 @@ struct BlockwiseGemmXdlops_pipeline_bpreshuffle_v1_mx::type; + typename vector_type::type; using mfma_input_type_b = - typename vector_type::type; + typename vector_type::type; constexpr index_t c_offset = c_thread_desc_.CalculateOffset(make_tuple(m0, n0, 0));