diff --git a/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_b_preshuffle_gufusion_v3.hpp b/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_b_preshuffle_gufusion_v3.hpp index df577b11fc..c2fad6132f 100644 --- a/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_b_preshuffle_gufusion_v3.hpp +++ b/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_b_preshuffle_gufusion_v3.hpp @@ -535,7 +535,7 @@ struct BlockwiseGemmXdlops_pipeline_bpreshuffle_gufusion_v3{}([&](auto k0) { static_for<0, KGroup, 1>{}([&](auto kg0) { a_thread_copy_.Run(a_block_desc_m0_m1_m2_k0_k1_k2, - make_tuple(m0, I0, I0, Number{}, I0, I0), + make_tuple(m0, I0, I0, Number{}, I0, I0), a_block_buf.At(I0), a_thread_desc_, make_tuple(m0, I0, I0, k0, I0, Number{}), @@ -631,7 +631,7 @@ struct BlockwiseGemmXdlops_pipeline_bpreshuffle_gufusion_v3{}, I0, I0, - Number{}, + Number{}, I0, I0), a_block_buf.At(local_read_buf), @@ -657,7 +657,7 @@ struct BlockwiseGemmXdlops_pipeline_bpreshuffle_gufusion_v3{}, I0, I0, - Number{}, + Number{}, I0, I0), a_block_buf.At(local_read_buf), @@ -683,7 +683,7 @@ struct BlockwiseGemmXdlops_pipeline_bpreshuffle_gufusion_v3{}, I0, I0, - Number{}, + Number{}, I0, I0), a_block_buf.At(mfma_reg_buf), @@ -700,8 +700,8 @@ struct BlockwiseGemmXdlops_pipeline_bpreshuffle_gufusion_v3{}, I0, I0, - Number{}, + Number{}, I0, I0), a_block_buf.At(I1), @@ -791,7 +791,7 @@ struct BlockwiseGemmXdlops_pipeline_bpreshuffle_gufusion_v3{}, I0, I0, - Number{}, + Number{}, I0, I0), a_block_buf.At(I1), @@ -811,7 +811,7 @@ struct BlockwiseGemmXdlops_pipeline_bpreshuffle_gufusion_v3{}, I0, I0, - Number{}, + Number{}, I0, I0), a_block_buf.At(I0), @@ -868,7 +868,7 @@ struct BlockwiseGemmXdlops_pipeline_bpreshuffle_gufusion_v3{}, I0, I0, Number{}, I0, I0), + Number{}, I0, I0, Number{}, I0, I0), a_block_buf.At(I1), a_thread_desc_, make_tuple(Number<(m0 + 2 + HotloopLocalBufSwitch) % 2>{}, @@ -930,7 +930,7 @@ struct BlockwiseGemmXdlops_pipeline_bpreshuffle_gufusion_v3{}, I0, I0, Number{}, I0, I0), + Number{}, I0, I0, Number{}, I0, I0), a_block_buf.At(I0), a_thread_desc_, make_tuple( diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_moe_gemm.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_moe_gemm.hpp index d963980f26..7b350952a6 100644 --- a/include/ck/tensor_operation/gpu/grid/gridwise_moe_gemm.hpp +++ b/include/ck/tensor_operation/gpu/grid/gridwise_moe_gemm.hpp @@ -190,6 +190,7 @@ struct GridwiseMoeGemm mfma_selector::GetKPerXdlops() / mfma_selector::GetK1PerXdlops(); static constexpr index_t KGroup = mfma_selector::selected_mfma.k_per_blk == 32 ? 2 : 1; + // static_assert(KGroup == 2, ""); static constexpr index_t KRepeat = KPerBlock / KLane / (KPack / KGroup); static constexpr index_t NLane = NPerXdl; static constexpr index_t NWave = NPerBlock / NPerXdl / NXdlPerWave; @@ -1349,7 +1350,7 @@ struct GridwiseMoeGemm make_multi_index(n_block_data_idx_on_grid, get_warp_local_1d_id() % NWave, 0, - KPack * (get_thread_local_1d_id() % warpSize))); + KPack / KGroup * (get_thread_local_1d_id() % warpSize))); blockwise_gemm_pipeline.template Run( a_grid_desc_ak0_m_ak1, a_block_desc_ak0_m_ak1, @@ -2064,7 +2065,7 @@ struct GridwiseMoeGemm make_multi_index(n_block_data_idx_on_grid, get_warp_local_1d_id() % NWave, 0, - KPack * (get_thread_local_1d_id() % warpSize))); + KPack / KGroup * (get_thread_local_1d_id() % warpSize))); blockwise_gemm_pipeline.template Run( a_grid_desc_ak0_m_ak1, a_block_desc_ak0_m_ak1,