From d60d23ea8e97ee202f175dc920e41f663950e496 Mon Sep 17 00:00:00 2001 From: aska-0096 Date: Wed, 19 Mar 2025 12:57:16 +0000 Subject: [PATCH] v1 performance debugging --- ...line_xdlops_blockscale_b_preshuffle_v1.hpp | 49 ++++++++++++++++++- ...xdl_cshuffle_v3_blockscale_bpreshuffle.hpp | 5 +- 2 files changed, 48 insertions(+), 6 deletions(-) diff --git a/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_blockscale_b_preshuffle_v1.hpp b/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_blockscale_b_preshuffle_v1.hpp index c2167a3db4..dde2f8d09d 100644 --- a/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_blockscale_b_preshuffle_v1.hpp +++ b/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_blockscale_b_preshuffle_v1.hpp @@ -200,7 +200,7 @@ struct BlockwiseGemmXdlops_pipeline_blockscale_bpreshuffle_v1{}([&](auto i) { __builtin_amdgcn_sched_group_barrier(0x008, 1, 0); // MFMA @@ -243,6 +243,50 @@ struct BlockwiseGemmXdlops_pipeline_blockscale_bpreshuffle_v1{}([&](auto i) { + __builtin_amdgcn_sched_group_barrier(0x008, 1, 0); // MFMA + __builtin_amdgcn_sched_group_barrier(0x020, 1, 0); // VMEM read + /* Judging issue v_pk_fma */ + if constexpr((i + 1) % num_mfma_per_kscaleblock == 0) + { + __builtin_amdgcn_sched_group_barrier( + 0x800, num_pk_fma_per_kscaleblock, 0); // PK_FMA + } + }); + + // A global + static_for<0, num_buffer_load_inst_a, 1>{}([&](auto i) { + __builtin_amdgcn_sched_group_barrier(0x008, 1, 0); // MFMA + __builtin_amdgcn_sched_group_barrier(0x200, 1, 0); // DS write + if constexpr((num_buffer_load_inst_b + 2 * i + 1) % num_mfma_per_kscaleblock == 0) + { + __builtin_amdgcn_sched_group_barrier( + 0x800, num_pk_fma_per_kscaleblock, 0); // PK_FMA + } + __builtin_amdgcn_sched_group_barrier(0x008, 1, 0); // MFMA + __builtin_amdgcn_sched_group_barrier(0x020, 1, 0); // VMEM read + if constexpr((num_buffer_load_inst_b + 2 * i + 2) % num_mfma_per_kscaleblock == 0) + { + __builtin_amdgcn_sched_group_barrier( + 0x800, num_pk_fma_per_kscaleblock, 0); // PK_FMA + } + }); + + // A local + static_for<0, num_ds_read_inst_a / 2, 1>{}([&](auto i) { + __builtin_amdgcn_sched_group_barrier(0x008, 1, 0); // MFMA + __builtin_amdgcn_sched_group_barrier(0x100, 2, 0); // DS read + if constexpr((num_buffer_load_inst_b + 2 * num_buffer_load_inst_a + i + 1) % + num_mfma_per_kscaleblock == + 0) + { + __builtin_amdgcn_sched_group_barrier( + 0x800, num_pk_fma_per_kscaleblock, 0); // PK_FMA + } + }); +#endif } template {}([&](auto m0) { a_scale_thread_copy.Run(a_scale_grid_desc, @@ -364,6 +407,8 @@ struct BlockwiseGemmXdlops_pipeline_blockscale_bpreshuffle_v1{}); constexpr auto num_scale_m_block = CScaleThreadDesc{}.GetLength(Number<1>{}); constexpr auto num_scale_n_block = CScaleThreadDesc{}.GetLength(Number<2>{}); diff --git a/include/ck/tensor_operation/gpu/device/impl/device_gemm_multiple_d_xdl_cshuffle_v3_blockscale_bpreshuffle.hpp b/include/ck/tensor_operation/gpu/device/impl/device_gemm_multiple_d_xdl_cshuffle_v3_blockscale_bpreshuffle.hpp index a412b12756..dd6d6ba316 100644 --- a/include/ck/tensor_operation/gpu/device/impl/device_gemm_multiple_d_xdl_cshuffle_v3_blockscale_bpreshuffle.hpp +++ b/include/ck/tensor_operation/gpu/device/impl/device_gemm_multiple_d_xdl_cshuffle_v3_blockscale_bpreshuffle.hpp @@ -233,11 +233,8 @@ struct DeviceGemmMultiD_BlockScale_Xdl_CShuffle_V3_BPreshuffle } }; + // unconditional 2 to remove agpr usage constexpr index_t minimum_occupancy = 2; - // (BlkGemmPipeSched == BlockGemmPipelineScheduler::Intrawave && - // MPerBlock * NPerBlock / BlockSize > 64) - // ? 1 - // : 2; if(has_main_k_block_loop) {