From 256aec203e8bf95226d01774f05ebb46e2d2b06d Mon Sep 17 00:00:00 2001 From: ozturkosu Date: Sun, 22 Jun 2025 19:29:34 -0400 Subject: [PATCH] print hipOccupancyDefined Grid size for best in ckProfiler --- .../gpu/device/device_base.hpp | 13 ++--- .../device_gemm_xdl_cshuffle_streamk_v3.hpp | 17 +++--- .../gridwise_gemm_xdl_cshuffle_streamk_v3.hpp | 15 ++---- .../profile_gemm_universal_streamk_impl.hpp | 53 ++++++++----------- 4 files changed, 37 insertions(+), 61 deletions(-) mode change 100755 => 100644 include/ck/tensor_operation/gpu/device/device_base.hpp mode change 100755 => 100644 include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle_streamk_v3.hpp mode change 100755 => 100644 include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_streamk_v3.hpp mode change 100755 => 100644 profiler/include/profiler/profile_gemm_universal_streamk_impl.hpp diff --git a/include/ck/tensor_operation/gpu/device/device_base.hpp b/include/ck/tensor_operation/gpu/device/device_base.hpp old mode 100755 new mode 100644 index c023e1ffa3..1db4619d13 --- a/include/ck/tensor_operation/gpu/device/device_base.hpp +++ b/include/ck/tensor_operation/gpu/device/device_base.hpp @@ -56,17 +56,10 @@ struct BaseArgument virtual ~BaseArgument() {} void* p_workspace_ = nullptr; - - virtual dim3 GetLaunchGridDims() const - { - return dim3{0, 0, 0}; - } - - virtual bool HasLaunchGridDims() const - { - return false; - } + virtual dim3 GetLaunchGridDims() const { return dim3{0, 0, 0}; } + + virtual bool HasLaunchGridDims() const { return false; } }; struct BaseInvoker diff --git a/include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle_streamk_v3.hpp b/include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle_streamk_v3.hpp old mode 100755 new mode 100644 index 086ee43c2d..e75b7447a2 --- a/include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle_streamk_v3.hpp +++ b/include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle_streamk_v3.hpp @@ -168,17 +168,18 @@ struct DeviceGemm_Xdl_CShuffle_Streamk_V3 : public DeviceGemm_Streamk_V2 0; - } + dim3 GetLaunchGridDims() const override { return launch_grid_dims_; } + bool HasLaunchGridDims() const override { return launch_grid_dims_.x > 0; } }; struct SplitKBatchOffset diff --git a/profiler/include/profiler/profile_gemm_universal_streamk_impl.hpp b/profiler/include/profiler/profile_gemm_universal_streamk_impl.hpp old mode 100755 new mode 100644 index faa65023ad..b2f510f7f1 --- a/profiler/include/profiler/profile_gemm_universal_streamk_impl.hpp +++ b/profiler/include/profiler/profile_gemm_universal_streamk_impl.hpp @@ -251,14 +251,16 @@ bool profile_gemm_universal_streamk_impl(int do_verification, float gb_per_sec = num_btype / 1.E6 / ave_time; // // const auto actual_launch_grid_dims = argument_ptr->GetLaunchGridDims(); - // const auto* typed_argument_ptr = dynamic_cast::Argument*>(argument_ptr) + // const auto* typed_argument_ptr = dynamic_cast::Argument*>(argument_ptr) // Get actual launch grid dims from argument dim3 actual_launch_grid_dims = argument_ptr->GetLaunchGridDims(); std::cout << "Perf: " << std::setw(10) << ave_time << " ms, " << tflops - << " TFlops, " << gb_per_sec << " GB/s, " << op_name << ", Grid_size "; - + << " TFlops, " << gb_per_sec << " GB/s, " << op_name + << ", Grid_size "; + if(argument_ptr->HasLaunchGridDims() && actual_launch_grid_dims.x > 0) { std::cout << actual_launch_grid_dims.x; @@ -267,31 +269,9 @@ bool profile_gemm_universal_streamk_impl(int do_verification, { std::cout << grid_size_curr; } - + std::cout << ", streamk selection strategy " << streamk_sel_curr << std::endl; - // if (typed_argument_ptr) - // { - // const auto actual_launch_grid_dims = typed_argument_ptr->GetLaunchGridDims(); - // std::cout << "Actual Grid Dimensions: " << actual_launch_grid_dims.x << "x" - // << actual_launch_grid_dims.y << "x" << actual_launch_grid_dims.z << std::endl; - // } - // else - // { - // std::cerr << "Error: Failed to cast argument_ptr to the correct type." << std::endl; - // } - - // std::cout << "Perf: " << std::setw(10) << ave_time << " ms, " << tflops - // << " TFlops, " << gb_per_sec << " GB/s, " << op_name << ", Grid_size " - // << actual_launch_grid_dims.x // Use the x-dimension of the actual launch grid - // << ", streamk selection strategy " - // << streamk_sel_curr << std::endl; - - // std::cout << "Perf: " << std::setw(10) << ave_time << " ms, " << tflops - // << " TFlops, " << gb_per_sec << " GB/s, " << op_name << ", Grid_size " - // << grid_size_curr << ", streamk selection strategy" - // << streamk_sel_curr << std::endl; - #if defined CK_ENABLE_FP8 // set softer tolerances for fp8 if constexpr(is_same_v || is_same_v || @@ -313,11 +293,22 @@ bool profile_gemm_universal_streamk_impl(int do_verification, if(tflops > best_tflops) { - best_op_name = op_name; - best_tflops = tflops; - best_ave_time = ave_time; - best_gb_per_sec = gb_per_sec; - best_grid_size = grid_size_curr; + best_op_name = op_name; + best_tflops = tflops; + best_ave_time = ave_time; + best_gb_per_sec = gb_per_sec; + + best_grid_size = grid_size_curr; + + if(argument_ptr->HasLaunchGridDims() && actual_launch_grid_dims.x > 0) + { + best_grid_size = actual_launch_grid_dims.x; + } + else + { + best_grid_size = grid_size_curr; + } + best_streamk_sel = streamk_sel_curr; } }