From 5d7a0487f89251da693df6111fddfa8b06bf44bf Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Ville=20Pietil=C3=A4?= <>
Date: Mon, 30 Jun 2025 14:20:10 +0000
Subject: [PATCH] Refactor conv profiler to produce statistics for analysing
 split-K autodeduction performance.

---
 ...e_grouped_conv_bwd_weight_xdl_cshuffle.hpp |   3 +-
 ...rouped_conv_bwd_weight_xdl_cshuffle_v3.hpp |   3 +-
 .../gpu/device/impl/split_k_arg.hpp           |   4 +-
 .../gpu/device/impl/split_k_utils.hpp         |   8 +
 .../profile_grouped_conv_bwd_weight_impl.hpp  | 257 +++++++++---------
 5 files changed, 135 insertions(+), 140 deletions(-)

diff --git a/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_weight_xdl_cshuffle.hpp b/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_weight_xdl_cshuffle.hpp
index 5ab2f2e36c..069308e597 100644
--- a/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_weight_xdl_cshuffle.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_weight_xdl_cshuffle.hpp
@@ -566,8 +566,7 @@ struct DeviceGroupedConvBwdWeight_Xdl_CShuffle
                     : get_optimized_k_batch_value(max_occupancy.value_, grid_size, k_grid_size);
 
                 data_type_ = typeid(ABDataType).name();
-                arithmetic_intensity_ = (2.0 * k_dim_size_ * m_dim_size_ * n_dim_size_) /
-                                        ((m_dim_size_ * k_dim_size_ + k_dim_size_ * n_dim_size_ + m_dim_size_ * n_dim_size_) * sizeof(ABDataType));
+                arithmetic_intensity_ = calculate_arithmetic_intensity(m_dim_size_, n_dim_size_, k_dim_size_, sizeof(ABDataType));
                 
                 if (ck::EnvIsEnabled(CK_ENV(CK_LOGGING)))
                 {
diff --git a/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_weight_xdl_cshuffle_v3.hpp b/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_weight_xdl_cshuffle_v3.hpp
index a4699a355a..da7b97cdb8 100644
--- a/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_weight_xdl_cshuffle_v3.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_weight_xdl_cshuffle_v3.hpp
@@ -535,8 +535,7 @@ struct DeviceGroupedConvBwdWeight_Xdl_CShuffleV3
                     : get_optimized_k_batch_value(max_occupancy.value_, grid_size, k_grid_size);
 
                 data_type_ = typeid(ABDataType).name();
-                arithmetic_intensity_ = (2.0 * k_dim_size_ * m_dim_size_ * n_dim_size_) /
-                                        ((m_dim_size_ * k_dim_size_ + k_dim_size_ * n_dim_size_ + m_dim_size_ * n_dim_size_) * sizeof(ABDataType));
+                arithmetic_intensity_ = calculate_arithmetic_intensity(m_dim_size_, n_dim_size_, k_dim_size_, sizeof(ABDataType));
 
                 // For small GemmK size, cap the max value of the k_batch.
                 const auto k_batch_max = static_cast<index_t>((k_dim_size_ - 1) / K0PerBlock);
diff --git a/include/ck/tensor_operation/gpu/device/impl/split_k_arg.hpp b/include/ck/tensor_operation/gpu/device/impl/split_k_arg.hpp
index 5624b0d92b..b14c9782ec 100644
--- a/include/ck/tensor_operation/gpu/device/impl/split_k_arg.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/split_k_arg.hpp
@@ -14,14 +14,14 @@ struct ArgumentSplitK
   index_t k_dim_size() const { return k_dim_size_; }
   index_t m_dim_size() const { return m_dim_size_; }
   index_t n_dim_size() const { return n_dim_size_; }
-  index_t arithmetic_intensity() const { return arithmetic_intensity_; }
+  float arithmetic_intensity() const { return arithmetic_intensity_; }
   std::string data_type() const { return data_type_; }
   protected:
         index_t k_batch_{-1};
         index_t k_dim_size_{-1};
         index_t m_dim_size_{-1};
         index_t n_dim_size_{-1};
-        index_t arithmetic_intensity_{-1};
+        float arithmetic_intensity_{-1};
         std::string data_type_{""};
 };
 
diff --git a/include/ck/tensor_operation/gpu/device/impl/split_k_utils.hpp b/include/ck/tensor_operation/gpu/device/impl/split_k_utils.hpp
index 016a44162e..62079dbde2 100644
--- a/include/ck/tensor_operation/gpu/device/impl/split_k_utils.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/split_k_utils.hpp
@@ -130,6 +130,14 @@ inline ck::index_t get_optimized_k_batch_value(int max_occupancy, ck::index_t gr
   return best_split_k;
 }
 
+// GEMM FLOPs per byte moved. Operands are widened to double first so the index_t products cannot overflow.
+inline float calculate_arithmetic_intensity(ck::index_t gemmM, ck::index_t gemmN, ck::index_t gemmK, float bytes_per_element)
+{
+  const double m = gemmM, n = gemmN, k = gemmK;
+  const double bytes_moved = bytes_per_element * (m * k + k * n + m * n);
+  return static_cast<float>((2.0 * m * n * k) / bytes_moved);
+}
+
 } // namespace device
 } // namespace tensor_operation
 } // namespace ck
diff --git a/profiler/include/profiler/profile_grouped_conv_bwd_weight_impl.hpp b/profiler/include/profiler/profile_grouped_conv_bwd_weight_impl.hpp
index 4cf674f021..dedd7ba8e3 100644
--- a/profiler/include/profiler/profile_grouped_conv_bwd_weight_impl.hpp
+++ b/profiler/include/profiler/profile_grouped_conv_bwd_weight_impl.hpp
@@ -32,20 +32,37 @@ CK_DECLARE_ENV_VAR_STR(CK_PROFILER_OUTPUT_FILE)
 namespace ck {
 namespace profiler {
 
+using SplitKStrategy = ck::tensor_operation::device::SplitKStrategy;
+using ParamsSplitK = ck::tensor_operation::device::ParamsSplitK;
+
+struct BestPerformance // Best result recorded for one split-K strategy; defaults mean "nothing recorded yet".
+{
+    std::string op_name_{""};                            // name of the op that achieved the result
+    float avg_time_{std::numeric_limits<float>::max()};  // average time reported for that run
+    float tflops_{std::numeric_limits<float>::lowest()}; // lowest(), not min(): min() is the smallest positive float
+    ck::index_t split_k_value_{0};                       // split-K value used by that run
+};
+
 struct PerfResults
 {
-    using SplitKStrategy = ck::tensor_operation::device::SplitKStrategy;
-    using ParamsSplitK = ck::tensor_operation::device::ParamsSplitK;
+    // Best performance for each split-K strategy
+    std::map<SplitKStrategy, BestPerformance> best_performance_{};
 
-    void update_best_occupancy_split_k(const std::string& op_name, float avg_time, float tflops, ck::index_t split_k_arg, SplitKStrategy strategy)
+    // GEMM problem parameters
+    ck::index_t m_dim_size_{-1};
+    ck::index_t n_dim_size_{-1};
+    ck::index_t k_dim_size_{-1};
+    float arithmetic_intensity_{0.0f};
+    std::string data_type_{""};
+
+    std::vector<std::tuple<std::string, ck::index_t, SplitKStrategy, float>> ranking_;
+
+    void update_best_perf(const std::string& op_name, float avg_time, float tflops, ck::index_t split_k_arg, SplitKStrategy strategy)
     {
-        if(tflops > best_occupancy_split_k_tflops_)
+        const auto& current_best_perf = best_performance_[strategy];
+        if(tflops > current_best_perf.tflops_)
         {
-            best_occupancy_split_k_op_name_    = op_name;
-            best_occupancy_split_k_avg_time_   = avg_time;
-            best_occupancy_split_k_tflops_     = tflops;
-            best_occupancy_split_k_value_      = split_k_arg;
-            best_occupancy_split_k_strategy_   = strategy;
+            best_performance_[strategy] = {op_name, avg_time, tflops, split_k_arg};
         }
 
         ranking_.emplace_back(op_name, split_k_arg, strategy, tflops);
@@ -53,21 +70,6 @@ struct PerfResults
                   [](const auto& a, const auto& b) { return std::get<3>(a) > std::get<3>(b); });
     };
 
-    void update_fixed_split_k(const std::string& op_name, float avg_time, float tflops, ck::index_t split_k_arg)
-    {
-        if (tflops > fixed_split_k_tflops_)
-        {
-            fixed_split_k_op_name_    = op_name;
-            fixed_split_k_avg_time_   = avg_time;
-            fixed_split_k_tflops_     = tflops;
-            fixed_split_k_value_      = split_k_arg;
-        }
-
-        ranking_.emplace_back(op_name, split_k_arg, SplitKStrategy::FixedSplitK, tflops);
-        std::sort(ranking_.begin(), ranking_.end(),
-                  [](const auto& a, const auto& b) { return std::get<3>(a) > std::get<3>(b); });
-    };
-
     static std::string split_k_str(const ParamsSplitK split_k_params, ck::index_t split_k_arg)
     {
         switch (split_k_params.strategy_)
@@ -86,23 +88,7 @@ struct PerfResults
         }
     };
  
-    std::string print_fixed_split_k() const
-    {
-        ck::index_t rank, total_num;
-        std::tie(rank, total_num) = get_ranking(fixed_split_k_op_name_, fixed_split_k_value_);
-        std::stringstream ss;
-        ss << "\nFIXED SPLIT-K RESULTS"
-           << "\n========================";
-        ss << "\nname: " << fixed_split_k_op_name_ 
-            << "\navg_time: " << fixed_split_k_avg_time_
-            << "\ntflops: " << fixed_split_k_tflops_
-            << "\nGEMM-K: " << k_dim_size_
-            << "\nSplitK " << fixed_split_k_value_
-            << "\nRanking: " << rank << " / " << total_num;
-        return ss.str();
-    }
-
-    std::string print_best_occupancy_split_k() const
+    std::string print_best_performance() const
     {
         const auto& to_string = [](const SplitKStrategy strategy) {
             switch (strategy)
@@ -118,22 +104,36 @@ struct PerfResults
             }
         }; 
 
-        ck::index_t rank, total_num;
-        std::tie(rank, total_num) = get_ranking(best_occupancy_split_k_op_name_, best_occupancy_split_k_value_, best_occupancy_split_k_strategy_);
         std::stringstream ss;
-        ss << "\nBEST OCCUPANCY SPLIT-K RESULTS"
+        ss << "\nProblem Parameters"
            << "\n========================";
-        ss << "\nname: " << best_occupancy_split_k_op_name_ 
-            << "\navg_time: " << best_occupancy_split_k_avg_time_
-            << "\ntflops: " << best_occupancy_split_k_tflops_
-            << "\nGEMM-K: " << k_dim_size_
-            << "\nStrategy: " << to_string(best_occupancy_split_k_strategy_)
-            << "\nSplitK " << best_occupancy_split_k_value_
-            << "\nRanking: " << rank << " / " << total_num;
+        ss << "\nm_dim_size: " << m_dim_size_
+           << "\nn_dim_size: " << n_dim_size_
+           << "\nk_dim_size: " << k_dim_size_
+           << "\narithmetic_intensity: " << arithmetic_intensity_
+           << "\ndata_type: " << data_type_;
+        for (const auto& strategy : {SplitKStrategy::FixedSplitK, SplitKStrategy::BestOccupancy, SplitKStrategy::Optimized})
+        {
+            const auto& best_perf = best_performance_.find(strategy);
+            if (best_perf != best_performance_.end())
+            {
+                ck::index_t rank, total_num;
+                std::tie(rank, total_num) = get_ranking(best_perf->second.op_name_, best_perf->second.split_k_value_, strategy);
+                
+                ss << "\n\nBEST PERFORMANCE RESULTS (" << to_string(strategy) << ")"
+                   << "\n========================";
+                ss << "\nname: " << best_perf->second.op_name_ 
+                    << "\navg_time: " << best_perf->second.avg_time_
+                    << "\ntflops: " << best_perf->second.tflops_
+                    << "\nSplitK: " << best_perf->second.split_k_value_
+                    << "\nRanking: " << rank << " / " << total_num;
+            }
+        }
+
         return ss.str();
     }
 
-    std::tuple<size_t, size_t> get_ranking(const std::string& op_name, ck::index_t split_k, SplitKStrategy strategy = SplitKStrategy::FixedSplitK) const
+    std::tuple<size_t, size_t> get_ranking(const std::string& op_name, ck::index_t split_k, SplitKStrategy strategy) const
     {
         auto it = std::find_if(ranking_.begin(), ranking_.end(),
                                [&](const auto& entry) {
@@ -148,8 +148,24 @@ struct PerfResults
         return std::make_tuple(ranking_.size()+1, ranking_.size());
     };
 
-    void set_common_params(ck::index_t m_dim_size, ck::index_t n_dim_size, ck::index_t k_dim_size, float arithmetic_intensity)
+    void set_common_params(ck::index_t m_dim_size, ck::index_t n_dim_size, ck::index_t k_dim_size, float arithmetic_intensity, const std::string& data_type)
     {
+        if (data_type_.empty())
+        {
+            data_type_ = data_type;
+        }
+        else if (data_type_ != data_type)
+        {
+            std::cerr << "Error: data_type cannot be set multiple times. Old value " << data_type_ << ". New value " << data_type << std::endl;
+            exit(EXIT_FAILURE);
+        }
+
+        if (m_dim_size <= 0 || n_dim_size <= 0 || k_dim_size <= 0)
+        {
+            std::cerr << "Error: m_dim_size, n_dim_size, and k_dim_size must be positive integers." << std::endl;
+            exit(EXIT_FAILURE);
+        }
+
         if (m_dim_size_ > 0 && m_dim_size != m_dim_size_)
         {
             std::cerr << "Error: m_dim_size cannot be set multiple times. Old value " << m_dim_size_ << ". New value " << m_dim_size << std::endl;
@@ -178,35 +194,19 @@ struct PerfResults
             exit(EXIT_FAILURE);
         }
         arithmetic_intensity_ = arithmetic_intensity;
+
+        if (!data_type_.empty() && data_type != data_type_)
+        {
+            std::cerr << "Error: data_type cannot be set multiple times. Old value " << data_type_ << ". New value " << data_type << std::endl;
+            exit(EXIT_FAILURE);
+        }
+        data_type_ = data_type;
     }
-
-    // Fixed split-K results
-    std::string fixed_split_k_op_name_{""};
-    float fixed_split_k_avg_time_{std::numeric_limits<float>::max()};
-    float fixed_split_k_tflops_{std::numeric_limits<float>::min()};
-    ck::index_t fixed_split_k_value_{0};
-
-    // Best occupancy split-K results
-    std::string best_occupancy_split_k_op_name_{""};
-    float best_occupancy_split_k_avg_time_{std::numeric_limits<float>::max()};
-    float best_occupancy_split_k_tflops_{std::numeric_limits<float>::min()};
-    ck::index_t best_occupancy_split_k_value_{0};
-    SplitKStrategy best_occupancy_split_k_strategy_;
-
-    // GEMM problem parameters
-    ck::index_t m_dim_size_{-1};
-    ck::index_t n_dim_size_{-1};
-    ck::index_t k_dim_size_{-1};
-    float arithmetic_intensity_{0.0f};
-
-    std::vector<std::tuple<std::string, ck::index_t, SplitKStrategy, float>> ranking_;
 };
 
 void write_perf_results_to_file(const PerfResults& perf_results_global, 
                                 const std::vector<PerfResults>& perf_results_list)
 {
-    using SplitKStrategy = ck::tensor_operation::device::SplitKStrategy;
-
     const auto& results_file = ck::EnvGetString(CK_ENV(CK_PROFILER_OUTPUT_FILE));
 
     if (results_file.empty())
@@ -231,25 +231,35 @@ void write_perf_results_to_file(const PerfResults& perf_results_global,
     };
 
     const auto& write_to_file = [&](const PerfResults res, std::ofstream& file, bool only_one_op = false) {
-        const auto gemm_k_size = res.k_dim_size_ > 0 ? std::to_string(res.k_dim_size_) : "N/A";
-        ck::index_t rank_fixed_split_k, rank_best_occupancy_split_k, total_num;
-        std::tie(rank_fixed_split_k, total_num) = res.get_ranking(res.fixed_split_k_op_name_, res.fixed_split_k_value_);
-        std::tie(rank_best_occupancy_split_k, std::ignore) = 
-            res.get_ranking(res.best_occupancy_split_k_op_name_, res.best_occupancy_split_k_value_, res.best_occupancy_split_k_strategy_);
 
-        file << res.fixed_split_k_op_name_ << separator
-             << res.fixed_split_k_avg_time_ << separator
-             << res.fixed_split_k_value_ << separator
-             << rank_fixed_split_k << separator;
-        if (!only_one_op) 
+        // One column group per strategy with a recorded best result; total_num stays -1 if there is none.
+        ck::index_t total_num = -1;
+        bool write_op_name = true;
+        for (const auto& strategy : {SplitKStrategy::FixedSplitK, SplitKStrategy::BestOccupancy, SplitKStrategy::Optimized})
         {
-            file << res.best_occupancy_split_k_op_name_ << separator;
+            const auto best_perf = res.best_performance_.find(strategy);
+            if (best_perf != res.best_performance_.end())
+            {
+                const BestPerformance& perf = best_perf->second; // borrow the map entry instead of copying it
+                ck::index_t rank;
+                std::tie(rank, total_num) = res.get_ranking(perf.op_name_, perf.split_k_value_, strategy);
+                if (write_op_name)
+                {
+                    file << perf.op_name_ << separator;
+                    if (only_one_op)
+                    {
+                        // If only one op is written, we do not need to write the op name again
+                        write_op_name = false;
+                    }
+                }
+                file << perf.avg_time_ << separator
+                     << perf.tflops_ << separator
+                     << perf.split_k_value_ << separator
+                     << rank << separator
+                     << to_string(strategy) << separator;
+            }
         }
-        file << res.best_occupancy_split_k_avg_time_ << separator
-             << res.best_occupancy_split_k_value_ << separator
-             << to_string(res.best_occupancy_split_k_strategy_) << separator
-             << rank_best_occupancy_split_k << separator
-             << total_num;
+        file << total_num;
     };
 
     if(!results_file.empty())
@@ -261,7 +271,8 @@ void write_perf_results_to_file(const PerfResults& perf_results_global,
             file << perf_results_global.m_dim_size_ << separator
                  << perf_results_global.n_dim_size_ << separator
                  << perf_results_global.k_dim_size_ << separator
-                 << perf_results_global.arithmetic_intensity_ << separator;
+                 << perf_results_global.arithmetic_intensity_ << separator
+                 << perf_results_global.data_type_ << separator;
 
             // First the global results
             write_to_file(perf_results_global, file);
@@ -342,8 +353,8 @@ bool profile_grouped_conv_bwd_weight_impl(int do_verification,
     using InElementOp  = ck::tensor_operation::element_wise::PassThrough;
     using WeiElementOp = ck::tensor_operation::element_wise::PassThrough;
     using OutElementOp = ck::tensor_operation::element_wise::PassThrough;
-    using SplitKStrategy = ck::tensor_operation::device::SplitKStrategy;
-    using ParamsSplitK = ck::tensor_operation::device::ParamsSplitK;
+    // SplitKStrategy and ParamsSplitK are aliased once at ck::profiler namespace scope,
+    // so the function-local aliases that used to be declared here are no longer needed.
 
     const auto in_element_op  = InElementOp{};
     const auto wei_element_op = WeiElementOp{};
@@ -465,7 +476,7 @@ bool profile_grouped_conv_bwd_weight_impl(int do_verification,
     range_copy(conv_param.input_right_pads_, begin(input_right_pads));
 
     std::vector<ck::index_t> fixed_split_k_list = {1, 2, 4, 8, 16, 32, 64, 128};
-    std::vector<SplitKStrategy> best_occupancy_list = {SplitKStrategy::BestOccupancy, SplitKStrategy::Optimized};
+    std::vector<SplitKStrategy> best_occupancy_list = {SplitKStrategy::BestOccupancy /*, SplitKStrategy::Optimized*/};
     bool profile_all = true;
     if(split_k != "all")
     {
@@ -542,10 +553,11 @@ bool profile_grouped_conv_bwd_weight_impl(int do_verification,
                 const auto m_dim_size = split_k_arg->m_dim_size();
                 const auto n_dim_size = split_k_arg->n_dim_size();
                 const auto arithmetic_intensity = split_k_arg->arithmetic_intensity();
+                const auto& data_type = split_k_arg->data_type();
                 if (k_dim_size > 0)
                 {
-                    perf_results_local.set_common_params(m_dim_size, n_dim_size, k_dim_size, arithmetic_intensity);
-                    perf_results_global.set_common_params(m_dim_size, n_dim_size, k_dim_size, arithmetic_intensity);
+                    perf_results_local.set_common_params(m_dim_size, n_dim_size, k_dim_size, arithmetic_intensity, data_type);
+                    perf_results_global.set_common_params(m_dim_size, n_dim_size, k_dim_size, arithmetic_intensity, data_type);
                 }
                 supports_split_k_optimization = true;
             }
@@ -566,8 +578,8 @@ bool profile_grouped_conv_bwd_weight_impl(int do_verification,
 
                 auto invoker_ptr = op_ptr->MakeInvokerPointer();
 
-                constexpr int n_warm_up = 50;
-                constexpr int n_repeat = 150;
+                constexpr int n_warm_up = 25;
+                constexpr int n_repeat = 100;
                 StreamConfig config{nullptr, time_kernel};
                 config.cold_niters_ = n_warm_up;
                 config.nrepeat_ = n_repeat;
@@ -584,38 +596,16 @@ bool profile_grouped_conv_bwd_weight_impl(int do_verification,
                           << " TFlops, " << gb_per_sec << " GB/s, " << op_name << ", SplitK "
                           << PerfResults::split_k_str(split_k_list[split_k_id], split_k_arg_value) << std::endl;
                 
-                if (split_k_list[split_k_id].strategy_ ==
-                    SplitKStrategy::BestOccupancy || split_k_list[split_k_id].strategy_ == SplitKStrategy::Optimized)
-                {
-                    const auto strategy = split_k_list[split_k_id].strategy_;
-                    
-                    perf_results_global.update_best_occupancy_split_k(
-                            op_name,
-                            avg_time, 
-                            tflops,                                                       
-                            split_k_arg_value,
-                            strategy);
-
-                    perf_results_local.update_best_occupancy_split_k(
-                            op_name,
-                            avg_time,
-                            tflops,                                                      
-                            split_k_arg_value,
-                            strategy);   
-                }
-                else 
-                {
-                    perf_results_global.update_fixed_split_k(op_name,
-                                                            avg_time,
-                                                            tflops,                                                             
-                                                            split_k_arg_value);
-
-                    perf_results_local.update_fixed_split_k(op_name,
-                                                            avg_time,
-                                                            tflops,                                                              
-                                                            split_k_arg_value);
-                }
-                
+                perf_results_local.update_best_perf(op_name,
+                                                    avg_time,
+                                                    tflops,
+                                                    split_k_arg_value,
+                                                    split_k_list[split_k_id].strategy_);
+                perf_results_global.update_best_perf(op_name,
+                                                     avg_time,
+                                                     tflops,
+                                                     split_k_arg_value,
+                                                     split_k_list[split_k_id].strategy_);
 
                 if(do_verification)
                 {
@@ -693,11 +683,10 @@ bool profile_grouped_conv_bwd_weight_impl(int do_verification,
 
     if (perf_results_list.size() > 0)
     {
-        std::cerr << perf_results_global.print_fixed_split_k() << std::endl;
+        std::cerr << perf_results_global.print_best_performance() << std::endl;
 
         if (profile_all)
         {
-            std::cerr << perf_results_global.print_best_occupancy_split_k() << std::endl;
             write_perf_results_to_file(perf_results_global, perf_results_list);
         }
     }