From 5d7a0487f89251da693df6111fddfa8b06bf44bf Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ville=20Pietil=C3=A4?= <> Date: Mon, 30 Jun 2025 14:20:10 +0000 Subject: [PATCH] Refactor conv profiler to produce statistics for analysing split-K autodeduction performance. --- ...e_grouped_conv_bwd_weight_xdl_cshuffle.hpp | 3 +- ...rouped_conv_bwd_weight_xdl_cshuffle_v3.hpp | 3 +- .../gpu/device/impl/split_k_arg.hpp | 4 +- .../gpu/device/impl/split_k_utils.hpp | 8 + .../profile_grouped_conv_bwd_weight_impl.hpp | 257 +++++++++--------- 5 files changed, 135 insertions(+), 140 deletions(-) diff --git a/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_weight_xdl_cshuffle.hpp b/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_weight_xdl_cshuffle.hpp index 5ab2f2e36c..069308e597 100644 --- a/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_weight_xdl_cshuffle.hpp +++ b/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_weight_xdl_cshuffle.hpp @@ -566,8 +566,7 @@ struct DeviceGroupedConvBwdWeight_Xdl_CShuffle : get_optimized_k_batch_value(max_occupancy.value_, grid_size, k_grid_size); data_type_ = typeid(ABDataType).name(); - arithmetic_intensity_ = (2.0 * k_dim_size_ * m_dim_size_ * n_dim_size_) / - ((m_dim_size_ * k_dim_size_ + k_dim_size_ * n_dim_size_ + m_dim_size_ * n_dim_size_) * sizeof(ABDataType)); + arithmetic_intensity_ = calculate_arithmetic_intensity(m_dim_size_, n_dim_size_, k_dim_size_, sizeof(ABDataType)); if (ck::EnvIsEnabled(CK_ENV(CK_LOGGING))) { diff --git a/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_weight_xdl_cshuffle_v3.hpp b/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_weight_xdl_cshuffle_v3.hpp index a4699a355a..da7b97cdb8 100644 --- a/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_weight_xdl_cshuffle_v3.hpp +++ b/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_weight_xdl_cshuffle_v3.hpp @@ -535,8 +535,7 @@ struct DeviceGroupedConvBwdWeight_Xdl_CShuffleV3 : get_optimized_k_batch_value(max_occupancy.value_, grid_size, k_grid_size); data_type_ = typeid(ABDataType).name(); - arithmetic_intensity_ = (2.0 * k_dim_size_ * m_dim_size_ * n_dim_size_) / - ((m_dim_size_ * k_dim_size_ + k_dim_size_ * n_dim_size_ + m_dim_size_ * n_dim_size_) * sizeof(ABDataType)); + arithmetic_intensity_ = calculate_arithmetic_intensity(m_dim_size_, n_dim_size_, k_dim_size_, sizeof(ABDataType)); // For small GemmK size, cap the max value of the k_batch. const auto k_batch_max = static_cast((k_dim_size_ - 1) / K0PerBlock); diff --git a/include/ck/tensor_operation/gpu/device/impl/split_k_arg.hpp b/include/ck/tensor_operation/gpu/device/impl/split_k_arg.hpp index 5624b0d92b..b14c9782ec 100644 --- a/include/ck/tensor_operation/gpu/device/impl/split_k_arg.hpp +++ b/include/ck/tensor_operation/gpu/device/impl/split_k_arg.hpp @@ -14,14 +14,14 @@ struct ArgumentSplitK index_t k_dim_size() const { return k_dim_size_; } index_t m_dim_size() const { return m_dim_size_; } index_t n_dim_size() const { return n_dim_size_; } - index_t arithmetic_intensity() const { return arithmetic_intensity_; } + float arithmetic_intensity() const { return arithmetic_intensity_; } std::string data_type() const { return data_type_; } protected: index_t k_batch_{-1}; index_t k_dim_size_{-1}; index_t m_dim_size_{-1}; index_t n_dim_size_{-1}; - index_t arithmetic_intensity_{-1}; + float arithmetic_intensity_{-1}; std::string data_type_{""}; }; diff --git a/include/ck/tensor_operation/gpu/device/impl/split_k_utils.hpp b/include/ck/tensor_operation/gpu/device/impl/split_k_utils.hpp index 016a44162e..62079dbde2 100644 --- a/include/ck/tensor_operation/gpu/device/impl/split_k_utils.hpp +++ b/include/ck/tensor_operation/gpu/device/impl/split_k_utils.hpp @@ -130,6 +130,14 @@ inline ck::index_t get_optimized_k_batch_value(int max_occupancy, ck::index_t gr return best_split_k; } +inline float calculate_arithmetic_intensity(ck::index_t gemmM, + ck::index_t gemmN, + ck::index_t gemmK, + float bytes_per_element) +{ + return (2.0f * gemmM * gemmN * gemmK) / (bytes_per_element * (gemmM * gemmK + gemmK * gemmN + gemmM * gemmN)); +} + } // namespace device } // namespace tensor_operation } // namespace ck diff --git a/profiler/include/profiler/profile_grouped_conv_bwd_weight_impl.hpp b/profiler/include/profiler/profile_grouped_conv_bwd_weight_impl.hpp index 4cf674f021..dedd7ba8e3 100644 --- a/profiler/include/profiler/profile_grouped_conv_bwd_weight_impl.hpp +++ b/profiler/include/profiler/profile_grouped_conv_bwd_weight_impl.hpp @@ -32,20 +32,37 @@ CK_DECLARE_ENV_VAR_STR(CK_PROFILER_OUTPUT_FILE) namespace ck { namespace profiler { +using SplitKStrategy = ck::tensor_operation::device::SplitKStrategy; +using ParamsSplitK = ck::tensor_operation::device::ParamsSplitK; + +struct BestPerformance +{ + std::string op_name_{""}; + float avg_time_{std::numeric_limits::max()}; + float tflops_{std::numeric_limits::min()}; + ck::index_t split_k_value_{0}; +}; + struct PerfResults { - using SplitKStrategy = ck::tensor_operation::device::SplitKStrategy; - using ParamsSplitK = ck::tensor_operation::device::ParamsSplitK; + // Best performance for each split-K strategy + std::map best_performance_{}; - void update_best_occupancy_split_k(const std::string& op_name, float avg_time, float tflops, ck::index_t split_k_arg, SplitKStrategy strategy) + // GEMM problem parameters + ck::index_t m_dim_size_{-1}; + ck::index_t n_dim_size_{-1}; + ck::index_t k_dim_size_{-1}; + float arithmetic_intensity_{0.0f}; + std::string data_type_{""}; + + std::vector> ranking_; + + void update_best_perf(const std::string& op_name, float avg_time, float tflops, ck::index_t split_k_arg, SplitKStrategy strategy) { - if(tflops > best_occupancy_split_k_tflops_) + const auto& current_best_perf = best_performance_[strategy]; + if(tflops > current_best_perf.tflops_) { - best_occupancy_split_k_op_name_ = op_name; - best_occupancy_split_k_avg_time_ = avg_time; - best_occupancy_split_k_tflops_ = tflops; - best_occupancy_split_k_value_ = split_k_arg; - best_occupancy_split_k_strategy_ = strategy; + best_performance_[strategy] = {op_name, avg_time, tflops, split_k_arg}; } ranking_.emplace_back(op_name, split_k_arg, strategy, tflops); @@ -53,21 +70,6 @@ struct PerfResults [](const auto& a, const auto& b) { return std::get<3>(a) > std::get<3>(b); }); }; - void update_fixed_split_k(const std::string& op_name, float avg_time, float tflops, ck::index_t split_k_arg) - { - if (tflops > fixed_split_k_tflops_) - { - fixed_split_k_op_name_ = op_name; - fixed_split_k_avg_time_ = avg_time; - fixed_split_k_tflops_ = tflops; - fixed_split_k_value_ = split_k_arg; - } - - ranking_.emplace_back(op_name, split_k_arg, SplitKStrategy::FixedSplitK, tflops); - std::sort(ranking_.begin(), ranking_.end(), - [](const auto& a, const auto& b) { return std::get<3>(a) > std::get<3>(b); }); - }; - static std::string split_k_str(const ParamsSplitK split_k_params, ck::index_t split_k_arg) { switch (split_k_params.strategy_) @@ -86,23 +88,7 @@ struct PerfResults } }; - std::string print_fixed_split_k() const - { - ck::index_t rank, total_num; - std::tie(rank, total_num) = get_ranking(fixed_split_k_op_name_, fixed_split_k_value_); - std::stringstream ss; - ss << "\nFIXED SPLIT-K RESULTS" - << "\n========================"; - ss << "\nname: " << fixed_split_k_op_name_ - << "\navg_time: " << fixed_split_k_avg_time_ - << "\ntflops: " << fixed_split_k_tflops_ - << "\nGEMM-K: " << k_dim_size_ - << "\nSplitK " << fixed_split_k_value_ - << "\nRanking: " << rank << " / " << total_num; - return ss.str(); - } - - std::string print_best_occupancy_split_k() const + std::string print_best_performance() const { const auto& to_string = [](const SplitKStrategy strategy) { switch (strategy) @@ -118,22 +104,36 @@ struct PerfResults } }; - ck::index_t rank, total_num; - std::tie(rank, total_num) = get_ranking(best_occupancy_split_k_op_name_, best_occupancy_split_k_value_, best_occupancy_split_k_strategy_); std::stringstream ss; - ss << "\nBEST OCCUPANCY SPLIT-K RESULTS" + ss << "\nProblem Parameters" << "\n========================"; - ss << "\nname: " << best_occupancy_split_k_op_name_ - << "\navg_time: " << best_occupancy_split_k_avg_time_ - << "\ntflops: " << best_occupancy_split_k_tflops_ - << "\nGEMM-K: " << k_dim_size_ - << "\nStrategy: " << to_string(best_occupancy_split_k_strategy_) - << "\nSplitK " << best_occupancy_split_k_value_ - << "\nRanking: " << rank << " / " << total_num; + ss << "\nm_dim_size: " << m_dim_size_ + << "\nn_dim_size: " << n_dim_size_ + << "\nk_dim_size: " << k_dim_size_ + << "\narithmetic_intensity: " << arithmetic_intensity_ + << "\ndata_type: " << data_type_; + for (const auto& strategy : {SplitKStrategy::FixedSplitK, SplitKStrategy::BestOccupancy, SplitKStrategy::Optimized}) + { + const auto& best_perf = best_performance_.find(strategy); + if (best_perf != best_performance_.end()) + { + ck::index_t rank, total_num; + std::tie(rank, total_num) = get_ranking(best_perf->second.op_name_, best_perf->second.split_k_value_, strategy); + + ss << "\n\nBEST PERFORMANCE RESULTS (" << to_string(strategy) << ")" + << "\n========================"; + ss << "\nname: " << best_perf->second.op_name_ + << "\navg_time: " << best_perf->second.avg_time_ + << "\ntflops: " << best_perf->second.tflops_ + << "\nSplitK: " << best_perf->second.split_k_value_ + << "\nRanking: " << rank << " / " << total_num; + } + } + return ss.str(); } - std::tuple get_ranking(const std::string& op_name, ck::index_t split_k, SplitKStrategy strategy = SplitKStrategy::FixedSplitK) const + std::tuple get_ranking(const std::string& op_name, ck::index_t split_k, SplitKStrategy strategy) const { auto it = std::find_if(ranking_.begin(), ranking_.end(), [&](const auto& entry) { @@ -148,8 +148,24 @@ struct PerfResults return std::make_tuple(ranking_.size()+1, ranking_.size()); }; - void set_common_params(ck::index_t m_dim_size, ck::index_t n_dim_size, ck::index_t k_dim_size, float arithmetic_intensity) + void set_common_params(ck::index_t m_dim_size, ck::index_t n_dim_size, ck::index_t k_dim_size, float arithmetic_intensity, const std::string& data_type) { + if (data_type_.empty()) + { + data_type_ = data_type; + } + else if (data_type_ != data_type) + { + std::cerr << "Error: data_type cannot be set multiple times. Old value " << data_type_ << ". New value " << data_type << std::endl; + exit(EXIT_FAILURE); + } + + if (m_dim_size <= 0 || n_dim_size <= 0 || k_dim_size <= 0) + { + std::cerr << "Error: m_dim_size, n_dim_size, and k_dim_size must be positive integers." << std::endl; + exit(EXIT_FAILURE); + } + if (m_dim_size_ > 0 && m_dim_size != m_dim_size_) { std::cerr << "Error: m_dim_size cannot be set multiple times. Old value " << m_dim_size_ << ". New value " << m_dim_size << std::endl; @@ -178,35 +194,19 @@ struct PerfResults exit(EXIT_FAILURE); } arithmetic_intensity_ = arithmetic_intensity; + + if (!data_type_.empty() && data_type != data_type_) + { + std::cerr << "Error: data_type cannot be set multiple times. Old value " << data_type_ << ". New value " << data_type << std::endl; + exit(EXIT_FAILURE); + } + data_type_ = data_type; } - - // Fixed split-K results - std::string fixed_split_k_op_name_{""}; - float fixed_split_k_avg_time_{std::numeric_limits::max()}; - float fixed_split_k_tflops_{std::numeric_limits::min()}; - ck::index_t fixed_split_k_value_{0}; - - // Best occupancy split-K results - std::string best_occupancy_split_k_op_name_{""}; - float best_occupancy_split_k_avg_time_{std::numeric_limits::max()}; - float best_occupancy_split_k_tflops_{std::numeric_limits::min()}; - ck::index_t best_occupancy_split_k_value_{0}; - SplitKStrategy best_occupancy_split_k_strategy_; - - // GEMM problem parameters - ck::index_t m_dim_size_{-1}; - ck::index_t n_dim_size_{-1}; - ck::index_t k_dim_size_{-1}; - float arithmetic_intensity_{0.0f}; - - std::vector> ranking_; }; void write_perf_results_to_file(const PerfResults& perf_results_global, const std::vector& perf_results_list) { - using SplitKStrategy = ck::tensor_operation::device::SplitKStrategy; - const auto& results_file = ck::EnvGetString(CK_ENV(CK_PROFILER_OUTPUT_FILE)); if (results_file.empty()) @@ -231,25 +231,35 @@ void write_perf_results_to_file(const PerfResults& perf_results_global, }; const auto& write_to_file = [&](const PerfResults res, std::ofstream& file, bool only_one_op = false) { - const auto gemm_k_size = res.k_dim_size_ > 0 ? std::to_string(res.k_dim_size_) : "N/A"; - ck::index_t rank_fixed_split_k, rank_best_occupancy_split_k, total_num; - std::tie(rank_fixed_split_k, total_num) = res.get_ranking(res.fixed_split_k_op_name_, res.fixed_split_k_value_); - std::tie(rank_best_occupancy_split_k, std::ignore) = - res.get_ranking(res.best_occupancy_split_k_op_name_, res.best_occupancy_split_k_value_, res.best_occupancy_split_k_strategy_); - file << res.fixed_split_k_op_name_ << separator - << res.fixed_split_k_avg_time_ << separator - << res.fixed_split_k_value_ << separator - << rank_fixed_split_k << separator; - if (!only_one_op) + ck::index_t total_num = -1; + bool write_op_name = true; + for (const auto& strategy : {SplitKStrategy::FixedSplitK, SplitKStrategy::BestOccupancy, SplitKStrategy::Optimized}) { - file << res.best_occupancy_split_k_op_name_ << separator; + const auto& best_perf = res.best_performance_.find(strategy); + if (best_perf != res.best_performance_.end()) + { + BestPerformance perf; + std::tie(std::ignore, perf) = *best_perf; + ck::index_t rank; + std::tie(rank, total_num) = res.get_ranking(perf.op_name_, perf.split_k_value_, strategy); + if (write_op_name) + { + file << perf.op_name_ << separator; + if (only_one_op) + { + // If only one op is written, we do not need to write the op name again + write_op_name = false; + } + } + file << perf.avg_time_ << separator + << perf.tflops_ << separator + << perf.split_k_value_ << separator + << rank << separator + << to_string(strategy) << separator; + } } - file << res.best_occupancy_split_k_avg_time_ << separator - << res.best_occupancy_split_k_value_ << separator - << to_string(res.best_occupancy_split_k_strategy_) << separator - << rank_best_occupancy_split_k << separator - << total_num; + file << total_num; }; if(!results_file.empty()) @@ -261,7 +271,8 @@ void write_perf_results_to_file(const PerfResults& perf_results_global, file << perf_results_global.m_dim_size_ << separator << perf_results_global.n_dim_size_ << separator << perf_results_global.k_dim_size_ << separator - << perf_results_global.arithmetic_intensity_ << separator; + << perf_results_global.arithmetic_intensity_ << separator + << perf_results_global.data_type_ << separator; // First the global results write_to_file(perf_results_global, file); @@ -342,8 +353,8 @@ bool profile_grouped_conv_bwd_weight_impl(int do_verification, using InElementOp = ck::tensor_operation::element_wise::PassThrough; using WeiElementOp = ck::tensor_operation::element_wise::PassThrough; using OutElementOp = ck::tensor_operation::element_wise::PassThrough; - using SplitKStrategy = ck::tensor_operation::device::SplitKStrategy; - using ParamsSplitK = ck::tensor_operation::device::ParamsSplitK; + // using SplitKStrategy = ck::tensor_operation::device::SplitKStrategy; + // using ParamsSplitK = ck::tensor_operation::device::ParamsSplitK; const auto in_element_op = InElementOp{}; const auto wei_element_op = WeiElementOp{}; @@ -465,7 +476,7 @@ bool profile_grouped_conv_bwd_weight_impl(int do_verification, range_copy(conv_param.input_right_pads_, begin(input_right_pads)); std::vector fixed_split_k_list = {1, 2, 4, 8, 16, 32, 64, 128}; - std::vector best_occupancy_list = {SplitKStrategy::BestOccupancy, SplitKStrategy::Optimized}; + std::vector best_occupancy_list = {SplitKStrategy::BestOccupancy /*, SplitKStrategy::Optimized*/}; bool profile_all = true; if(split_k != "all") { @@ -542,10 +553,11 @@ bool profile_grouped_conv_bwd_weight_impl(int do_verification, const auto m_dim_size = split_k_arg->m_dim_size(); const auto n_dim_size = split_k_arg->n_dim_size(); const auto arithmetic_intensity = split_k_arg->arithmetic_intensity(); + const auto& data_type = split_k_arg->data_type(); if (k_dim_size > 0) { - perf_results_local.set_common_params(m_dim_size, n_dim_size, k_dim_size, arithmetic_intensity); - perf_results_global.set_common_params(m_dim_size, n_dim_size, k_dim_size, arithmetic_intensity); + perf_results_local.set_common_params(m_dim_size, n_dim_size, k_dim_size, arithmetic_intensity, data_type); + perf_results_global.set_common_params(m_dim_size, n_dim_size, k_dim_size, arithmetic_intensity, data_type); } supports_split_k_optimization = true; } @@ -566,8 +578,8 @@ bool profile_grouped_conv_bwd_weight_impl(int do_verification, auto invoker_ptr = op_ptr->MakeInvokerPointer(); - constexpr int n_warm_up = 50; - constexpr int n_repeat = 150; + constexpr int n_warm_up = 25; + constexpr int n_repeat = 100; StreamConfig config{nullptr, time_kernel}; config.cold_niters_ = n_warm_up; config.nrepeat_ = n_repeat; @@ -584,38 +596,16 @@ bool profile_grouped_conv_bwd_weight_impl(int do_verification, << " TFlops, " << gb_per_sec << " GB/s, " << op_name << ", SplitK " << PerfResults::split_k_str(split_k_list[split_k_id], split_k_arg_value) << std::endl; - if (split_k_list[split_k_id].strategy_ == - SplitKStrategy::BestOccupancy || split_k_list[split_k_id].strategy_ == SplitKStrategy::Optimized) - { - const auto strategy = split_k_list[split_k_id].strategy_; - - perf_results_global.update_best_occupancy_split_k( - op_name, - avg_time, - tflops, - split_k_arg_value, - strategy); - - perf_results_local.update_best_occupancy_split_k( - op_name, - avg_time, - tflops, - split_k_arg_value, - strategy); - } - else - { - perf_results_global.update_fixed_split_k(op_name, - avg_time, - tflops, - split_k_arg_value); - - perf_results_local.update_fixed_split_k(op_name, - avg_time, - tflops, - split_k_arg_value); - } - + perf_results_local.update_best_perf(op_name, + avg_time, + tflops, + split_k_arg_value, + split_k_list[split_k_id].strategy_); + perf_results_global.update_best_perf(op_name, + avg_time, + tflops, + split_k_arg_value, + split_k_list[split_k_id].strategy_); if(do_verification) { @@ -693,11 +683,10 @@ bool profile_grouped_conv_bwd_weight_impl(int do_verification, if (perf_results_list.size() > 0) { - std::cerr << perf_results_global.print_fixed_split_k() << std::endl; + std::cerr << perf_results_global.print_best_performance() << std::endl; if (profile_all) { - std::cerr << perf_results_global.print_best_occupancy_split_k() << std::endl; write_perf_results_to_file(perf_results_global, perf_results_list); } }