Refactor conv profiler to produce statistics for analysing split-K autodeduction performance.

This commit is contained in:
Ville Pietilä
2025-06-30 14:20:10 +00:00
parent 196b65d1d6
commit 5d7a0487f8
5 changed files with 135 additions and 140 deletions

View File

@@ -566,8 +566,7 @@ struct DeviceGroupedConvBwdWeight_Xdl_CShuffle
: get_optimized_k_batch_value(max_occupancy.value_, grid_size, k_grid_size);
data_type_ = typeid(ABDataType).name();
arithmetic_intensity_ = (2.0 * k_dim_size_ * m_dim_size_ * n_dim_size_) /
((m_dim_size_ * k_dim_size_ + k_dim_size_ * n_dim_size_ + m_dim_size_ * n_dim_size_) * sizeof(ABDataType));
arithmetic_intensity_ = calculate_arithmetic_intensity(m_dim_size_, n_dim_size_, k_dim_size_, sizeof(ABDataType));
if (ck::EnvIsEnabled(CK_ENV(CK_LOGGING)))
{

View File

@@ -535,8 +535,7 @@ struct DeviceGroupedConvBwdWeight_Xdl_CShuffleV3
: get_optimized_k_batch_value(max_occupancy.value_, grid_size, k_grid_size);
data_type_ = typeid(ABDataType).name();
arithmetic_intensity_ = (2.0 * k_dim_size_ * m_dim_size_ * n_dim_size_) /
((m_dim_size_ * k_dim_size_ + k_dim_size_ * n_dim_size_ + m_dim_size_ * n_dim_size_) * sizeof(ABDataType));
arithmetic_intensity_ = calculate_arithmetic_intensity(m_dim_size_, n_dim_size_, k_dim_size_, sizeof(ABDataType));
// For small GemmK size, cap the max value of the k_batch.
const auto k_batch_max = static_cast<index_t>((k_dim_size_ - 1) / K0PerBlock);

View File

@@ -14,14 +14,14 @@ struct ArgumentSplitK
index_t k_dim_size() const { return k_dim_size_; }
index_t m_dim_size() const { return m_dim_size_; }
index_t n_dim_size() const { return n_dim_size_; }
index_t arithmetic_intensity() const { return arithmetic_intensity_; }
float arithmetic_intensity() const { return arithmetic_intensity_; }
std::string data_type() const { return data_type_; }
protected:
index_t k_batch_{-1};
index_t k_dim_size_{-1};
index_t m_dim_size_{-1};
index_t n_dim_size_{-1};
index_t arithmetic_intensity_{-1};
float arithmetic_intensity_{-1};
std::string data_type_{""};
};

View File

@@ -130,6 +130,14 @@ inline ck::index_t get_optimized_k_batch_value(int max_occupancy, ck::index_t gr
return best_split_k;
}
/// Computes the arithmetic intensity of a GEMM problem as
///   (2 * M * N * K) / (bytes_per_element * (M*K + K*N + M*N)).
///
/// The denominator sums M*K, K*N and M*N (presumably the element counts of the
/// A, B and C matrices) and scales the sum by the element size in bytes.
///
/// @param gemmM             GEMM M dimension.
/// @param gemmN             GEMM N dimension.
/// @param gemmK             GEMM K dimension.
/// @param bytes_per_element Size of one element in bytes.
/// @return The ratio described above, narrowed to float. If every dimension is
///         zero the result is NaN (0 / 0), as no guard is applied here.
inline float calculate_arithmetic_intensity(ck::index_t gemmM,
                                            ck::index_t gemmN,
                                            ck::index_t gemmK,
                                            float bytes_per_element)
{
    // Promote every dimension to double BEFORE multiplying. Products such as
    // gemmK * gemmN would otherwise be evaluated in ck::index_t integer
    // arithmetic and can overflow for large problem sizes, which silently
    // corrupts the reported intensity.
    const double m = static_cast<double>(gemmM);
    const double n = static_cast<double>(gemmN);
    const double k = static_cast<double>(gemmK);

    const double flops = 2.0 * m * n * k;
    const double bytes = static_cast<double>(bytes_per_element) * (m * k + k * n + m * n);

    return static_cast<float>(flops / bytes);
}
} // namespace device
} // namespace tensor_operation
} // namespace ck

View File

@@ -32,20 +32,37 @@ CK_DECLARE_ENV_VAR_STR(CK_PROFILER_OUTPUT_FILE)
namespace ck {
namespace profiler {
using SplitKStrategy = ck::tensor_operation::device::SplitKStrategy;
using ParamsSplitK = ck::tensor_operation::device::ParamsSplitK;
/// Best measurement recorded so far for a single split-K strategy.
/// The defaults are placeholders chosen so that the first real measurement
/// replaces them when compared with `tflops > tflops_`.
struct BestPerformance
{
    // Name of the operation instance that produced the best result.
    std::string op_name_{""};
    // Average time of the best run; starts at the largest float value.
    float avg_time_{std::numeric_limits<float>::max()};
    // Throughput of the best run. Seeded with lowest(), not min(): for floating
    // point types min() is the smallest POSITIVE normalized value, so a
    // measurement of exactly 0 would never compare greater than the seed.
    float tflops_{std::numeric_limits<float>::lowest()};
    // Split-K value used by the best run.
    ck::index_t split_k_value_{0};
};
struct PerfResults
{
using SplitKStrategy = ck::tensor_operation::device::SplitKStrategy;
using ParamsSplitK = ck::tensor_operation::device::ParamsSplitK;
// Best performance for each split-K strategy
std::map<SplitKStrategy, BestPerformance> best_performance_{};
void update_best_occupancy_split_k(const std::string& op_name, float avg_time, float tflops, ck::index_t split_k_arg, SplitKStrategy strategy)
// GEMM problem parameters
ck::index_t m_dim_size_{-1};
ck::index_t n_dim_size_{-1};
ck::index_t k_dim_size_{-1};
float arithmetic_intensity_{0.0f};
std::string data_type_{""};
std::vector<std::tuple<std::string, ck::index_t, SplitKStrategy, float>> ranking_;
void update_best_perf(const std::string& op_name, float avg_time, float tflops, ck::index_t split_k_arg, SplitKStrategy strategy)
{
if(tflops > best_occupancy_split_k_tflops_)
const auto& current_best_perf = best_performance_[strategy];
if(tflops > current_best_perf.tflops_)
{
best_occupancy_split_k_op_name_ = op_name;
best_occupancy_split_k_avg_time_ = avg_time;
best_occupancy_split_k_tflops_ = tflops;
best_occupancy_split_k_value_ = split_k_arg;
best_occupancy_split_k_strategy_ = strategy;
best_performance_[strategy] = {op_name, avg_time, tflops, split_k_arg};
}
ranking_.emplace_back(op_name, split_k_arg, strategy, tflops);
@@ -53,21 +70,6 @@ struct PerfResults
[](const auto& a, const auto& b) { return std::get<3>(a) > std::get<3>(b); });
};
// Records one fixed split-K measurement.
// If `tflops` beats the stored fixed split-K best, the stored name, time,
// throughput and split-K value are overwritten. The measurement is always
// appended to ranking_, which is then re-sorted by descending tflops.
void update_fixed_split_k(const std::string& op_name, float avg_time, float tflops, ck::index_t split_k_arg)
{
    const bool beats_current_best = tflops > fixed_split_k_tflops_;
    if (beats_current_best)
    {
        fixed_split_k_op_name_  = op_name;
        fixed_split_k_avg_time_ = avg_time;
        fixed_split_k_tflops_   = tflops;
        fixed_split_k_value_    = split_k_arg;
    }

    ranking_.emplace_back(op_name, split_k_arg, SplitKStrategy::FixedSplitK, tflops);

    // Tuple element 3 holds the tflops value; larger values sort first.
    const auto higher_tflops_first = [](const auto& lhs, const auto& rhs) {
        return std::get<3>(lhs) > std::get<3>(rhs);
    };
    std::sort(ranking_.begin(), ranking_.end(), higher_tflops_first);
};
static std::string split_k_str(const ParamsSplitK split_k_params, ck::index_t split_k_arg)
{
switch (split_k_params.strategy_)
@@ -86,23 +88,7 @@ struct PerfResults
}
};
std::string print_fixed_split_k() const
{
ck::index_t rank, total_num;
std::tie(rank, total_num) = get_ranking(fixed_split_k_op_name_, fixed_split_k_value_);
std::stringstream ss;
ss << "\nFIXED SPLIT-K RESULTS"
<< "\n========================";
ss << "\nname: " << fixed_split_k_op_name_
<< "\navg_time: " << fixed_split_k_avg_time_
<< "\ntflops: " << fixed_split_k_tflops_
<< "\nGEMM-K: " << k_dim_size_
<< "\nSplitK " << fixed_split_k_value_
<< "\nRanking: " << rank << " / " << total_num;
return ss.str();
}
std::string print_best_occupancy_split_k() const
std::string print_best_performance() const
{
const auto& to_string = [](const SplitKStrategy strategy) {
switch (strategy)
@@ -118,22 +104,36 @@ struct PerfResults
}
};
ck::index_t rank, total_num;
std::tie(rank, total_num) = get_ranking(best_occupancy_split_k_op_name_, best_occupancy_split_k_value_, best_occupancy_split_k_strategy_);
std::stringstream ss;
ss << "\nBEST OCCUPANCY SPLIT-K RESULTS"
ss << "\nProblem Parameters"
<< "\n========================";
ss << "\nname: " << best_occupancy_split_k_op_name_
<< "\navg_time: " << best_occupancy_split_k_avg_time_
<< "\ntflops: " << best_occupancy_split_k_tflops_
<< "\nGEMM-K: " << k_dim_size_
<< "\nStrategy: " << to_string(best_occupancy_split_k_strategy_)
<< "\nSplitK " << best_occupancy_split_k_value_
<< "\nRanking: " << rank << " / " << total_num;
ss << "\nm_dim_size: " << m_dim_size_
<< "\nn_dim_size: " << n_dim_size_
<< "\nk_dim_size: " << k_dim_size_
<< "\narithmetic_intensity: " << arithmetic_intensity_
<< "\ndata_type: " << data_type_;
for (const auto& strategy : {SplitKStrategy::FixedSplitK, SplitKStrategy::BestOccupancy, SplitKStrategy::Optimized})
{
const auto& best_perf = best_performance_.find(strategy);
if (best_perf != best_performance_.end())
{
ck::index_t rank, total_num;
std::tie(rank, total_num) = get_ranking(best_perf->second.op_name_, best_perf->second.split_k_value_, strategy);
ss << "\n\nBEST PERFORMANCE RESULTS (" << to_string(strategy) << ")"
<< "\n========================";
ss << "\nname: " << best_perf->second.op_name_
<< "\navg_time: " << best_perf->second.avg_time_
<< "\ntflops: " << best_perf->second.tflops_
<< "\nSplitK: " << best_perf->second.split_k_value_
<< "\nRanking: " << rank << " / " << total_num;
}
}
return ss.str();
}
std::tuple<size_t, size_t> get_ranking(const std::string& op_name, ck::index_t split_k, SplitKStrategy strategy = SplitKStrategy::FixedSplitK) const
std::tuple<size_t, size_t> get_ranking(const std::string& op_name, ck::index_t split_k, SplitKStrategy strategy) const
{
auto it = std::find_if(ranking_.begin(), ranking_.end(),
[&](const auto& entry) {
@@ -148,8 +148,24 @@ struct PerfResults
return std::make_tuple(ranking_.size()+1, ranking_.size());
};
void set_common_params(ck::index_t m_dim_size, ck::index_t n_dim_size, ck::index_t k_dim_size, float arithmetic_intensity)
void set_common_params(ck::index_t m_dim_size, ck::index_t n_dim_size, ck::index_t k_dim_size, float arithmetic_intensity, const std::string& data_type)
{
if (data_type_.empty())
{
data_type_ = data_type;
}
else if (data_type_ != data_type)
{
std::cerr << "Error: data_type cannot be set multiple times. Old value " << data_type_ << ". New value " << data_type << std::endl;
exit(EXIT_FAILURE);
}
if (m_dim_size <= 0 || n_dim_size <= 0 || k_dim_size <= 0)
{
std::cerr << "Error: m_dim_size, n_dim_size, and k_dim_size must be positive integers." << std::endl;
exit(EXIT_FAILURE);
}
if (m_dim_size_ > 0 && m_dim_size != m_dim_size_)
{
std::cerr << "Error: m_dim_size cannot be set multiple times. Old value " << m_dim_size_ << ". New value " << m_dim_size << std::endl;
@@ -178,35 +194,19 @@ struct PerfResults
exit(EXIT_FAILURE);
}
arithmetic_intensity_ = arithmetic_intensity;
if (!data_type_.empty() && data_type != data_type_)
{
std::cerr << "Error: data_type cannot be set multiple times. Old value " << data_type_ << ". New value " << data_type << std::endl;
exit(EXIT_FAILURE);
}
data_type_ = data_type;
}
// Fixed split-K results
std::string fixed_split_k_op_name_{""};
float fixed_split_k_avg_time_{std::numeric_limits<float>::max()};
float fixed_split_k_tflops_{std::numeric_limits<float>::min()};
ck::index_t fixed_split_k_value_{0};
// Best occupancy split-K results
std::string best_occupancy_split_k_op_name_{""};
float best_occupancy_split_k_avg_time_{std::numeric_limits<float>::max()};
float best_occupancy_split_k_tflops_{std::numeric_limits<float>::min()};
ck::index_t best_occupancy_split_k_value_{0};
SplitKStrategy best_occupancy_split_k_strategy_;
// GEMM problem parameters
ck::index_t m_dim_size_{-1};
ck::index_t n_dim_size_{-1};
ck::index_t k_dim_size_{-1};
float arithmetic_intensity_{0.0f};
std::vector<std::tuple<std::string, ck::index_t, SplitKStrategy, float>> ranking_;
};
void write_perf_results_to_file(const PerfResults& perf_results_global,
const std::vector<PerfResults>& perf_results_list)
{
using SplitKStrategy = ck::tensor_operation::device::SplitKStrategy;
const auto& results_file = ck::EnvGetString(CK_ENV(CK_PROFILER_OUTPUT_FILE));
if (results_file.empty())
@@ -231,25 +231,35 @@ void write_perf_results_to_file(const PerfResults& perf_results_global,
};
const auto& write_to_file = [&](const PerfResults res, std::ofstream& file, bool only_one_op = false) {
const auto gemm_k_size = res.k_dim_size_ > 0 ? std::to_string(res.k_dim_size_) : "N/A";
ck::index_t rank_fixed_split_k, rank_best_occupancy_split_k, total_num;
std::tie(rank_fixed_split_k, total_num) = res.get_ranking(res.fixed_split_k_op_name_, res.fixed_split_k_value_);
std::tie(rank_best_occupancy_split_k, std::ignore) =
res.get_ranking(res.best_occupancy_split_k_op_name_, res.best_occupancy_split_k_value_, res.best_occupancy_split_k_strategy_);
file << res.fixed_split_k_op_name_ << separator
<< res.fixed_split_k_avg_time_ << separator
<< res.fixed_split_k_value_ << separator
<< rank_fixed_split_k << separator;
if (!only_one_op)
ck::index_t total_num = -1;
bool write_op_name = true;
for (const auto& strategy : {SplitKStrategy::FixedSplitK, SplitKStrategy::BestOccupancy, SplitKStrategy::Optimized})
{
file << res.best_occupancy_split_k_op_name_ << separator;
const auto& best_perf = res.best_performance_.find(strategy);
if (best_perf != res.best_performance_.end())
{
BestPerformance perf;
std::tie(std::ignore, perf) = *best_perf;
ck::index_t rank;
std::tie(rank, total_num) = res.get_ranking(perf.op_name_, perf.split_k_value_, strategy);
if (write_op_name)
{
file << perf.op_name_ << separator;
if (only_one_op)
{
// If only one op is written, we do not need to write the op name again
write_op_name = false;
}
}
file << perf.avg_time_ << separator
<< perf.tflops_ << separator
<< perf.split_k_value_ << separator
<< rank << separator
<< to_string(strategy) << separator;
}
}
file << res.best_occupancy_split_k_avg_time_ << separator
<< res.best_occupancy_split_k_value_ << separator
<< to_string(res.best_occupancy_split_k_strategy_) << separator
<< rank_best_occupancy_split_k << separator
<< total_num;
file << total_num;
};
if(!results_file.empty())
@@ -261,7 +271,8 @@ void write_perf_results_to_file(const PerfResults& perf_results_global,
file << perf_results_global.m_dim_size_ << separator
<< perf_results_global.n_dim_size_ << separator
<< perf_results_global.k_dim_size_ << separator
<< perf_results_global.arithmetic_intensity_ << separator;
<< perf_results_global.arithmetic_intensity_ << separator
<< perf_results_global.data_type_ << separator;
// First the global results
write_to_file(perf_results_global, file);
@@ -342,8 +353,8 @@ bool profile_grouped_conv_bwd_weight_impl(int do_verification,
using InElementOp = ck::tensor_operation::element_wise::PassThrough;
using WeiElementOp = ck::tensor_operation::element_wise::PassThrough;
using OutElementOp = ck::tensor_operation::element_wise::PassThrough;
using SplitKStrategy = ck::tensor_operation::device::SplitKStrategy;
using ParamsSplitK = ck::tensor_operation::device::ParamsSplitK;
// using SplitKStrategy = ck::tensor_operation::device::SplitKStrategy;
// using ParamsSplitK = ck::tensor_operation::device::ParamsSplitK;
const auto in_element_op = InElementOp{};
const auto wei_element_op = WeiElementOp{};
@@ -465,7 +476,7 @@ bool profile_grouped_conv_bwd_weight_impl(int do_verification,
range_copy(conv_param.input_right_pads_, begin(input_right_pads));
std::vector<ck::index_t> fixed_split_k_list = {1, 2, 4, 8, 16, 32, 64, 128};
std::vector<SplitKStrategy> best_occupancy_list = {SplitKStrategy::BestOccupancy, SplitKStrategy::Optimized};
std::vector<SplitKStrategy> best_occupancy_list = {SplitKStrategy::BestOccupancy /*, SplitKStrategy::Optimized*/};
bool profile_all = true;
if(split_k != "all")
{
@@ -542,10 +553,11 @@ bool profile_grouped_conv_bwd_weight_impl(int do_verification,
const auto m_dim_size = split_k_arg->m_dim_size();
const auto n_dim_size = split_k_arg->n_dim_size();
const auto arithmetic_intensity = split_k_arg->arithmetic_intensity();
const auto& data_type = split_k_arg->data_type();
if (k_dim_size > 0)
{
perf_results_local.set_common_params(m_dim_size, n_dim_size, k_dim_size, arithmetic_intensity);
perf_results_global.set_common_params(m_dim_size, n_dim_size, k_dim_size, arithmetic_intensity);
perf_results_local.set_common_params(m_dim_size, n_dim_size, k_dim_size, arithmetic_intensity, data_type);
perf_results_global.set_common_params(m_dim_size, n_dim_size, k_dim_size, arithmetic_intensity, data_type);
}
supports_split_k_optimization = true;
}
@@ -566,8 +578,8 @@ bool profile_grouped_conv_bwd_weight_impl(int do_verification,
auto invoker_ptr = op_ptr->MakeInvokerPointer();
constexpr int n_warm_up = 50;
constexpr int n_repeat = 150;
constexpr int n_warm_up = 25;
constexpr int n_repeat = 100;
StreamConfig config{nullptr, time_kernel};
config.cold_niters_ = n_warm_up;
config.nrepeat_ = n_repeat;
@@ -584,38 +596,16 @@ bool profile_grouped_conv_bwd_weight_impl(int do_verification,
<< " TFlops, " << gb_per_sec << " GB/s, " << op_name << ", SplitK "
<< PerfResults::split_k_str(split_k_list[split_k_id], split_k_arg_value) << std::endl;
if (split_k_list[split_k_id].strategy_ ==
SplitKStrategy::BestOccupancy || split_k_list[split_k_id].strategy_ == SplitKStrategy::Optimized)
{
const auto strategy = split_k_list[split_k_id].strategy_;
perf_results_global.update_best_occupancy_split_k(
op_name,
avg_time,
tflops,
split_k_arg_value,
strategy);
perf_results_local.update_best_occupancy_split_k(
op_name,
avg_time,
tflops,
split_k_arg_value,
strategy);
}
else
{
perf_results_global.update_fixed_split_k(op_name,
avg_time,
tflops,
split_k_arg_value);
perf_results_local.update_fixed_split_k(op_name,
avg_time,
tflops,
split_k_arg_value);
}
perf_results_local.update_best_perf(op_name,
avg_time,
tflops,
split_k_arg_value,
split_k_list[split_k_id].strategy_);
perf_results_global.update_best_perf(op_name,
avg_time,
tflops,
split_k_arg_value,
split_k_list[split_k_id].strategy_);
if(do_verification)
{
@@ -693,11 +683,10 @@ bool profile_grouped_conv_bwd_weight_impl(int do_verification,
if (perf_results_list.size() > 0)
{
std::cerr << perf_results_global.print_fixed_split_k() << std::endl;
std::cerr << perf_results_global.print_best_performance() << std::endl;
if (profile_all)
{
std::cerr << perf_results_global.print_best_occupancy_split_k() << std::endl;
write_perf_results_to_file(perf_results_global, perf_results_list);
}
}