diff --git a/Jenkinsfile b/Jenkinsfile index 9e73614663..6ef7a62424 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -1182,7 +1182,7 @@ pipeline { agent{ label rocmnode("gfx942") } environment{ setup_args = "NO_CK_BUILD" - execute_args = """ ../script/cmake-ck-dev.sh ../ gfx942 -D USE_CUSTOM_CONFIG=OFF && \ + execute_args = """ ../script/cmake-ck-dev.sh ../ gfx942 && \ make -j64 tile_engine_gemm && \ ./bin/tile_engine_gemm """ } diff --git a/include/ck_tile/host/device_prop.hpp b/include/ck_tile/host/device_prop.hpp new file mode 100644 index 0000000000..d33b298369 --- /dev/null +++ b/include/ck_tile/host/device_prop.hpp @@ -0,0 +1,56 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +#ifndef __HIPCC_RTC__ +#include +#include +#include + +namespace ck_tile { + +constexpr unsigned int fnv1a_hash(std::string_view str, unsigned int h = 2166136261u) +{ + return str.empty() ? h + : fnv1a_hash(str.substr(1), + (h ^ static_cast(str.front())) * 16777619u); +} +inline std::string get_device_name() +{ + hipDeviceProp_t props{}; + int device; + auto status = hipGetDevice(&device); + if(status != hipSuccess) + { + return std::string(); + } + status = hipGetDeviceProperties(&props, device); + if(status != hipSuccess) + { + return std::string(); + } + const std::string raw_name(props.gcnArchName); + const auto name = raw_name.substr(0, raw_name.find(':')); // str.substr(0, npos) returns str. + switch(fnv1a_hash(name)) + { + // https://github.com/ROCm/MIOpen/blob/8498875aef84878e04c1eabefdf6571514891086/src/target_properties.cpp#L40 + case fnv1a_hash("Ellesmere"): + case fnv1a_hash("Baffin"): + case fnv1a_hash("RacerX"): + case fnv1a_hash("Polaris10"): + case fnv1a_hash("Polaris11"): + case fnv1a_hash("Tonga"): + case fnv1a_hash("Fiji"): + case fnv1a_hash("gfx800"): + case fnv1a_hash("gfx802"): + case fnv1a_hash("gfx804"): return "gfx803"; + case fnv1a_hash("Vega10"): + case fnv1a_hash("gfx901"): return "gfx900"; + case fnv1a_hash("10.3.0 Sienna_Cichlid 18"): return "gfx1030"; + default: return name; + } +} +} // namespace ck_tile + +#endif diff --git a/tile_engine/ops/gemm/CMakeLists.txt b/tile_engine/ops/gemm/CMakeLists.txt index 916f843c36..8b8120638a 100644 --- a/tile_engine/ops/gemm/CMakeLists.txt +++ b/tile_engine/ops/gemm/CMakeLists.txt @@ -1,22 +1,11 @@ -option(USE_CUSTOM_CONFIG "Enable user-provided configuration file" ON) # generate a list of kernels, but not actually emit files at config stage -if(USE_CUSTOM_CONFIG) - execute_process( - COMMAND ${Python3_EXECUTABLE} ${CMAKE_CURRENT_LIST_DIR}/gemm_instance_builder.py - --working_path ${CMAKE_CURRENT_BINARY_DIR} - --config_json ${CMAKE_CURRENT_LIST_DIR}/configs/user_provided_config.json - --list_blobs - RESULT_VARIABLE ret - ) -else() - execute_process( - COMMAND ${Python3_EXECUTABLE} ${CMAKE_CURRENT_LIST_DIR}/gemm_instance_builder.py - --working_path ${CMAKE_CURRENT_BINARY_DIR} - --list_blobs - RESULT_VARIABLE ret - ) -endif() +execute_process( + COMMAND ${Python3_EXECUTABLE} ${CMAKE_CURRENT_LIST_DIR}/gemm_instance_builder.py + --working_path ${CMAKE_CURRENT_BINARY_DIR} + --list_blobs + RESULT_VARIABLE ret +) if(ret AND NOT ret EQUAL 0) message( FATAL_ERROR "Fail to list kernels via Python. ${ret}") @@ -24,22 +13,12 @@ endif() file(STRINGS ${CMAKE_CURRENT_BINARY_DIR}/gemm_instance_blobs.txt GEMM_CODEGEN_BLOBS) -if(USE_CUSTOM_CONFIG) - add_custom_command( - OUTPUT ${GEMM_CODEGEN_BLOBS} - COMMAND ${Python3_EXECUTABLE} ${CMAKE_CURRENT_LIST_DIR}/gemm_instance_builder.py - --working_path ${CMAKE_CURRENT_BINARY_DIR} - --config_json ${CMAKE_CURRENT_LIST_DIR}/configs/user_provided_config.json - --gen_blobs - ) -else() - add_custom_command( - OUTPUT ${GEMM_CODEGEN_BLOBS} - COMMAND ${Python3_EXECUTABLE} ${CMAKE_CURRENT_LIST_DIR}/gemm_instance_builder.py - --working_path ${CMAKE_CURRENT_BINARY_DIR} - --gen_blobs - ) -endif() +add_custom_command( + OUTPUT ${GEMM_CODEGEN_BLOBS} + COMMAND ${Python3_EXECUTABLE} ${CMAKE_CURRENT_LIST_DIR}/gemm_instance_builder.py + --working_path ${CMAKE_CURRENT_BINARY_DIR} + --gen_blobs +) set(EXECUTABLE_GEMM_INSTANCE "tile_engine_gemm") message("adding example ${EXECUTABLE_GEMM_INSTANCE}") diff --git a/tile_engine/ops/gemm/README.md b/tile_engine/ops/gemm/README.md index 26ece438bb..69e9ffbaf5 100644 --- a/tile_engine/ops/gemm/README.md +++ b/tile_engine/ops/gemm/README.md @@ -4,7 +4,11 @@ CK Tile Engine GEMM is used to generate and run GEMM kernels with different comb # Kernel Configurations -User can provide kernel configuration such as tile size, warp size, padding, pipeline, scheduler and epilogue in the config file. For reference please see `./configs/user_provided_config.json`. The Tile engine also has default kernel configuration to expand the range of kernel configuration which is saved in `./configs/default_config.json`. +User can provide kernel configuration such as tile size, warp size, padding, pipeline, scheduler and epilogue in the config file with limited values. For reference please see `./configs/user_provided_config.json`. + +The Tile engine also has a default kernel configuration for providing range of configuration parameter values, which helps users who lack kernel development experience to benchmark For reference please see in `./configs/default_config.json` + +If user does not provide kernel configuration, the tile engine uses default kernel configuration to generate kernel instances and benchmark. ## Build Instructions ``` bash @@ -12,8 +16,7 @@ User can provide kernel configuration such as tile size, warp size, padding, pip mkdir build && cd build # build composable kernel ## replace with the appropriate architecture (example gfx942) or leave blank -## "USE_CUSTOM_CONFIG=OFF" for default configuration, "USE_CUSTOM_CONFIG=ON" for user provided configuration -sh ../script/cmake-ck-dev.sh ../ -D USE_CUSTOM_CONFIG=ON +sh ../script/cmake-ck-dev.sh ../ # generate the executable make tile_engine_gemm -j ``` @@ -37,9 +40,10 @@ rm -rf tile_engine/ && make tile_engine_gemm -j # rebuild -log Wether output kernel instance information or not. Possible values are true or false. Default is false. -warmup The number of iterations before benchmark the kernel. Default is 50. -repeat The number of iterations to benchmark the kernel. Default is 100. - -timer The type of timer. Possible values are gpu timer or cpu timer. Default is gpu timer. + -timer Whether if the timer is gpu timer or not. Possible values are true or false. Default is true. -init The method of tensor initialization. Set to 0 for random, to 1 for linear, or 2 for constant(1). Default is 0, random. -metric Metric with which to measure kernel performance. Set to 0 for latency, 1 for tflops, or 2 for bandwidth. Default is 0, latency. + -csv_filename The filename of benchmark result. Default is gemm_kernel. -structured_sparsity whether use sparsity kernel or not. Possible values are true or false. Default is false. -pipeline The type of pipeline. Possible values are compv3, compv4 or mem. Default is compv3. -epilogue The type of epilogue. Possible values are cshuffle or default. Default is cshuffle. diff --git a/tile_engine/ops/gemm/benchmark_gemm.hpp b/tile_engine/ops/gemm/benchmark_gemm.hpp index a6aaad1a60..b3eeac79f4 100644 --- a/tile_engine/ops/gemm/benchmark_gemm.hpp +++ b/tile_engine/ops/gemm/benchmark_gemm.hpp @@ -3,257 +3,161 @@ #pragma once -#include -#include -#include -#include -#include -#include +#include "gemm_profiler.hpp" -#include "ck/host_utility/device_prop.hpp" -#include "ck_tile/host/timer.hpp" - -enum class Metric +template +void benchmark_gemm(const ck_tile::ArgParser& arg_parser, const std::vector& callables) { - LATENCY = 0, - TFLOPS = 1, - BANDWIDTH = 2 -}; + GemmProblem gemm_problem{arg_parser.get_int("split_k"), + arg_parser.get_int("m"), + arg_parser.get_int("n"), + arg_parser.get_int("k"), + arg_parser.get_int("stride_a"), + arg_parser.get_int("stride_b"), + arg_parser.get_int("stride_c"), + DataTypeTraits::name, + DataTypeTraits::name, + DataTypeTraits::name, + DataTypeTraits::name, + ALayout::name, + BLayout::name, + CLayout::name, + arg_parser.get_bool("structured_sparsity")}; -inline constexpr auto get_metric_name(Metric m) -{ - switch(m) + Setting setting{ + arg_parser.get_int("warmup"), + arg_parser.get_int("repeat"), + arg_parser.get_bool("timer"), + arg_parser.get_int("verify"), + arg_parser.get_int("init"), + arg_parser.get_bool("log"), + arg_parser.get_str("csv_filename"), + }; + + auto& profiler = GemmProfiler::instance(setting); + + const ALayout layout_a = ALayout{}; + const BLayout layout_b = BLayout{}; + const CLayout layout_c = CLayout{}; + + gemm_problem.stride_a_ = ck_tile::get_default_stride( + gemm_problem.m_, gemm_problem.k_, gemm_problem.stride_a_, is_row_major(layout_a)); + gemm_problem.stride_b_ = ck_tile::get_default_stride( + gemm_problem.k_, gemm_problem.n_, gemm_problem.stride_b_, is_row_major(layout_b)); + gemm_problem.stride_c_ = ck_tile::get_default_stride( + gemm_problem.m_, gemm_problem.n_, gemm_problem.stride_c_, is_row_major(layout_c)); + + ck_tile::HostTensor a_m_k(ck_tile::host_tensor_descriptor( + gemm_problem.m_, gemm_problem.k_, gemm_problem.stride_a_, is_row_major(layout_a))); + ck_tile::HostTensor b_k_n(ck_tile::host_tensor_descriptor( + gemm_problem.k_, gemm_problem.n_, gemm_problem.stride_b_, is_row_major(layout_b))); + ck_tile::HostTensor c_m_n_dev_result(ck_tile::host_tensor_descriptor( + gemm_problem.m_, gemm_problem.n_, gemm_problem.stride_c_, is_row_major(layout_c))); + + if(setting.init_method_ == 0) { - case Metric::LATENCY: return "latency"; - case Metric::TFLOPS: return "tflops"; - case Metric::BANDWIDTH: return "bandwidth"; - default: throw std::invalid_argument("Unsupported metric type"); + ck_tile::FillUniformDistribution{-1.f, 1.f}(a_m_k); + ck_tile::FillUniformDistribution{-1.f, 1.f}(b_k_n); + } + else if(setting.init_method_ == 1) + { + ck_tile::FillMonotonicSeq{}(a_m_k); + ck_tile::FillMonotonicSeq{}(b_k_n); + } + else if(setting.init_method_ == 2) + { + ck_tile::FillConstant{static_cast(1)}(a_m_k); + ck_tile::FillConstant{static_cast(1)}(b_k_n); + } + else + { + a_m_k.SetZero(); + b_k_n.SetZero(); + } + + if(gemm_problem.structured_sparsity_) + { + ck_tile::AdjustToStructuredSparsity{}(a_m_k); + } + + ck_tile::DeviceMem a_m_k_dev_buf(a_m_k.get_element_space_size_in_bytes()); + ck_tile::DeviceMem b_k_n_dev_buf(b_k_n.get_element_space_size_in_bytes()); + ck_tile::DeviceMem c_m_n_dev_buf(c_m_n_dev_result.get_element_space_size_in_bytes()); + + if constexpr(std::is_same_v) + { + // Permute vector pk_i4x4 data for device implementation + ck_tile::HostTensor b_k_n_dev = b_k_n; + // permute_tensor_b(b_k_n_dev); + permute_vectors_i4x4_b(b_k_n_dev); + b_k_n_dev_buf.ToDevice(b_k_n_dev.data()); + } + else + { + b_k_n_dev_buf.ToDevice(b_k_n.data()); + } + + a_m_k_dev_buf.ToDevice(a_m_k.data()); + c_m_n_dev_buf.SetZero(); + c_m_n_dev_result.SetZero(); + + ck_tile::GemmHostArgs gemm_args; + gemm_args.a_ptr = a_m_k_dev_buf.GetDeviceBuffer(); + gemm_args.b_ptr = b_k_n_dev_buf.GetDeviceBuffer(); + gemm_args.c_ptr = c_m_n_dev_buf.GetDeviceBuffer(); + gemm_args.k_batch = gemm_problem.split_k_; + gemm_args.M = gemm_problem.m_; + gemm_args.N = gemm_problem.n_; + gemm_args.K = gemm_problem.k_; + gemm_args.stride_A = gemm_problem.stride_a_; + gemm_args.stride_B = gemm_problem.stride_b_; + gemm_args.stride_C = gemm_problem.stride_c_; + + ck_tile::HostTensor c_m_n_host_result(ck_tile::host_tensor_descriptor( + gemm_problem.m_, gemm_problem.n_, gemm_problem.stride_c_, is_row_major(layout_c))); + + if(setting.verify_) + { + gemm_host_reference(setting.verify_, + a_m_k, + b_k_n, + c_m_n_host_result, + a_m_k_dev_buf, + b_k_n_dev_buf, + gemm_problem.m_, + gemm_problem.n_, + gemm_problem.k_, + gemm_problem.stride_a_, + gemm_problem.stride_b_, + gemm_problem.stride_c_); + } + + try + { + for(auto& callable : callables) + { + profiler.benchmark(gemm_problem, + c_m_n_dev_buf, + c_m_n_host_result, + c_m_n_dev_result, + callable(gemm_args, + ck_tile::stream_config{nullptr, + true, + setting.log_, + setting.n_warmup_, + setting.n_repeat_, + setting.is_gpu_timer_})); + } + profiler.select_best_instance(static_cast(arg_parser.get_int("metric"))); + } + catch(const std::exception& e) + { + std::cerr << "Benchmark failed: " << e.what() << std::endl; } } - -struct GemmProblem -{ - int split_k; - int m, n, k; - int stride_a, stride_b, stride_c; - - std::string dtype_a, dtype_b, dtype_acc, dtype_c; - std::string layout_a, layout_b, layout_c; - - friend std::ostream& operator<<(std::ostream& os, const GemmProblem& problem) - { - os << "{\n" - << " \"split_k\":" << problem.split_k << ",\n" - << " \"m\":" << problem.m << ",\n" - << " \"n\":" << problem.n << ",\n" - << " \"k\":" << problem.k << ",\n" - << " \"stride_a\":" << problem.stride_a << ",\n" - << " \"stride_b\":" << problem.stride_b << ",\n" - << " \"stride_c\":" << problem.stride_c << ",\n" - << " \"dtype_a\":\"" << problem.dtype_a << "\",\n" - << " \"dtype_b\":\"" << problem.dtype_b << "\",\n" - << " \"dtype_acc\":\"" << problem.dtype_acc << "\",\n" - << " \"dtype_c\":\"" << problem.dtype_c << "\",\n" - << " \"layout_a\":\"" << problem.layout_a << "\",\n" - << " \"layout_b\":\"" << problem.layout_b << "\",\n" - << " \"layout_c\":\"" << problem.layout_c << "\"\n" - << "}"; - return os; - } -}; - -struct PerformanceResult -{ - double latency; - double tflops; - double bandwidth; - - static bool compare(const PerformanceResult& a, const PerformanceResult& b, Metric m) - { - switch(m) - { - case Metric::LATENCY: return a.latency < b.latency; - case Metric::TFLOPS: return a.tflops > b.tflops; - case Metric::BANDWIDTH: return a.bandwidth > b.bandwidth; - default: throw std::invalid_argument("Unsupported metric type"); - } - } - - friend std::ostream& operator<<(std::ostream& os, const PerformanceResult& result) - { - os << "{\n" - << " \"latency(ms)\": " << std::fixed << std::setprecision(2) << result.latency - << ",\n" - << " \"tflops(TFlops)\": " << result.tflops << ",\n" - << " \"bandwidth(GB/s)\": " << result.bandwidth << "\n" - << "}"; - return os; - } -}; - -struct KernelInstance -{ - std::string name; - GemmProblem problem; - PerformanceResult perf_result; - - static bool compare(const KernelInstance& a, const KernelInstance& b, Metric m) - { - return PerformanceResult::compare(a.perf_result, b.perf_result, m); - } - - friend std::ostream& operator<<(std::ostream& os, const KernelInstance& obj) - { - os << "{\n" - << " \"name\": \"" - << "{\n" - << obj.name << "\n}" - << "\",\n" - << " \"problem\": \"" << obj.problem << "\",\n" - << " \"perf_result\": " << obj.perf_result << "\n" - << "}"; - return os; - } -}; - -class GemmProfiler -{ - public: - static GemmProfiler& instance() - { - static GemmProfiler instance; - return instance; - } - - static std::string get_rocm_version() - { - std::ifstream version_file("/opt/rocm/.info/version"); - if(version_file.is_open()) - { - std::string version; - std::getline(version_file, version); - return version; - } - return "Unknown"; - } - - template - void benchmark_kernel(ck_tile::DeviceMem& c_m_n_dev_buf, - ck_tile::HostTensor& c_m_n_host_result, - ck_tile::HostTensor& c_m_n_dev_result, - int verify, - ck_tile::GemmHostArgs& args, - const ck_tile::stream_config& stream) - { - std::string description = Kernel::get_name(); - - GemmProblem problem{args.k_batch, - args.M, - args.N, - args.K, - args.stride_A, - args.stride_B, - args.stride_C, - DataTypeTraits::name, - DataTypeTraits::name, - DataTypeTraits::name, - DataTypeTraits::name, - ALayout::name, - BLayout::name, - CLayout::name}; - - KernelInstance kernel_instance{description, problem, {-1.0f, -1.0f, -1.0f}}; - - float avg_time = Kernel::launch(args, stream); - std::size_t flop = std::size_t(2) * args.M * args.N * args.K; - std::size_t num_byte = sizeof(ADataType) * args.M * args.K + - sizeof(BDataType) * args.N * args.K + - sizeof(CDataType) * args.M * args.N; - - kernel_instance.perf_result.latency = avg_time; - kernel_instance.perf_result.tflops = static_cast(flop) / 1.E9 / avg_time; - kernel_instance.perf_result.bandwidth = num_byte / 1.E6 / avg_time; - - if(stream.log_level_ > 0) - { - std::cout << kernel_instance << std::endl; - } - - c_m_n_dev_buf.FromDevice(c_m_n_dev_result.data()); - bool verified_correct = - !verify || compare(args.K, args.k_batch, c_m_n_dev_result, c_m_n_host_result); - - if(verified_correct) - { - kernel_instances_.emplace_back(kernel_instance); - } - else - { - std::cout << "Verification failed, skip kernel: " << description << std::endl; - } - - c_m_n_dev_buf.SetZero(); - c_m_n_dev_result.SetZero(); - } - - KernelInstance select_best_instance(Metric metric, - const std::string& csv_filename = "gemm_kernels.csv") - { - if(kernel_instances_.empty()) - throw std::runtime_error("Empty instances"); - - auto kernel_instance = *std::max_element(kernel_instances_.begin(), - kernel_instances_.end(), - [metric](const auto& a, const auto& b) { - return PerformanceResult::compare( - b.perf_result, a.perf_result, metric); - }); - - std::cout << "**********************************" << std::endl; - std::cout << "According to given metrics: " << get_metric_name(metric) << "\n" - << "The best kernel instance is: " << kernel_instance << std::endl; - std::cout << "**********************************" << std::endl; - - if(!csv_filename.empty()) - { - std::ofstream file(csv_filename, std::ios::app); - - if(!file.is_open()) - { - std::cerr << "Warning: Failed to open CSV file for writing." << std::endl; - } - else - { - if(file.tellp() == 0) - { - file << "rocm_version, device_name," - << "split_k,m,n,k,stride_a,stride_b,stride_c," - << "dtype_a,dtype_b,dtype_acc,dtype_c," - << "layout_a,layout_b,layout_c," - << "latency(ms),tflops(TFlops),bandwidth(GB/s),metric\n"; - } - - const auto& p = kernel_instance.problem; - const auto& res = kernel_instance.perf_result; - - file << get_rocm_version() << "," << ck::get_device_name() << "," << p.split_k - << "," << p.m << "," << p.n << "," << p.k << "," << p.stride_a << "," - << p.stride_b << "," << p.stride_c << "," << p.dtype_a << "," << p.dtype_b - << "," << p.dtype_acc << "," << p.dtype_c << "," << p.layout_a << "," - << p.layout_b << "," << p.layout_c << "," << std::fixed << std::setprecision(2) - << res.latency << "," << std::fixed << std::setprecision(2) << res.tflops - << "," << std::fixed << std::setprecision(2) << res.bandwidth << "," - << get_metric_name(metric) << "\n"; - - if(!file) - { - std::cerr << "Warning: Error occurred while writing to CSV file." << std::endl; - } - } - } - - return kernel_instance; - } - - std::vector kernel_instances_; -}; diff --git a/tile_engine/ops/gemm/benchmark_utils.hpp b/tile_engine/ops/gemm/benchmark_utils.hpp new file mode 100644 index 0000000000..e9bd412db6 --- /dev/null +++ b/tile_engine/ops/gemm/benchmark_utils.hpp @@ -0,0 +1,219 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +#include +#include +#include +#include + +#include "ck_tile/host.hpp" + +enum class Metric +{ + LATENCY = 0, + TFLOPS = 1, + BANDWIDTH = 2 +}; + +inline constexpr auto get_metric_name(Metric m) +{ + switch(m) + { + case Metric::LATENCY: return "latency"; + case Metric::TFLOPS: return "tflops"; + case Metric::BANDWIDTH: return "bandwidth"; + default: throw std::invalid_argument("Unsupported metric type"); + } +} + +struct GemmProblem +{ + int split_k_; + int m_, n_, k_; + int stride_a_, stride_b_, stride_c_; + + std::string dtype_a_, dtype_b_, dtype_acc_, dtype_c_; + std::string layout_a_, layout_b_, layout_c_; + + bool structured_sparsity_; + + friend std::ostream& operator<<(std::ostream& os, const GemmProblem& problem) + { + os << "{\n" + << " \"split_k\":" << problem.split_k_ << ",\n" + << " \"m\":" << problem.m_ << ",\n" + << " \"n\":" << problem.n_ << ",\n" + << " \"k\":" << problem.k_ << ",\n" + << " \"stride_a\":" << problem.stride_a_ << ",\n" + << " \"stride_b\":" << problem.stride_b_ << ",\n" + << " \"stride_c\":" << problem.stride_c_ << ",\n" + << " \"dtype_a\":\"" << problem.dtype_a_ << "\",\n" + << " \"dtype_b\":\"" << problem.dtype_b_ << "\",\n" + << " \"dtype_acc\":\"" << problem.dtype_acc_ << "\",\n" + << " \"dtype_c\":\"" << problem.dtype_c_ << "\",\n" + << " \"layout_a\":\"" << problem.layout_a_ << "\",\n" + << " \"layout_b\":\"" << problem.layout_b_ << "\",\n" + << " \"layout_c\":\"" << problem.layout_c_ << "\"\n" + << " \"structured_sparsity\":\"" << problem.structured_sparsity_ << "\"\n" + << "}"; + return os; + } +}; + +struct PerformanceResult +{ + double latency_; + double tflops_; + double bandwidth_; + + static bool compare(const PerformanceResult& a, const PerformanceResult& b, Metric m) + { + switch(m) + { + case Metric::LATENCY: return a.latency_ < b.latency_; + case Metric::TFLOPS: return a.tflops_ > b.tflops_; + case Metric::BANDWIDTH: return a.bandwidth_ > b.bandwidth_; + default: throw std::invalid_argument("Unsupported metric type"); + } + } + + friend std::ostream& operator<<(std::ostream& os, const PerformanceResult& result) + { + os << "{\n" + << " \"latency(ms)\": " << std::fixed << std::setprecision(2) << result.latency_ + << ",\n" + << " \"tflops(TFlops)\": " << result.tflops_ << ",\n" + << " \"bandwidth(GB/s)\": " << result.bandwidth_ << "\n" + << "}"; + return os; + } +}; + +struct KernelInstance +{ + std::string name_; + GemmProblem problem_; + PerformanceResult perf_result_; + + static bool compare(const KernelInstance& a, const KernelInstance& b, Metric m) + { + return PerformanceResult::compare(a.perf_result_, b.perf_result_, m); + } + + friend std::ostream& operator<<(std::ostream& os, const KernelInstance& obj) + { + os << "{\n" + << " \"name\": \"" + << "{\n" + << obj.name_ << "\n}" + << "\",\n" + << " \"problem\": \"" << obj.problem_ << "\",\n" + << " \"perf_result\": " << obj.perf_result_ << "\n" + << "}"; + return os; + } +}; + +struct Setting +{ + int n_warmup_; + int n_repeat_; + bool is_gpu_timer_; + int verify_; + int init_method_; + bool log_; + std::string csv_filename_; +}; + +std::string get_rocm_version() +{ + std::ifstream version_file("/opt/rocm/.info/version"); + if(version_file.is_open()) + { + std::string version; + std::getline(version_file, version); + return version; + } + return "Unknown"; +} + +/// @brief Function to compare the results of the device and host computations +bool compare(ck_tile::index_t K, + ck_tile::index_t kbatch, + ck_tile::HostTensor& c_m_n_dev_result, + ck_tile::HostTensor& c_m_n_host_result) +{ + const float max_accumulated_value = + *std::max_element(c_m_n_host_result.mData.begin(), c_m_n_host_result.mData.end()); + const auto rtol_atol = calculate_rtol_atol( + K, kbatch, max_accumulated_value); + bool pass = ck_tile::check_err(c_m_n_dev_result, + c_m_n_host_result, + "Error: Incorrect results!", + rtol_atol.at(ck_tile::number<0>{}), + rtol_atol.at(ck_tile::number<1>{})); + + std::cout << "Relative error threshold: " << rtol_atol.at(ck_tile::number<0>{}) + << " Absolute error threshold: " << rtol_atol.at(ck_tile::number<1>{}) << std::endl; + std::cout << "The verification result is:" << (pass ? "correct" : "fail") << std::endl; + + return pass; +} + +/// @brief Function to get the kernel output with reference implementation on CPU/GPU +template +void gemm_host_reference(int verify, + ck_tile::HostTensor& a_m_k, + ck_tile::HostTensor& b_k_n, + ck_tile::HostTensor& c_m_n_host_result, + ck_tile::DeviceMem& a_m_k_dev_buf, + ck_tile::DeviceMem& b_k_n_dev_buf, + ck_tile::index_t M, + ck_tile::index_t N, + ck_tile::index_t K, + ck_tile::index_t stride_A, + ck_tile::index_t stride_B, + ck_tile::index_t stride_C) +{ + if(verify == 1) + { + c_m_n_host_result.SetZero(); + + ck_tile::reference_gemm( + a_m_k, b_k_n, c_m_n_host_result); + } + else if(verify == 2) + { + if constexpr(std::is_same_v) + { + // Restore input for B for gpu reference + b_k_n_dev_buf.ToDevice(b_k_n.data()); + } + + ck_tile::DeviceMem c_m_n_gpu_buf_ref(c_m_n_host_result.get_element_space_size_in_bytes()); + c_m_n_host_result.SetZero(); + c_m_n_gpu_buf_ref.SetZero(); + + ADataType* d_A = static_cast(a_m_k_dev_buf.GetDeviceBuffer()); + BDataType* d_B = static_cast(b_k_n_dev_buf.GetDeviceBuffer()); + CDataType* d_C = static_cast(c_m_n_gpu_buf_ref.GetDeviceBuffer()); + + ck_tile::reference_gemm_gpu(d_A, d_B, d_C, M, N, K, stride_A, stride_B, stride_C); + + c_m_n_gpu_buf_ref.FromDevice(c_m_n_host_result.data()); + } +} diff --git a/tile_engine/ops/gemm/configs/default_config.json b/tile_engine/ops/gemm/configs/default_config.json index b27b5566ff..1d23fa3aa0 100644 --- a/tile_engine/ops/gemm/configs/default_config.json +++ b/tile_engine/ops/gemm/configs/default_config.json @@ -36,19 +36,19 @@ "max": 512, "min": 64, "step": 8, - "execlude": [] + "exclude": [] }, "tile_n": { "max": 512, "min": 64, "step": 8, - "execlude": [] + "exclude": [] }, "tile_k": { "max": 512, "min": 64, "step": 8, - "execlude": [] + "exclude": [] }, "warp_m": { "max": 4, diff --git a/tile_engine/ops/gemm/configs/user_provided_config.json b/tile_engine/ops/gemm/configs/user_provided_config.json index 7744d33d3c..312cefb4ed 100644 --- a/tile_engine/ops/gemm/configs/user_provided_config.json +++ b/tile_engine/ops/gemm/configs/user_provided_config.json @@ -34,16 +34,12 @@ "tile_config": { "tile_m": { "values": [ - 256, - 128, - 64 + 256 ] }, "tile_n": { "values": [ - 256, - 128, - 64 + 256 ] }, "tile_k": { diff --git a/tile_engine/ops/gemm/gemm_host_api.cpp b/tile_engine/ops/gemm/gemm_host_api.cpp index 43d6509bf8..3afa3195da 100755 --- a/tile_engine/ops/gemm/gemm_host_api.cpp +++ b/tile_engine/ops/gemm/gemm_host_api.cpp @@ -5,127 +5,10 @@ #include "gemm_common.hpp" #include "gemm_dispatcher.hpp" #include "gemm_host_api.hpp" +#include "benchmark_gemm.hpp" -void gemm_kernel_launch(ck_tile::DeviceMem& c_m_n_dev_buf, - ck_tile::HostTensor& c_m_n_host_result, - ck_tile::HostTensor& c_m_n_dev_result, - int verify, - int metric, - bool structured_sparsity, - KernelTraits& trait, - ck_tile::GemmHostArgs& args, - const ck_tile::stream_config& stream) +auto run_single_trait(const ck_tile::ArgParser& arg_parser) { - return GemmDispatcher::dispatch(c_m_n_dev_buf, - c_m_n_host_result, - c_m_n_dev_result, - verify, - metric, - structured_sparsity, - trait, - args, - stream); -} - -template -void run(const ck_tile::ArgParser& arg_parser) -{ - const ALayout a_layout = ALayout{}; - const BLayout b_layout = BLayout{}; - - ck_tile::index_t kbatch = arg_parser.get_int("split_k"); - ck_tile::index_t M = arg_parser.get_int("m"); - ck_tile::index_t N = arg_parser.get_int("n"); - ck_tile::index_t K = arg_parser.get_int("k"); - - ck_tile::index_t stride_A = arg_parser.get_int("stride_a"); - ck_tile::index_t stride_B = arg_parser.get_int("stride_b"); - ck_tile::index_t stride_C = arg_parser.get_int("stride_c"); - - bool log = arg_parser.get_int("log"); - int n_warmup = arg_parser.get_int("warmup"); - int n_repeat = arg_parser.get_int("repeat"); - int verify = arg_parser.get_int("v"); - ck_tile::index_t init_method = arg_parser.get_int("init"); - int metric = arg_parser.get_int("metric"); - bool structured_sparsity = arg_parser.get_bool("structured_sparsity"); - - stride_A = ck_tile::get_default_stride(M, K, stride_A, is_row_major(a_layout)); - stride_B = ck_tile::get_default_stride(K, N, stride_B, is_row_major(b_layout)); - stride_C = ck_tile::get_default_stride(M, N, stride_C, is_row_major(CLayout{})); - - ck_tile::HostTensor a_m_k( - ck_tile::host_tensor_descriptor(M, K, stride_A, is_row_major(a_layout))); - ck_tile::HostTensor b_k_n( - ck_tile::host_tensor_descriptor(K, N, stride_B, is_row_major(b_layout))); - ck_tile::HostTensor c_m_n_dev_result( - ck_tile::host_tensor_descriptor(M, N, stride_C, is_row_major(CLayout{}))); - - if(init_method == 0) - { - ck_tile::FillUniformDistribution{-1.f, 1.f}(a_m_k); - ck_tile::FillUniformDistribution{-1.f, 1.f}(b_k_n); - } - else if(init_method == 1) - { - ck_tile::FillMonotonicSeq{}(a_m_k); - ck_tile::FillMonotonicSeq{}(b_k_n); - } - else if(init_method == 2) - { - ck_tile::FillConstant{static_cast(1)}(a_m_k); - ck_tile::FillConstant{static_cast(1)}(b_k_n); - } - else - { - a_m_k.SetZero(); - b_k_n.SetZero(); - } - - if(structured_sparsity) - { - ck_tile::AdjustToStructuredSparsity{}(a_m_k); - } - - ck_tile::DeviceMem a_m_k_dev_buf(a_m_k.get_element_space_size_in_bytes()); - ck_tile::DeviceMem b_k_n_dev_buf(b_k_n.get_element_space_size_in_bytes()); - ck_tile::DeviceMem c_m_n_dev_buf(c_m_n_dev_result.get_element_space_size_in_bytes()); - - if constexpr(std::is_same_v) - { - // Permute vector pk_i4x4 data for device implementation - ck_tile::HostTensor b_k_n_dev = b_k_n; - // permute_tensor_b(b_k_n_dev); - permute_vectors_i4x4_b(b_k_n_dev); - b_k_n_dev_buf.ToDevice(b_k_n_dev.data()); - } - else - { - b_k_n_dev_buf.ToDevice(b_k_n.data()); - } - - a_m_k_dev_buf.ToDevice(a_m_k.data()); - c_m_n_dev_buf.SetZero(); - c_m_n_dev_result.SetZero(); - - ck_tile::GemmHostArgs gemm_args; - gemm_args.a_ptr = a_m_k_dev_buf.GetDeviceBuffer(); - gemm_args.b_ptr = b_k_n_dev_buf.GetDeviceBuffer(); - gemm_args.c_ptr = c_m_n_dev_buf.GetDeviceBuffer(); - gemm_args.k_batch = kbatch; - gemm_args.M = M; - gemm_args.N = N; - gemm_args.K = K; - gemm_args.stride_A = stride_A; - gemm_args.stride_B = stride_B; - gemm_args.stride_C = stride_C; - KernelTraits trait; trait.pipeline = arg_parser.get_str("pipeline"); trait.scheduler = arg_parser.get_str("scheduler"); @@ -134,49 +17,9 @@ void run(const ck_tile::ArgParser& arg_parser) trait.pad_n = arg_parser.get_bool("pad_n"); trait.pad_k = arg_parser.get_bool("pad_k"); - std::cout << "Run Gemm kernel with M =" << M << " N =" << N << " K =" << K - << " StrideA =" << stride_A << " StrideB =" << stride_B << " StrideC =" << stride_C - << " A_Layout =" << ALayout::name << " B_Layout =" << BLayout::name - << " C_Layout =" << CLayout::name << " A Type = " << DataTypeTraits::name - << " B Type = " << DataTypeTraits::name - << " C Type = " << DataTypeTraits::name << std::endl; + bool structured_sparsity = arg_parser.get_bool("structured_sparsity"); - ck_tile::HostTensor c_m_n_host_result( - ck_tile::host_tensor_descriptor(M, N, stride_C, is_row_major(CLayout{}))); - - if(verify) - { - gemm_host_reference(verify, - a_m_k, - b_k_n, - c_m_n_host_result, - a_m_k_dev_buf, - b_k_n_dev_buf, - M, - N, - K, - stride_A, - stride_B, - stride_C); - } - - gemm_kernel_launch(c_m_n_dev_buf, - c_m_n_host_result, - c_m_n_dev_result, - verify, - metric, - structured_sparsity, - trait, - gemm_args, - ck_tile::stream_config{nullptr, true, log, n_warmup, n_repeat}); - - return; + return GemmDispatcher::dispatch(structured_sparsity, trait); } int main(int argc, char* argv[]) @@ -186,7 +29,7 @@ int main(int argc, char* argv[]) auto [result, parser] = create_args(argc, argv); if(!result) return EXIT_FAILURE; - run(parser); + benchmark_gemm(parser, run_single_trait(parser)); return 0; } catch(const std::exception& e) diff --git a/tile_engine/ops/gemm/gemm_host_api.hpp b/tile_engine/ops/gemm/gemm_host_api.hpp index 58eeb9a9ee..708e051104 100755 --- a/tile_engine/ops/gemm/gemm_host_api.hpp +++ b/tile_engine/ops/gemm/gemm_host_api.hpp @@ -114,7 +114,7 @@ inline auto create_args(int argc, char* argv[]) .insert("stride_b", "0", "The stride value for tensor B. Default is 0.") .insert("stride_c", "0", "The stride value for tensor C Default is 0.") .insert("split_k", "1", "The split value for k dimension. Default is 1.") - .insert("v", + .insert("verify", "2", "The type of validation. Set to 0 for no validation, 1 for validation on CPU, or 2 " "for validation on GPU. Default is 2, validation on GPU.") @@ -126,10 +126,10 @@ inline auto create_args(int argc, char* argv[]) "warmup", "50", "The number of iterations before benchmark the kernel. Default is 50.") .insert( "repeat", "100", "The number of iterations to benchmark the kernel. Default is 100.") - .insert( - "timer", - "gpu", - "The type of timer. Possible values are gpu timer or cpu timer. Default is gpu timer.") + .insert("timer", + "true", + "Whether if the timer is gpu timer or not. Possible values are false or true. " + "Default is true.") .insert("init", "0", "The method of tensor initialization. Set to 0 for random, to 1 for linear, or 2 " @@ -138,6 +138,9 @@ inline auto create_args(int argc, char* argv[]) "0", "Metric with which to measure kernel performance. Set to 0 for latency, 1 for " "tflops, or 2 for bandwidth. Default is 0, latency.") + .insert("csv_filename", + "gemm_kernel", + "The filename of benchmark result. Default is gemm_kernel.") .insert("structured_sparsity", "false", "Whether use sparsity kernel or not. Possible values are true or false. Default is " @@ -225,82 +228,3 @@ void permute_vectors_i4x4_b(Tensor& tensor) } } } - -/// @brief Function to compare the results of the device and host computations -bool compare(ck_tile::index_t K, - ck_tile::index_t kbatch, - ck_tile::HostTensor& c_m_n_dev_result, - ck_tile::HostTensor& c_m_n_host_result) -{ - const float max_accumulated_value = - *std::max_element(c_m_n_host_result.mData.begin(), c_m_n_host_result.mData.end()); - const auto rtol_atol = calculate_rtol_atol( - K, kbatch, max_accumulated_value); - bool pass = ck_tile::check_err(c_m_n_dev_result, - c_m_n_host_result, - "Error: Incorrect results!", - rtol_atol.at(ck_tile::number<0>{}), - rtol_atol.at(ck_tile::number<1>{})); - - std::cout << "Relative error threshold: " << rtol_atol.at(ck_tile::number<0>{}) - << " Absolute error threshold: " << rtol_atol.at(ck_tile::number<1>{}) << std::endl; - std::cout << "The verification result is:" << (pass ? "correct" : "fail") << std::endl; - - return pass; -} - -/// @brief Function to get the kernel output with reference implementation on CPU/GPU -template -void gemm_host_reference(int verify, - ck_tile::HostTensor& a_m_k, - ck_tile::HostTensor& b_k_n, - ck_tile::HostTensor& c_m_n_host_result, - ck_tile::DeviceMem& a_m_k_dev_buf, - ck_tile::DeviceMem& b_k_n_dev_buf, - ck_tile::index_t M, - ck_tile::index_t N, - ck_tile::index_t K, - ck_tile::index_t stride_A, - ck_tile::index_t stride_B, - ck_tile::index_t stride_C) -{ - if(verify == 1) - { - c_m_n_host_result.SetZero(); - - ck_tile::reference_gemm( - a_m_k, b_k_n, c_m_n_host_result); - } - else if(verify == 2) - { - if constexpr(std::is_same_v) - { - // Restore input for B for gpu reference - b_k_n_dev_buf.ToDevice(b_k_n.data()); - } - - ck_tile::DeviceMem c_m_n_gpu_buf_ref(c_m_n_host_result.get_element_space_size_in_bytes()); - c_m_n_host_result.SetZero(); - c_m_n_gpu_buf_ref.SetZero(); - - ADataType* d_A = static_cast(a_m_k_dev_buf.GetDeviceBuffer()); - BDataType* d_B = static_cast(b_k_n_dev_buf.GetDeviceBuffer()); - CDataType* d_C = static_cast(c_m_n_gpu_buf_ref.GetDeviceBuffer()); - - ck_tile::reference_gemm_gpu(d_A, d_B, d_C, M, N, K, stride_A, stride_B, stride_C); - - c_m_n_gpu_buf_ref.FromDevice(c_m_n_host_result.data()); - } -} diff --git a/tile_engine/ops/gemm/gemm_instance_builder.py b/tile_engine/ops/gemm/gemm_instance_builder.py index 75ddaf76c7..ba68727609 100755 --- a/tile_engine/ops/gemm/gemm_instance_builder.py +++ b/tile_engine/ops/gemm/gemm_instance_builder.py @@ -408,19 +408,13 @@ struct GemmKernel {{ #include "gemm_common.hpp" #include "gemm_instances.hpp" #include "gemm_host_api.hpp" -#include "benchmark_gemm.hpp" struct GemmDispatcher { static auto& get_kernel_map() { // Use a static local variable - static std::unordered_map&, - ck_tile::HostTensor&, - int, - ck_tile::GemmHostArgs&, - const ck_tile::stream_config& stream)>> + static std::unordered_map< + std::string, + std::vector(ck_tile::GemmHostArgs&, const ck_tile::stream_config&)>>> kernel_map; return kernel_map; } @@ -446,53 +440,48 @@ struct GemmDispatcher { )) for trait in self.all_trait_names: - content += f""" kernel_map["{trait}"] = [=]( GemmProfiler& profiler, - ck_tile::DeviceMem& c_m_n_dev_buf, - ck_tile::HostTensor& c_m_n_host_result, - ck_tile::HostTensor& c_m_n_dev_result, - int verify, - ck_tile::GemmHostArgs& args, - const ck_tile::stream_config& stream) {{ - if(structured_sparsity){{ // SMFMA""" + content += f""" kernel_map["{trait}"] = {{""" for tile in tile_params: if self.is_tile_valid(tile, trait): + content += f"""[&](ck_tile::GemmHostArgs& args, const ck_tile::stream_config& stream) {{ """ + content += f""" if(structured_sparsity){{ // SMFMA""" sparse = self.config.problem.datatype_map['matrix_a'] == 'fp16' and \ self.config.problem.datatype_map['matrix_b'] == 'fp16' and \ self.config.problem.datatype_map['matrix_c'] == 'fp16' and \ ((tile[6] == 32 and tile[7] == 32 and tile[8] == 16) or (tile[6] == 16 and tile[7] == 16 and tile[8] == 32)) content += f""" - profiler.benchmark_kernel<{trait}::GemmKernel<{tile[0]}, {tile[1]}, {tile[2]}, {tile[3]}, {tile[4]}, {tile[5]}, {tile[6]}, {tile[7]}, {tile[8]}, {BOOL_MAP(sparse)}>>(c_m_n_dev_buf, c_m_n_host_result, c_m_n_dev_result, verify, args, stream);""" - content += f""" - }} else {{""" - for tile in tile_params: - if self.is_tile_valid(tile, trait): + return run_kernel<{trait}::GemmKernel<{tile[0]}, {tile[1]}, {tile[2]}, {tile[3]}, {tile[4]}, {tile[5]}, {tile[6]}, {tile[7]}, {tile[8]}, {BOOL_MAP(sparse)}>>(args, stream);""" content += f""" - profiler.benchmark_kernel<{trait}::GemmKernel<{tile[0]}, {tile[1]}, {tile[2]}, {tile[3]}, {tile[4]}, {tile[5]}, {tile[6]}, {tile[7]}, {tile[8]}, {BOOL_MAP(False)}>>(c_m_n_dev_buf, c_m_n_host_result, c_m_n_dev_result, verify, args, stream);""" + }} else {{""" + content += f""" + return run_kernel<{trait}::GemmKernel<{tile[0]}, {tile[1]}, {tile[2]}, {tile[3]}, {tile[4]}, {tile[5]}, {tile[6]}, {tile[7]}, {tile[8]}, {BOOL_MAP(False)}>>(args, stream);""" + content += f""" + }} """ + content += f""" + }} """ content += f""" - }} - }};\n""" - + }};\n """ + content += """ } - static auto dispatch(ck_tile::DeviceMem& c_m_n_dev_buf, - ck_tile::HostTensor& c_m_n_host_result, - ck_tile::HostTensor& c_m_n_dev_result, - int verify, - int metric, - bool structured_sparsity, - const KernelTraits& trait, - ck_tile::GemmHostArgs& gemm_args, - const ck_tile::stream_config& stream) { + template + static std::tuple run_kernel(ck_tile::GemmHostArgs& args, const ck_tile::stream_config& stream) + { + std::string name = Kernel::get_name(); + float avg_time = Kernel::launch(args, stream); + + return std::make_tuple(name, avg_time); + } + + + static auto dispatch(bool structured_sparsity, const KernelTraits& trait) { init(structured_sparsity); const std::string key = assemble_key(trait); auto& kernel_map = get_kernel_map(); - auto& profiler = GemmProfiler::instance(); - if(auto it = kernel_map.find(key); it != kernel_map.end()) { - it->second( - profiler, c_m_n_dev_buf, c_m_n_host_result, c_m_n_dev_result, verify, gemm_args, stream); - profiler.select_best_instance(static_cast(metric)); - return; + if(auto it = kernel_map.find(key); it != kernel_map.end()) + { + return it->second; } throw std::runtime_error("No suitable kernel found: " + key); } diff --git a/tile_engine/ops/gemm/gemm_profiler.hpp b/tile_engine/ops/gemm/gemm_profiler.hpp new file mode 100644 index 0000000000..5a9fa74316 --- /dev/null +++ b/tile_engine/ops/gemm/gemm_profiler.hpp @@ -0,0 +1,142 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +#include +#include +#include + +#include "ck_tile/host/device_prop.hpp" +#include "benchmark_utils.hpp" + +class GemmProfiler +{ + public: + static GemmProfiler& instance(Setting setting) + { + static GemmProfiler instance{setting}; + return instance; + } + + void benchmark(const GemmProblem& gemm_problem, + ck_tile::DeviceMem& c_m_n_dev_buf, + ck_tile::HostTensor& c_m_n_host_result, + ck_tile::HostTensor& c_m_n_dev_result, + const std::tuple& kernel_run_result) + { + auto [name, avg_time] = kernel_run_result; + + KernelInstance kernel_instance{name, gemm_problem, {-1.0f, -1.0f, -1.0f}}; + + // compute performance metric + std::size_t flop = std::size_t(2) * gemm_problem.m_ * gemm_problem.n_ * gemm_problem.k_; + std::size_t num_byte = sizeof(ADataType) * gemm_problem.m_ * gemm_problem.k_ + + sizeof(BDataType) * gemm_problem.n_ * gemm_problem.k_ + + sizeof(CDataType) * gemm_problem.m_ * gemm_problem.n_; + + // update + kernel_instance.perf_result_.latency_ = avg_time; + kernel_instance.perf_result_.tflops_ = static_cast(flop) / 1.E9 / avg_time; + kernel_instance.perf_result_.bandwidth_ = num_byte / 1.E6 / avg_time; + + if(setting_.log_ > 0) + { + std::cout << kernel_instance << std::endl; + } + + // verify result + c_m_n_dev_buf.FromDevice(c_m_n_dev_result.data()); + bool verified_correct = + !setting_.verify_ || + compare(gemm_problem.k_, gemm_problem.split_k_, c_m_n_dev_result, c_m_n_host_result); + + if(verified_correct) + { + kernel_instances_.emplace_back(kernel_instance); + } + else + { + std::cout << "Verification failed, skip kernel: " << name << std::endl; + } + + // clear tensor + c_m_n_dev_buf.SetZero(); + c_m_n_dev_result.SetZero(); + } + + KernelInstance select_best_instance(Metric metric) + { + if(kernel_instances_.empty()) + throw std::runtime_error("Empty instances"); + + auto kernel_instance = *std::max_element(kernel_instances_.begin(), + kernel_instances_.end(), + [metric](const auto& a, const auto& b) { + return PerformanceResult::compare( + b.perf_result_, a.perf_result_, metric); + }); + + std::cout << "**********************************" << std::endl; + std::cout << "According to given metrics: " << get_metric_name(metric) << "\n" + << "The best kernel instance is: " << kernel_instance << std::endl; + std::cout << "**********************************" << std::endl; + + if(!setting_.csv_filename_.empty()) + { + std::ofstream file(setting_.csv_filename_, std::ios::app); + + if(!file.is_open()) + { + std::cerr << "Warning: Failed to open CSV file for writing." << std::endl; + } + else + { + if(file.tellp() == 0) + { + file << "rocm_version, device_name," + << "split_k,m,n,k,stride_a,stride_b,stride_c," + << "dtype_a,dtype_b,dtype_acc,dtype_c," + << "layout_a,layout_b,layout_c," + << "structured_sparsity," + << "name," + << "latency(ms),tflops(TFlops),bandwidth(GB/s),metric\n"; + } + + const auto& problem = kernel_instance.problem_; + const auto& name = kernel_instance.name_; + const auto& perf = kernel_instance.perf_result_; + + file << get_rocm_version() << "," << ck_tile::get_device_name() << "," + << problem.split_k_ << "," << problem.m_ << "," << problem.n_ << "," + << problem.k_ << "," << problem.stride_a_ << "," << problem.stride_b_ << "," + << problem.stride_c_ << "," << problem.dtype_a_ << "," << problem.dtype_b_ + << "," << problem.dtype_acc_ << "," << problem.dtype_c_ << "," + << problem.layout_a_ << "," << problem.layout_b_ << "," << problem.layout_c_ + << "," << problem.structured_sparsity_ << "," << name << "," << std::fixed + << std::setprecision(4) << perf.latency_ << "," << std::fixed + << std::setprecision(4) << perf.tflops_ << "," << std::fixed + << std::setprecision(4) << perf.bandwidth_ << "," << get_metric_name(metric) + << "\n"; + + if(!file) + { + std::cerr << "Warning: Error occurred while writing to CSV file." << std::endl; + } + } + } + + return kernel_instance; + } + + GemmProfiler(const GemmProfiler&) = delete; + GemmProfiler& operator=(const GemmProfiler&) = delete; + + private: + ~GemmProfiler() { kernel_instances_.clear(); } + GemmProfiler(Setting setting) : setting_(setting) {} + + Setting setting_; + + std::vector kernel_instances_; +};