diff --git a/example/ck_tile/03_gemm/gemm_utils.hpp b/example/ck_tile/03_gemm/gemm_utils.hpp index 1e867afd1a..300a3826d7 100644 --- a/example/ck_tile/03_gemm/gemm_utils.hpp +++ b/example/ck_tile/03_gemm/gemm_utils.hpp @@ -457,7 +457,8 @@ auto create_args(int argc, char* argv[]) .insert("timer", "gpu", "gpu:gpu timer, cpu:cpu timer") .insert("split_k", "1", "splitK value") .insert("init", "0", "0:random, 1:linear, 2:constant(1)") - .insert("persistent", "0", "0:non-persistent, 1:persistent"); + .insert("persistent", "0", "0:non-persistent, 1:persistent") + .insert("bench_time_ms", "0", "benchmark time in ms, defaults to 0 ms"); bool result = arg_parser.parse(argc, argv); return std::make_tuple(result, arg_parser); diff --git a/example/ck_tile/03_gemm/gemm_weight_preshuffle.cpp b/example/ck_tile/03_gemm/gemm_weight_preshuffle.cpp index 34333d5474..74e79574d1 100644 --- a/example/ck_tile/03_gemm/gemm_weight_preshuffle.cpp +++ b/example/ck_tile/03_gemm/gemm_weight_preshuffle.cpp @@ -146,18 +146,14 @@ float gemm(const ck_tile::GemmHostArgs& args, const ck_tile::stream_config& s) if(s.flush_cache_) { std::cout << "Flushing cache..." << std::endl; - static constexpr ck_tile::index_t APackedSize = - std::is_same_v ? 2 : 1; - static constexpr ck_tile::index_t BPackedSize = - std::is_same_v ? 2 : 1; ck_tile::HostTensor a_m(ck_tile::host_tensor_descriptor( args.M, args.K, args.stride_A, is_row_major(ALayout{}))); ck_tile::HostTensor b_n(ck_tile::host_tensor_descriptor( args.K, args.N, args.stride_B, is_row_major(BLayout{}))); - auto size_a_buffer = a_m.get_element_space_size_in_bytes() / APackedSize; - auto size_b_buffer = b_n.get_element_space_size_in_bytes() / BPackedSize; + auto size_a_buffer = a_m.get_element_space_size_in_bytes(); + auto size_b_buffer = b_n.get_element_space_size_in_bytes(); ck_tile::RotatingMemWrapper rotating_mem( kargs.as_ptr[0], kargs.bs_ptr[0], s.rotating_count_, size_a_buffer, size_b_buffer); @@ -173,7 +169,7 @@ float gemm(const ck_tile::GemmHostArgs& args, const ck_tile::stream_config& s) hipGetErrorString(hipMemsetAsync( args.e_ptr, 0, args.M * args.N * sizeof(CDataType), s.stream_id_)); }; - ave_time = ck_tile::launch_kernel_preprocess( + ave_time = ck_tile::launch_kernel_time_mask( s, run_flush_cache, ck_tile::make_kernel( diff --git a/example/ck_tile/03_gemm/run_gemm_example.inc b/example/ck_tile/03_gemm/run_gemm_example.inc index 7f87c2bc06..69331282a4 100644 --- a/example/ck_tile/03_gemm/run_gemm_example.inc +++ b/example/ck_tile/03_gemm/run_gemm_example.inc @@ -183,7 +183,8 @@ float invoke_gemm(ck_tile::DeviceMem& a_m_k_dev_buf, ck_tile::index_t kbatch, int n_warmup, int n_repeat, - bool persistent) + bool persistent, + int bench_time_ms) { ck_tile::GemmHostArgs args = {a_m_k_dev_buf.GetDeviceBuffer(), b_k_n_dev_buf.GetDeviceBuffer(), @@ -211,7 +212,9 @@ float invoke_gemm(ck_tile::DeviceMem& a_m_k_dev_buf, CLayout, true, CDEElementWise>( - args, ck_tile::stream_config{nullptr, true, 1, n_warmup, n_repeat, true, true, 50}); + args, + ck_tile::stream_config{ + nullptr, true, 1, n_warmup, n_repeat, true, true, 50, bench_time_ms}); } else { @@ -227,7 +230,9 @@ float invoke_gemm(ck_tile::DeviceMem& a_m_k_dev_buf, CLayout, false, CDEElementWise>( - args, ck_tile::stream_config{nullptr, true, 1, n_warmup, n_repeat, true, true, 50}); + args, + ck_tile::stream_config{ + nullptr, true, 1, n_warmup, n_repeat, true, true, 50, bench_time_ms}); } std::size_t flop = std::size_t(2) * M * N * K; @@ -236,15 +241,16 @@ float invoke_gemm(ck_tile::DeviceMem& a_m_k_dev_buf, float tflops = static_cast(flop) / 1.E9 / ave_time; float gb_per_sec = num_byte / 1.E6 / ave_time; - std::cout << "Run Gemm kernel with M=" << M << " N=" << N << " K=" << K + std::cout << "Run Gemm kernel with \n M=" << M << " N=" << N << " K=" << K << " StrideA=" << stride_A << " StrideB=" << stride_B << " StrideC=" << stride_C << " A_Layout=" << ALayout::name << " B_Layout =" << BLayout::name << " C_Layout=" << CLayout::name << " A_Type=" << DataTypeTraits::name << " B_Type=" << DataTypeTraits::name << " C_Type=" << DataTypeTraits::name << " StructuredSparsity=" << (GemmConfig::UseStructuredSparsity ? "on" : "off") - << " Persistent=" << (persistent ? "on" : "off") << " : " << ave_time << " ms, " - << tflops << " TFlops, " << gb_per_sec << " GB/s, " << std::endl; + << " Persistent=" << (persistent ? "on" : "off") << " : \n" + << ave_time << " ms, " << tflops << " TFlops, " << gb_per_sec << " GB/s, " + << std::endl; return ave_time; } @@ -297,6 +303,7 @@ int run_gemm_example_with_layouts(int argc, int n_repeat = arg_parser.get_int("repeat"); ck_tile::index_t init_method = arg_parser.get_int("init"); bool persistent = arg_parser.get_int("persistent"); + int bench_time_ms = arg_parser.get_int("bench_time_ms"); const bool preshuffle = GemmConfig::Preshuffle; @@ -414,7 +421,8 @@ int run_gemm_example_with_layouts(int argc, kbatch, n_warmup, n_repeat, - persistent); + persistent, + bench_time_ms); c_m_n_dev_buf.FromDevice(c_m_n_dev_result.data()); bool pass = true; diff --git a/example/ck_tile/03_gemm/universal_gemm.cpp b/example/ck_tile/03_gemm/universal_gemm.cpp index 6c60f98fa4..d82520241d 100644 --- a/example/ck_tile/03_gemm/universal_gemm.cpp +++ b/example/ck_tile/03_gemm/universal_gemm.cpp @@ -147,18 +147,14 @@ float gemm(const ck_tile::GemmHostArgs& args, const ck_tile::stream_config& s) if(s.flush_cache_) { std::cout << "Flushing cache..." << std::endl; - static constexpr ck_tile::index_t APackedSize = - std::is_same_v ? 2 : 1; - static constexpr ck_tile::index_t BPackedSize = - std::is_same_v ? 2 : 1; ck_tile::HostTensor a_m(ck_tile::host_tensor_descriptor( args.M, args.K, args.stride_A, is_row_major(ALayout{}))); ck_tile::HostTensor b_n(ck_tile::host_tensor_descriptor( args.K, args.N, args.stride_B, is_row_major(BLayout{}))); - auto size_a_buffer = a_m.get_element_space_size_in_bytes() / APackedSize; - auto size_b_buffer = b_n.get_element_space_size_in_bytes() / BPackedSize; + auto size_a_buffer = a_m.get_element_space_size_in_bytes(); + auto size_b_buffer = b_n.get_element_space_size_in_bytes(); ck_tile::RotatingMemWrapper rotating_mem( kargs.as_ptr[0], kargs.bs_ptr[0], s.rotating_count_, size_a_buffer, size_b_buffer); @@ -174,7 +170,7 @@ float gemm(const ck_tile::GemmHostArgs& args, const ck_tile::stream_config& s) hipGetErrorString(hipMemsetAsync( args.e_ptr, 0, args.M * args.N * sizeof(CDataType), s.stream_id_)); }; - ave_time = ck_tile::launch_kernel_preprocess( + ave_time = ck_tile::launch_kernel_time_mask( s, run_flush_cache, ck_tile::make_kernel( diff --git a/include/ck_tile/host/kernel_launch.hpp b/include/ck_tile/host/kernel_launch.hpp index f6ccb6968b..262b8bae45 100644 --- a/include/ck_tile/host/kernel_launch.hpp +++ b/include/ck_tile/host/kernel_launch.hpp @@ -3,6 +3,8 @@ #pragma once +#include +#include #include "ck_tile/core/config.hpp" #include "ck_tile/core/utility/ignore.hpp" #include "ck_tile/host/hip_check_error.hpp" @@ -63,6 +65,73 @@ CK_TILE_HOST void launch_and_check(const stream_config& sc, Callables&&... calla } } +template +typename std::iterator_traits::value_type median(it begin, it end) +{ + if(begin == end) + { + return std::numeric_limits::quiet_NaN(); + } + auto n = std::distance(begin, end); + auto n2 = n / 2; + std::nth_element(begin, begin + n2, end); + return (n % 2) ? begin[n2] : (*std::max_element(begin, begin + n2) + begin[n2]) / 2.0; +} + +inline void remove_outliers(std::vector& v) +{ + // 1.5x IQR method to detect and remove outliers + auto n2 = v.size() / 2; + std::nth_element(v.begin(), v.begin() + n2, v.end()); + auto q1 = median(v.begin(), v.begin() + n2); + auto q3 = median(v.begin() + ((v.size() % 2) ? n2 + 1 : n2), v.end()); + auto iqr = q3 - q1; + auto lb = q1 - 1.5 * iqr; + auto ub = q3 + 1.5 * iqr; + v.erase(std::remove_if(v.begin(), v.end(), [&](float f) { return f < lb || f > ub; }), v.end()); +} + +template +CK_TILE_HOST double timing_loop_impl(TimerType timer, + const stream_config& s, + CallablesFunc&& callables_func, + std::function preprocess = nullptr) +{ + for(int i = 0; i < s.cold_niters_; i++) + { + callables_func(); + } + + float per_iter_time = 0.f; + std::vector times; + int i = 0; + while(i < s.nrepeat_ || per_iter_time < s.bench_time_ms_) + { + if(preprocess) + preprocess(); + + timer.start(s.stream_id_, i); + callables_func(); + timer.stop(s.stream_id_, i); + + if(i > 0) + { + per_iter_time = timer.duration(i - 1); + times.push_back(per_iter_time); + per_iter_time = timer.is_exceed(i - 1); + } + i++; + } + + if(!i) + return 0.; + + per_iter_time = timer.duration(i - 1); + times.push_back(per_iter_time); + remove_outliers(times); + return std::accumulate(times.begin(), times.end(), 0.) / times.size(); +} + // clang-format off /* * launch_kernel() @@ -101,37 +170,21 @@ CK_TILE_HOST float launch_kernel(const stream_config& s, Callables&&... callable return 0; } - auto time_launches = [&](auto timer) { - // Warmup - for(int i = 0; i < s.cold_niters_; i++) - { - launch_and_check(s, std::forward(callables)...); - } - - timer.start(s.stream_id_); - for(int i = 0; i < s.nrepeat_; i++) - { - launch_and_check(s, std::forward(callables)...); - } - timer.stop(s.stream_id_); - - return timer.duration() / s.nrepeat_; - }; + auto callables_func = [&]() { launch_and_check(s, std::forward(callables)...); }; if(s.is_gpu_timer_) { - return time_launches(gpu_timer{}); + return timing_loop_impl(gpu_timer_new{s.stream_id_}, s, callables_func); } else { - return time_launches(cpu_timer{}); + return timing_loop_impl(cpu_timer{}, s, callables_func); } } template -CK_TILE_HOST float launch_kernel_preprocess(const stream_config& s, - PreprocessFunc preprocess, - Callables&&... callables) +CK_TILE_HOST float +launch_kernel_time_mask(const stream_config& s, PreprocessFunc preprocess, Callables&&... callables) { static_assert(sizeof...(callables) > 0, "At least one callable is required!"); @@ -142,39 +195,15 @@ CK_TILE_HOST float launch_kernel_preprocess(const stream_config& s, return 0; } - auto time_launches = [&](auto timer) { - // Warmup - for(int i = 0; i < s.cold_niters_; i++) - { - launch_and_check(s, std::forward(callables)...); - } - - timer.start(s.stream_id_); - for(int i = 0; i < s.nrepeat_; i++) - { - preprocess(); - launch_and_check(s, std::forward(callables)...); - } - timer.stop(s.stream_id_); - - hipDeviceProp_t deviceProps; - HIP_CHECK_ERROR(hipGetDeviceProperties(&deviceProps, 0)); - - float preprocess_offset = (deviceProps.multiProcessorCount >= HIGH_CU_PROCESSORS) - ? OPTIMAL_LATENCY_HIGH_CU_PROCESSORS - : (deviceProps.multiProcessorCount == LOW_CU_PROCESSORS) - ? OPTIMAL_LATENCY_LOW_CU_PROCESSORS - : OPTIMAL_LATENCY_SAFE_MARGIN; - return (timer.duration() - preprocess_offset * s.nrepeat_) / s.nrepeat_; - }; + auto callables_func = [&]() { launch_and_check(s, std::forward(callables)...); }; if(s.is_gpu_timer_) { - return time_launches(gpu_timer{}); + return timing_loop_impl(gpu_timer_new{s.stream_id_}, s, callables_func, preprocess); } else { - return time_launches(cpu_timer{}); + return timing_loop_impl(cpu_timer{}, s, callables_func, preprocess); } } } // namespace ck_tile diff --git a/include/ck_tile/host/stream_config.hpp b/include/ck_tile/host/stream_config.hpp index f6bd40f6f2..0c239d0a7d 100644 --- a/include/ck_tile/host/stream_config.hpp +++ b/include/ck_tile/host/stream_config.hpp @@ -32,5 +32,6 @@ struct stream_config bool is_gpu_timer_ = true; // keep compatible bool flush_cache_ = false; int rotating_count_ = 1; + int bench_time_ms_ = 0; }; } // namespace ck_tile diff --git a/include/ck_tile/host/timer.hpp b/include/ck_tile/host/timer.hpp index e5519643bf..b4bff932e4 100644 --- a/include/ck_tile/host/timer.hpp +++ b/include/ck_tile/host/timer.hpp @@ -48,31 +48,100 @@ struct gpu_timer hipEvent_t start_evt, stop_evt; }; +struct gpu_timer_new +{ + CK_TILE_HOST gpu_timer_new(const hipStream_t& s) + { + for(auto& e : start_event) + { + HIP_CHECK_ERROR(hipEventCreate(&e)); + } + for(auto& e : stop_event) + { + HIP_CHECK_ERROR(hipEventCreate(&e)); + } + HIP_CHECK_ERROR(hipEventCreate(&event0)); + HIP_CHECK_ERROR(hipEventRecord(event0, s)); + } + + CK_TILE_HOST ~gpu_timer_new() noexcept(false) + { + for(auto& e : start_event) + { + HIP_CHECK_ERROR(hipEventDestroy(e)); + } + for(auto& e : stop_event) + { + HIP_CHECK_ERROR(hipEventDestroy(e)); + } + HIP_CHECK_ERROR(hipEventDestroy(event0)); + } + + CK_TILE_HOST void start(const hipStream_t& s, int idx = 0) + { + HIP_CHECK_ERROR(hipEventRecord(start_event[idx % 2], s)); + } + + CK_TILE_HOST void stop(const hipStream_t& s, int idx = 0) + { + HIP_CHECK_ERROR(hipEventRecord(stop_event[idx % 2], s)); + } + // return in ms + CK_TILE_HOST float duration(int idx = 0) const + { + float ms; + HIP_CHECK_ERROR(hipEventSynchronize(stop_event[idx % 2])); + HIP_CHECK_ERROR(hipEventElapsedTime(&ms, start_event[idx % 2], stop_event[idx % 2])); + return ms; + } + CK_TILE_HOST float is_exceed(int idx = 0) const + { + float ms; + HIP_CHECK_ERROR(hipEventElapsedTime(&ms, event0, stop_event[idx % 2])); + return ms; + } + + private: + std::array start_event; + std::array stop_event; + hipEvent_t event0; +}; + struct cpu_timer { // torch.utils.benchmark.Timer(), there is a sync inside each timer callback - CK_TILE_HOST void start(const hipStream_t& s) + CK_TILE_HOST void start(const hipStream_t& s, [[maybe_unused]] int idx = 0) { HIP_CHECK_ERROR(hipStreamSynchronize(s)); - start_tick = std::chrono::high_resolution_clock::now(); + start_tick = std::chrono::high_resolution_clock::now(); + time_event0 = std::chrono::high_resolution_clock::now(); } // torch.utils.benchmark.Timer(), there is a sync inside each timer callback - CK_TILE_HOST void stop(const hipStream_t& s) + CK_TILE_HOST void stop(const hipStream_t& s, [[maybe_unused]] int idx = 0) { HIP_CHECK_ERROR(hipStreamSynchronize(s)); stop_tick = std::chrono::high_resolution_clock::now(); } // return in ms - CK_TILE_HOST float duration() const + CK_TILE_HOST float duration([[maybe_unused]] int idx = 0) const { double sec = std::chrono::duration_cast>(stop_tick - start_tick) .count(); return static_cast(sec * 1e3); } + // return in ms + CK_TILE_HOST float is_exceed([[maybe_unused]] int idx = 0) const + { + double sec = + std::chrono::duration_cast>(stop_tick - time_event0) + .count(); + return static_cast(sec * 1e3); + } private: std::chrono::time_point start_tick; + std::chrono::time_point time_event0; std::chrono::time_point stop_tick; }; diff --git a/test/ck_tile/gemm/test_gemm_pipeline_universal_run_test.inc b/test/ck_tile/gemm/test_gemm_pipeline_universal_run_test.inc index 860541ef18..7d89dda684 100644 --- a/test/ck_tile/gemm/test_gemm_pipeline_universal_run_test.inc +++ b/test/ck_tile/gemm/test_gemm_pipeline_universal_run_test.inc @@ -162,7 +162,7 @@ float gemm(const ck_tile::GemmHostArgs& args, const ck_tile::stream_config& s) hipGetErrorString(hipMemsetAsync( args.e_ptr, 0, args.M * args.N * sizeof(CDataType), s.stream_id_)); }; - ave_time = ck_tile::launch_kernel_preprocess( + ave_time = ck_tile::launch_kernel_time_mask( s, run_flush_cache, ck_tile::make_kernel( diff --git a/tile_engine/ops/gemm/benchmark_gemm.cpp b/tile_engine/ops/gemm/benchmark_gemm.cpp index db2b648437..5f240c8fe4 100644 --- a/tile_engine/ops/gemm/benchmark_gemm.cpp +++ b/tile_engine/ops/gemm/benchmark_gemm.cpp @@ -34,7 +34,8 @@ void benchmark_gemm(const ck_tile::ArgParser& arg_parser) arg_parser.get_bool("log"), arg_parser.get_str("csv_filename"), arg_parser.get_bool("flush_cache"), - arg_parser.get_int("rotating_count")}; + arg_parser.get_int("rotating_count"), + arg_parser.get_int("bench_time")}; auto& profiler = GemmProfiler::instance(setting); diff --git a/tile_engine/ops/gemm/benchmark_gemm.hpp b/tile_engine/ops/gemm/benchmark_gemm.hpp index ce8a6e8234..993e7ea1f5 100644 --- a/tile_engine/ops/gemm/benchmark_gemm.hpp +++ b/tile_engine/ops/gemm/benchmark_gemm.hpp @@ -125,6 +125,7 @@ struct Setting std::string csv_filename_; bool flush_cache_; int rotating_count_; + int bench_time_ms_; }; inline std::string get_rocm_version() diff --git a/tile_engine/ops/gemm/gemm_host_api.hpp b/tile_engine/ops/gemm/gemm_host_api.hpp index 2c4af8955f..8b18aa703d 100644 --- a/tile_engine/ops/gemm/gemm_host_api.hpp +++ b/tile_engine/ops/gemm/gemm_host_api.hpp @@ -110,6 +110,7 @@ inline auto create_args(int argc, char* argv[]) "To flush cache, possible values are true or false. " "Default is false.") .insert("rotating_count", "5", "number of iterations to rotate the cache. default is 5.") + .insert("bench_time", "0", "benchmark time in ms. default is 0 ms.") .insert("metric", "0", "Metric with which to measure kernel performance. Set to 0 for latency, 1 for " diff --git a/tile_engine/ops/gemm/gemm_instance_builder.py b/tile_engine/ops/gemm/gemm_instance_builder.py index 6796121328..4a35a2bcd3 100755 --- a/tile_engine/ops/gemm/gemm_instance_builder.py +++ b/tile_engine/ops/gemm/gemm_instance_builder.py @@ -348,7 +348,7 @@ struct GemmKernel {{ hipGetErrorString(hipMemsetAsync( args.e_ptr, 0, args.M * args.N * sizeof(CDataType), stream.stream_id_)); }}; - ave_time = ck_tile::launch_kernel_preprocess( + ave_time = ck_tile::launch_kernel_time_mask( stream, run_flush_cache, ck_tile::make_kernel( diff --git a/tile_engine/ops/gemm/gemm_profiler.hpp b/tile_engine/ops/gemm/gemm_profiler.hpp index 634e19de6e..d4efc7fa7f 100644 --- a/tile_engine/ops/gemm/gemm_profiler.hpp +++ b/tile_engine/ops/gemm/gemm_profiler.hpp @@ -131,7 +131,8 @@ class GemmProfiler setting_.n_repeat_, setting_.is_gpu_timer_, setting_.flush_cache_, - setting_.rotating_count_}); + setting_.rotating_count_, + setting_.bench_time_ms_}); process_result(gemm_problem, c_m_n_dev_buf, c_m_n_host_result,