diff --git a/example/ck_tile/03_gemm/gemm_utils.hpp b/example/ck_tile/03_gemm/gemm_utils.hpp index 300a3826d7..e9b779c00c 100644 --- a/example/ck_tile/03_gemm/gemm_utils.hpp +++ b/example/ck_tile/03_gemm/gemm_utils.hpp @@ -458,7 +458,8 @@ auto create_args(int argc, char* argv[]) .insert("split_k", "1", "splitK value") .insert("init", "0", "0:random, 1:linear, 2:constant(1)") .insert("persistent", "0", "0:non-persistent, 1:persistent") - .insert("bench_time_ms", "0", "benchmark time in ms, defaults to 0 ms"); + .insert("flush_cache", "true", "flush cache before running the kernel, defaults to true") + .insert("rotating_count", "1", "rotating count, defaults to 1"); bool result = arg_parser.parse(argc, argv); return std::make_tuple(result, arg_parser); diff --git a/example/ck_tile/03_gemm/run_gemm_example.inc b/example/ck_tile/03_gemm/run_gemm_example.inc index 69331282a4..cc10394065 100644 --- a/example/ck_tile/03_gemm/run_gemm_example.inc +++ b/example/ck_tile/03_gemm/run_gemm_example.inc @@ -184,7 +184,8 @@ float invoke_gemm(ck_tile::DeviceMem& a_m_k_dev_buf, int n_warmup, int n_repeat, bool persistent, - int bench_time_ms) + bool flush_cache, + int rotating_count) { ck_tile::GemmHostArgs args = {a_m_k_dev_buf.GetDeviceBuffer(), b_k_n_dev_buf.GetDeviceBuffer(), @@ -214,7 +215,7 @@ float invoke_gemm(ck_tile::DeviceMem& a_m_k_dev_buf, CDEElementWise>( args, ck_tile::stream_config{ - nullptr, true, 1, n_warmup, n_repeat, true, true, 50, bench_time_ms}); + nullptr, true, 1, n_warmup, n_repeat, true, flush_cache, rotating_count}); } else { @@ -232,7 +233,7 @@ float invoke_gemm(ck_tile::DeviceMem& a_m_k_dev_buf, CDEElementWise>( args, ck_tile::stream_config{ - nullptr, true, 1, n_warmup, n_repeat, true, true, 50, bench_time_ms}); + nullptr, true, 1, n_warmup, n_repeat, true, flush_cache, rotating_count}); } std::size_t flop = std::size_t(2) * M * N * K; @@ -303,7 +304,8 @@ int run_gemm_example_with_layouts(int argc, int n_repeat = arg_parser.get_int("repeat"); ck_tile::index_t init_method = arg_parser.get_int("init"); bool persistent = arg_parser.get_int("persistent"); - int bench_time_ms = arg_parser.get_int("bench_time_ms"); + bool flush_cache = arg_parser.get_bool("flush_cache"); + int rotating_count = arg_parser.get_int("rotating_count"); const bool preshuffle = GemmConfig::Preshuffle; @@ -422,7 +424,8 @@ int run_gemm_example_with_layouts(int argc, n_warmup, n_repeat, persistent, - bench_time_ms); + flush_cache, + rotating_count); c_m_n_dev_buf.FromDevice(c_m_n_dev_result.data()); bool pass = true; diff --git a/example/ck_tile/18_flatmm/flatmm_basic.cpp b/example/ck_tile/18_flatmm/flatmm_basic.cpp index 4d29b68694..0f2beca2c7 100644 --- a/example/ck_tile/18_flatmm/flatmm_basic.cpp +++ b/example/ck_tile/18_flatmm/flatmm_basic.cpp @@ -168,7 +168,7 @@ float flatmm_calc(const ck_tile::FlatmmHostArgs<>& args, const ck_tile::stream_c hipGetErrorString(hipMemsetAsync( args.e_ptr, 0, args.M * args.N * sizeof(CDataType), s.stream_id_)); }; - ave_time = ck_tile::launch_kernel_preprocess( + ave_time = ck_tile::launch_kernel_time_mask( s, run_flush_cache, ck_tile::make_kernel( diff --git a/example/ck_tile/20_grouped_convolution/grouped_convolution_backward_weight.cpp b/example/ck_tile/20_grouped_convolution/grouped_convolution_backward_weight.cpp index bdfaffecb6..67db775e09 100644 --- a/example/ck_tile/20_grouped_convolution/grouped_convolution_backward_weight.cpp +++ b/example/ck_tile/20_grouped_convolution/grouped_convolution_backward_weight.cpp @@ -120,7 +120,7 @@ float grouped_conv_bwd_weight(const ck_tile::GroupedConvBwdWeightHostArgs& args, << ", Vector size C: " << ConvEpilogue::GetVectorSizeC() << std::endl; } - float ave_time = ck_tile::launch_kernel_preprocess( + float ave_time = ck_tile::launch_kernel_time_mask( s, Kernel::Preprocess(kargs, s), ck_tile::make_kernel(Kernel{}, grids, blocks, 0, kargs)); diff --git a/include/ck_tile/host/kernel_launch.hpp b/include/ck_tile/host/kernel_launch.hpp index 262b8bae45..91ac3d5a0b 100644 --- a/include/ck_tile/host/kernel_launch.hpp +++ b/include/ck_tile/host/kernel_launch.hpp @@ -15,12 +15,6 @@ namespace ck_tile { -#define LOW_CU_PROCESSORS 80 -#define HIGH_CU_PROCESSORS 228 -#define OPTIMAL_LATENCY_LOW_CU_PROCESSORS 0.005 -#define OPTIMAL_LATENCY_HIGH_CU_PROCESSORS 0.0015 -#define OPTIMAL_LATENCY_SAFE_MARGIN 0.01 - template #if CK_TILE_USE_LAUNCH_BOUNDS __launch_bounds__(MaxThreadPerBlock, MinBlockPerCu) @@ -65,71 +59,58 @@ CK_TILE_HOST void launch_and_check(const stream_config& sc, Callables&&... calla } } -template -typename std::iterator_traits::value_type median(it begin, it end) +// Measure the preprocess time during the cold iterations +template +CK_TILE_HOST double +preprocess_profiling_impl(TimerType timer, const stream_config& s, PreprocessFunc preprocess) { - if(begin == end) + timer.start(s.stream_id_); + for(int i = 0; i < s.nrepeat_; i++) { - return std::numeric_limits::quiet_NaN(); + if constexpr(!std::is_same_v) + { + preprocess(); + } } - auto n = std::distance(begin, end); - auto n2 = n / 2; - std::nth_element(begin, begin + n2, end); - return (n % 2) ? begin[n2] : (*std::max_element(begin, begin + n2) + begin[n2]) / 2.0; + timer.stop(s.stream_id_); + + return timer.duration() / s.nrepeat_; } -inline void remove_outliers(std::vector& v) -{ - // 1.5x IQR method to detect and remove outliers - auto n2 = v.size() / 2; - std::nth_element(v.begin(), v.begin() + n2, v.end()); - auto q1 = median(v.begin(), v.begin() + n2); - auto q3 = median(v.begin() + ((v.size() % 2) ? n2 + 1 : n2), v.end()); - auto iqr = q3 - q1; - auto lb = q1 - 1.5 * iqr; - auto ub = q3 + 1.5 * iqr; - v.erase(std::remove_if(v.begin(), v.end(), [&](float f) { return f < lb || f > ub; }), v.end()); -} - -template +template CK_TILE_HOST double timing_loop_impl(TimerType timer, const stream_config& s, CallablesFunc&& callables_func, - std::function preprocess = nullptr) + PreprocessFunc preprocess = nullptr) { for(int i = 0; i < s.cold_niters_; i++) { callables_func(); } - - float per_iter_time = 0.f; - std::vector times; - int i = 0; - while(i < s.nrepeat_ || per_iter_time < s.bench_time_ms_) + // Only profile preprocess if it's provided + auto preprocess_time = 0.0; + if constexpr(!std::is_same_v) { - if(preprocess) - preprocess(); + preprocess_time = preprocess_profiling_impl(gpu_timer{}, s, preprocess); + } - timer.start(s.stream_id_, i); - callables_func(); - timer.stop(s.stream_id_, i); - - if(i > 0) + int i = 0; + timer.start(s.stream_id_); + while(i < s.nrepeat_) + { + if constexpr(!std::is_same_v) { - per_iter_time = timer.duration(i - 1); - times.push_back(per_iter_time); - per_iter_time = timer.is_exceed(i - 1); + preprocess(); } + + callables_func(); i++; } + timer.stop(s.stream_id_); if(!i) return 0.; - - per_iter_time = timer.duration(i - 1); - times.push_back(per_iter_time); - remove_outliers(times); - return std::accumulate(times.begin(), times.end(), 0.) / times.size(); + return (timer.duration() / s.nrepeat_) - preprocess_time; } // clang-format off @@ -174,7 +155,7 @@ CK_TILE_HOST float launch_kernel(const stream_config& s, Callables&&... callable if(s.is_gpu_timer_) { - return timing_loop_impl(gpu_timer_new{s.stream_id_}, s, callables_func); + return timing_loop_impl(gpu_timer{}, s, callables_func); } else { @@ -199,7 +180,7 @@ launch_kernel_time_mask(const stream_config& s, PreprocessFunc preprocess, Calla if(s.is_gpu_timer_) { - return timing_loop_impl(gpu_timer_new{s.stream_id_}, s, callables_func, preprocess); + return timing_loop_impl(gpu_timer{}, s, callables_func, preprocess); } else { diff --git a/include/ck_tile/host/stream_config.hpp b/include/ck_tile/host/stream_config.hpp index 0c239d0a7d..acb861b2e7 100644 --- a/include/ck_tile/host/stream_config.hpp +++ b/include/ck_tile/host/stream_config.hpp @@ -20,6 +20,10 @@ namespace ck_tile { * * // create stream config with _some_stream_id_, and benchmark using cpu timer * stream_config s = stream_config{_some_stream_id_, true, 0, 3, 10, false}; + * + * // create stream config with _some_stream_id_, and enable gpu timer for rotating buffer with + *rotating buffer count stream_config s = stream_config{_some_stream_id_, true, 0, 3, 10, true, + *true, 1}; **/ struct stream_config @@ -32,6 +36,5 @@ struct stream_config bool is_gpu_timer_ = true; // keep compatible bool flush_cache_ = false; int rotating_count_ = 1; - int bench_time_ms_ = 0; }; } // namespace ck_tile diff --git a/include/ck_tile/host/timer.hpp b/include/ck_tile/host/timer.hpp index b4bff932e4..e5519643bf 100644 --- a/include/ck_tile/host/timer.hpp +++ b/include/ck_tile/host/timer.hpp @@ -48,100 +48,31 @@ struct gpu_timer hipEvent_t start_evt, stop_evt; }; -struct gpu_timer_new -{ - CK_TILE_HOST gpu_timer_new(const hipStream_t& s) - { - for(auto& e : start_event) - { - HIP_CHECK_ERROR(hipEventCreate(&e)); - } - for(auto& e : stop_event) - { - HIP_CHECK_ERROR(hipEventCreate(&e)); - } - HIP_CHECK_ERROR(hipEventCreate(&event0)); - HIP_CHECK_ERROR(hipEventRecord(event0, s)); - } - - CK_TILE_HOST ~gpu_timer_new() noexcept(false) - { - for(auto& e : start_event) - { - HIP_CHECK_ERROR(hipEventDestroy(e)); - } - for(auto& e : stop_event) - { - HIP_CHECK_ERROR(hipEventDestroy(e)); - } - HIP_CHECK_ERROR(hipEventDestroy(event0)); - } - - CK_TILE_HOST void start(const hipStream_t& s, int idx = 0) - { - HIP_CHECK_ERROR(hipEventRecord(start_event[idx % 2], s)); - } - - CK_TILE_HOST void stop(const hipStream_t& s, int idx = 0) - { - HIP_CHECK_ERROR(hipEventRecord(stop_event[idx % 2], s)); - } - // return in ms - CK_TILE_HOST float duration(int idx = 0) const - { - float ms; - HIP_CHECK_ERROR(hipEventSynchronize(stop_event[idx % 2])); - HIP_CHECK_ERROR(hipEventElapsedTime(&ms, start_event[idx % 2], stop_event[idx % 2])); - return ms; - } - CK_TILE_HOST float is_exceed(int idx = 0) const - { - float ms; - HIP_CHECK_ERROR(hipEventElapsedTime(&ms, event0, stop_event[idx % 2])); - return ms; - } - - private: - std::array start_event; - std::array stop_event; - hipEvent_t event0; -}; - struct cpu_timer { // torch.utils.benchmark.Timer(), there is a sync inside each timer callback - CK_TILE_HOST void start(const hipStream_t& s, [[maybe_unused]] int idx = 0) + CK_TILE_HOST void start(const hipStream_t& s) { HIP_CHECK_ERROR(hipStreamSynchronize(s)); - start_tick = std::chrono::high_resolution_clock::now(); - time_event0 = std::chrono::high_resolution_clock::now(); + start_tick = std::chrono::high_resolution_clock::now(); } // torch.utils.benchmark.Timer(), there is a sync inside each timer callback - CK_TILE_HOST void stop(const hipStream_t& s, [[maybe_unused]] int idx = 0) + CK_TILE_HOST void stop(const hipStream_t& s) { HIP_CHECK_ERROR(hipStreamSynchronize(s)); stop_tick = std::chrono::high_resolution_clock::now(); } // return in ms - CK_TILE_HOST float duration([[maybe_unused]] int idx = 0) const + CK_TILE_HOST float duration() const { double sec = std::chrono::duration_cast>(stop_tick - start_tick) .count(); return static_cast(sec * 1e3); } - // return in ms - CK_TILE_HOST float is_exceed([[maybe_unused]] int idx = 0) const - { - double sec = - std::chrono::duration_cast>(stop_tick - time_event0) - .count(); - return static_cast(sec * 1e3); - } private: std::chrono::time_point start_tick; - std::chrono::time_point time_event0; std::chrono::time_point stop_tick; }; diff --git a/tile_engine/ops/gemm/benchmark_gemm.cpp b/tile_engine/ops/gemm/benchmark_gemm.cpp index 5f240c8fe4..db2b648437 100644 --- a/tile_engine/ops/gemm/benchmark_gemm.cpp +++ b/tile_engine/ops/gemm/benchmark_gemm.cpp @@ -34,8 +34,7 @@ void benchmark_gemm(const ck_tile::ArgParser& arg_parser) arg_parser.get_bool("log"), arg_parser.get_str("csv_filename"), arg_parser.get_bool("flush_cache"), - arg_parser.get_int("rotating_count"), - arg_parser.get_int("bench_time")}; + arg_parser.get_int("rotating_count")}; auto& profiler = GemmProfiler::instance(setting); diff --git a/tile_engine/ops/gemm/benchmark_gemm.hpp b/tile_engine/ops/gemm/benchmark_gemm.hpp index 993e7ea1f5..ce8a6e8234 100644 --- a/tile_engine/ops/gemm/benchmark_gemm.hpp +++ b/tile_engine/ops/gemm/benchmark_gemm.hpp @@ -125,7 +125,6 @@ struct Setting std::string csv_filename_; bool flush_cache_; int rotating_count_; - int bench_time_ms_; }; inline std::string get_rocm_version() diff --git a/tile_engine/ops/gemm/gemm_host_api.hpp b/tile_engine/ops/gemm/gemm_host_api.hpp index 8b18aa703d..2c4af8955f 100644 --- a/tile_engine/ops/gemm/gemm_host_api.hpp +++ b/tile_engine/ops/gemm/gemm_host_api.hpp @@ -110,7 +110,6 @@ inline auto create_args(int argc, char* argv[]) "To flush cache, possible values are true or false. " "Default is false.") .insert("rotating_count", "5", "number of iterations to rotate the cache. default is 5.") - .insert("bench_time", "0", "benchmark time in ms. default is 0 ms.") .insert("metric", "0", "Metric with which to measure kernel performance. Set to 0 for latency, 1 for " diff --git a/tile_engine/ops/gemm/gemm_profiler.hpp b/tile_engine/ops/gemm/gemm_profiler.hpp index d4efc7fa7f..634e19de6e 100644 --- a/tile_engine/ops/gemm/gemm_profiler.hpp +++ b/tile_engine/ops/gemm/gemm_profiler.hpp @@ -131,8 +131,7 @@ class GemmProfiler setting_.n_repeat_, setting_.is_gpu_timer_, setting_.flush_cache_, - setting_.rotating_count_, - setting_.bench_time_ms_}); + setting_.rotating_count_}); process_result(gemm_problem, c_m_n_dev_buf, c_m_n_host_result,