diff --git a/example/ck_tile/03_gemm/gemm_utils.hpp b/example/ck_tile/03_gemm/gemm_utils.hpp
index 300a3826d7..e9b779c00c 100644
--- a/example/ck_tile/03_gemm/gemm_utils.hpp
+++ b/example/ck_tile/03_gemm/gemm_utils.hpp
@@ -458,7 +458,8 @@ auto create_args(int argc, char* argv[])
         .insert("split_k", "1", "splitK value")
         .insert("init", "0", "0:random, 1:linear, 2:constant(1)")
         .insert("persistent", "0", "0:non-persistent, 1:persistent")
-        .insert("bench_time_ms", "0", "benchmark time in ms, defaults to 0 ms");
+        .insert("flush_cache", "true", "flush cache before running the kernel, defaults to true")
+        .insert("rotating_count", "1", "rotating count, defaults to 1");
 
     bool result = arg_parser.parse(argc, argv);
     return std::make_tuple(result, arg_parser);
diff --git a/example/ck_tile/03_gemm/run_gemm_example.inc b/example/ck_tile/03_gemm/run_gemm_example.inc
index 69331282a4..cc10394065 100644
--- a/example/ck_tile/03_gemm/run_gemm_example.inc
+++ b/example/ck_tile/03_gemm/run_gemm_example.inc
@@ -184,7 +184,8 @@ float invoke_gemm(ck_tile::DeviceMem& a_m_k_dev_buf,
                   int n_warmup,
                   int n_repeat,
                   bool persistent,
-                  int bench_time_ms)
+                  bool flush_cache,
+                  int rotating_count)
 {
     ck_tile::GemmHostArgs args = {a_m_k_dev_buf.GetDeviceBuffer(),
                                   b_k_n_dev_buf.GetDeviceBuffer(),
@@ -214,7 +215,7 @@ float invoke_gemm(ck_tile::DeviceMem& a_m_k_dev_buf,
                         CDEElementWise>(
             args,
             ck_tile::stream_config{
-                nullptr, true, 1, n_warmup, n_repeat, true, true, 50, bench_time_ms});
+                nullptr, true, 1, n_warmup, n_repeat, true, flush_cache, rotating_count});
     }
     else
     {
@@ -232,7 +233,7 @@ float invoke_gemm(ck_tile::DeviceMem& a_m_k_dev_buf,
                         CDEElementWise>(
             args,
             ck_tile::stream_config{
-                nullptr, true, 1, n_warmup, n_repeat, true, true, 50, bench_time_ms});
+                nullptr, true, 1, n_warmup, n_repeat, true, flush_cache, rotating_count});
     }
 
     std::size_t flop = std::size_t(2) * M * N * K;
@@ -303,7 +304,8 @@ int run_gemm_example_with_layouts(int argc,
     int n_repeat                 = arg_parser.get_int("repeat");
     ck_tile::index_t init_method = arg_parser.get_int("init");
     bool persistent              = arg_parser.get_int("persistent");
-    int bench_time_ms            = arg_parser.get_int("bench_time_ms");
+    bool flush_cache             = arg_parser.get_bool("flush_cache");
+    int rotating_count           = arg_parser.get_int("rotating_count");
 
     const bool preshuffle = GemmConfig::Preshuffle;
 
@@ -422,7 +424,8 @@ int run_gemm_example_with_layouts(int argc,
                          n_warmup,
                          n_repeat,
                          persistent,
-                         bench_time_ms);
+                         flush_cache,
+                         rotating_count);
 
     c_m_n_dev_buf.FromDevice(c_m_n_dev_result.data());
     bool pass = true;
diff --git a/example/ck_tile/18_flatmm/flatmm_basic.cpp b/example/ck_tile/18_flatmm/flatmm_basic.cpp
index 4d29b68694..0f2beca2c7 100644
--- a/example/ck_tile/18_flatmm/flatmm_basic.cpp
+++ b/example/ck_tile/18_flatmm/flatmm_basic.cpp
@@ -168,7 +168,7 @@ float flatmm_calc(const ck_tile::FlatmmHostArgs<>& args, const ck_tile::stream_c
                     hipGetErrorString(hipMemsetAsync(
                         args.e_ptr, 0, args.M * args.N * sizeof(CDataType), s.stream_id_));
             };
-            ave_time = ck_tile::launch_kernel_preprocess(
+            ave_time = ck_tile::launch_kernel_time_mask(
                 s,
                 run_flush_cache,
                 ck_tile::make_kernel<blocks.x, FlatmmConfig::kBlockPerCu>(
diff --git a/example/ck_tile/20_grouped_convolution/grouped_convolution_backward_weight.cpp b/example/ck_tile/20_grouped_convolution/grouped_convolution_backward_weight.cpp
index bdfaffecb6..67db775e09 100644
--- a/example/ck_tile/20_grouped_convolution/grouped_convolution_backward_weight.cpp
+++ b/example/ck_tile/20_grouped_convolution/grouped_convolution_backward_weight.cpp
@@ -120,7 +120,7 @@ float grouped_conv_bwd_weight(const ck_tile::GroupedConvBwdWeightHostArgs& args,
                       << ", Vector size C: " << ConvEpilogue::GetVectorSizeC() << std::endl;
         }
 
-        float ave_time = ck_tile::launch_kernel_preprocess(
+        float ave_time = ck_tile::launch_kernel_time_mask(
             s,
             Kernel::Preprocess(kargs, s),
             ck_tile::make_kernel<blocks.x, kBlockPerCu>(Kernel{}, grids, blocks, 0, kargs));
diff --git a/include/ck_tile/host/kernel_launch.hpp b/include/ck_tile/host/kernel_launch.hpp
index 262b8bae45..91ac3d5a0b 100644
--- a/include/ck_tile/host/kernel_launch.hpp
+++ b/include/ck_tile/host/kernel_launch.hpp
@@ -15,12 +15,6 @@
 
 namespace ck_tile {
 
-#define LOW_CU_PROCESSORS 80
-#define HIGH_CU_PROCESSORS 228
-#define OPTIMAL_LATENCY_LOW_CU_PROCESSORS 0.005
-#define OPTIMAL_LATENCY_HIGH_CU_PROCESSORS 0.0015
-#define OPTIMAL_LATENCY_SAFE_MARGIN 0.01
-
 template <int MaxThreadPerBlock, int MinBlockPerCu, typename Kernel, typename... Args>
 #if CK_TILE_USE_LAUNCH_BOUNDS
 __launch_bounds__(MaxThreadPerBlock, MinBlockPerCu)
@@ -65,71 +59,58 @@ CK_TILE_HOST void launch_and_check(const stream_config& sc, Callables&&... calla
     }
 }
 
-template <class it>
-typename std::iterator_traits<it>::value_type median(it begin, it end)
+// Measure the preprocess time during the cold iterations
+template <typename TimerType, typename PreprocessFunc>
+CK_TILE_HOST double
+preprocess_profiling_impl(TimerType timer, const stream_config& s, PreprocessFunc preprocess)
 {
-    if(begin == end)
+    timer.start(s.stream_id_);
+    for(int i = 0; i < s.nrepeat_; i++)
     {
-        return std::numeric_limits<double>::quiet_NaN();
+        if constexpr(!std::is_same_v<PreprocessFunc, std::nullptr_t>)
+        {
+            preprocess();
+        }
     }
-    auto n  = std::distance(begin, end);
-    auto n2 = n / 2;
-    std::nth_element(begin, begin + n2, end);
-    return (n % 2) ? begin[n2] : (*std::max_element(begin, begin + n2) + begin[n2]) / 2.0;
+    timer.stop(s.stream_id_);
+
+    return timer.duration() / s.nrepeat_;
 }
 
-inline void remove_outliers(std::vector<float>& v)
-{
-    // 1.5x IQR method to detect and remove outliers
-    auto n2 = v.size() / 2;
-    std::nth_element(v.begin(), v.begin() + n2, v.end());
-    auto q1  = median(v.begin(), v.begin() + n2);
-    auto q3  = median(v.begin() + ((v.size() % 2) ? n2 + 1 : n2), v.end());
-    auto iqr = q3 - q1;
-    auto lb  = q1 - 1.5 * iqr;
-    auto ub  = q3 + 1.5 * iqr;
-    v.erase(std::remove_if(v.begin(), v.end(), [&](float f) { return f < lb || f > ub; }), v.end());
-}
-
-template <typename TimerType, typename CallablesFunc>
+template <typename TimerType, typename CallablesFunc, typename PreprocessFunc = std::nullptr_t>
 CK_TILE_HOST double timing_loop_impl(TimerType timer,
                                      const stream_config& s,
                                      CallablesFunc&& callables_func,
-                                     std::function<void()> preprocess = nullptr)
+                                     PreprocessFunc preprocess = nullptr)
 {
     for(int i = 0; i < s.cold_niters_; i++)
     {
         callables_func();
     }
-
-    float per_iter_time = 0.f;
-    std::vector<float> times;
-    int i = 0;
-    while(i < s.nrepeat_ || per_iter_time < s.bench_time_ms_)
+    // Only profile preprocess if it's provided
+    auto preprocess_time = 0.0;
+    if constexpr(!std::is_same_v<PreprocessFunc, std::nullptr_t>)
     {
-        if(preprocess)
-            preprocess();
+        preprocess_time = preprocess_profiling_impl(gpu_timer{}, s, preprocess);
+    }
 
-        timer.start(s.stream_id_, i);
-        callables_func();
-        timer.stop(s.stream_id_, i);
-
-        if(i > 0)
+    int i = 0;
+    timer.start(s.stream_id_);
+    while(i < s.nrepeat_)
+    {
+        if constexpr(!std::is_same_v<PreprocessFunc, std::nullptr_t>)
         {
-            per_iter_time = timer.duration(i - 1);
-            times.push_back(per_iter_time);
-            per_iter_time = timer.is_exceed(i - 1);
+            preprocess();
         }
+
+        callables_func();
         i++;
     }
+    timer.stop(s.stream_id_);
 
     if(!i)
         return 0.;
-
-    per_iter_time = timer.duration(i - 1);
-    times.push_back(per_iter_time);
-    remove_outliers(times);
-    return std::accumulate(times.begin(), times.end(), 0.) / times.size();
+    return (timer.duration() / s.nrepeat_) - preprocess_time;
 }
 
 // clang-format off
@@ -174,7 +155,7 @@ CK_TILE_HOST float launch_kernel(const stream_config& s, Callables&&... callable
 
     if(s.is_gpu_timer_)
     {
-        return timing_loop_impl(gpu_timer_new{s.stream_id_}, s, callables_func);
+        return timing_loop_impl(gpu_timer{}, s, callables_func);
     }
     else
     {
@@ -199,7 +180,7 @@ launch_kernel_time_mask(const stream_config& s, PreprocessFunc preprocess, Calla
 
     if(s.is_gpu_timer_)
     {
-        return timing_loop_impl(gpu_timer_new{s.stream_id_}, s, callables_func, preprocess);
+        return timing_loop_impl(gpu_timer{}, s, callables_func, preprocess);
     }
     else
     {
diff --git a/include/ck_tile/host/stream_config.hpp b/include/ck_tile/host/stream_config.hpp
index 0c239d0a7d..acb861b2e7 100644
--- a/include/ck_tile/host/stream_config.hpp
+++ b/include/ck_tile/host/stream_config.hpp
@@ -20,6 +20,10 @@ namespace ck_tile {
  *
  *   // create stream config with _some_stream_id_, and benchmark using cpu timer
  *   stream_config s = stream_config{_some_stream_id_, true, 0, 3, 10, false};
+ *
+ *   // create stream config with _some_stream_id_, and enable gpu timer for rotating buffer with
+ *rotating buffer count stream_config s = stream_config{_some_stream_id_, true, 0, 3, 10, true,
+ *true, 1};
  **/
 
 struct stream_config
@@ -32,6 +36,5 @@ struct stream_config
     bool is_gpu_timer_     = true; // keep compatible
     bool flush_cache_      = false;
     int rotating_count_    = 1;
-    int bench_time_ms_     = 0;
 };
 } // namespace ck_tile
diff --git a/include/ck_tile/host/timer.hpp b/include/ck_tile/host/timer.hpp
index b4bff932e4..e5519643bf 100644
--- a/include/ck_tile/host/timer.hpp
+++ b/include/ck_tile/host/timer.hpp
@@ -48,100 +48,31 @@ struct gpu_timer
     hipEvent_t start_evt, stop_evt;
 };
 
-struct gpu_timer_new
-{
-    CK_TILE_HOST gpu_timer_new(const hipStream_t& s)
-    {
-        for(auto& e : start_event)
-        {
-            HIP_CHECK_ERROR(hipEventCreate(&e));
-        }
-        for(auto& e : stop_event)
-        {
-            HIP_CHECK_ERROR(hipEventCreate(&e));
-        }
-        HIP_CHECK_ERROR(hipEventCreate(&event0));
-        HIP_CHECK_ERROR(hipEventRecord(event0, s));
-    }
-
-    CK_TILE_HOST ~gpu_timer_new() noexcept(false)
-    {
-        for(auto& e : start_event)
-        {
-            HIP_CHECK_ERROR(hipEventDestroy(e));
-        }
-        for(auto& e : stop_event)
-        {
-            HIP_CHECK_ERROR(hipEventDestroy(e));
-        }
-        HIP_CHECK_ERROR(hipEventDestroy(event0));
-    }
-
-    CK_TILE_HOST void start(const hipStream_t& s, int idx = 0)
-    {
-        HIP_CHECK_ERROR(hipEventRecord(start_event[idx % 2], s));
-    }
-
-    CK_TILE_HOST void stop(const hipStream_t& s, int idx = 0)
-    {
-        HIP_CHECK_ERROR(hipEventRecord(stop_event[idx % 2], s));
-    }
-    // return in ms
-    CK_TILE_HOST float duration(int idx = 0) const
-    {
-        float ms;
-        HIP_CHECK_ERROR(hipEventSynchronize(stop_event[idx % 2]));
-        HIP_CHECK_ERROR(hipEventElapsedTime(&ms, start_event[idx % 2], stop_event[idx % 2]));
-        return ms;
-    }
-    CK_TILE_HOST float is_exceed(int idx = 0) const
-    {
-        float ms;
-        HIP_CHECK_ERROR(hipEventElapsedTime(&ms, event0, stop_event[idx % 2]));
-        return ms;
-    }
-
-    private:
-    std::array<hipEvent_t, 2> start_event;
-    std::array<hipEvent_t, 2> stop_event;
-    hipEvent_t event0;
-};
-
 struct cpu_timer
 {
     // torch.utils.benchmark.Timer(), there is a sync inside each timer callback
-    CK_TILE_HOST void start(const hipStream_t& s, [[maybe_unused]] int idx = 0)
+    CK_TILE_HOST void start(const hipStream_t& s)
     {
         HIP_CHECK_ERROR(hipStreamSynchronize(s));
-        start_tick  = std::chrono::high_resolution_clock::now();
-        time_event0 = std::chrono::high_resolution_clock::now();
+        start_tick = std::chrono::high_resolution_clock::now();
     }
     // torch.utils.benchmark.Timer(), there is a sync inside each timer callback
-    CK_TILE_HOST void stop(const hipStream_t& s, [[maybe_unused]] int idx = 0)
+    CK_TILE_HOST void stop(const hipStream_t& s)
     {
         HIP_CHECK_ERROR(hipStreamSynchronize(s));
         stop_tick = std::chrono::high_resolution_clock::now();
     }
     // return in ms
-    CK_TILE_HOST float duration([[maybe_unused]] int idx = 0) const
+    CK_TILE_HOST float duration() const
     {
         double sec =
             std::chrono::duration_cast<std::chrono::duration<double>>(stop_tick - start_tick)
                 .count();
         return static_cast<float>(sec * 1e3);
     }
-    // return in ms
-    CK_TILE_HOST float is_exceed([[maybe_unused]] int idx = 0) const
-    {
-        double sec =
-            std::chrono::duration_cast<std::chrono::duration<double>>(stop_tick - time_event0)
-                .count();
-        return static_cast<float>(sec * 1e3);
-    }
 
     private:
     std::chrono::time_point<std::chrono::high_resolution_clock> start_tick;
-    std::chrono::time_point<std::chrono::high_resolution_clock> time_event0;
     std::chrono::time_point<std::chrono::high_resolution_clock> stop_tick;
 };
 
diff --git a/tile_engine/ops/gemm/benchmark_gemm.cpp b/tile_engine/ops/gemm/benchmark_gemm.cpp
index 5f240c8fe4..db2b648437 100644
--- a/tile_engine/ops/gemm/benchmark_gemm.cpp
+++ b/tile_engine/ops/gemm/benchmark_gemm.cpp
@@ -34,8 +34,7 @@ void benchmark_gemm(const ck_tile::ArgParser& arg_parser)
                     arg_parser.get_bool("log"),
                     arg_parser.get_str("csv_filename"),
                     arg_parser.get_bool("flush_cache"),
-                    arg_parser.get_int("rotating_count"),
-                    arg_parser.get_int("bench_time")};
+                    arg_parser.get_int("rotating_count")};
 
     auto& profiler = GemmProfiler::instance(setting);
 
diff --git a/tile_engine/ops/gemm/benchmark_gemm.hpp b/tile_engine/ops/gemm/benchmark_gemm.hpp
index 993e7ea1f5..ce8a6e8234 100644
--- a/tile_engine/ops/gemm/benchmark_gemm.hpp
+++ b/tile_engine/ops/gemm/benchmark_gemm.hpp
@@ -125,7 +125,6 @@ struct Setting
     std::string csv_filename_;
     bool flush_cache_;
     int rotating_count_;
-    int bench_time_ms_;
 };
 
 inline std::string get_rocm_version()
diff --git a/tile_engine/ops/gemm/gemm_host_api.hpp b/tile_engine/ops/gemm/gemm_host_api.hpp
index 8b18aa703d..2c4af8955f 100644
--- a/tile_engine/ops/gemm/gemm_host_api.hpp
+++ b/tile_engine/ops/gemm/gemm_host_api.hpp
@@ -110,7 +110,6 @@ inline auto create_args(int argc, char* argv[])
                 "To flush cache, possible values are true or false. "
                 "Default is false.")
         .insert("rotating_count", "5", "number of iterations to rotate the cache. default is 5.")
-        .insert("bench_time", "0", "benchmark time in ms. default is 0 ms.")
         .insert("metric",
                 "0",
                 "Metric with which to measure kernel performance. Set to 0 for latency, 1 for "
diff --git a/tile_engine/ops/gemm/gemm_profiler.hpp b/tile_engine/ops/gemm/gemm_profiler.hpp
index d4efc7fa7f..634e19de6e 100644
--- a/tile_engine/ops/gemm/gemm_profiler.hpp
+++ b/tile_engine/ops/gemm/gemm_profiler.hpp
@@ -131,8 +131,7 @@ class GemmProfiler
                                                                      setting_.n_repeat_,
                                                                      setting_.is_gpu_timer_,
                                                                      setting_.flush_cache_,
-                                                                     setting_.rotating_count_,
-                                                                     setting_.bench_time_ms_});
+                                                                     setting_.rotating_count_});
             process_result(gemm_problem,
                            c_m_n_dev_buf,
                            c_m_n_host_result,