diff --git a/experimental/builder/include/ck_tile/builder/testing/conv/ck_tile.hpp b/experimental/builder/include/ck_tile/builder/testing/conv/ck_tile.hpp
index d25c15909e..862d965e5e 100644
--- a/experimental/builder/include/ck_tile/builder/testing/conv/ck_tile.hpp
+++ b/experimental/builder/include/ck_tile/builder/testing/conv/ck_tile.hpp
@@ -175,19 +175,40 @@ template <auto SIGNATURE, typename InDataType, typename WeiDataType, typename Ou
     constexpr index_t minimum_occupancy =
         Conv::GemmPipeline::Scheduler == ck_tile::GemmPipelineScheduler::Intrawave ? 1 : 2;
 
-    return RunResult::from_runtime(ck_tile::launch_kernel_time_mask(
-        s_conf,
-        preprocess,
-        ck_tile::make_kernel<minimum_occupancy>(conv, grids, blocks, 0, kargs),
-        ck_tile::make_kernel<minimum_occupancy>(elementwise_op,
-                                                kGridSize,
-                                                kBlockSize,
-                                                0,
-                                                input_size,
-                                                ck_tile::make_tuple(shape[1], 1), // Input Stride
-                                                ck_tile::make_tuple(shape[1], 1), // Output Stride
-                                                input_tensors,
-                                                static_cast<CDataType*>(c_ptr))));
+    if(s_conf.flush_cache_)
+    {
+        return RunResult::from_runtime(ck_tile::launch_kernel_time_mask_flush_cache(
+            s_conf,
+            preprocess,
+            ck_tile::make_kernel<minimum_occupancy>(conv, grids, blocks, 0, kargs),
+            ck_tile::make_kernel<minimum_occupancy>(
+                elementwise_op,
+                kGridSize,
+                kBlockSize,
+                0,
+                input_size,
+                ck_tile::make_tuple(shape[1], 1), // Input Stride
+                ck_tile::make_tuple(shape[1], 1), // Output Stride
+                input_tensors,
+                static_cast<CDataType*>(c_ptr))));
+    }
+    else
+    {
+        return RunResult::from_runtime(ck_tile::launch_kernel_time_mask(
+            s_conf,
+            preprocess,
+            ck_tile::make_kernel<minimum_occupancy>(conv, grids, blocks, 0, kargs),
+            ck_tile::make_kernel<minimum_occupancy>(
+                elementwise_op,
+                kGridSize,
+                kBlockSize,
+                0,
+                input_size,
+                ck_tile::make_tuple(shape[1], 1), // Input Stride
+                ck_tile::make_tuple(shape[1], 1), // Output Stride
+                input_tensors,
+                static_cast<CDataType*>(c_ptr))));
+    }
 }
 
 } // namespace detail
diff --git a/include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_multiple_d_xdl_cshuffle_v3.hpp b/include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_multiple_d_xdl_cshuffle_v3.hpp
index e25c03b1a8..01078ba67f 100644
--- a/include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_multiple_d_xdl_cshuffle_v3.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_multiple_d_xdl_cshuffle_v3.hpp
@@ -479,7 +479,7 @@ struct DeviceBatchedGemmMultiD_Xdl_CShuffle_V3
             const bool has_main_k_block_loop = GridwiseGemm::CalculateHasMainKBlockLoop(K_split);
 
             const auto Run = [&](const auto& kernel) {
-                if(stream_config.flush_cache)
+                if(stream_config.flush_cache && stream_config.rotating_count > 1)
                 {
 
                     std::array<std::size_t, NumDTensor> DsSize;
@@ -534,6 +534,27 @@ struct DeviceBatchedGemmMultiD_Xdl_CShuffle_V3
                         0,
                         arg_);
                 }
+                else if(stream_config.flush_cache)
+                {
+                    const auto clear_workspace = [&]() {
+                        if(arg.KBatch > 1)
+                            hipGetErrorString(
+                                hipMemsetAsync(arg.p_c_grid,
+                                               0,
+                                               arg.Batch * arg.M * arg.N * sizeof(CDataType),
+                                               stream_config.stream_id_));
+                    };
+
+                    BatchGemmArgument arg_ = reinterpret_cast<const BatchGemmArgument&>(arg);
+                    ave_time =
+                        launch_and_time_kernel_with_preprocess_flush_cache(stream_config,
+                                                                           clear_workspace,
+                                                                           kernel,
+                                                                           dim3(gdx, gdy, gdz),
+                                                                           dim3(BlockSize),
+                                                                           0,
+                                                                           arg_);
+                }
                 else
                 {
                     const auto clear_workspace = [&]() {
diff --git a/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_weight_two_stage_xdl_cshuffle.hpp b/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_weight_two_stage_xdl_cshuffle.hpp
index 5228bdee98..99ec3387dc 100644
--- a/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_weight_two_stage_xdl_cshuffle.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_weight_two_stage_xdl_cshuffle.hpp
@@ -1031,30 +1031,14 @@ struct DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle
             const auto Run = [&](const auto& kernel) {
                 if(stream_config.flush_cache)
                 {
-                    typename GridwiseGemm::Argument gemm_arg_ = gemm_arg;
-                    ck::utility::RotatingMemWrapper<typename GridwiseGemm::Argument> rotating_mem(
-                        gemm_arg_,
-                        stream_config.rotating_count,
-                        gemm_arg_.M * gemm_arg_.K * sizeof(ADataType),
-                        gemm_arg_.K * gemm_arg_.N * sizeof(BDataType));
-                    rotating_mem.Print();
-
-                    auto run_flush_cache = [&]() {
-                        // flush icache
-                        ck::utility::flush_icache();
-                        // rotating mem
-                        rotating_mem.Next();
-                        clear_workspace();
-                    };
-
-                    ave_time += ck::utility::launch_and_time_kernel_with_preprocess<false>(
+                    ave_time += launch_and_time_kernel_with_preprocess_flush_cache(
                         stream_config,
-                        run_flush_cache,
+                        clear_workspace,
                         kernel,
                         dim3(gdx, gdy, gdz),
                         dim3(BlockSize),
                         0,
-                        gemm_arg_,
+                        gemm_arg,
                         arg.a_grid_desc_k0_m_k1_,
                         arg.b_grid_desc_k0_n_k1_,
                         arg.c_grid_desc_mblock_mperblock_nblock_nperblock_,
diff --git a/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_weight_xdl_cshuffle.hpp b/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_weight_xdl_cshuffle.hpp
index 585454221a..46a9009f83 100644
--- a/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_weight_xdl_cshuffle.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_weight_xdl_cshuffle.hpp
@@ -998,30 +998,58 @@ struct DeviceGroupedConvBwdWeight_Xdl_CShuffle
                     hip_check_error(hipMemsetAsync(
                         p_e_grid, 0, arg.c_space_size_bytes, stream_config.stream_id_));
                 };
-
-                avg_time += launch_and_time_kernel_with_preprocess(
-                    stream_config,
-                    clear_workspace,
-                    kernel,
-                    dim3(grid_size),
-                    dim3(BlockSize),
-                    0,
-                    p_a_grid,
-                    p_b_grid,
-                    p_e_grid,
-                    arg.a_element_op_,
-                    arg.b_element_op_,
-                    arg.c_element_op_,
-                    arg.Conv_G_,
-                    arg.a_grid_desc_kbatch_k0_m_k1_,
-                    arg.b_grid_desc_kbatch_k0_n_k1_,
-                    c_grid_desc_mblock_mperblock_nblock_nperblock,
-                    arg.block_2_ctile_map_,
-                    arg.compute_ptr_offset_of_batch_,
-                    arg.split_k_stride_a_,
-                    arg.split_k_stride_b_,
-                    arg.split_k_offset_hack_,
-                    arg.k_batch_);
+                if(stream_config.flush_cache)
+                {
+                    avg_time += launch_and_time_kernel_with_preprocess_flush_cache(
+                        stream_config,
+                        clear_workspace,
+                        kernel,
+                        dim3(grid_size),
+                        dim3(BlockSize),
+                        0,
+                        p_a_grid,
+                        p_b_grid,
+                        p_e_grid,
+                        arg.a_element_op_,
+                        arg.b_element_op_,
+                        arg.c_element_op_,
+                        arg.Conv_G_,
+                        arg.a_grid_desc_kbatch_k0_m_k1_,
+                        arg.b_grid_desc_kbatch_k0_n_k1_,
+                        c_grid_desc_mblock_mperblock_nblock_nperblock,
+                        arg.block_2_ctile_map_,
+                        arg.compute_ptr_offset_of_batch_,
+                        arg.split_k_stride_a_,
+                        arg.split_k_stride_b_,
+                        arg.split_k_offset_hack_,
+                        arg.k_batch_);
+                }
+                else
+                {
+                    avg_time += launch_and_time_kernel_with_preprocess(
+                        stream_config,
+                        clear_workspace,
+                        kernel,
+                        dim3(grid_size),
+                        dim3(BlockSize),
+                        0,
+                        p_a_grid,
+                        p_b_grid,
+                        p_e_grid,
+                        arg.a_element_op_,
+                        arg.b_element_op_,
+                        arg.c_element_op_,
+                        arg.Conv_G_,
+                        arg.a_grid_desc_kbatch_k0_m_k1_,
+                        arg.b_grid_desc_kbatch_k0_n_k1_,
+                        c_grid_desc_mblock_mperblock_nblock_nperblock,
+                        arg.block_2_ctile_map_,
+                        arg.compute_ptr_offset_of_batch_,
+                        arg.split_k_stride_a_,
+                        arg.split_k_stride_b_,
+                        arg.split_k_offset_hack_,
+                        arg.k_batch_);
+                }
             };
 
             if(has_main_k0_block_loop)
diff --git a/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_weight_xdl_cshuffle_v3.hpp b/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_weight_xdl_cshuffle_v3.hpp
index 718b04b955..2ab60581e7 100644
--- a/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_weight_xdl_cshuffle_v3.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_weight_xdl_cshuffle_v3.hpp
@@ -805,29 +805,14 @@ struct DeviceGroupedConvBwdWeight_Xdl_CShuffleV3
             const auto Run = [&](const auto& kernel) {
                 if(stream_config.flush_cache)
                 {
-                    typename GridwiseGemm::Argument gemm_arg_ = gemm_arg;
-                    ck::utility::RotatingMemWrapper<typename GridwiseGemm::Argument> rotating_mem(
-                        gemm_arg_,
-                        stream_config.rotating_count,
-                        gemm_arg_.M * gemm_arg_.K * sizeof(ADataType),
-                        gemm_arg_.K * gemm_arg_.N * sizeof(BDataType));
-                    rotating_mem.Print();
-
-                    auto run_flush_cache = [&]() {
-                        // flush icache
-                        ck::utility::flush_icache();
-                        // rotating mem
-                        rotating_mem.Next();
-                        clear_workspace();
-                    };
-                    ave_time += ck::utility::launch_and_time_kernel_with_preprocess<false>(
+                    ave_time += launch_and_time_kernel_with_preprocess_flush_cache(
                         stream_config,
-                        run_flush_cache,
+                        clear_workspace,
                         kernel,
                         dim3(gdx, gdy, gdz),
                         dim3(BlockSize),
                         0,
-                        gemm_arg_,
+                        gemm_arg,
                         arg.a_grid_desc_k0_m_k1_,
                         arg.b_grid_desc_k0_n_k1_,
                         arg.c_grid_desc_mblock_mperblock_nblock_nperblock_,
diff --git a/profiler/include/profiler/grouped_convolution_backward_weight_tile_algs.hpp b/profiler/include/profiler/grouped_convolution_backward_weight_tile_algs.hpp
index cf9a40b274..f69c5bb7a1 100644
--- a/profiler/include/profiler/grouped_convolution_backward_weight_tile_algs.hpp
+++ b/profiler/include/profiler/grouped_convolution_backward_weight_tile_algs.hpp
@@ -114,7 +114,8 @@ run_grouped_conv_backward_weight_tile_algs(const ckt::Args<SIGNATURE>& args,
                                            const ckt::Outputs<SIGNATURE>& outputs,
                                            const ck_tile::stream_config& s_conf)
 {
-    float best_avg_time = std::numeric_limits<float>::max();
+    bool dummy_run_executed = false;
+    float best_avg_time     = std::numeric_limits<float>::max();
     std::string best_op_name, op_name;
     int best_split_k;
     bool is_supported;
@@ -154,6 +155,13 @@ run_grouped_conv_backward_weight_tile_algs(const ckt::Args<SIGNATURE>& args,
         {
             ckt::Args<SIGNATURE> args_k_batch = args;
             args_k_batch.k_batch              = k_batch;
+            if((s_conf.time_kernel_ || s_conf.flush_cache_) && !dummy_run_executed)
+            {
+                // Profiling warm-up: run the first instance twice and keep only the second result
+                std::tie(is_supported, avg_time, op_name) =
+                    run_alg_func(args_k_batch, inputs, outputs, s_conf);
+                dummy_run_executed = true;
+            }
             std::tie(is_supported, avg_time, op_name) =
                 run_alg_func(args_k_batch, inputs, outputs, s_conf);
             if(is_supported)
diff --git a/profiler/include/profiler/profile_grouped_conv_bwd_weight_impl.hpp b/profiler/include/profiler/profile_grouped_conv_bwd_weight_impl.hpp
index 9b7a68224f..fb4d312249 100644
--- a/profiler/include/profiler/profile_grouped_conv_bwd_weight_impl.hpp
+++ b/profiler/include/profiler/profile_grouped_conv_bwd_weight_impl.hpp
@@ -272,7 +272,8 @@ bool profile_grouped_conv_bwd_weight_impl(int do_verification,
     index_t valid_instances     = 0;
 
     // profile device Conv instances
-    bool all_pass = true;
+    bool all_pass           = true;
+    bool dummy_run_executed = false;
 
     std::array<ck::index_t, NDimSpatial + 3> input_lengths{};
     std::array<ck::index_t, NDimSpatial + 3> filter_lengths{};
@@ -400,8 +401,25 @@ bool profile_grouped_conv_bwd_weight_impl(int do_verification,
 
                 auto invoker_ptr = op_ptr->MakeInvokerPointer();
 
-                float avg_time =
-                    invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, time_kernel});
+                if(time_kernel && !dummy_run_executed)
+                {
+                    // Dummy warm-up run so that the first instance is timed properly
+                    invoker_ptr->Run(argument_ptr.get(),
+                                     StreamConfig{nullptr,
+                                                  time_kernel,
+                                                  0 /*log_level*/,
+                                                  5 /*cold_iters*/,
+                                                  50 /*nrepeat_*/,
+                                                  time_kernel /*flush_cache*/});
+                    dummy_run_executed = true;
+                }
+                float avg_time = invoker_ptr->Run(argument_ptr.get(),
+                                                  StreamConfig{nullptr,
+                                                               time_kernel,
+                                                               0 /*log_level*/,
+                                                               5 /*cold_iters*/,
+                                                               50 /*nrepeat_*/,
+                                                               time_kernel /*flush_cache*/});
 
                 std::size_t flop      = conv_param.GetFlops();
                 std::size_t num_btype = conv_param.GetByte<InDataType, WeiDataType, OutDataType>();
diff --git a/profiler/src/profile_grouped_conv_bwd_weight_tile.cpp b/profiler/src/profile_grouped_conv_bwd_weight_tile.cpp
index 2c8258f280..7ee82fe8a9 100644
--- a/profiler/src/profile_grouped_conv_bwd_weight_tile.cpp
+++ b/profiler/src/profile_grouped_conv_bwd_weight_tile.cpp
@@ -141,7 +141,8 @@ int call_profiler(const ckt::Args<SIGNATURE>& args, const std::string& split_k,
                                    0 /*log_level*/,
                                    5 /*cold_iters*/,
                                    50 /*nrepeat_*/,
-                                   true /*is_gpu_timer_*/});
+                                   true /*is_gpu_timer_*/,
+                                   time_kernel /*flush_cache*/});
     if(time_kernel)
     {
         std::cout << "\nBest configuration parameters:" << "\n\tname: " << op_name
@@ -208,6 +209,14 @@ int profile_grouped_conv_bwd_weight_tile(int argc, char* argv[])
                     split_k,
                     time_kernel);
             }
+            else if(data_type == ConvDataType::F32_F32_F32)
+            {
+                constexpr auto SIGNATURE = ckp::SIGNATURE_NHWGC_FP32_BWD_WEIGHT;
+                return call_profiler<SIGNATURE>(
+                    ckp::parse_conv_args<SIGNATURE>(conv_params_start_idx, argv),
+                    split_k,
+                    time_kernel);
+            }
         }
         else if(num_dim_spatial == 3)
         {
@@ -227,6 +236,14 @@ int profile_grouped_conv_bwd_weight_tile(int argc, char* argv[])
                     split_k,
                     time_kernel);
             }
+            else if(data_type == ConvDataType::F32_F32_F32)
+            {
+                constexpr auto SIGNATURE = ckp::SIGNATURE_NDHWGC_FP32_BWD_WEIGHT;
+                return call_profiler<SIGNATURE>(
+                    ckp::parse_conv_args<SIGNATURE>(conv_params_start_idx, argv),
+                    split_k,
+                    time_kernel);
+            }
         }
     }