diff --git a/library/include/ck_tile/library/tensor_operation_instance/gpu/tile_grouped_conv_bwd_weight_bf16_instances.hpp b/library/include/ck_tile/library/tensor_operation_instance/gpu/tile_grouped_conv_bwd_weight_bf16_instances.hpp
index ed8bbe903e..fe197abfdb 100644
--- a/library/include/ck_tile/library/tensor_operation_instance/gpu/tile_grouped_conv_bwd_weight_bf16_instances.hpp
+++ b/library/include/ck_tile/library/tensor_operation_instance/gpu/tile_grouped_conv_bwd_weight_bf16_instances.hpp
@@ -49,7 +49,7 @@ using tile_grouped_conv_bwd_weight_bf16_instances = std::tuple<
     // GroupedConvolutionBackwardWeightInvoker<NDimSpatial,   ALayout,   BLayout,   ELayout,   BF16,    BF16,    BF16,  PassThrough, PassThrough, PassThrough,        1,      16,      16,      64,      1,      1,      1,     16,     16,     32,      4,     4,       2,    true,  CK_TILE_PIPELINE_COMPUTE_V4>,
     // GroupedConvolutionBackwardWeightInvoker<NDimSpatial,   ALayout,   BLayout,   ELayout,   BF16,    BF16,    BF16,  PassThrough, PassThrough, PassThrough,        2,      16,      16,      64,      1,      1,      1,     16,     16,     32,      4,     4,       2,    true,  CK_TILE_PIPELINE_COMPUTE_V4>,
     // GroupedConvolutionBackwardWeightInvoker<NDimSpatial,   ALayout,   BLayout,   ELayout,   BF16,    BF16,    BF16,  PassThrough, PassThrough, PassThrough,        1,      16,      16,      64,      1,      1,      1,     16,     16,     32,      4,     4,       4,    true,  CK_TILE_PIPELINE_COMPUTE_V4>,
-    GroupedConvolutionBackwardWeightInvoker<NDimSpatial,   ALayout,   BLayout,   ELayout,   BF16,    BF16,    BF16,  PassThrough, PassThrough, PassThrough,        2,      16,      16,      64,      1,      1,      1,     16,     16,     32,      4,     4,       4,    true,  CK_TILE_PIPELINE_COMPUTE_V4>
+    GroupedConvolutionBackwardWeightInvoker<NDimSpatial,   ALayout,   BLayout,   ELayout,   BF16,    BF16,    BF16,  PassThrough, PassThrough, PassThrough,        2,      16,      16,      64,      1,      1,      1,     16,     16,     32,      2,     2,       2,    true,  CK_TILE_PIPELINE_COMPUTE_V4>
     // clang-format on
 >;
 
diff --git a/library/include/ck_tile/library/tensor_operation_instance/gpu/tile_grouped_conv_bwd_weight_invoker.hpp b/library/include/ck_tile/library/tensor_operation_instance/gpu/tile_grouped_conv_bwd_weight_invoker.hpp
index ca32e9f0c7..0e920eb1b3 100644
--- a/library/include/ck_tile/library/tensor_operation_instance/gpu/tile_grouped_conv_bwd_weight_invoker.hpp
+++ b/library/include/ck_tile/library/tensor_operation_instance/gpu/tile_grouped_conv_bwd_weight_invoker.hpp
@@ -32,7 +32,7 @@ template <ck_tile::index_t NDimSpatial,
 struct GroupedConvolutionBackwardWeightBaseInvoker
 {
     virtual bool IsSupportedArgument(const ck_tile::GroupedConvBwdWeightHostArgs& args) const = 0; 
-    virtual float Run(const ck_tile::GroupedConvBwdWeightHostArgs& args, bool time_kernel) const = 0;
+    virtual float Run(const ck_tile::GroupedConvBwdWeightHostArgs& args, bool time_kernel, int n_warmup, int n_repeat) const = 0;
     virtual std::string GetName(const ck_tile::GroupedConvBwdWeightHostArgs& args) const = 0;
     GroupedConvolutionBackwardWeightBaseInvoker() = default;
     GroupedConvolutionBackwardWeightBaseInvoker(const GroupedConvolutionBackwardWeightBaseInvoker&) = default;
@@ -197,7 +197,7 @@ struct GroupedConvolutionBackwardWeightInvoker :
         return Kernel::IsSupportedArgument(args);
     };
 
-    float Run(const ck_tile::GroupedConvBwdWeightHostArgs& args, bool time_kernel) const override
+    float Run(const ck_tile::GroupedConvBwdWeightHostArgs& args, bool time_kernel, int n_warmup=5, int n_repeat=50) const override
     {
         const ck_tile::index_t gemm_k =
             args.N_ * std::accumulate(args.output_spatial_lengths_.begin(),
@@ -226,8 +226,6 @@ struct GroupedConvolutionBackwardWeightInvoker :
             const dim3 grids  = Kernel::GridSize(args);
             const dim3 blocks = Kernel::BlockSize();
                    
-            constexpr int n_warmup = 5;
-            constexpr int n_repeat = 50;
             ck_tile::stream_config s {nullptr, time_kernel, 1, n_warmup, n_repeat};
 
             ave_time = ck_tile::launch_kernel(
diff --git a/library/include/ck_tile/library/tensor_operation_instance/gpu/tile_grouped_conv_fwd_bf16_instances.hpp b/library/include/ck_tile/library/tensor_operation_instance/gpu/tile_grouped_conv_fwd_bf16_instances.hpp
index a941133f3a..d361c58854 100644
--- a/library/include/ck_tile/library/tensor_operation_instance/gpu/tile_grouped_conv_fwd_bf16_instances.hpp
+++ b/library/include/ck_tile/library/tensor_operation_instance/gpu/tile_grouped_conv_fwd_bf16_instances.hpp
@@ -19,10 +19,10 @@ using tile_grouped_conv_fwd_bf16_instances = std::tuple<
     //##############################|       Num|  InLayout| WeiLayout| OutLayout| InData| WeiData| OutData|           In|         Wei|         Out|  K-block|  M-tile| N-tile | K-tile | M-warp| N-warp| K-warp| M-warp| N-warp| K-warp| Vector| Vector| Vector|  Double|                         GEMM|
     //##############################|       Dim|          |          |          |   Type|    Type|    Type|  Elementwise| Elementwise| Elementwise|      per|        |        |        |       |       |       |   tile|   tile|   tile|   size|   size|   size|    smem|                     pipeline|
     //##############################|   Spatial|          |          |          |       |        |        |    Operation|   Operation|   Operation|       CU|        |        |        |       |       |       |   size|   size|   size|      A|      B|      C|  buffer|                      version|
-    GroupedConvolutionForwardInvoker<NDimSpatial,   ALayout,   BLayout,   ELayout,   BF16,    BF16,    BF16,  PassThrough, PassThrough, PassThrough,        2,      64,      64,      64,      2,      2,      1,     32,     32,     16,      2,     4,       2,   false,  CK_TILE_PIPELINE_COMPUTE_V3>
-    // GroupedConvolutionForwardInvoker<NDimSpatial,   ALayout,   BLayout,   ELayout,   BF16,    BF16,    BF16,  PassThrough, PassThrough, PassThrough,        2,      64,      64,      64,      2,      2,      1,     32,     32,      8,      2,     4,       2,   false,  CK_TILE_PIPELINE_COMPUTE_V3>,
-    // GroupedConvolutionForwardInvoker<NDimSpatial,   ALayout,   BLayout,   ELayout,   BF16,    BF16,    BF16,  PassThrough, PassThrough, PassThrough,        2,      64,      64,      64,      2,      2,      1,     16,     16,     32,      2,     4,       2,   false,  CK_TILE_PIPELINE_COMPUTE_V3>,
-    // GroupedConvolutionForwardInvoker<NDimSpatial,   ALayout,   BLayout,   ELayout,   BF16,    BF16,    BF16,  PassThrough, PassThrough, PassThrough,        2,      64,      64,      64,      2,      2,      1,     16,     16,     32,      4,     4,       4,   false,  CK_TILE_PIPELINE_COMPUTE_V3>,
+    GroupedConvolutionForwardInvoker<NDimSpatial,   ALayout,   BLayout,   ELayout,   BF16,    BF16,    BF16,  PassThrough, PassThrough, PassThrough,        2,      64,      64,      64,      2,      2,      1,     32,     32,     16,      2,     4,       2,   false,  CK_TILE_PIPELINE_COMPUTE_V3>,
+    GroupedConvolutionForwardInvoker<NDimSpatial,   ALayout,   BLayout,   ELayout,   BF16,    BF16,    BF16,  PassThrough, PassThrough, PassThrough,        2,      64,      64,      64,      2,      2,      1,     32,     32,      8,      2,     4,       2,   false,  CK_TILE_PIPELINE_COMPUTE_V3>,
+    GroupedConvolutionForwardInvoker<NDimSpatial,   ALayout,   BLayout,   ELayout,   BF16,    BF16,    BF16,  PassThrough, PassThrough, PassThrough,        2,      64,      64,      64,      2,      2,      1,     16,     16,     32,      2,     4,       2,   false,  CK_TILE_PIPELINE_COMPUTE_V3>,
+    GroupedConvolutionForwardInvoker<NDimSpatial,   ALayout,   BLayout,   ELayout,   BF16,    BF16,    BF16,  PassThrough, PassThrough, PassThrough,        2,      64,      64,      64,      2,      2,      1,     16,     16,     32,      4,     4,       4,   false,  CK_TILE_PIPELINE_COMPUTE_V3>
     // GroupedConvolutionForwardInvoker<NDimSpatial,   ALayout,   BLayout,   ELayout,   BF16,    BF16,    BF16,  PassThrough, PassThrough, PassThrough,        2,      64,      64,      64,      2,      2,      1,     16,     16,     32,      8,     8,       8,   false,  CK_TILE_PIPELINE_COMPUTE_V3>,
     // GroupedConvolutionForwardInvoker<NDimSpatial,   ALayout,   BLayout,   ELayout,   BF16,    BF16,    BF16,  PassThrough, PassThrough, PassThrough,        1,      64,      64,      64,      1,      1,      1,     32,     32,      8,      2,     2,       2,   false,  CK_TILE_PIPELINE_COMPUTE_V3>,
     // GroupedConvolutionForwardInvoker<NDimSpatial,   ALayout,   BLayout,   ELayout,   BF16,    BF16,    BF16,  PassThrough, PassThrough, PassThrough,        1,      64,      64,      64,      1,      1,      1,     32,     32,     16,      2,     2,       2,   false,  CK_TILE_PIPELINE_COMPUTE_V3>,
diff --git a/library/include/ck_tile/library/tensor_operation_instance/gpu/tile_grouped_conv_fwd_invoker.hpp b/library/include/ck_tile/library/tensor_operation_instance/gpu/tile_grouped_conv_fwd_invoker.hpp
index 854ab3a3f3..17c2d0a06b 100644
--- a/library/include/ck_tile/library/tensor_operation_instance/gpu/tile_grouped_conv_fwd_invoker.hpp
+++ b/library/include/ck_tile/library/tensor_operation_instance/gpu/tile_grouped_conv_fwd_invoker.hpp
@@ -32,7 +32,7 @@ template <ck_tile::index_t NDimSpatial,
 struct GroupedConvolutionForwardBaseInvoker
 {
     virtual bool IsSupportedArgument(const ck_tile::GroupedConvFwdHostArgs& args) const = 0; 
-    virtual float Run(const ck_tile::GroupedConvFwdHostArgs& args, bool time_kernel) const = 0;
+    virtual float Run(const ck_tile::GroupedConvFwdHostArgs& args, bool time_kernel, int n_warmup, int n_repeat) const = 0;
     virtual std::string GetName(const ck_tile::GroupedConvFwdHostArgs& args) const = 0;
     GroupedConvolutionForwardBaseInvoker() = default;
     GroupedConvolutionForwardBaseInvoker(const GroupedConvolutionForwardBaseInvoker&) = default;
@@ -198,7 +198,7 @@ struct GroupedConvolutionForwardInvoker :
         return Kernel::IsSupportedArgument(args);
     };
 
-    float Run(const ck_tile::GroupedConvFwdHostArgs& args, bool time_kernel) const override
+    float Run(const ck_tile::GroupedConvFwdHostArgs& args, bool time_kernel, int n_warmup=5, int n_repeat=50) const override
     {
         const ck_tile::index_t gemm_k =
             args.C_ * std::accumulate(args.filter_spatial_lengths_.begin(),
@@ -227,8 +227,6 @@ struct GroupedConvolutionForwardInvoker :
             const dim3 grids  = Kernel::GridSize(args);
             const dim3 blocks = Kernel::BlockSize();
                    
-            constexpr int n_warmup = 5;
-            constexpr int n_repeat = 50;
             ck_tile::stream_config s {nullptr, time_kernel, 1, n_warmup, n_repeat};
 
             ave_time = ck_tile::launch_kernel(
diff --git a/profiler/ck_tile/include/tile_profile_grouped_conv_bwd_weight_impl.hpp b/profiler/ck_tile/include/tile_profile_grouped_conv_bwd_weight_impl.hpp
index bbf663c1d0..b9854cbed8 100644
--- a/profiler/ck_tile/include/tile_profile_grouped_conv_bwd_weight_impl.hpp
+++ b/profiler/ck_tile/include/tile_profile_grouped_conv_bwd_weight_impl.hpp
@@ -97,14 +97,6 @@ bool profile_grouped_conv_bwd_weight_impl(int do_verification,
         output.SetZero();
     }
 
-    ck_tile::DeviceMem input_dev_buf(input.get_element_space_size_in_bytes());
-    ck_tile::DeviceMem weight_dev_buf(weight.get_element_space_size_in_bytes());
-    ck_tile::DeviceMem output_dev_buf(output.get_element_space_size_in_bytes());
-
-    input_dev_buf.ToDevice(input.data());
-    weight_dev_buf.SetZero();
-    output_dev_buf.ToDevice(output.data());
-
     using DeviceOp = ops::GroupedConvolutionBackwardWeightBaseInvoker<
                                                         NDimSpatial,
                                                         InLayout,
@@ -154,6 +146,14 @@ bool profile_grouped_conv_bwd_weight_impl(int do_verification,
             auto split_k_value     = split_k_list[split_k_id];
             auto split_k_param_str = std::to_string(split_k_value);
 
+            ck_tile::DeviceMem input_dev_buf(input.get_element_space_size_in_bytes());
+            ck_tile::DeviceMem weight_dev_buf(weight.get_element_space_size_in_bytes());
+            ck_tile::DeviceMem output_dev_buf(output.get_element_space_size_in_bytes());
+
+            input_dev_buf.ToDevice(input.data());
+            weight_dev_buf.SetZero();
+            output_dev_buf.ToDevice(output.data());
+
             ck_tile::GroupedConvBwdWeightHostArgs args(conv_param,
                                                input_dev_buf.GetDeviceBuffer(),
                                                weight_dev_buf.GetDeviceBuffer(),
@@ -174,22 +174,16 @@ bool profile_grouped_conv_bwd_weight_impl(int do_verification,
                 std::string op_name = op->GetName(args);
                 std::cout << op_name << ", SplitK " << split_k_param_str << " is profiled..." << std::endl;
 
-                float avg_time = op->Run(args, time_kernel);
-
-                std::size_t flop      = conv_param.GetFlops();
-                std::size_t num_btype = conv_param.GetByte<InDataType, WeiDataType, OutDataType>();
-
-                float tflops     = static_cast<float>(flop) / 1.E9 / avg_time;
-                float gb_per_sec = num_btype / 1.E6 / avg_time;
-
-                std::cout << "Perf: " << std::setw(10) << avg_time << " ms, " << tflops
-                          << " TFlops, " << gb_per_sec << " GB/s, " << op_name << ", SplitK "
-                          << split_k_param_str << std::endl;
-
+                // Run verification first. If it doesn't pass, no need to do performance measurement.
                 bool pass = false; 
                 if(do_verification)
                 {
+                    constexpr int n_warmup = 0;
+                    constexpr int n_repeat = 1;
+
+                    op->Run(args, false, n_warmup, n_repeat);
                     weight_dev_buf.FromDevice(weight.data());
+
                     ck_tile::HostTensor<WeiDataType> weight_host_ref(wei_g_k_c_xs_desc);
                     weight_host_ref.SetZero();
 
@@ -223,13 +217,30 @@ bool profile_grouped_conv_bwd_weight_impl(int do_verification,
 
                 bool is_valid = do_verification ? pass : true;
 
-                if(tflops > best_tflops && is_valid)
+                if (is_valid)
                 {
-                    best_op_name    = op_name;
-                    best_tflops     = tflops;
-                    best_avg_time   = avg_time;
-                    best_gb_per_sec = gb_per_sec;
-                    best_split_k    = split_k_param_str;
+                    constexpr int n_warmup = 5;
+                    constexpr int n_repeat = 50;
+                    float avg_time = op->Run(args, time_kernel, n_warmup, n_repeat);
+
+                    std::size_t flop      = conv_param.GetFlops();
+                    std::size_t num_btype = conv_param.GetByte<InDataType, WeiDataType, OutDataType>();
+
+                    float tflops     = static_cast<float>(flop) / 1.E9 / avg_time;
+                    float gb_per_sec = num_btype / 1.E6 / avg_time;
+
+                    std::cout << "Perf: " << std::setw(10) << avg_time << " ms, " << tflops
+                            << " TFlops, " << gb_per_sec << " GB/s, " << op_name << ", SplitK "
+                            << split_k_param_str << std::endl;
+                    
+                    if(tflops > best_tflops)
+                    {
+                        best_op_name    = op_name;
+                        best_tflops     = tflops;
+                        best_avg_time   = avg_time;
+                        best_gb_per_sec = gb_per_sec;
+                        best_split_k    = split_k_param_str;
+                    }
                 }
             }
             else 
diff --git a/profiler/ck_tile/include/tile_profile_grouped_conv_fwd_impl.hpp b/profiler/ck_tile/include/tile_profile_grouped_conv_fwd_impl.hpp
index 050c9205c6..daf47f171d 100644
--- a/profiler/ck_tile/include/tile_profile_grouped_conv_fwd_impl.hpp
+++ b/profiler/ck_tile/include/tile_profile_grouped_conv_fwd_impl.hpp
@@ -100,14 +100,6 @@ bool profile_grouped_conv_fwd_impl(int do_verification,
         weight.SetZero();
     }
 
-    ck_tile::DeviceMem input_dev_buf(input.get_element_space_size_in_bytes());
-    ck_tile::DeviceMem weight_dev_buf(weight.get_element_space_size_in_bytes());
-    ck_tile::DeviceMem output_dev_buf(output.get_element_space_size_in_bytes());
-
-    input_dev_buf.ToDevice(input.data());
-    weight_dev_buf.ToDevice(output.data());
-    output_dev_buf.SetZero();
-
     using DeviceOp = ops::GroupedConvolutionForwardBaseInvoker<
                                                         NDimSpatial,
                                                         InLayout,
@@ -136,6 +128,14 @@ bool profile_grouped_conv_fwd_impl(int do_verification,
     bool all_pass = true;
     for(auto& op : ops)
     {
+        ck_tile::DeviceMem input_dev_buf(input.get_element_space_size_in_bytes());
+        ck_tile::DeviceMem weight_dev_buf(weight.get_element_space_size_in_bytes());
+        ck_tile::DeviceMem output_dev_buf(output.get_element_space_size_in_bytes());
+
+        input_dev_buf.ToDevice(input.data());
+        weight_dev_buf.ToDevice(output.data());
+        output_dev_buf.SetZero();
+
         ck_tile::GroupedConvFwdHostArgs args(conv_param,
                                             input_dev_buf.GetDeviceBuffer(),
                                             weight_dev_buf.GetDeviceBuffer(),
@@ -155,20 +155,14 @@ bool profile_grouped_conv_fwd_impl(int do_verification,
             std::string op_name = op->GetName(args);
             std::cout << op_name << " is profiled..." << std::endl;
 
-            float avg_time = op->Run(args, time_kernel);
-
-            std::size_t flop      = conv_param.GetFlops();
-            std::size_t num_btype = conv_param.GetByte<InDataType, WeiDataType, OutDataType>();
-
-            float tflops     = static_cast<float>(flop) / 1.E9 / avg_time;
-            float gb_per_sec = num_btype / 1.E6 / avg_time;
-
-            std::cout << "Perf: " << std::setw(10) << avg_time << " ms, " << tflops
-                        << " TFlops, " << gb_per_sec << " GB/s, " << op_name << std::endl;
-
+            // Run verification first. If it doesn't pass, no need to do performance measurement.
             bool pass = false; 
             if(do_verification)
             {
+                constexpr int n_warmup = 0;
+                constexpr int n_repeat = 1;
+
+                op->Run(args, false, n_warmup, n_repeat);
                 output_dev_buf.FromDevice(output.data());
 
                 ck_tile::HostTensor<OutDataType> output_host_ref(out_g_n_k_wos_desc);
@@ -202,13 +196,28 @@ bool profile_grouped_conv_fwd_impl(int do_verification,
             }
 
             bool is_valid = do_verification ? pass : true;
-
-            if(tflops > best_tflops && is_valid)
+            if (is_valid)
             {
-                best_op_name    = op_name;
-                best_tflops     = tflops;
-                best_avg_time   = avg_time;
-                best_gb_per_sec = gb_per_sec;
+                constexpr int n_warmup = 5;
+                constexpr int n_repeat = 50;
+                float avg_time = op->Run(args, time_kernel, n_warmup, n_repeat);
+
+                std::size_t flop      = conv_param.GetFlops();
+                std::size_t num_btype = conv_param.GetByte<InDataType, WeiDataType, OutDataType>();
+
+                float tflops     = static_cast<float>(flop) / 1.E9 / avg_time;
+                float gb_per_sec = num_btype / 1.E6 / avg_time;
+
+                std::cout << "Perf: " << std::setw(10) << avg_time << " ms, " << tflops
+                            << " TFlops, " << gb_per_sec << " GB/s, " << op_name << std::endl;
+
+                if(tflops > best_tflops)
+                {
+                    best_op_name    = op_name;
+                    best_tflops     = tflops;
+                    best_avg_time   = avg_time;
+                    best_gb_per_sec = gb_per_sec;
+                }
             }
         }
         else