Fix validation.

2026-07-01 04:07:56 +00:00 · 2025-10-17 13:07:06 +00:00
parent 6789c219c1
commit 7722f901df
6 changed files with 80 additions and 64 deletions
--- a/library/include/ck_tile/library/tensor_operation_instance/gpu/tile_grouped_conv_bwd_weight_bf16_instances.hpp
+++ b/library/include/ck_tile/library/tensor_operation_instance/gpu/tile_grouped_conv_bwd_weight_bf16_instances.hpp
@@ -49,7 +49,7 @@ using tile_grouped_conv_bwd_weight_bf16_instances = std::tuple<
    // GroupedConvolutionBackwardWeightInvoker<NDimSpatial,   ALayout,   BLayout,   ELayout,   BF16,    BF16,    BF16,  PassThrough, PassThrough, PassThrough,        1,      16,      16,      64,      1,      1,      1,     16,     16,     32,      4,     4,       2,    true,  CK_TILE_PIPELINE_COMPUTE_V4>,
    // GroupedConvolutionBackwardWeightInvoker<NDimSpatial,   ALayout,   BLayout,   ELayout,   BF16,    BF16,    BF16,  PassThrough, PassThrough, PassThrough,        2,      16,      16,      64,      1,      1,      1,     16,     16,     32,      4,     4,       2,    true,  CK_TILE_PIPELINE_COMPUTE_V4>,
    // GroupedConvolutionBackwardWeightInvoker<NDimSpatial,   ALayout,   BLayout,   ELayout,   BF16,    BF16,    BF16,  PassThrough, PassThrough, PassThrough,        1,      16,      16,      64,      1,      1,      1,     16,     16,     32,      4,     4,       4,    true,  CK_TILE_PIPELINE_COMPUTE_V4>,
-    GroupedConvolutionBackwardWeightInvoker<NDimSpatial,   ALayout,   BLayout,   ELayout,   BF16,    BF16,    BF16,  PassThrough, PassThrough, PassThrough,        2,      16,      16,      64,      1,      1,      1,     16,     16,     32,      4,     4,       4,    true,  CK_TILE_PIPELINE_COMPUTE_V4>
+    GroupedConvolutionBackwardWeightInvoker<NDimSpatial,   ALayout,   BLayout,   ELayout,   BF16,    BF16,    BF16,  PassThrough, PassThrough, PassThrough,        2,      16,      16,      64,      1,      1,      1,     16,     16,     32,      2,     2,       2,    true,  CK_TILE_PIPELINE_COMPUTE_V4>
    // clang-format on
 >;

--- a/library/include/ck_tile/library/tensor_operation_instance/gpu/tile_grouped_conv_bwd_weight_invoker.hpp
+++ b/library/include/ck_tile/library/tensor_operation_instance/gpu/tile_grouped_conv_bwd_weight_invoker.hpp
@@ -32,7 +32,7 @@ template <ck_tile::index_t NDimSpatial,
 struct GroupedConvolutionBackwardWeightBaseInvoker
 {
    virtual bool IsSupportedArgument(const ck_tile::GroupedConvBwdWeightHostArgs& args) const = 0; 
-    virtual float Run(const ck_tile::GroupedConvBwdWeightHostArgs& args, bool time_kernel) const = 0;
+    virtual float Run(const ck_tile::GroupedConvBwdWeightHostArgs& args, bool time_kernel, int n_warmup, int n_repeat) const = 0;
    virtual std::string GetName(const ck_tile::GroupedConvBwdWeightHostArgs& args) const = 0;
    GroupedConvolutionBackwardWeightBaseInvoker() = default;
    GroupedConvolutionBackwardWeightBaseInvoker(const GroupedConvolutionBackwardWeightBaseInvoker&) = default;
@@ -197,7 +197,7 @@ struct GroupedConvolutionBackwardWeightInvoker :
        return Kernel::IsSupportedArgument(args);
    };

-    float Run(const ck_tile::GroupedConvBwdWeightHostArgs& args, bool time_kernel) const override
+    float Run(const ck_tile::GroupedConvBwdWeightHostArgs& args, bool time_kernel, int n_warmup=5, int n_repeat=50) const override
    {
        const ck_tile::index_t gemm_k =
            args.N_ * std::accumulate(args.output_spatial_lengths_.begin(),
@@ -226,8 +226,6 @@ struct GroupedConvolutionBackwardWeightInvoker :
            const dim3 grids  = Kernel::GridSize(args);
            const dim3 blocks = Kernel::BlockSize();
                   
-            constexpr int n_warmup = 5;
-            constexpr int n_repeat = 50;
            ck_tile::stream_config s {nullptr, time_kernel, 1, n_warmup, n_repeat};

            ave_time = ck_tile::launch_kernel(
--- a/library/include/ck_tile/library/tensor_operation_instance/gpu/tile_grouped_conv_fwd_bf16_instances.hpp
+++ b/library/include/ck_tile/library/tensor_operation_instance/gpu/tile_grouped_conv_fwd_bf16_instances.hpp
@@ -19,10 +19,10 @@ using tile_grouped_conv_fwd_bf16_instances = std::tuple<
    //##############################|       Num|  InLayout| WeiLayout| OutLayout| InData| WeiData| OutData|           In|         Wei|         Out|  K-block|  M-tile| N-tile | K-tile | M-warp| N-warp| K-warp| M-warp| N-warp| K-warp| Vector| Vector| Vector|  Double|                         GEMM|
    //##############################|       Dim|          |          |          |   Type|    Type|    Type|  Elementwise| Elementwise| Elementwise|      per|        |        |        |       |       |       |   tile|   tile|   tile|   size|   size|   size|    smem|                     pipeline|
    //##############################|   Spatial|          |          |          |       |        |        |    Operation|   Operation|   Operation|       CU|        |        |        |       |       |       |   size|   size|   size|      A|      B|      C|  buffer|                      version|
-    GroupedConvolutionForwardInvoker<NDimSpatial,   ALayout,   BLayout,   ELayout,   BF16,    BF16,    BF16,  PassThrough, PassThrough, PassThrough,        2,      64,      64,      64,      2,      2,      1,     32,     32,     16,      2,     4,       2,   false,  CK_TILE_PIPELINE_COMPUTE_V3>
-    // GroupedConvolutionForwardInvoker<NDimSpatial,   ALayout,   BLayout,   ELayout,   BF16,    BF16,    BF16,  PassThrough, PassThrough, PassThrough,        2,      64,      64,      64,      2,      2,      1,     32,     32,      8,      2,     4,       2,   false,  CK_TILE_PIPELINE_COMPUTE_V3>,
-    // GroupedConvolutionForwardInvoker<NDimSpatial,   ALayout,   BLayout,   ELayout,   BF16,    BF16,    BF16,  PassThrough, PassThrough, PassThrough,        2,      64,      64,      64,      2,      2,      1,     16,     16,     32,      2,     4,       2,   false,  CK_TILE_PIPELINE_COMPUTE_V3>,
-    // GroupedConvolutionForwardInvoker<NDimSpatial,   ALayout,   BLayout,   ELayout,   BF16,    BF16,    BF16,  PassThrough, PassThrough, PassThrough,        2,      64,      64,      64,      2,      2,      1,     16,     16,     32,      4,     4,       4,   false,  CK_TILE_PIPELINE_COMPUTE_V3>,
+    GroupedConvolutionForwardInvoker<NDimSpatial,   ALayout,   BLayout,   ELayout,   BF16,    BF16,    BF16,  PassThrough, PassThrough, PassThrough,        2,      64,      64,      64,      2,      2,      1,     32,     32,     16,      2,     4,       2,   false,  CK_TILE_PIPELINE_COMPUTE_V3>,
+    GroupedConvolutionForwardInvoker<NDimSpatial,   ALayout,   BLayout,   ELayout,   BF16,    BF16,    BF16,  PassThrough, PassThrough, PassThrough,        2,      64,      64,      64,      2,      2,      1,     32,     32,      8,      2,     4,       2,   false,  CK_TILE_PIPELINE_COMPUTE_V3>,
+    GroupedConvolutionForwardInvoker<NDimSpatial,   ALayout,   BLayout,   ELayout,   BF16,    BF16,    BF16,  PassThrough, PassThrough, PassThrough,        2,      64,      64,      64,      2,      2,      1,     16,     16,     32,      2,     4,       2,   false,  CK_TILE_PIPELINE_COMPUTE_V3>,
+    GroupedConvolutionForwardInvoker<NDimSpatial,   ALayout,   BLayout,   ELayout,   BF16,    BF16,    BF16,  PassThrough, PassThrough, PassThrough,        2,      64,      64,      64,      2,      2,      1,     16,     16,     32,      4,     4,       4,   false,  CK_TILE_PIPELINE_COMPUTE_V3>
    // GroupedConvolutionForwardInvoker<NDimSpatial,   ALayout,   BLayout,   ELayout,   BF16,    BF16,    BF16,  PassThrough, PassThrough, PassThrough,        2,      64,      64,      64,      2,      2,      1,     16,     16,     32,      8,     8,       8,   false,  CK_TILE_PIPELINE_COMPUTE_V3>,
    // GroupedConvolutionForwardInvoker<NDimSpatial,   ALayout,   BLayout,   ELayout,   BF16,    BF16,    BF16,  PassThrough, PassThrough, PassThrough,        1,      64,      64,      64,      1,      1,      1,     32,     32,      8,      2,     2,       2,   false,  CK_TILE_PIPELINE_COMPUTE_V3>,
    // GroupedConvolutionForwardInvoker<NDimSpatial,   ALayout,   BLayout,   ELayout,   BF16,    BF16,    BF16,  PassThrough, PassThrough, PassThrough,        1,      64,      64,      64,      1,      1,      1,     32,     32,     16,      2,     2,       2,   false,  CK_TILE_PIPELINE_COMPUTE_V3>,
--- a/library/include/ck_tile/library/tensor_operation_instance/gpu/tile_grouped_conv_fwd_invoker.hpp
+++ b/library/include/ck_tile/library/tensor_operation_instance/gpu/tile_grouped_conv_fwd_invoker.hpp
@@ -32,7 +32,7 @@ template <ck_tile::index_t NDimSpatial,
 struct GroupedConvolutionForwardBaseInvoker
 {
    virtual bool IsSupportedArgument(const ck_tile::GroupedConvFwdHostArgs& args) const = 0; 
-    virtual float Run(const ck_tile::GroupedConvFwdHostArgs& args, bool time_kernel) const = 0;
+    virtual float Run(const ck_tile::GroupedConvFwdHostArgs& args, bool time_kernel, int n_warmup, int n_repeat) const = 0;
    virtual std::string GetName(const ck_tile::GroupedConvFwdHostArgs& args) const = 0;
    GroupedConvolutionForwardBaseInvoker() = default;
    GroupedConvolutionForwardBaseInvoker(const GroupedConvolutionForwardBaseInvoker&) = default;
@@ -198,7 +198,7 @@ struct GroupedConvolutionForwardInvoker :
        return Kernel::IsSupportedArgument(args);
    };

-    float Run(const ck_tile::GroupedConvFwdHostArgs& args, bool time_kernel) const override
+    float Run(const ck_tile::GroupedConvFwdHostArgs& args, bool time_kernel, int n_warmup=5, int n_repeat=50) const override
    {
        const ck_tile::index_t gemm_k =
            args.C_ * std::accumulate(args.filter_spatial_lengths_.begin(),
@@ -227,8 +227,6 @@ struct GroupedConvolutionForwardInvoker :
            const dim3 grids  = Kernel::GridSize(args);
            const dim3 blocks = Kernel::BlockSize();
                   
-            constexpr int n_warmup = 5;
-            constexpr int n_repeat = 50;
            ck_tile::stream_config s {nullptr, time_kernel, 1, n_warmup, n_repeat};

            ave_time = ck_tile::launch_kernel(
--- a/profiler/ck_tile/include/tile_profile_grouped_conv_bwd_weight_impl.hpp
+++ b/profiler/ck_tile/include/tile_profile_grouped_conv_bwd_weight_impl.hpp
@@ -97,14 +97,6 @@ bool profile_grouped_conv_bwd_weight_impl(int do_verification,
        output.SetZero();
    }

-    ck_tile::DeviceMem input_dev_buf(input.get_element_space_size_in_bytes());
-    ck_tile::DeviceMem weight_dev_buf(weight.get_element_space_size_in_bytes());
-    ck_tile::DeviceMem output_dev_buf(output.get_element_space_size_in_bytes());
-
-    input_dev_buf.ToDevice(input.data());
-    weight_dev_buf.SetZero();
-    output_dev_buf.ToDevice(output.data());
-
    using DeviceOp = ops::GroupedConvolutionBackwardWeightBaseInvoker<
                                                        NDimSpatial,
                                                        InLayout,
@@ -154,6 +146,14 @@ bool profile_grouped_conv_bwd_weight_impl(int do_verification,
            auto split_k_value     = split_k_list[split_k_id];
            auto split_k_param_str = std::to_string(split_k_value);

+            ck_tile::DeviceMem input_dev_buf(input.get_element_space_size_in_bytes());
+            ck_tile::DeviceMem weight_dev_buf(weight.get_element_space_size_in_bytes());
+            ck_tile::DeviceMem output_dev_buf(output.get_element_space_size_in_bytes());
+
+            input_dev_buf.ToDevice(input.data());
+            weight_dev_buf.SetZero();
+            output_dev_buf.ToDevice(output.data());
+
            ck_tile::GroupedConvBwdWeightHostArgs args(conv_param,
                                               input_dev_buf.GetDeviceBuffer(),
                                               weight_dev_buf.GetDeviceBuffer(),
@@ -174,22 +174,16 @@ bool profile_grouped_conv_bwd_weight_impl(int do_verification,
                std::string op_name = op->GetName(args);
                std::cout << op_name << ", SplitK " << split_k_param_str << " is profiled..." << std::endl;

-                float avg_time = op->Run(args, time_kernel);
-
-                std::size_t flop      = conv_param.GetFlops();
-                std::size_t num_btype = conv_param.GetByte<InDataType, WeiDataType, OutDataType>();
-
-                float tflops     = static_cast<float>(flop) / 1.E9 / avg_time;
-                float gb_per_sec = num_btype / 1.E6 / avg_time;
-
-                std::cout << "Perf: " << std::setw(10) << avg_time << " ms, " << tflops
-                          << " TFlops, " << gb_per_sec << " GB/s, " << op_name << ", SplitK "
-                          << split_k_param_str << std::endl;
-
+                // Run verification first. If it doesn't pass, no need to do performance measurement.
                bool pass = false; 
                if(do_verification)
                {
+                    constexpr int n_warmup = 0;
+                    constexpr int n_repeat = 1;
+
+                    op->Run(args, false, n_warmup, n_repeat);
                    weight_dev_buf.FromDevice(weight.data());
+
                    ck_tile::HostTensor<WeiDataType> weight_host_ref(wei_g_k_c_xs_desc);
                    weight_host_ref.SetZero();

@@ -223,13 +217,30 @@ bool profile_grouped_conv_bwd_weight_impl(int do_verification,

                bool is_valid = do_verification ? pass : true;

-                if(tflops > best_tflops && is_valid)
+                if (is_valid)
                {
-                    best_op_name    = op_name;
-                    best_tflops     = tflops;
-                    best_avg_time   = avg_time;
-                    best_gb_per_sec = gb_per_sec;
-                    best_split_k    = split_k_param_str;
+                    constexpr int n_warmup = 5;
+                    constexpr int n_repeat = 50;
+                    float avg_time = op->Run(args, time_kernel, n_warmup, n_repeat);
+
+                    std::size_t flop      = conv_param.GetFlops();
+                    std::size_t num_btype = conv_param.GetByte<InDataType, WeiDataType, OutDataType>();
+
+                    float tflops     = static_cast<float>(flop) / 1.E9 / avg_time;
+                    float gb_per_sec = num_btype / 1.E6 / avg_time;
+
+                    std::cout << "Perf: " << std::setw(10) << avg_time << " ms, " << tflops
+                            << " TFlops, " << gb_per_sec << " GB/s, " << op_name << ", SplitK "
+                            << split_k_param_str << std::endl;
+                    
+                    if(tflops > best_tflops)
+                    {
+                        best_op_name    = op_name;
+                        best_tflops     = tflops;
+                        best_avg_time   = avg_time;
+                        best_gb_per_sec = gb_per_sec;
+                        best_split_k    = split_k_param_str;
+                    }
                }
            }
            else 
--- a/profiler/ck_tile/include/tile_profile_grouped_conv_fwd_impl.hpp
+++ b/profiler/ck_tile/include/tile_profile_grouped_conv_fwd_impl.hpp
@@ -100,14 +100,6 @@ bool profile_grouped_conv_fwd_impl(int do_verification,
        weight.SetZero();
    }

-    ck_tile::DeviceMem input_dev_buf(input.get_element_space_size_in_bytes());
-    ck_tile::DeviceMem weight_dev_buf(weight.get_element_space_size_in_bytes());
-    ck_tile::DeviceMem output_dev_buf(output.get_element_space_size_in_bytes());
-
-    input_dev_buf.ToDevice(input.data());
-    weight_dev_buf.ToDevice(output.data());
-    output_dev_buf.SetZero();
-
    using DeviceOp = ops::GroupedConvolutionForwardBaseInvoker<
                                                        NDimSpatial,
                                                        InLayout,
@@ -136,6 +128,14 @@ bool profile_grouped_conv_fwd_impl(int do_verification,
    bool all_pass = true;
    for(auto& op : ops)
    {
+        ck_tile::DeviceMem input_dev_buf(input.get_element_space_size_in_bytes());
+        ck_tile::DeviceMem weight_dev_buf(weight.get_element_space_size_in_bytes());
+        ck_tile::DeviceMem output_dev_buf(output.get_element_space_size_in_bytes());
+
+        input_dev_buf.ToDevice(input.data());
+        weight_dev_buf.ToDevice(output.data());
+        output_dev_buf.SetZero();
+
        ck_tile::GroupedConvFwdHostArgs args(conv_param,
                                            input_dev_buf.GetDeviceBuffer(),
                                            weight_dev_buf.GetDeviceBuffer(),
@@ -155,20 +155,14 @@ bool profile_grouped_conv_fwd_impl(int do_verification,
            std::string op_name = op->GetName(args);
            std::cout << op_name << " is profiled..." << std::endl;

-            float avg_time = op->Run(args, time_kernel);
-
-            std::size_t flop      = conv_param.GetFlops();
-            std::size_t num_btype = conv_param.GetByte<InDataType, WeiDataType, OutDataType>();
-
-            float tflops     = static_cast<float>(flop) / 1.E9 / avg_time;
-            float gb_per_sec = num_btype / 1.E6 / avg_time;
-
-            std::cout << "Perf: " << std::setw(10) << avg_time << " ms, " << tflops
-                        << " TFlops, " << gb_per_sec << " GB/s, " << op_name << std::endl;
-
+            // Run verification first. If it doesn't pass, no need to do performance measurement.
            bool pass = false; 
            if(do_verification)
            {
+                constexpr int n_warmup = 0;
+                constexpr int n_repeat = 1;
+
+                op->Run(args, false, n_warmup, n_repeat);
                output_dev_buf.FromDevice(output.data());

                ck_tile::HostTensor<OutDataType> output_host_ref(out_g_n_k_wos_desc);
@@ -202,13 +196,28 @@ bool profile_grouped_conv_fwd_impl(int do_verification,
            }

            bool is_valid = do_verification ? pass : true;
-
-            if(tflops > best_tflops && is_valid)
+            if (is_valid)
            {
-                best_op_name    = op_name;
-                best_tflops     = tflops;
-                best_avg_time   = avg_time;
-                best_gb_per_sec = gb_per_sec;
+                constexpr int n_warmup = 5;
+                constexpr int n_repeat = 50;
+                float avg_time = op->Run(args, time_kernel, n_warmup, n_repeat);
+
+                std::size_t flop      = conv_param.GetFlops();
+                std::size_t num_btype = conv_param.GetByte<InDataType, WeiDataType, OutDataType>();
+
+                float tflops     = static_cast<float>(flop) / 1.E9 / avg_time;
+                float gb_per_sec = num_btype / 1.E6 / avg_time;
+
+                std::cout << "Perf: " << std::setw(10) << avg_time << " ms, " << tflops
+                            << " TFlops, " << gb_per_sec << " GB/s, " << op_name << std::endl;
+
+                if(tflops > best_tflops)
+                {
+                    best_op_name    = op_name;
+                    best_tflops     = tflops;
+                    best_avg_time   = avg_time;
+                    best_gb_per_sec = gb_per_sec;
+                }
            }
        }
        else