[CK][CK Tile] Grouped Convolution backward weight profiler flush cache (#5454)

## Motivation

Flush the GPU caches during profiling to get more stable timing results for both the legacy CK and the CK Tile grouped convolution backward-weight paths.

## Technical Details

Flush the caches before each timed kernel call, and add one extra warm-up run before the first profiled instance so that one-time startup costs do not inflate its measured time.
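
The stabilization pattern that the new shared helper `launch_and_time_kernel_with_preprocess_flush_cache` factors out is visible in the open-coded blocks this change deletes (see the `DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle` hunk below). A commented sketch of that pattern, with names taken from the removed code and the helper's internals assumed to match:

```cpp
// Work on a copy: RotatingMemWrapper advances the pointers inside the argument.
typename GridwiseGemm::Argument gemm_arg_ = gemm_arg;

// Rotate between stream_config.rotating_count copies of the A/B tensors so
// each timed launch reads data that is not resident in the caches.
ck::utility::RotatingMemWrapper<typename GridwiseGemm::Argument> rotating_mem(
    gemm_arg_,
    stream_config.rotating_count,
    gemm_arg_.M * gemm_arg_.K * sizeof(ADataType),  // bytes of A
    gemm_arg_.K * gemm_arg_.N * sizeof(BDataType)); // bytes of B

auto run_flush_cache = [&]() {
    ck::utility::flush_icache(); // evict the kernel's instructions
    rotating_mem.Next();         // switch to a cold copy of the inputs
    clear_workspace();           // reset the split-K accumulation workspace
};

// The preprocess lambda runs before every timed iteration, so each
// measurement starts from cold instruction and data caches.
ave_time += ck::utility::launch_and_time_kernel_with_preprocess<false>(
    stream_config, run_flush_cache, kernel, dim3(gdx, gdy, gdz), dim3(BlockSize), 0, gemm_arg_);
```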

## Test Plan

test_grouped_conv_bwd_weight_tile

## Test Result

pass

## Submission Checklist

- [x] Look over the contributing guidelines at
https://github.com/ROCm/ROCm/blob/develop/CONTRIBUTING.md#pull-requests.

AICK-966

---------

Co-authored-by: Copilot Autofix powered by AI <175728472+Copilot@users.noreply.github.com>
Author: Bartłomiej Kocot
Committed: 2026-03-16 18:46:21 +01:00 (by GitHub)
Commit: 1e1f3647f7 (parent: 4f091cacd0)
8 changed files with 162 additions and 80 deletions


```diff
@@ -175,19 +175,40 @@ template <auto SIGNATURE, typename InDataType, typename WeiDataType, typename Ou
     constexpr index_t minimum_occupancy =
         Conv::GemmPipeline::Scheduler == ck_tile::GemmPipelineScheduler::Intrawave ? 1 : 2;

-    return RunResult::from_runtime(ck_tile::launch_kernel_time_mask(
-        s_conf,
-        preprocess,
-        ck_tile::make_kernel<minimum_occupancy>(conv, grids, blocks, 0, kargs),
-        ck_tile::make_kernel<minimum_occupancy>(elementwise_op,
-                                                kGridSize,
-                                                kBlockSize,
-                                                0,
-                                                input_size,
-                                                ck_tile::make_tuple(shape[1], 1), // Input Stride
-                                                ck_tile::make_tuple(shape[1], 1), // Output Stride
-                                                input_tensors,
-                                                static_cast<CDataType*>(c_ptr))));
+    if(s_conf.flush_cache_)
+    {
+        return RunResult::from_runtime(ck_tile::launch_kernel_time_mask_flush_cache(
+            s_conf,
+            preprocess,
+            ck_tile::make_kernel<minimum_occupancy>(conv, grids, blocks, 0, kargs),
+            ck_tile::make_kernel<minimum_occupancy>(
+                elementwise_op,
+                kGridSize,
+                kBlockSize,
+                0,
+                input_size,
+                ck_tile::make_tuple(shape[1], 1), // Input Stride
+                ck_tile::make_tuple(shape[1], 1), // Output Stride
+                input_tensors,
+                static_cast<CDataType*>(c_ptr))));
+    }
+    else
+    {
+        return RunResult::from_runtime(ck_tile::launch_kernel_time_mask(
+            s_conf,
+            preprocess,
+            ck_tile::make_kernel<minimum_occupancy>(conv, grids, blocks, 0, kargs),
+            ck_tile::make_kernel<minimum_occupancy>(
+                elementwise_op,
+                kGridSize,
+                kBlockSize,
+                0,
+                input_size,
+                ck_tile::make_tuple(shape[1], 1), // Input Stride
+                ck_tile::make_tuple(shape[1], 1), // Output Stride
+                input_tensors,
+                static_cast<CDataType*>(c_ptr))));
+    }
 }
 } // namespace detail
```

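On the CK Tile side the change is a dispatch on `s_conf.flush_cache_`: when cache flushing is requested the launch goes through `ck_tile::launch_kernel_time_mask_flush_cache`, otherwise through the original `ck_tile::launch_kernel_time_mask`; the kernels and their arguments are identical in both branches.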

```diff
@@ -479,7 +479,7 @@ struct DeviceBatchedGemmMultiD_Xdl_CShuffle_V3
         const bool has_main_k_block_loop = GridwiseGemm::CalculateHasMainKBlockLoop(K_split);

         const auto Run = [&](const auto& kernel) {
-            if(stream_config.flush_cache)
+            if(stream_config.flush_cache && stream_config.rotating_count > 1)
             {
                 std::array<std::size_t, NumDTensor> DsSize;
@@ -534,6 +534,27 @@ struct DeviceBatchedGemmMultiD_Xdl_CShuffle_V3
                     0,
                     arg_);
             }
+            else if(stream_config.flush_cache)
+            {
+                const auto clear_workspace = [&]() {
+                    if(arg.KBatch > 1)
+                        hipGetErrorString(
+                            hipMemsetAsync(arg.p_c_grid,
+                                           0,
+                                           arg.Batch * arg.M * arg.N * sizeof(CDataType),
+                                           stream_config.stream_id_));
+                };
+
+                BatchGemmArgument arg_ = reinterpret_cast<const BatchGemmArgument&>(arg);
+                ave_time =
+                    launch_and_time_kernel_with_preprocess_flush_cache(stream_config,
+                                                                       clear_workspace,
+                                                                       kernel,
+                                                                       dim3(gdx, gdy, gdz),
+                                                                       dim3(BlockSize),
+                                                                       0,
+                                                                       arg_);
+            }
             else
             {
                 const auto clear_workspace = [&]() {
```

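The tightened guard matters: the pre-existing rotating-memory branch only makes sense when `stream_config.rotating_count > 1`, since a single buffer cannot be rotated to present cold inputs. With `flush_cache` set but no extra buffers, the new `else if` branch still routes through the flush-cache launcher, with only the workspace clear as preprocessing.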

```diff
@@ -1031,30 +1031,14 @@ struct DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle
             const auto Run = [&](const auto& kernel) {
                 if(stream_config.flush_cache)
                 {
-                    typename GridwiseGemm::Argument gemm_arg_ = gemm_arg;
-                    ck::utility::RotatingMemWrapper<typename GridwiseGemm::Argument> rotating_mem(
-                        gemm_arg_,
-                        stream_config.rotating_count,
-                        gemm_arg_.M * gemm_arg_.K * sizeof(ADataType),
-                        gemm_arg_.K * gemm_arg_.N * sizeof(BDataType));
-                    rotating_mem.Print();
-
-                    auto run_flush_cache = [&]() {
-                        // flush icache
-                        ck::utility::flush_icache();
-                        // rotating mem
-                        rotating_mem.Next();
-                        clear_workspace();
-                    };
-
-                    ave_time += ck::utility::launch_and_time_kernel_with_preprocess<false>(
+                    ave_time += launch_and_time_kernel_with_preprocess_flush_cache(
                         stream_config,
-                        run_flush_cache,
+                        clear_workspace,
                         kernel,
                         dim3(gdx, gdy, gdz),
                         dim3(BlockSize),
                         0,
-                        gemm_arg_,
+                        gemm_arg,
                         arg.a_grid_desc_k0_m_k1_,
                         arg.b_grid_desc_k0_n_k1_,
                         arg.c_grid_desc_mblock_mperblock_nblock_nperblock_,
```

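This hunk shows the refactor at its clearest: the open-coded boilerplate (argument copy, `RotatingMemWrapper` setup, and the `flush_icache()` / `rotating_mem.Next()` preprocess lambda) is deleted, and the call site now passes the original `gemm_arg` plus the plain `clear_workspace` callback to the shared `launch_and_time_kernel_with_preprocess_flush_cache` helper, which presumably performs the same steps internally.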

```diff
@@ -998,30 +998,58 @@ struct DeviceGroupedConvBwdWeight_Xdl_CShuffle
                     hip_check_error(hipMemsetAsync(
                         p_e_grid, 0, arg.c_space_size_bytes, stream_config.stream_id_));
                 };

-                avg_time += launch_and_time_kernel_with_preprocess(
-                    stream_config,
-                    clear_workspace,
-                    kernel,
-                    dim3(grid_size),
-                    dim3(BlockSize),
-                    0,
-                    p_a_grid,
-                    p_b_grid,
-                    p_e_grid,
-                    arg.a_element_op_,
-                    arg.b_element_op_,
-                    arg.c_element_op_,
-                    arg.Conv_G_,
-                    arg.a_grid_desc_kbatch_k0_m_k1_,
-                    arg.b_grid_desc_kbatch_k0_n_k1_,
-                    c_grid_desc_mblock_mperblock_nblock_nperblock,
-                    arg.block_2_ctile_map_,
-                    arg.compute_ptr_offset_of_batch_,
-                    arg.split_k_stride_a_,
-                    arg.split_k_stride_b_,
-                    arg.split_k_offset_hack_,
-                    arg.k_batch_);
+                if(stream_config.flush_cache)
+                {
+                    avg_time += launch_and_time_kernel_with_preprocess_flush_cache(
+                        stream_config,
+                        clear_workspace,
+                        kernel,
+                        dim3(grid_size),
+                        dim3(BlockSize),
+                        0,
+                        p_a_grid,
+                        p_b_grid,
+                        p_e_grid,
+                        arg.a_element_op_,
+                        arg.b_element_op_,
+                        arg.c_element_op_,
+                        arg.Conv_G_,
+                        arg.a_grid_desc_kbatch_k0_m_k1_,
+                        arg.b_grid_desc_kbatch_k0_n_k1_,
+                        c_grid_desc_mblock_mperblock_nblock_nperblock,
+                        arg.block_2_ctile_map_,
+                        arg.compute_ptr_offset_of_batch_,
+                        arg.split_k_stride_a_,
+                        arg.split_k_stride_b_,
+                        arg.split_k_offset_hack_,
+                        arg.k_batch_);
+                }
+                else
+                {
+                    avg_time += launch_and_time_kernel_with_preprocess(
+                        stream_config,
+                        clear_workspace,
+                        kernel,
+                        dim3(grid_size),
+                        dim3(BlockSize),
+                        0,
+                        p_a_grid,
+                        p_b_grid,
+                        p_e_grid,
+                        arg.a_element_op_,
+                        arg.b_element_op_,
+                        arg.c_element_op_,
+                        arg.Conv_G_,
+                        arg.a_grid_desc_kbatch_k0_m_k1_,
+                        arg.b_grid_desc_kbatch_k0_n_k1_,
+                        c_grid_desc_mblock_mperblock_nblock_nperblock,
+                        arg.block_2_ctile_map_,
+                        arg.compute_ptr_offset_of_batch_,
+                        arg.split_k_stride_a_,
+                        arg.split_k_stride_b_,
+                        arg.split_k_offset_hack_,
+                        arg.k_batch_);
+                }
             };

             if(has_main_k0_block_loop)
```

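Unlike the two-stage variant above, this device op keeps both launchers at the call site and selects between them with an explicit `if`/`else` on `stream_config.flush_cache`, at the cost of duplicating the argument list.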

```diff
@@ -805,29 +805,14 @@ struct DeviceGroupedConvBwdWeight_Xdl_CShuffleV3
         const auto Run = [&](const auto& kernel) {
             if(stream_config.flush_cache)
             {
-                typename GridwiseGemm::Argument gemm_arg_ = gemm_arg;
-                ck::utility::RotatingMemWrapper<typename GridwiseGemm::Argument> rotating_mem(
-                    gemm_arg_,
-                    stream_config.rotating_count,
-                    gemm_arg_.M * gemm_arg_.K * sizeof(ADataType),
-                    gemm_arg_.K * gemm_arg_.N * sizeof(BDataType));
-                rotating_mem.Print();
-
-                auto run_flush_cache = [&]() {
-                    // flush icache
-                    ck::utility::flush_icache();
-                    // rotating mem
-                    rotating_mem.Next();
-                    clear_workspace();
-                };
-
-                ave_time += ck::utility::launch_and_time_kernel_with_preprocess<false>(
+                ave_time += launch_and_time_kernel_with_preprocess_flush_cache(
                     stream_config,
-                    run_flush_cache,
+                    clear_workspace,
                     kernel,
                     dim3(gdx, gdy, gdz),
                     dim3(BlockSize),
                     0,
-                    gemm_arg_,
+                    gemm_arg,
                     arg.a_grid_desc_k0_m_k1_,
                     arg.b_grid_desc_k0_n_k1_,
                     arg.c_grid_desc_mblock_mperblock_nblock_nperblock_,
```


```diff
@@ -114,7 +114,8 @@ run_grouped_conv_backward_weight_tile_algs(const ckt::Args<SIGNATURE>& args,
                                            const ckt::Outputs<SIGNATURE>& outputs,
                                            const ck_tile::stream_config& s_conf)
 {
-    float best_avg_time = std::numeric_limits<float>::max();
+    bool dummy_run_executed = false;
+    float best_avg_time     = std::numeric_limits<float>::max();
     std::string best_op_name, op_name;
     int best_split_k;
     bool is_supported;
@@ -154,6 +155,13 @@ run_grouped_conv_backward_weight_tile_algs(const ckt::Args<SIGNATURE>& args,
         {
             ckt::Args<SIGNATURE> args_k_batch = args;
             args_k_batch.k_batch              = k_batch;
+            if((s_conf.time_kernel_ || s_conf.flush_cache_) && !dummy_run_executed)
+            {
+                // Run first instance twice when profiling to stabilize timing
+                std::tie(is_supported, avg_time, op_name) =
+                    run_alg_func(args_k_batch, inputs, outputs, s_conf);
+                dummy_run_executed = true;
+            }
             std::tie(is_supported, avg_time, op_name) =
                 run_alg_func(args_k_batch, inputs, outputs, s_conf);
             if(is_supported)
```

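The motivation for the warm-up is that the first profiled instance otherwise absorbs one-time costs (code-object loading, initial cache and clock state) and reports an inflated time, which can bias the best-configuration search; running it once beforehand and discarding the result removes that bias.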

```diff
@@ -272,7 +272,8 @@ bool profile_grouped_conv_bwd_weight_impl(int do_verification,
     index_t valid_instances = 0;

     // profile device Conv instances
-    bool all_pass = true;
+    bool all_pass           = true;
+    bool dummy_run_executed = false;

     std::array<ck::index_t, NDimSpatial + 3> input_lengths{};
     std::array<ck::index_t, NDimSpatial + 3> filter_lengths{};
@@ -400,8 +401,25 @@
             auto invoker_ptr = op_ptr->MakeInvokerPointer();

-            float avg_time =
-                invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, time_kernel});
+            if(time_kernel && !dummy_run_executed)
+            {
+                // Run first instance as dummy to get proper time from the first instance
+                invoker_ptr->Run(argument_ptr.get(),
+                                 StreamConfig{nullptr,
+                                              time_kernel,
+                                              0 /*log_level*/,
+                                              5 /*cold_iters*/,
+                                              50 /*nrepeat_*/,
+                                              time_kernel /*flush_cache*/});
+                dummy_run_executed = true;
+            }
+
+            float avg_time = invoker_ptr->Run(argument_ptr.get(),
+                                              StreamConfig{nullptr,
+                                                           time_kernel,
+                                                           0 /*log_level*/,
+                                                           5 /*cold_iters*/,
+                                                           50 /*nrepeat_*/,
+                                                           time_kernel /*flush_cache*/});

             std::size_t flop      = conv_param.GetFlops();
             std::size_t num_btype = conv_param.GetByte<InDataType, WeiDataType, OutDataType>();
```

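The positional `StreamConfig` arguments match what the tile profiler below passes: 5 cold iterations, 50 timed repeats, and cache flushing tied to `time_kernel`, so the legacy CK and CK Tile paths are profiled under comparable conditions.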

```diff
@@ -141,7 +141,8 @@ int call_profiler(const ckt::Args<SIGNATURE>& args, const std::string& split_k,
                                              0 /*log_level*/,
                                              5 /*cold_iters*/,
                                              50 /*nrepeat_*/,
-                                             true /*is_gpu_timer_*/});
+                                             true /*is_gpu_timer_*/,
+                                             time_kernel /*flush_cache*/});
     if(time_kernel)
     {
         std::cout << "\nBest configuration parameters:" << "\n\tname: " << op_name
@@ -208,6 +209,14 @@ int profile_grouped_conv_bwd_weight_tile(int argc, char* argv[])
                 split_k,
                 time_kernel);
         }
+        else if(data_type == ConvDataType::F32_F32_F32)
+        {
+            constexpr auto SIGNATURE = ckp::SIGNATURE_NHWGC_FP32_BWD_WEIGHT;
+            return call_profiler<SIGNATURE>(
+                ckp::parse_conv_args<SIGNATURE>(conv_params_start_idx, argv),
+                split_k,
+                time_kernel);
+        }
     }
     else if(num_dim_spatial == 3)
     {
@@ -227,6 +236,14 @@ int profile_grouped_conv_bwd_weight_tile(int argc, char* argv[])
                 split_k,
                 time_kernel);
         }
+        else if(data_type == ConvDataType::F32_F32_F32)
+        {
+            constexpr auto SIGNATURE = ckp::SIGNATURE_NDHWGC_FP32_BWD_WEIGHT;
+            return call_profiler<SIGNATURE>(
+                ckp::parse_conv_args<SIGNATURE>(conv_params_start_idx, argv),
+                split_k,
+                time_kernel);
+        }
     }
 }
```