[rocm-libraries] ROCm/rocm-libraries#5114 (commit 59b8cb5)

[CK][CK Tile] Improvements for grouped conv fwd tile profiling (#5114)

## Motivation

Improve profiling for grouped convolution forward for better comparison between CK and CK Tile

## Technical Details

- Include preprocessing time for ck tile
- Add flush cache for conv fwd profiler
- Switch configs to builder reflect
- Add KPerXdl deduce
- Add non-grouped ported instances

## Test Plan

test_grouped_convnd_fwd_tile

## Test Result

pass

## Submission Checklist

- [x] Look over the contributing guidelines at https://github.com/ROCm/ROCm/blob/develop/CONTRIBUTING.md#pull-requests.

AICK-786
2026-04-20 06:49:15 +00:00 · 2026-03-11 22:39:20 +00:00
parent c1f2d8166d
commit 2169367735
24 changed files with 2375 additions and 1874 deletions
--- a/include/ck/host_utility/flush_cache.hpp
+++ b/include/ck/host_utility/flush_cache.hpp
@@ -10,6 +10,7 @@

 #include "ck/ck.hpp"
 #include "ck/utility/env.hpp"
+#include "ck/utility/tuple.hpp"
 #include "ck/stream_config.hpp"
 #include "ck/host_utility/hip_check_error.hpp"
 #include "ck/utility/flush_icache.hpp"
--- a/include/ck/host_utility/kernel_launch.hpp
+++ b/include/ck/host_utility/kernel_launch.hpp
@@ -9,6 +9,7 @@
 #include "ck/utility/env.hpp"
 #include "ck/stream_config.hpp"
 #include "ck/host_utility/hip_check_error.hpp"
+#include "ck/host_utility/flush_cache.hpp"

 namespace ck {

@@ -170,6 +171,130 @@ float launch_and_time_kernel_with_preprocess(const StreamConfig& stream_config,
 #endif
 }

+template <typename... Args, typename F, typename PreProcessFunc> // Launches `kernel`; when timing is on, returns the average time of preprocess + kernel with the flush cost removed.
+float launch_and_time_kernel_with_preprocess_flush_cache(const StreamConfig& stream_config, // reads time_kernel_, cold_niters_, nrepeat_, stream_id_
+                                                         PreProcessFunc preprocess, // nullary callable invoked before launches (see the loops below)
+                                                         F kernel, // kernel entry point launched via <<<...>>>
+                                                         dim3 grid_dim, // first launch-configuration argument
+                                                         dim3 block_dim, // second launch-configuration argument
+                                                         std::size_t lds_byte, // third launch-configuration argument (dynamic shared-memory bytes in HIP launch syntax)
+                                                         Args... args) // kernel arguments, taken by value and passed to every launch
+{
+#if CK_TIME_KERNEL // timing support compiled in
+    if(stream_config.time_kernel_) // runtime switch: measure instead of launching once
+    {
+        auto run_flush_cache = [&]() { ck::utility::flush_icache(); }; // flush step; its cost is measured separately and subtracted below
+        if(ck::EnvIsEnabled(CK_ENV(CK_LOGGING))) // diagnostics are printed only when CK_LOGGING is enabled
+        {
+            printf("%s: grid_dim {%u, %u, %u}, block_dim {%u, %u, %u} \n",
+                   __func__,
+                   grid_dim.x,
+                   grid_dim.y,
+                   grid_dim.z,
+                   block_dim.x,
+                   block_dim.y,
+                   block_dim.z);
+
+            printf("Warm up %d times\n", stream_config.cold_niters_);
+        }
+        // Warm up: preprocess runs once, then cold_niters_ untimed launches
+        preprocess();
+        for(int i = 0; i < stream_config.cold_niters_; ++i)
+        {
+            kernel<<<grid_dim, block_dim, lds_byte, stream_config.stream_id_>>>(args...);
+            hip_check_error(hipGetLastError()); // surface launch errors immediately
+        }
+        float total_time = 0, flush_cache_total_time = 0; // filled by hipEventElapsedTime in the two scopes below
+        const int nrepeat = stream_config.nrepeat_; // iteration count shared by both measurement loops
+        // Main timing loop: every iteration is flush + preprocess + kernel, bracketed by start/stop events
+        {
+            if(ck::EnvIsEnabled(CK_ENV(CK_LOGGING)))
+            {
+                printf("Start running %d times...\n", nrepeat);
+            }
+            hipEvent_t start, stop;
+
+            hip_check_error(hipEventCreate(&start));
+            hip_check_error(hipEventCreate(&stop));
+
+            hip_check_error(hipDeviceSynchronize()); // let earlier device work finish before the start event
+            hip_check_error(hipEventRecord(start, stream_config.stream_id_));
+
+            for(int i = 0; i < nrepeat; ++i)
+            {
+                run_flush_cache();
+                preprocess(); // preprocess is inside the timed region, so its cost is part of the result
+                kernel<<<grid_dim, block_dim, lds_byte, stream_config.stream_id_>>>(args...);
+                hip_check_error(hipGetLastError());
+            }
+
+            hip_check_error(hipEventRecord(stop, stream_config.stream_id_));
+            hip_check_error(hipEventSynchronize(stop)); // wait for the stop event before reading the elapsed time
+
+            hip_check_error(hipEventElapsedTime(&total_time, start, stop));
+
+            hip_check_error(hipEventDestroy(start));
+            hip_check_error(hipEventDestroy(stop));
+        }
+        // Flush cache timing loop: same iteration count, flush only, to measure its overhead
+        {
+            if(ck::EnvIsEnabled(CK_ENV(CK_LOGGING)))
+            {
+                printf("Profile flush cache %d times...\n", nrepeat);
+            }
+            hipEvent_t start, stop;
+
+            hip_check_error(hipEventCreate(&start));
+            hip_check_error(hipEventCreate(&stop));
+
+            hip_check_error(hipDeviceSynchronize());
+            hip_check_error(hipEventRecord(start, stream_config.stream_id_));
+
+            for(int i = 0; i < nrepeat; ++i)
+            {
+                run_flush_cache();
+            }
+
+            hip_check_error(hipEventRecord(stop, stream_config.stream_id_));
+            hip_check_error(hipEventSynchronize(stop));
+
+            hip_check_error(hipEventElapsedTime(&flush_cache_total_time, start, stop));
+
+            hip_check_error(hipEventDestroy(start));
+            hip_check_error(hipEventDestroy(stop));
+        }
+        // Exclude flush cache from result; preprocess time stays included. NOTE(review): no guard for nrepeat == 0 in the division below
+        return (total_time - flush_cache_total_time) / nrepeat;
+    }
+    else // timing disabled at runtime: one preprocess, one launch, nothing measured
+    {
+        preprocess();
+        kernel<<<grid_dim, block_dim, lds_byte, stream_config.stream_id_>>>(args...);
+        hip_check_error(hipGetLastError());
+
+        return 0;
+    }
+#else // CK_TIME_KERNEL off. NOTE(review): preprocess() is not called on this path, unlike the runtime else-branch above; confirm this is intended
+    kernel<<<grid_dim, block_dim, lds_byte, stream_config.stream_id_>>>(args...);
+    hip_check_error(hipGetLastError());
+
+    return 0;
+#endif
+}
+
+template <typename... Args, typename F> // Convenience wrapper: flush-cache timing for a kernel that needs no preprocessing step.
+float launch_and_time_kernel_flush_cache(const StreamConfig& stream_config, // passed through unchanged
+                                         F kernel, // kernel entry point to launch
+                                         dim3 grid_dim, // launch configuration, passed through
+                                         dim3 block_dim, // launch configuration, passed through
+                                         std::size_t lds_byte, // launch configuration, passed through
+                                         Args... args) // kernel arguments, passed through
+{
+    auto preprocess = [&]() {}; // no-op stand-in so the preprocess-aware variant can be reused
+    return launch_and_time_kernel_with_preprocess_flush_cache(
+        stream_config, preprocess, kernel, grid_dim, block_dim, lds_byte, args...); // result is returned as-is
+}
+
 } // namespace ck

 #endif
--- a/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_abd_xdl_cshuffle.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_abd_xdl_cshuffle.hpp
@@ -1158,26 +1158,52 @@ struct DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle
                        isMultiB,
                        CTranspose>;

-                    return launch_and_time_kernel(
-                        stream_config,
-                        kernel,
-                        dim3(gdx, gdy, gdz),
-                        dim3(BlockSize),
-                        0,
-                        arg.p_as_grid_,
-                        arg.p_bs_grid_,
-                        arg.p_ds_grid_,
-                        arg.p_e_grid_,
-                        arg.a_element_op_,
-                        arg.b_element_op_,
-                        arg.cde_element_op_,
-                        as_grid_desc_ak0_m_ak1,
-                        bs_grid_desc_bk0_n_bk1,
-                        arg.ds_grid_desc_mblock_mperblock_nblock_nperblock_,
-                        arg.e_grid_desc_mblock_mperblock_nblock_nperblock_,
-                        arg.block_2_etile_map_,
-                        arg.compute_ptr_offset_of_groups_,
-                        arg.compute_ptr_offset_of_n_);
+                    if(stream_config.flush_cache)
+                    {
+                        return launch_and_time_kernel_flush_cache(
+                            stream_config,
+                            kernel,
+                            dim3(gdx, gdy, gdz),
+                            dim3(BlockSize),
+                            0,
+                            arg.p_as_grid_,
+                            arg.p_bs_grid_,
+                            arg.p_ds_grid_,
+                            arg.p_e_grid_,
+                            arg.a_element_op_,
+                            arg.b_element_op_,
+                            arg.cde_element_op_,
+                            as_grid_desc_ak0_m_ak1,
+                            bs_grid_desc_bk0_n_bk1,
+                            arg.ds_grid_desc_mblock_mperblock_nblock_nperblock_,
+                            arg.e_grid_desc_mblock_mperblock_nblock_nperblock_,
+                            arg.block_2_etile_map_,
+                            arg.compute_ptr_offset_of_groups_,
+                            arg.compute_ptr_offset_of_n_);
+                    }
+                    else
+                    {
+                        return launch_and_time_kernel(
+                            stream_config,
+                            kernel,
+                            dim3(gdx, gdy, gdz),
+                            dim3(BlockSize),
+                            0,
+                            arg.p_as_grid_,
+                            arg.p_bs_grid_,
+                            arg.p_ds_grid_,
+                            arg.p_e_grid_,
+                            arg.a_element_op_,
+                            arg.b_element_op_,
+                            arg.cde_element_op_,
+                            as_grid_desc_ak0_m_ak1,
+                            bs_grid_desc_bk0_n_bk1,
+                            arg.ds_grid_desc_mblock_mperblock_nblock_nperblock_,
+                            arg.e_grid_desc_mblock_mperblock_nblock_nperblock_,
+                            arg.block_2_etile_map_,
+                            arg.compute_ptr_offset_of_groups_,
+                            arg.compute_ptr_offset_of_n_);
+                    }
                }
                else
                {
@@ -1230,26 +1256,53 @@ struct DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle
                            isMultiA,
                            isMultiB,
                            CTranspose>;
-                        return launch_and_time_kernel(
-                            stream_config,
-                            kernel,
-                            dim3(gdx, gdy, gdz),
-                            dim3(BlockSize),
-                            0,
-                            p_b_grid,
-                            p_a_grid,
-                            arg.p_ds_grid_,
-                            p_e_grid,
-                            arg.b_element_op_,
-                            arg.a_element_op_,
-                            arg.cde_element_op_,
-                            arg.b_grid_desc_bk0_n_bk1_,
-                            arg.a_grid_desc_ak0_m_ak1_,
-                            arg.ds_grid_desc_mblock_mperblock_nblock_nperblock_,
-                            arg.e_grid_desc_mblock_mperblock_nblock_nperblock_,
-                            arg.block_2_etile_map_,
-                            arg.compute_ptr_offset_of_groups_,
-                            arg.compute_ptr_offset_of_n_);
+
+                        if(stream_config.flush_cache)
+                        {
+                            return launch_and_time_kernel_flush_cache(
+                                stream_config,
+                                kernel,
+                                dim3(gdx, gdy, gdz),
+                                dim3(BlockSize),
+                                0,
+                                p_b_grid,
+                                p_a_grid,
+                                arg.p_ds_grid_,
+                                p_e_grid,
+                                arg.b_element_op_,
+                                arg.a_element_op_,
+                                arg.cde_element_op_,
+                                arg.b_grid_desc_bk0_n_bk1_,
+                                arg.a_grid_desc_ak0_m_ak1_,
+                                arg.ds_grid_desc_mblock_mperblock_nblock_nperblock_,
+                                arg.e_grid_desc_mblock_mperblock_nblock_nperblock_,
+                                arg.block_2_etile_map_,
+                                arg.compute_ptr_offset_of_groups_,
+                                arg.compute_ptr_offset_of_n_);
+                        }
+                        else
+                        {
+                            return launch_and_time_kernel(
+                                stream_config,
+                                kernel,
+                                dim3(gdx, gdy, gdz),
+                                dim3(BlockSize),
+                                0,
+                                p_b_grid,
+                                p_a_grid,
+                                arg.p_ds_grid_,
+                                p_e_grid,
+                                arg.b_element_op_,
+                                arg.a_element_op_,
+                                arg.cde_element_op_,
+                                arg.b_grid_desc_bk0_n_bk1_,
+                                arg.a_grid_desc_ak0_m_ak1_,
+                                arg.ds_grid_desc_mblock_mperblock_nblock_nperblock_,
+                                arg.e_grid_desc_mblock_mperblock_nblock_nperblock_,
+                                arg.block_2_etile_map_,
+                                arg.compute_ptr_offset_of_groups_,
+                                arg.compute_ptr_offset_of_n_);
+                        }
                    }
                    else
                    {
@@ -1274,26 +1327,52 @@ struct DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle
                            isMultiB,
                            CTranspose>;

-                        return launch_and_time_kernel(
-                            stream_config,
-                            kernel,
-                            dim3(gdx, gdy, gdz),
-                            dim3(BlockSize),
-                            0,
-                            p_a_grid,
-                            p_b_grid,
-                            arg.p_ds_grid_,
-                            p_e_grid,
-                            arg.a_element_op_,
-                            arg.b_element_op_,
-                            arg.cde_element_op_,
-                            arg.a_grid_desc_ak0_m_ak1_,
-                            arg.b_grid_desc_bk0_n_bk1_,
-                            arg.ds_grid_desc_mblock_mperblock_nblock_nperblock_,
-                            arg.e_grid_desc_mblock_mperblock_nblock_nperblock_,
-                            arg.block_2_etile_map_,
-                            arg.compute_ptr_offset_of_groups_,
-                            arg.compute_ptr_offset_of_n_);
+                        if(stream_config.flush_cache)
+                        {
+                            return launch_and_time_kernel_flush_cache(
+                                stream_config,
+                                kernel,
+                                dim3(gdx, gdy, gdz),
+                                dim3(BlockSize),
+                                0,
+                                p_a_grid,
+                                p_b_grid,
+                                arg.p_ds_grid_,
+                                p_e_grid,
+                                arg.a_element_op_,
+                                arg.b_element_op_,
+                                arg.cde_element_op_,
+                                arg.a_grid_desc_ak0_m_ak1_,
+                                arg.b_grid_desc_bk0_n_bk1_,
+                                arg.ds_grid_desc_mblock_mperblock_nblock_nperblock_,
+                                arg.e_grid_desc_mblock_mperblock_nblock_nperblock_,
+                                arg.block_2_etile_map_,
+                                arg.compute_ptr_offset_of_groups_,
+                                arg.compute_ptr_offset_of_n_);
+                        }
+                        else
+                        {
+                            return launch_and_time_kernel(
+                                stream_config,
+                                kernel,
+                                dim3(gdx, gdy, gdz),
+                                dim3(BlockSize),
+                                0,
+                                p_a_grid,
+                                p_b_grid,
+                                arg.p_ds_grid_,
+                                p_e_grid,
+                                arg.a_element_op_,
+                                arg.b_element_op_,
+                                arg.cde_element_op_,
+                                arg.a_grid_desc_ak0_m_ak1_,
+                                arg.b_grid_desc_bk0_n_bk1_,
+                                arg.ds_grid_desc_mblock_mperblock_nblock_nperblock_,
+                                arg.e_grid_desc_mblock_mperblock_nblock_nperblock_,
+                                arg.block_2_etile_map_,
+                                arg.compute_ptr_offset_of_groups_,
+                                arg.compute_ptr_offset_of_n_);
+                        }
                    }
                }
            };
--- a/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_abd_xdl_cshuffle_v3.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_abd_xdl_cshuffle_v3.hpp
@@ -26,7 +26,6 @@
 #include "ck/tensor_operation/gpu/device/impl/device_grouped_conv_utils.hpp"
 #include "ck/host_utility/device_prop.hpp"
 #include "ck/host_utility/kernel_launch.hpp"
-#include "ck/host_utility/flush_cache.hpp"
 #include "ck/host_utility/io.hpp"
 #ifdef CK_EXPERIMENTAL_BUILDER
 #include "ck_tile/builder/reflect/conv_describe.hpp"
@@ -1049,35 +1048,19 @@ struct DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3
            const auto Run = [&](const auto& kernel) {
                if(stream_config.flush_cache)
                {
-                    typename GridwiseGemm::Argument gemm_arg_ = gemm_arg;
-                    ck::utility::RotatingMemWrapper<typename GridwiseGemm::Argument> rotating_mem(
-                        gemm_arg_,
-                        stream_config.rotating_count,
-                        gemm_arg_.M * gemm_arg_.K * sizeof(ADataType),
-                        gemm_arg_.K * gemm_arg_.N * sizeof(BDataType));
-                    rotating_mem.Print();
-
-                    auto run_flush_cache = [&]() {
-                        // flush icache
-                        ck::utility::flush_icache();
-                        // rotating mem
-                        rotating_mem.Next();
-                    };
-
-                    ave_time += ck::utility::launch_and_time_kernel_with_preprocess<false>(
-                        stream_config,
-                        run_flush_cache,
-                        kernel,
-                        dim3(gdx, gdy, gdz),
-                        dim3(BlockSize),
-                        0,
-                        gemm_arg_,
-                        arg.a_grid_desc_ak0_m_ak1_,
-                        arg.b_grid_desc_bk0_n_bk1_,
-                        arg.ds_grid_desc_m_n_,
-                        arg.e_grid_desc_m_n_,
-                        arg.compute_ptr_offset_of_groups_,
-                        arg.compute_ptr_offset_of_n_);
+                    ave_time +=
+                        launch_and_time_kernel_flush_cache(stream_config,
+                                                           kernel,
+                                                           dim3(gdx, gdy, gdz),
+                                                           dim3(BlockSize),
+                                                           0,
+                                                           gemm_arg,
+                                                           arg.a_grid_desc_ak0_m_ak1_,
+                                                           arg.b_grid_desc_bk0_n_bk1_,
+                                                           arg.ds_grid_desc_m_n_,
+                                                           arg.e_grid_desc_m_n_,
+                                                           arg.compute_ptr_offset_of_groups_,
+                                                           arg.compute_ptr_offset_of_n_);
                }
                else
                {
--- a/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_d_xdl_large_tensor_cshuffle.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_d_xdl_large_tensor_cshuffle.hpp
@@ -759,19 +759,36 @@ struct DeviceGroupedConvFwdMultipleD_Xdl_CShuffle_Large_Tensor
                    CDEElementwiseOperation,
                    ComputePtrOffsetOfStridedBatch<I1, I1, NumDTensor>,
                    has_main_loop>;
-
-                return launch_and_time_kernel(stream_config,
-                                              kernel,
-                                              dim3(gdx, gdy, gdz),
-                                              dim3(BlockSize),
-                                              0,
-                                              arg.gemm_desc_kernel_args_,
-                                              arg.gemms_count_,
-                                              arg.a_element_op_,
-                                              arg.b_element_op_,
-                                              arg.cde_element_op_,
-                                              arg.compute_ptr_offset_of_groups_,
-                                              arg.compute_ptr_offset_of_n_);
+                if(stream_config.flush_cache)
+                {
+                    return launch_and_time_kernel_flush_cache(stream_config,
+                                                              kernel,
+                                                              dim3(gdx, gdy, gdz),
+                                                              dim3(BlockSize),
+                                                              0,
+                                                              arg.gemm_desc_kernel_args_,
+                                                              arg.gemms_count_,
+                                                              arg.a_element_op_,
+                                                              arg.b_element_op_,
+                                                              arg.cde_element_op_,
+                                                              arg.compute_ptr_offset_of_groups_,
+                                                              arg.compute_ptr_offset_of_n_);
+                }
+                else
+                {
+                    return launch_and_time_kernel(stream_config,
+                                                  kernel,
+                                                  dim3(gdx, gdy, gdz),
+                                                  dim3(BlockSize),
+                                                  0,
+                                                  arg.gemm_desc_kernel_args_,
+                                                  arg.gemms_count_,
+                                                  arg.a_element_op_,
+                                                  arg.b_element_op_,
+                                                  arg.cde_element_op_,
+                                                  arg.compute_ptr_offset_of_groups_,
+                                                  arg.compute_ptr_offset_of_n_);
+                }
            };

            if(GridwiseGemm::CalculateHasMainKBlockLoop(K))
--- a/include/ck_tile/host/kernel_launch.hpp
+++ b/include/ck_tile/host/kernel_launch.hpp
@@ -10,6 +10,8 @@
 #include "ck_tile/host/hip_check_error.hpp"
 #include "ck_tile/host/stream_config.hpp"
 #include "ck_tile/host/timer.hpp"
+#include "ck_tile/host/flush_icache.hpp"
+#include "ck_tile/host/rotating_buffers.hpp"
 #include <cstddef>
 #include <hip/hip_runtime.h>

@@ -124,6 +126,47 @@ preprocess_profiling_impl(TimerType timer, const stream_config& s, PreprocessFun
    return timer.duration() / s.nrepeat_;
 }

+template <typename TimerType, typename CallablesFunc, typename PreprocessFunc = std::nullptr_t> // Timed loop that flushes the cache every iteration and subtracts the separately measured flush cost.
+CK_TILE_HOST double timing_loop_flush_cache_impl(TimerType timer, // taken by value; start/stop/duration are called on it
+                                                 const stream_config& s, // reads cold_niters_, nrepeat_, stream_id_
+                                                 CallablesFunc&& callables_func, // nullary callable invoked once per iteration
+                                                 PreprocessFunc preprocess = nullptr) // optional; skipped at compile time when the type is std::nullptr_t
+{
+    auto run_flush_cache = [&]() { ck_tile::flush_icache(); }; // flush step, also profiled on its own below
+    // Warm up: cold_niters_ untimed iterations of (optional) preprocess + callables, without flushing
+    for(int i = 0; i < s.cold_niters_; i++)
+    {
+        if constexpr(!std::is_same_v<PreprocessFunc, std::nullptr_t>)
+        {
+            preprocess();
+        }
+        callables_func();
+    }
+    // Main timing loop: flush + (optional) preprocess + callables, nrepeat_ times between timer.start and timer.stop
+    int i = 0; // counts completed timed iterations; stays 0 when nrepeat_ <= 0
+    timer.start(s.stream_id_);
+    while(i < s.nrepeat_)
+    {
+        run_flush_cache();
+        if constexpr(!std::is_same_v<PreprocessFunc, std::nullptr_t>)
+        {
+            preprocess(); // inside the timed region, so preprocess cost is part of the result
+        }
+
+        callables_func();
+        i++;
+    }
+    timer.stop(s.stream_id_);
+    // Flush cache timing loop. NOTE(review): always measured with gpu_timer, even when TimerType is a different timer
+    auto flush_cache_time = preprocess_profiling_impl(gpu_timer{}, s, run_flush_cache); // per-iteration average (that helper returns duration / nrepeat_)
+    if(i == 0) // no timed iteration ran. NOTE(review): checked only after the flush profiling above has already divided by nrepeat_
+    {
+        return 0.;
+    }
+    // Exclude flush cache from result: average per iteration minus average flush cost
+    return (timer.duration() / s.nrepeat_) - flush_cache_time;
+}
+
 template <typename TimerType, typename CallablesFunc, typename PreprocessFunc = std::nullptr_t>
 CK_TILE_HOST double timing_loop_impl(TimerType timer,
                                     const stream_config& s,
@@ -138,12 +181,6 @@ CK_TILE_HOST double timing_loop_impl(TimerType timer,
        }
        callables_func();
    }
-    // Only profile preprocess if it's provided
-    auto preprocess_time = 0.0;
-    if constexpr(!std::is_same_v<PreprocessFunc, std::nullptr_t>)
-    {
-        preprocess_time = preprocess_profiling_impl(gpu_timer{}, s, preprocess);
-    }

    int i = 0;
    timer.start(s.stream_id_);
@@ -159,9 +196,9 @@ CK_TILE_HOST double timing_loop_impl(TimerType timer,
    }
    timer.stop(s.stream_id_);

-    if(!i)
+    if(i == 0)
        return 0.;
-    return (timer.duration() / s.nrepeat_) - preprocess_time;
+    return timer.duration() / s.nrepeat_;
 }

 // clang-format off
@@ -238,4 +275,31 @@ launch_kernel_time_mask(const stream_config& s, PreprocessFunc preprocess, Calla
        return timing_loop_impl(cpu_timer{}, s, callables_func, preprocess);
    }
 }
+
+template <typename PreprocessFunc, typename... Callables> // Runs the callables once, or times them with a cache flush before every timed iteration.
+CK_TILE_HOST float launch_kernel_time_mask_flush_cache(const stream_config& s, // reads time_kernel_ and is_gpu_timer_
+                                                       PreprocessFunc preprocess, // called unconditionally below, so it must be invocable (nullptr will not compile)
+                                                       Callables&&... callables) // forwarded to launch_and_check
+{
+    static_assert(sizeof...(callables) > 0, "At least one callable is required!");
+
+    if(!s.time_kernel_) // timing off: one preprocess, one launch, report 0
+    {
+        preprocess();
+        launch_and_check(s, std::forward<Callables>(callables)...);
+        return 0;
+    }
+
+    auto callables_func = [&]() { launch_and_check(s, std::forward<Callables>(callables)...); }; // invoked once per warm-up and timed iteration by the timing loop
+
+    if(s.is_gpu_timer_) // pick the timer type; the double result is narrowed to float on return
+    {
+        return timing_loop_flush_cache_impl(gpu_timer{}, s, callables_func, preprocess);
+    }
+    else
+    {
+        return timing_loop_flush_cache_impl(cpu_timer{}, s, callables_func, preprocess);
+    }
+}
+
 } // namespace ck_tile