Merge commit 'bbdaf79a52c7b1cb74fe3d758e7b5142b2084f70' into develop

2026-05-16 10:59:55 +00:00 · 2025-05-28 16:07:10 +00:00
parent 437f8ebc7a
commit 63ea0473f6
18 changed files with 81 additions and 416 deletions
--- a/example/ck_tile/03_gemm/gemm_utils.hpp
+++ b/example/ck_tile/03_gemm/gemm_utils.hpp
@@ -220,11 +220,4 @@ auto create_args(int argc, char* argv[])
 }

 // host API
-template <typename ADataType,
-          typename BDataType,
-          typename AccDataType,
-          typename CDataType,
-          typename ALayout,
-          typename BLayout,
-          typename CLayout>
 float gemm_calc(const ck_tile::GemmHostArgs& args, const ck_tile::stream_config& s);
--- a/example/ck_tile/03_gemm/run_gemm_example.inc
+++ b/example/ck_tile/03_gemm/run_gemm_example.inc
@@ -178,7 +178,7 @@ float invoke_gemm(ck_tile::DeviceMem& a_m_k_dev_buf,

    float ave_time =
        gemm_calc<ADataType, BDataType, AccDataType, CDataType, ALayout, BLayout, CLayout>(
-            args, ck_tile::stream_config{nullptr, true, 1, n_warmup, n_repeat, true, true, 50});
+            args, ck_tile::stream_config{nullptr, true, 1, n_warmup, n_repeat});

    std::size_t flop = std::size_t(2) * M * N * K;
    std::size_t num_byte =
--- a/example/ck_tile/03_gemm/universal_gemm.cpp
+++ b/example/ck_tile/03_gemm/universal_gemm.cpp
@@ -11,7 +11,6 @@

 #include "ck_tile/host.hpp"
 #include "gemm_utils.hpp"
-#include "run_gemm_example.inc"

 template <typename Pipeline, ck_tile::TailNumber TN>
 void try_run(ck_tile::TailNumber tn)
@@ -75,102 +74,64 @@ float gemm_calc(const ck_tile::GemmHostArgs& args, const ck_tile::stream_config&

    float ave_time{0};

-    const auto Run =
-        [&](const auto has_hot_loop_, const auto tail_number_, const auto memory_operation_) {
-            constexpr bool has_hot_loop_v   = has_hot_loop_.value;
-            constexpr auto tail_number_v    = tail_number_.value;
-            constexpr auto scheduler        = GEMM_PIPELINE_SCHEDULER;
-            constexpr auto memory_operation = memory_operation_.value;
+    const auto Run = [&](const auto has_hot_loop_,
+                         const auto tail_number_,
+                         const auto memory_operation_) {
+        constexpr bool has_hot_loop_v   = has_hot_loop_.value;
+        constexpr auto tail_number_v    = tail_number_.value;
+        constexpr auto scheduler        = GEMM_PIPELINE_SCHEDULER;
+        constexpr auto memory_operation = memory_operation_.value;

-            using UniversalGemmProblem = ck_tile::UniversalGemmPipelineProblem<ADataType,
-                                                                               BDataType,
-                                                                               AccDataType,
-                                                                               GemmShape,
-                                                                               GemmUniversalTraits,
-                                                                               scheduler,
-                                                                               has_hot_loop_v,
-                                                                               tail_number_v>;
+        using UniversalGemmProblem = ck_tile::UniversalGemmPipelineProblem<ADataType,
+                                                                           BDataType,
+                                                                           AccDataType,
+                                                                           GemmShape,
+                                                                           GemmUniversalTraits,
+                                                                           scheduler,
+                                                                           has_hot_loop_v,
+                                                                           tail_number_v>;

-            using GemmPipeline = GEMM_PIPELINE<UniversalGemmProblem>;
-            using GemmEpilogue = ck_tile::CShuffleEpilogue<
-                ck_tile::CShuffleEpilogueProblem<ADataType,
-                                                 BDataType,
-                                                 AccDataType,
-                                                 CDataType,
-                                                 CLayout,
-                                                 GemmPipelineProblem::kBlockSize,
-                                                 TilePartitioner::MPerBlock,
-                                                 TilePartitioner::NPerBlock,
-                                                 GemmConfig::M_Warp,
-                                                 GemmConfig::N_Warp,
-                                                 GemmConfig::M_Warp_Tile,
-                                                 GemmConfig::N_Warp_Tile,
-                                                 GemmConfig::K_Warp_Tile,
-                                                 UniversalGemmProblem::TransposeC,
-                                                 memory_operation>>;
-            using Kernel = ck_tile::GemmKernel<TilePartitioner, GemmPipeline, GemmEpilogue>;
-            auto kargs   = Kernel::MakeKernelArgs(args);
+        using GemmPipeline = GEMM_PIPELINE<UniversalGemmProblem>;
+        using GemmEpilogue = ck_tile::CShuffleEpilogue<
+            ck_tile::CShuffleEpilogueProblem<ADataType,
+                                             BDataType,
+                                             AccDataType,
+                                             CDataType,
+                                             CLayout,
+                                             GemmPipelineProblem::kBlockSize,
+                                             TilePartitioner::MPerBlock,
+                                             TilePartitioner::NPerBlock,
+                                             GemmConfig::M_Warp,
+                                             GemmConfig::N_Warp,
+                                             GemmConfig::M_Warp_Tile,
+                                             GemmConfig::N_Warp_Tile,
+                                             GemmConfig::K_Warp_Tile,
+                                             UniversalGemmProblem::TransposeC,
+                                             memory_operation>>;
+        using Kernel = ck_tile::GemmKernel<TilePartitioner, GemmPipeline, GemmEpilogue>;
+        auto kargs   = Kernel::MakeKernelArgs(args);

-            const dim3 grids      = Kernel::GridSize(args.M, args.N, args.k_batch);
-            constexpr dim3 blocks = Kernel::BlockSize();
+        const dim3 grids      = Kernel::GridSize(args.M, args.N, args.k_batch);
+        constexpr dim3 blocks = Kernel::BlockSize();

-            if(!Kernel::IsSupportedArgument(kargs))
-            {
-                throw std::runtime_error("Wrong! Arguments not supported! Skipping gemm!\n");
-            }
+        if(!Kernel::IsSupportedArgument(kargs))
+        {
+            throw std::runtime_error("Wrong! Arguments not supported! Skipping gemm!\n");
+        }

-            if(s.log_level_ > 0)
-            {
-                std::cout << "Launching kernel with args:"
-                          << " grid: {" << grids.x << ", " << grids.y << ", " << grids.z << "}"
-                          << ", blocks: {" << blocks.x << ", " << blocks.y << ", " << blocks.z
-                          << "}" << std::endl;
-            }
-            if(s.flush_cache_)
-            {
-                std::cout << "Flushing cache..." << std::endl;
-                static constexpr ck_tile::index_t APackedSize =
-                    std::is_same_v<BDataType, ck_tile::pk_int4_t> ? 2 : 1;
-                static constexpr ck_tile::index_t BPackedSize =
-                    std::is_same_v<BDataType, ck_tile::pk_int4_t> ? 2 : 1;
+        if(s.log_level_ > 0)
+        {
+            std::cout << "Launching kernel with args:"
+                      << " grid: {" << grids.x << ", " << grids.y << ", " << grids.z << "}"
+                      << ", blocks: {" << blocks.x << ", " << blocks.y << ", " << blocks.z << "}"
+                      << std::endl;
+        }

-                ck_tile::HostTensor<ADataType> a_m(ck_tile::host_tensor_descriptor(
-                    args.M, args.K, args.stride_A, is_row_major(ALayout{})));
-                ck_tile::HostTensor<BDataType> b_n(ck_tile::host_tensor_descriptor(
-                    args.K, args.N, args.stride_B, is_row_major(BLayout{})));
-
-                auto size_a_buffer = a_m.get_element_space_size_in_bytes() / APackedSize;
-                auto size_b_buffer = b_n.get_element_space_size_in_bytes() / BPackedSize;
-
-                ck_tile::RotatingMemWrapper<ADataType, BDataType> rotating_mem(
-                    kargs.a_ptr, kargs.b_ptr, s.rotating_count_, size_a_buffer, size_b_buffer);
-                rotating_mem.Print();
-
-                auto run_flush_cache = [&]() {
-                    // flush icache
-                    ck_tile::flush_icache();
-                    // rotating mem
-                    rotating_mem.Next();
-                    // clear c mem
-                    if(args.k_batch > 1)
-                        hipGetErrorString(hipMemsetAsync(
-                            args.c_ptr, 0, args.M * args.N * sizeof(CDataType), s.stream_id_));
-                };
-                ave_time = ck_tile::launch_kernel_preprocess(
-                    s,
-                    run_flush_cache,
-                    ck_tile::make_kernel<blocks.x, GemmConfig::kBlockPerCu>(
-                        Kernel{}, grids, blocks, 0, kargs));
-            }
-            else
-            {
-                ave_time =
-                    ck_tile::launch_kernel(s,
-                                           ck_tile::make_kernel<blocks.x, GemmConfig::kBlockPerCu>(
-                                               Kernel{}, grids, blocks, 0, kargs));
-            }
-            return ave_time;
-        };
+        ave_time = ck_tile::launch_kernel(s,
+                                          ck_tile::make_kernel<blocks.x, GemmConfig::kBlockPerCu>(
+                                              Kernel{}, grids, blocks, 0, kargs));
+        return ave_time;
+    };

    const auto RunSplitk = [&](const auto has_hot_loop_, const auto tail_number_) {
        if(args.k_batch == 1)
@@ -282,6 +243,8 @@ float gemm_calc(const ck_tile::GemmHostArgs& args, const ck_tile::stream_config&
    return ave_time;
 }

+#include "run_gemm_example.inc"
+
 template <typename APrecType, typename BPrecType = APrecType, typename CPrecType = APrecType>
 int run_gemm_example_prec_type(std::string a_layout, std::string b_layout, int argc, char* argv[])
 {
--- a/example/ck_tile/18_flatmm/flatmm_basic.cpp
+++ b/example/ck_tile/18_flatmm/flatmm_basic.cpp
@@ -11,7 +11,6 @@

 #include "ck_tile/host.hpp"
 #include "flatmm_basic.hpp"
-#include "run_flatmm_example.inc"

 template <typename ADataType,
          typename BDataType,
@@ -116,47 +115,9 @@ float flatmm_calc(const ck_tile::FlatmmHostArgs& args, const ck_tile::stream_con
                      << std::endl;
        }

-        float ave_time{0};
-        if(s.flush_cache_)
-        {
-            std::cout << "Flushing cache..." << std::endl;
-            static constexpr ck_tile::index_t APackedSize =
-                std::is_same_v<BDataType, ck_tile::pk_int4_t> ? 2 : 1;
-            static constexpr ck_tile::index_t BPackedSize =
-                std::is_same_v<BDataType, ck_tile::pk_int4_t> ? 2 : 1;
+        float ave_time = ck_tile::launch_kernel(
+            s, ck_tile::make_kernel<blocks.x, kBlockPerCu>(Kernel{}, grids, blocks, 0, kargs));

-            ck_tile::HostTensor<ADataType> a_m(ck_tile::host_tensor_descriptor(
-                args.M, args.K, args.stride_A, is_row_major(ALayout{})));
-            ck_tile::HostTensor<BDataType> b_n(ck_tile::host_tensor_descriptor(
-                args.K, args.N, args.stride_B, is_row_major(BLayout{})));
-
-            auto size_a_buffer = a_m.get_element_space_size_in_bytes() / APackedSize;
-            auto size_b_buffer = b_n.get_element_space_size_in_bytes() / BPackedSize;
-
-            ck_tile::RotatingMemWrapper<ADataType, BDataType> rotating_mem(
-                kargs.a_ptr, kargs.b_shuffle_ptr, s.rotating_count_, size_a_buffer, size_b_buffer);
-            rotating_mem.Print();
-
-            auto run_flush_cache = [&]() {
-                // flush icache
-                ck_tile::flush_icache();
-                // rotating mem
-                rotating_mem.Next();
-                // clear c mem
-                if(args.k_batch > 1)
-                    hipGetErrorString(hipMemsetAsync(
-                        args.c_ptr, 0, args.M * args.N * sizeof(CDataType), s.stream_id_));
-            };
-            ave_time = ck_tile::launch_kernel_preprocess(
-                s,
-                run_flush_cache,
-                ck_tile::make_kernel<blocks.x, kBlockPerCu>(Kernel{}, grids, blocks, 0, kargs));
-        }
-        else
-        {
-            ave_time = ck_tile::launch_kernel(
-                s, ck_tile::make_kernel<blocks.x, kBlockPerCu>(Kernel{}, grids, blocks, 0, kargs));
-        }
        return ave_time;
    };
    if(args.k_batch == 1)
@@ -171,6 +132,8 @@ float flatmm_calc(const ck_tile::FlatmmHostArgs& args, const ck_tile::stream_con
    }
 }

+#include "run_flatmm_example.inc"
+
 int run_flatmm_example(int argc, char* argv[])
 {
    auto [result, arg_parser] = create_args(argc, argv);
--- a/example/ck_tile/18_flatmm/flatmm_basic.hpp
+++ b/example/ck_tile/18_flatmm/flatmm_basic.hpp
@@ -133,11 +133,4 @@ auto create_args(int argc, char* argv[])
 }

 // host API
-template <typename ADataType,
-          typename BDataType,
-          typename AccDataType,
-          typename CDataType,
-          typename ALayout,
-          typename BLayout,
-          typename CLayout>
 float flatmm_calc(const ck_tile::FlatmmHostArgs& args, const ck_tile::stream_config& s);
--- a/example/ck_tile/18_flatmm/run_flatmm_example.inc
+++ b/example/ck_tile/18_flatmm/run_flatmm_example.inc
@@ -122,7 +122,7 @@ float invoke_flatmm(ck_tile::DeviceMem& a_dev_buf,

    float ave_time =
        flatmm_calc<ADataType, BDataType, AccDataType, CDataType, ALayout, BLayout, CLayout>(
-            args, ck_tile::stream_config{nullptr, true, 1, n_warmup, n_repeat, true, true, 50});
+            args, ck_tile::stream_config{nullptr, true, 1, n_warmup, n_repeat});

    std::size_t flop = std::size_t(2) * M * N * K;
    std::size_t num_byte =
--- a/include/ck_tile/host.hpp
+++ b/include/ck_tile/host.hpp
@@ -37,5 +37,3 @@
 #include "ck_tile/host/stream_config.hpp"
 #include "ck_tile/host/stream_utils.hpp"
 #include "ck_tile/host/timer.hpp"
-#include "ck_tile/host/flush_icache.hpp"
-#include "ck_tile/host/rotating_buffers.hpp"
--- a/include/ck_tile/host/flush_icache.hpp
+++ b/include/ck_tile/host/flush_icache.hpp
@@ -1,30 +0,0 @@
-// SPDX-License-Identifier: MIT
-// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
-
-#pragma once
-
-#include <hip/hip_runtime.h>
-
-namespace ck_tile {
-static __global__ void flush_cache()
-{
-    asm __volatile__("s_icache_inv \n\t"
-                     "s_nop 0 \n\t"
-                     "s_nop 0 \n\t"
-                     "s_nop 0 \n\t"
-                     "s_nop 0 \n\t"
-                     "s_nop 0 \n\t"
-                     "s_nop 0 \n\t"
-                     "s_nop 0 \n\t"
-                     "s_nop 0 \n\t"
-                     "s_nop 0 \n\t"
-                     "s_nop 0 \n\t"
-                     "s_nop 0 \n\t"
-                     "s_nop 0 \n\t"
-                     "s_nop 0 \n\t"
-                     "s_nop 0 \n\t"
-                     "s_nop 0 \n\t"
-                     "s_nop 0 \n\t" ::
-                         :);
-}
-} // namespace ck_tile
--- a/include/ck_tile/host/kernel_launch.hpp
+++ b/include/ck_tile/host/kernel_launch.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.

 #pragma once

@@ -11,13 +11,6 @@
 #include <cstddef>

 namespace ck_tile {
-
-#define CU_FOR_MI308 80
-#define CU_FOR_MI300X 228
-#define OPTIMAL_LATENCY_MI308 0.005
-#define OPTIMAL_LATENCY_MI300X 0.0015
-#define OPTIMAL_LATENCY_SAFE_MARGIN 0.01
-
 template <int MaxThreadPerBlock, int MinBlockPerCu, typename Kernel, typename... Args>
 #if CK_TILE_USE_LAUNCH_BOUNDS
 __launch_bounds__(MaxThreadPerBlock, MinBlockPerCu)
@@ -88,8 +81,6 @@ CK_TILE_HOST void launch_and_check(const stream_config& sc, Callables&&... calla
 template <typename... Callables>
 CK_TILE_HOST float launch_kernel(const stream_config& s, Callables&&... callables)
 {
-    static_assert(sizeof...(callables) > 0, "At least one callable is required!");
-
    if(!s.time_kernel_)
    {
        launch_and_check(s, std::forward<Callables>(callables)...);
@@ -97,7 +88,7 @@ CK_TILE_HOST float launch_kernel(const stream_config& s, Callables&&... callable
    }

    auto time_launches = [&](auto timer) {
-        // Warmup
+        // warmup
        for(int i = 0; i < s.cold_niters_; i++)
        {
            launch_and_check(s, std::forward<Callables>(callables)...);
@@ -123,52 +114,4 @@ CK_TILE_HOST float launch_kernel(const stream_config& s, Callables&&... callable
    }
 }

-template <typename PreprocessFunc, typename... Callables>
-CK_TILE_HOST float launch_kernel_preprocess(const stream_config& s,
-                                            PreprocessFunc preprocess,
-                                            Callables&&... callables)
-{
-    static_assert(sizeof...(callables) > 0, "At least one callable is required!");
-
-    if(!s.time_kernel_)
-    {
-        preprocess();
-        launch_and_check(s, std::forward<Callables>(callables)...);
-        return 0;
-    }
-
-    auto time_launches = [&](auto timer) {
-        // Warmup
-        for(int i = 0; i < s.cold_niters_; i++)
-        {
-            launch_and_check(s, std::forward<Callables>(callables)...);
-        }
-
-        timer.start(s.stream_id_);
-        for(int i = 0; i < s.nrepeat_; i++)
-        {
-            preprocess();
-            launch_and_check(s, std::forward<Callables>(callables)...);
-        }
-        timer.stop(s.stream_id_);
-
-        hipDeviceProp_t deviceProps;
-        HIP_CHECK_ERROR(hipGetDeviceProperties(&deviceProps, 0));
-
-        float preprocess_offset =
-            (deviceProps.multiProcessorCount >= CU_FOR_MI300X)  ? OPTIMAL_LATENCY_MI300X
-            : (deviceProps.multiProcessorCount == CU_FOR_MI308) ? OPTIMAL_LATENCY_MI308
-                                                                : OPTIMAL_LATENCY_SAFE_MARGIN;
-        return (timer.duration() - preprocess_offset * s.nrepeat_) / s.nrepeat_;
-    };
-
-    if(s.is_gpu_timer_)
-    {
-        return time_launches(gpu_timer{});
-    }
-    else
-    {
-        return time_launches(cpu_timer{});
-    }
-}
 } // namespace ck_tile
--- a/include/ck_tile/host/rotating_buffers.hpp
+++ b/include/ck_tile/host/rotating_buffers.hpp
@@ -1,102 +0,0 @@
-// SPDX-License-Identifier: MIT
-// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
-
-#pragma once
-
-#include "ck_tile/core/config.hpp"
-#include "ck_tile/host/hip_check_error.hpp"
-#include <hip/hip_runtime.h>
-
-namespace ck_tile {
-
-template <typename ADataType, typename BDataType>
-struct RotatingMemWrapper
-{
-    RotatingMemWrapper() = delete;
-    RotatingMemWrapper(const void* a_ptr_,
-                       const void* b_ptr_,
-                       std::size_t rotating_count_,
-                       std::size_t size_a_,
-                       std::size_t size_b_)
-        : a_ptr(a_ptr_),
-          b_ptr(b_ptr_),
-          rotating_count(rotating_count_),
-          size_a(size_a_),
-          size_b(size_b_)
-    {
-        p_a_grids.push_back(a_ptr);
-        p_b_grids.push_back(b_ptr);
-        for(size_t i = 1; i < rotating_count; i++)
-        {
-            {
-                void* pADeviceBuf;
-                HIP_CHECK_ERROR(hipMalloc(static_cast<void**>(&pADeviceBuf), size_a_));
-                HIP_CHECK_ERROR(hipMemcpy(static_cast<void*>(pADeviceBuf),
-                                          const_cast<void*>(p_a_grids[0]),
-                                          size_a_,
-                                          hipMemcpyDeviceToDevice));
-                p_a_grids.push_back(pADeviceBuf);
-            }
-
-            {
-                void* pBDeviceBuf;
-                HIP_CHECK_ERROR(hipMalloc(static_cast<void**>(&pBDeviceBuf), size_b_));
-                HIP_CHECK_ERROR(hipMemcpy(static_cast<void*>(pBDeviceBuf),
-                                          const_cast<void*>(p_b_grids[0]),
-                                          size_b_,
-                                          hipMemcpyDeviceToDevice));
-                p_b_grids.push_back(pBDeviceBuf);
-            }
-        }
-    }
-    void Next()
-    {
-        if(rotating_count > 1)
-        {
-            std::size_t idx = iter++ % rotating_count;
-            a_ptr           = p_a_grids[idx];
-            b_ptr           = p_b_grids[idx];
-        }
-    }
-    void Print()
-    {
-        std::cout << "RotatingMemWrapper: { size_a: " << size_a << ", size_b: " << size_b
-                  << ", rotating_count: " << rotating_count << "}" << std::endl;
-    }
-    ~RotatingMemWrapper() noexcept
-    {
-        if(rotating_count > 1)
-        {
-            // restore ptr
-            a_ptr = p_a_grids[0];
-            b_ptr = p_b_grids[0];
-
-            // free device mem
-            for(size_t i = 1; i < rotating_count; i++)
-            {
-                ck_tile::hip_check_error(hipFree(const_cast<void*>(p_a_grids[i])));
-                ck_tile::hip_check_error(hipFree(const_cast<void*>(p_b_grids[i])));
-            }
-        }
-    }
-
-    private:
-    const void* a_ptr;
-    const void* b_ptr;
-    std::size_t iter           = 0;
-    std::size_t rotating_count = 1;
-    std::size_t size_a         = 0;
-    std::size_t size_b         = 0;
-    std::vector<const void*> p_a_grids;
-    std::vector<const void*> p_b_grids;
-};
-inline void flush_icache()
-{
-    hipDeviceProp_t deviceProps;
-    HIP_CHECK_ERROR(hipGetDeviceProperties(&deviceProps, 0));
-    int32_t gpu_block3 = deviceProps.multiProcessorCount * 60;
-
-    ck_tile::flush_cache<<<dim3(gpu_block3), dim3(64), 0, nullptr>>>();
-    HIP_CHECK_ERROR(hipGetLastError());
-}
-} // namespace ck_tile
--- a/include/ck_tile/host/stream_config.hpp
+++ b/include/ck_tile/host/stream_config.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.

 #pragma once

@@ -30,7 +30,5 @@ struct stream_config
    int cold_niters_       = 3;
    int nrepeat_           = 10;
    bool is_gpu_timer_     = true; // keep compatible
-    bool flush_cache_      = false;
-    int rotating_count_    = 1;
 };
 } // namespace ck_tile
--- a/tile_engine/ops/gemm/README.md
+++ b/tile_engine/ops/gemm/README.md
@@ -42,8 +42,6 @@ rm -rf tile_engine/ && ninja benchmark_gemm  # rebuild
                 -repeat    The number of iterations to benchmark the kernel. Default is 100.
                  -timer    Whether if the timer is gpu timer or not. Possible values are true or false. Default is true.  
                   -init    The method of tensor initialization. Set to 0 for random, to 1 for linear, or 2 for constant(1). Default is 0, random.
-            -flush_cache    To flush cache in between different runs.Possible values are true or false. Default is false.
-         -rotating_count    count to flush cache. Default is 5.     
                 -metric    Metric with which to measure kernel performance. Set to 0 for latency, 1 for tflops, or 2 for bandwidth. Default is 0, latency.
           -csv_filename    The filename of benchmark result. Default is gemm_kernel.
    -structured_sparsity    whether use sparsity kernel or not. Possible values are true or false. Default is false.
--- a/tile_engine/ops/gemm/benchmark_gemm.cpp
+++ b/tile_engine/ops/gemm/benchmark_gemm.cpp
@@ -26,15 +26,15 @@ void benchmark_gemm(const ck_tile::ArgParser& arg_parser)
                             CLayout::name,
                             arg_parser.get_bool("structured_sparsity")};

-    Setting setting{arg_parser.get_int("warmup"),
-                    arg_parser.get_int("repeat"),
-                    arg_parser.get_bool("timer"),
-                    arg_parser.get_int("verify"),
-                    arg_parser.get_int("init"),
-                    arg_parser.get_bool("log"),
-                    arg_parser.get_str("csv_filename"),
-                    arg_parser.get_bool("flush_cache"),
-                    arg_parser.get_int("rotating_count")};
+    Setting setting{
+        arg_parser.get_int("warmup"),
+        arg_parser.get_int("repeat"),
+        arg_parser.get_bool("timer"),
+        arg_parser.get_int("verify"),
+        arg_parser.get_int("init"),
+        arg_parser.get_bool("log"),
+        arg_parser.get_str("csv_filename"),
+    };

    auto& profiler = GemmProfiler::instance(setting);

--- a/tile_engine/ops/gemm/benchmark_gemm.hpp
+++ b/tile_engine/ops/gemm/benchmark_gemm.hpp
@@ -125,8 +125,6 @@ struct Setting
    int init_method_;
    bool log_;
    std::string csv_filename_;
-    bool flush_cache_;
-    int rotating_count_;
 };

 inline std::string get_rocm_version()
--- a/tile_engine/ops/gemm/gemm_host_api.hpp
+++ b/tile_engine/ops/gemm/gemm_host_api.hpp
@@ -93,11 +93,6 @@ inline auto create_args(int argc, char* argv[])
                "0",
                "The method of tensor initialization. Set to 0 for random, to 1 for linear, or 2 "
                "for constant(1). Default is 0, random.")
-        .insert("flush_cache",
-                "false",
-                "To flush cache, possible values are true or false. "
-                "Default is false.")
-        .insert("rotating_count", "5", "number of iterations to rotate the cache. default is 5.")
        .insert("metric",
                "0",
                "Metric with which to measure kernel performance. Set to 0 for latency, 1 for "
--- a/tile_engine/ops/gemm/gemm_instance_builder.py
+++ b/tile_engine/ops/gemm/gemm_instance_builder.py
@@ -273,52 +273,9 @@ struct GemmKernel {{
                      << std::endl;
            }}

-            if(stream.flush_cache_)
-            {{
-                std::cout << "Flushing cache..." << std::endl;
-                static constexpr ck_tile::index_t APackedSize =
-                    std::is_same_v<BDataType, ck_tile::pk_int4_t> ? 2 : 1;
-                static constexpr ck_tile::index_t BPackedSize =
-                    std::is_same_v<BDataType, ck_tile::pk_int4_t> ? 2 : 1;
-                
-                auto is_row_major = [](auto layout_) {{
-                    return ck_tile::bool_constant<std::is_same_v<ck_tile::remove_cvref_t<decltype(layout_)>,
-                                                 ck_tile::tensor_layout::gemm::RowMajor>>{{}};
-                }};
-
-                ck_tile::HostTensor<ADataType> a_m(ck_tile::host_tensor_descriptor(
-                    args.M, args.K, args.stride_A, is_row_major(ALayout{{}})));
-                ck_tile::HostTensor<BDataType> b_n(ck_tile::host_tensor_descriptor(
-                    args.K, args.N, args.stride_B, is_row_major(BLayout{{}})));
-
-                auto size_a_buffer = a_m.get_element_space_size_in_bytes() / APackedSize;
-                auto size_b_buffer = b_n.get_element_space_size_in_bytes() / BPackedSize;
-
-                ck_tile::RotatingMemWrapper<ADataType, BDataType> rotating_mem(
-                    kargs.a_ptr, kargs.b_ptr, stream.rotating_count_, size_a_buffer, size_b_buffer);
-                rotating_mem.Print();
-
-                auto run_flush_cache = [&]() {{
-                    // flush icache
-                    ck_tile::flush_icache();
-                    // rotating mem
-                    rotating_mem.Next();
-                    // clear c mem
-                    if(args.k_batch > 1)
-                        hipGetErrorString(hipMemsetAsync(
-                            args.c_ptr, 0, args.M * args.N * sizeof(CDataType), stream.stream_id_));
-                }};
-                ave_time = ck_tile::launch_kernel_preprocess(
-                    stream,
-                    run_flush_cache,
-                    ck_tile::make_kernel<blocks.x, kBlockPerCu>(
-                        Kernel{{}}, grids, blocks, 0, kargs));
-            }}
-            else{{
-                ave_time = ck_tile::launch_kernel(stream,
+            ave_time = ck_tile::launch_kernel(stream,
                                          ck_tile::make_kernel<blocks.x, kBlockPerCu>(
                                              Kernel{{}}, grids, blocks, 0, kargs));
-            }}
            return ave_time;

        }};
--- a/tile_engine/ops/gemm/gemm_profiler.hpp
+++ b/tile_engine/ops/gemm/gemm_profiler.hpp
@@ -128,9 +128,7 @@ class GemmProfiler
                                                                     setting_.log_,
                                                                     setting_.n_warmup_,
                                                                     setting_.n_repeat_,
-                                                                     setting_.is_gpu_timer_,
-                                                                     setting_.flush_cache_,
-                                                                     setting_.rotating_count_});
+                                                                     setting_.is_gpu_timer_});
            process_result(gemm_problem,
                           c_m_n_dev_buf,
                           c_m_n_host_result,
--- a/tile_engine/ops/gemm/json_config.py
+++ b/tile_engine/ops/gemm/json_config.py
@@ -9,7 +9,7 @@ Handles loading, parsing, and validation of JSON configuration parameters.

 from pathlib import Path
 from dataclasses import dataclass
-from typing import List, Optional, Union, Tuple, Type
+from typing import List, Optional, Union, Tuple, Type, Dict
 import json


@@ -63,8 +63,8 @@ class ProblemConfig:
    layouts: Tuple[EnumConfigParam, ...]

    @property
-    def datatype_map(self) -> dict[str, str]:
-        """Get current layout selections as a key-value map."""
+    def datatype_map(self) -> Dict[str, str]:
+        """Get datatype as a key-value map."""
        return {
            'matrix_a': self.datatypes[0].values[0],
            'matrix_b': self.datatypes[1].values[0],
@@ -72,8 +72,8 @@ class ProblemConfig:
        }

    @property
-    def layout_map(self) -> dict[str, str]:
-        """Get current layout selections as a key-value map."""
+    def layout_map(self) -> Dict[str, str]:
+        """Get layout as a key-value map."""
        return {
            'matrix_a': self.layouts[0].values[0],
            'matrix_b': self.layouts[1].values[0],
@@ -83,7 +83,7 @@ class ProblemConfig:

@dataclass
 class TileConfig:
-    """configuration class for tile parameter."""
+    """Configuration class for tile parameter."""
    tile_m: Union[EnumConfigParam, RangeConfigParam]
    tile_n: Union[EnumConfigParam, RangeConfigParam]
    tile_k: Union[EnumConfigParam, RangeConfigParam]
@@ -99,7 +99,7 @@ class TileConfig:

@dataclass
 class TraitConfig:
-    """configuration class for kernel traits."""
+    """Configuration class for kernel traits."""
    pipeline: EnumConfigParam
    scheduler: EnumConfigParam
    epilogue: EnumConfigParam