diff --git a/example/ck_tile/03_gemm/gemm_utils.hpp b/example/ck_tile/03_gemm/gemm_utils.hpp index 4c9fecaba6..25fab6bde0 100644 --- a/example/ck_tile/03_gemm/gemm_utils.hpp +++ b/example/ck_tile/03_gemm/gemm_utils.hpp @@ -220,11 +220,4 @@ auto create_args(int argc, char* argv[]) } // host API -template float gemm_calc(const ck_tile::GemmHostArgs& args, const ck_tile::stream_config& s); diff --git a/example/ck_tile/03_gemm/run_gemm_example.inc b/example/ck_tile/03_gemm/run_gemm_example.inc index 3010130e6c..79ed9ce76b 100644 --- a/example/ck_tile/03_gemm/run_gemm_example.inc +++ b/example/ck_tile/03_gemm/run_gemm_example.inc @@ -178,7 +178,7 @@ float invoke_gemm(ck_tile::DeviceMem& a_m_k_dev_buf, float ave_time = gemm_calc( - args, ck_tile::stream_config{nullptr, true, 1, n_warmup, n_repeat, true, true, 50}); + args, ck_tile::stream_config{nullptr, true, 1, n_warmup, n_repeat}); std::size_t flop = std::size_t(2) * M * N * K; std::size_t num_byte = diff --git a/example/ck_tile/03_gemm/universal_gemm.cpp b/example/ck_tile/03_gemm/universal_gemm.cpp index 5dcb685839..5718baf677 100644 --- a/example/ck_tile/03_gemm/universal_gemm.cpp +++ b/example/ck_tile/03_gemm/universal_gemm.cpp @@ -11,7 +11,6 @@ #include "ck_tile/host.hpp" #include "gemm_utils.hpp" -#include "run_gemm_example.inc" template void try_run(ck_tile::TailNumber tn) @@ -75,102 +74,64 @@ float gemm_calc(const ck_tile::GemmHostArgs& args, const ck_tile::stream_config& float ave_time{0}; - const auto Run = - [&](const auto has_hot_loop_, const auto tail_number_, const auto memory_operation_) { - constexpr bool has_hot_loop_v = has_hot_loop_.value; - constexpr auto tail_number_v = tail_number_.value; - constexpr auto scheduler = GEMM_PIPELINE_SCHEDULER; - constexpr auto memory_operation = memory_operation_.value; + const auto Run = [&](const auto has_hot_loop_, + const auto tail_number_, + const auto memory_operation_) { + constexpr bool has_hot_loop_v = has_hot_loop_.value; + constexpr auto tail_number_v = tail_number_.value; + constexpr auto scheduler = GEMM_PIPELINE_SCHEDULER; + constexpr auto memory_operation = memory_operation_.value; - using UniversalGemmProblem = ck_tile::UniversalGemmPipelineProblem; + using UniversalGemmProblem = ck_tile::UniversalGemmPipelineProblem; - using GemmPipeline = GEMM_PIPELINE; - using GemmEpilogue = ck_tile::CShuffleEpilogue< - ck_tile::CShuffleEpilogueProblem>; - using Kernel = ck_tile::GemmKernel; - auto kargs = Kernel::MakeKernelArgs(args); + using GemmPipeline = GEMM_PIPELINE; + using GemmEpilogue = ck_tile::CShuffleEpilogue< + ck_tile::CShuffleEpilogueProblem>; + using Kernel = ck_tile::GemmKernel; + auto kargs = Kernel::MakeKernelArgs(args); - const dim3 grids = Kernel::GridSize(args.M, args.N, args.k_batch); - constexpr dim3 blocks = Kernel::BlockSize(); + const dim3 grids = Kernel::GridSize(args.M, args.N, args.k_batch); + constexpr dim3 blocks = Kernel::BlockSize(); - if(!Kernel::IsSupportedArgument(kargs)) - { - throw std::runtime_error("Wrong! Arguments not supported! Skipping gemm!\n"); - } + if(!Kernel::IsSupportedArgument(kargs)) + { + throw std::runtime_error("Wrong! Arguments not supported! Skipping gemm!\n"); + } - if(s.log_level_ > 0) - { - std::cout << "Launching kernel with args:" - << " grid: {" << grids.x << ", " << grids.y << ", " << grids.z << "}" - << ", blocks: {" << blocks.x << ", " << blocks.y << ", " << blocks.z - << "}" << std::endl; - } - if(s.flush_cache_) - { - std::cout << "Flushing cache..." << std::endl; - static constexpr ck_tile::index_t APackedSize = - std::is_same_v ? 2 : 1; - static constexpr ck_tile::index_t BPackedSize = - std::is_same_v ? 2 : 1; + if(s.log_level_ > 0) + { + std::cout << "Launching kernel with args:" + << " grid: {" << grids.x << ", " << grids.y << ", " << grids.z << "}" + << ", blocks: {" << blocks.x << ", " << blocks.y << ", " << blocks.z << "}" + << std::endl; + } - ck_tile::HostTensor a_m(ck_tile::host_tensor_descriptor( - args.M, args.K, args.stride_A, is_row_major(ALayout{}))); - ck_tile::HostTensor b_n(ck_tile::host_tensor_descriptor( - args.K, args.N, args.stride_B, is_row_major(BLayout{}))); - - auto size_a_buffer = a_m.get_element_space_size_in_bytes() / APackedSize; - auto size_b_buffer = b_n.get_element_space_size_in_bytes() / BPackedSize; - - ck_tile::RotatingMemWrapper rotating_mem( - kargs.a_ptr, kargs.b_ptr, s.rotating_count_, size_a_buffer, size_b_buffer); - rotating_mem.Print(); - - auto run_flush_cache = [&]() { - // flush icache - ck_tile::flush_icache(); - // rotating mem - rotating_mem.Next(); - // clear c mem - if(args.k_batch > 1) - hipGetErrorString(hipMemsetAsync( - args.c_ptr, 0, args.M * args.N * sizeof(CDataType), s.stream_id_)); - }; - ave_time = ck_tile::launch_kernel_preprocess( - s, - run_flush_cache, - ck_tile::make_kernel( - Kernel{}, grids, blocks, 0, kargs)); - } - else - { - ave_time = - ck_tile::launch_kernel(s, - ck_tile::make_kernel( - Kernel{}, grids, blocks, 0, kargs)); - } - return ave_time; - }; + ave_time = ck_tile::launch_kernel(s, + ck_tile::make_kernel( + Kernel{}, grids, blocks, 0, kargs)); + return ave_time; + }; const auto RunSplitk = [&](const auto has_hot_loop_, const auto tail_number_) { if(args.k_batch == 1) @@ -282,6 +243,8 @@ float gemm_calc(const ck_tile::GemmHostArgs& args, const ck_tile::stream_config& return ave_time; } +#include "run_gemm_example.inc" + template int run_gemm_example_prec_type(std::string a_layout, std::string b_layout, int argc, char* argv[]) { diff --git a/example/ck_tile/18_flatmm/flatmm_basic.cpp b/example/ck_tile/18_flatmm/flatmm_basic.cpp index 2dbff1bc5c..c8b4a10d05 100644 --- a/example/ck_tile/18_flatmm/flatmm_basic.cpp +++ b/example/ck_tile/18_flatmm/flatmm_basic.cpp @@ -11,7 +11,6 @@ #include "ck_tile/host.hpp" #include "flatmm_basic.hpp" -#include "run_flatmm_example.inc" template ? 2 : 1; - static constexpr ck_tile::index_t BPackedSize = - std::is_same_v ? 2 : 1; + float ave_time = ck_tile::launch_kernel( + s, ck_tile::make_kernel(Kernel{}, grids, blocks, 0, kargs)); - ck_tile::HostTensor a_m(ck_tile::host_tensor_descriptor( - args.M, args.K, args.stride_A, is_row_major(ALayout{}))); - ck_tile::HostTensor b_n(ck_tile::host_tensor_descriptor( - args.K, args.N, args.stride_B, is_row_major(BLayout{}))); - - auto size_a_buffer = a_m.get_element_space_size_in_bytes() / APackedSize; - auto size_b_buffer = b_n.get_element_space_size_in_bytes() / BPackedSize; - - ck_tile::RotatingMemWrapper rotating_mem( - kargs.a_ptr, kargs.b_shuffle_ptr, s.rotating_count_, size_a_buffer, size_b_buffer); - rotating_mem.Print(); - - auto run_flush_cache = [&]() { - // flush icache - ck_tile::flush_icache(); - // rotating mem - rotating_mem.Next(); - // clear c mem - if(args.k_batch > 1) - hipGetErrorString(hipMemsetAsync( - args.c_ptr, 0, args.M * args.N * sizeof(CDataType), s.stream_id_)); - }; - ave_time = ck_tile::launch_kernel_preprocess( - s, - run_flush_cache, - ck_tile::make_kernel(Kernel{}, grids, blocks, 0, kargs)); - } - else - { - ave_time = ck_tile::launch_kernel( - s, ck_tile::make_kernel(Kernel{}, grids, blocks, 0, kargs)); - } return ave_time; }; if(args.k_batch == 1) @@ -171,6 +132,8 @@ float flatmm_calc(const ck_tile::FlatmmHostArgs& args, const ck_tile::stream_con } } +#include "run_flatmm_example.inc" + int run_flatmm_example(int argc, char* argv[]) { auto [result, arg_parser] = create_args(argc, argv); diff --git a/example/ck_tile/18_flatmm/flatmm_basic.hpp b/example/ck_tile/18_flatmm/flatmm_basic.hpp index 55f2d4f367..bbce978724 100644 --- a/example/ck_tile/18_flatmm/flatmm_basic.hpp +++ b/example/ck_tile/18_flatmm/flatmm_basic.hpp @@ -133,11 +133,4 @@ auto create_args(int argc, char* argv[]) } // host API -template float flatmm_calc(const ck_tile::FlatmmHostArgs& args, const ck_tile::stream_config& s); diff --git a/example/ck_tile/18_flatmm/run_flatmm_example.inc b/example/ck_tile/18_flatmm/run_flatmm_example.inc index 3d4f154af7..c191fff7d0 100644 --- a/example/ck_tile/18_flatmm/run_flatmm_example.inc +++ b/example/ck_tile/18_flatmm/run_flatmm_example.inc @@ -122,7 +122,7 @@ float invoke_flatmm(ck_tile::DeviceMem& a_dev_buf, float ave_time = flatmm_calc( - args, ck_tile::stream_config{nullptr, true, 1, n_warmup, n_repeat, true, true, 50}); + args, ck_tile::stream_config{nullptr, true, 1, n_warmup, n_repeat}); std::size_t flop = std::size_t(2) * M * N * K; std::size_t num_byte = diff --git a/include/ck_tile/host.hpp b/include/ck_tile/host.hpp index 3459e728e0..24feaf7c62 100644 --- a/include/ck_tile/host.hpp +++ b/include/ck_tile/host.hpp @@ -37,5 +37,3 @@ #include "ck_tile/host/stream_config.hpp" #include "ck_tile/host/stream_utils.hpp" #include "ck_tile/host/timer.hpp" -#include "ck_tile/host/flush_icache.hpp" -#include "ck_tile/host/rotating_buffers.hpp" diff --git a/include/ck_tile/host/flush_icache.hpp b/include/ck_tile/host/flush_icache.hpp deleted file mode 100644 index 9230b50a13..0000000000 --- a/include/ck_tile/host/flush_icache.hpp +++ /dev/null @@ -1,30 +0,0 @@ -// SPDX-License-Identifier: MIT -// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved. - -#pragma once - -#include - -namespace ck_tile { -static __global__ void flush_cache() -{ - asm __volatile__("s_icache_inv \n\t" - "s_nop 0 \n\t" - "s_nop 0 \n\t" - "s_nop 0 \n\t" - "s_nop 0 \n\t" - "s_nop 0 \n\t" - "s_nop 0 \n\t" - "s_nop 0 \n\t" - "s_nop 0 \n\t" - "s_nop 0 \n\t" - "s_nop 0 \n\t" - "s_nop 0 \n\t" - "s_nop 0 \n\t" - "s_nop 0 \n\t" - "s_nop 0 \n\t" - "s_nop 0 \n\t" - "s_nop 0 \n\t" :: - :); -} -} // namespace ck_tile diff --git a/include/ck_tile/host/kernel_launch.hpp b/include/ck_tile/host/kernel_launch.hpp index 269e59a103..d159787387 100644 --- a/include/ck_tile/host/kernel_launch.hpp +++ b/include/ck_tile/host/kernel_launch.hpp @@ -1,5 +1,5 @@ // SPDX-License-Identifier: MIT -// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. #pragma once @@ -11,13 +11,6 @@ #include namespace ck_tile { - -#define CU_FOR_MI308 80 -#define CU_FOR_MI300X 228 -#define OPTIMAL_LATENCY_MI308 0.005 -#define OPTIMAL_LATENCY_MI300X 0.0015 -#define OPTIMAL_LATENCY_SAFE_MARGIN 0.01 - template #if CK_TILE_USE_LAUNCH_BOUNDS __launch_bounds__(MaxThreadPerBlock, MinBlockPerCu) @@ -88,8 +81,6 @@ CK_TILE_HOST void launch_and_check(const stream_config& sc, Callables&&... calla template CK_TILE_HOST float launch_kernel(const stream_config& s, Callables&&... callables) { - static_assert(sizeof...(callables) > 0, "At least one callable is required!"); - if(!s.time_kernel_) { launch_and_check(s, std::forward(callables)...); @@ -97,7 +88,7 @@ CK_TILE_HOST float launch_kernel(const stream_config& s, Callables&&... callable } auto time_launches = [&](auto timer) { - // Warmup + // warmup for(int i = 0; i < s.cold_niters_; i++) { launch_and_check(s, std::forward(callables)...); @@ -123,52 +114,4 @@ CK_TILE_HOST float launch_kernel(const stream_config& s, Callables&&... callable } } -template -CK_TILE_HOST float launch_kernel_preprocess(const stream_config& s, - PreprocessFunc preprocess, - Callables&&... callables) -{ - static_assert(sizeof...(callables) > 0, "At least one callable is required!"); - - if(!s.time_kernel_) - { - preprocess(); - launch_and_check(s, std::forward(callables)...); - return 0; - } - - auto time_launches = [&](auto timer) { - // Warmup - for(int i = 0; i < s.cold_niters_; i++) - { - launch_and_check(s, std::forward(callables)...); - } - - timer.start(s.stream_id_); - for(int i = 0; i < s.nrepeat_; i++) - { - preprocess(); - launch_and_check(s, std::forward(callables)...); - } - timer.stop(s.stream_id_); - - hipDeviceProp_t deviceProps; - HIP_CHECK_ERROR(hipGetDeviceProperties(&deviceProps, 0)); - - float preprocess_offset = - (deviceProps.multiProcessorCount >= CU_FOR_MI300X) ? OPTIMAL_LATENCY_MI300X - : (deviceProps.multiProcessorCount == CU_FOR_MI308) ? OPTIMAL_LATENCY_MI308 - : OPTIMAL_LATENCY_SAFE_MARGIN; - return (timer.duration() - preprocess_offset * s.nrepeat_) / s.nrepeat_; - }; - - if(s.is_gpu_timer_) - { - return time_launches(gpu_timer{}); - } - else - { - return time_launches(cpu_timer{}); - } -} } // namespace ck_tile diff --git a/include/ck_tile/host/rotating_buffers.hpp b/include/ck_tile/host/rotating_buffers.hpp deleted file mode 100644 index 86f68ad084..0000000000 --- a/include/ck_tile/host/rotating_buffers.hpp +++ /dev/null @@ -1,102 +0,0 @@ -// SPDX-License-Identifier: MIT -// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved. - -#pragma once - -#include "ck_tile/core/config.hpp" -#include "ck_tile/host/hip_check_error.hpp" -#include - -namespace ck_tile { - -template -struct RotatingMemWrapper -{ - RotatingMemWrapper() = delete; - RotatingMemWrapper(const void* a_ptr_, - const void* b_ptr_, - std::size_t rotating_count_, - std::size_t size_a_, - std::size_t size_b_) - : a_ptr(a_ptr_), - b_ptr(b_ptr_), - rotating_count(rotating_count_), - size_a(size_a_), - size_b(size_b_) - { - p_a_grids.push_back(a_ptr); - p_b_grids.push_back(b_ptr); - for(size_t i = 1; i < rotating_count; i++) - { - { - void* pADeviceBuf; - HIP_CHECK_ERROR(hipMalloc(static_cast(&pADeviceBuf), size_a_)); - HIP_CHECK_ERROR(hipMemcpy(static_cast(pADeviceBuf), - const_cast(p_a_grids[0]), - size_a_, - hipMemcpyDeviceToDevice)); - p_a_grids.push_back(pADeviceBuf); - } - - { - void* pBDeviceBuf; - HIP_CHECK_ERROR(hipMalloc(static_cast(&pBDeviceBuf), size_b_)); - HIP_CHECK_ERROR(hipMemcpy(static_cast(pBDeviceBuf), - const_cast(p_b_grids[0]), - size_b_, - hipMemcpyDeviceToDevice)); - p_b_grids.push_back(pBDeviceBuf); - } - } - } - void Next() - { - if(rotating_count > 1) - { - std::size_t idx = iter++ % rotating_count; - a_ptr = p_a_grids[idx]; - b_ptr = p_b_grids[idx]; - } - } - void Print() - { - std::cout << "RotatingMemWrapper: { size_a: " << size_a << ", size_b: " << size_b - << ", rotating_count: " << rotating_count << "}" << std::endl; - } - ~RotatingMemWrapper() noexcept - { - if(rotating_count > 1) - { - // restore ptr - a_ptr = p_a_grids[0]; - b_ptr = p_b_grids[0]; - - // free device mem - for(size_t i = 1; i < rotating_count; i++) - { - ck_tile::hip_check_error(hipFree(const_cast(p_a_grids[i]))); - ck_tile::hip_check_error(hipFree(const_cast(p_b_grids[i]))); - } - } - } - - private: - const void* a_ptr; - const void* b_ptr; - std::size_t iter = 0; - std::size_t rotating_count = 1; - std::size_t size_a = 0; - std::size_t size_b = 0; - std::vector p_a_grids; - std::vector p_b_grids; -}; -inline void flush_icache() -{ - hipDeviceProp_t deviceProps; - HIP_CHECK_ERROR(hipGetDeviceProperties(&deviceProps, 0)); - int32_t gpu_block3 = deviceProps.multiProcessorCount * 60; - - ck_tile::flush_cache<<>>(); - HIP_CHECK_ERROR(hipGetLastError()); -} -} // namespace ck_tile diff --git a/include/ck_tile/host/stream_config.hpp b/include/ck_tile/host/stream_config.hpp index f6bd40f6f2..47cf0fd5e4 100644 --- a/include/ck_tile/host/stream_config.hpp +++ b/include/ck_tile/host/stream_config.hpp @@ -1,5 +1,5 @@ // SPDX-License-Identifier: MIT -// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. #pragma once @@ -30,7 +30,5 @@ struct stream_config int cold_niters_ = 3; int nrepeat_ = 10; bool is_gpu_timer_ = true; // keep compatible - bool flush_cache_ = false; - int rotating_count_ = 1; }; } // namespace ck_tile diff --git a/tile_engine/ops/gemm/README.md b/tile_engine/ops/gemm/README.md index 3ae280f8ce..87267f8bce 100644 --- a/tile_engine/ops/gemm/README.md +++ b/tile_engine/ops/gemm/README.md @@ -42,8 +42,6 @@ rm -rf tile_engine/ && ninja benchmark_gemm # rebuild -repeat The number of iterations to benchmark the kernel. Default is 100. -timer Whether if the timer is gpu timer or not. Possible values are true or false. Default is true. -init The method of tensor initialization. Set to 0 for random, to 1 for linear, or 2 for constant(1). Default is 0, random. - -flush_cache To flush cache in between different runs.Possible values are true or false. Default is false. - -rotating_count count to flush cache. Default is 5. -metric Metric with which to measure kernel performance. Set to 0 for latency, 1 for tflops, or 2 for bandwidth. Default is 0, latency. -csv_filename The filename of benchmark result. Default is gemm_kernel. -structured_sparsity whether use sparsity kernel or not. Possible values are true or false. Default is false. diff --git a/tile_engine/ops/gemm/benchmark_gemm.cpp b/tile_engine/ops/gemm/benchmark_gemm.cpp index db2b648437..fb56e524d2 100644 --- a/tile_engine/ops/gemm/benchmark_gemm.cpp +++ b/tile_engine/ops/gemm/benchmark_gemm.cpp @@ -26,15 +26,15 @@ void benchmark_gemm(const ck_tile::ArgParser& arg_parser) CLayout::name, arg_parser.get_bool("structured_sparsity")}; - Setting setting{arg_parser.get_int("warmup"), - arg_parser.get_int("repeat"), - arg_parser.get_bool("timer"), - arg_parser.get_int("verify"), - arg_parser.get_int("init"), - arg_parser.get_bool("log"), - arg_parser.get_str("csv_filename"), - arg_parser.get_bool("flush_cache"), - arg_parser.get_int("rotating_count")}; + Setting setting{ + arg_parser.get_int("warmup"), + arg_parser.get_int("repeat"), + arg_parser.get_bool("timer"), + arg_parser.get_int("verify"), + arg_parser.get_int("init"), + arg_parser.get_bool("log"), + arg_parser.get_str("csv_filename"), + }; auto& profiler = GemmProfiler::instance(setting); diff --git a/tile_engine/ops/gemm/benchmark_gemm.hpp b/tile_engine/ops/gemm/benchmark_gemm.hpp index 459a40b080..292d67dad6 100644 --- a/tile_engine/ops/gemm/benchmark_gemm.hpp +++ b/tile_engine/ops/gemm/benchmark_gemm.hpp @@ -125,8 +125,6 @@ struct Setting int init_method_; bool log_; std::string csv_filename_; - bool flush_cache_; - int rotating_count_; }; inline std::string get_rocm_version() diff --git a/tile_engine/ops/gemm/gemm_host_api.hpp b/tile_engine/ops/gemm/gemm_host_api.hpp old mode 100644 new mode 100755 index b3aab6ad92..8cbc3f26f6 --- a/tile_engine/ops/gemm/gemm_host_api.hpp +++ b/tile_engine/ops/gemm/gemm_host_api.hpp @@ -93,11 +93,6 @@ inline auto create_args(int argc, char* argv[]) "0", "The method of tensor initialization. Set to 0 for random, to 1 for linear, or 2 " "for constant(1). Default is 0, random.") - .insert("flush_cache", - "false", - "To flush cache, possible values are true or false. " - "Default is false.") - .insert("rotating_count", "5", "number of iterations to rotate the cache. default is 5.") .insert("metric", "0", "Metric with which to measure kernel performance. Set to 0 for latency, 1 for " diff --git a/tile_engine/ops/gemm/gemm_instance_builder.py b/tile_engine/ops/gemm/gemm_instance_builder.py index ea7fa4e67c..c43797f3e0 100755 --- a/tile_engine/ops/gemm/gemm_instance_builder.py +++ b/tile_engine/ops/gemm/gemm_instance_builder.py @@ -273,52 +273,9 @@ struct GemmKernel {{ << std::endl; }} - if(stream.flush_cache_) - {{ - std::cout << "Flushing cache..." << std::endl; - static constexpr ck_tile::index_t APackedSize = - std::is_same_v ? 2 : 1; - static constexpr ck_tile::index_t BPackedSize = - std::is_same_v ? 2 : 1; - - auto is_row_major = [](auto layout_) {{ - return ck_tile::bool_constant, - ck_tile::tensor_layout::gemm::RowMajor>>{{}}; - }}; - - ck_tile::HostTensor a_m(ck_tile::host_tensor_descriptor( - args.M, args.K, args.stride_A, is_row_major(ALayout{{}}))); - ck_tile::HostTensor b_n(ck_tile::host_tensor_descriptor( - args.K, args.N, args.stride_B, is_row_major(BLayout{{}}))); - - auto size_a_buffer = a_m.get_element_space_size_in_bytes() / APackedSize; - auto size_b_buffer = b_n.get_element_space_size_in_bytes() / BPackedSize; - - ck_tile::RotatingMemWrapper rotating_mem( - kargs.a_ptr, kargs.b_ptr, stream.rotating_count_, size_a_buffer, size_b_buffer); - rotating_mem.Print(); - - auto run_flush_cache = [&]() {{ - // flush icache - ck_tile::flush_icache(); - // rotating mem - rotating_mem.Next(); - // clear c mem - if(args.k_batch > 1) - hipGetErrorString(hipMemsetAsync( - args.c_ptr, 0, args.M * args.N * sizeof(CDataType), stream.stream_id_)); - }}; - ave_time = ck_tile::launch_kernel_preprocess( - stream, - run_flush_cache, - ck_tile::make_kernel( - Kernel{{}}, grids, blocks, 0, kargs)); - }} - else{{ - ave_time = ck_tile::launch_kernel(stream, + ave_time = ck_tile::launch_kernel(stream, ck_tile::make_kernel( Kernel{{}}, grids, blocks, 0, kargs)); - }} return ave_time; }}; diff --git a/tile_engine/ops/gemm/gemm_profiler.hpp b/tile_engine/ops/gemm/gemm_profiler.hpp index 0125a759b3..9170952aa8 100644 --- a/tile_engine/ops/gemm/gemm_profiler.hpp +++ b/tile_engine/ops/gemm/gemm_profiler.hpp @@ -128,9 +128,7 @@ class GemmProfiler setting_.log_, setting_.n_warmup_, setting_.n_repeat_, - setting_.is_gpu_timer_, - setting_.flush_cache_, - setting_.rotating_count_}); + setting_.is_gpu_timer_}); process_result(gemm_problem, c_m_n_dev_buf, c_m_n_host_result,