mirror of
https://github.com/ROCm/composable_kernel.git
synced 2026-05-14 02:02:46 +00:00
Update to gpu_timer for rotating_buffer (#2524)
* update gpu_timer for rotating buffer as hipblasLt's implementation * timing fix * Updating gpu timer for old ck as well * Revert "Updating gpu timer for old ck as well" This reverts commit958cd1bc99. * code clean up with runtime argument; function rename * code cleanup * general timer fixes * bug fix * clang formatted * addressing reveiew comments * clang formatted * Addressing review comments * CI fix --------- Co-authored-by: Po Yen Chen <PoYen.Chen@amd.com> [ROCm/composable_kernel commit:61e21f5567]
This commit is contained in:
@@ -457,7 +457,8 @@ auto create_args(int argc, char* argv[])
|
||||
.insert("timer", "gpu", "gpu:gpu timer, cpu:cpu timer")
|
||||
.insert("split_k", "1", "splitK value")
|
||||
.insert("init", "0", "0:random, 1:linear, 2:constant(1)")
|
||||
.insert("persistent", "0", "0:non-persistent, 1:persistent");
|
||||
.insert("persistent", "0", "0:non-persistent, 1:persistent")
|
||||
.insert("bench_time_ms", "0", "benchmark time in ms, defaults to 0 ms");
|
||||
|
||||
bool result = arg_parser.parse(argc, argv);
|
||||
return std::make_tuple(result, arg_parser);
|
||||
|
||||
@@ -146,18 +146,14 @@ float gemm(const ck_tile::GemmHostArgs& args, const ck_tile::stream_config& s)
|
||||
if(s.flush_cache_)
|
||||
{
|
||||
std::cout << "Flushing cache..." << std::endl;
|
||||
static constexpr ck_tile::index_t APackedSize =
|
||||
std::is_same_v<BDataType, ck_tile::pk_int4_t> ? 2 : 1;
|
||||
static constexpr ck_tile::index_t BPackedSize =
|
||||
std::is_same_v<BDataType, ck_tile::pk_int4_t> ? 2 : 1;
|
||||
|
||||
ck_tile::HostTensor<ADataType> a_m(ck_tile::host_tensor_descriptor(
|
||||
args.M, args.K, args.stride_A, is_row_major(ALayout{})));
|
||||
ck_tile::HostTensor<BDataType> b_n(ck_tile::host_tensor_descriptor(
|
||||
args.K, args.N, args.stride_B, is_row_major(BLayout{})));
|
||||
|
||||
auto size_a_buffer = a_m.get_element_space_size_in_bytes() / APackedSize;
|
||||
auto size_b_buffer = b_n.get_element_space_size_in_bytes() / BPackedSize;
|
||||
auto size_a_buffer = a_m.get_element_space_size_in_bytes();
|
||||
auto size_b_buffer = b_n.get_element_space_size_in_bytes();
|
||||
|
||||
ck_tile::RotatingMemWrapper<ADataType, BDataType> rotating_mem(
|
||||
kargs.as_ptr[0], kargs.bs_ptr[0], s.rotating_count_, size_a_buffer, size_b_buffer);
|
||||
@@ -173,7 +169,7 @@ float gemm(const ck_tile::GemmHostArgs& args, const ck_tile::stream_config& s)
|
||||
hipGetErrorString(hipMemsetAsync(
|
||||
args.e_ptr, 0, args.M * args.N * sizeof(CDataType), s.stream_id_));
|
||||
};
|
||||
ave_time = ck_tile::launch_kernel_preprocess(
|
||||
ave_time = ck_tile::launch_kernel_time_mask(
|
||||
s,
|
||||
run_flush_cache,
|
||||
ck_tile::make_kernel<blocks.x, GemmConfig::kBlockPerCu>(
|
||||
|
||||
@@ -183,7 +183,8 @@ float invoke_gemm(ck_tile::DeviceMem& a_m_k_dev_buf,
|
||||
ck_tile::index_t kbatch,
|
||||
int n_warmup,
|
||||
int n_repeat,
|
||||
bool persistent)
|
||||
bool persistent,
|
||||
int bench_time_ms)
|
||||
{
|
||||
ck_tile::GemmHostArgs args = {a_m_k_dev_buf.GetDeviceBuffer(),
|
||||
b_k_n_dev_buf.GetDeviceBuffer(),
|
||||
@@ -211,7 +212,9 @@ float invoke_gemm(ck_tile::DeviceMem& a_m_k_dev_buf,
|
||||
CLayout,
|
||||
true,
|
||||
CDEElementWise>(
|
||||
args, ck_tile::stream_config{nullptr, true, 1, n_warmup, n_repeat, true, true, 50});
|
||||
args,
|
||||
ck_tile::stream_config{
|
||||
nullptr, true, 1, n_warmup, n_repeat, true, true, 50, bench_time_ms});
|
||||
}
|
||||
else
|
||||
{
|
||||
@@ -227,7 +230,9 @@ float invoke_gemm(ck_tile::DeviceMem& a_m_k_dev_buf,
|
||||
CLayout,
|
||||
false,
|
||||
CDEElementWise>(
|
||||
args, ck_tile::stream_config{nullptr, true, 1, n_warmup, n_repeat, true, true, 50});
|
||||
args,
|
||||
ck_tile::stream_config{
|
||||
nullptr, true, 1, n_warmup, n_repeat, true, true, 50, bench_time_ms});
|
||||
}
|
||||
|
||||
std::size_t flop = std::size_t(2) * M * N * K;
|
||||
@@ -236,15 +241,16 @@ float invoke_gemm(ck_tile::DeviceMem& a_m_k_dev_buf,
|
||||
float tflops = static_cast<float>(flop) / 1.E9 / ave_time;
|
||||
float gb_per_sec = num_byte / 1.E6 / ave_time;
|
||||
|
||||
std::cout << "Run Gemm kernel with M=" << M << " N=" << N << " K=" << K
|
||||
std::cout << "Run Gemm kernel with \n M=" << M << " N=" << N << " K=" << K
|
||||
<< " StrideA=" << stride_A << " StrideB=" << stride_B << " StrideC=" << stride_C
|
||||
<< " A_Layout=" << ALayout::name << " B_Layout =" << BLayout::name
|
||||
<< " C_Layout=" << CLayout::name << " A_Type=" << DataTypeTraits<ADataType>::name
|
||||
<< " B_Type=" << DataTypeTraits<BDataType>::name
|
||||
<< " C_Type=" << DataTypeTraits<CDataType>::name
|
||||
<< " StructuredSparsity=" << (GemmConfig::UseStructuredSparsity ? "on" : "off")
|
||||
<< " Persistent=" << (persistent ? "on" : "off") << " : " << ave_time << " ms, "
|
||||
<< tflops << " TFlops, " << gb_per_sec << " GB/s, " << std::endl;
|
||||
<< " Persistent=" << (persistent ? "on" : "off") << " : \n"
|
||||
<< ave_time << " ms, " << tflops << " TFlops, " << gb_per_sec << " GB/s, "
|
||||
<< std::endl;
|
||||
|
||||
return ave_time;
|
||||
}
|
||||
@@ -297,6 +303,7 @@ int run_gemm_example_with_layouts(int argc,
|
||||
int n_repeat = arg_parser.get_int("repeat");
|
||||
ck_tile::index_t init_method = arg_parser.get_int("init");
|
||||
bool persistent = arg_parser.get_int("persistent");
|
||||
int bench_time_ms = arg_parser.get_int("bench_time_ms");
|
||||
|
||||
const bool preshuffle = GemmConfig::Preshuffle;
|
||||
|
||||
@@ -414,7 +421,8 @@ int run_gemm_example_with_layouts(int argc,
|
||||
kbatch,
|
||||
n_warmup,
|
||||
n_repeat,
|
||||
persistent);
|
||||
persistent,
|
||||
bench_time_ms);
|
||||
|
||||
c_m_n_dev_buf.FromDevice(c_m_n_dev_result.data());
|
||||
bool pass = true;
|
||||
|
||||
@@ -147,18 +147,14 @@ float gemm(const ck_tile::GemmHostArgs& args, const ck_tile::stream_config& s)
|
||||
if(s.flush_cache_)
|
||||
{
|
||||
std::cout << "Flushing cache..." << std::endl;
|
||||
static constexpr ck_tile::index_t APackedSize =
|
||||
std::is_same_v<BDataType, ck_tile::pk_int4_t> ? 2 : 1;
|
||||
static constexpr ck_tile::index_t BPackedSize =
|
||||
std::is_same_v<BDataType, ck_tile::pk_int4_t> ? 2 : 1;
|
||||
|
||||
ck_tile::HostTensor<ADataType> a_m(ck_tile::host_tensor_descriptor(
|
||||
args.M, args.K, args.stride_A, is_row_major(ALayout{})));
|
||||
ck_tile::HostTensor<BDataType> b_n(ck_tile::host_tensor_descriptor(
|
||||
args.K, args.N, args.stride_B, is_row_major(BLayout{})));
|
||||
|
||||
auto size_a_buffer = a_m.get_element_space_size_in_bytes() / APackedSize;
|
||||
auto size_b_buffer = b_n.get_element_space_size_in_bytes() / BPackedSize;
|
||||
auto size_a_buffer = a_m.get_element_space_size_in_bytes();
|
||||
auto size_b_buffer = b_n.get_element_space_size_in_bytes();
|
||||
|
||||
ck_tile::RotatingMemWrapper<ADataType, BDataType> rotating_mem(
|
||||
kargs.as_ptr[0], kargs.bs_ptr[0], s.rotating_count_, size_a_buffer, size_b_buffer);
|
||||
@@ -174,7 +170,7 @@ float gemm(const ck_tile::GemmHostArgs& args, const ck_tile::stream_config& s)
|
||||
hipGetErrorString(hipMemsetAsync(
|
||||
args.e_ptr, 0, args.M * args.N * sizeof(CDataType), s.stream_id_));
|
||||
};
|
||||
ave_time = ck_tile::launch_kernel_preprocess(
|
||||
ave_time = ck_tile::launch_kernel_time_mask(
|
||||
s,
|
||||
run_flush_cache,
|
||||
ck_tile::make_kernel<blocks.x, GemmConfig::kBlockPerCu>(
|
||||
|
||||
@@ -3,6 +3,8 @@
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <numeric>
|
||||
#include <functional>
|
||||
#include "ck_tile/core/config.hpp"
|
||||
#include "ck_tile/core/utility/ignore.hpp"
|
||||
#include "ck_tile/host/hip_check_error.hpp"
|
||||
@@ -63,6 +65,73 @@ CK_TILE_HOST void launch_and_check(const stream_config& sc, Callables&&... calla
|
||||
}
|
||||
}
|
||||
|
||||
template <class it>
|
||||
typename std::iterator_traits<it>::value_type median(it begin, it end)
|
||||
{
|
||||
if(begin == end)
|
||||
{
|
||||
return std::numeric_limits<double>::quiet_NaN();
|
||||
}
|
||||
auto n = std::distance(begin, end);
|
||||
auto n2 = n / 2;
|
||||
std::nth_element(begin, begin + n2, end);
|
||||
return (n % 2) ? begin[n2] : (*std::max_element(begin, begin + n2) + begin[n2]) / 2.0;
|
||||
}
|
||||
|
||||
inline void remove_outliers(std::vector<float>& v)
|
||||
{
|
||||
// 1.5x IQR method to detect and remove outliers
|
||||
auto n2 = v.size() / 2;
|
||||
std::nth_element(v.begin(), v.begin() + n2, v.end());
|
||||
auto q1 = median(v.begin(), v.begin() + n2);
|
||||
auto q3 = median(v.begin() + ((v.size() % 2) ? n2 + 1 : n2), v.end());
|
||||
auto iqr = q3 - q1;
|
||||
auto lb = q1 - 1.5 * iqr;
|
||||
auto ub = q3 + 1.5 * iqr;
|
||||
v.erase(std::remove_if(v.begin(), v.end(), [&](float f) { return f < lb || f > ub; }), v.end());
|
||||
}
|
||||
|
||||
template <typename TimerType, typename CallablesFunc>
|
||||
CK_TILE_HOST double timing_loop_impl(TimerType timer,
|
||||
const stream_config& s,
|
||||
CallablesFunc&& callables_func,
|
||||
std::function<void()> preprocess = nullptr)
|
||||
{
|
||||
for(int i = 0; i < s.cold_niters_; i++)
|
||||
{
|
||||
callables_func();
|
||||
}
|
||||
|
||||
float per_iter_time = 0.f;
|
||||
std::vector<float> times;
|
||||
int i = 0;
|
||||
while(i < s.nrepeat_ || per_iter_time < s.bench_time_ms_)
|
||||
{
|
||||
if(preprocess)
|
||||
preprocess();
|
||||
|
||||
timer.start(s.stream_id_, i);
|
||||
callables_func();
|
||||
timer.stop(s.stream_id_, i);
|
||||
|
||||
if(i > 0)
|
||||
{
|
||||
per_iter_time = timer.duration(i - 1);
|
||||
times.push_back(per_iter_time);
|
||||
per_iter_time = timer.is_exceed(i - 1);
|
||||
}
|
||||
i++;
|
||||
}
|
||||
|
||||
if(!i)
|
||||
return 0.;
|
||||
|
||||
per_iter_time = timer.duration(i - 1);
|
||||
times.push_back(per_iter_time);
|
||||
remove_outliers(times);
|
||||
return std::accumulate(times.begin(), times.end(), 0.) / times.size();
|
||||
}
|
||||
|
||||
// clang-format off
|
||||
/*
|
||||
* launch_kernel()
|
||||
@@ -101,37 +170,21 @@ CK_TILE_HOST float launch_kernel(const stream_config& s, Callables&&... callable
|
||||
return 0;
|
||||
}
|
||||
|
||||
auto time_launches = [&](auto timer) {
|
||||
// Warmup
|
||||
for(int i = 0; i < s.cold_niters_; i++)
|
||||
{
|
||||
launch_and_check(s, std::forward<Callables>(callables)...);
|
||||
}
|
||||
|
||||
timer.start(s.stream_id_);
|
||||
for(int i = 0; i < s.nrepeat_; i++)
|
||||
{
|
||||
launch_and_check(s, std::forward<Callables>(callables)...);
|
||||
}
|
||||
timer.stop(s.stream_id_);
|
||||
|
||||
return timer.duration() / s.nrepeat_;
|
||||
};
|
||||
auto callables_func = [&]() { launch_and_check(s, std::forward<Callables>(callables)...); };
|
||||
|
||||
if(s.is_gpu_timer_)
|
||||
{
|
||||
return time_launches(gpu_timer{});
|
||||
return timing_loop_impl(gpu_timer_new{s.stream_id_}, s, callables_func);
|
||||
}
|
||||
else
|
||||
{
|
||||
return time_launches(cpu_timer{});
|
||||
return timing_loop_impl(cpu_timer{}, s, callables_func);
|
||||
}
|
||||
}
|
||||
|
||||
template <typename PreprocessFunc, typename... Callables>
|
||||
CK_TILE_HOST float launch_kernel_preprocess(const stream_config& s,
|
||||
PreprocessFunc preprocess,
|
||||
Callables&&... callables)
|
||||
CK_TILE_HOST float
|
||||
launch_kernel_time_mask(const stream_config& s, PreprocessFunc preprocess, Callables&&... callables)
|
||||
{
|
||||
static_assert(sizeof...(callables) > 0, "At least one callable is required!");
|
||||
|
||||
@@ -142,39 +195,15 @@ CK_TILE_HOST float launch_kernel_preprocess(const stream_config& s,
|
||||
return 0;
|
||||
}
|
||||
|
||||
auto time_launches = [&](auto timer) {
|
||||
// Warmup
|
||||
for(int i = 0; i < s.cold_niters_; i++)
|
||||
{
|
||||
launch_and_check(s, std::forward<Callables>(callables)...);
|
||||
}
|
||||
|
||||
timer.start(s.stream_id_);
|
||||
for(int i = 0; i < s.nrepeat_; i++)
|
||||
{
|
||||
preprocess();
|
||||
launch_and_check(s, std::forward<Callables>(callables)...);
|
||||
}
|
||||
timer.stop(s.stream_id_);
|
||||
|
||||
hipDeviceProp_t deviceProps;
|
||||
HIP_CHECK_ERROR(hipGetDeviceProperties(&deviceProps, 0));
|
||||
|
||||
float preprocess_offset = (deviceProps.multiProcessorCount >= HIGH_CU_PROCESSORS)
|
||||
? OPTIMAL_LATENCY_HIGH_CU_PROCESSORS
|
||||
: (deviceProps.multiProcessorCount == LOW_CU_PROCESSORS)
|
||||
? OPTIMAL_LATENCY_LOW_CU_PROCESSORS
|
||||
: OPTIMAL_LATENCY_SAFE_MARGIN;
|
||||
return (timer.duration() - preprocess_offset * s.nrepeat_) / s.nrepeat_;
|
||||
};
|
||||
auto callables_func = [&]() { launch_and_check(s, std::forward<Callables>(callables)...); };
|
||||
|
||||
if(s.is_gpu_timer_)
|
||||
{
|
||||
return time_launches(gpu_timer{});
|
||||
return timing_loop_impl(gpu_timer_new{s.stream_id_}, s, callables_func, preprocess);
|
||||
}
|
||||
else
|
||||
{
|
||||
return time_launches(cpu_timer{});
|
||||
return timing_loop_impl(cpu_timer{}, s, callables_func, preprocess);
|
||||
}
|
||||
}
|
||||
} // namespace ck_tile
|
||||
|
||||
@@ -32,5 +32,6 @@ struct stream_config
|
||||
bool is_gpu_timer_ = true; // keep compatible
|
||||
bool flush_cache_ = false;
|
||||
int rotating_count_ = 1;
|
||||
int bench_time_ms_ = 0;
|
||||
};
|
||||
} // namespace ck_tile
|
||||
|
||||
@@ -48,31 +48,100 @@ struct gpu_timer
|
||||
hipEvent_t start_evt, stop_evt;
|
||||
};
|
||||
|
||||
struct gpu_timer_new
|
||||
{
|
||||
CK_TILE_HOST gpu_timer_new(const hipStream_t& s)
|
||||
{
|
||||
for(auto& e : start_event)
|
||||
{
|
||||
HIP_CHECK_ERROR(hipEventCreate(&e));
|
||||
}
|
||||
for(auto& e : stop_event)
|
||||
{
|
||||
HIP_CHECK_ERROR(hipEventCreate(&e));
|
||||
}
|
||||
HIP_CHECK_ERROR(hipEventCreate(&event0));
|
||||
HIP_CHECK_ERROR(hipEventRecord(event0, s));
|
||||
}
|
||||
|
||||
CK_TILE_HOST ~gpu_timer_new() noexcept(false)
|
||||
{
|
||||
for(auto& e : start_event)
|
||||
{
|
||||
HIP_CHECK_ERROR(hipEventDestroy(e));
|
||||
}
|
||||
for(auto& e : stop_event)
|
||||
{
|
||||
HIP_CHECK_ERROR(hipEventDestroy(e));
|
||||
}
|
||||
HIP_CHECK_ERROR(hipEventDestroy(event0));
|
||||
}
|
||||
|
||||
CK_TILE_HOST void start(const hipStream_t& s, int idx = 0)
|
||||
{
|
||||
HIP_CHECK_ERROR(hipEventRecord(start_event[idx % 2], s));
|
||||
}
|
||||
|
||||
CK_TILE_HOST void stop(const hipStream_t& s, int idx = 0)
|
||||
{
|
||||
HIP_CHECK_ERROR(hipEventRecord(stop_event[idx % 2], s));
|
||||
}
|
||||
// return in ms
|
||||
CK_TILE_HOST float duration(int idx = 0) const
|
||||
{
|
||||
float ms;
|
||||
HIP_CHECK_ERROR(hipEventSynchronize(stop_event[idx % 2]));
|
||||
HIP_CHECK_ERROR(hipEventElapsedTime(&ms, start_event[idx % 2], stop_event[idx % 2]));
|
||||
return ms;
|
||||
}
|
||||
CK_TILE_HOST float is_exceed(int idx = 0) const
|
||||
{
|
||||
float ms;
|
||||
HIP_CHECK_ERROR(hipEventElapsedTime(&ms, event0, stop_event[idx % 2]));
|
||||
return ms;
|
||||
}
|
||||
|
||||
private:
|
||||
std::array<hipEvent_t, 2> start_event;
|
||||
std::array<hipEvent_t, 2> stop_event;
|
||||
hipEvent_t event0;
|
||||
};
|
||||
|
||||
struct cpu_timer
|
||||
{
|
||||
// torch.utils.benchmark.Timer(), there is a sync inside each timer callback
|
||||
CK_TILE_HOST void start(const hipStream_t& s)
|
||||
CK_TILE_HOST void start(const hipStream_t& s, [[maybe_unused]] int idx = 0)
|
||||
{
|
||||
HIP_CHECK_ERROR(hipStreamSynchronize(s));
|
||||
start_tick = std::chrono::high_resolution_clock::now();
|
||||
start_tick = std::chrono::high_resolution_clock::now();
|
||||
time_event0 = std::chrono::high_resolution_clock::now();
|
||||
}
|
||||
// torch.utils.benchmark.Timer(), there is a sync inside each timer callback
|
||||
CK_TILE_HOST void stop(const hipStream_t& s)
|
||||
CK_TILE_HOST void stop(const hipStream_t& s, [[maybe_unused]] int idx = 0)
|
||||
{
|
||||
HIP_CHECK_ERROR(hipStreamSynchronize(s));
|
||||
stop_tick = std::chrono::high_resolution_clock::now();
|
||||
}
|
||||
// return in ms
|
||||
CK_TILE_HOST float duration() const
|
||||
CK_TILE_HOST float duration([[maybe_unused]] int idx = 0) const
|
||||
{
|
||||
double sec =
|
||||
std::chrono::duration_cast<std::chrono::duration<double>>(stop_tick - start_tick)
|
||||
.count();
|
||||
return static_cast<float>(sec * 1e3);
|
||||
}
|
||||
// return in ms
|
||||
CK_TILE_HOST float is_exceed([[maybe_unused]] int idx = 0) const
|
||||
{
|
||||
double sec =
|
||||
std::chrono::duration_cast<std::chrono::duration<double>>(stop_tick - time_event0)
|
||||
.count();
|
||||
return static_cast<float>(sec * 1e3);
|
||||
}
|
||||
|
||||
private:
|
||||
std::chrono::time_point<std::chrono::high_resolution_clock> start_tick;
|
||||
std::chrono::time_point<std::chrono::high_resolution_clock> time_event0;
|
||||
std::chrono::time_point<std::chrono::high_resolution_clock> stop_tick;
|
||||
};
|
||||
|
||||
|
||||
@@ -162,7 +162,7 @@ float gemm(const ck_tile::GemmHostArgs& args, const ck_tile::stream_config& s)
|
||||
hipGetErrorString(hipMemsetAsync(
|
||||
args.e_ptr, 0, args.M * args.N * sizeof(CDataType), s.stream_id_));
|
||||
};
|
||||
ave_time = ck_tile::launch_kernel_preprocess(
|
||||
ave_time = ck_tile::launch_kernel_time_mask(
|
||||
s,
|
||||
run_flush_cache,
|
||||
ck_tile::make_kernel<blocks.x, GemmConfig::kBlockPerCu>(
|
||||
|
||||
@@ -34,7 +34,8 @@ void benchmark_gemm(const ck_tile::ArgParser& arg_parser)
|
||||
arg_parser.get_bool("log"),
|
||||
arg_parser.get_str("csv_filename"),
|
||||
arg_parser.get_bool("flush_cache"),
|
||||
arg_parser.get_int("rotating_count")};
|
||||
arg_parser.get_int("rotating_count"),
|
||||
arg_parser.get_int("bench_time")};
|
||||
|
||||
auto& profiler = GemmProfiler::instance(setting);
|
||||
|
||||
|
||||
@@ -125,6 +125,7 @@ struct Setting
|
||||
std::string csv_filename_;
|
||||
bool flush_cache_;
|
||||
int rotating_count_;
|
||||
int bench_time_ms_;
|
||||
};
|
||||
|
||||
inline std::string get_rocm_version()
|
||||
|
||||
@@ -110,6 +110,7 @@ inline auto create_args(int argc, char* argv[])
|
||||
"To flush cache, possible values are true or false. "
|
||||
"Default is false.")
|
||||
.insert("rotating_count", "5", "number of iterations to rotate the cache. default is 5.")
|
||||
.insert("bench_time", "0", "benchmark time in ms. default is 0 ms.")
|
||||
.insert("metric",
|
||||
"0",
|
||||
"Metric with which to measure kernel performance. Set to 0 for latency, 1 for "
|
||||
|
||||
@@ -348,7 +348,7 @@ struct GemmKernel {{
|
||||
hipGetErrorString(hipMemsetAsync(
|
||||
args.e_ptr, 0, args.M * args.N * sizeof(CDataType), stream.stream_id_));
|
||||
}};
|
||||
ave_time = ck_tile::launch_kernel_preprocess(
|
||||
ave_time = ck_tile::launch_kernel_time_mask(
|
||||
stream,
|
||||
run_flush_cache,
|
||||
ck_tile::make_kernel<blocks.x, kBlockPerCu>(
|
||||
|
||||
@@ -131,7 +131,8 @@ class GemmProfiler
|
||||
setting_.n_repeat_,
|
||||
setting_.is_gpu_timer_,
|
||||
setting_.flush_cache_,
|
||||
setting_.rotating_count_});
|
||||
setting_.rotating_count_,
|
||||
setting_.bench_time_ms_});
|
||||
process_result(gemm_problem,
|
||||
c_m_n_dev_buf,
|
||||
c_m_n_host_result,
|
||||
|
||||
Reference in New Issue
Block a user