Merge commit '88d72178d6739c7e277074e5f9bb5d1e59bf0152' into develop

This commit is contained in:
assistant-librarian[bot]
2025-08-01 00:42:26 +00:00
parent b0bb499ae5
commit 50c936cf87
11 changed files with 54 additions and 139 deletions

View File

@@ -458,7 +458,8 @@ auto create_args(int argc, char* argv[])
.insert("split_k", "1", "splitK value")
.insert("init", "0", "0:random, 1:linear, 2:constant(1)")
.insert("persistent", "0", "0:non-persistent, 1:persistent")
.insert("bench_time_ms", "0", "benchmark time in ms, defaults to 0 ms");
.insert("flush_cache", "true", "flush cache before running the kernel, defaults to true")
.insert("rotating_count", "1", "rotating count, defaults to 1");
bool result = arg_parser.parse(argc, argv);
return std::make_tuple(result, arg_parser);

View File

@@ -184,7 +184,8 @@ float invoke_gemm(ck_tile::DeviceMem& a_m_k_dev_buf,
int n_warmup,
int n_repeat,
bool persistent,
int bench_time_ms)
bool flush_cache,
int rotating_count)
{
ck_tile::GemmHostArgs args = {a_m_k_dev_buf.GetDeviceBuffer(),
b_k_n_dev_buf.GetDeviceBuffer(),
@@ -214,7 +215,7 @@ float invoke_gemm(ck_tile::DeviceMem& a_m_k_dev_buf,
CDEElementWise>(
args,
ck_tile::stream_config{
nullptr, true, 1, n_warmup, n_repeat, true, true, 50, bench_time_ms});
nullptr, true, 1, n_warmup, n_repeat, true, flush_cache, rotating_count});
}
else
{
@@ -232,7 +233,7 @@ float invoke_gemm(ck_tile::DeviceMem& a_m_k_dev_buf,
CDEElementWise>(
args,
ck_tile::stream_config{
nullptr, true, 1, n_warmup, n_repeat, true, true, 50, bench_time_ms});
nullptr, true, 1, n_warmup, n_repeat, true, flush_cache, rotating_count});
}
std::size_t flop = std::size_t(2) * M * N * K;
@@ -303,7 +304,8 @@ int run_gemm_example_with_layouts(int argc,
int n_repeat = arg_parser.get_int("repeat");
ck_tile::index_t init_method = arg_parser.get_int("init");
bool persistent = arg_parser.get_int("persistent");
int bench_time_ms = arg_parser.get_int("bench_time_ms");
bool flush_cache = arg_parser.get_bool("flush_cache");
int rotating_count = arg_parser.get_int("rotating_count");
const bool preshuffle = GemmConfig::Preshuffle;
@@ -422,7 +424,8 @@ int run_gemm_example_with_layouts(int argc,
n_warmup,
n_repeat,
persistent,
bench_time_ms);
flush_cache,
rotating_count);
c_m_n_dev_buf.FromDevice(c_m_n_dev_result.data());
bool pass = true;

View File

@@ -168,7 +168,7 @@ float flatmm_calc(const ck_tile::FlatmmHostArgs<>& args, const ck_tile::stream_c
hipGetErrorString(hipMemsetAsync(
args.e_ptr, 0, args.M * args.N * sizeof(CDataType), s.stream_id_));
};
ave_time = ck_tile::launch_kernel_preprocess(
ave_time = ck_tile::launch_kernel_time_mask(
s,
run_flush_cache,
ck_tile::make_kernel<blocks.x, FlatmmConfig::kBlockPerCu>(

View File

@@ -120,7 +120,7 @@ float grouped_conv_bwd_weight(const ck_tile::GroupedConvBwdWeightHostArgs& args,
<< ", Vector size C: " << ConvEpilogue::GetVectorSizeC() << std::endl;
}
float ave_time = ck_tile::launch_kernel_preprocess(
float ave_time = ck_tile::launch_kernel_time_mask(
s,
Kernel::Preprocess(kargs, s),
ck_tile::make_kernel<blocks.x, kBlockPerCu>(Kernel{}, grids, blocks, 0, kargs));

View File

@@ -15,12 +15,6 @@
namespace ck_tile {
#define LOW_CU_PROCESSORS 80
#define HIGH_CU_PROCESSORS 228
#define OPTIMAL_LATENCY_LOW_CU_PROCESSORS 0.005
#define OPTIMAL_LATENCY_HIGH_CU_PROCESSORS 0.0015
#define OPTIMAL_LATENCY_SAFE_MARGIN 0.01
template <int MaxThreadPerBlock, int MinBlockPerCu, typename Kernel, typename... Args>
#if CK_TILE_USE_LAUNCH_BOUNDS
__launch_bounds__(MaxThreadPerBlock, MinBlockPerCu)
@@ -65,71 +59,58 @@ CK_TILE_HOST void launch_and_check(const stream_config& sc, Callables&&... calla
}
}
template <class it>
typename std::iterator_traits<it>::value_type median(it begin, it end)
// Measure the preprocess time during the cold iterations
template <typename TimerType, typename PreprocessFunc>
CK_TILE_HOST double
preprocess_profiling_impl(TimerType timer, const stream_config& s, PreprocessFunc preprocess)
{
if(begin == end)
timer.start(s.stream_id_);
for(int i = 0; i < s.nrepeat_; i++)
{
return std::numeric_limits<double>::quiet_NaN();
if constexpr(!std::is_same_v<PreprocessFunc, std::nullptr_t>)
{
preprocess();
}
}
auto n = std::distance(begin, end);
auto n2 = n / 2;
std::nth_element(begin, begin + n2, end);
return (n % 2) ? begin[n2] : (*std::max_element(begin, begin + n2) + begin[n2]) / 2.0;
timer.stop(s.stream_id_);
return timer.duration() / s.nrepeat_;
}
inline void remove_outliers(std::vector<float>& v)
{
// 1.5x IQR method to detect and remove outliers
auto n2 = v.size() / 2;
std::nth_element(v.begin(), v.begin() + n2, v.end());
auto q1 = median(v.begin(), v.begin() + n2);
auto q3 = median(v.begin() + ((v.size() % 2) ? n2 + 1 : n2), v.end());
auto iqr = q3 - q1;
auto lb = q1 - 1.5 * iqr;
auto ub = q3 + 1.5 * iqr;
v.erase(std::remove_if(v.begin(), v.end(), [&](float f) { return f < lb || f > ub; }), v.end());
}
template <typename TimerType, typename CallablesFunc>
template <typename TimerType, typename CallablesFunc, typename PreprocessFunc = std::nullptr_t>
CK_TILE_HOST double timing_loop_impl(TimerType timer,
const stream_config& s,
CallablesFunc&& callables_func,
std::function<void()> preprocess = nullptr)
PreprocessFunc preprocess = nullptr)
{
for(int i = 0; i < s.cold_niters_; i++)
{
callables_func();
}
float per_iter_time = 0.f;
std::vector<float> times;
int i = 0;
while(i < s.nrepeat_ || per_iter_time < s.bench_time_ms_)
// Only profile preprocess if it's provided
auto preprocess_time = 0.0;
if constexpr(!std::is_same_v<PreprocessFunc, std::nullptr_t>)
{
if(preprocess)
preprocess();
preprocess_time = preprocess_profiling_impl(gpu_timer{}, s, preprocess);
}
timer.start(s.stream_id_, i);
callables_func();
timer.stop(s.stream_id_, i);
if(i > 0)
int i = 0;
timer.start(s.stream_id_);
while(i < s.nrepeat_)
{
if constexpr(!std::is_same_v<PreprocessFunc, std::nullptr_t>)
{
per_iter_time = timer.duration(i - 1);
times.push_back(per_iter_time);
per_iter_time = timer.is_exceed(i - 1);
preprocess();
}
callables_func();
i++;
}
timer.stop(s.stream_id_);
if(!i)
return 0.;
per_iter_time = timer.duration(i - 1);
times.push_back(per_iter_time);
remove_outliers(times);
return std::accumulate(times.begin(), times.end(), 0.) / times.size();
return (timer.duration() / s.nrepeat_) - preprocess_time;
}
// clang-format off
@@ -174,7 +155,7 @@ CK_TILE_HOST float launch_kernel(const stream_config& s, Callables&&... callable
if(s.is_gpu_timer_)
{
return timing_loop_impl(gpu_timer_new{s.stream_id_}, s, callables_func);
return timing_loop_impl(gpu_timer{}, s, callables_func);
}
else
{
@@ -199,7 +180,7 @@ launch_kernel_time_mask(const stream_config& s, PreprocessFunc preprocess, Calla
if(s.is_gpu_timer_)
{
return timing_loop_impl(gpu_timer_new{s.stream_id_}, s, callables_func, preprocess);
return timing_loop_impl(gpu_timer{}, s, callables_func, preprocess);
}
else
{

View File

@@ -20,6 +20,10 @@ namespace ck_tile {
*
* // create stream config with _some_stream_id_, and benchmark using cpu timer
* stream_config s = stream_config{_some_stream_id_, true, 0, 3, 10, false};
*
* // create stream config with _some_stream_id_, and enable gpu timer for rotating buffer with
*rotating buffer count stream_config s = stream_config{_some_stream_id_, true, 0, 3, 10, true,
*true, 1};
**/
struct stream_config
@@ -32,6 +36,5 @@ struct stream_config
bool is_gpu_timer_ = true; // keep compatible
bool flush_cache_ = false;
int rotating_count_ = 1;
int bench_time_ms_ = 0;
};
} // namespace ck_tile

View File

@@ -48,100 +48,31 @@ struct gpu_timer
hipEvent_t start_evt, stop_evt;
};
struct gpu_timer_new
{
CK_TILE_HOST gpu_timer_new(const hipStream_t& s)
{
for(auto& e : start_event)
{
HIP_CHECK_ERROR(hipEventCreate(&e));
}
for(auto& e : stop_event)
{
HIP_CHECK_ERROR(hipEventCreate(&e));
}
HIP_CHECK_ERROR(hipEventCreate(&event0));
HIP_CHECK_ERROR(hipEventRecord(event0, s));
}
CK_TILE_HOST ~gpu_timer_new() noexcept(false)
{
for(auto& e : start_event)
{
HIP_CHECK_ERROR(hipEventDestroy(e));
}
for(auto& e : stop_event)
{
HIP_CHECK_ERROR(hipEventDestroy(e));
}
HIP_CHECK_ERROR(hipEventDestroy(event0));
}
CK_TILE_HOST void start(const hipStream_t& s, int idx = 0)
{
HIP_CHECK_ERROR(hipEventRecord(start_event[idx % 2], s));
}
CK_TILE_HOST void stop(const hipStream_t& s, int idx = 0)
{
HIP_CHECK_ERROR(hipEventRecord(stop_event[idx % 2], s));
}
// return in ms
CK_TILE_HOST float duration(int idx = 0) const
{
float ms;
HIP_CHECK_ERROR(hipEventSynchronize(stop_event[idx % 2]));
HIP_CHECK_ERROR(hipEventElapsedTime(&ms, start_event[idx % 2], stop_event[idx % 2]));
return ms;
}
CK_TILE_HOST float is_exceed(int idx = 0) const
{
float ms;
HIP_CHECK_ERROR(hipEventElapsedTime(&ms, event0, stop_event[idx % 2]));
return ms;
}
private:
std::array<hipEvent_t, 2> start_event;
std::array<hipEvent_t, 2> stop_event;
hipEvent_t event0;
};
struct cpu_timer
{
// torch.utils.benchmark.Timer(), there is a sync inside each timer callback
CK_TILE_HOST void start(const hipStream_t& s, [[maybe_unused]] int idx = 0)
CK_TILE_HOST void start(const hipStream_t& s)
{
HIP_CHECK_ERROR(hipStreamSynchronize(s));
start_tick = std::chrono::high_resolution_clock::now();
time_event0 = std::chrono::high_resolution_clock::now();
start_tick = std::chrono::high_resolution_clock::now();
}
// torch.utils.benchmark.Timer(), there is a sync inside each timer callback
CK_TILE_HOST void stop(const hipStream_t& s, [[maybe_unused]] int idx = 0)
CK_TILE_HOST void stop(const hipStream_t& s)
{
HIP_CHECK_ERROR(hipStreamSynchronize(s));
stop_tick = std::chrono::high_resolution_clock::now();
}
// return in ms
CK_TILE_HOST float duration([[maybe_unused]] int idx = 0) const
CK_TILE_HOST float duration() const
{
double sec =
std::chrono::duration_cast<std::chrono::duration<double>>(stop_tick - start_tick)
.count();
return static_cast<float>(sec * 1e3);
}
// return in ms
CK_TILE_HOST float is_exceed([[maybe_unused]] int idx = 0) const
{
double sec =
std::chrono::duration_cast<std::chrono::duration<double>>(stop_tick - time_event0)
.count();
return static_cast<float>(sec * 1e3);
}
private:
std::chrono::time_point<std::chrono::high_resolution_clock> start_tick;
std::chrono::time_point<std::chrono::high_resolution_clock> time_event0;
std::chrono::time_point<std::chrono::high_resolution_clock> stop_tick;
};

View File

@@ -34,8 +34,7 @@ void benchmark_gemm(const ck_tile::ArgParser& arg_parser)
arg_parser.get_bool("log"),
arg_parser.get_str("csv_filename"),
arg_parser.get_bool("flush_cache"),
arg_parser.get_int("rotating_count"),
arg_parser.get_int("bench_time")};
arg_parser.get_int("rotating_count")};
auto& profiler = GemmProfiler::instance(setting);

View File

@@ -125,7 +125,6 @@ struct Setting
std::string csv_filename_;
bool flush_cache_;
int rotating_count_;
int bench_time_ms_;
};
inline std::string get_rocm_version()

View File

@@ -110,7 +110,6 @@ inline auto create_args(int argc, char* argv[])
"To flush cache, possible values are true or false. "
"Default is false.")
.insert("rotating_count", "5", "number of iterations to rotate the cache. default is 5.")
.insert("bench_time", "0", "benchmark time in ms. default is 0 ms.")
.insert("metric",
"0",
"Metric with which to measure kernel performance. Set to 0 for latency, 1 for "

View File

@@ -131,8 +131,7 @@ class GemmProfiler
setting_.n_repeat_,
setting_.is_gpu_timer_,
setting_.flush_cache_,
setting_.rotating_count_,
setting_.bench_time_ms_});
setting_.rotating_count_});
process_result(gemm_problem,
c_m_n_dev_buf,
c_m_n_host_result,