[rocm-libraries] ROCm/rocm-libraries#5114 (commit 59b8cb5)

[CK][CK Tile] Improvements for grouped conv fwd tile
 profiling (#5114)

## Motivation

Improve profiling for grouped convolution forward for better comparison
between CK and CK Tile
## Technical Details

- Include preprocessing time for ck tile
- Add flush cache for conv fwd profiler
- Switch configs to builder reflect
- Add KPerXdl deduce
- Add non-grouped ported instances

## Test Plan

test_grouped_convnd_fwd_tile

## Test Result

pass

## Submission Checklist

- [x] Look over the contributing guidelines at
https://github.com/ROCm/ROCm/blob/develop/CONTRIBUTING.md#pull-requests.

AICK-786
This commit is contained in:
Bartłomiej Kocot
2026-03-11 22:39:20 +00:00
committed by assistant-librarian[bot]
parent c1f2d8166d
commit 2169367735
24 changed files with 2375 additions and 1874 deletions

View File

@@ -10,6 +10,8 @@
#include "ck_tile/host/hip_check_error.hpp"
#include "ck_tile/host/stream_config.hpp"
#include "ck_tile/host/timer.hpp"
#include "ck_tile/host/flush_icache.hpp"
#include "ck_tile/host/rotating_buffers.hpp"
#include <cstddef>
#include <hip/hip_runtime.h>
@@ -124,6 +126,47 @@ preprocess_profiling_impl(TimerType timer, const stream_config& s, PreprocessFun
return timer.duration() / s.nrepeat_;
}
template <typename TimerType, typename CallablesFunc, typename PreprocessFunc = std::nullptr_t>
CK_TILE_HOST double timing_loop_flush_cache_impl(TimerType timer,
const stream_config& s,
CallablesFunc&& callables_func,
PreprocessFunc preprocess = nullptr)
{
auto run_flush_cache = [&]() { ck_tile::flush_icache(); };
// Warm up
for(int i = 0; i < s.cold_niters_; i++)
{
if constexpr(!std::is_same_v<PreprocessFunc, std::nullptr_t>)
{
preprocess();
}
callables_func();
}
// Main timing loop
int i = 0;
timer.start(s.stream_id_);
while(i < s.nrepeat_)
{
run_flush_cache();
if constexpr(!std::is_same_v<PreprocessFunc, std::nullptr_t>)
{
preprocess();
}
callables_func();
i++;
}
timer.stop(s.stream_id_);
// Flush cache timing loop
auto flush_cache_time = preprocess_profiling_impl(gpu_timer{}, s, run_flush_cache);
if(i == 0)
{
return 0.;
}
// Exclude flush cache from result
return (timer.duration() / s.nrepeat_) - flush_cache_time;
}
template <typename TimerType, typename CallablesFunc, typename PreprocessFunc = std::nullptr_t>
CK_TILE_HOST double timing_loop_impl(TimerType timer,
const stream_config& s,
@@ -138,12 +181,6 @@ CK_TILE_HOST double timing_loop_impl(TimerType timer,
}
callables_func();
}
// Only profile preprocess if it's provided
auto preprocess_time = 0.0;
if constexpr(!std::is_same_v<PreprocessFunc, std::nullptr_t>)
{
preprocess_time = preprocess_profiling_impl(gpu_timer{}, s, preprocess);
}
int i = 0;
timer.start(s.stream_id_);
@@ -159,9 +196,9 @@ CK_TILE_HOST double timing_loop_impl(TimerType timer,
}
timer.stop(s.stream_id_);
if(!i)
if(i == 0)
return 0.;
return (timer.duration() / s.nrepeat_) - preprocess_time;
return timer.duration() / s.nrepeat_;
}
// clang-format off
@@ -238,4 +275,31 @@ launch_kernel_time_mask(const stream_config& s, PreprocessFunc preprocess, Calla
return timing_loop_impl(cpu_timer{}, s, callables_func, preprocess);
}
}
template <typename PreprocessFunc, typename... Callables>
CK_TILE_HOST float launch_kernel_time_mask_flush_cache(const stream_config& s,
PreprocessFunc preprocess,
Callables&&... callables)
{
static_assert(sizeof...(callables) > 0, "At least one callable is required!");
if(!s.time_kernel_)
{
preprocess();
launch_and_check(s, std::forward<Callables>(callables)...);
return 0;
}
auto callables_func = [&]() { launch_and_check(s, std::forward<Callables>(callables)...); };
if(s.is_gpu_timer_)
{
return timing_loop_flush_cache_impl(gpu_timer{}, s, callables_func, preprocess);
}
else
{
return timing_loop_flush_cache_impl(cpu_timer{}, s, callables_func, preprocess);
}
}
} // namespace ck_tile