mirror of
https://github.com/ROCm/composable_kernel.git
synced 2026-05-17 19:40:04 +00:00
Merge commit '88d72178d6739c7e277074e5f9bb5d1e59bf0152' into develop
This commit is contained in:
@@ -15,12 +15,6 @@
|
||||
|
||||
namespace ck_tile {
|
||||
|
||||
#define LOW_CU_PROCESSORS 80
|
||||
#define HIGH_CU_PROCESSORS 228
|
||||
#define OPTIMAL_LATENCY_LOW_CU_PROCESSORS 0.005
|
||||
#define OPTIMAL_LATENCY_HIGH_CU_PROCESSORS 0.0015
|
||||
#define OPTIMAL_LATENCY_SAFE_MARGIN 0.01
|
||||
|
||||
template <int MaxThreadPerBlock, int MinBlockPerCu, typename Kernel, typename... Args>
|
||||
#if CK_TILE_USE_LAUNCH_BOUNDS
|
||||
__launch_bounds__(MaxThreadPerBlock, MinBlockPerCu)
|
||||
@@ -65,71 +59,58 @@ CK_TILE_HOST void launch_and_check(const stream_config& sc, Callables&&... calla
|
||||
}
|
||||
}
|
||||
|
||||
template <class it>
|
||||
typename std::iterator_traits<it>::value_type median(it begin, it end)
|
||||
// Measure the preprocess time during the cold iterations
|
||||
template <typename TimerType, typename PreprocessFunc>
|
||||
CK_TILE_HOST double
|
||||
preprocess_profiling_impl(TimerType timer, const stream_config& s, PreprocessFunc preprocess)
|
||||
{
|
||||
if(begin == end)
|
||||
timer.start(s.stream_id_);
|
||||
for(int i = 0; i < s.nrepeat_; i++)
|
||||
{
|
||||
return std::numeric_limits<double>::quiet_NaN();
|
||||
if constexpr(!std::is_same_v<PreprocessFunc, std::nullptr_t>)
|
||||
{
|
||||
preprocess();
|
||||
}
|
||||
}
|
||||
auto n = std::distance(begin, end);
|
||||
auto n2 = n / 2;
|
||||
std::nth_element(begin, begin + n2, end);
|
||||
return (n % 2) ? begin[n2] : (*std::max_element(begin, begin + n2) + begin[n2]) / 2.0;
|
||||
timer.stop(s.stream_id_);
|
||||
|
||||
return timer.duration() / s.nrepeat_;
|
||||
}
|
||||
|
||||
inline void remove_outliers(std::vector<float>& v)
|
||||
{
|
||||
// 1.5x IQR method to detect and remove outliers
|
||||
auto n2 = v.size() / 2;
|
||||
std::nth_element(v.begin(), v.begin() + n2, v.end());
|
||||
auto q1 = median(v.begin(), v.begin() + n2);
|
||||
auto q3 = median(v.begin() + ((v.size() % 2) ? n2 + 1 : n2), v.end());
|
||||
auto iqr = q3 - q1;
|
||||
auto lb = q1 - 1.5 * iqr;
|
||||
auto ub = q3 + 1.5 * iqr;
|
||||
v.erase(std::remove_if(v.begin(), v.end(), [&](float f) { return f < lb || f > ub; }), v.end());
|
||||
}
|
||||
|
||||
template <typename TimerType, typename CallablesFunc>
|
||||
template <typename TimerType, typename CallablesFunc, typename PreprocessFunc = std::nullptr_t>
|
||||
CK_TILE_HOST double timing_loop_impl(TimerType timer,
|
||||
const stream_config& s,
|
||||
CallablesFunc&& callables_func,
|
||||
std::function<void()> preprocess = nullptr)
|
||||
PreprocessFunc preprocess = nullptr)
|
||||
{
|
||||
for(int i = 0; i < s.cold_niters_; i++)
|
||||
{
|
||||
callables_func();
|
||||
}
|
||||
|
||||
float per_iter_time = 0.f;
|
||||
std::vector<float> times;
|
||||
int i = 0;
|
||||
while(i < s.nrepeat_ || per_iter_time < s.bench_time_ms_)
|
||||
// Only profile preprocess if it's provided
|
||||
auto preprocess_time = 0.0;
|
||||
if constexpr(!std::is_same_v<PreprocessFunc, std::nullptr_t>)
|
||||
{
|
||||
if(preprocess)
|
||||
preprocess();
|
||||
preprocess_time = preprocess_profiling_impl(gpu_timer{}, s, preprocess);
|
||||
}
|
||||
|
||||
timer.start(s.stream_id_, i);
|
||||
callables_func();
|
||||
timer.stop(s.stream_id_, i);
|
||||
|
||||
if(i > 0)
|
||||
int i = 0;
|
||||
timer.start(s.stream_id_);
|
||||
while(i < s.nrepeat_)
|
||||
{
|
||||
if constexpr(!std::is_same_v<PreprocessFunc, std::nullptr_t>)
|
||||
{
|
||||
per_iter_time = timer.duration(i - 1);
|
||||
times.push_back(per_iter_time);
|
||||
per_iter_time = timer.is_exceed(i - 1);
|
||||
preprocess();
|
||||
}
|
||||
|
||||
callables_func();
|
||||
i++;
|
||||
}
|
||||
timer.stop(s.stream_id_);
|
||||
|
||||
if(!i)
|
||||
return 0.;
|
||||
|
||||
per_iter_time = timer.duration(i - 1);
|
||||
times.push_back(per_iter_time);
|
||||
remove_outliers(times);
|
||||
return std::accumulate(times.begin(), times.end(), 0.) / times.size();
|
||||
return (timer.duration() / s.nrepeat_) - preprocess_time;
|
||||
}
|
||||
|
||||
// clang-format off
|
||||
@@ -174,7 +155,7 @@ CK_TILE_HOST float launch_kernel(const stream_config& s, Callables&&... callable
|
||||
|
||||
if(s.is_gpu_timer_)
|
||||
{
|
||||
return timing_loop_impl(gpu_timer_new{s.stream_id_}, s, callables_func);
|
||||
return timing_loop_impl(gpu_timer{}, s, callables_func);
|
||||
}
|
||||
else
|
||||
{
|
||||
@@ -199,7 +180,7 @@ launch_kernel_time_mask(const stream_config& s, PreprocessFunc preprocess, Calla
|
||||
|
||||
if(s.is_gpu_timer_)
|
||||
{
|
||||
return timing_loop_impl(gpu_timer_new{s.stream_id_}, s, callables_func, preprocess);
|
||||
return timing_loop_impl(gpu_timer{}, s, callables_func, preprocess);
|
||||
}
|
||||
else
|
||||
{
|
||||
|
||||
@@ -20,6 +20,10 @@ namespace ck_tile {
|
||||
*
|
||||
* // create stream config with _some_stream_id_, and benchmark using cpu timer
|
||||
* stream_config s = stream_config{_some_stream_id_, true, 0, 3, 10, false};
|
||||
*
|
||||
* // create stream config with _some_stream_id_, and enable gpu timer for rotating buffer with
* // rotating buffer count
* stream_config s = stream_config{_some_stream_id_, true, 0, 3, 10, true, true, 1};
|
||||
**/
|
||||
|
||||
struct stream_config
|
||||
@@ -32,6 +36,5 @@ struct stream_config
|
||||
bool is_gpu_timer_ = true; // keep compatible
|
||||
bool flush_cache_ = false;
|
||||
int rotating_count_ = 1;
|
||||
int bench_time_ms_ = 0;
|
||||
};
|
||||
} // namespace ck_tile
|
||||
|
||||
@@ -48,100 +48,31 @@ struct gpu_timer
|
||||
hipEvent_t start_evt, stop_evt;
|
||||
};
|
||||
|
||||
struct gpu_timer_new
|
||||
{
|
||||
CK_TILE_HOST gpu_timer_new(const hipStream_t& s)
|
||||
{
|
||||
for(auto& e : start_event)
|
||||
{
|
||||
HIP_CHECK_ERROR(hipEventCreate(&e));
|
||||
}
|
||||
for(auto& e : stop_event)
|
||||
{
|
||||
HIP_CHECK_ERROR(hipEventCreate(&e));
|
||||
}
|
||||
HIP_CHECK_ERROR(hipEventCreate(&event0));
|
||||
HIP_CHECK_ERROR(hipEventRecord(event0, s));
|
||||
}
|
||||
|
||||
CK_TILE_HOST ~gpu_timer_new() noexcept(false)
|
||||
{
|
||||
for(auto& e : start_event)
|
||||
{
|
||||
HIP_CHECK_ERROR(hipEventDestroy(e));
|
||||
}
|
||||
for(auto& e : stop_event)
|
||||
{
|
||||
HIP_CHECK_ERROR(hipEventDestroy(e));
|
||||
}
|
||||
HIP_CHECK_ERROR(hipEventDestroy(event0));
|
||||
}
|
||||
|
||||
CK_TILE_HOST void start(const hipStream_t& s, int idx = 0)
|
||||
{
|
||||
HIP_CHECK_ERROR(hipEventRecord(start_event[idx % 2], s));
|
||||
}
|
||||
|
||||
CK_TILE_HOST void stop(const hipStream_t& s, int idx = 0)
|
||||
{
|
||||
HIP_CHECK_ERROR(hipEventRecord(stop_event[idx % 2], s));
|
||||
}
|
||||
// return in ms
|
||||
CK_TILE_HOST float duration(int idx = 0) const
|
||||
{
|
||||
float ms;
|
||||
HIP_CHECK_ERROR(hipEventSynchronize(stop_event[idx % 2]));
|
||||
HIP_CHECK_ERROR(hipEventElapsedTime(&ms, start_event[idx % 2], stop_event[idx % 2]));
|
||||
return ms;
|
||||
}
|
||||
CK_TILE_HOST float is_exceed(int idx = 0) const
|
||||
{
|
||||
float ms;
|
||||
HIP_CHECK_ERROR(hipEventElapsedTime(&ms, event0, stop_event[idx % 2]));
|
||||
return ms;
|
||||
}
|
||||
|
||||
private:
|
||||
std::array<hipEvent_t, 2> start_event;
|
||||
std::array<hipEvent_t, 2> stop_event;
|
||||
hipEvent_t event0;
|
||||
};
|
||||
|
||||
struct cpu_timer
|
||||
{
|
||||
// torch.utils.benchmark.Timer(), there is a sync inside each timer callback
|
||||
CK_TILE_HOST void start(const hipStream_t& s, [[maybe_unused]] int idx = 0)
|
||||
CK_TILE_HOST void start(const hipStream_t& s)
|
||||
{
|
||||
HIP_CHECK_ERROR(hipStreamSynchronize(s));
|
||||
start_tick = std::chrono::high_resolution_clock::now();
|
||||
time_event0 = std::chrono::high_resolution_clock::now();
|
||||
start_tick = std::chrono::high_resolution_clock::now();
|
||||
}
|
||||
// torch.utils.benchmark.Timer(), there is a sync inside each timer callback
|
||||
CK_TILE_HOST void stop(const hipStream_t& s, [[maybe_unused]] int idx = 0)
|
||||
CK_TILE_HOST void stop(const hipStream_t& s)
|
||||
{
|
||||
HIP_CHECK_ERROR(hipStreamSynchronize(s));
|
||||
stop_tick = std::chrono::high_resolution_clock::now();
|
||||
}
|
||||
// return in ms
|
||||
CK_TILE_HOST float duration([[maybe_unused]] int idx = 0) const
|
||||
CK_TILE_HOST float duration() const
|
||||
{
|
||||
double sec =
|
||||
std::chrono::duration_cast<std::chrono::duration<double>>(stop_tick - start_tick)
|
||||
.count();
|
||||
return static_cast<float>(sec * 1e3);
|
||||
}
|
||||
// return in ms
|
||||
CK_TILE_HOST float is_exceed([[maybe_unused]] int idx = 0) const
|
||||
{
|
||||
double sec =
|
||||
std::chrono::duration_cast<std::chrono::duration<double>>(stop_tick - time_event0)
|
||||
.count();
|
||||
return static_cast<float>(sec * 1e3);
|
||||
}
|
||||
|
||||
private:
|
||||
std::chrono::time_point<std::chrono::high_resolution_clock> start_tick;
|
||||
std::chrono::time_point<std::chrono::high_resolution_clock> time_event0;
|
||||
std::chrono::time_point<std::chrono::high_resolution_clock> stop_tick;
|
||||
};
|
||||
|
||||
|
||||
Reference in New Issue
Block a user