mirror of
https://github.com/ROCm/composable_kernel.git
synced 2026-05-12 01:10:17 +00:00
[ck] Clamp negative kernel execution elapsed time to zero (#6379) ## Motivation hipEventElapsedTime can return a small negative value on Windows when timing a very fast kernel launch on the null stream. This caused consumers of launch_and_time_kernel to receive a negative elapsed time, which they reasonably treat as an error, breaking otherwise-correct kernel executions. ## Technical Details After calling hipEventElapsedTime, a clamp is applied in launch_and_time_kernel before the result is returned, avoiding the return of a physically impossible elapsed time. The negative value from hipEventElapsedTime has been observed on Windows. For kernels that complete in well under a millisecond, the HIP event timestamps can alias such that the computed difference is a small negative number (observed: ~-1.78 ms). No HIP error is reported by any surrounding call (hipEventRecord, hipEventSynchronize, hipGetLastError), confirming the kernel itself executed successfully. ## Test Plan - Recompile CK and validate no kernel execution reports a negative elapsed time during hipTensor tests. - Pass the CI/CD pre-checking tests for CK. ## Test Result - All tests passing ## Submission Checklist - [x] Look over the contributing guidelines at https://github.com/ROCm/ROCm/blob/develop/CONTRIBUTING.md#pull-requests.
306 lines
10 KiB
C++
306 lines
10 KiB
C++
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
|
|
// SPDX-License-Identifier: MIT
|
|
|
|
#pragma once
|
|
#ifndef __HIPCC_RTC__
|
|
#include <hip/hip_runtime.h>
|
|
|
|
#include "ck/ck.hpp"
|
|
#include "ck/utility/env.hpp"
|
|
#include "ck/stream_config.hpp"
|
|
#include "ck/host_utility/hip_check_error.hpp"
|
|
#include "ck/host_utility/flush_cache.hpp"
|
|
|
|
namespace ck {
|
|
|
|
template <typename... Args, typename F>
|
|
float launch_and_time_kernel(const StreamConfig& stream_config,
|
|
F kernel,
|
|
dim3 grid_dim,
|
|
dim3 block_dim,
|
|
std::size_t lds_byte,
|
|
Args... args)
|
|
{
|
|
#if CK_TIME_KERNEL
|
|
if(stream_config.time_kernel_)
|
|
{
|
|
if(ck::EnvIsEnabled(CK_ENV(CK_LOGGING)))
|
|
{
|
|
printf("%s: grid_dim {%u, %u, %u}, block_dim {%u, %u, %u} \n",
|
|
__func__,
|
|
grid_dim.x,
|
|
grid_dim.y,
|
|
grid_dim.z,
|
|
block_dim.x,
|
|
block_dim.y,
|
|
block_dim.z);
|
|
|
|
printf("Warm up %d times\n", stream_config.cold_niters_);
|
|
}
|
|
// warm up
|
|
for(int i = 0; i < stream_config.cold_niters_; ++i)
|
|
{
|
|
kernel<<<grid_dim, block_dim, lds_byte, stream_config.stream_id_>>>(args...);
|
|
hip_check_error(hipGetLastError());
|
|
}
|
|
|
|
const int nrepeat = stream_config.nrepeat_;
|
|
if(ck::EnvIsEnabled(CK_ENV(CK_LOGGING)))
|
|
{
|
|
printf("Start running %d times...\n", nrepeat);
|
|
}
|
|
hipEvent_t start, stop;
|
|
|
|
hip_check_error(hipEventCreate(&start));
|
|
hip_check_error(hipEventCreate(&stop));
|
|
|
|
hip_check_error(hipDeviceSynchronize());
|
|
hip_check_error(hipEventRecord(start, stream_config.stream_id_));
|
|
|
|
for(int i = 0; i < nrepeat; ++i)
|
|
{
|
|
kernel<<<grid_dim, block_dim, lds_byte, stream_config.stream_id_>>>(args...);
|
|
hip_check_error(hipGetLastError());
|
|
}
|
|
|
|
hip_check_error(hipEventRecord(stop, stream_config.stream_id_));
|
|
hip_check_error(hipEventSynchronize(stop));
|
|
|
|
float total_time = 0;
|
|
|
|
hip_check_error(hipEventElapsedTime(&total_time, start, stop));
|
|
|
|
// hipEventElapsedTime can return a small negative value on Windows for a
|
|
// very fast kernel. Clamp to zero, as negative elapsed time is never physical.
|
|
if(total_time < 0)
|
|
total_time = 0;
|
|
|
|
hip_check_error(hipEventDestroy(start));
|
|
hip_check_error(hipEventDestroy(stop));
|
|
|
|
return total_time / nrepeat;
|
|
}
|
|
else
|
|
{
|
|
kernel<<<grid_dim, block_dim, lds_byte, stream_config.stream_id_>>>(args...);
|
|
hip_check_error(hipGetLastError());
|
|
|
|
return 0;
|
|
}
|
|
#else
|
|
kernel<<<grid_dim, block_dim, lds_byte, stream_config.stream_id_>>>(args...);
|
|
hip_check_error(hipGetLastError());
|
|
|
|
return 0;
|
|
#endif
|
|
}
|
|
|
|
template <typename... Args, typename F, typename PreProcessFunc>
|
|
float launch_and_time_kernel_with_preprocess(const StreamConfig& stream_config,
|
|
PreProcessFunc preprocess,
|
|
F kernel,
|
|
dim3 grid_dim,
|
|
dim3 block_dim,
|
|
std::size_t lds_byte,
|
|
Args... args)
|
|
{
|
|
#if CK_TIME_KERNEL
|
|
if(stream_config.time_kernel_)
|
|
{
|
|
if(ck::EnvIsEnabled(CK_ENV(CK_LOGGING)))
|
|
{
|
|
printf("%s: grid_dim {%u, %u, %u}, block_dim {%u, %u, %u} \n",
|
|
__func__,
|
|
grid_dim.x,
|
|
grid_dim.y,
|
|
grid_dim.z,
|
|
block_dim.x,
|
|
block_dim.y,
|
|
block_dim.z);
|
|
|
|
printf("Warm up %d times\n", stream_config.cold_niters_);
|
|
}
|
|
// warm up
|
|
preprocess();
|
|
for(int i = 0; i < stream_config.cold_niters_; ++i)
|
|
{
|
|
kernel<<<grid_dim, block_dim, lds_byte, stream_config.stream_id_>>>(args...);
|
|
hip_check_error(hipGetLastError());
|
|
}
|
|
|
|
const int nrepeat = stream_config.nrepeat_;
|
|
if(ck::EnvIsEnabled(CK_ENV(CK_LOGGING)))
|
|
{
|
|
printf("Start running %d times...\n", nrepeat);
|
|
}
|
|
hipEvent_t start, stop;
|
|
|
|
hip_check_error(hipEventCreate(&start));
|
|
hip_check_error(hipEventCreate(&stop));
|
|
|
|
hip_check_error(hipDeviceSynchronize());
|
|
hip_check_error(hipEventRecord(start, stream_config.stream_id_));
|
|
|
|
for(int i = 0; i < nrepeat; ++i)
|
|
{
|
|
preprocess();
|
|
kernel<<<grid_dim, block_dim, lds_byte, stream_config.stream_id_>>>(args...);
|
|
hip_check_error(hipGetLastError());
|
|
}
|
|
|
|
hip_check_error(hipEventRecord(stop, stream_config.stream_id_));
|
|
hip_check_error(hipEventSynchronize(stop));
|
|
|
|
float total_time = 0;
|
|
|
|
hip_check_error(hipEventElapsedTime(&total_time, start, stop));
|
|
|
|
hip_check_error(hipEventDestroy(start));
|
|
hip_check_error(hipEventDestroy(stop));
|
|
|
|
return total_time / nrepeat;
|
|
}
|
|
else
|
|
{
|
|
preprocess();
|
|
kernel<<<grid_dim, block_dim, lds_byte, stream_config.stream_id_>>>(args...);
|
|
hip_check_error(hipGetLastError());
|
|
|
|
return 0;
|
|
}
|
|
#else
|
|
kernel<<<grid_dim, block_dim, lds_byte, stream_config.stream_id_>>>(args...);
|
|
hip_check_error(hipGetLastError());
|
|
|
|
return 0;
|
|
#endif
|
|
}
|
|
|
|
template <typename... Args, typename F, typename PreProcessFunc>
|
|
float launch_and_time_kernel_with_preprocess_flush_cache(const StreamConfig& stream_config,
|
|
PreProcessFunc preprocess,
|
|
F kernel,
|
|
dim3 grid_dim,
|
|
dim3 block_dim,
|
|
std::size_t lds_byte,
|
|
Args... args)
|
|
{
|
|
#if CK_TIME_KERNEL
|
|
if(stream_config.time_kernel_)
|
|
{
|
|
auto run_flush_cache = [&]() { ck::utility::flush_icache(); };
|
|
if(ck::EnvIsEnabled(CK_ENV(CK_LOGGING)))
|
|
{
|
|
printf("%s: grid_dim {%u, %u, %u}, block_dim {%u, %u, %u} \n",
|
|
__func__,
|
|
grid_dim.x,
|
|
grid_dim.y,
|
|
grid_dim.z,
|
|
block_dim.x,
|
|
block_dim.y,
|
|
block_dim.z);
|
|
|
|
printf("Warm up %d times\n", stream_config.cold_niters_);
|
|
}
|
|
// Warm up
|
|
preprocess();
|
|
for(int i = 0; i < stream_config.cold_niters_; ++i)
|
|
{
|
|
kernel<<<grid_dim, block_dim, lds_byte, stream_config.stream_id_>>>(args...);
|
|
hip_check_error(hipGetLastError());
|
|
}
|
|
float total_time = 0, flush_cache_total_time = 0;
|
|
const int nrepeat = stream_config.nrepeat_;
|
|
// Main timing loop
|
|
{
|
|
if(ck::EnvIsEnabled(CK_ENV(CK_LOGGING)))
|
|
{
|
|
printf("Start running %d times...\n", nrepeat);
|
|
}
|
|
hipEvent_t start, stop;
|
|
|
|
hip_check_error(hipEventCreate(&start));
|
|
hip_check_error(hipEventCreate(&stop));
|
|
|
|
hip_check_error(hipDeviceSynchronize());
|
|
hip_check_error(hipEventRecord(start, stream_config.stream_id_));
|
|
|
|
for(int i = 0; i < nrepeat; ++i)
|
|
{
|
|
run_flush_cache();
|
|
preprocess();
|
|
kernel<<<grid_dim, block_dim, lds_byte, stream_config.stream_id_>>>(args...);
|
|
hip_check_error(hipGetLastError());
|
|
}
|
|
|
|
hip_check_error(hipEventRecord(stop, stream_config.stream_id_));
|
|
hip_check_error(hipEventSynchronize(stop));
|
|
|
|
hip_check_error(hipEventElapsedTime(&total_time, start, stop));
|
|
|
|
hip_check_error(hipEventDestroy(start));
|
|
hip_check_error(hipEventDestroy(stop));
|
|
}
|
|
// Flush cache timing loop
|
|
{
|
|
if(ck::EnvIsEnabled(CK_ENV(CK_LOGGING)))
|
|
{
|
|
printf("Profile flush cache %d times...\n", nrepeat);
|
|
}
|
|
hipEvent_t start, stop;
|
|
|
|
hip_check_error(hipEventCreate(&start));
|
|
hip_check_error(hipEventCreate(&stop));
|
|
|
|
hip_check_error(hipDeviceSynchronize());
|
|
hip_check_error(hipEventRecord(start, stream_config.stream_id_));
|
|
|
|
for(int i = 0; i < nrepeat; ++i)
|
|
{
|
|
run_flush_cache();
|
|
}
|
|
|
|
hip_check_error(hipEventRecord(stop, stream_config.stream_id_));
|
|
hip_check_error(hipEventSynchronize(stop));
|
|
|
|
hip_check_error(hipEventElapsedTime(&flush_cache_total_time, start, stop));
|
|
|
|
hip_check_error(hipEventDestroy(start));
|
|
hip_check_error(hipEventDestroy(stop));
|
|
}
|
|
// Exclude flush cache from result
|
|
return (total_time - flush_cache_total_time) / nrepeat;
|
|
}
|
|
else
|
|
{
|
|
preprocess();
|
|
kernel<<<grid_dim, block_dim, lds_byte, stream_config.stream_id_>>>(args...);
|
|
hip_check_error(hipGetLastError());
|
|
|
|
return 0;
|
|
}
|
|
#else
|
|
kernel<<<grid_dim, block_dim, lds_byte, stream_config.stream_id_>>>(args...);
|
|
hip_check_error(hipGetLastError());
|
|
|
|
return 0;
|
|
#endif
|
|
}
|
|
|
|
template <typename... Args, typename F>
|
|
float launch_and_time_kernel_flush_cache(const StreamConfig& stream_config,
|
|
F kernel,
|
|
dim3 grid_dim,
|
|
dim3 block_dim,
|
|
std::size_t lds_byte,
|
|
Args... args)
|
|
{
|
|
auto preprocess = [&]() {};
|
|
return launch_and_time_kernel_with_preprocess_flush_cache(
|
|
stream_config, preprocess, kernel, grid_dim, block_dim, lds_byte, args...);
|
|
}
|
|
|
|
} // namespace ck
|
|
|
|
#endif
|