Add benchmark for cold and warm-up iterations

This commit is contained in:
Yanxing-Shi
2025-05-15 11:11:18 +00:00
parent cfbbae9bd6
commit 62d2a63f43
4 changed files with 60 additions and 15 deletions

View File

@@ -11,6 +11,7 @@
#include <iomanip>
#include "ck/host_utility/device_prop.hpp"
#include "ck_tile/host/timer.hpp"
enum class Metric
{
@@ -136,6 +137,42 @@ class GemmProfiler
return "Unknown";
}
/// @brief Runs a kernel-launch callback for the configured number of cold and
///        measured iterations and returns the mean measured duration.
///
/// @tparam Timer Timer type providing start(stream_id), stop(stream_id) and
///               duration(); callers instantiate it with ck_tile::gpu_timer or
///               ck_tile::cpu_timer.
/// @param stream        Supplies stream_id_, cold_niters_ and nrepeat_.
/// @param kernel_launch Callable that performs one kernel launch.
/// @return Mean of Timer::duration() over stream.nrepeat_ iterations (in whatever
///         unit the timer reports — presumably milliseconds, TODO confirm), or
///         0.0f when stream.nrepeat_ is not positive.
template <typename Timer>
static float execute_kernel(const ck_tile::stream_config& stream,
                            const std::function<void()>& kernel_launch)
{
    Timer timer;

    // NOTE(review): the body below is only compiled when __HIP_DEVICE_COMPILE__ is
    // set, so in the host compilation pass this lambda appears to be a no-op and no
    // cache is actually flushed between iterations — confirm and replace with a
    // host-side flush mechanism if a cold cache is required.
    auto flush_cache = [] {
#if defined(__HIP_DEVICE_COMPILE__) && __HIP_DEVICE_COMPILE__
        __builtin_amdgcn_s_dcache_wb();
        __builtin_amdgcn_s_dcache_inv();
#endif
    };

    // Cold iterations: the kernel is launched and bracketed by the timer, but the
    // durations are intentionally discarded.
    for(int i = 0; i < stream.cold_niters_; ++i)
    {
        timer.start(stream.stream_id_);
        kernel_launch();
        timer.stop(stream.stream_id_);
    }

    // Without at least one measured iteration the mean would be 0/0 (NaN), which
    // would then poison every derived metric; report 0 instead.
    if(stream.nrepeat_ <= 0)
    {
        return 0.0f;
    }

    // Measured iterations: accumulate a running sum instead of storing every
    // sample, since only the mean is needed.
    float total_time = 0.0f;
    for(int i = 0; i < stream.nrepeat_; ++i)
    {
        timer.start(stream.stream_id_);
        kernel_launch();
        timer.stop(stream.stream_id_);
        total_time += timer.duration();
        flush_cache();
    }

    return total_time / static_cast<float>(stream.nrepeat_);
}
template <typename Kernel>
void benchmark_kernel(ck_tile::DeviceMem& c_m_n_dev_buf,
ck_tile::HostTensor<CDataType>& c_m_n_host_result,
@@ -163,22 +200,30 @@ class GemmProfiler
KernelInstance kernel_instance{description, problem, {-1.0f, -1.0f, -1.0f}};
float avg_time = Kernel::launch(args, stream);
c_m_n_dev_buf.FromDevice(c_m_n_dev_result.data());
auto kernel_launch = [&] { Kernel::launch(args, stream); };
float avg_time = 0.f;
if(stream.is_gpu_timer_)
{
avg_time = execute_kernel<ck_tile::gpu_timer>(stream, kernel_launch);
}
else
{
avg_time = execute_kernel<ck_tile::cpu_timer>(stream, kernel_launch);
}
std::size_t flop = std::size_t(2) * args.M * args.N * args.K;
std::size_t num_byte = sizeof(ADataType) * args.M * args.K +
sizeof(BDataType) * args.N * args.K +
sizeof(CDataType) * args.M * args.N;
float tflops = static_cast<float>(flop) / 1.E9 / avg_time;
float gb_per_sec = num_byte / 1.E6 / avg_time;
kernel_instance.perf_result.latency = avg_time;
kernel_instance.perf_result.tflops = tflops;
kernel_instance.perf_result.bandwidth = gb_per_sec;
kernel_instance.perf_result.tflops = static_cast<float>(flop) / 1.E9 / avg_time;
kernel_instance.perf_result.bandwidth = num_byte / 1.E6 / avg_time;
std::cout << kernel_instance << std::endl;
c_m_n_dev_buf.FromDevice(c_m_n_dev_result.data());
bool verified_correct =
!verify || compare(args.K, args.k_batch, c_m_n_dev_result, c_m_n_host_result);

View File

@@ -213,14 +213,14 @@ def get_gpu_name_by_id(gpu_id: int = 0) -> str:
check=True
)
arch_pattern = r'gfx\d{3}[a-z]?'
arch_pattern = r'gfx\d{3,4}[a-z]?'
match = re.search(arch_pattern, result.stdout.lower())
return match.group() if match else ""
except (FileNotFoundError, subprocess.CalledProcessError) as e:
print(f"[System Error] {str(e)}")
print(f"System Error: {str(e)}, when get the name of gpu:{gpu_id}")
return ""
except Exception as e:
print(f"[Runtime Exception] {str(e)}")
print(f"Runtime Exception: {str(e)}, when get the name of gpu:{gpu_id}")
return ""

View File

@@ -173,7 +173,7 @@ void run(const ck_tile::ArgParser& arg_parser)
structured_sparsity,
trait,
gemm_args,
ck_tile::stream_config{nullptr, true, 1, n_warmup, n_repeat});
ck_tile::stream_config{nullptr, false, 0, n_warmup, n_repeat});
return;
}

View File

@@ -127,7 +127,7 @@ using CLayout = {LAYOUT_MAP[self.config.problem.layout_map['matrix_c']]};
def _generate_all_trait_files(self):
"""Generate all kernel traits into files."""
if not self.all_trait_names: # Check if the list is empty
if not self.all_trait_names:
self._generate_all_traits()
for trait in self.all_trait_names:
self._generate_trait_file(trait)
@@ -177,7 +177,7 @@ struct GemmKernel {{
static constexpr bool kPadN = {pad_n};
static constexpr bool kPadK = {pad_k};
static float launch(ck_tile::GemmHostArgs& args, const ck_tile::stream_config& s) {{
static float launch(ck_tile::GemmHostArgs& args, const ck_tile::stream_config& stream) {{
static constexpr bool permuteA = false;
static constexpr bool permuteB = false;
static constexpr bool DoubleSmemBuffer ={"true" if pipeline == "compv4" else "false"};
@@ -249,7 +249,7 @@ struct GemmKernel {{
throw std::runtime_error("Wrong! Arguments not supported! Skipping gemm!");
}}
if(s.log_level_ > 0)
if(stream.log_level_ > 0)
{{
std::cout << "Launching kernel with args:"
<< " grid: {{" << grids.x << ", " << grids.y << ", " << grids.z << "}}"
@@ -257,7 +257,7 @@ struct GemmKernel {{
<< std::endl;
}}
ave_time = ck_tile::launch_kernel(s,
ave_time = ck_tile::launch_kernel(stream,
ck_tile::make_kernel<blocks.x, kBlockPerCu>(
Kernel{{}}, grids, blocks, 0, kargs));
return ave_time;
@@ -423,7 +423,7 @@ struct GemmDispatcher {
ck_tile::HostTensor<CDataType>&,
int,
ck_tile::GemmHostArgs&,
const ck_tile::stream_config&)>>
const ck_tile::stream_config& stream)>>
kernel_map;
return kernel_map;
}