mirror of
https://github.com/ROCm/composable_kernel.git
synced 2026-06-29 11:16:59 +00:00
add benchmark for cold and warm up
This commit is contained in:
@@ -11,6 +11,7 @@
|
||||
#include <iomanip>
|
||||
|
||||
#include "ck/host_utility/device_prop.hpp"
|
||||
#include "ck_tile/host/timer.hpp"
|
||||
|
||||
enum class Metric
|
||||
{
|
||||
@@ -136,6 +137,42 @@ class GemmProfiler
|
||||
return "Unknown";
|
||||
}
|
||||
|
||||
template <typename Timer>
|
||||
static float execute_kernel(const ck_tile::stream_config& stream,
|
||||
const std::function<void()>& kernel_launch)
|
||||
{
|
||||
Timer timer;
|
||||
|
||||
auto flush_cache = [&] {
|
||||
#if defined(__HIP_DEVICE_COMPILE__) && __HIP_DEVICE_COMPILE__
|
||||
__builtin_amdgcn_s_dcache_wb();
|
||||
__builtin_amdgcn_s_dcache_inv();
|
||||
#endif
|
||||
};
|
||||
|
||||
// Cold iterations
|
||||
for(int i = 0; i < stream.cold_niters_; ++i)
|
||||
{
|
||||
timer.start(stream.stream_id_);
|
||||
kernel_launch();
|
||||
timer.stop(stream.stream_id_);
|
||||
}
|
||||
|
||||
// Warm iterations with measurement
|
||||
std::vector<float> measured_times;
|
||||
for(int i = 0; i < stream.nrepeat_; ++i)
|
||||
{
|
||||
timer.start(stream.stream_id_);
|
||||
kernel_launch();
|
||||
timer.stop(stream.stream_id_);
|
||||
measured_times.push_back(timer.duration());
|
||||
flush_cache();
|
||||
}
|
||||
|
||||
return std::accumulate(measured_times.begin(), measured_times.end(), 0.0f) /
|
||||
measured_times.size();
|
||||
}
|
||||
|
||||
template <typename Kernel>
|
||||
void benchmark_kernel(ck_tile::DeviceMem& c_m_n_dev_buf,
|
||||
ck_tile::HostTensor<CDataType>& c_m_n_host_result,
|
||||
@@ -163,22 +200,30 @@ class GemmProfiler
|
||||
|
||||
KernelInstance kernel_instance{description, problem, {-1.0f, -1.0f, -1.0f}};
|
||||
|
||||
float avg_time = Kernel::launch(args, stream);
|
||||
c_m_n_dev_buf.FromDevice(c_m_n_dev_result.data());
|
||||
auto kernel_launch = [&] { Kernel::launch(args, stream); };
|
||||
|
||||
float avg_time = 0.f;
|
||||
if(stream.is_gpu_timer_)
|
||||
{
|
||||
avg_time = execute_kernel<ck_tile::gpu_timer>(stream, kernel_launch);
|
||||
}
|
||||
else
|
||||
{
|
||||
avg_time = execute_kernel<ck_tile::cpu_timer>(stream, kernel_launch);
|
||||
}
|
||||
|
||||
std::size_t flop = std::size_t(2) * args.M * args.N * args.K;
|
||||
std::size_t num_byte = sizeof(ADataType) * args.M * args.K +
|
||||
sizeof(BDataType) * args.N * args.K +
|
||||
sizeof(CDataType) * args.M * args.N;
|
||||
float tflops = static_cast<float>(flop) / 1.E9 / avg_time;
|
||||
float gb_per_sec = num_byte / 1.E6 / avg_time;
|
||||
|
||||
kernel_instance.perf_result.latency = avg_time;
|
||||
kernel_instance.perf_result.tflops = tflops;
|
||||
kernel_instance.perf_result.bandwidth = gb_per_sec;
|
||||
kernel_instance.perf_result.tflops = static_cast<float>(flop) / 1.E9 / avg_time;
|
||||
kernel_instance.perf_result.bandwidth = num_byte / 1.E6 / avg_time;
|
||||
|
||||
std::cout << kernel_instance << std::endl;
|
||||
|
||||
c_m_n_dev_buf.FromDevice(c_m_n_dev_result.data());
|
||||
bool verified_correct =
|
||||
!verify || compare(args.K, args.k_batch, c_m_n_dev_result, c_m_n_host_result);
|
||||
|
||||
|
||||
@@ -213,14 +213,14 @@ def get_gpu_name_by_id(gpu_id: int = 0) -> str:
|
||||
check=True
|
||||
)
|
||||
|
||||
arch_pattern = r'gfx\d{3}[a-z]?'
|
||||
arch_pattern = r'gfx\d{3,4}[a-z]?'
|
||||
match = re.search(arch_pattern, result.stdout.lower())
|
||||
return match.group() if match else ""
|
||||
|
||||
except (FileNotFoundError, subprocess.CalledProcessError) as e:
|
||||
print(f"[System Error] {str(e)}")
|
||||
print(f"System Error: {str(e)}, when get the name of gpu:{gpu_id}")
|
||||
return ""
|
||||
except Exception as e:
|
||||
print(f"[Runtime Exception] {str(e)}")
|
||||
print(f"Runtime Exception: {str(e)}, when get the name of gpu:{gpu_id}")
|
||||
return ""
|
||||
|
||||
|
||||
@@ -173,7 +173,7 @@ void run(const ck_tile::ArgParser& arg_parser)
|
||||
structured_sparsity,
|
||||
trait,
|
||||
gemm_args,
|
||||
ck_tile::stream_config{nullptr, true, 1, n_warmup, n_repeat});
|
||||
ck_tile::stream_config{nullptr, false, 0, n_warmup, n_repeat});
|
||||
|
||||
return;
|
||||
}
|
||||
|
||||
@@ -127,7 +127,7 @@ using CLayout = {LAYOUT_MAP[self.config.problem.layout_map['matrix_c']]};
|
||||
|
||||
def _generate_all_trait_files(self):
|
||||
"""Generate all kernel traits into files."""
|
||||
if not self.all_trait_names: # Check if the list is empty
|
||||
if not self.all_trait_names:
|
||||
self._generate_all_traits()
|
||||
for trait in self.all_trait_names:
|
||||
self._generate_trait_file(trait)
|
||||
@@ -177,7 +177,7 @@ struct GemmKernel {{
|
||||
static constexpr bool kPadN = {pad_n};
|
||||
static constexpr bool kPadK = {pad_k};
|
||||
|
||||
static float launch(ck_tile::GemmHostArgs& args, const ck_tile::stream_config& s) {{
|
||||
static float launch(ck_tile::GemmHostArgs& args, const ck_tile::stream_config& stream) {{
|
||||
static constexpr bool permuteA = false;
|
||||
static constexpr bool permuteB = false;
|
||||
static constexpr bool DoubleSmemBuffer ={"true" if pipeline == "compv4" else "false"};
|
||||
@@ -249,7 +249,7 @@ struct GemmKernel {{
|
||||
throw std::runtime_error("Wrong! Arguments not supported! Skipping gemm!");
|
||||
}}
|
||||
|
||||
if(s.log_level_ > 0)
|
||||
if(stream.log_level_ > 0)
|
||||
{{
|
||||
std::cout << "Launching kernel with args:"
|
||||
<< " grid: {{" << grids.x << ", " << grids.y << ", " << grids.z << "}}"
|
||||
@@ -257,7 +257,7 @@ struct GemmKernel {{
|
||||
<< std::endl;
|
||||
}}
|
||||
|
||||
ave_time = ck_tile::launch_kernel(s,
|
||||
ave_time = ck_tile::launch_kernel(stream,
|
||||
ck_tile::make_kernel<blocks.x, kBlockPerCu>(
|
||||
Kernel{{}}, grids, blocks, 0, kargs));
|
||||
return ave_time;
|
||||
@@ -423,7 +423,7 @@ struct GemmDispatcher {
|
||||
ck_tile::HostTensor<CDataType>&,
|
||||
int,
|
||||
ck_tile::GemmHostArgs&,
|
||||
const ck_tile::stream_config&)>>
|
||||
const ck_tile::stream_config& stream)>>
|
||||
kernel_map;
|
||||
return kernel_map;
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user