mirror of
https://github.com/ROCm/composable_kernel.git
synced 2026-06-29 19:28:33 +00:00
recover benchmark_gemm and fix
This commit is contained in:
@@ -12,7 +12,7 @@ User can provide kernel configuration such as tile size, warp size, padding, pip
|
||||
mkdir build && cd build
|
||||
# build composable kernel
|
||||
## replace <arch> with the appropriate architecture (example gfx942) or leave blank
|
||||
## "USE_CUSTOM_CONFIG=ON" for default configuration, "USE_CUSTOM_CONFIG=ON" for user provided configuration
|
||||
## "USE_CUSTOM_CONFIG=OFF" for default configuration, "USE_CUSTOM_CONFIG=ON" for user provided configuration
|
||||
sh ../script/cmake-ck-dev.sh ../ <arch> -D USE_CUSTOM_CONFIG=ON
|
||||
# generate the executable
|
||||
make tile_engine_gemm -j
|
||||
@@ -29,22 +29,23 @@ rm -rf tile_engine/ && make tile_engine_gemm -j # rebuild
|
||||
-m The value for m dimension. Default is 3840.
|
||||
-n The value for n dimension. Default is 4096.
|
||||
-k The value for k dimension. Default is 2048.
|
||||
-stride_a The stride value for tensor A. Default is 0.
|
||||
-stride_b The stride value for tensor B. Default is 0.
|
||||
-stride_c The stride value for tensor C Default is 0.
|
||||
-stride_a The stride value for tensor A. Default is 0.
|
||||
-stride_b The stride value for tensor B. Default is 0.
|
||||
-stride_c The stride value for tensor C Default is 0.
|
||||
-split_k The split value for k dimension. Default is 1.
|
||||
-v The type of validation. Set to 0 for no validation, 1 for validation on CPU, or 2 for validation on GPU. Default is 2, validation on GPU.
|
||||
-warmup The number of iterations before benchmark the kernel. Default is 50.
|
||||
-repeat The number of iterations to benchmark the kernel. Default is 100.
|
||||
-log Wether output kernel instance information or not. Possible values are true or false. Default is false.
|
||||
-warmup The number of iterations before benchmark the kernel. Default is 50.
|
||||
-repeat The number of iterations to benchmark the kernel. Default is 100.
|
||||
-timer The type of timer. Possible values are gpu timer or cpu timer. Default is gpu timer.
|
||||
-init The method of tensor initialization. Set to 0 for random, to 1 for linear, or 2 for constant(1). Default is 0, random.
|
||||
-metric Metric with which to measure kernel performance. Set to 0 for latency, 1 for tflops, or 2 for bandwidth. Default is 0, latency.
|
||||
-structured_sparsity whether use sparsity kernel or not. Possible values are true or false. Default is false.
|
||||
-pipeline The type of pipeline. Possible values are compv3, compv4 or mem. Default is compv3.
|
||||
-epilogue The type of epilogue. Possible values are cshuffle or default. Default is cshuffle.
|
||||
-pad_m Whether pad or not in m direction. Possible values are true or false. Default is false.
|
||||
-pad_n Whether pad or not in n direction. Possible values are true or false. Default is false.
|
||||
-pad_k Whether pad or not in k direction. Possible values are true or false. Default is false.
|
||||
-init The method of tensor initialization. Set to 0 for random, to 1 for linear, or 2 for constant(1). Default is 0, random.
|
||||
-metric Metric with which to measure kernel performance. Set to 0 for latency, 1 for tflops, or 2 for bandwidth. Default is 0, latency.
|
||||
-structured_sparsity whether use sparsity kernel or not. Possible values are true or false. Default is false.
|
||||
-pipeline The type of pipeline. Possible values are compv3, compv4 or mem. Default is compv3.
|
||||
-epilogue The type of epilogue. Possible values are cshuffle or default. Default is cshuffle.
|
||||
-pad_m Whether pad or not in m direction. Possible values are true or false. Default is false.
|
||||
-pad_n Whether pad or not in n direction. Possible values are true or false. Default is false.
|
||||
-pad_k Whether pad or not in k direction. Possible values are true or false. Default is false.
|
||||
|
||||
Note: pipeline, scheduler, epilogue, pad_m, pad_n, pad_k should be one of the options specified in user_provided_config.json
|
||||
```
|
||||
|
||||
@@ -137,44 +137,6 @@ class GemmProfiler
|
||||
return "Unknown";
|
||||
}
|
||||
|
||||
template <typename Timer>
|
||||
static float time_kernel(const ck_tile::stream_config& stream,
|
||||
const std::function<void()>& kernel_launch_func)
|
||||
{
|
||||
Timer timer;
|
||||
|
||||
// flush cache
|
||||
auto flush_cache = [&] {
|
||||
#if defined(__HIP_DEVICE_COMPILE__) && __HIP_DEVICE_COMPILE__
|
||||
asm volatile("s_dcache_wb; s_dcache_inv; s_icache_inv" ::: "memory");
|
||||
#endif
|
||||
};
|
||||
|
||||
// Cold runs - excludes from final timing measurement
|
||||
for(int i = 0; i < stream.cold_niters_; ++i)
|
||||
{
|
||||
timer.start(stream.stream_id_);
|
||||
kernel_launch_func();
|
||||
timer.stop(stream.stream_id_);
|
||||
}
|
||||
|
||||
// Hot runs - actual performance measurement
|
||||
std::vector<float> measured_times;
|
||||
for(int i = 0; i < stream.nrepeat_; ++i)
|
||||
{
|
||||
timer.start(stream.stream_id_);
|
||||
kernel_launch_func();
|
||||
timer.stop(stream.stream_id_);
|
||||
measured_times.push_back(timer.duration());
|
||||
// Periodic cache flushing
|
||||
if(i % 4 == 0)
|
||||
flush_cache();
|
||||
}
|
||||
|
||||
return std::accumulate(measured_times.begin(), measured_times.end(), 0.0f) /
|
||||
measured_times.size();
|
||||
}
|
||||
|
||||
template <typename Kernel>
|
||||
void benchmark_kernel(ck_tile::DeviceMem& c_m_n_dev_buf,
|
||||
ck_tile::HostTensor<CDataType>& c_m_n_host_result,
|
||||
@@ -202,18 +164,7 @@ class GemmProfiler
|
||||
|
||||
KernelInstance kernel_instance{description, problem, {-1.0f, -1.0f, -1.0f}};
|
||||
|
||||
auto kernel_launch_func = [&] { Kernel::launch(args, stream); };
|
||||
|
||||
float avg_time = 0.f;
|
||||
if(stream.is_gpu_timer_)
|
||||
{
|
||||
avg_time = time_kernel<ck_tile::gpu_timer>(stream, kernel_launch_func);
|
||||
}
|
||||
else
|
||||
{
|
||||
avg_time = time_kernel<ck_tile::cpu_timer>(stream, kernel_launch_func);
|
||||
}
|
||||
|
||||
float avg_time = Kernel::launch(args, stream);
|
||||
std::size_t flop = std::size_t(2) * args.M * args.N * args.K;
|
||||
std::size_t num_byte = sizeof(ADataType) * args.M * args.K +
|
||||
sizeof(BDataType) * args.N * args.K +
|
||||
@@ -223,7 +174,10 @@ class GemmProfiler
|
||||
kernel_instance.perf_result.tflops = static_cast<float>(flop) / 1.E9 / avg_time;
|
||||
kernel_instance.perf_result.bandwidth = num_byte / 1.E6 / avg_time;
|
||||
|
||||
std::cout << kernel_instance << std::endl;
|
||||
if(stream.log_level_ > 0)
|
||||
{
|
||||
std::cout << kernel_instance << std::endl;
|
||||
}
|
||||
|
||||
c_m_n_dev_buf.FromDevice(c_m_n_dev_result.data());
|
||||
bool verified_correct =
|
||||
|
||||
@@ -88,9 +88,16 @@ RUN_MEM = """
|
||||
RunSplitk(ck_tile::bool_constant<true>{},
|
||||
ck_tile::integral_constant<ck_tile::TailNumber, ck_tile::TailNumber::Full>{});
|
||||
}
|
||||
// Variadic call using fold expression
|
||||
|
||||
auto check_tail = [&](auto... TNs) {
|
||||
(try_run< BaseGemmPipeline, decltype(TNs)::value>(tail_num), ...);
|
||||
([&]{
|
||||
if constexpr(BaseGemmPipeline::PrefetchStages > static_cast<int>(decltype(TNs)::value)) {
|
||||
if(tail_num == decltype(TNs)::value) {
|
||||
RunSplitk(ck_tile::bool_constant<true>{},
|
||||
ck_tile::integral_constant<ck_tile::TailNumber, decltype(TNs)::value>{});
|
||||
}
|
||||
}
|
||||
}(), ...);
|
||||
};
|
||||
|
||||
check_tail(
|
||||
@@ -200,28 +207,31 @@ def element_size(data_type: str) -> float:
|
||||
else:
|
||||
raise ValueError(f"Unsupported data type: {data_type}")
|
||||
|
||||
GPU_NAME_PATTERN = re.compile(r'Name:\s*(gfx\d+\w*)')
|
||||
|
||||
@lru_cache(maxsize=1)
|
||||
def get_gpu_name_by_id(gpu_id: int = 0) -> str:
|
||||
"""Retrieve GPU name (e.g. gfx90a) by device ID"""
|
||||
try:
|
||||
cmd = ['rocm-smi', '--showproductname', '-d', str(gpu_id)]
|
||||
result = subprocess.run(
|
||||
cmd,
|
||||
stdout=subprocess.PIPE,
|
||||
stderr=subprocess.PIPE,
|
||||
output = subprocess.check_output(
|
||||
["rocminfo"],
|
||||
text=True,
|
||||
check=True
|
||||
stderr=subprocess.PIPE,
|
||||
timeout=5
|
||||
)
|
||||
|
||||
arch_pattern = r'gfx\d{3,4}[a-z]?'
|
||||
match = re.search(arch_pattern, result.stdout.lower())
|
||||
return match.group() if match else ""
|
||||
|
||||
except (FileNotFoundError, subprocess.CalledProcessError) as e:
|
||||
print(f"System Error: {str(e)}, when get the name of gpu:{gpu_id}")
|
||||
if matches := GPU_NAME_PATTERN.finditer(output):
|
||||
gpu_list = [m.group(1) for m in matches]
|
||||
return gpu_list[gpu_id] if gpu_id < len(gpu_list) else ""
|
||||
|
||||
return ""
|
||||
|
||||
except subprocess.CalledProcessError as e:
|
||||
print(f"GPU query failed (exit {e.returncode}): {e.stderr.strip()}")
|
||||
except FileNotFoundError:
|
||||
print("ROCm tools not installed (requires rocminfo)")
|
||||
except subprocess.TimeoutExpired:
|
||||
print("GPU query timeout (5s)")
|
||||
except Exception as e:
|
||||
print(
|
||||
f"Runtime Exception: {str(e)}, when get the name of gpu:{gpu_id}")
|
||||
return ""
|
||||
print(f"GPU detection error: {str(e)}")
|
||||
|
||||
return ""
|
||||
@@ -34,12 +34,16 @@
|
||||
"tile_config": {
|
||||
"tile_m": {
|
||||
"values": [
|
||||
256
|
||||
256,
|
||||
128,
|
||||
64
|
||||
]
|
||||
},
|
||||
"tile_n": {
|
||||
"values": [
|
||||
256
|
||||
256,
|
||||
128,
|
||||
64
|
||||
]
|
||||
},
|
||||
"tile_k": {
|
||||
|
||||
@@ -48,6 +48,7 @@ void run(const ck_tile::ArgParser& arg_parser)
|
||||
ck_tile::index_t stride_B = arg_parser.get_int("stride_b");
|
||||
ck_tile::index_t stride_C = arg_parser.get_int("stride_c");
|
||||
|
||||
bool log = arg_parser.get_int("log");
|
||||
int n_warmup = arg_parser.get_int("warmup");
|
||||
int n_repeat = arg_parser.get_int("repeat");
|
||||
int verify = arg_parser.get_int("v");
|
||||
@@ -173,7 +174,7 @@ void run(const ck_tile::ArgParser& arg_parser)
|
||||
structured_sparsity,
|
||||
trait,
|
||||
gemm_args,
|
||||
ck_tile::stream_config{nullptr, false, 0, n_warmup, n_repeat});
|
||||
ck_tile::stream_config{nullptr, true, log, n_warmup, n_repeat});
|
||||
|
||||
return;
|
||||
}
|
||||
|
||||
@@ -118,6 +118,10 @@ inline auto create_args(int argc, char* argv[])
|
||||
"2",
|
||||
"The type of validation. Set to 0 for no validation, 1 for validation on CPU, or 2 "
|
||||
"for validation on GPU. Default is 2, validation on GPU.")
|
||||
.insert("log",
|
||||
"false",
|
||||
"Wether output kernel instance information or not. Possible values are true or "
|
||||
"false. Default is false")
|
||||
.insert(
|
||||
"warmup", "50", "The number of iterations before benchmark the kernel. Default is 50.")
|
||||
.insert(
|
||||
|
||||
@@ -109,6 +109,7 @@ class GemmCodeGenerator:
|
||||
// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "ck_tile/core.hpp"
|
||||
|
||||
// Data types
|
||||
@@ -141,6 +142,8 @@ using CLayout = {LAYOUT_MAP[self.config.problem.layout_map['matrix_c']]};
|
||||
content = f"""// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "gemm_common.hpp"
|
||||
#include "ck_tile/ops/gemm.hpp"
|
||||
#include "ck_tile/ops/epilogue.hpp"
|
||||
@@ -159,15 +162,7 @@ namespace {trait} {{
|
||||
pad_m: str, pad_n: str, pad_k: str) -> str:
|
||||
"""Generate the code block of kernel struct"""
|
||||
return f"""
|
||||
template <typename Pipeline, ck_tile::TailNumber TN>
|
||||
void try_run(ck_tile::TailNumber tn) {{
|
||||
if constexpr (Pipeline::PrefetchStages > static_cast<int>(TN) - 1) {{
|
||||
if (tn == TN) {{
|
||||
RunSplitk(ck_tile::bool_constant<true>{{}},
|
||||
ck_tile::integral_constant<ck_tile::TailNumber, TN>{{}});
|
||||
}}
|
||||
}}
|
||||
}}
|
||||
|
||||
template <int TileM, int TileN, int TileK,
|
||||
int WarpM, int WarpN, int WarpK,
|
||||
int WarpTileM, int WarpTileN, int WarpTileK,
|
||||
|
||||
Reference in New Issue
Block a user