recover benchmark_gemm and fix

This commit is contained in:
Yanxing-Shi
2025-05-16 10:37:59 +00:00
parent fb63cd5923
commit 012c77125a
7 changed files with 64 additions and 95 deletions

View File

@@ -12,7 +12,7 @@ User can provide kernel configuration such as tile size, warp size, padding, pip
mkdir build && cd build
# build composable kernel
## replace <arch> with the appropriate architecture (example gfx942) or leave blank
## "USE_CUSTOM_CONFIG=ON" for default configuration, "USE_CUSTOM_CONFIG=ON" for user provided configuration
## "USE_CUSTOM_CONFIG=OFF" for default configuration, "USE_CUSTOM_CONFIG=ON" for user provided configuration
sh ../script/cmake-ck-dev.sh ../ <arch> -D USE_CUSTOM_CONFIG=ON
# generate the executable
make tile_engine_gemm -j
@@ -29,22 +29,23 @@ rm -rf tile_engine/ && make tile_engine_gemm -j # rebuild
-m The value for m dimension. Default is 3840.
-n The value for n dimension. Default is 4096.
-k The value for k dimension. Default is 2048.
-stride_a The stride value for tensor A. Default is 0.
-stride_b The stride value for tensor B. Default is 0.
-stride_c The stride value for tensor C Default is 0.
-stride_a The stride value for tensor A. Default is 0.
-stride_b The stride value for tensor B. Default is 0.
-stride_c The stride value for tensor C Default is 0.
-split_k The split value for k dimension. Default is 1.
-v The type of validation. Set to 0 for no validation, 1 for validation on CPU, or 2 for validation on GPU. Default is 2, validation on GPU.
-warmup The number of iterations before benchmark the kernel. Default is 50.
-repeat The number of iterations to benchmark the kernel. Default is 100.
-log Wether output kernel instance information or not. Possible values are true or false. Default is false.
-warmup The number of iterations before benchmark the kernel. Default is 50.
-repeat The number of iterations to benchmark the kernel. Default is 100.
-timer The type of timer. Possible values are gpu timer or cpu timer. Default is gpu timer.
-init The method of tensor initialization. Set to 0 for random, to 1 for linear, or 2 for constant(1). Default is 0, random.
-metric Metric with which to measure kernel performance. Set to 0 for latency, 1 for tflops, or 2 for bandwidth. Default is 0, latency.
-structured_sparsity whether use sparsity kernel or not. Possible values are true or false. Default is false.
-pipeline The type of pipeline. Possible values are compv3, compv4 or mem. Default is compv3.
-epilogue The type of epilogue. Possible values are cshuffle or default. Default is cshuffle.
-pad_m Whether pad or not in m direction. Possible values are true or false. Default is false.
-pad_n Whether pad or not in n direction. Possible values are true or false. Default is false.
-pad_k Whether pad or not in k direction. Possible values are true or false. Default is false.
-init The method of tensor initialization. Set to 0 for random, to 1 for linear, or 2 for constant(1). Default is 0, random.
-metric Metric with which to measure kernel performance. Set to 0 for latency, 1 for tflops, or 2 for bandwidth. Default is 0, latency.
-structured_sparsity whether use sparsity kernel or not. Possible values are true or false. Default is false.
-pipeline The type of pipeline. Possible values are compv3, compv4 or mem. Default is compv3.
-epilogue The type of epilogue. Possible values are cshuffle or default. Default is cshuffle.
-pad_m Whether pad or not in m direction. Possible values are true or false. Default is false.
-pad_n Whether pad or not in n direction. Possible values are true or false. Default is false.
-pad_k Whether pad or not in k direction. Possible values are true or false. Default is false.
Note: pipeline, scheduler, epilogue, pad_m, pad_n, pad_k should be one of the options specified in user_provided_config.json
```

View File

@@ -137,44 +137,6 @@ class GemmProfiler
return "Unknown";
}
template <typename Timer>
static float time_kernel(const ck_tile::stream_config& stream,
const std::function<void()>& kernel_launch_func)
{
Timer timer;
// flush cache
auto flush_cache = [&] {
#if defined(__HIP_DEVICE_COMPILE__) && __HIP_DEVICE_COMPILE__
asm volatile("s_dcache_wb; s_dcache_inv; s_icache_inv" ::: "memory");
#endif
};
// Cold runs - excludes from final timing measurement
for(int i = 0; i < stream.cold_niters_; ++i)
{
timer.start(stream.stream_id_);
kernel_launch_func();
timer.stop(stream.stream_id_);
}
// Hot runs - actual performance measurement
std::vector<float> measured_times;
for(int i = 0; i < stream.nrepeat_; ++i)
{
timer.start(stream.stream_id_);
kernel_launch_func();
timer.stop(stream.stream_id_);
measured_times.push_back(timer.duration());
// Periodic cache flushing
if(i % 4 == 0)
flush_cache();
}
return std::accumulate(measured_times.begin(), measured_times.end(), 0.0f) /
measured_times.size();
}
template <typename Kernel>
void benchmark_kernel(ck_tile::DeviceMem& c_m_n_dev_buf,
ck_tile::HostTensor<CDataType>& c_m_n_host_result,
@@ -202,18 +164,7 @@ class GemmProfiler
KernelInstance kernel_instance{description, problem, {-1.0f, -1.0f, -1.0f}};
auto kernel_launch_func = [&] { Kernel::launch(args, stream); };
float avg_time = 0.f;
if(stream.is_gpu_timer_)
{
avg_time = time_kernel<ck_tile::gpu_timer>(stream, kernel_launch_func);
}
else
{
avg_time = time_kernel<ck_tile::cpu_timer>(stream, kernel_launch_func);
}
float avg_time = Kernel::launch(args, stream);
std::size_t flop = std::size_t(2) * args.M * args.N * args.K;
std::size_t num_byte = sizeof(ADataType) * args.M * args.K +
sizeof(BDataType) * args.N * args.K +
@@ -223,7 +174,10 @@ class GemmProfiler
kernel_instance.perf_result.tflops = static_cast<float>(flop) / 1.E9 / avg_time;
kernel_instance.perf_result.bandwidth = num_byte / 1.E6 / avg_time;
std::cout << kernel_instance << std::endl;
if(stream.log_level_ > 0)
{
std::cout << kernel_instance << std::endl;
}
c_m_n_dev_buf.FromDevice(c_m_n_dev_result.data());
bool verified_correct =

View File

@@ -88,9 +88,16 @@ RUN_MEM = """
RunSplitk(ck_tile::bool_constant<true>{},
ck_tile::integral_constant<ck_tile::TailNumber, ck_tile::TailNumber::Full>{});
}
// Variadic call using fold expression
auto check_tail = [&](auto... TNs) {
(try_run< BaseGemmPipeline, decltype(TNs)::value>(tail_num), ...);
([&]{
if constexpr(BaseGemmPipeline::PrefetchStages > static_cast<int>(decltype(TNs)::value)) {
if(tail_num == decltype(TNs)::value) {
RunSplitk(ck_tile::bool_constant<true>{},
ck_tile::integral_constant<ck_tile::TailNumber, decltype(TNs)::value>{});
}
}
}(), ...);
};
check_tail(
@@ -200,28 +207,31 @@ def element_size(data_type: str) -> float:
else:
raise ValueError(f"Unsupported data type: {data_type}")
GPU_NAME_PATTERN = re.compile(r'Name:\s*(gfx\d+\w*)')
@lru_cache(maxsize=1)
def get_gpu_name_by_id(gpu_id: int = 0) -> str:
"""Retrieve GPU name (e.g. gfx90a) by device ID"""
try:
cmd = ['rocm-smi', '--showproductname', '-d', str(gpu_id)]
result = subprocess.run(
cmd,
stdout=subprocess.PIPE,
stderr=subprocess.PIPE,
output = subprocess.check_output(
["rocminfo"],
text=True,
check=True
stderr=subprocess.PIPE,
timeout=5
)
arch_pattern = r'gfx\d{3,4}[a-z]?'
match = re.search(arch_pattern, result.stdout.lower())
return match.group() if match else ""
except (FileNotFoundError, subprocess.CalledProcessError) as e:
print(f"System Error: {str(e)}, when get the name of gpu:{gpu_id}")
if matches := GPU_NAME_PATTERN.finditer(output):
gpu_list = [m.group(1) for m in matches]
return gpu_list[gpu_id] if gpu_id < len(gpu_list) else ""
return ""
except subprocess.CalledProcessError as e:
print(f"GPU query failed (exit {e.returncode}): {e.stderr.strip()}")
except FileNotFoundError:
print("ROCm tools not installed (requires rocminfo)")
except subprocess.TimeoutExpired:
print("GPU query timeout (5s)")
except Exception as e:
print(
f"Runtime Exception: {str(e)}, when get the name of gpu:{gpu_id}")
return ""
print(f"GPU detection error: {str(e)}")
return ""

View File

@@ -34,12 +34,16 @@
"tile_config": {
"tile_m": {
"values": [
256
256,
128,
64
]
},
"tile_n": {
"values": [
256
256,
128,
64
]
},
"tile_k": {

View File

@@ -48,6 +48,7 @@ void run(const ck_tile::ArgParser& arg_parser)
ck_tile::index_t stride_B = arg_parser.get_int("stride_b");
ck_tile::index_t stride_C = arg_parser.get_int("stride_c");
bool log = arg_parser.get_int("log");
int n_warmup = arg_parser.get_int("warmup");
int n_repeat = arg_parser.get_int("repeat");
int verify = arg_parser.get_int("v");
@@ -173,7 +174,7 @@ void run(const ck_tile::ArgParser& arg_parser)
structured_sparsity,
trait,
gemm_args,
ck_tile::stream_config{nullptr, false, 0, n_warmup, n_repeat});
ck_tile::stream_config{nullptr, true, log, n_warmup, n_repeat});
return;
}

View File

@@ -118,6 +118,10 @@ inline auto create_args(int argc, char* argv[])
"2",
"The type of validation. Set to 0 for no validation, 1 for validation on CPU, or 2 "
"for validation on GPU. Default is 2, validation on GPU.")
.insert("log",
"false",
"Wether output kernel instance information or not. Possible values are true or "
"false. Default is false")
.insert(
"warmup", "50", "The number of iterations before benchmark the kernel. Default is 50.")
.insert(

View File

@@ -109,6 +109,7 @@ class GemmCodeGenerator:
// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include "ck_tile/core.hpp"
// Data types
@@ -141,6 +142,8 @@ using CLayout = {LAYOUT_MAP[self.config.problem.layout_map['matrix_c']]};
content = f"""// SPDX-License-Identifier: MIT
// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include "gemm_common.hpp"
#include "ck_tile/ops/gemm.hpp"
#include "ck_tile/ops/epilogue.hpp"
@@ -159,15 +162,7 @@ namespace {trait} {{
pad_m: str, pad_n: str, pad_k: str) -> str:
"""Generate the code block of kernel struct"""
return f"""
template <typename Pipeline, ck_tile::TailNumber TN>
void try_run(ck_tile::TailNumber tn) {{
if constexpr (Pipeline::PrefetchStages > static_cast<int>(TN) - 1) {{
if (tn == TN) {{
RunSplitk(ck_tile::bool_constant<true>{{}},
ck_tile::integral_constant<ck_tile::TailNumber, TN>{{}});
}}
}}
}}
template <int TileM, int TileN, int TileK,
int WarpM, int WarpN, int WarpK,
int WarpTileM, int WarpTileN, int WarpTileK,