From 012c77125a0ef4b5491a28ea2dfc3187ff91fe60 Mon Sep 17 00:00:00 2001 From: Yanxing-Shi Date: Fri, 16 May 2025 10:37:59 +0000 Subject: [PATCH] recover benchmark_gemm and fix --- tile_engine/ops/gemm/README.md | 29 +++++----- tile_engine/ops/gemm/benchmark_gemm.hpp | 56 ++----------------- tile_engine/ops/gemm/codegen_utils.py | 46 +++++++++------ .../gemm/configs/user_provided_config.json | 8 ++- tile_engine/ops/gemm/gemm_host_api.cpp | 3 +- tile_engine/ops/gemm/gemm_host_api.hpp | 4 ++ tile_engine/ops/gemm/gemm_instance_builder.py | 13 ++--- 7 files changed, 64 insertions(+), 95 deletions(-) diff --git a/tile_engine/ops/gemm/README.md b/tile_engine/ops/gemm/README.md index aeac2f541f..26ece438bb 100644 --- a/tile_engine/ops/gemm/README.md +++ b/tile_engine/ops/gemm/README.md @@ -12,7 +12,7 @@ User can provide kernel configuration such as tile size, warp size, padding, pip mkdir build && cd build # build composable kernel ## replace with the appropriate architecture (example gfx942) or leave blank -## "USE_CUSTOM_CONFIG=ON" for default configuration, "USE_CUSTOM_CONFIG=ON" for user provided configuration +## "USE_CUSTOM_CONFIG=OFF" for default configuration, "USE_CUSTOM_CONFIG=ON" for user provided configuration sh ../script/cmake-ck-dev.sh ../ -D USE_CUSTOM_CONFIG=ON # generate the executable make tile_engine_gemm -j @@ -29,22 +29,23 @@ rm -rf tile_engine/ && make tile_engine_gemm -j # rebuild -m The value for m dimension. Default is 3840. -n The value for n dimension. Default is 4096. -k The value for k dimension. Default is 2048. - -stride_a The stride value for tensor A. Default is 0. - -stride_b The stride value for tensor B. Default is 0. - -stride_c The stride value for tensor C Default is 0. + -stride_a The stride value for tensor A. Default is 0. + -stride_b The stride value for tensor B. Default is 0. + -stride_c The stride value for tensor C Default is 0. -split_k The split value for k dimension. Default is 1. -v The type of validation. Set to 0 for no validation, 1 for validation on CPU, or 2 for validation on GPU. Default is 2, validation on GPU. - -warmup The number of iterations before benchmark the kernel. Default is 50. - -repeat The number of iterations to benchmark the kernel. Default is 100. + -log Wether output kernel instance information or not. Possible values are true or false. Default is false. + -warmup The number of iterations before benchmark the kernel. Default is 50. + -repeat The number of iterations to benchmark the kernel. Default is 100. -timer The type of timer. Possible values are gpu timer or cpu timer. Default is gpu timer. - -init The method of tensor initialization. Set to 0 for random, to 1 for linear, or 2 for constant(1). Default is 0, random. - -metric Metric with which to measure kernel performance. Set to 0 for latency, 1 for tflops, or 2 for bandwidth. Default is 0, latency. - -structured_sparsity whether use sparsity kernel or not. Possible values are true or false. Default is false. - -pipeline The type of pipeline. Possible values are compv3, compv4 or mem. Default is compv3. - -epilogue The type of epilogue. Possible values are cshuffle or default. Default is cshuffle. - -pad_m Whether pad or not in m direction. Possible values are true or false. Default is false. - -pad_n Whether pad or not in n direction. Possible values are true or false. Default is false. - -pad_k Whether pad or not in k direction. Possible values are true or false. Default is false. + -init The method of tensor initialization. Set to 0 for random, to 1 for linear, or 2 for constant(1). Default is 0, random. + -metric Metric with which to measure kernel performance. Set to 0 for latency, 1 for tflops, or 2 for bandwidth. Default is 0, latency. + -structured_sparsity whether use sparsity kernel or not. Possible values are true or false. Default is false. + -pipeline The type of pipeline. Possible values are compv3, compv4 or mem. Default is compv3. + -epilogue The type of epilogue. Possible values are cshuffle or default. Default is cshuffle. + -pad_m Whether pad or not in m direction. Possible values are true or false. Default is false. + -pad_n Whether pad or not in n direction. Possible values are true or false. Default is false. + -pad_k Whether pad or not in k direction. Possible values are true or false. Default is false. Note: pipeline, scheduler, epilogue, pad_m, pad_n, pad_k should be one of the options specified in user_provided_config.json ``` diff --git a/tile_engine/ops/gemm/benchmark_gemm.hpp b/tile_engine/ops/gemm/benchmark_gemm.hpp index 3c08388b50..a6aaad1a60 100644 --- a/tile_engine/ops/gemm/benchmark_gemm.hpp +++ b/tile_engine/ops/gemm/benchmark_gemm.hpp @@ -137,44 +137,6 @@ class GemmProfiler return "Unknown"; } - template - static float time_kernel(const ck_tile::stream_config& stream, - const std::function& kernel_launch_func) - { - Timer timer; - - // flush cache - auto flush_cache = [&] { -#if defined(__HIP_DEVICE_COMPILE__) && __HIP_DEVICE_COMPILE__ - asm volatile("s_dcache_wb; s_dcache_inv; s_icache_inv" ::: "memory"); -#endif - }; - - // Cold runs - excludes from final timing measurement - for(int i = 0; i < stream.cold_niters_; ++i) - { - timer.start(stream.stream_id_); - kernel_launch_func(); - timer.stop(stream.stream_id_); - } - - // Hot runs - actual performance measurement - std::vector measured_times; - for(int i = 0; i < stream.nrepeat_; ++i) - { - timer.start(stream.stream_id_); - kernel_launch_func(); - timer.stop(stream.stream_id_); - measured_times.push_back(timer.duration()); - // Periodic cache flushing - if(i % 4 == 0) - flush_cache(); - } - - return std::accumulate(measured_times.begin(), measured_times.end(), 0.0f) / - measured_times.size(); - } - template void benchmark_kernel(ck_tile::DeviceMem& c_m_n_dev_buf, ck_tile::HostTensor& c_m_n_host_result, @@ -202,18 +164,7 @@ class GemmProfiler KernelInstance kernel_instance{description, problem, {-1.0f, -1.0f, -1.0f}}; - auto kernel_launch_func = [&] { Kernel::launch(args, stream); }; - - float avg_time = 0.f; - if(stream.is_gpu_timer_) - { - avg_time = time_kernel(stream, kernel_launch_func); - } - else - { - avg_time = time_kernel(stream, kernel_launch_func); - } - + float avg_time = Kernel::launch(args, stream); std::size_t flop = std::size_t(2) * args.M * args.N * args.K; std::size_t num_byte = sizeof(ADataType) * args.M * args.K + sizeof(BDataType) * args.N * args.K + @@ -223,7 +174,10 @@ class GemmProfiler kernel_instance.perf_result.tflops = static_cast(flop) / 1.E9 / avg_time; kernel_instance.perf_result.bandwidth = num_byte / 1.E6 / avg_time; - std::cout << kernel_instance << std::endl; + if(stream.log_level_ > 0) + { + std::cout << kernel_instance << std::endl; + } c_m_n_dev_buf.FromDevice(c_m_n_dev_result.data()); bool verified_correct = diff --git a/tile_engine/ops/gemm/codegen_utils.py b/tile_engine/ops/gemm/codegen_utils.py index 1480300b68..1a696fef51 100644 --- a/tile_engine/ops/gemm/codegen_utils.py +++ b/tile_engine/ops/gemm/codegen_utils.py @@ -88,9 +88,16 @@ RUN_MEM = """ RunSplitk(ck_tile::bool_constant{}, ck_tile::integral_constant{}); } - // Variadic call using fold expression + auto check_tail = [&](auto... TNs) { - (try_run< BaseGemmPipeline, decltype(TNs)::value>(tail_num), ...); + ([&]{ + if constexpr(BaseGemmPipeline::PrefetchStages > static_cast(decltype(TNs)::value)) { + if(tail_num == decltype(TNs)::value) { + RunSplitk(ck_tile::bool_constant{}, + ck_tile::integral_constant{}); + } + } + }(), ...); }; check_tail( @@ -200,28 +207,31 @@ def element_size(data_type: str) -> float: else: raise ValueError(f"Unsupported data type: {data_type}") +GPU_NAME_PATTERN = re.compile(r'Name:\s*(gfx\d+\w*)') @lru_cache(maxsize=1) def get_gpu_name_by_id(gpu_id: int = 0) -> str: """Retrieve GPU name (e.g. gfx90a) by device ID""" try: - cmd = ['rocm-smi', '--showproductname', '-d', str(gpu_id)] - result = subprocess.run( - cmd, - stdout=subprocess.PIPE, - stderr=subprocess.PIPE, + output = subprocess.check_output( + ["rocminfo"], text=True, - check=True + stderr=subprocess.PIPE, + timeout=5 ) - - arch_pattern = r'gfx\d{3,4}[a-z]?' - match = re.search(arch_pattern, result.stdout.lower()) - return match.group() if match else "" - - except (FileNotFoundError, subprocess.CalledProcessError) as e: - print(f"System Error: {str(e)}, when get the name of gpu:{gpu_id}") + if matches := GPU_NAME_PATTERN.finditer(output): + gpu_list = [m.group(1) for m in matches] + return gpu_list[gpu_id] if gpu_id < len(gpu_list) else "" + return "" + + except subprocess.CalledProcessError as e: + print(f"GPU query failed (exit {e.returncode}): {e.stderr.strip()}") + except FileNotFoundError: + print("ROCm tools not installed (requires rocminfo)") + except subprocess.TimeoutExpired: + print("GPU query timeout (5s)") except Exception as e: - print( - f"Runtime Exception: {str(e)}, when get the name of gpu:{gpu_id}") - return "" + print(f"GPU detection error: {str(e)}") + + return "" \ No newline at end of file diff --git a/tile_engine/ops/gemm/configs/user_provided_config.json b/tile_engine/ops/gemm/configs/user_provided_config.json index 312cefb4ed..7744d33d3c 100644 --- a/tile_engine/ops/gemm/configs/user_provided_config.json +++ b/tile_engine/ops/gemm/configs/user_provided_config.json @@ -34,12 +34,16 @@ "tile_config": { "tile_m": { "values": [ - 256 + 256, + 128, + 64 ] }, "tile_n": { "values": [ - 256 + 256, + 128, + 64 ] }, "tile_k": { diff --git a/tile_engine/ops/gemm/gemm_host_api.cpp b/tile_engine/ops/gemm/gemm_host_api.cpp index 673ba63f94..43d6509bf8 100755 --- a/tile_engine/ops/gemm/gemm_host_api.cpp +++ b/tile_engine/ops/gemm/gemm_host_api.cpp @@ -48,6 +48,7 @@ void run(const ck_tile::ArgParser& arg_parser) ck_tile::index_t stride_B = arg_parser.get_int("stride_b"); ck_tile::index_t stride_C = arg_parser.get_int("stride_c"); + bool log = arg_parser.get_int("log"); int n_warmup = arg_parser.get_int("warmup"); int n_repeat = arg_parser.get_int("repeat"); int verify = arg_parser.get_int("v"); @@ -173,7 +174,7 @@ void run(const ck_tile::ArgParser& arg_parser) structured_sparsity, trait, gemm_args, - ck_tile::stream_config{nullptr, false, 0, n_warmup, n_repeat}); + ck_tile::stream_config{nullptr, true, log, n_warmup, n_repeat}); return; } diff --git a/tile_engine/ops/gemm/gemm_host_api.hpp b/tile_engine/ops/gemm/gemm_host_api.hpp index bbf71b84de..58eeb9a9ee 100755 --- a/tile_engine/ops/gemm/gemm_host_api.hpp +++ b/tile_engine/ops/gemm/gemm_host_api.hpp @@ -118,6 +118,10 @@ inline auto create_args(int argc, char* argv[]) "2", "The type of validation. Set to 0 for no validation, 1 for validation on CPU, or 2 " "for validation on GPU. Default is 2, validation on GPU.") + .insert("log", + "false", + "Wether output kernel instance information or not. Possible values are true or " + "false. Default is false") .insert( "warmup", "50", "The number of iterations before benchmark the kernel. Default is 50.") .insert( diff --git a/tile_engine/ops/gemm/gemm_instance_builder.py b/tile_engine/ops/gemm/gemm_instance_builder.py index bc74054723..75ddaf76c7 100755 --- a/tile_engine/ops/gemm/gemm_instance_builder.py +++ b/tile_engine/ops/gemm/gemm_instance_builder.py @@ -109,6 +109,7 @@ class GemmCodeGenerator: // Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved. #pragma once + #include "ck_tile/core.hpp" // Data types @@ -141,6 +142,8 @@ using CLayout = {LAYOUT_MAP[self.config.problem.layout_map['matrix_c']]}; content = f"""// SPDX-License-Identifier: MIT // Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved. +#pragma once + #include "gemm_common.hpp" #include "ck_tile/ops/gemm.hpp" #include "ck_tile/ops/epilogue.hpp" @@ -159,15 +162,7 @@ namespace {trait} {{ pad_m: str, pad_n: str, pad_k: str) -> str: """Generate the code block of kernel struct""" return f""" -template -void try_run(ck_tile::TailNumber tn) {{ - if constexpr (Pipeline::PrefetchStages > static_cast(TN) - 1) {{ - if (tn == TN) {{ - RunSplitk(ck_tile::bool_constant{{}}, - ck_tile::integral_constant{{}}); - }} - }} -}} + template