From ebcd89e5dc658e21dbe84efaae533765cf747a02 Mon Sep 17 00:00:00 2001 From: "assistant-librarian[bot]" Date: Tue, 27 May 2025 06:07:42 +0000 Subject: [PATCH] Merge commit '128f5a1eab330744d10c82a799bfdd9978d4c14b' into develop --- CHANGELOG.md | 1 + Jenkinsfile | 48 +- include/ck_tile/host/device_prop.hpp | 56 ++ tile_engine/ops/gemm/CMakeLists.txt | 53 +- tile_engine/ops/gemm/README.md | 65 +- tile_engine/ops/gemm/benchmark_gemm.cpp | 68 ++ tile_engine/ops/gemm/benchmark_gemm.hpp | 233 +++++ tile_engine/ops/gemm/codegen_utils.py | 239 +++++ .../ops/gemm/configs/default_config.json | 130 +++ .../gemm/configs/instance_combination.json | 62 -- .../gemm/configs/user_provided_config.json | 116 +++ tile_engine/ops/gemm/gemm_host_api.cpp | 192 ---- tile_engine/ops/gemm/gemm_host_api.hpp | 213 ++--- tile_engine/ops/gemm/gemm_instance_builder.py | 887 ++++++++---------- tile_engine/ops/gemm/gemm_profiler.hpp | 260 +++++ tile_engine/ops/gemm/json_config.py | 202 ++++ 16 files changed, 1911 insertions(+), 914 deletions(-) create mode 100644 include/ck_tile/host/device_prop.hpp create mode 100644 tile_engine/ops/gemm/benchmark_gemm.cpp create mode 100644 tile_engine/ops/gemm/benchmark_gemm.hpp create mode 100644 tile_engine/ops/gemm/codegen_utils.py create mode 100644 tile_engine/ops/gemm/configs/default_config.json delete mode 100644 tile_engine/ops/gemm/configs/instance_combination.json create mode 100644 tile_engine/ops/gemm/configs/user_provided_config.json delete mode 100755 tile_engine/ops/gemm/gemm_host_api.cpp create mode 100644 tile_engine/ops/gemm/gemm_profiler.hpp create mode 100644 tile_engine/ops/gemm/json_config.py diff --git a/CHANGELOG.md b/CHANGELOG.md index a1163f059c..d62a64f3e0 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -17,6 +17,7 @@ Documentation for Composable Kernel available at [https://rocm.docs.amd.com/proj * Added support for FP16 2:4 structured sparsity to universal GEMM. * Added support for Split K for grouped convolution backward data. 
* Added logit soft-capping support for fMHA forward kernels. +* Added benchmarking support for tile engine GEMM. ### Optimized diff --git a/Jenkinsfile b/Jenkinsfile index c26350f120..776cf8f9fa 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -774,7 +774,7 @@ def process_results(Map conf=[:]){ } //launch develop branch daily jobs -CRON_SETTINGS = BRANCH_NAME == "develop" ? '''0 23 * * * % RUN_FULL_QA=true;DISABLE_DL_KERNELS=true;RUN_CK_TILE_FMHA_TESTS=true;RUN_CK_TILE_TRANSPOSE_TESTS=true;RUN_CK_TILE_GEMM_TESTS=true +CRON_SETTINGS = BRANCH_NAME == "develop" ? '''0 23 * * * % RUN_FULL_QA=true;DISABLE_DL_KERNELS=true;RUN_CK_TILE_FMHA_TESTS=true;RUN_CK_TILE_TRANSPOSE_TESTS=true;RUN_CK_TILE_GEMM_TESTS=true;RUN_TILE_ENGINE_GEMM_TESTS=true 0 21 * * * % RUN_GROUPED_CONV_LARGE_CASES_TESTS=true;hipTensor_test=true;RUN_CODEGEN_TESTS=true;BUILD_GFX908=true 0 19 * * * % BUILD_DOCKER=true;COMPILER_VERSION=amd-staging;BUILD_COMPILER=/llvm-project/build/bin/clang++;USE_SCCACHE=false;NINJA_BUILD_TRACE=true 0 17 * * * % BUILD_DOCKER=true;COMPILER_VERSION=amd-mainline;BUILD_COMPILER=/llvm-project/build/bin/clang++;USE_SCCACHE=false;NINJA_BUILD_TRACE=true @@ -862,6 +862,10 @@ pipeline { name: "RUN_CK_TILE_GEMM_TESTS", defaultValue: false, description: "Run the ck_tile GEMM tests (default: OFF)") + booleanParam( + name: "RUN_TILE_ENGINE_GEMM_TESTS", + defaultValue: false, + description: "Run the tile_engine_gemm tests (default: OFF)") booleanParam( name: "BUILD_INSTANCES_ONLY", defaultValue: false, @@ -1145,6 +1149,48 @@ pipeline { } } } + stage("Run TILE_ENGINE_GEMM Tests") + { + parallel + { + stage("Run TILE_ENGINE_GEMM Tests on gfx90a") + { + when { + beforeAgent true + expression { params.RUN_TILE_ENGINE_GEMM_TESTS.toBoolean() } + } + agent{ label rocmnode("gfx90a") } + environment{ + setup_args = "NO_CK_BUILD" + execute_args = """ ../script/cmake-ck-dev.sh ../ gfx90a -G Ninja && \ + ninja benchmark_gemm && \ + ./bin/benchmark_gemm """ + } + steps{ + 
buildHipClangJobAndReboot(setup_args:setup_args, no_reboot:true, build_type: 'Release', execute_cmd: execute_args) + cleanWs() + } + } + stage("Run TILE_ENGINE_GEMM Tests on gfx942") + { + when { + beforeAgent true + expression { params.RUN_TILE_ENGINE_GEMM_TESTS.toBoolean() } + } + agent{ label rocmnode("gfx942") } + environment{ + setup_args = "NO_CK_BUILD" + execute_args = """ ../script/cmake-ck-dev.sh ../ gfx942 -G Ninja && \ + ninja benchmark_gemm && \ + ./bin/benchmark_gemm """ + } + steps{ + buildHipClangJobAndReboot(setup_args:setup_args, no_reboot:true, build_type: 'Release', execute_cmd: execute_args) + cleanWs() + } + } + } + } stage("Build CK and run Tests") { diff --git a/include/ck_tile/host/device_prop.hpp b/include/ck_tile/host/device_prop.hpp new file mode 100644 index 0000000000..d33b298369 --- /dev/null +++ b/include/ck_tile/host/device_prop.hpp @@ -0,0 +1,56 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +#ifndef __HIPCC_RTC__ +#include +#include +#include + +namespace ck_tile { + +constexpr unsigned int fnv1a_hash(std::string_view str, unsigned int h = 2166136261u) +{ + return str.empty() ? h + : fnv1a_hash(str.substr(1), + (h ^ static_cast(str.front())) * 16777619u); +} +inline std::string get_device_name() +{ + hipDeviceProp_t props{}; + int device; + auto status = hipGetDevice(&device); + if(status != hipSuccess) + { + return std::string(); + } + status = hipGetDeviceProperties(&props, device); + if(status != hipSuccess) + { + return std::string(); + } + const std::string raw_name(props.gcnArchName); + const auto name = raw_name.substr(0, raw_name.find(':')); // str.substr(0, npos) returns str. 
+ switch(fnv1a_hash(name)) + { + // https://github.com/ROCm/MIOpen/blob/8498875aef84878e04c1eabefdf6571514891086/src/target_properties.cpp#L40 + case fnv1a_hash("Ellesmere"): + case fnv1a_hash("Baffin"): + case fnv1a_hash("RacerX"): + case fnv1a_hash("Polaris10"): + case fnv1a_hash("Polaris11"): + case fnv1a_hash("Tonga"): + case fnv1a_hash("Fiji"): + case fnv1a_hash("gfx800"): + case fnv1a_hash("gfx802"): + case fnv1a_hash("gfx804"): return "gfx803"; + case fnv1a_hash("Vega10"): + case fnv1a_hash("gfx901"): return "gfx900"; + case fnv1a_hash("10.3.0 Sienna_Cichlid 18"): return "gfx1030"; + default: return name; + } +} +} // namespace ck_tile + +#endif diff --git a/tile_engine/ops/gemm/CMakeLists.txt b/tile_engine/ops/gemm/CMakeLists.txt index bc613a931e..72bf1aa8a4 100644 --- a/tile_engine/ops/gemm/CMakeLists.txt +++ b/tile_engine/ops/gemm/CMakeLists.txt @@ -1,43 +1,58 @@ - # generate a list of kernels, but not actually emit files at config stage execute_process( COMMAND ${Python3_EXECUTABLE} ${CMAKE_CURRENT_LIST_DIR}/gemm_instance_builder.py --working_path ${CMAKE_CURRENT_BINARY_DIR} - --json ${CMAKE_CURRENT_LIST_DIR}/configs/instance_combination.json + # --config_json ${CMAKE_CURRENT_LIST_DIR}/configs/user_provided_config.json --list_blobs - RESULT_VARIABLE ret -) -set_property(DIRECTORY APPEND PROPERTY CMAKE_CONFIGURE_DEPENDS - ${CMAKE_CURRENT_LIST_DIR}/gemm_instance_builder.py - ${CMAKE_CURRENT_LIST_DIR}/configs/instance_combination.json + RESULT_VARIABLE ret ) if(ret AND NOT ret EQUAL 0) - message( FATAL_ERROR "Fail to generate kernels via Python. ${ret}") + message( FATAL_ERROR "Fail to list kernels via Python. 
${ret}") endif() file(STRINGS ${CMAKE_CURRENT_BINARY_DIR}/gemm_instance_blobs.txt GEMM_CODEGEN_BLOBS) +set(GEMM_CODEGEN_CPP_FILES "") +set(GEMM_CODEGEN_HPP_FILES "") + +foreach(blob ${GEMM_CODEGEN_BLOBS}) + string(STRIP "${blob}" stripped_blob) + + if(stripped_blob MATCHES "\\.cpp$") + list(APPEND GEMM_CODEGEN_CPP_FILES "${stripped_blob}") + elseif(stripped_blob MATCHES "\\.hpp$") + list(APPEND GEMM_CODEGEN_HPP_FILES "${stripped_blob}") + endif() +endforeach() + add_custom_command( OUTPUT ${GEMM_CODEGEN_BLOBS} COMMAND ${Python3_EXECUTABLE} ${CMAKE_CURRENT_LIST_DIR}/gemm_instance_builder.py --working_path ${CMAKE_CURRENT_BINARY_DIR} - --json ${CMAKE_CURRENT_LIST_DIR}/configs/instance_combination.json + # --config_json ${CMAKE_CURRENT_LIST_DIR}/configs/user_provided_config.json --gen_blobs - DEPENDS ${CMAKE_CURRENT_LIST_DIR}/gemm_instance_builder.py - ${CMAKE_CURRENT_BINARY_DIR}/gemm_instance_blobs.txt - ${CMAKE_CURRENT_LIST_DIR}/configs/instance_combination.json ) -set(EXECUTABLE_GEMM_INSTANCE "tile_engine_gemm") -message("adding example ${EXECUTABLE_GEMM_INSTANCE}") +add_library(gemm_template_instances OBJECT EXCLUDE_FROM_ALL ${GEMM_CODEGEN_CPP_FILES}) +target_include_directories(gemm_template_instances PRIVATE ${CMAKE_CURRENT_LIST_DIR}) +target_sources(gemm_template_instances PRIVATE ${GEMM_CODEGEN_HPP_FILES}) + +set(BENCHMARK_GEMM_EXECUTABLE "benchmark_gemm") +message("adding example ${BENCHMARK_GEMM_EXECUTABLE}") -# use build as include directory include_directories(${CMAKE_CURRENT_BINARY_DIR}) -add_executable(${EXECUTABLE_GEMM_INSTANCE} EXCLUDE_FROM_ALL gemm_host_api.cpp) -target_include_directories(${EXECUTABLE_GEMM_INSTANCE} PRIVATE ${CMAKE_CURRENT_LIST_DIR}) -target_sources(${EXECUTABLE_GEMM_INSTANCE} PRIVATE ${GEMM_CODEGEN_BLOBS}) + +add_library(gemm_host_api INTERFACE EXCLUDE_FROM_ALL) +target_include_directories(gemm_host_api INTERFACE ${CMAKE_CURRENT_LIST_DIR}) +target_sources(gemm_host_api INTERFACE ${GEMM_CODEGEN_HPP_FILES} gemm_host_api.hpp) 
+target_link_libraries(gemm_host_api INTERFACE gemm_template_instances) + +add_executable(${BENCHMARK_GEMM_EXECUTABLE} EXCLUDE_FROM_ALL benchmark_gemm.cpp) +target_include_directories(${BENCHMARK_GEMM_EXECUTABLE} PRIVATE ${CMAKE_CURRENT_LIST_DIR}) +target_sources(${BENCHMARK_GEMM_EXECUTABLE} PRIVATE benchmark_gemm.hpp gemm_profiler.hpp) +target_link_libraries(${BENCHMARK_GEMM_EXECUTABLE} PRIVATE gemm_host_api) set(EXECUTABLE_GEMM_INSTANCE_COMPILE_OPTIONS) @@ -46,6 +61,6 @@ list(APPEND EXECUTABLE_GEMM_INSTANCE_COMPILE_OPTIONS -Wno-float-equal --offload-compress) -target_compile_options(${EXECUTABLE_GEMM_INSTANCE} PRIVATE ${EXECUTABLE_GEMM_INSTANCE_COMPILE_OPTIONS}) +target_compile_options(${BENCHMARK_GEMM_EXECUTABLE} PRIVATE ${EXECUTABLE_GEMM_INSTANCE_COMPILE_OPTIONS}) set_property(GLOBAL PROPERTY RULE_MESSAGES OFF) \ No newline at end of file diff --git a/tile_engine/ops/gemm/README.md b/tile_engine/ops/gemm/README.md index f7d86e90fe..87267f8bce 100644 --- a/tile_engine/ops/gemm/README.md +++ b/tile_engine/ops/gemm/README.md @@ -4,53 +4,56 @@ CK Tile Engine GEMM is used to generate and run GEMM kernels with different comb # Kernel Configurations -Kernel parameters are specified in the `instance_combination.json` file, including matrix layouts, data types, padding settings, pipelines, schedulers, epilogues, and numerical values for tile and warp sizes. +User can provide kernel configuration such as tile size, warp size, padding, pipeline, scheduler and epilogue in the config file with limited values. For reference please see `./configs/user_provided_config.json`. -Given a valid set of values, tile_engine_gemm will automatically iterate over all possible combinations of BlockTile and WarpTile sizes, as well as the specified pipelines, schedulers, and epilogues from `./configs/instance_combination.json`, and build the corresponding kernels. 
+The Tile engine also has a default kernel configuration for providing range of configuration parameter values, which helps users who lack kernel development experience to benchmark For reference please see in `./configs/default_config.json` +If user does not provide kernel configuration, the tile engine uses default kernel configuration to generate kernel instances and benchmark. ## Build Instructions ``` bash # in the root of composable kernel create build directory mkdir build && cd build # build composable kernel -sh ../script/cmake-ck-dev.sh ../ # replace with the appropriate architecture (example gfx942) or leave blank +sh ../script/cmake-ck-dev.sh ../ -G Ninja # replace with the appropriate architecture (example gfx942) or leave blank # generate the executable -make tile_engine_gemm -j +ninja benchmark_gemm ``` -`tile_engine_gemm` will be located in the `./bin/` directory. +`benchmark_gemm` will be located in the `./bin/` directory. + +`benchmark_gemm` must be rebuilt everytime if configuration file is modified. -_`tile_engine_gemm` must be rebuilt everytime `instance_combination.json` is modified._ ``` bash -rm -rf tile_engine/ && make tile_engine_gemm -j # rebuild +rm -rf tile_engine/ && ninja benchmark_gemm # rebuild ``` -## tile_engine_gemm inputs +## benchmark_gemm inputs ``` + -m The value for m dimension. Default is 3840. + -n The value for n dimension. Default is 4096. + -k The value for k dimension. Default is 2048. + -stride_a The stride value for tensor A. Default is 0. + -stride_b The stride value for tensor B. Default is 0. + -stride_c The stride value for tensor C Default is 0. + -split_k The split value for k dimension. Default is 1. + -v The type of validation. Set to 0 for no validation, 1 for validation on CPU, or 2 for validation on GPU. Default is 2, validation on GPU. + -log Wether output kernel instance information or not. Possible values are true or false. Default is false. 
+ -warmup The number of iterations before benchmarking the kernel. Default is 50.
+ -repeat The number of iterations to benchmark the kernel. Default is 100.
+ -timer Whether the timer is a GPU timer or not. Possible values are true or false. Default is true.
+ -init The method of tensor initialization. Set to 0 for random, to 1 for linear, or 2 for constant(1). Default is 0, random.
+ -metric Metric with which to measure kernel performance. Set to 0 for latency, 1 for tflops, or 2 for bandwidth. Default is 0, latency.
+ -csv_filename The filename of the benchmark result. Default is gemm_kernel.
+ -structured_sparsity Whether to use a sparsity kernel or not. Possible values are true or false. Default is false.
+ -pipeline The type of pipeline. Possible values are compv3, compv4 or mem. Default is compv3.
+ -epilogue The type of epilogue. Possible values are cshuffle or default. Default is cshuffle.
+ -pad_m Whether to pad or not in m direction. Possible values are true or false. Default is false.
+ -pad_n Whether to pad or not in n direction. Possible values are true or false. Default is false.
+ -pad_k Whether to pad or not in k direction. Possible values are true or false. Default is false.
- -m m dimension (default:3840) - -n n dimension (default:4096) - -k k dimension (default:2048) - -stride_a Tensor A stride (default:0) - -stride_b Tensor B stride (default:0) - -stride_c Tensor C stride (default:0) - -split_k SplitK value (default:1) - -v No validation: 0, Validation on CPU: 1, Validation on GPU: 2 (default:2) - -warmup Number of iterations before benchmark the kernel (default:50) - -repeat Number of iterations to benchmark the kernel (default:100) - -timer gpu:gpu timer, cpu:cpu timer (default:gpu) - -init Value for initializing tensor- random: 0, linear: 1, constant(1): 2 (default:0) --structured_sparsity Sparsity for tensor - 0:false, 1:true (default: 0) - -pipeline possible values are: compv3, compv4, mem (default:compv3) - -scheduler possible values are: intrawave, interwave (default:intrawave) - -epilogue possible values are: cshuffle, default (default:cshuffle) - -pad_m Pad in m direction - true/false (default:false) - -pad_n Pad in n direction - true/false (default:false) - -pad_k Pad in k direction - true/false (default:false) - -Note: pipeline, scheduler, epilogue, pad_m, pad_n, pad_k should be one of the options specified in instance_combination.json +Note: pipeline, scheduler, epilogue, pad_m, pad_n, pad_k should be one of the options specified in user_provided_config.json ``` -Note: In `./configs/instance_combination.json` pipeline, scheduler, epilogue, pad_m, pad_n, pad_k should be from one of the values specified above. +Note: In `./configs/user_provided_config.json` pipeline, scheduler, epilogue, pad_m, pad_n, pad_k should be from one of the values specified above. ## Example @@ -86,7 +89,7 @@ The following JSON file specifies parameters used to generate and build GEMM ker At runtime, a specific subset of the generated kernels can be selected using command-line arguments. 
``` bash -./bin/tile_engine_gemm -pipeline=compv3 -scheduler=intrawave -epilogue=default +./bin/benchmark_gemm -pipeline=compv3 -scheduler=intrawave -epilogue=default ``` The above command runs kernels configured with the compv3 pipeline, intrawave scheduler, and default epilogue, while sweeping over different BlockTile sizes, WarpTile sizes, and WarpTile mappings. diff --git a/tile_engine/ops/gemm/benchmark_gemm.cpp b/tile_engine/ops/gemm/benchmark_gemm.cpp new file mode 100644 index 0000000000..fb56e524d2 --- /dev/null +++ b/tile_engine/ops/gemm/benchmark_gemm.cpp @@ -0,0 +1,68 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved. + +#include +#include +#include + +#include "gemm_profiler.hpp" +#include "benchmark_gemm.hpp" + +void benchmark_gemm(const ck_tile::ArgParser& arg_parser) +{ + GemmProblem gemm_problem{arg_parser.get_int("split_k"), + arg_parser.get_int("m"), + arg_parser.get_int("n"), + arg_parser.get_int("k"), + arg_parser.get_int("stride_a"), + arg_parser.get_int("stride_b"), + arg_parser.get_int("stride_c"), + DataTypeTraits::name, + DataTypeTraits::name, + DataTypeTraits::name, + DataTypeTraits::name, + ALayout::name, + BLayout::name, + CLayout::name, + arg_parser.get_bool("structured_sparsity")}; + + Setting setting{ + arg_parser.get_int("warmup"), + arg_parser.get_int("repeat"), + arg_parser.get_bool("timer"), + arg_parser.get_int("verify"), + arg_parser.get_int("init"), + arg_parser.get_bool("log"), + arg_parser.get_str("csv_filename"), + }; + + auto& profiler = GemmProfiler::instance(setting); + + try + { + auto kernel_func = get_kernel_func_by_trait(arg_parser); + profiler.benchmark(gemm_problem, kernel_func); + profiler.select_best_instance(static_cast(arg_parser.get_int("metric"))); + } + catch(const std::exception& e) + { + std::cerr << "Benchmark failed: " << e.what() << std::endl; + } +} + +int main(int argc, char* argv[]) +{ + try + { + auto [result, parser] = create_args(argc, 
argv); + if(!result) + return EXIT_FAILURE; + benchmark_gemm(parser); + return 0; + } + catch(const std::exception& e) + { + std::cerr << "Error: " << e.what() << "\n"; + return EXIT_FAILURE; + } +} diff --git a/tile_engine/ops/gemm/benchmark_gemm.hpp b/tile_engine/ops/gemm/benchmark_gemm.hpp new file mode 100644 index 0000000000..292d67dad6 --- /dev/null +++ b/tile_engine/ops/gemm/benchmark_gemm.hpp @@ -0,0 +1,233 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +#include +#include +#include +#include + +#include "gemm_host_api.hpp" + +enum class Metric +{ + LATENCY = 0, + TFLOPS = 1, + BANDWIDTH = 2 +}; + +inline constexpr auto get_metric_name(Metric m) +{ + switch(m) + { + case Metric::LATENCY: return "latency"; + case Metric::TFLOPS: return "tflops"; + case Metric::BANDWIDTH: return "bandwidth"; + default: throw std::invalid_argument("Unsupported metric type"); + } +} + +struct GemmProblem +{ + int split_k_; + int m_, n_, k_; + int stride_a_, stride_b_, stride_c_; + + std::string dtype_a_, dtype_b_, dtype_acc_, dtype_c_; + std::string layout_a_, layout_b_, layout_c_; + + bool structured_sparsity_; + + friend std::ostream& operator<<(std::ostream& os, const GemmProblem& problem) + { + os << "{\n" + << " \"split_k\":" << problem.split_k_ << ",\n" + << " \"m\":" << problem.m_ << ",\n" + << " \"n\":" << problem.n_ << ",\n" + << " \"k\":" << problem.k_ << ",\n" + << " \"stride_a\":" << problem.stride_a_ << ",\n" + << " \"stride_b\":" << problem.stride_b_ << ",\n" + << " \"stride_c\":" << problem.stride_c_ << ",\n" + << " \"dtype_a\":\"" << problem.dtype_a_ << "\",\n" + << " \"dtype_b\":\"" << problem.dtype_b_ << "\",\n" + << " \"dtype_acc\":\"" << problem.dtype_acc_ << "\",\n" + << " \"dtype_c\":\"" << problem.dtype_c_ << "\",\n" + << " \"layout_a\":\"" << problem.layout_a_ << "\",\n" + << " \"layout_b\":\"" << problem.layout_b_ << "\",\n" + << " \"layout_c\":\"" << problem.layout_c_ << 
"\"\n" + << " \"structured_sparsity\":\"" << problem.structured_sparsity_ << "\"\n" + << "}"; + return os; + } +}; + +struct PerformanceResult +{ + double latency_; + double tflops_; + double bandwidth_; + + static bool compare(const PerformanceResult& a, const PerformanceResult& b, Metric m) + { + switch(m) + { + case Metric::LATENCY: return a.latency_ < b.latency_; + case Metric::TFLOPS: return a.tflops_ > b.tflops_; + case Metric::BANDWIDTH: return a.bandwidth_ > b.bandwidth_; + default: throw std::invalid_argument("Unsupported metric type"); + } + } + + friend std::ostream& operator<<(std::ostream& os, const PerformanceResult& result) + { + os << "{\n" + << " \"latency(ms)\": " << std::fixed << std::setprecision(2) << result.latency_ + << ",\n" + << " \"tflops(TFlops)\": " << result.tflops_ << ",\n" + << " \"bandwidth(GB/s)\": " << result.bandwidth_ << "\n" + << "}"; + return os; + } +}; + +struct KernelInstance +{ + std::string name_; + GemmProblem problem_; + PerformanceResult perf_result_; + + static bool compare(const KernelInstance& a, const KernelInstance& b, Metric m) + { + return PerformanceResult::compare(a.perf_result_, b.perf_result_, m); + } + + friend std::ostream& operator<<(std::ostream& os, const KernelInstance& obj) + { + os << "{\n" + << " \"name\": \"" + << "{\n" + << obj.name_ << "\n}" + << "\",\n" + << " \"problem\": \"" << obj.problem_ << "\",\n" + << " \"perf_result\": " << obj.perf_result_ << "\n" + << "}"; + return os; + } +}; + +struct Setting +{ + int n_warmup_; + int n_repeat_; + bool is_gpu_timer_; + int verify_; + int init_method_; + bool log_; + std::string csv_filename_; +}; + +inline std::string get_rocm_version() +{ + std::ifstream version_file("/opt/rocm/.info/version"); + if(version_file.is_open()) + { + std::string version; + std::getline(version_file, version); + return version; + } + return "Unknown"; +} + +template +auto calculate_rtol_atol(const ck_tile::index_t K, + const ck_tile::index_t kbatch, + const float 
max_accumulated_value) +{ + using ComputeType = + std::conditional_t; + // Calculate thresholds + const auto rtol = ck_tile::get_relative_threshold( + ck_tile::integer_divide_ceil(K, kbatch)); + const auto atol = ck_tile::get_absolute_threshold( + max_accumulated_value / kbatch, ck_tile::integer_divide_ceil(K, kbatch)); + // Calculate error due to split_k accumulation + const auto rtol_split_k = + ck_tile::get_relative_threshold(kbatch); + const auto atol_split_k = ck_tile::get_absolute_threshold( + max_accumulated_value, kbatch); + // Use higher threshold + return ck_tile::make_tuple(std::max(rtol, rtol_split_k), std::max(atol, atol_split_k)); +} + +/// @brief Function to compare the results of the device and host computations +bool compare(ck_tile::index_t K, + ck_tile::index_t kbatch, + ck_tile::HostTensor& c_m_n_dev_result, + ck_tile::HostTensor& c_m_n_host_result) +{ + const float max_accumulated_value = + *std::max_element(c_m_n_host_result.mData.begin(), c_m_n_host_result.mData.end()); + const auto rtol_atol = calculate_rtol_atol( + K, kbatch, max_accumulated_value); + bool pass = ck_tile::check_err(c_m_n_dev_result, + c_m_n_host_result, + "Error: Incorrect results!", + rtol_atol.at(ck_tile::number<0>{}), + rtol_atol.at(ck_tile::number<1>{})); + + std::cout << "Relative error threshold: " << rtol_atol.at(ck_tile::number<0>{}) + << " Absolute error threshold: " << rtol_atol.at(ck_tile::number<1>{}) << std::endl; + std::cout << "The verification result is:" << (pass ? 
"correct" : "fail") << std::endl; + + return pass; +} + +/// @brief Function to get the kernel output with reference implementation on CPU/GPU +void gemm_host_reference(int verify, + ck_tile::HostTensor& a_m_k, + ck_tile::HostTensor& b_k_n, + ck_tile::HostTensor& c_m_n_host_result, + ck_tile::DeviceMem& a_m_k_dev_buf, + ck_tile::DeviceMem& b_k_n_dev_buf, + ck_tile::index_t M, + ck_tile::index_t N, + ck_tile::index_t K, + ck_tile::index_t stride_A, + ck_tile::index_t stride_B, + ck_tile::index_t stride_C) +{ + if(verify == 1) + { + c_m_n_host_result.SetZero(); + + ck_tile::reference_gemm( + a_m_k, b_k_n, c_m_n_host_result); + } + else if(verify == 2) + { + if constexpr(std::is_same_v) + { + // Restore input for B for gpu reference + b_k_n_dev_buf.ToDevice(b_k_n.data()); + } + + ck_tile::DeviceMem c_m_n_gpu_buf_ref(c_m_n_host_result.get_element_space_size_in_bytes()); + c_m_n_host_result.SetZero(); + c_m_n_gpu_buf_ref.SetZero(); + + ADataType* d_A = static_cast(a_m_k_dev_buf.GetDeviceBuffer()); + BDataType* d_B = static_cast(b_k_n_dev_buf.GetDeviceBuffer()); + CDataType* d_C = static_cast(c_m_n_gpu_buf_ref.GetDeviceBuffer()); + + ck_tile::reference_gemm_gpu(d_A, d_B, d_C, M, N, K, stride_A, stride_B, stride_C); + + c_m_n_gpu_buf_ref.FromDevice(c_m_n_host_result.data()); + } +} diff --git a/tile_engine/ops/gemm/codegen_utils.py b/tile_engine/ops/gemm/codegen_utils.py new file mode 100644 index 0000000000..a8955cec91 --- /dev/null +++ b/tile_engine/ops/gemm/codegen_utils.py @@ -0,0 +1,239 @@ +# SPDX-License-Identifier: MIT +# Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved. + +# -*- coding: utf-8 -*- + +""" +Mappings and utility functions for kernel code generation. 
+""" + +import subprocess +import re +from functools import lru_cache + +DATA_TYPE_MAP = {'fp32': 'float', + 'fp16': 'ck_tile::half_t', + 'bf16': 'ck_tile::bf16_t', + 'int8': 'ck_tile::int8_t', + 'fp8': 'ck_tile::fp8_t', + 'bf8': 'ck_tile::bf8_t', + 'int4': 'ck_tile::pk_int4_t' + } + +LAYOUT_MAP = {'r': 'ck_tile::tensor_layout::gemm::RowMajor', + 'c': 'ck_tile::tensor_layout::gemm::ColumnMajor'} + +DEFAULT_EPILOGUE = """ + using GemmEpilogue = ck_tile::DefaultGemm2DEpilogue< + ck_tile::DefaultGemm2DEpilogueProblem>; +""" + +CSHUFFLE_EPILOGUE = """ + using GemmEpilogue = ck_tile::CShuffleEpilogue< + ck_tile::CShuffleEpilogueProblem>; +""" +HOT_LOOP_FALSE = """ + if(tail_num == ck_tile::TailNumber::Full) + { + RunSplitk(ck_tile::bool_constant{}, + ck_tile::integral_constant{}); + } + else if(tail_num == ck_tile::TailNumber::Odd) + { + RunSplitk(ck_tile::bool_constant{}, + ck_tile::integral_constant{}); + } + else if(tail_num == ck_tile::TailNumber::Even) + { + RunSplitk(ck_tile::bool_constant{}, + ck_tile::integral_constant{}); + } + else + { + throw std::runtime_error("Num K loop must be larger than number of prefetech stages."); + } +""" +RUN_MEM = """ + // Handle One and Full cases directly + if (tail_num == ck_tile::TailNumber::One) { + RunSplitk(ck_tile::bool_constant{}, + ck_tile::integral_constant{}); + } else if (tail_num == ck_tile::TailNumber::Full) { + RunSplitk(ck_tile::bool_constant{}, + ck_tile::integral_constant{}); + } + + auto check_tail = [&](auto... 
TNs) { + ([&]{ + if constexpr(BaseGemmPipeline::PrefetchStages > static_cast(decltype(TNs)::value)) { + if(tail_num == decltype(TNs)::value) { + RunSplitk(ck_tile::bool_constant{}, + ck_tile::integral_constant{}); + } + } + }(), ...); + }; + + check_tail( + ck_tile::integral_constant{}, + ck_tile::integral_constant{}, + ck_tile::integral_constant{}, + ck_tile::integral_constant{}, + ck_tile::integral_constant{}, + ck_tile::integral_constant{} + ); +""" + +RUN_COMPV3 = """ + if(tail_num == ck_tile::TailNumber::Full) + { + RunSplitk(ck_tile::bool_constant{}, + ck_tile::integral_constant{}); + } + else if(tail_num == ck_tile::TailNumber::Odd) + { + RunSplitk(ck_tile::bool_constant{}, + ck_tile::integral_constant{}); + } + else if(tail_num == ck_tile::TailNumber::Even) + { + RunSplitk(ck_tile::bool_constant{}, + ck_tile::integral_constant{}); + } + else + { + throw std::runtime_error("The tail number is wrong. It should be Full, Odd, or Even."); + } +""" + +RUN_COMPV4 = """ + if(tail_num == ck_tile::TailNumber::Three) + { + RunSplitk(ck_tile::bool_constant{}, + ck_tile::integral_constant{}); + } + else + { + RunSplitk(ck_tile::bool_constant{}, + ck_tile::integral_constant{}); + } +""" + + +PIPELINE_MAP = {'mem': ['ck_tile::BaseGemmPipelineAgBgCrMem', 'ck_tile::GemmPipelineAgBgCrMem'], + 'compv3': ['ck_tile::BaseGemmPipelineAgBgCrCompV3', 'ck_tile::GemmPipelineAgBgCrCompV3'], + 'compv4': ['ck_tile::BaseGemmPipelineAgBgCrCompV4', 'ck_tile::GemmPipelineAgBgCrCompV4']} + +SCHEDULER_MAP = {'interwave': 'ck_tile::GemmPipelineScheduler::Interwave', + 'intrawave': 'ck_tile::GemmPipelineScheduler::Intrawave'} + +EPILOGUE_MAP = {'default': DEFAULT_EPILOGUE, + 'cshuffle': CSHUFFLE_EPILOGUE} + +HOT_LOOP_TRUE = {'mem': RUN_MEM, + 'compv3': RUN_COMPV3, + 'compv4': RUN_COMPV4} + + +def BOOL_MAP(b_): return {True: 'true', False: 'false'}[bool(b_)] + + +# To Do: add some more supported combinations +warp_tile_supported_combinations = { + "gfx90a": { + 'fp16_fp16_fp16': [[32, 32, 8], 
[16, 16, 16], [32, 32, 16], [16, 16, 32]], + 'bf16_bf16_bf16': [[32, 32, 8], [16, 16, 16], [32, 32, 16], [16, 16, 32]], + 'fp8_fp8_fp16': [[32, 32, 16], [32, 32, 32]], + 'fp8_fp8_fp16': [[32, 32, 16], [32, 32, 32]] + }, + "gfx942": { + 'fp16_fp16_fp16': [[32, 32, 8], [16, 16, 16], [32, 32, 16], [16, 16, 32]], + 'bf16_bf16_bf16': [[32, 32, 8], [16, 16, 16], [32, 32, 16], [16, 16, 32]], + 'fp8_fp8_fp16': [[32, 32, 16], [32, 32, 32], [16, 16, 32], [16, 16, 64]], + 'fp8_fp8_fp16': [[32, 32, 16], [32, 32, 32], [16, 16, 64], [16, 16, 32]] + }, + "gfx950": { + 'fp16_fp16_fp16': [[32, 32, 8], [16, 16, 16], [32, 32, 16], [16, 16, 32]], + 'bf16_bf16_bf16': [[32, 32, 8], [16, 16, 16], [32, 32, 16], [16, 16, 32]], + 'fp8_fp8_fp16': [[32, 32, 16], [32, 32, 32], [16, 16, 32], [16, 16, 64], [16, 16, 128], [32, 32, 64]], + 'fp8_fp8_fp16': [[32, 32, 16], [32, 32, 32], [16, 16, 64], [16, 16, 32], [16, 16, 128], [32, 32, 64]] + } +} + +# To Do: remove some unsupported combinations +trait_unsupported_combinations = { + ("compv3", "cshuffle", "interwave"), + ("compv3", "default", "interwave"), + ("compv4", "cshuffle", "interwave"), + ("compv4", "default", "interwave") +} + + +def element_size(data_type: str) -> float: + """Calculate the size (in bytes) of a single element for given data type.""" + data_type = data_type.lower() + if data_type in {'fp16', 'bf16'}: + return 2 + elif data_type in {'int8', 'fp8', 'bf8'}: + return 1 + elif data_type == 'int4': + return 0.5 + else: + raise ValueError(f"Unsupported data type: {data_type}") + + +GPU_NAME_PATTERN = re.compile(r'Name:\s*(gfx\d+\w*)') + + +@lru_cache(maxsize=1) +def get_gpu_name_by_id(gpu_id: int = 0) -> str: + """Retrieve GPU name (e.g. 
gfx90a) by device ID""" + try: + output = subprocess.check_output( + ["rocminfo"], + text=True, + stderr=subprocess.PIPE, + timeout=5 + ) + if matches := GPU_NAME_PATTERN.finditer(output): + gpu_list = [m.group(1) for m in matches] + return gpu_list[gpu_id] if gpu_id < len(gpu_list) else "" + + return "" + + except subprocess.CalledProcessError as e: + print(f"GPU query failed (exit {e.returncode}): {e.stderr.strip()}") + except FileNotFoundError: + print("ROCm tools not installed (requires rocminfo)") + except subprocess.TimeoutExpired: + print("GPU query timeout (5s)") + except Exception as e: + print(f"GPU detection error: {str(e)}") + + return "" diff --git a/tile_engine/ops/gemm/configs/default_config.json b/tile_engine/ops/gemm/configs/default_config.json new file mode 100644 index 0000000000..09fe3b83ac --- /dev/null +++ b/tile_engine/ops/gemm/configs/default_config.json @@ -0,0 +1,130 @@ +{ + "problem": { + "layout_a": { + "values": [ + "r" + ] + }, + "layout_b": { + "values": [ + "c" + ] + }, + "layout_c": { + "values": [ + "r" + ] + }, + "datatype_a": { + "values": [ + "fp16" + ] + }, + "datatype_b": { + "values": [ + "fp16" + ] + }, + "datatype_c": { + "values": [ + "fp16" + ] + } + }, + "tile_config": { + "tile_m": { + "max": 512, + "min": 64, + "step": 64, + "exclude": [] + }, + "tile_n": { + "max": 512, + "min": 64, + "step": 32, + "exclude": [] + }, + "tile_k": { + "max": 512, + "min": 64, + "step": 64, + "exclude": [] + }, + "warp_m": { + "values": [ + 4, + 2, + 1 + ] + }, + "warp_n": { + "values": [ + 4, + 2, + 1 + ] + }, + "warp_k": { + "values": [ + 1 + ] + }, + "warp_tile_m": { + "values": [ + 16, + 32 + ] + }, + "warp_tile_n": { + "values": [ + 16, + 32 + ] + }, + "warp_tile_k": { + "values": [ + 8, + 16, + 32, + 64, + 128 + ] + } + }, + "trait_config": { + "pipeline": { + "values": [ + "compv4", + "compv3", + "mem" + ] + }, + "scheduler": { + "values": [ + "intrawave", + "interwave" + ] + }, + "epilogue": { + "values": [ + "default", + 
"cshuffle" + ] + }, + "pad_m": { + "values": [ + false + ] + }, + "pad_n": { + "values": [ + false + ] + }, + "pad_k": { + "values": [ + false + ] + } + } +} \ No newline at end of file diff --git a/tile_engine/ops/gemm/configs/instance_combination.json b/tile_engine/ops/gemm/configs/instance_combination.json deleted file mode 100644 index b497513efa..0000000000 --- a/tile_engine/ops/gemm/configs/instance_combination.json +++ /dev/null @@ -1,62 +0,0 @@ -{ - "architecture": { - "values": ["gfx90a"] - }, - "layout_a": { - "values": ["r"] - }, - "layout_b": { - "values": ["c"] - }, - "layout_c": { - "values": ["r"] - }, - "datatype": { - "values": ["fp16"] - }, - "tile_m": { - "values": [256] - }, - "tile_n": { - "values": [256] - }, - "tile_k": { - "values": [32] - }, - "warp_m": { - "values": [2] - }, - "warp_n": { - "values": [2] - }, - "warp_k": { - "values": [1] - }, - "warp_tile_m": { - "values": [32] - }, - "warp_tile_n": { - "values": [32] - }, - "warp_tile_k": { - "values": [16] - }, - "kPadM": { - "values": [false] - }, - "kPadN": { - "values": [false] - }, - "kPadK": { - "values": [false] - }, - "pipeline": { - "values": ["compv3", "compv4", "mem"] - }, - "scheduler": { - "values": ["intrawave", "interwave"] - }, - "epilogue": { - "values": ["default", "cshuffle"] - } -} diff --git a/tile_engine/ops/gemm/configs/user_provided_config.json b/tile_engine/ops/gemm/configs/user_provided_config.json new file mode 100644 index 0000000000..6a6e726e40 --- /dev/null +++ b/tile_engine/ops/gemm/configs/user_provided_config.json @@ -0,0 +1,116 @@ +{ + "problem": { + "layout_a": { + "values": [ + "r" + ] + }, + "layout_b": { + "values": [ + "c" + ] + }, + "layout_c": { + "values": [ + "r" + ] + }, + "datatype_a": { + "values": [ + "fp16" + ] + }, + "datatype_b": { + "values": [ + "fp16" + ] + }, + "datatype_c": { + "values": [ + "fp16" + ] + } + }, + "tile_config": { + "tile_m": { + "values": [ + 128 + ] + }, + "tile_n": { + "values": [ + 128 + ] + }, + "tile_k": { + 
"values": [ + 32 + ] + }, + "warp_m": { + "values": [ + 2 + ] + }, + "warp_n": { + "values": [ + 2 + ] + }, + "warp_k": { + "values": [ + 1 + ] + }, + "warp_tile_m": { + "values": [ + 32 + ] + }, + "warp_tile_n": { + "values": [ + 32 + ] + }, + "warp_tile_k": { + "values": [ + 16 + ] + } + }, + "trait_config": { + "pipeline": { + "values": [ + "compv3", + "mem" + ] + }, + "scheduler": { + "values": [ + "intrawave", + "interwave" + ] + }, + "epilogue": { + "values": [ + "default", + "cshuffle" + ] + }, + "pad_m": { + "values": [ + false + ] + }, + "pad_n": { + "values": [ + false + ] + }, + "pad_k": { + "values": [ + false + ] + } + } +} \ No newline at end of file diff --git a/tile_engine/ops/gemm/gemm_host_api.cpp b/tile_engine/ops/gemm/gemm_host_api.cpp deleted file mode 100755 index a5447cd658..0000000000 --- a/tile_engine/ops/gemm/gemm_host_api.cpp +++ /dev/null @@ -1,192 +0,0 @@ -// SPDX-License-Identifier: MIT -// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved. 
- -#include "ck_tile/host.hpp" -#include "gemm_common.hpp" -#include "gemm_dispatcher.hpp" -#include "gemm_host_api.hpp" - -void gemm_kernel_launch(ck_tile::DeviceMem& c_m_n_dev_buf, - ck_tile::HostTensor& c_m_n_host_result, - ck_tile::HostTensor& c_m_n_dev_result, - int verify, - bool structured_sparsity, - KernelTraits& trait, - ck_tile::GemmHostArgs& args, - const ck_tile::stream_config& stream) -{ - return GemmDispatcher::dispatch(c_m_n_dev_buf, - c_m_n_host_result, - c_m_n_dev_result, - verify, - structured_sparsity, - trait, - args, - stream); -} - -template -void run(const ck_tile::ArgParser& arg_parser) -{ - const ALayout a_layout = ALayout{}; - const BLayout b_layout = BLayout{}; - - ck_tile::index_t kbatch = arg_parser.get_int("split_k"); - ck_tile::index_t M = arg_parser.get_int("m"); - ck_tile::index_t N = arg_parser.get_int("n"); - ck_tile::index_t K = arg_parser.get_int("k"); - - ck_tile::index_t stride_A = arg_parser.get_int("stride_a"); - ck_tile::index_t stride_B = arg_parser.get_int("stride_b"); - ck_tile::index_t stride_C = arg_parser.get_int("stride_c"); - - int n_warmup = arg_parser.get_int("warmup"); - int n_repeat = arg_parser.get_int("repeat"); - int verify = arg_parser.get_int("v"); - ck_tile::index_t init_method = arg_parser.get_int("init"); - bool structured_sparsity = arg_parser.get_bool("structured_sparsity"); - - stride_A = ck_tile::get_default_stride(M, K, stride_A, is_row_major(a_layout)); - stride_B = ck_tile::get_default_stride(K, N, stride_B, is_row_major(b_layout)); - stride_C = ck_tile::get_default_stride(M, N, stride_C, is_row_major(CLayout{})); - - ck_tile::HostTensor a_m_k( - ck_tile::host_tensor_descriptor(M, K, stride_A, is_row_major(a_layout))); - ck_tile::HostTensor b_k_n( - ck_tile::host_tensor_descriptor(K, N, stride_B, is_row_major(b_layout))); - ck_tile::HostTensor c_m_n_dev_result( - ck_tile::host_tensor_descriptor(M, N, stride_C, is_row_major(CLayout{}))); - - if(init_method == 0) - { - 
ck_tile::FillUniformDistribution{-1.f, 1.f}(a_m_k); - ck_tile::FillUniformDistribution{-1.f, 1.f}(b_k_n); - } - else if(init_method == 1) - { - ck_tile::FillMonotonicSeq{}(a_m_k); - ck_tile::FillMonotonicSeq{}(b_k_n); - } - else if(init_method == 2) - { - ck_tile::FillConstant{static_cast(1)}(a_m_k); - ck_tile::FillConstant{static_cast(1)}(b_k_n); - } - else - { - a_m_k.SetZero(); - b_k_n.SetZero(); - } - - if(structured_sparsity) - { - ck_tile::AdjustToStructuredSparsity{}(a_m_k); - } - - ck_tile::DeviceMem a_m_k_dev_buf(a_m_k.get_element_space_size_in_bytes()); - ck_tile::DeviceMem b_k_n_dev_buf(b_k_n.get_element_space_size_in_bytes()); - ck_tile::DeviceMem c_m_n_dev_buf(c_m_n_dev_result.get_element_space_size_in_bytes()); - - if constexpr(std::is_same_v) - { - // Permute vector pk_i4x4 data for device implementation - ck_tile::HostTensor b_k_n_dev = b_k_n; - // permute_tensor_b(b_k_n_dev); - permute_vectors_i4x4_b(b_k_n_dev); - b_k_n_dev_buf.ToDevice(b_k_n_dev.data()); - } - else - { - b_k_n_dev_buf.ToDevice(b_k_n.data()); - } - - a_m_k_dev_buf.ToDevice(a_m_k.data()); - c_m_n_dev_buf.SetZero(); - c_m_n_dev_result.SetZero(); - - ck_tile::GemmHostArgs gemm_args; - gemm_args.a_ptr = a_m_k_dev_buf.GetDeviceBuffer(); - gemm_args.b_ptr = b_k_n_dev_buf.GetDeviceBuffer(); - gemm_args.c_ptr = c_m_n_dev_buf.GetDeviceBuffer(); - gemm_args.k_batch = kbatch; - gemm_args.M = M; - gemm_args.N = N; - gemm_args.K = K; - gemm_args.stride_A = stride_A; - gemm_args.stride_B = stride_B; - gemm_args.stride_C = stride_C; - - KernelTraits trait; - trait.pipeline = arg_parser.get_str("pipeline"); - trait.scheduler = arg_parser.get_str("scheduler"); - trait.epilogue = arg_parser.get_str("epilogue"); - trait.kPadM = arg_parser.get_bool("pad_m"); - trait.kPadN = arg_parser.get_bool("pad_n"); - trait.kPadK = arg_parser.get_bool("pad_k"); - - std::cout << "Run Gemm kernel with M =" << M << " N =" << N << " K =" << K - << " StrideA =" << stride_A << " StrideB =" << stride_B << " StrideC =" << 
stride_C - << " A_Layout =" << ALayout::name << " B_Layout =" << BLayout::name - << " C_Layout =" << CLayout::name << " A Type = " << DataTypeTraits::name - << " B Type = " << DataTypeTraits::name - << " C Type = " << DataTypeTraits::name << std::endl; - - ck_tile::HostTensor c_m_n_host_result( - ck_tile::host_tensor_descriptor(M, N, stride_C, is_row_major(CLayout{}))); - - if(verify) - { - gemm_host_reference(verify, - a_m_k, - b_k_n, - c_m_n_host_result, - a_m_k_dev_buf, - b_k_n_dev_buf, - M, - N, - K, - stride_A, - stride_B, - stride_C); - } - - gemm_kernel_launch(c_m_n_dev_buf, - c_m_n_host_result, - c_m_n_dev_result, - verify, - structured_sparsity, - trait, - gemm_args, - ck_tile::stream_config{nullptr, true, 1, n_warmup, n_repeat}); - - return; -} - -int main(int argc, char* argv[]) -{ - try - { - auto [result, parser] = create_args(argc, argv); - if(!result) - return EXIT_FAILURE; - run(parser); - return 0; - } - catch(const std::exception& e) - { - std::cerr << "Error: " << e.what() << "\n"; - return EXIT_FAILURE; - } -} diff --git a/tile_engine/ops/gemm/gemm_host_api.hpp b/tile_engine/ops/gemm/gemm_host_api.hpp index 579d2770db..8cbc3f26f6 100755 --- a/tile_engine/ops/gemm/gemm_host_api.hpp +++ b/tile_engine/ops/gemm/gemm_host_api.hpp @@ -1,16 +1,15 @@ // SPDX-License-Identifier: MIT // Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved. -#include +#pragma once #include -#include -#include #include #include -#include "ck_tile/ops/gemm.hpp" -#pragma once +#include "ck_tile/host.hpp" +#include "gemm_dispatcher.hpp" +#include "gemm_common.hpp" template struct DataTypeTraits; @@ -57,24 +56,6 @@ struct DataTypeTraits static constexpr const char* name = "pk_int4_t"; }; -/// @brief Defines the configuration parameters for a GEMM operation, enabling the selection of a -/// specific kernel instance based on the provided settings. -struct KernelTraits -{ - /// @brief The name of the pipeline. 
- std::string pipeline; - /// @brief The name of the scheduler (e.g., "intrawave", "interwave"). - std::string scheduler; - /// @brief The name of the epilogue (e.g., "cshuffle", "default"). - std::string epilogue; - /// @brief Indicates whether padding is applied to the M dimension. - bool kPadM; - /// @brief Indicates whether padding is applied to the N dimension. - bool kPadN; - /// @brief Indicates whether padding is applied to the K dimension. - bool kPadK; -}; - template static constexpr inline auto is_row_major(Layout layout_) { @@ -82,49 +63,71 @@ static constexpr inline auto is_row_major(Layout layout_) ck_tile::tensor_layout::gemm::RowMajor>>{}; } -template -auto calculate_rtol_atol(const ck_tile::index_t K, - const ck_tile::index_t kbatch, - const float max_accumulated_value) -{ - using ComputeType = - std::conditional_t; - // Calculate thresholds - const auto rtol = ck_tile::get_relative_threshold( - ck_tile::integer_divide_ceil(K, kbatch)); - const auto atol = ck_tile::get_absolute_threshold( - max_accumulated_value / kbatch, ck_tile::integer_divide_ceil(K, kbatch)); - // Calculate error due to split_k accumulation - const auto rtol_split_k = - ck_tile::get_relative_threshold(kbatch); - const auto atol_split_k = ck_tile::get_absolute_threshold( - max_accumulated_value, kbatch); - // Use higher threshold - return ck_tile::make_tuple(std::max(rtol, rtol_split_k), std::max(atol, atol_split_k)); -} - inline auto create_args(int argc, char* argv[]) { ck_tile::ArgParser arg_parser; - arg_parser.insert("m", "3840", "m dimension") - .insert("n", "4096", "n dimension") - .insert("k", "2048", "k dimension") - .insert("stride_a", "0", "Tensor A stride") - .insert("stride_b", "0", "Tensor B stride") - .insert("stride_c", "0", "Tensor C stride") - .insert("split_k", "1", "splitK value") - .insert("v", "2", "0. No validation, 1. Validation on CPU, 2. 
Validation on GPU") - .insert("warmup", "50", "number of iterations before benchmark the kernel") - .insert("repeat", "100", "number of iterations to benchmark the kernel") - .insert("timer", "gpu", "gpu:gpu timer, cpu:cpu timer") - .insert("init", "0", "0:random, 1:linear, 2:constant(1)") - .insert("structured_sparsity", "0", "0:false, 1:true") - .insert("pipeline", "compv3", "compv3, compv4, mem") - .insert("scheduler", "intrawave", "intrawave, interwave") - .insert("epilogue", "cshuffle", "cshuffle, default") - .insert("pad_m", "false", "true, false") - .insert("pad_n", "false", "true, false") - .insert("pad_k", "false", "true, false"); + arg_parser.insert("m", "3840", "The value for m dimension. Default is 3840.") + .insert("n", "4096", "The value for n dimension. Default is 4096.") + .insert("k", "2048", "The value for k dimension. Default is 2048.") + .insert("stride_a", "0", "The stride value for tensor A. Default is 0.") + .insert("stride_b", "0", "The stride value for tensor B. Default is 0.") + .insert("stride_c", "0", "The stride value for tensor C Default is 0.") + .insert("split_k", "1", "The split value for k dimension. Default is 1.") + .insert("verify", + "2", + "The type of validation. Set to 0 for no validation, 1 for validation on CPU, or 2 " + "for validation on GPU. Default is 2, validation on GPU.") + .insert("log", + "false", + "Wether output kernel instance information or not. Possible values are true or " + "false. Default is false") + .insert( + "warmup", "50", "The number of iterations before benchmark the kernel. Default is 50.") + .insert( + "repeat", "100", "The number of iterations to benchmark the kernel. Default is 100.") + .insert("timer", + "true", + "Whether if the timer is gpu timer or not. Possible values are false or true. " + "Default is true.") + .insert("init", + "0", + "The method of tensor initialization. Set to 0 for random, to 1 for linear, or 2 " + "for constant(1). 
Default is 0, random.") + .insert("metric", + "0", + "Metric with which to measure kernel performance. Set to 0 for latency, 1 for " + "tflops, or 2 for bandwidth. Default is 0, latency.") + .insert("csv_filename", + "gemm_kernel", + "The filename of benchmark result. Default is gemm_kernel.") + .insert("structured_sparsity", + "false", + "Whether use sparsity kernel or not. Possible values are true or false. Default is " + "false") + .insert( + "pipeline", + "compv3", + "The type of pipeline. Possible values are compv3, compv4 or mem. Default is compv3.") + .insert("scheduler", + "intrawave", + "The type of pipeline. Possible values are compv3, compv4 or mem. Default is " + "compv3.") + .insert( + "epilogue", + "cshuffle", + "The type of epilogue. Possible values are cshuffle or default. Default is csshuffle.") + .insert("pad_m", + "false", + "Whether pad or not in m direction. Possible values are true or false. Default is " + "false.") + .insert("pad_n", + "false", + "Whether pad or not in n direction. Possible values are true or false. Default is " + "false.") + .insert("pad_k", + "false", + "Whether pad or not in k direction. Possible values are true or false. 
Default is " + "false."); bool result = arg_parser.parse(argc, argv); return std::make_tuple(result, arg_parser); @@ -185,79 +188,17 @@ void permute_vectors_i4x4_b(Tensor& tensor) } } -/// @brief Function to compare the results of the device and host computations -void compare(ck_tile::index_t K, - ck_tile::index_t kbatch, - ck_tile::HostTensor& c_m_n_dev_result, - ck_tile::HostTensor& c_m_n_host_result) +auto get_kernel_func_by_trait(const ck_tile::ArgParser& arg_parser) { - const float max_accumulated_value = - *std::max_element(c_m_n_host_result.mData.begin(), c_m_n_host_result.mData.end()); - const auto rtol_atol = calculate_rtol_atol( - K, kbatch, max_accumulated_value); - bool pass = ck_tile::check_err(c_m_n_dev_result, - c_m_n_host_result, - "Error: Incorrect results!", - rtol_atol.at(ck_tile::number<0>{}), - rtol_atol.at(ck_tile::number<1>{})); + KernelTraits trait; + trait.pipeline = arg_parser.get_str("pipeline"); + trait.scheduler = arg_parser.get_str("scheduler"); + trait.epilogue = arg_parser.get_str("epilogue"); + trait.pad_m = arg_parser.get_bool("pad_m"); + trait.pad_n = arg_parser.get_bool("pad_n"); + trait.pad_k = arg_parser.get_bool("pad_k"); - std::cout << "Relative error threshold: " << rtol_atol.at(ck_tile::number<0>{}) - << " Absolute error threshold: " << rtol_atol.at(ck_tile::number<1>{}) << std::endl; - std::cout << "The verification result is:" << (pass ? 
"correct" : "fail") << std::endl; -} - -/// @brief Function to get the kernel output with reference implementation on CPU/GPU -template -void gemm_host_reference(int verify, - ck_tile::HostTensor& a_m_k, - ck_tile::HostTensor& b_k_n, - ck_tile::HostTensor& c_m_n_host_result, - ck_tile::DeviceMem& a_m_k_dev_buf, - ck_tile::DeviceMem& b_k_n_dev_buf, - ck_tile::index_t M, - ck_tile::index_t N, - ck_tile::index_t K, - ck_tile::index_t stride_A, - ck_tile::index_t stride_B, - ck_tile::index_t stride_C) -{ - if(verify == 1) - { - c_m_n_host_result.SetZero(); - - ck_tile::reference_gemm( - a_m_k, b_k_n, c_m_n_host_result); - } - else if(verify == 2) - { - if constexpr(std::is_same_v) - { - // Restore input for B for gpu reference - b_k_n_dev_buf.ToDevice(b_k_n.data()); - } - - ck_tile::DeviceMem c_m_n_gpu_buf_ref(c_m_n_host_result.get_element_space_size_in_bytes()); - c_m_n_host_result.SetZero(); - c_m_n_gpu_buf_ref.SetZero(); - - ADataType* d_A = static_cast(a_m_k_dev_buf.GetDeviceBuffer()); - BDataType* d_B = static_cast(b_k_n_dev_buf.GetDeviceBuffer()); - CDataType* d_C = static_cast(c_m_n_gpu_buf_ref.GetDeviceBuffer()); - - ck_tile::reference_gemm_gpu(d_A, d_B, d_C, M, N, K, stride_A, stride_B, stride_C); - - c_m_n_gpu_buf_ref.FromDevice(c_m_n_host_result.data()); - } + bool structured_sparsity = arg_parser.get_bool("structured_sparsity"); + + return GemmDispatcher::dispatch(structured_sparsity, trait); } diff --git a/tile_engine/ops/gemm/gemm_instance_builder.py b/tile_engine/ops/gemm/gemm_instance_builder.py index dd8b4d1157..c43797f3e0 100755 --- a/tile_engine/ops/gemm/gemm_instance_builder.py +++ b/tile_engine/ops/gemm/gemm_instance_builder.py @@ -1,385 +1,199 @@ # SPDX-License-Identifier: MIT # Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved. 
-# generate kernel instances to speed up compilation + +# -*- coding: utf-8 -*- + +""" +generate kernel instances to speed up compilation +""" import argparse -from enum import IntEnum -from pathlib import Path -import sys -from typing import List, Optional, Dict, Any -import functools import itertools -import copy -import json -from dataclasses import dataclass - -DATA_TYPE_MAP = {'fp32' : 'float', - 'fp16' : 'ck_tile::half_t', - 'bf16' : 'ck_tile::bf16_t', - 'int8' : 'ck_tile::int8_t', - 'fp8' : 'ck_tile::fp8_t', - 'bf8' : 'ck_tile::bf8_t', - 'int4' : 'ck_tile::pk_int4_t' - } +from pathlib import Path +from typing import List, Optional +from json_config import GemmConfig, RangeConfigParam +from codegen_utils import ( + DATA_TYPE_MAP, + LAYOUT_MAP, + DEFAULT_EPILOGUE, + CSHUFFLE_EPILOGUE, + HOT_LOOP_FALSE, + RUN_MEM, + RUN_COMPV3, + RUN_COMPV4, + PIPELINE_MAP, + SCHEDULER_MAP, + EPILOGUE_MAP, + HOT_LOOP_TRUE, + BOOL_MAP, + warp_tile_supported_combinations, + trait_unsupported_combinations, + element_size, + get_gpu_name_by_id +) +import logging +import time -LAYOUT_MAP = {'r' : 'ck_tile::tensor_layout::gemm::RowMajor', - 'c' : 'ck_tile::tensor_layout::gemm::ColumnMajor'} - - -warp_tile_combinations_map = { - "gfx90a": { - 'fp16': [[32, 32, 8], [16, 16, 16], [32, 32, 16], [16, 16, 32], [4, 64, 16], [64, 4, 16]], - 'bf16': [[32, 32, 8], [16, 16, 16], [32, 32, 16], [16, 16, 32], [4, 64, 16], [64, 4, 16]], - 'fp8': [[32, 32, 16], [32, 32, 32]], - 'bf8': [[32, 32, 16], [32, 32, 32]] - }, - "gfx942": { - 'fp16': [[32, 32, 8], [16, 16, 16], [32, 32, 16], [16, 16, 32], [4, 64, 16], [64, 4, 16]], - 'bf16': [[32, 32, 8], [16, 16, 16], [32, 32, 16], [16, 16, 32], [4, 64, 16], [64, 4, 16]], - 'fp8': [[32, 32, 16], [32, 32, 32], [16, 16, 32], [16, 16, 64]], - 'bf8': [[32, 32, 16], [32, 32, 32], [16, 16, 64], [16, 16, 32]] - }, - "gfx950": { - 'fp16': [[32, 32, 8], [16, 16, 16], [32, 32, 16], [16, 16, 32], [4, 64, 16], [64, 4, 16]], - 'bf16': [[32, 32, 8], [16, 16, 16], [32, 
32, 16], [16, 16, 32], [4, 64, 16], [64, 4, 16]], - 'fp8': [[32, 32, 16], [32, 32, 32], [16, 16, 32], [16, 16, 64], [16, 16, 128], [32, 32, 64]], - 'bf8': [[32, 32, 16], [32, 32, 32], [16, 16, 64], [16, 16, 32], [16, 16, 128], [32, 32, 64]] - } - } - -def sizeOf(data_type): - if data_type == 'fp16' or data_type == 'bf16': - return 2 - elif data_type == 'int8' or data_type == 'fp8' or data_type == 'bf8': - return 1 - elif data_type == 'int4': ## TODO:: needs to confirm - return 0.5 - else: - return 4 - -DEFAULT_EPILOGUE = """ - using GemmEpilogue = ck_tile::DefaultGemm2DEpilogue< - ck_tile::DefaultGemm2DEpilogueProblem>; -""" - -CSHUFFLE_EPILOGUE = """ - using GemmEpilogue = ck_tile::CShuffleEpilogue< - ck_tile::CShuffleEpilogueProblem>; -""" -HOT_LOOP_FALSE = """ - if(tail_num == ck_tile::TailNumber::Full) - { - RunSplitk(ck_tile::bool_constant{}, - ck_tile::integral_constant{}); - } - else if(tail_num == ck_tile::TailNumber::Odd) - { - RunSplitk(ck_tile::bool_constant{}, - ck_tile::integral_constant{}); - } - else if(tail_num == ck_tile::TailNumber::Even) - { - RunSplitk(ck_tile::bool_constant{}, - ck_tile::integral_constant{}); - } - else - { - throw std::runtime_error("Num K loop must be larger than number of prefetech stages."); - } -""" -RUN_MEM = """ - // Handle One and Full cases directly - if (tail_num == ck_tile::TailNumber::One) { - RunSplitk(ck_tile::bool_constant{}, - ck_tile::integral_constant{}); - } else if (tail_num == ck_tile::TailNumber::Full) { - RunSplitk(ck_tile::bool_constant{}, - ck_tile::integral_constant{}); - } - // Variadic call using fold expression - auto check_tail = [&](auto... 
TNs) { - (try_run< BaseGemmPipeline, decltype(TNs)::value>(tail_num), ...); - }; - - check_tail( - ck_tile::integral_constant{}, - ck_tile::integral_constant{}, - ck_tile::integral_constant{}, - ck_tile::integral_constant{}, - ck_tile::integral_constant{}, - ck_tile::integral_constant{} - ); -""" - -RUN_COMPV3 = """ - if(tail_num == ck_tile::TailNumber::Full) - { - RunSplitk(ck_tile::bool_constant{}, - ck_tile::integral_constant{}); - } - else if(tail_num == ck_tile::TailNumber::Odd) - { - RunSplitk(ck_tile::bool_constant{}, - ck_tile::integral_constant{}); - } - else if(tail_num == ck_tile::TailNumber::Even) - { - RunSplitk(ck_tile::bool_constant{}, - ck_tile::integral_constant{}); - } - else - { - throw std::runtime_error("The tail number is wrong. It should be Full, Odd, or Even."); - } -""" - -RUN_COMPV4 = """ - if(tail_num == ck_tile::TailNumber::Three) - { - RunSplitk(ck_tile::bool_constant{}, - ck_tile::integral_constant{}); - } - else - { - RunSplitk(ck_tile::bool_constant{}, - ck_tile::integral_constant{}); - } -""" - - -PIPELINE_MAP = {'mem' : ['ck_tile::BaseGemmPipelineAgBgCrMem', 'ck_tile::GemmPipelineAgBgCrMem'], - 'compv3' : ['ck_tile::BaseGemmPipelineAgBgCrCompV3', 'ck_tile::GemmPipelineAgBgCrCompV3'], - 'compv4' : ['ck_tile::BaseGemmPipelineAgBgCrCompV4', 'ck_tile::GemmPipelineAgBgCrCompV4']} - -SCHEDULER_MAP = {'interwave' : 'ck_tile::GemmPipelineScheduler::Interwave', - 'intrawave' : 'ck_tile::GemmPipelineScheduler::Intrawave'} - -EPILOGUE_MAP = {'default' :DEFAULT_EPILOGUE, - 'cshuffle' : CSHUFFLE_EPILOGUE} - -HOT_LOOP_TRUE = {'mem' : RUN_MEM, - 'compv3' : RUN_COMPV3, - 'compv4' : RUN_COMPV4} - - -def BOOL_MAP(b_) -> str: - if b_: - return 'true' - else: - return 'false' - -@dataclass -class GemmConfig: - def __init__(self, config_data): - self.matrix_cfg : Dict[str, Any] = {} - self.impl_cfg : Dict[str, Any] = {} - for key, value in config_data.items(): - if key in ["architecture", "datatype", "layout_a", "layout_b", "layout_c"]: - 
self.matrix_cfg[key] = value - else: - self.impl_cfg[key] = value - - @property - def architecture(self) -> str: - return self.matrix_cfg["architecture"]["values"][0] - - @property - def datatype(self) -> str: - return self.matrix_cfg["datatype"]["values"][0] - - @property - def layouts(self) -> List[str]: - return [ - self.matrix_cfg["layout_a"]["values"][0], - self.matrix_cfg["layout_b"]["values"][0], - self.matrix_cfg["layout_c"]["values"][0] - ] +logging.basicConfig(level=logging.INFO) class GemmCodeGenerator: - def __init__(self, output_dir: str, config: GemmConfig): + """GEMM (General Matrix Multiplication) code generator.""" + + def __init__(self, output_dir: str, + user_provided_config: Optional[GemmConfig] = None): self.output_dir = Path(output_dir) - if not self.output_dir.exists(): - self.output_dir.mkdir() + self.output_dir.mkdir(parents=True, exist_ok=True) - self.config = config - self.all_kernels = [] - self.unique_configs = [] - # Validate configurations - self._validate_config() + if user_provided_config is not None: + self.config = user_provided_config + else: + config_path = Path(__file__).resolve().parent / \ + "configs" / "default_config.json" + self.config = GemmConfig.from_json(config_path) - def _validate_config(self): - """Validate matrix and implementation configurations""" - # Matrix config validation - for param in ["architecture", "datatype", "layout_a", "layout_b", "layout_c"]: - if len(self.config.matrix_cfg[param]["values"]) != 1: - raise ValueError(f"Matrix config {param} must have exactly one value") - - # Implementation traits validation - required_params = ["tile_m", "tile_n", "tile_k", "warp_m", "warp_n", "warp_k", - "warp_tile_m", "warp_tile_n", "warp_tile_k", "pipeline", - "epilogue", "scheduler", "kPadM", "kPadN", "kPadK"] - for param in required_params: - if not self.config.impl_cfg.get(param, {}).get("values"): - raise ValueError(f"Missing implementation parameter: {param}") + self.valid_trait_names: List[str] = [] + 
self.valid_trait_tile_combinations: map[str, list[tuple[int]]] = {} - def list_all(self): - """List all possible kernel configurations""" + def list_all_trait_names(self): + """List all possible kernel trait names into file.""" w_p = Path(self.output_dir) - list_p = w_p / 'gemm_instance_blobs.txt' - self._list_config_groups() - with list_p.open('w') as list_f: - list_f.write(str(w_p / ("gemm_common.hpp")) + "\n") - list_f.write(str(w_p / ("gemm_instances.hpp")) + "\n") - list_f.write(str(w_p / ("gemm_dispatcher.hpp")) + "\n") - for group in self.all_kernels: - list_f.write(str(w_p / ("gemm_" + group + ".hpp")) + "\n") - + file_path = w_p / 'gemm_instance_blobs.txt' + self._generate_all_traits() + self._get_valid_trait_tile_combinations() + # Write all file paths to the header file + with file_path.open('w') as f: + f.write(str(w_p / "gemm_common.hpp") + "\n") + f.write(str(w_p / "gemm_instances.hpp") + "\n") + f.write(str(w_p / "gemm_dispatcher.hpp") + "\n") + for trait in self.valid_trait_names: + f.write(str(w_p / f"gemm_{trait}.hpp") + "\n") + for trait, tile_valid_params in self.valid_trait_tile_combinations.items(): + for tile in tile_valid_params: + for tile_m, tile_n, tile_k, warp_m, warp_n, warp_k, warp_tile_m, warp_tile_n, warp_tile_k in tile: + sparse = self.config.problem.datatype_map['matrix_a'] == 'fp16' and \ + self.config.problem.datatype_map['matrix_b'] == 'fp16' and \ + self.config.problem.datatype_map['matrix_c'] == 'fp16' and \ + ((warp_tile_m == 32 and warp_tile_n == 32 and warp_tile_k == 16) or + (warp_tile_m == 16 and warp_tile_n == 16 and warp_tile_k == 32)) + if sparse: + f.write(str( + w_p / f"gemm_{trait}_{tile_m}x{tile_n}x{tile_k}_{warp_m}x{warp_n}x{warp_k}_{warp_tile_m}x{warp_tile_n}x{warp_tile_k}_true.cpp") + "\n") + f.write(str( + w_p / f"gemm_{trait}_{tile_m}x{tile_n}x{tile_k}_{warp_m}x{warp_n}x{warp_k}_{warp_tile_m}x{warp_tile_n}x{warp_tile_k}_false.cpp") + "\n") - def _list_config_groups(self): + def _generate_all_traits(self): + 
"""Generate all possible kernel traits names.""" params = [ - ("pipeline", "pipeline"), - ("epilogue", "epilogue"), - ("scheduler", "scheduler"), - ("kPadM", "kPadM"), - ("kPadN", "kPadN"), - ("kPadK", "kPadK") - ] - + "pipeline", + "epilogue", + "scheduler", + "pad_m", + "pad_n", + "pad_k"] + # Generate all unique_combinations - _unique = set(itertools.product(*[self.config.impl_cfg[p]["values"] for (p, _) in params])) + _unique = set(itertools.product(*[ + getattr(self.config.trait_config, param).values + for param in params + ])) + for combo in _unique: - config = {name: value for (_, name), value in zip(params, combo)} - pipeline, epilogue, scheduler, kPadM, kPadN, kPadK = config.values() - # To remove some unsupported combinations - unsupported_combination = [("compv3", "cshuffle", "interwave"), - ("compv3", "default", "interwave"), - ("compv4", "cshuffle", "interwave"), - ("compv4", "default", "interwave")] - if (pipeline, epilogue, scheduler) not in unsupported_combination: - group_name = f"{pipeline}_{epilogue}_{scheduler}_pad_{BOOL_MAP(kPadM)}_{BOOL_MAP(kPadN)}_{BOOL_MAP(kPadK)}" - self.all_kernels.append(group_name) - self.unique_configs.append(config) + pipeline, epilogue, scheduler, pad_m, pad_n, pad_k = combo + current_combination = (pipeline, epilogue, scheduler) - def generate_all(self): - self._generate_common_header() - self._generate_config_groups() - self._generate_dispatcher() - + if current_combination not in trait_unsupported_combinations: + trait_name = ( + f"{pipeline}_{epilogue}_{scheduler}_" + f"{BOOL_MAP(pad_m)}_{BOOL_MAP(pad_n)}_{BOOL_MAP(pad_k)}" + ) + self.valid_trait_names.append(trait_name) + else: + logging.debug( + f"Invalid combination: {pipeline}-{epilogue}-{scheduler}" + ) - def _generate_common_header(self): - """Generate common header with datatypes and layout""" - self.ctype = self.config.datatype - self.atype = self.config.datatype - self.btype = self.config.datatype - if self.config.datatype in ['fp8', 'bf8']: - self.ctype 
= 'fp16' - elif self.config.datatype in ['int4']: - self.atype = 'fp16' - self.ctype = 'fp16' + def generate_all_instance_files(self): + """Generate all kernel instances files.""" + self._generate_common_header_file() + self._generate_all_trait_files() + self._generate_dispatcher_file() + + def _generate_common_header_file(self): + """Generate common header file with datatypes and layout.""" content = f"""// SPDX-License-Identifier: MIT // Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved. #pragma once + #include "ck_tile/core.hpp" +#include "ck_tile/ops/common.hpp" // Data types -using ADataType = {DATA_TYPE_MAP[self.atype]}; -using BDataType = {DATA_TYPE_MAP[self.btype]}; +using ADataType = {DATA_TYPE_MAP[self.config.problem.datatype_map['matrix_a']]}; +using BDataType = {DATA_TYPE_MAP[self.config.problem.datatype_map['matrix_b']]}; using AccDataType = float; -using CDataType = {DATA_TYPE_MAP[self.ctype]}; +using CDataType = {DATA_TYPE_MAP[self.config.problem.datatype_map['matrix_c']]}; // Layout configurations -using ALayout = {LAYOUT_MAP[self.config.layouts[0]]}; -using BLayout = {LAYOUT_MAP[self.config.layouts[1]]}; -using CLayout = {LAYOUT_MAP[self.config.layouts[2]]}; +using ALayout = {LAYOUT_MAP[self.config.problem.layout_map['matrix_a']]}; +using BLayout = {LAYOUT_MAP[self.config.problem.layout_map['matrix_b']]}; +using CLayout = {LAYOUT_MAP[self.config.problem.layout_map['matrix_c']]}; """ - (self.output_dir / "gemm_common.hpp").write_text(content) - def _generate_config_groups(self): - """Generate implementation configuration groups""" - if not self.unique_configs: # Check if the list is empty - self._list_config_groups() - for config in self.unique_configs: - self._generate_config_group(**config) - self.generate_common_instances_header() + def _generate_all_trait_files(self): + """Generate all kernel traits into files.""" + if not self.valid_trait_names: + self._generate_all_traits() + self._get_valid_trait_tile_combinations() + for 
trait in self.valid_trait_names: + self._generate_trait_file(trait) + self._generate_instantiation_source_files() + self._generate_common_instance_header_file() - - def _generate_config_group(self, pipeline: str, epilogue: str, scheduler: str, - kPadM: bool, kPadN: bool, kPadK: bool): - """Generate a configuration group with all tile/warp combinations""" - group_name = f"{pipeline}_{epilogue}_{scheduler}_pad_{BOOL_MAP(kPadM)}_{BOOL_MAP(kPadN)}_{BOOL_MAP(kPadK)}" - filename = f"gemm_{group_name}.hpp" + def _generate_trait_file(self, trait: str): + """Generate a trait with all tile/warp combinations.""" + pipeline, epilogue, scheduler, pad_m, pad_n, pad_k = trait.split("_") + filename = f"gemm_{trait}.hpp" content = f"""// SPDX-License-Identifier: MIT // Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved. +#pragma once + #include "gemm_common.hpp" #include "ck_tile/ops/gemm.hpp" #include "ck_tile/ops/epilogue.hpp" #include "ck_tile/host.hpp" -namespace {group_name} {{ +namespace {trait} {{ """ # Add template struct with configuration - content += self._generate_kernel_struct(pipeline, epilogue, scheduler, kPadM, kPadN, kPadK) + content += self._generate_kernel_struct( + pipeline, epilogue, scheduler, pad_m, pad_n, pad_k) - content += f"\n}} // namespace {group_name}\n" + content += f"\n}} // namespace {trait}\n" (self.output_dir / filename).write_text(content) def _generate_kernel_struct(self, pipeline: str, epilogue: str, scheduler: str, - kPadM: bool, kPadN: bool, kPadK: bool) -> str: - """Generate kernel struct template""" + pad_m: str, pad_n: str, pad_k: str) -> str: + """Generate the code block of kernel struct""" return f""" -template -void try_run(ck_tile::TailNumber tn) {{ - if constexpr (Pipeline::PrefetchStages > static_cast(TN) - 1) {{ - if (tn == TN) {{ - RunSplitk(ck_tile::bool_constant{{}}, - ck_tile::integral_constant{{}}); - }} - }} -}} + template struct GemmKernel {{ - static constexpr bool kPadM = {BOOL_MAP(kPadM)}; - static 
constexpr bool kPadN = {BOOL_MAP(kPadN)}; - static constexpr bool kPadK = {BOOL_MAP(kPadK)}; - - static float launch(ck_tile::GemmHostArgs& args, const ck_tile::stream_config& s) {{ + static constexpr bool kPadM = {pad_m}; + static constexpr bool kPadN = {pad_n}; + static constexpr bool kPadK = {pad_k}; + + static float launch(ck_tile::GemmHostArgs& args, const ck_tile::stream_config& stream) {{ static constexpr bool permuteA = false; static constexpr bool permuteB = false; static constexpr bool DoubleSmemBuffer ={"true" if pipeline == "compv4" else "false"}; @@ -389,7 +203,7 @@ struct GemmKernel {{ static constexpr ck_tile::index_t TileParitionerGroupNum = 8; static constexpr ck_tile::index_t TileParitionerM01 = 4; - using GemmShape = + using GemmShape = ck_tile::TileGemmShape, ck_tile::sequence, ck_tile::sequence, @@ -403,22 +217,22 @@ struct GemmKernel {{ TileParitionerM01>; using Traits = - ck_tile::TileGemmTraits; + ck_tile::TileGemmTraits; using GemmUniversalTraits = ck_tile::TileGemmUniversalTraits; + ALayout, BLayout, CLayout, TransposeC, structured_sparsity>; using GemmPipelineProblem = ck_tile::GemmPipelineProblem; - using BaseGemmPipeline = {PIPELINE_MAP[pipeline][0]}; + using BaseGemmPipeline = {PIPELINE_MAP[pipeline][0]}; const ck_tile::index_t k_grain = args.k_batch * TileK; const ck_tile::index_t K_split = (args.K + k_grain - 1) / k_grain * TileK; const ck_tile::index_t num_loop = TilePartitioner::GetLoopNum(K_split); const bool has_hot_loop = BaseGemmPipeline::BlockHasHotloop(num_loop); - const ck_tile::TailNumber tail_num = BaseGemmPipeline::GetBlockLoopTailNum(num_loop); + const ck_tile::TailNumber tail_num = BaseGemmPipeline::GetBlockLoopTailNum(num_loop); float ave_time{{0}}; @@ -428,7 +242,7 @@ struct GemmKernel {{ constexpr auto scheduler = {SCHEDULER_MAP[scheduler]}; constexpr auto memory_operation = memory_operation_.value; - using UniversalGemmProblem = + using UniversalGemmProblem = ck_tile::UniversalGemmPipelineProblem; - using 
GemmPipeline = {PIPELINE_MAP[pipeline][1]}; + using GemmPipeline = {PIPELINE_MAP[pipeline][1]}; {EPILOGUE_MAP[epilogue]} using Kernel = ck_tile::GemmKernel; auto kargs = Kernel::MakeKernelArgs(args); @@ -451,7 +265,7 @@ struct GemmKernel {{ throw std::runtime_error("Wrong! Arguments not supported! Skipping gemm!"); }} - if(s.log_level_ > 0) + if(stream.log_level_ > 0) {{ std::cout << "Launching kernel with args:" << " grid: {{" << grids.x << ", " << grids.y << ", " << grids.z << "}}" @@ -459,7 +273,7 @@ struct GemmKernel {{ << std::endl; }} - ave_time = ck_tile::launch_kernel(s, + ave_time = ck_tile::launch_kernel(stream, ck_tile::make_kernel( Kernel{{}}, grids, blocks, 0, kargs)); return ave_time; @@ -488,206 +302,333 @@ struct GemmKernel {{ return ave_time; }} - + static std::string get_name() {{ - return std::string("GemmKernel bool: - """Check if the tile configuration is valid for the given group""" - # Extract tile parameters + def is_tile_valid(self, tile: tuple, trait: str) -> bool: + """Check if the tile configuration is valid for the given trait.""" tile_m, tile_n, tile_k, warp_m, warp_n, warp_k, warp_tile_m, warp_tile_n, warp_tile_k = tile + pipeline, *_ = trait.split("_") - # Extract the pipeline and epilogue from the group name - _, pipeline, epilogue, scheduler, *_ = group.split("_") + # Parameter validity check + invalid_params = [] + if (warp_m, warp_n, warp_k) not in [(1, 4, 1), (2, 2, 1), (4, 1, 1)]: + invalid_params.append( + f"warp_m({warp_m}) * warp_n({warp_n}) * warp_k({warp_k})") + if (warp_m * warp_tile_m) == 0: + invalid_params.append( + f"warp_m({warp_m}) * warp_tile_m({warp_tile_m})") + if (warp_n * warp_tile_n) == 0: + invalid_params.append( + f"warp_n({warp_n}) * warp_tile_n({warp_tile_n})") + if (warp_k * warp_tile_k) == 0: + invalid_params.append( + f"warp_k({warp_k}) * warp_tile_k({warp_tile_k})") - if tile_m % (warp_m * warp_tile_m) == 0 and \ - tile_n % (warp_n * warp_tile_n) == 0 and \ - tile_k % (warp_k * warp_tile_k) == 0: - 
total_tile_in_lds = (tile_m * tile_k + tile_n * tile_k ) * sizeOf(self.config.datatype) - # Validate and append valid tile parameters - is_compv4 = pipeline == "compv4" - max_tile_size = pow(2, 16) if is_compv4 else pow(2, 15) + if invalid_params: + logging.debug( + f"Trait: [{trait}], Invalid warp configuration: {', '.join(invalid_params)}. " + f"Parameter combination: warp=({warp_m},{warp_n},{warp_k}), " + f"warp_tile=({warp_tile_m},{warp_tile_n},{warp_tile_k})" + ) + return False - if total_tile_in_lds > max_tile_size: - raise ValueError(f'Total tile size should not exceed {max_tile_size / 1024}KB of LDS. ' - f'{tile_m} * {tile_n} * {tile_k} > {max_tile_size / 1024}KB') - arch = self.config.architecture - if [warp_tile_m, warp_tile_n, warp_tile_k] in warp_tile_combinations_map[arch][self.config.datatype]: - return True - return False + # Dimension alignment check + alignment_issues = [] + if tile_m % (warp_m * warp_tile_m) != 0: + alignment_issues.append( + f"tile_m({tile_m}) % [{warp_m}x{warp_tile_m}] = {tile_m % (warp_m * warp_tile_m)}") + if tile_n % (warp_n * warp_tile_n) != 0: + alignment_issues.append( + f"tile_n({tile_n}) % [{warp_n}x{warp_tile_n}] = {tile_n % (warp_n * warp_tile_n)}") + if tile_k % (warp_k * warp_tile_k) != 0: + alignment_issues.append( + f"tile_k({tile_k}) % [{warp_k}x{warp_tile_k}] = {tile_k % (warp_k * warp_tile_k)}") - def _generate_dispatcher(self): - """Generate dispatch mechanism""" - content = """// SPDX-License-Identifier: MIT + if alignment_issues: + logging.debug( + f"Trait: [{trait}], Dimension alignment failed: {', '.join(alignment_issues)}. 
" + f"Tile dimensions {tile_m}x{tile_n}x{tile_k} must be divisible by " + f"[warp]: {warp_m}x{warp_n}x{warp_k} x [warp_tile]: {warp_tile_m}x{warp_tile_n}x{warp_tile_k}" + ) + return False + + # LDS capacity verification + matrix_a_size = (tile_m * tile_k) * \ + pow(2, element_size(self.config.problem.datatype_map['matrix_a'])) + matrix_b_size = (tile_n * tile_k) * \ + pow(2, element_size(self.config.problem.datatype_map['matrix_b'])) + total_tile_in_lds = matrix_a_size + matrix_b_size + + max_tile_size = 2**16 if pipeline == "compv4" else 2**15 + if total_tile_in_lds > max_tile_size: + logging.debug( + f"LDS capacity exceeded [{trait}]: Total required {total_tile_in_lds:,}B ({total_tile_in_lds/1024:.1f}KB) > " + f"maximum allowed {max_tile_size:,}B ({max_tile_size/1024}KB). Breakdown:\n" + f"- Matrix A ({self.config.problem.datatype_map['matrix_a']}): {tile_m}x{tile_k} = {matrix_a_size:,}B\n" + f"- Matrix B ({self.config.problem.datatype_map['matrix_b']}): {tile_n}x{tile_k} = {matrix_b_size:,}B" + ) + return False + + # Warp combination validation + warp_tile_key = f"{self.config.problem.datatype_map['matrix_a']}_{self.config.problem.datatype_map['matrix_b']}_{self.config.problem.datatype_map['matrix_c']}" + current_combination = [warp_tile_m, warp_tile_n, warp_tile_k] + + gpu_name = get_gpu_name_by_id(0) + gpu_warp_tile_key = warp_tile_supported_combinations.get(gpu_name, {}) + if not gpu_warp_tile_key: + logging.debug( + f"Trait: [{trait}], No valid warp tile combinations found for {gpu_name}/{warp_tile_key}, skip this check.") + return False + + allowed_combinations = gpu_warp_tile_key.get(warp_tile_key, []) + if not allowed_combinations: + logging.debug( + f"Trait: [{trait}], No valid warp tile combinations found for {gpu_name}/{warp_tile_key}, skip this check.") + return False + + if current_combination not in allowed_combinations: + logging.debug( + f"Trait: [{trait}], Invalid warp combination: {current_combination} not in allowed list. 
" + f"Valid combinations for data type '{warp_tile_key}': {allowed_combinations}" + ) + return False + + return True + + def _get_valid_trait_tile_combinations(self): + def get_tile_value(tile_param): return tile_param.generate_candidates( + ) if isinstance(tile_param, RangeConfigParam) else tile_param.values + + tile_group = list(itertools.product( + get_tile_value(self.config.tile_config.tile_m), + get_tile_value(self.config.tile_config.tile_n), + get_tile_value(self.config.tile_config.tile_k) + )) + + warp_group = list(itertools.product( + get_tile_value(self.config.tile_config.warp_m), + get_tile_value(self.config.tile_config.warp_n), + get_tile_value(self.config.tile_config.warp_k) + )) + + warp_tile_group = list(itertools.product( + get_tile_value(self.config.tile_config.warp_tile_m), + get_tile_value(self.config.tile_config.warp_tile_n), + get_tile_value(self.config.tile_config.warp_tile_k) + )) + + tile_params = { + t + w + wt + for t in tile_group + for w in warp_group + for wt in warp_tile_group + } + + for trait in self.valid_trait_names: + tile_valid_params = list( + filter(lambda t: self.is_tile_valid(t, trait), tile_params)) + if trait not in self.valid_trait_tile_combinations: + self.valid_trait_tile_combinations[trait] = [] + self.valid_trait_tile_combinations[trait].append(tile_valid_params) + + def _generate_instantiation_source_files(self): + """Generate kernel instance instantiation source files """ + for trait, tile_valid_params in self.valid_trait_tile_combinations.items(): + for tile in tile_valid_params: + for tile_m, tile_n, tile_k, warp_m, warp_n, warp_k, warp_tile_m, warp_tile_n, warp_tile_k in tile: + content = f""" +// SPDX-License-Identifier: MIT // Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved. 
-#include "gemm_common.hpp" -#include "gemm_instances.hpp" -#include "gemm_host_api.hpp" + + +#include "gemm_{trait}.hpp" + +""" + sparse = self.config.problem.datatype_map['matrix_a'] == 'fp16' and \ + self.config.problem.datatype_map['matrix_b'] == 'fp16' and \ + self.config.problem.datatype_map['matrix_c'] == 'fp16' and \ + ((warp_tile_m == 32 and warp_tile_n == 32 and warp_tile_k == 16) or + (warp_tile_m == 16 and warp_tile_n == 16 and warp_tile_k == 32)) + if sparse: + sparse_content = content + f""" +template struct {trait}::GemmKernel<{tile_m}, {tile_n}, {tile_k}, {warp_m}, {warp_n}, {warp_k}, {warp_tile_m}, {warp_tile_n}, {warp_tile_k}, true>; +""" + (self.output_dir / + f"gemm_{trait}_{tile_m}x{tile_n}x{tile_k}_{warp_m}x{warp_n}x{warp_k}_{warp_tile_m}x{warp_tile_n}x{warp_tile_k}_true.cpp").write_text(sparse_content) + + no_sparse_content = content + f""" +template struct {trait}::GemmKernel<{tile_m}, {tile_n}, {tile_k}, {warp_m}, {warp_n}, {warp_k}, {warp_tile_m}, {warp_tile_n}, {warp_tile_k}, false>; +""" + (self.output_dir / + f"gemm_{trait}_{tile_m}x{tile_n}x{tile_k}_{warp_m}x{warp_n}x{warp_k}_{warp_tile_m}x{warp_tile_n}x{warp_tile_k}_false.cpp").write_text(no_sparse_content) + + def _generate_dispatcher_file(self): + """Generate the code block of dispatch mechanism.""" + content = """ +// SPDX-License-Identifier: MIT +// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + #include #include #include +#include "gemm_common.hpp" +#include "gemm_instances.hpp" + +/// @brief Defines the configuration parameters for a GEMM operation, enabling the selection of a +/// specific kernel instance based on the provided settings. +struct KernelTraits +{ + /// @brief The name of the pipeline. + std::string pipeline; + /// @brief The name of the scheduler (e.g., "intrawave", "interwave"). + std::string scheduler; + /// @brief The name of the epilogue (e.g., "cshuffle", "default"). 
+ std::string epilogue; + /// @brief Indicates whether padding is applied to the M dimension. + bool pad_m; + /// @brief Indicates whether padding is applied to the N dimension. + bool pad_n; + /// @brief Indicates whether padding is applied to the K dimension. + bool pad_k; +}; + struct GemmDispatcher { static auto& get_kernel_map() { // Use a static local variable - static std::unordered_map& c_m_n_host_result, - ck_tile::HostTensor& c_m_n_dev_result, - int verify, ck_tile::GemmHostArgs&, const ck_tile::stream_config&)>> kernel_map; + static std::unordered_map< + std::string, + std::vector(ck_tile::GemmHostArgs&, const ck_tile::stream_config&)>>> + kernel_map; return kernel_map; } static void init(bool structured_sparsity) { - auto& kernel_map = get_kernel_map(); - if(!kernel_map.empty()) return; + auto& kernel_map = get_kernel_map(); + if(!kernel_map.empty()) return; \n""" - # Add tile/warp instantiations - tile_params = set(itertools.product( - self.config.impl_cfg["tile_m"]["values"], - self.config.impl_cfg["tile_n"]["values"], - self.config.impl_cfg["tile_k"]["values"], - self.config.impl_cfg["warp_m"]["values"], - self.config.impl_cfg["warp_n"]["values"], - self.config.impl_cfg["warp_k"]["values"], - self.config.impl_cfg["warp_tile_m"]["values"], - self.config.impl_cfg["warp_tile_n"]["values"], - self.config.impl_cfg["warp_tile_k"]["values"] - )) - - for group in self.all_kernels: - content += f""" kernel_map["{group}"] = [=](ck_tile::DeviceMem& c_m_n_dev_buf, - ck_tile::HostTensor& c_m_n_host_result, - ck_tile::HostTensor& c_m_n_dev_result, - int verify, ck_tile::GemmHostArgs& args, - const ck_tile::stream_config& stream) {{ - if(structured_sparsity){{ // SMFMA""" - for tile in tile_params: - if self.is_tile_valid(tile, group): - sparse = self.atype == 'fp16' and \ - ((tile[6] == 32 and tile[7] == 32 and tile[8] == 16) or - (tile[6] == 16 and tile[7] == 16 and tile[8] == 32)) + for trait, tile_valid_params in self.valid_trait_tile_combinations.items(): + 
content += f""" kernel_map["{trait}"] = {{""" + for _, tile in enumerate(tile_valid_params): + for j in range(len(tile)): + tile_m, tile_n, tile_k, warp_m, warp_n, warp_k, warp_tile_m, warp_tile_n, warp_tile_k = tile[ + j] + content += f"""[=](ck_tile::GemmHostArgs& args, const ck_tile::stream_config& stream) {{ """ + content += f""" + if(structured_sparsity){{ // SMFMA""" + sparse = self.config.problem.datatype_map['matrix_a'] == 'fp16' and \ + self.config.problem.datatype_map['matrix_b'] == 'fp16' and \ + self.config.problem.datatype_map['matrix_c'] == 'fp16' and \ + ((warp_tile_m == 32 and warp_tile_n == 32 and warp_tile_k == 16) or + (warp_tile_m == 16 and warp_tile_n == 16 and warp_tile_k == 32)) content += f""" - run_kernel<{group}::GemmKernel<{tile[0]}, {tile[1]}, {tile[2]}, {tile[3]}, {tile[4]}, {tile[5]}, {tile[6]}, {tile[7]}, {tile[8]}, {BOOL_MAP(sparse)}>>(c_m_n_dev_buf, c_m_n_host_result, c_m_n_dev_result, verify, args, stream);""" - else: - raise ValueError(f"Invalid tile configuration for group {group}: {tile}") - content += f""" - }} else {{""" - for tile in tile_params: - if self.is_tile_valid(tile, group): + return run_kernel<{trait}::GemmKernel<{tile_m}, {tile_n}, {tile_k}, {warp_m}, {warp_n}, {warp_k}, {warp_tile_m}, {warp_tile_n}, {warp_tile_k}, {BOOL_MAP(sparse)}>>(args, stream);""" content += f""" - run_kernel<{group}::GemmKernel<{tile[0]}, {tile[1]}, {tile[2]}, {tile[3]}, {tile[4]}, {tile[5]}, {tile[6]}, {tile[7]}, {tile[8]}, {BOOL_MAP(False)}>>(c_m_n_dev_buf, c_m_n_host_result, c_m_n_dev_result, verify, args, stream);""" - else: - raise ValueError(f"Invalid tile configuration for group {group}: {tile}") + }} else {{""" + content += f""" + return run_kernel<{trait}::GemmKernel<{tile_m}, {tile_n}, {tile_k}, {warp_m}, {warp_n}, {warp_k}, {warp_tile_m}, {warp_tile_n}, {warp_tile_k}, {BOOL_MAP(False)}>>(args, stream);""" + content += f""" + }} """ + + if j == len(tile)-1: + content += f""" + }} """ + else: + content += f""" + }}, """ content += 
f""" - }} - }};\n""" + }};\n """ content += """ } - + template - static void run_kernel(ck_tile::DeviceMem& c_m_n_dev_buf, - ck_tile::HostTensor& c_m_n_host_result, - ck_tile::HostTensor& c_m_n_dev_result, - int verify, ck_tile::GemmHostArgs& args, const ck_tile::stream_config& stream) + static std::tuple run_kernel(ck_tile::GemmHostArgs& args, const ck_tile::stream_config& stream) { + std::string name = Kernel::get_name(); float avg_time = Kernel::launch(args, stream); - std::string description = Kernel::get_name(); - c_m_n_dev_buf.FromDevice(c_m_n_dev_result.data()); - std::size_t flop = std::size_t(2) * args.M * args.N * args.K; - std::size_t num_byte = sizeof(ADataType) * args.M * args.K + sizeof(BDataType) * args.N * args.K + sizeof(CDataType) * args.M * args.N; - float tflops = static_cast(flop) / 1.E9 / avg_time; - float gb_per_sec = num_byte / 1.E6 / avg_time; - - std::cout << "Performance for " << description << " : " << avg_time << " ms, " - << tflops << " TFlops, " << gb_per_sec << " GB/s, " << std::endl; - - if(verify) - compare(args.K, args.k_batch, c_m_n_dev_result, c_m_n_host_result); - c_m_n_dev_buf.SetZero(); - c_m_n_dev_result.SetZero(); + return std::make_tuple(name, avg_time); } - - static auto dispatch(ck_tile::DeviceMem& c_m_n_dev_buf, - ck_tile::HostTensor& c_m_n_host_result, - ck_tile::HostTensor& c_m_n_dev_result, - int verify, bool structured_sparsity, const KernelTraits &trait, ck_tile::GemmHostArgs& gemm_args, - const ck_tile::stream_config& stream) { + + + static auto dispatch(bool structured_sparsity, const KernelTraits& trait) { init(structured_sparsity); const std::string key = assemble_key(trait); - auto& kernel_map = get_kernel_map(); - if(auto it = kernel_map.find(key); it != kernel_map.end()) { - return it->second(c_m_n_dev_buf, c_m_n_host_result, c_m_n_dev_result, verify, gemm_args, stream); + auto& kernel_map = get_kernel_map(); + if(auto it = kernel_map.find(key); it != kernel_map.end()) + { + return it->second; } throw 
std::runtime_error("No suitable kernel found: " + key); } private: static std::string assemble_key(const KernelTraits &trait) { - return std::string(trait.pipeline) + "_" + - trait.epilogue + "_" + + return std::string(trait.pipeline) + "_" + + trait.epilogue + "_" + trait.scheduler + "_" + - "pad_" + - (trait.kPadM ? "true" : "false") + "_" + - (trait.kPadN ? "true" : "false") + "_" + - (trait.kPadK ? "true" : "false"); + (trait.pad_m ? "true" : "false") + "_" + + (trait.pad_n ? "true" : "false") + "_" + + (trait.pad_k ? "true" : "false"); } }; """ (self.output_dir / "gemm_dispatcher.hpp").write_text(content) - -def do_list_blobs(args, gemm_config): - generator = GemmCodeGenerator(args.working_path, gemm_config) - generator.list_all() -def do_gen_blobs(args, gemm_config): - generator = GemmCodeGenerator(args.working_path, gemm_config) - generator.generate_all() +def do_list_blobs(args: argparse.Namespace, + user_provide_config: Optional[GemmConfig] = None): + generator = GemmCodeGenerator(args.working_path, user_provide_config) + generator.list_all_trait_names() + + +def do_gen_blobs(args: argparse.Namespace, + user_provide_config: Optional[GemmConfig] = None): + generator = GemmCodeGenerator(args.working_path, user_provide_config) + generator.generate_all_instance_files() - def main(args): - # Read json file - with open(args.json, 'r') as json_file: - config_data = json.load(json_file) - - gemm_config = GemmConfig(config_data) + + gemm_config = GemmConfig.from_json( + args.config_json) if args.config_json is not None else args.config_json if args.list_blobs: do_list_blobs(args, gemm_config) elif args.gen_blobs: do_gen_blobs(args, gemm_config) else: - # If neither was specified, either do nothing or default to gen_blobs - print("No mode specified (use --list_blobs or --gen_blobs). Generating by default...") + logging.warning( + "No mode specified (use --list_blobs or --gen_blobs). 
Generating by default...") do_gen_blobs(args, gemm_config) - if __name__ == "__main__": @@ -696,18 +637,18 @@ if __name__ == "__main__": description="gen API for CK gemm kernel", ) parser.add_argument( - "-w", "--working_path", default="./", required=False, help="the path where all the blobs are going to be generated" + "-w", "--working_path", default="./", required=False, help="The path where all the blobs are going to be generated" ) parser.add_argument( - "-j", "--json", required=True, help="Path to the json which contains the kernel configurations" + "-j", "--config_json", required=False, help="Path to the json which contains the configurations that user provide" ) parser.add_argument( - "-l", "--list_blobs", action = 'store_true', help="List all kernel to file" + "-l", "--list_blobs", action='store_true', help="List all kernel instances to file" ) parser.add_argument( - "-g", "--gen_blobs", action = 'store_true', help="Generate all kernels into different files" + "-g", "--gen_blobs", action='store_true', help="Generate all kernel instances into different files" ) - + args = parser.parse_args() - + main(args) diff --git a/tile_engine/ops/gemm/gemm_profiler.hpp b/tile_engine/ops/gemm/gemm_profiler.hpp new file mode 100644 index 0000000000..9170952aa8 --- /dev/null +++ b/tile_engine/ops/gemm/gemm_profiler.hpp @@ -0,0 +1,260 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved. 
+ +#pragma once + +#include +#include +#include + +#include "ck_tile/host/device_prop.hpp" +#include "ck_tile/ops/gemm.hpp" +#include "benchmark_gemm.hpp" + +class GemmProfiler +{ + public: + static GemmProfiler& instance(Setting setting) + { + static GemmProfiler instance{setting}; + return instance; + } + + void benchmark(GemmProblem& gemm_problem, + std::vector( + ck_tile::GemmHostArgs&, const ck_tile::stream_config&)>>& callables) + { + const ALayout layout_a = ALayout{}; + const BLayout layout_b = BLayout{}; + const CLayout layout_c = CLayout{}; + + gemm_problem.stride_a_ = ck_tile::get_default_stride( + gemm_problem.m_, gemm_problem.k_, gemm_problem.stride_a_, is_row_major(layout_a)); + gemm_problem.stride_b_ = ck_tile::get_default_stride( + gemm_problem.k_, gemm_problem.n_, gemm_problem.stride_b_, is_row_major(layout_b)); + gemm_problem.stride_c_ = ck_tile::get_default_stride( + gemm_problem.m_, gemm_problem.n_, gemm_problem.stride_c_, is_row_major(layout_c)); + + ck_tile::HostTensor a_m_k(ck_tile::host_tensor_descriptor( + gemm_problem.m_, gemm_problem.k_, gemm_problem.stride_a_, is_row_major(layout_a))); + ck_tile::HostTensor b_k_n(ck_tile::host_tensor_descriptor( + gemm_problem.k_, gemm_problem.n_, gemm_problem.stride_b_, is_row_major(layout_b))); + ck_tile::HostTensor c_m_n_dev_result(ck_tile::host_tensor_descriptor( + gemm_problem.m_, gemm_problem.n_, gemm_problem.stride_c_, is_row_major(layout_c))); + + if(setting_.init_method_ == 0) + { + ck_tile::FillUniformDistribution{-1.f, 1.f}(a_m_k); + ck_tile::FillUniformDistribution{-1.f, 1.f}(b_k_n); + } + else if(setting_.init_method_ == 1) + { + ck_tile::FillMonotonicSeq{}(a_m_k); + ck_tile::FillMonotonicSeq{}(b_k_n); + } + else if(setting_.init_method_ == 2) + { + ck_tile::FillConstant{static_cast(1)}(a_m_k); + ck_tile::FillConstant{static_cast(1)}(b_k_n); + } + else + { + a_m_k.SetZero(); + b_k_n.SetZero(); + } + + if(gemm_problem.structured_sparsity_) + { + ck_tile::AdjustToStructuredSparsity{}(a_m_k); + 
} + + ck_tile::DeviceMem a_m_k_dev_buf(a_m_k.get_element_space_size_in_bytes()); + ck_tile::DeviceMem b_k_n_dev_buf(b_k_n.get_element_space_size_in_bytes()); + ck_tile::DeviceMem c_m_n_dev_buf(c_m_n_dev_result.get_element_space_size_in_bytes()); + + if constexpr(std::is_same_v) + { + // Permute vector pk_i4x4 data for device implementation + ck_tile::HostTensor b_k_n_dev = b_k_n; + // permute_tensor_b(b_k_n_dev); + permute_vectors_i4x4_b(b_k_n_dev); + b_k_n_dev_buf.ToDevice(b_k_n_dev.data()); + } + else + { + b_k_n_dev_buf.ToDevice(b_k_n.data()); + } + + a_m_k_dev_buf.ToDevice(a_m_k.data()); + c_m_n_dev_buf.SetZero(); + c_m_n_dev_result.SetZero(); + + ck_tile::GemmHostArgs gemm_args; + gemm_args.a_ptr = a_m_k_dev_buf.GetDeviceBuffer(); + gemm_args.b_ptr = b_k_n_dev_buf.GetDeviceBuffer(); + gemm_args.c_ptr = c_m_n_dev_buf.GetDeviceBuffer(); + gemm_args.k_batch = gemm_problem.split_k_; + gemm_args.M = gemm_problem.m_; + gemm_args.N = gemm_problem.n_; + gemm_args.K = gemm_problem.k_; + gemm_args.stride_A = gemm_problem.stride_a_; + gemm_args.stride_B = gemm_problem.stride_b_; + gemm_args.stride_C = gemm_problem.stride_c_; + + ck_tile::HostTensor c_m_n_host_result(ck_tile::host_tensor_descriptor( + gemm_problem.m_, gemm_problem.n_, gemm_problem.stride_c_, is_row_major(layout_c))); + + if(setting_.verify_) + { + gemm_host_reference(setting_.verify_, + a_m_k, + b_k_n, + c_m_n_host_result, + a_m_k_dev_buf, + b_k_n_dev_buf, + gemm_problem.m_, + gemm_problem.n_, + gemm_problem.k_, + gemm_problem.stride_a_, + gemm_problem.stride_b_, + gemm_problem.stride_c_); + } + + for(auto& callable : callables) + { + auto kernel_run_result = callable(gemm_args, + ck_tile::stream_config{nullptr, + true, + setting_.log_, + setting_.n_warmup_, + setting_.n_repeat_, + setting_.is_gpu_timer_}); + process_result(gemm_problem, + c_m_n_dev_buf, + c_m_n_host_result, + c_m_n_dev_result, + kernel_run_result); + } + } + + void process_result(const GemmProblem& gemm_problem, + ck_tile::DeviceMem& 
c_m_n_dev_buf, + ck_tile::HostTensor& c_m_n_host_result, + ck_tile::HostTensor& c_m_n_dev_result, + const std::tuple& kernel_run_result) + { + auto [name, avg_time] = kernel_run_result; + + KernelInstance kernel_instance{name, gemm_problem, {-1.0f, -1.0f, -1.0f}}; + + // compute performance metric + std::size_t flop = std::size_t(2) * gemm_problem.m_ * gemm_problem.n_ * gemm_problem.k_; + std::size_t num_byte = sizeof(ADataType) * gemm_problem.m_ * gemm_problem.k_ + + sizeof(BDataType) * gemm_problem.n_ * gemm_problem.k_ + + sizeof(CDataType) * gemm_problem.m_ * gemm_problem.n_; + + // update + kernel_instance.perf_result_.latency_ = avg_time; + kernel_instance.perf_result_.tflops_ = static_cast(flop) / 1.E9 / avg_time; + kernel_instance.perf_result_.bandwidth_ = num_byte / 1.E6 / avg_time; + + if(setting_.log_ > 0) + { + std::cout << kernel_instance << std::endl; + } + + // verify result + c_m_n_dev_buf.FromDevice(c_m_n_dev_result.data()); + bool verified_correct = + !setting_.verify_ || + compare(gemm_problem.k_, gemm_problem.split_k_, c_m_n_dev_result, c_m_n_host_result); + + if(verified_correct) + { + kernel_instances_.emplace_back(kernel_instance); + } + else + { + std::cout << "Verification failed, skip kernel: " << name << std::endl; + } + + // clear tensor + c_m_n_dev_buf.SetZero(); + c_m_n_dev_result.SetZero(); + } + + KernelInstance select_best_instance(Metric metric) + { + if(kernel_instances_.empty()) + throw std::runtime_error("Empty instances"); + + auto kernel_instance = *std::max_element(kernel_instances_.begin(), + kernel_instances_.end(), + [metric](const auto& a, const auto& b) { + return PerformanceResult::compare( + b.perf_result_, a.perf_result_, metric); + }); + + std::cout << "**********************************" << std::endl; + std::cout << "According to given metrics: " << get_metric_name(metric) << "\n" + << "The best kernel instance is: " << kernel_instance << std::endl; + std::cout << "**********************************" << std::endl; + 
+ if(!setting_.csv_filename_.empty()) + { + std::ofstream file(setting_.csv_filename_ + ".csv", std::ios::app); + + if(!file.is_open()) + { + std::cerr << "Warning: Failed to open CSV file for writing." << std::endl; + } + else + { + if(file.tellp() == 0) + { + file << "rocm_version,device_name," + << "split_k,m,n,k,stride_a,stride_b,stride_c," + << "dtype_a,dtype_b,dtype_acc,dtype_c," + << "layout_a,layout_b,layout_c," + << "structured_sparsity," + << "name," + << "latency(ms),tflops(TFlops),bandwidth(GB/s),metric\n"; + } + + const auto& problem = kernel_instance.problem_; + const auto& name = kernel_instance.name_; + const auto& perf = kernel_instance.perf_result_; + + file << get_rocm_version() << "," << ck_tile::get_device_name() << "," + << problem.split_k_ << "," << problem.m_ << "," << problem.n_ << "," + << problem.k_ << "," << problem.stride_a_ << "," << problem.stride_b_ << "," + << problem.stride_c_ << "," << problem.dtype_a_ << "," << problem.dtype_b_ + << "," << problem.dtype_acc_ << "," << problem.dtype_c_ << "," + << problem.layout_a_ << "," << problem.layout_b_ << "," << problem.layout_c_ + << "," << problem.structured_sparsity_ << "," << name << "," << std::fixed + << std::setprecision(4) << perf.latency_ << "," << std::fixed + << std::setprecision(4) << perf.tflops_ << "," << std::fixed + << std::setprecision(4) << perf.bandwidth_ << "," << get_metric_name(metric) + << "\n"; + + if(!file) + { + std::cerr << "Warning: Error occurred while writing to CSV file." 
<< std::endl; + } + } + } + + return kernel_instance; + } + + GemmProfiler(const GemmProfiler&) = delete; + GemmProfiler& operator=(const GemmProfiler&) = delete; + + private: + ~GemmProfiler() { kernel_instances_.clear(); } + GemmProfiler(Setting setting) : setting_(setting) {} + + Setting setting_; + + std::vector kernel_instances_; +}; diff --git a/tile_engine/ops/gemm/json_config.py b/tile_engine/ops/gemm/json_config.py new file mode 100644 index 0000000000..f6303ec9f8 --- /dev/null +++ b/tile_engine/ops/gemm/json_config.py @@ -0,0 +1,202 @@ +# SPDX-License-Identifier: MIT +# Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved. + +# -*- coding: utf-8 -*- + +""" +Handles loading, parsing, and validation of JSON configuration parameters. +""" + +from pathlib import Path +from dataclasses import dataclass +from typing import List, Optional, Union, Tuple, Type +import json + + +@dataclass +class EnumConfigParam: + """Represents an enumeration-type configuration parameter""" + values: List[Union[int, str, bool]] + + +@dataclass +class RangeConfigParam: + """Represents a numeric range-type configuration parameter""" + min: int + max: int + step: int + exclude: Optional[List[int]] + + def generate_candidates(self) -> List[int]: + """Generates valid candidates after applying range constraints""" + + if self.min > self.max: + raise ValueError( + f"Invalid range: min({self.min}) > max({self.max})" + ) + if self.step <= 0: + raise ValueError( + f"Step must be positive, got {self.step}" + ) + + candidates = list(range(self.min, self.max + 1, self.step)) + + if hasattr(self, 'exclude') and self.exclude: + if not isinstance(self.exclude, list): + raise TypeError("exclude must be list type") + exclude_set = set(self.exclude) + candidates = [x for x in candidates if x not in exclude_set] + + if not candidates: + raise ValueError( + f"No valid candidates for range [{self.min}-{self.max}] " + f"with step {self.step} and excludes {self.exclude}" + ) + + return 
candidates + + +@dataclass +class ProblemConfig: + """configuration class for problem parameter.""" + datatypes: Tuple[EnumConfigParam, ...] + layouts: Tuple[EnumConfigParam, ...] + + @property + def datatype_map(self) -> dict[str, str]: + """Get current layout selections as a key-value map.""" + return { + 'matrix_a': self.datatypes[0].values[0], + 'matrix_b': self.datatypes[1].values[0], + 'matrix_c': self.datatypes[2].values[0] + } + + @property + def layout_map(self) -> dict[str, str]: + """Get current layout selections as a key-value map.""" + return { + 'matrix_a': self.layouts[0].values[0], + 'matrix_b': self.layouts[1].values[0], + 'matrix_c': self.layouts[2].values[0] + } + + +@dataclass +class TileConfig: + """configuration class for tile parameter.""" + tile_m: Union[EnumConfigParam, RangeConfigParam] + tile_n: Union[EnumConfigParam, RangeConfigParam] + tile_k: Union[EnumConfigParam, RangeConfigParam] + + warp_m: Union[EnumConfigParam, RangeConfigParam] + warp_n: Union[EnumConfigParam, RangeConfigParam] + warp_k: Union[EnumConfigParam, RangeConfigParam] + + warp_tile_m: Union[EnumConfigParam, RangeConfigParam] + warp_tile_n: Union[EnumConfigParam, RangeConfigParam] + warp_tile_k: Union[EnumConfigParam, RangeConfigParam] + + +@dataclass +class TraitConfig: + """configuration class for kernel traits.""" + pipeline: EnumConfigParam + scheduler: EnumConfigParam + epilogue: EnumConfigParam + pad_m: EnumConfigParam + pad_n: EnumConfigParam + pad_k: EnumConfigParam + + +@dataclass +class GemmConfig: + """Main configuration class for GEMM operations """ + problem: ProblemConfig + tile_config: TileConfig + trait_config: TraitConfig + + @classmethod + def from_json(cls: Type["GemmConfig"], filepath: str) -> "GemmConfig": + """JSON configuration loader with validation controls""" + config_path = Path(filepath) + + try: + if not config_path.exists(): + raise FileNotFoundError(f"Config file {filepath} not found") + + with config_path.open('r') as f: + config_dict = 
json.load(f) + + # Parse problem config + problem = ProblemConfig( + datatypes=( + EnumConfigParam( + values=config_dict['problem']['datatype_a']['values']), + EnumConfigParam( + values=config_dict['problem']['datatype_b']['values']), + EnumConfigParam( + values=config_dict['problem']['datatype_c']['values']) + ), + layouts=( + EnumConfigParam( + values=config_dict['problem']['layout_a']['values']), + EnumConfigParam( + values=config_dict['problem']['layout_b']['values']), + EnumConfigParam( + values=config_dict['problem']['layout_c']['values']) + ) + ) + + # Parse tile config + def create_param(param_dict): + if 'values' in param_dict: + return EnumConfigParam(values=param_dict['values']) + else: + return RangeConfigParam( + min=param_dict['min'], + max=param_dict['max'], + step=param_dict['step'], + exclude=param_dict.get('exclude', []) + ) + + tile_config = TileConfig( + tile_m=create_param(config_dict['tile_config']['tile_m']), + tile_n=create_param(config_dict['tile_config']['tile_n']), + tile_k=create_param(config_dict['tile_config']['tile_k']), + warp_m=create_param(config_dict['tile_config']['warp_m']), + warp_n=create_param(config_dict['tile_config']['warp_n']), + warp_k=create_param(config_dict['tile_config']['warp_k']), + warp_tile_m=create_param( + config_dict['tile_config']['warp_tile_m']), + warp_tile_n=create_param( + config_dict['tile_config']['warp_tile_n']), + warp_tile_k=create_param( + config_dict['tile_config']['warp_tile_k']) + ) + + # Parse trait config + trait_config = TraitConfig( + pipeline=EnumConfigParam( + values=config_dict['trait_config']['pipeline']['values']), + scheduler=EnumConfigParam( + values=config_dict['trait_config']['scheduler']['values']), + epilogue=EnumConfigParam( + values=config_dict['trait_config']['epilogue']['values']), + pad_m=EnumConfigParam( + values=config_dict['trait_config']['pad_m']['values']), + pad_n=EnumConfigParam( + values=config_dict['trait_config']['pad_n']['values']), + pad_k=EnumConfigParam( + 
values=config_dict['trait_config']['pad_k']['values']) + ) + + return cls( + problem=problem, + tile_config=tile_config, + trait_config=trait_config + ) + + except json.JSONDecodeError as e: + raise ValueError(f"Invalid JSON format: {str(e)}") + except KeyError as e: + raise KeyError(f"Missing required configuration field: {str(e)}")