From 3791cfd71d0be05161737fbb351230bc27e7b657 Mon Sep 17 00:00:00 2001 From: Astha Date: Sun, 11 Jan 2026 23:15:28 -0500 Subject: [PATCH] Adding README back into the gemm directory and integrate new preshuffle functions --- Jenkinsfile | 6 +- test/ck_tile/gemm_tile_engine/CMakeLists.txt | 10 +- tile_engine/ops/common/benchmark_utils.py | 32 +- tile_engine/ops/common/utils.hpp | 83 ---- tile_engine/ops/gemm/README.md | 442 ++++++++++++++++++ tile_engine/ops/gemm/gemm_benchmark.hpp | 22 +- tile_engine/ops/gemm/gemm_benchmark.py | 23 +- tile_engine/ops/gemm/gemm_common.hpp | 96 ++++ .../gemm_multi_d/gemm_multi_d_benchmark.py | 10 +- .../gemm_multi_d_benchmark_single.cpp | 45 +- .../gemm_multi_d/gemm_multi_d_profiler.hpp | 22 +- .../gemm_preshuffle_benchmark.hpp | 17 + .../gemm_preshuffle_benchmark.py | 9 +- .../gemm_preshuffle_benchmark_single.cpp | 83 +--- .../gemm_preshuffle_common.hpp | 64 +-- .../gemm_preshuffle_profiler.hpp | 42 +- tile_engine/ops/gemm/gemm_profiler.hpp | 66 ++- .../gemm_universal_benchmark.py | 21 +- .../gemm_universal_benchmark_single.cpp | 1 + .../gemm_universal_profiler.hpp | 18 +- 20 files changed, 732 insertions(+), 380 deletions(-) create mode 100644 tile_engine/ops/gemm/README.md create mode 100644 tile_engine/ops/gemm/gemm_common.hpp diff --git a/Jenkinsfile b/Jenkinsfile index ca7c4f1d93..8bfee9fa21 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -1640,7 +1640,7 @@ pipeline { -D GEMM_PRESHUFFLE_LAYOUT="rcr" \ -D GEMM_PRESHUFFLE_CONFIG_FILE="default_ci_config.json" .. && \ ninja -j${nthreads()} benchmark_gemm_universal_all benchmark_gemm_preshuffle_all benchmark_gemm_multi_d_all && \ - python3 ../tile_engine/ops/gemm/gemm_universal/gemm_benchmark.py . --problem-sizes "1024,1024,1024" --warmup 5 --repeat 5 --verbose --json results.json && \ + python3 ../tile_engine/ops/gemm/gemm_universal/gemm_universal_benchmark.py . --problem-sizes "1024,1024,1024" --warmup 5 --repeat 5 --verbose --json results.json && \ python3 ../tile_engine/ops/gemm/gemm_preshuffle/gemm_preshuffle_benchmark.py . --problem-sizes "1024,1024,1024" --warmup 5 --repeat 5 --verbose --json results.json && \ python3 ../tile_engine/ops/gemm/gemm_multi_d/gemm_multi_d_benchmark.py . --problem-sizes "1024,1024,1024" --warmup 5 --repeat 5 --verbose --json results.json """ } @@ -1681,7 +1681,7 @@ pipeline { -D GEMM_PRESHUFFLE_DATATYPE="fp16;fp8;bf16;bf8" \ -D GEMM_PRESHUFFLE_LAYOUT="rcr" .. && \ ninja -j${nthreads()} benchmark_gemm_universal_all benchmark_gemm_preshuffle_all benchmark_gemm_multi_d_all benchmark_gemm_streamk_all && \ - python3 ../tile_engine/ops/gemm/gemm_universal/gemm_benchmark.py . --problem-sizes "1024,1024,1024" --warmup 5 --repeat 5 --verbose --json results.json && \ + python3 ../tile_engine/ops/gemm/gemm_universal/gemm_universal_benchmark.py . --problem-sizes "1024,1024,1024" --warmup 5 --repeat 5 --verbose --json results.json && \ python3 ../tile_engine/ops/gemm/gemm_preshuffle/gemm_preshuffle_benchmark.py . --problem-sizes "1024,1024,1024" --warmup 5 --repeat 5 --verbose --json results.json && \ python3 ../tile_engine/ops/gemm/gemm_multi_d/gemm_multi_d_benchmark.py . --problem-sizes "1024,1024,1024" --warmup 5 --repeat 5 --verbose --json results.json """ } @@ -1706,7 +1706,7 @@ pipeline { -D GEMM_UNIVERSAL_DATATYPE="fp16" \ -D GEMM_UNIVERSAL_LAYOUT="rcr;rrr;crr;ccr" .. && \ ninja -j${nthreads()} benchmark_gemm_universal_all && \ - python3 ../tile_engine/ops/gemm/gemm_universal/gemm_benchmark.py . --problem-sizes "1024,1024,1024" --warmup 5 --repeat 5 --verbose --json results.json """ + python3 ../tile_engine/ops/gemm/gemm_universal/gemm_universal_benchmark.py . --problem-sizes "1024,1024,1024" --warmup 5 --repeat 5 --verbose --json results.json """ } steps{ buildHipClangJobAndReboot(setup_args:setup_args, build_type: 'Release', execute_cmd: execute_args) diff --git a/test/ck_tile/gemm_tile_engine/CMakeLists.txt b/test/ck_tile/gemm_tile_engine/CMakeLists.txt index 33effcc120..dc148d45e7 100644 --- a/test/ck_tile/gemm_tile_engine/CMakeLists.txt +++ b/test/ck_tile/gemm_tile_engine/CMakeLists.txt @@ -10,7 +10,7 @@ # ============================================================================ # Locate tile_engine GEMM scripts directory -set(TILE_ENGINE_GEMM_DIR "${PROJECT_SOURCE_DIR}/tile_engine/ops/gemm") +set(TILE_ENGINE_GEMM_DIR "${PROJECT_SOURCE_DIR}/tile_engine/ops/gemm/gemm_universal") if(NOT EXISTS ${TILE_ENGINE_GEMM_DIR}) message(WARNING "Tile engine directory not found: ${TILE_ENGINE_GEMM_DIR}") @@ -32,11 +32,11 @@ endif() # config_json - Full path to JSON configuration file # ============================================================================ function(create_individual_gemm_test_target datatype layout config_name trait tile_config config_json) - set(target_name "test_gemm_tile_engine_${datatype}_${layout}_${config_name}_${trait}_${tile_config}") + set(target_name "test_gemm_universal_tile_engine_${datatype}_${layout}_${config_name}_${trait}_${tile_config}") set(working_path "${CMAKE_CURRENT_BINARY_DIR}/${datatype}/${layout}/${config_name}") # Generated header path (already created during cmake configuration) - set(test_header "${working_path}/gemm_single_${datatype}_${layout}_${trait}_${tile_config}.hpp") + set(test_header "${working_path}/gemm_universal_single_${datatype}_${layout}_${trait}_${tile_config}.hpp") set(test_params_header "${working_path}/test_params.hpp") # Verify header exists (should have been generated during cmake configuration) @@ -118,7 +118,7 @@ function(build_gemm_test_targets datatype layout config_name) # STEP 1: Discovery phase - list all valid kernel configurations execute_process( - COMMAND ${Python3_EXECUTABLE} -u ${TILE_ENGINE_GEMM_DIR}/gemm_instance_builder.py + COMMAND ${Python3_EXECUTABLE} -u ${TILE_ENGINE_GEMM_DIR}/gemm_universal_instance_builder.py --working_path ${working_path} --datatype ${datatype} --layout ${layout} @@ -178,7 +178,7 @@ function(build_gemm_test_targets datatype layout config_name) # Generate header using --gen_single execute_process( - COMMAND ${Python3_EXECUTABLE} -u ${TILE_ENGINE_GEMM_DIR}/gemm_instance_builder.py + COMMAND ${Python3_EXECUTABLE} -u ${TILE_ENGINE_GEMM_DIR}/gemm_universal_instance_builder.py --working_path ${working_path} --gpu_target "${GEMM_TEST_GPU_TARGETS}" --datatype ${datatype} diff --git a/tile_engine/ops/common/benchmark_utils.py b/tile_engine/ops/common/benchmark_utils.py index 0c158fa48c..f94bc4a969 100644 --- a/tile_engine/ops/common/benchmark_utils.py +++ b/tile_engine/ops/common/benchmark_utils.py @@ -2,17 +2,16 @@ # Copyright (c) Advanced Micro Devices, Inc., or its affiliates. # SPDX-License-Identifier: MIT -import sys import json import subprocess -import argparse import csv -import time from pathlib import Path -from typing import List, Dict, Tuple, Optional +from typing import List, Dict, Optional -def run_kernel(build_dir: Path, kernel_path: Path, params: Dict[str, str], verbose: bool = False) -> Optional[Dict]: +def run_kernel( + build_dir: Path, kernel_path: Path, params: Dict[str, str], verbose: bool = False +) -> Optional[Dict]: """Run a single kernel with given parameters and save output to individual JSON file""" # Create results directory results_dir = build_dir / "results" @@ -59,6 +58,7 @@ def run_kernel(build_dir: Path, kernel_path: Path, params: Dict[str, str], verbo print(f"Error running {kernel_path.name}: {e}") return None + def parse_json_file(json_file: Path, verbose: bool = False) -> Optional[Dict]: """Parse JSON data from individual kernel output file""" try: @@ -88,9 +88,8 @@ def parse_json_file(json_file: Path, verbose: bool = False) -> Optional[Dict]: print(f"Error reading JSON file {json_file}: {e}") return None -def find_best_kernel( - results: List[Dict], metric: str = "tflops" -) -> Optional[Dict]: + +def find_best_kernel(results: List[Dict], metric: str = "tflops") -> Optional[Dict]: """Find the best performing kernel based on metric""" if not results: return None @@ -126,7 +125,8 @@ def export_csv(results: List[Dict], filename: str, verbose: bool = False): print(f"Results exported to {filename}") -def export_best_kernels( best_kernels: Dict, filename: str, verbose: bool = False): + +def export_best_kernels(best_kernels: Dict, filename: str, verbose: bool = False): """Export best kernel selections to file""" with open(filename, "w") as f: f.write("# Best kernel selections\n") @@ -141,7 +141,10 @@ def export_best_kernels( best_kernels: Dict, filename: str, verbose: bool = Fals print(f"Best kernels exported to {filename}") -def export_json(results: List[Dict], filename: str, best_kernels: Dict = None, verbose: bool = False): + +def export_json( + results: List[Dict], filename: str, best_kernels: Dict = None, verbose: bool = False +): """Export all results and best kernels to JSON with comprehensive metadata""" from datetime import datetime @@ -223,9 +226,7 @@ def export_json(results: List[Dict], filename: str, best_kernels: Dict = None, v "benchmark_metadata": { "timestamp": datetime.now().isoformat(), "total_kernels_tested": len(results), - "unique_kernels": len( - set(r.get("name", "unknown") for r in results) - ), + "unique_kernels": len(set(r.get("name", "unknown") for r in results)), "successful_runs": len(successful_results), "failed_runs": len(results) - len(successful_results), }, @@ -265,9 +266,7 @@ def export_json(results: List[Dict], filename: str, best_kernels: Dict = None, v "by_scheduler": scheduler_stats, "by_data_type": data_type_stats, }, - "total_problem_configurations": len(best_kernels) - if best_kernels - else 0, + "total_problem_configurations": len(best_kernels) if best_kernels else 0, }, "kernel_results": results, "best_kernels_by_problem": best_kernels or {}, @@ -282,4 +281,3 @@ def export_json(results: List[Dict], filename: str, best_kernels: Dict = None, v print(f" - Best TFLOPS: {max(tflops_values, default=0):.2f}") print(f" - Best bandwidth: {max(bandwidth_values, default=0):.2f} GB/s") print(f" - Best latency: {min(latency_values, default=0):.2f}ms") - diff --git a/tile_engine/ops/common/utils.hpp b/tile_engine/ops/common/utils.hpp index 20994578e6..56bfbde5a0 100644 --- a/tile_engine/ops/common/utils.hpp +++ b/tile_engine/ops/common/utils.hpp @@ -20,89 +20,6 @@ constexpr auto is_row_major(Layout) return ck_tile::bool_constant>{}; } -// Structure to hold kernel traits for dispatcher -struct KernelTraits -{ - std::string pipeline; // compv3, compv4, mem - std::string scheduler; // intrawave, interwave - std::string epilogue; // cshuffle, default - bool pad_m; - bool pad_n; - bool pad_k; - bool persistent; - - // Constructor with defaults - KernelTraits() - : pipeline("compv3"), - scheduler("intrawave"), - epilogue("cshuffle"), - pad_m(false), - pad_n(false), - pad_k(false), - persistent(false) - { - } -}; - - -// Create argument parser -inline auto create_args(int argc, char* argv[]) -{ - ck_tile::ArgParser arg_parser; - arg_parser.insert("m", "3840", "The value for m dimension. Default is 3840.") - .insert("n", "4096", "The value for n dimension. Default is 4096.") - .insert("k", "2048", "The value for k dimension. Default is 2048.") - .insert("stride_a", "0", "The stride value for tensor A. Default is 0.") - .insert("stride_b", "0", "The stride value for tensor B. Default is 0.") - .insert("stride_ds", "0", "The stride value for tensor Ds . Default is 0.") - .insert("stride_c", "0", "The stride value for tensor C. Default is 0.") - .insert("split_k", "1", "The split value for k dimension. Default is 1.") - .insert("verify", - "2", - "The type of validation. Set to 0 for no validation, 1 for validation on CPU, or 2 " - "for validation on GPU. Default is 2, GPU validation.") - .insert("log", - "false", - "Whether output kernel instance information or not. Possible values are true or " - "false. Default is false") - .insert( - "warmup", "50", "The number of iterations before benchmark the kernel. Default is 50.") - .insert( - "repeat", "100", "The number of iterations to benchmark the kernel. Default is 100.") - .insert("timer", - "true", - "Whether if the timer is gpu timer or not. Possible values are false or true. " - "Default is true.") - .insert("init", - "0", - "The method of tensor initialization. Set to 0 for random, to 1 for linear, or 2 " - "for constant(1). Default is 0, random.") - .insert("flush_cache", - "true", - "To flush cache, possible values are true or false. " - "Default is false.") - .insert("rotating_count", "1000", "number of iterations to rotate the cache. default is 5.") - .insert("metric", - "0", - "Metric with which to measure kernel performance. Set to 0 for latency, 1 for " - "tflops, or 2 for bandwidth. Default is 0, latency.") - .insert("csv_filename", - "", - "The filename of benchmark result. Default is empty (no CSV output).") - .insert("structured_sparsity", - "false", - "Whether use sparsity kernel or not. Possible values are true or false. Default is " - "false") - .insert("json_output", - "false", - "Whether to output results in JSON format only. Possible values are true or false. " - "Default is " - "false"); - - bool result = arg_parser.parse(argc, argv); - return std::make_tuple(result, arg_parser); -} - enum class Metric { LATENCY = 0, diff --git a/tile_engine/ops/gemm/README.md b/tile_engine/ops/gemm/README.md new file mode 100644 index 0000000000..5e0bae7080 --- /dev/null +++ b/tile_engine/ops/gemm/README.md @@ -0,0 +1,442 @@ +# CK Tile Engine GEMM Operations + +## Overview + +The CK Tile Engine GEMM module provides a comprehensive system for generating, building, and benchmarking GEMM (General Matrix Multiplication) kernels with various configurations. It supports multiple data types, layouts, and optimization strategies. The system has evolved from a monolithic build approach (where all kernels compile into a single executable) to a more flexible individual kernel compilation system, providing better build parallelism and targeted testing capabilities. + +## Table of Contents + +1. [Build System Architecture](#build-system-architecture) +2. [Build Instructions](#build-instructions) +3. [Running Benchmarks](#running-benchmarks) +4. [Configuration System](#configuration-system) +5. [Scripts and Tools](#scripts-and-tools) +6. [Command Line Options](#command-line-options) +7. [Understanding Kernel Names](#understanding-kernel-names) +8. [Troubleshooting](#troubleshooting) +9. [Performance Tips](#performance-tips) + +## Build System Architecture + +### Individual Kernel Compilation (New Approach) + +The new tile engine benchmark system compiles each kernel configuration into a separate executable. This provides: +- Better build parallelism +- Faster incremental builds +- More targeted testing +- Easier debugging of specific configurations + +Each benchmark executable follows the naming pattern: +``` +benchmark_gemm____ +``` + +### Monolithic Build (Legacy Approach) + +The original system compiles all kernels into a single executable (`benchmark_gemm_[Datatype]_[Layout]`), which can then be filtered at runtime using command-line arguments. + +## Build Instructions + +### Prerequisites +- ROCm installation +- CMake 3.16 or higher +- C++17 compatible compiler + +### Basic Build + +```bash +# In the root of composable kernel, create build directory +mkdir build && cd build + +# Configure with specific datatypes and layouts +# Replace [Arch] with your GPU architecture (e.g., gfx90a, gfx942) +# Replace [Datatype1;Datatype2;...] with datatypes (fp8, bf8, int8, fp16, bf16, fp32, fp64) +# Replace [Layout1;Layout2;...] with layouts (rcr, rrr, crr, ccr) +../script/cmake-ck-dev.sh ../ [Arch] -DGEMM_DATATYPE="[Datatype1;Datatype2]" -DGEMM_LAYOUT="[Layout1;Layout2]" + +# Build specific benchmarks +make benchmark_gemm_[Datatype1]_[Layout1] -j +``` + +### Configuration Options + +The build system supports several configuration options: + +#### Using Custom Config Files +```bash +# Method 1: CMake variable (config file must be in configs/ directory) +cmake -DGEMM_CONFIG_FILE=my_custom_config.json ... + +# Method 2: Environment variable (takes precedence over CMake variable) +export GEMM_CONFIG_FILE=my_custom_config.json +cmake ... +``` + +#### Config File Priority Order +1. **Environment variable** `GEMM_CONFIG_FILE` (highest priority) +2. **CMake variable** `GEMM_CONFIG_FILE` +3. **Default config** (default_config.json for all layouts) + +**Note**: All custom config files must be placed in the `tile_engine/ops/gemm/configs/` directory. + +### Example Build Commands + +```bash +# Build for gfx942 with fp8 and fp16 datatypes, rcr layout +mkdir build && cd build +../script/cmake-ck-dev.sh ../ gfx942 -DGEMM_DATATYPE="fp8;fp16" -DGEMM_LAYOUT="rcr;ccr;rrr;crr" +make benchmark_gemm_universal_fp8_rcr -j +make benchmark_gemm_universal_fp16_rcr -j +``` + +### Building Individual Kernels + +```bash +# Build a specific kernel configuration +make benchmark_gemm_universal_fp8_rcr_compv4_default_intrawave_False_False_False_False_256x256x32_1x4x1_32x32x32 + +# Build all fp16 benchmarks in parallel +make -j$(nproc) $(make help | grep benchmark_gemm_fp16 | awk '{print $2}') +``` + +### Rebuilding After Configuration Changes + +If you modify the configuration file, you must rebuild: +```bash +rm -rf tile_engine/ && make benchmark_gemm_universal_[Datatype]_[Layout] -j +``` + +## Running Benchmarks + +### Individual Kernel Execution + +```bash +cd /path/to/build/directory +./bin/benchmark_gemm_universal_fp16_rcr_compv3_default_intrawave_False_False_False_False_256x128x32_4x1x1_32x32x16 \ + -m=512 -n=512 -k=512 -verify=1 +``` + +### Monolithic Executable (Legacy) + +```bash +# Run specific pipeline/scheduler/epilogue combination +./bin/benchmark_gemm_universal_[Datatype]_[Layout] -pipeline=compv3 -scheduler=intrawave -epilogue=default +``` + +### Automated Testing + +Use the provided test script to run multiple benchmarks: +```bash +cd /path/to/composable_kernel/tile_engine/ops/gemm +./test_benchmark.sh [build_directory] +``` + +## Configuration System + +### Configuration Files + +The system uses JSON configuration files to specify kernel parameters: + +- `configs/default_config.json` - Default configurations for various datatypes +- `configs/user_provided_config.json` - User-customizable configurations + +### Configuration Structure + +```json +{ + "tile_config": { + "tile_m": {"values": [256, 128]}, + "tile_n": {"values": [256, 128]}, + "tile_k": {"values": [64, 32]}, + "warp_m": {"values": [2, 4]}, + "warp_n": {"values": [2, 1]}, + "warp_k": {"values": [1]}, + "warp_tile_m": {"values": [32, 16]}, + "warp_tile_n": {"values": [32, 16]}, + "warp_tile_k": {"values": [16, 32]} + }, + "trait_config": { + "pipeline": {"values": ["compv3", "compv4", "mem"]}, + "scheduler": {"values": ["intrawave", "interwave"]}, + "epilogue": {"values": ["default", "cshuffle"]}, + "pad_m": {"values": [false]}, + "pad_n": {"values": [false]}, + "pad_k": {"values": [false]}, + "persistent": {"values": [false]} + } +} +``` + +## Scripts and Tools + +### Python Scripts + +#### gemm_universal_instance_builder.py +**Purpose**: Main kernel instance generation script that creates C++ kernel implementations based on configuration files. + +**Key Features**: +- Generates individual kernel header files for separate compilation +- Supports multiple data types (fp16, fp8, bf16, fp32, fp64) +- Validates tile configurations for correctness +- Creates CMake integration files + +**Usage**: +```bash +python gemm_universal_instance_builder.py \ + --working_path ./generated \ + --datatype fp16 \ + --layout rcr \ + --config_json configs/user_provided_config.json \ + --gen_all_individual +``` + +#### gemm_instance_builder_parallel.py +**Purpose**: Parallel version of the instance builder for faster generation of multiple kernel configurations. + +**Features**: +- Multi-threaded kernel generation +- Improved performance for large configuration spaces + +#### validation_utils.py +**Purpose**: Provides comprehensive validation functions for kernel configurations. + +**Key Functions**: +- `is_tile_config_valid()` - Validates tile dimensions and alignments +- `is_trait_combination_valid()` - Checks if pipeline/epilogue/scheduler combinations are supported +- `validate_warp_tile_combination()` - GPU-specific warp tile validation +- `validate_lds_capacity()` - Ensures configurations fit in LDS memory + +**Validation Checks**: +- Dimension alignment (tile dimensions must be divisible by warp dimensions) +- LDS capacity constraints +- GPU-specific warp tile support +- Unsupported trait combinations + +#### test_validation.py +**Purpose**: Test suite for the validation logic to ensure correctness. + +**Usage**: +```bash +python test_validation.py +``` + +**Tests**: +- Warp tile combination validation +- Trait combination validation +- Full tile configuration validation + +#### gemm_universal_benchmark.py +**Purpose**: Python script for running and analyzing GEMM benchmarks. + +**Features**: +- Automated benchmark execution +- Performance data collection +- Result analysis and reporting + +#### json_config.py +**Purpose**: Configuration file parsing and management. + +**Features**: +- JSON configuration loading +- Default configuration handling +- Configuration validation + +#### codegen_utils.py +**Purpose**: Utility functions for code generation. + +**Features**: +- Template processing +- Code formatting utilities +- File generation helpers + +### Shell Scripts + +#### test_benchmark.sh +**Purpose**: Automated benchmark testing script that finds and runs all built benchmark executables. + +**Features**: +- Automatic build directory detection +- Batch execution of multiple benchmarks +- CSV result collection +- Colored output for easy reading +- Example command generation + +**Usage**: +```bash +# Auto-detect build directory +./test_benchmark.sh + +# Specify build directory +./test_benchmark.sh /path/to/build/directory +``` + +**What it does**: +1. Finds all benchmark executables in the build directory +2. Runs each with multiple problem sizes (512, 1024, 2048) +3. Performs GPU verification +4. Saves results to timestamped CSV file +5. Provides summary statistics + +## Command Line Options + +All benchmark executables support the following options: + +### Matrix Dimensions +- `-m=` - M dimension (default: 3840) +- `-n=` - N dimension (default: 4096) +- `-k=` - K dimension (default: 2048) + +### Strides +- `-stride_a=` - Stride for matrix A (default: 0, auto-calculated) +- `-stride_b=` - Stride for matrix B (default: 0, auto-calculated) +- `-stride_c=` - Stride for matrix C (default: 0, auto-calculated) + +### Verification +- `-verify=<0|1|2>` - Verification mode + - 0: No verification (default) + - 1: CPU verification + - 2: GPU verification + +### Performance Testing +- `-warmup=` - Warmup iterations (default: 50) +- `-repeat=` - Benchmark iterations (default: 100) +- `-timer=` - Use GPU timer (default: true) +- `-flush_cache=` - Flush cache between runs (default: true) +- `-rotating_count=` - Cache rotation count (default: 1000) + +### Initialization +- `-init=<0|1|2>` - Tensor initialization method + - 0: Random values [-1, 1] (default) + - 1: Linear sequence (i % 17) + - 2: Constant value (1.0) + +### Output Options +- `-log=` - Enable verbose logging (default: false) +- `-metric=<0|1|2>` - Performance metric + - 0: Latency in ms (default) + - 1: TFLOPS + - 2: Bandwidth in GB/s +- `-json_output=` - JSON format output (default: false) +- `-csv_filename=` - Save results to CSV +- `-csv_format=` - CSV format (default: comprehensive) + +### Advanced Options +- `-split_k=` - Split-K factor (default: 1) +- `-structured_sparsity=` - Enable structured sparsity (default: false) +- `-pipeline=` - Pipeline type (default: compv3) +- `-scheduler=` - Scheduler type (default: intrawave) +- `-epilogue=` - Epilogue type (default: cshuffle) +- `-pad_m=` - Pad M dimension (default: false) +- `-pad_n=` - Pad N dimension (default: false) +- `-pad_k=` - Pad K dimension (default: false) +- `-persistent=` - Use persistent kernel (default: false) + +## Understanding Kernel Names + +The kernel naming convention encodes the configuration: + +``` +benchmark_gemm_universal_fp16_rcr_compv3_default_intrawave_False_False_False_False_256x128x32_4x1x1_32x32x16 + ^^^^ ^^^ ^^^^^^ ^^^^^^^ ^^^^^^^^^ ^^^^^^^^^^^^^^^^^^^^^^^ ^^^^^^^^^ ^^^^^^^ ^^^^^^^^^ + | | | | | | | | | + | | | | | Padding & flags | | Warp tile + | | | | Scheduler | Thread tile + | | | Epilogue Block tile + | | Pipeline + | Layout (Row-Column-Row) + Data type +``` + +### Components: +- **Data type**: fp16, fp32, bf16, fp8, bf8, int8 +- **Layout**: rcr (Row-Column-Row), rrr, crr, ccr +- **Pipeline**: mem, compv3, compv4 +- **Epilogue**: default, cshuffle +- **Scheduler**: intrawave, interwave +- **Flags**: pad_m, pad_n, pad_k, persistent (4 boolean flags) +- **Tile sizes**: BlockTile x ThreadTile x WarpTile + +## Troubleshooting + +### Common Issues + +1. **Kernel not found** + - Ensure the specific benchmark executable is built + - Check the build directory bin/ folder + +2. **Verification failures** + - Try GPU verification (-verify=2) which may be more accurate + - Check data type compatibility + - Verify stride calculations + +3. **Build failures** + - Check GPU architecture compatibility + - Ensure ROCm is properly installed + - Verify configuration file syntax + +4. **Performance variations** + - Increase warmup iterations + - Disable CPU frequency scaling + - Use GPU timer for accurate measurements + +### Debug Options + +Enable verbose logging: +```bash +./bin/benchmark_gemm_... -log=true -verify=1 +``` + +Test validation logic: +```bash +python test_validation.py +``` + +## Performance Tips + +1. **Optimal Problem Sizes**: Use sizes that are multiples of tile dimensions +2. **Warmup**: Use at least 50-100 warmup iterations +3. **GPU Timer**: Always use `-timer=true` for accurate measurements +4. **Cache Management**: Enable cache flushing for consistent results +5. **Thread Affinity**: Set CPU affinity to reduce variation + +## Integration Examples + +### Python Integration + +```python +import subprocess +import json + +# Run benchmark with JSON output +result = subprocess.run([ + './bin/benchmark_gemm_universal_fp16_rcr_...', + '-m=1024', '-n=1024', '-k=1024', + '-json_output=true' +], capture_output=True, text=True) + +# Parse results +data = json.loads(result.stdout) +print(f"Performance: {data['tflops']} TFLOPS") +``` + +### Batch Testing Script + +```bash +#!/bin/bash +SIZES="512 1024 2048 4096" +for size in $SIZES; do + echo "Testing ${size}x${size}x${size}" + ./bin/benchmark_gemm_... -m=$size -n=$size -k=$size \ + -verify=2 -csv_filename=results.csv +done +``` + +## Contributing + +When adding new features or configurations: +1. Update validation logic in `validation_utils.py` +2. Add tests to `test_validation.py` +3. Update configuration examples +4. Document new command-line options + +For more information about the Composable Kernel project, visit the main repository documentation. diff --git a/tile_engine/ops/gemm/gemm_benchmark.hpp b/tile_engine/ops/gemm/gemm_benchmark.hpp index 6ff09186a4..7439264a39 100644 --- a/tile_engine/ops/gemm/gemm_benchmark.hpp +++ b/tile_engine/ops/gemm/gemm_benchmark.hpp @@ -52,19 +52,27 @@ struct GemmProblem // Detect Problem::DsDataType, default to void when absent template -struct get_DsDataType { using type = void; }; +struct get_DsDataType +{ + using type = void; +}; template -struct get_DsDataType> { +struct get_DsDataType> +{ using type = typename T::DsDataType; }; // Detect Problem::D0DataType, default to void when absent template -struct get_D0DataType { using type = void; }; +struct get_D0DataType +{ + using type = void; +}; template -struct get_D0DataType> { +struct get_D0DataType> +{ using type = typename T::D0DataType; }; @@ -79,10 +87,10 @@ bool compare(std::string instanceName, using DDataType = typename get_D0DataType::type; const float max_accumulated_value = *std::max_element(c_m_n_host_result.mData.begin(), c_m_n_host_result.mData.end()); - //const auto rtol_atol = calculate_rtol_atol( - //K, kbatch, max_accumulated_value); + // const auto rtol_atol = calculate_rtol_atol( + // K, kbatch, max_accumulated_value); auto rtol_atol = [&] { - if constexpr (std::is_void_v) + if constexpr(std::is_void_v) { return calculate_rtol_atol( K, kbatch, max_accumulated_value); diff --git a/tile_engine/ops/gemm/gemm_benchmark.py b/tile_engine/ops/gemm/gemm_benchmark.py index c229b14233..b35390a1f9 100644 --- a/tile_engine/ops/gemm/gemm_benchmark.py +++ b/tile_engine/ops/gemm/gemm_benchmark.py @@ -3,15 +3,10 @@ # SPDX-License-Identifier: MIT import os -import sys -import json -import subprocess -import argparse -import csv -import time import importlib.util from pathlib import Path -from typing import List, Dict, Tuple, Optional +from typing import List, Dict, Tuple + # TODO: explore modularizing tile engine to avoid accessing imports like this def _import_benchmark_utils(): @@ -29,10 +24,14 @@ def _import_benchmark_utils(): return benchmark_utils + benchmark_utils = _import_benchmark_utils() + class GemmBenchmark: - def __init__(self, build_dir: str, verbose: bool = False, name: str = "benchmark_gemm_"): + def __init__( + self, build_dir: str, verbose: bool = False, name: str = "benchmark_gemm_" + ): self.build_dir = Path(build_dir) self.verbose = verbose self.results = [] @@ -57,7 +56,7 @@ class GemmBenchmark: """Extract comprehensive kernel information from filename""" name = kernel_path.stem if name.startswith(self.name): - args = name[len(self.name):] + args = name[len(self.name) :] else: args = name @@ -245,7 +244,9 @@ class GemmBenchmark: for kernel_path in kernels: kernel_info = self.extract_kernel_info(kernel_path) - result = benchmark_utils.run_kernel(self.build_dir, kernel_path, params, verbose=self.verbose) + result = benchmark_utils.run_kernel( + self.build_dir, kernel_path, params, verbose=self.verbose + ) if result: # Create new structured result format structured_result = { @@ -327,5 +328,3 @@ class GemmBenchmark: self.results = all_results return best_kernels - - diff --git a/tile_engine/ops/gemm/gemm_common.hpp b/tile_engine/ops/gemm/gemm_common.hpp new file mode 100644 index 0000000000..3a9aed2bc6 --- /dev/null +++ b/tile_engine/ops/gemm/gemm_common.hpp @@ -0,0 +1,96 @@ +// Copyright (c) Advanced Micro Devices, Inc., or its affiliates. +// SPDX-License-Identifier: MIT + +#pragma once +#include +#include +#include +#include +#include +#include +#include + +#include "ck_tile/core.hpp" +#include "ck_tile/host.hpp" + +// Structure to hold kernel traits for dispatcher +struct KernelTraits +{ + std::string pipeline; // compv3, compv4, mem + std::string scheduler; // intrawave, interwave + std::string epilogue; // cshuffle, default + bool pad_m; + bool pad_n; + bool pad_k; + bool persistent; + + // Constructor with defaults + KernelTraits() + : pipeline("compv3"), + scheduler("intrawave"), + epilogue("cshuffle"), + pad_m(false), + pad_n(false), + pad_k(false), + persistent(false) + { + } +}; + +// Create argument parser +inline auto create_args(int argc, char* argv[]) +{ + ck_tile::ArgParser arg_parser; + arg_parser.insert("m", "3840", "The value for m dimension. Default is 3840.") + .insert("n", "4096", "The value for n dimension. Default is 4096.") + .insert("k", "2048", "The value for k dimension. Default is 2048.") + .insert("stride_a", "0", "The stride value for tensor A. Default is 0.") + .insert("stride_b", "0", "The stride value for tensor B. Default is 0.") + .insert("stride_ds", "0", "The stride value for tensor Ds . Default is 0.") + .insert("stride_c", "0", "The stride value for tensor C. Default is 0.") + .insert("split_k", "1", "The split value for k dimension. Default is 1.") + .insert("verify", + "2", + "The type of validation. Set to 0 for no validation, 1 for validation on CPU, or 2 " + "for validation on GPU. Default is 2, GPU validation.") + .insert("log", + "false", + "Whether output kernel instance information or not. Possible values are true or " + "false. Default is false") + .insert( + "warmup", "50", "The number of iterations before benchmark the kernel. Default is 50.") + .insert( + "repeat", "100", "The number of iterations to benchmark the kernel. Default is 100.") + .insert("timer", + "true", + "Whether if the timer is gpu timer or not. Possible values are false or true. " + "Default is true.") + .insert("init", + "0", + "The method of tensor initialization. Set to 0 for random, to 1 for linear, or 2 " + "for constant(1). Default is 0, random.") + .insert("flush_cache", + "true", + "To flush cache, possible values are true or false. " + "Default is false.") + .insert("rotating_count", "1000", "number of iterations to rotate the cache. default is 5.") + .insert("metric", + "0", + "Metric with which to measure kernel performance. Set to 0 for latency, 1 for " + "tflops, or 2 for bandwidth. Default is 0, latency.") + .insert("csv_filename", + "", + "The filename of benchmark result. Default is empty (no CSV output).") + .insert("structured_sparsity", + "false", + "Whether use sparsity kernel or not. Possible values are true or false. Default is " + "false") + .insert("json_output", + "false", + "Whether to output results in JSON format only. Possible values are true or false. " + "Default is " + "false"); + + bool result = arg_parser.parse(argc, argv); + return std::make_tuple(result, arg_parser); +} diff --git a/tile_engine/ops/gemm/gemm_multi_d/gemm_multi_d_benchmark.py b/tile_engine/ops/gemm/gemm_multi_d/gemm_multi_d_benchmark.py index 2e313c3fed..d1fe7a91c7 100644 --- a/tile_engine/ops/gemm/gemm_multi_d/gemm_multi_d_benchmark.py +++ b/tile_engine/ops/gemm/gemm_multi_d/gemm_multi_d_benchmark.py @@ -3,14 +3,10 @@ import os import sys -import json -import subprocess import argparse -import csv import time import importlib.util -from pathlib import Path -from typing import List, Dict, Tuple, Optional + def _import_gemm_benchmark(): """Import validation utilities from commons directory.""" @@ -27,6 +23,7 @@ def _import_gemm_benchmark(): return gemm_benchmark_module.GemmBenchmark + def _import_benchmark_utils(): """Import benchmark utilities from commons directory.""" current_dir = os.path.dirname(os.path.abspath(__file__)) @@ -42,13 +39,16 @@ def _import_benchmark_utils(): return benchmark_utils + GemmBenchmark = _import_gemm_benchmark() benchmark_utils = _import_benchmark_utils() + class GemmMultiDBenchmark(GemmBenchmark): def __init__(self, build_dir: str, verbose: bool = False): super().__init__(build_dir, verbose, name="benchmark_gemm_multi_d_") + def main(): parser = argparse.ArgumentParser( description="GEMM Multi D Kernel Benchmarking Tool" diff --git a/tile_engine/ops/gemm/gemm_multi_d/gemm_multi_d_benchmark_single.cpp b/tile_engine/ops/gemm/gemm_multi_d/gemm_multi_d_benchmark_single.cpp index e72ceb0d76..767e8eda6e 100644 --- a/tile_engine/ops/gemm/gemm_multi_d/gemm_multi_d_benchmark_single.cpp +++ b/tile_engine/ops/gemm/gemm_multi_d/gemm_multi_d_benchmark_single.cpp @@ -11,6 +11,7 @@ #include "ck_tile/core.hpp" #include "ck_tile/host.hpp" +#include "gemm/gemm_common.hpp" #include "gemm_multi_d_profiler.hpp" // The kernel header is included via the compile command line with -include flag @@ -35,29 +36,27 @@ void benchmark_single(const ck_tile::ArgParser& arg_parser) std::string layout_d1 = D1Layout::name; // Create GemmMultiDProblem struct - GemmMultiDProblem gemm_multi_d_problem{ - GemmProblem{ - arg_parser.get_int("split_k"), - arg_parser.get_int("m"), - arg_parser.get_int("n"), - arg_parser.get_int("k"), - arg_parser.get_int("stride_a"), - arg_parser.get_int("stride_b"), - arg_parser.get_int("stride_c"), - dtype_a, - dtype_b, - dtype_acc, - dtype_c, - layout_a, - layout_b, - layout_c, - arg_parser.get_bool("structured_sparsity")}, - arg_parser.get_int("stride_ds"), - arg_parser.get_int("stride_ds"), - dtype_d0, - dtype_d1, - layout_d0, - layout_d1}; + GemmMultiDProblem gemm_multi_d_problem{GemmProblem{arg_parser.get_int("split_k"), + arg_parser.get_int("m"), + arg_parser.get_int("n"), + arg_parser.get_int("k"), + arg_parser.get_int("stride_a"), + arg_parser.get_int("stride_b"), + arg_parser.get_int("stride_c"), + dtype_a, + dtype_b, + dtype_acc, + dtype_c, + layout_a, + layout_b, + layout_c, + arg_parser.get_bool("structured_sparsity")}, + arg_parser.get_int("stride_ds"), + arg_parser.get_int("stride_ds"), + dtype_d0, + dtype_d1, + layout_d0, + layout_d1}; // Create Setting struct Setting setting{arg_parser.get_int("warmup"), diff --git a/tile_engine/ops/gemm/gemm_multi_d/gemm_multi_d_profiler.hpp b/tile_engine/ops/gemm/gemm_multi_d/gemm_multi_d_profiler.hpp index 583dfd85a7..aeac6c984d 100644 --- a/tile_engine/ops/gemm/gemm_multi_d/gemm_multi_d_profiler.hpp +++ b/tile_engine/ops/gemm/gemm_multi_d/gemm_multi_d_profiler.hpp @@ -11,25 +11,28 @@ #include #include - #include "ck_tile/host/device_prop.hpp" #include "ck_tile/ops/gemm.hpp" #include "gemm/gemm_profiler.hpp" +#include "common/utils.hpp" #include "gemm_multi_d_benchmark.hpp" -class GemmMultiDProfiler: public GemmProfiler> +class GemmMultiDProfiler : public GemmProfiler> { -public: + public: using BaseGemm = GemmProfiler>; + GemmMultiDProblem, + ck_tile::GemmMultiDHostArgs>; using BaseGemm::benchmark; GemmMultiDProfiler(Setting setting) - : GemmProfiler>(setting) {} - + : GemmProfiler>(setting) + { + } void benchmark( GemmMultiDProblem& gemm_multi_d_problem, @@ -157,5 +160,4 @@ public: kernel_run_result); } } - }; diff --git a/tile_engine/ops/gemm/gemm_preshuffle/gemm_preshuffle_benchmark.hpp b/tile_engine/ops/gemm/gemm_preshuffle/gemm_preshuffle_benchmark.hpp index 48f8a46ecb..817ff8e99e 100644 --- a/tile_engine/ops/gemm/gemm_preshuffle/gemm_preshuffle_benchmark.hpp +++ b/tile_engine/ops/gemm/gemm_preshuffle/gemm_preshuffle_benchmark.hpp @@ -8,6 +8,23 @@ #include "gemm_preshuffle_common.hpp" #include "gemm/gemm_benchmark.hpp" +struct KernelConfig +{ + ck_tile::index_t M_Tile; + ck_tile::index_t N_Tile; + ck_tile::index_t K_Tile; + + ck_tile::index_t M_Warp; + ck_tile::index_t N_Warp; + ck_tile::index_t K_Warp; + + ck_tile::index_t M_Warp_Tile; + ck_tile::index_t N_Warp_Tile; + ck_tile::index_t K_Warp_Tile; + + bool permuteN; +}; + /// @brief Function to get the kernel output with reference implementation on CPU/GPU void gemm_host_reference(int verify, ck_tile::HostTensor& a_m_k, diff --git a/tile_engine/ops/gemm/gemm_preshuffle/gemm_preshuffle_benchmark.py b/tile_engine/ops/gemm/gemm_preshuffle/gemm_preshuffle_benchmark.py index 935de186d6..f4ba383d73 100644 --- a/tile_engine/ops/gemm/gemm_preshuffle/gemm_preshuffle_benchmark.py +++ b/tile_engine/ops/gemm/gemm_preshuffle/gemm_preshuffle_benchmark.py @@ -3,14 +3,9 @@ import os import sys -import json -import subprocess import argparse -import csv import time import importlib.util -from pathlib import Path -from typing import List, Dict, Tuple, Optional def _import_gemm_benchmark(): @@ -28,6 +23,7 @@ def _import_gemm_benchmark(): return gemm_benchmark_module.GemmBenchmark + def _import_benchmark_utils(): """Import benchmark utilities from commons directory.""" current_dir = os.path.dirname(os.path.abspath(__file__)) @@ -43,13 +39,16 @@ def _import_benchmark_utils(): return benchmark_utils + GemmBenchmark = _import_gemm_benchmark() benchmark_utils = _import_benchmark_utils() + class GemmPreshuffleBenchmark(GemmBenchmark): def __init__(self, build_dir: str, verbose: bool = False): super().__init__(build_dir, verbose, name="benchmark_gemm_preshuffle_") + def main(): parser = argparse.ArgumentParser( description="GEMM Preshuffle Kernel Benchmarking Tool" diff --git a/tile_engine/ops/gemm/gemm_preshuffle/gemm_preshuffle_benchmark_single.cpp b/tile_engine/ops/gemm/gemm_preshuffle/gemm_preshuffle_benchmark_single.cpp index 4fbb25f0c9..d03b35f2b4 100644 --- a/tile_engine/ops/gemm/gemm_preshuffle/gemm_preshuffle_benchmark_single.cpp +++ b/tile_engine/ops/gemm/gemm_preshuffle/gemm_preshuffle_benchmark_single.cpp @@ -11,78 +11,21 @@ #include "ck_tile/core.hpp" #include "ck_tile/host.hpp" +#include "gemm/gemm_common.hpp" #include "gemm_preshuffle_profiler.hpp" #include "gemm_preshuffle_common.hpp" // The kernel header is included via the compile command line with -include flag // It defines SelectedKernel struct and KERNEL_NAME -// DataTypeTraits are now defined in gemm_common.hpp - -// Create argument parser -inline auto create_args(int argc, char* argv[]) -{ - ck_tile::ArgParser arg_parser; - arg_parser.insert("m", "3840", "The value for m dimension. Default is 3840.") - .insert("n", "4096", "The value for n dimension. Default is 4096.") - .insert("k", "2048", "The value for k dimension. Default is 2048.") - .insert("stride_a", "0", "The stride value for tensor A. Default is 0.") - .insert("stride_b", "0", "The stride value for tensor B. Default is 0.") - .insert("stride_c", "0", "The stride value for tensor C. Default is 0.") - .insert("split_k", "1", "The split value for k dimension. Default is 1.") - .insert("verify", - "2", - "The type of validation. Set to 0 for no validation, 1 for validation on CPU, or 2 " - "for validation on GPU. Default is 0, no validation.") - .insert("log", - "false", - "Whether output kernel instance information or not. Possible values are true or " - "false. Default is false") - .insert( - "warmup", "50", "The number of iterations before benchmark the kernel. Default is 50.") - .insert( - "repeat", "100", "The number of iterations to benchmark the kernel. Default is 100.") - .insert("timer", - "true", - "Whether if the timer is gpu timer or not. Possible values are false or true. " - "Default is true.") - .insert("init", - "0", - "The method of tensor initialization. Set to 0 for random, to 1 for linear, or 2 " - "for constant(1). Default is 0, random.") - .insert("flush_cache", - "true", - "To flush cache, possible values are true or false. " - "Default is false.") - .insert("rotating_count", "1000", "number of iterations to rotate the cache. default is 5.") - .insert("metric", - "0", - "Metric with which to measure kernel performance. Set to 0 for latency, 1 for " - "tflops, or 2 for bandwidth. Default is 0, latency.") - .insert("csv_filename", - "", - "The filename of benchmark result. Default is empty (no CSV output).") - .insert("structured_sparsity", - "false", - "Whether use sparsity kernel or not. Possible values are true or false. Default is " - "false") - .insert("json_output", - "false", - "Whether to output results in JSON format only. Possible values are true or false. " - "Default is " - "false"); - - bool result = arg_parser.parse(argc, argv); - return std::make_tuple(result, arg_parser); -} void benchmark_single(const ck_tile::ArgParser& arg_parser) { // Use DataTypeTraits to get the actual type names from the generated header // The generated header defines ADataType, BDataType, AccDataType, CDataType - std::string dtype_a = DataTypeTraits::name; - std::string dtype_b = DataTypeTraits::name; - std::string dtype_acc = DataTypeTraits::name; - std::string dtype_c = DataTypeTraits::name; + std::string dtype_a = ck_tile::DataTypeTraits::name; + std::string dtype_b = ck_tile::DataTypeTraits::name; + std::string dtype_acc = ck_tile::DataTypeTraits::name; + std::string dtype_c = ck_tile::DataTypeTraits::name; // Layout names from the layout types std::string layout_a = ALayout::name; @@ -119,29 +62,17 @@ void benchmark_single(const ck_tile::ArgParser& arg_parser) arg_parser.get_bool("json_output")}; // Get the profiler instance - auto& profiler = GemmProfiler::instance(setting); + auto& profiler = GemmPreshuffleProfiler::instance(setting); try { - // Create a lambda that wraps the kernel launch - std::tuple warp_tile_dims = std::make_tuple( - SelectedKernel::WarpTileM, SelectedKernel::WarpTileN, SelectedKernel::WarpTileK); - std::tuple tile_dims = - std::make_tuple(SelectedKernel::TileM, SelectedKernel::TileN, SelectedKernel::TileK); - std::tuple warp_dims = std::make_tuple(SelectedKernel::WarpPerBlock_M, - SelectedKernel::WarpPerBlock_N, - SelectedKernel::WarpPerBlock_K); - bool permuteN = SelectedKernel::PermuteN; - - KernelConfig config{tile_dims, warp_dims, warp_tile_dims, permuteN}; - auto kernel_func = [](const ck_tile::GemmHostArgs& args, const ck_tile::stream_config& stream) { return SelectedKernel::launch(args, stream); }; // Benchmark the kernel - profiler.benchmark(gemm_problem, kernel_func, config); + profiler.benchmark(gemm_problem, kernel_func); // Select best instance based on metric profiler.select_best_instance(static_cast(arg_parser.get_int("metric"))); diff --git a/tile_engine/ops/gemm/gemm_preshuffle/gemm_preshuffle_common.hpp b/tile_engine/ops/gemm/gemm_preshuffle/gemm_preshuffle_common.hpp index 7d3164f9d4..21cda28f75 100644 --- a/tile_engine/ops/gemm/gemm_preshuffle/gemm_preshuffle_common.hpp +++ b/tile_engine/ops/gemm/gemm_preshuffle/gemm_preshuffle_common.hpp @@ -8,35 +8,20 @@ #include "ck_tile/host.hpp" #include "ck_tile/core/numeric/integer.hpp" #include "ck_tile/core/numeric/pk_int4.hpp" +#include "gemm/gemm_common.hpp" // Structure to hold kernel traits for dispatcher -struct KernelTraits +struct PreshuffleKernelTraits : KernelTraits { - std::string pipeline; // preshufflev2 - std::string scheduler; // intrawave, interwave, default - std::string epilogue; // cshuffle, default - bool pad_m; - bool pad_n; - bool pad_k; - bool persistent; // Constructor with defaults - KernelTraits() - : pipeline("preshufflev2"), - scheduler("default"), - epilogue("default"), - pad_m(false), - pad_n(false), - pad_k(false), - persistent(false) - { - } + PreshuffleKernelTraits() : KernelTraits() { this->pipeline = "preshufflev2"; } }; // Helper to extract traits from kernel name -inline KernelTraits extract_traits_from_name(const std::string& kernel_name) +inline PreshuffleKernelTraits extract_traits_from_name(const std::string& kernel_name) { - KernelTraits traits; + PreshuffleKernelTraits traits; // Extract pipeline if(kernel_name.find("preshufflev2") != std::string::npos) @@ -74,42 +59,3 @@ inline KernelTraits extract_traits_from_name(const std::string& kernel_name) return traits; } - -template -auto shuffle_b(const ck_tile::HostTensor& t, - ck_tile::index_t N_Warp_Tile, - ck_tile::index_t K_Warp_Tile) -{ - assert(t.get_lengths().size() == 2); - int n_ = t.get_lengths()[1]; - int k_ = t.get_lengths()[0]; - int divisor = N_Warp_Tile == 32 ? 2 : 4; - ck_tile::HostTensor t_view( - {n_ / N_Warp_Tile, N_Warp_Tile, k_ / K_Warp_Tile, divisor, K_Warp_Tile / divisor}); - std::copy(t.begin(), t.end(), t_view.begin()); - return ck_tile::reference_permute(t_view, {0, 2, 3, 1, 4}); -} - -template -auto shuffle_b_permuteN(const ck_tile::HostTensor& t, - ck_tile::index_t N_Warp_Tile, - ck_tile::index_t K_Warp_Tile, - ck_tile::index_t N_Tile, - ck_tile::index_t N_Warp) -{ - assert(t.get_lengths().size() == 2); - - int n_ = t.get_lengths()[1]; - int k_ = t.get_lengths()[0]; - int divisor = N_Warp_Tile == 32 ? 2 : 4; - int NRepeat = N_Tile / N_Warp_Tile / N_Warp; - ck_tile::HostTensor t_view({n_ / N_Tile, - N_Warp, - N_Warp_Tile, - NRepeat, - k_ / K_Warp_Tile, - divisor, - K_Warp_Tile / divisor}); - std::copy(t.begin(), t.end(), t_view.begin()); - return ck_tile::reference_permute(t_view, {0, 3, 1, 4, 5, 2, 6}); -} diff --git a/tile_engine/ops/gemm/gemm_preshuffle/gemm_preshuffle_profiler.hpp b/tile_engine/ops/gemm/gemm_preshuffle/gemm_preshuffle_profiler.hpp index 4cf980dbf7..e7af073877 100644 --- a/tile_engine/ops/gemm/gemm_preshuffle/gemm_preshuffle_profiler.hpp +++ b/tile_engine/ops/gemm/gemm_preshuffle/gemm_preshuffle_profiler.hpp @@ -4,28 +4,26 @@ #pragma once #include "ck_tile/host/device_prop.hpp" +#include "ck_tile/host/tensor_shuffle_utils.hpp" #include "ck_tile/ops/gemm.hpp" #include "gemm/gemm_profiler.hpp" #include "gemm_preshuffle_benchmark.hpp" -class GemmPreshuffleProfiler : public GemmProfiler +class GemmPreshuffleProfiler + : public GemmProfiler { -public: - using BaseGemm = GemmProfiler; + public: + using BaseGemm = GemmProfiler; using BaseGemm::benchmark; GemmPreshuffleProfiler(Setting setting) - : GemmProfiler(setting) {} - + : GemmProfiler(setting) + { + } void benchmark(GemmProblem& gemm_problem, std::vector( - ck_tile::GemmHostArgs&, const ck_tile::stream_config&)>>& callables, - KernelConfig& config) override + ck_tile::GemmHostArgs&, const ck_tile::stream_config&)>>& callables) override { const ALayout layout_a = ALayout{}; const BLayout layout_b = BLayout{}; @@ -97,21 +95,28 @@ public: c_m_n_dev_buf.SetZero(); c_m_n_dev_result.SetZero(); + // Create a lambda that wraps the kernel launch + KernelConfig config{SelectedKernel::WarpTileM, + SelectedKernel::WarpTileN, + SelectedKernel::WarpTileK, + SelectedKernel::TileM, + SelectedKernel::TileN, + SelectedKernel::TileK, + SelectedKernel::WarpPerBlock_M, + SelectedKernel::WarpPerBlock_N, + SelectedKernel::WarpPerBlock_K, + SelectedKernel::PermuteN}; + for(const auto& callable : callables) { - ck_tile::index_t N_Warp_Tile = std::get<1>(config.warp_tile_dims); - ck_tile::index_t K_Warp_Tile = std::get<2>(config.warp_tile_dims); - ck_tile::index_t N_Tile = std::get<1>(config.tile_dims); - ck_tile::index_t N_Warp = std::get<1>(config.warp_dims); - ck_tile::HostTensor b_shuffle_host = [&]() { if(config.permuteN) { - return shuffle_b_permuteN(b_k_n, N_Warp_Tile, K_Warp_Tile, N_Tile, N_Warp); + return ck_tile::shuffle_b_permuteN(b_k_n, config); } else { - return shuffle_b(b_k_n, N_Warp_Tile, K_Warp_Tile); + return ck_tile::shuffle_b(b_k_n, config); } }(); @@ -144,5 +149,4 @@ public: gemm_problem, c_m_n_dev_buf, c_m_n_ref, c_m_n_dev_result, kernel_run_result); } } - }; diff --git a/tile_engine/ops/gemm/gemm_profiler.hpp b/tile_engine/ops/gemm/gemm_profiler.hpp index 4c9b706fe3..ab62b0616f 100644 --- a/tile_engine/ops/gemm/gemm_profiler.hpp +++ b/tile_engine/ops/gemm/gemm_profiler.hpp @@ -11,17 +11,14 @@ #include #include - #include "ck_tile/host/device_prop.hpp" #include "ck_tile/ops/gemm.hpp" #include "gemm_benchmark.hpp" -template +template class GemmProfiler { -public: + public: static Gemm& instance(Setting setting) { static Gemm instance{setting}; @@ -30,27 +27,25 @@ public: // Overload for single kernel benchmarking void benchmark(Problem& gemm_problem, - std::function - kernel_func) + std::function kernel_func) { // Create a vector with a single callable that returns both name and time - std::vector(GemmArgs&, - const ck_tile::stream_config&)>> + std::vector< + std::function(GemmArgs&, const ck_tile::stream_config&)>> callables; - callables.push_back( - [kernel_func](GemmArgs& args, const ck_tile::stream_config& stream) { - float time = kernel_func(args, stream); - return std::make_tuple(std::string(KERNEL_NAME), time); - }); + callables.push_back([kernel_func](GemmArgs& args, const ck_tile::stream_config& stream) { + float time = kernel_func(args, stream); + return std::make_tuple(std::string(KERNEL_NAME), time); + }); - benchmark(gemm_problem, callables); // TODO: need to cast this? + benchmark(gemm_problem, callables); } virtual void benchmark(Problem& gemm_problem, - std::vector( - GemmArgs&, const ck_tile::stream_config&)>>& callables) = 0; - + std::vector( + GemmArgs&, const ck_tile::stream_config&)>>& callables) = 0; + void process_result(const Problem& gemm_problem, ck_tile::DeviceMem& c_m_n_dev_buf, ck_tile::HostTensor& c_m_n_host_result, @@ -58,7 +53,7 @@ public: const std::tuple& kernel_run_result) { auto [name, avg_time] = kernel_run_result; - using DDataType = typename get_DsDataType::type; + using DDataType = typename get_DsDataType::type; KernelInstance kernel_instance{name, gemm_problem, {-1.0f, -1.0f, -1.0f}}; @@ -68,16 +63,14 @@ public: sizeof(BDataType) * gemm_problem.n_ * gemm_problem.k_ + sizeof(CDataType) * gemm_problem.m_ * gemm_problem.n_; - - if constexpr (!std::is_void_v) - { - ck_tile::static_for<0, DDataType::size(), 1>{}([&](auto i) { - using DType = ck_tile::remove_cvref_t>; - num_byte += sizeof(DType) * gemm_problem.m_ * gemm_problem.n_; - flop += sizeof(DType) * gemm_problem.m_ * gemm_problem.n_; - }); - } - + if constexpr(!std::is_void_v) + { + ck_tile::static_for<0, DDataType::size(), 1>{}([&](auto i) { + using DType = ck_tile::remove_cvref_t>; + num_byte += sizeof(DType) * gemm_problem.m_ * gemm_problem.n_; + flop += sizeof(DType) * gemm_problem.m_ * gemm_problem.n_; + }); + } // update kernel_instance.perf_result_.latency_ = avg_time; @@ -91,15 +84,14 @@ public: // verify result c_m_n_dev_buf.FromDevice(c_m_n_dev_result.data()); - int split_k = 1; - if constexpr (std::is_same_v) - { - split_k = gemm_problem.split_k_; - } + int split_k = 1; + if constexpr(std::is_same_v) + { + split_k = gemm_problem.split_k_; + } bool verified_correct = !setting_.verify_ || - compare( - name, gemm_problem.k_, split_k, c_m_n_dev_result, c_m_n_host_result); + compare(name, gemm_problem.k_, split_k, c_m_n_dev_result, c_m_n_host_result); if(verified_correct) { @@ -196,5 +188,3 @@ public: std::vector> kernel_instances_; }; - - diff --git a/tile_engine/ops/gemm/gemm_universal/gemm_universal_benchmark.py b/tile_engine/ops/gemm/gemm_universal/gemm_universal_benchmark.py index 4fa5d5dee0..008ffaa14f 100755 --- a/tile_engine/ops/gemm/gemm_universal/gemm_universal_benchmark.py +++ b/tile_engine/ops/gemm/gemm_universal/gemm_universal_benchmark.py @@ -4,14 +4,10 @@ import os import sys -import json -import subprocess import argparse -import csv import time import importlib.util -from pathlib import Path -from typing import List, Dict, Tuple, Optional + def _import_gemm_benchmark(): """Import validation utilities from commons directory.""" @@ -28,6 +24,7 @@ def _import_gemm_benchmark(): return gemm_benchmark_module.GemmBenchmark + def _import_benchmark_utils(): """Import benchmark utilities from commons directory.""" current_dir = os.path.dirname(os.path.abspath(__file__)) @@ -43,16 +40,20 @@ def _import_benchmark_utils(): return benchmark_utils + GemmBenchmark = _import_gemm_benchmark() benchmark_utils = _import_benchmark_utils() + class GemmUniversalBenchmark(GemmBenchmark): def __init__(self, build_dir: str, verbose: bool = False): - super().__init__(build_dir, verbose, name="benchmark_gemm_") + super().__init__(build_dir, verbose, name="benchmark_gemm_universal_") def main(): - parser = argparse.ArgumentParser(description="GEMM Kernel Benchmarking Tool") + parser = argparse.ArgumentParser( + description="Universal GEMM Kernel Benchmarking Tool" + ) parser.add_argument( "build_dir", help="Build directory containing kernel executables" ) @@ -67,7 +68,9 @@ def main(): ) parser.add_argument("--verify", action="store_true", help="Enable verification") parser.add_argument( - "--csv", default="gemm_benchmark_results.csv", help="CSV output filename" + "--csv", + default="gemm_universal_benchmark_results.csv", + help="CSV output filename", ) parser.add_argument( "--best", default="best_kernels.txt", help="Best kernels output filename" @@ -115,7 +118,7 @@ def main(): benchmark = GemmUniversalBenchmark(args.build_dir, verbose=args.verbose) # Run benchmark sweep - print("Starting GEMM kernel benchmark sweep...") + print("Starting Universal GEMM kernel benchmark sweep...") start_time = time.time() best_kernels = benchmark.benchmark_sweep( diff --git a/tile_engine/ops/gemm/gemm_universal/gemm_universal_benchmark_single.cpp b/tile_engine/ops/gemm/gemm_universal/gemm_universal_benchmark_single.cpp index bf81acabd2..b2015f8571 100644 --- a/tile_engine/ops/gemm/gemm_universal/gemm_universal_benchmark_single.cpp +++ b/tile_engine/ops/gemm/gemm_universal/gemm_universal_benchmark_single.cpp @@ -11,6 +11,7 @@ #include "ck_tile/core.hpp" #include "ck_tile/host.hpp" +#include "gemm/gemm_common.hpp" #include "gemm_universal_profiler.hpp" // The kernel header is included via the compile command line with -include flag diff --git a/tile_engine/ops/gemm/gemm_universal/gemm_universal_profiler.hpp b/tile_engine/ops/gemm/gemm_universal/gemm_universal_profiler.hpp index a1d749d678..6cfdcab800 100644 --- a/tile_engine/ops/gemm/gemm_universal/gemm_universal_profiler.hpp +++ b/tile_engine/ops/gemm/gemm_universal/gemm_universal_profiler.hpp @@ -9,21 +9,21 @@ #include "ck_tile/host/device_prop.hpp" #include "ck_tile/ops/gemm.hpp" -#include "gemm_universal_benchmark.hpp" +#include "gemm/gemm_benchmark.hpp" #include "gemm/gemm_profiler.hpp" +#include "gemm_universal_benchmark.hpp" -class UniversalGemmProfiler : public GemmProfiler +class UniversalGemmProfiler + : public GemmProfiler { -public: - using BaseGemm = GemmProfiler; + public: + using BaseGemm = GemmProfiler; using BaseGemm::benchmark; UniversalGemmProfiler(Setting setting) - : GemmProfiler(setting) {} + : GemmProfiler(setting) + { + } void benchmark(GemmProblem& gemm_problem, std::vector(