Adding README back into the gemm directory and integrate new preshuffle functions

This commit is contained in:
Astha
2026-01-11 23:15:28 -05:00
committed by Astha Rai
parent 523ec7c863
commit 3791cfd71d
20 changed files with 732 additions and 380 deletions

6
Jenkinsfile vendored
View File

@@ -1640,7 +1640,7 @@ pipeline {
-D GEMM_PRESHUFFLE_LAYOUT="rcr" \
-D GEMM_PRESHUFFLE_CONFIG_FILE="default_ci_config.json" .. && \
ninja -j${nthreads()} benchmark_gemm_universal_all benchmark_gemm_preshuffle_all benchmark_gemm_multi_d_all && \
python3 ../tile_engine/ops/gemm/gemm_universal/gemm_benchmark.py . --problem-sizes "1024,1024,1024" --warmup 5 --repeat 5 --verbose --json results.json && \
python3 ../tile_engine/ops/gemm/gemm_universal/gemm_universal_benchmark.py . --problem-sizes "1024,1024,1024" --warmup 5 --repeat 5 --verbose --json results.json && \
python3 ../tile_engine/ops/gemm/gemm_preshuffle/gemm_preshuffle_benchmark.py . --problem-sizes "1024,1024,1024" --warmup 5 --repeat 5 --verbose --json results.json && \
python3 ../tile_engine/ops/gemm/gemm_multi_d/gemm_multi_d_benchmark.py . --problem-sizes "1024,1024,1024" --warmup 5 --repeat 5 --verbose --json results.json """
}
@@ -1681,7 +1681,7 @@ pipeline {
-D GEMM_PRESHUFFLE_DATATYPE="fp16;fp8;bf16;bf8" \
-D GEMM_PRESHUFFLE_LAYOUT="rcr" .. && \
ninja -j${nthreads()} benchmark_gemm_universal_all benchmark_gemm_preshuffle_all benchmark_gemm_multi_d_all benchmark_gemm_streamk_all && \
python3 ../tile_engine/ops/gemm/gemm_universal/gemm_benchmark.py . --problem-sizes "1024,1024,1024" --warmup 5 --repeat 5 --verbose --json results.json && \
python3 ../tile_engine/ops/gemm/gemm_universal/gemm_universal_benchmark.py . --problem-sizes "1024,1024,1024" --warmup 5 --repeat 5 --verbose --json results.json && \
python3 ../tile_engine/ops/gemm/gemm_preshuffle/gemm_preshuffle_benchmark.py . --problem-sizes "1024,1024,1024" --warmup 5 --repeat 5 --verbose --json results.json && \
python3 ../tile_engine/ops/gemm/gemm_multi_d/gemm_multi_d_benchmark.py . --problem-sizes "1024,1024,1024" --warmup 5 --repeat 5 --verbose --json results.json """
}
@@ -1706,7 +1706,7 @@ pipeline {
-D GEMM_UNIVERSAL_DATATYPE="fp16" \
-D GEMM_UNIVERSAL_LAYOUT="rcr;rrr;crr;ccr" .. && \
ninja -j${nthreads()} benchmark_gemm_universal_all && \
python3 ../tile_engine/ops/gemm/gemm_universal/gemm_benchmark.py . --problem-sizes "1024,1024,1024" --warmup 5 --repeat 5 --verbose --json results.json """
python3 ../tile_engine/ops/gemm/gemm_universal/gemm_universal_benchmark.py . --problem-sizes "1024,1024,1024" --warmup 5 --repeat 5 --verbose --json results.json """
}
steps{
buildHipClangJobAndReboot(setup_args:setup_args, build_type: 'Release', execute_cmd: execute_args)

View File

@@ -10,7 +10,7 @@
# ============================================================================
# Locate tile_engine GEMM scripts directory
set(TILE_ENGINE_GEMM_DIR "${PROJECT_SOURCE_DIR}/tile_engine/ops/gemm")
set(TILE_ENGINE_GEMM_DIR "${PROJECT_SOURCE_DIR}/tile_engine/ops/gemm/gemm_universal")
if(NOT EXISTS ${TILE_ENGINE_GEMM_DIR})
message(WARNING "Tile engine directory not found: ${TILE_ENGINE_GEMM_DIR}")
@@ -32,11 +32,11 @@ endif()
# config_json - Full path to JSON configuration file
# ============================================================================
function(create_individual_gemm_test_target datatype layout config_name trait tile_config config_json)
set(target_name "test_gemm_tile_engine_${datatype}_${layout}_${config_name}_${trait}_${tile_config}")
set(target_name "test_gemm_universal_tile_engine_${datatype}_${layout}_${config_name}_${trait}_${tile_config}")
set(working_path "${CMAKE_CURRENT_BINARY_DIR}/${datatype}/${layout}/${config_name}")
# Generated header path (already created during cmake configuration)
set(test_header "${working_path}/gemm_single_${datatype}_${layout}_${trait}_${tile_config}.hpp")
set(test_header "${working_path}/gemm_universal_single_${datatype}_${layout}_${trait}_${tile_config}.hpp")
set(test_params_header "${working_path}/test_params.hpp")
# Verify header exists (should have been generated during cmake configuration)
@@ -118,7 +118,7 @@ function(build_gemm_test_targets datatype layout config_name)
# STEP 1: Discovery phase - list all valid kernel configurations
execute_process(
COMMAND ${Python3_EXECUTABLE} -u ${TILE_ENGINE_GEMM_DIR}/gemm_instance_builder.py
COMMAND ${Python3_EXECUTABLE} -u ${TILE_ENGINE_GEMM_DIR}/gemm_universal_instance_builder.py
--working_path ${working_path}
--datatype ${datatype}
--layout ${layout}
@@ -178,7 +178,7 @@ function(build_gemm_test_targets datatype layout config_name)
# Generate header using --gen_single
execute_process(
COMMAND ${Python3_EXECUTABLE} -u ${TILE_ENGINE_GEMM_DIR}/gemm_instance_builder.py
COMMAND ${Python3_EXECUTABLE} -u ${TILE_ENGINE_GEMM_DIR}/gemm_universal_instance_builder.py
--working_path ${working_path}
--gpu_target "${GEMM_TEST_GPU_TARGETS}"
--datatype ${datatype}

View File

@@ -2,17 +2,16 @@
# Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
# SPDX-License-Identifier: MIT
import sys
import json
import subprocess
import argparse
import csv
import time
from pathlib import Path
from typing import List, Dict, Tuple, Optional
from typing import List, Dict, Optional
def run_kernel(build_dir: Path, kernel_path: Path, params: Dict[str, str], verbose: bool = False) -> Optional[Dict]:
def run_kernel(
build_dir: Path, kernel_path: Path, params: Dict[str, str], verbose: bool = False
) -> Optional[Dict]:
"""Run a single kernel with given parameters and save output to individual JSON file"""
# Create results directory
results_dir = build_dir / "results"
@@ -59,6 +58,7 @@ def run_kernel(build_dir: Path, kernel_path: Path, params: Dict[str, str], verbo
print(f"Error running {kernel_path.name}: {e}")
return None
def parse_json_file(json_file: Path, verbose: bool = False) -> Optional[Dict]:
"""Parse JSON data from individual kernel output file"""
try:
@@ -88,9 +88,8 @@ def parse_json_file(json_file: Path, verbose: bool = False) -> Optional[Dict]:
print(f"Error reading JSON file {json_file}: {e}")
return None
def find_best_kernel(
results: List[Dict], metric: str = "tflops"
) -> Optional[Dict]:
def find_best_kernel(results: List[Dict], metric: str = "tflops") -> Optional[Dict]:
"""Find the best performing kernel based on metric"""
if not results:
return None
@@ -126,7 +125,8 @@ def export_csv(results: List[Dict], filename: str, verbose: bool = False):
print(f"Results exported to {filename}")
def export_best_kernels( best_kernels: Dict, filename: str, verbose: bool = False):
def export_best_kernels(best_kernels: Dict, filename: str, verbose: bool = False):
"""Export best kernel selections to file"""
with open(filename, "w") as f:
f.write("# Best kernel selections\n")
@@ -141,7 +141,10 @@ def export_best_kernels( best_kernels: Dict, filename: str, verbose: bool = Fals
print(f"Best kernels exported to {filename}")
def export_json(results: List[Dict], filename: str, best_kernels: Dict = None, verbose: bool = False):
def export_json(
results: List[Dict], filename: str, best_kernels: Dict = None, verbose: bool = False
):
"""Export all results and best kernels to JSON with comprehensive metadata"""
from datetime import datetime
@@ -223,9 +226,7 @@ def export_json(results: List[Dict], filename: str, best_kernels: Dict = None, v
"benchmark_metadata": {
"timestamp": datetime.now().isoformat(),
"total_kernels_tested": len(results),
"unique_kernels": len(
set(r.get("name", "unknown") for r in results)
),
"unique_kernels": len(set(r.get("name", "unknown") for r in results)),
"successful_runs": len(successful_results),
"failed_runs": len(results) - len(successful_results),
},
@@ -265,9 +266,7 @@ def export_json(results: List[Dict], filename: str, best_kernels: Dict = None, v
"by_scheduler": scheduler_stats,
"by_data_type": data_type_stats,
},
"total_problem_configurations": len(best_kernels)
if best_kernels
else 0,
"total_problem_configurations": len(best_kernels) if best_kernels else 0,
},
"kernel_results": results,
"best_kernels_by_problem": best_kernels or {},
@@ -282,4 +281,3 @@ def export_json(results: List[Dict], filename: str, best_kernels: Dict = None, v
print(f" - Best TFLOPS: {max(tflops_values, default=0):.2f}")
print(f" - Best bandwidth: {max(bandwidth_values, default=0):.2f} GB/s")
print(f" - Best latency: {min(latency_values, default=0):.2f}ms")

View File

@@ -20,89 +20,6 @@ constexpr auto is_row_major(Layout)
return ck_tile::bool_constant<std::is_same_v<Layout, ck_tile::tensor_layout::gemm::RowMajor>>{};
}
// Structure to hold kernel traits for dispatcher
struct KernelTraits
{
std::string pipeline; // compv3, compv4, mem
std::string scheduler; // intrawave, interwave
std::string epilogue; // cshuffle, default
bool pad_m;
bool pad_n;
bool pad_k;
bool persistent;
// Constructor with defaults
KernelTraits()
: pipeline("compv3"),
scheduler("intrawave"),
epilogue("cshuffle"),
pad_m(false),
pad_n(false),
pad_k(false),
persistent(false)
{
}
};
// Create argument parser
inline auto create_args(int argc, char* argv[])
{
ck_tile::ArgParser arg_parser;
arg_parser.insert("m", "3840", "The value for m dimension. Default is 3840.")
.insert("n", "4096", "The value for n dimension. Default is 4096.")
.insert("k", "2048", "The value for k dimension. Default is 2048.")
.insert("stride_a", "0", "The stride value for tensor A. Default is 0.")
.insert("stride_b", "0", "The stride value for tensor B. Default is 0.")
.insert("stride_ds", "0", "The stride value for tensor Ds . Default is 0.")
.insert("stride_c", "0", "The stride value for tensor C. Default is 0.")
.insert("split_k", "1", "The split value for k dimension. Default is 1.")
.insert("verify",
"2",
"The type of validation. Set to 0 for no validation, 1 for validation on CPU, or 2 "
"for validation on GPU. Default is 2, GPU validation.")
.insert("log",
"false",
"Whether output kernel instance information or not. Possible values are true or "
"false. Default is false")
.insert(
"warmup", "50", "The number of iterations before benchmark the kernel. Default is 50.")
.insert(
"repeat", "100", "The number of iterations to benchmark the kernel. Default is 100.")
.insert("timer",
"true",
"Whether if the timer is gpu timer or not. Possible values are false or true. "
"Default is true.")
.insert("init",
"0",
"The method of tensor initialization. Set to 0 for random, to 1 for linear, or 2 "
"for constant(1). Default is 0, random.")
.insert("flush_cache",
"true",
"To flush cache, possible values are true or false. "
"Default is false.")
.insert("rotating_count", "1000", "number of iterations to rotate the cache. default is 5.")
.insert("metric",
"0",
"Metric with which to measure kernel performance. Set to 0 for latency, 1 for "
"tflops, or 2 for bandwidth. Default is 0, latency.")
.insert("csv_filename",
"",
"The filename of benchmark result. Default is empty (no CSV output).")
.insert("structured_sparsity",
"false",
"Whether use sparsity kernel or not. Possible values are true or false. Default is "
"false")
.insert("json_output",
"false",
"Whether to output results in JSON format only. Possible values are true or false. "
"Default is "
"false");
bool result = arg_parser.parse(argc, argv);
return std::make_tuple(result, arg_parser);
}
enum class Metric
{
LATENCY = 0,

View File

@@ -0,0 +1,442 @@
# CK Tile Engine GEMM Operations
## Overview
The CK Tile Engine GEMM module provides a comprehensive system for generating, building, and benchmarking GEMM (General Matrix Multiplication) kernels with various configurations. It supports multiple data types, layouts, and optimization strategies. The system has evolved from a monolithic build approach (where all kernels compile into a single executable) to a more flexible individual kernel compilation system, providing better build parallelism and targeted testing capabilities.
## Table of Contents
1. [Build System Architecture](#build-system-architecture)
2. [Build Instructions](#build-instructions)
3. [Running Benchmarks](#running-benchmarks)
4. [Configuration System](#configuration-system)
5. [Scripts and Tools](#scripts-and-tools)
6. [Command Line Options](#command-line-options)
7. [Understanding Kernel Names](#understanding-kernel-names)
8. [Troubleshooting](#troubleshooting)
9. [Performance Tips](#performance-tips)
## Build System Architecture
### Individual Kernel Compilation (New Approach)
The new tile engine benchmark system compiles each kernel configuration into a separate executable. This provides:
- Better build parallelism
- Faster incremental builds
- More targeted testing
- Easier debugging of specific configurations
Each benchmark executable follows the naming pattern:
```
benchmark_gemm_<dtype>_<layout>_<config>_<tile_sizes>
```
### Monolithic Build (Legacy Approach)
The original system compiles all kernels into a single executable (`benchmark_gemm_[Datatype]_[Layout]`), which can then be filtered at runtime using command-line arguments.
## Build Instructions
### Prerequisites
- ROCm installation
- CMake 3.16 or higher
- C++17 compatible compiler
### Basic Build
```bash
# In the root of composable kernel, create build directory
mkdir build && cd build
# Configure with specific datatypes and layouts
# Replace [Arch] with your GPU architecture (e.g., gfx90a, gfx942)
# Replace [Datatype1;Datatype2;...] with datatypes (fp8, bf8, int8, fp16, bf16, fp32, fp64)
# Replace [Layout1;Layout2;...] with layouts (rcr, rrr, crr, ccr)
../script/cmake-ck-dev.sh ../ [Arch] -DGEMM_DATATYPE="[Datatype1;Datatype2]" -DGEMM_LAYOUT="[Layout1;Layout2]"
# Build specific benchmarks
make benchmark_gemm_[Datatype1]_[Layout1] -j
```
### Configuration Options
The build system supports several configuration options:
#### Using Custom Config Files
```bash
# Method 1: CMake variable (config file must be in configs/ directory)
cmake -DGEMM_CONFIG_FILE=my_custom_config.json ...
# Method 2: Environment variable (takes precedence over CMake variable)
export GEMM_CONFIG_FILE=my_custom_config.json
cmake ...
```
#### Config File Priority Order
1. **Environment variable** `GEMM_CONFIG_FILE` (highest priority)
2. **CMake variable** `GEMM_CONFIG_FILE`
3. **Default config** (default_config.json for all layouts)
**Note**: All custom config files must be placed in the `tile_engine/ops/gemm/configs/` directory.
### Example Build Commands
```bash
# Build for gfx942 with fp8 and fp16 datatypes, rcr layout
mkdir build && cd build
../script/cmake-ck-dev.sh ../ gfx942 -DGEMM_DATATYPE="fp8;fp16" -DGEMM_LAYOUT="rcr;ccr;rrr;crr"
make benchmark_gemm_universal_fp8_rcr -j
make benchmark_gemm_universal_fp16_rcr -j
```
### Building Individual Kernels
```bash
# Build a specific kernel configuration
make benchmark_gemm_universal_fp8_rcr_compv4_default_intrawave_False_False_False_False_256x256x32_1x4x1_32x32x32
# Build all fp16 benchmarks in parallel
make -j$(nproc) $(make help | grep benchmark_gemm_fp16 | awk '{print $2}')
```
### Rebuilding After Configuration Changes
If you modify the configuration file, you must rebuild:
```bash
rm -rf tile_engine/ && make benchmark_gemm_universal_[Datatype]_[Layout] -j
```
## Running Benchmarks
### Individual Kernel Execution
```bash
cd /path/to/build/directory
./bin/benchmark_gemm_universal_fp16_rcr_compv3_default_intrawave_False_False_False_False_256x128x32_4x1x1_32x32x16 \
-m=512 -n=512 -k=512 -verify=1
```
### Monolithic Executable (Legacy)
```bash
# Run specific pipeline/scheduler/epilogue combination
./bin/benchmark_gemm_universal_[Datatype]_[Layout] -pipeline=compv3 -scheduler=intrawave -epilogue=default
```
### Automated Testing
Use the provided test script to run multiple benchmarks:
```bash
cd /path/to/composable_kernel/tile_engine/ops/gemm
./test_benchmark.sh [build_directory]
```
## Configuration System
### Configuration Files
The system uses JSON configuration files to specify kernel parameters:
- `configs/default_config.json` - Default configurations for various datatypes
- `configs/user_provided_config.json` - User-customizable configurations
### Configuration Structure
```json
{
"tile_config": {
"tile_m": {"values": [256, 128]},
"tile_n": {"values": [256, 128]},
"tile_k": {"values": [64, 32]},
"warp_m": {"values": [2, 4]},
"warp_n": {"values": [2, 1]},
"warp_k": {"values": [1]},
"warp_tile_m": {"values": [32, 16]},
"warp_tile_n": {"values": [32, 16]},
"warp_tile_k": {"values": [16, 32]}
},
"trait_config": {
"pipeline": {"values": ["compv3", "compv4", "mem"]},
"scheduler": {"values": ["intrawave", "interwave"]},
"epilogue": {"values": ["default", "cshuffle"]},
"pad_m": {"values": [false]},
"pad_n": {"values": [false]},
"pad_k": {"values": [false]},
"persistent": {"values": [false]}
}
}
```
## Scripts and Tools
### Python Scripts
#### gemm_universal_instance_builder.py
**Purpose**: Main kernel instance generation script that creates C++ kernel implementations based on configuration files.
**Key Features**:
- Generates individual kernel header files for separate compilation
- Supports multiple data types (fp16, fp8, bf16, fp32, fp64)
- Validates tile configurations for correctness
- Creates CMake integration files
**Usage**:
```bash
python gemm_universal_instance_builder.py \
--working_path ./generated \
--datatype fp16 \
--layout rcr \
--config_json configs/user_provided_config.json \
--gen_all_individual
```
#### gemm_instance_builder_parallel.py
**Purpose**: Parallel version of the instance builder for faster generation of multiple kernel configurations.
**Features**:
- Multi-threaded kernel generation
- Improved performance for large configuration spaces
#### validation_utils.py
**Purpose**: Provides comprehensive validation functions for kernel configurations.
**Key Functions**:
- `is_tile_config_valid()` - Validates tile dimensions and alignments
- `is_trait_combination_valid()` - Checks if pipeline/epilogue/scheduler combinations are supported
- `validate_warp_tile_combination()` - GPU-specific warp tile validation
- `validate_lds_capacity()` - Ensures configurations fit in LDS memory
**Validation Checks**:
- Dimension alignment (tile dimensions must be divisible by warp dimensions)
- LDS capacity constraints
- GPU-specific warp tile support
- Unsupported trait combinations
#### test_validation.py
**Purpose**: Test suite for the validation logic to ensure correctness.
**Usage**:
```bash
python test_validation.py
```
**Tests**:
- Warp tile combination validation
- Trait combination validation
- Full tile configuration validation
#### gemm_universal_benchmark.py
**Purpose**: Python script for running and analyzing GEMM benchmarks.
**Features**:
- Automated benchmark execution
- Performance data collection
- Result analysis and reporting
#### json_config.py
**Purpose**: Configuration file parsing and management.
**Features**:
- JSON configuration loading
- Default configuration handling
- Configuration validation
#### codegen_utils.py
**Purpose**: Utility functions for code generation.
**Features**:
- Template processing
- Code formatting utilities
- File generation helpers
### Shell Scripts
#### test_benchmark.sh
**Purpose**: Automated benchmark testing script that finds and runs all built benchmark executables.
**Features**:
- Automatic build directory detection
- Batch execution of multiple benchmarks
- CSV result collection
- Colored output for easy reading
- Example command generation
**Usage**:
```bash
# Auto-detect build directory
./test_benchmark.sh
# Specify build directory
./test_benchmark.sh /path/to/build/directory
```
**What it does**:
1. Finds all benchmark executables in the build directory
2. Runs each with multiple problem sizes (512, 1024, 2048)
3. Performs GPU verification
4. Saves results to timestamped CSV file
5. Provides summary statistics
## Command Line Options
All benchmark executables support the following options:
### Matrix Dimensions
- `-m=<value>` - M dimension (default: 3840)
- `-n=<value>` - N dimension (default: 4096)
- `-k=<value>` - K dimension (default: 2048)
### Strides
- `-stride_a=<value>` - Stride for matrix A (default: 0, auto-calculated)
- `-stride_b=<value>` - Stride for matrix B (default: 0, auto-calculated)
- `-stride_c=<value>` - Stride for matrix C (default: 0, auto-calculated)
### Verification
- `-verify=<0|1|2>` - Verification mode
- 0: No verification (default)
- 1: CPU verification
- 2: GPU verification
### Performance Testing
- `-warmup=<value>` - Warmup iterations (default: 50)
- `-repeat=<value>` - Benchmark iterations (default: 100)
- `-timer=<true|false>` - Use GPU timer (default: true)
- `-flush_cache=<true|false>` - Flush cache between runs (default: true)
- `-rotating_count=<value>` - Cache rotation count (default: 1000)
### Initialization
- `-init=<0|1|2>` - Tensor initialization method
- 0: Random values [-1, 1] (default)
- 1: Linear sequence (i % 17)
- 2: Constant value (1.0)
### Output Options
- `-log=<true|false>` - Enable verbose logging (default: false)
- `-metric=<0|1|2>` - Performance metric
- 0: Latency in ms (default)
- 1: TFLOPS
- 2: Bandwidth in GB/s
- `-json_output=<true|false>` - JSON format output (default: false)
- `-csv_filename=<filename>` - Save results to CSV
- `-csv_format=<simple|comprehensive>` - CSV format (default: comprehensive)
### Advanced Options
- `-split_k=<value>` - Split-K factor (default: 1)
- `-structured_sparsity=<true|false>` - Enable structured sparsity (default: false)
- `-pipeline=<compv3|compv4|mem>` - Pipeline type (default: compv3)
- `-scheduler=<intrawave|interwave>` - Scheduler type (default: intrawave)
- `-epilogue=<cshuffle|default>` - Epilogue type (default: cshuffle)
- `-pad_m=<true|false>` - Pad M dimension (default: false)
- `-pad_n=<true|false>` - Pad N dimension (default: false)
- `-pad_k=<true|false>` - Pad K dimension (default: false)
- `-persistent=<true|false>` - Use persistent kernel (default: false)
## Understanding Kernel Names
The kernel naming convention encodes the configuration:
```
benchmark_gemm_universal_fp16_rcr_compv3_default_intrawave_False_False_False_False_256x128x32_4x1x1_32x32x16
^^^^ ^^^ ^^^^^^ ^^^^^^^ ^^^^^^^^^ ^^^^^^^^^^^^^^^^^^^^^^^ ^^^^^^^^^ ^^^^^^^ ^^^^^^^^^
| | | | | | | | |
| | | | | Padding & flags | | Warp tile
| | | | Scheduler | Thread tile
| | | Epilogue Block tile
| | Pipeline
| Layout (Row-Column-Row)
Data type
```
### Components:
- **Data type**: fp16, fp32, bf16, fp8, bf8, int8
- **Layout**: rcr (Row-Column-Row), rrr, crr, ccr
- **Pipeline**: mem, compv3, compv4
- **Epilogue**: default, cshuffle
- **Scheduler**: intrawave, interwave
- **Flags**: pad_m, pad_n, pad_k, persistent (4 boolean flags)
- **Tile sizes**: BlockTile x ThreadTile x WarpTile
## Troubleshooting
### Common Issues
1. **Kernel not found**
- Ensure the specific benchmark executable is built
- Check the build directory bin/ folder
2. **Verification failures**
- Try GPU verification (-verify=2) which may be more accurate
- Check data type compatibility
- Verify stride calculations
3. **Build failures**
- Check GPU architecture compatibility
- Ensure ROCm is properly installed
- Verify configuration file syntax
4. **Performance variations**
- Increase warmup iterations
- Disable CPU frequency scaling
- Use GPU timer for accurate measurements
### Debug Options
Enable verbose logging:
```bash
./bin/benchmark_gemm_... -log=true -verify=1
```
Test validation logic:
```bash
python test_validation.py
```
## Performance Tips
1. **Optimal Problem Sizes**: Use sizes that are multiples of tile dimensions
2. **Warmup**: Use at least 50-100 warmup iterations
3. **GPU Timer**: Always use `-timer=true` for accurate measurements
4. **Cache Management**: Enable cache flushing for consistent results
5. **Thread Affinity**: Set CPU affinity to reduce variation
## Integration Examples
### Python Integration
```python
import subprocess
import json
# Run benchmark with JSON output
result = subprocess.run([
'./bin/benchmark_gemm_universal_fp16_rcr_...',
'-m=1024', '-n=1024', '-k=1024',
'-json_output=true'
], capture_output=True, text=True)
# Parse results
data = json.loads(result.stdout)
print(f"Performance: {data['tflops']} TFLOPS")
```
### Batch Testing Script
```bash
#!/bin/bash
SIZES="512 1024 2048 4096"
for size in $SIZES; do
echo "Testing ${size}x${size}x${size}"
./bin/benchmark_gemm_... -m=$size -n=$size -k=$size \
-verify=2 -csv_filename=results.csv
done
```
## Contributing
When adding new features or configurations:
1. Update validation logic in `validation_utils.py`
2. Add tests to `test_validation.py`
3. Update configuration examples
4. Document new command-line options
For more information about the Composable Kernel project, visit the main repository documentation.

View File

@@ -52,19 +52,27 @@ struct GemmProblem
// Detect Problem::DsDataType, default to void when absent
template <class T, class = void>
struct get_DsDataType { using type = void; };
struct get_DsDataType
{
using type = void;
};
template <class T>
struct get_DsDataType<T, std::void_t<typename T::DsDataType>> {
struct get_DsDataType<T, std::void_t<typename T::DsDataType>>
{
using type = typename T::DsDataType;
};
// Detect Problem::D0DataType, default to void when absent
template <class T, class = void>
struct get_D0DataType { using type = void; };
struct get_D0DataType
{
using type = void;
};
template <class T>
struct get_D0DataType<T, std::void_t<typename T::D0DataType>> {
struct get_D0DataType<T, std::void_t<typename T::D0DataType>>
{
using type = typename T::D0DataType;
};
@@ -79,10 +87,10 @@ bool compare(std::string instanceName,
using DDataType = typename get_D0DataType<Problem>::type;
const float max_accumulated_value =
*std::max_element(c_m_n_host_result.mData.begin(), c_m_n_host_result.mData.end());
//const auto rtol_atol = calculate_rtol_atol<ADataType, BDataType, AccDataType, CDataType>(
//K, kbatch, max_accumulated_value);
// const auto rtol_atol = calculate_rtol_atol<ADataType, BDataType, AccDataType, CDataType>(
// K, kbatch, max_accumulated_value);
auto rtol_atol = [&] {
if constexpr (std::is_void_v<DDataType>)
if constexpr(std::is_void_v<DDataType>)
{
return calculate_rtol_atol<ADataType, BDataType, AccDataType, CDataType>(
K, kbatch, max_accumulated_value);

View File

@@ -3,15 +3,10 @@
# SPDX-License-Identifier: MIT
import os
import sys
import json
import subprocess
import argparse
import csv
import time
import importlib.util
from pathlib import Path
from typing import List, Dict, Tuple, Optional
from typing import List, Dict, Tuple
# TODO: explore modularizing tile engine to avoid accessing imports like this
def _import_benchmark_utils():
@@ -29,10 +24,14 @@ def _import_benchmark_utils():
return benchmark_utils
benchmark_utils = _import_benchmark_utils()
class GemmBenchmark:
def __init__(self, build_dir: str, verbose: bool = False, name: str = "benchmark_gemm_"):
def __init__(
self, build_dir: str, verbose: bool = False, name: str = "benchmark_gemm_"
):
self.build_dir = Path(build_dir)
self.verbose = verbose
self.results = []
@@ -57,7 +56,7 @@ class GemmBenchmark:
"""Extract comprehensive kernel information from filename"""
name = kernel_path.stem
if name.startswith(self.name):
args = name[len(self.name):]
args = name[len(self.name) :]
else:
args = name
@@ -245,7 +244,9 @@ class GemmBenchmark:
for kernel_path in kernels:
kernel_info = self.extract_kernel_info(kernel_path)
result = benchmark_utils.run_kernel(self.build_dir, kernel_path, params, verbose=self.verbose)
result = benchmark_utils.run_kernel(
self.build_dir, kernel_path, params, verbose=self.verbose
)
if result:
# Create new structured result format
structured_result = {
@@ -327,5 +328,3 @@ class GemmBenchmark:
self.results = all_results
return best_kernels

View File

@@ -0,0 +1,96 @@
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
// SPDX-License-Identifier: MIT
#pragma once
#include <iostream>
#include <functional>
#include <tuple>
#include <exception>
#include <sstream>
#include <vector>
#include <string>
#include "ck_tile/core.hpp"
#include "ck_tile/host.hpp"
// Structure to hold kernel traits for dispatcher
struct KernelTraits
{
std::string pipeline; // compv3, compv4, mem
std::string scheduler; // intrawave, interwave
std::string epilogue; // cshuffle, default
bool pad_m;
bool pad_n;
bool pad_k;
bool persistent;
// Constructor with defaults
KernelTraits()
: pipeline("compv3"),
scheduler("intrawave"),
epilogue("cshuffle"),
pad_m(false),
pad_n(false),
pad_k(false),
persistent(false)
{
}
};
// Create argument parser
inline auto create_args(int argc, char* argv[])
{
ck_tile::ArgParser arg_parser;
arg_parser.insert("m", "3840", "The value for m dimension. Default is 3840.")
.insert("n", "4096", "The value for n dimension. Default is 4096.")
.insert("k", "2048", "The value for k dimension. Default is 2048.")
.insert("stride_a", "0", "The stride value for tensor A. Default is 0.")
.insert("stride_b", "0", "The stride value for tensor B. Default is 0.")
.insert("stride_ds", "0", "The stride value for tensor Ds . Default is 0.")
.insert("stride_c", "0", "The stride value for tensor C. Default is 0.")
.insert("split_k", "1", "The split value for k dimension. Default is 1.")
.insert("verify",
"2",
"The type of validation. Set to 0 for no validation, 1 for validation on CPU, or 2 "
"for validation on GPU. Default is 2, GPU validation.")
.insert("log",
"false",
"Whether output kernel instance information or not. Possible values are true or "
"false. Default is false")
.insert(
"warmup", "50", "The number of iterations before benchmark the kernel. Default is 50.")
.insert(
"repeat", "100", "The number of iterations to benchmark the kernel. Default is 100.")
.insert("timer",
"true",
"Whether if the timer is gpu timer or not. Possible values are false or true. "
"Default is true.")
.insert("init",
"0",
"The method of tensor initialization. Set to 0 for random, to 1 for linear, or 2 "
"for constant(1). Default is 0, random.")
.insert("flush_cache",
"true",
"To flush cache, possible values are true or false. "
"Default is false.")
.insert("rotating_count", "1000", "number of iterations to rotate the cache. default is 5.")
.insert("metric",
"0",
"Metric with which to measure kernel performance. Set to 0 for latency, 1 for "
"tflops, or 2 for bandwidth. Default is 0, latency.")
.insert("csv_filename",
"",
"The filename of benchmark result. Default is empty (no CSV output).")
.insert("structured_sparsity",
"false",
"Whether use sparsity kernel or not. Possible values are true or false. Default is "
"false")
.insert("json_output",
"false",
"Whether to output results in JSON format only. Possible values are true or false. "
"Default is "
"false");
bool result = arg_parser.parse(argc, argv);
return std::make_tuple(result, arg_parser);
}

View File

@@ -3,14 +3,10 @@
import os
import sys
import json
import subprocess
import argparse
import csv
import time
import importlib.util
from pathlib import Path
from typing import List, Dict, Tuple, Optional
def _import_gemm_benchmark():
"""Import validation utilities from commons directory."""
@@ -27,6 +23,7 @@ def _import_gemm_benchmark():
return gemm_benchmark_module.GemmBenchmark
def _import_benchmark_utils():
"""Import benchmark utilities from commons directory."""
current_dir = os.path.dirname(os.path.abspath(__file__))
@@ -42,13 +39,16 @@ def _import_benchmark_utils():
return benchmark_utils
GemmBenchmark = _import_gemm_benchmark()
benchmark_utils = _import_benchmark_utils()
class GemmMultiDBenchmark(GemmBenchmark):
def __init__(self, build_dir: str, verbose: bool = False):
super().__init__(build_dir, verbose, name="benchmark_gemm_multi_d_")
def main():
parser = argparse.ArgumentParser(
description="GEMM Multi D Kernel Benchmarking Tool"

View File

@@ -11,6 +11,7 @@
#include "ck_tile/core.hpp"
#include "ck_tile/host.hpp"
#include "gemm/gemm_common.hpp"
#include "gemm_multi_d_profiler.hpp"
// The kernel header is included via the compile command line with -include flag
@@ -35,29 +36,27 @@ void benchmark_single(const ck_tile::ArgParser& arg_parser)
std::string layout_d1 = D1Layout::name;
// Create GemmMultiDProblem struct
GemmMultiDProblem gemm_multi_d_problem{
GemmProblem{
arg_parser.get_int("split_k"),
arg_parser.get_int("m"),
arg_parser.get_int("n"),
arg_parser.get_int("k"),
arg_parser.get_int("stride_a"),
arg_parser.get_int("stride_b"),
arg_parser.get_int("stride_c"),
dtype_a,
dtype_b,
dtype_acc,
dtype_c,
layout_a,
layout_b,
layout_c,
arg_parser.get_bool("structured_sparsity")},
arg_parser.get_int("stride_ds"),
arg_parser.get_int("stride_ds"),
dtype_d0,
dtype_d1,
layout_d0,
layout_d1};
GemmMultiDProblem gemm_multi_d_problem{GemmProblem{arg_parser.get_int("split_k"),
arg_parser.get_int("m"),
arg_parser.get_int("n"),
arg_parser.get_int("k"),
arg_parser.get_int("stride_a"),
arg_parser.get_int("stride_b"),
arg_parser.get_int("stride_c"),
dtype_a,
dtype_b,
dtype_acc,
dtype_c,
layout_a,
layout_b,
layout_c,
arg_parser.get_bool("structured_sparsity")},
arg_parser.get_int("stride_ds"),
arg_parser.get_int("stride_ds"),
dtype_d0,
dtype_d1,
layout_d0,
layout_d1};
// Create Setting struct
Setting setting{arg_parser.get_int("warmup"),

View File

@@ -11,25 +11,28 @@
#include <functional>
#include <tuple>
#include "ck_tile/host/device_prop.hpp"
#include "ck_tile/ops/gemm.hpp"
#include "gemm/gemm_profiler.hpp"
#include "common/utils.hpp"
#include "gemm_multi_d_benchmark.hpp"
class GemmMultiDProfiler: public GemmProfiler<GemmMultiDProfiler,
GemmMultiDProblem,
ck_tile::GemmMultiDHostArgs<DsDataType::size()>>
class GemmMultiDProfiler : public GemmProfiler<GemmMultiDProfiler,
GemmMultiDProblem,
ck_tile::GemmMultiDHostArgs<DsDataType::size()>>
{
public:
public:
using BaseGemm = GemmProfiler<GemmMultiDProfiler,
GemmMultiDProblem,
ck_tile::GemmMultiDHostArgs<DsDataType::size()>>;
GemmMultiDProblem,
ck_tile::GemmMultiDHostArgs<DsDataType::size()>>;
using BaseGemm::benchmark;
GemmMultiDProfiler(Setting setting)
: GemmProfiler<GemmMultiDProfiler, GemmMultiDProblem, ck_tile::GemmMultiDHostArgs<DsDataType::size()>>(setting) {}
: GemmProfiler<GemmMultiDProfiler,
GemmMultiDProblem,
ck_tile::GemmMultiDHostArgs<DsDataType::size()>>(setting)
{
}
void benchmark(
GemmMultiDProblem& gemm_multi_d_problem,
@@ -157,5 +160,4 @@ public:
kernel_run_result);
}
}
};

View File

@@ -8,6 +8,23 @@
#include "gemm_preshuffle_common.hpp"
#include "gemm/gemm_benchmark.hpp"
struct KernelConfig
{
ck_tile::index_t M_Tile;
ck_tile::index_t N_Tile;
ck_tile::index_t K_Tile;
ck_tile::index_t M_Warp;
ck_tile::index_t N_Warp;
ck_tile::index_t K_Warp;
ck_tile::index_t M_Warp_Tile;
ck_tile::index_t N_Warp_Tile;
ck_tile::index_t K_Warp_Tile;
bool permuteN;
};
/// @brief Function to get the kernel output with reference implementation on CPU/GPU
void gemm_host_reference(int verify,
ck_tile::HostTensor<ADataType>& a_m_k,

View File

@@ -3,14 +3,9 @@
import os
import sys
import json
import subprocess
import argparse
import csv
import time
import importlib.util
from pathlib import Path
from typing import List, Dict, Tuple, Optional
def _import_gemm_benchmark():
@@ -28,6 +23,7 @@ def _import_gemm_benchmark():
return gemm_benchmark_module.GemmBenchmark
def _import_benchmark_utils():
"""Import benchmark utilities from commons directory."""
current_dir = os.path.dirname(os.path.abspath(__file__))
@@ -43,13 +39,16 @@ def _import_benchmark_utils():
return benchmark_utils
GemmBenchmark = _import_gemm_benchmark()
benchmark_utils = _import_benchmark_utils()
class GemmPreshuffleBenchmark(GemmBenchmark):
def __init__(self, build_dir: str, verbose: bool = False):
super().__init__(build_dir, verbose, name="benchmark_gemm_preshuffle_")
def main():
parser = argparse.ArgumentParser(
description="GEMM Preshuffle Kernel Benchmarking Tool"

View File

@@ -11,78 +11,21 @@
#include "ck_tile/core.hpp"
#include "ck_tile/host.hpp"
#include "gemm/gemm_common.hpp"
#include "gemm_preshuffle_profiler.hpp"
#include "gemm_preshuffle_common.hpp"
// The kernel header is included via the compile command line with -include flag
// It defines SelectedKernel struct and KERNEL_NAME
// DataTypeTraits are now defined in gemm_common.hpp
// Create argument parser
inline auto create_args(int argc, char* argv[])
{
ck_tile::ArgParser arg_parser;
arg_parser.insert("m", "3840", "The value for m dimension. Default is 3840.")
.insert("n", "4096", "The value for n dimension. Default is 4096.")
.insert("k", "2048", "The value for k dimension. Default is 2048.")
.insert("stride_a", "0", "The stride value for tensor A. Default is 0.")
.insert("stride_b", "0", "The stride value for tensor B. Default is 0.")
.insert("stride_c", "0", "The stride value for tensor C. Default is 0.")
.insert("split_k", "1", "The split value for k dimension. Default is 1.")
.insert("verify",
"2",
"The type of validation. Set to 0 for no validation, 1 for validation on CPU, or 2 "
"for validation on GPU. Default is 0, no validation.")
.insert("log",
"false",
"Whether output kernel instance information or not. Possible values are true or "
"false. Default is false")
.insert(
"warmup", "50", "The number of iterations before benchmark the kernel. Default is 50.")
.insert(
"repeat", "100", "The number of iterations to benchmark the kernel. Default is 100.")
.insert("timer",
"true",
"Whether if the timer is gpu timer or not. Possible values are false or true. "
"Default is true.")
.insert("init",
"0",
"The method of tensor initialization. Set to 0 for random, to 1 for linear, or 2 "
"for constant(1). Default is 0, random.")
.insert("flush_cache",
"true",
"To flush cache, possible values are true or false. "
"Default is false.")
.insert("rotating_count", "1000", "number of iterations to rotate the cache. default is 5.")
.insert("metric",
"0",
"Metric with which to measure kernel performance. Set to 0 for latency, 1 for "
"tflops, or 2 for bandwidth. Default is 0, latency.")
.insert("csv_filename",
"",
"The filename of benchmark result. Default is empty (no CSV output).")
.insert("structured_sparsity",
"false",
"Whether use sparsity kernel or not. Possible values are true or false. Default is "
"false")
.insert("json_output",
"false",
"Whether to output results in JSON format only. Possible values are true or false. "
"Default is "
"false");
bool result = arg_parser.parse(argc, argv);
return std::make_tuple(result, arg_parser);
}
void benchmark_single(const ck_tile::ArgParser& arg_parser)
{
// Use DataTypeTraits to get the actual type names from the generated header
// The generated header defines ADataType, BDataType, AccDataType, CDataType
std::string dtype_a = DataTypeTraits<ADataType>::name;
std::string dtype_b = DataTypeTraits<BDataType>::name;
std::string dtype_acc = DataTypeTraits<AccDataType>::name;
std::string dtype_c = DataTypeTraits<CDataType>::name;
std::string dtype_a = ck_tile::DataTypeTraits<ADataType>::name;
std::string dtype_b = ck_tile::DataTypeTraits<BDataType>::name;
std::string dtype_acc = ck_tile::DataTypeTraits<AccDataType>::name;
std::string dtype_c = ck_tile::DataTypeTraits<CDataType>::name;
// Layout names from the layout types
std::string layout_a = ALayout::name;
@@ -119,29 +62,17 @@ void benchmark_single(const ck_tile::ArgParser& arg_parser)
arg_parser.get_bool("json_output")};
// Get the profiler instance
auto& profiler = GemmProfiler::instance(setting);
auto& profiler = GemmPreshuffleProfiler::instance(setting);
try
{
// Create a lambda that wraps the kernel launch
std::tuple<int, int, int> warp_tile_dims = std::make_tuple(
SelectedKernel::WarpTileM, SelectedKernel::WarpTileN, SelectedKernel::WarpTileK);
std::tuple<int, int, int> tile_dims =
std::make_tuple(SelectedKernel::TileM, SelectedKernel::TileN, SelectedKernel::TileK);
std::tuple<int, int, int> warp_dims = std::make_tuple(SelectedKernel::WarpPerBlock_M,
SelectedKernel::WarpPerBlock_N,
SelectedKernel::WarpPerBlock_K);
bool permuteN = SelectedKernel::PermuteN;
KernelConfig config{tile_dims, warp_dims, warp_tile_dims, permuteN};
auto kernel_func = [](const ck_tile::GemmHostArgs& args,
const ck_tile::stream_config& stream) {
return SelectedKernel::launch(args, stream);
};
// Benchmark the kernel
profiler.benchmark(gemm_problem, kernel_func, config);
profiler.benchmark(gemm_problem, kernel_func);
// Select best instance based on metric
profiler.select_best_instance(static_cast<Metric>(arg_parser.get_int("metric")));

View File

@@ -8,35 +8,20 @@
#include "ck_tile/host.hpp"
#include "ck_tile/core/numeric/integer.hpp"
#include "ck_tile/core/numeric/pk_int4.hpp"
#include "gemm/gemm_common.hpp"
// Structure to hold kernel traits for dispatcher
struct KernelTraits
struct PreshuffleKernelTraits : KernelTraits
{
std::string pipeline; // preshufflev2
std::string scheduler; // intrawave, interwave, default
std::string epilogue; // cshuffle, default
bool pad_m;
bool pad_n;
bool pad_k;
bool persistent;
// Constructor with defaults
KernelTraits()
: pipeline("preshufflev2"),
scheduler("default"),
epilogue("default"),
pad_m(false),
pad_n(false),
pad_k(false),
persistent(false)
{
}
PreshuffleKernelTraits() : KernelTraits() { this->pipeline = "preshufflev2"; }
};
// Helper to extract traits from kernel name
inline KernelTraits extract_traits_from_name(const std::string& kernel_name)
inline PreshuffleKernelTraits extract_traits_from_name(const std::string& kernel_name)
{
KernelTraits traits;
PreshuffleKernelTraits traits;
// Extract pipeline
if(kernel_name.find("preshufflev2") != std::string::npos)
@@ -74,42 +59,3 @@ inline KernelTraits extract_traits_from_name(const std::string& kernel_name)
return traits;
}
template <typename T>
auto shuffle_b(const ck_tile::HostTensor<T>& t,
ck_tile::index_t N_Warp_Tile,
ck_tile::index_t K_Warp_Tile)
{
assert(t.get_lengths().size() == 2);
int n_ = t.get_lengths()[1];
int k_ = t.get_lengths()[0];
int divisor = N_Warp_Tile == 32 ? 2 : 4;
ck_tile::HostTensor<T> t_view(
{n_ / N_Warp_Tile, N_Warp_Tile, k_ / K_Warp_Tile, divisor, K_Warp_Tile / divisor});
std::copy(t.begin(), t.end(), t_view.begin());
return ck_tile::reference_permute(t_view, {0, 2, 3, 1, 4});
}
template <typename T>
auto shuffle_b_permuteN(const ck_tile::HostTensor<T>& t,
ck_tile::index_t N_Warp_Tile,
ck_tile::index_t K_Warp_Tile,
ck_tile::index_t N_Tile,
ck_tile::index_t N_Warp)
{
assert(t.get_lengths().size() == 2);
int n_ = t.get_lengths()[1];
int k_ = t.get_lengths()[0];
int divisor = N_Warp_Tile == 32 ? 2 : 4;
int NRepeat = N_Tile / N_Warp_Tile / N_Warp;
ck_tile::HostTensor<T> t_view({n_ / N_Tile,
N_Warp,
N_Warp_Tile,
NRepeat,
k_ / K_Warp_Tile,
divisor,
K_Warp_Tile / divisor});
std::copy(t.begin(), t.end(), t_view.begin());
return ck_tile::reference_permute(t_view, {0, 3, 1, 4, 5, 2, 6});
}

View File

@@ -4,28 +4,26 @@
#pragma once
#include "ck_tile/host/device_prop.hpp"
#include "ck_tile/host/tensor_shuffle_utils.hpp"
#include "ck_tile/ops/gemm.hpp"
#include "gemm/gemm_profiler.hpp"
#include "gemm_preshuffle_benchmark.hpp"
class GemmPreshuffleProfiler : public GemmProfiler<GemmPreshuffleProfiler,
GemmProblem,
ck_tile::GemmHostArgs>
class GemmPreshuffleProfiler
: public GemmProfiler<GemmPreshuffleProfiler, GemmProblem, ck_tile::GemmHostArgs>
{
public:
using BaseGemm = GemmProfiler<GemmPreshuffleProfiler,
GemmProblem,
ck_tile::GemmHostArgs>;
public:
using BaseGemm = GemmProfiler<GemmPreshuffleProfiler, GemmProblem, ck_tile::GemmHostArgs>;
using BaseGemm::benchmark;
GemmPreshuffleProfiler(Setting setting)
: GemmProfiler<GemmPreshuffleProfiler, GemmProblem, ck_tile::GemmHostArgs>(setting) {}
: GemmProfiler<GemmPreshuffleProfiler, GemmProblem, ck_tile::GemmHostArgs>(setting)
{
}
void benchmark(GemmProblem& gemm_problem,
std::vector<std::function<std::tuple<std::string, float>(
ck_tile::GemmHostArgs&, const ck_tile::stream_config&)>>& callables,
KernelConfig& config) override
ck_tile::GemmHostArgs&, const ck_tile::stream_config&)>>& callables) override
{
const ALayout layout_a = ALayout{};
const BLayout layout_b = BLayout{};
@@ -97,21 +95,28 @@ public:
c_m_n_dev_buf.SetZero();
c_m_n_dev_result.SetZero();
// Create a lambda that wraps the kernel launch
KernelConfig config{SelectedKernel::WarpTileM,
SelectedKernel::WarpTileN,
SelectedKernel::WarpTileK,
SelectedKernel::TileM,
SelectedKernel::TileN,
SelectedKernel::TileK,
SelectedKernel::WarpPerBlock_M,
SelectedKernel::WarpPerBlock_N,
SelectedKernel::WarpPerBlock_K,
SelectedKernel::PermuteN};
for(const auto& callable : callables)
{
ck_tile::index_t N_Warp_Tile = std::get<1>(config.warp_tile_dims);
ck_tile::index_t K_Warp_Tile = std::get<2>(config.warp_tile_dims);
ck_tile::index_t N_Tile = std::get<1>(config.tile_dims);
ck_tile::index_t N_Warp = std::get<1>(config.warp_dims);
ck_tile::HostTensor<BDataType> b_shuffle_host = [&]() {
if(config.permuteN)
{
return shuffle_b_permuteN(b_k_n, N_Warp_Tile, K_Warp_Tile, N_Tile, N_Warp);
return ck_tile::shuffle_b_permuteN<KernelConfig>(b_k_n, config);
}
else
{
return shuffle_b(b_k_n, N_Warp_Tile, K_Warp_Tile);
return ck_tile::shuffle_b<KernelConfig>(b_k_n, config);
}
}();
@@ -144,5 +149,4 @@ public:
gemm_problem, c_m_n_dev_buf, c_m_n_ref, c_m_n_dev_result, kernel_run_result);
}
}
};

View File

@@ -11,17 +11,14 @@
#include <functional>
#include <tuple>
#include "ck_tile/host/device_prop.hpp"
#include "ck_tile/ops/gemm.hpp"
#include "gemm_benchmark.hpp"
template <typename Gemm,
typename Problem,
typename GemmArgs>
template <typename Gemm, typename Problem, typename GemmArgs>
class GemmProfiler
{
public:
public:
static Gemm& instance(Setting setting)
{
static Gemm instance{setting};
@@ -30,27 +27,25 @@ public:
// Overload for single kernel benchmarking
void benchmark(Problem& gemm_problem,
std::function<float(const GemmArgs&, const ck_tile::stream_config&)>
kernel_func)
std::function<float(const GemmArgs&, const ck_tile::stream_config&)> kernel_func)
{
// Create a vector with a single callable that returns both name and time
std::vector<std::function<std::tuple<std::string, float>(GemmArgs&,
const ck_tile::stream_config&)>>
std::vector<
std::function<std::tuple<std::string, float>(GemmArgs&, const ck_tile::stream_config&)>>
callables;
callables.push_back(
[kernel_func](GemmArgs& args, const ck_tile::stream_config& stream) {
float time = kernel_func(args, stream);
return std::make_tuple(std::string(KERNEL_NAME), time);
});
callables.push_back([kernel_func](GemmArgs& args, const ck_tile::stream_config& stream) {
float time = kernel_func(args, stream);
return std::make_tuple(std::string(KERNEL_NAME), time);
});
benchmark(gemm_problem, callables); // TODO: need to cast this?
benchmark(gemm_problem, callables);
}
virtual void benchmark(Problem& gemm_problem,
std::vector<std::function<std::tuple<std::string, float>(
GemmArgs&, const ck_tile::stream_config&)>>& callables) = 0;
std::vector<std::function<std::tuple<std::string, float>(
GemmArgs&, const ck_tile::stream_config&)>>& callables) = 0;
void process_result(const Problem& gemm_problem,
ck_tile::DeviceMem& c_m_n_dev_buf,
ck_tile::HostTensor<CDataType>& c_m_n_host_result,
@@ -58,7 +53,7 @@ public:
const std::tuple<std::string, float>& kernel_run_result)
{
auto [name, avg_time] = kernel_run_result;
using DDataType = typename get_DsDataType<Problem>::type;
using DDataType = typename get_DsDataType<Problem>::type;
KernelInstance<Problem> kernel_instance{name, gemm_problem, {-1.0f, -1.0f, -1.0f}};
@@ -68,16 +63,14 @@ public:
sizeof(BDataType) * gemm_problem.n_ * gemm_problem.k_ +
sizeof(CDataType) * gemm_problem.m_ * gemm_problem.n_;
if constexpr (!std::is_void_v<DDataType>)
{
ck_tile::static_for<0, DDataType::size(), 1>{}([&](auto i) {
using DType = ck_tile::remove_cvref_t<std::tuple_element_t<i, DDataType>>;
num_byte += sizeof(DType) * gemm_problem.m_ * gemm_problem.n_;
flop += sizeof(DType) * gemm_problem.m_ * gemm_problem.n_;
});
}
if constexpr(!std::is_void_v<DDataType>)
{
ck_tile::static_for<0, DDataType::size(), 1>{}([&](auto i) {
using DType = ck_tile::remove_cvref_t<std::tuple_element_t<i, DDataType>>;
num_byte += sizeof(DType) * gemm_problem.m_ * gemm_problem.n_;
flop += sizeof(DType) * gemm_problem.m_ * gemm_problem.n_;
});
}
// update
kernel_instance.perf_result_.latency_ = avg_time;
@@ -91,15 +84,14 @@ public:
// verify result
c_m_n_dev_buf.FromDevice(c_m_n_dev_result.data());
int split_k = 1;
if constexpr (std::is_same_v<Problem, GemmProblem>)
{
split_k = gemm_problem.split_k_;
}
int split_k = 1;
if constexpr(std::is_same_v<Problem, GemmProblem>)
{
split_k = gemm_problem.split_k_;
}
bool verified_correct =
!setting_.verify_ ||
compare<Problem>(
name, gemm_problem.k_, split_k, c_m_n_dev_result, c_m_n_host_result);
compare<Problem>(name, gemm_problem.k_, split_k, c_m_n_dev_result, c_m_n_host_result);
if(verified_correct)
{
@@ -196,5 +188,3 @@ public:
std::vector<KernelInstance<Problem>> kernel_instances_;
};

View File

@@ -4,14 +4,10 @@
import os
import sys
import json
import subprocess
import argparse
import csv
import time
import importlib.util
from pathlib import Path
from typing import List, Dict, Tuple, Optional
def _import_gemm_benchmark():
"""Import validation utilities from commons directory."""
@@ -28,6 +24,7 @@ def _import_gemm_benchmark():
return gemm_benchmark_module.GemmBenchmark
def _import_benchmark_utils():
"""Import benchmark utilities from commons directory."""
current_dir = os.path.dirname(os.path.abspath(__file__))
@@ -43,16 +40,20 @@ def _import_benchmark_utils():
return benchmark_utils
GemmBenchmark = _import_gemm_benchmark()
benchmark_utils = _import_benchmark_utils()
class GemmUniversalBenchmark(GemmBenchmark):
def __init__(self, build_dir: str, verbose: bool = False):
super().__init__(build_dir, verbose, name="benchmark_gemm_")
super().__init__(build_dir, verbose, name="benchmark_gemm_universal_")
def main():
parser = argparse.ArgumentParser(description="GEMM Kernel Benchmarking Tool")
parser = argparse.ArgumentParser(
description="Universal GEMM Kernel Benchmarking Tool"
)
parser.add_argument(
"build_dir", help="Build directory containing kernel executables"
)
@@ -67,7 +68,9 @@ def main():
)
parser.add_argument("--verify", action="store_true", help="Enable verification")
parser.add_argument(
"--csv", default="gemm_benchmark_results.csv", help="CSV output filename"
"--csv",
default="gemm_universal_benchmark_results.csv",
help="CSV output filename",
)
parser.add_argument(
"--best", default="best_kernels.txt", help="Best kernels output filename"
@@ -115,7 +118,7 @@ def main():
benchmark = GemmUniversalBenchmark(args.build_dir, verbose=args.verbose)
# Run benchmark sweep
print("Starting GEMM kernel benchmark sweep...")
print("Starting Universal GEMM kernel benchmark sweep...")
start_time = time.time()
best_kernels = benchmark.benchmark_sweep(

View File

@@ -11,6 +11,7 @@
#include "ck_tile/core.hpp"
#include "ck_tile/host.hpp"
#include "gemm/gemm_common.hpp"
#include "gemm_universal_profiler.hpp"
// The kernel header is included via the compile command line with -include flag

View File

@@ -9,21 +9,21 @@
#include "ck_tile/host/device_prop.hpp"
#include "ck_tile/ops/gemm.hpp"
#include "gemm_universal_benchmark.hpp"
#include "gemm/gemm_benchmark.hpp"
#include "gemm/gemm_profiler.hpp"
#include "gemm_universal_benchmark.hpp"
class UniversalGemmProfiler : public GemmProfiler<UniversalGemmProfiler,
GemmProblem,
ck_tile::GemmHostArgs>
class UniversalGemmProfiler
: public GemmProfiler<UniversalGemmProfiler, GemmProblem, ck_tile::GemmHostArgs>
{
public:
using BaseGemm = GemmProfiler<UniversalGemmProfiler,
GemmProblem,
ck_tile::GemmHostArgs>;
public:
using BaseGemm = GemmProfiler<UniversalGemmProfiler, GemmProblem, ck_tile::GemmHostArgs>;
using BaseGemm::benchmark;
UniversalGemmProfiler(Setting setting)
: GemmProfiler<UniversalGemmProfiler, GemmProblem, ck_tile::GemmHostArgs>(setting) {}
: GemmProfiler<UniversalGemmProfiler, GemmProblem, ck_tile::GemmHostArgs>(setting)
{
}
void benchmark(GemmProblem& gemm_problem,
std::vector<std::function<std::tuple<std::string, float>(