mirror of
https://github.com/ROCm/composable_kernel.git
synced 2026-06-30 11:47:48 +00:00
Adding README back into the gemm directory and integrate new preshuffle functions
This commit is contained in:
6
Jenkinsfile
vendored
6
Jenkinsfile
vendored
@@ -1640,7 +1640,7 @@ pipeline {
|
||||
-D GEMM_PRESHUFFLE_LAYOUT="rcr" \
|
||||
-D GEMM_PRESHUFFLE_CONFIG_FILE="default_ci_config.json" .. && \
|
||||
ninja -j${nthreads()} benchmark_gemm_universal_all benchmark_gemm_preshuffle_all benchmark_gemm_multi_d_all && \
|
||||
python3 ../tile_engine/ops/gemm/gemm_universal/gemm_benchmark.py . --problem-sizes "1024,1024,1024" --warmup 5 --repeat 5 --verbose --json results.json && \
|
||||
python3 ../tile_engine/ops/gemm/gemm_universal/gemm_universal_benchmark.py . --problem-sizes "1024,1024,1024" --warmup 5 --repeat 5 --verbose --json results.json && \
|
||||
python3 ../tile_engine/ops/gemm/gemm_preshuffle/gemm_preshuffle_benchmark.py . --problem-sizes "1024,1024,1024" --warmup 5 --repeat 5 --verbose --json results.json && \
|
||||
python3 ../tile_engine/ops/gemm/gemm_multi_d/gemm_multi_d_benchmark.py . --problem-sizes "1024,1024,1024" --warmup 5 --repeat 5 --verbose --json results.json """
|
||||
}
|
||||
@@ -1681,7 +1681,7 @@ pipeline {
|
||||
-D GEMM_PRESHUFFLE_DATATYPE="fp16;fp8;bf16;bf8" \
|
||||
-D GEMM_PRESHUFFLE_LAYOUT="rcr" .. && \
|
||||
ninja -j${nthreads()} benchmark_gemm_universal_all benchmark_gemm_preshuffle_all benchmark_gemm_multi_d_all benchmark_gemm_streamk_all && \
|
||||
python3 ../tile_engine/ops/gemm/gemm_universal/gemm_benchmark.py . --problem-sizes "1024,1024,1024" --warmup 5 --repeat 5 --verbose --json results.json && \
|
||||
python3 ../tile_engine/ops/gemm/gemm_universal/gemm_universal_benchmark.py . --problem-sizes "1024,1024,1024" --warmup 5 --repeat 5 --verbose --json results.json && \
|
||||
python3 ../tile_engine/ops/gemm/gemm_preshuffle/gemm_preshuffle_benchmark.py . --problem-sizes "1024,1024,1024" --warmup 5 --repeat 5 --verbose --json results.json && \
|
||||
python3 ../tile_engine/ops/gemm/gemm_multi_d/gemm_multi_d_benchmark.py . --problem-sizes "1024,1024,1024" --warmup 5 --repeat 5 --verbose --json results.json """
|
||||
}
|
||||
@@ -1706,7 +1706,7 @@ pipeline {
|
||||
-D GEMM_UNIVERSAL_DATATYPE="fp16" \
|
||||
-D GEMM_UNIVERSAL_LAYOUT="rcr;rrr;crr;ccr" .. && \
|
||||
ninja -j${nthreads()} benchmark_gemm_universal_all && \
|
||||
python3 ../tile_engine/ops/gemm/gemm_universal/gemm_benchmark.py . --problem-sizes "1024,1024,1024" --warmup 5 --repeat 5 --verbose --json results.json """
|
||||
python3 ../tile_engine/ops/gemm/gemm_universal/gemm_universal_benchmark.py . --problem-sizes "1024,1024,1024" --warmup 5 --repeat 5 --verbose --json results.json """
|
||||
}
|
||||
steps{
|
||||
buildHipClangJobAndReboot(setup_args:setup_args, build_type: 'Release', execute_cmd: execute_args)
|
||||
|
||||
@@ -10,7 +10,7 @@
|
||||
# ============================================================================
|
||||
|
||||
# Locate tile_engine GEMM scripts directory
|
||||
set(TILE_ENGINE_GEMM_DIR "${PROJECT_SOURCE_DIR}/tile_engine/ops/gemm")
|
||||
set(TILE_ENGINE_GEMM_DIR "${PROJECT_SOURCE_DIR}/tile_engine/ops/gemm/gemm_universal")
|
||||
|
||||
if(NOT EXISTS ${TILE_ENGINE_GEMM_DIR})
|
||||
message(WARNING "Tile engine directory not found: ${TILE_ENGINE_GEMM_DIR}")
|
||||
@@ -32,11 +32,11 @@ endif()
|
||||
# config_json - Full path to JSON configuration file
|
||||
# ============================================================================
|
||||
function(create_individual_gemm_test_target datatype layout config_name trait tile_config config_json)
|
||||
set(target_name "test_gemm_tile_engine_${datatype}_${layout}_${config_name}_${trait}_${tile_config}")
|
||||
set(target_name "test_gemm_universal_tile_engine_${datatype}_${layout}_${config_name}_${trait}_${tile_config}")
|
||||
set(working_path "${CMAKE_CURRENT_BINARY_DIR}/${datatype}/${layout}/${config_name}")
|
||||
|
||||
# Generated header path (already created during cmake configuration)
|
||||
set(test_header "${working_path}/gemm_single_${datatype}_${layout}_${trait}_${tile_config}.hpp")
|
||||
set(test_header "${working_path}/gemm_universal_single_${datatype}_${layout}_${trait}_${tile_config}.hpp")
|
||||
set(test_params_header "${working_path}/test_params.hpp")
|
||||
|
||||
# Verify header exists (should have been generated during cmake configuration)
|
||||
@@ -118,7 +118,7 @@ function(build_gemm_test_targets datatype layout config_name)
|
||||
|
||||
# STEP 1: Discovery phase - list all valid kernel configurations
|
||||
execute_process(
|
||||
COMMAND ${Python3_EXECUTABLE} -u ${TILE_ENGINE_GEMM_DIR}/gemm_instance_builder.py
|
||||
COMMAND ${Python3_EXECUTABLE} -u ${TILE_ENGINE_GEMM_DIR}/gemm_universal_instance_builder.py
|
||||
--working_path ${working_path}
|
||||
--datatype ${datatype}
|
||||
--layout ${layout}
|
||||
@@ -178,7 +178,7 @@ function(build_gemm_test_targets datatype layout config_name)
|
||||
|
||||
# Generate header using --gen_single
|
||||
execute_process(
|
||||
COMMAND ${Python3_EXECUTABLE} -u ${TILE_ENGINE_GEMM_DIR}/gemm_instance_builder.py
|
||||
COMMAND ${Python3_EXECUTABLE} -u ${TILE_ENGINE_GEMM_DIR}/gemm_universal_instance_builder.py
|
||||
--working_path ${working_path}
|
||||
--gpu_target "${GEMM_TEST_GPU_TARGETS}"
|
||||
--datatype ${datatype}
|
||||
|
||||
@@ -2,17 +2,16 @@
|
||||
# Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
|
||||
# SPDX-License-Identifier: MIT
|
||||
|
||||
import sys
|
||||
import json
|
||||
import subprocess
|
||||
import argparse
|
||||
import csv
|
||||
import time
|
||||
from pathlib import Path
|
||||
from typing import List, Dict, Tuple, Optional
|
||||
from typing import List, Dict, Optional
|
||||
|
||||
|
||||
def run_kernel(build_dir: Path, kernel_path: Path, params: Dict[str, str], verbose: bool = False) -> Optional[Dict]:
|
||||
def run_kernel(
|
||||
build_dir: Path, kernel_path: Path, params: Dict[str, str], verbose: bool = False
|
||||
) -> Optional[Dict]:
|
||||
"""Run a single kernel with given parameters and save output to individual JSON file"""
|
||||
# Create results directory
|
||||
results_dir = build_dir / "results"
|
||||
@@ -59,6 +58,7 @@ def run_kernel(build_dir: Path, kernel_path: Path, params: Dict[str, str], verbo
|
||||
print(f"Error running {kernel_path.name}: {e}")
|
||||
return None
|
||||
|
||||
|
||||
def parse_json_file(json_file: Path, verbose: bool = False) -> Optional[Dict]:
|
||||
"""Parse JSON data from individual kernel output file"""
|
||||
try:
|
||||
@@ -88,9 +88,8 @@ def parse_json_file(json_file: Path, verbose: bool = False) -> Optional[Dict]:
|
||||
print(f"Error reading JSON file {json_file}: {e}")
|
||||
return None
|
||||
|
||||
def find_best_kernel(
|
||||
results: List[Dict], metric: str = "tflops"
|
||||
) -> Optional[Dict]:
|
||||
|
||||
def find_best_kernel(results: List[Dict], metric: str = "tflops") -> Optional[Dict]:
|
||||
"""Find the best performing kernel based on metric"""
|
||||
if not results:
|
||||
return None
|
||||
@@ -126,7 +125,8 @@ def export_csv(results: List[Dict], filename: str, verbose: bool = False):
|
||||
|
||||
print(f"Results exported to {filename}")
|
||||
|
||||
def export_best_kernels( best_kernels: Dict, filename: str, verbose: bool = False):
|
||||
|
||||
def export_best_kernels(best_kernels: Dict, filename: str, verbose: bool = False):
|
||||
"""Export best kernel selections to file"""
|
||||
with open(filename, "w") as f:
|
||||
f.write("# Best kernel selections\n")
|
||||
@@ -141,7 +141,10 @@ def export_best_kernels( best_kernels: Dict, filename: str, verbose: bool = Fals
|
||||
|
||||
print(f"Best kernels exported to {filename}")
|
||||
|
||||
def export_json(results: List[Dict], filename: str, best_kernels: Dict = None, verbose: bool = False):
|
||||
|
||||
def export_json(
|
||||
results: List[Dict], filename: str, best_kernels: Dict = None, verbose: bool = False
|
||||
):
|
||||
"""Export all results and best kernels to JSON with comprehensive metadata"""
|
||||
from datetime import datetime
|
||||
|
||||
@@ -223,9 +226,7 @@ def export_json(results: List[Dict], filename: str, best_kernels: Dict = None, v
|
||||
"benchmark_metadata": {
|
||||
"timestamp": datetime.now().isoformat(),
|
||||
"total_kernels_tested": len(results),
|
||||
"unique_kernels": len(
|
||||
set(r.get("name", "unknown") for r in results)
|
||||
),
|
||||
"unique_kernels": len(set(r.get("name", "unknown") for r in results)),
|
||||
"successful_runs": len(successful_results),
|
||||
"failed_runs": len(results) - len(successful_results),
|
||||
},
|
||||
@@ -265,9 +266,7 @@ def export_json(results: List[Dict], filename: str, best_kernels: Dict = None, v
|
||||
"by_scheduler": scheduler_stats,
|
||||
"by_data_type": data_type_stats,
|
||||
},
|
||||
"total_problem_configurations": len(best_kernels)
|
||||
if best_kernels
|
||||
else 0,
|
||||
"total_problem_configurations": len(best_kernels) if best_kernels else 0,
|
||||
},
|
||||
"kernel_results": results,
|
||||
"best_kernels_by_problem": best_kernels or {},
|
||||
@@ -282,4 +281,3 @@ def export_json(results: List[Dict], filename: str, best_kernels: Dict = None, v
|
||||
print(f" - Best TFLOPS: {max(tflops_values, default=0):.2f}")
|
||||
print(f" - Best bandwidth: {max(bandwidth_values, default=0):.2f} GB/s")
|
||||
print(f" - Best latency: {min(latency_values, default=0):.2f}ms")
|
||||
|
||||
|
||||
@@ -20,89 +20,6 @@ constexpr auto is_row_major(Layout)
|
||||
return ck_tile::bool_constant<std::is_same_v<Layout, ck_tile::tensor_layout::gemm::RowMajor>>{};
|
||||
}
|
||||
|
||||
// Structure to hold kernel traits for dispatcher
|
||||
struct KernelTraits
|
||||
{
|
||||
std::string pipeline; // compv3, compv4, mem
|
||||
std::string scheduler; // intrawave, interwave
|
||||
std::string epilogue; // cshuffle, default
|
||||
bool pad_m;
|
||||
bool pad_n;
|
||||
bool pad_k;
|
||||
bool persistent;
|
||||
|
||||
// Constructor with defaults
|
||||
KernelTraits()
|
||||
: pipeline("compv3"),
|
||||
scheduler("intrawave"),
|
||||
epilogue("cshuffle"),
|
||||
pad_m(false),
|
||||
pad_n(false),
|
||||
pad_k(false),
|
||||
persistent(false)
|
||||
{
|
||||
}
|
||||
};
|
||||
|
||||
|
||||
// Create argument parser
|
||||
inline auto create_args(int argc, char* argv[])
|
||||
{
|
||||
ck_tile::ArgParser arg_parser;
|
||||
arg_parser.insert("m", "3840", "The value for m dimension. Default is 3840.")
|
||||
.insert("n", "4096", "The value for n dimension. Default is 4096.")
|
||||
.insert("k", "2048", "The value for k dimension. Default is 2048.")
|
||||
.insert("stride_a", "0", "The stride value for tensor A. Default is 0.")
|
||||
.insert("stride_b", "0", "The stride value for tensor B. Default is 0.")
|
||||
.insert("stride_ds", "0", "The stride value for tensor Ds . Default is 0.")
|
||||
.insert("stride_c", "0", "The stride value for tensor C. Default is 0.")
|
||||
.insert("split_k", "1", "The split value for k dimension. Default is 1.")
|
||||
.insert("verify",
|
||||
"2",
|
||||
"The type of validation. Set to 0 for no validation, 1 for validation on CPU, or 2 "
|
||||
"for validation on GPU. Default is 2, GPU validation.")
|
||||
.insert("log",
|
||||
"false",
|
||||
"Whether output kernel instance information or not. Possible values are true or "
|
||||
"false. Default is false")
|
||||
.insert(
|
||||
"warmup", "50", "The number of iterations before benchmark the kernel. Default is 50.")
|
||||
.insert(
|
||||
"repeat", "100", "The number of iterations to benchmark the kernel. Default is 100.")
|
||||
.insert("timer",
|
||||
"true",
|
||||
"Whether if the timer is gpu timer or not. Possible values are false or true. "
|
||||
"Default is true.")
|
||||
.insert("init",
|
||||
"0",
|
||||
"The method of tensor initialization. Set to 0 for random, to 1 for linear, or 2 "
|
||||
"for constant(1). Default is 0, random.")
|
||||
.insert("flush_cache",
|
||||
"true",
|
||||
"To flush cache, possible values are true or false. "
|
||||
"Default is false.")
|
||||
.insert("rotating_count", "1000", "number of iterations to rotate the cache. default is 5.")
|
||||
.insert("metric",
|
||||
"0",
|
||||
"Metric with which to measure kernel performance. Set to 0 for latency, 1 for "
|
||||
"tflops, or 2 for bandwidth. Default is 0, latency.")
|
||||
.insert("csv_filename",
|
||||
"",
|
||||
"The filename of benchmark result. Default is empty (no CSV output).")
|
||||
.insert("structured_sparsity",
|
||||
"false",
|
||||
"Whether use sparsity kernel or not. Possible values are true or false. Default is "
|
||||
"false")
|
||||
.insert("json_output",
|
||||
"false",
|
||||
"Whether to output results in JSON format only. Possible values are true or false. "
|
||||
"Default is "
|
||||
"false");
|
||||
|
||||
bool result = arg_parser.parse(argc, argv);
|
||||
return std::make_tuple(result, arg_parser);
|
||||
}
|
||||
|
||||
enum class Metric
|
||||
{
|
||||
LATENCY = 0,
|
||||
|
||||
442
tile_engine/ops/gemm/README.md
Normal file
442
tile_engine/ops/gemm/README.md
Normal file
@@ -0,0 +1,442 @@
|
||||
# CK Tile Engine GEMM Operations
|
||||
|
||||
## Overview
|
||||
|
||||
The CK Tile Engine GEMM module provides a comprehensive system for generating, building, and benchmarking GEMM (General Matrix Multiplication) kernels with various configurations. It supports multiple data types, layouts, and optimization strategies. The system has evolved from a monolithic build approach (where all kernels compile into a single executable) to a more flexible individual kernel compilation system, providing better build parallelism and targeted testing capabilities.
|
||||
|
||||
## Table of Contents
|
||||
|
||||
1. [Build System Architecture](#build-system-architecture)
|
||||
2. [Build Instructions](#build-instructions)
|
||||
3. [Running Benchmarks](#running-benchmarks)
|
||||
4. [Configuration System](#configuration-system)
|
||||
5. [Scripts and Tools](#scripts-and-tools)
|
||||
6. [Command Line Options](#command-line-options)
|
||||
7. [Understanding Kernel Names](#understanding-kernel-names)
|
||||
8. [Troubleshooting](#troubleshooting)
|
||||
9. [Performance Tips](#performance-tips)
|
||||
|
||||
## Build System Architecture
|
||||
|
||||
### Individual Kernel Compilation (New Approach)
|
||||
|
||||
The new tile engine benchmark system compiles each kernel configuration into a separate executable. This provides:
|
||||
- Better build parallelism
|
||||
- Faster incremental builds
|
||||
- More targeted testing
|
||||
- Easier debugging of specific configurations
|
||||
|
||||
Each benchmark executable follows the naming pattern:
|
||||
```
|
||||
benchmark_gemm_<dtype>_<layout>_<config>_<tile_sizes>
|
||||
```
|
||||
|
||||
### Monolithic Build (Legacy Approach)
|
||||
|
||||
The original system compiles all kernels into a single executable (`benchmark_gemm_[Datatype]_[Layout]`), which can then be filtered at runtime using command-line arguments.
|
||||
|
||||
## Build Instructions
|
||||
|
||||
### Prerequisites
|
||||
- ROCm installation
|
||||
- CMake 3.16 or higher
|
||||
- C++17 compatible compiler
|
||||
|
||||
### Basic Build
|
||||
|
||||
```bash
|
||||
# In the root of composable kernel, create build directory
|
||||
mkdir build && cd build
|
||||
|
||||
# Configure with specific datatypes and layouts
|
||||
# Replace [Arch] with your GPU architecture (e.g., gfx90a, gfx942)
|
||||
# Replace [Datatype1;Datatype2;...] with datatypes (fp8, bf8, int8, fp16, bf16, fp32, fp64)
|
||||
# Replace [Layout1;Layout2;...] with layouts (rcr, rrr, crr, ccr)
|
||||
../script/cmake-ck-dev.sh ../ [Arch] -DGEMM_DATATYPE="[Datatype1;Datatype2]" -DGEMM_LAYOUT="[Layout1;Layout2]"
|
||||
|
||||
# Build specific benchmarks
|
||||
make benchmark_gemm_[Datatype1]_[Layout1] -j
|
||||
```
|
||||
|
||||
### Configuration Options
|
||||
|
||||
The build system supports several configuration options:
|
||||
|
||||
#### Using Custom Config Files
|
||||
```bash
|
||||
# Method 1: CMake variable (config file must be in configs/ directory)
|
||||
cmake -DGEMM_CONFIG_FILE=my_custom_config.json ...
|
||||
|
||||
# Method 2: Environment variable (takes precedence over CMake variable)
|
||||
export GEMM_CONFIG_FILE=my_custom_config.json
|
||||
cmake ...
|
||||
```
|
||||
|
||||
#### Config File Priority Order
|
||||
1. **Environment variable** `GEMM_CONFIG_FILE` (highest priority)
|
||||
2. **CMake variable** `GEMM_CONFIG_FILE`
|
||||
3. **Default config** (default_config.json for all layouts)
|
||||
|
||||
**Note**: All custom config files must be placed in the `tile_engine/ops/gemm/configs/` directory.
|
||||
|
||||
### Example Build Commands
|
||||
|
||||
```bash
|
||||
# Build for gfx942 with fp8 and fp16 datatypes, rcr layout
|
||||
mkdir build && cd build
|
||||
../script/cmake-ck-dev.sh ../ gfx942 -DGEMM_DATATYPE="fp8;fp16" -DGEMM_LAYOUT="rcr;ccr;rrr;crr"
|
||||
make benchmark_gemm_universal_fp8_rcr -j
|
||||
make benchmark_gemm_universal_fp16_rcr -j
|
||||
```
|
||||
|
||||
### Building Individual Kernels
|
||||
|
||||
```bash
|
||||
# Build a specific kernel configuration
|
||||
make benchmark_gemm_universal_fp8_rcr_compv4_default_intrawave_False_False_False_False_256x256x32_1x4x1_32x32x32
|
||||
|
||||
# Build all fp16 benchmarks in parallel
|
||||
make -j$(nproc) $(make help | grep benchmark_gemm_fp16 | awk '{print $2}')
|
||||
```
|
||||
|
||||
### Rebuilding After Configuration Changes
|
||||
|
||||
If you modify the configuration file, you must rebuild:
|
||||
```bash
|
||||
rm -rf tile_engine/ && make benchmark_gemm_universal_[Datatype]_[Layout] -j
|
||||
```
|
||||
|
||||
## Running Benchmarks
|
||||
|
||||
### Individual Kernel Execution
|
||||
|
||||
```bash
|
||||
cd /path/to/build/directory
|
||||
./bin/benchmark_gemm_universal_fp16_rcr_compv3_default_intrawave_False_False_False_False_256x128x32_4x1x1_32x32x16 \
|
||||
-m=512 -n=512 -k=512 -verify=1
|
||||
```
|
||||
|
||||
### Monolithic Executable (Legacy)
|
||||
|
||||
```bash
|
||||
# Run specific pipeline/scheduler/epilogue combination
|
||||
./bin/benchmark_gemm_universal_[Datatype]_[Layout] -pipeline=compv3 -scheduler=intrawave -epilogue=default
|
||||
```
|
||||
|
||||
### Automated Testing
|
||||
|
||||
Use the provided test script to run multiple benchmarks:
|
||||
```bash
|
||||
cd /path/to/composable_kernel/tile_engine/ops/gemm
|
||||
./test_benchmark.sh [build_directory]
|
||||
```
|
||||
|
||||
## Configuration System
|
||||
|
||||
### Configuration Files
|
||||
|
||||
The system uses JSON configuration files to specify kernel parameters:
|
||||
|
||||
- `configs/default_config.json` - Default configurations for various datatypes
|
||||
- `configs/user_provided_config.json` - User-customizable configurations
|
||||
|
||||
### Configuration Structure
|
||||
|
||||
```json
|
||||
{
|
||||
"tile_config": {
|
||||
"tile_m": {"values": [256, 128]},
|
||||
"tile_n": {"values": [256, 128]},
|
||||
"tile_k": {"values": [64, 32]},
|
||||
"warp_m": {"values": [2, 4]},
|
||||
"warp_n": {"values": [2, 1]},
|
||||
"warp_k": {"values": [1]},
|
||||
"warp_tile_m": {"values": [32, 16]},
|
||||
"warp_tile_n": {"values": [32, 16]},
|
||||
"warp_tile_k": {"values": [16, 32]}
|
||||
},
|
||||
"trait_config": {
|
||||
"pipeline": {"values": ["compv3", "compv4", "mem"]},
|
||||
"scheduler": {"values": ["intrawave", "interwave"]},
|
||||
"epilogue": {"values": ["default", "cshuffle"]},
|
||||
"pad_m": {"values": [false]},
|
||||
"pad_n": {"values": [false]},
|
||||
"pad_k": {"values": [false]},
|
||||
"persistent": {"values": [false]}
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
## Scripts and Tools
|
||||
|
||||
### Python Scripts
|
||||
|
||||
#### gemm_universal_instance_builder.py
|
||||
**Purpose**: Main kernel instance generation script that creates C++ kernel implementations based on configuration files.
|
||||
|
||||
**Key Features**:
|
||||
- Generates individual kernel header files for separate compilation
|
||||
- Supports multiple data types (fp16, fp8, bf16, fp32, fp64)
|
||||
- Validates tile configurations for correctness
|
||||
- Creates CMake integration files
|
||||
|
||||
**Usage**:
|
||||
```bash
|
||||
python gemm_universal_instance_builder.py \
|
||||
--working_path ./generated \
|
||||
--datatype fp16 \
|
||||
--layout rcr \
|
||||
--config_json configs/user_provided_config.json \
|
||||
--gen_all_individual
|
||||
```
|
||||
|
||||
#### gemm_instance_builder_parallel.py
|
||||
**Purpose**: Parallel version of the instance builder for faster generation of multiple kernel configurations.
|
||||
|
||||
**Features**:
|
||||
- Multi-threaded kernel generation
|
||||
- Improved performance for large configuration spaces
|
||||
|
||||
#### validation_utils.py
|
||||
**Purpose**: Provides comprehensive validation functions for kernel configurations.
|
||||
|
||||
**Key Functions**:
|
||||
- `is_tile_config_valid()` - Validates tile dimensions and alignments
|
||||
- `is_trait_combination_valid()` - Checks if pipeline/epilogue/scheduler combinations are supported
|
||||
- `validate_warp_tile_combination()` - GPU-specific warp tile validation
|
||||
- `validate_lds_capacity()` - Ensures configurations fit in LDS memory
|
||||
|
||||
**Validation Checks**:
|
||||
- Dimension alignment (tile dimensions must be divisible by warp dimensions)
|
||||
- LDS capacity constraints
|
||||
- GPU-specific warp tile support
|
||||
- Unsupported trait combinations
|
||||
|
||||
#### test_validation.py
|
||||
**Purpose**: Test suite for the validation logic to ensure correctness.
|
||||
|
||||
**Usage**:
|
||||
```bash
|
||||
python test_validation.py
|
||||
```
|
||||
|
||||
**Tests**:
|
||||
- Warp tile combination validation
|
||||
- Trait combination validation
|
||||
- Full tile configuration validation
|
||||
|
||||
#### gemm_universal_benchmark.py
|
||||
**Purpose**: Python script for running and analyzing GEMM benchmarks.
|
||||
|
||||
**Features**:
|
||||
- Automated benchmark execution
|
||||
- Performance data collection
|
||||
- Result analysis and reporting
|
||||
|
||||
#### json_config.py
|
||||
**Purpose**: Configuration file parsing and management.
|
||||
|
||||
**Features**:
|
||||
- JSON configuration loading
|
||||
- Default configuration handling
|
||||
- Configuration validation
|
||||
|
||||
#### codegen_utils.py
|
||||
**Purpose**: Utility functions for code generation.
|
||||
|
||||
**Features**:
|
||||
- Template processing
|
||||
- Code formatting utilities
|
||||
- File generation helpers
|
||||
|
||||
### Shell Scripts
|
||||
|
||||
#### test_benchmark.sh
|
||||
**Purpose**: Automated benchmark testing script that finds and runs all built benchmark executables.
|
||||
|
||||
**Features**:
|
||||
- Automatic build directory detection
|
||||
- Batch execution of multiple benchmarks
|
||||
- CSV result collection
|
||||
- Colored output for easy reading
|
||||
- Example command generation
|
||||
|
||||
**Usage**:
|
||||
```bash
|
||||
# Auto-detect build directory
|
||||
./test_benchmark.sh
|
||||
|
||||
# Specify build directory
|
||||
./test_benchmark.sh /path/to/build/directory
|
||||
```
|
||||
|
||||
**What it does**:
|
||||
1. Finds all benchmark executables in the build directory
|
||||
2. Runs each with multiple problem sizes (512, 1024, 2048)
|
||||
3. Performs GPU verification
|
||||
4. Saves results to timestamped CSV file
|
||||
5. Provides summary statistics
|
||||
|
||||
## Command Line Options
|
||||
|
||||
All benchmark executables support the following options:
|
||||
|
||||
### Matrix Dimensions
|
||||
- `-m=<value>` - M dimension (default: 3840)
|
||||
- `-n=<value>` - N dimension (default: 4096)
|
||||
- `-k=<value>` - K dimension (default: 2048)
|
||||
|
||||
### Strides
|
||||
- `-stride_a=<value>` - Stride for matrix A (default: 0, auto-calculated)
|
||||
- `-stride_b=<value>` - Stride for matrix B (default: 0, auto-calculated)
|
||||
- `-stride_c=<value>` - Stride for matrix C (default: 0, auto-calculated)
|
||||
|
||||
### Verification
|
||||
- `-verify=<0|1|2>` - Verification mode
|
||||
- 0: No verification (default)
|
||||
- 1: CPU verification
|
||||
- 2: GPU verification
|
||||
|
||||
### Performance Testing
|
||||
- `-warmup=<value>` - Warmup iterations (default: 50)
|
||||
- `-repeat=<value>` - Benchmark iterations (default: 100)
|
||||
- `-timer=<true|false>` - Use GPU timer (default: true)
|
||||
- `-flush_cache=<true|false>` - Flush cache between runs (default: true)
|
||||
- `-rotating_count=<value>` - Cache rotation count (default: 1000)
|
||||
|
||||
### Initialization
|
||||
- `-init=<0|1|2>` - Tensor initialization method
|
||||
- 0: Random values [-1, 1] (default)
|
||||
- 1: Linear sequence (i % 17)
|
||||
- 2: Constant value (1.0)
|
||||
|
||||
### Output Options
|
||||
- `-log=<true|false>` - Enable verbose logging (default: false)
|
||||
- `-metric=<0|1|2>` - Performance metric
|
||||
- 0: Latency in ms (default)
|
||||
- 1: TFLOPS
|
||||
- 2: Bandwidth in GB/s
|
||||
- `-json_output=<true|false>` - JSON format output (default: false)
|
||||
- `-csv_filename=<filename>` - Save results to CSV
|
||||
- `-csv_format=<simple|comprehensive>` - CSV format (default: comprehensive)
|
||||
|
||||
### Advanced Options
|
||||
- `-split_k=<value>` - Split-K factor (default: 1)
|
||||
- `-structured_sparsity=<true|false>` - Enable structured sparsity (default: false)
|
||||
- `-pipeline=<compv3|compv4|mem>` - Pipeline type (default: compv3)
|
||||
- `-scheduler=<intrawave|interwave>` - Scheduler type (default: intrawave)
|
||||
- `-epilogue=<cshuffle|default>` - Epilogue type (default: cshuffle)
|
||||
- `-pad_m=<true|false>` - Pad M dimension (default: false)
|
||||
- `-pad_n=<true|false>` - Pad N dimension (default: false)
|
||||
- `-pad_k=<true|false>` - Pad K dimension (default: false)
|
||||
- `-persistent=<true|false>` - Use persistent kernel (default: false)
|
||||
|
||||
## Understanding Kernel Names
|
||||
|
||||
The kernel naming convention encodes the configuration:
|
||||
|
||||
```
|
||||
benchmark_gemm_universal_fp16_rcr_compv3_default_intrawave_False_False_False_False_256x128x32_4x1x1_32x32x16
|
||||
^^^^ ^^^ ^^^^^^ ^^^^^^^ ^^^^^^^^^ ^^^^^^^^^^^^^^^^^^^^^^^ ^^^^^^^^^ ^^^^^^^ ^^^^^^^^^
|
||||
| | | | | | | | |
|
||||
| | | | | Padding & flags | | Warp tile
|
||||
| | | | Scheduler | Thread tile
|
||||
| | | Epilogue Block tile
|
||||
| | Pipeline
|
||||
| Layout (Row-Column-Row)
|
||||
Data type
|
||||
```
|
||||
|
||||
### Components:
|
||||
- **Data type**: fp16, fp32, bf16, fp8, bf8, int8
|
||||
- **Layout**: rcr (Row-Column-Row), rrr, crr, ccr
|
||||
- **Pipeline**: mem, compv3, compv4
|
||||
- **Epilogue**: default, cshuffle
|
||||
- **Scheduler**: intrawave, interwave
|
||||
- **Flags**: pad_m, pad_n, pad_k, persistent (4 boolean flags)
|
||||
- **Tile sizes**: BlockTile x ThreadTile x WarpTile
|
||||
|
||||
## Troubleshooting
|
||||
|
||||
### Common Issues
|
||||
|
||||
1. **Kernel not found**
|
||||
- Ensure the specific benchmark executable is built
|
||||
- Check the build directory bin/ folder
|
||||
|
||||
2. **Verification failures**
|
||||
- Try GPU verification (-verify=2) which may be more accurate
|
||||
- Check data type compatibility
|
||||
- Verify stride calculations
|
||||
|
||||
3. **Build failures**
|
||||
- Check GPU architecture compatibility
|
||||
- Ensure ROCm is properly installed
|
||||
- Verify configuration file syntax
|
||||
|
||||
4. **Performance variations**
|
||||
- Increase warmup iterations
|
||||
- Disable CPU frequency scaling
|
||||
- Use GPU timer for accurate measurements
|
||||
|
||||
### Debug Options
|
||||
|
||||
Enable verbose logging:
|
||||
```bash
|
||||
./bin/benchmark_gemm_... -log=true -verify=1
|
||||
```
|
||||
|
||||
Test validation logic:
|
||||
```bash
|
||||
python test_validation.py
|
||||
```
|
||||
|
||||
## Performance Tips
|
||||
|
||||
1. **Optimal Problem Sizes**: Use sizes that are multiples of tile dimensions
|
||||
2. **Warmup**: Use at least 50-100 warmup iterations
|
||||
3. **GPU Timer**: Always use `-timer=true` for accurate measurements
|
||||
4. **Cache Management**: Enable cache flushing for consistent results
|
||||
5. **Thread Affinity**: Set CPU affinity to reduce variation
|
||||
|
||||
## Integration Examples
|
||||
|
||||
### Python Integration
|
||||
|
||||
```python
|
||||
import subprocess
|
||||
import json
|
||||
|
||||
# Run benchmark with JSON output
|
||||
result = subprocess.run([
|
||||
'./bin/benchmark_gemm_universal_fp16_rcr_...',
|
||||
'-m=1024', '-n=1024', '-k=1024',
|
||||
'-json_output=true'
|
||||
], capture_output=True, text=True)
|
||||
|
||||
# Parse results
|
||||
data = json.loads(result.stdout)
|
||||
print(f"Performance: {data['tflops']} TFLOPS")
|
||||
```
|
||||
|
||||
### Batch Testing Script
|
||||
|
||||
```bash
|
||||
#!/bin/bash
|
||||
SIZES="512 1024 2048 4096"
|
||||
for size in $SIZES; do
|
||||
echo "Testing ${size}x${size}x${size}"
|
||||
./bin/benchmark_gemm_... -m=$size -n=$size -k=$size \
|
||||
-verify=2 -csv_filename=results.csv
|
||||
done
|
||||
```
|
||||
|
||||
## Contributing
|
||||
|
||||
When adding new features or configurations:
|
||||
1. Update validation logic in `validation_utils.py`
|
||||
2. Add tests to `test_validation.py`
|
||||
3. Update configuration examples
|
||||
4. Document new command-line options
|
||||
|
||||
For more information about the Composable Kernel project, visit the main repository documentation.
|
||||
@@ -52,19 +52,27 @@ struct GemmProblem
|
||||
|
||||
// Detect Problem::DsDataType, default to void when absent
|
||||
template <class T, class = void>
|
||||
struct get_DsDataType { using type = void; };
|
||||
struct get_DsDataType
|
||||
{
|
||||
using type = void;
|
||||
};
|
||||
|
||||
template <class T>
|
||||
struct get_DsDataType<T, std::void_t<typename T::DsDataType>> {
|
||||
struct get_DsDataType<T, std::void_t<typename T::DsDataType>>
|
||||
{
|
||||
using type = typename T::DsDataType;
|
||||
};
|
||||
|
||||
// Detect Problem::D0DataType, default to void when absent
|
||||
template <class T, class = void>
|
||||
struct get_D0DataType { using type = void; };
|
||||
struct get_D0DataType
|
||||
{
|
||||
using type = void;
|
||||
};
|
||||
|
||||
template <class T>
|
||||
struct get_D0DataType<T, std::void_t<typename T::D0DataType>> {
|
||||
struct get_D0DataType<T, std::void_t<typename T::D0DataType>>
|
||||
{
|
||||
using type = typename T::D0DataType;
|
||||
};
|
||||
|
||||
@@ -79,10 +87,10 @@ bool compare(std::string instanceName,
|
||||
using DDataType = typename get_D0DataType<Problem>::type;
|
||||
const float max_accumulated_value =
|
||||
*std::max_element(c_m_n_host_result.mData.begin(), c_m_n_host_result.mData.end());
|
||||
//const auto rtol_atol = calculate_rtol_atol<ADataType, BDataType, AccDataType, CDataType>(
|
||||
//K, kbatch, max_accumulated_value);
|
||||
// const auto rtol_atol = calculate_rtol_atol<ADataType, BDataType, AccDataType, CDataType>(
|
||||
// K, kbatch, max_accumulated_value);
|
||||
auto rtol_atol = [&] {
|
||||
if constexpr (std::is_void_v<DDataType>)
|
||||
if constexpr(std::is_void_v<DDataType>)
|
||||
{
|
||||
return calculate_rtol_atol<ADataType, BDataType, AccDataType, CDataType>(
|
||||
K, kbatch, max_accumulated_value);
|
||||
|
||||
@@ -3,15 +3,10 @@
|
||||
# SPDX-License-Identifier: MIT
|
||||
|
||||
import os
|
||||
import sys
|
||||
import json
|
||||
import subprocess
|
||||
import argparse
|
||||
import csv
|
||||
import time
|
||||
import importlib.util
|
||||
from pathlib import Path
|
||||
from typing import List, Dict, Tuple, Optional
|
||||
from typing import List, Dict, Tuple
|
||||
|
||||
|
||||
# TODO: explore modularizing tile engine to avoid accessing imports like this
|
||||
def _import_benchmark_utils():
|
||||
@@ -29,10 +24,14 @@ def _import_benchmark_utils():
|
||||
|
||||
return benchmark_utils
|
||||
|
||||
|
||||
benchmark_utils = _import_benchmark_utils()
|
||||
|
||||
|
||||
class GemmBenchmark:
|
||||
def __init__(self, build_dir: str, verbose: bool = False, name: str = "benchmark_gemm_"):
|
||||
def __init__(
|
||||
self, build_dir: str, verbose: bool = False, name: str = "benchmark_gemm_"
|
||||
):
|
||||
self.build_dir = Path(build_dir)
|
||||
self.verbose = verbose
|
||||
self.results = []
|
||||
@@ -57,7 +56,7 @@ class GemmBenchmark:
|
||||
"""Extract comprehensive kernel information from filename"""
|
||||
name = kernel_path.stem
|
||||
if name.startswith(self.name):
|
||||
args = name[len(self.name):]
|
||||
args = name[len(self.name) :]
|
||||
else:
|
||||
args = name
|
||||
|
||||
@@ -245,7 +244,9 @@ class GemmBenchmark:
|
||||
|
||||
for kernel_path in kernels:
|
||||
kernel_info = self.extract_kernel_info(kernel_path)
|
||||
result = benchmark_utils.run_kernel(self.build_dir, kernel_path, params, verbose=self.verbose)
|
||||
result = benchmark_utils.run_kernel(
|
||||
self.build_dir, kernel_path, params, verbose=self.verbose
|
||||
)
|
||||
if result:
|
||||
# Create new structured result format
|
||||
structured_result = {
|
||||
@@ -327,5 +328,3 @@ class GemmBenchmark:
|
||||
|
||||
self.results = all_results
|
||||
return best_kernels
|
||||
|
||||
|
||||
|
||||
96
tile_engine/ops/gemm/gemm_common.hpp
Normal file
96
tile_engine/ops/gemm/gemm_common.hpp
Normal file
@@ -0,0 +1,96 @@
|
||||
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
|
||||
// SPDX-License-Identifier: MIT
|
||||
|
||||
#pragma once
|
||||
#include <iostream>
|
||||
#include <functional>
|
||||
#include <tuple>
|
||||
#include <exception>
|
||||
#include <sstream>
|
||||
#include <vector>
|
||||
#include <string>
|
||||
|
||||
#include "ck_tile/core.hpp"
|
||||
#include "ck_tile/host.hpp"
|
||||
|
||||
// Structure to hold kernel traits for dispatcher
|
||||
struct KernelTraits
|
||||
{
|
||||
std::string pipeline; // compv3, compv4, mem
|
||||
std::string scheduler; // intrawave, interwave
|
||||
std::string epilogue; // cshuffle, default
|
||||
bool pad_m;
|
||||
bool pad_n;
|
||||
bool pad_k;
|
||||
bool persistent;
|
||||
|
||||
// Constructor with defaults
|
||||
KernelTraits()
|
||||
: pipeline("compv3"),
|
||||
scheduler("intrawave"),
|
||||
epilogue("cshuffle"),
|
||||
pad_m(false),
|
||||
pad_n(false),
|
||||
pad_k(false),
|
||||
persistent(false)
|
||||
{
|
||||
}
|
||||
};
|
||||
|
||||
// Create argument parser
|
||||
inline auto create_args(int argc, char* argv[])
|
||||
{
|
||||
ck_tile::ArgParser arg_parser;
|
||||
arg_parser.insert("m", "3840", "The value for m dimension. Default is 3840.")
|
||||
.insert("n", "4096", "The value for n dimension. Default is 4096.")
|
||||
.insert("k", "2048", "The value for k dimension. Default is 2048.")
|
||||
.insert("stride_a", "0", "The stride value for tensor A. Default is 0.")
|
||||
.insert("stride_b", "0", "The stride value for tensor B. Default is 0.")
|
||||
.insert("stride_ds", "0", "The stride value for tensor Ds . Default is 0.")
|
||||
.insert("stride_c", "0", "The stride value for tensor C. Default is 0.")
|
||||
.insert("split_k", "1", "The split value for k dimension. Default is 1.")
|
||||
.insert("verify",
|
||||
"2",
|
||||
"The type of validation. Set to 0 for no validation, 1 for validation on CPU, or 2 "
|
||||
"for validation on GPU. Default is 2, GPU validation.")
|
||||
.insert("log",
|
||||
"false",
|
||||
"Whether output kernel instance information or not. Possible values are true or "
|
||||
"false. Default is false")
|
||||
.insert(
|
||||
"warmup", "50", "The number of iterations before benchmark the kernel. Default is 50.")
|
||||
.insert(
|
||||
"repeat", "100", "The number of iterations to benchmark the kernel. Default is 100.")
|
||||
.insert("timer",
|
||||
"true",
|
||||
"Whether if the timer is gpu timer or not. Possible values are false or true. "
|
||||
"Default is true.")
|
||||
.insert("init",
|
||||
"0",
|
||||
"The method of tensor initialization. Set to 0 for random, to 1 for linear, or 2 "
|
||||
"for constant(1). Default is 0, random.")
|
||||
.insert("flush_cache",
|
||||
"true",
|
||||
"To flush cache, possible values are true or false. "
|
||||
"Default is false.")
|
||||
.insert("rotating_count", "1000", "number of iterations to rotate the cache. default is 5.")
|
||||
.insert("metric",
|
||||
"0",
|
||||
"Metric with which to measure kernel performance. Set to 0 for latency, 1 for "
|
||||
"tflops, or 2 for bandwidth. Default is 0, latency.")
|
||||
.insert("csv_filename",
|
||||
"",
|
||||
"The filename of benchmark result. Default is empty (no CSV output).")
|
||||
.insert("structured_sparsity",
|
||||
"false",
|
||||
"Whether use sparsity kernel or not. Possible values are true or false. Default is "
|
||||
"false")
|
||||
.insert("json_output",
|
||||
"false",
|
||||
"Whether to output results in JSON format only. Possible values are true or false. "
|
||||
"Default is "
|
||||
"false");
|
||||
|
||||
bool result = arg_parser.parse(argc, argv);
|
||||
return std::make_tuple(result, arg_parser);
|
||||
}
|
||||
@@ -3,14 +3,10 @@
|
||||
|
||||
import os
|
||||
import sys
|
||||
import json
|
||||
import subprocess
|
||||
import argparse
|
||||
import csv
|
||||
import time
|
||||
import importlib.util
|
||||
from pathlib import Path
|
||||
from typing import List, Dict, Tuple, Optional
|
||||
|
||||
|
||||
def _import_gemm_benchmark():
|
||||
"""Import validation utilities from commons directory."""
|
||||
@@ -27,6 +23,7 @@ def _import_gemm_benchmark():
|
||||
|
||||
return gemm_benchmark_module.GemmBenchmark
|
||||
|
||||
|
||||
def _import_benchmark_utils():
|
||||
"""Import benchmark utilities from commons directory."""
|
||||
current_dir = os.path.dirname(os.path.abspath(__file__))
|
||||
@@ -42,13 +39,16 @@ def _import_benchmark_utils():
|
||||
|
||||
return benchmark_utils
|
||||
|
||||
|
||||
GemmBenchmark = _import_gemm_benchmark()
|
||||
benchmark_utils = _import_benchmark_utils()
|
||||
|
||||
|
||||
class GemmMultiDBenchmark(GemmBenchmark):
|
||||
def __init__(self, build_dir: str, verbose: bool = False):
|
||||
super().__init__(build_dir, verbose, name="benchmark_gemm_multi_d_")
|
||||
|
||||
|
||||
def main():
|
||||
parser = argparse.ArgumentParser(
|
||||
description="GEMM Multi D Kernel Benchmarking Tool"
|
||||
|
||||
@@ -11,6 +11,7 @@
|
||||
|
||||
#include "ck_tile/core.hpp"
|
||||
#include "ck_tile/host.hpp"
|
||||
#include "gemm/gemm_common.hpp"
|
||||
#include "gemm_multi_d_profiler.hpp"
|
||||
|
||||
// The kernel header is included via the compile command line with -include flag
|
||||
@@ -35,29 +36,27 @@ void benchmark_single(const ck_tile::ArgParser& arg_parser)
|
||||
std::string layout_d1 = D1Layout::name;
|
||||
|
||||
// Create GemmMultiDProblem struct
|
||||
GemmMultiDProblem gemm_multi_d_problem{
|
||||
GemmProblem{
|
||||
arg_parser.get_int("split_k"),
|
||||
arg_parser.get_int("m"),
|
||||
arg_parser.get_int("n"),
|
||||
arg_parser.get_int("k"),
|
||||
arg_parser.get_int("stride_a"),
|
||||
arg_parser.get_int("stride_b"),
|
||||
arg_parser.get_int("stride_c"),
|
||||
dtype_a,
|
||||
dtype_b,
|
||||
dtype_acc,
|
||||
dtype_c,
|
||||
layout_a,
|
||||
layout_b,
|
||||
layout_c,
|
||||
arg_parser.get_bool("structured_sparsity")},
|
||||
arg_parser.get_int("stride_ds"),
|
||||
arg_parser.get_int("stride_ds"),
|
||||
dtype_d0,
|
||||
dtype_d1,
|
||||
layout_d0,
|
||||
layout_d1};
|
||||
GemmMultiDProblem gemm_multi_d_problem{GemmProblem{arg_parser.get_int("split_k"),
|
||||
arg_parser.get_int("m"),
|
||||
arg_parser.get_int("n"),
|
||||
arg_parser.get_int("k"),
|
||||
arg_parser.get_int("stride_a"),
|
||||
arg_parser.get_int("stride_b"),
|
||||
arg_parser.get_int("stride_c"),
|
||||
dtype_a,
|
||||
dtype_b,
|
||||
dtype_acc,
|
||||
dtype_c,
|
||||
layout_a,
|
||||
layout_b,
|
||||
layout_c,
|
||||
arg_parser.get_bool("structured_sparsity")},
|
||||
arg_parser.get_int("stride_ds"),
|
||||
arg_parser.get_int("stride_ds"),
|
||||
dtype_d0,
|
||||
dtype_d1,
|
||||
layout_d0,
|
||||
layout_d1};
|
||||
|
||||
// Create Setting struct
|
||||
Setting setting{arg_parser.get_int("warmup"),
|
||||
|
||||
@@ -11,25 +11,28 @@
|
||||
#include <functional>
|
||||
#include <tuple>
|
||||
|
||||
|
||||
#include "ck_tile/host/device_prop.hpp"
|
||||
#include "ck_tile/ops/gemm.hpp"
|
||||
#include "gemm/gemm_profiler.hpp"
|
||||
#include "common/utils.hpp"
|
||||
#include "gemm_multi_d_benchmark.hpp"
|
||||
|
||||
class GemmMultiDProfiler: public GemmProfiler<GemmMultiDProfiler,
|
||||
GemmMultiDProblem,
|
||||
ck_tile::GemmMultiDHostArgs<DsDataType::size()>>
|
||||
class GemmMultiDProfiler : public GemmProfiler<GemmMultiDProfiler,
|
||||
GemmMultiDProblem,
|
||||
ck_tile::GemmMultiDHostArgs<DsDataType::size()>>
|
||||
{
|
||||
public:
|
||||
public:
|
||||
using BaseGemm = GemmProfiler<GemmMultiDProfiler,
|
||||
GemmMultiDProblem,
|
||||
ck_tile::GemmMultiDHostArgs<DsDataType::size()>>;
|
||||
GemmMultiDProblem,
|
||||
ck_tile::GemmMultiDHostArgs<DsDataType::size()>>;
|
||||
using BaseGemm::benchmark;
|
||||
|
||||
GemmMultiDProfiler(Setting setting)
|
||||
: GemmProfiler<GemmMultiDProfiler, GemmMultiDProblem, ck_tile::GemmMultiDHostArgs<DsDataType::size()>>(setting) {}
|
||||
|
||||
: GemmProfiler<GemmMultiDProfiler,
|
||||
GemmMultiDProblem,
|
||||
ck_tile::GemmMultiDHostArgs<DsDataType::size()>>(setting)
|
||||
{
|
||||
}
|
||||
|
||||
void benchmark(
|
||||
GemmMultiDProblem& gemm_multi_d_problem,
|
||||
@@ -157,5 +160,4 @@ public:
|
||||
kernel_run_result);
|
||||
}
|
||||
}
|
||||
|
||||
};
|
||||
|
||||
@@ -8,6 +8,23 @@
|
||||
#include "gemm_preshuffle_common.hpp"
|
||||
#include "gemm/gemm_benchmark.hpp"
|
||||
|
||||
struct KernelConfig
|
||||
{
|
||||
ck_tile::index_t M_Tile;
|
||||
ck_tile::index_t N_Tile;
|
||||
ck_tile::index_t K_Tile;
|
||||
|
||||
ck_tile::index_t M_Warp;
|
||||
ck_tile::index_t N_Warp;
|
||||
ck_tile::index_t K_Warp;
|
||||
|
||||
ck_tile::index_t M_Warp_Tile;
|
||||
ck_tile::index_t N_Warp_Tile;
|
||||
ck_tile::index_t K_Warp_Tile;
|
||||
|
||||
bool permuteN;
|
||||
};
|
||||
|
||||
/// @brief Function to get the kernel output with reference implementation on CPU/GPU
|
||||
void gemm_host_reference(int verify,
|
||||
ck_tile::HostTensor<ADataType>& a_m_k,
|
||||
|
||||
@@ -3,14 +3,9 @@
|
||||
|
||||
import os
|
||||
import sys
|
||||
import json
|
||||
import subprocess
|
||||
import argparse
|
||||
import csv
|
||||
import time
|
||||
import importlib.util
|
||||
from pathlib import Path
|
||||
from typing import List, Dict, Tuple, Optional
|
||||
|
||||
|
||||
def _import_gemm_benchmark():
|
||||
@@ -28,6 +23,7 @@ def _import_gemm_benchmark():
|
||||
|
||||
return gemm_benchmark_module.GemmBenchmark
|
||||
|
||||
|
||||
def _import_benchmark_utils():
|
||||
"""Import benchmark utilities from commons directory."""
|
||||
current_dir = os.path.dirname(os.path.abspath(__file__))
|
||||
@@ -43,13 +39,16 @@ def _import_benchmark_utils():
|
||||
|
||||
return benchmark_utils
|
||||
|
||||
|
||||
GemmBenchmark = _import_gemm_benchmark()
|
||||
benchmark_utils = _import_benchmark_utils()
|
||||
|
||||
|
||||
class GemmPreshuffleBenchmark(GemmBenchmark):
|
||||
def __init__(self, build_dir: str, verbose: bool = False):
|
||||
super().__init__(build_dir, verbose, name="benchmark_gemm_preshuffle_")
|
||||
|
||||
|
||||
def main():
|
||||
parser = argparse.ArgumentParser(
|
||||
description="GEMM Preshuffle Kernel Benchmarking Tool"
|
||||
|
||||
@@ -11,78 +11,21 @@
|
||||
|
||||
#include "ck_tile/core.hpp"
|
||||
#include "ck_tile/host.hpp"
|
||||
#include "gemm/gemm_common.hpp"
|
||||
#include "gemm_preshuffle_profiler.hpp"
|
||||
#include "gemm_preshuffle_common.hpp"
|
||||
|
||||
// The kernel header is included via the compile command line with -include flag
|
||||
// It defines SelectedKernel struct and KERNEL_NAME
|
||||
// DataTypeTraits are now defined in gemm_common.hpp
|
||||
|
||||
// Create argument parser
|
||||
inline auto create_args(int argc, char* argv[])
|
||||
{
|
||||
ck_tile::ArgParser arg_parser;
|
||||
arg_parser.insert("m", "3840", "The value for m dimension. Default is 3840.")
|
||||
.insert("n", "4096", "The value for n dimension. Default is 4096.")
|
||||
.insert("k", "2048", "The value for k dimension. Default is 2048.")
|
||||
.insert("stride_a", "0", "The stride value for tensor A. Default is 0.")
|
||||
.insert("stride_b", "0", "The stride value for tensor B. Default is 0.")
|
||||
.insert("stride_c", "0", "The stride value for tensor C. Default is 0.")
|
||||
.insert("split_k", "1", "The split value for k dimension. Default is 1.")
|
||||
.insert("verify",
|
||||
"2",
|
||||
"The type of validation. Set to 0 for no validation, 1 for validation on CPU, or 2 "
|
||||
"for validation on GPU. Default is 0, no validation.")
|
||||
.insert("log",
|
||||
"false",
|
||||
"Whether output kernel instance information or not. Possible values are true or "
|
||||
"false. Default is false")
|
||||
.insert(
|
||||
"warmup", "50", "The number of iterations before benchmark the kernel. Default is 50.")
|
||||
.insert(
|
||||
"repeat", "100", "The number of iterations to benchmark the kernel. Default is 100.")
|
||||
.insert("timer",
|
||||
"true",
|
||||
"Whether if the timer is gpu timer or not. Possible values are false or true. "
|
||||
"Default is true.")
|
||||
.insert("init",
|
||||
"0",
|
||||
"The method of tensor initialization. Set to 0 for random, to 1 for linear, or 2 "
|
||||
"for constant(1). Default is 0, random.")
|
||||
.insert("flush_cache",
|
||||
"true",
|
||||
"To flush cache, possible values are true or false. "
|
||||
"Default is false.")
|
||||
.insert("rotating_count", "1000", "number of iterations to rotate the cache. default is 5.")
|
||||
.insert("metric",
|
||||
"0",
|
||||
"Metric with which to measure kernel performance. Set to 0 for latency, 1 for "
|
||||
"tflops, or 2 for bandwidth. Default is 0, latency.")
|
||||
.insert("csv_filename",
|
||||
"",
|
||||
"The filename of benchmark result. Default is empty (no CSV output).")
|
||||
.insert("structured_sparsity",
|
||||
"false",
|
||||
"Whether use sparsity kernel or not. Possible values are true or false. Default is "
|
||||
"false")
|
||||
.insert("json_output",
|
||||
"false",
|
||||
"Whether to output results in JSON format only. Possible values are true or false. "
|
||||
"Default is "
|
||||
"false");
|
||||
|
||||
bool result = arg_parser.parse(argc, argv);
|
||||
return std::make_tuple(result, arg_parser);
|
||||
}
|
||||
|
||||
void benchmark_single(const ck_tile::ArgParser& arg_parser)
|
||||
{
|
||||
// Use DataTypeTraits to get the actual type names from the generated header
|
||||
// The generated header defines ADataType, BDataType, AccDataType, CDataType
|
||||
std::string dtype_a = DataTypeTraits<ADataType>::name;
|
||||
std::string dtype_b = DataTypeTraits<BDataType>::name;
|
||||
std::string dtype_acc = DataTypeTraits<AccDataType>::name;
|
||||
std::string dtype_c = DataTypeTraits<CDataType>::name;
|
||||
std::string dtype_a = ck_tile::DataTypeTraits<ADataType>::name;
|
||||
std::string dtype_b = ck_tile::DataTypeTraits<BDataType>::name;
|
||||
std::string dtype_acc = ck_tile::DataTypeTraits<AccDataType>::name;
|
||||
std::string dtype_c = ck_tile::DataTypeTraits<CDataType>::name;
|
||||
|
||||
// Layout names from the layout types
|
||||
std::string layout_a = ALayout::name;
|
||||
@@ -119,29 +62,17 @@ void benchmark_single(const ck_tile::ArgParser& arg_parser)
|
||||
arg_parser.get_bool("json_output")};
|
||||
|
||||
// Get the profiler instance
|
||||
auto& profiler = GemmProfiler::instance(setting);
|
||||
auto& profiler = GemmPreshuffleProfiler::instance(setting);
|
||||
|
||||
try
|
||||
{
|
||||
// Create a lambda that wraps the kernel launch
|
||||
std::tuple<int, int, int> warp_tile_dims = std::make_tuple(
|
||||
SelectedKernel::WarpTileM, SelectedKernel::WarpTileN, SelectedKernel::WarpTileK);
|
||||
std::tuple<int, int, int> tile_dims =
|
||||
std::make_tuple(SelectedKernel::TileM, SelectedKernel::TileN, SelectedKernel::TileK);
|
||||
std::tuple<int, int, int> warp_dims = std::make_tuple(SelectedKernel::WarpPerBlock_M,
|
||||
SelectedKernel::WarpPerBlock_N,
|
||||
SelectedKernel::WarpPerBlock_K);
|
||||
bool permuteN = SelectedKernel::PermuteN;
|
||||
|
||||
KernelConfig config{tile_dims, warp_dims, warp_tile_dims, permuteN};
|
||||
|
||||
auto kernel_func = [](const ck_tile::GemmHostArgs& args,
|
||||
const ck_tile::stream_config& stream) {
|
||||
return SelectedKernel::launch(args, stream);
|
||||
};
|
||||
|
||||
// Benchmark the kernel
|
||||
profiler.benchmark(gemm_problem, kernel_func, config);
|
||||
profiler.benchmark(gemm_problem, kernel_func);
|
||||
|
||||
// Select best instance based on metric
|
||||
profiler.select_best_instance(static_cast<Metric>(arg_parser.get_int("metric")));
|
||||
|
||||
@@ -8,35 +8,20 @@
|
||||
#include "ck_tile/host.hpp"
|
||||
#include "ck_tile/core/numeric/integer.hpp"
|
||||
#include "ck_tile/core/numeric/pk_int4.hpp"
|
||||
#include "gemm/gemm_common.hpp"
|
||||
|
||||
// Structure to hold kernel traits for dispatcher
|
||||
struct KernelTraits
|
||||
struct PreshuffleKernelTraits : KernelTraits
|
||||
{
|
||||
std::string pipeline; // preshufflev2
|
||||
std::string scheduler; // intrawave, interwave, default
|
||||
std::string epilogue; // cshuffle, default
|
||||
bool pad_m;
|
||||
bool pad_n;
|
||||
bool pad_k;
|
||||
bool persistent;
|
||||
|
||||
// Constructor with defaults
|
||||
KernelTraits()
|
||||
: pipeline("preshufflev2"),
|
||||
scheduler("default"),
|
||||
epilogue("default"),
|
||||
pad_m(false),
|
||||
pad_n(false),
|
||||
pad_k(false),
|
||||
persistent(false)
|
||||
{
|
||||
}
|
||||
PreshuffleKernelTraits() : KernelTraits() { this->pipeline = "preshufflev2"; }
|
||||
};
|
||||
|
||||
// Helper to extract traits from kernel name
|
||||
inline KernelTraits extract_traits_from_name(const std::string& kernel_name)
|
||||
inline PreshuffleKernelTraits extract_traits_from_name(const std::string& kernel_name)
|
||||
{
|
||||
KernelTraits traits;
|
||||
PreshuffleKernelTraits traits;
|
||||
|
||||
// Extract pipeline
|
||||
if(kernel_name.find("preshufflev2") != std::string::npos)
|
||||
@@ -74,42 +59,3 @@ inline KernelTraits extract_traits_from_name(const std::string& kernel_name)
|
||||
|
||||
return traits;
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
auto shuffle_b(const ck_tile::HostTensor<T>& t,
|
||||
ck_tile::index_t N_Warp_Tile,
|
||||
ck_tile::index_t K_Warp_Tile)
|
||||
{
|
||||
assert(t.get_lengths().size() == 2);
|
||||
int n_ = t.get_lengths()[1];
|
||||
int k_ = t.get_lengths()[0];
|
||||
int divisor = N_Warp_Tile == 32 ? 2 : 4;
|
||||
ck_tile::HostTensor<T> t_view(
|
||||
{n_ / N_Warp_Tile, N_Warp_Tile, k_ / K_Warp_Tile, divisor, K_Warp_Tile / divisor});
|
||||
std::copy(t.begin(), t.end(), t_view.begin());
|
||||
return ck_tile::reference_permute(t_view, {0, 2, 3, 1, 4});
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
auto shuffle_b_permuteN(const ck_tile::HostTensor<T>& t,
|
||||
ck_tile::index_t N_Warp_Tile,
|
||||
ck_tile::index_t K_Warp_Tile,
|
||||
ck_tile::index_t N_Tile,
|
||||
ck_tile::index_t N_Warp)
|
||||
{
|
||||
assert(t.get_lengths().size() == 2);
|
||||
|
||||
int n_ = t.get_lengths()[1];
|
||||
int k_ = t.get_lengths()[0];
|
||||
int divisor = N_Warp_Tile == 32 ? 2 : 4;
|
||||
int NRepeat = N_Tile / N_Warp_Tile / N_Warp;
|
||||
ck_tile::HostTensor<T> t_view({n_ / N_Tile,
|
||||
N_Warp,
|
||||
N_Warp_Tile,
|
||||
NRepeat,
|
||||
k_ / K_Warp_Tile,
|
||||
divisor,
|
||||
K_Warp_Tile / divisor});
|
||||
std::copy(t.begin(), t.end(), t_view.begin());
|
||||
return ck_tile::reference_permute(t_view, {0, 3, 1, 4, 5, 2, 6});
|
||||
}
|
||||
|
||||
@@ -4,28 +4,26 @@
|
||||
#pragma once
|
||||
|
||||
#include "ck_tile/host/device_prop.hpp"
|
||||
#include "ck_tile/host/tensor_shuffle_utils.hpp"
|
||||
#include "ck_tile/ops/gemm.hpp"
|
||||
#include "gemm/gemm_profiler.hpp"
|
||||
#include "gemm_preshuffle_benchmark.hpp"
|
||||
|
||||
class GemmPreshuffleProfiler : public GemmProfiler<GemmPreshuffleProfiler,
|
||||
GemmProblem,
|
||||
ck_tile::GemmHostArgs>
|
||||
class GemmPreshuffleProfiler
|
||||
: public GemmProfiler<GemmPreshuffleProfiler, GemmProblem, ck_tile::GemmHostArgs>
|
||||
{
|
||||
public:
|
||||
using BaseGemm = GemmProfiler<GemmPreshuffleProfiler,
|
||||
GemmProblem,
|
||||
ck_tile::GemmHostArgs>;
|
||||
public:
|
||||
using BaseGemm = GemmProfiler<GemmPreshuffleProfiler, GemmProblem, ck_tile::GemmHostArgs>;
|
||||
using BaseGemm::benchmark;
|
||||
|
||||
GemmPreshuffleProfiler(Setting setting)
|
||||
: GemmProfiler<GemmPreshuffleProfiler, GemmProblem, ck_tile::GemmHostArgs>(setting) {}
|
||||
|
||||
: GemmProfiler<GemmPreshuffleProfiler, GemmProblem, ck_tile::GemmHostArgs>(setting)
|
||||
{
|
||||
}
|
||||
|
||||
void benchmark(GemmProblem& gemm_problem,
|
||||
std::vector<std::function<std::tuple<std::string, float>(
|
||||
ck_tile::GemmHostArgs&, const ck_tile::stream_config&)>>& callables,
|
||||
KernelConfig& config) override
|
||||
ck_tile::GemmHostArgs&, const ck_tile::stream_config&)>>& callables) override
|
||||
{
|
||||
const ALayout layout_a = ALayout{};
|
||||
const BLayout layout_b = BLayout{};
|
||||
@@ -97,21 +95,28 @@ public:
|
||||
c_m_n_dev_buf.SetZero();
|
||||
c_m_n_dev_result.SetZero();
|
||||
|
||||
// Create a lambda that wraps the kernel launch
|
||||
KernelConfig config{SelectedKernel::WarpTileM,
|
||||
SelectedKernel::WarpTileN,
|
||||
SelectedKernel::WarpTileK,
|
||||
SelectedKernel::TileM,
|
||||
SelectedKernel::TileN,
|
||||
SelectedKernel::TileK,
|
||||
SelectedKernel::WarpPerBlock_M,
|
||||
SelectedKernel::WarpPerBlock_N,
|
||||
SelectedKernel::WarpPerBlock_K,
|
||||
SelectedKernel::PermuteN};
|
||||
|
||||
for(const auto& callable : callables)
|
||||
{
|
||||
ck_tile::index_t N_Warp_Tile = std::get<1>(config.warp_tile_dims);
|
||||
ck_tile::index_t K_Warp_Tile = std::get<2>(config.warp_tile_dims);
|
||||
ck_tile::index_t N_Tile = std::get<1>(config.tile_dims);
|
||||
ck_tile::index_t N_Warp = std::get<1>(config.warp_dims);
|
||||
|
||||
ck_tile::HostTensor<BDataType> b_shuffle_host = [&]() {
|
||||
if(config.permuteN)
|
||||
{
|
||||
return shuffle_b_permuteN(b_k_n, N_Warp_Tile, K_Warp_Tile, N_Tile, N_Warp);
|
||||
return ck_tile::shuffle_b_permuteN<KernelConfig>(b_k_n, config);
|
||||
}
|
||||
else
|
||||
{
|
||||
return shuffle_b(b_k_n, N_Warp_Tile, K_Warp_Tile);
|
||||
return ck_tile::shuffle_b<KernelConfig>(b_k_n, config);
|
||||
}
|
||||
}();
|
||||
|
||||
@@ -144,5 +149,4 @@ public:
|
||||
gemm_problem, c_m_n_dev_buf, c_m_n_ref, c_m_n_dev_result, kernel_run_result);
|
||||
}
|
||||
}
|
||||
|
||||
};
|
||||
|
||||
@@ -11,17 +11,14 @@
|
||||
#include <functional>
|
||||
#include <tuple>
|
||||
|
||||
|
||||
#include "ck_tile/host/device_prop.hpp"
|
||||
#include "ck_tile/ops/gemm.hpp"
|
||||
#include "gemm_benchmark.hpp"
|
||||
|
||||
template <typename Gemm,
|
||||
typename Problem,
|
||||
typename GemmArgs>
|
||||
template <typename Gemm, typename Problem, typename GemmArgs>
|
||||
class GemmProfiler
|
||||
{
|
||||
public:
|
||||
public:
|
||||
static Gemm& instance(Setting setting)
|
||||
{
|
||||
static Gemm instance{setting};
|
||||
@@ -30,27 +27,25 @@ public:
|
||||
|
||||
// Overload for single kernel benchmarking
|
||||
void benchmark(Problem& gemm_problem,
|
||||
std::function<float(const GemmArgs&, const ck_tile::stream_config&)>
|
||||
kernel_func)
|
||||
std::function<float(const GemmArgs&, const ck_tile::stream_config&)> kernel_func)
|
||||
{
|
||||
// Create a vector with a single callable that returns both name and time
|
||||
std::vector<std::function<std::tuple<std::string, float>(GemmArgs&,
|
||||
const ck_tile::stream_config&)>>
|
||||
std::vector<
|
||||
std::function<std::tuple<std::string, float>(GemmArgs&, const ck_tile::stream_config&)>>
|
||||
callables;
|
||||
|
||||
callables.push_back(
|
||||
[kernel_func](GemmArgs& args, const ck_tile::stream_config& stream) {
|
||||
float time = kernel_func(args, stream);
|
||||
return std::make_tuple(std::string(KERNEL_NAME), time);
|
||||
});
|
||||
callables.push_back([kernel_func](GemmArgs& args, const ck_tile::stream_config& stream) {
|
||||
float time = kernel_func(args, stream);
|
||||
return std::make_tuple(std::string(KERNEL_NAME), time);
|
||||
});
|
||||
|
||||
benchmark(gemm_problem, callables); // TODO: need to cast this?
|
||||
benchmark(gemm_problem, callables);
|
||||
}
|
||||
|
||||
virtual void benchmark(Problem& gemm_problem,
|
||||
std::vector<std::function<std::tuple<std::string, float>(
|
||||
GemmArgs&, const ck_tile::stream_config&)>>& callables) = 0;
|
||||
|
||||
std::vector<std::function<std::tuple<std::string, float>(
|
||||
GemmArgs&, const ck_tile::stream_config&)>>& callables) = 0;
|
||||
|
||||
void process_result(const Problem& gemm_problem,
|
||||
ck_tile::DeviceMem& c_m_n_dev_buf,
|
||||
ck_tile::HostTensor<CDataType>& c_m_n_host_result,
|
||||
@@ -58,7 +53,7 @@ public:
|
||||
const std::tuple<std::string, float>& kernel_run_result)
|
||||
{
|
||||
auto [name, avg_time] = kernel_run_result;
|
||||
using DDataType = typename get_DsDataType<Problem>::type;
|
||||
using DDataType = typename get_DsDataType<Problem>::type;
|
||||
|
||||
KernelInstance<Problem> kernel_instance{name, gemm_problem, {-1.0f, -1.0f, -1.0f}};
|
||||
|
||||
@@ -68,16 +63,14 @@ public:
|
||||
sizeof(BDataType) * gemm_problem.n_ * gemm_problem.k_ +
|
||||
sizeof(CDataType) * gemm_problem.m_ * gemm_problem.n_;
|
||||
|
||||
|
||||
if constexpr (!std::is_void_v<DDataType>)
|
||||
{
|
||||
ck_tile::static_for<0, DDataType::size(), 1>{}([&](auto i) {
|
||||
using DType = ck_tile::remove_cvref_t<std::tuple_element_t<i, DDataType>>;
|
||||
num_byte += sizeof(DType) * gemm_problem.m_ * gemm_problem.n_;
|
||||
flop += sizeof(DType) * gemm_problem.m_ * gemm_problem.n_;
|
||||
});
|
||||
}
|
||||
|
||||
if constexpr(!std::is_void_v<DDataType>)
|
||||
{
|
||||
ck_tile::static_for<0, DDataType::size(), 1>{}([&](auto i) {
|
||||
using DType = ck_tile::remove_cvref_t<std::tuple_element_t<i, DDataType>>;
|
||||
num_byte += sizeof(DType) * gemm_problem.m_ * gemm_problem.n_;
|
||||
flop += sizeof(DType) * gemm_problem.m_ * gemm_problem.n_;
|
||||
});
|
||||
}
|
||||
|
||||
// update
|
||||
kernel_instance.perf_result_.latency_ = avg_time;
|
||||
@@ -91,15 +84,14 @@ public:
|
||||
|
||||
// verify result
|
||||
c_m_n_dev_buf.FromDevice(c_m_n_dev_result.data());
|
||||
int split_k = 1;
|
||||
if constexpr (std::is_same_v<Problem, GemmProblem>)
|
||||
{
|
||||
split_k = gemm_problem.split_k_;
|
||||
}
|
||||
int split_k = 1;
|
||||
if constexpr(std::is_same_v<Problem, GemmProblem>)
|
||||
{
|
||||
split_k = gemm_problem.split_k_;
|
||||
}
|
||||
bool verified_correct =
|
||||
!setting_.verify_ ||
|
||||
compare<Problem>(
|
||||
name, gemm_problem.k_, split_k, c_m_n_dev_result, c_m_n_host_result);
|
||||
compare<Problem>(name, gemm_problem.k_, split_k, c_m_n_dev_result, c_m_n_host_result);
|
||||
|
||||
if(verified_correct)
|
||||
{
|
||||
@@ -196,5 +188,3 @@ public:
|
||||
|
||||
std::vector<KernelInstance<Problem>> kernel_instances_;
|
||||
};
|
||||
|
||||
|
||||
|
||||
@@ -4,14 +4,10 @@
|
||||
|
||||
import os
|
||||
import sys
|
||||
import json
|
||||
import subprocess
|
||||
import argparse
|
||||
import csv
|
||||
import time
|
||||
import importlib.util
|
||||
from pathlib import Path
|
||||
from typing import List, Dict, Tuple, Optional
|
||||
|
||||
|
||||
def _import_gemm_benchmark():
|
||||
"""Import validation utilities from commons directory."""
|
||||
@@ -28,6 +24,7 @@ def _import_gemm_benchmark():
|
||||
|
||||
return gemm_benchmark_module.GemmBenchmark
|
||||
|
||||
|
||||
def _import_benchmark_utils():
|
||||
"""Import benchmark utilities from commons directory."""
|
||||
current_dir = os.path.dirname(os.path.abspath(__file__))
|
||||
@@ -43,16 +40,20 @@ def _import_benchmark_utils():
|
||||
|
||||
return benchmark_utils
|
||||
|
||||
|
||||
GemmBenchmark = _import_gemm_benchmark()
|
||||
benchmark_utils = _import_benchmark_utils()
|
||||
|
||||
|
||||
class GemmUniversalBenchmark(GemmBenchmark):
|
||||
def __init__(self, build_dir: str, verbose: bool = False):
|
||||
super().__init__(build_dir, verbose, name="benchmark_gemm_")
|
||||
super().__init__(build_dir, verbose, name="benchmark_gemm_universal_")
|
||||
|
||||
|
||||
def main():
|
||||
parser = argparse.ArgumentParser(description="GEMM Kernel Benchmarking Tool")
|
||||
parser = argparse.ArgumentParser(
|
||||
description="Universal GEMM Kernel Benchmarking Tool"
|
||||
)
|
||||
parser.add_argument(
|
||||
"build_dir", help="Build directory containing kernel executables"
|
||||
)
|
||||
@@ -67,7 +68,9 @@ def main():
|
||||
)
|
||||
parser.add_argument("--verify", action="store_true", help="Enable verification")
|
||||
parser.add_argument(
|
||||
"--csv", default="gemm_benchmark_results.csv", help="CSV output filename"
|
||||
"--csv",
|
||||
default="gemm_universal_benchmark_results.csv",
|
||||
help="CSV output filename",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--best", default="best_kernels.txt", help="Best kernels output filename"
|
||||
@@ -115,7 +118,7 @@ def main():
|
||||
benchmark = GemmUniversalBenchmark(args.build_dir, verbose=args.verbose)
|
||||
|
||||
# Run benchmark sweep
|
||||
print("Starting GEMM kernel benchmark sweep...")
|
||||
print("Starting Universal GEMM kernel benchmark sweep...")
|
||||
start_time = time.time()
|
||||
|
||||
best_kernels = benchmark.benchmark_sweep(
|
||||
|
||||
@@ -11,6 +11,7 @@
|
||||
|
||||
#include "ck_tile/core.hpp"
|
||||
#include "ck_tile/host.hpp"
|
||||
#include "gemm/gemm_common.hpp"
|
||||
#include "gemm_universal_profiler.hpp"
|
||||
|
||||
// The kernel header is included via the compile command line with -include flag
|
||||
|
||||
@@ -9,21 +9,21 @@
|
||||
|
||||
#include "ck_tile/host/device_prop.hpp"
|
||||
#include "ck_tile/ops/gemm.hpp"
|
||||
#include "gemm_universal_benchmark.hpp"
|
||||
#include "gemm/gemm_benchmark.hpp"
|
||||
#include "gemm/gemm_profiler.hpp"
|
||||
#include "gemm_universal_benchmark.hpp"
|
||||
|
||||
class UniversalGemmProfiler : public GemmProfiler<UniversalGemmProfiler,
|
||||
GemmProblem,
|
||||
ck_tile::GemmHostArgs>
|
||||
class UniversalGemmProfiler
|
||||
: public GemmProfiler<UniversalGemmProfiler, GemmProblem, ck_tile::GemmHostArgs>
|
||||
{
|
||||
public:
|
||||
using BaseGemm = GemmProfiler<UniversalGemmProfiler,
|
||||
GemmProblem,
|
||||
ck_tile::GemmHostArgs>;
|
||||
public:
|
||||
using BaseGemm = GemmProfiler<UniversalGemmProfiler, GemmProblem, ck_tile::GemmHostArgs>;
|
||||
using BaseGemm::benchmark;
|
||||
|
||||
UniversalGemmProfiler(Setting setting)
|
||||
: GemmProfiler<UniversalGemmProfiler, GemmProblem, ck_tile::GemmHostArgs>(setting) {}
|
||||
: GemmProfiler<UniversalGemmProfiler, GemmProblem, ck_tile::GemmHostArgs>(setting)
|
||||
{
|
||||
}
|
||||
|
||||
void benchmark(GemmProblem& gemm_problem,
|
||||
std::vector<std::function<std::tuple<std::string, float>(
|
||||
|
||||
Reference in New Issue
Block a user