[CK_TILE] Add pooling to ckTileEngine part3

This commit is contained in:
Aleksander Dudek
2025-12-09 11:59:37 +00:00
parent 990f13229f
commit 07c078d5ef
8 changed files with 2449 additions and 44 deletions

View File

@@ -0,0 +1,263 @@
# User-configurable options for the Tile Engine Pool benchmarks.
set(POOL_DATATYPE "fp16;fp32" CACHE STRING "List of datatypes for Pool (semicolon-separated)")
set(POOL_REDUCE_OP "max;avg" CACHE STRING "List of reduce operations for Pool (semicolon-separated)")
set(POOL_CONFIG_FILE "" CACHE STRING "Custom config file name (without path, must be in configs/ folder)")
option(ENABLE_CCACHE_POOL "Enable ccache for Pool ops compilation" OFF)
# Store this directory so the functions below can locate the generator script
# and benchmark sources regardless of where they are invoked from.
set(POOL_SOURCE_DIR ${CMAKE_CURRENT_LIST_DIR})
# Function to create individual Pool targets
# Create one benchmark executable for a single pool kernel configuration.
#
# Arguments:
#   datatype     - element type, e.g. fp16 / fp32
#   reduce_op    - reduction operation, e.g. max / avg
#   trait        - trait combo; its third "_"-separated field is the pool dim
#   block_config - "<block_m>x<block_n>_<warp_m>x<warp_n>_<tile_m>x<tile_n>",
#                  e.g. "128x1_1x1_2x1"
#   config_json  - path to the JSON kernel configuration file
#
# Fixes vs. previous version: quoted expansions in string(REPLACE) so empty or
# list-valued arguments fail loudly instead of silently, and VERBATIM on the
# custom command for platform-independent argument escaping.
function(create_individual_pool_target datatype reduce_op trait block_config config_json)
    # POOL_GPU_TARGETS_INDIVIDUAL is computed at the top level of this file.
    if(NOT POOL_GPU_TARGETS_INDIVIDUAL)
        message(WARNING "Skipping individual Pool target ${datatype}_${reduce_op}_${trait}_${block_config}: No supported GPU targets")
        return()
    endif()

    # Split block_config into its three "MxN" groups.
    string(REPLACE "_" ";" config_groups "${block_config}")
    list(GET config_groups 0 block_dims)       # e.g., 128x1
    list(GET config_groups 1 warp_dims)        # e.g., 1x1
    list(GET config_groups 2 thread_tile_dims) # e.g., 2x1

    # Parse block dimensions
    string(REPLACE "x" ";" block_parts "${block_dims}")
    list(GET block_parts 0 block_m)
    list(GET block_parts 1 block_n)
    # Parse warp dimensions
    string(REPLACE "x" ";" warp_parts "${warp_dims}")
    list(GET warp_parts 0 warp_m)
    list(GET warp_parts 1 warp_n)
    # Parse thread tile dimensions
    string(REPLACE "x" ";" thread_tile_parts "${thread_tile_dims}")
    list(GET thread_tile_parts 0 thread_tile_m)
    list(GET thread_tile_parts 1 thread_tile_n)

    # pool_dim is the third field of the trait combo.
    string(REPLACE "_" ";" trait_parts "${trait}")
    list(GET trait_parts 2 pool_dim)

    set(target_name "benchmark_pool${pool_dim}d_${datatype}_${reduce_op}_${trait}_${block_config}")
    set(working_path "${CMAKE_CURRENT_BINARY_DIR}/${datatype}/${reduce_op}")

    # Header generated at build time containing the single kernel instance.
    set(instance_header "${working_path}/pool_single_${pool_dim}d_${datatype}_${reduce_op}_${trait}_${block_config}.hpp")
    add_custom_command(
        OUTPUT ${instance_header}
        COMMAND ${Python3_EXECUTABLE} ${POOL_SOURCE_DIR}/pool_instance_builder.py
                --working_path ${working_path}
                --datatype ${datatype}
                --reduce_op ${reduce_op}
                --config_json ${config_json}
                --gen_single
                --kernel_name "pool${pool_dim}d_${datatype}_${reduce_op}_${trait}_${block_config}"
                --block_config "${block_config}"
                --trait_combo "${trait}"
                --gpu_target "${POOL_GPU_TARGETS_INDIVIDUAL}"
        DEPENDS ${POOL_SOURCE_DIR}/pool_instance_builder.py ${config_json}
        COMMENT "Generating ${instance_header}"
        VERBATIM
    )

    # Excluded from "all": benchmarks are built only via the collection targets.
    add_executable(${target_name}
        EXCLUDE_FROM_ALL
        ${POOL_SOURCE_DIR}/pool_benchmark_single.cpp
        ${instance_header}
    )
    set_property(TARGET ${target_name} PROPERTY HIP_ARCHITECTURES ${POOL_GPU_TARGETS_INDIVIDUAL})
    target_compile_definitions(${target_name} PRIVATE
        POOL_SINGLE_INSTANCE_HPP="${instance_header}"
    )
    target_include_directories(${target_name} PRIVATE
        ${POOL_SOURCE_DIR}
        ${working_path}
    )
    target_compile_options(${target_name} PRIVATE
        -Wno-undefined-func-template
        -Wno-float-equal
        --offload-compress
        -include ${instance_header}
    )

    # Hook the executable into the collection targets created at the top level.
    add_dependencies(benchmark_pool_all ${target_name})
    add_dependencies(benchmark_pool_${datatype} ${target_name})
    add_dependencies(benchmark_pool_${reduce_op} ${target_name})
    add_dependencies(benchmark_pool_${datatype}_${reduce_op} ${target_name})
    add_dependencies(benchmark_pool${pool_dim}d ${target_name})
endfunction()
# Function to build individual Pool targets
# Enumerate the kernel configurations for one (datatype, reduce_op) pair and
# create an individual benchmark target for every listed kernel.
#
# Config file priority: POOL_CONFIG_FILE environment variable, then the
# POOL_CONFIG_FILE cache variable, then configs/default_config.json.
#
# Fixes vs. previous version: removed the no-op math(EXPR) copy of the core
# count, and quoted all file paths in EXISTS/file() so paths containing
# spaces or semicolons behave correctly.
function(build_individual_pool_targets datatype reduce_op)
    set(working_path "${CMAKE_CURRENT_BINARY_DIR}/${datatype}/${reduce_op}")

    # Choose config file (env var > cache var > default).
    if(DEFINED ENV{POOL_CONFIG_FILE} AND NOT "$ENV{POOL_CONFIG_FILE}" STREQUAL "")
        set(config_filename "$ENV{POOL_CONFIG_FILE}")
        set(json_blob "${CMAKE_CURRENT_LIST_DIR}/configs/${config_filename}")
        message(VERBOSE " Using config from environment variable: ${config_filename}")
    elseif(NOT "${POOL_CONFIG_FILE}" STREQUAL "")
        set(json_blob "${CMAKE_CURRENT_LIST_DIR}/configs/${POOL_CONFIG_FILE}")
        message(VERBOSE " Using custom config: ${POOL_CONFIG_FILE}")
    else()
        set(json_blob "${CMAKE_CURRENT_LIST_DIR}/configs/default_config.json")
        message(VERBOSE " Using default config")
    endif()

    if(NOT EXISTS "${json_blob}")
        message(FATAL_ERROR "Config file not found: ${json_blob}")
    endif()

    # Determine number of workers (capped at 8 when derived from core count).
    if(DEFINED ENV{CMAKE_BUILD_PARALLEL_LEVEL})
        set(num_workers $ENV{CMAKE_BUILD_PARALLEL_LEVEL})
    else()
        cmake_host_system_information(RESULT num_cores QUERY NUMBER_OF_LOGICAL_CORES)
        set(num_workers ${num_cores})
        if(num_workers GREATER 8)
            set(num_workers 8)
        endif()
    endif()

    message(VERBOSE "Generating individual kernels for ${datatype} ${reduce_op} using ${num_workers} workers...")
    message(VERBOSE " Working path: ${working_path}")
    message(VERBOSE " Config file: ${json_blob}")

    # Create working directory first.
    file(MAKE_DIRECTORY "${working_path}")

    # List the kernel configurations (fast, configure-time operation).
    message(VERBOSE " Listing kernel configurations...")
    execute_process(
        COMMAND ${Python3_EXECUTABLE} -u ${CMAKE_CURRENT_LIST_DIR}/pool_instance_builder.py
                --working_path ${working_path}
                --datatype ${datatype}
                --reduce_op ${reduce_op}
                --config_json ${json_blob}
                --gpu_target ${POOL_GPU_TARGETS_INDIVIDUAL}
                --list_kernels
        WORKING_DIRECTORY ${CMAKE_CURRENT_LIST_DIR}
        RESULT_VARIABLE ret
        OUTPUT_VARIABLE list_output
        ERROR_VARIABLE list_error
    )
    if(NOT ret EQUAL 0)
        message(FATAL_ERROR "Failed to list kernels for ${datatype} ${reduce_op}: ${list_error}")
    endif()

    # Read kernel count (diagnostics only; its absence means generation failed).
    if(EXISTS "${working_path}/pool_kernel_count.txt")
        file(READ "${working_path}/pool_kernel_count.txt" kernel_count)
        string(STRIP "${kernel_count}" kernel_count)
        message(VERBOSE " Found ${kernel_count} kernel configurations")
    else()
        message(FATAL_ERROR "Kernel count file not found")
    endif()

    # Create one target per listed kernel.
    # Each line has the form: kernel_name|block_config|trait_combo
    if(EXISTS "${working_path}/pool_kernel_list.txt")
        file(STRINGS "${working_path}/pool_kernel_list.txt" kernel_lines)
        foreach(line IN LISTS kernel_lines)
            string(REPLACE "|" ";" parts "${line}")
            list(GET parts 0 kernel_name)
            list(GET parts 1 block_config)
            list(GET parts 2 trait_combo)
            create_individual_pool_target("${datatype}" "${reduce_op}" "${trait_combo}" "${block_config}" "${json_blob}")
        endforeach()
    else()
        message(FATAL_ERROR "Kernel list file not found")
    endif()
endfunction()
# ---- Main build logic ----
# Filters SUPPORTED_GPU_TARGETS down to the pool-supported architectures,
# then creates the collection targets and one benchmark executable per
# kernel configuration via the functions above.
message(VERBOSE "=== Starting Tile Engine Pool Configuration ===")
message(VERBOSE "POOL_DATATYPE: ${POOL_DATATYPE}")
message(VERBOSE "POOL_REDUCE_OP: ${POOL_REDUCE_OP}")
message(VERBOSE "SUPPORTED_GPU_TARGETS: ${SUPPORTED_GPU_TARGETS}")
# Filter GPU targets: keep only architectures the pool kernels support.
set(POOL_GPU_TARGETS_INDIVIDUAL "")
set(DESIRED_TARGETS "gfx90a;gfx942;gfx950;gfx1201")
foreach(target IN LISTS SUPPORTED_GPU_TARGETS)
    if(target IN_LIST DESIRED_TARGETS)
        list(APPEND POOL_GPU_TARGETS_INDIVIDUAL ${target})
        message(VERBOSE " Adding GPU target: ${target}")
    endif()
endforeach()
# Skip build if no matching targets found.
if(NOT POOL_GPU_TARGETS_INDIVIDUAL)
    message(WARNING "Skipping Tile Engine Pool build: No supported GPU targets (gfx90a, gfx942, gfx950, gfx1201) found in SUPPORTED_GPU_TARGETS: ${SUPPORTED_GPU_TARGETS}")
else()
    message(VERBOSE "Building individual Pool targets for GPU targets: ${POOL_GPU_TARGETS_INDIVIDUAL}")
    # Set up job pools.
    # NOTE(review): the pools are declared here but no target in this file
    # visibly assigns JOB_POOL_COMPILE - confirm they are consumed elsewhere.
    set_property(GLOBAL PROPERTY JOB_POOLS
        compile_heavy=4
        compile_normal=16
    )
    # Enable compiler cache if requested via -DENABLE_CCACHE_POOL=ON.
    if(ENABLE_CCACHE_POOL)
        find_program(CCACHE_PROGRAM ccache)
        if(CCACHE_PROGRAM)
            set(CMAKE_CXX_COMPILER_LAUNCHER ${CCACHE_PROGRAM})
            message(VERBOSE "Using ccache for faster compilation")
        else()
            message(WARNING "ccache requested but not found")
        endif()
    else()
        message(VERBOSE "ccache disabled for Pool ops (use -DENABLE_CCACHE_POOL=ON to enable)")
    endif()
    # Master collection target: builds every generated pool benchmark.
    add_custom_target(benchmark_pool_all)
    # Per-datatype collection targets (e.g. benchmark_pool_fp16).
    foreach(dt IN LISTS POOL_DATATYPE)
        add_custom_target(benchmark_pool_${dt})
    endforeach()
    # Per-reduce-op collection targets (e.g. benchmark_pool_max).
    foreach(op IN LISTS POOL_REDUCE_OP)
        add_custom_target(benchmark_pool_${op})
    endforeach()
    # Combined datatype/reduce-op collection targets (e.g. benchmark_pool_fp16_max).
    foreach(dt IN LISTS POOL_DATATYPE)
        foreach(op IN LISTS POOL_REDUCE_OP)
            add_custom_target(benchmark_pool_${dt}_${op})
        endforeach()
    endforeach()
    # Pool dimension collection targets.
    add_custom_target(benchmark_pool2d)
    add_custom_target(benchmark_pool3d)
    # Generate and register the individual benchmark targets; this must run
    # after all collection targets above exist, since each individual target
    # is attached to them via add_dependencies.
    foreach(dt IN LISTS POOL_DATATYPE)
        foreach(op IN LISTS POOL_REDUCE_OP)
            build_individual_pool_targets(${dt} ${op})
        endforeach()
    endforeach()
endif()

View File

@@ -0,0 +1,381 @@
# CK Tile Engine Pool Operations
## Overview
The CK Tile Engine Pool module provides a comprehensive system for generating, building, and benchmarking pooling kernels (2D and 3D) with various configurations. It supports multiple data types, reduce operations (max, min, average), and optimization strategies. The system follows the same architecture as the GEMM module with individual kernel compilation for better build parallelism and targeted testing capabilities.
## Table of Contents
1. [Build System Architecture](#build-system-architecture)
2. [Build Instructions](#build-instructions)
3. [Running Benchmarks](#running-benchmarks)
4. [Configuration System](#configuration-system)
5. [Scripts and Tools](#scripts-and-tools)
6. [Command Line Options](#command-line-options)
7. [Understanding Kernel Names](#understanding-kernel-names)
8. [Troubleshooting](#troubleshooting)
9. [Performance Tips](#performance-tips)
## Build System Architecture
### Individual Kernel Compilation
The tile engine benchmark system compiles each kernel configuration into a separate executable. This provides:
- Better build parallelism
- Faster incremental builds
- More targeted testing
- Easier debugging of specific configurations
Each benchmark executable follows the naming pattern:
```
benchmark_pool<dim>d_<dtype>_<reduce_op>_<output_index>_<propagate_nan>_<block_config>
```
## Build Instructions
### Prerequisites
- ROCm installation
- CMake 3.16 or higher
- C++17 compatible compiler
- Python 3.6 or higher
### Basic Build
```bash
# In the root of composable kernel, create build directory
mkdir build && cd build
# Configure with specific datatypes and reduce operations
# Replace [Arch] with your GPU architecture (e.g., gfx90a, gfx942)
../script/cmake-ck-dev.sh ../ [Arch] -DPOOL_DATATYPE="fp16;fp32" -DPOOL_REDUCE_OP="max;avg"
# Build specific benchmarks
make benchmark_pool_fp16_max -j
```
### Configuration Options
The build system supports several configuration options:
#### Using Custom Config Files
```bash
# Method 1: CMake variable (config file must be in configs/ directory)
cmake -DPOOL_CONFIG_FILE=my_custom_config.json ...
# Method 2: Environment variable (takes precedence over CMake variable)
export POOL_CONFIG_FILE=my_custom_config.json
cmake ...
```
#### Config File Priority Order
1. **Environment variable** `POOL_CONFIG_FILE` (highest priority)
2. **CMake variable** `POOL_CONFIG_FILE`
3. **Default config** (default_config.json)
**Note**: All custom config files must be placed in the `tile_engine/ops/pooling/configs/` directory.
### Example Build Commands
```bash
# Build for gfx942 with fp16 datatype, max reduce operation
mkdir build && cd build
../script/cmake-ck-dev.sh ../ gfx942 -DPOOL_DATATYPE="fp16;fp32" -DPOOL_REDUCE_OP="max;avg"
make benchmark_pool_fp16_max -j
make benchmark_pool_fp32_avg -j
```
### Building Individual Kernels
```bash
# Build a specific kernel configuration
make benchmark_pool3d_fp16_max_True_False_128x1_1x1_2x1
# Build all fp16 max pooling benchmarks
make benchmark_pool_fp16_max -j$(nproc)
# Build all 3D pooling benchmarks
make benchmark_pool3d -j$(nproc)
```
### Rebuilding After Configuration Changes
If you modify the configuration file, you must remove the previously generated files and rebuild (run from the build directory):
```bash
rm -rf tile_engine/ && make benchmark_pool_[Datatype]_[ReduceOp] -j
```
## Running Benchmarks
### Individual Kernel Execution
```bash
cd /path/to/build/directory
./bin/benchmark_pool3d_fp16_max_True_False_128x1_1x1_2x1 \
-N=2 -D=30 -H=30 -W=30 -C=32 \
-Z=2 -Y=2 -X=2 \
-Sz=2 -Sy=2 -Sx=2 \
-verify=1
```
### Using the Benchmark Python Script
```bash
# Run benchmark sweep
python pool_benchmark.py /path/to/build \
--problem-sizes "2,30,30,30,32" "4,64,64,64,64" \
--window-sizes "2,2,2" "3,3,3" \
--stride-sizes "2,2,2" \
--pool-dim 3 \
--verify \
--json results.json
```
## Configuration System
### Configuration Files
The system uses JSON configuration files to specify kernel parameters:
- `configs/default_config.json` - Default configurations
### Configuration Structure
```json
{
"block_config": {
"block_m": {"values": [64, 128, 256]},
"block_n": {"values": [1]},
"warp_m": {"values": [1, 2]},
"warp_n": {"values": [1]},
"thread_tile_m": {"values": [1, 2, 4]},
"thread_tile_n": {"values": [1]}
},
"trait_config": {
"output_index": {"values": [true, false]},
"propagate_nan": {"values": [false]},
"pool_dim": {"values": [2, 3]}
},
"k_block_per_cu": 1
}
```
### Configuration Parameters
- **block_m/block_n**: Block tile dimensions for output
- **warp_m/warp_n**: Number of warps per block
- **thread_tile_m/thread_tile_n**: Thread tile sizes
- **output_index**: Whether to output indices (for max/min pooling)
- **propagate_nan**: Whether to propagate NaN values
- **pool_dim**: Pooling dimension (2 for 2D, 3 for 3D)
## Scripts and Tools
### Python Scripts
#### pool_instance_builder.py
**Purpose**: Main kernel instance generation script that creates C++ kernel implementations based on configuration files.
**Key Features**:
- Generates individual kernel header files for separate compilation
- Supports multiple data types (fp16, fp32, bf16)
- Validates block configurations for correctness
- Creates CMake integration files
**Usage**:
```bash
python pool_instance_builder.py \
--working_path ./generated \
--datatype fp16 \
--reduce_op max \
--config_json configs/default_config.json \
--gen_all_individual \
--gpu_target gfx942
```
#### pool_benchmark.py
**Purpose**: Python script for running and analyzing pool benchmarks.
**Features**:
- Automated benchmark execution
- Performance data collection
- Result analysis and reporting
- CSV and JSON export
**Usage**:
```bash
python pool_benchmark.py /path/to/build \
--problem-sizes "2,30,30,30,32" \
--window-sizes "2,2,2" \
--verbose \
--json results.json
```
## Command Line Options
All benchmark executables support the following options:
### Tensor Dimensions
- `-N=<value>` - Batch size (default: 2)
- `-D=<value>` - Depth dimension for 3D pooling (default: 30)
- `-H=<value>` - Height dimension (default: 30)
- `-W=<value>` - Width dimension (default: 30)
- `-C=<value>` - Channel dimension (default: 32)
### Window Parameters
- `-Z=<value>` - Window depth (default: 2)
- `-Y=<value>` - Window height (default: 2)
- `-X=<value>` - Window width (default: 2)
### Stride Parameters
- `-Sz=<value>` - Stride depth (default: 2)
- `-Sy=<value>` - Stride height (default: 2)
- `-Sx=<value>` - Stride width (default: 2)
### Dilation Parameters
- `-Dz=<value>` - Dilation depth (default: 1)
- `-Dy=<value>` - Dilation height (default: 1)
- `-Dx=<value>` - Dilation width (default: 1)
### Padding Parameters
- `-LeftPz=<value>` - Left padding depth (default: 0)
- `-LeftPy=<value>` - Left padding height (default: 0)
- `-LeftPx=<value>` - Left padding width (default: 0)
- `-RightPz=<value>` - Right padding depth (default: 0)
- `-RightPy=<value>` - Right padding height (default: 0)
- `-RightPx=<value>` - Right padding width (default: 0)
### Pool Dimension
- `-pool_dim=<2|3>` - Pooling dimension (default: 3)
### Verification
- `-verify=<0|1>` - Verification mode
- 0: No verification
- 1: CPU verification (default)
### Performance Testing
- `-warmup=<value>` - Warmup iterations (default: 20)
- `-repeat=<value>` - Benchmark iterations (default: 100)
- `-timer=<true|false>` - Use GPU timer (default: true)
- `-flush_cache=<true|false>` - Flush cache between runs (default: true)
- `-rotating_count=<value>` - Cache rotation count (default: 1000)
### Initialization
- `-init=<0|1|2>` - Tensor initialization method
- 0: Random values [-5, 5] (default)
- 1: Linear sequence
- 2: Constant value (1.0)
### Output Options
- `-log=<true|false>` - Enable verbose logging (default: false)
- `-metric=<0|1|2>` - Performance metric
- 0: Latency in ms
- 1: TFLOPS
- 2: Bandwidth in GB/s (default)
- `-json_output=<true|false>` - JSON format output (default: false)
- `-csv_filename=<filename>` - Save results to CSV
## Understanding Kernel Names
The kernel naming convention encodes the configuration:
```
benchmark_pool3d_fp16_max_True_False_128x1_1x1_2x1
^^^^ ^^^^ ^^^ ^^^^ ^^^^^ ^^^^^ ^^^ ^^^
| | | | | | | |
| | | | | | | Thread tile (MxN)
| | | | | | Warp config (MxN)
| | | | | Block tile (MxN)
| | | | Propagate NaN
| | | Output Index
| | Reduce operation
| Data type
Pool dimension (2D or 3D)
```
### Components:
- **Pool dimension**: 2d, 3d
- **Data type**: fp16, fp32, bf16
- **Reduce op**: max, min, avg
- **Output Index**: True/False (whether to output argmax/argmin)
- **Propagate NaN**: True/False
- **Block config**: Block_MxBlock_N_Warp_MxWarp_N_ThreadTile_MxThreadTile_N
## Troubleshooting
### Common Issues
1. **Kernel not found**
- Ensure the specific benchmark executable is built
- Check the build directory bin/ folder
2. **Verification failures**
- Check tensor dimensions are valid for the window/stride configuration
- Verify padding values are reasonable
3. **Build failures**
- Check GPU architecture compatibility
- Ensure ROCm is properly installed
- Verify configuration file syntax
4. **Performance variations**
- Increase warmup iterations
- Disable CPU frequency scaling
- Use GPU timer for accurate measurements
### Debug Options
Enable verbose logging:
```bash
./bin/benchmark_pool... -log=true -verify=1
```
## Performance Tips
1. **Optimal Problem Sizes**: Use sizes that are multiples of block dimensions
2. **Warmup**: Use at least 20-50 warmup iterations
3. **GPU Timer**: Always use `-timer=true` for accurate measurements
4. **Cache Management**: Enable cache flushing for consistent results
5. **Output Index**: Disable output index if not needed (reduces memory bandwidth)
## Integration Examples
### Python Integration
```python
import subprocess
import json
# Run benchmark with JSON output
result = subprocess.run([
'./bin/benchmark_pool3d_fp16_max_...',
'-N=2', '-D=30', '-H=30', '-W=30', '-C=32',
'-json_output=true'
], capture_output=True, text=True)
# Parse results
data = json.loads(result.stdout)
print(f"Bandwidth: {data['bandwidth_gb_s']} GB/s")
```
### Batch Testing Script
```bash
#!/bin/bash
SIZES="32 64 128 256"
for size in $SIZES; do
echo "Testing HxW=${size}x${size}"
./bin/benchmark_pool... -H=$size -W=$size \
-verify=1 -csv_filename=results.csv
done
```
## Contributing
When adding new features or configurations:
1. Update the instance builder (`pool_instance_builder.py`)
2. Update configuration examples in `configs/`
3. Document new command-line options in this README
4. Add appropriate tests
For more information about the Composable Kernel project, visit the main repository documentation.

View File

@@ -0,0 +1,35 @@
{
"block_config": {
"block_m": {
"values": [64, 128, 256]
},
"block_n": {
"values": [1]
},
"warp_m": {
"values": [1, 2]
},
"warp_n": {
"values": [1]
},
"thread_tile_m": {
"values": [1, 2, 4]
},
"thread_tile_n": {
"values": [1]
}
},
"trait_config": {
"output_index": {
"values": [true, false]
},
"propagate_nan": {
"values": [false]
},
"pool_dim": {
"values": [2, 3]
}
},
"k_block_per_cu": 1
}

View File

@@ -11,6 +11,7 @@
#include "ck_tile/core.hpp"
#include "ck_tile/host.hpp"
#include "pool_common.hpp"
enum class Metric
{
@@ -38,25 +39,56 @@ struct PoolProblem
std::string indexDType;
std::string blockShape;
std::string reduceOp;
int poolDim;
int N, D, H, W, C;
int windowZ, windowY, windowX;
int strideZ, strideY, strideX;
int dilationZ, dilationY, dilationX;
int leftPadZ, leftPadY, leftPadX;
int rightPadZ, rightPadY, rightPadX;
bool outputIndex;
bool propagateNan;
friend std::ostream& operator<<(std::ostream& os, const PoolProblem& problem)
{
os << "{\n"
<< " \"inDType\":" << problem.inDType << ",\n"
<< " \"outDType\":" << problem.outDType << ",\n"
<< " \"computeDType\":" << problem.computeDType << ",\n"
<< " \"indexDType\":" << problem.indexDType << ",\n"
<< " \"blockShape\":" << problem.blockShape << ",\n"
<< " \"reduceOp\":" << problem.reduceOp << ",\n"
<< " \"outputIndex\":" << (problem.outputIndex ? "true" : "false") << ",\n"
<< " \"propagateNan\":" << (problem.propagateNan ? "true" : "false")
<< " \"inDType\": \"" << problem.inDType << "\",\n"
<< " \"outDType\": \"" << problem.outDType << "\",\n"
<< " \"computeDType\": \"" << problem.computeDType << "\",\n"
<< " \"indexDType\": \"" << problem.indexDType << "\",\n"
<< " \"blockShape\": \"" << problem.blockShape << "\",\n"
<< " \"reduceOp\": \"" << problem.reduceOp << "\",\n"
<< " \"poolDim\": " << problem.poolDim << ",\n"
<< " \"N\": " << problem.N << ",\n"
<< " \"D\": " << problem.D << ",\n"
<< " \"H\": " << problem.H << ",\n"
<< " \"W\": " << problem.W << ",\n"
<< " \"C\": " << problem.C << ",\n"
<< " \"windowZ\": " << problem.windowZ << ",\n"
<< " \"windowY\": " << problem.windowY << ",\n"
<< " \"windowX\": " << problem.windowX << ",\n"
<< " \"strideZ\": " << problem.strideZ << ",\n"
<< " \"strideY\": " << problem.strideY << ",\n"
<< " \"strideX\": " << problem.strideX << ",\n"
<< " \"dilationZ\": " << problem.dilationZ << ",\n"
<< " \"dilationY\": " << problem.dilationY << ",\n"
<< " \"dilationX\": " << problem.dilationX << ",\n"
<< " \"leftPadZ\": " << problem.leftPadZ << ",\n"
<< " \"leftPadY\": " << problem.leftPadY << ",\n"
<< " \"leftPadX\": " << problem.leftPadX << ",\n"
<< " \"rightPadZ\": " << problem.rightPadZ << ",\n"
<< " \"rightPadY\": " << problem.rightPadY << ",\n"
<< " \"rightPadX\": " << problem.rightPadX << ",\n"
<< " \"outputIndex\": " << (problem.outputIndex ? "true" : "false") << ",\n"
<< " \"propagateNan\": " << (problem.propagateNan ? "true" : "false")
<< "\n"
<< "}";
return os;
}
}
};
struct PerformanceResult
{
@@ -109,6 +141,20 @@ struct KernelInstance
}
};
// Runtime options that control how a benchmark run is executed
// (verification, timing, and output behavior). Values correspond to the
// command-line flags documented in the README (-warmup, -repeat, -timer,
// -verify, -init, -log, -csv_filename, -flush_cache, -rotating_count,
// -json_output).
struct Setting
{
    int n_warmup_;             // warmup iterations before timing
    int n_repeat_;             // timed benchmark iterations
    bool is_gpu_timer_;        // use GPU timer instead of host timer
    int verify_;               // verification mode: 0 = none, 1 = CPU reference
    int init_method_;          // tensor initialization method selector
    bool log_;                 // enable verbose logging
    std::string csv_filename_; // if non-empty, save results to this CSV file
    bool flush_cache_;         // flush cache between timed runs
    int rotating_count_;       // cache rotation count used when flushing
    bool json_output_;         // emit results in JSON format
};
inline std::string get_rocm_version()
{
std::ifstream version_file("/opt/rocm/.info/version");
@@ -121,3 +167,30 @@ inline std::string get_rocm_version()
return "Unknown";
}
/// @brief Function to compare the results of the device and host computations
template <typename OutDataType>
bool compare_pool_results(std::string instanceName,
ck_tile::HostTensor<OutDataType>& out_dev_result,
ck_tile::HostTensor<OutDataType>& out_host_result)
{
bool pass = ck_tile::check_err(out_dev_result, out_host_result, "Error: Incorrect results!");
std::cout << "For " << instanceName << " verification result is: "
<< (pass ? "correct" : "fail") << std::endl;
return pass;
}
/// @brief Compare the device argmax/argmin index output against the host
///        reference and print a pass/fail line for the given kernel instance.
/// @return true when the index tensors match according to ck_tile::check_err.
template <typename IndexDataType>
bool compare_pool_index_results(std::string instanceName,
                                ck_tile::HostTensor<IndexDataType>& out_index_dev_result,
                                ck_tile::HostTensor<IndexDataType>& out_index_host_result)
{
    const bool matched = ck_tile::check_err(
        out_index_dev_result, out_index_host_result, "Error: Incorrect index results!");
    const char* verdict = matched ? "correct" : "fail";
    std::cout << "For " << instanceName << " index verification result is: " << verdict
              << std::endl;
    return matched;
}

View File

@@ -0,0 +1,611 @@
#!/usr/bin/env python3
# Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
# SPDX-License-Identifier: MIT
import sys
import json
import subprocess
import argparse
import csv
import time
from pathlib import Path
from typing import List, Dict, Tuple, Optional
class PoolBenchmark:
def __init__(self, build_dir: str, verbose: bool = False):
self.build_dir = Path(build_dir)
self.verbose = verbose
self.results = []
def discover_kernels(self) -> List[Path]:
"""Find all benchmark_pool_* executables in the build directory"""
bin_dir = self.build_dir / "bin"
if not bin_dir.exists():
print(f"Error: Binary directory {bin_dir} does not exist")
return []
kernels = list(bin_dir.glob("benchmark_pool*"))
if self.verbose:
print(f"Found {len(kernels)} kernel executables")
for k in kernels:
print(f" - {k.name}")
return kernels
def extract_kernel_info(self, kernel_path: Path) -> Dict[str, str]:
"""Extract comprehensive kernel information from filename"""
name = kernel_path.stem
# Initialize with basic info
info = {
"executable": str(kernel_path),
"name": name,
"data_type": "unknown",
"reduce_op": "unknown",
"pool_dim": 0,
"output_index": False,
"propagate_nan": False,
}
# Parse the kernel name pattern:
# benchmark_pool3d_fp16_max_True_False_128x1_1x1_2x1
parts = name.split("_")
if len(parts) >= 3:
# Extract pool dimension (e.g., pool3d -> 3)
if "pool2d" in parts[1]:
info["pool_dim"] = 2
elif "pool3d" in parts[1]:
info["pool_dim"] = 3
# Extract data type
info["data_type"] = parts[2] if len(parts) > 2 else "unknown"
# Extract reduce op
info["reduce_op"] = parts[3] if len(parts) > 3 else "unknown"
# Extract flags
if len(parts) > 4:
info["output_index"] = parts[4] == "True"
if len(parts) > 5:
info["propagate_nan"] = parts[5] == "True"
# Extract block configuration
config_info = self.parse_block_config(name)
info.update(config_info)
# Generate config ID
info["config_id"] = self.generate_config_id(info)
return info
def parse_block_config(self, kernel_name: str) -> Dict:
    """Recover block/warp/thread-tile dimensions from a kernel name.

    Scans the "_"-separated tokens for "MxN" pairs of non-negative
    integers and assigns them in order: block, warp, thread tile.
    Groups missing from the name keep their zero defaults.
    """
    config = {
        "block_sizes": {"block_m": 0, "block_n": 0},
        "warp_config": {"warp_m": 0, "warp_n": 0},
        "thread_tile": {"thread_tile_m": 0, "thread_tile_n": 0},
    }
    # Collect every token that parses as "<int>x<int>" with both values >= 0.
    dimension_groups = []
    for token in kernel_name.split("_"):
        pieces = token.split("x")
        if len(pieces) != 2:
            continue
        try:
            dims = [int(p) for p in pieces]
        except ValueError:
            continue
        if min(dims) >= 0:
            dimension_groups.append(dims)
    # Fill sections in declaration order; extra groups beyond three are ignored.
    slots = [
        ("block_sizes", "block_m", "block_n"),
        ("warp_config", "warp_m", "warp_n"),
        ("thread_tile", "thread_tile_m", "thread_tile_n"),
    ]
    for (section, key_m, key_n), dims in zip(slots, dimension_groups):
        config[section][key_m] = dims[0]
        config[section][key_n] = dims[1]
    return config
def generate_config_id(self, info: Dict) -> str:
    """Build a compact identifier such as 'pool3d_fp16_max_128x1'.

    The block-size suffix is appended only when block_m is positive.
    """
    pieces = [
        f"pool{info.get('pool_dim', 0)}d",
        info.get("data_type", "unk"),
        info.get("reduce_op", "unk"),
    ]
    blocks = info.get("block_sizes", {})
    if blocks.get("block_m", 0) > 0:
        pieces.append(f"{blocks['block_m']}x{blocks['block_n']}")
    return "_".join(pieces)
def run_kernel(self, kernel_path: Path, params: Dict[str, str]) -> Optional[Dict]:
    """Run a single kernel with given parameters.

    Builds the command line as ``-key=value`` pairs, forces JSON output,
    saves the kernel's stdout to ``<build_dir>/results/<kernel>.json``, and
    returns the parsed result dict. Returns None on non-zero exit, empty
    output, timeout, or any other failure.
    """
    results_dir = self.build_dir / "results"
    results_dir.mkdir(exist_ok=True)
    json_file = results_dir / f"{kernel_path.stem}.json"
    cmd = [str(kernel_path)]
    for key, value in params.items():
        cmd.append(f"-{key}={value}")
    # Always request machine-readable output so the result can be parsed.
    cmd.append("-json_output=true")
    if self.verbose:
        print(f"Running: {' '.join(cmd)}")
    try:
        # 120-second guard against hung or misconfigured kernels.
        result = subprocess.run(cmd, capture_output=True, text=True, timeout=120)
        if result.returncode != 0:
            print(f"Error running {kernel_path.name}: {result.stderr}")
            return None
        output = result.stdout.strip()
        if output:
            # Persist the raw JSON for later inspection, then parse it.
            with open(json_file, "w") as f:
                f.write(output)
            return self.parse_json_file(json_file)
        else:
            print(f"No output from {kernel_path.name}")
            return None
    except subprocess.TimeoutExpired:
        print(f"Timeout running {kernel_path.name}")
        return None
    except Exception as e:
        print(f"Error running {kernel_path.name}: {e}")
        return None
def parse_json_file(self, json_file: Path) -> Optional[Dict]:
"""Parse JSON data from individual kernel output file"""
try:
with open(json_file, "r") as f:
content = f.read().strip()
data = json.loads(content)
result = data.copy()
if "perf_result" in data:
perf = data["perf_result"]
result["time_ms"] = perf.get("latency(ms)", 0)
result["tflops"] = perf.get("tflops(TFlops)", 0)
result["bandwidth_gb_s"] = perf.get("bandwidth(GB/s)", 0)
return result
except json.JSONDecodeError as e:
if self.verbose:
print(f"Failed to parse JSON from {json_file}: {e}")
return None
except Exception as e:
if self.verbose:
print(f"Error reading JSON file {json_file}: {e}")
return None
def benchmark_problem_size(
    self,
    kernels: List[Path],
    N: int,
    D: int,
    H: int,
    W: int,
    C: int,
    window_z: int = 2,
    window_y: int = 2,
    window_x: int = 2,
    stride_z: int = 2,
    stride_y: int = 2,
    stride_x: int = 2,
    pool_dim: int = 3,
    verify: int = 0,
    warmup: int = 20,
    repeat: int = 100,
    flush_cache: bool = True,
    rotating_count: int = 1000,
) -> List[Dict]:
    """Benchmark all kernels for a specific problem size.

    Runs every executable in *kernels* with the same tensor, window, and
    stride parameters, and returns one structured result dict per
    successful run (kernel identity, parsed problem/perf blocks, and the
    flattened time_ms/tflops/bandwidth_gb_s metrics). Kernels whose
    execution fails or produces no output are omitted from the result.
    """
    results = []
    # Command-line parameters shared by every kernel invocation.
    params = {
        "N": N,
        "D": D,
        "H": H,
        "W": W,
        "C": C,
        "Z": window_z,
        "Y": window_y,
        "X": window_x,
        "Sz": stride_z,
        "Sy": stride_y,
        "Sx": stride_x,
        "pool_dim": pool_dim,
        "verify": verify,
        "warmup": warmup,
        "repeat": repeat,
        # The executables expect lowercase true/false for boolean flags.
        "flush_cache": str(flush_cache).lower(),
        "rotating_count": rotating_count,
    }
    print(f"\nBenchmarking N={N}, D={D}, H={H}, W={W}, C={C}")
    print(f"  Window: {window_z}x{window_y}x{window_x}, Stride: {stride_z}x{stride_y}x{stride_x}")
    for kernel_path in kernels:
        kernel_info = self.extract_kernel_info(kernel_path)
        result = self.run_kernel(kernel_path, params)
        if result:
            # Combine filename-derived config with the measured results.
            structured_result = {
                "name": kernel_info["name"],
                "config_id": kernel_info["config_id"],
                "problem": result.get("problem", {}),
                "perf_result": result.get("perf_result", {}),
                "config": {
                    "data_type": kernel_info["data_type"],
                    "reduce_op": kernel_info["reduce_op"],
                    "pool_dim": kernel_info["pool_dim"],
                    "output_index": kernel_info["output_index"],
                    "propagate_nan": kernel_info["propagate_nan"],
                    "block_sizes": kernel_info.get("block_sizes", {}),
                    "warp_config": kernel_info.get("warp_config", {}),
                    "thread_tile": kernel_info.get("thread_tile", {}),
                },
                "executable": kernel_info["executable"],
                "time_ms": result.get("time_ms", 0),
                "tflops": result.get("tflops", 0),
                "bandwidth_gb_s": result.get("bandwidth_gb_s", 0),
            }
            results.append(structured_result)
            if self.verbose:
                print(
                    f"  {kernel_info['config_id']}: {structured_result['bandwidth_gb_s']:.2f} GB/s, {structured_result['time_ms']:.2f}ms"
                )
    return results
def find_best_kernel(
    self, results: List[Dict], metric: str = "bandwidth_gb_s"
) -> Optional[Dict]:
    """Return the best result by the chosen metric, or None when empty.

    bandwidth_gb_s and tflops are maximized; time_ms is minimized.
    Raises ValueError for an unrecognized metric name.
    """
    if not results:
        return None
    selectors = {
        "bandwidth_gb_s": lambda rs: max(rs, key=lambda x: x.get("bandwidth_gb_s", 0)),
        "time_ms": lambda rs: min(rs, key=lambda x: x.get("time_ms", float("inf"))),
        "tflops": lambda rs: max(rs, key=lambda x: x.get("tflops", 0)),
    }
    if metric not in selectors:
        raise ValueError(f"Unknown metric: {metric}")
    return selectors[metric](results)
def benchmark_sweep(
    self,
    problem_sizes: List[Tuple[int, int, int, int, int]],  # N, D, H, W, C
    window_sizes: List[Tuple[int, int, int]] = [(2, 2, 2)],
    stride_sizes: List[Tuple[int, int, int]] = [(2, 2, 2)],
    pool_dim: int = 3,
    verify: bool = False,
    warmup: int = 20,
    repeat: int = 100,
    flush_cache: bool = True,
    rotating_count: int = 1000,
) -> Dict:
    """Benchmark every discovered kernel over the full cross product of
    problem sizes, window sizes and stride sizes.

    Stores all per-kernel results in self.results and returns a dict
    mapping a per-configuration key string to its best-performing kernel.
    """
    kernels = self.discover_kernels()
    if not kernels:
        print("No kernels found!")
        return {}
    collected = []
    winners = {}
    for n, d, h, w, c in problem_sizes:
        for win_z, win_y, win_x in window_sizes:
            for str_z, str_y, str_x in stride_sizes:
                run_results = self.benchmark_problem_size(
                    kernels,
                    n, d, h, w, c,
                    window_z=win_z, window_y=win_y, window_x=win_x,
                    stride_z=str_z, stride_y=str_y, stride_x=str_x,
                    pool_dim=pool_dim,
                    verify=1 if verify else 0,  # downstream expects an int flag
                    warmup=warmup,
                    repeat=repeat,
                    flush_cache=flush_cache,
                    rotating_count=rotating_count,
                )
                collected.extend(run_results)
                champion = self.find_best_kernel(run_results)
                if not champion:
                    continue
                label = (
                    f"N{n}_D{d}_H{h}_W{w}_C{c}"
                    f"_w{win_z}x{win_y}x{win_x}_s{str_z}x{str_y}x{str_x}"
                )
                winners[label] = champion
                print(
                    f"Best for {label}: {champion['name']} "
                    f"({champion['bandwidth_gb_s']:.2f} GB/s, {champion['time_ms']:.2f}ms)"
                )
    self.results = collected
    return winners
def export_csv(self, filename: str):
    """Export all collected benchmark results to a CSV file.

    Columns are the union of keys across all result dicts, sorted
    alphabetically so the layout is stable across runs; rows missing a
    column are left blank by csv.DictWriter.
    """
    if not self.results:
        print("No results to export")
        return
    # Union of keys: individual results may carry different fields.
    all_keys = set()
    for result in self.results:
        all_keys.update(result.keys())
    fieldnames = sorted(all_keys)
    with open(filename, "w", newline="") as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        writer.writeheader()
        writer.writerows(self.results)
    # Bug fix: this message previously printed a literal placeholder
    # instead of the actual output path.
    print(f"Results exported to {filename}")
def export_best_kernels(self, best_kernels: Dict, filename: str):
    """Write the per-problem best kernel selections to a text file.

    One line per problem configuration, sorted by key for deterministic
    output: "key: kernel_name (bandwidth, latency)".
    """
    with open(filename, "w") as f:
        f.write("# Best kernel selections for pooling\n")
        f.write("# Format: problem_size -> kernel_name (bandwidth, latency)\n\n")
        for key, kernel in sorted(best_kernels.items()):
            f.write(
                f"{key}: {kernel['name']} ({kernel['bandwidth_gb_s']:.2f} GB/s, {kernel['time_ms']:.2f}ms)\n"
            )
    # Bug fix: this message previously printed a literal placeholder
    # instead of the actual output path.
    print(f"Best kernels exported to {filename}")
def export_json(self, filename: str, best_kernels: Dict = None):
    """Export all results, summary statistics, and best kernels to JSON.

    A result counts as "successful" when its bandwidth is positive.
    Per-category breakdowns (by reduce op and by data type) report count,
    average and best bandwidth.
    """
    from datetime import datetime

    successful_results = [r for r in self.results if r.get("bandwidth_gb_s", 0) > 0]
    bandwidth_values = [r.get("bandwidth_gb_s", 0) for r in successful_results]
    latency_values = [
        r.get("time_ms", 0) for r in successful_results if r.get("time_ms", 0) > 0
    ]

    # Performance breakdown by kernel type.
    reduce_op_stats = {}
    data_type_stats = {}

    def _accumulate(stats, key, bandwidth):
        # Track count, running sum (temporary) and best bandwidth per category.
        entry = stats.setdefault(
            key, {"count": 0, "avg_bandwidth": 0, "best_bandwidth": 0, "_sum": 0.0}
        )
        entry["count"] += 1
        entry["_sum"] += bandwidth
        entry["best_bandwidth"] = max(entry["best_bandwidth"], bandwidth)

    for result in successful_results:
        config = result.get("config", {})
        bw = result.get("bandwidth_gb_s", 0)
        _accumulate(reduce_op_stats, config.get("reduce_op", "unknown"), bw)
        _accumulate(data_type_stats, config.get("data_type", "unknown"), bw)

    # Bug fix: avg_bandwidth previously stayed at its initial 0; finalize it
    # from the accumulated sum (and drop the temporary key from the output).
    for stats in (reduce_op_stats, data_type_stats):
        for entry in stats.values():
            total = entry.pop("_sum")
            entry["avg_bandwidth"] = total / entry["count"] if entry["count"] else 0

    output_data = {
        "benchmark_metadata": {
            "timestamp": datetime.now().isoformat(),
            "total_kernels_tested": len(self.results),
            "unique_kernels": len(
                set(r.get("name", "unknown") for r in self.results)
            ),
            "successful_runs": len(successful_results),
            "failed_runs": len(self.results) - len(successful_results),
        },
        "performance_summary": {
            "bandwidth_stats": {
                "best_gb_s": max(bandwidth_values, default=0),
                "average_gb_s": sum(bandwidth_values) / len(bandwidth_values)
                if bandwidth_values
                else 0,
                "min_gb_s": min(bandwidth_values, default=0),
            },
            "latency_stats": {
                "best_ms": min(latency_values, default=0),
                "average_ms": sum(latency_values) / len(latency_values)
                if latency_values
                else 0,
                "max_ms": max(latency_values, default=0),
            },
            "kernel_type_breakdown": {
                "by_reduce_op": reduce_op_stats,
                "by_data_type": data_type_stats,
            },
            "total_problem_configurations": len(best_kernels)
            if best_kernels
            else 0,
        },
        "kernel_results": self.results,
        "best_kernels_by_problem": best_kernels or {},
    }
    with open(filename, "w") as f:
        json.dump(output_data, f, indent=2)
    # Bug fix: print the actual output path (was a literal placeholder).
    print(f"JSON results exported to {filename}")
    print(f" - Total kernels: {len(self.results)}")
    print(f" - Successful runs: {len(successful_results)}")
    print(f" - Best bandwidth: {max(bandwidth_values, default=0):.2f} GB/s")
    print(f" - Best latency: {min(latency_values, default=0):.2f}ms")
def main():
parser = argparse.ArgumentParser(description="Pool Kernel Benchmarking Tool")
parser.add_argument(
"build_dir", help="Build directory containing kernel executables"
)
parser.add_argument(
"--problem-sizes",
nargs="+",
default=["2,30,30,30,32", "4,64,64,64,64", "8,128,128,128,128"],
help="Problem sizes as N,D,H,W,C tuples",
)
parser.add_argument(
"--window-sizes",
nargs="+",
default=["2,2,2", "3,3,3"],
help="Window sizes as Z,Y,X tuples",
)
parser.add_argument(
"--stride-sizes",
nargs="+",
default=["2,2,2"],
help="Stride sizes as Z,Y,X tuples",
)
parser.add_argument(
"--pool-dim", type=int, default=3, help="Pooling dimension (2 or 3)"
)
parser.add_argument("--verify", action="store_true", help="Enable verification")
parser.add_argument(
"--csv", default="pool_benchmark_results.csv", help="CSV output filename"
)
parser.add_argument(
"--best", default="best_pool_kernels.txt", help="Best kernels output filename"
)
parser.add_argument("--verbose", action="store_true", help="Verbose output")
parser.add_argument(
"--warmup",
type=int,
default=20,
help="Number of warmup iterations (default: 20)",
)
parser.add_argument(
"--repeat",
type=int,
default=100,
help="Number of benchmark iterations (default: 100)",
)
parser.add_argument(
"--flush-cache",
action="store_true",
default=True,
help="Enable cache flushing (default: True)",
)
parser.add_argument(
"--rotating-count",
type=int,
default=1000,
help="Number of iterations to rotate cache (default: 1000)",
)
parser.add_argument("--json", help="JSON output filename (optional)")
args = parser.parse_args()
# Parse problem sizes
problem_sizes = []
for size_str in args.problem_sizes:
try:
parts = list(map(int, size_str.split(",")))
if len(parts) == 5:
problem_sizes.append(tuple(parts))
else:
print(f"Invalid problem size: {size_str} (expected N,D,H,W,C)")
return 1
except ValueError:
print(f"Invalid problem size: {size_str}")
return 1
# Parse window sizes
window_sizes = []
for size_str in args.window_sizes:
try:
parts = list(map(int, size_str.split(",")))
if len(parts) == 3:
window_sizes.append(tuple(parts))
else:
print(f"Invalid window size: {size_str} (expected Z,Y,X)")
return 1
except ValueError:
print(f"Invalid window size: {size_str}")
return 1
# Parse stride sizes
stride_sizes = []
for size_str in args.stride_sizes:
try:
parts = list(map(int, size_str.split(",")))
if len(parts) == 3:
stride_sizes.append(tuple(parts))
else:
print(f"Invalid stride size: {size_str} (expected Z,Y,X)")
return 1
except ValueError:
print(f"Invalid stride size: {size_str}")
return 1
# Create benchmark instance
benchmark = PoolBenchmark(args.build_dir, verbose=args.verbose)
# Run benchmark sweep
print("Starting Pool kernel benchmark sweep...")
start_time = time.time()
best_kernels = benchmark.benchmark_sweep(
problem_sizes=problem_sizes,
window_sizes=window_sizes,
stride_sizes=stride_sizes,
pool_dim=args.pool_dim,
verify=args.verify,
warmup=args.warmup,
repeat=args.repeat,
flush_cache=args.flush_cache,
rotating_count=args.rotating_count,
)
elapsed_time = time.time() - start_time
print(f"\nBenchmark completed in {elapsed_time:.2f} seconds")
# Export results
benchmark.export_csv(args.csv)
benchmark.export_best_kernels(best_kernels, args.best)
if args.json:
benchmark.export_json(args.json, best_kernels)
return 0
# Script entry point: propagate main()'s return value as the exit code
# (1 on argument-parse failure, 0 on success).
if __name__ == "__main__":
    sys.exit(main())

View File

@@ -16,13 +16,72 @@
// The kernel header is included via the compile command line with -include flag
// It defines SelectedKernel struct and KERNEL_NAME
// DataTypeTraits are now defined in gemm_common.hpp
// DataTypeTraits are defined in pool_common.hpp
// Create argument parser TODO
// Create argument parser
inline auto create_args(int argc, char* argv[])
{
ck_tile::ArgParser arg_parser;
// TODO
arg_parser
.insert("N", "2", "Batch size N dimension. Default is 2.")
.insert("D", "30", "Depth D dimension (for 3D pooling). Default is 30.")
.insert("H", "30", "Height H dimension. Default is 30.")
.insert("W", "30", "Width W dimension. Default is 30.")
.insert("C", "32", "Channel C dimension. Default is 32.")
.insert("Z", "2", "Window depth Z dimension. Default is 2.")
.insert("Y", "2", "Window height Y dimension. Default is 2.")
.insert("X", "2", "Window width X dimension. Default is 2.")
.insert("Sz", "2", "Window stride depth. Default is 2.")
.insert("Sy", "2", "Window stride height. Default is 2.")
.insert("Sx", "2", "Window stride width. Default is 2.")
.insert("Dz", "1", "Window dilation depth. Default is 1.")
.insert("Dy", "1", "Window dilation height. Default is 1.")
.insert("Dx", "1", "Window dilation width. Default is 1.")
.insert("LeftPz", "0", "Left padding depth. Default is 0.")
.insert("LeftPy", "0", "Left padding height. Default is 0.")
.insert("LeftPx", "0", "Left padding width. Default is 0.")
.insert("RightPz", "0", "Right padding depth. Default is 0.")
.insert("RightPy", "0", "Right padding height. Default is 0.")
.insert("RightPx", "0", "Right padding width. Default is 0.")
.insert("pool_dim",
"3",
"Pooling dimension (2 for 2D, 3 for 3D). Default is 3.")
.insert("verify",
"1",
"The type of validation. Set to 0 for no validation, 1 for validation on CPU. "
"Default is 1, CPU validation.")
.insert("log",
"false",
"Whether output kernel instance information or not. Possible values are true or "
"false. Default is false")
.insert(
"warmup", "20", "The number of iterations before benchmark the kernel. Default is 20.")
.insert(
"repeat", "100", "The number of iterations to benchmark the kernel. Default is 100.")
.insert("timer",
"true",
"Whether if the timer is gpu timer or not. Possible values are false or true. "
"Default is true.")
.insert("init",
"0",
"The method of tensor initialization. Set to 0 for random, to 1 for linear, or 2 "
"for constant(1). Default is 0, random.")
.insert("flush_cache",
"true",
"To flush cache, possible values are true or false. "
"Default is true.")
.insert("rotating_count", "1000", "Number of iterations to rotate the cache. Default is 1000.")
.insert("metric",
"2",
"Metric with which to measure kernel performance. Set to 0 for latency, 1 for "
"tflops, or 2 for bandwidth. Default is 2, bandwidth.")
.insert("csv_filename",
"",
"The filename of benchmark result. Default is empty (no CSV output).")
.insert("json_output",
"false",
"Whether to output results in JSON format only. Possible values are true or false. "
"Default is false");
bool result = arg_parser.parse(argc, argv);
return std::make_tuple(result, arg_parser);
@@ -32,35 +91,73 @@ void benchmark_single(const ck_tile::ArgParser& arg_parser)
{
// Use DataTypeTraits to get the actual type names from the generated header
// The generated header defines InDataType, OutDataType, ComputeDataType, IndexDataType
std::string inDType = DataTypeTraits<InDataType>::name;
std::string outDType = DataTypeTraits<OutDataType>::name;
std::string inDType = DataTypeTraits<InDataType>::name;
std::string outDType = DataTypeTraits<OutDataType>::name;
std::string computeDType = DataTypeTraits<ComputeDataType>::name;
std::string indexDType = DataTypeTraits<IndexDataType>::name;
PoolProblem pool_problem{inDType,
outDType,
computeDType,
indexDType,
arg_parser.get_str("blockShape"),
arg_parser.get_str("reduceOp"),
arg_parser.get_bool("outputIndex"),
arg_parser.get_bool("propagateNan")};
// Get block shape from the generated kernel
std::string blockShape = BLOCK_SHAPE_NAME;
Settings settings{};
// Get reduce op from the generated kernel
std::string reduceOp = REDUCE_OP_NAME;
// Create PoolProblem struct
PoolProblem pool_problem{
inDType,
outDType,
computeDType,
indexDType,
blockShape,
reduceOp,
arg_parser.get_int("pool_dim"),
arg_parser.get_int("N"),
arg_parser.get_int("D"),
arg_parser.get_int("H"),
arg_parser.get_int("W"),
arg_parser.get_int("C"),
arg_parser.get_int("Z"),
arg_parser.get_int("Y"),
arg_parser.get_int("X"),
arg_parser.get_int("Sz"),
arg_parser.get_int("Sy"),
arg_parser.get_int("Sx"),
arg_parser.get_int("Dz"),
arg_parser.get_int("Dy"),
arg_parser.get_int("Dx"),
arg_parser.get_int("LeftPz"),
arg_parser.get_int("LeftPy"),
arg_parser.get_int("LeftPx"),
arg_parser.get_int("RightPz"),
arg_parser.get_int("RightPy"),
arg_parser.get_int("RightPx"),
OUTPUT_INDEX,
PROPAGATE_NAN};
// Create Setting struct
Setting setting{arg_parser.get_int("warmup"),
arg_parser.get_int("repeat"),
arg_parser.get_bool("timer"),
arg_parser.get_int("verify"),
arg_parser.get_int("init"),
arg_parser.get_bool("log"),
arg_parser.get_str("csv_filename"),
arg_parser.get_bool("flush_cache"),
arg_parser.get_int("rotating_count"),
arg_parser.get_bool("json_output")};
// Get the profiler instance
auto& profiler = PoolProfiler::instance(setting); // TODO
auto& profiler = PoolProfiler::instance(setting);
try
{
// Create a lambda that wraps the kernel launch
auto kernel_func = [](const ck_tile::&PoolHostArgs args, // TODO
const ck_tile::stream_config& stream) {
auto kernel_func = [&](const auto& args, const ck_tile::stream_config& stream) {
return SelectedKernel::launch(args, stream);
};
// Benchmark the kernel
profiler.benchmark(pool_problem, kernel_func);
// Benchmark the kernel using the templated version
profiler.template benchmark<TensorShapeType, WindowShapeType>(pool_problem, kernel_func);
// Select best instance based on metric
profiler.select_best_instance(static_cast<Metric>(arg_parser.get_int("metric")));

View File

@@ -0,0 +1,589 @@
#!/usr/bin/env python
# Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
# SPDX-License-Identifier: MIT
import os
import json
import argparse
import itertools
import multiprocessing
import concurrent.futures
from pathlib import Path
import logging
logging.basicConfig(level=logging.INFO)
def get_dtype_string(dtype):
    """Map a short dtype name (e.g. "fp16") to its C++ type string.

    Unknown names pass through unchanged, so callers may supply a raw
    C++ type directly.
    """
    cpp_types = {
        "fp16": "ck_tile::half_t",
        "fp32": "float",
        "bf16": "ck_tile::bf16_t",
        "fp8": "ck_tile::fp8_t",
        "bf8": "ck_tile::bf8_t",
        "int8": "ck_tile::int8_t",
        "int32": "ck_tile::int32_t",
        "index_t": "ck_tile::index_t",
    }
    if dtype in cpp_types:
        return cpp_types[dtype]
    # Fall back to the caller-provided string verbatim.
    return dtype
def get_reduce_op_string(reduce_op):
    """Map a reduce-op name (case-insensitive) to its ck_tile enum string.

    "avg" maps to Add because averaging is implemented as a sum followed
    by a later division; unrecognized names default to Max.
    """
    op_enums = {
        "max": "ck_tile::ReduceOp::Max",
        "min": "ck_tile::ReduceOp::Min",
        "add": "ck_tile::ReduceOp::Add",
        "avg": "ck_tile::ReduceOp::Add",  # Average uses Add and divides later
    }
    return op_enums.get(reduce_op.lower(), "ck_tile::ReduceOp::Max")
class PoolKernelBuilder:
    """Expands a pooling-kernel configuration into generated C++ headers.

    Given a datatype, reduce op and a JSON config (or the built-in default),
    the builder enumerates the cross product of block/warp/thread-tile shapes
    and trait combinations, validates each, and emits one self-contained
    kernel-instance header per combination plus CMake/listing helper files
    consumed by the build system.
    """

    def __init__(self, working_path, gpu_target, datatype, reduce_op, config_json=None):
        # working_path: output directory for all generated files (created if missing).
        self.working_path = Path(working_path)
        self.gpu_target = gpu_target
        self.datatype = datatype
        self.reduce_op = reduce_op
        self.config_json = config_json
        # Create working directory if it doesn't exist
        self.working_path.mkdir(parents=True, exist_ok=True)
        # Load configuration
        if config_json and os.path.exists(config_json):
            with open(config_json, "r") as f:
                self.config = json.load(f)
        else:
            # Default configuration
            self.config = self._get_default_config()

    def _get_default_config(self):
        """Return default configuration for pooling kernels"""
        return {
            "block_config": {
                "block_m": {"values": [64, 128, 256]},
                "block_n": {"values": [1]},
                "warp_m": {"values": [1, 2]},
                "warp_n": {"values": [1]},
                "thread_tile_m": {"values": [1, 2, 4]},
                "thread_tile_n": {"values": [1]},
            },
            "trait_config": {
                "output_index": {"values": [True, False]},
                "propagate_nan": {"values": [False]},
                "pool_dim": {"values": [2, 3]},
            },
            "k_block_per_cu": 1,
        }

    def write_kernel_list(self):
        """Write kernel list to file for CMake to read.

        Emits pool_kernel_count.txt (number of kernels) and
        pool_kernel_list.txt (one "name|block_str|trait_str" line per kernel).
        """
        block_configs = self._get_block_configs()
        trait_combos = self._generate_trait_combinations()
        kernel_list = []
        for block_config in block_configs:
            for trait_combo in trait_combos:
                output_index, propagate_nan, pool_dim = trait_combo
                # Create kernel name
                kernel_name = f"pool{pool_dim}d_{self.datatype}_{self.reduce_op}"
                kernel_name += f"_{str(output_index).capitalize()}_{str(propagate_nan).capitalize()}"
                # Create block configuration string
                block_str = f"{block_config['block_m']}x{block_config['block_n']}_"
                block_str += f"{block_config['warp_m']}x{block_config['warp_n']}_"
                block_str += f"{block_config['thread_tile_m']}x{block_config['thread_tile_n']}"
                kernel_name += f"_{block_str}"
                kernel_list.append(
                    {
                        "name": kernel_name,
                        "block_config": block_config,
                        "trait_combo": trait_combo,
                    }
                )
        # Write kernel count
        with open(self.working_path / "pool_kernel_count.txt", "w") as f:
            f.write(str(len(kernel_list)))
        # Write kernel list
        with open(self.working_path / "pool_kernel_list.txt", "w") as f:
            for kernel in kernel_list:
                block_config = kernel["block_config"]
                trait_combo = kernel["trait_combo"]
                block_str = f"{block_config['block_m']}x{block_config['block_n']}_"
                block_str += f"{block_config['warp_m']}x{block_config['warp_n']}_"
                block_str += f"{block_config['thread_tile_m']}x{block_config['thread_tile_n']}"
                trait_str = "_".join(str(x) for x in trait_combo)
                f.write(f"{kernel['name']}|{block_str}|{trait_str}\n")
        print(f"Listed {len(kernel_list)} kernel configurations")

    def _get_block_configs(self):
        """Get block configurations for the current datatype.

        Returns the cross product of all configured dimension values,
        filtered through _validate_block_config.
        """
        block_config = self.config["block_config"]
        block_m_values = block_config.get("block_m").get("values")
        block_n_values = block_config.get("block_n").get("values")
        warp_m_values = block_config.get("warp_m").get("values")
        warp_n_values = block_config.get("warp_n").get("values")
        thread_tile_m_values = block_config.get("thread_tile_m").get("values")
        thread_tile_n_values = block_config.get("thread_tile_n").get("values")
        configs = []
        for block_m in block_m_values:
            for block_n in block_n_values:
                for warp_m in warp_m_values:
                    for warp_n in warp_n_values:
                        for thread_tile_m in thread_tile_m_values:
                            for thread_tile_n in thread_tile_n_values:
                                if self._validate_block_config(
                                    block_m,
                                    block_n,
                                    warp_m,
                                    warp_n,
                                    thread_tile_m,
                                    thread_tile_n,
                                ):
                                    configs.append(
                                        {
                                            "block_m": block_m,
                                            "block_n": block_n,
                                            "warp_m": warp_m,
                                            "warp_n": warp_n,
                                            "thread_tile_m": thread_tile_m,
                                            "thread_tile_n": thread_tile_n,
                                        }
                                    )
        return configs

    def _validate_block_config(
        self, block_m, block_n, warp_m, warp_n, thread_tile_m, thread_tile_n
    ):
        """Validate that block configuration is reasonable.

        Requires positive dimensions, thread tiles evenly dividing the
        warp tile, and at most warp_size threads per warp.
        """
        if block_m <= 0 or block_n <= 0:
            return False
        if warp_m <= 0 or warp_n <= 0:
            return False
        if thread_tile_m <= 0 or thread_tile_n <= 0:
            return False
        # Warp size is 64 for AMD GPUs
        warp_size = 64
        # Calculate warp tile sizes
        warp_tile_m = block_m // warp_m
        warp_tile_n = block_n // warp_n
        if warp_tile_m <= 0 or warp_tile_n <= 0:
            return False
        # Check thread tile fits in warp tile
        if warp_tile_m % thread_tile_m != 0:
            return False
        if warp_tile_n % thread_tile_n != 0:
            return False
        # Check threads per warp constraint
        threads_per_warp = (warp_tile_m // thread_tile_m) * (warp_tile_n // thread_tile_n)
        if threads_per_warp > warp_size:
            return False
        return True

    def _generate_trait_combinations(self):
        """Generate all combinations of traits.

        Returns a list of (output_index, propagate_nan, pool_dim) tuples.
        """
        trait_config = self.config["trait_config"]
        output_index_values = trait_config.get("output_index").get("values")
        propagate_nan_values = trait_config.get("propagate_nan").get("values")
        pool_dim_values = trait_config.get("pool_dim").get("values")
        all_combinations = list(
            itertools.product(
                output_index_values,
                propagate_nan_values,
                pool_dim_values,
            )
        )
        return all_combinations

    def _generate_kernel_instance(self, block_config, trait_combo, k_block_per_cu, is_header=True):
        """Generate a single kernel instance.

        Returns (kernel_name, instance_code) where instance_code is the
        full C++ source text of one self-contained kernel header.
        """
        output_index, propagate_nan, pool_dim = trait_combo
        # Create kernel name
        kernel_name = f"pool{pool_dim}d_{self.datatype}_{self.reduce_op}"
        kernel_name += f"_{str(output_index).capitalize()}_{str(propagate_nan).capitalize()}"
        # Create block configuration string
        block_str = f"{block_config['block_m']}x{block_config['block_n']}_"
        block_str += f"{block_config['warp_m']}x{block_config['warp_n']}_"
        block_str += f"{block_config['thread_tile_m']}x{block_config['thread_tile_n']}"
        kernel_name += f"_{block_str}"
        # Determine output type (same as input for pooling)
        out_type = self.datatype
        compute_type = "fp32"  # Always use fp32 for compute
        index_type = "index_t"
        # Calculate warp tile sizes
        warp_tile_m = block_config["block_m"] // block_config["warp_m"]
        warp_tile_n = block_config["block_n"] // block_config["warp_n"]
        # Generate kernel instance code
        pragma_line = "#pragma once\n" if is_header else ""
        instance_code = f"""// Generated kernel instance for {kernel_name}
{pragma_line}
#include <cstdint>
#include <utility>
#include <tuple>
#include "ck_tile/core.hpp"
#include "ck_tile/host/kernel_launch.hpp"
#include "ck_tile/ops/pooling.hpp"
#include "ck_tile/ops/pooling/kernel/pool_kernel.hpp"
#include "ck_tile/ops/pooling/pipeline/pool_problem.hpp"
#include "ck_tile/ops/pooling/pipeline/pool_shape.hpp"
using InDataType = {get_dtype_string(self.datatype)};
using OutDataType = {get_dtype_string(out_type)};
using ComputeDataType = {get_dtype_string(compute_type)};
using IndexDataType = {get_dtype_string(index_type)};
// Reduce operation
using ReduceOpType = {get_reduce_op_string(self.reduce_op)};
// Kernel name for display
constexpr const char* KERNEL_NAME = "{kernel_name}";
constexpr const char* BLOCK_SHAPE_NAME = "{block_str}";
constexpr const char* REDUCE_OP_NAME = "{self.reduce_op}";
// Flags
constexpr bool OUTPUT_INDEX = {"true" if output_index else "false"};
constexpr bool PROPAGATE_NAN = {"true" if propagate_nan else "false"};
// Block configuration
using BlockWarps = ck_tile::sequence<{block_config['warp_m']}, {block_config['warp_n']}>;
using BlockTile = ck_tile::sequence<{block_config['block_m']}, {block_config['block_n']}>;
using WarpTile = ck_tile::sequence<{warp_tile_m}, {warp_tile_n}>;
using ThreadTile = ck_tile::sequence<{block_config['thread_tile_m']}, {block_config['thread_tile_n']}>;
using PoolBlockShape = ck_tile::PoolShape<BlockWarps, BlockTile, WarpTile, ThreadTile>;
// Pool problem definition
using Problem = ck_tile::PoolProblem<InDataType,
OutDataType,
ComputeDataType,
IndexDataType,
ReduceOpType,
OUTPUT_INDEX,
PROPAGATE_NAN,
PoolBlockShape>;
// Pool kernel type
using Kernel = ck_tile::PoolKernel<Problem>;
// Shape types for {pool_dim}D pooling
"""
        # Tensor/window shape aliases differ between 2D and 3D pooling.
        if pool_dim == 3:
            instance_code += """// 3D pooling shapes (N, D, H, W, C)
using TensorShapeType = decltype(ck_tile::make_tuple(
ck_tile::index_t{}, ck_tile::index_t{}, ck_tile::index_t{},
ck_tile::index_t{}, ck_tile::index_t{}));
// Window shape (Z, Y, X)
using WindowShapeType = decltype(ck_tile::make_tuple(
ck_tile::index_t{}, ck_tile::index_t{}, ck_tile::index_t{}));
"""
        else:
            instance_code += """// 2D pooling shapes (N, H, W, C)
using TensorShapeType = decltype(ck_tile::make_tuple(
ck_tile::index_t{}, ck_tile::index_t{},
ck_tile::index_t{}, ck_tile::index_t{}));
// Window shape (Y, X)
using WindowShapeType = decltype(ck_tile::make_tuple(
ck_tile::index_t{}, ck_tile::index_t{}));
"""
        instance_code += f"""
// Wrapper for simplified launch interface
struct SelectedKernel {{
template <typename TensorShape, typename WindowShape>
static float launch(const ck_tile::PoolHostArgs<TensorShape, WindowShape>& args,
const ck_tile::stream_config& stream) {{
auto kernel_args = Kernel::MakeKernelArgs(
const_cast<ck_tile::PoolHostArgs<TensorShape, WindowShape>&>(args));
if (!Kernel::IsSupportedArgument(kernel_args)) {{
throw std::runtime_error("Wrong! Arguments not supported! Skipping pooling kernel!");
}}
constexpr ck_tile::index_t kBlockPerCu = {k_block_per_cu};
const ck_tile::index_t kBlockSize = Kernel::BlockSize();
const ck_tile::index_t kGridSize = Kernel::CalculateGridSize(kernel_args);
if(stream.log_level_ > 0) {{
std::cout << "Launching kernel: " << KERNEL_NAME << '\\n'
<< "grid: " << kGridSize
<< ", blocks: " << kBlockSize
<< std::endl;
}}
// Launch kernel
float ave_time = ck_tile::launch_kernel(
stream,
ck_tile::make_kernel<kBlockPerCu>(Kernel{{}}, kGridSize, kBlockSize, 0, kernel_args));
return ave_time;
}}
}};
"""
        return kernel_name, instance_code

    def run(self, num_workers=None):
        """Run the builder to generate individual kernel files"""
        self.generate_individual(num_workers)

    def generate_individual(self, num_workers=None):
        """Generate individual kernel files for separate compilation.

        Fans the work out to a process pool (default: min(cpu_count, 8)
        workers) and writes a CMake include file listing all kernels.
        """
        if num_workers is None:
            num_workers = min(multiprocessing.cpu_count(), 8)
        block_configs = self._get_block_configs()
        trait_combos = self._generate_trait_combinations()
        k_block_per_cu = self.config.get("k_block_per_cu", 1)
        # Prepare work items
        work_items = []
        for block_config in block_configs:
            for trait_combo in trait_combos:
                work_items.append(
                    (
                        block_config,
                        trait_combo,
                        k_block_per_cu,
                        self.working_path,
                        self.gpu_target,
                        self.datatype,
                        self.reduce_op,
                        self.config_json,
                    )
                )
        print(
            f"Generating {len(work_items)} individual kernel files using {num_workers} workers..."
        )
        print(f" Block configs: {len(block_configs)}")
        print(f" Trait combinations: {len(trait_combos)}")
        print(f" Total kernels: {len(work_items)}")
        # Process work items
        kernel_list = []
        completed = 0
        with concurrent.futures.ProcessPoolExecutor(max_workers=num_workers) as executor:
            future_to_item = {
                executor.submit(_generate_single_kernel_individual, item): item
                for item in work_items
            }
            for future in concurrent.futures.as_completed(future_to_item):
                completed += 1
                # Progress is reported every 10 kernels and at completion.
                if completed % 10 == 0 or completed == len(work_items):
                    print(f" Progress: {completed}/{len(work_items)} kernels generated")
                try:
                    result = future.result()
                    if result:
                        kernel_list.append(result)
                except Exception as exc:
                    item = future_to_item[future]
                    print(f"Kernel generation failed for {item}: {exc}")
        # Sort kernel list
        kernel_list.sort(key=lambda x: x[0])
        # Generate CMake include file
        self._generate_cmake_individual_targets(kernel_list)
        print(f"Generated {len(kernel_list)} individual kernel files in {self.working_path}")

    def _generate_cmake_individual_targets(self, kernel_list):
        """Generate CMake include file that creates individual targets"""
        cmake_code = f"""# Generated CMake file for individual Pool targets
# Datatype: {self.datatype}, ReduceOp: {self.reduce_op}
"""
        # NOTE(review): the CMake function create_individual_pool_target is
        # declared with five parameters (datatype, reduce_op, trait,
        # block_config, config_json) but only four arguments are emitted
        # here -- confirm against the CMake side.
        for kernel_name, trait_combo, block_config in kernel_list:
            block_str = f"{block_config['block_m']}x{block_config['block_n']}_"
            block_str += f"{block_config['warp_m']}x{block_config['warp_n']}_"
            block_str += f"{block_config['thread_tile_m']}x{block_config['thread_tile_n']}"
            trait_str = "_".join(str(x) for x in trait_combo)
            cmake_code += f'create_individual_pool_target("{self.datatype}" "{self.reduce_op}" "{trait_str}" "{block_str}")\n'
        with open(self.working_path / "pool_individual_targets.cmake", "w") as f:
            f.write(cmake_code)
def _generate_single_kernel_individual(work_item):
    """Worker: render one kernel-instance header file.

    Returns (kernel_name, trait_combo, block_config) on success, or None
    (after printing the error) on failure.
    """
    (
        block_config,
        trait_combo,
        k_block_per_cu,
        working_path,
        gpu_target,
        datatype,
        reduce_op,
        config_json,
    ) = work_item
    # Each process-pool worker builds its own throwaway builder instance.
    builder = PoolKernelBuilder(working_path, gpu_target, datatype, reduce_op, config_json)
    try:
        kernel_name, instance_code = builder._generate_kernel_instance(
            block_config, trait_combo, k_block_per_cu
        )
        # Drop the leading "pool" prefix to keep filenames short.
        short_name = kernel_name[4:] if kernel_name.startswith("pool") else kernel_name
        header_file = working_path / f"pool_single_{short_name}.hpp"
        with open(header_file, "w") as f:
            f.write(instance_code)
        return (kernel_name, trait_combo, block_config)
    except Exception as e:
        print(f"Error generating individual kernel: {e}")
        return None
def main():
parser = argparse.ArgumentParser(
description="Pool kernel instance builder with parallel support"
)
parser.add_argument("--working_path", required=True, help="Working directory path")
parser.add_argument(
"--gpu_target",
required=True,
help="GPU target architecture",
)
parser.add_argument(
"--datatype",
required=True,
choices=["fp16", "fp32", "bf16"],
help="Data type",
)
parser.add_argument(
"--reduce_op",
required=True,
choices=["max", "min", "avg"],
help="Reduce operation",
)
parser.add_argument("--config_json", help="Configuration JSON file")
parser.add_argument(
"--num_workers", type=int, help="Number of parallel workers (default: auto)"
)
parser.add_argument(
"--gen_all_individual",
action="store_true",
help="Generate individual kernel files",
)
parser.add_argument(
"--gen_single", action="store_true", help="Generate a single kernel file"
)
parser.add_argument("--kernel_name", help="Kernel name for single generation")
parser.add_argument(
"--block_config", help="Block configuration string for single generation"
)
parser.add_argument(
"--trait_combo", help="Trait combination string for single generation"
)
parser.add_argument(
"--list_kernels",
action="store_true",
help="List kernel configurations without generating files",
)
args = parser.parse_args()
# Create builder
builder = PoolKernelBuilder(
args.working_path, args.gpu_target, args.datatype, args.reduce_op, args.config_json
)
if args.list_kernels:
builder.write_kernel_list()
elif args.gen_single:
# Generate a single kernel file
if not args.kernel_name or not args.block_config or not args.trait_combo:
parser.error(
"--gen_single requires --kernel_name, --block_config, and --trait_combo"
)
# Parse block config
block_parts = args.block_config.split("_")
block_dims = block_parts[0].split("x")
warp_dims = block_parts[1].split("x")
thread_tile_dims = block_parts[2].split("x")
block_config = {
"block_m": int(block_dims[0]),
"block_n": int(block_dims[1]),
"warp_m": int(warp_dims[0]),
"warp_n": int(warp_dims[1]),
"thread_tile_m": int(thread_tile_dims[0]),
"thread_tile_n": int(thread_tile_dims[1]),
}
# Parse trait combo
trait_parts = args.trait_combo.split("_")
trait_combo = (
trait_parts[0] == "True", # output_index
trait_parts[1] == "True", # propagate_nan
int(trait_parts[2]), # pool_dim
)
k_block_per_cu = builder.config.get("k_block_per_cu", 1)
# Generate the kernel
kernel_name, instance_code = builder._generate_kernel_instance(
block_config, trait_combo, k_block_per_cu
)
# Write the file
simplified_name = kernel_name
if simplified_name.startswith("pool"):
simplified_name = simplified_name[4:]
header_file = builder.working_path / f"pool_single_{simplified_name}.hpp"
with open(header_file, "w") as f:
f.write(instance_code)
print(f"Generated {header_file}")
elif args.gen_all_individual:
builder.run(args.num_workers)
else:
parser.error(
"Must specify one of: --list_kernels, --gen_all_individual, or --gen_single"
)
# Script entry point (argparse calls sys.exit itself on --help / errors).
if __name__ == "__main__":
    main()

View File

@@ -9,48 +9,404 @@
#include "ck_tile/host/device_prop.hpp"
#include "ck_tile/ops/pooling.hpp"
#include "ck_tile/host/reference/reference_pool.hpp"
#include "pool_benchmark.hpp"
class PoolProfiler
{
public:
static PoolProfiler& instance(Settings settings)
static PoolProfiler& instance(Setting setting)
{
static PoolProfiler instance{settings};
static PoolProfiler instance{setting};
return instance;
}
// Overload for single kernel benchmarking
template <typename TensorShape, typename WindowShape>
void benchmark(PoolProblem& pool_problem,
std::function<float(const ck_tile::PoolHostArgs&, const ck_tile::stream_config&)>
kernel_func)
std::function<float(const ck_tile::PoolHostArgs<TensorShape, WindowShape>&,
const ck_tile::stream_config&)> kernel_func)
{
// Create a vector with a single callable that returns both name and time
std::vector<std::function<std::tuple<std::string, float>(ck_tile::PoolHostArgs&,
const ck_tile::stream_config&)>>
std::vector<std::function<std::tuple<std::string, float>(
ck_tile::PoolHostArgs<TensorShape, WindowShape>&, const ck_tile::stream_config&)>>
callables;
callables.push_back(
[kernel_func](ck_tile::PoolHostArgs& args, const ck_tile::stream_config& stream) {
float time = kernel_func(args, stream);
return std::make_tuple(std::string(KERNEL_NAME), time);
});
callables.push_back([kernel_func](ck_tile::PoolHostArgs<TensorShape, WindowShape>& args,
const ck_tile::stream_config& stream) {
float time = kernel_func(args, stream);
return std::make_tuple(std::string(KERNEL_NAME), time);
});
benchmark(pool_problem, callables);
}
////
template <typename TensorShape, typename WindowShape>
void benchmark(
PoolProblem& pool_problem,
std::vector<std::function<std::tuple<std::string, float>(
ck_tile::PoolHostArgs<TensorShape, WindowShape>&, const ck_tile::stream_config&)>>&
callables)
{
// Calculate output dimensions based on pool dimension
const ck_tile::index_t N = pool_problem.N;
const ck_tile::index_t D = pool_problem.D;
const ck_tile::index_t H = pool_problem.H;
const ck_tile::index_t W = pool_problem.W;
const ck_tile::index_t C = pool_problem.C;
const ck_tile::index_t Z = pool_problem.windowZ;
const ck_tile::index_t Y = pool_problem.windowY;
const ck_tile::index_t X = pool_problem.windowX;
const ck_tile::index_t Sz = pool_problem.strideZ;
const ck_tile::index_t Sy = pool_problem.strideY;
const ck_tile::index_t Sx = pool_problem.strideX;
const ck_tile::index_t Dz = pool_problem.dilationZ;
const ck_tile::index_t Dy = pool_problem.dilationY;
const ck_tile::index_t Dx = pool_problem.dilationX;
const ck_tile::index_t LeftPz = pool_problem.leftPadZ;
const ck_tile::index_t LeftPy = pool_problem.leftPadY;
const ck_tile::index_t LeftPx = pool_problem.leftPadX;
const ck_tile::index_t RightPz = pool_problem.rightPadZ;
const ck_tile::index_t RightPy = pool_problem.rightPadY;
const ck_tile::index_t RightPx = pool_problem.rightPadX;
// Calculate effective window sizes
const ck_tile::index_t Zs = (Z - 1) * Dz + 1;
const ck_tile::index_t Ys = (Y - 1) * Dy + 1;
const ck_tile::index_t Xs = (X - 1) * Dx + 1;
// Calculate output dimensions
const ck_tile::index_t Do = (D + LeftPz + RightPz - Zs) / Sz + 1;
const ck_tile::index_t Ho = (H + LeftPy + RightPy - Ys) / Sy + 1;
const ck_tile::index_t Wo = (W + LeftPx + RightPx - Xs) / Sx + 1;
// Create input/output tensors based on pool dimension (3D: NDHWC, 2D: NHWC)
ck_tile::HostTensor<InDataType> in_tensor(
pool_problem.poolDim == 3
? std::vector<std::size_t>{static_cast<std::size_t>(N),
static_cast<std::size_t>(D),
static_cast<std::size_t>(H),
static_cast<std::size_t>(W),
static_cast<std::size_t>(C)}
: std::vector<std::size_t>{static_cast<std::size_t>(N),
static_cast<std::size_t>(H),
static_cast<std::size_t>(W),
static_cast<std::size_t>(C)});
ck_tile::HostTensor<OutDataType> out_tensor(
pool_problem.poolDim == 3
? std::vector<std::size_t>{static_cast<std::size_t>(N),
static_cast<std::size_t>(Do),
static_cast<std::size_t>(Ho),
static_cast<std::size_t>(Wo),
static_cast<std::size_t>(C)}
: std::vector<std::size_t>{static_cast<std::size_t>(N),
static_cast<std::size_t>(Ho),
static_cast<std::size_t>(Wo),
static_cast<std::size_t>(C)});
ck_tile::HostTensor<OutDataType> out_host_result(
pool_problem.poolDim == 3
? std::vector<std::size_t>{static_cast<std::size_t>(N),
static_cast<std::size_t>(Do),
static_cast<std::size_t>(Ho),
static_cast<std::size_t>(Wo),
static_cast<std::size_t>(C)}
: std::vector<std::size_t>{static_cast<std::size_t>(N),
static_cast<std::size_t>(Ho),
static_cast<std::size_t>(Wo),
static_cast<std::size_t>(C)});
ck_tile::HostTensor<IndexDataType> out_index_tensor(
pool_problem.outputIndex
? (pool_problem.poolDim == 3
? std::vector<std::size_t>{static_cast<std::size_t>(N),
static_cast<std::size_t>(Do),
static_cast<std::size_t>(Ho),
static_cast<std::size_t>(Wo),
static_cast<std::size_t>(C)}
: std::vector<std::size_t>{static_cast<std::size_t>(N),
static_cast<std::size_t>(Ho),
static_cast<std::size_t>(Wo),
static_cast<std::size_t>(C)})
: std::vector<std::size_t>{1});
ck_tile::HostTensor<IndexDataType> out_index_host_result(
pool_problem.outputIndex
? (pool_problem.poolDim == 3
? std::vector<std::size_t>{static_cast<std::size_t>(N),
static_cast<std::size_t>(Do),
static_cast<std::size_t>(Ho),
static_cast<std::size_t>(Wo),
static_cast<std::size_t>(C)}
: std::vector<std::size_t>{static_cast<std::size_t>(N),
static_cast<std::size_t>(Ho),
static_cast<std::size_t>(Wo),
static_cast<std::size_t>(C)})
: std::vector<std::size_t>{1});
// Initialize input tensor
if(setting_.init_method_ == 0)
{
ck_tile::FillUniformDistribution<InDataType>{-5.f, 5.f}(in_tensor);
}
else if(setting_.init_method_ == 1)
{
ck_tile::FillMonotonicSeq<InDataType>{}(in_tensor);
}
else if(setting_.init_method_ == 2)
{
ck_tile::FillConstant<InDataType>{static_cast<InDataType>(1)}(in_tensor);
}
else
{
in_tensor.SetZero();
}
// Allocate device memory
ck_tile::DeviceMem in_dev_buf(in_tensor.get_element_space_size_in_bytes());
ck_tile::DeviceMem out_dev_buf(out_tensor.get_element_space_size_in_bytes());
ck_tile::DeviceMem out_index_dev_buf(
pool_problem.outputIndex ? out_index_tensor.get_element_space_size_in_bytes() : 0);
in_dev_buf.ToDevice(in_tensor.data());
out_dev_buf.SetZero();
if(pool_problem.outputIndex)
{
out_index_dev_buf.SetZero();
}
// Create shapes for host args
TensorShape input_shape, output_shape, input_strides, output_strides;
WindowShape window_lengths, window_strides, window_dilations, input_left_pads,
input_right_pads;
// Create host arguments
ck_tile::PoolHostArgs<TensorShape, WindowShape> pool_args{
in_dev_buf.GetDeviceBuffer(),
out_dev_buf.GetDeviceBuffer(),
pool_problem.outputIndex ? out_index_dev_buf.GetDeviceBuffer() : nullptr,
input_shape,
output_shape,
input_strides,
output_strides,
window_lengths,
window_strides,
window_dilations,
input_left_pads,
input_right_pads};
// Run reference if verification is enabled
// (Reference computation would be added here based on pool dimension)
for(auto& callable : callables)
{
auto kernel_run_result = callable(pool_args,
ck_tile::stream_config{nullptr,
true,
setting_.log_,
setting_.n_warmup_,
setting_.n_repeat_,
setting_.is_gpu_timer_,
setting_.flush_cache_,
setting_.rotating_count_});
process_result(pool_problem,
out_dev_buf,
out_host_result,
out_tensor,
out_index_dev_buf,
out_index_host_result,
out_index_tensor,
kernel_run_result);
}
}
void process_result(const PoolProblem& pool_problem,
ck_tile::DeviceMem& out_dev_buf,
ck_tile::HostTensor<OutDataType>& out_host_result,
ck_tile::HostTensor<OutDataType>& out_dev_result,
ck_tile::DeviceMem& out_index_dev_buf,
ck_tile::HostTensor<IndexDataType>& out_index_host_result,
ck_tile::HostTensor<IndexDataType>& out_index_dev_result,
const std::tuple<std::string, float>& kernel_run_result)
{
auto [name, avg_time] = kernel_run_result;
KernelInstance kernel_instance{name, pool_problem, {-1.0f, -1.0f, -1.0f}};
// Compute performance metrics
const ck_tile::index_t N = pool_problem.N;
const ck_tile::index_t D = pool_problem.D;
const ck_tile::index_t H = pool_problem.H;
const ck_tile::index_t W = pool_problem.W;
const ck_tile::index_t C = pool_problem.C;
const ck_tile::index_t Z = pool_problem.windowZ;
const ck_tile::index_t Y = pool_problem.windowY;
const ck_tile::index_t X = pool_problem.windowX;
const ck_tile::index_t Sz = pool_problem.strideZ;
const ck_tile::index_t Sy = pool_problem.strideY;
const ck_tile::index_t Sx = pool_problem.strideX;
const ck_tile::index_t Dz = pool_problem.dilationZ;
const ck_tile::index_t Dy = pool_problem.dilationY;
const ck_tile::index_t Dx = pool_problem.dilationX;
const ck_tile::index_t Zs = (Z - 1) * Dz + 1;
const ck_tile::index_t Ys = (Y - 1) * Dy + 1;
const ck_tile::index_t Xs = (X - 1) * Dx + 1;
const ck_tile::index_t Do =
(D + pool_problem.leftPadZ + pool_problem.rightPadZ - Zs) / Sz + 1;
const ck_tile::index_t Ho =
(H + pool_problem.leftPadY + pool_problem.rightPadY - Ys) / Sy + 1;
const ck_tile::index_t Wo =
(W + pool_problem.leftPadX + pool_problem.rightPadX - Xs) / Sx + 1;
// Calculate FLOPs: for pooling, we count one compare/add per window element per output
// element
std::size_t window_size =
static_cast<std::size_t>(Z) * static_cast<std::size_t>(Y) * static_cast<std::size_t>(X);
std::size_t output_elements = static_cast<std::size_t>(N) * static_cast<std::size_t>(Do) *
static_cast<std::size_t>(Ho) * static_cast<std::size_t>(Wo) *
static_cast<std::size_t>(C);
std::size_t flop = output_elements * window_size;
// Calculate memory bandwidth
std::size_t num_byte = sizeof(InDataType) * N * D * H * W * C +
sizeof(OutDataType) * N * Do * Ho * Wo * C;
// Update performance results
kernel_instance.perf_result_.latency_ = avg_time;
kernel_instance.perf_result_.tflops_ = static_cast<float>(flop) / 1.E9 / avg_time;
kernel_instance.perf_result_.bandwidth_ = num_byte / 1.E6 / avg_time;
if(setting_.log_ > 0 && !setting_.json_output_)
{
std::cout << kernel_instance << std::endl;
}
// Verify result
out_dev_buf.FromDevice(out_dev_result.data());
bool verified_correct = true;
if(setting_.verify_)
{
verified_correct = compare_pool_results(name, out_dev_result, out_host_result);
if(pool_problem.outputIndex)
{
out_index_dev_buf.FromDevice(out_index_dev_result.data());
verified_correct =
verified_correct &&
compare_pool_index_results(name, out_index_dev_result, out_index_host_result);
}
}
if(verified_correct)
{
kernel_instances_.emplace_back(kernel_instance);
}
else
{
std::cout << "Verification failed, skip kernel: " << name << std::endl;
}
// Clear tensors
out_dev_buf.SetZero();
out_dev_result.SetZero();
}
KernelInstance select_best_instance(Metric metric)
{
if(kernel_instances_.empty())
throw std::runtime_error("Empty instances");
auto kernel_instance = *std::max_element(kernel_instances_.begin(),
kernel_instances_.end(),
[metric](const auto& a, const auto& b) {
return PerformanceResult::compare(
b.perf_result_, a.perf_result_, metric);
});
if(setting_.json_output_)
{
// Output clean JSON only
std::cout << kernel_instance << std::endl;
}
else
{
std::cout << "**********************************" << std::endl;
std::cout << "According to given metrics: " << get_metric_name(metric) << "\n"
<< "Current kernel performance is: " << kernel_instance << std::endl;
std::cout << "**********************************" << std::endl;
}
if(!setting_.csv_filename_.empty())
{
std::ofstream file(setting_.csv_filename_ + ".csv", std::ios::app);
if(!file.is_open())
{
std::cerr << "Warning: Failed to open CSV file for writing." << std::endl;
}
else
{
if(file.tellp() == 0)
{
file << "rocm_version,device_name,"
<< "in_dtype,out_dtype,compute_dtype,index_dtype,"
<< "block_shape,reduce_op,pool_dim,"
<< "N,D,H,W,C,"
<< "window_z,window_y,window_x,"
<< "stride_z,stride_y,stride_x,"
<< "dilation_z,dilation_y,dilation_x,"
<< "left_pad_z,left_pad_y,left_pad_x,"
<< "right_pad_z,right_pad_y,right_pad_x,"
<< "output_index,propagate_nan," << "name,"
<< "latency(ms),tflops(TFlops),bandwidth(GB/s),metric\n";
}
const auto& problem = kernel_instance.problem_;
const auto& name = kernel_instance.name_;
const auto& perf = kernel_instance.perf_result_;
file << get_rocm_version() << "," << ck_tile::get_device_name() << ","
<< problem.inDType << "," << problem.outDType << "," << problem.computeDType
<< "," << problem.indexDType << "," << problem.blockShape << ","
<< problem.reduceOp << "," << problem.poolDim << "," << problem.N << ","
<< problem.D << "," << problem.H << "," << problem.W << "," << problem.C << ","
<< problem.windowZ << "," << problem.windowY << "," << problem.windowX << ","
<< problem.strideZ << "," << problem.strideY << "," << problem.strideX << ","
<< problem.dilationZ << "," << problem.dilationY << "," << problem.dilationX
<< "," << problem.leftPadZ << "," << problem.leftPadY << ","
<< problem.leftPadX << "," << problem.rightPadZ << "," << problem.rightPadY
<< "," << problem.rightPadX << "," << problem.outputIndex << ","
<< problem.propagateNan << "," << name << "," << std::fixed
<< std::setprecision(4) << perf.latency_ << "," << std::fixed
<< std::setprecision(4) << perf.tflops_ << "," << std::fixed
<< std::setprecision(4) << perf.bandwidth_ << "," << get_metric_name(metric)
<< "\n";
if(!file)
{
std::cerr << "Warning: Error occurred while writing to CSV file." << std::endl;
}
}
}
return kernel_instance;
}
////
PoolProfiler(const PoolProfiler&) = delete;
PoolProfiler& operator=(const PoolProfiler&) = delete;
private:
~PoolProfiler() { kernel_instances_.clear(); }
PoolProfiler(Settings settings) : settings_(settings) {}
PoolProfiler(Setting setting) : setting_(setting) {}
Settings settings_;
Setting setting_;
std::vector<KernelInstance> kernel_instances_;
}
};