[CK_TILE] Add pooling to ckTileEngine part3

This commit is contained in:
Aleksander Dudek
2025-12-09 11:59:37 +00:00
parent 990f13229f
commit 07c078d5ef
8 changed files with 2449 additions and 44 deletions

View File

@@ -0,0 +1,263 @@
# User-configurable options for the Tile Engine Pool benchmarks.
set(POOL_DATATYPE "fp16;fp32" CACHE STRING "List of datatypes for Pool (semicolon-separated)")
set(POOL_REDUCE_OP "max;avg" CACHE STRING "List of reduce operations for Pool (semicolon-separated)")
set(POOL_CONFIG_FILE "" CACHE STRING "Custom config file name (without path, must be in configs/ folder)")
option(ENABLE_CCACHE_POOL "Enable ccache for Pool ops compilation" OFF)
# Store this directory so the functions below can locate the generator script
# and benchmark sources regardless of where they are invoked from.
set(POOL_SOURCE_DIR ${CMAKE_CURRENT_LIST_DIR})
# Function to create individual Pool targets
# Create one benchmark executable for a single pool kernel configuration.
#
# Arguments:
#   datatype     - element type, e.g. fp16 / fp32
#   reduce_op    - reduction operation, e.g. max / avg
#   trait        - trait combo; its third "_"-separated field is the pool dim
#   block_config - "<block_m>x<block_n>_<warp_m>x<warp_n>_<tile_m>x<tile_n>",
#                  e.g. "128x1_1x1_2x1"
#   config_json  - path to the JSON kernel configuration file
#
# Fixes vs. previous version: quoted expansions in string(REPLACE) so empty or
# list-valued arguments fail loudly instead of silently, and VERBATIM on the
# custom command for platform-independent argument escaping.
function(create_individual_pool_target datatype reduce_op trait block_config config_json)
    # POOL_GPU_TARGETS_INDIVIDUAL is computed at the top level of this file.
    if(NOT POOL_GPU_TARGETS_INDIVIDUAL)
        message(WARNING "Skipping individual Pool target ${datatype}_${reduce_op}_${trait}_${block_config}: No supported GPU targets")
        return()
    endif()

    # Split block_config into its three "MxN" groups.
    string(REPLACE "_" ";" config_groups "${block_config}")
    list(GET config_groups 0 block_dims)       # e.g., 128x1
    list(GET config_groups 1 warp_dims)        # e.g., 1x1
    list(GET config_groups 2 thread_tile_dims) # e.g., 2x1

    # Parse block dimensions
    string(REPLACE "x" ";" block_parts "${block_dims}")
    list(GET block_parts 0 block_m)
    list(GET block_parts 1 block_n)
    # Parse warp dimensions
    string(REPLACE "x" ";" warp_parts "${warp_dims}")
    list(GET warp_parts 0 warp_m)
    list(GET warp_parts 1 warp_n)
    # Parse thread tile dimensions
    string(REPLACE "x" ";" thread_tile_parts "${thread_tile_dims}")
    list(GET thread_tile_parts 0 thread_tile_m)
    list(GET thread_tile_parts 1 thread_tile_n)

    # pool_dim is the third field of the trait combo.
    string(REPLACE "_" ";" trait_parts "${trait}")
    list(GET trait_parts 2 pool_dim)

    set(target_name "benchmark_pool${pool_dim}d_${datatype}_${reduce_op}_${trait}_${block_config}")
    set(working_path "${CMAKE_CURRENT_BINARY_DIR}/${datatype}/${reduce_op}")

    # Header generated at build time containing the single kernel instance.
    set(instance_header "${working_path}/pool_single_${pool_dim}d_${datatype}_${reduce_op}_${trait}_${block_config}.hpp")
    add_custom_command(
        OUTPUT ${instance_header}
        COMMAND ${Python3_EXECUTABLE} ${POOL_SOURCE_DIR}/pool_instance_builder.py
                --working_path ${working_path}
                --datatype ${datatype}
                --reduce_op ${reduce_op}
                --config_json ${config_json}
                --gen_single
                --kernel_name "pool${pool_dim}d_${datatype}_${reduce_op}_${trait}_${block_config}"
                --block_config "${block_config}"
                --trait_combo "${trait}"
                --gpu_target "${POOL_GPU_TARGETS_INDIVIDUAL}"
        DEPENDS ${POOL_SOURCE_DIR}/pool_instance_builder.py ${config_json}
        COMMENT "Generating ${instance_header}"
        VERBATIM
    )

    # Excluded from "all": benchmarks are built only via the collection targets.
    add_executable(${target_name}
        EXCLUDE_FROM_ALL
        ${POOL_SOURCE_DIR}/pool_benchmark_single.cpp
        ${instance_header}
    )
    set_property(TARGET ${target_name} PROPERTY HIP_ARCHITECTURES ${POOL_GPU_TARGETS_INDIVIDUAL})
    target_compile_definitions(${target_name} PRIVATE
        POOL_SINGLE_INSTANCE_HPP="${instance_header}"
    )
    target_include_directories(${target_name} PRIVATE
        ${POOL_SOURCE_DIR}
        ${working_path}
    )
    target_compile_options(${target_name} PRIVATE
        -Wno-undefined-func-template
        -Wno-float-equal
        --offload-compress
        -include ${instance_header}
    )

    # Hook the executable into the collection targets created at the top level.
    add_dependencies(benchmark_pool_all ${target_name})
    add_dependencies(benchmark_pool_${datatype} ${target_name})
    add_dependencies(benchmark_pool_${reduce_op} ${target_name})
    add_dependencies(benchmark_pool_${datatype}_${reduce_op} ${target_name})
    add_dependencies(benchmark_pool${pool_dim}d ${target_name})
endfunction()
# Function to build individual Pool targets
# Enumerate the kernel configurations for one (datatype, reduce_op) pair and
# create an individual benchmark target for every listed kernel.
#
# Config file priority: POOL_CONFIG_FILE environment variable, then the
# POOL_CONFIG_FILE cache variable, then configs/default_config.json.
#
# Fixes vs. previous version: removed the no-op math(EXPR) copy of the core
# count, and quoted all file paths in EXISTS/file() so paths containing
# spaces or semicolons behave correctly.
function(build_individual_pool_targets datatype reduce_op)
    set(working_path "${CMAKE_CURRENT_BINARY_DIR}/${datatype}/${reduce_op}")

    # Choose config file (env var > cache var > default).
    if(DEFINED ENV{POOL_CONFIG_FILE} AND NOT "$ENV{POOL_CONFIG_FILE}" STREQUAL "")
        set(config_filename "$ENV{POOL_CONFIG_FILE}")
        set(json_blob "${CMAKE_CURRENT_LIST_DIR}/configs/${config_filename}")
        message(VERBOSE " Using config from environment variable: ${config_filename}")
    elseif(NOT "${POOL_CONFIG_FILE}" STREQUAL "")
        set(json_blob "${CMAKE_CURRENT_LIST_DIR}/configs/${POOL_CONFIG_FILE}")
        message(VERBOSE " Using custom config: ${POOL_CONFIG_FILE}")
    else()
        set(json_blob "${CMAKE_CURRENT_LIST_DIR}/configs/default_config.json")
        message(VERBOSE " Using default config")
    endif()

    if(NOT EXISTS "${json_blob}")
        message(FATAL_ERROR "Config file not found: ${json_blob}")
    endif()

    # Determine number of workers (capped at 8 when derived from core count).
    if(DEFINED ENV{CMAKE_BUILD_PARALLEL_LEVEL})
        set(num_workers $ENV{CMAKE_BUILD_PARALLEL_LEVEL})
    else()
        cmake_host_system_information(RESULT num_cores QUERY NUMBER_OF_LOGICAL_CORES)
        set(num_workers ${num_cores})
        if(num_workers GREATER 8)
            set(num_workers 8)
        endif()
    endif()

    message(VERBOSE "Generating individual kernels for ${datatype} ${reduce_op} using ${num_workers} workers...")
    message(VERBOSE " Working path: ${working_path}")
    message(VERBOSE " Config file: ${json_blob}")

    # Create working directory first.
    file(MAKE_DIRECTORY "${working_path}")

    # List the kernel configurations (fast, configure-time operation).
    message(VERBOSE " Listing kernel configurations...")
    execute_process(
        COMMAND ${Python3_EXECUTABLE} -u ${CMAKE_CURRENT_LIST_DIR}/pool_instance_builder.py
                --working_path ${working_path}
                --datatype ${datatype}
                --reduce_op ${reduce_op}
                --config_json ${json_blob}
                --gpu_target ${POOL_GPU_TARGETS_INDIVIDUAL}
                --list_kernels
        WORKING_DIRECTORY ${CMAKE_CURRENT_LIST_DIR}
        RESULT_VARIABLE ret
        OUTPUT_VARIABLE list_output
        ERROR_VARIABLE list_error
    )
    if(NOT ret EQUAL 0)
        message(FATAL_ERROR "Failed to list kernels for ${datatype} ${reduce_op}: ${list_error}")
    endif()

    # Read kernel count (diagnostics only; its absence means generation failed).
    if(EXISTS "${working_path}/pool_kernel_count.txt")
        file(READ "${working_path}/pool_kernel_count.txt" kernel_count)
        string(STRIP "${kernel_count}" kernel_count)
        message(VERBOSE " Found ${kernel_count} kernel configurations")
    else()
        message(FATAL_ERROR "Kernel count file not found")
    endif()

    # Create one target per listed kernel.
    # Each line has the form: kernel_name|block_config|trait_combo
    if(EXISTS "${working_path}/pool_kernel_list.txt")
        file(STRINGS "${working_path}/pool_kernel_list.txt" kernel_lines)
        foreach(line IN LISTS kernel_lines)
            string(REPLACE "|" ";" parts "${line}")
            list(GET parts 0 kernel_name)
            list(GET parts 1 block_config)
            list(GET parts 2 trait_combo)
            create_individual_pool_target("${datatype}" "${reduce_op}" "${trait_combo}" "${block_config}" "${json_blob}")
        endforeach()
    else()
        message(FATAL_ERROR "Kernel list file not found")
    endif()
endfunction()
# ---- Main build logic ----
# Filters SUPPORTED_GPU_TARGETS down to the pool-supported architectures,
# then creates the collection targets and one benchmark executable per
# kernel configuration via the functions above.
message(VERBOSE "=== Starting Tile Engine Pool Configuration ===")
message(VERBOSE "POOL_DATATYPE: ${POOL_DATATYPE}")
message(VERBOSE "POOL_REDUCE_OP: ${POOL_REDUCE_OP}")
message(VERBOSE "SUPPORTED_GPU_TARGETS: ${SUPPORTED_GPU_TARGETS}")
# Filter GPU targets: keep only architectures the pool kernels support.
set(POOL_GPU_TARGETS_INDIVIDUAL "")
set(DESIRED_TARGETS "gfx90a;gfx942;gfx950;gfx1201")
foreach(target IN LISTS SUPPORTED_GPU_TARGETS)
    if(target IN_LIST DESIRED_TARGETS)
        list(APPEND POOL_GPU_TARGETS_INDIVIDUAL ${target})
        message(VERBOSE " Adding GPU target: ${target}")
    endif()
endforeach()
# Skip build if no matching targets found.
if(NOT POOL_GPU_TARGETS_INDIVIDUAL)
    message(WARNING "Skipping Tile Engine Pool build: No supported GPU targets (gfx90a, gfx942, gfx950, gfx1201) found in SUPPORTED_GPU_TARGETS: ${SUPPORTED_GPU_TARGETS}")
else()
    message(VERBOSE "Building individual Pool targets for GPU targets: ${POOL_GPU_TARGETS_INDIVIDUAL}")
    # Set up job pools.
    # NOTE(review): the pools are declared here but no target in this file
    # visibly assigns JOB_POOL_COMPILE - confirm they are consumed elsewhere.
    set_property(GLOBAL PROPERTY JOB_POOLS
        compile_heavy=4
        compile_normal=16
    )
    # Enable compiler cache if requested via -DENABLE_CCACHE_POOL=ON.
    if(ENABLE_CCACHE_POOL)
        find_program(CCACHE_PROGRAM ccache)
        if(CCACHE_PROGRAM)
            set(CMAKE_CXX_COMPILER_LAUNCHER ${CCACHE_PROGRAM})
            message(VERBOSE "Using ccache for faster compilation")
        else()
            message(WARNING "ccache requested but not found")
        endif()
    else()
        message(VERBOSE "ccache disabled for Pool ops (use -DENABLE_CCACHE_POOL=ON to enable)")
    endif()
    # Master collection target: builds every generated pool benchmark.
    add_custom_target(benchmark_pool_all)
    # Per-datatype collection targets (e.g. benchmark_pool_fp16).
    foreach(dt IN LISTS POOL_DATATYPE)
        add_custom_target(benchmark_pool_${dt})
    endforeach()
    # Per-reduce-op collection targets (e.g. benchmark_pool_max).
    foreach(op IN LISTS POOL_REDUCE_OP)
        add_custom_target(benchmark_pool_${op})
    endforeach()
    # Combined datatype/reduce-op collection targets (e.g. benchmark_pool_fp16_max).
    foreach(dt IN LISTS POOL_DATATYPE)
        foreach(op IN LISTS POOL_REDUCE_OP)
            add_custom_target(benchmark_pool_${dt}_${op})
        endforeach()
    endforeach()
    # Pool dimension collection targets.
    add_custom_target(benchmark_pool2d)
    add_custom_target(benchmark_pool3d)
    # Generate and register the individual benchmark targets; this must run
    # after all collection targets above exist, since each individual target
    # is attached to them via add_dependencies.
    foreach(dt IN LISTS POOL_DATATYPE)
        foreach(op IN LISTS POOL_REDUCE_OP)
            build_individual_pool_targets(${dt} ${op})
        endforeach()
    endforeach()
endif()

View File

@@ -0,0 +1,381 @@
# CK Tile Engine Pool Operations
## Overview
The CK Tile Engine Pool module provides a comprehensive system for generating, building, and benchmarking pooling kernels (2D and 3D) with various configurations. It supports multiple data types, reduce operations (max, min, average), and optimization strategies. The system follows the same architecture as the GEMM module with individual kernel compilation for better build parallelism and targeted testing capabilities.
## Table of Contents
1. [Build System Architecture](#build-system-architecture)
2. [Build Instructions](#build-instructions)
3. [Running Benchmarks](#running-benchmarks)
4. [Configuration System](#configuration-system)
5. [Scripts and Tools](#scripts-and-tools)
6. [Command Line Options](#command-line-options)
7. [Understanding Kernel Names](#understanding-kernel-names)
8. [Troubleshooting](#troubleshooting)
9. [Performance Tips](#performance-tips)
## Build System Architecture
### Individual Kernel Compilation
The tile engine benchmark system compiles each kernel configuration into a separate executable. This provides:
- Better build parallelism
- Faster incremental builds
- More targeted testing
- Easier debugging of specific configurations
Each benchmark executable follows the naming pattern:
```
benchmark_pool<dim>d_<dtype>_<reduce_op>_<output_index>_<propagate_nan>_<block_config>
```
## Build Instructions
### Prerequisites
- ROCm installation
- CMake 3.16 or higher
- C++17 compatible compiler
- Python 3.6 or higher
### Basic Build
```bash
# In the root of composable kernel, create build directory
mkdir build && cd build
# Configure with specific datatypes and reduce operations
# Replace [Arch] with your GPU architecture (e.g., gfx90a, gfx942)
../script/cmake-ck-dev.sh ../ [Arch] -DPOOL_DATATYPE="fp16;fp32" -DPOOL_REDUCE_OP="max;avg"
# Build specific benchmarks
make benchmark_pool_fp16_max -j
```
### Configuration Options
The build system supports several configuration options:
#### Using Custom Config Files
```bash
# Method 1: CMake variable (config file must be in configs/ directory)
cmake -DPOOL_CONFIG_FILE=my_custom_config.json ...
# Method 2: Environment variable (takes precedence over CMake variable)
export POOL_CONFIG_FILE=my_custom_config.json
cmake ...
```
#### Config File Priority Order
1. **Environment variable** `POOL_CONFIG_FILE` (highest priority)
2. **CMake variable** `POOL_CONFIG_FILE`
3. **Default config** (default_config.json)
**Note**: All custom config files must be placed in the `tile_engine/ops/pooling/configs/` directory.
### Example Build Commands
```bash
# Build for gfx942 with fp16 datatype, max reduce operation
mkdir build && cd build
../script/cmake-ck-dev.sh ../ gfx942 -DPOOL_DATATYPE="fp16;fp32" -DPOOL_REDUCE_OP="max;avg"
make benchmark_pool_fp16_max -j
make benchmark_pool_fp32_avg -j
```
### Building Individual Kernels
```bash
# Build a specific kernel configuration
make benchmark_pool3d_fp16_max_True_False_128x1_1x1_2x1
# Build all fp16 max pooling benchmarks
make benchmark_pool_fp16_max -j$(nproc)
# Build all 3D pooling benchmarks
make benchmark_pool3d -j$(nproc)
```
### Rebuilding After Configuration Changes
If you modify the configuration file, you must remove the previously generated files and rebuild (run from the build directory):
```bash
rm -rf tile_engine/ && make benchmark_pool_[Datatype]_[ReduceOp] -j
```
## Running Benchmarks
### Individual Kernel Execution
```bash
cd /path/to/build/directory
./bin/benchmark_pool3d_fp16_max_True_False_128x1_1x1_2x1 \
-N=2 -D=30 -H=30 -W=30 -C=32 \
-Z=2 -Y=2 -X=2 \
-Sz=2 -Sy=2 -Sx=2 \
-verify=1
```
### Using the Benchmark Python Script
```bash
# Run benchmark sweep
python pool_benchmark.py /path/to/build \
--problem-sizes "2,30,30,30,32" "4,64,64,64,64" \
--window-sizes "2,2,2" "3,3,3" \
--stride-sizes "2,2,2" \
--pool-dim 3 \
--verify \
--json results.json
```
## Configuration System
### Configuration Files
The system uses JSON configuration files to specify kernel parameters:
- `configs/default_config.json` - Default configurations
### Configuration Structure
```json
{
"block_config": {
"block_m": {"values": [64, 128, 256]},
"block_n": {"values": [1]},
"warp_m": {"values": [1, 2]},
"warp_n": {"values": [1]},
"thread_tile_m": {"values": [1, 2, 4]},
"thread_tile_n": {"values": [1]}
},
"trait_config": {
"output_index": {"values": [true, false]},
"propagate_nan": {"values": [false]},
"pool_dim": {"values": [2, 3]}
},
"k_block_per_cu": 1
}
```
### Configuration Parameters
- **block_m/block_n**: Block tile dimensions for output
- **warp_m/warp_n**: Number of warps per block
- **thread_tile_m/thread_tile_n**: Thread tile sizes
- **output_index**: Whether to output indices (for max/min pooling)
- **propagate_nan**: Whether to propagate NaN values
- **pool_dim**: Pooling dimension (2 for 2D, 3 for 3D)
## Scripts and Tools
### Python Scripts
#### pool_instance_builder.py
**Purpose**: Main kernel instance generation script that creates C++ kernel implementations based on configuration files.
**Key Features**:
- Generates individual kernel header files for separate compilation
- Supports multiple data types (fp16, fp32, bf16)
- Validates block configurations for correctness
- Creates CMake integration files
**Usage**:
```bash
python pool_instance_builder.py \
--working_path ./generated \
--datatype fp16 \
--reduce_op max \
--config_json configs/default_config.json \
--gen_all_individual \
--gpu_target gfx942
```
#### pool_benchmark.py
**Purpose**: Python script for running and analyzing pool benchmarks.
**Features**:
- Automated benchmark execution
- Performance data collection
- Result analysis and reporting
- CSV and JSON export
**Usage**:
```bash
python pool_benchmark.py /path/to/build \
--problem-sizes "2,30,30,30,32" \
--window-sizes "2,2,2" \
--verbose \
--json results.json
```
## Command Line Options
All benchmark executables support the following options:
### Tensor Dimensions
- `-N=<value>` - Batch size (default: 2)
- `-D=<value>` - Depth dimension for 3D pooling (default: 30)
- `-H=<value>` - Height dimension (default: 30)
- `-W=<value>` - Width dimension (default: 30)
- `-C=<value>` - Channel dimension (default: 32)
### Window Parameters
- `-Z=<value>` - Window depth (default: 2)
- `-Y=<value>` - Window height (default: 2)
- `-X=<value>` - Window width (default: 2)
### Stride Parameters
- `-Sz=<value>` - Stride depth (default: 2)
- `-Sy=<value>` - Stride height (default: 2)
- `-Sx=<value>` - Stride width (default: 2)
### Dilation Parameters
- `-Dz=<value>` - Dilation depth (default: 1)
- `-Dy=<value>` - Dilation height (default: 1)
- `-Dx=<value>` - Dilation width (default: 1)
### Padding Parameters
- `-LeftPz=<value>` - Left padding depth (default: 0)
- `-LeftPy=<value>` - Left padding height (default: 0)
- `-LeftPx=<value>` - Left padding width (default: 0)
- `-RightPz=<value>` - Right padding depth (default: 0)
- `-RightPy=<value>` - Right padding height (default: 0)
- `-RightPx=<value>` - Right padding width (default: 0)
### Pool Dimension
- `-pool_dim=<2|3>` - Pooling dimension (default: 3)
### Verification
- `-verify=<0|1>` - Verification mode
- 0: No verification
- 1: CPU verification (default)
### Performance Testing
- `-warmup=<value>` - Warmup iterations (default: 20)
- `-repeat=<value>` - Benchmark iterations (default: 100)
- `-timer=<true|false>` - Use GPU timer (default: true)
- `-flush_cache=<true|false>` - Flush cache between runs (default: true)
- `-rotating_count=<value>` - Cache rotation count (default: 1000)
### Initialization
- `-init=<0|1|2>` - Tensor initialization method
- 0: Random values [-5, 5] (default)
- 1: Linear sequence
- 2: Constant value (1.0)
### Output Options
- `-log=<true|false>` - Enable verbose logging (default: false)
- `-metric=<0|1|2>` - Performance metric
- 0: Latency in ms
- 1: TFLOPS
- 2: Bandwidth in GB/s (default)
- `-json_output=<true|false>` - JSON format output (default: false)
- `-csv_filename=<filename>` - Save results to CSV
## Understanding Kernel Names
The kernel naming convention encodes the configuration:
```
benchmark_pool3d_fp16_max_True_False_128x1_1x1_2x1
^^^^ ^^^^ ^^^ ^^^^ ^^^^^ ^^^^^ ^^^ ^^^
| | | | | | | |
| | | | | | | Thread tile (MxN)
| | | | | | Warp config (MxN)
| | | | | Block tile (MxN)
| | | | Propagate NaN
| | | Output Index
| | Reduce operation
| Data type
Pool dimension (2D or 3D)
```
### Components:
- **Pool dimension**: 2d, 3d
- **Data type**: fp16, fp32, bf16
- **Reduce op**: max, min, avg
- **Output Index**: True/False (whether to output argmax/argmin)
- **Propagate NaN**: True/False
- **Block config**: Block_MxBlock_N_Warp_MxWarp_N_ThreadTile_MxThreadTile_N
## Troubleshooting
### Common Issues
1. **Kernel not found**
- Ensure the specific benchmark executable is built
- Check the build directory bin/ folder
2. **Verification failures**
- Check tensor dimensions are valid for the window/stride configuration
- Verify padding values are reasonable
3. **Build failures**
- Check GPU architecture compatibility
- Ensure ROCm is properly installed
- Verify configuration file syntax
4. **Performance variations**
- Increase warmup iterations
- Disable CPU frequency scaling
- Use GPU timer for accurate measurements
### Debug Options
Enable verbose logging:
```bash
./bin/benchmark_pool... -log=true -verify=1
```
## Performance Tips
1. **Optimal Problem Sizes**: Use sizes that are multiples of block dimensions
2. **Warmup**: Use at least 20-50 warmup iterations
3. **GPU Timer**: Always use `-timer=true` for accurate measurements
4. **Cache Management**: Enable cache flushing for consistent results
5. **Output Index**: Disable output index if not needed (reduces memory bandwidth)
## Integration Examples
### Python Integration
```python
import subprocess
import json
# Run benchmark with JSON output
result = subprocess.run([
'./bin/benchmark_pool3d_fp16_max_...',
'-N=2', '-D=30', '-H=30', '-W=30', '-C=32',
'-json_output=true'
], capture_output=True, text=True)
# Parse results
data = json.loads(result.stdout)
print(f"Bandwidth: {data['bandwidth_gb_s']} GB/s")
```
### Batch Testing Script
```bash
#!/bin/bash
SIZES="32 64 128 256"
for size in $SIZES; do
echo "Testing HxW=${size}x${size}"
./bin/benchmark_pool... -H=$size -W=$size \
-verify=1 -csv_filename=results.csv
done
```
## Contributing
When adding new features or configurations:
1. Update the instance builder (`pool_instance_builder.py`)
2. Update configuration examples in `configs/`
3. Document new command-line options in this README
4. Add appropriate tests
For more information about the Composable Kernel project, visit the main repository documentation.

View File

@@ -0,0 +1,35 @@
{
"block_config": {
"block_m": {
"values": [64, 128, 256]
},
"block_n": {
"values": [1]
},
"warp_m": {
"values": [1, 2]
},
"warp_n": {
"values": [1]
},
"thread_tile_m": {
"values": [1, 2, 4]
},
"thread_tile_n": {
"values": [1]
}
},
"trait_config": {
"output_index": {
"values": [true, false]
},
"propagate_nan": {
"values": [false]
},
"pool_dim": {
"values": [2, 3]
}
},
"k_block_per_cu": 1
}

View File

@@ -11,6 +11,7 @@
#include "ck_tile/core.hpp"
#include "ck_tile/host.hpp"
#include "pool_common.hpp"
enum class Metric
{
@@ -38,25 +39,56 @@ struct PoolProblem
std::string indexDType;
std::string blockShape;
std::string reduceOp;
int poolDim;
int N, D, H, W, C;
int windowZ, windowY, windowX;
int strideZ, strideY, strideX;
int dilationZ, dilationY, dilationX;
int leftPadZ, leftPadY, leftPadX;
int rightPadZ, rightPadY, rightPadX;
bool outputIndex;
bool propagateNan;
friend std::ostream& operator<<(std::ostream& os, const PoolProblem& problem)
{
os << "{\n"
<< " \"inDType\":" << problem.inDType << ",\n"
<< " \"outDType\":" << problem.outDType << ",\n"
<< " \"computeDType\":" << problem.computeDType << ",\n"
<< " \"indexDType\":" << problem.indexDType << ",\n"
<< " \"blockShape\":" << problem.blockShape << ",\n"
<< " \"reduceOp\":" << problem.reduceOp << ",\n"
<< " \"outputIndex\":" << (problem.outputIndex ? "true" : "false") << ",\n"
<< " \"propagateNan\":" << (problem.propagateNan ? "true" : "false")
<< " \"inDType\": \"" << problem.inDType << "\",\n"
<< " \"outDType\": \"" << problem.outDType << "\",\n"
<< " \"computeDType\": \"" << problem.computeDType << "\",\n"
<< " \"indexDType\": \"" << problem.indexDType << "\",\n"
<< " \"blockShape\": \"" << problem.blockShape << "\",\n"
<< " \"reduceOp\": \"" << problem.reduceOp << "\",\n"
<< " \"poolDim\": " << problem.poolDim << ",\n"
<< " \"N\": " << problem.N << ",\n"
<< " \"D\": " << problem.D << ",\n"
<< " \"H\": " << problem.H << ",\n"
<< " \"W\": " << problem.W << ",\n"
<< " \"C\": " << problem.C << ",\n"
<< " \"windowZ\": " << problem.windowZ << ",\n"
<< " \"windowY\": " << problem.windowY << ",\n"
<< " \"windowX\": " << problem.windowX << ",\n"
<< " \"strideZ\": " << problem.strideZ << ",\n"
<< " \"strideY\": " << problem.strideY << ",\n"
<< " \"strideX\": " << problem.strideX << ",\n"
<< " \"dilationZ\": " << problem.dilationZ << ",\n"
<< " \"dilationY\": " << problem.dilationY << ",\n"
<< " \"dilationX\": " << problem.dilationX << ",\n"
<< " \"leftPadZ\": " << problem.leftPadZ << ",\n"
<< " \"leftPadY\": " << problem.leftPadY << ",\n"
<< " \"leftPadX\": " << problem.leftPadX << ",\n"
<< " \"rightPadZ\": " << problem.rightPadZ << ",\n"
<< " \"rightPadY\": " << problem.rightPadY << ",\n"
<< " \"rightPadX\": " << problem.rightPadX << ",\n"
<< " \"outputIndex\": " << (problem.outputIndex ? "true" : "false") << ",\n"
<< " \"propagateNan\": " << (problem.propagateNan ? "true" : "false")
<< "\n"
<< "}";
return os;
}
}
};
struct PerformanceResult
{
@@ -109,6 +141,20 @@ struct KernelInstance
}
};
// Runtime options that control how a benchmark run is executed
// (verification, timing, and output behavior). Values correspond to the
// command-line flags documented in the README (-warmup, -repeat, -timer,
// -verify, -init, -log, -csv_filename, -flush_cache, -rotating_count,
// -json_output).
struct Setting
{
    int n_warmup_;             // warmup iterations before timing
    int n_repeat_;             // timed benchmark iterations
    bool is_gpu_timer_;        // use GPU timer instead of host timer
    int verify_;               // verification mode: 0 = none, 1 = CPU reference
    int init_method_;          // tensor initialization method selector
    bool log_;                 // enable verbose logging
    std::string csv_filename_; // if non-empty, save results to this CSV file
    bool flush_cache_;         // flush cache between timed runs
    int rotating_count_;       // cache rotation count used when flushing
    bool json_output_;         // emit results in JSON format
};
inline std::string get_rocm_version()
{
std::ifstream version_file("/opt/rocm/.info/version");
@@ -121,3 +167,30 @@ inline std::string get_rocm_version()
return "Unknown";
}
/// @brief Function to compare the results of the device and host computations
template <typename OutDataType>
bool compare_pool_results(std::string instanceName,
ck_tile::HostTensor<OutDataType>& out_dev_result,
ck_tile::HostTensor<OutDataType>& out_host_result)
{
bool pass = ck_tile::check_err(out_dev_result, out_host_result, "Error: Incorrect results!");
std::cout << "For " << instanceName << " verification result is: "
<< (pass ? "correct" : "fail") << std::endl;
return pass;
}
/// @brief Compare the device argmax/argmin index output against the host
///        reference and print a pass/fail line for the given kernel instance.
/// @return true when the index tensors match according to ck_tile::check_err.
template <typename IndexDataType>
bool compare_pool_index_results(std::string instanceName,
                                ck_tile::HostTensor<IndexDataType>& out_index_dev_result,
                                ck_tile::HostTensor<IndexDataType>& out_index_host_result)
{
    const bool matched = ck_tile::check_err(
        out_index_dev_result, out_index_host_result, "Error: Incorrect index results!");
    const char* verdict = matched ? "correct" : "fail";
    std::cout << "For " << instanceName << " index verification result is: " << verdict
              << std::endl;
    return matched;
}

View File

@@ -0,0 +1,611 @@
#!/usr/bin/env python3
# Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
# SPDX-License-Identifier: MIT
import sys
import json
import subprocess
import argparse
import csv
import time
from pathlib import Path
from typing import List, Dict, Tuple, Optional
class PoolBenchmark:
def __init__(self, build_dir: str, verbose: bool = False):
self.build_dir = Path(build_dir)
self.verbose = verbose
self.results = []
def discover_kernels(self) -> List[Path]:
"""Find all benchmark_pool_* executables in the build directory"""
bin_dir = self.build_dir / "bin"
if not bin_dir.exists():
print(f"Error: Binary directory {bin_dir} does not exist")
return []
kernels = list(bin_dir.glob("benchmark_pool*"))
if self.verbose:
print(f"Found {len(kernels)} kernel executables")
for k in kernels:
print(f" - {k.name}")
return kernels
def extract_kernel_info(self, kernel_path: Path) -> Dict[str, str]:
"""Extract comprehensive kernel information from filename"""
name = kernel_path.stem
# Initialize with basic info
info = {
"executable": str(kernel_path),
"name": name,
"data_type": "unknown",
"reduce_op": "unknown",
"pool_dim": 0,
"output_index": False,
"propagate_nan": False,
}
# Parse the kernel name pattern:
# benchmark_pool3d_fp16_max_True_False_128x1_1x1_2x1
parts = name.split("_")
if len(parts) >= 3:
# Extract pool dimension (e.g., pool3d -> 3)
if "pool2d" in parts[1]:
info["pool_dim"] = 2
elif "pool3d" in parts[1]:
info["pool_dim"] = 3
# Extract data type
info["data_type"] = parts[2] if len(parts) > 2 else "unknown"
# Extract reduce op
info["reduce_op"] = parts[3] if len(parts) > 3 else "unknown"
# Extract flags
if len(parts) > 4:
info["output_index"] = parts[4] == "True"
if len(parts) > 5:
info["propagate_nan"] = parts[5] == "True"
# Extract block configuration
config_info = self.parse_block_config(name)
info.update(config_info)
# Generate config ID
info["config_id"] = self.generate_config_id(info)
return info
def parse_block_config(self, kernel_name: str) -> Dict:
    """Recover block/warp/thread-tile dimensions from a kernel name.

    Scans the "_"-separated tokens for "MxN" pairs of non-negative
    integers and assigns them in order: block, warp, thread tile.
    Groups missing from the name keep their zero defaults.
    """
    config = {
        "block_sizes": {"block_m": 0, "block_n": 0},
        "warp_config": {"warp_m": 0, "warp_n": 0},
        "thread_tile": {"thread_tile_m": 0, "thread_tile_n": 0},
    }
    # Collect every token that parses as "<int>x<int>" with both values >= 0.
    dimension_groups = []
    for token in kernel_name.split("_"):
        pieces = token.split("x")
        if len(pieces) != 2:
            continue
        try:
            dims = [int(p) for p in pieces]
        except ValueError:
            continue
        if min(dims) >= 0:
            dimension_groups.append(dims)
    # Fill sections in declaration order; extra groups beyond three are ignored.
    slots = [
        ("block_sizes", "block_m", "block_n"),
        ("warp_config", "warp_m", "warp_n"),
        ("thread_tile", "thread_tile_m", "thread_tile_n"),
    ]
    for (section, key_m, key_n), dims in zip(slots, dimension_groups):
        config[section][key_m] = dims[0]
        config[section][key_n] = dims[1]
    return config
def generate_config_id(self, info: Dict) -> str:
    """Build a compact identifier such as 'pool3d_fp16_max_128x1'.

    The block-size suffix is appended only when block_m is positive.
    """
    pieces = [
        f"pool{info.get('pool_dim', 0)}d",
        info.get("data_type", "unk"),
        info.get("reduce_op", "unk"),
    ]
    blocks = info.get("block_sizes", {})
    if blocks.get("block_m", 0) > 0:
        pieces.append(f"{blocks['block_m']}x{blocks['block_n']}")
    return "_".join(pieces)
def run_kernel(self, kernel_path: Path, params: Dict[str, str]) -> Optional[Dict]:
    """Run a single kernel with given parameters.

    Builds the command line as ``-key=value`` pairs, forces JSON output,
    saves the kernel's stdout to ``<build_dir>/results/<kernel>.json``, and
    returns the parsed result dict. Returns None on non-zero exit, empty
    output, timeout, or any other failure.
    """
    results_dir = self.build_dir / "results"
    results_dir.mkdir(exist_ok=True)
    json_file = results_dir / f"{kernel_path.stem}.json"
    cmd = [str(kernel_path)]
    for key, value in params.items():
        cmd.append(f"-{key}={value}")
    # Always request machine-readable output so the result can be parsed.
    cmd.append("-json_output=true")
    if self.verbose:
        print(f"Running: {' '.join(cmd)}")
    try:
        # 120-second guard against hung or misconfigured kernels.
        result = subprocess.run(cmd, capture_output=True, text=True, timeout=120)
        if result.returncode != 0:
            print(f"Error running {kernel_path.name}: {result.stderr}")
            return None
        output = result.stdout.strip()
        if output:
            # Persist the raw JSON for later inspection, then parse it.
            with open(json_file, "w") as f:
                f.write(output)
            return self.parse_json_file(json_file)
        else:
            print(f"No output from {kernel_path.name}")
            return None
    except subprocess.TimeoutExpired:
        print(f"Timeout running {kernel_path.name}")
        return None
    except Exception as e:
        print(f"Error running {kernel_path.name}: {e}")
        return None
def parse_json_file(self, json_file: Path) -> Optional[Dict]:
"""Parse JSON data from individual kernel output file"""
try:
with open(json_file, "r") as f:
content = f.read().strip()
data = json.loads(content)
result = data.copy()
if "perf_result" in data:
perf = data["perf_result"]
result["time_ms"] = perf.get("latency(ms)", 0)
result["tflops"] = perf.get("tflops(TFlops)", 0)
result["bandwidth_gb_s"] = perf.get("bandwidth(GB/s)", 0)
return result
except json.JSONDecodeError as e:
if self.verbose:
print(f"Failed to parse JSON from {json_file}: {e}")
return None
except Exception as e:
if self.verbose:
print(f"Error reading JSON file {json_file}: {e}")
return None
def benchmark_problem_size(
    self,
    kernels: List[Path],
    N: int,
    D: int,
    H: int,
    W: int,
    C: int,
    window_z: int = 2,
    window_y: int = 2,
    window_x: int = 2,
    stride_z: int = 2,
    stride_y: int = 2,
    stride_x: int = 2,
    pool_dim: int = 3,
    verify: int = 0,
    warmup: int = 20,
    repeat: int = 100,
    flush_cache: bool = True,
    rotating_count: int = 1000,
) -> List[Dict]:
    """Benchmark all kernels for a specific problem size.

    Runs every executable in *kernels* with the same tensor, window, and
    stride parameters, and returns one structured result dict per
    successful run (kernel identity, parsed problem/perf blocks, and the
    flattened time_ms/tflops/bandwidth_gb_s metrics). Kernels whose
    execution fails or produces no output are omitted from the result.
    """
    results = []
    # Command-line parameters shared by every kernel invocation.
    params = {
        "N": N,
        "D": D,
        "H": H,
        "W": W,
        "C": C,
        "Z": window_z,
        "Y": window_y,
        "X": window_x,
        "Sz": stride_z,
        "Sy": stride_y,
        "Sx": stride_x,
        "pool_dim": pool_dim,
        "verify": verify,
        "warmup": warmup,
        "repeat": repeat,
        # The executables expect lowercase true/false for boolean flags.
        "flush_cache": str(flush_cache).lower(),
        "rotating_count": rotating_count,
    }
    print(f"\nBenchmarking N={N}, D={D}, H={H}, W={W}, C={C}")
    print(f"  Window: {window_z}x{window_y}x{window_x}, Stride: {stride_z}x{stride_y}x{stride_x}")
    for kernel_path in kernels:
        kernel_info = self.extract_kernel_info(kernel_path)
        result = self.run_kernel(kernel_path, params)
        if result:
            # Combine filename-derived config with the measured results.
            structured_result = {
                "name": kernel_info["name"],
                "config_id": kernel_info["config_id"],
                "problem": result.get("problem", {}),
                "perf_result": result.get("perf_result", {}),
                "config": {
                    "data_type": kernel_info["data_type"],
                    "reduce_op": kernel_info["reduce_op"],
                    "pool_dim": kernel_info["pool_dim"],
                    "output_index": kernel_info["output_index"],
                    "propagate_nan": kernel_info["propagate_nan"],
                    "block_sizes": kernel_info.get("block_sizes", {}),
                    "warp_config": kernel_info.get("warp_config", {}),
                    "thread_tile": kernel_info.get("thread_tile", {}),
                },
                "executable": kernel_info["executable"],
                "time_ms": result.get("time_ms", 0),
                "tflops": result.get("tflops", 0),
                "bandwidth_gb_s": result.get("bandwidth_gb_s", 0),
            }
            results.append(structured_result)
            if self.verbose:
                print(
                    f"  {kernel_info['config_id']}: {structured_result['bandwidth_gb_s']:.2f} GB/s, {structured_result['time_ms']:.2f}ms"
                )
    return results
def find_best_kernel(
    self, results: List[Dict], metric: str = "bandwidth_gb_s"
) -> Optional[Dict]:
    """Return the best result by the chosen metric, or None when empty.

    bandwidth_gb_s and tflops are maximized; time_ms is minimized.
    Raises ValueError for an unrecognized metric name.
    """
    if not results:
        return None
    selectors = {
        "bandwidth_gb_s": lambda rs: max(rs, key=lambda x: x.get("bandwidth_gb_s", 0)),
        "time_ms": lambda rs: min(rs, key=lambda x: x.get("time_ms", float("inf"))),
        "tflops": lambda rs: max(rs, key=lambda x: x.get("tflops", 0)),
    }
    if metric not in selectors:
        raise ValueError(f"Unknown metric: {metric}")
    return selectors[metric](results)
def benchmark_sweep(
    self,
    problem_sizes: List[Tuple[int, int, int, int, int]],  # N, D, H, W, C
    window_sizes: List[Tuple[int, int, int]] = [(2, 2, 2)],
    stride_sizes: List[Tuple[int, int, int]] = [(2, 2, 2)],
    pool_dim: int = 3,
    verify: bool = False,
    warmup: int = 20,
    repeat: int = 100,
    flush_cache: bool = True,
    rotating_count: int = 1000,
) -> Dict:
    """Benchmark every discovered kernel over the full cross product of
    problem sizes, window sizes and stride sizes.

    Stores all per-kernel results in self.results and returns a dict
    mapping a per-configuration key string to its best-performing kernel.
    """
    kernels = self.discover_kernels()
    if not kernels:
        print("No kernels found!")
        return {}
    collected = []
    winners = {}
    for n, d, h, w, c in problem_sizes:
        for win_z, win_y, win_x in window_sizes:
            for str_z, str_y, str_x in stride_sizes:
                run_results = self.benchmark_problem_size(
                    kernels,
                    n, d, h, w, c,
                    window_z=win_z, window_y=win_y, window_x=win_x,
                    stride_z=str_z, stride_y=str_y, stride_x=str_x,
                    pool_dim=pool_dim,
                    verify=1 if verify else 0,  # downstream expects an int flag
                    warmup=warmup,
                    repeat=repeat,
                    flush_cache=flush_cache,
                    rotating_count=rotating_count,
                )
                collected.extend(run_results)
                champion = self.find_best_kernel(run_results)
                if not champion:
                    continue
                label = (
                    f"N{n}_D{d}_H{h}_W{w}_C{c}"
                    f"_w{win_z}x{win_y}x{win_x}_s{str_z}x{str_y}x{str_x}"
                )
                winners[label] = champion
                print(
                    f"Best for {label}: {champion['name']} "
                    f"({champion['bandwidth_gb_s']:.2f} GB/s, {champion['time_ms']:.2f}ms)"
                )
    self.results = collected
    return winners
def export_csv(self, filename: str):
    """Export all collected benchmark results to a CSV file.

    Columns are the union of keys across all result dicts, sorted
    alphabetically so the layout is stable across runs; rows missing a
    column are left blank by csv.DictWriter.
    """
    if not self.results:
        print("No results to export")
        return
    # Union of keys: individual results may carry different fields.
    all_keys = set()
    for result in self.results:
        all_keys.update(result.keys())
    fieldnames = sorted(all_keys)
    with open(filename, "w", newline="") as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        writer.writeheader()
        writer.writerows(self.results)
    # Bug fix: this message previously printed a literal placeholder
    # instead of the actual output path.
    print(f"Results exported to {filename}")
def export_best_kernels(self, best_kernels: Dict, filename: str):
    """Write the per-problem best kernel selections to a text file.

    One line per problem configuration, sorted by key for deterministic
    output: "key: kernel_name (bandwidth, latency)".
    """
    with open(filename, "w") as f:
        f.write("# Best kernel selections for pooling\n")
        f.write("# Format: problem_size -> kernel_name (bandwidth, latency)\n\n")
        for key, kernel in sorted(best_kernels.items()):
            f.write(
                f"{key}: {kernel['name']} ({kernel['bandwidth_gb_s']:.2f} GB/s, {kernel['time_ms']:.2f}ms)\n"
            )
    # Bug fix: this message previously printed a literal placeholder
    # instead of the actual output path.
    print(f"Best kernels exported to {filename}")
def export_json(self, filename: str, best_kernels: Dict = None):
    """Export all results, summary statistics, and best kernels to JSON.

    A result counts as "successful" when its bandwidth is positive.
    Per-category breakdowns (by reduce op and by data type) report count,
    average and best bandwidth.
    """
    from datetime import datetime

    successful_results = [r for r in self.results if r.get("bandwidth_gb_s", 0) > 0]
    bandwidth_values = [r.get("bandwidth_gb_s", 0) for r in successful_results]
    latency_values = [
        r.get("time_ms", 0) for r in successful_results if r.get("time_ms", 0) > 0
    ]

    # Performance breakdown by kernel type.
    reduce_op_stats = {}
    data_type_stats = {}

    def _accumulate(stats, key, bandwidth):
        # Track count, running sum (temporary) and best bandwidth per category.
        entry = stats.setdefault(
            key, {"count": 0, "avg_bandwidth": 0, "best_bandwidth": 0, "_sum": 0.0}
        )
        entry["count"] += 1
        entry["_sum"] += bandwidth
        entry["best_bandwidth"] = max(entry["best_bandwidth"], bandwidth)

    for result in successful_results:
        config = result.get("config", {})
        bw = result.get("bandwidth_gb_s", 0)
        _accumulate(reduce_op_stats, config.get("reduce_op", "unknown"), bw)
        _accumulate(data_type_stats, config.get("data_type", "unknown"), bw)

    # Bug fix: avg_bandwidth previously stayed at its initial 0; finalize it
    # from the accumulated sum (and drop the temporary key from the output).
    for stats in (reduce_op_stats, data_type_stats):
        for entry in stats.values():
            total = entry.pop("_sum")
            entry["avg_bandwidth"] = total / entry["count"] if entry["count"] else 0

    output_data = {
        "benchmark_metadata": {
            "timestamp": datetime.now().isoformat(),
            "total_kernels_tested": len(self.results),
            "unique_kernels": len(
                set(r.get("name", "unknown") for r in self.results)
            ),
            "successful_runs": len(successful_results),
            "failed_runs": len(self.results) - len(successful_results),
        },
        "performance_summary": {
            "bandwidth_stats": {
                "best_gb_s": max(bandwidth_values, default=0),
                "average_gb_s": sum(bandwidth_values) / len(bandwidth_values)
                if bandwidth_values
                else 0,
                "min_gb_s": min(bandwidth_values, default=0),
            },
            "latency_stats": {
                "best_ms": min(latency_values, default=0),
                "average_ms": sum(latency_values) / len(latency_values)
                if latency_values
                else 0,
                "max_ms": max(latency_values, default=0),
            },
            "kernel_type_breakdown": {
                "by_reduce_op": reduce_op_stats,
                "by_data_type": data_type_stats,
            },
            "total_problem_configurations": len(best_kernels)
            if best_kernels
            else 0,
        },
        "kernel_results": self.results,
        "best_kernels_by_problem": best_kernels or {},
    }
    with open(filename, "w") as f:
        json.dump(output_data, f, indent=2)
    # Bug fix: print the actual output path (was a literal placeholder).
    print(f"JSON results exported to {filename}")
    print(f" - Total kernels: {len(self.results)}")
    print(f" - Successful runs: {len(successful_results)}")
    print(f" - Best bandwidth: {max(bandwidth_values, default=0):.2f} GB/s")
    print(f" - Best latency: {min(latency_values, default=0):.2f}ms")
def main():
parser = argparse.ArgumentParser(description="Pool Kernel Benchmarking Tool")
parser.add_argument(
"build_dir", help="Build directory containing kernel executables"
)
parser.add_argument(
"--problem-sizes",
nargs="+",
default=["2,30,30,30,32", "4,64,64,64,64", "8,128,128,128,128"],
help="Problem sizes as N,D,H,W,C tuples",
)
parser.add_argument(
"--window-sizes",
nargs="+",
default=["2,2,2", "3,3,3"],
help="Window sizes as Z,Y,X tuples",
)
parser.add_argument(
"--stride-sizes",
nargs="+",
default=["2,2,2"],
help="Stride sizes as Z,Y,X tuples",
)
parser.add_argument(
"--pool-dim", type=int, default=3, help="Pooling dimension (2 or 3)"
)
parser.add_argument("--verify", action="store_true", help="Enable verification")
parser.add_argument(
"--csv", default="pool_benchmark_results.csv", help="CSV output filename"
)
parser.add_argument(
"--best", default="best_pool_kernels.txt", help="Best kernels output filename"
)
parser.add_argument("--verbose", action="store_true", help="Verbose output")
parser.add_argument(
"--warmup",
type=int,
default=20,
help="Number of warmup iterations (default: 20)",
)
parser.add_argument(
"--repeat",
type=int,
default=100,
help="Number of benchmark iterations (default: 100)",
)
parser.add_argument(
"--flush-cache",
action="store_true",
default=True,
help="Enable cache flushing (default: True)",
)
parser.add_argument(
"--rotating-count",
type=int,
default=1000,
help="Number of iterations to rotate cache (default: 1000)",
)
parser.add_argument("--json", help="JSON output filename (optional)")
args = parser.parse_args()
# Parse problem sizes
problem_sizes = []
for size_str in args.problem_sizes:
try:
parts = list(map(int, size_str.split(",")))
if len(parts) == 5:
problem_sizes.append(tuple(parts))
else:
print(f"Invalid problem size: {size_str} (expected N,D,H,W,C)")
return 1
except ValueError:
print(f"Invalid problem size: {size_str}")
return 1
# Parse window sizes
window_sizes = []
for size_str in args.window_sizes:
try:
parts = list(map(int, size_str.split(",")))
if len(parts) == 3:
window_sizes.append(tuple(parts))
else:
print(f"Invalid window size: {size_str} (expected Z,Y,X)")
return 1
except ValueError:
print(f"Invalid window size: {size_str}")
return 1
# Parse stride sizes
stride_sizes = []
for size_str in args.stride_sizes:
try:
parts = list(map(int, size_str.split(",")))
if len(parts) == 3:
stride_sizes.append(tuple(parts))
else:
print(f"Invalid stride size: {size_str} (expected Z,Y,X)")
return 1
except ValueError:
print(f"Invalid stride size: {size_str}")
return 1
# Create benchmark instance
benchmark = PoolBenchmark(args.build_dir, verbose=args.verbose)
# Run benchmark sweep
print("Starting Pool kernel benchmark sweep...")
start_time = time.time()
best_kernels = benchmark.benchmark_sweep(
problem_sizes=problem_sizes,
window_sizes=window_sizes,
stride_sizes=stride_sizes,
pool_dim=args.pool_dim,
verify=args.verify,
warmup=args.warmup,
repeat=args.repeat,
flush_cache=args.flush_cache,
rotating_count=args.rotating_count,
)
elapsed_time = time.time() - start_time
print(f"\nBenchmark completed in {elapsed_time:.2f} seconds")
# Export results
benchmark.export_csv(args.csv)
benchmark.export_best_kernels(best_kernels, args.best)
if args.json:
benchmark.export_json(args.json, best_kernels)
return 0
# Script entry point: propagate main()'s return value as the exit code
# (1 on argument-parse failure, 0 on success).
if __name__ == "__main__":
    sys.exit(main())

View File

@@ -16,13 +16,72 @@
// The kernel header is included via the compile command line with -include flag
// It defines SelectedKernel struct and KERNEL_NAME
// DataTypeTraits are now defined in gemm_common.hpp
// DataTypeTraits are defined in pool_common.hpp
// Create argument parser TODO
// Create argument parser
inline auto create_args(int argc, char* argv[])
{
ck_tile::ArgParser arg_parser;
// TODO
arg_parser
.insert("N", "2", "Batch size N dimension. Default is 2.")
.insert("D", "30", "Depth D dimension (for 3D pooling). Default is 30.")
.insert("H", "30", "Height H dimension. Default is 30.")
.insert("W", "30", "Width W dimension. Default is 30.")
.insert("C", "32", "Channel C dimension. Default is 32.")
.insert("Z", "2", "Window depth Z dimension. Default is 2.")
.insert("Y", "2", "Window height Y dimension. Default is 2.")
.insert("X", "2", "Window width X dimension. Default is 2.")
.insert("Sz", "2", "Window stride depth. Default is 2.")
.insert("Sy", "2", "Window stride height. Default is 2.")
.insert("Sx", "2", "Window stride width. Default is 2.")
.insert("Dz", "1", "Window dilation depth. Default is 1.")
.insert("Dy", "1", "Window dilation height. Default is 1.")
.insert("Dx", "1", "Window dilation width. Default is 1.")
.insert("LeftPz", "0", "Left padding depth. Default is 0.")
.insert("LeftPy", "0", "Left padding height. Default is 0.")
.insert("LeftPx", "0", "Left padding width. Default is 0.")
.insert("RightPz", "0", "Right padding depth. Default is 0.")
.insert("RightPy", "0", "Right padding height. Default is 0.")
.insert("RightPx", "0", "Right padding width. Default is 0.")
.insert("pool_dim",
"3",
"Pooling dimension (2 for 2D, 3 for 3D). Default is 3.")
.insert("verify",
"1",
"The type of validation. Set to 0 for no validation, 1 for validation on CPU. "
"Default is 1, CPU validation.")
.insert("log",
"false",
"Whether output kernel instance information or not. Possible values are true or "
"false. Default is false")
.insert(
"warmup", "20", "The number of iterations before benchmark the kernel. Default is 20.")
.insert(
"repeat", "100", "The number of iterations to benchmark the kernel. Default is 100.")
.insert("timer",
"true",
"Whether if the timer is gpu timer or not. Possible values are false or true. "
"Default is true.")
.insert("init",
"0",
"The method of tensor initialization. Set to 0 for random, to 1 for linear, or 2 "
"for constant(1). Default is 0, random.")
.insert("flush_cache",
"true",
"To flush cache, possible values are true or false. "
"Default is true.")
.insert("rotating_count", "1000", "Number of iterations to rotate the cache. Default is 1000.")
.insert("metric",
"2",
"Metric with which to measure kernel performance. Set to 0 for latency, 1 for "
"tflops, or 2 for bandwidth. Default is 2, bandwidth.")
.insert("csv_filename",
"",
"The filename of benchmark result. Default is empty (no CSV output).")
.insert("json_output",
"false",
"Whether to output results in JSON format only. Possible values are true or false. "
"Default is false");
bool result = arg_parser.parse(argc, argv);
return std::make_tuple(result, arg_parser);
@@ -32,35 +91,73 @@ void benchmark_single(const ck_tile::ArgParser& arg_parser)
{
// Use DataTypeTraits to get the actual type names from the generated header
// The generated header defines InDataType, OutDataType, ComputeDataType, IndexDataType
std::string inDType = DataTypeTraits<InDataType>::name;
std::string outDType = DataTypeTraits<OutDataType>::name;
std::string inDType = DataTypeTraits<InDataType>::name;
std::string outDType = DataTypeTraits<OutDataType>::name;
std::string computeDType = DataTypeTraits<ComputeDataType>::name;
std::string indexDType = DataTypeTraits<IndexDataType>::name;
PoolProblem pool_problem{inDType,
outDType,
computeDType,
indexDType,
arg_parser.get_str("blockShape"),
arg_parser.get_str("reduceOp"),
arg_parser.get_bool("outputIndex"),
arg_parser.get_bool("propagateNan")};
// Get block shape from the generated kernel
std::string blockShape = BLOCK_SHAPE_NAME;
Settings settings{};
// Get reduce op from the generated kernel
std::string reduceOp = REDUCE_OP_NAME;
// Create PoolProblem struct
PoolProblem pool_problem{
inDType,
outDType,
computeDType,
indexDType,
blockShape,
reduceOp,
arg_parser.get_int("pool_dim"),
arg_parser.get_int("N"),
arg_parser.get_int("D"),
arg_parser.get_int("H"),
arg_parser.get_int("W"),
arg_parser.get_int("C"),
arg_parser.get_int("Z"),
arg_parser.get_int("Y"),
arg_parser.get_int("X"),
arg_parser.get_int("Sz"),
arg_parser.get_int("Sy"),
arg_parser.get_int("Sx"),
arg_parser.get_int("Dz"),
arg_parser.get_int("Dy"),
arg_parser.get_int("Dx"),
arg_parser.get_int("LeftPz"),
arg_parser.get_int("LeftPy"),
arg_parser.get_int("LeftPx"),
arg_parser.get_int("RightPz"),
arg_parser.get_int("RightPy"),
arg_parser.get_int("RightPx"),
OUTPUT_INDEX,
PROPAGATE_NAN};
// Create Setting struct
Setting setting{arg_parser.get_int("warmup"),
arg_parser.get_int("repeat"),
arg_parser.get_bool("timer"),
arg_parser.get_int("verify"),
arg_parser.get_int("init"),
arg_parser.get_bool("log"),
arg_parser.get_str("csv_filename"),
arg_parser.get_bool("flush_cache"),
arg_parser.get_int("rotating_count"),
arg_parser.get_bool("json_output")};
// Get the profiler instance
auto& profiler = PoolProfiler::instance(setting); // TODO
auto& profiler = PoolProfiler::instance(setting);
try
{
// Create a lambda that wraps the kernel launch
auto kernel_func = [](const ck_tile::&PoolHostArgs args, // TODO
const ck_tile::stream_config& stream) {
auto kernel_func = [&](const auto& args, const ck_tile::stream_config& stream) {
return SelectedKernel::launch(args, stream);
};
// Benchmark the kernel
profiler.benchmark(pool_problem, kernel_func);
// Benchmark the kernel using the templated version
profiler.template benchmark<TensorShapeType, WindowShapeType>(pool_problem, kernel_func);
// Select best instance based on metric
profiler.select_best_instance(static_cast<Metric>(arg_parser.get_int("metric")));

View File

@@ -0,0 +1,589 @@
#!/usr/bin/env python
# Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
# SPDX-License-Identifier: MIT
import os
import json
import argparse
import itertools
import multiprocessing
import concurrent.futures
from pathlib import Path
import logging
logging.basicConfig(level=logging.INFO)
def get_dtype_string(dtype):
    """Map a short dtype name (e.g. "fp16") to its C++ type string.

    Unknown names pass through unchanged, so callers may supply a raw
    C++ type directly.
    """
    cpp_types = {
        "fp16": "ck_tile::half_t",
        "fp32": "float",
        "bf16": "ck_tile::bf16_t",
        "fp8": "ck_tile::fp8_t",
        "bf8": "ck_tile::bf8_t",
        "int8": "ck_tile::int8_t",
        "int32": "ck_tile::int32_t",
        "index_t": "ck_tile::index_t",
    }
    if dtype in cpp_types:
        return cpp_types[dtype]
    # Fall back to the caller-provided string verbatim.
    return dtype
def get_reduce_op_string(reduce_op):
    """Map a reduce-op name (case-insensitive) to its ck_tile enum string.

    "avg" maps to Add because averaging is implemented as a sum followed
    by a later division; unrecognized names default to Max.
    """
    op_enums = {
        "max": "ck_tile::ReduceOp::Max",
        "min": "ck_tile::ReduceOp::Min",
        "add": "ck_tile::ReduceOp::Add",
        "avg": "ck_tile::ReduceOp::Add",  # Average uses Add and divides later
    }
    return op_enums.get(reduce_op.lower(), "ck_tile::ReduceOp::Max")
class PoolKernelBuilder:
    """Expands a pooling-kernel configuration into generated C++ headers.

    Given a datatype, reduce op and a JSON config (or the built-in default),
    the builder enumerates the cross product of block/warp/thread-tile shapes
    and trait combinations, validates each, and emits one self-contained
    kernel-instance header per combination plus CMake/listing helper files
    consumed by the build system.
    """

    def __init__(self, working_path, gpu_target, datatype, reduce_op, config_json=None):
        # working_path: output directory for all generated files (created if missing).
        self.working_path = Path(working_path)
        self.gpu_target = gpu_target
        self.datatype = datatype
        self.reduce_op = reduce_op
        self.config_json = config_json
        # Create working directory if it doesn't exist
        self.working_path.mkdir(parents=True, exist_ok=True)
        # Load configuration
        if config_json and os.path.exists(config_json):
            with open(config_json, "r") as f:
                self.config = json.load(f)
        else:
            # Default configuration
            self.config = self._get_default_config()

    def _get_default_config(self):
        """Return default configuration for pooling kernels"""
        return {
            "block_config": {
                "block_m": {"values": [64, 128, 256]},
                "block_n": {"values": [1]},
                "warp_m": {"values": [1, 2]},
                "warp_n": {"values": [1]},
                "thread_tile_m": {"values": [1, 2, 4]},
                "thread_tile_n": {"values": [1]},
            },
            "trait_config": {
                "output_index": {"values": [True, False]},
                "propagate_nan": {"values": [False]},
                "pool_dim": {"values": [2, 3]},
            },
            "k_block_per_cu": 1,
        }

    def write_kernel_list(self):
        """Write kernel list to file for CMake to read.

        Emits pool_kernel_count.txt (number of kernels) and
        pool_kernel_list.txt (one "name|block_str|trait_str" line per kernel).
        """
        block_configs = self._get_block_configs()
        trait_combos = self._generate_trait_combinations()
        kernel_list = []
        for block_config in block_configs:
            for trait_combo in trait_combos:
                output_index, propagate_nan, pool_dim = trait_combo
                # Create kernel name
                kernel_name = f"pool{pool_dim}d_{self.datatype}_{self.reduce_op}"
                kernel_name += f"_{str(output_index).capitalize()}_{str(propagate_nan).capitalize()}"
                # Create block configuration string
                block_str = f"{block_config['block_m']}x{block_config['block_n']}_"
                block_str += f"{block_config['warp_m']}x{block_config['warp_n']}_"
                block_str += f"{block_config['thread_tile_m']}x{block_config['thread_tile_n']}"
                kernel_name += f"_{block_str}"
                kernel_list.append(
                    {
                        "name": kernel_name,
                        "block_config": block_config,
                        "trait_combo": trait_combo,
                    }
                )
        # Write kernel count
        with open(self.working_path / "pool_kernel_count.txt", "w") as f:
            f.write(str(len(kernel_list)))
        # Write kernel list
        with open(self.working_path / "pool_kernel_list.txt", "w") as f:
            for kernel in kernel_list:
                block_config = kernel["block_config"]
                trait_combo = kernel["trait_combo"]
                block_str = f"{block_config['block_m']}x{block_config['block_n']}_"
                block_str += f"{block_config['warp_m']}x{block_config['warp_n']}_"
                block_str += f"{block_config['thread_tile_m']}x{block_config['thread_tile_n']}"
                trait_str = "_".join(str(x) for x in trait_combo)
                f.write(f"{kernel['name']}|{block_str}|{trait_str}\n")
        print(f"Listed {len(kernel_list)} kernel configurations")

    def _get_block_configs(self):
        """Get block configurations for the current datatype.

        Returns the cross product of all configured dimension values,
        filtered through _validate_block_config.
        """
        block_config = self.config["block_config"]
        block_m_values = block_config.get("block_m").get("values")
        block_n_values = block_config.get("block_n").get("values")
        warp_m_values = block_config.get("warp_m").get("values")
        warp_n_values = block_config.get("warp_n").get("values")
        thread_tile_m_values = block_config.get("thread_tile_m").get("values")
        thread_tile_n_values = block_config.get("thread_tile_n").get("values")
        configs = []
        for block_m in block_m_values:
            for block_n in block_n_values:
                for warp_m in warp_m_values:
                    for warp_n in warp_n_values:
                        for thread_tile_m in thread_tile_m_values:
                            for thread_tile_n in thread_tile_n_values:
                                if self._validate_block_config(
                                    block_m,
                                    block_n,
                                    warp_m,
                                    warp_n,
                                    thread_tile_m,
                                    thread_tile_n,
                                ):
                                    configs.append(
                                        {
                                            "block_m": block_m,
                                            "block_n": block_n,
                                            "warp_m": warp_m,
                                            "warp_n": warp_n,
                                            "thread_tile_m": thread_tile_m,
                                            "thread_tile_n": thread_tile_n,
                                        }
                                    )
        return configs

    def _validate_block_config(
        self, block_m, block_n, warp_m, warp_n, thread_tile_m, thread_tile_n
    ):
        """Validate that block configuration is reasonable.

        Requires positive dimensions, thread tiles evenly dividing the
        warp tile, and at most warp_size threads per warp.
        """
        if block_m <= 0 or block_n <= 0:
            return False
        if warp_m <= 0 or warp_n <= 0:
            return False
        if thread_tile_m <= 0 or thread_tile_n <= 0:
            return False
        # Warp size is 64 for AMD GPUs
        warp_size = 64
        # Calculate warp tile sizes
        warp_tile_m = block_m // warp_m
        warp_tile_n = block_n // warp_n
        if warp_tile_m <= 0 or warp_tile_n <= 0:
            return False
        # Check thread tile fits in warp tile
        if warp_tile_m % thread_tile_m != 0:
            return False
        if warp_tile_n % thread_tile_n != 0:
            return False
        # Check threads per warp constraint
        threads_per_warp = (warp_tile_m // thread_tile_m) * (warp_tile_n // thread_tile_n)
        if threads_per_warp > warp_size:
            return False
        return True

    def _generate_trait_combinations(self):
        """Generate all combinations of traits.

        Returns a list of (output_index, propagate_nan, pool_dim) tuples.
        """
        trait_config = self.config["trait_config"]
        output_index_values = trait_config.get("output_index").get("values")
        propagate_nan_values = trait_config.get("propagate_nan").get("values")
        pool_dim_values = trait_config.get("pool_dim").get("values")
        all_combinations = list(
            itertools.product(
                output_index_values,
                propagate_nan_values,
                pool_dim_values,
            )
        )
        return all_combinations

    def _generate_kernel_instance(self, block_config, trait_combo, k_block_per_cu, is_header=True):
        """Generate a single kernel instance.

        Returns (kernel_name, instance_code) where instance_code is the
        full C++ source text of one self-contained kernel header.
        """
        output_index, propagate_nan, pool_dim = trait_combo
        # Create kernel name
        kernel_name = f"pool{pool_dim}d_{self.datatype}_{self.reduce_op}"
        kernel_name += f"_{str(output_index).capitalize()}_{str(propagate_nan).capitalize()}"
        # Create block configuration string
        block_str = f"{block_config['block_m']}x{block_config['block_n']}_"
        block_str += f"{block_config['warp_m']}x{block_config['warp_n']}_"
        block_str += f"{block_config['thread_tile_m']}x{block_config['thread_tile_n']}"
        kernel_name += f"_{block_str}"
        # Determine output type (same as input for pooling)
        out_type = self.datatype
        compute_type = "fp32"  # Always use fp32 for compute
        index_type = "index_t"
        # Calculate warp tile sizes
        warp_tile_m = block_config["block_m"] // block_config["warp_m"]
        warp_tile_n = block_config["block_n"] // block_config["warp_n"]
        # Generate kernel instance code
        pragma_line = "#pragma once\n" if is_header else ""
        instance_code = f"""// Generated kernel instance for {kernel_name}
{pragma_line}
#include <cstdint>
#include <utility>
#include <tuple>
#include "ck_tile/core.hpp"
#include "ck_tile/host/kernel_launch.hpp"
#include "ck_tile/ops/pooling.hpp"
#include "ck_tile/ops/pooling/kernel/pool_kernel.hpp"
#include "ck_tile/ops/pooling/pipeline/pool_problem.hpp"
#include "ck_tile/ops/pooling/pipeline/pool_shape.hpp"
using InDataType = {get_dtype_string(self.datatype)};
using OutDataType = {get_dtype_string(out_type)};
using ComputeDataType = {get_dtype_string(compute_type)};
using IndexDataType = {get_dtype_string(index_type)};
// Reduce operation
using ReduceOpType = {get_reduce_op_string(self.reduce_op)};
// Kernel name for display
constexpr const char* KERNEL_NAME = "{kernel_name}";
constexpr const char* BLOCK_SHAPE_NAME = "{block_str}";
constexpr const char* REDUCE_OP_NAME = "{self.reduce_op}";
// Flags
constexpr bool OUTPUT_INDEX = {"true" if output_index else "false"};
constexpr bool PROPAGATE_NAN = {"true" if propagate_nan else "false"};
// Block configuration
using BlockWarps = ck_tile::sequence<{block_config['warp_m']}, {block_config['warp_n']}>;
using BlockTile = ck_tile::sequence<{block_config['block_m']}, {block_config['block_n']}>;
using WarpTile = ck_tile::sequence<{warp_tile_m}, {warp_tile_n}>;
using ThreadTile = ck_tile::sequence<{block_config['thread_tile_m']}, {block_config['thread_tile_n']}>;
using PoolBlockShape = ck_tile::PoolShape<BlockWarps, BlockTile, WarpTile, ThreadTile>;
// Pool problem definition
using Problem = ck_tile::PoolProblem<InDataType,
OutDataType,
ComputeDataType,
IndexDataType,
ReduceOpType,
OUTPUT_INDEX,
PROPAGATE_NAN,
PoolBlockShape>;
// Pool kernel type
using Kernel = ck_tile::PoolKernel<Problem>;
// Shape types for {pool_dim}D pooling
"""
        # Tensor/window shape aliases differ between 2D and 3D pooling.
        if pool_dim == 3:
            instance_code += """// 3D pooling shapes (N, D, H, W, C)
using TensorShapeType = decltype(ck_tile::make_tuple(
ck_tile::index_t{}, ck_tile::index_t{}, ck_tile::index_t{},
ck_tile::index_t{}, ck_tile::index_t{}));
// Window shape (Z, Y, X)
using WindowShapeType = decltype(ck_tile::make_tuple(
ck_tile::index_t{}, ck_tile::index_t{}, ck_tile::index_t{}));
"""
        else:
            instance_code += """// 2D pooling shapes (N, H, W, C)
using TensorShapeType = decltype(ck_tile::make_tuple(
ck_tile::index_t{}, ck_tile::index_t{},
ck_tile::index_t{}, ck_tile::index_t{}));
// Window shape (Y, X)
using WindowShapeType = decltype(ck_tile::make_tuple(
ck_tile::index_t{}, ck_tile::index_t{}));
"""
        instance_code += f"""
// Wrapper for simplified launch interface
struct SelectedKernel {{
template <typename TensorShape, typename WindowShape>
static float launch(const ck_tile::PoolHostArgs<TensorShape, WindowShape>& args,
const ck_tile::stream_config& stream) {{
auto kernel_args = Kernel::MakeKernelArgs(
const_cast<ck_tile::PoolHostArgs<TensorShape, WindowShape>&>(args));
if (!Kernel::IsSupportedArgument(kernel_args)) {{
throw std::runtime_error("Wrong! Arguments not supported! Skipping pooling kernel!");
}}
constexpr ck_tile::index_t kBlockPerCu = {k_block_per_cu};
const ck_tile::index_t kBlockSize = Kernel::BlockSize();
const ck_tile::index_t kGridSize = Kernel::CalculateGridSize(kernel_args);
if(stream.log_level_ > 0) {{
std::cout << "Launching kernel: " << KERNEL_NAME << '\\n'
<< "grid: " << kGridSize
<< ", blocks: " << kBlockSize
<< std::endl;
}}
// Launch kernel
float ave_time = ck_tile::launch_kernel(
stream,
ck_tile::make_kernel<kBlockPerCu>(Kernel{{}}, kGridSize, kBlockSize, 0, kernel_args));
return ave_time;
}}
}};
"""
        return kernel_name, instance_code

    def run(self, num_workers=None):
        """Run the builder to generate individual kernel files"""
        self.generate_individual(num_workers)

    def generate_individual(self, num_workers=None):
        """Generate individual kernel files for separate compilation.

        Fans the work out to a process pool (default: min(cpu_count, 8)
        workers) and writes a CMake include file listing all kernels.
        """
        if num_workers is None:
            num_workers = min(multiprocessing.cpu_count(), 8)
        block_configs = self._get_block_configs()
        trait_combos = self._generate_trait_combinations()
        k_block_per_cu = self.config.get("k_block_per_cu", 1)
        # Prepare work items
        work_items = []
        for block_config in block_configs:
            for trait_combo in trait_combos:
                work_items.append(
                    (
                        block_config,
                        trait_combo,
                        k_block_per_cu,
                        self.working_path,
                        self.gpu_target,
                        self.datatype,
                        self.reduce_op,
                        self.config_json,
                    )
                )
        print(
            f"Generating {len(work_items)} individual kernel files using {num_workers} workers..."
        )
        print(f" Block configs: {len(block_configs)}")
        print(f" Trait combinations: {len(trait_combos)}")
        print(f" Total kernels: {len(work_items)}")
        # Process work items
        kernel_list = []
        completed = 0
        with concurrent.futures.ProcessPoolExecutor(max_workers=num_workers) as executor:
            future_to_item = {
                executor.submit(_generate_single_kernel_individual, item): item
                for item in work_items
            }
            for future in concurrent.futures.as_completed(future_to_item):
                completed += 1
                # Progress is reported every 10 kernels and at completion.
                if completed % 10 == 0 or completed == len(work_items):
                    print(f" Progress: {completed}/{len(work_items)} kernels generated")
                try:
                    result = future.result()
                    if result:
                        kernel_list.append(result)
                except Exception as exc:
                    item = future_to_item[future]
                    print(f"Kernel generation failed for {item}: {exc}")
        # Sort kernel list
        kernel_list.sort(key=lambda x: x[0])
        # Generate CMake include file
        self._generate_cmake_individual_targets(kernel_list)
        print(f"Generated {len(kernel_list)} individual kernel files in {self.working_path}")

    def _generate_cmake_individual_targets(self, kernel_list):
        """Generate CMake include file that creates individual targets"""
        cmake_code = f"""# Generated CMake file for individual Pool targets
# Datatype: {self.datatype}, ReduceOp: {self.reduce_op}
"""
        # NOTE(review): the CMake function create_individual_pool_target is
        # declared with five parameters (datatype, reduce_op, trait,
        # block_config, config_json) but only four arguments are emitted
        # here -- confirm against the CMake side.
        for kernel_name, trait_combo, block_config in kernel_list:
            block_str = f"{block_config['block_m']}x{block_config['block_n']}_"
            block_str += f"{block_config['warp_m']}x{block_config['warp_n']}_"
            block_str += f"{block_config['thread_tile_m']}x{block_config['thread_tile_n']}"
            trait_str = "_".join(str(x) for x in trait_combo)
            cmake_code += f'create_individual_pool_target("{self.datatype}" "{self.reduce_op}" "{trait_str}" "{block_str}")\n'
        with open(self.working_path / "pool_individual_targets.cmake", "w") as f:
            f.write(cmake_code)
def _generate_single_kernel_individual(work_item):
    """Worker: render one kernel-instance header file.

    Returns (kernel_name, trait_combo, block_config) on success, or None
    (after printing the error) on failure.
    """
    (
        block_config,
        trait_combo,
        k_block_per_cu,
        working_path,
        gpu_target,
        datatype,
        reduce_op,
        config_json,
    ) = work_item
    # Each process-pool worker builds its own throwaway builder instance.
    builder = PoolKernelBuilder(working_path, gpu_target, datatype, reduce_op, config_json)
    try:
        kernel_name, instance_code = builder._generate_kernel_instance(
            block_config, trait_combo, k_block_per_cu
        )
        # Drop the leading "pool" prefix to keep filenames short.
        short_name = kernel_name[4:] if kernel_name.startswith("pool") else kernel_name
        header_file = working_path / f"pool_single_{short_name}.hpp"
        with open(header_file, "w") as f:
            f.write(instance_code)
        return (kernel_name, trait_combo, block_config)
    except Exception as e:
        print(f"Error generating individual kernel: {e}")
        return None
def main():
parser = argparse.ArgumentParser(
description="Pool kernel instance builder with parallel support"
)
parser.add_argument("--working_path", required=True, help="Working directory path")
parser.add_argument(
"--gpu_target",
required=True,
help="GPU target architecture",
)
parser.add_argument(
"--datatype",
required=True,
choices=["fp16", "fp32", "bf16"],
help="Data type",
)
parser.add_argument(
"--reduce_op",
required=True,
choices=["max", "min", "avg"],
help="Reduce operation",
)
parser.add_argument("--config_json", help="Configuration JSON file")
parser.add_argument(
"--num_workers", type=int, help="Number of parallel workers (default: auto)"
)
parser.add_argument(
"--gen_all_individual",
action="store_true",
help="Generate individual kernel files",
)
parser.add_argument(
"--gen_single", action="store_true", help="Generate a single kernel file"
)
parser.add_argument("--kernel_name", help="Kernel name for single generation")
parser.add_argument(
"--block_config", help="Block configuration string for single generation"
)
parser.add_argument(
"--trait_combo", help="Trait combination string for single generation"
)
parser.add_argument(
"--list_kernels",
action="store_true",
help="List kernel configurations without generating files",
)
args = parser.parse_args()
# Create builder
builder = PoolKernelBuilder(
args.working_path, args.gpu_target, args.datatype, args.reduce_op, args.config_json
)
if args.list_kernels:
builder.write_kernel_list()
elif args.gen_single:
# Generate a single kernel file
if not args.kernel_name or not args.block_config or not args.trait_combo:
parser.error(
"--gen_single requires --kernel_name, --block_config, and --trait_combo"
)
# Parse block config
block_parts = args.block_config.split("_")
block_dims = block_parts[0].split("x")
warp_dims = block_parts[1].split("x")
thread_tile_dims = block_parts[2].split("x")
block_config = {
"block_m": int(block_dims[0]),
"block_n": int(block_dims[1]),
"warp_m": int(warp_dims[0]),
"warp_n": int(warp_dims[1]),
"thread_tile_m": int(thread_tile_dims[0]),
"thread_tile_n": int(thread_tile_dims[1]),
}
# Parse trait combo
trait_parts = args.trait_combo.split("_")
trait_combo = (
trait_parts[0] == "True", # output_index
trait_parts[1] == "True", # propagate_nan
int(trait_parts[2]), # pool_dim
)
k_block_per_cu = builder.config.get("k_block_per_cu", 1)
# Generate the kernel
kernel_name, instance_code = builder._generate_kernel_instance(
block_config, trait_combo, k_block_per_cu
)
# Write the file
simplified_name = kernel_name
if simplified_name.startswith("pool"):
simplified_name = simplified_name[4:]
header_file = builder.working_path / f"pool_single_{simplified_name}.hpp"
with open(header_file, "w") as f:
f.write(instance_code)
print(f"Generated {header_file}")
elif args.gen_all_individual:
builder.run(args.num_workers)
else:
parser.error(
"Must specify one of: --list_kernels, --gen_all_individual, or --gen_single"
)
# Script entry point (argparse calls sys.exit itself on --help / errors).
if __name__ == "__main__":
    main()

View File

@@ -9,48 +9,404 @@
#include "ck_tile/host/device_prop.hpp"
#include "ck_tile/ops/pooling.hpp"
#include "ck_tile/host/reference/reference_pool.hpp"
#include "pool_benchmark.hpp"
class PoolProfiler
{
public:
static PoolProfiler& instance(Settings settings)
static PoolProfiler& instance(Setting setting)
{
static PoolProfiler instance{settings};
static PoolProfiler instance{setting};
return instance;
}
// Overload for single kernel benchmarking
template <typename TensorShape, typename WindowShape>
void benchmark(PoolProblem& pool_problem,
std::function<float(const ck_tile::PoolHostArgs&, const ck_tile::stream_config&)>
kernel_func)
std::function<float(const ck_tile::PoolHostArgs<TensorShape, WindowShape>&,
const ck_tile::stream_config&)> kernel_func)
{
// Create a vector with a single callable that returns both name and time
std::vector<std::function<std::tuple<std::string, float>(ck_tile::PoolHostArgs&,
const ck_tile::stream_config&)>>
std::vector<std::function<std::tuple<std::string, float>(
ck_tile::PoolHostArgs<TensorShape, WindowShape>&, const ck_tile::stream_config&)>>
callables;
callables.push_back(
[kernel_func](ck_tile::PoolHostArgs& args, const ck_tile::stream_config& stream) {
float time = kernel_func(args, stream);
return std::make_tuple(std::string(KERNEL_NAME), time);
});
callables.push_back([kernel_func](ck_tile::PoolHostArgs<TensorShape, WindowShape>& args,
const ck_tile::stream_config& stream) {
float time = kernel_func(args, stream);
return std::make_tuple(std::string(KERNEL_NAME), time);
});
benchmark(pool_problem, callables);
}
////
template <typename TensorShape, typename WindowShape>
void benchmark(
PoolProblem& pool_problem,
std::vector<std::function<std::tuple<std::string, float>(
ck_tile::PoolHostArgs<TensorShape, WindowShape>&, const ck_tile::stream_config&)>>&
callables)
{
// Calculate output dimensions based on pool dimension
const ck_tile::index_t N = pool_problem.N;
const ck_tile::index_t D = pool_problem.D;
const ck_tile::index_t H = pool_problem.H;
const ck_tile::index_t W = pool_problem.W;
const ck_tile::index_t C = pool_problem.C;
const ck_tile::index_t Z = pool_problem.windowZ;
const ck_tile::index_t Y = pool_problem.windowY;
const ck_tile::index_t X = pool_problem.windowX;
const ck_tile::index_t Sz = pool_problem.strideZ;
const ck_tile::index_t Sy = pool_problem.strideY;
const ck_tile::index_t Sx = pool_problem.strideX;
const ck_tile::index_t Dz = pool_problem.dilationZ;
const ck_tile::index_t Dy = pool_problem.dilationY;
const ck_tile::index_t Dx = pool_problem.dilationX;
const ck_tile::index_t LeftPz = pool_problem.leftPadZ;
const ck_tile::index_t LeftPy = pool_problem.leftPadY;
const ck_tile::index_t LeftPx = pool_problem.leftPadX;
const ck_tile::index_t RightPz = pool_problem.rightPadZ;
const ck_tile::index_t RightPy = pool_problem.rightPadY;
const ck_tile::index_t RightPx = pool_problem.rightPadX;
// Calculate effective window sizes
const ck_tile::index_t Zs = (Z - 1) * Dz + 1;
const ck_tile::index_t Ys = (Y - 1) * Dy + 1;
const ck_tile::index_t Xs = (X - 1) * Dx + 1;
// Calculate output dimensions
const ck_tile::index_t Do = (D + LeftPz + RightPz - Zs) / Sz + 1;
const ck_tile::index_t Ho = (H + LeftPy + RightPy - Ys) / Sy + 1;
const ck_tile::index_t Wo = (W + LeftPx + RightPx - Xs) / Sx + 1;
// Create input/output tensors based on pool dimension (3D: NDHWC, 2D: NHWC)
ck_tile::HostTensor<InDataType> in_tensor(
pool_problem.poolDim == 3
? std::vector<std::size_t>{static_cast<std::size_t>(N),
static_cast<std::size_t>(D),
static_cast<std::size_t>(H),
static_cast<std::size_t>(W),
static_cast<std::size_t>(C)}
: std::vector<std::size_t>{static_cast<std::size_t>(N),
static_cast<std::size_t>(H),
static_cast<std::size_t>(W),
static_cast<std::size_t>(C)});
ck_tile::HostTensor<OutDataType> out_tensor(
pool_problem.poolDim == 3
? std::vector<std::size_t>{static_cast<std::size_t>(N),
static_cast<std::size_t>(Do),
static_cast<std::size_t>(Ho),
static_cast<std::size_t>(Wo),
static_cast<std::size_t>(C)}
: std::vector<std::size_t>{static_cast<std::size_t>(N),
static_cast<std::size_t>(Ho),
static_cast<std::size_t>(Wo),
static_cast<std::size_t>(C)});
ck_tile::HostTensor<OutDataType> out_host_result(
pool_problem.poolDim == 3
? std::vector<std::size_t>{static_cast<std::size_t>(N),
static_cast<std::size_t>(Do),
static_cast<std::size_t>(Ho),
static_cast<std::size_t>(Wo),
static_cast<std::size_t>(C)}
: std::vector<std::size_t>{static_cast<std::size_t>(N),
static_cast<std::size_t>(Ho),
static_cast<std::size_t>(Wo),
static_cast<std::size_t>(C)});
ck_tile::HostTensor<IndexDataType> out_index_tensor(
pool_problem.outputIndex
? (pool_problem.poolDim == 3
? std::vector<std::size_t>{static_cast<std::size_t>(N),
static_cast<std::size_t>(Do),
static_cast<std::size_t>(Ho),
static_cast<std::size_t>(Wo),
static_cast<std::size_t>(C)}
: std::vector<std::size_t>{static_cast<std::size_t>(N),
static_cast<std::size_t>(Ho),
static_cast<std::size_t>(Wo),
static_cast<std::size_t>(C)})
: std::vector<std::size_t>{1});
ck_tile::HostTensor<IndexDataType> out_index_host_result(
pool_problem.outputIndex
? (pool_problem.poolDim == 3
? std::vector<std::size_t>{static_cast<std::size_t>(N),
static_cast<std::size_t>(Do),
static_cast<std::size_t>(Ho),
static_cast<std::size_t>(Wo),
static_cast<std::size_t>(C)}
: std::vector<std::size_t>{static_cast<std::size_t>(N),
static_cast<std::size_t>(Ho),
static_cast<std::size_t>(Wo),
static_cast<std::size_t>(C)})
: std::vector<std::size_t>{1});
// Initialize input tensor
if(setting_.init_method_ == 0)
{
ck_tile::FillUniformDistribution<InDataType>{-5.f, 5.f}(in_tensor);
}
else if(setting_.init_method_ == 1)
{
ck_tile::FillMonotonicSeq<InDataType>{}(in_tensor);
}
else if(setting_.init_method_ == 2)
{
ck_tile::FillConstant<InDataType>{static_cast<InDataType>(1)}(in_tensor);
}
else
{
in_tensor.SetZero();
}
// Allocate device memory
ck_tile::DeviceMem in_dev_buf(in_tensor.get_element_space_size_in_bytes());
ck_tile::DeviceMem out_dev_buf(out_tensor.get_element_space_size_in_bytes());
ck_tile::DeviceMem out_index_dev_buf(
pool_problem.outputIndex ? out_index_tensor.get_element_space_size_in_bytes() : 0);
in_dev_buf.ToDevice(in_tensor.data());
out_dev_buf.SetZero();
if(pool_problem.outputIndex)
{
out_index_dev_buf.SetZero();
}
// Create shapes for host args
TensorShape input_shape, output_shape, input_strides, output_strides;
WindowShape window_lengths, window_strides, window_dilations, input_left_pads,
input_right_pads;
// Create host arguments
ck_tile::PoolHostArgs<TensorShape, WindowShape> pool_args{
in_dev_buf.GetDeviceBuffer(),
out_dev_buf.GetDeviceBuffer(),
pool_problem.outputIndex ? out_index_dev_buf.GetDeviceBuffer() : nullptr,
input_shape,
output_shape,
input_strides,
output_strides,
window_lengths,
window_strides,
window_dilations,
input_left_pads,
input_right_pads};
// Run reference if verification is enabled
// (Reference computation would be added here based on pool dimension)
for(auto& callable : callables)
{
auto kernel_run_result = callable(pool_args,
ck_tile::stream_config{nullptr,
true,
setting_.log_,
setting_.n_warmup_,
setting_.n_repeat_,
setting_.is_gpu_timer_,
setting_.flush_cache_,
setting_.rotating_count_});
process_result(pool_problem,
out_dev_buf,
out_host_result,
out_tensor,
out_index_dev_buf,
out_index_host_result,
out_index_tensor,
kernel_run_result);
}
}
void process_result(const PoolProblem& pool_problem,
ck_tile::DeviceMem& out_dev_buf,
ck_tile::HostTensor<OutDataType>& out_host_result,
ck_tile::HostTensor<OutDataType>& out_dev_result,
ck_tile::DeviceMem& out_index_dev_buf,
ck_tile::HostTensor<IndexDataType>& out_index_host_result,
ck_tile::HostTensor<IndexDataType>& out_index_dev_result,
const std::tuple<std::string, float>& kernel_run_result)
{
auto [name, avg_time] = kernel_run_result;
KernelInstance kernel_instance{name, pool_problem, {-1.0f, -1.0f, -1.0f}};
// Compute performance metrics
const ck_tile::index_t N = pool_problem.N;
const ck_tile::index_t D = pool_problem.D;
const ck_tile::index_t H = pool_problem.H;
const ck_tile::index_t W = pool_problem.W;
const ck_tile::index_t C = pool_problem.C;
const ck_tile::index_t Z = pool_problem.windowZ;
const ck_tile::index_t Y = pool_problem.windowY;
const ck_tile::index_t X = pool_problem.windowX;
const ck_tile::index_t Sz = pool_problem.strideZ;
const ck_tile::index_t Sy = pool_problem.strideY;
const ck_tile::index_t Sx = pool_problem.strideX;
const ck_tile::index_t Dz = pool_problem.dilationZ;
const ck_tile::index_t Dy = pool_problem.dilationY;
const ck_tile::index_t Dx = pool_problem.dilationX;
const ck_tile::index_t Zs = (Z - 1) * Dz + 1;
const ck_tile::index_t Ys = (Y - 1) * Dy + 1;
const ck_tile::index_t Xs = (X - 1) * Dx + 1;
const ck_tile::index_t Do =
(D + pool_problem.leftPadZ + pool_problem.rightPadZ - Zs) / Sz + 1;
const ck_tile::index_t Ho =
(H + pool_problem.leftPadY + pool_problem.rightPadY - Ys) / Sy + 1;
const ck_tile::index_t Wo =
(W + pool_problem.leftPadX + pool_problem.rightPadX - Xs) / Sx + 1;
// Calculate FLOPs: for pooling, we count one compare/add per window element per output
// element
std::size_t window_size =
static_cast<std::size_t>(Z) * static_cast<std::size_t>(Y) * static_cast<std::size_t>(X);
std::size_t output_elements = static_cast<std::size_t>(N) * static_cast<std::size_t>(Do) *
static_cast<std::size_t>(Ho) * static_cast<std::size_t>(Wo) *
static_cast<std::size_t>(C);
std::size_t flop = output_elements * window_size;
// Calculate memory bandwidth
std::size_t num_byte = sizeof(InDataType) * N * D * H * W * C +
sizeof(OutDataType) * N * Do * Ho * Wo * C;
// Update performance results
kernel_instance.perf_result_.latency_ = avg_time;
kernel_instance.perf_result_.tflops_ = static_cast<float>(flop) / 1.E9 / avg_time;
kernel_instance.perf_result_.bandwidth_ = num_byte / 1.E6 / avg_time;
if(setting_.log_ > 0 && !setting_.json_output_)
{
std::cout << kernel_instance << std::endl;
}
// Verify result
out_dev_buf.FromDevice(out_dev_result.data());
bool verified_correct = true;
if(setting_.verify_)
{
verified_correct = compare_pool_results(name, out_dev_result, out_host_result);
if(pool_problem.outputIndex)
{
out_index_dev_buf.FromDevice(out_index_dev_result.data());
verified_correct =
verified_correct &&
compare_pool_index_results(name, out_index_dev_result, out_index_host_result);
}
}
if(verified_correct)
{
kernel_instances_.emplace_back(kernel_instance);
}
else
{
std::cout << "Verification failed, skip kernel: " << name << std::endl;
}
// Clear tensors
out_dev_buf.SetZero();
out_dev_result.SetZero();
}
KernelInstance select_best_instance(Metric metric)
{
if(kernel_instances_.empty())
throw std::runtime_error("Empty instances");
auto kernel_instance = *std::max_element(kernel_instances_.begin(),
kernel_instances_.end(),
[metric](const auto& a, const auto& b) {
return PerformanceResult::compare(
b.perf_result_, a.perf_result_, metric);
});
if(setting_.json_output_)
{
// Output clean JSON only
std::cout << kernel_instance << std::endl;
}
else
{
std::cout << "**********************************" << std::endl;
std::cout << "According to given metrics: " << get_metric_name(metric) << "\n"
<< "Current kernel performance is: " << kernel_instance << std::endl;
std::cout << "**********************************" << std::endl;
}
if(!setting_.csv_filename_.empty())
{
std::ofstream file(setting_.csv_filename_ + ".csv", std::ios::app);
if(!file.is_open())
{
std::cerr << "Warning: Failed to open CSV file for writing." << std::endl;
}
else
{
if(file.tellp() == 0)
{
file << "rocm_version,device_name,"
<< "in_dtype,out_dtype,compute_dtype,index_dtype,"
<< "block_shape,reduce_op,pool_dim,"
<< "N,D,H,W,C,"
<< "window_z,window_y,window_x,"
<< "stride_z,stride_y,stride_x,"
<< "dilation_z,dilation_y,dilation_x,"
<< "left_pad_z,left_pad_y,left_pad_x,"
<< "right_pad_z,right_pad_y,right_pad_x,"
<< "output_index,propagate_nan," << "name,"
<< "latency(ms),tflops(TFlops),bandwidth(GB/s),metric\n";
}
const auto& problem = kernel_instance.problem_;
const auto& name = kernel_instance.name_;
const auto& perf = kernel_instance.perf_result_;
file << get_rocm_version() << "," << ck_tile::get_device_name() << ","
<< problem.inDType << "," << problem.outDType << "," << problem.computeDType
<< "," << problem.indexDType << "," << problem.blockShape << ","
<< problem.reduceOp << "," << problem.poolDim << "," << problem.N << ","
<< problem.D << "," << problem.H << "," << problem.W << "," << problem.C << ","
<< problem.windowZ << "," << problem.windowY << "," << problem.windowX << ","
<< problem.strideZ << "," << problem.strideY << "," << problem.strideX << ","
<< problem.dilationZ << "," << problem.dilationY << "," << problem.dilationX
<< "," << problem.leftPadZ << "," << problem.leftPadY << ","
<< problem.leftPadX << "," << problem.rightPadZ << "," << problem.rightPadY
<< "," << problem.rightPadX << "," << problem.outputIndex << ","
<< problem.propagateNan << "," << name << "," << std::fixed
<< std::setprecision(4) << perf.latency_ << "," << std::fixed
<< std::setprecision(4) << perf.tflops_ << "," << std::fixed
<< std::setprecision(4) << perf.bandwidth_ << "," << get_metric_name(metric)
<< "\n";
if(!file)
{
std::cerr << "Warning: Error occurred while writing to CSV file." << std::endl;
}
}
}
return kernel_instance;
}
////
PoolProfiler(const PoolProfiler&) = delete;
PoolProfiler& operator=(const PoolProfiler&) = delete;
private:
~PoolProfiler() { kernel_instances_.clear(); }
PoolProfiler(Settings settings) : settings_(settings) {}
PoolProfiler(Setting setting) : setting_(setting) {}
Settings settings_;
Setting setting_;
std::vector<KernelInstance> kernel_instances_;
}
};