Dlejeune/ck tile 2d multiple reductions (#3147)

* WIP

* Add Unit tests for the Multi Reduction Kernel

* clang format

* Rename multiblock to threadwise

* Multiblock WIP

* Fix multi reduce multi block unit tests

* Multi Reduce Tile Engine: WIP

* refactoring + try addressing precision error

* Fix multiops examples

* Cleanup

* Clean up tile engine's reduce op

* Update changelog

* Fix remod/clang

* Fix dates

* Fix documentation & missing file

* Fix comments

* Use the update_tile api in the multi-block kernel

* Unify threadwise/multiblock into a single kernel + default multiblock output to float in tests

* Add TilePartitioner

* Cleanup

* Add warning when no data to process, in the example

* Refactoring Reduce kernel Tile Partitioner + cleanup

* Move the tile partitioner to its own file

* Add missing includes

* Fix copyright header with update_amd_copyright_headers.py

* Fix change of interface in Reduce2dProblem

---------

Co-authored-by: Damien Lejeune <damien.lejeune@amd.com>
Co-authored-by: Adam Osewski <19374865+aosewski@users.noreply.github.com>
This commit is contained in:
damien-lejeune
2026-01-09 11:16:37 +01:00
committed by GitHub
parent e3884bbf05
commit 4216d43da8
26 changed files with 2661 additions and 2 deletions

View File

@@ -7,4 +7,5 @@ include_directories(BEFORE
add_subdirectory(ops/gemm)
add_subdirectory(ops/gemm_streamk)
add_subdirectory(ops/reduce)

View File

@@ -0,0 +1,126 @@
# Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
# SPDX-License-Identifier: MIT
# cmake_minimum_required(VERSION 4.2)
# enable_testing()
# User-tunable cache options: which element datatypes and kernel variants to
# generate Multi Reduce tests for. Both are semicolon-separated CMake lists.
set(MULTI_REDUCE_DATATYPE "fp16" CACHE STRING "List of datatypes Multi Reduce (semicolon-separated)")
set(MULTI_REDUCE_VARIANTS "multiops_multiblock;multiops_threadwise" CACHE STRING "List of variants for Multi Reduce (semicolon-separated)")
# Generate and register the Multi Reduce tests for one (datatype, variant).
#
# The Python instance builder runs twice at configure time: once to list the
# kernel blobs it will produce, once to generate them. Each generated blob
# becomes its own gtest executable plus a ctest entry labelled "multi_reduce".
#
# Arguments:
#   datatype - element type string understood by the builder (e.g. "fp16")
#   variant  - "multiops_multiblock" or "multiops_threadwise"
function(build_multi_reduce_for_datatype datatype variant)
    # Filter GPU targets to only gfx942, and gfx950
    set(GPU_TARGETS "")
    set(DESIRED_TARGETS "gfx942;gfx950")
    set(VALID_VARIANTS "multiops_multiblock;multiops_threadwise")
    foreach(target IN LISTS SUPPORTED_GPU_TARGETS)
        if(target IN_LIST DESIRED_TARGETS)
            list(APPEND GPU_TARGETS ${target})
        endif()
    endforeach()
    # Skip compilation if no matching targets found
    if(NOT GPU_TARGETS)
        message(WARNING "Skipping Tile Engine for Multi Reduction Kernel: No supported GPU targets (gfx942, gfx950) found in SUPPORTED_GPU_TARGETS: ${SUPPORTED_GPU_TARGETS}")
        return()
    endif()
    message(STATUS "Building Reduction for GPU targets: ${GPU_TARGETS}")

    set(working_path "${CMAKE_CURRENT_BINARY_DIR}/${datatype}/${variant}")
    file(MAKE_DIRECTORY "${working_path}")

    # Select the tile-configuration JSON. To experiment with a custom tile
    # configuration, point json_blob at configs/user_provided_config.json.
    if(variant IN_LIST VALID_VARIANTS)
        set(json_blob "${CMAKE_CURRENT_LIST_DIR}/configs/default_multi_reduce_config.json")
    else()
        message(WARNING "Unknown Multi Reduce variant: ${variant}.")
        return()
    endif()

    # Step 1: ask the instance builder which kernel blobs it will generate.
    execute_process(
        COMMAND ${Python3_EXECUTABLE} ${CMAKE_CURRENT_LIST_DIR}/reduce_instance_builder.py
                --working_path "${working_path}"
                --datatype ${datatype}
                --variant ${variant}
                --config_json "${json_blob}"
                --list_blobs
                --gpu_target "${GPU_TARGETS}"
        RESULT_VARIABLE ret
    )
    if(NOT ret EQUAL 0)
        message(FATAL_ERROR "Failed to list kernels for ${datatype} ${variant}: ${ret}")
    endif()

    # Step 2: generate the blob sources.
    execute_process(
        COMMAND ${Python3_EXECUTABLE} ${CMAKE_CURRENT_LIST_DIR}/reduce_instance_builder.py
                --working_path "${working_path}"
                --datatype ${datatype}
                --config_json "${json_blob}"
                --variant "${variant}"
                --gen_blobs
                --gpu_target "${GPU_TARGETS}"
        RESULT_VARIABLE ret
    )
    if(NOT ret EQUAL 0)
        message(FATAL_ERROR "Failed to generate kernels for ${datatype} ${variant}: ${ret}")
    endif()
    message(STATUS "Generated ${datatype} ${variant} reduction kernel blobs at: ${working_path}")

    # Register one gtest executable + ctest entry per generated blob.
    # (The blob list is read once here; the original read the same file twice.)
    file(STRINGS "${working_path}/reduce_${variant}_blobs_list.txt" test_basenames)
    set(generated_test_targets "")
    foreach(test_base IN LISTS test_basenames)
        string(PREPEND test_base "test_")
        set(test_src "${working_path}/${test_base}.cpp")
        set(test_target "${test_base}")
        add_executable(${test_target} ${test_src})
        target_include_directories(${test_target} PRIVATE
            "${CMAKE_SOURCE_DIR}/test/ck_tile/reduce/"
            ${working_path}
        )
        # NOTE: -Wno-dev is a cmake command-line option, not a compiler flag,
        # so it is intentionally not passed to the compiler here.
        target_compile_options(${test_target} PRIVATE -Wno-global-constructors)
        target_link_libraries(${test_target} PRIVATE gtest gtest_main)
        add_test(NAME ${test_target} COMMAND ${test_target})
        set_tests_properties(${test_target} PROPERTIES LABELS "multi_reduce")
        list(APPEND generated_test_targets ${test_target})
    endforeach()
    # Aggregate target that builds every generated test for this combination.
    # (Previously this depended on the raw blob basenames, which are not build
    # outputs and therefore could never be resolved by the generator.)
    add_custom_target(test_reduce_${variant}_${datatype} DEPENDS ${generated_test_targets})
endfunction()
# Process each datatype in isolation
# Drive the generator over every (datatype, variant) pair independently.
foreach(reduce_datatype IN LISTS MULTI_REDUCE_DATATYPE)
    foreach(reduce_variant IN LISTS MULTI_REDUCE_VARIANTS)
        build_multi_reduce_for_datatype(${reduce_datatype} ${reduce_variant})
    endforeach()
endforeach()

View File

@@ -0,0 +1,51 @@
{
"problem" : {
},
"problem_size" : {
"input_shape" : [
[128, 64, 2],
[32, 8, 64, 16]
]
},
"tile_config" : {
"fixed": [
{"tile_m": 128, "tile_n": 128, "warp_per_block_m": 4, "warp_per_block_n": 1, "warp_tile_m": 32, "warp_tile_n": 128, "thread_tile_m": 8, "thread_tile_n": 8}
],
"combination": {
"tile_m" : {
"values" : [
]
},
"tile_n" : {
"values": [
]
},
"warp_per_block_m" : {
"values" : [
]
},
"warp_per_block_n" : {
"values" : [
]
},
"warp_tile_m" : {
"values" : [
]
},
"warp_tile_n" : {
"values" : [
]
},
"thread_tile_m" : {
"values" : [
]
},
"thread_tile_n" : {
"values" : [
]
}
}
}
}

View File

@@ -0,0 +1,11 @@
# Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
# SPDX-License-Identifier: MIT
import json
class ReduceConfig:
    """Thin wrapper that loads a kernel-generation config from a JSON file.

    Attributes:
        config_json_path: path of the JSON file that was read.
        config_dict: parsed JSON content as a plain dict.
    """

    def __init__(self, config_json_path: str):
        self.config_json_path = config_json_path
        with open(config_json_path, "r") as handle:
            self.config_dict = json.loads(handle.read())

View File

@@ -0,0 +1,171 @@
# Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
# SPDX-License-Identifier: MIT
import argparse
from pathlib import Path
from reduce_config import ReduceConfig
from reduce_parameter import get_parameter_combinations, TYPE_MAP
class MultiReduceBase:
    """Base code generator for ck_tile multi-reduction gtest source blobs.

    Subclasses select a kernel variant by overriding ``name``, ``header``,
    ``test_type`` and (optionally) ``output_type``.
    """

    def __init__(self, working_path, gpu_target, datatype, config_json=None):
        # Directory where the blob list and generated .cpp files are written.
        self.working_path = Path(working_path)
        self.gpu_target = gpu_target
        self.datatype = datatype
        # Output type defaults to the input datatype; subclasses may override.
        self.output_type = self.datatype
        # Config is optional here; _generate_instances() rejects a missing one.
        self.config = ReduceConfig(config_json) if config_json else None
        self.name = "multiops_base"
        # Maps input rank (3 or 4) -> name of the typed-test method to invoke.
        self.signature_test = {
            3: "Test3D_KeepDim0_ReduceDim12",
            4: "Test4D_KeepDim01_ReduceDim23",
        }
        self.header = "test_multi_reduce2d_multiblock_impl.hpp"
        self.test_type = "TestCkTileMultiReduce2D"

    def _generate_instances(self):
        """Return a list of (source_code, params) for every configuration.

        Raises:
            ValueError: if no config JSON was supplied at construction.
        """
        if not self.config:
            raise ValueError("Configuration not provided.")
        instances = []
        for params in get_parameter_combinations(self.config.config_dict):
            instance = self._create_instance(params)
            instances.append((instance, params))
        return instances

    def _create_instance(self, parameters):
        # Thin hook around _get_test; kept separate so subclasses can extend it.
        generated_test = self._get_test(parameters)
        return generated_test

    def do_list_blobs(self):
        """Write reduce_<name>_blobs_list.txt with one blob basename per line."""
        with open(
            self.working_path / Path(f"reduce_{self.name}_blobs_list.txt"), "w"
        ) as f:
            combos_str = [
                f"{self.name}_{params}"
                for params in get_parameter_combinations(self.config.config_dict)
            ]
            f.write("\n".join(combos_str))
            f.write("\n")

    def do_generate_blobs(self):
        """Generate one test_<name>_<params>.cpp file per configuration."""
        instances = self._generate_instances()
        for instance_code, params in instances:
            blob_filename = self.working_path / Path(f"test_{self.name}_{params}.cpp")
            with open(blob_filename, "w") as f:
                f.write(instance_code)

    def _get_test(self, params):
        """Render the C++ gtest source for one parameter combination.

        Raises:
            ValueError: if the input rank has no entry in signature_test.
        """
        dimension = len(params.input_shape)
        signature = self.signature_test.get(dimension, None)
        if not signature:
            raise ValueError(
                f"No test signature found for input shape dimension: {dimension}"
            )
        shape_str = [str(i) for i in params.input_shape]
        # "128,64,2" for the Run* call, "128x64x2" for the test name suffix.
        input_shape_arg_str = ",".join(shape_str)
        input_shape_str = "x".join(shape_str)
        t = f"""#include "{self.header}"
using Shape_BlockWarps = ck_tile::sequence<{params.warp_per_block_m}, {params.warp_per_block_n}>;
using Shape_BlockTile = ck_tile::sequence<{params.tile_m}, {params.tile_n}>;
using Shape_WarpTile = ck_tile::sequence<{params.warp_m}, {params.warp_n}>;
using Shape_ThreadTile = ck_tile::sequence<{params.thread_tile_m}, {params.thread_tile_n}>;
using TestConfig =
std::tuple<{TYPE_MAP[self.datatype]},
float,
{TYPE_MAP[self.output_type]},
ck_tile::tuple<ck_tile::ReduceOp::Add, ck_tile::ReduceOp::Add>, // Intra block reductions
ck_tile::tuple<ck_tile::element_wise::PassThrough, ck_tile::element_wise::UnarySquare>, // Elementwise ops
ck_tile::tuple<ck_tile::element_wise::PassThrough, ck_tile::element_wise::UnaryDivide>, // Accumulator Elementiwise ops, intra block
ck_tile::tuple<ck_tile::ReduceOp::Add, ck_tile::ReduceOp::Add>, // Inter block reduction
Shape_BlockWarps,
Shape_BlockTile,
Shape_WarpTile,
Shape_ThreadTile>;
// Register the type(s) for the typed test suite
typedef ::testing::Types<TestConfig> TestTypes;
TYPED_TEST_SUITE({self.test_type}, TestTypes);
TYPED_TEST({self.test_type}, {signature}_{input_shape_str})
{{
this->Run{signature}({input_shape_arg_str});
}}
"""
        return t
class MultiReduceThreadwiseKernelBuilder(MultiReduceBase):
    """Generates gtest blobs for the threadwise multi-reduction kernel."""

    def __init__(self, working_path, gpu_target, datatype, config_json=None):
        super().__init__(working_path, gpu_target, datatype, config_json)
        # Override the base defaults with the threadwise-variant identifiers.
        self.test_type = "TestCkTileMultiReduceThreadwise"
        self.header = "test_multi_reduce2d_threadwise_impl.hpp"
        self.name = "multiops_threadwise"
class MultiReduceMultiBlockKernelBuilder(MultiReduceBase):
    """Generates gtest blobs for the multi-block multi-reduction kernel."""

    def __init__(self, working_path, gpu_target, datatype, config_json=None):
        super().__init__(working_path, gpu_target, datatype, config_json)
        # Override the base defaults with the multiblock-variant identifiers.
        self.name = "multiops_multiblock"
        self.header = "test_multi_reduce2d_multiblock_impl.hpp"
        self.test_type = "TestCkTileMultiReduceMultiblock"
        # The output buffer doubles as the accumulator for this kernel, so the
        # output type is pinned to float regardless of the input datatype.
        self.output_type = "float"
def main(args):
    """Dispatch to the right kernel builder and run the requested actions.

    Args:
        args: argparse namespace with working_path, gpu_target, datatype,
            config_json, variant, list_blobs and gen_blobs attributes.

    Raises:
        ValueError: if neither action flag is set, or the variant is unknown.
    """
    variants = {
        "multiops_threadwise": {"class": MultiReduceThreadwiseKernelBuilder},
        "multiops_multiblock": {"class": MultiReduceMultiBlockKernelBuilder},
    }
    if not (args.list_blobs or args.gen_blobs):
        raise ValueError("Please provide a list or generate blobs.")
    builder = variants.get(args.variant)
    if builder is None:
        # Fail with a clear message instead of the opaque TypeError that
        # subscripting None would otherwise raise below.
        raise ValueError(
            f"Unknown variant '{args.variant}'. "
            f"Valid variants: {', '.join(sorted(variants))}"
        )
    builder_instance = builder["class"](
        working_path=args.working_path,
        gpu_target=args.gpu_target,
        datatype=args.datatype,
        config_json=args.config_json,
    )
    if args.list_blobs:
        builder_instance.do_list_blobs()
    if args.gen_blobs:
        builder_instance.do_generate_blobs()
if __name__ == "__main__":
    # Command-line front end for the kernel blob generator.
    parser = argparse.ArgumentParser(description="Reduce Instance Builder")
    parser.add_argument(
        "--working_path", type=str, required=True, help="Working directory path"
    )
    parser.add_argument("--datatype", type=str, required=True, help="Data type")
    parser.add_argument(
        "--variant", type=str, required=True, help="Variant: multiblock or threadwise"
    )
    parser.add_argument(
        "--config_json", type=str, required=True, help="Path to config JSON blob"
    )
    parser.add_argument("--list_blobs", action="store_true", help="List blobs")
    parser.add_argument("--gen_blobs", action="store_true", help="Generate blobs")
    parser.add_argument("--gpu_target", type=str, required=True, help="GPU target")
    main(parser.parse_args())

View File

@@ -0,0 +1,127 @@
# Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
# SPDX-License-Identifier: MIT
from dataclasses import dataclass
from itertools import product

# NOTE(review): `List` is a typing name; importing it from pyparsing was a bad
# auto-import. The typing import is placed last so its binding wins.
from pyparsing import List  # noqa: F401
from typing import List
# Maps the builder's datatype strings to the corresponding ck_tile C++ types.
TYPE_MAP = {"fp16": "ck_tile::half_t", "float": "float"}
@dataclass
class ParametersBlockwise:
    """Tile/warp/thread configuration for one generated reduction kernel.

    str(p) renders the canonical instance name used in blob file names:
    the five dimension groups (block tile, warps per block, warp tile,
    thread tile, input shape), each joined by 'x', separated by '_'.
    """

    tile_m: int
    tile_n: int
    warp_per_block_m: int
    warp_per_block_n: int
    warp_m: int
    warp_n: int
    thread_tile_m: int
    thread_tile_n: int
    input_shape: List[int]

    def __str__(self):
        def join_x(values):
            # Render one dimension group, e.g. (128, 64) -> "128x64".
            return "x".join(str(v) for v in values)

        groups = [
            join_x((self.tile_m, self.tile_n)),
            join_x((self.warp_per_block_m, self.warp_per_block_n)),
            join_x((self.warp_m, self.warp_n)),
            join_x((self.thread_tile_m, self.thread_tile_n)),
            join_x(self.input_shape),
        ]
        return "_".join(groups)
def get_parameter_combinations(
    config_dict: dict,
) -> List[ParametersBlockwise]:
    """Yield every valid tile-parameter combination described by the config.

    Two sections of ``config_dict["tile_config"]`` are honoured:
      * ``fixed``       - fully-specified configurations, each crossed with
                          every input shape from ``problem_size``;
      * ``combination`` - per-parameter value lists whose Cartesian product is
                          explored; combos already produced by ``fixed`` are
                          skipped.

    Yields:
        ParametersBlockwise: each combination passing is_valid_combination.
    """
    input_shape_configs = config_dict["problem_size"]["input_shape"]
    fixed_configs = config_dict["tile_config"].get("fixed", None)
    seen_config = set()
    if fixed_configs is not None:
        for fixed in fixed_configs:
            for combo in product(
                [fixed["tile_m"]],
                [fixed["tile_n"]],
                [fixed["warp_per_block_m"]],
                [fixed["warp_per_block_n"]],
                [fixed["warp_tile_m"]],
                [fixed["warp_tile_n"]],
                [fixed["thread_tile_m"]],
                [fixed["thread_tile_n"]],
                input_shape_configs,
            ):
                p = ParametersBlockwise(*combo)
                if is_valid_combination(p):
                    # Remember this combo (input shape made hashable) so the
                    # combination sweep below does not emit duplicates.
                    hashable_combo = (tuple(combo[-1]),) + combo[0:-1]
                    seen_config.add(hashable_combo)
                    yield p
    combo_config = config_dict["tile_config"].get("combination", None)
    # BUG FIX: the original tested `is None` here, which skipped the sweep
    # whenever a "combination" section was present and crashed (subscripting
    # None) whenever it was absent.
    if combo_config is not None:
        for combo in product(
            combo_config["tile_m"]["values"],
            combo_config["tile_n"]["values"],
            combo_config["warp_per_block_m"]["values"],
            combo_config["warp_per_block_n"]["values"],
            combo_config["warp_tile_m"]["values"],
            combo_config["warp_tile_n"]["values"],
            combo_config["thread_tile_m"]["values"],
            # BUG FIX: the original looked this up under a spurious extra
            # ["tile_config"] level, inconsistent with its sibling keys.
            combo_config["thread_tile_n"]["values"],
            input_shape_configs,
        ):
            # (The original's `if combo:` guard was dropped: product() yields
            # non-empty tuples here, so it was always true.)
            p = ParametersBlockwise(*combo)
            hashable_combo = (tuple(combo[-1]),) + combo[0:-1]
            if is_valid_combination(p) and hashable_combo not in seen_config:
                yield p
def is_valid_combination(p: ParametersBlockwise) -> bool:
    """Check whether a tile-parameter combination is internally consistent."""
    # Thread tiles must hold at least one element in each direction.
    if min(p.thread_tile_m, p.thread_tile_n) < 1:
        return False
    # The block tile must be evenly covered by the warps in each direction.
    if p.tile_m % (p.warp_per_block_m * p.warp_m):
        return False
    if p.tile_n % (p.warp_per_block_n * p.warp_n):
        return False
    # The flattened reduction extent must be a multiple of the thread tile.
    if len(p.input_shape) == 4:
        if (p.input_shape[2] * p.input_shape[3]) % p.thread_tile_n:
            return False
    elif len(p.input_shape) == 3:
        if (p.input_shape[1] * p.input_shape[2]) % p.thread_tile_n:
            return False
    return True