Dlejeune/ck tile 2d multiple reductions (#3147)

* WIP

* Add Unit tests for the Multi Reduction Kernel

* clang format

* Rename multiblock to threadwise

* Multiblock WIP

* Fix multi reduce multi block unit tests

* Multi Reduce Tile Engine: WIP

* refactoring + try addressing precision error

* Fix multiops examples

* Cleanup

* Clean up tile engine's reduce op

* Update changelog

* Fix remod/clang

* Fix dates

* Fix documentation & missing file

* Fix comments

* Use the update_tile api in the multi-block kernel

* Unify threadwise/multiblock into a single kernel + default multiblock output to float in tests

* Add TilePartitioner

* Cleanup

* Add warning when no data to process, in the example

* Refactoring Reduce kernel Tile Partitioner + cleanup

* Move the tile partitioner to its own file

* Add missing includes

* Fix copyright header with update_amd_copyright_headers.py

* Fix change of interface in Reduce2dProblem

---------

Co-authored-by: Damien Lejeune <damien.lejeune@amd.com>
Co-authored-by: Adam Osewski <19374865+aosewski@users.noreply.github.com>
This commit is contained in:
damien-lejeune
2026-01-09 11:16:37 +01:00
committed by GitHub
parent e3884bbf05
commit 4216d43da8
26 changed files with 2661 additions and 2 deletions

View File

@@ -7,4 +7,5 @@ include_directories(BEFORE
add_subdirectory(ops/gemm)
add_subdirectory(ops/gemm_streamk)
add_subdirectory(ops/reduce)

View File

@@ -0,0 +1,126 @@
# Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
# SPDX-License-Identifier: MIT
# cmake_minimum_required(VERSION 4.2)
# enable_testing()
# User-tunable cache options: which element datatypes and kernel variants to
# generate Multi Reduce tests for. Both are semicolon-separated CMake lists.
set(MULTI_REDUCE_DATATYPE "fp16" CACHE STRING "List of datatypes Multi Reduce (semicolon-separated)")
set(MULTI_REDUCE_VARIANTS "multiops_multiblock;multiops_threadwise" CACHE STRING "List of variants for Multi Reduce (semicolon-separated)")
# Generate and register the Multi Reduce tests for one (datatype, variant).
#
# The Python instance builder runs twice at configure time: once to list the
# kernel blobs it will produce, once to generate them. Each generated blob
# becomes its own gtest executable plus a ctest entry labelled "multi_reduce".
#
# Arguments:
#   datatype - element type string understood by the builder (e.g. "fp16")
#   variant  - "multiops_multiblock" or "multiops_threadwise"
function(build_multi_reduce_for_datatype datatype variant)
    # Filter GPU targets to only gfx942, and gfx950
    set(GPU_TARGETS "")
    set(DESIRED_TARGETS "gfx942;gfx950")
    set(VALID_VARIANTS "multiops_multiblock;multiops_threadwise")
    foreach(target IN LISTS SUPPORTED_GPU_TARGETS)
        if(target IN_LIST DESIRED_TARGETS)
            list(APPEND GPU_TARGETS ${target})
        endif()
    endforeach()
    # Skip compilation if no matching targets found
    if(NOT GPU_TARGETS)
        message(WARNING "Skipping Tile Engine for Multi Reduction Kernel: No supported GPU targets (gfx942, gfx950) found in SUPPORTED_GPU_TARGETS: ${SUPPORTED_GPU_TARGETS}")
        return()
    endif()
    message(STATUS "Building Reduction for GPU targets: ${GPU_TARGETS}")

    set(working_path "${CMAKE_CURRENT_BINARY_DIR}/${datatype}/${variant}")
    file(MAKE_DIRECTORY "${working_path}")

    # Select the tile-configuration JSON. To experiment with a custom tile
    # configuration, point json_blob at configs/user_provided_config.json.
    if(variant IN_LIST VALID_VARIANTS)
        set(json_blob "${CMAKE_CURRENT_LIST_DIR}/configs/default_multi_reduce_config.json")
    else()
        message(WARNING "Unknown Multi Reduce variant: ${variant}.")
        return()
    endif()

    # Step 1: ask the instance builder which kernel blobs it will generate.
    execute_process(
        COMMAND ${Python3_EXECUTABLE} ${CMAKE_CURRENT_LIST_DIR}/reduce_instance_builder.py
                --working_path "${working_path}"
                --datatype ${datatype}
                --variant ${variant}
                --config_json "${json_blob}"
                --list_blobs
                --gpu_target "${GPU_TARGETS}"
        RESULT_VARIABLE ret
    )
    if(NOT ret EQUAL 0)
        message(FATAL_ERROR "Failed to list kernels for ${datatype} ${variant}: ${ret}")
    endif()

    # Step 2: generate the blob sources.
    execute_process(
        COMMAND ${Python3_EXECUTABLE} ${CMAKE_CURRENT_LIST_DIR}/reduce_instance_builder.py
                --working_path "${working_path}"
                --datatype ${datatype}
                --config_json "${json_blob}"
                --variant "${variant}"
                --gen_blobs
                --gpu_target "${GPU_TARGETS}"
        RESULT_VARIABLE ret
    )
    if(NOT ret EQUAL 0)
        message(FATAL_ERROR "Failed to generate kernels for ${datatype} ${variant}: ${ret}")
    endif()
    message(STATUS "Generated ${datatype} ${variant} reduction kernel blobs at: ${working_path}")

    # Register one gtest executable + ctest entry per generated blob.
    # (The blob list is read once here; the original read the same file twice.)
    file(STRINGS "${working_path}/reduce_${variant}_blobs_list.txt" test_basenames)
    set(generated_test_targets "")
    foreach(test_base IN LISTS test_basenames)
        string(PREPEND test_base "test_")
        set(test_src "${working_path}/${test_base}.cpp")
        set(test_target "${test_base}")
        add_executable(${test_target} ${test_src})
        target_include_directories(${test_target} PRIVATE
            "${CMAKE_SOURCE_DIR}/test/ck_tile/reduce/"
            ${working_path}
        )
        # NOTE: -Wno-dev is a cmake command-line option, not a compiler flag,
        # so it is intentionally not passed to the compiler here.
        target_compile_options(${test_target} PRIVATE -Wno-global-constructors)
        target_link_libraries(${test_target} PRIVATE gtest gtest_main)
        add_test(NAME ${test_target} COMMAND ${test_target})
        set_tests_properties(${test_target} PROPERTIES LABELS "multi_reduce")
        list(APPEND generated_test_targets ${test_target})
    endforeach()
    # Aggregate target that builds every generated test for this combination.
    # (Previously this depended on the raw blob basenames, which are not build
    # outputs and therefore could never be resolved by the generator.)
    add_custom_target(test_reduce_${variant}_${datatype} DEPENDS ${generated_test_targets})
endfunction()
# Process each datatype in isolation
# Drive the generator over every (datatype, variant) pair independently.
foreach(reduce_datatype IN LISTS MULTI_REDUCE_DATATYPE)
    foreach(reduce_variant IN LISTS MULTI_REDUCE_VARIANTS)
        build_multi_reduce_for_datatype(${reduce_datatype} ${reduce_variant})
    endforeach()
endforeach()

View File

@@ -0,0 +1,51 @@
{
"problem" : {
},
"problem_size" : {
"input_shape" : [
[128, 64, 2],
[32, 8, 64, 16]
]
},
"tile_config" : {
"fixed": [
{"tile_m": 128, "tile_n": 128, "warp_per_block_m": 4, "warp_per_block_n": 1, "warp_tile_m": 32, "warp_tile_n": 128, "thread_tile_m": 8, "thread_tile_n": 8}
],
"combination": {
"tile_m" : {
"values" : [
]
},
"tile_n" : {
"values": [
]
},
"warp_per_block_m" : {
"values" : [
]
},
"warp_per_block_n" : {
"values" : [
]
},
"warp_tile_m" : {
"values" : [
]
},
"warp_tile_n" : {
"values" : [
]
},
"thread_tile_m" : {
"values" : [
]
},
"thread_tile_n" : {
"values" : [
]
}
}
}
}

View File

@@ -0,0 +1,11 @@
# Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
# SPDX-License-Identifier: MIT
import json
class ReduceConfig:
    """Thin wrapper that loads a kernel-generation config from a JSON file.

    Attributes:
        config_json_path: path of the JSON file that was read.
        config_dict: parsed JSON content as a plain dict.
    """

    def __init__(self, config_json_path: str):
        self.config_json_path = config_json_path
        with open(config_json_path, "r") as handle:
            self.config_dict = json.loads(handle.read())

View File

@@ -0,0 +1,171 @@
# Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
# SPDX-License-Identifier: MIT
import argparse
from pathlib import Path
from reduce_config import ReduceConfig
from reduce_parameter import get_parameter_combinations, TYPE_MAP
class MultiReduceBase:
    """Base code generator for ck_tile multi-reduction gtest source blobs.

    Subclasses select a kernel variant by overriding ``name``, ``header``,
    ``test_type`` and (optionally) ``output_type``.
    """

    def __init__(self, working_path, gpu_target, datatype, config_json=None):
        # Directory where the blob list and generated .cpp files are written.
        self.working_path = Path(working_path)
        self.gpu_target = gpu_target
        self.datatype = datatype
        # Output type defaults to the input datatype; subclasses may override.
        self.output_type = self.datatype
        # Config is optional here; _generate_instances() rejects a missing one.
        self.config = ReduceConfig(config_json) if config_json else None
        self.name = "multiops_base"
        # Maps input rank (3 or 4) -> name of the typed-test method to invoke.
        self.signature_test = {
            3: "Test3D_KeepDim0_ReduceDim12",
            4: "Test4D_KeepDim01_ReduceDim23",
        }
        self.header = "test_multi_reduce2d_multiblock_impl.hpp"
        self.test_type = "TestCkTileMultiReduce2D"

    def _generate_instances(self):
        """Return a list of (source_code, params) for every configuration.

        Raises:
            ValueError: if no config JSON was supplied at construction.
        """
        if not self.config:
            raise ValueError("Configuration not provided.")
        instances = []
        for params in get_parameter_combinations(self.config.config_dict):
            instance = self._create_instance(params)
            instances.append((instance, params))
        return instances

    def _create_instance(self, parameters):
        # Thin hook around _get_test; kept separate so subclasses can extend it.
        generated_test = self._get_test(parameters)
        return generated_test

    def do_list_blobs(self):
        """Write reduce_<name>_blobs_list.txt with one blob basename per line."""
        with open(
            self.working_path / Path(f"reduce_{self.name}_blobs_list.txt"), "w"
        ) as f:
            combos_str = [
                f"{self.name}_{params}"
                for params in get_parameter_combinations(self.config.config_dict)
            ]
            f.write("\n".join(combos_str))
            f.write("\n")

    def do_generate_blobs(self):
        """Generate one test_<name>_<params>.cpp file per configuration."""
        instances = self._generate_instances()
        for instance_code, params in instances:
            blob_filename = self.working_path / Path(f"test_{self.name}_{params}.cpp")
            with open(blob_filename, "w") as f:
                f.write(instance_code)

    def _get_test(self, params):
        """Render the C++ gtest source for one parameter combination.

        Raises:
            ValueError: if the input rank has no entry in signature_test.
        """
        dimension = len(params.input_shape)
        signature = self.signature_test.get(dimension, None)
        if not signature:
            raise ValueError(
                f"No test signature found for input shape dimension: {dimension}"
            )
        shape_str = [str(i) for i in params.input_shape]
        # "128,64,2" for the Run* call, "128x64x2" for the test name suffix.
        input_shape_arg_str = ",".join(shape_str)
        input_shape_str = "x".join(shape_str)
        t = f"""#include "{self.header}"
using Shape_BlockWarps = ck_tile::sequence<{params.warp_per_block_m}, {params.warp_per_block_n}>;
using Shape_BlockTile = ck_tile::sequence<{params.tile_m}, {params.tile_n}>;
using Shape_WarpTile = ck_tile::sequence<{params.warp_m}, {params.warp_n}>;
using Shape_ThreadTile = ck_tile::sequence<{params.thread_tile_m}, {params.thread_tile_n}>;
using TestConfig =
std::tuple<{TYPE_MAP[self.datatype]},
float,
{TYPE_MAP[self.output_type]},
ck_tile::tuple<ck_tile::ReduceOp::Add, ck_tile::ReduceOp::Add>, // Intra block reductions
ck_tile::tuple<ck_tile::element_wise::PassThrough, ck_tile::element_wise::UnarySquare>, // Elementwise ops
ck_tile::tuple<ck_tile::element_wise::PassThrough, ck_tile::element_wise::UnaryDivide>, // Accumulator Elementiwise ops, intra block
ck_tile::tuple<ck_tile::ReduceOp::Add, ck_tile::ReduceOp::Add>, // Inter block reduction
Shape_BlockWarps,
Shape_BlockTile,
Shape_WarpTile,
Shape_ThreadTile>;
// Register the type(s) for the typed test suite
typedef ::testing::Types<TestConfig> TestTypes;
TYPED_TEST_SUITE({self.test_type}, TestTypes);
TYPED_TEST({self.test_type}, {signature}_{input_shape_str})
{{
this->Run{signature}({input_shape_arg_str});
}}
"""
        return t
class MultiReduceThreadwiseKernelBuilder(MultiReduceBase):
    """Generates gtest blobs for the threadwise multi-reduction kernel."""

    def __init__(self, working_path, gpu_target, datatype, config_json=None):
        super().__init__(working_path, gpu_target, datatype, config_json)
        # Override the base defaults with the threadwise-variant identifiers.
        self.test_type = "TestCkTileMultiReduceThreadwise"
        self.header = "test_multi_reduce2d_threadwise_impl.hpp"
        self.name = "multiops_threadwise"
class MultiReduceMultiBlockKernelBuilder(MultiReduceBase):
    """Generates gtest blobs for the multi-block multi-reduction kernel."""

    def __init__(self, working_path, gpu_target, datatype, config_json=None):
        super().__init__(working_path, gpu_target, datatype, config_json)
        # Override the base defaults with the multiblock-variant identifiers.
        self.name = "multiops_multiblock"
        self.header = "test_multi_reduce2d_multiblock_impl.hpp"
        self.test_type = "TestCkTileMultiReduceMultiblock"
        # The output buffer doubles as the accumulator for this kernel, so the
        # output type is pinned to float regardless of the input datatype.
        self.output_type = "float"
def main(args):
    """Dispatch to the right kernel builder and run the requested actions.

    Args:
        args: argparse namespace with working_path, gpu_target, datatype,
            config_json, variant, list_blobs and gen_blobs attributes.

    Raises:
        ValueError: if neither action flag is set, or the variant is unknown.
    """
    variants = {
        "multiops_threadwise": {"class": MultiReduceThreadwiseKernelBuilder},
        "multiops_multiblock": {"class": MultiReduceMultiBlockKernelBuilder},
    }
    if not (args.list_blobs or args.gen_blobs):
        raise ValueError("Please provide a list or generate blobs.")
    builder = variants.get(args.variant)
    if builder is None:
        # Fail with a clear message instead of the opaque TypeError that
        # subscripting None would otherwise raise below.
        raise ValueError(
            f"Unknown variant '{args.variant}'. "
            f"Valid variants: {', '.join(sorted(variants))}"
        )
    builder_instance = builder["class"](
        working_path=args.working_path,
        gpu_target=args.gpu_target,
        datatype=args.datatype,
        config_json=args.config_json,
    )
    if args.list_blobs:
        builder_instance.do_list_blobs()
    if args.gen_blobs:
        builder_instance.do_generate_blobs()
if __name__ == "__main__":
    # Command-line front end for the kernel blob generator.
    parser = argparse.ArgumentParser(description="Reduce Instance Builder")
    parser.add_argument(
        "--working_path", type=str, required=True, help="Working directory path"
    )
    parser.add_argument("--datatype", type=str, required=True, help="Data type")
    parser.add_argument(
        "--variant", type=str, required=True, help="Variant: multiblock or threadwise"
    )
    parser.add_argument(
        "--config_json", type=str, required=True, help="Path to config JSON blob"
    )
    parser.add_argument("--list_blobs", action="store_true", help="List blobs")
    parser.add_argument("--gen_blobs", action="store_true", help="Generate blobs")
    parser.add_argument("--gpu_target", type=str, required=True, help="GPU target")
    main(parser.parse_args())

View File

@@ -0,0 +1,127 @@
# Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
# SPDX-License-Identifier: MIT
from dataclasses import dataclass
from itertools import product

# NOTE(review): `List` is a typing name; importing it from pyparsing was a bad
# auto-import. The typing import is placed last so its binding wins.
from pyparsing import List  # noqa: F401
from typing import List
# Maps the builder's datatype strings to the corresponding ck_tile C++ types.
TYPE_MAP = {"fp16": "ck_tile::half_t", "float": "float"}
@dataclass
class ParametersBlockwise:
    """Tile/warp/thread configuration for one generated reduction kernel.

    str(p) renders the canonical instance name used in blob file names:
    the five dimension groups (block tile, warps per block, warp tile,
    thread tile, input shape), each joined by 'x', separated by '_'.
    """

    tile_m: int
    tile_n: int
    warp_per_block_m: int
    warp_per_block_n: int
    warp_m: int
    warp_n: int
    thread_tile_m: int
    thread_tile_n: int
    input_shape: List[int]

    def __str__(self):
        def join_x(values):
            # Render one dimension group, e.g. (128, 64) -> "128x64".
            return "x".join(str(v) for v in values)

        groups = [
            join_x((self.tile_m, self.tile_n)),
            join_x((self.warp_per_block_m, self.warp_per_block_n)),
            join_x((self.warp_m, self.warp_n)),
            join_x((self.thread_tile_m, self.thread_tile_n)),
            join_x(self.input_shape),
        ]
        return "_".join(groups)
def get_parameter_combinations(
    config_dict: dict,
) -> List[ParametersBlockwise]:
    """Yield every valid tile-parameter combination described by the config.

    Two sections of ``config_dict["tile_config"]`` are honoured:
      * ``fixed``       - fully-specified configurations, each crossed with
                          every input shape from ``problem_size``;
      * ``combination`` - per-parameter value lists whose Cartesian product is
                          explored; combos already produced by ``fixed`` are
                          skipped.

    Yields:
        ParametersBlockwise: each combination passing is_valid_combination.
    """
    input_shape_configs = config_dict["problem_size"]["input_shape"]
    fixed_configs = config_dict["tile_config"].get("fixed", None)
    seen_config = set()
    if fixed_configs is not None:
        for fixed in fixed_configs:
            for combo in product(
                [fixed["tile_m"]],
                [fixed["tile_n"]],
                [fixed["warp_per_block_m"]],
                [fixed["warp_per_block_n"]],
                [fixed["warp_tile_m"]],
                [fixed["warp_tile_n"]],
                [fixed["thread_tile_m"]],
                [fixed["thread_tile_n"]],
                input_shape_configs,
            ):
                p = ParametersBlockwise(*combo)
                if is_valid_combination(p):
                    # Remember this combo (input shape made hashable) so the
                    # combination sweep below does not emit duplicates.
                    hashable_combo = (tuple(combo[-1]),) + combo[0:-1]
                    seen_config.add(hashable_combo)
                    yield p
    combo_config = config_dict["tile_config"].get("combination", None)
    # BUG FIX: the original tested `is None` here, which skipped the sweep
    # whenever a "combination" section was present and crashed (subscripting
    # None) whenever it was absent.
    if combo_config is not None:
        for combo in product(
            combo_config["tile_m"]["values"],
            combo_config["tile_n"]["values"],
            combo_config["warp_per_block_m"]["values"],
            combo_config["warp_per_block_n"]["values"],
            combo_config["warp_tile_m"]["values"],
            combo_config["warp_tile_n"]["values"],
            combo_config["thread_tile_m"]["values"],
            # BUG FIX: the original looked this up under a spurious extra
            # ["tile_config"] level, inconsistent with its sibling keys.
            combo_config["thread_tile_n"]["values"],
            input_shape_configs,
        ):
            # (The original's `if combo:` guard was dropped: product() yields
            # non-empty tuples here, so it was always true.)
            p = ParametersBlockwise(*combo)
            hashable_combo = (tuple(combo[-1]),) + combo[0:-1]
            if is_valid_combination(p) and hashable_combo not in seen_config:
                yield p
def is_valid_combination(p: ParametersBlockwise) -> bool:
    """Check whether a tile-parameter combination is internally consistent."""
    # Thread tiles must hold at least one element in each direction.
    if min(p.thread_tile_m, p.thread_tile_n) < 1:
        return False
    # The block tile must be evenly covered by the warps in each direction.
    if p.tile_m % (p.warp_per_block_m * p.warp_m):
        return False
    if p.tile_n % (p.warp_per_block_n * p.warp_n):
        return False
    # The flattened reduction extent must be a multiple of the thread tile.
    if len(p.input_shape) == 4:
        if (p.input_shape[2] * p.input_shape[3]) % p.thread_tile_n:
            return False
    elif len(p.input_shape) == 3:
        if (p.input_shape[1] * p.input_shape[2]) % p.thread_tile_n:
            return False
    return True