diff --git a/CHANGELOG.md b/CHANGELOG.md index 4c054b822a..7a21634b7d 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -51,6 +51,10 @@ None None +### Upcoming changes + +* Non-grouped convolutions are deprecated. All of their functionality is supported by grouped convolution. + ## Composable Kernel 1.1.0 for ROCm 6.1.0 ### Additions diff --git a/CMakeLists.txt b/CMakeLists.txt index 6a3bdaf46b..07d2e166bb 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -98,6 +98,12 @@ add_compile_options(-Wno-pass-failed) add_compile_options(-Wno-switch-default) add_compile_options(-Wno-unique-object-duplication) +# add -Og -gdwarf64 for debug builds +add_compile_options( + "$<$:-Og>" + "$<$:-gdwarf64>" +) + # Recent change in compiler makes this warning ON by default, which led to compile errors. add_compile_options(-Wno-nrvo) diff --git a/Jenkinsfile b/Jenkinsfile index e1273db829..35bb91ccaf 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -438,6 +438,34 @@ def cmake_build(Map conf=[:]){ echo "could not locate the requested artifacts: ${err.getMessage()}. will skip the stashing." } } + if (params.RUN_CK_TILE_TRANSPOSE_TESTS){ + try{ + archiveArtifacts "perf_transpose_*.log" + if (arch_type == 1){ + stash includes: "perf_transpose_**_gfx90a.log", name: "perf_transpose_log_gfx90a" + } + else if (arch_type == 2){ + stash includes: "perf_transpose_**_gfx942.log", name: "perf_transpose_log_gfx942" + } + } + catch(Exception err){ + echo "could not locate the requested artifacts: ${err.getMessage()}. will skip the stashing." + } + } + if (params.RUN_CK_TILE_GEMM_TESTS){ + try{ + archiveArtifacts "perf_tile_gemm_**.log" + if (arch == 1){ + stash includes: "perf_tile_gemm_**_gfx90a.log", name: "perf_tile_gemm_log_gfx90a" + } + else if (arch == 2){ + stash includes: "perf_tile_gemm_**_gfx942.log", name: "perf_tile_gemm_log_gfx942" + } + } + catch(Exception err){ + echo "could not locate the requested artifacts: ${err.getMessage()}. will skip the stashing." + } + } } def buildHipClangJob(Map conf=[:]){ @@ -734,6 +762,24 @@ def process_results(Map conf=[:]){ echo "could not locate the FMHA performance logs: ${err.getMessage()}." } } + if (params.RUN_CK_TILE_TRANSPOSE_TESTS){ + try{ + unstash "perf_transpose_log_gfx942" + unstash "perf_transpose_log_gfx90a" + } + catch(Exception err){ + echo "could not locate the Transpose performance logs: ${err.getMessage()}." + } + } + if (params.RUN_CK_TILE_GEMM_TESTS){ + try{ + unstash "perf_tile_gemm_log_gfx942" + unstash "perf_tile_gemm_log_gfx90a" + } + catch(Exception err){ + echo "could not locate the GEMM performance logs: ${err.getMessage()}." + } + } if (params.RUN_FULL_QA || params.BUILD_INSTANCES_ONLY){ // unstash deb packages unstash "packages" @@ -815,7 +861,7 @@ def run_aiter_tests(Map conf=[:]){ } //launch develop branch daily jobs -CRON_SETTINGS = BRANCH_NAME == "develop" ? '''0 23 * * * % RUN_FULL_QA=true;DISABLE_DL_KERNELS=true;RUN_CK_TILE_FMHA_TESTS=true;RUN_TILE_ENGINE_GEMM_TESTS=true;RUN_PERFORMANCE_TESTS=true;RUN_ALL_UNIT_TESTS=true +CRON_SETTINGS = BRANCH_NAME == "develop" ? '''0 23 * * * % RUN_FULL_QA=true;DISABLE_DL_KERNELS=true;RUN_CK_TILE_FMHA_TESTS=true;RUN_CK_TILE_TRANSPOSE_TESTS=true;RUN_CK_TILE_GEMM_TESTS=true;RUN_TILE_ENGINE_GEMM_TESTS=true;RUN_PERFORMANCE_TESTS=true;RUN_ALL_UNIT_TESTS=true 0 21 * * * % RUN_GROUPED_CONV_LARGE_CASES_TESTS=true;hipTensor_test=true;BUILD_GFX908=true;BUILD_GFX942=true;BUILD_GFX950=true;RUN_PERFORMANCE_TESTS=true;RUN_ALL_UNIT_TESTS=true 0 19 * * * % BUILD_DOCKER=true;COMPILER_VERSION=amd-staging;BUILD_COMPILER=/llvm-project/build/bin/clang++;USE_SCCACHE=false;NINJA_BUILD_TRACE=true;RUN_ALL_UNIT_TESTS=true 0 17 * * * % BUILD_DOCKER=true;COMPILER_VERSION=amd-mainline;BUILD_COMPILER=/llvm-project/build/bin/clang++;USE_SCCACHE=false;NINJA_BUILD_TRACE=true;RUN_ALL_UNIT_TESTS=true @@ -895,6 +941,14 @@ pipeline { name: "RUN_CK_TILE_FMHA_TESTS", defaultValue: false, description: "Run the ck_tile FMHA tests (default: OFF)") + booleanParam( + name: "RUN_CK_TILE_TRANSPOSE_TESTS", + defaultValue: false, + description: "Run the ck_tile Transpose tests (default: OFF)") + booleanParam( + name: "RUN_CK_TILE_GEMM_TESTS", + defaultValue: false, + description: "Run the ck_tile GEMM tests (default: OFF)") booleanParam( name: "RUN_TILE_ENGINE_GEMM_TESTS", defaultValue: false, @@ -1144,6 +1198,94 @@ pipeline { } } } + stage("Run CK_TILE_TRANSPOSE Tests") + { + parallel + { + stage("Run CK_TILE_TRANSPOSE Tests on gfx90a") + { + when { + beforeAgent true + expression { params.RUN_CK_TILE_TRANSPOSE_TESTS.toBoolean() } + } + agent{ label rocmnode("gfx90a") } + environment{ + setup_args = "NO_CK_BUILD" + execute_args = """ ../script/cmake-ck-dev.sh ../ gfx90a && \ + make -j64 tile_example_batched_transpose && \ + cd ../ && + example/ck_tile/35_batched_transpose/script/run_full_test.sh "CI_${params.COMPILER_VERSION}" "${env.BRANCH_NAME}" "${NODE_NAME}" gfx90a """ + } + steps{ + buildHipClangJobAndReboot(setup_args:setup_args, no_reboot:true, build_type: 'Release', execute_cmd: execute_args) + cleanWs() + } + } + stage("Run CK_TILE_TRANSPOSE Tests on gfx942") + { + when { + beforeAgent true + expression { params.RUN_CK_TILE_TRANSPOSE_TESTS.toBoolean() } + } + agent{ label rocmnode("gfx942") } + environment{ + setup_args = "NO_CK_BUILD" + execute_args = """ ../script/cmake-ck-dev.sh ../ gfx942 && \ + make -j64 tile_example_batched_transpose && \ + cd ../ && + example/ck_tile/35_batched_transpose/script/run_full_test.sh "CI_${params.COMPILER_VERSION}" "${env.BRANCH_NAME}" "${NODE_NAME}" gfx942 """ + } + steps{ + buildHipClangJobAndReboot(setup_args:setup_args, no_reboot:true, build_type: 'Release', execute_cmd: execute_args) + cleanWs() + } + } + } + } + stage("Run CK_TILE_GEMM Tests") + { + parallel + { + stage("Run CK_TILE_GEMM Tests on gfx90a") + { + when { + beforeAgent true + expression { params.RUN_CK_TILE_GEMM_TESTS.toBoolean() } + } + agent{ label rocmnode("gfx90a") } + environment{ + setup_args = "NO_CK_BUILD" + execute_args = """ ../script/cmake-ck-dev.sh ../ gfx90a && \ + make -j64 tile_example_gemm_universal && \ + cd ../ && + example/ck_tile/03_gemm/script/run_full_test.sh "CI_${params.COMPILER_VERSION}" "${env.BRANCH_NAME}" "${NODE_NAME}" gfx90a """ + } + steps{ + buildHipClangJobAndReboot(setup_args:setup_args, no_reboot:true, build_type: 'Release', execute_cmd: execute_args) + cleanWs() + } + } + stage("Run CK_TILE_GEMM Tests on gfx942") + { + when { + beforeAgent true + expression { params.RUN_CK_TILE_GEMM_TESTS.toBoolean() } + } + agent{ label rocmnode("gfx942") } + environment{ + setup_args = "NO_CK_BUILD" + execute_args = """ ../script/cmake-ck-dev.sh ../ gfx942 && \ + make -j64 tile_example_gemm_universal && \ + cd ../ && + example/ck_tile/03_gemm/script/run_full_test.sh "CI_${params.COMPILER_VERSION}" "${env.BRANCH_NAME}" "${NODE_NAME}" gfx942 """ + } + steps{ + buildHipClangJobAndReboot(setup_args:setup_args, no_reboot:true, build_type: 'Release', execute_cmd: execute_args) + cleanWs() + } + } + } + } stage("Run TILE_ENGINE_GEMM Tests") { parallel @@ -1350,7 +1492,7 @@ pipeline { -DGPU_TARGETS="gfx90a" \ -DCMAKE_CXX_COMPILER="${build_compiler()}" \ -DCMAKE_C_COMPILER=/opt/rocm/llvm/bin/clang \ - -DCMAKE_CXX_FLAGS=" -O3 " .. && make -j 32""" + -DCMAKE_CXX_FLAGS=" -O3 " .. && make -j """ } steps{ Build_CK_and_Reboot(setup_args: setup_args, config_targets: "install", no_reboot:true, build_type: 'Release', execute_cmd: execute_args, prefixpath: '/usr/local') diff --git a/example/65_gemm_multiply_multiply/CMakeLists.txt b/example/65_gemm_multiply_multiply/CMakeLists.txt index 9f4c43338e..d1e1a51afd 100644 --- a/example/65_gemm_multiply_multiply/CMakeLists.txt +++ b/example/65_gemm_multiply_multiply/CMakeLists.txt @@ -31,7 +31,7 @@ foreach(gpu IN LISTS GPU_TARGETS) example_compile_options(example_moe_gemm1_xdl_pk_i4 PRIVATE ${EXAMPLE_COMPILE_OPTIONS}) example_compile_options(example_moe_gemm2_xdl_pk_i4 PRIVATE ${EXAMPLE_COMPILE_OPTIONS}) endif() - set(GEMM_OPTIONS "SHELL: -mllvm -greedy-reverse-local-assignment=1 -mllvm --slp-threshold=-32") + set(GEMM_OPTIONS "SHELL: -mllvm -greedy-reverse-local-assignment=1") example_compile_options(example_gemm_multiply_multiply_xdl_fp8_bpreshuffle PRIVATE ${GEMM_OPTIONS}) example_compile_options(example_moe_gemm1_xdl_fp8 PRIVATE ${GEMM_OPTIONS}) example_compile_options(example_moe_gemm2_xdl_fp8 PRIVATE ${GEMM_OPTIONS}) @@ -39,22 +39,22 @@ foreach(gpu IN LISTS GPU_TARGETS) endif() endforeach() -set(GEMM_OPTIONS "SHELL: -mllvm -greedy-reverse-local-assignment=1 -mllvm --slp-threshold=-32") +set(GEMM_OPTIONS "SHELL: -mllvm -greedy-reverse-local-assignment=1") set(BLOCKSCALE_GEMM_OPTIONS ) check_cxx_compiler_flag("-mllvm --misched-bottomup=1" HAS_MISCHED_BOTTOMUP) check_cxx_compiler_flag("-mllvm --misched-prera-direction=bottomup" HAS_MISCHED_PRERA_DIRECTION) if(hip_VERSION_FLAT LESS 600443483 OR hip_VERSION_FLAT GREATER_EQUAL 700000000) if(HAS_MISCHED_BOTTOMUP) - list(APPEND BLOCKSCALE_GEMM_OPTIONS "SHELL: -mllvm -greedy-reverse-local-assignment=1 -mllvm --slp-threshold=-32 -mllvm --schedmodel=0 -mllvm --misched-bottomup=1") + list(APPEND BLOCKSCALE_GEMM_OPTIONS "SHELL: -mllvm -greedy-reverse-local-assignment=1 -mllvm --schedmodel=0 -mllvm --misched-bottomup=1") elseif(HAS_MISCHED_PRERA_DIRECTION) - list(APPEND BLOCKSCALE_GEMM_OPTIONS "SHELL: -mllvm -greedy-reverse-local-assignment=1 -mllvm --slp-threshold=-32 -mllvm --schedmodel=0 -mllvm --misched-prera-direction=bottomup") + list(APPEND BLOCKSCALE_GEMM_OPTIONS "SHELL: -mllvm -greedy-reverse-local-assignment=1 -mllvm --schedmodel=0 -mllvm --misched-prera-direction=bottomup") endif() else() if(HAS_MISCHED_BOTTOMUP) - list(APPEND BLOCKSCALE_GEMM_OPTIONS "SHELL: -mllvm -greedy-reverse-local-assignment=1 -mllvm --slp-threshold=-32 -mllvm --misched-bottomup=1") + list(APPEND BLOCKSCALE_GEMM_OPTIONS "SHELL: -mllvm -greedy-reverse-local-assignment=1 -mllvm --misched-bottomup=1") elseif(HAS_MISCHED_PRERA_DIRECTION) - list(APPEND BLOCKSCALE_GEMM_OPTIONS "SHELL: -mllvm -greedy-reverse-local-assignment=1 -mllvm --slp-threshold=-32 -mllvm --misched-prera-direction=bottomup") + list(APPEND BLOCKSCALE_GEMM_OPTIONS "SHELL: -mllvm -greedy-reverse-local-assignment=1 -mllvm --misched-prera-direction=bottomup") endif() endif() @@ -62,7 +62,6 @@ check_cxx_compiler_flag("-mllvm --amdgpu-sched-strategy=gcn-iterative-max-occupa if(HAS_MAX_OCCUPANCY_EXPERIMENTAL) list(APPEND BLOCKSCALE_GEMM_OPTIONS -mllvm --amdgpu-sched-strategy=gcn-iterative-max-occupancy-experimental) endif() -# list(APPEND BLOCKSCALE_GEMM_OPTIONS "SHELL: -mllvm -greedy-reverse-local-assignment=1 -mllvm --slp-threshold=-32 -mllvm --misched-bottomup=1") example_compile_options(example_gemm_multiply_multiply_xdl_fp8_bpreshuffle PRIVATE ${GEMM_OPTIONS}) example_compile_options(example_moe_gemm1_xdl_fp8 PRIVATE ${GEMM_OPTIONS}) example_compile_options(example_moe_gemm2_xdl_fp8 PRIVATE ${GEMM_OPTIONS}) diff --git a/example/67_gemm_microscaling/CMakeLists.txt b/example/67_gemm_microscaling/CMakeLists.txt index 14b648c9f8..6ee43aac62 100644 --- a/example/67_gemm_microscaling/CMakeLists.txt +++ b/example/67_gemm_microscaling/CMakeLists.txt @@ -58,7 +58,7 @@ example_compile_options(example_moe_gemm1_xdl_mx_fp4_bpreshuffle PRIVATE ${FP4_M example_compile_options(example_moe_gemm2_xdl_mx_fp4_bpreshuffle PRIVATE ${FP4_MXGEMM_OPTIONS}) set(FP8_MXGEMM_OPTIONS) -list(APPEND FP8_MXGEMM_OPTIONS "SHELL: -mllvm -greedy-reverse-local-assignment=1 -mllvm --slp-threshold=-32") +list(APPEND FP8_MXGEMM_OPTIONS "SHELL: -mllvm -greedy-reverse-local-assignment=1") example_compile_options(example_gemm_mx_fp8 PRIVATE ${FP8_MXGEMM_OPTIONS}) example_compile_options(example_gemm_mx_bf8 PRIVATE ${FP8_MXGEMM_OPTIONS}) diff --git a/example/ck_tile/01_fmha/codegen/ops/fmha_bwd.py b/example/ck_tile/01_fmha/codegen/ops/fmha_bwd.py index 77b63a0c83..47cf6b3ad4 100644 --- a/example/ck_tile/01_fmha/codegen/ops/fmha_bwd.py +++ b/example/ck_tile/01_fmha/codegen/ops/fmha_bwd.py @@ -1,5 +1,5 @@ # SPDX-License-Identifier: MIT -# Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. +# Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved. # generate kernel instances to speed up compilation import copy @@ -8,21 +8,13 @@ import fnmatch import itertools from pathlib import Path from typing import List, Optional, Tuple, Dict, Literal +from collections import defaultdict from codegen.cmake_config import * from codegen.cpp_symbol_map import * +from codegen.utils import update_file -BWD_DQDKDV_PIPELINE_MAP = { - "kr_ktr_vr_iglp" : "ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP", - "kr_ktr_vr" : "ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR", -} - -BWD_DQDKDV_PIPELINE_ENUM_MAP = { - "kr_ktr_vr_iglp" : "ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP", - "kr_ktr_vr" : "ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR", -} - FMHA_BWD_KERNEL_HEADER = """// SPDX-License-Identifier: MIT // Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.\n // auto generated by generate.py @@ -56,8 +48,8 @@ using fmha_bwd_shape_{F_idx} = ck_tile::TileFmhaBwdShape; -using fmha_bwd_trait_{F_idx} = ck_tile::TileFmhaTraits<{F_spad}, - {F_skpad}, +using fmha_bwd_trait_{F_idx} = ck_tile::TileFmhaTraits; -using fmha_bwd_pipeline_{F_idx} = {F_pipeline}; +using fmha_bwd_pipeline_{F_idx} = ck_tile::BlockFmhaBwdDQDKDVPipeline; using fmha_bwd_dk_epilogue_{F_idx} = ck_tile::Default2DEpilogue< ck_tile::Default2DEpilogueProblem::AccDataType, typename FmhaBwdTypeConfig<{F_dtype}>::KGradDataType, - {F_skpad}, + false, {F_dpad}>>; using fmha_bwd_dv_epilogue_{F_idx} = ck_tile::Default2DEpilogue< ck_tile::Default2DEpilogueProblem::AccDataType, typename FmhaBwdTypeConfig<{F_dtype}>::VGradDataType, - {F_skpad}, + false, {F_dvpad}>>; using fmha_bwd_dq_dk_dv_kernel_{F_idx} = @@ -115,13 +107,10 @@ using fmha_bwd_dq_dk_dv_kernel_{F_idx} = using dq_dk_dv_trait_{F_idx} = fmha_bwd_dq_dk_dv_traits_<{F_hdim}, {F_dtype}, {F_mode}, - {F_pipeline_enum}, fmha_mask_{F_idx}, fmha_dropout_{F_idx}, {F_bias}, {F_dbias}, - {F_spad}, - {F_skpad}, {F_dpad}, {F_dvpad}, {F_deterministic}>; @@ -195,15 +184,18 @@ FMHA_BWD_API_PER_HDIM_CASE=""" {F_if} (t.hdim_q <= {F_hdim} && t.hdim_v < """ FMHA_BWD_API_INNER_DISPATCH=""" {F_if}((t.is_group_mode == {F_mode}) && ({F_mask_check}) && (t.bias_type == {F_bias_check}) && (t.has_dbias == {F_dbias}) && ({F_dropout_check}) && - ({F_scheck}) && ({F_skcheck}) && ({F_dcheck}) && ({F_dvcheck}) && (t.is_deterministic == {F_deterministic})) {{ - using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<{F_hdim}, {F_dtype}, {F_mode}, {F_spad1}, {F_dvpad}>; - using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<{F_hdim}, {F_dtype}, {F_mode}, {F_pipeline_enum}, {F_mask}, {F_dropout}, {F_bias}, {F_dbias}, {F_spad0}, {F_skpad}, {F_dpad}, {F_dvpad}, {F_deterministic}>; - using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<{F_hdim}, {F_dtype}, {F_mode}, {F_spad1}, {F_dpad}, {F_deterministic}>; + ({F_scheck}) && ({F_dcheck}) && ({F_dvcheck}) && (t.is_deterministic == {F_deterministic})) {{ + using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<{F_hdim}, {F_dtype}, {F_mode}, {F_spad1d}, {F_dvpad}>; + using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<{F_hdim}, {F_dtype}, {F_mode}, {F_mask}, {F_dropout}, {F_bias}, {F_dbias}, {F_dpad}, {F_dvpad}, {F_deterministic}>; + using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<{F_hdim}, {F_dtype}, {F_mode}, {F_spad1d}, {F_dpad}, {F_deterministic}>; r = fmha_bwd_(s, a); return r; }} """ +# M0 size for 1d kernels (dot/convert) +M0_1D = 64 + # GEMM0: Q@K=S^T # GEMM1: P^T@dO^T=dV(This was chosen as G1 to match fwd, but N1 must be equal to headdim_v) # GEMM2: dO@V=dP^T(This was chosen as G2 because of the calculation order) @@ -249,8 +241,6 @@ class FmhaBwdDQDKDVKernel: F_hdim : int # hdim F_dtype : str # data type F_tile : FmhaBwdDQDKDVTileSize - F_spad : str # true/false - F_skpad : str # F_dpad : str # F_dvpad : str # F_bias : str # @@ -259,7 +249,6 @@ class FmhaBwdDQDKDVKernel: F_mask : str # value from MASK_MAP F_mode : str # value from MODE_MAP F_deterministic : str # - F_pipeline : str # mask_impl : str # @property @@ -293,8 +282,6 @@ class FmhaBwdDQDKDVKernel: F_wm1 = self.F_tile.F_wm1, F_wn1 = self.F_tile.F_wn1, F_wk1 = self.F_tile.F_wk1, - F_spad = BOOL_MAP[self.F_spad], - F_skpad = BOOL_MAP[self.F_skpad], F_dpad = BOOL_MAP[self.F_dpad], F_dvpad = BOOL_MAP[self.F_dvpad], F_bias = BIAS_MAP[self.F_bias], @@ -304,21 +291,18 @@ class FmhaBwdDQDKDVKernel: F_mask = get_mask_map(self.mask_impl)[self.F_mask], F_mode = MODE_MAP[self.F_mode], F_deterministic = BOOL_MAP[self.F_deterministic], - F_pipeline_enum = BWD_DQDKDV_PIPELINE_ENUM_MAP[self.F_pipeline], - F_pipeline = BWD_DQDKDV_PIPELINE_MAP[self.F_pipeline]) + ) @property def name(self) -> str: def pad_name() -> str: n = '' - if self.F_spad == 't': n += 's' - if self.F_skpad == 't' : n += 'sk' if self.F_dpad == 't' : n += 'd' if self.F_dvpad == 't' : n += 'dv' if n != '' : n = 'p' + n return n pn = pad_name() - n = f"fmha_bwd_d{self.F_hdim}_{self.F_dtype}_{self.F_mode}_" + self.F_tile.name + f'_{self.F_pipeline}' + n = f"fmha_bwd_d{self.F_hdim}_{self.F_dtype}_{self.F_mode}_" + self.F_tile.name if pn != '' : n += f'_{pn}' else: n += '_npad' @@ -347,20 +331,15 @@ class FmhaBwdDQDKDVKernel: return self.name + ".cpp" # TODO: design a more practical way to do it -# this is current supported tile size & pipeline. +# this is current supported tile size. def get_fmha_bwd_dq_dk_dv_tile_ppl_dict_from_dtype(dtype : str) -> Optional[dict]: if dtype == 'fp16' or dtype == 'bf16': return { - '32' : [FmhaBwdDQDKDVTileSize( 32, 128, 32, 32, 32, 32, 64, 32, 32, 1, 4, 1, 4, 1, 1, 2, 2, 1, 16, 16, 32, 16, 16, 16, 1), - "kr_ktr_vr_iglp", "kr_ktr_vr"], - '64' : [FmhaBwdDQDKDVTileSize( 32, 128, 64, 32, 64, 32, 32, 64, 64, 1, 4, 1, 4, 1, 1, 1, 4, 1, 16, 16, 32, 16, 16, 16, 1), - "kr_ktr_vr_iglp", "kr_ktr_vr"], - '128' : [FmhaBwdDQDKDVTileSize( 16, 128, 128, 16, 128, 16, 32, 128, 128, 1, 4, 1, 4, 1, 1, 1, 4, 1, 16, 16, 32, 16, 16, 16, 1), - "kr_ktr_vr_iglp", "kr_ktr_vr"], - # '160' : [FmhaBwdDQDKDVTileSize( 32, 64, 160, 32, 160, 32, 32, 160, 160, 1, 4, 1, 4, 1, 1, 2, 2, 1, 16, 16, 32, 16, 16, 16, 1), - # "kr_ktr_vr_iglp", "kr_ktr_vr"], - '256' : [FmhaBwdDQDKDVTileSize( 16, 64, 256, 16, 256, 16, 32, 256, 256, 1, 4, 1, 4, 1, 1, 1, 4, 1, 16, 16, 32, 16, 16, 16, 1), - "kr_ktr_vr_iglp", "kr_ktr_vr"] + '32' : FmhaBwdDQDKDVTileSize( 32, 128, 32, 32, 32, 32, 64, 32, 32, 1, 4, 1, 4, 1, 1, 2, 2, 1, 16, 16, 32, 16, 16, 16, 1), + '64' : FmhaBwdDQDKDVTileSize( 32, 128, 64, 32, 64, 32, 32, 64, 64, 1, 4, 1, 4, 1, 1, 1, 4, 1, 16, 16, 32, 16, 16, 16, 1), + '128' : FmhaBwdDQDKDVTileSize( 16, 128, 128, 16, 128, 16, 32, 128, 128, 1, 4, 1, 4, 1, 1, 1, 4, 1, 16, 16, 32, 16, 16, 16, 1), + # '160' : FmhaBwdDQDKDVTileSize( 32, 64, 160, 32, 160, 32, 32, 160, 160, 1, 4, 1, 4, 1, 1, 2, 2, 1, 16, 16, 32, 16, 16, 16, 1), + '256' : FmhaBwdDQDKDVTileSize( 16, 64, 256, 16, 256, 16, 32, 256, 256, 1, 4, 1, 4, 1, 1, 1, 4, 1, 16, 16, 32, 16, 16, 16, 1), } else: return None @@ -375,7 +354,7 @@ using fmha_bwd_dot_do_o_pipeline_problem_{F_idx} = ck_tile::BlockFmhaBwdOGradDot typename FmhaBwdTypeConfig::ODataType, typename FmhaBwdTypeConfig::OGradDataType, typename FmhaBwdTypeConfig::DDataType, - /* BlockSize = */ 64, + /* BlockSize = M0 = */ 64, {F_hdim}, {F_mode}, fmha_bwd_dot_do_o_trait_{F_idx}>; @@ -580,7 +559,6 @@ class FmhaBwdConvertQGradKernel: @dataclass(frozen=True) class FmhaBwdApiTrait: idx : int # this is not a tunable, but a counter to differentiate symbol - pipeline : str # sync with fmha_bwd_traits<>, to generate fallback calls hdim : int dtype : str # data type @@ -590,9 +568,7 @@ class FmhaBwdApiTrait: bias : str dbias : str dropout : str - spad : str - spad1 : str # spad for dot/convert kernel - skpad : str + spad1d : str # spad for 1d kernels (dot/convert) dpad : str dvpad : str deterministic : str @@ -611,24 +587,14 @@ class FmhaBwdApiTrait: def bhdv(self) -> int: return self.tile.F_bhdv - def scheck(self, spad1 : str) -> str: - if self.mode == 'group': - return 'true' # always support - elif self.spad == 't' and spad1 == 't': - return f'a.seqlen_q % {self.bm0} != 0' - elif self.spad == 'f' and spad1 == 't': - return f'a.seqlen_q % {self.bm0} == 0 and a.seqlen_q % 64 != 0' - else: # self.skpad == 'f' and skpad1 == 'f' - return 'a.seqlen_q % 64 == 0' - @property - def skcheck(self) -> str: + def scheck(self) -> str: if self.mode == 'group': return 'true' # always support - elif self.skpad == 't': - return f'a.seqlen_k % {self.bn0} != 0' - else: - return f'a.seqlen_k % {self.bn0} == 0' + elif self.spad1d == 't': + return f'a.seqlen_q % {M0_1D} != 0' + else: # self.spad1d == 'f' + return f'a.seqlen_q % {M0_1D} == 0' @property def dcheck(self) -> str: @@ -647,14 +613,14 @@ class FmhaBwdApiTrait: def get_occupancy(dtype, hdim): return 2 - return FmhaBwdOGradDotOKernel(F_idx=self.idx, F_hdim=self.hdim, F_dtype=self.dtype, F_spad=self.spad1, + return FmhaBwdOGradDotOKernel(F_idx=self.idx, F_hdim=self.hdim, F_dtype=self.dtype, F_spad=self.spad1d, F_dvpad=self.dvpad, F_mode=self.mode, F_occupancy=get_occupancy(self.dtype, self.hdim)) @property def dq_dk_dv_kernel(self) -> FmhaBwdDQDKDVKernel: return FmhaBwdDQDKDVKernel(F_idx=self.idx, F_hdim=self.hdim, F_dtype=self.dtype, F_tile=self.tile, - F_spad=self.spad, F_skpad=self.skpad, F_dpad=self.dpad, F_dvpad=self.dvpad, F_bias=self.bias, - F_dbias=self.dbias, F_dropout=self.dropout, F_mask=self.mask, F_mode=self.mode, F_deterministic=self.deterministic, F_pipeline=self.pipeline, mask_impl=self.mask_impl) + F_dpad=self.dpad, F_dvpad=self.dvpad, F_bias=self.bias, F_dbias=self.dbias, F_dropout=self.dropout, + F_mask=self.mask, F_mode=self.mode, F_deterministic=self.deterministic, mask_impl=self.mask_impl) @property def convert_dq_kernel(self) -> FmhaBwdConvertQGradKernel: @@ -664,48 +630,46 @@ class FmhaBwdApiTrait: return 2 return FmhaBwdConvertQGradKernel(F_idx=self.idx, F_hdim=self.hdim, F_dtype=self.dtype, - F_bm0=64, F_bn0=self.tile.F_bn0, F_spad=self.spad, F_dpad=self.dpad, + F_bm0=M0_1D, F_bn0=self.tile.F_bn0, F_spad=self.spad1d, F_dpad=self.dpad, F_mode=self.mode, F_occupancy=get_occupancy(self.dtype, self.hdim), F_deterministic=self.deterministic) class FmhaBwdApiPool: def __init__(self, mask_impl): - self.dq_dk_dv_pool = dict() + self.dq_dk_dv_pool = defaultdict(lambda: defaultdict(list)) self.mask_impl = mask_impl def register_dq_dk_dv_traits(self, trait : FmhaBwdApiTrait) -> None: # TODO: do we need to check duplication? - if trait.dtype not in self.dq_dk_dv_pool.keys(): - self.dq_dk_dv_pool[trait.dtype] = dict() - if trait.hdim not in self.dq_dk_dv_pool[trait.dtype].keys(): - self.dq_dk_dv_pool[trait.dtype][trait.hdim] = list() - self.dq_dk_dv_pool[trait.dtype][trait.hdim].append(copy.copy(trait)) + @staticmethod + def if_(i: int) -> str: + return 'if' if i == 0 else 'else if' + + def _api_innders(self, traits: List[FmhaBwdApiTrait]) -> str: + inners = "" + i = 0 + for trait in traits: + inners += FMHA_BWD_API_INNER_DISPATCH.format(F_if=self.if_(i), F_mode=MODE_MAP[trait.mode], + F_mask_check=get_mask_check_map(self.mask_impl)[trait.mask], F_mask=get_mask_map(self.mask_impl)[trait.mask], F_bias_check=BIAS_CHECK_MAP[trait.bias], + F_bias=BIAS_MAP[trait.bias], F_dbias=BOOL_MAP[trait.dbias], F_dropout_check=DROPOUT_CHECK_MAP[trait.dropout], F_dropout=DROPOUT_MAP[trait.dropout], + F_scheck=trait.scheck, F_dcheck=trait.dcheck, F_dvcheck=trait.dvcheck, F_hdim=trait.hdim, F_dtype=BWD_DTYPE_MAP[trait.dtype], + F_spad1d=BOOL_MAP[trait.spad1d], F_dpad=BOOL_MAP[trait.dpad], F_dvpad=BOOL_MAP[trait.dvpad], + F_deterministic=BOOL_MAP[trait.deterministic]) + i += 1 + return inners + @property def api(self) -> str: per_dtypes=str() - for i, dtype in enumerate(self.dq_dk_dv_pool.keys()): + for i, dtype in enumerate(self.dq_dk_dv_pool): per_hdim_case=str() - for j, hdim in enumerate(self.dq_dk_dv_pool[dtype].keys()): + for j, hdim in enumerate(self.dq_dk_dv_pool[dtype]): traits=self.dq_dk_dv_pool[dtype][hdim] - inners=str() - for k, trait in enumerate(traits): - if_k = 'if' if k == 0 else 'else if' - for spad1 in ["t", "f"]: - if (spad1 == "f" and (trait.spad == "t" or trait.mode == "group")): - continue - inners = inners + FMHA_BWD_API_INNER_DISPATCH.format(F_if=if_k, F_mode=MODE_MAP[trait.mode], F_pipeline_enum=BWD_DQDKDV_PIPELINE_ENUM_MAP[trait.pipeline], - F_mask_check=get_mask_check_map(self.mask_impl)[trait.mask], F_mask=get_mask_map(self.mask_impl)[trait.mask], F_bias_check=BIAS_CHECK_MAP[trait.bias], - F_bias=BIAS_MAP[trait.bias], F_dbias=BOOL_MAP[trait.dbias], F_dropout_check=DROPOUT_CHECK_MAP[trait.dropout], F_dropout=DROPOUT_MAP[trait.dropout], - F_scheck=trait.scheck(spad1=spad1), F_skcheck=trait.skcheck, F_dcheck=trait.dcheck, F_dvcheck=trait.dvcheck, F_hdim=hdim, F_dtype=BWD_DTYPE_MAP[dtype], - F_spad0=BOOL_MAP[trait.spad], F_spad1=BOOL_MAP[spad1], F_skpad=BOOL_MAP[trait.skpad], F_dpad=BOOL_MAP[trait.dpad], F_dvpad=BOOL_MAP[trait.dvpad], - F_deterministic=BOOL_MAP[trait.deterministic]) - - if_j = 'if' if j == 0 else 'else if' - per_hdim_case = per_hdim_case + FMHA_BWD_API_PER_HDIM_CASE.format(F_if=if_j, F_hdim=hdim, F_inner_dispatch=inners) - if_i = 'if' if i == 0 else 'else if' - per_dtypes = per_dtypes + FMHA_BWD_API_PER_DTYPE.format(F_if=if_i, F_dtype=dtype, F_hdim_case=per_hdim_case) + inners = self._api_innders(traits) + per_hdim_case = per_hdim_case + FMHA_BWD_API_PER_HDIM_CASE.format(F_if=self.if_(j), F_hdim=hdim, F_inner_dispatch=inners) + per_dtypes += FMHA_BWD_API_PER_DTYPE.format(F_if=self.if_(i), F_dtype=dtype, F_hdim_case=per_hdim_case) if not per_dtypes: # empty string we add some ignore to suppress warning in api per_dtypes += ' (void)t ; (void)s ; (void)a;' @@ -730,21 +694,16 @@ def get_bwd_blobs(filter_list: str, receipt, mask_impl, optdim_list) -> Tuple[Fm d = get_fmha_bwd_dq_dk_dv_tile_ppl_dict_from_dtype(dtype) if d is None: continue - for hdim_str, mode, mask, bias, dbias, dropout, spad, spad1, skpad, dpad, dvpad, deterministic in itertools.product(d.keys(), MODE_MAP.keys(), get_mask_map(mask_impl).keys(), BIAS_MAP.keys(), ["t", "f"], DROPOUT_MAP.keys(), *([["t", "f"]] * 6)): - tile = d[hdim_str][0] - ppl = d[hdim_str][1] + for hdim_str, mode, mask, bias, dbias, dropout, spad1d, dpad, dvpad, deterministic in itertools.product(d.keys(), MODE_MAP.keys(), get_mask_map(mask_impl).keys(), BIAS_MAP.keys(), ["t", "f"], DROPOUT_MAP.keys(), *([["t", "f"]] * 4)): + tile = d[hdim_str] hdim = int(hdim_str) - if (mode == "group") and (spad == "f" or skpad == "f"): - continue - if (spad1 == "f") and (spad == "t" or mode == "group"): + if (mode == "group") and (spad1d == "f"): continue if ((bias == "no" or bias == "alibi") and dbias == "t"): continue if ("wg32" in dropout): continue - if (dpad == "t" or dvpad == "t"): - ppl = d[hdim_str][2] - t = FmhaBwdApiTrait(idx=0, pipeline=ppl, hdim=hdim, dtype=dtype, mode=mode,tile=tile,mask=mask, bias=bias, dbias=dbias, dropout=dropout, spad=spad, spad1=spad1, skpad=skpad, dpad=dpad, dvpad=dvpad, deterministic=deterministic, mask_impl=mask_impl) + t = FmhaBwdApiTrait(idx=0, hdim=hdim, dtype=dtype, mode=mode,tile=tile,mask=mask, bias=bias, dbias=dbias, dropout=dropout, spad1d=spad1d, dpad=dpad, dvpad=dvpad, deterministic=deterministic, mask_impl=mask_impl) if not fnmatch.fnmatch(t.dot_do_o_kernel.name, filter_dot_do_o): continue @@ -808,13 +767,13 @@ def get_bwd_blobs(filter_list: str, receipt, mask_impl, optdim_list) -> Tuple[Fm def write_blobs(output_dir : Path, filter_list : str, receipt, optdim_list, mask_impl) -> None: api_pool, kernels_dot_do_o, kernels_dq_dk_dv, kernels_convert_dq = get_bwd_blobs(filter_list, receipt, mask_impl, optdim_list) - (output_dir / FMHA_BWD_API_FILENAME).write_text(api_pool.api) + update_file(output_dir / FMHA_BWD_API_FILENAME, api_pool.api) for k in kernels_dot_do_o: - (output_dir / k.filename).write_text(k.template) + update_file(output_dir / k.filename, k.template) for k in kernels_convert_dq: - (output_dir / k.filename).write_text(k.template) + update_file(output_dir / k.filename, k.template) for k in kernels_dq_dk_dv: - (output_dir / k.filename).write_text(k.template) + update_file(output_dir / k.filename, k.template) def list_blobs(file_path: Path, filter_list: str, receipt, optdim_list, mask_impl) -> None: diff --git a/example/ck_tile/01_fmha/codegen/ops/fmha_fwd.py b/example/ck_tile/01_fmha/codegen/ops/fmha_fwd.py index 730641a6b0..269af4e6a7 100644 --- a/example/ck_tile/01_fmha/codegen/ops/fmha_fwd.py +++ b/example/ck_tile/01_fmha/codegen/ops/fmha_fwd.py @@ -533,20 +533,31 @@ class KernelComponentFactory: pipelines = [] if dtype in ['fp16', 'bf16']: for logits, mask, bias, lse, dropout, skip in itertools.product(["t", "f"], get_mask_map(mask_impl).keys(), BIAS_MAP.keys(), ["t", "f"], ["t", "f"], ["t", "f"]): - if bias == "bias": - # TODO: rocm 6.2 compiler problem if using qr_async for bias case + if hdim == 256 and hdim_v == 256: + # if True: pipelines.append(FmhaFwdPipeline('qr', 'row', 'f', 'f', 'f', 'f', logits, bias, lse, dropout, squant, mask, skip)) - pipelines.append(FmhaFwdPipeline('qr', 'row', 't', 't', 't', 't', logits, bias, lse, dropout, squant, mask, skip)) pipelines.append(FmhaFwdPipeline('qr', 'col', 'f', 'f', 'f', 'f', logits, bias, lse, dropout, squant, mask, skip)) + # the below two is used for hdim vectorize load + pipelines.append(FmhaFwdPipeline('qr', 'row', 't', 't', 'f', 'f', logits, bias, lse, dropout, squant, mask, skip)) + pipelines.append(FmhaFwdPipeline('qr', 'col', 't', 't', 'f', 'f', logits, bias, lse, dropout, squant, mask, skip)) + + pipelines.append(FmhaFwdPipeline('qr', 'row', 't', 't', 't', 't', logits, bias, lse, dropout, squant, mask, skip)) pipelines.append(FmhaFwdPipeline('qr', 'col', 't', 't', 't', 't', logits, bias, lse, dropout, squant, mask, skip)) else: - pipelines.append(FmhaFwdPipeline('qr_async', 'row', 't', 'f', 't', 't', logits, bias, lse, dropout, squant, mask, skip)) - pipelines.append(FmhaFwdPipeline('qr_async', 'row', 't', 't', 't', 't', logits, bias, lse, dropout, squant, mask, skip)) - pipelines.append(FmhaFwdPipeline('qr_async', 'col', 't', 'f', 't', 't', logits, bias, lse, dropout, squant, mask, skip)) - pipelines.append(FmhaFwdPipeline('qr_async', 'col', 't', 't', 't', 't', logits, bias, lse, dropout, squant, mask, skip)) - if receipt == 1 and bias != "bias": - pipelines.append(FmhaFwdPipeline('qr', 'row', 't', 't', 't', 't', logits, bias, lse, dropout, squant, mask, skip)) # TODO: cover arbitraty hdim - pipelines.append(FmhaFwdPipeline('qr', 'col', 't', 'f', 't', 't', logits, bias, lse, dropout, squant, mask, skip)) # TODO: cover arbitraty hdim + if bias == "bias": + # TODO: rocm 6.2 compiler problem if using qr_async for bias case + pipelines.append(FmhaFwdPipeline('qr', 'row', 'f', 'f', 'f', 'f', logits, bias, lse, dropout, squant, mask, skip)) + pipelines.append(FmhaFwdPipeline('qr', 'row', 't', 't', 't', 't', logits, bias, lse, dropout, squant, mask, skip)) + pipelines.append(FmhaFwdPipeline('qr', 'col', 'f', 'f', 'f', 'f', logits, bias, lse, dropout, squant, mask, skip)) + pipelines.append(FmhaFwdPipeline('qr', 'col', 't', 't', 't', 't', logits, bias, lse, dropout, squant, mask, skip)) + else: + pipelines.append(FmhaFwdPipeline('qr_async', 'row', 't', 'f', 't', 't', logits, bias, lse, dropout, squant, mask, skip)) + pipelines.append(FmhaFwdPipeline('qr_async', 'row', 't', 't', 't', 't', logits, bias, lse, dropout, squant, mask, skip)) + pipelines.append(FmhaFwdPipeline('qr_async', 'col', 't', 'f', 't', 't', logits, bias, lse, dropout, squant, mask, skip)) + pipelines.append(FmhaFwdPipeline('qr_async', 'col', 't', 't', 't', 't', logits, bias, lse, dropout, squant, mask, skip)) + if receipt == 1 and bias != "bias": + pipelines.append(FmhaFwdPipeline('qr', 'row', 't', 't', 't', 't', logits, bias, lse, dropout, squant, mask, skip)) # TODO: cover arbitraty hdim + pipelines.append(FmhaFwdPipeline('qr', 'col', 't', 'f', 't', 't', logits, bias, lse, dropout, squant, mask, skip)) # TODO: cover arbitraty hdim elif dtype in ['fp8', 'bf8']: # no need lse/dropout kernels for logits, mask, bias in itertools.product(["t", "f"], get_mask_map(mask_impl).keys(), BIAS_MAP.keys()): @@ -584,7 +595,7 @@ def get_fwd_blobs(kernel_filter : Optional[str], receipt, optdim_list, mask_impl if pipeline.F_spad != 't' or pipeline.F_skpad != 't': # in group mode, spad/skpad must be true, since we can't predict if seqlen of current batch need pad or not continue - if (hdim, hdim_v) == (192, 128) or hdim == 160: + if (hdim, hdim_v) == (192, 128): # NOTE: this is used to speedup deepseek prefill case, we don't gen training if pipeline.F_bias != 'no' or pipeline.F_dropout == 't': continue diff --git a/example/ck_tile/01_fmha/codegen/ops/fmha_fwd_splitkv.py b/example/ck_tile/01_fmha/codegen/ops/fmha_fwd_splitkv.py index 5b35e7f0bd..0e4ac44d45 100644 --- a/example/ck_tile/01_fmha/codegen/ops/fmha_fwd_splitkv.py +++ b/example/ck_tile/01_fmha/codegen/ops/fmha_fwd_splitkv.py @@ -41,7 +41,6 @@ K0_MAX_SUBMAX_MAP = { FMHA_FWD_SPLITKV_PIPELINE_MAP = { "qr" : "ck_tile::BlockFmhaFwdSplitKVPipelineQRKSVS", "qr_nwarp_sshuffle" : "ck_tile::BlockFmhaFwdSplitKVPipelineNWarpSShuffleQRKSVS", - "qr_async" : "ck_tile::BlockFmhaFwdSplitKVPipelineQRKSVSAsync", } FMHA_FWD_SPLITKV_KERNEL_BODY=""" @@ -685,28 +684,17 @@ def get_fwd_splitkv_blobs(kernel_filter : Optional[str], receipt, mask_impl, opt pipelines = [] if dtype in ['fp16', 'bf16']: for logits, mask, bias, pagedkv in itertools.product(["t", "f"], get_mask_map(mask_impl).keys(), BIAS_MAP.keys(), ["t", "f"]): - # TODO: use async pipeline when compiler is more stable - if hdim == 256 or hdim in [32, 64, 128]: ### [32, 64, 96, 128, 160]: - # if True: - pipelines.append(Pipeline('qr', 'row', 'f', 't', 'f', 'f', logits, bias, 't', squant, pagedkv, mask)) - pipelines.append(Pipeline('qr', 'col', 'f', 't', 'f', 'f', logits, bias, 't', squant, pagedkv, mask)) + pipelines.append(Pipeline('qr', 'row', 'f', 't', 'f', 'f', logits, bias, 't', squant, pagedkv, mask)) + pipelines.append(Pipeline('qr', 'col', 'f', 't', 'f', 'f', logits, bias, 't', squant, pagedkv, mask)) - pipelines.append(Pipeline('qr', 'row', 't', 'f', 'f', 'f', logits, bias, 't', squant, pagedkv, mask)) - pipelines.append(Pipeline('qr', 'col', 't', 'f', 'f', 'f', logits, bias, 't', squant, pagedkv, mask)) + pipelines.append(Pipeline('qr', 'row', 't', 'f', 'f', 'f', logits, bias, 't', squant, pagedkv, mask)) + pipelines.append(Pipeline('qr', 'col', 't', 'f', 'f', 'f', logits, bias, 't', squant, pagedkv, mask)) - pipelines.append(Pipeline('qr', 'row', 't', 't', 'f', 'f', logits, bias, 't', squant, pagedkv, mask)) - pipelines.append(Pipeline('qr', 'col', 't', 't', 'f', 'f', logits, bias, 't', squant, pagedkv, mask)) + pipelines.append(Pipeline('qr', 'row', 't', 't', 'f', 'f', logits, bias, 't', squant, pagedkv, mask)) + pipelines.append(Pipeline('qr', 'col', 't', 't', 'f', 'f', logits, bias, 't', squant, pagedkv, mask)) - pipelines.append(Pipeline('qr', 'row', 't', 't', 't', 't', logits, bias, 't', squant, pagedkv, mask)) - pipelines.append(Pipeline('qr', 'col', 't', 't', 't', 't', logits, bias, 't', squant, pagedkv, mask)) - else: - pipelines.append(Pipeline('qr_async', 'row', 't', 'f', 't', 't', logits, bias, 't', squant, pagedkv, mask)) - pipelines.append(Pipeline('qr_async', 'row', 't', 't', 't', 't', logits, bias, 't', squant, pagedkv, mask)) - pipelines.append(Pipeline('qr_async', 'col', 't', 'f', 't', 't', logits, bias, 't', squant, pagedkv, mask)) - pipelines.append(Pipeline('qr_async', 'col', 't', 't', 't', 't', logits, bias, 't', squant, pagedkv, mask)) - if receipt == 1: - pipelines.append(Pipeline('qr', 'row', 't', 't', 't', 't', logits, bias, 't', squant, pagedkv, mask)) # TODO: cover arbitraty hdim - pipelines.append(Pipeline('qr', 'col', 't', 'f', 't', 't', logits, bias, 't', squant, pagedkv, mask)) # TODO: cover arbitraty hdim + pipelines.append(Pipeline('qr', 'row', 't', 't', 't', 't', logits, bias, 't', squant, pagedkv, mask)) + pipelines.append(Pipeline('qr', 'col', 't', 't', 't', 't', logits, bias, 't', squant, pagedkv, mask)) elif dtype in ['fp8', 'bf8']: for logits, mask, bias in itertools.product(["t", "f"], get_mask_map(mask_impl).keys(), BIAS_MAP.keys()): pipelines.append(Pipeline('qr', 'col', 'f', 'f', 'f', 'f', logits, bias, 't', squant, 'f', mask)) diff --git a/example/ck_tile/01_fmha/codegen/utils.py b/example/ck_tile/01_fmha/codegen/utils.py new file mode 100644 index 0000000000..e3bbb18c42 --- /dev/null +++ b/example/ck_tile/01_fmha/codegen/utils.py @@ -0,0 +1,21 @@ +# SPDX-License-Identifier: MIT +# Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved. +# generate kernel instances to speed up compilation + +import os.path as path + + +def update_file(file_path, content): + """Update the file at file_path with the given content if it differs from the existing content. + + It avoids unnecessary touching of the file which triggers rebuilds + """ + + existing_content = "" + if path.exists(file_path): + with open(file_path, "r") as file: + existing_content = file.read() + if existing_content == content: + return + with open(file_path, "w") as file: + file.write(content) diff --git a/example/ck_tile/01_fmha/fmha_bwd.hpp b/example/ck_tile/01_fmha/fmha_bwd.hpp index 9179dbd9be..c999cf750e 100644 --- a/example/ck_tile/01_fmha/fmha_bwd.hpp +++ b/example/ck_tile/01_fmha/fmha_bwd.hpp @@ -1,5 +1,5 @@ // SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved. #pragma once @@ -357,31 +357,25 @@ auto fmha_bwd_convert_dq_create_kargs_and_grids(fmha_bwd_args args) template struct fmha_bwd_dq_dk_dv_traits_ { - static constexpr ck_tile::index_t HDim = HDim_; - using DataType = ck_tile::remove_cvref_t; - static constexpr bool kIsGroupMode = kIsGroupMode_; - static constexpr auto FmhaBwdPipelineEnum = FmhaBwdPipelineEnum_; - using FmhaMask = ck_tile::remove_cvref_t; - using FmhaDropout = ck_tile::remove_cvref_t; - static constexpr auto BiasEnum = BiasEnum_; - static constexpr bool kHasBiasGrad = kHasBiasGrad_; - static constexpr bool kPadS = kPadS_; - static constexpr bool kPadSK = kPadSK_; - static constexpr bool kPadD = kPadD_; - static constexpr bool kPadDv = kPadDv_; - static constexpr bool kIsDeterministic = kIsDeterministic_; + static constexpr ck_tile::index_t HDim = HDim_; + using DataType = ck_tile::remove_cvref_t; + static constexpr bool kIsGroupMode = kIsGroupMode_; + using FmhaMask = ck_tile::remove_cvref_t; + using FmhaDropout = ck_tile::remove_cvref_t; + static constexpr auto BiasEnum = BiasEnum_; + static constexpr bool kHasBiasGrad = kHasBiasGrad_; + static constexpr bool kPadD = kPadD_; + static constexpr bool kPadDv = kPadDv_; + static constexpr bool kIsDeterministic = kIsDeterministic_; }; template diff --git a/example/ck_tile/03_gemm/CMakeLists.txt b/example/ck_tile/03_gemm/CMakeLists.txt index e6f67e4c76..b1aede42c7 100644 --- a/example/ck_tile/03_gemm/CMakeLists.txt +++ b/example/ck_tile/03_gemm/CMakeLists.txt @@ -10,7 +10,7 @@ list(APPEND EXAMPLE_GEMM_COMPILE_OPTIONS -mllvm -enable-noalias-to-md-conversion list(APPEND EXAMPLE_WEIGHT_PRESHUFFLE_COMPILE_OPTIONS -Wno-unused-local-typedef) list(APPEND EXAMPLE_WEIGHT_PRESHUFFLE_COMPILE_OPTIONS -Wno-gnu-line-marker) list(APPEND EXAMPLE_WEIGHT_PRESHUFFLE_COMPILE_OPTIONS --save-temps) -list(APPEND EXAMPLE_WEIGHT_PRESHUFFLE_COMPILE_OPTIONS "SHELL: -mllvm -greedy-reverse-local-assignment=1 -mllvm --slp-threshold=-32 -mllvm -enable-noalias-to-md-conversion=0") +list(APPEND EXAMPLE_WEIGHT_PRESHUFFLE_COMPILE_OPTIONS "SHELL: -mllvm -greedy-reverse-local-assignment=1 -mllvm -enable-noalias-to-md-conversion=0") target_compile_options(tile_example_gemm_basic PRIVATE ${EXAMPLE_GEMM_COMPILE_OPTIONS}) target_compile_options(tile_example_gemm_universal PRIVATE ${EXAMPLE_GEMM_COMPILE_OPTIONS}) target_compile_options(tile_example_gemm_weight_preshuffle PRIVATE ${EXAMPLE_WEIGHT_PRESHUFFLE_COMPILE_OPTIONS}) diff --git a/example/ck_tile/05_reduce/reduce.cpp b/example/ck_tile/05_reduce/reduce.cpp index 602661f779..cf816caa88 100644 --- a/example/ck_tile/05_reduce/reduce.cpp +++ b/example/ck_tile/05_reduce/reduce.cpp @@ -1,16 +1,21 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2024-2025, Advanced Micro Devices, Inc. All rights reserved. + #include "ck_tile/host.hpp" -#include "reduce.hpp" +#include "ck_tile/ops/reduce.hpp" #include auto create_args(int argc, char* argv[]) { ck_tile::ArgParser arg_parser; - arg_parser.insert("m", "3328", "m dimension") - .insert("n", "4096", "n dimension") + arg_parser.insert("n", "32", "n dimension") + .insert("h", "7", "h dimension") + .insert("w", "7", "w dimension") + .insert("c", "512", "c dimension") .insert("v", "1", "cpu validation or not") .insert("prec", "fp16", "precision") - .insert("warmup", "5", "cold iter") - .insert("repeat", "20", "hot iter"); + .insert("warmup", "0", "cold iter") + .insert("repeat", "1", "hot iter"); bool result = arg_parser.parse(argc, argv); return std::make_tuple(result, arg_parser); @@ -23,15 +28,28 @@ bool run(const ck_tile::ArgParser& arg_parser) using ComputeDataType = float; using YDataType = DataType; - ck_tile::index_t m = arg_parser.get_int("m"); - ck_tile::index_t n = arg_parser.get_int("n"); + ck_tile::index_t N = arg_parser.get_int("n"); + ck_tile::index_t H = arg_parser.get_int("h"); + ck_tile::index_t W = arg_parser.get_int("w"); + ck_tile::index_t C = arg_parser.get_int("c"); int do_validation = arg_parser.get_int("v"); int warmup = arg_parser.get_int("warmup"); int repeat = arg_parser.get_int("repeat"); - ck_tile::HostTensor x_host({m, n}); - ck_tile::HostTensor y_host_ref({m}); - ck_tile::HostTensor y_host_dev({m}); + std::vector problem_shape = {N, H, W, C}; + std::vector strides(4); + strides[0] = H * W * C; + strides[1] = W * C; + strides[2] = C; + strides[3] = 1; + + // Define reduction specification: + constexpr auto kept_dim = ck_tile::sequence<0, 3>{}; // Which dimension to keep + constexpr auto reduce_dims = ck_tile::sequence<1, 2>{}; // Which dimensions to reduce + + ck_tile::HostTensor x_host(problem_shape, strides); + ck_tile::HostTensor y_host_ref({N, C}, {C, 1}); + ck_tile::HostTensor y_host_dev({N, C}, {C, 1}); ck_tile::FillUniformDistribution{-5.f, 5.f}(x_host); @@ -54,7 +72,9 @@ bool run(const ck_tile::ArgParser& arg_parser) constexpr ck_tile::index_t kBlockSize = 256; constexpr ck_tile::index_t kBlockPerCu = 1; - ck_tile::index_t kGridSize = (m / BlockTile::at(ck_tile::number<0>{})); + ck_tile::index_t kept_dim_len_prod = N * C; + ck_tile::index_t kGridSize = (kept_dim_len_prod + BlockTile::at(ck_tile::number<0>{}) - 1) / + BlockTile::at(ck_tile::number<0>{}); std::cout << "grid size " << kGridSize << std::endl; using Shape = ck_tile::Reduce2dShape; @@ -63,6 +83,17 @@ bool run(const ck_tile::ArgParser& arg_parser) using Kernel = ck_tile::Reduce; + // Create input tensor shape and strides + auto input_shape = + ck_tile::make_tuple(problem_shape[0], problem_shape[1], problem_shape[2], problem_shape[3]); + auto input_strides = ck_tile::make_tuple(strides[0], strides[1], strides[2], strides[3]); + + if(!Kernel::IsSupportedArgument( + C, input_strides)) // output tensor's continuous dimension and input strides + { + throw std::runtime_error("Wrong! Arguments not supported!\n"); + } + float ave_time = launch_kernel(ck_tile::stream_config{nullptr, true, 0, warmup, repeat}, ck_tile::make_kernel( Kernel{}, @@ -71,10 +102,12 @@ bool run(const ck_tile::ArgParser& arg_parser) 0, static_cast(x_buf.GetDeviceBuffer()), static_cast(y_buf.GetDeviceBuffer()), - m, - n)); + input_shape, + input_strides, + kept_dim, + reduce_dims)); - std::size_t num_btype = sizeof(XDataType) * m * n + sizeof(YDataType) * m; + std::size_t num_btype = sizeof(XDataType) * N * C * H * W + sizeof(YDataType) * N * C; float gb_per_sec = num_btype / 1.E6 / ave_time; @@ -86,7 +119,7 @@ bool run(const ck_tile::ArgParser& arg_parser) { // reference ck_tile::reference_reduce( - x_host, y_host_ref, ReduceOp{}); + x_host, y_host_ref, ReduceOp{}, kept_dim, reduce_dims); y_buf.FromDevice(y_host_dev.mData.data()); pass = ck_tile::check_err(y_host_dev, y_host_ref); diff --git a/example/ck_tile/05_reduce/reduce.hpp b/example/ck_tile/05_reduce/reduce.hpp deleted file mode 100644 index 6fbb0b4274..0000000000 --- a/example/ck_tile/05_reduce/reduce.hpp +++ /dev/null @@ -1,164 +0,0 @@ -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -#pragma once - -#include "ck_tile/core.hpp" -#include "ck_tile/ops/common.hpp" -#include "ck_tile/ops/reduce/block/block_reduce.hpp" -#include "ck_tile/ops/reduce/block/block_reduce2d_default_policy.hpp" - -namespace ck_tile { - -template - typename BlockTile, // block size, seq - typename WarpTile, // warp size, seq - typename Vector> // contiguous pixels(vector size) along seq -struct Reduce2dShape -{ - static constexpr index_t Block_M = BlockTile::at(number<0>{}); - static constexpr index_t Block_N = BlockTile::at(number<1>{}); - - static constexpr index_t Warp_M = WarpTile::at(number<0>{}); - static constexpr index_t Warp_N = WarpTile::at(number<1>{}); - - static constexpr index_t Vector_M = Vector::at(number<0>{}); - static constexpr index_t Vector_N = Vector::at(number<1>{}); - - static constexpr index_t WarpPerBlock_M = BlockWarps::at(number<0>{}); - static constexpr index_t WarpPerBlock_N = BlockWarps::at(number<1>{}); - - static constexpr index_t ThreadPerWarp_M = Warp_M / Vector_M; - static constexpr index_t ThreadPerWarp_N = Warp_N / Vector_N; - - static constexpr index_t Repeat_M = Block_M / (WarpPerBlock_M * Warp_M); - static constexpr index_t Repeat_N = Block_N / (WarpPerBlock_N * Warp_N); - - static constexpr index_t BlockSize = - ck_tile::get_warp_size() * reduce_on_sequence(BlockWarps{}, multiplies{}, number<1>{}); -}; - -template -struct Reduce2dProblem -{ - using XDataType = remove_cvref_t; - using ComputeDataType = remove_cvref_t; - using YDataType = remove_cvref_t; - using BlockShape = remove_cvref_t; - using ReduceOp = ReduceOp_; - - static constexpr bool kNeedCrossLaneSync = BlockShape::ThreadPerWarp_N > 1; - static constexpr bool kNeedCrossWarpSync = BlockShape::WarpPerBlock_N > 1; -}; - -template -struct Reduce -{ - using Problem = ck_tile::remove_cvref_t; - using Policy = ck_tile::remove_cvref_t; - - using XDataType = ck_tile::remove_cvref_t; - using ComputeDataType = ck_tile::remove_cvref_t; - using YDataType = ck_tile::remove_cvref_t; - -#if 0 - CK_TILE_DEVICE void operator()(const XDataType* p_x, YDataType* p_y, index_t M, index_t N) - const - { - using S = typename Problem::BlockShape; - - const auto x_m_n = make_naive_tensor_view( - p_x, make_tuple(M, N), make_tuple(N, 1), number{}, number<1>{}); - - const auto y_m = make_naive_tensor_view_packed( - p_y, make_tuple(M), number<1>{}); - - const auto iM = get_block_id() * S::Block_M; - - auto x_window = make_tile_window(x_m_n, - make_tuple(number{}, number{}), - {iM, 0}, - Policy::template MakeXBlockTileDistribution()); - - auto y_window = make_tile_window(y_m, make_tuple(number{}), {iM}); - - const auto f_reduce = [](const auto& v0, const auto& v1) { return v0 + v1; }; - - const XDataType reduce_init_value = 0; - - constexpr auto reduce_dims = sequence<1>{}; - - auto y_compute = decltype(block_tile_reduce( - load_tile(x_window), reduce_dims, f_reduce, reduce_init_value)){}; - - set_tile(y_compute, reduce_init_value); - - index_t num_n_tile_iteration = - __builtin_amdgcn_readfirstlane(integer_divide_ceil(N, S::Block_N)); - - for(int iN = __builtin_amdgcn_readfirstlane(0); iN < num_n_tile_iteration; ++iN) - { - const auto x = load_tile(x_window); - block_tile_reduce(y_compute, x, reduce_dims, f_reduce); - move_tile_window(x_window, {0, S::Block_N}); - } - - block_tile_reduce_sync(y_compute, f_reduce); - - store_tile(y_window, cast_tile(y_compute)); - } -#else - CK_TILE_DEVICE void operator()(const XDataType* p_x, YDataType* p_y, index_t M, index_t N) const - { - using S = typename Problem::BlockShape; - - const auto x_m_n = make_naive_tensor_view( - p_x, make_tuple(M, N), make_tuple(N, 1), number{}, number<1>{}); - - const auto y_m = make_naive_tensor_view_packed( - p_y, make_tuple(M), number<1>{}); - - const auto iM = get_block_id() * S::Block_M; - - auto x_window = make_tile_window(x_m_n, - make_tuple(number{}, number{}), - {iM, 0}, - Policy::template MakeXBlockTileDistribution()); - - auto y_window = make_tile_window(y_m, make_tuple(number{}), {iM}); - - __shared__ char smem[Policy::template GetSmemSize()]; - - index_t num_n_tile_iteration = - __builtin_amdgcn_readfirstlane(integer_divide_ceil(N, S::Block_N)); - - auto reduce_func = typename Problem::ReduceOp{}; - auto block_reduce2d = Policy::template GetBlockReduce2d(); - auto block_reduce2d_sync = Policy::template GetBlockReduce2dSync(); - auto block_reduce2d_cross_warp_sync = - Policy::template GetBlockReduce2dCrossWarpSync(); - - using XTensorType = decltype(load_tile(x_window)); - auto y_compute = block_reduce2d.template MakeYBlockTile(); - set_tile(y_compute, reduce_func.template GetIdentityValue()); - - for(int iN = __builtin_amdgcn_readfirstlane(0); iN < num_n_tile_iteration; ++iN) - { - const auto x = load_tile(x_window); - block_reduce2d(x, y_compute, reduce_func); - move_tile_window(x_window, {0, S::Block_N}); - } - - block_reduce2d_sync(y_compute, reduce_func); - block_reduce2d_cross_warp_sync(y_compute, smem, reduce_func); - - store_tile(y_window, cast_tile(y_compute)); - } -#endif -}; - -} // namespace ck_tile diff --git a/example/ck_tile/17_grouped_gemm/CMakeLists.txt b/example/ck_tile/17_grouped_gemm/CMakeLists.txt index 79df4e624d..475c13166d 100644 --- a/example/ck_tile/17_grouped_gemm/CMakeLists.txt +++ b/example/ck_tile/17_grouped_gemm/CMakeLists.txt @@ -1,2 +1 @@ add_executable(tile_example_grouped_gemm EXCLUDE_FROM_ALL grouped_gemm.cpp) -add_executable(tile_example_grouped_gemm_tileloop EXCLUDE_FROM_ALL grouped_gemm_tileloop.cpp) diff --git a/example/ck_tile/17_grouped_gemm/grouped_gemm.cpp b/example/ck_tile/17_grouped_gemm/grouped_gemm.cpp index bb0a0d5840..897952f03c 100644 --- a/example/ck_tile/17_grouped_gemm/grouped_gemm.cpp +++ b/example/ck_tile/17_grouped_gemm/grouped_gemm.cpp @@ -1,5 +1,5 @@ // SPDX-License-Identifier: MIT -// Copyright (c) 2024-2025, Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved. #include @@ -16,19 +16,11 @@ #include "ck_tile/host.hpp" #include "grouped_gemm.hpp" -template -float grouped_gemm(const std::vector& gemm_descs, - const ck_tile::stream_config& s, - void* kargs_ptr) +template +float grouped_gemm_tileloop(const ck_tile::stream_config& s, + const ck_tile::index_t num_groups, + void* kargs_ptr, + bool splitk) { #if(CK_TILE_PIPELINE_DEFAULT == CK_TILE_PIPELINE_MEMORY) // Memory friendly for Interwave scheduler @@ -83,8 +75,6 @@ float grouped_gemm(const std::vector& gemm_descs, constexpr bool kPadN = false; constexpr bool kPadK = false; - constexpr bool TransposeC = false; - constexpr int kBlockPerCu = 1; constexpr ck_tile::index_t TileParitionerGroupNum = 8; constexpr ck_tile::index_t TileParitionerM01 = 4; @@ -97,54 +87,41 @@ float grouped_gemm(const std::vector& gemm_descs, GemmSpatiallyLocalTilePartitioner; using Traits = ck_tile::TileGemmTraits; - using GemmUniversalTraits = ck_tile::TileGemmUniversalTraits; + using GemmUniversalTraits = ck_tile::PersistentTileGemmUniversalTraits; using GemmPipelineProblem = ck_tile::GemmPipelineProblem; - using BaseGemmPipeline = UNIVERSAL_GEMM_PIPELINE; - - const ck_tile::index_t k_grain = gemm_descs[0].k_batch * K_Tile; - const ck_tile::index_t K_split = (gemm_descs[0].K + k_grain - 1) / k_grain * K_Tile; - const ck_tile::index_t num_loop = TilePartitioner::GetLoopNum(K_split); - const bool has_hot_loop = BaseGemmPipeline::BlockHasHotloop(num_loop); - const ck_tile::TailNumber tail_num = BaseGemmPipeline::GetBlockLoopTailNum(num_loop); - float ave_time{0}; - const auto Run = [&](const auto has_hot_loop_, - const auto tail_number_, - const auto memory_operation_) { - constexpr bool has_hot_loop_v = has_hot_loop_.value; - constexpr auto tail_number_v = tail_number_.value; + const auto Run = [&](const auto memory_operation_) { constexpr auto scheduler = GEMM_PIPELINE_SCHEDULER; constexpr auto memory_operation = memory_operation_.value; + // We create the GEMM pipeline without specifying hotloop or tailnumber. + // These are automatically run inside the kernel based on the given input data. using UniversalGemmProblem = ck_tile::UniversalGemmPipelineProblem; + scheduler>; using GemmPipeline = GEMM_PIPELINE; using GemmEpilogue = ck_tile::CShuffleEpilogue< ck_tile::CShuffleEpilogueProblem, AccDataType, CDataType, - DsLayout, + ck_tile::tuple<>, CLayout, - CDEElementWise, + ck_tile::element_wise::PassThrough, GemmPipelineProblem::kBlockSize, TilePartitioner::MPerBlock, TilePartitioner::NPerBlock, @@ -156,20 +133,8 @@ float grouped_gemm(const std::vector& gemm_descs, UniversalGemmProblem::TransposeC, memory_operation>>; using Kernel = ck_tile::GroupedGemmKernel; - auto kargs = Kernel::MakeKargs(gemm_descs); - if(!Kernel::IsSupportedArgument(kargs)) - { - throw std::runtime_error("Kernel arguments not supported!"); - } - constexpr dim3 blocks = Kernel::BlockSize(); - const dim3 grids = Kernel::GridSize(gemm_descs); - - HIP_CHECK_ERROR(hipMemcpyWithStream(kargs_ptr, - kargs.data(), - get_workspace_size(gemm_descs), - hipMemcpyHostToDevice, - s.stream_id_)); + const dim3 grids = Kernel::MaxOccupancyGridSize(s); if(s.log_level_ > 0) { @@ -186,45 +151,26 @@ float grouped_gemm(const std::vector& gemm_descs, blocks, 0, ck_tile::cast_pointer_to_constant_address_space(kargs_ptr), - gemm_descs.size())); + num_groups)); return ave_time; }; - const auto RunSplitk = [&](const auto has_hot_loop_, const auto tail_number_) { - if(gemm_descs[0].k_batch == 1) - { - Run(has_hot_loop_, - tail_number_, - ck_tile::integral_constant{}); - } - else - { - Run(has_hot_loop_, - tail_number_, - ck_tile::integral_constant{}); - } - }; - - BaseGemmPipeline::TailHandler(RunSplitk, has_hot_loop, tail_num); + if(!splitk) + { + Run(ck_tile::integral_constant{}); + } + else + { + Run(ck_tile::integral_constant{}); + } return ave_time; } #include "run_grouped_gemm_example.inc" -constexpr bool Persistent = false; -int main(int argc, char* argv[]) -{ - try - { - return !run_grouped_gemm_example(argc, argv); - } - catch(const std::runtime_error& e) - { - std::cerr << "Runtime error: " << e.what() << '\n'; - return EXIT_FAILURE; - } -} +constexpr bool Persistent = true; +int main(int argc, char* argv[]) { return !run_grouped_gemm_example(argc, argv); } diff --git a/example/ck_tile/17_grouped_gemm/grouped_gemm.hpp b/example/ck_tile/17_grouped_gemm/grouped_gemm.hpp index 74efb1bdeb..89d91fbef6 100644 --- a/example/ck_tile/17_grouped_gemm/grouped_gemm.hpp +++ b/example/ck_tile/17_grouped_gemm/grouped_gemm.hpp @@ -15,7 +15,7 @@ #define CK_TILE_PIPELINE_COMPUTE_V4 3 #ifndef CK_TILE_PIPELINE_DEFAULT -#define CK_TILE_PIPELINE_DEFAULT CK_TILE_PIPELINE_COMPUTE_V3 +#define CK_TILE_PIPELINE_DEFAULT CK_TILE_PIPELINE_COMPUTE_V4 #endif #if(CK_TILE_PIPELINE_DEFAULT == CK_TILE_PIPELINE_MEMORY) diff --git a/example/ck_tile/17_grouped_gemm/grouped_gemm_tileloop.cpp b/example/ck_tile/17_grouped_gemm/grouped_gemm_tileloop.cpp deleted file mode 100644 index 897952f03c..0000000000 --- a/example/ck_tile/17_grouped_gemm/grouped_gemm_tileloop.cpp +++ /dev/null @@ -1,176 +0,0 @@ -// SPDX-License-Identifier: MIT -// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved. - -#include - -#include -#include -#include -#include -#include -#include - -#include "ck_tile/core.hpp" -#include "ck_tile/ops/epilogue.hpp" -#include "ck_tile/ops/gemm.hpp" -#include "ck_tile/host.hpp" -#include "grouped_gemm.hpp" - -template -float grouped_gemm_tileloop(const ck_tile::stream_config& s, - const ck_tile::index_t num_groups, - void* kargs_ptr, - bool splitk) -{ -#if(CK_TILE_PIPELINE_DEFAULT == CK_TILE_PIPELINE_MEMORY) - // Memory friendly for Interwave scheduler - constexpr ck_tile::index_t M_Tile = 128; - constexpr ck_tile::index_t N_Tile = 32; - constexpr ck_tile::index_t K_Tile = 64; - - constexpr ck_tile::index_t M_Warp = 4; - constexpr ck_tile::index_t N_Warp = 1; - constexpr ck_tile::index_t K_Warp = 1; - - constexpr ck_tile::index_t M_Warp_Tile = 32; - constexpr ck_tile::index_t N_Warp_Tile = 32; - constexpr ck_tile::index_t K_Warp_Tile = 8; - - constexpr bool DoubleSmemBuffer = false; -#endif -#if(CK_TILE_PIPELINE_DEFAULT == CK_TILE_PIPELINE_COMPUTE_V3) - // Compute friendly for Intrawave scheduler - constexpr ck_tile::index_t M_Tile = 256; - constexpr ck_tile::index_t N_Tile = 256; - constexpr ck_tile::index_t K_Tile = 64; - - constexpr ck_tile::index_t M_Warp = 2; - constexpr ck_tile::index_t N_Warp = 2; - constexpr ck_tile::index_t K_Warp = 1; - - constexpr ck_tile::index_t M_Warp_Tile = 32; - constexpr ck_tile::index_t N_Warp_Tile = 32; - constexpr ck_tile::index_t K_Warp_Tile = 16; - - constexpr bool DoubleSmemBuffer = false; -#elif(CK_TILE_PIPELINE_DEFAULT == CK_TILE_PIPELINE_COMPUTE_V4) - // Compute friendly for Intrawave scheduler - // Using the ping pong reader in the lds level - constexpr ck_tile::index_t M_Tile = 256; - constexpr ck_tile::index_t N_Tile = 256; - constexpr ck_tile::index_t K_Tile = 32; - - constexpr ck_tile::index_t M_Warp = 2; - constexpr ck_tile::index_t N_Warp = 2; - constexpr ck_tile::index_t K_Warp = 1; - - constexpr ck_tile::index_t M_Warp_Tile = 32; - constexpr ck_tile::index_t N_Warp_Tile = 32; - constexpr ck_tile::index_t K_Warp_Tile = 16; - - constexpr bool DoubleSmemBuffer = true; -#endif - - constexpr bool kPadM = false; - constexpr bool kPadN = false; - constexpr bool kPadK = false; - - constexpr int kBlockPerCu = 1; - constexpr ck_tile::index_t TileParitionerGroupNum = 8; - constexpr ck_tile::index_t TileParitionerM01 = 4; - - using GemmShape = - ck_tile::TileGemmShape, - ck_tile::sequence, - ck_tile::sequence>; - using TilePartitioner = ck_tile:: - GemmSpatiallyLocalTilePartitioner; - - using Traits = ck_tile::TileGemmTraits; - using GemmUniversalTraits = ck_tile::PersistentTileGemmUniversalTraits; - using GemmPipelineProblem = - ck_tile::GemmPipelineProblem; - - float ave_time{0}; - - const auto Run = [&](const auto memory_operation_) { - constexpr auto scheduler = GEMM_PIPELINE_SCHEDULER; - constexpr auto memory_operation = memory_operation_.value; - - // We create the GEMM pipeline without specifying hotloop or tailnumber. - // These are automatically run inside the kernel based on the given input data. - using UniversalGemmProblem = ck_tile::UniversalGemmPipelineProblem; - - using GemmPipeline = GEMM_PIPELINE; - using GemmEpilogue = ck_tile::CShuffleEpilogue< - ck_tile::CShuffleEpilogueProblem, - AccDataType, - CDataType, - ck_tile::tuple<>, - CLayout, - ck_tile::element_wise::PassThrough, - GemmPipelineProblem::kBlockSize, - TilePartitioner::MPerBlock, - TilePartitioner::NPerBlock, - M_Warp, - N_Warp, - M_Warp_Tile, - N_Warp_Tile, - K_Warp_Tile, - UniversalGemmProblem::TransposeC, - memory_operation>>; - using Kernel = ck_tile::GroupedGemmKernel; - constexpr dim3 blocks = Kernel::BlockSize(); - const dim3 grids = Kernel::MaxOccupancyGridSize(s); - - if(s.log_level_ > 0) - { - std::cout << "Launching kernel: " << Kernel::GetName() << " with args:" << " grid: {" - << grids.x << ", " << grids.y << ", " << grids.z << "}" << ", blocks: {" - << blocks.x << ", " << blocks.y << ", " << blocks.z << "}" << std::endl; - } - - ave_time = - ck_tile::launch_kernel(s, - ck_tile::make_kernel( - Kernel{}, - grids, - blocks, - 0, - ck_tile::cast_pointer_to_constant_address_space(kargs_ptr), - num_groups)); - - return ave_time; - }; - - if(!splitk) - { - Run(ck_tile::integral_constant{}); - } - else - { - Run(ck_tile::integral_constant{}); - } - - return ave_time; -} - -#include "run_grouped_gemm_example.inc" - -constexpr bool Persistent = true; -int main(int argc, char* argv[]) { return !run_grouped_gemm_example(argc, argv); } diff --git a/include/ck/tensor_operation/gpu/element/element_wise_operation.hpp b/include/ck/tensor_operation/gpu/element/element_wise_operation.hpp index b57ae22172..089d4c2a9d 100644 --- a/include/ck/tensor_operation/gpu/element/element_wise_operation.hpp +++ b/include/ck/tensor_operation/gpu/element/element_wise_operation.hpp @@ -562,6 +562,58 @@ struct NormalizeInInfer double epsilon_; }; +// used by Conv+Bias+BatchNorm+Clamp inference +struct BiasNormalizeInInferClamp +{ + BiasNormalizeInInferClamp(float floor = 0.f, + float ceil = NumericLimits::Max(), + float epsilon = 1e-4) + : clamp_(floor, ceil), epsilon_(epsilon) + { + } + + template + __host__ __device__ constexpr void operator()(T& y, + const T& x, + const T& bias, + const T& mean, + const T& variance, + const T& gamma, + const T& beta) const + { + using ck::type_convert; + using ck::math::sqrt; + + float tmp_x = type_convert(x) + type_convert(bias); + + float tmp_y = + ((tmp_x - type_convert(mean)) / sqrt(type_convert(variance) + epsilon_)) * + type_convert(gamma) + + type_convert(beta); + clamp_(tmp_y, tmp_y); + y = type_convert(tmp_y); + }; + + template <> + __host__ __device__ constexpr void operator()(float& y, + const float& x, + const float& bias, + const float& mean, + const float& variance, + const float& gamma, + const float& beta) const + { + using ck::type_convert; + using ck::math::sqrt; + + float tmp_y = (((x + bias) - mean) / sqrt(variance + epsilon_)) * gamma + beta; + clamp_(y, tmp_y); + }; + + Clamp clamp_; + float epsilon_; +}; + template struct UnaryTypeConvert; diff --git a/include/ck_tile/core.hpp b/include/ck_tile/core.hpp index 188cebaabc..9f3c996873 100644 --- a/include/ck_tile/core.hpp +++ b/include/ck_tile/core.hpp @@ -27,6 +27,7 @@ #include "ck_tile/core/container/thread_buffer.hpp" #include "ck_tile/core/container/tuple.hpp" #include "ck_tile/core/numeric/bfloat16.hpp" +#include "ck_tile/core/numeric/e8m0.hpp" #include "ck_tile/core/numeric/float8.hpp" #include "ck_tile/core/numeric/half.hpp" #include "ck_tile/core/numeric/int8.hpp" @@ -74,6 +75,7 @@ #include "ck_tile/core/utility/literals.hpp" #include "ck_tile/core/utility/magic_div.hpp" #include "ck_tile/core/utility/philox_rand.hpp" +#include "ck_tile/core/utility/print.hpp" #include "ck_tile/core/utility/random.hpp" #include "ck_tile/core/utility/reduce_operator.hpp" #include "ck_tile/core/utility/static_counter.hpp" diff --git a/include/ck_tile/core/algorithm/coordinate_transform.hpp b/include/ck_tile/core/algorithm/coordinate_transform.hpp index f7f9489f4c..7511413bba 100644 --- a/include/ck_tile/core/algorithm/coordinate_transform.hpp +++ b/include/ck_tile/core/algorithm/coordinate_transform.hpp @@ -9,6 +9,7 @@ #include "ck_tile/core/utility/functional.hpp" #include "ck_tile/core/utility/type_traits.hpp" #include "ck_tile/core/utility/magic_div.hpp" +#include "ck_tile/core/utility/print.hpp" namespace ck_tile { @@ -139,20 +140,19 @@ struct pass_through : public base_transform<1, 1> { return make_tuple(low_vector_lengths, low_vector_strides); } - - CK_TILE_HOST_DEVICE void print() const - { - printf("pass_through{"); - - // - printf("up_lengths_:"); - print(up_lengths_); - - // - printf("}"); - } }; +template +CK_TILE_HOST_DEVICE static void print(const pass_through& pt) +{ + printf("pass_through{"); + + printf("up_lengths_: "); + print(pt.get_upper_lengths()); + + printf("}"); +} + template ck_tile::is_known_at_compile_time::value && ck_tile::is_known_at_compile_time::value; } - - CK_TILE_HOST_DEVICE void print() const - { - printf("pad{"); - - // - printf("up_lengths_: "); - print(up_lengths_); - printf(", "); - - // - printf("left_pad_length_: "); - print(left_pad_length_); - printf(", "); - - // - printf("right_pad_length_: "); - print(right_pad_length_); - - printf("}"); - } }; +template +CK_TILE_HOST_DEVICE static void +print(const pad& p) +{ + printf("pad{"); + printf("up_lengths_: "); + print(p.up_lengths_); + printf(", left_pad_length_: "); + print(p.left_pad_length_); + printf(", right_pad_length_: "); + print(p.right_pad_length_); + printf("}"); +} + template struct left_pad { @@ -330,24 +326,20 @@ struct left_pad // It's up to runtime to check the padding length should be multiple of vector length return make_tuple(low_vector_lengths, low_vector_strides); } - - CK_TILE_HOST_DEVICE void print() const - { - printf("left_pad{"); - - // - printf("up_lengths_: "); - print(up_lengths_); - printf(", "); - - // - printf("left_pad_length_: "); - print(left_pad_length_); - - printf("}"); - } }; +template +CK_TILE_HOST_DEVICE static void +print(const left_pad& lp) +{ + printf("left_pad{"); + printf("up_lengths_: "); + print(lp.up_lengths_); + printf(", left_pad_length_: "); + print(lp.left_pad_length_); + printf("}"); +} + template struct right_pad : public base_transform<1, 1> { @@ -430,24 +422,20 @@ struct right_pad : public base_transform<1, 1> // It's up to runtime to check the padding length should be multiple of vector length return make_tuple(low_vector_lengths, low_vector_strides); } - - CK_TILE_HOST_DEVICE void print() const - { - printf("right_pad{"); - - // - printf("up_lengths_: "); - print(up_lengths_); - printf(", "); - - // - printf("right_pad_length_: "); - print(right_pad_length_); - - printf("}"); - } }; +template +CK_TILE_HOST_DEVICE static void +print(const right_pad& rp) +{ + printf("right_pad{"); + printf("up_lengths_: "); + print(rp.up_lengths_); + printf(", right_pad_length_: "); + print(rp.right_pad_length_); + printf("}"); +} + // idx_low = coefficients[0, ...nDimUp-1] * idx_up[0, ...nDimUp-1] // UpLengths and Coefficients can be either of the followings: // 1) Tuple of index_t, which is known at run-time, or @@ -532,24 +520,19 @@ struct embed : public base_transform<1, UpLengths::size()> return ck_tile::is_known_at_compile_time::value && ck_tile::is_known_at_compile_time::value; } - - CK_TILE_HOST_DEVICE void print() const - { - printf("embed{"); - - // - printf("up_lengths_: "); - print(up_lengths_); - printf(", "); - - // - printf("coefficients_: "); - print(coefficients_); - - printf("}"); - } }; +template +CK_TILE_HOST_DEVICE static void print(const embed& e) +{ + printf("embed{"); + printf("up_lengths_: "); + print(e.up_lengths_); + printf(", coefficients_: "); + print(e.coefficients_); + printf("}"); +} + template struct lambda_merge_generate_MagicDivision_calculate_magic_divisor { @@ -699,24 +682,19 @@ struct merge_v2_magic_division : public base_transform return make_tuple(up_vector_lengths, up_vector_strides); } - - CK_TILE_HOST_DEVICE void print() const - { - printf("merge_v2_magic_division{"); - - // - printf("low_lengths_ "); - print(low_lengths_); - printf(", "); - - // - printf("up_lengths_ "); - print(up_lengths_); - - printf("}"); - } }; +template +CK_TILE_HOST_DEVICE static void print(const merge_v2_magic_division& m) +{ + printf("merge_v2_magic_division{"); + printf("low_lengths_: "); + print(m.low_lengths_); + printf(", up_lengths_: "); + print(m.up_lengths_); + printf("}"); +} + // Implementation of "merge" transformation primitive that uses division and mod. It is supposed to // be used for low_lengths that are known at compile time and are power of 2, otherwise performance // will be very bad @@ -830,29 +808,21 @@ struct merge_v3_division_mod : public base_transform return make_tuple(up_vector_lengths, up_vector_strides); } - - CK_TILE_HOST_DEVICE void print() const - { - printf("Merge_v3_direct_division_mod{"); - - // - printf("low_lengths_ "); - print(low_lengths_); - printf(", "); - - // - printf("low_lengths_scan_ "); - print(low_lengths_scan_); - printf(", "); - - // - printf("up_lengths_ "); - print(up_lengths_); - - printf("}"); - } }; +template +CK_TILE_HOST_DEVICE static void print(const merge_v3_division_mod& m) +{ + printf("merge_v3_division_mod{"); + printf("low_lengths_: "); + print(m.low_lengths_); + printf(", low_lengths_scan_: "); + print(m.low_lengths_scan_); + printf(", up_lengths_: "); + print(m.up_lengths_); + printf("}"); +} + template struct unmerge : public base_transform<1, UpLengths::size()> { @@ -958,24 +928,19 @@ struct unmerge : public base_transform<1, UpLengths::size()> return make_tuple(up_vector_lengths, up_vector_strides); } - - CK_TILE_HOST_DEVICE void print() const - { - printf("unmerge{"); - - // - printf("up_lengths_"); - print(up_lengths_); - printf(", "); - - // - printf("up_lengths_scan_"); - print(up_lengths_scan_); - - printf("}"); - } }; +template +CK_TILE_HOST_DEVICE static void print(const unmerge& u) +{ + printf("unmerge{"); + printf("up_lengths_: "); + print(u.up_lengths_); + printf(", up_lengths_scan_: "); + print(u.up_lengths_scan_); + printf("}"); +} + template struct freeze : public base_transform<1, 0> { @@ -1023,19 +988,17 @@ struct freeze : public base_transform<1, 0> { return ck_tile::is_known_at_compile_time::value; } - - CK_TILE_HOST_DEVICE void print() const - { - printf("freeze{"); - - // - printf("low_idx_: "); - print(low_idx_); - - printf("}"); - } }; +template +CK_TILE_HOST_DEVICE static void print(const freeze& f) +{ + printf("freeze{"); + printf("low_idx_: "); + print(f.low_idx_); + printf("}"); +} + // insert a dangling upper dimension without lower dimension template struct insert : public base_transform<0, 1> @@ -1092,18 +1055,17 @@ struct insert : public base_transform<0, 1> { return ck_tile::is_known_at_compile_time::value; } - - CK_TILE_HOST_DEVICE void print() const - { - printf("insert{"); - - // - print(up_lengths_); - - printf("}"); - } }; +template +CK_TILE_HOST_DEVICE static void print(const insert& i) +{ + printf("insert{"); + printf("up_lengths_: "); + print(i.up_lengths_); + printf("}"); +} + // replicate the original tensor and create a higher dimensional tensor template struct replicate : public base_transform<0, UpLengths::size()> @@ -1152,21 +1114,19 @@ struct replicate : public base_transform<0, UpLengths::size()> return ck_tile::is_known_at_compile_time::value; } - CK_TILE_HOST_DEVICE void print() const - { - printf("replicate{"); - - // - printf("up_lengths_: "); - print(up_lengths_); - - printf("}"); - } - // UpLengths up_lengths_; }; +template +CK_TILE_HOST_DEVICE static void print(const replicate& r) +{ + printf("replicate{"); + printf("up_lengths_: "); + print(r.up_lengths_); + printf("}"); +} + template struct slice : public base_transform<1, 1> { @@ -1238,28 +1198,20 @@ struct slice : public base_transform<1, 1> ck_tile::is_known_at_compile_time::value && ck_tile::is_known_at_compile_time::value; } +}; - CK_TILE_HOST_DEVICE void print() const - { - printf("slice{"); - - // - printf("up_lengths_: "); - print(up_lengths_); - printf(", "); - - // - printf("slice_begin_: "); - print(slice_begin_); - printf(", "); - - // - printf("slice_end_: "); - print(slice_end_); - - printf("}"); - } // namespace ck -}; // namespace ck +template +CK_TILE_HOST_DEVICE static void print(const slice& s) +{ + printf("slice{"); + printf("up_lengths_: "); + print(s.up_lengths_); + printf(", slice_begin_: "); + print(s.slice_begin_); + printf(", slice_end_: "); + print(s.slice_end_); + printf("}"); +} /* * \brief lower_idx = upper_idx % modulus. @@ -1328,19 +1280,19 @@ struct modulo : public base_transform<1, 1> { return ck_tile::is_known_at_compile_time::value; } - - CK_TILE_HOST_DEVICE void print() const - { - printf("Modulus{"); - - // - printf("up_lengths_: "); - print(up_lengths_); - - printf("}"); - } }; +template +CK_TILE_HOST_DEVICE static void print(const modulo& m) +{ + printf("modulo{"); + printf("modulus_: "); + print(m.modulus_); + printf(", up_lengths_: "); + print(m.up_lengths_); + printf("}"); +} + // 2D XOR, NOTE: "xor" is a keyword template struct xor_t : public base_transform<2, 2> @@ -1424,20 +1376,17 @@ struct xor_t : public base_transform<2, 2> return make_tuple(up_vector_lengths, up_vector_strides); } - - CK_TILE_HOST_DEVICE void print() const - { - printf("xor_t{"); - - // - printf("up_lengths_: "); - print(up_lengths_); - printf(", "); - - printf("}"); - } }; +template +CK_TILE_HOST_DEVICE static void print(const xor_t& x) +{ + printf("xor_t{"); + printf("up_lengths_: "); + print(x.up_lengths_); + printf("}"); +} + template struct offset : public base_transform<1, 1> { @@ -1509,24 +1458,19 @@ struct offset : public base_transform<1, 1> return ck_tile::is_known_at_compile_time::value && ck_tile::is_known_at_compile_time::value; } - - CK_TILE_HOST_DEVICE void print() const - { - printf("offset{"); - - // - printf("up_lengths_: "); - print(up_lengths_); - printf(", "); - - // - printf("offset_length_: "); - print(offset_length_); - - printf("}"); - } }; +template +CK_TILE_HOST_DEVICE static void print(const offset& o) +{ + printf("offset{"); + printf("up_lengths_: "); + print(o.up_lengths_); + printf(", offset_length_: "); + print(o.offset_length_); + printf("}"); +} + template struct indexing : public base_transform<1, 1> { @@ -1595,20 +1539,19 @@ struct indexing : public base_transform<1, 1> return ck_tile::is_known_at_compile_time::value && IndexingAdaptor::is_known_at_compile_time(); } - - CK_TILE_HOST_DEVICE void print() const - { - printf("embed{"); - - // - printf("up_lengths_: "); - print(up_lengths_); - printf(", "); - - printf("}"); - } }; +template +CK_TILE_HOST_DEVICE static void print(const indexing& i) +{ + printf("indexing{"); + printf("up_lengths_: "); + print(i.up_lengths_); + printf(", iadaptor_: "); + print(i.iadaptor_); + printf("}"); +} + //******************************************************************************************************* template diff --git a/include/ck_tile/core/algorithm/static_encoding_pattern.hpp b/include/ck_tile/core/algorithm/static_encoding_pattern.hpp index 8a3de3e5e0..1f6c389090 100644 --- a/include/ck_tile/core/algorithm/static_encoding_pattern.hpp +++ b/include/ck_tile/core/algorithm/static_encoding_pattern.hpp @@ -77,6 +77,7 @@ #include "ck_tile/core/numeric/integer.hpp" #include "ck_tile/core/tensor/tile_distribution.hpp" #include "ck_tile/core/tensor/tile_distribution_encoding.hpp" +#include "ck_tile/core/utility/print.hpp" namespace ck_tile { @@ -317,4 +318,51 @@ struct TileDistributionEncodingPattern2D +CK_TILE_HOST_DEVICE void print(const TileDistributionEncodingPattern2D&) +{ + using PatternType = TileDistributionEncodingPattern2D; + + printf("TileDistributionEncodingPattern2D: ", + BlockSize, + YPerTile, + XPerTile, + VecSize, + tile_distribution_pattern_to_string(DistributionPattern)); + printf("{: <%d, %d, %d>, : <%d, %d>}\n", + PatternType::Y0, + PatternType::Y1, + PatternType::Y2, + PatternType::X0, + PatternType::X1); +} + } // namespace ck_tile diff --git a/include/ck_tile/core/arch/amd_buffer_addressing.hpp b/include/ck_tile/core/arch/amd_buffer_addressing.hpp index 29cc3fefe5..35da19cd3e 100644 --- a/include/ck_tile/core/arch/amd_buffer_addressing.hpp +++ b/include/ck_tile/core/arch/amd_buffer_addressing.hpp @@ -2754,54 +2754,6 @@ CK_TILE_DEVICE void amd_buffer_atomic_max(const thread_buffer& src_thread_ #endif } -template -CK_TILE_DEVICE void amd_direct_load_global_to_lds(const T* global_base_ptr, - const index_t global_offset, - T* lds_base_ptr, - const index_t lds_offset, - const bool is_valid, - const index_t src_element_space_size) -{ - const uint32_t* global_ptr = - reinterpret_cast(reinterpret_cast(global_base_ptr)); - const int32x4_t src_resource = - make_wave_buffer_resource(global_ptr, src_element_space_size * sizeof(T)); - const index_t global_offset_bytes = is_valid ? global_offset * sizeof(T) : 0x80000000; - -#if CK_TILE_USE_AMD_LDS_DIRECT_LOAD_INLINE_ASM - T* lds_ptr = lds_base_ptr + lds_offset; - auto const lds_ptr_sgpr = - __builtin_amdgcn_readfirstlane((reinterpret_cast(lds_ptr))); - asm volatile("s_mov_b32 m0, %0; \n\t" - "buffer_load_dword %1, %2, 0 offen lds;\n\t" ::"s"(lds_ptr_sgpr), - "v"(global_offset_bytes), - "s"(src_resource) - : "memory"); -#else - // Direct loads require that each thread reads and writes exactly a single DWORD. -#if defined(__gfx9__) - constexpr auto bytes_per_thread = sizeof(T) * NumElemsPerThread; -#endif - // Direct loads require that each thread reads and writes a multiple of DWORDs (4 bytes). - // For gfx950: supports 1, 3, or 4 DWORDs per thread - // For gfx942: supports exactly 1 DWORD per thread -#if defined(__gfx950__) - constexpr auto dword_bytes = 4; - static_assert(bytes_per_thread == dword_bytes || bytes_per_thread == dword_bytes * 3 || - bytes_per_thread == dword_bytes * 4); -#elif defined(__gfx9__) - constexpr auto dword_bytes = 4; - static_assert(bytes_per_thread == dword_bytes); -#endif - // LDS pointer must be attributed with the LDS address space. - as3_uint32_ptr lds_ptr = - reinterpret_cast(reinterpret_cast(lds_base_ptr + lds_offset)); - - llvm_amdgcn_raw_buffer_load_lds( - src_resource, lds_ptr, bytes_per_thread, global_offset_bytes, 0, 0, 0); -#endif -} - #if defined(__gfx950__) template __device__ auto amd_transpose_load_to_vgpr(const T* in_ptr) diff --git a/include/ck_tile/core/arch/arch.hpp b/include/ck_tile/core/arch/arch.hpp index 27c99e0cda..5946ed54af 100644 --- a/include/ck_tile/core/arch/arch.hpp +++ b/include/ck_tile/core/arch/arch.hpp @@ -178,22 +178,6 @@ CK_TILE_DEVICE void s_waitcnt_barrier() __builtin_amdgcn_s_barrier(); } -CK_TILE_DEVICE void block_sync_lds_direct_load() -{ -#if 1 - // invoke clang builtins which *should* produce the same result as the inline asm below - // difference: inline asm is being compiled to wait vmcnt(0) after the barrier - s_waitcnt_barrier<0, waitcnt_arg::kMaxExpCnt, 0>(); -#else - // same content as in old CK (#999) - asm volatile("\ - s_waitcnt vmcnt(0) \n \ - s_waitcnt lgkmcnt(0) \n \ - s_barrier \ - " ::); -#endif -} - CK_TILE_DEVICE void s_nop(index_t cnt = 0) { #if 1 @@ -238,6 +222,21 @@ CK_TILE_HOST_DEVICE constexpr index_t get_smem_capacity() #endif } +/// Helper function to convert address space enum to string +CK_TILE_HOST_DEVICE constexpr const char* address_space_to_string(address_space_enum addr_space) +{ + switch(addr_space) + { + case address_space_enum::generic: return "generic"; + case address_space_enum::global: return "global"; + case address_space_enum::lds: return "lds"; + case address_space_enum::sgpr: return "sgpr"; + case address_space_enum::constant: return "constant"; + case address_space_enum::vgpr: return "vgpr"; + default: return "unknown"; + } +} + // Architecture tags struct gfx11_t { @@ -254,5 +253,4 @@ CK_TILE_DEVICE static constexpr auto get_device_arch() return gfx12_t{}; #endif } - } // namespace ck_tile diff --git a/include/ck_tile/core/container/array.hpp b/include/ck_tile/core/container/array.hpp index 94aa40e278..352c645325 100644 --- a/include/ck_tile/core/container/array.hpp +++ b/include/ck_tile/core/container/array.hpp @@ -177,9 +177,27 @@ struct array CK_TILE_HOST_DEVICE constexpr array() {} CK_TILE_HOST_DEVICE static constexpr index_t size() { return 0; } CK_TILE_HOST_DEVICE static constexpr bool is_static() { return is_static_v; }; - CK_TILE_HOST_DEVICE void print() const { printf("array{size: 0, data: []}"); } }; +template +CK_TILE_HOST_DEVICE static void print(const array& a) +{ + printf("array{size: %ld, data: [", static_cast(N)); + for(index_t i = 0; i < N; ++i) + { + if(i > 0) + printf(", "); + print(a[i]); + } + printf("]}"); +} + +template +CK_TILE_HOST_DEVICE static void print(const array&) +{ + printf("array{size: 0, data: []}"); +} + template struct vector_traits; diff --git a/include/ck_tile/core/container/map.hpp b/include/ck_tile/core/container/map.hpp index 87b180cafc..7697995c92 100644 --- a/include/ck_tile/core/container/map.hpp +++ b/include/ck_tile/core/container/map.hpp @@ -139,26 +139,21 @@ struct map // WARNING: needed by compiler for C++ range-based for loop only, don't use this function! CK_TILE_HOST_DEVICE constexpr iterator end() { return iterator{impl_, size_}; } - - CK_TILE_HOST_DEVICE void print() const - { - printf("map{size_: %d, ", size_); - // - printf("impl_: ["); - // - for(const auto& [k, d] : *this) - { - printf("{key: "); - print(k); - printf(", data: "); - print(d); - printf("}, "); - } - // - printf("]"); - // - printf("}"); - } }; +template +CK_TILE_HOST_DEVICE static void print(const map& m) +{ + printf("map{size_: %d, impl_: [", m.size_); + for(const auto& [k, d] : m) + { + printf("{key: "); + print(k); + printf(", data: "); + print(d); + printf("}, "); + } + printf("]}"); +} + } // namespace ck_tile diff --git a/include/ck_tile/core/container/sequence.hpp b/include/ck_tile/core/container/sequence.hpp index 94309dd5dd..905b32dd15 100644 --- a/include/ck_tile/core/container/sequence.hpp +++ b/include/ck_tile/core/container/sequence.hpp @@ -9,13 +9,10 @@ #include "ck_tile/core/numeric/math.hpp" #include "ck_tile/core/utility/to_sequence.hpp" #include "ck_tile/core/utility/type_traits.hpp" -#include "ck_tile/core/utility/functional.hpp" +#include "ck_tile/core/utility/print.hpp" namespace ck_tile { -template -struct static_for; - template struct sequence; @@ -196,15 +193,24 @@ struct sequence { return sequence{}; } - - CK_TILE_HOST_DEVICE static void print() - { - printf("sequence{size: %d, data: [", size()); - ((printf("%d ", Is)), ...); - printf("]}"); - } }; +template +CK_TILE_HOST_DEVICE static void print(const sequence&) +{ + printf("sequence<"); + if constexpr(sizeof...(Is) > 0) + { + bool first = true; + (([&first](index_t value) { + printf("%s%d", first ? "" : ", ", value); + first = false; + }(Is)), + ...); + } + printf(">"); +} + namespace impl { template struct __integer_sequence; diff --git a/include/ck_tile/core/container/thread_buffer.hpp b/include/ck_tile/core/container/thread_buffer.hpp index 77c46e1b8c..d67581e7d2 100644 --- a/include/ck_tile/core/container/thread_buffer.hpp +++ b/include/ck_tile/core/container/thread_buffer.hpp @@ -42,7 +42,11 @@ struct thread_buffer { // TODO: this ctor can't ignore CK_TILE_HOST_DEVICE constexpr thread_buffer() : data{} {} - CK_TILE_HOST_DEVICE constexpr thread_buffer(const value_type & o) : data{o} {} + CK_TILE_HOST_DEVICE constexpr thread_buffer(const value_type & o) : data{} { + static_for<0, N, 1>{}( + [&](auto i) { data[i] = o; } + ); + } CK_TILE_HOST_DEVICE static constexpr auto size() { return N; } CK_TILE_HOST_DEVICE auto & get() {return data; } diff --git a/include/ck_tile/core/container/tuple.hpp b/include/ck_tile/core/container/tuple.hpp index 63d145d8b9..4c48b3d477 100644 --- a/include/ck_tile/core/container/tuple.hpp +++ b/include/ck_tile/core/container/tuple.hpp @@ -300,12 +300,29 @@ struct tuple : impl::tuple_base, T...> #undef TP_COM_ }; -template +template +CK_TILE_HOST_DEVICE void print(const tuple& t) +{ + printf("tuple<"); + if constexpr(sizeof...(T) > 0) + { + bool first = true; + static_for<0, sizeof...(T), 1>{}([&t, &first](auto i) { + if(!first) + printf(", "); + print(t.get(i)); + first = false; + }); + } + printf(">"); +} + +template struct vector_traits; // specialization for array template -struct vector_traits> +struct vector_traits, void> { using scalar_type = __type_pack_element<0, T...>; static constexpr index_t vector_size = sizeof...(T); diff --git a/include/ck_tile/core/numeric/e8m0.hpp b/include/ck_tile/core/numeric/e8m0.hpp new file mode 100644 index 0000000000..ea94880f27 --- /dev/null +++ b/include/ck_tile/core/numeric/e8m0.hpp @@ -0,0 +1,102 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +#include "ck_tile/core/config.hpp" +#include "ck_tile/core/numeric/mxfp_convert.hpp" + +namespace ck_tile { + +/** + * @brief Unsigned representation of a conventional biased Float32 exponent. + * + * bias = 127; + * + * E8M0_1 = 0b01111111; => 2^(127-127) = 1 + * E8M0_2 = 0b10000000; => 2^(128-127) = 2^1 = 2 + * E8M0_3 = 0b10000010; => 2^(130-127) = 2^3 = 8 + * E8M0_135 = 0b10000111; => 2^(135-127) = 2^8 = 256 + * E8M0_142 = 0b10001110; => 2^(142-127) = 2^15 = 32768 + * E8M0_MIN = 0b00000000; => 2^-127 + * E8M0_MAX = 0b11111110; => 2^127 + * E8M0_NAN = 0b11111111; => NaN + */ + +struct e8m0_bexp_t +{ + using raw_type = uint8_t; + using type = raw_type; + + raw_type data; + + CK_TILE_HOST_DEVICE constexpr e8m0_bexp_t() : data{type{0b11111111}} {} + CK_TILE_HOST_DEVICE explicit constexpr e8m0_bexp_t(type init) : data{init} {} + CK_TILE_HOST_DEVICE explicit constexpr e8m0_bexp_t(float scale) + : e8m0_bexp_t(static_cast(numeric_utils::get_exponent(scale))) + { + } + CK_TILE_HOST_DEVICE constexpr operator type() const { return data; } + CK_TILE_HOST_DEVICE constexpr raw_type& get() { return data; } + CK_TILE_HOST_DEVICE constexpr raw_type get() const { return data; } + CK_TILE_HOST_DEVICE constexpr operator float() const; + + constexpr bool operator==(const e8m0_bexp_t& other) const { return data == other.data; } + + constexpr bool operator!=(const e8m0_bexp_t& other) const { return data != other.data; } +}; + +using e8m0_t = e8m0_bexp_t; +using e8m0_raw_t = typename e8m0_t::raw_type; + +template <> +struct numeric_traits +{ + using bitwise_type = e8m0_raw_t; + + static constexpr int exp = 8; + static constexpr int mant = 0; + static constexpr int bias = 127; + static constexpr int PackedSize = 1; +}; + +// limits +template +struct numeric; + +template <> +struct numeric +{ + static constexpr e8m0_raw_t binary_min = 0b00000000; // 2^-127 + static constexpr e8m0_raw_t binary_max = 0b11111110; // 2^127 + static constexpr e8m0_raw_t binary_nan = 0b11111111; + CK_TILE_HOST_DEVICE static constexpr e8m0_t min() { return e8m0_t{binary_min}; } + CK_TILE_HOST_DEVICE static constexpr e8m0_t max() { return e8m0_t{binary_max}; } + CK_TILE_HOST_DEVICE static constexpr e8m0_t quiet_NaN() { return e8m0_t{binary_nan}; } + CK_TILE_HOST_DEVICE static constexpr e8m0_t signaling_NaN() { return e8m0_t{binary_nan}; } + CK_TILE_HOST_DEVICE static constexpr bool has_inf() { return false; } + + CK_TILE_HOST_DEVICE static constexpr e8m0_t epsilon() { return signaling_NaN(); } + CK_TILE_HOST_DEVICE static constexpr e8m0_t round_error() { return signaling_NaN(); } + CK_TILE_HOST_DEVICE static constexpr e8m0_t zero() { return signaling_NaN(); } + CK_TILE_HOST_DEVICE static constexpr e8m0_t infinity() { return signaling_NaN(); } +}; + +CK_TILE_HOST_DEVICE constexpr e8m0_bexp_t::operator float() const +{ + using traits = numeric_traits; + if(data == numeric::binary_nan) + { + return traits::NaN; + } + else if(data == 0) + { + return std::numeric_limits::min(); + } + else + { + return bit_cast(static_cast(data) << traits::mant); + } +} + +} // namespace ck_tile diff --git a/include/ck_tile/core/numeric/integral_constant.hpp b/include/ck_tile/core/numeric/integral_constant.hpp index 33c24da8c5..2ba2fd10c6 100644 --- a/include/ck_tile/core/numeric/integral_constant.hpp +++ b/include/ck_tile/core/numeric/integral_constant.hpp @@ -19,14 +19,18 @@ struct constant CK_TILE_HOST_DEVICE static constexpr bool is_static() { return true; } }; +template +CK_TILE_HOST_DEVICE static void print(const constant&) +{ + printf("%ld", static_cast(v)); +} + template struct integral_constant : constant { using value_type = T; using type = integral_constant; // using injected-class-name static constexpr T value = v; - // constexpr CK_TILE_HOST_DEVICE operator value_type() const noexcept { return value; } - // constexpr CK_TILE_HOST_DEVICE value_type operator()() const noexcept { return value; } // }; template diff --git a/include/ck_tile/core/numeric/mxfp_convert.hpp b/include/ck_tile/core/numeric/mxfp_convert.hpp index b2e138e880..9b378933d0 100644 --- a/include/ck_tile/core/numeric/mxfp_convert.hpp +++ b/include/ck_tile/core/numeric/mxfp_convert.hpp @@ -12,15 +12,19 @@ struct numeric_utils : numeric_traits using traits = numeric_traits; using _numeric = numeric; - using raw_type = typename T::raw_type; + using raw_type = typename traits::bitwise_type; static constexpr int exp_mask = (1 << traits::exp) - 1; - static constexpr int get_exponent(raw_type x) + static constexpr raw_type get_exponent(raw_type x) { // TODO: check if repeated calls are optimized. return (x >> traits::mant) & exp_mask; } + static constexpr raw_type get_exponent(const T& x) + { + return get_exponent(bit_cast(x)); + } static constexpr bool is_positive(raw_type x) { return (x >> (traits::exp + traits::mant)) == _numeric::binary_zero; @@ -33,7 +37,7 @@ struct numeric_utils : numeric_traits static constexpr double get_mantissa(raw_type x) { double mantissa = is_subnormal(x) ? 0.0f : 1.0f; - for(uint32_t i = 0; i < traits::mant; ++i) + for(raw_type i = 0; i < traits::mant; ++i) { mantissa += std::ldexp(static_cast(x & 0b1), -(traits::mant - i)); x >>= 1; @@ -43,22 +47,23 @@ struct numeric_utils : numeric_traits }; template -CK_TILE_HOST_DEVICE float convert_to_float(typename T::raw_type data, int scale_exp = 127) +CK_TILE_HOST_DEVICE float convert_to_float(typename T::raw_type data, float scale = 1.f) { - using utils = numeric_utils; - static constexpr int e8m0_bias = 127; // TODO: make it generic. - float sign = utils::is_positive(data) ? 1.0 : -1.0; - int exp = (utils::is_subnormal(data) ? 1 : utils::get_exponent(data)) - utils::bias; - float mant = utils::get_mantissa(data); + using utils = numeric_utils; + float sign = utils::is_positive(data) ? 1.0 : -1.0; + int exp = (utils::is_subnormal(data) ? 1 : utils::get_exponent(data)) - utils::bias; + float mant = utils::get_mantissa(data); - return std::ldexp(sign * mant, exp + scale_exp - e8m0_bias); + return std::ldexp(sign * mant * scale, exp); } template -CK_TILE_HOST_DEVICE typename T::raw_type convert_to_type(float value) +CK_TILE_HOST_DEVICE typename T::raw_type convert_to_type(float value, float scale = 1.f) { using bitwise_type = typename numeric_traits::bitwise_type; + value /= scale; + if(std::abs(value) > float(numeric::max())) { float max_value = numeric::max(); diff --git a/include/ck_tile/core/numeric/pk_fp4.hpp b/include/ck_tile/core/numeric/pk_fp4.hpp index 0dee750b69..a345cd1b75 100644 --- a/include/ck_tile/core/numeric/pk_fp4.hpp +++ b/include/ck_tile/core/numeric/pk_fp4.hpp @@ -23,14 +23,11 @@ using fp32x2_t = float __attribute__((ext_vector_type(2))); using fp16x2_t = _Float16 __attribute__((ext_vector_type(2))); using bf16x2_t = bf16_raw_t __attribute__((ext_vector_type(2))); -CK_TILE_HOST_DEVICE constexpr uint8_t float_to_e2m1(float); +CK_TILE_HOST_DEVICE constexpr uint8_t float_to_e2m1(float x, float scale = 1.f); // TODO: Add stochastic method struct pk_float4_e2m1_t { - static constexpr int exponent = 2; - static constexpr int mantissa = 1; - static constexpr int bias = 1; // TODO: Can we merge raw_type and type? using raw_type = uint8_t; using type = raw_type; @@ -41,18 +38,27 @@ struct pk_float4_e2m1_t CK_TILE_HOST_DEVICE constexpr pk_float4_e2m1_t(T init) : data{static_cast(init)} { } - CK_TILE_HOST_DEVICE explicit constexpr pk_float4_e2m1_t(float init) : data{float_to_e2m1(init)} + CK_TILE_HOST_DEVICE explicit constexpr pk_float4_e2m1_t(float init, float scale = 1.f) + : data{float_to_e2m1(init, scale)} { } CK_TILE_HOST_DEVICE constexpr operator type() const { return data; } CK_TILE_HOST_DEVICE constexpr raw_type& get() { return data; } CK_TILE_HOST_DEVICE constexpr raw_type get() const { return data; } - CK_TILE_HOST_DEVICE constexpr operator float() const; - CK_TILE_HOST_DEVICE constexpr operator fp32x2_t() const; - CK_TILE_HOST_DEVICE constexpr operator fp16_t() const; - CK_TILE_HOST_DEVICE constexpr operator fp16x2_t() const; - CK_TILE_HOST_DEVICE constexpr operator bf16_t() const; - CK_TILE_HOST_DEVICE constexpr operator bf16x2_t() const; + + CK_TILE_HOST_DEVICE constexpr float to_float(float scale = 1.f) const; + CK_TILE_HOST_DEVICE constexpr fp32x2_t to_fp32x2(float scale = 1.f) const; + CK_TILE_HOST_DEVICE constexpr fp16_t to_fp16(float scale = 1.f) const; + CK_TILE_HOST_DEVICE constexpr fp16x2_t to_fp16x2(float scale = 1.f) const; + CK_TILE_HOST_DEVICE constexpr bf16_t to_bf16(float scale = 1.f) const; + CK_TILE_HOST_DEVICE constexpr bf16x2_t to_bf16x2(float scale = 1.f) const; + + CK_TILE_HOST_DEVICE constexpr operator float() const { return to_float(); } + CK_TILE_HOST_DEVICE constexpr operator fp32x2_t() const { return to_fp32x2(); } + CK_TILE_HOST_DEVICE constexpr operator fp16_t() const { return to_fp16(); } + CK_TILE_HOST_DEVICE constexpr operator fp16x2_t() const { return to_fp16x2(); } + CK_TILE_HOST_DEVICE constexpr operator bf16_t() const { return to_bf16(); } + CK_TILE_HOST_DEVICE constexpr operator bf16x2_t() const { return to_bf16x2(); } template CK_TILE_HOST_DEVICE constexpr raw_type unpack(number) const; @@ -191,131 +197,160 @@ CK_TILE_DEVICE pk_fp4_raw_t _to_f4(T src, float scale = 1.0f) } // namespace impl #endif -CK_TILE_HOST_DEVICE constexpr pk_fp4_t::operator bf16_t() const +CK_TILE_HOST_DEVICE constexpr bf16_t pk_fp4_t::to_bf16(float scale) const { #if CK_TILE_FP4_CVT_DEVICE - return impl::_from_f4(data); + return impl::_from_f4(data, scale); #else - return bf16_t{type_convert(convert_to_float(unpack(number<0>{})))}; + return bf16_t{type_convert(convert_to_float(unpack(number<0>{}), scale))}; #endif } -CK_TILE_HOST_DEVICE constexpr pk_fp4_t::operator bf16x2_t() const + +CK_TILE_HOST_DEVICE constexpr bf16x2_t pk_fp4_t::to_bf16x2(float scale) const { #if CK_TILE_FP4_CVT_DEVICE - return impl::_from_f4(data); + return impl::_from_f4(data, scale); #else - return bf16x2_t{type_convert(convert_to_float(unpack(number<0>{}))), - type_convert(convert_to_float(unpack(number<1>{})))}; + return bf16x2_t{type_convert(convert_to_float(unpack(number<0>{}), scale)), + type_convert(convert_to_float(unpack(number<1>{}), scale))}; #endif } // TODO: make float_to_e2m1 generic so that we can convert from directrly. -CK_TILE_HOST_DEVICE constexpr pk_fp4_raw_t float_to_e2m1(float x) +CK_TILE_HOST_DEVICE constexpr pk_fp4_raw_t float_to_e2m1(float x, float scale) { #if CK_TILE_FP4_CVT_DEVICE - return impl::_to_f4(x); + return impl::_to_f4(x, scale); #else - return convert_to_type(x); + return convert_to_type(x, scale); #endif } -CK_TILE_HOST_DEVICE constexpr fp32x2_t pk_fp4_to_fp32x2(const pk_fp4_t& x) { return fp32x2_t(x); } -CK_TILE_HOST_DEVICE constexpr fp16x2_t pk_fp4_to_fp16x2(const pk_fp4_t& x) { return fp16x2_t(x); } -CK_TILE_HOST_DEVICE constexpr bf16x2_t pk_fp4_to_bf16x2(const pk_fp4_t& x) { return bf16x2_t(x); } -CK_TILE_HOST_DEVICE constexpr pk_fp4_t float_to_pk_fp4(const float& x) { return float_to_e2m1(x); } -CK_TILE_HOST_DEVICE constexpr pk_fp4_t fp16_to_pk_fp4(const fp16_t& x) +CK_TILE_HOST_DEVICE constexpr pk_fp4_t float_to_pk_fp4(const float& x, float scale) +{ + return float_to_e2m1(x, scale); +} +CK_TILE_HOST_DEVICE constexpr pk_fp4_t fp16_to_pk_fp4(const fp16_t& x, float scale) { #if CK_TILE_FP4_CVT_DEVICE - return impl::_to_f4(x); + return impl::_to_f4(x, scale); #else - return float_to_e2m1(type_convert(x)); + return float_to_e2m1(type_convert(x), scale); #endif } -CK_TILE_HOST_DEVICE constexpr pk_fp4_t bf16_to_pk_fp4(const bf16_t& x) +CK_TILE_HOST_DEVICE constexpr pk_fp4_t bf16_to_pk_fp4(const bf16_t& x, float scale) { #if CK_TILE_FP4_CVT_DEVICE - return impl::_to_f4(x); + return impl::_to_f4(x, scale); #else - return float_to_e2m1(type_convert(x)); + return float_to_e2m1(type_convert(x), scale); #endif } -CK_TILE_HOST_DEVICE constexpr pk_fp4_t fp16x2_to_pk_fp4(const fp16x2_t& x) +CK_TILE_HOST_DEVICE constexpr pk_fp4_t fp16x2_to_pk_fp4(const fp16x2_t& x, float scale) { #if CK_TILE_FP4_CVT_DEVICE - return impl::_to_f4(x); + return impl::_to_f4(x, scale); #else - return pk_fp4_t::pack(float_to_e2m1(type_convert(x[0])), - float_to_e2m1(type_convert(x[1]))); + return pk_fp4_t::pack(float_to_e2m1(type_convert(x[0]), scale), + float_to_e2m1(type_convert(x[1]), scale)); #endif } -CK_TILE_HOST_DEVICE constexpr pk_fp4_t bf16x2_to_pk_fp4(const bf16x2_t& x) +CK_TILE_HOST_DEVICE constexpr pk_fp4_t bf16x2_to_pk_fp4(const bf16x2_t& x, float scale) { #if CK_TILE_FP4_CVT_DEVICE - return impl::_to_f4(x); + return impl::_to_f4(x, scale); #else - return pk_fp4_t::pack(float_to_e2m1(type_convert(x[0])), - float_to_e2m1(type_convert(x[1]))); + return pk_fp4_t::pack(float_to_e2m1(type_convert(x[0]), scale), + float_to_e2m1(type_convert(x[1]), scale)); #endif } -CK_TILE_HOST_DEVICE constexpr pk_fp4_t fp32x2_to_pk_fp4(const fp32x2_t& x) +CK_TILE_HOST_DEVICE constexpr pk_fp4_t fp32x2_to_pk_fp4(const fp32x2_t& x, float scale) { #if CK_TILE_FP4_CVT_DEVICE - return impl::_to_f4(x); + return impl::_to_f4(x, scale); #else - return pk_fp4_t::pack(float_to_e2m1(x[0]), float_to_e2m1(x[1])); + return pk_fp4_t::pack(float_to_e2m1(x[0], scale), float_to_e2m1(x[1], scale)); #endif } +CK_TILE_HOST_DEVICE constexpr fp32x2_t pk_fp4_to_fp32x2(const pk_fp4_t& x, float scale) +{ + return x.to_fp32x2(scale); +} +CK_TILE_HOST_DEVICE constexpr fp16x2_t pk_fp4_to_fp16x2(const pk_fp4_t& x, float scale) +{ + return x.to_fp16x2(scale); +} +CK_TILE_HOST_DEVICE constexpr bf16x2_t pk_fp4_to_bf16x2(const pk_fp4_t& x, float scale) +{ + return x.to_bf16x2(scale); +} +CK_TILE_HOST_DEVICE constexpr float pk_fp4_to_float(const pk_fp4_t& x, float scale) +{ + return x.to_float(scale); +} +CK_TILE_HOST_DEVICE constexpr fp16_t pk_fp4_to_fp16(const pk_fp4_t& x, float scale) +{ + return x.to_fp16(scale); +} +CK_TILE_HOST_DEVICE constexpr bf16_t pk_fp4_to_bf16(const pk_fp4_t& x, float scale) +{ + return x.to_bf16(scale); +} + #if TEST_convert_with_table == 0 -CK_TILE_HOST_DEVICE constexpr pk_fp4_t::operator float() const +CK_TILE_HOST_DEVICE constexpr float pk_fp4_t::to_float(float scale) const { #if CK_TILE_FP4_CVT_DEVICE - return impl::_from_f4(data); + return impl::_from_f4(data, scale); #else - return convert_to_float(unpack(number<0>{})); + return convert_to_float(unpack(number<0>{}), scale); #endif } -CK_TILE_HOST_DEVICE constexpr pk_fp4_t::operator fp32x2_t() const +CK_TILE_HOST_DEVICE constexpr fp32x2_t pk_fp4_t::to_fp32x2(float scale) const { #if CK_TILE_FP4_CVT_DEVICE - return impl::_from_f4(data); + return impl::_from_f4(data, scale); #else - return fp32x2_t{convert_to_float(unpack(number<0>{})), - convert_to_float(unpack(number<1>{}))}; + return fp32x2_t{convert_to_float(unpack(number<0>{}), scale), + convert_to_float(unpack(number<1>{}), scale)}; #endif } -CK_TILE_HOST_DEVICE constexpr pk_fp4_t::operator fp16_t() const + +CK_TILE_HOST_DEVICE constexpr fp16_t pk_fp4_t::to_fp16(float scale) const { #if CK_TILE_FP4_CVT_DEVICE - return impl::_from_f4(data); + return impl::_from_f4(data, scale); #else - return fp16_t{type_convert(convert_to_float(unpack(number<0>{})))}; + return fp16_t{type_convert(convert_to_float(unpack(number<0>{}), scale))}; #endif } -CK_TILE_HOST_DEVICE constexpr pk_fp4_t::operator fp16x2_t() const +CK_TILE_HOST_DEVICE constexpr fp16x2_t pk_fp4_t::to_fp16x2(float scale) const { #if CK_TILE_FP4_CVT_DEVICE - return impl::_from_f4(data); + return impl::_from_f4(data, scale); #else - return fp16x2_t{type_convert(convert_to_float(unpack(number<0>{}))), - type_convert(convert_to_float(unpack(number<1>{})))}; + return fp16x2_t{type_convert(convert_to_float(unpack(number<0>{}), scale)), + type_convert(convert_to_float(unpack(number<1>{}), scale))}; #endif } #else -CK_TILE_HOST_DEVICE constexpr pk_fp4_t::operator float() const +CK_TILE_HOST_DEVICE constexpr float pk_fp4_t::to_float(float scale) const { - return e2m1_to_fp32_table[data & 0xf]; + return e2m1_to_fp32_table[unpack(number<0>{})] * scale; } -CK_TILE_HOST_DEVICE constexpr pk_fp4_t::operator fp32x2_t() const +CK_TILE_HOST_DEVICE constexpr fp32x2_t pk_fp4_t::to_fp32x2(float scale) const { - return fp32x2_t{e2m1_to_fp32_table[data & 0xf], e2m1_to_fp32_table[data >> 4]}; + return fp32x2_t{e2m1_to_fp32_table[unpack(number<0>{})] * scale, e2m1_to_fp32_table[unpack(number<1>{}] * scale}; } -CK_TILE_HOST_DEVICE constexpr pk_fp4_t::operator fp16_t() const +CK_TILE_HOST_DEVICE constexpr fp16_t pk_fp4_t::to_fp16(float scale) const { - return e2m1_to_fp16_table[data & 0xf]; + return type_convert(e2m1_to_fp16_table[unpack(number<0>{})]) * scale; } -CK_TILE_HOST_DEVICE constexpr pk_fp4_t::operator fp16x2_t() const +CK_TILE_HOST_DEVICE constexpr fp16x2_t pk_fp4_t::to_fp16x2(float scale) const { - return fp16x2_t{e2m1_to_fp16_table[data & 0xf], e2m1_to_fp16_table[data >> 4]}; + return fp16x2_t{ + type_convert(type_convert(e2m1_to_fp16_table[unpack(number<0>{})]) * scale), + type_convert(type_convert(e2m1_to_fp16_table[unpack(number<1>{})]) * scale)}; } #endif diff --git a/include/ck_tile/core/numeric/type_convert.hpp b/include/ck_tile/core/numeric/type_convert.hpp index 94d6e3cd34..1455fce0ea 100644 --- a/include/ck_tile/core/numeric/type_convert.hpp +++ b/include/ck_tile/core/numeric/type_convert.hpp @@ -64,6 +64,7 @@ CK_TILE_TYPE_CONVERT(bf8_t, bf8, float, float) CK_TILE_TYPE_CONVERT(float, float, int8_t, int8) CK_TILE_TYPE_CONVERT(int8_t, int8, float, float) +#undef CK_TILE_TYPE_CONVERT } // namespace ck_tile @@ -71,16 +72,36 @@ CK_TILE_TYPE_CONVERT(int8_t, int8, float, float) namespace ck_tile { -CK_TILE_TYPE_CONVERT(pk_fp4_t, pk_fp4, fp32x2_t, fp32x2) -CK_TILE_TYPE_CONVERT(fp32x2_t, fp32x2, pk_fp4_t, pk_fp4) -CK_TILE_TYPE_CONVERT(pk_fp4_t, pk_fp4, fp16x2_t, fp16x2) -CK_TILE_TYPE_CONVERT(fp16x2_t, fp16x2, pk_fp4_t, pk_fp4) -CK_TILE_TYPE_CONVERT(pk_fp4_t, pk_fp4, bf16x2_t, bf16x2) -CK_TILE_TYPE_CONVERT(bf16x2_t, bf16x2, pk_fp4_t, pk_fp4) -CK_TILE_TYPE_CONVERT(pk_fp4_t, pk_fp4, float, float) -CK_TILE_TYPE_CONVERT(pk_fp4_t, pk_fp4, bf16_t, bf16) -CK_TILE_TYPE_CONVERT(pk_fp4_t, pk_fp4, fp16_t, fp16) -#undef CK_TILE_TYPE_CONVERT +template +CK_TILE_HOST_DEVICE constexpr Y scaled_type_convert(X x, float scale); + +#define CK_TILE_SCALED_TYPE_CONVERT(dtype_, dname_, stype_, sname_) \ + template <> \ + CK_TILE_HOST_DEVICE constexpr dtype_ scaled_type_convert(stype_ x, \ + float scale) \ + { \ + return sname_##_to_##dname_(x, scale); \ + } \ + template <> \ + CK_TILE_HOST_DEVICE constexpr dtype_ type_convert(stype_ x) \ + { \ + return sname_##_to_##dname_(x, 1.f); \ + } + +CK_TILE_SCALED_TYPE_CONVERT(pk_fp4_t, pk_fp4, fp32x2_t, fp32x2) +CK_TILE_SCALED_TYPE_CONVERT(fp32x2_t, fp32x2, pk_fp4_t, pk_fp4) +CK_TILE_SCALED_TYPE_CONVERT(pk_fp4_t, pk_fp4, fp16x2_t, fp16x2) +CK_TILE_SCALED_TYPE_CONVERT(fp16x2_t, fp16x2, pk_fp4_t, pk_fp4) +CK_TILE_SCALED_TYPE_CONVERT(pk_fp4_t, pk_fp4, bf16x2_t, bf16x2) +CK_TILE_SCALED_TYPE_CONVERT(bf16x2_t, bf16x2, pk_fp4_t, pk_fp4) +CK_TILE_SCALED_TYPE_CONVERT(pk_fp4_t, pk_fp4, float, float) +CK_TILE_SCALED_TYPE_CONVERT(float, float, pk_fp4_t, pk_fp4) +CK_TILE_SCALED_TYPE_CONVERT(pk_fp4_t, pk_fp4, bf16_t, bf16) +CK_TILE_SCALED_TYPE_CONVERT(bf16_t, bf16, pk_fp4_t, pk_fp4) +CK_TILE_SCALED_TYPE_CONVERT(pk_fp4_t, pk_fp4, fp16_t, fp16) +CK_TILE_SCALED_TYPE_CONVERT(fp16_t, fp16, pk_fp4_t, pk_fp4) +#undef CK_TILE_SCALED_TYPE_CONVERT + #endif } // namespace ck_tile diff --git a/include/ck_tile/core/numeric/vector_type.hpp b/include/ck_tile/core/numeric/vector_type.hpp index b165275a8c..58bdb43b08 100644 --- a/include/ck_tile/core/numeric/vector_type.hpp +++ b/include/ck_tile/core/numeric/vector_type.hpp @@ -84,7 +84,7 @@ using ext_vector_t = typename impl::ext_vector::type; // by default, any type will result in a vector_size=1 with scalar_type=T traits. // ... unless we have other vector_traits specialization -template +template struct vector_traits { using scalar_type = @@ -94,7 +94,7 @@ struct vector_traits // specialization for ext_vector_type() template -struct vector_traits +struct vector_traits { using scalar_type = std::conditional_t, int8_t, T>; static constexpr index_t vector_size = N; diff --git a/include/ck_tile/core/tensor/buffer_view.hpp b/include/ck_tile/core/tensor/buffer_view.hpp index 4b39773939..ca314a6abe 100644 --- a/include/ck_tile/core/tensor/buffer_view.hpp +++ b/include/ck_tile/core/tensor/buffer_view.hpp @@ -210,28 +210,6 @@ struct buffer_view(const_cast*>(p_data_))); - - // buffer_size_ - printf("buffer_size_: "); - print(buffer_size_); - printf(", "); - - // invalid_element_value_ - printf("invalid_element_value_: "); - print(invalid_element_value_); - - printf("}"); - } }; // Address Space: Global @@ -757,28 +735,6 @@ struct buffer_view(const_cast*>(p_data_))); - - // buffer_size_ - printf("buffer_size_: "); - print(buffer_size_); - printf(", "); - - // invalid_element_value_ - printf("invalid_element_value_: "); - print(invalid_element_value_); - - printf("}"); - } }; // Address Space: LDS @@ -1138,28 +1094,6 @@ struct buffer_view(const_cast*>(p_data_))); - - // buffer_size_ - printf("buffer_size_: "); - print(buffer_size_); - printf(", "); - - // invalid_element_value_ - printf("invalid_element_value_: "); - print(invalid_element_value_); - - printf("}"); - } }; // Address Space: Vgpr @@ -1313,28 +1247,6 @@ struct buffer_view(const_cast*>(p_data_))); - - // buffer_size_ - printf("buffer_size_: "); - print(buffer_size_); - printf(", "); - - // invalid_element_value_ - printf("invalid_element_value_: "); - print(invalid_element_value_); - - printf("}"); - } }; template +CK_TILE_HOST_DEVICE void print(const buffer_view& bv) +{ + printf("buffer_view{AddressSpace: %s, p_data_: %p, buffer_size_: ", + address_space_to_string(BufferAddressSpace), + static_cast(const_cast*>(bv.p_data_))); + print(bv.buffer_size_); + printf(", invalid_element_value_: "); + print(bv.invalid_element_value_); + printf("}"); +} + } // namespace ck_tile diff --git a/include/ck_tile/core/tensor/null_tile_window.hpp b/include/ck_tile/core/tensor/null_tile_window.hpp index de99be1965..f7eca73afb 100644 --- a/include/ck_tile/core/tensor/null_tile_window.hpp +++ b/include/ck_tile/core/tensor/null_tile_window.hpp @@ -1,5 +1,5 @@ // SPDX-License-Identifier: MIT -// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved. #pragma once @@ -53,10 +53,13 @@ struct is_null_tile_window> : public std::true_type }; } // namespace impl +template +constexpr bool is_null_tile_window_v = impl::is_null_tile_window>::value; + template CK_TILE_DEVICE constexpr auto is_null_tile_window(const T&) { - return impl::is_null_tile_window>::value; + return is_null_tile_window_v>; } template diff --git a/include/ck_tile/core/tensor/tensor_adaptor.hpp b/include/ck_tile/core/tensor/tensor_adaptor.hpp index e2a6ae6555..ec5538d79c 100644 --- a/include/ck_tile/core/tensor/tensor_adaptor.hpp +++ b/include/ck_tile/core/tensor/tensor_adaptor.hpp @@ -305,42 +305,45 @@ struct tensor_adaptor get_container_subset(vector_strides, top_dims)); } - CK_TILE_HOST_DEVICE void print() const - { - printf("tensor_adaptor{"); - - // - printf("transforms: "); - print(transforms_); - printf(", "); - - // - printf("LowerDimensionHiddenIds: "); - print(LowerDimensionHiddenIdss{}); - printf(", "); - - // - printf("UpperDimensionHiddenIds: "); - print(UpperDimensionHiddenIdss{}); - printf(", "); - - // - printf("BottomDimensionHiddenIds: "); - print(BottomDimensionHiddenIds{}); - printf(", "); - - // - printf("TopDimensionHiddenIds: "); - print(TopDimensionHiddenIds{}); - - printf("}"); - } - private: Transforms transforms_; ElementSize element_size_; }; +template +CK_TILE_HOST_DEVICE static void print(const tensor_adaptor& adaptor) +{ + printf("tensor_adaptor{\n"); + printf(" transforms: ["); + print(adaptor.get_transforms()); + printf("],\n"); + + printf(" LowerDimensionHiddenIds: ["); + print(LowerDimensionHiddenIdss{}); + printf("],\n"); + + printf(" UpperDimensionHiddenIds: ["); + print(UpperDimensionHiddenIdss{}); + printf("],\n"); + + printf(" BottomDimensionHiddenIds: ["); + print(BottomDimensionHiddenIds{}); + printf("],\n"); + + // + printf(" TopDimensionHiddenIds: ["); + print(TopDimensionHiddenIds{}); + printf("]\n}\n"); +} + // Transforms: Tuple // LowerDimensionOldTopIdss: Tuple, ...> // UpperDimensionNewTopIdss: Tuple, ...> diff --git a/include/ck_tile/core/tensor/tensor_descriptor.hpp b/include/ck_tile/core/tensor/tensor_descriptor.hpp index 0c3e04f315..0e4787a2f1 100644 --- a/include/ck_tile/core/tensor/tensor_descriptor.hpp +++ b/include/ck_tile/core/tensor/tensor_descriptor.hpp @@ -140,25 +140,37 @@ struct tensor_descriptor : public tensor_adaptor(GuaranteedVectorStrides{})); } - CK_TILE_HOST_DEVICE void print() const - { - printf("tensor_descriptor{"); - - // tensor_adaptor - Base::print(); - printf(", "); - - // element_space_size_ - printf("element_space_size_: "); - print(element_space_size_); - - printf("}"); - } - // TODO make these private ElementSpaceSize element_space_size_; }; +template +CK_TILE_HOST_DEVICE static void print(const tensor_descriptor& descriptor) +{ + printf("tensor_descriptor{\n"); + // first print the tensor adaptor part of the descriptor using the base class print + print(static_cast(descriptor)); + printf("element_space_size_: %ld,\n", + static_cast(descriptor.get_element_space_size().value)); + printf("guaranteed_vector_lengths: "); + print(GuaranteedVectorLengths{}); + printf(",\nguaranteed_vector_strides: "); + print(GuaranteedVectorStrides{}); + printf("}\n}\n"); +} + template CK_TILE_HOST_DEVICE constexpr auto make_tensor_descriptor_from_adaptor(const Adaptor& adaptor, diff --git a/include/ck_tile/core/tensor/tile_distribution.hpp b/include/ck_tile/core/tensor/tile_distribution.hpp index 11e6b35c39..bc02ec74d2 100644 --- a/include/ck_tile/core/tensor/tile_distribution.hpp +++ b/include/ck_tile/core/tensor/tile_distribution.hpp @@ -228,24 +228,6 @@ struct tile_distribution { return PsYs2XsAdaptor::is_static() && Ys2DDescriptor::is_static(); } - - CK_TILE_HOST_DEVICE void print() const - { - printf("tile_distribution{"); - // - printf("tile_distribution_encoding: "); - print(DstrEncode{}); - printf(", "); - // - printf("ps_ys_to_xs_: "); - print(ps_ys_to_xs_); - printf(", "); - // - printf("ys_to_d_: "); - print(ys_to_d_); - // - printf("}"); - } }; namespace detail { @@ -710,4 +692,27 @@ CK_TILE_HOST_DEVICE constexpr auto slice_distribution_from_x( } } // namespace detail + +// Free print function for tile_distribution +template +CK_TILE_HOST_DEVICE void print(const tile_distribution& distribution) +{ + printf("tile_distribution{"); + printf("tile_distribution_encoding: "); + print(StaticTileDistributionEncoding_{}); + printf(", "); + printf("ps_ys_to_xs_: "); + print(distribution.ps_ys_to_xs_); + printf(", "); + printf("ys_to_d_: "); + print(distribution.ys_to_d_); + printf("}\n"); +} + } // namespace ck_tile diff --git a/include/ck_tile/core/tensor/tile_distribution_encoding.hpp b/include/ck_tile/core/tensor/tile_distribution_encoding.hpp index b380e7c9d8..90d1a2ccb2 100644 --- a/include/ck_tile/core/tensor/tile_distribution_encoding.hpp +++ b/include/ck_tile/core/tensor/tile_distribution_encoding.hpp @@ -428,109 +428,7 @@ struct tile_distribution_encoding { return get_sorted_info(get_uniformed_idx_y_to_h(), get_h_dim_lengths_prefix_sum()); } - - CK_TILE_HOST_DEVICE void print() const - { - printf("tile_distribution_encoding::detail{"); - // - printf("ndim_rh_major_: "); - print(ndim_rh_major_); - printf(", "); - // - printf("ndim_span_major_: "); - print(ndim_span_major_); - printf(", "); - // - printf("ndims_rhs_minor_: "); - print(ndims_rhs_minor_); - printf(", "); - // - printf("ndim_rh_major_: "); - print(ndim_rh_major_); - printf(", "); - // - printf("max_ndim_rh_minor_: "); - print(max_ndim_rh_minor_); - printf(", "); - // - printf("rhs_lengthss_: "); - print(rhs_lengthss_); - printf(", "); - // - printf("ys_lengths_: "); - print(ys_lengths_); - printf(", "); - // - printf("rhs_major_minor_to_ys_: "); - print(rhs_major_minor_to_ys_); - printf(", "); - // - printf("ndims_span_minor_: "); - print(ndims_span_minor_); - printf(", "); - // - printf("max_ndim_span_minor_: "); - print(max_ndim_span_minor_); - printf(", "); - // - printf("ys_to_span_major_: "); - print(ys_to_span_major_); - printf(", "); - // - printf("ys_to_span_minor_: "); - print(ys_to_span_minor_); - printf(", "); - // - printf("distributed_spans_lengthss_: "); - print(distributed_spans_lengthss_); - printf(", "); - // - printf("ndims_distributed_spans_minor_: "); - print(ndims_distributed_spans_minor_); - printf(", "); - // - printf("ps_over_rs_derivative_: "); - print(ps_over_rs_derivative_); - // - printf("}"); - } }; - - CK_TILE_HOST_DEVICE void print() const - { - printf("tile_distribution_encoding{"); - // - printf("NDimX: %d, NDimP: %d, NDimY: %d, ", NDimX, NDimP, NDimY); - // - printf("rs_lengths_: "); - print(rs_lengths_); - printf(", "); - // - printf("hs_lengthss_: "); - print(hs_lengthss_); - printf(", "); - // - printf("ps_to_rhss_major_: "); - print(ps_to_rhss_major_); - printf(", "); - // - printf("ps_to_rhss_minor_: "); - print(ps_to_rhss_minor_); - printf(", "); - // - printf("ys_to_rhs_major_: "); - print(ys_to_rhs_major_); - printf(", "); - // - printf("ys_to_rhs_minor_: "); - print(ys_to_rhs_minor_); - printf(", "); - // - printf("detail: "); - print(detail{}); - // - printf("}"); - } }; template @@ -896,4 +794,106 @@ make_reduce_tile_distribution_encoding(InDstr, sequence reduce } } // namespace detail + +// Free print function for tile_distribution_encoding::detail +template +CK_TILE_HOST_DEVICE void +print(const typename tile_distribution_encoding::detail& detail_obj) +{ + printf("tile_distribution_encoding::detail{"); + printf("ndim_rh_major_: "); + print(detail_obj.ndim_rh_major_); + printf(", "); + printf("ndim_span_major_: "); + print(detail_obj.ndim_span_major_); + printf(", "); + printf("ndims_rhs_minor_: "); + print(detail_obj.ndims_rhs_minor_); + printf(", "); + printf("ndim_rh_major_: "); + print(detail_obj.ndim_rh_major_); + printf(", "); + printf("max_ndim_rh_minor_: "); + print(detail_obj.max_ndim_rh_minor_); + printf(", "); + printf("rhs_lengthss_: "); + print(detail_obj.rhs_lengthss_); + printf(", "); + printf("ys_lengths_: "); + print(detail_obj.ys_lengths_); + printf(", "); + printf("rhs_major_minor_to_ys_: "); + print(detail_obj.rhs_major_minor_to_ys_); + printf(", "); + printf("ndims_span_minor_: "); + print(detail_obj.ndims_span_minor_); + printf(", "); + printf("max_ndim_span_minor_: "); + print(detail_obj.max_ndim_span_minor_); + printf(", "); + printf("ys_to_span_major_: "); + print(detail_obj.ys_to_span_major_); + printf(", "); + printf("ys_to_span_minor_: "); + print(detail_obj.ys_to_span_minor_); + printf(", "); + printf("distributed_spans_lengthss_: "); + print(detail_obj.distributed_spans_lengthss_); + printf(", "); + printf("ndims_distributed_spans_minor_: "); + print(detail_obj.ndims_distributed_spans_minor_); + printf(", "); + printf("ps_over_rs_derivative_: "); + print(detail_obj.ps_over_rs_derivative_); + printf("}"); +} + +// Free print function for tile_distribution_encoding +template +CK_TILE_HOST_DEVICE void print(const tile_distribution_encoding& encoding) +{ + printf("tile_distribution_encoding{"); + + printf("NDimX: %d, NDimP: %d, NDimY: %d, ", encoding.NDimX, encoding.NDimP, encoding.NDimY); + printf("rs_lengths_: "); + print(encoding.rs_lengths_); + printf(", "); + printf("hs_lengthss_: "); + print(encoding.hs_lengthss_); + printf(", "); + printf("ps_to_rhss_major_: "); + print(encoding.ps_to_rhss_major_); + printf(", "); + printf("ps_to_rhss_minor_: "); + print(encoding.ps_to_rhss_minor_); + printf(", "); + printf("ys_to_rhs_major_: "); + print(encoding.ys_to_rhs_major_); + printf(", "); + printf("ys_to_rhs_minor_: "); + print(encoding.ys_to_rhs_minor_); + printf(", "); + printf("}"); +} + } // namespace ck_tile diff --git a/include/ck_tile/core/utility/print.hpp b/include/ck_tile/core/utility/print.hpp new file mode 100644 index 0000000000..04635959af --- /dev/null +++ b/include/ck_tile/core/utility/print.hpp @@ -0,0 +1,76 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +#include "ck_tile/core/config.hpp" + +namespace ck_tile { + +/// Declare a ck_tile::print() interface that gets specialized in each header file for types that +/// can be printed. +template +CK_TILE_HOST_DEVICE void print(const T&) +{ + static_assert(sizeof(T) == 0, + "No print implementation available for this type. Please specialize " + "ck_tile::print for your type."); +} + +/// Specialization for int +template <> +CK_TILE_HOST_DEVICE void print(const int& value) +{ + printf("%d", value); +} + +/// Specialization for float +template <> +CK_TILE_HOST_DEVICE void print(const float& value) +{ + printf("%f", value); +} + +/// Specialization for double +template <> +CK_TILE_HOST_DEVICE void print(const double& value) +{ + printf("%f", value); +} + +/// Specialization for long +template <> +CK_TILE_HOST_DEVICE void print(const long& value) +{ + printf("%ld", value); +} + +/// Specialization for unsigned int +template <> +CK_TILE_HOST_DEVICE void print(const unsigned int& value) +{ + printf("%u", value); +} + +/// Specialization for char +template <> +CK_TILE_HOST_DEVICE void print(const char& value) +{ + printf("%c", value); +} + +/// Specialization for array +template +CK_TILE_HOST_DEVICE void print(const T (&value)[N]) +{ + printf("["); + for(size_t i = 0; i < N; ++i) + { + if(i > 0) + printf(", "); + print(value[i]); // Recursively call print for each element + } + printf("]"); +} + +} // namespace ck_tile diff --git a/include/ck_tile/core/utility/reduce_operator.hpp b/include/ck_tile/core/utility/reduce_operator.hpp index 8b15d187fe..2d7ac78b06 100644 --- a/include/ck_tile/core/utility/reduce_operator.hpp +++ b/include/ck_tile/core/utility/reduce_operator.hpp @@ -26,7 +26,8 @@ struct Add } template || std::is_same_v>> + typename = std::enable_if_t || std::is_same_v || + std::is_same_v || std::is_same_v>> CK_TILE_HOST_DEVICE constexpr T operator()(T& y, T x) const { float y_ = type_convert(y); @@ -34,6 +35,8 @@ struct Add return type_convert(y_ + x_); } + + static constexpr bool requires_special_combine = false; }; struct SquareAdd @@ -51,13 +54,47 @@ struct SquareAdd { return y + (x * x); } + + template || std::is_same_v || + std::is_same_v || std::is_same_v>> + CK_TILE_HOST_DEVICE constexpr T operator()(T& y, T x) const + { + float y_ = type_convert(y); + float x_ = type_convert(x); + return type_convert(y_ + (x_ * x_)); + } + + // For combining partial results + template || std::is_same_v || + std::is_same_v || std::is_same_v>> + CK_TILE_HOST_DEVICE constexpr T combine_partial_results(const T& partial1, + const T& partial2) const + { + return partial1 + partial2; // Just add the partial sums, don't square again + } + + template || std::is_same_v || + std::is_same_v || std::is_same_v>> + CK_TILE_HOST_DEVICE constexpr T combine_partial_results(T& partial1, T& partial2) const + { + float partial1_ = type_convert(partial1); + float partial2_ = type_convert(partial2); + return type_convert(partial1_ + partial2_); + } + + static constexpr bool requires_special_combine = true; }; struct Max { template || std::is_same_v || - std::is_same_v || std::is_same_v>> + std::is_same_v || std::is_same_v || + std::is_same_v || std::is_same_v || + std::is_same_v || std::is_same_v>> CK_TILE_HOST_DEVICE static constexpr T GetIdentityValue() { return numeric::min(); @@ -65,18 +102,24 @@ struct Max template || std::is_same_v || - std::is_same_v || std::is_same_v>> + std::is_same_v || std::is_same_v || + std::is_same_v || std::is_same_v || + std::is_same_v || std::is_same_v>> CK_TILE_HOST_DEVICE constexpr T operator()(const T& y, const T x) const { return max(y, x); } + + static constexpr bool requires_special_combine = false; }; struct AbsMax { template || std::is_same_v || - std::is_same_v || std::is_same_v>> + std::is_same_v || std::is_same_v || + std::is_same_v || std::is_same_v || + std::is_same_v || std::is_same_v>> CK_TILE_HOST_DEVICE static constexpr T GetIdentityValue() { return numeric::min(); @@ -84,11 +127,15 @@ struct AbsMax template || std::is_same_v || - std::is_same_v || std::is_same_v>> + std::is_same_v || std::is_same_v || + std::is_same_v || std::is_same_v || + std::is_same_v || std::is_same_v>> CK_TILE_HOST_DEVICE constexpr T operator()(const T& y, const T x) const { return max(y, abs(x)); } + + static constexpr bool requires_special_combine = false; }; } // namespace ReduceOp diff --git a/include/ck_tile/host/host_tensor.hpp b/include/ck_tile/host/host_tensor.hpp index c3f1b7d221..b7329fcac7 100644 --- a/include/ck_tile/host/host_tensor.hpp +++ b/include/ck_tile/host/host_tensor.hpp @@ -409,7 +409,13 @@ struct HostTensor } // void SetZero() { ck_tile::ranges::fill(mData, 0); } - void SetZero() { std::fill(mData.begin(), mData.end(), 0); } + void SetZero() + { + if constexpr(std::is_same_v) + std::fill(mData.begin(), mData.end(), e8m0_t{1.f}); + else + std::fill(mData.begin(), mData.end(), 0); + } template void ForEach_impl(F&& f, std::vector& idx, size_t rank) diff --git a/include/ck_tile/host/reference/reference_reduce.hpp b/include/ck_tile/host/reference/reference_reduce.hpp index 8f8aa23670..9952b7b009 100644 --- a/include/ck_tile/host/reference/reference_reduce.hpp +++ b/include/ck_tile/host/reference/reference_reduce.hpp @@ -30,4 +30,82 @@ reference_reduce(const HostTensor& x_m_n, HostTensor& y_m, make_ParallelTensorFunctor(f, y_m.mDesc.get_lengths()[0])(std::thread::hardware_concurrency()); } + +// Generic reference reduce for arbitrary dimensions +template < + typename XDataType, + typename ComputeDataType, + typename YDataType, + typename ReduceOp, + typename KeptDim, // Expected type: ck_tile::sequence<...> containing dimension indices to keep + typename ReduceDims> // Expected type: ck_tile::sequence<...> containing dimension indices to + // reduce +CK_TILE_HOST void reference_reduce(const HostTensor& x_tensor, + HostTensor& y_tensor, + ReduceOp reduce_op, + KeptDim kept_dim, + ReduceDims reduce_dims) +{ + const auto& x_lengths = x_tensor.mDesc.get_lengths(); + + // Calculate total kept elements (product of all kept dimension lengths) + index_t total_kept_elements = 1; + static_for<0, kept_dim.size(), 1>{}( + [&](auto i) { total_kept_elements *= x_lengths[kept_dim.at(i)]; }); + + // Calculate total reduce elements (product of all reduce dimension lengths) + index_t total_reduce_elements = 1; + static_for<0, reduce_dims.size(), 1>{}( + [&](auto i) { total_reduce_elements *= x_lengths[reduce_dims.at(i)]; }); + + auto f = [&](auto linear_kept_idx) { + ComputeDataType v_acc = reduce_op.template GetIdentityValue(); + + // Convert linear kept index to multi-dimensional kept indices + std::vector kept_indices(kept_dim.size()); + index_t temp_kept = linear_kept_idx; + static_for<0, kept_dim.size(), 1>{}([&](auto i) { + constexpr auto dim_idx = kept_dim.size() - 1 - i; + constexpr auto dim = kept_dim.at(dim_idx); + const auto len = x_lengths[dim]; + kept_indices[dim_idx] = temp_kept % len; + temp_kept /= len; + }); + + for(index_t reduce_idx = 0; reduce_idx < total_reduce_elements; ++reduce_idx) + { + // Convert linear reduce index to multi-dimensional reduce indices + std::vector reduce_indices(reduce_dims.size()); + index_t temp_reduce = reduce_idx; + static_for<0, reduce_dims.size(), 1>{}([&](auto i) { + constexpr auto dim_idx = reduce_dims.size() - 1 - i; + constexpr auto dim = reduce_dims.at(dim_idx); + const auto len = x_lengths[dim]; + reduce_indices[dim_idx] = temp_reduce % len; + temp_reduce /= len; + }); + + // Build full input tensor indices by combining kept and reduce indices + std::vector full_indices(x_lengths.size(), 0); + static_for<0, kept_dim.size(), 1>{}( + [&](auto i) { full_indices[kept_dim.at(i)] = kept_indices[i]; }); + static_for<0, reduce_dims.size(), 1>{}( + [&](auto i) { full_indices[reduce_dims.at(i)] = reduce_indices[i]; }); + + // Access input tensor element + const auto v_a = type_convert(x_tensor(full_indices)); + + v_acc = reduce_op(v_acc, v_a); + } + + // Calculate output tensor index using kept indices + // The output tensor has the same structure as the kept dimensions + std::vector y_indices(kept_dim.size()); + static_for<0, kept_dim.size(), 1>{}([&](auto i) { y_indices[i] = kept_indices[i]; }); + + y_tensor(y_indices) = type_convert(v_acc); + }; + + make_ParallelTensorFunctor(f, total_kept_elements)(std::thread::hardware_concurrency()); +} } // namespace ck_tile diff --git a/include/ck_tile/host/reference/reference_softmax.hpp b/include/ck_tile/host/reference/reference_softmax.hpp index d86e879944..4e729c437d 100644 --- a/include/ck_tile/host/reference/reference_softmax.hpp +++ b/include/ck_tile/host/reference/reference_softmax.hpp @@ -1,5 +1,5 @@ // SPDX-License-Identifier: MIT -// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved. #pragma once @@ -14,7 +14,7 @@ CK_TILE_HOST void reference_softmax(const HostTensor& x, HostTensor& y, index_t dim = -1) { index_t rank = x.get_num_of_dimension(); - assert(rank == y.get_num_of_dimension()); + assert(static_cast(rank) == y.get_num_of_dimension()); assert(dim == -1 || dim < rank); index_t target_dim = dim == -1 ? (rank - 1) : dim; diff --git a/include/ck_tile/host/reference/reference_topk.hpp b/include/ck_tile/host/reference/reference_topk.hpp index 3d0404a2e5..0fc99a983a 100644 --- a/include/ck_tile/host/reference/reference_topk.hpp +++ b/include/ck_tile/host/reference/reference_topk.hpp @@ -1,5 +1,5 @@ // SPDX-License-Identifier: MIT -// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved. #pragma once @@ -38,8 +38,8 @@ CK_TILE_HOST void reference_topk(const HostTensor& x, { // rank must be the same index_t rank = x.get_num_of_dimension(); - assert(rank == y_values.get_num_of_dimension()); - assert(rank == y_indices.get_num_of_dimension()); + assert(static_cast(rank) == y_values.get_num_of_dimension()); + assert(static_cast(rank) == y_indices.get_num_of_dimension()); assert(dim == -1 || dim < rank); index_t topk_dim = dim == -1 ? (rank - 1) : dim; @@ -47,7 +47,8 @@ CK_TILE_HOST void reference_topk(const HostTensor& x, auto x_len = x.get_lengths(); assert(k <= topk_src_len); - assert(k == y_values.get_length(topk_dim) && k == y_indices.get_length(topk_dim)); + assert(static_cast(k) == y_values.get_length(topk_dim) && + static_cast(k) == y_indices.get_length(topk_dim)); index_t n_parallel = x.get_element_size() / topk_src_len; diff --git a/include/ck_tile/ops/batched_transpose/kernel/batched_transpose_kernel.hpp b/include/ck_tile/ops/batched_transpose/kernel/batched_transpose_kernel.hpp index a89a190489..a4150e8d84 100644 --- a/include/ck_tile/ops/batched_transpose/kernel/batched_transpose_kernel.hpp +++ b/include/ck_tile/ops/batched_transpose/kernel/batched_transpose_kernel.hpp @@ -49,9 +49,11 @@ struct BatchedTransposeKernel CK_TILE_HOST static constexpr auto GridSize(const Hargs& host_args) { - size_t grid_size_x = (host_args.height + host_args.dim_block_h - 1) / host_args.dim_block_h; - size_t grid_size_y = (host_args.width + host_args.dim_block_w - 1) / host_args.dim_block_w; - size_t grid_size_z = host_args.batch; + const size_t grid_size_x = + ck_tile::integer_divide_ceil(host_args.height, host_args.dim_block_h); + const size_t grid_size_y = + ck_tile::integer_divide_ceil(host_args.width, host_args.dim_block_w); + const size_t grid_size_z = host_args.batch; return dim3(grid_size_x, grid_size_y, grid_size_z); } @@ -71,41 +73,43 @@ struct BatchedTransposeKernel CK_TILE_DEVICE void operator()(Kargs kargs) const { - static constexpr ck_tile::index_t kMPerBlock = Problem::kMPerBlock; - static constexpr ck_tile::index_t kNPerBlock = Problem::kNPerBlock; - static constexpr bool kPadM = Problem::kPadM; - static constexpr bool kPadN = Problem::kPadN; - static constexpr ck_tile::index_t VectorSizeInput = Problem::VectorSizeInput; - static constexpr ck_tile::index_t VectorSizeOutput = Problem::VectorSizeOutput; + static constexpr ck_tile::index_t kMPerBlock = Problem::kMPerBlock; + static constexpr ck_tile::index_t kNPerBlock = Problem::kNPerBlock; + static constexpr bool kPadM = Problem::kPadM; + static constexpr bool kPadN = Problem::kPadN; + static constexpr ck_tile::index_t VectorSizeInput = Problem::VectorSizeInput; + static constexpr ck_tile::index_t VectorStrideInput = 1; + static constexpr ck_tile::index_t VectorSizeOutput = Problem::VectorSizeOutput; + static constexpr ck_tile::index_t VectorStrideOutput = 1; - const auto iM = __builtin_amdgcn_readfirstlane(blockIdx.x * kMPerBlock); - const auto iN = __builtin_amdgcn_readfirstlane(blockIdx.y * kNPerBlock); - const auto iDim = blockIdx.z; + const auto iM = __builtin_amdgcn_readfirstlane(blockIdx.x * kMPerBlock); + const auto iN = __builtin_amdgcn_readfirstlane(blockIdx.y * kNPerBlock); + const auto offset = __builtin_amdgcn_readfirstlane(blockIdx.z * kargs.height * kargs.width); const auto x_m_n = [&]() { const auto x_dram_naive = make_naive_tensor_view( - static_cast(kargs.p_input) + iDim * kargs.dim_stride, + static_cast(kargs.p_input) + offset, make_tuple(kargs.height, kargs.width), make_tuple(kargs.width, 1), number{}, - number<1>{}); + number{}); return pad_tensor_view(x_dram_naive, make_tuple(number{}, number{}), - sequence{}); + sequence{}); }(); const auto y_n_m = [&]() { const auto y_dram_naive = make_naive_tensor_view( - static_cast(kargs.p_output) + iDim * kargs.dim_stride, + static_cast(kargs.p_output) + offset, make_tuple(kargs.width, kargs.height), make_tuple(kargs.height, 1), number{}, - number<1>{}); + number{}); return pad_tensor_view(y_dram_naive, make_tuple(number{}, number{}), - sequence{}); + sequence{}); }(); auto x_block_window = make_tile_window( diff --git a/include/ck_tile/ops/batched_transpose/pipeline/batched_transpose_common_policy.hpp b/include/ck_tile/ops/batched_transpose/pipeline/batched_transpose_common_policy.hpp index e344c24bf5..3b8d5a142e 100644 --- a/include/ck_tile/ops/batched_transpose/pipeline/batched_transpose_common_policy.hpp +++ b/include/ck_tile/ops/batched_transpose/pipeline/batched_transpose_common_policy.hpp @@ -15,15 +15,15 @@ struct BatchedTransposeCommonPolicy template CK_TILE_DEVICE static constexpr auto MakeInputDistribution() { - constexpr index_t BlockSize = Problem::kBlockSize; - constexpr index_t LeadDimPerBlock = Problem::kMPerBlock; - constexpr index_t SecondDimPerBlock = Problem::kNPerBlock; + constexpr index_t kBlockSize = Problem::kBlockSize; + constexpr index_t kLeadDimPerBlock = Problem::kNPerBlock; + constexpr index_t kSecondDimPerBlock = Problem::kMPerBlock; - constexpr index_t kVectorSize = Problem::VectorSizeOutput; - - using TileEncodingPattern = TileDistributionEncodingPattern2D; return TileEncodingPattern::Make2DStaticTileDistribution(); diff --git a/include/ck_tile/ops/batched_transpose/pipeline/batched_transpose_lds_problem.hpp b/include/ck_tile/ops/batched_transpose/pipeline/batched_transpose_lds_problem.hpp index 491db37564..45803ae2da 100644 --- a/include/ck_tile/ops/batched_transpose/pipeline/batched_transpose_lds_problem.hpp +++ b/include/ck_tile/ops/batched_transpose/pipeline/batched_transpose_lds_problem.hpp @@ -18,19 +18,19 @@ struct BatchedTransposeLdsProblem { using DataType = remove_cvref_t; - static constexpr index_t kRowWarps_ = NumWarps::at(number<1>{}); - static constexpr index_t kColWarps_ = NumWarps::at(number<0>{}); + static constexpr index_t kRowWarps_ = NumWarps::at(number<0>{}); + static constexpr index_t kColWarps_ = NumWarps::at(number<1>{}); static constexpr index_t kBlockSize_ = get_warp_size() * kRowWarps_ * kColWarps_; - static constexpr index_t kRowPerBlock_ = BlockTile::at(number<1>{}); - static constexpr index_t kColPerBlock_ = BlockTile::at(number<0>{}); + static constexpr index_t kRowPerBlock_ = BlockTile::at(number<0>{}); + static constexpr index_t kColPerBlock_ = BlockTile::at(number<1>{}); static constexpr index_t kBlockSize = kBlockSize_; // warps per block - static constexpr index_t kLeadNumWarps = kRowWarps_; - static constexpr index_t kSecondNumWarps = kColWarps_; + static constexpr index_t kLeadNumWarps = kColWarps_; + static constexpr index_t kSecondNumWarps = kRowWarps_; - static constexpr index_t kLeadSizePerBlock = kRowPerBlock_; - static constexpr index_t kSecondSizePerBlock = kColPerBlock_; + static constexpr index_t kLeadSizePerBlock = kColPerBlock_; + static constexpr index_t kSecondSizePerBlock = kRowPerBlock_; static constexpr index_t kQuadrantLeadDim = LaneGroupTransposeTraits::kleadDim; static constexpr index_t kQuadrantSecondDim = LaneGroupTransposeTraits::ksecondDim; @@ -60,8 +60,8 @@ struct BatchedTransposeLdsProblem static constexpr bool kPadM = kPadM_; static constexpr bool kPadN = kPadN_; - static constexpr auto kMPerBlock = kLeadSizePerBlock; - static constexpr auto kNPerBlock = kSecondSizePerBlock; + static constexpr auto kMPerBlock = kSecondSizePerBlock; + static constexpr auto kNPerBlock = kLeadSizePerBlock; // 128-bit is the max single-instruction bandwidth for load/store static constexpr index_t MaxLoadStoreSize = 16; diff --git a/include/ck_tile/ops/batched_transpose/pipeline/batched_transpose_policy.hpp b/include/ck_tile/ops/batched_transpose/pipeline/batched_transpose_policy.hpp index 5238fecdc5..e6bbc709ea 100644 --- a/include/ck_tile/ops/batched_transpose/pipeline/batched_transpose_policy.hpp +++ b/include/ck_tile/ops/batched_transpose/pipeline/batched_transpose_policy.hpp @@ -19,8 +19,8 @@ struct BatchedTransposePolicy : public BatchedTransposeCommonPolicy constexpr index_t VecLoadSize = Problem::VectorSizeOutput; using TileEncodingPattern = TileDistributionEncodingPattern2D; return TileEncodingPattern::MakeShuffled2DStaticTileDistribution(); diff --git a/include/ck_tile/ops/fmha.hpp b/include/ck_tile/ops/fmha.hpp index 30bea193b7..313de5f29a 100644 --- a/include/ck_tile/ops/fmha.hpp +++ b/include/ck_tile/ops/fmha.hpp @@ -24,8 +24,8 @@ #include "ck_tile/ops/fmha/pipeline/block_fmha_bwd_dot_do_o.hpp" #include "ck_tile/ops/fmha/pipeline/block_fmha_bwd_dq_dk_dv_pipeline_kr_ktr_vr.hpp" #include "ck_tile/ops/fmha/pipeline/block_fmha_bwd_dq_dk_dv_pipeline_kr_ktr_vr_iglp.hpp" +#include "ck_tile/ops/fmha/pipeline/block_fmha_bwd_dq_dk_dv_pipeline_selector.hpp" #include "ck_tile/ops/fmha/pipeline/block_fmha_bwd_pipeline_default_policy.hpp" -#include "ck_tile/ops/fmha/pipeline/block_fmha_bwd_pipeline_enum.hpp" #include "ck_tile/ops/fmha/pipeline/block_fmha_bwd_pipeline_problem.hpp" #include "ck_tile/ops/fmha/pipeline/block_fmha_fwd_appendkv_pipeline.hpp" #include "ck_tile/ops/fmha/pipeline/block_fmha_fwd_appendkv_pipeline_default_policy.hpp" diff --git a/include/ck_tile/ops/fmha/kernel/fmha_bwd_kernel.hpp b/include/ck_tile/ops/fmha/kernel/fmha_bwd_kernel.hpp index ce3bf8fe8d..8b184b18f3 100644 --- a/include/ck_tile/ops/fmha/kernel/fmha_bwd_kernel.hpp +++ b/include/ck_tile/ops/fmha/kernel/fmha_bwd_kernel.hpp @@ -52,8 +52,6 @@ struct FmhaBwdDQDKDVKernel using BiasGradDataType = ck_tile::remove_cvref_t; static constexpr bool kIsGroupMode = FmhaPipeline::kIsGroupMode; - static constexpr bool kPadSeqLenQ = FmhaPipeline::kPadSeqLenQ; - static constexpr bool kPadSeqLenK = FmhaPipeline::kPadSeqLenK; static constexpr bool kPadHeadDimQ = FmhaPipeline::kPadHeadDimQ; static constexpr bool kPadHeadDimV = FmhaPipeline::kPadHeadDimV; static constexpr auto BiasEnum = FmhaPipeline::BiasEnum; @@ -85,8 +83,6 @@ struct FmhaBwdDQDKDVKernel #define _TS_ std::to_string auto pn = [&] () { std::string n; - if (kPadSeqLenQ) n += "s"; - if (kPadSeqLenK) n += "sk"; if (kPadHeadDimQ) n += "d"; if (kPadHeadDimV) n += "dv"; return n.empty() ? n : std::string("p") + n; }(); @@ -100,7 +96,7 @@ struct FmhaBwdDQDKDVKernel "r" + _TS_(gbr4::at(ck_tile::number<0>{})) + "x" + _TS_(gbr4::at(ck_tile::number<1>{})) + "x" + _TS_(gbr4::at(ck_tile::number<2>{})) + "_" + "w" + _TS_(gwt0::at(ck_tile::number<0>{})) + "x" + _TS_(gwt0::at(ck_tile::number<1>{})) + "x" + _TS_(gwt0::at(ck_tile::number<2>{})) + "_" + "w" + _TS_(gwt1::at(ck_tile::number<0>{})) + "x" + _TS_(gwt1::at(ck_tile::number<1>{})) + "x" + _TS_(gwt1::at(ck_tile::number<2>{})) + "_" + - ("o" + _TS_(kBlockPerCu) + "_") + _SS_(FmhaPipeline::name) + (pn.empty() ? "_npad" : "_" + pn) + + ("o" + _TS_(kBlockPerCu)) + (pn.empty() ? "_npad" : "_" + pn) + (BiasEnum == BlockAttentionBiasEnum::NO_BIAS ? _SS_("_nbias") : (_SS_("_") + BlockAttentionBiasEnumToStr::name)) + (kHasBiasGrad ? "_dbias" : "_ndbias") + (kHasMask ? "_" + _SS_(FmhaMask::name) : "_nmask") + (kHasDropout ? "_dropout" : "_ndropout" ) + (kIsStoreRandval ? "_storerandval" : "" ) + (kIsDeterministic ? "_deterministic" : "_ndeterministic" ); @@ -1221,7 +1217,7 @@ struct FmhaBwdDQDKDVKernel const auto q_dram = pad_tensor_view( q_dram_naive, make_tuple(number{}, number{}), - sequence{}); + sequence{}); const auto k_dram_naive = make_naive_tensor_view( k_ptr, @@ -1232,7 +1228,7 @@ struct FmhaBwdDQDKDVKernel const auto k_dram = pad_tensor_view( k_dram_naive, make_tuple(number{}, number{}), - sequence{}); + sequence{}); const auto v_dram = [&]() { const auto v_dram_naive = make_naive_tensor_view( @@ -1244,22 +1240,15 @@ struct FmhaBwdDQDKDVKernel return pad_tensor_view( v_dram_naive, make_tuple(number{}, number{}), - sequence{}); + sequence{}); }(); - const auto lse_dram = [&]() { - const auto lse_dram_naive = make_naive_tensor_view_packed( - lse_ptr, make_tuple(kargs.seqlen_q), number<1>{}); - return pad_tensor_view( - lse_dram_naive, make_tuple(number{}), sequence{}); - }(); + // lse and d should be fine to read unpaded data as they are not on the reduction dimension + const auto lse_dram = make_naive_tensor_view_packed( + lse_ptr, make_tuple(kargs.seqlen_q), number{}); - const auto d_dram = [&]() { - const auto d_dram_naive = make_naive_tensor_view_packed( - d_ptr, make_tuple(kargs.seqlen_q), number<1>{}); - return pad_tensor_view( - d_dram_naive, make_tuple(number{}), sequence{}); - }(); + const auto d_dram = make_naive_tensor_view_packed( + d_ptr, make_tuple(kargs.seqlen_q), number{}); const auto do_dram_naive = make_naive_tensor_view( do_ptr, @@ -1270,7 +1259,7 @@ struct FmhaBwdDQDKDVKernel const auto do_dram = pad_tensor_view( do_dram_naive, make_tuple(number{}, number{}), - sequence{}); + sequence{}); auto q_dram_window = make_tile_window( q_dram, @@ -1313,7 +1302,7 @@ struct FmhaBwdDQDKDVKernel return pad_tensor_view( dq_acc_dram_naive, make_tuple(number{}, number{}), - sequence{}); + sequence{}); }(); return make_tile_window( @@ -1341,7 +1330,7 @@ struct FmhaBwdDQDKDVKernel return pad_tensor_view( dq_acc_dram_naive, make_tuple(number{}, number{}), - sequence{}); + sequence{}); }(); return make_tile_window( @@ -1376,9 +1365,8 @@ struct FmhaBwdDQDKDVKernel number{}, number<1>{}); - return pad_tensor_view(bias_dram_naive, - bias_dram_window_lengths, - sequence{}); + return pad_tensor_view( + bias_dram_naive, bias_dram_window_lengths, sequence{}); }(); return make_tile_window(bias_dram, bias_dram_window_lengths, {0, i_n0}); @@ -1406,9 +1394,8 @@ struct FmhaBwdDQDKDVKernel number{}, number<1>{}); - return pad_tensor_view(dbias_dram_naive, - bias_dram_window_lengths, - sequence{}); + return pad_tensor_view( + dbias_dram_naive, bias_dram_window_lengths, sequence{}); }(); return make_tile_window(dbias_dram, bias_dram_window_lengths, {0, i_n0}); @@ -1495,9 +1482,8 @@ struct FmhaBwdDQDKDVKernel number<1>{}, number<1>{}); - return pad_tensor_view(randval_dram_naive, - randval_dram_window_lengths, - sequence{}); + return pad_tensor_view( + randval_dram_naive, randval_dram_window_lengths, sequence{}); }(); return make_tile_window(randval_dram, randval_dram_window_lengths, {0, i_n0}); @@ -1550,7 +1536,7 @@ struct FmhaBwdDQDKDVKernel return pad_tensor_view( dk_dram_naive, make_tuple(number{}, number{}), - sequence{}); + sequence{}); }(); auto dv_dram = [&]() { @@ -1564,7 +1550,7 @@ struct FmhaBwdDQDKDVKernel return pad_tensor_view( dv_dram_naive, make_tuple(number{}, number{}), - sequence{}); + sequence{}); }(); auto dk_dram_window = make_tile_window( diff --git a/include/ck_tile/ops/fmha/pipeline/block_fmha_bwd_dq_dk_dv_pipeline_kr_ktr_vr.hpp b/include/ck_tile/ops/fmha/pipeline/block_fmha_bwd_dq_dk_dv_pipeline_kr_ktr_vr.hpp index 8a13c0b060..1f11569533 100644 --- a/include/ck_tile/ops/fmha/pipeline/block_fmha_bwd_dq_dk_dv_pipeline_kr_ktr_vr.hpp +++ b/include/ck_tile/ops/fmha/pipeline/block_fmha_bwd_dq_dk_dv_pipeline_kr_ktr_vr.hpp @@ -49,8 +49,6 @@ struct BlockFmhaBwdDQDKDVPipelineKRKTRVR static constexpr index_t kVHeaddim = BlockFmhaShape::kVHeaddim; static constexpr bool kIsGroupMode = Problem::kIsGroupMode; - static constexpr bool kPadSeqLenQ = Problem::kPadSeqLenQ; - static constexpr bool kPadSeqLenK = Problem::kPadSeqLenK; static constexpr bool kPadHeadDimQ = Problem::kPadHeadDimQ; static constexpr bool kPadHeadDimV = Problem::kPadHeadDimV; static constexpr auto BiasEnum = Problem::BiasEnum; @@ -72,8 +70,7 @@ struct BlockFmhaBwdDQDKDVPipelineKRKTRVR kPadHeadDimQ ? 1 : Policy::template GetAlignmentKGrad(); static constexpr index_t kAlignmentVGrad = kPadHeadDimV ? 1 : Policy::template GetAlignmentVGrad(); - static constexpr index_t kAlignmentBias = - kPadSeqLenK ? 1 : Policy::template GetTransposedAlignmentBias(); + static constexpr index_t kAlignmentBias = 1; static constexpr const char* name = "kr_ktr_vr"; @@ -554,7 +551,6 @@ struct BlockFmhaBwdDQDKDVPipelineKRKTRVR }); } - if constexpr(kPadSeqLenK || FmhaMask::IsMasking) { bool need_perpixel_check = mask.IsEdgeTile( seqlen_q_step, k_origin.at(number<0>{}), number{}, number{}); diff --git a/include/ck_tile/ops/fmha/pipeline/block_fmha_bwd_dq_dk_dv_pipeline_kr_ktr_vr_iglp.hpp b/include/ck_tile/ops/fmha/pipeline/block_fmha_bwd_dq_dk_dv_pipeline_kr_ktr_vr_iglp.hpp index c88b058d32..967fe2362d 100644 --- a/include/ck_tile/ops/fmha/pipeline/block_fmha_bwd_dq_dk_dv_pipeline_kr_ktr_vr_iglp.hpp +++ b/include/ck_tile/ops/fmha/pipeline/block_fmha_bwd_dq_dk_dv_pipeline_kr_ktr_vr_iglp.hpp @@ -49,8 +49,6 @@ struct BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP static constexpr index_t kVHeaddim = BlockFmhaShape::kVHeaddim; static constexpr bool kIsGroupMode = Problem::kIsGroupMode; - static constexpr bool kPadSeqLenQ = Problem::kPadSeqLenQ; - static constexpr bool kPadSeqLenK = Problem::kPadSeqLenK; static constexpr bool kPadHeadDimQ = Problem::kPadHeadDimQ; static constexpr bool kPadHeadDimV = Problem::kPadHeadDimV; static constexpr auto BiasEnum = Problem::BiasEnum; @@ -72,8 +70,7 @@ struct BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP kPadHeadDimQ ? 1 : Policy::template GetAlignmentKGrad(); static constexpr index_t kAlignmentVGrad = kPadHeadDimV ? 1 : Policy::template GetAlignmentVGrad(); - static constexpr index_t kAlignmentBias = - kPadSeqLenK ? 1 : Policy::template GetTransposedAlignmentBias(); + static constexpr index_t kAlignmentBias = 1; static constexpr const char* name = "kr_ktr_vr_iglp"; @@ -590,7 +587,6 @@ struct BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP }); } - if constexpr(kPadSeqLenK || FmhaMask::IsMasking) { bool need_perpixel_check = mask.IsEdgeTile( seqlen_q_step, k_origin.at(number<0>{}), number{}, number{}); @@ -849,7 +845,6 @@ struct BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP }); } - if constexpr(kPadSeqLenK || FmhaMask::IsMasking) { bool need_perpixel_check = mask.IsEdgeTile( seqlen_q_step, k_origin.at(number<0>{}), number{}, number{}); diff --git a/include/ck_tile/ops/fmha/pipeline/block_fmha_bwd_dq_dk_dv_pipeline_selector.hpp b/include/ck_tile/ops/fmha/pipeline/block_fmha_bwd_dq_dk_dv_pipeline_selector.hpp new file mode 100644 index 0000000000..80c311de86 --- /dev/null +++ b/include/ck_tile/ops/fmha/pipeline/block_fmha_bwd_dq_dk_dv_pipeline_selector.hpp @@ -0,0 +1,30 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +#include "ck_tile/core.hpp" +#include "ck_tile/ops/fmha/pipeline/block_fmha_bwd_dq_dk_dv_pipeline_kr_ktr_vr.hpp" +#include "ck_tile/ops/fmha/pipeline/block_fmha_bwd_dq_dk_dv_pipeline_kr_ktr_vr_iglp.hpp" + +namespace ck_tile { + +template +class BlockFmhaBwdDQDKDVPipelineSelector +{ + static constexpr bool has_dpad = Problem::Traits::kPadHeadDimQ || Problem::Traits::kPadHeadDimV; + + public: + using type = std::conditional_t, + BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP>; +}; + +template +class BlockFmhaBwdDQDKDVPipeline : public BlockFmhaBwdDQDKDVPipelineSelector::type +{ + public: + static constexpr const char* name = "auto"; +}; + +} // namespace ck_tile diff --git a/include/ck_tile/ops/fmha/pipeline/block_fmha_bwd_pipeline_enum.hpp b/include/ck_tile/ops/fmha/pipeline/block_fmha_bwd_pipeline_enum.hpp deleted file mode 100644 index 27f58ef2f8..0000000000 --- a/include/ck_tile/ops/fmha/pipeline/block_fmha_bwd_pipeline_enum.hpp +++ /dev/null @@ -1,15 +0,0 @@ -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -#pragma once - -namespace ck_tile { - -// This class is used for codegen pattern matching -enum class BlockFmhaBwdPipelineEnum -{ - KRKTRVR_IGLP = 0, - KRKTRVR, -}; - -} // namespace ck_tile diff --git a/include/ck_tile/ops/fmha/pipeline/block_fmha_bwd_pipeline_problem.hpp b/include/ck_tile/ops/fmha/pipeline/block_fmha_bwd_pipeline_problem.hpp index c4c4a745a7..f6c79c7db6 100644 --- a/include/ck_tile/ops/fmha/pipeline/block_fmha_bwd_pipeline_problem.hpp +++ b/include/ck_tile/ops/fmha/pipeline/block_fmha_bwd_pipeline_problem.hpp @@ -1,5 +1,5 @@ // SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved. #pragma once @@ -55,13 +55,13 @@ struct BlockFmhaBwdPipelineProblem static constexpr bool kIsDeterministic = kIsDeterministic_; // attributes from traits - static constexpr bool kPadSeqLenQ = Traits::kPadSeqLenQ; - static constexpr bool kPadSeqLenK = Traits::kPadSeqLenK; static constexpr bool kPadHeadDimQ = Traits::kPadHeadDimQ; static constexpr bool kPadHeadDimV = Traits::kPadHeadDimV; static constexpr auto BiasEnum = Traits::BiasEnum; static constexpr bool kHasBiasGrad = Traits::kHasBiasGrad; static constexpr index_t kBlockPerCu = Traits::kBlockPerCu; + static_assert(!Traits::kPadSeqLenQ, "BlockFmhaBwdPipelineProblem does not need kPadSeqLenQ"); + static_assert(!Traits::kPadSeqLenK, "BlockFmhaBwdPipelineProblem does not need kPadSeqLenQ"); }; template & block_idx_2d, - const index_t block_idx_z) const - { - Run(kargs.group_karg, block_idx_2d, block_idx_z); - } - CK_TILE_DEVICE void Run(const UniversalGemmKernelArgs<>& kargs, const tuple& block_idx_2d, const index_t block_idx_z) const @@ -277,24 +270,56 @@ struct GroupedGemmKernel CDataType* c_ptr = static_cast(kargs.e_ptr); // allocate LDS - __shared__ char smem_ptr[GetSmemSize()]; + __shared__ char smem_ptr_0[GetSmemSize()]; - if constexpr(UsePersistentKernel) + if constexpr(GemmPipeline::DoubleSmemBuffer == true) { - RunGemmWithPipelineSelection( - a_ptr, b_ptr, c_ptr, smem_ptr, kargs, splitk_batch_offset, i_m, i_n); + __shared__ char smem_ptr_1[GetSmemSize()]; + if constexpr(UsePersistentKernel) + { + RunGemmWithPipelineSelection2LDS(a_ptr, + b_ptr, + c_ptr, + smem_ptr_0, + smem_ptr_1, + kargs, + splitk_batch_offset, + i_m, + i_n); + } + else + { + Base::RunGemm2LDS({a_ptr}, + {b_ptr}, + {/*ds_ptr*/}, + c_ptr, + smem_ptr_0, + smem_ptr_1, + kargs, + splitk_batch_offset, + i_m, + i_n); + } } else { - Base::RunGemm({a_ptr}, - {b_ptr}, - {/*ds_ptr*/}, - c_ptr, - smem_ptr, - kargs, - splitk_batch_offset, - i_m, - i_n); + if constexpr(UsePersistentKernel) + { + RunGemmWithPipelineSelection( + a_ptr, b_ptr, c_ptr, smem_ptr_0, kargs, splitk_batch_offset, i_m, i_n); + } + else + { + Base::RunGemm({a_ptr}, + {b_ptr}, + {/*ds_ptr*/}, + c_ptr, + smem_ptr_0, + kargs, + splitk_batch_offset, + i_m, + i_n); + } } } @@ -358,6 +383,69 @@ struct GroupedGemmKernel c_block_window, c_block_tile, d_block_window, smem_ptr_0); } + /** + * @brief Runs single GEMM problem cooperatively by whole workgroup. + * + * @note The GEMM pipeline is selected in-kernel based on the number of K-loops + * and the tail-number. This is needed for the persistent tile-loop when + * we didn't have access to the K dimension on the host. + * + * @param a_ptr input A pointer + * @param b_ptr input B pointer + * @param c_ptr output C pointer + * @param smem_ptr_0 The start memory pointer of the shared memory block. + * @param smem_ptr_1 The second start memory pointer of the shared memory block. + * @param kargs GEMM kernel arguments + * @param splitk_batch_offset splitk_batch_offset Utility structure used to calculate k + * batch. + * @param block_idx_m The GEMM's output M dimension tile index processed by this workgroup. + * @param block_idx_n The GEMM's output N dimension tile index processed by this workgroup. + * + */ + CK_TILE_DEVICE static void + RunGemmWithPipelineSelection2LDS(const ADataType* a_ptr, + const BDataType* b_ptr, + CDataType* c_ptr, + void* __restrict__ smem_ptr_0, + void* __restrict__ smem_ptr_1, + const UniversalGemmKernelArgs<>& kargs, + const typename Base::SplitKBatchOffset& splitk_batch_offset, + const index_t block_idx_m, + const index_t block_idx_n) + { + // Create Gemm tensor views, pad views and tile windows + const auto& gemm_tensor_views_tuple = + Base::template MakeGemmTensorViews( + {a_ptr}, {b_ptr}, {/*ds_ptr*/}, c_ptr, kargs, splitk_batch_offset); + + const auto& gemm_pad_views = Base::MakeGemmPadViews(gemm_tensor_views_tuple); + auto gemm_tile_windows = + Base::MakeGemmTileWindows(gemm_pad_views, block_idx_m, block_idx_n); + const auto& a_block_window = gemm_tile_windows.at(Base::I0); + const auto& b_block_window = gemm_tile_windows.at(Base::I1); + const auto& d_block_window = gemm_tile_windows.at(Base::I2); + + // Get hot-loop and tail configuration + const index_t num_loop = __builtin_amdgcn_readfirstlane( + TilePartitioner::GetLoopNum(splitk_batch_offset.splitted_k)); + const bool has_hot_loop = GemmPipeline::BlockHasHotloop(num_loop); + const TailNumber tail_num = GemmPipeline::GetBlockLoopTailNum(num_loop); + + // Run GEMM pipeline + const auto& c_block_tile = GemmPipeline{}.template operator()(a_block_window[Base::I0], + b_block_window[Base::I0], + num_loop, + has_hot_loop, + tail_num, + smem_ptr_0, + smem_ptr_1); + // Run Epilogue Pipeline + auto& c_block_window = gemm_tile_windows.at(Base::I3); + EpiloguePipeline{}.template + operator()( + c_block_window, c_block_tile, d_block_window, smem_ptr_0); + } + CK_TILE_DEVICE index_t FindGroupId(const GemmTransKernelArg* gemm_desc_ptr, index_t block_id, index_t group_count) const @@ -401,7 +489,7 @@ struct GroupedGemmKernel kargs.group_karg.M, kargs.group_karg.N, (block_id - kargs.block_start) % grid_size_2d); - Run(kargs, block_idx_2d, (block_id - kargs.block_start) / grid_size_2d); + Run(kargs.group_karg, block_idx_2d, (block_id - kargs.block_start) / grid_size_2d); } // For persistent kernels diff --git a/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_comp_v4.hpp b/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_comp_v4.hpp index 50a2110935..ce800d03ab 100644 --- a/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_comp_v4.hpp +++ b/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_comp_v4.hpp @@ -18,12 +18,14 @@ struct BaseGemmPipelineAgBgCrCompV4 static constexpr index_t PrefillStages = 1; static constexpr index_t GlobalBufferNum = 1; - CK_TILE_HOST static constexpr bool BlockHasHotloop(index_t num_loop) + static constexpr bool UsePersistentKernel = Problem::Traits::UsePersistentKernel; + + CK_TILE_HOST_DEVICE static constexpr bool BlockHasHotloop(index_t num_loop) { return num_loop > PrefetchStages; } - CK_TILE_HOST static constexpr TailNumber GetBlockLoopTailNum(index_t num_loop) + CK_TILE_HOST_DEVICE static constexpr TailNumber GetBlockLoopTailNum(index_t num_loop) { if(num_loop % PrefetchStages == 1) { diff --git a/include/ck_tile/ops/reduce.hpp b/include/ck_tile/ops/reduce.hpp index 80ead84e85..a6721c9305 100644 --- a/include/ck_tile/ops/reduce.hpp +++ b/include/ck_tile/ops/reduce.hpp @@ -5,8 +5,11 @@ #include "ck_tile/ops/reduce/block/block_reduce.hpp" #include "ck_tile/ops/reduce/block/block_reduce2d.hpp" -#include "ck_tile/ops/reduce/block/block_reduce2d_default_policy.hpp" #include "ck_tile/ops/reduce/block/block_reduce2d_problem.hpp" +#include "ck_tile/ops/reduce/kernel/reduce2d_kernel.hpp" +#include "ck_tile/ops/reduce/pipeline/reduce2d_default_policy.hpp" +#include "ck_tile/ops/reduce/pipeline/reduce2d_problem.hpp" +#include "ck_tile/ops/reduce/pipeline/reduce2d_shape.hpp" #include "ck_tile/ops/common/generic_2d_block_shape.hpp" #include "ck_tile/ops/common/tensor_layout.hpp" #include "ck_tile/ops/common/utils.hpp" diff --git a/include/ck_tile/ops/reduce/block/block_reduce2d.hpp b/include/ck_tile/ops/reduce/block/block_reduce2d.hpp index 62c9944bd2..849fa6c252 100644 --- a/include/ck_tile/ops/reduce/block/block_reduce2d.hpp +++ b/include/ck_tile/ops/reduce/block/block_reduce2d.hpp @@ -7,20 +7,55 @@ namespace ck_tile { +// BlockReduce2d implements a hierarchical 2D reduction operator that reduces data along the second +// dimension using a user-specified reduction function. +// +// The reduction is performed in a three-stage hierarchical approach: +// +// STAGE 1: Thread-level reduction (BlockReduce2d) +// =============================================== +// - Each thread processes multiple elements from the input tensor within its assigned data +// partition +// - Reduction is performed locally within each thread by iterating over assigned elements +// - ReducePacksPerXDim controls how many elements sweep_tile processes in one iteration per +// dimension +// (e.g., {1,1} = 1 element at a time from each dimension, {2,4} = 2 from dim0, 4 from dim1) +// - Results are accumulated into a thread-local output tensor stored in registers +// - The output tensor distribution is derived from the input tensor's distribution using +// make_reduce_tile_distribution_encoding() to handle dimension reduction +// +// STAGE 2: Warp-level reduction (BlockReduce2dSync) +// ================================================ +// - Performs inter-thread reduction within each warp +// - Uses warp shuffle operations to exchange data between threads in the same warp +// - Implements a tree-reduction pattern with power-of-2 stages +// - Only reduces along dimensions that map to lane IDs within the warp +// +// STAGE 3: Cross-warp reduction (BlockReduce2dCrossWarpSync) +// ======================================================== +// - Performs reduction across multiple warps within the same thread block +// - Uses shared memory (LDS) to facilitate data exchange between warps +// - Each warp's lane-0 thread stores its partial results to shared memory +// - All threads participate in loading and reducing data from shared memory +// - Implements block-level synchronization to ensure memory consistency + +// BlockReduce2d: Thread-level reduction (Stage 1) template struct BlockReduce2d { - // in-thread reduction + // Thread-level reduction implementation using Problem = remove_cvref_t; using XDataType = typename Problem::XDataType; using ComputeDataType = typename Problem::ComputeDataType; CK_TILE_DEVICE constexpr BlockReduce2d() {} - template > + template < + typename XDistributedTensor_, + typename YDistributedTensor_, + typename ReduceFunc, + typename ReducePacksPerXDim = + uniform_sequence_gen_t<2, 1>> // {1,1} = process 1 element at a time from each dimension CK_TILE_DEVICE void operator()(const XDistributedTensor_& x_tensor, YDistributedTensor_& y_tensor, const ReduceFunc& reduce_func, @@ -33,6 +68,7 @@ struct BlockReduce2d y_tensor(idx_0), ck_tile::type_convert(x_tensor[idx_])...); }, ReducePacksPerXDim{}); + #if 0 constexpr auto I0 = number<0>{}; constexpr auto I1 = number<1>{}; @@ -75,6 +111,8 @@ struct BlockReduce2d return tensor; } + // uniform_sequence_gen_t generates sequence of NSize elements filled with Value + // e.g., uniform_sequence_gen_t<2, 1> → {1, 1} and uniform_sequence_gen_t<3, 4> → {4, 4, 4} template > @@ -91,6 +129,7 @@ struct BlockReduce2d } }; +// BlockReduce2dSync: Warp-level reduction (Stage 2) template struct BlockReduce2dSync { @@ -145,8 +184,15 @@ struct BlockReduce2dSync // pull data from remote lane const auto v_remote = warp_shuffle(v_local, src_lane); - // reduce - v_local = reduce_func(v_local, v_remote); + // For reduce, use combine_partial_results for operations that require it + if constexpr(ReduceFunc::requires_special_combine) + { + v_local = reduce_func.combine_partial_results(v_local, v_remote); + } + else + { + v_local = reduce_func(v_local, v_remote); + } }); } }); @@ -157,6 +203,7 @@ struct BlockReduce2dSync } }; +// BlockReduce2dCrossWarpSync: Cross-warp reduction (Stage 3) template struct BlockReduce2dCrossWarpSync { @@ -263,8 +310,15 @@ struct BlockReduce2dCrossWarpSync constexpr auto i_1 = number{}; const DataType v_remote = all_scratch[i_0 * num_reduce_warps + i_1]; - // reduce - v_local = reduce_func(v_local, v_remote); + // For reduce, use combine_partial_results for operations that require it + if constexpr(ReduceFunc::requires_special_combine) + { + v_local = reduce_func.combine_partial_results(v_local, v_remote); + } + else + { + v_local = reduce_func(v_local, v_remote); + } }); y_tensor.get_thread_buffer()(i_0) = v_local; diff --git a/include/ck_tile/ops/reduce/kernel/reduce2d_kernel.hpp b/include/ck_tile/ops/reduce/kernel/reduce2d_kernel.hpp new file mode 100644 index 0000000000..f65487ea6e --- /dev/null +++ b/include/ck_tile/ops/reduce/kernel/reduce2d_kernel.hpp @@ -0,0 +1,219 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +#include "ck_tile/core.hpp" +#include "ck_tile/ops/common.hpp" +#include "ck_tile/ops/reduce/block/block_reduce.hpp" +#include "ck_tile/ops/reduce/pipeline/reduce2d_default_policy.hpp" + +// Reduce2d Kernel: +// ======================================= +// This kernel implements a 2D reduction operation that reduces data along the second dimension +// of a matrix. The reduction is performed in multiple hierarchical stages. + +namespace ck_tile { + +template +struct Reduce +{ + using Problem = ck_tile::remove_cvref_t; + using Policy = ck_tile::remove_cvref_t; + + using XDataType = ck_tile::remove_cvref_t; + using ComputeDataType = ck_tile::remove_cvref_t; + using YDataType = ck_tile::remove_cvref_t; + + private: + // Helper function to calculate optimal vector size for input tensor + template + static constexpr index_t CalculateInputVectorSize() + { + using S = typename Problem::BlockShape; + constexpr index_t memory_vector_size = 16 / sizeof(XDataType); + constexpr index_t thread_tile_vector_size = S::ThreadTile_N; + + // Check if innermost reduce dimension is the last dimension (stride 1). + constexpr auto innermost_reduce_dim = ReduceDims{}.at(number{}); + constexpr bool is_innermost_contiguous = (innermost_reduce_dim == InputShape{}.size() - 1); + + // If innermost reduce dimension is not the last dim (not contiguous), limit vectorization + constexpr index_t stride_based_vector_size = + is_innermost_contiguous ? ck_tile::min(memory_vector_size, thread_tile_vector_size) : 1; + + return stride_based_vector_size; + } + + // Helper function to calculate optimal vector size for output tensor + static constexpr index_t CalculateOutputVectorSize() + { + using S = typename Problem::BlockShape; + constexpr index_t memory_vector_size = 16 / sizeof(YDataType); + constexpr index_t thread_tile_vector_size = S::ThreadTile_M; + constexpr index_t vector_size = ck_tile::min(memory_vector_size, thread_tile_vector_size); + + return vector_size; + } + + public: + template + CK_TILE_DEVICE void operator()(const XDataType* p_x, + YDataType* p_y, + InputShape input_shape, + InputStrides input_strides, + KeptDim kept_dim, + ReduceDims reduce_dims) const + { + using S = typename Problem::BlockShape; + const auto iM = get_block_id() * S::Block_M; + + static_assert(kept_dim.size() + reduce_dims.size() == InputShape::size(), + "Size of kept dimensions + reduced dimensions must equal input tensor rank"); + + // Extract lengths based on kept and reduced dimensions + const auto kept_lens = [&]() { + return generate_tuple([&](auto I) { return input_shape.at(number{}); }, + number{}); + }(); + const auto reduce_lens = [&]() { + return generate_tuple( + [&](auto I) { return input_shape.at(number{}); }, + number{}); + }(); + + const auto kept_merge_transform = make_merge_transform(kept_lens); + const auto reduce_merge_transform = make_merge_transform(reduce_lens); + + auto reduce_func = typename Problem::ReduceOp{}; + const XDataType custom_padding_value = + type_convert(reduce_func.template GetIdentityValue()); + + // Calculate optimal vector size for input tensor + constexpr auto x_tensor_vector_size = CalculateInputVectorSize(); + + // Create input tensor view with custom padding value + auto desc = make_naive_tensor_descriptor( + input_shape, input_strides, number{}, number<1>{}); + + // Create buffer view with custom padding value + auto buffer_view = make_buffer_view( + p_x, desc.get_element_space_size(), custom_padding_value); + + // Create tensor view with custom padding + const auto x_tensor = tensor_view{buffer_view, desc}; + const auto transformed_x_tensor = pad_tensor_view( + transform_tensor_view(x_tensor, + make_tuple(kept_merge_transform, reduce_merge_transform), + make_tuple(kept_dim, reduce_dims), + make_tuple(sequence<0>{}, sequence<1>{})), + make_tuple(number{}, number{}), + sequence<0, 1>{}); + + // Calculate strides for output tensor based on its own dimensions + const auto kept_strides = [&]() { + return generate_tuple( + [&](auto I) { + // Calculate stride for dimension I as product of all following dimensions + index_t stride = 1; + static_for{}( + [&](auto J) { stride *= kept_lens.at(number{}); }); + return stride; + }, + number{}); + }(); + + // Calculate optimal vector size for output tensor + constexpr auto y_tensor_vector_size = CalculateOutputVectorSize(); + + const auto y_m = make_naive_tensor_view( + p_y, kept_lens, kept_strides, number{}, number<1>{}); + + // Transform output tensor to 1D merged view + // This creates a view compatible with the 2D reduction pattern + const auto y_merged = transform_tensor_view( + y_m, + make_tuple(kept_merge_transform), + make_tuple(typename arithmetic_sequence_gen<0, kept_dim.size(), 1>::type{}), + make_tuple(sequence<0>{})); + + auto x_window = make_tile_window(transformed_x_tensor, + make_tuple(number{}, number{}), + {iM, 0}, + Policy::template MakeXBlockTileDistribution()); + + auto y_window = make_tile_window(y_merged, make_tuple(number{}), {iM}); + + __shared__ char smem[Policy::template GetSmemSize()]; + + // Get the merged dimension size from the transformed tensor + const auto merged_reduce_len = + transformed_x_tensor.get_tensor_descriptor().get_lengths().at(number<1>{}); + index_t num_n_tile_iteration = + __builtin_amdgcn_readfirstlane(integer_divide_ceil(merged_reduce_len, S::Block_N)); + + auto block_reduce2d = Policy::template GetBlockReduce2d(); + auto block_reduce2d_sync = Policy::template GetBlockReduce2dSync(); + auto block_reduce2d_cross_warp_sync = + Policy::template GetBlockReduce2dCrossWarpSync(); + + using XTensorType = decltype(load_tile(x_window)); + auto y_compute = block_reduce2d.template MakeYBlockTile(); + set_tile(y_compute, reduce_func.template GetIdentityValue()); + + for(int iN = __builtin_amdgcn_readfirstlane(0); iN < num_n_tile_iteration; ++iN) + { + const auto x = load_tile(x_window); + block_reduce2d(x, y_compute, reduce_func); + move_tile_window(x_window, {0, S::Block_N}); + } + + block_reduce2d_sync(y_compute, reduce_func); + block_reduce2d_cross_warp_sync(y_compute, smem, reduce_func); + + store_tile(y_window, cast_tile(y_compute)); + } + + /// @brief Validates if the given arguments are supported by the 2D reduction kernel. + /// + /// @param y_continous_dim Size of the continuous dimension of the output tensor. + /// Must be a multiple of ThreadTile_N for proper thread mapping. + /// + /// @param input_strides The stride configuration of the input tensor. + /// The last stride must be 1 to ensure contiguous memory access + /// and enable efficient vectorized loads. + /// + /// @return true if the arguments are supported, false otherwise. + /// Error messages are logged when CK_TILE_LOGGING is enabled. + /// + /// @note Requirements: + /// - y_continous_dim % ThreadTile_N == 0 (for proper thread distribution) + /// - input_strides[-1] == 1 (for contiguous memory access) + CK_TILE_HOST static bool IsSupportedArgument(index_t y_continous_dim, auto input_strides) + { + using S = typename Problem::BlockShape; + + if(y_continous_dim % S::ThreadTile_N != 0) + { + if(ck_tile::EnvIsEnabled(CK_TILE_ENV(CK_TILE_LOGGING))) + { + CK_TILE_ERROR("Total reduction size should be a multiple of ThreadTile_N!"); + } + return false; + } + + if(input_strides.at(number{}) != 1) + { + if(ck_tile::EnvIsEnabled(CK_TILE_ENV(CK_TILE_LOGGING))) + { + CK_TILE_ERROR( + "Input tensor's last stride must be 1 to support correct vector access!"); + } + return false; + } + + return true; + } +}; + +} // namespace ck_tile diff --git a/include/ck_tile/ops/reduce/block/block_reduce2d_default_policy.hpp b/include/ck_tile/ops/reduce/pipeline/reduce2d_default_policy.hpp similarity index 89% rename from include/ck_tile/ops/reduce/block/block_reduce2d_default_policy.hpp rename to include/ck_tile/ops/reduce/pipeline/reduce2d_default_policy.hpp index 3c547242d5..27bb4bcdcb 100644 --- a/include/ck_tile/ops/reduce/block/block_reduce2d_default_policy.hpp +++ b/include/ck_tile/ops/reduce/pipeline/reduce2d_default_policy.hpp @@ -1,5 +1,5 @@ // SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved. #pragma once @@ -9,7 +9,7 @@ namespace ck_tile { -struct BlockReduce2dDefaultPolicy +struct Reduce2dDefaultPolicy { template CK_TILE_DEVICE static constexpr auto MakeXBlockTileDistribution() @@ -18,8 +18,9 @@ struct BlockReduce2dDefaultPolicy return make_static_tile_distribution( tile_distribution_encoding< sequence<>, - tuple, - sequence>, + tuple< + sequence, + sequence>, tuple, sequence<1, 2>>, tuple, sequence<2, 2>>, sequence<1, 1, 2, 2>, diff --git a/include/ck_tile/ops/reduce/pipeline/reduce2d_problem.hpp b/include/ck_tile/ops/reduce/pipeline/reduce2d_problem.hpp new file mode 100644 index 0000000000..67fdec9286 --- /dev/null +++ b/include/ck_tile/ops/reduce/pipeline/reduce2d_problem.hpp @@ -0,0 +1,27 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +#include "ck_tile/core.hpp" + +namespace ck_tile { + +template +struct Reduce2dProblem +{ + using XDataType = remove_cvref_t; + using ComputeDataType = remove_cvref_t; + using YDataType = remove_cvref_t; + using BlockShape = remove_cvref_t; + using ReduceOp = ReduceOp_; + + static constexpr bool kNeedCrossLaneSync = BlockShape::ThreadPerWarp_N > 1; + static constexpr bool kNeedCrossWarpSync = BlockShape::WarpPerBlock_N > 1; +}; + +} // namespace ck_tile diff --git a/include/ck_tile/ops/reduce/pipeline/reduce2d_shape.hpp b/include/ck_tile/ops/reduce/pipeline/reduce2d_shape.hpp new file mode 100644 index 0000000000..31eb1f2f4f --- /dev/null +++ b/include/ck_tile/ops/reduce/pipeline/reduce2d_shape.hpp @@ -0,0 +1,37 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +#include "ck_tile/core.hpp" + +namespace ck_tile { + +template + typename BlockTile, // block size, seq + typename WarpTile, // warp size, seq + typename ThreadTile> // contiguous pixels(vector size) along seq +struct Reduce2dShape +{ + static constexpr index_t Block_M = BlockTile::at(number<0>{}); + static constexpr index_t Block_N = BlockTile::at(number<1>{}); + + static constexpr index_t Warp_M = WarpTile::at(number<0>{}); + static constexpr index_t Warp_N = WarpTile::at(number<1>{}); + + static constexpr index_t ThreadTile_M = ThreadTile::at(number<0>{}); + static constexpr index_t ThreadTile_N = ThreadTile::at(number<1>{}); + + static constexpr index_t WarpPerBlock_M = BlockWarps::at(number<0>{}); + static constexpr index_t WarpPerBlock_N = BlockWarps::at(number<1>{}); + + static constexpr index_t ThreadPerWarp_M = Warp_M / ThreadTile_M; + static constexpr index_t ThreadPerWarp_N = Warp_N / ThreadTile_N; + + static constexpr index_t Repeat_M = Block_M / (WarpPerBlock_M * Warp_M); + static constexpr index_t Repeat_N = Block_N / (WarpPerBlock_N * Warp_N); + + static constexpr index_t BlockSize = + ck_tile::get_warp_size() * reduce_on_sequence(BlockWarps{}, multiplies{}, number<1>{}); +}; +} // namespace ck_tile diff --git a/library/include/ck/library/tensor_operation_instance/device_operation_instance_factory.hpp b/library/include/ck/library/tensor_operation_instance/device_operation_instance_factory.hpp index f6983810be..bf7f1b4fa4 100644 --- a/library/include/ck/library/tensor_operation_instance/device_operation_instance_factory.hpp +++ b/library/include/ck/library/tensor_operation_instance/device_operation_instance_factory.hpp @@ -113,29 +113,30 @@ using GK_Tuple = ck::Tuple; using GK_GK_Tuple = ck::Tuple; // pointwise functor -using PassThrough = ck::tensor_operation::element_wise::PassThrough; -using Relu = ck::tensor_operation::element_wise::Relu; -using TanH = ck::tensor_operation::element_wise::TanH; -using Scale = ck::tensor_operation::element_wise::Scale; -using Bilinear = ck::tensor_operation::element_wise::Bilinear; -using AddAddFastGelu = ck::tensor_operation::element_wise::AddAddFastGelu; -using AddFastGelu = ck::tensor_operation::element_wise::AddFastGelu; -using MultiplyAddFastGelu = ck::tensor_operation::element_wise::MultiplyAddFastGelu; -using AddRelu = ck::tensor_operation::element_wise::AddRelu; -using AddClamp = ck::tensor_operation::element_wise::AddClamp; -using Clamp = ck::tensor_operation::element_wise::Clamp; -using AddSilu = ck::tensor_operation::element_wise::AddSilu; -using AddReluAdd = ck::tensor_operation::element_wise::AddReluAdd; -using FastGelu = ck::tensor_operation::element_wise::FastGelu; -using MultiplyFastGelu = ck::tensor_operation::element_wise::MultiplyFastGelu; -using AddMultiply = ck::tensor_operation::element_wise::AddMultiply; -using MultiplyAdd = ck::tensor_operation::element_wise::MultiplyAdd; -using MultiplyMultiply = ck::tensor_operation::element_wise::MultiplyMultiply; -using ScaleAdd = ck::tensor_operation::element_wise::ScaleAdd; -using Gelu = ck::tensor_operation::element_wise::Gelu; -using Swish = ck::tensor_operation::element_wise::Swish; -using Add = ck::tensor_operation::element_wise::Add; -using Multiply = ck::tensor_operation::element_wise::Multiply; +using PassThrough = ck::tensor_operation::element_wise::PassThrough; +using Relu = ck::tensor_operation::element_wise::Relu; +using TanH = ck::tensor_operation::element_wise::TanH; +using Scale = ck::tensor_operation::element_wise::Scale; +using Bilinear = ck::tensor_operation::element_wise::Bilinear; +using AddAddFastGelu = ck::tensor_operation::element_wise::AddAddFastGelu; +using AddFastGelu = ck::tensor_operation::element_wise::AddFastGelu; +using MultiplyAddFastGelu = ck::tensor_operation::element_wise::MultiplyAddFastGelu; +using AddRelu = ck::tensor_operation::element_wise::AddRelu; +using BiasNormalizeInInferClamp = ck::tensor_operation::element_wise::BiasNormalizeInInferClamp; +using AddClamp = ck::tensor_operation::element_wise::AddClamp; +using Clamp = ck::tensor_operation::element_wise::Clamp; +using AddSilu = ck::tensor_operation::element_wise::AddSilu; +using AddReluAdd = ck::tensor_operation::element_wise::AddReluAdd; +using FastGelu = ck::tensor_operation::element_wise::FastGelu; +using MultiplyFastGelu = ck::tensor_operation::element_wise::MultiplyFastGelu; +using AddMultiply = ck::tensor_operation::element_wise::AddMultiply; +using MultiplyAdd = ck::tensor_operation::element_wise::MultiplyAdd; +using MultiplyMultiply = ck::tensor_operation::element_wise::MultiplyMultiply; +using ScaleAdd = ck::tensor_operation::element_wise::ScaleAdd; +using Gelu = ck::tensor_operation::element_wise::Gelu; +using Swish = ck::tensor_operation::element_wise::Swish; +using Add = ck::tensor_operation::element_wise::Add; +using Multiply = ck::tensor_operation::element_wise::Multiply; template using Activation_Mul_Clamp = ck::tensor_operation::element_wise::Activation_Mul_Clamp; diff --git a/library/include/ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_comp_instance.hpp b/library/include/ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_comp_instance.hpp index fca236d03e..bbc2a54c34 100644 --- a/library/include/ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_comp_instance.hpp +++ b/library/include/ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_comp_instance.hpp @@ -32,9 +32,10 @@ using Empty_Tuple = ck::Tuple<>; using namespace ck::tensor_layout::convolution; -using PassThrough = ck::tensor_operation::element_wise::PassThrough; -using AddClamp = ck::tensor_operation::element_wise::AddClamp; -using Clamp = ck::tensor_operation::element_wise::Clamp; +using PassThrough = ck::tensor_operation::element_wise::PassThrough; +using BiasNormalizeInInferClamp = ck::tensor_operation::element_wise::BiasNormalizeInInferClamp; +using AddClamp = ck::tensor_operation::element_wise::AddClamp; +using Clamp = ck::tensor_operation::element_wise::Clamp; static constexpr auto ConvFwdDefault = ck::tensor_operation::device::ConvolutionForwardSpecialization::Default; diff --git a/library/include/ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_instance.hpp b/library/include/ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_instance.hpp index c641019b70..768fcbada0 100644 --- a/library/include/ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_instance.hpp +++ b/library/include/ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_instance.hpp @@ -32,9 +32,10 @@ using Empty_Tuple = ck::Tuple<>; using namespace ck::tensor_layout::convolution; -using PassThrough = ck::tensor_operation::element_wise::PassThrough; -using AddClamp = ck::tensor_operation::element_wise::AddClamp; -using Clamp = ck::tensor_operation::element_wise::Clamp; +using PassThrough = ck::tensor_operation::element_wise::PassThrough; +using BiasNormalizeInInferClamp = ck::tensor_operation::element_wise::BiasNormalizeInInferClamp; +using AddClamp = ck::tensor_operation::element_wise::AddClamp; +using Clamp = ck::tensor_operation::element_wise::Clamp; static constexpr auto ConvFwdDefault = ck::tensor_operation::device::ConvolutionForwardSpecialization::Default; diff --git a/library/include/ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_large_tensor_instance.hpp b/library/include/ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_large_tensor_instance.hpp index 3e98852d58..5a4a011512 100644 --- a/library/include/ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_large_tensor_instance.hpp +++ b/library/include/ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_large_tensor_instance.hpp @@ -24,9 +24,10 @@ using Empty_Tuple = ck::Tuple<>; using namespace ck::tensor_layout::convolution; -using PassThrough = ck::tensor_operation::element_wise::PassThrough; -using AddClamp = ck::tensor_operation::element_wise::AddClamp; -using Clamp = ck::tensor_operation::element_wise::Clamp; +using PassThrough = ck::tensor_operation::element_wise::PassThrough; +using BiasNormalizeInInferClamp = ck::tensor_operation::element_wise::BiasNormalizeInInferClamp; +using AddClamp = ck::tensor_operation::element_wise::AddClamp; +using Clamp = ck::tensor_operation::element_wise::Clamp; static constexpr auto ConvFwdDefault = ck::tensor_operation::device::ConvolutionForwardSpecialization::Default; diff --git a/library/include/ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_mem_instance.hpp b/library/include/ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_mem_instance.hpp index 4e6b9c3d1d..57bdeddcf9 100644 --- a/library/include/ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_mem_instance.hpp +++ b/library/include/ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_mem_instance.hpp @@ -32,9 +32,10 @@ using Empty_Tuple = ck::Tuple<>; using namespace ck::tensor_layout::convolution; -using PassThrough = ck::tensor_operation::element_wise::PassThrough; -using AddClamp = ck::tensor_operation::element_wise::AddClamp; -using Clamp = ck::tensor_operation::element_wise::Clamp; +using PassThrough = ck::tensor_operation::element_wise::PassThrough; +using BiasNormalizeInInferClamp = ck::tensor_operation::element_wise::BiasNormalizeInInferClamp; +using AddClamp = ck::tensor_operation::element_wise::AddClamp; +using Clamp = ck::tensor_operation::element_wise::Clamp; static constexpr auto ConvFwdDefault = ck::tensor_operation::device::ConvolutionForwardSpecialization::Default; diff --git a/library/include/ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_merged_groups_instance.hpp b/library/include/ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_merged_groups_instance.hpp index 7ef78d46e2..d07d82e7ee 100644 --- a/library/include/ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_merged_groups_instance.hpp +++ b/library/include/ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_merged_groups_instance.hpp @@ -24,9 +24,10 @@ using Empty_Tuple = ck::Tuple<>; using namespace ck::tensor_layout::convolution; -using PassThrough = ck::tensor_operation::element_wise::PassThrough; -using AddClamp = ck::tensor_operation::element_wise::AddClamp; -using Clamp = ck::tensor_operation::element_wise::Clamp; +using PassThrough = ck::tensor_operation::element_wise::PassThrough; +using BiasNormalizeInInferClamp = ck::tensor_operation::element_wise::BiasNormalizeInInferClamp; +using AddClamp = ck::tensor_operation::element_wise::AddClamp; +using Clamp = ck::tensor_operation::element_wise::Clamp; static constexpr auto ConvFwdDefault = ck::tensor_operation::device::ConvolutionForwardSpecialization::Default; diff --git a/library/include/ck/library/tensor_operation_instance/gpu/grouped_convolution_forward_bias_bnorm_clamp.hpp b/library/include/ck/library/tensor_operation_instance/gpu/grouped_convolution_forward_bias_bnorm_clamp.hpp new file mode 100644 index 0000000000..22cb7854a9 --- /dev/null +++ b/library/include/ck/library/tensor_operation_instance/gpu/grouped_convolution_forward_bias_bnorm_clamp.hpp @@ -0,0 +1,237 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +#include +#include +#include "ck/ck.hpp" +#include "ck/tensor_operation/gpu/device/device_grouped_conv_fwd_multiple_abd.hpp" +#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +#include "ck/library/tensor_operation_instance/device_operation_instance_factory.hpp" + +#ifdef CK_USE_XDL +#include "grouped_convolution_forward_bias_bnorm_clamp_xdl.inc" +#endif + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +template +struct DeviceOperationInstanceFactory> +{ + using DeviceOp = DeviceGroupedConvFwdMultipleABD< + NumDimSpatial, + InLayout, + WeiLayout, + DLayouts, + OutLayout, + InDataType, + WeiDataType, + DDataTypes, + OutDataType, + ck::tensor_operation::element_wise::PassThrough, + ck::tensor_operation::element_wise::PassThrough, + ck::tensor_operation::element_wise::BiasNormalizeInInferClamp, + AComputeType, + BComputeType>; + + static auto GetInstances() + { + std::vector> op_ptrs; + +#ifdef CK_USE_XDL + // layout NHWGC/GKYXC/NHWGK + if constexpr(NumDimSpatial == 2 && is_same_v && + is_same_v && is_same_v) + { +#ifdef CK_ENABLE_BF16 + if constexpr(is_same_v && + is_same_v && + is_same_v && + is_same_v && + is_same_v) + { + add_device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_bf16_instances( + op_ptrs); + add_device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_bf16_16x16_instances( + op_ptrs); + add_device_grouped_conv2d_fwd_bias_bn_clamp_xdl_large_tensor_nhwgc_gkyxc_nhwgk_bf16_instances( + op_ptrs); + add_device_grouped_conv2d_fwd_bias_bn_clamp_xdl_merged_groups_nhwgc_gkyxc_nhwgk_bf16_instances( + op_ptrs); + add_device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_bf16_comp_instances( + op_ptrs); + add_device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_bf16_comp_2x_instances( + op_ptrs); + add_device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_bf16_comp_part2_instances( + op_ptrs); + add_device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_bf16_mem_intra_instances( + op_ptrs); + add_device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_bf16_mem_inter_instances( + op_ptrs); + } +#endif +#ifdef CK_ENABLE_FP16 + if constexpr(is_same_v && is_same_v && + is_same_v && is_same_v && + is_same_v) + { + add_device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_f16_instances( + op_ptrs); + add_device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_f16_16x16_instances( + op_ptrs); + add_device_grouped_conv2d_fwd_bias_bn_clamp_xdl_large_tensor_nhwgc_gkyxc_nhwgk_f16_instances( + op_ptrs); + add_device_grouped_conv2d_fwd_bias_bn_clamp_xdl_merged_groups_nhwgc_gkyxc_nhwgk_f16_instances( + op_ptrs); + add_device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_f16_comp_instances( + op_ptrs); + add_device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_f16_comp_2x_instances( + op_ptrs); + add_device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_f16_comp_part2_instances( + op_ptrs); + add_device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_f16_mem_intra_instances( + op_ptrs); + add_device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_f16_mem_inter_instances( + op_ptrs); + } +#endif +#ifdef CK_ENABLE_FP32 + if constexpr(is_same_v && is_same_v && + is_same_v && is_same_v && + is_same_v) + { + add_device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_f32_instances( + op_ptrs); + add_device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_f32_16x16_instances( + op_ptrs); + add_device_grouped_conv2d_fwd_bias_bn_clamp_xdl_large_tensor_nhwgc_gkyxc_nhwgk_f32_instances( + op_ptrs); + add_device_grouped_conv2d_fwd_bias_bn_clamp_xdl_merged_groups_nhwgc_gkyxc_nhwgk_f32_instances( + op_ptrs); + add_device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_f32_comp_instances( + op_ptrs); + add_device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_f32_mem_intra_instances( + op_ptrs); + add_device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_f32_mem_inter_instances( + op_ptrs); + } +#endif + } + // layout NDHWGC/GKZYXC/NDHWGK + if constexpr(NumDimSpatial == 3 && is_same_v && + is_same_v && is_same_v) + { +#ifdef CK_ENABLE_BF16 + if constexpr(is_same_v && + is_same_v && + is_same_v && + is_same_v && + is_same_v) + { + add_device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_instances( + op_ptrs); + add_device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_16x16_instances( + op_ptrs); + add_device_grouped_conv3d_fwd_bias_bn_clamp_xdl_large_tensor_ndhwgc_gkzyxc_ndhwgk_bf16_instances( + op_ptrs); + add_device_grouped_conv3d_fwd_bias_bn_clamp_xdl_merged_groups_ndhwgc_gkzyxc_ndhwgk_bf16_instances( + op_ptrs); + add_device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_comp_instances( + op_ptrs); + add_device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_comp_2x_instances( + op_ptrs); + add_device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_comp_part2_instances( + op_ptrs); + add_device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_mem_intra_instances( + op_ptrs); + add_device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_mem_inter_instances( + op_ptrs); + } +#endif +#ifdef CK_ENABLE_FP16 + if constexpr(is_same_v && is_same_v && + is_same_v && is_same_v && + is_same_v) + { + add_device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_f16_instances( + op_ptrs); + add_device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_f16_16x16_instances( + op_ptrs); + add_device_grouped_conv3d_fwd_bias_bn_clamp_xdl_large_tensor_ndhwgc_gkzyxc_ndhwgk_f16_instances( + op_ptrs); + add_device_grouped_conv3d_fwd_bias_bn_clamp_xdl_merged_groups_ndhwgc_gkzyxc_ndhwgk_f16_instances( + op_ptrs); + add_device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_f16_comp_instances( + op_ptrs); + add_device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_f16_comp_2x_instances( + op_ptrs); + add_device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_f16_comp_part2_instances( + op_ptrs); + add_device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_f16_mem_intra_instances( + op_ptrs); + add_device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_f16_mem_inter_instances( + op_ptrs); + } +#endif +#ifdef CK_ENABLE_FP32 + if constexpr(is_same_v && is_same_v && + is_same_v && is_same_v && + is_same_v) + { + add_device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_f32_instances( + op_ptrs); + add_device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_f32_16x16_instances( + op_ptrs); + add_device_grouped_conv3d_fwd_bias_bn_clamp_xdl_large_tensor_ndhwgc_gkzyxc_ndhwgk_f32_instances( + op_ptrs); + add_device_grouped_conv3d_fwd_bias_bn_clamp_xdl_merged_groups_ndhwgc_gkzyxc_ndhwgk_f32_instances( + op_ptrs); + add_device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_f32_comp_instances( + op_ptrs); + add_device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_f32_mem_intra_instances( + op_ptrs); + add_device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_f32_mem_inter_instances( + op_ptrs); + } +#endif + } +#endif // CK_USE_XDL + + return op_ptrs; + } +}; + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/include/ck/library/tensor_operation_instance/gpu/grouped_convolution_forward_bias_bnorm_clamp_xdl.inc b/library/include/ck/library/tensor_operation_instance/gpu/grouped_convolution_forward_bias_bnorm_clamp_xdl.inc new file mode 100644 index 0000000000..b11b428471 --- /dev/null +++ b/library/include/ck/library/tensor_operation_instance/gpu/grouped_convolution_forward_bias_bnorm_clamp_xdl.inc @@ -0,0 +1,776 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +#ifdef CK_ENABLE_BF16 + +void add_device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_bf16_instances( + std::vector< + std::unique_ptr, + NHWGK, + BF16, + BF16, + Tuple, + BF16, + PassThrough, + PassThrough, + BiasNormalizeInInferClamp>>>& instances); + +void add_device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_bf16_16x16_instances( + std::vector< + std::unique_ptr, + NHWGK, + BF16, + BF16, + Tuple, + BF16, + PassThrough, + PassThrough, + BiasNormalizeInInferClamp>>>& instances); + +void add_device_grouped_conv2d_fwd_bias_bn_clamp_xdl_large_tensor_nhwgc_gkyxc_nhwgk_bf16_instances( + std::vector< + std::unique_ptr, + NHWGK, + BF16, + BF16, + Tuple, + BF16, + PassThrough, + PassThrough, + BiasNormalizeInInferClamp>>>& instances); + +void add_device_grouped_conv2d_fwd_bias_bn_clamp_xdl_merged_groups_nhwgc_gkyxc_nhwgk_bf16_instances( + std::vector< + std::unique_ptr, + NHWGK, + BF16, + BF16, + Tuple, + BF16, + PassThrough, + PassThrough, + BiasNormalizeInInferClamp>>>& instances); + +void add_device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_bf16_comp_instances( + std::vector< + std::unique_ptr, + NHWGK, + BF16, + BF16, + Tuple, + BF16, + PassThrough, + PassThrough, + BiasNormalizeInInferClamp>>>& instances); + +void add_device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_bf16_comp_2x_instances( + std::vector< + std::unique_ptr, + NHWGK, + BF16, + BF16, + Tuple, + BF16, + PassThrough, + PassThrough, + BiasNormalizeInInferClamp>>>& instances); + +void add_device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_bf16_comp_part2_instances( + std::vector< + std::unique_ptr, + NHWGK, + BF16, + BF16, + Tuple, + BF16, + PassThrough, + PassThrough, + BiasNormalizeInInferClamp>>>& instances); + +void add_device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_bf16_mem_intra_instances( + std::vector< + std::unique_ptr, + NHWGK, + BF16, + BF16, + Tuple, + BF16, + PassThrough, + PassThrough, + BiasNormalizeInInferClamp>>>& instances); + +void add_device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_bf16_mem_inter_instances( + std::vector< + std::unique_ptr, + NHWGK, + BF16, + BF16, + Tuple, + BF16, + PassThrough, + PassThrough, + BiasNormalizeInInferClamp>>>& instances); + +void add_device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_instances( + std::vector, + NDHWGK, + BF16, + BF16, + Tuple, + BF16, + PassThrough, + PassThrough, + BiasNormalizeInInferClamp>>>& instances); + +void add_device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_16x16_instances( + std::vector, + NDHWGK, + BF16, + BF16, + Tuple, + BF16, + PassThrough, + PassThrough, + BiasNormalizeInInferClamp>>>& instances); + +void add_device_grouped_conv3d_fwd_bias_bn_clamp_xdl_large_tensor_ndhwgc_gkzyxc_ndhwgk_bf16_instances( + std::vector, + NDHWGK, + BF16, + BF16, + Tuple, + BF16, + PassThrough, + PassThrough, + BiasNormalizeInInferClamp>>>& instances); + +void add_device_grouped_conv3d_fwd_bias_bn_clamp_xdl_merged_groups_ndhwgc_gkzyxc_ndhwgk_bf16_instances( + std::vector, + NDHWGK, + BF16, + BF16, + Tuple, + BF16, + PassThrough, + PassThrough, + BiasNormalizeInInferClamp>>>& instances); + +void add_device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_comp_instances( + std::vector, + NDHWGK, + BF16, + BF16, + Tuple, + BF16, + PassThrough, + PassThrough, + BiasNormalizeInInferClamp>>>& instances); + +void add_device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_comp_part2_instances( + std::vector, + NDHWGK, + BF16, + BF16, + Tuple, + BF16, + PassThrough, + PassThrough, + BiasNormalizeInInferClamp>>>& instances); + +void add_device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_comp_2x_instances( + std::vector, + NDHWGK, + BF16, + BF16, + Tuple, + BF16, + PassThrough, + PassThrough, + BiasNormalizeInInferClamp>>>& instances); + +void add_device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_mem_intra_instances( + std::vector, + NDHWGK, + BF16, + BF16, + Tuple, + BF16, + PassThrough, + PassThrough, + BiasNormalizeInInferClamp>>>& instances); + +void add_device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_mem_inter_instances( + std::vector, + NDHWGK, + BF16, + BF16, + Tuple, + BF16, + PassThrough, + PassThrough, + BiasNormalizeInInferClamp>>>& instances); + +#endif + +#ifdef CK_ENABLE_FP16 + +void add_device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_f16_instances( + std::vector< + std::unique_ptr, + NHWGK, + F16, + F16, + Tuple, + F16, + PassThrough, + PassThrough, + BiasNormalizeInInferClamp>>>& instances); + +void add_device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_f16_16x16_instances( + std::vector< + std::unique_ptr, + NHWGK, + F16, + F16, + Tuple, + F16, + PassThrough, + PassThrough, + BiasNormalizeInInferClamp>>>& instances); + +void add_device_grouped_conv2d_fwd_bias_bn_clamp_xdl_large_tensor_nhwgc_gkyxc_nhwgk_f16_instances( + std::vector< + std::unique_ptr, + NHWGK, + F16, + F16, + Tuple, + F16, + PassThrough, + PassThrough, + BiasNormalizeInInferClamp>>>& instances); + +void add_device_grouped_conv2d_fwd_bias_bn_clamp_xdl_merged_groups_nhwgc_gkyxc_nhwgk_f16_instances( + std::vector< + std::unique_ptr, + NHWGK, + F16, + F16, + Tuple, + F16, + PassThrough, + PassThrough, + BiasNormalizeInInferClamp>>>& instances); + +void add_device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_f16_comp_instances( + std::vector< + std::unique_ptr, + NHWGK, + F16, + F16, + Tuple, + F16, + PassThrough, + PassThrough, + BiasNormalizeInInferClamp>>>& instances); + +void add_device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_f16_comp_2x_instances( + std::vector< + std::unique_ptr, + NHWGK, + F16, + F16, + Tuple, + F16, + PassThrough, + PassThrough, + BiasNormalizeInInferClamp>>>& instances); + +void add_device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_f16_comp_part2_instances( + std::vector< + std::unique_ptr, + NHWGK, + F16, + F16, + Tuple, + F16, + PassThrough, + PassThrough, + BiasNormalizeInInferClamp>>>& instances); + +void add_device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_f16_mem_intra_instances( + std::vector< + std::unique_ptr, + NHWGK, + F16, + F16, + Tuple, + F16, + PassThrough, + PassThrough, + BiasNormalizeInInferClamp>>>& instances); + +void add_device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_f16_mem_inter_instances( + std::vector< + std::unique_ptr, + NHWGK, + F16, + F16, + Tuple, + F16, + PassThrough, + PassThrough, + BiasNormalizeInInferClamp>>>& instances); + +void add_device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_f16_instances( + std::vector, + NDHWGK, + F16, + F16, + Tuple, + F16, + PassThrough, + PassThrough, + BiasNormalizeInInferClamp>>>& instances); + +void add_device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_f16_16x16_instances( + std::vector, + NDHWGK, + F16, + F16, + Tuple, + F16, + PassThrough, + PassThrough, + BiasNormalizeInInferClamp>>>& instances); + +void add_device_grouped_conv3d_fwd_bias_bn_clamp_xdl_large_tensor_ndhwgc_gkzyxc_ndhwgk_f16_instances( + std::vector, + NDHWGK, + F16, + F16, + Tuple, + F16, + PassThrough, + PassThrough, + BiasNormalizeInInferClamp>>>& instances); + +void add_device_grouped_conv3d_fwd_bias_bn_clamp_xdl_merged_groups_ndhwgc_gkzyxc_ndhwgk_f16_instances( + std::vector, + NDHWGK, + F16, + F16, + Tuple, + F16, + PassThrough, + PassThrough, + BiasNormalizeInInferClamp>>>& instances); + +void add_device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_f16_comp_instances( + std::vector, + NDHWGK, + F16, + F16, + Tuple, + F16, + PassThrough, + PassThrough, + BiasNormalizeInInferClamp>>>& instances); + +void add_device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_f16_comp_2x_instances( + std::vector, + NDHWGK, + F16, + F16, + Tuple, + F16, + PassThrough, + PassThrough, + BiasNormalizeInInferClamp>>>& instances); + +void add_device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_f16_comp_part2_instances( + std::vector, + NDHWGK, + F16, + F16, + Tuple, + F16, + PassThrough, + PassThrough, + BiasNormalizeInInferClamp>>>& instances); + +void add_device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_f16_mem_intra_instances( + std::vector, + NDHWGK, + F16, + F16, + Tuple, + F16, + PassThrough, + PassThrough, + BiasNormalizeInInferClamp>>>& instances); + +void add_device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_f16_mem_inter_instances( + std::vector, + NDHWGK, + F16, + F16, + Tuple, + F16, + PassThrough, + PassThrough, + BiasNormalizeInInferClamp>>>& instances); + +#endif + +#ifdef CK_ENABLE_FP32 + +void add_device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_f32_instances( + std::vector< + std::unique_ptr, + NHWGK, + F32, + F32, + Tuple, + F32, + PassThrough, + PassThrough, + BiasNormalizeInInferClamp>>>& instances); + +void add_device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_f32_16x16_instances( + std::vector< + std::unique_ptr, + NHWGK, + F32, + F32, + Tuple, + F32, + PassThrough, + PassThrough, + BiasNormalizeInInferClamp>>>& instances); + +void add_device_grouped_conv2d_fwd_bias_bn_clamp_xdl_large_tensor_nhwgc_gkyxc_nhwgk_f32_instances( + std::vector< + std::unique_ptr, + NHWGK, + F32, + F32, + Tuple, + F32, + PassThrough, + PassThrough, + BiasNormalizeInInferClamp>>>& instances); + +void add_device_grouped_conv2d_fwd_bias_bn_clamp_xdl_merged_groups_nhwgc_gkyxc_nhwgk_f32_instances( + std::vector< + std::unique_ptr, + NHWGK, + F32, + F32, + Tuple, + F32, + PassThrough, + PassThrough, + BiasNormalizeInInferClamp>>>& instances); + +void add_device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_f32_comp_instances( + std::vector< + std::unique_ptr, + NHWGK, + F32, + F32, + Tuple, + F32, + PassThrough, + PassThrough, + BiasNormalizeInInferClamp>>>& instances); + +void add_device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_f32_mem_intra_instances( + std::vector< + std::unique_ptr, + NHWGK, + F32, + F32, + Tuple, + F32, + PassThrough, + PassThrough, + BiasNormalizeInInferClamp>>>& instances); + +void add_device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_f32_mem_inter_instances( + std::vector< + std::unique_ptr, + NHWGK, + F32, + F32, + Tuple, + F32, + PassThrough, + PassThrough, + BiasNormalizeInInferClamp>>>& instances); + +void add_device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_f32_instances( + std::vector, + NDHWGK, + F32, + F32, + Tuple, + F32, + PassThrough, + PassThrough, + BiasNormalizeInInferClamp>>>& instances); + +void add_device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_f32_16x16_instances( + std::vector, + NDHWGK, + F32, + F32, + Tuple, + F32, + PassThrough, + PassThrough, + BiasNormalizeInInferClamp>>>& instances); + +void add_device_grouped_conv3d_fwd_bias_bn_clamp_xdl_large_tensor_ndhwgc_gkzyxc_ndhwgk_f32_instances( + std::vector, + NDHWGK, + F32, + F32, + Tuple, + F32, + PassThrough, + PassThrough, + BiasNormalizeInInferClamp>>>& instances); + +void add_device_grouped_conv3d_fwd_bias_bn_clamp_xdl_merged_groups_ndhwgc_gkzyxc_ndhwgk_f32_instances( + std::vector, + NDHWGK, + F32, + F32, + Tuple, + F32, + PassThrough, + PassThrough, + BiasNormalizeInInferClamp>>>& instances); + +void add_device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_f32_comp_instances( + std::vector, + NDHWGK, + F32, + F32, + Tuple, + F32, + PassThrough, + PassThrough, + BiasNormalizeInInferClamp>>>& instances); + +void add_device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_f32_mem_intra_instances( + std::vector, + NDHWGK, + F32, + F32, + Tuple, + F32, + PassThrough, + PassThrough, + BiasNormalizeInInferClamp>>>& instances); + +void add_device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_f32_mem_inter_instances( + std::vector, + NDHWGK, + F32, + F32, + Tuple, + F32, + PassThrough, + PassThrough, + BiasNormalizeInInferClamp>>>& instances); + +#endif + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/CMakeLists.txt b/library/src/tensor_operation_instance/gpu/CMakeLists.txt index 5204b51edf..1eaaa7e6ba 100644 --- a/library/src/tensor_operation_instance/gpu/CMakeLists.txt +++ b/library/src/tensor_operation_instance/gpu/CMakeLists.txt @@ -175,6 +175,10 @@ function(add_instance_library INSTANCE_NAME) target_compile_features(${INSTANCE_NAME} PUBLIC) + # splits debug information into separate .dwo files to reduce debug section size + if(CMAKE_BUILD_TYPE STREQUAL "Debug" OR CMAKE_BUILD_TYPE STREQUAL "RelWithDebInfo") + target_compile_options(${INSTANCE_NAME} PRIVATE -gsplit-dwarf) + endif() # flags to compress the library if(NOT DISABLE_OFFLOAD_COMPRESS AND NOT WIN32 AND ${hip_VERSION_FLAT} GREATER 600241132) message(DEBUG "Adding --offload-compress flag for ${INSTANCE_NAME}") diff --git a/library/src/tensor_operation_instance/gpu/conv1d_bwd_data/device_conv1d_bwd_data_xdl_nwc_kxc_nwk_bf16_instance.cpp b/library/src/tensor_operation_instance/gpu/conv1d_bwd_data/device_conv1d_bwd_data_xdl_nwc_kxc_nwk_bf16_instance.cpp index e3e90c966d..3c332c3b22 100644 --- a/library/src/tensor_operation_instance/gpu/conv1d_bwd_data/device_conv1d_bwd_data_xdl_nwc_kxc_nwk_bf16_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/conv1d_bwd_data/device_conv1d_bwd_data_xdl_nwc_kxc_nwk_bf16_instance.cpp @@ -1,5 +1,5 @@ // SPDX-License-Identifier: MIT -// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved. #include @@ -90,10 +90,16 @@ void add_device_conv1d_bwd_data_xdl_nwc_kxc_nwk_bf16_instances( PassThrough, PassThrough>>>& instances) { +#if CK_BUILD_DEPRECATED +#pragma message "These instances are deprecated. They may be removed in a future release." add_device_operation_instances(instances, device_conv1d_bwd_data_xdl_nwc_kxc_nwk_bf16_instances{}); add_device_operation_instances( instances, device_conv1d_bwd_data_xdl_nwc_kxc_nwk_1x1_s1_p0_bf16_instances{}); +#else +#pragma message "These instances were deprecated" + std::ignore = instances; +#endif } } // namespace instance diff --git a/library/src/tensor_operation_instance/gpu/conv1d_bwd_data/device_conv1d_bwd_data_xdl_nwc_kxc_nwk_f16_instance.cpp b/library/src/tensor_operation_instance/gpu/conv1d_bwd_data/device_conv1d_bwd_data_xdl_nwc_kxc_nwk_f16_instance.cpp index 81e9122d95..aaaeda0312 100644 --- a/library/src/tensor_operation_instance/gpu/conv1d_bwd_data/device_conv1d_bwd_data_xdl_nwc_kxc_nwk_f16_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/conv1d_bwd_data/device_conv1d_bwd_data_xdl_nwc_kxc_nwk_f16_instance.cpp @@ -1,5 +1,5 @@ // SPDX-License-Identifier: MIT -// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved. #include @@ -83,10 +83,16 @@ void add_device_conv1d_bwd_data_xdl_nwc_kxc_nwk_f16_instances( DeviceConvBwdData<1, NWC, KXC, NWK, F16, F16, F16, PassThrough, PassThrough, PassThrough>>>& instances) { +#if CK_BUILD_DEPRECATED +#pragma message "These instances are getting deprecated" add_device_operation_instances(instances, device_conv1d_bwd_data_xdl_nwc_kxc_nwk_f16_instances{}); add_device_operation_instances( instances, device_conv1d_bwd_data_xdl_nwc_kxc_nwk_1x1_s1_p0_f16_instances{}); +#else +#pragma message "These instances were deprecated" + std::ignore = instances; +#endif } } // namespace instance diff --git a/library/src/tensor_operation_instance/gpu/conv1d_bwd_data/device_conv1d_bwd_data_xdl_nwc_kxc_nwk_f32_instance.cpp b/library/src/tensor_operation_instance/gpu/conv1d_bwd_data/device_conv1d_bwd_data_xdl_nwc_kxc_nwk_f32_instance.cpp index dbc82168f4..331cc3c4b2 100644 --- a/library/src/tensor_operation_instance/gpu/conv1d_bwd_data/device_conv1d_bwd_data_xdl_nwc_kxc_nwk_f32_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/conv1d_bwd_data/device_conv1d_bwd_data_xdl_nwc_kxc_nwk_f32_instance.cpp @@ -1,5 +1,5 @@ // SPDX-License-Identifier: MIT -// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved. #include @@ -82,10 +82,16 @@ void add_device_conv1d_bwd_data_xdl_nwc_kxc_nwk_f32_instances( DeviceConvBwdData<1, NWC, KXC, NWK, F32, F32, F32, PassThrough, PassThrough, PassThrough>>>& instances) { +#if CK_BUILD_DEPRECATED +#pragma message "These instances are getting deprecated" add_device_operation_instances(instances, device_conv1d_bwd_data_xdl_nwc_kxc_nwk_f32_instances{}); add_device_operation_instances( instances, device_conv1d_bwd_data_xdl_nwc_kxc_nwk_1x1_s1_p0_f32_instances{}); +#else +#pragma message "These instances were deprecated" + std::ignore = instances; +#endif } } // namespace instance diff --git a/library/src/tensor_operation_instance/gpu/conv1d_bwd_data/device_conv1d_bwd_data_xdl_nwc_kxc_nwk_int8_instance.cpp b/library/src/tensor_operation_instance/gpu/conv1d_bwd_data/device_conv1d_bwd_data_xdl_nwc_kxc_nwk_int8_instance.cpp index 3ac250f3e6..4e51074b3a 100644 --- a/library/src/tensor_operation_instance/gpu/conv1d_bwd_data/device_conv1d_bwd_data_xdl_nwc_kxc_nwk_int8_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/conv1d_bwd_data/device_conv1d_bwd_data_xdl_nwc_kxc_nwk_int8_instance.cpp @@ -1,5 +1,5 @@ // SPDX-License-Identifier: MIT -// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved. #include @@ -87,10 +87,16 @@ void add_device_conv1d_bwd_data_xdl_nwc_kxc_nwk_int8_instances( PassThrough, PassThrough>>>& instances) { +#if CK_BUILD_DEPRECATED +#pragma message "These instances are getting deprecated" add_device_operation_instances(instances, device_conv1d_bwd_data_xdl_nwc_kxc_nwk_int8_instances{}); add_device_operation_instances( instances, device_conv1d_bwd_data_xdl_nwc_kxc_nwk_1x1_s1_p0_int8_instances{}); +#else +#pragma message "These instances were deprecated" + std::ignore = instances; +#endif } } // namespace instance diff --git a/library/src/tensor_operation_instance/gpu/conv2d_bwd_data/device_conv2d_bwd_data_dl_nhwc_kyxc_nhwk_f16_instance.cpp b/library/src/tensor_operation_instance/gpu/conv2d_bwd_data/device_conv2d_bwd_data_dl_nhwc_kyxc_nhwk_f16_instance.cpp index 6ca909c35e..58b3f8e37d 100644 --- a/library/src/tensor_operation_instance/gpu/conv2d_bwd_data/device_conv2d_bwd_data_dl_nhwc_kyxc_nhwk_f16_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/conv2d_bwd_data/device_conv2d_bwd_data_dl_nhwc_kyxc_nhwk_f16_instance.cpp @@ -1,5 +1,5 @@ // SPDX-License-Identifier: MIT -// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved. #include @@ -71,10 +71,16 @@ void add_device_conv2d_bwd_data_dl_nhwc_kyxc_nhwk_f16_instances( PassThrough, PassThrough>>>& instances) { +#if CK_BUILD_DEPRECATED +#pragma message "These instances are getting deprecated" add_device_operation_instances(instances, device_conv2d_bwd_data_dl_nhwc_kyxc_nhwk_f16_instances{}); add_device_operation_instances( instances, device_conv2d_bwd_data_dl_nhwc_kyxc_nhwk_1x1_s1_p0_f16_instances{}); +#else +#pragma message "These instances were deprecated" + std::ignore = instances; +#endif } } // namespace instance diff --git a/library/src/tensor_operation_instance/gpu/conv2d_bwd_data/device_conv2d_bwd_data_dl_nhwc_kyxc_nhwk_f32_instance.cpp b/library/src/tensor_operation_instance/gpu/conv2d_bwd_data/device_conv2d_bwd_data_dl_nhwc_kyxc_nhwk_f32_instance.cpp index d263e98851..a487f0a6f0 100644 --- a/library/src/tensor_operation_instance/gpu/conv2d_bwd_data/device_conv2d_bwd_data_dl_nhwc_kyxc_nhwk_f32_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/conv2d_bwd_data/device_conv2d_bwd_data_dl_nhwc_kyxc_nhwk_f32_instance.cpp @@ -1,5 +1,5 @@ // SPDX-License-Identifier: MIT -// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved. #include @@ -71,10 +71,16 @@ void add_device_conv2d_bwd_data_dl_nhwc_kyxc_nhwk_f32_instances( PassThrough, PassThrough>>>& instances) { +#if CK_BUILD_DEPRECATED +#pragma message "These instances are getting deprecated" add_device_operation_instances(instances, device_conv2d_bwd_data_dl_nhwc_kyxc_nhwk_f32_instances{}); add_device_operation_instances( instances, device_conv2d_bwd_data_dl_nhwc_kyxc_nhwk_1x1_s1_p0_f32_instances{}); +#else +#pragma message "These instances were deprecated" + std::ignore = instances; +#endif } } // namespace instance diff --git a/library/src/tensor_operation_instance/gpu/conv2d_bwd_data/device_conv2d_bwd_data_dl_nhwc_kyxc_nhwk_int8_instance.cpp b/library/src/tensor_operation_instance/gpu/conv2d_bwd_data/device_conv2d_bwd_data_dl_nhwc_kyxc_nhwk_int8_instance.cpp index bc949e757c..cfd4f849b8 100644 --- a/library/src/tensor_operation_instance/gpu/conv2d_bwd_data/device_conv2d_bwd_data_dl_nhwc_kyxc_nhwk_int8_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/conv2d_bwd_data/device_conv2d_bwd_data_dl_nhwc_kyxc_nhwk_int8_instance.cpp @@ -1,5 +1,5 @@ // SPDX-License-Identifier: MIT -// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved. #include @@ -71,10 +71,16 @@ void add_device_conv2d_bwd_data_dl_nhwc_kyxc_nhwk_int8_instances( PassThrough, PassThrough>>>& instances) { +#if CK_BUILD_DEPRECATED +#pragma message "These instances are getting deprecated" add_device_operation_instances(instances, device_conv2d_bwd_data_dl_nhwc_kyxc_nhwk_int8_instances{}); add_device_operation_instances( instances, device_conv2d_bwd_data_dl_nhwc_kyxc_nhwk_1x1_s1_p0_int8_instances{}); +#else +#pragma message "These instances were deprecated" + std::ignore = instances; +#endif } } // namespace instance diff --git a/library/src/tensor_operation_instance/gpu/conv2d_bwd_data/device_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk_bf16_instance.cpp b/library/src/tensor_operation_instance/gpu/conv2d_bwd_data/device_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk_bf16_instance.cpp index 366d1fe160..c2f55d94eb 100644 --- a/library/src/tensor_operation_instance/gpu/conv2d_bwd_data/device_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk_bf16_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/conv2d_bwd_data/device_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk_bf16_instance.cpp @@ -1,5 +1,5 @@ // SPDX-License-Identifier: MIT -// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved. #include @@ -140,6 +140,8 @@ void add_device_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk_bf16_instances( PassThrough, PassThrough>>>& instances) { +#if CK_BUILD_DEPRECATED +#pragma message "These instances are getting deprecated" add_device_operation_instances(instances, device_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk_bf16_instances{}); add_device_operation_instances( @@ -149,6 +151,10 @@ void add_device_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk_bf16_instances( add_device_operation_instances( instances, device_conv_dedidecate_2d_bwd_data_xdl_nhwc_kyxc_nhwk_1x1_s1_p0_bf16_instances{}); +#else +#pragma message "These instances were deprecated" + std::ignore = instances; +#endif } } // namespace instance diff --git a/library/src/tensor_operation_instance/gpu/conv2d_bwd_data/device_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk_f16_instance.cpp b/library/src/tensor_operation_instance/gpu/conv2d_bwd_data/device_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk_f16_instance.cpp index 422e37e926..5df1c9cf39 100644 --- a/library/src/tensor_operation_instance/gpu/conv2d_bwd_data/device_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk_f16_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/conv2d_bwd_data/device_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk_f16_instance.cpp @@ -1,5 +1,5 @@ // SPDX-License-Identifier: MIT -// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved. #include @@ -142,6 +142,8 @@ void add_device_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk_f16_instances( PassThrough, PassThrough>>>& instances) { +#if CK_BUILD_DEPRECATED +#pragma message "These instances are getting deprecated" add_device_operation_instances(instances, device_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk_f16_instances{}); add_device_operation_instances( @@ -150,6 +152,10 @@ void add_device_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk_f16_instances( instances, device_conv_dedicated_2d_bwd_data_xdl_nhwc_kyxc_nhwk_f16_instances{}); add_device_operation_instances( instances, device_conv_dedicated_2d_bwd_data_xdl_nhwc_kyxc_nhwk_1x1_s1_p0_f16_instances{}); +#else +#pragma message "These instances were deprecated" + std::ignore = instances; +#endif } } // namespace instance diff --git a/library/src/tensor_operation_instance/gpu/conv2d_bwd_data/device_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk_f32_instance.cpp b/library/src/tensor_operation_instance/gpu/conv2d_bwd_data/device_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk_f32_instance.cpp index 5993f6bd7a..76ca976e37 100644 --- a/library/src/tensor_operation_instance/gpu/conv2d_bwd_data/device_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk_f32_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/conv2d_bwd_data/device_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk_f32_instance.cpp @@ -1,5 +1,5 @@ // SPDX-License-Identifier: MIT -// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved. #include @@ -139,6 +139,8 @@ void add_device_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk_f32_instances( PassThrough, PassThrough>>>& instances) { +#if CK_BUILD_DEPRECATED +#pragma message "These instances are getting deprecated" add_device_operation_instances(instances, device_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk_f32_instances{}); add_device_operation_instances( @@ -147,6 +149,10 @@ void add_device_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk_f32_instances( instances, device_conv_dedicated_2d_bwd_data_xdl_nhwc_kyxc_nhwk_f32_instances{}); add_device_operation_instances( instances, device_conv_dedicated_2d_bwd_data_xdl_nhwc_kyxc_nhwk_1x1_s1_p0_f32_instances{}); +#else +#pragma message "These instances were deprecated" + std::ignore = instances; +#endif } } // namespace instance diff --git a/library/src/tensor_operation_instance/gpu/conv2d_bwd_data/device_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk_int8_instance.cpp b/library/src/tensor_operation_instance/gpu/conv2d_bwd_data/device_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk_int8_instance.cpp index 2f079c234c..8221515caa 100644 --- a/library/src/tensor_operation_instance/gpu/conv2d_bwd_data/device_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk_int8_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/conv2d_bwd_data/device_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk_int8_instance.cpp @@ -1,5 +1,5 @@ // SPDX-License-Identifier: MIT -// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved. #include @@ -136,6 +136,8 @@ void add_device_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk_int8_instances( PassThrough, PassThrough>>>& instances) { +#if CK_BUILD_DEPRECATED +#pragma message "These instances are getting deprecated" add_device_operation_instances(instances, device_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk_int8_instances{}); add_device_operation_instances( @@ -144,6 +146,10 @@ void add_device_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk_int8_instances( instances, device_conv_dedicated_2d_bwd_data_xdl_nhwc_kyxc_nhwk_int8_instances{}); add_device_operation_instances( instances, device_conv_dedicated_2d_bwd_data_xdl_nhwc_kyxc_nhwk_1x1_s1_p0_int8_instances{}); +#else +#pragma message "These instances were deprecated" + std::ignore = instances; +#endif } } // namespace instance diff --git a/library/src/tensor_operation_instance/gpu/conv2d_fwd/device_conv2d_fwd_xdl_c_shuffle_nhwc_kyxc_nhwk_f16_instance.cpp b/library/src/tensor_operation_instance/gpu/conv2d_fwd/device_conv2d_fwd_xdl_c_shuffle_nhwc_kyxc_nhwk_f16_instance.cpp index 86c17aacf0..d7a82fdd2c 100644 --- a/library/src/tensor_operation_instance/gpu/conv2d_fwd/device_conv2d_fwd_xdl_c_shuffle_nhwc_kyxc_nhwk_f16_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/conv2d_fwd/device_conv2d_fwd_xdl_c_shuffle_nhwc_kyxc_nhwk_f16_instance.cpp @@ -1,5 +1,5 @@ // SPDX-License-Identifier: MIT -// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved. #include @@ -180,6 +180,8 @@ void add_device_conv2d_fwd_xdl_c_shuffle_nhwc_kyxc_nhwk_f16_instances( DeviceConvFwd<2, NHWC, KYXC, NHWK, F16, F16, F16, PassThrough, PassThrough, PassThrough>>>& instances) { +#if CK_BUILD_DEPRECATED +#pragma message "These instances are getting deprecated" add_device_operation_instances(instances, device_conv2d_fwd_xdl_c_shuffle_nhwc_kyxc_nhwk_f16_instances{}); add_device_operation_instances( @@ -200,6 +202,10 @@ void add_device_conv2d_fwd_xdl_c_shuffle_nhwc_kyxc_nhwk_f16_instances( add_device_operation_instances( instances, device_conv2d_fwd_xdl_c_shuffle_nhwc_kyxc_nhwk_odd_c_f16_instances_2x{}); } +#else +#pragma message "These instances were deprecated" + std::ignore = instances; +#endif } } // namespace instance diff --git a/library/src/tensor_operation_instance/gpu/conv2d_fwd/device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_bf16_instance.cpp b/library/src/tensor_operation_instance/gpu/conv2d_fwd/device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_bf16_instance.cpp index 63c612523f..153b770e1b 100644 --- a/library/src/tensor_operation_instance/gpu/conv2d_fwd/device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_bf16_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/conv2d_fwd/device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_bf16_instance.cpp @@ -1,5 +1,5 @@ // SPDX-License-Identifier: MIT -// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved. #include @@ -114,12 +114,18 @@ void add_device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_bf16_instances( PassThrough, PassThrough>>>& instances) { +#if CK_BUILD_DEPRECATED +#pragma message "These instances are getting deprecated" add_device_operation_instances(instances, device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_bf16_instances{}); add_device_operation_instances(instances, device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_1x1_p0_bf16_instances{}); add_device_operation_instances(instances, device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_1x1_s1_p0_bf16_instances{}); +#else +#pragma message "These instances were deprecated" + std::ignore = instances; +#endif } } // namespace instance diff --git a/library/src/tensor_operation_instance/gpu/conv2d_fwd/device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_f16_instance.cpp b/library/src/tensor_operation_instance/gpu/conv2d_fwd/device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_f16_instance.cpp index 0f3b9e7939..fd0c94250f 100644 --- a/library/src/tensor_operation_instance/gpu/conv2d_fwd/device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_f16_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/conv2d_fwd/device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_f16_instance.cpp @@ -1,5 +1,5 @@ // SPDX-License-Identifier: MIT -// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved. #include @@ -107,11 +107,17 @@ void add_device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_f16_instances( DeviceConvFwd<2, NHWC, KYXC, NHWK, F16, F16, F16, PassThrough, PassThrough, PassThrough>>>& instances) { +#if CK_BUILD_DEPRECATED +#pragma message "These instances are getting deprecated" add_device_operation_instances(instances, device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_f16_instances{}); add_device_operation_instances(instances, device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_1x1_p0_f16_instances{}); add_device_operation_instances(instances, device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_1x1_s1_p0_f16_instances{}); +#else +#pragma message "These instances were deprecated" + std::ignore = instances; +#endif } } // namespace instance diff --git a/library/src/tensor_operation_instance/gpu/conv2d_fwd/device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_f32_instance.cpp b/library/src/tensor_operation_instance/gpu/conv2d_fwd/device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_f32_instance.cpp index 14f9b5cd6a..038316ac31 100644 --- a/library/src/tensor_operation_instance/gpu/conv2d_fwd/device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_f32_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/conv2d_fwd/device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_f32_instance.cpp @@ -1,5 +1,5 @@ // SPDX-License-Identifier: MIT -// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved. #include @@ -106,11 +106,17 @@ void add_device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_f32_instances( DeviceConvFwd<2, NHWC, KYXC, NHWK, F32, F32, F32, PassThrough, PassThrough, PassThrough>>>& instances) { +#if CK_BUILD_DEPRECATED +#pragma message "These instances are getting deprecated" add_device_operation_instances(instances, device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_f32_instances{}); add_device_operation_instances(instances, device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_1x1_p0_f32_instances{}); add_device_operation_instances(instances, device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_1x1_s1_p0_f32_instances{}); +#else +#pragma message "These instances were deprecated" + std::ignore = instances; +#endif } } // namespace instance diff --git a/library/src/tensor_operation_instance/gpu/conv2d_fwd/device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_int8_instance.cpp b/library/src/tensor_operation_instance/gpu/conv2d_fwd/device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_int8_instance.cpp index 3f641cdadc..c77c8683c8 100644 --- a/library/src/tensor_operation_instance/gpu/conv2d_fwd/device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_int8_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/conv2d_fwd/device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_int8_instance.cpp @@ -1,5 +1,5 @@ // SPDX-License-Identifier: MIT -// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved. #include @@ -111,12 +111,18 @@ void add_device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_int8_instances( PassThrough, PassThrough>>>& instances) { +#if CK_BUILD_DEPRECATED +#pragma message "These instances are getting deprecated" add_device_operation_instances(instances, device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_int8_instances{}); add_device_operation_instances(instances, device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_1x1_p0_int8_instances{}); add_device_operation_instances(instances, device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_1x1_s1_p0_int8_instances{}); +#else +#pragma message "These instances were deprecated" + std::ignore = instances; +#endif } } // namespace instance diff --git a/library/src/tensor_operation_instance/gpu/conv2d_fwd_bias_relu/device_conv2d_fwd_xdl_c_shuffle_bias_relu_nhwc_kyxc_nhwk_f16_instance.cpp b/library/src/tensor_operation_instance/gpu/conv2d_fwd_bias_relu/device_conv2d_fwd_xdl_c_shuffle_bias_relu_nhwc_kyxc_nhwk_f16_instance.cpp index 3402653e84..97830449ee 100644 --- a/library/src/tensor_operation_instance/gpu/conv2d_fwd_bias_relu/device_conv2d_fwd_xdl_c_shuffle_bias_relu_nhwc_kyxc_nhwk_f16_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/conv2d_fwd_bias_relu/device_conv2d_fwd_xdl_c_shuffle_bias_relu_nhwc_kyxc_nhwk_f16_instance.cpp @@ -1,5 +1,5 @@ // SPDX-License-Identifier: MIT -// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved. #include @@ -179,6 +179,8 @@ using device_conv2d_fwd_xdl_c_shuffle_bias_relu_nhwc_kyxc_nhwk_odd_c_f16_instanc void add_device_conv2d_fwd_xdl_c_shuffle_bias_relu_nhwc_kyxc_nhwk_f16_instances( std::vector>& instances) { +#if CK_BUILD_DEPRECATED +#pragma message "These instances are getting deprecated" add_device_operation_instances( instances, device_conv2d_fwd_xdl_c_shuffle_bias_relu_nhwc_kyxc_nhwk_f16_instances{}); add_device_operation_instances( @@ -203,6 +205,10 @@ void add_device_conv2d_fwd_xdl_c_shuffle_bias_relu_nhwc_kyxc_nhwk_f16_instances( instances, device_conv2d_fwd_xdl_c_shuffle_bias_relu_nhwc_kyxc_nhwk_odd_c_f16_instances_2x{}); } +#else +#pragma message "These instances were deprecated" + std::ignore = instances; +#endif } } // namespace instance diff --git a/library/src/tensor_operation_instance/gpu/conv2d_fwd_bias_relu_add/device_conv2d_fwd_xdl_c_shuffle_bias_relu_add_nhwc_kyxc_nhwk_f16_instance.cpp b/library/src/tensor_operation_instance/gpu/conv2d_fwd_bias_relu_add/device_conv2d_fwd_xdl_c_shuffle_bias_relu_add_nhwc_kyxc_nhwk_f16_instance.cpp index faac2813ba..e5c682d3cd 100644 --- a/library/src/tensor_operation_instance/gpu/conv2d_fwd_bias_relu_add/device_conv2d_fwd_xdl_c_shuffle_bias_relu_add_nhwc_kyxc_nhwk_f16_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/conv2d_fwd_bias_relu_add/device_conv2d_fwd_xdl_c_shuffle_bias_relu_add_nhwc_kyxc_nhwk_f16_instance.cpp @@ -1,5 +1,5 @@ // SPDX-License-Identifier: MIT -// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved. #include @@ -177,6 +177,8 @@ using device_conv2d_fwd_xdl_c_shuffle_bias_relu_add_nhwc_kyxc_nhwk_odd_c_f16_ins void add_device_conv2d_fwd_xdl_c_shuffle_bias_relu_add_nhwc_kyxc_nhwk_f16_instances( std::vector>& instances) { +#if CK_BUILD_DEPRECATED +#pragma message "These instances are getting deprecated" add_device_operation_instances( instances, device_conv2d_fwd_xdl_c_shuffle_bias_relu_add_nhwc_kyxc_nhwk_f16_instances{}); add_device_operation_instances( @@ -204,6 +206,10 @@ void add_device_conv2d_fwd_xdl_c_shuffle_bias_relu_add_nhwc_kyxc_nhwk_f16_instan instances, device_conv2d_fwd_xdl_c_shuffle_bias_relu_add_nhwc_kyxc_nhwk_odd_c_f16_instances_2x{}); } +#else +#pragma message "These instances were deprecated" + std::ignore = instances; +#endif } } // namespace instance diff --git a/library/src/tensor_operation_instance/gpu/conv3d_bwd_data/device_conv3d_bwd_data_xdl_ndhwc_kzyxc_ndhwk_bf16_instance.cpp b/library/src/tensor_operation_instance/gpu/conv3d_bwd_data/device_conv3d_bwd_data_xdl_ndhwc_kzyxc_ndhwk_bf16_instance.cpp index 94b2a47e50..0b9a6c2b8d 100644 --- a/library/src/tensor_operation_instance/gpu/conv3d_bwd_data/device_conv3d_bwd_data_xdl_ndhwc_kzyxc_ndhwk_bf16_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/conv3d_bwd_data/device_conv3d_bwd_data_xdl_ndhwc_kzyxc_ndhwk_bf16_instance.cpp @@ -1,5 +1,5 @@ // SPDX-License-Identifier: MIT -// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved. #include @@ -90,10 +90,16 @@ void add_device_conv3d_bwd_data_xdl_ndhwc_kzyxc_ndhwk_bf16_instances( PassThrough, PassThrough>>>& instances) { +#if CK_BUILD_DEPRECATED +#pragma message "These instances are getting deprecated" add_device_operation_instances(instances, device_conv3d_bwd_data_xdl_ndhwc_kzyxc_ndhwk_bf16_instances{}); add_device_operation_instances( instances, device_conv3d_bwd_data_xdl_ndhwc_kzyxc_ndhwk_1x1_s1_p0_bf16_instances{}); +#else +#pragma message "These instances were deprecated" + std::ignore = instances; +#endif } } // namespace instance diff --git a/library/src/tensor_operation_instance/gpu/conv3d_bwd_data/device_conv3d_bwd_data_xdl_ndhwc_kzyxc_ndhwk_f16_instance.cpp b/library/src/tensor_operation_instance/gpu/conv3d_bwd_data/device_conv3d_bwd_data_xdl_ndhwc_kzyxc_ndhwk_f16_instance.cpp index 4244ab7b87..6c54552cc8 100644 --- a/library/src/tensor_operation_instance/gpu/conv3d_bwd_data/device_conv3d_bwd_data_xdl_ndhwc_kzyxc_ndhwk_f16_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/conv3d_bwd_data/device_conv3d_bwd_data_xdl_ndhwc_kzyxc_ndhwk_f16_instance.cpp @@ -1,5 +1,5 @@ // SPDX-License-Identifier: MIT -// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved. #include @@ -90,10 +90,16 @@ void add_device_conv3d_bwd_data_xdl_ndhwc_kzyxc_ndhwk_f16_instances( PassThrough, PassThrough>>>& instances) { +#if CK_BUILD_DEPRECATED +#pragma message "These instances are getting deprecated" add_device_operation_instances(instances, device_conv3d_bwd_data_xdl_ndhwc_kzyxc_ndhwk_f16_instances{}); add_device_operation_instances( instances, device_conv3d_bwd_data_xdl_ndhwc_kzyxc_ndhwk_1x1_s1_p0_f16_instances{}); +#else +#pragma message "These instances were deprecated" + std::ignore = instances; +#endif } } // namespace instance diff --git a/library/src/tensor_operation_instance/gpu/conv3d_bwd_data/device_conv3d_bwd_data_xdl_ndhwc_kzyxc_ndhwk_f32_instance.cpp b/library/src/tensor_operation_instance/gpu/conv3d_bwd_data/device_conv3d_bwd_data_xdl_ndhwc_kzyxc_ndhwk_f32_instance.cpp index 5c7db4ca3b..363e342c1b 100644 --- a/library/src/tensor_operation_instance/gpu/conv3d_bwd_data/device_conv3d_bwd_data_xdl_ndhwc_kzyxc_ndhwk_f32_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/conv3d_bwd_data/device_conv3d_bwd_data_xdl_ndhwc_kzyxc_ndhwk_f32_instance.cpp @@ -1,5 +1,5 @@ // SPDX-License-Identifier: MIT -// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved. #include @@ -89,10 +89,16 @@ void add_device_conv3d_bwd_data_xdl_ndhwc_kzyxc_ndhwk_f32_instances( PassThrough, PassThrough>>>& instances) { +#if CK_BUILD_DEPRECATED +#pragma message "These instances are getting deprecated" add_device_operation_instances(instances, device_conv3d_bwd_data_xdl_ndhwc_kzyxc_ndhwk_f32_instances{}); add_device_operation_instances( instances, device_conv3d_bwd_data_xdl_ndhwc_kzyxc_ndhwk_1x1_s1_p0_f32_instances{}); +#else +#pragma message "These instances were deprecated" + std::ignore = instances; +#endif } } // namespace instance diff --git a/library/src/tensor_operation_instance/gpu/conv3d_bwd_data/device_conv3d_bwd_data_xdl_ndhwc_kzyxc_ndhwk_int8_instance.cpp b/library/src/tensor_operation_instance/gpu/conv3d_bwd_data/device_conv3d_bwd_data_xdl_ndhwc_kzyxc_ndhwk_int8_instance.cpp index ebc56487a1..35bca49fed 100644 --- a/library/src/tensor_operation_instance/gpu/conv3d_bwd_data/device_conv3d_bwd_data_xdl_ndhwc_kzyxc_ndhwk_int8_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/conv3d_bwd_data/device_conv3d_bwd_data_xdl_ndhwc_kzyxc_ndhwk_int8_instance.cpp @@ -1,5 +1,5 @@ // SPDX-License-Identifier: MIT -// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved. #include @@ -87,10 +87,16 @@ void add_device_conv3d_bwd_data_xdl_ndhwc_kzyxc_ndhwk_int8_instances( PassThrough, PassThrough>>>& instances) { +#if CK_BUILD_DEPRECATED +#pragma message "These instances are getting deprecated" add_device_operation_instances(instances, device_conv3d_bwd_data_xdl_ndhwc_kzyxc_ndhwk_int8_instances{}); add_device_operation_instances( instances, device_conv3d_bwd_data_xdl_ndhwc_kzyxc_ndhwk_1x1_s1_p0_int8_instances{}); +#else +#pragma message "These instances were deprecated" + std::ignore = instances; +#endif } } // namespace instance diff --git a/library/src/tensor_operation_instance/gpu/gemm_blockscale_wp/CMakeLists.txt b/library/src/tensor_operation_instance/gpu/gemm_blockscale_wp/CMakeLists.txt index c8740e8d8c..0ffe5f95b2 100644 --- a/library/src/tensor_operation_instance/gpu/gemm_blockscale_wp/CMakeLists.txt +++ b/library/src/tensor_operation_instance/gpu/gemm_blockscale_wp/CMakeLists.txt @@ -10,14 +10,14 @@ list(APPEND GEMM_BLOCKSCALE_WP_INSTANCES check_cxx_compiler_flag("-mllvm --misched-bottomup=1" HAS_MISCHED_BOTTOMUP) check_cxx_compiler_flag("-mllvm --misched-prera-direction=bottomup" HAS_MISCHED_PRERA_DIRECTION) if(HAS_MISCHED_BOTTOMUP) - set_source_files_properties(device_gemm_blockscale_wp_xdl_f8_f8_bf16/device_gemm_blockscale_wp_xdl_f8_f8_bf16_mk_nk_mn_128_128_128_comp_default_instance.cpp PROPERTIES COMPILE_OPTIONS ";-mllvm;-greedy-reverse-local-assignment=1;-mllvm;--slp-threshold=-32;-mllvm;--misched-bottomup=1") - set_source_files_properties(device_gemm_blockscale_wp_xdl_f8_f8_bf16/device_gemm_blockscale_wp_xdl_f8_f8_bf16_mk_nk_mn_128_128_128_comp_kpadding_instance.cpp PROPERTIES COMPILE_OPTIONS ";-mllvm;-greedy-reverse-local-assignment=1;-mllvm;--slp-threshold=-32;-mllvm;--misched-bottomup=1") - set_source_files_properties(device_gemm_blockscale_wp_xdl_f8_f8_bf16/device_gemm_blockscale_wp_xdl_f8_f8_bf16_mk_nk_mn_128_128_128_mem_v1_default_instance.cpp PROPERTIES COMPILE_OPTIONS ";-mllvm;-greedy-reverse-local-assignment=1;-mllvm;--slp-threshold=-32;-mllvm;--misched-bottomup=1") - set_source_files_properties(device_gemm_blockscale_wp_xdl_f8_f8_bf16/device_gemm_blockscale_wp_xdl_f8_f8_bf16_mk_nk_mn_128_128_128_mem_v1_kpadding_instance.cpp PROPERTIES COMPILE_OPTIONS ";-mllvm;-greedy-reverse-local-assignment=1;-mllvm;--slp-threshold=-32;-mllvm;--misched-bottomup=1") + set_source_files_properties(device_gemm_blockscale_wp_xdl_f8_f8_bf16/device_gemm_blockscale_wp_xdl_f8_f8_bf16_mk_nk_mn_128_128_128_comp_default_instance.cpp PROPERTIES COMPILE_OPTIONS ";-mllvm;-greedy-reverse-local-assignment=1;-mllvm;--misched-bottomup=1") + set_source_files_properties(device_gemm_blockscale_wp_xdl_f8_f8_bf16/device_gemm_blockscale_wp_xdl_f8_f8_bf16_mk_nk_mn_128_128_128_comp_kpadding_instance.cpp PROPERTIES COMPILE_OPTIONS ";-mllvm;-greedy-reverse-local-assignment=1;-mllvm;--misched-bottomup=1") + set_source_files_properties(device_gemm_blockscale_wp_xdl_f8_f8_bf16/device_gemm_blockscale_wp_xdl_f8_f8_bf16_mk_nk_mn_128_128_128_mem_v1_default_instance.cpp PROPERTIES COMPILE_OPTIONS ";-mllvm;-greedy-reverse-local-assignment=1;-mllvm;--misched-bottomup=1") + set_source_files_properties(device_gemm_blockscale_wp_xdl_f8_f8_bf16/device_gemm_blockscale_wp_xdl_f8_f8_bf16_mk_nk_mn_128_128_128_mem_v1_kpadding_instance.cpp PROPERTIES COMPILE_OPTIONS ";-mllvm;-greedy-reverse-local-assignment=1;-mllvm;--misched-bottomup=1") elseif(HAS_MISCHED_PRERA_DIRECTION) - set_source_files_properties(device_gemm_blockscale_wp_xdl_f8_f8_bf16/device_gemm_blockscale_wp_xdl_f8_f8_bf16_mk_nk_mn_128_128_128_comp_default_instance.cpp PROPERTIES COMPILE_OPTIONS ";-mllvm;-greedy-reverse-local-assignment=1;-mllvm;--slp-threshold=-32;-mllvm;--misched-prera-direction=bottomup") - set_source_files_properties(device_gemm_blockscale_wp_xdl_f8_f8_bf16/device_gemm_blockscale_wp_xdl_f8_f8_bf16_mk_nk_mn_128_128_128_comp_kpadding_instance.cpp PROPERTIES COMPILE_OPTIONS ";-mllvm;-greedy-reverse-local-assignment=1;-mllvm;--slp-threshold=-32;-mllvm;--misched-prera-direction=bottomup") - set_source_files_properties(device_gemm_blockscale_wp_xdl_f8_f8_bf16/device_gemm_blockscale_wp_xdl_f8_f8_bf16_mk_nk_mn_128_128_128_mem_v1_default_instance.cpp PROPERTIES COMPILE_OPTIONS ";-mllvm;-greedy-reverse-local-assignment=1;-mllvm;--slp-threshold=-32;-mllvm;--misched-prera-direction=bottomup") - set_source_files_properties(device_gemm_blockscale_wp_xdl_f8_f8_bf16/device_gemm_blockscale_wp_xdl_f8_f8_bf16_mk_nk_mn_128_128_128_mem_v1_kpadding_instance.cpp PROPERTIES COMPILE_OPTIONS ";-mllvm;-greedy-reverse-local-assignment=1;-mllvm;--slp-threshold=-32;-mllvm;--misched-prera-direction=bottomup") + set_source_files_properties(device_gemm_blockscale_wp_xdl_f8_f8_bf16/device_gemm_blockscale_wp_xdl_f8_f8_bf16_mk_nk_mn_128_128_128_comp_default_instance.cpp PROPERTIES COMPILE_OPTIONS ";-mllvm;-greedy-reverse-local-assignment=1;-mllvm;--misched-prera-direction=bottomup") + set_source_files_properties(device_gemm_blockscale_wp_xdl_f8_f8_bf16/device_gemm_blockscale_wp_xdl_f8_f8_bf16_mk_nk_mn_128_128_128_comp_kpadding_instance.cpp PROPERTIES COMPILE_OPTIONS ";-mllvm;-greedy-reverse-local-assignment=1;-mllvm;--misched-prera-direction=bottomup") + set_source_files_properties(device_gemm_blockscale_wp_xdl_f8_f8_bf16/device_gemm_blockscale_wp_xdl_f8_f8_bf16_mk_nk_mn_128_128_128_mem_v1_default_instance.cpp PROPERTIES COMPILE_OPTIONS ";-mllvm;-greedy-reverse-local-assignment=1;-mllvm;--misched-prera-direction=bottomup") + set_source_files_properties(device_gemm_blockscale_wp_xdl_f8_f8_bf16/device_gemm_blockscale_wp_xdl_f8_f8_bf16_mk_nk_mn_128_128_128_mem_v1_kpadding_instance.cpp PROPERTIES COMPILE_OPTIONS ";-mllvm;-greedy-reverse-local-assignment=1;-mllvm;--misched-prera-direction=bottomup") endif() add_instance_library(device_gemm_blockscale_wp_instance ${GEMM_BLOCKSCALE_WP_INSTANCES}) diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd_bias_bnorm_clamp/CMakeLists.txt b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd_bias_bnorm_clamp/CMakeLists.txt new file mode 100644 index 0000000000..c06e4f5953 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd_bias_bnorm_clamp/CMakeLists.txt @@ -0,0 +1,240 @@ +# ONLY XDL_KERNELS +set(GROUPED_CONV2D_FWD_BIAS_BNORM_CLAMP) +include(ShardInstantiation) + + +set(GENERATED_DIR ${CMAKE_CURRENT_BINARY_DIR}/generated) +generate_sharded_instantiations( + INSTANCES_NAME device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_bf16_instances + TEMPLATE_FILE xdl/device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_bf16_instance.in + NUM_SHARDS 16 + SRC_LIST GROUPED_CONV2D_FWD_BIAS_BNORM_CLAMP + OUTPUT_DIR ${GENERATED_DIR}/xdl +) + +set(GENERATED_DIR ${CMAKE_CURRENT_BINARY_DIR}/generated) +generate_sharded_instantiations( + INSTANCES_NAME device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_f16_instances + TEMPLATE_FILE xdl/device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_f16_instance.in + NUM_SHARDS 16 + SRC_LIST GROUPED_CONV2D_FWD_BIAS_BNORM_CLAMP + OUTPUT_DIR ${GENERATED_DIR}/xdl +) + +set(GENERATED_DIR ${CMAKE_CURRENT_BINARY_DIR}/generated) +generate_sharded_instantiations( + INSTANCES_NAME device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_f32_instances + TEMPLATE_FILE xdl/device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_f32_instance.in + NUM_SHARDS 16 + SRC_LIST GROUPED_CONV2D_FWD_BIAS_BNORM_CLAMP + OUTPUT_DIR ${GENERATED_DIR}/xdl +) + +set(GENERATED_DIR ${CMAKE_CURRENT_BINARY_DIR}/generated) +generate_sharded_instantiations( + INSTANCES_NAME device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_bf16_16x16_instances + TEMPLATE_FILE xdl/device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_bf16_16x16_instance.in + NUM_SHARDS 4 + SRC_LIST GROUPED_CONV2D_FWD_BIAS_BNORM_CLAMP + OUTPUT_DIR ${GENERATED_DIR}/xdl +) + +set(GENERATED_DIR ${CMAKE_CURRENT_BINARY_DIR}/generated) +generate_sharded_instantiations( + INSTANCES_NAME device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_f16_16x16_instances + TEMPLATE_FILE xdl/device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_f16_16x16_instance.in + NUM_SHARDS 4 + SRC_LIST GROUPED_CONV2D_FWD_BIAS_BNORM_CLAMP + OUTPUT_DIR ${GENERATED_DIR}/xdl +) + +set(GENERATED_DIR ${CMAKE_CURRENT_BINARY_DIR}/generated) +generate_sharded_instantiations( + INSTANCES_NAME device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_f32_16x16_instances + TEMPLATE_FILE xdl/device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_f32_16x16_instance.in + NUM_SHARDS 3 + SRC_LIST GROUPED_CONV2D_FWD_BIAS_BNORM_CLAMP + OUTPUT_DIR ${GENERATED_DIR}/xdl +) + # large tensor + # NHWGC, GKYXC, NHWGK + +set(GENERATED_DIR ${CMAKE_CURRENT_BINARY_DIR}/generated) +generate_sharded_instantiations( + INSTANCES_NAME device_grouped_conv2d_fwd_bias_bn_clamp_xdl_large_tensor_nhwgc_gkyxc_nhwgk_bf16_instances + TEMPLATE_FILE xdl/large_tensor/device_grouped_conv2d_fwd_bias_bn_clamp_xdl_large_tensor_nhwgc_gkyxc_nhwgk_bf16_instance.in + NUM_SHARDS 3 + SRC_LIST GROUPED_CONV2D_FWD_BIAS_BNORM_CLAMP + OUTPUT_DIR ${GENERATED_DIR}/xdl/large_tensor +) + +set(GENERATED_DIR ${CMAKE_CURRENT_BINARY_DIR}/generated) +generate_sharded_instantiations( + INSTANCES_NAME device_grouped_conv2d_fwd_bias_bn_clamp_xdl_large_tensor_nhwgc_gkyxc_nhwgk_f16_instances + TEMPLATE_FILE xdl/large_tensor/device_grouped_conv2d_fwd_bias_bn_clamp_xdl_large_tensor_nhwgc_gkyxc_nhwgk_f16_instance.in + NUM_SHARDS 3 + SRC_LIST GROUPED_CONV2D_FWD_BIAS_BNORM_CLAMP + OUTPUT_DIR ${GENERATED_DIR}/xdl/large_tensor +) + +set(GENERATED_DIR ${CMAKE_CURRENT_BINARY_DIR}/generated) +generate_sharded_instantiations( + INSTANCES_NAME device_grouped_conv2d_fwd_bias_bn_clamp_xdl_large_tensor_nhwgc_gkyxc_nhwgk_f32_instances + TEMPLATE_FILE xdl/large_tensor/device_grouped_conv2d_fwd_bias_bn_clamp_xdl_large_tensor_nhwgc_gkyxc_nhwgk_f32_instance.in + NUM_SHARDS 2 + SRC_LIST GROUPED_CONV2D_FWD_BIAS_BNORM_CLAMP + OUTPUT_DIR ${GENERATED_DIR}/xdl/large_tensor +) + # merged groups + # NHWGC, GKYXC, NHWGK + +set(GENERATED_DIR ${CMAKE_CURRENT_BINARY_DIR}/generated) +generate_sharded_instantiations( + INSTANCES_NAME device_grouped_conv2d_fwd_bias_bn_clamp_xdl_merged_groups_nhwgc_gkyxc_nhwgk_bf16_instances + TEMPLATE_FILE xdl/merged_groups/device_grouped_conv2d_fwd_bias_bn_clamp_xdl_merged_groups_nhwgc_gkyxc_nhwgk_bf16_instance.in + NUM_SHARDS 3 + SRC_LIST GROUPED_CONV2D_FWD_BIAS_BNORM_CLAMP + OUTPUT_DIR ${GENERATED_DIR}/xdl/merged_groups +) + +set(GENERATED_DIR ${CMAKE_CURRENT_BINARY_DIR}/generated) +generate_sharded_instantiations( + INSTANCES_NAME device_grouped_conv2d_fwd_bias_bn_clamp_xdl_merged_groups_nhwgc_gkyxc_nhwgk_f16_instances + TEMPLATE_FILE xdl/merged_groups/device_grouped_conv2d_fwd_bias_bn_clamp_xdl_merged_groups_nhwgc_gkyxc_nhwgk_f16_instance.in + NUM_SHARDS 3 + SRC_LIST GROUPED_CONV2D_FWD_BIAS_BNORM_CLAMP + OUTPUT_DIR ${GENERATED_DIR}/xdl/merged_groups +) + +set(GENERATED_DIR ${CMAKE_CURRENT_BINARY_DIR}/generated) +generate_sharded_instantiations( + INSTANCES_NAME device_grouped_conv2d_fwd_bias_bn_clamp_xdl_merged_groups_nhwgc_gkyxc_nhwgk_f32_instances + TEMPLATE_FILE xdl/merged_groups/device_grouped_conv2d_fwd_bias_bn_clamp_xdl_merged_groups_nhwgc_gkyxc_nhwgk_f32_instance.in + NUM_SHARDS 3 + SRC_LIST GROUPED_CONV2D_FWD_BIAS_BNORM_CLAMP + OUTPUT_DIR ${GENERATED_DIR}/xdl/merged_groups +) + #mem + # NHWGC, GKYXC, NHWGK + +set(GENERATED_DIR ${CMAKE_CURRENT_BINARY_DIR}/generated) +generate_sharded_instantiations( + INSTANCES_NAME device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_bf16_mem_intra_instances + TEMPLATE_FILE xdl/mem/device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_bf16_mem_intra_instance.in + NUM_SHARDS 20 + SRC_LIST GROUPED_CONV2D_FWD_BIAS_BNORM_CLAMP + OUTPUT_DIR ${GENERATED_DIR}/xdl/mem +) + +set(GENERATED_DIR ${CMAKE_CURRENT_BINARY_DIR}/generated) +generate_sharded_instantiations( + INSTANCES_NAME device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_f16_mem_intra_instances + TEMPLATE_FILE xdl/mem/device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_f16_mem_intra_instance.in + NUM_SHARDS 20 + SRC_LIST GROUPED_CONV2D_FWD_BIAS_BNORM_CLAMP + OUTPUT_DIR ${GENERATED_DIR}/xdl/mem +) + +set(GENERATED_DIR ${CMAKE_CURRENT_BINARY_DIR}/generated) +generate_sharded_instantiations( + INSTANCES_NAME device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_f32_mem_intra_instances + TEMPLATE_FILE xdl/mem/device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_f32_mem_intra_instance.in + NUM_SHARDS 16 + SRC_LIST GROUPED_CONV2D_FWD_BIAS_BNORM_CLAMP + OUTPUT_DIR ${GENERATED_DIR}/xdl/mem +) + # NHWGC, GKYXC, NHWGK + +set(GENERATED_DIR ${CMAKE_CURRENT_BINARY_DIR}/generated) +generate_sharded_instantiations( + INSTANCES_NAME device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_bf16_mem_inter_instances + TEMPLATE_FILE xdl/mem/device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_bf16_mem_inter_instance.in + NUM_SHARDS 20 + SRC_LIST GROUPED_CONV2D_FWD_BIAS_BNORM_CLAMP + OUTPUT_DIR ${GENERATED_DIR}/xdl/mem +) + +set(GENERATED_DIR ${CMAKE_CURRENT_BINARY_DIR}/generated) +generate_sharded_instantiations( + INSTANCES_NAME device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_f16_mem_inter_instances + TEMPLATE_FILE xdl/mem/device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_f16_mem_inter_instance.in + NUM_SHARDS 20 + SRC_LIST GROUPED_CONV2D_FWD_BIAS_BNORM_CLAMP + OUTPUT_DIR ${GENERATED_DIR}/xdl/mem +) + +set(GENERATED_DIR ${CMAKE_CURRENT_BINARY_DIR}/generated) +generate_sharded_instantiations( + INSTANCES_NAME device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_f32_mem_inter_instances + TEMPLATE_FILE xdl/mem/device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_f32_mem_inter_instance.in + NUM_SHARDS 16 + SRC_LIST GROUPED_CONV2D_FWD_BIAS_BNORM_CLAMP + OUTPUT_DIR ${GENERATED_DIR}/xdl/mem +) + #comp + # NHWGC, GKYXC, NHWGK + +set(GENERATED_DIR ${CMAKE_CURRENT_BINARY_DIR}/generated) +generate_sharded_instantiations( + INSTANCES_NAME device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_bf16_comp_instances + TEMPLATE_FILE xdl/comp/device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_bf16_comp_instance.in + NUM_SHARDS 11 + SRC_LIST GROUPED_CONV2D_FWD_BIAS_BNORM_CLAMP + OUTPUT_DIR ${GENERATED_DIR}/xdl/comp +) + +set(GENERATED_DIR ${CMAKE_CURRENT_BINARY_DIR}/generated) +generate_sharded_instantiations( + INSTANCES_NAME device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_f16_comp_instances + TEMPLATE_FILE xdl/comp/device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_f16_comp_instance.in + NUM_SHARDS 1 + SRC_LIST GROUPED_CONV2D_FWD_BIAS_BNORM_CLAMP + OUTPUT_DIR ${GENERATED_DIR}/xdl/comp +) + +set(GENERATED_DIR ${CMAKE_CURRENT_BINARY_DIR}/generated) +generate_sharded_instantiations( + INSTANCES_NAME device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_f32_comp_instances + TEMPLATE_FILE xdl/comp/device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_f32_comp_instance.in + NUM_SHARDS 4 + SRC_LIST GROUPED_CONV2D_FWD_BIAS_BNORM_CLAMP + OUTPUT_DIR ${GENERATED_DIR}/xdl/comp +) + +set(GENERATED_DIR ${CMAKE_CURRENT_BINARY_DIR}/generated) +generate_sharded_instantiations( + INSTANCES_NAME device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_bf16_comp_2x_instances + TEMPLATE_FILE xdl/comp/device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_bf16_comp_2x_instance.in + NUM_SHARDS 1 + SRC_LIST GROUPED_CONV2D_FWD_BIAS_BNORM_CLAMP + OUTPUT_DIR ${GENERATED_DIR}/xdl/comp +) + +set(GENERATED_DIR ${CMAKE_CURRENT_BINARY_DIR}/generated) +generate_sharded_instantiations( + INSTANCES_NAME device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_f16_comp_2x_instances + TEMPLATE_FILE xdl/comp/device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_f16_comp_2x_instance.in + NUM_SHARDS 1 + SRC_LIST GROUPED_CONV2D_FWD_BIAS_BNORM_CLAMP + OUTPUT_DIR ${GENERATED_DIR}/xdl/comp +) + +set(GENERATED_DIR ${CMAKE_CURRENT_BINARY_DIR}/generated) +generate_sharded_instantiations( + INSTANCES_NAME device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_bf16_comp_part2_instances + TEMPLATE_FILE xdl/comp/device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_bf16_comp_part2_instance.in + NUM_SHARDS 5 + SRC_LIST GROUPED_CONV2D_FWD_BIAS_BNORM_CLAMP + OUTPUT_DIR ${GENERATED_DIR}/xdl/comp +) + +set(GENERATED_DIR ${CMAKE_CURRENT_BINARY_DIR}/generated) +generate_sharded_instantiations( + INSTANCES_NAME device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_f16_comp_part2_instances + TEMPLATE_FILE xdl/comp/device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_f16_comp_part2_instance.in + NUM_SHARDS 12 + SRC_LIST GROUPED_CONV2D_FWD_BIAS_BNORM_CLAMP + OUTPUT_DIR ${GENERATED_DIR}/xdl/comp +) + +add_instance_library(device_grouped_conv2d_fwd_bias_bnorm_clamp_instance ${GROUPED_CONV2D_FWD_BIAS_BNORM_CLAMP}) diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd_bias_bnorm_clamp/xdl/comp/device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_bf16_comp_2x_instance.in b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd_bias_bnorm_clamp/xdl/comp/device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_bf16_comp_2x_instance.in new file mode 100644 index 0000000000..51a12c33bd --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd_bias_bnorm_clamp/xdl/comp/device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_bf16_comp_2x_instance.in @@ -0,0 +1,67 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved. + +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_comp_instance.hpp" +#include "ck/host_utility/device_prop.hpp" +#include "ck/utility/filter_tuple.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +using device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_bf16_comp_2x_instances = std::vector, + NHWGK, + BF16, + BF16, + Tuple, + BF16, + PassThrough, + PassThrough, + BiasNormalizeInInferClamp>>>; + +// Compilation parameters for in[n, hi, wi, g, c] * wei[g, k, y, x, c] = out[n, ho, wo, g, k] +template +void add_device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_bf16_comp_2x_instances_shard(device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_bf16_comp_2x_instances& instances) +{ + if(ck::get_device_name() == "gfx950") + { + add_device_operation_instances( + instances, ck::util::filter_tuple_by_modulo_t< + device_grouped_conv_fwd_xdl_bf16_comp_instances_2x<2, + NHWGC, + GKYXC, Tuple, + NHWGK, + ConvFwdDefault,Tuple, BiasNormalizeInInferClamp>, + Shards, + ShardIndex>{}); + + add_device_operation_instances( + instances, ck::util::filter_tuple_by_modulo_t< + device_grouped_conv_fwd_xdl_bf16_comp_instances_2x<2, + NHWGC, + GKYXC, Tuple, + NHWGK, + ConvFwd1x1P0,Tuple, BiasNormalizeInInferClamp>, + Shards, + ShardIndex>{}); + + add_device_operation_instances( + instances, ck::util::filter_tuple_by_modulo_t< + device_grouped_conv_fwd_xdl_bf16_comp_instances_2x<2, + NHWGC, + GKYXC, Tuple, + NHWGK, + ConvFwd1x1S1P0,Tuple, BiasNormalizeInInferClamp>, + Shards, + ShardIndex>{}); + } +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd_bias_bnorm_clamp/xdl/comp/device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_bf16_comp_instance.in b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd_bias_bnorm_clamp/xdl/comp/device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_bf16_comp_instance.in new file mode 100644 index 0000000000..22ee546ac8 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd_bias_bnorm_clamp/xdl/comp/device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_bf16_comp_instance.in @@ -0,0 +1,63 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved. + +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_comp_instance.hpp" +#include "ck/host_utility/device_prop.hpp" +#include "ck/utility/filter_tuple.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +using device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_bf16_comp_instances = std::vector, + NHWGK, + BF16, + BF16, + Tuple, + BF16, + PassThrough, + PassThrough, + BiasNormalizeInInferClamp>>>; + +// Compilation parameters for in[n, hi, wi, g, c] * wei[g, k, y, x, c] = out[n, ho, wo, g, k] +template +void add_device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_bf16_comp_instances_shard(device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_bf16_comp_instances& instances) +{ + add_device_operation_instances( + instances, ck::util::filter_tuple_by_modulo_t< + device_grouped_conv_fwd_xdl_bf16_comp_instances<2, + NHWGC, + GKYXC, Tuple, + NHWGK, + ConvFwdDefault,Tuple, BiasNormalizeInInferClamp>, + Shards, + ShardIndex>{}); + + add_device_operation_instances(instances, ck::util::filter_tuple_by_modulo_t< + device_grouped_conv_fwd_xdl_bf16_comp_instances<2, + NHWGC, + GKYXC, Tuple, + NHWGK, + ConvFwd1x1P0,Tuple, BiasNormalizeInInferClamp>, + Shards, + ShardIndex>{}); + + add_device_operation_instances( + instances, ck::util::filter_tuple_by_modulo_t< + device_grouped_conv_fwd_xdl_bf16_comp_instances<2, + NHWGC, + GKYXC, Tuple, + NHWGK, + ConvFwd1x1S1P0,Tuple, BiasNormalizeInInferClamp>, + Shards, + ShardIndex>{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd_bias_bnorm_clamp/xdl/comp/device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_bf16_comp_part2_instance.in b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd_bias_bnorm_clamp/xdl/comp/device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_bf16_comp_part2_instance.in new file mode 100644 index 0000000000..632fee85a8 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd_bias_bnorm_clamp/xdl/comp/device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_bf16_comp_part2_instance.in @@ -0,0 +1,67 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved. + +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_comp_instance.hpp" +#include "ck/host_utility/device_prop.hpp" +#include "ck/utility/filter_tuple.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +using device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_bf16_comp_part2_instances = std::vector, + NHWGK, + BF16, + BF16, + Tuple, + BF16, + PassThrough, + PassThrough, + BiasNormalizeInInferClamp>>>; + +// Compilation parameters for in[n, hi, wi, g, c] * wei[g, k, y, x, c] = out[n, ho, wo, g, k] +template +void add_device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_bf16_comp_part2_instances_shard(device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_bf16_comp_part2_instances& instances) +{ + if(ck::get_device_name() != "gfx950") + { + add_device_operation_instances( + instances, ck::util::filter_tuple_by_modulo_t< + device_grouped_conv_fwd_xdl_bf16_comp_instances_part2<2, + NHWGC, + GKYXC, Tuple, + NHWGK, + ConvFwdDefault,Tuple, BiasNormalizeInInferClamp>, + Shards, + ShardIndex>{}); + + add_device_operation_instances( + instances, ck::util::filter_tuple_by_modulo_t< + device_grouped_conv_fwd_xdl_bf16_comp_instances_part2<2, + NHWGC, + GKYXC, Tuple, + NHWGK, + ConvFwd1x1P0,Tuple, BiasNormalizeInInferClamp>, + Shards, + ShardIndex>{}); + + add_device_operation_instances( + instances, ck::util::filter_tuple_by_modulo_t< + device_grouped_conv_fwd_xdl_bf16_comp_instances_part2<2, + NHWGC, + GKYXC, Tuple, + NHWGK, + ConvFwd1x1S1P0,Tuple, BiasNormalizeInInferClamp>, + Shards, + ShardIndex>{}); + } +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd_bias_bnorm_clamp/xdl/comp/device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_f16_comp_2x_instance.in b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd_bias_bnorm_clamp/xdl/comp/device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_f16_comp_2x_instance.in new file mode 100644 index 0000000000..50bbf761f1 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd_bias_bnorm_clamp/xdl/comp/device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_f16_comp_2x_instance.in @@ -0,0 +1,67 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved. + +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_comp_instance.hpp" +#include "ck/host_utility/device_prop.hpp" +#include "ck/utility/filter_tuple.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +using device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_f16_comp_2x_instances = std::vector, + NHWGK, + F16, + F16, + Tuple, + F16, + PassThrough, + PassThrough, + BiasNormalizeInInferClamp>>>; + +// Compilation parameters for in[n, hi, wi, g, c] * wei[g, k, y, x, c] = out[n, ho, wo, g, k] +template +void add_device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_f16_comp_2x_instances_shard(device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_f16_comp_2x_instances& instances) +{ + if(ck::get_device_name() == "gfx950") + { + add_device_operation_instances( + instances, ck::util::filter_tuple_by_modulo_t< + device_grouped_conv_fwd_xdl_f16_comp_instances_2x<2, + NHWGC, + GKYXC, Tuple, + NHWGK, + ConvFwdDefault,Tuple, BiasNormalizeInInferClamp>, + Shards, + ShardIndex>{}); + + add_device_operation_instances( + instances, ck::util::filter_tuple_by_modulo_t< + device_grouped_conv_fwd_xdl_f16_comp_instances_2x<2, + NHWGC, + GKYXC, Tuple, + NHWGK, + ConvFwd1x1P0,Tuple, BiasNormalizeInInferClamp>, + Shards, + ShardIndex>{}); + + add_device_operation_instances( + instances, ck::util::filter_tuple_by_modulo_t< + device_grouped_conv_fwd_xdl_f16_comp_instances_2x<2, + NHWGC, + GKYXC, Tuple, + NHWGK, + ConvFwd1x1S1P0,Tuple, BiasNormalizeInInferClamp>, + Shards, + ShardIndex>{}); + } +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd_bias_bnorm_clamp/xdl/comp/device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_f16_comp_instance.in b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd_bias_bnorm_clamp/xdl/comp/device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_f16_comp_instance.in new file mode 100644 index 0000000000..89baaff411 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd_bias_bnorm_clamp/xdl/comp/device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_f16_comp_instance.in @@ -0,0 +1,63 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved. + +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_comp_instance.hpp" +#include "ck/host_utility/device_prop.hpp" +#include "ck/utility/filter_tuple.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +using device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_f16_comp_instances = std::vector, + NHWGK, + F16, + F16, + Tuple, + F16, + PassThrough, + PassThrough, + BiasNormalizeInInferClamp>>>; + +// Compilation parameters for in[n, hi, wi, g, c] * wei[g, k, y, x, c] = out[n, ho, wo, g, k] +template +void add_device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_f16_comp_instances_shard(device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_f16_comp_instances& instances) +{ + add_device_operation_instances( + instances, ck::util::filter_tuple_by_modulo_t< + device_grouped_conv_fwd_xdl_f16_comp_instances<2, + NHWGC, + GKYXC, Tuple, + NHWGK, + ConvFwdDefault,Tuple, BiasNormalizeInInferClamp>, + Shards, + ShardIndex>{}); + + add_device_operation_instances(instances, ck::util::filter_tuple_by_modulo_t< + device_grouped_conv_fwd_xdl_f16_comp_instances<2, + NHWGC, + GKYXC, Tuple, + NHWGK, + ConvFwd1x1P0,Tuple, BiasNormalizeInInferClamp>, + Shards, + ShardIndex>{}); + + add_device_operation_instances( + instances, ck::util::filter_tuple_by_modulo_t< + device_grouped_conv_fwd_xdl_f16_comp_instances<2, + NHWGC, + GKYXC, Tuple, + NHWGK, + ConvFwd1x1S1P0,Tuple, BiasNormalizeInInferClamp>, + Shards, + ShardIndex>{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd_bias_bnorm_clamp/xdl/comp/device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_f16_comp_part2_instance.in b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd_bias_bnorm_clamp/xdl/comp/device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_f16_comp_part2_instance.in new file mode 100644 index 0000000000..80a2655de6 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd_bias_bnorm_clamp/xdl/comp/device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_f16_comp_part2_instance.in @@ -0,0 +1,67 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved. + +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_comp_instance.hpp" +#include "ck/host_utility/device_prop.hpp" +#include "ck/utility/filter_tuple.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +using device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_f16_comp_part2_instances = std::vector, + NHWGK, + F16, + F16, + Tuple, + F16, + PassThrough, + PassThrough, + BiasNormalizeInInferClamp>>>; + +// Compilation parameters for in[n, hi, wi, g, c] * wei[g, k, y, x, c] = out[n, ho, wo, g, k] +template +void add_device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_f16_comp_part2_instances_shard(device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_f16_comp_part2_instances& instances) +{ + if(ck::get_device_name() != "gfx950") + { + add_device_operation_instances( + instances, ck::util::filter_tuple_by_modulo_t< + device_grouped_conv_fwd_xdl_f16_comp_instances_part2<2, + NHWGC, + GKYXC, Tuple, + NHWGK, + ConvFwdDefault,Tuple, BiasNormalizeInInferClamp>, + Shards, + ShardIndex>{}); + + add_device_operation_instances( + instances, ck::util::filter_tuple_by_modulo_t< + device_grouped_conv_fwd_xdl_f16_comp_instances_part2<2, + NHWGC, + GKYXC, Tuple, + NHWGK, + ConvFwd1x1P0,Tuple, BiasNormalizeInInferClamp>, + Shards, + ShardIndex>{}); + + add_device_operation_instances( + instances, ck::util::filter_tuple_by_modulo_t< + device_grouped_conv_fwd_xdl_f16_comp_instances_part2<2, + NHWGC, + GKYXC, Tuple, + NHWGK, + ConvFwd1x1S1P0,Tuple, BiasNormalizeInInferClamp>, + Shards, + ShardIndex>{}); + } +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd_bias_bnorm_clamp/xdl/comp/device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_f32_comp_instance.in b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd_bias_bnorm_clamp/xdl/comp/device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_f32_comp_instance.in new file mode 100644 index 0000000000..395885d03d --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd_bias_bnorm_clamp/xdl/comp/device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_f32_comp_instance.in @@ -0,0 +1,62 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved. + +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_comp_instance.hpp" +#include "ck/utility/filter_tuple.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +using device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_f32_comp_instances = std::vector, + NHWGK, + F32, + F32, + Tuple, + F32, + PassThrough, + PassThrough, + BiasNormalizeInInferClamp>>>; + +// Compilation parameters for in[n, hi, wi, g, c] * wei[g, k, y, x, c] = out[n, ho, wo, g, k] +template +void add_device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_f32_comp_instances_shard(device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_f32_comp_instances& instances) +{ + add_device_operation_instances( + instances, ck::util::filter_tuple_by_modulo_t< + device_grouped_conv_fwd_xdl_f32_comp_instances<2, + NHWGC, + GKYXC, Tuple, + NHWGK, + ConvFwdDefault,Tuple, BiasNormalizeInInferClamp>, + Shards, + ShardIndex>{}); + + add_device_operation_instances(instances, ck::util::filter_tuple_by_modulo_t< + device_grouped_conv_fwd_xdl_f32_comp_instances<2, + NHWGC, + GKYXC, Tuple, + NHWGK, + ConvFwd1x1P0,Tuple, BiasNormalizeInInferClamp>, + Shards, + ShardIndex>{}); + + add_device_operation_instances( + instances, ck::util::filter_tuple_by_modulo_t< + device_grouped_conv_fwd_xdl_f32_comp_instances<2, + NHWGC, + GKYXC, Tuple, + NHWGK, + ConvFwd1x1S1P0,Tuple, BiasNormalizeInInferClamp>, + Shards, + ShardIndex>{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd_bias_bnorm_clamp/xdl/device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_bf16_16x16_instance.in b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd_bias_bnorm_clamp/xdl/device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_bf16_16x16_instance.in new file mode 100644 index 0000000000..097254dc34 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd_bias_bnorm_clamp/xdl/device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_bf16_16x16_instance.in @@ -0,0 +1,63 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved. + +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_instance.hpp" +#include "ck/utility/filter_tuple.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +using device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_bf16_16x16_instances = std::vector, + NHWGK, + BF16, + BF16, + Tuple, + BF16, + PassThrough, + PassThrough, + BiasNormalizeInInferClamp>>>; + +// Compilation parameters for in[n, hi, wi, g, c] * wei[g, k, y, x, c] = out[n, ho, wo, g, k] +template +void add_device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_bf16_16x16_instances_shard(device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_bf16_16x16_instances& instances) +{ + add_device_operation_instances( + instances, ck::util::filter_tuple_by_modulo_t< + device_grouped_conv_fwd_xdl_bf16_16x16_instances<2, + NHWGC, + GKYXC, Tuple, + NHWGK, + ConvFwdDefault,Tuple, BiasNormalizeInInferClamp>, + Shards, + ShardIndex>{}); + + add_device_operation_instances( + instances, ck::util::filter_tuple_by_modulo_t< + device_grouped_conv_fwd_xdl_bf16_16x16_instances<2, + NHWGC, + GKYXC, Tuple, + NHWGK, + ConvFwd1x1P0,Tuple, BiasNormalizeInInferClamp>, + Shards, + ShardIndex>{}); + + add_device_operation_instances( + instances, ck::util::filter_tuple_by_modulo_t< + device_grouped_conv_fwd_xdl_bf16_16x16_instances<2, + NHWGC, + GKYXC, Tuple, + NHWGK, + ConvFwd1x1S1P0,Tuple, BiasNormalizeInInferClamp>, + Shards, + ShardIndex>{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd_bias_bnorm_clamp/xdl/device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_bf16_instance.in b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd_bias_bnorm_clamp/xdl/device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_bf16_instance.in new file mode 100644 index 0000000000..7844440dd0 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd_bias_bnorm_clamp/xdl/device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_bf16_instance.in @@ -0,0 +1,60 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved. + +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_instance.hpp" +#include "ck/utility/filter_tuple.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +using device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_bf16_instances = std::vector, + NHWGK, + BF16, + BF16, + Tuple, + BF16, + PassThrough, + PassThrough, + BiasNormalizeInInferClamp>>>; + +// Compilation parameters for in[n, hi, wi, g, c] * wei[g, k, y, x, c] = out[n, ho, wo, g, k] +template +void add_device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_bf16_instances_shard(device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_bf16_instances& instances) +{ + add_device_operation_instances(instances, ck::util::filter_tuple_by_modulo_t< + device_grouped_conv_fwd_xdl_bf16_instances<2, + NHWGC, + GKYXC, Tuple, + NHWGK, + ConvFwdDefault,Tuple, BiasNormalizeInInferClamp>, + Shards, + ShardIndex>{}); + + add_device_operation_instances(instances, ck::util::filter_tuple_by_modulo_t< + device_grouped_conv_fwd_xdl_bf16_instances<2, + NHWGC, + GKYXC, Tuple, + NHWGK, + ConvFwd1x1P0,Tuple, BiasNormalizeInInferClamp>, + Shards, + ShardIndex>{}); + + add_device_operation_instances(instances, ck::util::filter_tuple_by_modulo_t< + device_grouped_conv_fwd_xdl_bf16_instances<2, + NHWGC, + GKYXC, Tuple, + NHWGK, + ConvFwd1x1S1P0,Tuple, BiasNormalizeInInferClamp>, + Shards, + ShardIndex>{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd_bias_bnorm_clamp/xdl/device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_f16_16x16_instance.in b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd_bias_bnorm_clamp/xdl/device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_f16_16x16_instance.in new file mode 100644 index 0000000000..9db1750e8e --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd_bias_bnorm_clamp/xdl/device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_f16_16x16_instance.in @@ -0,0 +1,62 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved. + +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_instance.hpp" +#include "ck/utility/filter_tuple.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +using device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_f16_16x16_instances = std::vector, + NHWGK, + F16, + F16, + Tuple, + F16, + PassThrough, + PassThrough, + BiasNormalizeInInferClamp>>>; + +// Compilation parameters for in[n, hi, wi, g, c] * wei[g, k, y, x, c] = out[n, ho, wo, g, k] +template +void add_device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_f16_16x16_instances_shard(device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_f16_16x16_instances& instances) +{ + add_device_operation_instances( + instances, ck::util::filter_tuple_by_modulo_t< + device_grouped_conv_fwd_xdl_f16_16x16_instances<2, + NHWGC, + GKYXC, Tuple, + NHWGK, + ConvFwdDefault,Tuple, BiasNormalizeInInferClamp>, + Shards, + ShardIndex>{}); + + add_device_operation_instances(instances, ck::util::filter_tuple_by_modulo_t< + device_grouped_conv_fwd_xdl_f16_16x16_instances<2, + NHWGC, + GKYXC, Tuple, + NHWGK, + ConvFwd1x1P0,Tuple, BiasNormalizeInInferClamp>, + Shards, + ShardIndex>{}); + + add_device_operation_instances( + instances, ck::util::filter_tuple_by_modulo_t< + device_grouped_conv_fwd_xdl_f16_16x16_instances<2, + NHWGC, + GKYXC, Tuple, + NHWGK, + ConvFwd1x1S1P0,Tuple, BiasNormalizeInInferClamp>, + Shards, + ShardIndex>{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd_bias_bnorm_clamp/xdl/device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_f16_instance.in b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd_bias_bnorm_clamp/xdl/device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_f16_instance.in new file mode 100644 index 0000000000..341fdf6eb6 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd_bias_bnorm_clamp/xdl/device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_f16_instance.in @@ -0,0 +1,60 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved. + +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_instance.hpp" +#include "ck/utility/filter_tuple.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +using device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_f16_instances = std::vector, + NHWGK, + F16, + F16, + Tuple, + F16, + PassThrough, + PassThrough, + BiasNormalizeInInferClamp>>>; + +// Compilation parameters for in[n, hi, wi, g, c] * wei[g, k, y, x, c] = out[n, ho, wo, g, k] +template +void add_device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_f16_instances_shard(device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_f16_instances& instances) +{ + add_device_operation_instances(instances, ck::util::filter_tuple_by_modulo_t< + device_grouped_conv_fwd_xdl_f16_instances<2, + NHWGC, + GKYXC, Tuple, + NHWGK, + ConvFwdDefault,Tuple, BiasNormalizeInInferClamp>, + Shards, + ShardIndex>{}); + + add_device_operation_instances(instances, ck::util::filter_tuple_by_modulo_t< + device_grouped_conv_fwd_xdl_f16_instances<2, + NHWGC, + GKYXC, Tuple, + NHWGK, + ConvFwd1x1P0,Tuple, BiasNormalizeInInferClamp>, + Shards, + ShardIndex>{}); + + add_device_operation_instances(instances, ck::util::filter_tuple_by_modulo_t< + device_grouped_conv_fwd_xdl_f16_instances<2, + NHWGC, + GKYXC, Tuple, + NHWGK, + ConvFwd1x1S1P0,Tuple, BiasNormalizeInInferClamp>, + Shards, + ShardIndex>{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd_bias_bnorm_clamp/xdl/device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_f32_16x16_instance.in b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd_bias_bnorm_clamp/xdl/device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_f32_16x16_instance.in new file mode 100644 index 0000000000..bcb126392a --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd_bias_bnorm_clamp/xdl/device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_f32_16x16_instance.in @@ -0,0 +1,62 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved. + +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_instance.hpp" +#include "ck/utility/filter_tuple.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +using device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_f32_16x16_instances = std::vector, + NHWGK, + F32, + F32, + Tuple, + F32, + PassThrough, + PassThrough, + BiasNormalizeInInferClamp>>>; + +// Compilation parameters for in[n, hi, wi, g, c] * wei[g, k, y, x, c] = out[n, ho, wo, g, k] +template +void add_device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_f32_16x16_instances_shard(device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_f32_16x16_instances& instances) +{ + add_device_operation_instances( + instances, ck::util::filter_tuple_by_modulo_t< + device_grouped_conv_fwd_xdl_f32_16x16_instances<2, + NHWGC, + GKYXC, Tuple, + NHWGK, + ConvFwdDefault,Tuple, BiasNormalizeInInferClamp>, + Shards, + ShardIndex>{}); + + add_device_operation_instances(instances, ck::util::filter_tuple_by_modulo_t< + device_grouped_conv_fwd_xdl_f32_16x16_instances<2, + NHWGC, + GKYXC, Tuple, + NHWGK, + ConvFwd1x1P0,Tuple, BiasNormalizeInInferClamp>, + Shards, + ShardIndex>{}); + + add_device_operation_instances( + instances, ck::util::filter_tuple_by_modulo_t< + device_grouped_conv_fwd_xdl_f32_16x16_instances<2, + NHWGC, + GKYXC, Tuple, + NHWGK, + ConvFwd1x1S1P0,Tuple, BiasNormalizeInInferClamp>, + Shards, + ShardIndex>{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd_bias_bnorm_clamp/xdl/device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_f32_instance.in b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd_bias_bnorm_clamp/xdl/device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_f32_instance.in new file mode 100644 index 0000000000..4e3a435e74 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd_bias_bnorm_clamp/xdl/device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_f32_instance.in @@ -0,0 +1,60 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved. + +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_instance.hpp" +#include "ck/utility/filter_tuple.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +using device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_f32_instances = std::vector, + NHWGK, + F32, + F32, + Tuple, + F32, + PassThrough, + PassThrough, + BiasNormalizeInInferClamp>>>; + +// Compilation parameters for in[n, hi, wi, g, c] * wei[g, k, y, x, c] = out[n, ho, wo, g, k] +template +void add_device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_f32_instances_shard(device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_f32_instances& instances) +{ + add_device_operation_instances(instances, ck::util::filter_tuple_by_modulo_t< + device_grouped_conv_fwd_xdl_f32_instances<2, + NHWGC, + GKYXC, Tuple, + NHWGK, + ConvFwdDefault,Tuple, BiasNormalizeInInferClamp>, + Shards, + ShardIndex>{}); + + add_device_operation_instances(instances, ck::util::filter_tuple_by_modulo_t< + device_grouped_conv_fwd_xdl_f32_instances<2, + NHWGC, + GKYXC, Tuple, + NHWGK, + ConvFwd1x1P0,Tuple, BiasNormalizeInInferClamp>, + Shards, + ShardIndex>{}); + + add_device_operation_instances(instances, ck::util::filter_tuple_by_modulo_t< + device_grouped_conv_fwd_xdl_f32_instances<2, + NHWGC, + GKYXC, Tuple, + NHWGK, + ConvFwd1x1S1P0,Tuple, BiasNormalizeInInferClamp>, + Shards, + ShardIndex>{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd_bias_bnorm_clamp/xdl/large_tensor/device_grouped_conv2d_fwd_bias_bn_clamp_xdl_large_tensor_nhwgc_gkyxc_nhwgk_bf16_instance.in b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd_bias_bnorm_clamp/xdl/large_tensor/device_grouped_conv2d_fwd_bias_bn_clamp_xdl_large_tensor_nhwgc_gkyxc_nhwgk_bf16_instance.in new file mode 100644 index 0000000000..0956d9dd71 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd_bias_bnorm_clamp/xdl/large_tensor/device_grouped_conv2d_fwd_bias_bn_clamp_xdl_large_tensor_nhwgc_gkyxc_nhwgk_bf16_instance.in @@ -0,0 +1,43 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved. + +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_large_tensor_instance.hpp" +#include "ck/utility/filter_tuple.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +using device_grouped_conv2d_fwd_bias_bn_clamp_xdl_large_tensor_nhwgc_gkyxc_nhwgk_bf16_instances = std::vector, + NHWGK, + BF16, + BF16, + Tuple, + BF16, + PassThrough, + PassThrough, + BiasNormalizeInInferClamp>>>; + +// Compilation parameters for in[n, hi, wi, g, c] * wei[g, k, y, x, c] = out[n, ho, wo, g, k] +template +void add_device_grouped_conv2d_fwd_bias_bn_clamp_xdl_large_tensor_nhwgc_gkyxc_nhwgk_bf16_instances_shard(device_grouped_conv2d_fwd_bias_bn_clamp_xdl_large_tensor_nhwgc_gkyxc_nhwgk_bf16_instances& instances) +{ + add_device_operation_instances( + instances, ck::util::filter_tuple_by_modulo_t< + device_grouped_conv_fwd_xdl_large_tensor_bf16_instances<2, + NHWGC, + GKYXC, Tuple, + NHWGK, + ConvFwdDefault,Tuple, BiasNormalizeInInferClamp>, + Shards, + ShardIndex>{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd_bias_bnorm_clamp/xdl/large_tensor/device_grouped_conv2d_fwd_bias_bn_clamp_xdl_large_tensor_nhwgc_gkyxc_nhwgk_f16_instance.in b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd_bias_bnorm_clamp/xdl/large_tensor/device_grouped_conv2d_fwd_bias_bn_clamp_xdl_large_tensor_nhwgc_gkyxc_nhwgk_f16_instance.in new file mode 100644 index 0000000000..b836dd8374 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd_bias_bnorm_clamp/xdl/large_tensor/device_grouped_conv2d_fwd_bias_bn_clamp_xdl_large_tensor_nhwgc_gkyxc_nhwgk_f16_instance.in @@ -0,0 +1,43 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved. + +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_large_tensor_instance.hpp" +#include "ck/utility/filter_tuple.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +using device_grouped_conv2d_fwd_bias_bn_clamp_xdl_large_tensor_nhwgc_gkyxc_nhwgk_f16_instances = std::vector, + NHWGK, + F16, + F16, + Tuple, + F16, + PassThrough, + PassThrough, + BiasNormalizeInInferClamp>>>; + +// Compilation parameters for in[n, hi, wi, g, c] * wei[g, k, y, x, c] = out[n, ho, wo, g, k] +template +void add_device_grouped_conv2d_fwd_bias_bn_clamp_xdl_large_tensor_nhwgc_gkyxc_nhwgk_f16_instances_shard(device_grouped_conv2d_fwd_bias_bn_clamp_xdl_large_tensor_nhwgc_gkyxc_nhwgk_f16_instances& instances) +{ + add_device_operation_instances( + instances, ck::util::filter_tuple_by_modulo_t< + device_grouped_conv_fwd_xdl_large_tensor_f16_instances<2, + NHWGC, + GKYXC, Tuple, + NHWGK, + ConvFwdDefault,Tuple, BiasNormalizeInInferClamp>, + Shards, + ShardIndex>{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd_bias_bnorm_clamp/xdl/large_tensor/device_grouped_conv2d_fwd_bias_bn_clamp_xdl_large_tensor_nhwgc_gkyxc_nhwgk_f32_instance.in b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd_bias_bnorm_clamp/xdl/large_tensor/device_grouped_conv2d_fwd_bias_bn_clamp_xdl_large_tensor_nhwgc_gkyxc_nhwgk_f32_instance.in new file mode 100644 index 0000000000..6b8cbf1704 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd_bias_bnorm_clamp/xdl/large_tensor/device_grouped_conv2d_fwd_bias_bn_clamp_xdl_large_tensor_nhwgc_gkyxc_nhwgk_f32_instance.in @@ -0,0 +1,43 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved. + +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_large_tensor_instance.hpp" +#include "ck/utility/filter_tuple.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +using device_grouped_conv2d_fwd_bias_bn_clamp_xdl_large_tensor_nhwgc_gkyxc_nhwgk_f32_instances = std::vector, + NHWGK, + F32, + F32, + Tuple, + F32, + PassThrough, + PassThrough, + BiasNormalizeInInferClamp>>>; + +// Compilation parameters for in[n, hi, wi, g, c] * wei[g, k, y, x, c] = out[n, ho, wo, g, k] +template +void add_device_grouped_conv2d_fwd_bias_bn_clamp_xdl_large_tensor_nhwgc_gkyxc_nhwgk_f32_instances_shard(device_grouped_conv2d_fwd_bias_bn_clamp_xdl_large_tensor_nhwgc_gkyxc_nhwgk_f32_instances& instances) +{ + add_device_operation_instances( + instances, ck::util::filter_tuple_by_modulo_t< + device_grouped_conv_fwd_xdl_large_tensor_f32_instances<2, + NHWGC, + GKYXC, Tuple, + NHWGK, + ConvFwdDefault,Tuple, BiasNormalizeInInferClamp>, + Shards, + ShardIndex>{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd_bias_bnorm_clamp/xdl/mem/device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_bf16_mem_inter_instance.in b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd_bias_bnorm_clamp/xdl/mem/device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_bf16_mem_inter_instance.in new file mode 100644 index 0000000000..a2c36ee52b --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd_bias_bnorm_clamp/xdl/mem/device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_bf16_mem_inter_instance.in @@ -0,0 +1,63 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved. + +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_mem_instance.hpp" +#include "ck/utility/filter_tuple.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +using device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_bf16_mem_inter_instances = std::vector, + NHWGK, + BF16, + BF16, + Tuple, + BF16, + PassThrough, + PassThrough, + BiasNormalizeInInferClamp>>>; + +// Compilation parameters for in[n, hi, wi, g, c] * wei[g, k, y, x, c] = out[n, ho, wo, g, k] +template +void add_device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_bf16_mem_inter_instances_shard(device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_bf16_mem_inter_instances& instances) +{ + add_device_operation_instances(instances, ck::util::filter_tuple_by_modulo_t< + device_grouped_conv_fwd_xdl_bf16_mem_instances<2, + NHWGC, + GKYXC, Tuple, + NHWGK, + ConvFwdDefault, + Interwave,Tuple, BiasNormalizeInInferClamp>, + Shards, + ShardIndex>{}); + + add_device_operation_instances(instances, ck::util::filter_tuple_by_modulo_t< + device_grouped_conv_fwd_xdl_bf16_mem_instances<2, + NHWGC, + GKYXC, Tuple, + NHWGK, + ConvFwd1x1P0, + Interwave,Tuple, BiasNormalizeInInferClamp>, + Shards, + ShardIndex>{}); + + add_device_operation_instances(instances, ck::util::filter_tuple_by_modulo_t< + device_grouped_conv_fwd_xdl_bf16_mem_instances<2, + NHWGC, + GKYXC, Tuple, + NHWGK, + ConvFwd1x1S1P0, + Interwave,Tuple, BiasNormalizeInInferClamp>, + Shards, + ShardIndex>{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd_bias_bnorm_clamp/xdl/mem/device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_bf16_mem_intra_instance.in b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd_bias_bnorm_clamp/xdl/mem/device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_bf16_mem_intra_instance.in new file mode 100644 index 0000000000..1c12ae66a3 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd_bias_bnorm_clamp/xdl/mem/device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_bf16_mem_intra_instance.in @@ -0,0 +1,63 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved. + +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_mem_instance.hpp" +#include "ck/utility/filter_tuple.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +using device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_bf16_mem_intra_instances = std::vector, + NHWGK, + BF16, + BF16, + Tuple, + BF16, + PassThrough, + PassThrough, + BiasNormalizeInInferClamp>>>; + +// Compilation parameters for in[n, hi, wi, g, c] * wei[g, k, y, x, c] = out[n, ho, wo, g, k] +template +void add_device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_bf16_mem_intra_instances_shard(device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_bf16_mem_intra_instances& instances) +{ + add_device_operation_instances(instances, ck::util::filter_tuple_by_modulo_t< + device_grouped_conv_fwd_xdl_bf16_mem_instances<2, + NHWGC, + GKYXC, Tuple, + NHWGK, + ConvFwdDefault, + Intrawave,Tuple, BiasNormalizeInInferClamp>, + Shards, + ShardIndex>{}); + + add_device_operation_instances(instances, ck::util::filter_tuple_by_modulo_t< + device_grouped_conv_fwd_xdl_bf16_mem_instances<2, + NHWGC, + GKYXC, Tuple, + NHWGK, + ConvFwd1x1P0, + Intrawave,Tuple, BiasNormalizeInInferClamp>, + Shards, + ShardIndex>{}); + + add_device_operation_instances(instances, ck::util::filter_tuple_by_modulo_t< + device_grouped_conv_fwd_xdl_bf16_mem_instances<2, + NHWGC, + GKYXC, Tuple, + NHWGK, + ConvFwd1x1S1P0, + Intrawave,Tuple, BiasNormalizeInInferClamp>, + Shards, + ShardIndex>{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd_bias_bnorm_clamp/xdl/mem/device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_f16_mem_inter_instance.in b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd_bias_bnorm_clamp/xdl/mem/device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_f16_mem_inter_instance.in new file mode 100644 index 0000000000..4fde5e662c --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd_bias_bnorm_clamp/xdl/mem/device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_f16_mem_inter_instance.in @@ -0,0 +1,63 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved. + +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_mem_instance.hpp" +#include "ck/utility/filter_tuple.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +using device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_f16_mem_inter_instances = std::vector, + NHWGK, + F16, + F16, + Tuple, + F16, + PassThrough, + PassThrough, + BiasNormalizeInInferClamp>>>; + +// Compilation parameters for in[n, hi, wi, g, c] * wei[g, k, y, x, c] = out[n, ho, wo, g, k] +template +void add_device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_f16_mem_inter_instances_shard(device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_f16_mem_inter_instances& instances) +{ + add_device_operation_instances(instances, ck::util::filter_tuple_by_modulo_t< + device_grouped_conv_fwd_xdl_f16_mem_instances<2, + NHWGC, + GKYXC, Tuple, + NHWGK, + ConvFwdDefault, + Interwave,Tuple, BiasNormalizeInInferClamp>, + Shards, + ShardIndex>{}); + + add_device_operation_instances(instances, ck::util::filter_tuple_by_modulo_t< + device_grouped_conv_fwd_xdl_f16_mem_instances<2, + NHWGC, + GKYXC, Tuple, + NHWGK, + ConvFwd1x1P0, + Interwave,Tuple, BiasNormalizeInInferClamp>, + Shards, + ShardIndex>{}); + + add_device_operation_instances(instances, ck::util::filter_tuple_by_modulo_t< + device_grouped_conv_fwd_xdl_f16_mem_instances<2, + NHWGC, + GKYXC, Tuple, + NHWGK, + ConvFwd1x1S1P0, + Interwave,Tuple, BiasNormalizeInInferClamp>, + Shards, + ShardIndex>{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd_bias_bnorm_clamp/xdl/mem/device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_f16_mem_intra_instance.in b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd_bias_bnorm_clamp/xdl/mem/device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_f16_mem_intra_instance.in new file mode 100644 index 0000000000..d75c7f70d5 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd_bias_bnorm_clamp/xdl/mem/device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_f16_mem_intra_instance.in @@ -0,0 +1,63 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved. + +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_mem_instance.hpp" +#include "ck/utility/filter_tuple.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +using device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_f16_mem_intra_instances = std::vector, + NHWGK, + F16, + F16, + Tuple, + F16, + PassThrough, + PassThrough, + BiasNormalizeInInferClamp>>>; + +// Compilation parameters for in[n, hi, wi, g, c] * wei[g, k, y, x, c] = out[n, ho, wo, g, k] +template +void add_device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_f16_mem_intra_instances_shard(device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_f16_mem_intra_instances& instances) +{ + add_device_operation_instances(instances, ck::util::filter_tuple_by_modulo_t< + device_grouped_conv_fwd_xdl_f16_mem_instances<2, + NHWGC, + GKYXC, Tuple, + NHWGK, + ConvFwdDefault, + Intrawave,Tuple, BiasNormalizeInInferClamp>, + Shards, + ShardIndex>{}); + + add_device_operation_instances(instances, ck::util::filter_tuple_by_modulo_t< + device_grouped_conv_fwd_xdl_f16_mem_instances<2, + NHWGC, + GKYXC, Tuple, + NHWGK, + ConvFwd1x1P0, + Intrawave,Tuple, BiasNormalizeInInferClamp>, + Shards, + ShardIndex>{}); + + add_device_operation_instances(instances, ck::util::filter_tuple_by_modulo_t< + device_grouped_conv_fwd_xdl_f16_mem_instances<2, + NHWGC, + GKYXC, Tuple, + NHWGK, + ConvFwd1x1S1P0, + Intrawave,Tuple, BiasNormalizeInInferClamp>, + Shards, + ShardIndex>{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd_bias_bnorm_clamp/xdl/mem/device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_f32_mem_inter_instance.in b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd_bias_bnorm_clamp/xdl/mem/device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_f32_mem_inter_instance.in new file mode 100644 index 0000000000..d51b3d01e3 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd_bias_bnorm_clamp/xdl/mem/device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_f32_mem_inter_instance.in @@ -0,0 +1,63 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved. + +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_mem_instance.hpp" +#include "ck/utility/filter_tuple.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +using device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_f32_mem_inter_instances = std::vector, + NHWGK, + F32, + F32, + Tuple, + F32, + PassThrough, + PassThrough, + BiasNormalizeInInferClamp>>>; + +// Compilation parameters for in[n, hi, wi, g, c] * wei[g, k, y, x, c] = out[n, ho, wo, g, k] +template +void add_device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_f32_mem_inter_instances_shard(device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_f32_mem_inter_instances& instances) +{ + add_device_operation_instances(instances, ck::util::filter_tuple_by_modulo_t< + device_grouped_conv_fwd_xdl_f32_mem_instances<2, + NHWGC, + GKYXC, Tuple, + NHWGK, + ConvFwdDefault, + Interwave,Tuple, BiasNormalizeInInferClamp>, + Shards, + ShardIndex>{}); + + add_device_operation_instances(instances, ck::util::filter_tuple_by_modulo_t< + device_grouped_conv_fwd_xdl_f32_mem_instances<2, + NHWGC, + GKYXC, Tuple, + NHWGK, + ConvFwd1x1P0, + Interwave,Tuple, BiasNormalizeInInferClamp>, + Shards, + ShardIndex>{}); + + add_device_operation_instances(instances, ck::util::filter_tuple_by_modulo_t< + device_grouped_conv_fwd_xdl_f32_mem_instances<2, + NHWGC, + GKYXC, Tuple, + NHWGK, + ConvFwd1x1S1P0, + Interwave,Tuple, BiasNormalizeInInferClamp>, + Shards, + ShardIndex>{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd_bias_bnorm_clamp/xdl/mem/device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_f32_mem_intra_instance.in b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd_bias_bnorm_clamp/xdl/mem/device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_f32_mem_intra_instance.in new file mode 100644 index 0000000000..47135a2dd7 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd_bias_bnorm_clamp/xdl/mem/device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_f32_mem_intra_instance.in @@ -0,0 +1,63 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved. + +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_mem_instance.hpp" +#include "ck/utility/filter_tuple.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +using device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_f32_mem_intra_instances = std::vector, + NHWGK, + F32, + F32, + Tuple, + F32, + PassThrough, + PassThrough, + BiasNormalizeInInferClamp>>>; + +// Compilation parameters for in[n, hi, wi, g, c] * wei[g, k, y, x, c] = out[n, ho, wo, g, k] +template +void add_device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_f32_mem_intra_instances_shard(device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_f32_mem_intra_instances& instances) +{ + add_device_operation_instances(instances, ck::util::filter_tuple_by_modulo_t< + device_grouped_conv_fwd_xdl_f32_mem_instances<2, + NHWGC, + GKYXC, Tuple, + NHWGK, + ConvFwdDefault, + Intrawave,Tuple, BiasNormalizeInInferClamp>, + Shards, + ShardIndex>{}); + + add_device_operation_instances(instances, ck::util::filter_tuple_by_modulo_t< + device_grouped_conv_fwd_xdl_f32_mem_instances<2, + NHWGC, + GKYXC, Tuple, + NHWGK, + ConvFwd1x1P0, + Intrawave,Tuple, BiasNormalizeInInferClamp>, + Shards, + ShardIndex>{}); + + add_device_operation_instances(instances, ck::util::filter_tuple_by_modulo_t< + device_grouped_conv_fwd_xdl_f32_mem_instances<2, + NHWGC, + GKYXC, Tuple, + NHWGK, + ConvFwd1x1S1P0, + Intrawave,Tuple, BiasNormalizeInInferClamp>, + Shards, + ShardIndex>{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd_bias_bnorm_clamp/xdl/merged_groups/device_grouped_conv2d_fwd_bias_bn_clamp_xdl_merged_groups_nhwgc_gkyxc_nhwgk_bf16_instance.in b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd_bias_bnorm_clamp/xdl/merged_groups/device_grouped_conv2d_fwd_bias_bn_clamp_xdl_merged_groups_nhwgc_gkyxc_nhwgk_bf16_instance.in new file mode 100644 index 0000000000..3e08e9668f --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd_bias_bnorm_clamp/xdl/merged_groups/device_grouped_conv2d_fwd_bias_bn_clamp_xdl_merged_groups_nhwgc_gkyxc_nhwgk_bf16_instance.in @@ -0,0 +1,79 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved. + +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_merged_groups_instance.hpp" +#include "ck/host_utility/device_prop.hpp" +#include "ck/utility/filter_tuple.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +using device_grouped_conv2d_fwd_bias_bn_clamp_xdl_merged_groups_nhwgc_gkyxc_nhwgk_bf16_instances = std::vector, + NHWGK, + BF16, + BF16, + Tuple, + BF16, + PassThrough, + PassThrough, + BiasNormalizeInInferClamp>>>; + +// Compilation parameters for in[n, hi, wi, g, c] * wei[g, k, y, x, c] = out[n, ho, wo, g, k] +template +void add_device_grouped_conv2d_fwd_bias_bn_clamp_xdl_merged_groups_nhwgc_gkyxc_nhwgk_bf16_instances_shard(device_grouped_conv2d_fwd_bias_bn_clamp_xdl_merged_groups_nhwgc_gkyxc_nhwgk_bf16_instances& instances) +{ + if(ck::get_device_name() == "gfx950") + { + add_device_operation_instances( + instances, ck::util::filter_tuple_by_modulo_t< + device_grouped_conv_fwd_xdl_merged_groups_bf16_instances_2x<2, + NHWGC, + GKYXC, Tuple, + NHWGK, + ConvFwdDefault,Tuple, BiasNormalizeInInferClamp>, + Shards, + ShardIndex>{}); + + add_device_operation_instances( + instances, ck::util::filter_tuple_by_modulo_t< + device_grouped_conv_fwd_xdl_merged_groups_bf16_instances_2x<2, + NHWGC, + GKYXC, Tuple, + NHWGK, + ConvFwd3x3,Tuple, BiasNormalizeInInferClamp>, + Shards, + ShardIndex>{}); + } + else + { + add_device_operation_instances( + instances, ck::util::filter_tuple_by_modulo_t< + device_grouped_conv_fwd_xdl_merged_groups_bf16_instances<2, + NHWGC, + GKYXC, Tuple, + NHWGK, + ConvFwdDefault,Tuple, BiasNormalizeInInferClamp>, + Shards, + ShardIndex>{}); + + add_device_operation_instances( + instances, ck::util::filter_tuple_by_modulo_t< + device_grouped_conv_fwd_xdl_merged_groups_bf16_instances<2, + NHWGC, + GKYXC, Tuple, + NHWGK, + ConvFwd3x3,Tuple, BiasNormalizeInInferClamp>, + Shards, + ShardIndex>{}); + } +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd_bias_bnorm_clamp/xdl/merged_groups/device_grouped_conv2d_fwd_bias_bn_clamp_xdl_merged_groups_nhwgc_gkyxc_nhwgk_f16_instance.in b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd_bias_bnorm_clamp/xdl/merged_groups/device_grouped_conv2d_fwd_bias_bn_clamp_xdl_merged_groups_nhwgc_gkyxc_nhwgk_f16_instance.in new file mode 100644 index 0000000000..ec76a8e1d1 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd_bias_bnorm_clamp/xdl/merged_groups/device_grouped_conv2d_fwd_bias_bn_clamp_xdl_merged_groups_nhwgc_gkyxc_nhwgk_f16_instance.in @@ -0,0 +1,79 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved. + +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_merged_groups_instance.hpp" +#include "ck/host_utility/device_prop.hpp" +#include "ck/utility/filter_tuple.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +using device_grouped_conv2d_fwd_bias_bn_clamp_xdl_merged_groups_nhwgc_gkyxc_nhwgk_f16_instances = std::vector, + NHWGK, + F16, + F16, + Tuple, + F16, + PassThrough, + PassThrough, + BiasNormalizeInInferClamp>>>; + +// Compilation parameters for in[n, hi, wi, g, c] * wei[g, k, y, x, c] = out[n, ho, wo, g, k] +template +void add_device_grouped_conv2d_fwd_bias_bn_clamp_xdl_merged_groups_nhwgc_gkyxc_nhwgk_f16_instances_shard(device_grouped_conv2d_fwd_bias_bn_clamp_xdl_merged_groups_nhwgc_gkyxc_nhwgk_f16_instances& instances) +{ + if(ck::get_device_name() == "gfx950") + { + add_device_operation_instances( + instances, ck::util::filter_tuple_by_modulo_t< + device_grouped_conv_fwd_xdl_merged_groups_f16_instances_2x<2, + NHWGC, + GKYXC, Tuple, + NHWGK, + ConvFwdDefault,Tuple, BiasNormalizeInInferClamp>, + Shards, + ShardIndex>{}); + + add_device_operation_instances( + instances, ck::util::filter_tuple_by_modulo_t< + device_grouped_conv_fwd_xdl_merged_groups_f16_instances_2x<2, + NHWGC, + GKYXC, Tuple, + NHWGK, + ConvFwd3x3,Tuple, BiasNormalizeInInferClamp>, + Shards, + ShardIndex>{}); + } + else + { + add_device_operation_instances( + instances, ck::util::filter_tuple_by_modulo_t< + device_grouped_conv_fwd_xdl_merged_groups_f16_instances<2, + NHWGC, + GKYXC, Tuple, + NHWGK, + ConvFwdDefault,Tuple, BiasNormalizeInInferClamp>, + Shards, + ShardIndex>{}); + + add_device_operation_instances( + instances, ck::util::filter_tuple_by_modulo_t< + device_grouped_conv_fwd_xdl_merged_groups_f16_instances<2, + NHWGC, + GKYXC, Tuple, + NHWGK, + ConvFwd3x3,Tuple, BiasNormalizeInInferClamp>, + Shards, + ShardIndex>{}); + } +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd_bias_bnorm_clamp/xdl/merged_groups/device_grouped_conv2d_fwd_bias_bn_clamp_xdl_merged_groups_nhwgc_gkyxc_nhwgk_f32_instance.in b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd_bias_bnorm_clamp/xdl/merged_groups/device_grouped_conv2d_fwd_bias_bn_clamp_xdl_merged_groups_nhwgc_gkyxc_nhwgk_f32_instance.in new file mode 100644 index 0000000000..2bbac89bbe --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd_bias_bnorm_clamp/xdl/merged_groups/device_grouped_conv2d_fwd_bias_bn_clamp_xdl_merged_groups_nhwgc_gkyxc_nhwgk_f32_instance.in @@ -0,0 +1,53 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved. + +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_merged_groups_instance.hpp" +#include "ck/utility/filter_tuple.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +using device_grouped_conv2d_fwd_bias_bn_clamp_xdl_merged_groups_nhwgc_gkyxc_nhwgk_f32_instances = std::vector, + NHWGK, + F32, + F32, + Tuple, + F32, + PassThrough, + PassThrough, + BiasNormalizeInInferClamp>>>; + +// Compilation parameters for in[n, hi, wi, g, c] * wei[g, k, y, x, c] = out[n, ho, wo, g, k] +template +void add_device_grouped_conv2d_fwd_bias_bn_clamp_xdl_merged_groups_nhwgc_gkyxc_nhwgk_f32_instances_shard(device_grouped_conv2d_fwd_bias_bn_clamp_xdl_merged_groups_nhwgc_gkyxc_nhwgk_f32_instances& instances) +{ + add_device_operation_instances( + instances, ck::util::filter_tuple_by_modulo_t< + device_grouped_conv_fwd_xdl_merged_groups_f32_instances<2, + NHWGC, + GKYXC, Tuple, + NHWGK, + ConvFwdDefault,Tuple, BiasNormalizeInInferClamp>, + Shards, + ShardIndex>{}); + + add_device_operation_instances( + instances, ck::util::filter_tuple_by_modulo_t< + device_grouped_conv_fwd_xdl_merged_groups_f32_instances<2, + NHWGC, + GKYXC, Tuple, + NHWGK, + ConvFwd3x3,Tuple, BiasNormalizeInInferClamp>, + Shards, + ShardIndex>{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd_bias_bnorm_clamp/CMakeLists.txt b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd_bias_bnorm_clamp/CMakeLists.txt new file mode 100644 index 0000000000..bda9149227 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd_bias_bnorm_clamp/CMakeLists.txt @@ -0,0 +1,240 @@ +# ONLY XDL_KERNELS +set(GROUPED_conv3d_FWD_BIAS_BNORM_CLAMP) +include(ShardInstantiation) + + +set(GENERATED_DIR ${CMAKE_CURRENT_BINARY_DIR}/generated) +generate_sharded_instantiations( + INSTANCES_NAME device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_instances + TEMPLATE_FILE xdl/device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_instance.in + NUM_SHARDS 16 + SRC_LIST GROUPED_conv3d_FWD_BIAS_BNORM_CLAMP + OUTPUT_DIR ${GENERATED_DIR}/xdl +) + +set(GENERATED_DIR ${CMAKE_CURRENT_BINARY_DIR}/generated) +generate_sharded_instantiations( + INSTANCES_NAME device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_f16_instances + TEMPLATE_FILE xdl/device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_f16_instance.in + NUM_SHARDS 16 + SRC_LIST GROUPED_conv3d_FWD_BIAS_BNORM_CLAMP + OUTPUT_DIR ${GENERATED_DIR}/xdl +) + +set(GENERATED_DIR ${CMAKE_CURRENT_BINARY_DIR}/generated) +generate_sharded_instantiations( + INSTANCES_NAME device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_f32_instances + TEMPLATE_FILE xdl/device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_f32_instance.in + NUM_SHARDS 16 + SRC_LIST GROUPED_conv3d_FWD_BIAS_BNORM_CLAMP + OUTPUT_DIR ${GENERATED_DIR}/xdl +) + +set(GENERATED_DIR ${CMAKE_CURRENT_BINARY_DIR}/generated) +generate_sharded_instantiations( + INSTANCES_NAME device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_16x16_instances + TEMPLATE_FILE xdl/device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_16x16_instance.in + NUM_SHARDS 4 + SRC_LIST GROUPED_conv3d_FWD_BIAS_BNORM_CLAMP + OUTPUT_DIR ${GENERATED_DIR}/xdl +) + +set(GENERATED_DIR ${CMAKE_CURRENT_BINARY_DIR}/generated) +generate_sharded_instantiations( + INSTANCES_NAME device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_f16_16x16_instances + TEMPLATE_FILE xdl/device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_f16_16x16_instance.in + NUM_SHARDS 4 + SRC_LIST GROUPED_conv3d_FWD_BIAS_BNORM_CLAMP + OUTPUT_DIR ${GENERATED_DIR}/xdl +) + +set(GENERATED_DIR ${CMAKE_CURRENT_BINARY_DIR}/generated) +generate_sharded_instantiations( + INSTANCES_NAME device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_f32_16x16_instances + TEMPLATE_FILE xdl/device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_f32_16x16_instance.in + NUM_SHARDS 3 + SRC_LIST GROUPED_conv3d_FWD_BIAS_BNORM_CLAMP + OUTPUT_DIR ${GENERATED_DIR}/xdl +) + # large tensor + # NDHWGC, GKZYXC, NDHWGK + +set(GENERATED_DIR ${CMAKE_CURRENT_BINARY_DIR}/generated) +generate_sharded_instantiations( + INSTANCES_NAME device_grouped_conv3d_fwd_bias_bn_clamp_xdl_large_tensor_ndhwgc_gkzyxc_ndhwgk_bf16_instances + TEMPLATE_FILE xdl/large_tensor/device_grouped_conv3d_fwd_bias_bn_clamp_xdl_large_tensor_ndhwgc_gkzyxc_ndhwgk_bf16_instance.in + NUM_SHARDS 3 + SRC_LIST GROUPED_conv3d_FWD_BIAS_BNORM_CLAMP + OUTPUT_DIR ${GENERATED_DIR}/xdl/large_tensor +) + +set(GENERATED_DIR ${CMAKE_CURRENT_BINARY_DIR}/generated) +generate_sharded_instantiations( + INSTANCES_NAME device_grouped_conv3d_fwd_bias_bn_clamp_xdl_large_tensor_ndhwgc_gkzyxc_ndhwgk_f16_instances + TEMPLATE_FILE xdl/large_tensor/device_grouped_conv3d_fwd_bias_bn_clamp_xdl_large_tensor_ndhwgc_gkzyxc_ndhwgk_f16_instance.in + NUM_SHARDS 3 + SRC_LIST GROUPED_conv3d_FWD_BIAS_BNORM_CLAMP + OUTPUT_DIR ${GENERATED_DIR}/xdl/large_tensor +) + +set(GENERATED_DIR ${CMAKE_CURRENT_BINARY_DIR}/generated) +generate_sharded_instantiations( + INSTANCES_NAME device_grouped_conv3d_fwd_bias_bn_clamp_xdl_large_tensor_ndhwgc_gkzyxc_ndhwgk_f32_instances + TEMPLATE_FILE xdl/large_tensor/device_grouped_conv3d_fwd_bias_bn_clamp_xdl_large_tensor_ndhwgc_gkzyxc_ndhwgk_f32_instance.in + NUM_SHARDS 2 + SRC_LIST GROUPED_conv3d_FWD_BIAS_BNORM_CLAMP + OUTPUT_DIR ${GENERATED_DIR}/xdl/large_tensor +) + # merged groups + # NDHWGC, GKZYXC, NDHWGK + +set(GENERATED_DIR ${CMAKE_CURRENT_BINARY_DIR}/generated) +generate_sharded_instantiations( + INSTANCES_NAME device_grouped_conv3d_fwd_bias_bn_clamp_xdl_merged_groups_ndhwgc_gkzyxc_ndhwgk_bf16_instances + TEMPLATE_FILE xdl/merged_groups/device_grouped_conv3d_fwd_bias_bn_clamp_xdl_merged_groups_ndhwgc_gkzyxc_ndhwgk_bf16_instance.in + NUM_SHARDS 3 + SRC_LIST GROUPED_conv3d_FWD_BIAS_BNORM_CLAMP + OUTPUT_DIR ${GENERATED_DIR}/xdl/merged_groups +) + +set(GENERATED_DIR ${CMAKE_CURRENT_BINARY_DIR}/generated) +generate_sharded_instantiations( + INSTANCES_NAME device_grouped_conv3d_fwd_bias_bn_clamp_xdl_merged_groups_ndhwgc_gkzyxc_ndhwgk_f16_instances + TEMPLATE_FILE xdl/merged_groups/device_grouped_conv3d_fwd_bias_bn_clamp_xdl_merged_groups_ndhwgc_gkzyxc_ndhwgk_f16_instance.in + NUM_SHARDS 3 + SRC_LIST GROUPED_conv3d_FWD_BIAS_BNORM_CLAMP + OUTPUT_DIR ${GENERATED_DIR}/xdl/merged_groups +) + +set(GENERATED_DIR ${CMAKE_CURRENT_BINARY_DIR}/generated) +generate_sharded_instantiations( + INSTANCES_NAME device_grouped_conv3d_fwd_bias_bn_clamp_xdl_merged_groups_ndhwgc_gkzyxc_ndhwgk_f32_instances + TEMPLATE_FILE xdl/merged_groups/device_grouped_conv3d_fwd_bias_bn_clamp_xdl_merged_groups_ndhwgc_gkzyxc_ndhwgk_f32_instance.in + NUM_SHARDS 3 + SRC_LIST GROUPED_conv3d_FWD_BIAS_BNORM_CLAMP + OUTPUT_DIR ${GENERATED_DIR}/xdl/merged_groups +) + #mem + # NDHWGC, GKZYXC, NDHWGK + +set(GENERATED_DIR ${CMAKE_CURRENT_BINARY_DIR}/generated) +generate_sharded_instantiations( + INSTANCES_NAME device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_mem_intra_instances + TEMPLATE_FILE xdl/mem/device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_mem_intra_instance.in + NUM_SHARDS 20 + SRC_LIST GROUPED_conv3d_FWD_BIAS_BNORM_CLAMP + OUTPUT_DIR ${GENERATED_DIR}/xdl/mem +) + +set(GENERATED_DIR ${CMAKE_CURRENT_BINARY_DIR}/generated) +generate_sharded_instantiations( + INSTANCES_NAME device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_f16_mem_intra_instances + TEMPLATE_FILE xdl/mem/device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_f16_mem_intra_instance.in + NUM_SHARDS 20 + SRC_LIST GROUPED_conv3d_FWD_BIAS_BNORM_CLAMP + OUTPUT_DIR ${GENERATED_DIR}/xdl/mem +) + +set(GENERATED_DIR ${CMAKE_CURRENT_BINARY_DIR}/generated) +generate_sharded_instantiations( + INSTANCES_NAME device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_f32_mem_intra_instances + TEMPLATE_FILE xdl/mem/device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_f32_mem_intra_instance.in + NUM_SHARDS 16 + SRC_LIST GROUPED_conv3d_FWD_BIAS_BNORM_CLAMP + OUTPUT_DIR ${GENERATED_DIR}/xdl/mem +) + # NDHWGC, GKZYXC, NDHWGK + +set(GENERATED_DIR ${CMAKE_CURRENT_BINARY_DIR}/generated) +generate_sharded_instantiations( + INSTANCES_NAME device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_mem_inter_instances + TEMPLATE_FILE xdl/mem/device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_mem_inter_instance.in + NUM_SHARDS 20 + SRC_LIST GROUPED_conv3d_FWD_BIAS_BNORM_CLAMP + OUTPUT_DIR ${GENERATED_DIR}/xdl/mem +) + +set(GENERATED_DIR ${CMAKE_CURRENT_BINARY_DIR}/generated) +generate_sharded_instantiations( + INSTANCES_NAME device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_f16_mem_inter_instances + TEMPLATE_FILE xdl/mem/device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_f16_mem_inter_instance.in + NUM_SHARDS 20 + SRC_LIST GROUPED_conv3d_FWD_BIAS_BNORM_CLAMP + OUTPUT_DIR ${GENERATED_DIR}/xdl/mem +) + +set(GENERATED_DIR ${CMAKE_CURRENT_BINARY_DIR}/generated) +generate_sharded_instantiations( + INSTANCES_NAME device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_f32_mem_inter_instances + TEMPLATE_FILE xdl/mem/device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_f32_mem_inter_instance.in + NUM_SHARDS 16 + SRC_LIST GROUPED_conv3d_FWD_BIAS_BNORM_CLAMP + OUTPUT_DIR ${GENERATED_DIR}/xdl/mem +) + #comp + # NDHWGC, GKZYXC, NDHWGK + +set(GENERATED_DIR ${CMAKE_CURRENT_BINARY_DIR}/generated) +generate_sharded_instantiations( + INSTANCES_NAME device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_comp_instances + TEMPLATE_FILE xdl/comp/device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_comp_instance.in + NUM_SHARDS 11 + SRC_LIST GROUPED_conv3d_FWD_BIAS_BNORM_CLAMP + OUTPUT_DIR ${GENERATED_DIR}/xdl/comp +) + +set(GENERATED_DIR ${CMAKE_CURRENT_BINARY_DIR}/generated) +generate_sharded_instantiations( + INSTANCES_NAME device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_f16_comp_instances + TEMPLATE_FILE xdl/comp/device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_f16_comp_instance.in + NUM_SHARDS 1 + SRC_LIST GROUPED_conv3d_FWD_BIAS_BNORM_CLAMP + OUTPUT_DIR ${GENERATED_DIR}/xdl/comp +) + +set(GENERATED_DIR ${CMAKE_CURRENT_BINARY_DIR}/generated) +generate_sharded_instantiations( + INSTANCES_NAME device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_f32_comp_instances + TEMPLATE_FILE xdl/comp/device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_f32_comp_instance.in + NUM_SHARDS 4 + SRC_LIST GROUPED_conv3d_FWD_BIAS_BNORM_CLAMP + OUTPUT_DIR ${GENERATED_DIR}/xdl/comp +) + +set(GENERATED_DIR ${CMAKE_CURRENT_BINARY_DIR}/generated) +generate_sharded_instantiations( + INSTANCES_NAME device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_comp_2x_instances + TEMPLATE_FILE xdl/comp/device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_comp_2x_instance.in + NUM_SHARDS 1 + SRC_LIST GROUPED_conv3d_FWD_BIAS_BNORM_CLAMP + OUTPUT_DIR ${GENERATED_DIR}/xdl/comp +) + +set(GENERATED_DIR ${CMAKE_CURRENT_BINARY_DIR}/generated) +generate_sharded_instantiations( + INSTANCES_NAME device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_f16_comp_2x_instances + TEMPLATE_FILE xdl/comp/device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_f16_comp_2x_instance.in + NUM_SHARDS 1 + SRC_LIST GROUPED_conv3d_FWD_BIAS_BNORM_CLAMP + OUTPUT_DIR ${GENERATED_DIR}/xdl/comp +) + +set(GENERATED_DIR ${CMAKE_CURRENT_BINARY_DIR}/generated) +generate_sharded_instantiations( + INSTANCES_NAME device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_comp_part2_instances + TEMPLATE_FILE xdl/comp/device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_comp_part2_instance.in + NUM_SHARDS 5 + SRC_LIST GROUPED_conv3d_FWD_BIAS_BNORM_CLAMP + OUTPUT_DIR ${GENERATED_DIR}/xdl/comp +) + +set(GENERATED_DIR ${CMAKE_CURRENT_BINARY_DIR}/generated) +generate_sharded_instantiations( + INSTANCES_NAME device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_f16_comp_part2_instances + TEMPLATE_FILE xdl/comp/device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_f16_comp_part2_instance.in + NUM_SHARDS 12 + SRC_LIST GROUPED_conv3d_FWD_BIAS_BNORM_CLAMP + OUTPUT_DIR ${GENERATED_DIR}/xdl/comp +) + +add_instance_library(device_grouped_conv3d_fwd_bias_bnorm_clamp_instance ${GROUPED_conv3d_FWD_BIAS_BNORM_CLAMP}) diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd_bias_bnorm_clamp/xdl/comp/device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_comp_2x_instance.in b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd_bias_bnorm_clamp/xdl/comp/device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_comp_2x_instance.in new file mode 100644 index 0000000000..f397f0a810 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd_bias_bnorm_clamp/xdl/comp/device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_comp_2x_instance.in @@ -0,0 +1,67 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved. + +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_comp_instance.hpp" +#include "ck/host_utility/device_prop.hpp" +#include "ck/utility/filter_tuple.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +using device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_comp_2x_instances = std::vector, + NDHWGK, + BF16, + BF16, + Tuple, + BF16, + PassThrough, + PassThrough, + BiasNormalizeInInferClamp>>>; + +// Compilation parameters for in[n, hi, wi, g, c] * wei[g, k, y, x, c] = out[n, ho, wo, g, k] +template +void add_device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_comp_2x_instances_shard(device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_comp_2x_instances& instances) +{ + if(ck::get_device_name() == "gfx950") + { + add_device_operation_instances( + instances, ck::util::filter_tuple_by_modulo_t< + device_grouped_conv_fwd_xdl_bf16_comp_instances_2x<3, + NDHWGC, + GKZYXC, Tuple, + NDHWGK, + ConvFwdDefault,Tuple, BiasNormalizeInInferClamp>, + Shards, + ShardIndex>{}); + + add_device_operation_instances( + instances, ck::util::filter_tuple_by_modulo_t< + device_grouped_conv_fwd_xdl_bf16_comp_instances_2x<3, + NDHWGC, + GKZYXC, Tuple, + NDHWGK, + ConvFwd1x1P0,Tuple, BiasNormalizeInInferClamp>, + Shards, + ShardIndex>{}); + + add_device_operation_instances( + instances, ck::util::filter_tuple_by_modulo_t< + device_grouped_conv_fwd_xdl_bf16_comp_instances_2x<3, + NDHWGC, + GKZYXC, Tuple, + NDHWGK, + ConvFwd1x1S1P0,Tuple, BiasNormalizeInInferClamp>, + Shards, + ShardIndex>{}); + } +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd_bias_bnorm_clamp/xdl/comp/device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_comp_instance.in b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd_bias_bnorm_clamp/xdl/comp/device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_comp_instance.in new file mode 100644 index 0000000000..d6aa4ea964 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd_bias_bnorm_clamp/xdl/comp/device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_comp_instance.in @@ -0,0 +1,63 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved. + +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_comp_instance.hpp" +#include "ck/host_utility/device_prop.hpp" +#include "ck/utility/filter_tuple.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +using device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_comp_instances = std::vector, + NDHWGK, + BF16, + BF16, + Tuple, + BF16, + PassThrough, + PassThrough, + BiasNormalizeInInferClamp>>>; + +// Compilation parameters for in[n, hi, wi, g, c] * wei[g, k, y, x, c] = out[n, ho, wo, g, k] +template +void add_device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_comp_instances_shard(device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_comp_instances& instances) +{ + add_device_operation_instances( + instances, ck::util::filter_tuple_by_modulo_t< + device_grouped_conv_fwd_xdl_bf16_comp_instances<3, + NDHWGC, + GKZYXC, Tuple, + NDHWGK, + ConvFwdDefault,Tuple, BiasNormalizeInInferClamp>, + Shards, + ShardIndex>{}); + + add_device_operation_instances(instances, ck::util::filter_tuple_by_modulo_t< + device_grouped_conv_fwd_xdl_bf16_comp_instances<3, + NDHWGC, + GKZYXC, Tuple, + NDHWGK, + ConvFwd1x1P0,Tuple, BiasNormalizeInInferClamp>, + Shards, + ShardIndex>{}); + + add_device_operation_instances( + instances, ck::util::filter_tuple_by_modulo_t< + device_grouped_conv_fwd_xdl_bf16_comp_instances<3, + NDHWGC, + GKZYXC, Tuple, + NDHWGK, + ConvFwd1x1S1P0,Tuple, BiasNormalizeInInferClamp>, + Shards, + ShardIndex>{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd_bias_bnorm_clamp/xdl/comp/device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_comp_part2_instance.in b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd_bias_bnorm_clamp/xdl/comp/device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_comp_part2_instance.in new file mode 100644 index 0000000000..7c993f8b94 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd_bias_bnorm_clamp/xdl/comp/device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_comp_part2_instance.in @@ -0,0 +1,67 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved. + +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_comp_instance.hpp" +#include "ck/host_utility/device_prop.hpp" +#include "ck/utility/filter_tuple.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +using device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_comp_part2_instances = std::vector, + NDHWGK, + BF16, + BF16, + Tuple, + BF16, + PassThrough, + PassThrough, + BiasNormalizeInInferClamp>>>; + +// Compilation parameters for in[n, hi, wi, g, c] * wei[g, k, y, x, c] = out[n, ho, wo, g, k] +template +void add_device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_comp_part2_instances_shard(device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_comp_part2_instances& instances) +{ + if(ck::get_device_name() != "gfx950") + { + add_device_operation_instances( + instances, ck::util::filter_tuple_by_modulo_t< + device_grouped_conv_fwd_xdl_bf16_comp_instances_part2<3, + NDHWGC, + GKZYXC, Tuple, + NDHWGK, + ConvFwdDefault,Tuple, BiasNormalizeInInferClamp>, + Shards, + ShardIndex>{}); + + add_device_operation_instances( + instances, ck::util::filter_tuple_by_modulo_t< + device_grouped_conv_fwd_xdl_bf16_comp_instances_part2<3, + NDHWGC, + GKZYXC, Tuple, + NDHWGK, + ConvFwd1x1P0,Tuple, BiasNormalizeInInferClamp>, + Shards, + ShardIndex>{}); + + add_device_operation_instances( + instances, ck::util::filter_tuple_by_modulo_t< + device_grouped_conv_fwd_xdl_bf16_comp_instances_part2<3, + NDHWGC, + GKZYXC, Tuple, + NDHWGK, + ConvFwd1x1S1P0,Tuple, BiasNormalizeInInferClamp>, + Shards, + ShardIndex>{}); + } +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd_bias_bnorm_clamp/xdl/comp/device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_f16_comp_2x_instance.in b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd_bias_bnorm_clamp/xdl/comp/device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_f16_comp_2x_instance.in new file mode 100644 index 0000000000..fb41ec60f8 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd_bias_bnorm_clamp/xdl/comp/device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_f16_comp_2x_instance.in @@ -0,0 +1,67 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved. + +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_comp_instance.hpp" +#include "ck/host_utility/device_prop.hpp" +#include "ck/utility/filter_tuple.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +using device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_f16_comp_2x_instances = std::vector, + NDHWGK, + F16, + F16, + Tuple, + F16, + PassThrough, + PassThrough, + BiasNormalizeInInferClamp>>>; + +// Compilation parameters for in[n, hi, wi, g, c] * wei[g, k, y, x, c] = out[n, ho, wo, g, k] +template +void add_device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_f16_comp_2x_instances_shard(device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_f16_comp_2x_instances& instances) +{ + if(ck::get_device_name() == "gfx950") + { + add_device_operation_instances( + instances, ck::util::filter_tuple_by_modulo_t< + device_grouped_conv_fwd_xdl_f16_comp_instances_2x<3, + NDHWGC, + GKZYXC, Tuple, + NDHWGK, + ConvFwdDefault,Tuple, BiasNormalizeInInferClamp>, + Shards, + ShardIndex>{}); + + add_device_operation_instances( + instances, ck::util::filter_tuple_by_modulo_t< + device_grouped_conv_fwd_xdl_f16_comp_instances_2x<3, + NDHWGC, + GKZYXC, Tuple, + NDHWGK, + ConvFwd1x1P0,Tuple, BiasNormalizeInInferClamp>, + Shards, + ShardIndex>{}); + + add_device_operation_instances( + instances, ck::util::filter_tuple_by_modulo_t< + device_grouped_conv_fwd_xdl_f16_comp_instances_2x<3, + NDHWGC, + GKZYXC, Tuple, + NDHWGK, + ConvFwd1x1S1P0,Tuple, BiasNormalizeInInferClamp>, + Shards, + ShardIndex>{}); + } +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd_bias_bnorm_clamp/xdl/comp/device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_f16_comp_instance.in b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd_bias_bnorm_clamp/xdl/comp/device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_f16_comp_instance.in new file mode 100644 index 0000000000..e1d581e4fd --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd_bias_bnorm_clamp/xdl/comp/device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_f16_comp_instance.in @@ -0,0 +1,63 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved. + +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_comp_instance.hpp" +#include "ck/host_utility/device_prop.hpp" +#include "ck/utility/filter_tuple.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +using device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_f16_comp_instances = std::vector, + NDHWGK, + F16, + F16, + Tuple, + F16, + PassThrough, + PassThrough, + BiasNormalizeInInferClamp>>>; + +// Compilation parameters for in[n, hi, wi, g, c] * wei[g, k, y, x, c] = out[n, ho, wo, g, k] +template +void add_device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_f16_comp_instances_shard(device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_f16_comp_instances& instances) +{ + add_device_operation_instances( + instances, ck::util::filter_tuple_by_modulo_t< + device_grouped_conv_fwd_xdl_f16_comp_instances<3, + NDHWGC, + GKZYXC, Tuple, + NDHWGK, + ConvFwdDefault,Tuple, BiasNormalizeInInferClamp>, + Shards, + ShardIndex>{}); + + add_device_operation_instances(instances, ck::util::filter_tuple_by_modulo_t< + device_grouped_conv_fwd_xdl_f16_comp_instances<3, + NDHWGC, + GKZYXC, Tuple, + NDHWGK, + ConvFwd1x1P0,Tuple, BiasNormalizeInInferClamp>, + Shards, + ShardIndex>{}); + + add_device_operation_instances( + instances, ck::util::filter_tuple_by_modulo_t< + device_grouped_conv_fwd_xdl_f16_comp_instances<3, + NDHWGC, + GKZYXC, Tuple, + NDHWGK, + ConvFwd1x1S1P0,Tuple, BiasNormalizeInInferClamp>, + Shards, + ShardIndex>{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd_bias_bnorm_clamp/xdl/comp/device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_f16_comp_part2_instance.in b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd_bias_bnorm_clamp/xdl/comp/device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_f16_comp_part2_instance.in new file mode 100644 index 0000000000..99b48d51a0 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd_bias_bnorm_clamp/xdl/comp/device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_f16_comp_part2_instance.in @@ -0,0 +1,67 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved. + +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_comp_instance.hpp" +#include "ck/host_utility/device_prop.hpp" +#include "ck/utility/filter_tuple.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +using device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_f16_comp_part2_instances = std::vector, + NDHWGK, + F16, + F16, + Tuple, + F16, + PassThrough, + PassThrough, + BiasNormalizeInInferClamp>>>; + +// Compilation parameters for in[n, hi, wi, g, c] * wei[g, k, y, x, c] = out[n, ho, wo, g, k] +template +void add_device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_f16_comp_part2_instances_shard(device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_f16_comp_part2_instances& instances) +{ + if(ck::get_device_name() != "gfx950") + { + add_device_operation_instances( + instances, ck::util::filter_tuple_by_modulo_t< + device_grouped_conv_fwd_xdl_f16_comp_instances_part2<3, + NDHWGC, + GKZYXC, Tuple, + NDHWGK, + ConvFwdDefault,Tuple, BiasNormalizeInInferClamp>, + Shards, + ShardIndex>{}); + + add_device_operation_instances( + instances, ck::util::filter_tuple_by_modulo_t< + device_grouped_conv_fwd_xdl_f16_comp_instances_part2<3, + NDHWGC, + GKZYXC, Tuple, + NDHWGK, + ConvFwd1x1P0,Tuple, BiasNormalizeInInferClamp>, + Shards, + ShardIndex>{}); + + add_device_operation_instances( + instances, ck::util::filter_tuple_by_modulo_t< + device_grouped_conv_fwd_xdl_f16_comp_instances_part2<3, + NDHWGC, + GKZYXC, Tuple, + NDHWGK, + ConvFwd1x1S1P0,Tuple, BiasNormalizeInInferClamp>, + Shards, + ShardIndex>{}); + } +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd_bias_bnorm_clamp/xdl/comp/device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_f32_comp_instance.in b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd_bias_bnorm_clamp/xdl/comp/device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_f32_comp_instance.in new file mode 100644 index 0000000000..b172975635 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd_bias_bnorm_clamp/xdl/comp/device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_f32_comp_instance.in @@ -0,0 +1,62 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved. + +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_comp_instance.hpp" +#include "ck/utility/filter_tuple.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +using device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_f32_comp_instances = std::vector, + NDHWGK, + F32, + F32, + Tuple, + F32, + PassThrough, + PassThrough, + BiasNormalizeInInferClamp>>>; + +// Compilation parameters for in[n, hi, wi, g, c] * wei[g, k, y, x, c] = out[n, ho, wo, g, k] +template +void add_device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_f32_comp_instances_shard(device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_f32_comp_instances& instances) +{ + add_device_operation_instances( + instances, ck::util::filter_tuple_by_modulo_t< + device_grouped_conv_fwd_xdl_f32_comp_instances<3, + NDHWGC, + GKZYXC, Tuple, + NDHWGK, + ConvFwdDefault,Tuple, BiasNormalizeInInferClamp>, + Shards, + ShardIndex>{}); + + add_device_operation_instances(instances, ck::util::filter_tuple_by_modulo_t< + device_grouped_conv_fwd_xdl_f32_comp_instances<3, + NDHWGC, + GKZYXC, Tuple, + NDHWGK, + ConvFwd1x1P0,Tuple, BiasNormalizeInInferClamp>, + Shards, + ShardIndex>{}); + + add_device_operation_instances( + instances, ck::util::filter_tuple_by_modulo_t< + device_grouped_conv_fwd_xdl_f32_comp_instances<3, + NDHWGC, + GKZYXC, Tuple, + NDHWGK, + ConvFwd1x1S1P0,Tuple, BiasNormalizeInInferClamp>, + Shards, + ShardIndex>{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd_bias_bnorm_clamp/xdl/device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_16x16_instance.in b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd_bias_bnorm_clamp/xdl/device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_16x16_instance.in new file mode 100644 index 0000000000..8ec8d9248f --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd_bias_bnorm_clamp/xdl/device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_16x16_instance.in @@ -0,0 +1,63 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved. + +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_instance.hpp" +#include "ck/utility/filter_tuple.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +using device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_16x16_instances = std::vector, + NDHWGK, + BF16, + BF16, + Tuple, + BF16, + PassThrough, + PassThrough, + BiasNormalizeInInferClamp>>>; + +// Compilation parameters for in[n, hi, wi, g, c] * wei[g, k, y, x, c] = out[n, ho, wo, g, k] +template +void add_device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_16x16_instances_shard(device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_16x16_instances& instances) +{ + add_device_operation_instances( + instances, ck::util::filter_tuple_by_modulo_t< + device_grouped_conv_fwd_xdl_bf16_16x16_instances<3, + NDHWGC, + GKZYXC, Tuple, + NDHWGK, + ConvFwdDefault,Tuple, BiasNormalizeInInferClamp>, + Shards, + ShardIndex>{}); + + add_device_operation_instances( + instances, ck::util::filter_tuple_by_modulo_t< + device_grouped_conv_fwd_xdl_bf16_16x16_instances<3, + NDHWGC, + GKZYXC, Tuple, + NDHWGK, + ConvFwd1x1P0,Tuple, BiasNormalizeInInferClamp>, + Shards, + ShardIndex>{}); + + add_device_operation_instances( + instances, ck::util::filter_tuple_by_modulo_t< + device_grouped_conv_fwd_xdl_bf16_16x16_instances<3, + NDHWGC, + GKZYXC, Tuple, + NDHWGK, + ConvFwd1x1S1P0,Tuple, BiasNormalizeInInferClamp>, + Shards, + ShardIndex>{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd_bias_bnorm_clamp/xdl/device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_instance.in b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd_bias_bnorm_clamp/xdl/device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_instance.in new file mode 100644 index 0000000000..fb5c4159fd --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd_bias_bnorm_clamp/xdl/device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_instance.in @@ -0,0 +1,60 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved. + +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_instance.hpp" +#include "ck/utility/filter_tuple.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +using device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_instances = std::vector, + NDHWGK, + BF16, + BF16, + Tuple, + BF16, + PassThrough, + PassThrough, + BiasNormalizeInInferClamp>>>; + +// Compilation parameters for in[n, hi, wi, g, c] * wei[g, k, y, x, c] = out[n, ho, wo, g, k] +template +void add_device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_instances_shard(device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_instances& instances) +{ + add_device_operation_instances(instances, ck::util::filter_tuple_by_modulo_t< + device_grouped_conv_fwd_xdl_bf16_instances<3, + NDHWGC, + GKZYXC, Tuple, + NDHWGK, + ConvFwdDefault,Tuple, BiasNormalizeInInferClamp>, + Shards, + ShardIndex>{}); + + add_device_operation_instances(instances, ck::util::filter_tuple_by_modulo_t< + device_grouped_conv_fwd_xdl_bf16_instances<3, + NDHWGC, + GKZYXC, Tuple, + NDHWGK, + ConvFwd1x1P0,Tuple, BiasNormalizeInInferClamp>, + Shards, + ShardIndex>{}); + + add_device_operation_instances(instances, ck::util::filter_tuple_by_modulo_t< + device_grouped_conv_fwd_xdl_bf16_instances<3, + NDHWGC, + GKZYXC, Tuple, + NDHWGK, + ConvFwd1x1S1P0,Tuple, BiasNormalizeInInferClamp>, + Shards, + ShardIndex>{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd_bias_bnorm_clamp/xdl/device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_f16_16x16_instance.in b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd_bias_bnorm_clamp/xdl/device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_f16_16x16_instance.in new file mode 100644 index 0000000000..a00fbf5342 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd_bias_bnorm_clamp/xdl/device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_f16_16x16_instance.in @@ -0,0 +1,62 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved. + +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_instance.hpp" +#include "ck/utility/filter_tuple.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +using device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_f16_16x16_instances = std::vector, + NDHWGK, + F16, + F16, + Tuple, + F16, + PassThrough, + PassThrough, + BiasNormalizeInInferClamp>>>; + +// Compilation parameters for in[n, hi, wi, g, c] * wei[g, k, y, x, c] = out[n, ho, wo, g, k] +template +void add_device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_f16_16x16_instances_shard(device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_f16_16x16_instances& instances) +{ + add_device_operation_instances( + instances, ck::util::filter_tuple_by_modulo_t< + device_grouped_conv_fwd_xdl_f16_16x16_instances<3, + NDHWGC, + GKZYXC, Tuple, + NDHWGK, + ConvFwdDefault,Tuple, BiasNormalizeInInferClamp>, + Shards, + ShardIndex>{}); + + add_device_operation_instances(instances, ck::util::filter_tuple_by_modulo_t< + device_grouped_conv_fwd_xdl_f16_16x16_instances<3, + NDHWGC, + GKZYXC, Tuple, + NDHWGK, + ConvFwd1x1P0,Tuple, BiasNormalizeInInferClamp>, + Shards, + ShardIndex>{}); + + add_device_operation_instances( + instances, ck::util::filter_tuple_by_modulo_t< + device_grouped_conv_fwd_xdl_f16_16x16_instances<3, + NDHWGC, + GKZYXC, Tuple, + NDHWGK, + ConvFwd1x1S1P0,Tuple, BiasNormalizeInInferClamp>, + Shards, + ShardIndex>{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd_bias_bnorm_clamp/xdl/device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_f16_instance.in b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd_bias_bnorm_clamp/xdl/device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_f16_instance.in new file mode 100644 index 0000000000..222ec0c2e0 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd_bias_bnorm_clamp/xdl/device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_f16_instance.in @@ -0,0 +1,60 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved. + +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_instance.hpp" +#include "ck/utility/filter_tuple.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +using device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_f16_instances = std::vector, + NDHWGK, + F16, + F16, + Tuple, + F16, + PassThrough, + PassThrough, + BiasNormalizeInInferClamp>>>; + +// Compilation parameters for in[n, hi, wi, g, c] * wei[g, k, y, x, c] = out[n, ho, wo, g, k] +template +void add_device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_f16_instances_shard(device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_f16_instances& instances) +{ + add_device_operation_instances(instances, ck::util::filter_tuple_by_modulo_t< + device_grouped_conv_fwd_xdl_f16_instances<3, + NDHWGC, + GKZYXC, Tuple, + NDHWGK, + ConvFwdDefault,Tuple, BiasNormalizeInInferClamp>, + Shards, + ShardIndex>{}); + + add_device_operation_instances(instances, ck::util::filter_tuple_by_modulo_t< + device_grouped_conv_fwd_xdl_f16_instances<3, + NDHWGC, + GKZYXC, Tuple, + NDHWGK, + ConvFwd1x1P0,Tuple, BiasNormalizeInInferClamp>, + Shards, + ShardIndex>{}); + + add_device_operation_instances(instances, ck::util::filter_tuple_by_modulo_t< + device_grouped_conv_fwd_xdl_f16_instances<3, + NDHWGC, + GKZYXC, Tuple, + NDHWGK, + ConvFwd1x1S1P0,Tuple, BiasNormalizeInInferClamp>, + Shards, + ShardIndex>{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd_bias_bnorm_clamp/xdl/device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_f32_16x16_instance.in b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd_bias_bnorm_clamp/xdl/device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_f32_16x16_instance.in new file mode 100644 index 0000000000..8fbedb7793 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd_bias_bnorm_clamp/xdl/device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_f32_16x16_instance.in @@ -0,0 +1,62 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved. + +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_instance.hpp" +#include "ck/utility/filter_tuple.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +using device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_f32_16x16_instances = std::vector, + NDHWGK, + F32, + F32, + Tuple, + F32, + PassThrough, + PassThrough, + BiasNormalizeInInferClamp>>>; + +// Compilation parameters for in[n, hi, wi, g, c] * wei[g, k, y, x, c] = out[n, ho, wo, g, k] +template +void add_device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_f32_16x16_instances_shard(device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_f32_16x16_instances& instances) +{ + add_device_operation_instances( + instances, ck::util::filter_tuple_by_modulo_t< + device_grouped_conv_fwd_xdl_f32_16x16_instances<3, + NDHWGC, + GKZYXC, Tuple, + NDHWGK, + ConvFwdDefault,Tuple, BiasNormalizeInInferClamp>, + Shards, + ShardIndex>{}); + + add_device_operation_instances(instances, ck::util::filter_tuple_by_modulo_t< + device_grouped_conv_fwd_xdl_f32_16x16_instances<3, + NDHWGC, + GKZYXC, Tuple, + NDHWGK, + ConvFwd1x1P0,Tuple, BiasNormalizeInInferClamp>, + Shards, + ShardIndex>{}); + + add_device_operation_instances( + instances, ck::util::filter_tuple_by_modulo_t< + device_grouped_conv_fwd_xdl_f32_16x16_instances<3, + NDHWGC, + GKZYXC, Tuple, + NDHWGK, + ConvFwd1x1S1P0,Tuple, BiasNormalizeInInferClamp>, + Shards, + ShardIndex>{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd_bias_bnorm_clamp/xdl/device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_f32_instance.in b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd_bias_bnorm_clamp/xdl/device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_f32_instance.in new file mode 100644 index 0000000000..c538d50fc9 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd_bias_bnorm_clamp/xdl/device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_f32_instance.in @@ -0,0 +1,60 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved. + +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_instance.hpp" +#include "ck/utility/filter_tuple.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +using device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_f32_instances = std::vector, + NDHWGK, + F32, + F32, + Tuple, + F32, + PassThrough, + PassThrough, + BiasNormalizeInInferClamp>>>; + +// Compilation parameters for in[n, hi, wi, g, c] * wei[g, k, y, x, c] = out[n, ho, wo, g, k] +template +void add_device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_f32_instances_shard(device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_f32_instances& instances) +{ + add_device_operation_instances(instances, ck::util::filter_tuple_by_modulo_t< + device_grouped_conv_fwd_xdl_f32_instances<3, + NDHWGC, + GKZYXC, Tuple, + NDHWGK, + ConvFwdDefault,Tuple, BiasNormalizeInInferClamp>, + Shards, + ShardIndex>{}); + + add_device_operation_instances(instances, ck::util::filter_tuple_by_modulo_t< + device_grouped_conv_fwd_xdl_f32_instances<3, + NDHWGC, + GKZYXC, Tuple, + NDHWGK, + ConvFwd1x1P0,Tuple, BiasNormalizeInInferClamp>, + Shards, + ShardIndex>{}); + + add_device_operation_instances(instances, ck::util::filter_tuple_by_modulo_t< + device_grouped_conv_fwd_xdl_f32_instances<3, + NDHWGC, + GKZYXC, Tuple, + NDHWGK, + ConvFwd1x1S1P0,Tuple, BiasNormalizeInInferClamp>, + Shards, + ShardIndex>{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd_bias_bnorm_clamp/xdl/large_tensor/device_grouped_conv3d_fwd_bias_bn_clamp_xdl_large_tensor_ndhwgc_gkzyxc_ndhwgk_bf16_instance.in b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd_bias_bnorm_clamp/xdl/large_tensor/device_grouped_conv3d_fwd_bias_bn_clamp_xdl_large_tensor_ndhwgc_gkzyxc_ndhwgk_bf16_instance.in new file mode 100644 index 0000000000..be76a48480 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd_bias_bnorm_clamp/xdl/large_tensor/device_grouped_conv3d_fwd_bias_bn_clamp_xdl_large_tensor_ndhwgc_gkzyxc_ndhwgk_bf16_instance.in @@ -0,0 +1,43 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved. + +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_large_tensor_instance.hpp" +#include "ck/utility/filter_tuple.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +using device_grouped_conv3d_fwd_bias_bn_clamp_xdl_large_tensor_ndhwgc_gkzyxc_ndhwgk_bf16_instances = std::vector, + NDHWGK, + BF16, + BF16, + Tuple, + BF16, + PassThrough, + PassThrough, + BiasNormalizeInInferClamp>>>; + +// Compilation parameters for in[n, hi, wi, g, c] * wei[g, k, y, x, c] = out[n, ho, wo, g, k] +template +void add_device_grouped_conv3d_fwd_bias_bn_clamp_xdl_large_tensor_ndhwgc_gkzyxc_ndhwgk_bf16_instances_shard(device_grouped_conv3d_fwd_bias_bn_clamp_xdl_large_tensor_ndhwgc_gkzyxc_ndhwgk_bf16_instances& instances) +{ + add_device_operation_instances( + instances, ck::util::filter_tuple_by_modulo_t< + device_grouped_conv_fwd_xdl_large_tensor_bf16_instances<3, + NDHWGC, + GKZYXC, Tuple, + NDHWGK, + ConvFwdDefault,Tuple, BiasNormalizeInInferClamp>, + Shards, + ShardIndex>{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd_bias_bnorm_clamp/xdl/large_tensor/device_grouped_conv3d_fwd_bias_bn_clamp_xdl_large_tensor_ndhwgc_gkzyxc_ndhwgk_f16_instance.in b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd_bias_bnorm_clamp/xdl/large_tensor/device_grouped_conv3d_fwd_bias_bn_clamp_xdl_large_tensor_ndhwgc_gkzyxc_ndhwgk_f16_instance.in new file mode 100644 index 0000000000..dcfdb984c2 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd_bias_bnorm_clamp/xdl/large_tensor/device_grouped_conv3d_fwd_bias_bn_clamp_xdl_large_tensor_ndhwgc_gkzyxc_ndhwgk_f16_instance.in @@ -0,0 +1,43 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved. + +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_large_tensor_instance.hpp" +#include "ck/utility/filter_tuple.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +using device_grouped_conv3d_fwd_bias_bn_clamp_xdl_large_tensor_ndhwgc_gkzyxc_ndhwgk_f16_instances = std::vector, + NDHWGK, + F16, + F16, + Tuple, + F16, + PassThrough, + PassThrough, + BiasNormalizeInInferClamp>>>; + +// Compilation parameters for in[n, hi, wi, g, c] * wei[g, k, y, x, c] = out[n, ho, wo, g, k] +template +void add_device_grouped_conv3d_fwd_bias_bn_clamp_xdl_large_tensor_ndhwgc_gkzyxc_ndhwgk_f16_instances_shard(device_grouped_conv3d_fwd_bias_bn_clamp_xdl_large_tensor_ndhwgc_gkzyxc_ndhwgk_f16_instances& instances) +{ + add_device_operation_instances( + instances, ck::util::filter_tuple_by_modulo_t< + device_grouped_conv_fwd_xdl_large_tensor_f16_instances<3, + NDHWGC, + GKZYXC, Tuple, + NDHWGK, + ConvFwdDefault,Tuple, BiasNormalizeInInferClamp>, + Shards, + ShardIndex>{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd_bias_bnorm_clamp/xdl/large_tensor/device_grouped_conv3d_fwd_bias_bn_clamp_xdl_large_tensor_ndhwgc_gkzyxc_ndhwgk_f32_instance.in b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd_bias_bnorm_clamp/xdl/large_tensor/device_grouped_conv3d_fwd_bias_bn_clamp_xdl_large_tensor_ndhwgc_gkzyxc_ndhwgk_f32_instance.in new file mode 100644 index 0000000000..ed1988cdf4 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd_bias_bnorm_clamp/xdl/large_tensor/device_grouped_conv3d_fwd_bias_bn_clamp_xdl_large_tensor_ndhwgc_gkzyxc_ndhwgk_f32_instance.in @@ -0,0 +1,43 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved. + +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_large_tensor_instance.hpp" +#include "ck/utility/filter_tuple.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +using device_grouped_conv3d_fwd_bias_bn_clamp_xdl_large_tensor_ndhwgc_gkzyxc_ndhwgk_f32_instances = std::vector, + NDHWGK, + F32, + F32, + Tuple, + F32, + PassThrough, + PassThrough, + BiasNormalizeInInferClamp>>>; + +// Compilation parameters for in[n, hi, wi, g, c] * wei[g, k, y, x, c] = out[n, ho, wo, g, k] +template +void add_device_grouped_conv3d_fwd_bias_bn_clamp_xdl_large_tensor_ndhwgc_gkzyxc_ndhwgk_f32_instances_shard(device_grouped_conv3d_fwd_bias_bn_clamp_xdl_large_tensor_ndhwgc_gkzyxc_ndhwgk_f32_instances& instances) +{ + add_device_operation_instances( + instances, ck::util::filter_tuple_by_modulo_t< + device_grouped_conv_fwd_xdl_large_tensor_f32_instances<3, + NDHWGC, + GKZYXC, Tuple, + NDHWGK, + ConvFwdDefault,Tuple, BiasNormalizeInInferClamp>, + Shards, + ShardIndex>{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd_bias_bnorm_clamp/xdl/mem/device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_mem_inter_instance.in b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd_bias_bnorm_clamp/xdl/mem/device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_mem_inter_instance.in new file mode 100644 index 0000000000..83af7e09ce --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd_bias_bnorm_clamp/xdl/mem/device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_mem_inter_instance.in @@ -0,0 +1,63 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved. + +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_mem_instance.hpp" +#include "ck/utility/filter_tuple.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +using device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_mem_inter_instances = std::vector, + NDHWGK, + BF16, + BF16, + Tuple, + BF16, + PassThrough, + PassThrough, + BiasNormalizeInInferClamp>>>; + +// Compilation parameters for in[n, hi, wi, g, c] * wei[g, k, y, x, c] = out[n, ho, wo, g, k] +template +void add_device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_mem_inter_instances_shard(device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_mem_inter_instances& instances) +{ + add_device_operation_instances(instances, ck::util::filter_tuple_by_modulo_t< + device_grouped_conv_fwd_xdl_bf16_mem_instances<3, + NDHWGC, + GKZYXC, Tuple, + NDHWGK, + ConvFwdDefault, + Interwave,Tuple, BiasNormalizeInInferClamp>, + Shards, + ShardIndex>{}); + + add_device_operation_instances(instances, ck::util::filter_tuple_by_modulo_t< + device_grouped_conv_fwd_xdl_bf16_mem_instances<3, + NDHWGC, + GKZYXC, Tuple, + NDHWGK, + ConvFwd1x1P0, + Interwave,Tuple, BiasNormalizeInInferClamp>, + Shards, + ShardIndex>{}); + + add_device_operation_instances(instances, ck::util::filter_tuple_by_modulo_t< + device_grouped_conv_fwd_xdl_bf16_mem_instances<3, + NDHWGC, + GKZYXC, Tuple, + NDHWGK, + ConvFwd1x1S1P0, + Interwave,Tuple, BiasNormalizeInInferClamp>, + Shards, + ShardIndex>{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd_bias_bnorm_clamp/xdl/mem/device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_mem_intra_instance.in b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd_bias_bnorm_clamp/xdl/mem/device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_mem_intra_instance.in new file mode 100644 index 0000000000..ce83cb566a --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd_bias_bnorm_clamp/xdl/mem/device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_mem_intra_instance.in @@ -0,0 +1,63 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved. + +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_mem_instance.hpp" +#include "ck/utility/filter_tuple.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +using device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_mem_intra_instances = std::vector, + NDHWGK, + BF16, + BF16, + Tuple, + BF16, + PassThrough, + PassThrough, + BiasNormalizeInInferClamp>>>; + +// Compilation parameters for in[n, hi, wi, g, c] * wei[g, k, y, x, c] = out[n, ho, wo, g, k] +template +void add_device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_mem_intra_instances_shard(device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_mem_intra_instances& instances) +{ + add_device_operation_instances(instances, ck::util::filter_tuple_by_modulo_t< + device_grouped_conv_fwd_xdl_bf16_mem_instances<3, + NDHWGC, + GKZYXC, Tuple, + NDHWGK, + ConvFwdDefault, + Intrawave,Tuple, BiasNormalizeInInferClamp>, + Shards, + ShardIndex>{}); + + add_device_operation_instances(instances, ck::util::filter_tuple_by_modulo_t< + device_grouped_conv_fwd_xdl_bf16_mem_instances<3, + NDHWGC, + GKZYXC, Tuple, + NDHWGK, + ConvFwd1x1P0, + Intrawave,Tuple, BiasNormalizeInInferClamp>, + Shards, + ShardIndex>{}); + + add_device_operation_instances(instances, ck::util::filter_tuple_by_modulo_t< + device_grouped_conv_fwd_xdl_bf16_mem_instances<3, + NDHWGC, + GKZYXC, Tuple, + NDHWGK, + ConvFwd1x1S1P0, + Intrawave,Tuple, BiasNormalizeInInferClamp>, + Shards, + ShardIndex>{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd_bias_bnorm_clamp/xdl/mem/device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_f16_mem_inter_instance.in b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd_bias_bnorm_clamp/xdl/mem/device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_f16_mem_inter_instance.in new file mode 100644 index 0000000000..051aaf7cf3 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd_bias_bnorm_clamp/xdl/mem/device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_f16_mem_inter_instance.in @@ -0,0 +1,63 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved. + +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_mem_instance.hpp" +#include "ck/utility/filter_tuple.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +using device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_f16_mem_inter_instances = std::vector, + NDHWGK, + F16, + F16, + Tuple, + F16, + PassThrough, + PassThrough, + BiasNormalizeInInferClamp>>>; + +// Compilation parameters for in[n, hi, wi, g, c] * wei[g, k, y, x, c] = out[n, ho, wo, g, k] +template +void add_device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_f16_mem_inter_instances_shard(device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_f16_mem_inter_instances& instances) +{ + add_device_operation_instances(instances, ck::util::filter_tuple_by_modulo_t< + device_grouped_conv_fwd_xdl_f16_mem_instances<3, + NDHWGC, + GKZYXC, Tuple, + NDHWGK, + ConvFwdDefault, + Interwave,Tuple, BiasNormalizeInInferClamp>, + Shards, + ShardIndex>{}); + + add_device_operation_instances(instances, ck::util::filter_tuple_by_modulo_t< + device_grouped_conv_fwd_xdl_f16_mem_instances<3, + NDHWGC, + GKZYXC, Tuple, + NDHWGK, + ConvFwd1x1P0, + Interwave,Tuple, BiasNormalizeInInferClamp>, + Shards, + ShardIndex>{}); + + add_device_operation_instances(instances, ck::util::filter_tuple_by_modulo_t< + device_grouped_conv_fwd_xdl_f16_mem_instances<3, + NDHWGC, + GKZYXC, Tuple, + NDHWGK, + ConvFwd1x1S1P0, + Interwave,Tuple, BiasNormalizeInInferClamp>, + Shards, + ShardIndex>{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd_bias_bnorm_clamp/xdl/mem/device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_f16_mem_intra_instance.in b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd_bias_bnorm_clamp/xdl/mem/device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_f16_mem_intra_instance.in new file mode 100644 index 0000000000..6fa3709cc6 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd_bias_bnorm_clamp/xdl/mem/device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_f16_mem_intra_instance.in @@ -0,0 +1,63 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved. + +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_mem_instance.hpp" +#include "ck/utility/filter_tuple.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +using device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_f16_mem_intra_instances = std::vector, + NDHWGK, + F16, + F16, + Tuple, + F16, + PassThrough, + PassThrough, + BiasNormalizeInInferClamp>>>; + +// Compilation parameters for in[n, hi, wi, g, c] * wei[g, k, y, x, c] = out[n, ho, wo, g, k] +template +void add_device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_f16_mem_intra_instances_shard(device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_f16_mem_intra_instances& instances) +{ + add_device_operation_instances(instances, ck::util::filter_tuple_by_modulo_t< + device_grouped_conv_fwd_xdl_f16_mem_instances<3, + NDHWGC, + GKZYXC, Tuple, + NDHWGK, + ConvFwdDefault, + Intrawave,Tuple, BiasNormalizeInInferClamp>, + Shards, + ShardIndex>{}); + + add_device_operation_instances(instances, ck::util::filter_tuple_by_modulo_t< + device_grouped_conv_fwd_xdl_f16_mem_instances<3, + NDHWGC, + GKZYXC, Tuple, + NDHWGK, + ConvFwd1x1P0, + Intrawave,Tuple, BiasNormalizeInInferClamp>, + Shards, + ShardIndex>{}); + + add_device_operation_instances(instances, ck::util::filter_tuple_by_modulo_t< + device_grouped_conv_fwd_xdl_f16_mem_instances<3, + NDHWGC, + GKZYXC, Tuple, + NDHWGK, + ConvFwd1x1S1P0, + Intrawave,Tuple, BiasNormalizeInInferClamp>, + Shards, + ShardIndex>{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd_bias_bnorm_clamp/xdl/mem/device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_f32_mem_inter_instance.in b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd_bias_bnorm_clamp/xdl/mem/device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_f32_mem_inter_instance.in new file mode 100644 index 0000000000..2ba3e4ec93 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd_bias_bnorm_clamp/xdl/mem/device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_f32_mem_inter_instance.in @@ -0,0 +1,63 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved. + +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_mem_instance.hpp" +#include "ck/utility/filter_tuple.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +using device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_f32_mem_inter_instances = std::vector, + NDHWGK, + F32, + F32, + Tuple, + F32, + PassThrough, + PassThrough, + BiasNormalizeInInferClamp>>>; + +// Compilation parameters for in[n, hi, wi, g, c] * wei[g, k, y, x, c] = out[n, ho, wo, g, k] +template +void add_device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_f32_mem_inter_instances_shard(device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_f32_mem_inter_instances& instances) +{ + add_device_operation_instances(instances, ck::util::filter_tuple_by_modulo_t< + device_grouped_conv_fwd_xdl_f32_mem_instances<3, + NDHWGC, + GKZYXC, Tuple, + NDHWGK, + ConvFwdDefault, + Interwave,Tuple, BiasNormalizeInInferClamp>, + Shards, + ShardIndex>{}); + + add_device_operation_instances(instances, ck::util::filter_tuple_by_modulo_t< + device_grouped_conv_fwd_xdl_f32_mem_instances<3, + NDHWGC, + GKZYXC, Tuple, + NDHWGK, + ConvFwd1x1P0, + Interwave,Tuple, BiasNormalizeInInferClamp>, + Shards, + ShardIndex>{}); + + add_device_operation_instances(instances, ck::util::filter_tuple_by_modulo_t< + device_grouped_conv_fwd_xdl_f32_mem_instances<3, + NDHWGC, + GKZYXC, Tuple, + NDHWGK, + ConvFwd1x1S1P0, + Interwave,Tuple, BiasNormalizeInInferClamp>, + Shards, + ShardIndex>{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd_bias_bnorm_clamp/xdl/mem/device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_f32_mem_intra_instance.in b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd_bias_bnorm_clamp/xdl/mem/device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_f32_mem_intra_instance.in new file mode 100644 index 0000000000..c4d33236af --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd_bias_bnorm_clamp/xdl/mem/device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_f32_mem_intra_instance.in @@ -0,0 +1,63 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved. + +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_mem_instance.hpp" +#include "ck/utility/filter_tuple.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +using device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_f32_mem_intra_instances = std::vector, + NDHWGK, + F32, + F32, + Tuple, + F32, + PassThrough, + PassThrough, + BiasNormalizeInInferClamp>>>; + +// Compilation parameters for in[n, hi, wi, g, c] * wei[g, k, y, x, c] = out[n, ho, wo, g, k] +template +void add_device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_f32_mem_intra_instances_shard(device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_f32_mem_intra_instances& instances) +{ + add_device_operation_instances(instances, ck::util::filter_tuple_by_modulo_t< + device_grouped_conv_fwd_xdl_f32_mem_instances<3, + NDHWGC, + GKZYXC, Tuple, + NDHWGK, + ConvFwdDefault, + Intrawave,Tuple, BiasNormalizeInInferClamp>, + Shards, + ShardIndex>{}); + + add_device_operation_instances(instances, ck::util::filter_tuple_by_modulo_t< + device_grouped_conv_fwd_xdl_f32_mem_instances<3, + NDHWGC, + GKZYXC, Tuple, + NDHWGK, + ConvFwd1x1P0, + Intrawave,Tuple, BiasNormalizeInInferClamp>, + Shards, + ShardIndex>{}); + + add_device_operation_instances(instances, ck::util::filter_tuple_by_modulo_t< + device_grouped_conv_fwd_xdl_f32_mem_instances<3, + NDHWGC, + GKZYXC, Tuple, + NDHWGK, + ConvFwd1x1S1P0, + Intrawave,Tuple, BiasNormalizeInInferClamp>, + Shards, + ShardIndex>{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd_bias_bnorm_clamp/xdl/merged_groups/device_grouped_conv3d_fwd_bias_bn_clamp_xdl_merged_groups_ndhwgc_gkzyxc_ndhwgk_bf16_instance.in b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd_bias_bnorm_clamp/xdl/merged_groups/device_grouped_conv3d_fwd_bias_bn_clamp_xdl_merged_groups_ndhwgc_gkzyxc_ndhwgk_bf16_instance.in new file mode 100644 index 0000000000..6a902ed72d --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd_bias_bnorm_clamp/xdl/merged_groups/device_grouped_conv3d_fwd_bias_bn_clamp_xdl_merged_groups_ndhwgc_gkzyxc_ndhwgk_bf16_instance.in @@ -0,0 +1,79 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved. + +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_merged_groups_instance.hpp" +#include "ck/host_utility/device_prop.hpp" +#include "ck/utility/filter_tuple.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +using device_grouped_conv3d_fwd_bias_bn_clamp_xdl_merged_groups_ndhwgc_gkzyxc_ndhwgk_bf16_instances = std::vector, + NDHWGK, + BF16, + BF16, + Tuple, + BF16, + PassThrough, + PassThrough, + BiasNormalizeInInferClamp>>>; + +// Compilation parameters for in[n, hi, wi, g, c] * wei[g, k, y, x, c] = out[n, ho, wo, g, k] +template +void add_device_grouped_conv3d_fwd_bias_bn_clamp_xdl_merged_groups_ndhwgc_gkzyxc_ndhwgk_bf16_instances_shard(device_grouped_conv3d_fwd_bias_bn_clamp_xdl_merged_groups_ndhwgc_gkzyxc_ndhwgk_bf16_instances& instances) +{ + if(ck::get_device_name() == "gfx950") + { + add_device_operation_instances( + instances, ck::util::filter_tuple_by_modulo_t< + device_grouped_conv_fwd_xdl_merged_groups_bf16_instances_2x<3, + NDHWGC, + GKZYXC, Tuple, + NDHWGK, + ConvFwdDefault,Tuple, BiasNormalizeInInferClamp>, + Shards, + ShardIndex>{}); + + add_device_operation_instances( + instances, ck::util::filter_tuple_by_modulo_t< + device_grouped_conv_fwd_xdl_merged_groups_bf16_instances_2x<3, + NDHWGC, + GKZYXC, Tuple, + NDHWGK, + ConvFwd3x3,Tuple, BiasNormalizeInInferClamp>, + Shards, + ShardIndex>{}); + } + else + { + add_device_operation_instances( + instances, ck::util::filter_tuple_by_modulo_t< + device_grouped_conv_fwd_xdl_merged_groups_bf16_instances<3, + NDHWGC, + GKZYXC, Tuple, + NDHWGK, + ConvFwdDefault,Tuple, BiasNormalizeInInferClamp>, + Shards, + ShardIndex>{}); + + add_device_operation_instances( + instances, ck::util::filter_tuple_by_modulo_t< + device_grouped_conv_fwd_xdl_merged_groups_bf16_instances<3, + NDHWGC, + GKZYXC, Tuple, + NDHWGK, + ConvFwd3x3,Tuple, BiasNormalizeInInferClamp>, + Shards, + ShardIndex>{}); + } +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd_bias_bnorm_clamp/xdl/merged_groups/device_grouped_conv3d_fwd_bias_bn_clamp_xdl_merged_groups_ndhwgc_gkzyxc_ndhwgk_f16_instance.in b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd_bias_bnorm_clamp/xdl/merged_groups/device_grouped_conv3d_fwd_bias_bn_clamp_xdl_merged_groups_ndhwgc_gkzyxc_ndhwgk_f16_instance.in new file mode 100644 index 0000000000..b8125423bc --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd_bias_bnorm_clamp/xdl/merged_groups/device_grouped_conv3d_fwd_bias_bn_clamp_xdl_merged_groups_ndhwgc_gkzyxc_ndhwgk_f16_instance.in @@ -0,0 +1,79 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved. + +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_merged_groups_instance.hpp" +#include "ck/host_utility/device_prop.hpp" +#include "ck/utility/filter_tuple.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +using device_grouped_conv3d_fwd_bias_bn_clamp_xdl_merged_groups_ndhwgc_gkzyxc_ndhwgk_f16_instances = std::vector, + NDHWGK, + F16, + F16, + Tuple, + F16, + PassThrough, + PassThrough, + BiasNormalizeInInferClamp>>>; + +// Compilation parameters for in[n, hi, wi, g, c] * wei[g, k, y, x, c] = out[n, ho, wo, g, k] +template +void add_device_grouped_conv3d_fwd_bias_bn_clamp_xdl_merged_groups_ndhwgc_gkzyxc_ndhwgk_f16_instances_shard(device_grouped_conv3d_fwd_bias_bn_clamp_xdl_merged_groups_ndhwgc_gkzyxc_ndhwgk_f16_instances& instances) +{ + if(ck::get_device_name() == "gfx950") + { + add_device_operation_instances( + instances, ck::util::filter_tuple_by_modulo_t< + device_grouped_conv_fwd_xdl_merged_groups_f16_instances_2x<3, + NDHWGC, + GKZYXC, Tuple, + NDHWGK, + ConvFwdDefault,Tuple, BiasNormalizeInInferClamp>, + Shards, + ShardIndex>{}); + + add_device_operation_instances( + instances, ck::util::filter_tuple_by_modulo_t< + device_grouped_conv_fwd_xdl_merged_groups_f16_instances_2x<3, + NDHWGC, + GKZYXC, Tuple, + NDHWGK, + ConvFwd3x3,Tuple, BiasNormalizeInInferClamp>, + Shards, + ShardIndex>{}); + } + else + { + add_device_operation_instances( + instances, ck::util::filter_tuple_by_modulo_t< + device_grouped_conv_fwd_xdl_merged_groups_f16_instances<3, + NDHWGC, + GKZYXC, Tuple, + NDHWGK, + ConvFwdDefault,Tuple, BiasNormalizeInInferClamp>, + Shards, + ShardIndex>{}); + + add_device_operation_instances( + instances, ck::util::filter_tuple_by_modulo_t< + device_grouped_conv_fwd_xdl_merged_groups_f16_instances<3, + NDHWGC, + GKZYXC, Tuple, + NDHWGK, + ConvFwd3x3,Tuple, BiasNormalizeInInferClamp>, + Shards, + ShardIndex>{}); + } +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd_bias_bnorm_clamp/xdl/merged_groups/device_grouped_conv3d_fwd_bias_bn_clamp_xdl_merged_groups_ndhwgc_gkzyxc_ndhwgk_f32_instance.in b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd_bias_bnorm_clamp/xdl/merged_groups/device_grouped_conv3d_fwd_bias_bn_clamp_xdl_merged_groups_ndhwgc_gkzyxc_ndhwgk_f32_instance.in new file mode 100644 index 0000000000..f292d95cda --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd_bias_bnorm_clamp/xdl/merged_groups/device_grouped_conv3d_fwd_bias_bn_clamp_xdl_merged_groups_ndhwgc_gkzyxc_ndhwgk_f32_instance.in @@ -0,0 +1,53 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved. + +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_merged_groups_instance.hpp" +#include "ck/utility/filter_tuple.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +using device_grouped_conv3d_fwd_bias_bn_clamp_xdl_merged_groups_ndhwgc_gkzyxc_ndhwgk_f32_instances = std::vector, + NDHWGK, + F32, + F32, + Tuple, + F32, + PassThrough, + PassThrough, + BiasNormalizeInInferClamp>>>; + +// Compilation parameters for in[n, hi, wi, g, c] * wei[g, k, y, x, c] = out[n, ho, wo, g, k] +template +void add_device_grouped_conv3d_fwd_bias_bn_clamp_xdl_merged_groups_ndhwgc_gkzyxc_ndhwgk_f32_instances_shard(device_grouped_conv3d_fwd_bias_bn_clamp_xdl_merged_groups_ndhwgc_gkzyxc_ndhwgk_f32_instances& instances) +{ + add_device_operation_instances( + instances, ck::util::filter_tuple_by_modulo_t< + device_grouped_conv_fwd_xdl_merged_groups_f32_instances<3, + NDHWGC, + GKZYXC, Tuple, + NDHWGK, + ConvFwdDefault,Tuple, BiasNormalizeInInferClamp>, + Shards, + ShardIndex>{}); + + add_device_operation_instances( + instances, ck::util::filter_tuple_by_modulo_t< + device_grouped_conv_fwd_xdl_merged_groups_f32_instances<3, + NDHWGC, + GKZYXC, Tuple, + NDHWGK, + ConvFwd3x3,Tuple, BiasNormalizeInInferClamp>, + Shards, + ShardIndex>{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/profiler/include/profiler/profile_grouped_conv_fwd_bias_bnorm_clamp_impl.hpp b/profiler/include/profiler/profile_grouped_conv_fwd_bias_bnorm_clamp_impl.hpp new file mode 100644 index 0000000000..cd6c141219 --- /dev/null +++ b/profiler/include/profiler/profile_grouped_conv_fwd_bias_bnorm_clamp_impl.hpp @@ -0,0 +1,427 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +#include +#include +#include + +#include "ck/ck.hpp" +#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +#include "ck/library/tensor_operation_instance/gpu/grouped_convolution_forward_bias_bnorm_clamp.hpp" + +#include "ck/library/utility/algorithm.hpp" +#include "ck/library/utility/check_err.hpp" +#include "ck/library/utility/device_memory.hpp" +#include "ck/library/utility/host_tensor.hpp" +#include "ck/library/utility/host_tensor_generator.hpp" +#include "ck/library/utility/convolution_parameter.hpp" +#include "ck/library/utility/convolution_host_tensor_descriptor_helper.hpp" +#include "ck/library/reference_tensor_operation/cpu/reference_conv_fwd.hpp" +#include "ck/library/reference_tensor_operation/cpu/reference_batchnorm_infer.hpp" + +namespace ck { +namespace profiler { + +using InElementOp = ck::tensor_operation::element_wise::PassThrough; +using WeiElementOp = ck::tensor_operation::element_wise::PassThrough; +using OutElementOp = ck::tensor_operation::element_wise::BiasNormalizeInInferClamp; +using Clamp = ck::tensor_operation::element_wise::Clamp; +using Add = ck::tensor_operation::element_wise::Add; + +// NOTE: Usage of NHWGK layout for GK bias is a workaround. This test is to +// just keep such implementation valid. +// TODO: Add possiblity to pass GK layout and GK lengths for bias and reuse +// the same instances. + +template +auto get_elementwise_desc(ck::index_t G, ck::index_t K) +{ + if constexpr(NDimSpatial == 1) + { + return HostTensorDescriptor({G, 1, K, 1}, {K, 0, 1, 0}); + } + else if constexpr(NDimSpatial == 2) + { + return HostTensorDescriptor({G, 1, K, 1, 1}, {K, 0, 1, 0, 0}); + } + else + { + return HostTensorDescriptor({G, 1, K, 1, 1, 1}, {K, 0, 1, 0, 0, 0}); + } +} + +template +void ref_bnorm_clamp_infer(Tensor& out, + Tensor& in, + Tensor& mean, + Tensor& variance, + Tensor& scale, + Tensor& shift, + const float floor, + const float ceil, + const float epsilon) +{ + + auto func = [&](auto... idxs) { + const float x = type_convert(in(idxs...)); + + const float invVariance = + type_convert(1.0f) / std::sqrt(epsilon + type_convert(variance(idxs...))); + + const float norm_x = (x - type_convert(mean(idxs...))) * invVariance; + + float y = + type_convert(scale(idxs...)) * norm_x + type_convert(shift(idxs...)); + + Clamp{floor, ceil}(y, y); + + out(idxs...) = type_convert(y); + }; + if constexpr(NDimSpatial == 1) + { + make_ParallelTensorFunctor(func, + out.GetLengths()[0], + out.GetLengths()[1], + out.GetLengths()[2], + out.GetLengths()[3])(std::thread::hardware_concurrency()); + } + else if constexpr(NDimSpatial == 2) + { + make_ParallelTensorFunctor(func, + out.GetLengths()[0], + out.GetLengths()[1], + out.GetLengths()[2], + out.GetLengths()[3], + out.GetLengths()[4])(std::thread::hardware_concurrency()); + } + else + { + make_ParallelTensorFunctor(func, + out.GetLengths()[0], + out.GetLengths()[1], + out.GetLengths()[2], + out.GetLengths()[3], + out.GetLengths()[4], + out.GetLengths()[5])(std::thread::hardware_concurrency()); + } +} + +template +bool profile_grouped_conv_fwd_bias_clamp_impl(int do_verification, + int init_method, + bool do_log, + bool time_kernel, + const ck::utils::conv::ConvParam& conv_param) +{ + const float floor = 0.f; + const float ceil = 2048.f; + const float epsilon = 1e-4; + + const auto in_element_op = InElementOp{}; + const auto wei_element_op = WeiElementOp{}; + const auto out_element_op = OutElementOp{floor, ceil, epsilon}; + + const auto in_g_n_c_wis_desc = + ck::utils::conv::make_input_host_tensor_descriptor_g_n_c_wis_packed(conv_param); + + const auto wei_g_k_c_xs_desc = + ck::utils::conv::make_weight_host_tensor_descriptor_g_k_c_xs_packed(conv_param); + + const auto out_g_n_k_wos_desc = + ck::utils::conv::make_output_host_tensor_descriptor_g_n_k_wos_packed(conv_param); + + const index_t G = conv_param.G_; + const index_t K = conv_param.K_; + + std::array a_g_n_c_wis_lengths{}; + std::array a_g_n_c_wis_strides{}; + std::array b_g_k_c_xs_lengths{}; + std::array b_g_k_c_xs_strides{}; + std::array e_g_n_k_wos_lengths{}; + std::array e_g_n_k_wos_strides{}; + std::array d_g_n_k_wos_strides{}; + std::array conv_filter_strides{}; + std::array conv_filter_dilations{}; + std::array input_left_pads{}; + std::array input_right_pads{}; + + auto copy = [](const auto& x, auto& y) { ck::ranges::copy(x, y.begin()); }; + + copy(in_g_n_c_wis_desc.GetLengths(), a_g_n_c_wis_lengths); + copy(in_g_n_c_wis_desc.GetStrides(), a_g_n_c_wis_strides); + copy(wei_g_k_c_xs_desc.GetLengths(), b_g_k_c_xs_lengths); + copy(wei_g_k_c_xs_desc.GetStrides(), b_g_k_c_xs_strides); + copy(out_g_n_k_wos_desc.GetLengths(), e_g_n_k_wos_lengths); + copy(out_g_n_k_wos_desc.GetStrides(), e_g_n_k_wos_strides); + copy(out_g_n_k_wos_desc.GetStrides(), d_g_n_k_wos_strides); + copy(conv_param.conv_filter_strides_, conv_filter_strides); + copy(conv_param.conv_filter_dilations_, conv_filter_dilations); + copy(conv_param.input_left_pads_, input_left_pads); + copy(conv_param.input_right_pads_, input_right_pads); + + Tensor input(in_g_n_c_wis_desc); + Tensor weight(wei_g_k_c_xs_desc); + Tensor host_output(out_g_n_k_wos_desc); + Tensor device_output(out_g_n_k_wos_desc); + const auto elementwise_desc = + ElementwiseGK ? get_elementwise_desc(G, K) : out_g_n_k_wos_desc; + + Tensor bias(elementwise_desc); + Tensor mean(elementwise_desc); + Tensor variance(elementwise_desc); + Tensor scale(elementwise_desc); + Tensor shift(elementwise_desc); + + std::cout << "input: " << input.mDesc << std::endl; + std::cout << "weight: " << weight.mDesc << std::endl; + std::cout << "output: " << host_output.mDesc << std::endl; + + std::cout << "bias: " << bias.mDesc << std::endl; + std::cout << "mean: " << mean.mDesc << std::endl; + std::cout << "variance: " << variance.mDesc << std::endl; + std::cout << "scale: " << scale.mDesc << std::endl; + std::cout << "shift: " << shift.mDesc << std::endl; + + switch(init_method) + { + case 0: break; + case 1: + input.GenerateTensorValue(GeneratorTensor_2{-5, 5}); + weight.GenerateTensorValue(GeneratorTensor_2{-5, 5}); + + bias.GenerateTensorValue(GeneratorTensor_2{-5, 5}); + mean.GenerateTensorValue(GeneratorTensor_2{-5, 5}); + variance.GenerateTensorValue(GeneratorTensor_2{0, 5}); + scale.GenerateTensorValue(GeneratorTensor_2{-5, 5}); + shift.GenerateTensorValue(GeneratorTensor_2{-5, 5}); + break; + default: + input.GenerateTensorValue(GeneratorTensor_3{0.0, 1.0}); + weight.GenerateTensorValue(GeneratorTensor_3{-0.5, 0.5}); + + bias.GenerateTensorValue(GeneratorTensor_3{-0.5, 0.5}); + mean.GenerateTensorValue(GeneratorTensor_3{-0.5, 0.5}); + variance.GenerateTensorValue(GeneratorTensor_3{0, 0.5}); + scale.GenerateTensorValue(GeneratorTensor_3{-0.5, 0.5}); + shift.GenerateTensorValue(GeneratorTensor_3{-0.5, 0.5}); + } + + DeviceMem in_device_buf(sizeof(InDataType) * input.mDesc.GetElementSpaceSize()); + DeviceMem wei_device_buf(sizeof(WeiDataType) * weight.mDesc.GetElementSpaceSize()); + DeviceMem out_device_buf(sizeof(OutDataType) * device_output.mDesc.GetElementSpaceSize()); + + const std::size_t elementwise_dev_buf_size = + ElementwiseGK ? sizeof(OutDataType) * G * K + : sizeof(OutDataType) * device_output.mDesc.GetElementSpaceSize(); + DeviceMem bias_device_buf(elementwise_dev_buf_size); + DeviceMem mean_device_buf(elementwise_dev_buf_size); + DeviceMem variance_device_buf(elementwise_dev_buf_size); + DeviceMem scale_device_buf(elementwise_dev_buf_size); + DeviceMem shift_device_buf(elementwise_dev_buf_size); + + in_device_buf.ToDevice(input.mData.data()); + wei_device_buf.ToDevice(weight.mData.data()); + + bias_device_buf.ToDevice(bias.mData.data()); + mean_device_buf.ToDevice(mean.mData.data()); + variance_device_buf.ToDevice(variance.mData.data()); + scale_device_buf.ToDevice(scale.mData.data()); + shift_device_buf.ToDevice(shift.mData.data()); + + if constexpr(ElementwiseGK) + { + constexpr ck::index_t spatial_offset = 3; + d_g_n_k_wos_strides[1] = 0; + for(int i = 0; i < NDimSpatial; i++) + { + d_g_n_k_wos_strides[i + spatial_offset] = 0; + } + } + + // run reference op + if(do_verification) + { + // Run Conv and Bnorm seperatly + auto ref_conv = ck::tensor_operation::host::ReferenceConvFwd{}; + + std::array, 1> d_tensors = {bias}; + auto ref_conv_invoker = ref_conv.MakeInvoker(); + auto ref_conv_argument = ref_conv.MakeArgument(input, + weight, + host_output, + conv_param.conv_filter_strides_, + conv_param.conv_filter_dilations_, + conv_param.input_left_pads_, + conv_param.input_right_pads_, + in_element_op, + wei_element_op, + Add{}, + {}, + {}, + d_tensors); + + // init host output to zero + host_output.SetZero(); + ref_conv_invoker.Run(ref_conv_argument); + ref_bnorm_clamp_infer( + host_output, host_output, mean, variance, scale, shift, floor, ceil, epsilon); + } + + std::string best_op_name; + float best_avg_time = 0; + float best_tflops = 0; + float best_gb_per_sec = 0; + + // profile device op instances + bool pass = true; + + auto run_impl = [&](auto& op_ptr, auto& argument_ptr) { + // workspace_sz will be equal to 0 for other layout than NGCHW + const std::size_t workspace_sz = op_ptr->GetWorkSpaceSize(argument_ptr.get()); + DeviceMem workspace_dev(workspace_sz); + op_ptr->SetWorkSpacePointer(argument_ptr.get(), workspace_dev.GetDeviceBuffer()); + + if(op_ptr->IsSupportedArgument(argument_ptr.get())) + { + // re-init output to zero before profiling next kernel + out_device_buf.SetZero(); + + std::string op_name = op_ptr->GetTypeString(); + + auto invoker_ptr = op_ptr->MakeInvokerPointer(); + + float avg_time = + invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, time_kernel}); + + std::size_t flop = conv_param.GetFlops(); + std::size_t num_btype = conv_param.GetByte(); + + float tflops = static_cast(flop) / 1.E9 / avg_time; + + float gb_per_sec = num_btype / 1.E6 / avg_time; + + std::cout << "Perf: " << std::setw(10) << avg_time << " ms, " << tflops << " TFlops, " + << gb_per_sec << " GB/s, " << op_name << std::endl; + + if(tflops > best_tflops) + { + best_op_name = op_name; + best_tflops = tflops; + best_avg_time = avg_time; + best_gb_per_sec = gb_per_sec; + } + + if(do_verification) + { + out_device_buf.FromDevice(device_output.mData.data()); + + pass = pass & ck::utils::check_err(device_output, host_output); + + if(do_log) + { + LogRangeAsType(std::cout << "input : ", input.mData, ",") << std::endl; + LogRangeAsType(std::cout << "weight: ", weight.mData, ",") << std::endl; + LogRangeAsType(std::cout << "host_output : ", host_output.mData, ",") + << std::endl; + LogRangeAsType(std::cout << "device_output: ", device_output.mData, ",") + << std::endl; + } + } + } + else + { + std::cout << op_ptr->GetTypeString() << " does not support this problem" << std::endl; + } + }; + + using DeviceOp = ck::tensor_operation::device::DeviceGroupedConvFwdMultipleABD< + NDimSpatial, + InLayout, + WeiLayout, + ck::Tuple, + OutLayout, + InDataType, + WeiDataType, + ck::Tuple, + OutDataType, + InElementOp, + WeiElementOp, + OutElementOp, + AComputeType, + BComputeType>; + + // get device op instances + const auto op_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory< + DeviceOp>::GetInstances(); + + std::cout << "ckProfiler found " << op_ptrs.size() << " instances" << std::endl; + + for(auto& op_ptr : op_ptrs) + { + auto argument_ptr = op_ptr->MakeArgumentPointer(in_device_buf.GetDeviceBuffer(), + wei_device_buf.GetDeviceBuffer(), + {bias_device_buf.GetDeviceBuffer(), + mean_device_buf.GetDeviceBuffer(), + variance_device_buf.GetDeviceBuffer(), + scale_device_buf.GetDeviceBuffer(), + shift_device_buf.GetDeviceBuffer()}, + out_device_buf.GetDeviceBuffer(), + a_g_n_c_wis_lengths, + a_g_n_c_wis_strides, + b_g_k_c_xs_lengths, + b_g_k_c_xs_strides, + {e_g_n_k_wos_lengths, + e_g_n_k_wos_lengths, + e_g_n_k_wos_lengths, + e_g_n_k_wos_lengths, + e_g_n_k_wos_lengths}, + {d_g_n_k_wos_strides, + d_g_n_k_wos_strides, + d_g_n_k_wos_strides, + d_g_n_k_wos_strides, + d_g_n_k_wos_strides}, + e_g_n_k_wos_lengths, + e_g_n_k_wos_strides, + conv_filter_strides, + conv_filter_dilations, + input_left_pads, + input_right_pads, + in_element_op, + wei_element_op, + out_element_op); + + run_impl(op_ptr, argument_ptr); + } + + std::cout << "Best configuration parameters:" << "\nname: " << best_op_name + << "\navg_time: " << best_avg_time << "\ntflops: " << best_tflops + << "\nGB/s: " << best_gb_per_sec << std::endl; + + return pass; +} + +} // namespace profiler +} // namespace ck diff --git a/script/clang-format-overwrite.sh b/script/clang-format-overwrite.sh index a770970fef..ea2834ae62 100755 --- a/script/clang-format-overwrite.sh +++ b/script/clang-format-overwrite.sh @@ -1,2 +1,7 @@ +#!/bin/bash +set -euo pipefail +IFS=$'\n\t' + + find . -name deps -prune -o -name build -prune -o -iname '*.h' -o -iname '*.hpp' -o -iname '*.cpp' -o -iname '*.h.in' -o -iname '*.hpp.in' -o -iname '*.cpp.in' -o -iname '*.cl' -o -iname '*.cuh' -o -iname '*.cu' -o -iname '*.inc' | xargs -n 1 -P 16 -I{} -t sh -c 'clang-format-18 -i -style=file {}' git status --porcelain | awk '$1 != "D" && (match($2, "\\.cpp|.hpp|.inc")) {print $2}' | xargs -n 1 -P 16 -I{} -t sh -c 'clang-format-18 -i -style=file {}' diff --git a/script/cmake-ck-dev.sh b/script/cmake-ck-dev.sh index c45bb4330d..25a1590808 100755 --- a/script/cmake-ck-dev.sh +++ b/script/cmake-ck-dev.sh @@ -1,4 +1,7 @@ #!/bin/bash +set -euo pipefail +IFS=$'\n\t' + rm -f CMakeCache.txt rm -f *.cmake rm -rf CMakeFiles diff --git a/script/cmake-ck-release.sh b/script/cmake-ck-release.sh index 311ea91822..5263de92c8 100755 --- a/script/cmake-ck-release.sh +++ b/script/cmake-ck-release.sh @@ -1,4 +1,7 @@ #!/bin/bash +set -euo pipefail +IFS=$'\n\t' + rm -f CMakeCache.txt rm -f *.cmake rm -rf CMakeFiles diff --git a/test/ck_tile/CMakeLists.txt b/test/ck_tile/CMakeLists.txt index 42605f2513..374e5b4990 100644 --- a/test/ck_tile/CMakeLists.txt +++ b/test/ck_tile/CMakeLists.txt @@ -21,3 +21,5 @@ add_subdirectory(add_rmsnorm2d_rdquant) # add_subdirectory(layernorm2d) # add_subdirectory(rmsnorm2d) add_subdirectory(gemm_block_scale) +add_subdirectory(utility) +add_subdirectory(reduce) diff --git a/test/ck_tile/batched_transpose/test_batched_transpose.cpp b/test/ck_tile/batched_transpose/test_batched_transpose.cpp index cce00e27cb..77d5825eed 100644 --- a/test/ck_tile/batched_transpose/test_batched_transpose.cpp +++ b/test/ck_tile/batched_transpose/test_batched_transpose.cpp @@ -95,10 +95,12 @@ class TestCkTileBatchedTranspose // N C H W layout_in== ck_tile::HostTensor y_ref(Y_dim, Y_stride); ck_tile::FillUniformDistribution{-.5f, .5f}(x_host); + ck_tile::FillConstant{-37}(y_host); ck_tile::DeviceMem x_dev(x_host.get_element_space_size_in_bytes()); ck_tile::DeviceMem y_dev(y_host.get_element_space_size_in_bytes()); x_dev.ToDevice(x_host.data()); + y_dev.ToDevice(y_host.data()); using Kernel = typename Config::Kernel; @@ -131,8 +133,8 @@ class TestCkTileBatchedTranspose // N C H W layout_in== height, width, height * width, - Config::BlockTile::at(1), - Config::BlockTile::at(0)}; + Config::BlockTile::at(0), + Config::BlockTile::at(1)}; auto kargs = Kernel::MakeKargs(host_args); auto sc = ck_tile::stream_config{}; @@ -140,15 +142,24 @@ class TestCkTileBatchedTranspose // N C H W layout_in== constexpr dim3 block_size = Kernel::BlockSize(); ck_tile::launch_kernel( sc, ck_tile::make_kernel(Kernel{}, grid_size, block_size, 0, kargs)); + y_dev.FromDevice(y_host.data()); ck_tile::reference_batched_transpose(x_host, y_ref, layout_in, layout_out); std::ostringstream message; message << "N=" << N << " C=" << C << " H=" << H << " W=" << W << " layout_in=" << layout_in - << " layout_out=" << layout_out << " device_name=" << device_name; + << " layout_out=" << layout_out << " grid_size={" << grid_size.x << ", " + << grid_size.y << ", " << grid_size.z << "} block_size=" << block_size.x + << " device_name=" << device_name; + // NB: order of output and reference matters bool pass = ck_tile::check_err( - y_ref, y_host, message.str(), /* rtol */ 0, /* atol */ 0, /* allow inf */ false); + /* out */ y_host, + /* ref */ y_ref, + message.str(), + /* rtol */ 0, + /* atol */ 0, + /* allow inf */ false); EXPECT_TRUE(pass); } @@ -160,14 +171,16 @@ static const auto kTestingValues = ::testing::Values( // N C H W layout_in==NCHW std::tuple{1, 32, 1, 32, true}, std::tuple{1, 64, 1, 64, true}, + std::tuple{1, 32, 1, 64, true}, + std::tuple{1, 64, 1, 32, true}, std::tuple{2, 12, 1, 32, false}, std::tuple{3, 1334, 1, 37, false}, std::tuple{4, 27, 1, 32, true}, std::tuple{5, 1234, 1, 12, true}, std::tuple{1, 1, 1, 1, true}, std::tuple{1, 1, 1, 1, false}, - std::tuple{128, 1024, 64, 64, true}, - std::tuple{128, 1024, 64, 64, false}, + std::tuple{17, 1024, 64, 64, true}, + std::tuple{17, 1024, 64, 64, false}, std::tuple{16, 64, 32, 128, true}, std::tuple{16, 64, 128, 32, false}, std::tuple{1, 2048, 1, 1, true}, @@ -239,6 +252,60 @@ class CaseHalfPadMultiWarpLoadTranspose { }; +class CaseHalfPadMultiWarp128MNLoadTranspose + : public TestCkTileBatchedTranspose> +{ +}; + +class CaseHalfPadMultiWarp128MN + : public TestCkTileBatchedTranspose< + PipelineConfig> +{ +}; + +class CaseHalfPadRectTile1 + : public TestCkTileBatchedTranspose< + PipelineConfig> +{ +}; + +class CaseHalfPadRectTile2 + : public TestCkTileBatchedTranspose< + PipelineConfig> +{ +}; + +class CaseHalfPadRectTile1LoadTranspose + : public TestCkTileBatchedTranspose> +{ +}; + +class CaseHalfPadRectTile2LoadTranspose + : public TestCkTileBatchedTranspose> +{ +}; + TEST_P(CaseHalf, TestCorrectness) { this->Run(GetParam()); } TEST_P(CaseByte, TestCorrectness) { this->Run(GetParam()); } TEST_P(CaseWord, TestCorrectness) { this->Run(GetParam()); } @@ -248,6 +315,12 @@ TEST_P(CaseHalfPad, TestCorrectness) { this->Run(GetParam()); } TEST_P(CaseHalfPadLoadTranspose, TestCorrectness) { this->Run(GetParam()); } TEST_P(CaseHalfPadMultiWarp, TestCorrectness) { this->Run(GetParam()); } TEST_P(CaseHalfPadMultiWarpLoadTranspose, TestCorrectness) { this->Run(GetParam()); } +TEST_P(CaseHalfPadMultiWarp128MN, TestCorrectness) { this->Run(GetParam()); } +TEST_P(CaseHalfPadMultiWarp128MNLoadTranspose, TestCorrectness) { this->Run(GetParam()); } +TEST_P(CaseHalfPadRectTile1, TestCorrectness) { this->Run(GetParam()); } +TEST_P(CaseHalfPadRectTile1LoadTranspose, TestCorrectness) { this->Run(GetParam()); } +TEST_P(CaseHalfPadRectTile2, TestCorrectness) { this->Run(GetParam()); } +TEST_P(CaseHalfPadRectTile2LoadTranspose, TestCorrectness) { this->Run(GetParam()); } // clang-format off INSTANTIATE_TEST_SUITE_P(TestCkTileBatchedTransposeSuite, CaseHalf, kTestingValues); @@ -259,4 +332,11 @@ INSTANTIATE_TEST_SUITE_P(TestCkTileBatchedTransposeSuite, CaseHalfPad, kTestingV INSTANTIATE_TEST_SUITE_P(TestCkTileBatchedTransposeSuite, CaseHalfPadLoadTranspose, kTestingValues); INSTANTIATE_TEST_SUITE_P(TestCkTileBatchedTransposeSuite, CaseHalfPadMultiWarp, kTestingValues); INSTANTIATE_TEST_SUITE_P(TestCkTileBatchedTransposeSuite, CaseHalfPadMultiWarpLoadTranspose, kTestingValues); +INSTANTIATE_TEST_SUITE_P(TestCkTileBatchedTransposeSuite, CaseHalfPadMultiWarp128MN, kTestingValues); +INSTANTIATE_TEST_SUITE_P(TestCkTileBatchedTransposeSuite, CaseHalfPadMultiWarp128MNLoadTranspose, kTestingValues); +INSTANTIATE_TEST_SUITE_P(TestCkTileBatchedTransposeSuite, CaseHalfPadRectTile1, kTestingValues); +INSTANTIATE_TEST_SUITE_P(TestCkTileBatchedTransposeSuite, CaseHalfPadRectTile1LoadTranspose, kTestingValues); +INSTANTIATE_TEST_SUITE_P(TestCkTileBatchedTransposeSuite, CaseHalfPadRectTile2, kTestingValues); +INSTANTIATE_TEST_SUITE_P(TestCkTileBatchedTransposeSuite, CaseHalfPadRectTile2LoadTranspose, kTestingValues); + // clang-format on diff --git a/test/ck_tile/data_type/CMakeLists.txt b/test/ck_tile/data_type/CMakeLists.txt index a9ce48d1de..384fd3c1c4 100644 --- a/test/ck_tile/data_type/CMakeLists.txt +++ b/test/ck_tile/data_type/CMakeLists.txt @@ -3,11 +3,13 @@ if(GPU_TARGETS MATCHES "gfx9") endif() if(GPU_TARGETS MATCHES "gfx95") add_gtest_executable(test_ck_tile_pk_fp4 test_pk_fp4.cpp) + add_gtest_executable(test_ck_tile_mx_scale test_mx_scale.cpp) endif() if(CK_USE_OCP_FP8 OR CK_USE_FNUZ_FP8) add_gtest_executable(test_ck_tile_fp8 test_fp8.cpp) target_compile_options(test_ck_tile_fp8 PRIVATE -Wno-float-equal) + target_compile_definitions(test_ck_tile_fp8 PUBLIC GTEST_HAS_RTTI=0) # conditionally specify the use of OCP_FP8 if(CK_USE_OCP_FP8) target_compile_options(test_ck_tile_fp8 PRIVATE -DCK_TILE_USE_OCP_FP8) diff --git a/test/ck_tile/data_type/test_mx_scale.cpp b/test/ck_tile/data_type/test_mx_scale.cpp new file mode 100644 index 0000000000..7a024d238f --- /dev/null +++ b/test/ck_tile/data_type/test_mx_scale.cpp @@ -0,0 +1,162 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved. + +#include "gtest/gtest.h" +#include + +#include "ck_tile/core.hpp" +#include "ck_tile/host.hpp" + +using ck_tile::bf16_t; +using ck_tile::bf16x2_t; +using ck_tile::fp16_t; +using ck_tile::fp16x2_t; +using ck_tile::fp32_t; +using ck_tile::fp32x2_t; +using ck_tile::number; +using ck_tile::pk_fp4_t; + +template +CK_TILE_HOST void test_convert(); + +using ck_tile::e8m0_raw_t; +using ck_tile::e8m0_t; + +TEST(OCP_Scale, NumericLimits) +{ + EXPECT_EQ(ck_tile::numeric::has_inf(), false); + EXPECT_EQ(ck_tile::numeric::zero(), ck_tile::numeric::signaling_NaN()); + EXPECT_EQ(ck_tile::numeric::min(), e8m0_t{e8m0_raw_t{0b00000000}}); + EXPECT_EQ(ck_tile::numeric::max(), e8m0_t{e8m0_raw_t{0b11111110}}); +} +TEST(OCP_Scale, NumericBasic) +{ + auto scale_1 = e8m0_t{1.0f}; + auto scale_2 = e8m0_t{e8m0_raw_t{ck_tile::numeric_traits::bias}}; // 2^0 + EXPECT_EQ(scale_1, scale_2); + + auto scale_3 = e8m0_t{8.0f}; + auto scale_4 = e8m0_t{e8m0_raw_t{3 + ck_tile::numeric_traits::bias}}; // 2^3 + EXPECT_EQ(scale_3, scale_4); +} + +TEST(OCP_Scale, ScaledConvertDevice) +{ + constexpr bool is_device = true; + test_convert(); // fp32 -> fp4 -> fp32 + test_convert(); + test_convert(); + test_convert(); + test_convert(); + test_convert(); + test_convert(); +} +TEST(OCP_Scale, ScaledConvertHost) +{ + constexpr bool is_device = false; + test_convert(); // fp32 -> fp4 -> fp32 + test_convert(); + test_convert(); + test_convert(); + test_convert(); + test_convert(); + test_convert(); +} +TEST(OCP_Scale, tensorInit) +{ + using scale_t = e8m0_t; + ck_tile::HostTensor scales({10, 10}); + ck_tile::FillUniformDistribution{1.f, 1.f}(scales); + scales.SetZero(); +} + +#define toPF4(x, y) ck_tile::scaled_type_convert(x, y) +#define toDST(x, y) ck_tile::scaled_type_convert(x, y) +#define toDSTx2(x, y) ck_tile::scaled_type_convert(x, y) + +#define toF32(x) ck_tile::type_convert(x) +#define toPF4_(x) ck_tile::type_convert(x) +#define toSRC(x) ck_tile::type_convert(x) +#define toDST_(x) ck_tile::type_convert(x) + +template +__global__ void MyKernel(Args... args) +{ + Kernel{}(args...); +} +template +struct SrcPkfp4Dst +{ + CK_TILE_HOST_DEVICE void + operator()(const SRC* src, DST* dst, e8m0_t scale1, e8m0_t scale2) const + { + + using SRCx2_t = ck_tile::ext_vector_t; + using DSTx2_t = ck_tile::ext_vector_t; + + ck_tile::static_for<0, N, 2>{}([&](auto i) { + const auto input2 = SRCx2_t{src[i], src[i + 1]}; + + if(i % 4 == 0) + { + // ex: fp32_t -> fp4 -> bf16_t + dst[i] = toDST(toPF4(src[i], scale1), scale2); + // ex: fp32x2_t -> pk_fp4 -> unpack<0> -> bf16_t + dst[i + 1] = toDST(toPF4_(toPF4(input2, scale1).unpack(number<1>{})), scale2); + } + else + { + // ex: fp32x2_t -> pk_fp4_t -> bf16x2_t + reinterpret_cast(dst)[i >> 1] = toDSTx2(toPF4(input2, scale1), scale2); + } + }); + } +}; + +template +CK_TILE_HOST void test_convert() +{ + const auto test_data = std::array{4.f, 6.f, 8.f, 10.f}; + const auto ref_data = std::array{8.f, 16.f, 16.f, 16.f}; + const auto scale1 = e8m0_t{8.0f}; + const auto scale2 = e8m0_t{16.0f}; + + static_assert(test_data.size() == ref_data.size()); + static_assert(test_data.size() % 2 == 0); + + constexpr int N = test_data.size(); + std::array in; + std::array ref, out; + + // prepare input and ground truth in host + for(int i = 0; i < N; ++i) + { + in[i] = toSRC(test_data[i]); + ref[i] = toDST_(ref_data[i]); + EXPECT_EQ(test_data[i], toF32(in[i])); + EXPECT_EQ(ref_data[i], toF32(ref[i])); + } + + using job = SrcPkfp4Dst; + + if constexpr(is_device) + { + auto in_d = std::make_unique(in.size() * sizeof(SRC)); + auto out_d = std::make_unique(out.size() * sizeof(DST)); + in_d->ToDevice(in.data()); + + MyKernel<<<1, 1>>>(reinterpret_cast(in_d->GetDeviceBuffer()), + reinterpret_cast(out_d->GetDeviceBuffer()), + scale1, + scale2); + + out_d->FromDevice(out.data()); + } + else + { + job{}(in.data(), out.data(), scale1, scale2); + } + + for(int i = 0; i < N; ++i) + EXPECT_EQ(ref[i], out[i]) << "i:" << i; +} diff --git a/test/ck_tile/reduce/CMakeLists.txt b/test/ck_tile/reduce/CMakeLists.txt new file mode 100644 index 0000000000..052669e20a --- /dev/null +++ b/test/ck_tile/reduce/CMakeLists.txt @@ -0,0 +1,7 @@ +if(GPU_TARGETS MATCHES "gfx9") + add_gtest_executable(test_ck_tile_reduce2d test_reduce2d.cpp) + if(result EQUAL 0) + target_link_libraries(test_ck_tile_reduce2d PRIVATE utility) + endif() +endif() + diff --git a/test/ck_tile/reduce/test_reduce2d.cpp b/test/ck_tile/reduce/test_reduce2d.cpp new file mode 100644 index 0000000000..4ce0b56ef3 --- /dev/null +++ b/test/ck_tile/reduce/test_reduce2d.cpp @@ -0,0 +1,359 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved. + +#include +#include +#include +#include +#include + +#include "ck_tile/core.hpp" +#include "ck_tile/host.hpp" +#include "ck_tile/ops/reduce.hpp" +#include "ck_tile/host/kernel_launch.hpp" + +template +class TestCkTileReduce : public ::testing::Test +{ + protected: + using XDataType = std::tuple_element_t<0, Tuple>; + using ComputeDataType = std::tuple_element_t<1, Tuple>; + using YDataType = std::tuple_element_t<2, Tuple>; + using ReduceOpType = std::tuple_element_t<3, Tuple>; + using BlockWarps_ = std::tuple_element_t<4, Tuple>; + using BlockTile_ = std::tuple_element_t<5, Tuple>; + using WarpTile_ = std::tuple_element_t<6, Tuple>; + using ThreadTile_ = std::tuple_element_t<7, Tuple>; + + using TestReduce2dShape = + ck_tile::Reduce2dShape; + + template + void RunGenericTest(const std::vector& input_shape, + const std::vector& input_strides, + const std::vector& output_shape, + const std::vector& output_strides, + ck_tile::index_t kept_dim_len_prod, + ck_tile::index_t total_reduce_elements, + KeptDimSeq kept_dims, + ReduceDimSeq reduce_dims) + { + ck_tile::HostTensor h_x(input_shape, input_strides); + ck_tile::HostTensor h_y(output_shape, output_strides); + ck_tile::HostTensor h_y_ref(output_shape, output_strides); + + ck_tile::FillUniformDistribution{-5.f, 5.f}(h_x); + h_y.SetZero(); + h_y_ref.SetZero(); + + ck_tile::DeviceMem d_x_mem(h_x.get_element_space_size_in_bytes()); + ck_tile::DeviceMem d_y_mem(h_y.get_element_space_size_in_bytes()); + + d_x_mem.ToDevice(h_x.data()); + d_y_mem.ToDevice(h_y.data()); // Initialize device output buffer + + // Problem and kernel setup + using Problem = ck_tile:: + Reduce2dProblem; + + using Kernel = ck_tile::Reduce; + + // Launch configuration + constexpr ck_tile::index_t kBlockSize = 256; + constexpr ck_tile::index_t kBlockPerCu = 1; + + ck_tile::index_t kGridSize = + (kept_dim_len_prod + TestReduce2dShape::Block_M - 1) / TestReduce2dShape::Block_M; + + // Generic helper to create tuple from vector based on compile-time size + auto make_shape_tuple = [](const std::vector& vec) { + return [&vec](std::index_sequence) { + return ck_tile::make_tuple(vec[I]...); + }(std::make_index_sequence{}); + }; + + auto input_shape_tuple = make_shape_tuple.template operator()(input_shape); + auto input_strides_tuple = make_shape_tuple.template operator()(input_strides); + + if(!Kernel::IsSupportedArgument( + output_shape[output_shape.size() - 1], + input_strides_tuple)) // output tensor's continuous dimension + { + throw std::runtime_error("Wrong! Arguments not supported!\n"); + } + + ck_tile::launch_kernel(ck_tile::stream_config{nullptr, false, 0}, + ck_tile::make_kernel( + Kernel{}, + kGridSize, + kBlockSize, + 0, + static_cast(d_x_mem.GetDeviceBuffer()), + static_cast(d_y_mem.GetDeviceBuffer()), + input_shape_tuple, + input_strides_tuple, + kept_dims, + reduce_dims)); + + // Get results back + d_y_mem.FromDevice(h_y.data()); + + // Reference computation + ck_tile::reference_reduce( + h_x, h_y_ref, ReduceOpType{}, kept_dims, reduce_dims); + + // Calculate proper error thresholds based on data types and number of accumulations + const auto rtol = ck_tile::get_relative_threshold( + total_reduce_elements); + const auto atol = ck_tile::get_absolute_threshold( + 5.0f, total_reduce_elements); + + bool result = + ck_tile::check_err(h_y, h_y_ref, "Error: Incorrect reduce results!", rtol, atol); + EXPECT_TRUE(result); + } + + // Convenience functions for specific dimensional patterns + void RunTest2D_KeepDim0_ReduceDim1(ck_tile::index_t dim0, ck_tile::index_t dim1) + { + constexpr auto kept_dims = ck_tile::sequence<0>{}; + constexpr auto reduce_dims = ck_tile::sequence<1>{}; + + // Input shape and strides + std::vector input_shape = {dim0, dim1}; + std::vector input_strides = {dim1, 1}; + + // Output shape and strides (keep dim0) + std::vector output_shape = {dim0}; + std::vector output_strides = {1}; + + // Calculate products + ck_tile::index_t kept_dim_len_prod = dim0; + ck_tile::index_t total_reduce_elements = dim1; + + RunGenericTest<2>(input_shape, + input_strides, + output_shape, + output_strides, + kept_dim_len_prod, + total_reduce_elements, + kept_dims, + reduce_dims); + } + + void RunTest3D_KeepDim0_ReduceDim12(ck_tile::index_t dim0, + ck_tile::index_t dim1, + ck_tile::index_t dim2) + { + constexpr auto kept_dims = ck_tile::sequence<0>{}; + constexpr auto reduce_dims = ck_tile::sequence<1, 2>{}; + + // Input shape and strides + std::vector input_shape = {dim0, dim1, dim2}; + std::vector input_strides = {dim1 * dim2, dim2, 1}; + + // Output shape and strides (keep dim0) + std::vector output_shape = {dim0}; + std::vector output_strides = {1}; + + // Calculate products + ck_tile::index_t kept_dim_len_prod = dim0; // product of kept dimensions + ck_tile::index_t total_reduce_elements = dim1 * dim2; // product of reduced dimensions + + RunGenericTest<3>(input_shape, + input_strides, + output_shape, + output_strides, + kept_dim_len_prod, + total_reduce_elements, + kept_dims, + reduce_dims); + } + + void RunTest3D_KeepDim01_ReduceDim2(ck_tile::index_t dim0, + ck_tile::index_t dim1, + ck_tile::index_t dim2) + { + constexpr auto kept_dims = ck_tile::sequence<0, 1>{}; + constexpr auto reduce_dims = ck_tile::sequence<2>{}; + + // Input shape and strides + std::vector input_shape = {dim0, dim1, dim2}; + std::vector input_strides = {dim1 * dim2, dim2, 1}; + + // Output shape and strides (keep dim0) + std::vector output_shape = {dim0, dim1}; + std::vector output_strides = {dim1, 1}; + + // Calculate products + ck_tile::index_t kept_dim_len_prod = dim0 * dim1; // product of kept dimensions + ck_tile::index_t total_reduce_elements = dim2; // product of reduced dimensions + + RunGenericTest<3>(input_shape, + input_strides, + output_shape, + output_strides, + kept_dim_len_prod, + total_reduce_elements, + kept_dims, + reduce_dims); + } + + void RunTest4D_KeepDim01_ReduceDim23(ck_tile::index_t N, + ck_tile::index_t C, + ck_tile::index_t H, + ck_tile::index_t W) + { + constexpr auto kept_dims = ck_tile::sequence<0, 1>{}; + constexpr auto reduce_dims = ck_tile::sequence<2, 3>{}; + + // Input shape and strides + std::vector input_shape = {N, C, H, W}; + std::vector input_strides = {C * H * W, H * W, W, 1}; + + // Output shape and strides (keep dim0, dim1) + std::vector output_shape = {N, C}; + std::vector output_strides = {C, 1}; + + // Calculate products + ck_tile::index_t kept_dim_len_prod = N * C; // product of kept dimensions + ck_tile::index_t total_reduce_elements = H * W; // product of reduced dimensions + + RunGenericTest<4>(input_shape, + input_strides, + output_shape, + output_strides, + kept_dim_len_prod, + total_reduce_elements, + kept_dims, + reduce_dims); + } + + void RunTest4D_KeepDim03_ReduceDim12(ck_tile::index_t N, + ck_tile::index_t H, + ck_tile::index_t W, + ck_tile::index_t C) + { + constexpr auto kept_dims = ck_tile::sequence<0, 3>{}; + constexpr auto reduce_dims = ck_tile::sequence<1, 2>{}; + + // Input shape and strides + std::vector input_shape = {N, H, W, C}; + std::vector input_strides = {H * W * C, W * C, C, 1}; + + // Output shape and strides (keep dim0, dim1) + std::vector output_shape = {N, C}; + std::vector output_strides = {C, 1}; + + // Calculate products + ck_tile::index_t kept_dim_len_prod = N * C; // product of kept dimensions + ck_tile::index_t total_reduce_elements = H * W; // product of reduced dimensions + + RunGenericTest<4>(input_shape, + input_strides, + output_shape, + output_strides, + kept_dim_len_prod, + total_reduce_elements, + kept_dims, + reduce_dims); + } +}; + +// Shape parameters for different test configurations +using Shape1_BlockWarps = ck_tile::sequence<4, 1>; +using Shape1_BlockTile = ck_tile::sequence<128, 128>; +using Shape1_WarpTile = ck_tile::sequence<32, 128>; +using Shape1_ThreadTile = ck_tile::sequence<8, 8>; + +using Shape2_BlockWarps = ck_tile::sequence<2, 2>; // Cross-warp reduction test +using Shape2_BlockTile = ck_tile::sequence<2, 1024>; +using Shape2_WarpTile = ck_tile::sequence<1, 512>; +using Shape2_ThreadTile = ck_tile::sequence<1, 8>; + +// Test configurations for different data types and operations +using TestConfig_F32_Add = std::tuple; + +using TestConfig_F16_Add = std::tuple; + +using TestConfig_F32_CrossWarp = std::tuple; + +using TestConfig_F32_Max = std::tuple; + +using TestConfig_F32_SquareAdd = std::tuple; + +using TestTypes = ::testing::Types; + +TYPED_TEST_SUITE(TestCkTileReduce, TestTypes); + +// 2D Tests - Keep dim0, reduce dim1 +TYPED_TEST(TestCkTileReduce, Test2D_KeepDim0_ReduceDim1_64x32) +{ + this->RunTest2D_KeepDim0_ReduceDim1(64, 32); +} + +TYPED_TEST(TestCkTileReduce, Test2D_KeepDim0_ReduceDim1_1024x512) +{ + this->RunTest2D_KeepDim0_ReduceDim1(1024, 512); +} + +// 3D Tests - Keep dim0, reduce dim1,2 +TYPED_TEST(TestCkTileReduce, Test3D_KeepDim0_ReduceDim12_128x128x1) +{ + this->RunTest3D_KeepDim0_ReduceDim12(128, 128, 8); +} +// 3D Tests - Keep dim0,1, reduce dim1 +TYPED_TEST(TestCkTileReduce, Test3D_KeepDim01_ReduceDim2_512x1024x16) +{ + this->RunTest3D_KeepDim01_ReduceDim2(512, 1024, 16); +} + +// 4D Tests - Keep dim0,1, reduce dim2,3 (NCHW -> NC) +TYPED_TEST(TestCkTileReduce, Test4D_KeepDim01_ReduceDim23_32x256x16x16) +{ + this->RunTest4D_KeepDim01_ReduceDim23(32, 256, 16, 16); +} +// 4D Tests - Keep dim0,3, reduce dim1,2 (NHWC -> NC) +TYPED_TEST(TestCkTileReduce, Test4D_KeepDim03_ReduceDim12_16x32x32x128) +{ + this->RunTest4D_KeepDim03_ReduceDim12(16, 32, 32, 128); +} diff --git a/test/ck_tile/utility/CMakeLists.txt b/test/ck_tile/utility/CMakeLists.txt new file mode 100644 index 0000000000..c57cafca5a --- /dev/null +++ b/test/ck_tile/utility/CMakeLists.txt @@ -0,0 +1,4 @@ +message("-- Adding: test/ck_tile/utility/") + +# Add print tests +add_subdirectory(print) diff --git a/test/ck_tile/utility/print/CMakeLists.txt b/test/ck_tile/utility/print/CMakeLists.txt new file mode 100644 index 0000000000..5300dd20ca --- /dev/null +++ b/test/ck_tile/utility/print/CMakeLists.txt @@ -0,0 +1,8 @@ +# Print utility tests +add_gtest_executable(test_print_sequence test_print_sequence.cpp) +add_gtest_executable(test_print_array test_print_array.cpp) +add_gtest_executable(test_print_tuple test_print_tuple.cpp) +add_gtest_executable(test_print_coordinate_transform test_print_coordinate_transform.cpp) +add_gtest_executable(test_print_static_encoding_pattern test_print_static_encoding_pattern.cpp) +add_gtest_executable(test_print_buffer_view test_print_buffer_view.cpp) +add_gtest_executable(test_print_basic_types test_print_basic_types.cpp) diff --git a/test/ck_tile/utility/print/README.md b/test/ck_tile/utility/print/README.md new file mode 100644 index 0000000000..558c6faee4 --- /dev/null +++ b/test/ck_tile/utility/print/README.md @@ -0,0 +1,70 @@ +# Print Function Tests + +This directory contains unit tests for testing the print functionality of various data structures and coordinate transformations in the composable_kernel library. + +## Tests Included + +### test_print_sequence.cpp +Tests the print functionality for `sequence<...>` containers: +- Simple sequences with multiple elements +- Single element sequences +- Empty sequences +- Longer sequences + +### test_print_array.cpp +Tests the print functionality for `array` containers: +- Arrays with integer values +- Single element arrays +- Empty arrays (size 0) +- Arrays with floating point values + +### test_print_tuple.cpp +Tests the print functionality for `tuple<...>` containers: +- Simple tuples with numbers +- Single element tuples +- Empty tuples +- Mixed type tuples + +### test_print_coordinate_transform.cpp +Tests the print functionality for coordinate transformation structures: +- `pass_through` transform +- `embed` transform +- `merge` transform +- `unmerge` transform +- `freeze` transform + +## Testing Approach + +All tests use Google Test's `CaptureStdout()` functionality to capture the output from print functions and verify the formatting: + +```cpp +testing::internal::CaptureStdout(); +print(object); +std::string output = testing::internal::GetCapturedStdout(); +EXPECT_EQ(output, "expected_format"); +``` + +This approach enables testing of print function output without affecting the console during test execution. + +## Building and Running + +The tests are integrated into the CMake build system. To build and run the print tests: + +```bash +# Build the specific test +make test_print_sequence + +# Run the test +./test_print_sequence + +# Or run all print tests using CTest +ctest -R "test_print" +``` + +## Adding New Tests + +To add tests for new data structures: + +1. Create a new test file: `test_print_.cpp` +2. Follow the existing pattern using `CaptureStdout()` +3. Add the test executable to `CMakeLists.txt` diff --git a/test/ck_tile/utility/print/test_print_array.cpp b/test/ck_tile/utility/print/test_print_array.cpp new file mode 100644 index 0000000000..2fe9bc2a0c --- /dev/null +++ b/test/ck_tile/utility/print/test_print_array.cpp @@ -0,0 +1,59 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved. + +#include "test_print_common.hpp" +#include "ck_tile/core/container/array.hpp" +#include "ck_tile/core/utility/print.hpp" + +namespace ck_tile { + +class PrintArrayTest : public PrintTest +{ +}; + +TEST_F(PrintArrayTest, PrintIntArray) +{ + // Test printing array + array arr{10, 20, 30}; + + std::string output = CapturePrintOutput(arr); + + // The expected format should match the array print function implementation + EXPECT_EQ(output, "array{size: 3, data: [10, 20, 30]}"); +} + +TEST_F(PrintArrayTest, PrintSingleElementArray) +{ + // Test printing array + array arr{42}; + + std::string output = CapturePrintOutput(arr); + + EXPECT_EQ(output, "array{size: 1, data: [42]}"); +} + +TEST_F(PrintArrayTest, PrintEmptyArray) +{ + // Test printing array (empty array) + array arr{}; + + std::string output = CapturePrintOutput(arr); + + EXPECT_EQ(output, "array{size: 0, data: []}"); +} + +TEST_F(PrintArrayTest, PrintFloatArray) +{ + // Test printing array with float values + array arr{3.14f, 2.71f}; + + std::string output = CapturePrintOutput(arr); + + // Note: float printing format may vary, so we'll test for basic structure + EXPECT_TRUE(output.find("array{size: 2, data: [") == 0); + EXPECT_TRUE(output.find("3.14") != std::string::npos); + EXPECT_TRUE(output.find("2.71") != std::string::npos); + EXPECT_TRUE(output.find("]}") == output.length() - 2); +} + +} // namespace ck_tile diff --git a/test/ck_tile/utility/print/test_print_basic_types.cpp b/test/ck_tile/utility/print/test_print_basic_types.cpp new file mode 100644 index 0000000000..7a26b6371a --- /dev/null +++ b/test/ck_tile/utility/print/test_print_basic_types.cpp @@ -0,0 +1,76 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved. + +#include "test_print_common.hpp" +#include "ck_tile/core/utility/print.hpp" + +namespace ck_tile { + +class PrintBasicTypesTest : public PrintTest +{ +}; + +TEST_F(PrintBasicTypesTest, PrintIntArray) +{ + int arr[4] = {1, 2, 3, 4}; + + std::string output = CapturePrintOutput(arr); + + EXPECT_EQ(output, "[1, 2, 3, 4]"); +} + +TEST_F(PrintBasicTypesTest, PrintFloatArray) +{ + float arr[3] = {1.5f, 2.5f, 3.5f}; + + std::string output = CapturePrintOutput(arr); + + // Note: floating point formatting may vary, so we check for key elements + EXPECT_TRUE(output.find("[") == 0); + EXPECT_TRUE(output.find("1.5") != std::string::npos); + EXPECT_TRUE(output.find("2.5") != std::string::npos); + EXPECT_TRUE(output.find("3.5") != std::string::npos); + EXPECT_TRUE(output.back() == ']'); + EXPECT_TRUE(output.find(", ") != std::string::npos); +} + +TEST_F(PrintBasicTypesTest, PrintDoubleArray) +{ + double arr[2] = {10.123, 20.456}; + + std::string output = CapturePrintOutput(arr); + + EXPECT_TRUE(output.find("[") == 0); + EXPECT_TRUE(output.find("10.123") != std::string::npos); + EXPECT_TRUE(output.find("20.456") != std::string::npos); + EXPECT_TRUE(output.back() == ']'); +} + +TEST_F(PrintBasicTypesTest, PrintUnsignedIntArray) +{ + unsigned int arr[3] = {100u, 200u, 300u}; + + std::string output = CapturePrintOutput(arr); + + EXPECT_EQ(output, "[100, 200, 300]"); +} + +TEST_F(PrintBasicTypesTest, PrintCharArray) +{ + char arr[5] = {'a', 'b', 'c', 'd', 'e'}; + + std::string output = CapturePrintOutput(arr); + + EXPECT_EQ(output, "[a, b, c, d, e]"); +} + +TEST_F(PrintBasicTypesTest, PrintSingleElementArray) +{ + int arr[1] = {42}; + + std::string output = CapturePrintOutput(arr); + + EXPECT_EQ(output, "[42]"); +} + +} // namespace ck_tile diff --git a/test/ck_tile/utility/print/test_print_buffer_view.cpp b/test/ck_tile/utility/print/test_print_buffer_view.cpp new file mode 100644 index 0000000000..66668a2103 --- /dev/null +++ b/test/ck_tile/utility/print/test_print_buffer_view.cpp @@ -0,0 +1,78 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved. + +#include "test_print_common.hpp" +#include "ck_tile/core/tensor/buffer_view.hpp" +#include "ck_tile/core/utility/print.hpp" + +namespace ck_tile { + +class PrintBufferViewTest : public PrintTest +{ +}; + +TEST_F(PrintBufferViewTest, PrintGenericBufferView) +{ + // Test printing generic address space buffer_view + float data[4] = {100.f, 200.f, 300.f, 400.f}; + auto bv = make_buffer_view(&data, 4); + + std::string output = CapturePrintOutput(bv); + + // Verify the output contains expected information + EXPECT_TRUE(output.find("buffer_view{AddressSpace: generic") != std::string::npos); + EXPECT_TRUE(output.find("p_data_:") != std::string::npos); + EXPECT_TRUE(output.find("buffer_size_:") != std::string::npos); + EXPECT_TRUE(output.find("invalid_element_value_:") != std::string::npos); + EXPECT_TRUE(output.find("}") != std::string::npos); +} + +TEST_F(PrintBufferViewTest, PrintGlobalBufferView) +{ + // Test printing global address space buffer_view + float data[4] = {100.f, 200.f, 300.f, 400.f}; + auto bv = make_buffer_view(&data, 4); + + std::string output = CapturePrintOutput(bv); + + // Verify the output contains expected information + EXPECT_TRUE(output.find("buffer_view{AddressSpace: global") != std::string::npos); + EXPECT_TRUE(output.find("p_data_:") != std::string::npos); + EXPECT_TRUE(output.find("buffer_size_:") != std::string::npos); + EXPECT_TRUE(output.find("invalid_element_value_:") != std::string::npos); + EXPECT_TRUE(output.find("}") != std::string::npos); +} + +TEST_F(PrintBufferViewTest, PrintLdsBufferView) +{ + // Test printing LDS address space buffer_view + float data[4] = {100.f, 200.f, 300.f, 400.f}; + auto bv = make_buffer_view(data, 4); + + std::string output = CapturePrintOutput(bv); + + // Verify the output contains expected information + EXPECT_TRUE(output.find("buffer_view{AddressSpace: lds") != std::string::npos); + EXPECT_TRUE(output.find("p_data_:") != std::string::npos); + EXPECT_TRUE(output.find("buffer_size_:") != std::string::npos); + EXPECT_TRUE(output.find("invalid_element_value_:") != std::string::npos); + EXPECT_TRUE(output.find("}") != std::string::npos); +} + +TEST_F(PrintBufferViewTest, PrintVgprBufferView) +{ + // Test printing VGPR address space buffer_view + float data[4] = {1.5f, 2.5f, 3.5f, 4.5f}; + auto bv = make_buffer_view(data, 4); + + std::string output = CapturePrintOutput(bv); + + // Verify the output contains expected information + EXPECT_TRUE(output.find("buffer_view{AddressSpace: vgpr") != std::string::npos); + EXPECT_TRUE(output.find("p_data_:") != std::string::npos); + EXPECT_TRUE(output.find("buffer_size_:") != std::string::npos); + EXPECT_TRUE(output.find("invalid_element_value_:") != std::string::npos); + EXPECT_TRUE(output.find("}") != std::string::npos); +} + +} // namespace ck_tile diff --git a/test/ck_tile/utility/print/test_print_common.hpp b/test/ck_tile/utility/print/test_print_common.hpp new file mode 100644 index 0000000000..3ba2270802 --- /dev/null +++ b/test/ck_tile/utility/print/test_print_common.hpp @@ -0,0 +1,25 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +#include +#include + +#include "ck_tile/core/utility/print.hpp" + +class PrintTest : public ::testing::Test +{ + protected: + void SetUp() override {} + void TearDown() override {} + // Helper function to capture and return the output of a print function + template + std::string CapturePrintOutput(const T& type) + { + using namespace ck_tile; + testing::internal::CaptureStdout(); + print(type); + return testing::internal::GetCapturedStdout(); + } +}; diff --git a/test/ck_tile/utility/print/test_print_coordinate_transform.cpp b/test/ck_tile/utility/print/test_print_coordinate_transform.cpp new file mode 100644 index 0000000000..639b113eb7 --- /dev/null +++ b/test/ck_tile/utility/print/test_print_coordinate_transform.cpp @@ -0,0 +1,83 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved. + +#include "test_print_common.hpp" +#include "ck_tile/core/algorithm/coordinate_transform.hpp" +#include "ck_tile/core/utility/print.hpp" + +namespace ck_tile { + +class PrintCoordinateTransformTest : public PrintTest +{ +}; + +TEST_F(PrintCoordinateTransformTest, PrintPassThrough) +{ + // Test printing pass_through transform + auto pt = make_pass_through_transform(number<32>{}); + + std::string output = CapturePrintOutput(pt); + + // Verify it contains the pass_through identifier and some structure + EXPECT_TRUE(output.find("pass_through{") == 0); + EXPECT_TRUE(output.find("up_lengths_") != std::string::npos); + EXPECT_TRUE(output.back() == '}'); +} + +TEST_F(PrintCoordinateTransformTest, PrintEmbed) +{ + // Test printing embed transform + auto embed_transform = make_embed_transform(make_tuple(number<4>{}, number<8>{}), + make_tuple(number<1>{}, number<4>{})); + + std::string output = CapturePrintOutput(embed_transform); + + // Verify it contains the embed identifier and key fields + EXPECT_TRUE(output.find("embed{") == 0); + EXPECT_TRUE(output.find("up_lengths_") != std::string::npos); + EXPECT_TRUE(output.find("coefficients_") != std::string::npos); + EXPECT_TRUE(output.back() == '}'); +} + +TEST_F(PrintCoordinateTransformTest, PrintMerge) +{ + // Test printing merge transform + auto merge_transform = make_merge_transform(make_tuple(number<4>{}, number<8>{})); + + std::string output = CapturePrintOutput(merge_transform); + + // Verify it contains merge identifier and key fields + EXPECT_TRUE(output.find("merge") == + 0); // Could be merge_v2_magic_division or merge_v3_division_mod + EXPECT_TRUE(output.find("low_lengths_") != std::string::npos || + output.find("up_lengths_") != std::string::npos); + EXPECT_TRUE(output.back() == '}'); +} + +TEST_F(PrintCoordinateTransformTest, PrintUnmerge) +{ + // Test printing unmerge transform + auto unmerge_transform = make_unmerge_transform(make_tuple(number<4>{}, number<8>{})); + + std::string output = CapturePrintOutput(unmerge_transform); + + // Verify it contains the unmerge identifier and key fields + EXPECT_TRUE(output.find("unmerge{") == 0); + EXPECT_TRUE(output.find("up_lengths_") != std::string::npos); + EXPECT_TRUE(output.back() == '}'); +} + +TEST_F(PrintCoordinateTransformTest, PrintFreeze) +{ + // Test printing freeze transform + auto freeze_transform = make_freeze_transform(number<5>{}); + + std::string output = CapturePrintOutput(freeze_transform); + + // Verify it contains the freeze identifier and key fields + EXPECT_TRUE(output.find("freeze{") == 0); + EXPECT_TRUE(output.find("low_idx_") != std::string::npos); + EXPECT_TRUE(output.back() == '}'); +} + +} // namespace ck_tile diff --git a/test/ck_tile/utility/print/test_print_sequence.cpp b/test/ck_tile/utility/print/test_print_sequence.cpp new file mode 100644 index 0000000000..e73a9f7e33 --- /dev/null +++ b/test/ck_tile/utility/print/test_print_sequence.cpp @@ -0,0 +1,45 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved. + +#include "test_print_common.hpp" +#include "ck_tile/core/utility/print.hpp" +#include "ck_tile/core/container/sequence.hpp" + +namespace ck_tile { + +class PrintSequenceTest : public PrintTest +{ +}; + +TEST_F(PrintSequenceTest, PrintSimpleSequence) +{ + // Test printing sequence<1, 5, 8> + constexpr auto seq = sequence<1, 5, 8>{}; + + std::string output = CapturePrintOutput(seq); + + // Verify the output format + EXPECT_EQ(output, "sequence<1, 5, 8>"); +} + +TEST_F(PrintSequenceTest, PrintSingleElementSequence) +{ + // Test printing sequence<42> + constexpr auto seq = sequence<42>{}; + + std::string output = CapturePrintOutput(seq); + + EXPECT_EQ(output, "sequence<42>"); +} + +TEST_F(PrintSequenceTest, PrintEmptySequence) +{ + // Test printing sequence<> (empty sequence) + constexpr auto seq = sequence<>{}; + + std::string output = CapturePrintOutput(seq); + + EXPECT_EQ(output, "sequence<>"); +} + +} // namespace ck_tile diff --git a/test/ck_tile/utility/print/test_print_static_encoding_pattern.cpp b/test/ck_tile/utility/print/test_print_static_encoding_pattern.cpp new file mode 100644 index 0000000000..d1cb408b5c --- /dev/null +++ b/test/ck_tile/utility/print/test_print_static_encoding_pattern.cpp @@ -0,0 +1,89 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved. + +#include "test_print_common.hpp" +#include "ck_tile/core/algorithm/static_encoding_pattern.hpp" +#include "ck_tile/core/utility/print.hpp" + +#include + +namespace ck_tile { + +class PrintStaticEncodingPatternTest : public PrintTest +{ + protected: + void TestY0Y1Y2(const std::string& output, auto Y0, auto Y1, auto Y2) + { + std::stringstream expected; + expected << ": <" << Y0 << ", " << Y1 << ", " << Y2 << ">"; + EXPECT_TRUE(output.find(expected.str()) != std::string::npos); + } + void TestX0X1(const std::string& output, auto X0, auto X1) + { + std::stringstream expected; + expected << ": <" << X0 << ", " << X1 << ">"; + EXPECT_TRUE(output.find(expected.str()) != std::string::npos); + } +}; + +TEST_F(PrintStaticEncodingPatternTest, PrintThreadRakedPattern) +{ + // Test printing thread raked pattern + using PatternType = + TileDistributionEncodingPattern2D<64, 8, 16, 4, tile_distribution_pattern::thread_raked>; + PatternType pattern; + + std::string output = CapturePrintOutput(pattern); + + // Verify the output contains expected information + EXPECT_TRUE(output.find("TileDistributionEncodingPattern2D") != std::string::npos); + EXPECT_TRUE(output.find("BlockSize:64") != std::string::npos); + EXPECT_TRUE(output.find("YPerTile:8") != std::string::npos); + EXPECT_TRUE(output.find("XPerTile:16") != std::string::npos); + EXPECT_TRUE(output.find("VecSize:4") != std::string::npos); + EXPECT_TRUE(output.find("thread_raked") != std::string::npos); + TestY0Y1Y2(output, PatternType::Y0, PatternType::Y1, PatternType::Y2); + TestX0X1(output, PatternType::X0, PatternType::X1); +} + +TEST_F(PrintStaticEncodingPatternTest, PrintWarpRakedPattern) +{ + // Test printing warp raked pattern + using PatternType = + TileDistributionEncodingPattern2D<128, 16, 32, 8, tile_distribution_pattern::warp_raked>; + PatternType pattern; + + std::string output = CapturePrintOutput(pattern); + + // Verify the output contains expected information + EXPECT_TRUE(output.find("TileDistributionEncodingPattern2D") != std::string::npos); + EXPECT_TRUE(output.find("BlockSize:128") != std::string::npos); + EXPECT_TRUE(output.find("YPerTile:16") != std::string::npos); + EXPECT_TRUE(output.find("XPerTile:32") != std::string::npos); + EXPECT_TRUE(output.find("VecSize:8") != std::string::npos); + EXPECT_TRUE(output.find("warp_raked") != std::string::npos); + TestY0Y1Y2(output, PatternType::Y0, PatternType::Y1, PatternType::Y2); + TestX0X1(output, PatternType::X0, PatternType::X1); +} + +TEST_F(PrintStaticEncodingPatternTest, PrintBlockRakedPattern) +{ + // Test printing block raked pattern + using PatternType = + TileDistributionEncodingPattern2D<256, 32, 64, 16, tile_distribution_pattern::block_raked>; + PatternType pattern; + + std::string output = CapturePrintOutput(pattern); + + // Verify the output contains expected information + EXPECT_TRUE(output.find("TileDistributionEncodingPattern2D") != std::string::npos); + EXPECT_TRUE(output.find("BlockSize:256") != std::string::npos); + EXPECT_TRUE(output.find("YPerTile:32") != std::string::npos); + EXPECT_TRUE(output.find("XPerTile:64") != std::string::npos); + EXPECT_TRUE(output.find("VecSize:16") != std::string::npos); + EXPECT_TRUE(output.find("block_raked") != std::string::npos); + TestY0Y1Y2(output, PatternType::Y0, PatternType::Y1, PatternType::Y2); + TestX0X1(output, PatternType::X0, PatternType::X1); +} + +} // namespace ck_tile diff --git a/test/ck_tile/utility/print/test_print_tuple.cpp b/test/ck_tile/utility/print/test_print_tuple.cpp new file mode 100644 index 0000000000..79aaf1b3af --- /dev/null +++ b/test/ck_tile/utility/print/test_print_tuple.cpp @@ -0,0 +1,66 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved. + +#include "test_print_common.hpp" +#include "ck_tile/core/container/tuple.hpp" +#include "ck_tile/core/numeric/integral_constant.hpp" +#include "ck_tile/core/utility/print.hpp" + +namespace ck_tile { + +class PrintTupleTest : public PrintTest +{ +}; + +TEST_F(PrintTupleTest, PrintSimpleTuple) +{ + // Test printing tuple with numbers + auto tup = make_tuple(number<1>{}, number<5>{}, number<8>{}); + + std::string output = CapturePrintOutput(tup); + + // Verify the output format matches tuple print implementation + EXPECT_TRUE(output.find("tuple<") == 0); + EXPECT_TRUE(output.find("1") != std::string::npos); + EXPECT_TRUE(output.find("5") != std::string::npos); + EXPECT_TRUE(output.find("8") != std::string::npos); + EXPECT_TRUE(output.back() == '>'); +} + +TEST_F(PrintTupleTest, PrintSingleElementTuple) +{ + // Test printing tuple with single element + auto tup = make_tuple(number<42>{}); + + std::string output = CapturePrintOutput(tup); + + EXPECT_TRUE(output.find("tuple<") == 0); + EXPECT_TRUE(output.find("42") != std::string::npos); + EXPECT_TRUE(output.back() == '>'); +} + +TEST_F(PrintTupleTest, PrintEmptyTuple) +{ + // Test printing empty tuple + auto tup = make_tuple(); + + std::string output = CapturePrintOutput(tup); + + EXPECT_EQ(output, "tuple<>"); +} + +TEST_F(PrintTupleTest, PrintMixedTypeTuple) +{ + // Test printing tuple with mixed types (numbers and constants) + auto tup = make_tuple(number<10>{}, constant<20>{}, number<30>{}); + + std::string output = CapturePrintOutput(tup); + + EXPECT_TRUE(output.find("tuple<") == 0); + EXPECT_TRUE(output.find("10") != std::string::npos); + EXPECT_TRUE(output.find("20") != std::string::npos); + EXPECT_TRUE(output.find("30") != std::string::npos); + EXPECT_TRUE(output.back() == '>'); +} + +} // namespace ck_tile diff --git a/test/grouped_convnd_fwd_activation/CMakeLists.txt b/test/grouped_convnd_fwd_activation/CMakeLists.txt index f964325c06..4d5196505c 100644 --- a/test/grouped_convnd_fwd_activation/CMakeLists.txt +++ b/test/grouped_convnd_fwd_activation/CMakeLists.txt @@ -1,4 +1,10 @@ if(GPU_TARGETS MATCHES "gfx9") + add_gtest_executable(test_grouped_convnd_fwd_bias_bnorm_clamp test_grouped_convnd_fwd_bias_bnorm_clamp.cpp) + target_link_libraries(test_grouped_convnd_fwd_bias_bnorm_clamp PRIVATE utility device_grouped_conv2d_fwd_bias_bnorm_clamp_instance device_grouped_conv3d_fwd_bias_bnorm_clamp_instance) + + add_gtest_executable(test_grouped_convnd_fwd_gk_bias_bnorm_clamp test_grouped_convnd_fwd_gk_bias_bnorm_clamp.cpp) + target_link_libraries(test_grouped_convnd_fwd_gk_bias_bnorm_clamp PRIVATE utility device_grouped_conv2d_fwd_bias_bnorm_clamp_instance device_grouped_conv3d_fwd_bias_bnorm_clamp_instance) + add_gtest_executable(test_grouped_convnd_fwd_bias_clamp test_grouped_convnd_fwd_bias_clamp.cpp) target_link_libraries(test_grouped_convnd_fwd_bias_clamp PRIVATE utility device_grouped_conv2d_fwd_bias_clamp_instance device_grouped_conv3d_fwd_bias_clamp_instance) diff --git a/test/grouped_convnd_fwd_activation/test_grouped_convnd_fwd_bias_bnorm_clamp.cpp b/test/grouped_convnd_fwd_activation/test_grouped_convnd_fwd_bias_bnorm_clamp.cpp new file mode 100644 index 0000000000..bf96d11d53 --- /dev/null +++ b/test/grouped_convnd_fwd_activation/test_grouped_convnd_fwd_bias_bnorm_clamp.cpp @@ -0,0 +1,97 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved. + +#include +#include +#include +#include +#include + +#include "profiler/profile_grouped_conv_fwd_bias_bnorm_clamp_impl.hpp" + +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +using BiasNormalizeInInferClamp = ck::tensor_operation::element_wise::BiasNormalizeInInferClamp; + +template +class TestGroupedConvndFwd : public ::testing::Test +{ + protected: + using DataType = std::tuple_element_t<0, Tuple>; + using InLayout = std::tuple_element_t<1, Tuple>; + using WeiLayout = std::tuple_element_t<2, Tuple>; + using OutLayout = std::tuple_element_t<3, Tuple>; + using IndexType = ck::index_t; + + std::vector conv_params; + + template + void Run() + { + EXPECT_FALSE(conv_params.empty()); + bool pass = true; + for(auto& param : conv_params) + { + pass = pass && ck::profiler::profile_grouped_conv_fwd_bias_clamp_impl( + true, // do_verification + 1, // init_method: integer value + false, // do_log + false, // time_kernel + param); + } + EXPECT_TRUE(pass); + } +}; + +using namespace ck::tensor_layout::convolution; + +using KernelTypes2d = ::testing::Types, + std::tuple, + std::tuple>; + +using KernelTypes3d = ::testing::Types, + std::tuple, + std::tuple>; + +template +class TestGroupedConvndFwd2d : public TestGroupedConvndFwd +{ +}; + +template +class TestGroupedConvndFwd3d : public TestGroupedConvndFwd +{ +}; + +TYPED_TEST_SUITE(TestGroupedConvndFwd2d, KernelTypes2d); +TYPED_TEST_SUITE(TestGroupedConvndFwd3d, KernelTypes3d); + +TYPED_TEST(TestGroupedConvndFwd2d, Test2D) +{ + this->conv_params.clear(); + this->conv_params.push_back( + {2, 2, 32, 128, 256, {1, 1}, {7, 7}, {2, 2}, {1, 1}, {0, 0}, {0, 0}}); + this->conv_params.push_back( + {2, 2, 32, 128, 256, {3, 3}, {14, 14}, {1, 1}, {1, 1}, {1, 1}, {1, 1}}); + this->template Run<2>(); +} + +TYPED_TEST(TestGroupedConvndFwd3d, Test3D) +{ + this->conv_params.clear(); + this->conv_params.push_back( + {3, 2, 32, 128, 256, {1, 1, 1}, {7, 7, 7}, {2, 2, 2}, {1, 1, 1}, {0, 0, 0}, {0, 0, 0}}); + this->conv_params.push_back( + {3, 2, 32, 128, 256, {3, 3, 3}, {14, 14, 3}, {1, 1, 1}, {1, 1, 1}, {1, 1, 1}, {1, 1, 1}}); + this->template Run<3>(); +} diff --git a/test/grouped_convnd_fwd_activation/test_grouped_convnd_fwd_gk_bias_bnorm_clamp.cpp b/test/grouped_convnd_fwd_activation/test_grouped_convnd_fwd_gk_bias_bnorm_clamp.cpp new file mode 100644 index 0000000000..2400008ffa --- /dev/null +++ b/test/grouped_convnd_fwd_activation/test_grouped_convnd_fwd_gk_bias_bnorm_clamp.cpp @@ -0,0 +1,98 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved. + +#include +#include +#include +#include +#include + +#include "profiler/profile_grouped_conv_fwd_bias_bnorm_clamp_impl.hpp" + +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +using BiasNormalizeInInferClamp = ck::tensor_operation::element_wise::BiasNormalizeInInferClamp; + +template +class TestGroupedConvndFwd : public ::testing::Test +{ + protected: + using DataType = std::tuple_element_t<0, Tuple>; + using InLayout = std::tuple_element_t<1, Tuple>; + using WeiLayout = std::tuple_element_t<2, Tuple>; + using OutLayout = std::tuple_element_t<3, Tuple>; + using IndexType = ck::index_t; + + std::vector conv_params; + + template + void Run() + { + EXPECT_FALSE(conv_params.empty()); + bool pass = true; + for(auto& param : conv_params) + { + pass = pass && + ck::profiler::profile_grouped_conv_fwd_bias_clamp_impl( + true, // do_verification + 1, // init_method: integer value + false, // do_log + false, // time_kernel + param); + } + EXPECT_TRUE(pass); + } +}; + +using namespace ck::tensor_layout::convolution; + +using KernelTypes2d = ::testing::Types, + std::tuple, + std::tuple>; + +using KernelTypes3d = ::testing::Types, + std::tuple, + std::tuple>; + +template +class TestGroupedConvndFwd2d : public TestGroupedConvndFwd +{ +}; + +template +class TestGroupedConvndFwd3d : public TestGroupedConvndFwd +{ +}; + +TYPED_TEST_SUITE(TestGroupedConvndFwd2d, KernelTypes2d); +TYPED_TEST_SUITE(TestGroupedConvndFwd3d, KernelTypes3d); + +TYPED_TEST(TestGroupedConvndFwd2d, Test2D) +{ + this->conv_params.clear(); + this->conv_params.push_back( + {2, 2, 32, 128, 256, {1, 1}, {7, 7}, {2, 2}, {1, 1}, {0, 0}, {0, 0}}); + this->conv_params.push_back( + {2, 2, 32, 128, 256, {3, 3}, {14, 14}, {1, 1}, {1, 1}, {1, 1}, {1, 1}}); + this->template Run<2>(); +} + +TYPED_TEST(TestGroupedConvndFwd3d, Test3D) +{ + this->conv_params.clear(); + this->conv_params.push_back( + {3, 2, 32, 128, 256, {1, 1, 1}, {7, 7, 7}, {2, 2, 2}, {1, 1, 1}, {0, 0, 0}, {0, 0, 0}}); + this->conv_params.push_back( + {3, 2, 32, 128, 256, {3, 3, 3}, {14, 14, 3}, {1, 1, 1}, {1, 1, 1}, {1, 1, 1}, {1, 1, 1}}); + this->template Run<3>(); +} diff --git a/test/mx_mfma_op/mx_mfma_op.hpp b/test/mx_mfma_op/mx_mfma_op.hpp index 4bb38a0c16..b2e615b9d8 100644 --- a/test/mx_mfma_op/mx_mfma_op.hpp +++ b/test/mx_mfma_op/mx_mfma_op.hpp @@ -187,11 +187,11 @@ __device__ AFragT load_A_col_major(AType const* input_ptr) auto kMinorOffset = col_major(minorStepCoord2D, BLOCK_M); auto kMajorOffset = col_major(majorStepCoord2D, BLOCK_M); - using ARawT = typename scalar_type::type; - using AScalarFragT = - vector_type, ck::f4x2_pk_t> ? 2 : 1)>::type; + using ARawT = typename scalar_type::type; + using AScalarFragT = typename vector_type< + ARawT, + BLOCK_M * BLOCK_K / WAVE_SIZE / + (ck::is_same_v, ck::f4x2_pk_t> ? 2 : 1)>::type; AScalarFragT fragA{}; @@ -319,8 +319,9 @@ __device__ AFragT load_A_row_major(AType const* input_ptr) // Flatten to 1D row_major offsets. auto row_major = [](auto const& coord, auto ld) { return coord.first * ld + coord.second; }; - using ARawT = typename scalar_type::type; - using AScalarChunkT = vector_type::vector_size / num_chunks>::type; + using ARawT = typename scalar_type::type; + using AScalarChunkT = + typename vector_type::vector_size / num_chunks>::type; union { @@ -544,8 +545,9 @@ __device__ BFragT load_B_col_major(BType const* input_ptr) auto majorStepCoord2D = std::make_pair(chunk_offset, 0); // read a chunk from a col - using BRawT = typename scalar_type::type; - using BScalarChunkT = vector_type::vector_size / num_chunks>::type; + using BRawT = typename scalar_type::type; + using BScalarChunkT = + typename vector_type::vector_size / num_chunks>::type; union { @@ -780,7 +782,7 @@ struct store_C_col_major // we can vector store 4 contiguous elements at a time. using CRawT = typename scalar_type::type; - using CScalarFragT = vector_type::type; + using CScalarFragT = typename vector_type::type; union { CFragT frag; @@ -940,12 +942,14 @@ __global__ void matmul(const packed_type_t* a, const packed_type_t assert(threadIdx.x < WAVE_SIZE); assert(blockDim.x == 1 && blockDim.y == 1 && blockDim.z == 1); - using AFragT = vector_type::type; - using BFragT = vector_type::type; + using AFragT = + typename vector_type::type; + using BFragT = + typename vector_type::type; - using CFragT = vector_type::type; + using CFragT = typename vector_type::type; using AccumFragT = vector_type; - using RawAccumFragT = vector_type::type; + using RawAccumFragT = typename vector_type::type; // Create frags auto fragA = AFragT{}; @@ -1019,14 +1023,16 @@ __global__ void matmul(const packed_type_t* a, assert(threadIdx.x < WAVE_SIZE); assert(blockDim.x == 1 && blockDim.y == 1 && blockDim.z == 1); - using AFragT = vector_type::type; - using BFragT = vector_type::type; + using AFragT = + typename vector_type::type; + using BFragT = + typename vector_type::type; - using CFragT = vector_type::type; + using CFragT = typename vector_type::type; using AccumFragT = vector_type; - using RawAccumFragT = vector_type::type; - using AScaleFragT = vector_type::type; - using BScaleFragT = vector_type::type; + using RawAccumFragT = typename vector_type::type; + using AScaleFragT = typename vector_type::type; + using BScaleFragT = typename vector_type::type; // Create frags auto fragA = AFragT{}; diff --git a/tile_engine/ops/gemm/CMakeLists.txt b/tile_engine/ops/gemm/CMakeLists.txt index d8200ed947..fe9b7802a7 100644 --- a/tile_engine/ops/gemm/CMakeLists.txt +++ b/tile_engine/ops/gemm/CMakeLists.txt @@ -1,215 +1,148 @@ + set(GEMM_DATATYPE "fp8;fp16" CACHE STRING "List of datatypes for GEMM (semicolon-separated)") set(GEMM_LAYOUT "rcr" CACHE STRING "List of layout for GEMM (semicolon-separated)") -# Pre-generate all kernel lists to avoid blocking during parallel builds -foreach(dt IN LISTS GEMM_DATATYPE) - foreach(l IN LISTS GEMM_LAYOUT) - set(working_path "${CMAKE_CURRENT_BINARY_DIR}/${dt}/${l}") - file(MAKE_DIRECTORY "${working_path}") - - if (l STREQUAL "rcr") - set(json_blob "${CMAKE_CURRENT_LIST_DIR}/configs/default_config.json") - else() - set(json_blob "${CMAKE_CURRENT_LIST_DIR}/configs/custom_ci_config.json") - endif() - - # Only run if files don't exist - if (NOT EXISTS "${working_path}/gemm_instance_blobs.txt") - execute_process( - COMMAND ${Python3_EXECUTABLE} "${CMAKE_CURRENT_LIST_DIR}/gemm_instance_builder.py" - --working_path "${working_path}" - --datatype "${dt}" - --layout "${l}" - --config_json "${json_blob}" - --list_blobs - RESULT_VARIABLE ret - ) - if (NOT ret EQUAL 0) - message(FATAL_ERROR "Failed to pre-generate kernel list for ${dt} ${l}") - endif() - endif() - endforeach() -endforeach() - function(build_gemm_for_datatype datatype layout) set(working_path "${CMAKE_CURRENT_BINARY_DIR}/${datatype}/${layout}") - if (layout STREQUAL "rcr") + # Comment this if-else block when using user_provided_config + if(layout STREQUAL "rcr") set(json_blob "${CMAKE_CURRENT_LIST_DIR}/configs/default_config.json") else() set(json_blob "${CMAKE_CURRENT_LIST_DIR}/configs/custom_ci_config.json") endif() - # Uncomment to override: - # set(json_blob "${CMAKE_CURRENT_LIST_DIR}/configs/user_provided_config.json") - # Read pre-generated kernel lists + # uncomment this if you want to use user_provided_config.json + # set(json_blob "${CMAKE_CURRENT_LIST_DIR}/configs/user_provided_config.json") + + # Generate kernel list + execute_process( + COMMAND ${Python3_EXECUTABLE} ${CMAKE_CURRENT_LIST_DIR}/gemm_instance_builder.py + --working_path ${working_path} + --datatype ${datatype} + --layout ${layout} + --config_json ${json_blob} + --list_blobs + RESULT_VARIABLE ret + ) + if(NOT ret EQUAL 0) + message(FATAL_ERROR "Failed to list kernels for ${datatype} ${layout}: ${ret}") + endif() + file(STRINGS "${working_path}/gemm_instance_blobs.txt" codegen_blobs) file(STRINGS "${working_path}/gemm_instance_blobs_range.txt" codegen_blobs_range) - + # Generate the blobs add_custom_command( OUTPUT ${codegen_blobs} - COMMAND ${Python3_EXECUTABLE} "${CMAKE_CURRENT_LIST_DIR}/gemm_instance_builder.py" + COMMAND ${Python3_EXECUTABLE} ${CMAKE_CURRENT_LIST_DIR}/gemm_instance_builder.py --working_path "${working_path}" - --datatype "${datatype}" - --layout "${layout}" + --datatype ${datatype} + --layout ${layout} --config_json "${json_blob}" --gen_blobs COMMENT "Generating GEMM instance sources for ${datatype} ${layout}" ) add_custom_target(gemm_gen_${datatype}_${layout} DEPENDS ${codegen_blobs}) - # Parse ranges to identify unique trait names - set(unique_traits) - foreach(range_line IN LISTS codegen_blobs_range) - string(STRIP "${range_line}" stripped_line) - separate_arguments(split_line UNIX_COMMAND "${stripped_line}") - list(GET split_line 0 trait_name) - list(APPEND unique_traits "${trait_name}") - endforeach() - list(REMOVE_DUPLICATES unique_traits) + set(intermediate_libs) + list(LENGTH codegen_blobs codegen_blobs_len) - # Build each trait separately - foreach(trait IN LISTS unique_traits) - set(trait_files) - foreach(range_line IN LISTS codegen_blobs_range) - string(STRIP "${range_line}" stripped_line) - separate_arguments(split_line UNIX_COMMAND "${stripped_line}") - list(GET split_line 0 name) - if (name STREQUAL trait) - list(GET split_line 1 first) - list(GET split_line 2 last) - math(EXPR total_files "${last} - ${first}") - if (total_files GREATER 0) - foreach(j RANGE ${first} ${last}-1) - list(LENGTH codegen_blobs blobs_len) - if (j LESS blobs_len) - list(GET codegen_blobs ${j} f) - list(APPEND trait_files "${f}") - endif() - endforeach() - endif() - endif() - endforeach() + foreach(blob IN LISTS codegen_blobs_range) + string(STRIP "${blob}" stripped_blob) + separate_arguments(spilit_blob UNIX_COMMAND "${stripped_blob}") + # Each line is: + list(GET spilit_blob 0 name) + list(GET spilit_blob 1 first) + list(GET spilit_blob 2 last) + math(EXPR total_files "${last} - ${first}") + if(total_files EQUAL 0) + continue() # nothing for this trait + endif() - if (trait_files) - # Create object libraries with chunking - set(chunk_size 3) # adjust as needed for memory vs parallelism - list(LENGTH trait_files num_files) - math(EXPR num_chunks "( ${num_files} + ${chunk_size} - 1 ) / ${chunk_size}") + # Object libraries (chunked) per trait + set(sub_intermediate_libs) + set(chunk_size 3) + math(EXPR num_chunks "( ${total_files} + ${chunk_size} - 1 ) / ${chunk_size}") + math(EXPR num_chunks_minus_1 "${num_chunks} - 1") + + foreach(i RANGE 0 ${num_chunks_minus_1}) + math(EXPR start "${first} + ${i} * ${chunk_size} ") + math(EXPR end "${start} + ${chunk_size} - 1") - set(trait_obj_libs) - foreach(i RANGE 0 ${num_chunks}-1) - math(EXPR start "${i} * ${chunk_size}") - math(EXPR end "${start} + ${chunk_size} - 1") - - set(chunk_files) - foreach(j RANGE ${start} ${end}) - if (j LESS ${num_files}) - list(GET trait_files ${j} f) - list(APPEND chunk_files "${f}") - endif() - endforeach() - - if (chunk_files) - set(obj_lib_name "gemm_obj_${trait}_${i}_${datatype}_${layout}") - add_library(${obj_lib_name} OBJECT ${chunk_files}) - add_dependencies(${obj_lib_name} gemm_gen_${datatype}_${layout}) - - target_compile_options(${obj_lib_name} PRIVATE - -Wno-undefined-func-template - -Wno-float-equal - --offload-compress - -O3 - -fno-exceptions - ) - - set_target_properties(${obj_lib_name} PROPERTIES - UNITY_BUILD ON - UNITY_BUILD_BATCH_SIZE 2 - ) - - list(APPEND trait_obj_libs "${obj_lib_name}") + set(chunk_files) + foreach(j RANGE ${start} ${end}) + if(j LESS ${last} AND j LESS ${codegen_blobs_len}) + list(GET codegen_blobs ${j} f) + list(APPEND chunk_files "${f}") endif() endforeach() - # Static library for this trait - if (trait_obj_libs) - set(trait_lib_name "gemm_lib_${trait}_${datatype}_${layout}") - set(obj_exprs) - foreach(objlib IN LISTS trait_obj_libs) - list(APPEND obj_exprs "$") - endforeach() - - add_library(${trait_lib_name} STATIC ${obj_exprs}) - add_dependencies(${trait_lib_name} gemm_gen_${datatype}_${layout}) - - # Trait-specific executable - set(exec_name "benchmark_gemm_${datatype}_${layout}_${trait}") - add_executable(${exec_name} benchmark_gemm.cpp) - target_link_libraries(${exec_name} PRIVATE ${trait_lib_name}) - target_include_directories(${exec_name} PRIVATE - "${CMAKE_CURRENT_LIST_DIR}" - "${working_path}" - ) - target_compile_definitions(${exec_name} PRIVATE - GEMM_TRAIT_FILTER="${trait}" - ) - target_compile_options(${exec_name} PRIVATE - -Wno-undefined-func-template - -Wno-float-equal - --offload-compress - ) + #list(LENGTH chunk_files chunk_files_len) + #if(chunk_files_len AND chunk_files_len GREATER 1) + if(chunk_files) + set(sub_intermediate_lib_name "gemm_objlib_${name}_${i}_${datatype}_${layout}") + add_library(${sub_intermediate_lib_name} OBJECT ${chunk_files}) + list(APPEND sub_intermediate_libs ${sub_intermediate_lib_name}) endif() - endif() - endforeach() - # Master executable including all traits - set(all_trait_libs) - foreach(trait IN LISTS unique_traits) - if (TARGET gemm_lib_${trait}_${datatype}_${layout}) - list(APPEND all_trait_libs "gemm_lib_${trait}_${datatype}_${layout}") - endif() - endforeach() + endforeach() - if (all_trait_libs) - add_executable(benchmark_gemm_${datatype}_${layout} benchmark_gemm.cpp) - target_link_libraries(benchmark_gemm_${datatype}_${layout} PRIVATE ${all_trait_libs}) - target_include_directories(benchmark_gemm_${datatype}_${layout} PRIVATE - "${CMAKE_CURRENT_LIST_DIR}" - "${working_path}" - ) - target_compile_options(benchmark_gemm_${datatype}_${layout} PRIVATE - -Wno-undefined-func-template - -Wno-float-equal - --offload-compress - ) - endif() + # ------------------ Bundle the object libs into one static lib --------- + #list(LENGTH sub_intermediate_libs sub_intermediate_libs_len) + #if(sub_intermediate_libs AND sub_intermediate_libs_len GREATER 1) + if(sub_intermediate_libs) + set(intermediate_lib_name "gemm_staticlib_${name}_${datatype}_${layout}") + # Collect the $ expressions + + set(obj_exprs) + foreach(objlib IN LISTS sub_intermediate_libs) + list(APPEND obj_exprs $) + endforeach() + + add_library(${intermediate_lib_name} STATIC ${obj_exprs}) + add_dependencies(${intermediate_lib_name} gemm_gen_${datatype}_${layout}) + #foreach(objlib IN LISTS sub_intermediate_libs) + # target_sources(${intermediate_lib_name} PRIVATE $) + #endforeach() + list(APPEND intermediate_libs ${intermediate_lib_name}) + endif() + + endforeach() + + # Interface library for instances + add_library(gemm_template_instances_${datatype}_${layout} INTERFACE) + add_dependencies(gemm_template_instances_${datatype}_${layout} gemm_gen_${datatype}_${layout}) + target_link_libraries(gemm_template_instances_${datatype}_${layout} INTERFACE ${intermediate_libs}) + target_include_directories(gemm_template_instances_${datatype}_${layout} INTERFACE + ${CMAKE_CURRENT_LIST_DIR} + "${working_path}" + ) + set_target_properties(gemm_template_instances_${datatype}_${layout} PROPERTIES LINKER_LANGUAGE CXX) + + # Host API interface library + add_library(gemm_host_api_${datatype}_${layout} INTERFACE) + target_link_libraries(gemm_host_api_${datatype}_${layout} INTERFACE gemm_template_instances_${datatype}_${layout}) + target_include_directories(gemm_host_api_${datatype}_${layout} INTERFACE + ${CMAKE_CURRENT_LIST_DIR} + "${working_path}" + ) + + + # Executable per datatype + set(exec_name "benchmark_gemm_${datatype}_${layout}") + add_executable(${exec_name} benchmark_gemm.cpp) + target_link_libraries(${exec_name} PRIVATE gemm_host_api_${datatype}_${layout}) + target_compile_options(${exec_name} PRIVATE + -Wno-undefined-func-template + -Wno-float-equal + --offload-compress + ) endfunction() -# Process each datatype/layout +# Process each datatype in isolation foreach(dt IN LISTS GEMM_DATATYPE) foreach(l IN LISTS GEMM_LAYOUT) - build_gemm_for_datatype("${dt}" "${l}") + build_gemm_for_datatype(${dt} ${l}) endforeach() endforeach() - -# Master target for parallel builds -set(ALL_GEMM_TARGETS) -foreach(dt IN LISTS GEMM_DATATYPE) - foreach(l IN LISTS GEMM_LAYOUT) - list(APPEND ALL_GEMM_TARGETS "benchmark_gemm_${dt}_${l}") - endforeach() -endforeach() -add_custom_target(benchmark_gemm_all DEPENDS ${ALL_GEMM_TARGETS}) - -# Use faster linker if available -find_program(LLD_LINKER "ld.lld") -find_program(MOLD_LINKER "mold") -if (MOLD_LINKER) - message(STATUS "Using mold linker for faster linking") - add_link_options(-fuse-ld=mold) -elseif (LLD_LINKER) - message(STATUS "Using lld linker for faster linking") - add_link_options(-fuse-ld=lld) -endif() \ No newline at end of file diff --git a/tile_engine/ops/gemm/codegen_utils.py b/tile_engine/ops/gemm/codegen_utils.py index 9ff76724cc..4a990f3309 100644 --- a/tile_engine/ops/gemm/codegen_utils.py +++ b/tile_engine/ops/gemm/codegen_utils.py @@ -65,93 +65,6 @@ CSHUFFLE_EPILOGUE = """ UniversalGemmProblem::TransposeC, memory_operation>>; """ -HOT_LOOP_FALSE = """ - if(tail_num == ck_tile::TailNumber::Full) - { - RunSplitk(ck_tile::bool_constant{}, - ck_tile::integral_constant{}); - } - else if(tail_num == ck_tile::TailNumber::Odd) - { - RunSplitk(ck_tile::bool_constant{}, - ck_tile::integral_constant{}); - } - else if(tail_num == ck_tile::TailNumber::Even) - { - RunSplitk(ck_tile::bool_constant{}, - ck_tile::integral_constant{}); - } - else - { - throw std::runtime_error("Num K loop must be larger than number of prefetech stages."); - } -""" -RUN_MEM = """ - // Handle One and Full cases directly - if (tail_num == ck_tile::TailNumber::One) { - RunSplitk(ck_tile::bool_constant{}, - ck_tile::integral_constant{}); - } else if (tail_num == ck_tile::TailNumber::Full) { - RunSplitk(ck_tile::bool_constant{}, - ck_tile::integral_constant{}); - } - - auto check_tail = [&](auto... TNs) { - ([&]{ - if constexpr(BaseGemmPipeline::PrefetchStages > static_cast(decltype(TNs)::value)) { - if(tail_num == decltype(TNs)::value) { - RunSplitk(ck_tile::bool_constant{}, - ck_tile::integral_constant{}); - } - } - }(), ...); - }; - - check_tail( - ck_tile::integral_constant{}, - ck_tile::integral_constant{}, - ck_tile::integral_constant{}, - ck_tile::integral_constant{}, - ck_tile::integral_constant{}, - ck_tile::integral_constant{} - ); -""" - -RUN_COMPV3 = """ - if(tail_num == ck_tile::TailNumber::Full) - { - RunSplitk(ck_tile::bool_constant{}, - ck_tile::integral_constant{}); - } - else if(tail_num == ck_tile::TailNumber::Odd) - { - RunSplitk(ck_tile::bool_constant{}, - ck_tile::integral_constant{}); - } - else if(tail_num == ck_tile::TailNumber::Even) - { - RunSplitk(ck_tile::bool_constant{}, - ck_tile::integral_constant{}); - } - else - { - throw std::runtime_error("The tail number is wrong. It should be Full, Odd, or Even."); - } -""" - -RUN_COMPV4 = """ - if(tail_num == ck_tile::TailNumber::Three) - { - RunSplitk(ck_tile::bool_constant{}, - ck_tile::integral_constant{}); - } - else - { - RunSplitk(ck_tile::bool_constant{}, - ck_tile::integral_constant{}); - } -""" - PIPELINE_MAP = { "mem": ["ck_tile::BaseGemmPipelineAgBgCrMem", "ck_tile::GemmPipelineAgBgCrMem"], @@ -172,8 +85,6 @@ SCHEDULER_MAP = { EPILOGUE_MAP = {"default": DEFAULT_EPILOGUE, "cshuffle": CSHUFFLE_EPILOGUE} -HOT_LOOP_TRUE = {"mem": RUN_MEM, "compv3": RUN_COMPV3, "compv4": RUN_COMPV4} - def BOOL_MAP(b_): return {True: "true", False: "false"}[bool(b_)] diff --git a/tile_engine/ops/gemm/configs/benchmark.json b/tile_engine/ops/gemm/configs/benchmark.json index 1560698b77..def3ca4453 100644 --- a/tile_engine/ops/gemm/configs/benchmark.json +++ b/tile_engine/ops/gemm/configs/benchmark.json @@ -96,6 +96,12 @@ "values": [ false ] + }, + "persistent": { + "values": [ + false, + true + ] } } } \ No newline at end of file diff --git a/tile_engine/ops/gemm/configs/custom_ci_config.json b/tile_engine/ops/gemm/configs/custom_ci_config.json index 9187fb01eb..ca6c7230fd 100644 --- a/tile_engine/ops/gemm/configs/custom_ci_config.json +++ b/tile_engine/ops/gemm/configs/custom_ci_config.json @@ -77,6 +77,12 @@ "values": [ false ] + }, + "persistent": { + "values": [ + false, + true + ] } } } \ No newline at end of file diff --git a/tile_engine/ops/gemm/configs/default_config.json b/tile_engine/ops/gemm/configs/default_config.json index 12a8ddd4b7..5bd51b809a 100644 --- a/tile_engine/ops/gemm/configs/default_config.json +++ b/tile_engine/ops/gemm/configs/default_config.json @@ -95,6 +95,11 @@ "values": [ false ] + }, + "persistent": { + "values": [ + false + ] } } -} \ No newline at end of file +} diff --git a/tile_engine/ops/gemm/configs/user_provided_config.json b/tile_engine/ops/gemm/configs/user_provided_config.json index 5761b39ada..76e194f6b9 100644 --- a/tile_engine/ops/gemm/configs/user_provided_config.json +++ b/tile_engine/ops/gemm/configs/user_provided_config.json @@ -82,6 +82,12 @@ "values": [ false ] + }, + "persistent": { + "values": [ + false, + true + ] } } } \ No newline at end of file diff --git a/tile_engine/ops/gemm/gemm_host_api.hpp b/tile_engine/ops/gemm/gemm_host_api.hpp index 2c4af8955f..f28f5dd29c 100644 --- a/tile_engine/ops/gemm/gemm_host_api.hpp +++ b/tile_engine/ops/gemm/gemm_host_api.hpp @@ -144,7 +144,8 @@ inline auto create_args(int argc, char* argv[]) .insert("pad_k", "false", "Whether pad or not in k direction. Possible values are true or false. Default is " - "false."); + "false.") + .insert("persistent", "false", "Whether to use persistent kernel. Default is false."); bool result = arg_parser.parse(argc, argv); return std::make_tuple(result, arg_parser); @@ -208,12 +209,13 @@ void permute_vectors_i4x4_b(Tensor& tensor) auto get_kernel_func_by_trait(const ck_tile::ArgParser& arg_parser) { KernelTraits trait; - trait.pipeline = arg_parser.get_str("pipeline"); - trait.scheduler = arg_parser.get_str("scheduler"); - trait.epilogue = arg_parser.get_str("epilogue"); - trait.pad_m = arg_parser.get_bool("pad_m"); - trait.pad_n = arg_parser.get_bool("pad_n"); - trait.pad_k = arg_parser.get_bool("pad_k"); + trait.pipeline = arg_parser.get_str("pipeline"); + trait.scheduler = arg_parser.get_str("scheduler"); + trait.epilogue = arg_parser.get_str("epilogue"); + trait.pad_m = arg_parser.get_bool("pad_m"); + trait.pad_n = arg_parser.get_bool("pad_n"); + trait.pad_k = arg_parser.get_bool("pad_k"); + trait.persistent = arg_parser.get_bool("persistent"); bool structured_sparsity = arg_parser.get_bool("structured_sparsity"); diff --git a/tile_engine/ops/gemm/gemm_instance_builder.py b/tile_engine/ops/gemm/gemm_instance_builder.py index 4a35a2bcd3..6d713bdcb8 100755 --- a/tile_engine/ops/gemm/gemm_instance_builder.py +++ b/tile_engine/ops/gemm/gemm_instance_builder.py @@ -15,16 +15,9 @@ from json_config import GemmConfig, RangeConfigParam from codegen_utils import ( DATA_TYPE_MAP, LAYOUT_MAP, - DEFAULT_EPILOGUE, - CSHUFFLE_EPILOGUE, - HOT_LOOP_FALSE, - RUN_MEM, - RUN_COMPV3, - RUN_COMPV4, PIPELINE_MAP, SCHEDULER_MAP, EPILOGUE_MAP, - HOT_LOOP_TRUE, BOOL_MAP, warp_tile_supported_combinations, trait_unsupported_combinations, @@ -114,7 +107,7 @@ class GemmCodeGenerator: def _generate_all_traits(self): """Generate all possible kernel traits names.""" - params = ["pipeline", "epilogue", "scheduler", "pad_m", "pad_n", "pad_k"] + params = ["pipeline", "epilogue", "scheduler", "pad_m", "pad_n", "pad_k", "persistent"] # Generate all unique_combinations _unique = set( @@ -124,13 +117,14 @@ class GemmCodeGenerator: ) for combo in _unique: - pipeline, epilogue, scheduler, pad_m, pad_n, pad_k = combo + pipeline, epilogue, scheduler, pad_m, pad_n, pad_k, persistent = combo current_combination = (pipeline, epilogue, scheduler) if current_combination not in trait_unsupported_combinations: trait_name = ( f"{pipeline}_{epilogue}_{scheduler}_" - f"{BOOL_MAP(pad_m)}_{BOOL_MAP(pad_n)}_{BOOL_MAP(pad_k)}" + f"{BOOL_MAP(pad_m)}_{BOOL_MAP(pad_n)}_{BOOL_MAP(pad_k)}_" + f"{BOOL_MAP(persistent)}" ) self.valid_trait_names.append(trait_name) else: @@ -189,7 +183,7 @@ using CLayout = {LAYOUT_MAP[self.config.problem.layout_map["matrix_c"]]}; def _generate_trait_file(self, trait: str): """Generate a trait with all tile/warp combinations.""" - pipeline, epilogue, scheduler, pad_m, pad_n, pad_k = trait.split("_") + pipeline, epilogue, scheduler, pad_m, pad_n, pad_k, persistent = trait.split("_") filename = f"gemm_{trait}.hpp" content = f"""// SPDX-License-Identifier: MIT @@ -206,8 +200,7 @@ namespace {trait} {{ """ # Add template struct with configuration content += self._generate_kernel_struct( - pipeline, epilogue, scheduler, pad_m, pad_n, pad_k - ) + pipeline, epilogue, scheduler, pad_m, pad_n, pad_k, persistent) content += f"\n}} // namespace {trait}\n" (self.output_dir / filename).write_text(content) @@ -220,6 +213,7 @@ namespace {trait} {{ pad_m: str, pad_n: str, pad_k: str, + persistent: str, ) -> str: """Generate the code block of kernel struct""" return f""" @@ -229,9 +223,10 @@ template struct GemmKernel {{ - static constexpr bool kPadM = {pad_m}; - static constexpr bool kPadN = {pad_n}; - static constexpr bool kPadK = {pad_k}; + static constexpr bool kPadM = {pad_m}; + static constexpr bool kPadN = {pad_n}; + static constexpr bool kPadK = {pad_k}; + static constexpr bool kPersistent = {persistent}; static float launch(ck_tile::GemmHostArgs& args, const ck_tile::stream_config& stream) {{ static constexpr bool permuteA = false; @@ -250,7 +245,6 @@ struct GemmKernel {{ permuteA, permuteB>; - using TilePartitioner = ck_tile::GemmSpatiallyLocalTilePartitioner; + ALayout, BLayout, CLayout, TransposeC, + structured_sparsity, kPersistent>; using GemmPipelineProblem = ck_tile::GemmPipelineProblem; @@ -297,14 +292,14 @@ struct GemmKernel {{ using Kernel = ck_tile::GemmKernel; auto kargs = Kernel::MakeKernelArgs(args); - const dim3 grids = Kernel::GridSize(args.M, args.N, args.k_batch); - constexpr dim3 blocks = Kernel::BlockSize(); - if(!Kernel::IsSupportedArgument(kargs)) {{ throw std::runtime_error("Wrong! Arguments not supported! Skipping gemm!"); }} + constexpr dim3 blocks = Kernel::BlockSize(); + const dim3 grids = {'Kernel::MaxOccupancyGridSize(stream)' if persistent == 'true' else 'Kernel::GridSize(args.M, args.N, args.k_batch)'}; + if(stream.log_level_ > 0) {{ std::cout << "Launching kernel with args:" @@ -377,11 +372,7 @@ struct GemmKernel {{ }} }}; - if(has_hot_loop) {{ - {HOT_LOOP_TRUE[pipeline]} - }} else {{ - {HOT_LOOP_FALSE} - }} + BaseGemmPipeline::TailHandler(RunSplitk, has_hot_loop, tail_num); return ave_time; }} @@ -395,7 +386,8 @@ struct GemmKernel {{ "{pad_k}" + "_" + "{pipeline}" + "_" + "{epilogue}" + "_" + - "{scheduler}"; + "{scheduler}" + "_" + + "{persistent}"; }} }}; """ @@ -673,6 +665,8 @@ struct KernelTraits bool pad_n; /// @brief Indicates whether padding is applied to the K dimension. bool pad_k; + /// @brief Indicates whether the kernel is persistent. + bool persistent; }; struct GemmDispatcher { @@ -773,7 +767,8 @@ private: trait.scheduler + "_" + (trait.pad_m ? "true" : "false") + "_" + (trait.pad_n ? "true" : "false") + "_" + - (trait.pad_k ? "true" : "false"); + (trait.pad_k ? "true" : "false") + "_" + + (trait.persistent ? "true" : "false"); } }; diff --git a/tile_engine/ops/gemm/json_config.py b/tile_engine/ops/gemm/json_config.py index 675a2052ef..04f2dd4890 100644 --- a/tile_engine/ops/gemm/json_config.py +++ b/tile_engine/ops/gemm/json_config.py @@ -107,6 +107,7 @@ class TraitConfig: pad_m: EnumConfigParam pad_n: EnumConfigParam pad_k: EnumConfigParam + persistent: EnumConfigParam @dataclass @@ -215,6 +216,9 @@ class GemmConfig: pad_k=EnumConfigParam( values=config_dict["trait_config"]["pad_k"]["values"] ), + persistent=EnumConfigParam( + values=config_dict["trait_config"]["persistent"]["values"] + ), ) return cls(