From ef3e871e6ee2aa09ad7eae87277df0b2cd5b0e4b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ville=20Pietil=C3=A4?= Date: Fri, 17 Oct 2025 10:47:23 +0000 Subject: [PATCH] Add grouped conv fwd direction profiling into CK Tile profiler. --- .../gpu/gemm_configs.hpp | 1 - ...grouped_conv_bwd_weight_bf16_instances.hpp | 54 ++-- ... tile_grouped_conv_bwd_weight_factory.hpp} | 2 - ...grouped_conv_bwd_weight_fp16_instances.hpp | 54 ++-- .../tile_grouped_conv_fwd_bf16_instances.hpp | 57 ++++ .../gpu/tile_grouped_conv_fwd_factory.hpp | 87 ++++++ .../tile_grouped_conv_fwd_fp16_instances.hpp | 57 ++++ .../gpu/tile_grouped_conv_fwd_instances.hpp | 66 +++++ .../gpu/tile_grouped_conv_fwd_invoker.hpp | 277 ++++++++++++++++++ .../tile_grouped_conv_instance_factory.hpp | 3 + ...e_profile_grouped_conv_bwd_weight_impl.hpp | 4 +- .../tile_profile_grouped_conv_fwd_impl.hpp | 233 +++++++++++++++ .../tile_profiler_operation_registry.hpp | 2 + profiler/ck_tile/src/CMakeLists.txt | 6 +- .../src/tile_profile_grouped_conv_fwd.cpp | 237 +++++++++++++++ 15 files changed, 1076 insertions(+), 64 deletions(-) rename library/include/ck_tile/library/tensor_operation_instance/gpu/{tile_grouped_conv_factory.hpp => tile_grouped_conv_bwd_weight_factory.hpp} (98%) create mode 100644 library/include/ck_tile/library/tensor_operation_instance/gpu/tile_grouped_conv_fwd_bf16_instances.hpp create mode 100644 library/include/ck_tile/library/tensor_operation_instance/gpu/tile_grouped_conv_fwd_factory.hpp create mode 100644 library/include/ck_tile/library/tensor_operation_instance/gpu/tile_grouped_conv_fwd_fp16_instances.hpp create mode 100644 library/include/ck_tile/library/tensor_operation_instance/gpu/tile_grouped_conv_fwd_instances.hpp create mode 100644 library/include/ck_tile/library/tensor_operation_instance/gpu/tile_grouped_conv_fwd_invoker.hpp create mode 100644 profiler/ck_tile/include/tile_profile_grouped_conv_fwd_impl.hpp create mode 100644 profiler/ck_tile/src/tile_profile_grouped_conv_fwd.cpp diff --git 
a/library/include/ck_tile/library/tensor_operation_instance/gpu/gemm_configs.hpp b/library/include/ck_tile/library/tensor_operation_instance/gpu/gemm_configs.hpp index 509486569b..c0fca96d9d 100644 --- a/library/include/ck_tile/library/tensor_operation_instance/gpu/gemm_configs.hpp +++ b/library/include/ck_tile/library/tensor_operation_instance/gpu/gemm_configs.hpp @@ -10,7 +10,6 @@ #include "ck_tile/host/kernel_launch.hpp" #include "ck_tile/ops/epilogue.hpp" #include "ck_tile/ops/gemm.hpp" -#include "ck_tile/utility/json_dump.hpp" #define CK_TILE_PIPELINE_COMPUTE_V3 1 #define CK_TILE_PIPELINE_MEMORY 2 diff --git a/library/include/ck_tile/library/tensor_operation_instance/gpu/tile_grouped_conv_bwd_weight_bf16_instances.hpp b/library/include/ck_tile/library/tensor_operation_instance/gpu/tile_grouped_conv_bwd_weight_bf16_instances.hpp index 770acb1e47..ed8bbe903e 100644 --- a/library/include/ck_tile/library/tensor_operation_instance/gpu/tile_grouped_conv_bwd_weight_bf16_instances.hpp +++ b/library/include/ck_tile/library/tensor_operation_instance/gpu/tile_grouped_conv_bwd_weight_bf16_instances.hpp @@ -19,36 +19,36 @@ using tile_grouped_conv_bwd_weight_bf16_instances = std::tuple< //#####################################| Num| InLayout| WeiLayout| OutLayout| InData| WeiData| OutData| In| Wei| Out| K-block| M-tile| N-tile | K-tile | M-warp| N-warp| K-warp| M-warp| N-warp| K-warp| Vector| Vector| Vector| Double| GEMM| //#####################################| Dim| | | | Type| Type| Type| Elementwise| Elementwise| Elementwise| per| | | | | | | tile| tile| tile| size| size| size| smem| pipeline| //#####################################| Spatial| | | | | | | Operation| Operation| Operation| CU| | | | | | | size| size| size| A| B| C| buffer| version| - GroupedConvolutionBackwardWeightInvoker, - GroupedConvolutionBackwardWeightInvoker, - GroupedConvolutionBackwardWeightInvoker, - GroupedConvolutionBackwardWeightInvoker, - GroupedConvolutionBackwardWeightInvoker, - 
GroupedConvolutionBackwardWeightInvoker, - GroupedConvolutionBackwardWeightInvoker, - GroupedConvolutionBackwardWeightInvoker, + // GroupedConvolutionBackwardWeightInvoker, + // GroupedConvolutionBackwardWeightInvoker, + // GroupedConvolutionBackwardWeightInvoker, + // GroupedConvolutionBackwardWeightInvoker, + // GroupedConvolutionBackwardWeightInvoker, + // GroupedConvolutionBackwardWeightInvoker, + // GroupedConvolutionBackwardWeightInvoker, + // GroupedConvolutionBackwardWeightInvoker, - GroupedConvolutionBackwardWeightInvoker, - GroupedConvolutionBackwardWeightInvoker, - GroupedConvolutionBackwardWeightInvoker, - GroupedConvolutionBackwardWeightInvoker, - GroupedConvolutionBackwardWeightInvoker, - GroupedConvolutionBackwardWeightInvoker, + // GroupedConvolutionBackwardWeightInvoker, + // GroupedConvolutionBackwardWeightInvoker, + // GroupedConvolutionBackwardWeightInvoker, + // GroupedConvolutionBackwardWeightInvoker, + // GroupedConvolutionBackwardWeightInvoker, + // GroupedConvolutionBackwardWeightInvoker, - GroupedConvolutionBackwardWeightInvoker, - GroupedConvolutionBackwardWeightInvoker, - GroupedConvolutionBackwardWeightInvoker, - GroupedConvolutionBackwardWeightInvoker, - GroupedConvolutionBackwardWeightInvoker, - GroupedConvolutionBackwardWeightInvoker, - GroupedConvolutionBackwardWeightInvoker, - GroupedConvolutionBackwardWeightInvoker, + // GroupedConvolutionBackwardWeightInvoker, + // GroupedConvolutionBackwardWeightInvoker, + // GroupedConvolutionBackwardWeightInvoker, + // GroupedConvolutionBackwardWeightInvoker, + // GroupedConvolutionBackwardWeightInvoker, + // GroupedConvolutionBackwardWeightInvoker, + // GroupedConvolutionBackwardWeightInvoker, + // GroupedConvolutionBackwardWeightInvoker, - GroupedConvolutionBackwardWeightInvoker, - GroupedConvolutionBackwardWeightInvoker, - GroupedConvolutionBackwardWeightInvoker, - GroupedConvolutionBackwardWeightInvoker, - GroupedConvolutionBackwardWeightInvoker, + // 
GroupedConvolutionBackwardWeightInvoker, + // GroupedConvolutionBackwardWeightInvoker, + // GroupedConvolutionBackwardWeightInvoker, + // GroupedConvolutionBackwardWeightInvoker, + // GroupedConvolutionBackwardWeightInvoker, GroupedConvolutionBackwardWeightInvoker // clang-format on >; diff --git a/library/include/ck_tile/library/tensor_operation_instance/gpu/tile_grouped_conv_factory.hpp b/library/include/ck_tile/library/tensor_operation_instance/gpu/tile_grouped_conv_bwd_weight_factory.hpp similarity index 98% rename from library/include/ck_tile/library/tensor_operation_instance/gpu/tile_grouped_conv_factory.hpp rename to library/include/ck_tile/library/tensor_operation_instance/gpu/tile_grouped_conv_bwd_weight_factory.hpp index 2b17f91260..ba287bce91 100644 --- a/library/include/ck_tile/library/tensor_operation_instance/gpu/tile_grouped_conv_factory.hpp +++ b/library/include/ck_tile/library/tensor_operation_instance/gpu/tile_grouped_conv_bwd_weight_factory.hpp @@ -15,8 +15,6 @@ namespace ck_tile { namespace ops { -template -struct DeviceOperationInstanceFactory; template , - GroupedConvolutionBackwardWeightInvoker, - GroupedConvolutionBackwardWeightInvoker, - GroupedConvolutionBackwardWeightInvoker, - GroupedConvolutionBackwardWeightInvoker, - GroupedConvolutionBackwardWeightInvoker, - GroupedConvolutionBackwardWeightInvoker, - GroupedConvolutionBackwardWeightInvoker, + // GroupedConvolutionBackwardWeightInvoker, + // GroupedConvolutionBackwardWeightInvoker, + // GroupedConvolutionBackwardWeightInvoker, + // GroupedConvolutionBackwardWeightInvoker, + // GroupedConvolutionBackwardWeightInvoker, + // GroupedConvolutionBackwardWeightInvoker, + // GroupedConvolutionBackwardWeightInvoker, + // GroupedConvolutionBackwardWeightInvoker, - GroupedConvolutionBackwardWeightInvoker, - GroupedConvolutionBackwardWeightInvoker, - GroupedConvolutionBackwardWeightInvoker, - GroupedConvolutionBackwardWeightInvoker, - GroupedConvolutionBackwardWeightInvoker, - 
GroupedConvolutionBackwardWeightInvoker, + // GroupedConvolutionBackwardWeightInvoker, + // GroupedConvolutionBackwardWeightInvoker, + // GroupedConvolutionBackwardWeightInvoker, + // GroupedConvolutionBackwardWeightInvoker, + // GroupedConvolutionBackwardWeightInvoker, + // GroupedConvolutionBackwardWeightInvoker, - GroupedConvolutionBackwardWeightInvoker, - GroupedConvolutionBackwardWeightInvoker, - GroupedConvolutionBackwardWeightInvoker, - GroupedConvolutionBackwardWeightInvoker, - GroupedConvolutionBackwardWeightInvoker, - GroupedConvolutionBackwardWeightInvoker, - GroupedConvolutionBackwardWeightInvoker, - GroupedConvolutionBackwardWeightInvoker, + // GroupedConvolutionBackwardWeightInvoker, + // GroupedConvolutionBackwardWeightInvoker, + // GroupedConvolutionBackwardWeightInvoker, + // GroupedConvolutionBackwardWeightInvoker, + // GroupedConvolutionBackwardWeightInvoker, + // GroupedConvolutionBackwardWeightInvoker, + // GroupedConvolutionBackwardWeightInvoker, + // GroupedConvolutionBackwardWeightInvoker, - GroupedConvolutionBackwardWeightInvoker, - GroupedConvolutionBackwardWeightInvoker, - GroupedConvolutionBackwardWeightInvoker, - GroupedConvolutionBackwardWeightInvoker, - GroupedConvolutionBackwardWeightInvoker, + // GroupedConvolutionBackwardWeightInvoker, + // GroupedConvolutionBackwardWeightInvoker, + // GroupedConvolutionBackwardWeightInvoker, + // GroupedConvolutionBackwardWeightInvoker, + // GroupedConvolutionBackwardWeightInvoker, GroupedConvolutionBackwardWeightInvoker // clang-format on >; diff --git a/library/include/ck_tile/library/tensor_operation_instance/gpu/tile_grouped_conv_fwd_bf16_instances.hpp b/library/include/ck_tile/library/tensor_operation_instance/gpu/tile_grouped_conv_fwd_bf16_instances.hpp new file mode 100644 index 0000000000..a941133f3a --- /dev/null +++ b/library/include/ck_tile/library/tensor_operation_instance/gpu/tile_grouped_conv_fwd_bf16_instances.hpp @@ -0,0 +1,57 @@ +// Copyright (c) 2025, Advanced Micro Devices, Inc. 
All rights reserved. + +#pragma once + +#include "ck_tile/library/tensor_operation_instance/gpu/tile_grouped_conv_instance_factory.hpp" +#include "ck_tile/library/tensor_operation_instance/gpu/tile_grouped_conv_fwd_invoker.hpp" + +namespace ck_tile { +namespace ops { + +using BF16 = ck_tile::bfloat16_t; + +template +using tile_grouped_conv_fwd_bf16_instances = std::tuple< +// clang-format off + //##############################| Num| InLayout| WeiLayout| OutLayout| InData| WeiData| OutData| In| Wei| Out| K-block| M-tile| N-tile | K-tile | M-warp| N-warp| K-warp| M-warp| N-warp| K-warp| Vector| Vector| Vector| Double| GEMM| + //##############################| Dim| | | | Type| Type| Type| Elementwise| Elementwise| Elementwise| per| | | | | | | tile| tile| tile| size| size| size| smem| pipeline| + //##############################| Spatial| | | | | | | Operation| Operation| Operation| CU| | | | | | | size| size| size| A| B| C| buffer| version| + GroupedConvolutionForwardInvoker + // GroupedConvolutionForwardInvoker, + // GroupedConvolutionForwardInvoker, + // GroupedConvolutionForwardInvoker, + // GroupedConvolutionForwardInvoker, + // GroupedConvolutionForwardInvoker, + // GroupedConvolutionForwardInvoker, + // GroupedConvolutionForwardInvoker, + + // GroupedConvolutionForwardInvoker, + // GroupedConvolutionForwardInvoker, + // GroupedConvolutionForwardInvoker, + // GroupedConvolutionForwardInvoker, + // GroupedConvolutionForwardInvoker, + // GroupedConvolutionForwardInvoker, + + // GroupedConvolutionForwardInvoker, + // GroupedConvolutionForwardInvoker, + // GroupedConvolutionForwardInvoker, + // GroupedConvolutionForwardInvoker, + // GroupedConvolutionForwardInvoker, + // GroupedConvolutionForwardInvoker, + // GroupedConvolutionForwardInvoker, + // GroupedConvolutionForwardInvoker, + + // GroupedConvolutionForwardInvoker, + // GroupedConvolutionForwardInvoker, + // GroupedConvolutionForwardInvoker, + // GroupedConvolutionForwardInvoker, + // 
GroupedConvolutionForwardInvoker, + // GroupedConvolutionForwardInvoker + // clang-format on +>; + +} // namespace ops +} // namespace ck_tile diff --git a/library/include/ck_tile/library/tensor_operation_instance/gpu/tile_grouped_conv_fwd_factory.hpp b/library/include/ck_tile/library/tensor_operation_instance/gpu/tile_grouped_conv_fwd_factory.hpp new file mode 100644 index 0000000000..7caf1124f6 --- /dev/null +++ b/library/include/ck_tile/library/tensor_operation_instance/gpu/tile_grouped_conv_fwd_factory.hpp @@ -0,0 +1,87 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +#include +#include +#include +#include + +#include "ck_tile/library/tensor_operation_instance/gpu/tile_grouped_conv_instance_factory.hpp" +#include "ck_tile/library/tensor_operation_instance/gpu/tile_grouped_conv_fwd_invoker.hpp" +#include "ck_tile/library/tensor_operation_instance/gpu/tile_grouped_conv_fwd_instances.hpp" + +namespace ck_tile { +namespace ops { + +template +struct DeviceOperationInstanceFactory> +{ + using DeviceOp = GroupedConvolutionForwardBaseInvoker; + + static auto GetInstances() + { + std::vector> op_ptrs; + + if constexpr(NumDimSpatial == 2) + { + if constexpr(std::is_same_v && std::is_same_v && + std::is_same_v) + { + if constexpr(std::is_same_v && + std::is_same_v && + std::is_same_v && + std::is_same_v && + std::is_same_v) + { + add_grouped_conv2d_fwd_f16_instances(op_ptrs); + } + if constexpr(std::is_same_v && + std::is_same_v && + std::is_same_v && + std::is_same_v && + std::is_same_v) + { + add_grouped_conv2d_fwd_bf16_instances(op_ptrs); + } + } + } + + return op_ptrs; + } +}; + +} // namespace ops +} // namespace ck_tile diff --git a/library/include/ck_tile/library/tensor_operation_instance/gpu/tile_grouped_conv_fwd_fp16_instances.hpp b/library/include/ck_tile/library/tensor_operation_instance/gpu/tile_grouped_conv_fwd_fp16_instances.hpp new file mode 100644 index 0000000000..c80926de9e 
--- /dev/null +++ b/library/include/ck_tile/library/tensor_operation_instance/gpu/tile_grouped_conv_fwd_fp16_instances.hpp @@ -0,0 +1,57 @@ +// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +#include "ck_tile/library/tensor_operation_instance/gpu/tile_grouped_conv_instance_factory.hpp" +#include "ck_tile/library/tensor_operation_instance/gpu/tile_grouped_conv_fwd_invoker.hpp" + +namespace ck_tile { +namespace ops { + +using F16 = ck_tile::half_t; + +template +using tile_grouped_conv_fwd_fp16_instances = std::tuple< +// clang-format off + //##############################| Num| InLayout| WeiLayout| OutLayout| InData| WeiData| OutData| In| Wei| Out| K-block| M-tile| N-tile | K-tile | M-warp| N-warp| K-warp| M-warp| N-warp| K-warp| Vector| Vector| Vector| Double| GEMM| + //##############################| Dim| | | | Type| Type| Type| Elementwise| Elementwise| Elementwise| per| | | | | | | tile| tile| tile| size| size| size| smem| pipeline| + //##############################| Spatial| | | | | | | Operation| Operation| Operation| CU| | | | | | | size| size| size| A| B| C| buffer| version| + GroupedConvolutionForwardInvoker + // GroupedConvolutionForwardInvoker, + // GroupedConvolutionForwardInvoker, + // GroupedConvolutionForwardInvoker, + // GroupedConvolutionForwardInvoker, + // GroupedConvolutionForwardInvoker, + // GroupedConvolutionForwardInvoker, + // GroupedConvolutionForwardInvoker, + + // GroupedConvolutionForwardInvoker, + // GroupedConvolutionForwardInvoker, + // GroupedConvolutionForwardInvoker, + // GroupedConvolutionForwardInvoker, + // GroupedConvolutionForwardInvoker, + // GroupedConvolutionForwardInvoker, + + // GroupedConvolutionForwardInvoker, + // GroupedConvolutionForwardInvoker, + // GroupedConvolutionForwardInvoker, + // GroupedConvolutionForwardInvoker, + // GroupedConvolutionForwardInvoker, + // GroupedConvolutionForwardInvoker, + // GroupedConvolutionForwardInvoker, + // GroupedConvolutionForwardInvoker, 
+ + // GroupedConvolutionForwardInvoker, + // GroupedConvolutionForwardInvoker, + // GroupedConvolutionForwardInvoker, + // GroupedConvolutionForwardInvoker, + // GroupedConvolutionForwardInvoker, + // GroupedConvolutionForwardInvoker + // clang-format on +>; + +} // namespace ops +} // namespace ck_tile diff --git a/library/include/ck_tile/library/tensor_operation_instance/gpu/tile_grouped_conv_fwd_instances.hpp b/library/include/ck_tile/library/tensor_operation_instance/gpu/tile_grouped_conv_fwd_instances.hpp new file mode 100644 index 0000000000..fc7cb8b8b5 --- /dev/null +++ b/library/include/ck_tile/library/tensor_operation_instance/gpu/tile_grouped_conv_fwd_instances.hpp @@ -0,0 +1,66 @@ +// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +#include +#include + +#include "ck_tile/library/tensor_operation_instance/gpu/tile_grouped_conv_instance_factory.hpp" +#include "ck_tile/library/tensor_operation_instance/gpu/tile_grouped_conv_fwd_invoker.hpp" +#include "ck_tile/library/tensor_operation_instance/gpu/tile_grouped_conv_fwd_fp16_instances.hpp" +#include "ck_tile/library/tensor_operation_instance/gpu/tile_grouped_conv_fwd_bf16_instances.hpp" + +namespace ck_tile { +namespace ops { + +using BF16 = ck_tile::bfloat16_t; +using F16 = ck_tile::half_t; + +using DeviceOpFwd2DF16 = GroupedConvolutionForwardBaseInvoker<2, + NHWGC, + GKYXC, + NHWGK, + F16, + F16, + F16, + PassThrough, + PassThrough, + PassThrough, + F16, + F16>; + +using DeviceOpFwd2DBF16 = GroupedConvolutionForwardBaseInvoker<2, + NHWGC, + GKYXC, + NHWGK, + BF16, + BF16, + BF16, + PassThrough, + PassThrough, + PassThrough, + BF16, + BF16>; + +void add_grouped_conv2d_fwd_f16_instances(std::vector>& instances) +{ + add_device_operation_instances(instances, + tile_grouped_conv_fwd_fp16_instances< + 2, + NHWGC, + GKYXC, + NHWGK>{}); +} + +void add_grouped_conv2d_fwd_bf16_instances(std::vector>& instances) +{ + add_device_operation_instances(instances, + 
tile_grouped_conv_fwd_bf16_instances< + 2, + NHWGC, + GKYXC, + NHWGK>{}); +} + +} // namespace ops +} // namespace ck_tile diff --git a/library/include/ck_tile/library/tensor_operation_instance/gpu/tile_grouped_conv_fwd_invoker.hpp b/library/include/ck_tile/library/tensor_operation_instance/gpu/tile_grouped_conv_fwd_invoker.hpp new file mode 100644 index 0000000000..854ab3a3f3 --- /dev/null +++ b/library/include/ck_tile/library/tensor_operation_instance/gpu/tile_grouped_conv_fwd_invoker.hpp @@ -0,0 +1,277 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +#include +#include +#include + +#include "ck_tile/core.hpp" +#include "ck_tile/host/kernel_launch.hpp" +#include "ck_tile/ops/epilogue.hpp" +#include "ck_tile/ops/gemm.hpp" +#include "ck_tile/ops/grouped_convolution.hpp" +#include "ck_tile/library/tensor_operation_instance/gpu/gemm_configs.hpp" + +namespace ck_tile { +namespace ops { + +template +struct GroupedConvolutionForwardBaseInvoker +{ + virtual bool IsSupportedArgument(const ck_tile::GroupedConvFwdHostArgs& args) const = 0; + virtual float Run(const ck_tile::GroupedConvFwdHostArgs& args, bool time_kernel) const = 0; + virtual std::string GetName(const ck_tile::GroupedConvFwdHostArgs& args) const = 0; + GroupedConvolutionForwardBaseInvoker() = default; + GroupedConvolutionForwardBaseInvoker(const GroupedConvolutionForwardBaseInvoker&) = default; + GroupedConvolutionForwardBaseInvoker& operator=(const GroupedConvolutionForwardBaseInvoker&) = default; + GroupedConvolutionForwardBaseInvoker(GroupedConvolutionForwardBaseInvoker&&) = default; + GroupedConvolutionForwardBaseInvoker& operator=(GroupedConvolutionForwardBaseInvoker&&) = default; + virtual ~GroupedConvolutionForwardBaseInvoker() = default; +}; + +template < + ck_tile::index_t NDimSpatial, + typename InLayout, + typename WeiLayout, + typename OutLayout, + typename InDataType, + typename WeiDataType, + typename OutDataType, + 
typename InElementwiseOperation, + typename WeiElementwiseOperation, + typename OutElementwiseOperation, + int kBlockPerCu, + ck_tile::index_t M_Tile, + ck_tile::index_t N_Tile, + ck_tile::index_t K_Tile, + ck_tile::index_t M_Warp, + ck_tile::index_t N_Warp, + ck_tile::index_t K_Warp, + ck_tile::index_t M_Warp_Tile, + ck_tile::index_t N_Warp_Tile, + ck_tile::index_t K_Warp_Tile, + ck_tile::index_t VectorSizeA, + ck_tile::index_t VectorSizeB, + ck_tile::index_t VectorSizeC, + bool DoubleSmemBuffer, + ck_tile::index_t PipelineVersion> +struct GroupedConvolutionForwardInvoker : + public GroupedConvolutionForwardBaseInvoker +{ + using GemmShape = ck_tile::TileGemmShape< + ck_tile::sequence, + ck_tile::sequence, + ck_tile::sequence, + GemmConfigBase::PermuteA, + GemmConfigBase::PermuteB>; + + static constexpr auto ConvSpec = ck_tile::ConvolutionSpecialization::Default; + + using TilePartitioner = + ck_tile::GemmSpatiallyLocalTilePartitioner; + + using GroupedConvTraitsType = ck_tile::GroupedConvTraits, // = DsLayout + OutLayout, + VectorSizeA, + VectorSizeB, + VectorSizeC>; + + using GemmUniversalTraits = ck_tile::TileGemmUniversalTraits< + GemmConfigBase::kPadM, + GemmConfigBase::kPadN, + GemmConfigBase::kPadK, + DoubleSmemBuffer, + typename GroupedConvTraitsType::GroupedConvImplicitGemmTraitsFwd::AsLayout, + typename GroupedConvTraitsType::GroupedConvImplicitGemmTraitsFwd::BsLayout, + typename GroupedConvTraitsType::GroupedConvImplicitGemmTraitsFwd::CLayout, + GemmConfigBase::TransposeC, + GemmConfigBase::UseStructuredSparsity, + false, // Persistent, + GemmConfigBase::NumWaveGroups, + GemmConfigBase::Preshuffle>; + + using AccDataType = float; + using GemmPipelineProblem = ck_tile::GemmPipelineProblem< + InDataType, + WeiDataType, + AccDataType, + GemmShape, + typename GroupedConvTraitsType::GroupedConvImplicitGemmTraitsFwd, + ck_tile::element_wise::PassThrough, + ck_tile::element_wise::PassThrough, + OutDataType, + true, + VectorSizeA, + VectorSizeB>; + + using 
BaseGemmPipeline = typename PipelineTypeTraits::template UniversalGemmPipeline; + + template + auto CreateKernel() const + { + constexpr auto scheduler = GemmConfigBase::Scheduler; + + using UniversalGemmProblem = + ck_tile::UniversalGemmPipelineProblem; + + using GemmPipeline = typename PipelineTypeTraits::template GemmPipeline; + + using CDEElementWise = ck_tile::element_wise::PassThrough; + + using ConvEpilogue = ck_tile::CShuffleEpilogue, // = DsDataType + AccDataType, + OutDataType, + typename GroupedConvTraitsType::ImplicitGemmDsLayout, + ck_tile::tensor_layout::gemm::RowMajor, + CDEElementWise, + TilePartitioner::MPerBlock, + TilePartitioner::NPerBlock, + M_Warp, + N_Warp, + M_Warp_Tile, + N_Warp_Tile, + K_Warp_Tile, + GemmConfigBase::TransposeC, + MemOp, + 1, + true, + GroupedConvTraitsType::VectorSizeC>>; + + return ck_tile::GroupedConvolutionForwardKernel{}; + } + + bool IsSupportedArgument(const ck_tile::GroupedConvFwdHostArgs& args) const override + { + if (args.k_batch > 1) + { + using Kernel = decltype(CreateKernel()); + return Kernel::IsSupportedArgument(args); + } + using Kernel = decltype(CreateKernel()); + return Kernel::IsSupportedArgument(args); + }; + + float Run(const ck_tile::GroupedConvFwdHostArgs& args, bool time_kernel) const override + { + const ck_tile::index_t gemm_k = + args.C_ * std::accumulate(args.filter_spatial_lengths_.begin(), + args.filter_spatial_lengths_.end(), + 1, + std::multiplies()); + + const ck_tile::index_t k_grain = args.k_batch * K_Tile; + const ck_tile::index_t K_split = (gemm_k + k_grain - 1) / k_grain * K_Tile; + const ck_tile::index_t num_loop = TilePartitioner::GetLoopNum(K_split); + const bool has_hot_loop = BaseGemmPipeline::BlockHasHotloop(num_loop); + const ck_tile::TailNumber tail_num = BaseGemmPipeline::GetBlockLoopTailNum(num_loop); + float ave_time{0}; + + const auto Run = [&](const auto has_hot_loop_, + const auto tail_number_, + const auto memory_operation_) { + constexpr bool has_hot_loop_v = 
has_hot_loop_.value; + constexpr auto tail_number_v = tail_number_.value; + constexpr auto memory_operation = memory_operation_.value; + + auto kernel = CreateKernel(); + using Kernel = decltype(kernel); + + auto kargs = Kernel::MakeKernelArgs(args); + const dim3 grids = Kernel::GridSize(args); + const dim3 blocks = Kernel::BlockSize(); + + constexpr int n_warmup = 5; + constexpr int n_repeat = 50; + ck_tile::stream_config s {nullptr, time_kernel, 1, n_warmup, n_repeat}; + + ave_time = ck_tile::launch_kernel( + s, ck_tile::make_kernel(kernel, grids, blocks, 0, kargs)); + + return ave_time; + }; + + const auto RunSplitk = [&](const auto has_hot_loop_, const auto tail_number_) { + if(args.k_batch == 1) + { + Run(has_hot_loop_, tail_number_, MemoryOpSet{}); + } + else + { + Run(has_hot_loop_, tail_number_, MemoryOpAtomicAdd{}); + } + }; + + BaseGemmPipeline::TailHandler(RunSplitk, has_hot_loop, tail_num); + return ave_time; + }; + + std::string GetName(const ck_tile::GroupedConvFwdHostArgs& args) const override + { + std::stringstream min_occupancy; + min_occupancy << "_blk_per_cu_" << kBlockPerCu; + if (args.k_batch > 1) + { + using Kernel = decltype(CreateKernel()); + return Kernel::GetName() + min_occupancy.str(); + } + using Kernel = decltype(CreateKernel()); + return Kernel::GetName() + min_occupancy.str(); + }; + + GroupedConvolutionForwardInvoker() = default; + GroupedConvolutionForwardInvoker(const GroupedConvolutionForwardInvoker&) = default; + GroupedConvolutionForwardInvoker& operator=(const GroupedConvolutionForwardInvoker&) = default; + GroupedConvolutionForwardInvoker(GroupedConvolutionForwardInvoker&&) = default; + GroupedConvolutionForwardInvoker& operator=(GroupedConvolutionForwardInvoker&&) = default; + ~GroupedConvolutionForwardInvoker() override = default; + }; + +} +} diff --git a/library/include/ck_tile/library/tensor_operation_instance/gpu/tile_grouped_conv_instance_factory.hpp 
b/library/include/ck_tile/library/tensor_operation_instance/gpu/tile_grouped_conv_instance_factory.hpp index e4bbc9d8cc..083385ab72 100644 --- a/library/include/ck_tile/library/tensor_operation_instance/gpu/tile_grouped_conv_instance_factory.hpp +++ b/library/include/ck_tile/library/tensor_operation_instance/gpu/tile_grouped_conv_instance_factory.hpp @@ -18,6 +18,9 @@ namespace ck_tile { namespace ops { +template +struct DeviceOperationInstanceFactory; + using NHWGC = ck_tile::tensor_layout::convolution::NHWGC; using GKYXC = ck_tile::tensor_layout::convolution::GKYXC; using NHWGK = ck_tile::tensor_layout::convolution::NHWGK; diff --git a/profiler/ck_tile/include/tile_profile_grouped_conv_bwd_weight_impl.hpp b/profiler/ck_tile/include/tile_profile_grouped_conv_bwd_weight_impl.hpp index d7624f9849..bbf663c1d0 100644 --- a/profiler/ck_tile/include/tile_profile_grouped_conv_bwd_weight_impl.hpp +++ b/profiler/ck_tile/include/tile_profile_grouped_conv_bwd_weight_impl.hpp @@ -13,7 +13,7 @@ #include "ck_tile/host/convolution_parameter.hpp" #include "ck_tile/ops/elementwise/unary_element_wise_operation.hpp" #include "ck_tile/ops/grouped_convolution/utils/grouped_convolution_utils.hpp" -#include "ck_tile/library/tensor_operation_instance/gpu/tile_grouped_conv_factory.hpp" +#include "ck_tile/library/tensor_operation_instance/gpu/tile_grouped_conv_bwd_weight_factory.hpp" #include "ck_tile/ops/grouped_convolution/kernel/grouped_convolution_backward_weight_kernel.hpp" #include "ck_tile/host/reference/reference_grouped_conv_bwd_weight.hpp" namespace ck_tile { @@ -130,7 +130,7 @@ bool profile_grouped_conv_bwd_weight_impl(int do_verification, float best_gb_per_sec = 0; std::string best_split_k("1"); - std::vector split_k_list = {/*auto deduce value*/ -1, 1, 2, 4, 8, 16, 32, 64, 128}; + std::vector split_k_list = {1, 2, 4, 8, 16, 32, 64, 128}; if(split_k != "all") { try diff --git a/profiler/ck_tile/include/tile_profile_grouped_conv_fwd_impl.hpp 
b/profiler/ck_tile/include/tile_profile_grouped_conv_fwd_impl.hpp new file mode 100644 index 0000000000..050c9205c6 --- /dev/null +++ b/profiler/ck_tile/include/tile_profile_grouped_conv_fwd_impl.hpp @@ -0,0 +1,233 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +#include +#include +#include +#include +#include + +#include "ck_tile/host.hpp" +#include "ck_tile/host/host_tensor.hpp" +#include "ck_tile/host/convolution_parameter.hpp" +#include "ck_tile/ops/elementwise/unary_element_wise_operation.hpp" +#include "ck_tile/ops/grouped_convolution/utils/grouped_convolution_utils.hpp" +#include "ck_tile/library/tensor_operation_instance/gpu/tile_grouped_conv_fwd_factory.hpp" +#include "ck_tile/ops/grouped_convolution/kernel/grouped_convolution_backward_weight_kernel.hpp" +#include "ck_tile/host/reference/reference_grouped_conv_bwd_weight.hpp" +namespace ck_tile { +namespace profiler { + +template +auto calculate_rtol_atol(const ck_tile::index_t GemmK, + const ck_tile::index_t kbatch, + const float max_accumulated_value) +{ + using ComputeType = + std::conditional_t; + // Calculate thresholds + const auto rtol = ck_tile::get_relative_threshold( + ck_tile::integer_divide_ceil(GemmK, kbatch)); + const auto atol = ck_tile::get_absolute_threshold( + max_accumulated_value / kbatch, ck_tile::integer_divide_ceil(GemmK, kbatch)); + // Calculate error due to split_k accumulation + const auto rtol_split_k = + ck_tile::get_relative_threshold(kbatch); + const auto atol_split_k = + ck_tile::get_absolute_threshold( + max_accumulated_value, kbatch); + // Use higher threshold + return ck_tile::make_tuple(std::max(rtol, rtol_split_k), std::max(atol, atol_split_k)); +} + +template +bool profile_grouped_conv_fwd_impl(int do_verification, + int init_method, + bool /*do_log*/, + bool time_kernel, + const ck_tile::conv::ConvParam& conv_param, + const ck_tile::index_t k_batch, + ck_tile::index_t instance_index = 
-1) +{ + using AccDataType = float; + using InElementOp = ck_tile::element_wise::PassThrough; + using WeiElementOp = ck_tile::element_wise::PassThrough; + using OutElementOp = ck_tile::element_wise::PassThrough; + + const auto in_g_n_c_wis_desc = + ck_tile::conv::make_input_host_tensor_descriptor_g_n_c_wis_packed(conv_param); + const auto wei_g_k_c_xs_desc = + ck_tile::conv::make_weight_host_tensor_descriptor_g_k_c_xs_packed(conv_param); + const auto out_g_n_k_wos_desc = + ck_tile::conv::make_output_host_tensor_descriptor_g_n_k_wos_packed(conv_param); + + ck_tile::HostTensor input(in_g_n_c_wis_desc); + ck_tile::HostTensor weight(wei_g_k_c_xs_desc); + ck_tile::HostTensor output(out_g_n_k_wos_desc); + + std::cout << "input: " << input.mDesc << std::endl; + std::cout << "weight: " << weight.mDesc << std::endl; + std::cout << "output: " << output.mDesc << std::endl; + + if(init_method == 0) + { + ck_tile::FillUniformDistribution{-5.f, 5.f}(input); + ck_tile::FillUniformDistribution{-5.f, 5.f}(weight); + } + else if(init_method == 1) + { + ck_tile::FillMonotonicSeq{}(input); + ck_tile::FillMonotonicSeq{}(weight); + } + else if(init_method == 2) + { + ck_tile::FillUniformDistribution{1.f, 1.f}(input); + ck_tile::FillUniformDistribution{1.f, 1.f}(weight); + } + else + { + input.SetZero(); + weight.SetZero(); + } + + ck_tile::DeviceMem input_dev_buf(input.get_element_space_size_in_bytes()); + ck_tile::DeviceMem weight_dev_buf(weight.get_element_space_size_in_bytes()); + ck_tile::DeviceMem output_dev_buf(output.get_element_space_size_in_bytes()); + + input_dev_buf.ToDevice(input.data()); + weight_dev_buf.ToDevice(weight.data()); // forward conv reads input and weight; output is zeroed on device below + output_dev_buf.SetZero(); + + using DeviceOp = ops::GroupedConvolutionForwardBaseInvoker< + NDimSpatial, + InLayout, + WeiLayout, + OutLayout, + InDataType, + WeiDataType, + OutDataType, + InElementOp, + WeiElementOp, + OutElementOp, + ComputeTypeA, + ComputeTypeB>; + + // get device op instances + const auto ops =
ck_tile::ops::DeviceOperationInstanceFactory::GetInstances(); + + std::cout << "found " << ops.size() << " instances" << std::endl; + + std::string best_op_name; + float best_avg_time = 0; + float best_tflops = 0; + float best_gb_per_sec = 0; + + index_t num_kernel = 0; + bool all_pass = true; + for(auto& op : ops) + { + ck_tile::GroupedConvFwdHostArgs args(conv_param, + input_dev_buf.GetDeviceBuffer(), + weight_dev_buf.GetDeviceBuffer(), + {}, + output_dev_buf.GetDeviceBuffer(), + k_batch); + + if(op->IsSupportedArgument(args)) + { + num_kernel++; + if((instance_index != -1) && (instance_index + 1 != num_kernel)) + { + // skip test if instance_index is specified + continue; + } + + std::string op_name = op->GetName(args); + std::cout << op_name << " is profiled..." << std::endl; + + float avg_time = op->Run(args, time_kernel); + + std::size_t flop = conv_param.GetFlops(); + std::size_t num_btype = conv_param.GetByte(); + + float tflops = static_cast(flop) / 1.E9 / avg_time; + float gb_per_sec = num_btype / 1.E6 / avg_time; + + std::cout << "Perf: " << std::setw(10) << avg_time << " ms, " << tflops + << " TFlops, " << gb_per_sec << " GB/s, " << op_name << std::endl; + + bool pass = false; + if(do_verification) + { + output_dev_buf.FromDevice(output.data()); + + ck_tile::HostTensor output_host_ref(out_g_n_k_wos_desc); + output_host_ref.SetZero(); + + ck_tile::reference_grouped_conv_fwd( + input, + weight, + output_host_ref, + conv_param.conv_filter_strides_, + conv_param.conv_filter_dilations_, + conv_param.input_left_pads_, + conv_param.input_right_pads_); + const ck_tile::index_t GemmK = weight.get_element_size() / (conv_param.G_ * conv_param.K_); + const float max_accumulated_value = + *std::max_element(output_host_ref.mData.begin(), output_host_ref.mData.end()); + const auto rtol_atol = + calculate_rtol_atol( + GemmK, k_batch, max_accumulated_value); + pass = ck_tile::check_err(output, + output_host_ref, + "Error: Incorrect results!", + 
rtol_atol.at(ck_tile::number<0>{}), + rtol_atol.at(ck_tile::number<1>{})); + + std::cout << "Relative error threshold: " << rtol_atol.at(ck_tile::number<0>{}) + << " Absolute error threshold: " << rtol_atol.at(ck_tile::number<1>{}) + << std::endl; + std::cout << "The CPU verification result is:" << (pass ? "correct" : "fail") << std::endl; + all_pass &= pass; + } + + bool is_valid = do_verification ? pass : true; + + if(tflops > best_tflops && is_valid) + { + best_op_name = op_name; + best_tflops = tflops; + best_avg_time = avg_time; + best_gb_per_sec = gb_per_sec; + } + } + else + { + std::cout << op->GetName(args) << " does not support this problem." << std::endl; + } + } + + std::cout << "Best configuration parameters:" << "\nname: " << best_op_name + << "\navg_time: " << best_avg_time << "\ntflops: " << best_tflops + << "\nGB/s: " << best_gb_per_sec << std::endl; + + if(instance_index != -1) + { + std::cout << "grouped_conv_fwd_instance (" << instance_index << "/" << num_kernel + << "): Passed" << std::endl; + } + return all_pass; +} + +} // namespace profiler +} // namespace ck_tile diff --git a/profiler/ck_tile/include/tile_profiler_operation_registry.hpp b/profiler/ck_tile/include/tile_profiler_operation_registry.hpp index ca2fd89f47..8bccf6b032 100644 --- a/profiler/ck_tile/include/tile_profiler_operation_registry.hpp +++ b/profiler/ck_tile/include/tile_profiler_operation_registry.hpp @@ -1,6 +1,8 @@ // SPDX-License-Identifier: MIT // Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved. 
+#pragma once + #include #include #include diff --git a/profiler/ck_tile/src/CMakeLists.txt b/profiler/ck_tile/src/CMakeLists.txt index 842aca4d9e..817d7171f1 100644 --- a/profiler/ck_tile/src/CMakeLists.txt +++ b/profiler/ck_tile/src/CMakeLists.txt @@ -12,10 +12,7 @@ message(STATUS "CK_PROFILER_INSTANCE_FILTER: ${CK_PROFILER_INSTANCE_FILTER}") if(SUPPORTED_GPU_TARGETS MATCHES "gfx9" OR SUPPORTED_GPU_TARGETS MATCHES "gfx1[12]") list(APPEND PROFILER_OPS tile_profile_grouped_conv_bwd_weight.cpp) -endif() - -if(DL_KERNELS) - list(APPEND PROFILER_OPS tile_profile_grouped_conv_bwd_weight.cpp) + list(APPEND PROFILER_OPS tile_profile_grouped_conv_fwd.cpp) endif() set(PROFILER_SOURCES tile_profiler.cpp) @@ -33,7 +30,6 @@ message(VERBOSE "ckTileProfiler sources: ${PROFILER_SOURCES}") set(PROFILER_EXECUTABLE ckTileProfiler) add_executable(${PROFILER_EXECUTABLE} ${PROFILER_SOURCES}) -#target_include_directories(${PROFILER_EXECUTABLE} PRIVATE ${CMAKE_PROJECT_DIR}/include) target_compile_options(${PROFILER_EXECUTABLE} PRIVATE -Wno-global-constructors) # flags to compress the library if(NOT WIN32 AND ${hip_VERSION_FLAT} GREATER 600241132) diff --git a/profiler/ck_tile/src/tile_profile_grouped_conv_fwd.cpp b/profiler/ck_tile/src/tile_profile_grouped_conv_fwd.cpp new file mode 100644 index 0000000000..ee5e2ee960 --- /dev/null +++ b/profiler/ck_tile/src/tile_profile_grouped_conv_fwd.cpp @@ -0,0 +1,237 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include +#include +#include +#include + +#include "tile_profile_grouped_conv_fwd_impl.hpp" +#include "tile_profiler_operation_registry.hpp" + +// CK Tile library dependencies +#include "ck_tile/core/numeric/integral_constant.hpp" +#include "ck_tile/ops/common/tensor_layout.hpp" + +namespace { + +enum struct ConvLayout +{ + GNCHW_GKCYX_GNKHW, // 0 + GNHWC_GKYXC_GNHWK, // 1 + NHWGC_GKYXC_NHWGK, // 2 + NGCHW_GKYXC_NGKHW, // 3 + NGCHW_GKCYX_NGKHW, // 4 +}; + +enum struct ConvDataType +{ + F32_F32_F32, // 0 + F16_F16_F16, // 1 + BF16_F32_BF16, // 2 + F16_F16_F16_BF8_F8, // 3 + I8_I8_I8, // 4 + BF16_BF16_BF16, // 5 + F32_F32_F32_TF32, // 6 +}; + +#define OP_NAME "grouped_conv_fwd" +#define OP_DESC "Grouped Convolution Forward" + +static void print_helper_msg() +{ + std::string conv_param_parser_helper_msg; + + conv_param_parser_helper_msg += "Following arguments (depending on number of spatial dims):\n" + " Number of spatial dimensions (1=Conv1d, 2=Conv2d, 3=Conv3d)\n" + " G, N, K, C, \n" + " , (ie Y, X for 2D)\n" + " , (ie Hi, Wi for 2D)\n" + " , (ie Sy, Sx for 2D)\n" + " , (ie Dy, Dx for 2D)\n" + " , (ie LeftPy, LeftPx for 2D)\n" + " , (ie RightPy, RightPx for 2D)\n"; + + std::cout + // clang-format off + << "arg1: tensor operation (" OP_NAME ": " OP_DESC ")\n" + << "arg2: data type (0: Input fp32, Weight fp32, Output fp32\n" + << " 1: Input fp16, Weight fp16, Output fp16\n" + << " 2: Input bf16, Weight bf16, Output bf16\n" + << " 3: Input int8, Weight int8, Output int8\n" + << " 4: Input fp8, Weight fp8, Output fp8\n" + << " 5: Input bf8, Weight bf8, Output fp8\n" + << " 6: Input fp8, Weight bf8, Output fp8\n" + << " 7: Input bf8, Weight fp8, Output fp8\n" + << " 8: Input fp32, Weight fp32, Output fp32, Compute tf32)\n" + << "arg3: tensor layout (0: Input[G, N, Hi, Wi, C], Weight[G, K, Y, X, C], Output[G, N, Ho, Wo, K]\n" + << " 1: Input[N, Hi, Wi, G, C], Weight[G, K, Y, X, C], Output[N, Ho, Wo, G, K]\n" + << " 2: Input[N, G, C, Hi, Wi], Weight[G, K, Y, X, C], 
Output[N, " + "G, K, Ho, Wo]\n" + << " 3: Input[N, G, C, Hi, Wi], Weight[G, K, C, Y, X], Output[N, " + "G, K, Ho, Wo])\n" + << "arg4: indexing data type (0: 32-bit, 1: 64-bit)\n" + << "arg5: verification (0: no, 1: yes)\n" + << "arg6: initialization (0: no init, 1: integer value, 2: decimal value)\n" + << "arg7: print tensor value (0: no; 1: yes)\n" + << "arg8: time kernel (0: no, 1: yes)\n" + << conv_param_parser_helper_msg << std::endl; + // clang-format on +} + +} // namespace + +int tile_profile_grouped_conv_fwd(int argc, char* argv[]) +{ + // 8 for control, 1 for num_dim_spatial + if(argc < 10) + { + print_helper_msg(); + return 1; + } + + const auto data_type = static_cast(std::stoi(argv[2])); + const auto layout = static_cast(std::stoi(argv[3])); + const bool do_verification = std::stoi(argv[5]); + const int init_method = std::stoi(argv[6]); + const bool do_log = std::stoi(argv[7]); + const bool time_kernel = std::stoi(argv[8]); + const int num_dim_spatial = std::stoi(argv[9]); + + // 9 for control, 1 for num_dim_spatial, 4 for G/N/K/C, and 6 * num_dim_spatial + if(argc != 9 + 1 + 4 + 6 * num_dim_spatial) + { + print_helper_msg(); + return 1; + } + + const auto params = ck_tile::conv::parse_conv_param(num_dim_spatial, 10, argv); + constexpr ck_tile::index_t k_batch = 1; + + using F32 = float; + using F16 = ck_tile::half_t; + using BF16 = ck_tile::bfloat16_t; + using F8 = ck_tile::fp8_t; + using BF8 = ck_tile::bf8_t; +#if defined(__gfx942__) + using TF32 = ck::tf32_t; +#endif + + using NHWGC = ck_tile::tensor_layout::convolution::NHWGC; + using NDHWGC = ck_tile::tensor_layout::convolution::NDHWGC; + + using GKYXC = ck_tile::tensor_layout::convolution::GKYXC; + using GKZYXC = ck_tile::tensor_layout::convolution::GKZYXC; + + using NHWGK = ck_tile::tensor_layout::convolution::NHWGK; + using NDHWGK = ck_tile::tensor_layout::convolution::NDHWGK; + + constexpr auto I2 = ck_tile::number<2>{}; + constexpr auto I3 = ck_tile::number<3>{}; + + auto profile = [&](auto 
num_dim_spatial_tmp, + auto in_layout, + auto wei_layout, + auto out_layout, + auto in_type, + auto wei_type, + auto out_type, + auto compute_type_a, + auto compute_type_b) { + constexpr ck_tile::index_t NDimSpatial = num_dim_spatial_tmp.value; + + using InLayout = decltype(in_layout); + using WeiLayout = decltype(wei_layout); + using OutLayout = decltype(out_layout); + + using InDataType = decltype(in_type); + using WeiDataType = decltype(wei_type); + using OutDataType = decltype(out_type); + + using ComputeTypeA = decltype(compute_type_a); + using ComputeTypeB = decltype(compute_type_b); + + bool pass = ck_tile::profiler::profile_grouped_conv_fwd_impl( + do_verification, init_method, do_log, time_kernel, params, k_batch); + + return pass ? 0 : 1; + }; + + if(num_dim_spatial == 2 && layout == ConvLayout::NHWGC_GKYXC_NHWGK) + { + if(data_type == ConvDataType::F32_F32_F32) + { + return profile(I2, NHWGC{}, GKYXC{}, NHWGK{}, F32{}, F32{}, F32{}, F32{}, F32{}); + } + if(data_type == ConvDataType::F16_F16_F16) + { + return profile(I2, NHWGC{}, GKYXC{}, NHWGK{}, F16{}, F16{}, F16{}, F16{}, F16{}); + } + if(data_type == ConvDataType::BF16_F32_BF16) + { + // fp32 atomic add is used for weight tensor in bf16 kernel + return profile(I2, NHWGC{}, GKYXC{}, NHWGK{}, BF16{}, F32{}, BF16{}, BF16{}, BF16{}); + } + if(data_type == ConvDataType::BF16_BF16_BF16) + { + return profile(I2, NHWGC{}, GKYXC{}, NHWGK{}, BF16{}, BF16{}, BF16{}, BF16{}, BF16{}); + } + else if(data_type == ConvDataType::F32_F32_F32_TF32) + { +#if defined(__gfx942__) + return profile(I2, NHWGC{}, GKYXC{}, NHWGK{}, F32{}, F32{}, F32{}, TF32{}, TF32{}); +#endif + } + } + + if(num_dim_spatial == 3 && layout == ConvLayout::NHWGC_GKYXC_NHWGK) + { + if(data_type == ConvDataType::F32_F32_F32) + { + return profile(I3, NDHWGC{}, GKZYXC{}, NDHWGK{}, F32{}, F32{}, F32{}, F32{}, F32{}); + } + if(data_type == ConvDataType::F16_F16_F16) + { + return profile(I3, NDHWGC{}, GKZYXC{}, NDHWGK{}, F16{}, F16{}, F16{}, F16{}, 
F16{}); + } + if(data_type == ConvDataType::BF16_F32_BF16) + { + // fp32 atomic add is used for weight tensor in bf16 kernel + return profile(I3, NDHWGC{}, GKZYXC{}, NDHWGK{}, BF16{}, F32{}, BF16{}, BF16{}, BF16{}); + } + if(data_type == ConvDataType::BF16_BF16_BF16) + { + return profile( + I3, NDHWGC{}, GKZYXC{}, NDHWGK{}, BF16{}, BF16{}, BF16{}, BF16{}, BF16{}); + } + if(data_type == ConvDataType::F16_F16_F16_BF8_F8) + { + return profile(I3, NDHWGC{}, GKZYXC{}, NDHWGK{}, F16{}, F16{}, F16{}, BF8{}, F8{}); + } + else if(data_type == ConvDataType::I8_I8_I8) + { + return profile( + I3, NDHWGC{}, GKZYXC{}, NDHWGK{}, int8_t{}, int8_t{}, int8_t{}, int8_t{}, int8_t{}); + } + else if(data_type == ConvDataType::F32_F32_F32_TF32) + { +#if defined(__gfx942__) + return profile(I3, NDHWGC{}, GKZYXC{}, NDHWGK{}, F32{}, F32{}, F32{}, TF32{}, TF32{}); +#endif + } + } + + std::cout << "this data_type & layout is not implemented" << std::endl; + + return 1; +} + +REGISTER_PROFILER_OPERATION(OP_NAME, OP_DESC, tile_profile_grouped_conv_fwd);