From e7696eeff1f25a4e1abaa49d81763ce088d50731 Mon Sep 17 00:00:00 2001 From: Jakub Piasecki Date: Thu, 30 Oct 2025 13:17:06 +0000 Subject: [PATCH] tmp save --- ...ed_convolution_backward_weight_invoker.hpp | 6 +- ...emm_universal_pipeline_ag_bg_cr_policy.hpp | 12 +- ...ouped_convolution_backward_data_kernel.hpp | 4 +- include/ck_tile/utility.hpp | 6 + ...e_grouped_conv_bwd_data_bf16_instances.hpp | 162 ++++++++++ .../tile_grouped_conv_bwd_data_factory.hpp | 131 ++++++++ .../tile_grouped_conv_bwd_data_invoker.hpp | 274 ++++++++++++++++ ...grouped_conv_bwd_weight_bf16_instances.hpp | 35 +- ...ped_conv_bwd_weight_bf16_instances_opt.hpp | 38 ++- .../tile_grouped_conv_fwd_bf16_instances.hpp | 108 +++++-- ...tile_grouped_conv_fwd_bf16_instances_2.hpp | 147 +++++++++ ...tile_grouped_conv_fwd_bf16_instances_3.hpp | 151 +++++++++ .../gpu/tile_grouped_conv_fwd_factory.hpp | 5 + .../gpu/CMakeLists.txt | 27 ++ ...e_grouped_conv_bwd_data_bf16_instances.cpp | 20 ++ ...tile_grouped_conv_fwd_bf16_instances_2.cpp | 20 ++ ...tile_grouped_conv_fwd_bf16_instances_3.cpp | 20 ++ ...ile_profile_grouped_conv_bwd_data_impl.hpp | 299 ++++++++++++++++++ ...e_profile_grouped_conv_bwd_weight_impl.hpp | 3 +- .../tile_profile_grouped_conv_fwd_impl.hpp | 23 +- profiler/ck_tile/src/CMakeLists.txt | 5 + .../tile_profile_grouped_conv_bwd_data.cpp | 219 +++++++++++++ .../src/tile_profile_grouped_conv_fwd.cpp | 26 +- .../profile_grouped_conv_bwd_data_impl.hpp | 19 +- .../profile_grouped_conv_fwd_impl.hpp | 19 ++ .../src/profile_grouped_conv_bwd_data.cpp | 4 +- profiler/src/profile_grouped_conv_fwd.cpp | 27 +- script/benchmark_ck_vs_ck_tile.py | 92 +++++- script/convert_miopen_driver_commands.py | 11 +- ...convert_old_ck_conv_bwd_data_to_ck_tile.py | 128 ++++++++ .../convert_old_ck_conv_bwd_wei_to_ck_tile.py | 120 +++++++ script/convert_old_ck_conv_fwd_to_ck_tile.py | 121 +++++++ 32 files changed, 2189 insertions(+), 93 deletions(-) create mode 100644 include/ck_tile/utility.hpp create mode 100644 library/include/ck_tile/library/tensor_operation_instance/gpu/tile_grouped_conv_bwd_data_bf16_instances.hpp create mode 100644 library/include/ck_tile/library/tensor_operation_instance/gpu/tile_grouped_conv_bwd_data_factory.hpp create mode 100644 library/include/ck_tile/library/tensor_operation_instance/gpu/tile_grouped_conv_bwd_data_invoker.hpp create mode 100644 library/include/ck_tile/library/tensor_operation_instance/gpu/tile_grouped_conv_fwd_bf16_instances_2.hpp create mode 100644 library/include/ck_tile/library/tensor_operation_instance/gpu/tile_grouped_conv_fwd_bf16_instances_3.hpp create mode 100644 library/src/ck_tile/tensor_operation_instance/gpu/tile_grouped_conv_bwd_data_bf16_instances.cpp create mode 100644 library/src/ck_tile/tensor_operation_instance/gpu/tile_grouped_conv_fwd_bf16_instances_2.cpp create mode 100644 library/src/ck_tile/tensor_operation_instance/gpu/tile_grouped_conv_fwd_bf16_instances_3.cpp create mode 100644 profiler/ck_tile/include/tile_profile_grouped_conv_bwd_data_impl.hpp create mode 100644 profiler/ck_tile/src/tile_profile_grouped_conv_bwd_data.cpp create mode 100644 script/convert_old_ck_conv_bwd_data_to_ck_tile.py create mode 100644 script/convert_old_ck_conv_bwd_wei_to_ck_tile.py create mode 100644 script/convert_old_ck_conv_fwd_to_ck_tile.py diff --git a/example/ck_tile/20_grouped_convolution/grouped_convolution_backward_weight_invoker.hpp b/example/ck_tile/20_grouped_convolution/grouped_convolution_backward_weight_invoker.hpp index 78573ed482..bf8bafda3f 100644 --- a/example/ck_tile/20_grouped_convolution/grouped_convolution_backward_weight_invoker.hpp +++ b/example/ck_tile/20_grouped_convolution/grouped_convolution_backward_weight_invoker.hpp @@ -32,9 +32,9 @@ struct GroupedConvolutionBackwardWeightInvoker GemmConfig::PermuteA, GemmConfig::PermuteB>; - constexpr ck_tile::index_t VectorSizeA = 4; - constexpr ck_tile::index_t VectorSizeB = 8; - constexpr ck_tile::index_t VectorSizeC = 8; + constexpr ck_tile::index_t VectorSizeA = 1; + constexpr ck_tile::index_t VectorSizeB = 2; + constexpr ck_tile::index_t VectorSizeC = 2; constexpr auto ConvSpec = ck_tile::ConvolutionSpecialization::Default; using TilePartitioner = diff --git a/include/ck_tile/ops/gemm/pipeline/gemm_universal_pipeline_ag_bg_cr_policy.hpp b/include/ck_tile/ops/gemm/pipeline/gemm_universal_pipeline_ag_bg_cr_policy.hpp index 4030783ecc..c8837f9d98 100644 --- a/include/ck_tile/ops/gemm/pipeline/gemm_universal_pipeline_ag_bg_cr_policy.hpp +++ b/include/ck_tile/ops/gemm/pipeline/gemm_universal_pipeline_ag_bg_cr_policy.hpp @@ -392,7 +392,7 @@ struct UniversalGemmBasePolicy } template - CK_TILE_HOST_DEVICE static constexpr auto GetVectorSizeA() + CK_TILE_HOST_DEVICE static constexpr index_t GetVectorSizeA() { using AsLayout = remove_cvref_t; using AsDataType = remove_cvref_t; @@ -402,6 +402,10 @@ struct UniversalGemmBasePolicy using ALayout = remove_cvref_t{}, AsLayout>>; using ADataType = remove_cvref_t{}, AsDataType>>; + if constexpr(Problem::FixedVectorSize) { + return Problem::VectorSizeA; + } + if constexpr(std::is_same_v) { return GetGlobalVectorLoadSize - CK_TILE_HOST_DEVICE static constexpr auto GetVectorSizeB() + CK_TILE_HOST_DEVICE static constexpr index_t GetVectorSizeB() { using BsLayout = remove_cvref_t; using BsDataType = remove_cvref_t; @@ -431,6 +435,10 @@ struct UniversalGemmBasePolicy using BLayout = remove_cvref_t{}, BsLayout>>; using BDataType = remove_cvref_t{}, BsDataType>>; + if constexpr(Problem::FixedVectorSize) { + return Problem::VectorSizeB; + } + if constexpr(std::is_same_v) { return GetGlobalVectorLoadSize; + +template +using tile_grouped_conv_bwd_data_bf16_instances = std::tuple< +// clang-format off + //#####################################| Num| InLayout| WeiLayout| OutLayout| InData| WeiData| OutData| In| Wei| Out| K-block| M-tile| N-tile | K-tile | M-warp| N-warp| K-warp| M-warp| N-warp| K-warp| Vector| Vector| Vector| Double| GEMM| + //#####################################| Dim| | | | Type| Type| Type| Elementwise| Elementwise| Elementwise| per| | | | | | | tile| tile| tile| size| size| size| smem| pipeline| + //#####################################| Spatial| | | | | | | Operation| Operation| Operation| CU| | | | | | | size| size| size| A| B| C| buffer| version| +GroupedConvolutionBackwardDataInvoker, +GroupedConvolutionBackwardDataInvoker, +// // GroupedConvolutionBackwardDataInvoker, +GroupedConvolutionBackwardDataInvoker, +GroupedConvolutionBackwardDataInvoker, +// // GroupedConvolutionBackwardDataInvoker, +GroupedConvolutionBackwardDataInvoker, +GroupedConvolutionBackwardDataInvoker, +// // GroupedConvolutionBackwardDataInvoker, +GroupedConvolutionBackwardDataInvoker, +GroupedConvolutionBackwardDataInvoker, +// // GroupedConvolutionBackwardDataInvoker, +GroupedConvolutionBackwardDataInvoker, +GroupedConvolutionBackwardDataInvoker, +// // GroupedConvolutionBackwardDataInvoker, +GroupedConvolutionBackwardDataInvoker, +GroupedConvolutionBackwardDataInvoker, +// // GroupedConvolutionBackwardDataInvoker, +GroupedConvolutionBackwardDataInvoker, +GroupedConvolutionBackwardDataInvoker, +// // GroupedConvolutionBackwardDataInvoker, +GroupedConvolutionBackwardDataInvoker, +GroupedConvolutionBackwardDataInvoker, +// // GroupedConvolutionBackwardDataInvoker, +GroupedConvolutionBackwardDataInvoker, +GroupedConvolutionBackwardDataInvoker, +// // GroupedConvolutionBackwardDataInvoker, +GroupedConvolutionBackwardDataInvoker, +GroupedConvolutionBackwardDataInvoker, +// // GroupedConvolutionBackwardDataInvoker, +GroupedConvolutionBackwardDataInvoker, +GroupedConvolutionBackwardDataInvoker, +// // GroupedConvolutionBackwardDataInvoker, +GroupedConvolutionBackwardDataInvoker, +GroupedConvolutionBackwardDataInvoker, +// // GroupedConvolutionBackwardDataInvoker, +GroupedConvolutionBackwardDataInvoker, +GroupedConvolutionBackwardDataInvoker, +// // GroupedConvolutionBackwardDataInvoker, +GroupedConvolutionBackwardDataInvoker, +GroupedConvolutionBackwardDataInvoker, +// // GroupedConvolutionBackwardDataInvoker, +GroupedConvolutionBackwardDataInvoker, +GroupedConvolutionBackwardDataInvoker, // +// // GroupedConvolutionBackwardDataInvoker, +GroupedConvolutionBackwardDataInvoker, +GroupedConvolutionBackwardDataInvoker, +// // GroupedConvolutionBackwardDataInvoker, +GroupedConvolutionBackwardDataInvoker, +GroupedConvolutionBackwardDataInvoker, +// // GroupedConvolutionBackwardDataInvoker, +GroupedConvolutionBackwardDataInvoker, +GroupedConvolutionBackwardDataInvoker, +// // GroupedConvolutionBackwardDataInvoker, +// GroupedConvolutionBackwardDataInvoker, +// GroupedConvolutionBackwardDataInvoker, +// // GroupedConvolutionBackwardDataInvoker, +// GroupedConvolutionBackwardDataInvoker, +// GroupedConvolutionBackwardDataInvoker, +// // GroupedConvolutionBackwardDataInvoker, +GroupedConvolutionBackwardDataInvoker, +GroupedConvolutionBackwardDataInvoker, +// // GroupedConvolutionBackwardDataInvoker, +// GroupedConvolutionBackwardDataInvoker, +// GroupedConvolutionBackwardDataInvoker, +// // GroupedConvolutionBackwardDataInvoker, +// GroupedConvolutionBackwardDataInvoker, +// GroupedConvolutionBackwardDataInvoker, +// // GroupedConvolutionBackwardDataInvoker, +GroupedConvolutionBackwardDataInvoker, +GroupedConvolutionBackwardDataInvoker, // +// // GroupedConvolutionBackwardDataInvoker, +// GroupedConvolutionBackwardDataInvoker, +// GroupedConvolutionBackwardDataInvoker, +// // GroupedConvolutionBackwardDataInvoker, +// GroupedConvolutionBackwardDataInvoker, +// GroupedConvolutionBackwardDataInvoker, +// // GroupedConvolutionBackwardDataInvoker, +// GroupedConvolutionBackwardDataInvoker, // test +// GroupedConvolutionBackwardDataInvoker, +// // GroupedConvolutionBackwardDataInvoker, +// GroupedConvolutionBackwardDataInvoker, +// GroupedConvolutionBackwardDataInvoker, +// // GroupedConvolutionBackwardDataInvoker, +// GroupedConvolutionBackwardDataInvoker, +// GroupedConvolutionBackwardDataInvoker, +// // GroupedConvolutionBackwardDataInvoker, +GroupedConvolutionBackwardDataInvoker, +GroupedConvolutionBackwardDataInvoker, +// // GroupedConvolutionBackwardDataInvoker, +GroupedConvolutionBackwardDataInvoker, +GroupedConvolutionBackwardDataInvoker, +// // GroupedConvolutionBackwardDataInvoker, +// GroupedConvolutionBackwardDataInvoker, +// GroupedConvolutionBackwardDataInvoker, +// // GroupedConvolutionBackwardDataInvoker, +GroupedConvolutionBackwardDataInvoker, +GroupedConvolutionBackwardDataInvoker, +// // GroupedConvolutionBackwardDataInvoker, +GroupedConvolutionBackwardDataInvoker, +GroupedConvolutionBackwardDataInvoker, // +// // GroupedConvolutionBackwardDataInvoker, +// GroupedConvolutionBackwardDataInvoker, +// GroupedConvolutionBackwardDataInvoker // +// // GroupedConvolutionBackwardDataInvoker, +// GroupedConvolutionBackwardDataInvoker, +// GroupedConvolutionBackwardDataInvoker, +// // GroupedConvolutionBackwardDataInvoker, +GroupedConvolutionBackwardDataInvoker, +GroupedConvolutionBackwardDataInvoker, +// // GroupedConvolutionBackwardDataInvoker, +GroupedConvolutionBackwardDataInvoker, +GroupedConvolutionBackwardDataInvoker, +// // GroupedConvolutionBackwardDataInvoker, +GroupedConvolutionBackwardDataInvoker, +GroupedConvolutionBackwardDataInvoker, +// // GroupedConvolutionBackwardDataInvoker, +GroupedConvolutionBackwardDataInvoker, +GroupedConvolutionBackwardDataInvoker, +// // GroupedConvolutionBackwardDataInvoker, +GroupedConvolutionBackwardDataInvoker, +GroupedConvolutionBackwardDataInvoker +// // GroupedConvolutionBackwardDataInvoker +// clang-format on +>; + +} // namespace ops +} // namespace ck_tile diff --git a/library/include/ck_tile/library/tensor_operation_instance/gpu/tile_grouped_conv_bwd_data_factory.hpp b/library/include/ck_tile/library/tensor_operation_instance/gpu/tile_grouped_conv_bwd_data_factory.hpp new file mode 100644 index 0000000000..d4467bcf32 --- /dev/null +++ b/library/include/ck_tile/library/tensor_operation_instance/gpu/tile_grouped_conv_bwd_data_factory.hpp @@ -0,0 +1,131 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +#include +#include +#include +#include + +#include "ck_tile/library/tensor_operation_instance/gpu/tile_grouped_conv_instance_factory.hpp" +#include "ck_tile/library/tensor_operation_instance/gpu/tile_grouped_conv_bwd_data_invoker.hpp" + +namespace ck_tile { +namespace ops { + +using DeviceOp2DF16 = GroupedConvolutionBackwardDataBaseInvoker<2, + NHWGC, + GKYXC, + NHWGK, + ck_tile::half_t, + ck_tile::half_t, + ck_tile::half_t, + ck_tile::element_wise::PassThrough, + ck_tile::element_wise::PassThrough, + ck_tile::element_wise::PassThrough, + ck_tile::half_t, + ck_tile::half_t>; + +using DeviceOp2DBF16 = GroupedConvolutionBackwardDataBaseInvoker<2, + NHWGC, + GKYXC, + NHWGK, + ck_tile::bfloat16_t, + ck_tile::bfloat16_t, + ck_tile::bfloat16_t, + ck_tile::element_wise::PassThrough, + ck_tile::element_wise::PassThrough, + ck_tile::element_wise::PassThrough, + ck_tile::bfloat16_t, + ck_tile::bfloat16_t>; + +using DeviceOp2DF32 = GroupedConvolutionBackwardDataBaseInvoker<2, + NHWGC, + GKYXC, + NHWGK, + float, + float, + float, + ck_tile::element_wise::PassThrough, + ck_tile::element_wise::PassThrough, + ck_tile::element_wise::PassThrough, + float, + float>; + +// Forward declarations for instance factory functions +// void add_grouped_conv2d_bwd_weight_f16_instances(std::vector>& instances); +void add_grouped_conv2d_bwd_data_bf16_instances(std::vector>& instances); +// void add_grouped_conv2d_bwd_weight_bf16_instances_opt(std::vector>& instances); + +template +struct DeviceOperationInstanceFactory> +{ + using DeviceOp = GroupedConvolutionBackwardDataBaseInvoker; + + static auto GetInstances() + { + std::vector> op_ptrs; + + if constexpr(NumDimSpatial == 2) + { + if constexpr(std::is_same_v && std::is_same_v && + std::is_same_v) + { + if constexpr(std::is_same_v && + std::is_same_v && + std::is_same_v && + std::is_same_v && + std::is_same_v) + { + // add_grouped_conv2d_bwd_weight_f16_instances(op_ptrs); + } + if constexpr(std::is_same_v && + std::is_same_v && + std::is_same_v && + std::is_same_v && + std::is_same_v) + { + add_grouped_conv2d_bwd_data_bf16_instances(op_ptrs); + // add_grouped_conv2d_bwd_weight_bf16_instances_opt(op_ptrs); + } + } + } + + return op_ptrs; + } +}; + +} // namespace ops +} // namespace ck_tile diff --git a/library/include/ck_tile/library/tensor_operation_instance/gpu/tile_grouped_conv_bwd_data_invoker.hpp b/library/include/ck_tile/library/tensor_operation_instance/gpu/tile_grouped_conv_bwd_data_invoker.hpp new file mode 100644 index 0000000000..606292b379 --- /dev/null +++ b/library/include/ck_tile/library/tensor_operation_instance/gpu/tile_grouped_conv_bwd_data_invoker.hpp @@ -0,0 +1,274 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +#include +#include +#include + +#include "ck_tile/core.hpp" +#include "ck_tile/host/kernel_launch.hpp" +#include "ck_tile/ops/epilogue.hpp" +#include "ck_tile/ops/gemm.hpp" +#include "ck_tile/ops/grouped_convolution.hpp" +#include "ck_tile/library/tensor_operation_instance/gpu/gemm_configs.hpp" + +namespace ck_tile { +namespace ops { + +template +struct GroupedConvolutionBackwardDataBaseInvoker +{ + virtual bool IsSupportedArgument(const ck_tile::GroupedConvBwdDataHostArgs& args) const = 0; + virtual float Run(const ck_tile::GroupedConvBwdDataHostArgs& args, bool time_kernel, int n_warmup, int n_repeat) const = 0; + virtual std::string GetName(const ck_tile::GroupedConvBwdDataHostArgs& args) const = 0; + GroupedConvolutionBackwardDataBaseInvoker() = default; + GroupedConvolutionBackwardDataBaseInvoker(const GroupedConvolutionBackwardDataBaseInvoker&) = default; + GroupedConvolutionBackwardDataBaseInvoker& operator=(const GroupedConvolutionBackwardDataBaseInvoker&) = default; + GroupedConvolutionBackwardDataBaseInvoker(GroupedConvolutionBackwardDataBaseInvoker&&) = default; + GroupedConvolutionBackwardDataBaseInvoker& operator=(GroupedConvolutionBackwardDataBaseInvoker&&) = default; + virtual ~GroupedConvolutionBackwardDataBaseInvoker() = default; +}; + +template < + ck_tile::index_t NDimSpatial, + typename InLayout, + typename WeiLayout, + typename OutLayout, + typename InDataType, + typename WeiDataType, + typename OutDataType, + typename InElementwiseOperation, + typename WeiElementwiseOperation, + typename OutElementwiseOperation, + int kBlockPerCu, + ck_tile::index_t M_Tile, + ck_tile::index_t N_Tile, + ck_tile::index_t K_Tile, + ck_tile::index_t M_Warp, + ck_tile::index_t N_Warp, + ck_tile::index_t K_Warp, + ck_tile::index_t M_Warp_Tile, + ck_tile::index_t N_Warp_Tile, + ck_tile::index_t K_Warp_Tile, + ck_tile::index_t VectorSizeA, + ck_tile::index_t VectorSizeB, + ck_tile::index_t VectorSizeC, + bool DoubleSmemBuffer, + ck_tile::index_t PipelineVersion> +struct GroupedConvolutionBackwardDataInvoker : + public GroupedConvolutionBackwardDataBaseInvoker +{ + using GemmShape = ck_tile::TileGemmShape< + ck_tile::sequence, + ck_tile::sequence, + ck_tile::sequence, + GemmConfigBase::PermuteA, + GemmConfigBase::PermuteB>; + + static constexpr auto ConvSpec = ck_tile::ConvolutionSpecialization::Default; + + using TilePartitioner = + ck_tile::GemmSpatiallyLocalTilePartitioner; + + using GroupedConvTraitsType = ck_tile::GroupedConvTraits, // = DsLayout + OutLayout, + VectorSizeA, + VectorSizeB, + VectorSizeC>; + + using GemmUniversalTraits = ck_tile::TileGemmUniversalTraits< + GemmConfigBase::kPadM, + GemmConfigBase::kPadN, + GemmConfigBase::kPadK, + DoubleSmemBuffer, + typename GroupedConvTraitsType::GroupedConvImplicitGemmTraitsBwdData::AsLayout, + typename GroupedConvTraitsType::GroupedConvImplicitGemmTraitsBwdData::BsLayout, + typename GroupedConvTraitsType::GroupedConvImplicitGemmTraitsBwdData::CLayout, + GemmConfigBase::TransposeC, + GemmConfigBase::UseStructuredSparsity, + false, // Persistent, + GemmConfigBase::NumWaveGroups>; + + using AccDataType = float; + using GemmPipelineProblem = ck_tile::GemmPipelineProblem< + OutDataType, + WeiDataType, + AccDataType, + GemmShape, + typename GroupedConvTraitsType::GroupedConvImplicitGemmTraitsBwdData, + ck_tile::element_wise::PassThrough, + ck_tile::element_wise::PassThrough, + InDataType, + true, + VectorSizeA, + VectorSizeB>; + + using BaseGemmPipeline = typename PipelineTypeTraits::template UniversalGemmPipeline; + + template + auto CreateKernel() const + { + constexpr auto scheduler = GemmConfigBase::Scheduler; + + using UniversalGemmProblem = + ck_tile::UniversalGemmPipelineProblem; + + using GemmPipeline = typename PipelineTypeTraits::template GemmPipeline; + + using CDEElementWise = ck_tile::element_wise::PassThrough; + + using ConvEpilogue = ck_tile::CShuffleEpilogue, // = DsDataType + AccDataType, + InDataType, + typename GroupedConvTraitsType::ImplicitGemmDsLayout, + ck_tile::tensor_layout::gemm::RowMajor, + CDEElementWise, + TilePartitioner::MPerBlock, + TilePartitioner::NPerBlock, + M_Warp, + N_Warp, + M_Warp_Tile, + N_Warp_Tile, + K_Warp_Tile, + GemmConfigBase::TransposeC, + MemOp, + 1, + true, + GroupedConvTraitsType::VectorSizeC>>; + + return ck_tile::GroupedConvolutionBackwardDataKernel{}; + } + + bool IsSupportedArgument(const ck_tile::GroupedConvBwdDataHostArgs& args) const override + { + if (args.k_batch > 1) + { + using Kernel = decltype(CreateKernel()); + return Kernel::IsSupportedArgument(args); + } + using Kernel = decltype(CreateKernel()); + return Kernel::IsSupportedArgument(args); + }; + + float Run(const ck_tile::GroupedConvBwdDataHostArgs& args, bool time_kernel, int n_warmup=5, int n_repeat=50) const override + { + const ck_tile::index_t gemm_k = + args.N_ * std::accumulate(args.output_spatial_lengths_.begin(), + args.output_spatial_lengths_.end(), + 1, + std::multiplies()); + + const ck_tile::index_t k_grain = args.k_batch * K_Tile; + const ck_tile::index_t K_split = (gemm_k + k_grain - 1) / k_grain * K_Tile; + const ck_tile::index_t num_loop = TilePartitioner::GetLoopNum(K_split); + const bool has_hot_loop = BaseGemmPipeline::BlockHasHotloop(num_loop); + const ck_tile::TailNumber tail_num = BaseGemmPipeline::GetBlockLoopTailNum(num_loop); + float ave_time{0}; + + const auto Run = [&](const auto has_hot_loop_, + const auto tail_number_, + const auto memory_operation_) { + constexpr bool has_hot_loop_v = has_hot_loop_.value; + constexpr auto tail_number_v = tail_number_.value; + constexpr auto memory_operation = memory_operation_.value; + + auto kernel = CreateKernel(); + using Kernel = decltype(kernel); + + auto kargs = Kernel::MakeKernelArgs(args); + const dim3 grids = Kernel::GridSize(args); + const dim3 blocks = Kernel::BlockSize(); + + ck_tile::stream_config s {nullptr, time_kernel, 1, n_warmup, n_repeat}; + + ave_time = ck_tile::launch_kernel( + s, ck_tile::make_kernel(kernel, grids, blocks, 0, kargs)); + + return ave_time; + }; + + const auto RunSplitk = [&](const auto has_hot_loop_, const auto tail_number_) { + if(args.k_batch == 1) + { + Run(has_hot_loop_, tail_number_, MemoryOpSet{}); + } + else + { + Run(has_hot_loop_, tail_number_, MemoryOpAtomicAdd{}); + } + }; + + BaseGemmPipeline::TailHandler(RunSplitk, has_hot_loop, tail_num); + return ave_time; + }; + + std::string GetName(const ck_tile::GroupedConvBwdDataHostArgs& args) const override + { + std::stringstream min_occupancy; + min_occupancy << "_blk_per_cu_" << kBlockPerCu; + if (args.k_batch > 1) + { + using Kernel = decltype(CreateKernel()); + return Kernel::GetName() + min_occupancy.str(); + } + using Kernel = decltype(CreateKernel()); + return Kernel::GetName() + min_occupancy.str(); + }; + + GroupedConvolutionBackwardDataInvoker() = default; + GroupedConvolutionBackwardDataInvoker(const GroupedConvolutionBackwardDataInvoker&) = default; + GroupedConvolutionBackwardDataInvoker& operator=(const GroupedConvolutionBackwardDataInvoker&) = default; + GroupedConvolutionBackwardDataInvoker(GroupedConvolutionBackwardDataInvoker&&) = default; + GroupedConvolutionBackwardDataInvoker& operator=(GroupedConvolutionBackwardDataInvoker&&) = default; + ~GroupedConvolutionBackwardDataInvoker() override = default; + }; + +} +} diff --git a/library/include/ck_tile/library/tensor_operation_instance/gpu/tile_grouped_conv_bwd_weight_bf16_instances.hpp b/library/include/ck_tile/library/tensor_operation_instance/gpu/tile_grouped_conv_bwd_weight_bf16_instances.hpp index a2d9fc358e..ba216b8dc2 100644 --- a/library/include/ck_tile/library/tensor_operation_instance/gpu/tile_grouped_conv_bwd_weight_bf16_instances.hpp +++ b/library/include/ck_tile/library/tensor_operation_instance/gpu/tile_grouped_conv_bwd_weight_bf16_instances.hpp @@ -63,24 +63,49 @@ using tile_grouped_conv_bwd_weight_bf16_instances = std::tuple< GroupedConvolutionBackwardWeightInvoker, GroupedConvolutionBackwardWeightInvoker, GroupedConvolutionBackwardWeightInvoker, - GroupedConvolutionBackwardWeightInvoker, GroupedConvolutionBackwardWeightInvoker, GroupedConvolutionBackwardWeightInvoker, - GroupedConvolutionBackwardWeightInvoker, GroupedConvolutionBackwardWeightInvoker, GroupedConvolutionBackwardWeightInvoker, GroupedConvolutionBackwardWeightInvoker, - GroupedConvolutionBackwardWeightInvoker, GroupedConvolutionBackwardWeightInvoker, GroupedConvolutionBackwardWeightInvoker, - GroupedConvolutionBackwardWeightInvoker, GroupedConvolutionBackwardWeightInvoker, GroupedConvolutionBackwardWeightInvoker, - GroupedConvolutionBackwardWeightInvoker + GroupedConvolutionBackwardWeightInvoker, + +GroupedConvolutionBackwardWeightInvoker, +GroupedConvolutionBackwardWeightInvoker, +GroupedConvolutionBackwardWeightInvoker, +GroupedConvolutionBackwardWeightInvoker, +GroupedConvolutionBackwardWeightInvoker, +GroupedConvolutionBackwardWeightInvoker, +GroupedConvolutionBackwardWeightInvoker, +GroupedConvolutionBackwardWeightInvoker, +GroupedConvolutionBackwardWeightInvoker, +GroupedConvolutionBackwardWeightInvoker, +GroupedConvolutionBackwardWeightInvoker, +GroupedConvolutionBackwardWeightInvoker, +GroupedConvolutionBackwardWeightInvoker, +GroupedConvolutionBackwardWeightInvoker, +GroupedConvolutionBackwardWeightInvoker, +GroupedConvolutionBackwardWeightInvoker, +GroupedConvolutionBackwardWeightInvoker, +GroupedConvolutionBackwardWeightInvoker, +GroupedConvolutionBackwardWeightInvoker, +GroupedConvolutionBackwardWeightInvoker, +GroupedConvolutionBackwardWeightInvoker, +GroupedConvolutionBackwardWeightInvoker, +GroupedConvolutionBackwardWeightInvoker, +GroupedConvolutionBackwardWeightInvoker, +GroupedConvolutionBackwardWeightInvoker, +GroupedConvolutionBackwardWeightInvoker, +GroupedConvolutionBackwardWeightInvoker, +GroupedConvolutionBackwardWeightInvoker // clang-format on >; diff --git a/library/include/ck_tile/library/tensor_operation_instance/gpu/tile_grouped_conv_bwd_weight_bf16_instances_opt.hpp b/library/include/ck_tile/library/tensor_operation_instance/gpu/tile_grouped_conv_bwd_weight_bf16_instances_opt.hpp index 6598e5eae8..01e034dd40 100644 --- a/library/include/ck_tile/library/tensor_operation_instance/gpu/tile_grouped_conv_bwd_weight_bf16_instances_opt.hpp +++ b/library/include/ck_tile/library/tensor_operation_instance/gpu/tile_grouped_conv_bwd_weight_bf16_instances_opt.hpp @@ -41,7 +41,43 @@ using tile_grouped_conv_bwd_weight_bf16_instances_opt = std::tuple< GroupedConvolutionBackwardWeightInvoker, GroupedConvolutionBackwardWeightInvoker, GroupedConvolutionBackwardWeightInvoker, - GroupedConvolutionBackwardWeightInvoker + GroupedConvolutionBackwardWeightInvoker, + + +GroupedConvolutionBackwardWeightInvoker, +GroupedConvolutionBackwardWeightInvoker, +GroupedConvolutionBackwardWeightInvoker, +GroupedConvolutionBackwardWeightInvoker, +GroupedConvolutionBackwardWeightInvoker, +GroupedConvolutionBackwardWeightInvoker, +GroupedConvolutionBackwardWeightInvoker, +GroupedConvolutionBackwardWeightInvoker, +GroupedConvolutionBackwardWeightInvoker, +GroupedConvolutionBackwardWeightInvoker, +GroupedConvolutionBackwardWeightInvoker, +GroupedConvolutionBackwardWeightInvoker, +GroupedConvolutionBackwardWeightInvoker, +GroupedConvolutionBackwardWeightInvoker, +GroupedConvolutionBackwardWeightInvoker, +GroupedConvolutionBackwardWeightInvoker, +GroupedConvolutionBackwardWeightInvoker, +GroupedConvolutionBackwardWeightInvoker, +GroupedConvolutionBackwardWeightInvoker, +GroupedConvolutionBackwardWeightInvoker, +GroupedConvolutionBackwardWeightInvoker, +GroupedConvolutionBackwardWeightInvoker, +GroupedConvolutionBackwardWeightInvoker, +GroupedConvolutionBackwardWeightInvoker, +GroupedConvolutionBackwardWeightInvoker, +GroupedConvolutionBackwardWeightInvoker, +GroupedConvolutionBackwardWeightInvoker, +GroupedConvolutionBackwardWeightInvoker, +GroupedConvolutionBackwardWeightInvoker, +GroupedConvolutionBackwardWeightInvoker, +GroupedConvolutionBackwardWeightInvoker, +GroupedConvolutionBackwardWeightInvoker + + // clang-format on >; diff --git a/library/include/ck_tile/library/tensor_operation_instance/gpu/tile_grouped_conv_fwd_bf16_instances.hpp b/library/include/ck_tile/library/tensor_operation_instance/gpu/tile_grouped_conv_fwd_bf16_instances.hpp index efac525010..20b2630db6 100644 --- a/library/include/ck_tile/library/tensor_operation_instance/gpu/tile_grouped_conv_fwd_bf16_instances.hpp +++ b/library/include/ck_tile/library/tensor_operation_instance/gpu/tile_grouped_conv_fwd_bf16_instances.hpp @@ -32,37 +32,89 @@ using tile_grouped_conv_fwd_bf16_instances = std::tuple< //##############################| Num| InLayout| WeiLayout| OutLayout| InData| WeiData| OutData| In| Wei| Out| K-block| M-tile| N-tile | K-tile | M-warp| N-warp| K-warp| M-warp| N-warp| K-warp| Vector| Vector| Vector| Double| GEMM| //##############################| Dim| | | | Type| Type| Type| Elementwise| Elementwise| Elementwise| per| | | | | | | tile| tile| tile| size| size| size| smem| pipeline| //##############################| Spatial| | | | | | | Operation| Operation| Operation| CU| | | | | | | size| size| size| A| B| C| buffer| version| - GroupedConvolutionForwardInvoker, - GroupedConvolutionForwardInvoker, - GroupedConvolutionForwardInvoker, - GroupedConvolutionForwardInvoker, - GroupedConvolutionForwardInvoker, - GroupedConvolutionForwardInvoker, - GroupedConvolutionForwardInvoker, - GroupedConvolutionForwardInvoker, - GroupedConvolutionForwardInvoker, - GroupedConvolutionForwardInvoker, - GroupedConvolutionForwardInvoker, - GroupedConvolutionForwardInvoker, - GroupedConvolutionForwardInvoker, - GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker - GroupedConvolutionForwardInvoker, - GroupedConvolutionForwardInvoker, - GroupedConvolutionForwardInvoker, - GroupedConvolutionForwardInvoker, - GroupedConvolutionForwardInvoker, - GroupedConvolutionForwardInvoker, - GroupedConvolutionForwardInvoker, - GroupedConvolutionForwardInvoker, - GroupedConvolutionForwardInvoker, - GroupedConvolutionForwardInvoker, - GroupedConvolutionForwardInvoker, - GroupedConvolutionForwardInvoker, - GroupedConvolutionForwardInvoker, - GroupedConvolutionForwardInvoker + // old + // GroupedConvolutionForwardInvoker, +// GroupedConvolutionForwardInvoker, +// GroupedConvolutionForwardInvoker, +// GroupedConvolutionForwardInvoker, +// GroupedConvolutionForwardInvoker, +// GroupedConvolutionForwardInvoker, +// GroupedConvolutionForwardInvoker, +// GroupedConvolutionForwardInvoker, +// GroupedConvolutionForwardInvoker, +// GroupedConvolutionForwardInvoker, +// GroupedConvolutionForwardInvoker, +// GroupedConvolutionForwardInvoker, +// GroupedConvolutionForwardInvoker, +// GroupedConvolutionForwardInvoker, +// GroupedConvolutionForwardInvoker, +// GroupedConvolutionForwardInvoker, +// GroupedConvolutionForwardInvoker, +// GroupedConvolutionForwardInvoker, +// GroupedConvolutionForwardInvoker, +// GroupedConvolutionForwardInvoker, + +// // GroupedConvolutionForwardInvoker, +// // GroupedConvolutionForwardInvoker, +// // GroupedConvolutionForwardInvoker, +// // GroupedConvolutionForwardInvoker, +// // GroupedConvolutionForwardInvoker, +// // GroupedConvolutionForwardInvoker, + +// GroupedConvolutionForwardInvoker, +// GroupedConvolutionForwardInvoker, +// GroupedConvolutionForwardInvoker, +// GroupedConvolutionForwardInvoker, +// GroupedConvolutionForwardInvoker, +// GroupedConvolutionForwardInvoker, +// GroupedConvolutionForwardInvoker, +// GroupedConvolutionForwardInvoker, + +// GroupedConvolutionForwardInvoker, +// GroupedConvolutionForwardInvoker, +// GroupedConvolutionForwardInvoker, +// GroupedConvolutionForwardInvoker, +// GroupedConvolutionForwardInvoker, +// GroupedConvolutionForwardInvoker // clang-format on >; diff --git a/library/include/ck_tile/library/tensor_operation_instance/gpu/tile_grouped_conv_fwd_bf16_instances_2.hpp b/library/include/ck_tile/library/tensor_operation_instance/gpu/tile_grouped_conv_fwd_bf16_instances_2.hpp new file mode 100644 index 0000000000..1356444c4b --- /dev/null +++ b/library/include/ck_tile/library/tensor_operation_instance/gpu/tile_grouped_conv_fwd_bf16_instances_2.hpp @@ -0,0 +1,147 @@ +// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +#include "ck_tile/library/tensor_operation_instance/gpu/tile_grouped_conv_instance_factory.hpp" +#include "ck_tile/library/tensor_operation_instance/gpu/tile_grouped_conv_fwd_invoker.hpp" + +namespace ck_tile { +namespace ops { + +using BF16 = ck_tile::bfloat16_t; + +using DeviceOpFwd2DBF16 = GroupedConvolutionForwardBaseInvoker<2, + NHWGC, + GKYXC, + NHWGK, + BF16, + BF16, + BF16, + PassThrough, + PassThrough, + PassThrough, + BF16, + BF16>; + +template +using tile_grouped_conv_fwd_bf16_instances_2 = std::tuple< +// clang-format off + //##############################| Num| InLayout| WeiLayout| OutLayout| InData| WeiData| OutData| In| Wei| Out| K-block| M-tile| N-tile | K-tile | M-warp| N-warp| K-warp| M-warp| N-warp| K-warp| Vector| Vector| Vector| Double| GEMM| + //##############################| Dim| | | | Type| Type| Type| Elementwise| Elementwise| Elementwise| per| | | | | | | tile| tile| tile| size| size| size| smem| pipeline| + //##############################| Spatial| | | | | | | Operation| Operation| Operation| CU| | | | | | | size| size| size| A| B| C| buffer| version| + +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +// GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +// GroupedConvolutionForwardInvoker, +// GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +// GroupedConvolutionForwardInvoker, +// GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker +// GroupedConvolutionForwardInvoker + + + + // old + // GroupedConvolutionForwardInvoker, +// GroupedConvolutionForwardInvoker, +// GroupedConvolutionForwardInvoker, +// GroupedConvolutionForwardInvoker, +// GroupedConvolutionForwardInvoker, +// GroupedConvolutionForwardInvoker, +// GroupedConvolutionForwardInvoker, +// GroupedConvolutionForwardInvoker, +// GroupedConvolutionForwardInvoker, +// GroupedConvolutionForwardInvoker, +// GroupedConvolutionForwardInvoker, +// GroupedConvolutionForwardInvoker, +// GroupedConvolutionForwardInvoker, +// GroupedConvolutionForwardInvoker, +// GroupedConvolutionForwardInvoker, +// GroupedConvolutionForwardInvoker, +// GroupedConvolutionForwardInvoker, +// GroupedConvolutionForwardInvoker, +// GroupedConvolutionForwardInvoker, +// GroupedConvolutionForwardInvoker, + +// // GroupedConvolutionForwardInvoker, +// // GroupedConvolutionForwardInvoker, +// // GroupedConvolutionForwardInvoker, +// // GroupedConvolutionForwardInvoker, +// // GroupedConvolutionForwardInvoker, +// // GroupedConvolutionForwardInvoker, + +// GroupedConvolutionForwardInvoker, +// GroupedConvolutionForwardInvoker, +// GroupedConvolutionForwardInvoker, +// GroupedConvolutionForwardInvoker, +// GroupedConvolutionForwardInvoker, +// GroupedConvolutionForwardInvoker, +// GroupedConvolutionForwardInvoker, +// GroupedConvolutionForwardInvoker, + +// GroupedConvolutionForwardInvoker, +// GroupedConvolutionForwardInvoker, +// GroupedConvolutionForwardInvoker, +// GroupedConvolutionForwardInvoker, +// GroupedConvolutionForwardInvoker, +// GroupedConvolutionForwardInvoker + // clang-format on +>; + +} // namespace ops +} // namespace ck_tile diff --git a/library/include/ck_tile/library/tensor_operation_instance/gpu/tile_grouped_conv_fwd_bf16_instances_3.hpp b/library/include/ck_tile/library/tensor_operation_instance/gpu/tile_grouped_conv_fwd_bf16_instances_3.hpp new file mode 100644 index 0000000000..06e70b0146 --- /dev/null +++ b/library/include/ck_tile/library/tensor_operation_instance/gpu/tile_grouped_conv_fwd_bf16_instances_3.hpp @@ -0,0 +1,151 @@ +// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +#include "ck_tile/library/tensor_operation_instance/gpu/tile_grouped_conv_instance_factory.hpp" +#include "ck_tile/library/tensor_operation_instance/gpu/tile_grouped_conv_fwd_invoker.hpp" + +namespace ck_tile { +namespace ops { + +using BF16 = ck_tile::bfloat16_t; + +using DeviceOpFwd2DBF16 = GroupedConvolutionForwardBaseInvoker<2, + NHWGC, + GKYXC, + NHWGK, + BF16, + BF16, + BF16, + PassThrough, + PassThrough, + PassThrough, + BF16, + BF16>; + +template +using tile_grouped_conv_fwd_bf16_instances_3 = std::tuple< +// clang-format off + //##############################| Num| InLayout| WeiLayout| OutLayout| InData| WeiData| OutData| In| Wei| Out| K-block| M-tile| N-tile | K-tile | M-warp| N-warp| K-warp| M-warp| N-warp| K-warp| Vector| Vector| Vector| Double| GEMM| + //##############################| Dim| | | | Type| Type| Type| Elementwise| Elementwise| Elementwise| per| | | | | | | tile| tile| tile| size| size| size| smem| pipeline| + //##############################| Spatial| | | | | | | Operation| Operation| Operation| CU| | | | | | | size| size| size| A| B| C| buffer| version| + +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +// GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +// GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker, +GroupedConvolutionForwardInvoker + + // old + // GroupedConvolutionForwardInvoker, +// GroupedConvolutionForwardInvoker, +// GroupedConvolutionForwardInvoker, +// GroupedConvolutionForwardInvoker, +// GroupedConvolutionForwardInvoker, +// GroupedConvolutionForwardInvoker, +// GroupedConvolutionForwardInvoker, +// GroupedConvolutionForwardInvoker, +// GroupedConvolutionForwardInvoker, +// GroupedConvolutionForwardInvoker, +// GroupedConvolutionForwardInvoker, +// GroupedConvolutionForwardInvoker, +// GroupedConvolutionForwardInvoker, +// GroupedConvolutionForwardInvoker, +// GroupedConvolutionForwardInvoker, +// GroupedConvolutionForwardInvoker, +// GroupedConvolutionForwardInvoker, +// GroupedConvolutionForwardInvoker, +// GroupedConvolutionForwardInvoker, +// GroupedConvolutionForwardInvoker, + +// // GroupedConvolutionForwardInvoker, +// // GroupedConvolutionForwardInvoker, +// // GroupedConvolutionForwardInvoker, +// // GroupedConvolutionForwardInvoker, +// // GroupedConvolutionForwardInvoker, +// // GroupedConvolutionForwardInvoker, + +// GroupedConvolutionForwardInvoker, +// GroupedConvolutionForwardInvoker, +// GroupedConvolutionForwardInvoker, +// GroupedConvolutionForwardInvoker, +// GroupedConvolutionForwardInvoker, +// GroupedConvolutionForwardInvoker, +// GroupedConvolutionForwardInvoker, +// GroupedConvolutionForwardInvoker, + +// GroupedConvolutionForwardInvoker, +// GroupedConvolutionForwardInvoker, +// GroupedConvolutionForwardInvoker, +// GroupedConvolutionForwardInvoker, +// GroupedConvolutionForwardInvoker, +// GroupedConvolutionForwardInvoker + // clang-format on +>; + +} // namespace ops +} // namespace ck_tile diff --git a/library/include/ck_tile/library/tensor_operation_instance/gpu/tile_grouped_conv_fwd_factory.hpp b/library/include/ck_tile/library/tensor_operation_instance/gpu/tile_grouped_conv_fwd_factory.hpp index 9f7848d951..a63de4ea55 100644 --- a/library/include/ck_tile/library/tensor_operation_instance/gpu/tile_grouped_conv_fwd_factory.hpp +++ b/library/include/ck_tile/library/tensor_operation_instance/gpu/tile_grouped_conv_fwd_factory.hpp @@ -41,6 +41,9 @@ using DeviceOpFwd2DF16 = GroupedConvolutionForwardBaseInvoker<2, F16>; void add_grouped_conv2d_fwd_bf16_instances(std::vector>& instances); +void add_grouped_conv2d_fwd_bf16_instances_2(std::vector>& instances); +void add_grouped_conv2d_fwd_bf16_instances_3(std::vector>& instances); + void add_grouped_conv2d_fwd_f16_instances(std::vector>& instances); template ) { add_grouped_conv2d_fwd_bf16_instances(op_ptrs); + add_grouped_conv2d_fwd_bf16_instances_2(op_ptrs); + add_grouped_conv2d_fwd_bf16_instances_3(op_ptrs); } else { diff --git a/library/src/ck_tile/tensor_operation_instance/gpu/CMakeLists.txt b/library/src/ck_tile/tensor_operation_instance/gpu/CMakeLists.txt index ac97cd0b97..af8c808f2a 100644 --- a/library/src/ck_tile/tensor_operation_instance/gpu/CMakeLists.txt +++ b/library/src/ck_tile/tensor_operation_instance/gpu/CMakeLists.txt @@ -3,6 +3,10 @@ # Moving these large template instantiations to separate .cpp files enables parallel compilation +add_library(ck_tile_grouped_conv_bwd_data_bf16_instances STATIC + tile_grouped_conv_bwd_data_bf16_instances.cpp +) + add_library(ck_tile_grouped_conv_bwd_weight_fp16_instances STATIC tile_grouped_conv_bwd_weight_fp16_instances.cpp ) @@ -23,6 +27,19 @@ add_library(ck_tile_grouped_conv_fwd_bf16_instances STATIC tile_grouped_conv_fwd_bf16_instances.cpp ) +add_library(ck_tile_grouped_conv_fwd_bf16_instances_2 STATIC + tile_grouped_conv_fwd_bf16_instances_2.cpp +) + +add_library(ck_tile_grouped_conv_fwd_bf16_instances_3 STATIC + tile_grouped_conv_fwd_bf16_instances_3.cpp +) + +target_include_directories(ck_tile_grouped_conv_bwd_data_bf16_instances PRIVATE + ${PROJECT_SOURCE_DIR}/include + ${PROJECT_SOURCE_DIR}/library/include +) + target_include_directories(ck_tile_grouped_conv_bwd_weight_fp16_instances PRIVATE ${PROJECT_SOURCE_DIR}/include ${PROJECT_SOURCE_DIR}/library/include @@ -47,3 +64,13 @@ target_include_directories(ck_tile_grouped_conv_fwd_bf16_instances PRIVATE ${PROJECT_SOURCE_DIR}/include ${PROJECT_SOURCE_DIR}/library/include ) + +target_include_directories(ck_tile_grouped_conv_fwd_bf16_instances_2 PRIVATE + ${PROJECT_SOURCE_DIR}/include + ${PROJECT_SOURCE_DIR}/library/include +) + +target_include_directories(ck_tile_grouped_conv_fwd_bf16_instances_3 PRIVATE + ${PROJECT_SOURCE_DIR}/include + ${PROJECT_SOURCE_DIR}/library/include +) diff --git a/library/src/ck_tile/tensor_operation_instance/gpu/tile_grouped_conv_bwd_data_bf16_instances.cpp b/library/src/ck_tile/tensor_operation_instance/gpu/tile_grouped_conv_bwd_data_bf16_instances.cpp new file mode 100644 index 0000000000..b8df0528de --- /dev/null +++ b/library/src/ck_tile/tensor_operation_instance/gpu/tile_grouped_conv_bwd_data_bf16_instances.cpp @@ -0,0 +1,20 @@ +// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved. + +#include "ck_tile/library/tensor_operation_instance/gpu/tile_grouped_conv_instance_factory.hpp" +#include "ck_tile/library/tensor_operation_instance/gpu/tile_grouped_conv_bwd_data_bf16_instances.hpp" + +namespace ck_tile { +namespace ops { + +void add_grouped_conv2d_bwd_data_bf16_instances(std::vector>& instances) +{ + add_device_operation_instances(instances, + tile_grouped_conv_bwd_data_bf16_instances< + 2, + NHWGC, + GKYXC, + NHWGK>{}); +} + +} // namespace ops +} // namespace ck_tile diff --git a/library/src/ck_tile/tensor_operation_instance/gpu/tile_grouped_conv_fwd_bf16_instances_2.cpp b/library/src/ck_tile/tensor_operation_instance/gpu/tile_grouped_conv_fwd_bf16_instances_2.cpp new file mode 100644 index 0000000000..f55b3d9946 --- /dev/null +++ b/library/src/ck_tile/tensor_operation_instance/gpu/tile_grouped_conv_fwd_bf16_instances_2.cpp @@ -0,0 +1,20 @@ +// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved. + +#include "ck_tile/library/tensor_operation_instance/gpu/tile_grouped_conv_instance_factory.hpp" +#include "ck_tile/library/tensor_operation_instance/gpu/tile_grouped_conv_fwd_bf16_instances_2.hpp" + +namespace ck_tile { +namespace ops { + +void add_grouped_conv2d_fwd_bf16_instances_2(std::vector>& instances) +{ + add_device_operation_instances(instances, + tile_grouped_conv_fwd_bf16_instances_2< + 2, + NHWGC, + GKYXC, + NHWGK>{}); +} + +} // namespace ops +} // namespace ck_tile diff --git a/library/src/ck_tile/tensor_operation_instance/gpu/tile_grouped_conv_fwd_bf16_instances_3.cpp b/library/src/ck_tile/tensor_operation_instance/gpu/tile_grouped_conv_fwd_bf16_instances_3.cpp new file mode 100644 index 0000000000..4829e41780 --- /dev/null +++ b/library/src/ck_tile/tensor_operation_instance/gpu/tile_grouped_conv_fwd_bf16_instances_3.cpp @@ -0,0 +1,20 @@ +// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved. + +#include "ck_tile/library/tensor_operation_instance/gpu/tile_grouped_conv_instance_factory.hpp" +#include "ck_tile/library/tensor_operation_instance/gpu/tile_grouped_conv_fwd_bf16_instances_3.hpp" + +namespace ck_tile { +namespace ops { + +void add_grouped_conv2d_fwd_bf16_instances_3(std::vector>& instances) +{ + add_device_operation_instances(instances, + tile_grouped_conv_fwd_bf16_instances_3< + 2, + NHWGC, + GKYXC, + NHWGK>{}); +} + +} // namespace ops +} // namespace ck_tile diff --git a/profiler/ck_tile/include/tile_profile_grouped_conv_bwd_data_impl.hpp b/profiler/ck_tile/include/tile_profile_grouped_conv_bwd_data_impl.hpp new file mode 100644 index 0000000000..f9e4ec1971 --- /dev/null +++ b/profiler/ck_tile/include/tile_profile_grouped_conv_bwd_data_impl.hpp @@ -0,0 +1,299 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +#include +#include +#include +#include +#include +#include + +#include "ck_tile/host.hpp" +#include "ck_tile/host/convolution_parameter.hpp" +#include "ck_tile/ops/elementwise/unary_element_wise_operation.hpp" +#include "ck_tile/ops/grouped_convolution/utils/grouped_convolution_utils.hpp" +#include "ck_tile/library/tensor_operation_instance/gpu/tile_grouped_conv_bwd_data_factory.hpp" +#include "ck_tile/ops/grouped_convolution/kernel/grouped_convolution_backward_data_kernel.hpp" +#include "ck_tile/host/reference/reference_grouped_conv_bwd_data.hpp" +namespace ck_tile { +namespace profiler { + +template +auto calculate_rtol_atol(const ck_tile::index_t GemmK, + const ck_tile::index_t kbatch, + const float max_accumulated_value) +{ + using ComputeType = + std::conditional_t; + // Calculate thresholds + const auto rtol = ck_tile::get_relative_threshold( + ck_tile::integer_divide_ceil(GemmK, kbatch)); + const auto atol = ck_tile::get_absolute_threshold( + max_accumulated_value / kbatch, ck_tile::integer_divide_ceil(GemmK, kbatch)); + // Calculate error due to split_k accumulation + const auto rtol_split_k = + ck_tile::get_relative_threshold(kbatch); + const auto atol_split_k = + ck_tile::get_absolute_threshold( + max_accumulated_value, kbatch); + // Use higher threshold + return ck_tile::make_tuple(std::max(rtol, rtol_split_k), std::max(atol, atol_split_k)); +} + +template +bool profile_grouped_conv_bwd_data_impl(int do_verification, + int init_method, + bool /*do_log*/, + bool time_kernel, + const ck_tile::conv::ConvParam& conv_param, + const std::string& split_k, + ck_tile::index_t instance_index = -1) +{ + using AccDataType = float; + using InElementOp = ck_tile::element_wise::PassThrough; + using WeiElementOp = ck_tile::element_wise::PassThrough; + using OutElementOp = ck_tile::element_wise::PassThrough; + + const auto in_g_n_c_wis_desc = + ck_tile::conv::make_input_host_tensor_descriptor_g_n_c_wis_packed(conv_param); + const auto wei_g_k_c_xs_desc = + ck_tile::conv::make_weight_host_tensor_descriptor_g_k_c_xs_packed(conv_param); + const auto out_g_n_k_wos_desc = + ck_tile::conv::make_output_host_tensor_descriptor_g_n_k_wos_packed(conv_param); + + ck_tile::HostTensor input(in_g_n_c_wis_desc); + ck_tile::HostTensor weight(wei_g_k_c_xs_desc); + ck_tile::HostTensor output(out_g_n_k_wos_desc); + + std::cout << "input: " << input.mDesc << std::endl; + std::cout << "weight: " << weight.mDesc << std::endl; + std::cout << "output: " << output.mDesc << std::endl; + + switch(init_method) + { + case 0: + ck_tile::FillUniformDistribution{-1.f, 1.f}(weight); + ck_tile::FillUniformDistribution{-1.f, 1.f}(output); + break; + case 1: + ck_tile::FillMonotonicSeq{}(weight); + ck_tile::FillMonotonicSeq{}(output); + break; + case 2: + ck_tile::FillUniformDistribution{1.f, 1.f}(weight); + ck_tile::FillUniformDistribution{1.f, 1.f}(output); + break; + default: + input.SetZero(); + output.SetZero(); + } + + using DeviceOp = ops::GroupedConvolutionBackwardDataBaseInvoker< + NDimSpatial, + InLayout, + WeiLayout, + OutLayout, + InDataType, + WeiDataType, + OutDataType, + InElementOp, + WeiElementOp, + OutElementOp, + ComputeTypeA, + ComputeTypeB>; + + // get device op instances + const auto ops = ck_tile::ops::DeviceOperationInstanceFactory::GetInstances(); + + std::cout << "found " << ops.size() << " instances" << std::endl; + + std::string best_op_name; + float best_avg_time = 0; + float best_tflops = 0; + float best_gb_per_sec = 0; + std::string best_split_k("1"); + + std::vector split_k_list = {1, 2, 4, 8, 16, 32, 64, 128}; + if(split_k != "all") + { + try + { + ck_tile::index_t split_k_value = std::stoi(split_k); + split_k_list = {split_k_value}; + } + catch(const std::exception& e) + { + std::cerr << e.what() << '\n'; + exit(EXIT_FAILURE); + } + } + + // First, calculate the reference result if verification is needed. + ck_tile::HostTensor input_host_ref(in_g_n_c_wis_desc); + input_host_ref.SetZero(); + if (do_verification) + { + ck_tile::reference_grouped_conv_bwd_data( + input_host_ref, + weight, + output, + conv_param.conv_filter_strides_, + conv_param.conv_filter_dilations_, + conv_param.input_left_pads_, + conv_param.input_right_pads_); + } + + + index_t num_kernel = 0; + bool all_pass = true; + for(auto& op : ops) + { + for(std::size_t split_k_id = 0; split_k_id < split_k_list.size(); split_k_id++) + { + auto split_k_value = split_k_list[split_k_id]; + auto split_k_param_str = std::to_string(split_k_value); + + ck_tile::DeviceMem input_dev_buf(input.get_element_space_size_in_bytes()); + ck_tile::DeviceMem weight_dev_buf(weight.get_element_space_size_in_bytes()); + ck_tile::DeviceMem output_dev_buf(output.get_element_space_size_in_bytes()); + + input_dev_buf.SetZero(); + weight_dev_buf.ToDevice(weight.data()); + output_dev_buf.ToDevice(output.data()); + + ck_tile::GroupedConvBwdDataHostArgs args(conv_param, + input_dev_buf.GetDeviceBuffer(), + weight_dev_buf.GetDeviceBuffer(), + {}, + output_dev_buf.GetDeviceBuffer(), + split_k_value); + + // Split-K autodeduction is not supported. + if(op->IsSupportedArgument(args) && split_k_value >= 1) + { + num_kernel++; + if((instance_index != -1) && (instance_index + 1 != num_kernel)) + { + // skip test if instance_index is specified + continue; + } + + std::string op_name = op->GetName(args); + std::cout << op_name << ", SplitK " << split_k_param_str << " is profiled..." << std::endl; + + // Run verification first. If it doesn't pass, no need to do performance measurement. + bool pass = false; + if(do_verification) + { + constexpr int n_warmup = 0; + constexpr int n_repeat = 1; + + op->Run(args, false, n_warmup, n_repeat); + input_dev_buf.FromDevice(input.data()); + + const ck_tile::index_t GemmK = weight.get_element_size() / (conv_param.G_ * conv_param.K_); + const float max_accumulated_value = + *std::max_element(input_host_ref.mData.begin(), input_host_ref.mData.end()); + const auto rtol_atol = + calculate_rtol_atol( + GemmK, split_k_value, max_accumulated_value); + + pass = ck_tile::check_err(input, + input_host_ref, + "Error: Incorrect results!", + rtol_atol.at(ck_tile::number<0>{}), + rtol_atol.at(ck_tile::number<1>{})); + + std::cout << "Relative error threshold: " << rtol_atol.at(ck_tile::number<0>{}) + << " Absolute error threshold: " << rtol_atol.at(ck_tile::number<1>{}) + << std::endl; + std::cout << "The CPU verification result is:" << (pass ? "correct" : "fail") << std::endl; + + all_pass &= pass; + } + + bool is_valid = do_verification ? pass : true; + + if (is_valid) + { + constexpr int n_warmup = 5; + constexpr int n_repeat = 50; + float avg_time = op->Run(args, time_kernel, n_warmup, n_repeat); + + std::size_t flop = conv_param.GetFlops(); + std::size_t num_btype = conv_param.GetByte(); + + float tflops = static_cast(flop) / 1.E9 / avg_time; + float gb_per_sec = num_btype / 1.E6 / avg_time; + + std::cout << "Perf: " << std::setw(10) << avg_time << " ms, " << tflops + << " TFlops, " << gb_per_sec << " GB/s, " << op_name << ", SplitK " + << split_k_param_str << std::endl; + + if(tflops > best_tflops) + { + best_op_name = op_name; + best_tflops = tflops; + best_avg_time = avg_time; + best_gb_per_sec = gb_per_sec; + best_split_k = split_k_param_str; + } + } + } + else + { + std::cout << op->GetName(args) << ", SplitK " << split_k_param_str + << " does not support this problem." << std::endl; + } + } + } + + std::stringstream ss; + ss << "\n********************************" + << "\nCK Tile best configuration parameters:" + << "\n********************************" + << "\nname: " << best_op_name + << "\navg_time: " << best_avg_time << "\ntflops: " << best_tflops + << "\nGB/s: " << best_gb_per_sec + << "\nSplitK: " << best_split_k + << std::endl; + + std::cout << ss.str(); + + const char* log_file = std::getenv("CK_TILE_PROFILER_LOG_FILE"); + if(log_file != nullptr) + { + std::ofstream out(log_file, std::ios::app); + if(out.is_open()) + { + std::stringstream out_ss; + out_ss << "CK Tile best configuration:" << std::endl + << "name: " << best_op_name << std::endl + << "avg_time: " << best_avg_time << std::endl + << "SplitK: " << best_split_k << std::endl + << "all_pass " << (all_pass ? "true" : "false") << std::endl; + out << out_ss.str(); + out.close(); + } + } + + if(instance_index != -1) + { + std::cout << "grouped_conv_bwd_data_instance (" << instance_index << "/" << num_kernel + << "): Passed" << std::endl; + } + return all_pass; +} + +} // namespace profiler +} // namespace ck_tile diff --git a/profiler/ck_tile/include/tile_profile_grouped_conv_bwd_weight_impl.hpp b/profiler/ck_tile/include/tile_profile_grouped_conv_bwd_weight_impl.hpp index 7906007369..038d7176dc 100644 --- a/profiler/ck_tile/include/tile_profile_grouped_conv_bwd_weight_impl.hpp +++ b/profiler/ck_tile/include/tile_profile_grouped_conv_bwd_weight_impl.hpp @@ -280,7 +280,8 @@ bool profile_grouped_conv_bwd_weight_impl(int do_verification, out_ss << "CK Tile best configuration:" << std::endl << "name: " << best_op_name << std::endl << "avg_time: " << best_avg_time << std::endl - << "SplitK: " << best_split_k << std::endl; + << "SplitK: " << best_split_k << std::endl + << "all_pass " << (all_pass ? "true" : "false") << std::endl; out << out_ss.str(); out.close(); } diff --git a/profiler/ck_tile/include/tile_profile_grouped_conv_fwd_impl.hpp b/profiler/ck_tile/include/tile_profile_grouped_conv_fwd_impl.hpp index 3c2d920836..a9b0ae9e8a 100644 --- a/profiler/ck_tile/include/tile_profile_grouped_conv_fwd_impl.hpp +++ b/profiler/ck_tile/include/tile_profile_grouped_conv_fwd_impl.hpp @@ -91,8 +91,8 @@ bool profile_grouped_conv_fwd_impl(int do_verification, } else if(init_method == 2) { - ck_tile::FillUniformDistribution{1.f, 1.f}(input); - ck_tile::FillUniformDistribution{1.f, 1.f}(weight); + ck_tile::FillUniformDistribution{0.f, 1.f}(input); + ck_tile::FillUniformDistribution{0.f, 1.f}(weight); } else { @@ -133,7 +133,7 @@ bool profile_grouped_conv_fwd_impl(int do_verification, ck_tile::DeviceMem output_dev_buf(output.get_element_space_size_in_bytes()); input_dev_buf.ToDevice(input.data()); - weight_dev_buf.ToDevice(output.data()); + weight_dev_buf.ToDevice(weight.data()); output_dev_buf.SetZero(); ck_tile::GroupedConvFwdHostArgs args(conv_param, @@ -233,6 +233,23 @@ bool profile_grouped_conv_fwd_impl(int do_verification, << "\navg_time: " << best_avg_time << "\ntflops: " << best_tflops << "\nGB/s: " << best_gb_per_sec << std::endl; + const char* log_file = std::getenv("CK_TILE_PROFILER_LOG_FILE"); + if(log_file != nullptr) + { + std::ofstream out(log_file, std::ios::app); + if(out.is_open()) + { + std::stringstream out_ss; + out_ss << "CK Tile best configuration:" << std::endl + << "name: " << best_op_name << std::endl + << "avg_time: " << best_avg_time << std::endl + << "SplitK: " << 1 << std::endl + << "all_pass " << (all_pass ? "true" : "false") << std::endl; + out << out_ss.str(); + out.close(); + } + } + if(instance_index != -1) { std::cout << "grouped_conv_fwd_instance (" << instance_index << "/" << num_kernel diff --git a/profiler/ck_tile/src/CMakeLists.txt b/profiler/ck_tile/src/CMakeLists.txt index 26db40b9fa..7c42cfe065 100644 --- a/profiler/ck_tile/src/CMakeLists.txt +++ b/profiler/ck_tile/src/CMakeLists.txt @@ -11,6 +11,7 @@ message(STATUS "CK_PROFILER_OP_FILTER: ${CK_PROFILER_OP_FILTER}") message(STATUS "CK_PROFILER_INSTANCE_FILTER: ${CK_PROFILER_INSTANCE_FILTER}") if(SUPPORTED_GPU_TARGETS MATCHES "gfx9" OR SUPPORTED_GPU_TARGETS MATCHES "gfx1[12]") + list(APPEND PROFILER_OPS tile_profile_grouped_conv_bwd_data.cpp) list(APPEND PROFILER_OPS tile_profile_grouped_conv_bwd_weight.cpp) list(APPEND PROFILER_OPS tile_profile_grouped_conv_fwd.cpp) endif() @@ -39,11 +40,15 @@ endif() set(DEVICE_INSTANCES "") if(SUPPORTED_GPU_TARGETS MATCHES "gfx9" OR SUPPORTED_GPU_TARGETS MATCHES "gfx1[12]") + list(APPEND DEVICE_INSTANCES ck_tile_grouped_conv_bwd_data_bf16_instances) list(APPEND DEVICE_INSTANCES ck_tile_grouped_conv_bwd_weight_fp16_instances) list(APPEND DEVICE_INSTANCES ck_tile_grouped_conv_bwd_weight_bf16_instances) list(APPEND DEVICE_INSTANCES ck_tile_grouped_conv_bwd_weight_bf16_instances_opt) list(APPEND DEVICE_INSTANCES ck_tile_grouped_conv_fwd_fp16_instances) list(APPEND DEVICE_INSTANCES ck_tile_grouped_conv_fwd_bf16_instances) + list(APPEND DEVICE_INSTANCES ck_tile_grouped_conv_fwd_bf16_instances_2) + list(APPEND DEVICE_INSTANCES ck_tile_grouped_conv_fwd_bf16_instances_3) + endif() if(SUPPORTED_GPU_TARGETS MATCHES "gfx9" OR SUPPORTED_GPU_TARGETS MATCHES "gfx1[12]") diff --git a/profiler/ck_tile/src/tile_profile_grouped_conv_bwd_data.cpp b/profiler/ck_tile/src/tile_profile_grouped_conv_bwd_data.cpp new file mode 100644 index 0000000000..bfd5393407 --- /dev/null +++ b/profiler/ck_tile/src/tile_profile_grouped_conv_bwd_data.cpp @@ -0,0 +1,219 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved. + +#include +#include +#include +#include + +#include "tile_profile_grouped_conv_bwd_data_impl.hpp" +#include "tile_profiler_operation_registry.hpp" + +// CK Tile library dependencies +#include "ck_tile/core/numeric/integral_constant.hpp" +#include "ck_tile/ops/common/tensor_layout.hpp" + +namespace { + +enum struct ConvLayout +{ + GNHWC_GKYXC_GNHWK, // 0 + NHWGC_GKYXC_NHWGK, // 1 + NGCHW_GKYXC_NGKHW, // 2 + NGCHW_GKCYX_NGKHW, // 3 +}; + +enum struct ConvDataType +{ + F32_F32_F32, // 0 + F16_F16_F16, // 1 + BF16_BF16_BF16, // 2 +}; + +#define OP_NAME "grouped_conv_bwd_data" +#define OP_DESC "Grouped Convolution Backward Data" + +static void print_helper_msg() +{ + std::string conv_param_parser_helper_msg; + + conv_param_parser_helper_msg += "Following arguments (depending on number of spatial dims):\n" + " Number of spatial dimensions (1=Conv1d, 2=Conv2d, 3=Conv3d)\n" + " G, N, K, C, \n" + " , (ie Y, X for 2D)\n" + " , (ie Hi, Wi for 2D)\n" + " , (ie Sy, Sx for 2D)\n" + " , (ie Dy, Dx for 2D)\n" + " , (ie LeftPy, LeftPx for 2D)\n" + " , (ie RightPy, RightPx for 2D)\n"; + + std::cout << "arg1: tensor operation (" OP_NAME ": " OP_DESC ")\n" + << "arg2: data type (0: Input fp32, Weight fp32, Output fp32\n" + << " 1: Input fp16, Weight fp16, Output fp16\n" + << " 2: Input bf16, Weight fp32, Output bf16\n" + << " 3: Input fp16, Weight fp16, Output fp16, Gemm bf8@fp8\n" + << " 4: Input int8, Weight int8, Output int8\n" + << " 5: Input bf16, Weight bf16, Output bf16\n" + << " 6: Input fp32, Weight fp32, Output fp32, Compute tf32)\n" + << "arg3: tensor layout (0: Input[G, N, C, Hi, Wi], Weight[G, K, C, Y, X], Output[G, " + "N, K, Ho, Wo]\n" + << " 1: Input[G, N, Hi, Wi, C], Weight[G, K, Y, X, C], Output[G, " + "N, Ho, Wo, K]\n" + << " 2: Input[N, Hi, Wi, G, C], Weight[G, K, Y, X, C], Output[N, " + "Ho, Wo, G, K]\n" + << " 3: Input[N, G, C, Hi, Wi], Weight[G, K, Y, X, C], Output[N, " + "G, K, Ho, Wo]\n" + << " 4: Input[N, G, C, Hi, Wi], Weight[G, K, C, Y, X], Output[N, " + "G, K, Ho, Wo]\n" + << "arg4: verification (0: no, 1: yes)\n" + << "arg5: initialization (0: no init, 1: integer value, 2: decimal value)\n" + << "arg6: print tensor value (0: no; 1: yes)\n" + << "arg7: time kernel (0: no, 1: yes)\n" + << conv_param_parser_helper_msg + << " SplitK (-1 for internally computed split-K value, positive value to set k " + "batches explicitly, or 'all' to test all internal split-K values)\n" + << std::endl; +} + +} // namespace + +int tile_profile_grouped_conv_bwd_data(int argc, char* argv[]) +{ + // 8 for control, 1 for num_dim_spatial + if(argc < 9) + { + print_helper_msg(); + return 1; + } + + const auto data_type = static_cast(std::stoi(argv[2])); + const auto layout = static_cast(std::stoi(argv[3])); + const bool do_verification = std::stoi(argv[4]); + const int init_method = std::stoi(argv[5]); + const bool do_log = std::stoi(argv[6]); + const bool time_kernel = std::stoi(argv[7]); + const int num_dim_spatial = std::stoi(argv[8]); + + // 8 for control, 1 for num_dim_spatial, 4 for G/N/K/C, and 6 * num_dim_spatial, 1 for split-K + if(argc != 8 + 1 + 4 + 6 * num_dim_spatial + 1) + { + print_helper_msg(); + return 1; + } + + const auto params = ck_tile::conv::parse_conv_param(num_dim_spatial, 9, argv); + + const auto& split_k = std::string(argv[8 + 1 + 4 + 6 * num_dim_spatial]); + + // using F32 = float; + // using F16 = ck_tile::half_t; + using BF16 = ck_tile::bfloat16_t; + // using F8 = ck_tile::fp8_t; + // using BF8 = ck_tile::bf8_t; + + using NHWGC = ck_tile::tensor_layout::convolution::NHWGC; + // using NDHWGC = ck_tile::tensor_layout::convolution::NDHWGC; + + using GKYXC = ck_tile::tensor_layout::convolution::GKYXC; + // using GKZYXC = ck_tile::tensor_layout::convolution::GKZYXC; + + using NHWGK = ck_tile::tensor_layout::convolution::NHWGK; + // using NDHWGK = ck_tile::tensor_layout::convolution::NDHWGK; + + constexpr auto I2 = ck_tile::number<2>{}; + //constexpr auto I3 = ck_tile::number<3>{}; + + auto profile = [&](auto num_dim_spatial_tmp, + auto in_layout, + auto wei_layout, + auto out_layout, + auto in_type, + auto wei_type, + auto out_type, + auto compute_type_a, + auto compute_type_b) { + constexpr ck_tile::index_t NDimSpatial = num_dim_spatial_tmp.value; + + using InLayout = decltype(in_layout); + using WeiLayout = decltype(wei_layout); + using OutLayout = decltype(out_layout); + + using InDataType = decltype(in_type); + using WeiDataType = decltype(wei_type); + using OutDataType = decltype(out_type); + + using ComputeTypeA = decltype(compute_type_a); + using ComputeTypeB = decltype(compute_type_b); + + bool pass = ck_tile::profiler::profile_grouped_conv_bwd_data_impl( + do_verification, init_method, do_log, time_kernel, params, split_k); + + return pass ? 0 : 1; + }; + + if(num_dim_spatial == 2 && layout == ConvLayout::NHWGC_GKYXC_NHWGK) + { + // if(data_type == ConvDataType::F32_F32_F32) + // { + // return profile(I2, NHWGC{}, GKYXC{}, NHWGK{}, F32{}, F32{}, F32{}, F32{}, F32{}); + // } + // if(data_type == ConvDataType::F16_F16_F16) + // { + // return profile(I2, NHWGC{}, GKYXC{}, NHWGK{}, F16{}, F16{}, F16{}, F16{}, F16{}); + // } + // if(data_type == ConvDataType::BF16_F32_BF16) + // { + // // fp32 atomic add is used for weight tensor in bf16 kernel + // return profile(I2, NHWGC{}, GKYXC{}, NHWGK{}, BF16{}, F32{}, BF16{}, BF16{}, BF16{}); + // } + if(data_type == ConvDataType::BF16_BF16_BF16) + { + return profile(I2, NHWGC{}, GKYXC{}, NHWGK{}, BF16{}, BF16{}, BF16{}, BF16{}, BF16{}); + } + } + + // if(num_dim_spatial == 3 && layout == ConvLayout::NHWGC_GKYXC_NHWGK) + // { + // if(data_type == ConvDataType::F32_F32_F32) + // { + // return profile(I3, NDHWGC{}, GKZYXC{}, NDHWGK{}, F32{}, F32{}, F32{}, F32{}, F32{}); + // } + // if(data_type == ConvDataType::F16_F16_F16) + // { + // return profile(I3, NDHWGC{}, GKZYXC{}, NDHWGK{}, F16{}, F16{}, F16{}, F16{}, F16{}); + // } + // if(data_type == ConvDataType::BF16_F32_BF16) + // { + // // fp32 atomic add is used for weight tensor in bf16 kernel + // return profile(I3, NDHWGC{}, GKZYXC{}, NDHWGK{}, BF16{}, F32{}, BF16{}, BF16{}, BF16{}); + // } + // if(data_type == ConvDataType::BF16_BF16_BF16) + // { + // return profile( + // I3, NDHWGC{}, GKZYXC{}, NDHWGK{}, BF16{}, BF16{}, BF16{}, BF16{}, BF16{}); + // } + // if(data_type == ConvDataType::F16_F16_F16_BF8_F8) + // { + // return profile(I3, NDHWGC{}, GKZYXC{}, NDHWGK{}, F16{}, F16{}, F16{}, BF8{}, F8{}); + // } + // else if(data_type == ConvDataType::I8_I8_I8) + // { + // return profile( + // I3, NDHWGC{}, GKZYXC{}, NDHWGK{}, int8_t{}, int8_t{}, int8_t{}, int8_t{}, int8_t{}); + // } + // } + + std::cout << "this data_type & layout is not implemented" << std::endl; + + return 1; +} + +REGISTER_PROFILER_OPERATION(OP_NAME, OP_DESC, tile_profile_grouped_conv_bwd_data); diff --git a/profiler/ck_tile/src/tile_profile_grouped_conv_fwd.cpp b/profiler/ck_tile/src/tile_profile_grouped_conv_fwd.cpp index 9458e3d677..bb447039ea 100644 --- a/profiler/ck_tile/src/tile_profile_grouped_conv_fwd.cpp +++ b/profiler/ck_tile/src/tile_profile_grouped_conv_fwd.cpp @@ -66,11 +66,10 @@ static void print_helper_msg() "G, K, Ho, Wo]\n" << " 3: Input[N, G, C, Hi, Wi], Weight[G, K, C, Y, X], Output[N, " "G, K, Ho, Wo])\n" - << "arg4: indexing data type (0: 32-bit, 1: 64-bit)\n" - << "arg5: verification (0: no, 1: yes)\n" - << "arg6: initialization (0: no init, 1: integer value, 2: decimal value)\n" - << "arg7: print tensor value (0: no; 1: yes)\n" - << "arg8: time kernel (0: no, 1: yes)\n" + << "arg4: verification (0: no, 1: yes)\n" + << "arg5: initialization (0: no init, 1: integer value, 2: decimal value)\n" + << "arg6: print tensor value (0: no; 1: yes)\n" + << "arg7: time kernel (0: no, 1: yes)\n" << conv_param_parser_helper_msg << std::endl; // clang-format on } @@ -80,7 +79,7 @@ static void print_helper_msg() int tile_profile_grouped_conv_fwd(int argc, char* argv[]) { // 8 for control, 1 for num_dim_spatial - if(argc < 10) + if(argc < 9) { print_helper_msg(); return 1; @@ -88,20 +87,21 @@ int tile_profile_grouped_conv_fwd(int argc, char* argv[]) const auto data_type = static_cast(std::stoi(argv[2])); const auto layout = static_cast(std::stoi(argv[3])); - const bool do_verification = std::stoi(argv[5]); - const int init_method = std::stoi(argv[6]); - const bool do_log = std::stoi(argv[7]); - const bool time_kernel = std::stoi(argv[8]); - const int num_dim_spatial = std::stoi(argv[9]); + const bool do_verification = std::stoi(argv[4]); + const int init_method = std::stoi(argv[5]); + const bool do_log = std::stoi(argv[6]); + const bool time_kernel = std::stoi(argv[7]); + const int num_dim_spatial = std::stoi(argv[8]); // 9 for control, 1 for num_dim_spatial, 4 for G/N/K/C, and 6 * num_dim_spatial - if(argc != 9 + 1 + 4 + 6 * num_dim_spatial) + if(argc != 8 + 1 + 4 + 6 * num_dim_spatial + 1) { + std::cout << argc << std::endl; print_helper_msg(); return 1; } - const auto params = ck_tile::conv::parse_conv_param(num_dim_spatial, 10, argv); + const auto params = ck_tile::conv::parse_conv_param(num_dim_spatial, 9, argv); constexpr ck_tile::index_t k_batch = 1; using F32 = float; diff --git a/profiler/include/profiler/profile_grouped_conv_bwd_data_impl.hpp b/profiler/include/profiler/profile_grouped_conv_bwd_data_impl.hpp index 2369b2eac8..0dcbb36391 100644 --- a/profiler/include/profiler/profile_grouped_conv_bwd_data_impl.hpp +++ b/profiler/include/profiler/profile_grouped_conv_bwd_data_impl.hpp @@ -36,7 +36,7 @@ bool profile_grouped_conv_bwd_data_impl(int do_verification, bool do_log, bool time_kernel, const ck::utils::conv::ConvParam& conv_param, - ck::index_t split_k = 1, + ck::index_t split_k = 0, index_t instance_index = -1) { using OutElementOp = ck::tensor_operation::element_wise::PassThrough; @@ -316,6 +316,23 @@ bool profile_grouped_conv_bwd_data_impl(int do_verification, << "\navg_time: " << best_avg_time << "\ntflops: " << best_tflops << "\nGB/s: " << best_gb_per_sec << ", SplitK " << best_split_k << std::endl; + const char* log_file = std::getenv("CK_PROFILER_LOG_FILE"); + if(log_file != nullptr) + { + std::ofstream out_stream(log_file, std::ios::app); + if(out_stream.is_open()) + { + std::stringstream out_ss; + out_ss << "CK best configuration:" << std::endl + << "name: " << best_op_name << std::endl + << "avg_time: " << best_avg_time << std::endl + << "SplitK: " << best_split_k << std::endl; + out_stream << out_ss.str(); + out_stream.close(); + } + } + + if(instance_index != -1) { std::cout << "grouped_conv_bwd_data_instance (" << instance_index << "/" << num_kernel diff --git a/profiler/include/profiler/profile_grouped_conv_fwd_impl.hpp b/profiler/include/profiler/profile_grouped_conv_fwd_impl.hpp index d490cf4167..35424b2f28 100644 --- a/profiler/include/profiler/profile_grouped_conv_fwd_impl.hpp +++ b/profiler/include/profiler/profile_grouped_conv_fwd_impl.hpp @@ -210,6 +210,8 @@ bool profile_grouped_conv_fwd_impl(int do_verification, { std::cout << op_ptr->GetTypeString() << " does not support this problem" << std::endl; } + + }; using DeviceOp = ck::tensor_operation::device::DeviceGroupedConvFwdMultipleABD( - do_verification, init_method, do_log, time_kernel, params, split_k); + do_verification, init_method, do_log, time_kernel, params); return pass ? 0 : 1; }; diff --git a/profiler/src/profile_grouped_conv_fwd.cpp b/profiler/src/profile_grouped_conv_fwd.cpp index 13f5cd1cda..d933792ee4 100644 --- a/profiler/src/profile_grouped_conv_fwd.cpp +++ b/profiler/src/profile_grouped_conv_fwd.cpp @@ -61,11 +61,10 @@ static void print_helper_msg() "G, K, Ho, Wo]\n" << " 3: Input[N, G, C, Hi, Wi], Weight[G, K, C, Y, X], Output[N, " "G, K, Ho, Wo])\n" - << "arg4: indexing data type (0: 32-bit, 1: 64-bit)\n" - << "arg5: verification (0: no, 1: yes)\n" - << "arg6: initialization (0: no init, 1: integer value, 2: decimal value)\n" - << "arg7: print tensor value (0: no; 1: yes)\n" - << "arg8: time kernel (0: no, 1: yes)\n" + << "arg4: verification (0: no, 1: yes)\n" + << "arg5: initialization (0: no init, 1: integer value, 2: decimal value)\n" + << "arg6: print tensor value (0: no; 1: yes)\n" + << "arg7: time kernel (0: no, 1: yes)\n" << ck::utils::conv::get_conv_param_parser_helper_msg() << std::endl; // clang-format on } @@ -75,7 +74,7 @@ static void print_helper_msg() int profile_grouped_conv_fwd(int argc, char* argv[]) { // 8 for control, 1 for num_dim_spatial - if(argc < 10) + if(argc < 9) { print_helper_msg(); return 1; @@ -83,21 +82,21 @@ int profile_grouped_conv_fwd(int argc, char* argv[]) const auto data_type = static_cast(std::stoi(argv[2])); const auto layout = static_cast(std::stoi(argv[3])); - const auto index_type = static_cast(std::stoi(argv[4])); - const bool do_verification = std::stoi(argv[5]); - const int init_method = std::stoi(argv[6]); - const bool do_log = std::stoi(argv[7]); - const bool time_kernel = std::stoi(argv[8]); - const int num_dim_spatial = std::stoi(argv[9]); + const auto index_type = IndexType::INDEX_T; + const bool do_verification = std::stoi(argv[4]); + const int init_method = std::stoi(argv[5]); + const bool do_log = std::stoi(argv[6]); + const bool time_kernel = std::stoi(argv[7]); + const int num_dim_spatial = std::stoi(argv[8]); // 9 for control, 1 for num_dim_spatial, 4 for G/N/K/C, and 6 * num_dim_spatial - if(argc != 9 + 1 + 4 + 6 * num_dim_spatial) + if(argc != 8 + 1 + 4 + 6 * num_dim_spatial + 1) { print_helper_msg(); return 1; } - const auto params = ck::utils::conv::parse_conv_param(num_dim_spatial, 10, argv); + const auto params = ck::utils::conv::parse_conv_param(num_dim_spatial, 9, argv); using F32 = float; using F16 = ck::half_t; diff --git a/script/benchmark_ck_vs_ck_tile.py b/script/benchmark_ck_vs_ck_tile.py index 1963de22de..1e44b404be 100755 --- a/script/benchmark_ck_vs_ck_tile.py +++ b/script/benchmark_ck_vs_ck_tile.py @@ -9,6 +9,8 @@ import matplotlib.pyplot as plt plt.switch_backend('Agg') import numpy as np +import xlsxwriter + def parse_cli_args(): """Parse command line arguments""" parser = argparse.ArgumentParser(description="Run CK and CK Tile convolution profilers.") @@ -45,7 +47,7 @@ def run_ck_profiler_cmd(cmd_args, profiler_type, bin_path, results_file, log_to_ subprocess.run(cmd) else: with open(os.devnull, 'w') as devnull: - timeoutInSec = 15 * 60 # 15 minutes timeout + timeoutInSec = 300 * 60 # 300 minutes timeout try: subprocess.run(cmd, stdout=devnull, stderr=devnull, timeout=timeoutInSec, env=env) except subprocess.TimeoutExpired: @@ -77,8 +79,8 @@ def run_analysis(results_file): while i < len(lines): line = lines[i].strip() - # Look for grouped_conv_bwd_weight command lines - if line.startswith('grouped_conv_bwd_weight'): + # Look for grouped_conv_* command lines + if line.startswith('grouped_conv_'): current_case = {'command': line} i += 1 @@ -97,6 +99,9 @@ def run_analysis(results_file): if i < len(lines) and lines[i].strip().startswith('SplitK:'): current_case['ck_tile_splitk'] = lines[i].strip().replace('SplitK:', '').strip() i += 1 + if i < len(lines) and lines[i].strip().startswith('all_pass'): + current_case['ck_tile_all_pass'] = lines[i].strip().replace('all_pass', '').strip() + i += 1 # Parse CK results while i < len(lines) and not lines[i].strip().startswith('CK best configuration:'): @@ -129,25 +134,82 @@ def run_analysis(results_file): ck_times = [] ck_tile_times = [] case_labels = [] - + + workbook = xlsxwriter.Workbook('conv_perf.xlsx') + worksheet = workbook.add_worksheet() + + header_format = workbook.add_format() + header_format.set_bold() + + offset = 4 + + worksheet.write(offset, 0, "command", header_format) + worksheet.set_column(0, 0, 66) + worksheet.write(offset, 1, "CK Time", header_format) + worksheet.set_column(1, 1, 11) + worksheet.write(offset, 2, "CK Tile Time", header_format) + worksheet.set_column(2, 2, 11) + worksheet.write(offset, 3, "CK / CK Tile", header_format) + worksheet.set_column(3, 3, 11) + worksheet.write(offset, 4, "All pass", header_format) + worksheet.set_column(4, 4, 11) + worksheet.write(offset, 5, "CK best kernel", header_format) + worksheet.set_column(5, 5, 25) + worksheet.write(offset, 6, "CK tile best kernel", header_format) + worksheet.set_column(6, 6, 25) + + offset += 1 + + num_of_ck_tile_slower = 0 + for i, case in enumerate(test_cases): + worksheet.write(i + offset, 0, case['command']) + worksheet.write(i + offset, 1, case['ck_time']) + worksheet.write(i + offset, 2, case['ck_tile_time']) + + format = workbook.add_format() + ratio = case['ck_time'] / case['ck_tile_time'] + + if ratio < 1.0: + format.set_bg_color('red') + num_of_ck_tile_slower += 1 + else: + format.set_bg_color('green') + + all_pass = case['ck_tile_all_pass'] + + worksheet.write(i + offset, 3, ratio, format) + + format2 = workbook.add_format() + format2.set_bg_color('green' if all_pass == "true" else 'red') + worksheet.write(i + offset, 4, all_pass, format2) + worksheet.write(i + offset, 5, case['ck_name']) + worksheet.write(i + offset, 6, case['ck_tile_name']) + ck_time = case['ck_time'] ck_tile_time = case['ck_tile_time'] # Performance ratio: CK_time / CK_Tile_time * 100% # >100% means CK Tile is faster, <100% means CK is faster - ratio = (ck_time / ck_tile_time) * 100 - performance_ratios.append(ratio) - ck_times.append(ck_time) - ck_tile_times.append(ck_tile_time) + # ratio = (ck_time / ck_tile_time) * 100 + # performance_ratios.append(ratio) + # ck_times.append(ck_time) + # ck_tile_times.append(ck_tile_time) - # Create a short label for the test case - cmd_parts = case['command'].split() - if len(cmd_parts) >= 8: - label = f"G{cmd_parts[8]}_N{cmd_parts[9]}_K{cmd_parts[10]}_C{cmd_parts[11]}" - else: - label = f"Case_{i+1}" - case_labels.append(label) + # # Create a short label for the test case + # cmd_parts = case['command'].split() + # if len(cmd_parts) >= 8: + # label = f"G{cmd_parts[8]}_N{cmd_parts[9]}_K{cmd_parts[10]}_C{cmd_parts[11]}" + # else: + # label = f"Case_{i+1}" + # case_labels.append(label) + + worksheet.write(0, 0, f"all cases: {len(test_cases)}") + worksheet.write(1, 0, f"ck tile slower: {num_of_ck_tile_slower}") + worksheet.write(2, 0, f"ck tile slower: {(num_of_ck_tile_slower / len(test_cases) * 100):2.1f}%") + + workbook.close() + return max_cases_to_detailed_plot = 10 diff --git a/script/convert_miopen_driver_commands.py b/script/convert_miopen_driver_commands.py index 4c32fd90dc..df539d8abe 100755 --- a/script/convert_miopen_driver_commands.py +++ b/script/convert_miopen_driver_commands.py @@ -33,7 +33,9 @@ def parse_miopen_command(miopen_cmd): def determine_operation_type(params): """Determine the operation type based on MIOpen parameters""" # TODO: Current data is for bwd weight. - return "grouped_conv_bwd_weight" + return "grouped_conv_bwd_data" + #return "grouped_conv_bwd_weight" + #return "grouped_conv_fwd"#"grouped_conv_bwd_weight" def convert_miopen_to_ck_profiler(miopen_cmd): """Convert MIOpen driver command to CK profiler command""" @@ -44,8 +46,8 @@ def convert_miopen_to_ck_profiler(miopen_cmd): # Determine operation type operation = determine_operation_type(params) - data_type = 5 # BF16 - layout = 2 # channels last + data_type = 2 #2 for bwd data 2 FOR FWD 5 FOR BWD WEI # BF16 + layout = 1 #1 FIR BWD DATA 1 FOR FWD 2 FOR BWE WEI # channels last verification = 1 # with verification init_method = 2 # uniform data print_output = 0 # no print output @@ -172,11 +174,14 @@ def parse_arguments(): def main(): args = parse_arguments() + print(args.input_file) # Check if input file exists if not os.path.exists(args.input_file): print(f"Error: Input file '{args.input_file}' does not exist") sys.exit(1) + + print(args.input_file) # Generate output filename if not provided output_file = args.output if args.output else generate_output_filename(args.input_file) diff --git a/script/convert_old_ck_conv_bwd_data_to_ck_tile.py b/script/convert_old_ck_conv_bwd_data_to_ck_tile.py new file mode 100644 index 0000000000..32a62a8812 --- /dev/null +++ b/script/convert_old_ck_conv_bwd_data_to_ck_tile.py @@ -0,0 +1,128 @@ +import re + +def extract_template_parameters(template_str): + # Extract everything inside the outermost <> + match = re.search(r"<(.*)>", template_str, re.DOTALL) + if not match: + return [] + + inside = match.group(1).strip() + + params = [] + current = [] + depth = 0 # track nested < > + + for char in inside: + if char == '<': + depth += 1 + current.append(char) + elif char == '>': + depth -= 1 + current.append(char) + elif char == ',' and depth == 0: + param = ''.join(current).strip() + if param: + params.append(param) + current = [] + else: + current.append(char) + + # Append last parameter if any + if current: + params.append(''.join(current).strip()) + + return params + + +input_path = "inputkernel.txt" +output_path = "outputkernel_bwd_data.txt" + +with open(input_path, 'r') as f: + lines = f.readlines() + +for line in lines: + + # Example usage + #input_str = " DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 4>, 8>" + + params = extract_template_parameters(line) + + NDimSpatial = params[0] + ALayout = params[1] + BLayout = params[2] + DsLayout = params[3] + ELayout = params[4] + + ADataType = params[5] + BDataType = params[6] + AccDataType = params[7] + CshuffleDataType= params[8] + DsDataTypes = params[9] + EDataType = params[10] + + AElementwiseOp = params[11] + BElementwiseOp = params[12] + CElementwiseOp = "PassThrough"#params[13] + ConvFwdSpec = params[14] + + DoPadGemmM = params[15] + DoPadGemmN = params[16] + NumGemmK = params[17] + + BlockSize = params[18] + MPerBlock = params[19] + NPerBlock = params[20] + KPerBlock = params[21] + AK1 = params[22] + BK1 = params[23] + MPerXDL = params[24] + NPerXDL = params[25] + MXdlPerWave = params[26] + NXdlPerWave = params[27] + ABlockTransferClusterLengths = params[28] + ABlockTransferArrangeOrder = params[29] + ABlockTransferSrcAccessOrder = params[30] + ABlockTransferSrcVectorDim = params[31] + ABlockTransferSrcScalarPerVector = params[32] + ABlockTransferDstScalarPerVector_K1 = params[33] + ABlockLdsAddExtraM = params[34] + BBlockTransferClusterLengths = params[35] + BBlockTransferArrangeOrder = params[36] + BBlockTransferSrcVectorDim = params[37] + BBlockTransferSrcAccessOrder = params[38] + BBlockTransferSrcScalarPerVector = params[39] + BBlockTransferDstScalarPerVector_K1 = params[40] + BBlockLdsAddExtraM = params[41] + CShuffleMXdlPerwave = params[42] + CShuffleNXdlPerWavePerShuffle = params[43] + CBlockTransferClusterLengths = params[44] + CBlockTransferScalarPerVector = params[45] + + + KBlockPerCu = 1 + MWarp = int(MPerBlock) // (int(MPerXDL) * int(MXdlPerWave)) + NWarp = int(NPerBlock) // (int(NPerXDL) * int(NXdlPerWave)) + KWarp = 1 + KPerXdl = 16 if MPerXDL == "32" else 32 + DoubleSMemBuffer = 'false' + GemmPipelineVersion = "CK_TILE_PIPELINE_COMPUTE_V3" + + print(MPerBlock, NPerBlock, KPerBlock) + + pipelines = ["CK_TILE_PIPELINE_MEMORY", "CK_TILE_PIPELINE_COMPUTE_V3", "CK_TILE_PIPELINE_COMPUTE_V4"] + + for pipeline in pipelines: + DoubleSMemBuffer = 'false' if pipeline != 'CK_TILE_PIPELINE_COMPUTE_V4' else 'true' + with open(output_path, 'a') as f: + f.write(f'GroupedConvolutionBackwardDataInvoker<{NDimSpatial}, {ALayout}, {BLayout}, {ELayout}, {ADataType},' + f'{BDataType}, {EDataType}, {AElementwiseOp}, {BElementwiseOp}, {CElementwiseOp},' + f'{KBlockPerCu}, {MPerBlock}, {NPerBlock}, {KPerBlock}, {MWarp}, {NWarp}, {KWarp},' + f'{MPerXDL}, {NPerXDL}, {KPerXdl}, {ABlockTransferSrcScalarPerVector}, {BBlockTransferSrcScalarPerVector},' + f'{CBlockTransferScalarPerVector}, {DoubleSMemBuffer}, {pipeline}>,\n') + + +# print(params[0]) + +# # Print each parameter as a separate variable +# for i, p in enumerate(params, start=1): +# print(f"param_{i} = '{p}'") \ No newline at end of file diff --git a/script/convert_old_ck_conv_bwd_wei_to_ck_tile.py b/script/convert_old_ck_conv_bwd_wei_to_ck_tile.py new file mode 100644 index 0000000000..b8b2f0832c --- /dev/null +++ b/script/convert_old_ck_conv_bwd_wei_to_ck_tile.py @@ -0,0 +1,120 @@ +import re + +def extract_template_parameters(template_str): + # Extract everything inside the outermost <> + match = re.search(r"<(.*)>", template_str, re.DOTALL) + if not match: + return [] + + inside = match.group(1).strip() + + params = [] + current = [] + depth = 0 # track nested < > + + for char in inside: + if char == '<': + depth += 1 + current.append(char) + elif char == '>': + depth -= 1 + current.append(char) + elif char == ',' and depth == 0: + param = ''.join(current).strip() + if param: + params.append(param) + current = [] + else: + current.append(char) + + # Append last parameter if any + if current: + params.append(''.join(current).strip()) + + return params + + +input_path = "inputkernel.txt" +output_path = "outputkernel_bwd_wei.txt" + +with open(input_path, 'r') as f: + lines = f.readlines() + +for line in lines: + + # Example usage + #input_str = " DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 4>, 8>" + + params = extract_template_parameters(line) + + NDimSpatial = params[0] + ALayout = params[1] + BLayout = params[2] + ELayout = params[3] + + ADataType = params[4] + BDataType = params[5] + EDataType = params[6] + AccDataType = params[7] + + AElementwiseOp = params[8] + BElementwiseOp = params[9] + CElementwiseOp = "PassThrough"#params[13] + ConvFwdSpec = params[11] + + BlockSize = params[12] + MPerBlock = params[13] + NPerBlock = params[14] + KPerBlock = params[15] + K1 = params[16] + MPerXDL = params[17] + NPerXDL = params[18] + MXdlPerWave = params[19] + NXdlPerWave = params[20] + ABlockTransferClusterLengths = params[21] + ABlockTransferArrangeOrder = params[22] + ABlockTransferSrcAccessOrder = params[23] + ABlockTransferSrcVectorDim = params[24] + ABlockTransferSrcScalarPerVector = params[25] + ABlockTransferDstScalarPerVector_K1 = params[26] + ABlockLdsAddExtraM = params[27] + BBlockTransferClusterLengths = params[28] + BBlockTransferArrangeOrder = params[29] + BBlockTransferSrcVectorDim = params[30] + BBlockTransferSrcAccessOrder = params[31] + BBlockTransferSrcScalarPerVector = params[32] + BBlockTransferDstScalarPerVector_K1 = params[33] + BBlockLdsAddExtraM = params[34] + CShuffleMXdlPerwave = params[35] + CShuffleNXdlPerWavePerShuffle = params[36] + CBlockTransferClusterLengths = params[37] + CBlockTransferScalarPerVector = params[38] + + + KBlockPerCu = 1 + MWarp = int(MPerBlock) // (int(MPerXDL) * int(MXdlPerWave)) + NWarp = int(NPerBlock) // (int(NPerXDL) * int(NXdlPerWave)) + KWarp = 1 + KPerXdl = 16 if MPerXDL == "32" else 32 + DoubleSMemBuffer = 'false' + GemmPipelineVersion = "CK_TILE_PIPELINE_COMPUTE_V3" + + print(MPerBlock, NPerBlock, KPerBlock) + + pipelines = ["CK_TILE_PIPELINE_MEMORY", "CK_TILE_PIPELINE_COMPUTE_V3", "CK_TILE_PIPELINE_COMPUTE_V4"] + + for pipeline in pipelines: + DoubleSMemBuffer = 'false' if pipeline != 'CK_TILE_PIPELINE_COMPUTE_V4' else 'true' + with open(output_path, 'a') as f: + f.write(f'GroupedConvolutionBackwardWeightInvoker<{NDimSpatial}, {ALayout}, {BLayout}, {ELayout}, {ADataType},' + f'{BDataType}, {EDataType}, {AElementwiseOp}, {BElementwiseOp}, {CElementwiseOp},' + f'{KBlockPerCu}, {MPerBlock}, {NPerBlock}, {KPerBlock}, {MWarp}, {NWarp}, {KWarp},' + f'{MPerXDL}, {NPerXDL}, {KPerXdl}, {ABlockTransferSrcScalarPerVector}, {BBlockTransferSrcScalarPerVector},' + f'{CBlockTransferScalarPerVector}, {DoubleSMemBuffer}, {pipeline}>,\n') + + +# print(params[0]) + +# # Print each parameter as a separate variable +# for i, p in enumerate(params, start=1): +# print(f"param_{i} = '{p}'") \ No newline at end of file diff --git a/script/convert_old_ck_conv_fwd_to_ck_tile.py b/script/convert_old_ck_conv_fwd_to_ck_tile.py new file mode 100644 index 0000000000..f6ffef7936 --- /dev/null +++ b/script/convert_old_ck_conv_fwd_to_ck_tile.py @@ -0,0 +1,121 @@ +import re + +def extract_template_parameters(template_str): + # Extract everything inside the outermost <> + match = re.search(r"<(.*)>", template_str, re.DOTALL) + if not match: + return [] + + inside = match.group(1).strip() + + params = [] + current = [] + depth = 0 # track nested < > + + for char in inside: + if char == '<': + depth += 1 + current.append(char) + elif char == '>': + depth -= 1 + current.append(char) + elif char == ',' and depth == 0: + param = ''.join(current).strip() + if param: + params.append(param) + current = [] + else: + current.append(char) + + # Append last parameter if any + if current: + params.append(''.join(current).strip()) + + return params + + +input_path = "inputkernel.txt" +output_path = "outputkernel.txt" + +with open(input_path, 'r') as f: + lines = f.readlines() + +for line in lines: + + # Example usage + #input_str = " DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 4>, 8>" + + params = extract_template_parameters(line) + + NDimSpatial = params[0] + ALayout = params[1] + BLayout = params[2] + DsLayout = params[3] + ELayout = params[4] + ADataType = params[5] + BDataType = params[6] + AccDataType = params[7] + CShuffleDataType= params[8] + DsDataTypes = params[9] + EDataType = params[10] + AElementwiseOp = params[11] + BElementwiseOp = params[12] + CElementwiseOp = "PassThrough"#params[13] + ConvFwdSpec = params[14] + GemmSpec = params[15] + NummGemmKPref = params[16] + BlockSize = params[17] + MPerBlock = params[18] + NPerBlock = params[19] + KPerBlock = params[20] + AK1 = params[21] + BK1 = params[22] + MPerXDL = params[23] + NPerXDL = params[24] + MXdlPerWave = params[25] + NXdlPerWave = params[26] + ABlockTransferClusterLengths = params[27] + ABlockTransferArrangeOrder = params[28] + ABlockTransferSrcAccessOrder = params[29] + ABlockTransferSrcVectorDim = params[30] + ABlockTransferSrcScalarPerVector = params[31] + ABlockTransferDstScalarPerVector_K1 = params[32] + ABlockLdsAddExtraM = params[33] + BBlockTransferClusterLengths = params[34] + BBlockTransferArrangeOrder = params[35] + BBlockTransferSrcVectorDim = params[36] + BBlockTransferSrcAccessOrder = params[37] + BBlockTransferSrcScalarPerVector = params[38] + BBlockTransferDstScalarPerVector_K1 = params[39] + BBlockLdsAddExtraM = params[40] + CShuffleMXdlPerwave = params[41] + CShuffleNXdlPerWavePerShuffle = params[42] + CBlockTransferClusterLengths = params[43] + CBlockTransferScalarPerVector = params[44] + + + KBlockPerCu = 1 + MWarp = int(MPerBlock) // (int(MPerXDL) * int(MXdlPerWave)) + NWarp = int(NPerBlock) // (int(NPerXDL) * int(NXdlPerWave)) + KWarp = 1 + KPerXdl = 16 if MPerXDL == "32" else 32 + DoubleSMemBuffer = 'false' + GemmPipelineVersion = "CK_TILE_PIPELINE_COMPUTE_V3" + + pipelines = ["CK_TILE_PIPELINE_MEMORY", "CK_TILE_PIPELINE_COMPUTE_V3", "CK_TILE_PIPELINE_COMPUTE_V4"] + + for pipeline in pipelines: + DoubleSMemBuffer = 'false' if pipeline != 'CK_TILE_PIPELINE_COMPUTE_V4' else 'true' + with open(output_path, 'a') as f: + f.write(f'GroupedConvolutionForwardInvoker<{NDimSpatial}, {ALayout}, {BLayout}, {ELayout}, {ADataType},' + f'{BDataType}, {EDataType}, {AElementwiseOp}, {BElementwiseOp}, {CElementwiseOp},' + f'{KBlockPerCu}, {MPerBlock}, {NPerBlock}, {KPerBlock}, {MWarp}, {NWarp}, {KWarp},' + f'{MPerXDL}, {NPerXDL}, {KPerXdl}, {ABlockTransferSrcScalarPerVector}, {BBlockTransferSrcScalarPerVector},' + f'{CBlockTransferScalarPerVector}, {DoubleSMemBuffer}, {pipeline}>,\n') + + +# print(params[0]) + +# # Print each parameter as a separate variable +# for i, p in enumerate(params, start=1): +# print(f"param_{i} = '{p}'") \ No newline at end of file