From fc6a9e3931e12aab7bfd29be6a52ca4424010f71 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ville=20Pietil=C3=A4?= Date: Mon, 13 Oct 2025 15:22:50 +0000 Subject: [PATCH] Create invoker for the kernel and a factory for creating invokers. --- ...ped_convolution_backward_weight_kernel.hpp | 20 --- ...le_grouped_convolution_backward_weight.hpp | 165 +++++++++++++++++- ...e_profile_grouped_conv_bwd_weight_impl.hpp | 10 +- 3 files changed, 164 insertions(+), 31 deletions(-) diff --git a/include/ck_tile/ops/grouped_convolution/kernel/grouped_convolution_backward_weight_kernel.hpp b/include/ck_tile/ops/grouped_convolution/kernel/grouped_convolution_backward_weight_kernel.hpp index 4fc4056f52..b85660aea3 100644 --- a/include/ck_tile/ops/grouped_convolution/kernel/grouped_convolution_backward_weight_kernel.hpp +++ b/include/ck_tile/ops/grouped_convolution/kernel/grouped_convolution_backward_weight_kernel.hpp @@ -294,26 +294,6 @@ struct GroupedConvBwdWeightKernelArgs long_index_t group_stride_c; }; -template -struct GroupedConvolutionBackwardWeightInvoker -{ - virtual bool IsSupportedArgument(const ck_tile::GroupedConvBwdWeightHostArgs& args) const = 0; - virtual float Run(const ck_tile::GroupedConvBwdWeightHostArgs& args, bool time_kernel) = 0; - virtual std::string GetName() const = 0; - virtual ~GroupedConvolutionBackwardWeightInvoker() = default; -}; - /// @brief The Grouped Convolution Backward Weight kernel template. /// /// @paragraph Overview Overview diff --git a/library/include/ck_tile/library/tensor_operation_instance/gpu/tile_grouped_convolution_backward_weight.hpp b/library/include/ck_tile/library/tensor_operation_instance/gpu/tile_grouped_convolution_backward_weight.hpp index b0b956e1dd..10e2086ff2 100644 --- a/library/include/ck_tile/library/tensor_operation_instance/gpu/tile_grouped_convolution_backward_weight.hpp +++ b/library/include/ck_tile/library/tensor_operation_instance/gpu/tile_grouped_convolution_backward_weight.hpp @@ -13,6 +13,11 @@ // #include "ck_tile/ops/common/element_wise_operation.hpp" #include "ck_tile/ops/grouped_convolution/kernel/grouped_convolution_backward_weight_kernel.hpp" +#include "ck_tile/core.hpp" +#include "ck_tile/host/kernel_launch.hpp" +#include "ck_tile/ops/epilogue.hpp" +#include "ck_tile/ops/gemm.hpp" +#include "ck_tile/ops/grouped_convolution.hpp" namespace ck_tile { namespace ops { @@ -20,6 +25,162 @@ namespace ops { template struct DeviceOperationInstanceFactory; +template +struct GroupedConvolutionBackwardWeightBaseInvoker +{ + virtual bool IsSupportedArgument(const ck_tile::GroupedConvBwdWeightHostArgs& args) const = 0; + virtual float Run(const ck_tile::GroupedConvBwdWeightHostArgs& args, bool time_kernel) = 0; + virtual std::string GetName() const = 0; + virtual ~GroupedConvolutionBackwardWeightBaseInvoker() = default; +}; + +template < + ck_tile::index_t NDimSpatial, + typename InLayout, + typename WeiLayout, + typename OutLayout, + typename InDataType, + typename WeiDataType, + typename OutDataType, + typename InElementwiseOperation, + typename WeiElementwiseOperation, + typename OutElementwiseOperation, + int kBlockPerCu, + ck_tile::index_t M_Tile, + ck_tile::index_t N_Tile, + ck_tile::index_t K_Tile, + ck_tile::index_t M_Warp, + ck_tile::index_t N_Warp, + ck_tile::index_t K_Warp, + ck_tile::index_t M_Warp_Tile, + ck_tile::index_t N_Warp_Tile, + ck_tile::index_t K_Warp_Tile, + ck_tile::index_t VectorSizeA, + ck_tile::index_t VectorSizeB, + ck_tile::index_t VectorSizeC, + bool UseSplitK> +struct GroupedConvolutionBackwardWeightInvoker : + public GroupedConvolutionBackwardWeightBaseInvoker +{ + using CodegenShape_ = + ck_tile::TileGemmShape, + ck_tile::sequence, + ck_tile::sequence>; + + static constexpr auto ConvSpec_ = ck_tile::ConvolutionSpecialization::Default; + + using TilePartitioner_ = ck_tile::GemmTile1DPartitioner; + using GroupedConvTraitsType_ = ck_tile::GroupedConvTraits; + + using AccDataType = float; + using DsDataType = OutDataType; + using CDEElementWise = ck_tile::element_wise::PassThrough; + + using CodegenPipelineProblem_ = ck_tile::GemmPipelineProblem< + InDataType, + WeiDataType, + AccDataType, + CodegenShape_, + typename GroupedConvTraitsType_::GroupedConvImplicitGemmTraitsBwdWeight, + ck_tile::element_wise::PassThrough, + ck_tile::element_wise::PassThrough, + InDataType, + true, + GroupedConvTraitsType_::VectorSizeA, + GroupedConvTraitsType_::VectorSizeB>; + + using CodegenPipeline_ = ck_tile::GemmPipelineAGmemBGmemCRegV1; + + using MemOp = std::conditional_t, + ck_tile::integral_constant>; + using ConvEpilogue_ = ck_tile::CShuffleEpilogue>; + + using Kernel = ck_tile::GroupedConvolutionBackwardWeightKernel; + + bool IsSupportedArgument(const ck_tile::GroupedConvBwdWeightHostArgs& args) const override + { + return Kernel::IsSupportedArgument(Kernel::MakeKernelArgs(args)); + }; + + float Run(const ck_tile::GroupedConvBwdWeightHostArgs& args, bool time_kernel) override + { + auto kargs = Kernel::MakeKernelArgs(args); + const dim3 grids = Kernel::GridSize(kargs); + const dim3 blocks = Kernel::BlockSize(); + + constexpr int n_warmup = 5; + constexpr int n_repeat = 50; + ck_tile::stream_config s {nullptr, time_kernel, 1, n_warmup, n_repeat}; + float avg_time = ck_tile::launch_kernel_time_mask( + s, + Kernel::Preprocess(kargs, s), + ck_tile::make_kernel(Kernel{}, grids, blocks, 0, kargs)); + + return avg_time; + }; + + std::string GetName() const override + { + return Kernel::GetName(); + }; + + ~GroupedConvolutionBackwardWeightInvoker() override = default; +}; + template -struct DeviceOperationInstanceFactory> { - using DeviceOp = GroupedConvolutionBackwardWeightInvokerGetName(); - // constexpr int kBlockPerCu = 1; - // constexpr int n_warmup = 5; - // constexpr int n_repeat = 50; - // ck_tile::stream_config s {nullptr, time_kernel, 1, n_warmup, n_repeat}; - // float avg_time = ck_tile::launch_kernel_time_mask( - // s, - // Kernel::Preprocess(kargs, s), - // ck_tile::make_kernel(*op, grids, blocks, 0, kargs)); float avg_time = op->Run(args, time_kernel); std::size_t flop = conv_param.GetFlops();