diff --git a/library/include/ck_tile/library/tensor_operation_instance/gpu/tile_grouped_conv_bwd_weight_factory.hpp b/library/include/ck_tile/library/tensor_operation_instance/gpu/tile_grouped_conv_bwd_weight_factory.hpp index 10e2086ff2..eb68513006 100644 --- a/library/include/ck_tile/library/tensor_operation_instance/gpu/tile_grouped_conv_bwd_weight_factory.hpp +++ b/library/include/ck_tile/library/tensor_operation_instance/gpu/tile_grouped_conv_bwd_weight_factory.hpp @@ -1,186 +1,22 @@ // SPDX-License-Identifier: MIT -// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved. #pragma once #include #include #include +#include -// Add these missing includes: -// #include "ck_tile/core/tensor_layout.hpp" -// #include "ck_tile/ops/common/tensor_layout.hpp" -// #include "ck_tile/ops/common/element_wise_operation.hpp" - -#include "ck_tile/ops/grouped_convolution/kernel/grouped_convolution_backward_weight_kernel.hpp" -#include "ck_tile/core.hpp" -#include "ck_tile/host/kernel_launch.hpp" -#include "ck_tile/ops/epilogue.hpp" -#include "ck_tile/ops/gemm.hpp" -#include "ck_tile/ops/grouped_convolution.hpp" +#include "ck_tile/library/tensor_operation_instance/gpu/tile_grouped_conv_instance_factory.hpp" +#include "ck_tile/library/tensor_operation_instance/gpu/tile_grouped_conv_bwd_weight_invoker.hpp" +#include "ck_tile/library/tensor_operation_instance/gpu/tile_grouped_conv_bwd_weight_instances.hpp" namespace ck_tile { namespace ops { template struct DeviceOperationInstanceFactory; - -template -struct GroupedConvolutionBackwardWeightBaseInvoker -{ - virtual bool IsSupportedArgument(const ck_tile::GroupedConvBwdWeightHostArgs& args) const = 0; - virtual float Run(const ck_tile::GroupedConvBwdWeightHostArgs& args, bool time_kernel) = 0; - virtual std::string GetName() const = 0; - virtual ~GroupedConvolutionBackwardWeightBaseInvoker() = default; -}; - -template < - ck_tile::index_t 
NDimSpatial, - typename InLayout, - typename WeiLayout, - typename OutLayout, - typename InDataType, - typename WeiDataType, - typename OutDataType, - typename InElementwiseOperation, - typename WeiElementwiseOperation, - typename OutElementwiseOperation, - int kBlockPerCu, - ck_tile::index_t M_Tile, - ck_tile::index_t N_Tile, - ck_tile::index_t K_Tile, - ck_tile::index_t M_Warp, - ck_tile::index_t N_Warp, - ck_tile::index_t K_Warp, - ck_tile::index_t M_Warp_Tile, - ck_tile::index_t N_Warp_Tile, - ck_tile::index_t K_Warp_Tile, - ck_tile::index_t VectorSizeA, - ck_tile::index_t VectorSizeB, - ck_tile::index_t VectorSizeC, - bool UseSplitK> -struct GroupedConvolutionBackwardWeightInvoker : - public GroupedConvolutionBackwardWeightBaseInvoker -{ - using CodegenShape_ = - ck_tile::TileGemmShape, - ck_tile::sequence, - ck_tile::sequence>; - - static constexpr auto ConvSpec_ = ck_tile::ConvolutionSpecialization::Default; - - using TilePartitioner_ = ck_tile::GemmTile1DPartitioner; - using GroupedConvTraitsType_ = ck_tile::GroupedConvTraits; - - using AccDataType = float; - using DsDataType = OutDataType; - using CDEElementWise = ck_tile::element_wise::PassThrough; - - using CodegenPipelineProblem_ = ck_tile::GemmPipelineProblem< - InDataType, - WeiDataType, - AccDataType, - CodegenShape_, - typename GroupedConvTraitsType_::GroupedConvImplicitGemmTraitsBwdWeight, - ck_tile::element_wise::PassThrough, - ck_tile::element_wise::PassThrough, - InDataType, - true, - GroupedConvTraitsType_::VectorSizeA, - GroupedConvTraitsType_::VectorSizeB>; - - using CodegenPipeline_ = ck_tile::GemmPipelineAGmemBGmemCRegV1; - - using MemOp = std::conditional_t, - ck_tile::integral_constant>; - using ConvEpilogue_ = ck_tile::CShuffleEpilogue>; - - using Kernel = ck_tile::GroupedConvolutionBackwardWeightKernel; - - bool IsSupportedArgument(const ck_tile::GroupedConvBwdWeightHostArgs& args) const override - { - return Kernel::IsSupportedArgument(Kernel::MakeKernelArgs(args)); - }; - - float 
Run(const ck_tile::GroupedConvBwdWeightHostArgs& args, bool time_kernel) override - { - auto kargs = Kernel::MakeKernelArgs(args); - const dim3 grids = Kernel::GridSize(kargs); - const dim3 blocks = Kernel::BlockSize(); - - constexpr int n_warmup = 5; - constexpr int n_repeat = 50; - ck_tile::stream_config s {nullptr, time_kernel, 1, n_warmup, n_repeat}; - float avg_time = ck_tile::launch_kernel_time_mask( - s, - Kernel::Preprocess(kargs, s), - ck_tile::make_kernel(Kernel{}, grids, blocks, 0, kargs)); - - return avg_time; - }; - - std::string GetName() const override - { - return Kernel::GetName(); - }; - - ~GroupedConvolutionBackwardWeightInvoker() override = default; -}; - template > { @@ -221,127 +57,37 @@ struct DeviceOperationInstanceFactory> op_ptrs; -// if constexpr(NumDimSpatial == 2) -// { -// if constexpr(is_same_v && is_same_v && -// is_same_v) -// { -// #ifdef CK_ENABLE_FP32 -// if constexpr(is_same_v && is_same_v && -// is_same_v && is_same_v && -// is_same_v) -// { -// } -// #endif -// #ifdef CK_ENABLE_FP16 -// if constexpr(is_same_v && is_same_v && -// is_same_v && is_same_v && -// is_same_v) -// { -// } -// #endif -// #ifdef CK_ENABLE_BF16 -// if constexpr(is_same_v && is_same_v && -// is_same_v && -// is_same_v && -// is_same_v) -// { -// } -// #endif -// } -// if constexpr(is_same_v && is_same_v && -// is_same_v) -// { -// #ifdef CK_ENABLE_FP32 -// if constexpr(is_same_v && is_same_v && -// is_same_v) -// { -// static_assert(is_same_v, -// "Error: ComputeTypeA and ComputeTypeB should be the same"); -// if constexpr(is_same_v) -// { - -// } -// } -// #endif -// #ifdef CK_ENABLE_FP16 -// if constexpr(is_same_v && is_same_v && -// is_same_v && is_same_v && -// is_same_v) -// { -// } -// #endif -// #ifdef CK_ENABLE_BF16 -// if constexpr(is_same_v && is_same_v && -// is_same_v && -// is_same_v && -// is_same_v) -// { -// } -// if constexpr(is_same_v && -// is_same_v && -// is_same_v && -// is_same_v && -// is_same_v) -// { -// } -// #endif -// } -// if 
constexpr(is_same_v && is_same_v && -// is_same_v) -// { -// #ifdef CK_ENABLE_FP16 -// if constexpr(is_same_v && is_same_v && -// is_same_v && is_same_v && -// is_same_v) -// { -// } -// #endif -// #ifdef CK_ENABLE_BF16 -// if constexpr(is_same_v && -// is_same_v && -// is_same_v && -// is_same_v && -// is_same_v) -// { -// } -// #endif -// #ifdef CK_ENABLE_FP32 -// if constexpr(is_same_v && is_same_v && -// is_same_v && is_same_v && -// is_same_v) -// { -// } -// #endif -// } -// if constexpr(is_same_v && is_same_v && -// is_same_v) -// { -// #ifdef CK_ENABLE_FP16 -// if constexpr(is_same_v && is_same_v && -// is_same_v && is_same_v && -// is_same_v) -// { -// } -// #endif -// #ifdef CK_ENABLE_BF16 -// if constexpr(is_same_v && -// is_same_v && -// is_same_v && -// is_same_v && -// is_same_v) -// { -// } -// #endif -// #ifdef CK_ENABLE_FP32 -// if constexpr(is_same_v && is_same_v && -// is_same_v && is_same_v && -// is_same_v) -// { -// } -// #endif -// } -// } + if constexpr(NumDimSpatial == 2) + { + if constexpr(std::is_same_v && std::is_same_v && + std::is_same_v) + { + if constexpr(std::is_same_v && + std::is_same_v && + std::is_same_v && + std::is_same_v && + std::is_same_v) + { + add_grouped_conv2d_bwd_weight_f32_instances(op_ptrs); + } + if constexpr(std::is_same_v && + std::is_same_v && + std::is_same_v && + std::is_same_v && + std::is_same_v) + { + add_grouped_conv2d_bwd_weight_f16_instances(op_ptrs); + } + if constexpr(std::is_same_v && + std::is_same_v && + std::is_same_v && + std::is_same_v && + std::is_same_v) + { + add_grouped_conv2d_bwd_weight_bf16_instances(op_ptrs); + } + } + } return op_ptrs; } diff --git a/library/include/ck_tile/library/tensor_operation_instance/gpu/tile_grouped_conv_bwd_weight_instances.hpp b/library/include/ck_tile/library/tensor_operation_instance/gpu/tile_grouped_conv_bwd_weight_instances.hpp new file mode 100644 index 0000000000..1d11285fd7 --- /dev/null +++ 
b/library/include/ck_tile/library/tensor_operation_instance/gpu/tile_grouped_conv_bwd_weight_instances.hpp @@ -0,0 +1,78 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +#include +#include + +#include "ck_tile/library/tensor_operation_instance/gpu/tile_grouped_conv_instance_factory.hpp" +#include "ck_tile/library/tensor_operation_instance/gpu/tile_grouped_conv_bwd_weight_invoker.hpp" + +namespace ck_tile { +namespace ops { + +using BF16 = ck_tile::bfloat16_t; +using F16 = ck_tile::half_t; +using F32 = float; + +using DeviceOp2DF32 = GroupedConvolutionBackwardWeightBaseInvoker<2, + NHWGC, + GKYXC, + NHWGK, + float, + float, + float, + PassThrough, + PassThrough, + PassThrough, + float, + float>; + +using DeviceOp2DF16 = GroupedConvolutionBackwardWeightBaseInvoker<2, + NHWGC, + GKYXC, + NHWGK, + F16, + F16, + F16, + PassThrough, + PassThrough, + PassThrough, + F16, + F16>; + +using DeviceOp2DBF16 = GroupedConvolutionBackwardWeightBaseInvoker<2, + NHWGC, + GKYXC, + NHWGK, + BF16, + BF16, + BF16, + PassThrough, + PassThrough, + PassThrough, + BF16, + BF16>; + +// Registration hooks for the 2D backward-weight instances, one per data type. +// They are defined in this header, so they are marked inline to avoid +// multiple-definition link errors when several translation units include it. +// Each is currently a placeholder that leaves `instances` untouched. +inline void add_grouped_conv2d_bwd_weight_f32_instances(std::vector>& instances) +{ + (void)instances; +} + +inline void add_grouped_conv2d_bwd_weight_f16_instances(std::vector>& instances) +{ + (void)instances; +} + +inline void add_grouped_conv2d_bwd_weight_bf16_instances(std::vector>& instances) +{ + (void)instances; +} + +} // namespace ops +} // namespace ck_tile diff --git a/library/include/ck_tile/library/tensor_operation_instance/gpu/tile_grouped_conv_bwd_weight_invoker.hpp b/library/include/ck_tile/library/tensor_operation_instance/gpu/tile_grouped_conv_bwd_weight_invoker.hpp new file mode 100644 index 0000000000..5072e153ec --- /dev/null +++ b/library/include/ck_tile/library/tensor_operation_instance/gpu/tile_grouped_conv_bwd_weight_invoker.hpp @@ -0,0 +1,178 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+ +#pragma once + +#include +#include +#include + +//#include "ck_tile/ops/grouped_convolution/kernel/grouped_convolution_backward_weight_kernel.hpp" +#include "ck_tile/core.hpp" +#include "ck_tile/host/kernel_launch.hpp" +#include "ck_tile/ops/epilogue.hpp" +#include "ck_tile/ops/gemm.hpp" +#include "ck_tile/ops/grouped_convolution.hpp" +//#include "ck_tile/library/tensor_operation_instance/gpu/tile_grouped_conv_bwd_weight_instances.hpp" + +namespace ck_tile { +namespace ops { + +template +struct GroupedConvolutionBackwardWeightBaseInvoker +{ + virtual bool IsSupportedArgument(const ck_tile::GroupedConvBwdWeightHostArgs& args) const = 0; + virtual float Run(const ck_tile::GroupedConvBwdWeightHostArgs& args, bool time_kernel) = 0; + virtual std::string GetName() const = 0; + virtual ~GroupedConvolutionBackwardWeightBaseInvoker() = default; +}; + +template < + ck_tile::index_t NDimSpatial, + typename InLayout, + typename WeiLayout, + typename OutLayout, + typename InDataType, + typename WeiDataType, + typename OutDataType, + typename InElementwiseOperation, + typename WeiElementwiseOperation, + typename OutElementwiseOperation, + int kBlockPerCu, + ck_tile::index_t M_Tile, + ck_tile::index_t N_Tile, + ck_tile::index_t K_Tile, + ck_tile::index_t M_Warp, + ck_tile::index_t N_Warp, + ck_tile::index_t K_Warp, + ck_tile::index_t M_Warp_Tile, + ck_tile::index_t N_Warp_Tile, + ck_tile::index_t K_Warp_Tile, + ck_tile::index_t VectorSizeA, + ck_tile::index_t VectorSizeB, + ck_tile::index_t VectorSizeC, + bool UseSplitK> +struct GroupedConvolutionBackwardWeightInvoker : + public GroupedConvolutionBackwardWeightBaseInvoker +{ + using CodegenShape_ = + ck_tile::TileGemmShape, + ck_tile::sequence, + ck_tile::sequence>; + + static constexpr auto ConvSpec_ = ck_tile::ConvolutionSpecialization::Default; + + using TilePartitioner_ = ck_tile::GemmTile1DPartitioner; + using GroupedConvTraitsType_ = ck_tile::GroupedConvTraits; + + using AccDataType = float; + using DsDataType = 
OutDataType; + using CDEElementWise = ck_tile::element_wise::PassThrough; + + using CodegenPipelineProblem_ = ck_tile::GemmPipelineProblem< + InDataType, + WeiDataType, + AccDataType, + CodegenShape_, + typename GroupedConvTraitsType_::GroupedConvImplicitGemmTraitsBwdWeight, + ck_tile::element_wise::PassThrough, + ck_tile::element_wise::PassThrough, + InDataType, + true, + GroupedConvTraitsType_::VectorSizeA, + GroupedConvTraitsType_::VectorSizeB>; + + using CodegenPipeline_ = ck_tile::GemmPipelineAGmemBGmemCRegV1; + + using MemOp = std::conditional_t, + ck_tile::integral_constant>; + using ConvEpilogue_ = ck_tile::CShuffleEpilogue>; + + using Kernel = ck_tile::GroupedConvolutionBackwardWeightKernel; + + bool IsSupportedArgument(const ck_tile::GroupedConvBwdWeightHostArgs& args) const override + { + return Kernel::IsSupportedArgument(Kernel::MakeKernelArgs(args)); + }; + + float Run(const ck_tile::GroupedConvBwdWeightHostArgs& args, bool time_kernel) override + { + auto kargs = Kernel::MakeKernelArgs(args); + const dim3 grids = Kernel::GridSize(kargs); + const dim3 blocks = Kernel::BlockSize(); + + constexpr int n_warmup = 5; + constexpr int n_repeat = 50; + ck_tile::stream_config s {nullptr, time_kernel, 1, n_warmup, n_repeat}; + float avg_time = ck_tile::launch_kernel_time_mask( + s, + Kernel::Preprocess(kargs, s), + ck_tile::make_kernel(Kernel{}, grids, blocks, 0, kargs)); + + return avg_time; + }; + + std::string GetName() const override + { + return Kernel::GetName(); + }; + + ~GroupedConvolutionBackwardWeightInvoker() override = default; +}; + +} +} diff --git a/library/include/ck_tile/library/tensor_operation_instance/gpu/tile_grouped_conv_instance_factory.hpp b/library/include/ck_tile/library/tensor_operation_instance/gpu/tile_grouped_conv_instance_factory.hpp new file mode 100644 index 0000000000..c802613854 --- /dev/null +++ b/library/include/ck_tile/library/tensor_operation_instance/gpu/tile_grouped_conv_instance_factory.hpp @@ -0,0 +1,19 @@ +// 
SPDX-License-Identifier: MIT +// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +#include "ck_tile/ops/common/tensor_layout.hpp" +#include "ck_tile/ops/elementwise/unary_element_wise_operation.hpp" + +namespace ck_tile { +namespace ops { + +using NHWGC = ck_tile::tensor_layout::convolution::NHWGC; +using GKYXC = ck_tile::tensor_layout::convolution::GKYXC; +using NHWGK = ck_tile::tensor_layout::convolution::NHWGK; + +using PassThrough = ck_tile::element_wise::PassThrough; + +} // namespace ops +} // namespace ck_tile