[CK TILE] Refactor grouped conv fwd large tensor (#3144)

2026-06-08 23:38:11 +00:00 · 2025-11-04 00:34:48 +01:00
parent c7ded76cc7
commit 99f38e4d9b
5 changed files with 161 additions and 167 deletions
--- a/example/ck_tile/20_grouped_convolution/grouped_convolution_forward_invoker.hpp
+++ b/example/ck_tile/20_grouped_convolution/grouped_convolution_forward_invoker.hpp
@@ -112,89 +112,87 @@ struct GroupedConvolutionForwardInvoker
        // =====================================================================
        // Regular Convolution: Simple, no split-image
        // =====================================================================
-        const auto Run = [&]<bool EnableSplitImage>(const auto has_hot_loop_,
-                                                    const auto tail_number_,
-                                                    const auto memory_operation_) {
-            constexpr bool has_hot_loop_v   = has_hot_loop_.value;
-            constexpr auto tail_number_v    = tail_number_.value;
-            constexpr auto scheduler        = GemmConfig::Scheduler;
-            constexpr auto memory_operation = memory_operation_.value;
+        const auto Run =
+            [&](const auto has_hot_loop_, const auto tail_number_, const auto memory_operation_) {
+                constexpr bool has_hot_loop_v   = has_hot_loop_.value;
+                constexpr auto tail_number_v    = tail_number_.value;
+                constexpr auto scheduler        = GemmConfig::Scheduler;
+                constexpr auto memory_operation = memory_operation_.value;

-            using UniversalGemmProblem =
-                ck_tile::UniversalGemmPipelineProblem<InDataType,
-                                                      WeiDataType,
-                                                      AccDataType,
-                                                      GemmShape,
-                                                      GemmUniversalTraits,
-                                                      scheduler,
-                                                      has_hot_loop_v,
-                                                      tail_number_v,
-                                                      ck_tile::element_wise::PassThrough,
-                                                      ck_tile::element_wise::PassThrough,
-                                                      OutDataType,
-                                                      true,
-                                                      VectorSizeA,
-                                                      VectorSizeB>;
+                using UniversalGemmProblem =
+                    ck_tile::UniversalGemmPipelineProblem<InDataType,
+                                                          WeiDataType,
+                                                          AccDataType,
+                                                          GemmShape,
+                                                          GemmUniversalTraits,
+                                                          scheduler,
+                                                          has_hot_loop_v,
+                                                          tail_number_v,
+                                                          ck_tile::element_wise::PassThrough,
+                                                          ck_tile::element_wise::PassThrough,
+                                                          OutDataType,
+                                                          true,
+                                                          VectorSizeA,
+                                                          VectorSizeB>;

-            using GemmPipeline = typename PipelineTypeTraits<
-                GemmConfig::Pipeline>::template GemmPipeline<UniversalGemmProblem>;
+                using GemmPipeline = typename PipelineTypeTraits<
+                    GemmConfig::Pipeline>::template GemmPipeline<UniversalGemmProblem>;

-            using ConvEpilogue = ck_tile::CShuffleEpilogue<ck_tile::CShuffleEpilogueProblem<
-                InDataType,
-                WeiDataType,
-                DsDataType,
-                AccDataType,
-                OutDataType,
-                typename GroupedConvTraitsType::ImplicitGemmDsLayout,
-                ck_tile::tensor_layout::gemm::RowMajor,
-                CDElementWise,
-                TilePartitioner::MPerBlock,
-                TilePartitioner::NPerBlock,
-                GemmConfig::M_Warp,
-                GemmConfig::N_Warp,
-                GemmConfig::M_Warp_Tile,
-                GemmConfig::N_Warp_Tile,
-                GemmConfig::K_Warp_Tile,
-                GemmConfig::TransposeC,
-                memory_operation,
-                1,
-                true,
-                GroupedConvTraitsType::VectorSizeC>>;
+                using ConvEpilogue = ck_tile::CShuffleEpilogue<ck_tile::CShuffleEpilogueProblem<
+                    InDataType,
+                    WeiDataType,
+                    DsDataType,
+                    AccDataType,
+                    OutDataType,
+                    typename GroupedConvTraitsType::ImplicitGemmDsLayout,
+                    ck_tile::tensor_layout::gemm::RowMajor,
+                    CDElementWise,
+                    TilePartitioner::MPerBlock,
+                    TilePartitioner::NPerBlock,
+                    GemmConfig::M_Warp,
+                    GemmConfig::N_Warp,
+                    GemmConfig::M_Warp_Tile,
+                    GemmConfig::N_Warp_Tile,
+                    GemmConfig::K_Warp_Tile,
+                    GemmConfig::TransposeC,
+                    memory_operation,
+                    1,
+                    true,
+                    GroupedConvTraitsType::VectorSizeC>>;

-            using Kernel = ck_tile::GroupedConvolutionForwardKernel<EnableSplitImage,
-                                                                    GroupedConvTraitsType,
-                                                                    TilePartitioner,
-                                                                    GemmPipeline,
-                                                                    ConvEpilogue>;
-            auto kargs   = Kernel::MakeKernelArgs(args);
+                using Kernel = ck_tile::GroupedConvolutionForwardKernel<GroupedConvTraitsType,
+                                                                        TilePartitioner,
+                                                                        GemmPipeline,
+                                                                        ConvEpilogue>;
+                auto kargs   = Kernel::MakeKernelArgs(args);

-            const dim3 grids  = Kernel::GridSize(kargs);
-            const dim3 blocks = Kernel::BlockSize();
+                const dim3 grids  = Kernel::GridSize(kargs);
+                const dim3 blocks = Kernel::BlockSize();

-            if(!Kernel::IsSupportedArgument(kargs))
-            {
-                throw std::runtime_error("Wrong! Arguments not supported! Skipping conv!\n");
-            }
+                if(!Kernel::IsSupportedArgument(kargs))
+                {
+                    throw std::runtime_error("Wrong! Arguments not supported! Skipping conv!\n");
+                }

-            if(s.log_level_ > 0)
-            {
-                std::cout << "Launching kernel with args: " << Kernel::GetName() << '\n'
-                          << "shape: " << GemmShape::GetName() << '\n'
-                          << "problem: " << UniversalGemmProblem::GetName() << '\n'
-                          << "pipeline: " << GemmPipeline::GetName() << '\n'
-                          << "grid: {" << grids.x << ", " << grids.y << ", " << grids.z << "}"
-                          << ", blocks: {" << blocks.x << ", " << blocks.y << ", " << blocks.z
-                          << "}" << '\n'
-                          << "Vector size A: " << GemmPipeline::GetVectorSizeA()
-                          << ", Vector size B: " << GemmPipeline::GetVectorSizeB()
-                          << ", Vector size C: " << ConvEpilogue::GetVectorSizeC() << std::endl;
-            }
+                if(s.log_level_ > 0)
+                {
+                    std::cout << "Launching kernel with args: " << Kernel::GetName() << '\n'
+                              << "shape: " << GemmShape::GetName() << '\n'
+                              << "problem: " << UniversalGemmProblem::GetName() << '\n'
+                              << "pipeline: " << GemmPipeline::GetName() << '\n'
+                              << "grid: {" << grids.x << ", " << grids.y << ", " << grids.z << "}"
+                              << ", blocks: {" << blocks.x << ", " << blocks.y << ", " << blocks.z
+                              << "}" << '\n'
+                              << "Vector size A: " << GemmPipeline::GetVectorSizeA()
+                              << ", Vector size B: " << GemmPipeline::GetVectorSizeB()
+                              << ", Vector size C: " << ConvEpilogue::GetVectorSizeC() << std::endl;
+                }

-            ave_time = ck_tile::launch_kernel(
-                s, ck_tile::make_kernel<kBlockPerCu>(Kernel{}, grids, blocks, 0, kargs));
+                ave_time = ck_tile::launch_kernel(
+                    s, ck_tile::make_kernel<kBlockPerCu>(Kernel{}, grids, blocks, 0, kargs));

-            return ave_time;
-        };
+                return ave_time;
+            };

        // =====================================================================
        // Split-K lambda
@@ -202,11 +200,11 @@ struct GroupedConvolutionForwardInvoker
        const auto RunSplitk = [&](const auto has_hot_loop_, const auto tail_number_) {
            if(args.k_batch == 1)
            {
-                Run.template operator()<false>(has_hot_loop_, tail_number_, MemoryOpSet{});
+                Run.template operator()(has_hot_loop_, tail_number_, MemoryOpSet{});
            }
            else
            {
-                Run.template operator()<false>(has_hot_loop_, tail_number_, MemoryOpAtomicAdd{});
+                Run.template operator()(has_hot_loop_, tail_number_, MemoryOpAtomicAdd{});
            }
        };

--- a/example/ck_tile/20_grouped_convolution/grouped_convolution_forward_large_tensor_invoker.hpp
+++ b/example/ck_tile/20_grouped_convolution/grouped_convolution_forward_large_tensor_invoker.hpp
@@ -53,7 +53,10 @@ struct GroupedConvolutionForwardInvoker
                                                                 OutLayout,
                                                                 VectorSizeA,
                                                                 VectorSizeB,
-                                                                 VectorSizeC>;
+                                                                 VectorSizeC,
+                                                                 1, /*NumGroupsToMerge*/
+                                                                 ck_tile::element_wise::PassThrough,
+                                                                 true /*EnableSplitImage*/>;

        using GemmUniversalTraits = ck_tile::TileGemmUniversalTraits<
            GemmConfig::kPadM,
@@ -238,68 +241,64 @@ struct GroupedConvolutionForwardInvoker
        // =====================================================================
        // Kernel launch lambda: Uses EnableSplitImage based on layout support
        // =====================================================================
-        const auto Run = [&]<bool EnableSplitImage>(const auto has_hot_loop_,
-                                                    const auto tail_number_,
-                                                    const auto memory_operation_) {
-            constexpr bool has_hot_loop_v   = has_hot_loop_.value;
-            constexpr auto tail_number_v    = tail_number_.value;
-            constexpr auto scheduler        = GemmConfig::Scheduler;
-            constexpr auto memory_operation = memory_operation_.value;
+        const auto Run =
+            [&](const auto has_hot_loop_, const auto tail_number_, const auto memory_operation_) {
+                constexpr bool has_hot_loop_v   = has_hot_loop_.value;
+                constexpr auto tail_number_v    = tail_number_.value;
+                constexpr auto scheduler        = GemmConfig::Scheduler;
+                constexpr auto memory_operation = memory_operation_.value;

-            using UniversalGemmProblem =
-                ck_tile::UniversalGemmPipelineProblem<InDataType,
-                                                      WeiDataType,
-                                                      AccDataType,
-                                                      GemmShape,
-                                                      GemmUniversalTraits,
-                                                      scheduler,
-                                                      has_hot_loop_v,
-                                                      tail_number_v,
-                                                      ck_tile::element_wise::PassThrough,
-                                                      ck_tile::element_wise::PassThrough,
-                                                      OutDataType,
-                                                      true,
-                                                      VectorSizeA,
-                                                      VectorSizeB>;
+                using UniversalGemmProblem =
+                    ck_tile::UniversalGemmPipelineProblem<InDataType,
+                                                          WeiDataType,
+                                                          AccDataType,
+                                                          GemmShape,
+                                                          GemmUniversalTraits,
+                                                          scheduler,
+                                                          has_hot_loop_v,
+                                                          tail_number_v,
+                                                          ck_tile::element_wise::PassThrough,
+                                                          ck_tile::element_wise::PassThrough,
+                                                          OutDataType,
+                                                          true,
+                                                          VectorSizeA,
+                                                          VectorSizeB>;

-            using GemmPipeline = typename PipelineTypeTraits<
-                GemmConfig::Pipeline>::template GemmPipeline<UniversalGemmProblem>;
+                using GemmPipeline = typename PipelineTypeTraits<
+                    GemmConfig::Pipeline>::template GemmPipeline<UniversalGemmProblem>;

-            using ConvEpilogue = ck_tile::CShuffleEpilogue<ck_tile::CShuffleEpilogueProblem<
-                InDataType,
-                WeiDataType,
-                DsDataType,
-                AccDataType,
-                OutDataType,
-                typename GroupedConvTraitsType::ImplicitGemmDsLayout,
-                ck_tile::tensor_layout::gemm::RowMajor,
-                CDEElementWise,
-                TilePartitioner::MPerBlock,
-                TilePartitioner::NPerBlock,
-                GemmConfig::M_Warp,
-                GemmConfig::N_Warp,
-                GemmConfig::M_Warp_Tile,
-                GemmConfig::N_Warp_Tile,
-                GemmConfig::K_Warp_Tile,
-                GemmConfig::TransposeC,
-                memory_operation,
-                1,
-                true,
-                GroupedConvTraitsType::VectorSizeC>>;
+                using ConvEpilogue = ck_tile::CShuffleEpilogue<ck_tile::CShuffleEpilogueProblem<
+                    InDataType,
+                    WeiDataType,
+                    DsDataType,
+                    AccDataType,
+                    OutDataType,
+                    typename GroupedConvTraitsType::ImplicitGemmDsLayout,
+                    ck_tile::tensor_layout::gemm::RowMajor,
+                    CDEElementWise,
+                    TilePartitioner::MPerBlock,
+                    TilePartitioner::NPerBlock,
+                    GemmConfig::M_Warp,
+                    GemmConfig::N_Warp,
+                    GemmConfig::M_Warp_Tile,
+                    GemmConfig::N_Warp_Tile,
+                    GemmConfig::K_Warp_Tile,
+                    GemmConfig::TransposeC,
+                    memory_operation,
+                    1,
+                    true,
+                    GroupedConvTraitsType::VectorSizeC>>;

-            // Use split-image kernel if layout supports it, otherwise use regular kernel
-            using Kernel = ck_tile::GroupedConvolutionForwardKernel<EnableSplitImage,
-                                                                    GroupedConvTraitsType,
-                                                                    TilePartitioner,
-                                                                    GemmPipeline,
-                                                                    ConvEpilogue>;
+                // Use split-image kernel if layout supports it, otherwise use regular kernel
+                using Kernel = ck_tile::GroupedConvolutionForwardKernel<GroupedConvTraitsType,
+                                                                        TilePartitioner,
+                                                                        GemmPipeline,
+                                                                        ConvEpilogue>;

-            // Create kargs
-            auto kargs = Kernel::MakeKernelArgs(args);
+                // Create kargs
+                auto kargs = Kernel::MakeKernelArgs(args);

-            // Populate split-image metadata ONLY if using split-image kernel
-            if constexpr(EnableSplitImage)
-            {
+                // Populate split-image metadata ONLY if using split-image kernel
                kargs.num_spatial_pieces        = total_pieces;
                kargs.split_image.total_d       = total_d;
                kargs.split_image.total_h       = total_h;
@@ -320,41 +319,35 @@ struct GroupedConvolutionForwardInvoker
                                                   temp_pieces[i].h_size,
                                                   temp_pieces[i].w_size};
                }
-            }

-            // Calculate grid: use total_blocks for split-image, or normal GridSize for regular
-            const dim3 grids = [&]() {
-                if constexpr(EnableSplitImage)
-                    return dim3(total_blocks, kargs.GemmBatch, kargs.n_splits);
-                else
-                    return Kernel::GridSize(kargs);
-            }();
-            const dim3 blocks = Kernel::BlockSize();
+                // Calculate grid: use total_blocks for split-image, or normal GridSize for regular
+                const dim3 grids  = dim3(total_blocks, kargs.GemmBatch, kargs.n_splits);
+                const dim3 blocks = Kernel::BlockSize();

-            if(!Kernel::IsSupportedArgument(kargs))
-            {
-                throw std::runtime_error("Wrong! Arguments not supported! Skipping conv!\n");
-            }
+                if(!Kernel::IsSupportedArgument(kargs))
+                {
+                    throw std::runtime_error("Wrong! Arguments not supported! Skipping conv!\n");
+                }

-            if(s.log_level_ > 0)
-            {
-                std::cout << "Launching kernel with args: " << Kernel::GetName() << '\n'
-                          << "shape: " << GemmShape::GetName() << '\n'
-                          << "problem: " << UniversalGemmProblem::GetName() << '\n'
-                          << "pipeline: " << GemmPipeline::GetName() << '\n'
-                          << "grid: {" << grids.x << ", " << grids.y << ", " << grids.z << "}"
-                          << ", blocks: {" << blocks.x << ", " << blocks.y << ", " << blocks.z
-                          << "}" << '\n'
-                          << "Vector size A: " << GemmPipeline::GetVectorSizeA()
-                          << ", Vector size B: " << GemmPipeline::GetVectorSizeB()
-                          << ", Vector size C: " << ConvEpilogue::GetVectorSizeC() << std::endl;
-            }
+                if(s.log_level_ > 0)
+                {
+                    std::cout << "Launching kernel with args: " << Kernel::GetName() << '\n'
+                              << "shape: " << GemmShape::GetName() << '\n'
+                              << "problem: " << UniversalGemmProblem::GetName() << '\n'
+                              << "pipeline: " << GemmPipeline::GetName() << '\n'
+                              << "grid: {" << grids.x << ", " << grids.y << ", " << grids.z << "}"
+                              << ", blocks: {" << blocks.x << ", " << blocks.y << ", " << blocks.z
+                              << "}" << '\n'
+                              << "Vector size A: " << GemmPipeline::GetVectorSizeA()
+                              << ", Vector size B: " << GemmPipeline::GetVectorSizeB()
+                              << ", Vector size C: " << ConvEpilogue::GetVectorSizeC() << std::endl;
+                }

-            ave_time = ck_tile::launch_kernel(
-                s, ck_tile::make_kernel<kBlockPerCu>(Kernel{}, grids, blocks, 0, kargs));
+                ave_time = ck_tile::launch_kernel(
+                    s, ck_tile::make_kernel<kBlockPerCu>(Kernel{}, grids, blocks, 0, kargs));

-            return ave_time;
-        };
+                return ave_time;
+            };

        // =====================================================================
        // Step 4: Dispatch kernel (split-image or regular based on decision)
--- a/example/ck_tile/20_grouped_convolution/grouped_convolution_utils.hpp
+++ b/example/ck_tile/20_grouped_convolution/grouped_convolution_utils.hpp
@@ -11,7 +11,9 @@
 #include "ck_tile/ops/epilogue.hpp"
 #include "ck_tile/ops/gemm.hpp"
 #include "ck_tile/ops/grouped_convolution.hpp"
+#include "ck_tile/ops/elementwise/unary_element_wise_operation.hpp"
 #include "conv_configs.hpp"
+
 using MemoryOpSet =
    std::integral_constant<ck_tile::memory_operation_enum, ck_tile::memory_operation_enum::set>;
 using MemoryOpAtomicAdd = std::integral_constant<ck_tile::memory_operation_enum,
--- a/include/ck_tile/ops/grouped_convolution/kernel/grouped_convolution_forward_kernel.hpp
+++ b/include/ck_tile/ops/grouped_convolution/kernel/grouped_convolution_forward_kernel.hpp
@@ -434,14 +434,13 @@ struct GroupedConvFwdKernelArgs
 ///                                     multiplication implementation. It is responsible for storing
 ///                                     results calculated by @ref GemmPipeline_ "GemmPipeline" to
 ///                                     the output C tensor in global memory.
-template <bool EnableSplitImage_,
-          typename GroupedConvTraitsType_,
+template <typename GroupedConvTraitsType_,
          typename TilePartitioner_,
          typename GemmPipeline_,
          typename EpiloguePipeline_>
 struct GroupedConvolutionForwardKernel
 {
-    static constexpr bool EnableSplitImage = EnableSplitImage_;
+    static constexpr bool EnableSplitImage = GroupedConvTraitsType_::EnableSplitImage;
    static constexpr index_t NDimSpatial   = GroupedConvTraitsType_::NDimSpatial;
    static constexpr ConvolutionSpecialization ConvSpecialization =
        GroupedConvTraitsType_::ConvSpecialization;
--- a/include/ck_tile/ops/grouped_convolution/utils/grouped_convolution_utils.hpp
+++ b/include/ck_tile/ops/grouped_convolution/utils/grouped_convolution_utils.hpp
@@ -63,7 +63,8 @@ template <index_t NDimSpatial_,
          index_t VectorSizeB_      = 1,
          index_t VectorSizeC_      = 1,
          index_t NumGroupsToMerge_ = 1,
-          typename CDElementwise_   = PassThrough>
+          typename CDElementwise_   = PassThrough,
+          bool EnableSplitImage_    = false>
 struct GroupedConvTraits
 {
    private:
@@ -74,6 +75,7 @@ struct GroupedConvTraits
    }

    public:
+    static constexpr bool EnableSplitImage                        = EnableSplitImage_;
    static constexpr index_t NumGroupsToMerge                     = NumGroupsToMerge_;
    static constexpr index_t NDimSpatial                          = NDimSpatial_;
    static constexpr ConvolutionSpecialization ConvSpecialization = ConvSpecialization_;