From bbed3a62dc223aa38dabfdfc8309f95bb63fe155 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ville=20Pietil=C3=A4?= Date: Tue, 14 Oct 2025 13:35:37 +0000 Subject: [PATCH] Fully functional CK Tile profiler. --- include/ck_tile/core/utility/env.hpp | 2 +- .../tile_grouped_conv_bwd_weight_factory.hpp | 8 --- ...tile_grouped_conv_bwd_weight_instances.hpp | 64 +++++++++++++------ .../tile_grouped_conv_bwd_weight_invoker.hpp | 19 ++++-- .../tile_grouped_conv_instance_factory.hpp | 26 ++++++++ ...e_profile_grouped_conv_bwd_weight_impl.hpp | 15 +++-- 6 files changed, 93 insertions(+), 41 deletions(-) diff --git a/include/ck_tile/core/utility/env.hpp b/include/ck_tile/core/utility/env.hpp index 9b148b3e0b..f05c43a199 100644 --- a/include/ck_tile/core/utility/env.hpp +++ b/include/ck_tile/core/utility/env.hpp @@ -13,7 +13,7 @@ void CK_TILE_ERROR(Args&&... args) noexcept { std::ostringstream oss; (oss << ... << args); - std::cerr << "[ERROR] " << oss.str() << std::endl; + std::cerr << "[CK TILE ERROR] " << oss.str() << std::endl; } namespace internal { diff --git a/library/include/ck_tile/library/tensor_operation_instance/gpu/tile_grouped_conv_bwd_weight_factory.hpp b/library/include/ck_tile/library/tensor_operation_instance/gpu/tile_grouped_conv_bwd_weight_factory.hpp index eb68513006..2b17f91260 100644 --- a/library/include/ck_tile/library/tensor_operation_instance/gpu/tile_grouped_conv_bwd_weight_factory.hpp +++ b/library/include/ck_tile/library/tensor_operation_instance/gpu/tile_grouped_conv_bwd_weight_factory.hpp @@ -62,14 +62,6 @@ struct DeviceOperationInstanceFactory && std::is_same_v && std::is_same_v) { - if constexpr(std::is_same_v && - std::is_same_v && - std::is_same_v && - std::is_same_v && - std::is_same_v) - { - add_grouped_conv2d_bwd_weight_f32_instances(op_ptrs); - } if constexpr(std::is_same_v && std::is_same_v && std::is_same_v && diff --git a/library/include/ck_tile/library/tensor_operation_instance/gpu/tile_grouped_conv_bwd_weight_instances.hpp b/library/include/ck_tile/library/tensor_operation_instance/gpu/tile_grouped_conv_bwd_weight_instances.hpp index 1d11285fd7..1f6e5932c6 100644 --- a/library/include/ck_tile/library/tensor_operation_instance/gpu/tile_grouped_conv_bwd_weight_instances.hpp +++ b/library/include/ck_tile/library/tensor_operation_instance/gpu/tile_grouped_conv_bwd_weight_instances.hpp @@ -15,19 +15,6 @@ using BF16 = ck_tile::bfloat16_t; using F16 = ck_tile::half_t; using F32 = float; -using DeviceOp2DF32 = GroupedConvolutionBackwardWeightBaseInvoker<2, - NHWGC, - GKYXC, - NHWGK, - float, - float, - float, - PassThrough, - PassThrough, - PassThrough, - float, - float>; - using DeviceOp2DF16 = GroupedConvolutionBackwardWeightBaseInvoker<2, NHWGC, GKYXC, @@ -54,19 +41,58 @@ using DeviceOp2DBF16 = GroupedConvolutionBackwardWeightBaseInvoker<2, BF16, BF16>; -void add_grouped_conv2d_bwd_weight_f32_instances(std::vector>& instances) -{ - (void)instances; -} +template +using tile_grouped_conv_bwd_weight_f16_instances = std::tuple< + // clang-format off + //#####################################| Num| InLayout| WeiLayout| OutLayout| InData| WeiData| OutData| In| Wei| Out| K-block| M-tile| N-tile | K-tile | M-warp| N-warp| K-warp| M-warp| N-warp| K-warp| Vector| Vector| Vector| Split-K| + //#####################################| Dim| | | | Type| Type| Type| Elementwise| Elementwise| Elementwise| per| | | | | | | tile| tile| tile| size| size| size| in| + //#####################################| Spatial| | | | | | | Operation| Operation| Operation| CU| | | | | | | size| size| size| A| B| C| use| + //#####################################| | | | | | | | | | | + GroupedConvolutionBackwardWeightInvoker, + GroupedConvolutionBackwardWeightInvoker, + GroupedConvolutionBackwardWeightInvoker, + GroupedConvolutionBackwardWeightInvoker + // clang-format on + >; + +template +using tile_grouped_conv_bwd_weight_bf16_instances = std::tuple< + // clang-format off + //#####################################| Num| InLayout| WeiLayout| OutLayout| InData| WeiData| OutData| In| Wei| Out| K-block| M-tile| N-tile | K-tile | M-warp| N-warp| K-warp| M-warp| N-warp| K-warp| Vector| Vector| Vector| Split-K| + //#####################################| Dim| | | | Type| Type| Type| Elementwise| Elementwise| Elementwise| per| | | | | | | tile| tile| tile| size| size| size| in| + //#####################################| Spatial| | | | | | | Operation| Operation| Operation| CU| | | | | | | size| size| size| A| B| C| use| + //#####################################| | | | | | | | | | | + GroupedConvolutionBackwardWeightInvoker, + GroupedConvolutionBackwardWeightInvoker, + GroupedConvolutionBackwardWeightInvoker, + GroupedConvolutionBackwardWeightInvoker + // clang-format on + >; void add_grouped_conv2d_bwd_weight_f16_instances(std::vector>& instances) { - (void)instances; + add_device_operation_instances(instances, + tile_grouped_conv_bwd_weight_f16_instances< + 2, + NHWGC, + GKYXC, + NHWGK>{}); } void add_grouped_conv2d_bwd_weight_bf16_instances(std::vector>& instances) { - (void)instances; + add_device_operation_instances(instances, + tile_grouped_conv_bwd_weight_bf16_instances< + 2, + NHWGC, + GKYXC, + NHWGK>{}); } } // namespace ops diff --git a/library/include/ck_tile/library/tensor_operation_instance/gpu/tile_grouped_conv_bwd_weight_invoker.hpp b/library/include/ck_tile/library/tensor_operation_instance/gpu/tile_grouped_conv_bwd_weight_invoker.hpp index 5072e153ec..5c20726671 100644 --- a/library/include/ck_tile/library/tensor_operation_instance/gpu/tile_grouped_conv_bwd_weight_invoker.hpp +++ b/library/include/ck_tile/library/tensor_operation_instance/gpu/tile_grouped_conv_bwd_weight_invoker.hpp @@ -7,13 +7,11 @@ #include #include -//#include "ck_tile/ops/grouped_convolution/kernel/grouped_convolution_backward_weight_kernel.hpp" #include "ck_tile/core.hpp" #include "ck_tile/host/kernel_launch.hpp" #include "ck_tile/ops/epilogue.hpp" #include "ck_tile/ops/gemm.hpp" #include "ck_tile/ops/grouped_convolution.hpp" -//#include "ck_tile/library/tensor_operation_instance/gpu/tile_grouped_conv_bwd_weight_instances.hpp" namespace ck_tile { namespace ops { @@ -35,6 +33,11 @@ struct GroupedConvolutionBackwardWeightBaseInvoker virtual bool IsSupportedArgument(const ck_tile::GroupedConvBwdWeightHostArgs& args) const = 0; virtual float Run(const ck_tile::GroupedConvBwdWeightHostArgs& args, bool time_kernel) = 0; virtual std::string GetName() const = 0; + GroupedConvolutionBackwardWeightBaseInvoker() = default; + GroupedConvolutionBackwardWeightBaseInvoker(const GroupedConvolutionBackwardWeightBaseInvoker&) = default; + GroupedConvolutionBackwardWeightBaseInvoker& operator=(const GroupedConvolutionBackwardWeightBaseInvoker&) = default; + GroupedConvolutionBackwardWeightBaseInvoker(GroupedConvolutionBackwardWeightBaseInvoker&&) = default; + GroupedConvolutionBackwardWeightBaseInvoker& operator=(GroupedConvolutionBackwardWeightBaseInvoker&&) = default; virtual ~GroupedConvolutionBackwardWeightBaseInvoker() = default; }; @@ -87,14 +90,13 @@ struct GroupedConvolutionBackwardWeightInvoker : ConvSpec_, InLayout, WeiLayout, - OutLayout, // = DsLayout + ck_tile::tuple<>, // = DsLayout OutLayout, VectorSizeA, VectorSizeB, VectorSizeC>; using AccDataType = float; - using DsDataType = OutDataType; using CDEElementWise = ck_tile::element_wise::PassThrough; using CodegenPipelineProblem_ = ck_tile::GemmPipelineProblem< @@ -120,7 +122,7 @@ struct GroupedConvolutionBackwardWeightInvoker : using ConvEpilogue_ = ck_tile::CShuffleEpilogue, // = DsDataType, AccDataType, OutDataType, typename GroupedConvTraitsType_::ImplicitGemmDsLayout, @@ -171,8 +173,13 @@ struct GroupedConvolutionBackwardWeightInvoker : return Kernel::GetName(); }; + GroupedConvolutionBackwardWeightInvoker() = default; + GroupedConvolutionBackwardWeightInvoker(const GroupedConvolutionBackwardWeightInvoker&) = default; + GroupedConvolutionBackwardWeightInvoker& operator=(const GroupedConvolutionBackwardWeightInvoker&) = default; + GroupedConvolutionBackwardWeightInvoker(GroupedConvolutionBackwardWeightInvoker&&) = default; + GroupedConvolutionBackwardWeightInvoker& operator=(GroupedConvolutionBackwardWeightInvoker&&) = default; ~GroupedConvolutionBackwardWeightInvoker() override = default; -}; + }; } } diff --git a/library/include/ck_tile/library/tensor_operation_instance/gpu/tile_grouped_conv_instance_factory.hpp b/library/include/ck_tile/library/tensor_operation_instance/gpu/tile_grouped_conv_instance_factory.hpp index c802613854..6ec251b052 100644 --- a/library/include/ck_tile/library/tensor_operation_instance/gpu/tile_grouped_conv_instance_factory.hpp +++ b/library/include/ck_tile/library/tensor_operation_instance/gpu/tile_grouped_conv_instance_factory.hpp @@ -3,6 +3,10 @@ #pragma once +#include +#include +#include + #include "ck_tile/ops/common/tensor_layout.hpp" #include "ck_tile/ops/elementwise/unary_element_wise_operation.hpp" @@ -15,5 +19,27 @@ using NHWGK = ck_tile::tensor_layout::convolution::NHWGK; using PassThrough = ck_tile::element_wise::PassThrough; +template +void add_device_operation_instances(std::vector>& op_instances, + const NewOpInstances& new_op_instances) +{ + ck_tile::static_for<0, std::tuple_size_v, 1>{}([&](auto i) { + const auto new_op_instance = std::get(new_op_instances); + + using NewOpInstance = remove_cvref_t; + if constexpr(std::is_same_v) + { + return; // We can use nullptr_t to enable trailing comma + } + else + { + static_assert(std::is_base_of_v, + "NewOpInstance must be derived from BaseOp"); + + op_instances.push_back(std::make_unique(new_op_instance)); + } + }); +} + } // namespace ops } // namespace ck_tile diff --git a/profiler/ck_tile/include/tile_profile_grouped_conv_bwd_weight_impl.hpp b/profiler/ck_tile/include/tile_profile_grouped_conv_bwd_weight_impl.hpp index 0b46c0cf04..95dcc946ba 100644 --- a/profiler/ck_tile/include/tile_profile_grouped_conv_bwd_weight_impl.hpp +++ b/profiler/ck_tile/include/tile_profile_grouped_conv_bwd_weight_impl.hpp @@ -161,13 +161,8 @@ bool profile_grouped_conv_bwd_weight_impl(int do_verification, output_dev_buf.GetDeviceBuffer(), split_k_value); - //using Kernel = remove_cvref_tKernel())>; - - // auto kargs = Kernel::MakeKernelArgs(args); - // const dim3 grids = Kernel::GridSize(kargs); - // const dim3 blocks = Kernel::BlockSize(); - - if(op->IsSupportedArgument(args)) + // Split-K autodeduction is not supported. + if(op->IsSupportedArgument(args) && split_k_value >= 1) { num_kernel++; if((instance_index != -1) && (instance_index + 1 != num_kernel)) @@ -177,6 +172,7 @@ bool profile_grouped_conv_bwd_weight_impl(int do_verification, } std::string op_name = op->GetName(); + std::cout << op->GetName() << ", SplitK " << split_k_param_str << " is profiled..." << std::endl; float avg_time = op->Run(args, time_kernel); @@ -233,6 +229,11 @@ bool profile_grouped_conv_bwd_weight_impl(int do_verification, all_pass &= pass; } } + else + { + std::cout << op->GetName() << ", SplitK " << split_k_param_str + << " does not support this problem." << std::endl; + } } }