Fully functional CK Tile profiler.

This commit is contained in:
Ville Pietilä
2025-10-14 13:35:37 +00:00
parent 0f6bf78caa
commit bbed3a62dc
6 changed files with 93 additions and 41 deletions

View File

@@ -13,7 +13,7 @@ void CK_TILE_ERROR(Args&&... args) noexcept
{
std::ostringstream oss;
(oss << ... << args);
std::cerr << "[ERROR] " << oss.str() << std::endl;
std::cerr << "[CK TILE ERROR] " << oss.str() << std::endl;
}
namespace internal {

View File

@@ -62,14 +62,6 @@ struct DeviceOperationInstanceFactory<GroupedConvolutionBackwardWeightBaseInvoke
if constexpr(std::is_same_v<InLayout, NHWGC> && std::is_same_v<WeiLayout, GKYXC> &&
std::is_same_v<OutLayout, NHWGK>)
{
if constexpr(std::is_same_v<InDataType, float> &&
std::is_same_v<WeiDataType, float> &&
std::is_same_v<OutDataType, float> &&
std::is_same_v<ComputeTypeA, float> &&
std::is_same_v<ComputeTypeB, float>)
{
add_grouped_conv2d_bwd_weight_f32_instances(op_ptrs);
}
if constexpr(std::is_same_v<InDataType, ck_tile::half_t> &&
std::is_same_v<WeiDataType, ck_tile::half_t> &&
std::is_same_v<OutDataType, ck_tile::half_t> &&

View File

@@ -15,19 +15,6 @@ using BF16 = ck_tile::bfloat16_t;
using F16 = ck_tile::half_t;
using F32 = float;
using DeviceOp2DF32 = GroupedConvolutionBackwardWeightBaseInvoker<2,
NHWGC,
GKYXC,
NHWGK,
float,
float,
float,
PassThrough,
PassThrough,
PassThrough,
float,
float>;
using DeviceOp2DF16 = GroupedConvolutionBackwardWeightBaseInvoker<2,
NHWGC,
GKYXC,
@@ -54,19 +41,58 @@ using DeviceOp2DBF16 = GroupedConvolutionBackwardWeightBaseInvoker<2,
BF16,
BF16>;
void add_grouped_conv2d_bwd_weight_f32_instances(std::vector<std::unique_ptr<DeviceOp2DF32>>& instances)
{
(void)instances;
}
template <ck_tile::index_t NDimSpatial,
typename ALayout,
typename BLayout,
typename ELayout>
using tile_grouped_conv_bwd_weight_f16_instances = std::tuple<
// clang-format off
//#####################################| Num| InLayout| WeiLayout| OutLayout| InData| WeiData| OutData| In| Wei| Out| K-block| M-tile| N-tile | K-tile | M-warp| N-warp| K-warp| M-warp| N-warp| K-warp| Vector| Vector| Vector| Split-K|
//#####################################| Dim| | | | Type| Type| Type| Elementwise| Elementwise| Elementwise| per| | | | | | | tile| tile| tile| size| size| size| in|
//#####################################| Spatial| | | | | | | Operation| Operation| Operation| CU| | | | | | | size| size| size| A| B| C| use|
//#####################################| | | | | | | | | | |
GroupedConvolutionBackwardWeightInvoker<NDimSpatial, ALayout, BLayout, ELayout, F16, F16, F16, PassThrough, PassThrough, PassThrough, 1, 64, 64, 64, 2, 2, 1, 32, 32, 16, 1, 1, 8, false>,
GroupedConvolutionBackwardWeightInvoker<NDimSpatial, ALayout, BLayout, ELayout, F16, F16, F16, PassThrough, PassThrough, PassThrough, 1, 64, 64, 64, 2, 2, 1, 32, 32, 16, 1, 1, 2, true>,
GroupedConvolutionBackwardWeightInvoker<NDimSpatial, ALayout, BLayout, ELayout, F16, F16, F16, PassThrough, PassThrough, PassThrough, 1, 64, 64, 64, 2, 2, 1, 32, 32, 16, 1, 1, 4, true>,
GroupedConvolutionBackwardWeightInvoker<NDimSpatial, ALayout, BLayout, ELayout, F16, F16, F16, PassThrough, PassThrough, PassThrough, 1, 64, 64, 64, 2, 2, 1, 32, 32, 16, 1, 1, 8, true>
// clang-format on
>;
template <ck_tile::index_t NDimSpatial,
typename ALayout,
typename BLayout,
typename ELayout>
using tile_grouped_conv_bwd_weight_bf16_instances = std::tuple<
// clang-format off
//#####################################| Num| InLayout| WeiLayout| OutLayout| InData| WeiData| OutData| In| Wei| Out| K-block| M-tile| N-tile | K-tile | M-warp| N-warp| K-warp| M-warp| N-warp| K-warp| Vector| Vector| Vector| Split-K|
//#####################################| Dim| | | | Type| Type| Type| Elementwise| Elementwise| Elementwise| per| | | | | | | tile| tile| tile| size| size| size| in|
//#####################################| Spatial| | | | | | | Operation| Operation| Operation| CU| | | | | | | size| size| size| A| B| C| use|
//#####################################| | | | | | | | | | |
GroupedConvolutionBackwardWeightInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, 1, 64, 64, 64, 2, 2, 1, 32, 32, 16, 1, 1, 8, false>,
GroupedConvolutionBackwardWeightInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, 1, 64, 64, 64, 2, 2, 1, 32, 32, 16, 1, 1, 2, true>,
GroupedConvolutionBackwardWeightInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, 1, 64, 64, 64, 2, 2, 1, 32, 32, 16, 1, 1, 4, true>,
GroupedConvolutionBackwardWeightInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, 1, 64, 64, 64, 2, 2, 1, 32, 32, 16, 1, 1, 8, true>
// clang-format on
>;
void add_grouped_conv2d_bwd_weight_f16_instances(std::vector<std::unique_ptr<DeviceOp2DF16>>& instances)
{
(void)instances;
add_device_operation_instances(instances,
tile_grouped_conv_bwd_weight_f16_instances<
2,
NHWGC,
GKYXC,
NHWGK>{});
}
void add_grouped_conv2d_bwd_weight_bf16_instances(std::vector<std::unique_ptr<DeviceOp2DBF16>>& instances)
{
(void)instances;
add_device_operation_instances(instances,
tile_grouped_conv_bwd_weight_bf16_instances<
2,
NHWGC,
GKYXC,
NHWGK>{});
}
} // namespace ops

View File

@@ -7,13 +7,11 @@
#include <vector>
#include <memory>
//#include "ck_tile/ops/grouped_convolution/kernel/grouped_convolution_backward_weight_kernel.hpp"
#include "ck_tile/core.hpp"
#include "ck_tile/host/kernel_launch.hpp"
#include "ck_tile/ops/epilogue.hpp"
#include "ck_tile/ops/gemm.hpp"
#include "ck_tile/ops/grouped_convolution.hpp"
//#include "ck_tile/library/tensor_operation_instance/gpu/tile_grouped_conv_bwd_weight_instances.hpp"
namespace ck_tile {
namespace ops {
@@ -35,6 +33,11 @@ struct GroupedConvolutionBackwardWeightBaseInvoker
virtual bool IsSupportedArgument(const ck_tile::GroupedConvBwdWeightHostArgs& args) const = 0;
virtual float Run(const ck_tile::GroupedConvBwdWeightHostArgs& args, bool time_kernel) = 0;
virtual std::string GetName() const = 0;
GroupedConvolutionBackwardWeightBaseInvoker() = default;
GroupedConvolutionBackwardWeightBaseInvoker(const GroupedConvolutionBackwardWeightBaseInvoker&) = default;
GroupedConvolutionBackwardWeightBaseInvoker& operator=(const GroupedConvolutionBackwardWeightBaseInvoker&) = default;
GroupedConvolutionBackwardWeightBaseInvoker(GroupedConvolutionBackwardWeightBaseInvoker&&) = default;
GroupedConvolutionBackwardWeightBaseInvoker& operator=(GroupedConvolutionBackwardWeightBaseInvoker&&) = default;
virtual ~GroupedConvolutionBackwardWeightBaseInvoker() = default;
};
@@ -87,14 +90,13 @@ struct GroupedConvolutionBackwardWeightInvoker :
ConvSpec_,
InLayout,
WeiLayout,
OutLayout, // = DsLayout
ck_tile::tuple<>, // = DsLayout
OutLayout,
VectorSizeA,
VectorSizeB,
VectorSizeC>;
using AccDataType = float;
using DsDataType = OutDataType;
using CDEElementWise = ck_tile::element_wise::PassThrough;
using CodegenPipelineProblem_ = ck_tile::GemmPipelineProblem<
@@ -120,7 +122,7 @@ struct GroupedConvolutionBackwardWeightInvoker :
using ConvEpilogue_ = ck_tile::CShuffleEpilogue<ck_tile::CShuffleEpilogueProblem<
InDataType,
WeiDataType,
DsDataType,
ck_tile::tuple<>, // = DsDataType,
AccDataType,
OutDataType,
typename GroupedConvTraitsType_::ImplicitGemmDsLayout,
@@ -171,8 +173,13 @@ struct GroupedConvolutionBackwardWeightInvoker :
return Kernel::GetName();
};
GroupedConvolutionBackwardWeightInvoker() = default;
GroupedConvolutionBackwardWeightInvoker(const GroupedConvolutionBackwardWeightInvoker&) = default;
GroupedConvolutionBackwardWeightInvoker& operator=(const GroupedConvolutionBackwardWeightInvoker&) = default;
GroupedConvolutionBackwardWeightInvoker(GroupedConvolutionBackwardWeightInvoker&&) = default;
GroupedConvolutionBackwardWeightInvoker& operator=(GroupedConvolutionBackwardWeightInvoker&&) = default;
~GroupedConvolutionBackwardWeightInvoker() override = default;
};
};
}
}

View File

@@ -3,6 +3,10 @@
#pragma once
#include <vector>
#include <memory>
#include <type_traits>
#include "ck_tile/ops/common/tensor_layout.hpp"
#include "ck_tile/ops/elementwise/unary_element_wise_operation.hpp"
@@ -15,5 +19,27 @@ using NHWGK = ck_tile::tensor_layout::convolution::NHWGK;
using PassThrough = ck_tile::element_wise::PassThrough;
template <typename BaseOp, typename NewOpInstances>
void add_device_operation_instances(std::vector<std::unique_ptr<BaseOp>>& op_instances,
const NewOpInstances& new_op_instances)
{
ck_tile::static_for<0, std::tuple_size_v<NewOpInstances>, 1>{}([&](auto i) {
const auto new_op_instance = std::get<i>(new_op_instances);
using NewOpInstance = remove_cvref_t<decltype(new_op_instance)>;
if constexpr(std::is_same_v<NewOpInstance, std::nullptr_t>)
{
return; // We can use nullptr_t to enable trailing comma
}
else
{
static_assert(std::is_base_of_v<BaseOp, NewOpInstance>,
"NewOpInstance must be derived from BaseOp");
op_instances.push_back(std::make_unique<NewOpInstance>(new_op_instance));
}
});
}
} // namespace ops
} // namespace ck_tile

View File

@@ -161,13 +161,8 @@ bool profile_grouped_conv_bwd_weight_impl(int do_verification,
output_dev_buf.GetDeviceBuffer(),
split_k_value);
//using Kernel = remove_cvref_t<decltype(op->Kernel())>;
// auto kargs = Kernel::MakeKernelArgs(args);
// const dim3 grids = Kernel::GridSize(kargs);
// const dim3 blocks = Kernel::BlockSize();
if(op->IsSupportedArgument(args))
// Split-K autodeduction is not supported.
if(op->IsSupportedArgument(args) && split_k_value >= 1)
{
num_kernel++;
if((instance_index != -1) && (instance_index + 1 != num_kernel))
@@ -177,6 +172,7 @@ bool profile_grouped_conv_bwd_weight_impl(int do_verification,
}
std::string op_name = op->GetName();
std::cout << op->GetName() << ", SplitK " << split_k_param_str << " is profiled..." << std::endl;
float avg_time = op->Run(args, time_kernel);
@@ -233,6 +229,11 @@ bool profile_grouped_conv_bwd_weight_impl(int do_verification,
all_pass &= pass;
}
}
else
{
std::cout << op->GetName() << ", SplitK " << split_k_param_str
<< " does not support this problem." << std::endl;
}
}
}