mirror of
https://github.com/ROCm/composable_kernel.git
synced 2026-06-30 19:57:40 +00:00
Fully functional CK Tile profiler.
This commit is contained in:
@@ -13,7 +13,7 @@ void CK_TILE_ERROR(Args&&... args) noexcept
|
||||
{
|
||||
std::ostringstream oss;
|
||||
(oss << ... << args);
|
||||
std::cerr << "[ERROR] " << oss.str() << std::endl;
|
||||
std::cerr << "[CK TILE ERROR] " << oss.str() << std::endl;
|
||||
}
|
||||
|
||||
namespace internal {
|
||||
|
||||
@@ -62,14 +62,6 @@ struct DeviceOperationInstanceFactory<GroupedConvolutionBackwardWeightBaseInvoke
|
||||
if constexpr(std::is_same_v<InLayout, NHWGC> && std::is_same_v<WeiLayout, GKYXC> &&
|
||||
std::is_same_v<OutLayout, NHWGK>)
|
||||
{
|
||||
if constexpr(std::is_same_v<InDataType, float> &&
|
||||
std::is_same_v<WeiDataType, float> &&
|
||||
std::is_same_v<OutDataType, float> &&
|
||||
std::is_same_v<ComputeTypeA, float> &&
|
||||
std::is_same_v<ComputeTypeB, float>)
|
||||
{
|
||||
add_grouped_conv2d_bwd_weight_f32_instances(op_ptrs);
|
||||
}
|
||||
if constexpr(std::is_same_v<InDataType, ck_tile::half_t> &&
|
||||
std::is_same_v<WeiDataType, ck_tile::half_t> &&
|
||||
std::is_same_v<OutDataType, ck_tile::half_t> &&
|
||||
|
||||
@@ -15,19 +15,6 @@ using BF16 = ck_tile::bfloat16_t;
|
||||
using F16 = ck_tile::half_t;
|
||||
using F32 = float;
|
||||
|
||||
using DeviceOp2DF32 = GroupedConvolutionBackwardWeightBaseInvoker<2,
|
||||
NHWGC,
|
||||
GKYXC,
|
||||
NHWGK,
|
||||
float,
|
||||
float,
|
||||
float,
|
||||
PassThrough,
|
||||
PassThrough,
|
||||
PassThrough,
|
||||
float,
|
||||
float>;
|
||||
|
||||
using DeviceOp2DF16 = GroupedConvolutionBackwardWeightBaseInvoker<2,
|
||||
NHWGC,
|
||||
GKYXC,
|
||||
@@ -54,19 +41,58 @@ using DeviceOp2DBF16 = GroupedConvolutionBackwardWeightBaseInvoker<2,
|
||||
BF16,
|
||||
BF16>;
|
||||
|
||||
void add_grouped_conv2d_bwd_weight_f32_instances(std::vector<std::unique_ptr<DeviceOp2DF32>>& instances)
|
||||
{
|
||||
(void)instances;
|
||||
}
|
||||
template <ck_tile::index_t NDimSpatial,
|
||||
typename ALayout,
|
||||
typename BLayout,
|
||||
typename ELayout>
|
||||
using tile_grouped_conv_bwd_weight_f16_instances = std::tuple<
|
||||
// clang-format off
|
||||
//#####################################| Num| InLayout| WeiLayout| OutLayout| InData| WeiData| OutData| In| Wei| Out| K-block| M-tile| N-tile | K-tile | M-warp| N-warp| K-warp| M-warp| N-warp| K-warp| Vector| Vector| Vector| Split-K|
|
||||
//#####################################| Dim| | | | Type| Type| Type| Elementwise| Elementwise| Elementwise| per| | | | | | | tile| tile| tile| size| size| size| in|
|
||||
//#####################################| Spatial| | | | | | | Operation| Operation| Operation| CU| | | | | | | size| size| size| A| B| C| use|
|
||||
//#####################################| | | | | | | | | | |
|
||||
GroupedConvolutionBackwardWeightInvoker<NDimSpatial, ALayout, BLayout, ELayout, F16, F16, F16, PassThrough, PassThrough, PassThrough, 1, 64, 64, 64, 2, 2, 1, 32, 32, 16, 1, 1, 8, false>,
|
||||
GroupedConvolutionBackwardWeightInvoker<NDimSpatial, ALayout, BLayout, ELayout, F16, F16, F16, PassThrough, PassThrough, PassThrough, 1, 64, 64, 64, 2, 2, 1, 32, 32, 16, 1, 1, 2, true>,
|
||||
GroupedConvolutionBackwardWeightInvoker<NDimSpatial, ALayout, BLayout, ELayout, F16, F16, F16, PassThrough, PassThrough, PassThrough, 1, 64, 64, 64, 2, 2, 1, 32, 32, 16, 1, 1, 4, true>,
|
||||
GroupedConvolutionBackwardWeightInvoker<NDimSpatial, ALayout, BLayout, ELayout, F16, F16, F16, PassThrough, PassThrough, PassThrough, 1, 64, 64, 64, 2, 2, 1, 32, 32, 16, 1, 1, 8, true>
|
||||
// clang-format on
|
||||
>;
|
||||
|
||||
template <ck_tile::index_t NDimSpatial,
|
||||
typename ALayout,
|
||||
typename BLayout,
|
||||
typename ELayout>
|
||||
using tile_grouped_conv_bwd_weight_bf16_instances = std::tuple<
|
||||
// clang-format off
|
||||
//#####################################| Num| InLayout| WeiLayout| OutLayout| InData| WeiData| OutData| In| Wei| Out| K-block| M-tile| N-tile | K-tile | M-warp| N-warp| K-warp| M-warp| N-warp| K-warp| Vector| Vector| Vector| Split-K|
|
||||
//#####################################| Dim| | | | Type| Type| Type| Elementwise| Elementwise| Elementwise| per| | | | | | | tile| tile| tile| size| size| size| in|
|
||||
//#####################################| Spatial| | | | | | | Operation| Operation| Operation| CU| | | | | | | size| size| size| A| B| C| use|
|
||||
//#####################################| | | | | | | | | | |
|
||||
GroupedConvolutionBackwardWeightInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, 1, 64, 64, 64, 2, 2, 1, 32, 32, 16, 1, 1, 8, false>,
|
||||
GroupedConvolutionBackwardWeightInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, 1, 64, 64, 64, 2, 2, 1, 32, 32, 16, 1, 1, 2, true>,
|
||||
GroupedConvolutionBackwardWeightInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, 1, 64, 64, 64, 2, 2, 1, 32, 32, 16, 1, 1, 4, true>,
|
||||
GroupedConvolutionBackwardWeightInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, 1, 64, 64, 64, 2, 2, 1, 32, 32, 16, 1, 1, 8, true>
|
||||
// clang-format on
|
||||
>;
|
||||
|
||||
void add_grouped_conv2d_bwd_weight_f16_instances(std::vector<std::unique_ptr<DeviceOp2DF16>>& instances)
|
||||
{
|
||||
(void)instances;
|
||||
add_device_operation_instances(instances,
|
||||
tile_grouped_conv_bwd_weight_f16_instances<
|
||||
2,
|
||||
NHWGC,
|
||||
GKYXC,
|
||||
NHWGK>{});
|
||||
}
|
||||
|
||||
void add_grouped_conv2d_bwd_weight_bf16_instances(std::vector<std::unique_ptr<DeviceOp2DBF16>>& instances)
|
||||
{
|
||||
(void)instances;
|
||||
add_device_operation_instances(instances,
|
||||
tile_grouped_conv_bwd_weight_bf16_instances<
|
||||
2,
|
||||
NHWGC,
|
||||
GKYXC,
|
||||
NHWGK>{});
|
||||
}
|
||||
|
||||
} // namespace ops
|
||||
|
||||
@@ -7,13 +7,11 @@
|
||||
#include <vector>
|
||||
#include <memory>
|
||||
|
||||
//#include "ck_tile/ops/grouped_convolution/kernel/grouped_convolution_backward_weight_kernel.hpp"
|
||||
#include "ck_tile/core.hpp"
|
||||
#include "ck_tile/host/kernel_launch.hpp"
|
||||
#include "ck_tile/ops/epilogue.hpp"
|
||||
#include "ck_tile/ops/gemm.hpp"
|
||||
#include "ck_tile/ops/grouped_convolution.hpp"
|
||||
//#include "ck_tile/library/tensor_operation_instance/gpu/tile_grouped_conv_bwd_weight_instances.hpp"
|
||||
|
||||
namespace ck_tile {
|
||||
namespace ops {
|
||||
@@ -35,6 +33,11 @@ struct GroupedConvolutionBackwardWeightBaseInvoker
|
||||
virtual bool IsSupportedArgument(const ck_tile::GroupedConvBwdWeightHostArgs& args) const = 0;
|
||||
virtual float Run(const ck_tile::GroupedConvBwdWeightHostArgs& args, bool time_kernel) = 0;
|
||||
virtual std::string GetName() const = 0;
|
||||
GroupedConvolutionBackwardWeightBaseInvoker() = default;
|
||||
GroupedConvolutionBackwardWeightBaseInvoker(const GroupedConvolutionBackwardWeightBaseInvoker&) = default;
|
||||
GroupedConvolutionBackwardWeightBaseInvoker& operator=(const GroupedConvolutionBackwardWeightBaseInvoker&) = default;
|
||||
GroupedConvolutionBackwardWeightBaseInvoker(GroupedConvolutionBackwardWeightBaseInvoker&&) = default;
|
||||
GroupedConvolutionBackwardWeightBaseInvoker& operator=(GroupedConvolutionBackwardWeightBaseInvoker&&) = default;
|
||||
virtual ~GroupedConvolutionBackwardWeightBaseInvoker() = default;
|
||||
};
|
||||
|
||||
@@ -87,14 +90,13 @@ struct GroupedConvolutionBackwardWeightInvoker :
|
||||
ConvSpec_,
|
||||
InLayout,
|
||||
WeiLayout,
|
||||
OutLayout, // = DsLayout
|
||||
ck_tile::tuple<>, // = DsLayout
|
||||
OutLayout,
|
||||
VectorSizeA,
|
||||
VectorSizeB,
|
||||
VectorSizeC>;
|
||||
|
||||
using AccDataType = float;
|
||||
using DsDataType = OutDataType;
|
||||
using CDEElementWise = ck_tile::element_wise::PassThrough;
|
||||
|
||||
using CodegenPipelineProblem_ = ck_tile::GemmPipelineProblem<
|
||||
@@ -120,7 +122,7 @@ struct GroupedConvolutionBackwardWeightInvoker :
|
||||
using ConvEpilogue_ = ck_tile::CShuffleEpilogue<ck_tile::CShuffleEpilogueProblem<
|
||||
InDataType,
|
||||
WeiDataType,
|
||||
DsDataType,
|
||||
ck_tile::tuple<>, // = DsDataType,
|
||||
AccDataType,
|
||||
OutDataType,
|
||||
typename GroupedConvTraitsType_::ImplicitGemmDsLayout,
|
||||
@@ -171,8 +173,13 @@ struct GroupedConvolutionBackwardWeightInvoker :
|
||||
return Kernel::GetName();
|
||||
};
|
||||
|
||||
GroupedConvolutionBackwardWeightInvoker() = default;
|
||||
GroupedConvolutionBackwardWeightInvoker(const GroupedConvolutionBackwardWeightInvoker&) = default;
|
||||
GroupedConvolutionBackwardWeightInvoker& operator=(const GroupedConvolutionBackwardWeightInvoker&) = default;
|
||||
GroupedConvolutionBackwardWeightInvoker(GroupedConvolutionBackwardWeightInvoker&&) = default;
|
||||
GroupedConvolutionBackwardWeightInvoker& operator=(GroupedConvolutionBackwardWeightInvoker&&) = default;
|
||||
~GroupedConvolutionBackwardWeightInvoker() override = default;
|
||||
};
|
||||
};
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
@@ -3,6 +3,10 @@
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <vector>
|
||||
#include <memory>
|
||||
#include <type_traits>
|
||||
|
||||
#include "ck_tile/ops/common/tensor_layout.hpp"
|
||||
#include "ck_tile/ops/elementwise/unary_element_wise_operation.hpp"
|
||||
|
||||
@@ -15,5 +19,27 @@ using NHWGK = ck_tile::tensor_layout::convolution::NHWGK;
|
||||
|
||||
using PassThrough = ck_tile::element_wise::PassThrough;
|
||||
|
||||
template <typename BaseOp, typename NewOpInstances>
|
||||
void add_device_operation_instances(std::vector<std::unique_ptr<BaseOp>>& op_instances,
|
||||
const NewOpInstances& new_op_instances)
|
||||
{
|
||||
ck_tile::static_for<0, std::tuple_size_v<NewOpInstances>, 1>{}([&](auto i) {
|
||||
const auto new_op_instance = std::get<i>(new_op_instances);
|
||||
|
||||
using NewOpInstance = remove_cvref_t<decltype(new_op_instance)>;
|
||||
if constexpr(std::is_same_v<NewOpInstance, std::nullptr_t>)
|
||||
{
|
||||
return; // We can use nullptr_t to enable trailing comma
|
||||
}
|
||||
else
|
||||
{
|
||||
static_assert(std::is_base_of_v<BaseOp, NewOpInstance>,
|
||||
"NewOpInstance must be derived from BaseOp");
|
||||
|
||||
op_instances.push_back(std::make_unique<NewOpInstance>(new_op_instance));
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
} // namespace ops
|
||||
} // namespace ck_tile
|
||||
|
||||
@@ -161,13 +161,8 @@ bool profile_grouped_conv_bwd_weight_impl(int do_verification,
|
||||
output_dev_buf.GetDeviceBuffer(),
|
||||
split_k_value);
|
||||
|
||||
//using Kernel = remove_cvref_t<decltype(op->Kernel())>;
|
||||
|
||||
// auto kargs = Kernel::MakeKernelArgs(args);
|
||||
// const dim3 grids = Kernel::GridSize(kargs);
|
||||
// const dim3 blocks = Kernel::BlockSize();
|
||||
|
||||
if(op->IsSupportedArgument(args))
|
||||
// Split-K autodeduction is not supported.
|
||||
if(op->IsSupportedArgument(args) && split_k_value >= 1)
|
||||
{
|
||||
num_kernel++;
|
||||
if((instance_index != -1) && (instance_index + 1 != num_kernel))
|
||||
@@ -177,6 +172,7 @@ bool profile_grouped_conv_bwd_weight_impl(int do_verification,
|
||||
}
|
||||
|
||||
std::string op_name = op->GetName();
|
||||
std::cout << op->GetName() << ", SplitK " << split_k_param_str << " is profiled..." << std::endl;
|
||||
|
||||
float avg_time = op->Run(args, time_kernel);
|
||||
|
||||
@@ -233,6 +229,11 @@ bool profile_grouped_conv_bwd_weight_impl(int do_verification,
|
||||
all_pass &= pass;
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
std::cout << op->GetName() << ", SplitK " << split_k_param_str
|
||||
<< " does not support this problem." << std::endl;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
Reference in New Issue
Block a user