[CK_BUILDER] Add grouped conv fwd ck tile profiler (#3518)

* [BUILDER] Add grouped conv fwd ck tile profiler

* [CK TILE] Fix grouped conv kernels splitk and double lds

* Updates

* Fixes

* Move to ckProfiler

* Fixes

* fix

* fix

* Change instances to empty list by default

* fix

* fix

* Update grouped_convolution_signatures.hpp

* Update grouped_convolution_forward_tile_algs.hpp

* [CK TILE] Add grouped convolution forward tests (#3556)

* [CK TILE] Add grouped convolution forward tests

* fix jenkins

* fixes

* comments fixes

* unit test

* unit test fix

* Move instances outside builder

* fix includes

* clang format fix

* readme fix

* fix includes

* fixes
This commit is contained in:
Bartłomiej Kocot
2026-01-20 06:29:01 +01:00
committed by GitHub
parent 0517d43d31
commit 0727e85e52
44 changed files with 3083 additions and 65 deletions

View File

@@ -98,27 +98,26 @@ struct ConvTileFactory
using GemmPipeline = typename internal::TilePipelineType<
BLOCK_GEMM.pipeline_version>::template GemmPipeline<UniversalGemmProblem>;
using ConvEpilogue = ck_tile::CShuffleEpilogue<ck_tile::CShuffleEpilogueProblem<
typename Types::ADataType,
typename Types::BDataType,
typename Types::DsDataTypes,
typename Types::AccDataType,
typename Types::EDataType,
typename GroupedConvTraitsType::ImplicitGemmDsLayout,
typename GroupedConvTraitsType::FixedGemmParams::ELayout,
typename Ops::CDEElementwiseOp,
BLOCK.per_block.m,
BLOCK.per_block.n,
BLOCK_GEMM.warps.m,
BLOCK_GEMM.warps.n,
BLOCK_GEMM.warp_tile.m,
BLOCK_GEMM.warp_tile.n,
BLOCK_GEMM.warp_tile.k,
GroupedConvTraitsType::FixedGemmParams::TransposeC,
// TODO:: This template parameter will be moved inside the kernel
BLOCK_GEMM.num_wave_groups,
GroupedConvTraitsType::FixedGemmParams::FixedVectorSize,
SCALAR_PER_VECTOR.c>>;
using ConvEpilogue = ck_tile::CShuffleEpilogue<
ck_tile::CShuffleEpilogueProblem<typename Types::ADataType,
typename Types::BDataType,
typename Types::DsDataTypes,
typename Types::AccDataType,
typename Types::EDataType,
typename GroupedConvTraitsType::ImplicitGemmDsLayout,
typename GroupedConvTraitsType::FixedGemmParams::ELayout,
typename Ops::CDEElementwiseOp,
BLOCK.per_block.m,
BLOCK.per_block.n,
BLOCK_GEMM.warps.m,
BLOCK_GEMM.warps.n,
BLOCK_GEMM.warp_tile.m,
BLOCK_GEMM.warp_tile.n,
BLOCK_GEMM.warp_tile.k,
GroupedConvTraitsType::FixedGemmParams::TransposeC,
BLOCK_GEMM.num_wave_groups,
GroupedConvTraitsType::FixedGemmParams::FixedVectorSize,
SCALAR_PER_VECTOR.c>>;
using Instance = typename internal::GroupedConvolutionTileKernel<SIGNATURE,
GroupedConvTraitsType,

View File

@@ -10,6 +10,7 @@
#include "ck_tile/builder/testing/testing_reflect.hpp"
#include "ck_tile/builder/testing/filter_extent.hpp"
#include "ck_tile/builder/testing/tensor_buffer.hpp"
#include "ck_tile/host/convolution_parameter.hpp"
#include "ck_tile/builder/testing/tensor_initialization.hpp"
#include "ck_tile/builder/testing/tensor_descriptor.hpp"
#include "ck_tile/builder/testing/validation.hpp"
@@ -93,6 +94,8 @@ struct Args<SIGNATURE>
Ops::WeiElementwiseOp b_elementwise_op;
Ops::OutElementwiseOp cde_elementwise_op;
int k_batch = 1;
/// This function returns the `TensorDescriptor` corresponding to
/// the input-tensor of the convolution problem. This can then
/// be used to, for example, allocate memory.
@@ -169,6 +172,36 @@ struct Args<SIGNATURE>
to_vector(this->input_left_pad),
to_vector(this->input_right_pad));
}
/// Convert the Args structure into a CK Tile conv_param structure.
/// This function is mainly used to be able to use the existing
/// CK Tile functionality to obtain tensor descriptors.
ck_tile::conv::ConvParam to_ck_tile_conv_param() const
{
const auto to_vector = [](const auto& extent) {
if constexpr(SPATIAL_DIM == 1)
return std::vector<ck_tile::index_t>{ck::index_t(extent.width)};
else if constexpr(SPATIAL_DIM == 2)
return std::vector<ck_tile::index_t>{ck::index_t(extent.height),
ck::index_t(extent.width)};
else
return std::vector<ck_tile::index_t>{ck::index_t(extent.depth),
ck::index_t(extent.height),
ck::index_t(extent.width)};
};
return ck_tile::conv::ConvParam(SPATIAL_DIM,
this->lengths.groups,
this->lengths.batch_size,
this->lengths.output_channels,
this->lengths.input_channels,
to_vector(this->lengths.filter),
to_vector(this->lengths.image),
to_vector(this->filter_strides),
to_vector(this->filter_dilation),
to_vector(this->input_left_pad),
to_vector(this->input_right_pad));
}
};
/// @brief `Inputs` specialization for forward convolution.

View File

@@ -4,6 +4,7 @@
#pragma once
#include "ck_tile/builder/testing/conv_fwd.hpp"
#include "ck_tile/host/kernel_launch.hpp"
#include "ck_tile/builder/factory/helpers/ck/conv_elementwise_op.hpp"
#include <type_traits>
#include <array>
@@ -87,16 +88,19 @@ concept CkConvInstance = detail::CkConvInstance<Conv, SIGNATURE>;
/// @brief `run()` specialization for forward convolution and old CK.
///
/// @tparam SIGNATURE Forward convolution signature.
/// @throws std::runtime_error if the arguments werent actually valid for the
/// @throws std::runtime_error if the arguments weren't actually valid for the
/// operation. This should be caught and reported by the testing framework.
/// @return std::tuple<bool, float> - whether the problem is supported and
/// kernel execution time (0.0f if s_conf time_kernel is false).
///
/// @see run()
template <auto SIGNATURE>
requires ValidConvSignature<SIGNATURE> && ConvDirectionIsForward<SIGNATURE>
void run(CkConvInstance<SIGNATURE> auto& conv,
const Args<SIGNATURE>& args,
const Inputs<SIGNATURE>& inputs,
const Outputs<SIGNATURE>& outputs)
std::tuple<bool, float> run(CkConvInstance<SIGNATURE> auto& conv,
const Args<SIGNATURE>& args,
const Inputs<SIGNATURE>& inputs,
const Outputs<SIGNATURE>& outputs,
const StreamConfig s_conf = {})
{
constexpr auto spatial_dim = SIGNATURE.spatial_dim;
@@ -144,10 +148,10 @@ void run(CkConvInstance<SIGNATURE> auto& conv,
if(!conv.IsSupportedArgument(ck_args))
{
throw std::runtime_error("invalid argument");
std::cout << "invalid argument" << std::endl;
}
conv.MakeInvoker().Run(ck_args, {});
return std::make_tuple(true, conv.MakeInvoker().Run(ck_args, s_conf));
}
} // namespace ck_tile::builder::test

View File

@@ -0,0 +1,91 @@
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
// SPDX-License-Identifier: MIT
#pragma once
#include "ck_tile/builder/testing/conv_fwd.hpp"
#include "ck_tile/host/kernel_launch.hpp"
#include "ck_tile/builder/factory/helpers/ck/conv_elementwise_op.hpp"
#include "ck_tile/ops/gemm.hpp"
#include "ck_tile/ops/grouped_convolution.hpp"
#include <type_traits>
#include <array>
/// This file contains the implementation details for invoking/testing
/// grouped convolution operations in CK Tile. The main item is the
/// `run()` function, which is the main implementation used to invoke
/// CK Tile grouped forward convolution kernels.
namespace ck_tile::builder::test {
namespace detail {
/// @brief Concept for checking whether this is the CK Tile convolution
/// implementation.
///
/// This is the same as `::ck_tile::builder::test::CkConvInstance`, except
/// with some utility aliases. For that reason, it's moved to this detail
/// namespace.
template <typename Conv, auto SIGNATURE>
concept CkTileConvInstance = requires(Conv&) {
{ Conv::BlockSize() };
};
} // namespace detail
/// @brief Concept for checking whether a convolution is invoked like CK Tile.
///
/// This concept is used to tell whether a convolution implementation is
/// likely to be a "CK Tile" implementation - that is, whether we should
/// invoke it as a CK Tile kernel. This is mainly used with `run()` to
/// differentiate which implementation should be invoked.
///
/// - SIGNATURE is the operation signature.
/// - Conv is a convolution instance created by the CK Builder API.
template <typename Conv, auto SIGNATURE>
concept CkTileConvInstance = detail::CkTileConvInstance<Conv, SIGNATURE>;
/// @brief `run()` specialization for forward convolution and CK Tile.
///
/// @tparam SIGNATURE Forward convolution signature.
/// @throws std::runtime_error if the arguments weren't actually valid for the
/// operation. This should be caught and reported by the testing framework.
/// @return std::tuple<bool, float> - whether the problem is supported and
/// kernel execution time (0.0f if s_conf time_kernel is false).
///
/// @see run()
template <auto SIGNATURE>
requires ValidConvSignature<SIGNATURE> && ConvDirectionIsForward<SIGNATURE>
std::tuple<bool, float> run(CkTileConvInstance<SIGNATURE> auto& conv,
const Args<SIGNATURE>& args,
const Inputs<SIGNATURE>& inputs,
const Outputs<SIGNATURE>& outputs,
const ck_tile::stream_config s_conf = {})
{
using Conv = std::remove_reference_t<decltype(conv)>;
const auto param = args.to_ck_tile_conv_param();
ck_tile::GroupedConvFwdHostArgs<> host_args(
param, inputs.input, inputs.weight, {}, outputs.output, args.k_batch);
auto kargs = Conv::MakeKernelArgs(host_args);
const dim3 grids = Conv::GridSize(kargs);
const dim3 blocks = Conv::BlockSize();
if(!Conv::IsSupportedArgument(kargs))
{
std::cout << "Not supported!";
return std::make_tuple(false, 0.f);
}
constexpr index_t minimum_occupancy =
Conv::GemmPipeline::Scheduler == ck_tile::GemmPipelineScheduler::Intrawave ? 1 : 2;
return std::make_tuple(
true,
ck_tile::launch_kernel(
s_conf, ck_tile::make_kernel<minimum_occupancy>(conv, grids, blocks, 0, kargs)));
}
} // namespace ck_tile::builder::test

View File

@@ -62,6 +62,8 @@ concept RefConvInstance = requires(Conv& conv,
/// @throws std::runtime_error if the arguments weren't actually valid for the
/// operation. This should be caught and reported by the testing framework.
///
/// @return std::tuple<bool, float> - whether the problem is supported and
/// kernel execution time (0.0f for reference).
/// @see run()
template <auto SIGNATURE>
requires ValidConvSignature<SIGNATURE> &&
@@ -69,10 +71,10 @@ template <auto SIGNATURE>
// for now, just concern ourselves with reference and see when the
// rest of the bwd/weight plumbing is there.
ConvDirectionIsForward<SIGNATURE>
void run(RefConvInstance<SIGNATURE> auto& conv,
const Args<SIGNATURE>& args,
const Inputs<SIGNATURE>& inputs,
const Outputs<SIGNATURE>& outputs)
std::tuple<bool, float> run(RefConvInstance<SIGNATURE> auto& conv,
const Args<SIGNATURE>& args,
const Inputs<SIGNATURE>& inputs,
const Outputs<SIGNATURE>& outputs)
{
// We don't want to compute the output dims manually, just get
// them via the existing infrastructure
@@ -86,15 +88,27 @@ void run(RefConvInstance<SIGNATURE> auto& conv,
for(auto right_pad : param.input_right_pads_)
{
if(right_pad != 0)
throw std::runtime_error("TODO: Support right pad in reference conv");
{
std::cout << "TODO: Support right pad in reference conv" << std::endl;
return std::make_tuple(false, 0.0f);
}
}
if(!args.make_input_descriptor().is_packed())
throw std::runtime_error("TODO: Support non-packed input tensor in reference conv");
{
std::cout << "TODO: Support non-packed input tensor in reference conv" << std::endl;
return std::make_tuple(false, 0.0f);
}
if(!args.make_weight_descriptor().is_packed())
throw std::runtime_error("TODO: Support non-packed weight tensor in reference conv");
{
std::cout << "TODO: Support non-packed weight tensor in reference conv" << std::endl;
return std::make_tuple(false, 0.0f);
}
if(!args.make_output_descriptor().is_packed())
throw std::runtime_error("TODO: Support non-packed output tensor in reference conv");
{
std::cout << "TODO: Support non-packed output tensor in reference conv" << std::endl;
return std::make_tuple(false, 0.0f);
}
conv.Run(inputs.input,
inputs.weight,
@@ -109,6 +123,7 @@ void run(RefConvInstance<SIGNATURE> auto& conv,
param.conv_filter_strides_,
param.conv_filter_dilations_,
param.input_left_pads_);
return std::make_tuple(true, 0.0f);
}
} // namespace ck_tile::builder::test

View File

@@ -34,4 +34,25 @@ struct FilterExtent<3>
size_t depth = 1;
};
template <int SPATIAL_DIM>
inline FilterExtent<SPATIAL_DIM> filter_extent_from_vector(const std::vector<std::size_t>& vec);
template <>
inline FilterExtent<1> filter_extent_from_vector<1>(const std::vector<std::size_t>& vec)
{
return FilterExtent<1>{.width = vec[0]};
}
template <>
inline FilterExtent<2> filter_extent_from_vector<2>(const std::vector<std::size_t>& vec)
{
return FilterExtent<2>{.width = vec[1], .height = vec[0]};
}
template <>
inline FilterExtent<3> filter_extent_from_vector<3>(const std::vector<std::size_t>& vec)
{
return FilterExtent<3>{.width = vec[2], .height = vec[1], .depth = vec[0]};
}
} // namespace ck_tile::builder::test

View File

@@ -418,6 +418,10 @@ struct TensorDescriptor
size_t x = 1;
for(size_t i = 0; i < RANK; ++i)
{
if(lengths[indices[i]] == 1)
{
continue;
}
if(strides[indices[i]] != x)
return false;
@@ -443,6 +447,15 @@ struct TensorDescriptor
return TensorDescriptor<DT, 1>(lengths, strides);
}
/// @brief Print tensor descriptor details.
///
/// Print tensor descriptor details - lengths and strides.
friend std::ostream& operator<<(std::ostream& os, const TensorDescriptor<DT, RANK>& tensor_desc)
{
os << tensor_desc.inner_descriptor_;
return os;
}
private:
ck_tile::HostTensorDescriptor inner_descriptor_;
};

View File

@@ -317,13 +317,17 @@ ValidationReport validate(const Args<SIGNATURE>& args,
/// @param inputs The input tensor data. Will not be modified by this function.
/// @param outputs The output tensor data. The contents will be overwritten by
/// this function.
/// @param s_conf Stream config used to launch kernel.
/// @return std::tuple<bool, float> - whether the problem is supported and
/// kernel execution time (0.0f if s_conf time_kernel is false).
///
/// @note This function is explicitly deleted to generate compile errors
/// for missing implementations.
template <auto SIGNATURE, typename Operation>
void run(Operation& operation,
const Args<SIGNATURE>& args,
const Inputs<SIGNATURE>& inputs,
const Outputs<SIGNATURE>& outputs) = delete;
template <auto SIGNATURE, typename Operation, typename StreamConf>
std::tuple<bool, float> run(Operation& operation,
const Args<SIGNATURE>& args,
const Inputs<SIGNATURE>& inputs,
const Outputs<SIGNATURE>& outputs,
const StreamConf s_conf = {}) = delete;
} // namespace ck_tile::builder::test