[CK_TILE] Add conv fwd + bias + clamp example (#3012)

* Implement argument passing to element-wise functions for fwd convolution

* Add files for fwd + bias + clamp example

* Implement Bias

* Implement Clamp

* Elementwise function composition

* Composition unit test

* Implement fwd + bias + clamp example

* Simplify argument passing and composition

* elfunc -> bias_and_clamp

* Rename function to specify example

* Move element-wise function instantiation to kernel

* Make bias a runtime tensor

* No ugly namespace aliasing

* Initialize element-wise function on host

* Remove function initialization helper, simplify Compose initialization

* Remove unintended LSP compatibility patch

* Clean up includes and unused code

* Switch names in cshuffle epilogue

* Move CDElementwise to conv traits

* Re-add required include

* Initialize bias in same way as other tensors

* Better type specification for ds pointer

* Disable 1D convolution

* Add warning for non-group-constant bias
This commit is contained in:
Johannes Graner
2025-10-27 18:43:09 +01:00
committed by GitHub
parent 054fdb765c
commit 5c1974065e
11 changed files with 524 additions and 41 deletions

View File

@@ -1540,6 +1540,23 @@ struct Logistic
const float alpha_;
};
/// @brief Element-wise clamp functor: y = min(max(x, lower), upper).
///
/// Bounds are stored as float and converted to the operand type T at call
/// time, so narrowing follows ck_tile::type_convert semantics.
struct Clamp
{
    // Consistent with the sibling functors in this file (e.g. ConvInvscale).
    static constexpr const char* name = "Clamp";

    /// @param lower Lower bound; defaults to the lowest representable float.
    /// @param upper Upper bound; defaults to the largest representable float.
    CK_TILE_HOST_DEVICE Clamp(float lower = std::numeric_limits<float>::lowest(),
                              float upper = std::numeric_limits<float>::max())
        : lower_(lower), upper_(upper)
    {
    }

    /// @brief Writes clamp(x, lower_, upper_) into @p y.
    template <typename T>
    CK_TILE_HOST_DEVICE constexpr void operator()(T& y, const T& x) const
    {
        const T lower = ck_tile::type_convert<T>(lower_);
        const T upper = ck_tile::type_convert<T>(upper_);
        y             = ck_tile::clamp(x, lower, upper);
    }

    float lower_, upper_;
};
struct ConvInvscale
{
static constexpr const char* name = "ConvInvscale";
@@ -1629,6 +1646,55 @@ struct Cast
};
};
/**
 * @brief Compose two unary element-wise functions into one.
 *
 * Application order is FuncA first, then FuncB: y = FuncB(FuncA(x)).
 *
 * @note The Ds tensor can be consumed by at most one of the composed
 *       functions, even when compositions are chained: in
 *       `Compose<FA, Compose<FB, FC>>` only one of `FA`, `FB`, `FC`
 *       may use the Ds tensor.
 *
 * @tparam FuncA   The first function to be applied.
 * @tparam FuncB   The second function to be applied.
 * @tparam FuncADs Whether `FuncA` uses the Ds tensor.
 * @tparam FuncBDs Whether `FuncB` uses the Ds tensor.
 */
template <typename FuncA, typename FuncB, bool FuncADs = false, bool FuncBDs = false>
struct Compose
{
    static_assert(!(FuncADs && FuncBDs), "Only one composed function may use the Ds tensor.");

    CK_TILE_HOST_DEVICE Compose(FuncA func_a_ = FuncA{}, FuncB func_b_ = FuncB{})
        : func_a(func_a_), func_b(func_b_)
    {
    }

    /// @brief Computes y = func_b(func_a(x)), forwarding the trailing Ds
    ///        arguments to whichever stage was declared to consume them.
    template <typename AIn, typename BOut, typename AOut = AIn, typename... ADs>
    CK_TILE_HOST_DEVICE constexpr void operator()(BOut& y, const AIn& x, const ADs&... ds) const
    {
        AOut intermediate;
        // At most one of the two branches below forwards ds... (enforced by
        // the static_assert above), so each stage is handled independently.
        if constexpr(FuncADs)
        {
            func_a(intermediate, x, ds...);
        }
        else
        {
            func_a(intermediate, x);
        }
        if constexpr(FuncBDs)
        {
            func_b(y, intermediate, ds...);
        }
        else
        {
            func_b(y, intermediate);
        }
    }

    const FuncA func_a;
    const FuncB func_b;
};
// support fastconvert of int8 to fp16
#if 0
template <typename InputDataType, typename OutputDataType, index_t RegPackNumber>

View File

@@ -8,7 +8,7 @@
#include "ck_tile/ops/common/tensor_layout.hpp"
#include "ck_tile/ops/elementwise/unary_element_wise_operation.hpp"
#include <optional>
#include <type_traits>
namespace ck_tile {
@@ -117,6 +117,10 @@ struct CShuffleEpilogue
static constexpr index_t MRepeat = kMPerBlock / (MPerXdl * MWave);
static constexpr index_t NRepeat = kNPerBlock / (NPerXdl * NWave);
CDElementwise elfunc_;
CK_TILE_DEVICE CShuffleEpilogue(CDElementwise elfunc = CDElementwise{}) : elfunc_(elfunc) {};
static_assert(NumDTensor == DsLayout::size(),
"The size of DsDataType and DsLayout should be the same");
/**
@@ -385,7 +389,7 @@ struct CShuffleEpilogue
generate_tie([&](auto idx) -> const auto& { return ds_tensor[idx]; },
number<NumDTensor>{}));
tile_elementwise_inout_unpack(typename Problem::CDElementwise{}, c_ds_tiles);
tile_elementwise_inout_unpack(elfunc_, c_ds_tiles);
}
template <typename OutDramWindow, typename COutTensor>
@@ -450,7 +454,7 @@ struct CShuffleEpilogue
CK_TILE_DEVICE auto operator()(ODramWindow& out_dram_window,
const OAccTile& o_acc_tile,
const DsDramWindows& ds_dram_windows,
void* /*p_smem*/,
void* /* p_smem */,
const ScaleM& scale_m = {},
const ScaleN& scale_n = {})
{

View File

@@ -7,10 +7,12 @@
#include <string>
#include "ck_tile/core.hpp"
#include "ck_tile/core/tensor/tile_elementwise.hpp"
#include "ck_tile/ops/common.hpp"
#include "ck_tile/host/concat.hpp"
#include "ck_tile/core/utility/env.hpp"
#include "ck_tile/host/convolution_parameter.hpp"
#include "ck_tile/ops/elementwise/unary_element_wise_operation.hpp"
#include "ck_tile/ops/grouped_convolution/utils/transform_conv_fwd_to_gemm.hpp"
#include "ck_tile/ops/grouped_convolution/utils/grouped_convolution_utils.hpp"
@@ -28,6 +30,7 @@ struct GroupedConvFwdKernelArgs
GroupedConvTraitsType_::VectorSizeB,
GroupedConvTraitsType_::VectorSizeC,
true>; // Split N enabled
using CDElementwise = typename GroupedConvTraitsType_::CDElementwise;
static constexpr index_t NumDTensor = GroupedConvTraitsType_::NumDTensor;
template <
@@ -38,7 +41,8 @@ struct GroupedConvFwdKernelArgs
std::is_same_v<WeiLay, tensor_layout::convolution::GKXC> &&
std::is_same_v<OutLay, tensor_layout::convolution::NWGK>,
bool>::type = false>
CK_TILE_HOST GroupedConvFwdKernelArgs(const GroupedConvFwdHostArgs& args)
CK_TILE_HOST GroupedConvFwdKernelArgs(const GroupedConvFwdHostArgs<CDElementwise>& args)
: elfunc(args.elfunc)
{
in_g_n_c_wis_lengths = {static_cast<index_t>(args.G_),
static_cast<index_t>(args.N_),
@@ -121,7 +125,8 @@ struct GroupedConvFwdKernelArgs
std::is_same_v<WeiLay, tensor_layout::convolution::GKYXC> &&
std::is_same_v<OutLay, tensor_layout::convolution::NHWGK>,
bool>::type = false>
CK_TILE_HOST GroupedConvFwdKernelArgs(const GroupedConvFwdHostArgs& args)
CK_TILE_HOST GroupedConvFwdKernelArgs(const GroupedConvFwdHostArgs<CDElementwise>& args)
: elfunc(args.elfunc)
{
in_g_n_c_wis_lengths = {static_cast<index_t>(args.G_),
static_cast<index_t>(args.N_),
@@ -213,7 +218,8 @@ struct GroupedConvFwdKernelArgs
std::is_same_v<WeiLay, tensor_layout::convolution::GKZYXC> &&
std::is_same_v<OutLay, tensor_layout::convolution::NDHWGK>,
bool>::type = false>
CK_TILE_HOST GroupedConvFwdKernelArgs(const GroupedConvFwdHostArgs& args)
CK_TILE_HOST GroupedConvFwdKernelArgs(const GroupedConvFwdHostArgs<CDElementwise>& args)
: elfunc(args.elfunc)
{
in_g_n_c_wis_lengths = {static_cast<index_t>(args.G_),
static_cast<index_t>(args.N_),
@@ -335,6 +341,7 @@ struct GroupedConvFwdKernelArgs
const void* in_ptr;
const void* wei_ptr;
std::array<const void*, NumDTensor> ds_ptr;
const CDElementwise elfunc;
void* out_ptr;
AGridDescMK a_grid_desc_m_k;
@@ -423,6 +430,8 @@ struct GroupedConvolutionForwardKernel
// Below type is actually accumulation data type - the output of block GEMM.
using OutDataType = remove_cvref_t<typename EpiloguePipeline::ODataType>;
using CDElementwise = typename EpiloguePipeline::CDElementwise;
using GroupedConvFwdKernelArgsSpecialized = GroupedConvFwdKernelArgs<GroupedConvTraitsType_>;
// TODO: Enable this
@@ -458,7 +467,7 @@ struct GroupedConvolutionForwardKernel
}
CK_TILE_HOST static constexpr GroupedConvFwdKernelArgsSpecialized
MakeKernelArgs(const GroupedConvFwdHostArgs& hostArgs)
MakeKernelArgs(const GroupedConvFwdHostArgs<CDElementwise>& hostArgs)
{
return GroupedConvFwdKernelArgsSpecialized(hostArgs);
}
@@ -636,7 +645,7 @@ struct GroupedConvolutionForwardKernel
"Not supported!");
return make_tensor_view<address_space_enum::global>(
static_cast<OutDataType*>(ds_ptr[i]), kargs.c_grid_desc_m_n);
static_cast<const OutDataType*>(ds_ptr[i]), kargs.c_grid_desc_m_n);
},
number<NumDTensor>{});
@@ -765,8 +774,9 @@ struct GroupedConvolutionForwardKernel
// Run Epilogue Pipeline
auto& c_block_window = gemm_tile_windows.at(I3);
EpiloguePipeline{}.template operator()<decltype(c_block_window), decltype(c_block_tile)>(
c_block_window, c_block_tile, d_block_window, smem_ptr_0);
EpiloguePipeline{kargs.elfunc}
.template operator()<decltype(c_block_window), decltype(c_block_tile)>(
c_block_window, c_block_tile, d_block_window, smem_ptr_0);
}
/**

View File

@@ -5,6 +5,7 @@
#include "ck_tile/core.hpp"
#include "ck_tile/host/convolution_parameter.hpp"
#include "ck_tile/ops/elementwise/unary_element_wise_operation.hpp"
namespace ck_tile {
@@ -14,7 +15,7 @@ namespace ck_tile {
/// This structure is passed to Grouped Convolution Kernels when creating kernel
/// arguments object. It contain all necessary information required to
/// build proper kernel argument and launch kernel on GPU.
template <typename InPtr, typename WeiPtr, typename OutPtr>
template <typename InPtr, typename WeiPtr, typename OutPtr, typename CDElementwise>
struct GroupedConvHostArgs : public conv::ConvParam
{
CK_TILE_HOST GroupedConvHostArgs() = delete;
@@ -23,13 +24,15 @@ struct GroupedConvHostArgs : public conv::ConvParam
WeiPtr wei_ptr_,
const std::vector<const void*> ds_ptr_,
OutPtr out_ptr_,
index_t k_batch_)
index_t k_batch_,
CDElementwise elfunc_ = CDElementwise{})
: conv::ConvParam(conv_param),
in_ptr(in_ptr_),
wei_ptr(wei_ptr_),
ds_ptr(ds_ptr_),
out_ptr(out_ptr_),
k_batch(k_batch_)
k_batch(k_batch_),
elfunc(elfunc_)
{
}
@@ -38,11 +41,17 @@ struct GroupedConvHostArgs : public conv::ConvParam
const std::vector<const void*> ds_ptr;
OutPtr out_ptr;
index_t k_batch;
const CDElementwise elfunc;
};
using GroupedConvFwdHostArgs = GroupedConvHostArgs<const void*, const void*, void*>;
using GroupedConvBwdWeightHostArgs = GroupedConvHostArgs<const void*, void*, const void*>;
using GroupedConvBwdDataHostArgs = GroupedConvHostArgs<void*, const void*, const void*>;
using PassThrough = ck_tile::element_wise::PassThrough;
template <typename CDElementwise = PassThrough>
using GroupedConvFwdHostArgs = GroupedConvHostArgs<const void*, const void*, void*, CDElementwise>;
using GroupedConvBwdWeightHostArgs =
GroupedConvHostArgs<const void*, void*, const void*, PassThrough>;
using GroupedConvBwdDataHostArgs =
GroupedConvHostArgs<void*, const void*, const void*, PassThrough>;
template <index_t NDimSpatial_,
ConvolutionSpecialization ConvSpecialization_,
@@ -50,9 +59,10 @@ template <index_t NDimSpatial_,
typename WeiLayout_,
typename DsLayout_,
typename OutLayout_,
index_t VectorSizeA_ = 1,
index_t VectorSizeB_ = 1,
index_t VectorSizeC_ = 1>
index_t VectorSizeA_ = 1,
index_t VectorSizeB_ = 1,
index_t VectorSizeC_ = 1,
typename CDElementwise_ = PassThrough>
struct GroupedConvTraits
{
private:
@@ -70,6 +80,7 @@ struct GroupedConvTraits
using WeiLayout = WeiLayout_;
using DsLayout = DsLayout_;
using OutLayout = OutLayout_;
using CDElementwise = CDElementwise_;
using GroupedConvImplicitGemmTraitsFwd =
TileGemmTraits<true,
true,