[CK_TILE] Add conv fwd + bias + clamp example (#3012)

* Implement argument passing to element-wise functions for fwd convolution

* Add files for fwd + bias + clamp example

* Implement Bias

* Implement Clamp

* Elementwise function composition

* Composition unit test

* Implement fwd + bias + clamp example

* Simplify argument passing and composition

* elfunc -> bias_and_clamp

* Rename function to specify example

* Move element-wise function instantiation to kernel

* Make bias a runtime tensor

* No ugly namespace aliasing

* Initialize element-wise function on host

* Remove function initialization helper, simplify Compose initialization

* Remove unintended LSP compatibility patch

* Clean up includes and unused code

* Switch names in cshuffle epilogue

* Move CDElementwise to conv traits

* Re-add required include

* Initialize bias in same way as other tensors

* Better type specification for ds pointer

* Disable 1D convolution

* Add warning for non-group-constant bias
This commit is contained in:
Johannes Graner
2025-10-27 18:43:09 +01:00
committed by GitHub
parent 054fdb765c
commit 5c1974065e
11 changed files with 524 additions and 41 deletions

View File

@@ -1540,6 +1540,23 @@ struct Logistic
const float alpha_;
};
/// @brief Element-wise clamp functor: y = min(max(x, lower), upper).
///
/// Bounds are stored as float and converted to the operand type T at call
/// time, so narrowing follows ck_tile::type_convert semantics.
struct Clamp
{
    // Consistent with the sibling functors in this file (e.g. ConvInvscale).
    static constexpr const char* name = "Clamp";

    /// @param lower Lower bound; defaults to the lowest representable float.
    /// @param upper Upper bound; defaults to the largest representable float.
    CK_TILE_HOST_DEVICE Clamp(float lower = std::numeric_limits<float>::lowest(),
                              float upper = std::numeric_limits<float>::max())
        : lower_(lower), upper_(upper)
    {
    }

    /// @brief Writes clamp(x, lower_, upper_) into @p y.
    template <typename T>
    CK_TILE_HOST_DEVICE constexpr void operator()(T& y, const T& x) const
    {
        const T lower = ck_tile::type_convert<T>(lower_);
        const T upper = ck_tile::type_convert<T>(upper_);
        y             = ck_tile::clamp(x, lower, upper);
    }

    float lower_, upper_;
};
struct ConvInvscale
{
static constexpr const char* name = "ConvInvscale";
@@ -1629,6 +1646,55 @@ struct Cast
};
};
/**
 * @brief Compose two unary element-wise functions into one.
 *
 * Application order is FuncA first, then FuncB: y = FuncB(FuncA(x)).
 *
 * @note The Ds tensor can be consumed by at most one of the composed
 *       functions, even when compositions are chained: in
 *       `Compose<FA, Compose<FB, FC>>` only one of `FA`, `FB`, `FC`
 *       may use the Ds tensor.
 *
 * @tparam FuncA   The first function to be applied.
 * @tparam FuncB   The second function to be applied.
 * @tparam FuncADs Whether `FuncA` uses the Ds tensor.
 * @tparam FuncBDs Whether `FuncB` uses the Ds tensor.
 */
template <typename FuncA, typename FuncB, bool FuncADs = false, bool FuncBDs = false>
struct Compose
{
    static_assert(!(FuncADs && FuncBDs), "Only one composed function may use the Ds tensor.");

    CK_TILE_HOST_DEVICE Compose(FuncA func_a_ = FuncA{}, FuncB func_b_ = FuncB{})
        : func_a(func_a_), func_b(func_b_)
    {
    }

    /// @brief Computes y = func_b(func_a(x)), forwarding the trailing Ds
    ///        arguments to whichever stage was declared to consume them.
    template <typename AIn, typename BOut, typename AOut = AIn, typename... ADs>
    CK_TILE_HOST_DEVICE constexpr void operator()(BOut& y, const AIn& x, const ADs&... ds) const
    {
        AOut intermediate;
        // At most one of the two branches below forwards ds... (enforced by
        // the static_assert above), so each stage is handled independently.
        if constexpr(FuncADs)
        {
            func_a(intermediate, x, ds...);
        }
        else
        {
            func_a(intermediate, x);
        }
        if constexpr(FuncBDs)
        {
            func_b(y, intermediate, ds...);
        }
        else
        {
            func_b(y, intermediate);
        }
    }

    const FuncA func_a;
    const FuncB func_b;
};
// support fastconvert of int8 to fp16
#if 0
template <typename InputDataType, typename OutputDataType, index_t RegPackNumber>

View File

@@ -8,7 +8,7 @@
#include "ck_tile/ops/common/tensor_layout.hpp"
#include "ck_tile/ops/elementwise/unary_element_wise_operation.hpp"
#include <optional>
#include <type_traits>
namespace ck_tile {
@@ -117,6 +117,10 @@ struct CShuffleEpilogue
static constexpr index_t MRepeat = kMPerBlock / (MPerXdl * MWave);
static constexpr index_t NRepeat = kNPerBlock / (NPerXdl * NWave);
CDElementwise elfunc_;
CK_TILE_DEVICE CShuffleEpilogue(CDElementwise elfunc = CDElementwise{}) : elfunc_(elfunc) {};
static_assert(NumDTensor == DsLayout::size(),
"The size of DsDataType and DsLayout should be the same");
/**
@@ -385,7 +389,7 @@ struct CShuffleEpilogue
generate_tie([&](auto idx) -> const auto& { return ds_tensor[idx]; },
number<NumDTensor>{}));
tile_elementwise_inout_unpack(typename Problem::CDElementwise{}, c_ds_tiles);
tile_elementwise_inout_unpack(elfunc_, c_ds_tiles);
}
template <typename OutDramWindow, typename COutTensor>
@@ -450,7 +454,7 @@ struct CShuffleEpilogue
CK_TILE_DEVICE auto operator()(ODramWindow& out_dram_window,
const OAccTile& o_acc_tile,
const DsDramWindows& ds_dram_windows,
void* /*p_smem*/,
void* /* p_smem */,
const ScaleM& scale_m = {},
const ScaleN& scale_n = {})
{

View File

@@ -7,10 +7,12 @@
#include <string>
#include "ck_tile/core.hpp"
#include "ck_tile/core/tensor/tile_elementwise.hpp"
#include "ck_tile/ops/common.hpp"
#include "ck_tile/host/concat.hpp"
#include "ck_tile/core/utility/env.hpp"
#include "ck_tile/host/convolution_parameter.hpp"
#include "ck_tile/ops/elementwise/unary_element_wise_operation.hpp"
#include "ck_tile/ops/grouped_convolution/utils/transform_conv_fwd_to_gemm.hpp"
#include "ck_tile/ops/grouped_convolution/utils/grouped_convolution_utils.hpp"
@@ -28,6 +30,7 @@ struct GroupedConvFwdKernelArgs
GroupedConvTraitsType_::VectorSizeB,
GroupedConvTraitsType_::VectorSizeC,
true>; // Split N enabled
using CDElementwise = typename GroupedConvTraitsType_::CDElementwise;
static constexpr index_t NumDTensor = GroupedConvTraitsType_::NumDTensor;
template <
@@ -38,7 +41,8 @@ struct GroupedConvFwdKernelArgs
std::is_same_v<WeiLay, tensor_layout::convolution::GKXC> &&
std::is_same_v<OutLay, tensor_layout::convolution::NWGK>,
bool>::type = false>
CK_TILE_HOST GroupedConvFwdKernelArgs(const GroupedConvFwdHostArgs& args)
CK_TILE_HOST GroupedConvFwdKernelArgs(const GroupedConvFwdHostArgs<CDElementwise>& args)
: elfunc(args.elfunc)
{
in_g_n_c_wis_lengths = {static_cast<index_t>(args.G_),
static_cast<index_t>(args.N_),
@@ -121,7 +125,8 @@ struct GroupedConvFwdKernelArgs
std::is_same_v<WeiLay, tensor_layout::convolution::GKYXC> &&
std::is_same_v<OutLay, tensor_layout::convolution::NHWGK>,
bool>::type = false>
CK_TILE_HOST GroupedConvFwdKernelArgs(const GroupedConvFwdHostArgs& args)
CK_TILE_HOST GroupedConvFwdKernelArgs(const GroupedConvFwdHostArgs<CDElementwise>& args)
: elfunc(args.elfunc)
{
in_g_n_c_wis_lengths = {static_cast<index_t>(args.G_),
static_cast<index_t>(args.N_),
@@ -213,7 +218,8 @@ struct GroupedConvFwdKernelArgs
std::is_same_v<WeiLay, tensor_layout::convolution::GKZYXC> &&
std::is_same_v<OutLay, tensor_layout::convolution::NDHWGK>,
bool>::type = false>
CK_TILE_HOST GroupedConvFwdKernelArgs(const GroupedConvFwdHostArgs& args)
CK_TILE_HOST GroupedConvFwdKernelArgs(const GroupedConvFwdHostArgs<CDElementwise>& args)
: elfunc(args.elfunc)
{
in_g_n_c_wis_lengths = {static_cast<index_t>(args.G_),
static_cast<index_t>(args.N_),
@@ -335,6 +341,7 @@ struct GroupedConvFwdKernelArgs
const void* in_ptr;
const void* wei_ptr;
std::array<const void*, NumDTensor> ds_ptr;
const CDElementwise elfunc;
void* out_ptr;
AGridDescMK a_grid_desc_m_k;
@@ -423,6 +430,8 @@ struct GroupedConvolutionForwardKernel
// Below type is actually accumulation data type - the output of block GEMM.
using OutDataType = remove_cvref_t<typename EpiloguePipeline::ODataType>;
using CDElementwise = typename EpiloguePipeline::CDElementwise;
using GroupedConvFwdKernelArgsSpecialized = GroupedConvFwdKernelArgs<GroupedConvTraitsType_>;
// TODO: Enable this
@@ -458,7 +467,7 @@ struct GroupedConvolutionForwardKernel
}
CK_TILE_HOST static constexpr GroupedConvFwdKernelArgsSpecialized
MakeKernelArgs(const GroupedConvFwdHostArgs& hostArgs)
MakeKernelArgs(const GroupedConvFwdHostArgs<CDElementwise>& hostArgs)
{
return GroupedConvFwdKernelArgsSpecialized(hostArgs);
}
@@ -636,7 +645,7 @@ struct GroupedConvolutionForwardKernel
"Not supported!");
return make_tensor_view<address_space_enum::global>(
static_cast<OutDataType*>(ds_ptr[i]), kargs.c_grid_desc_m_n);
static_cast<const OutDataType*>(ds_ptr[i]), kargs.c_grid_desc_m_n);
},
number<NumDTensor>{});
@@ -765,8 +774,9 @@ struct GroupedConvolutionForwardKernel
// Run Epilogue Pipeline
auto& c_block_window = gemm_tile_windows.at(I3);
EpiloguePipeline{}.template operator()<decltype(c_block_window), decltype(c_block_tile)>(
c_block_window, c_block_tile, d_block_window, smem_ptr_0);
EpiloguePipeline{kargs.elfunc}
.template operator()<decltype(c_block_window), decltype(c_block_tile)>(
c_block_window, c_block_tile, d_block_window, smem_ptr_0);
}
/**

View File

@@ -5,6 +5,7 @@
#include "ck_tile/core.hpp"
#include "ck_tile/host/convolution_parameter.hpp"
#include "ck_tile/ops/elementwise/unary_element_wise_operation.hpp"
namespace ck_tile {
@@ -14,7 +15,7 @@ namespace ck_tile {
/// This structure is passed to Grouped Convolution Kernels when creating kernel
/// arguments object. It contain all necessary information required to
/// build proper kernel argument and launch kernel on GPU.
template <typename InPtr, typename WeiPtr, typename OutPtr>
template <typename InPtr, typename WeiPtr, typename OutPtr, typename CDElementwise>
struct GroupedConvHostArgs : public conv::ConvParam
{
CK_TILE_HOST GroupedConvHostArgs() = delete;
@@ -23,13 +24,15 @@ struct GroupedConvHostArgs : public conv::ConvParam
WeiPtr wei_ptr_,
const std::vector<const void*> ds_ptr_,
OutPtr out_ptr_,
index_t k_batch_)
index_t k_batch_,
CDElementwise elfunc_ = CDElementwise{})
: conv::ConvParam(conv_param),
in_ptr(in_ptr_),
wei_ptr(wei_ptr_),
ds_ptr(ds_ptr_),
out_ptr(out_ptr_),
k_batch(k_batch_)
k_batch(k_batch_),
elfunc(elfunc_)
{
}
@@ -38,11 +41,17 @@ struct GroupedConvHostArgs : public conv::ConvParam
const std::vector<const void*> ds_ptr;
OutPtr out_ptr;
index_t k_batch;
const CDElementwise elfunc;
};
using GroupedConvFwdHostArgs = GroupedConvHostArgs<const void*, const void*, void*>;
using GroupedConvBwdWeightHostArgs = GroupedConvHostArgs<const void*, void*, const void*>;
using GroupedConvBwdDataHostArgs = GroupedConvHostArgs<void*, const void*, const void*>;
using PassThrough = ck_tile::element_wise::PassThrough;
template <typename CDElementwise = PassThrough>
using GroupedConvFwdHostArgs = GroupedConvHostArgs<const void*, const void*, void*, CDElementwise>;
using GroupedConvBwdWeightHostArgs =
GroupedConvHostArgs<const void*, void*, const void*, PassThrough>;
using GroupedConvBwdDataHostArgs =
GroupedConvHostArgs<void*, const void*, const void*, PassThrough>;
template <index_t NDimSpatial_,
ConvolutionSpecialization ConvSpecialization_,
@@ -50,9 +59,10 @@ template <index_t NDimSpatial_,
typename WeiLayout_,
typename DsLayout_,
typename OutLayout_,
index_t VectorSizeA_ = 1,
index_t VectorSizeB_ = 1,
index_t VectorSizeC_ = 1>
index_t VectorSizeA_ = 1,
index_t VectorSizeB_ = 1,
index_t VectorSizeC_ = 1,
typename CDElementwise_ = PassThrough>
struct GroupedConvTraits
{
private:
@@ -70,6 +80,7 @@ struct GroupedConvTraits
using WeiLayout = WeiLayout_;
using DsLayout = DsLayout_;
using OutLayout = OutLayout_;
using CDElementwise = CDElementwise_;
using GroupedConvImplicitGemmTraitsFwd =
TileGemmTraits<true,
true,