Add elementwise with dynamic vector dim (#1198)

* Add elementwise with dynamic vector dim * Reduce number of instances * Fixes * Fixes [ROCm/composable_kernel commit: 9c052804a7]
2026-05-17 03:19:48 +00:00 · 2024-03-22 10:40:43 +01:00
parent 5f84554b12
commit aa64a8be0a
28 changed files with 2157 additions and 359 deletions
--- a/example/44_elementwise_permute/elementwise_permute_4D_fp16.cpp
+++ b/example/44_elementwise_permute/elementwise_permute_4D_fp16.cpp
@@ -6,7 +6,7 @@

 #include "ck/ck.hpp"
 #include "ck/tensor_operation/gpu/element/binary_element_wise_operation.hpp"
-#include "ck/tensor_operation/gpu/device/impl/device_elementwise_impl.hpp"
+#include "ck/tensor_operation/gpu/device/impl/device_elementwise_dynamic_vector_dims_impl.hpp"

 #include "ck/library/utility/algorithm.hpp"
 #include "ck/library/utility/check_err.hpp"
@@ -20,15 +20,20 @@ using F32 = float;
 using ADataType = F16;
 using BDataType = F16;

-using PassThrough = ck::tensor_operation::element_wise::PassThrough;
-using DeviceElementwisePermuteInstance =
-    ck::tensor_operation::device::DeviceElementwiseImpl<ck::Tuple<ADataType>, // InDataTypeTuple
-                                                        ck::Tuple<BDataType>, // OutDataTypeTuple
-                                                        PassThrough,          // Elementwise op
-                                                        4,                    // NumDim
-                                                        8,                    // MPerThread
-                                                        ck::Sequence<8>,  // InScalarPerVectorSeq
-                                                        ck::Sequence<1>>; // OutScalarPerVectorSeq
+using PassThrough                      = ck::tensor_operation::element_wise::PassThrough;
+using DeviceElementwisePermuteInstance = ck::tensor_operation::device::DeviceElementwiseImpl<
+    ck::Tuple<ADataType>, // InDataTypeTuple
+    ck::Tuple<BDataType>, // OutDataTypeTuple
+    PassThrough,          // Elementwise
+    4,                    // NumDim
+    256,                  // BlockSize
+    128,                  // M0PerBlock
+    128,                  // M1PerBlock
+    8,                    // M0PerThread
+    8,                    // M1PerThread
+    ck::Sequence<1, 0>,   // ThreadClusterArrangeOrder
+    ck::Sequence<8>,      // InScalarPerVectorSeq
+    ck::Sequence<8>>;     // OutScalarPerVectorSeq

 template <typename HostTensorA, typename HostTensorB, typename Functor>
 void host_elementwise4D(HostTensorB& B_nhwc, const HostTensorA& A_nchw, Functor functor)
--- a/example/44_elementwise_permute/elementwise_permute_4D_fp16_col.cpp
+++ b/example/44_elementwise_permute/elementwise_permute_4D_fp16_col.cpp
@@ -7,7 +7,7 @@

 #include "ck/ck.hpp"
 #include "ck/tensor_operation/gpu/element/binary_element_wise_operation.hpp"
-#include "ck/tensor_operation/gpu/device/impl/device_elementwise_scale_impl.hpp"
+#include "ck/tensor_operation/gpu/device/impl/device_elementwise_dynamic_vector_dims_impl.hpp"

 #include "ck/library/utility/algorithm.hpp"
 #include "ck/library/utility/check_err.hpp"
@@ -21,26 +21,23 @@ using F32 = float;
 using ADataType = F16;
 using BDataType = F16;

-using PassThrough = ck::tensor_operation::element_wise::PassThrough;
-using UnaryOp     = ck::tensor_operation::element_wise::UnarySquare;
-using Scale       = ck::tensor_operation::element_wise::Scale;
-using DeviceElementwisePermuteInstance =
-    ck::tensor_operation::device::DeviceElementwiseImpl<ck::Tuple<ADataType>, // InDataTypeTuple
-                                                        ck::Tuple<BDataType>, // OutDataTypeTuple
-                                                        PassThrough,          // ElementwiseOp
-                                                        UnaryOp,              // UnaryOp
-                                                        Scale,                // Scalar
-                                                        4,                    // NumDim
-                                                        8,                    // MPerThread
-                                                        ck::Sequence<1>,  // InScalarPerVectorSeq
-                                                        ck::Sequence<1>>; // OutScalarPerVectorSeq
+using UnaryOp                          = ck::tensor_operation::element_wise::Scale;
+using DeviceElementwisePermuteInstance = ck::tensor_operation::device::DeviceElementwiseImpl<
+    ck::Tuple<ADataType>, // InDataTypeTuple
+    ck::Tuple<BDataType>, // OutDataTypeTuple
+    UnaryOp,              // UnaryOp
+    4,                    // NumDim
+    256,                  // BlockSize
+    128,                  // M0PerBlock
+    128,                  // M1PerBlock
+    8,                    // M0PerThread
+    8,                    // M1PerThread
+    ck::Sequence<1, 0>,   // ThreadClusterArrangeOrder
+    ck::Sequence<8>,      // InScalarPerVectorSeq
+    ck::Sequence<8>>;     // OutScalarPerVectorSeq

-template <typename HostTensorA, typename HostTensorB, typename FunctorA, typename FunctorB>
-void host_elementwise4D(HostTensorB& B_nhwc,
-                        const HostTensorA& A_nchw,
-                        FunctorA functor_a,
-                        FunctorB functor_b,
-                        float scale)
+template <typename HostTensorA, typename HostTensorB, typename Functor>
+void host_elementwise4D(HostTensorB& B_nhwc, const HostTensorA& A_nchw, Functor functor)
 {
    std::size_t N = A_nchw.mDesc.GetLengths()[0];
    std::size_t C = A_nchw.mDesc.GetLengths()[1];
@@ -51,11 +48,8 @@ void host_elementwise4D(HostTensorB& B_nhwc,
            for(std::size_t c = 0; c < C; ++c)
                for(std::size_t n = 0; n < N; ++n)
                {
-                    ADataType tmp_val;
                    auto a_val = A_nchw.mData[(n) + (c * N) + (h * C * N) + (w * H * C * N)];
-                    functor_b(tmp_val, a_val);
-                    functor_a(B_nhwc.mData[(n) + (c * W * H * N) + (h * N) + (w * H * N)],
-                              scale * tmp_val);
+                    functor(B_nhwc.mData[(n) + (c * W * H * N) + (h * N) + (w * H * N)], a_val);
                }
 }

@@ -104,14 +98,8 @@ int main()
    ck::ranges::copy(nchw, ab_lengths.begin());

    auto broadcastPermute = DeviceElementwisePermuteInstance{};
-    auto argument         = broadcastPermute.MakeArgumentPointer(ab_lengths,
-                                                         {a_strides},
-                                                         {b_strides},
-                                                         input,
-                                                         output,
-                                                         PassThrough{},
-                                                         UnaryOp{},
-                                                         Scale{scale});
+    auto argument         = broadcastPermute.MakeArgumentPointer(
+        ab_lengths, {a_strides}, {b_strides}, input, output, UnaryOp{scale});

    if(!broadcastPermute.IsSupportedArgument(argument.get()))
    {
@@ -143,7 +131,7 @@ int main()
    {
        b_device_buf.FromDevice(b.mData.data());
        Tensor<BDataType> host_b(nhwc);
-        host_elementwise4D(host_b, a, PassThrough{}, UnaryOp{}, scale);
+        host_elementwise4D(host_b, a, UnaryOp{scale});

        pass &=
            ck::utils::check_err(b.mData, host_b.mData, "Error: Incorrect results b", 1e-3, 1e-3);
--- a/example/44_elementwise_permute/elementwise_permute_4D_fp16_row.cpp
+++ b/example/44_elementwise_permute/elementwise_permute_4D_fp16_row.cpp
@@ -6,7 +6,7 @@

 #include "ck/ck.hpp"
 #include "ck/tensor_operation/gpu/element/binary_element_wise_operation.hpp"
-#include "ck/tensor_operation/gpu/device/impl/device_elementwise_scale_impl.hpp"
+#include "ck/tensor_operation/gpu/device/impl/device_elementwise_dynamic_vector_dims_impl.hpp"

 #include "ck/library/utility/algorithm.hpp"
 #include "ck/library/utility/check_err.hpp"
@@ -20,36 +20,31 @@ using F32 = float;
 using ADataType = F16;
 using BDataType = F16;

-using PassThrough = ck::tensor_operation::element_wise::PassThrough;
-using UnaryOp     = ck::tensor_operation::element_wise::UnarySquare;
-using Scale       = ck::tensor_operation::element_wise::Scale;
-using DeviceElementwisePermuteInstance =
-    ck::tensor_operation::device::DeviceElementwiseImpl<ck::Tuple<ADataType>, // InDataTypeTuple
-                                                        ck::Tuple<BDataType>, // OutDataTypeTuple
-                                                        PassThrough,          // ElementwiseOp
-                                                        UnaryOp,              // UnaryOp
-                                                        Scale,                // Scalar
-                                                        4,                    // NumDim
-                                                        8,                    // MPerThread
-                                                        ck::Sequence<8>,  // InScalarPerVectorSeq
-                                                        ck::Sequence<1>>; // OutScalarPerVectorSeq
+using UnaryOp                          = ck::tensor_operation::element_wise::Scale;
+using DeviceElementwisePermuteInstance = ck::tensor_operation::device::DeviceElementwiseImpl<
+    ck::Tuple<ADataType>, // InDataTypeTuple
+    ck::Tuple<BDataType>, // OutDataTypeTuple
+    UnaryOp,              // UnaryOp
+    4,                    // NumDim
+    256,                  // BlockSize
+    128,                  // M0PerBlock
+    128,                  // M1PerBlock
+    8,                    // M0PerThread
+    8,                    // M1PerThread
+    ck::Sequence<1, 0>,   // ThreadClusterArrangeOrder
+    ck::Sequence<8>,      // InScalarPerVectorSeq
+    ck::Sequence<8>>;     // OutScalarPerVectorSeq

-template <typename HostTensorA, typename HostTensorB, typename FunctorA, typename FunctorB>
-void host_elementwise4D(HostTensorB& B_nhwc,
-                        const HostTensorA& A_nchw,
-                        FunctorA functor_a,
-                        FunctorB functor_b,
-                        float scale)
+template <typename HostTensorA, typename HostTensorB, typename Functor>
+void host_elementwise4D(HostTensorB& B_nhwc, const HostTensorA& A_nchw, Functor functor)
 {
    for(std::size_t n = 0; n < A_nchw.mDesc.GetLengths()[0]; ++n)
        for(std::size_t c = 0; c < A_nchw.mDesc.GetLengths()[1]; ++c)
            for(std::size_t h = 0; h < A_nchw.mDesc.GetLengths()[2]; ++h)
                for(std::size_t w = 0; w < A_nchw.mDesc.GetLengths()[3]; ++w)
                {
-                    ADataType tmp_val;
                    auto a_val = A_nchw(n, c, h, w);
-                    functor_b(tmp_val, a_val);
-                    functor_a(B_nhwc(n, h, w, c), scale * tmp_val);
+                    functor(B_nhwc(n, h, w, c), a_val);
                }
 }

@@ -86,14 +81,8 @@ int main()
    ck::ranges::copy(nchw, ab_lengths.begin());

    auto broadcastPermute = DeviceElementwisePermuteInstance{};
-    auto argument         = broadcastPermute.MakeArgumentPointer(ab_lengths,
-                                                         {a_strides},
-                                                         {b_strides},
-                                                         input,
-                                                         output,
-                                                         PassThrough{},
-                                                         UnaryOp{},
-                                                         Scale{scale});
+    auto argument         = broadcastPermute.MakeArgumentPointer(
+        ab_lengths, {a_strides}, {b_strides}, input, output, UnaryOp{scale});

    if(!broadcastPermute.IsSupportedArgument(argument.get()))
    {
@@ -125,7 +114,7 @@ int main()
    {
        b_device_buf.FromDevice(b.mData.data());
        Tensor<BDataType> host_b(nhwc);
-        host_elementwise4D(host_b, a, PassThrough{}, UnaryOp{}, scale);
+        host_elementwise4D(host_b, a, UnaryOp{scale});

        pass &=
            ck::utils::check_err(b.mData, host_b.mData, "Error: Incorrect results b", 1e-3, 1e-3);
--- a/example/44_elementwise_permute/elementwise_permute_4D_fp32_col.cpp
+++ b/example/44_elementwise_permute/elementwise_permute_4D_fp32_col.cpp
@@ -6,7 +6,7 @@

 #include "ck/ck.hpp"
 #include "ck/tensor_operation/gpu/element/binary_element_wise_operation.hpp"
-#include "ck/tensor_operation/gpu/device/impl/device_elementwise_scale_impl.hpp"
+#include "ck/tensor_operation/gpu/device/impl/device_elementwise_dynamic_vector_dims_impl.hpp"

 #include "ck/library/utility/algorithm.hpp"
 #include "ck/library/utility/check_err.hpp"
@@ -20,26 +20,23 @@ using F32 = float;
 using ADataType = F32;
 using BDataType = F32;

-using PassThrough = ck::tensor_operation::element_wise::PassThrough;
-using UnaryOp     = ck::tensor_operation::element_wise::UnarySquare;
-using Scale       = ck::tensor_operation::element_wise::Scale;
-using DeviceElementwisePermuteInstance =
-    ck::tensor_operation::device::DeviceElementwiseImpl<ck::Tuple<ADataType>, // InDataTypeTuple
-                                                        ck::Tuple<BDataType>, // OutDataTypeTuple
-                                                        PassThrough,          // ElementwiseOp
-                                                        UnaryOp,              // UnaryOp
-                                                        Scale,                // Scalar
-                                                        4,                    // NumDim
-                                                        1,                    // MPerThread
-                                                        ck::Sequence<1>,  // InScalarPerVectorSeq
-                                                        ck::Sequence<1>>; // OutScalarPerVectorSeq
+using UnaryOp                          = ck::tensor_operation::element_wise::Scale;
+using DeviceElementwisePermuteInstance = ck::tensor_operation::device::DeviceElementwiseImpl<
+    ck::Tuple<ADataType>, // InDataTypeTuple
+    ck::Tuple<BDataType>, // OutDataTypeTuple
+    UnaryOp,              // UnaryOp
+    4,                    // NumDim
+    256,                  // BlockSize
+    128,                  // M0PerBlock
+    128,                  // M1PerBlock
+    8,                    // M0PerThread
+    8,                    // M1PerThread
+    ck::Sequence<1, 0>,   // ThreadClusterArrangeOrder
+    ck::Sequence<1>,      // InScalarPerVectorSeq
+    ck::Sequence<1>>;     // OutScalarPerVectorSeq

-template <typename HostTensorA, typename HostTensorB, typename FunctorA, typename FunctorB>
-void host_elementwise4D(HostTensorB& B_nhwc,
-                        const HostTensorA& A_nchw,
-                        FunctorA functor_a,
-                        FunctorB functor_b,
-                        float scale)
+template <typename HostTensorA, typename HostTensorB, typename Functor>
+void host_elementwise4D(HostTensorB& B_nhwc, const HostTensorA& A_nchw, Functor functor)
 {
    std::size_t N = A_nchw.mDesc.GetLengths()[0];
    std::size_t C = A_nchw.mDesc.GetLengths()[1];
@@ -50,11 +47,8 @@ void host_elementwise4D(HostTensorB& B_nhwc,
            for(std::size_t c = 0; c < C; ++c)
                for(std::size_t n = 0; n < N; ++n)
                {
-                    ADataType tmp_val;
                    auto a_val = A_nchw.mData[(n) + (c * N) + (h * C * N) + (w * H * C * N)];
-                    functor_b(tmp_val, a_val);
-                    functor_a(B_nhwc.mData[(n) + (c * W * H * N) + (h * N) + (w * H * N)],
-                              scale * tmp_val);
+                    functor(B_nhwc.mData[(n) + (c * W * H * N) + (h * N) + (w * H * N)], a_val);
                }
 }

@@ -104,14 +98,8 @@ int main()
    ck::ranges::copy(nchw, ab_lengths.begin());

    auto broadcastPermute = DeviceElementwisePermuteInstance{};
-    auto argument         = broadcastPermute.MakeArgumentPointer(ab_lengths,
-                                                         {a_strides},
-                                                         {b_strides},
-                                                         input,
-                                                         output,
-                                                         PassThrough{},
-                                                         UnaryOp{},
-                                                         Scale{scale});
+    auto argument         = broadcastPermute.MakeArgumentPointer(
+        ab_lengths, {a_strides}, {b_strides}, input, output, UnaryOp{scale});

    if(!broadcastPermute.IsSupportedArgument(argument.get()))
    {
@@ -143,7 +131,7 @@ int main()
    {
        b_device_buf.FromDevice(b.mData.data());
        Tensor<BDataType> host_b(nhwc);
-        host_elementwise4D(host_b, a, PassThrough{}, UnaryOp{}, scale);
+        host_elementwise4D(host_b, a, UnaryOp{scale});

        pass &=
            ck::utils::check_err(b.mData, host_b.mData, "Error: Incorrect results b", 1e-3, 1e-3);
--- a/example/44_elementwise_permute/elementwise_permute_4D_fp32_row.cpp
+++ b/example/44_elementwise_permute/elementwise_permute_4D_fp32_row.cpp
@@ -6,7 +6,7 @@

 #include "ck/ck.hpp"
 #include "ck/tensor_operation/gpu/element/binary_element_wise_operation.hpp"
-#include "ck/tensor_operation/gpu/device/impl/device_elementwise_scale_impl.hpp"
+#include "ck/tensor_operation/gpu/device/impl/device_elementwise_dynamic_vector_dims_impl.hpp"

 #include "ck/library/utility/algorithm.hpp"
 #include "ck/library/utility/check_err.hpp"
@@ -20,36 +20,31 @@ using F32 = float;
 using ADataType = F32;
 using BDataType = F32;

-using PassThrough = ck::tensor_operation::element_wise::PassThrough;
-using UnaryOp     = ck::tensor_operation::element_wise::UnarySquare;
-using Scale       = ck::tensor_operation::element_wise::Scale;
-using DeviceElementwisePermuteInstance =
-    ck::tensor_operation::device::DeviceElementwiseImpl<ck::Tuple<ADataType>, // InDataTypeTuple
-                                                        ck::Tuple<BDataType>, // OutDataTypeTuple
-                                                        PassThrough,          // ElementwiseOp
-                                                        UnaryOp,              // UnaryOp
-                                                        Scale,                // Scalar
-                                                        4,                    // NumDim
-                                                        8,                    // MPerThread
-                                                        ck::Sequence<8>,  // InScalarPerVectorSeq
-                                                        ck::Sequence<1>>; // OutScalarPerVectorSeq
+using UnaryOp                          = ck::tensor_operation::element_wise::Scale;
+using DeviceElementwisePermuteInstance = ck::tensor_operation::device::DeviceElementwiseImpl<
+    ck::Tuple<ADataType>, // InDataTypeTuple
+    ck::Tuple<BDataType>, // OutDataTypeTuple
+    UnaryOp,              // UnaryOp
+    4,                    // NumDim
+    256,                  // BlockSize
+    128,                  // M0PerBlock
+    128,                  // M1PerBlock
+    8,                    // M0PerThread
+    8,                    // M1PerThread
+    ck::Sequence<1, 0>,   // ThreadClusterArrangeOrder
+    ck::Sequence<8>,      // InScalarPerVectorSeq
+    ck::Sequence<8>>;     // OutScalarPerVectorSeq

-template <typename HostTensorA, typename HostTensorB, typename FunctorA, typename FunctorB>
-void host_elementwise4D(HostTensorB& B_nhwc,
-                        const HostTensorA& A_nchw,
-                        FunctorA functor_a,
-                        FunctorB functor_b,
-                        float scale)
+template <typename HostTensorA, typename HostTensorB, typename Functor>
+void host_elementwise4D(HostTensorB& B_nhwc, const HostTensorA& A_nchw, Functor functor)
 {
    for(std::size_t n = 0; n < A_nchw.mDesc.GetLengths()[0]; ++n)
        for(std::size_t c = 0; c < A_nchw.mDesc.GetLengths()[1]; ++c)
            for(std::size_t h = 0; h < A_nchw.mDesc.GetLengths()[2]; ++h)
                for(std::size_t w = 0; w < A_nchw.mDesc.GetLengths()[3]; ++w)
                {
-                    ADataType tmp_val;
                    auto a_val = A_nchw(n, c, h, w);
-                    functor_b(tmp_val, a_val);
-                    functor_a(B_nhwc(n, h, w, c), scale * tmp_val);
+                    functor(B_nhwc(n, h, w, c), a_val);
                }
 }

@@ -86,14 +81,8 @@ int main()
    ck::ranges::copy(nchw, ab_lengths.begin());

    auto broadcastPermute = DeviceElementwisePermuteInstance{};
-    auto argument         = broadcastPermute.MakeArgumentPointer(ab_lengths,
-                                                         {a_strides},
-                                                         {b_strides},
-                                                         input,
-                                                         output,
-                                                         PassThrough{},
-                                                         UnaryOp{},
-                                                         Scale{scale});
+    auto argument         = broadcastPermute.MakeArgumentPointer(
+        ab_lengths, {a_strides}, {b_strides}, input, output, UnaryOp{scale});

    if(!broadcastPermute.IsSupportedArgument(argument.get()))
    {
@@ -125,7 +114,7 @@ int main()
    {
        b_device_buf.FromDevice(b.mData.data());
        Tensor<BDataType> host_b(nhwc);
-        host_elementwise4D(host_b, a, PassThrough{}, UnaryOp{}, scale);
+        host_elementwise4D(host_b, a, UnaryOp{scale});

        pass &=
            ck::utils::check_err(b.mData, host_b.mData, "Error: Incorrect results b", 1e-3, 1e-3);