mirror of https://github.com/ROCm/composable_kernel.git

adding ThreadwiseGenericTensorSliceCopy_v1r2
@@ -449,18 +449,28 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer
                out_n0_n1_n2_k0_k1_k2_h_w_thread_desc.GetLengths(),
                arithmetic_sequence_gen<0, 8, 1>::type{},
                Number<1>{});
-#elif 1
-            ThreadwiseGenericTensorSliceCopy_v1<
+#elif 0
+            ThreadwiseGenericTensorSliceCopy_v1r1<
                decltype(out_n0_n1_n2_k0_k1_k2_h_w_thread_desc),
                decltype(out_n0_n1_n2_k0_k1_k2_h_w_global_mem_desc),
                decltype(out_n0_n1_n2_k0_k1_k2_h_w_thread_desc.GetLengths()),
                arithmetic_sequence_gen<0, 8, 1>::type,
                arithmetic_sequence_gen<0, 8, 1>::type,
                0,
                0,
                7,
                7,
                1,
-                1>({0, 0, 0, 0, 0, 0, 0, 0}, {0, 0, 0, 0, 0, 0, 0, 0})
+                1>(make_zero_array<index_t, 8>(), make_zero_array<index_t, 8>())
                .Run(p_out_thread, p_out_thread_on_global);
+#elif 1
+            ThreadwiseGenericTensorSliceCopy_v1r2<
+                decltype(out_n0_n1_n2_k0_k1_k2_h_w_thread_desc),
+                decltype(out_n0_n1_n2_k0_k1_k2_h_w_global_mem_desc),
+                decltype(out_n0_n1_n2_k0_k1_k2_h_w_thread_desc.GetLengths()),
+                arithmetic_sequence_gen<0, 8, 1>::type,
+                7,
+                1,
+                1>(make_zero_array<index_t, 8>(), make_zero_array<index_t, 8>())
+                .Run_non_static(p_out_thread, p_out_thread_on_global);
#elif 0
            ThreadwiseGenericTensorSliceCopy_v2<
                decltype(out_n0_n1_n2_k0_k1_k2_h_w_thread_desc),

@@ -245,7 +245,7 @@ struct BlockwiseGenericTensorSliceCopy_v1
        // dimensions need to be evenly divisible by their sub-lengths. Also, the repeat-length on
        // the merged dimension needs to be 1. These sanity checks are performed in the constructor
        // of BlockwiseGenericTensorSliceCopy_v1
-#if 0 // debug
+#if 0
            threadwise_generic_tensor_slice_copy_v1(SrcDesc{},
                                                    p_src + src_offset + mThreadSrcOffset,
                                                    make_zero_array<index_t, nDim>(),
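The sanity check named in the comment above is a compile-time divisibility test: each slice length must split exactly into per-thread sub-lengths, so the thread cluster tiles the slice with no remainder. A minimal standalone sketch of such a check (hypothetical names, not this library's API):

    template <int SliceLen, int SubLen>
    struct check_evenly_divisible
    {
        // a slice of SliceLen elements is copied SubLen elements at a time;
        // the division must be exact, or some elements would be skipped or
        // copied twice
        static_assert(SliceLen % SubLen == 0, "wrong! cannot evenly divide");
        static constexpr int repeat = SliceLen / SubLen; // copies per thread
    };

    // e.g. a 16-wide slice with 4-wide sub-lengths repeats 4 times
    static_assert(check_evenly_divisible<16, 4>::repeat == 4, "");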
@@ -255,18 +255,28 @@ struct BlockwiseGenericTensorSliceCopy_v1
                                                    thread_sub_tensor_lengths,
                                                    SrcDimAccessOrder{},
                                                    Number<SrcDataPerAccess>{});
-#else
-            ThreadwiseGenericTensorSliceCopy_v1<SrcDesc,
-                                                decltype(thread_buffer_desc),
-                                                SubLengths,
-                                                SrcDimAccessOrder,
-                                                typename arithmetic_sequence_gen<0, nDim, 1>::type,
-                                                SrcVectorAccessDim,
-                                                0,
-                                                SrcDataPerAccess,
-                                                1>(make_zero_array<index_t, nDim>(),
-                                                   make_zero_array<index_t, nDim>())
+#elif 0
+            ThreadwiseGenericTensorSliceCopy_v1r1<
+                SrcDesc,
+                decltype(thread_buffer_desc),
+                SubLengths,
+                SrcDimAccessOrder,
+                typename arithmetic_sequence_gen<0, nDim, 1>::type,
+                SrcVectorAccessDim,
+                0,
+                SrcDataPerAccess,
+                1>(make_zero_array<index_t, nDim>(), make_zero_array<index_t, nDim>())
+                .Run(p_src + src_offset + mThreadSrcOffset, p_buffer + buffer_offset);
+#elif 1
+            ThreadwiseGenericTensorSliceCopy_v1r2<SrcDesc,
+                                                  decltype(thread_buffer_desc),
+                                                  SubLengths,
+                                                  SrcDimAccessOrder,
+                                                  SrcVectorAccessDim,
+                                                  SrcDataPerAccess,
+                                                  1>(make_zero_array<index_t, nDim>(),
+                                                     make_zero_array<index_t, nDim>())
+                .Run_non_static(p_src + src_offset + mThreadSrcOffset, p_buffer + buffer_offset);
#endif
        });
    }
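The two instantiations above differ only in their template parameter lists: v1r2 drops the separate dst access order and dst vector-access dimension because, per the struct comments later in this diff, src and dst share one DimAccessOrder and one VectorAccessDim. Side by side, with the parameter names as declared in the two structs:

    // v1r1: separate access orders and vector dimensions for src and dst
    ThreadwiseGenericTensorSliceCopy_v1r1<SrcDesc, DstDesc, SliceLengths,
                                          SrcDimAccessOrder, DstDimAccessOrder,
                                          SrcVectorAccessDim, DstVectorAccessDim,
                                          SrcDataPerAccess, DstDataPerAccess>

    // v1r2: one shared access order and one shared vector dimension
    ThreadwiseGenericTensorSliceCopy_v1r2<SrcDesc, DstDesc, SliceLengths,
                                          DimAccessOrder, VectorAccessDim,
                                          SrcDataPerAccess, DstDataPerAccess>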
@@ -312,7 +322,7 @@ struct BlockwiseGenericTensorSliceCopy_v1
        // By setting SubLengths = 1 at the merged dimension, this is always true;
        // If, in the future, you want to enable SubLengths > 1 at the merged dimension,
        // special care in the implementation is needed
-#if 0 // debug
+#if 0
            threadwise_generic_tensor_slice_copy_v1(thread_buffer_desc,
                                                    p_buffer + buffer_offset,
                                                    make_zero_array<index_t, nDim>(),
@@ -322,18 +332,29 @@ struct BlockwiseGenericTensorSliceCopy_v1
                                                    thread_sub_tensor_lengths,
                                                    DstDimAccessOrder{},
                                                    Number<DstDataPerAccess>{});
-#else
-            ThreadwiseGenericTensorSliceCopy_v1<decltype(thread_buffer_desc),
-                                                DstDesc,
-                                                SubLengths,
-                                                typename arithmetic_sequence_gen<0, nDim, 1>::type,
-                                                DstDimAccessOrder,
-                                                0,
-                                                DstVectorAccessDim,
-                                                1,
-                                                DstDataPerAccess>(make_zero_array<index_t, nDim>(),
-                                                                  make_zero_array<index_t, nDim>())
+#elif 0
+            ThreadwiseGenericTensorSliceCopy_v1r1<
+                decltype(thread_buffer_desc),
+                DstDesc,
+                SubLengths,
+                typename arithmetic_sequence_gen<0, nDim, 1>::type,
+                DstDimAccessOrder,
+                0,
+                DstVectorAccessDim,
+                1,
+                DstDataPerAccess>(make_zero_array<index_t, nDim>(),
+                                  make_zero_array<index_t, nDim>())
+                .Run(p_buffer + buffer_offset, p_dst + dst_offset + mThreadDstOffset);
+#elif 1
+            ThreadwiseGenericTensorSliceCopy_v1r2<decltype(thread_buffer_desc),
+                                                  DstDesc,
+                                                  SubLengths,
+                                                  DstDimAccessOrder,
+                                                  DstVectorAccessDim,
+                                                  1,
+                                                  DstDataPerAccess>(make_zero_array<index_t, nDim>(),
+                                                                    make_zero_array<index_t, nDim>())
+                .Run_non_static(p_buffer + buffer_offset, p_dst + dst_offset + mThreadDstOffset);
#endif
        });
    }

@@ -106,7 +106,15 @@ __device__ void threadwise_generic_tensor_slice_copy_v1(
#endif
}

#if 1
// This threadwise copy allows vector access of src and dst.
// It allows the dimensions of vector access to be different on src and dst.
// It also allows the vector size to be different on src and dst.
// It also allows the order of access to be different on src and dst.
// It uses registers as a buffer to hold all data moving from src to dst.
// It is designed for copying a small amount of data, where src and dst are
// device memory or LDS.
// When copying a large amount of data, let's hope the compiler will reduce the
// registers used for the buffer.
template <class SrcDesc,
          class DstDesc,
          class SliceLengths,
@@ -116,12 +124,12 @@ template <class SrcDesc,
          index_t DstVectorAccessDim,
          index_t SrcDataPerAccess,
          index_t DstDataPerAccess>
-struct ThreadwiseGenericTensorSliceCopy_v1
+struct ThreadwiseGenericTensorSliceCopy_v1r1
{
    static constexpr index_t nDim = SliceLengths::GetSize();

-    __device__ constexpr ThreadwiseGenericTensorSliceCopy_v1(Array<index_t, nDim> src_slice_origin,
-                                                             Array<index_t, nDim> dst_slice_origin)
+    __device__ constexpr ThreadwiseGenericTensorSliceCopy_v1r1(
+        Array<index_t, nDim> src_slice_origin, Array<index_t, nDim> dst_slice_origin)
        : mSrcSliceOrigin(src_slice_origin), mDstSliceOrigin(dst_slice_origin)
    {
        static_assert(nDim == SrcDesc::GetNumOfDimension() &&
@@ -145,7 +153,7 @@ struct ThreadwiseGenericTensorSliceCopy_v1
        static_if<!SrcDesc::ContainMultipleOriginalDimensions(src_vector_access_dim)>{}(
            [&](auto fwd) {
                static_assert(
-                    (fwd(SrcDesc{}).GetStrides()[SrcVectorAccessDim] == 1 || SrcDataPerAccess == 1),
+                    (fwd(SrcDesc{}).GetStride(src_vector_access_dim) == 1 || SrcDataPerAccess == 1),
                    "wrong! vectorized access is allowed only if stride == 1");
            })
            .Else([&](auto fwd) {
@@ -158,7 +166,7 @@ struct ThreadwiseGenericTensorSliceCopy_v1
        static_if<!DstDesc::ContainMultipleOriginalDimensions(dst_vector_access_dim)>{}(
            [&](auto fwd) {
                static_assert(
-                    (fwd(DstDesc{}).GetStrides()[DstVectorAccessDim] == 1 || DstDataPerAccess == 1),
+                    (fwd(DstDesc{}).GetStride(dst_vector_access_dim) == 1 || DstDataPerAccess == 1),
                    "wrong! vectorized access is allowed only if stride == 1");
            })
            .Else([&](auto fwd) {
@@ -169,9 +177,9 @@ struct ThreadwiseGenericTensorSliceCopy_v1
        });
    }

-    __device__ constexpr ThreadwiseGenericTensorSliceCopy_v1()
-        : ThreadwiseGenericTensorSliceCopy_v1(make_zero_array<index_t, nDim>(),
-                                              make_zero_array<index_t, nDim>())
+    __device__ constexpr ThreadwiseGenericTensorSliceCopy_v1r1()
+        : ThreadwiseGenericTensorSliceCopy_v1r1(make_zero_array<index_t, nDim>(),
+                                                make_zero_array<index_t, nDim>())
    {
    }

@@ -205,12 +213,12 @@ struct ThreadwiseGenericTensorSliceCopy_v1
            SliceLengths::Get(src_vector_access_dim) / src_data_per_access);

        static_ford<decltype(src_access_lengths), SrcDimAccessOrder>{}([&](auto src_access_id) {
-            constexpr auto src_data_id = src_access_id.Modify(
+            constexpr auto src_data_begin_id = src_access_id.Modify(
                src_vector_access_dim,
                src_access_id[src_vector_access_dim] * src_data_per_access);

            const index_t src_offset =
-                SrcDesc::GetOffsetFromMultiIndex(mSrcSliceOrigin + src_data_id);
+                SrcDesc::GetOffsetFromMultiIndex(mSrcSliceOrigin + src_data_begin_id);

            // load vector from src
            const vector_t vector_data = *reinterpret_cast<const vector_t*>(&p_src[src_offset]);
@@ -222,7 +230,7 @@ struct ThreadwiseGenericTensorSliceCopy_v1
                    i);

                constexpr index_t buffer_offset =
-                    buffer_desc.GetOffsetFromMultiIndex(src_data_id + scalar_id);
+                    buffer_desc.GetOffsetFromMultiIndex(src_data_begin_id + scalar_id);

                p_buffer[buffer_offset] = reinterpret_cast<const TData*>(&vector_data)[i];
            });
@@ -241,7 +249,7 @@ struct ThreadwiseGenericTensorSliceCopy_v1
            SliceLengths::Get(dst_vector_access_dim) / dst_data_per_access);

        static_ford<decltype(dst_access_lengths), DstDimAccessOrder>{}([&](auto dst_access_id) {
-            constexpr auto dst_data_id = dst_access_id.Modify(
+            constexpr auto dst_data_begin_id = dst_access_id.Modify(
                dst_vector_access_dim,
                dst_access_id[dst_vector_access_dim] * dst_data_per_access);

@@ -254,13 +262,13 @@ struct ThreadwiseGenericTensorSliceCopy_v1
                    i);

                constexpr index_t buffer_offset =
-                    buffer_desc.GetOffsetFromMultiIndex(dst_data_id + scalar_id);
+                    buffer_desc.GetOffsetFromMultiIndex(dst_data_begin_id + scalar_id);

                reinterpret_cast<TData*>(&vector_data)[i] = p_buffer[buffer_offset];
            });

            const index_t dst_offset =
-                DstDesc::GetOffsetFromMultiIndex(mDstSliceOrigin + dst_data_id);
+                DstDesc::GetOffsetFromMultiIndex(mDstSliceOrigin + dst_data_begin_id);

            // store vector into dst
            *reinterpret_cast<vector_t*>(&p_dst[dst_offset]) = vector_data;
@@ -272,7 +280,196 @@ struct ThreadwiseGenericTensorSliceCopy_v1
    Array<index_t, nDim> mSrcSliceOrigin;
    Array<index_t, nDim> mDstSliceOrigin;
};
#endif
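The comment block above describes the v1r1 data path: the whole slice is staged through a thread-private register buffer, loaded entirely from src and then stored entirely to dst. Stripped of tensor descriptors and vectorization, the idea reduces to the following sketch (plain C++; the names and flat indexing are hypothetical simplifications, not this library's API):

    #include <cstddef>

    // Minimal model of the register-buffered copy: src -> buffer -> dst.
    // The real code computes src/dst offsets through tensor descriptors and
    // moves SrcDataPerAccess / DstDataPerAccess elements per memory access.
    template <typename T, std::size_t SliceSize>
    void two_phase_copy(const T* src, T* dst)
    {
        T buffer[SliceSize]; // stays in registers for small SliceSize

        for(std::size_t i = 0; i < SliceSize; ++i) // phase 1: drain src
            buffer[i] = src[i];

        for(std::size_t i = 0; i < SliceSize; ++i) // phase 2: fill dst
            dst[i] = buffer[i];
    }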

// This threadwise copy allows vector access of src and dst.
// It allows the vector size to be different on src and dst.
// The dimensions of vector access should be the same on src and dst.
// The dimension access order should be the same on src and dst.
// It is designed for cases where one of src and dst is registers, and
// the other is device memory or LDS.
template <class SrcDesc,
          class DstDesc,
          class SliceLengths,
          class DimAccessOrder,
          index_t VectorAccessDim,
          index_t SrcDataPerAccess,
          index_t DstDataPerAccess>
struct ThreadwiseGenericTensorSliceCopy_v1r2
{
    static constexpr index_t nDim = SliceLengths::GetSize();

    __device__ constexpr ThreadwiseGenericTensorSliceCopy_v1r2(
        Array<index_t, nDim> src_slice_origin, Array<index_t, nDim> dst_slice_origin)
        : mSrcSliceOrigin(src_slice_origin), mDstSliceOrigin(dst_slice_origin)
    {
        static_assert(nDim == SrcDesc::GetNumOfDimension() &&
                          nDim == DstDesc::GetNumOfDimension() && nDim == SliceLengths::GetSize() &&
                          nDim == DimAccessOrder::GetSize(),
                      "wrong! # of dimensions not the same");

        static_assert(is_valid_sequence_map<DimAccessOrder>::value, "wrong! map is not valid");

        static_assert(
            SliceLengths{}[VectorAccessDim] % math::lcm(SrcDataPerAccess, DstDataPerAccess) == 0,
            "wrong! cannot evenly divide");

        // check vectorized memory access
        constexpr auto vector_access_dim = Number<VectorAccessDim>{};

        static_if<!SrcDesc::ContainMultipleOriginalDimensions(vector_access_dim)>{}([&](auto fwd) {
            static_assert(
                (fwd(SrcDesc{}).GetStride(vector_access_dim) == 1 || SrcDataPerAccess == 1),
                "wrong! vectorized access is allowed only if stride == 1");
        }).Else([&](auto fwd) {
            static_assert((fwd(SrcDesc{}).GetLastOriginalDimensionStride(vector_access_dim) == 1 ||
                           SrcDataPerAccess == 1),
                          "wrong! vectorized access is allowed only if stride == 1");
        });

        static_if<!DstDesc::ContainMultipleOriginalDimensions(vector_access_dim)>{}([&](auto fwd) {
            static_assert(
                (fwd(DstDesc{}).GetStride(vector_access_dim) == 1 || DstDataPerAccess == 1),
                "wrong! vectorized access is allowed only if stride == 1");
        }).Else([&](auto fwd) {
            static_assert((fwd(DstDesc{}).GetLastOriginalDimensionStride(vector_access_dim) == 1 ||
                           DstDataPerAccess == 1),
                          "wrong! vectorized access is allowed only if stride == 1");
        });
    }

    __device__ constexpr ThreadwiseGenericTensorSliceCopy_v1r2()
        : ThreadwiseGenericTensorSliceCopy_v1r2(make_zero_array<index_t, nDim>(),
                                                make_zero_array<index_t, nDim>())
    {
    }

    __device__ void SetSrcSliceOrigin(Array<index_t, nDim> src_slice_origin)
    {
        mSrcSliceOrigin = src_slice_origin;
    }

    __device__ void SetDstSliceOrigin(Array<index_t, nDim> dst_slice_origin)
    {
        mDstSliceOrigin = dst_slice_origin;
    }

    template <class TData>
    __device__ void Run(const TData* p_src, TData* p_dst) const
    {
        using src_vector_t = typename vector_type<TData, SrcDataPerAccess>::MemoryType;
        using dst_vector_t = typename vector_type<TData, DstDataPerAccess>::MemoryType;

        constexpr auto vector_access_dim = Number<VectorAccessDim>{};

        constexpr auto src_data_per_access = Number<SrcDataPerAccess>{};
        constexpr auto dst_data_per_access = Number<DstDataPerAccess>{};

        constexpr auto long_vector_size = Number<math::lcm(SrcDataPerAccess, DstDataPerAccess)>{};

        constexpr auto long_vector_access_lengths = SliceLengths::Modify(
            vector_access_dim, SliceLengths::Get(vector_access_dim) / long_vector_size);

        static_ford<decltype(long_vector_access_lengths), DimAccessOrder>{}([&](
            auto long_vector_access_id) {
            // data id w.r.t. slicing-window
            constexpr auto long_vector_data_begin_id = long_vector_access_id.Modify(
                vector_access_dim, long_vector_access_id[vector_access_dim] * long_vector_size);

            // buffer to hold a long-vector
            TData p_long_vector[long_vector_size];

            // load data from src to the long-vector buffer
            static_for<0, long_vector_size / src_data_per_access, 1>{}([&](auto i) {
                constexpr auto scalar_id = typename uniform_sequence_gen<nDim, 0>::type{}.Modify(
                    vector_access_dim, i * src_data_per_access);

                const index_t src_offset = SrcDesc::GetOffsetFromMultiIndex(
                    mSrcSliceOrigin + (long_vector_data_begin_id + scalar_id));

                constexpr index_t buffer_offset = i * src_data_per_access;

                *reinterpret_cast<src_vector_t*>(&p_long_vector[buffer_offset]) =
                    *reinterpret_cast<const src_vector_t*>(&p_src[src_offset]);
            });

            // store data from the long-vector buffer to dst
            static_for<0, long_vector_size / dst_data_per_access, 1>{}([&](auto i) {
                constexpr auto scalar_id = typename uniform_sequence_gen<nDim, 0>::type{}.Modify(
                    vector_access_dim, i * dst_data_per_access);

                constexpr index_t buffer_offset = i * dst_data_per_access;

                const index_t dst_offset = DstDesc::GetOffsetFromMultiIndex(
                    mDstSliceOrigin + (long_vector_data_begin_id + scalar_id));

                *reinterpret_cast<dst_vector_t*>(&p_dst[dst_offset]) =
                    *reinterpret_cast<dst_vector_t*>(&p_long_vector[buffer_offset]);
            });
        });
    }
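    // Run_non_static below follows the same load/store pattern as Run()
    // above, but walks the access space with runtime ford loops and runtime
    // (index_t) offsets instead of compile-time static_ford / static_for
    // unrolling, so it can be used when the access indices are not
    // compile-time constants.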
    template <class TData>
    __device__ void Run_non_static(const TData* p_src, TData* p_dst) const
    {
        using src_vector_t = typename vector_type<TData, SrcDataPerAccess>::MemoryType;
        using dst_vector_t = typename vector_type<TData, DstDataPerAccess>::MemoryType;

        constexpr auto vector_access_dim = Number<VectorAccessDim>{};

        constexpr auto src_data_per_access = Number<SrcDataPerAccess>{};
        constexpr auto dst_data_per_access = Number<DstDataPerAccess>{};

        constexpr auto long_vector_size = Number<math::lcm(SrcDataPerAccess, DstDataPerAccess)>{};

        constexpr auto long_vector_access_lengths = SliceLengths::Modify(
            vector_access_dim, SliceLengths::Get(vector_access_dim) / long_vector_size);

        ford<decltype(long_vector_access_lengths), DimAccessOrder>{}(
            [&](auto long_vector_access_id) {
                // data id w.r.t. slicing-window
                auto long_vector_data_begin_id = long_vector_access_id;
                long_vector_data_begin_id(vector_access_dim) =
                    long_vector_size * long_vector_access_id[vector_access_dim];

                // buffer to hold a long-vector
                TData p_long_vector[long_vector_size];

                // load data from src to the long-vector buffer
                for(index_t i = 0; i < long_vector_size / src_data_per_access; ++i)
                {
                    auto scalar_id = make_zero_array<index_t, nDim>();
                    scalar_id(vector_access_dim) = i * src_data_per_access;

                    const index_t src_offset = SrcDesc::GetOffsetFromMultiIndex(
                        mSrcSliceOrigin + (long_vector_data_begin_id + scalar_id));

                    const index_t buffer_offset = i * src_data_per_access;

                    *reinterpret_cast<src_vector_t*>(&p_long_vector[buffer_offset]) =
                        *reinterpret_cast<const src_vector_t*>(&p_src[src_offset]);
                }

                // store data from the long-vector buffer to dst
                for(index_t i = 0; i < long_vector_size / dst_data_per_access; ++i)
                {
                    auto scalar_id = make_zero_array<index_t, nDim>();
                    scalar_id(vector_access_dim) = i * dst_data_per_access;

                    const index_t buffer_offset = i * dst_data_per_access;

                    const index_t dst_offset = DstDesc::GetOffsetFromMultiIndex(
                        mDstSliceOrigin + (long_vector_data_begin_id + scalar_id));

                    *reinterpret_cast<dst_vector_t*>(&p_dst[dst_offset]) =
                        *reinterpret_cast<dst_vector_t*>(&p_long_vector[buffer_offset]);
                }
            });
    }

    private:
    Array<index_t, nDim> mSrcSliceOrigin;
    Array<index_t, nDim> mDstSliceOrigin;
};
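In both Run() and Run_non_static(), long_vector_size = lcm(SrcDataPerAccess, DstDataPerAccess), so one register-resident long vector is filled by whole src accesses and drained by whole dst accesses. A small self-contained illustration of that bookkeeping (plain C++; gcd/lcm here are stand-ins for math::lcm, used only for this example):

    // Stand-ins for math::lcm, for this example only.
    constexpr int gcd(int a, int b) { return b == 0 ? a : gcd(b, a % b); }
    constexpr int lcm(int a, int b) { return a / gcd(a, b) * b; }

    // Example: 4-wide src reads, 2-wide dst writes.
    constexpr int SrcDataPerAccess   = 4;
    constexpr int DstDataPerAccess   = 2;
    constexpr int long_vector_size   = lcm(SrcDataPerAccess, DstDataPerAccess); // == 4

    // Each long vector is filled by one 4-wide src load ...
    static_assert(long_vector_size / SrcDataPerAccess == 1, "one src access");
    // ... and drained by two 2-wide dst stores.
    static_assert(long_vector_size / DstDataPerAccess == 2, "two dst accesses");
    // The slice length along VectorAccessDim must divide evenly, e.g. 8 % 4 == 0,
    // which is exactly what the "cannot evenly divide" static_assert enforces.
    static_assert(8 % long_vector_size == 0, "cannot evenly divide");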

template <class SrcDesc,
          class DstDesc,

@@ -59,7 +59,7 @@ void device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw(InDesc,

    constexpr index_t B = (N * Ho * Wo) / (N1 * N2);

-#if 1
+#if 0
    // each thread holds 64 data
    constexpr index_t BlockSize = 256;

@@ -112,14 +112,14 @@ void device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw(InDesc,
    constexpr index_t GemmDataPerReadA = 4;
    constexpr index_t GemmDataPerReadB = 4;

-    using InBlockCopySubLengths_E_N1_B_N2 = Sequence<1, 1, 1, 4>;
-    using InBlockCopyClusterLengths_E_N1_B_N2 = Sequence<8, 2, 16, 1>;
+    using InBlockCopySubLengths_E_N1_B_N2 = Sequence<1, 1, 4, 1>;
+    using InBlockCopyClusterLengths_E_N1_B_N2 = Sequence<8, 2, 4, 4>;
    using InBlockCopyThreadClusterArrangeOrder = Sequence<0, 1, 3, 2>; // [E, N1, N2, B]
    using InBlockCopySrcAccessOrder = Sequence<0, 1, 3, 2>;            // [E, N1, N2, B]
    using InBlockCopyDstAccessOrder = Sequence<0, 1, 2, 3>;            // [E, N1, B, N2]

-    constexpr index_t InBlockCopySrcDataPerRead_B = 1;
-    constexpr index_t InBlockCopyDstDataPerWrite_N2 = 4;
+    constexpr index_t InBlockCopySrcDataPerRead_B = 4;
+    constexpr index_t InBlockCopyDstDataPerWrite_N2 = 1;

    using WeiBlockCopySubLengths_E_K = Sequence<2, 2>;
    using WeiBlockCopyClusterLengths_E_K = Sequence<4, 64>;
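As a consistency check on the input-copy change: per dimension, SubLengths × ClusterLengths gives the block tile covered in one pass. The old pair <1, 1, 1, 4> × <8, 2, 16, 1> and the new pair <1, 1, 4, 1> × <8, 2, 4, 4> both cover an [E, N1, B, N2] = [8, 2, 16, 4] tile, with 8·2·16·1 = 8·2·4·4 = 256 threads, matching BlockSize = 256. What moves is where each thread's 4 contiguous elements lie: along N2 before (hence InBlockCopyDstDataPerWrite_N2 = 4), along B after (hence InBlockCopySrcDataPerRead_B = 4), presumably trading vectorized LDS writes for vectorized global-memory reads.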