use vectorized read and write for threadwise generic tensor copy

2026-05-11 17:00:18 +00:00 · 2019-06-03 18:58:01 -05:00
parent 33c38e2a23
commit 917d7a2b1d
11 changed files with 177 additions and 123 deletions
--- a/src/include/blockwise_generic_tensor_slice_op.hip.hpp
+++ b/src/include/blockwise_generic_tensor_slice_op.hip.hpp
@@ -15,7 +15,7 @@ template <index_t BlockSize,
          class SrcAccessOrder,
          class DstAccessOrder,
          index_t SrcDataPerRead,
-          index_t DstDataPerRead>
+          index_t DstDataPerWrite>
 struct BlockwiseGenericTensorSliceCopy_v1
 {
    static constexpr index_t nDim = SrcDesc::GetNumOfDimension();
@@ -217,14 +217,15 @@ struct BlockwiseGenericTensorSliceCopy_v1
            const index_t clipboard_offset = thread_tensor_desc.GetOffsetFromMultiIndex(
                clipboard_data_multi_id_begin); // cannot not constexpr, why?

-            threadwise_generic_tensor_slice_copy(SrcDesc{},
-                                                 p_src + src_offset + mThreadSrcOffset,
-                                                 make_zero_array<index_t, nDim>(),
-                                                 thread_tensor_desc,
-                                                 p_clipboard + clipboard_offset,
-                                                 make_zero_array<index_t, nDim>(),
-                                                 thread_sub_tensor_lengths,
-                                                 SrcAccessOrder{});
+            threadwise_generic_tensor_slice_copy_v1(SrcDesc{},
+                                                    p_src + src_offset + mThreadSrcOffset,
+                                                    make_zero_array<index_t, nDim>(),
+                                                    thread_tensor_desc,
+                                                    p_clipboard + clipboard_offset,
+                                                    make_zero_array<index_t, nDim>(),
+                                                    thread_sub_tensor_lengths,
+                                                    SrcAccessOrder{},
+                                                    Number<SrcDataPerRead>{});
        });
    }

@@ -255,14 +256,15 @@ struct BlockwiseGenericTensorSliceCopy_v1
            const index_t dst_offset = DstDesc{}.GetOffsetFromMultiIndex(
                dst_data_multi_id_begin); // cannot not constexpr, why?

-            threadwise_generic_tensor_slice_copy(thread_tensor_desc,
-                                                 p_clipboard + clipboard_offset,
-                                                 make_zero_array<index_t, nDim>(),
-                                                 DstDesc{},
-                                                 p_dst + dst_offset + mThreadDstOffset,
-                                                 make_zero_array<index_t, nDim>(),
-                                                 thread_sub_tensor_lengths,
-                                                 DstAccessOrder{});
+            threadwise_generic_tensor_slice_copy_v1(thread_tensor_desc,
+                                                    p_clipboard + clipboard_offset,
+                                                    make_zero_array<index_t, nDim>(),
+                                                    DstDesc{},
+                                                    p_dst + dst_offset + mThreadDstOffset,
+                                                    make_zero_array<index_t, nDim>(),
+                                                    thread_sub_tensor_lengths,
+                                                    DstAccessOrder{},
+                                                    Number<DstDataPerWrite>{});
        });
    }

--- a/src/include/common.hip.hpp
+++ b/src/include/common.hip.hpp
@@ -105,4 +105,13 @@ __host__ __device__ constexpr T min(T x, Ts... xs)

    return x < y ? x : y;
 }
+
+// this is wrong
+// TODO: implement correct least common multiple, instead of calling max()
+template <class T, class... Ts>
+__host__ __device__ constexpr T least_common_multiple(T x, Ts... xs)
+{
+    return max(x, xs...);
+}
+
 } // namespace mod_conv
--- a/src/include/conv_common.hip.hpp
+++ b/src/include/conv_common.hip.hpp
@@ -3,8 +3,7 @@

 // this is ugly, only for 4d
 template <class InDesc, class WeiDesc>
-constexpr auto get_convolution_output_default_4d_tensor_descriptor(InDesc,
-                                                                                       WeiDesc)
+constexpr auto get_convolution_output_default_4d_tensor_descriptor(InDesc, WeiDesc)
 {
    constexpr auto in_desc  = InDesc{};
    constexpr auto wei_desc = WeiDesc{};
@@ -34,8 +33,10 @@ constexpr auto get_convolution_output_default_4d_tensor_descriptor(InDesc,
 }

 template <class InDesc, class WeiDesc, class LowerPads, class UpperPads>
-constexpr auto get_convolution_with_padding_output_default_4d_tensor_descriptor(
-    InDesc, WeiDesc, LowerPads, UpperPads)
+constexpr auto get_convolution_with_padding_output_default_4d_tensor_descriptor(InDesc,
+                                                                                WeiDesc,
+                                                                                LowerPads,
+                                                                                UpperPads)
 {
    constexpr auto in_desc  = InDesc{};
    constexpr auto wei_desc = WeiDesc{};
@@ -113,5 +114,6 @@ constexpr std::size_t calculate_convolution_memory_size(Float, InDesc, WeiDesc,
    constexpr index_t Y = wei_desc.GetLength(I2);
    constexpr index_t X = wei_desc.GetLength(I3);

-    return sizeof(Float) * (InDesc::GetElementSpace() + WeiDesc::GetElementSpace() + OutDesc::GetElementSpace());
+    return sizeof(Float) *
+           (InDesc::GetElementSpace() + WeiDesc::GetElementSpace() + OutDesc::GetElementSpace());
 }
--- a/src/include/gridwise_convolution_implicit_gemm_v1r3_lds_double_buffer_nchw_cyxk_nkhw.hip.hpp
+++ b/src/include/gridwise_convolution_implicit_gemm_v1r3_lds_double_buffer_nchw_cyxk_nkhw.hip.hpp
@@ -474,7 +474,7 @@ struct GridwiseConvolutionImplicitGemm_v1r3_lds_double_buffer_nchw_cyxk_nkhw
                map_out_global2thread,
                Number<OutThreadCopyDataPerWrite_W>{});
 #else
-            threadwise_generic_tensor_slice_copy(
+            threadwise_generic_tensor_slice_copy_v1(
                out_10d_thread_desc.ReorderGivenNew2Old(map_out_global2thread),
                p_out_thread,
                make_zero_array<index_t, 10>(),
--- a/src/include/gridwise_convolution_implicit_gemm_v1r3_nchw_cyxk_nkhw.hip.hpp
+++ b/src/include/gridwise_convolution_implicit_gemm_v1r3_nchw_cyxk_nkhw.hip.hpp
@@ -423,7 +423,7 @@ struct GridwiseConvolutionImplicitGemm_v1r3_nchw_cyxk_nkhw
                map_out_global2thread,
                Number<OutThreadCopyDataPerWrite_W>{});
 #else
-            threadwise_generic_tensor_slice_copy(
+            threadwise_generic_tensor_slice_copy_v1(
                out_10d_thread_desc.ReorderGivenNew2Old(map_out_global2thread),
                p_out_thread,
                make_zero_array<index_t, 10>(),
--- a/src/include/gridwise_convolution_implicit_gemm_v3_lds_double_buffer_nchw_cyxk_nkhw.hip.hpp
+++ b/src/include/gridwise_convolution_implicit_gemm_v3_lds_double_buffer_nchw_cyxk_nkhw.hip.hpp
@@ -405,14 +405,15 @@ struct GridwiseConvolutionImplicitGemm_v3_lds_double_buffer_nchw_cyxk_nkhw
                out_k_n1_b_n2_global_merged_desc.GetOffsetFromMultiIndex(
                    k_thread_data_on_global, 0, b_thread_data_on_global, 0);

-            threadwise_generic_tensor_slice_copy(out_n0_n1_n2_k0_k1_k2_h_w_thread_desc,
-                                                 p_out_thread,
-                                                 {0, 0, 0, 0, 0, 0, 0, 0},
-                                                 out_n0_n1_n2_k0_k1_k2_h_w_global_mem_desc,
-                                                 p_out_thread_on_global,
-                                                 {0, 0, 0, 0, 0, 0, 0, 0},
-                                                 out_n0_n1_n2_k0_k1_k2_h_w_thread_desc.GetLengths(),
-                                                 arithmetic_sequence_gen<0, 8, 1>::SeqType{});
+            threadwise_generic_tensor_slice_copy_v1(
+                out_n0_n1_n2_k0_k1_k2_h_w_thread_desc,
+                p_out_thread,
+                {0, 0, 0, 0, 0, 0, 0, 0},
+                out_n0_n1_n2_k0_k1_k2_h_w_global_mem_desc,
+                p_out_thread_on_global,
+                {0, 0, 0, 0, 0, 0, 0, 0},
+                out_n0_n1_n2_k0_k1_k2_h_w_thread_desc.GetLengths(),
+                arithmetic_sequence_gen<0, 8, 1>::SeqType{});
        }
    }
 };
--- a/src/include/gridwise_convolution_implicit_gemm_v3_nchw_cyxk_nkhw.hip.hpp
+++ b/src/include/gridwise_convolution_implicit_gemm_v3_nchw_cyxk_nkhw.hip.hpp
@@ -358,14 +358,15 @@ struct GridwiseConvolutionImplicitGemm_v3_nchw_cyxk_nkhw
                out_k_n1_b_n2_global_merged_desc.GetOffsetFromMultiIndex(
                    k_thread_data_on_global, 0, b_thread_data_on_global, 0);

-            threadwise_generic_tensor_slice_copy(out_n0_n1_n2_k0_k1_k2_h_w_thread_desc,
-                                                 p_out_thread,
-                                                 {0, 0, 0, 0, 0, 0, 0, 0},
-                                                 out_n0_n1_n2_k0_k1_k2_h_w_global_mem_desc,
-                                                 p_out_thread_on_global,
-                                                 {0, 0, 0, 0, 0, 0, 0, 0},
-                                                 out_n0_n1_n2_k0_k1_k2_h_w_thread_desc.GetLengths(),
-                                                 arithmetic_sequence_gen<0, 8, 1>::SeqType{});
+            threadwise_generic_tensor_slice_copy_v1(
+                out_n0_n1_n2_k0_k1_k2_h_w_thread_desc,
+                p_out_thread,
+                {0, 0, 0, 0, 0, 0, 0, 0},
+                out_n0_n1_n2_k0_k1_k2_h_w_global_mem_desc,
+                p_out_thread_on_global,
+                {0, 0, 0, 0, 0, 0, 0, 0},
+                out_n0_n1_n2_k0_k1_k2_h_w_thread_desc.GetLengths(),
+                arithmetic_sequence_gen<0, 8, 1>::SeqType{});
        }
    }
 };
--- a/src/include/gridwise_convolution_implicit_gemm_v4_lds_double_buffer_nchw_kcyx_nkhw.hip.hpp
+++ b/src/include/gridwise_convolution_implicit_gemm_v4_lds_double_buffer_nchw_kcyx_nkhw.hip.hpp
@@ -5,7 +5,7 @@
 #include "ConstantMatrixDescriptor.hip.hpp"
 #include "blockwise_generic_tensor_slice_op.hip.hpp"
 #include "blockwise_gemm.hip.hpp"
-#include "threadwise_tensor_slice_op.hip.hpp"
+#include "threadwise_generic_tensor_slice_op.hip.hpp"

 // define B = merge(N, Ho, Wo)
 template <index_t GridSize,
@@ -170,7 +170,7 @@ struct GridwiseConvolutionImplicitGemm_v4_lds_double_buffer_nchw_kcyx_nkhw
 // operator for blockwise copy of weight into LDS
 //     slice a tensor, and copy it into another tensor
 //     this copy operator already have blockwise offset built-in
-#if 0
+#if 1
        auto blockwise_wei_copy =
            BlockwiseGenericTensorSliceCopy_v1<BlockSize,
                                               Float,
@@ -188,7 +188,7 @@ struct GridwiseConvolutionImplicitGemm_v4_lds_double_buffer_nchw_kcyx_nkhw
 #else
        constexpr auto map_k_e_2_e_k = Sequence<1, 0>{};

-        const auto blockwise_wei_copy = BlockwiseTensorSliceReorderCopy_v3<
+        auto blockwise_wei_copy = BlockwiseTensorSliceReorderCopy_v3<
            BlockSize,
            Float,
            decltype(wei_e_k_global_desc.ReorderGivenNew2Old(map_k_e_2_e_k)),
@@ -324,11 +324,7 @@ struct GridwiseConvolutionImplicitGemm_v4_lds_double_buffer_nchw_kcyx_nkhw
                Float p_wei_register_clipboard[blockwise_wei_copy.GetRegisterClipboardSize()];

                blockwise_in_copy.MoveSlicingWindowOnSourceTensor(I0, Number<EPerBlock>{}, True);
-#if 0
-                blockwise_wei_copy.MoveSlicingWindowOnSourceTensor(I0, Number<EPerBlock>{}, True);
-#else
                p_wei_block_on_global += EPerBlock * wei_e_k_global_desc.GetStride(I0);
-#endif

                __syncthreads();

@@ -355,11 +351,7 @@ struct GridwiseConvolutionImplicitGemm_v4_lds_double_buffer_nchw_kcyx_nkhw

            // even iteration
            blockwise_in_copy.MoveSlicingWindowOnSourceTensor(I0, Number<EPerBlock>{}, True);
-#if 0
-            blockwise_wei_copy.MoveSlicingWindowOnSourceTensor(I0, Number<EPerBlock>{}, True);
-#else
            p_wei_block_on_global += EPerBlock * wei_e_k_global_desc.GetStride(I0);
-#endif

            __syncthreads();

@@ -434,14 +426,16 @@ struct GridwiseConvolutionImplicitGemm_v4_lds_double_buffer_nchw_kcyx_nkhw
                out_k_n1_b_n2_global_merged_desc.GetOffsetFromMultiIndex(
                    k_thread_data_on_global, 0, b_thread_data_on_global, 0);

-            threadwise_generic_tensor_slice_copy(out_n0_n1_n2_k0_k1_k2_h_w_thread_desc,
-                                                 p_out_thread,
-                                                 {0, 0, 0, 0, 0, 0, 0, 0},
-                                                 out_n0_n1_n2_k0_k1_k2_h_w_global_mem_desc,
-                                                 p_out_thread_on_global,
-                                                 {0, 0, 0, 0, 0, 0, 0, 0},
-                                                 out_n0_n1_n2_k0_k1_k2_h_w_thread_desc.GetLengths(),
-                                                 arithmetic_sequence_gen<0, 8, 1>::SeqType{});
+            threadwise_generic_tensor_slice_copy_v1(
+                out_n0_n1_n2_k0_k1_k2_h_w_thread_desc,
+                p_out_thread,
+                {0, 0, 0, 0, 0, 0, 0, 0},
+                out_n0_n1_n2_k0_k1_k2_h_w_global_mem_desc,
+                p_out_thread_on_global,
+                {0, 0, 0, 0, 0, 0, 0, 0},
+                out_n0_n1_n2_k0_k1_k2_h_w_thread_desc.GetLengths(),
+                arithmetic_sequence_gen<0, 8, 1>::SeqType{},
+                Number<1>{});
        }
    }
 };
--- a/src/include/gridwise_convolution_implicit_gemm_v4_nchw_kcyx_nkhw.hip.hpp
+++ b/src/include/gridwise_convolution_implicit_gemm_v4_nchw_kcyx_nkhw.hip.hpp
@@ -5,7 +5,7 @@
 #include "ConstantMatrixDescriptor.hip.hpp"
 #include "blockwise_generic_tensor_slice_op.hip.hpp"
 #include "blockwise_gemm.hip.hpp"
-#include "threadwise_tensor_slice_op.hip.hpp"
+#include "threadwise_generic_tensor_slice_op.hip.hpp"

 // define B = merge(N, Ho, Wo)
 template <index_t GridSize,
@@ -327,14 +327,15 @@ struct GridwiseConvolutionImplicitGemm_v4_nchw_kcyx_nkhw
                out_k_n1_b_n2_global_merged_desc.GetOffsetFromMultiIndex(
                    k_thread_data_on_global, 0, b_thread_data_on_global, 0);

-            threadwise_generic_tensor_slice_copy(out_n0_n1_n2_k0_k1_k2_h_w_thread_desc,
-                                                 p_out_thread,
-                                                 {0, 0, 0, 0, 0, 0, 0, 0},
-                                                 out_n0_n1_n2_k0_k1_k2_h_w_global_mem_desc,
-                                                 p_out_thread_on_global,
-                                                 {0, 0, 0, 0, 0, 0, 0, 0},
-                                                 out_n0_n1_n2_k0_k1_k2_h_w_thread_desc.GetLengths(),
-                                                 arithmetic_sequence_gen<0, 8, 1>::SeqType{});
+            threadwise_generic_tensor_slice_copy_v1(
+                out_n0_n1_n2_k0_k1_k2_h_w_thread_desc,
+                p_out_thread,
+                {0, 0, 0, 0, 0, 0, 0, 0},
+                out_n0_n1_n2_k0_k1_k2_h_w_global_mem_desc,
+                p_out_thread_on_global,
+                {0, 0, 0, 0, 0, 0, 0, 0},
+                out_n0_n1_n2_k0_k1_k2_h_w_thread_desc.GetLengths(),
+                arithmetic_sequence_gen<0, 8, 1>::SeqType{});
        }
    }
 };
--- a/src/include/threadwise_generic_tensor_slice_op.hip.hpp
+++ b/src/include/threadwise_generic_tensor_slice_op.hip.hpp
@@ -0,0 +1,99 @@
+#pragma once
+#include "ConstantTensorDescriptor.hip.hpp"
+#include "ConstantMergedTensorDescriptor.hip.hpp"
+
+template <class Float,
+          class SrcDesc,
+          class DstDesc,
+          class SliceLengths,
+          class DimAccessOrder,
+          index_t DataPerAccess>
+__device__ void threadwise_generic_tensor_slice_copy_v1(
+    SrcDesc,
+    const Float* __restrict__ p_src,
+    Array<index_t, SrcDesc::GetNumOfDimension()> src_multi_id_begin,
+    DstDesc,
+    Float* __restrict__ p_dst,
+    Array<index_t, DstDesc::GetNumOfDimension()> dst_multi_id_begin,
+    SliceLengths,
+    DimAccessOrder,
+    Number<DataPerAccess>)
+{
+    constexpr index_t nDim = SrcDesc::GetNumOfDimension();
+
+    static_assert(nDim == SrcDesc::GetNumOfDimension() && nDim == DstDesc::GetNumOfDimension() &&
+                      nDim == SliceLengths::GetSize() && nDim == DimAccessOrder::GetSize(),
+                  "wrong! # of dimensions not the same");
+
+    static_assert(is_valid_sequence_map<DimAccessOrder>::value, "wrong! map is not valid");
+
+#if 0 
+    // doesn't compile, because merged-tensor reordering is not implemented
+    // TODO: implement tensor desc ops for merged-tensor
+    constexpr auto src_strides_in_access_order =
+        SrcDesc::ReorderGivenNew2Old(DimAccessOrder{}).GetStride(Number<nDim-1>{});
+
+    constexpr auto dst_strides_in_access_order =
+        SrcDesc::ReorderGivenNew2Old(DimAccessOrder{}).GetStride(Number<nDim-1>{});
+
+    // check src/dst stride on the lowest access dimension
+    static_assert((DataPerAccess == 1 || src_strides_in_access_order.Back() == 1) &&
+                      (DataPerAccess == 1 || dst_strides_in_access_order.Back() == 1),
+                  "wrong! src/dst stride on the lowest access dimension needs to be 1 for "
+                  "vectorized read/write");
+#endif
+
+    constexpr auto slice_lengths_in_access_order =
+        SliceLengths::ReorderGivenNew2Old(DimAccessOrder{});
+
+    // check slice length on the lowest access dimension
+    static_assert(slice_lengths_in_access_order.Back() % DataPerAccess == 0,
+                  "wrong! slice length on the lowest access dimension should be evenly divided by "
+                  "DataPerAccess");
+
+    constexpr index_t num_access_on_lowest_access_dimension =
+        slice_lengths_in_access_order.Back() / DataPerAccess;
+
+    constexpr auto access_lengths = slice_lengths_in_access_order.Modify(
+        Number<nDim - 1>{}, Number<num_access_on_lowest_access_dimension>{});
+
+    using vector_t = typename vector_type<Float, DataPerAccess>::MemoryType;
+
+#if 1
+    ford<decltype(access_lengths)>{}([&](auto access_multi_id) {
+        auto data_multi_id_in_access_order      = access_multi_id;
+        data_multi_id_in_access_order[nDim - 1] = access_multi_id[nDim - 1] * DataPerAccess;
+
+        const auto data_multi_id =
+            reorder_array_given_old2new(data_multi_id_in_access_order, DimAccessOrder{});
+
+        const index_t src_index =
+            SrcDesc::GetOffsetFromMultiIndex(src_multi_id_begin + data_multi_id);
+
+        const index_t dst_index =
+            DstDesc::GetOffsetFromMultiIndex(dst_multi_id_begin + data_multi_id);
+
+        *reinterpret_cast<vector_t*>(&p_dst[dst_index]) =
+            *reinterpret_cast<const vector_t*>(&p_src[src_index]);
+    });
+#else
+    static_ford<decltype(access_lengths)>{}([&](auto access_multi_id_) {
+        const auto access_multi_id = sequence2array(access_multi_id_);
+
+        auto data_multi_id_in_access_order      = access_multi_id;
+        data_multi_id_in_access_order[nDim - 1] = access_multi_id[nDim - 1] * DataPerAccess;
+
+        const auto data_multi_id =
+            reorder_array_given_old2new(data_multi_id_in_access_order, DimAccessOrder{});
+
+        const index_t src_index =
+            SrcDesc::GetOffsetFromMultiIndex(src_multi_id_begin + data_multi_id);
+
+        const index_t dst_index =
+            DstDesc::GetOffsetFromMultiIndex(dst_multi_id_begin + data_multi_id);
+
+        *reinterpret_cast<vector_t*>(&p_dst[dst_index]) =
+            *reinterpret_cast<const vector_t*>(&p_src[src_index]);
+    });
+#endif
+}
--- a/src/include/threadwise_tensor_slice_op.hip.hpp
+++ b/src/include/threadwise_tensor_slice_op.hip.hpp
@@ -192,58 +192,3 @@ threadwise_tensor_slice_copy_reorder_given_dst2src_v3(SrcDesc,
        });
    });
 }
-
-template <class Float, class SrcDesc, class DstDesc, class SliceLengths, class DimAccessOrder>
-__device__ void threadwise_generic_tensor_slice_copy(
-    SrcDesc,
-    const Float* __restrict__ p_src,
-    Array<index_t, SrcDesc::GetNumOfDimension()> src_multi_id_begin,
-    DstDesc,
-    Float* __restrict__ p_dst,
-    Array<index_t, DstDesc::GetNumOfDimension()> dst_multi_id_begin,
-    SliceLengths,
-    DimAccessOrder)
-{
-    constexpr index_t nDim = SrcDesc::GetNumOfDimension();
-
-    static_assert(nDim == SrcDesc::GetNumOfDimension() && nDim == DstDesc::GetNumOfDimension() &&
-                      nDim == SliceLengths::GetSize() && nDim == DimAccessOrder::GetSize(),
-                  "wrong! # of dimensions not the same");
-
-    static_assert(is_valid_sequence_map<DimAccessOrder>::value, "wrong! map is not valid");
-
-    constexpr auto slice_lengths_in_access_order =
-        SliceLengths::ReorderGivenNew2Old(DimAccessOrder{});
-
-#if 1
-    ford<decltype(slice_lengths_in_access_order)>{}([&](auto data_multi_id_in_access_order) {
-        const auto data_multi_id =
-            reorder_array_given_old2new(data_multi_id_in_access_order, DimAccessOrder{});
-
-        const index_t src_index =
-            SrcDesc::GetOffsetFromMultiIndex(src_multi_id_begin + data_multi_id);
-
-        const index_t dst_index =
-            DstDesc::GetOffsetFromMultiIndex(dst_multi_id_begin + data_multi_id);
-
-        p_dst[dst_index] = p_src[src_index];
-    });
-#else
-    static_ford<decltype(slice_lengths_in_access_order)>{}(
-        [&](auto data_multi_id_in_access_order_) {
-            constexpr auto data_multi_id_in_access_order =
-                sequence2array(decltype(data_multi_id_in_access_order_){});
-
-            const auto data_multi_id =
-                reorder_array_given_old2new(data_multi_id_in_access_order, DimAccessOrder{});
-
-            const index_t src_index =
-                SrcDesc::GetOffsetFromMultiIndex(src_multi_id_begin + data_multi_id);
-
-            const index_t dst_index =
-                DstDesc::GetOffsetFromMultiIndex(dst_multi_id_begin + data_multi_id);
-
-            p_dst[dst_index] = p_src[src_index];
-        });
-#endif
-}