This commit is contained in:
Chao Liu
2019-08-13 00:37:23 -05:00
parent fab2f10a55
commit 8bdaba51f8
20 changed files with 505 additions and 651 deletions

View File

@@ -563,7 +563,7 @@ struct Blockwise2dTensorCopy3
}
}
__device__ constexpr index_t GetRegisterClipboardSize() const
__device__ constexpr index_t GetRegisterBufferSize() const
{
static_assert(is_same<Float, float>{}, "wrong! only support float!\n");
@@ -579,8 +579,8 @@ struct Blockwise2dTensorCopy3
return DataPerRead * (L0 + thread_per_d0 - 1) / thread_per_d0;
}
__device__ void RunLoadRegisterClipboard(const Float* __restrict__ p_src,
Float* __restrict__ p_clipboard) const
__device__ void RunLoadRegisterBuffer(const Float* __restrict__ p_src,
Float* __restrict__ p_clipboard) const
{
constexpr auto I0 = Number<0>{};
constexpr auto I1 = Number<1>{};
@@ -630,8 +630,8 @@ struct Blockwise2dTensorCopy3
}
}
__device__ void RunStoreRegisterClipboard(const Float* __restrict__ p_clipboard,
Float* __restrict__ p_dst) const
__device__ void RunStoreRegisterBuffer(const Float* __restrict__ p_clipboard,
Float* __restrict__ p_dst) const
{
constexpr auto I0 = Number<0>{};
constexpr auto I1 = Number<1>{};
@@ -681,8 +681,8 @@ struct Blockwise2dTensorCopy3
}
#if CK_USE_AMD_INLINE_ASM
__device__ void RunLoadRegisterClipboard_asm(const Float* __restrict__ p_src,
Float* p_clipboard) const
__device__ void RunLoadRegisterBuffer_asm(const Float* __restrict__ p_src,
Float* p_clipboard) const
{
constexpr auto I0 = Number<0>{};
constexpr auto I1 = Number<1>{};
@@ -741,8 +741,8 @@ struct Blockwise2dTensorCopy3
}
}
__device__ void RunStoreRegisterClipboard_asm(const Float* __restrict__ p_clipboard,
Float* __restrict__ p_dst) const
__device__ void RunStoreRegisterBuffer_asm(const Float* __restrict__ p_clipboard,
Float* __restrict__ p_dst) const
{
constexpr auto I0 = Number<0>{};
constexpr auto I1 = Number<1>{};

View File

@@ -237,7 +237,7 @@ struct Blockwise3dTensorCopy3
}
}
__device__ static constexpr index_t GetRegisterClipboardSize()
__device__ static constexpr index_t GetRegisterBufferSize()
{
static_assert(is_same<Float, float>{}, "wrong! only support float!\n");
@@ -260,8 +260,8 @@ struct Blockwise3dTensorCopy3
return DataPerRead * nloop_d0 * nloop_d1 * nloop_d2;
}
__device__ void RunLoadRegisterClipboard(const Float* __restrict__ p_src,
Float* __restrict__ p_clipboard) const
__device__ void RunLoadRegisterBuffer(const Float* __restrict__ p_src,
Float* __restrict__ p_clipboard) const
{
constexpr auto I0 = Number<0>{};
constexpr auto I1 = Number<1>{};
@@ -316,8 +316,8 @@ struct Blockwise3dTensorCopy3
}
}
__device__ void RunStoreRegisterClipboard(const Float* __restrict__ p_clipboard,
Float* __restrict__ p_dst) const
__device__ void RunStoreRegisterBuffer(const Float* __restrict__ p_clipboard,
Float* __restrict__ p_dst) const
{
constexpr auto I0 = Number<0>{};
constexpr auto I1 = Number<1>{};

View File

@@ -596,7 +596,7 @@ struct Blockwise4dTensorCopy3
}
}
__device__ constexpr index_t GetRegisterClipboardSize() const
__device__ constexpr index_t GetRegisterBufferSize() const
{
static_assert(is_same<Float, float>{}, "wrong! only support float!\n");
@@ -623,8 +623,8 @@ struct Blockwise4dTensorCopy3
return DataPerRead * nloop_d0 * nloop_d1 * nloop_d2 * nloop_d3;
}
__device__ void RunLoadRegisterClipboard(const Float* __restrict__ p_src,
Float* __restrict__ p_clipboard) const
__device__ void RunLoadRegisterBuffer(const Float* __restrict__ p_src,
Float* __restrict__ p_clipboard) const
{
constexpr auto I0 = Number<0>{};
constexpr auto I1 = Number<1>{};
@@ -690,8 +690,8 @@ struct Blockwise4dTensorCopy3
}
}
__device__ void RunStoreRegisterClipboard(const Float* __restrict__ p_clipboard,
Float* __restrict__ p_dst) const
__device__ void RunStoreRegisterBuffer(const Float* __restrict__ p_clipboard,
Float* __restrict__ p_dst) const
{
constexpr auto I0 = Number<0>{};
constexpr auto I1 = Number<1>{};

View File

@@ -420,8 +420,6 @@ struct BlockwiseGenericTensorSliceCopy_v1
template <index_t BlockSize,
class SrcDesc,
class DstDesc,
class SrcCoordinate,
class DstCoordinate,
class SliceLengths,
class SubLengths,
class ThreadClusterLengths,
@@ -436,6 +434,9 @@ struct BlockwiseGenericTensorSliceCopy_v2
{
static constexpr index_t nDim = SrcDesc::GetNumOfDimension();
using SrcCoordinate = typename TensorCoordinate<SrcDesc>::type;
using DstCoordinate = typename TensorCoordinate<DstDesc>::type;
__device__ constexpr BlockwiseGenericTensorSliceCopy_v2(SrcCoordinate src_block_slice_origin,
DstCoordinate dst_block_slice_origin)
{
@@ -515,31 +516,25 @@ struct BlockwiseGenericTensorSliceCopy_v2
private:
using RegisterBufferDesc = decltype(make_ConstantTensorDescriptor_packed(SubLengths{}));
using ThreadwiseLoad =
ThreadwiseGenericTensorSliceCopy_v2r1<SrcDesc,
RegisterBufferDesc,
SrcCoordinate,
NormalTensorCoordinate<RegisterBufferDesc>,
SubLengths,
SrcDimAccessOrder,
SrcDimAccessOrder,
SrcVectorAccessDim,
SrcVectorAccessDim,
SrcDataPerAccess,
1>;
using ThreadwiseLoad = ThreadwiseGenericTensorSliceCopy_v2r1<SrcDesc,
RegisterBufferDesc,
SubLengths,
SrcDimAccessOrder,
SrcDimAccessOrder,
SrcVectorAccessDim,
SrcVectorAccessDim,
SrcDataPerAccess,
1>;
using ThreadwiseStore =
ThreadwiseGenericTensorSliceCopy_v2r1<RegisterBufferDesc,
DstDesc,
NormalTensorCoordinate<RegisterBufferDesc>,
DstCoordinate,
SubLengths,
DstDimAccessOrder,
DstDimAccessOrder,
DstVectorAccessDim,
DstVectorAccessDim,
1,
DstDataPerAccess>;
using ThreadwiseStore = ThreadwiseGenericTensorSliceCopy_v2r1<RegisterBufferDesc,
DstDesc,
SubLengths,
DstDimAccessOrder,
DstDimAccessOrder,
DstVectorAccessDim,
DstVectorAccessDim,
1,
DstDataPerAccess>;
ThreadwiseLoad mThreadwiseLoad;
ThreadwiseStore mThreadwiseStore;

View File

@@ -165,7 +165,7 @@ struct BlockwiseTensorSliceReorderCopy_v3
#endif
}
__device__ static constexpr index_t GetRegisterClipboardSize()
__device__ static constexpr index_t GetRegisterBufferSize()
{
constexpr auto thread_sub_tensor_lengths = SrcSubLengths{};
@@ -183,8 +183,8 @@ struct BlockwiseTensorSliceReorderCopy_v3
return thread_tensor_desc.GetElementSpace();
}
__device__ void RunLoadRegisterClipboard(const Float* __restrict__ p_src,
Float* __restrict__ p_clipboard) const
__device__ void RunLoadRegisterBuffer(const Float* __restrict__ p_src,
Float* __restrict__ p_clipboard) const
{
constexpr auto thread_sub_tensor_lengths = SrcSubLengths{};
@@ -219,8 +219,8 @@ struct BlockwiseTensorSliceReorderCopy_v3
});
}
__device__ void RunStoreRegisterClipboard(const Float* __restrict__ p_clipboard,
Float* __restrict__ p_dst) const
__device__ void RunStoreRegisterBuffer(const Float* __restrict__ p_clipboard,
Float* __restrict__ p_dst) const
{
constexpr auto thread_sub_tensor_lengths = SrcSubLengths{};
@@ -274,10 +274,10 @@ struct BlockwiseTensorSliceReorderCopy_v3
__device__ void Run(const Float* __restrict__ p_src, Float* __restrict__ p_dst) const
{
Float p_clipboard[GetRegisterClipboardSize()];
Float p_clipboard[GetRegisterBufferSize()];
RunLoadRegisterClipboard(p_src, p_clipboard);
RunStoreRegisterClipboard(p_clipboard, p_dst);
RunLoadRegisterBuffer(p_src, p_clipboard);
RunStoreRegisterBuffer(p_clipboard, p_dst);
}
// this function doesn't do sanity check on whether the slicing window is out of the boundary

View File

@@ -14,10 +14,6 @@
#define CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V1R2 0
#endif
#ifndef CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V2
#define CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V2 0
#endif
#ifndef CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V2R1
#define CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V2R1 0
#endif
@@ -430,170 +426,6 @@ struct ThreadwiseGenericTensorSliceCopy_v1r2
Array<index_t, nDim> mDstSliceOrigin;
};
// Threadwise tensor-slice copy, version 2. A single thread copies a
// SliceLengths-shaped window from a SrcDesc-described tensor to a
// DstDesc-described tensor, staging the data through a thread-local buffer.
// Dimensions that the descriptor marks as "merged" (folding multiple original
// dimensions) are iterated separately from normal dimensions, so merged-dim
// offsets go through the coordinate type while normal-dim offsets come
// straight from the descriptor.
// NOTE(review): SrcCoordinate/DstCoordinate are template parameters here;
// presumably coordinate types matching SrcDesc/DstDesc — confirm at call sites.
template <class SrcDesc,
class DstDesc,
class SrcCoordinate,
class DstCoordinate,
class SliceLengths>
struct ThreadwiseGenericTensorSliceCopy_v2
{
static constexpr index_t nDim = SrcDesc::GetNumOfDimension();
// Construct with explicit source and destination slice origins.
__device__ constexpr ThreadwiseGenericTensorSliceCopy_v2(SrcCoordinate src_slice_origin,
DstCoordinate dst_slice_origin)
: mSrcSliceOrigin(src_slice_origin), mDstSliceOrigin(dst_slice_origin)
{
}
// Default-construct with both slice origins at the all-zero multi-index.
__device__ constexpr ThreadwiseGenericTensorSliceCopy_v2()
: ThreadwiseGenericTensorSliceCopy_v2(make_zero_array<index_t, nDim>(),
make_zero_array<index_t, nDim>())
{
}
// Reposition the source-side slicing window.
__device__ void SetSrcSliceOrigin(SrcCoordinate src_slice_origin)
{
mSrcSliceOrigin = src_slice_origin;
}
// Reposition the destination-side slicing window.
__device__ void SetDstSliceOrigin(DstCoordinate dst_slice_origin)
{
mDstSliceOrigin = dst_slice_origin;
}
// sequence_gen functor: keeps Seq[idim] on dimensions that TDesc reports as
// containing multiple original dimensions ("merged"), and collapses every
// other (normal) dimension to length 1. Used to split the loop nest below.
template <class TDesc, class Seq>
struct IsolateMergedDimSliceLengthsHack
{
template <class IDim>
__device__ constexpr index_t operator()(IDim idim) const
{
return TDesc::ContainMultipleOriginalDimensions(idim) ? Seq{}[idim] : 1;
}
};
// Copy the whole slice: gather src -> thread-local buffer, then scatter
// buffer -> dst. p_src/p_dst point at the tensors the slice origins index into.
template <class TData>
__device__ void Run(const TData* p_src, TData* p_dst) const
{
constexpr auto buffer_desc = make_ConstantTensorDescriptor_packed(SliceLengths{});
// Thread-local staging buffer sized to hold the packed slice.
TData p_buffer_[buffer_desc.GetElementSpace()];
TData* p_buffer = p_buffer_;
// hacks to isolate merged dimension from normal dimensions, and calculate their offset
// separately
// SrcMergedDimSliceLengthsHack has entry same as SliceLengths on src merged dimensions,
// but 1 on normal dimensions;
// SrcNormalDimSliceLengthsHack has entry same as SliceLengths on src normal dimensions,
// but 1 on merged dimensions;
using SrcMergedDimSliceLengthsHack =
typename sequence_gen<SliceLengths::GetSize(),
IsolateMergedDimSliceLengthsHack<SrcDesc, SliceLengths>>::type;
using SrcNormalDimSliceLengthsHack =
decltype((SliceLengths{} + Number<1>{}) - SrcMergedDimSliceLengthsHack{});
#if CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V2
// Fully unrolled compile-time loop nest: buffer/normal offsets are constexpr.
static_ford<SrcMergedDimSliceLengthsHack>{}([&](auto merged_dim_data_id_) {
constexpr auto merged_dim_data_id = decltype(merged_dim_data_id_){};
// Merged-dim offset must go through the coordinate's GetOffset().
const TData* p_src_tmp = p_src + (mSrcSliceOrigin + merged_dim_data_id).GetOffset();
static_ford<SrcNormalDimSliceLengthsHack>{}([&](auto normal_dim_data_id_) {
constexpr auto normal_dim_data_id = decltype(normal_dim_data_id_){};
constexpr index_t buffer_offset =
buffer_desc.GetOffsetFromMultiIndex(merged_dim_data_id + normal_dim_data_id);
constexpr index_t src_normal_offset =
SrcDesc::GetOffsetFromMultiIndex(normal_dim_data_id);
p_buffer[buffer_offset] = p_src_tmp[src_normal_offset];
});
});
#else
// Runtime loop nest; same traversal with offsets computed at runtime.
ford<SrcMergedDimSliceLengthsHack>{}([&](auto merged_dim_data_id) {
const TData* p_src_tmp = p_src + (mSrcSliceOrigin + merged_dim_data_id).GetOffset();
ford<SrcNormalDimSliceLengthsHack>{}([&](auto normal_dim_data_id) {
const index_t buffer_offset =
buffer_desc.GetOffsetFromMultiIndex(merged_dim_data_id + normal_dim_data_id);
const index_t src_normal_offset =
SrcDesc::GetOffsetFromMultiIndex(normal_dim_data_id);
p_buffer[buffer_offset] = p_src_tmp[src_normal_offset];
});
});
#endif
// DstMergedDimSliceLengthsHack has entry same as SliceLengths on dst merged dimensions,
// but 1 on normal dimensions;
// DstNormalDimSliceLengthsHack has entry same as SliceLengths on dst normal dimensions,
// but 1 on merged dimensions;
using DstMergedDimSliceLengthsHack =
typename sequence_gen<SliceLengths::GetSize(),
IsolateMergedDimSliceLengthsHack<DstDesc, SliceLengths>>::type;
using DstNormalDimSliceLengthsHack =
decltype((SliceLengths{} + Number<1>{}) - DstMergedDimSliceLengthsHack{});
#if CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V2
// Compile-time scatter from the staging buffer into dst.
static_ford<DstMergedDimSliceLengthsHack>{}([&](auto merged_dim_data_id_) {
constexpr auto merged_dim_data_id = decltype(merged_dim_data_id_){};
TData* p_dst_tmp = p_dst + (mDstSliceOrigin + merged_dim_data_id).GetOffset();
static_ford<DstNormalDimSliceLengthsHack>{}([&](auto normal_dim_data_id_) {
constexpr auto normal_dim_data_id = decltype(normal_dim_data_id_){};
constexpr index_t buffer_offset =
buffer_desc.GetOffsetFromMultiIndex(merged_dim_data_id + normal_dim_data_id);
constexpr index_t dst_normal_offset =
DstDesc::GetOffsetFromMultiIndex(normal_dim_data_id);
p_dst_tmp[dst_normal_offset] = p_buffer[buffer_offset];
});
});
#else
// Runtime scatter from the staging buffer into dst.
ford<DstMergedDimSliceLengthsHack>{}([&](auto merged_dim_data_id) {
TData* p_dst_tmp = p_dst + (mDstSliceOrigin + merged_dim_data_id).GetOffset();
ford<DstNormalDimSliceLengthsHack>{}([&](auto normal_dim_data_id) {
const index_t buffer_offset =
buffer_desc.GetOffsetFromMultiIndex(merged_dim_data_id + normal_dim_data_id);
const index_t dst_normal_offset =
DstDesc::GetOffsetFromMultiIndex(normal_dim_data_id);
p_dst_tmp[dst_normal_offset] = p_buffer[buffer_offset];
});
});
#endif
}
// Shift the source window by step_sizes, forward or backward depending on
// the PositiveDirection compile-time flag.
// T can be Sequence or Array
template <class T, bool PositiveDirection>
__device__ void MoveSrcSlicingWindow(T step_sizes, integral_constant<bool, PositiveDirection>)
{
static_if<PositiveDirection>{}([&](auto) {
mSrcSliceOrigin += step_sizes;
}).Else([&](auto) { mSrcSliceOrigin -= step_sizes; });
}
// Shift the destination window by step_sizes, forward or backward depending
// on the PositiveDirection compile-time flag.
template <class T, bool PositiveDirection>
__device__ void MoveDstSlicingWindow(T step_sizes, integral_constant<bool, PositiveDirection>)
{
static_if<PositiveDirection>{}([&](auto) {
mDstSliceOrigin += step_sizes;
}).Else([&](auto) { mDstSliceOrigin -= step_sizes; });
}
private:
// Current origins of the src/dst slicing windows.
SrcCoordinate mSrcSliceOrigin;
DstCoordinate mDstSliceOrigin;
};
// This threadwise copy allow vector access of src and dst.
// It allows the dimensions of vector access to be different on src and dst.
// It also allows the vector size to be different on src and dst.
@@ -605,8 +437,6 @@ struct ThreadwiseGenericTensorSliceCopy_v2
// used for the buffer.
template <class SrcDesc,
class DstDesc,
class SrcCoordinate,
class DstCoordinate,
class SliceLengths,
class SrcDimAccessOrder,
class DstDimAccessOrder,
@@ -618,6 +448,9 @@ struct ThreadwiseGenericTensorSliceCopy_v2r1
{
static constexpr index_t nDim = SliceLengths::GetSize();
using SrcCoordinate = typename TensorCoordinate<SrcDesc>::type;
using DstCoordinate = typename TensorCoordinate<DstDesc>::type;
__device__ constexpr ThreadwiseGenericTensorSliceCopy_v2r1(SrcCoordinate src_slice_origin,
DstCoordinate dst_slice_origin)
: mSrcSliceOrigin(src_slice_origin), mDstSliceOrigin(dst_slice_origin)