From b12bbceebca5e35ba2cde0b876d25be40187efe3 Mon Sep 17 00:00:00 2001 From: Chao Liu Date: Thu, 26 Sep 2019 14:59:19 -0500 Subject: [PATCH] clean up --- ..._v4r1_nchw_kcyx_nkhw_lds_double_buffer.hpp | 51 +- ..._v4r4_nchw_kcyx_nkhw_lds_double_buffer.hpp | 57 +- .../tensor_description/tensor_coordinate.hpp | 372 ++---- .../tensor_coordinate_deprecated.hpp | 346 +++++ .../tensor_coordinate_helper.hpp | 2 +- .../tensor_coordinate_v2.hpp | 215 ---- .../tensor_description/tensor_view.hpp | 4 +- .../tensor_description/tensor_visit.hpp | 4 +- .../blockwise_generic_tensor_slice_copy.hpp | 671 +--------- ...e_generic_tensor_slice_copy_deprecated.hpp | 692 ++++++++++ .../threadwise_generic_tensor_slice_copy.hpp | 1120 +--------------- ...e_generic_tensor_slice_copy_deprecated.hpp | 1129 +++++++++++++++++ ...tion_implicit_gemm_v4r1_nchw_kcyx_nkhw.hpp | 2 +- ...tion_implicit_gemm_v4r4_nchw_kcyx_nkhw.hpp | 4 +- ...plicit_gemm_v4r4_nchw_kcyx_nkhw_padded.hpp | 2 +- driver/src/driver.cpp | 10 +- 16 files changed, 2377 insertions(+), 2304 deletions(-) create mode 100644 composable_kernel/include/tensor_description/tensor_coordinate_deprecated.hpp delete mode 100644 composable_kernel/include/tensor_description/tensor_coordinate_v2.hpp create mode 100644 composable_kernel/include/tensor_operation/blockwise_generic_tensor_slice_copy_deprecated.hpp create mode 100644 composable_kernel/include/tensor_operation/threadwise_generic_tensor_slice_copy_deprecated.hpp diff --git a/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer.hpp b/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer.hpp index 42931a6ae8..c7375766da 100644 --- a/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer.hpp +++ b/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer.hpp @@ -5,9 +5,9 @@ #include "ConstantTensorDescriptor.hpp" #include "ConstantMergedTensorDescriptor.hpp" #include "ConstantMatrixDescriptor.hpp" -#include "blockwise_generic_tensor_slice_copy.hpp" +#include "blockwise_generic_tensor_slice_copy_deprecated.hpp" #include "blockwise_gemm.hpp" -#include "threadwise_generic_tensor_slice_copy.hpp" +#include "threadwise_generic_tensor_slice_copy_deprecated.hpp" namespace ck { @@ -265,8 +265,10 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer // LDS double buffer: preload data into LDS { - blockwise_in_copy.Run(p_in_global, p_in_block_double); - blockwise_wei_copy.Run(p_wei_global, p_wei_block_double); + blockwise_in_copy.template Run(p_in_global, + p_in_block_double); + blockwise_wei_copy.template Run(p_wei_global, + p_wei_block_double); } // LDS double buffer: main body @@ -288,8 +290,8 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer Float* p_wei_block_next = even_loop ? 
p_wei_block_double + wei_block_space : p_wei_block_double; - Float p_in_register_buffer[blockwise_in_copy.GetRegisterBufferSize()]; - Float p_wei_register_buffer[blockwise_wei_copy.GetRegisterBufferSize()]; + Float p_in_thread_buffer[blockwise_in_copy.GetThreadBufferSize()]; + Float p_wei_thread_buffer[blockwise_wei_copy.GetThreadBufferSize()]; blockwise_in_copy.MoveSrcSliceWindow(Sequence{}, True); blockwise_wei_copy.MoveSrcSliceWindow(Sequence{}, True); @@ -297,23 +299,25 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer __syncthreads(); // LDS doubel buffer: load next data from device mem - blockwise_in_copy.RunLoadRegisterBuffer(p_in_global, p_in_register_buffer); - blockwise_wei_copy.RunLoadRegisterBuffer(p_wei_global, p_wei_register_buffer); + blockwise_in_copy.template RunLoadThreadBuffer( + p_in_global, p_in_thread_buffer); + blockwise_wei_copy.template RunLoadThreadBuffer( + p_wei_global, p_wei_thread_buffer); // LDS double buffer: GEMM on current data blockwise_gemm.Run(p_wei_block_now, p_in_block_now, p_out_thread); // LDS double buffer: store next data to LDS - blockwise_in_copy.RunStoreRegisterBuffer(p_in_register_buffer, p_in_block_next); - blockwise_wei_copy.RunStoreRegisterBuffer(p_wei_register_buffer, p_wei_block_next); + blockwise_in_copy.RunStoreThreadBuffer(p_in_thread_buffer, p_in_block_next); + blockwise_wei_copy.RunStoreThreadBuffer(p_wei_thread_buffer, p_wei_block_next); } } // LDS double buffer: tail { // even iteration - Float p_in_register_buffer[blockwise_in_copy.GetRegisterBufferSize()]; - Float p_wei_register_buffer[blockwise_wei_copy.GetRegisterBufferSize()]; + Float p_in_thread_buffer[blockwise_in_copy.GetThreadBufferSize()]; + Float p_wei_thread_buffer[blockwise_wei_copy.GetThreadBufferSize()]; blockwise_in_copy.MoveSrcSliceWindow(Sequence{}, True); blockwise_wei_copy.MoveSrcSliceWindow(Sequence{}, True); @@ -321,17 +325,19 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer __syncthreads(); // LDS doubel buffer: load next data from device mem - blockwise_in_copy.RunLoadRegisterBuffer(p_in_global, p_in_register_buffer); - blockwise_wei_copy.RunLoadRegisterBuffer(p_wei_global, p_wei_register_buffer); + blockwise_in_copy.template RunLoadThreadBuffer( + p_in_global, p_in_thread_buffer); + blockwise_wei_copy.template RunLoadThreadBuffer( + p_wei_global, p_wei_thread_buffer); // LDS double buffer: GEMM on current data blockwise_gemm.Run(p_wei_block_double, p_in_block_double, p_out_thread); // LDS double buffer: store next data to LDS - blockwise_in_copy.RunStoreRegisterBuffer(p_in_register_buffer, - p_in_block_double + in_block_space); - blockwise_wei_copy.RunStoreRegisterBuffer(p_wei_register_buffer, - p_wei_block_double + wei_block_space); + blockwise_in_copy.RunStoreThreadBuffer(p_in_thread_buffer, + p_in_block_double + in_block_space); + blockwise_wei_copy.RunStoreThreadBuffer(p_wei_thread_buffer, + p_wei_block_double + wei_block_space); // odd iteration __syncthreads(); @@ -390,7 +396,14 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer 0, b_thread_data_on_global, 0}) - .template Run_amd_experiment(p_out_thread, p_out_global); +#if 0 + .Run +#else // tweaking + .template Run_optimized_address_calculation +#endif + (p_out_thread, p_out_global); } } }; diff --git a/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw_lds_double_buffer.hpp 
b/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw_lds_double_buffer.hpp index e3c31ed36a..f852b10e98 100644 --- a/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw_lds_double_buffer.hpp +++ b/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw_lds_double_buffer.hpp @@ -5,9 +5,9 @@ #include "ConstantTensorDescriptor.hpp" #include "ConstantMergedTensorDescriptor.hpp" #include "ConstantMatrixDescriptor.hpp" -#include "blockwise_generic_tensor_slice_copy.hpp" +#include "blockwise_generic_tensor_slice_copy_deprecated.hpp" #include "blockwise_gemm.hpp" -#include "threadwise_generic_tensor_slice_copy.hpp" +#include "threadwise_generic_tensor_slice_copy_deprecated.hpp" namespace ck { @@ -251,8 +251,10 @@ struct GridwiseConvolutionImplicitGemm_v4r4_nchw_kcyx_nkhw_lds_double_buffer // LDS double buffer: preload data into LDS { - blockwise_in_copy.Run(p_in_global, p_in_block_double); - blockwise_wei_copy.Run(p_wei_global, p_wei_block_double); + blockwise_in_copy.template Run(p_in_global, + p_in_block_double); + blockwise_wei_copy.template Run(p_wei_global, + p_wei_block_double); } // LDS double buffer: main body @@ -274,51 +276,54 @@ struct GridwiseConvolutionImplicitGemm_v4r4_nchw_kcyx_nkhw_lds_double_buffer Float* p_wei_block_next = even_loop ? p_wei_block_double + wei_block_space : p_wei_block_double; - Float p_in_register_buffer[blockwise_in_copy.GetRegisterBufferSize()]; - Float p_wei_register_buffer[blockwise_wei_copy.GetRegisterBufferSize()]; + Float p_in_thread_buffer[blockwise_in_copy.GetThreadBufferSize()]; + Float p_wei_thread_buffer[blockwise_wei_copy.GetThreadBufferSize()]; blockwise_in_copy.MoveSrcSliceWindow(Sequence{}, True); - p_wei_block_on_global += EPerBlock * wei_e_k_global_desc.GetStrides()[0]; + blockwise_wei_copy.MoveSrcSliceWindow(Sequence{}, True); __syncthreads(); // LDS doubel buffer: load next data from device mem - blockwise_in_copy.RunLoadRegisterBuffer(p_in_global, p_in_register_buffer); - blockwise_wei_copy.RunLoadRegisterBuffer(p_wei_block_on_global, - p_wei_register_buffer); + blockwise_in_copy.template RunLoadThreadBuffer( + p_in_global, p_in_thread_buffer); + blockwise_wei_copy.template RunLoadThreadBuffer( + p_wei_global, p_wei_thread_buffer); // LDS double buffer: GEMM on current data blockwise_gemm.Run(p_wei_block_now, p_in_block_now, p_out_thread); // LDS double buffer: store next data to LDS - blockwise_in_copy.RunStoreRegisterBuffer(p_in_register_buffer, p_in_block_next); - blockwise_wei_copy.RunStoreRegisterBuffer(p_wei_register_buffer, p_wei_block_next); + blockwise_in_copy.RunStoreThreadBuffer(p_in_thread_buffer, p_in_block_next); + blockwise_wei_copy.RunStoreThreadBuffer(p_wei_thread_buffer, p_wei_block_next); } } // LDS double buffer: tail { - Float p_in_register_buffer[blockwise_in_copy.GetRegisterBufferSize()]; - Float p_wei_register_buffer[blockwise_wei_copy.GetRegisterBufferSize()]; + Float p_in_thread_buffer[blockwise_in_copy.GetThreadBufferSize()]; + Float p_wei_thread_buffer[blockwise_wei_copy.GetThreadBufferSize()]; // even iteration blockwise_in_copy.MoveSrcSliceWindow(Sequence{}, True); - p_wei_block_on_global += EPerBlock * wei_e_k_global_desc.GetStrides()[0]; + blockwise_wei_copy.MoveSrcSliceWindow(Sequence{}, True); __syncthreads(); // LDS doubel buffer: load next data from device mem - blockwise_in_copy.RunLoadRegisterBuffer(p_in_global, p_in_register_buffer); - 
blockwise_wei_copy.RunLoadRegisterBuffer(p_wei_block_on_global, p_wei_register_buffer); + blockwise_in_copy.template RunLoadThreadBuffer( + p_in_global, p_in_thread_buffer); + blockwise_wei_copy.template RunLoadThreadBuffer( + p_wei_global, p_wei_thread_buffer); // LDS double buffer: GEMM on current data blockwise_gemm.Run(p_wei_block_double, p_in_block_double, p_out_thread); // LDS double buffer: store next data to LDS - blockwise_in_copy.RunStoreRegisterBuffer(p_in_register_buffer, - p_in_block_double + in_block_space); - blockwise_wei_copy.RunStoreRegisterBuffer(p_wei_register_buffer, - p_wei_block_double + wei_block_space); + blockwise_in_copy.RunStoreThreadBuffer(p_in_thread_buffer, + p_in_block_double + in_block_space); + blockwise_wei_copy.RunStoreThreadBuffer(p_wei_thread_buffer, + p_wei_block_double + wei_block_space); // odd iteration __syncthreads(); @@ -385,7 +390,15 @@ struct GridwiseConvolutionImplicitGemm_v4r4_nchw_kcyx_nkhw_lds_double_buffer for(index_t nrepeat = 0; nrepeat < GemmNRepeat; ++nrepeat) { - threadwise_out_copy.Run(p_out_thread, p_out_global); + threadwise_out_copy +#if 1 + .Run +#else // tweaking + .template Run_optimized_address_calculation +#endif + (p_out_thread, p_out_global); threadwise_out_copy.MoveSrcSliceWindow(Sequence<0, 0, GemmNPerThreadSubC>{}, True); threadwise_out_copy.MoveDstSliceWindow(Sequence<0, 0, B1>{}, True); diff --git a/composable_kernel/include/tensor_description/tensor_coordinate.hpp b/composable_kernel/include/tensor_description/tensor_coordinate.hpp index 223d0d5bed..5114b2ce99 100644 --- a/composable_kernel/include/tensor_description/tensor_coordinate.hpp +++ b/composable_kernel/include/tensor_description/tensor_coordinate.hpp @@ -1,340 +1,210 @@ -#ifndef CK_TENSOR_COORDINATE_HPP -#define CK_TENSOR_COORDINATE_HPP +#ifndef CK_TENSOR_COORDINATE_V2_HPP +#define CK_TENSOR_COORDINATE_V2_HPP #include "common_header.hpp" -#include "ConstantTensorDescriptor.hpp" -#include "ConstantMergedTensorDescriptor.hpp" +#include "dimension.hpp" +#include "multi_index_transform.hpp" +#include "tensor_descriptor.hpp" namespace ck { -// TensorDesc is ConstantTensorDescriptor -template -struct NormalTensorCoordinate +template +struct TensorCoordinate; + +template +struct NativeTensorCoordinate { - using type = NormalTensorCoordinate; - using tensor_desc_type = TensorDesc; - + using type = NativeTensorCoordinate; + using tensor_desc_type = NativeTensorDesc; static constexpr index_t nDim = tensor_desc_type::GetNumOfDimension(); + using Index = MultiIndex; - __host__ __device__ constexpr NormalTensorCoordinate(Array tensor_index) - : mOffset{tensor_desc_type::GetOffsetFromMultiIndex(tensor_index)} + __host__ __device__ constexpr NativeTensorCoordinate(Index idx) + : mIndex(idx), mOffset(tensor_desc_type::CalculateOffset(idx)) { } - template - __host__ __device__ constexpr NormalTensorCoordinate(Xs... xs) - : NormalTensorCoordinate(Array{xs...}) + template + __host__ __device__ constexpr NativeTensorCoordinate(Xs... 
xs) + : NativeTensorCoordinate(Index{xs...}) { } template - __host__ __device__ constexpr NormalTensorCoordinate(Sequence) - : NormalTensorCoordinate(Array{Xs...}) + __host__ __device__ constexpr NativeTensorCoordinate(Sequence) + : NativeTensorCoordinate(Index{Xs...}) { } - __host__ __device__ constexpr index_t GetOffset() const { return mOffset; } + __host__ __device__ static constexpr auto GetTensorDescriptor() { return tensor_desc_type{}; } - // T is Array or Sequence - template - __host__ __device__ type operator+=(T step_sizes) + __host__ __device__ constexpr const Index& GetIndex() const { return mIndex; } + + __host__ __device__ constexpr const index_t& GetOffset() const { return mOffset; } + + __host__ __device__ constexpr type operator+=(const Index& idx_diff) { - static_assert(is_same{} && T::GetSize() == nDim, "wrong!"); + // mIndex is updated here, but some (or all) of its entries may never be used + // compiler should remove those entries as dead code + mIndex += idx_diff; - mOffset += tensor_desc_type::GetOffsetFromMultiIndex(step_sizes); + mOffset += tensor_desc_type::CalculateOffsetDiff(idx_diff); return *this; } - template - __host__ __device__ type operator-=(T step_sizes) + __host__ __device__ constexpr type operator-=(const Index& idx_diff) { - static_assert(is_same{} && T::GetSize() == nDim, "wrong!"); + // mIndex is updated here, but some (or all) of its entries may never be used + // compiler should remove those entries as dead code + mIndex -= idx_diff; - mOffset -= tensor_desc_type::GetOffsetFromMultiIndex(step_sizes); + mOffset -= tensor_desc_type::CalculateOffsetDiff(idx_diff); return *this; } - template - __host__ __device__ constexpr type operator+(T step_sizes) const + __host__ __device__ constexpr type operator+(const Index& idx_diff) const { type coord = *this; - coord += step_sizes; + coord += idx_diff; return coord; } - template - __host__ __device__ constexpr type operator-(T step_sizes) const + __host__ __device__ constexpr type operator-(const Index& idx_diff) const { type coord = *this; - coord -= step_sizes; + coord -= idx_diff; return coord; } - // reposition point of origin, and return compensated offset. - // This is a hack to reduce index calculation during looping over - // a tensor whose origin is this TensorCoordinate. It does so, by spitting - // out the run-time offset to the pointer (to the tensor data) held by this - // TensorCoordiante, so the caller can add the offset into the run-time pointer of - // the data, so only 1 run-time variable (update pointer) is needed, instead - // of 2 run-time variables (old pointer and this offset) - // TODO: after introducing the concept of "run-time tensor view", which contains the - // run-time pointer to the data, always keep track of the pointer, instead of both - // offset and the pointer. This also bring additional benefit that we don't need to - // worry the offset might underflow (because offset is unsigned integer) when updating it. - __host__ __device__ constexpr index_t RepositionOrigin() - { - index_t offset_diff = mOffset; - mOffset = 0; - return offset_diff; - } + __host__ __device__ static constexpr bool IsUpperIndexMappedToValidOffset() { return true; } private: + // mIndex may be saved and updated, however, the value of some (or all) of its entries may + // never be used. Compiler should be able to remove these entries as well as its calculation + // as dead code. 
+ // TODO: make sure compiler indeed remove these dead code + Index mIndex; index_t mOffset; }; -// TensorDesc is ConstantMergedTensorDescriptor -template -struct MergedTensorCoordinate +template +struct TransformedTensorCoordinate { - using type = MergedTensorCoordinate; - using tensor_desc_type = TensorDesc; - + using tensor_desc_type = TransformedTensorDesc; + using LowerCoord = + typename TensorCoordinate::type; + using UpperCoord = TransformedTensorCoordinate; static constexpr index_t nDim = tensor_desc_type::GetNumOfDimension(); - static constexpr index_t nOriginalDim = - tensor_desc_type::GetOriginalTensorDescriptor().GetNumOfDimension(); + using UpperIndex = MultiIndex; - __host__ __device__ constexpr MergedTensorCoordinate(Array tensor_index) - : mOriginalIndex{tensor_desc_type::GetOriginalMultiIndexFromMultiIndex(tensor_index)} - { - // partial offset on each dimension - static_for<0, nDim, 1>{}([&](auto idim) { - constexpr auto partial_original_dims = - tensor_desc_type::GetContainedOriginalDimensions(idim); - - constexpr auto partial_original_desc = - tensor_desc_type::GetOriginalTensorDescriptor().Extract(partial_original_dims); - - mPartialOffsets(idim) = partial_original_desc.GetOffsetFromMultiIndex( - extract_array(mOriginalIndex, partial_original_dims)); - }); - - // complete offset - mOffset = - accumulate_on_array(mPartialOffsets, math::plus{}, static_cast(0)); - } - - template - __host__ __device__ constexpr MergedTensorCoordinate(Xs... xs) - : MergedTensorCoordinate(Array{xs...}) + __host__ __device__ constexpr TransformedTensorCoordinate(UpperIndex idx) + : mIndexUp{idx}, mCoordLow{tensor_desc_type::CalculateLowerIndex(idx)} { } - __host__ __device__ constexpr index_t GetOffset() const { return mOffset; } - - template - __host__ __device__ void - MoveOnDimension(IDim idim_, T step_size, integral_constant) + template + __host__ __device__ constexpr TransformedTensorCoordinate(Xs... xs) + : TransformedTensorCoordinate(UpperIndex{xs...}) { - constexpr auto idim = idim_; - - // if step_size is known at compile time - static_if::value>{}( - [&](auto) { static_if{}([&](auto) { return; }); }); - - // update original index - static_if{}([&](auto) { - constexpr auto partial_original_dims = - tensor_desc_type::GetContainedOriginalDimensions(idim); - - constexpr index_t ndim_partial_original = partial_original_dims.GetSize(); - - constexpr auto partial_original_desc = - tensor_desc_type::GetOriginalTensorDescriptor().Extract(partial_original_dims); - - const auto partial_original_step_sizes = - partial_original_desc.GetMultiIndexFrom1dIndex(step_size); - - // update partial original multi-id - auto partial_original_id = extract_array(mOriginalIndex, partial_original_dims); - - static_if{}([&](auto) { - partial_original_id += partial_original_step_sizes; - - bool carry = false; - - // do carry check in reversed order, starting from lowest dimension - // don't check the highest dimension - static_for<0, ndim_partial_original - 1, 1>{}([&](auto IReverse) { - constexpr index_t i = ndim_partial_original - 1 - IReverse; - - if(carry) - { - ++partial_original_id(i); - } - - carry = false; - - if(partial_original_id[i] >= partial_original_desc.GetLength(i)) - { - partial_original_id(i) -= partial_original_desc.GetLength(i); - carry = true; - } - }); - - // highest dimension - if(carry) - { - ++partial_original_id(0); - } - }).Else([&](auto) { - // shift up multi-id to avoid unsigned integer underflow during intermediate - // calculations. After the shift, should have new_multi_id[...] 
>= 1 - partial_original_id += - partial_original_desc.GetLengths() - partial_original_step_sizes; - - bool borrow = false; - - // do borrow check in reversed order, starting from lowest dimension - // don't check the highest dimension - static_for<0, ndim_partial_original - 1, 1>{}([&](auto IReverse) { - constexpr index_t i = ndim_partial_original - 1 - IReverse; - - if(borrow) - { - --partial_original_id(i); - } - - borrow = false; - - if(partial_original_id[i] < partial_original_desc.GetLength(i)) - { - partial_original_id(i) += partial_original_desc.GetLength(i); - borrow = true; - } - }); - - // highest dimension - if(borrow) - { - --partial_original_id(0); - } - - // shift back down multi-id - // here, should have new_multi_id[...] >= GetLengths() - partial_original_id = partial_original_id - partial_original_desc.GetLengths(); - }); - - // update "mOriginalIndex" - static_for<0, ndim_partial_original, 1>{}([&](auto I) { - constexpr auto idim_original = partial_original_dims[I]; - - mOriginalIndex(idim_original) = partial_original_id[I]; - }); - - // calculate new partial offset on this merged dimension - const index_t old_partial_offset = mPartialOffsets[idim]; - - mPartialOffsets(idim) = - partial_original_desc.GetOffsetFromMultiIndex(partial_original_id); - - // update "mThreadSrcOffset", do "+" before "-" to avoid underflow - mOffset = (mOffset + mPartialOffsets[idim]) - old_partial_offset; - }).Else([&](auto fwd) { - static_if{}([&](auto) { - mOffset += step_size * fwd(tensor_desc_type{}).GetStride(idim); - }).Else([&](auto) { mOffset -= step_size * fwd(tensor_desc_type{}).GetStride(idim); }); - }); } - // T is Array or Sequence - template - __host__ __device__ type operator+=(T step_sizes) + template + __host__ __device__ constexpr TransformedTensorCoordinate(Sequence) + : TransformedTensorCoordinate(UpperIndex{Xs...}) { - static_assert(is_same{} && T::GetSize() == nDim, "wrong!"); + } - static_for<0, nDim, 1>{}([&](auto idim) { - // compiler should remove dead code path, because step_sizes is known at - // compile time - if(step_sizes[idim] != 0) - { - this->MoveOnDimension(idim, step_sizes[idim], integral_constant{}); - } - }); + __host__ __device__ static constexpr auto GetTensorDescriptor() { return tensor_desc_type{}; } + + __host__ __device__ constexpr const LowerCoord& GetLowerCoordinate() const { return mCoordLow; } + + __host__ __device__ constexpr const UpperIndex& GetUpperIndex() const { return mIndexUp; } + + __host__ __device__ constexpr const UpperIndex& GetIndex() const { return GetUpperIndex(); } + + __host__ __device__ constexpr const index_t& GetOffset() const + { + return GetLowerCoordinate().GetOffset(); + } + + __host__ __device__ constexpr UpperCoord operator+=(const UpperIndex& idx_up_diff) + { + // For transformation of multi-index difference, not all transformation functions need to + // know the old lower-index or the old upper-index. We pass both of them to the + // transformation function. The transformation function itself decides to use them or not. 
+ mCoordLow += tensor_desc_type::CalculateLowerIndexDiff( + idx_up_diff, GetIndex(), GetLowerCoordinate().GetIndex()); + + // mIndexUp is updated here, but some (or all) of its entries may never be used + // compiler should remove those entries as dead code + mIndexUp += idx_up_diff; return *this; } - template - __host__ __device__ type operator-=(T step_sizes) + __host__ __device__ constexpr UpperCoord operator-=(const UpperIndex& idx_up_diff) { - static_assert(is_same{} && T::GetSize() == nDim, "wrong!"); + mCoordLow -= tensor_desc_type::CalculateLowerIndexDiff( + idx_up_diff, GetIndex(), GetLowerCoordinate().GetIndex()); - static_for<0, nDim, 1>{}([&](auto idim) { - // compiler should remove dead code path, because step_sizes is known at - // compile time - if(step_sizes[idim] != 0) - { - this->MoveOnDimension(idim, step_sizes[idim], integral_constant{}); - } - }); + // mIndex is updated here, but some (or all) of its entries may never be used + // compiler should remove those entries as dead code + mIndexUp -= idx_up_diff; return *this; } - template - __host__ __device__ constexpr type operator+(T step_sizes) const + __host__ __device__ constexpr UpperCoord operator+(const UpperIndex& idx_up_diff) const { - type coord = *this; - coord += step_sizes; - return coord; + UpperCoord coord_up = *this; + coord_up += idx_up_diff; + return coord_up; } - template - __host__ __device__ constexpr type operator-(T step_sizes) const + __host__ __device__ constexpr UpperCoord operator-(const UpperIndex& idx_up_diff) const { - type coord = *this; - coord -= step_sizes; - return coord; + UpperCoord coord_up = *this; + coord_up -= idx_up_diff; + return coord_up; } - __host__ __device__ static constexpr index_t RepositionOrigin() { return 0; } + // this function should be inexpensive, because there is no upper-to-lower index transformation + __host__ __device__ constexpr bool IsUpperIndexMappedToValidOffset() const + { + return tensor_desc_type::IsUpperIndexMappedToValidLowerIndex(GetIndex()) && + mCoordLow.IsUpperIndexMappedToValidOffset(); + } private: - // Allocate register memory for all merged dimensions and normal dimensions. - // However, only those merged dimensions, whose index will be involved in arithmetic - // after the construction of this TensorCoordinate (e.g. when user move a slicing - // window on the merged dimension), will use these register memory. - // Let's hope compiler will optimize away those register memory allocated for normal - // dimensions, and those merged dimensions, that would never be involved in index - // arithmetic after construction of TensorCoordinate. - // TODO: refactor TensorCoordinate, after introducing the concept of "dimensions" - // and simplify implementation of ConstantMergedTensorDescriptor, so we don't need to - // count on compiler to optimize away those register memory for us - Array mOriginalIndex; - Array mPartialOffsets; - - // complete offset - index_t mOffset; + // mIndexUp may be calculated and updated, however, the value of some (or all) of its entries + // may + // never be used. Compiler should be able to remove these entries as well as its calculation + // as dead code. 
+ // TODO: make sure compiler indeed remove these dead code + UpperIndex mIndexUp; + LowerCoord mCoordLow; }; -template +template struct TensorCoordinate { private: - template + template __host__ __device__ static constexpr auto - MakeDummyTensorCoordinate(ConstantTensorDescriptor) + MakeDummyTensorCoordinate(NativeTensorDescriptor) { - return NormalTensorCoordinate>(); + return NativeTensorCoordinate>( + make_zero_array()); } - template + template __host__ __device__ static constexpr auto - MakeDummyTensorCoordinate(ConstantMergedTensorDescriptor) + MakeDummyTensorCoordinate(TransformedTensorDescriptor) { - return MergedTensorCoordinate>(); + return TransformedTensorCoordinate>( + make_zero_array()); } public: diff --git a/composable_kernel/include/tensor_description/tensor_coordinate_deprecated.hpp b/composable_kernel/include/tensor_description/tensor_coordinate_deprecated.hpp new file mode 100644 index 0000000000..46e551ddd4 --- /dev/null +++ b/composable_kernel/include/tensor_description/tensor_coordinate_deprecated.hpp @@ -0,0 +1,346 @@ +#ifndef CK_TENSOR_COORDINATE_DEPRECATED_HPP +#define CK_TENSOR_COORDINATE_DEPRECATED_HPP + +#include "common_header.hpp" +#include "ConstantTensorDescriptor.hpp" +#include "ConstantMergedTensorDescriptor.hpp" + +namespace ck { + +// TensorDesc is ConstantTensorDescriptor +template +struct NormalTensorCoordinate_deprecated +{ + using type = NormalTensorCoordinate_deprecated; + using tensor_desc_type = TensorDesc; + + static constexpr index_t nDim = tensor_desc_type::GetNumOfDimension(); + + __host__ + __device__ constexpr NormalTensorCoordinate_deprecated(Array tensor_index) + : mOffset{tensor_desc_type::GetOffsetFromMultiIndex(tensor_index)} + { + } + + template + __host__ __device__ constexpr NormalTensorCoordinate_deprecated(Xs... xs) + : NormalTensorCoordinate_deprecated(Array{xs...}) + { + } + + template + __host__ __device__ constexpr NormalTensorCoordinate_deprecated(Sequence) + : NormalTensorCoordinate_deprecated(Array{Xs...}) + { + } + + __host__ __device__ constexpr index_t GetOffset() const { return mOffset; } + + // T is Array or Sequence + template + __host__ __device__ type operator+=(T step_sizes) + { + static_assert(is_same{} && T::GetSize() == nDim, "wrong!"); + + mOffset += tensor_desc_type::GetOffsetFromMultiIndex(step_sizes); + + return *this; + } + + template + __host__ __device__ type operator-=(T step_sizes) + { + static_assert(is_same{} && T::GetSize() == nDim, "wrong!"); + + mOffset -= tensor_desc_type::GetOffsetFromMultiIndex(step_sizes); + + return *this; + } + + template + __host__ __device__ constexpr type operator+(T step_sizes) const + { + type coord = *this; + coord += step_sizes; + return coord; + } + + template + __host__ __device__ constexpr type operator-(T step_sizes) const + { + type coord = *this; + coord -= step_sizes; + return coord; + } + + // reposition point of origin, and return compensated offset. + // This is a hack to reduce index calculation during looping over + // a tensor whose origin is this TensorCoordinate. 
It does so, by spitting + // out the run-time offset to the pointer (to the tensor data) held by this + // TensorCoordiante, so the caller can add the offset into the run-time pointer of + // the data, so only 1 run-time variable (update pointer) is needed, instead + // of 2 run-time variables (old pointer and this offset) + // TODO: after introducing the concept of "run-time tensor view", which contains the + // run-time pointer to the data, always keep track of the pointer, instead of both + // offset and the pointer. This also bring additional benefit that we don't need to + // worry the offset might underflow (because offset is unsigned integer) when updating it. + __host__ __device__ constexpr index_t RepositionOrigin() + { + index_t offset_diff = mOffset; + mOffset = 0; + return offset_diff; + } + + private: + index_t mOffset; +}; + +// TensorDesc is ConstantMergedTensorDescriptor +template +struct MergedTensorCoordinate +{ + using type = MergedTensorCoordinate; + using tensor_desc_type = TensorDesc; + + static constexpr index_t nDim = tensor_desc_type::GetNumOfDimension(); + static constexpr index_t nOriginalDim = + tensor_desc_type::GetOriginalTensorDescriptor().GetNumOfDimension(); + + __host__ __device__ constexpr MergedTensorCoordinate(Array tensor_index) + : mOriginalIndex{tensor_desc_type::GetOriginalMultiIndexFromMultiIndex(tensor_index)} + { + // partial offset on each dimension + static_for<0, nDim, 1>{}([&](auto idim) { + constexpr auto partial_original_dims = + tensor_desc_type::GetContainedOriginalDimensions(idim); + + constexpr auto partial_original_desc = + tensor_desc_type::GetOriginalTensorDescriptor().Extract(partial_original_dims); + + mPartialOffsets(idim) = partial_original_desc.GetOffsetFromMultiIndex( + extract_array(mOriginalIndex, partial_original_dims)); + }); + + // complete offset + mOffset = + accumulate_on_array(mPartialOffsets, math::plus{}, static_cast(0)); + } + + template + __host__ __device__ constexpr MergedTensorCoordinate(Xs... 
xs) + : MergedTensorCoordinate(Array{xs...}) + { + } + + __host__ __device__ constexpr index_t GetOffset() const { return mOffset; } + + template + __host__ __device__ void + MoveOnDimension(IDim idim_, T step_size, integral_constant) + { + constexpr auto idim = idim_; + + // if step_size is known at compile time + static_if::value>{}( + [&](auto) { static_if{}([&](auto) { return; }); }); + + // update original index + static_if{}([&](auto) { + constexpr auto partial_original_dims = + tensor_desc_type::GetContainedOriginalDimensions(idim); + + constexpr index_t ndim_partial_original = partial_original_dims.GetSize(); + + constexpr auto partial_original_desc = + tensor_desc_type::GetOriginalTensorDescriptor().Extract(partial_original_dims); + + const auto partial_original_step_sizes = + partial_original_desc.GetMultiIndexFrom1dIndex(step_size); + + // update partial original multi-id + auto partial_original_id = extract_array(mOriginalIndex, partial_original_dims); + + static_if{}([&](auto) { + partial_original_id += partial_original_step_sizes; + + bool carry = false; + + // do carry check in reversed order, starting from lowest dimension + // don't check the highest dimension + static_for<0, ndim_partial_original - 1, 1>{}([&](auto IReverse) { + constexpr index_t i = ndim_partial_original - 1 - IReverse; + + if(carry) + { + ++partial_original_id(i); + } + + carry = false; + + if(partial_original_id[i] >= partial_original_desc.GetLength(i)) + { + partial_original_id(i) -= partial_original_desc.GetLength(i); + carry = true; + } + }); + + // highest dimension + if(carry) + { + ++partial_original_id(0); + } + }).Else([&](auto) { + // shift up multi-id to avoid unsigned integer underflow during intermediate + // calculations. After the shift, should have new_multi_id[...] >= 1 + partial_original_id += + partial_original_desc.GetLengths() - partial_original_step_sizes; + + bool borrow = false; + + // do borrow check in reversed order, starting from lowest dimension + // don't check the highest dimension + static_for<0, ndim_partial_original - 1, 1>{}([&](auto IReverse) { + constexpr index_t i = ndim_partial_original - 1 - IReverse; + + if(borrow) + { + --partial_original_id(i); + } + + borrow = false; + + if(partial_original_id[i] < partial_original_desc.GetLength(i)) + { + partial_original_id(i) += partial_original_desc.GetLength(i); + borrow = true; + } + }); + + // highest dimension + if(borrow) + { + --partial_original_id(0); + } + + // shift back down multi-id + // here, should have new_multi_id[...] 
>= GetLengths() + partial_original_id = partial_original_id - partial_original_desc.GetLengths(); + }); + + // update "mOriginalIndex" + static_for<0, ndim_partial_original, 1>{}([&](auto I) { + constexpr auto idim_original = partial_original_dims[I]; + + mOriginalIndex(idim_original) = partial_original_id[I]; + }); + + // calculate new partial offset on this merged dimension + const index_t old_partial_offset = mPartialOffsets[idim]; + + mPartialOffsets(idim) = + partial_original_desc.GetOffsetFromMultiIndex(partial_original_id); + + // update "mThreadSrcOffset", do "+" before "-" to avoid underflow + mOffset = (mOffset + mPartialOffsets[idim]) - old_partial_offset; + }).Else([&](auto fwd) { + static_if{}([&](auto) { + mOffset += step_size * fwd(tensor_desc_type{}).GetStride(idim); + }).Else([&](auto) { mOffset -= step_size * fwd(tensor_desc_type{}).GetStride(idim); }); + }); + } + + // T is Array or Sequence + template + __host__ __device__ type operator+=(T step_sizes) + { + static_assert(is_same{} && T::GetSize() == nDim, "wrong!"); + + static_for<0, nDim, 1>{}([&](auto idim) { + // compiler should remove dead code path, because step_sizes is known at + // compile time + if(step_sizes[idim] != 0) + { + this->MoveOnDimension(idim, step_sizes[idim], integral_constant{}); + } + }); + + return *this; + } + + template + __host__ __device__ type operator-=(T step_sizes) + { + static_assert(is_same{} && T::GetSize() == nDim, "wrong!"); + + static_for<0, nDim, 1>{}([&](auto idim) { + // compiler should remove dead code path, because step_sizes is known at + // compile time + if(step_sizes[idim] != 0) + { + this->MoveOnDimension(idim, step_sizes[idim], integral_constant{}); + } + }); + + return *this; + } + + template + __host__ __device__ constexpr type operator+(T step_sizes) const + { + type coord = *this; + coord += step_sizes; + return coord; + } + + template + __host__ __device__ constexpr type operator-(T step_sizes) const + { + type coord = *this; + coord -= step_sizes; + return coord; + } + + __host__ __device__ static constexpr index_t RepositionOrigin() { return 0; } + + private: + // Allocate register memory for all merged dimensions and normal dimensions. + // However, only those merged dimensions, whose index will be involved in arithmetic + // after the construction of this TensorCoordinate (e.g. when user move a slicing + // window on the merged dimension), will use these register memory. + // Let's hope compiler will optimize away those register memory allocated for normal + // dimensions, and those merged dimensions, that would never be involved in index + // arithmetic after construction of TensorCoordinate. 
+ // TODO: refactor TensorCoordinate, after introducing the concept of "dimensions" + // and simplify implementation of ConstantMergedTensorDescriptor, so we don't need to + // count on compiler to optimize away those register memory for us + Array mOriginalIndex; + Array mPartialOffsets; + + // complete offset + index_t mOffset; +}; + +template +struct TensorCoordinate_deprecated +{ + private: + template + __host__ __device__ static constexpr auto + MakeDummyTensorCoordinate(ConstantTensorDescriptor) + { + return NormalTensorCoordinate_deprecated>(); + } + + template + __host__ __device__ static constexpr auto + MakeDummyTensorCoordinate(ConstantMergedTensorDescriptor) + { + return MergedTensorCoordinate>(); + } + + public: + using type = decltype(MakeDummyTensorCoordinate(TensorDesc{})); +}; + +} // namespace ck +#endif diff --git a/composable_kernel/include/tensor_description/tensor_coordinate_helper.hpp b/composable_kernel/include/tensor_description/tensor_coordinate_helper.hpp index 2b0550f8ab..93cb077c24 100644 --- a/composable_kernel/include/tensor_description/tensor_coordinate_helper.hpp +++ b/composable_kernel/include/tensor_description/tensor_coordinate_helper.hpp @@ -9,7 +9,7 @@ template __host__ __device__ constexpr auto make_tensor_coordinate_v2(TensorDesc, MultiIndex idx) { - return typename TensorCoordinate_v2::type(idx); + return typename TensorCoordinate::type(idx); } } // namespace ck diff --git a/composable_kernel/include/tensor_description/tensor_coordinate_v2.hpp b/composable_kernel/include/tensor_description/tensor_coordinate_v2.hpp deleted file mode 100644 index cbb9a703df..0000000000 --- a/composable_kernel/include/tensor_description/tensor_coordinate_v2.hpp +++ /dev/null @@ -1,215 +0,0 @@ -#ifndef CK_TENSOR_COORDINATE_V2_HPP -#define CK_TENSOR_COORDINATE_V2_HPP - -#include "common_header.hpp" -#include "dimension.hpp" -#include "multi_index_transform.hpp" -#include "tensor_descriptor.hpp" - -namespace ck { - -template -struct TensorCoordinate_v2; - -template -struct NativeTensorCoordinate -{ - using type = NativeTensorCoordinate; - using tensor_desc_type = NativeTensorDesc; - static constexpr index_t nDim = tensor_desc_type::GetNumOfDimension(); - using Index = MultiIndex; - - __host__ __device__ constexpr NativeTensorCoordinate(Index idx) - : mIndex(idx), mOffset(tensor_desc_type::CalculateOffset(idx)) - { - } - - template - __host__ __device__ constexpr NativeTensorCoordinate(Xs... 
xs) - : NativeTensorCoordinate(Index{xs...}) - { - } - - template - __host__ __device__ constexpr NativeTensorCoordinate(Sequence) - : NativeTensorCoordinate(Index{Xs...}) - { - } - - __host__ __device__ static constexpr auto GetTensorDescriptor() { return tensor_desc_type{}; } - - __host__ __device__ constexpr const Index& GetIndex() const { return mIndex; } - - __host__ __device__ constexpr const index_t& GetOffset() const { return mOffset; } - - __host__ __device__ constexpr type operator+=(const Index& idx_diff) - { - // mIndex is updated here, but some (or all) of its entries may never be used - // compiler should remove those entries as dead code - mIndex += idx_diff; - - mOffset += tensor_desc_type::CalculateOffsetDiff(idx_diff); - - return *this; - } - - __host__ __device__ constexpr type operator-=(const Index& idx_diff) - { - // mIndex is updated here, but some (or all) of its entries may never be used - // compiler should remove those entries as dead code - mIndex -= idx_diff; - - mOffset -= tensor_desc_type::CalculateOffsetDiff(idx_diff); - - return *this; - } - - __host__ __device__ constexpr type operator+(const Index& idx_diff) const - { - type coord = *this; - coord += idx_diff; - return coord; - } - - __host__ __device__ constexpr type operator-(const Index& idx_diff) const - { - type coord = *this; - coord -= idx_diff; - return coord; - } - - __host__ __device__ static constexpr bool IsUpperIndexMappedToValidOffset() { return true; } - - private: - // mIndex may be saved and updated, however, the value of some (or all) of its entries may - // never be used. Compiler should be able to remove these entries as well as its calculation - // as dead code. - // TODO: make sure compiler indeed remove these dead code - Index mIndex; - index_t mOffset; -}; - -template -struct TransformedTensorCoordinate -{ - using tensor_desc_type = TransformedTensorDesc; - using LowerCoord = - typename TensorCoordinate_v2::type; - using UpperCoord = TransformedTensorCoordinate; - static constexpr index_t nDim = tensor_desc_type::GetNumOfDimension(); - using UpperIndex = MultiIndex; - - __host__ __device__ constexpr TransformedTensorCoordinate(UpperIndex idx) - : mIndexUp{idx}, mCoordLow{tensor_desc_type::CalculateLowerIndex(idx)} - { - } - - template - __host__ __device__ constexpr TransformedTensorCoordinate(Xs... xs) - : TransformedTensorCoordinate(UpperIndex{xs...}) - { - } - - template - __host__ __device__ constexpr TransformedTensorCoordinate(Sequence) - : TransformedTensorCoordinate(UpperIndex{Xs...}) - { - } - - __host__ __device__ static constexpr auto GetTensorDescriptor() { return tensor_desc_type{}; } - - __host__ __device__ constexpr const LowerCoord& GetLowerCoordinate() const { return mCoordLow; } - - __host__ __device__ constexpr const UpperIndex& GetUpperIndex() const { return mIndexUp; } - - __host__ __device__ constexpr const UpperIndex& GetIndex() const { return GetUpperIndex(); } - - __host__ __device__ constexpr const index_t& GetOffset() const - { - return GetLowerCoordinate().GetOffset(); - } - - __host__ __device__ constexpr UpperCoord operator+=(const UpperIndex& idx_up_diff) - { - // For transformation of multi-index difference, not all transformation functions need to - // know the old lower-index or the old upper-index. We pass both of them to the - // transformation function. The transformation function itself decides to use them or not. 
- mCoordLow += tensor_desc_type::CalculateLowerIndexDiff( - idx_up_diff, GetIndex(), GetLowerCoordinate().GetIndex()); - - // mIndexUp is updated here, but some (or all) of its entries may never be used - // compiler should remove those entries as dead code - mIndexUp += idx_up_diff; - - return *this; - } - - __host__ __device__ constexpr UpperCoord operator-=(const UpperIndex& idx_up_diff) - { - mCoordLow -= tensor_desc_type::CalculateLowerIndexDiff( - idx_up_diff, GetIndex(), GetLowerCoordinate().GetIndex()); - - // mIndex is updated here, but some (or all) of its entries may never be used - // compiler should remove those entries as dead code - mIndexUp -= idx_up_diff; - - return *this; - } - - __host__ __device__ constexpr UpperCoord operator+(const UpperIndex& idx_up_diff) const - { - UpperCoord coord_up = *this; - coord_up += idx_up_diff; - return coord_up; - } - - __host__ __device__ constexpr UpperCoord operator-(const UpperIndex& idx_up_diff) const - { - UpperCoord coord_up = *this; - coord_up -= idx_up_diff; - return coord_up; - } - - // this function should be inexpensive, because there is no upper-to-lower index transformation - __host__ __device__ constexpr bool IsUpperIndexMappedToValidOffset() const - { - return tensor_desc_type::IsUpperIndexMappedToValidLowerIndex(GetIndex()) && - mCoordLow.IsUpperIndexMappedToValidOffset(); - } - - private: - // mIndexUp may be calculated and updated, however, the value of some (or all) of its entries - // may - // never be used. Compiler should be able to remove these entries as well as its calculation - // as dead code. - // TODO: make sure compiler indeed remove these dead code - UpperIndex mIndexUp; - LowerCoord mCoordLow; -}; - -template -struct TensorCoordinate_v2 -{ - private: - template - __host__ __device__ static constexpr auto - MakeDummyTensorCoordinate(NativeTensorDescriptor) - { - return NativeTensorCoordinate>( - make_zero_array()); - } - - template - __host__ __device__ static constexpr auto - MakeDummyTensorCoordinate(TransformedTensorDescriptor) - { - return TransformedTensorCoordinate>( - make_zero_array()); - } - - public: - using type = decltype(MakeDummyTensorCoordinate(TensorDesc{})); -}; - -} // namespace ck -#endif diff --git a/composable_kernel/include/tensor_description/tensor_view.hpp b/composable_kernel/include/tensor_description/tensor_view.hpp index b9a9a0ca03..7a3d4410d7 100644 --- a/composable_kernel/include/tensor_description/tensor_view.hpp +++ b/composable_kernel/include/tensor_description/tensor_view.hpp @@ -4,7 +4,7 @@ #include "common_header.hpp" #include "ConstantTensorDescriptor.hpp" #include "ConstantMergedTensorDescriptor.hpp" -#include "tensor_coordinate.hpp" +#include "tensor_coordinate_deprecated.hpp" namespace ck { @@ -14,7 +14,7 @@ struct NormalTensorView { using type = NormalTensorView; using tensor_desc_type = TensorDesc; - using coordinate_type = typename NormalTensorCoordinate::type; + using coordinate_type = typename NormalTensorCoordinate_deprecated::type; using data_type = TData; static constexpr auto nDim = TensorDesc::GetNumOfDimension(); diff --git a/composable_kernel/include/tensor_description/tensor_visit.hpp b/composable_kernel/include/tensor_description/tensor_visit.hpp index 1ff538d8a8..7754e9ed94 100644 --- a/composable_kernel/include/tensor_description/tensor_visit.hpp +++ b/composable_kernel/include/tensor_description/tensor_visit.hpp @@ -5,7 +5,7 @@ #include "dimension.hpp" #include "dimension_transform.hpp" #include "tensor_descriptor.hpp" -#include 
"tensor_coordinate_v2.hpp" +#include "tensor_coordinate.hpp" namespace ck { @@ -13,7 +13,7 @@ template struct TensorVisit { using Index = typename TensorDescriptor::Index; - using Coordinate = typename TensorCoordinate_v2::type; + using Coordinate = typename TensorCoordinate::type; __host__ __device__ static void Run_v1(Index idx_begin) { diff --git a/composable_kernel/include/tensor_operation/blockwise_generic_tensor_slice_copy.hpp b/composable_kernel/include/tensor_operation/blockwise_generic_tensor_slice_copy.hpp index b662440b1e..1650f06e86 100644 --- a/composable_kernel/include/tensor_operation/blockwise_generic_tensor_slice_copy.hpp +++ b/composable_kernel/include/tensor_operation/blockwise_generic_tensor_slice_copy.hpp @@ -4,680 +4,11 @@ #include "common_header.hpp" #include "tensor_descriptor.hpp" #include "tensor_descriptor_helper.hpp" -#include "tensor_coordinate_v2.hpp" +#include "tensor_coordinate.hpp" #include "threadwise_generic_tensor_slice_copy.hpp" -#ifndef CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_BLOCKWISE_GENERIC_SLICE_COPY_V1 -#define CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_BLOCKWISE_GENERIC_SLICE_COPY_V1 1 -#endif - namespace ck { -#if 0 - -// Slice a (normal or merged) tensor, and copy it into another (normal or merged) tensor -// memory layout (ordering of dimensions) can be different between src and dst. -// This functions assume each thread is reading and writing a normal (not merged) tensor, -// to simplify index calculations. To satisfy this assumption, the user need to make sure -// that, on a merged dimension that constains multiple original dimensions, the length of -// the last original dimension need to be evenly dividable by its sub-lengths. Also, the -// repeat-length on the merged dimension need to be 1. These sanity checks are performed -// in constructor of BlockwiseGenericTensorSliceCopy_v1 -template -struct BlockwiseGenericTensorSliceCopy_v1 -{ - static constexpr index_t nDim = SrcDesc::GetNumOfDimension(); - - static constexpr index_t nOriginalDimSrc = - SrcDesc::GetOriginalTensorDescriptor().GetNumOfDimension(); - static constexpr index_t nOriginalDimDst = - DstDesc::GetOriginalTensorDescriptor().GetNumOfDimension(); - - // per-thread offset - index_t mThreadSrcOffset; - index_t mThreadDstOffset; - - // "mThreadSrcOriginalMultiId", "mThreadSrcPartialOffsets, "mThreadDstOriginalMultiId", - // "mThreadDstPartialOffsets" are always calculated inside constructor, and would be - // updated if slicing-window is moved. However, they will not be used if you always move - // the slicing-window along a non-merged dimension. In that case, compiler should be - // able to remove these calculation. 
- // TODO: make sure compiler would actually remove them in that case - - // partial offset in each (merged) dimension - Array mThreadSrcPartialOffsets; - Array mThreadDstPartialOffsets; - - // multi-id of original tensor - Array mThreadSrcOriginalMultiId; - Array mThreadDstOriginalMultiId; - - __device__ BlockwiseGenericTensorSliceCopy_v1(Array src_block_data_id_begin, - Array dst_block_data_id_begin) - { - // check NDim consistency - static_assert( - nDim == SrcDesc::GetNumOfDimension() && nDim == DstDesc::GetNumOfDimension() && - nDim == SliceLengths::GetSize() && nDim == SubLengths::GetSize() && - nDim == ThreadClusterLengths::GetSize() && - nDim == ThreadClusterArrangeOrder::GetSize() && - nDim == SrcDimAccessOrder::GetSize() && nDim == DstDimAccessOrder::GetSize(), - "wrong"); - - // check thread arrange order and read/write access order are valid - static_assert(is_valid_sequence_map::value && - is_valid_sequence_map::value && - is_valid_sequence_map::value, - "wrong!"); - - // thread cluster - constexpr auto thread_cluster_desc = make_ConstantTensorDescriptor_packed( - ThreadClusterLengths::ReorderGivenNew2Old(ThreadClusterArrangeOrder{})); - - // BlockSize - static_assert(BlockSize == thread_cluster_desc.GetElementSize(), "wrong! BlockSize"); - - // divide work - constexpr auto data_per_cluster_per_dims = SubLengths{} * ThreadClusterLengths{}; - - static_for<0, nDim, 1>{}([&](auto IDim) { - static_assert(SliceLengths::Get(IDim) % data_per_cluster_per_dims.Get(IDim) == 0, - "wrong! cannot evenly divide sliced tensor into cluster"); - }); - - constexpr auto repeat_lengths = SliceLengths{} / data_per_cluster_per_dims; - - // additional check for merged dimension - static_for<0, nDim, 1>{}([&](auto IDim_) { - // src - static_if{}([&](auto) { - constexpr auto IDim = decltype(IDim_){}; - - // on a merged dimension that constains multiple original dimensions, - // the length of the last original dimension need to evenly dividable by its - // sub-length, - // so each thread is effectively reading a normal (not merged) tensor - constexpr auto idim_last_original_src = - SrcDesc::GetContainedOriginalDimensions(IDim).Back(); - static_assert( - SrcDesc::GetOriginalTensorDescriptor().GetLength(idim_last_original_src) % - SubLengths::Get(IDim) == - 0, - "wrong!"); - - // merged dimension should have repeat_lengths = 1 - static_assert(repeat_lengths[IDim] == 1, - "wrong! repeat_lengths shoud be 1 on merged dimension"); - }); - - // dst - static_if{}([&](auto) { - constexpr auto IDim = decltype(IDim_){}; - - // on a merged dimension that constains multiple original dimensions, - // the length of the last original dimension need to evenly dividable by its - // sub-length, - // so each thread is effectively reading a normal (not merged) tensor - constexpr auto idim_last_original_dst = - DstDesc::GetContainedOriginalDimensions(IDim).Back(); - static_assert( - DstDesc::GetOriginalTensorDescriptor().GetLength(idim_last_original_dst) % - SubLengths::Get(IDim) == - 0, - "wrong!"); - - // merged dimension should have repeat_lengths = 1 - static_assert(repeat_lengths[IDim] == 1, - "wrong! 
repeat_lengths shoud be 1 on merged dimension"); - }); - }); - - // calculate mThreadSrcOffset, mThreadDstOffset - const auto thread_cluster_id = - thread_cluster_desc.GetMultiIndexFrom1dIndex(get_thread_local_1d_id()); - - const auto data_cluster_id = - reorder_array_given_old2new(thread_cluster_id, ThreadClusterArrangeOrder{}); - - const auto thread_data_id_begin = data_cluster_id * SubLengths{}; - - // original multi-id - mThreadSrcOriginalMultiId = SrcDesc::GetOriginalMultiIndexFromMultiIndex( - src_block_data_id_begin + thread_data_id_begin); - - mThreadDstOriginalMultiId = DstDesc::GetOriginalMultiIndexFromMultiIndex( - dst_block_data_id_begin + thread_data_id_begin); - - // partial offset on each dimension - static_for<0, nDim, 1>{}([&](auto IDim) { - constexpr auto src_partial_original_dims = - SrcDesc::GetContainedOriginalDimensions(IDim); - - constexpr auto src_partial_original_desc = - SrcDesc::GetOriginalTensorDescriptor().Extract(src_partial_original_dims); - - mThreadSrcPartialOffsets(IDim) = src_partial_original_desc.GetOffsetFromMultiIndex( - extract_array(mThreadSrcOriginalMultiId, src_partial_original_dims)); - }); - - static_for<0, nDim, 1>{}([&](auto IDim) { - constexpr auto dst_partial_original_dims = - DstDesc::GetContainedOriginalDimensions(IDim); - - constexpr auto dst_partial_original_desc = - DstDesc::GetOriginalTensorDescriptor().Extract(dst_partial_original_dims); - - mThreadDstPartialOffsets(IDim) = dst_partial_original_desc.GetOffsetFromMultiIndex( - extract_array(mThreadDstOriginalMultiId, dst_partial_original_dims)); - }); - - // complete offset - mThreadSrcOffset = accumulate_on_array( - mThreadSrcPartialOffsets, math::plus{}, static_cast(0)); - - mThreadDstOffset = accumulate_on_array( - mThreadDstPartialOffsets, math::plus{}, static_cast(0)); - } - - __device__ static constexpr auto GetRegisterBufferDescriptor() - { - constexpr auto repeat_lengths = SliceLengths{} / (SubLengths{} * ThreadClusterLengths{}); - - return make_ConstantTensorDescriptor_packed(SubLengths{} * repeat_lengths); - } - - __device__ static constexpr index_t GetRegisterBufferSize() - { - return GetRegisterBufferDescriptor().GetElementSpace(); - } - - template - __device__ void RunLoadRegisterBuffer(const TData* __restrict__ p_src, - TData* __restrict__ p_buffer) const - { - constexpr auto thread_sub_tensor_lengths = SubLengths{}; - - constexpr auto data_per_cluster_per_dims = - thread_sub_tensor_lengths * ThreadClusterLengths{}; - - constexpr auto repeat_lengths = SliceLengths{} / (SubLengths{} * ThreadClusterLengths{}); - - constexpr auto thread_buffer_desc = GetRegisterBufferDescriptor(); - -#if CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_BLOCKWISE_GENERIC_SLICE_COPY_V1 - static_ford{}([&](auto repeat_id) { - constexpr auto src_thread_data_id_begin = repeat_id * data_per_cluster_per_dims; - - constexpr auto buffer_data_id_begin = repeat_id * thread_sub_tensor_lengths; - - constexpr index_t src_offset = - SrcDesc::GetOffsetFromMultiIndex(src_thread_data_id_begin); - - constexpr index_t buffer_offset = - thread_buffer_desc.GetOffsetFromMultiIndex(buffer_data_id_begin); -#else - ford{}([&](auto repeat_id) { - const auto src_thread_data_id_begin = repeat_id * data_per_cluster_per_dims; - - const auto buffer_data_id_begin = repeat_id * thread_sub_tensor_lengths; - - const index_t src_offset = SrcDesc::GetOffsetFromMultiIndex(src_thread_data_id_begin); - - const index_t buffer_offset = - thread_buffer_desc.GetOffsetFromMultiIndex(buffer_data_id_begin); -#endif - - // By position the origin of 
the per-thread window at the point, where multi-index - // of the SrcDesc (might be a merged tensor) is all-zero. This threadwise slice copy - // is assuming each thread is copy a noraml (not merged) tensor. - // To satisfy this assumption, the user need to make sure that, on a merged dimension - // that constains multiple original dimensions, the length of the last original - // dimension need to be evenly dividable by its sub-lengths. Also, the repeat-length on - // the merged dimension need to be 1. These sanity checks are performed in constructor - // of BlockwiseGenericTensorSliceCopy_v1 - ThreadwiseGenericTensorSliceCopy_v1r2(make_zero_array(), - make_zero_array()) - .Run(p_src + src_offset + mThreadSrcOffset, p_buffer + buffer_offset); - }); - } - - template - __device__ void RunStoreRegisterBuffer(const TData* __restrict__ p_buffer, - TData* __restrict__ p_dst) const - { - constexpr auto thread_sub_tensor_lengths = SubLengths{}; - - constexpr auto data_per_cluster_per_dims = - thread_sub_tensor_lengths * ThreadClusterLengths{}; - - constexpr auto repeat_lengths = SliceLengths{} / (SubLengths{} * ThreadClusterLengths{}); - - constexpr auto thread_buffer_desc = GetRegisterBufferDescriptor(); - -#if CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_BLOCKWISE_GENERIC_SLICE_COPY_V1 - static_ford{}([&](auto repeat_id) { - constexpr auto buffer_data_id_begin = repeat_id * thread_sub_tensor_lengths; - - constexpr auto dst_data_id_begin = repeat_id * data_per_cluster_per_dims; - - constexpr index_t buffer_offset = - thread_buffer_desc.GetOffsetFromMultiIndex(buffer_data_id_begin); - - constexpr index_t dst_offset = DstDesc::GetOffsetFromMultiIndex(dst_data_id_begin); -#else - ford{}([&](auto repeat_id) { - const auto buffer_data_id_begin = repeat_id * thread_sub_tensor_lengths; - - const auto dst_data_id_begin = repeat_id * data_per_cluster_per_dims; - - const index_t buffer_offset = - thread_buffer_desc.GetOffsetFromMultiIndex(buffer_data_id_begin); - - const index_t dst_offset = DstDesc::GetOffsetFromMultiIndex(dst_data_id_begin); -#endif - - // By position the origin of the per-thread window at the point, where multi-index - // of the SrcDesc (might be a merged tensor) is all-zero. This threadwise slice copy - // is assuming each thread is copy a noraml (not merged) tensor. - // To satisfy this assumption, the user need to make sure that, on a merged dimension - // that constains multiple original dimensions, the length of the last original - // dimension need to be evenly dividable by its sub-lengths. Also, the repeat-length on - // the merged dimension need to be 1. These sanity checks are performed in constructor - // of BlockwiseGenericTensorSliceCopy_v1 - ThreadwiseGenericTensorSliceCopy_v1r2( - make_zero_array(), make_zero_array()) - .Run(p_buffer + buffer_offset, p_dst + dst_offset + mThreadDstOffset); - }); - } - - template - __device__ void Run(const TData* __restrict__ p_src, TData* __restrict__ p_dst) const - { - TData p_buffer[GetRegisterBufferSize()]; - - RunLoadRegisterBuffer(p_src, p_buffer); - RunStoreRegisterBuffer(p_buffer, p_dst); - } - - // When moving the slicing windows along a merged dimension, if the strides of the - // contained (by the merged dimension) original dimensions are not in descending order, - // then there is no guarantee that the new offset will be larger than the old offset - // for movement in positive direction (vice versue for movement in negative direction). 
- // As a result, there is the possiblity that the offset calculation may result in - // unsigned integer underflow (due to "-" operation). However, this hazard should not - // happen, as long as the users make sure the slicing window would not be moved out of - // the boundary of the tensor being sliced. This functions doesn't do runtime sanity - // check on out-of-bound slicing window, for performance reason - template - __device__ void MoveSlicingWindowOnSourceTensor( - Number, Number, integral_constant direction) - { - constexpr auto IDim = Number{}; - - static_if{}([&](auto) { - // logic for a merged dimension, also works for non-merged dimension, but its logic may - // be unncessarily complicated for compiler to remove calculations that are useless for - // a non-merged dimension - - // extract partial original dimensions - constexpr auto src_partial_original_dims = - SrcDesc::GetContainedOriginalDimensions(IDim); - - constexpr auto src_partial_original_desc = - SrcDesc::GetOriginalTensorDescriptor().Extract(src_partial_original_dims); - - // calculate new partial original multi-id - auto old_src_partial_original_id = - extract_array(mThreadSrcOriginalMultiId, src_partial_original_dims); - - auto new_src_partial_original_id = - src_partial_original_desc.UpdateMultiIndexGivenStepSizeOf1dIndex( - old_src_partial_original_id, StepSize, direction); - - // update "mThreadSrcOriginalMultiId" - static_for<0, decltype(src_partial_original_dims)::GetSize(), 1>{}([&](auto I) { - constexpr auto IDimOriginal = src_partial_original_dims[I]; - - mThreadSrcOriginalMultiId(IDimOriginal) = new_src_partial_original_id[I]; - }); - - // calculate new partial offset on this merged dimension - const index_t old_src_partial_offset = mThreadSrcPartialOffsets[IDim]; - - const index_t new_src_partial_offset = - src_partial_original_desc.GetOffsetFromMultiIndex(new_src_partial_original_id); - - // update "mThreadSrcPartialOffsets" - mThreadSrcPartialOffsets(IDim) = new_src_partial_offset; - - // update "mThreadSrcOffset", do "+" before "-" to avoid underflow - mThreadSrcOffset = (mThreadSrcOffset + new_src_partial_offset) - old_src_partial_offset; - }).Else([&](auto) { - // Logic for non-merged dimension. If you are never going to move the slicing window on - // a merged dimension, then "mThreadSrcOriginalMultiId" and "mThreadSrcPartialOffsets", - // which are being calculated here, will never be used later. In this case, compiler - // should be able to remove these calculations. - // TODO: make sure compiler would actually remove them in this case. - - // It is the user's responsiblity to make sure the slicing window will not be moved out - // of the boundary of the tensor being sliced. Otherwise, there might be hazard like - // unsigned integer underflow. 
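The earlier "+ before -" update is what keeps that subtraction safe: because the offsets are unsigned, the new partial offset is added to the running total before the old one is removed. A minimal host-side sketch of the same update, with a merged dimension reduced to two hypothetical original dimensions (the lengths, strides and function names here are invented for illustration and are not the library's API):

    #include <array>
    #include <cstddef>

    // A "merged" dimension made of two original dimensions, described by the
    // lengths and strides of those original dimensions (hypothetical example).
    struct MergedDim
    {
        std::array<std::size_t, 2> lengths;
        std::array<std::size_t, 2> strides;

        std::size_t OffsetOf(const std::array<std::size_t, 2>& id) const
        {
            return id[0] * strides[0] + id[1] * strides[1];
        }
    };

    // Move the slicing window by `step` along the merged dimension and return the
    // updated total offset. The multi-index is advanced as a 1-d index over the
    // contained original dimensions; "+" is applied before "-" so the unsigned
    // total cannot underflow, provided the window stays inside the tensor.
    std::size_t MoveWindowOnMergedDim(const MergedDim& dim,
                                      std::array<std::size_t, 2>& id,
                                      std::size_t& partial_offset,
                                      std::size_t total_offset,
                                      std::size_t step,
                                      bool positive_direction)
    {
        std::size_t linear = id[0] * dim.lengths[1] + id[1];
        linear             = positive_direction ? linear + step : linear - step;
        id                 = {linear / dim.lengths[1], linear % dim.lengths[1]};

        const std::size_t old_partial = partial_offset;
        partial_offset                = dim.OffsetOf(id); // new partial offset of this merged dimension
        return (total_offset + partial_offset) - old_partial;
    }

If the contained strides are not in descending order, the new partial offset can legitimately be smaller than the old one even when moving forward, which is exactly why the ordering of "+" and "-" matters.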
That is NO runtime sanity check to prevent the hazard - - constexpr auto IDimOriginal = SrcDesc::GetContainedOriginalDimensions(IDim).Front(); - - static_if{}([&](auto fwd) { - mThreadSrcOffset += StepSize * fwd(SrcDesc{}).GetStride(IDim); - - mThreadSrcOriginalMultiId(IDimOriginal) += StepSize; - - mThreadSrcPartialOffsets(IDim) += StepSize * fwd(SrcDesc{}).GetStride(IDim); - }).Else([&](auto fwd) { - mThreadSrcOffset -= StepSize * fwd(SrcDesc{}).GetStride(IDim); - - mThreadSrcOriginalMultiId(IDimOriginal) -= StepSize; - - mThreadSrcPartialOffsets(IDim) -= StepSize * fwd(SrcDesc{}).GetStride(IDim); - }); - }); - } - - template - __device__ void - MoveSrcSliceWindow(T step_sizes, integral_constant positive_direction) - { - static_for<0, nDim, 1>{}([&](auto idim) { - if(step_sizes[idim] != 0) - { - MoveSlicingWindowOnSourceTensor(idim, step_sizes[idim], positive_direction); - } - }); - } -}; - -// This version use TensorCoordiante -// Slice a (normal or merged) tensor, and copy it into another (normal or merged) tensor -// memory layout (ordering of dimensions) can be different between src and dst. -template -struct BlockwiseGenericTensorSliceCopy_v2 -{ - static constexpr index_t nDim = SrcDesc::GetNumOfDimension(); - - using Index = MultiIndex; - - __device__ constexpr BlockwiseGenericTensorSliceCopy_v2(const Index& src_block_slice_origin, - const Index& dst_block_slice_origin) - { - static_assert( - nDim == SrcDesc::GetNumOfDimension() && nDim == DstDesc::GetNumOfDimension() && - nDim == SliceLengths::GetSize() && nDim == SubLengths::GetSize() && - nDim == ThreadClusterLengths::GetSize() && - nDim == ThreadClusterArrangeOrder::GetSize() && - nDim == SrcDimAccessOrder::GetSize() && nDim == DstDimAccessOrder::GetSize(), - "wrong! nDim not consistent"); - - static_assert(is_same{}, - "wrong! threads should be mapped to cover entire slicing window"); - - constexpr auto thread_cluster_desc = make_ConstantTensorDescriptor_packed( - ThreadClusterLengths::ReorderGivenNew2Old(ThreadClusterArrangeOrder{})); - - static_assert(BlockSize == thread_cluster_desc.GetElementSize(), - "wrong! 
BlockSize not consistent with ThreadClusterLengths"); - - const auto thread_cluster_id = - thread_cluster_desc.GetMultiIndexFrom1dIndex(get_thread_local_1d_id()); - - const auto data_cluster_id = - reorder_array_given_old2new(thread_cluster_id, ThreadClusterArrangeOrder{}); - - const auto thread_data_id_begin = data_cluster_id * SubLengths{}; - - mThreadwiseLoad.SetSrcSliceOrigin(src_block_slice_origin + thread_data_id_begin); - mThreadwiseLoad.SetDstSliceOrigin(make_zero_array()); - - mThreadwiseStore.SetSrcSliceOrigin(make_zero_array()); - mThreadwiseStore.SetDstSliceOrigin(dst_block_slice_origin + thread_data_id_begin); - } - - __device__ static constexpr index_t GetRegisterBufferSize() - { - return RegisterBufferDesc::GetElementSpace(); - } - - template - __device__ void RunLoadRegisterBuffer(const TData* p_src, TData* p_buffer) const - { -#if 0 - mThreadwiseLoad.Run(p_src, p_buffer); -#else - // hardcoded: global to register - mThreadwiseLoad.template Run_amd_experiment(p_src, p_buffer); -#endif - } - - template - __device__ void RunStoreRegisterBuffer(const TData* p_buffer, TData* p_dst) const - { -#if 0 - mThreadwiseStore.Run(p_buffer, p_dst); -#else - // hardcoded: register to LDS - mThreadwiseStore.template Run_amd_experiment(p_buffer, p_dst); -#endif - } - - template - __device__ void Run(const TData* p_src, TData* p_dst) const - { - TData p_buffer[GetRegisterBufferSize()]; - - RunLoadRegisterBuffer(p_src, p_buffer); - RunStoreRegisterBuffer(p_buffer, p_dst); - } - - template - __device__ void - MoveSrcSliceWindow(T step_sizes, integral_constant positive_direction) - { - mThreadwiseLoad.MoveSrcSliceWindow(step_sizes, positive_direction); - } - - template - __device__ void - MoveDstSliceWindow(T step_sizes, integral_constant positive_direction) - { - mThreadwiseStore.MoveDstSliceWindow(step_sizes, positive_direction); - } - - private: - using RegisterBufferDesc = decltype(make_ConstantTensorDescriptor_packed(SubLengths{})); - - using ThreadwiseLoad = ThreadwiseGenericTensorSliceCopy_v2r1; - - using ThreadwiseStore = ThreadwiseGenericTensorSliceCopy_v2r1; - - ThreadwiseLoad mThreadwiseLoad; - ThreadwiseStore mThreadwiseStore; -}; - -// this version use TensorView and TensorCoordinate -template -struct BlockwiseGenericTensorSliceCopy_v3 -{ - static constexpr index_t nDim = SrcTensor::GetNumOfDimension(); - using data_type = remove_cv_t; - - using SrcCoordinate = typename SrcTensor::coordinate_type; - using DstCoordinate = typename DstTensor::coordinate_type; - - __device__ constexpr BlockwiseGenericTensorSliceCopy_v3(SrcTensor src_block, - SrcCoordinate src_block_slice_origin, - DstTensor dst_block, - DstCoordinate dst_block_slice_origin) - : mThreadBuffer{make_TensorView(ThreadBufferDesc{}, mpBuffer)} - { - static_assert( - nDim == SrcTensor::GetNumOfDimension() && nDim == DstTensor::GetNumOfDimension() && - nDim == SliceLengths::GetSize() && nDim == SubLengths::GetSize() && - nDim == ThreadClusterLengths::GetSize() && - nDim == ThreadClusterArrangeOrder::GetSize() && - nDim == SrcDimAccessOrder::GetSize() && nDim == DstDimAccessOrder::GetSize(), - "wrong! nDim not consistent"); - - static_assert(is_same{}, - "wrong! threads should be mapped to cover entire slicing window"); - - static_assert(is_same, - remove_cv_t>{}, - "wrong! 
type conversion not supported yet"); - - constexpr auto thread_cluster_desc = make_ConstantTensorDescriptor_packed( - ThreadClusterLengths::ReorderGivenNew2Old(ThreadClusterArrangeOrder{})); - - static_assert(BlockSize == thread_cluster_desc.GetElementSize(), - "wrong! BlockSize not consistent with ThreadClusterLengths"); - - const auto thread_cluster_id = - thread_cluster_desc.GetMultiIndexFrom1dIndex(get_thread_local_1d_id()); - - const auto data_cluster_id = - reorder_array_given_old2new(thread_cluster_id, ThreadClusterArrangeOrder{}); - - const auto thread_data_id_begin = data_cluster_id * SubLengths{}; - - mThreadwiseLoad = ThreadwiseLoad(src_block, - src_block_slice_origin + thread_data_id_begin, - mThreadBuffer, - make_zero_array()); - - mThreadwiseStore = ThreadwiseStore(mThreadBuffer, - make_zero_array(), - dst_block, - dst_block_slice_origin + thread_data_id_begin); - } - - __device__ void RunLoadRegisterBuffer() { mThreadwiseLoad.Run(); } - - __device__ void RunStoreRegisterBuffer() const { mThreadwiseStore.Run(); } - - __device__ void Run() - { - mThreadwiseLoad.Run(); - mThreadwiseStore.Run(); - } - - template - __device__ void - MoveSrcSliceWindow(T step_sizes, integral_constant positive_direction) - { - mThreadwiseLoad.MoveSrcSliceWindow(step_sizes, positive_direction); - } - - template - __device__ void - MoveDstSliceWindow(T step_sizes, integral_constant positive_direction) - { - mThreadwiseStore.MoveDstSliceWindow(step_sizes, positive_direction); - } - - private: - using ThreadBufferDesc = decltype(make_ConstantTensorDescriptor_packed(SubLengths{})); - using ThreadBufferTensor = NormalTensorView; - - using ThreadwiseLoad = ThreadwiseGenericTensorSliceCopy_v3r1; - - using ThreadwiseStore = ThreadwiseGenericTensorSliceCopy_v3r1; - - data_type mpBuffer[ThreadBufferDesc::GetElementSpace()]; - - ThreadBufferTensor mThreadBuffer; - - ThreadwiseLoad mThreadwiseLoad; - ThreadwiseStore mThreadwiseStore; -}; - -#endif - template +struct BlockwiseGenericTensorSliceCopy_v1 +{ + static constexpr index_t nDim = SrcDesc::GetNumOfDimension(); + + static constexpr index_t nOriginalDimSrc = + SrcDesc::GetOriginalTensorDescriptor().GetNumOfDimension(); + static constexpr index_t nOriginalDimDst = + DstDesc::GetOriginalTensorDescriptor().GetNumOfDimension(); + + // per-thread offset + index_t mThreadSrcOffset; + index_t mThreadDstOffset; + + // "mThreadSrcOriginalMultiId", "mThreadSrcPartialOffsets, "mThreadDstOriginalMultiId", + // "mThreadDstPartialOffsets" are always calculated inside constructor, and would be + // updated if slicing-window is moved. However, they will not be used if you always move + // the slicing-window along a non-merged dimension. In that case, compiler should be + // able to remove these calculation. 
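The relationship between these members is simple to state outside the template machinery: each (merged) dimension contributes one partial offset, and the thread's complete offset is their sum, so moving the window only has to patch the entry of the dimension that moved. A rough host-side sketch, with the descriptor reduced to a plain stride array and all names invented for the example:

    #include <array>
    #include <cstddef>
    #include <numeric>

    constexpr std::size_t kNumDims = 4; // assumed rank, for illustration only

    // One partial offset per dimension; a normal dimension contributes
    // id[i] * stride[i], while a merged dimension would contribute the offset
    // computed from its contained original dimensions instead.
    std::array<std::size_t, kNumDims>
    PartialOffsets(const std::array<std::size_t, kNumDims>& id,
                   const std::array<std::size_t, kNumDims>& stride)
    {
        std::array<std::size_t, kNumDims> partial{};
        for(std::size_t i = 0; i < kNumDims; ++i)
            partial[i] = id[i] * stride[i];
        return partial;
    }

    // The thread's complete offset is the sum of the partial offsets, mirroring
    // the accumulate_on_array(..., math::plus, 0) calls in the constructor.
    std::size_t TotalOffset(const std::array<std::size_t, kNumDims>& partial)
    {
        return std::accumulate(partial.begin(), partial.end(), std::size_t{0});
    }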
+ // TODO: make sure compiler would actually remove them in that case + + // partial offset in each (merged) dimension + Array mThreadSrcPartialOffsets; + Array mThreadDstPartialOffsets; + + // multi-id of original tensor + Array mThreadSrcOriginalMultiId; + Array mThreadDstOriginalMultiId; + + __device__ BlockwiseGenericTensorSliceCopy_v1(Array src_block_data_id_begin, + Array dst_block_data_id_begin) + { + // check NDim consistency + static_assert( + nDim == SrcDesc::GetNumOfDimension() && nDim == DstDesc::GetNumOfDimension() && + nDim == SliceLengths::GetSize() && nDim == SubLengths::GetSize() && + nDim == ThreadClusterLengths::GetSize() && + nDim == ThreadClusterArrangeOrder::GetSize() && + nDim == SrcDimAccessOrder::GetSize() && nDim == DstDimAccessOrder::GetSize(), + "wrong"); + + // check thread arrange order and read/write access order are valid + static_assert(is_valid_sequence_map::value && + is_valid_sequence_map::value && + is_valid_sequence_map::value, + "wrong!"); + + // thread cluster + constexpr auto thread_cluster_desc = make_ConstantTensorDescriptor_packed( + ThreadClusterLengths::ReorderGivenNew2Old(ThreadClusterArrangeOrder{})); + + // BlockSize + static_assert(BlockSize == thread_cluster_desc.GetElementSize(), "wrong! BlockSize"); + + // divide work + constexpr auto data_per_cluster_per_dims = SubLengths{} * ThreadClusterLengths{}; + + static_for<0, nDim, 1>{}([&](auto IDim) { + static_assert(SliceLengths::Get(IDim) % data_per_cluster_per_dims.Get(IDim) == 0, + "wrong! cannot evenly divide sliced tensor into cluster"); + }); + + constexpr auto repeat_lengths = SliceLengths{} / data_per_cluster_per_dims; + + // additional check for merged dimension + static_for<0, nDim, 1>{}([&](auto IDim_) { + // src + static_if{}([&](auto) { + constexpr auto IDim = decltype(IDim_){}; + + // on a merged dimension that constains multiple original dimensions, + // the length of the last original dimension need to evenly dividable by its + // sub-length, + // so each thread is effectively reading a normal (not merged) tensor + constexpr auto idim_last_original_src = + SrcDesc::GetContainedOriginalDimensions(IDim).Back(); + static_assert( + SrcDesc::GetOriginalTensorDescriptor().GetLength(idim_last_original_src) % + SubLengths::Get(IDim) == + 0, + "wrong!"); + + // merged dimension should have repeat_lengths = 1 + static_assert(repeat_lengths[IDim] == 1, + "wrong! repeat_lengths shoud be 1 on merged dimension"); + }); + + // dst + static_if{}([&](auto) { + constexpr auto IDim = decltype(IDim_){}; + + // on a merged dimension that constains multiple original dimensions, + // the length of the last original dimension need to evenly dividable by its + // sub-length, + // so each thread is effectively reading a normal (not merged) tensor + constexpr auto idim_last_original_dst = + DstDesc::GetContainedOriginalDimensions(IDim).Back(); + static_assert( + DstDesc::GetOriginalTensorDescriptor().GetLength(idim_last_original_dst) % + SubLengths::Get(IDim) == + 0, + "wrong!"); + + // merged dimension should have repeat_lengths = 1 + static_assert(repeat_lengths[IDim] == 1, + "wrong! 
repeat_lengths shoud be 1 on merged dimension"); + }); + }); + + // calculate mThreadSrcOffset, mThreadDstOffset + const auto thread_cluster_id = + thread_cluster_desc.GetMultiIndexFrom1dIndex(get_thread_local_1d_id()); + + const auto data_cluster_id = + reorder_array_given_old2new(thread_cluster_id, ThreadClusterArrangeOrder{}); + + const auto thread_data_id_begin = data_cluster_id * SubLengths{}; + + // original multi-id + mThreadSrcOriginalMultiId = SrcDesc::GetOriginalMultiIndexFromMultiIndex( + src_block_data_id_begin + thread_data_id_begin); + + mThreadDstOriginalMultiId = DstDesc::GetOriginalMultiIndexFromMultiIndex( + dst_block_data_id_begin + thread_data_id_begin); + + // partial offset on each dimension + static_for<0, nDim, 1>{}([&](auto IDim) { + constexpr auto src_partial_original_dims = + SrcDesc::GetContainedOriginalDimensions(IDim); + + constexpr auto src_partial_original_desc = + SrcDesc::GetOriginalTensorDescriptor().Extract(src_partial_original_dims); + + mThreadSrcPartialOffsets(IDim) = src_partial_original_desc.GetOffsetFromMultiIndex( + extract_array(mThreadSrcOriginalMultiId, src_partial_original_dims)); + }); + + static_for<0, nDim, 1>{}([&](auto IDim) { + constexpr auto dst_partial_original_dims = + DstDesc::GetContainedOriginalDimensions(IDim); + + constexpr auto dst_partial_original_desc = + DstDesc::GetOriginalTensorDescriptor().Extract(dst_partial_original_dims); + + mThreadDstPartialOffsets(IDim) = dst_partial_original_desc.GetOffsetFromMultiIndex( + extract_array(mThreadDstOriginalMultiId, dst_partial_original_dims)); + }); + + // complete offset + mThreadSrcOffset = accumulate_on_array( + mThreadSrcPartialOffsets, math::plus{}, static_cast(0)); + + mThreadDstOffset = accumulate_on_array( + mThreadDstPartialOffsets, math::plus{}, static_cast(0)); + } + + __device__ static constexpr auto GetRegisterBufferDescriptor() + { + constexpr auto repeat_lengths = SliceLengths{} / (SubLengths{} * ThreadClusterLengths{}); + + return make_ConstantTensorDescriptor_packed(SubLengths{} * repeat_lengths); + } + + __device__ static constexpr index_t GetRegisterBufferSize() + { + return GetRegisterBufferDescriptor().GetElementSpace(); + } + + template + __device__ void RunLoadRegisterBuffer(const TData* __restrict__ p_src, + TData* __restrict__ p_buffer) const + { + constexpr auto thread_sub_tensor_lengths = SubLengths{}; + + constexpr auto data_per_cluster_per_dims = + thread_sub_tensor_lengths * ThreadClusterLengths{}; + + constexpr auto repeat_lengths = SliceLengths{} / (SubLengths{} * ThreadClusterLengths{}); + + constexpr auto thread_buffer_desc = GetRegisterBufferDescriptor(); + +#if CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_BLOCKWISE_GENERIC_SLICE_COPY_V1 + static_ford{}([&](auto repeat_id) { + constexpr auto src_thread_data_id_begin = repeat_id * data_per_cluster_per_dims; + + constexpr auto buffer_data_id_begin = repeat_id * thread_sub_tensor_lengths; + + constexpr index_t src_offset = + SrcDesc::GetOffsetFromMultiIndex(src_thread_data_id_begin); + + constexpr index_t buffer_offset = + thread_buffer_desc.GetOffsetFromMultiIndex(buffer_data_id_begin); +#else + ford{}([&](auto repeat_id) { + const auto src_thread_data_id_begin = repeat_id * data_per_cluster_per_dims; + + const auto buffer_data_id_begin = repeat_id * thread_sub_tensor_lengths; + + const index_t src_offset = SrcDesc::GetOffsetFromMultiIndex(src_thread_data_id_begin); + + const index_t buffer_offset = + thread_buffer_desc.GetOffsetFromMultiIndex(buffer_data_id_begin); +#endif + + // By position the origin of 
the per-thread window at the point, where multi-index + // of the SrcDesc (might be a merged tensor) is all-zero. This threadwise slice copy + // is assuming each thread is copy a noraml (not merged) tensor. + // To satisfy this assumption, the user need to make sure that, on a merged dimension + // that constains multiple original dimensions, the length of the last original + // dimension need to be evenly dividable by its sub-lengths. Also, the repeat-length on + // the merged dimension need to be 1. These sanity checks are performed in constructor + // of BlockwiseGenericTensorSliceCopy_v1 + ThreadwiseGenericTensorSliceCopy_v1r2(make_zero_array(), + make_zero_array()) + .Run(p_src + src_offset + mThreadSrcOffset, p_buffer + buffer_offset); + }); + } + + template + __device__ void RunStoreRegisterBuffer(const TData* __restrict__ p_buffer, + TData* __restrict__ p_dst) const + { + constexpr auto thread_sub_tensor_lengths = SubLengths{}; + + constexpr auto data_per_cluster_per_dims = + thread_sub_tensor_lengths * ThreadClusterLengths{}; + + constexpr auto repeat_lengths = SliceLengths{} / (SubLengths{} * ThreadClusterLengths{}); + + constexpr auto thread_buffer_desc = GetRegisterBufferDescriptor(); + +#if CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_BLOCKWISE_GENERIC_SLICE_COPY_V1 + static_ford{}([&](auto repeat_id) { + constexpr auto buffer_data_id_begin = repeat_id * thread_sub_tensor_lengths; + + constexpr auto dst_data_id_begin = repeat_id * data_per_cluster_per_dims; + + constexpr index_t buffer_offset = + thread_buffer_desc.GetOffsetFromMultiIndex(buffer_data_id_begin); + + constexpr index_t dst_offset = DstDesc::GetOffsetFromMultiIndex(dst_data_id_begin); +#else + ford{}([&](auto repeat_id) { + const auto buffer_data_id_begin = repeat_id * thread_sub_tensor_lengths; + + const auto dst_data_id_begin = repeat_id * data_per_cluster_per_dims; + + const index_t buffer_offset = + thread_buffer_desc.GetOffsetFromMultiIndex(buffer_data_id_begin); + + const index_t dst_offset = DstDesc::GetOffsetFromMultiIndex(dst_data_id_begin); +#endif + + // By position the origin of the per-thread window at the point, where multi-index + // of the SrcDesc (might be a merged tensor) is all-zero. This threadwise slice copy + // is assuming each thread is copy a noraml (not merged) tensor. + // To satisfy this assumption, the user need to make sure that, on a merged dimension + // that constains multiple original dimensions, the length of the last original + // dimension need to be evenly dividable by its sub-lengths. Also, the repeat-length on + // the merged dimension need to be 1. These sanity checks are performed in constructor + // of BlockwiseGenericTensorSliceCopy_v1 + ThreadwiseGenericTensorSliceCopy_v1r2( + make_zero_array(), make_zero_array()) + .Run(p_buffer + buffer_offset, p_dst + dst_offset + mThreadDstOffset); + }); + } + + template + __device__ void Run(const TData* __restrict__ p_src, TData* __restrict__ p_dst) const + { + TData p_buffer[GetRegisterBufferSize()]; + + RunLoadRegisterBuffer(p_src, p_buffer); + RunStoreRegisterBuffer(p_buffer, p_dst); + } + + // When moving the slicing windows along a merged dimension, if the strides of the + // contained (by the merged dimension) original dimensions are not in descending order, + // then there is no guarantee that the new offset will be larger than the old offset + // for movement in positive direction (vice versue for movement in negative direction). 
+ // As a result, there is the possiblity that the offset calculation may result in + // unsigned integer underflow (due to "-" operation). However, this hazard should not + // happen, as long as the users make sure the slicing window would not be moved out of + // the boundary of the tensor being sliced. This functions doesn't do runtime sanity + // check on out-of-bound slicing window, for performance reason + template + __device__ void MoveSlicingWindowOnSourceTensor( + Number, Number, integral_constant direction) + { + constexpr auto IDim = Number{}; + + static_if{}([&](auto) { + // logic for a merged dimension, also works for non-merged dimension, but its logic may + // be unncessarily complicated for compiler to remove calculations that are useless for + // a non-merged dimension + + // extract partial original dimensions + constexpr auto src_partial_original_dims = + SrcDesc::GetContainedOriginalDimensions(IDim); + + constexpr auto src_partial_original_desc = + SrcDesc::GetOriginalTensorDescriptor().Extract(src_partial_original_dims); + + // calculate new partial original multi-id + auto old_src_partial_original_id = + extract_array(mThreadSrcOriginalMultiId, src_partial_original_dims); + + auto new_src_partial_original_id = + src_partial_original_desc.UpdateMultiIndexGivenStepSizeOf1dIndex( + old_src_partial_original_id, StepSize, direction); + + // update "mThreadSrcOriginalMultiId" + static_for<0, decltype(src_partial_original_dims)::GetSize(), 1>{}([&](auto I) { + constexpr auto IDimOriginal = src_partial_original_dims[I]; + + mThreadSrcOriginalMultiId(IDimOriginal) = new_src_partial_original_id[I]; + }); + + // calculate new partial offset on this merged dimension + const index_t old_src_partial_offset = mThreadSrcPartialOffsets[IDim]; + + const index_t new_src_partial_offset = + src_partial_original_desc.GetOffsetFromMultiIndex(new_src_partial_original_id); + + // update "mThreadSrcPartialOffsets" + mThreadSrcPartialOffsets(IDim) = new_src_partial_offset; + + // update "mThreadSrcOffset", do "+" before "-" to avoid underflow + mThreadSrcOffset = (mThreadSrcOffset + new_src_partial_offset) - old_src_partial_offset; + }).Else([&](auto) { + // Logic for non-merged dimension. If you are never going to move the slicing window on + // a merged dimension, then "mThreadSrcOriginalMultiId" and "mThreadSrcPartialOffsets", + // which are being calculated here, will never be used later. In this case, compiler + // should be able to remove these calculations. + // TODO: make sure compiler would actually remove them in this case. + + // It is the user's responsiblity to make sure the slicing window will not be moved out + // of the boundary of the tensor being sliced. Otherwise, there might be hazard like + // unsigned integer underflow. 
That is NO runtime sanity check to prevent the hazard + + constexpr auto IDimOriginal = SrcDesc::GetContainedOriginalDimensions(IDim).Front(); + + static_if{}([&](auto fwd) { + mThreadSrcOffset += StepSize * fwd(SrcDesc{}).GetStride(IDim); + + mThreadSrcOriginalMultiId(IDimOriginal) += StepSize; + + mThreadSrcPartialOffsets(IDim) += StepSize * fwd(SrcDesc{}).GetStride(IDim); + }).Else([&](auto fwd) { + mThreadSrcOffset -= StepSize * fwd(SrcDesc{}).GetStride(IDim); + + mThreadSrcOriginalMultiId(IDimOriginal) -= StepSize; + + mThreadSrcPartialOffsets(IDim) -= StepSize * fwd(SrcDesc{}).GetStride(IDim); + }); + }); + } + + template + __device__ void + MoveSrcSliceWindow(T step_sizes, integral_constant positive_direction) + { + static_for<0, nDim, 1>{}([&](auto idim) { + if(step_sizes[idim] != 0) + { + MoveSlicingWindowOnSourceTensor(idim, step_sizes[idim], positive_direction); + } + }); + } +}; + +// This version use TensorCoordiante +// Slice a (normal or merged) tensor, and copy it into another (normal or merged) tensor +// memory layout (ordering of dimensions) can be different between src and dst. +template +struct BlockwiseGenericTensorSliceCopy_v2 +{ + static constexpr index_t nDim = SrcDesc::GetNumOfDimension(); + + using Index = MultiIndex; + + __device__ constexpr BlockwiseGenericTensorSliceCopy_v2(const Index& src_block_slice_origin, + const Index& dst_block_slice_origin) + { + static_assert( + nDim == SrcDesc::GetNumOfDimension() && nDim == DstDesc::GetNumOfDimension() && + nDim == SliceLengths::GetSize() && nDim == SubLengths::GetSize() && + nDim == ThreadClusterLengths::GetSize() && + nDim == ThreadClusterArrangeOrder::GetSize() && + nDim == SrcDimAccessOrder::GetSize() && nDim == DstDimAccessOrder::GetSize(), + "wrong! nDim not consistent"); + + static_assert(is_same{}, + "wrong! threads should be mapped to cover entire slicing window"); + + constexpr auto thread_cluster_desc = make_ConstantTensorDescriptor_packed( + ThreadClusterLengths::ReorderGivenNew2Old(ThreadClusterArrangeOrder{})); + + static_assert(BlockSize == thread_cluster_desc.GetElementSize(), + "wrong! 
BlockSize not consistent with ThreadClusterLengths"); + + const auto thread_cluster_id = + thread_cluster_desc.GetMultiIndexFrom1dIndex(get_thread_local_1d_id()); + + const auto data_cluster_id = + reorder_array_given_old2new(thread_cluster_id, ThreadClusterArrangeOrder{}); + + const auto thread_data_id_begin = data_cluster_id * SubLengths{}; + + mThreadwiseLoad.SetSrcSliceOrigin(src_block_slice_origin + thread_data_id_begin); + mThreadwiseLoad.SetDstSliceOrigin(make_zero_array()); + + mThreadwiseStore.SetSrcSliceOrigin(make_zero_array()); + mThreadwiseStore.SetDstSliceOrigin(dst_block_slice_origin + thread_data_id_begin); + } + + __device__ static constexpr index_t GetThreadBufferSize() + { + return ThreadBufferDesc::GetElementSpace(); + } + + template + __device__ void RunLoadThreadBuffer(const TData* p_block_src, TData* p_thread_buffer) const + { +#if 0 + mThreadwiseLoad.Run(p_block_src, p_thread_buffer); +#else // tweaking + mThreadwiseLoad.template Run_optimized_address_calculation( + p_block_src, p_thread_buffer); +#endif + } + + template + __device__ void RunStoreThreadBuffer(const TData* p_thread_buffer, TData* p_block_dst) const + { +#if 0 + mThreadwiseStore.Run(p_thread_buffer, p_block_dst); +#else // tweaking + mThreadwiseStore.template Run_optimized_address_calculation( + p_thread_buffer, p_block_dst); +#endif + } + + template + __device__ void Run(const TData* p_block_src, TData* p_block_dst) const + { + TData p_thread_buffer[GetThreadBufferSize()]; + + RunLoadThreadBuffer(p_block_src, + p_thread_buffer); + RunStoreThreadBuffer(p_thread_buffer, + p_block_dst); + } + + template + __device__ void + MoveSrcSliceWindow(T step_sizes, integral_constant positive_direction) + { + mThreadwiseLoad.MoveSrcSliceWindow(step_sizes, positive_direction); + } + + template + __device__ void + MoveDstSliceWindow(T step_sizes, integral_constant positive_direction) + { + mThreadwiseStore.MoveDstSliceWindow(step_sizes, positive_direction); + } + + private: + using ThreadBufferDesc = decltype(make_ConstantTensorDescriptor_packed(SubLengths{})); + + using ThreadwiseLoad = ThreadwiseGenericTensorSliceCopy_v2r1; + + using ThreadwiseStore = ThreadwiseGenericTensorSliceCopy_v2r1; + + ThreadwiseLoad mThreadwiseLoad; + ThreadwiseStore mThreadwiseStore; +}; + +// this version use TensorView and TensorCoordinate_deprecated +template +struct BlockwiseGenericTensorSliceCopy_v3 +{ + static constexpr index_t nDim = SrcTensor::GetNumOfDimension(); + using data_type = remove_cv_t; + + using SrcCoordinate = typename SrcTensor::coordinate_type; + using DstCoordinate = typename DstTensor::coordinate_type; + + __device__ constexpr BlockwiseGenericTensorSliceCopy_v3(SrcTensor src_block, + SrcCoordinate src_block_slice_origin, + DstTensor dst_block, + DstCoordinate dst_block_slice_origin) + : mThreadBuffer{make_TensorView(ThreadBufferDesc{}, mpBuffer)} + { + static_assert( + nDim == SrcTensor::GetNumOfDimension() && nDim == DstTensor::GetNumOfDimension() && + nDim == SliceLengths::GetSize() && nDim == SubLengths::GetSize() && + nDim == ThreadClusterLengths::GetSize() && + nDim == ThreadClusterArrangeOrder::GetSize() && + nDim == SrcDimAccessOrder::GetSize() && nDim == DstDimAccessOrder::GetSize(), + "wrong! nDim not consistent"); + + static_assert(is_same{}, + "wrong! threads should be mapped to cover entire slicing window"); + + static_assert(is_same, + remove_cv_t>{}, + "wrong! 
type conversion not supported yet"); + + constexpr auto thread_cluster_desc = make_ConstantTensorDescriptor_packed( + ThreadClusterLengths::ReorderGivenNew2Old(ThreadClusterArrangeOrder{})); + + static_assert(BlockSize == thread_cluster_desc.GetElementSize(), + "wrong! BlockSize not consistent with ThreadClusterLengths"); + + const auto thread_cluster_id = + thread_cluster_desc.GetMultiIndexFrom1dIndex(get_thread_local_1d_id()); + + const auto data_cluster_id = + reorder_array_given_old2new(thread_cluster_id, ThreadClusterArrangeOrder{}); + + const auto thread_data_id_begin = data_cluster_id * SubLengths{}; + + mThreadwiseLoad = ThreadwiseLoad(src_block, + src_block_slice_origin + thread_data_id_begin, + mThreadBuffer, + make_zero_array()); + + mThreadwiseStore = ThreadwiseStore(mThreadBuffer, + make_zero_array(), + dst_block, + dst_block_slice_origin + thread_data_id_begin); + } + + __device__ void RunLoadRegisterBuffer() { mThreadwiseLoad.Run(); } + + __device__ void RunStoreRegisterBuffer() const { mThreadwiseStore.Run(); } + + __device__ void Run() + { + mThreadwiseLoad.Run(); + mThreadwiseStore.Run(); + } + + template + __device__ void + MoveSrcSliceWindow(T step_sizes, integral_constant positive_direction) + { + mThreadwiseLoad.MoveSrcSliceWindow(step_sizes, positive_direction); + } + + template + __device__ void + MoveDstSliceWindow(T step_sizes, integral_constant positive_direction) + { + mThreadwiseStore.MoveDstSliceWindow(step_sizes, positive_direction); + } + + private: + using ThreadBufferDesc = decltype(make_ConstantTensorDescriptor_packed(SubLengths{})); + using ThreadBufferTensor = NormalTensorView; + + using ThreadwiseLoad = ThreadwiseGenericTensorSliceCopy_v3r1; + + using ThreadwiseStore = ThreadwiseGenericTensorSliceCopy_v3r1; + + data_type mpBuffer[ThreadBufferDesc::GetElementSpace()]; + + ThreadBufferTensor mThreadBuffer; + + ThreadwiseLoad mThreadwiseLoad; + ThreadwiseStore mThreadwiseStore; +}; + +} // namespace ck + +#endif diff --git a/composable_kernel/include/tensor_operation/threadwise_generic_tensor_slice_copy.hpp b/composable_kernel/include/tensor_operation/threadwise_generic_tensor_slice_copy.hpp index 8012d27519..931210c558 100644 --- a/composable_kernel/include/tensor_operation/threadwise_generic_tensor_slice_copy.hpp +++ b/composable_kernel/include/tensor_operation/threadwise_generic_tensor_slice_copy.hpp @@ -4,1124 +4,18 @@ #include "common_header.hpp" #include "tensor_descriptor.hpp" #include "tensor_descriptor_helper.hpp" -#include "tensor_coordinate_v2.hpp" +#include "tensor_coordinate.hpp" -#ifndef CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V1R1 -#define CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V1R1 0 +#ifndef CK_USE_AMD_INTRINSIC +#define CK_USE_AMD_INTRINSIC 1 #endif -#ifndef CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V1R2 -#define CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V1R2 0 -#endif - -#ifndef CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V2R1 -#define CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V2R1 0 +#ifndef CK_USE_AMD_INTRINSIC_BUFFER_LOAD_STORE +#define CK_USE_AMD_INTRINSIC_BUFFER_LOAD_STORE 1 #endif namespace ck { -#if 0 - -// This threadwise copy allow vector access of src and dst. -// It allows the dimensions of vector access to be different on src and dst. -// It also allows the vector size to be different on src and dst. 
-// It also allows order of access to be different on src and dst. -// It use register as buffer to hold all data moving from src to dst. -// It is designed for copying small amount of data, and src and dst are -// device memory or LDS. -// When copying large amout of data, let's hope compiler will reduce register -// used for the buffer. -template -struct ThreadwiseGenericTensorSliceCopy_v1r1 -{ - static constexpr index_t nDim = SliceLengths::GetSize(); - - __device__ constexpr ThreadwiseGenericTensorSliceCopy_v1r1( - Array src_slice_origin, Array dst_slice_origin) - : mSrcSliceOrigin(src_slice_origin), mDstSliceOrigin(dst_slice_origin) - { - static_assert(nDim == SrcDesc::GetNumOfDimension() && - nDim == DstDesc::GetNumOfDimension() && nDim == SliceLengths::GetSize() && - nDim == SrcDimAccessOrder::GetSize() && - nDim == DstDimAccessOrder::GetSize(), - "wrong! # of dimensions not the same"); - - static_assert(is_valid_sequence_map::value && - is_valid_sequence_map::value, - "wrong! map is not valid"); - - static_assert(SliceLengths{}[SrcVectorAccessDim] % SrcDataPerAccess == 0 && - SliceLengths{}[DstVectorAccessDim] % DstDataPerAccess == 0, - "wrong! cannot evenly divide"); - - // check vectorized memory access - constexpr auto src_vector_access_dim = Number{}; - constexpr auto dst_vector_access_dim = Number{}; - - static_if{}( - [&](auto fwd) { - static_assert( - (fwd(SrcDesc{}).GetStride(src_vector_access_dim) == 1 || SrcDataPerAccess == 1), - "wrong! vectorized access is allowed only if stride == 1"); - }) - .Else([&](auto fwd) { - static_assert( - (fwd(SrcDesc{}).GetLastOriginalDimensionStride(src_vector_access_dim) == 1 || - SrcDataPerAccess == 1), - "wrong! vectorized access is allowed only if stride == 1"); - }); - - static_if{}( - [&](auto fwd) { - static_assert( - (fwd(DstDesc{}).GetStride(dst_vector_access_dim) == 1 || DstDataPerAccess == 1), - "wrong! vectorized access is allowed only if stride == 1"); - }) - .Else([&](auto fwd) { - static_assert( - (fwd(DstDesc{}).GetLastOriginalDimensionStride(dst_vector_access_dim) == 1 || - DstDataPerAccess == 1), - "wrong! 
vectorized access is allowed only if stride == 1"); - }); - } - - __device__ constexpr ThreadwiseGenericTensorSliceCopy_v1r1() - : ThreadwiseGenericTensorSliceCopy_v1r1(make_zero_array(), - make_zero_array()) - { - } - - __device__ void SetSrcSliceOrigin(Array src_slice_origin) - { - mSrcSliceOrigin = src_slice_origin; - } - - __device__ void SetDstSliceOrigin(Array dst_slice_origin) - { - mDstSliceOrigin = dst_slice_origin; - } - - template - __device__ void Run(const TData* p_src, TData* p_dst) const - { - constexpr auto buffer_desc = make_ConstantTensorDescriptor_packed(SliceLengths{}); - - TData p_buffer_[buffer_desc.GetElementSpace()]; - TData* p_buffer = p_buffer_; - - // copy data from src into buffer - { - using vector_t = typename vector_type::MemoryType; - - constexpr auto src_vector_access_dim = Number{}; - constexpr auto src_data_per_access = Number{}; - - constexpr auto src_access_lengths = SliceLengths::Modify( - src_vector_access_dim, - SliceLengths::Get(src_vector_access_dim) / src_data_per_access); - -#if CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V1R1 - static_ford{}([&](auto src_access_id) { - constexpr auto src_data_begin_id = src_access_id.Modify( - src_vector_access_dim, - src_access_id[src_vector_access_dim] * src_data_per_access); - - const index_t src_offset = - SrcDesc::GetOffsetFromMultiIndex(mSrcSliceOrigin + src_data_begin_id); - - // load vector from src - const vector_t vector_data = *reinterpret_cast(&p_src[src_offset]); - - // unpack vector into buffer - static_for<0, SrcDataPerAccess, 1>{}([&](auto i) { - constexpr auto scalar_id = - typename uniform_sequence_gen::type{}.Modify(src_vector_access_dim, - i); - - constexpr index_t buffer_offset = - buffer_desc.GetOffsetFromMultiIndex(src_data_begin_id + scalar_id); - - p_buffer[buffer_offset] = reinterpret_cast(&vector_data)[i]; - }); - }); -#else - ford{}([&](auto src_access_id) { - auto src_data_begin_id = src_access_id; - src_data_begin_id(src_vector_access_dim) = - src_access_id[src_vector_access_dim] * src_data_per_access; - - const index_t src_offset = - SrcDesc::GetOffsetFromMultiIndex(mSrcSliceOrigin + src_data_begin_id); - - // load vector from src - const vector_t vector_data = *reinterpret_cast(&p_src[src_offset]); - - // unpack vector into buffer - for(index_t i = 0; i < SrcDataPerAccess; ++i) - { - auto scalar_id = make_zero_array(); - scalar_id(src_vector_access_dim) = i; - - const index_t buffer_offset = - buffer_desc.GetOffsetFromMultiIndex(src_data_begin_id + scalar_id); - - p_buffer[buffer_offset] = reinterpret_cast(&vector_data)[i]; - } - }); -#endif - } - - // copy data from buffer to dst - { - using vector_t = typename vector_type::MemoryType; - - constexpr auto dst_vector_access_dim = Number{}; - constexpr auto dst_data_per_access = Number{}; - - constexpr auto dst_access_lengths = SliceLengths::Modify( - dst_vector_access_dim, - SliceLengths::Get(dst_vector_access_dim) / dst_data_per_access); - -#if CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V1R1 - static_ford{}([&](auto dst_access_id) { - constexpr auto dst_data_begin_id = dst_access_id.Modify( - dst_vector_access_dim, - dst_access_id[dst_vector_access_dim] * dst_data_per_access); - - vector_t vector_data{}; - - // pack vector from buffer - static_for<0, DstDataPerAccess, 1>{}([&](auto i) { - constexpr auto scalar_id = - typename uniform_sequence_gen::type{}.Modify(dst_vector_access_dim, - i); - - constexpr index_t buffer_offset = - 
buffer_desc.GetOffsetFromMultiIndex(dst_data_begin_id + scalar_id); - - reinterpret_cast(&vector_data)[i] = p_buffer[buffer_offset]; - }); - - const index_t dst_offset = - DstDesc::GetOffsetFromMultiIndex(mDstSliceOrigin + dst_data_begin_id); - - // store vector into dst - *reinterpret_cast(&p_dst[dst_offset]) = vector_data; - }); -#else - ford{}([&](auto dst_access_id) { - auto dst_data_begin_id = dst_access_id; - dst_data_begin_id(dst_vector_access_dim) = - dst_access_id[dst_vector_access_dim] * dst_data_per_access; - - vector_t vector_data{}; - - // pack vector from buffer - for(index_t i = 0; i < DstDataPerAccess; ++i) - { - auto scalar_id = make_zero_array(); - scalar_id(dst_vector_access_dim) = i; - - const index_t buffer_offset = - buffer_desc.GetOffsetFromMultiIndex(dst_data_begin_id + scalar_id); - - reinterpret_cast(&vector_data)[i] = p_buffer[buffer_offset]; - } - - const index_t dst_offset = - DstDesc::GetOffsetFromMultiIndex(mDstSliceOrigin + dst_data_begin_id); - - // store vector into dst - *reinterpret_cast(&p_dst[dst_offset]) = vector_data; - }); -#endif - } - } - - private: - Array mSrcSliceOrigin; - Array mDstSliceOrigin; -}; - -// This threadwise copy allow vector access of src and dst. -// It allows the vector size to be different on src and dst. -// The dimensions of vector access should be the same on src and dst. -// The dimension access order should be the same on src and dst. -// It is designed for cases, where one of src and dst is register, and -// the other is device memory or LDS -template -struct ThreadwiseGenericTensorSliceCopy_v1r2 -{ - static constexpr index_t nDim = SliceLengths::GetSize(); - - __device__ constexpr ThreadwiseGenericTensorSliceCopy_v1r2( - Array src_slice_origin, Array dst_slice_origin) - : mSrcSliceOrigin(src_slice_origin), mDstSliceOrigin(dst_slice_origin) - { - static_assert(nDim == SrcDesc::GetNumOfDimension() && - nDim == DstDesc::GetNumOfDimension() && nDim == SliceLengths::GetSize() && - nDim == DimAccessOrder::GetSize(), - "wrong! # of dimensions not the same"); - - static_assert(is_valid_sequence_map::value, "wrong! map is not valid"); - - static_assert( - SliceLengths{}[VectorAccessDim] % math::lcm(SrcDataPerAccess, DstDataPerAccess) == 0, - "wrong! cannot evenly divide"); - - // check vectorized memory access - constexpr auto vector_access_dim = Number{}; - - static_if{}([&](auto fwd) { - static_assert( - (fwd(SrcDesc{}).GetStride(vector_access_dim) == 1 || SrcDataPerAccess == 1), - "wrong! vectorized access is allowed only if stride == 1"); - }).Else([&](auto fwd) { - static_assert((fwd(SrcDesc{}).GetLastOriginalDimensionStride(vector_access_dim) == 1 || - SrcDataPerAccess == 1), - "wrong! vectorized access is allowed only if stride == 1"); - }); - - static_if{}([&](auto fwd) { - static_assert( - (fwd(DstDesc{}).GetStride(vector_access_dim) == 1 || DstDataPerAccess == 1), - "wrong! vectorized access is allowed only if stride == 1"); - }).Else([&](auto fwd) { - static_assert((fwd(DstDesc{}).GetLastOriginalDimensionStride(vector_access_dim) == 1 || - DstDataPerAccess == 1), - "wrong! 
vectorized access is allowed only if stride == 1"); - }); - } - - __device__ constexpr ThreadwiseGenericTensorSliceCopy_v1r2() - : ThreadwiseGenericTensorSliceCopy_v1r2(make_zero_array(), - make_zero_array()) - { - } - - __device__ void SetSrcSliceOrigin(Array src_slice_origin) - { - mSrcSliceOrigin = src_slice_origin; - } - - __device__ void SetDstSliceOrigin(Array dst_slice_origin) - { - mDstSliceOrigin = dst_slice_origin; - } - - template - __device__ void Run(const TData* p_src, TData* p_dst) const - { - using src_vector_t = typename vector_type::MemoryType; - using dst_vector_t = typename vector_type::MemoryType; - - constexpr auto vector_access_dim = Number{}; - - constexpr auto src_data_per_access = Number{}; - constexpr auto dst_data_per_access = Number{}; - - constexpr auto long_vector_size = Number{}; - - constexpr auto long_vector_access_lengths = SliceLengths::Modify( - vector_access_dim, SliceLengths::Get(vector_access_dim) / long_vector_size); - -#if CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V1R2 - static_ford{}([&]( - auto long_vector_access_id) { - - // data id w.r.t slicing-window - constexpr auto long_vector_data_begin_id = long_vector_access_id.Modify( - vector_access_dim, long_vector_access_id[vector_access_dim] * long_vector_size); - - // buffer to hold a long-vector - TData p_long_vector[long_vector_size]; - - // load data from src to the long-vector buffer - static_for<0, long_vector_size / src_data_per_access, 1>{}([&](auto i) { - constexpr auto scalar_id = typename uniform_sequence_gen::type{}.Modify( - vector_access_dim, i * src_data_per_access); - - const index_t src_offset = SrcDesc::GetOffsetFromMultiIndex( - mSrcSliceOrigin + (long_vector_data_begin_id + scalar_id)); - - constexpr index_t buffer_offset = i * src_data_per_access; - - *reinterpret_cast(&p_long_vector[buffer_offset]) = - *reinterpret_cast(&p_src[src_offset]); - }); - - // store data from the long-vector buffer to dst - static_for<0, long_vector_size / dst_data_per_access, 1>{}([&](auto i) { - constexpr auto scalar_id = typename uniform_sequence_gen::type{}.Modify( - vector_access_dim, i * dst_data_per_access); - - constexpr index_t buffer_offset = i * dst_data_per_access; - - const index_t dst_offset = DstDesc::GetOffsetFromMultiIndex( - mDstSliceOrigin + (long_vector_data_begin_id + scalar_id)); - - *reinterpret_cast(&p_dst[dst_offset]) = - *reinterpret_cast(&p_long_vector[buffer_offset]); - }); - }); -#else - ford{}( - [&](auto long_vector_access_id) { - - // data id w.r.t slicing-window - auto long_vector_data_begin_id = long_vector_access_id; - long_vector_data_begin_id(vector_access_dim) = - long_vector_size * long_vector_access_id[vector_access_dim]; - - // buffer to hold a long-vector - TData p_long_vector[long_vector_size]; - - // load data from src to the long-vector buffer - for(index_t i = 0; i < long_vector_size / src_data_per_access; ++i) - { - auto scalar_id = make_zero_array(); - scalar_id(vector_access_dim) = i * src_data_per_access; - - const index_t src_offset = SrcDesc::GetOffsetFromMultiIndex( - mSrcSliceOrigin + (long_vector_data_begin_id + scalar_id)); - - const index_t buffer_offset = i * src_data_per_access; - - *reinterpret_cast(&p_long_vector[buffer_offset]) = - *reinterpret_cast(&p_src[src_offset]); - } - - // store data from the long-vector buffer to dst - for(index_t i = 0; i < long_vector_size / dst_data_per_access; ++i) - { - auto scalar_id = make_zero_array(); - scalar_id(vector_access_dim) = i * dst_data_per_access; - - const 
index_t buffer_offset = i * dst_data_per_access; - - const index_t dst_offset = DstDesc::GetOffsetFromMultiIndex( - mDstSliceOrigin + (long_vector_data_begin_id + scalar_id)); - - *reinterpret_cast(&p_dst[dst_offset]) = - *reinterpret_cast(&p_long_vector[buffer_offset]); - } - }); -#endif - } - - private: - Array mSrcSliceOrigin; - Array mDstSliceOrigin; -}; - -// This version use TensorCoordinate -// This threadwise copy allow vector access of src and dst. -// It allows the dimensions of vector access to be different on src and dst. -// It also allows the vector size to be different on src and dst. -// It also allows order of access to be different on src and dst. -// It use register as buffer to hold all data moving from src to dst. -// It is designed for copying small amount of data, and src and dst are -// device memory or LDS. -// When copying large amout of data, let's hope compiler will reduce register -// used for the buffer. -template -struct ThreadwiseGenericTensorSliceCopy_v2r1 -{ - static constexpr index_t nDim = SliceLengths::GetSize(); - - using Index = MultiIndex; - - using SrcCoordinate = typename TensorCoordinate::type; - using DstCoordinate = typename TensorCoordinate::type; - - __device__ constexpr ThreadwiseGenericTensorSliceCopy_v2r1(const Index& src_slice_origin, - const Index& dst_slice_origin) - : mSrcSliceOrigin(src_slice_origin), mDstSliceOrigin(dst_slice_origin) - { - static_assert(nDim == SrcDesc::GetNumOfDimension() && - nDim == DstDesc::GetNumOfDimension() && nDim == SliceLengths::GetSize() && - nDim == SrcDimAccessOrder::GetSize() && - nDim == DstDimAccessOrder::GetSize(), - "wrong! # of dimensions not the same"); - - static_assert(is_valid_sequence_map::value && - is_valid_sequence_map::value, - "wrong! map is not valid"); - - static_assert(SliceLengths{}[SrcVectorAccessDim] % SrcDataPerAccess == 0 && - SliceLengths{}[DstVectorAccessDim] % DstDataPerAccess == 0, - "wrong! cannot evenly divide"); - - // check vectorized memory access - constexpr auto src_vector_access_dim = Number{}; - constexpr auto dst_vector_access_dim = Number{}; - - static_if{}( - [&](auto fwd) { - static_assert( - (fwd(SrcDesc{}).GetStride(src_vector_access_dim) == 1 || SrcDataPerAccess == 1), - "wrong! vectorized access is allowed only if stride == 1"); - }) - .Else([&](auto fwd) { - static_assert( - (fwd(SrcDesc{}).GetLastOriginalDimensionStride(src_vector_access_dim) == 1 || - SrcDataPerAccess == 1), - "wrong! vectorized access is allowed only if stride == 1"); - }); - - static_if{}( - [&](auto fwd) { - static_assert( - (fwd(DstDesc{}).GetStride(dst_vector_access_dim) == 1 || DstDataPerAccess == 1), - "wrong! vectorized access is allowed only if stride == 1"); - }) - .Else([&](auto fwd) { - static_assert( - (fwd(DstDesc{}).GetLastOriginalDimensionStride(dst_vector_access_dim) == 1 || - DstDataPerAccess == 1), - "wrong! vectorized access is allowed only if stride == 1"); - }); - } - - __device__ constexpr ThreadwiseGenericTensorSliceCopy_v2r1() - : ThreadwiseGenericTensorSliceCopy_v2r1(make_zero_array(), - make_zero_array()) - { - } - - __device__ void SetSrcSliceOrigin(SrcCoordinate src_slice_origin) - { - mSrcSliceOrigin = src_slice_origin; - } - - __device__ void SetDstSliceOrigin(DstCoordinate dst_slice_origin) - { - mDstSliceOrigin = dst_slice_origin; - } - - template - struct IsolateMergedDimLengths - { - template - __device__ constexpr index_t operator()(IDim idim) const - { - return TDesc::ContainMultipleOriginalDimensions(idim) ? 
Lengths{}[idim] : 1; - } - }; - - template - __device__ void Run(const TData* p_src, TData* p_dst) const - { - constexpr auto buffer_desc = make_ConstantTensorDescriptor_packed(SliceLengths{}); - - TData p_buffer_[buffer_desc.GetElementSpace()]; - TData* p_buffer = p_buffer_; - - // copy data from src into buffer - { - using src_vector_t = typename vector_type::MemoryType; - - constexpr auto src_vector_access_dim = Number{}; - constexpr auto src_data_per_access = Number{}; - - constexpr auto src_access_lengths = SliceLengths::Modify( - src_vector_access_dim, - SliceLengths::Get(src_vector_access_dim) / src_data_per_access); - - // Offset w.r.t merged dimensions need to be calculated at run-time. Offset w.r.t - // normal dimensions is known at compile time. - // Below is a hack to isolate merged dimension id from normal dimension id, so the - // corresponding offset can be calculated seperately at run-time and compile-time. - // src_merged_dim_access_lengths has the same value as src_access_lengths on src's - // merged dimensions, and has value = 1 on normal dimensions; - // src_merged_dim_access_lengths has the same value as src_access_lengths on src's - // normal dimensions, and has value = 1 on merged dimensions; - constexpr auto src_merged_dim_access_lengths = typename sequence_gen< - nDim, - IsolateMergedDimLengths>::type{}; - - constexpr auto src_normal_dim_access_lengths = - src_access_lengths + Number<1>{} - src_merged_dim_access_lengths; - -#if CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V2R1 - // offset w.r.t. merged dimension need to be computed at run-time - static_ford{}([&]( - auto src_merged_dim_access_id_) { - - constexpr auto src_merged_dim_access_id = decltype(src_merged_dim_access_id_){}; - - constexpr auto src_merged_dim_data_id = src_merged_dim_access_id.Modify( - src_vector_access_dim, - src_merged_dim_access_id[src_vector_access_dim] * src_data_per_access); - - const TData* p_src_tmp = - p_src + (mSrcSliceOrigin + src_merged_dim_data_id).GetOffset(); - - // offset w.r.t. 
normal dimension can be computed at compile-time - static_ford{}([&]( - auto src_normal_dim_access_id_) { - - constexpr auto src_normal_dim_access_id = decltype(src_normal_dim_access_id_){}; - - constexpr auto src_normal_dim_data_id = src_normal_dim_access_id.Modify( - src_vector_access_dim, - src_normal_dim_access_id[src_vector_access_dim] * src_data_per_access); - - constexpr index_t src_normal_offset = - SrcDesc::GetOffsetFromMultiIndex(src_normal_dim_data_id); - - // load vector from src - const src_vector_t vector_data = - *reinterpret_cast(&p_src_tmp[src_normal_offset]); - - // unpack vector into buffer - static_for<0, SrcDataPerAccess, 1>{}([&](auto i) { - constexpr auto scalar_id = - typename uniform_sequence_gen::type{}.Modify( - src_vector_access_dim, i); - - constexpr index_t buffer_offset = buffer_desc.GetOffsetFromMultiIndex( - src_merged_dim_data_id + src_normal_dim_data_id + scalar_id); - - p_buffer[buffer_offset] = reinterpret_cast(&vector_data)[i]; - }); - }); - }); -#else - ford{}([&]( - auto src_merged_dim_access_id) { - - auto src_merged_dim_data_id = src_merged_dim_access_id; - src_merged_dim_data_id(src_vector_access_dim) = - src_merged_dim_access_id[src_vector_access_dim] * src_data_per_access; - - const TData* p_src_tmp = - p_src + (mSrcSliceOrigin + src_merged_dim_data_id).GetOffset(); - - // these should be compile-time known - ford{}([&]( - auto src_normal_dim_access_id) { - - auto src_normal_dim_data_id = src_normal_dim_access_id; - src_normal_dim_data_id(src_vector_access_dim) = - src_normal_dim_access_id[src_vector_access_dim] * src_data_per_access; - - const index_t src_normal_offset = - SrcDesc::GetOffsetFromMultiIndex(src_normal_dim_data_id); - - // load vector from src - const src_vector_t vector_data = - *reinterpret_cast(&p_src_tmp[src_normal_offset]); - - // unpack vector into buffer - for(index_t i = 0; i < SrcDataPerAccess; ++i) - { - auto scalar_id = make_zero_array(); - scalar_id(src_vector_access_dim) = i; - - const index_t buffer_offset = buffer_desc.GetOffsetFromMultiIndex( - src_merged_dim_data_id + src_normal_dim_data_id + scalar_id); - - p_buffer[buffer_offset] = reinterpret_cast(&vector_data)[i]; - } - }); - }); -#endif - } - - // copy data from buffer into dst - { - using dst_vector_t = typename vector_type::MemoryType; - - constexpr auto dst_vector_access_dim = Number{}; - constexpr auto dst_data_per_access = Number{}; - - constexpr auto dst_access_lengths = SliceLengths::Modify( - dst_vector_access_dim, - SliceLengths::Get(dst_vector_access_dim) / dst_data_per_access); - - constexpr auto dst_merged_dim_access_lengths = typename sequence_gen< - nDim, - IsolateMergedDimLengths>::type{}; - - constexpr auto dst_normal_dim_access_lengths = - dst_access_lengths + Number<1>{} - dst_merged_dim_access_lengths; - -#if CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V2R1 - // offset w.r.t. merged dimension need to be computed at run-time - static_ford{}([&]( - auto dst_merged_dim_access_id_) { - - constexpr auto dst_merged_dim_access_id = decltype(dst_merged_dim_access_id_){}; - - constexpr auto dst_merged_dim_data_id = dst_merged_dim_access_id.Modify( - dst_vector_access_dim, - dst_merged_dim_access_id[dst_vector_access_dim] * dst_data_per_access); - - TData* p_dst_tmp = p_dst + (mDstSliceOrigin + dst_merged_dim_data_id).GetOffset(); - - // offset w.r.t. 
normal dimension can be computed at compile-time - static_ford{}([&]( - auto dst_normal_dim_access_id_) { - constexpr auto dst_normal_dim_access_id = decltype(dst_normal_dim_access_id_){}; - - constexpr auto dst_normal_dim_data_id = dst_normal_dim_access_id.Modify( - dst_vector_access_dim, - dst_normal_dim_access_id[dst_vector_access_dim] * dst_data_per_access); - - dst_vector_t vector_data; - - // pack vector from buffer - static_for<0, DstDataPerAccess, 1>{}([&](auto i) { - constexpr auto scalar_id = - typename uniform_sequence_gen::type{}.Modify( - dst_vector_access_dim, i); - - constexpr index_t buffer_offset = buffer_desc.GetOffsetFromMultiIndex( - dst_merged_dim_data_id + dst_normal_dim_data_id + scalar_id); - - reinterpret_cast(&vector_data)[i] = p_buffer[buffer_offset]; - }); - - constexpr index_t dst_normal_offset = - DstDesc::GetOffsetFromMultiIndex(dst_normal_dim_data_id); - - // write vector into dst - *reinterpret_cast(&p_dst_tmp[dst_normal_offset]) = vector_data; - }); - }); -#else - // offset w.r.t. merged dimension need to be computed at run-time - ford{}([&]( - auto dst_merged_dim_access_id) { - - auto dst_merged_dim_data_id = dst_merged_dim_access_id; - dst_merged_dim_data_id(dst_vector_access_dim) = - dst_merged_dim_access_id[dst_vector_access_dim] * dst_data_per_access; - - TData* p_dst_tmp = p_dst + (mDstSliceOrigin + dst_merged_dim_data_id).GetOffset(); - - // offset w.r.t. normal dimension can be computed at compile-time - ford{}([&]( - auto dst_normal_dim_access_id) { - - auto dst_normal_dim_data_id = dst_normal_dim_access_id; - dst_normal_dim_data_id(dst_vector_access_dim) = - dst_normal_dim_access_id[dst_vector_access_dim] * dst_data_per_access; - - dst_vector_t vector_data; - - // pack vector from buffer - for(index_t i = 0; i < DstDataPerAccess; ++i) - { - auto scalar_id = make_zero_array(); - scalar_id(dst_vector_access_dim) = i; - - const index_t buffer_offset = buffer_desc.GetOffsetFromMultiIndex( - dst_merged_dim_data_id + dst_normal_dim_data_id + scalar_id); - - reinterpret_cast(&vector_data)[i] = p_buffer[buffer_offset]; - } - - const index_t dst_normal_offset = - DstDesc::GetOffsetFromMultiIndex(dst_normal_dim_data_id); - - // write vector into dst - *reinterpret_cast(&p_dst_tmp[dst_normal_offset]) = vector_data; - }); - }); -#endif - } - } - - // memory-space - // 0: VGPR - // 1: LDS - // 2: global-memory - template - __device__ void Run_amd_experiment(const TData* p_src, TData* p_dst) const - { - constexpr auto buffer_desc = make_ConstantTensorDescriptor_packed(SliceLengths{}); - - TData p_buffer_[buffer_desc.GetElementSpace()]; - TData* p_buffer = p_buffer_; - - // copy data from src into buffer - { - using src_vector_t = typename vector_type::MemoryType; - - constexpr auto src_vector_access_dim = Number{}; - constexpr auto src_data_per_access = Number{}; - - constexpr auto src_access_lengths = SliceLengths::Modify( - src_vector_access_dim, - SliceLengths::Get(src_vector_access_dim) / src_data_per_access); - - // Offset w.r.t merged dimensions need to be calculated at run-time. Offset w.r.t - // normal dimensions is known at compile time. - // Below is a hack to isolate merged dimension id from normal dimension id, so the - // corresponding offset can be calculated seperately at run-time and compile-time. 
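The idea behind the split can be shown without the descriptor machinery: the contribution of the normal dimensions folds into a compile-time constant, the contribution of the merged dimensions stays a run-time value, and the final address is just their sum. A small sketch under those assumptions (stride values and names are made up for the example):

    #include <cstddef>

    // Normal (non-merged) dimensions: strides are compile-time constants, so this
    // part of the address can be folded into a constexpr immediate.
    constexpr std::size_t NormalOffset(std::size_t i2, std::size_t i3)
    {
        constexpr std::size_t stride2 = 16; // assumed packed layout
        constexpr std::size_t stride3 = 1;
        return i2 * stride2 + i3 * stride3;
    }

    // Merged dimensions: the thread's slice origin is only known at run time, so
    // this part of the address stays a run-time value.
    std::size_t MergedOffset(std::size_t i0, std::size_t i1,
                             std::size_t stride0, std::size_t stride1)
    {
        return i0 * stride0 + i1 * stride1;
    }

    // The final address is the sum of the two parts.
    std::size_t FinalOffset(std::size_t merged, std::size_t normal)
    {
        return merged + normal;
    }

The masked access lengths described next are how the loop nests express this separation, walking only the dimensions whose offsets they are responsible for.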
- // src_merged_dim_access_lengths has the same value as src_access_lengths on src's - // merged dimensions, and has value = 1 on normal dimensions; - // src_merged_dim_access_lengths has the same value as src_access_lengths on src's - // normal dimensions, and has value = 1 on merged dimensions; - constexpr auto src_merged_dim_access_lengths = typename sequence_gen< - nDim, - IsolateMergedDimLengths>::type{}; - - constexpr auto src_normal_dim_access_lengths = - src_access_lengths + Number<1>{} - src_merged_dim_access_lengths; - - ford{}([&]( - auto src_merged_dim_access_id) { - - auto src_merged_dim_data_id = src_merged_dim_access_id; - src_merged_dim_data_id(src_vector_access_dim) = - src_merged_dim_access_id[src_vector_access_dim] * src_data_per_access; - - // offset w.r.t. merged dimension need be computed at run-time, - const index_t src_merged_offset = - (mSrcSliceOrigin + src_merged_dim_data_id).GetOffset(); - - ford{}([&]( - auto src_normal_dim_access_id) { - - auto src_normal_dim_data_id = src_normal_dim_access_id; - src_normal_dim_data_id(src_vector_access_dim) = - src_normal_dim_access_id[src_vector_access_dim] * src_data_per_access; - - // offset w.r.t. normal dimension is known at compile-time - const index_t src_normal_offset = - SrcDesc::GetOffsetFromMultiIndex(src_normal_dim_data_id); - - src_vector_t vector_data; - - // Read vector from src. - // 1. Source code version can take src of all kinds of memory-space - // 2. Inline asm versions using global_load or buffer_load can only take - // src from global-memory - // - // Commemt for loading from global-memory: - // When - // 1) using source code, in order for compiler to emit optimal - // load instruction, or - // 2) using inline asm (global_load or buffer_load), in order - // for inline asm to be valid, - // following assumptions need to be satisfied: - // 1. p_src need to be block-invariant (assumption) - // 2. src_normal_offset must be calculatd at compile time (guaranteed) - // 3. src_merged_offset can be runtime value (no assumption imposed) - static_if{}([&](auto) { -#if CK_USE_AMD_INTRINSIC && CK_USE_AMD_INTRINSIC_BUFFER_LOAD_STORE - vector_data = __buffer_load( - p_src, - static_cast(src_merged_offset), - static_cast(src_normal_offset)); -#else - vector_data = *reinterpret_cast( - &p_src[src_normal_offset + src_merged_offset]); -#endif - }).Else([&](auto) { - // src can be all kinds of memory-space. 
- vector_data = *reinterpret_cast( - &p_src[src_normal_offset + src_merged_offset]); - }); - - // unpack vector into buffer - for(index_t i = 0; i < SrcDataPerAccess; ++i) - { - auto scalar_id = make_zero_array(); - scalar_id(src_vector_access_dim) = i; - - const index_t buffer_offset = buffer_desc.GetOffsetFromMultiIndex( - src_merged_dim_data_id + src_normal_dim_data_id + scalar_id); - - p_buffer[buffer_offset] = reinterpret_cast(&vector_data)[i]; - } - }); - }); - } - - // copy data from buffer into dst - { - using dst_vector_t = typename vector_type::MemoryType; - - constexpr auto dst_vector_access_dim = Number{}; - constexpr auto dst_data_per_access = Number{}; - - constexpr auto dst_access_lengths = SliceLengths::Modify( - dst_vector_access_dim, - SliceLengths::Get(dst_vector_access_dim) / dst_data_per_access); - - constexpr auto dst_merged_dim_access_lengths = typename sequence_gen< - nDim, - IsolateMergedDimLengths>::type{}; - - constexpr auto dst_normal_dim_access_lengths = - dst_access_lengths + Number<1>{} - dst_merged_dim_access_lengths; - - ford{}( - [&](auto dst_merged_dim_access_id) { - - auto dst_merged_dim_data_id = dst_merged_dim_access_id; - dst_merged_dim_data_id(dst_vector_access_dim) = - dst_merged_dim_access_id[dst_vector_access_dim] * dst_data_per_access; - - // offset w.r.t. merged dimension need be computed at run-time, - const index_t dst_merged_offset = - (mDstSliceOrigin + dst_merged_dim_data_id).GetOffset(); - - ford{}([&]( - auto dst_normal_dim_access_id) { - - auto dst_normal_dim_data_id = dst_normal_dim_access_id; - dst_normal_dim_data_id(dst_vector_access_dim) = - dst_normal_dim_access_id[dst_vector_access_dim] * dst_data_per_access; - - dst_vector_t vector_data; - - // pack vector from buffer - for(index_t i = 0; i < DstDataPerAccess; ++i) - { - auto scalar_id = make_zero_array(); - scalar_id(dst_vector_access_dim) = i; - - const index_t buffer_offset = buffer_desc.GetOffsetFromMultiIndex( - dst_merged_dim_data_id + dst_normal_dim_data_id + scalar_id); - - reinterpret_cast(&vector_data)[i] = p_buffer[buffer_offset]; - } - - // offset w.r.t. normal dimension is known at compile-time - const index_t dst_normal_offset = - DstDesc::GetOffsetFromMultiIndex(dst_normal_dim_data_id); - - // Write vector into dst. - // 1. Source code version can take dst of all kinds of memory-space - // 2. Inline asm versions using global_store or buffer_store can only take - // dst from global-memory - // - // Commemt for storing into global-memory: - // When - // 1) using source code, in order for compiler to emit optimal - // store instruction, or - // 2) using inline asm (global_store or buffer_store), in order - // for inline asm to be valid, - // following assumptions need to be satisfied: - // 1. p_dst need to be block-invariant (assumption) - // 2. dst_normal_offset must be calculatd at compile time (guaranteed) - // 3. 
dst_merged_offset can be runtime value (no assumption imposed) - static_if{}([&](auto) { -#if CK_USE_AMD_INTRINSIC && CK_USE_AMD_INTRINSIC_BUFFER_LOAD_STORE - __buffer_store( - vector_data, p_dst, dst_merged_offset, dst_normal_offset); -#else - *reinterpret_cast( - &p_dst[dst_normal_offset + dst_merged_offset]) = vector_data; -#endif - }).Else([&](auto) { - // dst can be all kinds of memory-space - *reinterpret_cast( - &p_dst[dst_normal_offset + dst_merged_offset]) = vector_data; - }); - }); - }); - } - } - - // T can be Sequence or Array - template - __device__ void MoveSrcSliceWindow(T step_sizes, integral_constant) - { - static_if{}([&](auto) { - mSrcSliceOrigin += step_sizes; - }).Else([&](auto) { mSrcSliceOrigin -= step_sizes; }); - } - - template - __device__ void MoveDstSliceWindow(T step_sizes, integral_constant) - { - static_if{}([&](auto) { - mDstSliceOrigin += step_sizes; - }).Else([&](auto) { mDstSliceOrigin -= step_sizes; }); - } - - private: - SrcCoordinate mSrcSliceOrigin; - DstCoordinate mDstSliceOrigin; -}; - -// this version use TensorView and TensorCoordinate -template -struct ThreadwiseGenericTensorSliceCopy_v3r1 -{ - static constexpr index_t nDim = SrcTensor::GetNumOfDimension(); - using data_type = remove_cv_t; - - using SrcCoordinate = typename SrcTensor::coordinate_type; - using DstCoordinate = typename DstTensor::coordinate_type; - - __device__ constexpr ThreadwiseGenericTensorSliceCopy_v3r1(SrcTensor src, - SrcCoordinate src_slice_origin, - DstTensor dst, - DstCoordinate dst_slice_origin) - : mSrc{src}, - mDst{dst}, - mSrcSlice{src.Slice(src_slice_origin, SliceLengths{})}, - mDstSlice{dst.Slice(dst_slice_origin, SliceLengths{})} - { - static_assert(nDim == SrcTensor::GetNumOfDimension() && - nDim == DstTensor::GetNumOfDimension() && - nDim == SliceLengths::GetSize() && nDim == SrcDimAccessOrder::GetSize() && - nDim == DstDimAccessOrder::GetSize(), - "wrong! # of dimensions not the same"); - - static_assert(is_valid_sequence_map::value && - is_valid_sequence_map::value, - "wrong! map is not valid"); - - static_assert(is_same, - remove_cv_t>{}, - "wrong! type conversion is not supported yet"); - - static_assert(decltype(mSrcSlice)::IsVectorizationAllowed(Number{}, - Number{}) && - decltype(mDstSlice)::IsVectorizationAllowed(Number{}, - Number{}), - "wrong! 
vectorized access is not allowed"); - } - - __device__ constexpr ThreadwiseGenericTensorSliceCopy_v3r1() - : ThreadwiseGenericTensorSliceCopy_v3r1( - SrcTensor{}, SrcCoordinate{}, DstTensor{}, DstCoordinate{}) - { - } - - __device__ void Run() const - { - // buffer - constexpr auto buffer_desc = make_ConstantTensorDescriptor_packed(SrcTensor::GetLengths()); - data_type p_buffer[buffer_desc.GetElementSpace()]; - auto buffer = make_TensorView(buffer_desc, p_buffer); - - // copy data from src into buffer - { - using src_vector_t = typename vector_type::MemoryType; - - constexpr auto src_vector_access_dim = Number{}; - constexpr auto src_data_per_access = Number{}; - - auto src_slice_vectorized = - mSrcSlice.Vectorize(src_vector_access_dim, src_data_per_access); - - ford{}( - [&](auto src_vector_id) { - // load vector from src - const src_vector_t vector_data = src_slice_vectorized[src_vector_id]; - - // unpack vector into buffer - auto src_scalar_id = src_vector_id; - src_scalar_id(src_vector_access_dim) *= src_data_per_access; - - for(index_t i = 0; i < SrcDataPerAccess; ++i) - { - auto id = make_zero_array(); - id(src_vector_access_dim) = i; - - buffer(src_scalar_id + id) = - reinterpret_cast(&vector_data)[i]; - } - }); - } - - // copy data from buffer into dst - { - using dst_vector_t = typename vector_type::MemoryType; - - constexpr auto dst_vector_access_dim = Number{}; - constexpr auto dst_data_per_access = Number{}; - - auto dst_slice_vectorized = - mDstSlice.Vectorize(dst_vector_access_dim, dst_data_per_access); - - ford{}( - [&](auto dst_vector_id) { - - dst_vector_t vector_data{}; - - // pack vector from buffer - auto dst_scalar_id = dst_vector_id; - dst_scalar_id(dst_vector_access_dim) *= dst_data_per_access; - - for(index_t i = 0; i < DstDataPerAccess; ++i) - { - auto id = make_zero_array(); - id(dst_vector_access_dim) = i; - - reinterpret_cast(&vector_data)[i] = buffer[dst_scalar_id + id]; - } - - // write vector into dst - dst_slice_vectorized(dst_vector_id) = vector_data; - }); - } - } - - // T can be Sequence or Array - template - __device__ void MoveSrcSliceWindow(T step_sizes, integral_constant) - { - mSrc.MoveSliceWindow(mSrcSlice, step_sizes, integral_constant{}); - } - - template - __device__ void MoveDstSliceWindow(T step_sizes, integral_constant) - { - mDst.MoveSliceWindow(mDstSlice, step_sizes, integral_constant{}); - } - - private: - using SrcSlice = decltype(SrcTensor{}.Slice(make_zero_array(), SliceLengths{})); - using DstSlice = decltype(DstTensor{}.Slice(make_zero_array(), SliceLengths{})); - - SrcTensor mSrc; - DstTensor mDst; - SrcSlice mSrcSlice; - DstSlice mDstSlice; -}; - -#endif - // This version use multi-index transformation // This threadwise copy allow vector access of src and dst. // It allows the vector size to be different on src and dst. 
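Several of these copy classes allow the source and destination vector widths to differ; the v1r2 copy added later in this patch, for example, stages data through a "long vector" register buffer whose length is the least common multiple of the two access widths. A minimal host-side sketch of that staging idea, with made-up widths (4-wide reads, 2-wide writes) and plain arrays in place of tensor descriptors:

    // C++17 (std::lcm)
    #include <algorithm>
    #include <cstdio>
    #include <numeric>

    int main()
    {
        constexpr int src_data_per_access = 4;
        constexpr int dst_data_per_access = 2;
        constexpr int long_vector_size =
            std::lcm(src_data_per_access, dst_data_per_access); // 4

        float src[8], dst[8] = {};
        for(int i = 0; i < 8; ++i)
            src[i] = static_cast<float>(i);

        for(int v = 0; v < 8 / long_vector_size; ++v)
        {
            float long_vector[long_vector_size]; // register staging buffer

            // load from src in src_data_per_access-wide chunks
            for(int i = 0; i < long_vector_size / src_data_per_access; ++i)
                std::copy_n(&src[v * long_vector_size + i * src_data_per_access],
                            src_data_per_access,
                            &long_vector[i * src_data_per_access]);

            // store to dst in dst_data_per_access-wide chunks
            for(int i = 0; i < long_vector_size / dst_data_per_access; ++i)
                std::copy_n(&long_vector[i * dst_data_per_access],
                            dst_data_per_access,
                            &dst[v * long_vector_size + i * dst_data_per_access]);
        }

        for(int i = 0; i < 8; ++i)
            std::printf("%g ", dst[i]);
        std::printf("\n");
        return 0;
    }

In the kernels themselves the per-chunk copies are vector-typed loads and stores rather than std::copy_n, and the loop runs over an N-dimensional access range instead of a flat array.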
@@ -1141,8 +35,8 @@ struct ThreadwiseGenericTensorSliceCopy_v4r2 static constexpr index_t nDim = SliceLengths::Size(); using Index = MultiIndex; - using SrcCoord = typename TensorCoordinate_v2::type; - using DstCoord = typename TensorCoordinate_v2::type; + using SrcCoord = typename TensorCoordinate::type; + using DstCoord = typename TensorCoordinate::type; __device__ constexpr ThreadwiseGenericTensorSliceCopy_v4r2(const Index& src_slice_origin, const Index& dst_slice_origin) diff --git a/composable_kernel/include/tensor_operation/threadwise_generic_tensor_slice_copy_deprecated.hpp b/composable_kernel/include/tensor_operation/threadwise_generic_tensor_slice_copy_deprecated.hpp new file mode 100644 index 0000000000..f942422cf0 --- /dev/null +++ b/composable_kernel/include/tensor_operation/threadwise_generic_tensor_slice_copy_deprecated.hpp @@ -0,0 +1,1129 @@ +#ifndef CK_THREADWISE_GENERIC_TENSOR_SLICE_COPY_DEPRECATED_HPP +#define CK_THREADWISE_GENERIC_TENSOR_SLICE_COPY_DEPRECATED_HPP + +#include "common_header.hpp" +#include "ConstantTensorDescriptor.hpp" +#include "ConstantMergedTensorDescriptor.hpp" +#include "tensor_view.hpp" +#include "tensor_coordinate_deprecated.hpp" + +#ifndef CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V1R1 +#define CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V1R1 0 +#endif + +#ifndef CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V1R2 +#define CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V1R2 0 +#endif + +#ifndef CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V2R1 +#define CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V2R1 0 +#endif + +#ifndef CK_USE_AMD_INTRINSIC +#define CK_USE_AMD_INTRINSIC 1 +#endif + +#ifndef CK_USE_AMD_INTRINSIC_BUFFER_LOAD_STORE +#define CK_USE_AMD_INTRINSIC_BUFFER_LOAD_STORE 1 +#endif + +namespace ck { + +// This threadwise copy allow vector access of src and dst. +// It allows the dimensions of vector access to be different on src and dst. +// It also allows the vector size to be different on src and dst. +// It also allows order of access to be different on src and dst. +// It use register as buffer to hold all data moving from src to dst. +// It is designed for copying small amount of data, and src and dst are +// device memory or LDS. +// When copying large amout of data, let's hope compiler will reduce register +// used for the buffer. +template +struct ThreadwiseGenericTensorSliceCopy_v1r1 +{ + static constexpr index_t nDim = SliceLengths::GetSize(); + + __device__ constexpr ThreadwiseGenericTensorSliceCopy_v1r1( + Array src_slice_origin, Array dst_slice_origin) + : mSrcSliceOrigin(src_slice_origin), mDstSliceOrigin(dst_slice_origin) + { + static_assert(nDim == SrcDesc::GetNumOfDimension() && + nDim == DstDesc::GetNumOfDimension() && nDim == SliceLengths::GetSize() && + nDim == SrcDimAccessOrder::GetSize() && + nDim == DstDimAccessOrder::GetSize(), + "wrong! # of dimensions not the same"); + + static_assert(is_valid_sequence_map::value && + is_valid_sequence_map::value, + "wrong! map is not valid"); + + static_assert(SliceLengths{}[SrcVectorAccessDim] % SrcDataPerAccess == 0 && + SliceLengths{}[DstVectorAccessDim] % DstDataPerAccess == 0, + "wrong! 
cannot evenly divide"); + + // check vectorized memory access + constexpr auto src_vector_access_dim = Number{}; + constexpr auto dst_vector_access_dim = Number{}; + + static_if{}( + [&](auto fwd) { + static_assert( + (fwd(SrcDesc{}).GetStride(src_vector_access_dim) == 1 || SrcDataPerAccess == 1), + "wrong! vectorized access is allowed only if stride == 1"); + }) + .Else([&](auto fwd) { + static_assert( + (fwd(SrcDesc{}).GetLastOriginalDimensionStride(src_vector_access_dim) == 1 || + SrcDataPerAccess == 1), + "wrong! vectorized access is allowed only if stride == 1"); + }); + + static_if{}( + [&](auto fwd) { + static_assert( + (fwd(DstDesc{}).GetStride(dst_vector_access_dim) == 1 || DstDataPerAccess == 1), + "wrong! vectorized access is allowed only if stride == 1"); + }) + .Else([&](auto fwd) { + static_assert( + (fwd(DstDesc{}).GetLastOriginalDimensionStride(dst_vector_access_dim) == 1 || + DstDataPerAccess == 1), + "wrong! vectorized access is allowed only if stride == 1"); + }); + } + + __device__ constexpr ThreadwiseGenericTensorSliceCopy_v1r1() + : ThreadwiseGenericTensorSliceCopy_v1r1(make_zero_array(), + make_zero_array()) + { + } + + __device__ void SetSrcSliceOrigin(Array src_slice_origin) + { + mSrcSliceOrigin = src_slice_origin; + } + + __device__ void SetDstSliceOrigin(Array dst_slice_origin) + { + mDstSliceOrigin = dst_slice_origin; + } + + template + __device__ void Run(const TData* p_src, TData* p_dst) const + { + constexpr auto buffer_desc = make_ConstantTensorDescriptor_packed(SliceLengths{}); + + TData p_buffer_[buffer_desc.GetElementSpace()]; + TData* p_buffer = p_buffer_; + + // copy data from src into buffer + { + using vector_t = typename vector_type::MemoryType; + + constexpr auto src_vector_access_dim = Number{}; + constexpr auto src_data_per_access = Number{}; + + constexpr auto src_access_lengths = SliceLengths::Modify( + src_vector_access_dim, + SliceLengths::Get(src_vector_access_dim) / src_data_per_access); + +#if CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V1R1 + static_ford{}([&](auto src_access_id) { + constexpr auto src_data_begin_id = src_access_id.Modify( + src_vector_access_dim, + src_access_id[src_vector_access_dim] * src_data_per_access); + + const index_t src_offset = + SrcDesc::GetOffsetFromMultiIndex(mSrcSliceOrigin + src_data_begin_id); + + // load vector from src + const vector_t vector_data = *reinterpret_cast(&p_src[src_offset]); + + // unpack vector into buffer + static_for<0, SrcDataPerAccess, 1>{}([&](auto i) { + constexpr auto scalar_id = + typename uniform_sequence_gen::type{}.Modify(src_vector_access_dim, + i); + + constexpr index_t buffer_offset = + buffer_desc.GetOffsetFromMultiIndex(src_data_begin_id + scalar_id); + + p_buffer[buffer_offset] = reinterpret_cast(&vector_data)[i]; + }); + }); +#else + ford{}([&](auto src_access_id) { + auto src_data_begin_id = src_access_id; + src_data_begin_id(src_vector_access_dim) = + src_access_id[src_vector_access_dim] * src_data_per_access; + + const index_t src_offset = + SrcDesc::GetOffsetFromMultiIndex(mSrcSliceOrigin + src_data_begin_id); + + // load vector from src + const vector_t vector_data = *reinterpret_cast(&p_src[src_offset]); + + // unpack vector into buffer + for(index_t i = 0; i < SrcDataPerAccess; ++i) + { + auto scalar_id = make_zero_array(); + scalar_id(src_vector_access_dim) = i; + + const index_t buffer_offset = + buffer_desc.GetOffsetFromMultiIndex(src_data_begin_id + scalar_id); + + p_buffer[buffer_offset] = reinterpret_cast(&vector_data)[i]; + 
} + }); +#endif + } + + // copy data from buffer to dst + { + using vector_t = typename vector_type::MemoryType; + + constexpr auto dst_vector_access_dim = Number{}; + constexpr auto dst_data_per_access = Number{}; + + constexpr auto dst_access_lengths = SliceLengths::Modify( + dst_vector_access_dim, + SliceLengths::Get(dst_vector_access_dim) / dst_data_per_access); + +#if CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V1R1 + static_ford{}([&](auto dst_access_id) { + constexpr auto dst_data_begin_id = dst_access_id.Modify( + dst_vector_access_dim, + dst_access_id[dst_vector_access_dim] * dst_data_per_access); + + vector_t vector_data{}; + + // pack vector from buffer + static_for<0, DstDataPerAccess, 1>{}([&](auto i) { + constexpr auto scalar_id = + typename uniform_sequence_gen::type{}.Modify(dst_vector_access_dim, + i); + + constexpr index_t buffer_offset = + buffer_desc.GetOffsetFromMultiIndex(dst_data_begin_id + scalar_id); + + reinterpret_cast(&vector_data)[i] = p_buffer[buffer_offset]; + }); + + const index_t dst_offset = + DstDesc::GetOffsetFromMultiIndex(mDstSliceOrigin + dst_data_begin_id); + + // store vector into dst + *reinterpret_cast(&p_dst[dst_offset]) = vector_data; + }); +#else + ford{}([&](auto dst_access_id) { + auto dst_data_begin_id = dst_access_id; + dst_data_begin_id(dst_vector_access_dim) = + dst_access_id[dst_vector_access_dim] * dst_data_per_access; + + vector_t vector_data{}; + + // pack vector from buffer + for(index_t i = 0; i < DstDataPerAccess; ++i) + { + auto scalar_id = make_zero_array(); + scalar_id(dst_vector_access_dim) = i; + + const index_t buffer_offset = + buffer_desc.GetOffsetFromMultiIndex(dst_data_begin_id + scalar_id); + + reinterpret_cast(&vector_data)[i] = p_buffer[buffer_offset]; + } + + const index_t dst_offset = + DstDesc::GetOffsetFromMultiIndex(mDstSliceOrigin + dst_data_begin_id); + + // store vector into dst + *reinterpret_cast(&p_dst[dst_offset]) = vector_data; + }); +#endif + } + } + + private: + Array mSrcSliceOrigin; + Array mDstSliceOrigin; +}; + +// This threadwise copy allow vector access of src and dst. +// It allows the vector size to be different on src and dst. +// The dimensions of vector access should be the same on src and dst. +// The dimension access order should be the same on src and dst. +// It is designed for cases, where one of src and dst is register, and +// the other is device memory or LDS +template +struct ThreadwiseGenericTensorSliceCopy_v1r2 +{ + static constexpr index_t nDim = SliceLengths::GetSize(); + + __device__ constexpr ThreadwiseGenericTensorSliceCopy_v1r2( + Array src_slice_origin, Array dst_slice_origin) + : mSrcSliceOrigin(src_slice_origin), mDstSliceOrigin(dst_slice_origin) + { + static_assert(nDim == SrcDesc::GetNumOfDimension() && + nDim == DstDesc::GetNumOfDimension() && nDim == SliceLengths::GetSize() && + nDim == DimAccessOrder::GetSize(), + "wrong! # of dimensions not the same"); + + static_assert(is_valid_sequence_map::value, "wrong! map is not valid"); + + static_assert( + SliceLengths{}[VectorAccessDim] % math::lcm(SrcDataPerAccess, DstDataPerAccess) == 0, + "wrong! cannot evenly divide"); + + // check vectorized memory access + constexpr auto vector_access_dim = Number{}; + + static_if{}([&](auto fwd) { + static_assert( + (fwd(SrcDesc{}).GetStride(vector_access_dim) == 1 || SrcDataPerAccess == 1), + "wrong! 
vectorized access is allowed only if stride == 1"); + }).Else([&](auto fwd) { + static_assert((fwd(SrcDesc{}).GetLastOriginalDimensionStride(vector_access_dim) == 1 || + SrcDataPerAccess == 1), + "wrong! vectorized access is allowed only if stride == 1"); + }); + + static_if{}([&](auto fwd) { + static_assert( + (fwd(DstDesc{}).GetStride(vector_access_dim) == 1 || DstDataPerAccess == 1), + "wrong! vectorized access is allowed only if stride == 1"); + }).Else([&](auto fwd) { + static_assert((fwd(DstDesc{}).GetLastOriginalDimensionStride(vector_access_dim) == 1 || + DstDataPerAccess == 1), + "wrong! vectorized access is allowed only if stride == 1"); + }); + } + + __device__ constexpr ThreadwiseGenericTensorSliceCopy_v1r2() + : ThreadwiseGenericTensorSliceCopy_v1r2(make_zero_array(), + make_zero_array()) + { + } + + __device__ void SetSrcSliceOrigin(Array src_slice_origin) + { + mSrcSliceOrigin = src_slice_origin; + } + + __device__ void SetDstSliceOrigin(Array dst_slice_origin) + { + mDstSliceOrigin = dst_slice_origin; + } + + template + __device__ void Run(const TData* p_src, TData* p_dst) const + { + using src_vector_t = typename vector_type::MemoryType; + using dst_vector_t = typename vector_type::MemoryType; + + constexpr auto vector_access_dim = Number{}; + + constexpr auto src_data_per_access = Number{}; + constexpr auto dst_data_per_access = Number{}; + + constexpr auto long_vector_size = Number{}; + + constexpr auto long_vector_access_lengths = SliceLengths::Modify( + vector_access_dim, SliceLengths::Get(vector_access_dim) / long_vector_size); + +#if CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V1R2 + static_ford{}([&]( + auto long_vector_access_id) { + + // data id w.r.t slicing-window + constexpr auto long_vector_data_begin_id = long_vector_access_id.Modify( + vector_access_dim, long_vector_access_id[vector_access_dim] * long_vector_size); + + // buffer to hold a long-vector + TData p_long_vector[long_vector_size]; + + // load data from src to the long-vector buffer + static_for<0, long_vector_size / src_data_per_access, 1>{}([&](auto i) { + constexpr auto scalar_id = typename uniform_sequence_gen::type{}.Modify( + vector_access_dim, i * src_data_per_access); + + const index_t src_offset = SrcDesc::GetOffsetFromMultiIndex( + mSrcSliceOrigin + (long_vector_data_begin_id + scalar_id)); + + constexpr index_t buffer_offset = i * src_data_per_access; + + *reinterpret_cast(&p_long_vector[buffer_offset]) = + *reinterpret_cast(&p_src[src_offset]); + }); + + // store data from the long-vector buffer to dst + static_for<0, long_vector_size / dst_data_per_access, 1>{}([&](auto i) { + constexpr auto scalar_id = typename uniform_sequence_gen::type{}.Modify( + vector_access_dim, i * dst_data_per_access); + + constexpr index_t buffer_offset = i * dst_data_per_access; + + const index_t dst_offset = DstDesc::GetOffsetFromMultiIndex( + mDstSliceOrigin + (long_vector_data_begin_id + scalar_id)); + + *reinterpret_cast(&p_dst[dst_offset]) = + *reinterpret_cast(&p_long_vector[buffer_offset]); + }); + }); +#else + ford{}( + [&](auto long_vector_access_id) { + + // data id w.r.t slicing-window + auto long_vector_data_begin_id = long_vector_access_id; + long_vector_data_begin_id(vector_access_dim) = + long_vector_size * long_vector_access_id[vector_access_dim]; + + // buffer to hold a long-vector + TData p_long_vector[long_vector_size]; + + // load data from src to the long-vector buffer + for(index_t i = 0; i < long_vector_size / src_data_per_access; ++i) + { + auto 
scalar_id = make_zero_array(); + scalar_id(vector_access_dim) = i * src_data_per_access; + + const index_t src_offset = SrcDesc::GetOffsetFromMultiIndex( + mSrcSliceOrigin + (long_vector_data_begin_id + scalar_id)); + + const index_t buffer_offset = i * src_data_per_access; + + *reinterpret_cast(&p_long_vector[buffer_offset]) = + *reinterpret_cast(&p_src[src_offset]); + } + + // store data from the long-vector buffer to dst + for(index_t i = 0; i < long_vector_size / dst_data_per_access; ++i) + { + auto scalar_id = make_zero_array(); + scalar_id(vector_access_dim) = i * dst_data_per_access; + + const index_t buffer_offset = i * dst_data_per_access; + + const index_t dst_offset = DstDesc::GetOffsetFromMultiIndex( + mDstSliceOrigin + (long_vector_data_begin_id + scalar_id)); + + *reinterpret_cast(&p_dst[dst_offset]) = + *reinterpret_cast(&p_long_vector[buffer_offset]); + } + }); +#endif + } + + private: + Array mSrcSliceOrigin; + Array mDstSliceOrigin; +}; + +// This version use TensorCoordinate_deprecated +// This threadwise copy allow vector access of src and dst. +// It allows the dimensions of vector access to be different on src and dst. +// It also allows the vector size to be different on src and dst. +// It also allows order of access to be different on src and dst. +// It use register as buffer to hold all data moving from src to dst. +// It is designed for copying small amount of data, and src and dst are +// device memory or LDS. +// When copying large amout of data, let's hope compiler will reduce register +// used for the buffer. +template +struct ThreadwiseGenericTensorSliceCopy_v2r1 +{ + static constexpr index_t nDim = SliceLengths::GetSize(); + + using Index = MultiIndex; + + using SrcCoordinate = typename TensorCoordinate_deprecated::type; + using DstCoordinate = typename TensorCoordinate_deprecated::type; + + __device__ constexpr ThreadwiseGenericTensorSliceCopy_v2r1(const Index& src_slice_origin, + const Index& dst_slice_origin) + : mSrcSliceOrigin(src_slice_origin), mDstSliceOrigin(dst_slice_origin) + { + static_assert(nDim == SrcDesc::GetNumOfDimension() && + nDim == DstDesc::GetNumOfDimension() && nDim == SliceLengths::GetSize() && + nDim == SrcDimAccessOrder::GetSize() && + nDim == DstDimAccessOrder::GetSize(), + "wrong! # of dimensions not the same"); + + static_assert(is_valid_sequence_map::value && + is_valid_sequence_map::value, + "wrong! map is not valid"); + + static_assert(SliceLengths{}[SrcVectorAccessDim] % SrcDataPerAccess == 0 && + SliceLengths{}[DstVectorAccessDim] % DstDataPerAccess == 0, + "wrong! cannot evenly divide"); + + // check vectorized memory access + constexpr auto src_vector_access_dim = Number{}; + constexpr auto dst_vector_access_dim = Number{}; + + static_if{}( + [&](auto fwd) { + static_assert( + (fwd(SrcDesc{}).GetStride(src_vector_access_dim) == 1 || SrcDataPerAccess == 1), + "wrong! vectorized access is allowed only if stride == 1"); + }) + .Else([&](auto fwd) { + static_assert( + (fwd(SrcDesc{}).GetLastOriginalDimensionStride(src_vector_access_dim) == 1 || + SrcDataPerAccess == 1), + "wrong! vectorized access is allowed only if stride == 1"); + }); + + static_if{}( + [&](auto fwd) { + static_assert( + (fwd(DstDesc{}).GetStride(dst_vector_access_dim) == 1 || DstDataPerAccess == 1), + "wrong! vectorized access is allowed only if stride == 1"); + }) + .Else([&](auto fwd) { + static_assert( + (fwd(DstDesc{}).GetLastOriginalDimensionStride(dst_vector_access_dim) == 1 || + DstDataPerAccess == 1), + "wrong! 
vectorized access is allowed only if stride == 1"); + }); + } + + __device__ constexpr ThreadwiseGenericTensorSliceCopy_v2r1() + : ThreadwiseGenericTensorSliceCopy_v2r1(make_zero_array(), + make_zero_array()) + { + } + + __device__ void SetSrcSliceOrigin(SrcCoordinate src_slice_origin) + { + mSrcSliceOrigin = src_slice_origin; + } + + __device__ void SetDstSliceOrigin(DstCoordinate dst_slice_origin) + { + mDstSliceOrigin = dst_slice_origin; + } + + template + struct IsolateMergedDimLengths + { + template + __device__ constexpr index_t operator()(IDim idim) const + { + return TDesc::ContainMultipleOriginalDimensions(idim) ? Lengths{}[idim] : 1; + } + }; + + template + __device__ void Run(const TData* p_src, TData* p_dst) const + { + constexpr auto buffer_desc = make_ConstantTensorDescriptor_packed(SliceLengths{}); + + TData p_buffer_[buffer_desc.GetElementSpace()]; + TData* p_buffer = p_buffer_; + + // copy data from src into buffer + { + using src_vector_t = typename vector_type::MemoryType; + + constexpr auto src_vector_access_dim = Number{}; + constexpr auto src_data_per_access = Number{}; + + constexpr auto src_access_lengths = SliceLengths::Modify( + src_vector_access_dim, + SliceLengths::Get(src_vector_access_dim) / src_data_per_access); + + // Offset w.r.t merged dimensions need to be calculated at run-time. Offset w.r.t + // normal dimensions is known at compile time. + // Below is a hack to isolate merged dimension id from normal dimension id, so the + // corresponding offset can be calculated seperately at run-time and compile-time. + // src_merged_dim_access_lengths has the same value as src_access_lengths on src's + // merged dimensions, and has value = 1 on normal dimensions; + // src_merged_dim_access_lengths has the same value as src_access_lengths on src's + // normal dimensions, and has value = 1 on merged dimensions; + constexpr auto src_merged_dim_access_lengths = typename sequence_gen< + nDim, + IsolateMergedDimLengths>::type{}; + + constexpr auto src_normal_dim_access_lengths = + src_access_lengths + Number<1>{} - src_merged_dim_access_lengths; + +#if CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V2R1 + // offset w.r.t. merged dimension need to be computed at run-time + static_ford{}([&]( + auto src_merged_dim_access_id_) { + + constexpr auto src_merged_dim_access_id = decltype(src_merged_dim_access_id_){}; + + constexpr auto src_merged_dim_data_id = src_merged_dim_access_id.Modify( + src_vector_access_dim, + src_merged_dim_access_id[src_vector_access_dim] * src_data_per_access); + + const TData* p_src_tmp = + p_src + (mSrcSliceOrigin + src_merged_dim_data_id).GetOffset(); + + // offset w.r.t. 
normal dimension can be computed at compile-time + static_ford{}([&]( + auto src_normal_dim_access_id_) { + + constexpr auto src_normal_dim_access_id = decltype(src_normal_dim_access_id_){}; + + constexpr auto src_normal_dim_data_id = src_normal_dim_access_id.Modify( + src_vector_access_dim, + src_normal_dim_access_id[src_vector_access_dim] * src_data_per_access); + + constexpr index_t src_normal_offset = + SrcDesc::GetOffsetFromMultiIndex(src_normal_dim_data_id); + + // load vector from src + const src_vector_t vector_data = + *reinterpret_cast(&p_src_tmp[src_normal_offset]); + + // unpack vector into buffer + static_for<0, SrcDataPerAccess, 1>{}([&](auto i) { + constexpr auto scalar_id = + typename uniform_sequence_gen::type{}.Modify( + src_vector_access_dim, i); + + constexpr index_t buffer_offset = buffer_desc.GetOffsetFromMultiIndex( + src_merged_dim_data_id + src_normal_dim_data_id + scalar_id); + + p_buffer[buffer_offset] = reinterpret_cast(&vector_data)[i]; + }); + }); + }); +#else + ford{}([&]( + auto src_merged_dim_access_id) { + + auto src_merged_dim_data_id = src_merged_dim_access_id; + src_merged_dim_data_id(src_vector_access_dim) = + src_merged_dim_access_id[src_vector_access_dim] * src_data_per_access; + + const TData* p_src_tmp = + p_src + (mSrcSliceOrigin + src_merged_dim_data_id).GetOffset(); + + // these should be compile-time known + ford{}([&]( + auto src_normal_dim_access_id) { + + auto src_normal_dim_data_id = src_normal_dim_access_id; + src_normal_dim_data_id(src_vector_access_dim) = + src_normal_dim_access_id[src_vector_access_dim] * src_data_per_access; + + const index_t src_normal_offset = + SrcDesc::GetOffsetFromMultiIndex(src_normal_dim_data_id); + + // load vector from src + const src_vector_t vector_data = + *reinterpret_cast(&p_src_tmp[src_normal_offset]); + + // unpack vector into buffer + for(index_t i = 0; i < SrcDataPerAccess; ++i) + { + auto scalar_id = make_zero_array(); + scalar_id(src_vector_access_dim) = i; + + const index_t buffer_offset = buffer_desc.GetOffsetFromMultiIndex( + src_merged_dim_data_id + src_normal_dim_data_id + scalar_id); + + p_buffer[buffer_offset] = reinterpret_cast(&vector_data)[i]; + } + }); + }); +#endif + } + + // copy data from buffer into dst + { + using dst_vector_t = typename vector_type::MemoryType; + + constexpr auto dst_vector_access_dim = Number{}; + constexpr auto dst_data_per_access = Number{}; + + constexpr auto dst_access_lengths = SliceLengths::Modify( + dst_vector_access_dim, + SliceLengths::Get(dst_vector_access_dim) / dst_data_per_access); + + constexpr auto dst_merged_dim_access_lengths = typename sequence_gen< + nDim, + IsolateMergedDimLengths>::type{}; + + constexpr auto dst_normal_dim_access_lengths = + dst_access_lengths + Number<1>{} - dst_merged_dim_access_lengths; + +#if CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V2R1 + // offset w.r.t. merged dimension need to be computed at run-time + static_ford{}([&]( + auto dst_merged_dim_access_id_) { + + constexpr auto dst_merged_dim_access_id = decltype(dst_merged_dim_access_id_){}; + + constexpr auto dst_merged_dim_data_id = dst_merged_dim_access_id.Modify( + dst_vector_access_dim, + dst_merged_dim_access_id[dst_vector_access_dim] * dst_data_per_access); + + TData* p_dst_tmp = p_dst + (mDstSliceOrigin + dst_merged_dim_data_id).GetOffset(); + + // offset w.r.t. 
normal dimension can be computed at compile-time + static_ford{}([&]( + auto dst_normal_dim_access_id_) { + constexpr auto dst_normal_dim_access_id = decltype(dst_normal_dim_access_id_){}; + + constexpr auto dst_normal_dim_data_id = dst_normal_dim_access_id.Modify( + dst_vector_access_dim, + dst_normal_dim_access_id[dst_vector_access_dim] * dst_data_per_access); + + dst_vector_t vector_data; + + // pack vector from buffer + static_for<0, DstDataPerAccess, 1>{}([&](auto i) { + constexpr auto scalar_id = + typename uniform_sequence_gen::type{}.Modify( + dst_vector_access_dim, i); + + constexpr index_t buffer_offset = buffer_desc.GetOffsetFromMultiIndex( + dst_merged_dim_data_id + dst_normal_dim_data_id + scalar_id); + + reinterpret_cast(&vector_data)[i] = p_buffer[buffer_offset]; + }); + + constexpr index_t dst_normal_offset = + DstDesc::GetOffsetFromMultiIndex(dst_normal_dim_data_id); + + // write vector into dst + *reinterpret_cast(&p_dst_tmp[dst_normal_offset]) = vector_data; + }); + }); +#else + // offset w.r.t. merged dimension need to be computed at run-time + ford{}([&]( + auto dst_merged_dim_access_id) { + + auto dst_merged_dim_data_id = dst_merged_dim_access_id; + dst_merged_dim_data_id(dst_vector_access_dim) = + dst_merged_dim_access_id[dst_vector_access_dim] * dst_data_per_access; + + TData* p_dst_tmp = p_dst + (mDstSliceOrigin + dst_merged_dim_data_id).GetOffset(); + + // offset w.r.t. normal dimension can be computed at compile-time + ford{}([&]( + auto dst_normal_dim_access_id) { + + auto dst_normal_dim_data_id = dst_normal_dim_access_id; + dst_normal_dim_data_id(dst_vector_access_dim) = + dst_normal_dim_access_id[dst_vector_access_dim] * dst_data_per_access; + + dst_vector_t vector_data; + + // pack vector from buffer + for(index_t i = 0; i < DstDataPerAccess; ++i) + { + auto scalar_id = make_zero_array(); + scalar_id(dst_vector_access_dim) = i; + + const index_t buffer_offset = buffer_desc.GetOffsetFromMultiIndex( + dst_merged_dim_data_id + dst_normal_dim_data_id + scalar_id); + + reinterpret_cast(&vector_data)[i] = p_buffer[buffer_offset]; + } + + const index_t dst_normal_offset = + DstDesc::GetOffsetFromMultiIndex(dst_normal_dim_data_id); + + // write vector into dst + *reinterpret_cast(&p_dst_tmp[dst_normal_offset]) = vector_data; + }); + }); +#endif + } + } + + template + __device__ void Run_optimized_address_calculation(const TData* p_src, TData* p_dst) const + { + constexpr auto buffer_desc = make_ConstantTensorDescriptor_packed(SliceLengths{}); + + TData p_buffer_[buffer_desc.GetElementSpace()]; + TData* p_buffer = p_buffer_; + + // copy data from src into buffer + { + using src_vector_t = typename vector_type::MemoryType; + + constexpr auto src_vector_access_dim = Number{}; + constexpr auto src_data_per_access = Number{}; + + constexpr auto src_access_lengths = SliceLengths::Modify( + src_vector_access_dim, + SliceLengths::Get(src_vector_access_dim) / src_data_per_access); + + // Offset w.r.t merged dimensions need to be calculated at run-time. Offset w.r.t + // normal dimensions is known at compile time. + // Below is a hack to isolate merged dimension id from normal dimension id, so the + // corresponding offset can be calculated seperately at run-time and compile-time. 
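Both Run and Run_optimized_address_calculation are expressed with ford<...> (run-time multi-index) and static_ford<...> (compile-time multi-index) traversals of the access range. As a rough mental model only, not the CK implementation, a ford over lengths {2, 3} visits multi-indices like the nested loops in this sketch:

    #include <array>
    #include <cstdio>

    // Simplified picture of a ford over lengths {2, 3}: invoke f once for
    // every multi-index (i0, i1) with 0 <= i0 < 2 and 0 <= i1 < 3.
    template <typename F>
    void ford_2x3(F f)
    {
        for(int i0 = 0; i0 < 2; ++i0)
            for(int i1 = 0; i1 < 3; ++i1)
                f(std::array<int, 2>{i0, i1});
    }

    int main()
    {
        ford_2x3([](std::array<int, 2> id) { std::printf("(%d, %d)\n", id[0], id[1]); });
        return 0;
    }

static_ford performs the same traversal but hands the functor a compile-time index, which is what allows the offsets in the static branches above to be declared constexpr.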
+ // src_merged_dim_access_lengths has the same value as src_access_lengths on src's + // merged dimensions, and has value = 1 on normal dimensions; + // src_normal_dim_access_lengths has the same value as src_access_lengths on src's + // normal dimensions, and has value = 1 on merged dimensions; + constexpr auto src_merged_dim_access_lengths = typename sequence_gen< + nDim, + IsolateMergedDimLengths>::type{}; + + constexpr auto src_normal_dim_access_lengths = + src_access_lengths + Number<1>{} - src_merged_dim_access_lengths; + + ford{}([&]( + auto src_merged_dim_access_id) { + + auto src_merged_dim_data_id = src_merged_dim_access_id; + src_merged_dim_data_id(src_vector_access_dim) = + src_merged_dim_access_id[src_vector_access_dim] * src_data_per_access; + + // offset w.r.t. merged dimension needs to be computed at run-time, + const index_t src_merged_offset = + (mSrcSliceOrigin + src_merged_dim_data_id).GetOffset(); + + ford{}([&]( + auto src_normal_dim_access_id) { + + auto src_normal_dim_data_id = src_normal_dim_access_id; + src_normal_dim_data_id(src_vector_access_dim) = + src_normal_dim_access_id[src_vector_access_dim] * src_data_per_access; + + // offset w.r.t. normal dimension is known at compile-time + const index_t src_normal_offset = + SrcDesc::GetOffsetFromMultiIndex(src_normal_dim_data_id); + + src_vector_t vector_data; + + // Read vector from src. + // 1. Source code version can take src of all kinds of memory-space + // 2. Inline asm versions using global_load or buffer_load can only take + // src from global-memory + // + // Comment for loading from global-memory: + // When + // 1) using source code, in order for compiler to emit optimal + // load instruction, or + // 2) using inline asm (global_load or buffer_load), in order + // for inline asm to be valid, + // following assumptions need to be satisfied: + // 1. p_src needs to be block-invariant (assumption) + // 2. src_normal_offset must be calculated at compile time (guaranteed) + // 3. src_merged_offset can be a runtime value (no assumption imposed) + static_if{}([&](auto) { +#if CK_USE_AMD_INTRINSIC && CK_USE_AMD_INTRINSIC_BUFFER_LOAD_STORE + vector_data = __buffer_load( + p_src, + static_cast(src_merged_offset), + static_cast(src_normal_offset)); +#else + vector_data = *reinterpret_cast( + &p_src[src_normal_offset + src_merged_offset]); +#endif + }).Else([&](auto) { + // src can be all kinds of memory-space.
+ vector_data = *reinterpret_cast( + &p_src[src_normal_offset + src_merged_offset]); + }); + + // unpack vector into buffer + for(index_t i = 0; i < SrcDataPerAccess; ++i) + { + auto scalar_id = make_zero_array(); + scalar_id(src_vector_access_dim) = i; + + const index_t buffer_offset = buffer_desc.GetOffsetFromMultiIndex( + src_merged_dim_data_id + src_normal_dim_data_id + scalar_id); + + p_buffer[buffer_offset] = reinterpret_cast(&vector_data)[i]; + } + }); + }); + } + + // copy data from buffer into dst + { + using dst_vector_t = typename vector_type::MemoryType; + + constexpr auto dst_vector_access_dim = Number{}; + constexpr auto dst_data_per_access = Number{}; + + constexpr auto dst_access_lengths = SliceLengths::Modify( + dst_vector_access_dim, + SliceLengths::Get(dst_vector_access_dim) / dst_data_per_access); + + constexpr auto dst_merged_dim_access_lengths = typename sequence_gen< + nDim, + IsolateMergedDimLengths>::type{}; + + constexpr auto dst_normal_dim_access_lengths = + dst_access_lengths + Number<1>{} - dst_merged_dim_access_lengths; + + ford{}( + [&](auto dst_merged_dim_access_id) { + + auto dst_merged_dim_data_id = dst_merged_dim_access_id; + dst_merged_dim_data_id(dst_vector_access_dim) = + dst_merged_dim_access_id[dst_vector_access_dim] * dst_data_per_access; + + // offset w.r.t. merged dimension need be computed at run-time, + const index_t dst_merged_offset = + (mDstSliceOrigin + dst_merged_dim_data_id).GetOffset(); + + ford{}([&]( + auto dst_normal_dim_access_id) { + + auto dst_normal_dim_data_id = dst_normal_dim_access_id; + dst_normal_dim_data_id(dst_vector_access_dim) = + dst_normal_dim_access_id[dst_vector_access_dim] * dst_data_per_access; + + dst_vector_t vector_data; + + // pack vector from buffer + for(index_t i = 0; i < DstDataPerAccess; ++i) + { + auto scalar_id = make_zero_array(); + scalar_id(dst_vector_access_dim) = i; + + const index_t buffer_offset = buffer_desc.GetOffsetFromMultiIndex( + dst_merged_dim_data_id + dst_normal_dim_data_id + scalar_id); + + reinterpret_cast(&vector_data)[i] = p_buffer[buffer_offset]; + } + + // offset w.r.t. normal dimension is known at compile-time + const index_t dst_normal_offset = + DstDesc::GetOffsetFromMultiIndex(dst_normal_dim_data_id); + + // Write vector into dst. + // 1. Source code version can take dst of all kinds of memory-space + // 2. Inline asm versions using global_store or buffer_store can only take + // dst from global-memory + // + // Commemt for storing into global-memory: + // When + // 1) using source code, in order for compiler to emit optimal + // store instruction, or + // 2) using inline asm (global_store or buffer_store), in order + // for inline asm to be valid, + // following assumptions need to be satisfied: + // 1. p_dst need to be block-invariant (assumption) + // 2. dst_normal_offset must be calculatd at compile time (guaranteed) + // 3. 
dst_merged_offset can be runtime value (no assumption imposed) + static_if{}([&](auto) { +#if CK_USE_AMD_INTRINSIC && CK_USE_AMD_INTRINSIC_BUFFER_LOAD_STORE + __buffer_store( + vector_data, p_dst, dst_merged_offset, dst_normal_offset); +#else + *reinterpret_cast( + &p_dst[dst_normal_offset + dst_merged_offset]) = vector_data; +#endif + }).Else([&](auto) { + // dst can be all kinds of memory-space + *reinterpret_cast( + &p_dst[dst_normal_offset + dst_merged_offset]) = vector_data; + }); + }); + }); + } + } + + // T can be Sequence or Array + template + __device__ void MoveSrcSliceWindow(T step_sizes, integral_constant) + { + static_if{}([&](auto) { + mSrcSliceOrigin += step_sizes; + }).Else([&](auto) { mSrcSliceOrigin -= step_sizes; }); + } + + template + __device__ void MoveDstSliceWindow(T step_sizes, integral_constant) + { + static_if{}([&](auto) { + mDstSliceOrigin += step_sizes; + }).Else([&](auto) { mDstSliceOrigin -= step_sizes; }); + } + + private: + SrcCoordinate mSrcSliceOrigin; + DstCoordinate mDstSliceOrigin; +}; + +// this version use TensorView and TensorCoordinate_deprecated +template +struct ThreadwiseGenericTensorSliceCopy_v3r1 +{ + static constexpr index_t nDim = SrcTensor::GetNumOfDimension(); + using data_type = remove_cv_t; + + using SrcCoordinate = typename SrcTensor::coordinate_type; + using DstCoordinate = typename DstTensor::coordinate_type; + + __device__ constexpr ThreadwiseGenericTensorSliceCopy_v3r1(SrcTensor src, + SrcCoordinate src_slice_origin, + DstTensor dst, + DstCoordinate dst_slice_origin) + : mSrc{src}, + mDst{dst}, + mSrcSlice{src.Slice(src_slice_origin, SliceLengths{})}, + mDstSlice{dst.Slice(dst_slice_origin, SliceLengths{})} + { + static_assert(nDim == SrcTensor::GetNumOfDimension() && + nDim == DstTensor::GetNumOfDimension() && + nDim == SliceLengths::GetSize() && nDim == SrcDimAccessOrder::GetSize() && + nDim == DstDimAccessOrder::GetSize(), + "wrong! # of dimensions not the same"); + + static_assert(is_valid_sequence_map::value && + is_valid_sequence_map::value, + "wrong! map is not valid"); + + static_assert(is_same, + remove_cv_t>{}, + "wrong! type conversion is not supported yet"); + + static_assert(decltype(mSrcSlice)::IsVectorizationAllowed(Number{}, + Number{}) && + decltype(mDstSlice)::IsVectorizationAllowed(Number{}, + Number{}), + "wrong! 
vectorized access is not allowed"); + } + + __device__ constexpr ThreadwiseGenericTensorSliceCopy_v3r1() + : ThreadwiseGenericTensorSliceCopy_v3r1( + SrcTensor{}, SrcCoordinate{}, DstTensor{}, DstCoordinate{}) + { + } + + __device__ void Run() const + { + // buffer + constexpr auto buffer_desc = make_ConstantTensorDescriptor_packed(SrcTensor::GetLengths()); + data_type p_buffer[buffer_desc.GetElementSpace()]; + auto buffer = make_TensorView(buffer_desc, p_buffer); + + // copy data from src into buffer + { + using src_vector_t = typename vector_type::MemoryType; + + constexpr auto src_vector_access_dim = Number{}; + constexpr auto src_data_per_access = Number{}; + + auto src_slice_vectorized = + mSrcSlice.Vectorize(src_vector_access_dim, src_data_per_access); + + ford{}( + [&](auto src_vector_id) { + // load vector from src + const src_vector_t vector_data = src_slice_vectorized[src_vector_id]; + + // unpack vector into buffer + auto src_scalar_id = src_vector_id; + src_scalar_id(src_vector_access_dim) *= src_data_per_access; + + for(index_t i = 0; i < SrcDataPerAccess; ++i) + { + auto id = make_zero_array(); + id(src_vector_access_dim) = i; + + buffer(src_scalar_id + id) = + reinterpret_cast(&vector_data)[i]; + } + }); + } + + // copy data from buffer into dst + { + using dst_vector_t = typename vector_type::MemoryType; + + constexpr auto dst_vector_access_dim = Number{}; + constexpr auto dst_data_per_access = Number{}; + + auto dst_slice_vectorized = + mDstSlice.Vectorize(dst_vector_access_dim, dst_data_per_access); + + ford{}( + [&](auto dst_vector_id) { + + dst_vector_t vector_data{}; + + // pack vector from buffer + auto dst_scalar_id = dst_vector_id; + dst_scalar_id(dst_vector_access_dim) *= dst_data_per_access; + + for(index_t i = 0; i < DstDataPerAccess; ++i) + { + auto id = make_zero_array(); + id(dst_vector_access_dim) = i; + + reinterpret_cast(&vector_data)[i] = buffer[dst_scalar_id + id]; + } + + // write vector into dst + dst_slice_vectorized(dst_vector_id) = vector_data; + }); + } + } + + // T can be Sequence or Array + template + __device__ void MoveSrcSliceWindow(T step_sizes, integral_constant) + { + mSrc.MoveSliceWindow(mSrcSlice, step_sizes, integral_constant{}); + } + + template + __device__ void MoveDstSliceWindow(T step_sizes, integral_constant) + { + mDst.MoveSliceWindow(mDstSlice, step_sizes, integral_constant{}); + } + + private: + using SrcSlice = decltype(SrcTensor{}.Slice(make_zero_array(), SliceLengths{})); + using DstSlice = decltype(DstTensor{}.Slice(make_zero_array(), SliceLengths{})); + + SrcTensor mSrc; + DstTensor mDst; + SrcSlice mSrcSlice; + DstSlice mDstSlice; +}; + +} // namespace ck +#endif diff --git a/driver/include/device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw.hpp b/driver/include/device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw.hpp index 7c1f142a8d..4d18e7a344 100644 --- a/driver/include/device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw.hpp +++ b/driver/include/device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw.hpp @@ -3,7 +3,7 @@ #include "device.hpp" #include "tensor.hpp" #include "gridwise_convolution_kernel_wrapper.hpp" -#include "gridwise_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw.hpp" +//#include "gridwise_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw.hpp" #include "gridwise_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer.hpp" template ; using RightPads = Sequence<0, 0>; -#elif 1 +#elif 0 // 1x1 filter, 8x8 image // cudnn@V100 68%, ck@V100 72%, ck@P100 52%, ck@VII 42% constexpr index_t N = 64; @@ -295,7 
+295,7 @@ int main(int argc, char* argv[]) using LeftPads = Sequence<0, 0>; using RightPads = Sequence<0, 0>; -#elif 0 +#elif 1 // 3x3 filter, 2x2 stride, 35x35 input, 17x17 output // cudnn@V100 90%, ck@V100 93%, ck@P100 83%, ck@VII 81% constexpr index_t N = 128; @@ -341,7 +341,7 @@ int main(int argc, char* argv[]) using LeftPads = Sequence<3, 0>; using RightPads = Sequence<3, 0>; -#elif 1 +#elif 0 // 1x7 filter, 0x3 pad, 17x17 input constexpr index_t N = 128; constexpr index_t C = 128;
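For reference when toggling these #elif problem configurations, the spatial output size follows the usual convolution relation (dilation 1, since none is specified here). A quick standalone check for the 3x3-filter, 2x2-stride, 35x35-input case enabled above, whose comment quotes a 17x17 output:

    #include <cstdio>

    // Output spatial size of a convolution (dilation 1):
    //   out = (in + left_pad + right_pad - filter) / stride + 1
    constexpr int conv_out_size(int in, int left_pad, int right_pad, int filter, int stride)
    {
        return (in + left_pad + right_pad - filter) / stride + 1;
    }

    int main()
    {
        // 3x3 filter, 2x2 stride, no padding (consistent with 35 -> 17),
        // 35x35 input -> expect 17x17 output
        std::printf("Ho = %d, Wo = %d\n",
                    conv_out_size(35, 0, 0, 3, 2),
                    conv_out_size(35, 0, 0, 3, 2));
        return 0;
    }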