From 238d58c2f5947246a3e62f72db2b175b2e948554 Mon Sep 17 00:00:00 2001
From: Chao Liu
Date: Tue, 20 Aug 2019 17:29:54 -0500
Subject: [PATCH] adding tensor_view

---
 ...plicit_gemm_v1r3_chwn_cyxk_khwn_padded.hpp | 120 ++++++++++++-
 .../ConstantTensorDescriptor.hpp              |  63 ++++++-
 .../tensor_description/tensor_coordinate.hpp  |  14 +-
 .../tensor_description/tensor_view.hpp        | 100 +++++++++++
 .../blockwise_generic_tensor_slice_copy.hpp   | 137 +++++++++++++-
 .../threadwise_generic_tensor_slice_copy.hpp  | 167 ++++++++++++++++++
 .../include/utility/integral_constant.hpp     |   3 +
 .../include/utility/vector_type.hpp           |  20 ++-
 driver/src/driver.cpp                         |   4 +-
 9 files changed, 609 insertions(+), 19 deletions(-)
 create mode 100644 composable_kernel/include/tensor_description/tensor_view.hpp

diff --git a/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v1r3_chwn_cyxk_khwn_padded.hpp b/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v1r3_chwn_cyxk_khwn_padded.hpp
index 08014398e1..19074c80c9 100644
--- a/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v1r3_chwn_cyxk_khwn_padded.hpp
+++ b/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v1r3_chwn_cyxk_khwn_padded.hpp
@@ -62,6 +62,9 @@ struct GridwiseConvolutionImplicitGemm_v1r3_chwn_cyxk_khwn_padded
         constexpr auto I2 = Number<2>{};
         constexpr auto I3 = Number<3>{};
 
+        constexpr auto True  = integral_constant<bool, true>{};
+        constexpr auto False = integral_constant<bool, false>{};
+
         constexpr auto in_c_h_w_n_global_desc  = InGlobalDesc{};
         constexpr auto wei_c_y_x_k_global_desc = WeiGlobalDesc{};
         constexpr auto out_k_h_w_n_global_desc = OutGlobalDesc{};
@@ -121,10 +124,21 @@ struct GridwiseConvolutionImplicitGemm_v1r3_chwn_cyxk_khwn_padded
         constexpr auto wei_c_k_block_desc = make_ConstantTensorDescriptor_aligned(
             Sequence<CPerBlock, KPerBlock>{}, Number<WeiBlockCopyDataPerAccess_K>{});
 
+        constexpr auto wei_c_1_1_k_block_desc = make_ConstantTensorDescriptor_aligned(
+            Sequence<CPerBlock, 1, 1, KPerBlock>{}, Number<WeiBlockCopyDataPerAccess_K>{});
+
+        // LDS: be careful of alignment
+        constexpr index_t in_block_space  = in_c_h_w_n_block_desc.GetElementSpace();
+        constexpr index_t wei_block_space = wei_c_k_block_desc.GetElementSpace();
+
+        __shared__ Float p_in_block[in_block_space];
+        __shared__ Float p_wei_block[wei_block_space];
+
         // tensor view of threadwise output in register
         constexpr auto out_k_h_w_n_thread_desc = make_ConstantTensorDescriptor_packed(
             Sequence{});
 
+#if 0
         // blockwise input copy
         //   format is [C, Hi, Wi, N]
         auto blockwise_in_copy =
                               InBlockCopyDataPerAccess_N,
                               InBlockCopyDataPerAccess_N>({0, 0, 0, 0}, {0, 0, 0, 0});
+#else
+        auto in_c_h_w_n_global = make_TensorView(in_c_h_w_n_global_desc, p_in_global);
+        auto in_c_h_w_n_block  = make_TensorView(in_c_h_w_n_block_desc, p_in_block);
+
+        auto blockwise_in_copy =
+            BlockwiseGenericTensorSliceCopy_v3,
+                                               Sequence<0, 1, 2, 3>,
+                                               Sequence<0, 1, 2, 3>,
+                                               3,
+                                               3,
+                                               InBlockCopyDataPerAccess_N,
+                                               InBlockCopyDataPerAccess_N>(
+                in_c_h_w_n_global,
+                {0, hi_block_data_begin, wi_block_data_begin, n_block_data_begin},
+                in_c_h_w_n_block,
+                {0, 0, 0, 0});
+#endif
+
+#if 0
        // blockwise wei copy
        //   format is [CPerBlock, KPerBlock]
        const auto blockwise_wei_copy =
                              1,
                              WeiBlockCopyDataPerAccess_K,
                              WeiBlockCopyDataPerAccess_K>({0, 0}, {0, 0});
+#else
+        auto wei_c_y_x_k_global = make_TensorView(wei_c_y_x_k_global_desc, p_wei_global);
+        auto wei_c_1_1_k_block  =
+            make_TensorView(wei_c_1_1_k_block_desc, p_wei_block);
+
+        constexpr index_t WeiBlockCopySubLengths_C = WeiBlockCopySubLengths_CK{}[0];
+        constexpr index_t WeiBlockCopySubLengths_K = WeiBlockCopySubLengths_CK{}[1];
+
+        using WeiBlockCopySubLengths_CYXK =
+            Sequence<WeiBlockCopySubLengths_C, 1, 1, WeiBlockCopySubLengths_K>;
+
+        constexpr index_t WeiBlockCopyClusterLengths_C = WeiBlockCopyClusterLengths_CK{}[0];
+        constexpr index_t WeiBlockCopyClusterLengths_K = WeiBlockCopyClusterLengths_CK{}[1];
+
+        using WeiBlockCopyClusterLengths_CYXK =
+            Sequence<WeiBlockCopyClusterLengths_C, 1, 1, WeiBlockCopyClusterLengths_K>;
+
+        auto blockwise_wei_copy =
+            BlockwiseGenericTensorSliceCopy_v3,
+                                               Sequence<0, 1, 2, 3>,
+                                               Sequence<0, 1, 2, 3>,
+                                               3,
+                                               3,
+                                               WeiBlockCopyDataPerAccess_K,
+                                               WeiBlockCopyDataPerAccess_K>(
+                wei_c_y_x_k_global, {0, 0, 0, k_block_data_begin}, wei_c_1_1_k_block, {0, 0, 0, 0});
+#endif
 
        // a series of blockwise batched GEMM
        //   C_matrix += transpose(A_matrix) * B_matrix
@@ -200,13 +270,6 @@ struct GridwiseConvolutionImplicitGemm_v1r3_chwn_cyxk_khwn_padded
             GemmDataPerReadA,
             GemmDataPerReadB>{};
 
-        // LDS: be careful of alignment
-        constexpr index_t in_block_space  = in_c_h_w_n_block_desc.GetElementSpace();
-        constexpr index_t wei_block_space = wei_c_k_block_desc.GetElementSpace();
-
-        __shared__ Float p_in_block[in_block_space];
-        __shared__ Float p_wei_block[wei_block_space];
-
         // register
         // C++ lambda doesn't capture array, use pointer instead
         Float p_out_thread_data[out_k_h_w_n_thread_desc.GetElementSpace()];
@@ -215,6 +278,7 @@ struct GridwiseConvolutionImplicitGemm_v1r3_chwn_cyxk_khwn_padded
 
         // set threadwise output tensor to 0
         threadwise_matrix_set_zero(c_k_wn_thread_mtx_desc, p_out_thread);
 
+#if 0
         for(index_t y = 0; y < Y; ++y)
         {
             for(index_t x = 0; x < X; ++x)
@@ -246,6 +310,48 @@ struct GridwiseConvolutionImplicitGemm_v1r3_chwn_cyxk_khwn_padded
                 }
             }
         }
+#else
+        for(index_t y = 0; y < Y; ++y)
+        {
+            for(index_t x = 0; x < X; ++x)
+            {
+                for(index_t c_block_data_begin = 0; c_block_data_begin < C;
+                    c_block_data_begin += CPerBlock)
+                {
+#if 1 // debug
+                    blockwise_in_copy.Run();
+                    blockwise_wei_copy.Run();
+#endif
+
+                    __syncthreads();
+
+                    blockwise_batch_gemm.Run(p_wei_block, p_in_block, p_out_thread);
+
+                    __syncthreads();
+
+                    // move along C
+                    blockwise_in_copy.MoveSrcSliceWindow(Sequence<CPerBlock, 0, 0, 0>{}, True);
+                    blockwise_wei_copy.MoveSrcSliceWindow(Sequence<CPerBlock, 0, 0, 0>{}, True);
+                }
+
+                // reset C
+                blockwise_in_copy.MoveSrcSliceWindow(Sequence<C, 0, 0, 0>{}, False);
+                blockwise_wei_copy.MoveSrcSliceWindow(Sequence<C, 0, 0, 0>{}, False);
+
+                // move along X
+                blockwise_in_copy.MoveSrcSliceWindow(Sequence<0, 0, 1, 0>{}, True);
+                blockwise_wei_copy.MoveSrcSliceWindow(Sequence<0, 0, 1, 0>{}, True);
+            }
+
+            // reset X
+            blockwise_in_copy.MoveSrcSliceWindow(Sequence<0, 0, X, 0>{}, False);
+            blockwise_wei_copy.MoveSrcSliceWindow(Sequence<0, 0, X, 0>{}, False);
+
+            // move along Y
+            blockwise_in_copy.MoveSrcSliceWindow(Sequence<0, 1, 0, 0>{}, True);
+            blockwise_wei_copy.MoveSrcSliceWindow(Sequence<0, 1, 0, 0>{}, True);
+        }
+#endif
 
         // output: register to global mem
         const auto c_thread_mtx_begin =
diff --git a/composable_kernel/include/tensor_description/ConstantTensorDescriptor.hpp b/composable_kernel/include/tensor_description/ConstantTensorDescriptor.hpp
index e59d8e9a67..3949652174 100644
--- a/composable_kernel/include/tensor_description/ConstantTensorDescriptor.hpp
+++ b/composable_kernel/include/tensor_description/ConstantTensorDescriptor.hpp
@@ -204,7 +204,7 @@ struct ConstantTensorDescriptor
     }
 
     // This function doesn't do carry check on the highest dimension for positive stepping (or
-    // borrow check on the lowest dimension for negative stepping) , for performance reason. It is
+    // borrow check on the highest dimension for negative stepping), for performance reason. It is
     // the user's responsibility to make sure the result "new_mutli_id" is not out-of-bound on the
     // highest dimension for positive stepping (or on the lowest dimension for negative stepping)
     template
@@ -304,14 +304,73 @@ struct ConstantTensorDescriptor
             GetStrides().PushBack(leaf_tensor::GetStrides()))>{};
     }
 
+    template <index_t IDimVector, index_t DataPerVector>
+    struct lambda_IsVectorizationAllowed
+    {
+        bool& is_allowed;
+
+        __host__ __device__ constexpr lambda_IsVectorizationAllowed(bool& is_allowed_)
+            : is_allowed(is_allowed_)
+        {
+        }
+
+        template <index_t IDim_>
+        __host__ __device__ constexpr void operator()(Number<IDim_>) const
+        {
+            constexpr auto IDim = Number<IDim_>{};
+
+            if(IDimVector != IDim && Strides::Get(IDim) % DataPerVector != 0)
+            {
+                is_allowed = false;
+            }
+        }
+    };
+
+    template <index_t IDimVector, index_t DataPerVector>
+    __host__ __device__ static constexpr bool IsVectorizationAllowed(Number<IDimVector>,
+                                                                     Number<DataPerVector>)
+    {
+        bool is_allowed = (Strides{}[IDimVector] == 1 || DataPerVector == 1) &&
+                          Lengths{}[IDimVector] % DataPerVector == 0;
+
+        static_for<0, nDim, 1>{}(
+            lambda_IsVectorizationAllowed<IDimVector, DataPerVector>{is_allowed});
+
+        return is_allowed;
+    }
+
+    template <index_t IDimVector, index_t DataPerVector>
+    __host__ __device__ static constexpr auto Vectorize(Number<IDimVector>, Number<DataPerVector>)
+    {
+        constexpr auto idim            = Number<IDimVector>{};
+        constexpr auto data_per_vector = Number<DataPerVector>{};
+
+        static_assert(IsVectorizationAllowed(idim, data_per_vector), "wrong!");
+
+        using vectorized_lengths = decltype(
+            Lengths::Modify(Number<IDimVector>{}, Number<Lengths{}[IDimVector] / DataPerVector>{}));
+        using vectorized_strides = decltype(
+            (Strides{} / Number<DataPerVector>{}).Modify(Number<IDimVector>{}, Number<1>{}));
+
+        return ConstantTensorDescriptor<vectorized_lengths, vectorized_strides>{};
+    }
+
     template <index_t IDim, index_t SliceLen>
     __host__ __device__ static constexpr auto Slice(Number<IDim>, Number<SliceLen>)
     {
-        using slice_lengths = decltype(Lengths{}.Modify(Number<IDim>{}, Number<SliceLen>{}));
+        using slice_lengths = decltype(Lengths::Modify(Number<IDim>{}, Number<SliceLen>{}));
 
         return ConstantTensorDescriptor<slice_lengths, Strides>{};
     }
 
+    template <index_t... SliceLengths>
+    __host__ __device__ static constexpr auto Slice(Sequence<SliceLengths...> slice_lengths)
+    {
+        static_assert(slice_lengths.GetSize() == nDim, "wrong!");
+
+        return ConstantTensorDescriptor<Sequence<SliceLengths...>, Strides>{};
+    }
+
     template
     __host__ __device__ static constexpr auto StridedSlice(Number, Number, Number)
diff --git a/composable_kernel/include/tensor_description/tensor_coordinate.hpp b/composable_kernel/include/tensor_description/tensor_coordinate.hpp
index 77ed7c052b..4600b682ac 100644
--- a/composable_kernel/include/tensor_description/tensor_coordinate.hpp
+++ b/composable_kernel/include/tensor_description/tensor_coordinate.hpp
@@ -7,6 +7,7 @@
 
 namespace ck {
 
+// TensorDesc is ConstantTensorDescriptor
 template
 struct NormalTensorCoordinate
 {
@@ -26,6 +27,12 @@ struct NormalTensorCoordinate
     {
     }
 
+    template <index_t... Xs>
+    __host__ __device__ constexpr NormalTensorCoordinate(Sequence<Xs...>)
+        : NormalTensorCoordinate(Array<index_t, nDim>{Xs...})
+    {
+    }
+
     __host__ __device__ constexpr index_t GetOffset() const { return mOffset; }
 
     // T is Array or Sequence
@@ -87,6 +94,7 @@ struct NormalTensorCoordinate
     index_t mOffset;
 };
 
+// TensorDesc is ConstantMergedTensorDescriptor
 template
 struct MergedTensorCoordinate
 {
@@ -235,6 +243,8 @@ struct MergedTensorCoordinate
         static_assert(is_same{} && T::GetSize() == nDim, "wrong!");
 
         static_for<0, nDim, 1>{}([&](auto idim) {
+            // the compiler should remove the dead code path, because step_sizes is known
+            // at compile time
             if(step_sizes[idim] != 0)
             {
                 this->MoveOnDimension(idim, step_sizes[idim], integral_constant{});
@@ -250,6 +260,8 @@ struct MergedTensorCoordinate
         static_assert(is_same{} && T::GetSize() == nDim, "wrong!");
 
         static_for<0, nDim, 1>{}([&](auto idim) {
+            // the compiler should remove the dead code path, because step_sizes is known
+            // at compile time
             if(step_sizes[idim] != 0)
             {
                 this->MoveOnDimension(idim, step_sizes[idim], integral_constant{});
@@ -287,7 +299,7 @@ struct MergedTensorCoordinate
     // arithmetic after construction of TensorCoordinate.
     // TODO: refactor TensorCoordinate, after introducing the concept of "dimensions"
     //   and simplify implementation of ConstantMergedTensorDescriptor, so we don't need to
-    //   count on compiler to optimize way those register memory for us
+    //   count on the compiler to optimize away that register memory for us
     Array mOriginalIndex;
     Array mPartialOffsets;
diff --git a/composable_kernel/include/tensor_description/tensor_view.hpp b/composable_kernel/include/tensor_description/tensor_view.hpp
new file mode 100644
index 0000000000..b9a9a0ca03
--- /dev/null
+++ b/composable_kernel/include/tensor_description/tensor_view.hpp
@@ -0,0 +1,100 @@
+#ifndef CK_TENSOR_VIEW_HPP
+#define CK_TENSOR_VIEW_HPP
+
+#include "common_header.hpp"
+#include "ConstantTensorDescriptor.hpp"
+#include "ConstantMergedTensorDescriptor.hpp"
+#include "tensor_coordinate.hpp"
+
+namespace ck {
+
+// TensorDesc is ConstantTensorDescriptor or ConstantMergedTensorDescriptor
+template <class TensorDesc, class TData>
+struct NormalTensorView
+{
+    using type             = NormalTensorView;
+    using tensor_desc_type = TensorDesc;
+    using coordinate_type  = typename NormalTensorCoordinate<TensorDesc>::type;
+    using data_type        = TData;
+
+    static constexpr auto nDim = TensorDesc::GetNumOfDimension();
+
+    __host__ __device__ constexpr NormalTensorView(TData* p_data) : mpData{p_data} {}
+
+    __host__ __device__ constexpr NormalTensorView() : NormalTensorView{nullptr} {}
+
+    __host__ __device__ static constexpr auto GetNumOfDimension() { return nDim; }
+
+    __host__ __device__ static constexpr auto GetLengths() { return TensorDesc::GetLengths(); }
+
+    __host__ __device__ const TData& operator[](coordinate_type coord) const
+    {
+        return mpData[coord.GetOffset()];
+    }
+
+    __host__ __device__ TData& operator()(coordinate_type coord) const
+    {
+        return mpData[coord.GetOffset()];
+    }
+
+    template <class IDim, class DataPerVector>
+    __host__ __device__ static constexpr auto IsVectorizationAllowed(IDim, DataPerVector)
+    {
+        return TensorDesc::IsVectorizationAllowed(IDim{}, DataPerVector{});
+    }
+
+    template <class IDim, class DataPerVector>
+    __host__ __device__ auto Vectorize(IDim idim, DataPerVector data_per_vector) const
+    {
+        static_assert(IsVectorizationAllowed(idim, data_per_vector), "wrong!");
+
+        using vector_t = typename vector_type<TData, DataPerVector::value>::MemoryType;
+
+        return NormalTensorView<decltype(TensorDesc::Vectorize(idim, data_per_vector)), vector_t>(
+            reinterpret_cast<vector_t*>(mpData));
+    }
+
+    template <index_t... SliceLengths>
+    __host__ __device__ auto Slice(coordinate_type slice_origin,
+                                   Sequence<SliceLengths...> slice_lengths)
+    {
+        static_assert(slice_lengths.GetSize() == nDim, "wrong!");
+
+        return NormalTensorView<decltype(TensorDesc::Slice(slice_lengths)), TData>(
+            mpData + slice_origin.GetOffset());
+    }
+
+    template <class IDim, class SliceLen>
+    __host__ __device__ auto
+    Slice(coordinate_type slice_origin, IDim idim, SliceLen slice_len) const
+    {
+        return NormalTensorView<decltype(TensorDesc::Slice(idim, slice_len)), TData>(
+            mpData + slice_origin.GetOffset());
+    }
+
+    // slice_window is a slicing window on "*this"
+    template <class SliceWindow, class T, bool PositiveDirection>
+    __device__ void MoveSliceWindow(SliceWindow& slice_window,
+                                    T step_sizes,
+                                    integral_constant<bool, PositiveDirection>)
+    {
+        if(PositiveDirection)
+        {
+            slice_window.mpData += coordinate_type{step_sizes}.GetOffset();
+        }
+        else
+        {
+            slice_window.mpData -= coordinate_type{step_sizes}.GetOffset();
+        }
+    }
+
+    // private:
+    data_type* mpData;
+};
+
+template <class Lengths, class Strides, class TData>
+__host__ __device__ constexpr auto make_TensorView(ConstantTensorDescriptor<Lengths, Strides>,
+                                                   TData* p_data)
+{
+    return NormalTensorView<ConstantTensorDescriptor<Lengths, Strides>, TData>{p_data};
+}
+
+} // namespace ck
+#endif
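The idea behind tensor_view.hpp in one sentence: a NormalTensorView is nothing more than a compile-time tensor descriptor (lengths and strides) paired with a raw pointer, so Slice keeps the strides and merely advances the pointer to the slice origin, while Vectorize reinterprets the pointer and rescales the vectorized dimension. A minimal standalone sketch of that idea, using hypothetical Desc2d/View2d types rather than the actual ck templates:

    #include <cstddef>

    // compile-time 2D descriptor: lengths {L0, L1}, strides {S0, S1}
    template <size_t L0, size_t L1, size_t S0, size_t S1>
    struct Desc2d
    {
        static constexpr size_t GetOffset(size_t i0, size_t i1) { return i0 * S0 + i1 * S1; }
    };

    // a view is just {descriptor, pointer}
    template <class Desc, class T>
    struct View2d
    {
        T* p;

        // the descriptor maps a multi-index to a linear offset; the view adds the pointer
        T& operator()(size_t i0, size_t i1) const { return p[Desc::GetOffset(i0, i1)]; }

        // a slice keeps the parent's strides (same underlying layout), only the origin moves
        template <class SliceDesc>
        View2d<SliceDesc, T> Slice(size_t i0, size_t i1) const
        {
            return {p + Desc::GetOffset(i0, i1)};
        }
    };

    int main()
    {
        float buf[4 * 8] = {};
        using Desc = Desc2d<4, 8, 8, 1>; // 4x8 row-major: strides {8, 1}
        View2d<Desc, float> view{buf};

        view(1, 2) = 3.0f; // writes buf[1 * 8 + 2]

        // 2x2 window whose origin is element (1, 2); strides stay {8, 1}
        auto window = view.Slice<Desc2d<2, 2, 8, 1>>(1, 2);
        return window(0, 0) == 3.0f ? 0 : 1;
    }

The same separation shows up in the kernel above: make_TensorView binds in_c_h_w_n_global_desc to p_in_global once, and the blockwise copies afterwards only move slice origins, never recompute layout.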
diff --git a/composable_kernel/include/tensor_operation/blockwise_generic_tensor_slice_copy.hpp b/composable_kernel/include/tensor_operation/blockwise_generic_tensor_slice_copy.hpp
index 6bb0795cfe..23b173f587 100644
--- a/composable_kernel/include/tensor_operation/blockwise_generic_tensor_slice_copy.hpp
+++ b/composable_kernel/include/tensor_operation/blockwise_generic_tensor_slice_copy.hpp
@@ -5,6 +5,7 @@
 #include "ConstantTensorDescriptor.hpp"
 #include "ConstantMergedTensorDescriptor.hpp"
 #include "tensor_coordinate.hpp"
+#include "tensor_view.hpp"
 #include "threadwise_generic_tensor_slice_copy.hpp"
 
 #ifndef CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_BLOCKWISE_GENERIC_SLICE_COPY_V1
@@ -442,12 +443,13 @@ struct BlockwiseGenericTensorSliceCopy_v2
     __device__ constexpr BlockwiseGenericTensorSliceCopy_v2(SrcCoordinate src_block_slice_origin,
                                                             DstCoordinate dst_block_slice_origin)
     {
-        static_assert(nDim == SrcDesc::GetNumOfDimension() &&
-                          nDim == DstDesc::GetNumOfDimension() && nDim == SliceLengths::GetSize() &&
-                          nDim == SubLengths::GetSize() &&
-                          nDim == ThreadClusterLengths::GetSize() &&
-                          nDim == ThreadClusterArrangeOrder::GetSize(),
-                      "wrong! nDim not consistent");
+        static_assert(
+            nDim == SrcDesc::GetNumOfDimension() && nDim == DstDesc::GetNumOfDimension() &&
+                nDim == SliceLengths::GetSize() && nDim == SubLengths::GetSize() &&
+                nDim == ThreadClusterLengths::GetSize() &&
+                nDim == ThreadClusterArrangeOrder::GetSize() &&
+                nDim == SrcDimAccessOrder::GetSize() && nDim == DstDimAccessOrder::GetSize(),
+            "wrong! nDim not consistent");
 
         static_assert(is_same{},
                       "wrong! threads should be mapped to cover entire slicing window");
@@ -542,6 +544,129 @@ struct BlockwiseGenericTensorSliceCopy_v2
     ThreadwiseStore mThreadwiseStore;
 };
 
+template <index_t BlockSize,
+          class SrcTensor,
+          class DstTensor,
+          class SliceLengths,
+          class SubLengths,
+          class ThreadClusterLengths,
+          class ThreadClusterArrangeOrder,
+          class SrcDimAccessOrder,
+          class DstDimAccessOrder,
+          index_t SrcVectorAccessDim,
+          index_t DstVectorAccessDim,
+          index_t SrcDataPerAccess,
+          index_t DstDataPerAccess>
+struct BlockwiseGenericTensorSliceCopy_v3
+{
+    static constexpr index_t nDim = SrcTensor::GetNumOfDimension();
+    using data_type               = remove_cv_t<typename SrcTensor::data_type>;
+
+    using SrcCoordinate = typename SrcTensor::coordinate_type;
+    using DstCoordinate = typename DstTensor::coordinate_type;
+
+    __device__ constexpr BlockwiseGenericTensorSliceCopy_v3(SrcTensor src_block,
+                                                            SrcCoordinate src_block_slice_origin,
+                                                            DstTensor dst_block,
+                                                            DstCoordinate dst_block_slice_origin)
+        : mThreadBuffer{make_TensorView(ThreadBufferDesc{}, mpBuffer)}
+    {
+        static_assert(
+            nDim == SrcTensor::GetNumOfDimension() && nDim == DstTensor::GetNumOfDimension() &&
+                nDim == SliceLengths::GetSize() && nDim == SubLengths::GetSize() &&
+                nDim == ThreadClusterLengths::GetSize() &&
+                nDim == ThreadClusterArrangeOrder::GetSize() &&
+                nDim == SrcDimAccessOrder::GetSize() && nDim == DstDimAccessOrder::GetSize(),
+            "wrong! nDim not consistent");
+
+        static_assert(is_same{},
+                      "wrong! threads should be mapped to cover entire slicing window");
+
+        static_assert(is_same<remove_cv_t<typename SrcTensor::data_type>,
+                              remove_cv_t<typename DstTensor::data_type>>{},
+                      "wrong! type conversion not supported yet");
+
+        constexpr auto thread_cluster_desc = make_ConstantTensorDescriptor_packed(
+            ThreadClusterLengths::ReorderGivenNew2Old(ThreadClusterArrangeOrder{}));
+
+        static_assert(BlockSize == thread_cluster_desc.GetElementSize(),
+                      "wrong! BlockSize not consistent with ThreadClusterLengths");
+
+        const auto thread_cluster_id =
+            thread_cluster_desc.GetMultiIndexFrom1dIndex(get_thread_local_1d_id());
+
+        const auto data_cluster_id =
+            reorder_array_given_old2new(thread_cluster_id, ThreadClusterArrangeOrder{});
+
+        const auto thread_data_id_begin = data_cluster_id * SubLengths{};
+
+        mThreadwiseLoad = ThreadwiseLoad(src_block,
+                                         src_block_slice_origin + thread_data_id_begin,
+                                         mThreadBuffer,
+                                         make_zero_array<index_t, nDim>());
+
+        mThreadwiseStore = ThreadwiseStore(mThreadBuffer,
+                                           make_zero_array<index_t, nDim>(),
+                                           dst_block,
+                                           dst_block_slice_origin + thread_data_id_begin);
+    }
+
+    __device__ void RunLoadRegisterBuffer() { mThreadwiseLoad.Run(); }
+
+    __device__ void RunStoreRegisterBuffer() const { mThreadwiseStore.Run(); }
+
+    __device__ void Run()
+    {
+        mThreadwiseLoad.Run();
+        mThreadwiseStore.Run();
+    }
+
+    template <class T, bool PositiveDirection>
+    __device__ void
+    MoveSrcSliceWindow(T step_sizes, integral_constant<bool, PositiveDirection> positive_direction)
+    {
+        mThreadwiseLoad.MoveSrcSliceWindow(step_sizes, positive_direction);
+    }
+
+    template <class T, bool PositiveDirection>
+    __device__ void
+    MoveDstSliceWindow(T step_sizes, integral_constant<bool, PositiveDirection> positive_direction)
+    {
+        mThreadwiseStore.MoveDstSliceWindow(step_sizes, positive_direction);
+    }
+
+    private:
+    using ThreadBufferDesc   = decltype(make_ConstantTensorDescriptor_packed(SubLengths{}));
+    using ThreadBufferTensor = NormalTensorView<ThreadBufferDesc, data_type>;
+
+    using ThreadwiseLoad = ThreadwiseGenericTensorSliceCopy_v3;
+
+    using ThreadwiseStore = ThreadwiseGenericTensorSliceCopy_v3;
+
+    data_type mpBuffer[ThreadBufferDesc::GetElementSpace()];
+
+    ThreadBufferTensor mThreadBuffer;
+
+    ThreadwiseLoad mThreadwiseLoad;
+    ThreadwiseStore mThreadwiseStore;
+};
+
 } // namespace ck
 #endif
diff --git a/composable_kernel/include/tensor_operation/threadwise_generic_tensor_slice_copy.hpp b/composable_kernel/include/tensor_operation/threadwise_generic_tensor_slice_copy.hpp
index 9b5c138abf..c0928c2bd3 100644
--- a/composable_kernel/include/tensor_operation/threadwise_generic_tensor_slice_copy.hpp
+++ b/composable_kernel/include/tensor_operation/threadwise_generic_tensor_slice_copy.hpp
@@ -5,6 +5,7 @@
 #include "ConstantTensorDescriptor.hpp"
 #include "ConstantMergedTensorDescriptor.hpp"
 #include "tensor_coordinate.hpp"
+#include "tensor_view.hpp"
 
 #ifndef CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V1R1
 #define CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V1R1 0
@@ -773,5 +774,171 @@ struct ThreadwiseGenericTensorSliceCopy_v2r1
     DstCoordinate mDstSliceOrigin;
 };
 
+template <class SrcTensor,
+          class DstTensor,
+          class SliceLengths,
+          class SrcDimAccessOrder,
+          class DstDimAccessOrder,
+          index_t SrcVectorAccessDim,
+          index_t DstVectorAccessDim,
+          index_t SrcDataPerAccess,
+          index_t DstDataPerAccess>
+struct ThreadwiseGenericTensorSliceCopy_v3
+{
+    static constexpr index_t nDim = SrcTensor::GetNumOfDimension();
+    using data_type               = remove_cv_t<typename SrcTensor::data_type>;
+
+    using SrcCoordinate = typename SrcTensor::coordinate_type;
+    using DstCoordinate = typename DstTensor::coordinate_type;
+
+    __device__ constexpr ThreadwiseGenericTensorSliceCopy_v3(SrcTensor src,
+                                                             SrcCoordinate src_slice_origin,
+                                                             DstTensor dst,
+                                                             DstCoordinate dst_slice_origin)
+        : mSrc{src},
+          mDst{dst},
+          mSrcSlice{src.Slice(src_slice_origin, SliceLengths{})},
+          mDstSlice{dst.Slice(dst_slice_origin, SliceLengths{})}
+    {
+        static_assert(nDim == SrcTensor::GetNumOfDimension() &&
+                          nDim == DstTensor::GetNumOfDimension() &&
+                          nDim == SliceLengths::GetSize() && nDim == SrcDimAccessOrder::GetSize() &&
+                          nDim == DstDimAccessOrder::GetSize(),
+                      "wrong! # of dimensions not the same");
+
+        static_assert(is_valid_sequence_map<SrcDimAccessOrder>::value &&
+                          is_valid_sequence_map<DstDimAccessOrder>::value,
+                      "wrong! map is not valid");
+
+        static_assert(is_same<remove_cv_t<typename SrcTensor::data_type>,
+                              remove_cv_t<typename DstTensor::data_type>>{},
+                      "wrong! type conversion is not supported yet");
+
+        static_assert(decltype(mSrcSlice)::IsVectorizationAllowed(Number<SrcVectorAccessDim>{},
+                                                                  Number<SrcDataPerAccess>{}) &&
+                          decltype(mDstSlice)::IsVectorizationAllowed(Number<DstVectorAccessDim>{},
+                                                                      Number<DstDataPerAccess>{}),
+                      "wrong! vectorized access is not allowed");
+    }
+
+    __device__ constexpr ThreadwiseGenericTensorSliceCopy_v3()
+        : ThreadwiseGenericTensorSliceCopy_v3(
+              SrcTensor{}, SrcCoordinate{}, DstTensor{}, DstCoordinate{})
+    {
+    }
+
+    __device__ void Run() const
+    {
+        // buffer: one thread-private element per entry of the slice
+        constexpr auto buffer_desc = make_ConstantTensorDescriptor_packed(SliceLengths{});
+
+        data_type p_buffer[buffer_desc.GetElementSpace()];
+        auto buffer = make_TensorView(buffer_desc, p_buffer);
+
+        // copy data from src into buffer
+        {
+            using src_vector_t =
+                typename vector_type<typename SrcTensor::data_type, SrcDataPerAccess>::MemoryType;
+
+            constexpr auto src_vector_access_dim = Number<SrcVectorAccessDim>{};
+            constexpr auto src_data_per_access   = Number<SrcDataPerAccess>{};
+
+            auto src_slice_vectorized =
+                mSrcSlice.Vectorize(src_vector_access_dim, src_data_per_access);
+
+#if 0
+            if(get_thread_local_1d_id() == 0 && get_block_1d_id() == 0)
+            {
+                print_ConstantTensorDescriptor("mSrcSlice: ",
+                                               typename decltype(mSrcSlice)::tensor_desc_type{});
+                print_ConstantTensorDescriptor(
+                    "src_slice_vector: ",
+                    typename decltype(src_slice_vectorized)::tensor_desc_type{});
+            }
+#endif
+
+#if 1 // debug
+            ford{}(
+                [&](auto src_vector_id) {
+                    // load vector from src
+                    const src_vector_t vector_data = src_slice_vectorized[src_vector_id];
+
+                    // unpack vector into buffer
+                    auto src_scalar_id = src_vector_id;
+                    src_scalar_id(src_vector_access_dim) *= src_data_per_access;
+
+                    for(index_t i = 0; i < SrcDataPerAccess; ++i)
+                    {
+                        auto id                   = make_zero_array<index_t, nDim>();
+                        id(src_vector_access_dim) = i;
+
+                        buffer(src_scalar_id + id) =
+                            reinterpret_cast<const data_type*>(&vector_data)[i];
+                    }
+                });
+#endif
+        }
+
+        // copy data from buffer into dst
+        {
+            using dst_vector_t =
+                typename vector_type<typename DstTensor::data_type, DstDataPerAccess>::MemoryType;
+
+            constexpr auto dst_vector_access_dim = Number<DstVectorAccessDim>{};
+            constexpr auto dst_data_per_access   = Number<DstDataPerAccess>{};
+
+            auto dst_slice_vectorized =
+                mDstSlice.Vectorize(dst_vector_access_dim, dst_data_per_access);
+
+#if 0
+            if(get_thread_local_1d_id() == 0 && get_block_1d_id() == 0)
+            {
+                print_ConstantTensorDescriptor("mDstSlice: ",
+                                               typename decltype(mDstSlice)::tensor_desc_type{});
+                print_ConstantTensorDescriptor(
+                    "dst_slice_vector: ",
+                    typename decltype(dst_slice_vectorized)::tensor_desc_type{});
+            }
+#endif
+
+#if 1 // debug
+            ford{}(
+                [&](auto dst_vector_id) {
+                    dst_vector_t vector_data{};
+
+                    // pack vector from buffer
+                    auto dst_scalar_id = dst_vector_id;
+                    dst_scalar_id(dst_vector_access_dim) *= dst_data_per_access;
+
+                    for(index_t i = 0; i < DstDataPerAccess; ++i)
+                    {
+                        auto id                   = make_zero_array<index_t, nDim>();
+                        id(dst_vector_access_dim) = i;
+
+                        reinterpret_cast<data_type*>(&vector_data)[i] = buffer[dst_scalar_id + id];
+                    }
+
+                    // write vector into dst
+                    dst_slice_vectorized(dst_vector_id) = vector_data;
+                });
+#endif
+        }
+    }
+
+    // T can be Sequence or Array
+    template <class T, bool PositiveDirection>
+    __device__ void MoveSrcSliceWindow(T step_sizes, integral_constant<bool, PositiveDirection>)
+    {
+        mSrc.MoveSliceWindow(mSrcSlice, step_sizes, integral_constant<bool, PositiveDirection>{});
+    }
+
+    template <class T, bool PositiveDirection>
+    __device__ void MoveDstSliceWindow(T step_sizes, integral_constant<bool, PositiveDirection>)
+    {
+        mDst.MoveSliceWindow(mDstSlice, step_sizes, integral_constant<bool, PositiveDirection>{});
+    }
+
+    private:
+    using SrcSlice = decltype(SrcTensor{}.Slice(make_zero_array<index_t, nDim>(), SliceLengths{}));
+    using DstSlice = decltype(DstTensor{}.Slice(make_zero_array<index_t, nDim>(), SliceLengths{}));
+
+    SrcTensor mSrc;
+    DstTensor mDst;
+    SrcSlice mSrcSlice;
+    DstSlice mDstSlice;
+};
+
 } // namespace ck
 #endif
diff --git a/composable_kernel/include/utility/integral_constant.hpp b/composable_kernel/include/utility/integral_constant.hpp
index bdeb944423..cae52ebe3a 100644
--- a/composable_kernel/include/utility/integral_constant.hpp
+++ b/composable_kernel/include/utility/integral_constant.hpp
@@ -23,6 +23,9 @@ struct is_same : public integral_constant
 {
 };
 
+template <class T>
+using remove_cv_t = typename std::remove_cv<T>::type;
+
 template <index_t N>
 using Number = integral_constant<index_t, N>;
 
diff --git a/composable_kernel/include/utility/vector_type.hpp b/composable_kernel/include/utility/vector_type.hpp
index 2b33887ffb..01c6539b2b 100644
--- a/composable_kernel/include/utility/vector_type.hpp
+++ b/composable_kernel/include/utility/vector_type.hpp
@@ -14,7 +14,7 @@ struct vector_type
 template <>
 struct vector_type<float, 1>
 {
-    typedef float MemoryType;
+    using MemoryType = float;
 
     template <index_t I>
     __host__ __device__ static void SetScalar(MemoryType& v, float s, Number<I>)
@@ -64,6 +64,24 @@ struct vector_type
     }
 };
 
+template <>
+struct vector_type<const float, 1>
+{
+    using MemoryType = const float;
+};
+
+template <>
+struct vector_type<const float, 2>
+{
+    using MemoryType = const float2_t;
+};
+
+template <>
+struct vector_type<const float, 4>
+{
+    using MemoryType = const float4_t;
+};
+
 } // namespace ck
 #endif
diff --git a/driver/src/driver.cpp b/driver/src/driver.cpp
index 5046fbdbd6..dd9dffb9f5 100644
--- a/driver/src/driver.cpp
+++ b/driver/src/driver.cpp
@@ -72,9 +72,9 @@ int main(int argc, char* argv[])
 {
     using namespace ck;
 
-#if 0
+#if 1
     constexpr index_t N  = 64;
-    constexpr index_t C  = 1536;
+    constexpr index_t C  = 8;
     constexpr index_t HI = 8;
     constexpr index_t WI = 8;
     constexpr index_t K  = 256;
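The slice-window bookkeeping in the kernel's new main loop is the part worth double-checking: every forward MoveSrcSliceWindow along C or X must be undone by exactly one matching backward move ("reset") before the next outer iteration, or the window origin drifts. A standalone host-side sketch of the same invariant, using a hypothetical Window type in plain C++ rather than ck code:

    #include <cassert>

    struct Window
    {
        int c = 0, y = 0, x = 0; // window origin in the source tensor

        void Move(int dc, int dy, int dx, bool positive_direction)
        {
            int sign = positive_direction ? 1 : -1;
            c += sign * dc;
            y += sign * dy;
            x += sign * dx;
        }
    };

    int main()
    {
        constexpr int C = 8, Y = 3, X = 3, CPerBlock = 2;

        Window w;

        for(int y = 0; y < Y; ++y)
        {
            for(int x = 0; x < X; ++x)
            {
                for(int c = 0; c < C; c += CPerBlock)
                {
                    // ... load / gemm ...
                    w.Move(CPerBlock, 0, 0, true); // move along C
                }
                w.Move(C, 0, 0, false); // reset C: the inner loop moved C in total
                w.Move(0, 0, 1, true);  // move along X
            }
            w.Move(0, 0, X, false); // reset X
            w.Move(0, 1, 0, true);  // move along Y
        }

        // undoing the Y moves returns the window to the origin
        w.Move(0, Y, 0, false);
        assert(w.c == 0 && w.y == 0 && w.x == 0);
        return 0;
    }

Stepping through this confirms why each reset passes the negative direction (False in the kernel) together with the total distance traveled: the window origin must be exact at every (y, x, c) step, since the tensor views translate it straight into a pointer offset.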