From a9a392b44dd53a4c29561b601c4ac194880ab45a Mon Sep 17 00:00:00 2001
From: Chao Liu
Date: Thu, 1 Aug 2019 15:32:40 -0500
Subject: [PATCH] experimenting with TensorCoordinate and new merged tensor
 copy operator

---
 ...tion_implicit_gemm_v4r1_nchw_kcyx_nkhw.hpp |  45 ++++++
 ...tion_implicit_gemm_v4r4_nchw_kcyx_nkhw.hpp |  34 -----
 .../tensor_description/tensor_coordinate.hpp  | 142 ++++++------------
 ...tion_implicit_gemm_v4r1_nchw_kcyx_nkhw.hpp |   2 +-
 ...tion_implicit_gemm_v4r4_nchw_kcyx_nkhw.hpp | 104 ++++++-----
 driver/src/driver.cpp                         |   2 +-
 6 files changed, 158 insertions(+), 171 deletions(-)

diff --git a/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw.hpp b/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw.hpp
index fecd7c5ca1..46230c8cb7 100644
--- a/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw.hpp
+++ b/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw.hpp
@@ -155,6 +155,7 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw
         static_assert(in_e_n1_b_n2_block_desc.GetStride(I1) % GemmDataPerReadB == 0,
                       "GemmDataPerReadB alignment requirement is not satisfied");
 
+#if 0 // debug
         // input blockwise copy
         // slice a merged tensor, reorder and copy to a normal tensor
         // this copy operator already has blockwise offset built-in
@@ -172,6 +173,19 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw
             InBlockCopySrcDataPerRead_B,
             InBlockCopyDstDataPerWrite_N2>(
             {0, 0, b_block_data_on_global, 0}, {0, 0, 0, 0});
+#else
+        auto blockwise_in_copy = BlockwiseGenericTensorSliceCopy_v2<
+            BlockSize,
+            Float,
+            decltype(in_e_n1_b_n2_global_merged_desc),
+            decltype(in_e_n1_b_n2_block_desc),
+            MergedTensorCoordinate,
+            NormalTensorCoordinate,
+            decltype(in_e_n1_b_n2_block_desc.GetLengths()),
+            InBlockCopySubLengths_E_N1_B_N2,
+            InBlockCopyClusterLengths_E_N1_B_N2,
+            InBlockCopyThreadClusterArrangeOrder>({0, 0, b_block_data_on_global, 0}, {0, 0, 0, 0});
+#endif
 
         // weight tensor
         //   tensor descriptor in device memory, src of blockwise copy
@@ -184,6 +198,7 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw
             Sequence{}, Number{});
 
+#if 0 // debug
         // operator for blockwise copy of weight into LDS
         // slice a tensor, and copy it into another tensor
        // this copy operator already has blockwise offset built-in
@@ -201,6 +216,19 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw
             WeiBlockCopySrcDataPerRead_E,
             WeiBlockCopyDstDataPerWrite_K>(
             {0, k_block_data_on_global}, {0, 0});
+#else
+        auto blockwise_wei_copy = BlockwiseGenericTensorSliceCopy_v2<
+            BlockSize,
+            Float,
+            decltype(wei_e_k_global_desc),
+            decltype(wei_e_k_block_desc),
+            NormalTensorCoordinate,
+            NormalTensorCoordinate,
+            decltype(wei_e_k_block_desc.GetLengths()),
+            WeiBlockCopySubLengths_E_K,
+            WeiBlockCopyClusterLengths_E_K,
+            WeiBlockCopyThreadClusterArrangeOrder>({0, k_block_data_on_global}, {0, 0});
+#endif
 
         // GEMM definition
         //   c_mtx += transpose(a_mtx) * b_mtx
@@ -277,8 +305,13 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw
 
             __syncthreads();
 
+#if 0
             blockwise_in_copy.MoveSlicingWindowOnSourceTensor(I0, Number<EPerBlock>{}, True);
             blockwise_wei_copy.MoveSlicingWindowOnSourceTensor(I0, Number<EPerBlock>{}, True);
+#else
+            blockwise_in_copy.MoveSrcSlicingWindow({EPerBlock, 0, 0, 0}, true);
+            blockwise_wei_copy.MoveSrcSlicingWindow({EPerBlock, 0}, true);
+#endif
         }
 
         // copy output: register to global memory
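The hunk above replaces the old per-dimension interface (MoveSlicingWindowOnSourceTensor(I0, Number<EPerBlock>{}, True)) with a single multi-index step plus a direction flag. A minimal host-side sketch of that semantics, using a simplified stand-in coordinate (illustration only; SimpleCoordinate and its made-up strides are not part of this patch):

// illustration only: simplified stand-in for the coordinate kept by a copy
// operator; the real implementation is templated over a tensor descriptor
#include <array>
#include <cstddef>

using index_t = std::size_t;

template <index_t NDim>
struct SimpleCoordinate
{
    std::array<index_t, NDim> strides;
    index_t offset = 0;

    // move the slicing window by one multi-index step;
    // mirrors MoveSrcSlicingWindow({EPerBlock, 0, 0, 0}, true)
    void Move(const std::array<index_t, NDim>& step, bool positive_direction)
    {
        index_t diff = 0;
        for(index_t i = 0; i < NDim; ++i)
            diff += step[i] * strides[i];

        if(positive_direction)
            offset += diff;
        else
            offset -= diff;
    }
};

int main()
{
    // a 4-d [E, N1, B, N2] view with known strides (made-up numbers)
    SimpleCoordinate<4> window{{1024, 256, 2, 1}, 0};
    window.Move({8, 0, 0, 0}, true); // advance E by EPerBlock == 8
    return window.offset == 8 * 1024 ? 0 : 1;
}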
@@ -328,6 +361,7 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw
             out_k_n1_b_n2_global_merged_desc.GetOffsetFromMultiIndex(
                 k_thread_data_on_global, 0, b_thread_data_on_global, 0);
 
+#if 0 // debug
         threadwise_generic_tensor_slice_copy_v1(
             out_n0_n1_n2_k0_k1_k2_h_w_thread_desc,
             p_out_thread,
@@ -338,6 +372,17 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw
             out_n0_n1_n2_k0_k1_k2_h_w_thread_desc.GetLengths(),
             arithmetic_sequence_gen<0, 8, 1>::type{},
             Number<1>{});
+#else
+        ThreadwiseGenericTensorSliceCopy_v2<
+            Float,
+            decltype(out_n0_n1_n2_k0_k1_k2_h_w_thread_desc),
+            decltype(out_n0_n1_n2_k0_k1_k2_h_w_global_mem_desc),
+            NormalTensorCoordinate,
+            MergedTensorCoordinate,
+            decltype(out_n0_n1_n2_k0_k1_k2_h_w_thread_desc.GetLengths())>(
+            {0, 0, 0, 0, 0, 0, 0, 0}, {0, 0, 0, 0, 0, 0, 0, 0})
+            .Run(p_out_thread, p_out_thread_on_global);
+#endif
     }
 }
 };
diff --git a/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw.hpp b/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw.hpp
index 2fd7c9c6bd..918f71073d 100644
--- a/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw.hpp
+++ b/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw.hpp
@@ -301,40 +301,6 @@ struct GridwiseConvolutionImplicitGemm_v4r4_nchw_kcyx_nkhw
                                      b_thread_data_on_global % B1});
 
         threadwise_out_copy.Run(p_out_thread, p_out_thread_on_global);
-#elif 0
-        // This is a hack, because slicing a merged dimension is not supported yet.
-        // This should be replaced with logic above, once slicing a merged dimension support
-        // become available
-
-        // dst descriptor
-        constexpr auto out_k0_k1_b_global_desc =
-            make_ConstantMergedTensorDescriptor(out_n_k_h_w_global_desc.Fold(I1, Number<K1>{}),
-                                                Sequence<1>{},
-                                                Sequence<2>{},
-                                                Sequence<0, 3, 4>{});
-
-        // src descriptor
-        constexpr auto out_k0_k1_b_thread_desc = make_ConstantTensorDescriptor_packed(
-            Sequence{});
-
-        auto threadwise_out_copy = ThreadwiseGenericTensorSliceCopy_v2<
-            Float,
-            decltype(out_k0_k1_b_thread_desc),
-            decltype(out_k0_k1_b_global_desc),
-            NormalTensorCoordinate,
-            MergedTensorCoordinate,
-            Sequence>(
-            {0, 0, 0},
-            {k_thread_data_on_global / K1,
-             k_thread_data_on_global % K1,
-             b_thread_data_on_global});
-
-        for(index_t nrepeat = 0; nrepeat < GemmNRepeat; ++nrepeat)
-        {
-            threadwise_out_copy.Run(p_out_thread, p_out_global);
-
-            threadwise_out_copy.MoveSrcSlicingWindow({0, 0, GemmNPerThreadSubC}, true);
-            threadwise_out_copy.MoveDstSlicingWindow({0, 0, B1}, true);
-        }
 #elif 1
         // This is a hack, because slicing a merged dimension is not supported yet.
         // This should be replaced with logic above, once slicing a merged dimension support
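The deleted #elif 0 fallback above addressed the output through a folded K = K0 * K1 view, splitting the flat K index as (k / K1, k % K1). A self-contained sketch of that index arithmetic (fold() and its constants are hypothetical, for illustration only):

#include <cassert>

using index_t = unsigned;

// split a flat index k into (k0, k1) for a K = K0 * K1 folded view, as in
// {k_thread_data_on_global / K1, k_thread_data_on_global % K1, ...}
struct FoldedIndex
{
    index_t k0;
    index_t k1;
};

constexpr FoldedIndex fold(index_t k, index_t K1) { return {k / K1, k % K1}; }

int main()
{
    constexpr index_t K1 = 4;
    constexpr auto f     = fold(14, K1);
    static_assert(f.k0 == 3 && f.k1 == 2, "14 == 3 * 4 + 2");
    assert(f.k0 * K1 + f.k1 == 14); // the fold is invertible
    return 0;
}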
diff --git a/composable_kernel/include/tensor_description/tensor_coordinate.hpp b/composable_kernel/include/tensor_description/tensor_coordinate.hpp
index 98de85bf6a..709beef171 100644
--- a/composable_kernel/include/tensor_description/tensor_coordinate.hpp
+++ b/composable_kernel/include/tensor_description/tensor_coordinate.hpp
@@ -16,7 +16,7 @@ struct NormalTensorCoordinate
     static constexpr index_t nDim = tensor_desc_type::GetNumOfDimension();
 
     __host__ __device__ constexpr NormalTensorCoordinate(Array<index_t, nDim> tensor_index)
-        : mIndex{tensor_index}, mOffset{tensor_desc_type::GetOffsetFromMultiIndex(tensor_index)}
+        : mOffset{tensor_desc_type::GetOffsetFromMultiIndex(tensor_index)}
     {
     }
 
@@ -26,38 +26,15 @@ struct NormalTensorCoordinate
     {
     }
 
-    __host__ __device__ constexpr Array<index_t, nDim> GetIndex() const { return mIndex; }
-
     __host__ __device__ constexpr index_t GetOffset() const { return mOffset; }
 
-    template <class IDim, bool PositiveDirection>
-    __host__ __device__ void
-    MoveOnDimension(IDim idim, index_t step_size, integral_constant<bool, PositiveDirection>)
-    {
-        if(PositiveDirection)
-        {
-            mIndex(idim) += step_size;
-            mOffset += step_size * tensor_desc_type::GetStride(idim);
-        }
-        else
-        {
-            mIndex(idim) -= step_size;
-            mOffset -= step_size * tensor_desc_type::GetStride(idim);
-        }
-    }
-
     // T is Array or Sequence
     template <class T>
     __host__ __device__ type operator+=(T step_sizes)
     {
-#if 0
-        static_assert(is_same, "wrong!");
-#endif
-        static_assert(T::GetSize() == nDim, "wrong!");
+        static_assert(is_same{} && T::GetSize() == nDim, "wrong!");
 
-        static_for<0, nDim, 1>{}([&](auto idim) {
-            this->MoveOnDimension(idim, step_sizes[idim], integral_constant<bool, true>{});
-        });
+        mOffset += tensor_desc_type::GetOffsetFromMultiIndex(step_sizes);
 
         return *this;
     }
@@ -65,14 +42,9 @@ struct NormalTensorCoordinate
     template <class T>
     __host__ __device__ type operator-=(T step_sizes)
    {
-#if 0
-        static_assert(is_same, "wrong!");
-#endif
-        static_assert(T::GetSize() == nDim, "wrong!");
+        static_assert(is_same{} && T::GetSize() == nDim, "wrong!");
 
-        static_for<0, nDim, 1>{}([&](auto idim) {
-            this->MoveOnDimension(idim, step_sizes[idim], integral_constant<bool, false>{});
-        });
+        mOffset -= tensor_desc_type::GetOffsetFromMultiIndex(step_sizes);
 
         return *this;
     }
@@ -93,19 +65,25 @@ struct NormalTensorCoordinate
         return coord;
     }
 
-    // reposition point of origin, and return compensated offset
+    // reposition point of origin, and return compensated offset.
+    // This is a hack to reduce index calculation while looping over
+    // a tensor whose origin is this TensorCoordinate. It does so by spitting
+    // out the run-time offset to the pointer (to the tensor data) held by this
+    // TensorCoordinate, so the caller can add the offset into the run-time pointer
+    // to the data; that way only 1 run-time variable (the updated pointer) is
+    // needed, instead of 2 run-time variables (the old pointer and this offset).
+    // TODO: after introducing the concept of a "run-time tensor view", which contains
+    // the run-time pointer to the data, always keep track of the pointer, instead of
+    // both the offset and the pointer. This also brings the additional benefit that
+    // we don't need to worry that the offset might underflow (because the offset is
+    // an unsigned integer) when updating it.
     __host__ __device__ constexpr index_t RepositionOrigin()
     {
         index_t offset_diff = mOffset;
-
-        mIndex = make_zero_array<index_t, nDim>();
-        mOffset = 0;
-
+        mOffset             = 0;
         return offset_diff;
     }
 
-    // private:
-    Array<index_t, nDim> mIndex;
+    private:
     index_t mOffset;
 };
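The comment above describes the intended calling pattern: the coordinate spits out its accumulated offset once, the caller folds it into the data pointer, and from then on only the pointer is updated. A minimal sketch under that reading (Coordinate here is a simplified stand-in, not the class in this patch):

#include <cstddef>

using index_t = std::size_t;

// simplified stand-in for NormalTensorCoordinate after this patch:
// it keeps only a run-time offset
struct Coordinate
{
    index_t offset;

    // hand the accumulated offset to the caller and reset to zero,
    // so the caller can fold it into the data pointer
    index_t RepositionOrigin()
    {
        index_t diff = offset;
        offset       = 0;
        return diff;
    }
};

int main()
{
    float data[16] = {};
    float* p       = data;

    Coordinate coord{5};
    p += coord.RepositionOrigin(); // one run-time variable (p) instead of two (p, offset)

    *p = 1.0f; // writes the same element as data[5]
    return data[5] == 1.0f ? 0 : 1;
}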
@@ -120,8 +98,7 @@ struct MergedTensorCoordinate
         tensor_desc_type::GetOriginalTensorDescriptor().GetNumOfDimension();
 
     __host__ __device__ constexpr MergedTensorCoordinate(Array<index_t, nDim> tensor_index)
-        : mIndex{tensor_index},
-          mOriginalIndex{tensor_desc_type::GetOriginalMultiIndexFromMultiIndex(tensor_index)}
+        : mOriginalIndex{tensor_desc_type::GetOriginalMultiIndexFromMultiIndex(tensor_index)}
     {
         // partial offset on each dimension
         static_for<0, nDim, 1>{}([&](auto idim) {
@@ -146,8 +123,6 @@ struct MergedTensorCoordinate
     {
     }
 
-    __host__ __device__ constexpr Array<index_t, nDim> GetIndex() const { return mIndex; }
-
     __host__ __device__ constexpr index_t GetOffset() const { return mOffset; }
 
     // step_size should be known at compile time
@@ -157,17 +132,7 @@ struct MergedTensorCoordinate
     {
         constexpr auto idim = IDim{};
 
-        // update multi-index
-        if(PositiveDirection)
-        {
-            mIndex(idim) += step_size;
-        }
-        else
-        {
-            mIndex(idim) -= step_size;
-        }
-
-        // update rest
+        // update original index
         static_if{}([&](auto) {
             constexpr auto partial_original_dims =
                 tensor_desc_type::GetContainedOriginalDimensions(idim);
@@ -253,19 +218,10 @@ struct MergedTensorCoordinate
 
             // update "mThreadSrcOffset", do "+" before "-" to avoid underflow
             mOffset = (mOffset + mPartialOffsets[idim]) - old_partial_offset;
-        }).Else([&](auto) {
-            constexpr auto idim_original =
-                tensor_desc_type::GetContainedOriginalDimensions(idim).Front();
-
-            static_if{}([&](auto fwd) {
-                mOriginalIndex(idim_original) += step_size;
-                mPartialOffsets(idim) += step_size * fwd(tensor_desc_type{}).GetStride(idim);
+        }).Else([&](auto fwd) {
+            static_if{}([&](auto) {
                 mOffset += step_size * fwd(tensor_desc_type{}).GetStride(idim);
-            }).Else([&](auto fwd) {
-                mOriginalIndex(idim_original) -= step_size;
-                mPartialOffsets(idim) -= step_size * fwd(tensor_desc_type{}).GetStride(idim);
-                mOffset -= step_size * fwd(tensor_desc_type{}).GetStride(idim);
-            });
+            }).Else([&](auto) { mOffset -= step_size * fwd(tensor_desc_type{}).GetStride(idim); });
         });
     }
 
@@ -273,10 +229,9 @@ struct MergedTensorCoordinate
     template <class T>
     __host__ __device__ type operator+=(T step_sizes)
     {
-#if 0
-        static_assert(is_same, "wrong!");
-#endif
-        static_assert(T::GetSize() == nDim, "wrong!");
+        static_assert(is_same{} && T::GetSize() == nDim, "wrong!");
+
+        index_t normal_offset_diff = 0;
 
         static_for<0, nDim, 1>{}([&](auto idim) {
             this->MoveOnDimension(idim, step_sizes[idim], integral_constant<bool, true>{});
@@ -288,10 +243,7 @@ struct MergedTensorCoordinate
     template <class T>
     __host__ __device__ type operator-=(T step_sizes)
     {
-#if 0
-        static_assert(is_same, "wrong!");
-#endif
-        static_assert(T::GetSize() == nDim, "wrong!");
+        static_assert(is_same{} && T::GetSize() == nDim, "wrong!");
 
         static_for<0, nDim, 1>{}([&](auto idim) {
             this->MoveOnDimension(idim, step_sizes[idim], integral_constant<bool, false>{});
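MoveOnDimension above keeps, for each merged dimension, the original multi-index plus a per-dimension partial offset, and combines them into the total offset with "+" before "-" so an unsigned offset cannot underflow. A compact stand-in model of that bookkeeping for one merged dimension over two original dimensions (all types and constants here are made up for illustration):

#include <array>
#include <cstddef>

using index_t = std::size_t;

// illustration: one merged dimension covering original dims (h, w) of an
// H x W tensor; lengths/strides are made-up compile-time constants
constexpr index_t H = 4, W = 6;
constexpr index_t stride_h = W, stride_w = 1;

struct MergedCoordinate
{
    std::array<index_t, 2> original_index; // (h, w)
    index_t partial_offset;                // offset contributed by this merged dim
    index_t offset;                        // complete offset

    void MoveOnMergedDim(index_t step)
    {
        // carry-propagating update of the original multi-index
        index_t flat   = original_index[0] * W + original_index[1] + step;
        original_index = {flat / W, flat % W};

        index_t old_partial = partial_offset;
        partial_offset = original_index[0] * stride_h + original_index[1] * stride_w;

        // do "+" before "-" so an unsigned offset cannot underflow
        offset = (offset + partial_offset) - old_partial;
    }
};

int main()
{
    MergedCoordinate c{{0, 4}, 4, 4};
    c.MoveOnMergedDim(3); // crosses the w boundary: (0, 4) -> (1, 1)
    return (c.original_index[0] == 1 && c.offset == stride_h + 1) ? 0 : 1;
}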
@@ -316,33 +268,23 @@ struct MergedTensorCoordinate
         return coord;
     }
 
-    // reposition point of origin, and return compensated offset
-    __host__ __device__ constexpr index_t RepositionOrigin()
-    {
-        index_t offset_diff = 0;
+    __host__ __device__ static constexpr index_t RepositionOrigin() { return 0; }
 
-        static_for<0, nDim, 1>{}([&](auto idim_) {
-            constexpr auto idim = decltype(idim_){};
-
-            static_if{}([&](auto) {
-                constexpr auto idim_original =
-                    tensor_desc_type::GetContainedOriginalDimensions(idim).Front();
-
-                mIndex(idim) = 0;
-                mOriginalIndex(idim_original) = 0;
-                mOffset -= mPartialOffsets[idim];
-                offset_diff += mPartialOffsets[idim];
-                mPartialOffsets(idim) = 0;
-            });
-        });
-
-        return offset_diff;
-    }
-
-    // private:
-    Array<index_t, nDim> mIndex;
+    private:
+    // Allocate register memory for all merged dimensions and normal dimensions.
+    // However, only those merged dimensions whose index will be involved in arithmetic
+    // after the construction of this TensorCoordinate (e.g. when the user moves a
+    // slicing window on the merged dimension) will actually use these registers.
+    // Let's hope the compiler will optimize away the registers allocated for normal
+    // dimensions, and for those merged dimensions that are never involved in index
+    // arithmetic after construction of the TensorCoordinate.
+    // TODO: refactor TensorCoordinate, after introducing the concept of "dimensions",
+    // and simplify the implementation of ConstantMergedTensorDescriptor, so we don't
+    // need to count on the compiler to optimize away those registers for us
     Array<index_t, nOriginalDim> mOriginalIndex;
-    Array<index_t, nDim> mPartialOffsets; // mPartialOffsets is needed for unsigned index type
+    Array<index_t, nDim> mPartialOffsets;
+
+    // complete offset
     index_t mOffset;
 };
diff --git a/driver/include/device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw.hpp b/driver/include/device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw.hpp
index 67395b978d..f79b17ae6d 100644
--- a/driver/include/device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw.hpp
+++ b/driver/include/device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw.hpp
@@ -139,7 +139,7 @@ void device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw(InDesc,
     for(index_t i = 0; i < nrepeat; ++i)
     {
         constexpr auto gridwise_conv =
-#if 0
+#if 1
             GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw
 #else
             GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer
diff --git a/driver/include/device_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw.hpp b/driver/include/device_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw.hpp
index 1788af0cf1..f6e9560385 100644
--- a/driver/include/device_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw.hpp
+++ b/driver/include/device_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw.hpp
@@ -85,6 +85,40 @@ void device_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw(InDesc,
     using WeiBlockCopySrcAccessOrder = Sequence<1, 0>; // [K, E]
     using WeiBlockCopyDstAccessOrder = Sequence<0, 1>; // [E, K]
 
+    constexpr index_t WeiBlockCopySrcDataPerRead_E  = 1;
+    constexpr index_t WeiBlockCopyDstDataPerWrite_K = 1;
+#elif 1
+    // 1x1 filter, 8x8 image
+    constexpr index_t BlockSize = 256;
+
+    constexpr index_t BPerBlock = 128;
+    constexpr index_t KPerBlock = 128;
+    constexpr index_t EPerBlock = 8;
+
+    constexpr index_t GemmMPerThreadSubC = 4;
+    constexpr index_t GemmNPerThreadSubC = 4;
+    constexpr index_t GemmMLevel0Cluster = 4;
+    constexpr index_t GemmNLevel0Cluster = 4;
+    constexpr index_t GemmMLevel1Cluster = 4;
+    constexpr index_t GemmNLevel1Cluster = 4;
+    constexpr index_t GemmKPerThreadLoop = 1;
+    constexpr index_t GemmDataPerReadA   = 4;
+    constexpr index_t GemmDataPerReadB   = 4;
+
+    using InBlockCopySubLengths_E_B            = Sequence<2, 2>;
+    using InBlockCopyClusterLengths_E_B        = Sequence<4, 64>;
+    using InBlockCopyThreadClusterArrangeOrder = Sequence<0, 1>; // [E, B]
+    using InBlockCopySrcAccessOrder            = Sequence<0, 1>; // [E, B]
+    using InBlockCopyDstAccessOrder            = Sequence<0, 1>; // [E, B]
+
+    constexpr index_t InBlockCopyDataPerAccess_B = 1;
+
+    using WeiBlockCopySubLengths_E_K            = Sequence<4, 1>;
+    using WeiBlockCopyClusterLengths_E_K        = Sequence<2, 128>;
+    using WeiBlockCopyThreadClusterArrangeOrder = Sequence<1, 0>; // [K, E]
+    using WeiBlockCopySrcAccessOrder            = Sequence<1, 0>; // [K, E]
+    using WeiBlockCopyDstAccessOrder            = Sequence<0, 1>; // [E, K]
+
     constexpr index_t WeiBlockCopySrcDataPerRead_E  = 1;
     constexpr index_t WeiBlockCopyDstDataPerWrite_K = 1;
 #endif
@@ -96,43 +130,43 @@ void device_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw(InDesc,
 
     printf("%s: BlockSize %u, GridSize %u \n", __func__, BlockSize, GridSize);
 
+    constexpr auto gridwise_conv =
+        GridwiseConvolutionImplicitGemm_v4r4_nchw_kcyx_nkhw{};
+
     for(index_t i = 0; i < nrepeat; ++i)
     {
-        constexpr auto gridwise_conv = GridwiseConvolutionImplicitGemm_v4r4_nchw_kcyx_nkhw<
-            GridSize,
-            BlockSize,
-            T,
-            decltype(in_nchw_desc),
-            decltype(wei_kcyx_desc),
-            decltype(out_nkhw_desc),
-            ConvStrides,
-            ConvDilations,
-            BPerBlock,
-            KPerBlock,
-            EPerBlock,
-            GemmMPerThreadSubC,
-            GemmNPerThreadSubC,
-            GemmMLevel0Cluster,
-            GemmNLevel0Cluster,
-            GemmMLevel1Cluster,
-            GemmNLevel1Cluster,
-            GemmKPerThreadLoop,
-            GemmDataPerReadA,
-            GemmDataPerReadB,
-            InBlockCopySubLengths_E_B,
-            InBlockCopyClusterLengths_E_B,
-            InBlockCopyThreadClusterArrangeOrder,
-            InBlockCopySrcAccessOrder,
-            InBlockCopyDstAccessOrder,
-            InBlockCopyDataPerAccess_B,
-            WeiBlockCopySubLengths_E_K,
-            WeiBlockCopyClusterLengths_E_K,
-            WeiBlockCopyThreadClusterArrangeOrder,
-            WeiBlockCopySrcAccessOrder,
-            WeiBlockCopyDstAccessOrder,
-            WeiBlockCopySrcDataPerRead_E,
-            WeiBlockCopyDstDataPerWrite_K>{};
-
         float time = launch_kernel(run_gridwise_convolution_kernel,
                                    dim3(GridSize),
                                    dim3(BlockSize),
diff --git a/driver/src/driver.cpp b/driver/src/driver.cpp
index b118a55ae7..c9488b211a 100644
--- a/driver/src/driver.cpp
+++ b/driver/src/driver.cpp
@@ -71,7 +71,7 @@ int main(int argc, char* argv[])
 {
     using namespace ck;
 
-#if 0
+#if 1
     constexpr index_t N  = 64;
     constexpr index_t C  = 1536;
     constexpr index_t HI = 8;
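For the new "1x1 filter, 8x8 image" tuning block above, the usual consistency conditions for these copy parameters can be checked at compile time: per-thread sub-lengths times thread-cluster lengths should tile the block slice, and the cluster sizes should multiply to BlockSize. A standalone sketch of those checks (the invariants are the expected ones for this kind of blockwise copy, stated here as an assumption; the constants are copied from the block):

// illustration only: compile-time sanity checks, not part of this patch
using index_t = unsigned;

// constants copied from the "#elif 1" tuning block above
constexpr index_t BlockSize = 256;
constexpr index_t BPerBlock = 128, KPerBlock = 128, EPerBlock = 8;

constexpr index_t InSub_E = 2, InSub_B = 2;     // InBlockCopySubLengths_E_B
constexpr index_t InClu_E = 4, InClu_B = 64;    // InBlockCopyClusterLengths_E_B
constexpr index_t WeiSub_E = 4, WeiSub_K = 1;   // WeiBlockCopySubLengths_E_K
constexpr index_t WeiClu_E = 2, WeiClu_K = 128; // WeiBlockCopyClusterLengths_E_K

// each thread copies SubLengths; the thread cluster tiles the whole block slice
static_assert(InSub_E * InClu_E == EPerBlock, "input copy must tile E");
static_assert(InSub_B * InClu_B == BPerBlock, "input copy must tile B");
static_assert(InClu_E * InClu_B == BlockSize, "one sub-tensor per thread");

static_assert(WeiSub_E * WeiClu_E == EPerBlock, "weight copy must tile E");
static_assert(WeiSub_K * WeiClu_K == KPerBlock, "weight copy must tile K");
static_assert(WeiClu_E * WeiClu_K == BlockSize, "one sub-tensor per thread");

int main() { return 0; }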