From 4908fe3fdc624ec2205f7554aa4780c260a518cd Mon Sep 17 00:00:00 2001 From: Chao Liu Date: Thu, 8 Aug 2019 12:14:06 -0500 Subject: [PATCH] tweak on amd --- ..._v4r1_nchw_kcyx_nkhw_lds_double_buffer.hpp | 22 +----- ..._v4r4_nchw_kcyx_nkhw_lds_double_buffer.hpp | 11 ++- .../tensor_description/tensor_coordinate.hpp | 10 ++- .../blockwise_generic_tensor_slice_copy.hpp | 30 ++++---- .../threadwise_generic_tensor_slice_copy.hpp | 48 ++++++++++-- ...tion_implicit_gemm_v4r4_nchw_kcyx_nkhw.hpp | 73 ++++++++++--------- driver/src/driver.cpp | 8 +- 7 files changed, 117 insertions(+), 85 deletions(-) diff --git a/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer.hpp b/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer.hpp index 62dbcae361..491e9a0914 100644 --- a/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer.hpp +++ b/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer.hpp @@ -335,15 +335,8 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer Float p_in_register_buffer[blockwise_in_copy.GetRegisterBufferSize()]; Float p_wei_register_buffer[blockwise_wei_copy.GetRegisterBufferSize()]; -#if 0 - blockwise_in_copy.MoveSlicingWindowOnSourceTensor(I0, Number{}, True); - // blockwise_wei_copy.MoveSlicingWindowOnSourceTensor(I0, Number{}, - // True); - p_wei_block_on_global += EPerBlock * wei_e_k_global_desc.GetStride(I0); -#else blockwise_in_copy.MoveSrcSlicingWindow(Sequence{}, True); - blockwise_wei_copy.MoveSrcSlicingWindow(Sequence{}, True); -#endif + p_wei_block_on_global += EPerBlock * wei_e_k_global_desc.GetStride(I0); __syncthreads(); @@ -367,14 +360,8 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer Float p_in_register_buffer[blockwise_in_copy.GetRegisterBufferSize()]; Float p_wei_register_buffer[blockwise_wei_copy.GetRegisterBufferSize()]; -#if 0 - blockwise_in_copy.MoveSlicingWindowOnSourceTensor(I0, Number{}, True); - // blockwise_wei_copy.MoveSlicingWindowOnSourceTensor(I0, Number{}, True); - p_wei_block_on_global += EPerBlock * wei_e_k_global_desc.GetStride(I0); -#else blockwise_in_copy.MoveSrcSlicingWindow(Sequence{}, True); - blockwise_wei_copy.MoveSrcSlicingWindow(Sequence{}, True); -#endif + p_wei_block_on_global += EPerBlock * wei_e_k_global_desc.GetStride(I0); __syncthreads(); @@ -447,7 +434,7 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer out_k_n1_b_n2_global_merged_desc.GetOffsetFromMultiIndex( k_thread_data_on_global, 0, b_thread_data_on_global, 0); -#if 0 +#if 1 ThreadwiseGenericTensorSliceCopy_v1r2< decltype(out_n0_n1_n2_k0_k1_k2_h_w_thread_desc), decltype(out_n0_n1_n2_k0_k1_k2_h_w_global_mem_desc), @@ -469,8 +456,7 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer 7, 7, 1, - 1>( - {0, 0, 0, 0, 0, 0, 0, 0}, {0, 0, 0, 0, 0, 0, 0, 0}) + 1>({0, 0, 0, 0, 0, 0, 0, 0}, {0, 0, 0, 0, 0, 0, 0, 0}) .Run(p_out_thread, p_out_thread_on_global); #endif } diff --git a/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw_lds_double_buffer.hpp b/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw_lds_double_buffer.hpp index 08491ddddd..5241086a1a 100644 --- a/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw_lds_double_buffer.hpp +++ b/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw_lds_double_buffer.hpp @@ -244,6 +244,8 @@ struct GridwiseConvolutionImplicitGemm_v4r4_nchw_kcyx_nkhw_lds_double_buffer // zero out threadwise output threadwise_matrix_set_zero(c_k0k1_b0b1_thread_mtx_desc, p_out_thread); + const Float* p_wei_block_on_global = p_wei_global; + // LDS double buffer: preload data into LDS { blockwise_in_copy.Run(p_in_global, p_in_block_double); @@ -273,13 +275,14 @@ struct GridwiseConvolutionImplicitGemm_v4r4_nchw_kcyx_nkhw_lds_double_buffer Float p_wei_register_buffer[blockwise_wei_copy.GetRegisterBufferSize()]; blockwise_in_copy.MoveSrcSlicingWindow(Sequence{}, True); - blockwise_wei_copy.MoveSrcSlicingWindow(Sequence{}, True); + p_wei_block_on_global += EPerBlock * wei_e_k_global_desc.GetStrides()[0]; __syncthreads(); // LDS doubel buffer: load next data from device mem blockwise_in_copy.RunLoadRegisterBuffer(p_in_global, p_in_register_buffer); - blockwise_wei_copy.RunLoadRegisterBuffer(p_wei_global, p_wei_register_buffer); + blockwise_wei_copy.RunLoadRegisterBuffer(p_wei_block_on_global, + p_wei_register_buffer); // LDS double buffer: GEMM on current data blockwise_gemm.Run(p_wei_block_now, p_in_block_now, p_out_thread); @@ -297,13 +300,13 @@ struct GridwiseConvolutionImplicitGemm_v4r4_nchw_kcyx_nkhw_lds_double_buffer // even iteration blockwise_in_copy.MoveSrcSlicingWindow(Sequence{}, True); - blockwise_wei_copy.MoveSrcSlicingWindow(Sequence{}, True); + p_wei_block_on_global += EPerBlock * wei_e_k_global_desc.GetStrides()[0]; __syncthreads(); // LDS doubel buffer: load next data from device mem blockwise_in_copy.RunLoadRegisterBuffer(p_in_global, p_in_register_buffer); - blockwise_wei_copy.RunLoadRegisterBuffer(p_wei_global, p_wei_register_buffer); + blockwise_wei_copy.RunLoadRegisterBuffer(p_wei_block_on_global, p_wei_register_buffer); // LDS double buffer: GEMM on current data blockwise_gemm.Run(p_wei_block_double, p_in_block_double, p_out_thread); diff --git a/composable_kernel/include/tensor_description/tensor_coordinate.hpp b/composable_kernel/include/tensor_description/tensor_coordinate.hpp index 25c1124755..7b76eac8bc 100644 --- a/composable_kernel/include/tensor_description/tensor_coordinate.hpp +++ b/composable_kernel/include/tensor_description/tensor_coordinate.hpp @@ -237,7 +237,10 @@ struct MergedTensorCoordinate index_t normal_offset_diff = 0; static_for<0, nDim, 1>{}([&](auto idim) { - this->MoveOnDimension(idim, step_sizes[idim], integral_constant{}); + if(step_sizes[idim] != 0) + { + this->MoveOnDimension(idim, step_sizes[idim], integral_constant{}); + } }); return *this; @@ -249,7 +252,10 @@ struct MergedTensorCoordinate static_assert(is_same{} && T::GetSize() == nDim, "wrong!"); static_for<0, nDim, 1>{}([&](auto idim) { - this->MoveOnDimension(idim, step_sizes[idim], integral_constant{}); + if(step_sizes[idim] != 0) + { + this->MoveOnDimension(idim, step_sizes[idim], integral_constant{}); + } }); return *this; diff --git a/composable_kernel/include/tensor_operation/blockwise_generic_tensor_slice_copy.hpp b/composable_kernel/include/tensor_operation/blockwise_generic_tensor_slice_copy.hpp index 158d82bab9..cc58e44fb7 100644 --- a/composable_kernel/include/tensor_operation/blockwise_generic_tensor_slice_copy.hpp +++ b/composable_kernel/include/tensor_operation/blockwise_generic_tensor_slice_copy.hpp @@ -402,6 +402,19 @@ struct BlockwiseGenericTensorSliceCopy_v1 }); }); } + + template + __device__ void + MoveSrcSlicingWindow(T step_sizes, + integral_constant positive_direction) + { + static_for<0, nDim, 1>{}([&](auto idim) { + if(step_sizes[idim] != 0) + { + MoveSlicingWindowOnSourceTensor(idim, step_sizes[idim], positive_direction); + } + }); + } }; template , - SubLengths>; - - using ThreadwiseStore = - ThreadwiseGenericTensorSliceCopy_v2, - DstCoordinate, - SubLengths>; -#else using ThreadwiseLoad = ThreadwiseGenericTensorSliceCopy_v2r1; -#endif + ThreadwiseLoad mThreadwiseLoad; ThreadwiseStore mThreadwiseStore; }; diff --git a/composable_kernel/include/tensor_operation/threadwise_generic_tensor_slice_copy.hpp b/composable_kernel/include/tensor_operation/threadwise_generic_tensor_slice_copy.hpp index 5574e4d275..57dec923f1 100644 --- a/composable_kernel/include/tensor_operation/threadwise_generic_tensor_slice_copy.hpp +++ b/composable_kernel/include/tensor_operation/threadwise_generic_tensor_slice_copy.hpp @@ -594,7 +594,6 @@ struct ThreadwiseGenericTensorSliceCopy_v2 DstCoordinate mDstSliceOrigin; }; -#if 1 // This threadwise copy allow vector access of src and dst. // It allows the dimensions of vector access to be different on src and dst. // It also allows the vector size to be different on src and dst. @@ -623,6 +622,49 @@ struct ThreadwiseGenericTensorSliceCopy_v2r1 DstCoordinate dst_slice_origin) : mSrcSliceOrigin(src_slice_origin), mDstSliceOrigin(dst_slice_origin) { + static_assert(nDim == SrcDesc::GetNumOfDimension() && + nDim == DstDesc::GetNumOfDimension() && nDim == SliceLengths::GetSize() && + nDim == SrcDimAccessOrder::GetSize() && + nDim == DstDimAccessOrder::GetSize(), + "wrong! # of dimensions not the same"); + + static_assert(is_valid_sequence_map::value && + is_valid_sequence_map::value, + "wrong! map is not valid"); + + static_assert(SliceLengths{}[SrcVectorAccessDim] % SrcDataPerAccess == 0 && + SliceLengths{}[DstVectorAccessDim] % DstDataPerAccess == 0, + "wrong! cannot evenly divide"); + + // check vectorized memory access + constexpr auto src_vector_access_dim = Number{}; + constexpr auto dst_vector_access_dim = Number{}; + + static_if{}( + [&](auto fwd) { + static_assert( + (fwd(SrcDesc{}).GetStride(src_vector_access_dim) == 1 || SrcDataPerAccess == 1), + "wrong! vectorized access is allowed only if stride == 1"); + }) + .Else([&](auto fwd) { + static_assert( + (fwd(SrcDesc{}).GetLastOriginalDimensionStride(src_vector_access_dim) == 1 || + SrcDataPerAccess == 1), + "wrong! vectorized access is allowed only if stride == 1"); + }); + + static_if{}( + [&](auto fwd) { + static_assert( + (fwd(DstDesc{}).GetStride(dst_vector_access_dim) == 1 || DstDataPerAccess == 1), + "wrong! vectorized access is allowed only if stride == 1"); + }) + .Else([&](auto fwd) { + static_assert( + (fwd(DstDesc{}).GetLastOriginalDimensionStride(dst_vector_access_dim) == 1 || + DstDataPerAccess == 1), + "wrong! vectorized access is allowed only if stride == 1"); + }); } __device__ constexpr ThreadwiseGenericTensorSliceCopy_v2r1() @@ -725,9 +767,6 @@ struct ThreadwiseGenericTensorSliceCopy_v2r1 constexpr index_t buffer_offset = buffer_desc.GetOffsetFromMultiIndex( src_merged_dim_data_id + src_normal_dim_data_id + scalar_id); - constexpr index_t buffer_offset = - buffer_desc.GetOffsetFromMultiIndex(src_data_begin_id + scalar_id); - p_buffer[buffer_offset] = reinterpret_cast(&vector_data)[i]; }); }); @@ -900,7 +939,6 @@ struct ThreadwiseGenericTensorSliceCopy_v2r1 SrcCoordinate mSrcSliceOrigin; DstCoordinate mDstSliceOrigin; }; -#endif } // namespace ck #endif diff --git a/driver/include/device_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw.hpp b/driver/include/device_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw.hpp index 8c9a791bdd..79b7da25f5 100644 --- a/driver/include/device_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw.hpp +++ b/driver/include/device_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw.hpp @@ -54,7 +54,7 @@ void device_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw(InDesc, wei_kcyx_device_buf.ToDevice(wei_kcyx.mData.data()); out_nkhw_device_buf.ToDevice(out_nkhw.mData.data()); -#if 1 +#if 0 constexpr index_t BlockSize = 256; constexpr index_t BPerBlock = 128; @@ -88,41 +88,6 @@ void device_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw(InDesc, constexpr index_t WeiBlockCopySrcDataPerRead_E = 4; constexpr index_t WeiBlockCopyDstDataPerWrite_K = 1; - constexpr index_t OutThreadCopyDataPerAccess_B = 1; -#elif 0 // debug - constexpr index_t BlockSize = 256; - - constexpr index_t BPerBlock = 128; - constexpr index_t KPerBlock = 128; - constexpr index_t EPerBlock = 8; - - constexpr index_t GemmMPerThreadSubC = 4; - constexpr index_t GemmNPerThreadSubC = 4; - constexpr index_t GemmMLevel0Cluster = 4; - constexpr index_t GemmNLevel0Cluster = 4; - constexpr index_t GemmMLevel1Cluster = 4; - constexpr index_t GemmNLevel1Cluster = 4; - constexpr index_t GemmKPerThreadLoop = 1; - constexpr index_t GemmDataPerReadA = 4; - constexpr index_t GemmDataPerReadB = 4; - - using InBlockCopySubLengths_E_B = Sequence<1, 4>; - using InBlockCopyClusterLengths_E_B = Sequence<8, 32>; - using InBlockCopyThreadClusterArrangeOrder = Sequence<0, 1>; // [E, B] - using InBlockCopySrcAccessOrder = Sequence<0, 1>; // [E, B] - using InBlockCopyDstAccessOrder = Sequence<0, 1>; // [E, B] - - constexpr index_t InBlockCopyDataPerAccess_B = 1; - - using WeiBlockCopySubLengths_E_K = Sequence<4, 1>; - using WeiBlockCopyClusterLengths_E_K = Sequence<2, 128>; - using WeiBlockCopyThreadClusterArrangeOrder = Sequence<1, 0>; // [K, E] - using WeiBlockCopySrcAccessOrder = Sequence<1, 0>; // [K, E] - using WeiBlockCopyDstAccessOrder = Sequence<0, 1>; // [E, K] - - constexpr index_t WeiBlockCopySrcDataPerRead_E = 4; - constexpr index_t WeiBlockCopyDstDataPerWrite_K = 1; - constexpr index_t OutThreadCopyDataPerAccess_B = 1; #elif 1 // 1x1 filter, 8x8 image @@ -160,6 +125,42 @@ void device_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw(InDesc, constexpr index_t WeiBlockCopyDstDataPerWrite_K = 1; constexpr index_t OutThreadCopyDataPerAccess_B = 4; +#elif 0 + // 1x1 filter, 14x14 image + constexpr index_t BlockSize = 256; + + constexpr index_t BPerBlock = 128; + constexpr index_t KPerBlock = 128; + constexpr index_t EPerBlock = 8; + + constexpr index_t GemmMPerThreadSubC = 4; + constexpr index_t GemmNPerThreadSubC = 4; + constexpr index_t GemmMLevel0Cluster = 4; + constexpr index_t GemmNLevel0Cluster = 4; + constexpr index_t GemmMLevel1Cluster = 4; + constexpr index_t GemmNLevel1Cluster = 4; + constexpr index_t GemmKPerThreadLoop = 1; + constexpr index_t GemmDataPerReadA = 4; + constexpr index_t GemmDataPerReadB = 4; + + using InBlockCopySubLengths_E_B = Sequence<2, 2>; + using InBlockCopyClusterLengths_E_B = Sequence<4, 64>; + using InBlockCopyThreadClusterArrangeOrder = Sequence<0, 1>; // [E, B] + using InBlockCopySrcAccessOrder = Sequence<0, 1>; // [E, B] + using InBlockCopyDstAccessOrder = Sequence<0, 1>; // [E, B] + + constexpr index_t InBlockCopyDataPerAccess_B = 2; + + using WeiBlockCopySubLengths_E_K = Sequence<4, 1>; + using WeiBlockCopyClusterLengths_E_K = Sequence<2, 128>; + using WeiBlockCopyThreadClusterArrangeOrder = Sequence<1, 0>; // [K, E] + using WeiBlockCopySrcAccessOrder = Sequence<1, 0>; // [K, E] + using WeiBlockCopyDstAccessOrder = Sequence<0, 1>; // [E, K] + + constexpr index_t WeiBlockCopySrcDataPerRead_E = 4; + constexpr index_t WeiBlockCopyDstDataPerWrite_K = 1; + + constexpr index_t OutThreadCopyDataPerAccess_B = 2; #endif constexpr index_t B = N * Ho * Wo; diff --git a/driver/src/driver.cpp b/driver/src/driver.cpp index 2f9a2c3e76..7110a1a45e 100644 --- a/driver/src/driver.cpp +++ b/driver/src/driver.cpp @@ -87,7 +87,7 @@ int main(int argc, char* argv[]) constexpr index_t WPad = 0; #elif 0 // 3x3, 34x34 - constexpr index_t N = 128; + constexpr index_t N = 64; constexpr index_t C = 256; constexpr index_t HI = 34; constexpr index_t WI = 34; @@ -228,7 +228,7 @@ int main(int argc, char* argv[]) constexpr index_t HPad = 0; constexpr index_t WPad = 0; -#elif 0 +#elif 1 // 3x3 filter, 2x2 stride, 35x35 input, 17x17 output // cudnn@V100 90%, ck@V100 93%, ck@P100 83%, ck@VII 81% constexpr index_t N = 128; @@ -244,7 +244,7 @@ int main(int argc, char* argv[]) constexpr index_t HPad = 0; constexpr index_t WPad = 0; -#elif 0 +#elif 1 // 1x1 filter, 17x17 input // cudnn@V100 81%, ck@V100 76%, ck@P100 70%, ck@VII 76% constexpr index_t N = 128; @@ -379,7 +379,7 @@ int main(int argc, char* argv[]) #elif 0 device_convolution_implicit_gemm_v3_nchw_cyxk_nkhw( (in_nchw_desc, in_nchw, wei_kcyx_desc, wei_kcyx, out_nkhw_desc, out_nkhw_device, nrepeat); -#elif 1 +#elif 0 device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw(in_nchw_desc, in_nchw, wei_kcyx_desc,