From bc9ea646f8bb006913713dffacafdb0c929c899e Mon Sep 17 00:00:00 2001
From: Chao Liu
Date: Wed, 7 Aug 2019 19:09:13 -0500
Subject: [PATCH] use ford/for instead of static_ford/static_for in threadwise
 copy; somehow register spill is greatly reduced on AMD

---
 ..._v4r1_nchw_kcyx_nkhw_lds_double_buffer.hpp | 25 ++---
 .../blockwise_generic_tensor_slice_copy.hpp   | 20 ++--
 .../threadwise_generic_tensor_slice_copy.hpp  | 96 ++++++++++++++++++-
 .../include/utility/config_amd.hpp.in         |  2 +
 .../include/utility/config_nvidia.hpp.in      |  2 +
 ...tion_implicit_gemm_v4r1_nchw_kcyx_nkhw.hpp |  8 +-
 driver/src/driver.cpp                         |  4 +-
 7 files changed, 122 insertions(+), 35 deletions(-)

diff --git a/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer.hpp b/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer.hpp
index 938d203755..66fe01c71a 100644
--- a/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer.hpp
+++ b/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer.hpp
@@ -155,7 +155,7 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer
         static_assert(in_e_n1_b_n2_block_desc.GetStride(I1) % GemmDataPerReadB == 0,
                       "GemmDataPerReadB alignment requirement is not satisfied");
 
-#if 1
+#if 0
         // input blockwise copy
         // slice a merged tensor, reorder and copy to a normal tensor
         // this copy operator already has blockwise offset built-in
@@ -178,7 +178,6 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer
 #else
         auto blockwise_in_copy = BlockwiseGenericTensorSliceCopy_v2<
             BlockSize,
-            Float,
             decltype(in_e_n1_b_n2_global_merged_desc),
             decltype(in_e_n1_b_n2_block_desc),
             MergedTensorCoordinate,
@@ -200,7 +199,7 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer
             Sequence{},
             Number{});
 
-#if 1
+#if 0
         // operator for blockwise copy of weight into LDS
         // slice a tensor, and copy it into another tensor
         // this copy operator already have blockwise offset built-in
@@ -223,7 +222,6 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer
 #else
         auto blockwise_wei_copy = BlockwiseGenericTensorSliceCopy_v2<
             BlockSize,
-            Float,
             decltype(wei_e_k_global_desc),
             decltype(wei_e_k_block_desc),
             NormalTensorCoordinate,
@@ -326,7 +324,7 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer
         Float p_in_register_buffer[blockwise_in_copy.GetRegisterBufferSize()];
         Float p_wei_register_buffer[blockwise_wei_copy.GetRegisterBufferSize()];
 
-#if 1
+#if 0
         blockwise_in_copy.MoveSlicingWindowOnSourceTensor(I0, Number<EPerBlock>{}, True);
         // blockwise_wei_copy.MoveSlicingWindowOnSourceTensor(I0, Number<EPerBlock>{},
         // True);
@@ -358,7 +356,7 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer
             Float p_in_register_buffer[blockwise_in_copy.GetRegisterBufferSize()];
             Float p_wei_register_buffer[blockwise_wei_copy.GetRegisterBufferSize()];
 
-#if 1
+#if 0
             blockwise_in_copy.MoveSlicingWindowOnSourceTensor(I0, Number<EPerBlock>{}, True);
             // blockwise_wei_copy.MoveSlicingWindowOnSourceTensor(I0, Number<EPerBlock>{}, True);
             p_wei_block_on_global += EPerBlock * wei_e_k_global_desc.GetStride(I0);
@@ -439,17 +437,6 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer
                 k_thread_data_on_global, 0, b_thread_data_on_global, 0);
 
 #if 0
-        threadwise_generic_tensor_slice_copy_v1(
-            out_n0_n1_n2_k0_k1_k2_h_w_thread_desc,
-            p_out_thread,
-            {0, 0, 0, 0, 0, 0, 0, 0},
-            out_n0_n1_n2_k0_k1_k2_h_w_global_mem_desc,
-            p_out_thread_on_global,
-            {0, 0, 0, 0, 0, 0, 0, 0},
-            out_n0_n1_n2_k0_k1_k2_h_w_thread_desc.GetLengths(),
-            arithmetic_sequence_gen<0, 8, 1>::type{},
-            Number<1>{});
-#elif 0
         ThreadwiseGenericTensorSliceCopy_v1r1<
             decltype(out_n0_n1_n2_k0_k1_k2_h_w_thread_desc),
             decltype(out_n0_n1_n2_k0_k1_k2_h_w_global_mem_desc),
@@ -461,7 +448,7 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer
             1,
             1>(make_zero_array<index_t, 8>(), make_zero_array<index_t, 8>())
             .Run(p_out_thread, p_out_thread_on_global);
-#elif 1
+#elif 0
         ThreadwiseGenericTensorSliceCopy_v1r2<
             decltype(out_n0_n1_n2_k0_k1_k2_h_w_thread_desc),
             decltype(out_n0_n1_n2_k0_k1_k2_h_w_global_mem_desc),
@@ -471,7 +458,7 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer
             1,
             1>(make_zero_array<index_t, 8>(), make_zero_array<index_t, 8>())
             .Run(p_out_thread, p_out_thread_on_global);
-#elif 0
+#elif 1
         ThreadwiseGenericTensorSliceCopy_v2<
             decltype(out_n0_n1_n2_k0_k1_k2_h_w_thread_desc),
             decltype(out_n0_n1_n2_k0_k1_k2_h_w_global_mem_desc),
diff --git a/composable_kernel/include/tensor_operation/blockwise_generic_tensor_slice_copy.hpp b/composable_kernel/include/tensor_operation/blockwise_generic_tensor_slice_copy.hpp
index 8b82c3978b..7473194177 100644
--- a/composable_kernel/include/tensor_operation/blockwise_generic_tensor_slice_copy.hpp
+++ b/composable_kernel/include/tensor_operation/blockwise_generic_tensor_slice_copy.hpp
@@ -22,7 +22,6 @@ namespace ck {
 // repeat-length on the merged dimension need to be 1. These sanity checks are performed
 // in constructor of BlockwiseGenericTensorSliceCopy_v1
 template <index_t BlockSize,
-          class Float,
           class SrcDesc,
           class DstDesc,
           class SliceLengths,
@@ -244,8 +243,9 @@ struct BlockwiseGenericTensorSliceCopy_v1
         });
     }
 
-    __device__ void RunLoadRegisterBuffer(const Float* __restrict__ p_src,
-                                          Float* __restrict__ p_buffer) const
+    template <class TData>
+    __device__ void RunLoadRegisterBuffer(const TData* __restrict__ p_src,
+                                          TData* __restrict__ p_buffer) const
     {
         constexpr auto thread_sub_tensor_lengths = SubLengths{};
@@ -255,7 +255,7 @@ struct BlockwiseGenericTensorSliceCopy_v1
                     thread_sub_tensor_lengths,
                     SrcDimAccessOrder{},
                     Number{});
-#elif 0
+#elif 1
                 ThreadwiseGenericTensorSliceCopy_v1r1<
                     SrcDesc,
                     decltype(thread_buffer_desc),
@@ -281,8 +281,9 @@ struct BlockwiseGenericTensorSliceCopy_v1
         });
     }
 
-    __device__ void RunStoreRegisterBuffer(const Float* __restrict__ p_buffer,
-                                           Float* __restrict__ p_dst) const
+    template <class TData>
+    __device__ void RunStoreRegisterBuffer(const TData* __restrict__ p_buffer,
+                                           TData* __restrict__ p_dst) const
     {
         constexpr auto thread_sub_tensor_lengths = SubLengths{};
@@ -333,7 +334,7 @@ struct BlockwiseGenericTensorSliceCopy_v1
                     thread_sub_tensor_lengths,
                     DstDimAccessOrder{},
                     Number{});
-#elif 0
+#elif 1
                 ThreadwiseGenericTensorSliceCopy_v1r1<
                     decltype(thread_buffer_desc),
                     DstDesc,
@@ -360,9 +361,10 @@ struct BlockwiseGenericTensorSliceCopy_v1
         });
     }
 
-    __device__ void Run(const Float* __restrict__ p_src, Float* __restrict__ p_dst) const
+    template <class TData>
+    __device__ void Run(const TData* __restrict__ p_src, TData* __restrict__ p_dst) const
     {
-        Float p_buffer[GetRegisterBufferSize()];
+        TData p_buffer[GetRegisterBufferSize()];
 
         RunLoadRegisterBuffer(p_src, p_buffer);
         RunStoreRegisterBuffer(p_buffer, p_dst);
diff --git a/composable_kernel/include/tensor_operation/threadwise_generic_tensor_slice_copy.hpp b/composable_kernel/include/tensor_operation/threadwise_generic_tensor_slice_copy.hpp
index 3fd9b0d37a..5354cdd61c 100644
--- a/composable_kernel/include/tensor_operation/threadwise_generic_tensor_slice_copy.hpp
+++ b/composable_kernel/include/tensor_operation/threadwise_generic_tensor_slice_copy.hpp
@@ -10,10 +10,18 @@
 #define CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V1 0
 #endif
 
+#ifndef CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V1R1
+#define CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V1R1 0
+#endif
+
 #ifndef CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V1R2
 #define CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V1R2 0
 #endif
 
+#ifndef CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V2
+#define CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V2 0
+#endif
+
 namespace ck {
 
 // user need to make sure alignment requirement is satisfied when setting DataPerAccesss > 1
@@ -216,6 +224,7 @@ struct ThreadwiseGenericTensorSliceCopy_v1r1
                 src_vector_access_dim,
                 SliceLengths::Get(src_vector_access_dim) / src_data_per_access);
 
+#if CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V1R1
             static_ford<decltype(src_access_lengths)>{}([&](auto src_access_id) {
                 constexpr auto src_data_begin_id = src_access_id.Modify(
                     src_vector_access_dim,
@@ -239,6 +248,31 @@ struct ThreadwiseGenericTensorSliceCopy_v1r1
                     p_buffer[buffer_offset] = reinterpret_cast<const TData*>(&vector_data)[i];
                 });
             });
+#else
+            ford<decltype(src_access_lengths)>{}([&](auto src_access_id) {
+                auto src_data_begin_id = src_access_id;
+                src_data_begin_id(src_vector_access_dim) =
+                    src_access_id[src_vector_access_dim] * src_data_per_access;
+
+                const index_t src_offset =
+                    SrcDesc::GetOffsetFromMultiIndex(mSrcSliceOrigin + src_data_begin_id);
+
+                // load vector from src
+                const vector_t vector_data = *reinterpret_cast<const vector_t*>(&p_src[src_offset]);
+
+                // unpack vector into buffer
+                for(index_t i = 0; i < SrcDataPerAccess; ++i)
+                {
+                    auto scalar_id = make_zero_array<index_t, nDim>();
+                    scalar_id(src_vector_access_dim) = i;
+
+                    const index_t buffer_offset =
+                        buffer_desc.GetOffsetFromMultiIndex(src_data_begin_id + scalar_id);
+
+                    p_buffer[buffer_offset] = reinterpret_cast<const TData*>(&vector_data)[i];
+                }
+            });
+#endif
         }
 
         // copy data from buffer to dst
@@ -252,6 +286,7 @@ struct ThreadwiseGenericTensorSliceCopy_v1r1
                 dst_vector_access_dim,
                 SliceLengths::Get(dst_vector_access_dim) / dst_data_per_access);
 
+#if CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V1R1
             static_ford<decltype(dst_access_lengths)>{}([&](auto dst_access_id) {
                 constexpr auto dst_data_begin_id = dst_access_id.Modify(
                     dst_vector_access_dim,
@@ -277,6 +312,33 @@ struct ThreadwiseGenericTensorSliceCopy_v1r1
 
                 // store vector into dst
                 *reinterpret_cast<vector_t*>(&p_dst[dst_offset]) = vector_data;
             });
+#else
+            ford<decltype(dst_access_lengths)>{}([&](auto dst_access_id) {
+                auto dst_data_begin_id = dst_access_id;
+                dst_data_begin_id(dst_vector_access_dim) =
+                    dst_access_id[dst_vector_access_dim] * dst_data_per_access;
+
+                vector_t vector_data;
+
+                // pack vector from buffer
+                for(index_t i = 0; i < DstDataPerAccess; ++i)
+                {
+                    auto scalar_id = make_zero_array<index_t, nDim>();
+                    scalar_id(dst_vector_access_dim) = i;
+
+                    const index_t buffer_offset =
+                        buffer_desc.GetOffsetFromMultiIndex(dst_data_begin_id + scalar_id);
+
+                    reinterpret_cast<TData*>(&vector_data)[i] = p_buffer[buffer_offset];
+                }
+
+                const index_t dst_offset =
+                    DstDesc::GetOffsetFromMultiIndex(mDstSliceOrigin + dst_data_begin_id);
+
+                // store vector into dst
+                *reinterpret_cast<vector_t*>(&p_dst[dst_offset]) = vector_data;
+            });
+#endif
         }
     }
@@ -373,7 +435,7 @@ struct ThreadwiseGenericTensorSliceCopy_v1r2
         constexpr auto long_vector_access_lengths = SliceLengths::Modify(
             vector_access_dim, SliceLengths::Get(vector_access_dim) / long_vector_size);
 
-#ifndef CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V1R2
+#if CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V1R2
         static_ford<decltype(long_vector_access_lengths)>{}([&](
             auto long_vector_access_id) {
@@ -524,6 +586,7 @@ struct ThreadwiseGenericTensorSliceCopy_v2
         using SrcNormalDimSliceLengthsHack =
             decltype((SliceLengths{} + Number<1>{}) - SrcMergedDimSliceLengthsHack{});
 
+#if CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V2
         static_ford<SrcMergedDimSliceLengthsHack>{}([&](auto merged_dim_data_id_) {
             constexpr auto merged_dim_data_id = decltype(merged_dim_data_id_){};
@@ -541,6 +604,21 @@ struct ThreadwiseGenericTensorSliceCopy_v2
                 p_buffer[buffer_offset] = p_src_tmp[src_normal_offset];
             });
         });
+#else
+        ford<SrcMergedDimSliceLengthsHack>{}([&](auto merged_dim_data_id) {
+            const TData* p_src_tmp = p_src + (mSrcSliceOrigin + merged_dim_data_id).GetOffset();
+
+            ford<SrcNormalDimSliceLengthsHack>{}([&](auto normal_dim_data_id) {
+                const index_t buffer_offset =
+                    buffer_desc.GetOffsetFromMultiIndex(merged_dim_data_id + normal_dim_data_id);
+
+                const index_t src_normal_offset =
+                    SrcDesc::GetOffsetFromMultiIndex(normal_dim_data_id);
+
+                p_buffer[buffer_offset] = p_src_tmp[src_normal_offset];
+            });
+        });
+#endif
 
         // DstMergedDimSliceLengthsHack has entry same as SliceLengths on dst merged dimensions,
         // but 1 on normal dimensions;
@@ -553,6 +631,7 @@ struct ThreadwiseGenericTensorSliceCopy_v2
         using DstNormalDimSliceLengthsHack =
             decltype((SliceLengths{} + Number<1>{}) - DstMergedDimSliceLengthsHack{});
 
+#if CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V2
         static_ford<DstMergedDimSliceLengthsHack>{}([&](auto merged_dim_data_id_) {
             constexpr auto merged_dim_data_id = decltype(merged_dim_data_id_){};
@@ -570,6 +649,21 @@ struct ThreadwiseGenericTensorSliceCopy_v2
                 p_dst_tmp[dst_normal_offset] = p_buffer[buffer_offset];
             });
         });
+#else
+        ford<DstMergedDimSliceLengthsHack>{}([&](auto merged_dim_data_id) {
+            TData* p_dst_tmp = p_dst + (mDstSliceOrigin + merged_dim_data_id).GetOffset();
+
+            ford<DstNormalDimSliceLengthsHack>{}([&](auto normal_dim_data_id) {
+                const index_t buffer_offset =
+                    buffer_desc.GetOffsetFromMultiIndex(merged_dim_data_id + normal_dim_data_id);
+
+                const index_t dst_normal_offset =
+                    DstDesc::GetOffsetFromMultiIndex(normal_dim_data_id);
+
+                p_dst_tmp[dst_normal_offset] = p_buffer[buffer_offset];
+            });
+        });
+#endif
     }
 
     // T can be Sequence or Array
diff --git a/composable_kernel/include/utility/config_amd.hpp.in b/composable_kernel/include/utility/config_amd.hpp.in
index c0484a1f18..aed2947c7d 100644
--- a/composable_kernel/include/utility/config_amd.hpp.in
+++ b/composable_kernel/include/utility/config_amd.hpp.in
@@ -8,7 +8,9 @@
 #define CK_USE_AMD_INLINE_ASM 1
 #define CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_BLOCKWISE_GENERIC_SLICE_COPY_V1 1
 #define CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V1 0
+#define CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V1R1 0
 #define CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V1R2 0
+#define CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V2 0
 
 namespace ck {
 
diff --git a/composable_kernel/include/utility/config_nvidia.hpp.in b/composable_kernel/include/utility/config_nvidia.hpp.in
index 2287868410..3599dc8f8a 100644
--- a/composable_kernel/include/utility/config_nvidia.hpp.in
+++ b/composable_kernel/include/utility/config_nvidia.hpp.in
@@ -10,7 +10,9 @@
 #define CK_USE_AMD_INLINE_ASM 0
 #define CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_BLOCKWISE_GENERIC_SLICE_COPY_V1 0
 #define CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V1 0
+#define CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V1R1 0
 #define CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V1R2 0
+#define CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V2 0
 
 namespace ck {
 
diff --git a/driver/include/device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw.hpp b/driver/include/device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw.hpp
index 58827649a8..8a40a60e24 100644
--- a/driver/include/device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw.hpp
+++ b/driver/include/device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw.hpp
@@ -112,14 +112,14 @@ void device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw(InDesc,
     constexpr index_t GemmDataPerReadA = 4;
     constexpr index_t GemmDataPerReadB = 4;
 
-    using InBlockCopySubLengths_E_N1_B_N2     = Sequence<1, 1, 4, 1>;
-    using InBlockCopyClusterLengths_E_N1_B_N2 = Sequence<8, 2, 4, 4>;
+    using InBlockCopySubLengths_E_N1_B_N2     = Sequence<1, 1, 2, 2>;
+    using InBlockCopyClusterLengths_E_N1_B_N2 = Sequence<8, 2, 8, 2>;
 
     using InBlockCopyThreadClusterArrangeOrder = Sequence<0, 1, 3, 2>; // [E, N1, N2, B]
     using InBlockCopySrcAccessOrder            = Sequence<0, 1, 3, 2>; // [E, N1, N2, B]
     using InBlockCopyDstAccessOrder            = Sequence<0, 1, 2, 3>; // [E, N1, B, N2]
 
-    constexpr index_t InBlockCopySrcDataPerRead_B   = 4;
-    constexpr index_t InBlockCopyDstDataPerWrite_N2 = 1;
+    constexpr index_t InBlockCopySrcDataPerRead_B   = 2;
+    constexpr index_t InBlockCopyDstDataPerWrite_N2 = 2;
 
     using WeiBlockCopySubLengths_E_K     = Sequence<2, 2>;
     using WeiBlockCopyClusterLengths_E_K = Sequence<4, 64>;
diff --git a/driver/src/driver.cpp b/driver/src/driver.cpp
index d70483926d..ea9f29b2e5 100644
--- a/driver/src/driver.cpp
+++ b/driver/src/driver.cpp
@@ -16,7 +16,7 @@
 #include "device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw.hpp"
 //#include "device_convolution_implicit_gemm_v4r2_nchw_kcyx_nkhw.hpp"
 //#include "device_convolution_implicit_gemm_v4r3_nchw_kcyx_nkhw.hpp"
-//#include "device_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw.hpp"
+#include "device_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw.hpp"
 
 struct GeneratorTensor_1
 {
@@ -379,7 +379,7 @@ int main(int argc, char* argv[])
 #elif 0
     device_convolution_implicit_gemm_v3_nchw_cyxk_nkhw(
         in_nchw_desc, in_nchw, wei_kcyx_desc, wei_kcyx, out_nkhw_desc, out_nkhw_device, nrepeat);
-#elif 1
+#elif 0
     device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw(in_nchw_desc,
                                                          in_nchw,
                                                          wei_kcyx_desc,
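-- 

For context: the hunks above only toggle preprocessor selectors between the
static_ford/static_for bodies and the newly added runtime ford/for fallbacks;
the two loop constructs themselves live elsewhere under
composable_kernel/include/utility and are not part of this diff. Below is a
minimal, hypothetical 2-d sketch of the distinction the commit message relies
on. It is not CK's actual implementation (the real ford/static_ford are
N-dimensional and driven by Sequence lengths); the names ford_2d and
static_ford_2d are made up for illustration.

    #include <cstdio>
    #include <initializer_list>
    #include <utility>

    // Runtime nested loop ("ford"-style): one copy of the body; i and j are
    // ordinary induction variables the compiler can keep in a few registers.
    template <int L0, int L1, class F>
    void ford_2d(F f)
    {
        for(int i = 0; i < L0; ++i)
            for(int j = 0; j < L1; ++j)
                f(i, j);
    }

    // Compile-time nested loop ("static_ford"-style): pack expansion stamps
    // out one copy of the body per (i, j) pair, with both indices carried as
    // integral_constants. Every offset derived from them folds into a distinct
    // constant, so far more values can be live at once, which is a plausible
    // source of the register spill this patch works around.
    template <int I, class F, int... Js>
    void static_ford_2d_row(F f, std::integer_sequence<int, Js...>)
    {
        (void)std::initializer_list<int>{
            (f(std::integral_constant<int, I>{}, std::integral_constant<int, Js>{}), 0)...};
    }

    template <int L1, class F, int... Is>
    void static_ford_2d_impl(F f, std::integer_sequence<int, Is...>)
    {
        (void)std::initializer_list<int>{
            (static_ford_2d_row<Is>(f, std::make_integer_sequence<int, L1>{}), 0)...};
    }

    template <int L0, int L1, class F>
    void static_ford_2d(F f)
    {
        static_ford_2d_impl<L1>(f, std::make_integer_sequence<int, L0>{});
    }

    int main()
    {
        auto body = [](auto i, auto j) { std::printf("(%d, %d)\n", int(i), int(j)); };

        ford_2d<2, 3>(body);        // runtime indices, one instance of the body
        static_ford_2d<2, 3>(body); // fully unrolled, compile-time indices
        return 0;
    }

Both versions visit the same index space in the same order; the only difference
is whether the body is instantiated once (ford) or L0*L1 times with constant
indices (static_ford), which is exactly what the #if/#else branches added in
threadwise_generic_tensor_slice_copy.hpp switch between.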