diff --git a/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer.hpp b/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer.hpp
index 9747bf5e55..62dbcae361 100644
--- a/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer.hpp
+++ b/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer.hpp
@@ -47,7 +47,8 @@ template <...>
-          index_t WeiBlockCopyDstDataPerWrite_K>
+          index_t WeiBlockCopyDstDataPerWrite_K,
+          index_t OutThreadCopyDataPerAccess_W>
 struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer
 {
     __device__ void Run(const Float* const __restrict__ p_in_global,
@@ -155,7 +156,7 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer
         static_assert(in_e_n1_b_n2_block_desc.GetStride(I1) % GemmDataPerReadB == 0,
                       "GemmDataPerReadB alignment requirement is not satisfied");

-#if 1
+#if 0
        // input blockwise copy
        // slice a merged tensor, reorder and copy to a normal tensor
        // this copy operator already has blockwise offset built-in
@@ -184,7 +185,13 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer
            decltype(in_e_n1_b_n2_block_desc.GetLengths()),
            InBlockCopySubLengths_E_N1_B_N2,
            InBlockCopyClusterLengths_E_N1_B_N2,
-           InBlockCopyThreadClusterArrangeOrder>({0, 0, b_block_data_on_global, 0}, {0, 0, 0, 0});
+           InBlockCopyThreadClusterArrangeOrder,
+           InBlockCopySrcAccessOrder,
+           InBlockCopyDstAccessOrder,
+           2,
+           3,
+           InBlockCopySrcDataPerRead_B,
+           InBlockCopyDstDataPerWrite_N2>({0, 0, b_block_data_on_global, 0}, {0, 0, 0, 0});
 #endif

        // weight tensor
@@ -198,7 +205,7 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer
            Sequence<...>{}, Number<...>{});

-#if 1
+#if 0
        // operator for blockwise copy of weight into LDS
        // slice a tensor, and copy it into another tensor
        // this copy operator already has blockwise offset built-in
@@ -227,7 +234,13 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer
            decltype(wei_e_k_block_desc.GetLengths()),
            WeiBlockCopySubLengths_E_K,
            WeiBlockCopyClusterLengths_E_K,
-           WeiBlockCopyThreadClusterArrangeOrder>({0, k_block_data_on_global}, {0, 0});
+           WeiBlockCopyThreadClusterArrangeOrder,
+           WeiBlockCopySrcAccessOrder,
+           WeiBlockCopyDstAccessOrder,
+           0,
+           1,
+           WeiBlockCopySrcDataPerRead_E,
+           WeiBlockCopyDstDataPerWrite_K>({0, k_block_data_on_global}, {0, 0});
 #endif

        // GEMM definition
@@ -322,7 +335,7 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer
        Float p_in_register_buffer[blockwise_in_copy.GetRegisterBufferSize()];
        Float p_wei_register_buffer[blockwise_wei_copy.GetRegisterBufferSize()];

-#if 1
+#if 0
            blockwise_in_copy.MoveSlicingWindowOnSourceTensor(I0, Number<EPerBlock>{}, True);
            // blockwise_wei_copy.MoveSlicingWindowOnSourceTensor(I0, Number<EPerBlock>{},
            // True);
@@ -354,7 +367,7 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer
            Float p_in_register_buffer[blockwise_in_copy.GetRegisterBufferSize()];
            Float p_wei_register_buffer[blockwise_wei_copy.GetRegisterBufferSize()];

-#if 1
+#if 0
            blockwise_in_copy.MoveSlicingWindowOnSourceTensor(I0, Number<EPerBlock>{}, True);
            // blockwise_wei_copy.MoveSlicingWindowOnSourceTensor(I0, Number<EPerBlock>{}, True);
            p_wei_block_on_global += EPerBlock * wei_e_k_global_desc.GetStride(I0);
@@ -434,7 +447,7 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer
            out_k_n1_b_n2_global_merged_desc.GetOffsetFromMultiIndex(
                k_thread_data_on_global, 0, b_thread_data_on_global, 0);

-#if 1
+#if 0
        ThreadwiseGenericTensorSliceCopy_v1r2<
            decltype(out_n0_n1_n2_k0_k1_k2_h_w_thread_desc),
            decltype(out_n0_n1_n2_k0_k1_k2_h_w_global_mem_desc),
@@ -445,12 +458,18 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer
            1>(make_zero_array<index_t, 8>(), make_zero_array<index_t, 8>())
            .Run(p_out_thread, p_out_thread_on_global);
 #elif 1
-       ThreadwiseGenericTensorSliceCopy_v2<
+       ThreadwiseGenericTensorSliceCopy_v2r1<
            decltype(out_n0_n1_n2_k0_k1_k2_h_w_thread_desc),
            decltype(out_n0_n1_n2_k0_k1_k2_h_w_global_mem_desc),
            NormalTensorCoordinate<decltype(out_n0_n1_n2_k0_k1_k2_h_w_thread_desc)>,
            MergedTensorCoordinate<decltype(out_n0_n1_n2_k0_k1_k2_h_w_global_mem_desc)>,
-           decltype(out_n0_n1_n2_k0_k1_k2_h_w_thread_desc.GetLengths())>(
+           decltype(out_n0_n1_n2_k0_k1_k2_h_w_thread_desc.GetLengths()),
+           arithmetic_sequence_gen<0, 8, 1>::type,
+           arithmetic_sequence_gen<0, 8, 1>::type,
+           7,
+           7,
+           1,
+           1>(
            {0, 0, 0, 0, 0, 0, 0, 0}, {0, 0, 0, 0, 0, 0, 0, 0})
            .Run(p_out_thread, p_out_thread_on_global);
 #endif
diff --git a/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw_lds_double_buffer.hpp b/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw_lds_double_buffer.hpp
index 168109da56..08491ddddd 100644
--- a/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw_lds_double_buffer.hpp
+++ b/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw_lds_double_buffer.hpp
@@ -44,7 +44,8 @@ template <...>
-          index_t WeiBlockCopyDstDataPerWrite_K>
+          index_t WeiBlockCopyDstDataPerWrite_K,
+          index_t OutThreadCopyDataPerAccess_B>
 struct GridwiseConvolutionImplicitGemm_v4r4_nchw_kcyx_nkhw_lds_double_buffer
 {
     __device__ void Run(const Float* const __restrict__ p_in_global,
@@ -138,7 +139,13 @@ struct GridwiseConvolutionImplicitGemm_v4r4_nchw_kcyx_nkhw_lds_double_buffer
            decltype(in_e_b_block_desc.GetLengths()),
            InBlockCopySubLengths_E_B,
            InBlockCopyClusterLengths_E_B,
-           InBlockCopyThreadClusterArrangeOrder>(
+           InBlockCopyThreadClusterArrangeOrder,
+           InBlockCopySrcAccessOrder,
+           InBlockCopyDstAccessOrder,
+           1,
+           1,
+           InBlockCopyDataPerAccess_B,
+           InBlockCopyDataPerAccess_B>(
            {0, b_block_data_on_global}, {0, 0});

        // weight tensor
@@ -164,7 +171,13 @@ struct GridwiseConvolutionImplicitGemm_v4r4_nchw_kcyx_nkhw_lds_double_buffer
            decltype(wei_e_k_block_desc.GetLengths()),
            WeiBlockCopySubLengths_E_K,
            WeiBlockCopyClusterLengths_E_K,
-           WeiBlockCopyThreadClusterArrangeOrder>({0, k_block_data_on_global}, {0, 0});
+           WeiBlockCopyThreadClusterArrangeOrder,
+           WeiBlockCopySrcAccessOrder,
+           WeiBlockCopyDstAccessOrder,
+           0,
+           1,
+           WeiBlockCopySrcDataPerRead_E,
+           WeiBlockCopyDstDataPerWrite_K>({0, k_block_data_on_global}, {0, 0});

        // GEMM definition
        // c_mtx += transpose(a_mtx) * b_mtx
@@ -349,15 +362,21 @@ struct GridwiseConvolutionImplicitGemm_v4r4_nchw_kcyx_nkhw_lds_double_buffer
        using OutThreadCopySliceLengths = Sequence<...>;

-       auto threadwise_out_copy = ThreadwiseGenericTensorSliceCopy_v2<
+       auto threadwise_out_copy = ThreadwiseGenericTensorSliceCopy_v2r1<
            decltype(out_k0_k1_b_thread_desc),
            decltype(out_k0_k1_b_global_desc),
            NormalTensorCoordinate<decltype(out_k0_k1_b_thread_desc)>,
            MergedTensorCoordinate<decltype(out_k0_k1_b_global_desc)>,
-           OutThreadCopySliceLengths>({0, 0, 0},
-                                      {k_thread_data_on_global / K1,
-                                       k_thread_data_on_global % K1,
-                                       b_thread_data_on_global});
+           OutThreadCopySliceLengths,
+           arithmetic_sequence_gen<0, 3, 1>::type,
+           arithmetic_sequence_gen<0, 3, 1>::type,
+           2,
+           2,
+           OutThreadCopyDataPerAccess_B,
+           OutThreadCopyDataPerAccess_B>({0, 0, 0},
+                                         {k_thread_data_on_global / K1,
+                                          k_thread_data_on_global % K1,
+                                          b_thread_data_on_global});

        for(index_t nrepeat = 0; nrepeat < GemmNRepeat; ++nrepeat)
        {
diff --git a/composable_kernel/include/tensor_operation/blockwise_generic_tensor_slice_copy.hpp b/composable_kernel/include/tensor_operation/blockwise_generic_tensor_slice_copy.hpp
index b20d2fd4d4..158d82bab9 100644
--- a/composable_kernel/include/tensor_operation/blockwise_generic_tensor_slice_copy.hpp
+++ b/composable_kernel/include/tensor_operation/blockwise_generic_tensor_slice_copy.hpp
@@ -412,7 +412,13 @@ template <...>
-          class ThreadClusterArrangeOrder>
+          class ThreadClusterArrangeOrder,
+          class SrcDimAccessOrder,
+          class DstDimAccessOrder,
+          index_t SrcVectorAccessDim,
+          index_t DstVectorAccessDim,
+          index_t SrcDataPerAccess,
+          index_t DstDataPerAccess>
 struct BlockwiseGenericTensorSliceCopy_v2
 {
     static constexpr index_t nDim = SrcDesc::GetNumOfDimension();
@@ -496,6 +502,7 @@ struct BlockwiseGenericTensorSliceCopy_v2
    private:
    using RegisterBufferDesc = decltype(make_ConstantTensorDescriptor_packed(SubLengths{}));

+#if 0
    using ThreadwiseLoad = ThreadwiseGenericTensorSliceCopy_v2<SrcDesc,
                                                               RegisterBufferDesc,
                                                               SrcCoordinate,
                                                               NormalTensorCoordinate<RegisterBufferDesc>,
                                                               SubLengths>;

    using ThreadwiseStore = ThreadwiseGenericTensorSliceCopy_v2<RegisterBufferDesc,
                                                                DstDesc,
                                                                NormalTensorCoordinate<RegisterBufferDesc>,
                                                                DstCoordinate,
                                                                SubLengths>;
+#else
+    using ThreadwiseLoad =
+        ThreadwiseGenericTensorSliceCopy_v2r1<SrcDesc,
+                                              RegisterBufferDesc,
+                                              SrcCoordinate,
+                                              NormalTensorCoordinate<RegisterBufferDesc>,
+                                              SubLengths,
+                                              SrcDimAccessOrder,
+                                              SrcDimAccessOrder,
+                                              SrcVectorAccessDim,
+                                              SrcVectorAccessDim,
+                                              SrcDataPerAccess,
+                                              1>;
+
+    using ThreadwiseStore =
+        ThreadwiseGenericTensorSliceCopy_v2r1<RegisterBufferDesc,
+                                              DstDesc,
+                                              NormalTensorCoordinate<RegisterBufferDesc>,
+                                              DstCoordinate,
+                                              SubLengths,
+                                              DstDimAccessOrder,
+                                              DstDimAccessOrder,
+                                              DstVectorAccessDim,
+                                              DstVectorAccessDim,
+                                              1,
+                                              DstDataPerAccess>;
+#endif

    ThreadwiseLoad mThreadwiseLoad;
    ThreadwiseStore mThreadwiseStore;
 };
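The wiring above is the crux of the blockwise change: each thread stages its sub-slice through a register buffer, with the load half vectorized only on the source side (SrcDataPerAccess wide, scalar writes into the buffer) and the store half vectorized only on the destination side (scalar reads from the buffer, DstDataPerAccess wide writes). Below is a minimal standalone sketch of that two-phase pattern, reduced to a contiguous 1-D slice with a hypothetical VecT standing in for CK's vector_type; an illustration, not the repo's API.

// Hypothetical sketch, not CK API: a stand-in for ck::vector_type<T, N>.
template <typename T, int N>
struct alignas(sizeof(T) * N) VecT
{
    T data[N];
};

// Two-phase copy of a contiguous 1-D slice: vectorized loads on the src side,
// vectorized stores on the dst side, staged through a register buffer.
// Assumes power-of-two access widths and suitably aligned pointers.
template <typename T, int SliceLen, int SrcPerAccess, int DstPerAccess>
__device__ void two_phase_copy(const T* p_src, T* p_dst)
{
    static_assert(SliceLen % SrcPerAccess == 0 && SliceLen % DstPerAccess == 0,
                  "access width must divide the slice length");

    T buffer[SliceLen]; // expected to stay in registers

    // load phase: SliceLen / SrcPerAccess wide reads, scalar unpacking into buffer
    for(int i = 0; i < SliceLen / SrcPerAccess; ++i)
    {
        const VecT<T, SrcPerAccess> v =
            *reinterpret_cast<const VecT<T, SrcPerAccess>*>(p_src + i * SrcPerAccess);
        for(int j = 0; j < SrcPerAccess; ++j)
            buffer[i * SrcPerAccess + j] = v.data[j];
    }

    // store phase: scalar packing from buffer, SliceLen / DstPerAccess wide writes
    for(int i = 0; i < SliceLen / DstPerAccess; ++i)
    {
        VecT<T, DstPerAccess> v;
        for(int j = 0; j < DstPerAccess; ++j)
            v.data[j] = buffer[i * DstPerAccess + j];
        *reinterpret_cast<VecT<T, DstPerAccess>*>(p_dst + i * DstPerAccess) = v;
    }
}

Decoupling the two widths is what lets, for example, the weight copy above read global memory 4-wide along E while writing LDS 1-wide along K.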
diff --git a/composable_kernel/include/tensor_operation/threadwise_generic_tensor_slice_copy.hpp b/composable_kernel/include/tensor_operation/threadwise_generic_tensor_slice_copy.hpp
index 9ab18f4f3a..5574e4d275 100644
--- a/composable_kernel/include/tensor_operation/threadwise_generic_tensor_slice_copy.hpp
+++ b/composable_kernel/include/tensor_operation/threadwise_generic_tensor_slice_copy.hpp
@@ -18,6 +18,10 @@
 #define CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V2 0
 #endif

+#ifndef CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V2R1
+#define CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V2R1 0
+#endif
+
 namespace ck {

 // This threadwise copy allow vector access of src and dst.
@@ -590,5 +594,313 @@ struct ThreadwiseGenericTensorSliceCopy_v2
     DstCoordinate mDstSliceOrigin;
 };

+#if 1
+// This threadwise copy allows vector access of both src and dst.
+// It allows the dimension of vector access to be different on src and dst.
+// It also allows the vector size to be different on src and dst.
+// It also allows the order of access to be different on src and dst.
+// It uses a register buffer to hold all data moving from src to dst.
+// It is designed for copying small amounts of data, where src and dst are
+// device memory or LDS.
+// When copying a large amount of data, let's hope the compiler will reduce the
+// registers used for the buffer.
+template <class SrcDesc,
+          class DstDesc,
+          class SrcCoordinate,
+          class DstCoordinate,
+          class SliceLengths,
+          class SrcDimAccessOrder,
+          class DstDimAccessOrder,
+          index_t SrcVectorAccessDim,
+          index_t DstVectorAccessDim,
+          index_t SrcDataPerAccess,
+          index_t DstDataPerAccess>
+struct ThreadwiseGenericTensorSliceCopy_v2r1
+{
+    static constexpr index_t nDim = SliceLengths::GetSize();
+
+    __device__ constexpr ThreadwiseGenericTensorSliceCopy_v2r1(SrcCoordinate src_slice_origin,
+                                                               DstCoordinate dst_slice_origin)
+        : mSrcSliceOrigin(src_slice_origin), mDstSliceOrigin(dst_slice_origin)
+    {
+    }
+
+    __device__ constexpr ThreadwiseGenericTensorSliceCopy_v2r1()
+        : ThreadwiseGenericTensorSliceCopy_v2r1(make_zero_array<index_t, nDim>(),
+                                                make_zero_array<index_t, nDim>())
+    {
+    }
+
+    __device__ void SetSrcSliceOrigin(SrcCoordinate src_slice_origin)
+    {
+        mSrcSliceOrigin = src_slice_origin;
+    }
+
+    __device__ void SetDstSliceOrigin(DstCoordinate dst_slice_origin)
+    {
+        mDstSliceOrigin = dst_slice_origin;
+    }
+
+    template <class TDesc, class Lengths>
+    struct IsolateMergedDimLengths
+    {
+        template <class IDim>
+        __device__ constexpr index_t operator()(IDim idim) const
+        {
+            return TDesc::ContainMultipleOriginalDimensions(idim) ? Lengths{}[idim] : 1;
+        }
+    };
+
+    template <class TData>
+    __device__ void Run(const TData* p_src, TData* p_dst) const
+    {
+        constexpr auto buffer_desc = make_ConstantTensorDescriptor_packed(SliceLengths{});
+
+        TData p_buffer_[buffer_desc.GetElementSpace()];
+        TData* p_buffer = p_buffer_;
+
+        // copy data from src into buffer
+        {
+            using src_vector_t = typename vector_type<TData, SrcDataPerAccess>::MemoryType;
+
+            constexpr auto src_vector_access_dim = Number<SrcVectorAccessDim>{};
+            constexpr auto src_data_per_access   = Number<SrcDataPerAccess>{};
+
+            constexpr auto src_access_lengths = SliceLengths::Modify(
+                src_vector_access_dim,
+                SliceLengths::Get(src_vector_access_dim) / src_data_per_access);
+
+            // Offsets w.r.t. merged dimensions need to be calculated at run-time. Offsets
+            // w.r.t. normal dimensions are known at compile-time.
+            // Below is a hack to isolate merged dimension ids from normal dimension ids, so
+            // the corresponding offsets can be calculated separately at run-time and
+            // compile-time.
+            // src_merged_dim_access_lengths has the same value as src_access_lengths on
+            // src's merged dimensions, and has value 1 on normal dimensions;
+            // src_normal_dim_access_lengths has the same value as src_access_lengths on
+            // src's normal dimensions, and has value 1 on merged dimensions.
+            constexpr auto src_merged_dim_access_lengths = typename sequence_gen<
+                nDim,
+                IsolateMergedDimLengths<SrcDesc, decltype(src_access_lengths)>>::type{};
+
+            constexpr auto src_normal_dim_access_lengths =
+                src_access_lengths + Number<1>{} - src_merged_dim_access_lengths;
+
+#if CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V2R1
+            // offsets w.r.t. merged dimensions need to be computed at run-time
+            static_ford<decltype(src_merged_dim_access_lengths)>{}([&](
+                auto src_merged_dim_access_id_) {
+
+                constexpr auto src_merged_dim_access_id = decltype(src_merged_dim_access_id_){};
+
+                constexpr auto src_merged_dim_data_id = src_merged_dim_access_id.Modify(
+                    src_vector_access_dim,
+                    src_merged_dim_access_id[src_vector_access_dim] * src_data_per_access);
+
+                const TData* p_src_tmp =
+                    p_src + (mSrcSliceOrigin + src_merged_dim_data_id).GetOffset();
+
+                // offsets w.r.t. normal dimensions can be computed at compile-time
+                static_ford<decltype(src_normal_dim_access_lengths)>{}([&](
+                    auto src_normal_dim_access_id_) {
+
+                    constexpr auto src_normal_dim_access_id =
+                        decltype(src_normal_dim_access_id_){};
+
+                    constexpr auto src_normal_dim_data_id = src_normal_dim_access_id.Modify(
+                        src_vector_access_dim,
+                        src_normal_dim_access_id[src_vector_access_dim] * src_data_per_access);
+
+                    constexpr index_t src_normal_offset =
+                        SrcDesc::GetOffsetFromMultiIndex(src_normal_dim_data_id);
+
+                    // load vector from src
+                    const src_vector_t vector_data =
+                        *reinterpret_cast<const src_vector_t*>(&p_src_tmp[src_normal_offset]);
+
+                    // unpack vector into buffer
+                    static_for<0, SrcDataPerAccess, 1>{}([&](auto i) {
+                        constexpr auto scalar_id =
+                            typename uniform_sequence_gen<nDim, 0>::type{}.Modify(
+                                src_vector_access_dim, i);
+
+                        constexpr index_t buffer_offset = buffer_desc.GetOffsetFromMultiIndex(
+                            src_merged_dim_data_id + src_normal_dim_data_id + scalar_id);
+
+                        p_buffer[buffer_offset] = reinterpret_cast<const TData*>(&vector_data)[i];
+                    });
+                });
+            });
+#else
+            ford<decltype(src_merged_dim_access_lengths)>{}([&](
+                auto src_merged_dim_access_id) {
+
+                auto src_merged_dim_data_id = src_merged_dim_access_id;
+                src_merged_dim_data_id(src_vector_access_dim) =
+                    src_merged_dim_access_id[src_vector_access_dim] * src_data_per_access;
+
+                const TData* p_src_tmp =
+                    p_src + (mSrcSliceOrigin + src_merged_dim_data_id).GetOffset();
+
+                // these should be compile-time known
+                ford<decltype(src_normal_dim_access_lengths)>{}([&](
+                    auto src_normal_dim_access_id) {
+
+                    auto src_normal_dim_data_id = src_normal_dim_access_id;
+                    src_normal_dim_data_id(src_vector_access_dim) =
+                        src_normal_dim_access_id[src_vector_access_dim] * src_data_per_access;
+
+                    const index_t src_normal_offset =
+                        SrcDesc::GetOffsetFromMultiIndex(src_normal_dim_data_id);
+
+                    // load vector from src
+                    const src_vector_t vector_data =
+                        *reinterpret_cast<const src_vector_t*>(&p_src_tmp[src_normal_offset]);
+
+                    // unpack vector into buffer
+                    for(index_t i = 0; i < SrcDataPerAccess; ++i)
+                    {
+                        auto scalar_id                   = make_zero_array<index_t, nDim>();
+                        scalar_id(src_vector_access_dim) = i;
+
+                        const index_t buffer_offset = buffer_desc.GetOffsetFromMultiIndex(
+                            src_merged_dim_data_id + src_normal_dim_data_id + scalar_id);
+
+                        p_buffer[buffer_offset] = reinterpret_cast<const TData*>(&vector_data)[i];
+                    }
+                });
+            });
+#endif
+        }
+
+        // copy data from buffer into dst
+        {
+            using dst_vector_t = typename vector_type<TData, DstDataPerAccess>::MemoryType;
+
+            constexpr auto dst_vector_access_dim = Number<DstVectorAccessDim>{};
+            constexpr auto dst_data_per_access   = Number<DstDataPerAccess>{};
+
+            constexpr auto dst_access_lengths = SliceLengths::Modify(
+                dst_vector_access_dim,
+                SliceLengths::Get(dst_vector_access_dim) / dst_data_per_access);
+
+            constexpr auto dst_merged_dim_access_lengths = typename sequence_gen<
+                nDim,
+                IsolateMergedDimLengths<DstDesc, decltype(dst_access_lengths)>>::type{};
+
+            constexpr auto dst_normal_dim_access_lengths =
+                dst_access_lengths + Number<1>{} - dst_merged_dim_access_lengths;
+
+#if CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V2R1
+            // offsets w.r.t. merged dimensions need to be computed at run-time
+            static_ford<decltype(dst_merged_dim_access_lengths)>{}([&](
+                auto dst_merged_dim_access_id_) {
+
+                constexpr auto dst_merged_dim_access_id = decltype(dst_merged_dim_access_id_){};
+
+                constexpr auto dst_merged_dim_data_id = dst_merged_dim_access_id.Modify(
+                    dst_vector_access_dim,
+                    dst_merged_dim_access_id[dst_vector_access_dim] * dst_data_per_access);
+
+                TData* p_dst_tmp = p_dst + (mDstSliceOrigin + dst_merged_dim_data_id).GetOffset();
+
+                // offsets w.r.t. normal dimensions can be computed at compile-time
+                static_ford<decltype(dst_normal_dim_access_lengths)>{}([&](
+                    auto dst_normal_dim_access_id_) {
+
+                    constexpr auto dst_normal_dim_access_id =
+                        decltype(dst_normal_dim_access_id_){};
+
+                    constexpr auto dst_normal_dim_data_id = dst_normal_dim_access_id.Modify(
+                        dst_vector_access_dim,
+                        dst_normal_dim_access_id[dst_vector_access_dim] * dst_data_per_access);
+
+                    dst_vector_t vector_data;
+
+                    // pack vector from buffer
+                    static_for<0, DstDataPerAccess, 1>{}([&](auto i) {
+                        constexpr auto scalar_id =
+                            typename uniform_sequence_gen<nDim, 0>::type{}.Modify(
+                                dst_vector_access_dim, i);
+
+                        constexpr index_t buffer_offset = buffer_desc.GetOffsetFromMultiIndex(
+                            dst_merged_dim_data_id + dst_normal_dim_data_id + scalar_id);
+
+                        reinterpret_cast<TData*>(&vector_data)[i] = p_buffer[buffer_offset];
+                    });
+
+                    constexpr index_t dst_normal_offset =
+                        DstDesc::GetOffsetFromMultiIndex(dst_normal_dim_data_id);
+
+                    // write vector into dst
+                    *reinterpret_cast<dst_vector_t*>(&p_dst_tmp[dst_normal_offset]) = vector_data;
+                });
+            });
+#else
+            // offsets w.r.t. merged dimensions need to be computed at run-time
+            ford<decltype(dst_merged_dim_access_lengths)>{}([&](
+                auto dst_merged_dim_access_id) {
+
+                auto dst_merged_dim_data_id = dst_merged_dim_access_id;
+                dst_merged_dim_data_id(dst_vector_access_dim) =
+                    dst_merged_dim_access_id[dst_vector_access_dim] * dst_data_per_access;
+
+                TData* p_dst_tmp = p_dst + (mDstSliceOrigin + dst_merged_dim_data_id).GetOffset();
+
+                // offsets w.r.t. normal dimensions can be computed at compile-time
+                ford<decltype(dst_normal_dim_access_lengths)>{}([&](
+                    auto dst_normal_dim_access_id) {
+
+                    auto dst_normal_dim_data_id = dst_normal_dim_access_id;
+                    dst_normal_dim_data_id(dst_vector_access_dim) =
+                        dst_normal_dim_access_id[dst_vector_access_dim] * dst_data_per_access;
+
+                    dst_vector_t vector_data;
+
+                    // pack vector from buffer
+                    for(index_t i = 0; i < DstDataPerAccess; ++i)
+                    {
+                        auto scalar_id                   = make_zero_array<index_t, nDim>();
+                        scalar_id(dst_vector_access_dim) = i;
+
+                        const index_t buffer_offset = buffer_desc.GetOffsetFromMultiIndex(
+                            dst_merged_dim_data_id + dst_normal_dim_data_id + scalar_id);
+
+                        reinterpret_cast<TData*>(&vector_data)[i] = p_buffer[buffer_offset];
+                    }
+
+                    const index_t dst_normal_offset =
+                        DstDesc::GetOffsetFromMultiIndex(dst_normal_dim_data_id);
+
+                    // write vector into dst
+                    *reinterpret_cast<dst_vector_t*>(&p_dst_tmp[dst_normal_offset]) = vector_data;
+                });
+            });
+#endif
+        }
+    }
+
+    // T can be Sequence or Array
+    template <class T, bool PositiveDirection>
+    __device__ void MoveSrcSlicingWindow(T step_sizes,
+                                         integral_constant<bool, PositiveDirection>)
+    {
+        static_if<PositiveDirection>{}([&](auto) {
+            mSrcSliceOrigin += step_sizes;
+        }).Else([&](auto) { mSrcSliceOrigin -= step_sizes; });
+    }
+
+    template <class T, bool PositiveDirection>
+    __device__ void MoveDstSlicingWindow(T step_sizes,
+                                         integral_constant<bool, PositiveDirection>)
+    {
+        static_if<PositiveDirection>{}([&](auto) {
+            mDstSliceOrigin += step_sizes;
+        }).Else([&](auto) { mDstSliceOrigin -= step_sizes; });
+    }
+
+    private:
+    SrcCoordinate mSrcSliceOrigin;
+    DstCoordinate mDstSliceOrigin;
+};
+#endif
+
 } // namespace ck
 #endif
diff --git a/composable_kernel/include/utility/config_amd.hpp.in b/composable_kernel/include/utility/config_amd.hpp.in
index aed2947c7d..a99f68aba9 100644
--- a/composable_kernel/include/utility/config_amd.hpp.in
+++ b/composable_kernel/include/utility/config_amd.hpp.in
@@ -11,6 +11,7 @@
 #define CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V1R1 0
 #define CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V1R2 0
 #define CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V2 0
+#define CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V2R1 0

 namespace ck {
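The merged/normal split that v2r1's Run() performs is, at its core, loop-invariant hoisting: each step along a merged dimension (one whose offset must go through the coordinate's GetOffset() at run time) pays its offset computation once, while steps along normal dimensions contribute offsets that are pure compile-time constants. Below is a simplified 2-D sketch of that idea, with a plain run-time stride standing in for CK's merged-coordinate arithmetic; a hypothetical function, not the repo's API.

// Hypothetical 2-D sketch, not CK API: dim 0 plays the "merged" role (stride
// known only at run time), dim 1 plays the "normal" role (stride is a
// compile-time constant). The total offset splits into a run-time part hoisted
// out of the inner loop and a constexpr part that folds into immediates.
template <int Stride1, int Len0, int Len1>
__device__ void split_offset_copy(const float* p_src, float* p_dst, int stride0)
{
    for(int i0 = 0; i0 < Len0; ++i0) // merged dim: run-time offset, computed once
    {
        const float* p_src_tmp = p_src + i0 * stride0;

        for(int i1 = 0; i1 < Len1; ++i1) // normal dim: compile-time offset
        {
            p_dst[i0 * Len1 + i1] = p_src_tmp[i1 * Stride1];
        }
    }
}

In the real implementation, when the V2R1 experimental macro is set, the inner traversal is a static_ford over the normal-dimension access lengths, so the per-step offset is a constexpr rather than a run-time value.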
diff --git a/composable_kernel/include/utility/config_nvidia.hpp.in b/composable_kernel/include/utility/config_nvidia.hpp.in
index 3599dc8f8a..e9842eda35 100644
--- a/composable_kernel/include/utility/config_nvidia.hpp.in
+++ b/composable_kernel/include/utility/config_nvidia.hpp.in
@@ -13,6 +13,7 @@
 #define CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V1R1 0
 #define CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V1R2 0
 #define CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V2 0
+#define CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V2R1 0

 namespace ck {

diff --git a/driver/include/device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw.hpp b/driver/include/device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw.hpp
index 8a40a60e24..1aa4590488 100644
--- a/driver/include/device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw.hpp
+++ b/driver/include/device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw.hpp
@@ -3,7 +3,7 @@
 #include "device.hpp"
 #include "tensor.hpp"
 #include "gridwise_convolution_kernel_wrapper.hpp"
-#include "gridwise_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw.hpp"
+//#include "gridwise_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw.hpp"
 #include "gridwise_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer.hpp"

 template <...>
@@ ... @@
                           ...
-                          WeiBlockCopyDstDataPerWrite_K>{};
+                          WeiBlockCopyDstDataPerWrite_K,
+                          OutThreadCopyDataPerAccess_W>{};

    float time = launch_kernel(run_gridwise_convolution_kernel,
                               dim3(GridSize),
diff --git a/driver/include/device_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw.hpp b/driver/include/device_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw.hpp
index e1f950739a..8c9a791bdd 100644
--- a/driver/include/device_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw.hpp
+++ b/driver/include/device_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw.hpp
@@ -3,7 +3,7 @@
 #include "device.hpp"
 #include "tensor.hpp"
 #include "gridwise_convolution_kernel_wrapper.hpp"
-#include "gridwise_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw.hpp"
+//#include "gridwise_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw.hpp"
 #include "gridwise_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw_lds_double_buffer.hpp"

 using namespace ck;
@@ -55,7 +55,6 @@ void device_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw(InDesc,
    out_nkhw_device_buf.ToDevice(out_nkhw.mData.data());

 #if 1
-    // 1x1 filter, 8x8 image
    constexpr index_t BlockSize = 256;

    constexpr index_t BPerBlock = 128;
@@ -86,8 +85,45 @@ void device_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw(InDesc,
    using WeiBlockCopySrcAccessOrder = Sequence<1, 0>; // [K, E]
    using WeiBlockCopyDstAccessOrder = Sequence<0, 1>; // [E, K]

-    constexpr index_t WeiBlockCopySrcDataPerRead_E  = 1;
+    constexpr index_t WeiBlockCopySrcDataPerRead_E  = 4;
    constexpr index_t WeiBlockCopyDstDataPerWrite_K = 1;
+
+    constexpr index_t OutThreadCopyDataPerAccess_B = 1;
+#elif 0 // debug
+    constexpr index_t BlockSize = 256;
+
+    constexpr index_t BPerBlock = 128;
+    constexpr index_t KPerBlock = 128;
+    constexpr index_t EPerBlock = 8;
+
+    constexpr index_t GemmMPerThreadSubC = 4;
+    constexpr index_t GemmNPerThreadSubC = 4;
+    constexpr index_t GemmMLevel0Cluster = 4;
+    constexpr index_t GemmNLevel0Cluster = 4;
+    constexpr index_t GemmMLevel1Cluster = 4;
+    constexpr index_t GemmNLevel1Cluster = 4;
+    constexpr index_t GemmKPerThreadLoop = 1;
+    constexpr index_t GemmDataPerReadA   = 4;
+    constexpr index_t GemmDataPerReadB   = 4;
+
+    using InBlockCopySubLengths_E_B     = Sequence<1, 4>;
+    using InBlockCopyClusterLengths_E_B = Sequence<8, 32>;
+    using InBlockCopyThreadClusterArrangeOrder = Sequence<0, 1>; // [E, B]
+    using InBlockCopySrcAccessOrder            = Sequence<0, 1>; // [E, B]
+    using InBlockCopyDstAccessOrder            = Sequence<0, 1>; // [E, B]
+
+    constexpr index_t InBlockCopyDataPerAccess_B = 1;
+
+    using WeiBlockCopySubLengths_E_K     = Sequence<4, 1>;
+    using WeiBlockCopyClusterLengths_E_K = Sequence<2, 128>;
+    using WeiBlockCopyThreadClusterArrangeOrder = Sequence<1, 0>; // [K, E]
+    using WeiBlockCopySrcAccessOrder            = Sequence<1, 0>; // [K, E]
+    using WeiBlockCopyDstAccessOrder            = Sequence<0, 1>; // [E, K]
+
+    constexpr index_t WeiBlockCopySrcDataPerRead_E  = 4;
+    constexpr index_t WeiBlockCopyDstDataPerWrite_K = 1;
+
+    constexpr index_t OutThreadCopyDataPerAccess_B = 1;
 #elif 1
    // 1x1 filter, 8x8 image
    constexpr index_t BlockSize = 256;
@@ -106,13 +142,13 @@ void device_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw(InDesc,
    constexpr index_t GemmDataPerReadA = 4;
    constexpr index_t GemmDataPerReadB = 4;

-    using InBlockCopySubLengths_E_B     = Sequence<2, 2>;
-    using InBlockCopyClusterLengths_E_B = Sequence<4, 64>;
+    using InBlockCopySubLengths_E_B     = Sequence<1, 4>;
+    using InBlockCopyClusterLengths_E_B = Sequence<8, 32>;
    using InBlockCopyThreadClusterArrangeOrder = Sequence<0, 1>; // [E, B]
    using InBlockCopySrcAccessOrder            = Sequence<0, 1>; // [E, B]
    using InBlockCopyDstAccessOrder            = Sequence<0, 1>; // [E, B]

-    constexpr index_t InBlockCopyDataPerAccess_B = 1;
+    constexpr index_t InBlockCopyDataPerAccess_B = 4;

    using WeiBlockCopySubLengths_E_K     = Sequence<4, 1>;
    using WeiBlockCopyClusterLengths_E_K = Sequence<2, 128>;
@@ -120,8 +156,10 @@ void device_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw(InDesc,
    using WeiBlockCopySrcAccessOrder = Sequence<1, 0>; // [K, E]
    using WeiBlockCopyDstAccessOrder = Sequence<0, 1>; // [E, K]

-    constexpr index_t WeiBlockCopySrcDataPerRead_E  = 1;
+    constexpr index_t WeiBlockCopySrcDataPerRead_E  = 4;
    constexpr index_t WeiBlockCopyDstDataPerWrite_K = 1;
+
+    constexpr index_t OutThreadCopyDataPerAccess_B = 4;
 #endif

    constexpr index_t B = N * Ho * Wo;
@@ -169,7 +207,8 @@ void device_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw(InDesc,
                          WeiBlockCopySrcAccessOrder,
                          WeiBlockCopyDstAccessOrder,
                          WeiBlockCopySrcDataPerRead_E,
-                         WeiBlockCopyDstDataPerWrite_K>{};
+                         WeiBlockCopyDstDataPerWrite_K,
+                         OutThreadCopyDataPerAccess_B>{};

    for(index_t i = 0; i < nrepeat; ++i)
    {
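For the retuned v4r4 parameters above, the legality conditions are worth spelling out: the per-thread sub-lengths times the thread-cluster lengths must tile the block slice, and each DataPerAccess value must divide the sub-length along its vector dimension. Below is a hedged compile-time sketch of those checks using the numbers from this diff; valid_copy_dim is a hypothetical helper, not part of the driver.

// Hypothetical helper: one dimension of a blockwise copy is legal when the
// thread cluster tiles the block slice and the vector width divides the
// per-thread sub-length.
constexpr bool valid_copy_dim(int sub_len, int cluster, int block_len, int data_per_access)
{
    return sub_len * cluster == block_len && sub_len % data_per_access == 0;
}

// Input copy: SubLengths_E_B = <1, 4>, ClusterLengths_E_B = <8, 32>,
// EPerBlock = 8, BPerBlock = 128, InBlockCopyDataPerAccess_B = 4.
static_assert(valid_copy_dim(1, 8, 8, 1), "input copy, E dim");
static_assert(valid_copy_dim(4, 32, 128, 4), "input copy, B dim");
static_assert(8 * 32 == 256, "input copy should use all BlockSize threads");

// Weight copy: SubLengths_E_K = <4, 1>, ClusterLengths_E_K = <2, 128>,
// WeiBlockCopySrcDataPerRead_E = 4, WeiBlockCopyDstDataPerWrite_K = 1.
static_assert(valid_copy_dim(4, 2, 8, 4), "weight copy, E dim");
static_assert(valid_copy_dim(1, 128, 128, 1), "weight copy, K dim");
static_assert(2 * 128 == 256, "weight copy should use all BlockSize threads");

OutThreadCopyDataPerAccess_B = 4 additionally relies on each thread's output elements being contiguous along B, which the v4r4 thread layout is expected to provide here (GemmNPerThreadSubC = 4 elements per sub-tile along B).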