diff --git a/driver/device_convolution_direct_v2_nchw_kcyx_nkhw.hpp b/driver/device_convolution_direct_v2_nchw_kcyx_nkhw.hpp index a18a7be5c5..7e20cccac5 100644 --- a/driver/device_convolution_direct_v2_nchw_kcyx_nkhw.hpp +++ b/driver/device_convolution_direct_v2_nchw_kcyx_nkhw.hpp @@ -47,8 +47,8 @@ void device_convolution_direct_v2_nchw_kcyx_nkhw(InDesc, constexpr index_t HoPerThread = 2; constexpr index_t WoPerThread = 2; - constexpr index_t InBlockCopyDataPerRead = 2; - constexpr index_t WeiBlockCopyDataPerRead = 4; + constexpr index_t InBlockCopyDataPerRead = 1; + constexpr index_t WeiBlockCopyDataPerRead = 1; constexpr index_t BlockSize = 128; #endif diff --git a/driver/device_convolution_implicit_gemm_v1_nchw_cyxk_nkhw.hpp b/driver/device_convolution_implicit_gemm_v1_nchw_cyxk_nkhw.hpp index 04e2baf225..39fcaef9be 100644 --- a/driver/device_convolution_implicit_gemm_v1_nchw_cyxk_nkhw.hpp +++ b/driver/device_convolution_implicit_gemm_v1_nchw_cyxk_nkhw.hpp @@ -92,7 +92,7 @@ void device_convolution_implicit_gemm_v1_nchw_cyxk_nkhw(InDesc, constexpr index_t WeiBlockCopyDataPerRead_K = 4; constexpr index_t OutThreadCopyDataPerWrite_W = 2; -#elif 1 +#elif 0 // for 3x3, 34x34, v1r3, Vega 20, WoPerBlock = 32 constexpr index_t BlockSize = 256; diff --git a/driver/device_convolution_implicit_gemm_v3_nchw_cyxk_nkhw.hpp b/driver/device_convolution_implicit_gemm_v3_nchw_cyxk_nkhw.hpp index ee8028e55c..83f610eea7 100644 --- a/driver/device_convolution_implicit_gemm_v3_nchw_cyxk_nkhw.hpp +++ b/driver/device_convolution_implicit_gemm_v3_nchw_cyxk_nkhw.hpp @@ -85,6 +85,9 @@ void device_convolution_implicit_gemm_v3_nchw_cyxk_nkhw(InDesc, constexpr index_t InBlockCopySrcDataPerRead_B = 1; constexpr index_t InBlockCopyDstDataPerWrite_N2 = 4; + using WeiBlockCopySubLengths_C_K = Sequence<1, 4>; + using WeiBlockCopyClusterLengths_C_K = Sequence<8, 32>; + constexpr index_t WeiBlockCopyDataPerAccess_K = 4; #endif @@ -123,8 +126,11 @@ void device_convolution_implicit_gemm_v3_nchw_cyxk_nkhw(InDesc, InBlockCopyClusterLengths_N1_N2_C_B, InBlockCopySrcDataPerRead_B, InBlockCopyDstDataPerWrite_N2, + WeiBlockCopySubLengths_C_K, + WeiBlockCopyClusterLengths_C_K, WeiBlockCopyDataPerAccess_K>{}; +#if 1 float time = launch_kernel(run_gridwise_convolution, dim3(GridSize), dim3(BlockSize), @@ -138,6 +144,7 @@ void device_convolution_implicit_gemm_v3_nchw_cyxk_nkhw(InDesc, (float)calculate_convolution_flops(InDesc{}, WeiDesc{}, OutDesc{}) / (std::size_t(1000) * 1000 * 1000) / time); usleep(std::min(time * 1000, float(10000))); +#endif } out_nkhw_device_buf.FromDevice(out_nkhw.mData.data()); diff --git a/driver/driver.hip.cpp b/driver/driver.hip.cpp index 46cad0a87c..c7f988f129 100644 --- a/driver/driver.hip.cpp +++ b/driver/driver.hip.cpp @@ -411,7 +411,18 @@ void check_error(const Tensor& ref, const Tensor& result) int main(int argc, char* argv[]) { -#if 1 +#if 0 + constexpr index_t N = 8; + constexpr index_t C = 8; + constexpr index_t HI = 3; + constexpr index_t WI = 18; + constexpr index_t K = 128; + constexpr index_t Y = 3; + constexpr index_t X = 3; + + constexpr index_t HPad = 0; + constexpr index_t WPad = 0; +#elif 1 // 3x3, 34x34 constexpr index_t N = 64; constexpr index_t C = 256; @@ -635,11 +646,13 @@ int main(int argc, char* argv[]) if(do_verification) { +#if 1 if(Y == 3 && X == 3) { host_winograd_3x3_convolution(in_nchw, wei_kcyx, out_nkhw_host, lower_pads, upper_pads); } else +#endif { host_direct_convolution(in_nchw, wei_kcyx, out_nkhw_host, lower_pads, upper_pads); } diff --git a/src/include/Array.hip.hpp b/src/include/Array.hip.hpp index 96dc7d3996..7f15448293 100644 --- a/src/include/Array.hip.hpp +++ b/src/include/Array.hip.hpp @@ -57,8 +57,8 @@ __host__ __device__ constexpr auto make_zero_array() } template -__host__ __device__ auto reorder_array_given_new2old(const Array& old_array, - Sequence new2old) +__host__ __device__ constexpr auto reorder_array_given_new2old(const Array& old_array, + Sequence new2old) { Array new_array; @@ -73,8 +73,8 @@ __host__ __device__ auto reorder_array_given_new2old(const Array& } template -__host__ __device__ auto reorder_array_given_old2new(const Array& old_array, - Sequence old2new) +__host__ __device__ constexpr auto reorder_array_given_old2new(const Array& old_array, + Sequence old2new) { Array new_array; @@ -89,7 +89,7 @@ __host__ __device__ auto reorder_array_given_old2new(const Array& } template -__host__ __device__ auto extract_array(const Array& old_array, ExtractSeq) +__host__ __device__ constexpr auto extract_array(const Array& old_array, ExtractSeq) { Array new_array; @@ -112,7 +112,8 @@ __host__ __device__ constexpr auto operator+(Array a, Array{}([&](auto I) { constexpr index_t i = I.Get(); - result[i] = a[i] + b[i]; + + result[i] = a[i] + b[i]; }); return result; @@ -129,7 +130,7 @@ __host__ __device__ constexpr auto operator*(Array a, Sequence{}([&](auto I) { constexpr index_t i = I.Get(); - result[i] = a[i] + b.Get(I); + result[i] = a[i] * b.Get(I); }); return result; diff --git a/src/include/ConstantMergedTensorDescriptor.hip.hpp b/src/include/ConstantMergedTensorDescriptor.hip.hpp index 1fe2ee32c0..1f345ec085 100644 --- a/src/include/ConstantMergedTensorDescriptor.hip.hpp +++ b/src/include/ConstantMergedTensorDescriptor.hip.hpp @@ -26,6 +26,11 @@ struct ConstantMergedTensorDescriptor // TODO: check there is no duplication in OriginalDimMergeSeqs } + __host__ __device__ static constexpr auto GetOriginalTensorDescriptor() + { + return OriginalTensorDesc{}; + } + __host__ __device__ static constexpr index_t GetNumOfDimension() { return nDim; } __host__ __device__ static constexpr index_t GetNumOfOriginalDimension() @@ -120,3 +125,9 @@ __host__ __device__ constexpr auto make_ConstantMergedTensorDescriptor(OriginalT { return ConstantMergedTensorDescriptor{}; } + +template +__host__ __device__ void print_ConstantMergedTensorDescriptor(TDesc, const char* s) +{ + print_ConstantTensorDescriptor(TDesc::GetOriginalTensorDescriptor(), s); +} diff --git a/src/include/ConstantTensorDescriptor.hip.hpp b/src/include/ConstantTensorDescriptor.hip.hpp index ecf9e5fae8..c789b57600 100644 --- a/src/include/ConstantTensorDescriptor.hip.hpp +++ b/src/include/ConstantTensorDescriptor.hip.hpp @@ -396,31 +396,35 @@ __host__ __device__ constexpr auto make_ConstantTensorDescriptor_default_rank_al template __host__ __device__ void print_ConstantTensorDescriptor(TDesc, const char* s) { - constexpr auto desc = TDesc{}; - constexpr index_t ndim = desc.GetNumOfDimension(); + constexpr index_t ndim = TDesc::GetNumOfDimension(); static_assert(ndim >= 2 && ndim <= 10, "wrong!"); - if(ndim == 2) - { + static_if{}([&](auto fwd) { constexpr auto I0 = Number<0>{}; constexpr auto I1 = Number<1>{}; - printf("%s dim %u, lengths {%u %u}, strides {%u %u}\n", + constexpr auto desc = fwd(TDesc{}); + + printf("%s dim %u, lengths {%u %u}, strides {%u %u}, ranks {%u %u}\n", s, desc.GetNumOfDimension(), desc.GetLength(I0), desc.GetLength(I1), desc.GetStride(I0), - desc.GetStride(I1)); - } - else if(ndim == 3) - { + desc.GetStride(I1), + desc.GetMemoryRank(I0), + desc.GetMemoryRank(I1)); + }); + + static_if{}([&](auto fwd) { constexpr auto I0 = Number<0>{}; constexpr auto I1 = Number<1>{}; constexpr auto I2 = Number<2>{}; - printf("%s dim %u, lengths {%u %u %u}, strides {%u %u %u}\n", + constexpr auto desc = fwd(TDesc{}); + + printf("%s dim %u, lengths {%u %u %u}, strides {%u %u %u}, ranks {%u %u %u}\n", s, desc.GetNumOfDimension(), desc.GetLength(I0), @@ -428,16 +432,21 @@ __host__ __device__ void print_ConstantTensorDescriptor(TDesc, const char* s) desc.GetLength(I2), desc.GetStride(I0), desc.GetStride(I1), - desc.GetStride(I2)); - } - else if(ndim == 4) - { + desc.GetStride(I2), + desc.GetMemoryRank(I0), + desc.GetMemoryRank(I1), + desc.GetMemoryRank(I2)); + }); + + static_if{}([&](auto fwd) { constexpr auto I0 = Number<0>{}; constexpr auto I1 = Number<1>{}; constexpr auto I2 = Number<2>{}; constexpr auto I3 = Number<3>{}; - printf("%s dim %u, lengths {%u %u %u %u}, strides {%u %u %u %u}\n", + constexpr auto desc = fwd(TDesc{}); + + printf("%s dim %u, lengths {%u %u %u %u}, strides {%u %u %u %u}, ranks {%u %u %u %u}\n", s, desc.GetNumOfDimension(), desc.GetLength(I0), @@ -447,17 +456,24 @@ __host__ __device__ void print_ConstantTensorDescriptor(TDesc, const char* s) desc.GetStride(I0), desc.GetStride(I1), desc.GetStride(I2), - desc.GetStride(I3)); - } - else if(ndim == 5) - { + desc.GetStride(I3), + desc.GetMemoryRank(I0), + desc.GetMemoryRank(I1), + desc.GetMemoryRank(I2), + desc.GetMemoryRank(I3)); + }); + + static_if{}([&](auto fwd) { constexpr auto I0 = Number<0>{}; constexpr auto I1 = Number<1>{}; constexpr auto I2 = Number<2>{}; constexpr auto I3 = Number<3>{}; constexpr auto I4 = Number<4>{}; - printf("%s dim %u, lengths {%u %u %u %u %u}, strides {%u %u %u %u %u}\n", + constexpr auto desc = fwd(TDesc{}); + + printf("%s dim %u, lengths {%u %u %u %u %u}, strides {%u %u %u %u %u}, ranks {%u %u %u %u " + "%u}\n", s, desc.GetNumOfDimension(), desc.GetLength(I0), @@ -469,10 +485,15 @@ __host__ __device__ void print_ConstantTensorDescriptor(TDesc, const char* s) desc.GetStride(I1), desc.GetStride(I2), desc.GetStride(I3), - desc.GetStride(I4)); - } - else if(ndim == 6) - { + desc.GetStride(I4), + desc.GetMemoryRank(I0), + desc.GetMemoryRank(I1), + desc.GetMemoryRank(I2), + desc.GetMemoryRank(I3), + desc.GetMemoryRank(I4)); + }); + + static_if{}([&](auto fwd) { constexpr auto I0 = Number<0>{}; constexpr auto I1 = Number<1>{}; constexpr auto I2 = Number<2>{}; @@ -480,7 +501,10 @@ __host__ __device__ void print_ConstantTensorDescriptor(TDesc, const char* s) constexpr auto I4 = Number<4>{}; constexpr auto I5 = Number<5>{}; - printf("%s dim %u, lengths {%u %u %u %u %u %u}, strides {%u %u %u %u %u %u}\n", + constexpr auto desc = fwd(TDesc{}); + + printf("%s dim %u, lengths {%u %u %u %u %u %u}, strides {%u %u %u %u %u %u}, ranks {%u %u " + "%u %u %u %u}\n", s, desc.GetNumOfDimension(), desc.GetLength(I0), @@ -494,10 +518,16 @@ __host__ __device__ void print_ConstantTensorDescriptor(TDesc, const char* s) desc.GetStride(I2), desc.GetStride(I3), desc.GetStride(I4), - desc.GetStride(I5)); - } - else if(ndim == 7) - { + desc.GetStride(I5), + desc.GetMemoryRank(I0), + desc.GetMemoryRank(I1), + desc.GetMemoryRank(I2), + desc.GetMemoryRank(I3), + desc.GetMemoryRank(I4), + desc.GetMemoryRank(I5)); + }); + + static_if{}([&](auto fwd) { constexpr auto I0 = Number<0>{}; constexpr auto I1 = Number<1>{}; constexpr auto I2 = Number<2>{}; @@ -506,7 +536,10 @@ __host__ __device__ void print_ConstantTensorDescriptor(TDesc, const char* s) constexpr auto I5 = Number<5>{}; constexpr auto I6 = Number<6>{}; - printf("%s dim %u, lengths {%u %u %u %u %u %u %u}, strides {%u %u %u %u %u %u %u}\n", + constexpr auto desc = fwd(TDesc{}); + + printf("%s dim %u, lengths {%u %u %u %u %u %u %u}, strides {%u %u %u %u %u %u %u}, ranks " + "{%u %u %u %u %u %u %u}\n", s, desc.GetNumOfDimension(), desc.GetLength(I0), @@ -522,10 +555,17 @@ __host__ __device__ void print_ConstantTensorDescriptor(TDesc, const char* s) desc.GetStride(I3), desc.GetStride(I4), desc.GetStride(I5), - desc.GetStride(I6)); - } - else if(ndim == 8) - { + desc.GetStride(I6), + desc.GetMemoryRank(I0), + desc.GetMemoryRank(I1), + desc.GetMemoryRank(I2), + desc.GetMemoryRank(I3), + desc.GetMemoryRank(I4), + desc.GetMemoryRank(I5), + desc.GetMemoryRank(I6)); + }); + + static_if{}([&](auto fwd) { constexpr auto I0 = Number<0>{}; constexpr auto I1 = Number<1>{}; constexpr auto I2 = Number<2>{}; @@ -535,7 +575,10 @@ __host__ __device__ void print_ConstantTensorDescriptor(TDesc, const char* s) constexpr auto I6 = Number<6>{}; constexpr auto I7 = Number<7>{}; - printf("%s dim %u, lengths {%u %u %u %u %u %u %u %u}, strides {%u %u %u %u %u %u %u %u}\n", + constexpr auto desc = fwd(TDesc{}); + + printf("%s dim %u, lengths {%u %u %u %u %u %u %u %u}, strides {%u %u %u %u %u %u %u %u}, " + "ranks {%u %u %u %u %u %u %u %u}\n", s, desc.GetNumOfDimension(), desc.GetLength(I0), @@ -553,10 +596,18 @@ __host__ __device__ void print_ConstantTensorDescriptor(TDesc, const char* s) desc.GetStride(I4), desc.GetStride(I5), desc.GetStride(I6), - desc.GetStride(I7)); - } - else if(ndim == 9) - { + desc.GetStride(I7), + desc.GetMemoryRank(I0), + desc.GetMemoryRank(I1), + desc.GetMemoryRank(I2), + desc.GetMemoryRank(I3), + desc.GetMemoryRank(I4), + desc.GetMemoryRank(I5), + desc.GetMemoryRank(I6), + desc.GetMemoryRank(I7)); + }); + + static_if{}([&](auto fwd) { constexpr auto I0 = Number<0>{}; constexpr auto I1 = Number<1>{}; constexpr auto I2 = Number<2>{}; @@ -567,8 +618,10 @@ __host__ __device__ void print_ConstantTensorDescriptor(TDesc, const char* s) constexpr auto I7 = Number<7>{}; constexpr auto I8 = Number<8>{}; + constexpr auto desc = fwd(TDesc{}); + printf("%s dim %u, lengths {%u %u %u %u %u %u %u %u %u}, strides {%u %u %u %u %u %u %u %u " - "%u}\n", + "%u}, ranks {%u %u %u %u %u %u %u %u %u}\n", s, desc.GetNumOfDimension(), desc.GetLength(I0), @@ -588,10 +641,19 @@ __host__ __device__ void print_ConstantTensorDescriptor(TDesc, const char* s) desc.GetStride(I5), desc.GetStride(I6), desc.GetStride(I7), - desc.GetStride(I8)); - } - else if(ndim == 10) - { + desc.GetStride(I8), + desc.GetMemoryRank(I0), + desc.GetMemoryRank(I1), + desc.GetMemoryRank(I2), + desc.GetMemoryRank(I3), + desc.GetMemoryRank(I4), + desc.GetMemoryRank(I5), + desc.GetMemoryRank(I6), + desc.GetMemoryRank(I7), + desc.GetMemoryRank(I8)); + }); + + static_if{}([&](auto fwd) { constexpr auto I0 = Number<0>{}; constexpr auto I1 = Number<1>{}; constexpr auto I2 = Number<2>{}; @@ -603,8 +665,10 @@ __host__ __device__ void print_ConstantTensorDescriptor(TDesc, const char* s) constexpr auto I8 = Number<8>{}; constexpr auto I9 = Number<9>{}; + constexpr auto desc = fwd(TDesc{}); + printf("%s dim %u, lengths {%u %u %u %u %u %u %u %u %u %u}, strides {%u %u %u %u %u %u %u " - "%u %u %u}\n", + "%u %u %u}, ranks {%u %u %u %u %u %u %u %u %u %u}\n", s, desc.GetNumOfDimension(), desc.GetLength(I0), @@ -626,6 +690,16 @@ __host__ __device__ void print_ConstantTensorDescriptor(TDesc, const char* s) desc.GetStride(I6), desc.GetStride(I7), desc.GetStride(I8), - desc.GetStride(I9)); - } + desc.GetStride(I9), + desc.GetMemoryRank(I0), + desc.GetMemoryRank(I1), + desc.GetMemoryRank(I2), + desc.GetMemoryRank(I3), + desc.GetMemoryRank(I4), + desc.GetMemoryRank(I5), + desc.GetMemoryRank(I6), + desc.GetMemoryRank(I7), + desc.GetMemoryRank(I8), + desc.GetMemoryRank(I9)); + }); } diff --git a/src/include/Sequence.hip.hpp b/src/include/Sequence.hip.hpp index cf713bcc8c..81998dd9fb 100644 --- a/src/include/Sequence.hip.hpp +++ b/src/include/Sequence.hip.hpp @@ -263,7 +263,19 @@ struct sequence_map_inverse> using SeqMapType = typename sequence_map_inverse_impl, is_valid_map>::SeqMapType; }; + #endif +template +struct is_valid_sequence_map +{ + static constexpr bool value = +#if 0 // sequence_sort is not implemented yet + is_same::SeqType, + typename sequence_sort::SortedSeqType>::value; +#else + true; +#endif +}; template __host__ __device__ constexpr auto operator+(Sequence, Sequence) diff --git a/src/include/blockwise_merged_tensor_slice_op.hip.hpp b/src/include/blockwise_merged_tensor_slice_op.hip.hpp index b616e9f010..7a534feda4 100644 --- a/src/include/blockwise_merged_tensor_slice_op.hip.hpp +++ b/src/include/blockwise_merged_tensor_slice_op.hip.hpp @@ -26,7 +26,18 @@ struct BlockwiseTensorSliceCopy_generic_v1 Array dst_block_data_multi_id_begin) { // check NDim consistent - static_assert(SrcDesc::GetNumOfDimension() == DstDesc::GetNumOfDimension(), "wrong"); + static_assert(nDim == SrcDesc::GetNumOfDimension() && + nDim == DstDesc::GetNumOfDimension() && nDim == SliceLengths::GetSize() && + nDim == SubLengths::GetSize() && nDim == DataClusterLengths::GetSize() && + nDim == ThreadClusterArrangeOrder::GetSize() && + nDim == SrcAccessOrder::GetSize() && nDim == DstAccessOrder::GetSize(), + "wrong"); + + // check + static_assert(is_valid_sequence_map::value && + is_valid_sequence_map::value && + is_valid_sequence_map::value, + "wrong!"); // thread cluster constexpr auto thread_cluster_desc = make_ConstantTensorDescriptor_default_rank_packed( @@ -73,8 +84,38 @@ struct BlockwiseTensorSliceCopy_generic_v1 mSrcMyThreadOffset = SrcDesc::GetOffsetFromMultiIndex(src_block_data_multi_id_begin + thread_data_multi_id_begin); - mSrcMyThreadOffset = DstDesc::GetOffsetFromMultiIndex(dst_block_data_multi_id_begin + + mDstMyThreadOffset = DstDesc::GetOffsetFromMultiIndex(dst_block_data_multi_id_begin + thread_data_multi_id_begin); +#if 0 + { + printf("id %5u %5u: " + "src_block_data_multi_id_begin: %u %u %u %u, " + "thread_cluster_multi_id: %u %u %u %u, " + "data_cluster_multi_id: %u %u %u %u, " + "thread_data_multi_id_begin: %u %u %u %u, " + "mSrcMyThreadOffset %u, mDstMyThreadOffset %u \n", + get_block_1d_id(), + get_thread_local_1d_id(), + src_block_data_multi_id_begin[0], + src_block_data_multi_id_begin[1], + src_block_data_multi_id_begin[2], + src_block_data_multi_id_begin[3], + thread_cluster_multi_id[0], + thread_cluster_multi_id[1], + thread_cluster_multi_id[2], + thread_cluster_multi_id[3], + data_cluster_multi_id[0], + data_cluster_multi_id[1], + data_cluster_multi_id[2], + data_cluster_multi_id[3], + thread_data_multi_id_begin[0], + thread_data_multi_id_begin[1], + thread_data_multi_id_begin[2], + thread_data_multi_id_begin[3], + mSrcMyThreadOffset, + mDstMyThreadOffset); + } +#endif } __device__ static constexpr index_t GetRegisterClipboardSize() diff --git a/src/include/blockwise_tensor_slice_op.hip.hpp b/src/include/blockwise_tensor_slice_op.hip.hpp index 8fad695136..f69682b5d5 100644 --- a/src/include/blockwise_tensor_slice_op.hip.hpp +++ b/src/include/blockwise_tensor_slice_op.hip.hpp @@ -130,6 +130,7 @@ struct BlockwiseTensorSliceReorderCopy_v3 mSrcMyThreadOffset = src_desc.GetOffsetFromMultiIndex(src_data_multi_id + src_block_data_multi_id_begin); + mDstMyThreadOffset = dst_desc.GetOffsetFromMultiIndex(dst_data_multi_id + dst_block_data_multi_id_begin); } diff --git a/src/include/gridwise_convolution_direct_v2_nchw_kcyx_nkhw.hip.hpp b/src/include/gridwise_convolution_direct_v2_nchw_kcyx_nkhw.hip.hpp index c169d7feb0..e2b009da06 100644 --- a/src/include/gridwise_convolution_direct_v2_nchw_kcyx_nkhw.hip.hpp +++ b/src/include/gridwise_convolution_direct_v2_nchw_kcyx_nkhw.hip.hpp @@ -45,23 +45,23 @@ struct GridwiseConvolutionDirect_v2_nchw_kcyx_nkhw constexpr index_t Y = wei_kcyx_global_desc.GetLength(I2); constexpr index_t X = wei_kcyx_global_desc.GetLength(I3); - constexpr auto wei_ke_global_desc = make_ConstantTensorDescriptor( + constexpr auto wei_ke_global_desc = make_ConstantTensorDescriptor_default_rank_packed( Sequence{}); // 2d view of wei for blockwise copy constexpr index_t HiPerBlock = HoPerBlock + Y - 1; constexpr index_t WiPerBlock = WoPerBlock + X - 1; - constexpr auto in_nchw_block_desc = make_ConstantTensorDescriptor_aligned( + constexpr auto in_nchw_block_desc = make_ConstantTensorDescriptor_default_rank_aligned( Sequence{}, Number{}); - constexpr auto wei_ke_block_desc = make_ConstantTensorDescriptor_aligned( + constexpr auto wei_ke_block_desc = make_ConstantTensorDescriptor_default_rank_aligned( Sequence{}, Number{}); // 2d view of wei for blockwise copy - constexpr auto wei_kcyx_block_desc = - make_ConstantTensorDescriptor(Sequence{}, - Sequence{}); + constexpr auto wei_kcyx_block_desc = make_ConstantTensorDescriptor_default_rank( + Sequence{}, + Sequence{}); // shared mem constexpr index_t in_block_element_size = @@ -82,11 +82,11 @@ struct GridwiseConvolutionDirect_v2_nchw_kcyx_nkhw constexpr index_t HiPerThread = HoPerThread + Y - 1; constexpr index_t WiPerThread = WoPerThread + X - 1; - constexpr auto in_nchw_thread_block_desc = make_ConstantTensorDescriptor( + constexpr auto in_nchw_thread_block_desc = make_ConstantTensorDescriptor_default_rank( Sequence{}, in_nchw_block_desc.GetStrides()); - constexpr auto wei_kcyx_thread_block_desc = make_ConstantTensorDescriptor( + constexpr auto wei_kcyx_thread_block_desc = make_ConstantTensorDescriptor_default_rank( Sequence{}, wei_kcyx_block_desc.GetStrides()); constexpr auto out_nkhw_thread_desc = get_convolution_output_default_4d_tensor_descriptor( @@ -170,7 +170,7 @@ struct GridwiseConvolutionDirect_v2_nchw_kcyx_nkhw decltype(wei_ke_global_desc), decltype(wei_ke_block_desc), decltype(wei_ke_block_desc.GetLengths()), - WeiBlockCopyDataPerRead>{}; + WeiBlockCopyDataPerRead>({0, 0}, {0, 0}); #endif // set threadwise output tensor to 0 diff --git a/src/include/gridwise_convolution_implicit_gemm_v1r3_lds_double_buffer_nchw_cyxk_nkhw.hip.hpp b/src/include/gridwise_convolution_implicit_gemm_v1r3_lds_double_buffer_nchw_cyxk_nkhw.hip.hpp index 02a87650d4..b217f79548 100644 --- a/src/include/gridwise_convolution_implicit_gemm_v1r3_lds_double_buffer_nchw_cyxk_nkhw.hip.hpp +++ b/src/include/gridwise_convolution_implicit_gemm_v1r3_lds_double_buffer_nchw_cyxk_nkhw.hip.hpp @@ -459,7 +459,8 @@ struct GridwiseConvolutionImplicitGemm_v1r3_lds_double_buffer_nchw_cyxk_nkhw constexpr auto map_out_global2thread = Sequence<8, 9, 0, 1, 2, 3, 4, 5, 6, 7>{}; - threadwise_tensor_slice_copy_reorder_given_dst2src_v2( +#if 0 + threadwise_tensor_slice_copy_reorder_given_dst2src_v3( out_10d_thread_desc, p_out_thread, out_10d_global_desc, @@ -470,8 +471,24 @@ struct GridwiseConvolutionImplicitGemm_v1r3_lds_double_buffer_nchw_cyxk_nkhw ho_block_data_begin + ho_thread_data_begin, wo_block_data_begin + wo_thread_data_begin), out_10d_thread_desc.GetLengths(), - map_out_global2thread); - // Number{}); + map_out_global2thread, + Number{}); +#else + threadwise_tensor_slice_copy_generic( + out_10d_thread_desc.ReorderGivenNew2Old(map_out_global2thread), + p_out_thread, + make_zero_array(), + out_10d_global_desc, + p_out_global + + out_n_k_h_w_global_desc.GetOffsetFromMultiIndex( + n_block_data_begin + n_thread_data_begin, + k_block_data_begin + k_thread_data_begin, + ho_block_data_begin + ho_thread_data_begin, + wo_block_data_begin + wo_thread_data_begin), + make_zero_array(), + out_10d_thread_desc.GetLengths().ReorderGivenNew2Old(map_out_global2thread), + arithmetic_sequence_gen<0, 10, 1>::SeqType{}); +#endif }); } }; diff --git a/src/include/gridwise_convolution_implicit_gemm_v3_nchw_cyxk_nkhw.hip.hpp b/src/include/gridwise_convolution_implicit_gemm_v3_nchw_cyxk_nkhw.hip.hpp index b9f8c8cc1b..4af4e33589 100644 --- a/src/include/gridwise_convolution_implicit_gemm_v3_nchw_cyxk_nkhw.hip.hpp +++ b/src/include/gridwise_convolution_implicit_gemm_v3_nchw_cyxk_nkhw.hip.hpp @@ -32,6 +32,8 @@ template struct GridwiseConvolutionImplicitGemm_v3_nchw_cyxk_nkhw { @@ -40,7 +42,7 @@ struct GridwiseConvolutionImplicitGemm_v3_nchw_cyxk_nkhw Float* const __restrict__ p_out_global) const { // this is a mess - // TODO: more elegent way of specifying (or calculating) performance variables + // TODO: fidn more elegent way of specifying (or calculating) performance parameters static_assert(N2 == GemmNPerThreadSubC, "wrong!"); static_assert((N1 * N2 * BPerBlock) % (GemmNPerThreadSubC * GemmNLevel0Cluster * GemmNLevel1Cluster) == @@ -132,7 +134,7 @@ struct GridwiseConvolutionImplicitGemm_v3_nchw_cyxk_nkhw Float, decltype(in_n1_n2_c_b_global_merged_desc), decltype(in_n1_n2_c_b_block_desc), - Sequence, + decltype(in_n1_n2_c_b_block_desc.GetLengths()), InBlockCopySubLengths_N1_N2_C_B, InBlockCopyClusterLengths_N1_N2_C_B, Sequence<2, 0, 1, 3>, // thread_arrange_order [C, N1, N2, B] @@ -153,15 +155,21 @@ struct GridwiseConvolutionImplicitGemm_v3_nchw_cyxk_nkhw // operator for blockwise copy of weight into LDS // slicing a tensor - // this copy operator already have tensor offset built-in + // this copy operator already have blockwise offset built-in const auto blockwise_wei_copy = - Blockwise2dTensorCopy3({0, k_block_data_on_global}, - {0, 0}); + BlockwiseTensorSliceCopy_generic_v1, // thread_arrange_order [C, K] + Sequence<0, 1>, // src_access_order [C, K] + Sequence<0, 1>, // dst_access_order [C, K] + WeiBlockCopyDataPerAccess_K, + WeiBlockCopyDataPerAccess_K>( + {0, k_block_data_on_global}, {0, 0}); // GEMM definition // c_mtx += transpose(a_mtx) * b_mtx @@ -244,12 +252,16 @@ struct GridwiseConvolutionImplicitGemm_v3_nchw_cyxk_nkhw p_in_block_on_global += CPerBlock * in_n_c_h_w_global_desc.GetStride(I1), p_wei_block_on_global += CPerBlock * wei_c_y_x_k_global_desc.GetStride(I0)) { +#if 1 // debug blockwise_in_copy.Run(p_in_block_on_global, p_in_block); blockwise_wei_copy.Run(p_wei_block_on_global, p_wei_block); +#endif __syncthreads(); +#if 1 // debug blockwise_gemm.Run(p_wei_block, p_in_block, p_out_thread); +#endif __syncthreads(); } @@ -296,7 +308,7 @@ struct GridwiseConvolutionImplicitGemm_v3_nchw_cyxk_nkhw k_block_data_on_global + c_thread_mtx_on_block.row; const index_t b_thread_data_on_global = - b_block_data_on_global + c_thread_mtx_on_block.col; + b_block_data_on_global + c_thread_mtx_on_block.col / N2; // output merged global tensor descriptor, for calculating origin of thread tensor // in global memory @@ -320,7 +332,6 @@ struct GridwiseConvolutionImplicitGemm_v3_nchw_cyxk_nkhw out_k_n1_b_n2_global_merged_desc.GetOffsetFromMultiIndex( k_thread_data_on_global, 0, 0, 0); // dst origin on merged global tensor - // copy threadwise_tensor_slice_copy_generic( out_k0_k1_k2_n1_b_n2_thread_mem_desc, // src thread tensor (in register) descriptor p_out_thread, // origin of src @@ -335,8 +346,33 @@ struct GridwiseConvolutionImplicitGemm_v3_nchw_cyxk_nkhw b_thread_data_on_global, 0}, // starting point of slice w.r.t. origin of dst out_k0_k1_k2_n1_b_n2_thread_mem_desc.GetLengths(), // slice lengths - Sequence<2, 3, 4, 0, 5, 1>{} // order of dimension access + Sequence<3, 5, 0, 1, 2, 4>{} // dimension access order [n1, n2, k0, k1, k2, b] ); + +#if 0 + if(get_thread_local_1d_id() == 0 && get_block_1d_id() == 0) + { + print_ConstantTensorDescriptor(in_n0_n1_n2_c_h_w_global_mem_desc, + "in_n0_n1_n2_c_h_w_global_mem_desc"); + + print_ConstantMergedTensorDescriptor(in_n1_n2_c_b_global_merged_desc, + "in_n1_n2_c_b_global_merged_desc"); + + print_ConstantTensorDescriptor(in_c_n1_b_n2_block_mem_desc, + "in_c_n1_b_n2_block_mem_desc"); + + print_ConstantTensorDescriptor(in_n1_n2_c_b_block_desc, "in_n1_n2_c_b_block_desc"); + + print_ConstantTensorDescriptor(out_n0_n1_n2_k0_k1_k2_h_w_global_mem_desc, + "out_n0_n1_n2_k0_k1_k2_h_w_global_mem_desc"); + + print_ConstantMergedTensorDescriptor(out_k_n1_b_n2_global_merged_desc, + "out_k_n1_b_n2_global_merged_desc"); + + print_ConstantTensorDescriptor(out_k0_k1_k2_n1_b_n2_thread_mem_desc, + "out_k0_k1_k2_n1_b_n2_thread_mem_desc"); + } +#endif } } }; diff --git a/src/include/threadwise_direct_convolution.hip.hpp b/src/include/threadwise_direct_convolution.hip.hpp index 068ba52c41..e6b2fdd8c2 100644 --- a/src/include/threadwise_direct_convolution.hip.hpp +++ b/src/include/threadwise_direct_convolution.hip.hpp @@ -80,8 +80,10 @@ __device__ void threadwise_direct_convolution_2(InDesc, constexpr auto wei_desc = WeiDesc{}; constexpr auto out_desc = OutDesc{}; - constexpr auto in_reg_desc = make_ConstantTensorDescriptor(in_desc.GetLengths()); - constexpr auto wei_reg_desc = make_ConstantTensorDescriptor(wei_desc.GetLengths()); + constexpr auto in_reg_desc = + make_ConstantTensorDescriptor_default_rank_packed(in_desc.GetLengths()); + constexpr auto wei_reg_desc = + make_ConstantTensorDescriptor_default_rank_packed(wei_desc.GetLengths()); // register TInWei p_in_reg[in_reg_desc.GetElementSpace()]; diff --git a/src/include/threadwise_gemm.hip.hpp b/src/include/threadwise_gemm.hip.hpp index acd813d437..61a4e45151 100644 --- a/src/include/threadwise_gemm.hip.hpp +++ b/src/include/threadwise_gemm.hip.hpp @@ -67,6 +67,22 @@ __device__ void threadwise_gemm(MatrixA, integral_constant, FloatC* __restrict__ p_c_thread) { +#if 0 + if(get_thread_local_1d_id() == 0 && get_block_1d_id() == 0) + { + printf("p_a_thread: %f %f %f %f\n", + p_a_thread[0], + p_a_thread[1], + p_a_thread[2], + p_a_thread[3]); + printf("p_b_thread: %f %f %f %f\n", + p_b_thread[0], + p_b_thread[1], + p_b_thread[2], + p_b_thread[3]); + } +#endif + if(TransA && (!TransB) && (!TransC)) { constexpr auto a_mtx = MatrixA{}; diff --git a/src/include/threadwise_tensor_slice_op.hip.hpp b/src/include/threadwise_tensor_slice_op.hip.hpp index 3456a44ddf..5f0f77e467 100644 --- a/src/include/threadwise_tensor_slice_op.hip.hpp +++ b/src/include/threadwise_tensor_slice_op.hip.hpp @@ -204,25 +204,46 @@ __device__ void threadwise_tensor_slice_copy_generic( SliceLengths, DimAccessOrder) { - static_assert(SrcDesc::GetNumOfDimension() == DstDesc::GetNumOfDimension(), + constexpr index_t nDim = SrcDesc::GetNumOfDimension(); + + static_assert(nDim == SrcDesc::GetNumOfDimension() && nDim == DstDesc::GetNumOfDimension() && + nDim == SliceLengths::GetSize() && nDim == DimAccessOrder::GetSize(), "wrong! # of dimensions not the same"); - constexpr auto src_desc = SrcDesc{}; - constexpr auto dst_desc = DstDesc{}; + static_assert(is_valid_sequence_map::value, "wrong! map is not valid"); constexpr auto slice_lengths_in_access_order = - SliceLengths{}.ReorderGivenNew2Old(DimAccessOrder{}); + SliceLengths::ReorderGivenNew2Old(DimAccessOrder{}); +#if 1 ford{}([&](auto data_multi_id_in_access_order) { const auto data_multi_id = reorder_array_given_old2new(data_multi_id_in_access_order, DimAccessOrder{}); - const index_t dst_index = - dst_desc.GetOffsetFromMultiIndex(src_multi_id_begin + data_multi_id); - const index_t src_index = - src_desc.GetOffsetFromMultiIndex(dst_multi_id_begin + data_multi_id); + SrcDesc::GetOffsetFromMultiIndex(src_multi_id_begin + data_multi_id); + + const index_t dst_index = + DstDesc::GetOffsetFromMultiIndex(dst_multi_id_begin + data_multi_id); p_dst[dst_index] = p_src[src_index]; }); +#else + static_ford{}( + [&](auto data_multi_id_in_access_order_) { + constexpr auto data_multi_id_in_access_order = + sequence2array(decltype(data_multi_id_in_access_order_){}); + + const auto data_multi_id = + reorder_array_given_old2new(data_multi_id_in_access_order, DimAccessOrder{}); + + const index_t src_index = + SrcDesc::GetOffsetFromMultiIndex(src_multi_id_begin + data_multi_id); + + const index_t dst_index = + DstDesc::GetOffsetFromMultiIndex(dst_multi_id_begin + data_multi_id); + + p_dst[dst_index] = p_src[src_index]; + }); +#endif }