diff --git a/src/include/blockwise_generic_tensor_slice_op.hip.hpp b/src/include/blockwise_generic_tensor_slice_op.hip.hpp index 3f0bf5690f..d7b46cde1b 100644 --- a/src/include/blockwise_generic_tensor_slice_op.hip.hpp +++ b/src/include/blockwise_generic_tensor_slice_op.hip.hpp @@ -15,7 +15,7 @@ template + index_t DstDataPerWrite> struct BlockwiseGenericTensorSliceCopy_v1 { static constexpr index_t nDim = SrcDesc::GetNumOfDimension(); @@ -217,14 +217,15 @@ struct BlockwiseGenericTensorSliceCopy_v1 const index_t clipboard_offset = thread_tensor_desc.GetOffsetFromMultiIndex( clipboard_data_multi_id_begin); // cannot not constexpr, why? - threadwise_generic_tensor_slice_copy(SrcDesc{}, - p_src + src_offset + mThreadSrcOffset, - make_zero_array(), - thread_tensor_desc, - p_clipboard + clipboard_offset, - make_zero_array(), - thread_sub_tensor_lengths, - SrcAccessOrder{}); + threadwise_generic_tensor_slice_copy_v1(SrcDesc{}, + p_src + src_offset + mThreadSrcOffset, + make_zero_array(), + thread_tensor_desc, + p_clipboard + clipboard_offset, + make_zero_array(), + thread_sub_tensor_lengths, + SrcAccessOrder{}, + Number{}); }); } @@ -255,14 +256,15 @@ struct BlockwiseGenericTensorSliceCopy_v1 const index_t dst_offset = DstDesc{}.GetOffsetFromMultiIndex( dst_data_multi_id_begin); // cannot not constexpr, why? - threadwise_generic_tensor_slice_copy(thread_tensor_desc, - p_clipboard + clipboard_offset, - make_zero_array(), - DstDesc{}, - p_dst + dst_offset + mThreadDstOffset, - make_zero_array(), - thread_sub_tensor_lengths, - DstAccessOrder{}); + threadwise_generic_tensor_slice_copy_v1(thread_tensor_desc, + p_clipboard + clipboard_offset, + make_zero_array(), + DstDesc{}, + p_dst + dst_offset + mThreadDstOffset, + make_zero_array(), + thread_sub_tensor_lengths, + DstAccessOrder{}, + Number{}); }); } diff --git a/src/include/common.hip.hpp b/src/include/common.hip.hpp index 1de33aa4df..577e44ac97 100644 --- a/src/include/common.hip.hpp +++ b/src/include/common.hip.hpp @@ -105,4 +105,13 @@ __host__ __device__ constexpr T min(T x, Ts... xs) return x < y ? x : y; } + +// this is wrong +// TODO: implement correct least common multiple, instead of calling max() +template +__host__ __device__ constexpr T least_common_multiple(T x, Ts... xs) +{ + return max(x, xs...); +} + } // namespace mod_conv diff --git a/src/include/conv_common.hip.hpp b/src/include/conv_common.hip.hpp index 32f448a7f0..0958041c5a 100644 --- a/src/include/conv_common.hip.hpp +++ b/src/include/conv_common.hip.hpp @@ -3,8 +3,7 @@ // this is ugly, only for 4d template -constexpr auto get_convolution_output_default_4d_tensor_descriptor(InDesc, - WeiDesc) +constexpr auto get_convolution_output_default_4d_tensor_descriptor(InDesc, WeiDesc) { constexpr auto in_desc = InDesc{}; constexpr auto wei_desc = WeiDesc{}; @@ -34,8 +33,10 @@ constexpr auto get_convolution_output_default_4d_tensor_descriptor(InDesc, } template -constexpr auto get_convolution_with_padding_output_default_4d_tensor_descriptor( - InDesc, WeiDesc, LowerPads, UpperPads) +constexpr auto get_convolution_with_padding_output_default_4d_tensor_descriptor(InDesc, + WeiDesc, + LowerPads, + UpperPads) { constexpr auto in_desc = InDesc{}; constexpr auto wei_desc = WeiDesc{}; @@ -113,5 +114,6 @@ constexpr std::size_t calculate_convolution_memory_size(Float, InDesc, WeiDesc, constexpr index_t Y = wei_desc.GetLength(I2); constexpr index_t X = wei_desc.GetLength(I3); - return sizeof(Float) * (InDesc::GetElementSpace() + WeiDesc::GetElementSpace() + OutDesc::GetElementSpace()); + return sizeof(Float) * + (InDesc::GetElementSpace() + WeiDesc::GetElementSpace() + OutDesc::GetElementSpace()); } diff --git a/src/include/gridwise_convolution_implicit_gemm_v1r3_lds_double_buffer_nchw_cyxk_nkhw.hip.hpp b/src/include/gridwise_convolution_implicit_gemm_v1r3_lds_double_buffer_nchw_cyxk_nkhw.hip.hpp index 0f9a11e218..aa7faac964 100644 --- a/src/include/gridwise_convolution_implicit_gemm_v1r3_lds_double_buffer_nchw_cyxk_nkhw.hip.hpp +++ b/src/include/gridwise_convolution_implicit_gemm_v1r3_lds_double_buffer_nchw_cyxk_nkhw.hip.hpp @@ -474,7 +474,7 @@ struct GridwiseConvolutionImplicitGemm_v1r3_lds_double_buffer_nchw_cyxk_nkhw map_out_global2thread, Number{}); #else - threadwise_generic_tensor_slice_copy( + threadwise_generic_tensor_slice_copy_v1( out_10d_thread_desc.ReorderGivenNew2Old(map_out_global2thread), p_out_thread, make_zero_array(), diff --git a/src/include/gridwise_convolution_implicit_gemm_v1r3_nchw_cyxk_nkhw.hip.hpp b/src/include/gridwise_convolution_implicit_gemm_v1r3_nchw_cyxk_nkhw.hip.hpp index 6d55d66cf2..8b2c3f388b 100644 --- a/src/include/gridwise_convolution_implicit_gemm_v1r3_nchw_cyxk_nkhw.hip.hpp +++ b/src/include/gridwise_convolution_implicit_gemm_v1r3_nchw_cyxk_nkhw.hip.hpp @@ -423,7 +423,7 @@ struct GridwiseConvolutionImplicitGemm_v1r3_nchw_cyxk_nkhw map_out_global2thread, Number{}); #else - threadwise_generic_tensor_slice_copy( + threadwise_generic_tensor_slice_copy_v1( out_10d_thread_desc.ReorderGivenNew2Old(map_out_global2thread), p_out_thread, make_zero_array(), diff --git a/src/include/gridwise_convolution_implicit_gemm_v3_lds_double_buffer_nchw_cyxk_nkhw.hip.hpp b/src/include/gridwise_convolution_implicit_gemm_v3_lds_double_buffer_nchw_cyxk_nkhw.hip.hpp index ca1b9e8c2e..25e61f9e35 100644 --- a/src/include/gridwise_convolution_implicit_gemm_v3_lds_double_buffer_nchw_cyxk_nkhw.hip.hpp +++ b/src/include/gridwise_convolution_implicit_gemm_v3_lds_double_buffer_nchw_cyxk_nkhw.hip.hpp @@ -405,14 +405,15 @@ struct GridwiseConvolutionImplicitGemm_v3_lds_double_buffer_nchw_cyxk_nkhw out_k_n1_b_n2_global_merged_desc.GetOffsetFromMultiIndex( k_thread_data_on_global, 0, b_thread_data_on_global, 0); - threadwise_generic_tensor_slice_copy(out_n0_n1_n2_k0_k1_k2_h_w_thread_desc, - p_out_thread, - {0, 0, 0, 0, 0, 0, 0, 0}, - out_n0_n1_n2_k0_k1_k2_h_w_global_mem_desc, - p_out_thread_on_global, - {0, 0, 0, 0, 0, 0, 0, 0}, - out_n0_n1_n2_k0_k1_k2_h_w_thread_desc.GetLengths(), - arithmetic_sequence_gen<0, 8, 1>::SeqType{}); + threadwise_generic_tensor_slice_copy_v1( + out_n0_n1_n2_k0_k1_k2_h_w_thread_desc, + p_out_thread, + {0, 0, 0, 0, 0, 0, 0, 0}, + out_n0_n1_n2_k0_k1_k2_h_w_global_mem_desc, + p_out_thread_on_global, + {0, 0, 0, 0, 0, 0, 0, 0}, + out_n0_n1_n2_k0_k1_k2_h_w_thread_desc.GetLengths(), + arithmetic_sequence_gen<0, 8, 1>::SeqType{}); } } }; diff --git a/src/include/gridwise_convolution_implicit_gemm_v3_nchw_cyxk_nkhw.hip.hpp b/src/include/gridwise_convolution_implicit_gemm_v3_nchw_cyxk_nkhw.hip.hpp index 42dfa288e0..d999302232 100644 --- a/src/include/gridwise_convolution_implicit_gemm_v3_nchw_cyxk_nkhw.hip.hpp +++ b/src/include/gridwise_convolution_implicit_gemm_v3_nchw_cyxk_nkhw.hip.hpp @@ -358,14 +358,15 @@ struct GridwiseConvolutionImplicitGemm_v3_nchw_cyxk_nkhw out_k_n1_b_n2_global_merged_desc.GetOffsetFromMultiIndex( k_thread_data_on_global, 0, b_thread_data_on_global, 0); - threadwise_generic_tensor_slice_copy(out_n0_n1_n2_k0_k1_k2_h_w_thread_desc, - p_out_thread, - {0, 0, 0, 0, 0, 0, 0, 0}, - out_n0_n1_n2_k0_k1_k2_h_w_global_mem_desc, - p_out_thread_on_global, - {0, 0, 0, 0, 0, 0, 0, 0}, - out_n0_n1_n2_k0_k1_k2_h_w_thread_desc.GetLengths(), - arithmetic_sequence_gen<0, 8, 1>::SeqType{}); + threadwise_generic_tensor_slice_copy_v1( + out_n0_n1_n2_k0_k1_k2_h_w_thread_desc, + p_out_thread, + {0, 0, 0, 0, 0, 0, 0, 0}, + out_n0_n1_n2_k0_k1_k2_h_w_global_mem_desc, + p_out_thread_on_global, + {0, 0, 0, 0, 0, 0, 0, 0}, + out_n0_n1_n2_k0_k1_k2_h_w_thread_desc.GetLengths(), + arithmetic_sequence_gen<0, 8, 1>::SeqType{}); } } }; diff --git a/src/include/gridwise_convolution_implicit_gemm_v4_lds_double_buffer_nchw_kcyx_nkhw.hip.hpp b/src/include/gridwise_convolution_implicit_gemm_v4_lds_double_buffer_nchw_kcyx_nkhw.hip.hpp index 6c7b77e46d..3cb67d4058 100644 --- a/src/include/gridwise_convolution_implicit_gemm_v4_lds_double_buffer_nchw_kcyx_nkhw.hip.hpp +++ b/src/include/gridwise_convolution_implicit_gemm_v4_lds_double_buffer_nchw_kcyx_nkhw.hip.hpp @@ -5,7 +5,7 @@ #include "ConstantMatrixDescriptor.hip.hpp" #include "blockwise_generic_tensor_slice_op.hip.hpp" #include "blockwise_gemm.hip.hpp" -#include "threadwise_tensor_slice_op.hip.hpp" +#include "threadwise_generic_tensor_slice_op.hip.hpp" // define B = merge(N, Ho, Wo) template {}; - const auto blockwise_wei_copy = BlockwiseTensorSliceReorderCopy_v3< + auto blockwise_wei_copy = BlockwiseTensorSliceReorderCopy_v3< BlockSize, Float, decltype(wei_e_k_global_desc.ReorderGivenNew2Old(map_k_e_2_e_k)), @@ -324,11 +324,7 @@ struct GridwiseConvolutionImplicitGemm_v4_lds_double_buffer_nchw_kcyx_nkhw Float p_wei_register_clipboard[blockwise_wei_copy.GetRegisterClipboardSize()]; blockwise_in_copy.MoveSlicingWindowOnSourceTensor(I0, Number{}, True); -#if 0 - blockwise_wei_copy.MoveSlicingWindowOnSourceTensor(I0, Number{}, True); -#else p_wei_block_on_global += EPerBlock * wei_e_k_global_desc.GetStride(I0); -#endif __syncthreads(); @@ -355,11 +351,7 @@ struct GridwiseConvolutionImplicitGemm_v4_lds_double_buffer_nchw_kcyx_nkhw // even iteration blockwise_in_copy.MoveSlicingWindowOnSourceTensor(I0, Number{}, True); -#if 0 - blockwise_wei_copy.MoveSlicingWindowOnSourceTensor(I0, Number{}, True); -#else p_wei_block_on_global += EPerBlock * wei_e_k_global_desc.GetStride(I0); -#endif __syncthreads(); @@ -434,14 +426,16 @@ struct GridwiseConvolutionImplicitGemm_v4_lds_double_buffer_nchw_kcyx_nkhw out_k_n1_b_n2_global_merged_desc.GetOffsetFromMultiIndex( k_thread_data_on_global, 0, b_thread_data_on_global, 0); - threadwise_generic_tensor_slice_copy(out_n0_n1_n2_k0_k1_k2_h_w_thread_desc, - p_out_thread, - {0, 0, 0, 0, 0, 0, 0, 0}, - out_n0_n1_n2_k0_k1_k2_h_w_global_mem_desc, - p_out_thread_on_global, - {0, 0, 0, 0, 0, 0, 0, 0}, - out_n0_n1_n2_k0_k1_k2_h_w_thread_desc.GetLengths(), - arithmetic_sequence_gen<0, 8, 1>::SeqType{}); + threadwise_generic_tensor_slice_copy_v1( + out_n0_n1_n2_k0_k1_k2_h_w_thread_desc, + p_out_thread, + {0, 0, 0, 0, 0, 0, 0, 0}, + out_n0_n1_n2_k0_k1_k2_h_w_global_mem_desc, + p_out_thread_on_global, + {0, 0, 0, 0, 0, 0, 0, 0}, + out_n0_n1_n2_k0_k1_k2_h_w_thread_desc.GetLengths(), + arithmetic_sequence_gen<0, 8, 1>::SeqType{}, + Number<1>{}); } } }; diff --git a/src/include/gridwise_convolution_implicit_gemm_v4_nchw_kcyx_nkhw.hip.hpp b/src/include/gridwise_convolution_implicit_gemm_v4_nchw_kcyx_nkhw.hip.hpp index 0702204821..8fe8d8fac2 100644 --- a/src/include/gridwise_convolution_implicit_gemm_v4_nchw_kcyx_nkhw.hip.hpp +++ b/src/include/gridwise_convolution_implicit_gemm_v4_nchw_kcyx_nkhw.hip.hpp @@ -5,7 +5,7 @@ #include "ConstantMatrixDescriptor.hip.hpp" #include "blockwise_generic_tensor_slice_op.hip.hpp" #include "blockwise_gemm.hip.hpp" -#include "threadwise_tensor_slice_op.hip.hpp" +#include "threadwise_generic_tensor_slice_op.hip.hpp" // define B = merge(N, Ho, Wo) template ::SeqType{}); + threadwise_generic_tensor_slice_copy_v1( + out_n0_n1_n2_k0_k1_k2_h_w_thread_desc, + p_out_thread, + {0, 0, 0, 0, 0, 0, 0, 0}, + out_n0_n1_n2_k0_k1_k2_h_w_global_mem_desc, + p_out_thread_on_global, + {0, 0, 0, 0, 0, 0, 0, 0}, + out_n0_n1_n2_k0_k1_k2_h_w_thread_desc.GetLengths(), + arithmetic_sequence_gen<0, 8, 1>::SeqType{}); } } }; diff --git a/src/include/threadwise_generic_tensor_slice_op.hip.hpp b/src/include/threadwise_generic_tensor_slice_op.hip.hpp new file mode 100644 index 0000000000..d40f51b6b2 --- /dev/null +++ b/src/include/threadwise_generic_tensor_slice_op.hip.hpp @@ -0,0 +1,99 @@ +#pragma once +#include "ConstantTensorDescriptor.hip.hpp" +#include "ConstantMergedTensorDescriptor.hip.hpp" + +template +__device__ void threadwise_generic_tensor_slice_copy_v1( + SrcDesc, + const Float* __restrict__ p_src, + Array src_multi_id_begin, + DstDesc, + Float* __restrict__ p_dst, + Array dst_multi_id_begin, + SliceLengths, + DimAccessOrder, + Number) +{ + constexpr index_t nDim = SrcDesc::GetNumOfDimension(); + + static_assert(nDim == SrcDesc::GetNumOfDimension() && nDim == DstDesc::GetNumOfDimension() && + nDim == SliceLengths::GetSize() && nDim == DimAccessOrder::GetSize(), + "wrong! # of dimensions not the same"); + + static_assert(is_valid_sequence_map::value, "wrong! map is not valid"); + +#if 0 + // doesn't compile, because merged-tensor reordering is not implemented + // TODO: implement tensor desc ops for merged-tensor + constexpr auto src_strides_in_access_order = + SrcDesc::ReorderGivenNew2Old(DimAccessOrder{}).GetStride(Number{}); + + constexpr auto dst_strides_in_access_order = + SrcDesc::ReorderGivenNew2Old(DimAccessOrder{}).GetStride(Number{}); + + // check src/dst stride on the lowest access dimension + static_assert((DataPerAccess == 1 || src_strides_in_access_order.Back() == 1) && + (DataPerAccess == 1 || dst_strides_in_access_order.Back() == 1), + "wrong! src/dst stride on the lowest access dimension needs to be 1 for " + "vectorized read/write"); +#endif + + constexpr auto slice_lengths_in_access_order = + SliceLengths::ReorderGivenNew2Old(DimAccessOrder{}); + + // check slice length on the lowest access dimension + static_assert(slice_lengths_in_access_order.Back() % DataPerAccess == 0, + "wrong! slice length on the lowest access dimension should be evenly divided by " + "DataPerAccess"); + + constexpr index_t num_access_on_lowest_access_dimension = + slice_lengths_in_access_order.Back() / DataPerAccess; + + constexpr auto access_lengths = slice_lengths_in_access_order.Modify( + Number{}, Number{}); + + using vector_t = typename vector_type::MemoryType; + +#if 1 + ford{}([&](auto access_multi_id) { + auto data_multi_id_in_access_order = access_multi_id; + data_multi_id_in_access_order[nDim - 1] = access_multi_id[nDim - 1] * DataPerAccess; + + const auto data_multi_id = + reorder_array_given_old2new(data_multi_id_in_access_order, DimAccessOrder{}); + + const index_t src_index = + SrcDesc::GetOffsetFromMultiIndex(src_multi_id_begin + data_multi_id); + + const index_t dst_index = + DstDesc::GetOffsetFromMultiIndex(dst_multi_id_begin + data_multi_id); + + *reinterpret_cast(&p_dst[dst_index]) = + *reinterpret_cast(&p_src[src_index]); + }); +#else + static_ford{}([&](auto access_multi_id_) { + const auto access_multi_id = sequence2array(access_multi_id_); + + auto data_multi_id_in_access_order = access_multi_id; + data_multi_id_in_access_order[nDim - 1] = access_multi_id[nDim - 1] * DataPerAccess; + + const auto data_multi_id = + reorder_array_given_old2new(data_multi_id_in_access_order, DimAccessOrder{}); + + const index_t src_index = + SrcDesc::GetOffsetFromMultiIndex(src_multi_id_begin + data_multi_id); + + const index_t dst_index = + DstDesc::GetOffsetFromMultiIndex(dst_multi_id_begin + data_multi_id); + + *reinterpret_cast(&p_dst[dst_index]) = + *reinterpret_cast(&p_src[src_index]); + }); +#endif +} diff --git a/src/include/threadwise_tensor_slice_op.hip.hpp b/src/include/threadwise_tensor_slice_op.hip.hpp index 3d69810316..2ae284be07 100644 --- a/src/include/threadwise_tensor_slice_op.hip.hpp +++ b/src/include/threadwise_tensor_slice_op.hip.hpp @@ -192,58 +192,3 @@ threadwise_tensor_slice_copy_reorder_given_dst2src_v3(SrcDesc, }); }); } - -template -__device__ void threadwise_generic_tensor_slice_copy( - SrcDesc, - const Float* __restrict__ p_src, - Array src_multi_id_begin, - DstDesc, - Float* __restrict__ p_dst, - Array dst_multi_id_begin, - SliceLengths, - DimAccessOrder) -{ - constexpr index_t nDim = SrcDesc::GetNumOfDimension(); - - static_assert(nDim == SrcDesc::GetNumOfDimension() && nDim == DstDesc::GetNumOfDimension() && - nDim == SliceLengths::GetSize() && nDim == DimAccessOrder::GetSize(), - "wrong! # of dimensions not the same"); - - static_assert(is_valid_sequence_map::value, "wrong! map is not valid"); - - constexpr auto slice_lengths_in_access_order = - SliceLengths::ReorderGivenNew2Old(DimAccessOrder{}); - -#if 1 - ford{}([&](auto data_multi_id_in_access_order) { - const auto data_multi_id = - reorder_array_given_old2new(data_multi_id_in_access_order, DimAccessOrder{}); - - const index_t src_index = - SrcDesc::GetOffsetFromMultiIndex(src_multi_id_begin + data_multi_id); - - const index_t dst_index = - DstDesc::GetOffsetFromMultiIndex(dst_multi_id_begin + data_multi_id); - - p_dst[dst_index] = p_src[src_index]; - }); -#else - static_ford{}( - [&](auto data_multi_id_in_access_order_) { - constexpr auto data_multi_id_in_access_order = - sequence2array(decltype(data_multi_id_in_access_order_){}); - - const auto data_multi_id = - reorder_array_given_old2new(data_multi_id_in_access_order, DimAccessOrder{}); - - const index_t src_index = - SrcDesc::GetOffsetFromMultiIndex(src_multi_id_begin + data_multi_id); - - const index_t dst_index = - DstDesc::GetOffsetFromMultiIndex(dst_multi_id_begin + data_multi_id); - - p_dst[dst_index] = p_src[src_index]; - }); -#endif -}