From 17f3d2d4bccebcc3a70606a916f93dc90e5eaa3a Mon Sep 17 00:00:00 2001 From: Chao Liu Date: Tue, 16 Apr 2019 17:36:18 -0500 Subject: [PATCH] refactor ConstantTensorDescriptor and functional --- ...lution_implicit_gemm_v1_chwn_cyxk_khwn.hpp | 12 +- ...lution_implicit_gemm_v2_chwn_cyxk_khwn.hpp | 12 +- ...onvolution_2_vectorized_nchw_kcyx_nkhw.hpp | 30 ++-- driver/driver.hip.cpp | 24 +-- src/include/Array.hip.hpp | 4 +- src/include/ConstantTensorDescriptor.hip.hpp | 73 ++++----- src/include/Sequence.hip.hpp | 20 +++ src/include/blockwise_2d_tensor_op.hip.hpp | 9 +- src/include/blockwise_3d_tensor_op.hip.hpp | 9 +- src/include/blockwise_4d_tensor_op.hip.hpp | 144 +++++++++++++++--- src/include/blockwise_batched_gemm.hip.hpp | 5 +- .../blockwise_direct_convolution.hip.hpp | 27 ++-- src/include/functional.hip.hpp | 48 ++++-- ..._implicit_gemm_v1r1_chwn_cyxk_khwn.hip.hpp | 49 +++--- ...1_chwn_cyxk_khwn_lds_double_buffer.hip.hpp | 26 ++-- ..._implicit_gemm_v1r2_chwn_cyxk_khwn.hip.hpp | 51 +++---- ...2_chwn_cyxk_khwn_lds_double_buffer.hip.hpp | 15 +- .../gridwise_direct_convolution_1.hip.hpp | 19 ++- ...irect_convolution_2_nchw_kcyx_nkhw.hip.hpp | 43 +++--- ...lution_2_vectorized_nchw_kcyx_nkhw.hip.hpp | 34 ++--- ...onvolution_1_chwn_cyxk_khwn_padded.hip.hpp | 9 +- src/include/tensor.hpp | 3 +- 22 files changed, 390 insertions(+), 276 deletions(-) diff --git a/driver/device_convolution_implicit_gemm_v1_chwn_cyxk_khwn.hpp b/driver/device_convolution_implicit_gemm_v1_chwn_cyxk_khwn.hpp index 86ffc58e77..3532a4d4ce 100644 --- a/driver/device_convolution_implicit_gemm_v1_chwn_cyxk_khwn.hpp +++ b/driver/device_convolution_implicit_gemm_v1_chwn_cyxk_khwn.hpp @@ -8,12 +8,12 @@ template void device_convolution_implicit_gemm_v1_chwn_cyxk_khwn(InDesc, - const Tensor& in_nchw, - WeiDesc, - const Tensor& wei_kcyx, - OutDesc, - Tensor& out_nkhw, - index_t nrepeat) + const Tensor& in_nchw, + WeiDesc, + const Tensor& wei_kcyx, + OutDesc, + Tensor& out_nkhw, + index_t nrepeat) { constexpr auto I0 = Number<0>{}; constexpr auto I1 = Number<1>{}; diff --git a/driver/device_convolution_implicit_gemm_v2_chwn_cyxk_khwn.hpp b/driver/device_convolution_implicit_gemm_v2_chwn_cyxk_khwn.hpp index e8a893957b..54e777dbe4 100644 --- a/driver/device_convolution_implicit_gemm_v2_chwn_cyxk_khwn.hpp +++ b/driver/device_convolution_implicit_gemm_v2_chwn_cyxk_khwn.hpp @@ -7,12 +7,12 @@ template void device_convolution_implicit_gemm_v2_chwn_cyxk_khwn(InDesc, - const Tensor& in_nchw, - WeiDesc, - const Tensor& wei_kcyx, - OutDesc, - Tensor& out_nkhw, - index_t nrepeat) + const Tensor& in_nchw, + WeiDesc, + const Tensor& wei_kcyx, + OutDesc, + Tensor& out_nkhw, + index_t nrepeat) { constexpr auto I0 = Number<0>{}; constexpr auto I1 = Number<1>{}; diff --git a/driver/device_direct_convolution_2_vectorized_nchw_kcyx_nkhw.hpp b/driver/device_direct_convolution_2_vectorized_nchw_kcyx_nkhw.hpp index 7790900f83..938bc4cd30 100644 --- a/driver/device_direct_convolution_2_vectorized_nchw_kcyx_nkhw.hpp +++ b/driver/device_direct_convolution_2_vectorized_nchw_kcyx_nkhw.hpp @@ -52,7 +52,7 @@ void device_direct_convolution_2_vectorized_nchw_kcyx_nkhw(InDesc, in_nchw_vec(n, c, h, w) = vector_t::Pack(in_nchw(n, 2 * c, h, w), in_nchw(n, 2 * c + 1, h, w)); #elif 1 - in_nchw_vec(n, c, h, w) = vector_t::Pack(in_nchw(n, 4 * c, h, w), + in_nchw_vec(n, c, h, w) = vector_t::Pack(in_nchw(n, 4 * c, h, w), in_nchw(n, 4 * c + 1, h, w), in_nchw(n, 4 * c + 2, h, w), in_nchw(n, 4 * c + 3, h, w)); @@ -114,37 +114,37 @@ void device_direct_convolution_2_vectorized_nchw_kcyx_nkhw(InDesc, constexpr index_t BlockSize = 128; #elif 0 // 3x3, 34x34, 128 thread, fp32, vector = 2 - constexpr index_t NPerBlock = 2; - constexpr index_t KPerBlock = 32; - constexpr index_t CPerBlock = 2; + constexpr index_t NPerBlock = 2; + constexpr index_t KPerBlock = 32; + constexpr index_t CPerBlock = 2; constexpr index_t HoPerBlock = 2; constexpr index_t WoPerBlock = 32; - constexpr index_t NPerThread = 2; - constexpr index_t KPerThread = 4; - constexpr index_t CPerThread = 1; + constexpr index_t NPerThread = 2; + constexpr index_t KPerThread = 4; + constexpr index_t CPerThread = 1; constexpr index_t HoPerThread = 2; constexpr index_t WoPerThread = 2; - constexpr index_t InBlockCopyDataPerRead = 2; + constexpr index_t InBlockCopyDataPerRead = 2; constexpr index_t WeiBlockCopyDataPerRead = 2; constexpr index_t BlockSize = 128; #elif 0 // 3x3, 34x34, 128 thread, int8, vector = 4 - constexpr index_t NPerBlock = 2; - constexpr index_t KPerBlock = 32; - constexpr index_t CPerBlock = 8; + constexpr index_t NPerBlock = 2; + constexpr index_t KPerBlock = 32; + constexpr index_t CPerBlock = 8; constexpr index_t HoPerBlock = 4; constexpr index_t WoPerBlock = 32; - constexpr index_t NPerThread = 1; - constexpr index_t KPerThread = 8; - constexpr index_t CPerThread = 2; + constexpr index_t NPerThread = 1; + constexpr index_t KPerThread = 8; + constexpr index_t CPerThread = 2; constexpr index_t HoPerThread = 4; constexpr index_t WoPerThread = 2; - constexpr index_t InBlockCopyDataPerRead = 2; + constexpr index_t InBlockCopyDataPerRead = 2; constexpr index_t WeiBlockCopyDataPerRead = 2; constexpr index_t BlockSize = 128; diff --git a/driver/driver.hip.cpp b/driver/driver.hip.cpp index e1ae924d90..5eaf42a8b7 100644 --- a/driver/driver.hip.cpp +++ b/driver/driver.hip.cpp @@ -11,6 +11,7 @@ #include "device_direct_convolution_2_nchw_kcyx_nkhw.hpp" //#include "device_direct_convolution_2_vectorized_nchw_kcyx_nkhw.hpp" #include "device_convolution_implicit_gemm_v1_chwn_cyxk_khwn.hpp" +#include "device_convolution_implicit_gemm_v1_nchw_cyxk_khwn.hpp" //#include "device_implicit_gemm_convolution_1_chwn_cyxk_khwn_padded.hpp" #include "device_convolution_implicit_gemm_v2_chwn_cyxk_khwn.hpp" @@ -48,13 +49,10 @@ struct GeneratorTensor_3 #if 0 auto f_acc = std::plus{}; #else - auto f_acc = [](auto a, auto b){ return 10*a + b;}; + auto f_acc = [](auto a, auto b) { return 10 * a + b; }; #endif - return std::accumulate(dims.begin(), - dims.end(), - index_t(0), - f_acc); + return std::accumulate(dims.begin(), dims.end(), index_t(0), f_acc); } }; @@ -376,7 +374,7 @@ void host_winograd_3x3_convolution(const Tensor& in_nchw, std::size_t ho = HoPerTile * htile + j; for(int i = 0; i < WoPerTile; ++i) { - std::size_t wo = WoPerTile * wtile + i; + std::size_t wo = WoPerTile * wtile + i; out_nkhw(n, k, ho, wo) = out_hold(n, k, htile, wtile, j, i); } } @@ -435,13 +433,13 @@ int main(int argc, char* argv[]) constexpr index_t WPad = 0; #elif 0 // 3x3, 56x56 - constexpr index_t N = 64; - constexpr index_t C = 64; + constexpr index_t N = 64; + constexpr index_t C = 64; constexpr index_t HI = 56; constexpr index_t WI = 56; - constexpr index_t K = 128; - constexpr index_t Y = 3; - constexpr index_t X = 3; + constexpr index_t K = 128; + constexpr index_t Y = 3; + constexpr index_t X = 3; constexpr index_t HPad = 0; constexpr index_t WPad = 0; @@ -505,7 +503,7 @@ int main(int argc, char* argv[]) constexpr index_t C = 256; constexpr index_t HI = 28; constexpr index_t WI = 28; - constexpr index_t K = 512; + constexpr index_t K = 128; constexpr index_t Y = 3; constexpr index_t X = 3; @@ -666,6 +664,8 @@ int main(int argc, char* argv[]) device_direct_convolution_2_vectorized_nchw_kcyx_nkhw #elif 1 device_convolution_implicit_gemm_v1_chwn_cyxk_khwn +#elif 0 + device_convolution_implicit_gemm_v1_nchw_cyxk_khwn #elif 0 device_convolution_implicit_gemm_v2_chwn_cyxk_khwn #endif diff --git a/src/include/Array.hip.hpp b/src/include/Array.hip.hpp index f3a3d13681..65762c82a1 100644 --- a/src/include/Array.hip.hpp +++ b/src/include/Array.hip.hpp @@ -14,5 +14,7 @@ struct Array { } - __host__ __device__ TData operator[](index_t i) const { return mData[i]; } + __host__ __device__ const TData& operator[](index_t i) const { return mData[i]; } + + __host__ __device__ TData& operator[](index_t i) { return mData[i]; } }; diff --git a/src/include/ConstantTensorDescriptor.hip.hpp b/src/include/ConstantTensorDescriptor.hip.hpp index c8b621d384..d204cba9ce 100644 --- a/src/include/ConstantTensorDescriptor.hip.hpp +++ b/src/include/ConstantTensorDescriptor.hip.hpp @@ -115,46 +115,27 @@ struct ConstantTensorDescriptor static_assert(Lengths::nDim == Strides::nDim, "nDim not consistent"); } - __host__ __device__ constexpr index_t GetDimension() const { return nDim; } + __host__ __device__ static constexpr index_t GetDimension() { return nDim; } - __host__ __device__ constexpr Lengths GetLengths() const { return Lengths{}; } + __host__ __device__ static constexpr Lengths GetLengths() { return Lengths{}; } - __host__ __device__ constexpr Strides GetStrides() const { return Strides{}; } + __host__ __device__ static constexpr Strides GetStrides() { return Strides{}; } template - __host__ __device__ constexpr index_t GetLength(Number) const + __host__ __device__ static constexpr index_t GetLength(Number) { return Lengths{}.Get(Number{}); } template - __host__ __device__ constexpr index_t GetStride(Number) const + __host__ __device__ static constexpr index_t GetStride(Number) { return Strides{}.Get(Number{}); } - // c++14 doesn't support constexpr lambdas, has to use this trick instead - struct GetElementSize_f + __host__ __device__ static constexpr index_t GetElementSize() { - template - __host__ __device__ constexpr index_t operator()(IDim idim) const - { - return Type{}.GetLength(idim); - } - }; - - __host__ __device__ constexpr index_t GetElementSize() const - { - // c++14 doesn't support constexpr lambdas, has to use this trick instead - struct multiply - { - __host__ __device__ constexpr index_t operator()(index_t a, index_t b) const - { - return a * b; - } - }; - - return static_const_reduce_n{}(GetElementSize_f{}, multiply{}); + return accumulate_on_sequence(Lengths{}, mod_conv::multiplies{}, Number<1>{}); } // c++14 doesn't support constexpr lambdas, has to use this trick instead @@ -168,25 +149,16 @@ struct ConstantTensorDescriptor }; template > - __host__ __device__ constexpr index_t GetElementSpace(Align align = Align{}) const + __host__ __device__ static constexpr index_t GetElementSpace(Align align = Align{}) { - // c++14 doesn't support constexpr lambdas, has to use this trick instead - struct add - { - __host__ __device__ constexpr index_t operator()(index_t a, index_t b) const - { - return a + b; - } - }; - index_t element_space_unaligned = - static_const_reduce_n{}(GetElementSpace_f{}, add{}) + 1; + static_const_reduce_n{}(GetElementSpace_f{}, mod_conv::plus{}) + 1; return align.Get() * ((element_space_unaligned + align.Get() - 1) / align.Get()); } template - __host__ __device__ index_t Get1dIndex(Is... is) const + __host__ __device__ static index_t Get1dIndex(Is... is) { static_assert(sizeof...(Is) == nDim, "number of multi-index is wrong"); @@ -194,7 +166,7 @@ struct ConstantTensorDescriptor index_t id = 0; - static_loop_n{}([&](auto IDim) { + static_for<0, nDim, 1>{}([&](auto IDim) { constexpr index_t idim = IDim.Get(); #if DEVICE_BACKEND_HIP id += __mul24(multi_id[idim], GetStride(IDim)); @@ -206,17 +178,26 @@ struct ConstantTensorDescriptor return id; } - __host__ __device__ constexpr auto Condense() const + __host__ __device__ static Array GetMultiIndex(index_t id) + { + Array multi_id; + + static_for<0, nDim - 1, 1>{}([&](auto IDim) { + constexpr index_t idim = IDim.Get(); + multi_id[idim] = id / GetStride(IDim); + id -= multi_id[idim] * GetStride(IDim); + }); + + multi_id[nDim - 1] = id / GetStride(Number{}); + + return multi_id; + } + + __host__ __device__ static constexpr auto Condense() { constexpr auto default_strides = calculate_default_strides(Lengths{}); return ConstantTensorDescriptor{}; } - - template - __host__ __device__ constexpr auto Vectorize(Number, Number) const - { - assert(false); // not implemented - } }; template diff --git a/src/include/Sequence.hip.hpp b/src/include/Sequence.hip.hpp index 55caf14591..4ea641a47a 100644 --- a/src/include/Sequence.hip.hpp +++ b/src/include/Sequence.hip.hpp @@ -17,6 +17,8 @@ struct Sequence return mData[I]; } + __host__ __device__ index_t operator[](index_t i) const { return mData[i]; } + // this is ugly, only for nDIm = 4 template __host__ __device__ constexpr auto ReorderByGetNewFromOld(Sequence) const @@ -90,3 +92,21 @@ __host__ __device__ constexpr auto Sequence::PopBack() const { return sequence_pop_back(Type{}); } + +template +struct accumulate_on_sequence_f +{ + template + __host__ __device__ constexpr index_t operator()(IDim) const + { + return Seq{}.Get(IDim{}); + } +}; + +template +__host__ __device__ constexpr index_t accumulate_on_sequence(Seq, Reduce, Number) +{ + constexpr index_t a = + static_const_reduce_n{}(accumulate_on_sequence_f{}, Reduce{}); + return Reduce{}(a, I); +} diff --git a/src/include/blockwise_2d_tensor_op.hip.hpp b/src/include/blockwise_2d_tensor_op.hip.hpp index 6521ddc0a5..cfbcce2a86 100644 --- a/src/include/blockwise_2d_tensor_op.hip.hpp +++ b/src/include/blockwise_2d_tensor_op.hip.hpp @@ -211,8 +211,7 @@ struct Blockwise2dTensorCopy1 constexpr index_t read_per_d1 = integer_divide_ceil(L1, DataPerRead); - constexpr auto ref_desc = - make_ConstantTensorDescriptor(Sequence{}); + constexpr auto ref_desc = make_ConstantTensorDescriptor(Sequence{}); constexpr index_t NLoop = ref_desc.GetElementSize() / BlockSize; @@ -225,10 +224,8 @@ struct Blockwise2dTensorCopy1 did[1] = is / ref_desc.GetStride(I1); - const index_t src_index = - src_desc.Get1dIndex(did[0], did[1] * DataPerRead); - const index_t dst_index = - dst_desc.Get1dIndex(did[0], did[1] * DataPerRead); + const index_t src_index = src_desc.Get1dIndex(did[0], did[1] * DataPerRead); + const index_t dst_index = dst_desc.Get1dIndex(did[0], did[1] * DataPerRead); *(reinterpret_cast(p_dst + dst_index)) = *(reinterpret_cast(p_src + src_index)); diff --git a/src/include/blockwise_3d_tensor_op.hip.hpp b/src/include/blockwise_3d_tensor_op.hip.hpp index c7a85431b2..a6fe257e55 100644 --- a/src/include/blockwise_3d_tensor_op.hip.hpp +++ b/src/include/blockwise_3d_tensor_op.hip.hpp @@ -54,8 +54,7 @@ struct Blockwise3dTensorCopy1 constexpr index_t read_per_d2 = integer_divide_ceil(L2, DataPerRead); - constexpr auto ref_desc = - make_ConstantTensorDescriptor(Sequence{}); + constexpr auto ref_desc = make_ConstantTensorDescriptor(Sequence{}); constexpr index_t NLoop = ref_desc.GetElementSize() / BlockSize; @@ -72,10 +71,8 @@ struct Blockwise3dTensorCopy1 did[2] = is / ref_desc.GetStride(I2); - const index_t src_index = - src_desc.Get1dIndex(did[0], did[1], did[2] * DataPerRead); - const index_t dst_index = - dst_desc.Get1dIndex(did[0], did[1], did[2] * DataPerRead); + const index_t src_index = src_desc.Get1dIndex(did[0], did[1], did[2] * DataPerRead); + const index_t dst_index = dst_desc.Get1dIndex(did[0], did[1], did[2] * DataPerRead); *(reinterpret_cast(p_dst + dst_index)) = *(reinterpret_cast(p_src + src_index)); diff --git a/src/include/blockwise_4d_tensor_op.hip.hpp b/src/include/blockwise_4d_tensor_op.hip.hpp index 45de7823b5..bd4124de57 100644 --- a/src/include/blockwise_4d_tensor_op.hip.hpp +++ b/src/include/blockwise_4d_tensor_op.hip.hpp @@ -340,11 +340,10 @@ struct BlockwiseChwnTensorCopyPadded constexpr index_t NLoop = ref_desc.GetElementSize() / BlockSize; const Float* p_src_tmp = - p_src + - src_desc.Get1dIndex(c_block_data_begin, - (ho_block_data_begin + h_block_pad_low) - h_global_pad_low, - (wo_block_data_begin + w_block_pad_low) - w_global_pad_low, - n_block_data_begin); + p_src + src_desc.Get1dIndex(c_block_data_begin, + (ho_block_data_begin + h_block_pad_low) - h_global_pad_low, + (wo_block_data_begin + w_block_pad_low) - w_global_pad_low, + n_block_data_begin); #if 0 if(get_thread_local_1d_id() == 0) @@ -494,7 +493,7 @@ struct Blockwise4dTensorCopy3 "wrrong! BlockSize is not big enough for ThreadPerDims!"); constexpr index_t num_active_thread = - thread_per_d0 * thread_per_d1 * thread_per_d2 * thread_per_d3; + accumulate_on_sequence(ThreadPerDims{}, mod_conv::multiplies{}, Number<1>{}); if(BlockSize > num_active_thread) { @@ -504,19 +503,18 @@ struct Blockwise4dTensorCopy3 } } - const index_t thread_id_d0 = - get_thread_local_1d_id() / (thread_per_d1 * thread_per_d2 * thread_per_d3); - index_t itmp = get_thread_local_1d_id() - - thread_id_d0 * (thread_per_d1 * thread_per_d2 * thread_per_d3); - const index_t thread_id_d1 = itmp / (thread_per_d2 * thread_per_d3); - itmp -= thread_id_d1 * (thread_per_d2 * thread_per_d3); - const index_t thread_id_d2 = itmp / thread_per_d3; - const index_t thread_id_d3 = itmp - thread_id_d2 * thread_per_d3; + constexpr auto thread_cluster_desc = make_ConstantTensorDescriptor(ThreadPerDims{}); + const auto thread_multi_id = thread_cluster_desc.GetMultiIndex(get_thread_local_1d_id()); - mSrcMyThreadOffset = SrcDesc{}.Get1dIndex( - thread_id_d0, thread_id_d1, thread_id_d2, thread_id_d3 * DataPerRead); - mDstMyThreadOffset = DstDesc{}.Get1dIndex( - thread_id_d0, thread_id_d1, thread_id_d2, thread_id_d3 * DataPerRead); + mSrcMyThreadOffset = SrcDesc{}.Get1dIndex(thread_multi_id[0], + thread_multi_id[1], + thread_multi_id[2], + thread_multi_id[3] * DataPerRead); + + mDstMyThreadOffset = DstDesc{}.Get1dIndex(thread_multi_id[0], + thread_multi_id[1], + thread_multi_id[2], + thread_multi_id[3] * DataPerRead); } __device__ void Run(const Float* __restrict__ p_src, Float* __restrict__ p_dst) const @@ -745,3 +743,113 @@ struct Blockwise4dTensorCopy3 } } }; + +template +struct Blockwise4dTensorCopyReorder1 +{ + __device__ void Run(const Float* __restrict__ p_src, Float* __restrict__ p_dst) const + { + auto f_copy = [](const Float& src, Float& dst) { dst = src; }; + + blockwise_4d_tensor_pointwise_operation_binary_reorder_by_get_dst_from_src( + SrcDesc{}, p_src, DstDesc{}, p_dst, SrcOpLengths{}, DstFromSrcReorder{}, f_copy); + } +}; + +#if 0 +template +struct Blockwise4dTensorCopyReorder3 +{ + index_t mSrcMyThreadOffset; + index_t mDstMyThreadOffset; + + __device__ Blockwise4dTensorCopyReorder3() + { + constexpr index_t nDim = SrcDesc{}.GetDimension(); + + static_assert(DstDesc{}.GetDimension() == nDim && SrcOpLengths::nDim == nDim && + SrcOpThreadPerDims::nDim == nDim && DstFromSrcReorder::nDim == nDim, + "wrong! nDim is not consistent\n"); + + // Src + static_assert(DataPerRead == 1 || DataPerRead == 2 || DataPerRead == 4, + "wrong! only support DataPerRead == 1, 2 or 4!\n"); + + static_assert(DataPerRead == 1 || SrcDesc{}.GetStride(Number{}) == 1, + "wrong! only support src.stride(nDim-1) == 1 if DataPerRead > 1!\n"); + + static_assert( + SrcDesc{}.GetStride(Number{}) % DataPerRead == 0, + "wrong! src.stride(nDim-2) should be multiple of DataPerRead to keep alignment"); + + static_assert(SrcSubLengths{}.Get(Number{}) % DataPerRead == 0, "wrong! SrcSubLengths[nDim-1] % DataPerRead != 0\n"); + + static_loop([](auto I){ + constexpr index_t src_len = SrcLengths{}.Get(I); + constexpr index_t src_sub_len = SrcSubLengths{}.Get(I); + constexpr index_t thread_per_dim = SrcThreadPerDims{}.Get(I); + static_assert(src_len % (src_sub_len * thread_per_dim) == 0, + "wrong! cannot evenly divide tensor lengths"); + }); + + constexpr index_t num_active_thread = accumulate_on_sequence(SrcOpThreadPerDims{}, mod_conv::multiplies{}, Number<1>{}); + + static_assert(BlockSize >= num_active_thread, + "wrong! BlockSize is not big enough for ThreadPerDims!"); + + if(BlockSize > num_active_thread) + { + if(get_thread_local_1d_id() >= num_active_thread) + { + return; + } + } + + const auto thread_multi_id = SrcOpThreadPerDims::GetMultiIndex(get_thread_local_1d_id()); + + + const index_t thread_id_d0 = + get_thread_local_1d_id() / (thread_per_d1 * thread_per_d2 * thread_per_d3); + index_t itmp = get_thread_local_1d_id() - + thread_id_d0 * (thread_per_d1 * thread_per_d2 * thread_per_d3); + const index_t thread_id_d1 = itmp / (thread_per_d2 * thread_per_d3); + itmp -= thread_id_d1 * (thread_per_d2 * thread_per_d3); + const index_t thread_id_d2 = itmp / thread_per_d3; + const index_t thread_id_d3 = itmp - thread_id_d2 * thread_per_d3; + + + mSrcMyThreadOffset = SrcDesc{}.Get1dIndex( + thread_id_d0, thread_id_d1, thread_id_d2, thread_id_d3 * DataPerRead); + + } + + __device__ static constexpr index_t GetRegisterClipboardSize() + { + static_assert(is_same::value, "wrong! only support float!\n"); + } + + __device__ void RunLoadRegisterClipboard(const Float* __restrict__ p_src, + Float* __restrict__ p_clipboard) const + { + } + + __device__ void RunStoreRegisterClipboard(const Float* __restrict__ p_clipboard, + Float* __restrict__ p_dst) const + { + } +}; +#endif diff --git a/src/include/blockwise_batched_gemm.hip.hpp b/src/include/blockwise_batched_gemm.hip.hpp index bdaab2e90a..364d3646d0 100644 --- a/src/include/blockwise_batched_gemm.hip.hpp +++ b/src/include/blockwise_batched_gemm.hip.hpp @@ -393,9 +393,8 @@ struct BlockwiseBatchGemmBlockABlockBThreadCTransANormalBNormalC_V2 { threadwise_matrix_copy( c_thread_sub_mtx, - p_c_thread + - c_thread_sub_mtx.Get1dIndex(m_repeat * MPerLevel1Cluster, - n_repeat * NPerLevel1Cluster), + p_c_thread + c_thread_sub_mtx.Get1dIndex(m_repeat * MPerLevel1Cluster, + n_repeat * NPerLevel1Cluster), c_block_mtx, p_c_block + c_block_mtx.Get1dIndex(m_repeat * MPerLevel1Cluster, diff --git a/src/include/blockwise_direct_convolution.hip.hpp b/src/include/blockwise_direct_convolution.hip.hpp index c79833f17d..5f1665aabd 100644 --- a/src/include/blockwise_direct_convolution.hip.hpp +++ b/src/include/blockwise_direct_convolution.hip.hpp @@ -93,11 +93,10 @@ __device__ void blockwise_direct_convolution(InBlockDesc, Float p_out_thread[out_thread_desc.GetElementSpace()]; threadwise_4d_tensor_copy(out_block_desc, - p_out_block + - out_block_desc.Get1dIndex(n_thread_data_begin, - k_thread_data_begin, - ho_thread_data_begin, - wo_thread_data_begin), + p_out_block + out_block_desc.Get1dIndex(n_thread_data_begin, + k_thread_data_begin, + ho_thread_data_begin, + wo_thread_data_begin), out_thread_desc, p_out_thread, out_thread_desc.GetLengths()); @@ -108,11 +107,10 @@ __device__ void blockwise_direct_convolution(InBlockDesc, // threadwise convolution threadwise_direct_convolution_2( in_thread_block_desc, - p_in_block + - in_block_desc.Get1dIndex(n_thread_data_begin, - c_thread_data_begin, - hi_thread_data_begin, - wi_thread_data_begin), + p_in_block + in_block_desc.Get1dIndex(n_thread_data_begin, + c_thread_data_begin, + hi_thread_data_begin, + wi_thread_data_begin), wei_thread_block_desc, p_wei_block + wei_block_desc.Get1dIndex(k_thread_data_begin, c_thread_data_begin, 0, 0), @@ -124,11 +122,10 @@ __device__ void blockwise_direct_convolution(InBlockDesc, threadwise_4d_tensor_copy(out_thread_desc, p_out_thread, out_block_desc, - p_out_block + - out_block_desc.Get1dIndex(n_thread_data_begin, - k_thread_data_begin, - ho_thread_data_begin, - wo_thread_data_begin), + p_out_block + out_block_desc.Get1dIndex(n_thread_data_begin, + k_thread_data_begin, + ho_thread_data_begin, + wo_thread_data_begin), out_thread_desc.GetLengths()); } } diff --git a/src/include/functional.hip.hpp b/src/include/functional.hip.hpp index c5403f0452..2cb91c1922 100644 --- a/src/include/functional.hip.hpp +++ b/src/include/functional.hip.hpp @@ -1,26 +1,41 @@ #pragma once #include "constant_integral.hip.hpp" -template -struct static_loop_n +template +struct static_for_impl { template __host__ __device__ void operator()(F f) const { - static_assert(NLoop > 1, "out-of-range"); + static_assert(Remaining % Increment == 0, "wrong! Remaining % Increment != 0"); + static_assert(Increment <= Remaining, "will go out-of-range"); - f(Number{}); - static_loop_n{}(f); + f(Number{}); + static_for_impl{}(f); } }; -template <> -struct static_loop_n<1> +template +struct static_for_impl +{ + template + __host__ __device__ void operator()(F) const + { + // do nothing + return; + } +}; + +template +struct static_for { template __host__ __device__ void operator()(F f) const { - f(Number<0>{}); + static_assert(NBegin < NEnd, "Wrong! we should have NBegin < NEnd"); + static_assert((NEnd - NBegin) % Increment == 0, + "Wrong! should satisfy (NEnd - NBegin) % Increment == 0"); + static_for_impl{}(f); } }; @@ -54,4 +69,19 @@ __host__ __device__ constexpr auto unpacker(F f) { return [=](auto xs_array){ f(xs...); }; } -#endif \ No newline at end of file +#endif + +namespace mod_conv { +template +struct multiplies +{ + __host__ __device__ constexpr T operator()(T a, T b) const { return a * b; } +}; + +template +struct plus +{ + __host__ __device__ constexpr T operator()(T a, T b) const { return a + b; } +}; + +} // namespace mod_conv diff --git a/src/include/gridwise_convolution_implicit_gemm_v1r1_chwn_cyxk_khwn.hip.hpp b/src/include/gridwise_convolution_implicit_gemm_v1r1_chwn_cyxk_khwn.hip.hpp index 4d2a93492f..2a26255c32 100644 --- a/src/include/gridwise_convolution_implicit_gemm_v1r1_chwn_cyxk_khwn.hip.hpp +++ b/src/include/gridwise_convolution_implicit_gemm_v1r1_chwn_cyxk_khwn.hip.hpp @@ -99,8 +99,8 @@ struct GridwiseConvolutionImplicitGemm_v1r1_chwn_cyxk_khwn // tensor view of blockwise input and weight in LDS // be careful of alignment - constexpr index_t max_align = - mod_conv::max(InBlockCopyDataPerRead, WeiBlockCopyDataPerRead, GemmDataPerReadA, GemmDataPerReadB); + constexpr index_t max_align = mod_conv::max( + InBlockCopyDataPerRead, WeiBlockCopyDataPerRead, GemmDataPerReadA, GemmDataPerReadB); constexpr auto in_chwn_block_desc = make_ConstantTensorDescriptor_aligned( Sequence{}, Number{}); @@ -135,16 +135,15 @@ struct GridwiseConvolutionImplicitGemm_v1r1_chwn_cyxk_khwn InBlockCopyDataPerRead>{}; #endif - - // blockwise wei copy - // format is [CPerBlock*Y*X,KPerBlock] - const auto blockwise_wei_copy = - Blockwise2dTensorCopy3{}; + // blockwise wei copy + // format is [CPerBlock*Y*X,KPerBlock] + const auto blockwise_wei_copy = + Blockwise2dTensorCopy3{}; // a series of blockwise batched GEMM // C_matrix += transpose(A_matrix) * B_matrix @@ -202,9 +201,8 @@ struct GridwiseConvolutionImplicitGemm_v1r1_chwn_cyxk_khwn threadwise_4d_tensor_set_zero(out_khwn_thread_desc, p_out_thread); const Float* p_in_global_block_offset = - p_in_global + - in_chwn_global_desc.Get1dIndex( - 0, hi_block_data_begin, wi_block_data_begin, n_block_data_begin); + p_in_global + in_chwn_global_desc.Get1dIndex( + 0, hi_block_data_begin, wi_block_data_begin, n_block_data_begin); const Float* p_wei_global_block_offset = p_wei_global + wei_cyxk_global_desc.Get1dIndex(0, 0, 0, k_block_data_begin); @@ -323,17 +321,16 @@ struct GridwiseConvolutionImplicitGemm_v1r1_chwn_cyxk_khwn } #endif - threadwise_10d_tensor_copy( - out_10d_thread_desc, - p_out_thread, - out_10d_global_desc, - p_out_global + - out_khwn_global_desc.Get1dIndex(k_block_data_begin + k_thread_data_begin, - ho_block_data_begin + ho_thread_data_begin, - wo_block_data_begin + wo_thread_data_begin, - n_block_data_begin + n_thread_data_begin), - out_10d_thread_desc.GetLengths(), - Number{}); + threadwise_10d_tensor_copy(out_10d_thread_desc, + p_out_thread, + out_10d_global_desc, + p_out_global + out_khwn_global_desc.Get1dIndex( + k_block_data_begin + k_thread_data_begin, + ho_block_data_begin + ho_thread_data_begin, + wo_block_data_begin + wo_thread_data_begin, + n_block_data_begin + n_thread_data_begin), + out_10d_thread_desc.GetLengths(), + Number{}); #endif } }; diff --git a/src/include/gridwise_convolution_implicit_gemm_v1r1_chwn_cyxk_khwn_lds_double_buffer.hip.hpp b/src/include/gridwise_convolution_implicit_gemm_v1r1_chwn_cyxk_khwn_lds_double_buffer.hip.hpp index 365697ecfd..3a024bbaaa 100644 --- a/src/include/gridwise_convolution_implicit_gemm_v1r1_chwn_cyxk_khwn_lds_double_buffer.hip.hpp +++ b/src/include/gridwise_convolution_implicit_gemm_v1r1_chwn_cyxk_khwn_lds_double_buffer.hip.hpp @@ -190,9 +190,8 @@ struct GridwiseConvolutionImplicitGemm_v1r1_chwn_cyxk_khwn_lds_double_buffer __shared__ Float p_wei_block_double[2 * wei_block_space]; const Float* p_in_global_block_offset = - p_in_global + - in_chwn_global_desc.Get1dIndex( - 0, hi_block_data_begin, wi_block_data_begin, n_block_data_begin); + p_in_global + in_chwn_global_desc.Get1dIndex( + 0, hi_block_data_begin, wi_block_data_begin, n_block_data_begin); const Float* p_wei_global_block_offset = p_wei_global + wei_cyxk_global_desc.Get1dIndex(0, 0, 0, k_block_data_begin); @@ -393,17 +392,16 @@ struct GridwiseConvolutionImplicitGemm_v1r1_chwn_cyxk_khwn_lds_double_buffer } #endif - threadwise_10d_tensor_copy( - out_10d_thread_desc, - p_out_thread, - out_10d_global_desc, - p_out_global + - out_khwn_global_desc.Get1dIndex(k_block_data_begin + k_thread_data_begin, - ho_block_data_begin + ho_thread_data_begin, - wo_block_data_begin + wo_thread_data_begin, - n_block_data_begin + n_thread_data_begin), - out_10d_thread_desc.GetLengths(), - Number{}); + threadwise_10d_tensor_copy(out_10d_thread_desc, + p_out_thread, + out_10d_global_desc, + p_out_global + out_khwn_global_desc.Get1dIndex( + k_block_data_begin + k_thread_data_begin, + ho_block_data_begin + ho_thread_data_begin, + wo_block_data_begin + wo_thread_data_begin, + n_block_data_begin + n_thread_data_begin), + out_10d_thread_desc.GetLengths(), + Number{}); #endif } }; diff --git a/src/include/gridwise_convolution_implicit_gemm_v1r2_chwn_cyxk_khwn.hip.hpp b/src/include/gridwise_convolution_implicit_gemm_v1r2_chwn_cyxk_khwn.hip.hpp index bc05d3ffd0..454ed30392 100644 --- a/src/include/gridwise_convolution_implicit_gemm_v1r2_chwn_cyxk_khwn.hip.hpp +++ b/src/include/gridwise_convolution_implicit_gemm_v1r2_chwn_cyxk_khwn.hip.hpp @@ -101,8 +101,8 @@ struct GridwiseConvolutionImplicitGemm_v1r2_chwn_cyxk_khwn // LDS tensor view // be careful of alignment - constexpr index_t max_align = - mod_conv::max(InBlockCopyDataPerRead, WeiBlockCopyDataPerRead, GemmDataPerReadA, GemmDataPerReadB); + constexpr index_t max_align = mod_conv::max( + InBlockCopyDataPerRead, WeiBlockCopyDataPerRead, GemmDataPerReadA, GemmDataPerReadB); constexpr auto in_c_h_w_n_block_desc = make_ConstantTensorDescriptor_aligned( Sequence{}, Number{}); @@ -116,8 +116,8 @@ struct GridwiseConvolutionImplicitGemm_v1r2_chwn_cyxk_khwn // blockwise copy // input: format is [C, Hi, Wi, N] - const auto blockwise_in_copy = #if 0 + const auto blockwise_in_copy = Blockwise4dTensorCopy1{}; #else + const auto blockwise_in_copy = Blockwise4dTensorCopy3{}, - Number{}, - Number{}); + constexpr auto a_c_k_block_mtx_desc = make_ConstantMatrixDescriptor( + Number{}, Number{}, Number{}); constexpr auto b_c_wn_block_mtx_desc = make_ConstantMatrixDescriptor(Number{}, @@ -187,8 +186,10 @@ struct GridwiseConvolutionImplicitGemm_v1r2_chwn_cyxk_khwn GemmDataPerReadB>{}; // LDS: be careful of alignment - constexpr index_t in_block_space = in_c_h_w_n_block_desc.GetElementSpace(Number{}); - constexpr index_t wei_block_space = wei_c_x_k_block_desc.GetElementSpace(Number{}); + constexpr index_t in_block_space = + in_c_h_w_n_block_desc.GetElementSpace(Number{}); + constexpr index_t wei_block_space = + wei_c_x_k_block_desc.GetElementSpace(Number{}); __shared__ Float p_in_block[in_block_space]; __shared__ Float p_wei_block[wei_block_space]; @@ -213,9 +214,8 @@ struct GridwiseConvolutionImplicitGemm_v1r2_chwn_cyxk_khwn threadwise_4d_tensor_set_zero(out_k_h_w_n_thread_desc, p_out_thread); const Float* p_in_global_block_offset = - p_in_global + - in_c_h_w_n_global_desc.Get1dIndex( - 0, hi_block_data_begin, wi_block_data_begin, n_block_data_begin); + p_in_global + in_c_h_w_n_global_desc.Get1dIndex( + 0, hi_block_data_begin, wi_block_data_begin, n_block_data_begin); const Float* p_wei_global_block_offset = p_wei_global + wei_c_y_x_k_global_desc.Get1dIndex(0, 0, 0, k_block_data_begin); @@ -227,7 +227,7 @@ struct GridwiseConvolutionImplicitGemm_v1r2_chwn_cyxk_khwn for(index_t y = 0; y < Y; ++y) { blockwise_in_copy.Run(p_in_global_block_offset + - in_c_h_w_n_global_desc.Get1dIndex(0, y, 0, 0), + in_c_h_w_n_global_desc.Get1dIndex(0, y, 0, 0), p_in_block); blockwise_wei_copy.Run(p_wei_global_block_offset + @@ -239,9 +239,9 @@ struct GridwiseConvolutionImplicitGemm_v1r2_chwn_cyxk_khwn for(index_t x = 0; x < X; ++x) { blockwise_batch_gemm.Run(p_wei_block + wei_c_x_k_block_desc.Get1dIndex(0, x, 0), - p_in_block + in_c_h_w_n_block_desc.Get1dIndex(0, 0, x, 0), + p_in_block + + in_c_h_w_n_block_desc.Get1dIndex(0, 0, x, 0), p_out_thread); - } __syncthreads(); @@ -321,17 +321,16 @@ struct GridwiseConvolutionImplicitGemm_v1r2_chwn_cyxk_khwn } #endif - threadwise_10d_tensor_copy( - out_10d_thread_desc, - p_out_thread, - out_10d_global_desc, - p_out_global + - out_k_h_w_n_global_desc.Get1dIndex(k_block_data_begin + k_thread_data_begin, - ho_block_data_begin + ho_thread_data_begin, - wo_block_data_begin + wo_thread_data_begin, - n_block_data_begin + n_thread_data_begin), - out_10d_thread_desc.GetLengths(), - Number{}); + threadwise_10d_tensor_copy(out_10d_thread_desc, + p_out_thread, + out_10d_global_desc, + p_out_global + out_k_h_w_n_global_desc.Get1dIndex( + k_block_data_begin + k_thread_data_begin, + ho_block_data_begin + ho_thread_data_begin, + wo_block_data_begin + wo_thread_data_begin, + n_block_data_begin + n_thread_data_begin), + out_10d_thread_desc.GetLengths(), + Number{}); #endif } }; diff --git a/src/include/gridwise_convolution_implicit_gemm_v2_chwn_cyxk_khwn_lds_double_buffer.hip.hpp b/src/include/gridwise_convolution_implicit_gemm_v2_chwn_cyxk_khwn_lds_double_buffer.hip.hpp index 90370b7b4f..2e6fdcf049 100644 --- a/src/include/gridwise_convolution_implicit_gemm_v2_chwn_cyxk_khwn_lds_double_buffer.hip.hpp +++ b/src/include/gridwise_convolution_implicit_gemm_v2_chwn_cyxk_khwn_lds_double_buffer.hip.hpp @@ -365,14 +365,13 @@ struct GridwiseConvolutionImplicitGemm_v2_chwn_cyxk_khwn_lds_double_buffer constexpr auto out_kb_global_desc = make_ConstantTensorDescriptor(Sequence{}); - threadwise_6d_tensor_copy( - out_6d_thread_desc, - p_out_thread, - out_6d_global_desc, - p_out_global + - out_kb_global_desc.Get1dIndex(k_thread_data_begin, b_thread_data_begin), - out_6d_thread_desc.GetLengths(), - Number{}); + threadwise_6d_tensor_copy(out_6d_thread_desc, + p_out_thread, + out_6d_global_desc, + p_out_global + out_kb_global_desc.Get1dIndex( + k_thread_data_begin, b_thread_data_begin), + out_6d_thread_desc.GetLengths(), + Number{}); } else { diff --git a/src/include/gridwise_direct_convolution_1.hip.hpp b/src/include/gridwise_direct_convolution_1.hip.hpp index 7723fb78b4..29c7e86b37 100644 --- a/src/include/gridwise_direct_convolution_1.hip.hpp +++ b/src/include/gridwise_direct_convolution_1.hip.hpp @@ -113,11 +113,10 @@ __global__ void gridwise_direct_convolution_1(const Float* const __restrict__ p_ c_block_work_begin += CPerBlock) { // copy input tensor to LDS - blockwise_in_copy.Run(p_in_global + - in_global_desc.Get1dIndex(n_block_work_begin, - c_block_work_begin, - hi_block_work_begin, - wi_block_work_begin), + blockwise_in_copy.Run(p_in_global + in_global_desc.Get1dIndex(n_block_work_begin, + c_block_work_begin, + hi_block_work_begin, + wi_block_work_begin), p_in_block); // copy weight tensor to LDS @@ -144,9 +143,9 @@ __global__ void gridwise_direct_convolution_1(const Float* const __restrict__ p_ } // copy output tensor from LDS to device mem - blockwise_out_copy.Run( - p_out_block, - p_out_global + - out_global_desc.Get1dIndex( - n_block_work_begin, k_block_work_begin, ho_block_work_begin, wo_block_work_begin)); + blockwise_out_copy.Run(p_out_block, + p_out_global + out_global_desc.Get1dIndex(n_block_work_begin, + k_block_work_begin, + ho_block_work_begin, + wo_block_work_begin)); } diff --git a/src/include/gridwise_direct_convolution_2_nchw_kcyx_nkhw.hip.hpp b/src/include/gridwise_direct_convolution_2_nchw_kcyx_nkhw.hip.hpp index cbebe28f17..c585c4495b 100644 --- a/src/include/gridwise_direct_convolution_2_nchw_kcyx_nkhw.hip.hpp +++ b/src/include/gridwise_direct_convolution_2_nchw_kcyx_nkhw.hip.hpp @@ -175,18 +175,16 @@ gridwise_direct_convolution_2_nchw_kcyx_nkhw(const Float* const __restrict__ p_i c_block_data_begin += CPerBlock, __syncthreads()) { // copy input tensor to LDS - blockwise_in_copy.Run(p_in_global + - in_nchw_global_desc.Get1dIndex(n_block_data_begin, - c_block_data_begin, - hi_block_data_begin, - wi_block_data_begin), + blockwise_in_copy.Run(p_in_global + in_nchw_global_desc.Get1dIndex(n_block_data_begin, + c_block_data_begin, + hi_block_data_begin, + wi_block_data_begin), p_in_block); // copy weight tensor to LDS - blockwise_wei_copy.Run( - p_wei_global + - wei_kcyx_global_desc.Get1dIndex(k_block_data_begin, c_block_data_begin, 0, 0), - p_wei_block); + blockwise_wei_copy.Run(p_wei_global + wei_kcyx_global_desc.Get1dIndex( + k_block_data_begin, c_block_data_begin, 0, 0), + p_wei_block); __syncthreads(); @@ -196,11 +194,10 @@ gridwise_direct_convolution_2_nchw_kcyx_nkhw(const Float* const __restrict__ p_i #if 1 threadwise_direct_convolution_2( in_nchw_thread_block_desc, - p_in_block + - in_nchw_block_desc.Get1dIndex(n_thread_data_begin, - c_thread_data, - hi_thread_data_begin, - wi_thread_data_begin), + p_in_block + in_nchw_block_desc.Get1dIndex(n_thread_data_begin, + c_thread_data, + hi_thread_data_begin, + wi_thread_data_begin), wei_kcyx_thread_block_desc, p_wei_block + wei_kcyx_block_desc.Get1dIndex(k_thread_data_begin, c_thread_data, 0, 0), @@ -209,11 +206,10 @@ gridwise_direct_convolution_2_nchw_kcyx_nkhw(const Float* const __restrict__ p_i #elif 0 threadwise_direct_convolution_3( in_nchw_thread_block_desc, - p_in_block + - in_nchw_block_desc.Get1dIndex(n_thread_data_begin, - c_thread_data, - hi_thread_data_begin, - wi_thread_data_begin), + p_in_block + in_nchw_block_desc.Get1dIndex(n_thread_data_begin, + c_thread_data, + hi_thread_data_begin, + wi_thread_data_begin), wei_kcyx_thread_block_desc, p_wei_block + wei_kcyx_block_desc.Get1dIndex(k_thread_data_begin, c_thread_data, 0, 0), @@ -228,10 +224,9 @@ gridwise_direct_convolution_2_nchw_kcyx_nkhw(const Float* const __restrict__ p_i out_nkhw_thread_desc, p_out_thread, out_nkhw_global_desc, - p_out_global + - out_nkhw_global_desc.Get1dIndex(n_block_data_begin + n_thread_data_begin, - k_block_data_begin + k_thread_data_begin, - ho_block_data_begin + ho_thread_data_begin, - wo_block_data_begin + wo_thread_data_begin), + p_out_global + out_nkhw_global_desc.Get1dIndex(n_block_data_begin + n_thread_data_begin, + k_block_data_begin + k_thread_data_begin, + ho_block_data_begin + ho_thread_data_begin, + wo_block_data_begin + wo_thread_data_begin), out_nkhw_thread_desc.GetLengths()); } diff --git a/src/include/gridwise_direct_convolution_2_vectorized_nchw_kcyx_nkhw.hip.hpp b/src/include/gridwise_direct_convolution_2_vectorized_nchw_kcyx_nkhw.hip.hpp index 0b83eccc3c..df8c53f107 100644 --- a/src/include/gridwise_direct_convolution_2_vectorized_nchw_kcyx_nkhw.hip.hpp +++ b/src/include/gridwise_direct_convolution_2_vectorized_nchw_kcyx_nkhw.hip.hpp @@ -198,10 +198,9 @@ __global__ void gridwise_direct_convolution_2_vectorized_nchw_kcyx_nkhw( p_in_vec_block); // copy weight tensor to LDS - blockwise_wei_copy.Run( - p_wei_vec_global + - wei_kcyx_vec_global_desc.Get1dIndex(k_block_data_begin, c_block_data_begin, 0, 0), - p_wei_vec_block); + blockwise_wei_copy.Run(p_wei_vec_global + wei_kcyx_vec_global_desc.Get1dIndex( + k_block_data_begin, c_block_data_begin, 0, 0), + p_wei_vec_block); __syncthreads(); @@ -211,11 +210,10 @@ __global__ void gridwise_direct_convolution_2_vectorized_nchw_kcyx_nkhw( #if 1 threadwise_direct_convolution_2( in_nchw_vec_thread_block_desc, - p_in_vec_block + - in_nchw_vec_block_desc.Get1dIndex(n_thread_data_begin, - c_thread_data, - hi_thread_data_begin, - wi_thread_data_begin), + p_in_vec_block + in_nchw_vec_block_desc.Get1dIndex(n_thread_data_begin, + c_thread_data, + hi_thread_data_begin, + wi_thread_data_begin), wei_kcyx_vec_thread_block_desc, p_wei_vec_block + wei_kcyx_vec_block_desc.Get1dIndex(k_thread_data_begin, c_thread_data, 0, 0), @@ -224,11 +222,10 @@ __global__ void gridwise_direct_convolution_2_vectorized_nchw_kcyx_nkhw( #elif 0 threadwise_direct_convolution_3( in_nchw_vec_thread_block_desc, - p_in_vec_block + - in_nchw_vec_block_desc.Get1dIndex(n_thread_data_begin, - c_thread_data, - hi_thread_data_begin, - wi_thread_data_begin), + p_in_vec_block + in_nchw_vec_block_desc.Get1dIndex(n_thread_data_begin, + c_thread_data, + hi_thread_data_begin, + wi_thread_data_begin), wei_kcyx_vec_thread_block_desc, p_wei_vec_block + wei_kcyx_vec_block_desc.Get1dIndex(k_thread_data_begin, c_thread_data, 0, 0), @@ -243,10 +240,9 @@ __global__ void gridwise_direct_convolution_2_vectorized_nchw_kcyx_nkhw( out_nkhw_thread_desc, p_out_thread, out_nkhw_global_desc, - p_out_global + - out_nkhw_global_desc.Get1dIndex(n_block_data_begin + n_thread_data_begin, - k_block_data_begin + k_thread_data_begin, - ho_block_data_begin + ho_thread_data_begin, - wo_block_data_begin + wo_thread_data_begin), + p_out_global + out_nkhw_global_desc.Get1dIndex(n_block_data_begin + n_thread_data_begin, + k_block_data_begin + k_thread_data_begin, + ho_block_data_begin + ho_thread_data_begin, + wo_block_data_begin + wo_thread_data_begin), out_nkhw_thread_desc.GetLengths()); } diff --git a/src/include/gridwise_implicit_gemm_convolution_1_chwn_cyxk_khwn_padded.hip.hpp b/src/include/gridwise_implicit_gemm_convolution_1_chwn_cyxk_khwn_padded.hip.hpp index f04a283fcf..fe1ee2191f 100644 --- a/src/include/gridwise_implicit_gemm_convolution_1_chwn_cyxk_khwn_padded.hip.hpp +++ b/src/include/gridwise_implicit_gemm_convolution_1_chwn_cyxk_khwn_padded.hip.hpp @@ -283,11 +283,10 @@ __global__ void gridwise_implicit_gemm_convolution_1_chwn_cyxk_khwn_padded( out_hkwn_thread_desc, p_out_thread, out_khwn_global_desc, - p_out_global + - out_khwn_global_desc.Get1dIndex(k_block_data_begin + k_thread_data_begin, - ho_block_data_begin + ho_thread_data_begin, - wo_block_data_begin + wo_thread_data_begin, - n_block_data_begin + n_thread_data_begin), + p_out_global + out_khwn_global_desc.Get1dIndex(k_block_data_begin + k_thread_data_begin, + ho_block_data_begin + ho_thread_data_begin, + wo_block_data_begin + wo_thread_data_begin, + n_block_data_begin + n_thread_data_begin), out_hkwn_thread_desc.GetLengths(), reorder_khwn_from_hkwn); } diff --git a/src/include/tensor.hpp b/src/include/tensor.hpp index 1ebfef0c5d..d0c785c16e 100644 --- a/src/include/tensor.hpp +++ b/src/include/tensor.hpp @@ -22,7 +22,8 @@ std::ostream& LogRange(std::ostream& os, Range&& range, std::string delim) return os; } -typedef enum { +typedef enum +{ Half = 0, Float = 1, } DataType_t;