diff --git a/driver/device_convolution_implicit_gemm_v1_chwn_cyxk_khwn.hpp b/driver/device_convolution_implicit_gemm_v1_chwn_cyxk_khwn.hpp index 9a4e28b410..d74b41ff22 100644 --- a/driver/device_convolution_implicit_gemm_v1_chwn_cyxk_khwn.hpp +++ b/driver/device_convolution_implicit_gemm_v1_chwn_cyxk_khwn.hpp @@ -38,7 +38,7 @@ void device_convolution_implicit_gemm_v1_chwn_cyxk_khwn(InDesc, constexpr index_t X = wei_kcyx_desc.GetLength(I3); // reorder weight - auto wei_cyxk_desc = make_packed_ConstantTensorDescriptor(Sequence{}); + auto wei_cyxk_desc = make_ConstantTensorDescriptor_default_rank_packed(Sequence{}); ostream_ConstantTensorDescriptor(wei_cyxk_desc, std::cout << "wei_cyxk_desc: "); Tensor wei_cyxk(make_TensorDescriptor(wei_cyxk_desc)); @@ -51,7 +51,7 @@ void device_convolution_implicit_gemm_v1_chwn_cyxk_khwn(InDesc, std::thread::hardware_concurrency()); // reorder input - auto in_chwn_desc = make_packed_ConstantTensorDescriptor(Sequence{}); + auto in_chwn_desc = make_ConstantTensorDescriptor_default_rank_packed(Sequence{}); ostream_ConstantTensorDescriptor(in_chwn_desc, std::cout << "in_chwn_desc: "); Tensor in_chwn(make_TensorDescriptor(in_chwn_desc)); @@ -64,7 +64,8 @@ void device_convolution_implicit_gemm_v1_chwn_cyxk_khwn(InDesc, std::thread::hardware_concurrency()); // output - auto out_khwn_desc = make_packed_ConstantTensorDescriptor(Sequence{}); + auto out_khwn_desc = + make_ConstantTensorDescriptor_default_rank_packed(Sequence{}); ostream_ConstantTensorDescriptor(out_khwn_desc, std::cout << "out_khwn_desc: "); Tensor out_khwn(make_TensorDescriptor(out_khwn_desc)); diff --git a/driver/device_convolution_implicit_gemm_v1_nchw_cyxk_khwn.hpp b/driver/device_convolution_implicit_gemm_v1_nchw_cyxk_khwn.hpp index 80c15a1a50..a4381b370b 100644 --- a/driver/device_convolution_implicit_gemm_v1_nchw_cyxk_khwn.hpp +++ b/driver/device_convolution_implicit_gemm_v1_nchw_cyxk_khwn.hpp @@ -37,7 +37,7 @@ void device_convolution_implicit_gemm_v1_nchw_cyxk_khwn(InDesc, constexpr index_t X = wei_kcyx_desc.GetLength(I3); // reorder weight - auto wei_cyxk_desc = make_packed_ConstantTensorDescriptor(Sequence{}); + auto wei_cyxk_desc = make_ConstantTensorDescriptor_default_rank_packed(Sequence{}); ostream_ConstantTensorDescriptor(wei_cyxk_desc, std::cout << "wei_cyxk_desc: "); Tensor wei_cyxk(make_TensorDescriptor(wei_cyxk_desc)); @@ -50,7 +50,8 @@ void device_convolution_implicit_gemm_v1_nchw_cyxk_khwn(InDesc, std::thread::hardware_concurrency()); // output - auto out_khwn_desc = make_packed_ConstantTensorDescriptor(Sequence{}); + auto out_khwn_desc = + make_ConstantTensorDescriptor_default_rank_packed(Sequence{}); ostream_ConstantTensorDescriptor(out_khwn_desc, std::cout << "out_khwn_desc: "); Tensor out_khwn(make_TensorDescriptor(out_khwn_desc)); diff --git a/driver/device_convolution_implicit_gemm_v1_nchw_cyxk_nkhw.hpp b/driver/device_convolution_implicit_gemm_v1_nchw_cyxk_nkhw.hpp index 7ab9fd7c75..04e2baf225 100644 --- a/driver/device_convolution_implicit_gemm_v1_nchw_cyxk_nkhw.hpp +++ b/driver/device_convolution_implicit_gemm_v1_nchw_cyxk_nkhw.hpp @@ -36,7 +36,7 @@ void device_convolution_implicit_gemm_v1_nchw_cyxk_nkhw(InDesc, constexpr index_t X = wei_kcyx_desc.GetLength(I3); // reorder weight - auto wei_cyxk_desc = make_packed_ConstantTensorDescriptor(Sequence{}); + auto wei_cyxk_desc = make_ConstantTensorDescriptor_default_rank_packed(Sequence{}); ostream_ConstantTensorDescriptor(wei_cyxk_desc, std::cout << "wei_cyxk_desc: "); Tensor wei_cyxk(make_TensorDescriptor(wei_cyxk_desc)); @@ -57,7 +57,7 @@ void device_convolution_implicit_gemm_v1_nchw_cyxk_nkhw(InDesc, wei_cyxk_device_buf.ToDevice(wei_cyxk.mData.data()); out_nkhw_device_buf.ToDevice(out_nkhw.mData.data()); -#if 0 +#if 1 // for 3x3, 34x34, v1r3, Pascal constexpr index_t BlockSize = 128; @@ -92,7 +92,7 @@ void device_convolution_implicit_gemm_v1_nchw_cyxk_nkhw(InDesc, constexpr index_t WeiBlockCopyDataPerRead_K = 4; constexpr index_t OutThreadCopyDataPerWrite_W = 2; -#elif 0 +#elif 1 // for 3x3, 34x34, v1r3, Vega 20, WoPerBlock = 32 constexpr index_t BlockSize = 256; @@ -162,7 +162,7 @@ void device_convolution_implicit_gemm_v1_nchw_cyxk_nkhw(InDesc, constexpr index_t WeiBlockCopyDataPerRead_K = 4; constexpr index_t OutThreadCopyDataPerWrite_W = 2; -#elif 1 +#elif 0 // for 3x3, 34x34, v1r3, Vega 20, WoPerBlock = 8 constexpr index_t BlockSize = 256; diff --git a/driver/device_convolution_implicit_gemm_v3_nchw_cyxk_nkhw.hpp b/driver/device_convolution_implicit_gemm_v3_nchw_cyxk_nkhw.hpp index 2960471635..ee8028e55c 100644 --- a/driver/device_convolution_implicit_gemm_v3_nchw_cyxk_nkhw.hpp +++ b/driver/device_convolution_implicit_gemm_v3_nchw_cyxk_nkhw.hpp @@ -35,7 +35,7 @@ void device_convolution_implicit_gemm_v3_nchw_cyxk_nkhw(InDesc, constexpr index_t X = wei_kcyx_desc.GetLength(I3); // reorder weight - auto wei_cyxk_desc = make_ConstantTensorDescriptor(Sequence{}); + auto wei_cyxk_desc = make_ConstantTensorDescriptor_default_rank_packed(Sequence{}); ostream_ConstantTensorDescriptor(wei_cyxk_desc, std::cout << "wei_cyxk_desc: "); Tensor wei_cyxk(make_TensorDescriptor(wei_cyxk_desc)); @@ -56,37 +56,40 @@ void device_convolution_implicit_gemm_v3_nchw_cyxk_nkhw(InDesc, wei_cyxk_device_buf.ToDevice(wei_cyxk.mData.data()); out_nkhw_device_buf.ToDevice(out_nkhw.mData.data()); + constexpr index_t N1 = 2; + constexpr index_t N2 = 4; + + constexpr index_t B = (N * Ho * Wo) / (N1 * N2); + #if 1 - // for 3x3, 28x28, v3, Pascal - constexpr index_t BlockSize = 128; + // for 3x3, 28x28, v3 + constexpr index_t BlockSize = 256; constexpr index_t BPerBlock = 16; constexpr index_t KPerBlock = 128; constexpr index_t CPerBlock = 8; - constexpr index_t BPerThread = 1; - constexpr index_t KPerThread = 8; - constexpr index_t GemmMPerThreadSubC = 4; constexpr index_t GemmNPerThreadSubC = 4; constexpr index_t GemmMLevel0Cluster = 4; - constexpr index_t GemmNLevel0Cluster = 2; + constexpr index_t GemmNLevel0Cluster = 4; constexpr index_t GemmMLevel1Cluster = 4; - constexpr index_t GemmNLevel1Cluster = 2; + constexpr index_t GemmNLevel1Cluster = 4; constexpr index_t GemmKPerThreadLoop = 1; constexpr index_t GemmDataPerReadA = 4; constexpr index_t GemmDataPerReadB = 4; - using InBlockReorderSrcSubLengths_NCHW = Sequence<4, 1, 1, 1>; - using InBlockReorderSrcClusterLengths_NCHW = Sequence<4, 8, 2, 2>; - using InBlockReorderMapThreadCluster2SrcCluster_CHNW2NCHW = Sequence<1, 2, 0, 3>; + using InBlockCopySubLengths_N1_N2_C_B = Sequence<1, 4, 1, 1>; + using InBlockCopyClusterLengths_N1_N2_C_B = Sequence<2, 1, 8, 16>; - constexpr index_t WeiBlockCopyDataPerRead_K = 4; + constexpr index_t InBlockCopySrcDataPerRead_B = 1; + constexpr index_t InBlockCopyDstDataPerWrite_N2 = 4; + + constexpr index_t WeiBlockCopyDataPerAccess_K = 4; #endif constexpr index_t GridSize = - ((N + NPerBlock - 1) / NPerBlock) * ((K + KPerBlock - 1) / KPerBlock) * - ((Ho + HoPerBlock - 1) / HoPerBlock) * ((Wo + WoPerBlock - 1) / WoPerBlock); + ((B + BPerBlock - 1) / BPerBlock) * ((K + KPerBlock - 1) / KPerBlock); printf("%s: BlockSize %u, GridSize %u \n", __func__, BlockSize, GridSize); @@ -102,15 +105,11 @@ void device_convolution_implicit_gemm_v3_nchw_cyxk_nkhw(InDesc, decltype(in_nchw_desc), decltype(wei_cyxk_desc), decltype(out_nkhw_desc), - NPerBlock, + BPerBlock, KPerBlock, CPerBlock, - HoPerBlock, - WoPerBlock, - NPerThread, - KPerThread, - HoPerThread, - WoPerThread, + N1, + N2, GemmMPerThreadSubC, GemmNPerThreadSubC, GemmMLevel0Cluster, @@ -120,14 +119,11 @@ void device_convolution_implicit_gemm_v3_nchw_cyxk_nkhw(InDesc, GemmKPerThreadLoop, GemmDataPerReadA, GemmDataPerReadB, - InBlockReorderSrcSubLengths_NCHW, - InBlockReorderSrcClusterLengths_NCHW, - InBlockReorderMapThreadCluster2SrcCluster_CHNW2NCHW, - InBlockReorderDataPerRead_W, - InBlockReorderDataPerWrite_N, - WeiBlockCopyClusterLengths, - WeiBlockCopyDataPerRead_K, - OutThreadCopyDataPerWrite_W>{}; + InBlockCopySubLengths_N1_N2_C_B, + InBlockCopyClusterLengths_N1_N2_C_B, + InBlockCopySrcDataPerRead_B, + InBlockCopyDstDataPerWrite_N2, + WeiBlockCopyDataPerAccess_K>{}; float time = launch_kernel(run_gridwise_convolution, dim3(GridSize), diff --git a/driver/driver.hip.cpp b/driver/driver.hip.cpp index 3adab07387..46cad0a87c 100644 --- a/driver/driver.hip.cpp +++ b/driver/driver.hip.cpp @@ -13,7 +13,7 @@ #include "device_convolution_implicit_gemm_v1_nchw_cyxk_khwn.hpp" #include "device_convolution_implicit_gemm_v1_nchw_cyxk_nkhw.hpp" #include "device_convolution_implicit_gemm_v2_chwn_cyxk_khwn.hpp" -//#include "device_convolution_implicit_gemm_v3_nchw_cyxk_nkhw.hpp" +#include "device_convolution_implicit_gemm_v3_nchw_cyxk_nkhw.hpp" struct GeneratorTensor_1 { @@ -548,8 +548,8 @@ int main(int argc, char* argv[]) auto lower_pads = Sequence{}; auto upper_pads = Sequence{}; - auto in_nchw_desc = make_packed_ConstantTensorDescriptor(Sequence{}); - auto wei_kcyx_desc = make_packed_ConstantTensorDescriptor(Sequence{}); + auto in_nchw_desc = make_ConstantTensorDescriptor_default_rank_packed(Sequence{}); + auto wei_kcyx_desc = make_ConstantTensorDescriptor_default_rank_packed(Sequence{}); auto out_nkhw_desc = get_convolution_with_padding_output_default_4d_tensor_descriptor( in_nchw_desc, wei_kcyx_desc, lower_pads, upper_pads); @@ -612,11 +612,11 @@ int main(int argc, char* argv[]) device_convolution_implicit_gemm_v1_chwn_cyxk_khwn #elif 0 device_convolution_implicit_gemm_v1_nchw_cyxk_khwn -#elif 1 +#elif 0 device_convolution_implicit_gemm_v1_nchw_cyxk_nkhw #elif 0 device_convolution_implicit_gemm_v2_chwn_cyxk_khwn -#elif 0 +#elif 1 device_convolution_implicit_gemm_v3_nchw_cyxk_nkhw #endif (in_nchw_desc, in_nchw, wei_kcyx_desc, wei_kcyx, out_nkhw_desc, out_nkhw_device, nrepeat); diff --git a/src/include/Array.hip.hpp b/src/include/Array.hip.hpp index 103639e7e8..96dc7d3996 100644 --- a/src/include/Array.hip.hpp +++ b/src/include/Array.hip.hpp @@ -12,7 +12,7 @@ struct Array index_t mData[nSize]; template - __host__ __device__ Array(Xs... xs) : mData{static_cast(xs)...} + __host__ __device__ constexpr Array(Xs... xs) : mData{static_cast(xs)...} { } @@ -37,6 +37,25 @@ struct Array } }; +template +__host__ __device__ constexpr auto sequence2array(Sequence) +{ + return Array{Is...}; +} + +template +__host__ __device__ constexpr auto make_zero_array() +{ + Array a; + + static_for<0, NSize, 1>{}([&](auto I) { + constexpr index_t i = I.Get(); + a[i] = static_cast(0); + }); + + return a; +} + template __host__ __device__ auto reorder_array_given_new2old(const Array& old_array, Sequence new2old) @@ -80,15 +99,14 @@ __host__ __device__ auto extract_array(const Array& old_array, Ext static_for<0, new_size, 1>{}([&](auto I) { constexpr index_t i = I.Get(); - new_array[i] = old_array[ExtractSeq{}.Get(I)]; + new_array[i] = old_array[ExtractSeq::Get(I)]; }); return new_array; } template -__host__ __device__ constexpr auto operator+(const Array& a, - const Array& b) +__host__ __device__ constexpr auto operator+(Array a, Array b) { Array result; @@ -99,3 +117,20 @@ __host__ __device__ constexpr auto operator+(const Array& a, return result; } + +// Array = Array * Sequence +template +__host__ __device__ constexpr auto operator*(Array a, Sequence b) +{ + static_assert(sizeof...(Is) == NSize, "wrong! size not the same"); + + Array result; + + static_for<0, NSize, 1>{}([&](auto I) { + constexpr index_t i = I.Get(); + + result[i] = a[i] + b.Get(I); + }); + + return result; +} diff --git a/src/include/ConstantMatrixDescriptor.hip.hpp b/src/include/ConstantMatrixDescriptor.hip.hpp index 9cb3050382..4b15f75fa9 100644 --- a/src/include/ConstantMatrixDescriptor.hip.hpp +++ b/src/include/ConstantMatrixDescriptor.hip.hpp @@ -9,26 +9,26 @@ struct ConstantMatrixDescriptor static_assert(NCol_ <= RowStride_, "wrong! NCol > RowStride!"); } - __host__ __device__ constexpr index_t NRow() const { return NRow_; } + __host__ __device__ static constexpr index_t NRow() { return NRow_; } - __host__ __device__ constexpr index_t NCol() const { return NCol_; } + __host__ __device__ static constexpr index_t NCol() { return NCol_; } - __host__ __device__ constexpr index_t RowStride() const { return RowStride_; } + __host__ __device__ static constexpr index_t RowStride() { return RowStride_; } - __host__ __device__ constexpr auto GetLengths() const { return Sequence{}; } + __host__ __device__ static constexpr auto GetLengths() { return Sequence{}; } - __host__ __device__ constexpr index_t GetElementSize() const { return NRow_ * NCol_; } + __host__ __device__ static constexpr index_t GetElementSize() { return NRow_ * NCol_; } - __host__ __device__ constexpr index_t GetElementSpace() const { return NRow_ * RowStride_; } + __host__ __device__ static constexpr index_t GetElementSpace() { return NRow_ * RowStride_; } - __host__ __device__ index_t GetOffsetFromMultiIndex(index_t irow, index_t icol) const + __host__ __device__ static index_t GetOffsetFromMultiIndex(index_t irow, index_t icol) { return irow * RowStride_ + icol; } template - __host__ __device__ constexpr auto MakeSubMatrixDescriptor(Number, - Number) const + __host__ __device__ static constexpr auto MakeSubMatrixDescriptor(Number, + Number) { return ConstantMatrixDescriptor{}; } diff --git a/src/include/ConstantMergedTensorDescriptor.hip.hpp b/src/include/ConstantMergedTensorDescriptor.hip.hpp index 5daecd7105..1fe2ee32c0 100644 --- a/src/include/ConstantMergedTensorDescriptor.hip.hpp +++ b/src/include/ConstantMergedTensorDescriptor.hip.hpp @@ -11,8 +11,8 @@ struct ConstantMergedTensorDescriptor { static constexpr auto mOriginalDimMergeSeqs = std::tuple{}; - static constexpr index_t nDim = std::tuple_size::value; - static constexpr index_t nOriginalDim = OriginalDesc::GetNumOfDimension(); + static constexpr index_t nDim = sizeof...(OriginalDimMergeSeqs); + static constexpr index_t nOriginalDim = OriginalTensorDesc::GetNumOfDimension(); __host__ __device__ constexpr ConstantMergedTensorDescriptor() { @@ -21,25 +21,28 @@ struct ConstantMergedTensorDescriptor // TODO: check each of OriginalDimMergeSeqs contains at least 1, and at most // OriginalTensorDesc::nDim number of dimensions - // TODO: check there is no duplication in OriginalDimMergeSeqs - // TODO: check OriginalDimMergeSeqs contains all original dimensions + + // TODO: check there is no duplication in OriginalDimMergeSeqs } __host__ __device__ static constexpr index_t GetNumOfDimension() { return nDim; } - __host__ __device__ static constexpr index_t GetNumOfOriginalDimension() { return nOriginalDim } + __host__ __device__ static constexpr index_t GetNumOfOriginalDimension() + { + return nOriginalDim; + } template __host__ __device__ static constexpr bool ContainMultipleOriginalDimensions(Number) { - return (std::Get(mOriginalDimMergeSeqs).GetSize() > 1); + return (std::get(mOriginalDimMergeSeqs).GetSize() > 1); } template __host__ __device__ static constexpr index_t GetLength(Number) { - constexpr auto original_dims_partial = std::Get(mOriginalDimMergeSeqs); + constexpr auto original_dims_partial = std::get(mOriginalDimMergeSeqs); return OriginalTensorDesc::Extract(original_dims_partial).GetElementSize(); } @@ -50,14 +53,14 @@ struct ConstantMergedTensorDescriptor static_assert(!ContainMultipleOriginalDimensions(Number{}), "wrong! stride of a merged dimension is undefined"); - constexpr auto idim_original = std::Get(mOriginalDimMergeSeqs).Front(); + constexpr auto idim_original = std::get(mOriginalDimMergeSeqs).Front(); return OriginalTensorDesc::GetStride(Number{}); } __host__ __device__ static constexpr auto GetLengths() { - return Sequence{}; + return Sequence{}; } __host__ __device__ static constexpr index_t GetElementSize() @@ -75,17 +78,16 @@ struct ConstantMergedTensorDescriptor constexpr auto original_dims_partial = std::get(mOriginalDimMergeSeqs); // get partial original-multi-id corresponding to this merged dimension - constexpr auto original_multi_id_partial = + const auto original_multi_id_partial = OriginalTensorDesc::Extract(original_dims_partial) .GetMultiIndexFrom1dIndex(multi_id[idim]); - // make sure compiler unroll this loop and propagate all the constants - for(index_t i = 0; i < original_dims_partial.GetSize(); ++i) - { - index_t idim_original = original_dims_partial[i]; + static_for<0, original_dims_partial.GetSize(), 1>{}([&](auto I_) { + constexpr auto I = decltype(I_){}; + constexpr index_t idim_original = original_dims_partial.Get(I); - original_multi_id[idim_original] = original_multi_id_partial[i] - } + original_multi_id[idim_original] = original_multi_id_partial[I.Get()]; + }); }); return original_multi_id; @@ -95,10 +97,10 @@ struct ConstantMergedTensorDescriptor { const auto original_multi_id = GetOriginalMultiIndexFromMultiIndex(multi_id); - return OriginalTensorDesc::GetOffsetFromMultiIndex(orginal_multi_id); + return OriginalTensorDesc::GetOffsetFromMultiIndex(original_multi_id); } - template + template __host__ __device__ static index_t GetOffsetFromMultiIndex(Is... is) { return GetOffsetFromMultiIndex(Array{is...}); @@ -106,14 +108,15 @@ struct ConstantMergedTensorDescriptor __host__ __device__ static Array GetMultiIndexFrom1dIndex(index_t id) { - constexpr auto dummy_desc = make_packed_ConstantTensorDescriptor(GetLengths()); + constexpr auto dummy_desc = make_ConstantTensorDescriptor_default_rank_packed(GetLengths()); return dummy_desc.GetMultiIndexFrom1dIndex(id); } }; template -constexpr auto make_ConstantMergedTensorDescriptor(OriginalTensorDesc, OriginalDimMergeSeqs...) +__host__ __device__ constexpr auto make_ConstantMergedTensorDescriptor(OriginalTensorDesc, + OriginalDimMergeSeqs...) { return ConstantMergedTensorDescriptor{}; } diff --git a/src/include/ConstantTensorDescriptor.hip.hpp b/src/include/ConstantTensorDescriptor.hip.hpp index ff422de6e5..ecf9e5fae8 100644 --- a/src/include/ConstantTensorDescriptor.hip.hpp +++ b/src/include/ConstantTensorDescriptor.hip.hpp @@ -2,20 +2,20 @@ #include "common.hip.hpp" template -__host__ __device__ constexpr auto calculate_packed_tensor_strides(Lengths) +__host__ __device__ constexpr auto calculate_tensor_strides_default_rank_packed(Lengths) { return reverse_inclusive_scan_sequence(Lengths{}.PopFront(), std::multiplies{}) .PushBack(Number<1>{}); } template -__host__ __device__ constexpr auto - calculate_rank_tensor_default_strides_with_alignment(Lengths, Number) +__host__ __device__ constexpr auto calculate_tensor_strides_default_rank_aligned(Lengths, + Number) { constexpr index_t L_back_align = Align * mod_conv::integer_divide_ceiler{}(Lengths{}.Back(), Align); - return calculate_packed_tensor_strides( + return calculate_tensor_strides_default_rank_packed( Lengths{}.Modify(Number{}, Number{})); } @@ -66,6 +66,12 @@ struct ConstantTensorDescriptor return MemoryRanks{}.Get(Number{}); } + template + __host__ __device__ static constexpr bool ContainMultipleOriginalDimensions(T) + { + return false; + } + __host__ __device__ static constexpr index_t GetElementSize() { return accumulate_on_sequence(Lengths{}, std::multiplies{}, Number<1>{}); @@ -146,7 +152,7 @@ struct ConstantTensorDescriptor { Array multi_id; - constexpr auto dummy_strides = calculate_packed_tensor_strides(GetLengths()); + constexpr auto dummy_strides = calculate_tensor_strides_default_rank_packed(GetLengths()); // calculate index in each of the dimensions in the order of their dimension (not rank) static_for<0, nDim - 1, 1>{}([&](auto IDim) { @@ -181,6 +187,12 @@ struct ConstantTensorDescriptor return ConstantTensorDescriptor{}; } + template + __host__ __device__ static constexpr auto Extract(Sequence) + { + return Extract(Number{}...); + } + template __host__ __device__ static constexpr auto Slice(Number, Number) { @@ -271,9 +283,11 @@ struct ConstantTensorDescriptor FirstUnfoldDim <= LastUnfoldDim, "wrong! should have FirstUnfoldDim <= LastUnfoldDim!"); +#if 0 // cannot compile: compiler complain about constexpr // dimensions to be unfold need to be in descending order (w.r.t. strides), and need to be // packed in memory, otherwise, unfolding is invalid - static_for{}([&](auto IDim) { + static_for{}([&](auto IDim_) { + constexpr auto IDim = decltype(IDim_){}; constexpr auto IDim_p1 = IDim + Number<1>{}; // check stride @@ -285,11 +299,12 @@ struct ConstantTensorDescriptor static_assert(GetStride(IDim_p1) * GetLength(IDim_p1) == GetStride(IDim), "wrong! dimensions to be unfolded need to be packed"); - // checkt ranks + // check ranks static_assert(GetMemoryRank(IDim_p1) == GetMemoryRank(IDim) + 1, "wrong! ranks of dimensions to be unfolded need to be in increasing and " "continuous ranks"); }); +#endif // left and right constexpr auto left = typename arithmetic_sequence_gen<0, FirstUnfoldDim, 1>::SeqType{}; @@ -308,9 +323,9 @@ struct ConstantTensorDescriptor // decrease the ranks that are larger than the rank of LastUnfoldDim constexpr auto tmp_ranks = - transform_sequences(GetMemoryRanks(), - f_unfold_impl{}), - LastUnfoldDim - FirstUnfoldDim + 1>{}); + transform_sequences(f_unfold_impl{}), + LastUnfoldDim - FirstUnfoldDim + 1>{}, + GetMemoryRanks()); // new lengths, strides and ranks constexpr auto new_lengths = GetLengths() @@ -354,26 +369,26 @@ struct ConstantTensorDescriptor }; template -__host__ __device__ constexpr auto make_packed_ConstantTensorDescriptor(Lengths) +__host__ __device__ constexpr auto make_ConstantTensorDescriptor_default_rank_packed(Lengths) { - using Strides = decltype(calculate_packed_tensor_strides(Lengths{})); + using Strides = decltype(calculate_tensor_strides_default_rank_packed(Lengths{})); using MemoryRanks = typename arithmetic_sequence_gen<0, Lengths::GetSize(), 1>::SeqType; return ConstantTensorDescriptor{}; } template -__host__ __device__ constexpr auto make_ranked_ConstantTensorDescriptor(Lengths, Strides) +__host__ __device__ constexpr auto make_ConstantTensorDescriptor_default_rank(Lengths, Strides) { using MemoryRanks = typename arithmetic_sequence_gen<0, Lengths::GetSize(), 1>::SeqType; return ConstantTensorDescriptor{}; } template -__host__ __device__ constexpr auto - make_ranked_ConstantTensorDescriptor_with_alignment(Lengths, Number) +__host__ __device__ constexpr auto make_ConstantTensorDescriptor_default_rank_aligned(Lengths, + Number) { using Strides = - decltype(calculate_rank_tensor_default_strides_with_alignment(Lengths{}, Number{})); + decltype(calculate_tensor_strides_default_rank_aligned(Lengths{}, Number{})); using MemoryRanks = typename arithmetic_sequence_gen<0, Lengths::GetSize(), 1>::SeqType; return ConstantTensorDescriptor{}; } diff --git a/src/include/Sequence.hip.hpp b/src/include/Sequence.hip.hpp index b5a3aacbd2..cf713bcc8c 100644 --- a/src/include/Sequence.hip.hpp +++ b/src/include/Sequence.hip.hpp @@ -1,5 +1,5 @@ #pragma once -#include "constant_integral.hip.hpp" +#include "integral_constant.hip.hpp" #include "functional.hip.hpp" template @@ -21,12 +21,6 @@ struct Sequence return mData[I]; } - __host__ __device__ index_t operator[](index_t i) const - { - const index_t mData[mSize + 1] = {Is..., 0}; - return mData[i]; - } - template __host__ __device__ static constexpr auto ReorderGivenNew2Old(Sequence /*new2old*/) { @@ -164,6 +158,12 @@ struct sequence_reverse_inclusive_scan, Reduce> using SeqType = Sequence; }; +template +struct sequence_reverse_inclusive_scan, Reduce> +{ + using SeqType = Sequence<>; +}; + template struct sequence_extract; diff --git a/src/include/blockwise_2d_tensor_op.hip.hpp b/src/include/blockwise_2d_tensor_op.hip.hpp index 1753a48e87..d6ddf8db5f 100644 --- a/src/include/blockwise_2d_tensor_op.hip.hpp +++ b/src/include/blockwise_2d_tensor_op.hip.hpp @@ -457,7 +457,8 @@ struct Blockwise2dTensorCopy3 index_t mSrcMyThreadOffset; index_t mDstMyThreadOffset; - __device__ Blockwise2dTensorCopy3() + __device__ Blockwise2dTensorCopy3(Array src_block_data_multi_id_begin, + Array dst_block_data_multi_id_begin) { constexpr auto I0 = Number<0>{}; constexpr auto I1 = Number<1>{}; @@ -499,10 +500,13 @@ struct Blockwise2dTensorCopy3 const index_t thread_id_d0 = get_thread_local_1d_id() / thread_per_d1; const index_t thread_id_d1 = get_thread_local_1d_id() - thread_id_d0 * thread_per_d1; - mSrcMyThreadOffset = - SrcDesc{}.GetOffsetFromMultiIndex(thread_id_d0, thread_id_d1 * DataPerRead); - mDstMyThreadOffset = - DstDesc{}.GetOffsetFromMultiIndex(thread_id_d0, thread_id_d1 * DataPerRead); + mSrcMyThreadOffset = SrcDesc{}.GetOffsetFromMultiIndex( + src_block_data_multi_id_begin + + Array{thread_id_d0, thread_id_d1 * DataPerRead}); + + mDstMyThreadOffset = DstDesc{}.GetOffsetFromMultiIndex( + dst_block_data_multi_id_begin + + Array{thread_id_d0, thread_id_d1 * DataPerRead}); } __device__ void Run(const Float* __restrict__ p_src, Float* __restrict__ p_dst) const diff --git a/src/include/blockwise_4d_tensor_op.hip.hpp b/src/include/blockwise_4d_tensor_op.hip.hpp index 754a310afb..ef3dc3e4d6 100644 --- a/src/include/blockwise_4d_tensor_op.hip.hpp +++ b/src/include/blockwise_4d_tensor_op.hip.hpp @@ -13,7 +13,7 @@ blockwise_4d_tensor_pointwise_operation_unary(DstDesc, Float* __restrict__ p_dst constexpr auto dst_desc = DstDesc{}; - constexpr auto desc = make_packed_ConstantTensorDescriptor(dst_desc.GetLengths()); + constexpr auto desc = make_ConstantTensorDescriptor_default_rank_packed(dst_desc.GetLengths()); #if 0 if(get_thread_local_1d_id() == 0) @@ -108,7 +108,7 @@ __device__ void blockwise_4d_tensor_pointwise_operation_binary_reorder_by_get_ds constexpr auto src_desc = SrcDesc{}; constexpr auto dst_desc = DstDesc{}; - constexpr auto ref_desc = make_packed_ConstantTensorDescriptor(SrcOpLengths{}); + constexpr auto ref_desc = make_ConstantTensorDescriptor_default_rank_packed(SrcOpLengths{}); constexpr index_t NLoop = ref_desc.GetElementSize() / BlockSize; @@ -259,7 +259,7 @@ struct Blockwise4dTensorCopy1 constexpr index_t read_per_d3 = mod_conv::integer_divide_ceil(L3, DataPerRead); constexpr auto ref_desc = - make_packed_ConstantTensorDescriptor(Sequence{}); + make_ConstantTensorDescriptor_default_rank_packed(Sequence{}); constexpr index_t NLoop = ref_desc.GetElementSize() / BlockSize; @@ -336,7 +336,7 @@ struct BlockwiseChwnTensorCopyPadded constexpr auto src_desc = SrcDesc{}; constexpr auto dst_desc = DstDesc{}; - constexpr auto ref_desc = make_packed_ConstantTensorDescriptor(DstOpLengths{}); + constexpr auto ref_desc = make_ConstantTensorDescriptor_default_rank_packed(DstOpLengths{}); constexpr auto h_global_pad_low = GlobalLowerPads{}.Get(I0); constexpr auto w_global_pad_low = GlobalLowerPads{}.Get(I1); @@ -510,7 +510,8 @@ struct Blockwise4dTensorCopy3 } } - constexpr auto thread_cluster_desc = make_packed_ConstantTensorDescriptor(ThreadPerDims{}); + constexpr auto thread_cluster_desc = + make_ConstantTensorDescriptor_default_rank_packed(ThreadPerDims{}); const auto thread_multi_id = thread_cluster_desc.GetMultiIndexFrom1dIndex(get_thread_local_1d_id()); @@ -652,7 +653,7 @@ struct Blockwise4dTensorCopy3 constexpr index_t nloop_d2 = L2 / thread_per_d2; constexpr index_t nloop_d3 = mod_conv::integer_divide_ceil(L3, thread_per_d3 * DataPerRead); - constexpr auto clipboard_desc = make_packed_ConstantTensorDescriptor( + constexpr auto clipboard_desc = make_ConstantTensorDescriptor_default_rank_packed( Sequence{}); #pragma unroll @@ -719,7 +720,7 @@ struct Blockwise4dTensorCopy3 constexpr index_t nloop_d2 = L2 / thread_per_d2; constexpr index_t nloop_d3 = mod_conv::integer_divide_ceil(L3, thread_per_d3 * DataPerRead); - constexpr auto clipboard_desc = make_packed_ConstantTensorDescriptor( + constexpr auto clipboard_desc = make_ConstantTensorDescriptor_default_rank_packed( Sequence{}); #pragma unroll diff --git a/src/include/blockwise_gemm.hip.hpp b/src/include/blockwise_gemm.hip.hpp index fd51d86e15..b155f36dca 100644 --- a/src/include/blockwise_gemm.hip.hpp +++ b/src/include/blockwise_gemm.hip.hpp @@ -46,7 +46,7 @@ struct BlockwiseGemmBlockABlockBThreadCTransANormalBNormalC_v2 N % (NPerThreadSubC * NLevel0Cluster * NLevel1Cluster) == 0, "wrong! Cannot evenly divide work among\n"); - static_assert(ThreadMatrixC::GetLengths() == GetThreadMatrixCLengths(), + static_assert(is_same_type(ThreadMatrixC::GetLengths(), GetThreadMatrixCLengths()), "wrong! ThreadMatrixC lengths is wrong"); auto c_thread_mtx_index = GetBeginOfThreadMatrixC(get_thread_local_1d_id()); @@ -55,7 +55,7 @@ struct BlockwiseGemmBlockABlockBThreadCTransANormalBNormalC_v2 mMyThreadOffsetB = BlockMatrixB::GetOffsetFromMultiIndex(0, c_thread_mtx_index.col); } - __device__ static auto GetThreadMatrixCLengths() + __device__ static constexpr auto GetThreadMatrixCLengths() { constexpr index_t M = BlockMatrixA::NCol(); // A is transposed constexpr index_t N = BlockMatrixB::NCol(); diff --git a/src/include/blockwise_merged_tensor_slice_op.hip.hpp b/src/include/blockwise_merged_tensor_slice_op.hip.hpp index b47df1f602..b616e9f010 100644 --- a/src/include/blockwise_merged_tensor_slice_op.hip.hpp +++ b/src/include/blockwise_merged_tensor_slice_op.hip.hpp @@ -1,19 +1,19 @@ #pragma once #include "threadwise_tensor_slice_op.hip.hpp" -// slice a merged tensor, reorder and copy it into a normal tensor -// src: a merged tensor, -// dst: a normal tensor +// slice a (normal or merged) tensor, reorder and copy it into another (normal or merged) tensor template + class DstAccessOrder, + index_t SrcDataPerRead, + index_t DstDataPerRead> struct BlockwiseTensorSliceCopy_generic_v1 { static constexpr index_t nDim = SrcDesc::GetNumOfDimension(); @@ -21,39 +21,44 @@ struct BlockwiseTensorSliceCopy_generic_v1 index_t mSrcMyThreadOffset; index_t mDstMyThreadOffset; - __device__ BlockwiseTensorSliceCopy_generic_v1(Array src_block_multi_offset, - Array dst_block_multi_offset) + __device__ + BlockwiseTensorSliceCopy_generic_v1(Array src_block_data_multi_id_begin, + Array dst_block_data_multi_id_begin) { // check NDim consistent static_assert(SrcDesc::GetNumOfDimension() == DstDesc::GetNumOfDimension(), "wrong"); - constexpr auto thread_cluster_desc = make_packed_ConstantTensorDescriptor( - ClusterLengths{}.ReorderGivenNew2Old(ThreadClusterArrangeOrder{})); + // thread cluster + constexpr auto thread_cluster_desc = make_ConstantTensorDescriptor_default_rank_packed( + DataClusterLengths{}.ReorderGivenNew2Old(ThreadClusterArrangeOrder{})); // BlockSize static_assert(BlockSize == thread_cluster_desc.GetElementSize(), "wrong! BlockSize"); // divide work - static_for<0, nDim, 1>{}([&](auto IDim) { - static_assert(SliceLengths{}.Get(IDim) % SubLenghs{}.Get(IDim) == 0, + constexpr auto data_per_cluster_per_dims = SubLengths{} * DataClusterLengths{}; + + static_for<0, nDim, 1>{}([&](auto IDim_) { + constexpr auto IDim = decltype(IDim_){}; + + static_assert(SliceLengths::Get(IDim) % SubLengths::Get(IDim) == 0, "wrong! cannot evenly divide sliced tensor into sub-tensor"); + + static_assert(SliceLengths::Get(IDim) % data_per_cluster_per_dims.Get(IDim) == 0, + "wrong! cannot evenly divide sliced tensor into cluster"); }); - constexpr auto thread_work_desc = - make_packed_ConstantTensorDescriptor(SliceLengths{} / SliceSubLengths{}); + constexpr auto repeat_lengths = SliceLengths{} / data_per_cluster_per_dims; - static_for<0, nDim, 1>{}([&](auto IDim) { - static_assert(thread_work_desc.GetLength(IDim) % thread_cluster_desc.Get(IDim) == 0, - "wrong! cannot evenly divide work to cluster"); - }); + // for now, only support SubLengths.Get() == 1 on a merged dimension that is merge from + // multiple dimensions + static_for<0, nDim, 1>{}([&](auto IDim_) { + constexpr auto IDim = decltype(IDim_){}; - // only support SubLengths.Get() == 1 on merged dimension, for now - static_for<0, nDim, 1>{}([&](auto IDim) { - static_if<(SrcDesc::ContainMultipleOriginalDimensions(IDim) || - DstDesc::ContainMultipleOriginalDimensions(IDim))>{}([&](auto fwd) { - static_assert(fwd(SubLengths{}).Get(IDim) == 1, - "wrong! Sub-Lengths on merged dimension should be 1"); - }); + static_assert(SubLengths::Get(IDim) == 1 || + (!SrcDesc::ContainMultipleOriginalDimensions(IDim) && + !DstDesc::ContainMultipleOriginalDimensions(IDim)), + "wrong! only surpport Sub-Length == 1 on a merged dimension"); }); // calculate mSrcMyThreadOffset, mDstMyThreadOffset @@ -63,22 +68,23 @@ struct BlockwiseTensorSliceCopy_generic_v1 const auto data_cluster_multi_id = reorder_array_given_old2new(thread_cluster_multi_id, ThreadClusterArrangeOrder{}); - const auto thread_data_multi_offset = data_cluster_multi_id * SubLengths{}; + const auto thread_data_multi_id_begin = data_cluster_multi_id * SubLengths{}; - mSrcMythreadOffset = - SrcDesc::GetOffsetFromMultiIndex(src_block_multi_offset + thread_data_multi_offset); - mSrcMythreadOffset = - DstDesc::GetOffsetFromMultiIndex(dst_block_multi_offset + thread_data_multi_offset); + mSrcMyThreadOffset = SrcDesc::GetOffsetFromMultiIndex(src_block_data_multi_id_begin + + thread_data_multi_id_begin); + + mSrcMyThreadOffset = DstDesc::GetOffsetFromMultiIndex(dst_block_data_multi_id_begin + + thread_data_multi_id_begin); } __device__ static constexpr index_t GetRegisterClipboardSize() { - constexpr auto repeat_lengths = SliceLengths{} / (SubLengths{} * ClusterLengths{}); + constexpr auto repeat_lengths = SliceLengths{} / (SubLengths{} * DataClusterLengths{}); constexpr auto thread_tensor_desc = - make_packed_ConstantTensorDescriptor(SubLengths{} * repeat_lengths); + make_ConstantTensorDescriptor_default_rank_packed(SubLengths{} * repeat_lengths); - return thread_tensor_desc.GetElementSpaceSize(); + return thread_tensor_desc.GetElementSpace(); } __device__ void RunLoadRegisterClipboard(const Float* __restrict__ p_src, @@ -86,32 +92,34 @@ struct BlockwiseTensorSliceCopy_generic_v1 { constexpr auto thread_sub_tensor_lengths = SubLengths{}; - constexpr auto data_per_cluster_per_dims = thread_sub_tensor_lengths * ClusterLengths{}; + constexpr auto data_per_cluster_per_dims = thread_sub_tensor_lengths * DataClusterLengths{}; - constexpr auto repeat_lengths = SliceLengths{} / (SubLengths{} * ClusterLengths{}); + constexpr auto repeat_lengths = SliceLengths{} / (SubLengths{} * DataClusterLengths{}); - constexpr auto thread_tensor_desc = - make_packed_ConstantTensorDescriptor(thread_sub_tensor_lengths * repeat_lengths); + constexpr auto thread_tensor_desc = make_ConstantTensorDescriptor_default_rank_packed( + thread_sub_tensor_lengths * repeat_lengths); static_ford{}([&](auto repeat_multi_id_) { - constexpr auto repeat_multi_id = decltype(repeat_multi_id_){}; + constexpr auto repeat_multi_id = sequence2array(decltype(repeat_multi_id_){}); - constexpr auto src_data_multi_offset = repeat_multi_id * data_per_cluster_per_dims; + const auto src_thread_data_multi_id_begin = + repeat_multi_id * data_per_cluster_per_dims; // cannot not constexpr, why? - constexpr auto clipboard_data_multi_offset = - repeat_multi_id * thread_sub_tensor_lengths; + const auto clipboard_data_multi_id_begin = + repeat_multi_id * thread_sub_tensor_lengths; // cannot not constexpr, why? - constexpr index_t src_offset = SrcDesc{}.GetOffsetFromMultiIndex(src_data_multi_id); - constexpr index_t clipboard_offset = - thread_tensor_desc.GetOffsetFromMultiIndex(clipboard_data_multi_id); + const index_t src_offset = SrcDesc{}.GetOffsetFromMultiIndex( + src_thread_data_multi_id_begin); // cannot not constexpr, why? + + const index_t clipboard_offset = thread_tensor_desc.GetOffsetFromMultiIndex( + clipboard_data_multi_id_begin); // cannot not constexpr, why? threadwise_tensor_slice_copy_generic(SrcDesc{}, p_src + src_offset + mSrcMyThreadOffset, - thread_tensor_desc, - zero_array{}, + make_zero_array(), thread_tensor_desc, p_clipboard + clipboard_offset, - zero_array{}, + make_zero_array(), thread_sub_tensor_lengths, SrcAccessOrder{}); }); @@ -122,41 +130,44 @@ struct BlockwiseTensorSliceCopy_generic_v1 { constexpr auto thread_sub_tensor_lengths = SubLengths{}; - constexpr auto data_per_cluster_per_dims = thread_sub_tensor_lengths * ClusterLengths{}; + constexpr auto data_per_cluster_per_dims = thread_sub_tensor_lengths * DataClusterLengths{}; - constexpr auto repeat_lengths = SliceLengths{} / (SubLengths{} * ClusterLengths{}); + constexpr auto repeat_lengths = SliceLengths{} / (SubLengths{} * DataClusterLengths{}); - constexpr auto thread_tensor_desc = - make_packed_ConstantTensorDescriptor(thread_sub_tensor_lengths * repeat_lengths); + constexpr auto thread_tensor_desc = make_ConstantTensorDescriptor_default_rank_packed( + thread_sub_tensor_lengths * repeat_lengths); static_ford{}([&](auto repeat_multi_id_) { - constexpr auto repeat_multi_id = decltype(repeat_multi_id_){}; + constexpr auto repeat_multi_id = sequence2array(decltype(repeat_multi_id_){}); - constexpr auto clipboard_data_multi_offset = - repeat_multi_id * thread_sub_tensor_lengths; + const auto clipboard_data_multi_id_begin = + repeat_multi_id * thread_sub_tensor_lengths; // cannot not constexpr, why? - constexpr auto dst_data_multi_offset = repeat_multi_id * data_per_cluster_per_dims; + const auto dst_data_multi_id_begin = + repeat_multi_id * data_per_cluster_per_dims; // cannot not constexpr, why? - constexpr index_t clipboard_offset = - thread_tensor_desc.GetOffsetFromMultiIndex(clipboard_data_multi_offset); + const index_t clipboard_offset = thread_tensor_desc.GetOffsetFromMultiIndex( + clipboard_data_multi_id_begin); // cannot not constexpr, why? - constexpr index_t dst_offset = DstDesc{}.GetOffsetFromMultiIndex(dst_data_multi_offset); + const index_t dst_offset = DstDesc{}.GetOffsetFromMultiIndex( + dst_data_multi_id_begin); // cannot not constexpr, why? threadwise_tensor_slice_copy_generic(thread_tensor_desc, p_clipboard + clipboard_offset, - zero_array{}, + make_zero_array(), DstDesc{}, p_dst + dst_offset + mDstMyThreadOffset, - zero_array{}, + make_zero_array(), thread_sub_tensor_lengths, DstAccessOrder{}); + }); } __device__ void Run(const Float* __restrict__ p_src, Float* __restrict__ p_dst) const { - Float p_clipboard[GetRegisterClipboardSize()]; + Float p_clipboard[GetRegisterClipboardSize()]; - RunLoadRegisterClipboard(p_src, p_clipboard); - RunStoreRegisterClipboard(p_clipboard, p_dst); + RunLoadRegisterClipboard(p_src, p_clipboard); + RunStoreRegisterClipboard(p_clipboard, p_dst); } - }; +}; diff --git a/src/include/blockwise_tensor_slice_op.hip.hpp b/src/include/blockwise_tensor_slice_op.hip.hpp index 0285528648..8fad695136 100644 --- a/src/include/blockwise_tensor_slice_op.hip.hpp +++ b/src/include/blockwise_tensor_slice_op.hip.hpp @@ -40,7 +40,7 @@ struct BlockwiseTensorSliceReorderCopy_v3 src_cluster_lengths.ReorderGivenNew2Old(map_thread_cluster_2_src_cluster); constexpr auto thread_cluster_desc = - make_packed_ConstantTensorDescriptor(thread_cluster_lengths); + make_ConstantTensorDescriptor_default_rank_packed(thread_cluster_lengths); // sanity check: data type static_assert(is_same::value, "wrong! only support float for now!\n"); @@ -149,7 +149,7 @@ struct BlockwiseTensorSliceReorderCopy_v3 constexpr auto thread_tensor_lengths = thread_sub_tensor_lengths * repeat_lengths; constexpr auto thread_tensor_desc = - make_packed_ConstantTensorDescriptor(thread_tensor_lengths); + make_ConstantTensorDescriptor_default_rank_packed(thread_tensor_lengths); return thread_tensor_desc.GetElementSpace(); } @@ -170,7 +170,7 @@ struct BlockwiseTensorSliceReorderCopy_v3 constexpr auto thread_tensor_lengths = thread_sub_tensor_lengths * repeat_lengths; constexpr auto thread_tensor_desc = - make_packed_ConstantTensorDescriptor(thread_tensor_lengths); + make_ConstantTensorDescriptor_default_rank_packed(thread_tensor_lengths); static_ford{}([&](auto repeat_multi_id_) { constexpr auto repeat_multi_id = decltype(repeat_multi_id_){}; @@ -208,7 +208,7 @@ struct BlockwiseTensorSliceReorderCopy_v3 constexpr auto thread_tensor_lengths = thread_sub_tensor_lengths * repeat_lengths; constexpr auto thread_tensor_desc = - make_packed_ConstantTensorDescriptor(thread_tensor_lengths); + make_ConstantTensorDescriptor_default_rank_packed(thread_tensor_lengths); static_ford{}([&](auto repeat_multi_id_) { constexpr auto repeat_multi_id = decltype(repeat_multi_id_){}; diff --git a/src/include/common.hip.hpp b/src/include/common.hip.hpp index 2c5ee402ae..da047b1472 100644 --- a/src/include/common.hip.hpp +++ b/src/include/common.hip.hpp @@ -1,6 +1,6 @@ #pragma once #include "vector_type.hip.hpp" -#include "constant_integral.hip.hpp" +#include "integral_constant.hip.hpp" #include "Sequence.hip.hpp" #include "Array.hip.hpp" #include "functional.hip.hpp" @@ -17,15 +17,21 @@ __device__ index_t get_block_1d_id() { return blockIdx.x; } template struct is_same { - static const bool value = false; + static constexpr bool value = false; }; template struct is_same { - static const bool value = true; + static constexpr bool value = true; }; +template +__host__ __device__ constexpr bool is_same_type(X, Y) +{ + return is_same::value; +} + namespace mod_conv { // namespace mod_conv template struct scales diff --git a/src/include/conv_common.hip.hpp b/src/include/conv_common.hip.hpp index e56743f242..2eb5a83d30 100644 --- a/src/include/conv_common.hip.hpp +++ b/src/include/conv_common.hip.hpp @@ -30,7 +30,7 @@ __host__ __device__ constexpr auto get_convolution_output_default_4d_tensor_desc constexpr auto HO = HI + 1 - Y; constexpr auto WO = WI + 1 - X; - return make_packed_ConstantTensorDescriptor(Sequence{}); + return make_ConstantTensorDescriptor_default_rank_packed(Sequence{}); } template @@ -67,7 +67,7 @@ __host__ __device__ constexpr auto get_convolution_with_padding_output_default_4 constexpr auto HO = HI + HPadLow + HPadUp + 1 - Y; constexpr auto WO = WI + WPadLow + WPadUp + 1 - X; - return make_packed_ConstantTensorDescriptor(Sequence{}); + return make_ConstantTensorDescriptor_default_rank_packed(Sequence{}); } template diff --git a/src/include/functional.hip.hpp b/src/include/functional.hip.hpp index e25dffa8c7..25209147db 100644 --- a/src/include/functional.hip.hpp +++ b/src/include/functional.hip.hpp @@ -1,5 +1,5 @@ #pragma once -#include "constant_integral.hip.hpp" +#include "integral_constant.hip.hpp" struct forwarder { diff --git a/src/include/gridwise_convolution_implicit_gemm_v1r3_lds_double_buffer_chwn_cyxk_khwn.hip.hpp b/src/include/gridwise_convolution_implicit_gemm_v1r3_lds_double_buffer_chwn_cyxk_khwn.hip.hpp index a1cd646b7f..0b68d25c41 100644 --- a/src/include/gridwise_convolution_implicit_gemm_v1r3_lds_double_buffer_chwn_cyxk_khwn.hip.hpp +++ b/src/include/gridwise_convolution_implicit_gemm_v1r3_lds_double_buffer_chwn_cyxk_khwn.hip.hpp @@ -85,7 +85,7 @@ struct GridwiseConvolutionImplicitGemm_v1r3_lds_double_buffer_chwn_cyxk_khwn constexpr index_t WBlockWork = mod_conv::integer_divide_ceil(Wo, WoPerBlock); constexpr index_t NBlockWork = mod_conv::integer_divide_ceil(N, NPerBlock); - constexpr auto block_work_desc = make_packed_ConstantTensorDescriptor( + constexpr auto block_work_desc = make_ConstantTensorDescriptor_default_rank_packed( Sequence{}); const auto block_work_multi_id = @@ -109,7 +109,7 @@ struct GridwiseConvolutionImplicitGemm_v1r3_lds_double_buffer_chwn_cyxk_khwn GemmDataPerReadA, GemmDataPerReadB); - constexpr auto in_c_h_w_n_block_desc = make_ranked_ConstantTensorDescriptor_with_alignment( + constexpr auto in_c_h_w_n_block_desc = make_ConstantTensorDescriptor_default_rank_aligned( Sequence{}, Number{}); @@ -118,12 +118,12 @@ struct GridwiseConvolutionImplicitGemm_v1r3_lds_double_buffer_chwn_cyxk_khwn static_assert(in_c_h_w_n_block_desc.GetStride(I1) % GemmDataPerReadB == 0, "GemmDataPerReadB alignment requirement is not meet"); - constexpr auto wei_c_k_block_desc = make_ranked_ConstantTensorDescriptor_with_alignment( + constexpr auto wei_c_k_block_desc = make_ConstantTensorDescriptor_default_rank_aligned( Sequence{}, Number{}); // tensor view of threadwise output in register - constexpr auto out_k_h_w_n_thread_desc = make_packed_ConstantTensorDescriptor( + constexpr auto out_k_h_w_n_thread_desc = make_ConstantTensorDescriptor_default_rank_packed( Sequence{}); // blockwise copy diff --git a/src/include/gridwise_convolution_implicit_gemm_v1r3_lds_double_buffer_nchw_cyxk_khwn.hip.hpp b/src/include/gridwise_convolution_implicit_gemm_v1r3_lds_double_buffer_nchw_cyxk_khwn.hip.hpp index f721b92af1..6e228b3f31 100644 --- a/src/include/gridwise_convolution_implicit_gemm_v1r3_lds_double_buffer_nchw_cyxk_khwn.hip.hpp +++ b/src/include/gridwise_convolution_implicit_gemm_v1r3_lds_double_buffer_nchw_cyxk_khwn.hip.hpp @@ -86,7 +86,7 @@ struct GridwiseConvolutionImplicitGemm_v1r3_lds_double_buffer_nchw_cyxk_khwn constexpr index_t HBlockWork = mod_conv::integer_divide_ceil(Ho, HoPerBlock); constexpr index_t WBlockWork = mod_conv::integer_divide_ceil(Wo, WoPerBlock); - constexpr auto block_work_desc = make_packed_ConstantTensorDescriptor( + constexpr auto block_work_desc = make_ConstantTensorDescriptor_default_rank_packed( Sequence{}); const auto block_work_multi_id = @@ -102,7 +102,7 @@ struct GridwiseConvolutionImplicitGemm_v1r3_lds_double_buffer_nchw_cyxk_khwn // global tensor view constexpr auto wei_c_k_global_desc = - make_ranked_ConstantTensorDescriptor(Sequence{}, Sequence{}); + make_ConstantTensorDescriptor_default_rank(Sequence{}, Sequence{}); // LDS tensor view // be careful of alignment @@ -111,7 +111,7 @@ struct GridwiseConvolutionImplicitGemm_v1r3_lds_double_buffer_nchw_cyxk_khwn GemmDataPerReadA, GemmDataPerReadB); - constexpr auto in_c_h_w_n_block_desc = make_ranked_ConstantTensorDescriptor_with_alignment( + constexpr auto in_c_h_w_n_block_desc = make_ConstantTensorDescriptor_default_rank_aligned( Sequence{}, Number{}); @@ -120,12 +120,12 @@ struct GridwiseConvolutionImplicitGemm_v1r3_lds_double_buffer_nchw_cyxk_khwn static_assert(in_c_h_w_n_block_desc.GetStride(I1) % GemmDataPerReadB == 0, "GemmDataPerReadB alignment requirement is not meet"); - constexpr auto wei_c_k_block_desc = make_ranked_ConstantTensorDescriptor_with_alignment( + constexpr auto wei_c_k_block_desc = make_ConstantTensorDescriptor_default_rank_aligned( Sequence{}, Number{}); // tensor view of threadwise output in register - constexpr auto out_k_h_w_n_thread_desc = make_packed_ConstantTensorDescriptor( + constexpr auto out_k_h_w_n_thread_desc = make_ConstantTensorDescriptor_default_rank_packed( Sequence{}); // blockwise copy @@ -448,10 +448,10 @@ struct GridwiseConvolutionImplicitGemm_v1r3_lds_double_buffer_nchw_cyxk_khwn constexpr index_t K1 = KPerBlock / KPerThread; #if 0 - constexpr auto out_10d_global_desc = make_packed_ConstantTensorDescriptor( + constexpr auto out_10d_global_desc = make_ConstantTensorDescriptor_default_rank_packed( Sequence{}); - constexpr auto out_10d_thread_desc = make_packed_ConstantTensorDescriptor( + constexpr auto out_10d_thread_desc = make_ConstantTensorDescriptor_default_rank_packed( Sequence{}); #else constexpr auto out_10d_global_desc = diff --git a/src/include/gridwise_convolution_implicit_gemm_v1r3_lds_double_buffer_nchw_cyxk_nkhw.hip.hpp b/src/include/gridwise_convolution_implicit_gemm_v1r3_lds_double_buffer_nchw_cyxk_nkhw.hip.hpp index 8549f30a01..02a87650d4 100644 --- a/src/include/gridwise_convolution_implicit_gemm_v1r3_lds_double_buffer_nchw_cyxk_nkhw.hip.hpp +++ b/src/include/gridwise_convolution_implicit_gemm_v1r3_lds_double_buffer_nchw_cyxk_nkhw.hip.hpp @@ -86,7 +86,7 @@ struct GridwiseConvolutionImplicitGemm_v1r3_lds_double_buffer_nchw_cyxk_nkhw constexpr index_t HBlockWork = mod_conv::integer_divide_ceil(Ho, HoPerBlock); constexpr index_t WBlockWork = mod_conv::integer_divide_ceil(Wo, WoPerBlock); - constexpr auto block_work_desc = make_packed_ConstantTensorDescriptor( + constexpr auto block_work_desc = make_ConstantTensorDescriptor_default_rank_packed( Sequence{}); const auto block_work_multi_id = @@ -110,7 +110,7 @@ struct GridwiseConvolutionImplicitGemm_v1r3_lds_double_buffer_nchw_cyxk_nkhw GemmDataPerReadA, GemmDataPerReadB); - constexpr auto in_c_h_w_n_block_desc = make_ranked_ConstantTensorDescriptor_with_alignment( + constexpr auto in_c_h_w_n_block_desc = make_ConstantTensorDescriptor_default_rank_aligned( Sequence{}, Number{}); @@ -119,12 +119,12 @@ struct GridwiseConvolutionImplicitGemm_v1r3_lds_double_buffer_nchw_cyxk_nkhw static_assert(in_c_h_w_n_block_desc.GetStride(I1) % GemmDataPerReadB == 0, "GemmDataPerReadB alignment requirement is not meet"); - constexpr auto wei_c_k_block_desc = make_ranked_ConstantTensorDescriptor_with_alignment( + constexpr auto wei_c_k_block_desc = make_ConstantTensorDescriptor_default_rank_aligned( Sequence{}, Number{}); // tensor view of threadwise output in register - constexpr auto out_k_h_w_n_thread_desc = make_packed_ConstantTensorDescriptor( + constexpr auto out_k_h_w_n_thread_desc = make_ConstantTensorDescriptor_default_rank_packed( Sequence{}); // blockwise copy @@ -152,7 +152,7 @@ struct GridwiseConvolutionImplicitGemm_v1r3_lds_double_buffer_nchw_cyxk_nkhw decltype(wei_c_k_global_desc), decltype(wei_c_k_block_desc), decltype(wei_c_k_block_desc.GetLengths()), - WeiBlockCopyDataPerRead_K>{}; + WeiBlockCopyDataPerRead_K>({0, 0}, {0, 0}); // a series of blockwise batched GEMM // C_matrix += transpose(A_matrix) * B_matrix @@ -196,7 +196,7 @@ struct GridwiseConvolutionImplicitGemm_v1r3_lds_double_buffer_nchw_cyxk_nkhw // choose GEMM implementation here const auto run_blockwise_batch_gemm = [&](auto... Xs) { -#if 0 +#if 1 return blockwise_batch_gemm.Run(Xs...); #elif 0 return blockwise_batch_gemm.Run_asm(Xs...); diff --git a/src/include/gridwise_convolution_implicit_gemm_v3_nchw_cyxk_nkhw.hip.hpp b/src/include/gridwise_convolution_implicit_gemm_v3_nchw_cyxk_nkhw.hip.hpp index 2af70e8314..b9f8c8cc1b 100644 --- a/src/include/gridwise_convolution_implicit_gemm_v3_nchw_cyxk_nkhw.hip.hpp +++ b/src/include/gridwise_convolution_implicit_gemm_v3_nchw_cyxk_nkhw.hip.hpp @@ -1,8 +1,11 @@ #pragma once #include "common.hip.hpp" #include "ConstantTensorDescriptor.hip.hpp" +#include "ConstantMergedTensorDescriptor.hip.hpp" #include "ConstantMatrixDescriptor.hip.hpp" +#include "blockwise_merged_tensor_slice_op.hip.hpp" #include "blockwise_gemm.hip.hpp" +#include "threadwise_tensor_slice_op.hip.hpp" // define B = merge(N, Ho, Wo) template + index_t GemmDataPerReadB, + class InBlockCopySubLengths_N1_N2_C_B, + class InBlockCopyClusterLengths_N1_N2_C_B, + index_t InBlockCopySrcDataPerRead_B, + index_t InBlockCopyDstDataPerWrite_N2, + index_t WeiBlockCopyDataPerAccess_K> struct GridwiseConvolutionImplicitGemm_v3_nchw_cyxk_nkhw { __device__ void Run(const Float* const __restrict__ p_in_global, @@ -34,12 +42,10 @@ struct GridwiseConvolutionImplicitGemm_v3_nchw_cyxk_nkhw // this is a mess // TODO: more elegent way of specifying (or calculating) performance variables static_assert(N2 == GemmNPerThreadSubC, "wrong!"); - static_assert(KPerBlock == - N1 * GemmNPerThreadSubC * GemmNLevel0Cluster * GemmNLevel1Cluster, + static_assert((N1 * N2 * BPerBlock) % + (GemmNPerThreadSubC * GemmNLevel0Cluster * GemmNLevel1Cluster) == + 0, "wrong!"); - static_assert( - KPerBlock % (N1 * GemmNPerThreadSubC * GemmNLevel0Cluster * GemmNLevel1Cluster) == 0, - "wrong!"); constexpr auto I0 = Number<0>{}; constexpr auto I1 = Number<1>{}; @@ -73,15 +79,14 @@ struct GridwiseConvolutionImplicitGemm_v3_nchw_cyxk_nkhw constexpr index_t B = N0 * Ho * Wo; // divide block work by [K, B] - static_assert(K % KPerBlock == 0 && B % BPerBlock == 0, - C % CPerBlock == 0, + static_assert(K % KPerBlock == 0 && B % BPerBlock == 0 && C % CPerBlock == 0, "wrong! cannot divide work evenly among block"); constexpr index_t KBlockWork = K / KPerBlock; constexpr index_t BBlockWork = B / BPerBlock; constexpr auto block_work_desc = - make_ConstantTensorDescriptor(Sequence{}); + make_ConstantTensorDescriptor_default_rank_packed(Sequence{}); const auto block_work_multi_id = block_work_desc.GetMultiIndexFrom1dIndex(get_block_1d_id()); @@ -95,16 +100,20 @@ struct GridwiseConvolutionImplicitGemm_v3_nchw_cyxk_nkhw in_n_c_h_w_global_desc.Fold(I0, Number{}, Number{}); // merged tensor descriptor in device memory [N1, N2, C, B], src of blockwise copy - constexpr auto in_n1_n2_c_b_global_merged_desc = - in_n0_n1_n2_c_h_w_global_desc.ReorderGivenNew2Old(Sequence<1, 2, 3, 0, 4, 5>{}) + constexpr auto in_n1_n2_c_b_global_merged_desc = make_ConstantMergedTensorDescriptor( + in_n0_n1_n2_c_h_w_global_mem_desc.ReorderGivenNew2Old(Sequence<1, 2, 3, 0, 4, 5>{}) .Slice(I4, Number{}) - .Slice(I5, Number{}) - .Merge(I3, I5); + .Slice(I5, Number{}), + Sequence<0>{}, + Sequence<1>{}, + Sequence<2>{}, + Sequence<3, 4, 5>{}); // memory layout descriptor in LDS [C, N1, B, N2] // be careful of LDS alignment - constexpr auto in_c_n1_b_n2_block_mem_desc = make_ConstantTensorDescriptor_aligned( - Sequence{}, Number{}); + constexpr auto in_c_n1_b_n2_block_mem_desc = + make_ConstantTensorDescriptor_default_rank_aligned( + Sequence{}, Number{}); // tensor descriptor in LDS [N1, N2, C, B], dst of blockwise copy constexpr auto in_n1_n2_c_b_block_desc = @@ -112,7 +121,7 @@ struct GridwiseConvolutionImplicitGemm_v3_nchw_cyxk_nkhw // this check is ad-hoc // TODO: need to properly implement tensor descriptor with alignment - static_assert(in_c_n1_b_n2_block_desc.GetStride(I1) % GemmDataPerReadB == 0, + static_assert(in_c_n1_b_n2_block_mem_desc.GetStride(I1) % GemmDataPerReadB == 0, "GemmDataPerReadB alignment requirement is not satisfied"); // input blockwise copy @@ -129,7 +138,8 @@ struct GridwiseConvolutionImplicitGemm_v3_nchw_cyxk_nkhw Sequence<2, 0, 1, 3>, // thread_arrange_order [C, N1, N2, B] Sequence<0, 1, 2, 3>, // src_access_order [N1, N2, C, B] Sequence<2, 0, 3, 1>, // dst_access_order [C, N1, B, N2] - >({0, 0, 0, b_block_data_on_global}, {0, 0, 0, 0}); + InBlockCopySrcDataPerRead_B, + InBlockCopyDstDataPerWrite_N2>({0, 0, 0, b_block_data_on_global}, {0, 0, 0, 0}); // weight tensor // tensor descriptor in device memory, src of blockwise copy @@ -137,9 +147,9 @@ struct GridwiseConvolutionImplicitGemm_v3_nchw_cyxk_nkhw // tensor descriptor in LDS, dst of blockwise copy // be careful of LDS alignment - constexpr auto wei_c_k_block_desc = make_ConstantTensorDescriptor_aligned( + constexpr auto wei_c_k_block_desc = make_ConstantTensorDescriptor_default_rank_aligned( Sequence{}, - Number{}); + Number{}); // operator for blockwise copy of weight into LDS // slicing a tensor @@ -150,7 +160,8 @@ struct GridwiseConvolutionImplicitGemm_v3_nchw_cyxk_nkhw decltype(wei_c_k_global_desc), decltype(wei_c_k_block_desc), decltype(wei_c_k_block_desc.GetLengths()), - WeiBlockCopyDataPerRead_K>({0, k_block_data_on_global}, {0, 0}); + WeiBlockCopyDataPerAccess_K>({0, k_block_data_on_global}, + {0, 0}); // GEMM definition // c_mtx += transpose(a_mtx) * b_mtx @@ -167,7 +178,8 @@ struct GridwiseConvolutionImplicitGemm_v3_nchw_cyxk_nkhw Number{}); // sanity check - static_assert(KPerBlock % (GemmMPerThreadSubC * GemmMLevel0Cluster * GemmMLevel1Cluster), + static_assert(KPerBlock % (GemmMPerThreadSubC * GemmMLevel0Cluster * GemmMLevel1Cluster) == + 0, "wrong!"); constexpr index_t GemmMRepeat = @@ -194,8 +206,8 @@ struct GridwiseConvolutionImplicitGemm_v3_nchw_cyxk_nkhw GemmDataPerReadB>{}; // LDS allocation for input and weight: be careful of alignment - constexpr index_t max_align = mod_conv::max(InBlockReorderDataPerWrite_N, - WeiBlockCopyDataPerRead_K, + constexpr index_t max_align = mod_conv::max(InBlockCopyDstDataPerWrite_N2, + WeiBlockCopyDataPerAccess_K, GemmDataPerReadA, GemmDataPerReadB); @@ -211,7 +223,7 @@ struct GridwiseConvolutionImplicitGemm_v3_nchw_cyxk_nkhw Float p_out_thread[c_k0k2_n1n2_thread_mtx_desc.GetElementSpace()]; // zero out threadwise output - threadwise_matrix_set_zero(out_k0_k1_k2_n1_n0_h_w_n2_thread_desc, p_out_thread); + threadwise_matrix_set_zero(c_k0k2_n1n2_thread_mtx_desc, p_out_thread); // do work for(index_t y = 0; y < Y; ++y) @@ -229,15 +241,15 @@ struct GridwiseConvolutionImplicitGemm_v3_nchw_cyxk_nkhw c_block_data_on_global = 0; c_block_data_on_global < C; c_block_data_on_global += CPerBlock, - p_in_block_ont_global += CPerBlock * in_n_c_h_w_global_desc.GetStride(I1), + p_in_block_on_global += CPerBlock * in_n_c_h_w_global_desc.GetStride(I1), p_wei_block_on_global += CPerBlock * wei_c_y_x_k_global_desc.GetStride(I0)) { - blockwise_in_copy.run(p_in_block_on_global, p_in_block); - blockwise_wei_copy.run(p_wei_block_on_global, p_wei_block); + blockwise_in_copy.Run(p_in_block_on_global, p_in_block); + blockwise_wei_copy.Run(p_wei_block_on_global, p_wei_block); __syncthreads(); - blockwise_gemm.run(p_wei_block, p_in_block, p_out_thread); + blockwise_gemm.Run(p_wei_block, p_in_block, p_out_thread); __syncthreads(); } @@ -253,19 +265,26 @@ struct GridwiseConvolutionImplicitGemm_v3_nchw_cyxk_nkhw // define tensor descriptor for threadwise copy // output tensor (also, memory layout) descriptor in register, src of threadwise // copy - constexpr auto out_k0_k1_k2_n1_b_n2_thread_mem_desc = make_ConstantTensorDescriptor( - Sequence{}); + constexpr auto out_k0_k1_k2_n1_b_n2_thread_mem_desc = + make_ConstantTensorDescriptor_default_rank_packed( + Sequence{}); // output memory layout descriptor in device memory constexpr auto out_n0_n1_n2_k0_k1_k2_h_w_global_mem_desc = - out_n_k_h_w_global.Fold(I1, Number{}, Number{}) + out_n_k_h_w_global_desc.Fold(I1, Number{}, Number{}) .Fold(I0, Number{}, Number{}); // output merged tensor descriptor in device memory, dst of threadwise copy constexpr auto out_k0_k1_k2_n1_b_n2_global_merged_desc = - out_n0_n1_n2_k0_k1_k2_h_w_global_mem_desc - .ReorderGivenNew2Old(Sequence<3, 4, 5, 1, 0, 6, 7, 2>{}) - .Merge(I4, I6); + make_ConstantMergedTensorDescriptor( + out_n0_n1_n2_k0_k1_k2_h_w_global_mem_desc.ReorderGivenNew2Old( + Sequence<3, 4, 5, 1, 0, 6, 7, 2>{}), + Sequence<0>{}, + Sequence<1>{}, + Sequence<2>{}, + Sequence<3>{}, + Sequence<4, 5, 6>{}, + Sequence<7>{}); // calculate origin of thread output tensor on global memory // blockwise GEMM c matrix starting index @@ -273,18 +292,30 @@ struct GridwiseConvolutionImplicitGemm_v3_nchw_cyxk_nkhw blockwise_gemm.GetBeginOfThreadMatrixC(get_thread_local_1d_id()); // origin of thread tensor on global - const index_t k_thread_data_on_global k_block_data_on_global + - c_thread_mtx_on_block.row; + const index_t k_thread_data_on_global = + k_block_data_on_global + c_thread_mtx_on_block.row; + const index_t b_thread_data_on_global = b_block_data_on_global + c_thread_mtx_on_block.col; - // output merged global tensor descriptor, for calculating origin of thread tensor - // in global memory +// output merged global tensor descriptor, for calculating origin of thread tensor +// in global memory +#if 0 // unfold a merged tensor is not implemented yet constexpr auto out_k_n1_b_n2_global_merged_desc = - out_k0_k1_k2_n1_b_n2_global_merged_desc.Unfold(I1, I2); + out_k0_k1_k2_n1_b_n2_global_merged_desc.Unfold(I0, I2); +#else + constexpr auto out_k_n1_b_n2_global_merged_desc = make_ConstantMergedTensorDescriptor( + out_n0_n1_n2_k0_k1_k2_h_w_global_mem_desc + .ReorderGivenNew2Old(Sequence<3, 4, 5, 1, 0, 6, 7, 2>{}) + .Unfold(I0, I2), + Sequence<0>{}, + Sequence<1>{}, + Sequence<2, 3, 4>{}, + Sequence<5>{}); +#endif // origin of thread tensor in global memory - const index_t p_out_thread_on_global = + Float* p_out_thread_on_global = p_out_global + out_k_n1_b_n2_global_merged_desc.GetOffsetFromMultiIndex( k_thread_data_on_global, 0, 0, 0); // dst origin on merged global tensor @@ -303,8 +334,8 @@ struct GridwiseConvolutionImplicitGemm_v3_nchw_cyxk_nkhw 0, b_thread_data_on_global, 0}, // starting point of slice w.r.t. origin of dst - out_k0_k1_k2_n1_b_n2_thread_desc.GetLengths(), // slice lengths - Sequence<2, 3, 4, 0, 5, 1>{} // order of dimension access + out_k0_k1_k2_n1_b_n2_thread_mem_desc.GetLengths(), // slice lengths + Sequence<2, 3, 4, 0, 5, 1>{} // order of dimension access ); } } diff --git a/src/include/constant_integral.hip.hpp b/src/include/integral_constant.hip.hpp similarity index 56% rename from src/include/constant_integral.hip.hpp rename to src/include/integral_constant.hip.hpp index cdba3290a0..0f134ae76c 100644 --- a/src/include/constant_integral.hip.hpp +++ b/src/include/integral_constant.hip.hpp @@ -8,5 +8,11 @@ struct integral_constant __host__ __device__ constexpr T Get() const { return value; } }; +template +__host__ __device__ constexpr auto operator+(integral_constant, integral_constant) +{ + return integral_constant{}; +} + template using Number = integral_constant; diff --git a/src/include/threadwise_gemm.hip.hpp b/src/include/threadwise_gemm.hip.hpp index 5d82493e28..acd813d437 100644 --- a/src/include/threadwise_gemm.hip.hpp +++ b/src/include/threadwise_gemm.hip.hpp @@ -10,7 +10,7 @@ __device__ void threadwise_matrix_set_zero(Matrix, Float* __restrict__ p_thread) for(index_t j = 0; j < Matrix::NCol(); ++j) { const index_t id = Matrix::GetOffsetFromMultiIndex(i, j); - p_thread[id] = 0; + p_thread[id] = Float(0); } } } diff --git a/src/include/threadwise_tensor_slice_op.hip.hpp b/src/include/threadwise_tensor_slice_op.hip.hpp index 6c441d289f..3456a44ddf 100644 --- a/src/include/threadwise_tensor_slice_op.hip.hpp +++ b/src/include/threadwise_tensor_slice_op.hip.hpp @@ -19,7 +19,7 @@ __device__ void threadwise_tensor_slice_copy(SrcDesc, constexpr auto src_desc = SrcDesc{}; constexpr auto dst_desc = DstDesc{}; - constexpr auto ref_desc = make_packed_ConstantTensorDescriptor(SrcOpLengths{}); + constexpr auto ref_desc = make_ConstantTensorDescriptor_default_rank_packed(SrcOpLengths{}); #if 0 if(get_thread_local_1d_id() == 0 && get_block_1d_id() == 0) @@ -194,16 +194,19 @@ threadwise_tensor_slice_copy_reorder_given_dst2src_v3(SrcDesc, } template -__device__ void -threadwise_tensor_slice_copy_generic(SrcDesc, - const Float* __restrict__ p_src, - Array src_multi_offset, - DstDesc, - Float* __restrict__ p_dst, - Array dst_multi_offset, - SliceLengths, - DimAccessOrder) +__device__ void threadwise_tensor_slice_copy_generic( + SrcDesc, + const Float* __restrict__ p_src, + Array src_multi_id_begin, + DstDesc, + Float* __restrict__ p_dst, + Array dst_multi_id_begin, + SliceLengths, + DimAccessOrder) { + static_assert(SrcDesc::GetNumOfDimension() == DstDesc::GetNumOfDimension(), + "wrong! # of dimensions not the same"); + constexpr auto src_desc = SrcDesc{}; constexpr auto dst_desc = DstDesc{}; @@ -215,9 +218,10 @@ threadwise_tensor_slice_copy_generic(SrcDesc, reorder_array_given_old2new(data_multi_id_in_access_order, DimAccessOrder{}); const index_t dst_index = - dst_desc.GetOffsetFromMultiIndex(src_multi_offset + data_multi_id); + dst_desc.GetOffsetFromMultiIndex(src_multi_id_begin + data_multi_id); + const index_t src_index = - src_desc.GetOffsetFromMultiIndex(dst_multi_offset + data_multi_id); + src_desc.GetOffsetFromMultiIndex(dst_multi_id_begin + data_multi_id); p_dst[dst_index] = p_src[src_index]; }); diff --git a/src/include/vector_type.hip.hpp b/src/include/vector_type.hip.hpp index d2d3db92ec..ef8feeadda 100644 --- a/src/include/vector_type.hip.hpp +++ b/src/include/vector_type.hip.hpp @@ -1,6 +1,6 @@ #pragma once #include "config.h" -#include "constant_integral.hip.hpp" +#include "integral_constant.hip.hpp" template struct vector_type