From b2439ec9dd8acc7a6788c3225fda80eb7f416ce6 Mon Sep 17 00:00:00 2001
From: Chao Liu
Date: Thu, 30 May 2019 17:50:49 -0500
Subject: [PATCH] adding implicit gemm v4 (nchw, kcyx)

---
 driver/driver.hip.cpp                         |  24 ++-
 src/include/Array.hip.hpp                     | 162 +++++++++++++++++-
 .../ConstantMergedTensorDescriptor.hip.hpp    |  42 +----
 src/include/ConstantTensorDescriptor.hip.hpp  | 108 +++++++++---
 src/include/Sequence.hip.hpp                  |  36 ++++
 src/include/blockwise_3d_tensor_op.hip.hpp    |   2 +-
 src/include/blockwise_4d_tensor_op.hip.hpp    |   2 +-
 .../blockwise_generic_tensor_slice_op.hip.hpp |  43 +++--
 src/include/common.hip.hpp                    |  12 ++
 ...on_implicit_gemm_v3_nchw_cyxk_nkhw.hip.hpp |  89 +++++++---
 ...on_implicit_gemm_v4_nchw_kcyx_nkhw.hip.hpp |  50 +++++-
 11 files changed, 440 insertions(+), 130 deletions(-)

diff --git a/driver/driver.hip.cpp b/driver/driver.hip.cpp
index 3c3602040c..e1a68de451 100644
--- a/driver/driver.hip.cpp
+++ b/driver/driver.hip.cpp
@@ -44,11 +44,7 @@ struct GeneratorTensor_3
 {
         std::array<index_t, sizeof...(Is)> dims = {{static_cast<index_t>(is)...}};

-#if 0
-        auto f_acc = std::plus<index_t>{};
-#else
         auto f_acc = [](auto a, auto b) { return 100 * a + b; };
-#endif

         return std::accumulate(dims.begin(), dims.end(), index_t(0), f_acc);
     }
@@ -447,7 +443,7 @@ int main(int argc, char* argv[])
     constexpr index_t HPad = 0;
     constexpr index_t WPad = 0;
-#elif 1
+#elif 0
     // 3x3 filter, 28x28 image
     constexpr index_t N = 128;
     constexpr index_t C = 256;
@@ -543,7 +539,7 @@ int main(int argc, char* argv[])
     constexpr index_t HPad = 0;
     constexpr index_t WPad = 0;
-#elif 0
+#elif 1
     // 1x1 filter, 14x14 image
     constexpr index_t N = 128;
     constexpr index_t C = 512;
@@ -553,6 +549,18 @@ int main(int argc, char* argv[])
     constexpr index_t Y = 1;
     constexpr index_t X = 1;

+    constexpr index_t HPad = 0;
+    constexpr index_t WPad = 0;
+#elif 1
+    // 1x1 filter, 73x73 image
+    constexpr index_t N = 128;
+    constexpr index_t C = 64;
+    constexpr index_t HI = 73;
+    constexpr index_t WI = 73;
+    constexpr index_t K = 128;
+    constexpr index_t Y = 1;
+    constexpr index_t X = 1;
+
     constexpr index_t HPad = 0;
     constexpr index_t WPad = 0;
 #endif
@@ -609,8 +617,6 @@ int main(int argc, char* argv[])
         };

         wei_kcyx.GenerateTensorValue(gen_wei, num_thread);
 #endif
-
-        // out_nkhw_device.GenerateTensorValue(GeneratorTensor_1{}, num_thread);
     }

 #if 1
@@ -649,7 +655,7 @@ int main(int argc, char* argv[])

     if(do_verification)
     {
-#if 1
+#if 0
         if(Y == 3 && X == 3)
         {
            host_winograd_3x3_convolution(in_nchw, wei_kcyx, out_nkhw_host, lower_pads, upper_pads);
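For reference, the accumulator chosen above packs each tensor index into two decimal digits, so a mismatched output value directly identifies the (n, c, h, w) it came from. A minimal host-only illustration (values invented):

    // Illustrative only: what GeneratorTensor_3's accumulator produces.
    #include <array>
    #include <cstdio>
    #include <numeric>

    int main()
    {
        std::array<unsigned, 4> dims = {1, 2, 3, 4}; // e.g. (n, c, h, w)
        auto f_acc = [](auto a, auto b) { return 100 * a + b; };

        // 1 -> 102 -> 10203 -> 1020304: each index occupies two decimal digits
        unsigned v = std::accumulate(dims.begin(), dims.end(), 0u, f_acc);
        std::printf("%u\n", v); // prints 1020304
    }
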
size not the same"); + + Array result; + + static_for<0, NSize, 1>{}([&](auto I) { + constexpr index_t i = I.Get(); + + result[i] = a[i] + b.Get(I); + }); + + return result; +} + +// Array = Array - Sequence +template +__host__ __device__ constexpr auto operator-(Array a, Sequence b) +{ + static_assert(sizeof...(Is) == NSize, "wrong! size not the same"); + + Array result; + + static_for<0, NSize, 1>{}([&](auto I) { + constexpr index_t i = I.Get(); + + result[i] = a[i] - b.Get(I); + }); + + return result; +} + // Array = Array * Sequence template __host__ __device__ constexpr auto operator*(Array a, Sequence b) @@ -136,15 +186,119 @@ __host__ __device__ constexpr auto operator*(Array a, Sequence -__host__ __device__ constexpr TData reduce_on_array(Array a, F f) +// Array = Sequence - Array +template +__host__ __device__ constexpr auto operator-(Sequence a, Array b) { - TData result = a[0]; + static_assert(sizeof...(Is) == NSize, "wrong! size not the same"); - static_for<1, NSize, 1>{}([&](auto I) { + Array result; + + static_for<0, NSize, 1>{}([&](auto I) { + constexpr index_t i = I.Get(); + + result[i] = a.Get(I) - b[i]; + }); + + return result; +} + +template +__host__ __device__ constexpr TData +accumulate_on_array(const Array& a, Reduce f, TData init) +{ + TData result = init; + + static_assert(NSize > 0, "wrong"); + + static_for<0, NSize, 1>{}([&](auto I) { constexpr index_t i = I.Get(); result = f(result, a[i]); }); return result; } + +template +__host__ __device__ void print_Array(const char* s, Array a) +{ + constexpr index_t nsize = a.GetSize(); + + static_assert(nsize > 0 && nsize <= 10, "wrong!"); + + static_if{}([&](auto) { printf("%s size %u, {%u}\n", s, nsize, a[0]); }); + + static_if{}([&](auto) { printf("%s size %u, {%u %u}\n", s, nsize, a[0], a[1]); }); + + static_if{}( + [&](auto) { printf("%s size %u, {%u %u %u}\n", s, nsize, a[0], a[1], a[2]); }); + + static_if{}( + [&](auto) { printf("%s size %u, {%u %u %u %u}\n", s, nsize, a[0], a[1], a[2], a[3]); }); + + static_if{}([&](auto) { + printf("%s size %u, {%u %u %u %u %u}\n", s, nsize, a[0], a[1], a[2], a[3], a[4]); + }); + + static_if{}([&](auto) { + printf("%s size %u, {%u %u %u %u %u %u}\n", s, nsize, a[0], a[1], a[2], a[3], a[4], a[5]); + }); + + static_if{}([&](auto) { + printf("%s size %u, {%u %u %u %u %u %u %u}\n", + s, + nsize, + a[0], + a[1], + a[2], + a[3], + a[4], + a[5], + a[6]); + }); + + static_if{}([&](auto) { + printf("%s size %u, {%u %u %u %u %u %u %u %u}\n", + s, + nsize, + a[0], + a[1], + a[2], + a[3], + a[4], + a[5], + a[6], + a[7]); + }); + + static_if{}([&](auto) { + printf("%s size %u, {%u %u %u %u %u %u %u %u %u}\n", + s, + nsize, + a[0], + a[1], + a[2], + a[3], + a[4], + a[5], + a[6], + a[7], + a[8]); + }); + + static_if{}([&](auto) { + printf("%s size %u, {%u %u %u %u %u %u %u %u %u %u}\n", + s, + nsize, + a[0], + a[1], + a[2], + a[3], + a[4], + a[5], + a[6], + a[7], + a[8], + a[9]); + }); +} diff --git a/src/include/ConstantMergedTensorDescriptor.hip.hpp b/src/include/ConstantMergedTensorDescriptor.hip.hpp index 31bbf89b89..e8fb88d9b2 100644 --- a/src/include/ConstantMergedTensorDescriptor.hip.hpp +++ b/src/include/ConstantMergedTensorDescriptor.hip.hpp @@ -99,15 +99,7 @@ struct ConstantMergedTensorDescriptor return original_multi_id; } -#if 0 // not needed - __host__ __device__ static index_t - GetOffsetFromOriginalMultiIndex(Array original_multi_id) - { - return OriginalTensorDesc::GetOffsetFromMultiIndex(original_multi_id); - } -#endif - - __host__ __device__ static index_t 
diff --git a/src/include/ConstantMergedTensorDescriptor.hip.hpp b/src/include/ConstantMergedTensorDescriptor.hip.hpp
index 31bbf89b89..e8fb88d9b2 100644
--- a/src/include/ConstantMergedTensorDescriptor.hip.hpp
+++ b/src/include/ConstantMergedTensorDescriptor.hip.hpp
@@ -99,15 +99,7 @@ struct ConstantMergedTensorDescriptor
         return original_multi_id;
     }

-#if 0 // not needed
-    __host__ __device__ static index_t
-    GetOffsetFromOriginalMultiIndex(Array<index_t, nOriginalDim> original_multi_id)
-    {
-        return OriginalTensorDesc::GetOffsetFromMultiIndex(original_multi_id);
-    }
-#endif
-
-    __host__ __device__ static index_t GetOffsetFromMultiIndexA(Array<index_t, nDim> multi_id)
+    __host__ __device__ static index_t GetOffsetFromMultiIndex(Array<index_t, nDim> multi_id)
     {
         const auto original_multi_id = GetOriginalMultiIndexFromMultiIndex(multi_id);

@@ -126,38 +118,6 @@ struct ConstantMergedTensorDescriptor

         return dummy_desc.GetMultiIndexFrom1dIndex(id);
     }
-
-#if 0 // not needed
-    template <index_t IDim>
-    __host__ __device__ static index_t GetNewOriginalMultiIndexAfterMovingAlongOneDimension(
-        Array<index_t, nOriginalDim> old_original_multi_id, Number<IDim>, index_t step_size)
-    {
-        auto new_original_multi_id = old_original_multi_id;
-
-        // get partial-original-multi-id corresponding to this merged dimension
-        constexpr auto original_partial_dims = std::get<IDim>(mOriginalDimMergeSeqs);
-
-        constexpr auto original_partial_tensor_desc =
-            OriginalTensorDesc::Extract(original_partial_dims);
-
-        auto old_original_partial_multi_id =
-            extract_array(old_original_mutli_id, original_paritial_dims);
-
-        auto new_original_partial_multi_id =
-            original_partial_tensor_desc.GetNewMultiIndexGivenStepSizeOf1dIndex(
-                old_original_partial_multi_id, step_size);
-
-        // update original-multi-id
-        static_for<0, original_dims_partial.GetSize(), 1>{}([&](auto I_) {
-            constexpr auto I = decltype(I_){};
-            constexpr index_t idim_original = original_dims_partial.Get(I);
-
-            new_original_multi_id[idim_original] = original_multi_id_partial[I.Get()];
-        });
-
-        return new_original_multi_id;
-    }
-#endif
 };

 template <class OriginalTensorDesc, class... OriginalDimMergeSeqs>
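For intuition about what GetOriginalMultiIndexFromMultiIndex undoes: a merged dimension's length is the product of the original lengths it covers, and a merged index decomposes back with division and modulo. A hypothetical two-dimension merge, with names and strides invented purely for illustration:

    #include <cstdio>

    int main()
    {
        const unsigned Y = 3;                      // merged dims have lengths C = 8, Y = 3
        const unsigned stride_c = 9, stride_y = 3; // strides of C and Y in the original tensor

        for(unsigned e : {0u, 5u, 23u})
        {
            unsigned c = e / Y, y = e % Y;         // merged index -> original multi-index
            // the offset then comes from the original strides, which is what
            // GetOffsetFromMultiIndex computes via the original descriptor
            std::printf("e=%2u -> c=%u y=%u offset=%u\n", e, c, y, c * stride_c + y * stride_y);
        }
    }
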
diff --git a/src/include/ConstantTensorDescriptor.hip.hpp b/src/include/ConstantTensorDescriptor.hip.hpp
index 9e2b14550c..86188adc50 100644
--- a/src/include/ConstantTensorDescriptor.hip.hpp
+++ b/src/include/ConstantTensorDescriptor.hip.hpp
@@ -4,7 +4,7 @@ template <class Lengths>
 __host__ __device__ constexpr auto calculate_tensor_strides_default_rank_packed(Lengths)
 {
-    return reverse_inclusive_scan_sequence(Lengths{}.PopFront(), std::multiplies<index_t>{})
+    return reverse_inclusive_scan_sequence(Lengths{}.PopFront(), mod_conv::multiplies<index_t>{})
         .PushBack(Number<1>{});
 }

@@ -95,7 +95,7 @@ struct ConstantTensorDescriptor
     __host__ __device__ static constexpr index_t GetElementSize()
     {
-        return accumulate_on_sequence(Lengths{}, std::multiplies<index_t>{}, Number<1>{});
+        return accumulate_on_sequence(Lengths{}, mod_conv::multiplies<index_t>{}, Number<1>{});
     }

     // WRONG! ReorderGivenOld2New is broken
@@ -107,10 +107,10 @@ struct ConstantTensorDescriptor
         constexpr auto strides_in_rank = GetStrides().ReorderGivenOld2new(MemoryRank{});

         constexpr index_t element_space_unaligned = accumulate_on_sequence(
-            (lengths_in_rank - Number<1>{}) * strides_in_rank, std::plus<index_t>{}, Number<1>{});
+            (lengths_in_rank - Number<1>{}) * strides_in_rank, mod_conv::plus<index_t>{}, Number<1>{});
 #else // WRONG! align should be applied to the last memory rank, not the last tensor dimension
         constexpr index_t element_space_unaligned = accumulate_on_sequence(
-            (GetLengths() - Number<1>{}) * GetStrides(), std::plus<index_t>{}, Number<1>{});
+            (GetLengths() - Number<1>{}) * GetStrides(), mod_conv::plus<index_t>{}, Number<1>{});
 #endif

         return align.Get() * ((element_space_unaligned + align.Get() - 1) / align.Get());
@@ -144,7 +144,8 @@ struct ConstantTensorDescriptor

         constexpr auto multi_id = Sequence<Is...>{};

-        return accumulate_on_sequence(multi_id * GetStrides(), std::plus<index_t>{}, Number<0>{});
+        return accumulate_on_sequence(
+            multi_id * GetStrides(), mod_conv::plus<index_t>{}, Number<0>{});
     }

 #if 0 // ReorderGivenOld2new is broken
@@ -197,32 +198,70 @@ struct ConstantTensorDescriptor
     // This function doesn't do carry check on the highest dimension, for performance reason.
     // It is the user's responsibility to make sure the result "new_multi_id" is not out-of-bound
     // on the highest dimension
+    template <bool PositiveDirection>
     __host__ __device__ static Array<index_t, nDim>
     UpdateMultiIndexGivenStepSizeOf1dIndex(Array<index_t, nDim> old_multi_id,
-                                           index_t step_size_of_1d_index)
+                                           index_t step_size_of_1d_index,
+                                           integral_constant<bool, PositiveDirection>)
     {
-        auto new_multi_id = old_multi_id + GetMultiIndexFrom1dIndex(step_size_of_1d_index);
+        Array<index_t, nDim> new_multi_id;

-        bool carry = false;
+        const auto step_sizes = GetMultiIndexFrom1dIndex(step_size_of_1d_index);

-        // do carry check in reversed order, starting from lowest dimension
-        // don't check the highest dimension
-        static_for<0, nDim - 1, 1>{}([&](auto IDimReverse) {
-            constexpr index_t idim = nDim - 1 - IDimReverse.Get();
-            constexpr auto IDim    = Number<idim>{};
+        static_if<PositiveDirection>{}([&](auto) {
+            new_multi_id = old_multi_id + step_sizes;

-            if(carry)
-            {
-                ++new_multi_id[idim];
-            }
+            bool carry = false;

-            carry = false;
+            // do carry check in reversed order, starting from lowest dimension
+            // don't check the highest dimension
+            static_for<0, nDim - 1, 1>{}([&](auto IDimReverse) {
+                constexpr index_t idim = nDim - 1 - IDimReverse.Get();
+                constexpr auto IDim    = Number<idim>{};

-            if(new_multi_id[idim] >= GetLength(IDim))
-            {
-                new_multi_id[idim] -= GetLength(IDim);
-                carry = true;
-            }
+                if(carry)
+                {
+                    ++new_multi_id[idim];
+                }
+
+                carry = false;
+
+                if(new_multi_id[idim] >= GetLength(IDim))
+                {
+                    new_multi_id[idim] -= GetLength(IDim);
+                    carry = true;
+                }
+            });
+        }).Else([&](auto) {
+            // shift up multi-id to avoid unsigned integer underflow during intermediate
+            // calculations. After the shift, should have new_multi_id[...] >= 1
+            new_multi_id = old_multi_id + (GetLengths() - step_sizes);
+
+            bool borrow = false;
+
+            // do borrow check in reversed order, starting from lowest dimension
+            // don't check the highest dimension
+            static_for<0, nDim - 1, 1>{}([&](auto IDimReverse) {
+                constexpr index_t idim = nDim - 1 - IDimReverse.Get();
+                constexpr auto IDim    = Number<idim>{};
+
+                if(borrow)
+                {
+                    --new_multi_id[idim];
+                }
+
+                borrow = false;
+
+                if(new_multi_id[idim] < GetLength(IDim))
+                {
+                    new_multi_id[idim] += GetLength(IDim);
+                    borrow = true;
+                }
+            });
+
+            // shift back down multi-id
+            // here, should have new_multi_id[...] >= GetLengths()
+            new_multi_id = new_multi_id - GetLengths();
         });

         return new_multi_id;
@@ -255,7 +294,7 @@ struct ConstantTensorDescriptor
     }

     template <class... Ts>
-    __host__ __device__ static constexpr auto Inject(ConstantTensorDescriptor<Ts...>)
+    __host__ __device__ static constexpr auto Embed(ConstantTensorDescriptor<Ts...>)
     {
         using leaf_tensor = ConstantTensorDescriptor<Ts...>;

@@ -290,7 +329,7 @@ struct ConstantTensorDescriptor
         constexpr auto fold_intervals = Sequence<FoldIntervals...>{};

         constexpr index_t fold_intervals_product =
-            accumulate_on_sequence(fold_intervals, std::multiplies<index_t>{}, Number<1>{});
+            accumulate_on_sequence(fold_intervals, mod_conv::multiplies<index_t>{}, Number<1>{});

         constexpr auto unfold_length = GetLength(Number<IDim>{});
         constexpr auto unfold_stride = GetStride(Number<IDim>{});
@@ -309,7 +348,7 @@ struct ConstantTensorDescriptor
         constexpr auto fold_strides =
             Number<unfold_stride>{} *
             reverse_inclusive_scan_sequence(fold_intervals.PushBack(Number<1>{}),
-                                            std::multiplies<index_t>{});
+                                            mod_conv::multiplies<index_t>{});

         // folded_ranks
         constexpr auto fold_ranks =
@@ -389,7 +428,7 @@ struct ConstantTensorDescriptor

         // unfolded length, stride and rank
         constexpr index_t unfold_length = accumulate_on_sequence(
-            GetLengths().Extract(middle), std::multiplies<index_t>{}, Number<1>{});
+            GetLengths().Extract(middle), mod_conv::multiplies<index_t>{}, Number<1>{});

         constexpr index_t unfold_stride = GetStride(Number<FirstUnfoldDim>{});
@@ -472,7 +511,20 @@ __host__ __device__ void print_ConstantTensorDescriptor(TDesc, const char* s)
 {
     constexpr index_t ndim = TDesc::GetNumOfDimension();

-    static_assert(ndim >= 2 && ndim <= 10, "wrong!");
+    static_assert(ndim >= 1 && ndim <= 10, "wrong!");
+
+    static_if<ndim == 1>{}([&](auto fwd) {
+        constexpr auto I0 = Number<0>{};
+
+        constexpr auto desc = fwd(TDesc{});
+
+        printf("%s dim %u, lengths {%u}, strides {%u}, ranks {%u}\n",
+               s,
+               desc.GetNumOfDimension(),
+               desc.GetLength(I0),
+               desc.GetStride(I0),
+               desc.GetMemoryRank(I0));
+    });

     static_if<ndim == 2>{}([&](auto fwd) {
         constexpr auto I0 = Number<0>{};
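The carry/borrow scheme above is ordinary mixed-radix arithmetic; the only subtlety is the shift by GetLengths() that keeps unsigned intermediates from underflowing in the negative direction. A standalone sketch with assumed lengths; like the device code, it never touches dimension 0, so the caller must guarantee no carry or borrow reaches it:

    #include <cstdio>

    constexpr unsigned nDim = 4;
    const unsigned len[nDim] = {8, 4, 16, 2}; // tensor lengths, dim 0 is highest

    void step_up(unsigned id[nDim], const unsigned step[nDim])
    {
        for(unsigned d = 0; d < nDim; ++d)
            id[d] += step[d];

        bool carry = false;
        for(unsigned d = nDim - 1; d > 0; --d) // lowest dimension first, dim 0 excluded
        {
            if(carry) ++id[d];
            carry = false;
            if(id[d] >= len[d]) { id[d] -= len[d]; carry = true; }
        }
    }

    void step_down(unsigned id[nDim], const unsigned step[nDim])
    {
        // shift every digit up by its radix first, so unsigned math cannot underflow
        for(unsigned d = 0; d < nDim; ++d)
            id[d] += len[d] - step[d];

        bool borrow = false;
        for(unsigned d = nDim - 1; d > 0; --d)
        {
            if(borrow) --id[d];
            borrow = false;
            if(id[d] < len[d]) { id[d] += len[d]; borrow = true; }
        }

        for(unsigned d = 0; d < nDim; ++d) // undo the shift
            id[d] -= len[d];
    }

    int main()
    {
        unsigned id[nDim]   = {1, 2, 15, 0};
        unsigned step[nDim] = {0, 0, 1, 1};

        step_up(id, step);   // {1, 2, 15, 0} + step -> {1, 3, 0, 1}
        std::printf("up:   %u %u %u %u\n", id[0], id[1], id[2], id[3]);

        step_down(id, step); // and back to {1, 2, 15, 0}
        std::printf("down: %u %u %u %u\n", id[0], id[1], id[2], id[3]);
    }
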
diff --git a/src/include/Sequence.hip.hpp b/src/include/Sequence.hip.hpp
index 81998dd9fb..1b96b9351b 100644
--- a/src/include/Sequence.hip.hpp
+++ b/src/include/Sequence.hip.hpp
@@ -495,3 +495,39 @@ __host__ __device__ constexpr auto Sequence<Is...>::Modify(Number<I>, Number<X>)

     return seq_left.PushBack(Number<X>{}).Append(seq_right);
 }
+
+template <index_t... Xs>
+__host__ __device__ void print_Sequence(const char* s, Sequence<Xs...>)
+{
+    constexpr index_t nsize = Sequence<Xs...>::GetSize();
+
+    static_assert(nsize <= 10, "wrong!");
+
+    static_if<nsize == 0>{}([&](auto) { printf("%s size %u, {}\n", s, nsize, Xs...); });
+
+    static_if<nsize == 1>{}([&](auto) { printf("%s size %u, {%u}\n", s, nsize, Xs...); });
+
+    static_if<nsize == 2>{}([&](auto) { printf("%s size %u, {%u %u}\n", s, nsize, Xs...); });
+
+    static_if<nsize == 3>{}([&](auto) { printf("%s size %u, {%u %u %u}\n", s, nsize, Xs...); });
+
+    static_if<nsize == 4>{}([&](auto) { printf("%s size %u, {%u %u %u %u}\n", s, nsize, Xs...); });
+
+    static_if<nsize == 5>{}(
+        [&](auto) { printf("%s size %u, {%u %u %u %u %u}\n", s, nsize, Xs...); });
+
+    static_if<nsize == 6>{}(
+        [&](auto) { printf("%s size %u, {%u %u %u %u %u %u}\n", s, nsize, Xs...); });
+
+    static_if<nsize == 7>{}(
+        [&](auto) { printf("%s size %u, {%u %u %u %u %u %u %u}\n", s, nsize, Xs...); });
+
+    static_if<nsize == 8>{}(
+        [&](auto) { printf("%s size %u, {%u %u %u %u %u %u %u %u}\n", s, nsize, Xs...); });
+
+    static_if<nsize == 9>{}(
+        [&](auto) { printf("%s size %u, {%u %u %u %u %u %u %u %u %u}\n", s, nsize, Xs...); });
+
+    static_if<nsize == 10>{}(
+        [&](auto) { printf("%s size %u, {%u %u %u %u %u %u %u %u %u %u}\n", s, nsize, Xs...); });
+}
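The print helpers above lean on the repo's static_if, which instantiates only the selected branch of a generic lambda. A minimal sketch of that idiom (an assumption: the repo's actual definition may differ in detail; the lambda's ignored auto argument plays the role of the fwd forwarder seen elsewhere in this patch, keeping the body dependent until the branch is actually taken):

    #include <cstdio>

    struct identity
    {
        template <class T>
        T operator()(T x) const { return x; }
    };

    template <bool>
    struct static_if;

    template <>
    struct static_if<true>
    {
        template <class F>
        const static_if& operator()(F f) const { f(identity{}); return *this; }

        template <class F>
        void Else(F) const {}
    };

    template <>
    struct static_if<false>
    {
        template <class F>
        const static_if& operator()(F) const { return *this; }

        template <class F>
        void Else(F f) const { f(identity{}); }
    };

    int main()
    {
        constexpr int nsize = 3;

        static_if<(nsize <= 5)>{}([&](auto) { std::puts("small"); })
            .Else([&](auto) { std::puts("large"); }); // prints "small"
    }
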
diff --git a/src/include/blockwise_3d_tensor_op.hip.hpp b/src/include/blockwise_3d_tensor_op.hip.hpp
index e7e7ee5592..058a5b4401 100644
--- a/src/include/blockwise_3d_tensor_op.hip.hpp
+++ b/src/include/blockwise_3d_tensor_op.hip.hpp
@@ -158,7 +158,7 @@ struct Blockwise3dTensorCopy3
                       "wrong! BlockSize is not big enough for ThreadPerDims!");

         constexpr index_t num_active_thread =
-            accumulate_on_sequence(ThreadPerDims{}, std::multiplies<index_t>{}, Number<1>{});
+            accumulate_on_sequence(ThreadPerDims{}, mod_conv::multiplies<index_t>{}, Number<1>{});

         if(BlockSize > num_active_thread)
         {
diff --git a/src/include/blockwise_4d_tensor_op.hip.hpp b/src/include/blockwise_4d_tensor_op.hip.hpp
index ef3dc3e4d6..fdb32e8f05 100644
--- a/src/include/blockwise_4d_tensor_op.hip.hpp
+++ b/src/include/blockwise_4d_tensor_op.hip.hpp
@@ -500,7 +500,7 @@ struct Blockwise4dTensorCopy3
                       "wrong! BlockSize is not big enough for ThreadPerDims!");

         constexpr index_t num_active_thread =
-            accumulate_on_sequence(ThreadPerDims{}, std::multiplies<index_t>{}, Number<1>{});
+            accumulate_on_sequence(ThreadPerDims{}, mod_conv::multiplies<index_t>{}, Number<1>{});

         if(BlockSize > num_active_thread)
         {
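Both copies gate surplus threads on num_active_thread, the product of ThreadPerDims. A trivial host model of that check, with assumed shapes:

    #include <cstdio>

    int main()
    {
        const unsigned BlockSize = 256;
        const unsigned thread_per_dims[3] = {2, 4, 16};

        unsigned num_active_thread = 1;
        for(unsigned d = 0; d < 3; ++d)
            num_active_thread *= thread_per_dims[d]; // the product accumulate_on_sequence computes

        if(BlockSize > num_active_thread)
        {
            // in the kernel, the surplus threads simply return before the copy
            std::printf("%u of %u threads idle during this copy\n",
                        BlockSize - num_active_thread, BlockSize);
        }
    }
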
only support movement in positive direction for now"); - constexpr auto IDim = Number{}; constexpr index_t idim = IDim.Get(); @@ -306,7 +307,7 @@ struct BlockwiseGenericTensorSliceCopy_v1 auto new_src_partial_original_multi_id = src_partial_original_desc.UpdateMultiIndexGivenStepSizeOf1dIndex( - old_src_partial_original_multi_id, StepSize); + old_src_partial_original_multi_id, StepSize, direction); // update "mThreadSrcOriginalMultiId" static_for<0, src_partial_original_dims.GetSize(), 1>{}([&](auto I_) { @@ -328,7 +329,7 @@ struct BlockwiseGenericTensorSliceCopy_v1 mThreadSrcPartialOffsets[idim] = new_src_partial_offset; // update "mThreadSrcOffset", do "+" before "-" to avoid underflow - mThreadSrcOffset = mThreadSrcOffset + new_src_partial_offset - old_src_partial_offset; + mThreadSrcOffset = (mThreadSrcOffset + new_src_partial_offset) - old_src_partial_offset; }).Else([&](auto fwd) { // Logic for non-merged dimension. If you are never going to move the slicing window on // a merged dimension, then "mThreadSrcOriginalMultiId" and "mThreadSrcPartialOffsets", @@ -336,13 +337,25 @@ struct BlockwiseGenericTensorSliceCopy_v1 // should be able to remove these calculations. // TODO: make sure compiler would actually remove them in this case. + // It is the user's responsiblity to make sure the slicing window will not be moved out + // of the boundary of the tensor being sliced. Otherwise, there might be hazard like + // unsigned integer underflow. That is NO runtime sanity check to prevent the hazard + constexpr index_t idim_original = SrcDesc::GetContainedOriginalDimensions(IDim).Front(); - mThreadSrcOffset += StepSize * SrcDesc::GetStride(IDim); + static_if{}([&](auto) { + mThreadSrcOffset += StepSize * SrcDesc::GetStride(IDim); - mThreadSrcOriginalMultiId[idim_original] += StepSize; + mThreadSrcOriginalMultiId[idim_original] += StepSize; - mThreadSrcPartialOffsets[idim] += StepSize * SrcDesc::GetStride(IDim); + mThreadSrcPartialOffsets[idim] += StepSize * SrcDesc::GetStride(IDim); + }).Else([&](auto) { + mThreadSrcOffset -= StepSize * SrcDesc::GetStride(IDim); + + mThreadSrcOriginalMultiId[idim_original] -= StepSize; + + mThreadSrcPartialOffsets[idim] -= StepSize * SrcDesc::GetStride(IDim); + }); }); } }; diff --git a/src/include/common.hip.hpp b/src/include/common.hip.hpp index da047b1472..1de33aa4df 100644 --- a/src/include/common.hip.hpp +++ b/src/include/common.hip.hpp @@ -39,6 +39,18 @@ struct scales __host__ __device__ constexpr T operator()(T a) const { return s * a; } }; +template +struct plus +{ + __host__ __device__ constexpr T operator()(T a, T b) const { return a + b; } +}; + +template +struct multiplies +{ + __host__ __device__ constexpr T operator()(T a, T b) const { return a * b; } +}; + template struct integer_divide_ceiler { diff --git a/src/include/gridwise_convolution_implicit_gemm_v3_nchw_cyxk_nkhw.hip.hpp b/src/include/gridwise_convolution_implicit_gemm_v3_nchw_cyxk_nkhw.hip.hpp index 5c924dd67a..f680531d38 100644 --- a/src/include/gridwise_convolution_implicit_gemm_v3_nchw_cyxk_nkhw.hip.hpp +++ b/src/include/gridwise_convolution_implicit_gemm_v3_nchw_cyxk_nkhw.hip.hpp @@ -58,6 +58,9 @@ struct GridwiseConvolutionImplicitGemm_v3_nchw_cyxk_nkhw constexpr auto I6 = Number<6>{}; constexpr auto I7 = Number<7>{}; + constexpr auto True = integral_constant{}; + constexpr auto False = integral_constant{}; + constexpr auto in_n_c_h_w_global_desc = InGlobalDesc{}; constexpr auto wei_c_y_x_k_global_desc = WeiGlobalDesc{}; constexpr auto out_n_k_h_w_global_desc = OutGlobalDesc{}; @@ 
diff --git a/src/include/common.hip.hpp b/src/include/common.hip.hpp
index da047b1472..1de33aa4df 100644
--- a/src/include/common.hip.hpp
+++ b/src/include/common.hip.hpp
@@ -39,6 +39,18 @@ struct scales
     __host__ __device__ constexpr T operator()(T a) const { return s * a; }
 };

+template <class T>
+struct plus
+{
+    __host__ __device__ constexpr T operator()(T a, T b) const { return a + b; }
+};
+
+template <class T>
+struct multiplies
+{
+    __host__ __device__ constexpr T operator()(T a, T b) const { return a * b; }
+};
+
 template <class T>
 struct integer_divide_ceiler
 {
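These functors replicate std::plus and std::multiplies. The likely reason for the swap throughout this patch is that the hand-rolled versions carry __host__ __device__ and constexpr, so they work inside kernels and compile-time folds; that motivation is inferred, not stated in the patch. A host-only sketch of such a fold:

    #include <cstdio>

    template <class T>
    struct multiplies // same shape as the functor added above, minus the HIP qualifiers
    {
        constexpr T operator()(T a, T b) const { return a * b; }
    };

    template <class T, unsigned N, class F>
    constexpr T fold(const T (&a)[N], F f, T init)
    {
        T r = init;
        for(unsigned i = 0; i < N; ++i)
            r = f(r, a[i]);
        return r;
    }

    int main()
    {
        constexpr unsigned lengths[3] = {2, 4, 8};
        static_assert(fold(lengths, multiplies<unsigned>{}, 1u) == 64, "compile-time fold");
        std::printf("%u\n", fold(lengths, multiplies<unsigned>{}, 1u)); // 64
    }
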
diff --git a/src/include/gridwise_convolution_implicit_gemm_v3_nchw_cyxk_nkhw.hip.hpp b/src/include/gridwise_convolution_implicit_gemm_v3_nchw_cyxk_nkhw.hip.hpp
index 5c924dd67a..f680531d38 100644
--- a/src/include/gridwise_convolution_implicit_gemm_v3_nchw_cyxk_nkhw.hip.hpp
+++ b/src/include/gridwise_convolution_implicit_gemm_v3_nchw_cyxk_nkhw.hip.hpp
@@ -58,6 +58,9 @@ struct GridwiseConvolutionImplicitGemm_v3_nchw_cyxk_nkhw
         constexpr auto I6 = Number<6>{};
         constexpr auto I7 = Number<7>{};

+        constexpr auto True  = integral_constant<bool, true>{};
+        constexpr auto False = integral_constant<bool, false>{};
+
         constexpr auto in_n_c_h_w_global_desc  = InGlobalDesc{};
         constexpr auto wei_c_y_x_k_global_desc = WeiGlobalDesc{};
         constexpr auto out_n_k_h_w_global_desc = OutGlobalDesc{};
@@ -123,7 +126,7 @@ struct GridwiseConvolutionImplicitGemm_v3_nchw_cyxk_nkhw
         // input blockwise copy
         // slice a merged tensor, reorder and copy to a normal tensor
         // this copy operator already has blockwise offset built-in
-        const auto blockwise_in_copy = BlockwiseGenericTensorSliceCopy_v1<
+        auto blockwise_in_copy = BlockwiseGenericTensorSliceCopy_v1<
             BlockSize,
             Float,
             decltype(in_c_n1_b_n2_global_merged_desc),
@@ -150,20 +153,20 @@ struct GridwiseConvolutionImplicitGemm_v3_nchw_cyxk_nkhw
         // operator for blockwise copy of weight into LDS
         // slice a tensor, and copy it into another tensor
         // this copy operator already has blockwise offset built-in
-        const auto blockwise_wei_copy =
-#if 0
+        auto blockwise_wei_copy =
+#if 1
             BlockwiseGenericTensorSliceCopy_v1<BlockSize,
-                Sequence<0, 1>, // thread_arrange_order [C, K]
-                Sequence<0, 1>, // src_access_order [C, K]
-                Sequence<0, 1>, // dst_access_order [C, K]
-                WeiBlockCopyDataPerAccess_K,
-                WeiBlockCopyDataPerAccess_K>(
+                Float,
+                decltype(wei_c_k_global_desc),
+                decltype(wei_c_k_block_desc),
+                decltype(wei_c_k_block_desc.GetLengths()),
+                WeiBlockCopySubLengths_C_K,
+                WeiBlockCopyClusterLengths_C_K,
+                Sequence<0, 1>, // thread_arrange_order [C, K]
+                Sequence<0, 1>, // src_access_order [C, K]
+                Sequence<0, 1>, // dst_access_order [C, K]
+                WeiBlockCopyDataPerAccess_K,
+                WeiBlockCopyDataPerAccess_K>(
                 {0, k_block_data_on_global}, {0, 0});
 #else
             Blockwise2dTensorCopy3<BlockSize,
@@ ... @@ struct GridwiseConvolutionImplicitGemm_v3_nchw_cyxk_nkhw
-        constexpr auto a_c_k_block_mtx_desc = make_ConstantMatrixDescriptor(Number<CPerBlock>{},
-                                                                            Number<KPerBlock>{},
-                                                                            Number<KPerBlock>{});
+        // GEMM definition
+        // c_mtx += transpose(a_mtx) * b_mtx
+        //   a_mtx[CPerBlock, KPerBlock] is in LDS
+        //   b_mtx[CPerBlock, N1 * BPerBlock * N2] is in LDS
+        //   c_mtx[KPerBlock, N1 * BPerBlock * N2] is distributed among threads, and saved in
+        //   register
+        constexpr auto a_c_k_block_mtx_desc = make_ConstantMatrixDescriptor(
+            Number<CPerBlock>{}, Number<KPerBlock>{}, Number<KPerBlock>{});

         constexpr auto b_c_n1bn2_block_mtx_desc = make_ConstantMatrixDescriptor(Number<CPerBlock>{},
@@ -239,6 +240,7 @@ struct GridwiseConvolutionImplicitGemm_v3_nchw_cyxk_nkhw
         // zero out threadwise output
         threadwise_matrix_set_zero(c_k0k2_n1n2_thread_mtx_desc, p_out_thread);

+#if 0
         // do work
         for(index_t y = 0; y < Y; ++y)
         {
@@ -269,6 +271,45 @@ struct GridwiseConvolutionImplicitGemm_v3_nchw_cyxk_nkhw
             }
         }
     }
+#else
+        for(index_t y = 0; y < Y; ++y)
+        {
+            for(index_t x = 0; x < X; ++x)
+            {
+                // calculate origin of block input and weight tensor on global memory
+                const Float* p_in_block_on_global =
+                    p_in_global + in_n_c_h_w_global_desc.GetOffsetFromMultiIndex(0, 0, y, x);
+
+                const Float* p_wei_block_on_global =
+                    p_wei_global + wei_c_y_x_k_global_desc.GetOffsetFromMultiIndex(0, y, x, 0);
+
+                for(index_t c_block_data_on_global = 0; c_block_data_on_global < C;
+                    c_block_data_on_global += CPerBlock)
+                {
+                    blockwise_in_copy.Run(p_in_block_on_global, p_in_block);
+                    blockwise_wei_copy.Run(p_wei_block_on_global, p_wei_block);
+
+                    __syncthreads();
+
+                    blockwise_gemm.Run(p_wei_block, p_in_block, p_out_thread);
+
+                    __syncthreads();
+
+                    // move on C: C_N1_B_N2, C_K
+                    blockwise_in_copy.MoveSlicingWindowOnSourceTensor(
+                        I0, Number<CPerBlock>{}, True);
+
+                    blockwise_wei_copy.MoveSlicingWindowOnSourceTensor(
+                        I0, Number<CPerBlock>{}, True);
+                }
+
+                // reset C
+                blockwise_in_copy.MoveSlicingWindowOnSourceTensor(I0, Number<C>{}, False);
+
+                blockwise_wei_copy.MoveSlicingWindowOnSourceTensor(I0, Number<C>{}, False);
+            }
+        }
+#endif

         // copy output: register to global memory
         {
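The GEMM comment in this file fixes the convention c_mtx += transpose(a_mtx) * b_mtx, with the C dimension as the shared row index of both operands. A plain reference loop with made-up sizes, illustrative only since the real c_mtx is distributed across threads:

    #include <cstdio>

    int main()
    {
        const int Cdim = 2, K = 3, N = 4;
        float a[2][3] = {{1, 2, 3}, {4, 5, 6}};       // a_mtx[CPerBlock][KPerBlock]
        float b[2][4] = {{1, 0, 1, 0}, {0, 1, 0, 1}}; // b_mtx[CPerBlock][N1*BPerBlock*N2]
        float c[3][4] = {};                           // c_mtx[KPerBlock][N1*BPerBlock*N2]

        for(int cc = 0; cc < Cdim; ++cc) // reduction runs over the row index of both A and B
            for(int k = 0; k < K; ++k)
                for(int n = 0; n < N; ++n)
                    c[k][n] += a[cc][k] * b[cc][n];

        std::printf("c[0][1] = %g\n", c[0][1]); // 4
    }
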
diff --git a/src/include/gridwise_convolution_implicit_gemm_v4_nchw_kcyx_nkhw.hip.hpp b/src/include/gridwise_convolution_implicit_gemm_v4_nchw_kcyx_nkhw.hip.hpp
index e8ce51e76c..f492b477a1 100644
--- a/src/include/gridwise_convolution_implicit_gemm_v4_nchw_kcyx_nkhw.hip.hpp
+++ b/src/include/gridwise_convolution_implicit_gemm_v4_nchw_kcyx_nkhw.hip.hpp
@@ -59,7 +59,7 @@ struct GridwiseConvolutionImplicitGemm_v4_nchw_kcyx_nkhw
         constexpr auto I6 = Number<6>{};
         constexpr auto I7 = Number<7>{};

-        constexpr auto TRUE = integral_constant<bool, true>{};
+        constexpr auto True = integral_constant<bool, true>{};

         constexpr auto in_n_c_h_w_global_desc  = InGlobalDesc{};
         constexpr auto wei_k_c_y_x_global_desc = WeiGlobalDesc{};
@@ -102,9 +102,9 @@ struct GridwiseConvolutionImplicitGemm_v4_nchw_kcyx_nkhw
         const index_t b_block_data_on_global = block_work_multi_id[1] * BPerBlock;

         // input tensor
-        // tensor descriptor in device memory [N0, N1, N2, H, W]
-        constexpr auto in_n0_n1_n2_h_w_global_desc = in_n_c_h_w_global_desc.Slice(I2, Number<Ho>{})
-                                                         .Slice(I3, Number<Wo>{})
+        // tensor descriptor in device memory [N0, N1, N2, Ho, Wo]
+        constexpr auto in_n0_n1_n2_h_w_global_desc = in_n_c_h_w_global_desc.Slice(I2, Number<Ho>{})
+                                                         .Slice(I3, Number<Wo>{})
             .Fold(I0, Number<N1>{}, Number<N2>{})
             .Extract(Sequence<0, 1, 2, 4, 5>{});
@@ -115,12 +115,23 @@ struct GridwiseConvolutionImplicitGemm_v4_nchw_kcyx_nkhw
         // merged tensor descriptor in device memory [E, N1, B, N2], src of blockwise copy
         constexpr auto in_e_n1_b_n2_global_merged_desc = make_ConstantMergedTensorDescriptor(
-            in_c_y_x_global_desc.Inject(in_n0_n1_n2_h_w_global_desc),
+            in_c_y_x_global_desc.Embed(in_n0_n1_n2_h_w_global_desc),
             Sequence<0, 1, 2>{},
             Sequence<4>{},
             Sequence<3, 6, 7>{},
             Sequence<5>{});

+#if 0
+        if(get_block_1d_id() == 0 && get_thread_local_1d_id() == 0)
+        {
+            print_ConstantTensorDescriptor(in_n0_n1_n2_h_w_global_desc,
+                                           "in_n0_n1_n2_h_w_global_desc: ");
+            print_ConstantTensorDescriptor(in_c_y_x_global_desc, "in_c_y_x_global_desc: ");
+            print_ConstantMergedTensorDescriptor(in_e_n1_b_n2_global_merged_desc,
+                                                 "in_e_n1_b_n2_global_merged_desc: ");
+        }
+#endif
+
         // memory layout descriptor in LDS [E, N1, B, N2], dst of blockwise copy
         // be careful of LDS alignment
         constexpr auto in_e_n1_b_n2_block_desc = make_ConstantTensorDescriptor_default_rank_aligned(
@@ -243,6 +254,31 @@ struct GridwiseConvolutionImplicitGemm_v4_nchw_kcyx_nkhw
         // do work
         for(index_t e = 0; e < E; e += EPerBlock)
         {
+#if 0
+            if(e == 1 * EPerBlock && get_block_1d_id() == 0)
+            {
+                printf("id %5u %5u: "
+                       "mThreadSrcOriginalMultiId %u %u %u %u %u %u %u %u, "
+                       "mThreadSrcPartialOffsets %u %u %u %u, "
+                       "mThreadSrcOffset %u, mThreadDstOffset %u \n",
+                       get_block_1d_id(),
+                       get_thread_local_1d_id(),
+                       blockwise_in_copy.mThreadSrcOriginalMultiId[0],
+                       blockwise_in_copy.mThreadSrcOriginalMultiId[1],
+                       blockwise_in_copy.mThreadSrcOriginalMultiId[2],
+                       blockwise_in_copy.mThreadSrcOriginalMultiId[3],
+                       blockwise_in_copy.mThreadSrcOriginalMultiId[4],
+                       blockwise_in_copy.mThreadSrcOriginalMultiId[5],
+                       blockwise_in_copy.mThreadSrcOriginalMultiId[6],
+                       blockwise_in_copy.mThreadSrcOriginalMultiId[7],
+                       blockwise_in_copy.mThreadSrcPartialOffsets[0],
+                       blockwise_in_copy.mThreadSrcPartialOffsets[1],
+                       blockwise_in_copy.mThreadSrcPartialOffsets[2],
+                       blockwise_in_copy.mThreadSrcPartialOffsets[3],
+                       blockwise_in_copy.mThreadSrcOffset,
+                       blockwise_in_copy.mThreadDstOffset);
+            }
+#endif
+
             // marching slicing window
             blockwise_in_copy.Run(p_in_global, p_in_block);
             blockwise_wei_copy.Run(p_wei_global, p_wei_block);
@@ -253,8 +289,8 @@ struct GridwiseConvolutionImplicitGemm_v4_nchw_kcyx_nkhw

             __syncthreads();

-            blockwise_in_copy.MoveSlicingWindowOnSourceTensor(I0, Number<EPerBlock>{}, TRUE);
-            blockwise_wei_copy.MoveSlicingWindowOnSourceTensor(I0, Number<EPerBlock>{}, TRUE);
+            blockwise_in_copy.MoveSlicingWindowOnSourceTensor(I0, Number<EPerBlock>{}, True);
+            blockwise_wei_copy.MoveSlicingWindowOnSourceTensor(I0, Number<EPerBlock>{}, True);
         }

         // copy output: register to global memory
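In v4 the merged E dimension of [E, N1, B, N2] runs over (C, Y, X), i.e. E = C * Y * X with X fastest, which is the same order the merged descriptor's carry logic walks when the slicing window moves by EPerBlock. An illustrative decomposition with assumed sizes:

    #include <cstdio>

    int main()
    {
        const unsigned C = 4, Y = 3, X = 3;
        const unsigned E = C * Y * X; // 36

        for(unsigned e : {0u, 8u, 35u})
        {
            unsigned c = e / (Y * X);   // slowest
            unsigned y = (e / X) % Y;
            unsigned x = e % X;         // fastest
            std::printf("e=%2u -> (c=%u, y=%u, x=%u)\n", e, c, y, x);
        }
        std::printf("E = %u\n", E);
    }
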