diff --git a/src/include/Array.hip.hpp b/src/include/Array.hip.hpp index 33cc547a97..5e00d32a09 100644 --- a/src/include/Array.hip.hpp +++ b/src/include/Array.hip.hpp @@ -34,14 +34,6 @@ struct Array __host__ __device__ TData& operator()(index_t i) { return mData[i]; } - template - __host__ __device__ constexpr TData Get(Number) const - { - static_assert(I < NSize, "wrong!"); - - return mData[I]; - } - template __host__ __device__ constexpr void Set(Number, TData x) { @@ -50,16 +42,33 @@ struct Array mData[I] = x; } + __host__ __device__ constexpr void Set(index_t I, TData x) { mData[I] = x; } + + struct lambda_PushBack // emulate constexpr lambda + { + const Array& old_array; + Array& new_array; + + __host__ __device__ constexpr lambda_PushBack(const Array& old_array_, + Array& new_array_) + : old_array(old_array_), new_array(new_array_) + { + } + + template + __host__ __device__ constexpr void operator()(Number) const + { + new_array.Set(Number{}, old_array[I]); + } + }; + __host__ __device__ constexpr auto PushBack(TData x) const { Array new_array; - static_for<0, NSize, 1>{}([&](auto I) { - constexpr index_t i = I.Get(); - new_array(i) = mData[i]; - }); + static_for<0, NSize, 1>{}(lambda_PushBack(*this, new_array)); - new_array(NSize) = x; + new_array.Set(Number{}, x); return new_array; } @@ -81,18 +90,13 @@ __host__ __device__ constexpr auto make_zero_array() template __host__ __device__ constexpr auto reorder_array_given_new2old(const Array& old_array, - Sequence new2old) + Sequence /*new2old*/) { - Array new_array; - static_assert(NSize == sizeof...(IRs), "NSize not consistent"); - static_for<0, NSize, 1>{}([&](auto IDim) { - constexpr index_t idim = IDim.Get(); - new_array[idim] = old_array[new2old.Get(IDim)]; - }); + static_assert(is_valid_sequence_map>::value, "wrong! invalid reorder map"); - return new_array; + return Array{old_array.mSize[IRs]...}; } template @@ -120,12 +124,14 @@ struct lambda_reorder_array_given_old2new template __host__ __device__ constexpr auto reorder_array_given_old2new(const Array& old_array, - Sequence old2new) + Sequence /*old2new*/) { Array new_array; static_assert(NSize == sizeof...(IRs), "NSize not consistent"); + static_assert(is_valid_sequence_map>::value, "wrong! invalid reorder map"); + static_for<0, NSize, 1>{}( lambda_reorder_array_given_old2new>(old_array, new_array)); @@ -141,25 +147,44 @@ __host__ __device__ constexpr auto extract_array(const Array& old_ static_assert(new_size <= NSize, "wrong! too many extract"); - static_for<0, new_size, 1>{}([&](auto I) { - constexpr index_t i = I.Get(); - new_array(i) = old_array[ExtractSeq::Get(I)]; - }); + static_for<0, new_size, 1>{}([&](auto I) { new_array(I) = old_array[ExtractSeq::Get(I)]; }); return new_array; } +template // emulate constepxr lambda for array math +struct lambda_array_math +{ + const F& f; + const X& x; + const Y& y; + Z& z; + + __host__ __device__ constexpr lambda_array_math(const F& f_, const X& x_, const Y& y_, Z& z_) + : f(f_), x(x_), y(y_), z(z_) + { + } + + template + __host__ __device__ constexpr void operator()(Number) const + { + constexpr auto IDim = Number{}; + + z.Set(IDim, f(x[IDim], y[IDim])); + } +}; + // Array = Array + Array template __host__ __device__ constexpr auto operator+(Array a, Array b) { Array result; - static_for<0, NSize, 1>{}([&](auto I) { - constexpr index_t i = I.Get(); + auto f = mod_conv::plus{}; - result(i) = a[i] + b[i]; - }); + static_for<0, NSize, 1>{}( + lambda_array_math( + f, a, b, result)); return result; } @@ -170,11 +195,11 @@ __host__ __device__ constexpr auto operator-(Array a, Array result; - static_for<0, NSize, 1>{}([&](auto I) { - constexpr index_t i = I.Get(); + auto f = mod_conv::minus{}; - result(i) = a[i] - b[i]; - }); + static_for<0, NSize, 1>{}( + lambda_array_math( + f, a, b, result)); return result; } @@ -187,11 +212,11 @@ __host__ __device__ constexpr auto operator+(Array a, Sequence result; - static_for<0, NSize, 1>{}([&](auto I) { - constexpr index_t i = I.Get(); + auto f = mod_conv::plus{}; - result(i) = a[i] + b.Get(I); - }); + static_for<0, NSize, 1>{}( + lambda_array_math( + f, a, b, result)); return result; } @@ -204,11 +229,11 @@ __host__ __device__ constexpr auto operator-(Array a, Sequence result; - static_for<0, NSize, 1>{}([&](auto I) { - constexpr index_t i = I.Get(); + auto f = mod_conv::minus{}; - result(i) = a[i] - b.Get(I); - }); + static_for<0, NSize, 1>{}( + lambda_array_math( + f, a, b, result)); return result; } @@ -221,11 +246,11 @@ __host__ __device__ constexpr auto operator*(Array a, Sequence result; - static_for<0, NSize, 1>{}([&](auto I) { - constexpr index_t i = I.Get(); + auto f = mod_conv::multiplies{}; - result(i) = a[i] * b.Get(I); - }); + static_for<0, NSize, 1>{}( + lambda_array_math( + f, a, b, result)); return result; } @@ -238,11 +263,11 @@ __host__ __device__ constexpr auto operator-(Sequence a, Array result; - static_for<0, NSize, 1>{}([&](auto I) { - constexpr index_t i = I.Get(); + auto f = mod_conv::minus{}; - result(i) = a.Get(I) - b[i]; - }); + static_for<0, NSize, 1>{}( + lambda_array_math( + f, a, b, result)); return result; } @@ -255,10 +280,7 @@ accumulate_on_array(const Array& a, Reduce f, TData init) static_assert(NSize > 0, "wrong"); - static_for<0, NSize, 1>{}([&](auto I) { - constexpr index_t i = I.Get(); - result = f(result, a[i]); - }); + static_for<0, NSize, 1>{}([&](auto I) { result = f(result, a[I]); }); return result; } diff --git a/src/include/ConstantTensorDescriptor.hip.hpp b/src/include/ConstantTensorDescriptor.hip.hpp index 845e6e0190..f28cb32733 100644 --- a/src/include/ConstantTensorDescriptor.hip.hpp +++ b/src/include/ConstantTensorDescriptor.hip.hpp @@ -48,13 +48,13 @@ struct ConstantTensorDescriptor template __host__ __device__ static constexpr index_t GetLength(Number) { - return Lengths{}.Get(Number{}); + return Lengths::Get(Number{}); } template __host__ __device__ static constexpr index_t GetStride(Number) { - return Strides{}.Get(Number{}); + return Strides::Get(Number{}); } struct lambda_AreDimensionsContinuous @@ -131,7 +131,7 @@ struct ConstantTensorDescriptor template __host__ __device__ constexpr void operator()(X IDim) const { - offset += multi_id.Get(IDim) * Type::GetStride(IDim); + offset += multi_id[IDim] * Type::GetStride(IDim); } }; diff --git a/src/include/Sequence.hip.hpp b/src/include/Sequence.hip.hpp index fdb3634363..a29506f215 100644 --- a/src/include/Sequence.hip.hpp +++ b/src/include/Sequence.hip.hpp @@ -2,6 +2,9 @@ #include "integral_constant.hip.hpp" #include "functional.hip.hpp" +template +struct is_valid_sequence_map; + template struct Sequence { @@ -40,27 +43,24 @@ struct Sequence template __host__ __device__ static constexpr auto ReorderGivenNew2Old(Sequence /*new2old*/) { -#if 0 // require sequence_sort, which is not implemented yet - static_assert(is_same>::SortedSeqType, - arithmetic_sequence_gen<0, mSize, 1>::SeqType>::value, - "wrong! invalid new2old map"); -#endif - static_assert(sizeof...(Is) == sizeof...(IRs), - "wrong! new2old map should have the same size as Sequence to be rerodered"); + "wrong! reorder map should have the same size as Sequence to be rerodered"); - return Sequence{})...>{}; + static_assert(is_valid_sequence_map>::value, "wrong! invalid reorder map"); + + return Sequence{})...>{}; } #if 0 // require sequence_sort, which is not implemented yet template __host__ __device__ static constexpr auto ReorderGivenOld2New(MapOld2New /*old2new*/) { -#if 0 - static_assert(is_same::SortedSeqType, - arithmetic_sequence_gen<0, mSize, 1>::SeqType>::value, - "wrong! invalid old2new map"); -#endif + static_assert(sizeof...(Is) == MapOld2New::GetSize(), + "wrong! reorder map should have the same size as Sequence to be rerodered"); + + static_assert(is_valid_sequence_map::value, + "wrong! invalid reorder map"); + constexpr auto map_new2old = typename sequence_map_inverse::SeqMapType{}; return ReorderGivenNew2Old(map_new2old); @@ -106,13 +106,13 @@ struct Sequence template __host__ __device__ static constexpr auto Extract(Number...) { - return Sequence{})...>{}; + return Sequence{})...>{}; } template __host__ __device__ static constexpr auto Extract(Sequence) { - return Sequence{})...>{}; + return Sequence{})...>{}; } template @@ -316,6 +316,7 @@ struct sequence_map_inverse> }; #endif + template struct is_valid_sequence_map { diff --git a/src/include/base.hip.hpp b/src/include/base.hip.hpp new file mode 100644 index 0000000000..dd6fc19b1f --- /dev/null +++ b/src/include/base.hip.hpp @@ -0,0 +1,113 @@ +#pragma once + +__device__ index_t get_thread_local_1d_id() { return threadIdx.x; } + +__device__ index_t get_block_1d_id() { return blockIdx.x; } + +template +struct is_same +{ + static constexpr bool value = false; +}; + +template +struct is_same +{ + static constexpr bool value = true; +}; + +template +__host__ __device__ constexpr bool is_same_type(X, Y) +{ + return is_same::value; +} + +namespace mod_conv { // namespace mod_conv +template +struct scales +{ + __host__ __device__ constexpr T operator()(T a) const { return s * a; } +}; + +template +struct plus +{ + __host__ __device__ constexpr T operator()(T a, T b) const { return a + b; } +}; + +template +struct minus +{ + __host__ __device__ constexpr T operator()(T a, T b) const { return a - b; } +}; + +template +struct multiplies +{ + __host__ __device__ constexpr T operator()(T a, T b) const { return a * b; } +}; + +template +struct integer_divide_ceiler +{ + __host__ __device__ constexpr T operator()(T a, T b) const + { + static_assert(is_same::value || is_same::value, "wrong type"); + + return (a + b - 1) / b; + } +}; + +template +__host__ __device__ constexpr T integer_divide_ceil(T a, T b) +{ + static_assert(is_same::value || is_same::value, "wrong type"); + + return (a + b - 1) / b; +} + +template +__host__ __device__ constexpr T max(T x, T y) +{ + return x > y ? x : y; +} + +template +__host__ __device__ constexpr T max(T x, Ts... xs) +{ + static_assert(sizeof...(xs) > 0, "not enough argument"); + + auto y = max(xs...); + + static_assert(is_same::value, "not the same type"); + + return x > y ? x : y; +} + +template +__host__ __device__ constexpr T min(T x, T y) +{ + return x < y ? x : y; +} + +template +__host__ __device__ constexpr T min(T x, Ts... xs) +{ + static_assert(sizeof...(xs) > 0, "not enough argument"); + + auto y = min(xs...); + + static_assert(is_same::value, "not the same type"); + + return x < y ? x : y; +} + +// this is wrong +// TODO: implement correct least common multiple, instead of calling max() +template +__host__ __device__ constexpr T lcm(T x, Ts... xs) +{ + return max(x, xs...); +} + +} // namespace mod_conv diff --git a/src/include/blockwise_generic_tensor_slice_op.hip.hpp b/src/include/blockwise_generic_tensor_slice_op.hip.hpp index 5665fb2726..ef49f7a33e 100644 --- a/src/include/blockwise_generic_tensor_slice_op.hip.hpp +++ b/src/include/blockwise_generic_tensor_slice_op.hip.hpp @@ -203,20 +203,18 @@ struct BlockwiseGenericTensorSliceCopy_v1 make_ConstantTensorDescriptor_packed(thread_sub_tensor_lengths * repeat_lengths); static_ford{}([&](auto repeat_multi_id_) { -#if 0 +#if 1 constexpr auto repeat_multi_id = sequence2array(decltype(repeat_multi_id_){}); - const auto src_thread_data_multi_id_begin = - repeat_multi_id * data_per_cluster_per_dims; // cannot not constexpr, why? + const auto src_thread_data_multi_id_begin = repeat_multi_id * data_per_cluster_per_dims; - const auto clipboard_data_multi_id_begin = - repeat_multi_id * thread_sub_tensor_lengths; // cannot not constexpr, why? + const auto clipboard_data_multi_id_begin = repeat_multi_id * thread_sub_tensor_lengths; - const index_t src_offset = SrcDesc{}.GetOffsetFromMultiIndex( - src_thread_data_multi_id_begin); // cannot not constexpr, why? + const index_t src_offset = + SrcDesc{}.GetOffsetFromMultiIndex(src_thread_data_multi_id_begin); - const index_t clipboard_offset = thread_tensor_desc.GetOffsetFromMultiIndex( - clipboard_data_multi_id_begin); // cannot not constexpr, why? + const index_t clipboard_offset = + thread_tensor_desc.GetOffsetFromMultiIndex(clipboard_data_multi_id_begin); #else constexpr auto repeat_multi_id = decltype(repeat_multi_id_){}; @@ -258,20 +256,17 @@ struct BlockwiseGenericTensorSliceCopy_v1 make_ConstantTensorDescriptor_packed(thread_sub_tensor_lengths * repeat_lengths); static_ford{}([&](auto repeat_multi_id_) { -#if 0 +#if 1 constexpr auto repeat_multi_id = sequence2array(decltype(repeat_multi_id_){}); - const auto clipboard_data_multi_id_begin = - repeat_multi_id * thread_sub_tensor_lengths; // cannot not constexpr, why? + const auto clipboard_data_multi_id_begin = repeat_multi_id * thread_sub_tensor_lengths; - const auto dst_data_multi_id_begin = - repeat_multi_id * data_per_cluster_per_dims; // cannot not constexpr, why? + const auto dst_data_multi_id_begin = repeat_multi_id * data_per_cluster_per_dims; - const index_t clipboard_offset = thread_tensor_desc.GetOffsetFromMultiIndex( - clipboard_data_multi_id_begin); // cannot not constexpr, why? + const index_t clipboard_offset = + thread_tensor_desc.GetOffsetFromMultiIndex(clipboard_data_multi_id_begin); - const index_t dst_offset = DstDesc{}.GetOffsetFromMultiIndex( - dst_data_multi_id_begin); // cannot not constexpr, why? + const index_t dst_offset = DstDesc{}.GetOffsetFromMultiIndex(dst_data_multi_id_begin); #else constexpr auto repeat_multi_id = decltype(repeat_multi_id_){}; diff --git a/src/include/common.hip.hpp b/src/include/common.hip.hpp index 28ff7003bf..f59b74d0fd 100644 --- a/src/include/common.hip.hpp +++ b/src/include/common.hip.hpp @@ -1,4 +1,5 @@ #pragma once +#include "base.hip.hpp" #include "vector_type.hip.hpp" #include "integral_constant.hip.hpp" #include "Sequence.hip.hpp" @@ -10,109 +11,3 @@ #if USE_AMD_INLINE_ASM #include "amd_inline_asm.hip.hpp" #endif - -__device__ index_t get_thread_local_1d_id() { return threadIdx.x; } - -__device__ index_t get_block_1d_id() { return blockIdx.x; } - -template -struct is_same -{ - static constexpr bool value = false; -}; - -template -struct is_same -{ - static constexpr bool value = true; -}; - -template -__host__ __device__ constexpr bool is_same_type(X, Y) -{ - return is_same::value; -} - -namespace mod_conv { // namespace mod_conv -template -struct scales -{ - __host__ __device__ constexpr T operator()(T a) const { return s * a; } -}; - -template -struct plus -{ - __host__ __device__ constexpr T operator()(T a, T b) const { return a + b; } -}; - -template -struct multiplies -{ - __host__ __device__ constexpr T operator()(T a, T b) const { return a * b; } -}; - -template -struct integer_divide_ceiler -{ - __host__ __device__ constexpr T operator()(T a, T b) const - { - static_assert(is_same::value || is_same::value, "wrong type"); - - return (a + b - 1) / b; - } -}; - -template -__host__ __device__ constexpr T integer_divide_ceil(T a, T b) -{ - static_assert(is_same::value || is_same::value, "wrong type"); - - return (a + b - 1) / b; -} - -template -__host__ __device__ constexpr T max(T x, T y) -{ - return x > y ? x : y; -} - -template -__host__ __device__ constexpr T max(T x, Ts... xs) -{ - static_assert(sizeof...(xs) > 0, "not enough argument"); - - auto y = max(xs...); - - static_assert(is_same::value, "not the same type"); - - return x > y ? x : y; -} - -template -__host__ __device__ constexpr T min(T x, T y) -{ - return x < y ? x : y; -} - -template -__host__ __device__ constexpr T min(T x, Ts... xs) -{ - static_assert(sizeof...(xs) > 0, "not enough argument"); - - auto y = min(xs...); - - static_assert(is_same::value, "not the same type"); - - return x < y ? x : y; -} - -// this is wrong -// TODO: implement correct least common multiple, instead of calling max() -template -__host__ __device__ constexpr T lcm(T x, Ts... xs) -{ - return max(x, xs...); -} - -} // namespace mod_conv diff --git a/src/include/functional3.hip.hpp b/src/include/functional3.hip.hpp index 78b95200c5..4019725c4c 100644 --- a/src/include/functional3.hip.hpp +++ b/src/include/functional3.hip.hpp @@ -11,7 +11,7 @@ struct static_ford_impl // F signature: F(Sequence<...> multi_id) // CurrentMultiIndex: Sequence<...> template - __host__ __device__ void operator()(F f, CurrentMultiIndex) const + __host__ __device__ constexpr void operator()(F f, CurrentMultiIndex) const { static_assert(RemainLengths::GetSize() > 0, "wrong! should not get here"); @@ -28,7 +28,7 @@ struct static_ford_impl> // F signature: F(Sequence<...> multi_id) // CurrentMultiIndex: Sequence<...> template - __host__ __device__ void operator()(F f, CurrentMultiIndex) const + __host__ __device__ constexpr void operator()(F f, CurrentMultiIndex) const { f(CurrentMultiIndex{}); } @@ -40,7 +40,7 @@ struct static_ford { // F signature: F(Sequence<...> multi_id) template - __host__ __device__ void operator()(F f) const + __host__ __device__ constexpr void operator()(F f) const { static_assert(Lengths::GetSize() > 0, "wrong! Lengths is empty"); @@ -55,7 +55,7 @@ struct ford_impl // CurrentMultiIndex: Array<...> // RemainLengths: Sequence<...> template - __host__ __device__ void + __host__ __device__ constexpr void operator()(F f, CurrentMultiIndex current_multi_id, RemainLengths) const { static_assert(RemainLengths::GetSize() == RemainDim, "wrong!"); @@ -77,7 +77,7 @@ struct ford_impl<1> // CurrentMultiIndex: Array<...> // RemainLengths: Sequence<...> template - __host__ __device__ void + __host__ __device__ constexpr void operator()(F f, CurrentMultiIndex current_multi_id, RemainLengths) const { static_assert(RemainLengths::GetSize() == 1, "wrong!"); @@ -97,7 +97,7 @@ struct ford { // F signature: F(Array<...> multi_id) template - __host__ __device__ void operator()(F f) const + __host__ __device__ constexpr void operator()(F f) const { constexpr index_t first_length = Lengths{}.Front();