From 3bf52e60c5374c9a63256dff5e3442a4046c81dc Mon Sep 17 00:00:00 2001 From: Chao Liu Date: Mon, 12 Apr 2021 21:32:55 -0500 Subject: [PATCH] Initial implementation of magic number division and "Merge" transformation that use it (#28) * initial implementation for magic number division and DynamicMerge_v2_magic_division that uses it * turn off DynamicMerge_v2_magic_division that use magic number division by default --- .../dynamic_multi_index_transform.hpp | 185 +++++++++++++++++- .../dynamic_multi_index_transform_helper.hpp | 6 +- .../include/utility/common_header.hpp | 1 + .../include/utility/config.amd.hpp.in | 3 + .../include/utility/magic_division.hpp | 136 +++++++++++++ composable_kernel/include/utility/type.hpp | 14 ++ 6 files changed, 339 insertions(+), 6 deletions(-) create mode 100644 composable_kernel/include/utility/magic_division.hpp diff --git a/composable_kernel/include/tensor_description/dynamic_multi_index_transform.hpp b/composable_kernel/include/tensor_description/dynamic_multi_index_transform.hpp index 429473c8f6..0f1f0d5c29 100644 --- a/composable_kernel/include/tensor_description/dynamic_multi_index_transform.hpp +++ b/composable_kernel/include/tensor_description/dynamic_multi_index_transform.hpp @@ -467,8 +467,10 @@ struct DynamicEmbed } }; +// Implementation of "Merge" transformation primitive that uses regular to do lowering of +// multi-index and use carry-and-borrow check to do lowering of multi-index delta template -struct DynamicMerge +struct DynamicMerge_v1_carry_check { static constexpr index_t NDimLow = LowLengths::Size(); @@ -485,9 +487,9 @@ struct DynamicMerge LowLengthsScan low_lengths_scan_; UpLengths up_lengths_; - __host__ __device__ constexpr DynamicMerge() = default; + __host__ __device__ constexpr DynamicMerge_v1_carry_check() = default; - __host__ __device__ constexpr DynamicMerge(const LowLengths& low_lengths) + __host__ __device__ constexpr DynamicMerge_v1_carry_check(const LowLengths& low_lengths) : low_lengths_{low_lengths}, low_lengths_scan_{ container_reverse_exclusive_scan(low_lengths, math::multiplies_v2{}, Number<1>{})}, @@ -511,7 +513,8 @@ struct DynamicMerge index_t tmp = idx_up[Number<0>{}]; - static_for<0, NDimLow - 1, 1>{}([&idx_low, &tmp, this](auto i) { + // normal division + static_for<0, NDimLow - 1, 1>{}([&](auto i) { idx_low(i) = tmp / this->low_lengths_scan_[i]; tmp -= idx_low[i] * this->low_lengths_scan_[i]; }); @@ -978,7 +981,7 @@ struct DynamicMerge __host__ __device__ void Print() const { printf("{"); - printf("DynamicMerge, "); + printf("DynamicMerge_v1_carry_check, "); printf("low_lengths_ "); print_multi_index(low_lengths_); printf("low_lengths_scan_ "); @@ -989,6 +992,178 @@ struct DynamicMerge } }; +template +struct lambda_merge_generate_MagicDivision_calculate_magic_multiplier +{ + template + __host__ __device__ constexpr auto operator()(Number i) const + { + return MagicDivision::CalculateMagicMultiplier(LowLengths{}[i]); + } +}; + +template +struct lambda_merge_generate_MagicDivision_calculate_magic_shift +{ + template + __host__ __device__ constexpr auto operator()(Number i) const + { + return MagicDivision::CalculateMagicShift(LowLengths{}[i]); + } +}; + +// Implementation of "Merge" transformation primitive that uses magic-number-division to do lowering +// of both multi-index and delta of multi-index +// Caution: +// 1. The magic number division implementation being used would produce correct result if the +// dividended is uint32_t and its value is with in 31-bit value range of uint32_t. +// 2. The magic number division for int32_t dividened has not been implemented, the int32_t +// dividend would be bit-wise interpreted as uint32_t and magic number division implementation for +// uint32_t is then used. +// 3. For Merge primitive, upper-index is the dividend. +// 4. When upper-index is uint32_t, its value need to be within 31-bit range. +// 5. When upper-index is int32_t type (when index_t is int32_t), its value need to be +// non-negative. +template +struct DynamicMerge_v2_magic_division +{ + static constexpr index_t NDimLow = LowLengths::Size(); + + using LowerIndex = MultiIndex; + using UpperIndex = MultiIndex<1>; + + using UpLengths = + decltype(make_tuple(container_reduce(LowLengths{}, math::multiplies_v2{}, Number<1>{}))); + + using LowLengthsMagicDivisorMultipiler = decltype( + generate_tuple(lambda_merge_generate_MagicDivision_calculate_magic_multiplier{}, + Number{})); + + using LowLengthsMagicDivisorShift = decltype( + generate_tuple(lambda_merge_generate_MagicDivision_calculate_magic_shift{}, + Number{})); + + LowLengths low_lengths_; + LowLengthsMagicDivisorMultipiler low_lengths_magic_divisor_multiplier_; + LowLengthsMagicDivisorShift low_lengths_magic_divisor_shift_; + UpLengths up_lengths_; + + __host__ __device__ constexpr DynamicMerge_v2_magic_division() = default; + + __host__ __device__ constexpr DynamicMerge_v2_magic_division(const LowLengths& low_lengths) + : low_lengths_{low_lengths}, + low_lengths_magic_divisor_multiplier_{generate_tuple( + [&](auto i) { return MagicDivision::CalculateMagicMultiplier(low_lengths[i]); }, + Number{})}, + low_lengths_magic_divisor_shift_{generate_tuple( + [&](auto i) { return MagicDivision::CalculateMagicShift(low_lengths[i]); }, + Number{})}, + up_lengths_{make_tuple(container_reduce(low_lengths, math::multiplies_v2{}, Number<1>{}))} + { + static_assert(LowerIndex::Size() == NDimLow, "wrong!"); + } + + __host__ __device__ static constexpr index_t GetNumOfLowerDimension() { return NDimLow; } + + __host__ __device__ static constexpr index_t GetNumOfUpperDimension() { return 1; } + + __host__ __device__ constexpr const auto& GetUpperLengths() const { return up_lengths_; } + + template + __host__ __device__ constexpr void CalculateLowerIndex(LowIdx& idx_low, + const UpIdx& idx_up) const + { + static_assert(LowIdx::Size() == NDimLow && UpIdx::Size() == 1, + "wrong! inconsistent # of dimension"); + + index_t tmp = idx_up[Number<0>{}]; + + static_for{}([&, this](auto i) { + index_t tmp2 = + MagicDivision::DoMagicDivision(tmp, + this->low_lengths_magic_divisor_multiplier_[i], + this->low_lengths_magic_divisor_shift_[i]); + idx_low(i) = tmp - tmp2 * this->low_lengths_[i]; + tmp = tmp2; + }); + + idx_low(Number<0>{}) = tmp; + } + + template + __host__ __device__ void UpdateLowerIndex(LowIdxDiff& idx_diff_low, + const UpIdxDiff& idx_diff_up, + LowIdx& idx_low, + const UpIdx& idx_up_new, + Number) const + { + static_assert(LowIdxDiff::Size() == NDimLow && UpIdxDiff::Size() == 1 && + LowIdx::Size() == NDimLow && UpIdx::Size() == 1, + "wrong! inconsistent # of dimension"); + + index_t tmp = idx_up_new[Number<0>{}]; + + static_for{}([&, this](auto i) { + index_t tmp2 = + MagicDivision::DoMagicDivision(tmp, + this->low_lengths_magic_divisor_multiplier_[i], + this->low_lengths_magic_divisor_shift_[i]); + + index_t idx_low_old = idx_low[i]; + + idx_low(i) = tmp - tmp2 * this->low_lengths_[i]; + tmp = tmp2; + + idx_diff_low(i) = idx_low[i] - idx_low_old; + }); + + idx_diff_low(Number<0>{}) = tmp - idx_low(Number<0>{}); + + idx_low(Number<0>{}) = tmp; + } + + __host__ __device__ static constexpr bool IsLinearTransform() { return false; } + + __host__ __device__ static constexpr bool IsValidUpperIndexAlwaysMappedToValidLowerIndex() + { + return true; + } + + __host__ __device__ static constexpr bool IsKnownAtCompileTime() + { + return is_known_at_compile_time::value && + is_known_at_compile_time::value && + is_known_at_compile_time::value && + is_known_at_compile_time::value; + } + + template + __host__ __device__ static constexpr bool + IsValidUpperIndexMappedToValidLowerIndex(const UpIdx& /* idx_up */) + { + return true; + } + + __host__ __device__ void Print() const + { + printf("{"); + printf("DynamicMerge_v2_magic_division, "); + printf("low_lengths_ "); + print_multi_index(low_lengths_); + printf("low_lengths_magic_divisor_multiplier_ "); + print_multi_index(low_lengths_magic_divisor_multiplier_); + printf("low_lengths_magic_divisor_shift_ "); + print_multi_index(low_lengths_magic_divisor_shift_); + printf("up_lengths_ "); + print_multi_index(up_lengths_); + printf("}"); + } +}; + template struct DynamicUnMerge { diff --git a/composable_kernel/include/tensor_description/dynamic_multi_index_transform_helper.hpp b/composable_kernel/include/tensor_description/dynamic_multi_index_transform_helper.hpp index f460599ee5..591cedb76b 100644 --- a/composable_kernel/include/tensor_description/dynamic_multi_index_transform_helper.hpp +++ b/composable_kernel/include/tensor_description/dynamic_multi_index_transform_helper.hpp @@ -53,7 +53,11 @@ __host__ __device__ constexpr auto make_embed_transform(const UpLengths& up_leng template __host__ __device__ constexpr auto make_merge_transform(const LowLengths& low_lengths) { - return DynamicMerge{low_lengths}; +#if !CK_EXPERIMENTAL_MERGE_USE_MAGIC_DIVISION + return DynamicMerge_v1_carry_check{low_lengths}; +#else + return DynamicMerge_v2_magic_division{low_lengths}; +#endif } template diff --git a/composable_kernel/include/utility/common_header.hpp b/composable_kernel/include/utility/common_header.hpp index 63f94bd3c2..5a26f8958f 100644 --- a/composable_kernel/include/utility/common_header.hpp +++ b/composable_kernel/include/utility/common_header.hpp @@ -22,6 +22,7 @@ #include "tuple_helper.hpp" #include "type.hpp" #include "utility.hpp" +#include "magic_division.hpp" #if CK_USE_AMD_INLINE_ASM #include "amd_inline_asm.hpp" diff --git a/composable_kernel/include/utility/config.amd.hpp.in b/composable_kernel/include/utility/config.amd.hpp.in index 0f8388d09f..bca451a60a 100644 --- a/composable_kernel/include/utility/config.amd.hpp.in +++ b/composable_kernel/include/utility/config.amd.hpp.in @@ -115,6 +115,9 @@ #define CK_EXPERIMENTAL_PASS_TENSOR_DESCRIPTOR_BY_VALUE 1 #define CK_EXPERIMENTAL_PASS_TENSOR_DESCRIPTOR_BY_VOID_POINTER 0 +// merge transformation use magic number division +#define CK_EXPERIMENTAL_MERGE_USE_MAGIC_DIVISION 0 + // hack: have underlying assumption that need to be satsified, otherwise it's a bug // hack for forcing register to keep idx_diff_low_const in SGPR. idx_diff_low_const must be // thread-invariant, otherwise it's a bug diff --git a/composable_kernel/include/utility/magic_division.hpp b/composable_kernel/include/utility/magic_division.hpp new file mode 100644 index 0000000000..8e29e75348 --- /dev/null +++ b/composable_kernel/include/utility/magic_division.hpp @@ -0,0 +1,136 @@ +#ifndef CK_MAGIC_DIVISION_HPP +#define CK_MAGIC_DIVISION_HPP + +#include "config.hpp" +#include "integral_constant.hpp" +#include "number.hpp" +#include "type.hpp" +#include "tuple.hpp" + +namespace ck { + +// magic number division +// Caution: +// 1. For uint32_t as dividend: magic number division implementation being used would produce +// correct result if the dividend is uint32_t and its value is within 31-bit value range. +// 2. For int32_t as dividendd: magic number division for int32_t dividened has not been +// implemented, the int32_t dividend would be bit-wise interpreted as uint32_t and magic number +// division implementation for uint32_t is then used. Therefore, dividend value need to be +// non-negative. +// TODO: +// 1. Implement magic number divison for int32_t +// 2. Implement magic number divison for unit32_t with 32-bit value range +struct MagicDivision +{ + // uint32_t + __host__ __device__ static constexpr auto CalculateMagicNumbers(uint32_t divisor) + { + // assert(divisior >= 1 && divisior <= INT32_MAX); + uint32_t shift = 0; + for(shift = 0; shift < 32; ++shift) + { + if((1U << shift) >= divisor) + { + break; + } + } + + uint64_t one = 1; + uint64_t multiplier = ((one << 32) * ((one << shift) - divisor)) / divisor + 1; + // assert(multiplier <= 0xffffffffUL); + + return make_tuple(uint32_t(multiplier), shift); + } + + __host__ __device__ static constexpr uint32_t CalculateMagicMultiplier(uint32_t divisor) + { + auto tmp = CalculateMagicNumbers(divisor); + + return tmp[Number<0>{}]; + } + + __host__ __device__ static constexpr uint32_t CalculateMagicShift(uint32_t divisor) + { + auto tmp = CalculateMagicNumbers(divisor); + + return tmp[Number<1>{}]; + } + + // integral_constant + template + __host__ __device__ static constexpr auto + CalculateMagicNumbers(integral_constant) + { + constexpr auto tmp = CalculateMagicNumbers(uint32_t{Divisor}); + + constexpr uint32_t multiplier = tmp[Number<0>{}]; + constexpr uint32_t shift = tmp[Number<1>{}]; + + return make_tuple(integral_constant{}, + integral_constant{}); + } + + template + __host__ __device__ static constexpr auto + CalculateMagicMultiplier(integral_constant) + { + constexpr uint32_t multiplier = CalculateMagicMultiplier(uint32_t{Divisor}); + + return integral_constant{}; + } + + template + __host__ __device__ static constexpr auto + CalculateMagicShift(integral_constant) + { + constexpr uint32_t shift = CalculateMagicShift(uint32_t{Divisor}); + + return integral_constant{}; + } + + // integral_constant + template + __host__ __device__ static constexpr auto + CalculateMagicNumbers(integral_constant) + { + return CalculateMagicNumbers(integral_constant{}); + } + + template + __host__ __device__ static constexpr auto + CalculateMagicMultiplier(integral_constant) + { + return CalculateMagicMultiplier(integral_constant{}); + } + + template + __host__ __device__ static constexpr auto + CalculateMagicShift(integral_constant) + { + return CalculateMagicShift(integral_constant{}); + } + + // magic division for uint32_t + __host__ __device__ static constexpr uint32_t + DoMagicDivision(uint32_t dividend, uint32_t multiplier, uint32_t shift) + { + uint32_t tmp = (uint64_t(dividend) * uint64_t(multiplier)) >> 32; + return (tmp + dividend) >> shift; + } + + // HACK: magic division for int32_t + // HACK: use dividend_i32 as if it's uint32_t, dividend_i32 need to be + // non-negative for result to be correct + // TODO: figure out how to do magic number divison for int32_t as dividended + __host__ __device__ static constexpr int32_t + DoMagicDivision(int32_t dividend_i32, uint32_t multiplier, uint32_t shift) + { + uint32_t dividend_u32 = as_type(dividend_i32); + uint32_t tmp = ((uint64_t)dividend_u32 * (uint64_t)multiplier) >> 32; + return (tmp + dividend_i32) >> shift; + } +}; + +} // namespace ck + +#endif diff --git a/composable_kernel/include/utility/type.hpp b/composable_kernel/include/utility/type.hpp index b137168a1f..32f7dfb569 100644 --- a/composable_kernel/include/utility/type.hpp +++ b/composable_kernel/include/utility/type.hpp @@ -42,5 +42,19 @@ struct is_known_at_compile_time> static constexpr bool value = true; }; +template ::type = false> +__host__ __device__ constexpr Y as_type(X x) +{ + union AsType + { + X x; + Y y; + }; + + return AsType{x}.y; +} + } // namespace ck #endif