mirror of
https://github.com/ROCm/composable_kernel.git
synced 2026-05-12 01:10:17 +00:00
* updating codegen build for MIOpen access: adding .cmake for codegen component * updating CMake * adding in header guards for some headers due to issues with hiprtc compilation in MIOpen * some more header guards * putting env file in header guard * cleaning up some includes * updated types file for hiprtc purposes * fixed types file: bit-wise/memcpy issue * updating multiple utility files to deal with standard header inclusion for hiprtc * added some more header guards in the utility files, replacing some standard header functionality * added some more header guards * fixing some conflicts in utility files, another round of header guards * fixing errors in data type file * resolved conflict errors in a few utility files * added header guards/replicated functionality in device files * resolved issues with standard headers in device files: device_base and device_grouped_conv_fwd_multiple_abd * resolved issues with standard headers in device files: device_base.hpp, device_grouped_conv_fwd_multiple_abd.hpp, device_grouped_conv_fwd_multiple_abd_xdl_cshuffle.hpp * added header guards for gridwise gemm files: gridwise_gemm_multiple_abd_xdl_cshuffle.hpp and gridwise_gemm_multiple_d_xdl_cshuffle.hpp * fixed issue with numerics header, removed from transform_conv_fwd_to_gemm and added to device_column_to_image_impl, device_grouped_conv_fwd_multiple_abd_xdl_cshuffle, device_grouped_conv_fwd_multiple_abd_xdl_cshuffle_v3, device_image_to_column_impl * replaced standard header usage and added header guards in block to ctile map and gridwise_gemm_pipeline_selector * resolved errors in device_gemm_xdl_splitk_c_shuffle files in regards to replacement of standard headers in previous commit * added replicated functionality for standard header methods in utility files * replaced standard header functionality in threadwise tensor slice transfer files and added header guards in element_wise_operation.hpp * temp fix for namespace error in MIOpen * remove standard header usage in codegen device op * removed standard header usage in elementwise files, resolved namespace errors * formatting fix * changed codegen argument to ON for testing * temporarily removing codegen compiler flag for testing purposes * added codegen flag again, set default to ON * set codegen flag default back to OFF * replaced enable_if_t standard header usage in data_type.hpp * added some debug prints to pinpoint issues in MIOpen * added print outs to debug in MIOpen * removed debug print outs from device op * resolved stdexcept include error * formatting fix * adding includes to new fp8 file to resolve ck::enable_if_t errors * made changes to amd_wave_read_first_lane * updated functionality in type utility file * fixed end of file issue * resovled errors in type utility file, added functionality to array utility file * fixed standard header usage replication in data_type file, resolves error with failing examples on navi3x * formatting fix * replaced standard header usage in amd_ck_fp8 file * added include to random_gen file * removed and replicated standard header usage from data_type and type_convert files for fp8 changes * replicated standard unsigned integer types in random_gen * resolved comments from review: put calls to reinterpret_cast for size_t in header guards * updated/added copyright headers * removed duplicate header * fixed typo in header guard * updated copyright headers --------- Co-authored-by: Illia Silin <98187287+illsilin@users.noreply.github.com>
237 lines
7.5 KiB
C++
237 lines
7.5 KiB
C++
// SPDX-License-Identifier: MIT
|
|
// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
|
|
|
|
#pragma once
|
|
|
|
#include "ck/ck.hpp"
|
|
#include "integral_constant.hpp"
|
|
#include "number.hpp"
|
|
#include "type.hpp"
|
|
#include "tuple.hpp"
|
|
|
|
#ifdef CK_CODE_GEN_RTC
|
|
#define INT32_MAX 2147483647
|
|
#endif
|
|
|
|
namespace ck {
|
|
|
|
// magic number division
|
|
// Caution:
|
|
// 1. For uint32_t as dividend: magic number division implementation being used would produce
|
|
// correct result if the dividend is uint32_t and its value is within 31-bit value range.
|
|
// 2. For int32_t as dividendd: magic number division for int32_t dividened has not been
|
|
// implemented, the int32_t dividend would be bit-wise interpreted as uint32_t and magic number
|
|
// division implementation for uint32_t is then used. Therefore, dividend value need to be
|
|
// non-negative.
|
|
// TODO:
|
|
// 1. Implement magic number divison for int32_t
|
|
// 2. Implement magic number divison for unit32_t with 32-bit value range
|
|
struct MagicDivision
|
|
{
|
|
// uint32_t
|
|
__host__ __device__ static constexpr auto CalculateMagicNumbers(uint32_t divisor)
|
|
{
|
|
// WARNING: magic division is only applicable for division inside this range.
|
|
// You should use the return value of CalculateMagicNumbers, if division is not inside this
|
|
// range. The "else" logic below is to quiet down run-time error.
|
|
if(divisor >= 1 && divisor <= INT32_MAX)
|
|
{
|
|
uint32_t shift = 0;
|
|
for(shift = 0; shift < 32; ++shift)
|
|
{
|
|
if((1U << shift) >= divisor)
|
|
{
|
|
break;
|
|
}
|
|
}
|
|
|
|
uint64_t one = 1;
|
|
uint64_t multiplier = ((one << 32) * ((one << shift) - divisor)) / divisor + 1;
|
|
// assert(multiplier <= 0xffffffffUL);
|
|
|
|
return make_tuple(uint32_t(multiplier), shift);
|
|
}
|
|
else
|
|
{
|
|
return make_tuple(uint32_t(0), uint32_t(0));
|
|
}
|
|
}
|
|
|
|
__host__ __device__ static constexpr uint32_t CalculateMagicMultiplier(uint32_t divisor)
|
|
{
|
|
auto tmp = CalculateMagicNumbers(divisor);
|
|
|
|
return tmp[Number<0>{}];
|
|
}
|
|
|
|
__host__ __device__ static constexpr uint32_t CalculateMagicShift(uint32_t divisor)
|
|
{
|
|
auto tmp = CalculateMagicNumbers(divisor);
|
|
|
|
return tmp[Number<1>{}];
|
|
}
|
|
|
|
// integral_constant<uint32_t, .>
|
|
template <uint32_t Divisor>
|
|
__host__ __device__ static constexpr auto
|
|
CalculateMagicNumbers(integral_constant<uint32_t, Divisor>)
|
|
{
|
|
constexpr auto tmp = CalculateMagicNumbers(uint32_t{Divisor});
|
|
|
|
constexpr uint32_t multiplier = tmp[Number<0>{}];
|
|
constexpr uint32_t shift = tmp[Number<1>{}];
|
|
|
|
return make_tuple(integral_constant<uint32_t, multiplier>{},
|
|
integral_constant<uint32_t, shift>{});
|
|
}
|
|
|
|
template <uint32_t Divisor>
|
|
__host__ __device__ static constexpr auto
|
|
CalculateMagicMultiplier(integral_constant<uint32_t, Divisor>)
|
|
{
|
|
constexpr uint32_t multiplier = CalculateMagicMultiplier(uint32_t{Divisor});
|
|
|
|
return integral_constant<uint32_t, multiplier>{};
|
|
}
|
|
|
|
template <uint32_t Divisor>
|
|
__host__ __device__ static constexpr auto
|
|
CalculateMagicShift(integral_constant<uint32_t, Divisor>)
|
|
{
|
|
constexpr uint32_t shift = CalculateMagicShift(uint32_t{Divisor});
|
|
|
|
return integral_constant<uint32_t, shift>{};
|
|
}
|
|
|
|
// integral_constant<int32_t, .>
|
|
template <int32_t Divisor>
|
|
__host__ __device__ static constexpr auto
|
|
CalculateMagicNumbers(integral_constant<int32_t, Divisor>)
|
|
{
|
|
return CalculateMagicNumbers(integral_constant<uint32_t, Divisor>{});
|
|
}
|
|
|
|
template <int32_t Divisor>
|
|
__host__ __device__ static constexpr auto
|
|
CalculateMagicMultiplier(integral_constant<int32_t, Divisor>)
|
|
{
|
|
return CalculateMagicMultiplier(integral_constant<uint32_t, Divisor>{});
|
|
}
|
|
|
|
template <int32_t Divisor>
|
|
__host__ __device__ static constexpr auto
|
|
CalculateMagicShift(integral_constant<int32_t, Divisor>)
|
|
{
|
|
return CalculateMagicShift(integral_constant<uint32_t, Divisor>{});
|
|
}
|
|
|
|
// magic division for uint32_t
|
|
__device__ static constexpr uint32_t
|
|
DoMagicDivision(uint32_t dividend, uint32_t multiplier, uint32_t shift)
|
|
{
|
|
uint32_t tmp = __umulhi(dividend, multiplier);
|
|
return (tmp + dividend) >> shift;
|
|
}
|
|
|
|
__host__ static constexpr uint32_t
|
|
DoMagicDivision(uint32_t dividend, uint32_t multiplier, uint32_t shift)
|
|
{
|
|
uint32_t tmp = static_cast<uint64_t>(dividend) * multiplier >> 32;
|
|
return (tmp + dividend) >> shift;
|
|
}
|
|
|
|
// magic division for int32_t
|
|
// HACK: use dividend_i32 as if it's uint32_t, dividend_i32 need to be
|
|
// non-negative for result to be correct
|
|
// TODO: figure out how to do magic number divison for int32_t as dividended
|
|
__device__ static constexpr int32_t
|
|
DoMagicDivision(int32_t dividend_i32, uint32_t multiplier, uint32_t shift)
|
|
{
|
|
uint32_t dividend_u32 = bit_cast<uint32_t>(dividend_i32);
|
|
uint32_t tmp = __umulhi(dividend_u32, multiplier);
|
|
return (tmp + dividend_u32) >> shift;
|
|
}
|
|
|
|
__host__ static constexpr int32_t
|
|
DoMagicDivision(int32_t dividend_i32, uint32_t multiplier, uint32_t shift)
|
|
{
|
|
uint32_t dividend_u32 = bit_cast<uint32_t>(dividend_i32);
|
|
uint32_t tmp = static_cast<uint64_t>(dividend_u32) * multiplier >> 32;
|
|
return (tmp + dividend_u32) >> shift;
|
|
}
|
|
};
|
|
|
|
struct MDiv
|
|
{
|
|
// 1 dword -> 3 dword storage
|
|
uint32_t divisor;
|
|
uint32_t multiplier;
|
|
uint32_t shift; // TODO: 8 bit is enough
|
|
|
|
// prefer construct on host
|
|
__host__ __device__ MDiv(uint32_t divisor_) : divisor(divisor_)
|
|
{
|
|
auto tmp = MagicDivision::CalculateMagicNumbers(divisor_);
|
|
|
|
multiplier = tmp[Number<0>{}];
|
|
shift = tmp[Number<1>{}];
|
|
}
|
|
|
|
__host__ __device__ MDiv() : divisor(0), multiplier(0), shift(0) {}
|
|
|
|
__host__ __device__ void update(uint32_t divisor_)
|
|
{
|
|
divisor = divisor_;
|
|
auto tmp = MagicDivision::CalculateMagicNumbers(divisor_);
|
|
|
|
multiplier = tmp[Number<0>{}];
|
|
shift = tmp[Number<1>{}];
|
|
}
|
|
|
|
__host__ __device__ uint32_t div(uint32_t dividend_) const
|
|
{
|
|
return MagicDivision::DoMagicDivision(dividend_, multiplier, shift);
|
|
}
|
|
|
|
__host__ __device__ void
|
|
divmod(uint32_t dividend_, uint32_t& quotient_, uint32_t& remainder_) const
|
|
{
|
|
quotient_ = div(dividend_);
|
|
remainder_ = dividend_ - (quotient_ * divisor);
|
|
}
|
|
|
|
__host__ __device__ uint32_t get() const { return divisor; }
|
|
};
|
|
|
|
struct MDiv2
|
|
{
|
|
// 1 dword -> 2 dword storage, divisor need compute from runtime
|
|
uint32_t multiplier;
|
|
uint32_t shift; // TODO: 8 bit is enough
|
|
|
|
// prefer construct on host
|
|
__host__ __device__ MDiv2(uint32_t divisor_)
|
|
{
|
|
auto tmp = MagicDivision::CalculateMagicNumbers(divisor_);
|
|
|
|
multiplier = tmp[Number<0>{}];
|
|
shift = tmp[Number<1>{}];
|
|
}
|
|
|
|
__host__ __device__ MDiv2() : multiplier(0), shift(0) {}
|
|
|
|
__host__ __device__ uint32_t div(uint32_t dividend_) const
|
|
{
|
|
return MagicDivision::DoMagicDivision(dividend_, multiplier, shift);
|
|
}
|
|
|
|
__host__ __device__ void
|
|
divmod(uint32_t dividend_, uint32_t divisor_, uint32_t& quotient_, uint32_t& remainder_) const
|
|
{
|
|
quotient_ = div(dividend_);
|
|
remainder_ = dividend_ - (quotient_ * divisor_);
|
|
}
|
|
};
|
|
|
|
} // namespace ck
|