mirror of
https://github.com/ROCm/composable_kernel.git
synced 2026-04-20 06:49:15 +00:00
Compile for gfx908 and gfx90a (#130)
* adding compilation for multiple targets * fix build * clean * update Jekinsfile * update readme * update Jenkins * use ck::half_t instead of ushort for bf16 * rename enum classes * clean * rename * clean
This commit is contained in:
@@ -1,6 +1,4 @@
|
||||
#ifndef CK_BUFFER_HPP
|
||||
#define CK_BUFFER_HPP
|
||||
|
||||
#pragma once
|
||||
#include "amd_buffer_addressing.hpp"
|
||||
#include "c_style_pointer_cast.hpp"
|
||||
#include "config.hpp"
|
||||
@@ -8,7 +6,7 @@
|
||||
|
||||
namespace ck {
|
||||
|
||||
template <AddressSpaceEnum_t BufferAddressSpace,
|
||||
template <AddressSpaceEnum BufferAddressSpace,
|
||||
typename T,
|
||||
typename ElementSpaceSize,
|
||||
bool InvalidElementUseNumericalZeroValue>
|
||||
@@ -34,7 +32,7 @@ struct DynamicBuffer
|
||||
{
|
||||
}
|
||||
|
||||
__host__ __device__ static constexpr AddressSpaceEnum_t GetAddressSpace()
|
||||
__host__ __device__ static constexpr AddressSpaceEnum GetAddressSpace()
|
||||
{
|
||||
return BufferAddressSpace;
|
||||
}
|
||||
@@ -55,7 +53,7 @@ struct DynamicBuffer
|
||||
constexpr index_t scalar_per_x_vector = scalar_type<remove_cvref_t<X>>::vector_size;
|
||||
|
||||
static_assert(scalar_per_x_vector % scalar_per_t_vector == 0,
|
||||
"wrong! X need to be multiple T");
|
||||
"wrong! X should contain multiple T");
|
||||
|
||||
#if CK_USE_AMD_BUFFER_LOAD
|
||||
bool constexpr use_amd_buffer_addressing = true;
|
||||
@@ -63,7 +61,7 @@ struct DynamicBuffer
|
||||
bool constexpr use_amd_buffer_addressing = false;
|
||||
#endif
|
||||
|
||||
if constexpr(GetAddressSpace() == AddressSpaceEnum_t::Global && use_amd_buffer_addressing)
|
||||
if constexpr(GetAddressSpace() == AddressSpaceEnum::Global && use_amd_buffer_addressing)
|
||||
{
|
||||
constexpr index_t t_per_x = scalar_per_x_vector / scalar_per_t_vector;
|
||||
|
||||
@@ -81,50 +79,48 @@ struct DynamicBuffer
|
||||
}
|
||||
else
|
||||
{
|
||||
if constexpr(InvalidElementUseNumericalZeroValue)
|
||||
if(is_valid_element)
|
||||
{
|
||||
#if CK_EXPERIMENTAL_USE_MEMCPY_FOR_VECTOR_ACCESS
|
||||
X tmp;
|
||||
|
||||
__builtin_memcpy(&tmp, &(p_data_[i]), sizeof(X));
|
||||
|
||||
return is_valid_element ? tmp : X{0};
|
||||
return tmp;
|
||||
#else
|
||||
return is_valid_element ? *c_style_pointer_cast<const X*>(&p_data_[i]) : X{0};
|
||||
return *c_style_pointer_cast<const X*>(&p_data_[i]);
|
||||
#endif
|
||||
}
|
||||
else
|
||||
{
|
||||
#if CK_EXPERIMENTAL_USE_MEMCPY_FOR_VECTOR_ACCESS
|
||||
X tmp;
|
||||
|
||||
__builtin_memcpy(&tmp, &(p_data_[i]), sizeof(X));
|
||||
|
||||
return is_valid_element ? tmp : X{invalid_element_value_};
|
||||
#else
|
||||
return is_valid_element ? *c_style_pointer_cast<const X*>(&p_data_[i])
|
||||
: X{invalid_element_value_};
|
||||
#endif
|
||||
if constexpr(InvalidElementUseNumericalZeroValue)
|
||||
{
|
||||
return X{0};
|
||||
}
|
||||
else
|
||||
{
|
||||
return X{invalid_element_value_};
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
template <InMemoryDataOperationEnum_t Op,
|
||||
template <InMemoryDataOperationEnum Op,
|
||||
typename X,
|
||||
typename enable_if<is_same<typename scalar_type<remove_cvref_t<X>>::type,
|
||||
typename scalar_type<remove_cvref_t<T>>::type>::value,
|
||||
bool>::type = false>
|
||||
__host__ __device__ void Update(index_t i, bool is_valid_element, const X& x)
|
||||
{
|
||||
if constexpr(Op == InMemoryDataOperationEnum_t::Set)
|
||||
if constexpr(Op == InMemoryDataOperationEnum::Set)
|
||||
{
|
||||
this->template Set<X>(i, is_valid_element, x);
|
||||
}
|
||||
else if constexpr(Op == InMemoryDataOperationEnum_t::AtomicAdd)
|
||||
else if constexpr(Op == InMemoryDataOperationEnum::AtomicAdd)
|
||||
{
|
||||
this->template AtomicAdd<X>(i, is_valid_element, x);
|
||||
}
|
||||
else if constexpr(Op == InMemoryDataOperationEnum_t::Add)
|
||||
else if constexpr(Op == InMemoryDataOperationEnum::Add)
|
||||
{
|
||||
auto tmp = this->template Get<X>(i, is_valid_element);
|
||||
this->template Set<X>(i, is_valid_element, x + tmp);
|
||||
@@ -145,143 +141,120 @@ struct DynamicBuffer
|
||||
constexpr index_t scalar_per_x_vector = scalar_type<remove_cvref_t<X>>::vector_size;
|
||||
|
||||
static_assert(scalar_per_x_vector % scalar_per_t_vector == 0,
|
||||
"wrong! X need to be multiple T");
|
||||
"wrong! X should contain multiple T");
|
||||
|
||||
if constexpr(GetAddressSpace() == AddressSpaceEnum_t::Global)
|
||||
{
|
||||
#if CK_USE_AMD_BUFFER_STORE
|
||||
bool constexpr use_amd_buffer_addressing = true;
|
||||
#else
|
||||
bool constexpr use_amd_buffer_addressing = false;
|
||||
#endif
|
||||
|
||||
#if CK_WORKAROUND_SWDEV_XXXXXX_INT8_DS_WRITE_ISSUE
|
||||
bool constexpr workaround_int8_ds_write_issue = true;
|
||||
#else
|
||||
bool constexpr workaround_int8_ds_write_issue = false;
|
||||
#endif
|
||||
|
||||
if constexpr(GetAddressSpace() == AddressSpaceEnum::Global && use_amd_buffer_addressing)
|
||||
{
|
||||
constexpr index_t t_per_x = scalar_per_x_vector / scalar_per_t_vector;
|
||||
|
||||
amd_buffer_store<remove_cvref_t<T>, t_per_x>(
|
||||
x, p_data_, i, is_valid_element, element_space_size_);
|
||||
#else
|
||||
if(is_valid_element)
|
||||
{
|
||||
#if CK_EXPERIMENTAL_USE_MEMCPY_FOR_VECTOR_ACCESS
|
||||
X tmp = x;
|
||||
|
||||
__builtin_memcpy(&(p_data_[i]), &tmp, sizeof(X));
|
||||
#else
|
||||
*c_style_pointer_cast<X*>(&p_data_[i]) = x;
|
||||
#endif
|
||||
}
|
||||
#endif
|
||||
}
|
||||
else if constexpr(GetAddressSpace() == AddressSpaceEnum_t::Lds)
|
||||
else if constexpr(GetAddressSpace() == AddressSpaceEnum::Lds &&
|
||||
is_same<typename scalar_type<remove_cvref_t<T>>::type, int8_t>::value &&
|
||||
workaround_int8_ds_write_issue)
|
||||
{
|
||||
if(is_valid_element)
|
||||
{
|
||||
#if !CK_WORKAROUND_SWDEV_XXXXXX_INT8_DS_WRITE_ISSUE
|
||||
#if CK_EXPERIMENTAL_USE_MEMCPY_FOR_VECTOR_ACCESS
|
||||
X tmp = x;
|
||||
|
||||
__builtin_memcpy(&(p_data_[i]), &tmp, sizeof(X));
|
||||
#else
|
||||
*c_style_pointer_cast<X*>(&p_data_[i]) = x;
|
||||
#endif
|
||||
#else
|
||||
// HACK: compiler would lower IR "store<i8, 16> address_space(3)" into
|
||||
// inefficient
|
||||
// HACK: compiler would lower IR "store<i8, 16> address_space(3)" into inefficient
|
||||
// ISA, so I try to let compiler emit IR "store<i32, 4>" which would be lower to
|
||||
// ds_write_b128
|
||||
// TODO: remove this after compiler fix
|
||||
if constexpr(is_same<typename scalar_type<remove_cvref_t<T>>::type, int8_t>::value)
|
||||
{
|
||||
static_assert((is_same<remove_cvref_t<T>, int8_t>::value &&
|
||||
is_same<remove_cvref_t<X>, int8_t>::value) ||
|
||||
(is_same<remove_cvref_t<T>, int8_t>::value &&
|
||||
is_same<remove_cvref_t<X>, int8x2_t>::value) ||
|
||||
(is_same<remove_cvref_t<T>, int8_t>::value &&
|
||||
is_same<remove_cvref_t<X>, int8x4_t>::value) ||
|
||||
(is_same<remove_cvref_t<T>, int8_t>::value &&
|
||||
is_same<remove_cvref_t<X>, int8x8_t>::value) ||
|
||||
(is_same<remove_cvref_t<T>, int8_t>::value &&
|
||||
is_same<remove_cvref_t<X>, int8x16_t>::value) ||
|
||||
(is_same<remove_cvref_t<T>, int8x4_t>::value &&
|
||||
is_same<remove_cvref_t<X>, int8x4_t>::value) ||
|
||||
(is_same<remove_cvref_t<T>, int8x8_t>::value &&
|
||||
is_same<remove_cvref_t<X>, int8x8_t>::value) ||
|
||||
(is_same<remove_cvref_t<T>, int8x16_t>::value &&
|
||||
is_same<remove_cvref_t<X>, int8x16_t>::value),
|
||||
"wrong! not implemented for this combination, please add "
|
||||
"implementation");
|
||||
static_assert((is_same<remove_cvref_t<T>, int8_t>::value &&
|
||||
is_same<remove_cvref_t<X>, int8_t>::value) ||
|
||||
(is_same<remove_cvref_t<T>, int8_t>::value &&
|
||||
is_same<remove_cvref_t<X>, int8x2_t>::value) ||
|
||||
(is_same<remove_cvref_t<T>, int8_t>::value &&
|
||||
is_same<remove_cvref_t<X>, int8x4_t>::value) ||
|
||||
(is_same<remove_cvref_t<T>, int8_t>::value &&
|
||||
is_same<remove_cvref_t<X>, int8x8_t>::value) ||
|
||||
(is_same<remove_cvref_t<T>, int8_t>::value &&
|
||||
is_same<remove_cvref_t<X>, int8x16_t>::value) ||
|
||||
(is_same<remove_cvref_t<T>, int8x4_t>::value &&
|
||||
is_same<remove_cvref_t<X>, int8x4_t>::value) ||
|
||||
(is_same<remove_cvref_t<T>, int8x8_t>::value &&
|
||||
is_same<remove_cvref_t<X>, int8x8_t>::value) ||
|
||||
(is_same<remove_cvref_t<T>, int8x16_t>::value &&
|
||||
is_same<remove_cvref_t<X>, int8x16_t>::value),
|
||||
"wrong! not implemented for this combination, please add "
|
||||
"implementation");
|
||||
|
||||
if constexpr(is_same<remove_cvref_t<T>, int8_t>::value &&
|
||||
is_same<remove_cvref_t<X>, int8_t>::value)
|
||||
{
|
||||
// HACK: cast pointer of x is bad
|
||||
// TODO: remove this after compiler fix
|
||||
*c_style_pointer_cast<int8_t*>(&p_data_[i]) =
|
||||
*c_style_pointer_cast<const int8_t*>(&x);
|
||||
}
|
||||
else if constexpr(is_same<remove_cvref_t<T>, int8_t>::value &&
|
||||
is_same<remove_cvref_t<X>, int8x2_t>::value)
|
||||
{
|
||||
// HACK: cast pointer of x is bad
|
||||
// TODO: remove this after compiler fix
|
||||
*c_style_pointer_cast<int16_t*>(&p_data_[i]) =
|
||||
*c_style_pointer_cast<const int16_t*>(&x);
|
||||
}
|
||||
else if constexpr(is_same<remove_cvref_t<T>, int8_t>::value &&
|
||||
is_same<remove_cvref_t<X>, int8x4_t>::value)
|
||||
{
|
||||
// HACK: cast pointer of x is bad
|
||||
// TODO: remove this after compiler fix
|
||||
*c_style_pointer_cast<int32_t*>(&p_data_[i]) =
|
||||
*c_style_pointer_cast<const int32_t*>(&x);
|
||||
}
|
||||
else if constexpr(is_same<remove_cvref_t<T>, int8_t>::value &&
|
||||
is_same<remove_cvref_t<X>, int8x8_t>::value)
|
||||
{
|
||||
// HACK: cast pointer of x is bad
|
||||
// TODO: remove this after compiler fix
|
||||
*c_style_pointer_cast<int32x2_t*>(&p_data_[i]) =
|
||||
*c_style_pointer_cast<const int32x2_t*>(&x);
|
||||
}
|
||||
else if constexpr(is_same<remove_cvref_t<T>, int8_t>::value &&
|
||||
is_same<remove_cvref_t<X>, int8x16_t>::value)
|
||||
{
|
||||
// HACK: cast pointer of x is bad
|
||||
// TODO: remove this after compiler fix
|
||||
*c_style_pointer_cast<int32x4_t*>(&p_data_[i]) =
|
||||
*c_style_pointer_cast<const int32x4_t*>(&x);
|
||||
}
|
||||
else if constexpr(is_same<remove_cvref_t<T>, int8x4_t>::value &&
|
||||
is_same<remove_cvref_t<X>, int8x4_t>::value)
|
||||
{
|
||||
// HACK: cast pointer of x is bad
|
||||
// TODO: remove this after compiler fix
|
||||
*c_style_pointer_cast<int32_t*>(&p_data_[i]) =
|
||||
*c_style_pointer_cast<const int32_t*>(&x);
|
||||
}
|
||||
else if constexpr(is_same<remove_cvref_t<T>, int8x8_t>::value &&
|
||||
is_same<remove_cvref_t<X>, int8x8_t>::value)
|
||||
{
|
||||
// HACK: cast pointer of x is bad
|
||||
// TODO: remove this after compiler fix
|
||||
*c_style_pointer_cast<int32x2_t*>(&p_data_[i]) =
|
||||
*c_style_pointer_cast<const int32x2_t*>(&x);
|
||||
}
|
||||
else if constexpr(is_same<remove_cvref_t<T>, int8x16_t>::value &&
|
||||
is_same<remove_cvref_t<X>, int8x16_t>::value)
|
||||
{
|
||||
// HACK: cast pointer of x is bad
|
||||
// TODO: remove this after compiler fix
|
||||
*c_style_pointer_cast<int32x4_t*>(&p_data_[i]) =
|
||||
*c_style_pointer_cast<const int32x4_t*>(&x);
|
||||
}
|
||||
}
|
||||
else
|
||||
if constexpr(is_same<remove_cvref_t<T>, int8_t>::value &&
|
||||
is_same<remove_cvref_t<X>, int8_t>::value)
|
||||
{
|
||||
#if CK_EXPERIMENTAL_USE_MEMCPY_FOR_VECTOR_ACCESS
|
||||
X tmp = x;
|
||||
|
||||
__builtin_memcpy(&(p_data_[i]), &tmp, sizeof(X));
|
||||
#else
|
||||
*c_style_pointer_cast<X*>(&p_data_[i]) = x;
|
||||
#endif
|
||||
// HACK: cast pointer of x is bad
|
||||
// TODO: remove this after compiler fix
|
||||
*c_style_pointer_cast<int8_t*>(&p_data_[i]) =
|
||||
*c_style_pointer_cast<const int8_t*>(&x);
|
||||
}
|
||||
else if constexpr(is_same<remove_cvref_t<T>, int8_t>::value &&
|
||||
is_same<remove_cvref_t<X>, int8x2_t>::value)
|
||||
{
|
||||
// HACK: cast pointer of x is bad
|
||||
// TODO: remove this after compiler fix
|
||||
*c_style_pointer_cast<int16_t*>(&p_data_[i]) =
|
||||
*c_style_pointer_cast<const int16_t*>(&x);
|
||||
}
|
||||
else if constexpr(is_same<remove_cvref_t<T>, int8_t>::value &&
|
||||
is_same<remove_cvref_t<X>, int8x4_t>::value)
|
||||
{
|
||||
// HACK: cast pointer of x is bad
|
||||
// TODO: remove this after compiler fix
|
||||
*c_style_pointer_cast<int32_t*>(&p_data_[i]) =
|
||||
*c_style_pointer_cast<const int32_t*>(&x);
|
||||
}
|
||||
else if constexpr(is_same<remove_cvref_t<T>, int8_t>::value &&
|
||||
is_same<remove_cvref_t<X>, int8x8_t>::value)
|
||||
{
|
||||
// HACK: cast pointer of x is bad
|
||||
// TODO: remove this after compiler fix
|
||||
*c_style_pointer_cast<int32x2_t*>(&p_data_[i]) =
|
||||
*c_style_pointer_cast<const int32x2_t*>(&x);
|
||||
}
|
||||
else if constexpr(is_same<remove_cvref_t<T>, int8_t>::value &&
|
||||
is_same<remove_cvref_t<X>, int8x16_t>::value)
|
||||
{
|
||||
// HACK: cast pointer of x is bad
|
||||
// TODO: remove this after compiler fix
|
||||
*c_style_pointer_cast<int32x4_t*>(&p_data_[i]) =
|
||||
*c_style_pointer_cast<const int32x4_t*>(&x);
|
||||
}
|
||||
else if constexpr(is_same<remove_cvref_t<T>, int8x4_t>::value &&
|
||||
is_same<remove_cvref_t<X>, int8x4_t>::value)
|
||||
{
|
||||
// HACK: cast pointer of x is bad
|
||||
// TODO: remove this after compiler fix
|
||||
*c_style_pointer_cast<int32_t*>(&p_data_[i]) =
|
||||
*c_style_pointer_cast<const int32_t*>(&x);
|
||||
}
|
||||
else if constexpr(is_same<remove_cvref_t<T>, int8x8_t>::value &&
|
||||
is_same<remove_cvref_t<X>, int8x8_t>::value)
|
||||
{
|
||||
// HACK: cast pointer of x is bad
|
||||
// TODO: remove this after compiler fix
|
||||
*c_style_pointer_cast<int32x2_t*>(&p_data_[i]) =
|
||||
*c_style_pointer_cast<const int32x2_t*>(&x);
|
||||
}
|
||||
else if constexpr(is_same<remove_cvref_t<T>, int8x16_t>::value &&
|
||||
is_same<remove_cvref_t<X>, int8x16_t>::value)
|
||||
{
|
||||
// HACK: cast pointer of x is bad
|
||||
// TODO: remove this after compiler fix
|
||||
*c_style_pointer_cast<int32x4_t*>(&p_data_[i]) =
|
||||
*c_style_pointer_cast<const int32x4_t*>(&x);
|
||||
}
|
||||
#endif
|
||||
}
|
||||
}
|
||||
else
|
||||
@@ -305,27 +278,49 @@ struct DynamicBuffer
|
||||
bool>::type = false>
|
||||
__host__ __device__ void AtomicAdd(index_t i, bool is_valid_element, const X& x)
|
||||
{
|
||||
using scalar_t = typename scalar_type<remove_cvref_t<T>>::type;
|
||||
|
||||
// X contains multiple T
|
||||
constexpr index_t scalar_per_t_vector = scalar_type<remove_cvref_t<T>>::vector_size;
|
||||
|
||||
constexpr index_t scalar_per_x_vector = scalar_type<remove_cvref_t<X>>::vector_size;
|
||||
|
||||
static_assert(scalar_per_x_vector % scalar_per_t_vector == 0,
|
||||
"wrong! X need to be multiple T");
|
||||
"wrong! X should contain multiple T");
|
||||
|
||||
static_assert(GetAddressSpace() == AddressSpaceEnum_t::Global, "only support global mem");
|
||||
static_assert(GetAddressSpace() == AddressSpaceEnum::Global, "only support global mem");
|
||||
|
||||
#if CK_USE_AMD_BUFFER_ATOMIC_ADD
|
||||
constexpr index_t t_per_x = scalar_per_x_vector / scalar_per_t_vector;
|
||||
|
||||
amd_buffer_atomic_add<remove_cvref_t<T>, t_per_x>(
|
||||
x, p_data_, i, is_valid_element, element_space_size_);
|
||||
#if CK_USE_AMD_BUFFER_ATOMIC_ADD_INTEGER && CK_USE_AMD_BUFFER_ATOMIC_ADD_FLOAT
|
||||
bool constexpr use_amd_buffer_addressing =
|
||||
is_same_v<remove_cvref_t<scalar_t>, int32_t> ||
|
||||
is_same_v<remove_cvref_t<scalar_t>, float> ||
|
||||
(is_same_v<remove_cvref_t<scalar_t>, half_t> && scalar_per_x_vector % 2 == 0);
|
||||
#elif CK_USE_AMD_BUFFER_ATOMIC_ADD_INTEGER && (!CK_USE_AMD_BUFFER_ATOMIC_ADD_FLOAT)
|
||||
bool constexpr use_amd_buffer_addressing = is_same_v<remove_cvref_t<scalar_t>, int32_t>;
|
||||
#elif(!CK_USE_AMD_BUFFER_ATOMIC_ADD_INTEGER) && CK_USE_AMD_BUFFER_ATOMIC_ADD_FLOAT
|
||||
bool constexpr use_amd_buffer_addressing =
|
||||
is_same_v<remove_cvref_t<scalar_t>, float> ||
|
||||
(is_same_v<remove_cvref_t<scalar_t>, half_t> && scalar_per_x_vector % 2 == 0);
|
||||
#else
|
||||
if(is_valid_element)
|
||||
{
|
||||
atomicAdd(&p_data_[i], x);
|
||||
}
|
||||
bool constexpr use_amd_buffer_addressing = false;
|
||||
#endif
|
||||
|
||||
if constexpr(use_amd_buffer_addressing)
|
||||
{
|
||||
constexpr index_t t_per_x = scalar_per_x_vector / scalar_per_t_vector;
|
||||
|
||||
amd_buffer_atomic_add<remove_cvref_t<T>, t_per_x>(
|
||||
x, p_data_, i, is_valid_element, element_space_size_);
|
||||
}
|
||||
else
|
||||
{
|
||||
if(is_valid_element)
|
||||
{
|
||||
// FIXME: atomicAdd is defined by HIP, need to avoid implicit type casting when
|
||||
// calling it
|
||||
atomicAdd(c_style_pointer_cast<X*>(&p_data_[i]), x);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
__host__ __device__ static constexpr bool IsStaticBuffer() { return false; }
|
||||
@@ -333,14 +328,14 @@ struct DynamicBuffer
|
||||
__host__ __device__ static constexpr bool IsDynamicBuffer() { return true; }
|
||||
};
|
||||
|
||||
template <AddressSpaceEnum_t BufferAddressSpace, typename T, typename ElementSpaceSize>
|
||||
template <AddressSpaceEnum BufferAddressSpace, typename T, typename ElementSpaceSize>
|
||||
__host__ __device__ constexpr auto make_dynamic_buffer(T* p, ElementSpaceSize element_space_size)
|
||||
{
|
||||
return DynamicBuffer<BufferAddressSpace, T, ElementSpaceSize, true>{p, element_space_size};
|
||||
}
|
||||
|
||||
template <
|
||||
AddressSpaceEnum_t BufferAddressSpace,
|
||||
AddressSpaceEnum BufferAddressSpace,
|
||||
typename T,
|
||||
typename ElementSpaceSize,
|
||||
typename X,
|
||||
@@ -353,4 +348,3 @@ make_dynamic_buffer(T* p, ElementSpaceSize element_space_size, X invalid_element
|
||||
}
|
||||
|
||||
} // namespace ck
|
||||
#endif
|
||||
|
||||
Reference in New Issue
Block a user