mirror of
https://github.com/ROCm/composable_kernel.git
synced 2026-05-20 04:49:54 +00:00
* support dynamic tensor descriptor
* use buffer load OOB feature for padding case
* add navi support
* add int8x4 inference kernel
Co-authored-by: Chao Liu <chao@ixt-rack-81.local.lan>
Co-authored-by: Jing Zhang <jizhan@amd.com>
[ROCm/composable_kernel commit: fcbb978828]
110 lines
3.7 KiB
C++
110 lines
3.7 KiB
C++
#ifndef CK_IN_MEMORY_OPERATION_NVIDIA_HPP
|
|
#define CK_IN_MEMORY_OPERATION_NVIDIA_HPP
|
|
|
|
namespace ck {
|
|
|
|
// Generic atomic add: forwards to the atomicAdd overload selected for T.
//   p_dst : address of the value to be updated
//   src   : value to add to *p_dst
// The value returned by atomicAdd is intentionally discarded.
template <typename T>
__device__ void atomic_add_impl(T* p_dst, T src)
{
    static_cast<void>(atomicAdd(p_dst, src));
}
|
|
|
|
// atomicAdd has no overload for float vector types, so the specializations below
// add each float component separately
|
|
// Specialization for float2_t: adds the two float components one at a time.
//   p_dst : address of the float2_t to be updated
//   src   : float2_t whose components are added to *p_dst
// Each component is updated by its own atomicAdd call, so the vector as a whole is
// NOT updated atomically; only the individual float components are.
// NOTE(review): the reinterpret_casts assume float2_t is laid out as 2 contiguous
// floats -- confirm against the definition of float2_t.
// Declared inline: an explicit specialization is an ordinary function definition, so
// without inline this header would violate the one-definition rule when included
// from more than one translation unit.
template <>
__device__ inline void atomic_add_impl<float2_t>(float2_t* p_dst, float2_t src)
{
    float* p_dst_float       = reinterpret_cast<float*>(p_dst);
    const float* p_src_float = reinterpret_cast<const float*>(&src);

    for(index_t i = 0; i < 2; ++i)
    {
        atomicAdd(&(p_dst_float[i]), p_src_float[i]);
    }
}
|
|
|
|
// Specialization for float4_t: adds the four float components one at a time.
//   p_dst : address of the float4_t to be updated
//   src   : float4_t whose components are added to *p_dst
// Each component is updated by its own atomicAdd call, so the vector as a whole is
// NOT updated atomically; only the individual float components are.
// NOTE(review): the reinterpret_casts assume float4_t is laid out as 4 contiguous
// floats -- confirm against the definition of float4_t.
// Declared inline: an explicit specialization is an ordinary function definition, so
// without inline this header would violate the one-definition rule when included
// from more than one translation unit.
template <>
__device__ inline void atomic_add_impl<float4_t>(float4_t* p_dst, float4_t src)
{
    float* p_dst_float       = reinterpret_cast<float*>(p_dst);
    const float* p_src_float = reinterpret_cast<const float*>(&src);

    for(index_t i = 0; i < 4; ++i)
    {
        atomicAdd(&(p_dst_float[i]), p_src_float[i]);
    }
}
|
|
|
|
// Functor that overwrites destination data with source data using a single
// vector_t-typed assignment.
//   T             : element type of the source and destination buffers
//   DataPerAccess : second argument of vector_type; presumably the number of T
//                   elements packed into vector_t (TODO confirm against vector_type)
template <typename T, index_t DataPerAccess>
struct SetData
{
    using vector_t = typename vector_type<T, DataPerAccess>::type;

    // Reads one vector_t starting at p_src[src_offset] and stores it starting at
    // p_dst[dst_offset].
    // SrcAddressSpace / DstAddressSpace are accepted but not used by this implementation.
    // NOTE(review): both addresses are reinterpreted as vector_t pointers, so they are
    // presumably required to be suitably aligned for vector_t -- verify against callers.
    template <AddressSpace SrcAddressSpace, AddressSpace DstAddressSpace>
    __device__ void Run(const T* p_src, index_t src_offset, T* p_dst, index_t dst_offset) const
    {
        const vector_t* p_src_vector = reinterpret_cast<const vector_t*>(p_src + src_offset);
        vector_t* p_dst_vector       = reinterpret_cast<vector_t*>(p_dst + dst_offset);

        *p_dst_vector = *p_src_vector;
    }
};
|
|
|
|
// Functor that accumulates source data into destination data through
// atomic_add_impl on a vector_t-typed value.
//   T             : element type of the source and destination buffers
//   DataPerAccess : second argument of vector_type; presumably the number of T
//                   elements packed into vector_t (TODO confirm against vector_type)
template <typename T, index_t DataPerAccess>
struct AtomicAddData
{
    using vector_t = typename vector_type<T, DataPerAccess>::type;

    // Reads one vector_t starting at p_src[src_offset] and passes it, together with
    // the address of p_dst[dst_offset] viewed as a vector_t, to atomic_add_impl.
    // SrcAddressSpace / DstAddressSpace are accepted but not used by this implementation.
    // NOTE(review): both addresses are reinterpreted as vector_t pointers, so they are
    // presumably required to be suitably aligned for vector_t -- verify against callers.
    template <AddressSpace SrcAddressSpace, AddressSpace DstAddressSpace>
    __device__ void Run(const T* p_src, index_t src_offset, T* p_dst, index_t dst_offset) const
    {
        const vector_t* p_src_vector = reinterpret_cast<const vector_t*>(p_src + src_offset);
        vector_t* p_dst_vector       = reinterpret_cast<vector_t*>(p_dst + dst_offset);

        atomic_add_impl(p_dst_vector, *p_src_vector);
    }
};
|
|
|
|
// Transfers DataPerAccess elements of T from p_src to p_dst.
//   DstInMemOp    : operation applied at the destination; Set dispatches to SetData,
//                   AtomicAdd dispatches to AtomicAddData. Any other value is rejected
//                   at compile time.
//   SrcDataStride : element stride between consecutive source elements (default 1)
//   DstDataStride : element stride between consecutive destination elements (default 1)
//   p_src, src_offset : source buffer and index of the first source element
//   p_dst, dst_offset : destination buffer and index of the first destination element
// When both strides are 1 the transfer is issued as one access of width DataPerAccess;
// otherwise it is issued as DataPerAccess accesses of width 1, the i-th one touching
// p_src[src_offset + i * SrcDataStride] and p_dst[dst_offset + i * DstDataStride].
template <typename T,
          index_t DataPerAccess,
          AddressSpace SrcAddressSpace,
          AddressSpace DstAddressSpace,
          InMemoryDataOperation DstInMemOp,
          index_t SrcDataStride = 1,
          index_t DstDataStride = 1>
__device__ void transfer_data(const T* p_src, index_t src_offset, T* p_dst, index_t dst_offset)
{
    constexpr bool is_set         = (DstInMemOp == InMemoryDataOperation::Set);
    constexpr bool is_atomic_add  = (DstInMemOp == InMemoryDataOperation::AtomicAdd);
    constexpr bool is_unit_stride = (SrcDataStride == 1 && DstDataStride == 1);

    static_assert(is_set || is_atomic_add, "wrong! InMemoryDataOperation not supported!");

    // keep it simple: a plain `if` is used on purpose instead of static_if, otherwise
    // the compiler will do weird things
    if(is_unit_stride)
    {
        // contiguous on both sides: a single access covering all DataPerAccess elements
        // TODO: use static_if::ElseIf
        static_if<is_set>{}([&](auto) {
            SetData<T, DataPerAccess>{}.template Run<SrcAddressSpace, DstAddressSpace>(
                p_src, src_offset, p_dst, dst_offset);
        });

        static_if<is_atomic_add>{}([&](auto) {
            AtomicAddData<T, DataPerAccess>{}.template Run<SrcAddressSpace, DstAddressSpace>(
                p_src, src_offset, p_dst, dst_offset);
        });
    }
    else
    {
        // strided on at least one side: one single-element access per element
        for(index_t elem = 0; elem < DataPerAccess; ++elem)
        {
            const index_t src_elem_offset = src_offset + elem * SrcDataStride;
            const index_t dst_elem_offset = dst_offset + elem * DstDataStride;

            // TODO: use static_if::ElseIf
            static_if<is_set>{}([&](auto) {
                SetData<T, 1>{}.template Run<SrcAddressSpace, DstAddressSpace>(
                    p_src, src_elem_offset, p_dst, dst_elem_offset);
            });

            static_if<is_atomic_add>{}([&](auto) {
                AtomicAddData<T, 1>{}.template Run<SrcAddressSpace, DstAddressSpace>(
                    p_src, src_elem_offset, p_dst, dst_elem_offset);
            });
        }
    }
}
|
|
|
|
} // namespace ck
|
|
#endif
|