mirror of https://github.com/ROCm/composable_kernel.git
removing old implementation of tensor descriptor
@@ -431,9 +431,9 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_padded_lds_double_buf
                 b_thread_data_on_global,
                 0})
 #if 1
-            .template Run_generic<Float, address_space_t::generic, address_space_t::global>
+            .template Run_generic<Float, Float, address_space_t::generic, address_space_t::global>
 #elif 1
-            .template Run_optimized_dst_address_calculation<Float, address_space_t::global>
+            .template Run_optimized_dst_address_calculation<Float, Float, address_space_t::global>
 #endif
             (p_out_thread, p_out_global);
     }
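The caller-side change above is purely mechanical: each Run_generic / Run_optimized_dst_address_calculation instantiation gains a second type argument, so the source and destination element types may now differ (here both are still Float). A minimal host-side sketch of the new shape of the interface; run_copy and the float/double instantiation are illustrative names, not part of the commit:

#include <cstddef>

// Before: template <typename TData, ...> void Run(const TData*, TData*);
// After: one element type per side, so the copy can convert in flight.
template <typename SrcData, typename DstData>
void run_copy(const SrcData* p_src, DstData* p_dst, std::size_t n)
{
    for(std::size_t i = 0; i < n; ++i)
        p_dst[i] = static_cast<DstData>(p_src[i]); // SrcData -> DstData
}

int main()
{
    float src[4] = {1.5f, 2.5f, 3.5f, 4.5f};
    double dst[4] = {};
    run_copy(src, dst, 4); // deduced as run_copy<float, double>
}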
@@ -734,43 +734,46 @@ struct BlockwiseGenericTensorSliceCopy_v4
         return RegisterBufferDesc::GetElementSpace();
     }

-    template <typename TData, address_space_t SrcAddressSpace = address_space_t::generic>
-    __device__ void RunLoadRegisterBuffer(const TData* p_src, TData* p_buffer) const
+    template <typename SrcData, typename BufferData, address_space_t SrcAddressSpace = address_space_t::generic>
+    __device__ void RunLoadRegisterBuffer(const SrcData* p_src, BufferData* p_buffer) const
     {
 #if 1
-        mThreadwiseLoad.template Run_generic<TData, SrcAddressSpace, address_space_t::generic>(
+        mThreadwiseLoad.template Run_generic<SrcData, BufferData, SrcAddressSpace, address_space_t::generic>(
             p_src, p_buffer);
 #else
-        mThreadwiseLoad.template Run_optimized_src_address_calculation<TData,
+        mThreadwiseLoad.template Run_optimized_src_address_calculation<SrcData,
+                                                                       BufferData,
                                                                        SrcAddressSpace,
                                                                        address_space_t::generic>(
             p_src, p_buffer);
 #endif
     }

-    template <typename TData, address_space_t DstAddressSpace = address_space_t::generic>
-    __device__ void RunStoreRegisterBuffer(const TData* p_buffer, TData* p_dst) const
+    template <typename BufferData, typename DstData, address_space_t DstAddressSpace = address_space_t::generic>
+    __device__ void RunStoreRegisterBuffer(const BufferData* p_buffer, DstData* p_dst) const
     {
 #if 1
-        mThreadwiseStore.template Run_generic<TData, address_space_t::generic, DstAddressSpace>(
+        mThreadwiseStore.template Run_generic<BufferData, DstData, address_space_t::generic, DstAddressSpace>(
             p_buffer, p_dst);
 #else
-        mThreadwiseStore.template Run_optimized_dst_address_calculation<TData,
+        mThreadwiseStore.template Run_optimized_dst_address_calculation<BufferData,
+                                                                        DstData,
                                                                         address_space_t::generic,
                                                                         DstAddressSpace>(p_buffer,
                                                                                          p_dst);
 #endif
     }

-    template <typename TData,
+    template <typename SrcData,
+              typename DstData,
               address_space_t SrcAddressSpace = address_space_t::generic,
               address_space_t DstAddressSpace = address_space_t::generic>
-    __device__ void Run(const TData* p_src, TData* p_dst) const
+    __device__ void Run(const SrcData* p_src, DstData* p_dst) const
     {
-        TData p_buffer[GetRegisterBufferSize()];
+        SrcData p_src_buffer[GetRegisterBufferSize()];

-        RunLoadRegisterBuffer<TData, SrcAddressSpace>(p_src, p_buffer);
-        RunStoreRegisterBuffer<TData, DstAddressSpace>(p_buffer, p_dst);
+        RunLoadRegisterBuffer<SrcData, SrcData, SrcAddressSpace>(p_src, p_src_buffer);
+        RunStoreRegisterBuffer<SrcData, DstData, DstAddressSpace>(p_src_buffer, p_dst);
     }

     template <typename T, bool PositiveDirection>
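After this change the blockwise copy threads three element types through its two stages: the register buffer is filled as SrcData by the load and only converted to DstData by the store, so a narrowing conversion happens exactly once, on the way out. A stand-alone sketch of the same two-stage structure, with made-up names (two_stage_copy, load, store) and a float-to-int instantiation for illustration:

#include <cstddef>

template <std::size_t N>
struct two_stage_copy
{
    // stage 1: source -> register buffer (kept in the source type here)
    template <typename SrcData, typename BufferData>
    void load(const SrcData* p_src, BufferData* p_buffer) const
    {
        for(std::size_t i = 0; i < N; ++i)
            p_buffer[i] = static_cast<BufferData>(p_src[i]);
    }

    // stage 2: register buffer -> destination, converting on the way out
    template <typename BufferData, typename DstData>
    void store(const BufferData* p_buffer, DstData* p_dst) const
    {
        for(std::size_t i = 0; i < N; ++i)
            p_dst[i] = static_cast<DstData>(p_buffer[i]);
    }

    template <typename SrcData, typename DstData>
    void run(const SrcData* p_src, DstData* p_dst) const
    {
        SrcData p_src_buffer[N];     // staging buffer keeps the source type
        load(p_src, p_src_buffer);   // SrcData -> SrcData: no conversion yet
        store(p_src_buffer, p_dst);  // SrcData -> DstData: convert on store
    }
};

int main()
{
    two_stage_copy<4> copy;
    float src[4] = {1.5f, 2.5f, 3.5f, 4.5f};
    int dst[4] = {};
    copy.run(src, dst); // truncates to {1, 2, 3, 4}
}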
@@ -1179,13 +1179,14 @@ struct ThreadwiseGenericTensorSliceCopy_v4r2

     // Will do padding check on src data: Read 0 if src data is in padding area.
     // Will do padding check on dst data: No write if dst data is in padding area.
-    template <typename TData,
+    template <typename SrcData,
+              typename DstData,
               address_space_t SrcAddressSpace = address_space_t::generic,
               address_space_t DstAddressSpace = address_space_t::generic>
-    __device__ void Run_generic(const TData* p_src, TData* p_dst) const
+    __device__ void Run_generic(const SrcData* p_src, DstData* p_dst) const
     {
-        using src_vector_t = typename vector_type<TData, SrcDataPerAccess>::MemoryType;
-        using dst_vector_t = typename vector_type<TData, DstDataPerAccess>::MemoryType;
+        using src_vector_t = typename vector_type<SrcData, SrcDataPerAccess>::MemoryType;
+        using dst_vector_t = typename vector_type<DstData, DstDataPerAccess>::MemoryType;

         constexpr auto vector_access_dim = Number<VectorAccessDim>{};

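With SrcData and DstData distinct, the two vector aliases no longer share an element type, so source and destination accesses can have different byte widths. A small sketch of the consequence for the types involved; this vector_type is a simplified stand-in for the library's template of the same name:

// simplified stand-in for the library's vector_type template
template <typename T, int N>
struct vector_type
{
    struct MemoryType { T data[N]; };
};

// with the split, each side gets its own vector type and byte width
using src_vector_t = vector_type<float, 4>::MemoryType;  // 16 bytes
using dst_vector_t = vector_type<double, 4>::MemoryType; // 32 bytes

static_assert(sizeof(src_vector_t) == 16, "4 x float");
static_assert(sizeof(dst_vector_t) == 32, "4 x double");

int main() {}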
@@ -1205,13 +1206,13 @@ struct ThreadwiseGenericTensorSliceCopy_v4r2
             long_vector_data_begin_id(vector_access_dim) =
                 long_vector_size * long_vector_access_id[vector_access_dim];

-            // buffer to hold a long-vector
-            TData p_long_vector[long_vector_size];
+            // buffer to hold a src long-vector
+            SrcData p_src_long_vector[long_vector_size];

             // zero out buffer
             for(index_t i = 0; i < long_vector_size; ++i)
             {
-                p_long_vector[i] = 0;
+                p_src_long_vector[i] = 0;
             }

             // load data from src to the long-vector buffer
@@ -1231,20 +1232,28 @@ struct ThreadwiseGenericTensorSliceCopy_v4r2
                 {
                     static_if<SrcAddressSpace == address_space_t::global>{}([&](auto) {
 #if CK_USE_AMD_INTRINSIC && CK_USE_AMD_INTRINSIC_BUFFER_LOAD_STORE
-                        *reinterpret_cast<src_vector_t*>(&p_long_vector[buffer_offset]) =
-                            __buffer_load<TData, SrcDataPerAccess>(p_src, src_coord.GetOffset(), 0);
+                        *reinterpret_cast<src_vector_t*>(&p_src_long_vector[buffer_offset]) =
+                            __buffer_load<SrcData, SrcDataPerAccess>(p_src, src_coord.GetOffset(), 0);
 #else
-                        *reinterpret_cast<src_vector_t*>(&p_long_vector[buffer_offset]) =
+                        *reinterpret_cast<src_vector_t*>(&p_src_long_vector[buffer_offset]) =
                             *reinterpret_cast<const src_vector_t*>(&p_src[src_coord.GetOffset()]);
 #endif
                     }).Else([&](auto) {
                         // src can be all kinds of memory-space.
-                        *reinterpret_cast<src_vector_t*>(&p_long_vector[buffer_offset]) =
+                        *reinterpret_cast<src_vector_t*>(&p_src_long_vector[buffer_offset]) =
                             *reinterpret_cast<const src_vector_t*>(&p_src[src_coord.GetOffset()]);
                     });
                 }
             }

+            // SrcData to DstData conversion
+            DstData p_dst_long_vector[long_vector_size];
+
+            for(index_t i = 0; i < long_vector_size; ++i)
+            {
+                p_dst_long_vector[i] = type_convert<DstData>(p_src_long_vector[i]);
+            }
+
             // store data from the long-vector buffer to dst
             for(index_t i = 0; i < long_vector_size / dst_data_per_access; ++i)
             {
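These hunks are the heart of the change inside Run_generic: the gather buffer is typed SrcData and zeroed first, so reads that fall in the padding area contribute 0, and a new buffer typed DstData is filled by an elementwise type_convert before the vectorized stores. A hedged host-side sketch of that staging, with a contiguous copy standing in for the coordinate arithmetic and std::memcpy standing in for the reinterpret_cast punning:

#include <cstring>

template <typename SrcData, typename DstData, int long_vector_size>
void stage_convert_store(const SrcData* p_src, DstData* p_dst)
{
    // buffer to hold a src long-vector
    SrcData p_src_long_vector[long_vector_size];

    // zero out buffer: elements in the padding area stay 0
    for(int i = 0; i < long_vector_size; ++i)
        p_src_long_vector[i] = 0;

    // load (a contiguous copy stands in for the real coordinate math)
    for(int i = 0; i < long_vector_size; ++i)
        p_src_long_vector[i] = p_src[i];

    // SrcData to DstData conversion, mirroring the new p_dst_long_vector
    DstData p_dst_long_vector[long_vector_size];
    for(int i = 0; i < long_vector_size; ++i)
        p_dst_long_vector[i] = static_cast<DstData>(p_src_long_vector[i]);

    // vectorized store, dst_data_per_access elements per access
    constexpr int dst_data_per_access = 4;
    for(int i = 0; i < long_vector_size / dst_data_per_access; ++i)
        std::memcpy(&p_dst[i * dst_data_per_access],
                    &p_dst_long_vector[i * dst_data_per_access],
                    dst_data_per_access * sizeof(DstData));
}

int main()
{
    float src[8] = {1, 2, 3, 4, 5, 6, 7, 8};
    double dst[8] = {};
    stage_convert_store<float, double, 8>(src, dst);
}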
@@ -1262,19 +1271,19 @@ struct ThreadwiseGenericTensorSliceCopy_v4r2
                 {
                     static_if<DstAddressSpace == address_space_t::global>{}([&](auto) {
 #if CK_USE_AMD_INTRINSIC && CK_USE_AMD_INTRINSIC_BUFFER_LOAD_STORE
-                        __buffer_store<TData, DstDataPerAccess>(
-                            *reinterpret_cast<dst_vector_t*>(&p_long_vector[buffer_offset]),
+                        __buffer_store<DstData, DstDataPerAccess>(
+                            *reinterpret_cast<dst_vector_t*>(&p_dst_long_vector[buffer_offset]),
                             p_dst,
                             dst_coord.GetOffset(),
                             0);
 #else
                         *reinterpret_cast<dst_vector_t*>(&p_dst[dst_coord.GetOffset()]) =
-                            *reinterpret_cast<dst_vector_t*>(&p_long_vector[buffer_offset]);
+                            *reinterpret_cast<dst_vector_t*>(&p_dst_long_vector[buffer_offset]);
 #endif
                     }).Else([&](auto) {
                         // dst can be all kinds of memory-space
                         *reinterpret_cast<dst_vector_t*>(&p_dst[dst_coord.GetOffset()]) =
-                            *reinterpret_cast<dst_vector_t*>(&p_long_vector[buffer_offset]);
+                            *reinterpret_cast<dst_vector_t*>(&p_dst_long_vector[buffer_offset]);
                     });
                 }
             }
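On the store side the intrinsic path must now be instantiated with the destination element type, since that is what reaches memory. The sketch below models the compile-time dispatch between the global-memory fast path and the generic pointer path; it uses C++17 if constexpr where the library uses static_if, and buffer_store is a plain stand-in for the AMD __buffer_store intrinsic, not its real signature:

#include <cstring>

enum class address_space_t { generic, global };

// plain stand-in for the AMD __buffer_store intrinsic
template <typename DstData, int DstDataPerAccess>
void buffer_store(const DstData* v, DstData* p_dst, int dst_offset)
{
    std::memcpy(p_dst + dst_offset, v, DstDataPerAccess * sizeof(DstData));
}

template <typename DstData, int DstDataPerAccess,
          address_space_t DstAddressSpace = address_space_t::generic>
void store_vector(const DstData* p_dst_long_vector, int buffer_offset,
                  DstData* p_dst, int dst_offset)
{
    if constexpr(DstAddressSpace == address_space_t::global)
    {
        // fast path: the real code lowers this to a buffer-store instruction
        buffer_store<DstData, DstDataPerAccess>(
            &p_dst_long_vector[buffer_offset], p_dst, dst_offset);
    }
    else
    {
        // generic path: dst can be any kind of memory space
        std::memcpy(&p_dst[dst_offset],
                    &p_dst_long_vector[buffer_offset],
                    DstDataPerAccess * sizeof(DstData));
    }
}

int main()
{
    float buf[4] = {1, 2, 3, 4};
    float dst[8] = {};
    store_vector<float, 4, address_space_t::global>(buf, 0, dst, 4);
}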