mirror of
https://github.com/ROCm/composable_kernel.git
synced 2026-05-12 01:10:17 +00:00
clean up
This commit is contained in:
@@ -396,14 +396,8 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer
|
||||
0,
|
||||
b_thread_data_on_global,
|
||||
0})
|
||||
#if 0
|
||||
.Run
|
||||
#else // tweaking
|
||||
.template Run_optimized_address_calculation<Float,
|
||||
address_space_t::generic,
|
||||
address_space_t::global>
|
||||
#endif
|
||||
(p_out_thread, p_out_global);
|
||||
.template Run<Float, address_space_t::generic, address_space_t::global>(
|
||||
p_out_thread, p_out_global);
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
@@ -427,10 +427,7 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_padded_lds_double_buf
|
||||
b_thread_data_on_global,
|
||||
0})
|
||||
#if 1
|
||||
.template Run_generic<Float,
|
||||
Float,
|
||||
address_space_t::generic,
|
||||
address_space_t::global>
|
||||
.template Run<Float, Float, address_space_t::generic, address_space_t::global>
|
||||
#else // tweaking
|
||||
.template Run_optimized_dst_address_calculation<Float,
|
||||
Float,
|
||||
|
||||
@@ -391,14 +391,8 @@ struct GridwiseConvolutionImplicitGemm_v4r4_nchw_kcyx_nkhw_lds_double_buffer
|
||||
for(index_t nrepeat = 0; nrepeat < GemmNRepeat; ++nrepeat)
|
||||
{
|
||||
threadwise_out_copy
|
||||
#if 1
|
||||
.Run
|
||||
#else // tweaking
|
||||
.template Run_optimized_address_calculation<Float,
|
||||
address_space_t::generic,
|
||||
address_space_t::global>
|
||||
#endif
|
||||
(p_out_thread, p_out_global);
|
||||
.template Run<Float, address_space_t::generic, address_space_t::global>(
|
||||
p_out_thread, p_out_global);
|
||||
|
||||
threadwise_out_copy.MoveSrcSliceWindow(Sequence<0, 0, GemmNPerThreadSubC>{}, True);
|
||||
threadwise_out_copy.MoveDstSliceWindow(Sequence<0, 0, B1>{}, True);
|
||||
|
||||
@@ -390,10 +390,7 @@ struct GridwiseConvolutionImplicitGemm_v4r4_nchw_kcyx_nkhw_padded_lds_double_buf
|
||||
b_thread_data_on_global / B1,
|
||||
b_thread_data_on_global % B1})
|
||||
#if 1
|
||||
.template Run_generic<Float,
|
||||
Float,
|
||||
address_space_t::generic,
|
||||
address_space_t::global>
|
||||
.template Run<Float, Float, address_space_t::generic, address_space_t::global>
|
||||
#else // tweaking
|
||||
.template Run_optimized_dst_address_calculation<Float,
|
||||
Float,
|
||||
|
||||
@@ -73,8 +73,8 @@ struct BlockwiseGenericTensorSliceCopy_v4
|
||||
__device__ void RunLoadThreadBuffer(const BlockSrcData* p_block_src,
|
||||
ThreadBufferData* p_thread_buffer) const
|
||||
{
|
||||
#if 1
|
||||
mThreadwiseLoad.template Run_generic<BlockSrcData,
|
||||
#if 0
|
||||
mThreadwiseLoad.template Run<BlockSrcData,
|
||||
ThreadBufferData,
|
||||
BlockSrcAddressSpace,
|
||||
ThreadBufferAddressSpace>(p_block_src,
|
||||
@@ -95,8 +95,8 @@ struct BlockwiseGenericTensorSliceCopy_v4
|
||||
__device__ void RunStoreThreadBuffer(const ThreadBufferData* p_thread_buffer,
|
||||
BlockDstData* p_block_dst) const
|
||||
{
|
||||
#if 1
|
||||
mThreadwiseStore.template Run_generic<ThreadBufferData,
|
||||
#if 0
|
||||
mThreadwiseStore.template Run<ThreadBufferData,
|
||||
BlockDstData,
|
||||
ThreadBufferAddressSpace,
|
||||
BlockDstAddressSpace>(p_thread_buffer, p_block_dst);
|
||||
|
||||
@@ -78,7 +78,7 @@ struct ThreadwiseGenericTensorSliceCopy_v4r2
|
||||
typename DstData,
|
||||
address_space_t SrcAddressSpace = address_space_t::generic,
|
||||
address_space_t DstAddressSpace = address_space_t::generic>
|
||||
__device__ void Run_generic(const SrcData* p_src, DstData* p_dst) const
|
||||
__device__ void Run(const SrcData* p_src, DstData* p_dst) const
|
||||
{
|
||||
using src_vector_t = typename vector_type<SrcData, SrcDataPerAccess>::MemoryType;
|
||||
using dst_vector_t = typename vector_type<DstData, DstDataPerAccess>::MemoryType;
|
||||
@@ -130,7 +130,7 @@ struct ThreadwiseGenericTensorSliceCopy_v4r2
|
||||
#if CK_USE_AMD_INTRINSIC && CK_USE_AMD_INTRINSIC_BUFFER_LOAD_STORE
|
||||
*reinterpret_cast<src_vector_t*>(&p_src_long_vector[buffer_offset]) =
|
||||
__buffer_load<SrcData, SrcDataPerAccess>(
|
||||
p_src, src_coord.GetOffset(), 0);
|
||||
p_src, 0, src_coord.GetOffset());
|
||||
#else
|
||||
*reinterpret_cast<src_vector_t*>(&p_src_long_vector[buffer_offset]) =
|
||||
*reinterpret_cast<const src_vector_t*>(&p_src[src_coord.GetOffset()]);
|
||||
@@ -172,8 +172,8 @@ struct ThreadwiseGenericTensorSliceCopy_v4r2
|
||||
__buffer_store<DstData, DstDataPerAccess>(
|
||||
*reinterpret_cast<dst_vector_t*>(&p_dst_long_vector[buffer_offset]),
|
||||
p_dst,
|
||||
dst_coord.GetOffset(),
|
||||
0);
|
||||
0,
|
||||
dst_coord.GetOffset());
|
||||
#else
|
||||
*reinterpret_cast<dst_vector_t*>(&p_dst[dst_coord.GetOffset()]) =
|
||||
*reinterpret_cast<dst_vector_t*>(&p_dst_long_vector[buffer_offset]);
|
||||
|
||||
@@ -538,235 +538,10 @@ struct ThreadwiseGenericTensorSliceCopy_v2r1
|
||||
}
|
||||
};
|
||||
|
||||
template <typename TData>
|
||||
__device__ void Run(const TData* p_src, TData* p_dst) const
|
||||
{
|
||||
constexpr auto buffer_desc = make_ConstantTensorDescriptor_packed(SliceLengths{});
|
||||
|
||||
TData p_buffer_[buffer_desc.GetElementSpace()];
|
||||
TData* p_buffer = p_buffer_;
|
||||
|
||||
// copy data from src into buffer
|
||||
{
|
||||
using src_vector_t = typename vector_type<TData, SrcDataPerAccess>::MemoryType;
|
||||
|
||||
constexpr auto src_vector_access_dim = Number<SrcVectorAccessDim>{};
|
||||
constexpr auto src_data_per_access = Number<SrcDataPerAccess>{};
|
||||
|
||||
constexpr auto src_access_lengths = SliceLengths::Modify(
|
||||
src_vector_access_dim,
|
||||
SliceLengths::Get(src_vector_access_dim) / src_data_per_access);
|
||||
|
||||
// Offset w.r.t merged dimensions need to be calculated at run-time. Offset w.r.t
|
||||
// normal dimensions is known at compile time.
|
||||
// Below is a hack to isolate merged dimension id from normal dimension id, so the
|
||||
// corresponding offset can be calculated seperately at run-time and compile-time.
|
||||
// src_merged_dim_access_lengths has the same value as src_access_lengths on src's
|
||||
// merged dimensions, and has value = 1 on normal dimensions;
|
||||
// src_merged_dim_access_lengths has the same value as src_access_lengths on src's
|
||||
// normal dimensions, and has value = 1 on merged dimensions;
|
||||
constexpr auto src_merged_dim_access_lengths = typename sequence_gen<
|
||||
nDim,
|
||||
IsolateMergedDimLengths<SrcDesc, decltype(src_access_lengths)>>::type{};
|
||||
|
||||
constexpr auto src_normal_dim_access_lengths =
|
||||
src_access_lengths + Number<1>{} - src_merged_dim_access_lengths;
|
||||
|
||||
#if CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V2R1
|
||||
// offset w.r.t. merged dimension need to be computed at run-time
|
||||
static_ford<decltype(src_merged_dim_access_lengths), SrcDimAccessOrder>{}([&](
|
||||
auto src_merged_dim_access_id_) {
|
||||
|
||||
constexpr auto src_merged_dim_access_id = decltype(src_merged_dim_access_id_){};
|
||||
|
||||
constexpr auto src_merged_dim_data_id = src_merged_dim_access_id.Modify(
|
||||
src_vector_access_dim,
|
||||
src_merged_dim_access_id[src_vector_access_dim] * src_data_per_access);
|
||||
|
||||
const TData* p_src_tmp =
|
||||
p_src + (mSrcSliceOrigin + src_merged_dim_data_id).GetOffset();
|
||||
|
||||
// offset w.r.t. normal dimension can be computed at compile-time
|
||||
static_ford<decltype(src_normal_dim_access_lengths), SrcDimAccessOrder>{}([&](
|
||||
auto src_normal_dim_access_id_) {
|
||||
|
||||
constexpr auto src_normal_dim_access_id = decltype(src_normal_dim_access_id_){};
|
||||
|
||||
constexpr auto src_normal_dim_data_id = src_normal_dim_access_id.Modify(
|
||||
src_vector_access_dim,
|
||||
src_normal_dim_access_id[src_vector_access_dim] * src_data_per_access);
|
||||
|
||||
constexpr index_t src_normal_offset =
|
||||
SrcDesc::GetOffsetFromMultiIndex(src_normal_dim_data_id);
|
||||
|
||||
// load vector from src
|
||||
const src_vector_t vector_data =
|
||||
*reinterpret_cast<const src_vector_t*>(&p_src_tmp[src_normal_offset]);
|
||||
|
||||
// unpack vector into buffer
|
||||
static_for<0, SrcDataPerAccess, 1>{}([&](auto i) {
|
||||
constexpr auto scalar_id =
|
||||
typename uniform_sequence_gen<nDim, 0>::type{}.Modify(
|
||||
src_vector_access_dim, i);
|
||||
|
||||
constexpr index_t buffer_offset = buffer_desc.GetOffsetFromMultiIndex(
|
||||
src_merged_dim_data_id + src_normal_dim_data_id + scalar_id);
|
||||
|
||||
p_buffer[buffer_offset] = reinterpret_cast<const TData*>(&vector_data)[i];
|
||||
});
|
||||
});
|
||||
});
|
||||
#else
|
||||
ford<decltype(src_merged_dim_access_lengths), SrcDimAccessOrder>{}([&](
|
||||
auto src_merged_dim_access_id) {
|
||||
|
||||
auto src_merged_dim_data_id = src_merged_dim_access_id;
|
||||
src_merged_dim_data_id(src_vector_access_dim) =
|
||||
src_merged_dim_access_id[src_vector_access_dim] * src_data_per_access;
|
||||
|
||||
const TData* p_src_tmp =
|
||||
p_src + (mSrcSliceOrigin + src_merged_dim_data_id).GetOffset();
|
||||
|
||||
// these should be compile-time known
|
||||
ford<decltype(src_normal_dim_access_lengths), SrcDimAccessOrder>{}([&](
|
||||
auto src_normal_dim_access_id) {
|
||||
|
||||
auto src_normal_dim_data_id = src_normal_dim_access_id;
|
||||
src_normal_dim_data_id(src_vector_access_dim) =
|
||||
src_normal_dim_access_id[src_vector_access_dim] * src_data_per_access;
|
||||
|
||||
const index_t src_normal_offset =
|
||||
SrcDesc::GetOffsetFromMultiIndex(src_normal_dim_data_id);
|
||||
|
||||
// load vector from src
|
||||
const src_vector_t vector_data =
|
||||
*reinterpret_cast<const src_vector_t*>(&p_src_tmp[src_normal_offset]);
|
||||
|
||||
// unpack vector into buffer
|
||||
for(index_t i = 0; i < SrcDataPerAccess; ++i)
|
||||
{
|
||||
auto scalar_id = make_zero_array<index_t, nDim>();
|
||||
scalar_id(src_vector_access_dim) = i;
|
||||
|
||||
const index_t buffer_offset = buffer_desc.GetOffsetFromMultiIndex(
|
||||
src_merged_dim_data_id + src_normal_dim_data_id + scalar_id);
|
||||
|
||||
p_buffer[buffer_offset] = reinterpret_cast<const TData*>(&vector_data)[i];
|
||||
}
|
||||
});
|
||||
});
|
||||
#endif
|
||||
}
|
||||
|
||||
// copy data from buffer into dst
|
||||
{
|
||||
using dst_vector_t = typename vector_type<TData, DstDataPerAccess>::MemoryType;
|
||||
|
||||
constexpr auto dst_vector_access_dim = Number<DstVectorAccessDim>{};
|
||||
constexpr auto dst_data_per_access = Number<DstDataPerAccess>{};
|
||||
|
||||
constexpr auto dst_access_lengths = SliceLengths::Modify(
|
||||
dst_vector_access_dim,
|
||||
SliceLengths::Get(dst_vector_access_dim) / dst_data_per_access);
|
||||
|
||||
constexpr auto dst_merged_dim_access_lengths = typename sequence_gen<
|
||||
nDim,
|
||||
IsolateMergedDimLengths<DstDesc, decltype(dst_access_lengths)>>::type{};
|
||||
|
||||
constexpr auto dst_normal_dim_access_lengths =
|
||||
dst_access_lengths + Number<1>{} - dst_merged_dim_access_lengths;
|
||||
|
||||
#if CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V2R1
|
||||
// offset w.r.t. merged dimension need to be computed at run-time
|
||||
static_ford<decltype(dst_merged_dim_access_lengths), DstDimAccessOrder>{}([&](
|
||||
auto dst_merged_dim_access_id_) {
|
||||
|
||||
constexpr auto dst_merged_dim_access_id = decltype(dst_merged_dim_access_id_){};
|
||||
|
||||
constexpr auto dst_merged_dim_data_id = dst_merged_dim_access_id.Modify(
|
||||
dst_vector_access_dim,
|
||||
dst_merged_dim_access_id[dst_vector_access_dim] * dst_data_per_access);
|
||||
|
||||
TData* p_dst_tmp = p_dst + (mDstSliceOrigin + dst_merged_dim_data_id).GetOffset();
|
||||
|
||||
// offset w.r.t. normal dimension can be computed at compile-time
|
||||
static_ford<decltype(dst_normal_dim_access_lengths), DstDimAccessOrder>{}([&](
|
||||
auto dst_normal_dim_access_id_) {
|
||||
constexpr auto dst_normal_dim_access_id = decltype(dst_normal_dim_access_id_){};
|
||||
|
||||
constexpr auto dst_normal_dim_data_id = dst_normal_dim_access_id.Modify(
|
||||
dst_vector_access_dim,
|
||||
dst_normal_dim_access_id[dst_vector_access_dim] * dst_data_per_access);
|
||||
|
||||
dst_vector_t vector_data;
|
||||
|
||||
// pack vector from buffer
|
||||
static_for<0, DstDataPerAccess, 1>{}([&](auto i) {
|
||||
constexpr auto scalar_id =
|
||||
typename uniform_sequence_gen<nDim, 0>::type{}.Modify(
|
||||
dst_vector_access_dim, i);
|
||||
|
||||
constexpr index_t buffer_offset = buffer_desc.GetOffsetFromMultiIndex(
|
||||
dst_merged_dim_data_id + dst_normal_dim_data_id + scalar_id);
|
||||
|
||||
reinterpret_cast<TData*>(&vector_data)[i] = p_buffer[buffer_offset];
|
||||
});
|
||||
|
||||
constexpr index_t dst_normal_offset =
|
||||
DstDesc::GetOffsetFromMultiIndex(dst_normal_dim_data_id);
|
||||
|
||||
// write vector into dst
|
||||
*reinterpret_cast<dst_vector_t*>(&p_dst_tmp[dst_normal_offset]) = vector_data;
|
||||
});
|
||||
});
|
||||
#else
|
||||
// offset w.r.t. merged dimension need to be computed at run-time
|
||||
ford<decltype(dst_merged_dim_access_lengths), DstDimAccessOrder>{}([&](
|
||||
auto dst_merged_dim_access_id) {
|
||||
|
||||
auto dst_merged_dim_data_id = dst_merged_dim_access_id;
|
||||
dst_merged_dim_data_id(dst_vector_access_dim) =
|
||||
dst_merged_dim_access_id[dst_vector_access_dim] * dst_data_per_access;
|
||||
|
||||
TData* p_dst_tmp = p_dst + (mDstSliceOrigin + dst_merged_dim_data_id).GetOffset();
|
||||
|
||||
// offset w.r.t. normal dimension can be computed at compile-time
|
||||
ford<decltype(dst_normal_dim_access_lengths), DstDimAccessOrder>{}([&](
|
||||
auto dst_normal_dim_access_id) {
|
||||
|
||||
auto dst_normal_dim_data_id = dst_normal_dim_access_id;
|
||||
dst_normal_dim_data_id(dst_vector_access_dim) =
|
||||
dst_normal_dim_access_id[dst_vector_access_dim] * dst_data_per_access;
|
||||
|
||||
dst_vector_t vector_data;
|
||||
|
||||
// pack vector from buffer
|
||||
for(index_t i = 0; i < DstDataPerAccess; ++i)
|
||||
{
|
||||
auto scalar_id = make_zero_array<index_t, nDim>();
|
||||
scalar_id(dst_vector_access_dim) = i;
|
||||
|
||||
const index_t buffer_offset = buffer_desc.GetOffsetFromMultiIndex(
|
||||
dst_merged_dim_data_id + dst_normal_dim_data_id + scalar_id);
|
||||
|
||||
reinterpret_cast<TData*>(&vector_data)[i] = p_buffer[buffer_offset];
|
||||
}
|
||||
|
||||
const index_t dst_normal_offset =
|
||||
DstDesc::GetOffsetFromMultiIndex(dst_normal_dim_data_id);
|
||||
|
||||
// write vector into dst
|
||||
*reinterpret_cast<dst_vector_t*>(&p_dst_tmp[dst_normal_offset]) = vector_data;
|
||||
});
|
||||
});
|
||||
#endif
|
||||
}
|
||||
}
|
||||
|
||||
template <typename TData,
|
||||
address_space_t SrcAddressSpace = address_space_t::generic,
|
||||
address_space_t DstAddressSpace = address_space_t::generic>
|
||||
__device__ void Run_optimized_address_calculation(const TData* p_src, TData* p_dst) const
|
||||
__device__ void Run(const TData* p_src, TData* p_dst) const
|
||||
{
|
||||
constexpr auto buffer_desc = make_ConstantTensorDescriptor_packed(SliceLengths{});
|
||||
|
||||
@@ -841,9 +616,7 @@ struct ThreadwiseGenericTensorSliceCopy_v2r1
|
||||
static_if<SrcAddressSpace == address_space_t::global>{}([&](auto) {
|
||||
#if CK_USE_AMD_INTRINSIC && CK_USE_AMD_INTRINSIC_BUFFER_LOAD_STORE
|
||||
vector_data = __buffer_load<TData, SrcDataPerAccess>(
|
||||
p_src,
|
||||
static_cast<uint32_t>(src_merged_offset),
|
||||
static_cast<uint32_t>(src_normal_offset));
|
||||
p_src, src_merged_offset, src_normal_offset);
|
||||
#else
|
||||
vector_data = *reinterpret_cast<const src_vector_t*>(
|
||||
&p_src[src_normal_offset + src_merged_offset]);
|
||||
|
||||
@@ -448,7 +448,7 @@ int main(int argc, char* argv[])
|
||||
ConvStrides{},
|
||||
ConvDilations{},
|
||||
nrepeat);
|
||||
#elif 0
|
||||
#elif 1
|
||||
device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_padded(in_nchw_desc,
|
||||
in_nchw,
|
||||
wei_kcyx_desc,
|
||||
@@ -490,7 +490,7 @@ int main(int argc, char* argv[])
|
||||
ConvStrides{},
|
||||
ConvDilations{},
|
||||
nrepeat);
|
||||
#elif 1
|
||||
#elif 0
|
||||
device_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw_padded(in_nchw_desc,
|
||||
in_nchw,
|
||||
wei_kcyx_desc,
|
||||
|
||||
Reference in New Issue
Block a user