Mirror of https://github.com/ROCm/composable_kernel.git
clean up
@@ -155,13 +155,12 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer
         static_assert(in_e_n1_b_n2_block_desc.GetStride(I1) % GemmDataPerReadB == 0,
                       "GemmDataPerReadB alignment requirement is not satisfied");
-
-#if 0
+#if 1
         // input blockwise copy
         // slice a merged tensor, reorder and copy to a normal tensor
         // this copy operator already has blockwise offset built-in
         auto blockwise_in_copy =
             BlockwiseGenericTensorSliceCopy_v1<BlockSize,
                                                Float,
                                                decltype(in_e_n1_b_n2_global_merged_desc),
                                                decltype(in_e_n1_b_n2_block_desc),
                                                decltype(in_e_n1_b_n2_block_desc.GetLengths()),
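The hunk above enables the BlockwiseGenericTensorSliceCopy_v1 path for the input tensor. For orientation, a minimal host-side sketch (not CK's actual implementation; tile sizes, strides, and names such as EPerBlock, sub_e, and sub_b are illustrative assumptions) of what a blockwise slice copy does: the block tile is partitioned among threads, and each thread moves its own sub-tile from a strided source into a packed block buffer.

    // Hedged illustration only: one loop iteration below stands in for one thread.
    #include <cstdio>
    #include <vector>

    int main()
    {
        constexpr int EPerBlock = 4, BPerBlock = 8; // block tile (assumed sizes)
        constexpr int sub_e = 2, sub_b = 4;         // per-thread sub-tile lengths
        constexpr int src_stride_e = 16;            // strided "global" row pitch (assumed)

        std::vector<float> global_buf(EPerBlock * src_stride_e, 1.0f);
        std::vector<float> block_buf(EPerBlock * BPerBlock, 0.0f);

        for(int te = 0; te < EPerBlock / sub_e; ++te)     // "thread" grid over the tile
            for(int tb = 0; tb < BPerBlock / sub_b; ++tb)
                for(int e = 0; e < sub_e; ++e)
                    for(int b = 0; b < sub_b; ++b)
                    {
                        int ge = te * sub_e + e, gb = tb * sub_b + b;
                        // strided read from "global", packed write into the block tile
                        block_buf[ge * BPerBlock + gb] = global_buf[ge * src_stride_e + gb];
                    }

        std::printf("copied %d elements\n", EPerBlock * BPerBlock);
    }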
@@ -199,13 +198,12 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer
                                                Sequence<EPerBlock, KPerBlock>{},
                                                Number<math::lcm(WeiBlockCopyDstDataPerWrite_K, GemmDataPerReadA)>{});
-
-#if 0
+#if 1
         // operator for blockwise copy of weight into LDS
         // slice a tensor, and copy it into another tensor
         // this copy operator already has blockwise offset built-in
         auto blockwise_wei_copy =
             BlockwiseGenericTensorSliceCopy_v1<BlockSize,
                                                Float,
                                                decltype(wei_e_k_global_desc),
                                                decltype(wei_e_k_block_desc),
                                                decltype(wei_e_k_block_desc.GetLengths()),
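The Number<math::lcm(WeiBlockCopyDstDataPerWrite_K, GemmDataPerReadA)> argument above sets the alignment of the LDS tile: a stride divisible by the lcm of the two vector widths satisfies both the vectorized LDS write of the copy and the vectorized LDS read of the GEMM. A minimal sketch, with both widths assumed purely for illustration:

    // Hedged sketch of the lcm alignment rule (widths are assumptions).
    #include <cstdio>
    #include <numeric>

    int main()
    {
        constexpr int WeiBlockCopyDstDataPerWrite_K = 4; // assumed vector store width
        constexpr int GemmDataPerReadA              = 2; // assumed vector load width
        constexpr int align = std::lcm(WeiBlockCopyDstDataPerWrite_K, GemmDataPerReadA);

        static_assert(align % WeiBlockCopyDstDataPerWrite_K == 0, "store width must divide");
        static_assert(align % GemmDataPerReadA == 0, "load width must divide");
        std::printf("LDS row alignment = %d elements\n", align); // prints 4
    }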
@@ -324,7 +322,7 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer
             Float p_in_register_buffer[blockwise_in_copy.GetRegisterBufferSize()];
             Float p_wei_register_buffer[blockwise_wei_copy.GetRegisterBufferSize()];
 
-#if 0
+#if 1
             blockwise_in_copy.MoveSlicingWindowOnSourceTensor(I0, Number<EPerBlock>{}, True);
             // blockwise_wei_copy.MoveSlicingWindowOnSourceTensor(I0, Number<EPerBlock>{},
             // True);
@@ -356,7 +354,7 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer
             Float p_in_register_buffer[blockwise_in_copy.GetRegisterBufferSize()];
             Float p_wei_register_buffer[blockwise_wei_copy.GetRegisterBufferSize()];
 
-#if 0
+#if 1
             blockwise_in_copy.MoveSlicingWindowOnSourceTensor(I0, Number<EPerBlock>{}, True);
             // blockwise_wei_copy.MoveSlicingWindowOnSourceTensor(I0, Number<EPerBlock>{}, True);
             p_wei_block_on_global += EPerBlock * wei_e_k_global_desc.GetStride(I0);
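The two hunks above sit in the double-buffered main loop of the kernel: MoveSlicingWindowOnSourceTensor advances the source window by EPerBlock per iteration, so the next slice can be staged in registers while the GEMM consumes the current LDS image. A hedged host-side sketch of that pipeline (all sizes and the "compute" step are stand-ins, not the kernel's actual math):

    // Hedged simulation of an LDS double-buffer pipeline.
    #include <cstdio>
    #include <vector>

    int main()
    {
        constexpr int EPerBlock = 4, E = 16;   // assumed slice and total lengths
        std::vector<float> global_buf(E, 1.0f);
        float lds[2][EPerBlock] = {};          // the two LDS buffers
        float regs[EPerBlock];                 // register staging for the next slice
        float acc = 0.0f;
        int window = 0, cur = 0;

        for(int i = 0; i < EPerBlock; ++i)     // preload the first slice
            lds[cur][i] = global_buf[window + i];
        window += EPerBlock;                   // MoveSlicingWindow analogue

        while(window < E)
        {
            for(int i = 0; i < EPerBlock; ++i) // prefetch next slice into registers
                regs[i] = global_buf[window + i];
            for(int i = 0; i < EPerBlock; ++i) // "compute" on the current LDS buffer
                acc += lds[cur][i];
            for(int i = 0; i < EPerBlock; ++i) // commit registers to the other buffer
                lds[cur ^ 1][i] = regs[i];
            cur ^= 1;
            window += EPerBlock;
        }
        for(int i = 0; i < EPerBlock; ++i)     // tail: consume the last slice
            acc += lds[cur][i];
        std::printf("acc = %f\n", acc);        // 16 elements of 1.0 -> 16
    }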
@@ -436,19 +434,7 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer
                 out_k_n1_b_n2_global_merged_desc.GetOffsetFromMultiIndex(
                     k_thread_data_on_global, 0, b_thread_data_on_global, 0);
 
-#if 0
-            ThreadwiseGenericTensorSliceCopy_v1r1<
-                decltype(out_n0_n1_n2_k0_k1_k2_h_w_thread_desc),
-                decltype(out_n0_n1_n2_k0_k1_k2_h_w_global_mem_desc),
-                decltype(out_n0_n1_n2_k0_k1_k2_h_w_thread_desc.GetLengths()),
-                arithmetic_sequence_gen<0, 8, 1>::type,
-                arithmetic_sequence_gen<0, 8, 1>::type,
-                7,
-                7,
-                1,
-                1>(make_zero_array<index_t, 8>(), make_zero_array<index_t, 8>())
-                .Run(p_out_thread, p_out_thread_on_global);
-#elif 0
+#if 1
             ThreadwiseGenericTensorSliceCopy_v1r2<
                 decltype(out_n0_n1_n2_k0_k1_k2_h_w_thread_desc),
                 decltype(out_n0_n1_n2_k0_k1_k2_h_w_global_mem_desc),
@@ -237,47 +237,23 @@ struct BlockwiseGenericTensorSliceCopy_v1
                 thread_buffer_desc.GetOffsetFromMultiIndex(buffer_data_id_begin);
 #endif
 
-            // By positioning the origin of the per-thread window at the point where the
-            // multi-index of the SrcDesc (which might be a merged tensor) is all-zero, this
-            // threadwise slice copy assumes each thread copies a normal (not merged) tensor.
-            // To satisfy this assumption, the user needs to make sure that, on a merged
-            // dimension that contains multiple original dimensions, the length of the last
-            // original dimension is evenly divisible by its sub-length. Also, the
-            // repeat-length on the merged dimension needs to be 1. These sanity checks are
-            // performed in the constructor of BlockwiseGenericTensorSliceCopy_v1.
-#if 0
-            threadwise_generic_tensor_slice_copy_v1(SrcDesc{},
-                                                    p_src + src_offset + mThreadSrcOffset,
-                                                    make_zero_array<index_t, nDim>(),
-                                                    thread_buffer_desc,
-                                                    p_buffer + buffer_offset,
-                                                    make_zero_array<index_t, nDim>(),
-                                                    thread_sub_tensor_lengths,
-                                                    SrcDimAccessOrder{},
-                                                    Number<SrcDataPerAccess>{});
-#elif 1
-            ThreadwiseGenericTensorSliceCopy_v1r1<
-                SrcDesc,
-                decltype(thread_buffer_desc),
-                SubLengths,
-                SrcDimAccessOrder,
-                typename arithmetic_sequence_gen<0, nDim, 1>::type,
-                SrcVectorAccessDim,
-                0,
-                SrcDataPerAccess,
-                1>(make_zero_array<index_t, nDim>(), make_zero_array<index_t, nDim>())
+            // By positioning the origin of the per-thread window at the point where the
+            // multi-index of the SrcDesc (which might be a merged tensor) is all-zero, this
+            // threadwise slice copy assumes each thread copies a normal (not merged) tensor.
+            // To satisfy this assumption, the user needs to make sure that, on a merged
+            // dimension that contains multiple original dimensions, the length of the last
+            // original dimension is evenly divisible by its sub-length. Also, the
+            // repeat-length on the merged dimension needs to be 1. These sanity checks are
+            // performed in the constructor of BlockwiseGenericTensorSliceCopy_v1.
+            ThreadwiseGenericTensorSliceCopy_v1r2<SrcDesc,
+                                                  decltype(thread_buffer_desc),
+                                                  SubLengths,
+                                                  SrcDimAccessOrder,
+                                                  SrcVectorAccessDim,
+                                                  SrcDataPerAccess,
+                                                  1>(make_zero_array<index_t, nDim>(),
+                                                     make_zero_array<index_t, nDim>())
+                .Run(p_src + src_offset + mThreadSrcOffset, p_buffer + buffer_offset);
-#elif 1
-            ThreadwiseGenericTensorSliceCopy_v1r2<SrcDesc,
-                                                  decltype(thread_buffer_desc),
-                                                  SubLengths,
-                                                  SrcDimAccessOrder,
-                                                  SrcVectorAccessDim,
-                                                  SrcDataPerAccess,
-                                                  1>(make_zero_array<index_t, nDim>(),
-                                                     make_zero_array<index_t, nDim>())
-                .Run(p_src + src_offset + mThreadSrcOffset, p_buffer + buffer_offset);
-#endif
         });
     }
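The comment in this hunk states the constraint that makes the per-thread copy of a merged tensor legal. A minimal sketch of the sanity check it describes, with the folded dimensions and lengths assumed purely for illustration:

    // Hedged sketch: if a merged dimension B folds original dims (N, Ho, Wo), the
    // per-thread window stays inside one "row" of the last original dim only when
    // Wo % sub_length_b == 0 and the repeat-length on B is 1.
    #include <cstdio>

    int main()
    {
        constexpr int Wo           = 28; // last original dim folded into merged dim B (assumed)
        constexpr int sub_length_b = 4;  // per-thread slice length on B (assumed)
        constexpr int repeat_b     = 1;  // repeat-length on the merged dimension

        static_assert(Wo % sub_length_b == 0,
                      "last original dim must be evenly divisible by the sub-length");
        static_assert(repeat_b == 1, "repeat-length on a merged dimension must be 1");
        std::printf("merged-dimension constraints hold\n");
    }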
@@ -316,48 +292,23 @@ struct BlockwiseGenericTensorSliceCopy_v1
             const index_t dst_offset = DstDesc::GetOffsetFromMultiIndex(dst_data_id_begin);
 #endif
 
-            // By positioning the origin of the per-thread window at the point where the
-            // multi-index of the SrcDesc (which might be a merged tensor) is all-zero, this
-            // threadwise slice copy assumes each thread copies a normal (not merged) tensor.
-            // To satisfy this assumption, the user needs to make sure that, on a merged
-            // dimension that contains multiple original dimensions, the length of the last
-            // original dimension is evenly divisible by its sub-length. Also, the
-            // repeat-length on the merged dimension needs to be 1. These sanity checks are
-            // performed in the constructor of BlockwiseGenericTensorSliceCopy_v1.
-#if 0
-            threadwise_generic_tensor_slice_copy_v1(thread_buffer_desc,
-                                                    p_buffer + buffer_offset,
-                                                    make_zero_array<index_t, nDim>(),
-                                                    DstDesc{},
-                                                    p_dst + dst_offset + mThreadDstOffset,
-                                                    make_zero_array<index_t, nDim>(),
-                                                    thread_sub_tensor_lengths,
-                                                    DstDimAccessOrder{},
-                                                    Number<DstDataPerAccess>{});
-#elif 1
-            ThreadwiseGenericTensorSliceCopy_v1r1<
-                decltype(thread_buffer_desc),
-                DstDesc,
-                SubLengths,
-                typename arithmetic_sequence_gen<0, nDim, 1>::type,
-                DstDimAccessOrder,
-                0,
-                DstVectorAccessDim,
-                1,
-                DstDataPerAccess>(make_zero_array<index_t, nDim>(),
-                                  make_zero_array<index_t, nDim>())
+            // By positioning the origin of the per-thread window at the point where the
+            // multi-index of the SrcDesc (which might be a merged tensor) is all-zero, this
+            // threadwise slice copy assumes each thread copies a normal (not merged) tensor.
+            // To satisfy this assumption, the user needs to make sure that, on a merged
+            // dimension that contains multiple original dimensions, the length of the last
+            // original dimension is evenly divisible by its sub-length. Also, the
+            // repeat-length on the merged dimension needs to be 1. These sanity checks are
+            // performed in the constructor of BlockwiseGenericTensorSliceCopy_v1.
+            ThreadwiseGenericTensorSliceCopy_v1r2<decltype(thread_buffer_desc),
+                                                  DstDesc,
+                                                  SubLengths,
+                                                  DstDimAccessOrder,
+                                                  DstVectorAccessDim,
+                                                  1,
+                                                  DstDataPerAccess>(
+                make_zero_array<index_t, nDim>(), make_zero_array<index_t, nDim>())
+                .Run(p_buffer + buffer_offset, p_dst + dst_offset + mThreadDstOffset);
-#elif 1
-            ThreadwiseGenericTensorSliceCopy_v1r2<decltype(thread_buffer_desc),
-                                                  DstDesc,
-                                                  SubLengths,
-                                                  DstDimAccessOrder,
-                                                  DstVectorAccessDim,
-                                                  1,
-                                                  DstDataPerAccess>(make_zero_array<index_t, nDim>(),
-                                                                    make_zero_array<index_t, nDim>())
-                .Run(p_buffer + buffer_offset, p_dst + dst_offset + mThreadDstOffset);
-#endif
         });
     }
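Both hunks above replace a ThreadwiseGenericTensorSliceCopy_v1r1 instantiation with v1r2. Judging only from the calls shown here, v1r1 is parameterized with access orders and vector dimensions for both sides of the copy, while v1r2 needs them only for the non-buffer side. A hedged mock-up (these are not the real CK classes; arities are inferred from this diff alone) contrasting the two call shapes:

    #include <cstdio>

    // both sides parameterized, as in the removed v1r1 call
    template <int SrcVectorDim, int DstVectorDim, int SrcDataPerAccess, int DstDataPerAccess>
    struct MockCopyV1r1
    {
        void Run(const float*, float*) const { std::printf("v1r1\n"); }
    };

    // only the "far" side parameterized; the buffer side is scalar (1)
    template <int VectorDim, int SrcDataPerAccess, int DstDataPerAccess>
    struct MockCopyV1r2
    {
        void Run(const float*, float*) const { std::printf("v1r2\n"); }
    };

    int main()
    {
        float buf[8] = {}, dst[8] = {};
        MockCopyV1r1<0, 1, 1, 4>{}.Run(buf, dst);
        MockCopyV1r2<1, 1, 4>{}.Run(buf, dst);
    }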
@@ -6,10 +6,6 @@
 #include "ConstantMergedTensorDescriptor.hpp"
 #include "tensor_coordinate.hpp"
 
-#ifndef CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V1
-#define CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V1 0
-#endif
-
 #ifndef CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V1R1
 #define CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V1R1 0
 #endif
@@ -24,100 +20,6 @@
 
 namespace ck {
 
-// the user needs to make sure the alignment requirement is satisfied when setting DataPerAccess > 1
-template <class Float,
-          class SrcDesc,
-          class DstDesc,
-          class SliceLengths,
-          class DimAccessOrder,
-          index_t DataPerAccess>
-__device__ void threadwise_generic_tensor_slice_copy_v1(
-    SrcDesc,
-    const Float* __restrict__ p_src,
-    Array<index_t, SrcDesc::GetNumOfDimension()> src_multi_id_begin,
-    DstDesc,
-    Float* __restrict__ p_dst,
-    Array<index_t, DstDesc::GetNumOfDimension()> dst_multi_id_begin,
-    SliceLengths,
-    DimAccessOrder,
-    Number<DataPerAccess>)
-{
-    constexpr index_t nDim = SrcDesc::GetNumOfDimension();
-
-    static_assert(nDim == SrcDesc::GetNumOfDimension() && nDim == DstDesc::GetNumOfDimension() &&
-                      nDim == SliceLengths::GetSize() && nDim == DimAccessOrder::GetSize(),
-                  "wrong! # of dimensions not the same");
-
-    static_assert(is_valid_sequence_map<DimAccessOrder>::value, "wrong! map is not valid");
-
-    // TODO: do more sanity checks here, something like:
-    // constexpr auto src_strides_in_access_order =
-    //     SrcDesc::ReorderGivenNew2Old(DimAccessOrder{}).GetStride(Number<nDim-1>{});
-
-    // constexpr auto dst_strides_in_access_order =
-    //     SrcDesc::ReorderGivenNew2Old(DimAccessOrder{}).GetStride(Number<nDim-1>{});
-
-    // // check src/dst stride on the lowest access dimension
-    // static_assert((DataPerAccess == 1 || src_strides_in_access_order.Back() == 1) &&
-    //                   (DataPerAccess == 1 || dst_strides_in_access_order.Back() == 1),
-    //               "wrong! src/dst stride on the lowest access dimension needs to be 1 for "
-    //               "vectorized read/write");
-
-    constexpr auto slice_lengths_in_access_order =
-        SliceLengths::ReorderGivenNew2Old(DimAccessOrder{});
-
-    // check slice length on the lowest access dimension
-    static_assert(slice_lengths_in_access_order.Back() % DataPerAccess == 0,
-                  "wrong! slice length on the lowest access dimension should be evenly divisible by "
-                  "DataPerAccess");
-
-    constexpr index_t num_access_on_lowest_access_dimension =
-        slice_lengths_in_access_order.Back() / DataPerAccess;
-
-    constexpr auto access_lengths = slice_lengths_in_access_order.Modify(
-        Number<nDim - 1>{}, Number<num_access_on_lowest_access_dimension>{});
-
-    using vector_t = typename vector_type<Float, DataPerAccess>::MemoryType;
-
-#if CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V1
-    static_ford<decltype(access_lengths)>{}([&](auto access_multi_id) {
-        constexpr index_t itmp = access_multi_id.Back() * DataPerAccess;
-
-        constexpr auto data_multi_id_in_access_order =
-            access_multi_id.Modify(Number<nDim - 1>{}, Number<itmp>{});
-
-        constexpr auto data_multi_id =
-            data_multi_id_in_access_order.ReorderGivenOld2New(DimAccessOrder{});
-
-        const index_t src_index =
-            SrcDesc::GetOffsetFromMultiIndex(src_multi_id_begin + data_multi_id);
-
-        const index_t dst_index =
-            DstDesc::GetOffsetFromMultiIndex(dst_multi_id_begin + data_multi_id);
-
-        *reinterpret_cast<vector_t*>(&p_dst[dst_index]) =
-            *reinterpret_cast<const vector_t*>(&p_src[src_index]);
-    });
-#else
-    ford<decltype(access_lengths)>{}([&](auto access_multi_id) {
-        auto data_multi_id_in_access_order = access_multi_id;
-        data_multi_id_in_access_order(nDim - 1) = access_multi_id[nDim - 1] * DataPerAccess;
-
-        const auto data_multi_id =
-            reorder_array_given_old2new(data_multi_id_in_access_order, DimAccessOrder{});
-
-        const index_t src_index =
-            SrcDesc::GetOffsetFromMultiIndex(src_multi_id_begin + data_multi_id);
-
-        const index_t dst_index =
-            DstDesc::GetOffsetFromMultiIndex(dst_multi_id_begin + data_multi_id);
-
-        *reinterpret_cast<vector_t*>(&p_dst[dst_index]) =
-            *reinterpret_cast<const vector_t*>(&p_src[src_index]);
-    });
-#endif
-}
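The deleted function's core index math survives in its successors: the slice lengths are reordered into access order, the lowest access dimension is divided by DataPerAccess, and each iteration of the `ford` loop moves one vector of DataPerAccess contiguous elements. A standalone rendering of that loop (the 2D shape, strides, and memcpy-as-vector-access below are assumptions standing in for CK's descriptors and vector_type):

    #include <cstdio>
    #include <cstring>
    #include <vector>

    int main()
    {
        constexpr int L0 = 4, L1 = 8;    // slice lengths (assumed)
        constexpr int DataPerAccess = 4; // vector width on the lowest access dim
        static_assert(L1 % DataPerAccess == 0, "lowest access dim must divide evenly");
        constexpr int src_stride0 = 8, dst_stride0 = 8; // unit stride on dim 1 (assumed)

        std::vector<float> src(L0 * src_stride0, 2.0f), dst(L0 * dst_stride0, 0.0f);

        // access_lengths = {L0, L1 / DataPerAccess}
        for(int i0 = 0; i0 < L0; ++i0)
            for(int a1 = 0; a1 < L1 / DataPerAccess; ++a1)
            {
                int i1 = a1 * DataPerAccess; // data index on the lowest access dim
                // one vectorized access: DataPerAccess contiguous floats
                std::memcpy(&dst[i0 * dst_stride0 + i1],
                            &src[i0 * src_stride0 + i1],
                            DataPerAccess * sizeof(float));
            }
        std::printf("moved %d vectors of %d floats\n",
                    L0 * (L1 / DataPerAccess), DataPerAccess);
    }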
 
 // This threadwise copy allows vectorized access of both src and dst.
 // It allows the dimension of vectorized access to be different on src and dst.
 // It also allows the vector size to be different on src and dst.
 
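A hedged sketch of what this comment promises: below, the source is read in 4-wide vectors along its unit-stride dimension while a transposed destination, whose unit-stride dimension differs, is written element by element. The shapes and the transpose are illustrative assumptions, not the v1r2 implementation.

    #include <cstdio>
    #include <cstring>
    #include <vector>

    int main()
    {
        constexpr int H = 4, W = 8, SrcVec = 4;          // src vector width on dim W
        std::vector<float> src(H * W), dst(H * W, 0.0f); // dst is W x H (transposed)
        for(int i = 0; i < H * W; ++i) src[i] = float(i);

        for(int h = 0; h < H; ++h)
            for(int wv = 0; wv < W / SrcVec; ++wv)
            {
                float v[SrcVec];
                std::memcpy(v, &src[h * W + wv * SrcVec], sizeof v); // vector load
                for(int k = 0; k < SrcVec; ++k)                      // scalar stores
                    dst[(wv * SrcVec + k) * H + h] = v[k];
            }
        std::printf("dst[1*H+0] = %f (== src[0*W+1])\n", dst[1 * H + 0]); // 1.0
    }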
@@ -379,7 +379,7 @@ int main(int argc, char* argv[])
 #elif 0
     device_convolution_implicit_gemm_v3_nchw_cyxk_nkhw(
         in_nchw_desc, in_nchw, wei_kcyx_desc, wei_kcyx, out_nkhw_desc, out_nkhw_device, nrepeat);
-#elif 0
+#elif 1
     device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw(in_nchw_desc,
                                                          in_nchw,
                                                          wei_kcyx_desc,