mirror of
https://github.com/ROCm/composable_kernel.git
synced 2026-05-12 17:26:00 +00:00
added implicit gemm v4 (nchw, kcyx)
This commit is contained in:
@@ -215,7 +215,7 @@ struct ConstantTensorDescriptor
|
||||
|
||||
// do carry check in reversed order, starting from lowest dimension
|
||||
// don't check the highest dimension
|
||||
static_for<0, nDim - 1, 1>{}([&](auto IDimReverse) {
|
||||
static_for<0, nDim, 1>{}([&](auto IDimReverse) {
|
||||
constexpr index_t idim = nDim - 1 - IDimReverse.Get();
|
||||
constexpr auto IDim = Number<idim>{};
|
||||
|
||||
@@ -241,7 +241,7 @@ struct ConstantTensorDescriptor
|
||||
|
||||
// do borrow check in reversed order, starting from lowest dimension
|
||||
// don't check the highest dimension
|
||||
static_for<0, nDim - 1, 1>{}([&](auto IDimReverse) {
|
||||
static_for<0, nDim, 1>{}([&](auto IDimReverse) {
|
||||
constexpr index_t idim = nDim - 1 - IDimReverse.Get();
|
||||
constexpr auto IDim = Number<idim>{};
|
||||
|
||||
|
||||
@@ -3,7 +3,7 @@
|
||||
|
||||
// slice a (normal or merged) tensor, and copy it into another (normal or merged) tensor
|
||||
// memory layout (ordering of dimensions) can be different between src and dst
|
||||
// For now, only support SubLengths == 1 on a merged dimension
|
||||
// For now, only support SubLengths[...] == 1 on a merged dimension
|
||||
template <index_t BlockSize,
|
||||
class Float,
|
||||
class SrcDesc,
|
||||
@@ -84,8 +84,8 @@ struct BlockwiseGenericTensorSliceCopy_v1
|
||||
|
||||
constexpr auto repeat_lengths = SliceLengths{} / data_per_cluster_per_dims;
|
||||
|
||||
// for now, only support SubLengths.Get() == 1 on a merged dimension that is merged from
|
||||
// multiple dimensions
|
||||
// for now, only support SubLengths.Get() == 1 on a merged dimension that contains
|
||||
// multiple original dimensions
|
||||
static_for<0, nDim, 1>{}([&](auto IDim_) {
|
||||
constexpr auto IDim = decltype(IDim_){};
|
||||
|
||||
@@ -292,7 +292,8 @@ struct BlockwiseGenericTensorSliceCopy_v1
|
||||
|
||||
static_if<SrcDesc::ContainMultipleOriginalDimensions(IDim)>{}([&](auto fwd) {
|
||||
// logic for a merged dimension, also works for non-merged dimension, but its logic may
|
||||
// be unnecessarily complicated for compiler to remove useless calculations
|
||||
// be unnecessarily complicated for compiler to remove calculations that are useless for
|
||||
// a non-merged dimension
|
||||
|
||||
// extract partial original dimensions
|
||||
constexpr auto src_partial_original_dims =
|
||||
@@ -309,6 +310,27 @@ struct BlockwiseGenericTensorSliceCopy_v1
|
||||
src_partial_original_desc.UpdateMultiIndexGivenStepSizeOf1dIndex(
|
||||
old_src_partial_original_multi_id, StepSize, direction);
|
||||
|
||||
#if 0
|
||||
{
|
||||
if(debug_flag && get_block_1d_id() == 0)
|
||||
{
|
||||
printf("id %5u %5u: "
|
||||
"old_src_partial_original_multi_id %u %u %u, "
|
||||
"new_src_partial_original_multi_id %u %u %u, "
|
||||
"mThreadSrcOffset %u, mThreadDstOffset %u \n",
|
||||
get_block_1d_id(),
|
||||
get_thread_local_1d_id(),
|
||||
old_src_partial_original_multi_id[0],
|
||||
old_src_partial_original_multi_id[1],
|
||||
old_src_partial_original_multi_id[2],
|
||||
new_src_partial_original_multi_id[0],
|
||||
new_src_partial_original_multi_id[1],
|
||||
new_src_partial_original_multi_id[2]
|
||||
);
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
// update "mThreadSrcOriginalMultiId"
|
||||
static_for<0, src_partial_original_dims.GetSize(), 1>{}([&](auto I_) {
|
||||
constexpr auto I = decltype(I_){};
|
||||
|
||||
@@ -255,28 +255,14 @@ struct GridwiseConvolutionImplicitGemm_v4_nchw_kcyx_nkhw
|
||||
for(index_t e = 0; e < E; e += EPerBlock)
|
||||
{
|
||||
#if 0
|
||||
if(e == 1 * EPerBlock && get_block_1d_id() == 0)
|
||||
if(e == 0 * EPerBlock && get_block_1d_id() == 0)
|
||||
{
|
||||
printf("id %5u %5u: "
|
||||
"mThreadSrcOriginalMultiId %u %u %u %u %u %u %u %u, "
|
||||
"mThreadSrcPartialOffsets %u %u %u %u, "
|
||||
"mThreadSrcOffset %u, mThreadDstOffset %u \n",
|
||||
get_block_1d_id(),
|
||||
get_thread_local_1d_id(),
|
||||
blockwise_in_copy.mThreadSrcOriginalMultiId[0],
|
||||
blockwise_in_copy.mThreadSrcOriginalMultiId[1],
|
||||
blockwise_in_copy.mThreadSrcOriginalMultiId[2],
|
||||
blockwise_in_copy.mThreadSrcOriginalMultiId[3],
|
||||
blockwise_in_copy.mThreadSrcOriginalMultiId[4],
|
||||
blockwise_in_copy.mThreadSrcOriginalMultiId[5],
|
||||
blockwise_in_copy.mThreadSrcOriginalMultiId[6],
|
||||
blockwise_in_copy.mThreadSrcOriginalMultiId[7],
|
||||
blockwise_in_copy.mThreadSrcPartialOffsets[0],
|
||||
blockwise_in_copy.mThreadSrcPartialOffsets[1],
|
||||
blockwise_in_copy.mThreadSrcPartialOffsets[2],
|
||||
blockwise_in_copy.mThreadSrcPartialOffsets[3],
|
||||
blockwise_in_copy.mThreadSrcOffset,
|
||||
blockwise_in_copy.mThreadDstOffset);
|
||||
blockwise_wei_copy.mThreadSrcOffset,
|
||||
blockwise_wei_copy.mThreadDstOffset);
|
||||
}
|
||||
#endif
|
||||
// marching slicing window
|
||||
|
||||
Reference in New Issue
Block a user