mirror of
https://github.com/ROCm/composable_kernel.git
synced 2026-05-11 17:00:18 +00:00
use vectorized read and write for threadwise generic tensor copy
This commit is contained in:
@@ -15,7 +15,7 @@ template <index_t BlockSize,
|
||||
class SrcAccessOrder,
|
||||
class DstAccessOrder,
|
||||
index_t SrcDataPerRead,
|
||||
index_t DstDataPerRead>
|
||||
index_t DstDataPerWrite>
|
||||
struct BlockwiseGenericTensorSliceCopy_v1
|
||||
{
|
||||
static constexpr index_t nDim = SrcDesc::GetNumOfDimension();
|
||||
@@ -217,14 +217,15 @@ struct BlockwiseGenericTensorSliceCopy_v1
|
||||
const index_t clipboard_offset = thread_tensor_desc.GetOffsetFromMultiIndex(
|
||||
clipboard_data_multi_id_begin); // cannot not constexpr, why?
|
||||
|
||||
threadwise_generic_tensor_slice_copy(SrcDesc{},
|
||||
p_src + src_offset + mThreadSrcOffset,
|
||||
make_zero_array<index_t, nDim>(),
|
||||
thread_tensor_desc,
|
||||
p_clipboard + clipboard_offset,
|
||||
make_zero_array<index_t, nDim>(),
|
||||
thread_sub_tensor_lengths,
|
||||
SrcAccessOrder{});
|
||||
threadwise_generic_tensor_slice_copy_v1(SrcDesc{},
|
||||
p_src + src_offset + mThreadSrcOffset,
|
||||
make_zero_array<index_t, nDim>(),
|
||||
thread_tensor_desc,
|
||||
p_clipboard + clipboard_offset,
|
||||
make_zero_array<index_t, nDim>(),
|
||||
thread_sub_tensor_lengths,
|
||||
SrcAccessOrder{},
|
||||
Number<SrcDataPerRead>{});
|
||||
});
|
||||
}
|
||||
|
||||
@@ -255,14 +256,15 @@ struct BlockwiseGenericTensorSliceCopy_v1
|
||||
const index_t dst_offset = DstDesc{}.GetOffsetFromMultiIndex(
|
||||
dst_data_multi_id_begin); // cannot not constexpr, why?
|
||||
|
||||
threadwise_generic_tensor_slice_copy(thread_tensor_desc,
|
||||
p_clipboard + clipboard_offset,
|
||||
make_zero_array<index_t, nDim>(),
|
||||
DstDesc{},
|
||||
p_dst + dst_offset + mThreadDstOffset,
|
||||
make_zero_array<index_t, nDim>(),
|
||||
thread_sub_tensor_lengths,
|
||||
DstAccessOrder{});
|
||||
threadwise_generic_tensor_slice_copy_v1(thread_tensor_desc,
|
||||
p_clipboard + clipboard_offset,
|
||||
make_zero_array<index_t, nDim>(),
|
||||
DstDesc{},
|
||||
p_dst + dst_offset + mThreadDstOffset,
|
||||
make_zero_array<index_t, nDim>(),
|
||||
thread_sub_tensor_lengths,
|
||||
DstAccessOrder{},
|
||||
Number<DstDataPerWrite>{});
|
||||
});
|
||||
}
|
||||
|
||||
|
||||
@@ -105,4 +105,13 @@ __host__ __device__ constexpr T min(T x, Ts... xs)
|
||||
|
||||
return x < y ? x : y;
|
||||
}
|
||||
|
||||
// this is wrong
|
||||
// TODO: implement correct least common multiple, instead of calling max()
|
||||
template <class T, class... Ts>
|
||||
__host__ __device__ constexpr T least_common_multiple(T x, Ts... xs)
|
||||
{
|
||||
return max(x, xs...);
|
||||
}
|
||||
|
||||
} // namespace mod_conv
|
||||
|
||||
@@ -3,8 +3,7 @@
|
||||
|
||||
// this is ugly, only for 4d
|
||||
template <class InDesc, class WeiDesc>
|
||||
constexpr auto get_convolution_output_default_4d_tensor_descriptor(InDesc,
|
||||
WeiDesc)
|
||||
constexpr auto get_convolution_output_default_4d_tensor_descriptor(InDesc, WeiDesc)
|
||||
{
|
||||
constexpr auto in_desc = InDesc{};
|
||||
constexpr auto wei_desc = WeiDesc{};
|
||||
@@ -34,8 +33,10 @@ constexpr auto get_convolution_output_default_4d_tensor_descriptor(InDesc,
|
||||
}
|
||||
|
||||
template <class InDesc, class WeiDesc, class LowerPads, class UpperPads>
|
||||
constexpr auto get_convolution_with_padding_output_default_4d_tensor_descriptor(
|
||||
InDesc, WeiDesc, LowerPads, UpperPads)
|
||||
constexpr auto get_convolution_with_padding_output_default_4d_tensor_descriptor(InDesc,
|
||||
WeiDesc,
|
||||
LowerPads,
|
||||
UpperPads)
|
||||
{
|
||||
constexpr auto in_desc = InDesc{};
|
||||
constexpr auto wei_desc = WeiDesc{};
|
||||
@@ -113,5 +114,6 @@ constexpr std::size_t calculate_convolution_memory_size(Float, InDesc, WeiDesc,
|
||||
constexpr index_t Y = wei_desc.GetLength(I2);
|
||||
constexpr index_t X = wei_desc.GetLength(I3);
|
||||
|
||||
return sizeof(Float) * (InDesc::GetElementSpace() + WeiDesc::GetElementSpace() + OutDesc::GetElementSpace());
|
||||
return sizeof(Float) *
|
||||
(InDesc::GetElementSpace() + WeiDesc::GetElementSpace() + OutDesc::GetElementSpace());
|
||||
}
|
||||
|
||||
@@ -474,7 +474,7 @@ struct GridwiseConvolutionImplicitGemm_v1r3_lds_double_buffer_nchw_cyxk_nkhw
|
||||
map_out_global2thread,
|
||||
Number<OutThreadCopyDataPerWrite_W>{});
|
||||
#else
|
||||
threadwise_generic_tensor_slice_copy(
|
||||
threadwise_generic_tensor_slice_copy_v1(
|
||||
out_10d_thread_desc.ReorderGivenNew2Old(map_out_global2thread),
|
||||
p_out_thread,
|
||||
make_zero_array<index_t, 10>(),
|
||||
|
||||
@@ -423,7 +423,7 @@ struct GridwiseConvolutionImplicitGemm_v1r3_nchw_cyxk_nkhw
|
||||
map_out_global2thread,
|
||||
Number<OutThreadCopyDataPerWrite_W>{});
|
||||
#else
|
||||
threadwise_generic_tensor_slice_copy(
|
||||
threadwise_generic_tensor_slice_copy_v1(
|
||||
out_10d_thread_desc.ReorderGivenNew2Old(map_out_global2thread),
|
||||
p_out_thread,
|
||||
make_zero_array<index_t, 10>(),
|
||||
|
||||
@@ -405,14 +405,15 @@ struct GridwiseConvolutionImplicitGemm_v3_lds_double_buffer_nchw_cyxk_nkhw
|
||||
out_k_n1_b_n2_global_merged_desc.GetOffsetFromMultiIndex(
|
||||
k_thread_data_on_global, 0, b_thread_data_on_global, 0);
|
||||
|
||||
threadwise_generic_tensor_slice_copy(out_n0_n1_n2_k0_k1_k2_h_w_thread_desc,
|
||||
p_out_thread,
|
||||
{0, 0, 0, 0, 0, 0, 0, 0},
|
||||
out_n0_n1_n2_k0_k1_k2_h_w_global_mem_desc,
|
||||
p_out_thread_on_global,
|
||||
{0, 0, 0, 0, 0, 0, 0, 0},
|
||||
out_n0_n1_n2_k0_k1_k2_h_w_thread_desc.GetLengths(),
|
||||
arithmetic_sequence_gen<0, 8, 1>::SeqType{});
|
||||
threadwise_generic_tensor_slice_copy_v1(
|
||||
out_n0_n1_n2_k0_k1_k2_h_w_thread_desc,
|
||||
p_out_thread,
|
||||
{0, 0, 0, 0, 0, 0, 0, 0},
|
||||
out_n0_n1_n2_k0_k1_k2_h_w_global_mem_desc,
|
||||
p_out_thread_on_global,
|
||||
{0, 0, 0, 0, 0, 0, 0, 0},
|
||||
out_n0_n1_n2_k0_k1_k2_h_w_thread_desc.GetLengths(),
|
||||
arithmetic_sequence_gen<0, 8, 1>::SeqType{});
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
@@ -358,14 +358,15 @@ struct GridwiseConvolutionImplicitGemm_v3_nchw_cyxk_nkhw
|
||||
out_k_n1_b_n2_global_merged_desc.GetOffsetFromMultiIndex(
|
||||
k_thread_data_on_global, 0, b_thread_data_on_global, 0);
|
||||
|
||||
threadwise_generic_tensor_slice_copy(out_n0_n1_n2_k0_k1_k2_h_w_thread_desc,
|
||||
p_out_thread,
|
||||
{0, 0, 0, 0, 0, 0, 0, 0},
|
||||
out_n0_n1_n2_k0_k1_k2_h_w_global_mem_desc,
|
||||
p_out_thread_on_global,
|
||||
{0, 0, 0, 0, 0, 0, 0, 0},
|
||||
out_n0_n1_n2_k0_k1_k2_h_w_thread_desc.GetLengths(),
|
||||
arithmetic_sequence_gen<0, 8, 1>::SeqType{});
|
||||
threadwise_generic_tensor_slice_copy_v1(
|
||||
out_n0_n1_n2_k0_k1_k2_h_w_thread_desc,
|
||||
p_out_thread,
|
||||
{0, 0, 0, 0, 0, 0, 0, 0},
|
||||
out_n0_n1_n2_k0_k1_k2_h_w_global_mem_desc,
|
||||
p_out_thread_on_global,
|
||||
{0, 0, 0, 0, 0, 0, 0, 0},
|
||||
out_n0_n1_n2_k0_k1_k2_h_w_thread_desc.GetLengths(),
|
||||
arithmetic_sequence_gen<0, 8, 1>::SeqType{});
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
@@ -5,7 +5,7 @@
|
||||
#include "ConstantMatrixDescriptor.hip.hpp"
|
||||
#include "blockwise_generic_tensor_slice_op.hip.hpp"
|
||||
#include "blockwise_gemm.hip.hpp"
|
||||
#include "threadwise_tensor_slice_op.hip.hpp"
|
||||
#include "threadwise_generic_tensor_slice_op.hip.hpp"
|
||||
|
||||
// define B = merge(N, Ho, Wo)
|
||||
template <index_t GridSize,
|
||||
@@ -170,7 +170,7 @@ struct GridwiseConvolutionImplicitGemm_v4_lds_double_buffer_nchw_kcyx_nkhw
|
||||
// operator for blockwise copy of weight into LDS
|
||||
// slice a tensor, and copy it into another tensor
|
||||
// this copy operator already have blockwise offset built-in
|
||||
#if 0
|
||||
#if 1
|
||||
auto blockwise_wei_copy =
|
||||
BlockwiseGenericTensorSliceCopy_v1<BlockSize,
|
||||
Float,
|
||||
@@ -188,7 +188,7 @@ struct GridwiseConvolutionImplicitGemm_v4_lds_double_buffer_nchw_kcyx_nkhw
|
||||
#else
|
||||
constexpr auto map_k_e_2_e_k = Sequence<1, 0>{};
|
||||
|
||||
const auto blockwise_wei_copy = BlockwiseTensorSliceReorderCopy_v3<
|
||||
auto blockwise_wei_copy = BlockwiseTensorSliceReorderCopy_v3<
|
||||
BlockSize,
|
||||
Float,
|
||||
decltype(wei_e_k_global_desc.ReorderGivenNew2Old(map_k_e_2_e_k)),
|
||||
@@ -324,11 +324,7 @@ struct GridwiseConvolutionImplicitGemm_v4_lds_double_buffer_nchw_kcyx_nkhw
|
||||
Float p_wei_register_clipboard[blockwise_wei_copy.GetRegisterClipboardSize()];
|
||||
|
||||
blockwise_in_copy.MoveSlicingWindowOnSourceTensor(I0, Number<EPerBlock>{}, True);
|
||||
#if 0
|
||||
blockwise_wei_copy.MoveSlicingWindowOnSourceTensor(I0, Number<EPerBlock>{}, True);
|
||||
#else
|
||||
p_wei_block_on_global += EPerBlock * wei_e_k_global_desc.GetStride(I0);
|
||||
#endif
|
||||
|
||||
__syncthreads();
|
||||
|
||||
@@ -355,11 +351,7 @@ struct GridwiseConvolutionImplicitGemm_v4_lds_double_buffer_nchw_kcyx_nkhw
|
||||
|
||||
// even iteration
|
||||
blockwise_in_copy.MoveSlicingWindowOnSourceTensor(I0, Number<EPerBlock>{}, True);
|
||||
#if 0
|
||||
blockwise_wei_copy.MoveSlicingWindowOnSourceTensor(I0, Number<EPerBlock>{}, True);
|
||||
#else
|
||||
p_wei_block_on_global += EPerBlock * wei_e_k_global_desc.GetStride(I0);
|
||||
#endif
|
||||
|
||||
__syncthreads();
|
||||
|
||||
@@ -434,14 +426,16 @@ struct GridwiseConvolutionImplicitGemm_v4_lds_double_buffer_nchw_kcyx_nkhw
|
||||
out_k_n1_b_n2_global_merged_desc.GetOffsetFromMultiIndex(
|
||||
k_thread_data_on_global, 0, b_thread_data_on_global, 0);
|
||||
|
||||
threadwise_generic_tensor_slice_copy(out_n0_n1_n2_k0_k1_k2_h_w_thread_desc,
|
||||
p_out_thread,
|
||||
{0, 0, 0, 0, 0, 0, 0, 0},
|
||||
out_n0_n1_n2_k0_k1_k2_h_w_global_mem_desc,
|
||||
p_out_thread_on_global,
|
||||
{0, 0, 0, 0, 0, 0, 0, 0},
|
||||
out_n0_n1_n2_k0_k1_k2_h_w_thread_desc.GetLengths(),
|
||||
arithmetic_sequence_gen<0, 8, 1>::SeqType{});
|
||||
threadwise_generic_tensor_slice_copy_v1(
|
||||
out_n0_n1_n2_k0_k1_k2_h_w_thread_desc,
|
||||
p_out_thread,
|
||||
{0, 0, 0, 0, 0, 0, 0, 0},
|
||||
out_n0_n1_n2_k0_k1_k2_h_w_global_mem_desc,
|
||||
p_out_thread_on_global,
|
||||
{0, 0, 0, 0, 0, 0, 0, 0},
|
||||
out_n0_n1_n2_k0_k1_k2_h_w_thread_desc.GetLengths(),
|
||||
arithmetic_sequence_gen<0, 8, 1>::SeqType{},
|
||||
Number<1>{});
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
@@ -5,7 +5,7 @@
|
||||
#include "ConstantMatrixDescriptor.hip.hpp"
|
||||
#include "blockwise_generic_tensor_slice_op.hip.hpp"
|
||||
#include "blockwise_gemm.hip.hpp"
|
||||
#include "threadwise_tensor_slice_op.hip.hpp"
|
||||
#include "threadwise_generic_tensor_slice_op.hip.hpp"
|
||||
|
||||
// define B = merge(N, Ho, Wo)
|
||||
template <index_t GridSize,
|
||||
@@ -327,14 +327,15 @@ struct GridwiseConvolutionImplicitGemm_v4_nchw_kcyx_nkhw
|
||||
out_k_n1_b_n2_global_merged_desc.GetOffsetFromMultiIndex(
|
||||
k_thread_data_on_global, 0, b_thread_data_on_global, 0);
|
||||
|
||||
threadwise_generic_tensor_slice_copy(out_n0_n1_n2_k0_k1_k2_h_w_thread_desc,
|
||||
p_out_thread,
|
||||
{0, 0, 0, 0, 0, 0, 0, 0},
|
||||
out_n0_n1_n2_k0_k1_k2_h_w_global_mem_desc,
|
||||
p_out_thread_on_global,
|
||||
{0, 0, 0, 0, 0, 0, 0, 0},
|
||||
out_n0_n1_n2_k0_k1_k2_h_w_thread_desc.GetLengths(),
|
||||
arithmetic_sequence_gen<0, 8, 1>::SeqType{});
|
||||
threadwise_generic_tensor_slice_copy_v1(
|
||||
out_n0_n1_n2_k0_k1_k2_h_w_thread_desc,
|
||||
p_out_thread,
|
||||
{0, 0, 0, 0, 0, 0, 0, 0},
|
||||
out_n0_n1_n2_k0_k1_k2_h_w_global_mem_desc,
|
||||
p_out_thread_on_global,
|
||||
{0, 0, 0, 0, 0, 0, 0, 0},
|
||||
out_n0_n1_n2_k0_k1_k2_h_w_thread_desc.GetLengths(),
|
||||
arithmetic_sequence_gen<0, 8, 1>::SeqType{});
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
99
src/include/threadwise_generic_tensor_slice_op.hip.hpp
Normal file
99
src/include/threadwise_generic_tensor_slice_op.hip.hpp
Normal file
@@ -0,0 +1,99 @@
|
||||
#pragma once
|
||||
#include "ConstantTensorDescriptor.hip.hpp"
|
||||
#include "ConstantMergedTensorDescriptor.hip.hpp"
|
||||
|
||||
template <class Float,
|
||||
class SrcDesc,
|
||||
class DstDesc,
|
||||
class SliceLengths,
|
||||
class DimAccessOrder,
|
||||
index_t DataPerAccess>
|
||||
__device__ void threadwise_generic_tensor_slice_copy_v1(
|
||||
SrcDesc,
|
||||
const Float* __restrict__ p_src,
|
||||
Array<index_t, SrcDesc::GetNumOfDimension()> src_multi_id_begin,
|
||||
DstDesc,
|
||||
Float* __restrict__ p_dst,
|
||||
Array<index_t, DstDesc::GetNumOfDimension()> dst_multi_id_begin,
|
||||
SliceLengths,
|
||||
DimAccessOrder,
|
||||
Number<DataPerAccess>)
|
||||
{
|
||||
constexpr index_t nDim = SrcDesc::GetNumOfDimension();
|
||||
|
||||
static_assert(nDim == SrcDesc::GetNumOfDimension() && nDim == DstDesc::GetNumOfDimension() &&
|
||||
nDim == SliceLengths::GetSize() && nDim == DimAccessOrder::GetSize(),
|
||||
"wrong! # of dimensions not the same");
|
||||
|
||||
static_assert(is_valid_sequence_map<DimAccessOrder>::value, "wrong! map is not valid");
|
||||
|
||||
#if 0
|
||||
// doesn't compile, because merged-tensor reordering is not implemented
|
||||
// TODO: implement tensor desc ops for merged-tensor
|
||||
constexpr auto src_strides_in_access_order =
|
||||
SrcDesc::ReorderGivenNew2Old(DimAccessOrder{}).GetStride(Number<nDim-1>{});
|
||||
|
||||
constexpr auto dst_strides_in_access_order =
|
||||
SrcDesc::ReorderGivenNew2Old(DimAccessOrder{}).GetStride(Number<nDim-1>{});
|
||||
|
||||
// check src/dst stride on the lowest access dimension
|
||||
static_assert((DataPerAccess == 1 || src_strides_in_access_order.Back() == 1) &&
|
||||
(DataPerAccess == 1 || dst_strides_in_access_order.Back() == 1),
|
||||
"wrong! src/dst stride on the lowest access dimension needs to be 1 for "
|
||||
"vectorized read/write");
|
||||
#endif
|
||||
|
||||
constexpr auto slice_lengths_in_access_order =
|
||||
SliceLengths::ReorderGivenNew2Old(DimAccessOrder{});
|
||||
|
||||
// check slice length on the lowest access dimension
|
||||
static_assert(slice_lengths_in_access_order.Back() % DataPerAccess == 0,
|
||||
"wrong! slice length on the lowest access dimension should be evenly divided by "
|
||||
"DataPerAccess");
|
||||
|
||||
constexpr index_t num_access_on_lowest_access_dimension =
|
||||
slice_lengths_in_access_order.Back() / DataPerAccess;
|
||||
|
||||
constexpr auto access_lengths = slice_lengths_in_access_order.Modify(
|
||||
Number<nDim - 1>{}, Number<num_access_on_lowest_access_dimension>{});
|
||||
|
||||
using vector_t = typename vector_type<Float, DataPerAccess>::MemoryType;
|
||||
|
||||
#if 1
|
||||
ford<decltype(access_lengths)>{}([&](auto access_multi_id) {
|
||||
auto data_multi_id_in_access_order = access_multi_id;
|
||||
data_multi_id_in_access_order[nDim - 1] = access_multi_id[nDim - 1] * DataPerAccess;
|
||||
|
||||
const auto data_multi_id =
|
||||
reorder_array_given_old2new(data_multi_id_in_access_order, DimAccessOrder{});
|
||||
|
||||
const index_t src_index =
|
||||
SrcDesc::GetOffsetFromMultiIndex(src_multi_id_begin + data_multi_id);
|
||||
|
||||
const index_t dst_index =
|
||||
DstDesc::GetOffsetFromMultiIndex(dst_multi_id_begin + data_multi_id);
|
||||
|
||||
*reinterpret_cast<vector_t*>(&p_dst[dst_index]) =
|
||||
*reinterpret_cast<const vector_t*>(&p_src[src_index]);
|
||||
});
|
||||
#else
|
||||
static_ford<decltype(access_lengths)>{}([&](auto access_multi_id_) {
|
||||
const auto access_multi_id = sequence2array(access_multi_id_);
|
||||
|
||||
auto data_multi_id_in_access_order = access_multi_id;
|
||||
data_multi_id_in_access_order[nDim - 1] = access_multi_id[nDim - 1] * DataPerAccess;
|
||||
|
||||
const auto data_multi_id =
|
||||
reorder_array_given_old2new(data_multi_id_in_access_order, DimAccessOrder{});
|
||||
|
||||
const index_t src_index =
|
||||
SrcDesc::GetOffsetFromMultiIndex(src_multi_id_begin + data_multi_id);
|
||||
|
||||
const index_t dst_index =
|
||||
DstDesc::GetOffsetFromMultiIndex(dst_multi_id_begin + data_multi_id);
|
||||
|
||||
*reinterpret_cast<vector_t*>(&p_dst[dst_index]) =
|
||||
*reinterpret_cast<const vector_t*>(&p_src[src_index]);
|
||||
});
|
||||
#endif
|
||||
}
|
||||
@@ -192,58 +192,3 @@ threadwise_tensor_slice_copy_reorder_given_dst2src_v3(SrcDesc,
|
||||
});
|
||||
});
|
||||
}
|
||||
|
||||
template <class Float, class SrcDesc, class DstDesc, class SliceLengths, class DimAccessOrder>
|
||||
__device__ void threadwise_generic_tensor_slice_copy(
|
||||
SrcDesc,
|
||||
const Float* __restrict__ p_src,
|
||||
Array<index_t, SrcDesc::GetNumOfDimension()> src_multi_id_begin,
|
||||
DstDesc,
|
||||
Float* __restrict__ p_dst,
|
||||
Array<index_t, DstDesc::GetNumOfDimension()> dst_multi_id_begin,
|
||||
SliceLengths,
|
||||
DimAccessOrder)
|
||||
{
|
||||
constexpr index_t nDim = SrcDesc::GetNumOfDimension();
|
||||
|
||||
static_assert(nDim == SrcDesc::GetNumOfDimension() && nDim == DstDesc::GetNumOfDimension() &&
|
||||
nDim == SliceLengths::GetSize() && nDim == DimAccessOrder::GetSize(),
|
||||
"wrong! # of dimensions not the same");
|
||||
|
||||
static_assert(is_valid_sequence_map<DimAccessOrder>::value, "wrong! map is not valid");
|
||||
|
||||
constexpr auto slice_lengths_in_access_order =
|
||||
SliceLengths::ReorderGivenNew2Old(DimAccessOrder{});
|
||||
|
||||
#if 1
|
||||
ford<decltype(slice_lengths_in_access_order)>{}([&](auto data_multi_id_in_access_order) {
|
||||
const auto data_multi_id =
|
||||
reorder_array_given_old2new(data_multi_id_in_access_order, DimAccessOrder{});
|
||||
|
||||
const index_t src_index =
|
||||
SrcDesc::GetOffsetFromMultiIndex(src_multi_id_begin + data_multi_id);
|
||||
|
||||
const index_t dst_index =
|
||||
DstDesc::GetOffsetFromMultiIndex(dst_multi_id_begin + data_multi_id);
|
||||
|
||||
p_dst[dst_index] = p_src[src_index];
|
||||
});
|
||||
#else
|
||||
static_ford<decltype(slice_lengths_in_access_order)>{}(
|
||||
[&](auto data_multi_id_in_access_order_) {
|
||||
constexpr auto data_multi_id_in_access_order =
|
||||
sequence2array(decltype(data_multi_id_in_access_order_){});
|
||||
|
||||
const auto data_multi_id =
|
||||
reorder_array_given_old2new(data_multi_id_in_access_order, DimAccessOrder{});
|
||||
|
||||
const index_t src_index =
|
||||
SrcDesc::GetOffsetFromMultiIndex(src_multi_id_begin + data_multi_id);
|
||||
|
||||
const index_t dst_index =
|
||||
DstDesc::GetOffsetFromMultiIndex(dst_multi_id_begin + data_multi_id);
|
||||
|
||||
p_dst[dst_index] = p_src[src_index];
|
||||
});
|
||||
#endif
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user