mirror of
https://github.com/ROCm/composable_kernel.git
synced 2026-05-12 17:26:00 +00:00
adding ConstantMergedTensorDescriptor, refactering ConstantTensorDescriptor, Sequence
This commit is contained in:
@@ -16,6 +16,8 @@ struct Array
|
||||
{
|
||||
}
|
||||
|
||||
__host__ __device__ constexpr index_t GetSize() const { return NSize; }
|
||||
|
||||
__host__ __device__ const TData& operator[](index_t i) const { return mData[i]; }
|
||||
|
||||
__host__ __device__ TData& operator[](index_t i) { return mData[i]; }
|
||||
@@ -67,6 +69,23 @@ __host__ __device__ auto reorder_array_given_old2new(const Array<TData, NSize>&
|
||||
return new_array;
|
||||
}
|
||||
|
||||
template <class TData, index_t NSize, class ExtractSeq>
|
||||
__host__ __device__ auto extract_array(const Array<TData, NSize>& old_array, ExtractSeq)
|
||||
{
|
||||
Array<TData, ExtractSeq::GetSize()> new_array;
|
||||
|
||||
constexpr index_t new_size = ExtractSeq::GetSize();
|
||||
|
||||
static_assert(new_size <= NSize, "wrong! too many extract");
|
||||
|
||||
static_for<0, new_size, 1>{}([&](auto I) {
|
||||
constexpr index_t i = I.Get();
|
||||
new_array[i] = old_array[ExtractSeq{}.Get(I)];
|
||||
});
|
||||
|
||||
return new_array;
|
||||
}
|
||||
|
||||
template <class TData, index_t NSize>
|
||||
__host__ __device__ constexpr auto operator+(const Array<TData, NSize>& a,
|
||||
const Array<TData, NSize>& b)
|
||||
|
||||
@@ -21,7 +21,7 @@ struct ConstantMatrixDescriptor
|
||||
|
||||
__host__ __device__ constexpr index_t GetElementSpace() const { return NRow_ * RowStride_; }
|
||||
|
||||
__host__ __device__ index_t Get1dIndex(index_t irow, index_t icol) const
|
||||
__host__ __device__ index_t GetOffsetFromMultiIndex(index_t irow, index_t icol) const
|
||||
{
|
||||
return irow * RowStride_ + icol;
|
||||
}
|
||||
|
||||
@@ -2,94 +2,118 @@
|
||||
#include "common.hip.hpp"
|
||||
#include "ConstantTensorDescriptor.hip.hpp"
|
||||
|
||||
// TensorDesc: ConstantTensorDescriptor<...>
|
||||
// MergedDimRanges: Sequence<FirstMergedDim, LastMergedDim>
|
||||
template <class TensorDesc, class... MergedDimRanges>
|
||||
// OriginalTensorDesc : ConstantTensorDescriptor<...>
|
||||
// it's the tensor whose dimensions are to be merged
|
||||
// OriginalDimMergeSeqs : Sequence<...>...
|
||||
// each is a sequence of original dimensions (of OriginalTensorDesc) to be merged
|
||||
template <class OriginalTensorDesc, class... OriginalDimMergeSeqs>
|
||||
struct ConstantMergedTensorDescriptor
|
||||
{
|
||||
static constexpr index_t nOriginalDim = GetNumOfOriginalDimension();
|
||||
static constexpr index_t nDim = GetNumOfDimension();
|
||||
static constexpr auto mOriginalDimMergeSeqs = std::tuple<OriginalDimMergeSeqs...>{};
|
||||
|
||||
static constexpr index_t nDim = std::tuple_size<mOriginalDimMergeSeqs>::value;
|
||||
static constexpr index_t nOriginalDim = OriginalDesc::GetNumOfDimension();
|
||||
|
||||
template <class... Is>
|
||||
__host__ __device__ constexpr ConstantMergedTensorDescriptor()
|
||||
{
|
||||
constexpr auto merged_dim_ranges = std::make_tuple(MergedDimRanges{}...);
|
||||
static_assert(nDim <= nOriginalDim, "wrong!");
|
||||
|
||||
static_for<0, sizeof...(MergedDimRanges), 1>{}([&](auto I) {
|
||||
constexpr index_t i = I.Get();
|
||||
constexpr auto merged_dim_range = std::get<i>(merged_dim_ranges);
|
||||
// TODO: check each of OriginalDimMergeSeqs contains at least 1, and at most
|
||||
// OriginalTensorDesc::nDim number of dimensions
|
||||
|
||||
static_assert(merged_dim_range.GetSize() == 2,
|
||||
"wrong! should specify first and last dimension to be merged");
|
||||
static_assert(merged_dim_range.Get(Number<0>{}) < GetNumOfUnmergedDimension(),
|
||||
"wrong!");
|
||||
static_assert(merged_dim_range.Get(Number<1>{}) < GetNumOfUnmergedDimension(),
|
||||
"wrong!");
|
||||
static_assert(merged_dim_range.Get(Number<0>{}) <= merged_dim_range.Get(Number<1>{}),
|
||||
"wrong!");
|
||||
});
|
||||
// TODO: check there is no duplication in OriginalDimMergeSeqs
|
||||
|
||||
// TODO: check OriginalDimMergeSeqs contains all original dimensions
|
||||
}
|
||||
|
||||
__host__ __device__ static constexpr index_t GetNumOfDimension()
|
||||
{
|
||||
constexpr auto merged_dim_ranges = std::make_tuple(MergedDimRanges...);
|
||||
__host__ __device__ static constexpr index_t GetNumOfDimension() { return nDim; }
|
||||
|
||||
struct f_calculate_num_of_lost_dim
|
||||
{
|
||||
__host__ __device__ constexpr index_t operator()(auto I) const
|
||||
__host__ __device__ static constexpr index_t GetNumOfOriginalDimension() { return nOriginalDim }
|
||||
|
||||
template <index_t IDim>
|
||||
__host__ __device__ static constexpr bool ContainMultipleOriginalDimensions(Number<IDim>)
|
||||
{
|
||||
return (std::Get<IDIM>(mOriginalDimMergeSeqs).GetSize() > 1);
|
||||
}
|
||||
|
||||
template <index_t IDim>
|
||||
__host__ __device__ static constexpr index_t GetLength(Number<IDim>)
|
||||
{
|
||||
constexpr auto original_dims_partial = std::Get<IDim>(mOriginalDimMergeSeqs);
|
||||
|
||||
return OriginalTensorDesc::Extract(original_dims_partial).GetElementSize();
|
||||
}
|
||||
|
||||
template <index_t IDim>
|
||||
__host__ __device__ static constexpr index_t GetStride(Number<IDim>)
|
||||
{
|
||||
static_assert(!ContainMultipleOriginalDimensions(Number<IDim>{}),
|
||||
"wrong! stride of a merged dimension is undefined");
|
||||
|
||||
constexpr auto idim_original = std::Get<IDim>(mOriginalDimMergeSeqs).Front();
|
||||
|
||||
return OriginalTensorDesc::GetStride(Number<idim_original>{});
|
||||
}
|
||||
|
||||
__host__ __device__ static constexpr auto GetLengths()
|
||||
{
|
||||
return Sequence<OriginalTensorDesc::Extract(OriginalDimMergeSeqs).GetElementSize()...>{};
|
||||
}
|
||||
|
||||
__host__ __device__ static constexpr index_t GetElementSize()
|
||||
{
|
||||
return OriginalTensorDesc::GetElementSize();
|
||||
}
|
||||
|
||||
__host__ __device__ static auto
|
||||
GetOriginalMultiIndexFromMultiIndex(Array<index_t, nDim> multi_id)
|
||||
{
|
||||
Array<index_t, nOriginalDim> original_multi_id;
|
||||
|
||||
static_for<0, nDim, 1>{}([&](auto IDim) {
|
||||
constexpr index_t idim = IDim.Get();
|
||||
constexpr auto original_dims_partial = std::get<idim>(mOriginalDimMergeSeqs);
|
||||
|
||||
// get partial original-multi-id corresponding to this merged dimension
|
||||
constexpr auto original_multi_id_partial =
|
||||
OriginalTensorDesc::Extract(original_dims_partial)
|
||||
.GetMultiIndexFrom1dIndex(multi_id[idim]);
|
||||
|
||||
// make sure compiler unroll this loop and propagate all the constants
|
||||
for(index_t i = 0; i < original_dims_partial.GetSize(); ++i)
|
||||
{
|
||||
constexpr index_t i = I.Get();
|
||||
constexpr auto merged_dim_range = std::get<i>(merged_dim_ranges);
|
||||
index_t idim_original = original_dims_partial[i];
|
||||
|
||||
return merged_dim_range.Get(Number<1>{}) - merged_dim_range.Get(Number<0>{});
|
||||
original_multi_id[idim_original] = original_multi_id_partial[i]
|
||||
}
|
||||
};
|
||||
});
|
||||
|
||||
constexpr index_t num_lost_dim = static_const_reduce_n<sizeof...(MergedDimRanges)>{}(
|
||||
f_calculate_num_of_lost_dim, std::plus<index_t>{});
|
||||
|
||||
return TensorDesc::GetNumOfDimension() - num_lost_dim;
|
||||
return original_multi_id;
|
||||
}
|
||||
|
||||
__host__ __device__ static constexpr index_t GetNumOfOriginalDimension()
|
||||
__host__ __device__ static index_t GetOffsetFromMultiIndex(Array<index_t, nDim> multi_id)
|
||||
{
|
||||
return TensorDesc::GetNumOfDimension();
|
||||
const auto original_multi_id = GetOriginalMultiIndexFromMultiIndex(multi_id);
|
||||
|
||||
return OriginalTensorDesc::GetOffsetFromMultiIndex(orginal_multi_id);
|
||||
}
|
||||
|
||||
template <index_t IDim>
|
||||
__host__ __device__ static constexpr bool IsMergedDimension(Number<IDim>)
|
||||
template <index_t... Is>
|
||||
__host__ __device__ static index_t GetOffsetFromMultiIndex(Is... is)
|
||||
{
|
||||
// not implemented
|
||||
return GetOffsetFromMultiIndex(Array<index_t, nDim>{is...});
|
||||
}
|
||||
|
||||
template <index_t IDim>
|
||||
__host__ __device__ static constexpr bool GetLength(Number<IDim>)
|
||||
__host__ __device__ static Array<index_t, nDim> GetMultiIndexFrom1dIndex(index_t id)
|
||||
{
|
||||
// not implemented
|
||||
}
|
||||
constexpr auto dummy_desc = make_packed_ConstantTensorDescriptor(GetLengths());
|
||||
|
||||
template <index_t IDim>
|
||||
__host__ __device__ static constexpr bool GetStride(Number<IDim>)
|
||||
{
|
||||
static_assert(!IsMergedDimension(Number<IDim>{}, "wrong! stride of a merged dimension is undefined")
|
||||
// not implemented
|
||||
}
|
||||
|
||||
template <class... Is>
|
||||
__host__ __device__ auto MultiIndex2OriginalMultiIndex(Is... is) const
|
||||
{
|
||||
// not implemented
|
||||
}
|
||||
|
||||
template <class... Is>
|
||||
__host__ __device__ auto OriginalMultiIndex2MultiIndex(Is... is) const
|
||||
{
|
||||
// not implemented
|
||||
return dummy_desc.GetMultiIndexFrom1dIndex(id);
|
||||
}
|
||||
};
|
||||
|
||||
template <class TensorDesc, class... MergedDimRanges>
|
||||
constexpr auto make_ConstantMergedTensorDescriptor(TensorDesc, MergedDimRanges...)
|
||||
template <class OriginalTensorDesc, class... OriginalDimMergeSeqs>
|
||||
constexpr auto make_ConstantMergedTensorDescriptor(OriginalTensorDesc, OriginalDimMergeSeqs...)
|
||||
{
|
||||
return ConstantMergedTensorDescriptor<TensorDesc, MergedDimRanges...>{};
|
||||
return ConstantMergedTensorDescriptor<OriginalTensorDesc, OriginalDimMergeSeqs...>{};
|
||||
}
|
||||
|
||||
@@ -2,40 +2,25 @@
|
||||
#include "common.hip.hpp"
|
||||
|
||||
template <class Lengths>
|
||||
__host__ __device__ constexpr auto calculate_default_strides(Lengths)
|
||||
__host__ __device__ constexpr auto calculate_packed_tensor_strides(Lengths)
|
||||
{
|
||||
return reverse_inclusive_scan_sequence(Lengths{}.PopFront().PushBack(Number<1>{}),
|
||||
std::multiplies<index_t>{});
|
||||
return reverse_inclusive_scan_sequence(Lengths{}.PopFront(), std::multiplies<index_t>{})
|
||||
.PushBack(Number<1>{});
|
||||
}
|
||||
|
||||
// this is ugly, only for 2d
|
||||
template <index_t L0, index_t L1, index_t Align>
|
||||
__host__ __device__ constexpr auto calculate_default_strides_aligned(Sequence<L0, L1>,
|
||||
Number<Align>)
|
||||
template <class Lengths, index_t Align>
|
||||
__host__ __device__ constexpr auto
|
||||
calculate_rank_tensor_default_strides_with_alignment(Lengths, Number<Align>)
|
||||
{
|
||||
constexpr index_t L1_align = Align * ((L1 + Align - 1) / Align);
|
||||
return Sequence<L1_align, 1>{};
|
||||
constexpr index_t L_back_align =
|
||||
Align * mod_conv::integer_divide_ceiler<index_t>{}(Lengths{}.Back(), Align);
|
||||
|
||||
return calculate_packed_tensor_strides(
|
||||
Lengths{}.Modify(Number<Lengths{}.GetSize() - 1>{}, Number<L_back_align>{}));
|
||||
}
|
||||
|
||||
// this is ugly, only for 3d
|
||||
template <index_t L0, index_t L1, index_t L2, index_t Align>
|
||||
__host__ __device__ constexpr auto calculate_default_strides_aligned(Sequence<L0, L1, L2>,
|
||||
Number<Align>)
|
||||
{
|
||||
constexpr index_t L2_align = Align * ((L2 + Align - 1) / Align);
|
||||
return Sequence<L1 * L2_align, L2_align, 1>{};
|
||||
}
|
||||
|
||||
// this is ugly, only for 4d
|
||||
template <index_t L0, index_t L1, index_t L2, index_t L3, index_t Align>
|
||||
__host__ __device__ constexpr auto calculate_default_strides_aligned(Sequence<L0, L1, L2, L3>,
|
||||
Number<Align>)
|
||||
{
|
||||
constexpr index_t L3_align = Align * ((L3 + Align - 1) / Align);
|
||||
return Sequence<L1 * L2 * L3_align, L2 * L3_align, L3_align, 1>{};
|
||||
}
|
||||
|
||||
template <class Lengths, class Strides>
|
||||
// MemoryRanks of dimensions is for conversion from offset to multi-index
|
||||
template <class Lengths, class Strides, class MemoryRanks>
|
||||
struct ConstantTensorDescriptor
|
||||
{
|
||||
using Type = ConstantTensorDescriptor;
|
||||
@@ -44,14 +29,24 @@ struct ConstantTensorDescriptor
|
||||
|
||||
__host__ __device__ constexpr ConstantTensorDescriptor()
|
||||
{
|
||||
static_assert(Lengths::GetSize() == Strides::GetSize(), "nDim not consistent");
|
||||
static_assert(Lengths::GetSize() == Strides::GetSize() &&
|
||||
Lengths::GetSize() == MemoryRanks::GetSize(),
|
||||
"nDim not consistent");
|
||||
|
||||
#if 0 // require sequence_sort, but it's not implemented yet
|
||||
static_assert(is_same<typename sequence_sort<MemoryRanks>::SortedSeqType,
|
||||
typename arithmetic_sequence_gen<0, nDim, 1>::SeqType>::value,
|
||||
"wrong! invalid MemoryRanks");
|
||||
#endif
|
||||
}
|
||||
|
||||
__host__ __device__ static constexpr index_t GetNumOfDimension() { return nDim; }
|
||||
|
||||
__host__ __device__ static constexpr Lengths GetLengths() { return Lengths{}; }
|
||||
__host__ __device__ static constexpr auto GetLengths() { return Lengths{}; }
|
||||
|
||||
__host__ __device__ static constexpr Strides GetStrides() { return Strides{}; }
|
||||
__host__ __device__ static constexpr auto GetStrides() { return Strides{}; }
|
||||
|
||||
__host__ __device__ static constexpr auto GetMemoryRanks() { return MemoryRanks{}; }
|
||||
|
||||
template <index_t I>
|
||||
__host__ __device__ static constexpr index_t GetLength(Number<I>)
|
||||
@@ -65,47 +60,58 @@ struct ConstantTensorDescriptor
|
||||
return Strides{}.Get(Number<I>{});
|
||||
}
|
||||
|
||||
template <index_t I>
|
||||
__host__ __device__ static constexpr index_t GetMemoryRank(Number<I>)
|
||||
{
|
||||
return MemoryRanks{}.Get(Number<I>{});
|
||||
}
|
||||
|
||||
__host__ __device__ static constexpr index_t GetElementSize()
|
||||
{
|
||||
return accumulate_on_sequence(Lengths{}, std::multiplies<index_t>{}, Number<1>{});
|
||||
}
|
||||
|
||||
// WRONG! ReorderGivenOld2New is broken
|
||||
template <class Align = Number<1>>
|
||||
__host__ __device__ static constexpr index_t GetElementSpace(Align align = Align{})
|
||||
{
|
||||
#if 0
|
||||
constexpr auto lengths_in_rank = GetLengths().ReorderGivenOld2New(MemoryRank{});
|
||||
constexpr auto strides_in_rank = GetStrides().ReorderGivenOld2new(MemoryRank{});
|
||||
|
||||
constexpr index_t element_space_unaligned = accumulate_on_sequence(
|
||||
(lengths_in_rank - Number<1>{}) * strides_in_rank, std::plus<index_t>{}, Number<1>{});
|
||||
#else // WRONG! align shouldbe applied to the last memory rank, not the last tensor dimension
|
||||
constexpr index_t element_space_unaligned = accumulate_on_sequence(
|
||||
(GetLengths() - Number<1>{}) * GetStrides(), std::plus<index_t>{}, Number<1>{});
|
||||
#endif
|
||||
|
||||
return align.Get() * ((element_space_unaligned + align.Get() - 1) / align.Get());
|
||||
}
|
||||
|
||||
template <index_t NSize>
|
||||
__host__ __device__ static index_t Get1dIndex(Array<index_t, NSize> multi_id)
|
||||
__host__ __device__ static index_t GetOffsetFromMultiIndex(Array<index_t, NSize> multi_id)
|
||||
{
|
||||
static_assert(NSize == nDim, "wrong! Dimension not consistent");
|
||||
|
||||
index_t id = 0;
|
||||
index_t offset = 0;
|
||||
|
||||
static_for<0, nDim, 1>{}([&](auto IDim) {
|
||||
constexpr index_t idim = IDim.Get();
|
||||
id += multi_id[idim] * GetStride(IDim);
|
||||
offset += multi_id[idim] * GetStride(IDim);
|
||||
});
|
||||
|
||||
return id;
|
||||
return offset;
|
||||
}
|
||||
|
||||
template <class... Is>
|
||||
__host__ __device__ static index_t Get1dIndex(Is... is)
|
||||
__host__ __device__ static index_t GetOffsetFromMultiIndex(Is... is)
|
||||
{
|
||||
static_assert(sizeof...(Is) == nDim, "number of multi-index is wrong");
|
||||
|
||||
const auto multi_id = Array<index_t, nDim>(is...);
|
||||
|
||||
return Get1dIndex(multi_id);
|
||||
return GetOffsetFromMultiIndex(Array<index_t, sizeof...(Is)>{is...});
|
||||
}
|
||||
|
||||
template <index_t... Is>
|
||||
__host__ __device__ static constexpr index_t Get1dIndex(Sequence<Is...> /*multi_id*/)
|
||||
__host__ __device__ static constexpr index_t GetOffsetFromMultiIndex(Sequence<Is...>)
|
||||
{
|
||||
static_assert(sizeof...(Is) == nDim, "wrong! Dimension not consistent");
|
||||
|
||||
@@ -114,44 +120,84 @@ struct ConstantTensorDescriptor
|
||||
return accumulate_on_sequence(multi_id * GetStrides(), std::plus<index_t>{}, Number<0>{});
|
||||
}
|
||||
|
||||
__host__ __device__ static Array<index_t, nDim> GetMultiIndex(index_t id)
|
||||
#if 0 // ReorderGivenOld2new is broken
|
||||
__host__ __device__ static Array<index_t, nDim> GetMultiIndexFromOffset(index_t offset)
|
||||
{
|
||||
Array<index_t, nDim> ranked_multi_id;
|
||||
|
||||
constexpr auto ranked_strides =
|
||||
GetStrides().ReorderGivenOld2new(MemoryRanks{}); // check this
|
||||
|
||||
// calculate index in each of the dimensions in the order of their rank (not dimension)
|
||||
static_for<0, nDim - 1, 1>{}([&](auto IDim) {
|
||||
constexpr index_t idim = IDim.Get();
|
||||
constexpr index_t stride = ranked_strides.Get(Number<idim>{});
|
||||
ranked_multi_id[idim] = offset / stride;
|
||||
offset -= ranked_multi_id[idim] * stride;
|
||||
});
|
||||
|
||||
ranked_multi_id[nDim - 1] = offset / ranked_strides.Get(Number<nDim - 1>{});
|
||||
|
||||
return reorder_array_given_new2old(ranked_multi_id, MemoryRanks{}); // check this
|
||||
}
|
||||
#endif
|
||||
|
||||
__host__ __device__ static Array<index_t, nDim> GetMultiIndexFrom1dIndex(index_t id)
|
||||
{
|
||||
Array<index_t, nDim> multi_id;
|
||||
|
||||
constexpr auto dummy_strides = calculate_packed_tensor_strides(GetLengths());
|
||||
|
||||
// calculate index in each of the dimensions in the order of their dimension (not rank)
|
||||
static_for<0, nDim - 1, 1>{}([&](auto IDim) {
|
||||
constexpr index_t idim = IDim.Get();
|
||||
multi_id[idim] = id / GetStride(IDim);
|
||||
id -= multi_id[idim] * GetStride(IDim);
|
||||
constexpr index_t idim = IDim.Get();
|
||||
constexpr index_t stride = dummy_strides.Get(Number<idim>{});
|
||||
multi_id[idim] = id / stride;
|
||||
id -= multi_id[idim] * stride;
|
||||
});
|
||||
|
||||
multi_id[nDim - 1] = id / GetStride(Number<nDim - 1>{});
|
||||
multi_id[nDim - 1] = id / dummy_strides.Get(Number<nDim - 1>{});
|
||||
|
||||
return multi_id;
|
||||
}
|
||||
|
||||
__host__ __device__ static constexpr auto Pack()
|
||||
{
|
||||
constexpr auto default_strides = calculate_default_strides(Lengths{});
|
||||
return ConstantTensorDescriptor<Lengths, decltype(default_strides)>{};
|
||||
}
|
||||
|
||||
// WRONG! Ranks is broken
|
||||
template <index_t... IDims>
|
||||
__host__ __device__ static constexpr auto Extract(Number<IDims>... extract_dims)
|
||||
{
|
||||
static_assert(sizeof...(IDims) <= GetNumOfDimension(),
|
||||
"wrong! too many number of dimensions to be extracted");
|
||||
|
||||
return make_ConstantTensorDescriptor(Lengths{}.Extract(extract_dims...),
|
||||
Strides{}.Extract(extract_dims...));
|
||||
using extract_lengths = decltype(Lengths{}.Extract(extract_dims...));
|
||||
using extract_strides = decltype(Strides{}.Extract(extract_dims...));
|
||||
using extract_ranks = decltype(MemoryRanks{}.Extract(extract_dims...));
|
||||
|
||||
#if 0
|
||||
using new_ranks = typename sequence_sort<extract_ranks>::Original2SortedType;
|
||||
#else // WRONG! TODO:: implement sequence_sort
|
||||
using new_ranks = typename arithmetic_sequence_gen<0, sizeof...(IDims), 1>::SeqType;
|
||||
#endif
|
||||
|
||||
return ConstantTensorDescriptor<extract_lengths, extract_strides, new_ranks>{};
|
||||
}
|
||||
|
||||
template <index_t IDim, index_t SliceLen>
|
||||
__host__ __device__ static constexpr auto Slice(Number<IDim>, Number<SliceLen>)
|
||||
{
|
||||
return make_ConstantTensorDescriptor(Lengths{}.Modify(Number<IDim>{}, Number<SliceLen>{}),
|
||||
Strides{});
|
||||
using slice_lengths = decltype(Lengths{}.Modify(Number<IDim>{}, Number<SliceLen>{}));
|
||||
|
||||
return ConstantTensorDescriptor<slice_lengths, Strides, MemoryRanks>{};
|
||||
}
|
||||
|
||||
template <index_t Threashold, index_t Delta>
|
||||
struct f_fold_impl
|
||||
{
|
||||
__host__ __device__ constexpr index_t operator()(index_t x) const
|
||||
{
|
||||
return x > Threashold ? x + Delta : x;
|
||||
}
|
||||
};
|
||||
|
||||
template <index_t IDim, index_t... FoldIntervals>
|
||||
__host__ __device__ static constexpr auto Fold(Number<IDim>, Number<FoldIntervals>...)
|
||||
{
|
||||
@@ -162,6 +208,7 @@ struct ConstantTensorDescriptor
|
||||
|
||||
constexpr auto unfold_length = GetLength(Number<IDim>{});
|
||||
constexpr auto unfold_stride = GetStride(Number<IDim>{});
|
||||
constexpr auto unfold_rank = GetMemoryRank(Number<IDim>{});
|
||||
|
||||
// length of the dimension to be folded needs to be dividable by fold_interval_product,
|
||||
// otherwise, folding is invalid
|
||||
@@ -178,16 +225,45 @@ struct ConstantTensorDescriptor
|
||||
reverse_inclusive_scan_sequence(fold_intervals.PushBack(Number<1>{}),
|
||||
std::multiplies<index_t>{});
|
||||
|
||||
// left and right
|
||||
constexpr auto left = make_increasing_sequence(Number<0>{}, Number<IDim>{}, Number<1>{});
|
||||
constexpr auto right = make_increasing_sequence(
|
||||
Number<IDim + 1>{}, Number<GetNumOfDimension()>{}, Number<1>{});
|
||||
// folded_ranks
|
||||
constexpr auto fold_ranks =
|
||||
typename arithmetic_sequence_gen<unfold_rank,
|
||||
unfold_rank + fold_intervals.GetSize() + 1,
|
||||
1>::SeqType{};
|
||||
|
||||
return make_ConstantTensorDescriptor(
|
||||
GetLengths().Extract(left).Append(fold_lengths).Append(GetLengths().Extract(right)),
|
||||
GetStrides().Extract(left).Append(fold_strides).Append(GetStrides().Extract(right)));
|
||||
// increase the ranks that are larger than unfold_rank
|
||||
constexpr auto tmp_ranks = transform_sequences(
|
||||
f_fold_impl<unfold_rank, fold_intervals.GetSize()>{}, GetMemoryRanks());
|
||||
|
||||
// left and right
|
||||
constexpr auto left = typename arithmetic_sequence_gen<0, IDim, 1>::SeqType{};
|
||||
constexpr auto right =
|
||||
typename arithmetic_sequence_gen<IDim + 1, GetNumOfDimension(), 1>::SeqType{};
|
||||
|
||||
constexpr auto new_lengths =
|
||||
GetLengths().Extract(left).Append(fold_lengths).Append(GetLengths().Extract(right));
|
||||
constexpr auto new_strides =
|
||||
GetStrides().Extract(left).Append(fold_strides).Append(GetStrides().Extract(right));
|
||||
constexpr auto new_ranks =
|
||||
tmp_ranks.Extract(left).Append(fold_ranks).Append(tmp_ranks.Extract(right));
|
||||
|
||||
static_assert(new_ranks.GetSize() == new_lengths.GetSize(), "wrong!");
|
||||
static_assert(fold_ranks.GetSize() == fold_lengths.GetSize(), "wrong!");
|
||||
|
||||
return ConstantTensorDescriptor<decltype(new_lengths),
|
||||
decltype(new_strides),
|
||||
decltype(new_ranks)>{};
|
||||
}
|
||||
|
||||
template <index_t Threashold, index_t Delta>
|
||||
struct f_unfold_impl
|
||||
{
|
||||
__host__ __device__ constexpr index_t operator()(index_t x) const
|
||||
{
|
||||
return x > Threashold ? x - Delta : x;
|
||||
}
|
||||
};
|
||||
|
||||
template <index_t FirstUnfoldDim, index_t LastUnfoldDim>
|
||||
__host__ __device__ static constexpr auto Unfold(Number<FirstUnfoldDim>, Number<LastUnfoldDim>)
|
||||
{
|
||||
@@ -198,66 +274,109 @@ struct ConstantTensorDescriptor
|
||||
// dimensions to be unfold need to be in descending order (w.r.t. strides), and need to be
|
||||
// packed in memory, otherwise, unfolding is invalid
|
||||
static_for<FirstUnfoldDim, LastUnfoldDim, 1>{}([&](auto IDim) {
|
||||
constexpr auto IDim_p1 = IDim + Number<1>{};
|
||||
|
||||
// check stride
|
||||
static_assert(
|
||||
GetStride(IDim) >= GetStride(Number<IDim.Get() + 1>{}),
|
||||
GetStride(IDim) >= GetStride(IDim_p1),
|
||||
"wrong! dimensions to be unfolded need to be in descending order w.r.t strides");
|
||||
|
||||
static_assert(GetStride(IDim + 1) * GetLength(IDim + 1) == GetStride(IDim),
|
||||
// check if packed
|
||||
static_assert(GetStride(IDim_p1) * GetLength(IDim_p1) == GetStride(IDim),
|
||||
"wrong! dimensions to be unfolded need to be packed");
|
||||
|
||||
// checkt ranks
|
||||
static_assert(GetMemoryRank(IDim_p1) = GetMemoryRank(IDim) + 1,
|
||||
"wrong! ranks of dimensions to be "
|
||||
"unfolded need to be in increasing "
|
||||
"and continuous ranks");
|
||||
});
|
||||
|
||||
// left and right
|
||||
constexpr auto left =
|
||||
make_increasing_sequence(Number<0>{}, Number<FirstUnfoldDim>{}, Number<1>{});
|
||||
constexpr auto middle = make_increasing_sequence(
|
||||
Number<FirstUnfoldDim>{}, Number<LastUnfoldDim + 1>{}, Number<1>{});
|
||||
constexpr auto right = make_increasing_sequence(
|
||||
Number<LastUnfoldDim + 1>{}, Number<GetNumOfDimension()>{}, Number<1>{});
|
||||
constexpr auto left = typename arithmetic_sequence_gen<0, FirstUnfoldDim, 1>::SeqType{};
|
||||
constexpr auto middle =
|
||||
typename arithmetic_sequence_gen<FirstUnfoldDim, LastUnfoldDim + 1, 1>::SeqType{};
|
||||
constexpr auto right =
|
||||
typename arithmetic_sequence_gen<LastUnfoldDim + 1, GetNumOfDimension(), 1>::SeqType{};
|
||||
|
||||
// length and stride
|
||||
// unfolded length, stride and rank
|
||||
constexpr index_t unfold_length = accumulate_on_sequence(
|
||||
GetLengths().Extract(middle), std::multiplies<index_t>{}, Number<1>{});
|
||||
|
||||
constexpr index_t unfold_stride = GetStride(Number<LastUnfoldDim>{});
|
||||
|
||||
return make_ConstantTensorDescriptor(GetLengths()
|
||||
.Extract(left)
|
||||
.PushBack(Number<unfold_length>{})
|
||||
.Append(GetLengths().Extract(right)),
|
||||
GetStrides()
|
||||
.Extract(left)
|
||||
.PushBack(Number<unfold_stride>{})
|
||||
.Append(GetStrides().Extract(right)));
|
||||
constexpr index_t unfold_rank = GetMemoryRank(Number<FirstUnfoldDim>{});
|
||||
|
||||
// decrease the ranks that are larger than the rank of LastUnfoldDim
|
||||
constexpr auto tmp_ranks =
|
||||
transform_sequences(GetMemoryRanks(),
|
||||
f_unfold_impl<GetMemoryRank(Number<LastUnfoldDim>{}),
|
||||
LastUnfoldDim - FirstUnfoldDim + 1>{});
|
||||
|
||||
// new lengths, strides and ranks
|
||||
constexpr auto new_lengths = GetLengths()
|
||||
.Extract(left)
|
||||
.PushBack(Number<unfold_length>{})
|
||||
.Append(GetLengths().Extract(right));
|
||||
|
||||
constexpr auto new_strides = GetStrides()
|
||||
.Extract(left)
|
||||
.PushBack(Number<unfold_stride>{})
|
||||
.Append(GetStrides().Extract(right));
|
||||
|
||||
constexpr auto new_ranks = tmp_ranks.Extract(left)
|
||||
.PushBack(Number<unfold_rank>{})
|
||||
.Append(tmp_ranks.Extract(right));
|
||||
|
||||
return ConstantTensorDescriptor<decltype(new_lengths),
|
||||
decltype(new_strides),
|
||||
decltype(new_ranks)>{};
|
||||
}
|
||||
|
||||
template <index_t... IRs>
|
||||
__host__ __device__ static constexpr auto ReorderGivenNew2Old(Sequence<IRs...> /*new2old*/)
|
||||
template <class MapNew2Old>
|
||||
__host__ __device__ static constexpr auto ReorderGivenNew2Old(MapNew2Old)
|
||||
{
|
||||
static_assert(sizeof...(IRs) == GetNumOfDimension(), "wrong! dimension is wrong");
|
||||
constexpr auto map_new2old = Sequence<IRs...>{};
|
||||
return make_ConstantTensorDescriptor(Lengths{}.ReorderGivenNew2Old(map_new2old),
|
||||
Strides{}.ReorderGivenNew2Old(map_new2old));
|
||||
return ConstantTensorDescriptor<decltype(Lengths{}.ReorderGivenNew2Old(MapNew2Old{})),
|
||||
decltype(Strides{}.ReorderGivenNew2Old(MapNew2Old{})),
|
||||
decltype(
|
||||
MemoryRanks{}.ReorderGivenNew2Old(MapNew2Old{}))>{};
|
||||
}
|
||||
|
||||
#if 0 // require sequence_sort, which is not implemented yet
|
||||
template <class MapOld2New>
|
||||
__host__ __device__ static constexpr auto ReorderGivenOld2New(MapOld2New)
|
||||
{
|
||||
return ConstantTensorDescriptor<decltype(Lengths{}.ReorderGivenOld2New(MapOld2New{})),
|
||||
decltype(Strides{}.ReorderGivenOld2New(MapOld2New{})),
|
||||
decltype(
|
||||
MemoryRanks{}.ReorderGivenOld2New(MapOld2New{}))>{};
|
||||
}
|
||||
#endif
|
||||
};
|
||||
|
||||
template <class Lengths>
|
||||
__host__ __device__ constexpr auto make_ConstantTensorDescriptor(Lengths)
|
||||
__host__ __device__ constexpr auto make_packed_ConstantTensorDescriptor(Lengths)
|
||||
{
|
||||
using Strides = decltype(calculate_default_strides(Lengths{}));
|
||||
return ConstantTensorDescriptor<Lengths, Strides>{};
|
||||
using Strides = decltype(calculate_packed_tensor_strides(Lengths{}));
|
||||
using MemoryRanks = typename arithmetic_sequence_gen<0, Lengths::GetSize(), 1>::SeqType;
|
||||
return ConstantTensorDescriptor<Lengths, Strides, MemoryRanks>{};
|
||||
}
|
||||
|
||||
template <class Lengths, class Strides>
|
||||
__host__ __device__ constexpr auto make_ConstantTensorDescriptor(Lengths, Strides)
|
||||
__host__ __device__ constexpr auto make_ranked_ConstantTensorDescriptor(Lengths, Strides)
|
||||
{
|
||||
return ConstantTensorDescriptor<Lengths, Strides>{};
|
||||
using MemoryRanks = typename arithmetic_sequence_gen<0, Lengths::GetSize(), 1>::SeqType;
|
||||
return ConstantTensorDescriptor<Lengths, Strides, MemoryRanks>{};
|
||||
}
|
||||
|
||||
template <class Lengths, index_t Align>
|
||||
__host__ __device__ constexpr auto make_ConstantTensorDescriptor_aligned(Lengths, Number<Align>)
|
||||
__host__ __device__ constexpr auto
|
||||
make_ranked_ConstantTensorDescriptor_with_alignment(Lengths, Number<Align>)
|
||||
{
|
||||
using Strides = decltype(calculate_default_strides_aligned(Lengths{}, Number<Align>{}));
|
||||
return ConstantTensorDescriptor<Lengths, Strides>{};
|
||||
using Strides =
|
||||
decltype(calculate_rank_tensor_default_strides_with_alignment(Lengths{}, Number<Align>{}));
|
||||
using MemoryRanks = typename arithmetic_sequence_gen<0, Lengths::GetSize(), 1>::SeqType;
|
||||
return ConstantTensorDescriptor<Lengths, Strides, MemoryRanks>{};
|
||||
}
|
||||
|
||||
template <class TDesc>
|
||||
|
||||
@@ -9,76 +9,100 @@ struct Sequence
|
||||
|
||||
static constexpr index_t mSize = sizeof...(Is);
|
||||
|
||||
const index_t mData[mSize + 1] = {
|
||||
Is..., 0}; // the last element is dummy, to prevent compiler complain on empty Sequence
|
||||
|
||||
__host__ __device__ static constexpr index_t GetSize() { return mSize; }
|
||||
|
||||
template <index_t I>
|
||||
__host__ __device__ constexpr index_t Get(Number<I>) const
|
||||
__host__ __device__ static constexpr index_t Get(Number<I>)
|
||||
{
|
||||
static_assert(I < mSize, "wrong! I too large");
|
||||
|
||||
// the last dummy element is to prevent compiler complain about empty Sequence
|
||||
const index_t mData[mSize + 1] = {Is..., 0};
|
||||
return mData[I];
|
||||
}
|
||||
|
||||
__host__ __device__ index_t operator[](index_t i) const { return mData[i]; }
|
||||
|
||||
template <index_t... IRs>
|
||||
__host__ __device__ constexpr auto ReorderGivenNew2Old(Sequence<IRs...> /*new2old*/) const
|
||||
__host__ __device__ index_t operator[](index_t i) const
|
||||
{
|
||||
static_assert(mSize == sizeof...(IRs), "mSize not consistent");
|
||||
|
||||
constexpr auto old = Type{};
|
||||
|
||||
return Sequence<old.Get(Number<IRs>{})...>{};
|
||||
const index_t mData[mSize + 1] = {Is..., 0};
|
||||
return mData[i];
|
||||
}
|
||||
|
||||
template <index_t... IRs>
|
||||
__host__ __device__ constexpr auto ReorderGivenOld2New(Sequence<IRs...> /*old2new*/) const
|
||||
__host__ __device__ static constexpr auto ReorderGivenNew2Old(Sequence<IRs...> /*new2old*/)
|
||||
{
|
||||
// TODO: don't know how to implement this
|
||||
printf("Sequence::ReorderGivenOld2New not implemented");
|
||||
assert(false);
|
||||
#if 0 // require sequence_sort, which is not implemented yet
|
||||
static_assert(is_same<sequence_sort<Sequence<IRs...>>::SortedSeqType,
|
||||
arithmetic_sequence_gen<0, mSize, 1>::SeqType>::value,
|
||||
"wrong! invalid new2old map");
|
||||
#endif
|
||||
|
||||
return Sequence<Type{}.Get(Number<IRs>{})...>{};
|
||||
}
|
||||
|
||||
__host__ __device__ constexpr auto Reverse() const;
|
||||
#if 0 // require sequence_sort, which is not implemented yet
|
||||
template <class MapOld2New>
|
||||
__host__ __device__ static constexpr auto ReorderGivenOld2New(MapOld2New /*old2new*/)
|
||||
{
|
||||
static_assert(is_same<sequence_sort<MapOld2New>::SortedSeqType,
|
||||
arithmetic_sequence_gen<0, mSize, 1>::SeqType>::value,
|
||||
"wrong! invalid old2new map");
|
||||
|
||||
__host__ __device__ constexpr index_t Front() const { return mData[0]; }
|
||||
constexpr auto map_new2old = typename sequence_map_inverse<MapOld2New>::SeqMapType{};
|
||||
|
||||
__host__ __device__ constexpr index_t Back() const { return mData[mSize - 1]; }
|
||||
return ReorderGivenNew2Old(map_new2old);
|
||||
}
|
||||
#endif
|
||||
|
||||
__host__ __device__ static constexpr auto Reverse();
|
||||
|
||||
__host__ __device__ static constexpr index_t Front()
|
||||
{
|
||||
const index_t mData[mSize + 1] = {Is..., 0};
|
||||
return mData[0];
|
||||
}
|
||||
|
||||
__host__ __device__ static constexpr index_t Back()
|
||||
{
|
||||
const index_t mData[mSize + 1] = {Is..., 0};
|
||||
return mData[mSize - 1];
|
||||
}
|
||||
|
||||
template <index_t I>
|
||||
__host__ __device__ constexpr auto PushFront(Number<I>) const
|
||||
__host__ __device__ static constexpr auto PushFront(Number<I>)
|
||||
{
|
||||
return Sequence<I, Is...>{};
|
||||
}
|
||||
|
||||
template <index_t I>
|
||||
__host__ __device__ constexpr auto PushBack(Number<I>) const
|
||||
__host__ __device__ static constexpr auto PushBack(Number<I>)
|
||||
{
|
||||
return Sequence<Is..., I>{};
|
||||
}
|
||||
|
||||
__host__ __device__ constexpr auto PopFront() const;
|
||||
__host__ __device__ static constexpr auto PopFront();
|
||||
|
||||
__host__ __device__ constexpr auto PopBack() const;
|
||||
__host__ __device__ static constexpr auto PopBack();
|
||||
|
||||
template <index_t... Xs>
|
||||
__host__ __device__ constexpr auto Append(Sequence<Xs...>) const
|
||||
__host__ __device__ static constexpr auto Append(Sequence<Xs...>)
|
||||
{
|
||||
return Sequence<Is..., Xs...>{};
|
||||
}
|
||||
|
||||
template <index_t... Ns>
|
||||
__host__ __device__ constexpr auto Extract(Number<Ns>...) const
|
||||
__host__ __device__ static constexpr auto Extract(Number<Ns>...)
|
||||
{
|
||||
return Sequence<Type{}.Get(Number<Ns>{})...>{};
|
||||
}
|
||||
|
||||
template <index_t... Ns>
|
||||
__host__ __device__ constexpr auto Extract(Sequence<Ns...>) const
|
||||
__host__ __device__ static constexpr auto Extract(Sequence<Ns...>)
|
||||
{
|
||||
return Sequence<Type{}.Get(Number<Ns>{})...>{};
|
||||
}
|
||||
|
||||
template <index_t I, index_t X>
|
||||
__host__ __device__ static constexpr auto Modify(Number<I>, Number<X>);
|
||||
};
|
||||
|
||||
template <class, class>
|
||||
@@ -91,43 +115,36 @@ struct sequence_merge<Sequence<Xs...>, Sequence<Ys...>>
|
||||
};
|
||||
|
||||
template <index_t IBegin, index_t NSize, index_t Increment>
|
||||
struct increasing_sequence_gen_impl
|
||||
struct arithmetic_sequence_gen_impl
|
||||
{
|
||||
static constexpr index_t NSizeLeft = NSize / 2;
|
||||
|
||||
using SeqType = typename sequence_merge<
|
||||
typename increasing_sequence_gen_impl<IBegin, NSizeLeft, Increment>::SeqType,
|
||||
typename increasing_sequence_gen_impl<IBegin + NSizeLeft * Increment,
|
||||
typename arithmetic_sequence_gen_impl<IBegin, NSizeLeft, Increment>::SeqType,
|
||||
typename arithmetic_sequence_gen_impl<IBegin + NSizeLeft * Increment,
|
||||
NSize - NSizeLeft,
|
||||
Increment>::SeqType>::SeqType;
|
||||
};
|
||||
|
||||
template <index_t IBegin, index_t Increment>
|
||||
struct increasing_sequence_gen_impl<IBegin, 1, Increment>
|
||||
struct arithmetic_sequence_gen_impl<IBegin, 1, Increment>
|
||||
{
|
||||
using SeqType = Sequence<IBegin>;
|
||||
};
|
||||
|
||||
template <index_t IBegin, index_t Increment>
|
||||
struct increasing_sequence_gen_impl<IBegin, 0, Increment>
|
||||
struct arithmetic_sequence_gen_impl<IBegin, 0, Increment>
|
||||
{
|
||||
using SeqType = Sequence<>;
|
||||
};
|
||||
|
||||
template <index_t IBegin, index_t IEnd, index_t Increment>
|
||||
struct increasing_sequence_gen
|
||||
struct arithmetic_sequence_gen
|
||||
{
|
||||
using SeqType =
|
||||
typename increasing_sequence_gen_impl<IBegin, IEnd - IBegin, Increment>::SeqType;
|
||||
typename arithmetic_sequence_gen_impl<IBegin, IEnd - IBegin, Increment>::SeqType;
|
||||
};
|
||||
|
||||
template <index_t IBegin, index_t IEnd, index_t Increment>
|
||||
__host__ __device__ constexpr auto
|
||||
make_increasing_sequence(Number<IBegin>, Number<IEnd>, Number<Increment>)
|
||||
{
|
||||
return typename increasing_sequence_gen<IBegin, IEnd, Increment>::SeqType{};
|
||||
}
|
||||
|
||||
template <class, class>
|
||||
struct sequence_reverse_inclusive_scan;
|
||||
|
||||
@@ -161,8 +178,8 @@ struct sequence_split
|
||||
{
|
||||
static constexpr index_t NSize = Seq{}.GetSize();
|
||||
|
||||
using range0 = typename increasing_sequence_gen<0, I, 1>::SeqType;
|
||||
using range1 = typename increasing_sequence_gen<I, NSize, 1>::SeqType;
|
||||
using range0 = typename arithmetic_sequence_gen<0, I, 1>::SeqType;
|
||||
using range1 = typename arithmetic_sequence_gen<I, NSize, 1>::SeqType;
|
||||
|
||||
using SeqType0 = typename sequence_extract<Seq, range0>::SeqType;
|
||||
using SeqType1 = typename sequence_extract<Seq, range1>::SeqType;
|
||||
@@ -191,6 +208,63 @@ struct sequence_reverse<Sequence<I0, I1>>
|
||||
using SeqType = Sequence<I1, I0>;
|
||||
};
|
||||
|
||||
#if 0 // not fully implemented
|
||||
template <class KeySeq0, class ValSeq0, class KeySeq1, class ValSeq1>
|
||||
struct sequence_sort_merge_impl;
|
||||
|
||||
template <index_t Key0,
|
||||
index_t... Keys0,
|
||||
index_t Val0,
|
||||
index_t... Vals0,
|
||||
index_t Key1,
|
||||
index_t... Keys1,
|
||||
index_t Val0,
|
||||
index_t... Vals1>
|
||||
struct sequence_sort_merge_impl<Sequence<Key0, Keys0...>,
|
||||
Sequence<Val0, Vals0...>,
|
||||
Sequence<Key1, Keys1...>,
|
||||
Sequence<Val1, Vals1...>>
|
||||
{
|
||||
};
|
||||
|
||||
template <class>
|
||||
struct sequence_sort;
|
||||
|
||||
template <index_t... Is>
|
||||
struct sequence_sort<Sequence<Is...>>
|
||||
{
|
||||
using OriginalSeqType = Sequence<Is...>;
|
||||
using SortedSeqType = xxxxx;
|
||||
using MapSorted2OriginalType = xxx;
|
||||
};
|
||||
|
||||
template <class Seq, class IsValidSeqMap>
|
||||
struct sequence_map_inverse_impl;
|
||||
|
||||
// impl for valid map, no impl for invalid map
|
||||
template <index_t... Is>
|
||||
struct sequence_map_inverse_impl<Sequence<Is...>, true>
|
||||
{
|
||||
using SeqMapType = sequence_sort<Sequence<Is...>>::MapSorted2OriginalType;
|
||||
};
|
||||
|
||||
template <class>
|
||||
struct sequence_map_inverse;
|
||||
|
||||
template <class Is...>
|
||||
struct sequence_map_inverse<Sequence<Is...>>
|
||||
{
|
||||
// TODO: make sure the map to be inversed is valid: [0, sizeof...(Is))
|
||||
static constexpr bool is_valid_sequence_map =
|
||||
is_same<typename sequence_sort<Sequence<Is...>>::SortedSeqType,
|
||||
typename arithmetic_sequence_gen<0, sizeof...(Is), 1>::SeqType>::value;
|
||||
|
||||
// make compiler fails, if is_valid_map != true
|
||||
using SeqMapType =
|
||||
typename sequence_map_inverse_impl<Sequence<Is...>, is_valid_map>::SeqMapType;
|
||||
};
|
||||
#endif
|
||||
|
||||
template <index_t... Xs, index_t... Ys>
|
||||
__host__ __device__ constexpr auto operator+(Sequence<Xs...>, Sequence<Ys...>)
|
||||
{
|
||||
@@ -243,7 +317,7 @@ __host__ __device__ constexpr auto operator+(Sequence<Xs...>, Number<Y>)
|
||||
template <index_t... Xs, index_t Y>
|
||||
__host__ __device__ constexpr auto operator-(Sequence<Xs...>, Number<Y>)
|
||||
{
|
||||
#if 0 // doesn't compile
|
||||
#if 0 // TODO: turn it on. Doesn't compile
|
||||
constexpr auto seq_x = Sequence<Xs...>{};
|
||||
|
||||
static_for<0, sizeof...(Xs), 1>{}([&](auto Iter) {
|
||||
@@ -313,14 +387,13 @@ __host__ __device__ constexpr auto operator%(Number<Y>, Sequence<Xs...>)
|
||||
template <index_t I, index_t... Is>
|
||||
__host__ __device__ constexpr auto sequence_pop_front(Sequence<I, Is...>)
|
||||
{
|
||||
static_assert(sizeof...(Is) > 0, "empty Sequence!");
|
||||
return Sequence<Is...>{};
|
||||
}
|
||||
|
||||
template <class Seq>
|
||||
__host__ __device__ constexpr auto sequence_pop_back(Seq)
|
||||
{
|
||||
static_assert(Seq{}.GetSize() > 0, "empty Sequence!");
|
||||
static_assert(Seq{}.GetSize() > 0, "wrong! cannot pop an empty Sequence!");
|
||||
return sequence_pop_front(Seq{}.Reverse()).Reverse();
|
||||
}
|
||||
|
||||
@@ -349,16 +422,16 @@ transform_sequences(F f, Sequence<Xs...>, Sequence<Ys...>, Sequence<Zs...>)
|
||||
return Sequence<f(Xs, Ys, Zs)...>{};
|
||||
}
|
||||
|
||||
template <index_t... Is>
|
||||
__host__ __device__ constexpr auto Sequence<Is...>::PopFront() const
|
||||
template <class Seq, class Reduce>
|
||||
__host__ __device__ constexpr auto reverse_inclusive_scan_sequence(Seq, Reduce)
|
||||
{
|
||||
return sequence_pop_front(Type{});
|
||||
return typename sequence_reverse_inclusive_scan<Seq, Reduce>::SeqType{};
|
||||
}
|
||||
|
||||
template <index_t... Is>
|
||||
__host__ __device__ constexpr auto Sequence<Is...>::PopBack() const
|
||||
template <class Seq, class Reduce>
|
||||
__host__ __device__ constexpr auto inclusive_scan_sequence(Seq, Reduce)
|
||||
{
|
||||
return sequence_pop_back(Type{});
|
||||
return reverse_inclusive_scan_sequence(Seq{}.Reverse(), Reduce{}).Reverse();
|
||||
}
|
||||
|
||||
template <class Seq>
|
||||
@@ -381,19 +454,32 @@ __host__ __device__ constexpr index_t
|
||||
}
|
||||
|
||||
template <index_t... Is>
|
||||
__host__ __device__ constexpr auto Sequence<Is...>::Reverse() const
|
||||
__host__ __device__ constexpr auto Sequence<Is...>::PopFront()
|
||||
{
|
||||
return sequence_pop_front(Type{});
|
||||
}
|
||||
|
||||
template <index_t... Is>
|
||||
__host__ __device__ constexpr auto Sequence<Is...>::PopBack()
|
||||
{
|
||||
return sequence_pop_back(Type{});
|
||||
}
|
||||
|
||||
template <index_t... Is>
|
||||
__host__ __device__ constexpr auto Sequence<Is...>::Reverse()
|
||||
{
|
||||
return typename sequence_reverse<Sequence<Is...>>::SeqType{};
|
||||
}
|
||||
|
||||
template <class Seq, class Reduce>
|
||||
__host__ __device__ constexpr auto reverse_inclusive_scan_sequence(Seq, Reduce)
|
||||
template <index_t... Is>
|
||||
template <index_t I, index_t X>
|
||||
__host__ __device__ constexpr auto Sequence<Is...>::Modify(Number<I>, Number<X>)
|
||||
{
|
||||
return typename sequence_reverse_inclusive_scan<Seq, Reduce>::SeqType{};
|
||||
}
|
||||
static_assert(I < GetSize(), "wrong!");
|
||||
|
||||
template <class Seq, class Reduce>
|
||||
__host__ __device__ constexpr auto inclusive_scan_sequence(Seq, Reduce)
|
||||
{
|
||||
return reverse_inclusive_scan_sequence(Seq{}.Reverse(), Reduce{}).Reverse();
|
||||
using seq_split = sequence_split<Type, I>;
|
||||
constexpr auto seq_left = typename seq_split::SeqType0{};
|
||||
constexpr auto seq_right = typename seq_split::SeqType1{}.PopFront();
|
||||
|
||||
return seq_left.PushBack(Number<X>{}).Append(seq_right);
|
||||
}
|
||||
|
||||
@@ -33,7 +33,7 @@ blockwise_2d_tensor_pointwise_operation_unary(DstDesc, Float* __restrict__ p_dst
|
||||
|
||||
const index_t did1 = is / desc.GetStride(I1);
|
||||
|
||||
const index_t dindex = dst_desc.Get1dIndex(did0, did1);
|
||||
const index_t dindex = dst_desc.GetOffsetFromMultiIndex(did0, did1);
|
||||
|
||||
f(p_dst[dindex]);
|
||||
}
|
||||
@@ -52,7 +52,7 @@ blockwise_2d_tensor_pointwise_operation_unary(DstDesc, Float* __restrict__ p_dst
|
||||
|
||||
const index_t did1 = is / desc.GetStride(I1);
|
||||
|
||||
const index_t dindex = dst_desc.Get1dIndex(did0, did1);
|
||||
const index_t dindex = dst_desc.GetOffsetFromMultiIndex(did0, did1);
|
||||
|
||||
f(p_dst[dindex]);
|
||||
}
|
||||
@@ -102,9 +102,9 @@ __device__ void blockwise_2d_tensor_pointwise_operation_binary_reorder_by_get_ds
|
||||
|
||||
did[1] = is / ref_desc.GetStride(I1);
|
||||
|
||||
const index_t aindex = src_desc.Get1dIndex(did[0], did[1]);
|
||||
const index_t aindex = src_desc.GetOffsetFromMultiIndex(did[0], did[1]);
|
||||
|
||||
const index_t bindex = dst_desc.Get1dIndex(did[IR0], did[IR1]);
|
||||
const index_t bindex = dst_desc.GetOffsetFromMultiIndex(did[IR0], did[IR1]);
|
||||
|
||||
f(p_src[aindex], p_dst[bindex]);
|
||||
}
|
||||
@@ -125,9 +125,9 @@ __device__ void blockwise_2d_tensor_pointwise_operation_binary_reorder_by_get_ds
|
||||
|
||||
did[1] = is / ref_desc.GetStride(I1);
|
||||
|
||||
const index_t aindex = src_desc.Get1dIndex(did[0], did[1]);
|
||||
const index_t aindex = src_desc.GetOffsetFromMultiIndex(did[0], did[1]);
|
||||
|
||||
const index_t bindex = dst_desc.Get1dIndex(did[IR0], did[IR1]);
|
||||
const index_t bindex = dst_desc.GetOffsetFromMultiIndex(did[IR0], did[IR1]);
|
||||
|
||||
f(p_src[aindex], p_dst[bindex]);
|
||||
}
|
||||
@@ -224,8 +224,10 @@ struct Blockwise2dTensorCopy1
|
||||
|
||||
did[1] = is / ref_desc.GetStride(I1);
|
||||
|
||||
const index_t src_index = src_desc.Get1dIndex(did[0], did[1] * DataPerRead);
|
||||
const index_t dst_index = dst_desc.Get1dIndex(did[0], did[1] * DataPerRead);
|
||||
const index_t src_index =
|
||||
src_desc.GetOffsetFromMultiIndex(did[0], did[1] * DataPerRead);
|
||||
const index_t dst_index =
|
||||
dst_desc.GetOffsetFromMultiIndex(did[0], did[1] * DataPerRead);
|
||||
|
||||
*(reinterpret_cast<vector_t*>(p_dst + dst_index)) =
|
||||
*(reinterpret_cast<const vector_t*>(p_src + src_index));
|
||||
@@ -328,8 +330,8 @@ struct Blockwise2dTensorCopy2
|
||||
{
|
||||
index_t did1 = d1v4loop * 4 * ThreadPerDim1 + 4 * mThreadId1;
|
||||
|
||||
const index_t sindex = src_desc.Get1dIndex(did0, did1);
|
||||
const index_t dindex = dst_desc.Get1dIndex(did0, did1);
|
||||
const index_t sindex = src_desc.GetOffsetFromMultiIndex(did0, did1);
|
||||
const index_t dindex = dst_desc.GetOffsetFromMultiIndex(did0, did1);
|
||||
|
||||
*(reinterpret_cast<Float4*>(p_dst + dindex)) =
|
||||
*(reinterpret_cast<const Float4*>(p_src + sindex));
|
||||
@@ -341,8 +343,8 @@ struct Blockwise2dTensorCopy2
|
||||
index_t did1 =
|
||||
Dim1V4Loop * 4 * ThreadPerDim1 + d1v2loop * 2 * ThreadPerDim1 + 2 * mThreadId1;
|
||||
|
||||
const index_t sindex = src_desc.Get1dIndex(did0, did1);
|
||||
const index_t dindex = dst_desc.Get1dIndex(did0, did1);
|
||||
const index_t sindex = src_desc.GetOffsetFromMultiIndex(did0, did1);
|
||||
const index_t dindex = dst_desc.GetOffsetFromMultiIndex(did0, did1);
|
||||
|
||||
*(reinterpret_cast<Float2*>(p_dst + dindex)) =
|
||||
*(reinterpret_cast<const Float2*>(p_src + sindex));
|
||||
@@ -354,8 +356,8 @@ struct Blockwise2dTensorCopy2
|
||||
index_t did1 = Dim1V4Loop * 4 * ThreadPerDim1 + Dim1V2Loop * 2 * ThreadPerDim1 +
|
||||
d1v1loop * ThreadPerDim1 + mThreadId1;
|
||||
|
||||
const index_t sindex = src_desc.Get1dIndex(did0, did1);
|
||||
const index_t dindex = dst_desc.Get1dIndex(did0, did1);
|
||||
const index_t sindex = src_desc.GetOffsetFromMultiIndex(did0, did1);
|
||||
const index_t dindex = dst_desc.GetOffsetFromMultiIndex(did0, did1);
|
||||
|
||||
p_dst[dindex] = p_src[sindex];
|
||||
}
|
||||
@@ -368,8 +370,8 @@ struct Blockwise2dTensorCopy2
|
||||
|
||||
if(did1 < L1)
|
||||
{
|
||||
const index_t sindex = src_desc.Get1dIndex(did0, did1);
|
||||
const index_t dindex = dst_desc.Get1dIndex(did0, did1);
|
||||
const index_t sindex = src_desc.GetOffsetFromMultiIndex(did0, did1);
|
||||
const index_t dindex = dst_desc.GetOffsetFromMultiIndex(did0, did1);
|
||||
|
||||
p_dst[dindex] = p_src[sindex];
|
||||
}
|
||||
@@ -389,8 +391,8 @@ struct Blockwise2dTensorCopy2
|
||||
{
|
||||
index_t did1 = d1v4loop * 4 * ThreadPerDim1 + 4 * mThreadId1;
|
||||
|
||||
const index_t sindex = src_desc.Get1dIndex(did0, did1);
|
||||
const index_t dindex = dst_desc.Get1dIndex(did0, did1);
|
||||
const index_t sindex = src_desc.GetOffsetFromMultiIndex(did0, did1);
|
||||
const index_t dindex = dst_desc.GetOffsetFromMultiIndex(did0, did1);
|
||||
|
||||
*(reinterpret_cast<Float4*>(p_dst + dindex)) =
|
||||
*(reinterpret_cast<const Float4*>(p_src + sindex));
|
||||
@@ -402,8 +404,8 @@ struct Blockwise2dTensorCopy2
|
||||
index_t did1 = Dim1V4Loop * 4 * ThreadPerDim1 + d1v2loop * 2 * ThreadPerDim1 +
|
||||
2 * mThreadId1;
|
||||
|
||||
const index_t sindex = src_desc.Get1dIndex(did0, did1);
|
||||
const index_t dindex = dst_desc.Get1dIndex(did0, did1);
|
||||
const index_t sindex = src_desc.GetOffsetFromMultiIndex(did0, did1);
|
||||
const index_t dindex = dst_desc.GetOffsetFromMultiIndex(did0, did1);
|
||||
|
||||
*(reinterpret_cast<Float2*>(p_dst + dindex)) =
|
||||
*(reinterpret_cast<const Float2*>(p_src + sindex));
|
||||
@@ -415,8 +417,8 @@ struct Blockwise2dTensorCopy2
|
||||
index_t did1 = Dim1V4Loop * 4 * ThreadPerDim1 + Dim1V2Loop * 2 * ThreadPerDim1 +
|
||||
d1v1loop * ThreadPerDim1 + mThreadId1;
|
||||
|
||||
const index_t sindex = src_desc.Get1dIndex(did0, did1);
|
||||
const index_t dindex = dst_desc.Get1dIndex(did0, did1);
|
||||
const index_t sindex = src_desc.GetOffsetFromMultiIndex(did0, did1);
|
||||
const index_t dindex = dst_desc.GetOffsetFromMultiIndex(did0, did1);
|
||||
|
||||
p_dst[dindex] = p_src[sindex];
|
||||
}
|
||||
@@ -429,8 +431,8 @@ struct Blockwise2dTensorCopy2
|
||||
|
||||
if(did1 < L1)
|
||||
{
|
||||
const index_t sindex = src_desc.Get1dIndex(did0, did1);
|
||||
const index_t dindex = dst_desc.Get1dIndex(did0, did1);
|
||||
const index_t sindex = src_desc.GetOffsetFromMultiIndex(did0, did1);
|
||||
const index_t dindex = dst_desc.GetOffsetFromMultiIndex(did0, did1);
|
||||
|
||||
p_dst[dindex] = p_src[sindex];
|
||||
}
|
||||
@@ -497,8 +499,10 @@ struct Blockwise2dTensorCopy3
|
||||
const index_t thread_id_d0 = get_thread_local_1d_id() / thread_per_d1;
|
||||
const index_t thread_id_d1 = get_thread_local_1d_id() - thread_id_d0 * thread_per_d1;
|
||||
|
||||
mSrcMyThreadOffset = SrcDesc{}.Get1dIndex(thread_id_d0, thread_id_d1 * DataPerRead);
|
||||
mDstMyThreadOffset = DstDesc{}.Get1dIndex(thread_id_d0, thread_id_d1 * DataPerRead);
|
||||
mSrcMyThreadOffset =
|
||||
SrcDesc{}.GetOffsetFromMultiIndex(thread_id_d0, thread_id_d1 * DataPerRead);
|
||||
mDstMyThreadOffset =
|
||||
DstDesc{}.GetOffsetFromMultiIndex(thread_id_d0, thread_id_d1 * DataPerRead);
|
||||
}
|
||||
|
||||
__device__ void Run(const Float* __restrict__ p_src, Float* __restrict__ p_dst) const
|
||||
|
||||
@@ -71,8 +71,10 @@ struct Blockwise3dTensorCopy1
|
||||
|
||||
did[2] = is / ref_desc.GetStride(I2);
|
||||
|
||||
const index_t src_index = src_desc.Get1dIndex(did[0], did[1], did[2] * DataPerRead);
|
||||
const index_t dst_index = dst_desc.Get1dIndex(did[0], did[1], did[2] * DataPerRead);
|
||||
const index_t src_index =
|
||||
src_desc.GetOffsetFromMultiIndex(did[0], did[1], did[2] * DataPerRead);
|
||||
const index_t dst_index =
|
||||
dst_desc.GetOffsetFromMultiIndex(did[0], did[1], did[2] * DataPerRead);
|
||||
|
||||
*(reinterpret_cast<vector_t*>(p_dst + dst_index)) =
|
||||
*(reinterpret_cast<const vector_t*>(p_src + src_index));
|
||||
@@ -167,12 +169,13 @@ struct Blockwise3dTensorCopy3
|
||||
}
|
||||
|
||||
constexpr auto thread_cluster_desc = make_ConstantTensorDescriptor(ThreadPerDims{});
|
||||
const auto thread_multi_id = thread_cluster_desc.GetMultiIndex(get_thread_local_1d_id());
|
||||
const auto thread_multi_id =
|
||||
thread_cluster_desc.GetMultiIndexFrom1dIndex(get_thread_local_1d_id());
|
||||
|
||||
mSrcMyThreadOffset = SrcDesc{}.Get1dIndex(
|
||||
mSrcMyThreadOffset = SrcDesc{}.GetOffsetFromMultiIndex(
|
||||
thread_multi_id[0], thread_multi_id[1], thread_multi_id[2] * DataPerRead);
|
||||
|
||||
mDstMyThreadOffset = DstDesc{}.Get1dIndex(
|
||||
mDstMyThreadOffset = DstDesc{}.GetOffsetFromMultiIndex(
|
||||
thread_multi_id[0], thread_multi_id[1], thread_multi_id[2] * DataPerRead);
|
||||
}
|
||||
|
||||
@@ -214,14 +217,14 @@ struct Blockwise3dTensorCopy3
|
||||
for(index_t iloop_d2 = 0; iloop_d2 < nloop_d2; ++iloop_d2)
|
||||
{
|
||||
const index_t src_offset =
|
||||
SrcDesc{}.Get1dIndex(iloop_d0 * thread_per_d0,
|
||||
iloop_d1 * thread_per_d1,
|
||||
iloop_d2 * thread_per_d2 * DataPerRead);
|
||||
SrcDesc{}.GetOffsetFromMultiIndex(iloop_d0 * thread_per_d0,
|
||||
iloop_d1 * thread_per_d1,
|
||||
iloop_d2 * thread_per_d2 * DataPerRead);
|
||||
|
||||
const index_t dst_offset =
|
||||
DstDesc{}.Get1dIndex(iloop_d0 * thread_per_d0,
|
||||
iloop_d1 * thread_per_d1,
|
||||
iloop_d2 * thread_per_d2 * DataPerRead);
|
||||
DstDesc{}.GetOffsetFromMultiIndex(iloop_d0 * thread_per_d0,
|
||||
iloop_d1 * thread_per_d1,
|
||||
iloop_d2 * thread_per_d2 * DataPerRead);
|
||||
|
||||
*(reinterpret_cast<vector_t*>(&p_dst[dst_offset + mDstMyThreadOffset])) = *(
|
||||
reinterpret_cast<const vector_t*>(&p_src[src_offset + mSrcMyThreadOffset]));
|
||||
@@ -295,12 +298,12 @@ struct Blockwise3dTensorCopy3
|
||||
for(index_t iloop_d2 = 0; iloop_d2 < nloop_d2; ++iloop_d2)
|
||||
{
|
||||
const index_t src_offset =
|
||||
SrcDesc{}.Get1dIndex(iloop_d0 * thread_per_d0,
|
||||
iloop_d1 * thread_per_d1,
|
||||
iloop_d2 * thread_per_d2 * DataPerRead);
|
||||
SrcDesc{}.GetOffsetFromMultiIndex(iloop_d0 * thread_per_d0,
|
||||
iloop_d1 * thread_per_d1,
|
||||
iloop_d2 * thread_per_d2 * DataPerRead);
|
||||
|
||||
const index_t clipboard_offset =
|
||||
clipboard_desc.Get1dIndex(iloop_d0, iloop_d1, iloop_d2 * DataPerRead);
|
||||
const index_t clipboard_offset = clipboard_desc.GetOffsetFromMultiIndex(
|
||||
iloop_d0, iloop_d1, iloop_d2 * DataPerRead);
|
||||
|
||||
*(reinterpret_cast<vector_t*>(&p_clipboard[clipboard_offset])) = *(
|
||||
reinterpret_cast<const vector_t*>(&p_src[src_offset + mSrcMyThreadOffset]));
|
||||
@@ -350,13 +353,13 @@ struct Blockwise3dTensorCopy3
|
||||
#pragma unroll
|
||||
for(index_t iloop_d2 = 0; iloop_d2 < nloop_d2; ++iloop_d2)
|
||||
{
|
||||
const index_t clipboard_offset =
|
||||
clipboard_desc.Get1dIndex(iloop_d0, iloop_d1, iloop_d2 * DataPerRead);
|
||||
const index_t clipboard_offset = clipboard_desc.GetOffsetFromMultiIndex(
|
||||
iloop_d0, iloop_d1, iloop_d2 * DataPerRead);
|
||||
|
||||
const index_t dst_offset =
|
||||
DstDesc{}.Get1dIndex(iloop_d0 * thread_per_d0,
|
||||
iloop_d1 * thread_per_d1,
|
||||
iloop_d2 * thread_per_d2 * DataPerRead);
|
||||
DstDesc{}.GetOffsetFromMultiIndex(iloop_d0 * thread_per_d0,
|
||||
iloop_d1 * thread_per_d1,
|
||||
iloop_d2 * thread_per_d2 * DataPerRead);
|
||||
|
||||
*(reinterpret_cast<vector_t*>(&p_dst[dst_offset + mDstMyThreadOffset])) =
|
||||
*(reinterpret_cast<const vector_t*>(&p_clipboard[clipboard_offset]));
|
||||
|
||||
@@ -13,7 +13,7 @@ blockwise_4d_tensor_pointwise_operation_unary(DstDesc, Float* __restrict__ p_dst
|
||||
|
||||
constexpr auto dst_desc = DstDesc{};
|
||||
|
||||
constexpr auto desc = make_ConstantTensorDescriptor(dst_desc.GetLengths());
|
||||
constexpr auto desc = make_packed_ConstantTensorDescriptor(dst_desc.GetLengths());
|
||||
|
||||
#if 0
|
||||
if(get_thread_local_1d_id() == 0)
|
||||
@@ -43,7 +43,7 @@ blockwise_4d_tensor_pointwise_operation_unary(DstDesc, Float* __restrict__ p_dst
|
||||
|
||||
const index_t did3 = is / desc.GetStride(I3);
|
||||
|
||||
const index_t dindex = dst_desc.Get1dIndex(did0, did1, did2, did3);
|
||||
const index_t dindex = dst_desc.GetOffsetFromMultiIndex(did0, did1, did2, did3);
|
||||
|
||||
f(p_dst[dindex]);
|
||||
}
|
||||
@@ -70,7 +70,7 @@ blockwise_4d_tensor_pointwise_operation_unary(DstDesc, Float* __restrict__ p_dst
|
||||
|
||||
const index_t did3 = is / desc.GetStride(I3);
|
||||
|
||||
const index_t dindex = dst_desc.Get1dIndex(did0, did1, did2, did3);
|
||||
const index_t dindex = dst_desc.GetOffsetFromMultiIndex(did0, did1, did2, did3);
|
||||
|
||||
f(p_dst[dindex]);
|
||||
}
|
||||
@@ -108,7 +108,7 @@ __device__ void blockwise_4d_tensor_pointwise_operation_binary_reorder_by_get_ds
|
||||
|
||||
constexpr auto src_desc = SrcDesc{};
|
||||
constexpr auto dst_desc = DstDesc{};
|
||||
constexpr auto ref_desc = make_ConstantTensorDescriptor(SrcOpLengths{});
|
||||
constexpr auto ref_desc = make_packed_ConstantTensorDescriptor(SrcOpLengths{});
|
||||
|
||||
constexpr index_t NLoop = ref_desc.GetElementSize() / BlockSize;
|
||||
|
||||
@@ -132,9 +132,10 @@ __device__ void blockwise_4d_tensor_pointwise_operation_binary_reorder_by_get_ds
|
||||
|
||||
did[3] = is / ref_desc.GetStride(I3);
|
||||
|
||||
const index_t src_index = src_desc.Get1dIndex(did[0], did[1], did[2], did[3]);
|
||||
const index_t src_index = src_desc.GetOffsetFromMultiIndex(did[0], did[1], did[2], did[3]);
|
||||
|
||||
const index_t dst_index = dst_desc.Get1dIndex(did[IR0], did[IR1], did[IR2], did[IR3]);
|
||||
const index_t dst_index =
|
||||
dst_desc.GetOffsetFromMultiIndex(did[IR0], did[IR1], did[IR2], did[IR3]);
|
||||
|
||||
f(p_src[src_index], p_dst[dst_index]);
|
||||
}
|
||||
@@ -163,9 +164,11 @@ __device__ void blockwise_4d_tensor_pointwise_operation_binary_reorder_by_get_ds
|
||||
|
||||
did[3] = is / ref_desc.GetStride(I3);
|
||||
|
||||
const index_t src_index = src_desc.Get1dIndex(did[0], did[1], did[2], did[3]);
|
||||
const index_t src_index =
|
||||
src_desc.GetOffsetFromMultiIndex(did[0], did[1], did[2], did[3]);
|
||||
|
||||
const index_t dst_index = dst_desc.Get1dIndex(did[IR0], did[IR1], did[IR2], did[IR3]);
|
||||
const index_t dst_index =
|
||||
dst_desc.GetOffsetFromMultiIndex(did[IR0], did[IR1], did[IR2], did[IR3]);
|
||||
|
||||
f(p_src[src_index], p_dst[dst_index]);
|
||||
}
|
||||
@@ -256,7 +259,7 @@ struct Blockwise4dTensorCopy1
|
||||
constexpr index_t read_per_d3 = mod_conv::integer_divide_ceil(L3, DataPerRead);
|
||||
|
||||
constexpr auto ref_desc =
|
||||
make_ConstantTensorDescriptor(Sequence<L0, L1, L2, read_per_d3>{});
|
||||
make_packed_ConstantTensorDescriptor(Sequence<L0, L1, L2, read_per_d3>{});
|
||||
|
||||
constexpr index_t NLoop = ref_desc.GetElementSize() / BlockSize;
|
||||
|
||||
@@ -278,9 +281,9 @@ struct Blockwise4dTensorCopy1
|
||||
did[3] = is / ref_desc.GetStride(I3);
|
||||
|
||||
const index_t src_index =
|
||||
src_desc.Get1dIndex(did[0], did[1], did[2], did[3] * DataPerRead);
|
||||
src_desc.GetOffsetFromMultiIndex(did[0], did[1], did[2], did[3] * DataPerRead);
|
||||
const index_t dst_index =
|
||||
dst_desc.Get1dIndex(did[0], did[1], did[2], did[3] * DataPerRead);
|
||||
dst_desc.GetOffsetFromMultiIndex(did[0], did[1], did[2], did[3] * DataPerRead);
|
||||
|
||||
*(reinterpret_cast<vector_t*>(p_dst + dst_index)) =
|
||||
*(reinterpret_cast<const vector_t*>(p_src + src_index));
|
||||
@@ -333,19 +336,19 @@ struct BlockwiseChwnTensorCopyPadded
|
||||
|
||||
constexpr auto src_desc = SrcDesc{};
|
||||
constexpr auto dst_desc = DstDesc{};
|
||||
constexpr auto ref_desc = make_ConstantTensorDescriptor(DstOpLengths{});
|
||||
constexpr auto ref_desc = make_packed_ConstantTensorDescriptor(DstOpLengths{});
|
||||
|
||||
constexpr auto h_global_pad_low = GlobalLowerPads{}.Get(I0);
|
||||
constexpr auto w_global_pad_low = GlobalLowerPads{}.Get(I1);
|
||||
|
||||
constexpr index_t NLoop = ref_desc.GetElementSize() / BlockSize;
|
||||
|
||||
const Float* p_src_tmp =
|
||||
p_src +
|
||||
src_desc.Get1dIndex(c_block_data_begin,
|
||||
(ho_block_data_begin + h_block_pad_low) - h_global_pad_low,
|
||||
(wo_block_data_begin + w_block_pad_low) - w_global_pad_low,
|
||||
n_block_data_begin);
|
||||
const Float* p_src_tmp = p_src +
|
||||
src_desc.GetOffsetFromMultiIndex(
|
||||
c_block_data_begin,
|
||||
(ho_block_data_begin + h_block_pad_low) - h_global_pad_low,
|
||||
(wo_block_data_begin + w_block_pad_low) - w_global_pad_low,
|
||||
n_block_data_begin);
|
||||
|
||||
#if 0
|
||||
if(get_thread_local_1d_id() == 0)
|
||||
@@ -389,13 +392,13 @@ struct BlockwiseChwnTensorCopyPadded
|
||||
|
||||
did[3] = is / ref_desc.GetStride(I3);
|
||||
|
||||
const index_t bindex = dst_desc.Get1dIndex(did[0], did[1], did[2], did[3]);
|
||||
const index_t bindex = dst_desc.GetOffsetFromMultiIndex(did[0], did[1], did[2], did[3]);
|
||||
|
||||
p_dst[bindex] =
|
||||
(did[1] < h_block_pad_low || did[1] + h_block_pad_up >= ref_desc.GetLength(I1) ||
|
||||
did[2] < w_block_pad_low || did[2] + w_block_pad_up >= ref_desc.GetLength(I2))
|
||||
? Float(0)
|
||||
: p_src_tmp[src_desc.Get1dIndex(did[0], did[1], did[2], did[3])];
|
||||
: p_src_tmp[src_desc.GetOffsetFromMultiIndex(did[0], did[1], did[2], did[3])];
|
||||
}
|
||||
|
||||
constexpr bool has_tail = (ref_desc.GetElementSize() > NLoop * BlockSize);
|
||||
@@ -422,14 +425,16 @@ struct BlockwiseChwnTensorCopyPadded
|
||||
|
||||
did[3] = is / ref_desc.GetStride(I3);
|
||||
|
||||
const index_t bindex = dst_desc.Get1dIndex(did[0], did[1], did[2], did[3]);
|
||||
const index_t bindex =
|
||||
dst_desc.GetOffsetFromMultiIndex(did[0], did[1], did[2], did[3]);
|
||||
|
||||
p_dst[bindex] =
|
||||
(did[1] < h_block_pad_low ||
|
||||
did[1] + h_block_pad_up >= ref_desc.GetLength(I1) ||
|
||||
did[2] < w_block_pad_low || did[2] + w_block_pad_up >= ref_desc.GetLength(I2))
|
||||
? Float(0)
|
||||
: p_src_tmp[src_desc.Get1dIndex(did[0], did[1], did[2], did[3])];
|
||||
: p_src_tmp[src_desc.GetOffsetFromMultiIndex(
|
||||
did[0], did[1], did[2], did[3])];
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -505,18 +510,19 @@ struct Blockwise4dTensorCopy3
|
||||
}
|
||||
}
|
||||
|
||||
constexpr auto thread_cluster_desc = make_ConstantTensorDescriptor(ThreadPerDims{});
|
||||
const auto thread_multi_id = thread_cluster_desc.GetMultiIndex(get_thread_local_1d_id());
|
||||
constexpr auto thread_cluster_desc = make_packed_ConstantTensorDescriptor(ThreadPerDims{});
|
||||
const auto thread_multi_id =
|
||||
thread_cluster_desc.GetMultiIndexFrom1dIndex(get_thread_local_1d_id());
|
||||
|
||||
mSrcMyThreadOffset = SrcDesc{}.Get1dIndex(thread_multi_id[0],
|
||||
thread_multi_id[1],
|
||||
thread_multi_id[2],
|
||||
thread_multi_id[3] * DataPerRead);
|
||||
mSrcMyThreadOffset = SrcDesc{}.GetOffsetFromMultiIndex(thread_multi_id[0],
|
||||
thread_multi_id[1],
|
||||
thread_multi_id[2],
|
||||
thread_multi_id[3] * DataPerRead);
|
||||
|
||||
mDstMyThreadOffset = DstDesc{}.Get1dIndex(thread_multi_id[0],
|
||||
thread_multi_id[1],
|
||||
thread_multi_id[2],
|
||||
thread_multi_id[3] * DataPerRead);
|
||||
mDstMyThreadOffset = DstDesc{}.GetOffsetFromMultiIndex(thread_multi_id[0],
|
||||
thread_multi_id[1],
|
||||
thread_multi_id[2],
|
||||
thread_multi_id[3] * DataPerRead);
|
||||
}
|
||||
|
||||
__device__ void Run(const Float* __restrict__ p_src, Float* __restrict__ p_dst) const
|
||||
@@ -564,17 +570,17 @@ struct Blockwise4dTensorCopy3
|
||||
#pragma unroll
|
||||
for(index_t iloop_d3 = 0; iloop_d3 < nloop_d3; ++iloop_d3)
|
||||
{
|
||||
const index_t src_offset =
|
||||
SrcDesc{}.Get1dIndex(iloop_d0 * thread_per_d0,
|
||||
iloop_d1 * thread_per_d1,
|
||||
iloop_d2 * thread_per_d2,
|
||||
iloop_d3 * thread_per_d3 * DataPerRead);
|
||||
const index_t src_offset = SrcDesc{}.GetOffsetFromMultiIndex(
|
||||
iloop_d0 * thread_per_d0,
|
||||
iloop_d1 * thread_per_d1,
|
||||
iloop_d2 * thread_per_d2,
|
||||
iloop_d3 * thread_per_d3 * DataPerRead);
|
||||
|
||||
const index_t dst_offset =
|
||||
DstDesc{}.Get1dIndex(iloop_d0 * thread_per_d0,
|
||||
iloop_d1 * thread_per_d1,
|
||||
iloop_d2 * thread_per_d2,
|
||||
iloop_d3 * thread_per_d3 * DataPerRead);
|
||||
const index_t dst_offset = DstDesc{}.GetOffsetFromMultiIndex(
|
||||
iloop_d0 * thread_per_d0,
|
||||
iloop_d1 * thread_per_d1,
|
||||
iloop_d2 * thread_per_d2,
|
||||
iloop_d3 * thread_per_d3 * DataPerRead);
|
||||
|
||||
*(reinterpret_cast<vector_t*>(&p_dst[dst_offset + mDstMyThreadOffset])) =
|
||||
*(reinterpret_cast<const vector_t*>(
|
||||
@@ -646,7 +652,7 @@ struct Blockwise4dTensorCopy3
|
||||
constexpr index_t nloop_d2 = L2 / thread_per_d2;
|
||||
constexpr index_t nloop_d3 = mod_conv::integer_divide_ceil(L3, thread_per_d3 * DataPerRead);
|
||||
|
||||
constexpr auto clipboard_desc = make_ConstantTensorDescriptor(
|
||||
constexpr auto clipboard_desc = make_packed_ConstantTensorDescriptor(
|
||||
Sequence<nloop_d0, nloop_d1, nloop_d2, nloop_d3 * DataPerRead>{});
|
||||
|
||||
#pragma unroll
|
||||
@@ -661,13 +667,13 @@ struct Blockwise4dTensorCopy3
|
||||
#pragma unroll
|
||||
for(index_t iloop_d3 = 0; iloop_d3 < nloop_d3; ++iloop_d3)
|
||||
{
|
||||
const index_t src_offset =
|
||||
SrcDesc{}.Get1dIndex(iloop_d0 * thread_per_d0,
|
||||
iloop_d1 * thread_per_d1,
|
||||
iloop_d2 * thread_per_d2,
|
||||
iloop_d3 * thread_per_d3 * DataPerRead);
|
||||
const index_t src_offset = SrcDesc{}.GetOffsetFromMultiIndex(
|
||||
iloop_d0 * thread_per_d0,
|
||||
iloop_d1 * thread_per_d1,
|
||||
iloop_d2 * thread_per_d2,
|
||||
iloop_d3 * thread_per_d3 * DataPerRead);
|
||||
|
||||
const index_t clipboard_offset = clipboard_desc.Get1dIndex(
|
||||
const index_t clipboard_offset = clipboard_desc.GetOffsetFromMultiIndex(
|
||||
iloop_d0, iloop_d1, iloop_d2, iloop_d3 * DataPerRead);
|
||||
|
||||
*(reinterpret_cast<vector_t*>(&p_clipboard[clipboard_offset])) =
|
||||
@@ -713,7 +719,7 @@ struct Blockwise4dTensorCopy3
|
||||
constexpr index_t nloop_d2 = L2 / thread_per_d2;
|
||||
constexpr index_t nloop_d3 = mod_conv::integer_divide_ceil(L3, thread_per_d3 * DataPerRead);
|
||||
|
||||
constexpr auto clipboard_desc = make_ConstantTensorDescriptor(
|
||||
constexpr auto clipboard_desc = make_packed_ConstantTensorDescriptor(
|
||||
Sequence<nloop_d0, nloop_d1, nloop_d2, nloop_d3 * DataPerRead>{});
|
||||
|
||||
#pragma unroll
|
||||
@@ -728,14 +734,14 @@ struct Blockwise4dTensorCopy3
|
||||
#pragma unroll
|
||||
for(index_t iloop_d3 = 0; iloop_d3 < nloop_d3; ++iloop_d3)
|
||||
{
|
||||
const index_t clipboard_offset = clipboard_desc.Get1dIndex(
|
||||
const index_t clipboard_offset = clipboard_desc.GetOffsetFromMultiIndex(
|
||||
iloop_d0, iloop_d1, iloop_d2, iloop_d3 * DataPerRead);
|
||||
|
||||
const index_t dst_offset =
|
||||
DstDesc{}.Get1dIndex(iloop_d0 * thread_per_d0,
|
||||
iloop_d1 * thread_per_d1,
|
||||
iloop_d2 * thread_per_d2,
|
||||
iloop_d3 * thread_per_d3 * DataPerRead);
|
||||
const index_t dst_offset = DstDesc{}.GetOffsetFromMultiIndex(
|
||||
iloop_d0 * thread_per_d0,
|
||||
iloop_d1 * thread_per_d1,
|
||||
iloop_d2 * thread_per_d2,
|
||||
iloop_d3 * thread_per_d3 * DataPerRead);
|
||||
|
||||
*(reinterpret_cast<vector_t*>(&p_dst[dst_offset + mDstMyThreadOffset])) =
|
||||
*(reinterpret_cast<const vector_t*>(&p_clipboard[clipboard_offset]));
|
||||
|
||||
@@ -87,10 +87,10 @@ struct BlockwiseBatchGemmBlockABlockBThreadCTransANormalBNormalC_V2
|
||||
const auto c_thread_mtx_index = GetBeginOfThreadMatrixC(get_thread_local_1d_id());
|
||||
|
||||
mMyThreadOffsetA = c_thread_mtx_index.batch * BlockMatrixStrideA +
|
||||
a_block_mtx.Get1dIndex(0, c_thread_mtx_index.row);
|
||||
a_block_mtx.GetOffsetFromMultiIndex(0, c_thread_mtx_index.row);
|
||||
|
||||
mMyThreadOffsetB = c_thread_mtx_index.batch * BlockMatrixStrideB +
|
||||
b_block_mtx.Get1dIndex(0, c_thread_mtx_index.col);
|
||||
b_block_mtx.GetOffsetFromMultiIndex(0, c_thread_mtx_index.col);
|
||||
|
||||
#if 0
|
||||
if(get_thread_local_1d_id() == 0 && get_block_1d_id() == 0)
|
||||
@@ -221,10 +221,12 @@ struct BlockwiseBatchGemmBlockABlockBThreadCTransANormalBNormalC_V2
|
||||
threadwise_matrix_copy(
|
||||
a_block_mtx,
|
||||
p_a_block +
|
||||
a_block_mtx.Get1dIndex(k_begin, m_repeat * MPerLevel1Cluster) +
|
||||
a_block_mtx.GetOffsetFromMultiIndex(k_begin,
|
||||
m_repeat * MPerLevel1Cluster) +
|
||||
ib * BlockMatrixStrideA + mMyThreadOffsetA,
|
||||
a_thread_mtx,
|
||||
p_a_thread + a_thread_mtx.Get1dIndex(0, m_repeat * MPerThreadSubC),
|
||||
p_a_thread +
|
||||
a_thread_mtx.GetOffsetFromMultiIndex(0, m_repeat * MPerThreadSubC),
|
||||
a_thread_sub_mtx.GetLengths(),
|
||||
Number<DataPerReadA>{});
|
||||
}
|
||||
@@ -238,10 +240,12 @@ struct BlockwiseBatchGemmBlockABlockBThreadCTransANormalBNormalC_V2
|
||||
threadwise_matrix_copy(
|
||||
b_block_mtx,
|
||||
p_b_block +
|
||||
b_block_mtx.Get1dIndex(k_begin, n_repeat * NPerLevel1Cluster) +
|
||||
b_block_mtx.GetOffsetFromMultiIndex(k_begin,
|
||||
n_repeat * NPerLevel1Cluster) +
|
||||
ib * BlockMatrixStrideB + mMyThreadOffsetB,
|
||||
b_thread_mtx,
|
||||
p_b_thread + b_thread_mtx.Get1dIndex(0, n_repeat * NPerThreadSubC),
|
||||
p_b_thread +
|
||||
b_thread_mtx.GetOffsetFromMultiIndex(0, n_repeat * NPerThreadSubC),
|
||||
b_thread_sub_mtx.GetLengths(),
|
||||
Number<DataPerReadB>{});
|
||||
}
|
||||
@@ -343,9 +347,11 @@ struct BlockwiseBatchGemmBlockABlockBThreadCTransANormalBNormalC_V2
|
||||
reg_a[0] = *reinterpret_cast<const Float4*>(&p_a_block[mMyThreadOffsetA]);
|
||||
reg_b[0] = *reinterpret_cast<const Float4*>(&p_b_block[mMyThreadOffsetB]);
|
||||
reg_b[1] = *reinterpret_cast<const Float4*>(
|
||||
&p_b_block[b_block_mtx.Get1dIndex(0, NPerLevel1Cluster) + mMyThreadOffsetB]);
|
||||
&p_b_block[b_block_mtx.GetOffsetFromMultiIndex(0, NPerLevel1Cluster) +
|
||||
mMyThreadOffsetB]);
|
||||
reg_a[1] = *reinterpret_cast<const Float4*>(
|
||||
&p_a_block[a_block_mtx.Get1dIndex(0, MPerLevel1Cluster) + mMyThreadOffsetA]);
|
||||
&p_a_block[a_block_mtx.GetOffsetFromMultiIndex(0, MPerLevel1Cluster) +
|
||||
mMyThreadOffsetA]);
|
||||
outerProduct4x4(reg_a[0], reg_b[0], reg_c[0], reg_c[2], reg_c[4], reg_c[6]);
|
||||
outerProduct4x4(reg_a[0], reg_b[1], reg_c[1], reg_c[3], reg_c[5], reg_c[7]);
|
||||
|
||||
@@ -353,15 +359,17 @@ struct BlockwiseBatchGemmBlockABlockBThreadCTransANormalBNormalC_V2
|
||||
for(index_t k = 1; k < K; ++k)
|
||||
{
|
||||
reg_a[0] = *reinterpret_cast<const Float4*>(
|
||||
&p_a_block[a_block_mtx.Get1dIndex(k, 0) + mMyThreadOffsetA]);
|
||||
&p_a_block[a_block_mtx.GetOffsetFromMultiIndex(k, 0) + mMyThreadOffsetA]);
|
||||
outerProduct4x4(reg_a[1], reg_b[0], reg_c[8], reg_c[10], reg_c[12], reg_c[14]);
|
||||
reg_b[0] = *reinterpret_cast<const Float4*>(
|
||||
&p_b_block[b_block_mtx.Get1dIndex(k, 0) + mMyThreadOffsetB]);
|
||||
&p_b_block[b_block_mtx.GetOffsetFromMultiIndex(k, 0) + mMyThreadOffsetB]);
|
||||
outerProduct4x4(reg_a[1], reg_b[1], reg_c[9], reg_c[11], reg_c[13], reg_c[15]);
|
||||
reg_b[1] = *reinterpret_cast<const Float4*>(
|
||||
&p_b_block[b_block_mtx.Get1dIndex(k, NPerLevel1Cluster) + mMyThreadOffsetB]);
|
||||
&p_b_block[b_block_mtx.GetOffsetFromMultiIndex(k, NPerLevel1Cluster) +
|
||||
mMyThreadOffsetB]);
|
||||
reg_a[1] = *reinterpret_cast<const Float4*>(
|
||||
&p_a_block[a_block_mtx.Get1dIndex(k, MPerLevel1Cluster) + mMyThreadOffsetA]);
|
||||
&p_a_block[a_block_mtx.GetOffsetFromMultiIndex(k, MPerLevel1Cluster) +
|
||||
mMyThreadOffsetA]);
|
||||
outerProduct4x4(reg_a[0], reg_b[0], reg_c[0], reg_c[2], reg_c[4], reg_c[6]);
|
||||
outerProduct4x4(reg_a[0], reg_b[1], reg_c[1], reg_c[3], reg_c[5], reg_c[7]);
|
||||
}
|
||||
@@ -489,7 +497,7 @@ struct BlockwiseBatchGemmBlockABlockBThreadCTransANormalBNormalC_V2
|
||||
|
||||
const index_t c_thread_offset =
|
||||
c_thread_mtx_begin.batch * BlockMatrixStrideC +
|
||||
c_block_mtx.Get1dIndex(c_thread_mtx_begin.row, c_thread_mtx_begin.col);
|
||||
c_block_mtx.GetOffsetFromMultiIndex(c_thread_mtx_begin.row, c_thread_mtx_begin.col);
|
||||
|
||||
for(index_t m_repeat = 0; m_repeat < MRepeat; ++m_repeat)
|
||||
{
|
||||
@@ -498,12 +506,12 @@ struct BlockwiseBatchGemmBlockABlockBThreadCTransANormalBNormalC_V2
|
||||
threadwise_matrix_copy(
|
||||
c_thread_sub_mtx,
|
||||
p_c_thread +
|
||||
c_thread_sub_mtx.Get1dIndex(m_repeat * MPerLevel1Cluster,
|
||||
n_repeat * NPerLevel1Cluster),
|
||||
c_thread_sub_mtx.GetOffsetFromMultiIndex(m_repeat * MPerLevel1Cluster,
|
||||
n_repeat * NPerLevel1Cluster),
|
||||
c_block_mtx,
|
||||
p_c_block +
|
||||
c_block_mtx.Get1dIndex(m_repeat * MPerLevel1Cluster,
|
||||
n_repeat * NPerLevel1Cluster) +
|
||||
c_block_mtx.GetOffsetFromMultiIndex(m_repeat * MPerLevel1Cluster,
|
||||
n_repeat * NPerLevel1Cluster) +
|
||||
c_thread_offset,
|
||||
c_thread_sub_mtx.GetLengths());
|
||||
}
|
||||
|
||||
@@ -51,8 +51,8 @@ struct BlockwiseGemmBlockABlockBThreadCTransANormalBNormalC_v2
|
||||
|
||||
auto c_thread_mtx_index = GetBeginOfThreadMatrixC(get_thread_local_1d_id());
|
||||
|
||||
mMyThreadOffsetA = BlockMatrixA::Get1dIndex(0, c_thread_mtx_index.row);
|
||||
mMyThreadOffsetB = BlockMatrixB::Get1dIndex(0, c_thread_mtx_index.col);
|
||||
mMyThreadOffsetA = BlockMatrixA::GetOffsetFromMultiIndex(0, c_thread_mtx_index.row);
|
||||
mMyThreadOffsetB = BlockMatrixB::GetOffsetFromMultiIndex(0, c_thread_mtx_index.col);
|
||||
}
|
||||
|
||||
__device__ static auto GetThreadMatrixCLengths()
|
||||
@@ -248,10 +248,11 @@ struct BlockwiseGemmBlockABlockBThreadCTransANormalBNormalC_v2
|
||||
{
|
||||
threadwise_matrix_copy(
|
||||
a_block_mtx,
|
||||
p_a_block + a_block_mtx.Get1dIndex(k_begin, m_repeat * MPerLevel1Cluster) +
|
||||
p_a_block +
|
||||
a_block_mtx.GetOffsetFromMultiIndex(k_begin, m_repeat * MPerLevel1Cluster) +
|
||||
mMyThreadOffsetA,
|
||||
a_thread_mtx,
|
||||
p_a_thread + a_thread_mtx.Get1dIndex(0, m_repeat * MPerThreadSubC),
|
||||
p_a_thread + a_thread_mtx.GetOffsetFromMultiIndex(0, m_repeat * MPerThreadSubC),
|
||||
a_thread_sub_mtx.GetLengths(),
|
||||
Number<DataPerReadA>{});
|
||||
}
|
||||
@@ -262,10 +263,11 @@ struct BlockwiseGemmBlockABlockBThreadCTransANormalBNormalC_v2
|
||||
{
|
||||
threadwise_matrix_copy(
|
||||
b_block_mtx,
|
||||
p_b_block + b_block_mtx.Get1dIndex(k_begin, n_repeat * NPerLevel1Cluster) +
|
||||
p_b_block +
|
||||
b_block_mtx.GetOffsetFromMultiIndex(k_begin, n_repeat * NPerLevel1Cluster) +
|
||||
mMyThreadOffsetB,
|
||||
b_thread_mtx,
|
||||
p_b_thread + b_thread_mtx.Get1dIndex(0, n_repeat * NPerThreadSubC),
|
||||
p_b_thread + b_thread_mtx.GetOffsetFromMultiIndex(0, n_repeat * NPerThreadSubC),
|
||||
b_thread_sub_mtx.GetLengths(),
|
||||
Number<DataPerReadB>{});
|
||||
}
|
||||
|
||||
@@ -11,7 +11,7 @@ template <index_t BlockSize,
|
||||
class SliceLengths,
|
||||
class SubLengths,
|
||||
class ClusterLengths,
|
||||
class ThreadArrangeOrder,
|
||||
class ThreadClusterArrangeOrder,
|
||||
class SrcAccessOrder,
|
||||
class DstAccessOrder>
|
||||
struct BlockwiseTensorSliceCopy_generic_v1
|
||||
@@ -21,35 +21,142 @@ struct BlockwiseTensorSliceCopy_generic_v1
|
||||
index_t mSrcMyThreadOffset;
|
||||
index_t mDstMyThreadOffset;
|
||||
|
||||
__device__ BlockwiseTensorSliceCopy_generic_v1(Array<index_t, nDim> src_block_multi_id_offset,
|
||||
Array<index_t, nDim> dst_block_multi_id_offset)
|
||||
__device__ BlockwiseTensorSliceCopy_generic_v1(Array<index_t, nDim> src_block_multi_offset,
|
||||
Array<index_t, nDim> dst_block_multi_offset)
|
||||
{
|
||||
// only support SrcSubLengths.GetLength() == 1 on merged dimension, for now
|
||||
// check SrcDataPerRead should be 1, if last dimension is a merged dimension
|
||||
|
||||
// check NDim consistent
|
||||
static_assert(SrcDesc::GetNumOfDimension() == DstDesc::GetNumOfDimension(), "wrong");
|
||||
|
||||
// calculate mSrcMyThreadOffset
|
||||
// calculate mDstMyThreadOffset
|
||||
constexpr auto thread_cluster_desc = make_packed_ConstantTensorDescriptor(
|
||||
ClusterLengths{}.ReorderGivenNew2Old(ThreadClusterArrangeOrder{}));
|
||||
|
||||
// BlockSize
|
||||
static_assert(BlockSize == thread_cluster_desc.GetElementSize(), "wrong! BlockSize");
|
||||
|
||||
// divide work
|
||||
static_for<0, nDim, 1>{}([&](auto IDim) {
|
||||
static_assert(SliceLengths{}.Get(IDim) % SubLenghs{}.Get(IDim) == 0,
|
||||
"wrong! cannot evenly divide sliced tensor into sub-tensor");
|
||||
});
|
||||
|
||||
constexpr auto thread_work_desc =
|
||||
make_packed_ConstantTensorDescriptor(SliceLengths{} / SliceSubLengths{});
|
||||
|
||||
static_for<0, nDim, 1>{}([&](auto IDim) {
|
||||
static_assert(thread_work_desc.GetLength(IDim) % thread_cluster_desc.Get(IDim) == 0,
|
||||
"wrong! cannot evenly divide work to cluster");
|
||||
});
|
||||
|
||||
// only support SubLengths.Get() == 1 on merged dimension, for now
|
||||
static_for<0, nDim, 1>{}([&](auto IDim) {
|
||||
static_if<(SrcDesc::ContainMultipleOriginalDimensions(IDim) ||
|
||||
DstDesc::ContainMultipleOriginalDimensions(IDim))>{}([&](auto fwd) {
|
||||
static_assert(fwd(SubLengths{}).Get(IDim) == 1,
|
||||
"wrong! Sub-Lengths on merged dimension should be 1");
|
||||
});
|
||||
});
|
||||
|
||||
// calculate mSrcMyThreadOffset, mDstMyThreadOffset
|
||||
const auto thread_cluster_multi_id =
|
||||
thread_cluster_desc.GetMultiIndexFrom1dIndex(get_thread_local_1d_id());
|
||||
|
||||
const auto data_cluster_multi_id =
|
||||
reorder_array_given_old2new(thread_cluster_multi_id, ThreadClusterArrangeOrder{});
|
||||
|
||||
const auto thread_data_multi_offset = data_cluster_multi_id * SubLengths{};
|
||||
|
||||
mSrcMythreadOffset =
|
||||
SrcDesc::GetOffsetFromMultiIndex(src_block_multi_offset + thread_data_multi_offset);
|
||||
mSrcMythreadOffset =
|
||||
DstDesc::GetOffsetFromMultiIndex(dst_block_multi_offset + thread_data_multi_offset);
|
||||
}
|
||||
|
||||
__device__ static constexpr index_t GetRegisterClipboardSize() {}
|
||||
__device__ static constexpr index_t GetRegisterClipboardSize()
|
||||
{
|
||||
constexpr auto repeat_lengths = SliceLengths{} / (SubLengths{} * ClusterLengths{});
|
||||
|
||||
constexpr auto thread_tensor_desc =
|
||||
make_packed_ConstantTensorDescriptor(SubLengths{} * repeat_lengths);
|
||||
|
||||
return thread_tensor_desc.GetElementSpaceSize();
|
||||
}
|
||||
|
||||
__device__ void RunLoadRegisterClipboard(const Float* __restrict__ p_src,
|
||||
Float* __restrict__ p_clipboard) const
|
||||
{
|
||||
constexpr auto thread_sub_tensor_lengths = SubLengths{};
|
||||
|
||||
constexpr auto data_per_cluster_per_dims = thread_sub_tensor_lengths * ClusterLengths{};
|
||||
|
||||
constexpr auto repeat_lengths = SliceLengths{} / (SubLengths{} * ClusterLengths{});
|
||||
|
||||
constexpr auto thread_tensor_desc =
|
||||
make_packed_ConstantTensorDescriptor(thread_sub_tensor_lengths * repeat_lengths);
|
||||
|
||||
static_ford<decltype(repeat_lengths)>{}([&](auto repeat_multi_id_) {
|
||||
constexpr auto repeat_multi_id = decltype(repeat_multi_id_){};
|
||||
|
||||
constexpr auto src_data_multi_offset = repeat_multi_id * data_per_cluster_per_dims;
|
||||
|
||||
constexpr auto clipboard_data_multi_offset =
|
||||
repeat_multi_id * thread_sub_tensor_lengths;
|
||||
|
||||
constexpr index_t src_offset = SrcDesc{}.GetOffsetFromMultiIndex(src_data_multi_id);
|
||||
constexpr index_t clipboard_offset =
|
||||
thread_tensor_desc.GetOffsetFromMultiIndex(clipboard_data_multi_id);
|
||||
|
||||
threadwise_tensor_slice_copy_generic(SrcDesc{},
|
||||
p_src + src_offset + mSrcMyThreadOffset,
|
||||
thread_tensor_desc,
|
||||
zero_array<index_t, nDim>{},
|
||||
thread_tensor_desc,
|
||||
p_clipboard + clipboard_offset,
|
||||
zero_array<index_t, nDim>{},
|
||||
thread_sub_tensor_lengths,
|
||||
SrcAccessOrder{});
|
||||
});
|
||||
}
|
||||
|
||||
__device__ void RunStoreRegisterClipboard(const Float* __restrict__ p_clipboard,
|
||||
Float* __restrict__ p_dst) const
|
||||
{
|
||||
constexpr auto thread_sub_tensor_lengths = SubLengths{};
|
||||
|
||||
constexpr auto data_per_cluster_per_dims = thread_sub_tensor_lengths * ClusterLengths{};
|
||||
|
||||
constexpr auto repeat_lengths = SliceLengths{} / (SubLengths{} * ClusterLengths{});
|
||||
|
||||
constexpr auto thread_tensor_desc =
|
||||
make_packed_ConstantTensorDescriptor(thread_sub_tensor_lengths * repeat_lengths);
|
||||
|
||||
static_ford<decltype(repeat_lengths)>{}([&](auto repeat_multi_id_) {
|
||||
constexpr auto repeat_multi_id = decltype(repeat_multi_id_){};
|
||||
|
||||
constexpr auto clipboard_data_multi_offset =
|
||||
repeat_multi_id * thread_sub_tensor_lengths;
|
||||
|
||||
constexpr auto dst_data_multi_offset = repeat_multi_id * data_per_cluster_per_dims;
|
||||
|
||||
constexpr index_t clipboard_offset =
|
||||
thread_tensor_desc.GetOffsetFromMultiIndex(clipboard_data_multi_offset);
|
||||
|
||||
constexpr index_t dst_offset = DstDesc{}.GetOffsetFromMultiIndex(dst_data_multi_offset);
|
||||
|
||||
threadwise_tensor_slice_copy_generic(thread_tensor_desc,
|
||||
p_clipboard + clipboard_offset,
|
||||
zero_array<index_t, nDim>{},
|
||||
DstDesc{},
|
||||
p_dst + dst_offset + mDstMyThreadOffset,
|
||||
zero_array<index_t, nDim>{},
|
||||
thread_sub_tensor_lengths,
|
||||
DstAccessOrder{});
|
||||
}
|
||||
|
||||
__device__ void Run(const Float* __restrict__ p_src, Float* __restrict__ p_dst) const
|
||||
{
|
||||
Float p_clipboard[GetRegisterClipboardSize()];
|
||||
Float p_clipboard[GetRegisterClipboardSize()];
|
||||
|
||||
RunLoadRegisterClipboard(p_src, p_clipboard);
|
||||
RunStoreRegisterClipboard(p_clipboard, p_dst);
|
||||
RunLoadRegisterClipboard(p_src, p_clipboard);
|
||||
RunStoreRegisterClipboard(p_clipboard, p_dst);
|
||||
}
|
||||
};
|
||||
};
|
||||
|
||||
@@ -39,7 +39,7 @@ struct BlockwiseTensorSliceReorderCopy_v3
|
||||
constexpr auto thread_cluster_lengths =
|
||||
src_cluster_lengths.ReorderGivenNew2Old(map_thread_cluster_2_src_cluster);
|
||||
|
||||
constexpr auto thread_cluster_desc = make_ConstantTensorDescriptor(thread_cluster_lengths);
|
||||
constexpr auto thread_cluster_desc = make_packed_ConstantTensorDescriptor(thread_cluster_lengths);
|
||||
|
||||
// sanity check: data type
|
||||
static_assert(is_same<Float, float>::value, "wrong! only support float for now!\n");
|
||||
@@ -105,7 +105,8 @@ struct BlockwiseTensorSliceReorderCopy_v3
|
||||
}
|
||||
}
|
||||
|
||||
const auto thread_multi_id = thread_cluster_desc.GetMultiIndex(get_thread_local_1d_id());
|
||||
const auto thread_multi_id =
|
||||
thread_cluster_desc.GetMultiIndexFrom1dIndex(get_thread_local_1d_id());
|
||||
|
||||
// compiler: thread_multi_id, src_data_multi_id, dst_data_multi_id, will use separate
|
||||
// regsiters, or only one copy???
|
||||
@@ -115,17 +116,21 @@ struct BlockwiseTensorSliceReorderCopy_v3
|
||||
static_for<0, nDim, 1>{}([&](auto IDim) {
|
||||
constexpr auto I = decltype(IDim){};
|
||||
constexpr index_t i = I.Get();
|
||||
// compiler: will it really compute index here, or be merged with Get1dIndex and
|
||||
// compiler: will it really compute index here, or be merged with
|
||||
// GetOffsetFromMultiIndex and
|
||||
// optimized away???
|
||||
src_data_multi_id[i] *= src_sub_lengths.Get(I);
|
||||
});
|
||||
|
||||
// compiler: will it really compute index here, or be merged with Get1dIndex and
|
||||
// compiler: will it really compute index here, or be merged with GetOffsetFromMultiIndex
|
||||
// and
|
||||
// optimized away???
|
||||
const auto dst_data_multi_id = reorder_array_given_new2old(src_data_multi_id, map_dst2src);
|
||||
|
||||
mSrcMyThreadOffset = src_desc.Get1dIndex(src_data_multi_id + src_block_data_multi_id_begin);
|
||||
mDstMyThreadOffset = dst_desc.Get1dIndex(dst_data_multi_id + dst_block_data_multi_id_begin);
|
||||
mSrcMyThreadOffset =
|
||||
src_desc.GetOffsetFromMultiIndex(src_data_multi_id + src_block_data_multi_id_begin);
|
||||
mDstMyThreadOffset =
|
||||
dst_desc.GetOffsetFromMultiIndex(dst_data_multi_id + dst_block_data_multi_id_begin);
|
||||
}
|
||||
|
||||
__device__ static constexpr index_t GetRegisterClipboardSize()
|
||||
@@ -142,7 +147,7 @@ struct BlockwiseTensorSliceReorderCopy_v3
|
||||
|
||||
constexpr auto thread_tensor_lengths = thread_sub_tensor_lengths * repeat_lengths;
|
||||
|
||||
constexpr auto thread_tensor_desc = make_ConstantTensorDescriptor(thread_tensor_lengths);
|
||||
constexpr auto thread_tensor_desc = make_packed_ConstantTensorDescriptor(thread_tensor_lengths);
|
||||
|
||||
return thread_tensor_desc.GetElementSpace();
|
||||
}
|
||||
@@ -162,7 +167,7 @@ struct BlockwiseTensorSliceReorderCopy_v3
|
||||
|
||||
constexpr auto thread_tensor_lengths = thread_sub_tensor_lengths * repeat_lengths;
|
||||
|
||||
constexpr auto thread_tensor_desc = make_ConstantTensorDescriptor(thread_tensor_lengths);
|
||||
constexpr auto thread_tensor_desc = make_packed_ConstantTensorDescriptor(thread_tensor_lengths);
|
||||
|
||||
static_ford<decltype(repeat_lengths)>{}([&](auto repeat_multi_id_) {
|
||||
constexpr auto repeat_multi_id = decltype(repeat_multi_id_){};
|
||||
@@ -171,9 +176,9 @@ struct BlockwiseTensorSliceReorderCopy_v3
|
||||
|
||||
constexpr auto clipboard_data_multi_id = repeat_multi_id * thread_sub_tensor_lengths;
|
||||
|
||||
constexpr index_t src_offset = SrcDesc{}.Get1dIndex(src_data_multi_id);
|
||||
constexpr index_t src_offset = SrcDesc{}.GetOffsetFromMultiIndex(src_data_multi_id);
|
||||
constexpr index_t clipboard_offset =
|
||||
thread_tensor_desc.Get1dIndex(clipboard_data_multi_id);
|
||||
thread_tensor_desc.GetOffsetFromMultiIndex(clipboard_data_multi_id);
|
||||
|
||||
threadwise_tensor_slice_copy(SrcDesc{},
|
||||
p_src + src_offset + mSrcMyThreadOffset,
|
||||
@@ -199,7 +204,7 @@ struct BlockwiseTensorSliceReorderCopy_v3
|
||||
|
||||
constexpr auto thread_tensor_lengths = thread_sub_tensor_lengths * repeat_lengths;
|
||||
|
||||
constexpr auto thread_tensor_desc = make_ConstantTensorDescriptor(thread_tensor_lengths);
|
||||
constexpr auto thread_tensor_desc = make_packed_ConstantTensorDescriptor(thread_tensor_lengths);
|
||||
|
||||
static_ford<decltype(repeat_lengths)>{}([&](auto repeat_multi_id_) {
|
||||
constexpr auto repeat_multi_id = decltype(repeat_multi_id_){};
|
||||
@@ -212,9 +217,9 @@ struct BlockwiseTensorSliceReorderCopy_v3
|
||||
constexpr auto dst_data_multi_id = src_data_multi_id.ReorderGivenNew2Old(MapDst2Src{});
|
||||
|
||||
constexpr index_t clipboard_offset =
|
||||
thread_tensor_desc.Get1dIndex(clipboard_data_multi_id);
|
||||
thread_tensor_desc.GetOffsetFromMultiIndex(clipboard_data_multi_id);
|
||||
|
||||
constexpr index_t dst_offset = DstDesc{}.Get1dIndex(dst_data_multi_id);
|
||||
constexpr index_t dst_offset = DstDesc{}.GetOffsetFromMultiIndex(dst_data_multi_id);
|
||||
|
||||
// write in the order of dst
|
||||
#if 1
|
||||
|
||||
@@ -30,7 +30,7 @@ __host__ __device__ constexpr auto get_convolution_output_default_4d_tensor_desc
|
||||
constexpr auto HO = HI + 1 - Y;
|
||||
constexpr auto WO = WI + 1 - X;
|
||||
|
||||
return make_ConstantTensorDescriptor(Sequence<N, K, HO, WO>{});
|
||||
return make_packed_ConstantTensorDescriptor(Sequence<N, K, HO, WO>{});
|
||||
}
|
||||
|
||||
template <class InDesc, class WeiDesc, class LowerPads, class UpperPads>
|
||||
@@ -67,7 +67,7 @@ __host__ __device__ constexpr auto get_convolution_with_padding_output_default_4
|
||||
constexpr auto HO = HI + HPadLow + HPadUp + 1 - Y;
|
||||
constexpr auto WO = WI + WPadLow + WPadUp + 1 - X;
|
||||
|
||||
return make_ConstantTensorDescriptor(Sequence<N, K, HO, WO>{});
|
||||
return make_packed_ConstantTensorDescriptor(Sequence<N, K, HO, WO>{});
|
||||
}
|
||||
|
||||
template <class InDesc, class WeiDesc, class OutDesc>
|
||||
|
||||
@@ -180,18 +180,19 @@ struct GridwiseConvolutionDirect_v2_nchw_kcyx_nkhw
|
||||
c_block_data_begin += CPerBlock, __syncthreads())
|
||||
{
|
||||
// copy input tensor to LDS
|
||||
blockwise_in_copy.Run(p_in_global +
|
||||
in_nchw_global_desc.Get1dIndex(n_block_data_begin,
|
||||
c_block_data_begin,
|
||||
hi_block_data_begin,
|
||||
wi_block_data_begin),
|
||||
p_in_block);
|
||||
blockwise_in_copy.Run(
|
||||
p_in_global +
|
||||
in_nchw_global_desc.GetOffsetFromMultiIndex(n_block_data_begin,
|
||||
c_block_data_begin,
|
||||
hi_block_data_begin,
|
||||
wi_block_data_begin),
|
||||
p_in_block);
|
||||
|
||||
// copy weight tensor to LDS
|
||||
blockwise_wei_copy.Run(
|
||||
p_wei_global +
|
||||
wei_kcyx_global_desc.Get1dIndex(k_block_data_begin, c_block_data_begin, 0, 0),
|
||||
p_wei_block);
|
||||
blockwise_wei_copy.Run(p_wei_global +
|
||||
wei_kcyx_global_desc.GetOffsetFromMultiIndex(
|
||||
k_block_data_begin, c_block_data_begin, 0, 0),
|
||||
p_wei_block);
|
||||
|
||||
__syncthreads();
|
||||
|
||||
@@ -202,26 +203,28 @@ struct GridwiseConvolutionDirect_v2_nchw_kcyx_nkhw
|
||||
threadwise_direct_convolution_2(
|
||||
in_nchw_thread_block_desc,
|
||||
p_in_block +
|
||||
in_nchw_block_desc.Get1dIndex(n_thread_data_begin,
|
||||
c_thread_data,
|
||||
hi_thread_data_begin,
|
||||
wi_thread_data_begin),
|
||||
in_nchw_block_desc.GetOffsetFromMultiIndex(n_thread_data_begin,
|
||||
c_thread_data,
|
||||
hi_thread_data_begin,
|
||||
wi_thread_data_begin),
|
||||
wei_kcyx_thread_block_desc,
|
||||
p_wei_block +
|
||||
wei_kcyx_block_desc.Get1dIndex(k_thread_data_begin, c_thread_data, 0, 0),
|
||||
wei_kcyx_block_desc.GetOffsetFromMultiIndex(
|
||||
k_thread_data_begin, c_thread_data, 0, 0),
|
||||
out_nkhw_thread_desc,
|
||||
p_out_thread);
|
||||
#elif 0
|
||||
threadwise_direct_convolution_3(
|
||||
in_nchw_thread_block_desc,
|
||||
p_in_block +
|
||||
in_nchw_block_desc.Get1dIndex(n_thread_data_begin,
|
||||
c_thread_data,
|
||||
hi_thread_data_begin,
|
||||
wi_thread_data_begin),
|
||||
in_nchw_block_desc.GetOffsetFromMultiIndex(n_thread_data_begin,
|
||||
c_thread_data,
|
||||
hi_thread_data_begin,
|
||||
wi_thread_data_begin),
|
||||
wei_kcyx_thread_block_desc,
|
||||
p_wei_block +
|
||||
wei_kcyx_block_desc.Get1dIndex(k_thread_data_begin, c_thread_data, 0, 0),
|
||||
wei_kcyx_block_desc.GetOffsetFromMultiIndex(
|
||||
k_thread_data_begin, c_thread_data, 0, 0),
|
||||
out_nkhw_thread_desc,
|
||||
p_out_thread);
|
||||
#endif
|
||||
@@ -229,16 +232,16 @@ struct GridwiseConvolutionDirect_v2_nchw_kcyx_nkhw
|
||||
}
|
||||
|
||||
// copy output tensor from register to global mem
|
||||
threadwise_tensor_slice_copy(
|
||||
out_nkhw_thread_desc,
|
||||
p_out_thread,
|
||||
out_nkhw_global_desc,
|
||||
p_out_global +
|
||||
out_nkhw_global_desc.Get1dIndex(n_block_data_begin + n_thread_data_begin,
|
||||
k_block_data_begin + k_thread_data_begin,
|
||||
ho_block_data_begin + ho_thread_data_begin,
|
||||
wo_block_data_begin + wo_thread_data_begin),
|
||||
out_nkhw_thread_desc.GetLengths(),
|
||||
Number<1>{});
|
||||
threadwise_tensor_slice_copy(out_nkhw_thread_desc,
|
||||
p_out_thread,
|
||||
out_nkhw_global_desc,
|
||||
p_out_global +
|
||||
out_nkhw_global_desc.GetOffsetFromMultiIndex(
|
||||
n_block_data_begin + n_thread_data_begin,
|
||||
k_block_data_begin + k_thread_data_begin,
|
||||
ho_block_data_begin + ho_thread_data_begin,
|
||||
wo_block_data_begin + wo_thread_data_begin),
|
||||
out_nkhw_thread_desc.GetLengths(),
|
||||
Number<1>{});
|
||||
}
|
||||
};
|
||||
|
||||
@@ -221,11 +221,12 @@ struct GridwiseConvolutionImplicitGemm_v1r1_chwn_cyxk_khwn
|
||||
|
||||
const Float* p_in_global_block_offset =
|
||||
p_in_global +
|
||||
in_c_h_w_n_global_desc.Get1dIndex(
|
||||
in_c_h_w_n_global_desc.GetOffsetFromMultiIndex(
|
||||
0, hi_block_data_begin, wi_block_data_begin, n_block_data_begin);
|
||||
|
||||
const Float* p_wei_global_block_offset =
|
||||
p_wei_global + wei_c_y_x_k_global_desc.Get1dIndex(0, 0, 0, k_block_data_begin);
|
||||
p_wei_global +
|
||||
wei_c_y_x_k_global_desc.GetOffsetFromMultiIndex(0, 0, 0, k_block_data_begin);
|
||||
|
||||
for(index_t c_block_data_begin = 0; c_block_data_begin < C; c_block_data_begin += CPerBlock,
|
||||
p_in_global_block_offset += CPerBlock * in_c_h_w_n_global_desc.GetStride(I0),
|
||||
@@ -261,8 +262,8 @@ struct GridwiseConvolutionImplicitGemm_v1r1_chwn_cyxk_khwn
|
||||
#else
|
||||
blockwise_batch_gemm.Run_asm
|
||||
#endif
|
||||
(p_wei_block + wei_c_y_x_k_block_desc.Get1dIndex(0, y, x, 0),
|
||||
p_in_block + in_c_h_w_n_block_desc.Get1dIndex(0, y, x, 0),
|
||||
(p_wei_block + wei_c_y_x_k_block_desc.GetOffsetFromMultiIndex(0, y, x, 0),
|
||||
p_in_block + in_c_h_w_n_block_desc.GetOffsetFromMultiIndex(0, y, x, 0),
|
||||
p_out_thread);
|
||||
}
|
||||
}
|
||||
@@ -325,17 +326,17 @@ struct GridwiseConvolutionImplicitGemm_v1r1_chwn_cyxk_khwn
|
||||
}
|
||||
#endif
|
||||
|
||||
threadwise_tensor_slice_copy(
|
||||
out_10d_thread_desc,
|
||||
p_out_thread,
|
||||
out_10d_global_desc,
|
||||
p_out_global +
|
||||
out_k_h_w_n_global_desc.Get1dIndex(k_block_data_begin + k_thread_data_begin,
|
||||
ho_block_data_begin + ho_thread_data_begin,
|
||||
wo_block_data_begin + wo_thread_data_begin,
|
||||
n_block_data_begin + n_thread_data_begin),
|
||||
out_10d_thread_desc.GetLengths(),
|
||||
Number<OutThreadCopyDataPerWrite_N>{});
|
||||
threadwise_tensor_slice_copy(out_10d_thread_desc,
|
||||
p_out_thread,
|
||||
out_10d_global_desc,
|
||||
p_out_global +
|
||||
out_k_h_w_n_global_desc.GetOffsetFromMultiIndex(
|
||||
k_block_data_begin + k_thread_data_begin,
|
||||
ho_block_data_begin + ho_thread_data_begin,
|
||||
wo_block_data_begin + wo_thread_data_begin,
|
||||
n_block_data_begin + n_thread_data_begin),
|
||||
out_10d_thread_desc.GetLengths(),
|
||||
Number<OutThreadCopyDataPerWrite_N>{});
|
||||
}).else_([&](auto f_dummy) {
|
||||
static_assert(f_dummy(GemmNPerThreadSubC) >= NPerBlock && NPerThread == NPerBlock &&
|
||||
GemmNPerThreadSubC % NPerThread == 0,
|
||||
@@ -375,17 +376,17 @@ struct GridwiseConvolutionImplicitGemm_v1r1_chwn_cyxk_khwn
|
||||
}
|
||||
#endif
|
||||
|
||||
threadwise_tensor_slice_copy(
|
||||
out_10d_thread_desc,
|
||||
p_out_thread,
|
||||
out_10d_global_desc,
|
||||
p_out_global +
|
||||
out_k_h_w_n_global_desc.Get1dIndex(k_block_data_begin + k_thread_data_begin,
|
||||
ho_block_data_begin + ho_thread_data_begin,
|
||||
wo_block_data_begin + wo_thread_data_begin,
|
||||
n_block_data_begin + n_thread_data_begin),
|
||||
out_10d_thread_desc.GetLengths(),
|
||||
Number<OutThreadCopyDataPerWrite_N>{});
|
||||
threadwise_tensor_slice_copy(out_10d_thread_desc,
|
||||
p_out_thread,
|
||||
out_10d_global_desc,
|
||||
p_out_global +
|
||||
out_k_h_w_n_global_desc.GetOffsetFromMultiIndex(
|
||||
k_block_data_begin + k_thread_data_begin,
|
||||
ho_block_data_begin + ho_thread_data_begin,
|
||||
wo_block_data_begin + wo_thread_data_begin,
|
||||
n_block_data_begin + n_thread_data_begin),
|
||||
out_10d_thread_desc.GetLengths(),
|
||||
Number<OutThreadCopyDataPerWrite_N>{});
|
||||
});
|
||||
}
|
||||
};
|
||||
|
||||
@@ -230,11 +230,12 @@ struct GridwiseConvolutionImplicitGemm_v1r2_chwn_cyxk_khwn
|
||||
#if 1
|
||||
const Float* p_in_global_block_offset =
|
||||
p_in_global +
|
||||
in_c_h_w_n_global_desc.Get1dIndex(
|
||||
in_c_h_w_n_global_desc.GetOffsetFromMultiIndex(
|
||||
0, hi_block_data_begin, wi_block_data_begin, n_block_data_begin);
|
||||
|
||||
const Float* p_wei_global_block_offset =
|
||||
p_wei_global + wei_c_y_x_k_global_desc.Get1dIndex(0, 0, 0, k_block_data_begin);
|
||||
p_wei_global +
|
||||
wei_c_y_x_k_global_desc.GetOffsetFromMultiIndex(0, 0, 0, k_block_data_begin);
|
||||
|
||||
for(index_t c_block_data_begin = 0; c_block_data_begin < C; c_block_data_begin += CPerBlock,
|
||||
p_in_global_block_offset += CPerBlock * in_c_h_w_n_global_desc.GetStride(I0),
|
||||
@@ -242,22 +243,24 @@ struct GridwiseConvolutionImplicitGemm_v1r2_chwn_cyxk_khwn
|
||||
{
|
||||
for(index_t y = 0; y < Y; ++y)
|
||||
{
|
||||
blockwise_in_copy.Run(p_in_global_block_offset +
|
||||
in_c_h_w_n_global_desc.Get1dIndex(0, y, 0, 0),
|
||||
p_in_block);
|
||||
blockwise_in_copy.Run(
|
||||
p_in_global_block_offset +
|
||||
in_c_h_w_n_global_desc.GetOffsetFromMultiIndex(0, y, 0, 0),
|
||||
p_in_block);
|
||||
|
||||
blockwise_wei_copy.Run(p_wei_global_block_offset +
|
||||
wei_c_y_x_k_global_desc.Get1dIndex(0, y, 0, 0),
|
||||
p_wei_block);
|
||||
blockwise_wei_copy.Run(
|
||||
p_wei_global_block_offset +
|
||||
wei_c_y_x_k_global_desc.GetOffsetFromMultiIndex(0, y, 0, 0),
|
||||
p_wei_block);
|
||||
|
||||
__syncthreads();
|
||||
|
||||
for(index_t x = 0; x < X; ++x)
|
||||
{
|
||||
blockwise_batch_gemm.Run(p_wei_block + wei_c_x_k_block_desc.Get1dIndex(0, x, 0),
|
||||
p_in_block +
|
||||
in_c_h_w_n_block_desc.Get1dIndex(0, 0, x, 0),
|
||||
p_out_thread);
|
||||
blockwise_batch_gemm.Run(
|
||||
p_wei_block + wei_c_x_k_block_desc.GetOffsetFromMultiIndex(0, x, 0),
|
||||
p_in_block + in_c_h_w_n_block_desc.GetOffsetFromMultiIndex(0, 0, x, 0),
|
||||
p_out_thread);
|
||||
}
|
||||
|
||||
__syncthreads();
|
||||
@@ -269,11 +272,12 @@ struct GridwiseConvolutionImplicitGemm_v1r2_chwn_cyxk_khwn
|
||||
{
|
||||
const Float* p_in_global_block_offset =
|
||||
p_in_global +
|
||||
in_c_h_w_n_global_desc.Get1dIndex(
|
||||
in_c_h_w_n_global_desc.GetOffsetFromMultiIndex(
|
||||
0, hi_block_data_begin + y, wi_block_data_begin, n_block_data_begin);
|
||||
|
||||
const Float* p_wei_global_block_offset =
|
||||
p_wei_global + wei_c_y_x_k_global_desc.Get1dIndex(0, y, 0, k_block_data_begin);
|
||||
p_wei_global +
|
||||
wei_c_y_x_k_global_desc.GetOffsetFromMultiIndex(0, y, 0, k_block_data_begin);
|
||||
|
||||
for(index_t
|
||||
c_block_data_begin = 0;
|
||||
@@ -290,10 +294,10 @@ struct GridwiseConvolutionImplicitGemm_v1r2_chwn_cyxk_khwn
|
||||
|
||||
for(index_t x = 0; x < X; ++x)
|
||||
{
|
||||
blockwise_batch_gemm.Run(p_wei_block + wei_c_x_k_block_desc.Get1dIndex(0, x, 0),
|
||||
p_in_block +
|
||||
in_c_h_w_n_block_desc.Get1dIndex(0, 0, x, 0),
|
||||
p_out_thread);
|
||||
blockwise_batch_gemm.Run(
|
||||
p_wei_block + wei_c_x_k_block_desc.GetOffsetFromMultiIndex(0, x, 0),
|
||||
p_in_block + in_c_h_w_n_block_desc.GetOffsetFromMultiIndex(0, 0, x, 0),
|
||||
p_out_thread);
|
||||
}
|
||||
|
||||
__syncthreads();
|
||||
@@ -358,17 +362,17 @@ struct GridwiseConvolutionImplicitGemm_v1r2_chwn_cyxk_khwn
|
||||
}
|
||||
#endif
|
||||
|
||||
threadwise_tensor_slice_copy(
|
||||
out_10d_thread_desc,
|
||||
p_out_thread,
|
||||
out_10d_global_desc,
|
||||
p_out_global +
|
||||
out_k_h_w_n_global_desc.Get1dIndex(k_block_data_begin + k_thread_data_begin,
|
||||
ho_block_data_begin + ho_thread_data_begin,
|
||||
wo_block_data_begin + wo_thread_data_begin,
|
||||
n_block_data_begin + n_thread_data_begin),
|
||||
out_10d_thread_desc.GetLengths(),
|
||||
Number<OutThreadCopyDataPerWrite_N>{});
|
||||
threadwise_tensor_slice_copy(out_10d_thread_desc,
|
||||
p_out_thread,
|
||||
out_10d_global_desc,
|
||||
p_out_global +
|
||||
out_k_h_w_n_global_desc.GetOffsetFromMultiIndex(
|
||||
k_block_data_begin + k_thread_data_begin,
|
||||
ho_block_data_begin + ho_thread_data_begin,
|
||||
wo_block_data_begin + wo_thread_data_begin,
|
||||
n_block_data_begin + n_thread_data_begin),
|
||||
out_10d_thread_desc.GetLengths(),
|
||||
Number<OutThreadCopyDataPerWrite_N>{});
|
||||
}).else_([&](auto f_dummy) {
|
||||
static_assert(f_dummy(GemmNPerThreadSubC) >= NPerBlock && NPerThread == NPerBlock &&
|
||||
GemmNPerThreadSubC % NPerThread == 0,
|
||||
@@ -408,17 +412,17 @@ struct GridwiseConvolutionImplicitGemm_v1r2_chwn_cyxk_khwn
|
||||
}
|
||||
#endif
|
||||
|
||||
threadwise_tensor_slice_copy(
|
||||
out_10d_thread_desc,
|
||||
p_out_thread,
|
||||
out_10d_global_desc,
|
||||
p_out_global +
|
||||
out_k_h_w_n_global_desc.Get1dIndex(k_block_data_begin + k_thread_data_begin,
|
||||
ho_block_data_begin + ho_thread_data_begin,
|
||||
wo_block_data_begin + wo_thread_data_begin,
|
||||
n_block_data_begin + n_thread_data_begin),
|
||||
out_10d_thread_desc.GetLengths(),
|
||||
Number<OutThreadCopyDataPerWrite_N>{});
|
||||
threadwise_tensor_slice_copy(out_10d_thread_desc,
|
||||
p_out_thread,
|
||||
out_10d_global_desc,
|
||||
p_out_global +
|
||||
out_k_h_w_n_global_desc.GetOffsetFromMultiIndex(
|
||||
k_block_data_begin + k_thread_data_begin,
|
||||
ho_block_data_begin + ho_thread_data_begin,
|
||||
wo_block_data_begin + wo_thread_data_begin,
|
||||
n_block_data_begin + n_thread_data_begin),
|
||||
out_10d_thread_desc.GetLengths(),
|
||||
Number<OutThreadCopyDataPerWrite_N>{});
|
||||
});
|
||||
}
|
||||
};
|
||||
|
||||
@@ -221,11 +221,11 @@ struct GridwiseConvolutionImplicitGemm_v1r2_nchw_cyxk_khwn
|
||||
|
||||
#if 0
|
||||
const Float* p_in_global_block_offset =
|
||||
p_in_global + in_n_c_h_w_global_desc.Get1dIndex(
|
||||
p_in_global + in_n_c_h_w_global_desc.GetOffsetFromMultiIndex(
|
||||
n_block_data_begin, 0, hi_block_data_begin, wi_block_data_begin);
|
||||
|
||||
const Float* p_wei_global_block_offset =
|
||||
p_wei_global + wei_c_y_x_k_global_desc.Get1dIndex(0, 0, 0, k_block_data_begin);
|
||||
p_wei_global + wei_c_y_x_k_global_desc.GetOffsetFromMultiIndex(0, 0, 0, k_block_data_begin);
|
||||
|
||||
for(index_t c_block_data_begin = 0; c_block_data_begin < C; c_block_data_begin += CPerBlock,
|
||||
p_in_global_block_offset += CPerBlock * in_n_c_h_w_global_desc.GetStride(I1),
|
||||
@@ -234,20 +234,20 @@ struct GridwiseConvolutionImplicitGemm_v1r2_nchw_cyxk_khwn
|
||||
for(index_t y = 0; y < Y; ++y)
|
||||
{
|
||||
blockwise_in_copy_reorder.Run(p_in_global_block_offset +
|
||||
in_n_c_h_w_global_desc.Get1dIndex(0, 0, y, 0),
|
||||
in_n_c_h_w_global_desc.GetOffsetFromMultiIndex(0, 0, y, 0),
|
||||
p_in_block);
|
||||
|
||||
blockwise_wei_copy.Run(p_wei_global_block_offset +
|
||||
wei_c_y_x_k_global_desc.Get1dIndex(0, y, 0, 0),
|
||||
wei_c_y_x_k_global_desc.GetOffsetFromMultiIndex(0, y, 0, 0),
|
||||
p_wei_block);
|
||||
|
||||
__syncthreads();
|
||||
|
||||
for(index_t x = 0; x < X; ++x)
|
||||
{
|
||||
blockwise_batch_gemm.Run(p_wei_block + wei_c_x_k_block_desc.Get1dIndex(0, x, 0),
|
||||
blockwise_batch_gemm.Run(p_wei_block + wei_c_x_k_block_desc.GetOffsetFromMultiIndex(0, x, 0),
|
||||
p_in_block +
|
||||
in_c_h_w_n_block_desc.Get1dIndex(0, 0, x, 0),
|
||||
in_c_h_w_n_block_desc.GetOffsetFromMultiIndex(0, 0, x, 0),
|
||||
p_out_thread);
|
||||
}
|
||||
|
||||
@@ -259,11 +259,12 @@ struct GridwiseConvolutionImplicitGemm_v1r2_nchw_cyxk_khwn
|
||||
{
|
||||
const Float* p_in_global_block_offset =
|
||||
p_in_global +
|
||||
in_n_c_h_w_global_desc.Get1dIndex(
|
||||
in_n_c_h_w_global_desc.GetOffsetFromMultiIndex(
|
||||
n_block_data_begin, 0, hi_block_data_begin + y, wi_block_data_begin);
|
||||
|
||||
const Float* p_wei_global_block_offset =
|
||||
p_wei_global + wei_c_y_x_k_global_desc.Get1dIndex(0, y, 0, k_block_data_begin);
|
||||
p_wei_global +
|
||||
wei_c_y_x_k_global_desc.GetOffsetFromMultiIndex(0, y, 0, k_block_data_begin);
|
||||
|
||||
for(index_t
|
||||
c_block_data_begin = 0;
|
||||
@@ -287,10 +288,10 @@ struct GridwiseConvolutionImplicitGemm_v1r2_nchw_cyxk_khwn
|
||||
|
||||
for(index_t x = 0; x < X; ++x)
|
||||
{
|
||||
blockwise_batch_gemm.Run(p_wei_block + wei_c_x_k_block_desc.Get1dIndex(0, x, 0),
|
||||
p_in_block +
|
||||
in_c_h_w_n_block_desc.Get1dIndex(0, 0, x, 0),
|
||||
p_out_thread);
|
||||
blockwise_batch_gemm.Run(
|
||||
p_wei_block + wei_c_x_k_block_desc.GetOffsetFromMultiIndex(0, x, 0),
|
||||
p_in_block + in_c_h_w_n_block_desc.GetOffsetFromMultiIndex(0, 0, x, 0),
|
||||
p_out_thread);
|
||||
}
|
||||
|
||||
__syncthreads();
|
||||
@@ -336,16 +337,16 @@ struct GridwiseConvolutionImplicitGemm_v1r2_nchw_cyxk_khwn
|
||||
}
|
||||
#endif
|
||||
|
||||
threadwise_10d_tensor_copy(
|
||||
out_10d_thread_desc,
|
||||
p_out_thread,
|
||||
out_10d_global_desc,
|
||||
p_out_global +
|
||||
out_k_h_w_n_global_desc.Get1dIndex(k_block_data_begin + k_thread_data_begin,
|
||||
ho_block_data_begin + ho_thread_data_begin,
|
||||
wo_block_data_begin + wo_thread_data_begin,
|
||||
n_block_data_begin + n_thread_data_begin),
|
||||
out_10d_thread_desc.GetLengths(),
|
||||
Number<OutThreadCopyDataPerWrite_N>{});
|
||||
threadwise_10d_tensor_copy(out_10d_thread_desc,
|
||||
p_out_thread,
|
||||
out_10d_global_desc,
|
||||
p_out_global +
|
||||
out_k_h_w_n_global_desc.GetOffsetFromMultiIndex(
|
||||
k_block_data_begin + k_thread_data_begin,
|
||||
ho_block_data_begin + ho_thread_data_begin,
|
||||
wo_block_data_begin + wo_thread_data_begin,
|
||||
n_block_data_begin + n_thread_data_begin),
|
||||
out_10d_thread_desc.GetLengths(),
|
||||
Number<OutThreadCopyDataPerWrite_N>{});
|
||||
}
|
||||
};
|
||||
|
||||
@@ -82,7 +82,8 @@ struct GridwiseConvolutionImplicitGemm_v1r3_chwn_cyxk_khwn
|
||||
constexpr auto block_work_desc = make_ConstantTensorDescriptor(
|
||||
Sequence<NBlockWork, KBlockWork, HBlockWork, WBlockWork>{});
|
||||
|
||||
const auto block_work_multi_id = block_work_desc.GetMultiIndex(get_block_1d_id());
|
||||
const auto block_work_multi_id =
|
||||
block_work_desc.GetMultiIndexFrom1dIndex(get_block_1d_id());
|
||||
|
||||
const index_t n_block_data_begin = block_work_multi_id[0] * NPerBlock;
|
||||
const index_t k_block_data_begin = block_work_multi_id[1] * KPerBlock;
|
||||
@@ -225,11 +226,12 @@ struct GridwiseConvolutionImplicitGemm_v1r3_chwn_cyxk_khwn
|
||||
#if 1
|
||||
const Float* p_in_global_block_offset =
|
||||
p_in_global +
|
||||
in_c_h_w_n_global_desc.Get1dIndex(
|
||||
in_c_h_w_n_global_desc.GetOffsetFromMultiIndex(
|
||||
0, hi_block_data_begin, wi_block_data_begin, n_block_data_begin);
|
||||
|
||||
const Float* p_wei_global_block_offset =
|
||||
p_wei_global + wei_c_y_x_k_global_desc.Get1dIndex(0, 0, 0, k_block_data_begin);
|
||||
p_wei_global +
|
||||
wei_c_y_x_k_global_desc.GetOffsetFromMultiIndex(0, 0, 0, k_block_data_begin);
|
||||
|
||||
for(index_t c_block_data_begin = 0; c_block_data_begin < C; c_block_data_begin += CPerBlock,
|
||||
p_in_global_block_offset += CPerBlock * in_c_h_w_n_global_desc.GetStride(I0),
|
||||
@@ -240,13 +242,15 @@ struct GridwiseConvolutionImplicitGemm_v1r3_chwn_cyxk_khwn
|
||||
#pragma unroll
|
||||
for(index_t x = 0; x < X; ++x)
|
||||
{
|
||||
blockwise_in_copy.Run(p_in_global_block_offset +
|
||||
in_c_h_w_n_global_desc.Get1dIndex(0, y, x, 0),
|
||||
p_in_block);
|
||||
blockwise_in_copy.Run(
|
||||
p_in_global_block_offset +
|
||||
in_c_h_w_n_global_desc.GetOffsetFromMultiIndex(0, y, x, 0),
|
||||
p_in_block);
|
||||
|
||||
blockwise_wei_copy.Run(p_wei_global_block_offset +
|
||||
wei_c_y_x_k_global_desc.Get1dIndex(0, y, x, 0),
|
||||
p_wei_block);
|
||||
blockwise_wei_copy.Run(
|
||||
p_wei_global_block_offset +
|
||||
wei_c_y_x_k_global_desc.GetOffsetFromMultiIndex(0, y, x, 0),
|
||||
p_wei_block);
|
||||
|
||||
__syncthreads();
|
||||
|
||||
@@ -263,11 +267,12 @@ struct GridwiseConvolutionImplicitGemm_v1r3_chwn_cyxk_khwn
|
||||
{
|
||||
const Float* p_in_global_block_offset =
|
||||
p_in_global +
|
||||
in_c_h_w_n_global_desc.Get1dIndex(
|
||||
in_c_h_w_n_global_desc.GetOffsetFromMultiIndex(
|
||||
0, hi_block_data_begin + y, wi_block_data_begin + x, n_block_data_begin);
|
||||
|
||||
const Float* p_wei_global_block_offset =
|
||||
p_wei_global + wei_c_y_x_k_global_desc.Get1dIndex(0, y, x, k_block_data_begin);
|
||||
p_wei_global +
|
||||
wei_c_y_x_k_global_desc.GetOffsetFromMultiIndex(0, y, x, k_block_data_begin);
|
||||
|
||||
for(index_t c_block_data_begin = 0; c_block_data_begin < C;
|
||||
c_block_data_begin += CPerBlock,
|
||||
@@ -347,17 +352,17 @@ struct GridwiseConvolutionImplicitGemm_v1r3_chwn_cyxk_khwn
|
||||
}
|
||||
#endif
|
||||
|
||||
threadwise_tensor_slice_copy(
|
||||
out_10d_thread_desc,
|
||||
p_out_thread,
|
||||
out_10d_global_desc,
|
||||
p_out_global +
|
||||
out_k_h_w_n_global_desc.Get1dIndex(k_block_data_begin + k_thread_data_begin,
|
||||
ho_block_data_begin + ho_thread_data_begin,
|
||||
wo_block_data_begin + wo_thread_data_begin,
|
||||
n_block_data_begin + n_thread_data_begin),
|
||||
out_10d_thread_desc.GetLengths(),
|
||||
Number<OutThreadCopyDataPerWrite_N>{});
|
||||
threadwise_tensor_slice_copy(out_10d_thread_desc,
|
||||
p_out_thread,
|
||||
out_10d_global_desc,
|
||||
p_out_global +
|
||||
out_k_h_w_n_global_desc.GetOffsetFromMultiIndex(
|
||||
k_block_data_begin + k_thread_data_begin,
|
||||
ho_block_data_begin + ho_thread_data_begin,
|
||||
wo_block_data_begin + wo_thread_data_begin,
|
||||
n_block_data_begin + n_thread_data_begin),
|
||||
out_10d_thread_desc.GetLengths(),
|
||||
Number<OutThreadCopyDataPerWrite_N>{});
|
||||
}).else_([&](auto f_dummy) {
|
||||
static_assert(f_dummy(GemmNPerThreadSubC) >= NPerBlock && NPerThread == NPerBlock &&
|
||||
GemmNPerThreadSubC % NPerThread == 0,
|
||||
@@ -397,17 +402,17 @@ struct GridwiseConvolutionImplicitGemm_v1r3_chwn_cyxk_khwn
|
||||
}
|
||||
#endif
|
||||
|
||||
threadwise_tensor_slice_copy(
|
||||
out_10d_thread_desc,
|
||||
p_out_thread,
|
||||
out_10d_global_desc,
|
||||
p_out_global +
|
||||
out_k_h_w_n_global_desc.Get1dIndex(k_block_data_begin + k_thread_data_begin,
|
||||
ho_block_data_begin + ho_thread_data_begin,
|
||||
wo_block_data_begin + wo_thread_data_begin,
|
||||
n_block_data_begin + n_thread_data_begin),
|
||||
out_10d_thread_desc.GetLengths(),
|
||||
Number<OutThreadCopyDataPerWrite_N>{});
|
||||
threadwise_tensor_slice_copy(out_10d_thread_desc,
|
||||
p_out_thread,
|
||||
out_10d_global_desc,
|
||||
p_out_global +
|
||||
out_k_h_w_n_global_desc.GetOffsetFromMultiIndex(
|
||||
k_block_data_begin + k_thread_data_begin,
|
||||
ho_block_data_begin + ho_thread_data_begin,
|
||||
wo_block_data_begin + wo_thread_data_begin,
|
||||
n_block_data_begin + n_thread_data_begin),
|
||||
out_10d_thread_desc.GetLengths(),
|
||||
Number<OutThreadCopyDataPerWrite_N>{});
|
||||
});
|
||||
}
|
||||
};
|
||||
|
||||
@@ -85,10 +85,11 @@ struct GridwiseConvolutionImplicitGemm_v1r3_lds_double_buffer_chwn_cyxk_khwn
|
||||
constexpr index_t WBlockWork = mod_conv::integer_divide_ceil(Wo, WoPerBlock);
|
||||
constexpr index_t NBlockWork = mod_conv::integer_divide_ceil(N, NPerBlock);
|
||||
|
||||
constexpr auto block_work_desc = make_ConstantTensorDescriptor(
|
||||
constexpr auto block_work_desc = make_packed_ConstantTensorDescriptor(
|
||||
Sequence<KBlockWork, HBlockWork, WBlockWork, NBlockWork>{});
|
||||
|
||||
const auto block_work_multi_id = block_work_desc.GetMultiIndex(get_block_1d_id());
|
||||
const auto block_work_multi_id =
|
||||
block_work_desc.GetMultiIndexFrom1dIndex(get_block_1d_id());
|
||||
|
||||
const index_t k_block_data_begin = block_work_multi_id[0] * KPerBlock;
|
||||
const index_t ho_block_data_begin = block_work_multi_id[1] * HoPerBlock;
|
||||
@@ -108,7 +109,7 @@ struct GridwiseConvolutionImplicitGemm_v1r3_lds_double_buffer_chwn_cyxk_khwn
|
||||
GemmDataPerReadA,
|
||||
GemmDataPerReadB);
|
||||
|
||||
constexpr auto in_c_h_w_n_block_desc = make_ConstantTensorDescriptor_aligned(
|
||||
constexpr auto in_c_h_w_n_block_desc = make_ranked_ConstantTensorDescriptor_with_alignment(
|
||||
Sequence<CPerBlock, HoPerBlock, WoPerBlock, NPerBlock>{},
|
||||
Number<InBlockCopyDataPerRead_N>{});
|
||||
|
||||
@@ -117,12 +118,12 @@ struct GridwiseConvolutionImplicitGemm_v1r3_lds_double_buffer_chwn_cyxk_khwn
|
||||
static_assert(in_c_h_w_n_block_desc.GetStride(I1) % GemmDataPerReadB == 0,
|
||||
"GemmDataPerReadB alignment requirement is not meet");
|
||||
|
||||
constexpr auto wei_c_k_block_desc = make_ConstantTensorDescriptor_aligned(
|
||||
constexpr auto wei_c_k_block_desc = make_ranked_ConstantTensorDescriptor_with_alignment(
|
||||
Sequence<CPerBlock, KPerBlock>{},
|
||||
Number<mod_conv::max(WeiBlockCopyDataPerRead_K, GemmDataPerReadA)>{});
|
||||
|
||||
// tensor view of threadwise output in register
|
||||
constexpr auto out_k_h_w_n_thread_desc = make_ConstantTensorDescriptor(
|
||||
constexpr auto out_k_h_w_n_thread_desc = make_packed_ConstantTensorDescriptor(
|
||||
Sequence<KPerThread, HoPerThread, WoPerThread, NPerThread>{});
|
||||
|
||||
// blockwise copy
|
||||
@@ -243,11 +244,12 @@ struct GridwiseConvolutionImplicitGemm_v1r3_lds_double_buffer_chwn_cyxk_khwn
|
||||
{
|
||||
const Float* p_in_global_block_offset =
|
||||
p_in_global +
|
||||
in_c_h_w_n_global_desc.Get1dIndex(
|
||||
in_c_h_w_n_global_desc.GetOffsetFromMultiIndex(
|
||||
0, hi_block_data_begin + y, wi_block_data_begin + x, n_block_data_begin);
|
||||
|
||||
const Float* p_wei_global_block_offset =
|
||||
p_wei_global + wei_c_y_x_k_global_desc.Get1dIndex(0, y, x, k_block_data_begin);
|
||||
p_wei_global +
|
||||
wei_c_y_x_k_global_desc.GetOffsetFromMultiIndex(0, y, x, k_block_data_begin);
|
||||
|
||||
// LDS double buffer: preload data into LDS
|
||||
{
|
||||
@@ -399,17 +401,17 @@ struct GridwiseConvolutionImplicitGemm_v1r3_lds_double_buffer_chwn_cyxk_khwn
|
||||
}
|
||||
#endif
|
||||
|
||||
threadwise_tensor_slice_copy(
|
||||
out_10d_thread_desc,
|
||||
p_out_thread,
|
||||
out_10d_global_desc,
|
||||
p_out_global +
|
||||
out_k_h_w_n_global_desc.Get1dIndex(k_block_data_begin + k_thread_data_begin,
|
||||
ho_block_data_begin + ho_thread_data_begin,
|
||||
wo_block_data_begin + wo_thread_data_begin,
|
||||
n_block_data_begin + n_thread_data_begin),
|
||||
out_10d_thread_desc.GetLengths(),
|
||||
Number<OutThreadCopyDataPerWrite_N>{});
|
||||
threadwise_tensor_slice_copy(out_10d_thread_desc,
|
||||
p_out_thread,
|
||||
out_10d_global_desc,
|
||||
p_out_global +
|
||||
out_k_h_w_n_global_desc.GetOffsetFromMultiIndex(
|
||||
k_block_data_begin + k_thread_data_begin,
|
||||
ho_block_data_begin + ho_thread_data_begin,
|
||||
wo_block_data_begin + wo_thread_data_begin,
|
||||
n_block_data_begin + n_thread_data_begin),
|
||||
out_10d_thread_desc.GetLengths(),
|
||||
Number<OutThreadCopyDataPerWrite_N>{});
|
||||
}).else_([&](auto fwd) {
|
||||
static_assert(fwd(GemmNPerThreadSubC) >= NPerBlock && NPerThread == NPerBlock &&
|
||||
GemmNPerThreadSubC % NPerThread == 0,
|
||||
@@ -450,17 +452,17 @@ struct GridwiseConvolutionImplicitGemm_v1r3_lds_double_buffer_chwn_cyxk_khwn
|
||||
}
|
||||
#endif
|
||||
|
||||
threadwise_tensor_slice_copy(
|
||||
out_10d_thread_desc,
|
||||
p_out_thread,
|
||||
out_10d_global_desc,
|
||||
p_out_global +
|
||||
out_k_h_w_n_global_desc.Get1dIndex(k_block_data_begin + k_thread_data_begin,
|
||||
ho_block_data_begin + ho_thread_data_begin,
|
||||
wo_block_data_begin + wo_thread_data_begin,
|
||||
n_block_data_begin + n_thread_data_begin),
|
||||
out_10d_thread_desc.GetLengths(),
|
||||
Number<OutThreadCopyDataPerWrite_N>{});
|
||||
threadwise_tensor_slice_copy(out_10d_thread_desc,
|
||||
p_out_thread,
|
||||
out_10d_global_desc,
|
||||
p_out_global +
|
||||
out_k_h_w_n_global_desc.GetOffsetFromMultiIndex(
|
||||
k_block_data_begin + k_thread_data_begin,
|
||||
ho_block_data_begin + ho_thread_data_begin,
|
||||
wo_block_data_begin + wo_thread_data_begin,
|
||||
n_block_data_begin + n_thread_data_begin),
|
||||
out_10d_thread_desc.GetLengths(),
|
||||
Number<OutThreadCopyDataPerWrite_N>{});
|
||||
});
|
||||
}
|
||||
};
|
||||
|
||||
@@ -86,10 +86,11 @@ struct GridwiseConvolutionImplicitGemm_v1r3_lds_double_buffer_nchw_cyxk_khwn
|
||||
constexpr index_t HBlockWork = mod_conv::integer_divide_ceil(Ho, HoPerBlock);
|
||||
constexpr index_t WBlockWork = mod_conv::integer_divide_ceil(Wo, WoPerBlock);
|
||||
|
||||
constexpr auto block_work_desc = make_ConstantTensorDescriptor(
|
||||
constexpr auto block_work_desc = make_packed_ConstantTensorDescriptor(
|
||||
Sequence<NBlockWork, KBlockWork, HBlockWork, WBlockWork>{});
|
||||
|
||||
const auto block_work_multi_id = block_work_desc.GetMultiIndex(get_block_1d_id());
|
||||
const auto block_work_multi_id =
|
||||
block_work_desc.GetMultiIndexFrom1dIndex(get_block_1d_id());
|
||||
|
||||
const index_t n_block_data_begin = block_work_multi_id[0] * NPerBlock;
|
||||
const index_t k_block_data_begin = block_work_multi_id[1] * KPerBlock;
|
||||
@@ -101,7 +102,7 @@ struct GridwiseConvolutionImplicitGemm_v1r3_lds_double_buffer_nchw_cyxk_khwn
|
||||
|
||||
// global tensor view
|
||||
constexpr auto wei_c_k_global_desc =
|
||||
make_ConstantTensorDescriptor(Sequence<C, K>{}, Sequence<Y * X * K, 1>{});
|
||||
make_ranked_ConstantTensorDescriptor(Sequence<C, K>{}, Sequence<Y * X * K, 1>{});
|
||||
|
||||
// LDS tensor view
|
||||
// be careful of alignment
|
||||
@@ -110,7 +111,7 @@ struct GridwiseConvolutionImplicitGemm_v1r3_lds_double_buffer_nchw_cyxk_khwn
|
||||
GemmDataPerReadA,
|
||||
GemmDataPerReadB);
|
||||
|
||||
constexpr auto in_c_h_w_n_block_desc = make_ConstantTensorDescriptor_aligned(
|
||||
constexpr auto in_c_h_w_n_block_desc = make_ranked_ConstantTensorDescriptor_with_alignment(
|
||||
Sequence<CPerBlock, HoPerBlock, WoPerBlock, NPerBlock>{},
|
||||
Number<InBlockReorderDataPerWrite_N>{});
|
||||
|
||||
@@ -119,12 +120,12 @@ struct GridwiseConvolutionImplicitGemm_v1r3_lds_double_buffer_nchw_cyxk_khwn
|
||||
static_assert(in_c_h_w_n_block_desc.GetStride(I1) % GemmDataPerReadB == 0,
|
||||
"GemmDataPerReadB alignment requirement is not meet");
|
||||
|
||||
constexpr auto wei_c_k_block_desc = make_ConstantTensorDescriptor_aligned(
|
||||
constexpr auto wei_c_k_block_desc = make_ranked_ConstantTensorDescriptor_with_alignment(
|
||||
Sequence<CPerBlock, KPerBlock>{},
|
||||
Number<mod_conv::max(WeiBlockCopyDataPerRead_K, GemmDataPerReadA)>{});
|
||||
|
||||
// tensor view of threadwise output in register
|
||||
constexpr auto out_k_h_w_n_thread_desc = make_ConstantTensorDescriptor(
|
||||
constexpr auto out_k_h_w_n_thread_desc = make_packed_ConstantTensorDescriptor(
|
||||
Sequence<KPerThread, HoPerThread, WoPerThread, NPerThread>{});
|
||||
|
||||
// blockwise copy
|
||||
@@ -241,11 +242,12 @@ struct GridwiseConvolutionImplicitGemm_v1r3_lds_double_buffer_nchw_cyxk_khwn
|
||||
{
|
||||
const Float* p_in_global_block_offset =
|
||||
p_in_global +
|
||||
in_n_c_h_w_global_desc.Get1dIndex(
|
||||
in_n_c_h_w_global_desc.GetOffsetFromMultiIndex(
|
||||
n_block_data_begin, 0, hi_block_data_begin + y, wi_block_data_begin + x);
|
||||
|
||||
const Float* p_wei_global_block_offset =
|
||||
p_wei_global + wei_c_y_x_k_global_desc.Get1dIndex(0, y, x, k_block_data_begin);
|
||||
p_wei_global +
|
||||
wei_c_y_x_k_global_desc.GetOffsetFromMultiIndex(0, y, x, k_block_data_begin);
|
||||
|
||||
// LDS double buffer: preload data into LDS
|
||||
{
|
||||
@@ -359,13 +361,13 @@ struct GridwiseConvolutionImplicitGemm_v1r3_lds_double_buffer_nchw_cyxk_khwn
|
||||
const index_t wo_thread_data_begin = c_thread_mtx_begin.col / NPerBlock;
|
||||
const index_t n_thread_data_begin = c_thread_mtx_begin.col % NPerBlock;
|
||||
|
||||
static_if<GemmNPerThreadSubC <= NPerBlock>{}([&](auto f_dummy) { // f_dummy do nothing but
|
||||
static_if<GemmNPerThreadSubC <= NPerBlock>{}([&](auto fwd) { // fwd do nothing but
|
||||
// perfect forwarding.
|
||||
// Using this trick to
|
||||
// make this lambda a generic lambda, so it won't be compiled until
|
||||
// instantiated
|
||||
static_assert(
|
||||
(f_dummy(GemmNPerThreadSubC) <= NPerBlock && NPerBlock % GemmNPerThreadSubC == 0),
|
||||
(fwd(GemmNPerThreadSubC) <= NPerBlock && NPerBlock % GemmNPerThreadSubC == 0),
|
||||
"wrong!");
|
||||
|
||||
// output is a 10d tensor
|
||||
@@ -373,12 +375,13 @@ struct GridwiseConvolutionImplicitGemm_v1r3_lds_double_buffer_nchw_cyxk_khwn
|
||||
constexpr index_t N1 = NPerBlock / N2;
|
||||
|
||||
constexpr index_t W2 =
|
||||
(GemmNLevel0Cluster * GemmNLevel1Cluster) / f_dummy(NPerBlock / GemmNPerThreadSubC);
|
||||
(GemmNLevel0Cluster * GemmNLevel1Cluster) / fwd(NPerBlock / GemmNPerThreadSubC);
|
||||
constexpr index_t W1 = WoPerBlock / W2;
|
||||
|
||||
constexpr index_t K2 = GemmMPerThreadSubC;
|
||||
constexpr index_t K1 = KPerBlock / KPerThread;
|
||||
|
||||
#if 0
|
||||
constexpr auto out_10d_global_desc =
|
||||
make_ConstantTensorDescriptor(Sequence<K / (K1 * K2),
|
||||
K1,
|
||||
@@ -387,12 +390,23 @@ struct GridwiseConvolutionImplicitGemm_v1r3_lds_double_buffer_nchw_cyxk_khwn
|
||||
Wo / (W1 * W2),
|
||||
W1,
|
||||
W2,
|
||||
N / f_dummy(N1 * N2),
|
||||
N / fwd(N1 * N2),
|
||||
N1,
|
||||
N2>{});
|
||||
|
||||
constexpr auto out_10d_thread_desc = make_ConstantTensorDescriptor(
|
||||
Sequence<KPerThread / K2, 1, K2, HoPerThread, 1, W1, 1, 1, 1, N2>{});
|
||||
#else
|
||||
constexpr auto out_10d_global_desc = fwd(out_k_h_w_n_global_desc)
|
||||
.Fold(I3, Number<N1>{}, Number<N2>{})
|
||||
.Fold(I2, Number<W1>{}, Number<W2>{})
|
||||
.Fold(I0, Number<K1>{}, Number<K2>{});
|
||||
|
||||
constexpr auto out_10d_thread_desc = fwd(out_k_h_w_n_thread_desc)
|
||||
.Fold(I3, Number<1>{}, Number<N2>{})
|
||||
.Fold(I2, Number<W1>{}, Number<1>{})
|
||||
.Fold(I0, Number<1>{}, Number<K2>{});
|
||||
#endif
|
||||
|
||||
#if 0
|
||||
if(get_thread_local_1d_id() == 0 && get_block_1d_id() == 0)
|
||||
@@ -407,19 +421,19 @@ struct GridwiseConvolutionImplicitGemm_v1r3_lds_double_buffer_nchw_cyxk_khwn
|
||||
}
|
||||
#endif
|
||||
|
||||
threadwise_tensor_slice_copy(
|
||||
out_10d_thread_desc,
|
||||
p_out_thread,
|
||||
out_10d_global_desc,
|
||||
p_out_global +
|
||||
out_k_h_w_n_global_desc.Get1dIndex(k_block_data_begin + k_thread_data_begin,
|
||||
ho_block_data_begin + ho_thread_data_begin,
|
||||
wo_block_data_begin + wo_thread_data_begin,
|
||||
n_block_data_begin + n_thread_data_begin),
|
||||
out_10d_thread_desc.GetLengths(),
|
||||
Number<OutThreadCopyDataPerWrite_N>{});
|
||||
}).else_([&](auto f_dummy) {
|
||||
static_assert(f_dummy(GemmNPerThreadSubC) >= NPerBlock && NPerThread == NPerBlock &&
|
||||
threadwise_tensor_slice_copy(out_10d_thread_desc,
|
||||
p_out_thread,
|
||||
out_10d_global_desc,
|
||||
p_out_global +
|
||||
out_k_h_w_n_global_desc.GetOffsetFromMultiIndex(
|
||||
k_block_data_begin + k_thread_data_begin,
|
||||
ho_block_data_begin + ho_thread_data_begin,
|
||||
wo_block_data_begin + wo_thread_data_begin,
|
||||
n_block_data_begin + n_thread_data_begin),
|
||||
out_10d_thread_desc.GetLengths(),
|
||||
Number<OutThreadCopyDataPerWrite_N>{});
|
||||
}).else_([&](auto fwd) {
|
||||
static_assert(fwd(GemmNPerThreadSubC) >= NPerBlock && NPerThread == NPerBlock &&
|
||||
GemmNPerThreadSubC % NPerThread == 0,
|
||||
"wrong!");
|
||||
|
||||
@@ -428,16 +442,30 @@ struct GridwiseConvolutionImplicitGemm_v1r3_lds_double_buffer_nchw_cyxk_khwn
|
||||
|
||||
constexpr index_t W3 = GemmNPerThreadSubC / NPerBlock;
|
||||
constexpr index_t W2 = GemmNLevel0Cluster * GemmNLevel1Cluster;
|
||||
constexpr index_t W1 = WoPerBlock / f_dummy(W2 * W3);
|
||||
constexpr index_t W1 = WoPerBlock / fwd(W2 * W3);
|
||||
|
||||
constexpr index_t K2 = GemmMPerThreadSubC;
|
||||
constexpr index_t K1 = KPerBlock / KPerThread;
|
||||
|
||||
constexpr auto out_10d_global_desc = make_ConstantTensorDescriptor(
|
||||
#if 0
|
||||
constexpr auto out_10d_global_desc = make_packed_ConstantTensorDescriptor(
|
||||
Sequence<K / (K1 * K2), K1, K2, Ho, Wo / (W1 * W2 * W3), W1, W2, W3, N / N1, N1>{});
|
||||
|
||||
constexpr auto out_10d_thread_desc = make_ConstantTensorDescriptor(
|
||||
constexpr auto out_10d_thread_desc = make_packed_ConstantTensorDescriptor(
|
||||
Sequence<KPerThread / K2, 1, K2, HoPerThread, 1, W1, 1, W3, 1, N1>{});
|
||||
#else
|
||||
constexpr auto out_10d_global_desc =
|
||||
fwd(out_k_h_w_n_global_desc)
|
||||
.Fold(I3, Number<N1>{})
|
||||
.Fold(I2, Number<W1>{}, Number<W2>{}, Number<W3>{})
|
||||
.Fold(I0, Number<K1>{}, Number<K2>{});
|
||||
|
||||
constexpr auto out_10d_thread_desc =
|
||||
fwd(out_k_h_w_n_thread_desc)
|
||||
.Fold(I3, Number<N1>{})
|
||||
.Fold(I2, Number<W1>{}, Number<1>{}, Number<W3>{})
|
||||
.Fold(I0, Number<1>{}, Number<K2>{});
|
||||
#endif
|
||||
|
||||
#if 0
|
||||
if(get_thread_local_1d_id() == 0 && get_block_1d_id() == 0)
|
||||
@@ -457,17 +485,17 @@ struct GridwiseConvolutionImplicitGemm_v1r3_lds_double_buffer_nchw_cyxk_khwn
|
||||
}
|
||||
#endif
|
||||
|
||||
threadwise_tensor_slice_copy(
|
||||
out_10d_thread_desc,
|
||||
p_out_thread,
|
||||
out_10d_global_desc,
|
||||
p_out_global +
|
||||
out_k_h_w_n_global_desc.Get1dIndex(k_block_data_begin + k_thread_data_begin,
|
||||
ho_block_data_begin + ho_thread_data_begin,
|
||||
wo_block_data_begin + wo_thread_data_begin,
|
||||
n_block_data_begin + n_thread_data_begin),
|
||||
out_10d_thread_desc.GetLengths(),
|
||||
Number<OutThreadCopyDataPerWrite_N>{});
|
||||
threadwise_tensor_slice_copy(out_10d_thread_desc,
|
||||
p_out_thread,
|
||||
out_10d_global_desc,
|
||||
p_out_global +
|
||||
out_k_h_w_n_global_desc.GetOffsetFromMultiIndex(
|
||||
k_block_data_begin + k_thread_data_begin,
|
||||
ho_block_data_begin + ho_thread_data_begin,
|
||||
wo_block_data_begin + wo_thread_data_begin,
|
||||
n_block_data_begin + n_thread_data_begin),
|
||||
out_10d_thread_desc.GetLengths(),
|
||||
Number<OutThreadCopyDataPerWrite_N>{});
|
||||
});
|
||||
}
|
||||
};
|
||||
|
||||
@@ -86,10 +86,11 @@ struct GridwiseConvolutionImplicitGemm_v1r3_lds_double_buffer_nchw_cyxk_nkhw
|
||||
constexpr index_t HBlockWork = mod_conv::integer_divide_ceil(Ho, HoPerBlock);
|
||||
constexpr index_t WBlockWork = mod_conv::integer_divide_ceil(Wo, WoPerBlock);
|
||||
|
||||
constexpr auto block_work_desc = make_ConstantTensorDescriptor(
|
||||
constexpr auto block_work_desc = make_packed_ConstantTensorDescriptor(
|
||||
Sequence<NBlockWork, KBlockWork, HBlockWork, WBlockWork>{});
|
||||
|
||||
const auto block_work_multi_id = block_work_desc.GetMultiIndex(get_block_1d_id());
|
||||
const auto block_work_multi_id =
|
||||
block_work_desc.GetMultiIndexFrom1dIndex(get_block_1d_id());
|
||||
|
||||
const index_t n_block_data_begin = block_work_multi_id[0] * NPerBlock;
|
||||
const index_t k_block_data_begin = block_work_multi_id[1] * KPerBlock;
|
||||
@@ -109,7 +110,7 @@ struct GridwiseConvolutionImplicitGemm_v1r3_lds_double_buffer_nchw_cyxk_nkhw
|
||||
GemmDataPerReadA,
|
||||
GemmDataPerReadB);
|
||||
|
||||
constexpr auto in_c_h_w_n_block_desc = make_ConstantTensorDescriptor_aligned(
|
||||
constexpr auto in_c_h_w_n_block_desc = make_ranked_ConstantTensorDescriptor_with_alignment(
|
||||
Sequence<CPerBlock, HoPerBlock, WoPerBlock, NPerBlock>{},
|
||||
Number<InBlockReorderDataPerWrite_N>{});
|
||||
|
||||
@@ -118,12 +119,12 @@ struct GridwiseConvolutionImplicitGemm_v1r3_lds_double_buffer_nchw_cyxk_nkhw
|
||||
static_assert(in_c_h_w_n_block_desc.GetStride(I1) % GemmDataPerReadB == 0,
|
||||
"GemmDataPerReadB alignment requirement is not meet");
|
||||
|
||||
constexpr auto wei_c_k_block_desc = make_ConstantTensorDescriptor_aligned(
|
||||
constexpr auto wei_c_k_block_desc = make_ranked_ConstantTensorDescriptor_with_alignment(
|
||||
Sequence<CPerBlock, KPerBlock>{},
|
||||
Number<mod_conv::max(WeiBlockCopyDataPerRead_K, GemmDataPerReadA)>{});
|
||||
|
||||
// tensor view of threadwise output in register
|
||||
constexpr auto out_k_h_w_n_thread_desc = make_ConstantTensorDescriptor(
|
||||
constexpr auto out_k_h_w_n_thread_desc = make_packed_ConstantTensorDescriptor(
|
||||
Sequence<KPerThread, HoPerThread, WoPerThread, NPerThread>{});
|
||||
|
||||
// blockwise copy
|
||||
@@ -240,11 +241,12 @@ struct GridwiseConvolutionImplicitGemm_v1r3_lds_double_buffer_nchw_cyxk_nkhw
|
||||
{
|
||||
const Float* p_in_global_block_offset =
|
||||
p_in_global +
|
||||
in_n_c_h_w_global_desc.Get1dIndex(
|
||||
in_n_c_h_w_global_desc.GetOffsetFromMultiIndex(
|
||||
n_block_data_begin, 0, hi_block_data_begin + y, wi_block_data_begin + x);
|
||||
|
||||
const Float* p_wei_global_block_offset =
|
||||
p_wei_global + wei_c_y_x_k_global_desc.Get1dIndex(0, y, x, k_block_data_begin);
|
||||
p_wei_global +
|
||||
wei_c_y_x_k_global_desc.GetOffsetFromMultiIndex(0, y, x, k_block_data_begin);
|
||||
|
||||
// LDS double buffer: preload data into LDS
|
||||
{
|
||||
@@ -407,10 +409,11 @@ struct GridwiseConvolutionImplicitGemm_v1r3_lds_double_buffer_nchw_cyxk_nkhw
|
||||
p_out_thread,
|
||||
out_10d_global_desc,
|
||||
p_out_global +
|
||||
out_n_k_h_w_global_desc.Get1dIndex(n_block_data_begin + n_thread_data_begin,
|
||||
k_block_data_begin + k_thread_data_begin,
|
||||
ho_block_data_begin + ho_thread_data_begin,
|
||||
wo_block_data_begin + wo_thread_data_begin),
|
||||
out_n_k_h_w_global_desc.GetOffsetFromMultiIndex(
|
||||
n_block_data_begin + n_thread_data_begin,
|
||||
k_block_data_begin + k_thread_data_begin,
|
||||
ho_block_data_begin + ho_thread_data_begin,
|
||||
wo_block_data_begin + wo_thread_data_begin),
|
||||
out_10d_thread_desc.GetLengths(),
|
||||
map_out_global2thread);
|
||||
// Number<OutThreadCopyDataPerWrite_W>{});
|
||||
@@ -461,10 +464,11 @@ struct GridwiseConvolutionImplicitGemm_v1r3_lds_double_buffer_nchw_cyxk_nkhw
|
||||
p_out_thread,
|
||||
out_10d_global_desc,
|
||||
p_out_global +
|
||||
out_n_k_h_w_global_desc.Get1dIndex(n_block_data_begin + n_thread_data_begin,
|
||||
k_block_data_begin + k_thread_data_begin,
|
||||
ho_block_data_begin + ho_thread_data_begin,
|
||||
wo_block_data_begin + wo_thread_data_begin),
|
||||
out_n_k_h_w_global_desc.GetOffsetFromMultiIndex(
|
||||
n_block_data_begin + n_thread_data_begin,
|
||||
k_block_data_begin + k_thread_data_begin,
|
||||
ho_block_data_begin + ho_thread_data_begin,
|
||||
wo_block_data_begin + wo_thread_data_begin),
|
||||
out_10d_thread_desc.GetLengths(),
|
||||
map_out_global2thread);
|
||||
// Number<OutThreadCopyDataPerWrite_W>{});
|
||||
|
||||
@@ -236,11 +236,12 @@ struct GridwiseConvolutionImplicitGemm_v1r3_nchw_cyxk_khwn
|
||||
#if 1
|
||||
const Float* p_in_global_block_offset =
|
||||
p_in_global +
|
||||
in_n_c_h_w_global_desc.Get1dIndex(
|
||||
in_n_c_h_w_global_desc.GetOffsetFromMultiIndex(
|
||||
n_block_data_begin, 0, hi_block_data_begin, wi_block_data_begin);
|
||||
|
||||
const Float* p_wei_global_block_offset =
|
||||
p_wei_global + wei_c_y_x_k_global_desc.Get1dIndex(0, 0, 0, k_block_data_begin);
|
||||
p_wei_global +
|
||||
wei_c_y_x_k_global_desc.GetOffsetFromMultiIndex(0, 0, 0, k_block_data_begin);
|
||||
|
||||
for(index_t c_block_data_begin = 0; c_block_data_begin < C; c_block_data_begin += CPerBlock,
|
||||
p_in_global_block_offset += CPerBlock * in_n_c_h_w_global_desc.GetStride(I1),
|
||||
@@ -251,23 +252,27 @@ struct GridwiseConvolutionImplicitGemm_v1r3_nchw_cyxk_khwn
|
||||
for(index_t x = 0; x < X; ++x)
|
||||
{
|
||||
#if 1
|
||||
blockwise_in_copy_reorder.Run(p_in_global_block_offset +
|
||||
in_n_c_h_w_global_desc.Get1dIndex(0, 0, y, x),
|
||||
p_in_block);
|
||||
blockwise_in_copy_reorder.Run(
|
||||
p_in_global_block_offset +
|
||||
in_n_c_h_w_global_desc.GetOffsetFromMultiIndex(0, 0, y, x),
|
||||
p_in_block);
|
||||
|
||||
blockwise_wei_copy.Run(p_wei_global_block_offset +
|
||||
wei_c_y_x_k_global_desc.Get1dIndex(0, y, x, 0),
|
||||
p_wei_block);
|
||||
blockwise_wei_copy.Run(
|
||||
p_wei_global_block_offset +
|
||||
wei_c_y_x_k_global_desc.GetOffsetFromMultiIndex(0, y, x, 0),
|
||||
p_wei_block);
|
||||
#else
|
||||
Float p_in_clipboard[blockwise_in_copy_reorder.GetRegisterClipboardSize()];
|
||||
Float p_wei_clipboard[blockwise_wei_copy.GetRegisterClipboardSize()];
|
||||
|
||||
blockwise_in_copy_reorder.RunLoadRegisterClipboard(
|
||||
p_in_global_block_offset + in_n_c_h_w_global_desc.Get1dIndex(0, 0, y, x),
|
||||
p_in_global_block_offset +
|
||||
in_n_c_h_w_global_desc.GetOffsetFromMultiIndex(0, 0, y, x),
|
||||
p_in_clipboard);
|
||||
|
||||
blockwise_wei_copy.RunLoadRegisterClipboard(
|
||||
p_wei_global_block_offset + wei_c_y_x_k_global_desc.Get1dIndex(0, y, x, 0),
|
||||
p_wei_global_block_offset +
|
||||
wei_c_y_x_k_global_desc.GetOffsetFromMultiIndex(0, y, x, 0),
|
||||
p_wei_clipboard);
|
||||
|
||||
blockwise_wei_copy.RunStoreRegisterClipboard(p_wei_clipboard, p_wei_block);
|
||||
@@ -291,11 +296,12 @@ struct GridwiseConvolutionImplicitGemm_v1r3_nchw_cyxk_khwn
|
||||
{
|
||||
const Float* p_in_global_block_offset =
|
||||
p_in_global +
|
||||
in_n_c_h_w_global_desc.Get1dIndex(
|
||||
in_n_c_h_w_global_desc.GetOffsetFromMultiIndex(
|
||||
n_block_data_begin, 0, hi_block_data_begin + y, wi_block_data_begin + x);
|
||||
|
||||
const Float* p_wei_global_block_offset =
|
||||
p_wei_global + wei_c_y_x_k_global_desc.Get1dIndex(0, y, x, k_block_data_begin);
|
||||
p_wei_global +
|
||||
wei_c_y_x_k_global_desc.GetOffsetFromMultiIndex(0, y, x, k_block_data_begin);
|
||||
|
||||
for(index_t c_block_data_begin = 0; c_block_data_begin < C;
|
||||
c_block_data_begin += CPerBlock,
|
||||
@@ -390,17 +396,17 @@ struct GridwiseConvolutionImplicitGemm_v1r3_nchw_cyxk_khwn
|
||||
}
|
||||
#endif
|
||||
|
||||
threadwise_tensor_slice_copy(
|
||||
out_10d_thread_desc,
|
||||
p_out_thread,
|
||||
out_10d_global_desc,
|
||||
p_out_global +
|
||||
out_k_h_w_n_global_desc.Get1dIndex(k_block_data_begin + k_thread_data_begin,
|
||||
ho_block_data_begin + ho_thread_data_begin,
|
||||
wo_block_data_begin + wo_thread_data_begin,
|
||||
n_block_data_begin + n_thread_data_begin),
|
||||
out_10d_thread_desc.GetLengths(),
|
||||
Number<OutThreadCopyDataPerWrite_N>{});
|
||||
threadwise_tensor_slice_copy(out_10d_thread_desc,
|
||||
p_out_thread,
|
||||
out_10d_global_desc,
|
||||
p_out_global +
|
||||
out_k_h_w_n_global_desc.GetOffsetFromMultiIndex(
|
||||
k_block_data_begin + k_thread_data_begin,
|
||||
ho_block_data_begin + ho_thread_data_begin,
|
||||
wo_block_data_begin + wo_thread_data_begin,
|
||||
n_block_data_begin + n_thread_data_begin),
|
||||
out_10d_thread_desc.GetLengths(),
|
||||
Number<OutThreadCopyDataPerWrite_N>{});
|
||||
}).else_([&](auto f_dummy) {
|
||||
static_assert(f_dummy(GemmNPerThreadSubC) >= NPerBlock && NPerThread == NPerBlock &&
|
||||
GemmNPerThreadSubC % NPerThread == 0,
|
||||
@@ -440,17 +446,17 @@ struct GridwiseConvolutionImplicitGemm_v1r3_nchw_cyxk_khwn
|
||||
}
|
||||
#endif
|
||||
|
||||
threadwise_tensor_slice_copy(
|
||||
out_10d_thread_desc,
|
||||
p_out_thread,
|
||||
out_10d_global_desc,
|
||||
p_out_global +
|
||||
out_k_h_w_n_global_desc.Get1dIndex(k_block_data_begin + k_thread_data_begin,
|
||||
ho_block_data_begin + ho_thread_data_begin,
|
||||
wo_block_data_begin + wo_thread_data_begin,
|
||||
n_block_data_begin + n_thread_data_begin),
|
||||
out_10d_thread_desc.GetLengths(),
|
||||
Number<OutThreadCopyDataPerWrite_N>{});
|
||||
threadwise_tensor_slice_copy(out_10d_thread_desc,
|
||||
p_out_thread,
|
||||
out_10d_global_desc,
|
||||
p_out_global +
|
||||
out_k_h_w_n_global_desc.GetOffsetFromMultiIndex(
|
||||
k_block_data_begin + k_thread_data_begin,
|
||||
ho_block_data_begin + ho_thread_data_begin,
|
||||
wo_block_data_begin + wo_thread_data_begin,
|
||||
n_block_data_begin + n_thread_data_begin),
|
||||
out_10d_thread_desc.GetLengths(),
|
||||
Number<OutThreadCopyDataPerWrite_N>{});
|
||||
});
|
||||
}
|
||||
};
|
||||
|
||||
@@ -86,7 +86,8 @@ struct GridwiseConvolutionImplicitGemm_v1r3_nchw_cyxk_nkhw
|
||||
constexpr auto block_work_desc = make_ConstantTensorDescriptor(
|
||||
Sequence<NBlockWork, KBlockWork, HBlockWork, WBlockWork>{});
|
||||
|
||||
const auto block_work_multi_id = block_work_desc.GetMultiIndex(get_block_1d_id());
|
||||
const auto block_work_multi_id =
|
||||
block_work_desc.GetMultiIndexFrom1dIndex(get_block_1d_id());
|
||||
|
||||
const index_t n_block_data_begin = block_work_multi_id[0] * NPerBlock;
|
||||
const index_t k_block_data_begin = block_work_multi_id[1] * KPerBlock;
|
||||
@@ -234,11 +235,11 @@ struct GridwiseConvolutionImplicitGemm_v1r3_nchw_cyxk_nkhw
|
||||
#if 0
|
||||
const Float* p_in_global_block_offset =
|
||||
p_in_global +
|
||||
in_n_c_h_w_global_desc.Get1dIndex(
|
||||
in_n_c_h_w_global_desc.GetOffsetFromMultiIndex(
|
||||
n_block_data_begin, 0, hi_block_data_begin, wi_block_data_begin);
|
||||
|
||||
const Float* p_wei_global_block_offset =
|
||||
p_wei_global + wei_c_y_x_k_global_desc.Get1dIndex(0, 0, 0, k_block_data_begin);
|
||||
p_wei_global + wei_c_y_x_k_global_desc.GetOffsetFromMultiIndex(0, 0, 0, k_block_data_begin);
|
||||
|
||||
for(index_t c_block_data_begin = 0; c_block_data_begin < C; c_block_data_begin += CPerBlock,
|
||||
p_in_global_block_offset += CPerBlock * in_n_c_h_w_global_desc.GetStride(I1),
|
||||
@@ -250,22 +251,22 @@ struct GridwiseConvolutionImplicitGemm_v1r3_nchw_cyxk_nkhw
|
||||
{
|
||||
#if 1
|
||||
blockwise_in_copy_reorder.Run(p_in_global_block_offset +
|
||||
in_n_c_h_w_global_desc.Get1dIndex(0, 0, y, x),
|
||||
in_n_c_h_w_global_desc.GetOffsetFromMultiIndex(0, 0, y, x),
|
||||
p_in_block);
|
||||
|
||||
blockwise_wei_copy.Run(p_wei_global_block_offset +
|
||||
wei_c_y_x_k_global_desc.Get1dIndex(0, y, x, 0),
|
||||
wei_c_y_x_k_global_desc.GetOffsetFromMultiIndex(0, y, x, 0),
|
||||
p_wei_block);
|
||||
#else
|
||||
Float p_in_clipboard[blockwise_in_copy_reorder.GetRegisterClipboardSize()];
|
||||
Float p_wei_clipboard[blockwise_wei_copy.GetRegisterClipboardSize()];
|
||||
|
||||
blockwise_in_copy_reorder.RunLoadRegisterClipboard(
|
||||
p_in_global_block_offset + in_n_c_h_w_global_desc.Get1dIndex(0, 0, y, x),
|
||||
p_in_global_block_offset + in_n_c_h_w_global_desc.GetOffsetFromMultiIndex(0, 0, y, x),
|
||||
p_in_clipboard);
|
||||
|
||||
blockwise_wei_copy.RunLoadRegisterClipboard(
|
||||
p_wei_global_block_offset + wei_c_y_x_k_global_desc.Get1dIndex(0, y, x, 0),
|
||||
p_wei_global_block_offset + wei_c_y_x_k_global_desc.GetOffsetFromMultiIndex(0, y, x, 0),
|
||||
p_wei_clipboard);
|
||||
|
||||
blockwise_wei_copy.RunStoreRegisterClipboard(p_wei_clipboard, p_wei_block);
|
||||
@@ -289,11 +290,12 @@ struct GridwiseConvolutionImplicitGemm_v1r3_nchw_cyxk_nkhw
|
||||
{
|
||||
const Float* p_in_global_block_offset =
|
||||
p_in_global +
|
||||
in_n_c_h_w_global_desc.Get1dIndex(
|
||||
in_n_c_h_w_global_desc.GetOffsetFromMultiIndex(
|
||||
n_block_data_begin, 0, hi_block_data_begin + y, wi_block_data_begin + x);
|
||||
|
||||
const Float* p_wei_global_block_offset =
|
||||
p_wei_global + wei_c_y_x_k_global_desc.Get1dIndex(0, y, x, k_block_data_begin);
|
||||
p_wei_global +
|
||||
wei_c_y_x_k_global_desc.GetOffsetFromMultiIndex(0, y, x, k_block_data_begin);
|
||||
|
||||
for(index_t c_block_data_begin = 0; c_block_data_begin < C;
|
||||
c_block_data_begin += CPerBlock,
|
||||
@@ -395,10 +397,11 @@ struct GridwiseConvolutionImplicitGemm_v1r3_nchw_cyxk_nkhw
|
||||
p_out_thread,
|
||||
out_10d_global_desc,
|
||||
p_out_global +
|
||||
out_n_k_h_w_global_desc.Get1dIndex(n_block_data_begin + n_thread_data_begin,
|
||||
k_block_data_begin + k_thread_data_begin,
|
||||
ho_block_data_begin + ho_thread_data_begin,
|
||||
wo_block_data_begin + wo_thread_data_begin),
|
||||
out_n_k_h_w_global_desc.GetOffsetFromMultiIndex(
|
||||
n_block_data_begin + n_thread_data_begin,
|
||||
k_block_data_begin + k_thread_data_begin,
|
||||
ho_block_data_begin + ho_thread_data_begin,
|
||||
wo_block_data_begin + wo_thread_data_begin),
|
||||
out_10d_thread_desc.GetLengths(),
|
||||
map_out_global2thread);
|
||||
// Number<OutThreadCopyDataPerWrite_W>{});
|
||||
@@ -444,10 +447,11 @@ struct GridwiseConvolutionImplicitGemm_v1r3_nchw_cyxk_nkhw
|
||||
p_out_thread,
|
||||
out_10d_global_desc,
|
||||
p_out_global +
|
||||
out_n_k_h_w_global_desc.Get1dIndex(n_block_data_begin + n_thread_data_begin,
|
||||
k_block_data_begin + k_thread_data_begin,
|
||||
ho_block_data_begin + ho_thread_data_begin,
|
||||
wo_block_data_begin + wo_thread_data_begin),
|
||||
out_n_k_h_w_global_desc.GetOffsetFromMultiIndex(
|
||||
n_block_data_begin + n_thread_data_begin,
|
||||
k_block_data_begin + k_thread_data_begin,
|
||||
ho_block_data_begin + ho_thread_data_begin,
|
||||
wo_block_data_begin + wo_thread_data_begin),
|
||||
out_10d_thread_desc.GetLengths(),
|
||||
map_out_global2thread);
|
||||
// Number<OutThreadCopyDataPerWrite_W>{});
|
||||
|
||||
@@ -193,10 +193,11 @@ struct GridwiseConvolutionImplicitGemm_v2_chwn_cyxk_khwn
|
||||
__shared__ Float p_wei_block[wei_block_space];
|
||||
|
||||
const Float* p_in_global_block_offset =
|
||||
p_in_global + in_cb_global_desc.Get1dIndex(0, b_block_data_begin);
|
||||
p_in_global + in_cb_global_desc.GetOffsetFromMultiIndex(0, b_block_data_begin);
|
||||
|
||||
const Float* p_wei_global_block_offset =
|
||||
p_wei_global + wei_cyxk_global_desc.Get1dIndex(0, 0, 0, k_block_data_begin);
|
||||
p_wei_global +
|
||||
wei_cyxk_global_desc.GetOffsetFromMultiIndex(0, 0, 0, k_block_data_begin);
|
||||
|
||||
// register
|
||||
Float p_out_thread[out_kb_thread_desc.GetElementSpace()];
|
||||
@@ -236,7 +237,7 @@ struct GridwiseConvolutionImplicitGemm_v2_chwn_cyxk_khwn
|
||||
#elif 1
|
||||
blockwise_gemm.Run_asm
|
||||
#endif
|
||||
(p_wei_block + wei_cyxk_block_desc.Get1dIndex(0, y, x, 0),
|
||||
(p_wei_block + wei_cyxk_block_desc.GetOffsetFromMultiIndex(0, y, x, 0),
|
||||
p_in_block + y * Wi + x,
|
||||
p_out_thread);
|
||||
}
|
||||
@@ -267,8 +268,9 @@ struct GridwiseConvolutionImplicitGemm_v2_chwn_cyxk_khwn
|
||||
|
||||
if(n_data < N && h_data < Ho && w_data < Wo)
|
||||
{
|
||||
p_out_global[out_khwn_global_desc.Get1dIndex(k_data, h_data, w_data, n_data)] =
|
||||
p_out_thread[out_kb_thread_desc.Get1dIndex(k, b)];
|
||||
p_out_global[out_khwn_global_desc.GetOffsetFromMultiIndex(
|
||||
k_data, h_data, w_data, n_data)] =
|
||||
p_out_thread[out_kb_thread_desc.GetOffsetFromMultiIndex(k, b)];
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -198,10 +198,11 @@ struct GridwiseConvolutionImplicitGemm_v2_chwn_cyxk_khwn_lds_double_buffer
|
||||
__shared__ Float p_wei_block_double[2 * wei_block_space];
|
||||
|
||||
const Float* p_in_global_block_offset =
|
||||
p_in_global + in_cb_global_desc.Get1dIndex(0, b_block_data_begin);
|
||||
p_in_global + in_cb_global_desc.GetOffsetFromMultiIndex(0, b_block_data_begin);
|
||||
|
||||
const Float* p_wei_global_block_offset =
|
||||
p_wei_global + wei_cyxk_global_desc.Get1dIndex(0, 0, 0, k_block_data_begin);
|
||||
p_wei_global +
|
||||
wei_cyxk_global_desc.GetOffsetFromMultiIndex(0, 0, 0, k_block_data_begin);
|
||||
|
||||
// preload data into LDS
|
||||
{
|
||||
@@ -269,7 +270,8 @@ struct GridwiseConvolutionImplicitGemm_v2_chwn_cyxk_khwn_lds_double_buffer
|
||||
#elif 0
|
||||
blockwise_gemm.Run_asm
|
||||
#endif
|
||||
(p_wei_block_now + wei_cyxk_block_desc.Get1dIndex(0, y, x, 0),
|
||||
(p_wei_block_now +
|
||||
wei_cyxk_block_desc.GetOffsetFromMultiIndex(0, y, x, 0),
|
||||
p_in_block_now + y * Wi + x,
|
||||
p_out_thread);
|
||||
}
|
||||
@@ -310,7 +312,8 @@ struct GridwiseConvolutionImplicitGemm_v2_chwn_cyxk_khwn_lds_double_buffer
|
||||
#elif 0
|
||||
blockwise_gemm.Run_asm
|
||||
#endif
|
||||
(p_wei_block_double + wei_cyxk_block_desc.Get1dIndex(0, y, x, 0),
|
||||
(p_wei_block_double +
|
||||
wei_cyxk_block_desc.GetOffsetFromMultiIndex(0, y, x, 0),
|
||||
p_in_block_double + y * Wi + x,
|
||||
p_out_thread);
|
||||
}
|
||||
@@ -336,7 +339,7 @@ struct GridwiseConvolutionImplicitGemm_v2_chwn_cyxk_khwn_lds_double_buffer
|
||||
blockwise_gemm.Run_asm
|
||||
#endif
|
||||
(p_wei_block_double + wei_block_space +
|
||||
wei_cyxk_block_desc.Get1dIndex(0, y, x, 0),
|
||||
wei_cyxk_block_desc.GetOffsetFromMultiIndex(0, y, x, 0),
|
||||
p_in_block_double + in_block_space + y * Wi + x,
|
||||
p_out_thread);
|
||||
}
|
||||
@@ -365,14 +368,14 @@ struct GridwiseConvolutionImplicitGemm_v2_chwn_cyxk_khwn_lds_double_buffer
|
||||
|
||||
constexpr auto out_kb_global_desc = make_ConstantTensorDescriptor(Sequence<K, B>{});
|
||||
|
||||
threadwise_6d_tensor_copy(
|
||||
out_6d_thread_desc,
|
||||
p_out_thread,
|
||||
out_6d_global_desc,
|
||||
p_out_global +
|
||||
out_kb_global_desc.Get1dIndex(k_thread_data_begin, b_thread_data_begin),
|
||||
out_6d_thread_desc.GetLengths(),
|
||||
Number<OutThreadCopyDataPerWrite>{});
|
||||
threadwise_6d_tensor_copy(out_6d_thread_desc,
|
||||
p_out_thread,
|
||||
out_6d_global_desc,
|
||||
p_out_global +
|
||||
out_kb_global_desc.GetOffsetFromMultiIndex(
|
||||
k_thread_data_begin, b_thread_data_begin),
|
||||
out_6d_thread_desc.GetLengths(),
|
||||
Number<OutThreadCopyDataPerWrite>{});
|
||||
}
|
||||
else
|
||||
{
|
||||
@@ -393,9 +396,9 @@ struct GridwiseConvolutionImplicitGemm_v2_chwn_cyxk_khwn_lds_double_buffer
|
||||
|
||||
if(n_data < N && h_data < Ho && w_data < Wo)
|
||||
{
|
||||
p_out_global[out_khwn_global_desc.Get1dIndex(
|
||||
p_out_global[out_khwn_global_desc.GetOffsetFromMultiIndex(
|
||||
k_data, h_data, w_data, n_data)] =
|
||||
p_out_thread[out_kb_thread_desc.Get1dIndex(k, b)];
|
||||
p_out_thread[out_kb_thread_desc.GetOffsetFromMultiIndex(k, b)];
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -83,7 +83,8 @@ struct GridwiseConvolutionImplicitGemm_v3_nchw_cyxk_nkhw
|
||||
constexpr auto block_work_desc =
|
||||
make_ConstantTensorDescriptor(Sequence<KBlockWork, BBlockWork>{});
|
||||
|
||||
const auto block_work_multi_id = block_work_desc.GetMultiIndex(get_block_1d_id());
|
||||
const auto block_work_multi_id =
|
||||
block_work_desc.GetMultiIndexFrom1dIndex(get_block_1d_id());
|
||||
|
||||
const index_t k_block_data_on_global = block_work_multi_id[0] * KPerBlock;
|
||||
const index_t b_block_data_on_global = block_work_multi_id[1] * BPerBlock;
|
||||
@@ -219,10 +220,10 @@ struct GridwiseConvolutionImplicitGemm_v3_nchw_cyxk_nkhw
|
||||
{
|
||||
// calculate origin of block input and weight tensor on global memory
|
||||
const Float* p_in_block_on_global =
|
||||
p_in_global + in_n_c_h_w_global_desc.Get1dIndex(0, 0, y, x);
|
||||
p_in_global + in_n_c_h_w_global_desc.GetOffsetFromMultiIndex(0, 0, y, x);
|
||||
|
||||
const Float* p_wei_block_on_global =
|
||||
p_wei_global + wei_c_y_x_k_global_desc.Get1dIndex(0, y, x, 0);
|
||||
p_wei_global + wei_c_y_x_k_global_desc.GetOffsetFromMultiIndex(0, y, x, 0);
|
||||
|
||||
for(index_t
|
||||
c_block_data_on_global = 0;
|
||||
@@ -285,7 +286,7 @@ struct GridwiseConvolutionImplicitGemm_v3_nchw_cyxk_nkhw
|
||||
// origin of thread tensor in global memory
|
||||
const index_t p_out_thread_on_global =
|
||||
p_out_global +
|
||||
out_k_n1_b_n2_global_merged_desc.Get1dIndex(
|
||||
out_k_n1_b_n2_global_merged_desc.GetOffsetFromMultiIndex(
|
||||
k_thread_data_on_global, 0, 0, 0); // dst origin on merged global tensor
|
||||
|
||||
// copy
|
||||
|
||||
@@ -190,18 +190,19 @@ __global__ void gridwise_direct_convolution_2_vectorized_nchw_kcyx_nkhw(
|
||||
c_block_data_begin += CPerBlock, __syncthreads())
|
||||
{
|
||||
// copy input tensor to LDS
|
||||
blockwise_in_copy.Run(p_in_vec_global +
|
||||
in_nchw_vec_global_desc.Get1dIndex(n_block_data_begin,
|
||||
c_block_data_begin,
|
||||
hi_block_data_begin,
|
||||
wi_block_data_begin),
|
||||
p_in_vec_block);
|
||||
blockwise_in_copy.Run(
|
||||
p_in_vec_global +
|
||||
in_nchw_vec_global_desc.GetOffsetFromMultiIndex(n_block_data_begin,
|
||||
c_block_data_begin,
|
||||
hi_block_data_begin,
|
||||
wi_block_data_begin),
|
||||
p_in_vec_block);
|
||||
|
||||
// copy weight tensor to LDS
|
||||
blockwise_wei_copy.Run(
|
||||
p_wei_vec_global +
|
||||
wei_kcyx_vec_global_desc.Get1dIndex(k_block_data_begin, c_block_data_begin, 0, 0),
|
||||
p_wei_vec_block);
|
||||
blockwise_wei_copy.Run(p_wei_vec_global +
|
||||
wei_kcyx_vec_global_desc.GetOffsetFromMultiIndex(
|
||||
k_block_data_begin, c_block_data_begin, 0, 0),
|
||||
p_wei_vec_block);
|
||||
|
||||
__syncthreads();
|
||||
|
||||
@@ -212,26 +213,28 @@ __global__ void gridwise_direct_convolution_2_vectorized_nchw_kcyx_nkhw(
|
||||
threadwise_direct_convolution_2(
|
||||
in_nchw_vec_thread_block_desc,
|
||||
p_in_vec_block +
|
||||
in_nchw_vec_block_desc.Get1dIndex(n_thread_data_begin,
|
||||
c_thread_data,
|
||||
hi_thread_data_begin,
|
||||
wi_thread_data_begin),
|
||||
in_nchw_vec_block_desc.GetOffsetFromMultiIndex(n_thread_data_begin,
|
||||
c_thread_data,
|
||||
hi_thread_data_begin,
|
||||
wi_thread_data_begin),
|
||||
wei_kcyx_vec_thread_block_desc,
|
||||
p_wei_vec_block +
|
||||
wei_kcyx_vec_block_desc.Get1dIndex(k_thread_data_begin, c_thread_data, 0, 0),
|
||||
wei_kcyx_vec_block_desc.GetOffsetFromMultiIndex(
|
||||
k_thread_data_begin, c_thread_data, 0, 0),
|
||||
out_nkhw_thread_desc,
|
||||
p_out_thread);
|
||||
#elif 0
|
||||
threadwise_direct_convolution_3(
|
||||
in_nchw_vec_thread_block_desc,
|
||||
p_in_vec_block +
|
||||
in_nchw_vec_block_desc.Get1dIndex(n_thread_data_begin,
|
||||
c_thread_data,
|
||||
hi_thread_data_begin,
|
||||
wi_thread_data_begin),
|
||||
in_nchw_vec_block_desc.GetOffsetFromMultiIndex(n_thread_data_begin,
|
||||
c_thread_data,
|
||||
hi_thread_data_begin,
|
||||
wi_thread_data_begin),
|
||||
wei_kcyx_vec_thread_block_desc,
|
||||
p_wei_vec_block +
|
||||
wei_kcyx_vec_block_desc.Get1dIndex(k_thread_data_begin, c_thread_data, 0, 0),
|
||||
wei_kcyx_vec_block_desc.GetOffsetFromMultiIndex(
|
||||
k_thread_data_begin, c_thread_data, 0, 0),
|
||||
out_nkhw_thread_desc,
|
||||
p_out_thread);
|
||||
#endif
|
||||
@@ -239,14 +242,14 @@ __global__ void gridwise_direct_convolution_2_vectorized_nchw_kcyx_nkhw(
|
||||
}
|
||||
|
||||
// copy output tensor from register to global mem
|
||||
threadwise_4d_tensor_copy(
|
||||
out_nkhw_thread_desc,
|
||||
p_out_thread,
|
||||
out_nkhw_global_desc,
|
||||
p_out_global +
|
||||
out_nkhw_global_desc.Get1dIndex(n_block_data_begin + n_thread_data_begin,
|
||||
k_block_data_begin + k_thread_data_begin,
|
||||
ho_block_data_begin + ho_thread_data_begin,
|
||||
wo_block_data_begin + wo_thread_data_begin),
|
||||
out_nkhw_thread_desc.GetLengths());
|
||||
threadwise_4d_tensor_copy(out_nkhw_thread_desc,
|
||||
p_out_thread,
|
||||
out_nkhw_global_desc,
|
||||
p_out_global +
|
||||
out_nkhw_global_desc.GetOffsetFromMultiIndex(
|
||||
n_block_data_begin + n_thread_data_begin,
|
||||
k_block_data_begin + k_thread_data_begin,
|
||||
ho_block_data_begin + ho_thread_data_begin,
|
||||
wo_block_data_begin + wo_thread_data_begin),
|
||||
out_nkhw_thread_desc.GetLengths());
|
||||
}
|
||||
|
||||
@@ -217,7 +217,7 @@ __global__ void gridwise_implicit_gemm_convolution_1_chwn_cyxk_khwn_padded(
|
||||
threadwise_4d_tensor_set_zero(out_hkwn_thread_desc, p_out_thread);
|
||||
|
||||
const Float* p_wei_global_block_begin =
|
||||
p_wei_global + wei_ek_global_desc.Get1dIndex(0, k_block_data_begin);
|
||||
p_wei_global + wei_ek_global_desc.GetOffsetFromMultiIndex(0, k_block_data_begin);
|
||||
|
||||
for(index_t c_block_data_begin = 0; c_block_data_begin < C; c_block_data_begin += CPerBlock,
|
||||
p_wei_global_block_begin += CPerBlock * wei_ek_global_desc.GetStride(I0),
|
||||
@@ -251,10 +251,11 @@ __global__ void gridwise_implicit_gemm_convolution_1_chwn_cyxk_khwn_padded(
|
||||
{
|
||||
auto f_accum = [](auto& acc, const auto&& v) { acc += v; };
|
||||
|
||||
blockwise_batch_gemm.Run(p_wei_block + wei_cyxk_block_desc.Get1dIndex(0, y, x, 0),
|
||||
p_in_block + in_chwn_block_desc.Get1dIndex(0, y, x, 0),
|
||||
p_out_thread,
|
||||
f_accum);
|
||||
blockwise_batch_gemm.Run(
|
||||
p_wei_block + wei_cyxk_block_desc.GetOffsetFromMultiIndex(0, y, x, 0),
|
||||
p_in_block + in_chwn_block_desc.GetOffsetFromMultiIndex(0, y, x, 0),
|
||||
p_out_thread,
|
||||
f_accum);
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -284,10 +285,10 @@ __global__ void gridwise_implicit_gemm_convolution_1_chwn_cyxk_khwn_padded(
|
||||
p_out_thread,
|
||||
out_khwn_global_desc,
|
||||
p_out_global +
|
||||
out_khwn_global_desc.Get1dIndex(k_block_data_begin + k_thread_data_begin,
|
||||
ho_block_data_begin + ho_thread_data_begin,
|
||||
wo_block_data_begin + wo_thread_data_begin,
|
||||
n_block_data_begin + n_thread_data_begin),
|
||||
out_khwn_global_desc.GetOffsetFromMultiIndex(k_block_data_begin + k_thread_data_begin,
|
||||
ho_block_data_begin + ho_thread_data_begin,
|
||||
wo_block_data_begin + wo_thread_data_begin,
|
||||
n_block_data_begin + n_thread_data_begin),
|
||||
out_hkwn_thread_desc.GetLengths(),
|
||||
reorder_khwn_from_hkwn);
|
||||
}
|
||||
|
||||
@@ -93,7 +93,7 @@ struct TensorDescriptor
|
||||
const std::vector<std::size_t>& GetStrides() const;
|
||||
|
||||
template <class... Is>
|
||||
std::size_t Get1dIndex(Is... is) const
|
||||
std::size_t GetOffsetFromMultiIndex(Is... is) const
|
||||
{
|
||||
assert(sizeof...(Is) == this->GetNumOfDimension());
|
||||
std::initializer_list<std::size_t> iss{static_cast<std::size_t>(is)...};
|
||||
@@ -246,13 +246,13 @@ struct Tensor
|
||||
template <class... Is>
|
||||
T& operator()(Is... is)
|
||||
{
|
||||
return mData[mDesc.Get1dIndex(is...)];
|
||||
return mData[mDesc.GetOffsetFromMultiIndex(is...)];
|
||||
}
|
||||
|
||||
template <class... Is>
|
||||
const T& operator()(Is... is) const
|
||||
{
|
||||
return mData[mDesc.Get1dIndex(is...)];
|
||||
return mData[mDesc.GetOffsetFromMultiIndex(is...)];
|
||||
}
|
||||
|
||||
typename std::vector<T>::iterator begin() { return mData.begin(); }
|
||||
|
||||
@@ -20,7 +20,7 @@ __device__ void threadwise_2d_tensor_pointwise_operation_unary(Desc, Float* __re
|
||||
{
|
||||
for(index_t did1 = 0; did1 < desc.GetLength(I1); ++did1)
|
||||
{
|
||||
const index_t dindex = desc.Get1dIndex(did0, did1);
|
||||
const index_t dindex = desc.GetOffsetFromMultiIndex(did0, did1);
|
||||
|
||||
f(p[dindex]);
|
||||
}
|
||||
@@ -53,11 +53,11 @@ __device__ void threadwise_2d_tensor_pointwise_operation_binary_reorder_by_get_d
|
||||
{
|
||||
for(index_t did1 = 0; did1 < ref_desc.GetLength(I1); ++did1)
|
||||
{
|
||||
const index_t aindex = src_desc.Get1dIndex(did0, did1);
|
||||
const index_t aindex = src_desc.GetOffsetFromMultiIndex(did0, did1);
|
||||
|
||||
const index_t did[2] = {did0, did1};
|
||||
|
||||
const index_t bindex = dst_desc.Get1dIndex(did[IR0], did[IR1]);
|
||||
const index_t bindex = dst_desc.GetOffsetFromMultiIndex(did[IR0], did[IR1]);
|
||||
|
||||
f(p_src[aindex], p_dst[bindex]);
|
||||
}
|
||||
@@ -127,7 +127,7 @@ __device__ void threadwise_2d_tensor_shift_down(Desc, Float* __restrict__ p, IDi
|
||||
{
|
||||
for(index_t did1 = 0; did1 < did1_end; ++did1)
|
||||
{
|
||||
const index_t dindex = desc.Get1dIndex(did0, did1);
|
||||
const index_t dindex = desc.GetOffsetFromMultiIndex(did0, did1);
|
||||
|
||||
const index_t sindex = dindex + nshift * desc.GetStride(IDim{});
|
||||
|
||||
|
||||
@@ -26,7 +26,7 @@ __device__ void threadwise_4d_tensor_pointwise_operation_unary(Desc, Float* __re
|
||||
{
|
||||
for(index_t did3 = 0; did3 < desc.GetLength(I3); ++did3)
|
||||
{
|
||||
const index_t dindex = desc.Get1dIndex(did0, did1, did2, did3);
|
||||
const index_t dindex = desc.GetOffsetFromMultiIndex(did0, did1, did2, did3);
|
||||
|
||||
f(p[dindex]);
|
||||
}
|
||||
@@ -75,12 +75,12 @@ __device__ void threadwise_4d_tensor_pointwise_operation_binary_reorder_given_ds
|
||||
{
|
||||
for(index_t did3 = 0; did3 < ref_desc.GetLength(I3); ++did3)
|
||||
{
|
||||
const index_t aindex = src_desc.Get1dIndex(did0, did1, did2, did3);
|
||||
const index_t aindex = src_desc.GetOffsetFromMultiIndex(did0, did1, did2, did3);
|
||||
|
||||
const index_t did[4] = {did0, did1, did2, did3};
|
||||
|
||||
const index_t bindex =
|
||||
dst_desc.Get1dIndex(did[IR0], did[IR1], did[IR2], did[IR3]);
|
||||
dst_desc.GetOffsetFromMultiIndex(did[IR0], did[IR1], did[IR2], did[IR3]);
|
||||
|
||||
f(p_src[aindex], p_dst[bindex]);
|
||||
|
||||
@@ -178,7 +178,7 @@ __device__ void threadwise_4d_tensor_shift_down(Desc, Float* __restrict__ p, IDi
|
||||
{
|
||||
for(index_t did3 = 0; did3 < did3_end; ++did3)
|
||||
{
|
||||
const index_t dindex = desc.Get1dIndex(did0, did1, did2, did3);
|
||||
const index_t dindex = desc.GetOffsetFromMultiIndex(did0, did1, did2, did3);
|
||||
|
||||
const index_t sindex = dindex + nshift * desc.GetStride(IDim{});
|
||||
|
||||
|
||||
@@ -46,11 +46,14 @@ __device__ void threadwise_direct_convolution_1(InDesc,
|
||||
const index_t hi = ho + y;
|
||||
const index_t wi = wo + x;
|
||||
|
||||
const index_t in_index = in_desc.Get1dIndex(n, c, hi, wi);
|
||||
const index_t in_index =
|
||||
in_desc.GetOffsetFromMultiIndex(n, c, hi, wi);
|
||||
|
||||
const index_t wei_index = wei_desc.Get1dIndex(k, c, y, x);
|
||||
const index_t wei_index =
|
||||
wei_desc.GetOffsetFromMultiIndex(k, c, y, x);
|
||||
|
||||
const index_t out_index = out_desc.Get1dIndex(n, k, ho, wo);
|
||||
const index_t out_index =
|
||||
out_desc.GetOffsetFromMultiIndex(n, k, ho, wo);
|
||||
|
||||
fused_multiply_accumulate(
|
||||
p_out[out_index], p_wei[wei_index], p_in[in_index]);
|
||||
@@ -143,14 +146,14 @@ __device__ void threadwise_direct_convolution_3(InDesc,
|
||||
{
|
||||
// read first input
|
||||
threadwise_4d_tensor_copy(in_desc,
|
||||
p_in + in_desc.Get1dIndex(0, 0, y, 0),
|
||||
p_in + in_desc.GetOffsetFromMultiIndex(0, 0, y, 0),
|
||||
in_reg_desc,
|
||||
p_in_reg,
|
||||
in_reg_desc.GetLengths());
|
||||
|
||||
// read first 1x1 weight
|
||||
threadwise_4d_tensor_copy(wei_desc,
|
||||
p_wei + wei_desc.Get1dIndex(0, 0, y, 0),
|
||||
p_wei + wei_desc.GetOffsetFromMultiIndex(0, 0, y, 0),
|
||||
wei_reg_desc,
|
||||
p_wei_reg,
|
||||
wei_reg_desc.GetLengths());
|
||||
@@ -164,7 +167,7 @@ __device__ void threadwise_direct_convolution_3(InDesc,
|
||||
{
|
||||
// read new weight
|
||||
threadwise_4d_tensor_copy(wei_desc,
|
||||
p_wei + wei_desc.Get1dIndex(0, 0, y, x),
|
||||
p_wei + wei_desc.GetOffsetFromMultiIndex(0, 0, y, x),
|
||||
wei_reg_desc,
|
||||
p_wei_reg,
|
||||
wei_reg_desc.GetLengths());
|
||||
@@ -175,10 +178,10 @@ __device__ void threadwise_direct_convolution_3(InDesc,
|
||||
// read new input
|
||||
threadwise_4d_tensor_copy(
|
||||
in_desc,
|
||||
p_in + in_desc.Get1dIndex(0, 0, y, x + in_reg_desc.GetLength(I3) - 1),
|
||||
p_in + in_desc.GetOffsetFromMultiIndex(0, 0, y, x + in_reg_desc.GetLength(I3) - 1),
|
||||
in_reg_desc,
|
||||
p_in_reg +
|
||||
in_reg_desc.Get1dIndex(0, 0, 0, in_reg_desc.GetLength(I3) - in_w_new_read),
|
||||
in_reg_desc.GetOffsetFromMultiIndex(0, 0, 0, in_reg_desc.GetLength(I3) - in_w_new_read),
|
||||
in_desc_reg_new_read.GetLengths());
|
||||
|
||||
// do 1x1 conv
|
||||
@@ -196,14 +199,14 @@ __device__ void threadwise_direct_convolution_3(InDesc,
|
||||
{
|
||||
// read new weight
|
||||
threadwise_4d_tensor_copy(wei_desc,
|
||||
p_wei + wei_desc.Get1dIndex(0, 0, y, x),
|
||||
p_wei + wei_desc.GetOffsetFromMultiIndex(0, 0, y, x),
|
||||
wei_reg_desc,
|
||||
p_wei_reg,
|
||||
wei_reg_desc.GetLengths());
|
||||
|
||||
// read new input
|
||||
threadwise_4d_tensor_copy(in_desc,
|
||||
p_in + in_desc.Get1dIndex(0, 0, y, x),
|
||||
p_in + in_desc.GetOffsetFromMultiIndex(0, 0, y, x),
|
||||
in_reg_desc,
|
||||
p_in_reg,
|
||||
in_reg_desc.GetLengths());
|
||||
|
||||
@@ -9,7 +9,7 @@ __device__ void threadwise_matrix_set_zero(Matrix, Float* __restrict__ p_thread)
|
||||
{
|
||||
for(index_t j = 0; j < Matrix::NCol(); ++j)
|
||||
{
|
||||
const index_t id = Matrix::Get1dIndex(i, j);
|
||||
const index_t id = Matrix::GetOffsetFromMultiIndex(i, j);
|
||||
p_thread[id] = 0;
|
||||
}
|
||||
}
|
||||
@@ -39,8 +39,8 @@ __device__ void threadwise_matrix_copy(SrcMatrix,
|
||||
{
|
||||
for(index_t j = 0; j < NCol; j += DataPerRead)
|
||||
{
|
||||
const index_t src_index = src_mtx.Get1dIndex(i, j);
|
||||
const index_t dst_index = dst_mtx.Get1dIndex(i, j);
|
||||
const index_t src_index = src_mtx.GetOffsetFromMultiIndex(i, j);
|
||||
const index_t dst_index = dst_mtx.GetOffsetFromMultiIndex(i, j);
|
||||
|
||||
*reinterpret_cast<vector_t*>(&p_dst[dst_index]) =
|
||||
*reinterpret_cast<const vector_t*>(&p_src[src_index]);
|
||||
@@ -83,9 +83,9 @@ __device__ void threadwise_gemm(MatrixA,
|
||||
{
|
||||
for(index_t j = 0; j < N; ++j)
|
||||
{
|
||||
const index_t aindex = a_mtx.Get1dIndex(k, i); // A is transposed
|
||||
const index_t bindex = b_mtx.Get1dIndex(k, j);
|
||||
const index_t cindex = c_mtx.Get1dIndex(i, j);
|
||||
const index_t aindex = a_mtx.GetOffsetFromMultiIndex(k, i); // A is transposed
|
||||
const index_t bindex = b_mtx.GetOffsetFromMultiIndex(k, j);
|
||||
const index_t cindex = c_mtx.GetOffsetFromMultiIndex(i, j);
|
||||
|
||||
p_c_thread[cindex] += p_a_thread[aindex] * p_b_thread[bindex];
|
||||
}
|
||||
|
||||
@@ -19,7 +19,7 @@ __device__ void threadwise_tensor_slice_copy(SrcDesc,
|
||||
|
||||
constexpr auto src_desc = SrcDesc{};
|
||||
constexpr auto dst_desc = DstDesc{};
|
||||
constexpr auto ref_desc = make_ConstantTensorDescriptor(SrcOpLengths{});
|
||||
constexpr auto ref_desc = make_packed_ConstantTensorDescriptor(SrcOpLengths{});
|
||||
|
||||
#if 0
|
||||
if(get_thread_local_1d_id() == 0 && get_block_1d_id() == 0)
|
||||
@@ -53,9 +53,9 @@ __device__ void threadwise_tensor_slice_copy(SrcDesc,
|
||||
static_for<0, nRead, 1>{}([&](auto IRead) {
|
||||
constexpr auto multi_id = decltype(Ids){}.PushBack(Number<IRead.Get() * DataPerRead>{});
|
||||
|
||||
const index_t src_index = src_desc.Get1dIndex(multi_id);
|
||||
const index_t src_index = src_desc.GetOffsetFromMultiIndex(multi_id);
|
||||
|
||||
const index_t dst_index = dst_desc.Get1dIndex(multi_id);
|
||||
const index_t dst_index = dst_desc.GetOffsetFromMultiIndex(multi_id);
|
||||
|
||||
*(reinterpret_cast<vector_t*>(&p_dst[dst_index])) =
|
||||
*(reinterpret_cast<const vector_t*>(&p_src[src_index]));
|
||||
@@ -84,9 +84,9 @@ threadwise_tensor_slice_copy_reorder_given_dst2src_v1(SrcDesc,
|
||||
ford<SrcOpLengths>{}([&](auto src_multi_id) {
|
||||
const auto dst_multi_id = reorder_array_given_new2old(src_multi_id, MapDst2Src{});
|
||||
|
||||
const index_t dst_index = dst_desc.Get1dIndex(dst_multi_id);
|
||||
const index_t dst_index = dst_desc.GetOffsetFromMultiIndex(dst_multi_id);
|
||||
|
||||
const index_t src_index = src_desc.Get1dIndex(src_multi_id);
|
||||
const index_t src_index = src_desc.GetOffsetFromMultiIndex(src_multi_id);
|
||||
|
||||
p_dst[dst_index] = p_src[src_index];
|
||||
});
|
||||
@@ -115,9 +115,9 @@ threadwise_tensor_slice_copy_reorder_given_dst2src_v2(SrcDesc,
|
||||
ford<decltype(dst_op_lengths)>{}([&](auto dst_multi_id) {
|
||||
const auto src_multi_id = reorder_array_given_old2new(dst_multi_id, MapDst2Src{});
|
||||
|
||||
const index_t dst_index = dst_desc.Get1dIndex(dst_multi_id);
|
||||
const index_t dst_index = dst_desc.GetOffsetFromMultiIndex(dst_multi_id);
|
||||
|
||||
const index_t src_index = src_desc.Get1dIndex(src_multi_id);
|
||||
const index_t src_index = src_desc.GetOffsetFromMultiIndex(src_multi_id);
|
||||
|
||||
p_dst[dst_index] = p_src[src_index];
|
||||
});
|
||||
@@ -177,7 +177,7 @@ threadwise_tensor_slice_copy_reorder_given_dst2src_v3(SrcDesc,
|
||||
|
||||
const auto src_multi_id = reorder_array_given_old2new(dst_multi_id, MapDst2Src{});
|
||||
|
||||
const index_t src_index = src_desc.Get1dIndex(src_multi_id);
|
||||
const index_t src_index = src_desc.GetOffsetFromMultiIndex(src_multi_id);
|
||||
|
||||
vector_type<Float, DstDataPerWrite>::SetScalar(
|
||||
dst_vec_data, p_src[src_index], IDstData);
|
||||
@@ -186,7 +186,7 @@ threadwise_tensor_slice_copy_reorder_given_dst2src_v3(SrcDesc,
|
||||
// write data
|
||||
const auto dst_multi_id = ids.PushBack(IWrite.Get() * DstDataPerWrite);
|
||||
|
||||
const index_t dst_index = dst_desc.Get1dIndex(dst_multi_id);
|
||||
const index_t dst_index = dst_desc.GetOffsetFromMultiIndex(dst_multi_id);
|
||||
|
||||
*(reinterpret_cast<vector_t*>(&p_dst[dst_index])) = dst_vec_data;
|
||||
});
|
||||
@@ -204,5 +204,21 @@ threadwise_tensor_slice_copy_generic(SrcDesc,
|
||||
SliceLengths,
|
||||
DimAccessOrder)
|
||||
{
|
||||
// not implemented
|
||||
constexpr auto src_desc = SrcDesc{};
|
||||
constexpr auto dst_desc = DstDesc{};
|
||||
|
||||
constexpr auto slice_lengths_in_access_order =
|
||||
SliceLengths{}.ReorderGivenNew2Old(DimAccessOrder{});
|
||||
|
||||
ford<decltype(slice_lengths_in_access_order)>{}([&](auto data_multi_id_in_access_order) {
|
||||
const auto data_multi_id =
|
||||
reorder_array_given_old2new(data_multi_id_in_access_order, DimAccessOrder{});
|
||||
|
||||
const index_t dst_index =
|
||||
dst_desc.GetOffsetFromMultiIndex(src_multi_offset + data_multi_id);
|
||||
const index_t src_index =
|
||||
src_desc.GetOffsetFromMultiIndex(dst_multi_offset + data_multi_id);
|
||||
|
||||
p_dst[dst_index] = p_src[src_index];
|
||||
});
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user