mirror of https://github.com/ROCm/composable_kernel.git

WIP: explicitly separate offset component into compile-time, block-invariant and per-thread components
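The commit title names the design goal: the offset a thread uses to address a tensor is a sum of three components with different lifetimes, and each should be computed at the cheapest possible time. A minimal standalone sketch of that decomposition (the struct and all names in it are illustrative, not part of the composable_kernel API):

    #include <cstdint>

    using index_t = int32_t;

    // Hypothetical decomposition; the actual WIP threads this idea through the
    // tensor-coordinate machinery rather than a struct like this.
    struct DecomposedOffset
    {
        static constexpr index_t compile_time = 128; // from lengths/strides known at compile time
        index_t block_invariant;                     // same for every thread in a block; computed once per block
        index_t per_thread;                          // depends on the thread id; updated per thread

        constexpr index_t Total() const { return compile_time + block_invariant + per_thread; }
    };

The linear/nonlinear dimension masks introduced below serve this split: along a linear dimension an offset update is a cheap stride multiply, while a nonlinear dimension forces a full coordinate recomputation.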
@@ -3,7 +3,8 @@
 #include "common_header.hpp"
 #include "ConstantTensorDescriptor.hpp"
 #include "ConstantMergedTensorDescriptor.hpp"
+#include "tensor_descriptor.hpp"
 #include "tensor_descriptor_helper.hpp"
 #include "ConstantMatrixDescriptor.hpp"
 #include "blockwise_generic_tensor_slice_copy.hpp"
 #include "blockwise_gemm.hpp"
@@ -172,6 +173,10 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_padded_lds_double_buf
         BlockwiseGenericTensorSliceCopy_v4<BlockSize,
                                            decltype(in_e_n1_b_n2_global_desc),
                                            decltype(in_e_n1_b_n2_block_desc),
+                                           Sequence<0, 1, 0, 1>,
+                                           Sequence<1, 0, 1, 0>,
+                                           Sequence<1, 1, 1, 1>,
+                                           Sequence<0, 0, 0, 0>,
                                            decltype(in_e_n1_b_n2_block_desc.GetLengths()),
                                            InBlockCopySubLengths_E_N1_B_N2,
                                            InBlockCopyClusterLengths_E_N1_B_N2,
@@ -213,6 +218,10 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_padded_lds_double_buf
         BlockwiseGenericTensorSliceCopy_v4<BlockSize,
                                            decltype(wei_e_k_global_desc),
                                            decltype(wei_e_k_block_desc),
+                                           Sequence<1, 1>,
+                                           Sequence<0, 0>,
+                                           Sequence<1, 1>,
+                                           Sequence<0, 0>,
                                            decltype(wei_e_k_block_desc.GetLengths()),
                                            WeiBlockCopySubLengths_E_K,
                                            WeiBlockCopyClusterLengths_E_K,
@@ -414,6 +423,10 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_padded_lds_double_buf

         ThreadwiseGenericTensorSliceCopy_v4r2<decltype(out_k0_k1_n1_b_n2_thread_desc),
                                               decltype(out_k0_k1_n1_b_n2_global_desc),
+                                              Sequence<1, 1, 1, 1, 1>,
+                                              Sequence<0, 0, 0, 0, 0>,
+                                              Sequence<1, 1, 1, 0, 1>,
+                                              Sequence<0, 0, 0, 1, 0>,
                                               decltype(
                                                   out_k0_k1_n1_b_n2_thread_desc.GetLengths()),
                                               arithmetic_sequence_gen<0, 5, 1>::type,
@@ -368,5 +368,49 @@ struct Embed
     }
 };

+template <index_t LowerLength, index_t VectorSize>
+struct Vectorize
+{
+    using LowerIndex = MultiIndex<1>;
+    using UpperIndex = MultiIndex<1>;
+
+    __host__ __device__ constexpr Vectorize()
+    {
+        static_assert(VectorSize > 0 && LowerLength % VectorSize == 0,
+                      "wrong! cannot evenly divide");
+    }
+
+    __host__ __device__ static constexpr auto GetNumOfLowerDimension() { return Number<1>{}; }
+
+    __host__ __device__ static constexpr auto GetNumOfUpperDimension() { return Number<1>{}; }
+
+    __host__ __device__ static constexpr auto GetUpperLengths()
+    {
+        return Sequence<LowerLength / VectorSize>{};
+    }
+
+    __host__ __device__ static constexpr auto CalculateLowerIndex(const UpperIndex& idx_up)
+    {
+        return VectorSize * idx_up;
+    }
+
+    __host__ __device__ static constexpr auto
+    CalculateLowerIndexDiff(const UpperIndex& idx_up_diff,
+                            const UpperIndex& /* idx_up_old */,
+                            const LowerIndex& /* idx_low_old */)
+    {
+        return VectorSize * idx_up_diff;
+    }
+
+    __host__ __device__ static constexpr bool IsLinearTransform() { return true; }
+
+    // TODO: should this function be here? should it be specific for padding check?
+    __host__ __device__ static constexpr bool
+    IsUpperIndexInPaddingArea(const UpperIndex& /* idx_up */)
+    {
+        return false;
+    }
+};
+
 } // namespace ck
 #endif
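To make the index math of the new Vectorize transform concrete, here is a standalone sketch (plain C++; the library's Sequence/MultiIndex machinery is replaced by scalars, and only the arithmetic is taken from the struct above):

    #include <cassert>

    // Vectorize<LowerLength, VectorSize> views LowerLength scalars as
    // LowerLength / VectorSize vectors: upper index i maps to lower index
    // VectorSize * i, and an upper step d maps to a lower step VectorSize * d,
    // which is why IsLinearTransform() returns true.
    constexpr int upper_length(int lower_length, int vector_size)
    {
        return lower_length / vector_size;
    }

    constexpr int lower_index(int vector_size, int idx_up) { return vector_size * idx_up; }

    int main()
    {
        static_assert(upper_length(8, 4) == 2, "8 scalars form 2 vectors of 4");
        assert(lower_index(4, 1) == 4); // the second vector starts at scalar 4
        return 0;
    }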
@@ -101,17 +101,24 @@ struct NativeTensorDescriptor
         return true;
     }

-    __host__ __device__ static constexpr auto GetLinearDimensions()
+    __host__ __device__ static constexpr auto GetMaskOfLinearDimensions()
     {
-        return typename arithmetic_sequence_gen<0, nDim, 1>::type{};
+        return typename uniform_sequence_gen<nDim, 1>::type{};
     }

+    __host__ __device__ static constexpr auto GetMaskOfNonLinearDimensions()
+    {
+        return typename uniform_sequence_gen<nDim, 0>::type{};
+    }
+
     __host__ __device__ static constexpr auto GetNonLinearDimensions() { return Sequence<>{}; }

 #if 0
     __host__ __device__ static constexpr auto GetNonLinearIndependentDimensionGroups()
     {
         return Tuple<>{};
     }
 #endif

     // TODO: should this function be here? should it be specific for padding check?
     __host__ __device__ static constexpr bool IsUpperIndexInPaddingArea(const Index& /* idx */)
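A native (strided) descriptor computes its offset as a plain dot product of index and strides, so every dimension is linear. The two new mask functions encode exactly that; for nDim = 4 they evaluate to:

    Sequence<1, 1, 1, 1>   // GetMaskOfLinearDimensions()
    Sequence<0, 0, 0, 0>   // GetMaskOfNonLinearDimensions()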
@@ -233,7 +240,7 @@ struct TransformedTensorDescriptor
     __host__ __device__ static constexpr auto GetUpperLengths()
     {
         constexpr auto tuple_of_up_lengths =
-            transform_tuple(lambda_GetUpperLengths{}, Transforms{});
+            transform_tuples(lambda_GetUpperLengths{}, Transforms{});

         constexpr auto mingled_up_lengths = unpack(lambda_merge_sequences{}, tuple_of_up_lengths);
@@ -346,67 +353,92 @@ struct TransformedTensorDescriptor
         return GetLowerTensorDescriptor().CalculateOffset(CalculateLowerIndex(idx_up));
     }

-#if 1
+#if 0
     struct lambda_sequence_logic_or
     {
         template <typename... Seqs>
         __host__ __device__ constexpr auto operator()(Seqs... seqs) const
         {
-            // TODO: should use math::logic_or<bool>, after Sequence can take bool
-            return typename sequence_reduce<math::logic_or<index_t>, Seqs...>::type{};
+            return typename sequence_reduce<math::logic_or<bool>, Seqs...>::type{};
         }
     };

     struct lambda_1
     {
-        template <typename Transform>
-        __host__ __device__ constexpr auto operator()(const Transform& tran) const
+        // check only one transform at a time
+        template <typename Transform, typename LowDimensionId, typename UpDimensionId>
+        __host__ __device__ constexpr auto
+        operator()(const Transform& tran, LowDimensionId, UpDimensionId) const
         {
-            return tran.GetUpperLengths();
+            // judge if the transformation is linear
+            constexpr bool is_linear_transform = tran.IsLinearTransform();
+
+            // judge if all lower dimensions are linear
+            constexpr bool is_all_low_dim_linear = math::accumulate_on_sequence(
+                pick_sequence_elements_by_mask(
+                    GetLowerTensorDescriptor().GetMaskOfLinearDimensions(), LowDimensionId{}),
+                math::logic_and<bool>{},
+                integral_constant<bool, true>{});
+
+            // judge if the upper dimensions are linear
+            constexpr bool is_up_dim_nonlinear = !(is_linear_transform && is_all_low_dim_linear);
+
+            constexpr auto value_sequence =
+                typename uniform_sequence_gen<tran.GetNumOfUpperDimension(),
+                                              is_up_dim_nonlinear>::type{};
+
+            constexpr auto mask_of_up_nonlinear_dims = modify_sequence(
+                typename uniform_sequence_gen<nDimUp, 0>::type{}, value_sequence, UpDimensionId{});
+
+            return mask_of_up_nonlinear_dims;
         }
     };

-    __host__ __device__ static constexpr bool GetMaskOfLinearDimensions()
+    __host__ __device__ static constexpr auto GetMaskOfNonLinearDimensions()
     {
-        // create tuple of linear dimension masks, for all transformations
-        constexpr auto tuple_of_linear_dimension_mask =
-            transform_tuple(lambda_1, Transforms{});
+        // create tuple of nonlinear dimension masks, one for each transformation
+        constexpr auto tuple_of_nonlinear_dimension_mask =
+            transform_tuples(lambda_1{}, Transforms{}, LowDimensionIds{}, UpDimensionIds{});

         // reduce tuple of masks into one mask
-        constexpr auto linear_dimension_mask =
-            unpack(lambda_sequence_logic_or{}, tuple_of_linear_dimension_mask);
+        constexpr auto nonlinear_dimension_mask =
+            unpack(lambda_sequence_logic_or{}, tuple_of_nonlinear_dimension_mask);

-        return linear_dimension_mask;
+        return nonlinear_dimension_mask;
+    }
+
+    __host__ __device__ static constexpr auto GetMaskOfLinearDimensions()
+    {
+        return GetMaskOfNonLinearDimensions().Transform(math::logic_not<bool>{});
     }

     template <index_t IDim>
     __host__ __device__ static constexpr bool IsLinearDimension(Number<IDim>)
     {
         return GetMaskOfLinearDimensions().At(Number<IDim>{});
     }

     __host__ __device__ static constexpr auto GetLinearDimensions()
     {
         constexpr auto linear_dimension_mask = GetMaskOfLinearDimensions();

         return pick_sequence_elements_by_mask(
             typename arithmetic_sequence_gen<0, nDimUp, 1>::type{}, linear_dimension_mask);
     }

     __host__ __device__ static constexpr auto GetNonLinearDimensions()
     {
         constexpr auto nonlinear_dimension_mask =
             GetMaskOfLinearDimensions().Transform(math::logic_not<index_t>{});

         return pick_sequence_elements_by_mask(
             typename arithmetic_sequence_gen<0, nDimUp, 1>::type{}, nonlinear_dimension_mask);
     }

     __host__ __device__ static constexpr auto GetNonLinearIndependentDimensionGroups()
     {
         // not implemented
     }
 #endif

     // TODO: should this function be here? should it be specific for padding check?
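Concretely, the masks combine per transform and are then OR-reduced. Suppose a 3-d upper space where transform T0 (say a Pad, which is linear) produces upper dim 0 from a linear lower dim, and T1 (say a Merge, which is not linear) produces upper dims 1 and 2; the values below are illustrative:

    T0 contributes Sequence<0, 0, 0>   // no nonlinear upper dims
    T1 contributes Sequence<0, 1, 1>   // both of its upper dims are nonlinear
    OR-reduce  ->  Sequence<0, 1, 1>   // GetMaskOfNonLinearDimensions()
    negate     ->  Sequence<1, 0, 0>   // GetMaskOfLinearDimensions()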
@@ -96,6 +96,19 @@ __host__ __device__ constexpr auto
         LowerTensorDescriptor{}, typename sequence_map_inverse<MapUpper2Lower>::type{});
 }

+template <typename LowerTensorDescriptor, index_t VectorDim, index_t VectorSize>
+__host__ __device__ constexpr auto
+vectorize_tensor_descriptor(LowerTensorDescriptor, Number<VectorDim> vector_dim, Number<VectorSize>)
+{
+    constexpr index_t nDim = LowerTensorDescriptor::GetNumOfDimension();
+
+    return transform_tensor_descriptor(
+        LowerTensorDescriptor{},
+        Vectorize<LowerTensorDescriptor::GetLength(vector_dim), VectorSize>{},
+        typename arithmetic_sequence_gen<0, nDim, 1>::type{},
+        typename arithmetic_sequence_gen<0, nDim, 1>::type{});
+}
+
 template <typename... NativeDimensions>
 __host__ __device__ void
 print_tensor_descriptor(const char* s, const NativeTensorDescriptor<NativeDimensions...>& desc)
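A hedged sketch of how the new helper would be called (the descriptor values are illustrative, and since the WIP passes a single Vectorize transform across all dimensions, this reflects the apparent intent rather than tested behavior):

    // Vectorizing dim 1 of a packed {16, 32} descriptor by 4 would expose
    // upper lengths {16, 8}: each upper step along dim 1 advances by one
    // 4-wide vector in the lower descriptor.
    constexpr auto desc     = make_native_tensor_descriptor_packed(Sequence<16, 32>{});
    constexpr auto vec_desc = vectorize_tensor_descriptor(desc, Number<1>{}, Number<4>{});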
@@ -680,6 +680,10 @@ struct BlockwiseGenericTensorSliceCopy_v3
 template <index_t BlockSize,
           typename SrcDesc,
           typename DstDesc,
+          typename SrcLinearDimensionMask,
+          typename SrcNonLinearDimensionMask,
+          typename DstLinearDimensionMask,
+          typename DstNonLinearDimensionMask,
           typename SliceLengths,
           typename SubLengths,
           typename ThreadClusterLengths,
@@ -739,7 +743,9 @@ struct BlockwiseGenericTensorSliceCopy_v4
     {
 #if 0
         mThreadwiseLoad.Run(p_src, p_buffer);
-#else
+#elif 1
+        mThreadwiseLoad.Run_access_order_optimized_for_source_index_calculation(p_src, p_buffer);
+#elif 0
         // hardcoded: global to register
         mThreadwiseLoad.template Run_amd_experiment<TData, 2, 0>(p_src, p_buffer);
 #endif
@@ -750,7 +756,7 @@ struct BlockwiseGenericTensorSliceCopy_v4
     {
 #if 0
         mThreadwiseStore.Run(p_buffer, p_dst);
-#else
+#elif 1
         // hardcoded: register to LDS
         mThreadwiseStore.template Run_amd_experiment<TData, 0, 1>(p_buffer, p_dst);
 #endif
@@ -784,21 +790,31 @@ struct BlockwiseGenericTensorSliceCopy_v4
     private:
     using RegisterBufferDesc = decltype(make_native_tensor_descriptor_packed(SubLengths{}));

-    using ThreadwiseLoad = ThreadwiseGenericTensorSliceCopy_v4r2<SrcDesc,
-                                                                 RegisterBufferDesc,
-                                                                 SubLengths,
-                                                                 SrcDimAccessOrder,
-                                                                 SrcVectorAccessDim,
-                                                                 SrcDataPerAccess,
-                                                                 1>;
+    using ThreadwiseLoad =
+        ThreadwiseGenericTensorSliceCopy_v4r2<SrcDesc,
+                                              RegisterBufferDesc,
+                                              SrcLinearDimensionMask,
+                                              SrcNonLinearDimensionMask,
+                                              typename uniform_sequence_gen<nDim, 1>::type,
+                                              typename uniform_sequence_gen<nDim, 0>::type,
+                                              SubLengths,
+                                              SrcDimAccessOrder,
+                                              SrcVectorAccessDim,
+                                              SrcDataPerAccess,
+                                              1>;

-    using ThreadwiseStore = ThreadwiseGenericTensorSliceCopy_v4r2<RegisterBufferDesc,
-                                                                  DstDesc,
-                                                                  SubLengths,
-                                                                  DstDimAccessOrder,
-                                                                  DstVectorAccessDim,
-                                                                  1,
-                                                                  DstDataPerAccess>;
+    using ThreadwiseStore =
+        ThreadwiseGenericTensorSliceCopy_v4r2<RegisterBufferDesc,
+                                              DstDesc,
+                                              typename uniform_sequence_gen<nDim, 1>::type,
+                                              typename uniform_sequence_gen<nDim, 0>::type,
+                                              DstLinearDimensionMask,
+                                              DstNonLinearDimensionMask,
+                                              SubLengths,
+                                              DstDimAccessOrder,
+                                              DstVectorAccessDim,
+                                              1,
+                                              DstDataPerAccess>;

     ThreadwiseLoad mThreadwiseLoad;
     ThreadwiseStore mThreadwiseStore;
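The register buffer is a packed descriptor, so on its side every dimension is trivially linear; only the global-memory side carries real linear/nonlinear structure. The load therefore pairs the caller's source masks with all-ones/all-zeros masks for the buffer, and the store mirrors this. For nDim = 4 the buffer-side masks come from uniform_sequence_gen; the caller-side mask below is illustrative:

    SrcLinearDimensionMask = Sequence<0, 1, 0, 1>   // e.g. supplied by the gridwise caller
    buffer linear mask     = Sequence<1, 1, 1, 1>   // uniform_sequence_gen<4, 1>
    buffer nonlinear mask  = Sequence<0, 0, 0, 0>   // uniform_sequence_gen<4, 0>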
@@ -1136,6 +1136,10 @@ struct ThreadwiseGenericTensorSliceCopy_v3r1
 // the other is device memory or LDS
 template <typename SrcDesc,
           typename DstDesc,
+          typename SrcLinearDimensionMask,
+          typename SrcNonLinearDimensionMask,
+          typename DstLinearDimensionMask,
+          typename DstNonLinearDimensionMask,
           typename SliceLengths,
           typename DimAccessOrder,
           index_t VectorAccessDim,
@@ -1254,6 +1258,117 @@ struct ThreadwiseGenericTensorSliceCopy_v4r2
         });
     }

+    template <index_t... Lengths, index_t... Mask>
+    __device__ static constexpr auto mask_lengths(Sequence<Lengths...>, Sequence<Mask...>)
+    {
+        return Sequence<(Mask ? Lengths : 1)...>{};
+    }
+
+    template <class TData>
+    __device__ void Run_access_order_optimized_for_source_index_calculation(const TData* p_src,
+                                                                            TData* p_dst) const
+    {
+        using src_vector_t = typename vector_type<TData, SrcDataPerAccess>::MemoryType;
+        using dst_vector_t = typename vector_type<TData, DstDataPerAccess>::MemoryType;
+
+        constexpr auto vector_access_dim = Number<VectorAccessDim>{};
+
+        constexpr auto src_data_per_access = Number<SrcDataPerAccess>{};
+        constexpr auto dst_data_per_access = Number<DstDataPerAccess>{};
+
+        constexpr auto long_vector_size = Number<math::lcm(SrcDataPerAccess, DstDataPerAccess)>{};
+
+        constexpr auto long_vector_access_lengths = SliceLengths::Modify(
+            vector_access_dim, SliceLengths::Get(vector_access_dim) / long_vector_size);
+
+        // TODO: don't use hack
+        constexpr auto src_linear_dim_mask    = SrcLinearDimensionMask{};
+        constexpr auto src_nonlinear_dim_mask = SrcNonLinearDimensionMask{};
+
+        // separate steps into linear and non-linear components
+        constexpr auto linear_long_vector_access_lengths =
+            mask_lengths(long_vector_access_lengths, src_linear_dim_mask);
+
+        constexpr auto nonlinear_long_vector_access_lengths =
+            mask_lengths(long_vector_access_lengths, src_nonlinear_dim_mask);
+
+        // loop over src's non-linear dimensions
+        ford<decltype(nonlinear_long_vector_access_lengths)>{}(
+            [&](auto nonlinear_dim_long_vector_access_id) {
+
+                // step-sizes along src's nonlinear dimensions
+                auto nonlinear_dim_data_steps = nonlinear_dim_long_vector_access_id;
+                nonlinear_dim_data_steps(vector_access_dim) =
+                    long_vector_size * nonlinear_dim_long_vector_access_id[vector_access_dim];
+
+                // move src coordinate along nonlinear dimensions
+                const auto src_nonlinear_coord = mSrcSliceOrigin + nonlinear_dim_data_steps;
+
+                // loop over src's linear dimensions
+                ford<decltype(linear_long_vector_access_lengths)>{}(
+                    [&](auto linear_dim_long_vector_access_id) {
+
+                        // step-sizes along src's linear dimensions
+                        auto linear_dim_data_steps = linear_dim_long_vector_access_id;
+                        linear_dim_data_steps(vector_access_dim) =
+                            long_vector_size * linear_dim_long_vector_access_id[vector_access_dim];
+
+                        // buffer to hold a long-vector
+                        TData p_long_vector[long_vector_size];
+
+                        // zero-initialize the long-vector buffer
+                        for(index_t i = 0; i < long_vector_size; ++i)
+                        {
+                            p_long_vector[i] = 0;
+                        }
+
+                        // load data from src into the long-vector buffer
+                        for(index_t i = 0; i < long_vector_size / src_data_per_access; ++i)
+                        {
+                            auto scalar_id               = make_zero_array<index_t, nDim>();
+                            scalar_id(vector_access_dim) = i * src_data_per_access;
+
+                            // move src coordinate along linear dimensions
+                            const auto src_coord =
+                                src_nonlinear_coord + (linear_dim_data_steps + scalar_id);
+
+                            // TODO: good implementation?
+                            const index_t src_linear_offset_diff =
+                                src_coord.GetOffset() - src_nonlinear_coord.GetOffset();
+
+                            // check for padding
+                            // TODO: still kind of messy
+                            if(!src_coord.IsAnyLevelIndexInPaddingArea())
+                            {
+                                const index_t src_offset = src_coord.GetOffset();
+
+                                const index_t buffer_offset = i * src_data_per_access;
+
+                                *reinterpret_cast<src_vector_t*>(&p_long_vector[buffer_offset]) =
+                                    *reinterpret_cast<const src_vector_t*>(&p_src[src_offset]);
+                            }
+                        }
+
+                        // store data from the long-vector buffer to dst
+                        for(index_t i = 0; i < long_vector_size / dst_data_per_access; ++i)
+                        {
+                            auto scalar_id               = make_zero_array<index_t, nDim>();
+                            scalar_id(vector_access_dim) = i * dst_data_per_access;
+
+                            const index_t buffer_offset = i * dst_data_per_access;
+
+                            const index_t dst_offset =
+                                (mDstSliceOrigin +
+                                 (nonlinear_dim_data_steps + linear_dim_data_steps + scalar_id))
+                                    .GetOffset();
+
+                            *reinterpret_cast<dst_vector_t*>(&p_dst[dst_offset]) =
+                                *reinterpret_cast<dst_vector_t*>(&p_long_vector[buffer_offset]);
+                        }
+                    });
+            });
+    }
+
     // memory-space
     // 0: VGPR
     // 1: LDS
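The loop structure above is the heart of the commit: mask_lengths(Sequence<4, 2, 8>{}, Sequence<1, 0, 1>{}) yields Sequence<4, 1, 8>, so each ford loop iterates only the dimensions its mask selects. Coordinate updates along nonlinear dimensions must re-walk the transform chain, while linear dimensions only add an offset difference. A standalone sketch of that control flow (plain C++; expensive_offset_of and cheap_diff are stand-ins for the coordinate machinery, not library names):

    #include <cstdio>

    // Illustrative only: the outer loop pays the expensive index calculation
    // once per nonlinear step; the inner loop reuses it and adds cheap diffs.
    int expensive_offset_of(int nonlinear_id) { return nonlinear_id * 100; } // stand-in
    int cheap_diff(int linear_id) { return linear_id; }                      // stand-in

    int main()
    {
        for(int n = 0; n < 2; ++n) // nonlinear steps: recompute the base offset
        {
            const int base = expensive_offset_of(n);

            for(int l = 0; l < 4; ++l) // linear steps: base plus a cheap diff
            {
                std::printf("offset = %d\n", base + cheap_diff(l));
            }
        }
        return 0;
    }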
@@ -389,6 +389,20 @@ __host__ __device__ constexpr auto operator-(Sequence<Is...> a, Array<TData, NSi
     return result;
 }

+// Array = TData * Array
+template <typename TData, index_t NSize>
+__host__ __device__ constexpr auto operator*(TData v, Array<TData, NSize> a)
+{
+    Array<TData, NSize> result;
+
+    for(index_t i = 0; i < NSize; ++i)
+    {
+        result(i) = a[i] * v;
+    }
+
+    return result;
+}
+
 template <typename TData, index_t NSize, typename Reduce>
 __host__ __device__ constexpr TData
 accumulate_on_array(const Array<TData, NSize>& a, Reduce f, TData init)
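This scalar-on-the-left overload is what lets Vectorize::CalculateLowerIndex write VectorSize * idx_up with an Array-valued index. With illustrative values:

    // Array<index_t, 3> a = {1, 2, 3};
    // 4 * a == {4, 8, 12}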
@@ -706,18 +706,18 @@ __host__ __device__ constexpr auto sequence_pop_back(Seq)
     return sequence_pop_front(Seq::Reverse()).Reverse();
 }

-template <typename F, index_t... Xs>
-__host__ __device__ constexpr auto transform_sequences(F f, Sequence<Xs...>)
-{
-    return Sequence<f(Xs)...>{};
-}
-
 template <typename... Seqs>
 __host__ __device__ constexpr auto merge_sequences(Seqs...)
 {
     return typename sequence_merge<Seqs...>::type{};
 }

+template <typename F, index_t... Xs>
+__host__ __device__ constexpr auto transform_sequences(F f, Sequence<Xs...>)
+{
+    return Sequence<f(Xs)...>{};
+}
+
 template <typename F, index_t... Xs, index_t... Ys>
 __host__ __device__ constexpr auto transform_sequences(F f, Sequence<Xs...>, Sequence<Ys...>)
 {
@@ -113,19 +113,33 @@ __host__ __device__ constexpr auto make_tuple(Xs&&... xs)
 namespace detail {

 template <typename F, typename X, index_t... Is>
-__host__ __device__ constexpr auto transform_tuple_impl(F f, const X& x, Sequence<Is...>)
+__host__ __device__ constexpr auto transform_tuples_impl(F f, const X& x, Sequence<Is...>)
 {
     return make_tuple(f(x.At(Number<Is>{}))...);
 }

+template <typename F, typename X, typename Y, index_t... Is>
+__host__ __device__ constexpr auto
+transform_tuples_impl(F f, const X& x, const Y& y, Sequence<Is...>)
+{
+    return make_tuple(f(x.At(Number<Is>{}), y.At(Number<Is>{}))...);
+}
+
 } // namespace detail

 template <typename F, typename X>
-__host__ __device__ constexpr auto transform_tuple(F f, const X& x)
+__host__ __device__ constexpr auto transform_tuples(F f, const X& x)
 {
-    return detail::transform_tuple_impl(
+    return detail::transform_tuples_impl(
         f, x, typename arithmetic_sequence_gen<0, X::Size(), 1>::type{});
 }

+template <typename F, typename X, typename Y>
+__host__ __device__ constexpr auto transform_tuples(F f, const X& x, const Y& y)
+{
+    return detail::transform_tuples_impl(
+        f, x, y, typename arithmetic_sequence_gen<0, X::Size(), 1>::type{});
+}
+
 } // namespace ck
 #endif
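The rename from transform_tuple to transform_tuples reflects the new n-ary use: the mask computation maps one lambda over several tuples in lockstep (Transforms, LowDimensionIds, UpDimensionIds). Behavior of the two-tuple overload, with illustrative values:

    // transform_tuples(f, make_tuple(x0, x1), make_tuple(y0, y1))
    //     == make_tuple(f(x0, y0), f(x1, y1))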
@@ -3,7 +3,7 @@
 #include "device.hpp"
 #include "tensor.hpp"
 #include "gridwise_convolution_kernel_wrapper.hpp"
-#include "gridwise_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_padded.hpp"
+//#include "gridwise_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_padded.hpp"
 #include "gridwise_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_padded_lds_double_buffer.hpp"

 template <typename T,
@@ -9,17 +9,17 @@
 #include "conv_common.hpp"
 #include "host_conv.hpp"
 #include "device_convolution_direct_v2_nchw_kcyx_nkhw.hpp"
-#include "device_convolution_implicit_gemm_v1_chwn_cyxk_khwn.hpp"
-#include "device_convolution_implicit_gemm_v1_chwn_cyxk_khwn_padded.hpp"
+//#include "device_convolution_implicit_gemm_v1_chwn_cyxk_khwn.hpp"
+//#include "device_convolution_implicit_gemm_v1_chwn_cyxk_khwn_padded.hpp"
 //#include "device_convolution_implicit_gemm_v1_nchw_cyxk_nkhw.hpp"
 //#include "device_convolution_implicit_gemm_v2_chwn_cyxk_khwn.hpp"
 //#include "device_convolution_implicit_gemm_v3_nchw_cyxk_nkhw.hpp"
-#include "device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw.hpp"
+//#include "device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw.hpp"
 #include "device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_padded.hpp"
 //#include "device_convolution_implicit_gemm_v4r2_nchw_kcyx_nkhw.hpp"
 //#include "device_convolution_implicit_gemm_v4r3_nchw_kcyx_nkhw.hpp"
-#include "device_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw.hpp"
-#include "device_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw_padded.hpp"
+//#include "device_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw.hpp"
+//#include "device_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw_padded.hpp"

 struct GeneratorTensor_1
 {