adding implicit gemm v3

Chao Liu
2019-05-22 19:39:56 -05:00
parent 2a48812edb
commit 8a4b59785b
26 changed files with 373 additions and 259 deletions


@@ -38,7 +38,7 @@ void device_convolution_implicit_gemm_v1_chwn_cyxk_khwn(InDesc,
constexpr index_t X = wei_kcyx_desc.GetLength(I3);
// reorder weight
auto wei_cyxk_desc = make_packed_ConstantTensorDescriptor(Sequence<C, Y, X, K>{});
auto wei_cyxk_desc = make_ConstantTensorDescriptor_default_rank_packed(Sequence<C, Y, X, K>{});
ostream_ConstantTensorDescriptor(wei_cyxk_desc, std::cout << "wei_cyxk_desc: ");
Tensor<T> wei_cyxk(make_TensorDescriptor(wei_cyxk_desc));
@@ -51,7 +51,7 @@ void device_convolution_implicit_gemm_v1_chwn_cyxk_khwn(InDesc,
std::thread::hardware_concurrency());
// reorder input
auto in_chwn_desc = make_packed_ConstantTensorDescriptor(Sequence<C, Hi, Wi, N>{});
auto in_chwn_desc = make_ConstantTensorDescriptor_default_rank_packed(Sequence<C, Hi, Wi, N>{});
ostream_ConstantTensorDescriptor(in_chwn_desc, std::cout << "in_chwn_desc: ");
Tensor<T> in_chwn(make_TensorDescriptor(in_chwn_desc));
@@ -64,7 +64,8 @@ void device_convolution_implicit_gemm_v1_chwn_cyxk_khwn(InDesc,
std::thread::hardware_concurrency());
// output
auto out_khwn_desc = make_packed_ConstantTensorDescriptor(Sequence<K, Ho, Wo, N>{});
auto out_khwn_desc =
make_ConstantTensorDescriptor_default_rank_packed(Sequence<K, Ho, Wo, N>{});
ostream_ConstantTensorDescriptor(out_khwn_desc, std::cout << "out_khwn_desc: ");
Tensor<T> out_khwn(make_TensorDescriptor(out_khwn_desc));


@@ -37,7 +37,7 @@ void device_convolution_implicit_gemm_v1_nchw_cyxk_khwn(InDesc,
constexpr index_t X = wei_kcyx_desc.GetLength(I3);
// reorder weight
auto wei_cyxk_desc = make_packed_ConstantTensorDescriptor(Sequence<C, Y, X, K>{});
auto wei_cyxk_desc = make_ConstantTensorDescriptor_default_rank_packed(Sequence<C, Y, X, K>{});
ostream_ConstantTensorDescriptor(wei_cyxk_desc, std::cout << "wei_cyxk_desc: ");
Tensor<T> wei_cyxk(make_TensorDescriptor(wei_cyxk_desc));
@@ -50,7 +50,8 @@ void device_convolution_implicit_gemm_v1_nchw_cyxk_khwn(InDesc,
std::thread::hardware_concurrency());
// output
auto out_khwn_desc = make_packed_ConstantTensorDescriptor(Sequence<K, Ho, Wo, N>{});
auto out_khwn_desc =
make_ConstantTensorDescriptor_default_rank_packed(Sequence<K, Ho, Wo, N>{});
ostream_ConstantTensorDescriptor(out_khwn_desc, std::cout << "out_khwn_desc: ");
Tensor<T> out_khwn(make_TensorDescriptor(out_khwn_desc));


@@ -36,7 +36,7 @@ void device_convolution_implicit_gemm_v1_nchw_cyxk_nkhw(InDesc,
constexpr index_t X = wei_kcyx_desc.GetLength(I3);
// reorder weight
auto wei_cyxk_desc = make_packed_ConstantTensorDescriptor(Sequence<C, Y, X, K>{});
auto wei_cyxk_desc = make_ConstantTensorDescriptor_default_rank_packed(Sequence<C, Y, X, K>{});
ostream_ConstantTensorDescriptor(wei_cyxk_desc, std::cout << "wei_cyxk_desc: ");
Tensor<T> wei_cyxk(make_TensorDescriptor(wei_cyxk_desc));
@@ -57,7 +57,7 @@ void device_convolution_implicit_gemm_v1_nchw_cyxk_nkhw(InDesc,
wei_cyxk_device_buf.ToDevice(wei_cyxk.mData.data());
out_nkhw_device_buf.ToDevice(out_nkhw.mData.data());
#if 0
#if 1
// for 3x3, 34x34, v1r3, Pascal
constexpr index_t BlockSize = 128;
@@ -92,7 +92,7 @@ void device_convolution_implicit_gemm_v1_nchw_cyxk_nkhw(InDesc,
constexpr index_t WeiBlockCopyDataPerRead_K = 4;
constexpr index_t OutThreadCopyDataPerWrite_W = 2;
#elif 0
#elif 1
// for 3x3, 34x34, v1r3, Vega 20, WoPerBlock = 32
constexpr index_t BlockSize = 256;
@@ -162,7 +162,7 @@ void device_convolution_implicit_gemm_v1_nchw_cyxk_nkhw(InDesc,
constexpr index_t WeiBlockCopyDataPerRead_K = 4;
constexpr index_t OutThreadCopyDataPerWrite_W = 2;
#elif 1
#elif 0
// for 3x3, 34x34, v1r3, Vega 20, WoPerBlock = 8
constexpr index_t BlockSize = 256;


@@ -35,7 +35,7 @@ void device_convolution_implicit_gemm_v3_nchw_cyxk_nkhw(InDesc,
constexpr index_t X = wei_kcyx_desc.GetLength(I3);
// reorder weight
auto wei_cyxk_desc = make_ConstantTensorDescriptor(Sequence<C, Y, X, K>{});
auto wei_cyxk_desc = make_ConstantTensorDescriptor_default_rank_packed(Sequence<C, Y, X, K>{});
ostream_ConstantTensorDescriptor(wei_cyxk_desc, std::cout << "wei_cyxk_desc: ");
Tensor<T> wei_cyxk(make_TensorDescriptor(wei_cyxk_desc));
@@ -56,37 +56,40 @@ void device_convolution_implicit_gemm_v3_nchw_cyxk_nkhw(InDesc,
wei_cyxk_device_buf.ToDevice(wei_cyxk.mData.data());
out_nkhw_device_buf.ToDevice(out_nkhw.mData.data());
constexpr index_t N1 = 2;
constexpr index_t N2 = 4;
constexpr index_t B = (N * Ho * Wo) / (N1 * N2);
#if 1
// for 3x3, 28x28, v3, Pascal
constexpr index_t BlockSize = 128;
// for 3x3, 28x28, v3
constexpr index_t BlockSize = 256;
constexpr index_t BPerBlock = 16;
constexpr index_t KPerBlock = 128;
constexpr index_t CPerBlock = 8;
constexpr index_t BPerThread = 1;
constexpr index_t KPerThread = 8;
constexpr index_t GemmMPerThreadSubC = 4;
constexpr index_t GemmNPerThreadSubC = 4;
constexpr index_t GemmMLevel0Cluster = 4;
constexpr index_t GemmNLevel0Cluster = 2;
constexpr index_t GemmNLevel0Cluster = 4;
constexpr index_t GemmMLevel1Cluster = 4;
constexpr index_t GemmNLevel1Cluster = 2;
constexpr index_t GemmNLevel1Cluster = 4;
constexpr index_t GemmKPerThreadLoop = 1;
constexpr index_t GemmDataPerReadA = 4;
constexpr index_t GemmDataPerReadB = 4;
using InBlockReorderSrcSubLengths_NCHW = Sequence<4, 1, 1, 1>;
using InBlockReorderSrcClusterLengths_NCHW = Sequence<4, 8, 2, 2>;
using InBlockReorderMapThreadCluster2SrcCluster_CHNW2NCHW = Sequence<1, 2, 0, 3>;
using InBlockCopySubLengths_N1_N2_C_B = Sequence<1, 4, 1, 1>;
using InBlockCopyClusterLengths_N1_N2_C_B = Sequence<2, 1, 8, 16>;
constexpr index_t WeiBlockCopyDataPerRead_K = 4;
constexpr index_t InBlockCopySrcDataPerRead_B = 1;
constexpr index_t InBlockCopyDstDataPerWrite_N2 = 4;
constexpr index_t WeiBlockCopyDataPerAccess_K = 4;
#endif
constexpr index_t GridSize =
((N + NPerBlock - 1) / NPerBlock) * ((K + KPerBlock - 1) / KPerBlock) *
((Ho + HoPerBlock - 1) / HoPerBlock) * ((Wo + WoPerBlock - 1) / WoPerBlock);
((B + BPerBlock - 1) / BPerBlock) * ((K + KPerBlock - 1) / KPerBlock);
printf("%s: BlockSize %u, GridSize %u \n", __func__, BlockSize, GridSize);
@@ -102,15 +105,11 @@ void device_convolution_implicit_gemm_v3_nchw_cyxk_nkhw(InDesc,
decltype(in_nchw_desc),
decltype(wei_cyxk_desc),
decltype(out_nkhw_desc),
NPerBlock,
BPerBlock,
KPerBlock,
CPerBlock,
HoPerBlock,
WoPerBlock,
NPerThread,
KPerThread,
HoPerThread,
WoPerThread,
N1,
N2,
GemmMPerThreadSubC,
GemmNPerThreadSubC,
GemmMLevel0Cluster,
@@ -120,14 +119,11 @@ void device_convolution_implicit_gemm_v3_nchw_cyxk_nkhw(InDesc,
GemmKPerThreadLoop,
GemmDataPerReadA,
GemmDataPerReadB,
InBlockReorderSrcSubLengths_NCHW,
InBlockReorderSrcClusterLengths_NCHW,
InBlockReorderMapThreadCluster2SrcCluster_CHNW2NCHW,
InBlockReorderDataPerRead_W,
InBlockReorderDataPerWrite_N,
WeiBlockCopyClusterLengths,
WeiBlockCopyDataPerRead_K,
OutThreadCopyDataPerWrite_W>{};
InBlockCopySubLengths_N1_N2_C_B,
InBlockCopyClusterLengths_N1_N2_C_B,
InBlockCopySrcDataPerRead_B,
InBlockCopyDstDataPerWrite_N2,
WeiBlockCopyDataPerAccess_K>{};
float time = launch_kernel(run_gridwise_convolution<decltype(gridwise_conv), T>,
dim3(GridSize),
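
For reference, the new grid sizing is plain tile arithmetic over the merged [B, K] space. A compile-time sketch with hypothetical shapes (the concrete N, K, Ho, Wo below are illustrative, not taken from this commit):

// v3 grid sizing: fold N into N0 * N1 * N2, merge (N0, Ho, Wo) into B,
// then tile the [B, K] space by [BPerBlock, KPerBlock].
constexpr int N = 64, K = 256, Ho = 28, Wo = 28; // hypothetical problem shape
constexpr int N1 = 2, N2 = 4;
constexpr int B = (N * Ho * Wo) / (N1 * N2); // 6272
constexpr int BPerBlock = 16, KPerBlock = 128;
constexpr int GridSize =
    ((B + BPerBlock - 1) / BPerBlock) * ((K + KPerBlock - 1) / KPerBlock);
static_assert(GridSize == 392 * 2, "784 workgroups, each of BlockSize threads");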


@@ -13,7 +13,7 @@
#include "device_convolution_implicit_gemm_v1_nchw_cyxk_khwn.hpp"
#include "device_convolution_implicit_gemm_v1_nchw_cyxk_nkhw.hpp"
#include "device_convolution_implicit_gemm_v2_chwn_cyxk_khwn.hpp"
//#include "device_convolution_implicit_gemm_v3_nchw_cyxk_nkhw.hpp"
#include "device_convolution_implicit_gemm_v3_nchw_cyxk_nkhw.hpp"
struct GeneratorTensor_1
{
@@ -548,8 +548,8 @@ int main(int argc, char* argv[])
auto lower_pads = Sequence<HPad, WPad>{};
auto upper_pads = Sequence<HPad, WPad>{};
auto in_nchw_desc = make_packed_ConstantTensorDescriptor(Sequence<N, C, HI, WI>{});
auto wei_kcyx_desc = make_packed_ConstantTensorDescriptor(Sequence<K, C, Y, X>{});
auto in_nchw_desc = make_ConstantTensorDescriptor_default_rank_packed(Sequence<N, C, HI, WI>{});
auto wei_kcyx_desc = make_ConstantTensorDescriptor_default_rank_packed(Sequence<K, C, Y, X>{});
auto out_nkhw_desc = get_convolution_with_padding_output_default_4d_tensor_descriptor(
in_nchw_desc, wei_kcyx_desc, lower_pads, upper_pads);
@@ -612,11 +612,11 @@ int main(int argc, char* argv[])
device_convolution_implicit_gemm_v1_chwn_cyxk_khwn
#elif 0
device_convolution_implicit_gemm_v1_nchw_cyxk_khwn
#elif 1
#elif 0
device_convolution_implicit_gemm_v1_nchw_cyxk_nkhw
#elif 0
device_convolution_implicit_gemm_v2_chwn_cyxk_khwn
#elif 0
#elif 1
device_convolution_implicit_gemm_v3_nchw_cyxk_nkhw
#endif
(in_nchw_desc, in_nchw, wei_kcyx_desc, wei_kcyx, out_nkhw_desc, out_nkhw_device, nrepeat);


@@ -12,7 +12,7 @@ struct Array
index_t mData[nSize];
template <class... Xs>
__host__ __device__ Array(Xs... xs) : mData{static_cast<TData>(xs)...}
__host__ __device__ constexpr Array(Xs... xs) : mData{static_cast<TData>(xs)...}
{
}
@@ -37,6 +37,25 @@ struct Array
}
};
template <index_t... Is>
__host__ __device__ constexpr auto sequence2array(Sequence<Is...>)
{
return Array<index_t, sizeof...(Is)>{Is...};
}
template <class TData, index_t NSize>
__host__ __device__ constexpr auto make_zero_array()
{
Array<TData, NSize> a;
static_for<0, NSize, 1>{}([&](auto I) {
constexpr index_t i = I.Get();
a[i] = static_cast<TData>(0);
});
return a;
}
template <class TData, index_t NSize, index_t... IRs>
__host__ __device__ auto reorder_array_given_new2old(const Array<TData, NSize>& old_array,
Sequence<IRs...> new2old)
@@ -80,15 +99,14 @@ __host__ __device__ auto extract_array(const Array<TData, NSize>& old_array, Ext
static_for<0, new_size, 1>{}([&](auto I) {
constexpr index_t i = I.Get();
new_array[i] = old_array[ExtractSeq{}.Get(I)];
new_array[i] = old_array[ExtractSeq::Get(I)];
});
return new_array;
}
template <class TData, index_t NSize>
__host__ __device__ constexpr auto operator+(const Array<TData, NSize>& a,
const Array<TData, NSize>& b)
__host__ __device__ constexpr auto operator+(Array<TData, NSize> a, Array<TData, NSize> b)
{
Array<TData, NSize> result;
@@ -99,3 +117,20 @@ __host__ __device__ constexpr auto operator+(const Array<TData, NSize>& a,
return result;
}
// Array = Array * Sequence
template <class TData, index_t NSize, index_t... Is>
__host__ __device__ constexpr auto operator*(Array<TData, NSize> a, Sequence<Is...> b)
{
static_assert(sizeof...(Is) == NSize, "wrong! size not the same");
Array<TData, NSize> result;
static_for<0, NSize, 1>{}([&](auto I) {
constexpr index_t i = I.Get();
result[i] = a[i] * b.Get(I);
});
return result;
}
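
Taken together, these helpers let small index arrays be built and combined at compile time. A host-side sketch of the pieces added above (standard C++14, with the __host__ __device__ qualifiers dropped):

// Minimal mirror of the Array helpers.
using index_t = int;

template <index_t... Is>
struct Sequence
{
};

template <class TData, index_t NSize>
struct Array
{
    TData mData[NSize];
    constexpr TData operator[](index_t i) const { return mData[i]; }
};

template <index_t... Is>
constexpr auto sequence2array(Sequence<Is...>)
{
    return Array<index_t, sizeof...(Is)>{{Is...}};
}

template <class TData, index_t NSize>
constexpr auto make_zero_array()
{
    return Array<TData, NSize>{}; // value-initialization zeroes mData
}

static_assert(sequence2array(Sequence<2, 3, 4>{})[1] == 3, "compile-time lookup");
static_assert(make_zero_array<index_t, 3>()[2] == 0, "all zeros");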


@@ -9,26 +9,26 @@ struct ConstantMatrixDescriptor
static_assert(NCol_ <= RowStride_, "wrong! NCol > RowStride!");
}
__host__ __device__ constexpr index_t NRow() const { return NRow_; }
__host__ __device__ static constexpr index_t NRow() { return NRow_; }
__host__ __device__ constexpr index_t NCol() const { return NCol_; }
__host__ __device__ static constexpr index_t NCol() { return NCol_; }
__host__ __device__ constexpr index_t RowStride() const { return RowStride_; }
__host__ __device__ static constexpr index_t RowStride() { return RowStride_; }
__host__ __device__ constexpr auto GetLengths() const { return Sequence<NRow_, NCol_>{}; }
__host__ __device__ static constexpr auto GetLengths() { return Sequence<NRow_, NCol_>{}; }
__host__ __device__ constexpr index_t GetElementSize() const { return NRow_ * NCol_; }
__host__ __device__ static constexpr index_t GetElementSize() { return NRow_ * NCol_; }
__host__ __device__ constexpr index_t GetElementSpace() const { return NRow_ * RowStride_; }
__host__ __device__ static constexpr index_t GetElementSpace() { return NRow_ * RowStride_; }
__host__ __device__ index_t GetOffsetFromMultiIndex(index_t irow, index_t icol) const
__host__ __device__ static index_t GetOffsetFromMultiIndex(index_t irow, index_t icol)
{
return irow * RowStride_ + icol;
}
template <index_t SubNRow, index_t SubNCol>
__host__ __device__ constexpr auto MakeSubMatrixDescriptor(Number<SubNRow>,
Number<SubNCol>) const
__host__ __device__ static constexpr auto MakeSubMatrixDescriptor(Number<SubNRow>,
Number<SubNCol>)
{
return ConstantMatrixDescriptor<SubNRow, SubNCol, RowStride_>{};
}
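
The motivation for turning these const member functions into static constexpr ones is that the descriptor is an empty type: its queries can then feed array bounds, template arguments, and static_asserts without an instance. A minimal illustration (simplified, not the repo's full class):

// An empty descriptor type whose queries are constant expressions.
template <int NRow_, int NCol_, int RowStride_>
struct MatrixDesc
{
    static constexpr int NRow() { return NRow_; }
    static constexpr int NCol() { return NCol_; }
    static constexpr int GetElementSpace() { return NRow_ * RowStride_; }
};

using Desc = MatrixDesc<8, 8, 16>;
static_assert(Desc::GetElementSpace() == 128, "no instance needed");
float lds_buffer[Desc::GetElementSpace()]; // array bound must be a constant expression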


@@ -11,8 +11,8 @@ struct ConstantMergedTensorDescriptor
{
static constexpr auto mOriginalDimMergeSeqs = std::tuple<OriginalDimMergeSeqs...>{};
static constexpr index_t nDim = std::tuple_size<mOriginalDimMergeSeqs>::value;
static constexpr index_t nOriginalDim = OriginalDesc::GetNumOfDimension();
static constexpr index_t nDim = sizeof...(OriginalDimMergeSeqs);
static constexpr index_t nOriginalDim = OriginalTensorDesc::GetNumOfDimension();
__host__ __device__ constexpr ConstantMergedTensorDescriptor()
{
@@ -21,25 +21,28 @@ struct ConstantMergedTensorDescriptor
// TODO: check each of OriginalDimMergeSeqs contains at least 1, and at most
// OriginalTensorDesc::nDim number of dimensions
// TODO: check there is no duplication in OriginalDimMergeSeqs
// TODO: check OriginalDimMergeSeqs contains all original dimensions
// TODO: check there is no duplication in OriginalDimMergeSeqs
}
__host__ __device__ static constexpr index_t GetNumOfDimension() { return nDim; }
__host__ __device__ static constexpr index_t GetNumOfOriginalDimension() { return nOriginalDim }
__host__ __device__ static constexpr index_t GetNumOfOriginalDimension()
{
return nOriginalDim;
}
template <index_t IDim>
__host__ __device__ static constexpr bool ContainMultipleOriginalDimensions(Number<IDim>)
{
return (std::Get<IDIM>(mOriginalDimMergeSeqs).GetSize() > 1);
return (std::get<IDim>(mOriginalDimMergeSeqs).GetSize() > 1);
}
template <index_t IDim>
__host__ __device__ static constexpr index_t GetLength(Number<IDim>)
{
constexpr auto original_dims_partial = std::Get<IDim>(mOriginalDimMergeSeqs);
constexpr auto original_dims_partial = std::get<IDim>(mOriginalDimMergeSeqs);
return OriginalTensorDesc::Extract(original_dims_partial).GetElementSize();
}
@@ -50,14 +53,14 @@ struct ConstantMergedTensorDescriptor
static_assert(!ContainMultipleOriginalDimensions(Number<IDim>{}),
"wrong! stride of a merged dimension is undefined");
constexpr auto idim_original = std::Get<IDim>(mOriginalDimMergeSeqs).Front();
constexpr auto idim_original = std::get<IDim>(mOriginalDimMergeSeqs).Front();
return OriginalTensorDesc::GetStride(Number<idim_original>{});
}
__host__ __device__ static constexpr auto GetLengths()
{
return Sequence<OriginalTensorDesc::Extract(OriginalDimMergeSeqs).GetElementSize()...>{};
return Sequence<OriginalTensorDesc::Extract(OriginalDimMergeSeqs{}).GetElementSize()...>{};
}
__host__ __device__ static constexpr index_t GetElementSize()
@@ -75,17 +78,16 @@ struct ConstantMergedTensorDescriptor
constexpr auto original_dims_partial = std::get<idim>(mOriginalDimMergeSeqs);
// get partial original-multi-id corresponding to this merged dimension
constexpr auto original_multi_id_partial =
const auto original_multi_id_partial =
OriginalTensorDesc::Extract(original_dims_partial)
.GetMultiIndexFrom1dIndex(multi_id[idim]);
// make sure compiler unroll this loop and propagate all the constants
for(index_t i = 0; i < original_dims_partial.GetSize(); ++i)
{
index_t idim_original = original_dims_partial[i];
static_for<0, original_dims_partial.GetSize(), 1>{}([&](auto I_) {
constexpr auto I = decltype(I_){};
constexpr index_t idim_original = original_dims_partial.Get(I);
original_multi_id[idim_original] = original_multi_id_partial[i]
}
original_multi_id[idim_original] = original_multi_id_partial[I.Get()];
});
});
return original_multi_id;
@@ -95,10 +97,10 @@ struct ConstantMergedTensorDescriptor
{
const auto original_multi_id = GetOriginalMultiIndexFromMultiIndex(multi_id);
return OriginalTensorDesc::GetOffsetFromMultiIndex(orginal_multi_id);
return OriginalTensorDesc::GetOffsetFromMultiIndex(original_multi_id);
}
template <index_t... Is>
template <class... Is>
__host__ __device__ static index_t GetOffsetFromMultiIndex(Is... is)
{
return GetOffsetFromMultiIndex(Array<index_t, nDim>{is...});
@@ -106,14 +108,15 @@ struct ConstantMergedTensorDescriptor
__host__ __device__ static Array<index_t, nDim> GetMultiIndexFrom1dIndex(index_t id)
{
constexpr auto dummy_desc = make_packed_ConstantTensorDescriptor(GetLengths());
constexpr auto dummy_desc = make_ConstantTensorDescriptor_default_rank_packed(GetLengths());
return dummy_desc.GetMultiIndexFrom1dIndex(id);
}
};
template <class OriginalTensorDesc, class... OriginalDimMergeSeqs>
constexpr auto make_ConstantMergedTensorDescriptor(OriginalTensorDesc, OriginalDimMergeSeqs...)
__host__ __device__ constexpr auto make_ConstantMergedTensorDescriptor(OriginalTensorDesc,
OriginalDimMergeSeqs...)
{
return ConstantMergedTensorDescriptor<OriginalTensorDesc, OriginalDimMergeSeqs...>{};
}
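
A merged dimension acts as one dimension whose length is the product of the original lengths, and GetOriginalMultiIndexFromMultiIndex undoes the merge with mixed-radix arithmetic. A standalone runtime sketch of that decomposition (illustrative names, not the repo's API):

#include <array>
#include <cstdio>

// Decompose a 1d index within a merged dimension back into per-dimension
// indices (row-major, packed layout).
template <int N>
std::array<int, N> multi_index_from_1d(int id, const std::array<int, N>& lengths)
{
    std::array<int, N> idx{};
    for(int i = N - 1; i >= 0; --i)
    {
        idx[i] = id % lengths[i];
        id /= lengths[i];
    }
    return idx;
}

int main()
{
    // B = merge(N0, Ho, Wo) with lengths {2, 28, 28}: element size 1568
    auto idx = multi_index_from_1d<3>(1000, {2, 28, 28});
    std::printf("n0 %d ho %d wo %d\n", idx[0], idx[1], idx[2]); // 1, 7, 20
}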


@@ -2,20 +2,20 @@
#include "common.hip.hpp"
template <class Lengths>
__host__ __device__ constexpr auto calculate_packed_tensor_strides(Lengths)
__host__ __device__ constexpr auto calculate_tensor_strides_default_rank_packed(Lengths)
{
return reverse_inclusive_scan_sequence(Lengths{}.PopFront(), std::multiplies<index_t>{})
.PushBack(Number<1>{});
}
template <class Lengths, index_t Align>
__host__ __device__ constexpr auto
calculate_rank_tensor_default_strides_with_alignment(Lengths, Number<Align>)
__host__ __device__ constexpr auto calculate_tensor_strides_default_rank_aligned(Lengths,
Number<Align>)
{
constexpr index_t L_back_align =
Align * mod_conv::integer_divide_ceiler<index_t>{}(Lengths{}.Back(), Align);
return calculate_packed_tensor_strides(
return calculate_tensor_strides_default_rank_packed(
Lengths{}.Modify(Number<Lengths{}.GetSize() - 1>{}, Number<L_back_align>{}));
}
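
The renamed helpers implement a reverse running product: packed strides for lengths {L0, L1, L2, L3} are {L1*L2*L3, L2*L3, L3, 1}, and the aligned variant first rounds the innermost length up to the alignment. A runtime sketch (an assumed simplification of the Sequence-based code above):

#include <array>
#include <cstdio>

template <int N>
std::array<int, N> packed_strides(std::array<int, N> lengths)
{
    std::array<int, N> strides{};
    strides[N - 1] = 1;
    for(int i = N - 2; i >= 0; --i)
        strides[i] = strides[i + 1] * lengths[i + 1];
    return strides;
}

template <int N>
std::array<int, N> aligned_strides(std::array<int, N> lengths, int align)
{
    lengths[N - 1] = align * ((lengths[N - 1] + align - 1) / align); // pad innermost
    return packed_strides(lengths);
}

int main()
{
    auto s = packed_strides<4>({8, 16, 16, 30});     // {7680, 480, 30, 1}
    auto a = aligned_strides<4>({8, 16, 16, 30}, 4); // {8192, 512, 32, 1}
    std::printf("%d %d %d %d / %d %d %d %d\n", s[0], s[1], s[2], s[3], a[0], a[1], a[2], a[3]);
}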
@@ -66,6 +66,12 @@ struct ConstantTensorDescriptor
return MemoryRanks{}.Get(Number<I>{});
}
template <class T>
__host__ __device__ static constexpr bool ContainMultipleOriginalDimensions(T)
{
return false;
}
__host__ __device__ static constexpr index_t GetElementSize()
{
return accumulate_on_sequence(Lengths{}, std::multiplies<index_t>{}, Number<1>{});
@@ -146,7 +152,7 @@ struct ConstantTensorDescriptor
{
Array<index_t, nDim> multi_id;
constexpr auto dummy_strides = calculate_packed_tensor_strides(GetLengths());
constexpr auto dummy_strides = calculate_tensor_strides_default_rank_packed(GetLengths());
// calculate index in each of the dimensions in the order of their dimension (not rank)
static_for<0, nDim - 1, 1>{}([&](auto IDim) {
@@ -181,6 +187,12 @@ struct ConstantTensorDescriptor
return ConstantTensorDescriptor<extract_lengths, extract_strides, new_ranks>{};
}
template <index_t... IDims>
__host__ __device__ static constexpr auto Extract(Sequence<IDims...>)
{
return Extract(Number<IDims>{}...);
}
template <index_t IDim, index_t SliceLen>
__host__ __device__ static constexpr auto Slice(Number<IDim>, Number<SliceLen>)
{
@@ -271,9 +283,11 @@ struct ConstantTensorDescriptor
FirstUnfoldDim <= LastUnfoldDim,
"wrong! should have FirstUnfoldDim <= LastUnfoldDim!");
#if 0 // cannot compile: compiler complains about constexpr
// dimensions to be unfolded need to be in descending order (w.r.t. strides), and need to be
// packed in memory, otherwise, unfolding is invalid
static_for<FirstUnfoldDim, LastUnfoldDim, 1>{}([&](auto IDim) {
static_for<FirstUnfoldDim, LastUnfoldDim, 1>{}([&](auto IDim_) {
constexpr auto IDim = decltype(IDim_){};
constexpr auto IDim_p1 = IDim + Number<1>{};
// check stride
@@ -285,11 +299,12 @@ struct ConstantTensorDescriptor
static_assert(GetStride(IDim_p1) * GetLength(IDim_p1) == GetStride(IDim),
"wrong! dimensions to be unfolded need to be packed");
// checkt ranks
// check ranks
static_assert(GetMemoryRank(IDim_p1) == GetMemoryRank(IDim) + 1,
"wrong! ranks of dimensions to be unfolded need to be in increasing and "
"continuous ranks");
});
#endif
// left and right
constexpr auto left = typename arithmetic_sequence_gen<0, FirstUnfoldDim, 1>::SeqType{};
@@ -308,9 +323,9 @@ struct ConstantTensorDescriptor
// decrease the ranks that are larger than the rank of LastUnfoldDim
constexpr auto tmp_ranks =
transform_sequences(GetMemoryRanks(),
f_unfold_impl<GetMemoryRank(Number<LastUnfoldDim>{}),
LastUnfoldDim - FirstUnfoldDim + 1>{});
transform_sequences(f_unfold_impl<GetMemoryRank(Number<LastUnfoldDim>{}),
LastUnfoldDim - FirstUnfoldDim + 1>{},
GetMemoryRanks());
// new lengths, strides and ranks
constexpr auto new_lengths = GetLengths()
@@ -354,26 +369,26 @@ struct ConstantTensorDescriptor
};
template <class Lengths>
__host__ __device__ constexpr auto make_packed_ConstantTensorDescriptor(Lengths)
__host__ __device__ constexpr auto make_ConstantTensorDescriptor_default_rank_packed(Lengths)
{
using Strides = decltype(calculate_packed_tensor_strides(Lengths{}));
using Strides = decltype(calculate_tensor_strides_default_rank_packed(Lengths{}));
using MemoryRanks = typename arithmetic_sequence_gen<0, Lengths::GetSize(), 1>::SeqType;
return ConstantTensorDescriptor<Lengths, Strides, MemoryRanks>{};
}
template <class Lengths, class Strides>
__host__ __device__ constexpr auto make_ranked_ConstantTensorDescriptor(Lengths, Strides)
__host__ __device__ constexpr auto make_ConstantTensorDescriptor_default_rank(Lengths, Strides)
{
using MemoryRanks = typename arithmetic_sequence_gen<0, Lengths::GetSize(), 1>::SeqType;
return ConstantTensorDescriptor<Lengths, Strides, MemoryRanks>{};
}
template <class Lengths, index_t Align>
__host__ __device__ constexpr auto
make_ranked_ConstantTensorDescriptor_with_alignment(Lengths, Number<Align>)
__host__ __device__ constexpr auto make_ConstantTensorDescriptor_default_rank_aligned(Lengths,
Number<Align>)
{
using Strides =
decltype(calculate_rank_tensor_default_strides_with_alignment(Lengths{}, Number<Align>{}));
decltype(calculate_tensor_strides_default_rank_aligned(Lengths{}, Number<Align>{}));
using MemoryRanks = typename arithmetic_sequence_gen<0, Lengths::GetSize(), 1>::SeqType;
return ConstantTensorDescriptor<Lengths, Strides, MemoryRanks>{};
}


@@ -1,5 +1,5 @@
#pragma once
#include "constant_integral.hip.hpp"
#include "integral_constant.hip.hpp"
#include "functional.hip.hpp"
template <index_t... Is>
@@ -21,12 +21,6 @@ struct Sequence
return mData[I];
}
__host__ __device__ index_t operator[](index_t i) const
{
const index_t mData[mSize + 1] = {Is..., 0};
return mData[i];
}
template <index_t... IRs>
__host__ __device__ static constexpr auto ReorderGivenNew2Old(Sequence<IRs...> /*new2old*/)
{
@@ -164,6 +158,12 @@ struct sequence_reverse_inclusive_scan<Sequence<I>, Reduce>
using SeqType = Sequence<I>;
};
template <class Reduce>
struct sequence_reverse_inclusive_scan<Sequence<>, Reduce>
{
using SeqType = Sequence<>;
};
template <class, class>
struct sequence_extract;
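
The new Sequence<> specialization gives the reverse scan a base case for empty sequences, which arises when a rank-1 length list has its front popped during stride computation. A sketch of the recursion shape (illustrative metafunctions, not the repo's exact ones):

#include <type_traits>

template <int... Is>
struct Seq
{
};

// Reverse inclusive scan with multiplication: scan(Seq<a, b, c>) = Seq<a*b*c, b*c, c>.
template <class>
struct RevScanMul;

template <>
struct RevScanMul<Seq<>> // the base case this hunk adds
{
    using type = Seq<>;
};

template <int I>
struct RevScanMul<Seq<I>>
{
    using type = Seq<I>;
};

template <int I, int J, int... Is>
struct RevScanMul<Seq<I, J, Is...>>
{
    // scan the tail, then fold its head into the new front element
    template <int Head, int... Tail>
    static Seq<I * Head, Head, Tail...> prepend(Seq<Head, Tail...>);

    using type = decltype(prepend(typename RevScanMul<Seq<J, Is...>>::type{}));
};

static_assert(std::is_same<RevScanMul<Seq<2, 3, 4>>::type, Seq<24, 12, 4>>::value,
              "reverse running product");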


@@ -457,7 +457,8 @@ struct Blockwise2dTensorCopy3
index_t mSrcMyThreadOffset;
index_t mDstMyThreadOffset;
__device__ Blockwise2dTensorCopy3()
__device__ Blockwise2dTensorCopy3(Array<index_t, 2> src_block_data_multi_id_begin,
Array<index_t, 2> dst_block_data_multi_id_begin)
{
constexpr auto I0 = Number<0>{};
constexpr auto I1 = Number<1>{};
@@ -499,10 +500,13 @@ struct Blockwise2dTensorCopy3
const index_t thread_id_d0 = get_thread_local_1d_id() / thread_per_d1;
const index_t thread_id_d1 = get_thread_local_1d_id() - thread_id_d0 * thread_per_d1;
mSrcMyThreadOffset =
SrcDesc{}.GetOffsetFromMultiIndex(thread_id_d0, thread_id_d1 * DataPerRead);
mDstMyThreadOffset =
DstDesc{}.GetOffsetFromMultiIndex(thread_id_d0, thread_id_d1 * DataPerRead);
mSrcMyThreadOffset = SrcDesc{}.GetOffsetFromMultiIndex(
src_block_data_multi_id_begin +
Array<index_t, 2>{thread_id_d0, thread_id_d1 * DataPerRead});
mDstMyThreadOffset = DstDesc{}.GetOffsetFromMultiIndex(
dst_block_data_multi_id_begin +
Array<index_t, 2>{thread_id_d0, thread_id_d1 * DataPerRead});
}
__device__ void Run(const Float* __restrict__ p_src, Float* __restrict__ p_dst) const
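
The constructor now takes the block's data begin coordinates, so each thread's linear offset is the descriptor offset of (block begin + thread coordinate) rather than the thread coordinate alone. A toy sketch of that composition (hypothetical names and shapes):

// 2d offset = i0 * RowStride + i1; each thread shifts by the block's begin.
struct Desc2d
{
    int row_stride;
    constexpr int offset(int i0, int i1) const { return i0 * row_stride + i1; }
};

constexpr int thread_offset(
    Desc2d desc, int begin0, int begin1, int tid, int threads_per_d1, int data_per_read)
{
    return desc.offset(begin0 + tid / threads_per_d1,
                       begin1 + (tid % threads_per_d1) * data_per_read);
}

static_assert(thread_offset(Desc2d{64}, 0, 32, 5, 4, 4) == 100,
              "thread 5 of a 4-wide cluster, block begin (0, 32)");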


@@ -13,7 +13,7 @@ blockwise_4d_tensor_pointwise_operation_unary(DstDesc, Float* __restrict__ p_dst
constexpr auto dst_desc = DstDesc{};
constexpr auto desc = make_packed_ConstantTensorDescriptor(dst_desc.GetLengths());
constexpr auto desc = make_ConstantTensorDescriptor_default_rank_packed(dst_desc.GetLengths());
#if 0
if(get_thread_local_1d_id() == 0)
@@ -108,7 +108,7 @@ __device__ void blockwise_4d_tensor_pointwise_operation_binary_reorder_by_get_ds
constexpr auto src_desc = SrcDesc{};
constexpr auto dst_desc = DstDesc{};
constexpr auto ref_desc = make_packed_ConstantTensorDescriptor(SrcOpLengths{});
constexpr auto ref_desc = make_ConstantTensorDescriptor_default_rank_packed(SrcOpLengths{});
constexpr index_t NLoop = ref_desc.GetElementSize() / BlockSize;
@@ -259,7 +259,7 @@ struct Blockwise4dTensorCopy1
constexpr index_t read_per_d3 = mod_conv::integer_divide_ceil(L3, DataPerRead);
constexpr auto ref_desc =
make_packed_ConstantTensorDescriptor(Sequence<L0, L1, L2, read_per_d3>{});
make_ConstantTensorDescriptor_default_rank_packed(Sequence<L0, L1, L2, read_per_d3>{});
constexpr index_t NLoop = ref_desc.GetElementSize() / BlockSize;
@@ -336,7 +336,7 @@ struct BlockwiseChwnTensorCopyPadded
constexpr auto src_desc = SrcDesc{};
constexpr auto dst_desc = DstDesc{};
constexpr auto ref_desc = make_packed_ConstantTensorDescriptor(DstOpLengths{});
constexpr auto ref_desc = make_ConstantTensorDescriptor_default_rank_packed(DstOpLengths{});
constexpr auto h_global_pad_low = GlobalLowerPads{}.Get(I0);
constexpr auto w_global_pad_low = GlobalLowerPads{}.Get(I1);
@@ -510,7 +510,8 @@ struct Blockwise4dTensorCopy3
}
}
constexpr auto thread_cluster_desc = make_packed_ConstantTensorDescriptor(ThreadPerDims{});
constexpr auto thread_cluster_desc =
make_ConstantTensorDescriptor_default_rank_packed(ThreadPerDims{});
const auto thread_multi_id =
thread_cluster_desc.GetMultiIndexFrom1dIndex(get_thread_local_1d_id());
@@ -652,7 +653,7 @@ struct Blockwise4dTensorCopy3
constexpr index_t nloop_d2 = L2 / thread_per_d2;
constexpr index_t nloop_d3 = mod_conv::integer_divide_ceil(L3, thread_per_d3 * DataPerRead);
constexpr auto clipboard_desc = make_packed_ConstantTensorDescriptor(
constexpr auto clipboard_desc = make_ConstantTensorDescriptor_default_rank_packed(
Sequence<nloop_d0, nloop_d1, nloop_d2, nloop_d3 * DataPerRead>{});
#pragma unroll
@@ -719,7 +720,7 @@ struct Blockwise4dTensorCopy3
constexpr index_t nloop_d2 = L2 / thread_per_d2;
constexpr index_t nloop_d3 = mod_conv::integer_divide_ceil(L3, thread_per_d3 * DataPerRead);
constexpr auto clipboard_desc = make_packed_ConstantTensorDescriptor(
constexpr auto clipboard_desc = make_ConstantTensorDescriptor_default_rank_packed(
Sequence<nloop_d0, nloop_d1, nloop_d2, nloop_d3 * DataPerRead>{});
#pragma unroll


@@ -46,7 +46,7 @@ struct BlockwiseGemmBlockABlockBThreadCTransANormalBNormalC_v2
N % (NPerThreadSubC * NLevel0Cluster * NLevel1Cluster) == 0,
"wrong! Cannot evenly divide work among\n");
static_assert(ThreadMatrixC::GetLengths() == GetThreadMatrixCLengths(),
static_assert(is_same_type(ThreadMatrixC::GetLengths(), GetThreadMatrixCLengths()),
"wrong! ThreadMatrixC lengths is wrong");
auto c_thread_mtx_index = GetBeginOfThreadMatrixC(get_thread_local_1d_id());
@@ -55,7 +55,7 @@ struct BlockwiseGemmBlockABlockBThreadCTransANormalBNormalC_v2
mMyThreadOffsetB = BlockMatrixB::GetOffsetFromMultiIndex(0, c_thread_mtx_index.col);
}
__device__ static auto GetThreadMatrixCLengths()
__device__ static constexpr auto GetThreadMatrixCLengths()
{
constexpr index_t M = BlockMatrixA::NCol(); // A is transposed
constexpr index_t N = BlockMatrixB::NCol();


@@ -1,19 +1,19 @@
#pragma once
#include "threadwise_tensor_slice_op.hip.hpp"
// slice a merged tensor, reorder and copy it into a normal tensor
// src: a merged tensor,
// dst: a normal tensor
// slice a (normal or merged) tensor, reorder and copy it into another (normal or merged) tensor
template <index_t BlockSize,
class Float,
class SrcDesc,
class DstDesc,
class SliceLengths,
class SubLengths,
class ClusterLengths,
class DataClusterLengths,
class ThreadClusterArrangeOrder,
class SrcAccessOrder,
class DstAccessOrder>
class DstAccessOrder,
index_t SrcDataPerRead,
index_t DstDataPerRead>
struct BlockwiseTensorSliceCopy_generic_v1
{
static constexpr index_t nDim = SrcDesc::GetNumOfDimension();
@@ -21,39 +21,44 @@ struct BlockwiseTensorSliceCopy_generic_v1
index_t mSrcMyThreadOffset;
index_t mDstMyThreadOffset;
__device__ BlockwiseTensorSliceCopy_generic_v1(Array<index_t, nDim> src_block_multi_offset,
Array<index_t, nDim> dst_block_multi_offset)
__device__
BlockwiseTensorSliceCopy_generic_v1(Array<index_t, nDim> src_block_data_multi_id_begin,
Array<index_t, nDim> dst_block_data_multi_id_begin)
{
// check NDim consistent
static_assert(SrcDesc::GetNumOfDimension() == DstDesc::GetNumOfDimension(), "wrong");
constexpr auto thread_cluster_desc = make_packed_ConstantTensorDescriptor(
ClusterLengths{}.ReorderGivenNew2Old(ThreadClusterArrangeOrder{}));
// thread cluster
constexpr auto thread_cluster_desc = make_ConstantTensorDescriptor_default_rank_packed(
DataClusterLengths{}.ReorderGivenNew2Old(ThreadClusterArrangeOrder{}));
// BlockSize
static_assert(BlockSize == thread_cluster_desc.GetElementSize(), "wrong! BlockSize");
// divide work
static_for<0, nDim, 1>{}([&](auto IDim) {
static_assert(SliceLengths{}.Get(IDim) % SubLenghs{}.Get(IDim) == 0,
constexpr auto data_per_cluster_per_dims = SubLengths{} * DataClusterLengths{};
static_for<0, nDim, 1>{}([&](auto IDim_) {
constexpr auto IDim = decltype(IDim_){};
static_assert(SliceLengths::Get(IDim) % SubLengths::Get(IDim) == 0,
"wrong! cannot evenly divide sliced tensor into sub-tensor");
static_assert(SliceLengths::Get(IDim) % data_per_cluster_per_dims.Get(IDim) == 0,
"wrong! cannot evenly divide sliced tensor into cluster");
});
constexpr auto thread_work_desc =
make_packed_ConstantTensorDescriptor(SliceLengths{} / SliceSubLengths{});
constexpr auto repeat_lengths = SliceLengths{} / data_per_cluster_per_dims;
static_for<0, nDim, 1>{}([&](auto IDim) {
static_assert(thread_work_desc.GetLength(IDim) % thread_cluster_desc.Get(IDim) == 0,
"wrong! cannot evenly divide work to cluster");
});
// for now, only support SubLengths.Get() == 1 on a merged dimension that is merged from
// multiple dimensions
static_for<0, nDim, 1>{}([&](auto IDim_) {
constexpr auto IDim = decltype(IDim_){};
// only support SubLengths.Get() == 1 on merged dimension, for now
static_for<0, nDim, 1>{}([&](auto IDim) {
static_if<(SrcDesc::ContainMultipleOriginalDimensions(IDim) ||
DstDesc::ContainMultipleOriginalDimensions(IDim))>{}([&](auto fwd) {
static_assert(fwd(SubLengths{}).Get(IDim) == 1,
"wrong! Sub-Lengths on merged dimension should be 1");
});
static_assert(SubLengths::Get(IDim) == 1 ||
(!SrcDesc::ContainMultipleOriginalDimensions(IDim) &&
!DstDesc::ContainMultipleOriginalDimensions(IDim)),
"wrong! only surpport Sub-Length == 1 on a merged dimension");
});
// calculate mSrcMyThreadOffset, mDstMyThreadOffset
@@ -63,22 +68,23 @@ struct BlockwiseTensorSliceCopy_generic_v1
const auto data_cluster_multi_id =
reorder_array_given_old2new(thread_cluster_multi_id, ThreadClusterArrangeOrder{});
const auto thread_data_multi_offset = data_cluster_multi_id * SubLengths{};
const auto thread_data_multi_id_begin = data_cluster_multi_id * SubLengths{};
mSrcMythreadOffset =
SrcDesc::GetOffsetFromMultiIndex(src_block_multi_offset + thread_data_multi_offset);
mSrcMythreadOffset =
DstDesc::GetOffsetFromMultiIndex(dst_block_multi_offset + thread_data_multi_offset);
mSrcMyThreadOffset = SrcDesc::GetOffsetFromMultiIndex(src_block_data_multi_id_begin +
thread_data_multi_id_begin);
mDstMyThreadOffset = DstDesc::GetOffsetFromMultiIndex(dst_block_data_multi_id_begin +
thread_data_multi_id_begin);
}
__device__ static constexpr index_t GetRegisterClipboardSize()
{
constexpr auto repeat_lengths = SliceLengths{} / (SubLengths{} * ClusterLengths{});
constexpr auto repeat_lengths = SliceLengths{} / (SubLengths{} * DataClusterLengths{});
constexpr auto thread_tensor_desc =
make_packed_ConstantTensorDescriptor(SubLengths{} * repeat_lengths);
make_ConstantTensorDescriptor_default_rank_packed(SubLengths{} * repeat_lengths);
return thread_tensor_desc.GetElementSpaceSize();
return thread_tensor_desc.GetElementSpace();
}
__device__ void RunLoadRegisterClipboard(const Float* __restrict__ p_src,
@@ -86,32 +92,34 @@ struct BlockwiseTensorSliceCopy_generic_v1
{
constexpr auto thread_sub_tensor_lengths = SubLengths{};
constexpr auto data_per_cluster_per_dims = thread_sub_tensor_lengths * ClusterLengths{};
constexpr auto data_per_cluster_per_dims = thread_sub_tensor_lengths * DataClusterLengths{};
constexpr auto repeat_lengths = SliceLengths{} / (SubLengths{} * ClusterLengths{});
constexpr auto repeat_lengths = SliceLengths{} / (SubLengths{} * DataClusterLengths{});
constexpr auto thread_tensor_desc =
make_packed_ConstantTensorDescriptor(thread_sub_tensor_lengths * repeat_lengths);
constexpr auto thread_tensor_desc = make_ConstantTensorDescriptor_default_rank_packed(
thread_sub_tensor_lengths * repeat_lengths);
static_ford<decltype(repeat_lengths)>{}([&](auto repeat_multi_id_) {
constexpr auto repeat_multi_id = decltype(repeat_multi_id_){};
constexpr auto repeat_multi_id = sequence2array(decltype(repeat_multi_id_){});
constexpr auto src_data_multi_offset = repeat_multi_id * data_per_cluster_per_dims;
const auto src_thread_data_multi_id_begin =
repeat_multi_id * data_per_cluster_per_dims; // cannot be constexpr, why?
constexpr auto clipboard_data_multi_offset =
repeat_multi_id * thread_sub_tensor_lengths;
const auto clipboard_data_multi_id_begin =
repeat_multi_id * thread_sub_tensor_lengths; // cannot be constexpr, why?
constexpr index_t src_offset = SrcDesc{}.GetOffsetFromMultiIndex(src_data_multi_id);
constexpr index_t clipboard_offset =
thread_tensor_desc.GetOffsetFromMultiIndex(clipboard_data_multi_id);
const index_t src_offset = SrcDesc{}.GetOffsetFromMultiIndex(
src_thread_data_multi_id_begin); // cannot be constexpr, why?
const index_t clipboard_offset = thread_tensor_desc.GetOffsetFromMultiIndex(
clipboard_data_multi_id_begin); // cannot be constexpr, why?
threadwise_tensor_slice_copy_generic(SrcDesc{},
p_src + src_offset + mSrcMyThreadOffset,
thread_tensor_desc,
zero_array<index_t, nDim>{},
make_zero_array<index_t, nDim>(),
thread_tensor_desc,
p_clipboard + clipboard_offset,
zero_array<index_t, nDim>{},
make_zero_array<index_t, nDim>(),
thread_sub_tensor_lengths,
SrcAccessOrder{});
});
@@ -122,41 +130,44 @@ struct BlockwiseTensorSliceCopy_generic_v1
{
constexpr auto thread_sub_tensor_lengths = SubLengths{};
constexpr auto data_per_cluster_per_dims = thread_sub_tensor_lengths * ClusterLengths{};
constexpr auto data_per_cluster_per_dims = thread_sub_tensor_lengths * DataClusterLengths{};
constexpr auto repeat_lengths = SliceLengths{} / (SubLengths{} * ClusterLengths{});
constexpr auto repeat_lengths = SliceLengths{} / (SubLengths{} * DataClusterLengths{});
constexpr auto thread_tensor_desc =
make_packed_ConstantTensorDescriptor(thread_sub_tensor_lengths * repeat_lengths);
constexpr auto thread_tensor_desc = make_ConstantTensorDescriptor_default_rank_packed(
thread_sub_tensor_lengths * repeat_lengths);
static_ford<decltype(repeat_lengths)>{}([&](auto repeat_multi_id_) {
constexpr auto repeat_multi_id = decltype(repeat_multi_id_){};
constexpr auto repeat_multi_id = sequence2array(decltype(repeat_multi_id_){});
constexpr auto clipboard_data_multi_offset =
repeat_multi_id * thread_sub_tensor_lengths;
const auto clipboard_data_multi_id_begin =
repeat_multi_id * thread_sub_tensor_lengths; // cannot be constexpr, why?
constexpr auto dst_data_multi_offset = repeat_multi_id * data_per_cluster_per_dims;
const auto dst_data_multi_id_begin =
repeat_multi_id * data_per_cluster_per_dims; // cannot be constexpr, why?
constexpr index_t clipboard_offset =
thread_tensor_desc.GetOffsetFromMultiIndex(clipboard_data_multi_offset);
const index_t clipboard_offset = thread_tensor_desc.GetOffsetFromMultiIndex(
clipboard_data_multi_id_begin); // cannot be constexpr, why?
constexpr index_t dst_offset = DstDesc{}.GetOffsetFromMultiIndex(dst_data_multi_offset);
const index_t dst_offset = DstDesc{}.GetOffsetFromMultiIndex(
dst_data_multi_id_begin); // cannot be constexpr, why?
threadwise_tensor_slice_copy_generic(thread_tensor_desc,
p_clipboard + clipboard_offset,
zero_array<index_t, nDim>{},
make_zero_array<index_t, nDim>(),
DstDesc{},
p_dst + dst_offset + mDstMyThreadOffset,
zero_array<index_t, nDim>{},
make_zero_array<index_t, nDim>(),
thread_sub_tensor_lengths,
DstAccessOrder{});
});
}
__device__ void Run(const Float* __restrict__ p_src, Float* __restrict__ p_dst) const
{
Float p_clipboard[GetRegisterClipboardSize()];
Float p_clipboard[GetRegisterClipboardSize()];
RunLoadRegisterClipboard(p_src, p_clipboard);
RunStoreRegisterClipboard(p_clipboard, p_dst);
RunLoadRegisterClipboard(p_src, p_clipboard);
RunStoreRegisterClipboard(p_clipboard, p_dst);
}
};
};
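
The sizing logic in this copy follows one identity per dimension: SliceLengths = RepeatLengths * DataClusterLengths * SubLengths, with each thread owning SubLengths elements per repeat. A compile-time sketch of that accounting for a single dimension (values illustrative):

// Per-dimension accounting for the blockwise copy:
//   SliceLengths = RepeatLengths * DataClusterLengths * SubLengths
constexpr int SliceLen   = 32; // slice length in this dimension
constexpr int SubLen     = 4;  // elements per thread per repeat
constexpr int ClusterLen = 2;  // threads cooperating along this dimension
constexpr int DataPerClusterPerDim = SubLen * ClusterLen;  // 8
constexpr int RepeatLen = SliceLen / DataPerClusterPerDim; // 4
static_assert(SliceLen % DataPerClusterPerDim == 0, "must divide evenly");
// Each thread's register clipboard covers SubLen * RepeatLen elements in this
// dimension, matching GetRegisterClipboardSize above.
constexpr int ClipboardLen = SubLen * RepeatLen; // 16
static_assert(ClipboardLen == 16, "per-thread elements in this dimension");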


@@ -40,7 +40,7 @@ struct BlockwiseTensorSliceReorderCopy_v3
src_cluster_lengths.ReorderGivenNew2Old(map_thread_cluster_2_src_cluster);
constexpr auto thread_cluster_desc =
make_packed_ConstantTensorDescriptor(thread_cluster_lengths);
make_ConstantTensorDescriptor_default_rank_packed(thread_cluster_lengths);
// sanity check: data type
static_assert(is_same<Float, float>::value, "wrong! only support float for now!\n");
@@ -149,7 +149,7 @@ struct BlockwiseTensorSliceReorderCopy_v3
constexpr auto thread_tensor_lengths = thread_sub_tensor_lengths * repeat_lengths;
constexpr auto thread_tensor_desc =
make_packed_ConstantTensorDescriptor(thread_tensor_lengths);
make_ConstantTensorDescriptor_default_rank_packed(thread_tensor_lengths);
return thread_tensor_desc.GetElementSpace();
}
@@ -170,7 +170,7 @@ struct BlockwiseTensorSliceReorderCopy_v3
constexpr auto thread_tensor_lengths = thread_sub_tensor_lengths * repeat_lengths;
constexpr auto thread_tensor_desc =
make_packed_ConstantTensorDescriptor(thread_tensor_lengths);
make_ConstantTensorDescriptor_default_rank_packed(thread_tensor_lengths);
static_ford<decltype(repeat_lengths)>{}([&](auto repeat_multi_id_) {
constexpr auto repeat_multi_id = decltype(repeat_multi_id_){};
@@ -208,7 +208,7 @@ struct BlockwiseTensorSliceReorderCopy_v3
constexpr auto thread_tensor_lengths = thread_sub_tensor_lengths * repeat_lengths;
constexpr auto thread_tensor_desc =
make_packed_ConstantTensorDescriptor(thread_tensor_lengths);
make_ConstantTensorDescriptor_default_rank_packed(thread_tensor_lengths);
static_ford<decltype(repeat_lengths)>{}([&](auto repeat_multi_id_) {
constexpr auto repeat_multi_id = decltype(repeat_multi_id_){};


@@ -1,6 +1,6 @@
#pragma once
#include "vector_type.hip.hpp"
#include "constant_integral.hip.hpp"
#include "integral_constant.hip.hpp"
#include "Sequence.hip.hpp"
#include "Array.hip.hpp"
#include "functional.hip.hpp"
@@ -17,15 +17,21 @@ __device__ index_t get_block_1d_id() { return blockIdx.x; }
template <class T1, class T2>
struct is_same
{
static const bool value = false;
static constexpr bool value = false;
};
template <class T>
struct is_same<T, T>
{
static const bool value = true;
static constexpr bool value = true;
};
template <class X, class Y>
__host__ __device__ constexpr bool is_same_type(X, Y)
{
return is_same<X, Y>::value;
}
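
The added is_same_type lets value-level code ask a type-level question, which is what makes the blockwise GEMM static_assert on Sequence lengths work: two Sequences are equal exactly when they are the same type. A self-contained sketch mirroring the definitions above:

template <class T1, class T2>
struct is_same
{
    static constexpr bool value = false;
};

template <class T>
struct is_same<T, T>
{
    static constexpr bool value = true;
};

template <class X, class Y>
constexpr bool is_same_type(X, Y)
{
    return is_same<X, Y>::value;
}

template <int... Is>
struct Sequence
{
};

static_assert(is_same_type(Sequence<8, 4>{}, Sequence<8, 4>{}), "same lengths, same type");
static_assert(!is_same_type(Sequence<8, 4>{}, Sequence<4, 8>{}), "order matters");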
namespace mod_conv { // namespace mod_conv
template <class T, T s>
struct scales


@@ -30,7 +30,7 @@ __host__ __device__ constexpr auto get_convolution_output_default_4d_tensor_desc
constexpr auto HO = HI + 1 - Y;
constexpr auto WO = WI + 1 - X;
return make_packed_ConstantTensorDescriptor(Sequence<N, K, HO, WO>{});
return make_ConstantTensorDescriptor_default_rank_packed(Sequence<N, K, HO, WO>{});
}
template <class InDesc, class WeiDesc, class LowerPads, class UpperPads>
@@ -67,7 +67,7 @@ __host__ __device__ constexpr auto get_convolution_with_padding_output_default_4
constexpr auto HO = HI + HPadLow + HPadUp + 1 - Y;
constexpr auto WO = WI + WPadLow + WPadUp + 1 - X;
return make_packed_ConstantTensorDescriptor(Sequence<N, K, HO, WO>{});
return make_ConstantTensorDescriptor_default_rank_packed(Sequence<N, K, HO, WO>{});
}
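
The padded output size used here is the stride-1, no-dilation formula HO = HI + HPadLow + HPadUp + 1 - Y. A small sanity check (hypothetical shapes):

constexpr int conv_out_size(int in, int window, int pad_low, int pad_up)
{
    return in + pad_low + pad_up + 1 - window; // stride 1, no dilation
}
static_assert(conv_out_size(28, 3, 1, 1) == 28, "3x3, pad 1: 'same' output size");
static_assert(conv_out_size(34, 3, 0, 0) == 32, "3x3 on 34x34, no padding");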
template <class InDesc, class WeiDesc, class OutDesc>


@@ -1,5 +1,5 @@
#pragma once
#include "constant_integral.hip.hpp"
#include "integral_constant.hip.hpp"
struct forwarder
{


@@ -85,7 +85,7 @@ struct GridwiseConvolutionImplicitGemm_v1r3_lds_double_buffer_chwn_cyxk_khwn
constexpr index_t WBlockWork = mod_conv::integer_divide_ceil(Wo, WoPerBlock);
constexpr index_t NBlockWork = mod_conv::integer_divide_ceil(N, NPerBlock);
constexpr auto block_work_desc = make_packed_ConstantTensorDescriptor(
constexpr auto block_work_desc = make_ConstantTensorDescriptor_default_rank_packed(
Sequence<KBlockWork, HBlockWork, WBlockWork, NBlockWork>{});
const auto block_work_multi_id =
@@ -109,7 +109,7 @@ struct GridwiseConvolutionImplicitGemm_v1r3_lds_double_buffer_chwn_cyxk_khwn
GemmDataPerReadA,
GemmDataPerReadB);
constexpr auto in_c_h_w_n_block_desc = make_ranked_ConstantTensorDescriptor_with_alignment(
constexpr auto in_c_h_w_n_block_desc = make_ConstantTensorDescriptor_default_rank_aligned(
Sequence<CPerBlock, HoPerBlock, WoPerBlock, NPerBlock>{},
Number<InBlockCopyDataPerRead_N>{});
@@ -118,12 +118,12 @@ struct GridwiseConvolutionImplicitGemm_v1r3_lds_double_buffer_chwn_cyxk_khwn
static_assert(in_c_h_w_n_block_desc.GetStride(I1) % GemmDataPerReadB == 0,
"GemmDataPerReadB alignment requirement is not meet");
constexpr auto wei_c_k_block_desc = make_ranked_ConstantTensorDescriptor_with_alignment(
constexpr auto wei_c_k_block_desc = make_ConstantTensorDescriptor_default_rank_aligned(
Sequence<CPerBlock, KPerBlock>{},
Number<mod_conv::max(WeiBlockCopyDataPerRead_K, GemmDataPerReadA)>{});
// tensor view of threadwise output in register
constexpr auto out_k_h_w_n_thread_desc = make_packed_ConstantTensorDescriptor(
constexpr auto out_k_h_w_n_thread_desc = make_ConstantTensorDescriptor_default_rank_packed(
Sequence<KPerThread, HoPerThread, WoPerThread, NPerThread>{});
// blockwise copy


@@ -86,7 +86,7 @@ struct GridwiseConvolutionImplicitGemm_v1r3_lds_double_buffer_nchw_cyxk_khwn
constexpr index_t HBlockWork = mod_conv::integer_divide_ceil(Ho, HoPerBlock);
constexpr index_t WBlockWork = mod_conv::integer_divide_ceil(Wo, WoPerBlock);
constexpr auto block_work_desc = make_packed_ConstantTensorDescriptor(
constexpr auto block_work_desc = make_ConstantTensorDescriptor_default_rank_packed(
Sequence<NBlockWork, KBlockWork, HBlockWork, WBlockWork>{});
const auto block_work_multi_id =
@@ -102,7 +102,7 @@ struct GridwiseConvolutionImplicitGemm_v1r3_lds_double_buffer_nchw_cyxk_khwn
// global tensor view
constexpr auto wei_c_k_global_desc =
make_ranked_ConstantTensorDescriptor(Sequence<C, K>{}, Sequence<Y * X * K, 1>{});
make_ConstantTensorDescriptor_default_rank(Sequence<C, K>{}, Sequence<Y * X * K, 1>{});
// LDS tensor view
// be careful of alignment
@@ -111,7 +111,7 @@ struct GridwiseConvolutionImplicitGemm_v1r3_lds_double_buffer_nchw_cyxk_khwn
GemmDataPerReadA,
GemmDataPerReadB);
constexpr auto in_c_h_w_n_block_desc = make_ranked_ConstantTensorDescriptor_with_alignment(
constexpr auto in_c_h_w_n_block_desc = make_ConstantTensorDescriptor_default_rank_aligned(
Sequence<CPerBlock, HoPerBlock, WoPerBlock, NPerBlock>{},
Number<InBlockReorderDataPerWrite_N>{});
@@ -120,12 +120,12 @@ struct GridwiseConvolutionImplicitGemm_v1r3_lds_double_buffer_nchw_cyxk_khwn
static_assert(in_c_h_w_n_block_desc.GetStride(I1) % GemmDataPerReadB == 0,
"GemmDataPerReadB alignment requirement is not meet");
constexpr auto wei_c_k_block_desc = make_ranked_ConstantTensorDescriptor_with_alignment(
constexpr auto wei_c_k_block_desc = make_ConstantTensorDescriptor_default_rank_aligned(
Sequence<CPerBlock, KPerBlock>{},
Number<mod_conv::max(WeiBlockCopyDataPerRead_K, GemmDataPerReadA)>{});
// tensor view of threadwise output in register
constexpr auto out_k_h_w_n_thread_desc = make_packed_ConstantTensorDescriptor(
constexpr auto out_k_h_w_n_thread_desc = make_ConstantTensorDescriptor_default_rank_packed(
Sequence<KPerThread, HoPerThread, WoPerThread, NPerThread>{});
// blockwise copy
@@ -448,10 +448,10 @@ struct GridwiseConvolutionImplicitGemm_v1r3_lds_double_buffer_nchw_cyxk_khwn
constexpr index_t K1 = KPerBlock / KPerThread;
#if 0
constexpr auto out_10d_global_desc = make_packed_ConstantTensorDescriptor(
constexpr auto out_10d_global_desc = make_ConstantTensorDescriptor_default_rank_packed(
Sequence<K / (K1 * K2), K1, K2, Ho, Wo / (W1 * W2 * W3), W1, W2, W3, N / N1, N1>{});
constexpr auto out_10d_thread_desc = make_packed_ConstantTensorDescriptor(
constexpr auto out_10d_thread_desc = make_ConstantTensorDescriptor_default_rank_packed(
Sequence<KPerThread / K2, 1, K2, HoPerThread, 1, W1, 1, W3, 1, N1>{});
#else
constexpr auto out_10d_global_desc =


@@ -86,7 +86,7 @@ struct GridwiseConvolutionImplicitGemm_v1r3_lds_double_buffer_nchw_cyxk_nkhw
constexpr index_t HBlockWork = mod_conv::integer_divide_ceil(Ho, HoPerBlock);
constexpr index_t WBlockWork = mod_conv::integer_divide_ceil(Wo, WoPerBlock);
constexpr auto block_work_desc = make_packed_ConstantTensorDescriptor(
constexpr auto block_work_desc = make_ConstantTensorDescriptor_default_rank_packed(
Sequence<NBlockWork, KBlockWork, HBlockWork, WBlockWork>{});
const auto block_work_multi_id =
@@ -110,7 +110,7 @@ struct GridwiseConvolutionImplicitGemm_v1r3_lds_double_buffer_nchw_cyxk_nkhw
GemmDataPerReadA,
GemmDataPerReadB);
constexpr auto in_c_h_w_n_block_desc = make_ranked_ConstantTensorDescriptor_with_alignment(
constexpr auto in_c_h_w_n_block_desc = make_ConstantTensorDescriptor_default_rank_aligned(
Sequence<CPerBlock, HoPerBlock, WoPerBlock, NPerBlock>{},
Number<InBlockReorderDataPerWrite_N>{});
@@ -119,12 +119,12 @@ struct GridwiseConvolutionImplicitGemm_v1r3_lds_double_buffer_nchw_cyxk_nkhw
static_assert(in_c_h_w_n_block_desc.GetStride(I1) % GemmDataPerReadB == 0,
"GemmDataPerReadB alignment requirement is not meet");
constexpr auto wei_c_k_block_desc = make_ranked_ConstantTensorDescriptor_with_alignment(
constexpr auto wei_c_k_block_desc = make_ConstantTensorDescriptor_default_rank_aligned(
Sequence<CPerBlock, KPerBlock>{},
Number<mod_conv::max(WeiBlockCopyDataPerRead_K, GemmDataPerReadA)>{});
// tensor view of threadwise output in register
constexpr auto out_k_h_w_n_thread_desc = make_packed_ConstantTensorDescriptor(
constexpr auto out_k_h_w_n_thread_desc = make_ConstantTensorDescriptor_default_rank_packed(
Sequence<KPerThread, HoPerThread, WoPerThread, NPerThread>{});
// blockwise copy
@@ -152,7 +152,7 @@ struct GridwiseConvolutionImplicitGemm_v1r3_lds_double_buffer_nchw_cyxk_nkhw
decltype(wei_c_k_global_desc),
decltype(wei_c_k_block_desc),
decltype(wei_c_k_block_desc.GetLengths()),
WeiBlockCopyDataPerRead_K>{};
WeiBlockCopyDataPerRead_K>({0, 0}, {0, 0});
// a series of blockwise batched GEMM
// C_matrix += transpose(A_matrix) * B_matrix
@@ -196,7 +196,7 @@ struct GridwiseConvolutionImplicitGemm_v1r3_lds_double_buffer_nchw_cyxk_nkhw
// choose GEMM implementation here
const auto run_blockwise_batch_gemm = [&](auto... Xs) {
#if 0
#if 1
return blockwise_batch_gemm.Run(Xs...);
#elif 0
return blockwise_batch_gemm.Run_asm(Xs...);


@@ -1,8 +1,11 @@
#pragma once
#include "common.hip.hpp"
#include "ConstantTensorDescriptor.hip.hpp"
#include "ConstantMergedTensorDescriptor.hip.hpp"
#include "ConstantMatrixDescriptor.hip.hpp"
#include "blockwise_merged_tensor_slice_op.hip.hpp"
#include "blockwise_gemm.hip.hpp"
#include "threadwise_tensor_slice_op.hip.hpp"
// define B = merge(N, Ho, Wo)
template <index_t GridSize,
@@ -24,7 +27,12 @@ template <index_t GridSize,
index_t GemmNLevel1Cluster,
index_t GemmKPerThreadLoop,
index_t GemmDataPerReadA,
index_t GemmDataPerReadB>
index_t GemmDataPerReadB,
class InBlockCopySubLengths_N1_N2_C_B,
class InBlockCopyClusterLengths_N1_N2_C_B,
index_t InBlockCopySrcDataPerRead_B,
index_t InBlockCopyDstDataPerWrite_N2,
index_t WeiBlockCopyDataPerAccess_K>
struct GridwiseConvolutionImplicitGemm_v3_nchw_cyxk_nkhw
{
__device__ void Run(const Float* const __restrict__ p_in_global,
@@ -34,12 +42,10 @@ struct GridwiseConvolutionImplicitGemm_v3_nchw_cyxk_nkhw
// this is a mess
// TODO: more elegant way of specifying (or calculating) performance variables
static_assert(N2 == GemmNPerThreadSubC, "wrong!");
static_assert(KPerBlock ==
N1 * GemmNPerThreadSubC * GemmNLevel0Cluster * GemmNLevel1Cluster,
static_assert((N1 * N2 * BPerBlock) %
(GemmNPerThreadSubC * GemmNLevel0Cluster * GemmNLevel1Cluster) ==
0,
"wrong!");
static_assert(
KPerBlock % (N1 * GemmNPerThreadSubC * GemmNLevel0Cluster * GemmNLevel1Cluster) == 0,
"wrong!");
constexpr auto I0 = Number<0>{};
constexpr auto I1 = Number<1>{};
@@ -73,15 +79,14 @@ struct GridwiseConvolutionImplicitGemm_v3_nchw_cyxk_nkhw
constexpr index_t B = N0 * Ho * Wo;
// divide block work by [K, B]
static_assert(K % KPerBlock == 0 && B % BPerBlock == 0,
C % CPerBlock == 0,
static_assert(K % KPerBlock == 0 && B % BPerBlock == 0 && C % CPerBlock == 0,
"wrong! cannot divide work evenly among block");
constexpr index_t KBlockWork = K / KPerBlock;
constexpr index_t BBlockWork = B / BPerBlock;
constexpr auto block_work_desc =
make_ConstantTensorDescriptor(Sequence<KBlockWork, BBlockWork>{});
make_ConstantTensorDescriptor_default_rank_packed(Sequence<KBlockWork, BBlockWork>{});
const auto block_work_multi_id =
block_work_desc.GetMultiIndexFrom1dIndex(get_block_1d_id());
@@ -95,16 +100,20 @@ struct GridwiseConvolutionImplicitGemm_v3_nchw_cyxk_nkhw
in_n_c_h_w_global_desc.Fold(I0, Number<N1>{}, Number<N2>{});
// merged tensor descriptor in device memory [N1, N2, C, B], src of blockwise copy
constexpr auto in_n1_n2_c_b_global_merged_desc =
in_n0_n1_n2_c_h_w_global_desc.ReorderGivenNew2Old(Sequence<1, 2, 3, 0, 4, 5>{})
constexpr auto in_n1_n2_c_b_global_merged_desc = make_ConstantMergedTensorDescriptor(
in_n0_n1_n2_c_h_w_global_mem_desc.ReorderGivenNew2Old(Sequence<1, 2, 3, 0, 4, 5>{})
.Slice(I4, Number<Ho>{})
.Slice(I5, Number<Wo>{})
.Merge(I3, I5);
.Slice(I5, Number<Wo>{}),
Sequence<0>{},
Sequence<1>{},
Sequence<2>{},
Sequence<3, 4, 5>{});
// memory layout descriptor in LDS [C, N1, B, N2]
// be careful of LDS alignment
constexpr auto in_c_n1_b_n2_block_mem_desc = make_ConstantTensorDescriptor_aligned(
Sequence<CPerBlock, N1, BPerBlock, N2>{}, Number<InBlockCopyDstDataPerWrite_N2>{});
constexpr auto in_c_n1_b_n2_block_mem_desc =
make_ConstantTensorDescriptor_default_rank_aligned(
Sequence<CPerBlock, N1, BPerBlock, N2>{}, Number<InBlockCopyDstDataPerWrite_N2>{});
// tensor descriptor in LDS [N1, N2, C, B], dst of blockwise copy
constexpr auto in_n1_n2_c_b_block_desc =
@@ -112,7 +121,7 @@ struct GridwiseConvolutionImplicitGemm_v3_nchw_cyxk_nkhw
// this check is ad-hoc
// TODO: need to properly implement tensor descriptor with alignment
static_assert(in_c_n1_b_n2_block_desc.GetStride(I1) % GemmDataPerReadB == 0,
static_assert(in_c_n1_b_n2_block_mem_desc.GetStride(I1) % GemmDataPerReadB == 0,
"GemmDataPerReadB alignment requirement is not satisfied");
// input blockwise copy
@@ -129,7 +138,8 @@ struct GridwiseConvolutionImplicitGemm_v3_nchw_cyxk_nkhw
Sequence<2, 0, 1, 3>, // thread_arrange_order [C, N1, N2, B]
Sequence<0, 1, 2, 3>, // src_access_order [N1, N2, C, B]
Sequence<2, 0, 3, 1>, // dst_access_order [C, N1, B, N2]
>({0, 0, 0, b_block_data_on_global}, {0, 0, 0, 0});
InBlockCopySrcDataPerRead_B,
InBlockCopyDstDataPerWrite_N2>({0, 0, 0, b_block_data_on_global}, {0, 0, 0, 0});
// weight tensor
// tensor descriptor in device memory, src of blockwise copy
@@ -137,9 +147,9 @@ struct GridwiseConvolutionImplicitGemm_v3_nchw_cyxk_nkhw
// tensor descriptor in LDS, dst of blockwise copy
// be careful of LDS alignment
constexpr auto wei_c_k_block_desc = make_ConstantTensorDescriptor_aligned(
constexpr auto wei_c_k_block_desc = make_ConstantTensorDescriptor_default_rank_aligned(
Sequence<CPerBlock, KPerBlock>{},
Number<mod_conv::max(WeiBlockCopyDataPerRead_K, GemmDataPerReadA)>{});
Number<mod_conv::max(WeiBlockCopyDataPerAccess_K, GemmDataPerReadA)>{});
// operator for blockwise copy of weight into LDS
// slicing a tensor
@@ -150,7 +160,8 @@ struct GridwiseConvolutionImplicitGemm_v3_nchw_cyxk_nkhw
decltype(wei_c_k_global_desc),
decltype(wei_c_k_block_desc),
decltype(wei_c_k_block_desc.GetLengths()),
WeiBlockCopyDataPerRead_K>({0, k_block_data_on_global}, {0, 0});
WeiBlockCopyDataPerAccess_K>({0, k_block_data_on_global},
{0, 0});
// GEMM definition
// c_mtx += transpose(a_mtx) * b_mtx
@@ -167,7 +178,8 @@ struct GridwiseConvolutionImplicitGemm_v3_nchw_cyxk_nkhw
Number<in_c_n1_b_n2_block_mem_desc.GetStride(I0)>{});
// sanity check
static_assert(KPerBlock % (GemmMPerThreadSubC * GemmMLevel0Cluster * GemmMLevel1Cluster),
static_assert(KPerBlock % (GemmMPerThreadSubC * GemmMLevel0Cluster * GemmMLevel1Cluster) ==
0,
"wrong!");
constexpr index_t GemmMRepeat =
@@ -194,8 +206,8 @@ struct GridwiseConvolutionImplicitGemm_v3_nchw_cyxk_nkhw
GemmDataPerReadB>{};
// LDS allocation for input and weight: be careful of alignment
constexpr index_t max_align = mod_conv::max(InBlockReorderDataPerWrite_N,
WeiBlockCopyDataPerRead_K,
constexpr index_t max_align = mod_conv::max(InBlockCopyDstDataPerWrite_N2,
WeiBlockCopyDataPerAccess_K,
GemmDataPerReadA,
GemmDataPerReadB);
@@ -211,7 +223,7 @@ struct GridwiseConvolutionImplicitGemm_v3_nchw_cyxk_nkhw
Float p_out_thread[c_k0k2_n1n2_thread_mtx_desc.GetElementSpace()];
// zero out threadwise output
threadwise_matrix_set_zero(out_k0_k1_k2_n1_n0_h_w_n2_thread_desc, p_out_thread);
threadwise_matrix_set_zero(c_k0k2_n1n2_thread_mtx_desc, p_out_thread);
// do work
for(index_t y = 0; y < Y; ++y)
@@ -229,15 +241,15 @@ struct GridwiseConvolutionImplicitGemm_v3_nchw_cyxk_nkhw
c_block_data_on_global = 0;
c_block_data_on_global < C;
c_block_data_on_global += CPerBlock,
p_in_block_ont_global += CPerBlock * in_n_c_h_w_global_desc.GetStride(I1),
p_in_block_on_global += CPerBlock * in_n_c_h_w_global_desc.GetStride(I1),
p_wei_block_on_global += CPerBlock * wei_c_y_x_k_global_desc.GetStride(I0))
{
blockwise_in_copy.run(p_in_block_on_global, p_in_block);
blockwise_wei_copy.run(p_wei_block_on_global, p_wei_block);
blockwise_in_copy.Run(p_in_block_on_global, p_in_block);
blockwise_wei_copy.Run(p_wei_block_on_global, p_wei_block);
__syncthreads();
blockwise_gemm.run(p_wei_block, p_in_block, p_out_thread);
blockwise_gemm.Run(p_wei_block, p_in_block, p_out_thread);
__syncthreads();
}
@@ -253,19 +265,26 @@ struct GridwiseConvolutionImplicitGemm_v3_nchw_cyxk_nkhw
// define tensor descriptor for threadwise copy
// output tensor (also, memory layout) descriptor in register, src of threadwise
// copy
constexpr auto out_k0_k1_k2_n1_b_n2_thread_mem_desc = make_ConstantTensorDescriptor(
Sequence<KPerBlock / (K1 * K2), 1, K2, N1, 1, 1, 1, N2>{});
constexpr auto out_k0_k1_k2_n1_b_n2_thread_mem_desc =
make_ConstantTensorDescriptor_default_rank_packed(
Sequence<KPerBlock / (K1 * K2), 1, K2, N1, 1, N2>{});
// output memory layout descriptor in device memory
constexpr auto out_n0_n1_n2_k0_k1_k2_h_w_global_mem_desc =
out_n_k_h_w_global.Fold(I1, Number<K1>{}, Number<K2>{})
out_n_k_h_w_global_desc.Fold(I1, Number<K1>{}, Number<K2>{})
.Fold(I0, Number<N1>{}, Number<N2>{});
// output merged tensor descriptor in device memory, dst of threadwise copy
constexpr auto out_k0_k1_k2_n1_b_n2_global_merged_desc =
out_n0_n1_n2_k0_k1_k2_h_w_global_mem_desc
.ReorderGivenNew2Old(Sequence<3, 4, 5, 1, 0, 6, 7, 2>{})
.Merge(I4, I6);
make_ConstantMergedTensorDescriptor(
out_n0_n1_n2_k0_k1_k2_h_w_global_mem_desc.ReorderGivenNew2Old(
Sequence<3, 4, 5, 1, 0, 6, 7, 2>{}),
Sequence<0>{},
Sequence<1>{},
Sequence<2>{},
Sequence<3>{},
Sequence<4, 5, 6>{},
Sequence<7>{});
// calculate origin of thread output tensor on global memory
// blockwise GEMM c matrix starting index
@@ -273,18 +292,30 @@ struct GridwiseConvolutionImplicitGemm_v3_nchw_cyxk_nkhw
blockwise_gemm.GetBeginOfThreadMatrixC(get_thread_local_1d_id());
// origin of thread tensor on global
const index_t k_thread_data_on_global k_block_data_on_global +
c_thread_mtx_on_block.row;
const index_t k_thread_data_on_global =
k_block_data_on_global + c_thread_mtx_on_block.row;
const index_t b_thread_data_on_global =
b_block_data_on_global + c_thread_mtx_on_block.col;
// output merged global tensor descriptor, for calculating origin of thread tensor
// in global memory
// output merged global tensor descriptor, for calculating origin of thread tensor
// in global memory
#if 0 // unfolding a merged tensor is not implemented yet
constexpr auto out_k_n1_b_n2_global_merged_desc =
out_k0_k1_k2_n1_b_n2_global_merged_desc.Unfold(I1, I2);
out_k0_k1_k2_n1_b_n2_global_merged_desc.Unfold(I0, I2);
#else
constexpr auto out_k_n1_b_n2_global_merged_desc = make_ConstantMergedTensorDescriptor(
out_n0_n1_n2_k0_k1_k2_h_w_global_mem_desc
.ReorderGivenNew2Old(Sequence<3, 4, 5, 1, 0, 6, 7, 2>{})
.Unfold(I0, I2),
Sequence<0>{},
Sequence<1>{},
Sequence<2, 3, 4>{},
Sequence<5>{});
#endif
// origin of thread tensor in global memory
const index_t p_out_thread_on_global =
Float* p_out_thread_on_global =
p_out_global +
out_k_n1_b_n2_global_merged_desc.GetOffsetFromMultiIndex(
k_thread_data_on_global, 0, 0, 0); // dst origin on merged global tensor
@@ -303,8 +334,8 @@ struct GridwiseConvolutionImplicitGemm_v3_nchw_cyxk_nkhw
0,
b_thread_data_on_global,
0}, // starting point of slice w.r.t. origin of dst
out_k0_k1_k2_n1_b_n2_thread_desc.GetLengths(), // slice lengths
Sequence<2, 3, 4, 0, 5, 1>{} // order of dimension access
out_k0_k1_k2_n1_b_n2_thread_mem_desc.GetLengths(), // slice lengths
Sequence<2, 3, 4, 0, 5, 1>{} // order of dimension access
);
}
}
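
The grid-to-tile mapping in this kernel inverts block_work_desc: a 1d workgroup id is decomposed into a (k tile, b tile) origin. A compile-time sketch with hypothetical shapes (names borrowed from the kernel for readability; there they are variables, not functions):

constexpr int K = 256, KPerBlock = 128;
constexpr int B = 6272, BPerBlock = 16;   // B = (N * Ho * Wo) / (N1 * N2)
constexpr int BBlockWork = B / BPerBlock; // 392

constexpr int k_block_data_on_global(int block_id)
{
    return (block_id / BBlockWork) * KPerBlock;
}

constexpr int b_block_data_on_global(int block_id)
{
    return (block_id % BBlockWork) * BPerBlock;
}

static_assert(k_block_data_on_global(392) == 128 && b_block_data_on_global(392) == 0,
              "workgroup 392 starts the second K tile");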


@@ -8,5 +8,11 @@ struct integral_constant
__host__ __device__ constexpr T Get() const { return value; }
};
template <class T, index_t X, index_t Y>
__host__ __device__ constexpr auto operator+(integral_constant<T, X>, integral_constant<T, Y>)
{
return integral_constant<T, X + Y>{};
}
template <index_t N>
using Number = integral_constant<index_t, N>;
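
With operator+ defined on integral_constant, index arithmetic stays in the type system, so its results remain usable as template arguments. A sketch mirroring the hunk above (standard C++14):

using index_t = int;

template <class T, T v>
struct integral_constant
{
    static constexpr T value = v;
};

template <index_t N>
using Number = integral_constant<index_t, N>;

template <class T, index_t X, index_t Y>
constexpr auto operator+(integral_constant<T, X>, integral_constant<T, Y>)
{
    return integral_constant<T, X + Y>{};
}

constexpr auto three = Number<1>{} + Number<2>{};
static_assert(decltype(three)::value == 3, "computed at compile time");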


@@ -10,7 +10,7 @@ __device__ void threadwise_matrix_set_zero(Matrix, Float* __restrict__ p_thread)
for(index_t j = 0; j < Matrix::NCol(); ++j)
{
const index_t id = Matrix::GetOffsetFromMultiIndex(i, j);
p_thread[id] = 0;
p_thread[id] = Float(0);
}
}
}


@@ -19,7 +19,7 @@ __device__ void threadwise_tensor_slice_copy(SrcDesc,
constexpr auto src_desc = SrcDesc{};
constexpr auto dst_desc = DstDesc{};
constexpr auto ref_desc = make_packed_ConstantTensorDescriptor(SrcOpLengths{});
constexpr auto ref_desc = make_ConstantTensorDescriptor_default_rank_packed(SrcOpLengths{});
#if 0
if(get_thread_local_1d_id() == 0 && get_block_1d_id() == 0)
@@ -194,16 +194,19 @@ threadwise_tensor_slice_copy_reorder_given_dst2src_v3(SrcDesc,
}
template <class Float, class SrcDesc, class DstDesc, class SliceLengths, class DimAccessOrder>
__device__ void
threadwise_tensor_slice_copy_generic(SrcDesc,
const Float* __restrict__ p_src,
Array<index_t, SrcDesc::GetNumOfDimension()> src_multi_offset,
DstDesc,
Float* __restrict__ p_dst,
Array<index_t, DstDesc::GetNumOfDimension()> dst_multi_offset,
SliceLengths,
DimAccessOrder)
__device__ void threadwise_tensor_slice_copy_generic(
SrcDesc,
const Float* __restrict__ p_src,
Array<index_t, SrcDesc::GetNumOfDimension()> src_multi_id_begin,
DstDesc,
Float* __restrict__ p_dst,
Array<index_t, DstDesc::GetNumOfDimension()> dst_multi_id_begin,
SliceLengths,
DimAccessOrder)
{
static_assert(SrcDesc::GetNumOfDimension() == DstDesc::GetNumOfDimension(),
"wrong! # of dimensions not the same");
constexpr auto src_desc = SrcDesc{};
constexpr auto dst_desc = DstDesc{};
@@ -215,9 +218,10 @@ threadwise_tensor_slice_copy_generic(SrcDesc,
reorder_array_given_old2new(data_multi_id_in_access_order, DimAccessOrder{});
const index_t dst_index =
dst_desc.GetOffsetFromMultiIndex(src_multi_offset + data_multi_id);
dst_desc.GetOffsetFromMultiIndex(src_multi_id_begin + data_multi_id);
const index_t src_index =
src_desc.GetOffsetFromMultiIndex(dst_multi_offset + data_multi_id);
src_desc.GetOffsetFromMultiIndex(dst_multi_id_begin + data_multi_id);
p_dst[dst_index] = p_src[src_index];
});
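
The generic copy walks every point of the slice and offsets each side before linearizing through the descriptor. A host-side sketch of that core loop (illustrative: the DimAccessOrder reordering is omitted, and each pointer is paired with its own begin index):

#include <array>

template <int N>
int linear_offset(const std::array<int, N>& strides, const std::array<int, N>& idx)
{
    int off = 0;
    for(int i = 0; i < N; ++i)
        off += idx[i] * strides[i];
    return off;
}

template <int N>
void copy_slice(const float* src,
                const std::array<int, N>& src_strides,
                const std::array<int, N>& src_begin,
                float* dst,
                const std::array<int, N>& dst_strides,
                const std::array<int, N>& dst_begin,
                const std::array<int, N>& lengths)
{
    std::array<int, N> id{}; // odometer over all multi-indices in `lengths`
    for(;;)
    {
        std::array<int, N> s = id, d = id;
        for(int i = 0; i < N; ++i)
        {
            s[i] += src_begin[i];
            d[i] += dst_begin[i];
        }
        dst[linear_offset<N>(dst_strides, d)] = src[linear_offset<N>(src_strides, s)];

        int i = N - 1;
        while(i >= 0 && ++id[i] == lengths[i])
        {
            id[i] = 0;
            --i;
        }
        if(i < 0)
            break;
    }
}

int main()
{
    float src[12] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11};
    float dst[12] = {};
    // copy the 2x2 sub-tile at (1, 1) of a 3x4 tensor to (0, 0) of another
    copy_slice<2>(src, {4, 1}, {1, 1}, dst, {4, 1}, {0, 0}, {2, 2});
    // dst now begins {5, 6, 0, 0, 9, 10, ...}
}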


@@ -1,6 +1,6 @@
#pragma once
#include "config.h"
#include "constant_integral.hip.hpp"
#include "integral_constant.hip.hpp"
template <class T, index_t N>
struct vector_type