Mirror of https://github.com/ROCm/composable_kernel.git
Merge pull request #14 from ROCmSoftwarePlatform/miopen_downstream_init_integration
MIOpen Downstream: Initial integration 2nd PR
@@ -8,7 +8,7 @@ namespace ck {
 
 template <typename Lengths,
           typename ArrangeOrder = typename arithmetic_sequence_gen<0, Lengths::Size(), 1>::type>
-__host__ __device__ constexpr auto make_cluster_descriptor_v2(
+__host__ __device__ constexpr auto make_cluster_descriptor(
     const Lengths& lengths,
     ArrangeOrder order = typename arithmetic_sequence_gen<0, Lengths::Size(), 1>::type{})
 {
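
The function renamed here, make_cluster_descriptor, builds the compile-time map from a block's flat thread id to per-dimension cluster coordinates; the ArrangeOrder argument (defaulting to 0, 1, ..., N-1) picks which dimension varies fastest. A standalone sketch of the index math it encapsulates, with hypothetical names rather than the CK API:

#include <array>
#include <cstddef>

// Decompose a flat thread id into per-dimension coordinates. order[k] names
// the dimension that varies k-th slowest, so the last entry of `order` is the
// fastest-varying (innermost) dimension.
template <std::size_t N>
constexpr std::array<std::size_t, N>
decompose_thread_id(std::size_t tid,
                    const std::array<std::size_t, N>& lengths,
                    const std::array<std::size_t, N>& order)
{
    std::array<std::size_t, N> coord{};
    for(std::size_t k = N; k-- > 0;) // peel off the innermost dimension first
    {
        const std::size_t dim = order[k];
        coord[dim]            = tid % lengths[dim];
        tid /= lengths[dim];
    }
    return coord;
}

// lengths {4, 8}, order {0, 1}: thread id 13 lands at coordinate {1, 5}.
static_assert(decompose_thread_id<2>(13, {4, 8}, {0, 1})[0] == 1, "");
static_assert(decompose_thread_id<2>(13, {4, 8}, {0, 1})[1] == 5, "");
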
@@ -481,11 +481,11 @@ struct Merge_v1_carry_check
     using LowerIndex = MultiIndex<NDimLow>;
     using UpperIndex = MultiIndex<1>;
 
-    using LowLengthsScan = decltype(
-        container_reverse_exclusive_scan(LowLengths{}, math::multiplies_v2{}, Number<1>{}));
+    using LowLengthsScan =
+        decltype(container_reverse_exclusive_scan(LowLengths{}, math::multiplies{}, Number<1>{}));
 
     using UpLengths =
-        decltype(make_tuple(container_reduce(LowLengths{}, math::multiplies_v2{}, Number<1>{})));
+        decltype(make_tuple(container_reduce(LowLengths{}, math::multiplies{}, Number<1>{})));
 
     LowLengths low_lengths_;
     LowLengthsScan low_lengths_scan_;
@@ -496,8 +496,8 @@ struct Merge_v1_carry_check
     __host__ __device__ constexpr Merge_v1_carry_check(const LowLengths& low_lengths)
         : low_lengths_{low_lengths},
           low_lengths_scan_{
-              container_reverse_exclusive_scan(low_lengths, math::multiplies_v2{}, Number<1>{})},
-          up_lengths_{make_tuple(container_reduce(low_lengths, math::multiplies_v2{}, Number<1>{}))}
+              container_reverse_exclusive_scan(low_lengths, math::multiplies{}, Number<1>{})},
+          up_lengths_{make_tuple(container_reduce(low_lengths, math::multiplies{}, Number<1>{}))}
     {
         static_assert(LowerIndex::Size() == NDimLow, "wrong!");
     }
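
The scan/reduce pair in Merge_v1_carry_check precomputes, at compile time, the strides of the merged dimensions (low_lengths_scan_) and their total length (up_lengths_). The same arithmetic over plain runtime arrays, as a sketch:

#include <array>
#include <cstddef>

// Reverse exclusive scan with multiplication: strides[i] is the product of
// all lengths to the right of i, seeded with 1. For lengths {2, 3, 4} this
// yields {12, 4, 1}; the matching reduce (the plain product) is 24.
template <std::size_t N>
constexpr std::array<std::size_t, N>
reverse_exclusive_scan_mul(const std::array<std::size_t, N>& lengths)
{
    std::array<std::size_t, N> strides{};
    std::size_t running = 1;
    for(std::size_t i = N; i-- > 0;)
    {
        strides[i] = running; // exclusive: lengths[i] not yet folded in
        running *= lengths[i];
    }
    return strides;
}

static_assert(reverse_exclusive_scan_mul<3>({2, 3, 4})[0] == 12, "");
static_assert(reverse_exclusive_scan_mul<3>({2, 3, 4})[2] == 1, "");
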
@@ -1037,7 +1037,7 @@ struct Merge_v2_magic_division
     using UpperIndex = MultiIndex<1>;
 
     using UpLengths =
-        decltype(make_tuple(container_reduce(LowLengths{}, math::multiplies_v2{}, Number<1>{})));
+        decltype(make_tuple(container_reduce(LowLengths{}, math::multiplies{}, Number<1>{})));
 
     using LowLengthsMagicDivisorMultipiler = decltype(
         generate_tuple(lambda_merge_generate_MagicDivision_calculate_magic_multiplier<LowLengths>{},
@@ -1062,7 +1062,7 @@ struct Merge_v2_magic_division
           low_lengths_magic_divisor_shift_{generate_tuple(
               [&](auto i) { return MagicDivision::CalculateMagicShift(low_lengths[i]); },
               Number<NDimLow>{})},
-          up_lengths_{make_tuple(container_reduce(low_lengths, math::multiplies_v2{}, Number<1>{}))}
+          up_lengths_{make_tuple(container_reduce(low_lengths, math::multiplies{}, Number<1>{}))}
     {
         static_assert(LowerIndex::Size() == NDimLow, "wrong!");
     }
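
Merge_v2_magic_division performs the same index decomposition as the carry-check variant but replaces runtime division by the lengths with precomputed multiply-and-shift "magic number" division. A minimal host-side sketch of one standard construction, in the spirit of Granlund-Montgomery; CK's MagicDivision may differ in details such as the supported divisor and dividend ranges:

#include <cstdint>

struct MagicDiv
{
    uint32_t multiplier;
    uint32_t shift;
};

// Precompute multiplier/shift for divisor d (1 <= d < 2^31). Then, for
// dividends n < 2^31: n / d == (mulhi(n, multiplier) + n) >> shift.
constexpr MagicDiv make_magic(uint32_t d)
{
    uint32_t shift = 0;
    while((uint32_t{1} << shift) < d)
        ++shift;
    const uint64_t m = ((uint64_t{1} << 32) * ((uint64_t{1} << shift) - d)) / d + 1;
    return {static_cast<uint32_t>(m), shift};
}

constexpr uint32_t magic_div(uint32_t n, MagicDiv md)
{
    // mulhi: high 32 bits of the 32 x 32 -> 64 bit product.
    const uint32_t hi = static_cast<uint32_t>((uint64_t{n} * md.multiplier) >> 32);
    return (hi + n) >> md.shift;
}

static_assert(magic_div(100, make_magic(7)) == 14, "100 / 7");
static_assert(magic_div(9, make_magic(3)) == 3, "9 / 3");
static_assert(magic_div(42, make_magic(1)) == 42, "identity divisor");
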
@@ -1188,11 +1188,11 @@ struct Merge_v2r2_magic_division
     using LowerIndex = MultiIndex<NDimLow>;
     using UpperIndex = MultiIndex<1>;
 
-    using LowLengthsScan = decltype(
-        container_reverse_exclusive_scan(LowLengths{}, math::multiplies_v2{}, Number<1>{}));
+    using LowLengthsScan =
+        decltype(container_reverse_exclusive_scan(LowLengths{}, math::multiplies{}, Number<1>{}));
 
     using UpLengths =
-        decltype(make_tuple(container_reduce(LowLengths{}, math::multiplies_v2{}, Number<1>{})));
+        decltype(make_tuple(container_reduce(LowLengths{}, math::multiplies{}, Number<1>{})));
 
     using LowLengthsScanMagicDivisorMultipiler = decltype(generate_tuple(
         lambda_merge_generate_MagicDivision_calculate_magic_multiplier<LowLengthsScan>{},
@@ -1213,14 +1213,14 @@ struct Merge_v2r2_magic_division
     __host__ __device__ constexpr Merge_v2r2_magic_division(const LowLengths& low_lengths)
         : low_lengths_{low_lengths},
           low_lengths_scan_{
-              container_reverse_exclusive_scan(low_lengths, math::multiplies_v2{}, Number<1>{})},
+              container_reverse_exclusive_scan(low_lengths, math::multiplies{}, Number<1>{})},
           low_lengths_scan_magic_divisor_multiplier_{generate_tuple(
               [&](auto i) { return MagicDivision::CalculateMagicMultiplier(low_lengths_scan_[i]); },
               Number<NDimLow>{})},
           low_lengths_scan_magic_divisor_shift_{generate_tuple(
               [&](auto i) { return MagicDivision::CalculateMagicShift(low_lengths_scan_[i]); },
               Number<NDimLow>{})},
-          up_lengths_{make_tuple(container_reduce(low_lengths, math::multiplies_v2{}, Number<1>{}))}
+          up_lengths_{make_tuple(container_reduce(low_lengths, math::multiplies{}, Number<1>{}))}
     {
         static_assert(LowerIndex::Size() == NDimLow, "wrong!");
     }
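
Both merge structs implement the same index map: an upper (merged) index u decomposes into lower indices through the scanned strides, low[i] = (u / stride[i]) % length[i]. The magic-division variants simply evaluate those divisions with the precomputed multiplier/shift pairs held in the members above. A scalar sketch with illustrative names:

#include <array>
#include <cstddef>

// lengths {L0, L1, L2} have strides {L1*L2, L2, 1} (the reverse exclusive
// scan shown earlier); low[i] = (u / strides[i]) % lengths[i].
template <std::size_t N>
constexpr std::array<std::size_t, N>
merge_to_lower(std::size_t u,
               const std::array<std::size_t, N>& lengths,
               const std::array<std::size_t, N>& strides)
{
    std::array<std::size_t, N> low{};
    for(std::size_t i = 0; i < N; ++i)
        low[i] = (u / strides[i]) % lengths[i];
    return low;
}

// lengths {2, 3, 4}, strides {12, 4, 1}: u = 17 maps to {1, 1, 1},
// and indeed 1 * 12 + 1 * 4 + 1 * 1 == 17.
static_assert(merge_to_lower<3>(17, {2, 3, 4}, {12, 4, 1})[0] == 1, "");
static_assert(merge_to_lower<3>(17, {2, 3, 4}, {12, 4, 1})[2] == 1, "");
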
@@ -1336,7 +1336,7 @@ struct UnMerge
     using UpperIndex = MultiIndex<NDimUp>;
 
     using UpLengthsScan =
-        decltype(container_reverse_exclusive_scan(UpLengths{}, math::multiplies_v2{}, Number<1>{}));
+        decltype(container_reverse_exclusive_scan(UpLengths{}, math::multiplies{}, Number<1>{}));
 
     UpLengths up_lengths_;
     UpLengthsScan up_lengths_scan_;
@@ -1346,7 +1346,7 @@ struct UnMerge
     __host__ __device__ constexpr UnMerge(const UpLengths& up_lengths)
         : up_lengths_{up_lengths},
           up_lengths_scan_{
-              container_reverse_exclusive_scan(up_lengths, math::multiplies_v2{}, Number<1>{})}
+              container_reverse_exclusive_scan(up_lengths, math::multiplies{}, Number<1>{})}
     {
     }
 
@@ -64,7 +64,7 @@ struct TensorAdaptor
             Number<ndim_top_>{});
 
        // TODO: make container_reduce support tuple of Number and index_t
-        return container_reduce(lengths, math::multiplies_v2{}, Number<1>{});
+        return container_reduce(lengths, math::multiplies{}, Number<1>{});
     }
 
     template <index_t IDim>
@@ -69,7 +69,7 @@ struct TensorDescriptor
             Number<ndim_visible_>{});
 
        // TODO: make container_reduce support tuple of Number and index_t
-        return container_reduce(lengths, math::multiplies_v2{}, Number<1>{});
+        return container_reduce(lengths, math::multiplies{}, Number<1>{});
     }
 
     template <index_t IDim>
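
In both TensorAdaptor and TensorDescriptor the element count is the lengths folded with multiplication, seeded with Number<1>{}; the TODO notes that the fold should eventually accept tuples mixing Number<> and index_t. The runtime equivalent of that reduce, as a sketch:

#include <array>
#include <cstddef>

template <std::size_t N>
constexpr std::size_t element_count(const std::array<std::size_t, N>& lengths)
{
    std::size_t n = 1; // the Number<1>{} seed
    for(std::size_t i = 0; i < N; ++i)
        n *= lengths[i];
    return n;
}

static_assert(element_count<3>({2, 3, 4}) == 24, "");
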
@@ -38,8 +38,8 @@ __host__ __device__ constexpr auto calculate_element_space_size_impl(const Lengt
 template <typename... Lengths,
           typename... Strides,
           typename enable_if<sizeof...(Lengths) == sizeof...(Strides), bool>::type = false>
-__host__ __device__ constexpr auto make_naive_tensor_descriptor_v2(const Tuple<Lengths...>& lengths,
+__host__ __device__ constexpr auto make_naive_tensor_descriptor(const Tuple<Lengths...>& lengths,
                                                                 const Tuple<Strides...>& strides)
 {
     constexpr index_t N = sizeof...(Lengths);
 
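
make_naive_tensor_descriptor (again minus the _v2 suffix) wraps an explicit lengths/strides pair, while the _packed variant below derives dense strides itself. Stripped of the compile-time machinery, a naive descriptor is the usual affine offset map, sketched here with hypothetical names:

#include <array>
#include <cstddef>

// offset(idx) = sum_k idx[k] * strides[k]; the packed case is the special
// case strides == reverse_exclusive_scan_mul(lengths).
template <std::size_t N>
constexpr std::size_t offset(const std::array<std::size_t, N>& idx,
                             const std::array<std::size_t, N>& strides)
{
    std::size_t off = 0;
    for(std::size_t k = 0; k < N; ++k)
        off += idx[k] * strides[k];
    return off;
}

static_assert(offset<2>({1, 2}, {8, 1}) == 10, "element (1, 2) of a view with row pitch 8");
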
@@ -100,7 +100,7 @@ make_naive_tensor_descriptor_packed(const Tuple<Lengths...>& lengths)
 
     constexpr auto visible_dim_hidden_ids = typename arithmetic_sequence_gen<1, N + 1, 1>::type{};
 
-    const auto element_space_size = container_reduce(lengths, math::multiplies_v2{}, Number<1>{});
+    const auto element_space_size = container_reduce(lengths, math::multiplies{}, Number<1>{});
 
     return TensorDescriptor<remove_cv_t<decltype(transforms)>,
                             remove_cv_t<decltype(low_dim_hidden_idss)>,
@@ -112,7 +112,7 @@ make_naive_tensor_descriptor_packed(const Tuple<Lengths...>& lengths)
 
 template <typename... Lengths, typename Align>
 __host__ __device__ constexpr auto
-make_naive_tensor_descriptor_aligned_v2(const Tuple<Lengths...>& lengths, Align align)
+make_naive_tensor_descriptor_aligned(const Tuple<Lengths...>& lengths, Align align)
 {
     constexpr auto I1 = Number<1>{};
 
@@ -133,7 +133,7 @@ make_naive_tensor_descriptor_aligned_v2(const Tuple<Lengths...>& lengths, Align
             else
             {
                 return container_reduce(lengths,
-                                        math::multiplies_v2{},
+                                        math::multiplies{},
                                         Number<stride_n_minus_2>{},
                                         i + I1,
                                         Number<N - 1>{},
@@ -142,7 +142,7 @@ make_naive_tensor_descriptor_aligned_v2(const Tuple<Lengths...>& lengths, Align
         },
         Number<N>{});
 
-    return make_naive_tensor_descriptor_v2(lengths, strides);
+    return make_naive_tensor_descriptor(lengths, strides);
 }
 
 } // namespace ck
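
make_naive_tensor_descriptor_aligned rounds the second-innermost stride up to the given alignment, so each innermost row of the tile starts on an aligned boundary, and the container_reduce above then builds the outer strides from that padded pitch. A sketch of the stride rule for the 2D case, as read from this code (illustrative, not the CK helper itself):

#include <cstddef>

// Round n up to the next multiple of align.
constexpr std::size_t least_multiple(std::size_t n, std::size_t align)
{
    return ((n + align - 1) / align) * align;
}

// 2D case: lengths {M, N} with alignment A gives strides
// {least_multiple(N, A), 1}, i.e. rows padded to an aligned pitch.
static_assert(least_multiple(5, 4) == 8, "row pitch for N = 5, A = 4");
static_assert(least_multiple(8, 4) == 8, "already aligned: no padding");
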
@@ -143,7 +143,7 @@ struct BlockwiseTensorSliceTransfer_v4
 
     private:
     static constexpr auto thread_cluster_desc_ =
-        make_cluster_descriptor_v2(ThreadClusterLengths{}, ThreadClusterArrangeOrder{});
+        make_cluster_descriptor(ThreadClusterLengths{}, ThreadClusterArrangeOrder{});
 
     using ThreadwiseTransfer =
         ThreadwiseTensorSliceTransfer_v3<ThreadSliceLengths,
@@ -131,7 +131,7 @@ struct BlockwiseTensorSliceTransfer_v4r1
 
     private:
     static constexpr auto thread_cluster_desc_ =
-        make_cluster_descriptor_v2(ThreadClusterLengths{}, ThreadClusterArrangeOrder{});
+        make_cluster_descriptor(ThreadClusterLengths{}, ThreadClusterArrangeOrder{});
 
     using ThreadwiseTransfer =
         ThreadwiseTensorSliceTransfer_v3r1<ThreadSliceLengths,
@@ -110,13 +110,13 @@ struct GridwiseContractionDlops_A_GK0_GM0_GM1_GK1_B_GK0_GN0_GN1_GK1_C_GM0_GM1_GN
 
         // A matrix in LDS memory, dst of blockwise copy
         // be careful of LDS alignment
-        constexpr auto a_block_desc_gk0_gm0_gm10_gm11_gk1 = make_naive_tensor_descriptor_aligned_v2(
+        constexpr auto a_block_desc_gk0_gm0_gm10_gm11_gk1 = make_naive_tensor_descriptor_aligned(
             make_tuple(Number<GK0PerBlock>{}, GM0, I1, Number<GM1PerBlockGM11>{}, GK1),
             max_lds_align);
 
         // B matrix in LDS memory, dst of blockwise copy
         // be careful of LDS alignment
-        constexpr auto b_block_desc_gk0_gn0_gn10_gn11_gk1 = make_naive_tensor_descriptor_aligned_v2(
+        constexpr auto b_block_desc_gk0_gn0_gn10_gn11_gk1 = make_naive_tensor_descriptor_aligned(
             make_tuple(Number<GK0PerBlock>{}, GN0, I1, Number<GN1PerBlockGN11>{}, GK1),
             max_lds_align);
 
@@ -248,10 +248,10 @@ struct GridwiseContractionDlops_A_GK0_GM0_GM1_GK1_B_GK0_GN0_GN1_GK1_C_GM0_GM1_GN
         constexpr auto BN = GN0 * GN11;
 
         constexpr auto BM1 =
-            Number<container_reduce(BM10BN10ThreadClusterBM10Xs{}, math::multiplies_v2{}, I1) *
+            Number<container_reduce(BM10BN10ThreadClusterBM10Xs{}, math::multiplies{}, I1) *
                    BM1PerThreadBM11>{};
         constexpr auto BN1 =
-            Number<container_reduce(BM10BN10ThreadClusterBN10Xs{}, math::multiplies_v2{}, I1) *
+            Number<container_reduce(BM10BN10ThreadClusterBN10Xs{}, math::multiplies{}, I1) *
                    BN1PerThreadBN11>{};
 
         constexpr auto BM0 = BM / BM1;
@@ -354,24 +354,24 @@ struct GridwiseContractionDlops_A_GK0_GM0_GM1_GK1_B_GK0_GN0_GN1_GK1_C_GM0_GM1_GN
 
         // A matrix in LDS memory, dst of blockwise copy
         // be careful of LDS alignment
-        constexpr auto a_block_desc_gk0_gm0_gm10_gm11_gk1 = make_naive_tensor_descriptor_aligned_v2(
+        constexpr auto a_block_desc_gk0_gm0_gm10_gm11_gk1 = make_naive_tensor_descriptor_aligned(
             make_tuple(Number<GK0PerBlock>{}, GM0, I1, Number<GM1PerBlockGM11>{}, GK1),
             max_lds_align);
 
         // B matrix in LDS memory, dst of blockwise copy
         // be careful of LDS alignment
-        constexpr auto b_block_desc_gk0_gn0_gn10_gn11_gk1 = make_naive_tensor_descriptor_aligned_v2(
+        constexpr auto b_block_desc_gk0_gn0_gn10_gn11_gk1 = make_naive_tensor_descriptor_aligned(
             make_tuple(Number<GK0PerBlock>{}, GN0, I1, Number<GN1PerBlockGN11>{}, GK1),
             max_lds_align);
 
         // A matrix in LDS memory for blockwise GEMM
         // be careful of LDS alignment
-        constexpr auto a_block_desc_gk0_bm_gk1 = make_naive_tensor_descriptor_aligned_v2(
+        constexpr auto a_block_desc_gk0_bm_gk1 = make_naive_tensor_descriptor_aligned(
             make_tuple(Number<GK0PerBlock>{}, GM0 * Number<GM1PerBlockGM11>{}, GK1), max_lds_align);
 
         // B matrix in LDS memory for blockwise GEMM
         // be careful of LDS alignment
-        constexpr auto b_block_desc_gk0_bn_gk1 = make_naive_tensor_descriptor_aligned_v2(
+        constexpr auto b_block_desc_gk0_bn_gk1 = make_naive_tensor_descriptor_aligned(
             make_tuple(Number<GK0PerBlock>{}, GN0 * Number<GN1PerBlockGN11>{}, GK1), max_lds_align);
 
         static_assert(a_block_desc_gk0_gm0_gm10_gm11_gk1.GetElementSpaceSize() ==
@@ -166,12 +166,12 @@ struct GridwiseGemmDlops_km_kn_mn_v1r2
 
         // A matrix in LDS memory, dst of blockwise copy
         // be careful of LDS alignment
-        constexpr auto a_k_m_block_desc = make_naive_tensor_descriptor_aligned_v2(
+        constexpr auto a_k_m_block_desc = make_naive_tensor_descriptor_aligned(
            make_tuple(Number<KPerBlock>{}, Number<MPerBlockM1>{}), max_lds_align);
 
         // B matrix in LDS memory, dst of blockwise copy
         // be careful of LDS alignment
-        constexpr auto b_k_n_block_desc = make_naive_tensor_descriptor_aligned_v2(
+        constexpr auto b_k_n_block_desc = make_naive_tensor_descriptor_aligned(
            make_tuple(Number<KPerBlock>{}, Number<NPerBlockN1>{}), max_lds_align);
 
         // LDS allocation for A and B: be careful of alignment
@@ -351,22 +351,22 @@ struct GridwiseGemmDlops_km_kn_mn_v1r2
 
         // A matrix in LDS memory, dst of blockwise copy
         // be careful of LDS alignment
-        constexpr auto a_k_m_block_desc = make_naive_tensor_descriptor_aligned_v2(
+        constexpr auto a_k_m_block_desc = make_naive_tensor_descriptor_aligned(
            make_tuple(Number<KPerBlock>{}, Number<MPerBlockM1>{}), max_lds_align);
 
         // B matrix in LDS memory, dst of blockwise copy
         // be careful of LDS alignment
-        constexpr auto b_k_n_block_desc = make_naive_tensor_descriptor_aligned_v2(
+        constexpr auto b_k_n_block_desc = make_naive_tensor_descriptor_aligned(
            make_tuple(Number<KPerBlock>{}, Number<NPerBlockN1>{}), max_lds_align);
 
         // A matrix in LDS memory, dst of blockwise copy
         // be careful of LDS alignment
-        constexpr auto a_k_m0_m1_block_desc = make_naive_tensor_descriptor_aligned_v2(
+        constexpr auto a_k_m0_m1_block_desc = make_naive_tensor_descriptor_aligned(
            make_tuple(Number<KPerBlock>{}, I1, Number<MPerBlockM1>{}), max_lds_align);
 
         // B matrix in LDS memory, dst of blockwise copy
         // be careful of LDS alignment
-        constexpr auto b_k_n0_n1_block_desc = make_naive_tensor_descriptor_aligned_v2(
+        constexpr auto b_k_n0_n1_block_desc = make_naive_tensor_descriptor_aligned(
            make_tuple(Number<KPerBlock>{}, I1, Number<NPerBlockN1>{}), max_lds_align);
 
         // A matrix blockwise copy
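
Each LDS tile here is described twice: a multi-dimensional view for the blockwise copy (for example [K, 1, M]) and a flatter view for the blockwise GEMM (for example [K, M]). Because both are built with the same max_lds_align, their element space sizes agree, which the nearby static_asserts verify. Illustrative arithmetic only, not the kernel's actual template parameters:

#include <cstddef>

constexpr std::size_t least_multiple(std::size_t n, std::size_t a)
{
    return ((n + a - 1) / a) * a;
}

// With a common alignment A on the innermost stride, both views of a
// [K0 x M1] tile occupy K0 * least_multiple(M1, A) elements.
constexpr std::size_t lds_elems(std::size_t k0, std::size_t m1, std::size_t a)
{
    return k0 * least_multiple(m1, a);
}

static_assert(lds_elems(8, 64, 4) == 8 * 64, "aligned shape: no padding");
static_assert(lds_elems(8, 66, 4) == 8 * 68, "padded pitch");
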
@@ -163,12 +163,12 @@ struct GridwiseGemmDlops_km_kn_mn_v1r3
 
         // TODO: check alignment
         // A matrix in LDS memory, dst of blockwise copy
-        constexpr auto a_k_m_block_desc = make_naive_tensor_descriptor_aligned_v2(
+        constexpr auto a_k_m_block_desc = make_naive_tensor_descriptor_aligned(
            make_tuple(Number<KPerBlock>{}, Number<MPerBlockM1>{}, K1), max_lds_align);
 
         // TODO: check alignment
         // B matrix in LDS memory, dst of blockwise copy
-        constexpr auto b_k_n_block_desc = make_naive_tensor_descriptor_aligned_v2(
+        constexpr auto b_k_n_block_desc = make_naive_tensor_descriptor_aligned(
            make_tuple(Number<KPerBlock>{}, Number<NPerBlockN1>{}, K1), max_lds_align);
 
         // TODO: check alignment
@@ -274,10 +274,10 @@ struct GridwiseGemmDlops_km_kn_mn_v1r3
         const auto N0 = N / N1;
 
         constexpr auto M11 =
-            Number<container_reduce(M11N11ThreadClusterM110Xs{}, math::multiplies_v2{}, I1) *
+            Number<container_reduce(M11N11ThreadClusterM110Xs{}, math::multiplies{}, I1) *
                    M1PerThreadM111>{};
         constexpr auto N11 =
-            Number<container_reduce(M11N11ThreadClusterN110Xs{}, math::multiplies_v2{}, I1) *
+            Number<container_reduce(M11N11ThreadClusterN110Xs{}, math::multiplies{}, I1) *
                    N1PerThreadN111>{};
 
         constexpr auto M10 = M1 / M11;
@@ -354,23 +354,23 @@ struct GridwiseGemmDlops_km_kn_mn_v1r3
         // TODO: check alignment
         // A matrix in LDS memory, dst of blockwise copy
         // be careful of LDS alignment
-        constexpr auto a_k0_m0_m1_k1_block_desc = make_naive_tensor_descriptor_aligned_v2(
+        constexpr auto a_k0_m0_m1_k1_block_desc = make_naive_tensor_descriptor_aligned(
            make_tuple(Number<KPerBlock>{}, I1, Number<MPerBlockM1>{}, K1), max_lds_align);
 
         // TODO: check alignment
         // B matrix in LDS memory, dst of blockwise copy
         // be careful of LDS alignment
-        constexpr auto b_k0_n0_n1_k1_block_desc = make_naive_tensor_descriptor_aligned_v2(
+        constexpr auto b_k0_n0_n1_k1_block_desc = make_naive_tensor_descriptor_aligned(
            make_tuple(Number<KPerBlock>{}, I1, Number<NPerBlockN1>{}, K1), max_lds_align);
 
         // TODO: check alignment
         // A matrix in LDS memory, for blockwise GEMM
-        constexpr auto a_k0_m_k1_block_desc = make_naive_tensor_descriptor_aligned_v2(
+        constexpr auto a_k0_m_k1_block_desc = make_naive_tensor_descriptor_aligned(
            make_tuple(Number<KPerBlock>{}, Number<MPerBlockM1>{}, K1), max_lds_align);
 
         // TODO: check alignment
         // B matrix in LDS memory, for blockwise GEMM
-        constexpr auto b_k0_n_k1_block_desc = make_naive_tensor_descriptor_aligned_v2(
+        constexpr auto b_k0_n_k1_block_desc = make_naive_tensor_descriptor_aligned(
            make_tuple(Number<KPerBlock>{}, Number<NPerBlockN1>{}, K1), max_lds_align);
 
         static_assert(a_k0_m0_m1_k1_block_desc.GetElementSpaceSize() ==
@@ -58,7 +58,7 @@ struct GridwiseGemmDlops_km_kn_mn_v3
 
         // A matrix in LDS memory, dst of blockwise copy
         // be careful of LDS alignment
-        constexpr auto a_e_k_desc = make_naive_tensor_descriptor_aligned_v2(
+        constexpr auto a_e_k_desc = make_naive_tensor_descriptor_aligned(
            make_tuple(Number<E>{}, Number<KPerBlock>{}), max_lds_align);
 
         // LDS allocation for A and B: be careful of alignment
@@ -132,10 +132,10 @@ struct GridwiseGemmDlops_km_kn_mn_v3
 
         // A matrix in LDS memory, dst of blockwise copy
         // be careful of LDS alignment
-        constexpr auto a_e_k_block_desc = make_naive_tensor_descriptor_aligned_v2(
+        constexpr auto a_e_k_block_desc = make_naive_tensor_descriptor_aligned(
            make_tuple(Number<EPerBlock>{}, Number<KPerBlock>{}), max_lds_align);
 
-        constexpr auto a_e_k_desc = make_naive_tensor_descriptor_aligned_v2(
+        constexpr auto a_e_k_desc = make_naive_tensor_descriptor_aligned(
            make_tuple(Number<E>{}, Number<KPerBlock>{}), max_lds_align);
 
         // B matrix in LDS memory, dst of blockwise copy
@@ -148,12 +148,12 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_xdlops_v2r3
 
         // A matrix in LDS memory, dst of blockwise copy
         // be careful of LDS alignment
-        constexpr auto a_k0_m_k1_block_desc = make_naive_tensor_descriptor_aligned_v2(
+        constexpr auto a_k0_m_k1_block_desc = make_naive_tensor_descriptor_aligned(
            make_tuple(Number<KPerBlock>{}, Number<MPerBlock>{}, K1), max_lds_align);
 
         // B matrix in LDS memory, dst of blockwise copy
         // be careful of LDS alignment
-        constexpr auto b_k0_n_k1_block_desc = make_naive_tensor_descriptor_aligned_v2(
+        constexpr auto b_k0_n_k1_block_desc = make_naive_tensor_descriptor_aligned(
            make_tuple(Number<KPerBlock>{}, Number<NPerBlock>{}, K1), max_lds_align);
 
         // LDS allocation for A and B: be careful of alignment
@@ -290,12 +290,12 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_xdlops_v2r3
 
         // A matrix in LDS memory, dst of blockwise copy
         // be careful of LDS alignment
-        constexpr auto a_k0_m_k1_block_desc = make_naive_tensor_descriptor_aligned_v2(
+        constexpr auto a_k0_m_k1_block_desc = make_naive_tensor_descriptor_aligned(
            make_tuple(Number<KPerBlock>{}, Number<MPerBlock>{}, K1), max_lds_align);
 
         // B matrix in LDS memory, dst of blockwise copy
         // be careful of LDS alignment
-        constexpr auto b_k0_n_k1_block_desc = make_naive_tensor_descriptor_aligned_v2(
+        constexpr auto b_k0_n_k1_block_desc = make_naive_tensor_descriptor_aligned(
            make_tuple(Number<KPerBlock>{}, Number<NPerBlock>{}, K1), max_lds_align);
 
         // A matrix blockwise copy
@@ -91,13 +91,13 @@ struct ThreadwiseTensorSliceTransfer_v3r1
             container_reverse_exclusive_scan(
                 container_reorder_given_new2old(src_vector_tensor_lengths,
                                                 SrcVectorTensorContiguousDimOrder{}),
-                math::multiplies_v2{},
+                math::multiplies{},
                 I1),
             SrcVectorTensorContiguousDimOrder{});
 
         constexpr auto src_vector_desc =
-            make_naive_tensor_descriptor_v2(sequence_to_tuple_of_number(src_vector_tensor_lengths),
+            make_naive_tensor_descriptor(sequence_to_tuple_of_number(src_vector_tensor_lengths),
                                          sequence_to_tuple_of_number(src_vector_tensor_strides));
 
         // access order and lengths
         constexpr auto src_access_lengths = SliceLengths{} / src_vector_tensor_lengths;
@@ -259,13 +259,13 @@ struct ThreadwiseTensorSliceTransfer_v3r1
             container_reverse_exclusive_scan(
                 container_reorder_given_new2old(dst_vector_tensor_lengths,
                                                 DstVectorTensorContiguousDimOrder{}),
-                math::multiplies_v2{},
+                math::multiplies{},
                 I1),
             DstVectorTensorContiguousDimOrder{});
 
         constexpr auto dst_vector_desc =
-            make_naive_tensor_descriptor_v2(sequence_to_tuple_of_number(dst_vector_tensor_lengths),
+            make_naive_tensor_descriptor(sequence_to_tuple_of_number(dst_vector_tensor_lengths),
                                          sequence_to_tuple_of_number(dst_vector_tensor_strides));
 
         // dst access order and lengths
         constexpr auto dst_access_lengths = SliceLengths{} / dst_vector_tensor_lengths;
@@ -699,13 +699,13 @@ struct ThreadwiseTensorSliceTransfer_v4r1
             container_reverse_exclusive_scan(
                 container_reorder_given_new2old(src_vector_tensor_lengths,
                                                 SrcVectorTensorContiguousDimOrder{}),
-                math::multiplies_v2{},
+                math::multiplies{},
                 I1),
             SrcVectorTensorContiguousDimOrder{});
 
         constexpr auto src_vector_desc =
-            make_naive_tensor_descriptor_v2(sequence_to_tuple_of_number(src_vector_tensor_lengths),
+            make_naive_tensor_descriptor(sequence_to_tuple_of_number(src_vector_tensor_lengths),
                                          sequence_to_tuple_of_number(src_vector_tensor_strides));
 
         // access order and lengths
         constexpr auto access_lengths = SliceLengths{} / src_vector_tensor_lengths;
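
The reorder / reverse-exclusive-scan / reorder-back chain repeated in these threadwise transfers derives strides for the per-thread vector tensor such that the dimension named contiguous by the dim order gets stride 1. The same computation over plain arrays, with illustrative names:

#include <array>
#include <cstddef>

// new2old[i] gives, for position i in "contiguity order", the original
// dimension it refers to; the last position is the contiguous one.
template <std::size_t N>
constexpr std::array<std::size_t, N>
vector_strides(const std::array<std::size_t, N>& lengths,
               const std::array<std::size_t, N>& new2old)
{
    // 1) reorder lengths into contiguity order
    std::array<std::size_t, N> reordered{};
    for(std::size_t i = 0; i < N; ++i)
        reordered[i] = lengths[new2old[i]];

    // 2) reverse exclusive scan: strides in the reordered frame
    std::array<std::size_t, N> scan{};
    std::size_t running = 1;
    for(std::size_t i = N; i-- > 0;)
    {
        scan[i] = running;
        running *= reordered[i];
    }

    // 3) scatter the strides back to the original dimension order
    std::array<std::size_t, N> strides{};
    for(std::size_t i = 0; i < N; ++i)
        strides[new2old[i]] = scan[i];
    return strides;
}

// lengths {4, 2, 2}, order {0, 2, 1}: dimension 1 is contiguous,
// giving strides {4, 1, 2}.
static_assert(vector_strides<3>({4, 2, 2}, {0, 2, 1})[1] == 1, "");
static_assert(vector_strides<3>({4, 2, 2}, {0, 2, 1})[2] == 2, "");
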
@@ -28,13 +28,7 @@ struct minus
     __host__ __device__ constexpr T operator()(T a, T b) const { return a - b; }
 };
 
-template <typename T>
 struct multiplies
-{
-    __host__ __device__ constexpr T operator()(T a, T b) const { return a * b; }
-};
-
-struct multiplies_v2
 {
     template <typename A, typename B>
     __host__ __device__ constexpr auto operator()(const A& a, const B& b) const
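
The hunk above deletes the old single-type math::multiplies and renames the heterogeneous multiplies_v2 into its place: the surviving functor deduces its result type from the operands, which is what lets container_reduce and the scans fold tuples mixing compile-time Number<> constants with runtime index_t values. A standalone analogue of that shape:

#include <type_traits>

struct multiplies_like
{
    template <typename A, typename B>
    constexpr auto operator()(const A& a, const B& b) const
    {
        return a * b; // result type deduced, so A and B may differ
    }
};

// A compile-time constant times a plain int still folds to a constant here:
static_assert(multiplies_like{}(std::integral_constant<int, 6>{}, 4) == 24, "");
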