adding implicit gemm v4 (nchw, kcyx)
@@ -44,11 +44,7 @@ struct GeneratorTensor_3
     {
         std::array<index_t, sizeof...(Is)> dims = {{static_cast<index_t>(is)...}};
 
-#if 0
-        auto f_acc = std::plus<index_t>{};
-#else
         auto f_acc = [](auto a, auto b) { return 100 * a + b; };
-#endif
 
         return std::accumulate(dims.begin(), dims.end(), index_t(0), f_acc);
     }
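For reference, this is what the new accumulator computes: instead of summing the indices, it packs each one into two decimal digits, so the generated value encodes the element's multi-index. A standalone host-side sketch (assumption: `index_t` is a plain integer type):

```cpp
// Fold {1, 2, 3} with 100*a + b -> 10203: each index occupies two decimal
// digits, so a dumped tensor value reveals which element it came from.
#include <array>
#include <cstdio>
#include <numeric>

using index_t = int; // assumption: matches the library's index_t

int main()
{
    std::array<index_t, 3> dims = {{1, 2, 3}};
    auto f_acc = [](auto a, auto b) { return 100 * a + b; };
    index_t v  = std::accumulate(dims.begin(), dims.end(), index_t(0), f_acc);
    std::printf("%d\n", v); // prints 10203
    return 0;
}
```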
@@ -447,7 +443,7 @@ int main(int argc, char* argv[])
 
     constexpr index_t HPad = 0;
     constexpr index_t WPad = 0;
-#elif 1
+#elif 0
     // 3x3 filter, 28x28 image
     constexpr index_t N = 128;
     constexpr index_t C = 256;
@@ -543,7 +539,7 @@ int main(int argc, char* argv[])
 
     constexpr index_t HPad = 0;
     constexpr index_t WPad = 0;
-#elif 0
+#elif 1
     // 1x1 filter, 14x14 image
     constexpr index_t N = 128;
     constexpr index_t C = 512;
@@ -553,6 +549,18 @@ int main(int argc, char* argv[])
     constexpr index_t Y = 1;
     constexpr index_t X = 1;
 
     constexpr index_t HPad = 0;
     constexpr index_t WPad = 0;
+#elif 1
+    // 1x1 filter, 73x73 image
+    constexpr index_t N = 128;
+    constexpr index_t C = 64;
+    constexpr index_t HI = 73;
+    constexpr index_t WI = 73;
+    constexpr index_t K = 128;
+    constexpr index_t Y = 1;
+    constexpr index_t X = 1;
+
+    constexpr index_t HPad = 0;
+    constexpr index_t WPad = 0;
 #endif
@@ -609,8 +617,6 @@ int main(int argc, char* argv[])
     };
     wei_kcyx.GenerateTensorValue(gen_wei, num_thread);
 #endif
-
-    // out_nkhw_device.GenerateTensorValue(GeneratorTensor_1{}, num_thread);
 }
 
 #if 1
@@ -649,7 +655,7 @@ int main(int argc, char* argv[])
 
     if(do_verification)
     {
-#if 1
+#if 0
         if(Y == 3 && X == 3)
         {
            host_winograd_3x3_convolution(in_nchw, wei_kcyx, out_nkhw_host, lower_pads, upper_pads);
@@ -105,6 +105,7 @@ __host__ __device__ constexpr auto extract_array(const Array<TData, NSize>& old_
     return new_array;
 }
 
+// Array = Array + Array
 template <class TData, index_t NSize>
 __host__ __device__ constexpr auto operator+(Array<TData, NSize> a, Array<TData, NSize> b)
 {
@@ -119,6 +120,55 @@ __host__ __device__ constexpr auto operator+(Array<TData, NSize> a, Array<TData,
     return result;
 }
 
+// Array = Array - Array
+template <class TData, index_t NSize>
+__host__ __device__ constexpr auto operator-(Array<TData, NSize> a, Array<TData, NSize> b)
+{
+    Array<TData, NSize> result;
+
+    static_for<0, NSize, 1>{}([&](auto I) {
+        constexpr index_t i = I.Get();
+
+        result[i] = a[i] - b[i];
+    });
+
+    return result;
+}
+
+// Array = Array + Sequence
+template <class TData, index_t NSize, index_t... Is>
+__host__ __device__ constexpr auto operator+(Array<TData, NSize> a, Sequence<Is...> b)
+{
+    static_assert(sizeof...(Is) == NSize, "wrong! size not the same");
+
+    Array<TData, NSize> result;
+
+    static_for<0, NSize, 1>{}([&](auto I) {
+        constexpr index_t i = I.Get();
+
+        result[i] = a[i] + b.Get(I);
+    });
+
+    return result;
+}
+
+// Array = Array - Sequence
+template <class TData, index_t NSize, index_t... Is>
+__host__ __device__ constexpr auto operator-(Array<TData, NSize> a, Sequence<Is...> b)
+{
+    static_assert(sizeof...(Is) == NSize, "wrong! size not the same");
+
+    Array<TData, NSize> result;
+
+    static_for<0, NSize, 1>{}([&](auto I) {
+        constexpr index_t i = I.Get();
+
+        result[i] = a[i] - b.Get(I);
+    });
+
+    return result;
+}
+
 // Array = Array * Sequence
 template <class TData, index_t NSize, index_t... Is>
 __host__ __device__ constexpr auto operator*(Array<TData, NSize> a, Sequence<Is...> b)
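All of the operators added above share one pattern: a compile-time loop (`static_for`) over the fixed size, combining a runtime Array element-wise with a compile-time Sequence. A loose host-only sketch of that pattern; these minimal Sequence/Array stand-ins are not the library's real types:

```cpp
#include <cstdio>

template <int... Is>
struct Sequence // stand-in for the library's compile-time integer list
{
    static int Get(int i)
    {
        const int data[] = {Is...};
        return data[i];
    }
};

template <class T, int N>
struct Array // stand-in for the library's fixed-size value array
{
    T data[N];
};

// Array = Array + Sequence, with the same size check the real operator uses
template <class T, int N, int... Is>
Array<T, N> operator+(const Array<T, N>& a, Sequence<Is...>)
{
    static_assert(sizeof...(Is) == N, "wrong! size not the same");
    Array<T, N> r{};
    for(int i = 0; i < N; ++i) // the library unrolls this with static_for
        r.data[i] = a.data[i] + Sequence<Is...>::Get(i);
    return r;
}

int main()
{
    Array<int, 3> a{{1, 2, 3}};
    auto r = a + Sequence<10, 20, 30>{};
    std::printf("%d %d %d\n", r.data[0], r.data[1], r.data[2]); // 11 22 33
}
```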
@@ -136,15 +186,119 @@ __host__ __device__ constexpr auto operator*(Array<TData, NSize> a, Sequence<Is.
     return result;
 }
 
-template <class TData, index_t NSize, class F>
-__host__ __device__ constexpr TData reduce_on_array(Array<TData, NSize> a, F f)
-{
-    TData result = a[0];
-
-    static_for<1, NSize, 1>{}([&](auto I) {
-        constexpr index_t i = I.Get();
-
-        result = f(result, a[i]);
-    });
-
-    return result;
-}
+// Array = Sequence - Array
+template <class TData, index_t NSize, index_t... Is>
+__host__ __device__ constexpr auto operator-(Sequence<Is...> a, Array<TData, NSize> b)
+{
+    static_assert(sizeof...(Is) == NSize, "wrong! size not the same");
+
+    Array<TData, NSize> result;
+
+    static_for<0, NSize, 1>{}([&](auto I) {
+        constexpr index_t i = I.Get();
+
+        result[i] = a.Get(I) - b[i];
+    });
+
+    return result;
+}
+
+template <class TData, index_t NSize, class Reduce>
+__host__ __device__ constexpr TData
+accumulate_on_array(const Array<TData, NSize>& a, Reduce f, TData init)
+{
+    TData result = init;
+
+    static_assert(NSize > 0, "wrong");
+
+    static_for<0, NSize, 1>{}([&](auto I) {
+        constexpr index_t i = I.Get();
+        result = f(result, a[i]);
+    });
+
+    return result;
+}
+
+template <class T, index_t NSize>
+__host__ __device__ void print_Array(const char* s, Array<T, NSize> a)
+{
+    constexpr index_t nsize = a.GetSize();
+
+    static_assert(nsize > 0 && nsize <= 10, "wrong!");
+
+    static_if<nsize == 1>{}([&](auto) { printf("%s size %u, {%u}\n", s, nsize, a[0]); });
+
+    static_if<nsize == 2>{}([&](auto) { printf("%s size %u, {%u %u}\n", s, nsize, a[0], a[1]); });
+
+    static_if<nsize == 3>{}(
+        [&](auto) { printf("%s size %u, {%u %u %u}\n", s, nsize, a[0], a[1], a[2]); });
+
+    static_if<nsize == 4>{}(
+        [&](auto) { printf("%s size %u, {%u %u %u %u}\n", s, nsize, a[0], a[1], a[2], a[3]); });
+
+    static_if<nsize == 5>{}([&](auto) {
+        printf("%s size %u, {%u %u %u %u %u}\n", s, nsize, a[0], a[1], a[2], a[3], a[4]);
+    });
+
+    static_if<nsize == 6>{}([&](auto) {
+        printf("%s size %u, {%u %u %u %u %u %u}\n", s, nsize, a[0], a[1], a[2], a[3], a[4], a[5]);
+    });
+
+    static_if<nsize == 7>{}([&](auto) {
+        printf("%s size %u, {%u %u %u %u %u %u %u}\n",
+               s, nsize, a[0], a[1], a[2], a[3], a[4], a[5], a[6]);
+    });
+
+    static_if<nsize == 8>{}([&](auto) {
+        printf("%s size %u, {%u %u %u %u %u %u %u %u}\n",
+               s, nsize, a[0], a[1], a[2], a[3], a[4], a[5], a[6], a[7]);
+    });
+
+    static_if<nsize == 9>{}([&](auto) {
+        printf("%s size %u, {%u %u %u %u %u %u %u %u %u}\n",
+               s, nsize, a[0], a[1], a[2], a[3], a[4], a[5], a[6], a[7], a[8]);
+    });
+
+    static_if<nsize == 10>{}([&](auto) {
+        printf("%s size %u, {%u %u %u %u %u %u %u %u %u %u}\n",
+               s, nsize, a[0], a[1], a[2], a[3], a[4], a[5], a[6], a[7], a[8], a[9]);
+    });
+}
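`accumulate_on_array` replaces the old `reduce_on_array` with std::accumulate-style semantics: the caller supplies the initial value instead of the reduction silently starting from `a[0]`. A host-side sketch of the assumed contract:

```cpp
#include <cstdio>

// Same shape as accumulate_on_array, with a built-in array standing in for
// Array<TData, NSize>; the library unrolls this loop with static_for.
template <class T, int N, class Reduce>
T accumulate_on_array(const T (&a)[N], Reduce f, T init)
{
    T result = init;
    for(int i = 0; i < N; ++i)
        result = f(result, a[i]);
    return result;
}

struct plus_t // stands in for mod_conv::plus<index_t>
{
    int operator()(int a, int b) const { return a + b; }
};

int main()
{
    int partial_offsets[4] = {3, 5, 7, 9};
    std::printf("%d\n", accumulate_on_array(partial_offsets, plus_t{}, 0)); // 24
}
```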
@@ -99,15 +99,7 @@ struct ConstantMergedTensorDescriptor
         return original_multi_id;
     }
 
-#if 0 // not needed
-    __host__ __device__ static index_t
-    GetOffsetFromOriginalMultiIndex(Array<index_t, nOriginalDim> original_multi_id)
-    {
-        return OriginalTensorDesc::GetOffsetFromMultiIndex(original_multi_id);
-    }
-#endif
-
-    __host__ __device__ static index_t GetOffsetFromMultiIndexA(Array<index_t, nDim> multi_id)
+    __host__ __device__ static index_t GetOffsetFromMultiIndex(Array<index_t, nDim> multi_id)
     {
         const auto original_multi_id = GetOriginalMultiIndexFromMultiIndex(multi_id);
 
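In miniature, here is what a merged dimension does (illustrative numbers, not the class's API): an index along the merged dimension is first unpacked into the original per-dimension indices, and the offset then comes from the original strides:

```cpp
#include <cstdio>

int main()
{
    // merge original dims {H, W} of a packed tensor into one dimension B
    const int H = 4, W = 5;
    const int stride_h = 5, stride_w = 1; // packed strides of the original dims

    int b = 13;               // index along the merged dimension, in [0, H*W)
    int h = b / W, w = b % W; // GetOriginalMultiIndexFromMultiIndex, in spirit
    int offset = h * stride_h + w * stride_w; // then the usual stride dot-product
    std::printf("b=%d -> (h=%d, w=%d), offset=%d\n", b, h, w, offset); // (2, 3), 13
}
```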
@@ -126,38 +118,6 @@ struct ConstantMergedTensorDescriptor
 
         return dummy_desc.GetMultiIndexFrom1dIndex(id);
     }
 
-#if 0 // not needed
-    template <index_t IDim>
-    __host__ __device__ static index_t GetNewOriginalMultiIndexAfterMovingAlongOneDimension(
-        Array<index_t, nOriginalDim> old_original_multi_id, Number<IDim>, index_t step_size)
-    {
-        auto new_original_multi_id = old_original_multi_id;
-
-        // get partial-original-multi-id corresponding to this merged dimension
-        constexpr auto original_partial_dims = std::get<IDim>(mOriginalDimMergeSeqs);
-
-        constexpr auto original_partial_tensor_desc =
-            OriginalTensorDesc::Extract(original_partial_dims);
-
-        auto old_original_partial_multi_id =
-            extract_array(old_original_multi_id, original_partial_dims);
-
-        auto new_original_partial_multi_id =
-            original_partial_tensor_desc.GetNewMultiIndexGivenStepSizeOf1dIndex(
-                old_original_partial_multi_id, step_size);
-
-        // update original-multi-id
-        static_for<0, original_partial_dims.GetSize(), 1>{}([&](auto I_) {
-            constexpr auto I = decltype(I_){};
-            constexpr index_t idim_original = original_partial_dims.Get(I);
-
-            new_original_multi_id[idim_original] = new_original_partial_multi_id[I.Get()];
-        });
-
-        return new_original_multi_id;
-    }
-#endif
 };
 
 template <class OriginalTensorDesc, class... OriginalDimMergeSeqs>
@@ -4,7 +4,7 @@
 template <class Lengths>
 __host__ __device__ constexpr auto calculate_tensor_strides_default_rank_packed(Lengths)
 {
-    return reverse_inclusive_scan_sequence(Lengths{}.PopFront(), std::multiplies<index_t>{})
+    return reverse_inclusive_scan_sequence(Lengths{}.PopFront(), mod_conv::multiplies<index_t>{})
         .PushBack(Number<1>{});
 }
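"Default rank packed" strides are the reverse inclusive product scan of the trailing lengths, with a 1 appended for the innermost dimension. A worked host-side example (runtime loops standing in for the Sequence machinery):

```cpp
#include <cstdio>

int main()
{
    const int lengths[4] = {128, 256, 14, 14}; // e.g. N, C, H, W
    int strides[4];
    strides[3] = 1;             // the PushBack(Number<1>{})
    for(int i = 2; i >= 0; --i) // the reverse inclusive scan with multiplies
        strides[i] = strides[i + 1] * lengths[i + 1];
    std::printf("%d %d %d %d\n", strides[0], strides[1], strides[2], strides[3]);
    // 50176 196 14 1
}
```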
@@ -95,7 +95,7 @@ struct ConstantTensorDescriptor
 
     __host__ __device__ static constexpr index_t GetElementSize()
     {
-        return accumulate_on_sequence(Lengths{}, std::multiplies<index_t>{}, Number<1>{});
+        return accumulate_on_sequence(Lengths{}, mod_conv::multiplies<index_t>{}, Number<1>{});
     }
 
     // WRONG! ReorderGivenOld2New is broken
@@ -107,10 +107,10 @@ struct ConstantTensorDescriptor
     constexpr auto strides_in_rank = GetStrides().ReorderGivenOld2new(MemoryRank{});
 
     constexpr index_t element_space_unaligned = accumulate_on_sequence(
-        (lengths_in_rank - Number<1>{}) * strides_in_rank, std::plus<index_t>{}, Number<1>{});
+        (lengths_in_rank - Number<1>{}) * strides_in_rank, mod_conv::plus<index_t>{}, Number<1>{});
 #else // WRONG! align should be applied to the last memory rank, not the last tensor dimension
     constexpr index_t element_space_unaligned = accumulate_on_sequence(
-        (GetLengths() - Number<1>{}) * GetStrides(), std::plus<index_t>{}, Number<1>{});
+        (GetLengths() - Number<1>{}) * GetStrides(), mod_conv::plus<index_t>{}, Number<1>{});
 #endif
 
     return align.Get() * ((element_space_unaligned + align.Get() - 1) / align.Get());
@@ -144,7 +144,8 @@ struct ConstantTensorDescriptor
 
     constexpr auto multi_id = Sequence<Is...>{};
 
-    return accumulate_on_sequence(multi_id * GetStrides(), std::plus<index_t>{}, Number<0>{});
+    return accumulate_on_sequence(
+        multi_id * GetStrides(), mod_conv::plus<index_t>{}, Number<0>{});
 }
 
 #if 0 // ReorderGivenOld2new is broken
@@ -197,32 +198,70 @@ struct ConstantTensorDescriptor
     // This function doesn't do carry check on the highest dimension, for performance reason.
     // It is the user's responsibility to make sure the result "new_multi_id" is not out-of-bound
     // on the highest dimension
+    template <bool PositiveDirection>
     __host__ __device__ static Array<index_t, nDim>
     UpdateMultiIndexGivenStepSizeOf1dIndex(Array<index_t, nDim> old_multi_id,
-                                           index_t step_size_of_1d_index)
+                                           index_t step_size_of_1d_index,
+                                           integral_constant<bool, PositiveDirection>)
     {
-        auto new_multi_id = old_multi_id + GetMultiIndexFrom1dIndex(step_size_of_1d_index);
-
-        bool carry = false;
-
-        // do carry check in reversed order, starting from lowest dimension
-        // don't check the highest dimension
-        static_for<0, nDim - 1, 1>{}([&](auto IDimReverse) {
-            constexpr index_t idim = nDim - 1 - IDimReverse.Get();
-            constexpr auto IDim = Number<idim>{};
-
-            if(carry)
-            {
-                ++new_multi_id[idim];
-            }
-
-            carry = false;
-
-            if(new_multi_id[idim] >= GetLength(IDim))
-            {
-                new_multi_id[idim] -= GetLength(IDim);
-                carry = true;
-            }
-        });
+        Array<index_t, nDim> new_multi_id;
+
+        const auto step_sizes = GetMultiIndexFrom1dIndex(step_size_of_1d_index);
+
+        static_if<PositiveDirection>{}([&](auto) {
+            new_multi_id = old_multi_id + step_sizes;
+
+            bool carry = false;
+
+            // do carry check in reversed order, starting from lowest dimension
+            // don't check the highest dimension
+            static_for<0, nDim - 1, 1>{}([&](auto IDimReverse) {
+                constexpr index_t idim = nDim - 1 - IDimReverse.Get();
+                constexpr auto IDim = Number<idim>{};
+
+                if(carry)
+                {
+                    ++new_multi_id[idim];
+                }
+
+                carry = false;
+
+                if(new_multi_id[idim] >= GetLength(IDim))
+                {
+                    new_multi_id[idim] -= GetLength(IDim);
+                    carry = true;
+                }
+            });
+        }).Else([&](auto) {
+            // shift up multi-id to avoid unsigned integer underflow during intermediate
+            // calculations. After the shift, should have new_multi_id[...] >= 1
+            new_multi_id = old_multi_id + (GetLengths() - step_sizes);
+
+            bool borrow = false;
+
+            // do borrow check in reversed order, starting from lowest dimension
+            // don't check the highest dimension
+            static_for<0, nDim - 1, 1>{}([&](auto IDimReverse) {
+                constexpr index_t idim = nDim - 1 - IDimReverse.Get();
+                constexpr auto IDim = Number<idim>{};
+
+                if(borrow)
+                {
+                    --new_multi_id[idim];
+                }
+
+                borrow = false;
+
+                if(new_multi_id[idim] < GetLength(IDim))
+                {
+                    new_multi_id[idim] += GetLength(IDim);
+                    borrow = true;
+                }
+            });
+
+            // shift back down multi-id
+            // here, should have new_multi_id[...] >= GetLengths()
+            new_multi_id = new_multi_id - GetLengths();
+        });
 
         return new_multi_id;
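The carry logic above is mixed-radix addition over the tensor lengths, and the new `.Else` branch is its mirror image: shift up by `GetLengths() - step_sizes` (this is what the new `operator-(Sequence, Array)` is for), propagate borrows, then shift back down. A host-side sketch of the positive direction, with runtime loops standing in for `static_for`:

```cpp
#include <cstdio>

int main()
{
    const int lengths[3] = {2, 3, 4};
    int multi_id[3]      = {0, 2, 3}; // 1d index 11
    const int step[3]    = {0, 0, 1}; // decomposed step_size_of_1d_index = 1

    bool carry = false;
    for(int idim = 2; idim >= 1; --idim) // lowest dim first; skip the highest
    {
        multi_id[idim] += step[idim];
        if(carry)
            ++multi_id[idim];
        carry = false;
        if(multi_id[idim] >= lengths[idim])
        {
            multi_id[idim] -= lengths[idim];
            carry = true;
        }
    }
    // highest dimension: add without a bound check, as the kernel does
    multi_id[0] += step[0] + (carry ? 1 : 0);

    std::printf("%d %d %d\n", multi_id[0], multi_id[1], multi_id[2]); // 1 0 0
}
```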
@@ -255,7 +294,7 @@ struct ConstantTensorDescriptor
     }
 
     template <class... Ts>
-    __host__ __device__ static constexpr auto Inject(ConstantTensorDescriptor<Ts...>)
+    __host__ __device__ static constexpr auto Embed(ConstantTensorDescriptor<Ts...>)
     {
         using leaf_tensor = ConstantTensorDescriptor<Ts...>;
 
@@ -290,7 +329,7 @@ struct ConstantTensorDescriptor
     constexpr auto fold_intervals = Sequence<FoldIntervals...>{};
 
     constexpr index_t fold_intervals_product =
-        accumulate_on_sequence(fold_intervals, std::multiplies<index_t>{}, Number<1>{});
+        accumulate_on_sequence(fold_intervals, mod_conv::multiplies<index_t>{}, Number<1>{});
 
     constexpr auto unfold_length = GetLength(Number<IDim>{});
     constexpr auto unfold_stride = GetStride(Number<IDim>{});
@@ -309,7 +348,7 @@ struct ConstantTensorDescriptor
     constexpr auto fold_strides =
         Number<unfold_stride>{} *
         reverse_inclusive_scan_sequence(fold_intervals.PushBack(Number<1>{}),
-                                        std::multiplies<index_t>{});
+                                        mod_conv::multiplies<index_t>{});
 
     // folded_ranks
     constexpr auto fold_ranks =
@@ -389,7 +428,7 @@ struct ConstantTensorDescriptor
 
     // unfolded length, stride and rank
     constexpr index_t unfold_length = accumulate_on_sequence(
-        GetLengths().Extract(middle), std::multiplies<index_t>{}, Number<1>{});
+        GetLengths().Extract(middle), mod_conv::multiplies<index_t>{}, Number<1>{});
 
     constexpr index_t unfold_stride = GetStride(Number<LastUnfoldDim>{});
 
@@ -472,7 +511,20 @@ __host__ __device__ void print_ConstantTensorDescriptor(TDesc, const char* s)
 {
     constexpr index_t ndim = TDesc::GetNumOfDimension();
 
-    static_assert(ndim >= 2 && ndim <= 10, "wrong!");
+    static_assert(ndim >= 1 && ndim <= 10, "wrong!");
+
+    static_if<ndim == 1>{}([&](auto fwd) {
+        constexpr auto I0 = Number<0>{};
+
+        constexpr auto desc = fwd(TDesc{});
+
+        printf("%s dim %u, lengths {%u}, strides {%u}, ranks {%u}\n",
+               s,
+               desc.GetNumOfDimension(),
+               desc.GetLength(I0),
+               desc.GetStride(I0),
+               desc.GetMemoryRank(I0));
+    });
 
     static_if<ndim == 2>{}([&](auto fwd) {
         constexpr auto I0 = Number<0>{};
 
@@ -495,3 +495,39 @@ __host__ __device__ constexpr auto Sequence<Is...>::Modify(Number<I>, Number<X>)
 
     return seq_left.PushBack(Number<X>{}).Append(seq_right);
 }
+
+template <index_t... Xs>
+__host__ __device__ void print_Sequence(const char* s, Sequence<Xs...>)
+{
+    constexpr index_t nsize = Sequence<Xs...>::GetSize();
+
+    static_assert(nsize <= 10, "wrong!");
+
+    static_if<nsize == 0>{}([&](auto) { printf("%s size %u, {}\n", s, nsize, Xs...); });
+
+    static_if<nsize == 1>{}([&](auto) { printf("%s size %u, {%u}\n", s, nsize, Xs...); });
+
+    static_if<nsize == 2>{}([&](auto) { printf("%s size %u, {%u %u}\n", s, nsize, Xs...); });
+
+    static_if<nsize == 3>{}([&](auto) { printf("%s size %u, {%u %u %u}\n", s, nsize, Xs...); });
+
+    static_if<nsize == 4>{}([&](auto) { printf("%s size %u, {%u %u %u %u}\n", s, nsize, Xs...); });
+
+    static_if<nsize == 5>{}(
+        [&](auto) { printf("%s size %u, {%u %u %u %u %u}\n", s, nsize, Xs...); });
+
+    static_if<nsize == 6>{}(
+        [&](auto) { printf("%s size %u, {%u %u %u %u %u %u}\n", s, nsize, Xs...); });
+
+    static_if<nsize == 7>{}(
+        [&](auto) { printf("%s size %u, {%u %u %u %u %u %u %u}\n", s, nsize, Xs...); });
+
+    static_if<nsize == 8>{}(
+        [&](auto) { printf("%s size %u, {%u %u %u %u %u %u %u %u}\n", s, nsize, Xs...); });
+
+    static_if<nsize == 9>{}(
+        [&](auto) { printf("%s size %u, {%u %u %u %u %u %u %u %u %u}\n", s, nsize, Xs...); });
+
+    static_if<nsize == 10>{}(
+        [&](auto) { printf("%s size %u, {%u %u %u %u %u %u %u %u %u %u}\n", s, nsize, Xs...); });
+}
@@ -158,7 +158,7 @@ struct Blockwise3dTensorCopy3
                   "wrong! BlockSize is not big enough for ThreadPerDims!");
 
     constexpr index_t num_active_thread =
-        accumulate_on_sequence(ThreadPerDims{}, std::multiplies<index_t>{}, Number<1>{});
+        accumulate_on_sequence(ThreadPerDims{}, mod_conv::multiplies<index_t>{}, Number<1>{});
 
     if(BlockSize > num_active_thread)
     {
@@ -500,7 +500,7 @@ struct Blockwise4dTensorCopy3
                   "wrong! BlockSize is not big enough for ThreadPerDims!");
 
     constexpr index_t num_active_thread =
-        accumulate_on_sequence(ThreadPerDims{}, std::multiplies<index_t>{}, Number<1>{});
+        accumulate_on_sequence(ThreadPerDims{}, mod_conv::multiplies<index_t>{}, Number<1>{});
 
     if(BlockSize > num_active_thread)
    {
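For context, the arithmetic behind the num_active_thread check in both hunks (the ThreadPerDims values here are made up): the product of the per-dimension thread counts is the number of threads the copy actually uses, and any surplus threads in the block must early-out.

```cpp
#include <cstdio>

int main()
{
    const int thread_per_dims[3] = {2, 4, 8}; // illustrative only
    int num_active_thread = 1;
    for(int d : thread_per_dims) // accumulate_on_sequence with multiplies
        num_active_thread *= d;

    const int BlockSize = 128;
    if(BlockSize > num_active_thread)
        std::printf("%d threads copy, %d idle\n",
                    num_active_thread, BlockSize - num_active_thread); // 64, 64
}
```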
@@ -3,6 +3,7 @@
 
 // slice a (normal or merged) tensor, and copy it into another (normal or merged) tensor
 // memory layout (ordering of dimensions) can be different between src and dst
+// For now, only support SubLengths == 1 on a merged dimension
 template <index_t BlockSize,
           class Float,
           class SrcDesc,
@@ -47,7 +48,7 @@ struct BlockwiseGenericTensorSliceCopy_v1
     BlockwiseGenericTensorSliceCopy_v1(Array<index_t, nDim> src_block_data_multi_id_begin,
                                        Array<index_t, nDim> dst_block_data_multi_id_begin)
     {
-        // check NDim consistent
+        // check NDim consistency
         static_assert(nDim == SrcDesc::GetNumOfDimension() &&
                       nDim == DstDesc::GetNumOfDimension() && nDim == SliceLengths::GetSize() &&
                       nDim == SubLengths::GetSize() && nDim == DataClusterLengths::GetSize() &&
@@ -55,7 +56,7 @@ struct BlockwiseGenericTensorSliceCopy_v1
                       nDim == SrcAccessOrder::GetSize() && nDim == DstAccessOrder::GetSize(),
                       "wrong");
 
-        // check
+        // check thread arrange order and read/write access order are valid
         static_assert(is_valid_sequence_map<ThreadClusterArrangeOrder>::value &&
                       is_valid_sequence_map<SrcAccessOrder>::value &&
                       is_valid_sequence_map<DstAccessOrder>::value,
@@ -140,10 +141,14 @@ struct BlockwiseGenericTensorSliceCopy_v1
         });
 
         // complete offset
-        mThreadSrcOffset = reduce_on_array(mThreadSrcPartialOffsets, std::plus<index_t>{});
-        mThreadDstOffset = reduce_on_array(mThreadDstPartialOffsets, std::plus<index_t>{});
+        mThreadSrcOffset = accumulate_on_array(
+            mThreadSrcPartialOffsets, mod_conv::plus<index_t>{}, static_cast<index_t>(0));
+
+        mThreadDstOffset = accumulate_on_array(
+            mThreadDstPartialOffsets, mod_conv::plus<index_t>{}, static_cast<index_t>(0));
 
 #if 0
         if(get_block_1d_id() == 0)
         {
             printf("id %5u %5u: "
                    "src_block_data_multi_id_begin: %u %u %u %u, "
@@ -279,13 +284,9 @@ struct BlockwiseGenericTensorSliceCopy_v1
     // the boundary of the tensor being sliced. This function doesn't do runtime sanity
     // check on out-of-bound slicing window, for performance reason
     template <index_t IDim_, index_t StepSize, bool PositiveDirection>
-    __device__ void MoveSlicingWindowOnSourceTensor(Number<IDim_>,
-                                                    Number<StepSize>,
-                                                    integral_constant<bool, PositiveDirection>)
+    __device__ void MoveSlicingWindowOnSourceTensor(
+        Number<IDim_>, Number<StepSize>, integral_constant<bool, PositiveDirection> direction)
     {
-        static_assert(PositiveDirection,
-                      "wrong! only support movement in positive direction for now");
-
         constexpr auto IDim = Number<IDim_>{};
         constexpr index_t idim = IDim.Get();
 
@@ -306,7 +307,7 @@ struct BlockwiseGenericTensorSliceCopy_v1
 
             auto new_src_partial_original_multi_id =
                 src_partial_original_desc.UpdateMultiIndexGivenStepSizeOf1dIndex(
-                    old_src_partial_original_multi_id, StepSize);
+                    old_src_partial_original_multi_id, StepSize, direction);
 
             // update "mThreadSrcOriginalMultiId"
             static_for<0, src_partial_original_dims.GetSize(), 1>{}([&](auto I_) {
@@ -328,7 +329,7 @@ struct BlockwiseGenericTensorSliceCopy_v1
             mThreadSrcPartialOffsets[idim] = new_src_partial_offset;
 
             // update "mThreadSrcOffset", do "+" before "-" to avoid underflow
-            mThreadSrcOffset = mThreadSrcOffset + new_src_partial_offset - old_src_partial_offset;
+            mThreadSrcOffset = (mThreadSrcOffset + new_src_partial_offset) - old_src_partial_offset;
         }).Else([&](auto fwd) {
             // Logic for non-merged dimension. If you are never going to move the slicing window on
             // a merged dimension, then "mThreadSrcOriginalMultiId" and "mThreadSrcPartialOffsets",
@@ -336,13 +337,25 @@ struct BlockwiseGenericTensorSliceCopy_v1
             // should be able to remove these calculations.
             // TODO: make sure compiler would actually remove them in this case.
 
+            // It is the user's responsibility to make sure the slicing window will not be moved
+            // out of the boundary of the tensor being sliced. Otherwise, there might be hazards
+            // like unsigned integer underflow. That is, NO runtime sanity check is done to
+            // prevent the hazard.
+
             constexpr index_t idim_original = SrcDesc::GetContainedOriginalDimensions(IDim).Front();
 
-            mThreadSrcOffset += StepSize * SrcDesc::GetStride(IDim);
-
-            mThreadSrcOriginalMultiId[idim_original] += StepSize;
-
-            mThreadSrcPartialOffsets[idim] += StepSize * SrcDesc::GetStride(IDim);
+            static_if<PositiveDirection>{}([&](auto) {
+                mThreadSrcOffset += StepSize * SrcDesc::GetStride(IDim);
+
+                mThreadSrcOriginalMultiId[idim_original] += StepSize;
+
+                mThreadSrcPartialOffsets[idim] += StepSize * SrcDesc::GetStride(IDim);
+            }).Else([&](auto) {
+                mThreadSrcOffset -= StepSize * SrcDesc::GetStride(IDim);
+
+                mThreadSrcOriginalMultiId[idim_original] -= StepSize;
+
+                mThreadSrcPartialOffsets[idim] -= StepSize * SrcDesc::GetStride(IDim);
+            });
         });
     }
 };
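In the non-merged branch, moving the slicing window is pure offset bookkeeping: the flat offset (and the per-dimension records) shift by StepSize * stride, with the sign chosen by the direction flag. A sketch in plain C++ (the names paraphrase the members involved; they are not the class's API):

```cpp
#include <cstdio>

int main()
{
    int thread_src_offset = 1000; // flat offset into the source tensor
    const int stride      = 14;   // stride of the dimension being moved
    const int step        = 8;

    bool positive_direction = true;
    thread_src_offset += positive_direction ? step * stride : -step * stride; // 1112

    positive_direction = false;
    thread_src_offset += positive_direction ? step * stride : -step * stride; // 1000 again

    std::printf("%d\n", thread_src_offset);
}
```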
@@ -39,6 +39,18 @@ struct scales
     __host__ __device__ constexpr T operator()(T a) const { return s * a; }
 };
 
+template <class T>
+struct plus
+{
+    __host__ __device__ constexpr T operator()(T a, T b) const { return a + b; }
+};
+
+template <class T>
+struct multiplies
+{
+    __host__ __device__ constexpr T operator()(T a, T b) const { return a * b; }
+};
+
 template <class T>
 struct integer_divide_ceiler
 {
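These functors exist because std::plus<T>::operator() carries no __host__ __device__ attribute, so HIP/CUDA device code cannot call it; the mod_conv versions are annotated and constexpr, usable on both sides. A plain-C++ sketch of the same shape (attributes omitted here so it builds anywhere):

```cpp
namespace mod_conv_sketch { // hypothetical namespace, just for this sketch

template <class T>
struct plus
{
    constexpr T operator()(T a, T b) const { return a + b; }
};

template <class T>
struct multiplies
{
    constexpr T operator()(T a, T b) const { return a * b; }
};

} // namespace mod_conv_sketch

// usable in constant expressions, which std::plus only guarantees since C++14
static_assert(mod_conv_sketch::plus<int>{}(2, 3) == 5, "");
static_assert(mod_conv_sketch::multiplies<int>{}(4, 5) == 20, "");

int main() { return 0; }
```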
@@ -58,6 +58,9 @@ struct GridwiseConvolutionImplicitGemm_v3_nchw_cyxk_nkhw
     constexpr auto I6 = Number<6>{};
     constexpr auto I7 = Number<7>{};
 
+    constexpr auto True  = integral_constant<bool, true>{};
+    constexpr auto False = integral_constant<bool, false>{};
+
     constexpr auto in_n_c_h_w_global_desc  = InGlobalDesc{};
     constexpr auto wei_c_y_x_k_global_desc = WeiGlobalDesc{};
     constexpr auto out_n_k_h_w_global_desc = OutGlobalDesc{};
@@ -123,7 +126,7 @@ struct GridwiseConvolutionImplicitGemm_v3_nchw_cyxk_nkhw
     // input blockwise copy
     // slice a merged tensor, reorder and copy to a normal tensor
     // this copy operator already has blockwise offset built-in
-    const auto blockwise_in_copy = BlockwiseGenericTensorSliceCopy_v1<
+    auto blockwise_in_copy = BlockwiseGenericTensorSliceCopy_v1<
         BlockSize,
         Float,
         decltype(in_c_n1_b_n2_global_merged_desc),
@@ -150,20 +153,20 @@ struct GridwiseConvolutionImplicitGemm_v3_nchw_cyxk_nkhw
     // operator for blockwise copy of weight into LDS
     // slice a tensor, and copy it into another tensor
     // this copy operator already has blockwise offset built-in
-    const auto blockwise_wei_copy =
-#if 0
+    auto blockwise_wei_copy =
+#if 1
         BlockwiseGenericTensorSliceCopy_v1<BlockSize,
                                            Float,
                                            decltype(wei_c_k_global_desc),
                                            decltype(wei_c_k_block_desc),
                                            decltype(wei_c_k_block_desc.GetLengths()),
                                            WeiBlockCopySubLengths_C_K,
                                            WeiBlockCopyClusterLengths_C_K,
                                            Sequence<0, 1>, // thread_arrange_order [C, K]
                                            Sequence<0, 1>, // src_access_order [C, K]
                                            Sequence<0, 1>, // dst_access_order [C, K]
                                            WeiBlockCopyDataPerAccess_K,
                                            WeiBlockCopyDataPerAccess_K>(
             {0, k_block_data_on_global}, {0, 0});
 #else
         Blockwise2dTensorCopy3<BlockSize,
@@ -175,16 +178,14 @@ struct GridwiseConvolutionImplicitGemm_v3_nchw_cyxk_nkhw
             {0, 0});
 #endif
 
     // GEMM definition
     // c_mtx += transpose(a_mtx) * b_mtx
     // a_mtx[CPerBlock, KPerBlock] is in LDS
     // b_mtx[CPerBlock, N1 * BPerBlock * N2] is in LDS
     // c_mtx[KPerBlock, N1 * BPerBlock * N2] is distributed among threads, and saved in
     // register
-    constexpr auto a_c_k_block_mtx_desc =
-        make_ConstantMatrixDescriptor(Number<CPerBlock>{},
-                                      Number<KPerBlock>{},
-                                      Number<wei_c_k_block_desc.GetStride(I0)>{});
+    constexpr auto a_c_k_block_mtx_desc = make_ConstantMatrixDescriptor(
+        Number<CPerBlock>{}, Number<KPerBlock>{}, Number<wei_c_k_block_desc.GetStride(I0)>{});
 
     constexpr auto b_c_n1bn2_block_mtx_desc =
         make_ConstantMatrixDescriptor(Number<CPerBlock>{},
@@ -239,6 +240,7 @@ struct GridwiseConvolutionImplicitGemm_v3_nchw_cyxk_nkhw
     // zero out threadwise output
     threadwise_matrix_set_zero(c_k0k2_n1n2_thread_mtx_desc, p_out_thread);
 
+#if 0
     // do work
     for(index_t y = 0; y < Y; ++y)
     {
@@ -269,6 +271,45 @@ struct GridwiseConvolutionImplicitGemm_v3_nchw_cyxk_nkhw
             }
         }
     }
+#else
+    for(index_t y = 0; y < Y; ++y)
+    {
+        for(index_t x = 0; x < X; ++x)
+        {
+            // calculate origin of block input and weight tensor on global memory
+            const Float* p_in_block_on_global =
+                p_in_global + in_n_c_h_w_global_desc.GetOffsetFromMultiIndex(0, 0, y, x);
+
+            const Float* p_wei_block_on_global =
+                p_wei_global + wei_c_y_x_k_global_desc.GetOffsetFromMultiIndex(0, y, x, 0);
+
+            for(index_t c_block_data_on_global = 0; c_block_data_on_global < C;
+                c_block_data_on_global += CPerBlock)
+            {
+                blockwise_in_copy.Run(p_in_block_on_global, p_in_block);
+                blockwise_wei_copy.Run(p_wei_block_on_global, p_wei_block);
+
+                __syncthreads();
+
+                blockwise_gemm.Run(p_wei_block, p_in_block, p_out_thread);
+
+                __syncthreads();
+
+                // move on C: C_N1_B_N2, C_K
+                blockwise_in_copy.MoveSlicingWindowOnSourceTensor(
+                    I0, Number<CPerBlock>{}, True);
+
+                blockwise_wei_copy.MoveSlicingWindowOnSourceTensor(
+                    I0, Number<CPerBlock>{}, True);
+            }
+
+            // reset C
+            blockwise_in_copy.MoveSlicingWindowOnSourceTensor(I0, Number<C>{}, False);
+
+            blockwise_wei_copy.MoveSlicingWindowOnSourceTensor(I0, Number<C>{}, False);
+        }
+    }
+#endif
 
     // copy output: register to global memory
     {
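The #else branch above gives the v3 kernel its main-loop shape: for each filter tap (y, x), walk C in CPerBlock chunks around the blockwise GEMM, then rewind the slicing window by C before the next tap. A host-side skeleton of that control flow (all names are placeholders, not the kernel's API):

```cpp
// Skeleton only: the commented calls mark where the kernel does LDS copies,
// barriers, the blockwise GEMM, and the slicing-window moves.
void v3_main_loop(int Y, int X, int C, int CPerBlock)
{
    for(int y = 0; y < Y; ++y)
    {
        for(int x = 0; x < X; ++x)
        {
            for(int c = 0; c < C; c += CPerBlock)
            {
                // copy_input();  copy_weight();   // global -> LDS
                // barrier();  gemm();  barrier();
                // move_window(+CPerBlock);        // advance along C (True)
            }
            // move_window(-C);                    // reset C (False)
        }
    }
}

int main() { v3_main_loop(3, 3, 256, 8); }
```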
@@ -59,7 +59,7 @@ struct GridwiseConvolutionImplicitGemm_v4_nchw_kcyx_nkhw
     constexpr auto I6 = Number<6>{};
     constexpr auto I7 = Number<7>{};
 
-    constexpr auto TRUE = integral_constant<bool, true>{};
+    constexpr auto True = integral_constant<bool, true>{};
 
     constexpr auto in_n_c_h_w_global_desc  = InGlobalDesc{};
     constexpr auto wei_k_c_y_x_global_desc = WeiGlobalDesc{};
@@ -102,9 +102,9 @@ struct GridwiseConvolutionImplicitGemm_v4_nchw_kcyx_nkhw
     const index_t b_block_data_on_global = block_work_multi_id[1] * BPerBlock;
 
     // input tensor
-    // tensor descriptor in device memory [N0, N1, N2, H, W]
-    constexpr auto in_n0_n1_n2_h_w_global_desc = in_n_c_h_w_global_desc.Slice(I2, Number<Hi>{})
-                                                     .Slice(I3, Number<Wi>{})
+    // tensor descriptor in device memory [N0, N1, N2, Ho, Wo]
+    constexpr auto in_n0_n1_n2_h_w_global_desc = in_n_c_h_w_global_desc.Slice(I2, Number<Ho>{})
+                                                     .Slice(I3, Number<Wo>{})
                                                      .Fold(I0, Number<N1>{}, Number<N2>{})
                                                      .Extract(Sequence<0, 1, 2, 4, 5>{});
 
@@ -115,12 +115,23 @@ struct GridwiseConvolutionImplicitGemm_v4_nchw_kcyx_nkhw
 
     // merged tensor descriptor in device memory [E, N1, B, N2], src of blockwise copy
     constexpr auto in_e_n1_b_n2_global_merged_desc = make_ConstantMergedTensorDescriptor(
-        in_c_y_x_global_desc.Inject(in_n0_n1_n2_h_w_global_desc),
+        in_c_y_x_global_desc.Embed(in_n0_n1_n2_h_w_global_desc),
         Sequence<0, 1, 2>{},
         Sequence<4>{},
         Sequence<3, 6, 7>{},
         Sequence<5>{});
 
+#if 0
+    if(get_block_1d_id() == 0 && get_thread_local_1d_id() == 0)
+    {
+        print_ConstantTensorDescriptor(in_n0_n1_n2_h_w_global_desc,
+                                       "in_n0_n1_n2_h_w_global_desc: ");
+        print_ConstantTensorDescriptor(in_c_y_x_global_desc, "in_c_y_x_global_desc: ");
+        print_ConstantMergedTensorDescriptor(in_e_n1_b_n2_global_merged_desc,
+                                             "in_e_n1_b_n2_global_merged_desc: ");
+    }
+#endif
+
     // memory layout descriptor in LDS [E, N1, B, N2], dst of blockwise copy
     // be careful of LDS alignment
     constexpr auto in_e_n1_b_n2_block_desc = make_ConstantTensorDescriptor_default_rank_aligned(
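The merged [E, N1, B, N2] view is what makes the GEMM "implicit": E merges (C, Y, X) into the GEMM's reduction dimension, so a flat index e unpacks mixed-radix style into a channel and a filter tap. An illustrative sketch (sizes made up, not taken from the kernel):

```cpp
#include <cstdio>

int main()
{
    const int C = 64, Y = 3, X = 3; // E = C * Y * X = 576
    for(int e = 0; e < C * Y * X; e += 100)
    {
        int c = e / (Y * X); // slowest: channel
        int y = (e / X) % Y; // then filter row
        int x = e % X;       // fastest: filter column
        std::printf("e=%3d -> c=%2d y=%d x=%d\n", e, c, y, x);
    }
}
```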
@@ -243,6 +254,31 @@ struct GridwiseConvolutionImplicitGemm_v4_nchw_kcyx_nkhw
     // do work
     for(index_t e = 0; e < E; e += EPerBlock)
     {
+#if 0
+        if(e == 1 * EPerBlock && get_block_1d_id() == 0)
+        {
+            printf("id %5u %5u: "
+                   "mThreadSrcOriginalMultiId %u %u %u %u %u %u %u %u, "
+                   "mThreadSrcPartialOffsets %u %u %u %u, "
+                   "mThreadSrcOffset %u, mThreadDstOffset %u \n",
+                   get_block_1d_id(),
+                   get_thread_local_1d_id(),
+                   blockwise_in_copy.mThreadSrcOriginalMultiId[0],
+                   blockwise_in_copy.mThreadSrcOriginalMultiId[1],
+                   blockwise_in_copy.mThreadSrcOriginalMultiId[2],
+                   blockwise_in_copy.mThreadSrcOriginalMultiId[3],
+                   blockwise_in_copy.mThreadSrcOriginalMultiId[4],
+                   blockwise_in_copy.mThreadSrcOriginalMultiId[5],
+                   blockwise_in_copy.mThreadSrcOriginalMultiId[6],
+                   blockwise_in_copy.mThreadSrcOriginalMultiId[7],
+                   blockwise_in_copy.mThreadSrcPartialOffsets[0],
+                   blockwise_in_copy.mThreadSrcPartialOffsets[1],
+                   blockwise_in_copy.mThreadSrcPartialOffsets[2],
+                   blockwise_in_copy.mThreadSrcPartialOffsets[3],
+                   blockwise_in_copy.mThreadSrcOffset,
+                   blockwise_in_copy.mThreadDstOffset);
+        }
+#endif
         // march the slicing window
         blockwise_in_copy.Run(p_in_global, p_in_block);
         blockwise_wei_copy.Run(p_wei_global, p_wei_block);
@@ -253,8 +289,8 @@ struct GridwiseConvolutionImplicitGemm_v4_nchw_kcyx_nkhw
 
         __syncthreads();
 
-        blockwise_in_copy.MoveSlicingWindowOnSourceTensor(I0, Number<EPerBlock>{}, TRUE);
-        blockwise_wei_copy.MoveSlicingWindowOnSourceTensor(I0, Number<EPerBlock>{}, TRUE);
+        blockwise_in_copy.MoveSlicingWindowOnSourceTensor(I0, Number<EPerBlock>{}, True);
+        blockwise_wei_copy.MoveSlicingWindowOnSourceTensor(I0, Number<EPerBlock>{}, True);
     }
 
     // copy output: register to global memory