mirror of
https://github.com/ROCm/composable_kernel.git
synced 2026-05-12 17:26:00 +00:00
added blockwise tensor reorder operation
This commit is contained in:
@@ -59,7 +59,7 @@ __device__ void blockwise_direct_convolution(InBlockDesc,
|
||||
make_ConstantTensorDescriptor(Sequence<KPerThread, CPerThread, S, R>{});
|
||||
|
||||
constexpr auto out_thread_desc =
|
||||
get_output_4d_tensor_descriptor(in_thread_desc, wei_thread_desc);
|
||||
get_convolution_output_4d_tensor_descriptor(in_thread_desc, wei_thread_desc);
|
||||
|
||||
constexpr auto in_thread_block_desc =
|
||||
make_ConstantTensorDescriptor(in_thread_desc.GetLengths(), in_block_desc.GetStrides());
|
||||
|
||||
@@ -2,7 +2,8 @@
|
||||
#include "constant_tensor_descriptor.cuh"
|
||||
|
||||
template <class TFloat, class DstDesc, class F, unsigned BlockSize>
|
||||
__device__ void blockwise_4d_tensor_pointwise_op_unary(DstDesc, TFloat* __restrict__ p_dst, F f)
|
||||
__device__ void
|
||||
blockwise_4d_tensor_pointwise_operation_unary(DstDesc, TFloat* __restrict__ p_dst, F f)
|
||||
{
|
||||
constexpr auto I0 = Number<0>{};
|
||||
constexpr auto I1 = Number<1>{};
|
||||
@@ -75,82 +76,94 @@ __device__ void blockwise_4d_tensor_pointwise_op_unary(DstDesc, TFloat* __restri
|
||||
}
|
||||
}
|
||||
|
||||
template <class TFloat, class DescA, class DescB, class DescRef, class F, unsigned BlockSize>
|
||||
__device__ void blockwise_4d_tensor_pointwise_op_binary(
|
||||
DescA, TFloat* const __restrict__ p_a, DescB, TFloat* __restrict__ p_b, DescRef, F f)
|
||||
template <class TFloat,
|
||||
class SrcDesc,
|
||||
class DstDesc,
|
||||
class RefDesc,
|
||||
class Reorder,
|
||||
class F,
|
||||
unsigned BlockSize>
|
||||
__device__ void
|
||||
blockwise_4d_tensor_pointwise_operation_binary_reorder(SrcDesc,
|
||||
TFloat* const __restrict__ p_src,
|
||||
DstDesc,
|
||||
TFloat* __restrict__ p_dst,
|
||||
RefDesc,
|
||||
Reorder,
|
||||
F f)
|
||||
{
|
||||
constexpr auto I0 = Number<0>{};
|
||||
constexpr auto I1 = Number<1>{};
|
||||
constexpr auto I2 = Number<2>{};
|
||||
constexpr auto I3 = Number<3>{};
|
||||
|
||||
constexpr auto desc_a = DescA{};
|
||||
constexpr auto desc_b = DescB{};
|
||||
constexpr auto desc_ref = DescRef{};
|
||||
constexpr unsigned IT0 = Reorder{}.Get(I0);
|
||||
constexpr unsigned IT1 = Reorder{}.Get(I1);
|
||||
constexpr unsigned IT2 = Reorder{}.Get(I2);
|
||||
constexpr unsigned IT3 = Reorder{}.Get(I3);
|
||||
|
||||
#if 0
|
||||
if(threadIdx.x == 0)
|
||||
{
|
||||
print_ConstantTensorDescriptor(desc_a, "blockwise_4d_tensor_op_binary: desc_a: ");
|
||||
print_ConstantTensorDescriptor(desc_b, "blockwise_4d_tensor_op_binary: desc_b: ");
|
||||
print_ConstantTensorDescriptor(desc_ref, "blockwise_4d_tensor_op_binary: desc_ref: ");
|
||||
}
|
||||
#endif
|
||||
constexpr auto src_desc = SrcDesc{};
|
||||
constexpr auto dst_desc = DstDesc{};
|
||||
constexpr auto ref_desc = RefDesc{};
|
||||
|
||||
constexpr unsigned NLoop = desc_ref.GetElementSize() / BlockSize;
|
||||
constexpr unsigned NLoop = ref_desc.GetElementSize() / BlockSize;
|
||||
|
||||
for(unsigned iloop = 0; iloop < NLoop; ++iloop)
|
||||
{
|
||||
unsigned is = threadIdx.x + iloop * BlockSize;
|
||||
|
||||
const unsigned did0 = is / desc_ref.GetStride(I0);
|
||||
unsigned did[4];
|
||||
|
||||
is -= did0 * desc_ref.GetStride(I0);
|
||||
did[0] = is / ref_desc.GetStride(I0);
|
||||
|
||||
const unsigned did1 = is / desc_ref.GetStride(I1);
|
||||
is -= did[0] * ref_desc.GetStride(I0);
|
||||
|
||||
is -= did1 * desc_ref.GetStride(I1);
|
||||
did[1] = is / ref_desc.GetStride(I1);
|
||||
|
||||
const unsigned did2 = is / desc_ref.GetStride(I2);
|
||||
is -= did[1] * ref_desc.GetStride(I1);
|
||||
|
||||
is -= did2 * desc_ref.GetStride(I2);
|
||||
did[2] = is / ref_desc.GetStride(I2);
|
||||
|
||||
const unsigned did3 = is / desc_ref.GetStride(I3);
|
||||
is -= did[2] * ref_desc.GetStride(I2);
|
||||
|
||||
const unsigned aindex = desc_a.Get1dIndex(did0, did1, did2, did3);
|
||||
did[3] = is / ref_desc.GetStride(I3);
|
||||
|
||||
const unsigned bindex = desc_b.Get1dIndex(did0, did1, did2, did3);
|
||||
const unsigned aindex = src_desc.Get1dIndex(did[0], did[1], did[2], did[3]);
|
||||
|
||||
f(p_a[aindex], p_b[bindex]);
|
||||
const unsigned bindex = dst_desc.Get1dIndex(did[IT0], did[IT1], did[IT2], did[IT3]);
|
||||
|
||||
f(p_src[aindex], p_dst[bindex]);
|
||||
}
|
||||
|
||||
constexpr bool has_tail = (desc_ref.GetElementSize() > NLoop * BlockSize);
|
||||
constexpr bool has_tail = (ref_desc.GetElementSize() > NLoop * BlockSize);
|
||||
|
||||
if(has_tail)
|
||||
{
|
||||
unsigned is = threadIdx.x + NLoop * BlockSize;
|
||||
|
||||
if(is < desc_ref.GetElementSize())
|
||||
if(is < ref_desc.GetElementSize())
|
||||
{
|
||||
const unsigned did0 = is / desc_ref.GetStride(I0);
|
||||
unsigned did[4];
|
||||
|
||||
is -= did0 * desc_ref.GetStride(I0);
|
||||
did[0] = is / ref_desc.GetStride(I0);
|
||||
|
||||
const unsigned did1 = is / desc_ref.GetStride(I1);
|
||||
is -= did[0] * ref_desc.GetStride(I0);
|
||||
|
||||
is -= did1 * desc_ref.GetStride(I1);
|
||||
did[1] = is / ref_desc.GetStride(I1);
|
||||
|
||||
const unsigned did2 = is / desc_ref.GetStride(I2);
|
||||
is -= did[1] * ref_desc.GetStride(I1);
|
||||
|
||||
is -= did2 * desc_ref.GetStride(I2);
|
||||
did[2] = is / ref_desc.GetStride(I2);
|
||||
|
||||
const unsigned did3 = is / desc_ref.GetStride(I3);
|
||||
is -= did[2] * ref_desc.GetStride(I2);
|
||||
|
||||
const unsigned aindex = desc_a.Get1dIndex(did0, did1, did2, did3);
|
||||
did[3] = is / ref_desc.GetStride(I3);
|
||||
|
||||
const unsigned bindex = desc_b.Get1dIndex(did0, did1, did2, did3);
|
||||
const unsigned aindex = src_desc.Get1dIndex(did[0], did[1], did[2], did[3]);
|
||||
|
||||
f(p_a[aindex], p_b[bindex]);
|
||||
const unsigned bindex = dst_desc.Get1dIndex(did[IT0], did[IT1], did[IT2], did[IT3]);
|
||||
|
||||
f(p_src[aindex], p_dst[bindex]);
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -160,21 +173,53 @@ __device__ void blockwise_4d_tensor_set_zero(DstDesc, TFloat* __restrict__ p_dst
|
||||
{
|
||||
auto f_set_zero = [](TFloat& v) { v = TFloat(0); };
|
||||
|
||||
blockwise_4d_tensor_pointwise_op_unary<TFloat, DstDesc, decltype(f_set_zero), BlockSize>(
|
||||
blockwise_4d_tensor_pointwise_operation_unary<TFloat, DstDesc, decltype(f_set_zero), BlockSize>(
|
||||
DstDesc{}, p_dst, f_set_zero);
|
||||
}
|
||||
|
||||
template <class TFloat,
|
||||
class SrcDesc,
|
||||
class DstDesc,
|
||||
class RefDesc,
|
||||
class Reorder,
|
||||
unsigned BlockSize>
|
||||
__device__ void blockwise_4d_tensor_copy_reorder(SrcDesc,
|
||||
TFloat* const __restrict__ p_src,
|
||||
DstDesc,
|
||||
TFloat* __restrict__ p_dst,
|
||||
RefDesc,
|
||||
Reorder)
|
||||
{
|
||||
auto f_copy = [](const TFloat& src, TFloat& dst) { dst = src; };
|
||||
|
||||
blockwise_4d_tensor_pointwise_operation_binary_reorder<TFloat,
|
||||
SrcDesc,
|
||||
DstDesc,
|
||||
RefDesc,
|
||||
Reorder,
|
||||
decltype(f_copy),
|
||||
BlockSize>(
|
||||
SrcDesc{}, p_src, DstDesc{}, p_dst, RefDesc{}, Reorder{}, f_copy);
|
||||
}
|
||||
|
||||
template <class TFloat, class SrcDesc, class DstDesc, class RefDesc, unsigned BlockSize>
|
||||
__device__ void blockwise_4d_tensor_copy(
|
||||
SrcDesc, TFloat* const __restrict__ p_src, DstDesc, TFloat* __restrict__ p_dst, RefDesc)
|
||||
{
|
||||
auto f_copy = [](const TFloat& src, TFloat& dst) { dst = src; };
|
||||
constexpr auto reorder = Sequence<0, 1, 2, 3>{};
|
||||
|
||||
blockwise_4d_tensor_pointwise_op_binary<TFloat,
|
||||
SrcDesc,
|
||||
DstDesc,
|
||||
RefDesc,
|
||||
decltype(f_copy),
|
||||
BlockSize>(
|
||||
SrcDesc{}, p_src, DstDesc{}, p_dst, RefDesc{}, f_copy);
|
||||
blockwise_4d_tensor_copy_reorder<TFloat,
|
||||
SrcDesc,
|
||||
DstDesc,
|
||||
RefDesc,
|
||||
decltype(reorder),
|
||||
BlockSize>(
|
||||
SrcDesc{}, p_src, DstDesc{}, p_dst, RefDesc{}, reorder);
|
||||
}
|
||||
|
||||
template <class TFloat, class ImDesc, class WDesc, class ColDesc, unsigned BlockSize>
|
||||
__device__ void blockwise_4d_tensor_im2col(
|
||||
ImDesc, const __restrict__ TFloat* p_im, WDesc, ColDesc, __restrict__ TFloat* p_col)
|
||||
{
|
||||
// do nothing
|
||||
}
|
||||
|
||||
@@ -22,6 +22,14 @@ struct Sequence
|
||||
{
|
||||
return mData[I];
|
||||
}
|
||||
|
||||
template <unsigned I>
|
||||
__host__ __device__ constexpr auto GetNumber(Number<I>) const
|
||||
{
|
||||
constexpr unsigned N = Get(I);
|
||||
|
||||
return Number<N>{};
|
||||
}
|
||||
};
|
||||
|
||||
template <class Lengths, class Strides>
|
||||
@@ -113,9 +121,31 @@ __host__ __device__ constexpr auto make_ConstantTensorDescriptor(Lengths, Stride
|
||||
return ConstantTensorDescriptor<Lengths, Strides>{};
|
||||
}
|
||||
|
||||
// this is ugly, only for 4d
|
||||
template <class Desc, class Reorder>
|
||||
__host__ __device__ constexpr auto get_reordered_4d_tensor_descriptor(Desc, Reorder)
|
||||
{
|
||||
constexpr auto I0 = Number<0>{};
|
||||
constexpr auto I1 = Number<1>{};
|
||||
constexpr auto I2 = Number<2>{};
|
||||
constexpr auto I3 = Number<3>{};
|
||||
|
||||
constexpr auto IT0 = Reorder{}.GetNumber(I0);
|
||||
constexpr auto IT1 = Reorder{}.GetNumber(I1);
|
||||
constexpr auto IT2 = Reorder{}.GetNumber(I2);
|
||||
constexpr auto IT3 = Reorder{}.GetNumber(I3);
|
||||
|
||||
constexpr unsigned L0 = Desc{}.GetLength(IT0);
|
||||
constexpr unsigned L1 = Desc{}.GetLength(IT1);
|
||||
constexpr unsigned L2 = Desc{}.GetLength(IT2);
|
||||
constexpr unsigned L3 = Desc{}.GetLength(IT3);
|
||||
|
||||
return make_ConstantTensorDescriptor(Sequence<L0, L1, L2, L3>{});
|
||||
}
|
||||
|
||||
// this is ugly, only for 4d
|
||||
template <class InDesc, class WeiDesc>
|
||||
__host__ __device__ constexpr auto get_output_4d_tensor_descriptor(InDesc, WeiDesc)
|
||||
__host__ __device__ constexpr auto get_convolution_output_4d_tensor_descriptor(InDesc, WeiDesc)
|
||||
{
|
||||
constexpr auto in_desc = InDesc{};
|
||||
constexpr auto wei_desc = WeiDesc{};
|
||||
|
||||
@@ -70,7 +70,7 @@ __global__ void gridwise_direct_convolution_2(InGlobalDesc,
|
||||
Sequence<KPerThread, CPerThread, S, R>{}, wei_block_desc.GetStrides());
|
||||
|
||||
constexpr auto out_thread_desc =
|
||||
get_output_4d_tensor_descriptor(in_thread_block_desc, wei_thread_block_desc);
|
||||
get_convolution_output_4d_tensor_descriptor(in_thread_block_desc, wei_thread_block_desc);
|
||||
|
||||
// register
|
||||
TFloat p_out_thread[out_thread_desc.GetElementSpace()];
|
||||
|
||||
Reference in New Issue
Block a user