mirror of
https://github.com/ROCm/composable_kernel.git
synced 2026-05-13 01:36:06 +00:00
refactor
This commit is contained in:
@@ -3,7 +3,8 @@
|
||||
#include "threadwise_tensor_op.cuh"
|
||||
#include "threadwise_direct_convolution.cuh"
|
||||
|
||||
template <class TFloat,
|
||||
template <unsigned BlockSize,
|
||||
class TFloat,
|
||||
class InBlockDesc,
|
||||
class WeiBlockDesc,
|
||||
class OutBlockDesc,
|
||||
@@ -11,8 +12,7 @@ template <class TFloat,
|
||||
unsigned OutTileSizeW,
|
||||
unsigned NPerThread,
|
||||
unsigned KPerThread,
|
||||
unsigned CPerThread,
|
||||
unsigned BlockSize>
|
||||
unsigned CPerThread>
|
||||
__device__ void blockwise_direct_convolution(InBlockDesc,
|
||||
TFloat* const __restrict__ p_in_block,
|
||||
WeiBlockDesc,
|
||||
|
||||
@@ -1,7 +1,7 @@
|
||||
#pragma once
|
||||
#include "constant_tensor_descriptor.cuh"
|
||||
|
||||
template <class TFloat, class DstDesc, class F, unsigned BlockSize>
|
||||
template <unsigned BlockSize, class TFloat, class DstDesc, class F>
|
||||
__device__ void
|
||||
blockwise_4d_tensor_pointwise_operation_unary(DstDesc, TFloat* __restrict__ p_dst, F f)
|
||||
{
|
||||
@@ -78,13 +78,13 @@ blockwise_4d_tensor_pointwise_operation_unary(DstDesc, TFloat* __restrict__ p_ds
|
||||
|
||||
// TODO: in order to optimize mem access for different mem type,
|
||||
// need to write specialized version
|
||||
template <class TFloat,
|
||||
template <unsigned BlockSize,
|
||||
class TFloat,
|
||||
class SrcDesc,
|
||||
class DstDesc,
|
||||
class RefDesc,
|
||||
class Reorder,
|
||||
class F,
|
||||
unsigned BlockSize>
|
||||
class F>
|
||||
__device__ void
|
||||
blockwise_4d_tensor_pointwise_operation_binary_reorder(SrcDesc,
|
||||
TFloat* const __restrict__ p_src,
|
||||
@@ -170,21 +170,20 @@ blockwise_4d_tensor_pointwise_operation_binary_reorder(SrcDesc,
|
||||
}
|
||||
}
|
||||
|
||||
template <class TFloat, class DstDesc, unsigned BlockSize>
|
||||
template <unsigned BlockSize, class TFloat, class DstDesc>
|
||||
__device__ void blockwise_4d_tensor_set_zero(DstDesc, TFloat* __restrict__ p_dst)
|
||||
{
|
||||
auto f_set_zero = [](TFloat& v) { v = TFloat(0); };
|
||||
|
||||
blockwise_4d_tensor_pointwise_operation_unary<TFloat, DstDesc, decltype(f_set_zero), BlockSize>(
|
||||
DstDesc{}, p_dst, f_set_zero);
|
||||
blockwise_4d_tensor_pointwise_operation_unary<BlockSize>(DstDesc{}, p_dst, f_set_zero);
|
||||
}
|
||||
|
||||
template <class TFloat,
|
||||
template <unsigned BlockSize,
|
||||
class TFloat,
|
||||
class SrcDesc,
|
||||
class DstDesc,
|
||||
class RefDesc,
|
||||
class Reorder,
|
||||
unsigned BlockSize>
|
||||
class Reorder>
|
||||
__device__ void blockwise_4d_tensor_copy_reorder(SrcDesc,
|
||||
TFloat* const __restrict__ p_src,
|
||||
DstDesc,
|
||||
@@ -194,34 +193,16 @@ __device__ void blockwise_4d_tensor_copy_reorder(SrcDesc,
|
||||
{
|
||||
auto f_copy = [](const TFloat& src, TFloat& dst) { dst = src; };
|
||||
|
||||
blockwise_4d_tensor_pointwise_operation_binary_reorder<TFloat,
|
||||
SrcDesc,
|
||||
DstDesc,
|
||||
RefDesc,
|
||||
Reorder,
|
||||
decltype(f_copy),
|
||||
BlockSize>(
|
||||
blockwise_4d_tensor_pointwise_operation_binary_reorder<BlockSize>(
|
||||
SrcDesc{}, p_src, DstDesc{}, p_dst, RefDesc{}, Reorder{}, f_copy);
|
||||
}
|
||||
|
||||
template <class TFloat, class SrcDesc, class DstDesc, class RefDesc, unsigned BlockSize>
|
||||
template <unsigned BlockSize, class TFloat, class SrcDesc, class DstDesc, class RefDesc>
|
||||
__device__ void blockwise_4d_tensor_copy(
|
||||
SrcDesc, TFloat* const __restrict__ p_src, DstDesc, TFloat* __restrict__ p_dst, RefDesc)
|
||||
{
|
||||
constexpr auto reorder = Sequence<0, 1, 2, 3>{};
|
||||
|
||||
blockwise_4d_tensor_copy_reorder<TFloat,
|
||||
SrcDesc,
|
||||
DstDesc,
|
||||
RefDesc,
|
||||
decltype(reorder),
|
||||
BlockSize>(
|
||||
blockwise_4d_tensor_copy_reorder<BlockSize>(
|
||||
SrcDesc{}, p_src, DstDesc{}, p_dst, RefDesc{}, reorder);
|
||||
}
|
||||
|
||||
template <class TFloat, class ImDesc, class WDesc, class ColDesc, unsigned BlockSize>
|
||||
__device__ void blockwise_4d_tensor_im2col(
|
||||
ImDesc, const __restrict__ TFloat* p_im, WDesc, ColDesc, __restrict__ TFloat* p_col)
|
||||
{
|
||||
// do nothing
|
||||
}
|
||||
|
||||
@@ -24,12 +24,55 @@ struct Sequence
|
||||
}
|
||||
|
||||
template <unsigned I>
|
||||
__host__ __device__ constexpr auto GetNumber(Number<I>) const
|
||||
__host__ __device__ constexpr auto GetConstant(Number<I>) const
|
||||
{
|
||||
constexpr unsigned N = Get(I);
|
||||
|
||||
return Number<N>{};
|
||||
}
|
||||
|
||||
template <unsigned I0, unsigned I1>
|
||||
__host__ __device__ constexpr auto Reorder(Number<I0>, Number<I1>) const
|
||||
{
|
||||
constexpr unsigned IR0 = Get(Number<I0>{});
|
||||
constexpr unsigned IR1 = Get(Number<I1>{});
|
||||
|
||||
return Sequence<IR0, IR1>{};
|
||||
}
|
||||
|
||||
template <unsigned I0, unsigned I1, unsigned I2>
|
||||
__host__ __device__ constexpr auto Reorder(Number<I0>, Number<I1>, Number<I2>) const
|
||||
{
|
||||
constexpr unsigned IR0 = Get(Number<I0>{});
|
||||
constexpr unsigned IR1 = Get(Number<I1>{});
|
||||
constexpr unsigned IR2 = Get(Number<I2>{});
|
||||
|
||||
return Sequence<IR0, IR1, IR2>{};
|
||||
}
|
||||
|
||||
template <unsigned I0, unsigned I1, unsigned I2, unsigned I3>
|
||||
__host__ __device__ constexpr auto Reorder(Number<I0>, Number<I1>, Number<I2>, Number<I3>) const
|
||||
{
|
||||
constexpr unsigned IR0 = Get(Number<I0>{});
|
||||
constexpr unsigned IR1 = Get(Number<I1>{});
|
||||
constexpr unsigned IR2 = Get(Number<I2>{});
|
||||
constexpr unsigned IR3 = Get(Number<I3>{});
|
||||
|
||||
return Sequence<IR0, IR1, IR2, IR3>{};
|
||||
}
|
||||
|
||||
template <unsigned I0, unsigned I1, unsigned I2, unsigned I3, unsigned I4>
|
||||
__host__ __device__ constexpr auto
|
||||
Reorder(Number<I0>, Number<I1>, Number<I2>, Number<I3>, Number<I4>) const
|
||||
{
|
||||
constexpr unsigned IR0 = Get(Number<I0>{});
|
||||
constexpr unsigned IR1 = Get(Number<I1>{});
|
||||
constexpr unsigned IR2 = Get(Number<I2>{});
|
||||
constexpr unsigned IR3 = Get(Number<I3>{});
|
||||
constexpr unsigned IR4 = Get(Number<I4>{});
|
||||
|
||||
return Sequence<IR0, IR1, IR2, IR3, IR4>{};
|
||||
}
|
||||
};
|
||||
|
||||
template <class Lengths, class Strides>
|
||||
@@ -99,6 +142,15 @@ struct ConstantTensorDescriptor
|
||||
static_assert(nDim == 4, "nDim is not 4");
|
||||
return n * GetStride(I0) + c * GetStride(I1) + h * GetStride(I2) + w * GetStride(I3);
|
||||
}
|
||||
|
||||
template <class... Is>
|
||||
__host__ __device__ constexpr auto Reorder(Is... is) const
|
||||
{
|
||||
constexpr auto lengths = Lengths{}.Reorder(is...);
|
||||
constexpr auto strides = Strides{}.Reorder(is...);
|
||||
|
||||
return ConstantTensorDescriptor<decltype(lengths), decltype(strides)>{};
|
||||
}
|
||||
};
|
||||
|
||||
// this is ugly, only for 4d
|
||||
@@ -121,28 +173,6 @@ __host__ __device__ constexpr auto make_ConstantTensorDescriptor(Lengths, Stride
|
||||
return ConstantTensorDescriptor<Lengths, Strides>{};
|
||||
}
|
||||
|
||||
// this is ugly, only for 4d
|
||||
template <class Desc, class Reorder>
|
||||
__host__ __device__ constexpr auto get_reordered_4d_tensor_descriptor(Desc, Reorder)
|
||||
{
|
||||
constexpr auto I0 = Number<0>{};
|
||||
constexpr auto I1 = Number<1>{};
|
||||
constexpr auto I2 = Number<2>{};
|
||||
constexpr auto I3 = Number<3>{};
|
||||
|
||||
constexpr auto IT0 = Reorder{}.GetNumber(I0);
|
||||
constexpr auto IT1 = Reorder{}.GetNumber(I1);
|
||||
constexpr auto IT2 = Reorder{}.GetNumber(I2);
|
||||
constexpr auto IT3 = Reorder{}.GetNumber(I3);
|
||||
|
||||
constexpr unsigned L0 = Desc{}.GetLength(IT0);
|
||||
constexpr unsigned L1 = Desc{}.GetLength(IT1);
|
||||
constexpr unsigned L2 = Desc{}.GetLength(IT2);
|
||||
constexpr unsigned L3 = Desc{}.GetLength(IT3);
|
||||
|
||||
return make_ConstantTensorDescriptor(Sequence<L0, L1, L2, L3>{});
|
||||
}
|
||||
|
||||
// this is ugly, only for 4d
|
||||
template <class InDesc, class WeiDesc>
|
||||
__host__ __device__ constexpr auto get_convolution_output_4d_tensor_descriptor(InDesc, WeiDesc)
|
||||
|
||||
@@ -122,18 +122,13 @@ __global__ void gridwise_direct_convolution_1(InGlobalDesc,
|
||||
#endif
|
||||
|
||||
// set output tensor in LDS to 0
|
||||
blockwise_4d_tensor_set_zero<TFloat, decltype(out_block_desc), BlockSize>(out_block_desc,
|
||||
p_out_block);
|
||||
blockwise_4d_tensor_set_zero<BlockSize>(out_block_desc, p_out_block);
|
||||
|
||||
for(unsigned c_block_work_begin = 0; c_block_work_begin < in_global_desc.GetLength(I1);
|
||||
c_block_work_begin += CPerBlock)
|
||||
{
|
||||
// copy input tensor to LDS
|
||||
blockwise_4d_tensor_copy<TFloat,
|
||||
decltype(in_block_global_desc),
|
||||
decltype(in_block_desc),
|
||||
decltype(in_block_desc),
|
||||
BlockSize>(in_block_global_desc,
|
||||
blockwise_4d_tensor_copy<BlockSize>(in_block_global_desc,
|
||||
p_in_global +
|
||||
in_global_desc.Get1dIndex(n_block_work_begin,
|
||||
c_block_work_begin,
|
||||
@@ -144,11 +139,7 @@ __global__ void gridwise_direct_convolution_1(InGlobalDesc,
|
||||
in_block_desc);
|
||||
|
||||
// copy weight tensor to LDS
|
||||
blockwise_4d_tensor_copy<TFloat,
|
||||
decltype(wei_block_global_desc),
|
||||
decltype(wei_block_desc),
|
||||
decltype(wei_block_desc),
|
||||
BlockSize>(
|
||||
blockwise_4d_tensor_copy<BlockSize>(
|
||||
wei_block_global_desc,
|
||||
p_wei_global + wei_global_desc.Get1dIndex(k_block_work_begin, c_block_work_begin, 0, 0),
|
||||
wei_block_desc,
|
||||
@@ -158,7 +149,8 @@ __global__ void gridwise_direct_convolution_1(InGlobalDesc,
|
||||
__syncthreads();
|
||||
|
||||
// blockwise convolution
|
||||
blockwise_direct_convolution<TFloat,
|
||||
blockwise_direct_convolution<BlockSize,
|
||||
TFloat,
|
||||
decltype(in_block_desc),
|
||||
decltype(wei_block_desc),
|
||||
decltype(out_block_desc),
|
||||
@@ -166,19 +158,14 @@ __global__ void gridwise_direct_convolution_1(InGlobalDesc,
|
||||
OutTileSizeW,
|
||||
NPerThread,
|
||||
KPerThread,
|
||||
CPerThread,
|
||||
BlockSize>(
|
||||
CPerThread>(
|
||||
in_block_desc, p_in_block, wei_block_desc, p_wei_block, out_block_desc, p_out_block);
|
||||
|
||||
__syncthreads();
|
||||
}
|
||||
|
||||
// copy output tensor from LDS to device mem
|
||||
blockwise_4d_tensor_copy<TFloat,
|
||||
decltype(out_block_desc),
|
||||
decltype(out_block_global_desc),
|
||||
decltype(out_block_desc),
|
||||
BlockSize>(
|
||||
blockwise_4d_tensor_copy<BlockSize>(
|
||||
out_block_desc,
|
||||
p_out_block,
|
||||
out_block_global_desc,
|
||||
@@ -186,4 +173,4 @@ __global__ void gridwise_direct_convolution_1(InGlobalDesc,
|
||||
out_global_desc.Get1dIndex(
|
||||
n_block_work_begin, k_block_work_begin, ho_block_work_begin, wo_block_work_begin),
|
||||
out_block_desc);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -151,11 +151,7 @@ __global__ void gridwise_direct_convolution_2(InGlobalDesc,
|
||||
c_block_data_begin += CPerBlock, __syncthreads())
|
||||
{
|
||||
// copy input tensor to LDS
|
||||
blockwise_4d_tensor_copy<TFloat,
|
||||
decltype(in_global_desc),
|
||||
decltype(in_block_desc),
|
||||
decltype(in_block_desc),
|
||||
BlockSize>(in_global_desc,
|
||||
blockwise_4d_tensor_copy<BlockSize>(in_global_desc,
|
||||
p_in_global +
|
||||
in_global_desc.Get1dIndex(n_block_data_begin,
|
||||
c_block_data_begin,
|
||||
@@ -166,11 +162,7 @@ __global__ void gridwise_direct_convolution_2(InGlobalDesc,
|
||||
in_block_desc);
|
||||
|
||||
// copy weight tensor to LDS
|
||||
blockwise_4d_tensor_copy<TFloat,
|
||||
decltype(wei_global_desc),
|
||||
decltype(wei_block_desc),
|
||||
decltype(wei_block_desc),
|
||||
BlockSize>(
|
||||
blockwise_4d_tensor_copy<BlockSize>(
|
||||
wei_global_desc,
|
||||
p_wei_global + wei_global_desc.Get1dIndex(k_block_data_begin, c_block_data_begin, 0, 0),
|
||||
wei_block_desc,
|
||||
@@ -182,7 +174,7 @@ __global__ void gridwise_direct_convolution_2(InGlobalDesc,
|
||||
for(unsigned c_thread_data = 0; c_thread_data < CPerBlock; c_thread_data += CPerThread)
|
||||
{
|
||||
// threadwise convolution
|
||||
#if 0
|
||||
#if 1
|
||||
threadwise_direct_convolution_2(
|
||||
in_thread_block_desc,
|
||||
p_in_block + in_block_desc.Get1dIndex(n_thread_data_begin,
|
||||
|
||||
Reference in New Issue
Block a user