mirror of
https://github.com/ROCm/composable_kernel.git
synced 2026-05-17 03:19:48 +00:00
@@ -7,26 +7,16 @@
|
||||
#include "constant_tensor_descriptor.cuh"
|
||||
#include "device_direct_convolution_1.cuh"
|
||||
#include "device_direct_convolution_2.cuh"
|
||||
//#include "device_winograd_convolution.cuh"
|
||||
|
||||
struct GeneratorConstant
|
||||
{
|
||||
double value = 0;
|
||||
|
||||
template <class... Is>
|
||||
double operator()(Is...)
|
||||
{
|
||||
return value;
|
||||
}
|
||||
};
|
||||
|
||||
struct GeneratorTensor
|
||||
struct GeneratorTensor_1
|
||||
{
|
||||
template <class... Is>
|
||||
double operator()(Is... is)
|
||||
{
|
||||
#if 1
|
||||
#if 0
|
||||
return double(std::rand()) / double(RAND_MAX);
|
||||
#elif 0
|
||||
#elif 1
|
||||
return 1;
|
||||
#elif 0
|
||||
std::initializer_list<std::size_t> ls = {static_cast<std::size_t>(is)...};
|
||||
@@ -395,7 +385,11 @@ int main()
|
||||
Tensor<float> out_host(make_TensorDescriptor(out_desc));
|
||||
Tensor<float> out_device(make_TensorDescriptor(out_desc));
|
||||
|
||||
#if 1
|
||||
#if 0
|
||||
std::size_t num_thread = std::thread::hardware_concurrency();
|
||||
in.GenerateTensorValue(GeneratorTensor_1{}, num_thread);
|
||||
wei.GenerateTensorValue(GeneratorTensor_1{}, num_thread);
|
||||
#elif 0
|
||||
std::size_t num_thread = std::thread::hardware_concurrency();
|
||||
in.GenerateTensorValue(GeneratorTensor_2{-5, 5}, num_thread);
|
||||
wei.GenerateTensorValue(GeneratorTensor_2{-5, 5}, num_thread);
|
||||
@@ -410,7 +404,7 @@ int main()
|
||||
#endif
|
||||
}
|
||||
|
||||
#if 1
|
||||
#if 0
|
||||
host_winograd_3x3_convolution(in, wei, out_host);
|
||||
check_error(out_host, out_device);
|
||||
#elif 0
|
||||
|
||||
@@ -27,15 +27,14 @@ void device_direct_convolution_1(
|
||||
constexpr unsigned OutTileSizeH = 2;
|
||||
constexpr unsigned OutTileSizeW = 2;
|
||||
constexpr unsigned NPerBlock = 1;
|
||||
constexpr unsigned KPerBlock = 4;
|
||||
constexpr unsigned CPerBlock = 2;
|
||||
constexpr unsigned YPerBlock = 8;
|
||||
constexpr unsigned KPerBlock = 16;
|
||||
constexpr unsigned CPerBlock = 4;
|
||||
constexpr unsigned YPerBlock = 4;
|
||||
constexpr unsigned XPerBlock = 16;
|
||||
|
||||
constexpr unsigned NBlockOpLen0 = 1;
|
||||
constexpr unsigned NBlockOpLen1 = 1;
|
||||
constexpr unsigned NBlockOpLen2 = 4;
|
||||
constexpr unsigned NBlockOpLen3 = 32;
|
||||
constexpr unsigned NPerThread = 1;
|
||||
constexpr unsigned KPerThread = 4;
|
||||
constexpr unsigned CPerThread = 2;
|
||||
|
||||
constexpr unsigned BlockSize = 128;
|
||||
|
||||
@@ -66,10 +65,9 @@ void device_direct_convolution_1(
|
||||
CPerBlock,
|
||||
YPerBlock,
|
||||
XPerBlock,
|
||||
NBlockOpLen0,
|
||||
NBlockOpLen1,
|
||||
NBlockOpLen2,
|
||||
NBlockOpLen3,
|
||||
NPerThread,
|
||||
KPerThread,
|
||||
CPerThread,
|
||||
BlockSize,
|
||||
GridSize>
|
||||
<<<grid_dim, block_dim>>>(InDesc{},
|
||||
|
||||
@@ -9,13 +9,16 @@ template <class TFloat,
|
||||
class OutBlockDesc,
|
||||
unsigned OutTileSizeH,
|
||||
unsigned OutTileSizeW,
|
||||
unsigned NPerThread,
|
||||
unsigned KPerThread,
|
||||
unsigned CPerThread,
|
||||
unsigned BlockSize>
|
||||
__device__ void blockwise_convolution(InBlockDesc,
|
||||
TFloat* const __restrict__ p_in_block,
|
||||
WeiBlockDesc,
|
||||
TFloat* const __restrict__ p_wei_block,
|
||||
OutBlockDesc,
|
||||
TFloat* __restrict__ p_out_block)
|
||||
__device__ void blockwise_direct_convolution(InBlockDesc,
|
||||
TFloat* const __restrict__ p_in_block,
|
||||
WeiBlockDesc,
|
||||
TFloat* const __restrict__ p_wei_block,
|
||||
OutBlockDesc,
|
||||
TFloat* __restrict__ p_out_block)
|
||||
{
|
||||
constexpr auto I0 = Number<0>{};
|
||||
constexpr auto I1 = Number<1>{};
|
||||
@@ -29,16 +32,17 @@ __device__ void blockwise_convolution(InBlockDesc,
|
||||
constexpr unsigned S = wei_block_desc.GetLength(I2);
|
||||
constexpr unsigned R = wei_block_desc.GetLength(I3);
|
||||
|
||||
constexpr unsigned NPerBlock = out_block_desc.GetLength(I0);
|
||||
constexpr unsigned KPerBlock = out_block_desc.GetLength(I1);
|
||||
constexpr unsigned YPerBlock = (out_block_desc.GetLength(I2) + OutTileSizeH - 1) / OutTileSizeH;
|
||||
constexpr unsigned XPerBlock = (out_block_desc.GetLength(I3) + OutTileSizeW - 1) / OutTileSizeW;
|
||||
|
||||
constexpr unsigned CPerBlock = in_block_desc.GetLength(I1);
|
||||
|
||||
constexpr unsigned InTileSizeH = OutTileSizeH + S - 1;
|
||||
constexpr unsigned InTileSizeW = OutTileSizeW + R - 1;
|
||||
|
||||
// divide thread work
|
||||
constexpr unsigned NThreadWork = (out_block_desc.GetLength(I0) + NPerThread - 1) / NPerThread;
|
||||
constexpr unsigned KThreadWork = (out_block_desc.GetLength(I1) + KPerThread - 1) / KPerThread;
|
||||
constexpr unsigned YThreadWork =
|
||||
(out_block_desc.GetLength(I2) + OutTileSizeH - 1) / OutTileSizeH;
|
||||
constexpr unsigned XThreadWork =
|
||||
(out_block_desc.GetLength(I3) + OutTileSizeW - 1) / OutTileSizeW;
|
||||
|
||||
#if 0
|
||||
if(threadIdx.x == 0)
|
||||
{
|
||||
@@ -48,90 +52,94 @@ __device__ void blockwise_convolution(InBlockDesc,
|
||||
}
|
||||
#endif
|
||||
|
||||
constexpr auto in_thread_src_desc = make_ConstantTensorDescriptor(
|
||||
Sequence<1, CPerBlock, InTileSizeH, InTileSizeW>{}, in_block_desc.GetStrides());
|
||||
constexpr auto in_thread_desc =
|
||||
make_ConstantTensorDescriptor(Sequence<NPerThread, CPerThread, InTileSizeH, InTileSizeW>{});
|
||||
|
||||
constexpr auto wei_thread_src_desc =
|
||||
make_ConstantTensorDescriptor(Sequence<1, CPerBlock, S, R>{}, wei_block_desc.GetStrides());
|
||||
constexpr auto wei_thread_desc =
|
||||
make_ConstantTensorDescriptor(Sequence<KPerThread, CPerThread, S, R>{});
|
||||
|
||||
constexpr auto out_thread_src_desc = make_ConstantTensorDescriptor(
|
||||
Sequence<1, 1, OutTileSizeH, OutTileSizeW>{}, out_block_desc.GetStrides());
|
||||
constexpr auto out_thread_desc = make_ConstantTensorDescriptor(
|
||||
Sequence<NPerThread, KPerThread, OutTileSizeH, OutTileSizeW>{});
|
||||
|
||||
constexpr auto in_thread_dst_desc =
|
||||
make_ConstantTensorDescriptor(in_thread_src_desc.GetLengths());
|
||||
constexpr auto in_thread_block_desc =
|
||||
make_ConstantTensorDescriptor(in_thread_desc.GetLengths(), in_block_desc.GetStrides());
|
||||
|
||||
constexpr auto wei_thread_dst_desc =
|
||||
make_ConstantTensorDescriptor(wei_thread_src_desc.GetLengths());
|
||||
constexpr auto wei_thread_block_desc =
|
||||
make_ConstantTensorDescriptor(wei_thread_desc.GetLengths(), wei_block_desc.GetStrides());
|
||||
|
||||
constexpr auto out_thread_dst_desc =
|
||||
make_ConstantTensorDescriptor(out_thread_src_desc.GetLengths());
|
||||
constexpr auto out_thread_block_desc =
|
||||
make_ConstantTensorDescriptor(out_thread_desc.GetLengths(), out_block_desc.GetStrides());
|
||||
|
||||
const unsigned thread_id = threadIdx.x;
|
||||
|
||||
for(unsigned thread_work_id = thread_id; thread_work_id < NPerBlock * YPerBlock * XPerBlock;
|
||||
for(unsigned thread_work_id = thread_id;
|
||||
thread_work_id < NThreadWork * KThreadWork * YThreadWork * XThreadWork;
|
||||
thread_work_id += BlockSize)
|
||||
{
|
||||
unsigned itmp = thread_work_id;
|
||||
unsigned n_thread_work_id = itmp / (YPerBlock * XPerBlock);
|
||||
itmp -= n_thread_work_id * (YPerBlock * XPerBlock);
|
||||
unsigned y_thread_work_id = itmp / XPerBlock;
|
||||
unsigned x_thread_work_id = itmp - y_thread_work_id * XPerBlock;
|
||||
unsigned n_thread_work_id = itmp / (KThreadWork * YThreadWork * XThreadWork);
|
||||
itmp -= n_thread_work_id * (KThreadWork * YThreadWork * XThreadWork);
|
||||
unsigned k_thread_work_id = itmp / (YThreadWork * XThreadWork);
|
||||
itmp -= k_thread_work_id * (YThreadWork * XThreadWork);
|
||||
unsigned y_thread_work_id = itmp / XThreadWork;
|
||||
unsigned x_thread_work_id = itmp - y_thread_work_id * XThreadWork;
|
||||
|
||||
unsigned n_thread_work_begin = n_thread_work_id * 1;
|
||||
unsigned ho_thread_work_begin = y_thread_work_id * OutTileSizeH;
|
||||
unsigned wo_thread_work_begin = x_thread_work_id * OutTileSizeW;
|
||||
unsigned n_thread_data_begin = n_thread_work_id * NPerThread;
|
||||
unsigned k_thread_data_begin = k_thread_work_id * KPerThread;
|
||||
unsigned ho_thread_data_begin = y_thread_work_id * OutTileSizeH;
|
||||
unsigned wo_thread_data_begin = x_thread_work_id * OutTileSizeW;
|
||||
|
||||
unsigned hi_thread_work_begin = ho_thread_work_begin; // minus padding
|
||||
unsigned wi_thread_work_begin = wo_thread_work_begin; // minus padding
|
||||
unsigned hi_thread_data_begin = ho_thread_data_begin; // minus padding
|
||||
unsigned wi_thread_data_begin = wo_thread_data_begin; // minus padding
|
||||
|
||||
TFloat p_in_thread[in_thread_src_desc.GetElementSpace()];
|
||||
TFloat p_wei_thread[wei_thread_src_desc.GetElementSpace()];
|
||||
TFloat p_out_thread[out_thread_src_desc.GetElementSpace()];
|
||||
TFloat p_in_thread[in_thread_desc.GetElementSpace()];
|
||||
TFloat p_wei_thread[wei_thread_desc.GetElementSpace()];
|
||||
TFloat p_out_thread[out_thread_desc.GetElementSpace()];
|
||||
|
||||
// copy input tensor into register
|
||||
threadwise_4d_tensor_copy(
|
||||
in_thread_src_desc,
|
||||
p_in_block + in_block_desc.Get1dIndex(
|
||||
n_thread_work_begin, 0, hi_thread_work_begin, wi_thread_work_begin),
|
||||
in_thread_dst_desc,
|
||||
p_in_thread);
|
||||
threadwise_4d_tensor_copy(out_thread_block_desc,
|
||||
p_out_block + out_block_desc.Get1dIndex(n_thread_data_begin,
|
||||
k_thread_data_begin,
|
||||
ho_thread_data_begin,
|
||||
wo_thread_data_begin),
|
||||
out_thread_desc,
|
||||
p_out_thread);
|
||||
|
||||
for(unsigned k_thread_work_begin = 0; k_thread_work_begin < KPerBlock;
|
||||
++k_thread_work_begin)
|
||||
for(unsigned c_thread_data_begin = 0; c_thread_data_begin < in_block_desc.GetLength(I1);
|
||||
c_thread_data_begin += CPerThread)
|
||||
{
|
||||
// copy weight tensor into register
|
||||
threadwise_4d_tensor_copy(wei_thread_src_desc,
|
||||
p_wei_block +
|
||||
wei_block_desc.Get1dIndex(k_thread_work_begin, 0, 0, 0),
|
||||
wei_thread_dst_desc,
|
||||
p_wei_thread);
|
||||
// copy input into register
|
||||
threadwise_4d_tensor_copy(in_thread_block_desc,
|
||||
p_in_block + in_block_desc.Get1dIndex(n_thread_data_begin,
|
||||
c_thread_data_begin,
|
||||
hi_thread_data_begin,
|
||||
wi_thread_data_begin),
|
||||
in_thread_desc,
|
||||
p_in_thread);
|
||||
|
||||
// copy output tensor into register
|
||||
threadwise_4d_tensor_copy(out_thread_src_desc,
|
||||
p_out_block + out_block_desc.Get1dIndex(n_thread_work_begin,
|
||||
k_thread_work_begin,
|
||||
ho_thread_work_begin,
|
||||
wo_thread_work_begin),
|
||||
out_thread_dst_desc,
|
||||
p_out_thread);
|
||||
// copy weight into register
|
||||
threadwise_4d_tensor_copy(
|
||||
wei_thread_block_desc,
|
||||
p_wei_block +
|
||||
wei_block_desc.Get1dIndex(k_thread_data_begin, c_thread_data_begin, 0, 0),
|
||||
wei_thread_desc,
|
||||
p_wei_thread);
|
||||
|
||||
// threadwise convolution
|
||||
threadwise_direct_convolution(in_thread_dst_desc,
|
||||
threadwise_direct_convolution(in_thread_desc,
|
||||
p_in_thread,
|
||||
wei_thread_dst_desc,
|
||||
wei_thread_desc,
|
||||
p_wei_thread,
|
||||
out_thread_dst_desc,
|
||||
out_thread_desc,
|
||||
p_out_thread);
|
||||
|
||||
// accumulate output tensor into LDS
|
||||
threadwise_4d_tensor_copy(out_thread_dst_desc,
|
||||
p_out_thread,
|
||||
out_thread_src_desc,
|
||||
p_out_block +
|
||||
out_block_desc.Get1dIndex(n_thread_work_begin,
|
||||
k_thread_work_begin,
|
||||
ho_thread_work_begin,
|
||||
wo_thread_work_begin));
|
||||
}
|
||||
|
||||
// copy output into LDS
|
||||
threadwise_4d_tensor_copy(out_thread_desc,
|
||||
p_out_thread,
|
||||
out_thread_block_desc,
|
||||
p_out_block + out_block_desc.Get1dIndex(n_thread_data_begin,
|
||||
k_thread_data_begin,
|
||||
ho_thread_data_begin,
|
||||
wo_thread_data_begin));
|
||||
}
|
||||
}
|
||||
|
||||
@@ -14,10 +14,9 @@ template <class TFloat,
|
||||
unsigned CPerBlock,
|
||||
unsigned YPerBlock,
|
||||
unsigned XPerBlock,
|
||||
unsigned NBlockOpLen0,
|
||||
unsigned NBlockOpLen1,
|
||||
unsigned NBlockOpLen2,
|
||||
unsigned NBlockOpLen3,
|
||||
unsigned NPerThread,
|
||||
unsigned KPerThread,
|
||||
unsigned CPerThread,
|
||||
unsigned BlockSize,
|
||||
unsigned GridSize>
|
||||
__global__ void gridwise_direct_convolution_1(InGlobalDesc,
|
||||
@@ -125,9 +124,8 @@ __global__ void gridwise_direct_convolution_1(InGlobalDesc,
|
||||
p_out_block);
|
||||
|
||||
for(unsigned c_block_work_begin = 0; c_block_work_begin < in_global_desc.GetLength(I1);
|
||||
c_block_work_begin += CPerBlock, __syncthreads())
|
||||
c_block_work_begin += CPerBlock)
|
||||
{
|
||||
|
||||
// copy input tensor to LDS
|
||||
blockwise_4d_tensor_copy<TFloat,
|
||||
decltype(in_block_src_desc),
|
||||
@@ -154,14 +152,19 @@ __global__ void gridwise_direct_convolution_1(InGlobalDesc,
|
||||
__syncthreads();
|
||||
|
||||
// blockwise convolution
|
||||
blockwise_convolution<TFloat,
|
||||
decltype(in_block_desc),
|
||||
decltype(wei_block_desc),
|
||||
decltype(out_block_desc),
|
||||
OutTileSizeH,
|
||||
OutTileSizeW,
|
||||
BlockSize>(
|
||||
blockwise_direct_convolution<TFloat,
|
||||
decltype(in_block_desc),
|
||||
decltype(wei_block_desc),
|
||||
decltype(out_block_desc),
|
||||
OutTileSizeH,
|
||||
OutTileSizeW,
|
||||
NPerThread,
|
||||
KPerThread,
|
||||
CPerThread,
|
||||
BlockSize>(
|
||||
in_block_desc, p_in_block, wei_block_desc, p_wei_block, out_block_desc, p_out_block);
|
||||
|
||||
__syncthreads();
|
||||
}
|
||||
|
||||
// copy output tensor from LDS to device mem
|
||||
|
||||
@@ -108,16 +108,16 @@ __global__ void gridwise_direct_convolution_2(InGlobalDesc,
|
||||
const unsigned y_block_work_id = itmp / XBlockWork;
|
||||
const unsigned x_block_work_id = itmp - y_block_work_id * XBlockWork;
|
||||
|
||||
const unsigned n_block_data_offset = n_block_work_id * NPerBlock;
|
||||
const unsigned k_block_data_offset = k_block_work_id * KPerBlock;
|
||||
const unsigned y_block_data_offset = y_block_work_id * YPerBlock;
|
||||
const unsigned x_block_data_offset = x_block_work_id * XPerBlock;
|
||||
const unsigned n_block_data_begin = n_block_work_id * NPerBlock;
|
||||
const unsigned k_block_data_begin = k_block_work_id * KPerBlock;
|
||||
const unsigned y_block_data_begin = y_block_work_id * YPerBlock;
|
||||
const unsigned x_block_data_begin = x_block_work_id * XPerBlock;
|
||||
|
||||
const unsigned ho_block_data_offset = y_block_data_offset * OutTileSizeH;
|
||||
const unsigned wo_block_data_offset = x_block_data_offset * OutTileSizeW;
|
||||
const unsigned ho_block_data_begin = y_block_data_begin * OutTileSizeH;
|
||||
const unsigned wo_block_data_begin = x_block_data_begin * OutTileSizeW;
|
||||
|
||||
const unsigned hi_block_data_offset = ho_block_data_offset; // minus padding
|
||||
const unsigned wi_block_data_offset = wo_block_data_offset; // minus padding
|
||||
const unsigned hi_block_data_begin = ho_block_data_begin; // minus padding
|
||||
const unsigned wi_block_data_begin = wo_block_data_begin; // minus padding
|
||||
|
||||
// divide thread work
|
||||
constexpr unsigned NThreadWork = (NPerBlock + NPerThread - 1) / NPerThread;
|
||||
@@ -135,13 +135,13 @@ __global__ void gridwise_direct_convolution_2(InGlobalDesc,
|
||||
const unsigned y_thread_work_id = itmp / XThreadWork;
|
||||
const unsigned x_thread_work_id = itmp - y_thread_work_id * XThreadWork;
|
||||
|
||||
const unsigned n_thread_data_offset = n_thread_work_id * NPerThread;
|
||||
const unsigned k_thread_data_offset = k_thread_work_id * KPerThread;
|
||||
const unsigned ho_thread_data_offset = y_thread_work_id * OutTileSizeH;
|
||||
const unsigned wo_thread_data_offset = x_thread_work_id * OutTileSizeW;
|
||||
const unsigned n_thread_data_begin = n_thread_work_id * NPerThread;
|
||||
const unsigned k_thread_data_begin = k_thread_work_id * KPerThread;
|
||||
const unsigned ho_thread_data_begin = y_thread_work_id * OutTileSizeH;
|
||||
const unsigned wo_thread_data_begin = x_thread_work_id * OutTileSizeW;
|
||||
|
||||
const unsigned hi_thread_data_offset = ho_thread_data_offset;
|
||||
const unsigned wi_thread_data_offset = wo_thread_data_offset;
|
||||
const unsigned hi_thread_data_begin = ho_thread_data_begin;
|
||||
const unsigned wi_thread_data_begin = wo_thread_data_begin;
|
||||
|
||||
#if 0
|
||||
if(threadIdx.x == 0)
|
||||
@@ -152,20 +152,20 @@ __global__ void gridwise_direct_convolution_2(InGlobalDesc,
|
||||
}
|
||||
|
||||
printf("threadIdx.x %u \t"
|
||||
"n_thread_data_offset %u, k_thread_data_offset %u, ho_thread_data_offset %u, "
|
||||
"wo_thread_data_offset %u\n",
|
||||
"n_thread_data_begin %u, k_thread_data_begin %u, ho_thread_data_begin %u, "
|
||||
"wo_thread_data_begin %u\n",
|
||||
threadIdx.x,
|
||||
n_thread_data_offset,
|
||||
k_thread_data_offset,
|
||||
ho_thread_data_offset,
|
||||
wo_thread_data_offset);
|
||||
n_thread_data_begin,
|
||||
k_thread_data_begin,
|
||||
ho_thread_data_begin,
|
||||
wo_thread_data_begin);
|
||||
#endif
|
||||
|
||||
// set threadwise output tensor to 0
|
||||
threadwise_4d_tensor_set_zero(out_thread_desc, p_out_thread);
|
||||
|
||||
for(unsigned c_block_data_offset = 0; c_block_data_offset < in_global_desc.GetLength(I1);
|
||||
c_block_data_offset += CPerBlock, __syncthreads())
|
||||
for(unsigned c_block_data_begin = 0; c_block_data_begin < in_global_desc.GetLength(I1);
|
||||
c_block_data_begin += CPerBlock, __syncthreads())
|
||||
{
|
||||
// copy input tensor to LDS
|
||||
blockwise_4d_tensor_copy<TFloat,
|
||||
@@ -173,10 +173,10 @@ __global__ void gridwise_direct_convolution_2(InGlobalDesc,
|
||||
decltype(in_block_desc),
|
||||
BlockSize>(in_block_global_desc,
|
||||
p_in_global +
|
||||
in_global_desc.Get1dIndex(n_block_data_offset,
|
||||
c_block_data_offset,
|
||||
hi_block_data_offset,
|
||||
wi_block_data_offset),
|
||||
in_global_desc.Get1dIndex(n_block_data_begin,
|
||||
c_block_data_begin,
|
||||
hi_block_data_begin,
|
||||
wi_block_data_begin),
|
||||
in_block_desc,
|
||||
p_in_block);
|
||||
|
||||
@@ -186,8 +186,7 @@ __global__ void gridwise_direct_convolution_2(InGlobalDesc,
|
||||
decltype(wei_block_desc),
|
||||
BlockSize>(
|
||||
wei_block_global_desc,
|
||||
p_wei_global +
|
||||
wei_global_desc.Get1dIndex(k_block_data_offset, c_block_data_offset, 0, 0),
|
||||
p_wei_global + wei_global_desc.Get1dIndex(k_block_data_begin, c_block_data_begin, 0, 0),
|
||||
wei_block_desc,
|
||||
p_wei_block);
|
||||
|
||||
@@ -197,30 +196,27 @@ __global__ void gridwise_direct_convolution_2(InGlobalDesc,
|
||||
{
|
||||
// copy input tensor into register
|
||||
threadwise_4d_tensor_copy(in_thread_block_desc,
|
||||
p_in_block + in_block_desc.Get1dIndex(n_thread_data_offset,
|
||||
p_in_block + in_block_desc.Get1dIndex(n_thread_data_begin,
|
||||
c_thread_data,
|
||||
hi_thread_data_offset,
|
||||
wi_thread_data_offset),
|
||||
hi_thread_data_begin,
|
||||
wi_thread_data_begin),
|
||||
in_thread_desc,
|
||||
p_in_thread);
|
||||
|
||||
// copy weight tensor into register
|
||||
threadwise_4d_tensor_copy(
|
||||
wei_thread_block_desc,
|
||||
p_wei_block + wei_block_desc.Get1dIndex(k_thread_data_offset, c_thread_data, 0, 0),
|
||||
p_wei_block + wei_block_desc.Get1dIndex(k_thread_data_begin, c_thread_data, 0, 0),
|
||||
wei_thread_desc,
|
||||
p_wei_thread);
|
||||
|
||||
// threadwise convolution
|
||||
threadwise_direct_convolution<TFloat,
|
||||
decltype(in_thread_desc),
|
||||
decltype(wei_thread_desc),
|
||||
decltype(out_thread_desc)>(in_thread_desc,
|
||||
p_in_thread,
|
||||
wei_thread_desc,
|
||||
p_wei_thread,
|
||||
out_thread_desc,
|
||||
p_out_thread);
|
||||
threadwise_direct_convolution(in_thread_desc,
|
||||
p_in_thread,
|
||||
wei_thread_desc,
|
||||
p_wei_thread,
|
||||
out_thread_desc,
|
||||
p_out_thread);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -229,8 +225,8 @@ __global__ void gridwise_direct_convolution_2(InGlobalDesc,
|
||||
out_thread_desc,
|
||||
p_out_thread,
|
||||
out_thread_global_desc,
|
||||
p_out_global + out_global_desc.Get1dIndex(n_block_data_offset + n_thread_data_offset,
|
||||
k_block_data_offset + k_thread_data_offset,
|
||||
ho_block_data_offset + ho_thread_data_offset,
|
||||
wo_block_data_offset + wo_thread_data_offset));
|
||||
p_out_global + out_global_desc.Get1dIndex(n_block_data_begin + n_thread_data_begin,
|
||||
k_block_data_begin + k_thread_data_begin,
|
||||
ho_block_data_begin + ho_thread_data_begin,
|
||||
wo_block_data_begin + wo_thread_data_begin));
|
||||
}
|
||||
|
||||
237
src/include/gridwise_winograd_convolution.cuh
Normal file
237
src/include/gridwise_winograd_convolution.cuh
Normal file
@@ -0,0 +1,237 @@
|
||||
#pragma once
|
||||
#include "constant_tensor_descriptor.cuh"
|
||||
#include "blockwise_winograd_transform.cuh"
|
||||
#include "threadwise_winograd_transform.cuh"
|
||||
|
||||
template <class TFloat,
|
||||
class InGlobalDesc,
|
||||
class WeiGlobalDesc,
|
||||
class OutGlobalDesc,
|
||||
unsigned OutTileSizeH,
|
||||
unsigned OutTileSizeW,
|
||||
unsigned NPerBlock,
|
||||
unsigned KPerBlock,
|
||||
unsigned CPerBlock,
|
||||
unsigned YPerBlock,
|
||||
unsigned XPerBlock,
|
||||
unsigned NPerThread,
|
||||
unsigned KPerThread,
|
||||
unsigned CPerThread,
|
||||
unsigned BlockSize,
|
||||
unsigned GridSize>
|
||||
__global__ void gridwise_winograd_convolution(InGlobalDesc,
|
||||
TFloat* const __restrict__ p_in_global,
|
||||
WeiGlobalDesc,
|
||||
TFloat* const __restrict__ p_wei_global,
|
||||
OutGlobalDesc,
|
||||
TFloat* __restrict__ p_out_global)
|
||||
{
|
||||
constexpr auto I0 = Number<0>{};
|
||||
constexpr auto I1 = Number<1>{};
|
||||
constexpr auto I2 = Number<2>{};
|
||||
constexpr auto I3 = Number<3>{};
|
||||
|
||||
constexpr auto in_global_desc = InGlobalDesc{};
|
||||
constexpr auto wei_global_desc = WeiGlobalDesc{};
|
||||
constexpr auto out_global_desc = OutGlobalDesc{};
|
||||
|
||||
constexpr unsigned S = wei_global_desc.GetLength(I2);
|
||||
constexpr unsigned R = wei_global_desc.GetLength(I3);
|
||||
|
||||
constexpr unsigned HoPerBlock = OutTileSizeH * YPerBlock;
|
||||
constexpr unsigned WoPerBlock = OutTileSizeW * XPerBlock;
|
||||
|
||||
constexpr unsigned HiPerBlock = YPerBlock * OutTileSizeH + S - 1;
|
||||
constexpr unsigned WiPerBlock = XPerBlock * OutTileSizeW + R - 1;
|
||||
|
||||
constexpr unsigned InTileSizeH = OutTileSizeH + S - 1;
|
||||
constexpr unsigned InTileSizeW = OutTileSizeW + R - 1;
|
||||
|
||||
// divide block work
|
||||
constexpr unsigned NBlockWork = (out_global_desc.GetLength(I0) + NPerBlock - 1) / NPerBlock;
|
||||
constexpr unsigned KBlockWork = (out_global_desc.GetLength(I1) + KPerBlock - 1) / KPerBlock;
|
||||
constexpr unsigned YBlockWork = (out_global_desc.GetLength(I2) + HoPerBlock - 1) / HoPerBlock;
|
||||
constexpr unsigned XBlockWork = (out_global_desc.GetLength(I3) + WoPerBlock - 1) / WoPerBlock;
|
||||
|
||||
const unsigned block_id = blockIdx.x;
|
||||
|
||||
unsigned itmp = block_id;
|
||||
const unsigned n_block_work_id = itmp / (KBlockWork * YBlockWork * XBlockWork);
|
||||
itmp -= n_block_work_id * (KBlockWork * YBlockWork * XBlockWork);
|
||||
const unsigned k_block_work_id = itmp / (YBlockWork * XBlockWork);
|
||||
itmp -= k_block_work_id * (YBlockWork * XBlockWork);
|
||||
const unsigned y_block_work_id = itmp / XBlockWork;
|
||||
const unsigned x_block_work_id = itmp - y_block_work_id * XBlockWork;
|
||||
|
||||
const unsigned n_block_data_begin = n_block_work_id * NPerBlock;
|
||||
const unsigned k_block_data_begin = k_block_work_id * KPerBlock;
|
||||
const unsigned y_block_data_begin = y_block_work_id * YPerBlock;
|
||||
const unsigned x_block_data_begin = x_block_work_id * XPerBlock;
|
||||
|
||||
const unsigned ho_block_data_begin = y_block_data_begin * OutTileSizeH;
|
||||
const unsigned wo_block_data_begin = x_block_data_begin * OutTileSizeW;
|
||||
|
||||
const unsigned hi_block_data_begin = ho_block_data_begin; // minus padding
|
||||
const unsigned wi_block_data_begin = wo_block_data_begin; // minus padding
|
||||
|
||||
// divide thread work
|
||||
constexpr unsigned NThreadWork = (NPerBlock + NPerThread - 1) / NPerThread;
|
||||
constexpr unsigned KThreadWork = (KPerBlock + KPerThread - 1) / KPerThread;
|
||||
constexpr unsigned YThreadWork = YPerBlock;
|
||||
constexpr unsigned XThreadWork = XPerBlock;
|
||||
|
||||
const unsigned thread_id = threadIdx.x;
|
||||
|
||||
itmp = thread_id;
|
||||
const unsigned n_thread_work_id = itmp / (KThreadWork * YThreadWork * XThreadWork);
|
||||
itmp -= n_thread_work_id * (KThreadWork * YThreadWork * XThreadWork);
|
||||
const unsigned k_thread_work_id = itmp / (YThreadWork * XThreadWork);
|
||||
itmp -= k_thread_work_id * (YThreadWork * XThreadWork);
|
||||
const unsigned y_thread_work_id = itmp / XThreadWork;
|
||||
const unsigned x_thread_work_id = itmp - y_thread_work_id * XThreadWork;
|
||||
|
||||
const unsigned n_thread_data_begin = n_thread_work_id * NPerThread;
|
||||
const unsigned k_thread_data_begin = k_thread_work_id * KPerThread;
|
||||
const unsigned y_thread_data_begin = y_thread_work_id;
|
||||
const unsigned x_thread_data_begin = x_thread_work_id;
|
||||
|
||||
// block data
|
||||
constexpr auto in_transform_block_desc = make_ConstantTensorDescriptor(
|
||||
Sequence<NPerBlock, CPerBlock, YPerBlock * InTileSizeH, XPerBlock * InTileSizeW>{});
|
||||
|
||||
constexpr auto wei_transform_block_desc =
|
||||
make_ConstantTensorDescriptor(Sequence<KPerBlock, CPerBlock, InTileSizeH, InTileSizeW>{});
|
||||
|
||||
constexpr unsigned in_transform_block_size = in_transform_block_desc.GetElementSpace();
|
||||
constexpr unsigned wei_transform_block_size = wei_transform_block_desc.GetElementSpace();
|
||||
|
||||
__shared__ TFloat p_in_transform_block[in_transform_block_size];
|
||||
__shared__ TFloat p_wei_transform_block[wei_transform_block_size];
|
||||
|
||||
// thread data
|
||||
constexpr auto in_transform_thread_block_desc =
|
||||
make_ConstantTensorDescriptor(Sequence<NPerThread, CPerThread, InTileSizeH, InTileSizeW>{},
|
||||
in_transform_block_desc.GetStrides());
|
||||
|
||||
constexpr auto wei_transform_thread_block_desc =
|
||||
make_ConstantTensorDescriptor(Sequence<KPerThread, CPerThread, InTileSizeH, InTileSizeW>{},
|
||||
wei_transform_block_desc.GetStrides());
|
||||
|
||||
constexpr auto out_transform_thread_desc =
|
||||
make_ConstantTensorDescriptor(Sequence<NPerThread, KPerThread, InTileSizeH, InTileSizeW>{});
|
||||
|
||||
constexpr auto out_thread_desc = make_ConstantTensorDescriptor(
|
||||
Sequence<NPerThread, KPerThread, OutTileSizeH, OutTileSizeW>{});
|
||||
|
||||
constexpr auto out_thread_global_desc =
|
||||
make_ConstantTensorDescriptor(out_thread_desc.GetLengths(), out_global_desc.GetStrides());
|
||||
|
||||
constexpr unsigned out_transform_thread_size = out_transform_thread_desc.GetElementSpace();
|
||||
constexpr unsigned out_thread_size = out_thread_desc.GetElementSpace();
|
||||
|
||||
TFloat p_out_transform_thread[out_transform_thread_size];
|
||||
TFloat p_out_thread[out_thread_size];
|
||||
|
||||
#if 0
|
||||
if(blockIdx.x == 0 && threadIdx.x == 0)
|
||||
{
|
||||
printf("in_transform_block_size %u, wei_transform_block_size %u, out_transform_thread_size "
|
||||
"%u, out_thread_size %u \n",
|
||||
in_transform_block_size,
|
||||
wei_transform_block_size,
|
||||
out_transform_thread_size,
|
||||
out_thread_size);
|
||||
}
|
||||
#endif
|
||||
|
||||
// set threadwise output transform tensor to 0
|
||||
threadwise_4d_tensor_set_zero(out_transform_thread_desc, p_out_transform_thread);
|
||||
|
||||
for(unsigned c_block_data_begin = 0; c_block_data_begin < in_global_desc.GetLength(I1);
|
||||
c_block_data_begin += CPerBlock, __syncthreads())
|
||||
{
|
||||
#if 0
|
||||
// blockwise transform input
|
||||
blockwise_winograd_transform_input<TFloat,
|
||||
InTileSizeH,
|
||||
InTileSizeW,
|
||||
S,
|
||||
R,
|
||||
OutTileSizeH,
|
||||
OutTileSizeW,
|
||||
NPerBlock,
|
||||
CPerBlock,
|
||||
YPerBlock,
|
||||
XPerBlock,
|
||||
BlockSize>(
|
||||
p_in_global + in_global_desc.Get1dIndex(n_block_data_begin,
|
||||
c_block_data_begin,
|
||||
hi_block_data_begin,
|
||||
wi_block_data_begin),
|
||||
p_in_transform_block);
|
||||
|
||||
#endif
|
||||
// blockwise transform weights
|
||||
blockwise_winograd_transform_weight<TFloat,
|
||||
InTileSizeH,
|
||||
InTileSizeW,
|
||||
S,
|
||||
R,
|
||||
OutTileSizeH,
|
||||
OutTileSizeW,
|
||||
KPerBlock,
|
||||
CPerBlock,
|
||||
BlockSize>(
|
||||
p_wei_global + wei_global_desc.Get1dIndex(k_block_data_begin, c_block_data_begin, 0, 0),
|
||||
p_wei_transform_block);
|
||||
|
||||
for(unsigned c_thread_data = 0; c_thread_data < CPerBlock; c_thread_data += CPerThread)
|
||||
{
|
||||
// threadwise point multiplication
|
||||
threadwise_winograd_calculate_transformed_output<
|
||||
TFloat,
|
||||
decltype(in_transform_thread_block_desc),
|
||||
decltype(wei_transform_thread_block_desc),
|
||||
decltype(out_transform_thread_desc),
|
||||
InTileSizeH,
|
||||
InTileSizeW,
|
||||
S,
|
||||
R,
|
||||
OutTileSizeH,
|
||||
OutTileSizeW>(in_transform_thread_block_desc,
|
||||
p_in_transform_block + in_transform_block_desc.Get1dIndex(
|
||||
n_thread_data_begin,
|
||||
c_thread_data,
|
||||
y_thread_data_begin * InTileSizeH,
|
||||
x_thread_data_begin * InTileSizeW),
|
||||
wei_transform_thread_block_desc,
|
||||
p_wei_transform_block + wei_transform_block_desc.Get1dIndex(
|
||||
k_thread_data_begin, c_thread_data, 0, 0),
|
||||
out_transform_thread_desc,
|
||||
p_out_transform_thread);
|
||||
}
|
||||
};
|
||||
|
||||
// transform back
|
||||
threadwise_winograd_reverse_transform_output<TFloat,
|
||||
decltype(out_transform_thread_desc),
|
||||
decltype(out_thread_desc),
|
||||
InTileSizeH,
|
||||
InTileSizeW,
|
||||
S,
|
||||
R,
|
||||
OutTileSizeH,
|
||||
OutTileSizeW>(
|
||||
out_transform_thread_desc, p_out_transform_thread, out_thread_desc, p_out_thread);
|
||||
|
||||
// copy output tensor from register to global mem
|
||||
threadwise_4d_tensor_copy(
|
||||
out_thread_desc,
|
||||
p_out_thread,
|
||||
out_thread_global_desc,
|
||||
p_out_global +
|
||||
out_global_desc.Get1dIndex(n_block_data_begin + n_thread_data_begin,
|
||||
k_block_data_begin + k_thread_data_begin,
|
||||
ho_block_data_begin + y_thread_data_begin * OutTileSizeH,
|
||||
wo_block_data_begin + x_thread_data_begin * OutTileSizeW));
|
||||
}
|
||||
@@ -19,11 +19,11 @@ __device__ void threadwise_direct_convolution(InDesc,
|
||||
constexpr auto out_desc = OutDesc{};
|
||||
|
||||
#if 0
|
||||
if(threadIdx.x == 0)
|
||||
if(blockIdx.x == 0 && threadIdx.x == 0)
|
||||
{
|
||||
print_ConstantTensorDescriptor(in_desc, "threadwise_direct_convolution: ");
|
||||
print_ConstantTensorDescriptor(wei_desc, "threadwise_direct_convolution: ");
|
||||
print_ConstantTensorDescriptor(out_desc, "threadwise_direct_convolution: ");
|
||||
print_ConstantTensorDescriptor(in_desc, "threadwise_direct_convolution: in_desc: ");
|
||||
print_ConstantTensorDescriptor(wei_desc, "threadwise_direct_convolution: wei_desc: ");
|
||||
print_ConstantTensorDescriptor(out_desc, "threadwise_direct_convolution: out_desc: ");
|
||||
}
|
||||
#endif
|
||||
|
||||
|
||||
@@ -2,7 +2,7 @@
|
||||
#include "constant_tensor_descriptor.cuh"
|
||||
|
||||
template <class TFloat, class Desc, class F>
|
||||
__device__ void threadwise_4d_tensor_pointwise_op_unary(Desc, TFloat* __restrict__ p_dst, F f)
|
||||
__device__ void threadwise_4d_tensor_pointwise_op_unary(Desc, TFloat* __restrict__ p, F f)
|
||||
{
|
||||
constexpr auto I0 = Number<0>{};
|
||||
constexpr auto I1 = Number<1>{};
|
||||
@@ -28,7 +28,7 @@ __device__ void threadwise_4d_tensor_pointwise_op_unary(Desc, TFloat* __restrict
|
||||
{
|
||||
const unsigned dindex = desc.Get1dIndex(did0, did1, did2, did3);
|
||||
|
||||
f(p_dst[dindex]);
|
||||
f(p[dindex]);
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -77,12 +77,12 @@ __device__ void threadwise_4d_tensor_pointwise_op_binary(
|
||||
}
|
||||
|
||||
template <class TFloat, class Desc>
|
||||
__device__ void threadwise_4d_tensor_set_zero(Desc, TFloat* __restrict__ p_dst)
|
||||
__device__ void threadwise_4d_tensor_set_zero(Desc, TFloat* __restrict__ p)
|
||||
{
|
||||
auto f_set_zero = [](TFloat& v) { v = TFloat(0); };
|
||||
|
||||
threadwise_4d_tensor_pointwise_op_unary<TFloat, Desc, decltype(f_set_zero)>(
|
||||
Desc{}, p_dst, f_set_zero);
|
||||
Desc{}, p, f_set_zero);
|
||||
}
|
||||
|
||||
template <class TFloat, class SrcDesc, class DstDesc>
|
||||
@@ -95,4 +95,49 @@ __device__ void threadwise_4d_tensor_copy(SrcDesc,
|
||||
|
||||
threadwise_4d_tensor_pointwise_op_binary<TFloat, SrcDesc, DstDesc, decltype(f_copy)>(
|
||||
SrcDesc{}, p_src, DstDesc{}, p_dst, f_copy);
|
||||
}
|
||||
|
||||
template <class TFloat, class Desc, class IDim>
|
||||
__device__ void threadwise_4d_tensor_shift_down(Desc, TFloat* __restrict__ p, IDim, unsigned shift)
|
||||
{
|
||||
constexpr auto I0 = Number<0>{};
|
||||
constexpr auto I1 = Number<1>{};
|
||||
constexpr auto I2 = Number<2>{};
|
||||
constexpr auto I3 = Number<3>{};
|
||||
|
||||
constexpr auto desc = Desc{};
|
||||
|
||||
#if 0
|
||||
if(threadIdx.x == 0)
|
||||
{
|
||||
print_ConstantTensorDescriptor(desc, "threadwise_4d_tensor_shift_down: ");
|
||||
}
|
||||
#endif
|
||||
|
||||
const unsigned did0_end =
|
||||
is_same<decltype(I0), IDim>::value ? desc.GetLength(I0) - shift : desc.GetLength(I0);
|
||||
const unsigned did1_end =
|
||||
is_same<decltype(I1), IDim>::value ? desc.GetLength(I1) - shift : desc.GetLength(I1);
|
||||
const unsigned did2_end =
|
||||
is_same<decltype(I2), IDim>::value ? desc.GetLength(I2) - shift : desc.GetLength(I2);
|
||||
const unsigned did3_end =
|
||||
is_same<decltype(I3), IDim>::value ? desc.GetLength(I3) - shift : desc.GetLength(I3);
|
||||
|
||||
for(unsigned did0 = 0; did0 < did0_end; ++did0)
|
||||
{
|
||||
for(unsigned did1 = 0; did1 < did1_end; ++did1)
|
||||
{
|
||||
for(unsigned did2 = 0; did2 < did2_end; ++did2)
|
||||
{
|
||||
for(unsigned did3 = 0; did3 < did3_end; ++did3)
|
||||
{
|
||||
const unsigned dindex = desc.Get1dIndex(did0, did1, did2, did3);
|
||||
|
||||
const unsigned sindex = dindex + shift * desc.GetStride(IDim{});
|
||||
|
||||
p[dindex] = p[sindex];
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
Reference in New Issue
Block a user