mirror of
https://github.com/ROCm/composable_kernel.git
synced 2026-05-12 17:26:00 +00:00
refactor
This commit is contained in:
@@ -58,8 +58,8 @@ __device__ void blockwise_direct_convolution(InBlockDesc,
|
||||
constexpr auto wei_thread_desc =
|
||||
make_ConstantTensorDescriptor(Sequence<KPerThread, CPerThread, S, R>{});
|
||||
|
||||
constexpr auto out_thread_desc = make_ConstantTensorDescriptor(
|
||||
Sequence<NPerThread, KPerThread, OutTileSizeH, OutTileSizeW>{});
|
||||
constexpr auto out_thread_desc =
|
||||
get_output_4d_tensor_descriptor(in_thread_desc, wei_thread_desc);
|
||||
|
||||
constexpr auto in_thread_block_desc =
|
||||
make_ConstantTensorDescriptor(in_thread_desc.GetLengths(), in_block_desc.GetStrides());
|
||||
@@ -92,11 +92,9 @@ __device__ void blockwise_direct_convolution(InBlockDesc,
|
||||
unsigned hi_thread_data_begin = ho_thread_data_begin; // minus padding
|
||||
unsigned wi_thread_data_begin = wo_thread_data_begin; // minus padding
|
||||
|
||||
TFloat p_in_thread[in_thread_desc.GetElementSpace()];
|
||||
TFloat p_wei_thread[wei_thread_desc.GetElementSpace()];
|
||||
TFloat p_out_thread[out_thread_desc.GetElementSpace()];
|
||||
|
||||
threadwise_4d_tensor_copy(out_thread_block_desc,
|
||||
threadwise_4d_tensor_copy(out_block_desc,
|
||||
p_out_block + out_block_desc.Get1dIndex(n_thread_data_begin,
|
||||
k_thread_data_begin,
|
||||
ho_thread_data_begin,
|
||||
@@ -108,38 +106,24 @@ __device__ void blockwise_direct_convolution(InBlockDesc,
|
||||
for(unsigned c_thread_data_begin = 0; c_thread_data_begin < in_block_desc.GetLength(I1);
|
||||
c_thread_data_begin += CPerThread)
|
||||
{
|
||||
// copy input into register
|
||||
threadwise_4d_tensor_copy(in_thread_block_desc,
|
||||
p_in_block + in_block_desc.Get1dIndex(n_thread_data_begin,
|
||||
c_thread_data_begin,
|
||||
hi_thread_data_begin,
|
||||
wi_thread_data_begin),
|
||||
in_thread_desc,
|
||||
p_in_thread,
|
||||
in_thread_desc);
|
||||
|
||||
// copy weight into register
|
||||
threadwise_4d_tensor_copy(
|
||||
// threadwise convolution
|
||||
threadwise_direct_convolution_2(
|
||||
in_thread_block_desc,
|
||||
p_in_block + in_block_desc.Get1dIndex(n_thread_data_begin,
|
||||
c_thread_data_begin,
|
||||
hi_thread_data_begin,
|
||||
wi_thread_data_begin),
|
||||
wei_thread_block_desc,
|
||||
p_wei_block +
|
||||
wei_block_desc.Get1dIndex(k_thread_data_begin, c_thread_data_begin, 0, 0),
|
||||
wei_thread_desc,
|
||||
p_wei_thread,
|
||||
wei_thread_desc);
|
||||
|
||||
// threadwise convolution
|
||||
threadwise_direct_convolution_2(in_thread_desc,
|
||||
p_in_thread,
|
||||
wei_thread_desc,
|
||||
p_wei_thread,
|
||||
out_thread_desc,
|
||||
p_out_thread);
|
||||
out_thread_desc,
|
||||
p_out_thread);
|
||||
}
|
||||
|
||||
// copy output into LDS
|
||||
threadwise_4d_tensor_copy(out_thread_desc,
|
||||
p_out_thread,
|
||||
out_thread_block_desc,
|
||||
out_block_desc,
|
||||
p_out_block + out_block_desc.Get1dIndex(n_thread_data_begin,
|
||||
k_thread_data_begin,
|
||||
ho_thread_data_begin,
|
||||
|
||||
@@ -49,18 +49,20 @@ __global__ void gridwise_direct_convolution_1(InGlobalDesc,
|
||||
constexpr unsigned YBlockWork = (out_global_desc.GetLength(I2) + HoPerBlock - 1) / HoPerBlock;
|
||||
constexpr unsigned XBlockWork = (out_global_desc.GetLength(I3) + WoPerBlock - 1) / WoPerBlock;
|
||||
|
||||
constexpr auto in_block_src_desc = make_ConstantTensorDescriptor(
|
||||
constexpr auto in_block_global_desc = make_ConstantTensorDescriptor(
|
||||
Sequence<NPerBlock, CPerBlock, HiPerBlock, WiPerBlock>{}, in_global_desc.GetStrides());
|
||||
|
||||
constexpr auto wei_block_src_desc = make_ConstantTensorDescriptor(
|
||||
constexpr auto wei_block_global_desc = make_ConstantTensorDescriptor(
|
||||
Sequence<KPerBlock, CPerBlock, S, R>{}, wei_global_desc.GetStrides());
|
||||
|
||||
constexpr auto out_block_src_desc = make_ConstantTensorDescriptor(
|
||||
constexpr auto out_block_global_desc = make_ConstantTensorDescriptor(
|
||||
Sequence<NPerBlock, KPerBlock, HoPerBlock, WoPerBlock>{}, out_global_desc.GetStrides());
|
||||
|
||||
constexpr auto in_block_desc = make_ConstantTensorDescriptor(in_block_src_desc.GetLengths());
|
||||
constexpr auto wei_block_desc = make_ConstantTensorDescriptor(wei_block_src_desc.GetLengths());
|
||||
constexpr auto out_block_desc = make_ConstantTensorDescriptor(out_block_src_desc.GetLengths());
|
||||
constexpr auto in_block_desc = make_ConstantTensorDescriptor(in_block_global_desc.GetLengths());
|
||||
constexpr auto wei_block_desc =
|
||||
make_ConstantTensorDescriptor(wei_block_global_desc.GetLengths());
|
||||
constexpr auto out_block_desc =
|
||||
make_ConstantTensorDescriptor(out_block_global_desc.GetLengths());
|
||||
|
||||
constexpr unsigned in_block_size = in_block_desc.GetElementSpace();
|
||||
constexpr unsigned wei_block_size = wei_block_desc.GetElementSpace();
|
||||
@@ -97,9 +99,9 @@ __global__ void gridwise_direct_convolution_1(InGlobalDesc,
|
||||
print_ConstantTensorDescriptor( in_global_desc, "gridwise_convolution: in_global_desc: ");
|
||||
print_ConstantTensorDescriptor(wei_global_desc, "gridwise_convolution: wei_global_desc: ");
|
||||
print_ConstantTensorDescriptor(out_global_desc, "gridwise_convolution: out_global_desc: ");
|
||||
print_ConstantTensorDescriptor( in_block_src_desc, "gridwise_convolution: in_block_src_desc: ");
|
||||
print_ConstantTensorDescriptor(wei_block_src_desc, "gridwise_convolution: wei_block_src_desc: ");
|
||||
print_ConstantTensorDescriptor(out_block_src_desc, "gridwise_convolution: out_block_src_desc: ");
|
||||
print_ConstantTensorDescriptor( in_block_global_desc, "gridwise_convolution: in_block_global_desc: ");
|
||||
print_ConstantTensorDescriptor(wei_block_global_desc, "gridwise_convolution: wei_block_global_desc: ");
|
||||
print_ConstantTensorDescriptor(out_block_global_desc, "gridwise_convolution: out_block_global_desc: ");
|
||||
print_ConstantTensorDescriptor( in_block_desc, "gridwise_convolution: in_block_desc: ");
|
||||
print_ConstantTensorDescriptor(wei_block_desc, "gridwise_convolution: wei_block_desc: ");
|
||||
print_ConstantTensorDescriptor(out_block_desc, "gridwise_convolution: out_block_desc: ");
|
||||
@@ -128,10 +130,10 @@ __global__ void gridwise_direct_convolution_1(InGlobalDesc,
|
||||
{
|
||||
// copy input tensor to LDS
|
||||
blockwise_4d_tensor_copy<TFloat,
|
||||
decltype(in_block_src_desc),
|
||||
decltype(in_block_global_desc),
|
||||
decltype(in_block_desc),
|
||||
decltype(in_block_desc),
|
||||
BlockSize>(in_block_src_desc,
|
||||
BlockSize>(in_block_global_desc,
|
||||
p_in_global +
|
||||
in_global_desc.Get1dIndex(n_block_work_begin,
|
||||
c_block_work_begin,
|
||||
@@ -143,11 +145,11 @@ __global__ void gridwise_direct_convolution_1(InGlobalDesc,
|
||||
|
||||
// copy weight tensor to LDS
|
||||
blockwise_4d_tensor_copy<TFloat,
|
||||
decltype(wei_block_src_desc),
|
||||
decltype(wei_block_global_desc),
|
||||
decltype(wei_block_desc),
|
||||
decltype(wei_block_desc),
|
||||
BlockSize>(
|
||||
wei_block_src_desc,
|
||||
wei_block_global_desc,
|
||||
p_wei_global + wei_global_desc.Get1dIndex(k_block_work_begin, c_block_work_begin, 0, 0),
|
||||
wei_block_desc,
|
||||
p_wei_block,
|
||||
@@ -174,12 +176,12 @@ __global__ void gridwise_direct_convolution_1(InGlobalDesc,
|
||||
// copy output tensor from LDS to device mem
|
||||
blockwise_4d_tensor_copy<TFloat,
|
||||
decltype(out_block_desc),
|
||||
decltype(out_block_src_desc),
|
||||
decltype(out_block_global_desc),
|
||||
decltype(out_block_desc),
|
||||
BlockSize>(
|
||||
out_block_desc,
|
||||
p_out_block,
|
||||
out_block_src_desc,
|
||||
out_block_global_desc,
|
||||
p_out_global +
|
||||
out_global_desc.Get1dIndex(
|
||||
n_block_work_begin, k_block_work_begin, ho_block_work_begin, wo_block_work_begin),
|
||||
|
||||
@@ -63,18 +63,16 @@ __global__ void gridwise_direct_convolution_2(InGlobalDesc,
|
||||
constexpr unsigned InTileSizeH = OutTileSizeH + S - 1;
|
||||
constexpr unsigned InTileSizeW = OutTileSizeW + R - 1;
|
||||
|
||||
constexpr auto in_thread_desc =
|
||||
make_ConstantTensorDescriptor(Sequence<NPerThread, CPerThread, InTileSizeH, InTileSizeW>{});
|
||||
constexpr auto in_thread_block_desc = make_ConstantTensorDescriptor(
|
||||
Sequence<NPerThread, CPerThread, InTileSizeH, InTileSizeW>{}, in_block_desc.GetStrides());
|
||||
|
||||
constexpr auto wei_thread_desc =
|
||||
make_ConstantTensorDescriptor(Sequence<KPerThread, CPerThread, S, R>{});
|
||||
constexpr auto wei_thread_block_desc = make_ConstantTensorDescriptor(
|
||||
Sequence<KPerThread, CPerThread, S, R>{}, wei_block_desc.GetStrides());
|
||||
|
||||
constexpr auto out_thread_desc =
|
||||
get_output_4d_tensor_descriptor(in_thread_desc, wei_thread_desc);
|
||||
get_output_4d_tensor_descriptor(in_thread_block_desc, wei_thread_block_desc);
|
||||
|
||||
// register
|
||||
TFloat p_in_thread[in_thread_desc.GetElementSpace()];
|
||||
TFloat p_wei_thread[wei_thread_desc.GetElementSpace()];
|
||||
TFloat p_out_thread[out_thread_desc.GetElementSpace()];
|
||||
|
||||
// divide block work
|
||||
@@ -183,31 +181,30 @@ __global__ void gridwise_direct_convolution_2(InGlobalDesc,
|
||||
|
||||
for(unsigned c_thread_data = 0; c_thread_data < CPerBlock; c_thread_data += CPerThread)
|
||||
{
|
||||
// copy input tensor into register
|
||||
threadwise_4d_tensor_copy(in_block_desc,
|
||||
p_in_block + in_block_desc.Get1dIndex(n_thread_data_begin,
|
||||
c_thread_data,
|
||||
hi_thread_data_begin,
|
||||
wi_thread_data_begin),
|
||||
in_thread_desc,
|
||||
p_in_thread,
|
||||
in_thread_desc);
|
||||
|
||||
// copy weight tensor into register
|
||||
threadwise_4d_tensor_copy(
|
||||
wei_block_desc,
|
||||
p_wei_block + wei_block_desc.Get1dIndex(k_thread_data_begin, c_thread_data, 0, 0),
|
||||
wei_thread_desc,
|
||||
p_wei_thread,
|
||||
wei_thread_desc);
|
||||
|
||||
// threadwise convolution
|
||||
threadwise_direct_convolution_1(in_thread_desc,
|
||||
p_in_thread,
|
||||
wei_thread_desc,
|
||||
p_wei_thread,
|
||||
out_thread_desc,
|
||||
p_out_thread);
|
||||
#if 1
|
||||
threadwise_direct_convolution_2(
|
||||
in_thread_block_desc,
|
||||
p_in_block + in_block_desc.Get1dIndex(n_thread_data_begin,
|
||||
c_thread_data,
|
||||
hi_thread_data_begin,
|
||||
wi_thread_data_begin),
|
||||
wei_thread_block_desc,
|
||||
p_wei_block + wei_block_desc.Get1dIndex(k_thread_data_begin, c_thread_data, 0, 0),
|
||||
out_thread_desc,
|
||||
p_out_thread);
|
||||
#elif 1
|
||||
threadwise_direct_convolution_3(
|
||||
in_thread_block_desc,
|
||||
p_in_block + in_block_desc.Get1dIndex(n_thread_data_begin,
|
||||
c_thread_data,
|
||||
hi_thread_data_begin,
|
||||
wi_thread_data_begin),
|
||||
wei_thread_block_desc,
|
||||
p_wei_block + wei_block_desc.Get1dIndex(k_thread_data_begin, c_thread_data, 0, 0),
|
||||
out_thread_desc,
|
||||
p_out_thread);
|
||||
#endif
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -1,208 +0,0 @@
|
||||
#pragma once
|
||||
#include "constant_tensor_descriptor.cuh"
|
||||
#include "blockwise_tensor_op.cuh"
|
||||
#include "blockwise_direct_convolution.cuh"
|
||||
#include "threadwise_tensor_op.cuh"
|
||||
#include "threadwise_direct_convolution.cuh"
|
||||
|
||||
// Direct convolution kernel (variant 3).
//
// Launch layout: one-dimensional. Only blockIdx.x and threadIdx.x are read; each is
// decomposed into an (n, k, y, x) work id, x varying fastest.
//
// Per block: an input tile and a weight tile are staged in statically sized __shared__
// arrays, one CPerBlock channel slice at a time. Per thread: an output tile is
// accumulated in a local array and written to p_out_global once, after the channel loop.
//
// Template parameters:
//   TFloat                      element type of all three tensors
//   In/Wei/OutGlobalDesc        compile-time 4d tensor descriptors of the global tensors
//   OutTileSizeH/W              height/width of the output tile owned by one thread
//   N/K/CPerBlock               extent of a block's work along N, K and C
//   Y/XPerBlock                 number of output tiles per block along H and W
//   N/K/CPerThread              extent of a thread's work along N, K and C
//   BlockSize                   forwarded to blockwise_4d_tensor_copy
//   GridSize                    not referenced in this kernel body
//
// NOTE(review): no bounds checks are visible here. Presumably the tensor lengths must be
// multiples of the per-block sizes, and the launched block size must equal
// NThreadWork * KThreadWork * YPerBlock * XPerBlock -- confirm against the launcher.
template <class TFloat,
          class InGlobalDesc,
          class WeiGlobalDesc,
          class OutGlobalDesc,
          unsigned OutTileSizeH,
          unsigned OutTileSizeW,
          unsigned NPerBlock,
          unsigned KPerBlock,
          unsigned CPerBlock,
          unsigned YPerBlock,
          unsigned XPerBlock,
          unsigned NPerThread,
          unsigned KPerThread,
          unsigned CPerThread,
          unsigned BlockSize,
          unsigned GridSize>
__global__ void gridwise_direct_convolution_3(InGlobalDesc,
                                              TFloat* const __restrict__ p_in_global,
                                              WeiGlobalDesc,
                                              TFloat* const __restrict__ p_wei_global,
                                              OutGlobalDesc,
                                              TFloat* __restrict__ p_out_global)
{
    // compile-time dimension selectors
    constexpr auto Dim0 = Number<0>{};
    constexpr auto Dim1 = Number<1>{};
    constexpr auto Dim2 = Number<2>{};
    constexpr auto Dim3 = Number<3>{};

    constexpr auto in_global_desc  = InGlobalDesc{};
    constexpr auto wei_global_desc = WeiGlobalDesc{};
    constexpr auto out_global_desc = OutGlobalDesc{};

    // filter extent, taken from dimensions 2 and 3 of the weight descriptor
    constexpr unsigned FilterH = wei_global_desc.GetLength(Dim2);
    constexpr unsigned FilterW = wei_global_desc.GetLength(Dim3);

    // output extent covered by one block, and the input extent needed to produce it
    constexpr unsigned HoPerBlock = OutTileSizeH * YPerBlock;
    constexpr unsigned WoPerBlock = OutTileSizeW * XPerBlock;
    constexpr unsigned HiPerBlock = HoPerBlock + FilterH - 1;
    constexpr unsigned WiPerBlock = WoPerBlock + FilterW - 1;

    // descriptors of the tiles staged in shared memory
    constexpr auto in_block_desc =
        make_ConstantTensorDescriptor(Sequence<NPerBlock, CPerBlock, HiPerBlock, WiPerBlock>{});
    constexpr auto wei_block_desc =
        make_ConstantTensorDescriptor(Sequence<KPerBlock, CPerBlock, FilterH, FilterW>{});

    // shared memory staging buffers
    __shared__ TFloat p_in_block[in_block_desc.GetElementSpace()];
    __shared__ TFloat p_wei_block[wei_block_desc.GetElementSpace()];

    // per-thread views into the shared tiles: thread-sized lengths, block strides
    constexpr unsigned InTileSizeH = OutTileSizeH + FilterH - 1;
    constexpr unsigned InTileSizeW = OutTileSizeW + FilterW - 1;

    constexpr auto in_thread_block_desc = make_ConstantTensorDescriptor(
        Sequence<NPerThread, CPerThread, InTileSizeH, InTileSizeW>{}, in_block_desc.GetStrides());
    constexpr auto wei_thread_block_desc = make_ConstantTensorDescriptor(
        Sequence<KPerThread, CPerThread, FilterH, FilterW>{}, wei_block_desc.GetStrides());

    constexpr auto out_thread_desc =
        get_output_4d_tensor_descriptor(in_thread_block_desc, wei_thread_block_desc);

    // per-thread output accumulator
    TFloat p_out_thread[out_thread_desc.GetElementSpace()];

    // ---- split the grid: flat block id -> (n, k, y, x) block work id ----
    // NBlockWork is not needed for the decomposition; kept for parity with the other counts
    constexpr unsigned NBlockWork = (out_global_desc.GetLength(Dim0) + NPerBlock - 1) / NPerBlock;
    constexpr unsigned KBlockWork = (out_global_desc.GetLength(Dim1) + KPerBlock - 1) / KPerBlock;
    constexpr unsigned YBlockWork = (out_global_desc.GetLength(Dim2) + HoPerBlock - 1) / HoPerBlock;
    constexpr unsigned XBlockWork = (out_global_desc.GetLength(Dim3) + WoPerBlock - 1) / WoPerBlock;

    constexpr unsigned YXBlockWork  = YBlockWork * XBlockWork;
    constexpr unsigned KYXBlockWork = KBlockWork * YXBlockWork;

    const unsigned block_id = blockIdx.x;

    const unsigned n_block_work_id = block_id / KYXBlockWork;
    const unsigned kyx_block_rem   = block_id % KYXBlockWork;
    const unsigned k_block_work_id = kyx_block_rem / YXBlockWork;
    const unsigned yx_block_rem    = kyx_block_rem % YXBlockWork;
    const unsigned y_block_work_id = yx_block_rem / XBlockWork;
    const unsigned x_block_work_id = yx_block_rem % XBlockWork;

    // first element of this block's work along each dimension
    const unsigned n_block_data_begin  = n_block_work_id * NPerBlock;
    const unsigned k_block_data_begin  = k_block_work_id * KPerBlock;
    const unsigned ho_block_data_begin = y_block_work_id * HoPerBlock;
    const unsigned wo_block_data_begin = x_block_work_id * WoPerBlock;

    // input origin coincides with the output origin (no padding offset is subtracted)
    const unsigned hi_block_data_begin = ho_block_data_begin;
    const unsigned wi_block_data_begin = wo_block_data_begin;

    // ---- split the block: flat thread id -> (n, k, y, x) thread work id ----
    constexpr unsigned NThreadWork = (NPerBlock + NPerThread - 1) / NPerThread;
    constexpr unsigned KThreadWork = (KPerBlock + KPerThread - 1) / KPerThread;
    constexpr unsigned YThreadWork = YPerBlock;
    constexpr unsigned XThreadWork = XPerBlock;

    constexpr unsigned YXThreadWork  = YThreadWork * XThreadWork;
    constexpr unsigned KYXThreadWork = KThreadWork * YXThreadWork;

    const unsigned thread_id = threadIdx.x;

    const unsigned n_thread_work_id = thread_id / KYXThreadWork;
    const unsigned kyx_thread_rem   = thread_id % KYXThreadWork;
    const unsigned k_thread_work_id = kyx_thread_rem / YXThreadWork;
    const unsigned yx_thread_rem    = kyx_thread_rem % YXThreadWork;
    const unsigned y_thread_work_id = yx_thread_rem / XThreadWork;
    const unsigned x_thread_work_id = yx_thread_rem % XThreadWork;

    // first element of this thread's work, relative to the block tile
    const unsigned n_thread_data_begin  = n_thread_work_id * NPerThread;
    const unsigned k_thread_data_begin  = k_thread_work_id * KPerThread;
    const unsigned ho_thread_data_begin = y_thread_work_id * OutTileSizeH;
    const unsigned wo_thread_data_begin = x_thread_work_id * OutTileSizeW;

    const unsigned hi_thread_data_begin = ho_thread_data_begin;
    const unsigned wi_thread_data_begin = wo_thread_data_begin;

#if 0
    if(threadIdx.x == 0)
    {
        print_ConstantTensorDescriptor(in_global_desc, "gridwise_convolution: in_global_desc: ");
        print_ConstantTensorDescriptor(wei_global_desc, "gridwise_convolution: wei_global_desc: ");
        print_ConstantTensorDescriptor(out_global_desc, "gridwise_convolution: out_global_desc: ");
    }

    printf("threadIdx.x %u \t"
           "n_thread_data_begin %u, k_thread_data_begin %u, ho_thread_data_begin %u, "
           "wo_thread_data_begin %u\n",
           threadIdx.x,
           n_thread_data_begin,
           k_thread_data_begin,
           ho_thread_data_begin,
           wo_thread_data_begin);
#endif

    // start from a zeroed accumulator
    threadwise_4d_tensor_set_zero(out_thread_desc, p_out_thread);

    // walk the input channels one CPerBlock slice at a time
    for(unsigned c_block_data_begin = 0; c_block_data_begin < in_global_desc.GetLength(Dim1);
        c_block_data_begin += CPerBlock)
    {
        // stage the input slice: global memory -> shared memory
        blockwise_4d_tensor_copy<TFloat,
                                 decltype(in_global_desc),
                                 decltype(in_block_desc),
                                 decltype(in_block_desc),
                                 BlockSize>(
            in_global_desc,
            p_in_global + in_global_desc.Get1dIndex(n_block_data_begin,
                                                    c_block_data_begin,
                                                    hi_block_data_begin,
                                                    wi_block_data_begin),
            in_block_desc,
            p_in_block,
            in_block_desc);

        // stage the weight slice: global memory -> shared memory
        blockwise_4d_tensor_copy<TFloat,
                                 decltype(wei_global_desc),
                                 decltype(wei_block_desc),
                                 decltype(wei_block_desc),
                                 BlockSize>(
            wei_global_desc,
            p_wei_global + wei_global_desc.Get1dIndex(k_block_data_begin, c_block_data_begin, 0, 0),
            wei_block_desc,
            p_wei_block,
            wei_block_desc);

        // both tiles must be fully written before any thread reads them
        __syncthreads();

        for(unsigned c_thread_data = 0; c_thread_data < CPerBlock; c_thread_data += CPerThread)
        {
            // offsets of this thread's sub-tiles inside the shared tiles
            const auto in_thread_offset = in_block_desc.Get1dIndex(
                n_thread_data_begin, c_thread_data, hi_thread_data_begin, wi_thread_data_begin);
            const auto wei_thread_offset =
                wei_block_desc.Get1dIndex(k_thread_data_begin, c_thread_data, 0, 0);

            // accumulate this channel sub-slice into the thread's output tile
            threadwise_direct_convolution_2(in_thread_block_desc,
                                            p_in_block + in_thread_offset,
                                            wei_thread_block_desc,
                                            p_wei_block + wei_thread_offset,
                                            out_thread_desc,
                                            p_out_thread);
        }

        // every thread must be done reading before the next slice overwrites the tiles
        __syncthreads();
    }

    // write the finished output tile: thread-local array -> global memory
    threadwise_4d_tensor_copy(
        out_thread_desc,
        p_out_thread,
        out_global_desc,
        p_out_global + out_global_desc.Get1dIndex(n_block_data_begin + n_thread_data_begin,
                                                  k_block_data_begin + k_thread_data_begin,
                                                  ho_block_data_begin + ho_thread_data_begin,
                                                  wo_block_data_begin + wo_thread_data_begin),
        out_thread_desc);
}
|
||||
@@ -79,11 +79,43 @@ __device__ void threadwise_direct_convolution_1(InDesc,
|
||||
}
|
||||
}
|
||||
|
||||
// Threadwise direct convolution, variant 2.
// Intended for the case where p_in and p_wei are in LDS and p_out is in register:
// the whole input and weight tensors are first copied into thread-local arrays, and
// threadwise_direct_convolution_1 is then run on those copies.
//
//   InDesc / p_in    descriptor and data of the source input tensor (read only here)
//   WeiDesc / p_wei  descriptor and data of the source weight tensor (read only here)
//   OutDesc / p_out  descriptor and data of the output tensor, handed on to
//                    threadwise_direct_convolution_1
template <class TFloat, class InDesc, class WeiDesc, class OutDesc>
__device__ void threadwise_direct_convolution_2(InDesc,
                                                TFloat* const __restrict__ p_in,
                                                WeiDesc,
                                                TFloat* const __restrict__ p_wei,
                                                OutDesc,
                                                TFloat* __restrict__ p_out)
{
    constexpr auto src_in_desc  = InDesc{};
    constexpr auto src_wei_desc = WeiDesc{};
    constexpr auto dst_out_desc = OutDesc{};

    // input: local descriptor built from the source lengths only (source strides are
    // not passed on), plus a thread-local buffer sized by that descriptor
    constexpr auto in_local_desc = make_ConstantTensorDescriptor(src_in_desc.GetLengths());
    TFloat in_local[in_local_desc.GetElementSpace()];

    // weight: same construction as for the input
    constexpr auto wei_local_desc = make_ConstantTensorDescriptor(src_wei_desc.GetLengths());
    TFloat wei_local[wei_local_desc.GetElementSpace()];

    // copy input tensor into the thread-local buffer
    threadwise_4d_tensor_copy(src_in_desc, p_in, in_local_desc, in_local, in_local_desc);

    // copy weight tensor into the thread-local buffer
    threadwise_4d_tensor_copy(src_wei_desc, p_wei, wei_local_desc, wei_local, wei_local_desc);

    // run the convolution on the local copies, writing into p_out
    threadwise_direct_convolution_1(
        in_local_desc, in_local, wei_local_desc, wei_local, dst_out_desc, p_out);
}
|
||||
|
||||
// optimized for scenario where p_in and p_wei are in LDS, p_out is in register
|
||||
// break down a non-1x1 convolution into a sequence of 1x1 convolutions,
|
||||
// load 1x1 weight into register, and do 1x1 convolution in register.
|
||||
template <class TFloat, class InDesc, class WeiDesc, class OutDesc>
|
||||
__device__ void threadwise_direct_convolution_2(InDesc,
|
||||
__device__ void threadwise_direct_convolution_3(InDesc,
|
||||
TFloat* const __restrict__ p_in,
|
||||
WeiDesc,
|
||||
TFloat* const __restrict__ p_wei,
|
||||
@@ -95,100 +127,100 @@ __device__ void threadwise_direct_convolution_2(InDesc,
|
||||
constexpr auto I2 = Number<2>{};
|
||||
constexpr auto I3 = Number<3>{};
|
||||
|
||||
constexpr auto in_desc_lds = InDesc{};
|
||||
constexpr auto wei_desc_lds = WeiDesc{};
|
||||
constexpr auto out_desc_reg = OutDesc{};
|
||||
constexpr auto in_desc = InDesc{};
|
||||
constexpr auto wei_desc = WeiDesc{};
|
||||
constexpr auto out_desc = OutDesc{};
|
||||
|
||||
constexpr auto in_desc_reg =
|
||||
make_ConstantTensorDescriptor(Sequence<in_desc_lds.GetLength(I0),
|
||||
in_desc_lds.GetLength(I1),
|
||||
out_desc_reg.GetLength(I2),
|
||||
out_desc_reg.GetLength(I3)>{});
|
||||
constexpr auto in_reg_desc = make_ConstantTensorDescriptor(Sequence<in_desc.GetLength(I0),
|
||||
in_desc.GetLength(I1),
|
||||
out_desc.GetLength(I2),
|
||||
out_desc.GetLength(I3)>{});
|
||||
|
||||
constexpr auto wei_desc_reg = make_ConstantTensorDescriptor(
|
||||
Sequence<wei_desc_lds.GetLength(I0), wei_desc_lds.GetLength(I1), 1, 1>{});
|
||||
constexpr auto wei_reg_desc = make_ConstantTensorDescriptor(
|
||||
Sequence<wei_desc.GetLength(I0), wei_desc.GetLength(I1), 1, 1>{});
|
||||
|
||||
TFloat p_in_reg[in_desc_reg.GetElementSpace()];
|
||||
TFloat p_wei_reg[wei_desc_reg.GetElementSpace()];
|
||||
TFloat p_in_reg[in_reg_desc.GetElementSpace()];
|
||||
TFloat p_wei_reg[wei_reg_desc.GetElementSpace()];
|
||||
|
||||
constexpr unsigned in_w_new_read = 1;
|
||||
|
||||
constexpr auto in_desc_reg_new_read =
|
||||
make_ConstantTensorDescriptor(Sequence<in_desc_reg.GetLength(I0),
|
||||
in_desc_reg.GetLength(I1),
|
||||
in_desc_reg.GetLength(I2),
|
||||
make_ConstantTensorDescriptor(Sequence<in_reg_desc.GetLength(I0),
|
||||
in_reg_desc.GetLength(I1),
|
||||
in_reg_desc.GetLength(I2),
|
||||
in_w_new_read>{});
|
||||
|
||||
#if 0
|
||||
// loop over vertical direction
|
||||
for(unsigned s = 0; s < wei_desc_lds.GetLength(I2); ++s)
|
||||
for(unsigned s = 0; s < wei_desc.GetLength(I2); ++s)
|
||||
{
|
||||
#if 1
|
||||
// read first input
|
||||
threadwise_4d_tensor_copy(in_desc_lds,
|
||||
p_in + in_desc_lds.Get1dIndex(0, 0, s, 0),
|
||||
in_desc_reg,
|
||||
threadwise_4d_tensor_copy(in_desc,
|
||||
p_in + in_desc.Get1dIndex(0, 0, s, 0),
|
||||
in_reg_desc,
|
||||
p_in_reg,
|
||||
in_desc_reg);
|
||||
in_reg_desc);
|
||||
|
||||
// read first 1x1 weight
|
||||
threadwise_4d_tensor_copy(wei_desc_lds,
|
||||
p_wei + wei_desc_lds.Get1dIndex(0, 0, s, 0),
|
||||
wei_desc_reg,
|
||||
threadwise_4d_tensor_copy(wei_desc,
|
||||
p_wei + wei_desc.Get1dIndex(0, 0, s, 0),
|
||||
wei_reg_desc,
|
||||
p_wei_reg,
|
||||
wei_desc_reg);
|
||||
wei_reg_desc);
|
||||
|
||||
// do first 1x1 conv
|
||||
threadwise_direct_convolution_1(
|
||||
in_desc_reg, p_in_reg, wei_desc_reg, p_wei_reg, out_desc_reg, p_out);
|
||||
in_reg_desc, p_in_reg, wei_reg_desc, p_wei_reg, out_desc, p_out);
|
||||
|
||||
// loop over horizontal direction
|
||||
for(unsigned r = 1; r < wei_desc_lds.GetLength(I3); ++r)
|
||||
for(unsigned r = 1; r < wei_desc.GetLength(I3); ++r)
|
||||
{
|
||||
// read new weight
|
||||
threadwise_4d_tensor_copy(wei_desc_lds,
|
||||
p_wei + wei_desc_lds.Get1dIndex(0, 0, s, r),
|
||||
wei_desc_reg,
|
||||
threadwise_4d_tensor_copy(wei_desc,
|
||||
p_wei + wei_desc.Get1dIndex(0, 0, s, r),
|
||||
wei_reg_desc,
|
||||
p_wei_reg,
|
||||
wei_desc_reg);
|
||||
wei_reg_desc);
|
||||
|
||||
// shift old input to the left
|
||||
threadwise_4d_tensor_shift_down(in_desc_reg, p_in_reg, I3, Number<in_w_new_read>{});
|
||||
threadwise_4d_tensor_shift_down(in_reg_desc, p_in_reg, I3, Number<in_w_new_read>{});
|
||||
|
||||
// read new input
|
||||
threadwise_4d_tensor_copy(
|
||||
in_desc_lds,
|
||||
p_in + in_desc_lds.Get1dIndex(0, 0, s, in_desc_reg.GetLength(I3) + r - 1),
|
||||
in_desc_reg,
|
||||
in_desc,
|
||||
p_in + in_desc.Get1dIndex(0, 0, s, r + in_reg_desc.GetLength(I3) - 1),
|
||||
in_reg_desc,
|
||||
p_in_reg +
|
||||
in_desc_reg.Get1dIndex(0, 0, 0, in_desc_reg.GetLength(I3) - in_w_new_read),
|
||||
in_reg_desc.Get1dIndex(0, 0, 0, in_reg_desc.GetLength(I3) - in_w_new_read),
|
||||
in_desc_reg_new_read);
|
||||
|
||||
// do 1x1 conv
|
||||
threadwise_direct_convolution_1(
|
||||
in_desc_reg, p_in_reg, wei_desc_reg, p_wei_reg, out_desc_reg, p_out);
|
||||
in_reg_desc, p_in_reg, wei_reg_desc, p_wei_reg, out_desc, p_out);
|
||||
}
|
||||
}
|
||||
#elif 1
|
||||
// loop over vertical direction
|
||||
for(unsigned s = 0; s < wei_desc.GetLength(I2); ++s)
|
||||
{
|
||||
// loop over horizontal direction
|
||||
for(unsigned r = 0; r < wei_desc_lds.GetLength(I3); ++r)
|
||||
for(unsigned r = 0; r < wei_desc.GetLength(I3); ++r)
|
||||
{
|
||||
// read new weight
|
||||
threadwise_4d_tensor_copy(wei_desc_lds,
|
||||
p_wei + wei_desc_lds.Get1dIndex(0, 0, s, r),
|
||||
wei_desc_reg,
|
||||
threadwise_4d_tensor_copy(wei_desc,
|
||||
p_wei + wei_desc.Get1dIndex(0, 0, s, r),
|
||||
wei_reg_desc,
|
||||
p_wei_reg,
|
||||
wei_desc_reg);
|
||||
wei_reg_desc);
|
||||
|
||||
// read new input
|
||||
threadwise_4d_tensor_copy(in_desc_lds,
|
||||
p_in + in_desc_lds.Get1dIndex(0, 0, s, r),
|
||||
in_desc_reg,
|
||||
p_in_reg,
|
||||
in_desc_reg);
|
||||
threadwise_4d_tensor_copy(
|
||||
in_desc, p_in + in_desc.Get1dIndex(0, 0, s, r), in_reg_desc, p_in_reg, in_reg_desc);
|
||||
|
||||
// do 1x1 conv
|
||||
threadwise_direct_convolution_1(
|
||||
in_desc_reg, p_in_reg, wei_desc_reg, p_wei_reg, out_desc_reg, p_out);
|
||||
in_reg_desc, p_in_reg, wei_reg_desc, p_wei_reg, out_desc, p_out);
|
||||
}
|
||||
#endif
|
||||
}
|
||||
}
|
||||
#endif
|
||||
}
|
||||
Reference in New Issue
Block a user