[ROCm/composable_kernel commit: 0b8e67ef08]
This commit is contained in:
Chao Liu
2019-01-08 14:05:03 -06:00
parent 81ffcfcd88
commit 220f5aea05
7 changed files with 81 additions and 89 deletions

View File

@@ -2,13 +2,13 @@
#include "constant_tensor_descriptor.cuh"
// optimized for the scenario where p_in, p_wei and p_out are all in registers
template <class TFloat, class InDesc, class WeiDesc, class OutDesc>
template <class Float, class InDesc, class WeiDesc, class OutDesc>
__device__ void threadwise_direct_convolution_1(InDesc,
TFloat* const __restrict__ p_in,
Float* const __restrict__ p_in,
WeiDesc,
TFloat* const __restrict__ p_wei,
Float* const __restrict__ p_wei,
OutDesc,
TFloat* __restrict__ p_out)
Float* __restrict__ p_out)
{
constexpr auto I0 = Number<0>{};
constexpr auto I1 = Number<1>{};
@@ -81,13 +81,13 @@ __device__ void threadwise_direct_convolution_1(InDesc,
// Optimized for the scenario where p_in and p_wei are in LDS and p_out is in registers.
// Copies in and wei into registers before doing the convolution.
template <class TFloat, class InDesc, class WeiDesc, class OutDesc>
template <class Float, class InDesc, class WeiDesc, class OutDesc>
__device__ void threadwise_direct_convolution_2(InDesc,
TFloat* const __restrict__ p_in,
Float* const __restrict__ p_in,
WeiDesc,
TFloat* const __restrict__ p_wei,
Float* const __restrict__ p_wei,
OutDesc,
TFloat* __restrict__ p_out)
Float* __restrict__ p_out)
{
constexpr auto in_desc = InDesc{};
constexpr auto wei_desc = WeiDesc{};
@@ -97,8 +97,8 @@ __device__ void threadwise_direct_convolution_2(InDesc,
constexpr auto wei_reg_desc = make_ConstantTensorDescriptor(wei_desc.GetLengths());
// register
TFloat p_in_reg[in_reg_desc.GetElementSpace()];
TFloat p_wei_reg[wei_reg_desc.GetElementSpace()];
Float p_in_reg[in_reg_desc.GetElementSpace()];
Float p_wei_reg[wei_reg_desc.GetElementSpace()];
// copy input tensor into register
threadwise_4d_tensor_copy(in_desc, p_in, in_reg_desc, p_in_reg, in_reg_desc);
@@ -114,13 +114,13 @@ __device__ void threadwise_direct_convolution_2(InDesc,
// optimized for scenario where p_in and p_wei are in LDS, p_out is in register
// break down a non-1x1 convolution into a sequence of 1x1 convolutions,
// load 1x1 weight into register, and do 1x1 convolution in register.
template <class TFloat, class InDesc, class WeiDesc, class OutDesc>
template <class Float, class InDesc, class WeiDesc, class OutDesc>
__device__ void threadwise_direct_convolution_3(InDesc,
TFloat* const __restrict__ p_in,
Float* const __restrict__ p_in,
WeiDesc,
TFloat* const __restrict__ p_wei,
Float* const __restrict__ p_wei,
OutDesc,
TFloat* __restrict__ p_out)
Float* __restrict__ p_out)
{
constexpr auto I0 = Number<0>{};
constexpr auto I1 = Number<1>{};
@@ -139,8 +139,8 @@ __device__ void threadwise_direct_convolution_3(InDesc,
constexpr auto wei_reg_desc = make_ConstantTensorDescriptor(
Sequence<wei_desc.GetLength(I0), wei_desc.GetLength(I1), 1, 1>{});
TFloat p_in_reg[in_reg_desc.GetElementSpace()];
TFloat p_wei_reg[wei_reg_desc.GetElementSpace()];
Float p_in_reg[in_reg_desc.GetElementSpace()];
Float p_wei_reg[wei_reg_desc.GetElementSpace()];
constexpr unsigned in_w_new_read = 1;