Restructure gridwise and blockwise GEMM, add tensor contraction and FWD-v4r5 (#36)

* experimenting magic number division

* overhauling fwd-v4r4 to clearly reflect transformation graph

* added fwd-v4r5

* bug fix for make_dynamic_naive_tensor_descriptor_aligned_v2

* bug fix and added sanity-check in transform_dynamic_tensor_descriptor

* added conv_driver_v2

[ROCm/composable_kernel commit: 30072aec37]
This commit is contained in:
Chao Liu
2021-06-09 23:53:08 -05:00
committed by GitHub
parent 040023fdcd
commit f4acec502e
38 changed files with 4791 additions and 2050 deletions

View File

@@ -6,58 +6,94 @@ template <class TIn,
class TOut,
class ConvStrides,
class ConvDilations,
class LowerPads,
class UpperPads>
void host_direct_convolution(const Tensor<TIn>& in_nchw,
const Tensor<TWei>& wei_kcyx,
Tensor<TOut>& out_nkhw,
ConvStrides,
ConvDilations,
LowerPads,
UpperPads)
class InLeftPads,
class InRightPads>
void host_direct_convolution(const Tensor<TIn>& in,
const Tensor<TWei>& wei,
Tensor<TOut>& out,
const ConvStrides& conv_strides,
const ConvDilations& conv_dilations,
const InLeftPads& in_left_pads,
const InRightPads& in_right_pads,
const ConvTensorLayout layout = ConvTensorLayout::NCHW)
{
using namespace ck;
index_t h_pad_low = LowerPads{}.Get(Number<0>{});
index_t w_pad_low = LowerPads{}.Get(Number<1>{});
constexpr auto I0 = Number<0>{};
constexpr auto I1 = Number<1>{};
constexpr auto I2 = Number<2>{};
constexpr auto I3 = Number<3>{};
auto f = [&](auto n, auto k, auto ho, auto wo) {
auto f_nchw = [&](auto n, auto k, auto ho, auto wo) {
double v = 0;
for(int c = 0; c < wei_kcyx.mDesc.GetLengths()[1]; ++c)
for(int c = 0; c < wei.mDesc.GetLengths()[1]; ++c)
{
for(int y = 0; y < wei_kcyx.mDesc.GetLengths()[2]; ++y)
for(int y = 0; y < wei.mDesc.GetLengths()[2]; ++y)
{
int hi = ho * ConvStrides{}[0] + y * ConvDilations{}[0] - h_pad_low;
for(int x = 0; x < wei_kcyx.mDesc.GetLengths()[3]; ++x)
int hi = ho * conv_strides[I0] + y * conv_dilations[I0] - in_left_pads[I0];
for(int x = 0; x < wei.mDesc.GetLengths()[3]; ++x)
{
int wi = wo * ConvStrides{}[1] + x * ConvDilations{}[1] - w_pad_low;
if(hi >= 0 && hi < in_nchw.mDesc.GetLengths()[2] && wi >= 0 &&
wi < in_nchw.mDesc.GetLengths()[3])
int wi = wo * conv_strides[I1] + x * conv_dilations[I1] - in_left_pads[I1];
if(hi >= 0 && hi < in.mDesc.GetLengths()[2] && wi >= 0 &&
wi < in.mDesc.GetLengths()[3])
{
v += static_cast<const double>(in_nchw(n, c, hi, wi)) *
static_cast<const double>(wei_kcyx(k, c, y, x));
v += static_cast<const double>(in(n, c, hi, wi)) *
static_cast<const double>(wei(k, c, y, x));
}
}
}
}
out_nkhw(n, k, ho, wo) = v;
out(n, k, ho, wo) = v;
};
auto f_par = make_ParallelTensorFunctor(f,
out_nkhw.mDesc.GetLengths()[0],
out_nkhw.mDesc.GetLengths()[1],
out_nkhw.mDesc.GetLengths()[2],
out_nkhw.mDesc.GetLengths()[3]);
auto f_nhwc = [&](auto n, auto ho, auto wo, auto k) {
double v = 0;
for(int c = 0; c < wei.mDesc.GetLengths()[3]; ++c)
{
for(int y = 0; y < wei.mDesc.GetLengths()[1]; ++y)
{
int hi = ho * conv_strides[I0] + y * conv_dilations[I0] - in_left_pads[I0];
for(int x = 0; x < wei.mDesc.GetLengths()[2]; ++x)
{
int wi = wo * conv_strides[I1] + x * conv_dilations[I1] - in_left_pads[I1];
if(hi >= 0 && hi < in.mDesc.GetLengths()[1] && wi >= 0 &&
wi < in.mDesc.GetLengths()[2])
{
v += static_cast<const double>(in(n, hi, wi, c)) *
static_cast<const double>(wei(k, y, x, c));
}
}
}
}
out(n, ho, wo, k) = v;
};
f_par(std::thread::hardware_concurrency());
switch(layout)
{
case ConvTensorLayout::NCHW:
make_ParallelTensorFunctor(f_nchw,
out.mDesc.GetLengths()[0],
out.mDesc.GetLengths()[1],
out.mDesc.GetLengths()[2],
out.mDesc.GetLengths()[3])(std::thread::hardware_concurrency());
break;
case ConvTensorLayout::NHWC:
make_ParallelTensorFunctor(f_nhwc,
out.mDesc.GetLengths()[0],
out.mDesc.GetLengths()[1],
out.mDesc.GetLengths()[2],
out.mDesc.GetLengths()[3])(std::thread::hardware_concurrency());
break;
default: throw std::runtime_error("wrong! not supported layout");
}
}
template <class TIn, class TWei, class TOut, class LowerPads, class UpperPads>
template <class TIn, class TWei, class TOut, class InLeftPads, class InRightPads>
void host_winograd_3x3_convolution(const Tensor<TIn>& in_nchw,
const Tensor<TWei>& wei_kcyx,
Tensor<TOut>& out_nkhw,
LowerPads,
UpperPads)
InLeftPads,
InRightPads)
{
using namespace ck;
@@ -76,8 +112,8 @@ void host_winograd_3x3_convolution(const Tensor<TIn>& in_nchw,
std::size_t HO = out_nkhw.mDesc.GetLengths()[2];
std::size_t WO = out_nkhw.mDesc.GetLengths()[3];
index_t h_pad_low = LowerPads{}.Get(Number<0>{});
index_t w_pad_low = LowerPads{}.Get(Number<1>{});
index_t h_pad_low = InLeftPads{}.Get(Number<0>{});
index_t w_pad_low = InLeftPads{}.Get(Number<1>{});
std::size_t HiPerTile = HoPerTile + Y - 1;
std::size_t WiPerTile = WoPerTile + X - 1;