mirror of
https://github.com/ROCm/composable_kernel.git
synced 2026-05-21 21:39:15 +00:00
ckProfiler and device-level XDL GEMM operator (#48)
* add DeviceGemmXdl
* update script
* fix naming issue
* fix comment
* output HostTensorDescriptor
* rename
* padded GEMM for fwd v4r4r4 nhwc
* refactor
* refactor
* refactor
* adding ckProfiler
* adding ckProfiler
* refactor
* fix tuning parameter bug
* add more gemm instances
* add more fp16 GEMM instances
* fix profiler driver
* fix bug in tuning parameter
* add fp32 gemm instances
* small fix
* refactor
* rename
* refactor gemm profiler; adding DeviceConv and conv profiler
* refactor
* fix
* add conv profiler
* refactor
* adding more GEMM and Conv instances
* Create README.md
Add build instructions for ckProfiler
* Create README.md
Add Readme for gemm_xdl example
* Update README.md
Remove build instructions from the topmost folder
* Update README.md
* clean up
[ROCm/composable_kernel commit: e823d518cb]
This commit is contained in:
@@ -3,15 +3,6 @@
|
||||
|
||||
#include "tensor_descriptor.hpp"
|
||||
|
||||
// Tensor layout selector passed to the convolution routines.
// The enumerator values are the implicit 0..4 sequence of the original declaration,
// spelled out here so they cannot shift if an entry is inserted later.
// NOTE(review): the letters presumably name the dimension order (N batch, C channel,
// H height, W width, trailing 'c' a packed channel sub-dimension) -- confirm with the kernels.
enum ConvTensorLayout
{
    NCHW  = 0,
    NHWC  = 1,
    CHWN  = 2,
    NCHWc = 3,
    NHWCc = 4
};
|
||||
|
||||
template <typename... InDesc,
|
||||
typename... WeiDesc,
|
||||
typename ConvStrides,
|
||||
|
||||
@@ -1,16 +0,0 @@
|
||||
#ifndef GEMM_COMMON_HPP
#define GEMM_COMMON_HPP

// Layout selector for the GEMM operands.
// Values are the same 0..7 sequence the original documented in trailing comments,
// now written explicitly so the numbering is enforced by the compiler.
// NOTE(review): each name presumably lists the index order of A, B and C in turn
// (e.g. MK_KN_MN: A indexed (m, k), B indexed (k, n), C indexed (m, n)) -- confirm with users.
enum GemmMatrixLayout
{
    MK_KN_MN = 0,
    MK_NK_MN = 1,
    KM_KN_MN = 2,
    KM_NK_MN = 3,
    MK_KN_NM = 4,
    MK_NK_NM = 5,
    KM_KN_NM = 6,
    KM_NK_NM = 7,
};

#endif
|
||||
@@ -1,5 +1,6 @@
|
||||
#pragma once
|
||||
#include "host_tensor.hpp"
|
||||
#include "conv_common.hpp"
|
||||
|
||||
template <typename TIn,
|
||||
typename TWei,
|
||||
@@ -8,19 +9,16 @@ template <typename TIn,
|
||||
typename ConvDilations,
|
||||
typename InLeftPads,
|
||||
typename InRightPads>
|
||||
void host_direct_convolution(const Tensor<TIn>& in,
|
||||
const Tensor<TWei>& wei,
|
||||
Tensor<TOut>& out,
|
||||
const ConvStrides& conv_strides,
|
||||
const ConvDilations& conv_dilations,
|
||||
const InLeftPads& in_left_pads,
|
||||
const InRightPads&,
|
||||
const ConvTensorLayout layout = ConvTensorLayout::NCHW)
|
||||
void host_conv_nchw_kcyx_nkhw(const Tensor<TIn>& in,
|
||||
const Tensor<TWei>& wei,
|
||||
Tensor<TOut>& out,
|
||||
const ConvStrides& conv_strides,
|
||||
const ConvDilations& conv_dilations,
|
||||
const InLeftPads& in_left_pads,
|
||||
const InRightPads&)
|
||||
{
|
||||
using namespace ck;
|
||||
|
||||
constexpr auto I0 = Number<0>{};
|
||||
constexpr auto I1 = Number<1>{};
|
||||
constexpr auto I0 = ck::Number<0>{};
|
||||
constexpr auto I1 = ck::Number<1>{};
|
||||
|
||||
auto f_nchw = [&](auto n, auto k, auto ho, auto wo) {
|
||||
double v = 0;
|
||||
@@ -44,281 +42,9 @@ void host_direct_convolution(const Tensor<TIn>& in,
|
||||
out(n, k, ho, wo) = v;
|
||||
};
|
||||
|
||||
auto f_nhwc = [&](auto n, auto ho, auto wo, auto k) {
|
||||
double v = 0;
|
||||
for(int c = 0; c < wei.mDesc.GetLengths()[3]; ++c)
|
||||
{
|
||||
for(int y = 0; y < wei.mDesc.GetLengths()[1]; ++y)
|
||||
{
|
||||
int hi = ho * conv_strides[I0] + y * conv_dilations[I0] - in_left_pads[I0];
|
||||
for(int x = 0; x < wei.mDesc.GetLengths()[2]; ++x)
|
||||
{
|
||||
int wi = wo * conv_strides[I1] + x * conv_dilations[I1] - in_left_pads[I1];
|
||||
if(hi >= 0 && hi < in.mDesc.GetLengths()[1] && wi >= 0 &&
|
||||
wi < in.mDesc.GetLengths()[2])
|
||||
{
|
||||
v += static_cast<const double>(in(n, hi, wi, c)) *
|
||||
static_cast<const double>(wei(k, y, x, c));
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
out(n, ho, wo, k) = v;
|
||||
};
|
||||
|
||||
if(layout == ConvTensorLayout::NCHW)
|
||||
{
|
||||
make_ParallelTensorFunctor(f_nchw,
|
||||
out.mDesc.GetLengths()[0],
|
||||
out.mDesc.GetLengths()[1],
|
||||
out.mDesc.GetLengths()[2],
|
||||
out.mDesc.GetLengths()[3])(std::thread::hardware_concurrency());
|
||||
}
|
||||
else if(layout == ConvTensorLayout::NHWC)
|
||||
{
|
||||
make_ParallelTensorFunctor(f_nhwc,
|
||||
out.mDesc.GetLengths()[0],
|
||||
out.mDesc.GetLengths()[1],
|
||||
out.mDesc.GetLengths()[2],
|
||||
out.mDesc.GetLengths()[3])(std::thread::hardware_concurrency());
|
||||
}
|
||||
else
|
||||
{
|
||||
throw std::runtime_error("wrong! not supported layout");
|
||||
}
|
||||
}
|
||||
|
||||
template <typename TIn, typename TWei, typename TOut, typename InLeftPads, typename InRightPads>
|
||||
void host_winograd_3x3_convolution(const Tensor<TIn>& in_nchw,
|
||||
const Tensor<TWei>& wei_kcyx,
|
||||
Tensor<TOut>& out_nkhw,
|
||||
InLeftPads,
|
||||
InRightPads)
|
||||
{
|
||||
using namespace ck;
|
||||
|
||||
constexpr std::size_t HoPerTile = 2;
|
||||
constexpr std::size_t WoPerTile = 2;
|
||||
|
||||
std::size_t N = in_nchw.mDesc.GetLengths()[0];
|
||||
std::size_t C = in_nchw.mDesc.GetLengths()[1];
|
||||
|
||||
std::size_t K = wei_kcyx.mDesc.GetLengths()[0];
|
||||
std::size_t Y = wei_kcyx.mDesc.GetLengths()[2];
|
||||
std::size_t X = wei_kcyx.mDesc.GetLengths()[3];
|
||||
|
||||
std::size_t Ho = out_nkhw.mDesc.GetLengths()[2];
|
||||
std::size_t Wo = out_nkhw.mDesc.GetLengths()[3];
|
||||
|
||||
index_t h_pad_low = InLeftPads{}.Get(Number<0>{});
|
||||
index_t w_pad_low = InLeftPads{}.Get(Number<1>{});
|
||||
|
||||
std::size_t HiPerTile = HoPerTile + Y - 1;
|
||||
std::size_t WiPerTile = WoPerTile + X - 1;
|
||||
|
||||
std::size_t HTile = (Ho + HoPerTile - 1) / HoPerTile;
|
||||
std::size_t WTile = (Wo + WoPerTile - 1) / WoPerTile;
|
||||
|
||||
Tensor<double> in_hold({N, C, HTile, WTile, HiPerTile, WiPerTile});
|
||||
Tensor<double> in_transform({N, C, HTile, WTile, HiPerTile, WiPerTile});
|
||||
Tensor<double> wei_transform({K, C, HiPerTile, WiPerTile});
|
||||
Tensor<double> out_transform({N, K, HTile, WTile, HiPerTile, HiPerTile});
|
||||
Tensor<double> out_hold({N, K, HTile, WTile, HoPerTile, WoPerTile});
|
||||
|
||||
auto f_in_hold = [&](auto n, auto c, auto htile, auto wtile) {
|
||||
for(int j = 0; j < HiPerTile; ++j)
|
||||
{
|
||||
int hi = HoPerTile * htile + j - h_pad_low;
|
||||
for(int i = 0; i < WiPerTile; ++i)
|
||||
{
|
||||
int wi = WoPerTile * wtile + i - w_pad_low;
|
||||
|
||||
if(hi >= 0 && hi < in_nchw.mDesc.GetLengths()[2] && wi >= 0 &&
|
||||
wi < in_nchw.mDesc.GetLengths()[3])
|
||||
{
|
||||
in_hold(n, c, htile, wtile, j, i) = in_nchw(n, c, hi, wi);
|
||||
}
|
||||
else
|
||||
{
|
||||
in_hold(n, c, htile, wtile, j, i) = TIn(0);
|
||||
}
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
auto f_in_transform = [&](auto n, auto c, auto htile, auto wtile) {
|
||||
in_transform(n, c, htile, wtile, 0, 0) =
|
||||
in_hold(n, c, htile, wtile, 0, 0) - in_hold(n, c, htile, wtile, 0, 2) -
|
||||
in_hold(n, c, htile, wtile, 2, 0) + in_hold(n, c, htile, wtile, 2, 2);
|
||||
in_transform(n, c, htile, wtile, 0, 1) =
|
||||
in_hold(n, c, htile, wtile, 0, 1) + in_hold(n, c, htile, wtile, 0, 2) -
|
||||
in_hold(n, c, htile, wtile, 2, 1) - in_hold(n, c, htile, wtile, 2, 2);
|
||||
in_transform(n, c, htile, wtile, 0, 2) =
|
||||
-in_hold(n, c, htile, wtile, 0, 1) + in_hold(n, c, htile, wtile, 0, 2) +
|
||||
in_hold(n, c, htile, wtile, 2, 1) - in_hold(n, c, htile, wtile, 2, 2);
|
||||
in_transform(n, c, htile, wtile, 0, 3) =
|
||||
in_hold(n, c, htile, wtile, 0, 1) - in_hold(n, c, htile, wtile, 0, 3) -
|
||||
in_hold(n, c, htile, wtile, 2, 1) + in_hold(n, c, htile, wtile, 2, 3);
|
||||
|
||||
in_transform(n, c, htile, wtile, 1, 0) =
|
||||
in_hold(n, c, htile, wtile, 1, 0) - in_hold(n, c, htile, wtile, 1, 2) +
|
||||
in_hold(n, c, htile, wtile, 2, 0) - in_hold(n, c, htile, wtile, 2, 2);
|
||||
in_transform(n, c, htile, wtile, 1, 1) =
|
||||
in_hold(n, c, htile, wtile, 1, 1) + in_hold(n, c, htile, wtile, 1, 2) +
|
||||
in_hold(n, c, htile, wtile, 2, 1) + in_hold(n, c, htile, wtile, 2, 2);
|
||||
in_transform(n, c, htile, wtile, 1, 2) =
|
||||
-in_hold(n, c, htile, wtile, 1, 1) + in_hold(n, c, htile, wtile, 1, 2) -
|
||||
in_hold(n, c, htile, wtile, 2, 1) + in_hold(n, c, htile, wtile, 2, 2);
|
||||
in_transform(n, c, htile, wtile, 1, 3) =
|
||||
in_hold(n, c, htile, wtile, 1, 1) - in_hold(n, c, htile, wtile, 1, 3) +
|
||||
in_hold(n, c, htile, wtile, 2, 1) - in_hold(n, c, htile, wtile, 2, 3);
|
||||
|
||||
in_transform(n, c, htile, wtile, 2, 0) =
|
||||
-in_hold(n, c, htile, wtile, 1, 0) + in_hold(n, c, htile, wtile, 1, 2) +
|
||||
in_hold(n, c, htile, wtile, 2, 0) - in_hold(n, c, htile, wtile, 2, 2);
|
||||
in_transform(n, c, htile, wtile, 2, 1) =
|
||||
-in_hold(n, c, htile, wtile, 1, 1) - in_hold(n, c, htile, wtile, 1, 2) +
|
||||
in_hold(n, c, htile, wtile, 2, 1) + in_hold(n, c, htile, wtile, 2, 2);
|
||||
in_transform(n, c, htile, wtile, 2, 2) =
|
||||
in_hold(n, c, htile, wtile, 1, 1) - in_hold(n, c, htile, wtile, 1, 2) -
|
||||
in_hold(n, c, htile, wtile, 2, 1) + in_hold(n, c, htile, wtile, 2, 2);
|
||||
in_transform(n, c, htile, wtile, 2, 3) =
|
||||
-in_hold(n, c, htile, wtile, 1, 1) + in_hold(n, c, htile, wtile, 1, 3) +
|
||||
in_hold(n, c, htile, wtile, 2, 1) - in_hold(n, c, htile, wtile, 2, 3);
|
||||
|
||||
in_transform(n, c, htile, wtile, 3, 0) =
|
||||
in_hold(n, c, htile, wtile, 1, 0) - in_hold(n, c, htile, wtile, 1, 2) -
|
||||
in_hold(n, c, htile, wtile, 3, 0) + in_hold(n, c, htile, wtile, 3, 2);
|
||||
in_transform(n, c, htile, wtile, 3, 1) =
|
||||
in_hold(n, c, htile, wtile, 1, 1) + in_hold(n, c, htile, wtile, 1, 2) -
|
||||
in_hold(n, c, htile, wtile, 3, 1) - in_hold(n, c, htile, wtile, 3, 2);
|
||||
in_transform(n, c, htile, wtile, 3, 2) =
|
||||
-in_hold(n, c, htile, wtile, 1, 1) + in_hold(n, c, htile, wtile, 1, 2) +
|
||||
in_hold(n, c, htile, wtile, 3, 1) - in_hold(n, c, htile, wtile, 3, 2);
|
||||
in_transform(n, c, htile, wtile, 3, 3) =
|
||||
in_hold(n, c, htile, wtile, 1, 1) - in_hold(n, c, htile, wtile, 1, 3) -
|
||||
in_hold(n, c, htile, wtile, 3, 1) + in_hold(n, c, htile, wtile, 3, 3);
|
||||
};
|
||||
|
||||
auto f_wei_transform = [&](auto k, auto c) {
|
||||
wei_transform(k, c, 0, 0) = double(wei_kcyx(k, c, 0, 0));
|
||||
wei_transform(k, c, 0, 1) = 0.5 * double(wei_kcyx(k, c, 0, 0)) +
|
||||
0.5 * double(wei_kcyx(k, c, 0, 1)) +
|
||||
0.5 * double(wei_kcyx(k, c, 0, 2));
|
||||
wei_transform(k, c, 0, 2) = 0.5 * double(wei_kcyx(k, c, 0, 0)) -
|
||||
0.5 * double(wei_kcyx(k, c, 0, 1)) +
|
||||
0.5 * double(wei_kcyx(k, c, 0, 2));
|
||||
wei_transform(k, c, 0, 3) = double(wei_kcyx(k, c, 0, 2));
|
||||
|
||||
wei_transform(k, c, 1, 0) = 0.5 * double(wei_kcyx(k, c, 0, 0)) +
|
||||
0.5 * double(wei_kcyx(k, c, 1, 0)) +
|
||||
0.5 * double(wei_kcyx(k, c, 2, 0));
|
||||
wei_transform(k, c, 1, 1) =
|
||||
0.25 * double(wei_kcyx(k, c, 0, 0)) + 0.25 * double(wei_kcyx(k, c, 0, 1)) +
|
||||
0.25 * double(wei_kcyx(k, c, 0, 2)) + 0.25 * double(wei_kcyx(k, c, 1, 0)) +
|
||||
0.25 * double(wei_kcyx(k, c, 1, 1)) + 0.25 * double(wei_kcyx(k, c, 1, 2)) +
|
||||
0.25 * double(wei_kcyx(k, c, 2, 0)) + 0.25 * double(wei_kcyx(k, c, 2, 1)) +
|
||||
0.25 * double(wei_kcyx(k, c, 2, 2));
|
||||
wei_transform(k, c, 1, 2) =
|
||||
0.25 * double(wei_kcyx(k, c, 0, 0)) - 0.25 * double(wei_kcyx(k, c, 0, 1)) +
|
||||
0.25 * double(wei_kcyx(k, c, 0, 2)) + 0.25 * double(wei_kcyx(k, c, 1, 0)) -
|
||||
0.25 * double(wei_kcyx(k, c, 1, 1)) + 0.25 * double(wei_kcyx(k, c, 1, 2)) +
|
||||
0.25 * double(wei_kcyx(k, c, 2, 0)) - 0.25 * double(wei_kcyx(k, c, 2, 1)) +
|
||||
0.25 * double(wei_kcyx(k, c, 2, 2));
|
||||
wei_transform(k, c, 1, 3) = 0.5 * double(wei_kcyx(k, c, 0, 2)) +
|
||||
0.5 * double(wei_kcyx(k, c, 1, 2)) +
|
||||
0.5 * double(wei_kcyx(k, c, 2, 2));
|
||||
|
||||
wei_transform(k, c, 2, 0) = 0.5 * double(wei_kcyx(k, c, 0, 0)) -
|
||||
0.5 * double(wei_kcyx(k, c, 1, 0)) +
|
||||
0.5 * double(wei_kcyx(k, c, 2, 0));
|
||||
wei_transform(k, c, 2, 1) =
|
||||
0.25 * double(wei_kcyx(k, c, 0, 0)) + 0.25 * double(wei_kcyx(k, c, 0, 1)) +
|
||||
0.25 * double(wei_kcyx(k, c, 0, 2)) - 0.25 * double(wei_kcyx(k, c, 1, 0)) -
|
||||
0.25 * double(wei_kcyx(k, c, 1, 1)) - 0.25 * double(wei_kcyx(k, c, 1, 2)) +
|
||||
0.25 * double(wei_kcyx(k, c, 2, 0)) + 0.25 * double(wei_kcyx(k, c, 2, 1)) +
|
||||
0.25 * double(wei_kcyx(k, c, 2, 2));
|
||||
wei_transform(k, c, 2, 2) =
|
||||
0.25 * double(wei_kcyx(k, c, 0, 0)) - 0.25 * double(wei_kcyx(k, c, 0, 1)) +
|
||||
0.25 * double(wei_kcyx(k, c, 0, 2)) - 0.25 * double(wei_kcyx(k, c, 1, 0)) +
|
||||
0.25 * double(wei_kcyx(k, c, 1, 1)) - 0.25 * double(wei_kcyx(k, c, 1, 2)) +
|
||||
0.25 * double(wei_kcyx(k, c, 2, 0)) - 0.25 * double(wei_kcyx(k, c, 2, 1)) +
|
||||
0.25 * double(wei_kcyx(k, c, 2, 2));
|
||||
wei_transform(k, c, 2, 3) = 0.5 * double(wei_kcyx(k, c, 0, 2)) -
|
||||
0.5 * double(wei_kcyx(k, c, 1, 2)) +
|
||||
0.5 * double(wei_kcyx(k, c, 2, 2));
|
||||
|
||||
wei_transform(k, c, 3, 0) = double(wei_kcyx(k, c, 2, 0));
|
||||
wei_transform(k, c, 3, 1) = 0.5 * double(wei_kcyx(k, c, 2, 0)) +
|
||||
0.5 * double(wei_kcyx(k, c, 2, 1)) +
|
||||
0.5 * double(wei_kcyx(k, c, 2, 2));
|
||||
wei_transform(k, c, 3, 2) = 0.5 * double(wei_kcyx(k, c, 2, 0)) -
|
||||
0.5 * double(wei_kcyx(k, c, 2, 1)) +
|
||||
0.5 * double(wei_kcyx(k, c, 2, 2));
|
||||
wei_transform(k, c, 3, 3) = double(wei_kcyx(k, c, 2, 2));
|
||||
};
|
||||
|
||||
auto f_out_transform = [&](auto n, auto k, auto htile, auto wtile) {
|
||||
for(int j = 0; j < HiPerTile; ++j)
|
||||
{
|
||||
for(int i = 0; i < WiPerTile; ++i)
|
||||
{
|
||||
double v = 0;
|
||||
for(int c = 0; c < C; ++c)
|
||||
{
|
||||
v += in_transform(n, c, htile, wtile, j, i) * wei_transform(k, c, j, i);
|
||||
}
|
||||
|
||||
out_transform(n, k, htile, wtile, j, i) = v;
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
auto f_out_hold = [&](auto n, auto k, auto htile, auto wtile) {
|
||||
out_hold(n, k, htile, wtile, 0, 0) =
|
||||
out_transform(n, k, htile, wtile, 0, 0) + out_transform(n, k, htile, wtile, 0, 1) +
|
||||
out_transform(n, k, htile, wtile, 0, 2) + out_transform(n, k, htile, wtile, 1, 0) +
|
||||
out_transform(n, k, htile, wtile, 1, 1) + out_transform(n, k, htile, wtile, 1, 2) +
|
||||
out_transform(n, k, htile, wtile, 2, 0) + out_transform(n, k, htile, wtile, 2, 1) +
|
||||
out_transform(n, k, htile, wtile, 2, 2);
|
||||
out_hold(n, k, htile, wtile, 0, 1) =
|
||||
out_transform(n, k, htile, wtile, 0, 1) - out_transform(n, k, htile, wtile, 0, 2) -
|
||||
out_transform(n, k, htile, wtile, 0, 3) + out_transform(n, k, htile, wtile, 1, 1) -
|
||||
out_transform(n, k, htile, wtile, 1, 2) - out_transform(n, k, htile, wtile, 1, 3) +
|
||||
out_transform(n, k, htile, wtile, 2, 1) - out_transform(n, k, htile, wtile, 2, 2) -
|
||||
out_transform(n, k, htile, wtile, 2, 3);
|
||||
out_hold(n, k, htile, wtile, 1, 0) =
|
||||
out_transform(n, k, htile, wtile, 1, 0) + out_transform(n, k, htile, wtile, 1, 1) +
|
||||
out_transform(n, k, htile, wtile, 1, 2) - out_transform(n, k, htile, wtile, 2, 0) -
|
||||
out_transform(n, k, htile, wtile, 2, 1) - out_transform(n, k, htile, wtile, 2, 2) -
|
||||
out_transform(n, k, htile, wtile, 3, 0) - out_transform(n, k, htile, wtile, 3, 1) -
|
||||
out_transform(n, k, htile, wtile, 3, 2);
|
||||
out_hold(n, k, htile, wtile, 1, 1) =
|
||||
out_transform(n, k, htile, wtile, 1, 1) - out_transform(n, k, htile, wtile, 1, 2) -
|
||||
out_transform(n, k, htile, wtile, 1, 3) - out_transform(n, k, htile, wtile, 2, 1) +
|
||||
out_transform(n, k, htile, wtile, 2, 2) + out_transform(n, k, htile, wtile, 2, 3) -
|
||||
out_transform(n, k, htile, wtile, 3, 1) + out_transform(n, k, htile, wtile, 3, 2) +
|
||||
out_transform(n, k, htile, wtile, 3, 3);
|
||||
};
|
||||
|
||||
auto f_out = [&](auto n, auto k, auto htile, auto wtile) {
|
||||
for(int j = 0; j < HoPerTile; ++j)
|
||||
{
|
||||
std::size_t ho = HoPerTile * htile + j;
|
||||
for(int i = 0; i < WoPerTile; ++i)
|
||||
{
|
||||
std::size_t wo = WoPerTile * wtile + i;
|
||||
out_nkhw(n, k, ho, wo) = out_hold(n, k, htile, wtile, j, i);
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
std::size_t num_thread = std::thread::hardware_concurrency();
|
||||
|
||||
make_ParallelTensorFunctor(f_in_hold, N, C, HTile, WTile)(num_thread);
|
||||
make_ParallelTensorFunctor(f_in_transform, N, C, HTile, WTile)(num_thread);
|
||||
make_ParallelTensorFunctor(f_wei_transform, K, C)(num_thread);
|
||||
make_ParallelTensorFunctor(f_out_transform, N, K, HTile, WTile)(num_thread);
|
||||
make_ParallelTensorFunctor(f_out_hold, N, K, HTile, WTile)(num_thread);
|
||||
make_ParallelTensorFunctor(f_out, N, K, HTile, WTile)(num_thread);
|
||||
make_ParallelTensorFunctor(f_nchw,
|
||||
out.mDesc.GetLengths()[0],
|
||||
out.mDesc.GetLengths()[1],
|
||||
out.mDesc.GetLengths()[2],
|
||||
out.mDesc.GetLengths()[3])(std::thread::hardware_concurrency());
|
||||
}
|
||||
|
||||
@@ -1,135 +0,0 @@
|
||||
#pragma once
|
||||
#include "host_tensor.hpp"
|
||||
|
||||
template <typename TIn,
|
||||
typename TWei,
|
||||
typename TOut,
|
||||
typename ConvStrides,
|
||||
typename ConvDilations,
|
||||
typename InLeftPads,
|
||||
typename InRightPads>
|
||||
void host_direct_convolution_backward_data(Tensor<TIn>& in,
|
||||
const Tensor<TWei>& wei,
|
||||
const Tensor<TOut>& out,
|
||||
const ConvStrides& conv_strides,
|
||||
const ConvDilations& conv_dilations,
|
||||
const InLeftPads& in_left_pads,
|
||||
const InRightPads& /* in_right_pads */,
|
||||
const ConvTensorLayout layout = ConvTensorLayout::NCHW)
|
||||
{
|
||||
using namespace ck;
|
||||
|
||||
constexpr auto I0 = Number<0>{};
|
||||
constexpr auto I1 = Number<1>{};
|
||||
constexpr auto I2 = Number<2>{};
|
||||
constexpr auto I3 = Number<3>{};
|
||||
|
||||
auto f_nchw = [&](auto n, auto c, auto hi, auto wi) {
|
||||
std::size_t K = wei.mDesc.GetLengths()[I0];
|
||||
std::size_t Y = wei.mDesc.GetLengths()[I2];
|
||||
std::size_t X = wei.mDesc.GetLengths()[I3];
|
||||
|
||||
std::size_t Ho = out.mDesc.GetLengths()[I2];
|
||||
std::size_t Wo = out.mDesc.GetLengths()[I3];
|
||||
|
||||
double v = 0;
|
||||
|
||||
for(int y = 0; y < Y; ++y)
|
||||
{
|
||||
int h_tmp = hi + in_left_pads[I0] - y * conv_dilations[I0];
|
||||
|
||||
if(h_tmp % conv_strides[I0] == 0)
|
||||
{
|
||||
int ho = h_tmp / conv_strides[I0];
|
||||
|
||||
if(ho >= 0 && ho < Ho)
|
||||
{
|
||||
for(int x = 0; x < X; ++x)
|
||||
{
|
||||
int w_tmp = wi + in_left_pads[I1] - x * conv_dilations[I1];
|
||||
|
||||
if(w_tmp % conv_strides[I1] == 0)
|
||||
{
|
||||
int wo = w_tmp / conv_strides[I1];
|
||||
|
||||
if(wo >= 0 && wo < Wo)
|
||||
{
|
||||
for(int k = 0; k < K; ++k)
|
||||
{
|
||||
v += out(n, k, ho, wo) * wei(k, c, y, x);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
in(n, c, hi, wi) = v;
|
||||
};
|
||||
|
||||
auto f_nhwc = [&](auto n, auto hi, auto wi, auto c) {
|
||||
std::size_t K = wei.mDesc.GetLengths()[I0];
|
||||
std::size_t Y = wei.mDesc.GetLengths()[I1];
|
||||
std::size_t X = wei.mDesc.GetLengths()[I2];
|
||||
|
||||
std::size_t Ho = out.mDesc.GetLengths()[I1];
|
||||
std::size_t Wo = out.mDesc.GetLengths()[I2];
|
||||
|
||||
double v = 0;
|
||||
|
||||
for(int y = 0; y < Y; ++y)
|
||||
{
|
||||
int h_tmp = hi + in_left_pads[I0] - y * conv_dilations[I0];
|
||||
|
||||
if(h_tmp % conv_strides[I0] == 0)
|
||||
{
|
||||
int ho = h_tmp / conv_strides[I0];
|
||||
|
||||
if(ho >= 0 && ho < Ho)
|
||||
{
|
||||
for(int x = 0; x < X; ++x)
|
||||
{
|
||||
int w_tmp = wi + in_left_pads[I1] - x * conv_dilations[I1];
|
||||
|
||||
if(w_tmp % conv_strides[I1] == 0)
|
||||
{
|
||||
int wo = w_tmp / conv_strides[I1];
|
||||
|
||||
if(wo >= 0 && wo < Wo)
|
||||
{
|
||||
for(int k = 0; k < K; ++k)
|
||||
{
|
||||
v += out(n, ho, wo, k) * wei(k, y, x, c);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
in(n, hi, wi, c) = v;
|
||||
};
|
||||
|
||||
if(layout == ConvTensorLayout::NCHW)
|
||||
{
|
||||
make_ParallelTensorFunctor(f_nchw,
|
||||
in.mDesc.GetLengths()[0],
|
||||
in.mDesc.GetLengths()[1],
|
||||
in.mDesc.GetLengths()[2],
|
||||
in.mDesc.GetLengths()[3])(std::thread::hardware_concurrency());
|
||||
}
|
||||
else if(layout == ConvTensorLayout::NHWC)
|
||||
{
|
||||
make_ParallelTensorFunctor(f_nhwc,
|
||||
in.mDesc.GetLengths()[0],
|
||||
in.mDesc.GetLengths()[1],
|
||||
in.mDesc.GetLengths()[2],
|
||||
in.mDesc.GetLengths()[3])(std::thread::hardware_concurrency());
|
||||
}
|
||||
else
|
||||
{
|
||||
throw std::runtime_error("wrong! not supported layout");
|
||||
}
|
||||
}
|
||||
@@ -1,89 +0,0 @@
|
||||
#pragma once
|
||||
#include "host_tensor.hpp"
|
||||
|
||||
template <typename TOut,
|
||||
typename TIn,
|
||||
typename TWei,
|
||||
typename ConvStrides,
|
||||
typename ConvDilations,
|
||||
typename InLeftPads,
|
||||
typename InRightPads>
|
||||
void host_direct_convolution_backward_weights(
|
||||
const Tensor<TOut>& out,
|
||||
const Tensor<TIn>& in,
|
||||
Tensor<TWei>& wei,
|
||||
const ConvStrides& conv_strides,
|
||||
const ConvDilations& conv_dilations,
|
||||
const InLeftPads& in_left_pads,
|
||||
const InRightPads&,
|
||||
const ConvTensorLayout layout = ConvTensorLayout::NCHW)
|
||||
{
|
||||
using namespace ck;
|
||||
|
||||
constexpr auto I0 = Number<0>{};
|
||||
constexpr auto I1 = Number<1>{};
|
||||
auto f_kcyx = [&](auto k, auto c, auto y, auto x) {
|
||||
double v = 0;
|
||||
for(int n = 0; n < out.mDesc.GetLengths()[0]; ++n)
|
||||
{
|
||||
for(int ho = 0; ho < out.mDesc.GetLengths()[2]; ++ho)
|
||||
{
|
||||
int hi = ho * conv_strides[I0] + y * conv_dilations[I0] - in_left_pads[I0];
|
||||
for(int wo = 0; wo < out.mDesc.GetLengths()[3]; ++wo)
|
||||
{
|
||||
int wi = wo * conv_strides[I1] + x * conv_dilations[I1] - in_left_pads[I1];
|
||||
if(hi >= 0 && hi < in.mDesc.GetLengths()[2] && wi >= 0 &&
|
||||
wi < in.mDesc.GetLengths()[3])
|
||||
{
|
||||
v += static_cast<const double>(in(n, c, hi, wi)) *
|
||||
static_cast<const double>(out(n, k, ho, wo));
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
wei(k, c, y, x) = v;
|
||||
};
|
||||
|
||||
auto f_kyxc = [&](auto k, auto y, auto x, auto c) {
|
||||
double v = 0;
|
||||
for(int n = 0; n < out.mDesc.GetLengths()[0]; ++n)
|
||||
{
|
||||
for(int ho = 0; ho < out.mDesc.GetLengths()[1]; ++ho)
|
||||
{
|
||||
int hi = ho * conv_strides[I0] + y * conv_dilations[I0] - in_left_pads[I0];
|
||||
for(int wo = 0; wo < out.mDesc.GetLengths()[2]; ++wo)
|
||||
{
|
||||
int wi = wo * conv_strides[I1] + x * conv_dilations[I1] - in_left_pads[I1];
|
||||
if(hi >= 0 && hi < in.mDesc.GetLengths()[1] && wi >= 0 &&
|
||||
wi < in.mDesc.GetLengths()[2])
|
||||
{
|
||||
v += static_cast<const double>(in(n, hi, wi, c)) *
|
||||
static_cast<const double>(out(n, ho, wo, k));
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
wei(k, y, x, c) = v;
|
||||
};
|
||||
|
||||
if(layout == ConvTensorLayout::NCHW)
|
||||
{
|
||||
make_ParallelTensorFunctor(f_kcyx,
|
||||
wei.mDesc.GetLengths()[0],
|
||||
wei.mDesc.GetLengths()[1],
|
||||
wei.mDesc.GetLengths()[2],
|
||||
wei.mDesc.GetLengths()[3])(std::thread::hardware_concurrency());
|
||||
}
|
||||
else if(layout == ConvTensorLayout::NHWC)
|
||||
{
|
||||
make_ParallelTensorFunctor(f_kyxc,
|
||||
wei.mDesc.GetLengths()[0],
|
||||
wei.mDesc.GetLengths()[1],
|
||||
wei.mDesc.GetLengths()[2],
|
||||
wei.mDesc.GetLengths()[3])(std::thread::hardware_concurrency());
|
||||
}
|
||||
else
|
||||
{
|
||||
throw std::runtime_error("wrong! not supported layout");
|
||||
}
|
||||
}
|
||||
@@ -157,3 +157,26 @@ void host_gemm(const Tensor<AType>& a,
|
||||
throw std::runtime_error("wrong! not supported layout");
|
||||
}
|
||||
}
|
||||
|
||||
template <typename AType, typename BType, typename CType>
|
||||
void host_gemm_mk_kn_mn(const Tensor<AType>& a_m_k,
|
||||
const Tensor<BType>& b_k_n,
|
||||
Tensor<CType>& c_m_n)
|
||||
{
|
||||
auto f_mk_kn_mn = [&](auto m, auto n) {
|
||||
const int K = a_m_k.mDesc.GetLengths()[1];
|
||||
|
||||
double v = 0;
|
||||
|
||||
for(int k = 0; k < K; ++k)
|
||||
{
|
||||
v += static_cast<const double>(a_m_k(m, k)) * static_cast<const double>(b_k_n(k, n));
|
||||
}
|
||||
|
||||
c_m_n(m, n) = v;
|
||||
};
|
||||
|
||||
make_ParallelTensorFunctor(f_mk_kn_mn,
|
||||
c_m_n.mDesc.GetLengths()[0],
|
||||
c_m_n.mDesc.GetLengths()[1])(std::thread::hardware_concurrency());
|
||||
}
|
||||
|
||||
@@ -120,6 +120,8 @@ struct HostTensorDescriptor
|
||||
return std::inner_product(iss.begin(), iss.end(), mStrides.begin(), std::size_t{0});
|
||||
}
|
||||
|
||||
friend std::ostream& operator<<(std::ostream& os, const HostTensorDescriptor& desc);
|
||||
|
||||
private:
|
||||
std::vector<std::size_t> mLens;
|
||||
std::vector<std::size_t> mStrides;
|
||||
@@ -224,7 +226,7 @@ struct Tensor
|
||||
Tensor(const HostTensorDescriptor& desc) : mDesc(desc), mData(mDesc.GetElementSpace()) {}
|
||||
|
||||
template <typename G>
|
||||
void GenerateTensorValue(G g, std::size_t num_thread = 1)
|
||||
void GenerateTensorValue(G g, std::size_t num_thread = std::thread::hardware_concurrency())
|
||||
{
|
||||
switch(mDesc.GetNumOfDimension())
|
||||
{
|
||||
|
||||
Reference in New Issue
Block a user