reorganize files to prepare for MIOpen integration (#51)

* change olc cmake * adding online compile to fwd-v4r5r2 * update scripts * remane fwd-v4r5r2 to fwd-v6r1 * clean up
2026-05-11 17:00:18 +00:00 · 2021-07-18 00:43:05 -05:00
parent fbdf4332c7
commit 1264925422
110 changed files with 2992 additions and 4982 deletions
--- a/host/host_tensor/include/conv_common.hpp
+++ b/host/host_tensor/include/conv_common.hpp
@@ -0,0 +1,86 @@
+#ifndef CONV_COMMON_HPP
+#define CONV_COMMON_HPP
+
+#include "dynamic_tensor_descriptor.hpp"
+
+enum ConvTensorLayout
+{
+    NCHW,
+    NHWC,
+    CHWN,
+    NCHWc,
+    NHWCc
+};
+
+template <typename... InDesc,
+          typename... WeiDesc,
+          typename ConvStrides,
+          typename ConvDilations,
+          typename LeftPads,
+          typename RightPads>
+constexpr auto get_convolution_output_default_4d_tensor_descriptor(
+    const ck::DynamicTensorDescriptor<InDesc...>& in_desc,
+    const ck::DynamicTensorDescriptor<WeiDesc...>& wei_desc,
+    const ConvStrides& conv_strides,
+    const ConvDilations conv_dilations,
+    const LeftPads& left_pads,
+    const RightPads& right_pads)
+{
+    using namespace ck;
+
+    constexpr auto I0 = Number<0>{};
+    constexpr auto I1 = Number<1>{};
+    constexpr auto I2 = Number<2>{};
+    constexpr auto I3 = Number<3>{};
+
+    assert(in_desc.GetNumOfDimension() == 4);
+    assert(wei_desc.GetNumOfDimension() == 4);
+    assert(in_desc.GetLength(I1) == wei_desc.GetLength(I1));
+
+    const auto N  = in_desc.GetLength(I0);
+    const auto Hi = in_desc.GetLength(I2);
+    const auto Wi = in_desc.GetLength(I3);
+
+    const auto K = wei_desc.GetLength(I0);
+    const auto Y = wei_desc.GetLength(I2);
+    const auto X = wei_desc.GetLength(I3);
+
+    const auto LeftPadH = left_pads[I0];
+    const auto LeftPadW = left_pads[I1];
+
+    const auto RightPadH = right_pads[I0];
+    const auto RightPadW = right_pads[I1];
+
+    const auto YEff = (Y - I1) * conv_dilations[I0] + I1;
+    const auto XEff = (X - I1) * conv_dilations[I1] + I1;
+
+    const auto Ho = (Hi + LeftPadH + RightPadH - YEff) / conv_strides[I0] + I1;
+    const auto Wo = (Wi + LeftPadW + RightPadW - XEff) / conv_strides[I1] + I1;
+
+    return make_dynamic_naive_tensor_descriptor_packed_v2(make_tuple(N, K, Ho, Wo));
+}
+
+template <class InDesc, class WeiDesc, class OutDesc>
+constexpr std::size_t
+calculate_convolution_flops(const InDesc& in_desc, const WeiDesc& wei_desc, const OutDesc& out_desc)
+{
+    using namespace ck;
+
+    constexpr auto I0 = Number<0>{};
+    constexpr auto I1 = Number<1>{};
+    constexpr auto I2 = Number<2>{};
+    constexpr auto I3 = Number<3>{};
+
+    const index_t N  = out_desc.GetLength(I0);
+    const index_t K  = out_desc.GetLength(I1);
+    const index_t Ho = out_desc.GetLength(I2);
+    const index_t Wo = out_desc.GetLength(I3);
+
+    const index_t C = wei_desc.GetLength(I1);
+    const index_t Y = wei_desc.GetLength(I2);
+    const index_t X = wei_desc.GetLength(I3);
+
+    return std::size_t(2) * N * K * Ho * Wo * C * Y * X;
+}
+
+#endif
--- a/host/host_tensor/include/device.hpp
+++ b/host/host_tensor/include/device.hpp
@@ -0,0 +1,86 @@
+#ifndef DEVICE_HPP
+#define DEVICE_HPP
+
+#include <memory>
+#include "hip/hip_runtime.h"
+#include "hip/hip_fp16.h"
+
+struct DeviceMem
+{
+    DeviceMem() = delete;
+    DeviceMem(std::size_t mem_size);
+    void* GetDeviceBuffer();
+    void ToDevice(const void* p);
+    void FromDevice(void* p);
+    ~DeviceMem();
+
+    void* mpDeviceBuf;
+    std::size_t mMemSize;
+};
+
+struct KernelTimerImpl;
+
+struct KernelTimer
+{
+    KernelTimer();
+    ~KernelTimer();
+    void Start();
+    void End();
+    float GetElapsedTime() const;
+
+    std::unique_ptr<KernelTimerImpl> impl;
+};
+
+using device_stream_t = hipStream_t;
+
+template <typename... Args, typename F>
+void launch_kernel(F kernel,
+                   dim3 grid_dim,
+                   dim3 block_dim,
+                   std::size_t lds_byte,
+                   hipStream_t stream_id,
+                   Args... args)
+{
+    hipLaunchKernelGGL(kernel, grid_dim, block_dim, lds_byte, stream_id, args...);
+}
+
+template <typename... Args, typename F>
+float launch_and_time_kernel(F kernel,
+                             int nrepeat,
+                             dim3 grid_dim,
+                             dim3 block_dim,
+                             std::size_t lds_byte,
+                             hipStream_t stream_id,
+                             Args... args)
+{
+    KernelTimer timer;
+
+    printf("%s: grid_dim {%d, %d, %d}, block_dim {%d, %d, %d} \n",
+           __func__,
+           grid_dim.x,
+           grid_dim.y,
+           grid_dim.z,
+           block_dim.x,
+           block_dim.y,
+           block_dim.z);
+
+    printf("Warm up\n");
+
+    // warm up
+    hipLaunchKernelGGL(kernel, grid_dim, block_dim, lds_byte, stream_id, args...);
+
+    printf("Start running %d times...\n", nrepeat);
+
+    timer.Start();
+
+    for(int i = 0; i < nrepeat; ++i)
+    {
+        hipLaunchKernelGGL(kernel, grid_dim, block_dim, lds_byte, stream_id, args...);
+    }
+
+    timer.End();
+
+    return timer.GetElapsedTime() / nrepeat;
+}
+
+#endif
--- a/host/host_tensor/include/device_tensor.hpp
+++ b/host/host_tensor/include/device_tensor.hpp
@@ -0,0 +1,9 @@
+#pragma once
+#include "host_tensor.hpp"
+#include "common_header.hpp"
+
+template <typename TensorDesc>
+void ostream_tensor_descriptor(TensorDesc, std::ostream& os = std::cout)
+{
+    ostream_HostTensorDescriptor(make_HostTensorDescriptor(TensorDesc{}), os);
+}
--- a/host/host_tensor/include/host_conv.hpp
+++ b/host/host_tensor/include/host_conv.hpp
@@ -0,0 +1,326 @@
+#pragma once
+#include "host_tensor.hpp"
+
+template <typename TIn,
+          typename TWei,
+          typename TOut,
+          typename ConvStrides,
+          typename ConvDilations,
+          typename InLeftPads,
+          typename InRightPads>
+void host_direct_convolution(const Tensor<TIn>& in,
+                             const Tensor<TWei>& wei,
+                             Tensor<TOut>& out,
+                             const ConvStrides& conv_strides,
+                             const ConvDilations& conv_dilations,
+                             const InLeftPads& in_left_pads,
+                             const InRightPads& in_right_pads,
+                             const ConvTensorLayout layout = ConvTensorLayout::NCHW)
+{
+    using namespace ck;
+
+    constexpr auto I0 = Number<0>{};
+    constexpr auto I1 = Number<1>{};
+    constexpr auto I2 = Number<2>{};
+    constexpr auto I3 = Number<3>{};
+
+    auto f_nchw = [&](auto n, auto k, auto ho, auto wo) {
+        double v = 0;
+        for(int c = 0; c < wei.mDesc.GetLengths()[1]; ++c)
+        {
+            for(int y = 0; y < wei.mDesc.GetLengths()[2]; ++y)
+            {
+                int hi = ho * conv_strides[I0] + y * conv_dilations[I0] - in_left_pads[I0];
+                for(int x = 0; x < wei.mDesc.GetLengths()[3]; ++x)
+                {
+                    int wi = wo * conv_strides[I1] + x * conv_dilations[I1] - in_left_pads[I1];
+                    if(hi >= 0 && hi < in.mDesc.GetLengths()[2] && wi >= 0 &&
+                       wi < in.mDesc.GetLengths()[3])
+                    {
+                        v += static_cast<const double>(in(n, c, hi, wi)) *
+                             static_cast<const double>(wei(k, c, y, x));
+                    }
+                }
+            }
+        }
+        out(n, k, ho, wo) = v;
+    };
+
+    auto f_nhwc = [&](auto n, auto ho, auto wo, auto k) {
+        double v = 0;
+        for(int c = 0; c < wei.mDesc.GetLengths()[3]; ++c)
+        {
+            for(int y = 0; y < wei.mDesc.GetLengths()[1]; ++y)
+            {
+                int hi = ho * conv_strides[I0] + y * conv_dilations[I0] - in_left_pads[I0];
+                for(int x = 0; x < wei.mDesc.GetLengths()[2]; ++x)
+                {
+                    int wi = wo * conv_strides[I1] + x * conv_dilations[I1] - in_left_pads[I1];
+                    if(hi >= 0 && hi < in.mDesc.GetLengths()[1] && wi >= 0 &&
+                       wi < in.mDesc.GetLengths()[2])
+                    {
+                        v += static_cast<const double>(in(n, hi, wi, c)) *
+                             static_cast<const double>(wei(k, y, x, c));
+                    }
+                }
+            }
+        }
+        out(n, ho, wo, k) = v;
+    };
+
+    switch(layout)
+    {
+    case ConvTensorLayout::NCHW:
+        make_ParallelTensorFunctor(f_nchw,
+                                   out.mDesc.GetLengths()[0],
+                                   out.mDesc.GetLengths()[1],
+                                   out.mDesc.GetLengths()[2],
+                                   out.mDesc.GetLengths()[3])(std::thread::hardware_concurrency());
+        break;
+    case ConvTensorLayout::NHWC:
+        make_ParallelTensorFunctor(f_nhwc,
+                                   out.mDesc.GetLengths()[0],
+                                   out.mDesc.GetLengths()[1],
+                                   out.mDesc.GetLengths()[2],
+                                   out.mDesc.GetLengths()[3])(std::thread::hardware_concurrency());
+        break;
+    default: throw std::runtime_error("wrong! not supported layout");
+    }
+}
+
+template <typename TIn, typename TWei, typename TOut, typename InLeftPads, typename InRightPads>
+void host_winograd_3x3_convolution(const Tensor<TIn>& in_nchw,
+                                   const Tensor<TWei>& wei_kcyx,
+                                   Tensor<TOut>& out_nkhw,
+                                   InLeftPads,
+                                   InRightPads)
+{
+    using namespace ck;
+
+    constexpr std::size_t HoPerTile = 2;
+    constexpr std::size_t WoPerTile = 2;
+
+    std::size_t N  = in_nchw.mDesc.GetLengths()[0];
+    std::size_t C  = in_nchw.mDesc.GetLengths()[1];
+    std::size_t HI = in_nchw.mDesc.GetLengths()[2];
+    std::size_t WI = in_nchw.mDesc.GetLengths()[3];
+
+    std::size_t K = wei_kcyx.mDesc.GetLengths()[0];
+    std::size_t Y = wei_kcyx.mDesc.GetLengths()[2];
+    std::size_t X = wei_kcyx.mDesc.GetLengths()[3];
+
+    std::size_t HO = out_nkhw.mDesc.GetLengths()[2];
+    std::size_t WO = out_nkhw.mDesc.GetLengths()[3];
+
+    index_t h_pad_low = InLeftPads{}.Get(Number<0>{});
+    index_t w_pad_low = InLeftPads{}.Get(Number<1>{});
+
+    std::size_t HiPerTile = HoPerTile + Y - 1;
+    std::size_t WiPerTile = WoPerTile + X - 1;
+
+    std::size_t HTile = (HO + HoPerTile - 1) / HoPerTile;
+    std::size_t WTile = (WO + WoPerTile - 1) / WoPerTile;
+
+    Tensor<double> in_hold({N, C, HTile, WTile, HiPerTile, WiPerTile});
+    Tensor<double> in_transform({N, C, HTile, WTile, HiPerTile, WiPerTile});
+    Tensor<double> wei_transform({K, C, HiPerTile, WiPerTile});
+    Tensor<double> out_transform({N, K, HTile, WTile, HiPerTile, HiPerTile});
+    Tensor<double> out_hold({N, K, HTile, WTile, HoPerTile, WoPerTile});
+
+    auto f_in_hold = [&](auto n, auto c, auto htile, auto wtile) {
+        for(int j = 0; j < HiPerTile; ++j)
+        {
+            int hi = HoPerTile * htile + j - h_pad_low;
+            for(int i = 0; i < WiPerTile; ++i)
+            {
+                int wi = WoPerTile * wtile + i - w_pad_low;
+
+                if(hi >= 0 && hi < in_nchw.mDesc.GetLengths()[2] && wi >= 0 &&
+                   wi < in_nchw.mDesc.GetLengths()[3])
+                {
+                    in_hold(n, c, htile, wtile, j, i) = in_nchw(n, c, hi, wi);
+                }
+                else
+                {
+                    in_hold(n, c, htile, wtile, j, i) = TIn(0);
+                }
+            }
+        }
+    };
+
+    auto f_in_transform = [&](auto n, auto c, auto htile, auto wtile) {
+        in_transform(n, c, htile, wtile, 0, 0) =
+            in_hold(n, c, htile, wtile, 0, 0) - in_hold(n, c, htile, wtile, 0, 2) -
+            in_hold(n, c, htile, wtile, 2, 0) + in_hold(n, c, htile, wtile, 2, 2);
+        in_transform(n, c, htile, wtile, 0, 1) =
+            in_hold(n, c, htile, wtile, 0, 1) + in_hold(n, c, htile, wtile, 0, 2) -
+            in_hold(n, c, htile, wtile, 2, 1) - in_hold(n, c, htile, wtile, 2, 2);
+        in_transform(n, c, htile, wtile, 0, 2) =
+            -in_hold(n, c, htile, wtile, 0, 1) + in_hold(n, c, htile, wtile, 0, 2) +
+            in_hold(n, c, htile, wtile, 2, 1) - in_hold(n, c, htile, wtile, 2, 2);
+        in_transform(n, c, htile, wtile, 0, 3) =
+            in_hold(n, c, htile, wtile, 0, 1) - in_hold(n, c, htile, wtile, 0, 3) -
+            in_hold(n, c, htile, wtile, 2, 1) + in_hold(n, c, htile, wtile, 2, 3);
+
+        in_transform(n, c, htile, wtile, 1, 0) =
+            in_hold(n, c, htile, wtile, 1, 0) - in_hold(n, c, htile, wtile, 1, 2) +
+            in_hold(n, c, htile, wtile, 2, 0) - in_hold(n, c, htile, wtile, 2, 2);
+        in_transform(n, c, htile, wtile, 1, 1) =
+            in_hold(n, c, htile, wtile, 1, 1) + in_hold(n, c, htile, wtile, 1, 2) +
+            in_hold(n, c, htile, wtile, 2, 1) + in_hold(n, c, htile, wtile, 2, 2);
+        in_transform(n, c, htile, wtile, 1, 2) =
+            -in_hold(n, c, htile, wtile, 1, 1) + in_hold(n, c, htile, wtile, 1, 2) -
+            in_hold(n, c, htile, wtile, 2, 1) + in_hold(n, c, htile, wtile, 2, 2);
+        in_transform(n, c, htile, wtile, 1, 3) =
+            in_hold(n, c, htile, wtile, 1, 1) - in_hold(n, c, htile, wtile, 1, 3) +
+            in_hold(n, c, htile, wtile, 2, 1) - in_hold(n, c, htile, wtile, 2, 3);
+
+        in_transform(n, c, htile, wtile, 2, 0) =
+            -in_hold(n, c, htile, wtile, 1, 0) + in_hold(n, c, htile, wtile, 1, 2) +
+            in_hold(n, c, htile, wtile, 2, 0) - in_hold(n, c, htile, wtile, 2, 2);
+        in_transform(n, c, htile, wtile, 2, 1) =
+            -in_hold(n, c, htile, wtile, 1, 1) - in_hold(n, c, htile, wtile, 1, 2) +
+            in_hold(n, c, htile, wtile, 2, 1) + in_hold(n, c, htile, wtile, 2, 2);
+        in_transform(n, c, htile, wtile, 2, 2) =
+            in_hold(n, c, htile, wtile, 1, 1) - in_hold(n, c, htile, wtile, 1, 2) -
+            in_hold(n, c, htile, wtile, 2, 1) + in_hold(n, c, htile, wtile, 2, 2);
+        in_transform(n, c, htile, wtile, 2, 3) =
+            -in_hold(n, c, htile, wtile, 1, 1) + in_hold(n, c, htile, wtile, 1, 3) +
+            in_hold(n, c, htile, wtile, 2, 1) - in_hold(n, c, htile, wtile, 2, 3);
+
+        in_transform(n, c, htile, wtile, 3, 0) =
+            in_hold(n, c, htile, wtile, 1, 0) - in_hold(n, c, htile, wtile, 1, 2) -
+            in_hold(n, c, htile, wtile, 3, 0) + in_hold(n, c, htile, wtile, 3, 2);
+        in_transform(n, c, htile, wtile, 3, 1) =
+            in_hold(n, c, htile, wtile, 1, 1) + in_hold(n, c, htile, wtile, 1, 2) -
+            in_hold(n, c, htile, wtile, 3, 1) - in_hold(n, c, htile, wtile, 3, 2);
+        in_transform(n, c, htile, wtile, 3, 2) =
+            -in_hold(n, c, htile, wtile, 1, 1) + in_hold(n, c, htile, wtile, 1, 2) +
+            in_hold(n, c, htile, wtile, 3, 1) - in_hold(n, c, htile, wtile, 3, 2);
+        in_transform(n, c, htile, wtile, 3, 3) =
+            in_hold(n, c, htile, wtile, 1, 1) - in_hold(n, c, htile, wtile, 1, 3) -
+            in_hold(n, c, htile, wtile, 3, 1) + in_hold(n, c, htile, wtile, 3, 3);
+    };
+
+    auto f_wei_transform = [&](auto k, auto c) {
+        wei_transform(k, c, 0, 0) = double(wei_kcyx(k, c, 0, 0));
+        wei_transform(k, c, 0, 1) = 0.5 * double(wei_kcyx(k, c, 0, 0)) +
+                                    0.5 * double(wei_kcyx(k, c, 0, 1)) +
+                                    0.5 * double(wei_kcyx(k, c, 0, 2));
+        wei_transform(k, c, 0, 2) = 0.5 * double(wei_kcyx(k, c, 0, 0)) -
+                                    0.5 * double(wei_kcyx(k, c, 0, 1)) +
+                                    0.5 * double(wei_kcyx(k, c, 0, 2));
+        wei_transform(k, c, 0, 3) = double(wei_kcyx(k, c, 0, 2));
+
+        wei_transform(k, c, 1, 0) = 0.5 * double(wei_kcyx(k, c, 0, 0)) +
+                                    0.5 * double(wei_kcyx(k, c, 1, 0)) +
+                                    0.5 * double(wei_kcyx(k, c, 2, 0));
+        wei_transform(k, c, 1, 1) =
+            0.25 * double(wei_kcyx(k, c, 0, 0)) + 0.25 * double(wei_kcyx(k, c, 0, 1)) +
+            0.25 * double(wei_kcyx(k, c, 0, 2)) + 0.25 * double(wei_kcyx(k, c, 1, 0)) +
+            0.25 * double(wei_kcyx(k, c, 1, 1)) + 0.25 * double(wei_kcyx(k, c, 1, 2)) +
+            0.25 * double(wei_kcyx(k, c, 2, 0)) + 0.25 * double(wei_kcyx(k, c, 2, 1)) +
+            0.25 * double(wei_kcyx(k, c, 2, 2));
+        wei_transform(k, c, 1, 2) =
+            0.25 * double(wei_kcyx(k, c, 0, 0)) - 0.25 * double(wei_kcyx(k, c, 0, 1)) +
+            0.25 * double(wei_kcyx(k, c, 0, 2)) + 0.25 * double(wei_kcyx(k, c, 1, 0)) -
+            0.25 * double(wei_kcyx(k, c, 1, 1)) + 0.25 * double(wei_kcyx(k, c, 1, 2)) +
+            0.25 * double(wei_kcyx(k, c, 2, 0)) - 0.25 * double(wei_kcyx(k, c, 2, 1)) +
+            0.25 * double(wei_kcyx(k, c, 2, 2));
+        wei_transform(k, c, 1, 3) = 0.5 * double(wei_kcyx(k, c, 0, 2)) +
+                                    0.5 * double(wei_kcyx(k, c, 1, 2)) +
+                                    0.5 * double(wei_kcyx(k, c, 2, 2));
+
+        wei_transform(k, c, 2, 0) = 0.5 * double(wei_kcyx(k, c, 0, 0)) -
+                                    0.5 * double(wei_kcyx(k, c, 1, 0)) +
+                                    0.5 * double(wei_kcyx(k, c, 2, 0));
+        wei_transform(k, c, 2, 1) =
+            0.25 * double(wei_kcyx(k, c, 0, 0)) + 0.25 * double(wei_kcyx(k, c, 0, 1)) +
+            0.25 * double(wei_kcyx(k, c, 0, 2)) - 0.25 * double(wei_kcyx(k, c, 1, 0)) -
+            0.25 * double(wei_kcyx(k, c, 1, 1)) - 0.25 * double(wei_kcyx(k, c, 1, 2)) +
+            0.25 * double(wei_kcyx(k, c, 2, 0)) + 0.25 * double(wei_kcyx(k, c, 2, 1)) +
+            0.25 * double(wei_kcyx(k, c, 2, 2));
+        wei_transform(k, c, 2, 2) =
+            0.25 * double(wei_kcyx(k, c, 0, 0)) - 0.25 * double(wei_kcyx(k, c, 0, 1)) +
+            0.25 * double(wei_kcyx(k, c, 0, 2)) - 0.25 * double(wei_kcyx(k, c, 1, 0)) +
+            0.25 * double(wei_kcyx(k, c, 1, 1)) - 0.25 * double(wei_kcyx(k, c, 1, 2)) +
+            0.25 * double(wei_kcyx(k, c, 2, 0)) - 0.25 * double(wei_kcyx(k, c, 2, 1)) +
+            0.25 * double(wei_kcyx(k, c, 2, 2));
+        wei_transform(k, c, 2, 3) = 0.5 * double(wei_kcyx(k, c, 0, 2)) -
+                                    0.5 * double(wei_kcyx(k, c, 1, 2)) +
+                                    0.5 * double(wei_kcyx(k, c, 2, 2));
+
+        wei_transform(k, c, 3, 0) = double(wei_kcyx(k, c, 2, 0));
+        wei_transform(k, c, 3, 1) = 0.5 * double(wei_kcyx(k, c, 2, 0)) +
+                                    0.5 * double(wei_kcyx(k, c, 2, 1)) +
+                                    0.5 * double(wei_kcyx(k, c, 2, 2));
+        wei_transform(k, c, 3, 2) = 0.5 * double(wei_kcyx(k, c, 2, 0)) -
+                                    0.5 * double(wei_kcyx(k, c, 2, 1)) +
+                                    0.5 * double(wei_kcyx(k, c, 2, 2));
+        wei_transform(k, c, 3, 3) = double(wei_kcyx(k, c, 2, 2));
+    };
+
+    auto f_out_transform = [&](auto n, auto k, auto htile, auto wtile) {
+        for(int j = 0; j < HiPerTile; ++j)
+        {
+            for(int i = 0; i < WiPerTile; ++i)
+            {
+                double v = 0;
+                for(int c = 0; c < C; ++c)
+                {
+                    v += in_transform(n, c, htile, wtile, j, i) * wei_transform(k, c, j, i);
+                }
+
+                out_transform(n, k, htile, wtile, j, i) = v;
+            }
+        }
+    };
+
+    auto f_out_hold = [&](auto n, auto k, auto htile, auto wtile) {
+        out_hold(n, k, htile, wtile, 0, 0) =
+            out_transform(n, k, htile, wtile, 0, 0) + out_transform(n, k, htile, wtile, 0, 1) +
+            out_transform(n, k, htile, wtile, 0, 2) + out_transform(n, k, htile, wtile, 1, 0) +
+            out_transform(n, k, htile, wtile, 1, 1) + out_transform(n, k, htile, wtile, 1, 2) +
+            out_transform(n, k, htile, wtile, 2, 0) + out_transform(n, k, htile, wtile, 2, 1) +
+            out_transform(n, k, htile, wtile, 2, 2);
+        out_hold(n, k, htile, wtile, 0, 1) =
+            out_transform(n, k, htile, wtile, 0, 1) - out_transform(n, k, htile, wtile, 0, 2) -
+            out_transform(n, k, htile, wtile, 0, 3) + out_transform(n, k, htile, wtile, 1, 1) -
+            out_transform(n, k, htile, wtile, 1, 2) - out_transform(n, k, htile, wtile, 1, 3) +
+            out_transform(n, k, htile, wtile, 2, 1) - out_transform(n, k, htile, wtile, 2, 2) -
+            out_transform(n, k, htile, wtile, 2, 3);
+        out_hold(n, k, htile, wtile, 1, 0) =
+            out_transform(n, k, htile, wtile, 1, 0) + out_transform(n, k, htile, wtile, 1, 1) +
+            out_transform(n, k, htile, wtile, 1, 2) - out_transform(n, k, htile, wtile, 2, 0) -
+            out_transform(n, k, htile, wtile, 2, 1) - out_transform(n, k, htile, wtile, 2, 2) -
+            out_transform(n, k, htile, wtile, 3, 0) - out_transform(n, k, htile, wtile, 3, 1) -
+            out_transform(n, k, htile, wtile, 3, 2);
+        out_hold(n, k, htile, wtile, 1, 1) =
+            out_transform(n, k, htile, wtile, 1, 1) - out_transform(n, k, htile, wtile, 1, 2) -
+            out_transform(n, k, htile, wtile, 1, 3) - out_transform(n, k, htile, wtile, 2, 1) +
+            out_transform(n, k, htile, wtile, 2, 2) + out_transform(n, k, htile, wtile, 2, 3) -
+            out_transform(n, k, htile, wtile, 3, 1) + out_transform(n, k, htile, wtile, 3, 2) +
+            out_transform(n, k, htile, wtile, 3, 3);
+    };
+
+    auto f_out = [&](auto n, auto k, auto htile, auto wtile) {
+        for(int j = 0; j < HoPerTile; ++j)
+        {
+            std::size_t ho = HoPerTile * htile + j;
+            for(int i = 0; i < WoPerTile; ++i)
+            {
+                std::size_t wo         = WoPerTile * wtile + i;
+                out_nkhw(n, k, ho, wo) = out_hold(n, k, htile, wtile, j, i);
+            }
+        }
+    };
+
+    std::size_t num_thread = std::thread::hardware_concurrency();
+
+    make_ParallelTensorFunctor(f_in_hold, N, C, HTile, WTile)(num_thread);
+    make_ParallelTensorFunctor(f_in_transform, N, C, HTile, WTile)(num_thread);
+    make_ParallelTensorFunctor(f_wei_transform, K, C)(num_thread);
+    make_ParallelTensorFunctor(f_out_transform, N, K, HTile, WTile)(num_thread);
+    make_ParallelTensorFunctor(f_out_hold, N, K, HTile, WTile)(num_thread);
+    make_ParallelTensorFunctor(f_out, N, K, HTile, WTile)(num_thread);
+}
--- a/host/host_tensor/include/host_conv_bwd_data.hpp
+++ b/host/host_tensor/include/host_conv_bwd_data.hpp
@@ -0,0 +1,143 @@
+#pragma once
+#include "host_tensor.hpp"
+
+template <typename TIn,
+          typename TWei,
+          typename TOut,
+          typename ConvStrides,
+          typename ConvDilations,
+          typename InLeftPads,
+          typename InRightPads>
+void host_direct_convolution_backward_data(Tensor<TIn>& in,
+                                           const Tensor<TWei>& wei,
+                                           const Tensor<TOut>& out,
+                                           const ConvStrides& conv_strides,
+                                           const ConvDilations& conv_dilations,
+                                           const InLeftPads& in_left_pads,
+                                           const InRightPads& in_right_pads,
+                                           const ConvTensorLayout layout = ConvTensorLayout::NCHW)
+{
+    using namespace ck;
+
+    constexpr auto I0 = Number<0>{};
+    constexpr auto I1 = Number<1>{};
+    constexpr auto I2 = Number<2>{};
+    constexpr auto I3 = Number<3>{};
+
+    auto f_nchw = [&](auto n, auto c, auto hi, auto wi) {
+        std::size_t N  = in.mDesc.GetLengths()[I0];
+        std::size_t C  = in.mDesc.GetLengths()[I1];
+        std::size_t Hi = in.mDesc.GetLengths()[I2];
+        std::size_t Wi = in.mDesc.GetLengths()[I3];
+
+        std::size_t K = wei.mDesc.GetLengths()[I0];
+        std::size_t Y = wei.mDesc.GetLengths()[I2];
+        std::size_t X = wei.mDesc.GetLengths()[I3];
+
+        std::size_t Ho = out.mDesc.GetLengths()[I2];
+        std::size_t Wo = out.mDesc.GetLengths()[I3];
+
+        double v = 0;
+
+        for(int y = 0; y < Y; ++y)
+        {
+            int h_tmp = hi + in_left_pads[I0] - y * conv_dilations[I0];
+
+            if(h_tmp % conv_strides[I0] == 0)
+            {
+                int ho = h_tmp / conv_strides[I0];
+
+                if(ho >= 0 && ho < Ho)
+                {
+                    for(int x = 0; x < X; ++x)
+                    {
+                        int w_tmp = wi + in_left_pads[I1] - x * conv_dilations[I1];
+
+                        if(w_tmp % conv_strides[I1] == 0)
+                        {
+                            int wo = w_tmp / conv_strides[I1];
+
+                            if(wo >= 0 && wo < Wo)
+                            {
+                                for(int k = 0; k < K; ++k)
+                                {
+                                    v += out(n, k, ho, wo) * wei(k, c, y, x);
+                                }
+                            }
+                        }
+                    }
+                }
+            }
+        }
+
+        in(n, c, hi, wi) = v;
+    };
+
+    auto f_nhwc = [&](auto n, auto hi, auto wi, auto c) {
+        std::size_t N  = in.mDesc.GetLengths()[I0];
+        std::size_t Hi = in.mDesc.GetLengths()[I1];
+        std::size_t Wi = in.mDesc.GetLengths()[I2];
+        std::size_t C  = in.mDesc.GetLengths()[I3];
+
+        std::size_t K = wei.mDesc.GetLengths()[I0];
+        std::size_t Y = wei.mDesc.GetLengths()[I1];
+        std::size_t X = wei.mDesc.GetLengths()[I2];
+
+        std::size_t Ho = out.mDesc.GetLengths()[I1];
+        std::size_t Wo = out.mDesc.GetLengths()[I2];
+
+        double v = 0;
+
+        for(int y = 0; y < Y; ++y)
+        {
+            int h_tmp = hi + in_left_pads[I0] - y * conv_dilations[I0];
+
+            if(h_tmp % conv_strides[I0] == 0)
+            {
+                int ho = h_tmp / conv_strides[I0];
+
+                if(ho >= 0 && ho < Ho)
+                {
+                    for(int x = 0; x < X; ++x)
+                    {
+                        int w_tmp = wi + in_left_pads[I1] - x * conv_dilations[I1];
+
+                        if(w_tmp % conv_strides[I1] == 0)
+                        {
+                            int wo = w_tmp / conv_strides[I1];
+
+                            if(wo >= 0 && wo < Wo)
+                            {
+                                for(int k = 0; k < K; ++k)
+                                {
+                                    v += out(n, ho, wo, k) * wei(k, y, x, c);
+                                }
+                            }
+                        }
+                    }
+                }
+            }
+        }
+
+        in(n, hi, wi, c) = v;
+    };
+
+    switch(layout)
+    {
+    case ConvTensorLayout::NCHW:
+        make_ParallelTensorFunctor(f_nchw,
+                                   in.mDesc.GetLengths()[0],
+                                   in.mDesc.GetLengths()[1],
+                                   in.mDesc.GetLengths()[2],
+                                   in.mDesc.GetLengths()[3])(std::thread::hardware_concurrency());
+        break;
+    case ConvTensorLayout::NHWC:
+        make_ParallelTensorFunctor(f_nhwc,
+                                   in.mDesc.GetLengths()[0],
+                                   in.mDesc.GetLengths()[1],
+                                   in.mDesc.GetLengths()[2],
+                                   in.mDesc.GetLengths()[3])(std::thread::hardware_concurrency());
+        break;
+    default: throw std::runtime_error("wrong! not supported layout");
+    }
+}
--- a/host/host_tensor/include/host_tensor.hpp
+++ b/host/host_tensor/include/host_tensor.hpp
@@ -0,0 +1,325 @@
+#ifndef HOST_TENSOR_HPP
+#define HOST_TENSOR_HPP
+
+#include <thread>
+#include <vector>
+#include <numeric>
+#include <algorithm>
+#include <utility>
+#include <cassert>
+#include <iostream>
+
+template <typename Range>
+std::ostream& LogRange(std::ostream& os, Range&& range, std::string delim)
+{
+    bool first = true;
+    for(auto&& v : range)
+    {
+        if(first)
+            first = false;
+        else
+            os << delim;
+        os << v;
+    }
+    return os;
+}
+
+template <typename T, typename Range>
+std::ostream& LogRangeAsType(std::ostream& os, Range&& range, std::string delim)
+{
+    bool first = true;
+    for(auto&& v : range)
+    {
+        if(first)
+            first = false;
+        else
+            os << delim;
+        os << T{v};
+    }
+    return os;
+}
+
+typedef enum {
+    Half  = 0,
+    Float = 1,
+} DataType_t;
+
+template <typename T>
+struct DataType;
+
+template <>
+struct DataType<float> : std::integral_constant<DataType_t, DataType_t::Float>
+{
+};
+
+template <typename F, typename T, std::size_t... Is>
+auto call_f_unpack_args_impl(F f, T args, std::index_sequence<Is...>)
+{
+    return f(std::get<Is>(args)...);
+}
+
+template <typename F, typename T>
+auto call_f_unpack_args(F f, T args)
+{
+    constexpr std::size_t N = std::tuple_size<T>{};
+
+    return call_f_unpack_args_impl(f, args, std::make_index_sequence<N>{});
+}
+
+template <typename F, typename T, std::size_t... Is>
+auto construct_f_unpack_args_impl(T args, std::index_sequence<Is...>)
+{
+    return F(std::get<Is>(args)...);
+}
+
+template <typename F, typename T>
+auto construct_f_unpack_args(F, T args)
+{
+    constexpr std::size_t N = std::tuple_size<T>{};
+
+    return construct_f_unpack_args_impl<F>(args, std::make_index_sequence<N>{});
+}
+
+struct HostTensorDescriptor
+{
+    HostTensorDescriptor() = delete;
+
+    template <typename X>
+    HostTensorDescriptor(std::vector<X> lens);
+
+    template <typename X, typename Y>
+    HostTensorDescriptor(std::vector<X> lens, std::vector<Y> strides);
+
+    void CalculateStrides();
+
+    template <typename Range>
+    HostTensorDescriptor(const Range& lens) : mLens(lens.begin(), lens.end())
+    {
+        this->CalculateStrides();
+    }
+
+    template <typename Range1, typename Range2>
+    HostTensorDescriptor(const Range1& lens, const Range2& strides)
+        : mLens(lens.begin(), lens.end()), mStrides(strides.begin(), strides.end())
+    {
+    }
+
+    std::size_t GetNumOfDimension() const;
+    std::size_t GetElementSize() const;
+    std::size_t GetElementSpace() const;
+
+    const std::vector<std::size_t>& GetLengths() const;
+    const std::vector<std::size_t>& GetStrides() const;
+
+    template <typename... Is>
+    std::size_t GetOffsetFromMultiIndex(Is... is) const
+    {
+        assert(sizeof...(Is) == this->GetNumOfDimension());
+        std::initializer_list<std::size_t> iss{static_cast<std::size_t>(is)...};
+        return std::inner_product(iss.begin(), iss.end(), mStrides.begin(), std::size_t{0});
+    }
+
+    private:
+    std::vector<std::size_t> mLens;
+    std::vector<std::size_t> mStrides;
+};
+
+struct joinable_thread : std::thread
+{
+    template <typename... Xs>
+    joinable_thread(Xs&&... xs) : std::thread(std::forward<Xs>(xs)...)
+    {
+    }
+
+    joinable_thread(joinable_thread&&) = default;
+    joinable_thread& operator=(joinable_thread&&) = default;
+
+    ~joinable_thread()
+    {
+        if(this->joinable())
+            this->join();
+    }
+};
+
+template <typename F, typename... Xs>
+struct ParallelTensorFunctor
+{
+    F mF;
+    static constexpr std::size_t NDIM = sizeof...(Xs);
+    std::array<std::size_t, NDIM> mLens;
+    std::array<std::size_t, NDIM> mStrides;
+    std::size_t mN1d;
+
+    ParallelTensorFunctor(F f, Xs... xs) : mF(f), mLens({static_cast<std::size_t>(xs)...})
+    {
+        mStrides.back() = 1;
+        std::partial_sum(mLens.rbegin(),
+                         mLens.rend() - 1,
+                         mStrides.rbegin() + 1,
+                         std::multiplies<std::size_t>());
+        mN1d = mStrides[0] * mLens[0];
+    }
+
+    std::array<std::size_t, NDIM> GetNdIndices(std::size_t i) const
+    {
+        std::array<std::size_t, NDIM> indices;
+
+        for(int idim = 0; idim < NDIM; ++idim)
+        {
+            indices[idim] = i / mStrides[idim];
+            i -= indices[idim] * mStrides[idim];
+        }
+
+        return indices;
+    }
+
+    void operator()(std::size_t num_thread = std::thread::hardware_concurrency()) const
+    {
+        std::size_t work_per_thread = (mN1d + num_thread - 1) / num_thread;
+
+        std::vector<joinable_thread> threads(num_thread);
+
+        for(std::size_t it = 0; it < num_thread; ++it)
+        {
+            std::size_t iw_begin = it * work_per_thread;
+            std::size_t iw_end   = std::min((it + 1) * work_per_thread, mN1d);
+
+            auto f = [=] {
+                for(std::size_t iw = iw_begin; iw < iw_end; ++iw)
+                {
+                    call_f_unpack_args(mF, GetNdIndices(iw));
+                }
+            };
+            threads[it] = joinable_thread(f);
+        }
+    }
+};
+
+template <typename F, typename... Xs>
+auto make_ParallelTensorFunctor(F f, Xs... xs)
+{
+    return ParallelTensorFunctor<F, Xs...>(f, xs...);
+}
+
+template <typename T>
+struct Tensor
+{
+    template <typename X>
+    Tensor(std::initializer_list<X> lens) : mDesc(lens), mData(mDesc.GetElementSpace())
+    {
+    }
+
+    template <typename X>
+    Tensor(std::vector<X> lens) : mDesc(lens), mData(mDesc.GetElementSpace())
+    {
+    }
+
+    template <typename X, typename Y>
+    Tensor(std::vector<X> lens, std::vector<Y> strides)
+        : mDesc(lens, strides), mData(mDesc.GetElementSpace())
+    {
+    }
+
+    Tensor(const HostTensorDescriptor& desc) : mDesc(desc), mData(mDesc.GetElementSpace()) {}
+
+    template <typename G>
+    void GenerateTensorValue(G g, std::size_t num_thread = 1)
+    {
+        switch(mDesc.GetNumOfDimension())
+        {
+        case 1:
+        {
+            auto f = [&](auto i) { (*this)(i) = g(i); };
+            make_ParallelTensorFunctor(f, mDesc.GetLengths()[0])(num_thread);
+            break;
+        }
+        case 2:
+        {
+            auto f = [&](auto i0, auto i1) { (*this)(i0, i1) = g(i0, i1); };
+            make_ParallelTensorFunctor(f, mDesc.GetLengths()[0], mDesc.GetLengths()[1])(num_thread);
+            break;
+        }
+        case 3:
+        {
+            auto f = [&](auto i0, auto i1, auto i2) { (*this)(i0, i1, i2) = g(i0, i1, i2); };
+            make_ParallelTensorFunctor(
+                f, mDesc.GetLengths()[0], mDesc.GetLengths()[1], mDesc.GetLengths()[2])(num_thread);
+            break;
+        }
+        case 4:
+        {
+            auto f = [&](auto i0, auto i1, auto i2, auto i3) {
+                (*this)(i0, i1, i2, i3) = g(i0, i1, i2, i3);
+            };
+            make_ParallelTensorFunctor(f,
+                                       mDesc.GetLengths()[0],
+                                       mDesc.GetLengths()[1],
+                                       mDesc.GetLengths()[2],
+                                       mDesc.GetLengths()[3])(num_thread);
+            break;
+        }
+        default: throw std::runtime_error("unspported dimension");
+        }
+    }
+
+    template <typename... Is>
+    T& operator()(Is... is)
+    {
+        return mData[mDesc.GetOffsetFromMultiIndex(is...)];
+    }
+
+    template <typename... Is>
+    const T& operator()(Is... is) const
+    {
+        return mData[mDesc.GetOffsetFromMultiIndex(is...)];
+    }
+
+    typename std::vector<T>::iterator begin() { return mData.begin(); }
+
+    typename std::vector<T>::iterator end() { return mData.end(); }
+
+    typename std::vector<T>::const_iterator begin() const { return mData.begin(); }
+
+    typename std::vector<T>::const_iterator end() const { return mData.end(); }
+
+    HostTensorDescriptor mDesc;
+    std::vector<T> mData;
+};
+
+template <typename X>
+HostTensorDescriptor::HostTensorDescriptor(std::vector<X> lens) : mLens(lens)
+{
+    this->CalculateStrides();
+}
+
+template <typename X, typename Y>
+HostTensorDescriptor::HostTensorDescriptor(std::vector<X> lens, std::vector<Y> strides)
+    : mLens(lens), mStrides(strides)
+{
+}
+
+void ostream_HostTensorDescriptor(const HostTensorDescriptor& desc, std::ostream& os = std::cout);
+
+template <typename T>
+void check_error(const Tensor<T>& ref, const Tensor<T>& result)
+{
+    float error     = 0;
+    float max_diff  = -1;
+    float ref_value = 0, result_value = 0;
+    for(int i = 0; i < ref.mData.size(); ++i)
+    {
+        error += std::abs(double(ref.mData[i]) - double(result.mData[i]));
+        float diff = std::abs(double(ref.mData[i]) - double(result.mData[i]));
+        if(max_diff < diff)
+        {
+            max_diff     = diff;
+            ref_value    = ref.mData[i];
+            result_value = result.mData[i];
+        }
+    }
+
+    std::cout << "error: " << error << std::endl;
+    std::cout << "max_diff: " << max_diff << ", " << ref_value << ", " << result_value << std::endl;
+}
+
+#endif
--- a/host/host_tensor/include/host_tensor_generator.hpp
+++ b/host/host_tensor/include/host_tensor_generator.hpp
@@ -0,0 +1,60 @@
+#ifndef HOST_TENSOR_GENERATOR_HPP
+#define HOST_TENSOR_GENERATOR_HPP
+
+#include <cmath>
+#include "config.hpp"
+
+struct GeneratorTensor_1
+{
+    int value = 1;
+
+    template <typename... Is>
+    float operator()(Is... is)
+    {
+        return value;
+    }
+};
+
+struct GeneratorTensor_2
+{
+    int min_value = 0;
+    int max_value = 1;
+
+    template <typename... Is>
+    float operator()(Is...)
+    {
+        return (std::rand() % (max_value - min_value)) + min_value;
+    }
+};
+
+template <typename T>
+struct GeneratorTensor_3
+{
+    T min_value = 0;
+    T max_value = 1;
+
+    template <typename... Is>
+    float operator()(Is...)
+    {
+        float tmp = float(std::rand()) / float(RAND_MAX);
+
+        return min_value + tmp * (max_value - min_value);
+    }
+};
+
+struct GeneratorTensor_Checkboard
+{
+    template <typename... Ts>
+    float operator()(Ts... Xs) const
+    {
+        std::array<ck::index_t, sizeof...(Ts)> dims = {{static_cast<ck::index_t>(Xs)...}};
+        return std::accumulate(dims.begin(),
+                               dims.end(),
+                               true,
+                               [](bool init, ck::index_t x) -> int { return init != (x % 2); })
+                   ? 1
+                   : -1;
+    }
+};
+
+#endif