Composable kernel init integration v3 (#1097)

* Squashed 'src/composable_kernel/' content from commit f6edda611

git-subtree-dir: src/composable_kernel
git-subtree-split: f6edda6119

* add solver ConvIgemmFwdV6r1DlopsNchwKcyxNkhw; rename static ck source files

* Squashed 'src/composable_kernel/' changes from f6edda611..5781adf5c

5781adf5c Update develop (#5) (#6)
97e6d514f Merge pull request #4 from ROCmSoftwarePlatform/separate_online_compile
7b1ec41e5 refactor
49c33aaea refactor
54b3e73d1 rename

git-subtree-dir: src/composable_kernel
git-subtree-split: 5781adf5cf

* fix

* refactor

* remove online compilation from CK

* refactor

* fix

* add ctest

* add c-style pointer cast

* vector/scalar pointer cast use c-style pointer cast instead of reinterpret_cast

* fix clang warning suppression

* tidy

* suppress cppcheck

* fix enum issue

* revert changes to hip build

* fix kernel filename

* update CK build script

* rename

* rename

* make inner product compatible on gfx900

* Update src/include/miopen/solver/ck_utility_common.hpp

Co-authored-by: JD <Jehandad.Khan@amd.com>

* compiler parameter use stream

* use int instead of index_t in kernel wrapper

* DynamicBuffer, StaticBuffer, amd_buffer_load support customized value for invalid element

* refactor

* refactor

* change cmakelist

* change ck common utility

* fix

Co-authored-by: JD <Jehandad.Khan@amd.com>
This commit is contained in:
Chao Liu
2021-08-19 10:55:03 -05:00
committed by GitHub
commit 6fe3627a9e
126 changed files with 30654 additions and 0 deletions

View File

@@ -0,0 +1,21 @@
# Build script for the host_tensor shared library (host-side tensor and device helpers).

# BEFORE prepends the local "include" directory to the include search path.
include_directories(BEFORE
include
)
# Source files compiled into the library.
set(HOST_TENSOR_SOURCE
src/host_tensor.cpp;
src/device.cpp;
)
## the library target
add_library(host_tensor SHARED ${HOST_TENSOR_SOURCE})
# HALF_INCLUDE_DIR is added as a SYSTEM include path, for the build tree only (BUILD_INTERFACE).
target_include_directories(host_tensor SYSTEM PUBLIC $<BUILD_INTERFACE:${HALF_INCLUDE_DIR}>)
# hip::device is used privately by the library; hip::host is propagated to consumers.
target_link_libraries(host_tensor PRIVATE hip::device)
target_link_libraries(host_tensor INTERFACE hip::host)
# NOTE(review): no feature names are listed here, so this call requests nothing — confirm intent.
target_compile_features(host_tensor PUBLIC)
set_target_properties(host_tensor PROPERTIES POSITION_INDEPENDENT_CODE ON)
# Only the LIBRARY artifact is installed, into <prefix>/lib.
install(TARGETS host_tensor LIBRARY DESTINATION lib)

View File

@@ -0,0 +1,86 @@
#ifndef CONV_COMMON_HPP
#define CONV_COMMON_HPP
#include "tensor_descriptor.hpp"
// Identifies the dimension ordering of a convolution tensor.
// Each enumerator spells the dimension order from slowest to fastest varying
// (presumably N = batch, C = channel, H/W = spatial — TODO confirm).
// NOTE(review): the trailing lowercase 'c' variants look like channel-blocked layouts — confirm.
enum ConvTensorLayout
{
NCHW,
NHWC,
CHWN,
NCHWc,
NHWCc
};
// Builds the packed 4-d descriptor (N, K, Ho, Wo) of a convolution output.
//
// in_desc        input descriptor; lengths 0, 2, 3 are read as N, Hi, Wi
// wei_desc       weight descriptor; lengths 0, 2, 3 are read as K, Y, X
// conv_strides   indexable by I0/I1: stride along H and W
// conv_dilations indexable by I0/I1: dilation along H and W
// left_pads      indexable by I0/I1: padding before the first row/column
// right_pads     indexable by I0/I1: padding after the last row/column
//
// The output extent per spatial dimension is
//     (in + left_pad + right_pad - effective_filter) / stride + 1
// with effective_filter = (filter - 1) * dilation + 1.
//
// conv_dilations is taken by const reference like every other argument
// (it was the only one passed by value).
template <typename... InDesc,
          typename... WeiDesc,
          typename ConvStrides,
          typename ConvDilations,
          typename LeftPads,
          typename RightPads>
constexpr auto get_convolution_output_default_4d_tensor_descriptor(
    const ck::TensorDescriptor<InDesc...>& in_desc,
    const ck::TensorDescriptor<WeiDesc...>& wei_desc,
    const ConvStrides& conv_strides,
    const ConvDilations& conv_dilations,
    const LeftPads& left_pads,
    const RightPads& right_pads)
{
    using namespace ck;

    constexpr auto I0 = Number<0>{};
    constexpr auto I1 = Number<1>{};
    constexpr auto I2 = Number<2>{};
    constexpr auto I3 = Number<3>{};

    // Both tensors must be 4-d and agree on dimension 1 (the shared channel count).
    assert(in_desc.GetNumOfDimension() == 4);
    assert(wei_desc.GetNumOfDimension() == 4);
    assert(in_desc.GetLength(I1) == wei_desc.GetLength(I1));

    const auto N  = in_desc.GetLength(I0);
    const auto Hi = in_desc.GetLength(I2);
    const auto Wi = in_desc.GetLength(I3);

    const auto K = wei_desc.GetLength(I0);
    const auto Y = wei_desc.GetLength(I2);
    const auto X = wei_desc.GetLength(I3);

    const auto LeftPadH = left_pads[I0];
    const auto LeftPadW = left_pads[I1];

    const auto RightPadH = right_pads[I0];
    const auto RightPadW = right_pads[I1];

    // Filter footprint once dilation is taken into account.
    const auto YEff = (Y - I1) * conv_dilations[I0] + I1;
    const auto XEff = (X - I1) * conv_dilations[I1] + I1;

    const auto Ho = (Hi + LeftPadH + RightPadH - YEff) / conv_strides[I0] + I1;
    const auto Wo = (Wi + LeftPadW + RightPadW - XEff) / conv_strides[I1] + I1;

    return make_naive_tensor_descriptor_packed(make_tuple(N, K, Ho, Wo));
}
template <class InDesc, class WeiDesc, class OutDesc>
constexpr std::size_t
calculate_convolution_flops(const InDesc&, const WeiDesc& wei_desc, const OutDesc& out_desc)
{
using namespace ck;
constexpr auto I0 = Number<0>{};
constexpr auto I1 = Number<1>{};
constexpr auto I2 = Number<2>{};
constexpr auto I3 = Number<3>{};
const index_t N = out_desc.GetLength(I0);
const index_t K = out_desc.GetLength(I1);
const index_t Ho = out_desc.GetLength(I2);
const index_t Wo = out_desc.GetLength(I3);
const index_t C = wei_desc.GetLength(I1);
const index_t Y = wei_desc.GetLength(I2);
const index_t X = wei_desc.GetLength(I3);
return std::size_t(2) * N * K * Ho * Wo * C * Y * X;
}
#endif

View File

@@ -0,0 +1,80 @@
#ifndef DEVICE_HPP
#define DEVICE_HPP
#include <memory>
#include "hip/hip_runtime.h"
#include "hip/hip_fp16.h"
// Owns one device buffer of mMemSize bytes.
// The destructor is user-declared and the class holds a raw buffer pointer, so
// copying is disabled: an implicit copy would leave two objects releasing the
// same pointer.
struct DeviceMem
{
    DeviceMem() = delete;
    // Creates a buffer handle for mem_size bytes.
    DeviceMem(std::size_t mem_size);
    DeviceMem(const DeviceMem&) = delete;
    DeviceMem& operator=(const DeviceMem&) = delete;
    // Returns the raw device pointer (still owned by this object).
    void* GetDeviceBuffer();
    // Copies from host memory p into the device buffer.
    void ToDevice(const void* p);
    // Copies from the device buffer into host memory p.
    void FromDevice(void* p);
    ~DeviceMem();

    void* mpDeviceBuf;    // device allocation owned by this object
    std::size_t mMemSize; // size of the allocation in bytes
};
// Defined in the implementation file; only referenced through a pointer here.
struct KernelTimerImpl;
// Timer for measuring elapsed time between Start() and End().
// All work is delegated to the KernelTimerImpl held in `impl`.
struct KernelTimer
{
KernelTimer();
// Declared here and defined out of line because KernelTimerImpl is incomplete in this header.
~KernelTimer();
// Marks the beginning of the timed region.
void Start();
// Marks the end of the timed region.
void End();
// Returns the time measured between the last Start() and End().
// NOTE(review): presumably milliseconds, as reported by hipEventElapsedTime — confirm.
float GetElapsedTime() const;
std::unique_ptr<KernelTimerImpl> impl;
};
// Alias for the HIP stream handle type.
using device_stream_t = hipStream_t;
template <typename... Args, typename F>
void launch_kernel(F kernel, dim3 grid_dim, dim3 block_dim, std::size_t lds_byte, Args... args)
{
hipStream_t stream_id = nullptr;
hipLaunchKernelGGL(kernel, grid_dim, block_dim, lds_byte, stream_id, args...);
}
// Launches `kernel` once as a warm-up, then `nrepeat` more times between
// timer.Start() and timer.End(), and returns the elapsed time divided by nrepeat.
//
// kernel     kernel function to launch
// nrepeat    number of timed launches; the result is divided by this value,
//            so callers should pass a positive count
// grid_dim   grid dimensions
// block_dim  block dimensions
// lds_byte   value passed as the launch's dynamic shared-memory size
// args       arguments forwarded to every launch (taken by value)
//
// All launches go to the null stream; launch results are not checked.
// The dim3 fields are unsigned, so they are printed with %u (they were
// previously passed to %d, a format/argument mismatch).
template <typename... Args, typename F>
float launch_and_time_kernel(
    F kernel, int nrepeat, dim3 grid_dim, dim3 block_dim, std::size_t lds_byte, Args... args)
{
    KernelTimer timer;

    printf("%s: grid_dim {%u, %u, %u}, block_dim {%u, %u, %u} \n",
           __func__,
           grid_dim.x,
           grid_dim.y,
           grid_dim.z,
           block_dim.x,
           block_dim.y,
           block_dim.z);

    printf("Warm up\n");

    hipStream_t stream_id = nullptr;

    // warm up: one untimed launch before measuring
    hipLaunchKernelGGL(kernel, grid_dim, block_dim, lds_byte, stream_id, args...);

    printf("Start running %d times...\n", nrepeat);

    timer.Start();

    for(int i = 0; i < nrepeat; ++i)
    {
        hipLaunchKernelGGL(kernel, grid_dim, block_dim, lds_byte, stream_id, args...);
    }

    timer.End();

    // Average over the timed launches.
    return timer.GetElapsedTime() / nrepeat;
}
#endif

View File

@@ -0,0 +1,9 @@
#pragma once
#include "host_tensor.hpp"
#include "common_header.hpp"
// Prints a tensor descriptor type to `os` (std::cout by default).
// The argument value itself is unused: a default-constructed TensorDesc is
// converted with make_HostTensorDescriptor and printed with
// ostream_HostTensorDescriptor.
template <typename TensorDesc>
void ostream_tensor_descriptor(TensorDesc, std::ostream& os = std::cout)
{
    const TensorDesc device_desc{};
    ostream_HostTensorDescriptor(make_HostTensorDescriptor(device_desc), os);
}

View File

@@ -0,0 +1,324 @@
#pragma once
#include "host_tensor.hpp"
// Host reference implementation of a direct 2-d forward convolution.
//
// in, wei         input and weight tensors, indexed according to `layout`
// out             output tensor; every element is overwritten
// conv_strides    indexable by I0/I1: stride along H and W
// conv_dilations  indexable by I0/I1: dilation along H and W
// in_left_pads    indexable by I0/I1: padding before the first row/column
// (unnamed)       right padding; not needed because out-of-range taps are skipped
// layout          NCHW (default) or NHWC; anything else throws std::runtime_error
//
// Each output element is accumulated in double and computed independently;
// the work is spread over std::thread::hardware_concurrency() threads.
template <typename TIn,
          typename TWei,
          typename TOut,
          typename ConvStrides,
          typename ConvDilations,
          typename InLeftPads,
          typename InRightPads>
void host_direct_convolution(const Tensor<TIn>& in,
                             const Tensor<TWei>& wei,
                             Tensor<TOut>& out,
                             const ConvStrides& conv_strides,
                             const ConvDilations& conv_dilations,
                             const InLeftPads& in_left_pads,
                             const InRightPads&,
                             const ConvTensorLayout layout = ConvTensorLayout::NCHW)
{
    using namespace ck;

    constexpr auto I0 = Number<0>{};
    constexpr auto I1 = Number<1>{};

    const auto& in_lens  = in.mDesc.GetLengths();
    const auto& wei_lens = wei.mDesc.GetLengths();
    const auto& out_lens = out.mDesc.GetLengths();

    // NCHW input / KCYX weight / NKHW output.
    auto conv_nchw = [&](auto n, auto k, auto ho, auto wo) {
        double acc = 0;

        for(int c = 0; c < wei_lens[1]; ++c)
        {
            for(int y = 0; y < wei_lens[2]; ++y)
            {
                int hi = ho * conv_strides[I0] + y * conv_dilations[I0] - in_left_pads[I0];

                for(int x = 0; x < wei_lens[3]; ++x)
                {
                    int wi = wo * conv_strides[I1] + x * conv_dilations[I1] - in_left_pads[I1];

                    // Taps that fall into the padding contribute nothing.
                    const bool inside =
                        hi >= 0 && hi < in_lens[2] && wi >= 0 && wi < in_lens[3];
                    if(!inside)
                    {
                        continue;
                    }

                    acc += static_cast<double>(in(n, c, hi, wi)) *
                           static_cast<double>(wei(k, c, y, x));
                }
            }
        }

        out(n, k, ho, wo) = acc;
    };

    // NHWC input / KYXC weight / NHWK output.
    auto conv_nhwc = [&](auto n, auto ho, auto wo, auto k) {
        double acc = 0;

        for(int c = 0; c < wei_lens[3]; ++c)
        {
            for(int y = 0; y < wei_lens[1]; ++y)
            {
                int hi = ho * conv_strides[I0] + y * conv_dilations[I0] - in_left_pads[I0];

                for(int x = 0; x < wei_lens[2]; ++x)
                {
                    int wi = wo * conv_strides[I1] + x * conv_dilations[I1] - in_left_pads[I1];

                    const bool inside =
                        hi >= 0 && hi < in_lens[1] && wi >= 0 && wi < in_lens[2];
                    if(!inside)
                    {
                        continue;
                    }

                    acc += static_cast<double>(in(n, hi, wi, c)) *
                           static_cast<double>(wei(k, y, x, c));
                }
            }
        }

        out(n, ho, wo, k) = acc;
    };

    switch(layout)
    {
    case ConvTensorLayout::NCHW:
        make_ParallelTensorFunctor(conv_nchw, out_lens[0], out_lens[1], out_lens[2], out_lens[3])(
            std::thread::hardware_concurrency());
        break;
    case ConvTensorLayout::NHWC:
        make_ParallelTensorFunctor(conv_nhwc, out_lens[0], out_lens[1], out_lens[2], out_lens[3])(
            std::thread::hardware_concurrency());
        break;
    default: throw std::runtime_error("wrong! not supported layout");
    }
}
// Host reference implementation of a 3x3 convolution using hard-coded 4x4
// input/weight transforms and 2x2 output tiles.
//
// in_nchw      input tensor, indexed (n, c, h, w)
// wei_kcyx     weight tensor, indexed (k, c, y, x); must be 3x3 in y/x
// out_nkhw     output tensor, indexed (n, k, h, w); every element is overwritten
// InLeftPads   compile-time left paddings; only its type is used
// InRightPads  unused; out-of-range input positions are read as zero
//
// Throws std::runtime_error if the filter is not 3x3, because the transforms
// below index tile positions 0..3 and filter positions 0..2 directly.
// The function takes no stride or dilation arguments.
//
// Stages, each run over all tiles in parallel:
//   1. f_in_hold        gather each 4x4 input tile (zero outside the input)
//   2. f_in_transform   input transform
//   3. f_wei_transform  weight transform
//   4. f_out_transform  element-wise product summed over C
//   5. f_out_hold       output transform to a 2x2 tile
//   6. f_out            scatter tiles into out_nkhw
template <typename TIn, typename TWei, typename TOut, typename InLeftPads, typename InRightPads>
void host_winograd_3x3_convolution(const Tensor<TIn>& in_nchw,
                                   const Tensor<TWei>& wei_kcyx,
                                   Tensor<TOut>& out_nkhw,
                                   InLeftPads,
                                   InRightPads)
{
    using namespace ck;

    constexpr std::size_t HoPerTile = 2;
    constexpr std::size_t WoPerTile = 2;

    std::size_t N = in_nchw.mDesc.GetLengths()[0];
    std::size_t C = in_nchw.mDesc.GetLengths()[1];

    std::size_t K = wei_kcyx.mDesc.GetLengths()[0];
    std::size_t Y = wei_kcyx.mDesc.GetLengths()[2];
    std::size_t X = wei_kcyx.mDesc.GetLengths()[3];

    // The transforms are written out for a 3x3 filter only.
    if(Y != 3 || X != 3)
    {
        throw std::runtime_error("wrong! host_winograd_3x3_convolution only supports 3x3 filter");
    }

    std::size_t Ho = out_nkhw.mDesc.GetLengths()[2];
    std::size_t Wo = out_nkhw.mDesc.GetLengths()[3];

    index_t h_pad_low = InLeftPads{}.Get(Number<0>{});
    index_t w_pad_low = InLeftPads{}.Get(Number<1>{});

    std::size_t HiPerTile = HoPerTile + Y - 1;
    std::size_t WiPerTile = WoPerTile + X - 1;

    // Round up so partially covered tiles at the bottom/right edges are included.
    std::size_t HTile = (Ho + HoPerTile - 1) / HoPerTile;
    std::size_t WTile = (Wo + WoPerTile - 1) / WoPerTile;

    Tensor<double> in_hold({N, C, HTile, WTile, HiPerTile, WiPerTile});
    Tensor<double> in_transform({N, C, HTile, WTile, HiPerTile, WiPerTile});
    Tensor<double> wei_transform({K, C, HiPerTile, WiPerTile});
    // Last extent is the tile width (was HiPerTile; both are 4 for a 3x3 filter).
    Tensor<double> out_transform({N, K, HTile, WTile, HiPerTile, WiPerTile});
    Tensor<double> out_hold({N, K, HTile, WTile, HoPerTile, WoPerTile});

    // Stage 1: copy each input tile, substituting zero outside the input.
    auto f_in_hold = [&](auto n, auto c, auto htile, auto wtile) {
        for(std::size_t j = 0; j < HiPerTile; ++j)
        {
            int hi = HoPerTile * htile + j - h_pad_low;
            for(std::size_t i = 0; i < WiPerTile; ++i)
            {
                int wi = WoPerTile * wtile + i - w_pad_low;

                if(hi >= 0 && hi < in_nchw.mDesc.GetLengths()[2] && wi >= 0 &&
                   wi < in_nchw.mDesc.GetLengths()[3])
                {
                    in_hold(n, c, htile, wtile, j, i) = in_nchw(n, c, hi, wi);
                }
                else
                {
                    in_hold(n, c, htile, wtile, j, i) = TIn(0);
                }
            }
        }
    };

    // Stage 2: input transform of each 4x4 tile.
    auto f_in_transform = [&](auto n, auto c, auto htile, auto wtile) {
        in_transform(n, c, htile, wtile, 0, 0) =
            in_hold(n, c, htile, wtile, 0, 0) - in_hold(n, c, htile, wtile, 0, 2) -
            in_hold(n, c, htile, wtile, 2, 0) + in_hold(n, c, htile, wtile, 2, 2);
        in_transform(n, c, htile, wtile, 0, 1) =
            in_hold(n, c, htile, wtile, 0, 1) + in_hold(n, c, htile, wtile, 0, 2) -
            in_hold(n, c, htile, wtile, 2, 1) - in_hold(n, c, htile, wtile, 2, 2);
        in_transform(n, c, htile, wtile, 0, 2) =
            -in_hold(n, c, htile, wtile, 0, 1) + in_hold(n, c, htile, wtile, 0, 2) +
            in_hold(n, c, htile, wtile, 2, 1) - in_hold(n, c, htile, wtile, 2, 2);
        in_transform(n, c, htile, wtile, 0, 3) =
            in_hold(n, c, htile, wtile, 0, 1) - in_hold(n, c, htile, wtile, 0, 3) -
            in_hold(n, c, htile, wtile, 2, 1) + in_hold(n, c, htile, wtile, 2, 3);

        in_transform(n, c, htile, wtile, 1, 0) =
            in_hold(n, c, htile, wtile, 1, 0) - in_hold(n, c, htile, wtile, 1, 2) +
            in_hold(n, c, htile, wtile, 2, 0) - in_hold(n, c, htile, wtile, 2, 2);
        in_transform(n, c, htile, wtile, 1, 1) =
            in_hold(n, c, htile, wtile, 1, 1) + in_hold(n, c, htile, wtile, 1, 2) +
            in_hold(n, c, htile, wtile, 2, 1) + in_hold(n, c, htile, wtile, 2, 2);
        in_transform(n, c, htile, wtile, 1, 2) =
            -in_hold(n, c, htile, wtile, 1, 1) + in_hold(n, c, htile, wtile, 1, 2) -
            in_hold(n, c, htile, wtile, 2, 1) + in_hold(n, c, htile, wtile, 2, 2);
        in_transform(n, c, htile, wtile, 1, 3) =
            in_hold(n, c, htile, wtile, 1, 1) - in_hold(n, c, htile, wtile, 1, 3) +
            in_hold(n, c, htile, wtile, 2, 1) - in_hold(n, c, htile, wtile, 2, 3);

        in_transform(n, c, htile, wtile, 2, 0) =
            -in_hold(n, c, htile, wtile, 1, 0) + in_hold(n, c, htile, wtile, 1, 2) +
            in_hold(n, c, htile, wtile, 2, 0) - in_hold(n, c, htile, wtile, 2, 2);
        in_transform(n, c, htile, wtile, 2, 1) =
            -in_hold(n, c, htile, wtile, 1, 1) - in_hold(n, c, htile, wtile, 1, 2) +
            in_hold(n, c, htile, wtile, 2, 1) + in_hold(n, c, htile, wtile, 2, 2);
        in_transform(n, c, htile, wtile, 2, 2) =
            in_hold(n, c, htile, wtile, 1, 1) - in_hold(n, c, htile, wtile, 1, 2) -
            in_hold(n, c, htile, wtile, 2, 1) + in_hold(n, c, htile, wtile, 2, 2);
        in_transform(n, c, htile, wtile, 2, 3) =
            -in_hold(n, c, htile, wtile, 1, 1) + in_hold(n, c, htile, wtile, 1, 3) +
            in_hold(n, c, htile, wtile, 2, 1) - in_hold(n, c, htile, wtile, 2, 3);

        in_transform(n, c, htile, wtile, 3, 0) =
            in_hold(n, c, htile, wtile, 1, 0) - in_hold(n, c, htile, wtile, 1, 2) -
            in_hold(n, c, htile, wtile, 3, 0) + in_hold(n, c, htile, wtile, 3, 2);
        in_transform(n, c, htile, wtile, 3, 1) =
            in_hold(n, c, htile, wtile, 1, 1) + in_hold(n, c, htile, wtile, 1, 2) -
            in_hold(n, c, htile, wtile, 3, 1) - in_hold(n, c, htile, wtile, 3, 2);
        in_transform(n, c, htile, wtile, 3, 2) =
            -in_hold(n, c, htile, wtile, 1, 1) + in_hold(n, c, htile, wtile, 1, 2) +
            in_hold(n, c, htile, wtile, 3, 1) - in_hold(n, c, htile, wtile, 3, 2);
        in_transform(n, c, htile, wtile, 3, 3) =
            in_hold(n, c, htile, wtile, 1, 1) - in_hold(n, c, htile, wtile, 1, 3) -
            in_hold(n, c, htile, wtile, 3, 1) + in_hold(n, c, htile, wtile, 3, 3);
    };

    // Stage 3: weight transform of each 3x3 filter into a 4x4 tile.
    auto f_wei_transform = [&](auto k, auto c) {
        wei_transform(k, c, 0, 0) = double(wei_kcyx(k, c, 0, 0));
        wei_transform(k, c, 0, 1) = 0.5 * double(wei_kcyx(k, c, 0, 0)) +
                                    0.5 * double(wei_kcyx(k, c, 0, 1)) +
                                    0.5 * double(wei_kcyx(k, c, 0, 2));
        wei_transform(k, c, 0, 2) = 0.5 * double(wei_kcyx(k, c, 0, 0)) -
                                    0.5 * double(wei_kcyx(k, c, 0, 1)) +
                                    0.5 * double(wei_kcyx(k, c, 0, 2));
        wei_transform(k, c, 0, 3) = double(wei_kcyx(k, c, 0, 2));

        wei_transform(k, c, 1, 0) = 0.5 * double(wei_kcyx(k, c, 0, 0)) +
                                    0.5 * double(wei_kcyx(k, c, 1, 0)) +
                                    0.5 * double(wei_kcyx(k, c, 2, 0));
        wei_transform(k, c, 1, 1) =
            0.25 * double(wei_kcyx(k, c, 0, 0)) + 0.25 * double(wei_kcyx(k, c, 0, 1)) +
            0.25 * double(wei_kcyx(k, c, 0, 2)) + 0.25 * double(wei_kcyx(k, c, 1, 0)) +
            0.25 * double(wei_kcyx(k, c, 1, 1)) + 0.25 * double(wei_kcyx(k, c, 1, 2)) +
            0.25 * double(wei_kcyx(k, c, 2, 0)) + 0.25 * double(wei_kcyx(k, c, 2, 1)) +
            0.25 * double(wei_kcyx(k, c, 2, 2));
        wei_transform(k, c, 1, 2) =
            0.25 * double(wei_kcyx(k, c, 0, 0)) - 0.25 * double(wei_kcyx(k, c, 0, 1)) +
            0.25 * double(wei_kcyx(k, c, 0, 2)) + 0.25 * double(wei_kcyx(k, c, 1, 0)) -
            0.25 * double(wei_kcyx(k, c, 1, 1)) + 0.25 * double(wei_kcyx(k, c, 1, 2)) +
            0.25 * double(wei_kcyx(k, c, 2, 0)) - 0.25 * double(wei_kcyx(k, c, 2, 1)) +
            0.25 * double(wei_kcyx(k, c, 2, 2));
        wei_transform(k, c, 1, 3) = 0.5 * double(wei_kcyx(k, c, 0, 2)) +
                                    0.5 * double(wei_kcyx(k, c, 1, 2)) +
                                    0.5 * double(wei_kcyx(k, c, 2, 2));

        wei_transform(k, c, 2, 0) = 0.5 * double(wei_kcyx(k, c, 0, 0)) -
                                    0.5 * double(wei_kcyx(k, c, 1, 0)) +
                                    0.5 * double(wei_kcyx(k, c, 2, 0));
        wei_transform(k, c, 2, 1) =
            0.25 * double(wei_kcyx(k, c, 0, 0)) + 0.25 * double(wei_kcyx(k, c, 0, 1)) +
            0.25 * double(wei_kcyx(k, c, 0, 2)) - 0.25 * double(wei_kcyx(k, c, 1, 0)) -
            0.25 * double(wei_kcyx(k, c, 1, 1)) - 0.25 * double(wei_kcyx(k, c, 1, 2)) +
            0.25 * double(wei_kcyx(k, c, 2, 0)) + 0.25 * double(wei_kcyx(k, c, 2, 1)) +
            0.25 * double(wei_kcyx(k, c, 2, 2));
        wei_transform(k, c, 2, 2) =
            0.25 * double(wei_kcyx(k, c, 0, 0)) - 0.25 * double(wei_kcyx(k, c, 0, 1)) +
            0.25 * double(wei_kcyx(k, c, 0, 2)) - 0.25 * double(wei_kcyx(k, c, 1, 0)) +
            0.25 * double(wei_kcyx(k, c, 1, 1)) - 0.25 * double(wei_kcyx(k, c, 1, 2)) +
            0.25 * double(wei_kcyx(k, c, 2, 0)) - 0.25 * double(wei_kcyx(k, c, 2, 1)) +
            0.25 * double(wei_kcyx(k, c, 2, 2));
        wei_transform(k, c, 2, 3) = 0.5 * double(wei_kcyx(k, c, 0, 2)) -
                                    0.5 * double(wei_kcyx(k, c, 1, 2)) +
                                    0.5 * double(wei_kcyx(k, c, 2, 2));

        wei_transform(k, c, 3, 0) = double(wei_kcyx(k, c, 2, 0));
        wei_transform(k, c, 3, 1) = 0.5 * double(wei_kcyx(k, c, 2, 0)) +
                                    0.5 * double(wei_kcyx(k, c, 2, 1)) +
                                    0.5 * double(wei_kcyx(k, c, 2, 2));
        wei_transform(k, c, 3, 2) = 0.5 * double(wei_kcyx(k, c, 2, 0)) -
                                    0.5 * double(wei_kcyx(k, c, 2, 1)) +
                                    0.5 * double(wei_kcyx(k, c, 2, 2));
        wei_transform(k, c, 3, 3) = double(wei_kcyx(k, c, 2, 2));
    };

    // Stage 4: element-wise product in the transformed domain, reduced over C.
    auto f_out_transform = [&](auto n, auto k, auto htile, auto wtile) {
        for(std::size_t j = 0; j < HiPerTile; ++j)
        {
            for(std::size_t i = 0; i < WiPerTile; ++i)
            {
                double v = 0;
                for(std::size_t c = 0; c < C; ++c)
                {
                    v += in_transform(n, c, htile, wtile, j, i) * wei_transform(k, c, j, i);
                }

                out_transform(n, k, htile, wtile, j, i) = v;
            }
        }
    };

    // Stage 5: output transform from a 4x4 tile to the 2x2 result tile.
    auto f_out_hold = [&](auto n, auto k, auto htile, auto wtile) {
        out_hold(n, k, htile, wtile, 0, 0) =
            out_transform(n, k, htile, wtile, 0, 0) + out_transform(n, k, htile, wtile, 0, 1) +
            out_transform(n, k, htile, wtile, 0, 2) + out_transform(n, k, htile, wtile, 1, 0) +
            out_transform(n, k, htile, wtile, 1, 1) + out_transform(n, k, htile, wtile, 1, 2) +
            out_transform(n, k, htile, wtile, 2, 0) + out_transform(n, k, htile, wtile, 2, 1) +
            out_transform(n, k, htile, wtile, 2, 2);
        out_hold(n, k, htile, wtile, 0, 1) =
            out_transform(n, k, htile, wtile, 0, 1) - out_transform(n, k, htile, wtile, 0, 2) -
            out_transform(n, k, htile, wtile, 0, 3) + out_transform(n, k, htile, wtile, 1, 1) -
            out_transform(n, k, htile, wtile, 1, 2) - out_transform(n, k, htile, wtile, 1, 3) +
            out_transform(n, k, htile, wtile, 2, 1) - out_transform(n, k, htile, wtile, 2, 2) -
            out_transform(n, k, htile, wtile, 2, 3);
        out_hold(n, k, htile, wtile, 1, 0) =
            out_transform(n, k, htile, wtile, 1, 0) + out_transform(n, k, htile, wtile, 1, 1) +
            out_transform(n, k, htile, wtile, 1, 2) - out_transform(n, k, htile, wtile, 2, 0) -
            out_transform(n, k, htile, wtile, 2, 1) - out_transform(n, k, htile, wtile, 2, 2) -
            out_transform(n, k, htile, wtile, 3, 0) - out_transform(n, k, htile, wtile, 3, 1) -
            out_transform(n, k, htile, wtile, 3, 2);
        out_hold(n, k, htile, wtile, 1, 1) =
            out_transform(n, k, htile, wtile, 1, 1) - out_transform(n, k, htile, wtile, 1, 2) -
            out_transform(n, k, htile, wtile, 1, 3) - out_transform(n, k, htile, wtile, 2, 1) +
            out_transform(n, k, htile, wtile, 2, 2) + out_transform(n, k, htile, wtile, 2, 3) -
            out_transform(n, k, htile, wtile, 3, 1) + out_transform(n, k, htile, wtile, 3, 2) +
            out_transform(n, k, htile, wtile, 3, 3);
    };

    // Stage 6: scatter the 2x2 tiles into the output tensor.
    auto f_out = [&](auto n, auto k, auto htile, auto wtile) {
        for(std::size_t j = 0; j < HoPerTile; ++j)
        {
            std::size_t ho = HoPerTile * htile + j;
            for(std::size_t i = 0; i < WoPerTile; ++i)
            {
                std::size_t wo = WoPerTile * wtile + i;

                // HTile/WTile are rounded up, so the last tile may extend past
                // the output when Ho or Wo is odd; skip those positions.
                if(ho < Ho && wo < Wo)
                {
                    out_nkhw(n, k, ho, wo) = out_hold(n, k, htile, wtile, j, i);
                }
            }
        }
    };

    std::size_t num_thread = std::thread::hardware_concurrency();

    make_ParallelTensorFunctor(f_in_hold, N, C, HTile, WTile)(num_thread);
    make_ParallelTensorFunctor(f_in_transform, N, C, HTile, WTile)(num_thread);
    make_ParallelTensorFunctor(f_wei_transform, K, C)(num_thread);
    make_ParallelTensorFunctor(f_out_transform, N, K, HTile, WTile)(num_thread);
    make_ParallelTensorFunctor(f_out_hold, N, K, HTile, WTile)(num_thread);
    make_ParallelTensorFunctor(f_out, N, K, HTile, WTile)(num_thread);
}

View File

@@ -0,0 +1,135 @@
#pragma once
#include "host_tensor.hpp"
// Host reference implementation of the backward-data pass of a 2-d convolution:
// recomputes every element of `in` from `out` and `wei`.
//
// in              input-gradient tensor; every element is overwritten
// wei, out        weight and output(-gradient) tensors, indexed according to `layout`
// conv_strides    indexable by I0/I1: stride along H and W
// conv_dilations  indexable by I0/I1: dilation along H and W
// in_left_pads    indexable by I0/I1: padding before the first row/column
// (unnamed)       right padding; unused
// layout          NCHW (default) or NHWC; anything else throws std::runtime_error
//
// For each input position, only filter taps that map onto an output position
// (offset divisible by the stride and inside the output extent) contribute.
// Elements are computed independently across
// std::thread::hardware_concurrency() threads.
template <typename TIn,
          typename TWei,
          typename TOut,
          typename ConvStrides,
          typename ConvDilations,
          typename InLeftPads,
          typename InRightPads>
void host_direct_convolution_backward_data(Tensor<TIn>& in,
                                           const Tensor<TWei>& wei,
                                           const Tensor<TOut>& out,
                                           const ConvStrides& conv_strides,
                                           const ConvDilations& conv_dilations,
                                           const InLeftPads& in_left_pads,
                                           const InRightPads& /* in_right_pads */,
                                           const ConvTensorLayout layout = ConvTensorLayout::NCHW)
{
    using namespace ck;

    constexpr auto I0 = Number<0>{};
    constexpr auto I1 = Number<1>{};
    constexpr auto I2 = Number<2>{};
    constexpr auto I3 = Number<3>{};

    // NCHW input / KCYX weight / NKHW output.
    auto grad_nchw = [&](auto n, auto c, auto hi, auto wi) {
        std::size_t num_k  = wei.mDesc.GetLengths()[I0];
        std::size_t num_y  = wei.mDesc.GetLengths()[I2];
        std::size_t num_x  = wei.mDesc.GetLengths()[I3];
        std::size_t out_h  = out.mDesc.GetLengths()[I2];
        std::size_t out_w  = out.mDesc.GetLengths()[I3];

        double acc = 0;

        for(int y = 0; y < num_y; ++y)
        {
            int h_tmp = hi + in_left_pads[I0] - y * conv_dilations[I0];

            // This tap only touches an output row if the offset is a stride multiple.
            if(!(h_tmp % conv_strides[I0] == 0))
            {
                continue;
            }

            int ho = h_tmp / conv_strides[I0];
            if(!(ho >= 0 && ho < out_h))
            {
                continue;
            }

            for(int x = 0; x < num_x; ++x)
            {
                int w_tmp = wi + in_left_pads[I1] - x * conv_dilations[I1];

                if(!(w_tmp % conv_strides[I1] == 0))
                {
                    continue;
                }

                int wo = w_tmp / conv_strides[I1];
                if(!(wo >= 0 && wo < out_w))
                {
                    continue;
                }

                for(int k = 0; k < num_k; ++k)
                {
                    acc += out(n, k, ho, wo) * wei(k, c, y, x);
                }
            }
        }

        in(n, c, hi, wi) = acc;
    };

    // NHWC input / KYXC weight / NHWK output.
    auto grad_nhwc = [&](auto n, auto hi, auto wi, auto c) {
        std::size_t num_k  = wei.mDesc.GetLengths()[I0];
        std::size_t num_y  = wei.mDesc.GetLengths()[I1];
        std::size_t num_x  = wei.mDesc.GetLengths()[I2];
        std::size_t out_h  = out.mDesc.GetLengths()[I1];
        std::size_t out_w  = out.mDesc.GetLengths()[I2];

        double acc = 0;

        for(int y = 0; y < num_y; ++y)
        {
            int h_tmp = hi + in_left_pads[I0] - y * conv_dilations[I0];

            if(!(h_tmp % conv_strides[I0] == 0))
            {
                continue;
            }

            int ho = h_tmp / conv_strides[I0];
            if(!(ho >= 0 && ho < out_h))
            {
                continue;
            }

            for(int x = 0; x < num_x; ++x)
            {
                int w_tmp = wi + in_left_pads[I1] - x * conv_dilations[I1];

                if(!(w_tmp % conv_strides[I1] == 0))
                {
                    continue;
                }

                int wo = w_tmp / conv_strides[I1];
                if(!(wo >= 0 && wo < out_w))
                {
                    continue;
                }

                for(int k = 0; k < num_k; ++k)
                {
                    acc += out(n, ho, wo, k) * wei(k, y, x, c);
                }
            }
        }

        in(n, hi, wi, c) = acc;
    };

    switch(layout)
    {
    case ConvTensorLayout::NCHW:
        make_ParallelTensorFunctor(grad_nchw,
                                   in.mDesc.GetLengths()[0],
                                   in.mDesc.GetLengths()[1],
                                   in.mDesc.GetLengths()[2],
                                   in.mDesc.GetLengths()[3])(std::thread::hardware_concurrency());
        break;
    case ConvTensorLayout::NHWC:
        make_ParallelTensorFunctor(grad_nhwc,
                                   in.mDesc.GetLengths()[0],
                                   in.mDesc.GetLengths()[1],
                                   in.mDesc.GetLengths()[2],
                                   in.mDesc.GetLengths()[3])(std::thread::hardware_concurrency());
        break;
    default: throw std::runtime_error("wrong! not supported layout");
    }
}

View File

@@ -0,0 +1,322 @@
#ifndef HOST_TENSOR_HPP
#define HOST_TENSOR_HPP
#include <thread>
#include <vector>
#include <numeric>
#include <algorithm>
#include <utility>
#include <cassert>
#include <iostream>
// Streams every element of `range` to `os`, writing `delim` between
// consecutive elements (not before the first or after the last).
// Returns `os` so calls can be chained.
template <typename Range>
std::ostream& LogRange(std::ostream& os, Range&& range, std::string delim)
{
    bool need_delim = false;
    for(auto&& item : range)
    {
        if(need_delim)
        {
            os << delim;
        }
        os << item;
        need_delim = true;
    }
    return os;
}
// Like LogRange, but each element is converted with static_cast<T> before
// being streamed. `delim` is written between consecutive elements only.
// Returns `os` so calls can be chained.
template <typename T, typename Range>
std::ostream& LogRangeAsType(std::ostream& os, Range&& range, std::string delim)
{
    bool need_delim = false;
    for(auto&& item : range)
    {
        if(need_delim)
        {
            os << delim;
        }
        os << static_cast<T>(item);
        need_delim = true;
    }
    return os;
}
// Numeric tag identifying an element data type.
typedef enum
{
Half = 0,
Float = 1,
} DataType_t;
// Maps a C++ element type to its DataType_t tag.
// The primary template is only declared, so using an unmapped type is a compile error.
template <typename T>
struct DataType;
// float maps to DataType_t::Float (exposed as the integral_constant's ::value).
template <>
struct DataType<float> : std::integral_constant<DataType_t, DataType_t::Float>
{
};
// Helper for call_f_unpack_args: expands `args` element by element via
// std::get<Is> and passes the elements to `f`.
template <typename F, typename T, std::size_t... Is>
auto call_f_unpack_args_impl(F f, T args, std::index_sequence<Is...>)
{
    return f(std::get<Is>(args)...);
}
// Calls `f` with the elements of the tuple-like `args` as separate arguments
// and returns whatever `f` returns. T must support std::tuple_size and std::get.
template <typename F, typename T>
auto call_f_unpack_args(F f, T args)
{
    using Indices = std::make_index_sequence<std::tuple_size<T>::value>;
    return call_f_unpack_args_impl(f, args, Indices{});
}
// Helper for construct_f_unpack_args: constructs an F from the elements of
// `args`, extracted with std::get<Is>.
template <typename F, typename T, std::size_t... Is>
auto construct_f_unpack_args_impl(T args, std::index_sequence<Is...>)
{
    return F(std::get<Is>(args)...);
}
// Constructs and returns an F whose constructor arguments are the elements of
// the tuple-like `args`. The first parameter only selects the type F; its
// value is ignored.
template <typename F, typename T>
auto construct_f_unpack_args(F, T args)
{
    using Indices = std::make_index_sequence<std::tuple_size<T>::value>;
    return construct_f_unpack_args_impl<F>(args, Indices{});
}
// Describes the shape of a host tensor: one length and one stride per dimension.
struct HostTensorDescriptor
{
HostTensorDescriptor() = delete;
// Builds a descriptor from lengths; strides are derived by CalculateStrides().
template <typename X>
HostTensorDescriptor(std::vector<X> lens);
// Builds a descriptor from explicit lengths and strides.
template <typename X, typename Y>
HostTensorDescriptor(std::vector<X> lens, std::vector<Y> strides);
// Recomputes mStrides from mLens.
void CalculateStrides();
// Builds a descriptor from any range of lengths (anything with begin()/end());
// strides are derived by CalculateStrides().
template <typename Range>
HostTensorDescriptor(const Range& lens) : mLens(lens.begin(), lens.end())
{
this->CalculateStrides();
}
// Builds a descriptor from ranges of explicit lengths and strides.
// NOTE(review): the two ranges are not checked for equal size here.
template <typename Range1, typename Range2>
HostTensorDescriptor(const Range1& lens, const Range2& strides)
: mLens(lens.begin(), lens.end()), mStrides(strides.begin(), strides.end())
{
}
std::size_t GetNumOfDimension() const;
std::size_t GetElementSize() const;
std::size_t GetElementSpace() const;
const std::vector<std::size_t>& GetLengths() const;
const std::vector<std::size_t>& GetStrides() const;
// Returns the linear offset of a multi-index: sum of index[d] * stride[d].
// Exactly one index per dimension is expected (checked by assert only).
template <typename... Is>
std::size_t GetOffsetFromMultiIndex(Is... is) const
{
assert(sizeof...(Is) == this->GetNumOfDimension());
// Every index is converted to std::size_t before the dot product with the strides.
std::initializer_list<std::size_t> iss{static_cast<std::size_t>(is)...};
return std::inner_product(iss.begin(), iss.end(), mStrides.begin(), std::size_t{0});
}
private:
std::vector<std::size_t> mLens;    // length of each dimension
std::vector<std::size_t> mStrides; // stride of each dimension, in elements
};
// std::thread that joins itself on destruction if it is still joinable,
// so destroying an unfinished thread waits for it to complete.
struct joinable_thread : std::thread
{
    // Forwards any arguments to the std::thread constructor
    // (no arguments yields a thread object that is not joinable).
    template <typename... Xs>
    joinable_thread(Xs&&... xs) : std::thread(std::forward<Xs>(xs)...)
    {
    }

    joinable_thread(joinable_thread&&) = default;
    joinable_thread& operator=(joinable_thread&&) = default;

    ~joinable_thread()
    {
        if(!this->joinable())
        {
            return;
        }
        this->join();
    }
};
// Calls a functor once for every multi-index of an N-d index space, splitting
// the flattened index range into contiguous chunks that run on separate threads.
//
// F   callable taking NDIM std::size_t indices
// Xs  one type per dimension; each length is converted to std::size_t
template <typename F, typename... Xs>
struct ParallelTensorFunctor
{
    F mF;
    static constexpr std::size_t NDIM = sizeof...(Xs);
    // The constructor writes mStrides.back(), which needs at least one dimension.
    static_assert(sizeof...(Xs) > 0, "ParallelTensorFunctor needs at least one dimension");
    std::array<std::size_t, NDIM> mLens;    // length of each dimension
    std::array<std::size_t, NDIM> mStrides; // packed strides, last dimension fastest
    std::size_t mN1d;                       // total number of multi-indices

    ParallelTensorFunctor(F f, Xs... xs) : mF(f), mLens({static_cast<std::size_t>(xs)...})
    {
        // Packed strides: stride[last] = 1, stride[d] = stride[d+1] * len[d+1].
        mStrides.back() = 1;
        std::partial_sum(mLens.rbegin(),
                         mLens.rend() - 1,
                         mStrides.rbegin() + 1,
                         std::multiplies<std::size_t>());
        mN1d = mStrides[0] * mLens[0];
    }

    // Converts a flattened index `i` back into its per-dimension indices.
    std::array<std::size_t, NDIM> GetNdIndices(std::size_t i) const
    {
        std::array<std::size_t, NDIM> indices;

        for(std::size_t idim = 0; idim < NDIM; ++idim)
        {
            indices[idim] = i / mStrides[idim];
            i -= indices[idim] * mStrides[idim];
        }

        return indices;
    }

    // Runs mF over the whole index space using `num_thread` worker threads and
    // returns after all of them have finished.
    // A value of 0 (which std::thread::hardware_concurrency() may report) is
    // treated as 1 so the chunk-size division below is well defined.
    void operator()(std::size_t num_thread = std::thread::hardware_concurrency()) const
    {
        if(num_thread == 0)
        {
            num_thread = 1;
        }

        // Ceiling division: every thread gets at most work_per_thread indices.
        std::size_t work_per_thread = (mN1d + num_thread - 1) / num_thread;

        // The joinable_thread destructors join the workers when `threads`
        // goes out of scope at the end of this function.
        std::vector<joinable_thread> threads(num_thread);

        for(std::size_t it = 0; it < num_thread; ++it)
        {
            std::size_t iw_begin = it * work_per_thread;
            std::size_t iw_end   = std::min((it + 1) * work_per_thread, mN1d);

            // Trailing threads may receive an empty range (iw_begin >= iw_end).
            auto f = [=] {
                for(std::size_t iw = iw_begin; iw < iw_end; ++iw)
                {
                    call_f_unpack_args(mF, GetNdIndices(iw));
                }
            };
            threads[it] = joinable_thread(f);
        }
    }
};
template <typename F, typename... Xs>
auto make_ParallelTensorFunctor(F f, Xs... xs)
{
return ParallelTensorFunctor<F, Xs...>(f, xs...);
}
// Host tensor: a HostTensorDescriptor plus a std::vector<T> holding
// GetElementSpace() elements. Elements are addressed by multi-index.
template <typename T>
struct Tensor
{
    // Lengths given as a braced list; strides come from the descriptor.
    template <typename X>
    Tensor(std::initializer_list<X> lens) : mDesc(lens), mData(mDesc.GetElementSpace())
    {
    }

    // Lengths given as a vector; strides come from the descriptor.
    template <typename X>
    Tensor(std::vector<X> lens) : mDesc(lens), mData(mDesc.GetElementSpace())
    {
    }

    // Explicit lengths and strides.
    template <typename X, typename Y>
    Tensor(std::vector<X> lens, std::vector<Y> strides)
        : mDesc(lens, strides), mData(mDesc.GetElementSpace())
    {
    }

    // Copies an existing descriptor.
    Tensor(const HostTensorDescriptor& desc) : mDesc(desc), mData(mDesc.GetElementSpace()) {}

    // Sets every element to g(indices...), using `num_thread` threads.
    // Supports tensors of 1 to 4 dimensions; throws std::runtime_error otherwise.
    template <typename G>
    void GenerateTensorValue(G g, std::size_t num_thread = 1)
    {
        const auto& lens       = mDesc.GetLengths();
        const std::size_t rank = mDesc.GetNumOfDimension();

        if(rank == 1)
        {
            auto fill = [&](auto i) { (*this)(i) = g(i); };
            make_ParallelTensorFunctor(fill, lens[0])(num_thread);
        }
        else if(rank == 2)
        {
            auto fill = [&](auto i0, auto i1) { (*this)(i0, i1) = g(i0, i1); };
            make_ParallelTensorFunctor(fill, lens[0], lens[1])(num_thread);
        }
        else if(rank == 3)
        {
            auto fill = [&](auto i0, auto i1, auto i2) { (*this)(i0, i1, i2) = g(i0, i1, i2); };
            make_ParallelTensorFunctor(fill, lens[0], lens[1], lens[2])(num_thread);
        }
        else if(rank == 4)
        {
            auto fill = [&](auto i0, auto i1, auto i2, auto i3) {
                (*this)(i0, i1, i2, i3) = g(i0, i1, i2, i3);
            };
            make_ParallelTensorFunctor(fill, lens[0], lens[1], lens[2], lens[3])(num_thread);
        }
        else
        {
            throw std::runtime_error("unspported dimension");
        }
    }

    // Element access by multi-index (one index per dimension).
    template <typename... Is>
    T& operator()(Is... is)
    {
        const std::size_t offset = mDesc.GetOffsetFromMultiIndex(is...);
        return mData[offset];
    }

    template <typename... Is>
    const T& operator()(Is... is) const
    {
        const std::size_t offset = mDesc.GetOffsetFromMultiIndex(is...);
        return mData[offset];
    }

    // Iteration over the underlying storage in memory order.
    typename std::vector<T>::iterator begin() { return mData.begin(); }

    typename std::vector<T>::iterator end() { return mData.end(); }

    typename std::vector<T>::const_iterator begin() const { return mData.begin(); }

    typename std::vector<T>::const_iterator end() const { return mData.end(); }

    // mDesc must stay declared before mData: mData is sized from mDesc.
    HostTensorDescriptor mDesc;
    std::vector<T> mData;
};
// Builds a descriptor from a vector of lengths and derives the strides with
// CalculateStrides().
// The lengths are copied element-wise through an iterator range so that any
// element type X convertible to std::size_t is accepted; copy-initializing
// mLens from a std::vector<X> only compiles when X is exactly std::size_t.
template <typename X>
HostTensorDescriptor::HostTensorDescriptor(std::vector<X> lens) : mLens(lens.begin(), lens.end())
{
    this->CalculateStrides();
}
// Builds a descriptor from explicit lengths and strides (no strides are computed).
// Both vectors are copied element-wise through iterator ranges so that element
// types X and Y other than std::size_t are accepted; copy-initializing the
// members from std::vector<X>/std::vector<Y> only compiles for std::size_t.
template <typename X, typename Y>
HostTensorDescriptor::HostTensorDescriptor(std::vector<X> lens, std::vector<Y> strides)
    : mLens(lens.begin(), lens.end()), mStrides(strides.begin(), strides.end())
{
}
// Writes a description of `desc` to `os` (std::cout by default).
void ostream_HostTensorDescriptor(const HostTensorDescriptor& desc, std::ostream& os = std::cout);
// Compares `result` against `ref` element by element (in storage order) and
// prints to std::cout:
//   - "error": the sum of absolute differences
//   - "max_diff": the largest absolute difference, followed by the reference
//     and result values at which it occurs
// `result` must hold at least as many elements as `ref` (checked by assert).
// Differences are computed and accumulated in double so that long sums do not
// lose precision to a float accumulator.
template <typename T>
void check_error(const Tensor<T>& ref, const Tensor<T>& result)
{
    assert(result.mData.size() >= ref.mData.size());

    double error    = 0;
    double max_diff = -1; // stays -1 when `ref` is empty
    float ref_value = 0, result_value = 0;

    for(std::size_t i = 0; i < ref.mData.size(); ++i)
    {
        const double diff = std::abs(double(ref.mData[i]) - double(result.mData[i]));
        error += diff;

        if(max_diff < diff)
        {
            max_diff     = diff;
            ref_value    = ref.mData[i];
            result_value = result.mData[i];
        }
    }

    std::cout << "error: " << error << std::endl;
    std::cout << "max_diff: " << max_diff << ", " << ref_value << ", " << result_value << std::endl;
}
#endif

View File

@@ -0,0 +1,60 @@
#ifndef HOST_TENSOR_GENERATOR_HPP
#define HOST_TENSOR_GENERATOR_HPP
#include <cmath>
#include "config.hpp"
// Generator that returns the same constant for every index.
struct GeneratorTensor_1
{
    int value = 1; // constant to return

    // The indices are accepted and ignored.
    template <typename... Is>
    float operator()(Is...)
    {
        return static_cast<float>(value);
    }
};
// Generator that returns a pseudo-random integer value in
// [min_value, max_value), drawn with std::rand().
struct GeneratorTensor_2
{
    int min_value = 0; // inclusive lower bound
    int max_value = 1; // exclusive upper bound

    // The indices are accepted and ignored.
    template <typename... Is>
    float operator()(Is...)
    {
        const int span = max_value - min_value;

        // An empty range would make the modulo below divide by zero;
        // return the only sensible value instead.
        if(span == 0)
        {
            return min_value;
        }

        return (std::rand() % span) + min_value;
    }
};
// Generator that returns a pseudo-random value between min_value and
// max_value, obtained by scaling std::rand() / RAND_MAX into that interval.
template <typename T>
struct GeneratorTensor_3
{
    T min_value = 0;
    T max_value = 1;

    // The indices are accepted and ignored.
    template <typename... Is>
    float operator()(Is...)
    {
        // Fraction in [0, 1].
        const float unit = float(std::rand()) / float(RAND_MAX);

        return min_value + unit * (max_value - min_value);
    }
};
struct GeneratorTensor_Checkboard
{
template <typename... Ts>
float operator()(Ts... Xs) const
{
std::array<ck::index_t, sizeof...(Ts)> dims = {{static_cast<ck::index_t>(Xs)...}};
return std::accumulate(dims.begin(),
dims.end(),
true,
[](bool init, ck::index_t x) -> int { return init != (x % 2); })
? 1
: -1;
}
};
#endif

View File

@@ -0,0 +1,67 @@
#include "device.hpp"
#include <stdexcept>
// Allocates mem_size bytes with hipMalloc.
// Throws std::runtime_error (carrying the HIP error string) if the allocation
// fails, instead of silently continuing with an unusable buffer.
// mpDeviceBuf is initialized to nullptr first so it never holds an
// indeterminate value.
DeviceMem::DeviceMem(std::size_t mem_size) : mpDeviceBuf(nullptr), mMemSize(mem_size)
{
    const hipError_t status = hipMalloc(&mpDeviceBuf, mMemSize);

    if(status != hipSuccess)
    {
        throw std::runtime_error(hipGetErrorString(status));
    }
}
// Returns the raw device pointer; ownership stays with this object.
void* DeviceMem::GetDeviceBuffer() { return mpDeviceBuf; }
void DeviceMem::ToDevice(const void* p)
{
hipGetErrorString(
hipMemcpy(mpDeviceBuf, const_cast<void*>(p), mMemSize, hipMemcpyHostToDevice));
}
void DeviceMem::FromDevice(void* p)
{
hipGetErrorString(hipMemcpy(p, mpDeviceBuf, mMemSize, hipMemcpyDeviceToHost));
}
// Releases the device buffer. The hipFree status is only passed to
// hipGetErrorString and its result discarded, so a failure is not reported.
DeviceMem::~DeviceMem() { hipGetErrorString(hipFree(mpDeviceBuf)); }
// Event-based timer: records one HIP event at Start() and one at End() and
// reports the elapsed time between them.
// HIP status codes are only passed to hipGetErrorString and the resulting
// string is discarded, so failures are not reported.
struct KernelTimerImpl
{
    // Creates the two events.
    KernelTimerImpl()
    {
        hipGetErrorString(hipEventCreate(&mStart));
        hipGetErrorString(hipEventCreate(&mEnd));
    }

    // Destroys the two events.
    ~KernelTimerImpl()
    {
        hipGetErrorString(hipEventDestroy(mStart));
        hipGetErrorString(hipEventDestroy(mEnd));
    }

    // Waits for the device to become idle, then records the start event on the null stream.
    void Start()
    {
        hipGetErrorString(hipDeviceSynchronize());
        hipGetErrorString(hipEventRecord(mStart, nullptr));
    }

    // Records the end event on the null stream and blocks until it has completed.
    void End()
    {
        hipGetErrorString(hipEventRecord(mEnd, nullptr));
        hipGetErrorString(hipEventSynchronize(mEnd));
    }

    // Returns the time between the start and end events as reported by
    // hipEventElapsedTime.
    // `time` starts at 0 so a failed query returns 0 rather than an
    // indeterminate value.
    float GetElapsedTime() const
    {
        float time = 0;
        hipGetErrorString(hipEventElapsedTime(&time, mStart, mEnd));
        return time;
    }

    hipEvent_t mStart, mEnd;
};
// Creates the timer together with its implementation object.
KernelTimer::KernelTimer() : impl(std::make_unique<KernelTimerImpl>()) {}
// Defined here, where KernelTimerImpl is complete, so that the unique_ptr
// member can destroy the implementation object.
KernelTimer::~KernelTimer() {}
// The remaining members forward directly to the implementation object.
void KernelTimer::Start() { impl->Start(); }
void KernelTimer::End() { impl->End(); }
float KernelTimer::GetElapsedTime() const { return impl->GetElapsedTime(); }

View File

@@ -0,0 +1,48 @@
#include <boost/range/adaptor/transformed.hpp>
#include <cassert>
#include "host_tensor.hpp"
// Recomputes mStrides as packed strides for mLens: the last dimension gets
// stride 1 and every other dimension gets the product of all lengths after it.
// With no dimensions, mStrides is left empty.
void HostTensorDescriptor::CalculateStrides()
{
    mStrides.assign(mLens.size(), 0);

    if(mStrides.empty())
    {
        return;
    }

    mStrides.back() = 1;

    // Walk from the last dimension towards the first.
    for(std::size_t dim = mLens.size() - 1; dim > 0; --dim)
    {
        mStrides[dim - 1] = mStrides[dim] * mLens[dim];
    }
}
// Returns the number of dimensions (the number of stored lengths).
std::size_t HostTensorDescriptor::GetNumOfDimension() const { return mLens.size(); }
// Returns the number of logical elements: the product of all lengths
// (1 when there are no dimensions).
std::size_t HostTensorDescriptor::GetElementSize() const
{
    assert(mLens.size() == mStrides.size());

    std::size_t count = 1;
    for(const std::size_t len : mLens)
    {
        count *= len;
    }
    return count;
}
// Returns the number of storage slots needed to hold the tensor:
// 1 + sum over dimensions of (length - 1) * stride, i.e. the offset of the
// last element plus one.
std::size_t HostTensorDescriptor::GetElementSpace() const
{
    std::size_t last_offset = 0;

    auto stride_it = mStrides.begin();
    for(auto len_it = mLens.begin(); len_it != mLens.end(); ++len_it, ++stride_it)
    {
        last_offset = last_offset + (*len_it - 1) * (*stride_it);
    }

    return last_offset + 1;
}
// Read-only access to the per-dimension lengths.
const std::vector<std::size_t>& HostTensorDescriptor::GetLengths() const { return mLens; }
// Read-only access to the per-dimension strides.
const std::vector<std::size_t>& HostTensorDescriptor::GetStrides() const { return mStrides; }
// Writes one line describing `desc` to `os`, in the form
//   dim <n>, lengths {l0, l1, ...}, strides {s0, s1, ...}
// and flushes the stream (std::endl).
void ostream_HostTensorDescriptor(const HostTensorDescriptor& desc, std::ostream& os)
{
    os << "dim " << desc.GetNumOfDimension() << ", "
       << "lengths {";
    LogRange(os, desc.GetLengths(), ", ") << "}, "
                                          << "strides {";
    LogRange(os, desc.GetStrides(), ", ") << "}" << std::endl;
}