mirror of
https://github.com/ROCm/composable_kernel.git
synced 2026-05-13 01:36:06 +00:00
v5r1 fusion kernels for inference (#49)
* init
* refactor for 1x1
* rename e0_e1
* add e1 with bugs
* debug
* fixed
* fixed e1
* add timer
* imprve threadwise gemm with dot2
* add e2
* tuning
* seperate c2
* add nhwc
* restore nchwc
* clean
* opt
* fixed; tuning
* add BGlobalMoveSliceWindowStepHacks{}
* tuning
* repeat running
* adjust
* merge v5r1 nchwc
* add adaptors
* split k0 k1 in c_thread_grid
* split h and w
* remove v5r1 nhwc
* clean for pr
* remove host_conv_add
* clean code
* clean
* add dynamic support
* static mode
* test static
* add conv+add fusion
* fixed validation
* naming fix
* use activ_enum
* make static
* refactor conv_add for InMem::add
* add bias
* add conv_out
* add configurable makeddesc
* add maxpool fusion
* add maxpool host for validation
* enable static desc
* conv-only use v5r1_add
* test
* test
* for binary dumps
* fixed incorrect results due to typo
* clean
* debugging maxpool
* workaround with offset trick
* clean code
* modularize ops of fusion
* add gridwise_gemm_v3
* create seperate fusion fun
* enable dynamic mode of conv and conv+resize_add
* add dynamic mode of maxpool
* add pass by point
* add activ_type as arguments
* merge develop
* clean
* reset config to old default
Co-authored-by: Chao Liu <chao.liu2@amd.com>
This commit is contained in:
414
host/driver_offline/src/conv_add_fwd_driver_offline_nchwc.cpp
Normal file
414
host/driver_offline/src/conv_add_fwd_driver_offline_nchwc.cpp
Normal file
@@ -0,0 +1,414 @@
|
||||
#include <iostream>
|
||||
#include <numeric>
|
||||
#include <initializer_list>
|
||||
#include <cstdlib>
|
||||
#include <stdlib.h>
|
||||
#include <half.hpp>
|
||||
#include "config.hpp"
|
||||
#include "debug.hpp"
|
||||
#include "print.hpp"
|
||||
#include "device.hpp"
|
||||
#include "host_tensor.hpp"
|
||||
#include "host_tensor_generator.hpp"
|
||||
#include "conv_common.hpp"
|
||||
#include "device_tensor.hpp"
|
||||
#include "device_convolution_add_forward_implicit_gemm_v5r1_dlops_nc0hwc1_kc0yxc1_nk0hwk1.hpp"
|
||||
|
||||
#define USE_DYNAMIC_MODE 0
|
||||
#define USE_CONV_FWD_V5R1_NCHWC 1
|
||||
|
||||
enum ConvForwardAlgo
|
||||
{
|
||||
V5R1NCHWC // 0
|
||||
};
|
||||
|
||||
template <typename TIn,
|
||||
typename TWei,
|
||||
typename TOut,
|
||||
typename ConvStrides,
|
||||
typename ConvDilations,
|
||||
typename InLeftPads,
|
||||
typename InRightPads>
|
||||
void host_direct_convolution_add_nchwc(const Tensor<TIn>& in,
|
||||
const Tensor<TWei>& wei,
|
||||
const Tensor<TOut>& add,
|
||||
const Tensor<TOut>& bias,
|
||||
Tensor<TOut>& add_host,
|
||||
Tensor<TOut>& out_host,
|
||||
const ConvStrides& conv_strides,
|
||||
const ConvDilations& conv_dilations,
|
||||
const InLeftPads& in_left_pads,
|
||||
const InRightPads&,
|
||||
const ck::ActivTypeEnum_t activ_type)
|
||||
{
|
||||
using namespace ck;
|
||||
|
||||
constexpr auto I0 = Number<0>{};
|
||||
constexpr auto I1 = Number<1>{};
|
||||
|
||||
auto f_nchw = [&](auto n, auto k0, auto ho, auto wo, auto k1) {
|
||||
double v = 0;
|
||||
auto k = k0 * out_host.mDesc.GetLengths()[4] + k1;
|
||||
|
||||
for(int c0 = 0; c0 < wei.mDesc.GetLengths()[1]; ++c0)
|
||||
{
|
||||
for(int y = 0; y < wei.mDesc.GetLengths()[2]; ++y)
|
||||
{
|
||||
int hi = ho * conv_strides[I0] + y * conv_dilations[I0] - in_left_pads[I0];
|
||||
for(int x = 0; x < wei.mDesc.GetLengths()[3]; ++x)
|
||||
{
|
||||
int wi = wo * conv_strides[I1] + x * conv_dilations[I1] - in_left_pads[I1];
|
||||
if(hi >= 0 && hi < in.mDesc.GetLengths()[2] && wi >= 0 &&
|
||||
wi < in.mDesc.GetLengths()[3])
|
||||
{
|
||||
|
||||
for(int c1 = 0; c1 < wei.mDesc.GetLengths()[4]; ++c1)
|
||||
{
|
||||
v += static_cast<const double>(in(n, c0, hi, wi, c1)) *
|
||||
static_cast<const double>(wei(k, c0, y, x, c1));
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
v += bias(k0, k1);
|
||||
v = activ(v, activ_type);
|
||||
|
||||
const int hox2 = ho * 2;
|
||||
const int wox2 = wo * 2;
|
||||
|
||||
out_host(n, k0, ho, wo, k1) = v;
|
||||
|
||||
add_host(n, k0, hox2, wox2, k1) = v + add(n, k0, hox2, wox2, k1);
|
||||
add_host(n, k0, hox2, wox2 + 1, k1) = v + add(n, k0, hox2, wox2 + 1, k1);
|
||||
add_host(n, k0, hox2 + 1, wox2, k1) = v + add(n, k0, hox2 + 1, wox2, k1);
|
||||
add_host(n, k0, hox2 + 1, wox2 + 1, k1) = v + add(n, k0, hox2 + 1, wox2 + 1, k1);
|
||||
};
|
||||
|
||||
make_ParallelTensorFunctor(f_nchw,
|
||||
out_host.mDesc.GetLengths()[0],
|
||||
out_host.mDesc.GetLengths()[1],
|
||||
out_host.mDesc.GetLengths()[2],
|
||||
out_host.mDesc.GetLengths()[3],
|
||||
out_host.mDesc.GetLengths()[4])(std::thread::hardware_concurrency());
|
||||
}
|
||||
|
||||
int main(int argc, char* argv[])
|
||||
{
|
||||
using namespace ck;
|
||||
|
||||
constexpr auto I0 = Number<0>{};
|
||||
constexpr auto I1 = Number<1>{};
|
||||
constexpr auto I2 = Number<2>{};
|
||||
constexpr auto I3 = Number<3>{};
|
||||
constexpr auto I4 = Number<4>{};
|
||||
constexpr auto I5 = Number<5>{};
|
||||
constexpr auto I6 = Number<6>{};
|
||||
constexpr auto I7 = Number<7>{};
|
||||
|
||||
#if USE_DYNAMIC_MODE
|
||||
// dynamic mode
|
||||
if(argc != 23)
|
||||
{
|
||||
printf("arg1 to 5: algo, do_verification, init_method, do_log, nrepeat\n");
|
||||
printf("rest: N, K0, K1, C0, C1, Y, X, Hi, Wi, Sy, Sx, Dy, Dx, LeftPy, LeftPx, RightPy, "
|
||||
"RightPx\n");
|
||||
exit(1);
|
||||
}
|
||||
|
||||
constexpr ck::ActivTypeEnum_t activ_type = ActivTypeEnum_t::LeakyRelu;
|
||||
|
||||
const ConvForwardAlgo algo = static_cast<ConvForwardAlgo>(std::stoi(argv[1]));
|
||||
const bool do_verification = std::stoi(argv[2]);
|
||||
const int init_method = std::stoi(argv[3]);
|
||||
const bool do_log = std::stoi(argv[4]);
|
||||
const int nrepeat = std::stoi(argv[5]);
|
||||
|
||||
const index_t N = std::stoi(argv[6]);
|
||||
const index_t K0 = std::stoi(argv[7]);
|
||||
const index_t K1 = std::stoi(argv[8]);
|
||||
const index_t C0 = std::stoi(argv[9]);
|
||||
const index_t C1 = std::stoi(argv[10]);
|
||||
const index_t Y = std::stoi(argv[11]);
|
||||
const index_t X = std::stoi(argv[12]);
|
||||
const index_t Hi = std::stoi(argv[13]);
|
||||
const index_t Wi = std::stoi(argv[14]);
|
||||
|
||||
const index_t conv_stride_h = std::stoi(argv[15]);
|
||||
const index_t conv_stride_w = std::stoi(argv[16]);
|
||||
const index_t conv_dilation_h = std::stoi(argv[17]);
|
||||
const index_t conv_dilation_w = std::stoi(argv[18]);
|
||||
const index_t in_left_pad_h = std::stoi(argv[19]);
|
||||
const index_t in_left_pad_w = std::stoi(argv[20]);
|
||||
const index_t in_right_pad_h = std::stoi(argv[21]);
|
||||
const index_t in_right_pad_w = std::stoi(argv[22]);
|
||||
|
||||
const index_t YEff = (Y - 1) * conv_dilation_h + 1;
|
||||
const index_t XEff = (X - 1) * conv_dilation_w + 1;
|
||||
|
||||
const index_t Ho = (Hi + in_left_pad_h + in_right_pad_h - YEff) / conv_stride_h + 1;
|
||||
const index_t Wo = (Wi + in_left_pad_w + in_right_pad_w - XEff) / conv_stride_w + 1;
|
||||
|
||||
const auto Hox2 = Ho * 2;
|
||||
const auto Wox2 = Wo * 2;
|
||||
#else
|
||||
// static mode
|
||||
if(argc < 6)
|
||||
{
|
||||
printf("arg1 to 5: algo, do_verification, init_method, do_log, nrepeat\n");
|
||||
exit(1);
|
||||
}
|
||||
|
||||
const ConvForwardAlgo algo = static_cast<ConvForwardAlgo>(std::stoi(argv[1]));
|
||||
|
||||
const bool do_verification = std::stoi(argv[2]);
|
||||
const int init_method = std::stoi(argv[3]);
|
||||
const bool do_log = std::stoi(argv[4]);
|
||||
const int nrepeat = std::stoi(argv[5]);
|
||||
|
||||
constexpr ck::ActivTypeEnum_t activ_type = ActivTypeEnum_t::LeakyRelu;
|
||||
|
||||
#if 0
|
||||
constexpr auto N = Number<1>{};
|
||||
constexpr auto Hi = Number<1080>{};
|
||||
constexpr auto Wi = Number<1920>{};
|
||||
constexpr auto Y = Number<3>{};
|
||||
constexpr auto X = Number<3>{};
|
||||
constexpr auto C0 = Number<2>{};
|
||||
constexpr auto C1 = Number<8>{};
|
||||
constexpr auto K1 = Number<8>{};
|
||||
constexpr auto K0 = Number<8>{};
|
||||
#elif 0
|
||||
constexpr auto N = Number<1>{};
|
||||
constexpr auto Hi = Number<540>{};
|
||||
constexpr auto Wi = Number<960>{};
|
||||
constexpr auto Y = Number<3>{};
|
||||
constexpr auto X = Number<3>{};
|
||||
constexpr auto C0 = Number<2>{};
|
||||
constexpr auto C1 = Number<8>{};
|
||||
constexpr auto K0 = Number<2>{};
|
||||
constexpr auto K1 = Number<8>{};
|
||||
#elif 0
|
||||
constexpr auto N = Number<1>{};
|
||||
constexpr auto Hi = Number<270>{};
|
||||
constexpr auto Wi = Number<480>{};
|
||||
constexpr auto Y = Number<3>{};
|
||||
constexpr auto X = Number<3>{};
|
||||
constexpr auto C0 = Number<2>{};
|
||||
constexpr auto C1 = Number<8>{};
|
||||
constexpr auto K0 = Number<2>{};
|
||||
constexpr auto K1 = Number<8>{};
|
||||
#elif 1
|
||||
constexpr auto N = Number<128>{};
|
||||
constexpr auto Hi = Number<135>{};
|
||||
constexpr auto Wi = Number<240>{};
|
||||
constexpr auto Y = Number<3>{};
|
||||
constexpr auto X = Number<3>{};
|
||||
constexpr auto C0 = Number<2>{};
|
||||
constexpr auto C1 = Number<8>{};
|
||||
constexpr auto K0 = Number<2>{};
|
||||
constexpr auto K1 = Number<8>{};
|
||||
#elif 1
|
||||
constexpr auto N = Number<1>{};
|
||||
constexpr auto Hi = Number<32>{};
|
||||
constexpr auto Wi = Number<32>{};
|
||||
constexpr auto Y = Number<3>{};
|
||||
constexpr auto X = Number<3>{};
|
||||
constexpr auto C0 = Number<2>{};
|
||||
constexpr auto C1 = Number<8>{};
|
||||
constexpr auto K1 = Number<8>{};
|
||||
constexpr auto K0 = Number<8>{};
|
||||
#endif
|
||||
|
||||
constexpr auto conv_stride_h = I1;
|
||||
constexpr auto conv_stride_w = I1;
|
||||
constexpr auto conv_dilation_h = I1;
|
||||
constexpr auto conv_dilation_w = I1;
|
||||
constexpr auto in_left_pad_h = I1;
|
||||
constexpr auto in_left_pad_w = I1;
|
||||
constexpr auto in_right_pad_h = I1;
|
||||
constexpr auto in_right_pad_w = I1;
|
||||
|
||||
constexpr auto YEff = (Y - I1) * conv_dilation_h + I1;
|
||||
constexpr auto XEff = (X - I1) * conv_dilation_w + I1;
|
||||
|
||||
constexpr auto Ho = (Hi + in_left_pad_h + in_right_pad_h - YEff) / conv_stride_h + I1;
|
||||
constexpr auto Wo = (Wi + in_left_pad_w + in_right_pad_w - XEff) / conv_stride_w + I1;
|
||||
|
||||
constexpr auto Hox2 = Number<Ho * 2>{};
|
||||
constexpr auto Wox2 = Number<Wo * 2>{};
|
||||
|
||||
#endif
|
||||
|
||||
#if 0
|
||||
using in_data_t = float;
|
||||
using acc_data_t = float;
|
||||
using out_data_t = float;
|
||||
#elif 1
|
||||
using in_data_t = half_t;
|
||||
using acc_data_t = float;
|
||||
using out_data_t = half_t;
|
||||
#elif 1
|
||||
using in_data_t = int8_t;
|
||||
using acc_data_t = int32_t;
|
||||
using out_data_t = int8_t;
|
||||
#endif
|
||||
|
||||
std::vector<std::size_t> in_lengths_host(5), wei_lengths_host(5), out_lengths_host(5),
|
||||
add_lengths_host(5), bias_lengths_host(2);
|
||||
|
||||
in_lengths_host[0] = static_cast<std::size_t>(N);
|
||||
in_lengths_host[1] = static_cast<std::size_t>(C0);
|
||||
in_lengths_host[2] = static_cast<std::size_t>(Hi);
|
||||
in_lengths_host[3] = static_cast<std::size_t>(Wi);
|
||||
in_lengths_host[4] = static_cast<std::size_t>(C1);
|
||||
|
||||
wei_lengths_host[0] = static_cast<std::size_t>(K0 * K1);
|
||||
wei_lengths_host[1] = static_cast<std::size_t>(C0);
|
||||
wei_lengths_host[2] = static_cast<std::size_t>(Y);
|
||||
wei_lengths_host[3] = static_cast<std::size_t>(X);
|
||||
wei_lengths_host[4] = static_cast<std::size_t>(C1);
|
||||
|
||||
out_lengths_host[0] = static_cast<std::size_t>(N);
|
||||
out_lengths_host[1] = static_cast<std::size_t>(K0);
|
||||
out_lengths_host[2] = static_cast<std::size_t>(Ho);
|
||||
out_lengths_host[3] = static_cast<std::size_t>(Wo);
|
||||
out_lengths_host[4] = static_cast<std::size_t>(K1);
|
||||
|
||||
add_lengths_host[0] = static_cast<std::size_t>(N);
|
||||
add_lengths_host[1] = static_cast<std::size_t>(K0);
|
||||
add_lengths_host[2] = static_cast<std::size_t>(Hox2);
|
||||
add_lengths_host[3] = static_cast<std::size_t>(Wox2);
|
||||
add_lengths_host[4] = static_cast<std::size_t>(K1);
|
||||
|
||||
bias_lengths_host[0] = static_cast<std::size_t>(K0);
|
||||
bias_lengths_host[1] = static_cast<std::size_t>(K1);
|
||||
|
||||
Tensor<in_data_t> in(in_lengths_host);
|
||||
Tensor<in_data_t> wei(wei_lengths_host);
|
||||
Tensor<in_data_t> add(add_lengths_host);
|
||||
Tensor<in_data_t> add_device(add_lengths_host);
|
||||
Tensor<in_data_t> add_host(add_lengths_host);
|
||||
Tensor<out_data_t> bias(bias_lengths_host);
|
||||
Tensor<out_data_t> out_host(out_lengths_host);
|
||||
|
||||
ostream_HostTensorDescriptor(in.mDesc, std::cout << "in: ");
|
||||
ostream_HostTensorDescriptor(wei.mDesc, std::cout << "wei: ");
|
||||
ostream_HostTensorDescriptor(add.mDesc, std::cout << "add: ");
|
||||
|
||||
print_array("InLeftPads", make_tuple(in_left_pad_h, in_left_pad_w));
|
||||
print_array("InRightPads", make_tuple(in_right_pad_h, in_right_pad_w));
|
||||
print_array("ConvStrides", make_tuple(conv_stride_h, conv_stride_w));
|
||||
print_array("ConvDilations", make_tuple(conv_dilation_h, conv_dilation_w));
|
||||
|
||||
std::size_t num_thread = std::thread::hardware_concurrency();
|
||||
|
||||
switch(init_method)
|
||||
{
|
||||
case 0:
|
||||
// no initialization
|
||||
break;
|
||||
case 1:
|
||||
in.GenerateTensorValue(GeneratorTensor_1{}, num_thread);
|
||||
wei.GenerateTensorValue(GeneratorTensor_1{}, num_thread);
|
||||
break;
|
||||
case 2:
|
||||
in.GenerateTensorValue(GeneratorTensor_1{}, num_thread);
|
||||
wei.GenerateTensorValue(GeneratorTensor_2{-5, 5}, num_thread);
|
||||
break;
|
||||
case 3:
|
||||
in.GenerateTensorValue(GeneratorTensor_2{-5, 5}, num_thread);
|
||||
wei.GenerateTensorValue(GeneratorTensor_1{}, num_thread);
|
||||
break;
|
||||
case 4:
|
||||
in.GenerateTensorValue(GeneratorTensor_2{-5, 5}, num_thread);
|
||||
wei.GenerateTensorValue(GeneratorTensor_2{-5, 5}, num_thread);
|
||||
break;
|
||||
case 5:
|
||||
in.GenerateTensorValue(GeneratorTensor_3<float>{0.0, 1.0}, num_thread);
|
||||
wei.GenerateTensorValue(GeneratorTensor_3<float>{-0.5, 0.5}, num_thread);
|
||||
break;
|
||||
default:
|
||||
in.GenerateTensorValue(GeneratorTensor_2{1, 5}, num_thread);
|
||||
|
||||
auto gen_wei = [](auto... is) {
|
||||
return GeneratorTensor_2{1, 5}(is...) * GeneratorTensor_Checkboard{}(is...);
|
||||
};
|
||||
wei.GenerateTensorValue(gen_wei, num_thread);
|
||||
}
|
||||
|
||||
bias.GenerateTensorValue(GeneratorTensor_1{}, num_thread);
|
||||
add.GenerateTensorValue(GeneratorTensor_1{}, num_thread);
|
||||
|
||||
auto f_make_for_device_nchwc = [&]() {
|
||||
const auto in_lengths_dev = make_tuple(N, C0, Hi, Wi, C1);
|
||||
const auto wei_lengths_dev = make_tuple(K0 * K1, C0, Y, X, C1);
|
||||
const auto add_lengths_dev = make_tuple(N, K0, Hox2, Wox2, K1);
|
||||
const auto out_lengths_dev = make_tuple(N, K0, Ho, Wo, K1);
|
||||
const auto conv_strides_dev = make_tuple(conv_stride_h, conv_stride_w);
|
||||
const auto conv_dilations_dev = make_tuple(conv_dilation_h, conv_dilation_w);
|
||||
const auto in_left_pads_dev = make_tuple(in_left_pad_h, in_left_pad_w);
|
||||
const auto in_right_pads_dev = make_tuple(in_right_pad_h, in_right_pad_w);
|
||||
|
||||
return make_tuple(in_lengths_dev,
|
||||
wei_lengths_dev,
|
||||
add_lengths_dev,
|
||||
out_lengths_dev,
|
||||
conv_strides_dev,
|
||||
conv_dilations_dev,
|
||||
in_left_pads_dev,
|
||||
in_right_pads_dev);
|
||||
};
|
||||
|
||||
#if USE_CONV_FWD_V5R1_NCHWC
|
||||
if(algo == ConvForwardAlgo::V5R1NCHWC)
|
||||
{
|
||||
const auto tmp = f_make_for_device_nchwc();
|
||||
|
||||
device_convolution_add_forward_implicit_gemm_v5r1_dlops_nc0hwc1_kc0yxc1_nk0hwk1<in_data_t,
|
||||
acc_data_t,
|
||||
out_data_t,
|
||||
activ_type>(
|
||||
tmp[I0], // in_lengths_dev
|
||||
tmp[I1], // wei_lengths_dev
|
||||
tmp[I2], // add_lengths_dev
|
||||
tmp[I3], // out_lengths_dev
|
||||
tmp[I4], // conv_strides_dev
|
||||
tmp[I5], // conv_dilations_dev
|
||||
tmp[I6], // in_left_pads_dev
|
||||
tmp[I7], // in_right_pads_dev
|
||||
in,
|
||||
wei,
|
||||
bias,
|
||||
add,
|
||||
add_device,
|
||||
nrepeat);
|
||||
}
|
||||
#endif
|
||||
|
||||
if(do_verification)
|
||||
{
|
||||
host_direct_convolution_add_nchwc(in,
|
||||
wei,
|
||||
add,
|
||||
bias,
|
||||
add_host,
|
||||
out_host,
|
||||
make_tuple(conv_stride_h, conv_stride_w),
|
||||
make_tuple(conv_dilation_h, conv_dilation_w),
|
||||
make_tuple(in_left_pad_h, in_left_pad_w),
|
||||
make_tuple(in_right_pad_h, in_right_pad_w),
|
||||
activ_type);
|
||||
|
||||
check_error(add_host, add_device);
|
||||
|
||||
if(do_log)
|
||||
{
|
||||
LogRangeAsType<float>(std::cout << "in : ", in.mData, ",") << std::endl;
|
||||
LogRangeAsType<float>(std::cout << "wei: ", wei.mData, ",") << std::endl;
|
||||
LogRangeAsType<float>(std::cout << "add_host: ", add_host.mData, ",") << std::endl;
|
||||
LogRangeAsType<float>(std::cout << "add_device: ", add_device.mData, ",") << std::endl;
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -15,17 +15,15 @@
|
||||
#include "device_convolution_forward_implicit_gemm_v4r4_dlops_nchw_kcyx_nkhw.hpp"
|
||||
#include "device_convolution_forward_implicit_gemm_v4r4r2_dlops_nhwc_kyxc_nhwk.hpp"
|
||||
#include "device_convolution_forward_implicit_gemm_v6r1_dlops_nchw_kcyx_nkhw.hpp"
|
||||
#include "device_convolution_forward_implicit_gemm_v5r1_dlops_nchw_kcyx_nkhw.hpp"
|
||||
#include "device_convolution_forward_implicit_gemm_v4r4r2_xdlops_nchw_kcyx_nkhw.hpp"
|
||||
#include "device_convolution_forward_implicit_gemm_v4r4r4_xdlops_nhwc_kyxc_nhwk.hpp"
|
||||
|
||||
#define USE_DYNAMIC_MODE 1
|
||||
#define USE_DYNAMIC_MODE 0
|
||||
#define USE_CONV_FWD_V4R4_NCHW 0
|
||||
#define USE_CONV_FWD_V4R4R2_NHWC 0
|
||||
#define USE_CONV_FWD_V6R1_NCHW 0
|
||||
#define USE_CONV_FWD_V5R1_NCHW 0
|
||||
#define USE_CONV_FWD_V4R4R2_NHWC 1
|
||||
#define USE_CONV_FWD_V6R1_NCHW 1
|
||||
#define USE_CONV_FWD_V4R4R2_XDL_NCHW 0
|
||||
#define USE_CONV_FWD_V4R4R4_XDL_NHWC 1
|
||||
#define USE_CONV_FWD_V4R4R4_XDL_NHWC 0
|
||||
|
||||
enum ConvTensorLayout
|
||||
{
|
||||
@@ -41,9 +39,8 @@ enum ConvForwardAlgo
|
||||
V4R4NCHW, // 0
|
||||
V4R4R2NHWC, // 1
|
||||
V6R1NCHW, // 2
|
||||
V5R1NCHW, // 3
|
||||
V4R4R2XDLNCHW, // 4
|
||||
V4R4R4XDLNHWC // 5
|
||||
V4R4R2XDLNCHW, // 3
|
||||
V4R4R4XDLNHWC // 4
|
||||
};
|
||||
|
||||
template <typename TIn,
|
||||
@@ -237,8 +234,8 @@ int main(int argc, char* argv[])
|
||||
constexpr auto Y = Number<3>{};
|
||||
constexpr auto X = Number<3>{};
|
||||
|
||||
constexpr auto conv_stride_h = I2;
|
||||
constexpr auto conv_stride_w = I2;
|
||||
constexpr auto conv_stride_h = I1;
|
||||
constexpr auto conv_stride_w = I1;
|
||||
constexpr auto conv_dilation_h = I1;
|
||||
constexpr auto conv_dilation_w = I1;
|
||||
constexpr auto in_left_pad_h = I1;
|
||||
@@ -253,7 +250,7 @@ int main(int argc, char* argv[])
|
||||
constexpr auto Wo = (Wi + in_left_pad_w + in_right_pad_w - XEff) / conv_stride_w + I1;
|
||||
#endif
|
||||
|
||||
#if 0
|
||||
#if 1
|
||||
using in_data_t = float;
|
||||
using acc_data_t = float;
|
||||
using out_data_t = float;
|
||||
@@ -472,33 +469,6 @@ int main(int argc, char* argv[])
|
||||
}
|
||||
#endif
|
||||
|
||||
#if USE_CONV_FWD_V5R1_NCHW
|
||||
if(algo == ConvForwardAlgo::V5R1NCHW)
|
||||
{
|
||||
if(layout != ConvTensorLayout::NCHW)
|
||||
{
|
||||
throw std::runtime_error("wrong! layout");
|
||||
}
|
||||
|
||||
const auto tmp = f_make_for_device_nchw();
|
||||
|
||||
device_convolution_forward_implicit_gemm_v5r1_dlops_nchw_kcyx_nkhw<in_data_t,
|
||||
16,
|
||||
acc_data_t,
|
||||
out_data_t>(tmp[I0],
|
||||
tmp[I1],
|
||||
tmp[I2],
|
||||
tmp[I3],
|
||||
tmp[I4],
|
||||
tmp[I5],
|
||||
tmp[I6],
|
||||
in,
|
||||
wei,
|
||||
out_device,
|
||||
nrepeat);
|
||||
}
|
||||
#endif
|
||||
|
||||
#if USE_CONV_FWD_V4R4R2_XDL_NCHW
|
||||
if(algo == ConvForwardAlgo::V4R4R2XDLNCHW)
|
||||
{
|
||||
|
||||
391
host/driver_offline/src/conv_fwd_driver_offline_nchwc.cpp
Normal file
391
host/driver_offline/src/conv_fwd_driver_offline_nchwc.cpp
Normal file
@@ -0,0 +1,391 @@
|
||||
#include <iostream>
|
||||
#include <numeric>
|
||||
#include <initializer_list>
|
||||
#include <cstdlib>
|
||||
#include <stdlib.h>
|
||||
#include <half.hpp>
|
||||
#include "config.hpp"
|
||||
#include "debug.hpp"
|
||||
#include "print.hpp"
|
||||
#include "device.hpp"
|
||||
#include "host_tensor.hpp"
|
||||
#include "host_tensor_generator.hpp"
|
||||
#include "conv_common.hpp"
|
||||
#include "device_tensor.hpp"
|
||||
#include "device_convolution_forward_implicit_gemm_v5r1_dlops_nc0hwc1_kc0yxc1_nk0hwk1.hpp"
|
||||
|
||||
#define USE_DYNAMIC_MODE 0
|
||||
#define USE_CONV_FWD_V5R1_NCHWC 1
|
||||
|
||||
enum ConvForwardAlgo
|
||||
{
|
||||
V5R1NCHWC // 0
|
||||
};
|
||||
|
||||
template <typename TIn,
|
||||
typename TWei,
|
||||
typename TOut,
|
||||
typename ConvStrides,
|
||||
typename ConvDilations,
|
||||
typename InLeftPads,
|
||||
typename InRightPads>
|
||||
void host_direct_convolution_nchwc(const Tensor<TIn>& in,
|
||||
const Tensor<TWei>& wei,
|
||||
const Tensor<TOut>& bias,
|
||||
Tensor<TOut>& out,
|
||||
const ConvStrides& conv_strides,
|
||||
const ConvDilations& conv_dilations,
|
||||
const InLeftPads& in_left_pads,
|
||||
const InRightPads&,
|
||||
const ck::ActivTypeEnum_t activ_type)
|
||||
{
|
||||
using namespace ck;
|
||||
|
||||
constexpr auto I0 = Number<0>{};
|
||||
constexpr auto I1 = Number<1>{};
|
||||
|
||||
auto f_nchw = [&](auto n, auto k0, auto ho, auto wo, auto k1) {
|
||||
double v = 0;
|
||||
const int k = k0 * out.mDesc.GetLengths()[4] + k1;
|
||||
|
||||
for(int c0 = 0; c0 < wei.mDesc.GetLengths()[1]; ++c0)
|
||||
{
|
||||
for(int y = 0; y < wei.mDesc.GetLengths()[2]; ++y)
|
||||
{
|
||||
int hi = ho * conv_strides[I0] + y * conv_dilations[I0] - in_left_pads[I0];
|
||||
for(int x = 0; x < wei.mDesc.GetLengths()[3]; ++x)
|
||||
{
|
||||
int wi = wo * conv_strides[I1] + x * conv_dilations[I1] - in_left_pads[I1];
|
||||
if(hi >= 0 && hi < in.mDesc.GetLengths()[2] && wi >= 0 &&
|
||||
wi < in.mDesc.GetLengths()[3])
|
||||
{
|
||||
for(int c1 = 0; c1 < wei.mDesc.GetLengths()[4]; ++c1)
|
||||
{
|
||||
v += static_cast<const double>(in(n, c0, hi, wi, c1)) *
|
||||
static_cast<const double>(wei(k, c0, y, x, c1));
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
v += bias(k0, k1);
|
||||
out(n, k0, ho, wo, k1) = activ(v, activ_type);
|
||||
};
|
||||
|
||||
make_ParallelTensorFunctor(f_nchw,
|
||||
out.mDesc.GetLengths()[0],
|
||||
out.mDesc.GetLengths()[1],
|
||||
out.mDesc.GetLengths()[2],
|
||||
out.mDesc.GetLengths()[3],
|
||||
out.mDesc.GetLengths()[4])(std::thread::hardware_concurrency());
|
||||
}
|
||||
|
||||
int main(int argc, char* argv[])
|
||||
{
|
||||
using namespace ck;
|
||||
|
||||
constexpr auto I0 = Number<0>{};
|
||||
constexpr auto I1 = Number<1>{};
|
||||
constexpr auto I2 = Number<2>{};
|
||||
constexpr auto I3 = Number<3>{};
|
||||
constexpr auto I4 = Number<4>{};
|
||||
constexpr auto I5 = Number<5>{};
|
||||
constexpr auto I6 = Number<6>{};
|
||||
|
||||
#if USE_DYNAMIC_MODE
|
||||
// dynamic mode
|
||||
if(argc != 23)
|
||||
{
|
||||
printf("arg1 to 5: algo, do_verification, init_method, do_log, nrepeat\n");
|
||||
printf("rest: N, K0, K1, C0, C1, Y, X, Hi, Wi, Sy, Sx, Dy, Dx, LeftPy, LeftPx, RightPy, "
|
||||
"RightPx\n");
|
||||
exit(1);
|
||||
}
|
||||
|
||||
constexpr ck::ActivTypeEnum_t activ_type = ActivTypeEnum_t::LeakyRelu;
|
||||
|
||||
const ConvForwardAlgo algo = static_cast<ConvForwardAlgo>(std::stoi(argv[1]));
|
||||
const bool do_verification = std::stoi(argv[2]);
|
||||
const int init_method = std::stoi(argv[3]);
|
||||
const bool do_log = std::stoi(argv[4]);
|
||||
const int nrepeat = std::stoi(argv[5]);
|
||||
|
||||
const index_t N = std::stoi(argv[6]);
|
||||
const index_t K0 = std::stoi(argv[7]);
|
||||
const index_t K1 = std::stoi(argv[8]);
|
||||
const index_t C0 = std::stoi(argv[9]);
|
||||
const index_t C1 = std::stoi(argv[10]);
|
||||
const index_t Y = std::stoi(argv[11]);
|
||||
const index_t X = std::stoi(argv[12]);
|
||||
const index_t Hi = std::stoi(argv[13]);
|
||||
const index_t Wi = std::stoi(argv[14]);
|
||||
|
||||
const index_t conv_stride_h = std::stoi(argv[15]);
|
||||
const index_t conv_stride_w = std::stoi(argv[16]);
|
||||
const index_t conv_dilation_h = std::stoi(argv[17]);
|
||||
const index_t conv_dilation_w = std::stoi(argv[18]);
|
||||
const index_t in_left_pad_h = std::stoi(argv[19]);
|
||||
const index_t in_left_pad_w = std::stoi(argv[20]);
|
||||
const index_t in_right_pad_h = std::stoi(argv[21]);
|
||||
const index_t in_right_pad_w = std::stoi(argv[22]);
|
||||
|
||||
const index_t YEff = (Y - 1) * conv_dilation_h + 1;
|
||||
const index_t XEff = (X - 1) * conv_dilation_w + 1;
|
||||
|
||||
const index_t Ho = (Hi + in_left_pad_h + in_right_pad_h - YEff) / conv_stride_h + 1;
|
||||
const index_t Wo = (Wi + in_left_pad_w + in_right_pad_w - XEff) / conv_stride_w + 1;
|
||||
#else
|
||||
// static mode
|
||||
if(argc < 6)
|
||||
{
|
||||
printf("arg1 to 5: algo, do_verification, init_method, do_log, nrepeat\n");
|
||||
exit(1);
|
||||
}
|
||||
|
||||
const ConvForwardAlgo algo = static_cast<ConvForwardAlgo>(std::stoi(argv[1]));
|
||||
|
||||
const bool do_verification = std::stoi(argv[2]);
|
||||
const int init_method = std::stoi(argv[3]);
|
||||
const bool do_log = std::stoi(argv[4]);
|
||||
const int nrepeat = std::stoi(argv[5]);
|
||||
|
||||
// constexpr ck::ActivTypeEnum_t activ_type = ActivTypeEnum_t::Sigmoid;
|
||||
constexpr ck::ActivTypeEnum_t activ_type = ActivTypeEnum_t::LeakyRelu;
|
||||
|
||||
#if 0
|
||||
constexpr auto N = Number<1>{};
|
||||
constexpr auto Hi = Number<1080>{};
|
||||
constexpr auto Wi = Number<1920>{};
|
||||
constexpr auto Y = Number<3>{};
|
||||
constexpr auto X = Number<3>{};
|
||||
constexpr auto C0 = Number<2>{};
|
||||
constexpr auto C1 = Number<8>{};
|
||||
constexpr auto K0 = Number<1>{};
|
||||
constexpr auto K1 = Number<4>{};
|
||||
#elif 1
|
||||
constexpr auto N = Number<1>{};
|
||||
constexpr auto Hi = Number<1080>{};
|
||||
constexpr auto Wi = Number<1920>{};
|
||||
constexpr auto Y = Number<3>{};
|
||||
constexpr auto X = Number<3>{};
|
||||
constexpr auto C0 = Number<2>{};
|
||||
constexpr auto C1 = Number<8>{};
|
||||
constexpr auto K0 = Number<2>{};
|
||||
constexpr auto K1 = Number<8>{};
|
||||
#elif 0
|
||||
constexpr auto N = Number<1>{};
|
||||
constexpr auto Hi = Number<1080>{};
|
||||
constexpr auto Wi = Number<1920>{};
|
||||
constexpr auto Y = Number<1>{};
|
||||
constexpr auto X = Number<1>{};
|
||||
constexpr auto C0 = Number<2>{};
|
||||
constexpr auto C1 = Number<8>{};
|
||||
constexpr auto K0 = Number<2>{};
|
||||
constexpr auto K1 = Number<8>{};
|
||||
#elif 0
|
||||
constexpr auto N = Number<1>{};
|
||||
constexpr auto Hi = Number<540>{};
|
||||
constexpr auto Wi = Number<960>{};
|
||||
constexpr auto Y = Number<1>{};
|
||||
constexpr auto X = Number<1>{};
|
||||
constexpr auto C0 = Number<2>{};
|
||||
constexpr auto C1 = Number<8>{};
|
||||
constexpr auto K0 = Number<2>{};
|
||||
constexpr auto K1 = Number<8>{};
|
||||
#elif 0
|
||||
constexpr auto N = Number<128>{};
|
||||
constexpr auto Hi = Number<270>{};
|
||||
constexpr auto Wi = Number<480>{};
|
||||
constexpr auto Y = Number<1>{};
|
||||
constexpr auto X = Number<1>{};
|
||||
constexpr auto C0 = Number<2>{};
|
||||
constexpr auto C1 = Number<8>{};
|
||||
constexpr auto K0 = Number<2>{};
|
||||
constexpr auto K1 = Number<8>{};
|
||||
#endif
|
||||
|
||||
constexpr auto conv_stride_h = I1;
|
||||
constexpr auto conv_stride_w = I1;
|
||||
constexpr auto conv_dilation_h = I1;
|
||||
constexpr auto conv_dilation_w = I1;
|
||||
|
||||
#if 1
|
||||
constexpr auto in_left_pad_h = I1;
|
||||
constexpr auto in_left_pad_w = I1;
|
||||
constexpr auto in_right_pad_h = I1;
|
||||
constexpr auto in_right_pad_w = I1;
|
||||
#else
|
||||
constexpr auto in_left_pad_h = I0;
|
||||
constexpr auto in_left_pad_w = I0;
|
||||
constexpr auto in_right_pad_h = I0;
|
||||
constexpr auto in_right_pad_w = I0;
|
||||
#endif
|
||||
|
||||
constexpr auto YEff = (Y - I1) * conv_dilation_h + I1;
|
||||
constexpr auto XEff = (X - I1) * conv_dilation_w + I1;
|
||||
|
||||
constexpr auto Ho = (Hi + in_left_pad_h + in_right_pad_h - YEff) / conv_stride_h + I1;
|
||||
constexpr auto Wo = (Wi + in_left_pad_w + in_right_pad_w - XEff) / conv_stride_w + I1;
|
||||
#endif
|
||||
|
||||
#if 0
|
||||
using in_data_t = float;
|
||||
using acc_data_t = float;
|
||||
using out_data_t = float;
|
||||
#elif 1
|
||||
using in_data_t = half_t;
|
||||
using acc_data_t = float;
|
||||
using out_data_t = half_t;
|
||||
#elif 1
|
||||
using in_data_t = int8_t;
|
||||
using acc_data_t = int32_t;
|
||||
using out_data_t = int8_t;
|
||||
#endif
|
||||
|
||||
std::vector<std::size_t> in_lengths_host(5), wei_lengths_host(5), out_lengths_host(5),
|
||||
bias_lengths_host(2);
|
||||
|
||||
in_lengths_host[0] = static_cast<std::size_t>(N);
|
||||
in_lengths_host[1] = static_cast<std::size_t>(C0);
|
||||
in_lengths_host[2] = static_cast<std::size_t>(Hi);
|
||||
in_lengths_host[3] = static_cast<std::size_t>(Wi);
|
||||
in_lengths_host[4] = static_cast<std::size_t>(C1);
|
||||
|
||||
wei_lengths_host[0] = static_cast<std::size_t>(K0 * K1);
|
||||
wei_lengths_host[1] = static_cast<std::size_t>(C0);
|
||||
wei_lengths_host[2] = static_cast<std::size_t>(Y);
|
||||
wei_lengths_host[3] = static_cast<std::size_t>(X);
|
||||
wei_lengths_host[4] = static_cast<std::size_t>(C1);
|
||||
|
||||
out_lengths_host[0] = static_cast<std::size_t>(N);
|
||||
out_lengths_host[1] = static_cast<std::size_t>(K0);
|
||||
out_lengths_host[2] = static_cast<std::size_t>(Ho);
|
||||
out_lengths_host[3] = static_cast<std::size_t>(Wo);
|
||||
out_lengths_host[4] = static_cast<std::size_t>(K1);
|
||||
|
||||
bias_lengths_host[0] = static_cast<std::size_t>(K0);
|
||||
bias_lengths_host[1] = static_cast<std::size_t>(K1);
|
||||
|
||||
Tensor<in_data_t> in(in_lengths_host);
|
||||
Tensor<in_data_t> wei(wei_lengths_host);
|
||||
Tensor<out_data_t> bias(bias_lengths_host);
|
||||
Tensor<out_data_t> out_host(out_lengths_host);
|
||||
Tensor<out_data_t> out_device(out_lengths_host);
|
||||
|
||||
ostream_HostTensorDescriptor(in.mDesc, std::cout << "in: ");
|
||||
ostream_HostTensorDescriptor(wei.mDesc, std::cout << "wei: ");
|
||||
ostream_HostTensorDescriptor(bias.mDesc, std::cout << "bias: ");
|
||||
ostream_HostTensorDescriptor(out_host.mDesc, std::cout << "out: ");
|
||||
|
||||
print_array("InLeftPads", make_tuple(in_left_pad_h, in_left_pad_w));
|
||||
print_array("InRightPads", make_tuple(in_right_pad_h, in_right_pad_w));
|
||||
print_array("ConvStrides", make_tuple(conv_stride_h, conv_stride_w));
|
||||
print_array("ConvDilations", make_tuple(conv_dilation_h, conv_dilation_w));
|
||||
|
||||
std::size_t num_thread = std::thread::hardware_concurrency();
|
||||
|
||||
switch(init_method)
|
||||
{
|
||||
case 0:
|
||||
// no initialization
|
||||
break;
|
||||
case 1:
|
||||
in.GenerateTensorValue(GeneratorTensor_1{}, num_thread);
|
||||
wei.GenerateTensorValue(GeneratorTensor_1{}, num_thread);
|
||||
bias.GenerateTensorValue(GeneratorTensor_1{}, num_thread);
|
||||
break;
|
||||
case 2:
|
||||
in.GenerateTensorValue(GeneratorTensor_1{}, num_thread);
|
||||
wei.GenerateTensorValue(GeneratorTensor_2{-5, 5}, num_thread);
|
||||
bias.GenerateTensorValue(GeneratorTensor_2{-5, 5}, num_thread);
|
||||
break;
|
||||
case 3:
|
||||
in.GenerateTensorValue(GeneratorTensor_2{-5, 5}, num_thread);
|
||||
wei.GenerateTensorValue(GeneratorTensor_1{}, num_thread);
|
||||
bias.GenerateTensorValue(GeneratorTensor_1{}, num_thread);
|
||||
break;
|
||||
case 4:
|
||||
in.GenerateTensorValue(GeneratorTensor_2{-5, 5}, num_thread);
|
||||
wei.GenerateTensorValue(GeneratorTensor_2{-5, 5}, num_thread);
|
||||
bias.GenerateTensorValue(GeneratorTensor_2{-5, 5}, num_thread);
|
||||
break;
|
||||
case 5:
|
||||
in.GenerateTensorValue(GeneratorTensor_3<float>{0.0, 1.0}, num_thread);
|
||||
wei.GenerateTensorValue(GeneratorTensor_3<float>{-0.5, 0.5}, num_thread);
|
||||
bias.GenerateTensorValue(GeneratorTensor_3<float>{-0.5, 0.5}, num_thread);
|
||||
break;
|
||||
default:
|
||||
in.GenerateTensorValue(GeneratorTensor_2{1, 5}, num_thread);
|
||||
|
||||
auto gen_wei = [](auto... is) {
|
||||
return GeneratorTensor_2{1, 5}(is...) * GeneratorTensor_Checkboard{}(is...);
|
||||
};
|
||||
wei.GenerateTensorValue(gen_wei, num_thread);
|
||||
}
|
||||
|
||||
auto f_make_for_device_nchwc = [&]() {
|
||||
const auto in_lengths_dev = make_tuple(N, C0, Hi, Wi, C1);
|
||||
const auto wei_lengths_dev = make_tuple(K0 * K1, C0, Y, X, C1);
|
||||
const auto out_lengths_dev = make_tuple(N, K0, Ho, Wo, K1);
|
||||
const auto conv_strides_dev = make_tuple(conv_stride_h, conv_stride_w);
|
||||
const auto conv_dilations_dev = make_tuple(conv_dilation_h, conv_dilation_w);
|
||||
const auto in_left_pads_dev = make_tuple(in_left_pad_h, in_left_pad_w);
|
||||
const auto in_right_pads_dev = make_tuple(in_right_pad_h, in_right_pad_w);
|
||||
|
||||
return make_tuple(in_lengths_dev,
|
||||
wei_lengths_dev,
|
||||
out_lengths_dev,
|
||||
conv_strides_dev,
|
||||
conv_dilations_dev,
|
||||
in_left_pads_dev,
|
||||
in_right_pads_dev);
|
||||
};
|
||||
|
||||
#if USE_CONV_FWD_V5R1_NCHWC
|
||||
if(algo == ConvForwardAlgo::V5R1NCHWC)
|
||||
{
|
||||
const auto tmp = f_make_for_device_nchwc();
|
||||
|
||||
device_convolution_forward_implicit_gemm_v5r1_dlops_nc0hwc1_kc0yxc1_nk0hwk1<in_data_t,
|
||||
acc_data_t,
|
||||
out_data_t,
|
||||
activ_type>(
|
||||
tmp[I0],
|
||||
tmp[I1],
|
||||
tmp[I2],
|
||||
tmp[I3],
|
||||
tmp[I4],
|
||||
tmp[I5],
|
||||
tmp[I6],
|
||||
in,
|
||||
wei,
|
||||
bias,
|
||||
out_device,
|
||||
nrepeat);
|
||||
}
|
||||
#endif
|
||||
|
||||
if(do_verification)
|
||||
{
|
||||
host_direct_convolution_nchwc(in,
|
||||
wei,
|
||||
bias,
|
||||
out_host,
|
||||
make_tuple(conv_stride_h, conv_stride_w),
|
||||
make_tuple(conv_dilation_h, conv_dilation_w),
|
||||
make_tuple(in_left_pad_h, in_left_pad_w),
|
||||
make_tuple(in_right_pad_h, in_right_pad_w),
|
||||
activ_type);
|
||||
|
||||
check_error(out_host, out_device);
|
||||
|
||||
if(do_log)
|
||||
{
|
||||
LogRangeAsType<float>(std::cout << "in : ", in.mData, ",") << std::endl;
|
||||
LogRangeAsType<float>(std::cout << "wei: ", wei.mData, ",") << std::endl;
|
||||
LogRangeAsType<float>(std::cout << "bias: ", bias.mData, ",") << std::endl;
|
||||
LogRangeAsType<float>(std::cout << "out_host : ", out_host.mData, ",") << std::endl;
|
||||
LogRangeAsType<float>(std::cout << "out_device: ", out_device.mData, ",") << std::endl;
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,413 @@
|
||||
#include <iostream>
|
||||
#include <numeric>
|
||||
#include <initializer_list>
|
||||
#include <cstdlib>
|
||||
#include <stdlib.h>
|
||||
#include <half.hpp>
|
||||
#include "config.hpp"
|
||||
#include "debug.hpp"
|
||||
#include "print.hpp"
|
||||
#include "device.hpp"
|
||||
#include "host_tensor.hpp"
|
||||
#include "host_tensor_generator.hpp"
|
||||
#include "conv_common.hpp"
|
||||
#include "device_tensor.hpp"
|
||||
#include "device_convolution_maxpool_forward_implicit_gemm_v5r1_dlops_nc0hwc1_kc0yxc1_nk0hwk1.hpp"
|
||||
|
||||
#define USE_DYNAMIC_MODE 0
|
||||
#define USE_CONV_FWD_V5R1_NCHWC 1
|
||||
|
||||
enum ConvForwardAlgo
|
||||
{
|
||||
V5R1NCHWC // 0
|
||||
};
|
||||
|
||||
template <typename TIn,
|
||||
typename TWei,
|
||||
typename TOut,
|
||||
typename ConvStrides,
|
||||
typename ConvDilations,
|
||||
typename InLeftPads,
|
||||
typename InRightPads>
|
||||
void host_direct_convolution_maxpool_nchwc(const Tensor<TIn>& in,
|
||||
const Tensor<TWei>& wei,
|
||||
const Tensor<TOut>& bias,
|
||||
Tensor<TOut>& out_host,
|
||||
Tensor<TOut>& max_host,
|
||||
const ConvStrides& conv_strides,
|
||||
const ConvDilations& conv_dilations,
|
||||
const InLeftPads& in_left_pads,
|
||||
const InRightPads&,
|
||||
const ck::ActivTypeEnum_t activ_type)
|
||||
{
|
||||
using namespace ck;
|
||||
|
||||
constexpr auto I0 = Number<0>{};
|
||||
constexpr auto I1 = Number<1>{};
|
||||
|
||||
auto f_nchw = [&](auto n, auto k0, auto ho, auto wo, auto k1) {
|
||||
double v = 0;
|
||||
auto k = k0 * out_host.mDesc.GetLengths()[4] + k1;
|
||||
|
||||
for(int c0 = 0; c0 < wei.mDesc.GetLengths()[1]; ++c0)
|
||||
{
|
||||
for(int y = 0; y < wei.mDesc.GetLengths()[2]; ++y)
|
||||
{
|
||||
int hi = ho * conv_strides[I0] + y * conv_dilations[I0] - in_left_pads[I0];
|
||||
for(int x = 0; x < wei.mDesc.GetLengths()[3]; ++x)
|
||||
{
|
||||
int wi = wo * conv_strides[I1] + x * conv_dilations[I1] - in_left_pads[I1];
|
||||
if(hi >= 0 && hi < in.mDesc.GetLengths()[2] && wi >= 0 &&
|
||||
wi < in.mDesc.GetLengths()[3])
|
||||
{
|
||||
for(int c1 = 0; c1 < wei.mDesc.GetLengths()[4]; ++c1)
|
||||
{
|
||||
v += static_cast<const double>(in(n, c0, hi, wi, c1)) *
|
||||
static_cast<const double>(wei(k, c0, y, x, c1));
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
v += bias(k0, k1);
|
||||
v = activ(v, activ_type);
|
||||
|
||||
out_host(n, k0, ho, wo, k1) = v;
|
||||
};
|
||||
|
||||
make_ParallelTensorFunctor(f_nchw,
|
||||
out_host.mDesc.GetLengths()[0],
|
||||
out_host.mDesc.GetLengths()[1],
|
||||
out_host.mDesc.GetLengths()[2],
|
||||
out_host.mDesc.GetLengths()[3],
|
||||
out_host.mDesc.GetLengths()[4])(std::thread::hardware_concurrency());
|
||||
|
||||
auto maxpool_nchw = [&](auto n, auto k0, auto ho, auto wo, auto k1) {
|
||||
auto hx = ho * 2;
|
||||
auto wx = wo * 2;
|
||||
|
||||
auto v0 = out_host(n, k0, hx, wx, k1);
|
||||
auto v1 = out_host(n, k0, hx, wx + 1, k1);
|
||||
auto v2 = out_host(n, k0, hx + 1, wx, k1);
|
||||
auto v3 = out_host(n, k0, hx + 1, wx + 1, k1);
|
||||
|
||||
max_host(n, k0, ho, wo, k1) = std::max({v0, v1, v2, v3});
|
||||
};
|
||||
|
||||
make_ParallelTensorFunctor(maxpool_nchw,
|
||||
max_host.mDesc.GetLengths()[0],
|
||||
max_host.mDesc.GetLengths()[1],
|
||||
max_host.mDesc.GetLengths()[2],
|
||||
max_host.mDesc.GetLengths()[3],
|
||||
max_host.mDesc.GetLengths()[4])(std::thread::hardware_concurrency());
|
||||
}
|
||||
|
||||
int main(int argc, char* argv[])
|
||||
{
|
||||
using namespace ck;
|
||||
|
||||
constexpr auto I0 = Number<0>{};
|
||||
constexpr auto I1 = Number<1>{};
|
||||
constexpr auto I2 = Number<2>{};
|
||||
constexpr auto I3 = Number<3>{};
|
||||
constexpr auto I4 = Number<4>{};
|
||||
constexpr auto I5 = Number<5>{};
|
||||
constexpr auto I6 = Number<6>{};
|
||||
constexpr auto I7 = Number<7>{};
|
||||
|
||||
#if USE_DYNAMIC_MODE
|
||||
// dynamic mode
|
||||
if(argc != 23)
|
||||
{
|
||||
printf("arg1 to 5: algo, do_verification, init_method, do_log, nrepeat\n");
|
||||
printf("rest: N, K0, K1, C0, C1, Y, X, Hi, Wi, Sy, Sx, Dy, Dx, LeftPy, LeftPx, RightPy, "
|
||||
"RightPx\n");
|
||||
exit(1);
|
||||
}
|
||||
|
||||
constexpr ck::ActivTypeEnum_t activ_type = ActivTypeEnum_t::LeakyRelu;
|
||||
|
||||
const ConvForwardAlgo algo = static_cast<ConvForwardAlgo>(std::stoi(argv[1]));
|
||||
const bool do_verification = std::stoi(argv[2]);
|
||||
const int init_method = std::stoi(argv[3]);
|
||||
const bool do_log = std::stoi(argv[4]);
|
||||
const int nrepeat = std::stoi(argv[5]);
|
||||
|
||||
const index_t N = std::stoi(argv[6]);
|
||||
const index_t K0 = std::stoi(argv[7]);
|
||||
const index_t K1 = std::stoi(argv[8]);
|
||||
const index_t C0 = std::stoi(argv[9]);
|
||||
const index_t C1 = std::stoi(argv[10]);
|
||||
const index_t Y = std::stoi(argv[11]);
|
||||
const index_t X = std::stoi(argv[12]);
|
||||
const index_t Hi = std::stoi(argv[13]);
|
||||
const index_t Wi = std::stoi(argv[14]);
|
||||
|
||||
const index_t conv_stride_h = std::stoi(argv[15]);
|
||||
const index_t conv_stride_w = std::stoi(argv[16]);
|
||||
const index_t conv_dilation_h = std::stoi(argv[17]);
|
||||
const index_t conv_dilation_w = std::stoi(argv[18]);
|
||||
const index_t in_left_pad_h = std::stoi(argv[19]);
|
||||
const index_t in_left_pad_w = std::stoi(argv[20]);
|
||||
const index_t in_right_pad_h = std::stoi(argv[21]);
|
||||
const index_t in_right_pad_w = std::stoi(argv[22]);
|
||||
|
||||
const index_t YEff = (Y - 1) * conv_dilation_h + 1;
|
||||
const index_t XEff = (X - 1) * conv_dilation_w + 1;
|
||||
|
||||
const index_t Ho = (Hi + in_left_pad_h + in_right_pad_h - YEff) / conv_stride_h + 1;
|
||||
const index_t Wo = (Wi + in_left_pad_w + in_right_pad_w - XEff) / conv_stride_w + 1;
|
||||
|
||||
const index_t Ho_2 = Ho / 2;
|
||||
const index_t Wo_2 = Wo / 2;
|
||||
#else
|
||||
// static mode
|
||||
if(argc < 6)
|
||||
{
|
||||
printf("arg1 to 5: algo, do_verification, init_method, do_log, nrepeat\n");
|
||||
exit(1);
|
||||
}
|
||||
|
||||
const ConvForwardAlgo algo = static_cast<ConvForwardAlgo>(std::stoi(argv[1]));
|
||||
|
||||
const bool do_verification = std::stoi(argv[2]);
|
||||
const int init_method = std::stoi(argv[3]);
|
||||
const bool do_log = std::stoi(argv[4]);
|
||||
const int nrepeat = std::stoi(argv[5]);
|
||||
|
||||
constexpr ck::ActivTypeEnum_t activ_type = ActivTypeEnum_t::LeakyRelu;
|
||||
|
||||
#if 1
|
||||
constexpr auto N = Number<1>{};
|
||||
constexpr auto Hi = Number<1080>{};
|
||||
constexpr auto Wi = Number<1920>{};
|
||||
constexpr auto Y = Number<3>{};
|
||||
constexpr auto X = Number<3>{};
|
||||
constexpr auto C0 = Number<2>{};
|
||||
constexpr auto C1 = Number<8>{};
|
||||
constexpr auto K0 = Number<2>{};
|
||||
constexpr auto K1 = Number<8>{};
|
||||
#elif 0
|
||||
constexpr auto N = Number<1>{};
|
||||
constexpr auto Hi = Number<1080>{};
|
||||
constexpr auto Wi = Number<1920>{};
|
||||
constexpr auto Y = Number<3>{};
|
||||
constexpr auto X = Number<3>{};
|
||||
constexpr auto C0 = Number<3>{};
|
||||
constexpr auto C1 = Number<4>{};
|
||||
constexpr auto K0 = Number<2>{};
|
||||
constexpr auto K1 = Number<8>{};
|
||||
#elif 0
|
||||
constexpr auto N = Number<1>{};
|
||||
constexpr auto Hi = Number<540>{};
|
||||
constexpr auto Wi = Number<960>{};
|
||||
constexpr auto Y = Number<3>{};
|
||||
constexpr auto X = Number<3>{};
|
||||
constexpr auto C0 = Number<2>{};
|
||||
constexpr auto C1 = Number<8>{};
|
||||
constexpr auto K0 = Number<2>{};
|
||||
constexpr auto K1 = Number<8>{};
|
||||
#elif 0
|
||||
constexpr auto N = Number<128>{};
|
||||
constexpr auto Hi = Number<270>{};
|
||||
constexpr auto Wi = Number<480>{};
|
||||
constexpr auto Y = Number<3>{};
|
||||
constexpr auto X = Number<3>{};
|
||||
constexpr auto C0 = Number<2>{};
|
||||
constexpr auto C1 = Number<8>{};
|
||||
constexpr auto K0 = Number<2>{};
|
||||
constexpr auto K1 = Number<8>{};
|
||||
#endif
|
||||
|
||||
constexpr auto conv_stride_h = I1;
|
||||
constexpr auto conv_stride_w = I1;
|
||||
constexpr auto conv_dilation_h = I1;
|
||||
constexpr auto conv_dilation_w = I1;
|
||||
constexpr auto in_left_pad_h = I1;
|
||||
constexpr auto in_left_pad_w = I1;
|
||||
constexpr auto in_right_pad_h = I1;
|
||||
constexpr auto in_right_pad_w = I1;
|
||||
|
||||
constexpr auto YEff = (Y - I1) * conv_dilation_h + I1;
|
||||
constexpr auto XEff = (X - I1) * conv_dilation_w + I1;
|
||||
|
||||
constexpr auto Ho = (Hi + in_left_pad_h + in_right_pad_h - YEff) / conv_stride_h + I1;
|
||||
constexpr auto Wo = (Wi + in_left_pad_w + in_right_pad_w - XEff) / conv_stride_w + I1;
|
||||
|
||||
constexpr auto Ho_2 = Number<Ho / 2>{};
|
||||
constexpr auto Wo_2 = Number<Wo / 2>{};
|
||||
|
||||
#endif
|
||||
|
||||
#if 0
|
||||
using in_data_t = float;
|
||||
using acc_data_t = float;
|
||||
using out_data_t = float;
|
||||
#elif 1
|
||||
using in_data_t = half_t;
|
||||
using acc_data_t = float;
|
||||
using out_data_t = half_t;
|
||||
#elif 1
|
||||
using in_data_t = int8_t;
|
||||
using acc_data_t = int32_t;
|
||||
using out_data_t = int8_t;
|
||||
#endif
|
||||
|
||||
std::vector<std::size_t> in_lengths_host(5), wei_lengths_host(5), out_lengths_host(5),
|
||||
max_lengths_host(5), bias_lengths_host(2);
|
||||
|
||||
in_lengths_host[0] = static_cast<std::size_t>(N);
|
||||
in_lengths_host[1] = static_cast<std::size_t>(C0);
|
||||
in_lengths_host[2] = static_cast<std::size_t>(Hi);
|
||||
in_lengths_host[3] = static_cast<std::size_t>(Wi);
|
||||
in_lengths_host[4] = static_cast<std::size_t>(C1);
|
||||
|
||||
wei_lengths_host[0] = static_cast<std::size_t>(K0 * K1);
|
||||
wei_lengths_host[1] = static_cast<std::size_t>(C0);
|
||||
wei_lengths_host[2] = static_cast<std::size_t>(Y);
|
||||
wei_lengths_host[3] = static_cast<std::size_t>(X);
|
||||
wei_lengths_host[4] = static_cast<std::size_t>(C1);
|
||||
|
||||
out_lengths_host[0] = static_cast<std::size_t>(N);
|
||||
out_lengths_host[1] = static_cast<std::size_t>(K0);
|
||||
out_lengths_host[2] = static_cast<std::size_t>(Ho);
|
||||
out_lengths_host[3] = static_cast<std::size_t>(Wo);
|
||||
out_lengths_host[4] = static_cast<std::size_t>(K1);
|
||||
|
||||
max_lengths_host[0] = static_cast<std::size_t>(N);
|
||||
max_lengths_host[1] = static_cast<std::size_t>(K0);
|
||||
max_lengths_host[2] = static_cast<std::size_t>(Ho_2);
|
||||
max_lengths_host[3] = static_cast<std::size_t>(Wo_2);
|
||||
max_lengths_host[4] = static_cast<std::size_t>(K1);
|
||||
|
||||
bias_lengths_host[0] = static_cast<std::size_t>(K0);
|
||||
bias_lengths_host[1] = static_cast<std::size_t>(K1);
|
||||
|
||||
Tensor<in_data_t> in(in_lengths_host);
|
||||
Tensor<in_data_t> wei(wei_lengths_host);
|
||||
Tensor<out_data_t> bias(bias_lengths_host);
|
||||
Tensor<out_data_t> out_device(out_lengths_host);
|
||||
Tensor<out_data_t> out_host(out_lengths_host);
|
||||
Tensor<in_data_t> max_device(max_lengths_host);
|
||||
Tensor<in_data_t> max_host(max_lengths_host);
|
||||
|
||||
ostream_HostTensorDescriptor(in.mDesc, std::cout << "in: ");
|
||||
ostream_HostTensorDescriptor(wei.mDesc, std::cout << "wei: ");
|
||||
|
||||
print_array("InLeftPads", make_tuple(in_left_pad_h, in_left_pad_w));
|
||||
print_array("InRightPads", make_tuple(in_right_pad_h, in_right_pad_w));
|
||||
print_array("ConvStrides", make_tuple(conv_stride_h, conv_stride_w));
|
||||
print_array("ConvDilations", make_tuple(conv_dilation_h, conv_dilation_w));
|
||||
|
||||
std::size_t num_thread = std::thread::hardware_concurrency();
|
||||
|
||||
switch(init_method)
|
||||
{
|
||||
case 0:
|
||||
// no initialization
|
||||
break;
|
||||
case 1:
|
||||
in.GenerateTensorValue(GeneratorTensor_1{}, num_thread);
|
||||
wei.GenerateTensorValue(GeneratorTensor_1{}, num_thread);
|
||||
break;
|
||||
case 2:
|
||||
in.GenerateTensorValue(GeneratorTensor_1{}, num_thread);
|
||||
wei.GenerateTensorValue(GeneratorTensor_2{-5, 5}, num_thread);
|
||||
break;
|
||||
case 3:
|
||||
in.GenerateTensorValue(GeneratorTensor_2{-5, 5}, num_thread);
|
||||
wei.GenerateTensorValue(GeneratorTensor_1{}, num_thread);
|
||||
break;
|
||||
case 4:
|
||||
in.GenerateTensorValue(GeneratorTensor_2{-5, 5}, num_thread);
|
||||
wei.GenerateTensorValue(GeneratorTensor_2{-5, 5}, num_thread);
|
||||
break;
|
||||
case 5:
|
||||
in.GenerateTensorValue(GeneratorTensor_3<float>{0.0, 1.0}, num_thread);
|
||||
wei.GenerateTensorValue(GeneratorTensor_3<float>{-0.5, 0.5}, num_thread);
|
||||
break;
|
||||
default:
|
||||
in.GenerateTensorValue(GeneratorTensor_2{1, 5}, num_thread);
|
||||
|
||||
auto gen_wei = [](auto... is) {
|
||||
return GeneratorTensor_2{1, 5}(is...) * GeneratorTensor_Checkboard{}(is...);
|
||||
};
|
||||
wei.GenerateTensorValue(gen_wei, num_thread);
|
||||
}
|
||||
|
||||
bias.GenerateTensorValue(GeneratorTensor_1{}, num_thread);
|
||||
|
||||
auto f_make_for_device_nchwc = [&]() {
|
||||
const auto in_lengths_dev = make_tuple(N, C0, Hi, Wi, C1);
|
||||
const auto wei_lengths_dev = make_tuple(K0 * K1, C0, Y, X, C1);
|
||||
const auto max_lengths_dev = make_tuple(N, K0, Ho_2, Wo_2, K1);
|
||||
const auto out_lengths_dev = make_tuple(N, K0, Ho, Wo, K1);
|
||||
const auto conv_strides_dev = make_tuple(conv_stride_h, conv_stride_w);
|
||||
const auto conv_dilations_dev = make_tuple(conv_dilation_h, conv_dilation_w);
|
||||
const auto in_left_pads_dev = make_tuple(in_left_pad_h, in_left_pad_w);
|
||||
const auto in_right_pads_dev = make_tuple(in_right_pad_h, in_right_pad_w);
|
||||
|
||||
return make_tuple(in_lengths_dev,
|
||||
wei_lengths_dev,
|
||||
max_lengths_dev,
|
||||
out_lengths_dev,
|
||||
conv_strides_dev,
|
||||
conv_dilations_dev,
|
||||
in_left_pads_dev,
|
||||
in_right_pads_dev);
|
||||
};
|
||||
|
||||
#if USE_CONV_FWD_V5R1_NCHWC
|
||||
if(algo == ConvForwardAlgo::V5R1NCHWC)
|
||||
{
|
||||
const auto tmp = f_make_for_device_nchwc();
|
||||
|
||||
device_convolution_maxpool_forward_implicit_gemm_v5r1_dlops_nc0hwc1_kc0yxc1_nk0hwk1<
|
||||
in_data_t,
|
||||
acc_data_t,
|
||||
out_data_t,
|
||||
activ_type>(tmp[I0], // in_lengths_dev
|
||||
tmp[I1], // wei_lengths_dev
|
||||
tmp[I2], // max_lengths_dev
|
||||
tmp[I3], // out_lengths_dev
|
||||
tmp[I4], // conv_strides_dev
|
||||
tmp[I5], // conv_dilations_dev
|
||||
tmp[I6], // in_left_pads_dev
|
||||
tmp[I7], // in_right_pads_dev
|
||||
in,
|
||||
wei,
|
||||
bias,
|
||||
out_device,
|
||||
max_device,
|
||||
nrepeat);
|
||||
}
|
||||
#endif
|
||||
|
||||
if(do_verification)
|
||||
{
|
||||
host_direct_convolution_maxpool_nchwc(in,
|
||||
wei,
|
||||
bias,
|
||||
out_host,
|
||||
max_host,
|
||||
make_tuple(conv_stride_h, conv_stride_w),
|
||||
make_tuple(conv_dilation_h, conv_dilation_w),
|
||||
make_tuple(in_left_pad_h, in_left_pad_w),
|
||||
make_tuple(in_right_pad_h, in_right_pad_w),
|
||||
activ_type);
|
||||
|
||||
check_error(out_host, out_device);
|
||||
check_error(max_host, max_device);
|
||||
|
||||
if(do_log)
|
||||
{
|
||||
// LogRangeAsType<float>(std::cout << "in : ", in.mData, ",") << std::endl;
|
||||
// LogRangeAsType<float>(std::cout << "wei: ", wei.mData, ",") << std::endl;
|
||||
// LogRangeAsType<float>(std::cout << "out_device: ", out_device.mData, ",") <<
|
||||
// std::endl;
|
||||
LogRangeAsType<float>(std::cout << "max_host: ", max_host.mData, ",") << std::endl;
|
||||
LogRangeAsType<float>(std::cout << "max_device: ", max_device.mData, ",") << std::endl;
|
||||
}
|
||||
}
|
||||
}
|
||||
Reference in New Issue
Block a user