Files
composable_kernel/host/host_tensor/include/host_conv.hpp
Qianfeng b13f7b1861 Reduction in Composable Kernel (#82)
* Initial adding of generic reduction

* Initial adding of generic reduction ...

* Updates to make compiling done

* clang-format all files

* clang-format some files again

* Renaming in profiler/include/profile_reduce.hpp

* Updates and make BlockWise cases passed

* Updates and make ThreadWise and MultiBlockTwoCall cases passed

* Remove the support for MUL and NORM1 reduceOp from the profiler and the device instances

* Change to replace the dim0_max_vector_size/dim1_max_vector_size template argument in the device reduce classes

* format

* adding pooling

* added max and average pooling

* comment out cout and kernel timing

* Tiny simplification in profiler/reduce_profiler.cpp

* Add example for reduce_blockwise

* Tiny updates

* Change to pass the ElementWiseOp from device layer to kernel

* Fix the vectorDim and vectorSize in Device layer

* Enable vector load on both dim0 and dim1 for Threadwise method

* Tiny updates

* Change to let the user to pass the preUnaryOp and posUnaryOp

* Make pooling example work

* split device_reduce_instance into two libraries

* Tiny update

* Replace nanPropaOpt enum by boolean propagate_nan

* Simplification in DeviceReduce layer codes

* update build

* Change to clarify the difference between ck::half_t and half_float::half

* Renaming in all the reduction codes

* Add VectorSize as template parameter for device layer

* Add BetaIsZero as kernel template and as AccDataType for alpha

* print

* Small updates for pooling

* Updates for host_generic_reduction for reference

* Update to make AVG pooling pass

* Update to make MAX pooling with indices output pass

* fix

* add OutDst vector store to threadwise reduction and pooling

* tweak

* turn off check_indices that caused build issue

* refactor pooling

* clean up

* turn off check_indices for building issue for php-compiler

* add more tile size for odd C

* tweak conv for odd C

* update script

* clean up elementwise op

* add hack in reduction_operator.hpp to avoid compile error. To fix it, need to use element_wise_op in reduction op

* Add OutVectorSize as device and kernel tunable, also update to Elementwise Operations

* Move reduce operator mapping to host layer file reduction_operator_mapping.hpp from reduction_operator.hpp

* Change to the unary operators

* Move the definitions of unary operations to element_wise_operation.hpp

* re-org files

* Refine in device interfaces and multiblock kernels

* Split the reduction configurations into instances for specific methods

* Update in getTypeString() of device pool2d

* Renaming in host and kernel

* Tiny update in profiler/src/profiler.cpp

* Uncomment in device_operation/CMakeLists.txt to enable the building of all operations

* Make check_indices a templated function to remove some linking issue

* Renaming in the profiler reduce module

* Add support for double Reduction (but disable MultiblockAtomicAdd for double)

* Tiny correction of literal string

* Rename DevicePoolFwd to DevicePool2dFwd

* Split device_reduce_instance_xxx.cpp files according to the data types to speed up compiling

* Add comments for lists of configurations, lists of instances and references of add_reduce_instances_xxx

* Remove un-used header file gridwise_generic_reduction_wrapper_common.hpp

* Renaming and refining in the Reduction codes

* Tiny change in the unary operators

* Renaming symbols and files

* Renaming symbols in the kernels

* Move kernel kernel_set_buffer_value to separate file

* Add IndexDataType template parameter for kernels and use int32_t as index data type in device layer

* Tiny update in the kernels

* Remove definition of sqrtf()/isnan()/abs() for half_t due to some ADL issue

* Simplify a helper function in device layer

* Tiny adjustment in testing data initialization

* Renaming in kernel/device/host

* Add two testing scripts for reduction

* Refine the Unary operators in element_wise_operation.hpp

* Update in the reduce profiler module

* Update to the reduction testing scripts

* reduce compile parallelism

* change CI docker to rocm5.0

* remove unused variables

* fix build

Co-authored-by: Chao Liu <chao.liu2@amd.com>

[ROCm/composable_kernel commit: e17c0d8008]
2022-03-05 16:46:51 -06:00

150 lines
6.1 KiB
C++

#pragma once
#include "host_tensor.hpp"
#include "conv_common.hpp"
template <typename TIn,
typename TWei,
typename TOut,
typename ConvStrides,
typename ConvDilations,
typename InLeftPads,
typename InRightPads>
void host_conv_nchw_kcyx_nkhw(const Tensor<TIn>& in,
const Tensor<TWei>& wei,
Tensor<TOut>& out,
const ConvStrides& conv_strides,
const ConvDilations& conv_dilations,
const InLeftPads& in_left_pads,
const InRightPads&)
{
constexpr auto I0 = ck::Number<0>{};
constexpr auto I1 = ck::Number<1>{};
auto f_nchw = [&](auto n, auto k, auto ho, auto wo) {
float v = 0;
for(int c = 0; c < wei.mDesc.GetLengths()[1]; ++c)
{
for(int y = 0; y < wei.mDesc.GetLengths()[2]; ++y)
{
int hi = ho * conv_strides[I0] + y * conv_dilations[I0] - in_left_pads[I0];
for(int x = 0; x < wei.mDesc.GetLengths()[3]; ++x)
{
int wi = wo * conv_strides[I1] + x * conv_dilations[I1] - in_left_pads[I1];
if(hi >= 0 && hi < in.mDesc.GetLengths()[2] && wi >= 0 &&
wi < in.mDesc.GetLengths()[3])
{
v += ck::type_convert<float>(in(n, c, hi, wi)) *
ck::type_convert<float>(wei(k, c, y, x));
}
}
}
}
out(n, k, ho, wo) = ck::type_convert<TOut>(v);
};
make_ParallelTensorFunctor(f_nchw,
out.mDesc.GetLengths()[0],
out.mDesc.GetLengths()[1],
out.mDesc.GetLengths()[2],
out.mDesc.GetLengths()[3])(std::thread::hardware_concurrency());
}
template <typename TIn,
typename TWei,
typename TOut,
typename ConvStrides,
typename ConvDilations,
typename InLeftPads,
typename InRightPads>
void host_conv3d_ndhwc_kzyxc_ndhwk(const Tensor<TIn>& in,
const Tensor<TWei>& wei,
Tensor<TOut>& out,
const ConvStrides& conv_strides,
const ConvDilations& conv_dilations,
const InLeftPads& in_left_pads,
const InRightPads&)
{
using namespace ck;
constexpr auto I0 = Number<0>{};
constexpr auto I1 = Number<1>{};
constexpr auto I2 = Number<2>{};
const auto Di = in.mDesc.GetLengths()[1];
const auto Hi = in.mDesc.GetLengths()[2];
const auto Wi = in.mDesc.GetLengths()[3];
const auto Z = wei.mDesc.GetLengths()[1];
const auto Y = wei.mDesc.GetLengths()[2];
const auto X = wei.mDesc.GetLengths()[3];
const auto C = wei.mDesc.GetLengths()[4];
auto f_ndhwc = [&](auto n, auto do_tmp, auto ho_tmp, auto wo_tmp, auto k) {
// do__ must be converted to signed integer, otherwise zmin might be wrong in cases
// negative values.
const int do_ = static_cast<int>(do_tmp);
const int ho = static_cast<int>(ho_tmp);
const int wo = static_cast<int>(wo_tmp);
const int zmin =
std::max(0,
(in_left_pads[I0] - do_ * conv_strides[I0] + conv_dilations[I0] - 1) /
conv_dilations[I0]);
const int ymin =
std::max(0,
(in_left_pads[I1] - ho * conv_strides[I1] + conv_dilations[I1] - 1) /
conv_dilations[I1]);
const int xmin =
std::max(0,
(in_left_pads[I2] - wo * conv_strides[I2] + conv_dilations[I2] - 1) /
conv_dilations[I2]);
const int zmax =
std::min(Z, (in_left_pads[I0] - do_ * conv_strides[I0] + Di) / conv_dilations[I0]);
const int ymax =
std::min(Y, (in_left_pads[I1] - ho * conv_strides[I1] + Hi) / conv_dilations[I1]);
const int xmax =
std::min(X, (in_left_pads[I2] - wo * conv_strides[I2] + Wi) / conv_dilations[I2]);
const int di_min = do_ * conv_strides[I0] + zmin * conv_dilations[I0] - in_left_pads[I0];
const int hi_min = ho * conv_strides[I1] + ymin * conv_dilations[I1] - in_left_pads[I1];
const int wi_min = wo * conv_strides[I2] + xmin * conv_dilations[I2] - in_left_pads[I2];
double v = 0;
const TIn* in_n = in.mData.data() + n * Di * Hi * Wi * C;
const TWei* wei_k = wei.mData.data() + k * Z * Y * X * C;
int di = di_min;
for(int z = zmin; z < zmax; ++z, di += conv_dilations[I0])
{
const TIn* in_n_di = in_n + di * Hi * Wi * C;
const TWei* wei_k_z = wei_k + z * Y * X * C;
int hi = hi_min;
for(int y = ymin; y < ymax; ++y, hi += conv_dilations[I1])
{
const TIn* in_n_di_hi = in_n_di + hi * Wi * C;
const TWei* wei_k_z_y = wei_k_z + y * X * C;
int wi = wi_min;
for(int x = xmin; x < xmax; ++x, wi += conv_dilations[I2])
{
const TIn* in_n_di_hi_wi = in_n_di_hi + wi * C;
const TWei* wei_k_z_y_x = wei_k_z_y + x * C;
for(int c = 0; c < C; ++c)
{
v += static_cast<const double>(in_n_di_hi_wi[c]) *
static_cast<const double>(wei_k_z_y_x[c]);
}
}
}
}
out(n, do_, ho, wo, k) = v;
};
make_ParallelTensorFunctor(f_ndhwc,
out.mDesc.GetLengths()[0],
out.mDesc.GetLengths()[1],
out.mDesc.GetLengths()[2],
out.mDesc.GetLengths()[3],
out.mDesc.GetLengths()[4])(std::thread::hardware_concurrency() - 4);
}