mirror of
https://github.com/ROCm/composable_kernel.git
synced 2026-05-27 08:25:46 +00:00
Add online compilation for dynamic kernels (#37)
* Add online-compiling facility
* Synchronize from fwd-v4r5 and implement host interfaces to call conv-fwd v4r4/v4r5 using on-line compiling method
* Tiny adjustment to time reporting
* Use object assignment to replace explicit bytes copying in the first kernel of v4r4/v4r5
* Use single thread to assign descriptor object to device memory
* Adjust to the workload assignment of the two kernels of v4r4 (experimental)
* Revert "Adjust to the workload assignment of the two kernels of v4r4 (experimental)"
This reverts commit eb38461456bb0c82b6c0d32cdd616e181907e20c.
* Update to make constexpr for generating descriptor types in kernel 2 of dynamic conv-fwd v4r4
* Update to dynamic conv-fwd v4r4 online-compiling
* Update to dynamic conv-fwd v4r5 online-compiling (result not accurate)
* Tiny update to driver/CMakeLists.txt
* clang-format
* Tiny comments change
* Add env OLC_DUMP_SAVE_TMP_DIR to support saving of temperary dir
* Fwd v4r5 olc perf (#39)
* added hip-clang flags that fix perf issue of online compilation
* fix bug for olc fwd-v4r5-nchw
* Move constexpr and type reference statements out of the function body in conv-fwd v4r4/v4r5 kernel wrapper
* Remove printing in hip_build_utils.cpp
* Update to root CMakeLists.txt
* Revert "Move constexpr and type reference statements out of the function body in conv-fwd v4r4/v4r5 kernel wrapper"
This reverts commit 3d2c5d8ecdd8298b72d127110500ed5b38d9835c.
Co-authored-by: Chao Liu <chao.liu2@amd.com>
Co-authored-by: Chao Liu <lc.roy86@gmail.com>
Co-authored-by: root <root@dc-smc-18.amd.com>
[ROCm/composable_kernel commit: 1685048a67]
This commit is contained in:
131
driver/include/conv_tunables.hpp
Normal file
131
driver/include/conv_tunables.hpp
Normal file
@@ -0,0 +1,131 @@
|
||||
#ifndef CONV_TUNABLES_HPP
|
||||
#define CONV_TUNABLES_HPP
|
||||
|
||||
#include "config.hpp"
|
||||
|
||||
struct tunable_dyn_conv_fwd_v4r4_nchw_kcyx_nkhw
|
||||
{
|
||||
ck::index_t BlockSize; // usually not tunable
|
||||
|
||||
ck::index_t MPerBlock;
|
||||
ck::index_t NPerBlock;
|
||||
ck::index_t KPerBlock;
|
||||
|
||||
ck::index_t M1PerThread;
|
||||
ck::index_t N1PerThread;
|
||||
ck::index_t KPerThread;
|
||||
|
||||
ck::index_t M1N1ThreadClusterM10;
|
||||
ck::index_t M1N1ThreadClusterN10;
|
||||
ck::index_t M1N1ThreadClusterM11;
|
||||
ck::index_t M1N1ThreadClusterN11;
|
||||
|
||||
std::array<ck::index_t, 3> ABlockTransferThreadSliceLengths_K_M0_M1;
|
||||
std::array<ck::index_t, 3> ABlockTransferThreadClusterLengths_K_M0_M1;
|
||||
std::array<ck::index_t, 3> ABlockTransferThreadClusterArrangeOrder;
|
||||
std::array<ck::index_t, 3> ABlockTransferSrcAccessOrder;
|
||||
ck::index_t ABlockTransferSrcVectorDim;
|
||||
ck::index_t ABlockTransferSrcScalarPerVector;
|
||||
ck::index_t ABlockTransferDstScalarPerVector_M1;
|
||||
bool AThreadTransferSrcResetCoordinateAfterRun;
|
||||
|
||||
std::array<ck::index_t, 3> BBlockTransferThreadSliceLengths_K_N0_N1;
|
||||
std::array<ck::index_t, 3> BBlockTransferThreadClusterLengths_K_N0_N1;
|
||||
std::array<ck::index_t, 3> BBlockTransferThreadClusterArrangeOrder;
|
||||
std::array<ck::index_t, 3> BBlockTransferSrcAccessOrder;
|
||||
ck::index_t BBlockTransferSrcVectorDim;
|
||||
ck::index_t BBlockTransferSrcScalarPerVector;
|
||||
ck::index_t BBlockTransferDstScalarPerVector_N1;
|
||||
bool BThreadTransferSrcResetCoordinateAfterRun;
|
||||
|
||||
std::array<ck::index_t, 6> CThreadTransferSrcDstAccessOrder;
|
||||
ck::index_t CThreadTransferSrcDstVectorDim;
|
||||
ck::index_t CThreadTransferDstScalarPerVector;
|
||||
};
|
||||
|
||||
static tunable_dyn_conv_fwd_v4r4_nchw_kcyx_nkhw default_tunable_dyn_conv_fwd_v4r4_nchw_kcyx_nkhw = {
|
||||
256, 128, 128, 8, 4, 4, 1,
|
||||
8, 8, 2, 2, {4, 1, 1}, {2, 1, 128}, {2, 1, 0},
|
||||
{2, 1, 0}, 0, 4, 1, false, {4, 1, 1}, {2, 1, 128},
|
||||
{0, 1, 2}, {0, 1, 2}, 2, 1, 1, false, {3, 4, 5, 0, 1, 2},
|
||||
5, 1};
|
||||
|
||||
struct tunable_dyn_conv_fwd_v4r5_nchw_kcyx_nkhw
|
||||
{
|
||||
ck::index_t BlockSize;
|
||||
|
||||
ck::index_t GM1PerBlockGM11;
|
||||
ck::index_t GN1PerBlockGN11;
|
||||
ck::index_t KPerBlock;
|
||||
|
||||
ck::index_t M1PerThread;
|
||||
ck::index_t N1PerThread;
|
||||
ck::index_t KPerThread;
|
||||
|
||||
ck::index_t M1N1ThreadClusterM10;
|
||||
ck::index_t M1N1ThreadClusterN10;
|
||||
ck::index_t M1N1ThreadClusterM11;
|
||||
ck::index_t M1N1ThreadClusterN11;
|
||||
|
||||
std::array<ck::index_t, 4> ABlockTransferThreadSliceLengths_GK_GM0_GM10_GM11;
|
||||
std::array<ck::index_t, 4> ABlockTransferThreadClusterLengths_GK_GM0_GM10_GM11;
|
||||
std::array<ck::index_t, 4> ABlockTransferThreadClusterArrangeOrder;
|
||||
std::array<ck::index_t, 4> ABlockTransferSrcAccessOrder;
|
||||
ck::index_t ABlockTransferSrcVectorDim;
|
||||
ck::index_t ABlockTransferSrcScalarPerVector;
|
||||
ck::index_t ABlockTransferDstScalarPerVector_GM11;
|
||||
bool AThreadTransferSrcResetCoordinateAfterRun;
|
||||
|
||||
std::array<ck::index_t, 4> BBlockTransferThreadSliceLengths_GK_GN0_GN10_GN11;
|
||||
std::array<ck::index_t, 4> BBlockTransferThreadClusterLengths_GK_GN0_GN10_GN11;
|
||||
std::array<ck::index_t, 4> BBlockTransferThreadClusterArrangeOrder;
|
||||
std::array<ck::index_t, 4> BBlockTransferSrcAccessOrder;
|
||||
ck::index_t BBlockTransferSrcVectorDim;
|
||||
ck::index_t BBlockTransferSrcScalarPerVector;
|
||||
ck::index_t BBlockTransferDstScalarPerVector_GN11;
|
||||
bool BThreadTransferSrcResetCoordinateAfterRun;
|
||||
|
||||
std::array<ck::index_t, 6> CThreadTransferSrcDstAccessOrder;
|
||||
ck::index_t CThreadTransferSrcDstVectorDim;
|
||||
ck::index_t CThreadTransferDstScalarPerVector;
|
||||
};
|
||||
|
||||
static tunable_dyn_conv_fwd_v4r5_nchw_kcyx_nkhw default_tunable_dyn_conv_fwd_v4r5_nchw_kcyx_nkhw = {
|
||||
256,
|
||||
128,
|
||||
32,
|
||||
8,
|
||||
4,
|
||||
4,
|
||||
1,
|
||||
2,
|
||||
2,
|
||||
8,
|
||||
8,
|
||||
{4, 1, 1, 1},
|
||||
{2, 1, 1, 128},
|
||||
{3, 2, 1, 0},
|
||||
{3, 2, 1, 0},
|
||||
0,
|
||||
4,
|
||||
1,
|
||||
false,
|
||||
{1, 4, 1, 1},
|
||||
{8, 1, 1, 32},
|
||||
{0, 3, 2, 1},
|
||||
{0, 3, 2, 1},
|
||||
3,
|
||||
1,
|
||||
1,
|
||||
false,
|
||||
{3, 4, 5, 0, 1, 2},
|
||||
5,
|
||||
1};
|
||||
|
||||
static inline int
|
||||
conv_hw_out_size(int hw_in_size, int leftPad, int rightPad, int dilation, int yx_size, int stride)
|
||||
{
|
||||
return (hw_in_size + leftPad + rightPad - dilation * (yx_size - 1) - 1) / stride + 1;
|
||||
}
|
||||
|
||||
#endif
|
||||
@@ -0,0 +1,383 @@
|
||||
#include "device.hpp"
|
||||
#include "host_tensor.hpp"
|
||||
#include "dynamic_tensor_descriptor.hpp"
|
||||
#include "dynamic_tensor_descriptor_helper.hpp"
|
||||
#include "transform_forward_convolution_into_gemm_v4r4_nchw_kcyx_nkhw.hpp"
|
||||
|
||||
#include "olc_driver_common.hpp"
|
||||
#include "conv_tunables.hpp"
|
||||
|
||||
#include "handle.hpp"
|
||||
|
||||
namespace detail_dyn_conv_fwd_v4r4_nchw_kcyx_nkhw {
|
||||
|
||||
template <typename TInWei, typename TAcc, typename TOut>
|
||||
static std::string get_network_config_string_from_types()
|
||||
{
|
||||
std::string out;
|
||||
|
||||
out += static_cast<char>(Driver::get_typeid_from_type<TInWei>()) +
|
||||
static_cast<char>(Driver::get_typeid_from_type<TAcc>()) +
|
||||
static_cast<char>(Driver::get_typeid_from_type<TOut>());
|
||||
|
||||
return (out);
|
||||
};
|
||||
|
||||
static std::string
|
||||
get_network_config_string_from_tunable(const tunable_dyn_conv_fwd_v4r4_nchw_kcyx_nkhw* pt)
|
||||
{
|
||||
std::string out("TUN_");
|
||||
|
||||
out += std::to_string(pt->BlockSize) + "_";
|
||||
|
||||
out += std::to_string(pt->MPerBlock) + "x" + std::to_string(pt->NPerBlock) + "x" +
|
||||
std::to_string(pt->KPerBlock) + "_";
|
||||
out += std::to_string(pt->M1PerThread) + "x" + std::to_string(pt->N1PerThread) + "x" +
|
||||
std::to_string(pt->KPerThread) + "_";
|
||||
out += std::to_string(pt->M1N1ThreadClusterM10) + "x" +
|
||||
std::to_string(pt->M1N1ThreadClusterN10) + "x" +
|
||||
std::to_string(pt->M1N1ThreadClusterM11) + "x" +
|
||||
std::to_string(pt->M1N1ThreadClusterN11) + "_";
|
||||
|
||||
out += std::to_string(pt->ABlockTransferThreadSliceLengths_K_M0_M1[0]) + "x" +
|
||||
std::to_string(pt->ABlockTransferThreadSliceLengths_K_M0_M1[1]) + "x" +
|
||||
std::to_string(pt->ABlockTransferThreadSliceLengths_K_M0_M1[2]) + "_";
|
||||
|
||||
out += std::to_string(pt->ABlockTransferThreadClusterLengths_K_M0_M1[0]) + "x" +
|
||||
std::to_string(pt->ABlockTransferThreadClusterLengths_K_M0_M1[1]) + "x" +
|
||||
std::to_string(pt->ABlockTransferThreadClusterLengths_K_M0_M1[2]) + "_";
|
||||
|
||||
out += std::to_string(pt->ABlockTransferThreadClusterArrangeOrder[0]) + "x" +
|
||||
std::to_string(pt->ABlockTransferThreadClusterArrangeOrder[1]) + "x" +
|
||||
std::to_string(pt->ABlockTransferThreadClusterArrangeOrder[2]) + "_";
|
||||
|
||||
out += std::to_string(pt->ABlockTransferSrcAccessOrder[0]) + "x" +
|
||||
std::to_string(pt->ABlockTransferSrcAccessOrder[1]) + "x" +
|
||||
std::to_string(pt->ABlockTransferSrcAccessOrder[2]) + "_";
|
||||
|
||||
out += std::to_string(pt->ABlockTransferSrcVectorDim) + "_";
|
||||
out += std::to_string(pt->ABlockTransferSrcScalarPerVector) + "_";
|
||||
out += std::to_string(pt->ABlockTransferDstScalarPerVector_M1) + "_";
|
||||
out += std::to_string(pt->AThreadTransferSrcResetCoordinateAfterRun) + "_";
|
||||
|
||||
out += std::to_string(pt->BBlockTransferThreadSliceLengths_K_N0_N1[0]) + "x" +
|
||||
std::to_string(pt->BBlockTransferThreadSliceLengths_K_N0_N1[1]) + "x" +
|
||||
std::to_string(pt->BBlockTransferThreadSliceLengths_K_N0_N1[2]) + "_";
|
||||
|
||||
out += std::to_string(pt->BBlockTransferThreadClusterLengths_K_N0_N1[0]) + "x" +
|
||||
std::to_string(pt->BBlockTransferThreadClusterLengths_K_N0_N1[1]) + "x" +
|
||||
std::to_string(pt->BBlockTransferThreadClusterLengths_K_N0_N1[2]) + "_";
|
||||
|
||||
out += std::to_string(pt->BBlockTransferThreadClusterArrangeOrder[0]) + "x" +
|
||||
std::to_string(pt->BBlockTransferThreadClusterArrangeOrder[1]) + "x" +
|
||||
std::to_string(pt->BBlockTransferThreadClusterArrangeOrder[2]) + "_";
|
||||
|
||||
out += std::to_string(pt->BBlockTransferSrcAccessOrder[0]) + "x" +
|
||||
std::to_string(pt->BBlockTransferSrcAccessOrder[1]) + "x" +
|
||||
std::to_string(pt->BBlockTransferSrcAccessOrder[2]) + "_";
|
||||
|
||||
out += std::to_string(pt->BBlockTransferSrcVectorDim) + "_";
|
||||
out += std::to_string(pt->BBlockTransferSrcScalarPerVector) + "_";
|
||||
out += std::to_string(pt->BBlockTransferDstScalarPerVector_N1) + "_";
|
||||
out += std::to_string(pt->BThreadTransferSrcResetCoordinateAfterRun) + "_";
|
||||
|
||||
out += std::to_string(pt->CThreadTransferSrcDstAccessOrder[0]) + "x" +
|
||||
std::to_string(pt->CThreadTransferSrcDstAccessOrder[1]) + "x" +
|
||||
std::to_string(pt->CThreadTransferSrcDstAccessOrder[2]) + "x" +
|
||||
std::to_string(pt->CThreadTransferSrcDstAccessOrder[3]) + "x" +
|
||||
std::to_string(pt->CThreadTransferSrcDstAccessOrder[4]) + "x" +
|
||||
std::to_string(pt->CThreadTransferSrcDstAccessOrder[5]) + "_";
|
||||
|
||||
out += std::to_string(pt->CThreadTransferSrcDstVectorDim) + "_";
|
||||
out += std::to_string(pt->CThreadTransferDstScalarPerVector);
|
||||
|
||||
return (out);
|
||||
};
|
||||
|
||||
template <typename TInWei, typename TAcc, typename TOut>
|
||||
static std::string get_definition_string_from_types()
|
||||
{
|
||||
std::string out;
|
||||
|
||||
out += " -DCK_PARAM_IN_WEI_DATATYPE=" + std::to_string(Driver::get_typeid_from_type<TInWei>()) +
|
||||
" -DCK_PARAM_CONV_COMPTYPE=" + std::to_string(Driver::get_typeid_from_type<TAcc>()) +
|
||||
" -DCK_PARAM_OUT_DATATYPE=" + std::to_string(Driver::get_typeid_from_type<TOut>());
|
||||
|
||||
return (out);
|
||||
};
|
||||
|
||||
static std::string
|
||||
get_definition_string_from_tunable(const tunable_dyn_conv_fwd_v4r4_nchw_kcyx_nkhw* pt)
|
||||
{
|
||||
std::string out;
|
||||
|
||||
out += " -DCK_PARAM_BlockSize=" + std::to_string(pt->BlockSize);
|
||||
|
||||
out += " -DCK_PARAM_MPerBlock=" + std::to_string(pt->MPerBlock) +
|
||||
" -DCK_PARAM_NPerBlock=" + std::to_string(pt->NPerBlock) +
|
||||
" -DCK_PARAM_KPerBlock=" + std::to_string(pt->KPerBlock);
|
||||
out += " -DCK_PARAM_M1PerThread=" + std::to_string(pt->M1PerThread) +
|
||||
" -DCK_PARAM_N1PerThread=" + std::to_string(pt->N1PerThread) +
|
||||
" -DCK_PARAM_KPerThread=" + std::to_string(pt->KPerThread);
|
||||
|
||||
out += " -DCK_PARAM_M1N1ThreadClusterM10=" + std::to_string(pt->M1N1ThreadClusterM10) +
|
||||
" -DCK_PARAM_M1N1ThreadClusterN10=" + std::to_string(pt->M1N1ThreadClusterN10) +
|
||||
" -DCK_PARAM_M1N1ThreadClusterM11=" + std::to_string(pt->M1N1ThreadClusterM11) +
|
||||
" -DCK_PARAM_M1N1ThreadClusterN11=" + std::to_string(pt->M1N1ThreadClusterN11);
|
||||
|
||||
out += " -DCK_PARAM_ABlockTransferThreadSliceLengths_K_M0_M1=" +
|
||||
std::to_string(pt->ABlockTransferThreadSliceLengths_K_M0_M1[0]) + "," +
|
||||
std::to_string(pt->ABlockTransferThreadSliceLengths_K_M0_M1[1]) + "," +
|
||||
std::to_string(pt->ABlockTransferThreadSliceLengths_K_M0_M1[2]);
|
||||
|
||||
out += " -DCK_PARAM_ABlockTransferThreadClusterLengths_K_M0_M1=" +
|
||||
std::to_string(pt->ABlockTransferThreadClusterLengths_K_M0_M1[0]) + "," +
|
||||
std::to_string(pt->ABlockTransferThreadClusterLengths_K_M0_M1[1]) + "," +
|
||||
std::to_string(pt->ABlockTransferThreadClusterLengths_K_M0_M1[2]);
|
||||
|
||||
out += " -DCK_PARAM_ABlockTransferThreadClusterArrangeOrder=" +
|
||||
std::to_string(pt->ABlockTransferThreadClusterArrangeOrder[0]) + "," +
|
||||
std::to_string(pt->ABlockTransferThreadClusterArrangeOrder[1]) + "," +
|
||||
std::to_string(pt->ABlockTransferThreadClusterArrangeOrder[2]);
|
||||
|
||||
out += " -DCK_PARAM_ABlockTransferSrcAccessOrder=" +
|
||||
std::to_string(pt->ABlockTransferSrcAccessOrder[0]) + "," +
|
||||
std::to_string(pt->ABlockTransferSrcAccessOrder[1]) + "," +
|
||||
std::to_string(pt->ABlockTransferSrcAccessOrder[2]);
|
||||
|
||||
out +=
|
||||
" -DCK_PARAM_ABlockTransferSrcVectorDim=" + std::to_string(pt->ABlockTransferSrcVectorDim);
|
||||
out += " -DCK_PARAM_ABlockTransferSrcScalarPerVector=" +
|
||||
std::to_string(pt->ABlockTransferSrcScalarPerVector);
|
||||
out += " -DCK_PARAM_ABlockTransferDstScalarPerVector_M1=" +
|
||||
std::to_string(pt->ABlockTransferDstScalarPerVector_M1);
|
||||
out += " -DCK_PARAM_AThreadTransferSrcResetCoordinateAfterRun=" +
|
||||
std::to_string(pt->AThreadTransferSrcResetCoordinateAfterRun);
|
||||
|
||||
out += " -DCK_PARAM_BBlockTransferThreadSliceLengths_K_N0_N1=" +
|
||||
std::to_string(pt->BBlockTransferThreadSliceLengths_K_N0_N1[0]) + "," +
|
||||
std::to_string(pt->BBlockTransferThreadSliceLengths_K_N0_N1[1]) + "," +
|
||||
std::to_string(pt->BBlockTransferThreadSliceLengths_K_N0_N1[2]);
|
||||
|
||||
out += " -DCK_PARAM_BBlockTransferThreadClusterLengths_K_N0_N1=" +
|
||||
std::to_string(pt->BBlockTransferThreadClusterLengths_K_N0_N1[0]) + "," +
|
||||
std::to_string(pt->BBlockTransferThreadClusterLengths_K_N0_N1[1]) + "," +
|
||||
std::to_string(pt->BBlockTransferThreadClusterLengths_K_N0_N1[2]);
|
||||
|
||||
out += " -DCK_PARAM_BBlockTransferThreadClusterArrangeOrder=" +
|
||||
std::to_string(pt->BBlockTransferThreadClusterArrangeOrder[0]) + "," +
|
||||
std::to_string(pt->BBlockTransferThreadClusterArrangeOrder[1]) + "," +
|
||||
std::to_string(pt->BBlockTransferThreadClusterArrangeOrder[2]);
|
||||
|
||||
out += " -DCK_PARAM_BBlockTransferSrcAccessOrder=" +
|
||||
std::to_string(pt->BBlockTransferSrcAccessOrder[0]) + "," +
|
||||
std::to_string(pt->BBlockTransferSrcAccessOrder[1]) + "," +
|
||||
std::to_string(pt->BBlockTransferSrcAccessOrder[2]);
|
||||
|
||||
out +=
|
||||
" -DCK_PARAM_BBlockTransferSrcVectorDim=" + std::to_string(pt->BBlockTransferSrcVectorDim);
|
||||
out += " -DCK_PARAM_BBlockTransferSrcScalarPerVector=" +
|
||||
std::to_string(pt->BBlockTransferSrcScalarPerVector);
|
||||
out += " -DCK_PARAM_BBlockTransferDstScalarPerVector_N1=" +
|
||||
std::to_string(pt->BBlockTransferDstScalarPerVector_N1);
|
||||
out += " -DCK_PARAM_BThreadTransferSrcResetCoordinateAfterRun=" +
|
||||
std::to_string(pt->BThreadTransferSrcResetCoordinateAfterRun);
|
||||
|
||||
out += " -DCK_PARAM_CThreadTransferSrcDstAccessOrder=" +
|
||||
std::to_string(pt->CThreadTransferSrcDstAccessOrder[0]) + "," +
|
||||
std::to_string(pt->CThreadTransferSrcDstAccessOrder[1]) + "," +
|
||||
std::to_string(pt->CThreadTransferSrcDstAccessOrder[2]) + "," +
|
||||
std::to_string(pt->CThreadTransferSrcDstAccessOrder[3]) + "," +
|
||||
std::to_string(pt->CThreadTransferSrcDstAccessOrder[4]) + "," +
|
||||
std::to_string(pt->CThreadTransferSrcDstAccessOrder[5]);
|
||||
|
||||
out += " -DCK_PARAM_CThreadTransferSrcDstVectorDim=" +
|
||||
std::to_string(pt->CThreadTransferSrcDstVectorDim);
|
||||
out += " -DCK_PARAM_CThreadTransferDstScalarPerVector=" +
|
||||
std::to_string(pt->CThreadTransferDstScalarPerVector);
|
||||
|
||||
return (out);
|
||||
};
|
||||
|
||||
} // namespace detail_dyn_conv_fwd_v4r4_nchw_kcyx_nkhw
|
||||
|
||||
template <typename TInWei,
|
||||
typename TAcc,
|
||||
typename TOut,
|
||||
typename InLengths,
|
||||
typename WeiLengths,
|
||||
typename OutLengths,
|
||||
typename ConvStrides,
|
||||
typename ConvDilations,
|
||||
typename InLeftPads,
|
||||
typename InRightPads>
|
||||
void device_dynamic_convolution_forward_implicit_gemm_v4r4_nchw_kcyx_nkhw_olc(
|
||||
olCompile::Handle* handle,
|
||||
const InLengths& in_n_c_hi_wi_lengths,
|
||||
const WeiLengths& wei_k_c_y_x_lengths,
|
||||
const OutLengths& out_n_k_ho_wo_lengths,
|
||||
const ConvStrides& conv_strides,
|
||||
const ConvDilations& conv_dilations,
|
||||
const InLeftPads& in_left_pads,
|
||||
const InRightPads& in_right_pads,
|
||||
const Tensor<TInWei>& in_n_c_hi_wi,
|
||||
const Tensor<TInWei>& wei_k_c_y_x,
|
||||
Tensor<TOut>& out_n_k_ho_wo,
|
||||
const tunable_dyn_conv_fwd_v4r4_nchw_kcyx_nkhw* tunable,
|
||||
ck::index_t nrepeat)
|
||||
{
|
||||
using namespace ck;
|
||||
using namespace detail_dyn_conv_fwd_v4r4_nchw_kcyx_nkhw;
|
||||
using size_t = std::size_t;
|
||||
|
||||
/////////////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
// The follow codes are only used for computing the grid_size, hasMainKBlockLoop,
|
||||
// hasDoubleTailKBlockLoop
|
||||
|
||||
constexpr auto I0 = Number<0>{};
|
||||
constexpr auto I1 = Number<1>{};
|
||||
constexpr auto I2 = Number<2>{};
|
||||
constexpr auto I3 = Number<3>{};
|
||||
|
||||
const auto in_n_c_hi_wi_desc =
|
||||
make_dynamic_naive_tensor_descriptor_packed_v2(in_n_c_hi_wi_lengths);
|
||||
const auto wei_k_c_y_x_desc =
|
||||
make_dynamic_naive_tensor_descriptor_packed_v2(wei_k_c_y_x_lengths);
|
||||
const auto out_n_k_ho_wo_desc =
|
||||
make_dynamic_naive_tensor_descriptor_packed_v2(out_n_k_ho_wo_lengths);
|
||||
|
||||
const auto descs =
|
||||
transform_forward_convolution_into_gemm_v4r4_nchw_kcyx_nkhw_pad(wei_k_c_y_x_desc,
|
||||
in_n_c_hi_wi_desc,
|
||||
out_n_k_ho_wo_desc,
|
||||
conv_strides,
|
||||
conv_dilations,
|
||||
in_left_pads,
|
||||
in_right_pads);
|
||||
const auto a_k_m_grid_desc = descs[I0];
|
||||
const auto c_m_n_grid_desc = descs[I2];
|
||||
const auto M = c_m_n_grid_desc.GetLength(I0);
|
||||
const auto N = c_m_n_grid_desc.GetLength(I1);
|
||||
const auto K = a_k_m_grid_desc.GetLength(I0);
|
||||
|
||||
const index_t grid_size = (M / tunable->MPerBlock) * (N / tunable->NPerBlock);
|
||||
const bool hasMainKBlockLoop = ((K + tunable->KPerBlock) / (2 * tunable->KPerBlock) > 1);
|
||||
const bool hasDoubleTailKBlockLoop = ((K / tunable->KPerBlock) % 2 == 0);
|
||||
/////////////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
// these buffers are usually provided by the user application
|
||||
DeviceMem in_n_c_hi_wi_dev_buf(sizeof(TInWei) * in_n_c_hi_wi.mDesc.GetElementSpace());
|
||||
DeviceMem wei_k_c_y_x_dev_buf(sizeof(TInWei) * wei_k_c_y_x.mDesc.GetElementSpace());
|
||||
DeviceMem out_n_k_ho_wo_dev_buf(sizeof(TOut) * out_n_k_ho_wo.mDesc.GetElementSpace());
|
||||
|
||||
in_n_c_hi_wi_dev_buf.ToDevice(in_n_c_hi_wi.mData.data());
|
||||
wei_k_c_y_x_dev_buf.ToDevice(wei_k_c_y_x.mData.data());
|
||||
out_n_k_ho_wo_dev_buf.ToDevice(out_n_k_ho_wo.mData.data());
|
||||
|
||||
// these are workspace buffers that should be expressed to the user by the corresponding
|
||||
// workspace API
|
||||
DeviceMem workspace_buf(4096);
|
||||
|
||||
void* a_k_m0_m1_grid_desc_dev_buf = workspace_buf.GetDeviceBuffer();
|
||||
void* b_k_n0_n1_grid_desc_dev_buf =
|
||||
static_cast<void*>(static_cast<unsigned char*>(workspace_buf.GetDeviceBuffer()) + 1024);
|
||||
void* c_m0_m10_m11_n0_n10_n11_grid_desc_dev_buf =
|
||||
static_cast<void*>(static_cast<unsigned char*>(workspace_buf.GetDeviceBuffer()) + 2048);
|
||||
void* c_blockid_to_m0_n0_block_cluster_adaptor_dev_buf =
|
||||
static_cast<void*>(static_cast<unsigned char*>(workspace_buf.GetDeviceBuffer()) + 3072);
|
||||
|
||||
const std::vector<size_t> vld = {static_cast<size_t>(tunable->BlockSize), 1, 1};
|
||||
const std::vector<size_t> vgd1 = {static_cast<size_t>(tunable->BlockSize), 1, 1};
|
||||
const std::vector<size_t> vgd2 = {static_cast<size_t>(grid_size * tunable->BlockSize), 1, 1};
|
||||
|
||||
std::string program_name = "dynamic_convolution_forward_implicit_gemm_v4r4_nchw_kcyx_nkhw.cpp";
|
||||
std::string algo_name = "implicit_gemm_conv_fwd_v4r4_nchw";
|
||||
|
||||
std::string param = " -std=c++17 ";
|
||||
std::string network_config;
|
||||
|
||||
param += get_definition_string_from_types<TInWei, TAcc, TOut>() + " " +
|
||||
get_definition_string_from_tunable(tunable) +
|
||||
" -DCK_PARAM_HAS_MAIN_KBLOCK_LOOP=" + std::to_string(hasMainKBlockLoop) +
|
||||
" -DCK_PARAM_HAS_DOUBLE_TAIL_KBLOCK_LOOP=" + std::to_string(hasDoubleTailKBlockLoop);
|
||||
network_config = get_network_config_string_from_types<TInWei, TAcc, TOut>() + "_" +
|
||||
get_network_config_string_from_tunable(tunable) + "_" +
|
||||
std::to_string(hasMainKBlockLoop) + "_" +
|
||||
std::to_string(hasDoubleTailKBlockLoop);
|
||||
|
||||
std::vector<float> kernel1_times;
|
||||
std::vector<float> kernel2_times;
|
||||
|
||||
for(index_t i = 0; i < nrepeat; ++i)
|
||||
{
|
||||
KernelTimer timer1, timer2;
|
||||
std::string kernel_name;
|
||||
|
||||
kernel_name = "dynamic_convolution_forward_implicit_gemm_v4r4_nchw_kcyx_nkhw_prepare";
|
||||
auto network_config_1 = network_config + "_1";
|
||||
|
||||
timer1.Start();
|
||||
handle->AddKernel(algo_name, network_config_1, program_name, kernel_name, vld, vgd1, param)(
|
||||
static_cast<index_t>(in_n_c_hi_wi_lengths[I0]),
|
||||
static_cast<index_t>(in_n_c_hi_wi_lengths[I1]),
|
||||
static_cast<index_t>(in_n_c_hi_wi_lengths[I2]),
|
||||
static_cast<index_t>(in_n_c_hi_wi_lengths[I3]),
|
||||
static_cast<index_t>(wei_k_c_y_x_lengths[I0]),
|
||||
static_cast<index_t>(wei_k_c_y_x_lengths[I2]),
|
||||
static_cast<index_t>(wei_k_c_y_x_lengths[I3]),
|
||||
conv_strides[I0],
|
||||
conv_strides[I1],
|
||||
conv_dilations[I0],
|
||||
conv_dilations[I1],
|
||||
in_left_pads[I0],
|
||||
in_left_pads[I1],
|
||||
in_right_pads[I0],
|
||||
in_right_pads[I1],
|
||||
a_k_m0_m1_grid_desc_dev_buf,
|
||||
b_k_n0_n1_grid_desc_dev_buf,
|
||||
c_m0_m10_m11_n0_n10_n11_grid_desc_dev_buf,
|
||||
c_blockid_to_m0_n0_block_cluster_adaptor_dev_buf);
|
||||
timer1.End();
|
||||
|
||||
kernel_name = "dynamic_convolution_forward_implicit_gemm_v4r4_nchw_kcyx_nkhw";
|
||||
auto network_config_2 = network_config + "_2";
|
||||
|
||||
timer2.Start();
|
||||
handle->AddKernel(algo_name, network_config_2, program_name, kernel_name, vld, vgd2, param)(
|
||||
reinterpret_cast<const TInWei*>(wei_k_c_y_x_dev_buf.GetDeviceBuffer()),
|
||||
reinterpret_cast<const TInWei*>(in_n_c_hi_wi_dev_buf.GetDeviceBuffer()),
|
||||
reinterpret_cast<TOut*>(out_n_k_ho_wo_dev_buf.GetDeviceBuffer()),
|
||||
(const void*)(a_k_m0_m1_grid_desc_dev_buf),
|
||||
(const void*)(b_k_n0_n1_grid_desc_dev_buf),
|
||||
(const void*)(c_m0_m10_m11_n0_n10_n11_grid_desc_dev_buf),
|
||||
(const void*)(c_blockid_to_m0_n0_block_cluster_adaptor_dev_buf));
|
||||
timer2.End();
|
||||
|
||||
kernel1_times.push_back(timer1.GetElapsedTime());
|
||||
kernel2_times.push_back(timer2.GetElapsedTime());
|
||||
}
|
||||
|
||||
{
|
||||
auto ave_time1 = Driver::get_effective_average(kernel1_times);
|
||||
auto ave_time2 = Driver::get_effective_average(kernel2_times);
|
||||
|
||||
const auto N = in_n_c_hi_wi_lengths[I0];
|
||||
const auto C = in_n_c_hi_wi_lengths[I1];
|
||||
|
||||
const auto K = out_n_k_ho_wo_lengths[I1];
|
||||
const auto Ho = out_n_k_ho_wo_lengths[I2];
|
||||
const auto Wo = out_n_k_ho_wo_lengths[I3];
|
||||
|
||||
const auto Y = wei_k_c_y_x_lengths[I2];
|
||||
const auto X = wei_k_c_y_x_lengths[I3];
|
||||
|
||||
float perf = (float)(std::size_t(2) * N * K * Ho * Wo * C * Y * X) /
|
||||
(std::size_t(1000) * 1000 * 1000) / (ave_time1 + ave_time2);
|
||||
|
||||
std::cout << "Average time : " << ave_time1 + ave_time2 << " ms(" << ave_time1 << ", "
|
||||
<< ave_time2 << "), " << perf << " TFlop/s" << std::endl;
|
||||
};
|
||||
|
||||
// copy result back to host
|
||||
out_n_k_ho_wo_dev_buf.FromDevice(out_n_k_ho_wo.mData.data());
|
||||
}
|
||||
@@ -0,0 +1,404 @@
|
||||
#include "device.hpp"
|
||||
#include "host_tensor.hpp"
|
||||
#include "dynamic_tensor_descriptor.hpp"
|
||||
#include "dynamic_tensor_descriptor_helper.hpp"
|
||||
#include "transform_forward_convolution_into_gemm_v4r5_nchw_kcyx_nkhw.hpp"
|
||||
|
||||
#include "olc_driver_common.hpp"
|
||||
#include "conv_tunables.hpp"
|
||||
|
||||
#include "handle.hpp"
|
||||
|
||||
namespace detail_dyn_conv_fwd_v4r5_nchw_kcyx_nkhw {
|
||||
|
||||
template <typename TInWei, typename TAcc, typename TOut>
|
||||
static std::string get_network_config_string_from_types()
|
||||
{
|
||||
std::string out;
|
||||
|
||||
out += static_cast<char>(Driver::get_typeid_from_type<TInWei>()) +
|
||||
static_cast<char>(Driver::get_typeid_from_type<TAcc>()) +
|
||||
static_cast<char>(Driver::get_typeid_from_type<TOut>());
|
||||
|
||||
return (out);
|
||||
};
|
||||
|
||||
static std::string
|
||||
get_network_config_string_from_tunable(const tunable_dyn_conv_fwd_v4r5_nchw_kcyx_nkhw* pt)
|
||||
{
|
||||
std::string out("TUN_");
|
||||
|
||||
out += std::to_string(pt->BlockSize) + "_";
|
||||
|
||||
out += std::to_string(pt->GM1PerBlockGM11) + "x" + std::to_string(pt->GN1PerBlockGN11) + "x" +
|
||||
std::to_string(pt->KPerBlock) + "_";
|
||||
out += std::to_string(pt->M1PerThread) + "x" + std::to_string(pt->N1PerThread) + "x" +
|
||||
std::to_string(pt->KPerThread) + "_";
|
||||
out += std::to_string(pt->M1N1ThreadClusterM10) + "x" +
|
||||
std::to_string(pt->M1N1ThreadClusterN10) + "x" +
|
||||
std::to_string(pt->M1N1ThreadClusterM11) + "x" +
|
||||
std::to_string(pt->M1N1ThreadClusterN11) + "_";
|
||||
|
||||
out += std::to_string(pt->ABlockTransferThreadSliceLengths_GK_GM0_GM10_GM11[0]) + "x" +
|
||||
std::to_string(pt->ABlockTransferThreadSliceLengths_GK_GM0_GM10_GM11[1]) + "x" +
|
||||
std::to_string(pt->ABlockTransferThreadSliceLengths_GK_GM0_GM10_GM11[2]) + "x" +
|
||||
std::to_string(pt->ABlockTransferThreadSliceLengths_GK_GM0_GM10_GM11[3]) + "_";
|
||||
|
||||
out += std::to_string(pt->ABlockTransferThreadClusterLengths_GK_GM0_GM10_GM11[0]) + "x" +
|
||||
std::to_string(pt->ABlockTransferThreadClusterLengths_GK_GM0_GM10_GM11[1]) + "x" +
|
||||
std::to_string(pt->ABlockTransferThreadClusterLengths_GK_GM0_GM10_GM11[2]) + "x" +
|
||||
std::to_string(pt->ABlockTransferThreadClusterLengths_GK_GM0_GM10_GM11[3]) + "_";
|
||||
|
||||
out += std::to_string(pt->ABlockTransferThreadClusterArrangeOrder[0]) + "x" +
|
||||
std::to_string(pt->ABlockTransferThreadClusterArrangeOrder[1]) + "x" +
|
||||
std::to_string(pt->ABlockTransferThreadClusterArrangeOrder[2]) + "x" +
|
||||
std::to_string(pt->ABlockTransferThreadClusterArrangeOrder[3]) + "_";
|
||||
|
||||
out += std::to_string(pt->ABlockTransferSrcAccessOrder[0]) + "x" +
|
||||
std::to_string(pt->ABlockTransferSrcAccessOrder[1]) + "x" +
|
||||
std::to_string(pt->ABlockTransferSrcAccessOrder[2]) + "x" +
|
||||
std::to_string(pt->ABlockTransferSrcAccessOrder[3]) + "_";
|
||||
|
||||
out += std::to_string(pt->ABlockTransferSrcVectorDim) + "_";
|
||||
out += std::to_string(pt->ABlockTransferSrcScalarPerVector) + "_";
|
||||
out += std::to_string(pt->ABlockTransferDstScalarPerVector_GM11) + "_";
|
||||
out += std::to_string(pt->AThreadTransferSrcResetCoordinateAfterRun) + "_";
|
||||
|
||||
out += std::to_string(pt->BBlockTransferThreadSliceLengths_GK_GN0_GN10_GN11[0]) + "x" +
|
||||
std::to_string(pt->BBlockTransferThreadSliceLengths_GK_GN0_GN10_GN11[1]) + "x" +
|
||||
std::to_string(pt->BBlockTransferThreadSliceLengths_GK_GN0_GN10_GN11[2]) + "x" +
|
||||
std::to_string(pt->BBlockTransferThreadSliceLengths_GK_GN0_GN10_GN11[3]);
|
||||
|
||||
out += std::to_string(pt->BBlockTransferThreadClusterLengths_GK_GN0_GN10_GN11[0]) + "x" +
|
||||
std::to_string(pt->BBlockTransferThreadClusterLengths_GK_GN0_GN10_GN11[1]) + "x" +
|
||||
std::to_string(pt->BBlockTransferThreadClusterLengths_GK_GN0_GN10_GN11[2]) + "x" +
|
||||
std::to_string(pt->BBlockTransferThreadClusterLengths_GK_GN0_GN10_GN11[3]) + "_";
|
||||
|
||||
out += std::to_string(pt->BBlockTransferThreadClusterArrangeOrder[0]) + "x" +
|
||||
std::to_string(pt->BBlockTransferThreadClusterArrangeOrder[1]) + "x" +
|
||||
std::to_string(pt->BBlockTransferThreadClusterArrangeOrder[2]) + "x" +
|
||||
std::to_string(pt->BBlockTransferThreadClusterArrangeOrder[3]) + "_";
|
||||
|
||||
out += std::to_string(pt->BBlockTransferSrcAccessOrder[0]) + "x" +
|
||||
std::to_string(pt->BBlockTransferSrcAccessOrder[1]) + "x" +
|
||||
std::to_string(pt->BBlockTransferSrcAccessOrder[2]) + "x" +
|
||||
std::to_string(pt->BBlockTransferSrcAccessOrder[3]) + "_";
|
||||
|
||||
out += std::to_string(pt->BBlockTransferSrcVectorDim) + "_";
|
||||
out += std::to_string(pt->BBlockTransferSrcScalarPerVector) + "_";
|
||||
out += std::to_string(pt->BBlockTransferDstScalarPerVector_GN11) + "_";
|
||||
out += std::to_string(pt->BThreadTransferSrcResetCoordinateAfterRun) + "_";
|
||||
|
||||
out += std::to_string(pt->CThreadTransferSrcDstAccessOrder[0]) + "x" +
|
||||
std::to_string(pt->CThreadTransferSrcDstAccessOrder[1]) + "x" +
|
||||
std::to_string(pt->CThreadTransferSrcDstAccessOrder[2]) + "x" +
|
||||
std::to_string(pt->CThreadTransferSrcDstAccessOrder[3]) + "x" +
|
||||
std::to_string(pt->CThreadTransferSrcDstAccessOrder[4]) + "x" +
|
||||
std::to_string(pt->CThreadTransferSrcDstAccessOrder[5]) + "_";
|
||||
|
||||
out += std::to_string(pt->CThreadTransferSrcDstVectorDim) + "_";
|
||||
out += std::to_string(pt->CThreadTransferDstScalarPerVector);
|
||||
|
||||
return (out);
|
||||
};
|
||||
|
||||
template <typename TInWei, typename TAcc, typename TOut>
|
||||
static std::string get_definition_string_from_types()
|
||||
{
|
||||
std::string out;
|
||||
|
||||
out += " -DCK_PARAM_IN_WEI_DATATYPE=" + std::to_string(Driver::get_typeid_from_type<TInWei>()) +
|
||||
" -DCK_PARAM_CONV_COMPTYPE=" + std::to_string(Driver::get_typeid_from_type<TAcc>()) +
|
||||
" -DCK_PARAM_OUT_DATATYPE=" + std::to_string(Driver::get_typeid_from_type<TOut>());
|
||||
|
||||
return (out);
|
||||
};
|
||||
|
||||
static std::string
|
||||
get_definition_string_from_tunable(const tunable_dyn_conv_fwd_v4r5_nchw_kcyx_nkhw* pt)
|
||||
{
|
||||
std::string out;
|
||||
|
||||
out += " -DCK_PARAM_BlockSize=" + std::to_string(pt->BlockSize);
|
||||
|
||||
out += " -DCK_PARAM_GM1PerBlockGM11=" + std::to_string(pt->GM1PerBlockGM11) +
|
||||
" -DCK_PARAM_GN1PerBlockGN11=" + std::to_string(pt->GN1PerBlockGN11) +
|
||||
" -DCK_PARAM_KPerBlock=" + std::to_string(pt->KPerBlock);
|
||||
out += " -DCK_PARAM_M1PerThread=" + std::to_string(pt->M1PerThread) +
|
||||
" -DCK_PARAM_N1PerThread=" + std::to_string(pt->N1PerThread) +
|
||||
" -DCK_PARAM_KPerThread=" + std::to_string(pt->KPerThread);
|
||||
|
||||
out += " -DCK_PARAM_M1N1ThreadClusterM10=" + std::to_string(pt->M1N1ThreadClusterM10) +
|
||||
" -DCK_PARAM_M1N1ThreadClusterN10=" + std::to_string(pt->M1N1ThreadClusterN10) +
|
||||
" -DCK_PARAM_M1N1ThreadClusterM11=" + std::to_string(pt->M1N1ThreadClusterM11) +
|
||||
" -DCK_PARAM_M1N1ThreadClusterN11=" + std::to_string(pt->M1N1ThreadClusterN11);
|
||||
|
||||
out += " -DCK_PARAM_ABlockTransferThreadSliceLengths_GK_GM0_GM10_GM11=" +
|
||||
std::to_string(pt->ABlockTransferThreadSliceLengths_GK_GM0_GM10_GM11[0]) + "," +
|
||||
std::to_string(pt->ABlockTransferThreadSliceLengths_GK_GM0_GM10_GM11[1]) + "," +
|
||||
std::to_string(pt->ABlockTransferThreadSliceLengths_GK_GM0_GM10_GM11[2]) + "," +
|
||||
std::to_string(pt->ABlockTransferThreadSliceLengths_GK_GM0_GM10_GM11[3]);
|
||||
|
||||
out += " -DCK_PARAM_ABlockTransferThreadClusterLengths_GK_GM0_GM10_GM11=" +
|
||||
std::to_string(pt->ABlockTransferThreadClusterLengths_GK_GM0_GM10_GM11[0]) + "," +
|
||||
std::to_string(pt->ABlockTransferThreadClusterLengths_GK_GM0_GM10_GM11[1]) + "," +
|
||||
std::to_string(pt->ABlockTransferThreadClusterLengths_GK_GM0_GM10_GM11[2]) + "," +
|
||||
std::to_string(pt->ABlockTransferThreadClusterLengths_GK_GM0_GM10_GM11[3]);
|
||||
|
||||
out += " -DCK_PARAM_ABlockTransferThreadClusterArrangeOrder=" +
|
||||
std::to_string(pt->ABlockTransferThreadClusterArrangeOrder[0]) + "," +
|
||||
std::to_string(pt->ABlockTransferThreadClusterArrangeOrder[1]) + "," +
|
||||
std::to_string(pt->ABlockTransferThreadClusterArrangeOrder[2]) + "," +
|
||||
std::to_string(pt->ABlockTransferThreadClusterArrangeOrder[3]);
|
||||
|
||||
out += " -DCK_PARAM_ABlockTransferSrcAccessOrder=" +
|
||||
std::to_string(pt->ABlockTransferSrcAccessOrder[0]) + "," +
|
||||
std::to_string(pt->ABlockTransferSrcAccessOrder[1]) + "," +
|
||||
std::to_string(pt->ABlockTransferSrcAccessOrder[2]) + "," +
|
||||
std::to_string(pt->ABlockTransferSrcAccessOrder[3]);
|
||||
|
||||
out +=
|
||||
" -DCK_PARAM_ABlockTransferSrcVectorDim=" + std::to_string(pt->ABlockTransferSrcVectorDim);
|
||||
out += " -DCK_PARAM_ABlockTransferSrcScalarPerVector=" +
|
||||
std::to_string(pt->ABlockTransferSrcScalarPerVector);
|
||||
out += " -DCK_PARAM_ABlockTransferDstScalarPerVector_GM11=" +
|
||||
std::to_string(pt->ABlockTransferDstScalarPerVector_GM11);
|
||||
out += " -DCK_PARAM_AThreadTransferSrcResetCoordinateAfterRun=" +
|
||||
std::to_string(pt->AThreadTransferSrcResetCoordinateAfterRun);
|
||||
|
||||
out += " -DCK_PARAM_BBlockTransferThreadSliceLengths_GK_GN0_GN10_GN11=" +
|
||||
std::to_string(pt->BBlockTransferThreadSliceLengths_GK_GN0_GN10_GN11[0]) + "," +
|
||||
std::to_string(pt->BBlockTransferThreadSliceLengths_GK_GN0_GN10_GN11[1]) + "," +
|
||||
std::to_string(pt->BBlockTransferThreadSliceLengths_GK_GN0_GN10_GN11[2]) + "," +
|
||||
std::to_string(pt->BBlockTransferThreadSliceLengths_GK_GN0_GN10_GN11[3]);
|
||||
|
||||
out += " -DCK_PARAM_BBlockTransferThreadClusterLengths_GK_GN0_GN10_GN11=" +
|
||||
std::to_string(pt->BBlockTransferThreadClusterLengths_GK_GN0_GN10_GN11[0]) + "," +
|
||||
std::to_string(pt->BBlockTransferThreadClusterLengths_GK_GN0_GN10_GN11[1]) + "," +
|
||||
std::to_string(pt->BBlockTransferThreadClusterLengths_GK_GN0_GN10_GN11[2]) + "," +
|
||||
std::to_string(pt->BBlockTransferThreadClusterLengths_GK_GN0_GN10_GN11[3]);
|
||||
|
||||
out += " -DCK_PARAM_BBlockTransferThreadClusterArrangeOrder=" +
|
||||
std::to_string(pt->BBlockTransferThreadClusterArrangeOrder[0]) + "," +
|
||||
std::to_string(pt->BBlockTransferThreadClusterArrangeOrder[1]) + "," +
|
||||
std::to_string(pt->BBlockTransferThreadClusterArrangeOrder[2]) + "," +
|
||||
std::to_string(pt->BBlockTransferThreadClusterArrangeOrder[3]);
|
||||
|
||||
out += " -DCK_PARAM_BBlockTransferSrcAccessOrder=" +
|
||||
std::to_string(pt->BBlockTransferSrcAccessOrder[0]) + "," +
|
||||
std::to_string(pt->BBlockTransferSrcAccessOrder[1]) + "," +
|
||||
std::to_string(pt->BBlockTransferSrcAccessOrder[2]) + "," +
|
||||
std::to_string(pt->BBlockTransferSrcAccessOrder[3]);
|
||||
|
||||
out +=
|
||||
" -DCK_PARAM_BBlockTransferSrcVectorDim=" + std::to_string(pt->BBlockTransferSrcVectorDim);
|
||||
out += " -DCK_PARAM_BBlockTransferSrcScalarPerVector=" +
|
||||
std::to_string(pt->BBlockTransferSrcScalarPerVector);
|
||||
out += " -DCK_PARAM_BBlockTransferDstScalarPerVector_GN11=" +
|
||||
std::to_string(pt->BBlockTransferDstScalarPerVector_GN11);
|
||||
out += " -DCK_PARAM_BThreadTransferSrcResetCoordinateAfterRun=" +
|
||||
std::to_string(pt->BThreadTransferSrcResetCoordinateAfterRun);
|
||||
|
||||
out += " -DCK_PARAM_CThreadTransferSrcDstAccessOrder=" +
|
||||
std::to_string(pt->CThreadTransferSrcDstAccessOrder[0]) + "," +
|
||||
std::to_string(pt->CThreadTransferSrcDstAccessOrder[1]) + "," +
|
||||
std::to_string(pt->CThreadTransferSrcDstAccessOrder[2]) + "," +
|
||||
std::to_string(pt->CThreadTransferSrcDstAccessOrder[3]) + "," +
|
||||
std::to_string(pt->CThreadTransferSrcDstAccessOrder[4]) + "," +
|
||||
std::to_string(pt->CThreadTransferSrcDstAccessOrder[5]);
|
||||
|
||||
out += " -DCK_PARAM_CThreadTransferSrcDstVectorDim=" +
|
||||
std::to_string(pt->CThreadTransferSrcDstVectorDim);
|
||||
out += " -DCK_PARAM_CThreadTransferDstScalarPerVector=" +
|
||||
std::to_string(pt->CThreadTransferDstScalarPerVector);
|
||||
|
||||
return (out);
|
||||
};
|
||||
|
||||
} // namespace detail_dyn_conv_fwd_v4r5_nchw_kcyx_nkhw
|
||||
|
||||
template <typename TInWei,
|
||||
typename TAcc,
|
||||
typename TOut,
|
||||
typename InLengths,
|
||||
typename WeiLengths,
|
||||
typename OutLengths,
|
||||
typename ConvStrides,
|
||||
typename ConvDilations,
|
||||
typename InLeftPads,
|
||||
typename InRightPads>
|
||||
void device_dynamic_convolution_forward_implicit_gemm_v4r5_nchw_kcyx_nkhw_olc(
|
||||
olCompile::Handle* handle,
|
||||
const InLengths& in_n_c_hi_wi_lengths,
|
||||
const WeiLengths& wei_k_c_y_x_lengths,
|
||||
const OutLengths& out_n_k_ho_wo_lengths,
|
||||
const ConvStrides& conv_strides,
|
||||
const ConvDilations& conv_dilations,
|
||||
const InLeftPads& in_left_pads,
|
||||
const InRightPads& in_right_pads,
|
||||
const Tensor<TInWei>& in_n_c_hi_wi,
|
||||
const Tensor<TInWei>& wei_k_c_y_x,
|
||||
Tensor<TOut>& out_n_k_ho_wo,
|
||||
const tunable_dyn_conv_fwd_v4r5_nchw_kcyx_nkhw* tunable,
|
||||
ck::index_t nrepeat)
|
||||
{
|
||||
using namespace ck;
|
||||
using namespace detail_dyn_conv_fwd_v4r5_nchw_kcyx_nkhw;
|
||||
using size_t = std::size_t;
|
||||
|
||||
constexpr index_t N0 = 4; // this could not be a tunable so far
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
// The follow codes are only used for computing the grid_size, hasMainKBlockLoop,
|
||||
// hasDoubleTailKBlockLoop
|
||||
|
||||
constexpr auto I0 = Number<0>{};
|
||||
constexpr auto I1 = Number<1>{};
|
||||
constexpr auto I2 = Number<2>{};
|
||||
constexpr auto I3 = Number<3>{};
|
||||
|
||||
const auto in_n_c_hi_wi_desc =
|
||||
make_dynamic_naive_tensor_descriptor_packed_v2(in_n_c_hi_wi_lengths);
|
||||
const auto wei_k_c_y_x_desc =
|
||||
make_dynamic_naive_tensor_descriptor_packed_v2(wei_k_c_y_x_lengths);
|
||||
const auto out_n_k_ho_wo_desc =
|
||||
make_dynamic_naive_tensor_descriptor_packed_v2(out_n_k_ho_wo_lengths);
|
||||
|
||||
const auto descs = transform_forward_convolution_into_contraction_v4r5_nchw_kcyx_nkhw_pad<N0>(
|
||||
wei_k_c_y_x_desc,
|
||||
in_n_c_hi_wi_desc,
|
||||
out_n_k_ho_wo_desc,
|
||||
conv_strides,
|
||||
conv_dilations,
|
||||
in_left_pads,
|
||||
in_right_pads);
|
||||
|
||||
const auto a_gk_gm0_gm1_grid_desc = descs[I0];
|
||||
const auto c_gm0_gm1_gn0_gn1_grid_desc = descs[I2];
|
||||
|
||||
const auto GM1 = c_gm0_gm1_gn0_gn1_grid_desc.GetLength(I1);
|
||||
const auto GN1 = c_gm0_gm1_gn0_gn1_grid_desc.GetLength(I3);
|
||||
const auto GK = a_gk_gm0_gm1_grid_desc.GetLength(I0);
|
||||
|
||||
const index_t grid_size = (GM1 / tunable->GM1PerBlockGM11) * (GN1 / tunable->GN1PerBlockGN11);
|
||||
const bool hasMainKBlockLoop = ((GK + tunable->KPerBlock) / (2 * tunable->KPerBlock) > 1);
|
||||
const bool hasDoubleTailKBlockLoop = ((GK / tunable->KPerBlock) % 2 == 0);
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
// these buffers are usually provided by the user application
|
||||
DeviceMem in_n_c_hi_wi_dev_buf(sizeof(TInWei) * in_n_c_hi_wi.mDesc.GetElementSpace());
|
||||
DeviceMem wei_k_c_y_x_dev_buf(sizeof(TInWei) * wei_k_c_y_x.mDesc.GetElementSpace());
|
||||
DeviceMem out_n_k_ho_wo_dev_buf(sizeof(TOut) * out_n_k_ho_wo.mDesc.GetElementSpace());
|
||||
|
||||
in_n_c_hi_wi_dev_buf.ToDevice(in_n_c_hi_wi.mData.data());
|
||||
wei_k_c_y_x_dev_buf.ToDevice(wei_k_c_y_x.mData.data());
|
||||
out_n_k_ho_wo_dev_buf.ToDevice(out_n_k_ho_wo.mData.data());
|
||||
|
||||
// these are workspace buffers that should be expressed to the user by the corresponding
|
||||
// workspace API
|
||||
DeviceMem workspace_buf(4096);
|
||||
|
||||
void* a_gk_gm0_gm10_gm11_grid_desc_dev_buf = workspace_buf.GetDeviceBuffer();
|
||||
void* b_gk_gn0_gn10_gn11_grid_desc_dev_buf =
|
||||
static_cast<void*>(static_cast<unsigned char*>(workspace_buf.GetDeviceBuffer()) + 1024);
|
||||
void* c_gm10_bm0_bm1_gn10_bn0_bn1_grid_desc_dev_buf =
|
||||
static_cast<void*>(static_cast<unsigned char*>(workspace_buf.GetDeviceBuffer()) + 2048);
|
||||
void* c_blockid_to_gm10_gn10_block_cluster_adaptor_dev_buf =
|
||||
static_cast<void*>(static_cast<unsigned char*>(workspace_buf.GetDeviceBuffer()) + 3072);
|
||||
|
||||
const std::vector<size_t> vld = {static_cast<size_t>(tunable->BlockSize), 1, 1};
|
||||
const std::vector<size_t> vgd1 = {static_cast<size_t>(tunable->BlockSize), 1, 1};
|
||||
const std::vector<size_t> vgd2 = {static_cast<size_t>(grid_size * tunable->BlockSize), 1, 1};
|
||||
|
||||
std::string program_name = "dynamic_convolution_forward_implicit_gemm_v4r5_nchw_kcyx_nkhw.cpp";
|
||||
std::string algo_name = "implicit_gemm_conv_fwd_v4r4_nchw";
|
||||
|
||||
std::string param = " -std=c++17 ";
|
||||
std::string network_config;
|
||||
|
||||
param += get_definition_string_from_types<TInWei, TAcc, TOut>() +
|
||||
" -DCK_PARAM_HAS_MAIN_KBLOCK_LOOP=" + std::to_string(hasMainKBlockLoop) +
|
||||
" -DCK_PARAM_HAS_DOUBLE_TAIL_KBLOCK_LOOP=" + std::to_string(hasDoubleTailKBlockLoop) +
|
||||
" -DCK_PARAM_N0=" + std::to_string(N0) + " " +
|
||||
get_definition_string_from_tunable(tunable);
|
||||
network_config = get_network_config_string_from_types<TInWei, TAcc, TOut>() + "_V" +
|
||||
std::to_string(hasDoubleTailKBlockLoop) + "_" + std::to_string(N0) + "_" +
|
||||
get_network_config_string_from_tunable(tunable);
|
||||
|
||||
std::vector<float> kernel1_times;
|
||||
std::vector<float> kernel2_times;
|
||||
|
||||
for(index_t i = 0; i < nrepeat; ++i)
|
||||
{
|
||||
KernelTimer timer1, timer2;
|
||||
std::string kernel_name;
|
||||
|
||||
kernel_name = "dynamic_convolution_forward_implicit_gemm_v4r5_nchw_kcyx_nkhw_prepare";
|
||||
auto network_config_1 = network_config + "_1";
|
||||
|
||||
timer1.Start();
|
||||
handle->AddKernel(algo_name, network_config_1, program_name, kernel_name, vld, vgd1, param)(
|
||||
static_cast<index_t>(in_n_c_hi_wi_lengths[I0]),
|
||||
static_cast<index_t>(in_n_c_hi_wi_lengths[I1]),
|
||||
static_cast<index_t>(in_n_c_hi_wi_lengths[I2]),
|
||||
static_cast<index_t>(in_n_c_hi_wi_lengths[I3]),
|
||||
static_cast<index_t>(wei_k_c_y_x_lengths[I0]),
|
||||
static_cast<index_t>(wei_k_c_y_x_lengths[I2]),
|
||||
static_cast<index_t>(wei_k_c_y_x_lengths[I3]),
|
||||
conv_strides[I0],
|
||||
conv_strides[I1],
|
||||
conv_dilations[I0],
|
||||
conv_dilations[I1],
|
||||
in_left_pads[I0],
|
||||
in_left_pads[I1],
|
||||
in_right_pads[I0],
|
||||
in_right_pads[I1],
|
||||
a_gk_gm0_gm10_gm11_grid_desc_dev_buf,
|
||||
b_gk_gn0_gn10_gn11_grid_desc_dev_buf,
|
||||
c_gm10_bm0_bm1_gn10_bn0_bn1_grid_desc_dev_buf,
|
||||
c_blockid_to_gm10_gn10_block_cluster_adaptor_dev_buf);
|
||||
timer2.End();
|
||||
|
||||
kernel_name = "dynamic_convolution_forward_implicit_gemm_v4r5_nchw_kcyx_nkhw";
|
||||
auto network_config_2 = network_config + "_2";
|
||||
|
||||
timer2.Start();
|
||||
handle->AddKernel(algo_name, network_config_2, program_name, kernel_name, vld, vgd2, param)(
|
||||
reinterpret_cast<const TInWei*>(wei_k_c_y_x_dev_buf.GetDeviceBuffer()),
|
||||
reinterpret_cast<const TInWei*>(in_n_c_hi_wi_dev_buf.GetDeviceBuffer()),
|
||||
reinterpret_cast<TOut*>(out_n_k_ho_wo_dev_buf.GetDeviceBuffer()),
|
||||
(const void*)(a_gk_gm0_gm10_gm11_grid_desc_dev_buf),
|
||||
(const void*)(b_gk_gn0_gn10_gn11_grid_desc_dev_buf),
|
||||
(const void*)(c_gm10_bm0_bm1_gn10_bn0_bn1_grid_desc_dev_buf),
|
||||
(const void*)(c_blockid_to_gm10_gn10_block_cluster_adaptor_dev_buf));
|
||||
timer2.End();
|
||||
|
||||
kernel1_times.push_back(timer1.GetElapsedTime());
|
||||
kernel2_times.push_back(timer2.GetElapsedTime());
|
||||
}
|
||||
|
||||
{
|
||||
auto ave_time1 = Driver::get_effective_average(kernel1_times);
|
||||
auto ave_time2 = Driver::get_effective_average(kernel2_times);
|
||||
|
||||
const auto N = in_n_c_hi_wi_lengths[I0];
|
||||
const auto C = in_n_c_hi_wi_lengths[I1];
|
||||
|
||||
const auto K = out_n_k_ho_wo_lengths[I1];
|
||||
const auto Ho = out_n_k_ho_wo_lengths[I2];
|
||||
const auto Wo = out_n_k_ho_wo_lengths[I3];
|
||||
|
||||
const auto Y = wei_k_c_y_x_lengths[I2];
|
||||
const auto X = wei_k_c_y_x_lengths[I3];
|
||||
|
||||
float perf = (float)(std::size_t(2) * N * K * Ho * Wo * C * Y * X) /
|
||||
(std::size_t(1000) * 1000 * 1000) / (ave_time1 + ave_time2);
|
||||
|
||||
std::cout << "Average time : " << ave_time1 + ave_time2 << " ms(" << ave_time1 << ", "
|
||||
<< ave_time2 << "), " << perf << " TFlop/s" << std::endl;
|
||||
};
|
||||
|
||||
// copy result back to host
|
||||
out_n_k_ho_wo_dev_buf.FromDevice(out_n_k_ho_wo.mData.data());
|
||||
}
|
||||
114
driver/include/olc_driver_common.hpp
Normal file
114
driver/include/olc_driver_common.hpp
Normal file
@@ -0,0 +1,114 @@
|
||||
#ifndef OLC_DRIVER_COMMON_HPP
|
||||
#define OLC_DRIVER_COMMON_HPP
|
||||
|
||||
#include <half.hpp>
|
||||
#include <vector>
|
||||
#include <cassert>
|
||||
|
||||
// this enumerate should be synchronized with include/miopen.h
|
||||
typedef enum {
|
||||
appHalf = 0,
|
||||
appFloat = 1,
|
||||
appInt32 = 2,
|
||||
appInt8 = 3,
|
||||
appInt8x4 = 4,
|
||||
appBFloat16 = 5,
|
||||
appDouble = 6,
|
||||
} appDataType_t;
|
||||
|
||||
namespace Driver {
|
||||
|
||||
template <appDataType_t typeNum>
|
||||
struct get_type_from_type_enum
|
||||
{
|
||||
using type = float;
|
||||
};
|
||||
|
||||
template <>
|
||||
struct get_type_from_type_enum<appHalf>
|
||||
{
|
||||
using type = half_float::half;
|
||||
};
|
||||
|
||||
template <>
|
||||
struct get_type_from_type_enum<appFloat>
|
||||
{
|
||||
using type = float;
|
||||
};
|
||||
|
||||
template <>
|
||||
struct get_type_from_type_enum<appDouble>
|
||||
{
|
||||
using type = double;
|
||||
};
|
||||
|
||||
template <>
|
||||
struct get_type_from_type_enum<appInt32>
|
||||
{
|
||||
using type = int;
|
||||
};
|
||||
|
||||
static inline int get_typeid_from_type_enum(appDataType_t t)
|
||||
{
|
||||
switch(t)
|
||||
{
|
||||
case appHalf: return (static_cast<int>('H'));
|
||||
case appFloat: return (static_cast<int>('F'));
|
||||
case appBFloat16: return (static_cast<int>('B'));
|
||||
case appDouble: return (static_cast<int>('D'));
|
||||
case appInt8:
|
||||
case appInt8x4:
|
||||
case appInt32: return (static_cast<int>('O'));
|
||||
default: throw std::runtime_error("Only float, half, bfloat16 data type is supported."); break;
|
||||
};
|
||||
};
|
||||
|
||||
template <typename T>
|
||||
static inline int get_typeid_from_type()
|
||||
{
|
||||
throw std::runtime_error("Unsupported typeid conversion for this type!");
|
||||
};
|
||||
|
||||
template <>
|
||||
inline int get_typeid_from_type<float>()
|
||||
{
|
||||
return (static_cast<int>('F'));
|
||||
};
|
||||
|
||||
template <>
|
||||
inline int get_typeid_from_type<half_float::half>()
|
||||
{
|
||||
return (static_cast<int>('H'));
|
||||
};
|
||||
|
||||
template <>
|
||||
inline int get_typeid_from_type<double>()
|
||||
{
|
||||
return (static_cast<int>('D'));
|
||||
};
|
||||
|
||||
static inline float get_effective_average(std::vector<float>& values)
|
||||
{
|
||||
assert(!values.empty());
|
||||
|
||||
if(values.size() == 1)
|
||||
return (values[0]);
|
||||
else
|
||||
{
|
||||
float sum = 0.0f;
|
||||
float maxVal = 0.0f;
|
||||
|
||||
for(const auto val : values)
|
||||
{
|
||||
if(maxVal < val)
|
||||
maxVal = val;
|
||||
sum += val;
|
||||
};
|
||||
|
||||
return ((sum - maxVal) / (values.size() - 1));
|
||||
};
|
||||
};
|
||||
|
||||
} // namespace Driver
|
||||
|
||||
#endif
|
||||
Reference in New Issue
Block a user