Add online compilation for dynamic kernels (#37)

* Add online-compiling facility

* Synchronize from fwd-v4r5 and implement host interfaces to call conv-fwd v4r4/v4r5 using on-line compiling method

* Tiny adjustment to time reporting

* Use object assignment to replace explicit bytes copying in the first kernel of v4r4/v4r5

* Use single thread to assign descriptor object to device memory

* Adjust to the workload assignment of the two kernels of v4r4 (experimental)

* Revert "Adjust to the workload assignment of the two kernels of v4r4 (experimental)"

This reverts commit eb38461456bb0c82b6c0d32cdd616e181907e20c.

* Update to make constexpr for generating descriptor types in kernel 2 of dynamic conv-fwd v4r4

* Update to dynamic conv-fwd v4r4 online-compiling

* Update to dynamic conv-fwd v4r5 online-compiling (result not accurate)

* Tiny update to driver/CMakeLists.txt

* clang-format

* Tiny comments change

* Add env OLC_DUMP_SAVE_TMP_DIR to support saving of temporary dir

* Fwd v4r5 olc perf (#39)

* added hip-clang flags that fix perf issue of online compilation

* fix bug for olc fwd-v4r5-nchw

* Move constexpr and type reference statements out of the function body in conv-fwd v4r4/v4r5 kernel wrapper

* Remove printing in hip_build_utils.cpp

* Update to root CMakeLists.txt

* Revert "Move constexpr and type reference statements out of the function body in conv-fwd v4r4/v4r5 kernel wrapper"

This reverts commit 3d2c5d8ecdd8298b72d127110500ed5b38d9835c.

Co-authored-by: Chao Liu <chao.liu2@amd.com>
Co-authored-by: Chao Liu <lc.roy86@gmail.com>
Co-authored-by: root <root@dc-smc-18.amd.com>

[ROCm/composable_kernel commit: 1685048a67]
This commit is contained in:
Qianfeng
2021-06-24 21:34:19 +08:00
committed by GitHub
parent 033a4d6cf3
commit 817b2a47c6
63 changed files with 6675 additions and 40 deletions

View File

@@ -1,6 +1,11 @@
cmake_minimum_required(VERSION 2.8.3)
project(modular_convolution)
list(APPEND CMAKE_MODULE_PATH "${PROJECT_SOURCE_DIR}/cmake")
include(TargetFlags)
include(AddKernels)
#c++
enable_language(CXX)
set(CMAKE_CXX_STANDARD 17)
@@ -8,15 +13,6 @@ set(CMAKE_CXX_STANDARD_REQUIRED ON)
set(CMAKE_CXX_EXTENSIONS OFF)
message("CMAKE_CXX_COMPILER_ID: ${CMAKE_CXX_COMPILER_ID}")
#boost
find_package(Boost REQUIRED)
message("Boost_INCLUDE_DIRS: ${Boost_INCLUDE_DIRS}")
message("Boost_LIBRARY_DIRS: ${Boost_LIBRARY_DIRS}")
include_directories(BEFORE ${Boost_INCLUDE_DIRS})
link_directories(${Boost_LIBRARY_DIRS})
#OpenMP
if(CMAKE_CXX_COMPILER_ID MATCHES "Clang")
# workaround issue hipcc in rocm3.5 cannot find openmp
@@ -78,3 +74,31 @@ elseif(DEVICE_BACKEND STREQUAL "NVIDIA")
endif()
add_subdirectory(driver)
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS}")
message("Compiling options for drivers: ${CMAKE_CXX_FLAGS}")
if(DEVICE_BACKEND STREQUAL "AMD")
set(CONV_SOURCE driver/conv_driver.cpp)
set(CONV_V2_SOURCE driver/conv_driver_v2.cpp)
set(CONV_V2_OLC_SOURCE driver/conv_driver_v2_olc.cpp)
set(CONV_BWD_DATA_SOURCE driver/conv_bwd_data_driver.cpp)
elseif(DEVICE_BACKEND STREQUAL "NVIDIA")
set(CONV_SOURCE driver/conv_driver.cu)
set(CONV_BWD_DATA_SOURCE driver/conv_bwd_data_driver.cu)
endif()
##add_executable(conv_driver ${CONV_SOURCE})
add_executable(conv_driver_v2 ${CONV_V2_SOURCE})
add_executable(conv_driver_v2_olc ${CONV_V2_OLC_SOURCE})
##add_executable(conv_bwd_data_driver ${CONV_BWD_DATA_SOURCE})
target_include_directories(conv_driver_v2_olc PRIVATE driver/olCompiling/include/)
##target_link_libraries(conv_driver PRIVATE modConv)
target_link_libraries(conv_driver_v2 PRIVATE modConv)
target_link_libraries(conv_driver_v2_olc PRIVATE modConv)
##target_link_libraries(conv_bwd_data_driver PRIVATE modConv)

40
cmake/AddKernels.cmake Normal file
View File

@@ -0,0 +1,40 @@
# add_kernels(<src_dir> <kernel_files>)
#
# Embeds each kernel source file listed in <kernel_files> into the build as a
# C byte array: for every file it appends extern declarations of
# APP_KERNEL_<NAME>_SIZE / APP_KERNEL_<NAME>[] to KERNELS_DECLS and a
# { "<NAME>", std::string(...) } map entry to INIT_KERNELS, then expands
# <src_dir>/kernel.cpp.in into ${PROJECT_BINARY_DIR}/kernel.cpp.
#
# NOTE(review): KERNELS_DECLS and INIT_KERNELS are consumed by the
# kernel.cpp.in template (not visible in this file) — keep the variable
# names in sync with that template.
function(add_kernels SRC_DIR KERNEL_FILES)
set(INIT_KERNELS_LIST)
set(KERNELS_DECLS)
foreach(KERNEL_FILE ${KERNEL_FILES})
# Re-run configure when a kernel file changes.  CMake < 3.0 has no
# CMAKE_CONFIGURE_DEPENDS property, so a throw-away configure_file()
# is used there to register the dependency instead.
if("${CMAKE_VERSION}" VERSION_LESS 3.0)
configure_file(${KERNEL_FILE} ${KERNEL_FILE}.delete)
else()
set_property(DIRECTORY APPEND PROPERTY CMAKE_CONFIGURE_DEPENDS ${KERNEL_FILE})
endif()
# Derive an upper-case C identifier from the file's base name (no extension).
get_filename_component(BASE_NAME ${KERNEL_FILE} NAME_WE)
string(TOUPPER "${BASE_NAME}" KEY_NAME)
string(MAKE_C_IDENTIFIER "${KEY_NAME}" VAR_NAME)
string(APPEND KERNELS_DECLS "extern const size_t APP_KERNEL_${VAR_NAME}_SIZE;\n")
string(APPEND KERNELS_DECLS "extern const unsigned char APP_KERNEL_${VAR_NAME}[];\n")
list(APPEND INIT_KERNELS_LIST " { \"${KEY_NAME}\", std::string(reinterpret_cast<const char*>(APP_KERNEL_${VAR_NAME}), APP_KERNEL_${VAR_NAME}_SIZE) }")
endforeach()
# Turn the CMake list into one C++ initializer list, one entry per line.
string(REPLACE ";" ",\n" INIT_KERNELS "${INIT_KERNELS_LIST}")
configure_file(${SRC_DIR}/kernel.cpp.in ${PROJECT_BINARY_DIR}/kernel.cpp)
endfunction()
# add_kernel_includes(<src_dir> <kernel_files>)
#
# Like add_kernels(), but keyed by the full file name (with extension) and
# referencing plain <VAR_NAME>/<VAR_NAME>_SIZE symbols; expands
# <src_dir>/kernel_includes.cpp.in into ${PROJECT_BINARY_DIR}/kernel_includes.cpp.
#
# NOTE(review): INIT_KERNELS is consumed by the kernel_includes.cpp.in
# template (not visible in this file) — keep the variable name in sync.
function(add_kernel_includes SRC_DIR KERNEL_FILES)
set(INIT_KERNELS_LIST)
foreach(KERNEL_FILE ${KERNEL_FILES})
# Re-run configure when a kernel file changes (pre-3.0 fallback as above).
if("${CMAKE_VERSION}" VERSION_LESS 3.0)
configure_file(${KERNEL_FILE} ${KERNEL_FILE}.delete)
else()
set_property(DIRECTORY APPEND PROPERTY CMAKE_CONFIGURE_DEPENDS ${KERNEL_FILE})
endif()
# Map key is the file name including extension; symbol name is derived
# from the upper-cased base name.
get_filename_component(BASE_NAME ${KERNEL_FILE} NAME_WE)
get_filename_component(FILE_NAME ${KERNEL_FILE} NAME)
string(TOUPPER "${BASE_NAME}" KEY_NAME)
string(MAKE_C_IDENTIFIER "${KEY_NAME}" VAR_NAME)
list(APPEND INIT_KERNELS_LIST " { \"${FILE_NAME}\", std::string(reinterpret_cast<const char*>(${VAR_NAME}), ${VAR_NAME}_SIZE) }")
endforeach()
string(REPLACE ";" ",\n" INIT_KERNELS "${INIT_KERNELS_LIST}")
configure_file(${SRC_DIR}/kernel_includes.cpp.in ${PROJECT_BINARY_DIR}/kernel_includes.cpp)
endfunction()

50
cmake/TargetFlags.cmake Normal file
View File

@@ -0,0 +1,50 @@
# get_target_property2(<out_var> <target> <property>)
#
# Thin wrapper around get_target_property() that reports a missing or unset
# property as the empty string instead of <out_var>-NOTFOUND, so callers can
# iterate over the result without an extra existence check.
function(get_target_property2 VAR TARGET PROPERTY)
    get_target_property(_prop_value ${TARGET} ${PROPERTY})
    if(NOT _prop_value)
        # Normalize NOTFOUND (and other falsy values) to an empty string.
        set(_prop_value "")
    endif()
    set(${VAR} "${_prop_value}" PARENT_SCOPE)
endfunction()
# append_flags(<flags_var> <target> <property> <prefix>)
#
# Appends every entry of <target>'s <property> to the string variable
# <flags_var>, prefixing each plain entry with <prefix> (e.g. "-D" for
# compile definitions, "-isystem " for include directories).  An entry that
# names a CMake target is instead expanded recursively via target_flags().
#
# Deliberately a macro (not a function) so that string(APPEND ${FLAGS} ...)
# mutates the caller's variable directly.
macro(append_flags FLAGS TARGET PROPERTY PREFIX)
get_target_property2(_pflags ${TARGET} ${PROPERTY})
foreach(FLAG ${_pflags})
if(TARGET ${FLAG})
# Entry is another target: splice in its flattened usage requirements.
target_flags(_pflags2 ${FLAG})
string(APPEND ${FLAGS} " ${_pflags2}")
else()
string(APPEND ${FLAGS} " ${PREFIX}${FLAG}")
endif()
endforeach()
endmacro()
# append_link_flags(<flags_var> <target> <property>)
#
# Appends the entries of <target>'s <property> (a link-library style list,
# e.g. INTERFACE_LINK_LIBRARIES) to the string variable <flags_var>:
#   - an entry naming a CMake target is expanded recursively via target_flags()
#   - an entry already starting with '-' is passed through untouched
#   - an entry naming an existing file on disk is passed through untouched
#   - anything else is treated as a bare library name and prefixed with -l
#
# Deliberately a macro so that string(APPEND ${FLAGS} ...) mutates the
# caller's variable directly.
macro(append_link_flags FLAGS TARGET PROPERTY)
get_target_property2(_pflags ${TARGET} ${PROPERTY})
foreach(FLAG ${_pflags})
if(TARGET ${FLAG})
# Dependent target: splice in its flattened usage requirements.
target_flags(_pflags2 ${FLAG})
string(APPEND ${FLAGS} " ${_pflags2}")
elseif(FLAG MATCHES "^-.*")
# Already a linker flag (-l..., -L..., -Wl,...): keep as-is.
string(APPEND ${FLAGS} " ${FLAG}")
elseif(EXISTS "${FLAG}")
# Path to a library file on disk: keep as-is.  Quoted so that the
# EXISTS check receives exactly one argument even for unusual values.
string(APPEND ${FLAGS} " ${FLAG}")
else()
# Bare library name: turn into -l<name>.
string(APPEND ${FLAGS} " -l${FLAG}")
endif()
endforeach()
endmacro()
# target_flags(<out_var> <target>)
#
# Flattens the INTERFACE (usage-requirement) properties of <target> into a
# single command-line string and returns it in <out_var> (parent scope).
# Used to reconstruct the compile/link options of an imported or interface
# target for an out-of-band (online) compiler invocation.
#
# NOTE: the call order below fixes the relative ordering of options,
# definitions, include dirs, link dirs, link options and libraries in the
# resulting string.
function(target_flags FLAGS TARGET)
set(_flags)
append_flags(_flags ${TARGET} "INTERFACE_COMPILE_OPTIONS" "")
append_flags(_flags ${TARGET} "INTERFACE_COMPILE_DEFINITIONS" "-D")
append_flags(_flags ${TARGET} "INTERFACE_INCLUDE_DIRECTORIES" "-isystem ")
append_flags(_flags ${TARGET} "INTERFACE_LINK_DIRECTORIES" "-L ")
append_flags(_flags ${TARGET} "INTERFACE_LINK_OPTIONS" "")
append_link_flags(_flags ${TARGET} "INTERFACE_LINK_LIBRARIES" "")
# message("_flags: ${_flags}")
set(${FLAGS} ${_flags} PARENT_SCOPE)
endfunction()

View File

@@ -112,8 +112,8 @@
#endif
// pass tensor descriptor by value or void*
#define CK_EXPERIMENTAL_PASS_TENSOR_DESCRIPTOR_BY_VALUE 1
#define CK_EXPERIMENTAL_PASS_TENSOR_DESCRIPTOR_BY_VOID_POINTER 0
#define CK_EXPERIMENTAL_PASS_TENSOR_DESCRIPTOR_BY_VALUE 0
#define CK_EXPERIMENTAL_PASS_TENSOR_DESCRIPTOR_BY_VOID_POINTER 1
// merge transformation use magic number division
#define CK_EXPERIMENTAL_MERGE_USE_MAGIC_DIVISION 0

View File

@@ -1,7 +1,7 @@
#ifndef CK_SEQUENCE_HELPER_HPP
#define CK_SEQUENCE_HELPER_HPP
#include "sequence_helper.hpp"
#include "tuple.hpp"
namespace ck {

View File

@@ -0,0 +1,34 @@
#ifndef CK_TYPE_HELPER_HPP
#define CK_TYPE_HELPER_HPP

#include "float_type.hpp"

namespace ck {

// Maps a single-character type id to the corresponding floating-point type:
//   'F' -> float, 'H' -> half_t, 'D' -> double.
// The id is expected as a compile-time character (presumably supplied via a
// -D CK_PARAM_* build define by the online-compilation driver — confirm).
// Any unrecognized id falls back to float via the primary template.
template <char tid>
struct get_type_from_type_id
{
    using type = float;
};

template <>
struct get_type_from_type_id<'F'>
{
    using type = float;
};

template <>
struct get_type_from_type_id<'H'>
{
    using type = half_t;
};

template <>
struct get_type_from_type_id<'D'>
{
    using type = double;
};

} // namespace ck
#endif // CK_TYPE_HELPER_HPP

View File

@@ -0,0 +1,370 @@
#include "common_header.hpp"
#include "type_helper.hpp"
#include "dynamic_tensor_descriptor.hpp"
#include "dynamic_tensor_descriptor_helper.hpp"
#include "gridwise_dynamic_gemm_v1r2.hpp"
#include "transform_forward_convolution_into_gemm_v4r4_nchw_kcyx_nkhw.hpp"
using namespace ck;
// Compile-time configuration for the dynamic conv-fwd v4r4 kernel.  Every
// CK_PARAM_* macro below is undefined in this file; the values are presumably
// injected as -D defines by the online-compilation driver — confirm against
// the host-side launcher.
// Element types for the A/B (input/weight), C (output) and accumulator
// tensors, selected by single-character type ids (see type_helper.hpp).
using FloatAB = typename get_type_from_type_id<static_cast<char>(CK_PARAM_IN_WEI_DATATYPE)>::type;
using FloatC = typename get_type_from_type_id<static_cast<char>(CK_PARAM_OUT_DATATYPE)>::type;
using FloatAcc = typename get_type_from_type_id<static_cast<char>(CK_PARAM_CONV_COMPTYPE)>::type;
// GEMM blocking / thread-cluster tuning parameters.
constexpr index_t BlockSize = CK_PARAM_BlockSize;
constexpr index_t MPerBlock = CK_PARAM_MPerBlock;
constexpr index_t NPerBlock = CK_PARAM_NPerBlock;
constexpr index_t KPerBlock = CK_PARAM_KPerBlock;
constexpr index_t M1PerThread = CK_PARAM_M1PerThread;
constexpr index_t N1PerThread = CK_PARAM_N1PerThread;
constexpr index_t KPerThread = CK_PARAM_KPerThread;
constexpr index_t M1N1ThreadClusterM10 = CK_PARAM_M1N1ThreadClusterM10;
constexpr index_t M1N1ThreadClusterN10 = CK_PARAM_M1N1ThreadClusterN10;
constexpr index_t M1N1ThreadClusterM11 = CK_PARAM_M1N1ThreadClusterM11;
constexpr index_t M1N1ThreadClusterN11 = CK_PARAM_M1N1ThreadClusterN11;
// Block-transfer configuration for the A (weight-side) matrix.
using ABlockTransferThreadSliceLengths_K_M0_M1 =
Sequence<CK_PARAM_ABlockTransferThreadSliceLengths_K_M0_M1>;
using ABlockTransferThreadClusterLengths_K_M0_M1 =
Sequence<CK_PARAM_ABlockTransferThreadClusterLengths_K_M0_M1>;
using ABlockTransferThreadClusterArrangeOrder =
Sequence<CK_PARAM_ABlockTransferThreadClusterArrangeOrder>;
using ABlockTransferSrcAccessOrder = Sequence<CK_PARAM_ABlockTransferSrcAccessOrder>;
constexpr index_t ABlockTransferSrcVectorDim = CK_PARAM_ABlockTransferSrcVectorDim;
constexpr index_t ABlockTransferSrcScalarPerVector = CK_PARAM_ABlockTransferSrcScalarPerVector;
constexpr index_t ABlockTransferDstScalarPerVector_M1 =
CK_PARAM_ABlockTransferDstScalarPerVector_M1;
constexpr bool AThreadTransferSrcResetCoordinateAfterRun =
static_cast<bool>(CK_PARAM_AThreadTransferSrcResetCoordinateAfterRun);
// Block-transfer configuration for the B (input-side) matrix.
using BBlockTransferThreadSliceLengths_K_N0_N1 =
Sequence<CK_PARAM_BBlockTransferThreadSliceLengths_K_N0_N1>;
using BBlockTransferThreadClusterLengths_K_N0_N1 =
Sequence<CK_PARAM_BBlockTransferThreadClusterLengths_K_N0_N1>;
using BBlockTransferThreadClusterArrangeOrder =
Sequence<CK_PARAM_BBlockTransferThreadClusterArrangeOrder>;
using BBlockTransferSrcAccessOrder = Sequence<CK_PARAM_BBlockTransferSrcAccessOrder>;
constexpr index_t BBlockTransferSrcVectorDim = CK_PARAM_BBlockTransferSrcVectorDim;
constexpr index_t BBlockTransferSrcScalarPerVector = CK_PARAM_BBlockTransferSrcScalarPerVector;
constexpr index_t BBlockTransferDstScalarPerVector_N1 =
CK_PARAM_BBlockTransferDstScalarPerVector_N1;
constexpr bool BThreadTransferSrcResetCoordinateAfterRun =
static_cast<bool>(CK_PARAM_BThreadTransferSrcResetCoordinateAfterRun);
// Thread-transfer configuration for the C (output) matrix.
using CThreadTransferSrcDstAccessOrder = Sequence<CK_PARAM_CThreadTransferSrcDstAccessOrder>;
constexpr index_t CThreadTransferSrcDstVectorDim = CK_PARAM_CThreadTransferSrcDstVectorDim;
constexpr index_t CThreadTransferDstScalarPerVector = CK_PARAM_CThreadTransferDstScalarPerVector;
// K-loop shape flags decided on the host for this particular problem size.
constexpr bool HasMainKBlockLoop = static_cast<bool>(CK_PARAM_HAS_MAIN_KBLOCK_LOOP);
constexpr bool HasDoubleTailKBlockLoop = static_cast<bool>(CK_PARAM_HAS_DOUBLE_TAIL_KBLOCK_LOOP);
// "Prepare" kernel: builds the transformed GEMM grid descriptors for the
// given convolution problem on-device and writes them into the four caller-
// provided device buffers, which the compute kernel later reads back.
// Only thread 0 performs the stores (see hipThreadIdx_x == 0 below), so a
// single-thread launch suffices.
extern "C" __global__ void dynamic_convolution_forward_implicit_gemm_v4r4_nchw_kcyx_nkhw_prepare(
int n,
int c,
int hi,
int wi,
int k,
int y,
int x,
int convStrideH,
int convStrideW,
int convDilationY,
int convDilationX,
int leftPadH,
int leftPadW,
int rightPadH,
int rightPadW,
void* p_a_k_m0_m1_grid_desc,
void* p_b_k_n0_n1_grid_desc,
void* p_c_m0_m10_m11_n0_n10_n11_grid_desc,
void* p_c_blockid_to_m0_n0_block_cluster_adaptor)
{
constexpr auto I0 = Number<0>{};
constexpr auto I1 = Number<1>{};
constexpr auto I2 = Number<2>{};
// Output spatial sizes from the standard convolution size formula.
const index_t ho = (hi + leftPadH + rightPadH - convDilationY * (y - 1) - 1) / convStrideH + 1;
const index_t wo = (wi + leftPadW + rightPadW - convDilationX * (x - 1) - 1) / convStrideW + 1;
// Packed (contiguous) NCHW / KCYX / NKHW tensor descriptors for the
// input, weight and output tensors.
const auto in_n_c_hi_wi_desc =
make_dynamic_naive_tensor_descriptor_packed_v2(make_tuple(n, c, hi, wi));
const auto wei_k_c_y_x_desc =
make_dynamic_naive_tensor_descriptor_packed_v2(make_tuple(k, c, y, x));
const auto out_n_k_ho_wo_desc =
make_dynamic_naive_tensor_descriptor_packed_v2(make_tuple(n, k, ho, wo));
// Re-express the convolution as an implicit GEMM: descs[0]=A(K,M),
// descs[1]=B(K,N), descs[2]=C(M,N).
const auto descs = transform_forward_convolution_into_gemm_v4r4_nchw_kcyx_nkhw_pad(
wei_k_c_y_x_desc,
in_n_c_hi_wi_desc,
out_n_k_ho_wo_desc,
make_tuple(convStrideH, convStrideW),
make_tuple(convDilationY, convDilationX),
make_tuple(leftPadH, leftPadW),
make_tuple(rightPadH, rightPadW));
const auto a_k_m_grid_desc = descs[I0];
const auto b_k_n_grid_desc = descs[I1];
const auto c_m_n_grid_desc = descs[I2];
using AKMGridDesc = decltype(a_k_m_grid_desc);
using BKNGridDesc = decltype(b_k_n_grid_desc);
using CMNGridDesc = decltype(c_m_n_grid_desc);
// "Iterator hack" sequences tune index calculation inside the gridwise
// GEMM; they must match the ones used by the compute kernel below.
using AGridIteratorHacks = decltype(make_tuple(make_tuple(Sequence<0, 0, 0, 0, 0>{},
Sequence<0, 0, 0, 0, 0>{},
Sequence<0, 0, 0, 0, 0>{},
Sequence<0, 0, 0, 0, 0>{}),
make_tuple(Sequence<0, 0, 0, 0, 0>{},
Sequence<0, 0, 0, 0, 0>{},
Sequence<0, 0, 0, 0, 0>{},
Sequence<0, 0, 0, 0, 0>{})));
using BGridIteratorHacks =
decltype(make_tuple(make_tuple(Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0>{},
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0>{},
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0>{}),
make_tuple(Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0>{},
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0>{},
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0>{})));
using CGridIteratorHacks = decltype(make_tuple(make_tuple(Sequence<0, 0, 0, 0, 0>{},
Sequence<0, 0, 0, 0, 0>{},
Sequence<0, 0, 0, 0, 0>{},
Sequence<0, 0, 1, 0, 0>{},
Sequence<0, 0, 1, 0, 0>{},
Sequence<0, 0, 1, 0, 0>{}),
make_tuple(Sequence<0, 0, 0, 0, 0>{},
Sequence<0, 0, 0, 0, 0>{},
Sequence<0, 0, 0, 0, 0>{},
Sequence<0, 0, 2, 0, 0>{},
Sequence<0, 0, 2, 0, 0>{},
Sequence<0, 0, 2, 0, 0>{})));
using AGridMoveSliceWindowIteratorHacks = Sequence<0, 0, 0, 0, 0>;
using BGridMoveSliceWindowIteratorHacks = Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 0, 0>;
// Instantiate the gridwise GEMM only to reuse its descriptor-factory
// functions below; no GEMM is executed in this prepare kernel.
using GridwiseGemm =
GridwiseDynamicGemm_km_kn_mn_v1r2<BlockSize,
FloatAB,
FloatAcc,
FloatC,
InMemoryDataOperation::Set, /* ToDo tunable */
AKMGridDesc,
BKNGridDesc,
CMNGridDesc,
MPerBlock,
NPerBlock,
KPerBlock,
M1PerThread,
N1PerThread,
KPerThread,
M1N1ThreadClusterM10,
M1N1ThreadClusterN10,
M1N1ThreadClusterM11,
M1N1ThreadClusterN11,
ABlockTransferThreadSliceLengths_K_M0_M1,
ABlockTransferThreadClusterLengths_K_M0_M1,
ABlockTransferThreadClusterArrangeOrder,
ABlockTransferSrcAccessOrder,
ABlockTransferSrcVectorDim,
ABlockTransferSrcScalarPerVector,
ABlockTransferDstScalarPerVector_M1,
AThreadTransferSrcResetCoordinateAfterRun,
BBlockTransferThreadSliceLengths_K_N0_N1,
BBlockTransferThreadClusterLengths_K_N0_N1,
BBlockTransferThreadClusterArrangeOrder,
BBlockTransferSrcAccessOrder,
BBlockTransferSrcVectorDim,
BBlockTransferSrcScalarPerVector,
BBlockTransferDstScalarPerVector_N1,
BThreadTransferSrcResetCoordinateAfterRun,
CThreadTransferSrcDstAccessOrder,
CThreadTransferSrcDstVectorDim,
CThreadTransferDstScalarPerVector,
AGridIteratorHacks,
BGridIteratorHacks,
CGridIteratorHacks,
AGridMoveSliceWindowIteratorHacks,
BGridMoveSliceWindowIteratorHacks>;
auto a_k_m0_m1_grid_desc = GridwiseGemm::MakeAKM0M1GridDescriptor(a_k_m_grid_desc);
auto b_k_n0_n1_grid_desc = GridwiseGemm::MakeBKN0N1GridDescriptor(b_k_n_grid_desc);
auto c_m0_m10_m11_n0_n10_n11_grid_desc =
GridwiseGemm::MakeCM0M10M11N0N10N11GridDescriptor(c_m_n_grid_desc);
auto c_blockid_to_m0_n0_block_cluster_adaptor =
GridwiseGemm::MakeCBlockIdToM0N0BlockClusterAdaptor(c_m_n_grid_desc);
// Single thread copies the descriptor objects into the output buffers
// (plain object assignment instead of explicit byte copies).
if(hipThreadIdx_x == 0)
{
*static_cast<decltype(a_k_m0_m1_grid_desc)*>(p_a_k_m0_m1_grid_desc) = a_k_m0_m1_grid_desc;
*static_cast<decltype(b_k_n0_n1_grid_desc)*>(p_b_k_n0_n1_grid_desc) = b_k_n0_n1_grid_desc;
*static_cast<decltype(c_m0_m10_m11_n0_n10_n11_grid_desc)*>(
p_c_m0_m10_m11_n0_n10_n11_grid_desc) = c_m0_m10_m11_n0_n10_n11_grid_desc;
*static_cast<decltype(c_blockid_to_m0_n0_block_cluster_adaptor)*>(
p_c_blockid_to_m0_n0_block_cluster_adaptor) = c_blockid_to_m0_n0_block_cluster_adaptor;
};
};
// Compute kernel: performs the v4r4 forward convolution as an implicit GEMM.
// The descriptor *types* are established at compile time from a fixed
// reference problem (64x4x35x35 input, 8x4x3x3 weight, stride/dilation/pad
// below); the descriptor *values* are read at run time from the buffers
// filled by the _prepare kernel.  This works only if the runtime problem
// produces descriptors of the same type as the reference problem —
// NOTE(review): confirm this invariant is enforced by the host launcher.
extern "C" __global__ void
#if CK_USE_LAUNCH_BOUNDS
__launch_bounds__(CK_MAX_THREAD_PER_BLOCK, CK_MIN_BLOCK_PER_CU)
#endif
dynamic_convolution_forward_implicit_gemm_v4r4_nchw_kcyx_nkhw(
const FloatAB* __restrict__ p_a_grid,
const FloatAB* __restrict__ p_b_grid,
FloatC* __restrict__ p_c_grid,
const void __CONSTANT__* p_a_k_m0_m1_grid_desc,
const void __CONSTANT__* p_b_k_n0_n1_grid_desc,
const void __CONSTANT__* p_c_m0_m10_m11_n0_n10_n11_grid_desc,
const void __CONSTANT__* p_c_blockid_to_m0_n0_block_cluster_adaptor)
{
constexpr auto I0 = Number<0>{};
constexpr auto I1 = Number<1>{};
constexpr auto I2 = Number<2>{};
// Reference problem used solely to derive the descriptor types below.
constexpr auto in_n_c_hi_wi_desc =
make_dynamic_naive_tensor_descriptor_packed_v2(make_tuple(64, 4, 35, 35));
constexpr auto wei_k_c_y_x_desc =
make_dynamic_naive_tensor_descriptor_packed_v2(make_tuple(8, 4, 3, 3));
constexpr auto out_n_k_ho_wo_desc =
make_dynamic_naive_tensor_descriptor_packed_v2(make_tuple(64, 8, 18, 18));
constexpr auto descs =
transform_forward_convolution_into_gemm_v4r4_nchw_kcyx_nkhw_pad(wei_k_c_y_x_desc,
in_n_c_hi_wi_desc,
out_n_k_ho_wo_desc,
make_tuple(2, 2),
make_tuple(1, 1),
make_tuple(1, 1),
make_tuple(1, 1));
constexpr auto a_k_m_grid_desc = descs[I0];
constexpr auto b_k_n_grid_desc = descs[I1];
constexpr auto c_m_n_grid_desc = descs[I2];
using AKMGridDesc = decltype(a_k_m_grid_desc);
using BKNGridDesc = decltype(b_k_n_grid_desc);
using CMNGridDesc = decltype(c_m_n_grid_desc);
// Iterator hacks — must be identical to those in the _prepare kernel.
using AGridIteratorHacks = decltype(make_tuple(make_tuple(Sequence<0, 0, 0, 0, 0>{},
Sequence<0, 0, 0, 0, 0>{},
Sequence<0, 0, 0, 0, 0>{},
Sequence<0, 0, 0, 0, 0>{}),
make_tuple(Sequence<0, 0, 0, 0, 0>{},
Sequence<0, 0, 0, 0, 0>{},
Sequence<0, 0, 0, 0, 0>{},
Sequence<0, 0, 0, 0, 0>{})));
using BGridIteratorHacks =
decltype(make_tuple(make_tuple(Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0>{},
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0>{},
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0>{}),
make_tuple(Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0>{},
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0>{},
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0>{})));
using CGridIteratorHacks = decltype(make_tuple(make_tuple(Sequence<0, 0, 0, 0, 0>{},
Sequence<0, 0, 0, 0, 0>{},
Sequence<0, 0, 0, 0, 0>{},
Sequence<0, 0, 1, 0, 0>{},
Sequence<0, 0, 1, 0, 0>{},
Sequence<0, 0, 1, 0, 0>{}),
make_tuple(Sequence<0, 0, 0, 0, 0>{},
Sequence<0, 0, 0, 0, 0>{},
Sequence<0, 0, 0, 0, 0>{},
Sequence<0, 0, 2, 0, 0>{},
Sequence<0, 0, 2, 0, 0>{},
Sequence<0, 0, 2, 0, 0>{})));
using AGridMoveSliceWindowIteratorHacks = Sequence<0, 0, 0, 0, 0>;
using BGridMoveSliceWindowIteratorHacks = Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 0, 0>;
using GridwiseGemm =
GridwiseDynamicGemm_km_kn_mn_v1r2<BlockSize,
FloatAB,
FloatAcc,
FloatC,
InMemoryDataOperation::Set, /* ToDo tunable */
AKMGridDesc,
BKNGridDesc,
CMNGridDesc,
MPerBlock,
NPerBlock,
KPerBlock,
M1PerThread,
N1PerThread,
KPerThread,
M1N1ThreadClusterM10,
M1N1ThreadClusterN10,
M1N1ThreadClusterM11,
M1N1ThreadClusterN11,
ABlockTransferThreadSliceLengths_K_M0_M1,
ABlockTransferThreadClusterLengths_K_M0_M1,
ABlockTransferThreadClusterArrangeOrder,
ABlockTransferSrcAccessOrder,
ABlockTransferSrcVectorDim,
ABlockTransferSrcScalarPerVector,
ABlockTransferDstScalarPerVector_M1,
AThreadTransferSrcResetCoordinateAfterRun,
BBlockTransferThreadSliceLengths_K_N0_N1,
BBlockTransferThreadClusterLengths_K_N0_N1,
BBlockTransferThreadClusterArrangeOrder,
BBlockTransferSrcAccessOrder,
BBlockTransferSrcVectorDim,
BBlockTransferSrcScalarPerVector,
BBlockTransferDstScalarPerVector_N1,
BThreadTransferSrcResetCoordinateAfterRun,
CThreadTransferSrcDstAccessOrder,
CThreadTransferSrcDstVectorDim,
CThreadTransferDstScalarPerVector,
AGridIteratorHacks,
BGridIteratorHacks,
CGridIteratorHacks,
AGridMoveSliceWindowIteratorHacks,
BGridMoveSliceWindowIteratorHacks>;
// Derive the descriptor *types* from the reference problem ...
constexpr auto a_k_m0_m1_grid_desc_tmp =
GridwiseGemm::MakeAKM0M1GridDescriptor(a_k_m_grid_desc);
constexpr auto b_k_n0_n1_grid_desc_tmp =
GridwiseGemm::MakeBKN0N1GridDescriptor(b_k_n_grid_desc);
constexpr auto c_m0_m10_m11_n0_n10_n11_grid_desc_tmp =
GridwiseGemm::MakeCM0M10M11N0N10N11GridDescriptor(c_m_n_grid_desc);
constexpr auto c_blockid_to_m0_n0_block_cluster_adaptor_tmp =
GridwiseGemm::MakeCBlockIdToM0N0BlockClusterAdaptor(c_m_n_grid_desc);
using AKM0M1GridDesc = decltype(a_k_m0_m1_grid_desc_tmp);
using BKN0N1GridDesc = decltype(b_k_n0_n1_grid_desc_tmp);
using CM0M10M11N0N10N11GridDesc = decltype(c_m0_m10_m11_n0_n10_n11_grid_desc_tmp);
using CBlockIdToM0N0BlockClusterAdaptor =
decltype(c_blockid_to_m0_n0_block_cluster_adaptor_tmp);
// ... then load the runtime descriptor *values* written by _prepare.
const auto a_k_m0_m1_grid_desc =
*reinterpret_cast<const AKM0M1GridDesc*>((const void*)p_a_k_m0_m1_grid_desc);
const auto b_k_n0_n1_grid_desc =
*reinterpret_cast<const BKN0N1GridDesc*>((const void*)p_b_k_n0_n1_grid_desc);
const auto c_m0_m10_m11_n0_n10_n11_grid_desc =
*reinterpret_cast<const CM0M10M11N0N10N11GridDesc*>(
(const void*)p_c_m0_m10_m11_n0_n10_n11_grid_desc);
const auto c_blockid_to_m0_n0_block_cluster_adaptor =
*reinterpret_cast<const CBlockIdToM0N0BlockClusterAdaptor*>(
(const void*)p_c_blockid_to_m0_n0_block_cluster_adaptor);
// LDS scratch sized (in FloatAB elements) by the gridwise GEMM.
constexpr index_t shared_block_size =
GridwiseGemm::GetSharedMemoryNumberOfByte() / sizeof(FloatAB);
__shared__ FloatAB p_shared_block[shared_block_size];
GridwiseGemm::Run(p_a_grid,
p_b_grid,
p_c_grid,
p_shared_block,
a_k_m0_m1_grid_desc,
b_k_n0_n1_grid_desc,
c_m0_m10_m11_n0_n10_n11_grid_desc,
c_blockid_to_m0_n0_block_cluster_adaptor,
integral_constant<bool, HasMainKBlockLoop>{},
integral_constant<bool, HasDoubleTailKBlockLoop>{});
};

View File

@@ -0,0 +1,379 @@
#include "common_header.hpp"
#include "type_helper.hpp"
#include "dynamic_tensor_descriptor.hpp"
#include "dynamic_tensor_descriptor_helper.hpp"
#include "gridwise_dynamic_contraction_v1r1.hpp"
#include "transform_forward_convolution_into_gemm_v4r5_nchw_kcyx_nkhw.hpp"
using namespace ck;
// Compile-time configuration for the dynamic conv-fwd v4r5 (contraction)
// kernel.  Every CK_PARAM_* macro below is undefined in this file; the
// values are presumably injected as -D defines by the online-compilation
// driver — confirm against the host-side launcher.
// Element types for the A/B (input/weight), C (output) and accumulator
// tensors, selected by single-character type ids (see type_helper.hpp).
using FloatAB = typename get_type_from_type_id<static_cast<char>(CK_PARAM_IN_WEI_DATATYPE)>::type;
using FloatC = typename get_type_from_type_id<static_cast<char>(CK_PARAM_OUT_DATATYPE)>::type;
using FloatAcc = typename get_type_from_type_id<static_cast<char>(CK_PARAM_CONV_COMPTYPE)>::type;
// Contraction blocking / thread-cluster tuning parameters.
constexpr index_t BlockSize = CK_PARAM_BlockSize;
constexpr index_t N0 = CK_PARAM_N0;
constexpr index_t GM1PerBlockGM11 = CK_PARAM_GM1PerBlockGM11;
constexpr index_t GN1PerBlockGN11 = CK_PARAM_GN1PerBlockGN11;
constexpr index_t KPerBlock = CK_PARAM_KPerBlock;
constexpr index_t M1PerThread = CK_PARAM_M1PerThread;
constexpr index_t N1PerThread = CK_PARAM_N1PerThread;
constexpr index_t KPerThread = CK_PARAM_KPerThread;
constexpr index_t M1N1ThreadClusterM10 = CK_PARAM_M1N1ThreadClusterM10;
constexpr index_t M1N1ThreadClusterN10 = CK_PARAM_M1N1ThreadClusterN10;
constexpr index_t M1N1ThreadClusterM11 = CK_PARAM_M1N1ThreadClusterM11;
constexpr index_t M1N1ThreadClusterN11 = CK_PARAM_M1N1ThreadClusterN11;
// Block-transfer configuration for the A (weight-side) tensor.
using ABlockTransferThreadSliceLengths_GK_GM0_GM10_GM11 =
Sequence<CK_PARAM_ABlockTransferThreadSliceLengths_GK_GM0_GM10_GM11>;
using ABlockTransferThreadClusterLengths_GK_GM0_GM10_GM11 =
Sequence<CK_PARAM_ABlockTransferThreadClusterLengths_GK_GM0_GM10_GM11>;
using ABlockTransferThreadClusterArrangeOrder =
Sequence<CK_PARAM_ABlockTransferThreadClusterArrangeOrder>;
using ABlockTransferSrcAccessOrder = Sequence<CK_PARAM_ABlockTransferSrcAccessOrder>;
constexpr index_t ABlockTransferSrcVectorDim = CK_PARAM_ABlockTransferSrcVectorDim;
constexpr index_t ABlockTransferSrcScalarPerVector = CK_PARAM_ABlockTransferSrcScalarPerVector;
constexpr index_t ABlockTransferDstScalarPerVector_GM11 =
CK_PARAM_ABlockTransferDstScalarPerVector_GM11;
constexpr bool AThreadTransferSrcResetCoordinateAfterRun =
static_cast<bool>(CK_PARAM_AThreadTransferSrcResetCoordinateAfterRun);
// Block-transfer configuration for the B (input-side) tensor.
using BBlockTransferThreadSliceLengths_GK_GN0_GN10_GN11 =
Sequence<CK_PARAM_BBlockTransferThreadSliceLengths_GK_GN0_GN10_GN11>;
using BBlockTransferThreadClusterLengths_GK_GN0_GN10_GN11 =
Sequence<CK_PARAM_BBlockTransferThreadClusterLengths_GK_GN0_GN10_GN11>;
using BBlockTransferThreadClusterArrangeOrder =
Sequence<CK_PARAM_BBlockTransferThreadClusterArrangeOrder>;
using BBlockTransferSrcAccessOrder = Sequence<CK_PARAM_BBlockTransferSrcAccessOrder>;
constexpr index_t BBlockTransferSrcVectorDim = CK_PARAM_BBlockTransferSrcVectorDim;
constexpr index_t BBlockTransferSrcScalarPerVector = CK_PARAM_BBlockTransferSrcScalarPerVector;
constexpr index_t BBlockTransferDstScalarPerVector_GN11 =
CK_PARAM_BBlockTransferDstScalarPerVector_GN11;
constexpr bool BThreadTransferSrcResetCoordinateAfterRun =
static_cast<bool>(CK_PARAM_BThreadTransferSrcResetCoordinateAfterRun);
// Thread-transfer configuration for the C (output) tensor.
using CThreadTransferSrcDstAccessOrder = Sequence<CK_PARAM_CThreadTransferSrcDstAccessOrder>;
constexpr index_t CThreadTransferSrcDstVectorDim = CK_PARAM_CThreadTransferSrcDstVectorDim;
constexpr index_t CThreadTransferDstScalarPerVector = CK_PARAM_CThreadTransferDstScalarPerVector;
// K-loop shape flags decided on the host for this particular problem size.
constexpr bool HasMainKBlockLoop = static_cast<bool>(CK_PARAM_HAS_MAIN_KBLOCK_LOOP);
constexpr bool HasDoubleTailKBlockLoop = static_cast<bool>(CK_PARAM_HAS_DOUBLE_TAIL_KBLOCK_LOOP);
// "Prepare" kernel for v4r5: builds the transformed contraction grid
// descriptors for the given convolution problem on-device and writes them
// into the four caller-provided device buffers, which the compute kernel
// later reads back.  Only thread 0 performs the stores (see
// hipThreadIdx_x == 0 below), so a single-thread launch suffices.
extern "C" __global__ void dynamic_convolution_forward_implicit_gemm_v4r5_nchw_kcyx_nkhw_prepare(
int n,
int c,
int hi,
int wi,
int k,
int y,
int x,
int convStrideH,
int convStrideW,
int convDilationY,
int convDilationX,
int leftPadH,
int leftPadW,
int rightPadH,
int rightPadW,
void* p_a_gk_gm0_gm10_gm11_grid_desc,
void* p_b_gk_gn0_gn10_gn11_grid_desc,
void* p_c_gm10_bm0_bm1_gn10_bn0_bn1_grid_desc,
void* p_c_blockid_to_gm10_gn10_block_cluster_adaptor)
{
constexpr auto I0 = Number<0>{};
constexpr auto I1 = Number<1>{};
constexpr auto I2 = Number<2>{};
// Output spatial sizes from the standard convolution size formula.
const index_t ho = (hi + leftPadH + rightPadH - convDilationY * (y - 1) - 1) / convStrideH + 1;
const index_t wo = (wi + leftPadW + rightPadW - convDilationX * (x - 1) - 1) / convStrideW + 1;
// Packed (contiguous) NCHW / KCYX / NKHW tensor descriptors for the
// input, weight and output tensors.
const auto in_n_c_hi_wi_desc =
make_dynamic_naive_tensor_descriptor_packed_v2(make_tuple(n, c, hi, wi));
const auto wei_k_c_y_x_desc =
make_dynamic_naive_tensor_descriptor_packed_v2(make_tuple(k, c, y, x));
const auto out_n_k_ho_wo_desc =
make_dynamic_naive_tensor_descriptor_packed_v2(make_tuple(n, k, ho, wo));
// Re-express the convolution as a tensor contraction (split by N0):
// descs[0]=A(GK,GM0,GM1), descs[1]=B(GK,GN0,GN1), descs[2]=C(GM0,GM1,GN0,GN1).
const auto descs = transform_forward_convolution_into_contraction_v4r5_nchw_kcyx_nkhw_pad<N0>(
wei_k_c_y_x_desc,
in_n_c_hi_wi_desc,
out_n_k_ho_wo_desc,
make_tuple(convStrideH, convStrideW),
make_tuple(convDilationY, convDilationX),
make_tuple(leftPadH, leftPadW),
make_tuple(rightPadH, rightPadW));
const auto a_gk_gm0_gm1_grid_desc = descs[I0];
const auto b_gk_gn0_gn1_grid_desc = descs[I1];
const auto c_gm0_gm1_gn0_gn1_grid_desc = descs[I2];
using AGKGM0GM1GridDesc = decltype(a_gk_gm0_gm1_grid_desc);
using BGKGN0GN1GridDesc = decltype(b_gk_gn0_gn1_grid_desc);
using CGM0GM1GN0GN1GridDesc = decltype(c_gm0_gm1_gn0_gn1_grid_desc);
// "Iterator hack" sequences tune index calculation inside the gridwise
// contraction; they must match the ones used by the compute kernel.
using AGridIteratorHacks = decltype(make_tuple(make_tuple(Sequence<0, 0, 0, 0, 0, 0>{},
Sequence<0, 0, 0, 0, 0, 0>{},
Sequence<0, 0, 0, 0, 0, 0>{},
Sequence<0, 0, 0, 0, 0, 0>{}),
make_tuple(Sequence<0, 0, 0, 0, 0, 0>{},
Sequence<0, 0, 0, 0, 0, 0>{},
Sequence<0, 0, 0, 0, 0, 0>{},
Sequence<0, 0, 0, 0, 0, 0>{})));
using BGridIteratorHacks =
decltype(make_tuple(make_tuple(Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0>{},
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{},
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0>{},
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0>{}),
make_tuple(Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0>{},
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{},
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0>{},
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0>{})));
using CGridIteratorHacks = decltype(make_tuple(
make_tuple(Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{},
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0>{},
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0>{},
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{},
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0>{},
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0>{}),
make_tuple(Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{},
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0>{},
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0>{},
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{},
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0>{},
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0>{})));
using AGridMoveSliceWindowIteratorHacks = Sequence<0, 0, 0, 0, 0, 0>;
using BGridMoveSliceWindowIteratorHacks = Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 2, 0, 0, 0>;
// Instantiate the gridwise contraction only to reuse its descriptor-factory
// functions below; no contraction is executed in this prepare kernel.
using GridwiseContraction = GridwiseDynamicContraction_km0m1_kn0n1_m0m1n0n1_v1r1<
BlockSize,
FloatAB,
FloatAcc,
FloatC,
InMemoryDataOperation::Set, /* ToDo tunable */
AGKGM0GM1GridDesc,
BGKGN0GN1GridDesc,
CGM0GM1GN0GN1GridDesc,
GM1PerBlockGM11,
GN1PerBlockGN11,
KPerBlock,
M1PerThread,
N1PerThread,
KPerThread,
M1N1ThreadClusterM10,
M1N1ThreadClusterN10,
M1N1ThreadClusterM11,
M1N1ThreadClusterN11,
ABlockTransferThreadSliceLengths_GK_GM0_GM10_GM11,
ABlockTransferThreadClusterLengths_GK_GM0_GM10_GM11,
ABlockTransferThreadClusterArrangeOrder,
ABlockTransferSrcAccessOrder,
ABlockTransferSrcVectorDim,
ABlockTransferSrcScalarPerVector,
ABlockTransferDstScalarPerVector_GM11,
AThreadTransferSrcResetCoordinateAfterRun,
BBlockTransferThreadSliceLengths_GK_GN0_GN10_GN11,
BBlockTransferThreadClusterLengths_GK_GN0_GN10_GN11,
BBlockTransferThreadClusterArrangeOrder,
BBlockTransferSrcAccessOrder,
BBlockTransferSrcVectorDim,
BBlockTransferSrcScalarPerVector,
BBlockTransferDstScalarPerVector_GN11,
BThreadTransferSrcResetCoordinateAfterRun,
CThreadTransferSrcDstAccessOrder,
CThreadTransferSrcDstVectorDim,
CThreadTransferDstScalarPerVector,
AGridIteratorHacks,
BGridIteratorHacks,
CGridIteratorHacks,
AGridMoveSliceWindowIteratorHacks,
BGridMoveSliceWindowIteratorHacks>;
auto a_gk_gm0_gm10_gm11_grid_desc =
GridwiseContraction::MakeAGKGM0GM10GM11GridDescriptor(a_gk_gm0_gm1_grid_desc);
auto b_gk_gn0_gn10_gn11_grid_desc =
GridwiseContraction::MakeBGKGN0GN10GN11GridDescriptor(b_gk_gn0_gn1_grid_desc);
auto c_gm10_bm0_bm1_gn10_bn0_bn1_grid_desc =
GridwiseContraction::MakeCGM10BM0BM1GN10BN0BN1GridDescriptor(c_gm0_gm1_gn0_gn1_grid_desc);
auto c_blockid_to_gm10_gn10_block_cluster_adaptor =
GridwiseContraction::MakeCBlockIdToGM10GN10BlockClusterAdaptor(c_gm0_gm1_gn0_gn1_grid_desc);
// Single thread copies the descriptor objects into the output buffers
// (plain object assignment instead of explicit byte copies).
if(hipThreadIdx_x == 0)
{
*static_cast<decltype(a_gk_gm0_gm10_gm11_grid_desc)*>(p_a_gk_gm0_gm10_gm11_grid_desc) =
a_gk_gm0_gm10_gm11_grid_desc;
*static_cast<decltype(b_gk_gn0_gn10_gn11_grid_desc)*>(p_b_gk_gn0_gn10_gn11_grid_desc) =
b_gk_gn0_gn10_gn11_grid_desc;
*static_cast<decltype(c_gm10_bm0_bm1_gn10_bn0_bn1_grid_desc)*>(
p_c_gm10_bm0_bm1_gn10_bn0_bn1_grid_desc) = c_gm10_bm0_bm1_gn10_bn0_bn1_grid_desc;
*static_cast<decltype(c_blockid_to_gm10_gn10_block_cluster_adaptor)*>(
p_c_blockid_to_gm10_gn10_block_cluster_adaptor) =
c_blockid_to_gm10_gn10_block_cluster_adaptor;
};
};
// Kernel 2 of the online-compiled dynamic forward-convolution v4r5 pair:
// performs the implicit-GEMM contraction. The transformed grid descriptors are
// NOT recomputed here; they are read back from device memory, where the
// companion "prep" kernel stored them as whole objects (see the descriptor
// assignment under `if(hipThreadIdx_x == 0)` earlier in this file).
extern "C" __global__ void
#if CK_USE_LAUNCH_BOUNDS
__launch_bounds__(CK_MAX_THREAD_PER_BLOCK, CK_MIN_BLOCK_PER_CU)
#endif
dynamic_convolution_forward_implicit_gemm_v4r5_nchw_kcyx_nkhw(
    const FloatAB* __restrict__ p_a_grid,
    const FloatAB* __restrict__ p_b_grid,
    FloatC* __restrict__ p_c_grid,
    // __CONSTANT__ address-space pointers to the descriptor objects written by
    // the prep kernel; cast through (const void*) below to strip the address
    // space qualifier before reinterpreting.
    const void __CONSTANT__* p_a_gk_gm0_gm10_gm11_grid_desc,
    const void __CONSTANT__* p_b_gk_gn0_gn10_gn11_grid_desc,
    const void __CONSTANT__* p_c_gm10_bm0_bm1_gn10_bn0_bn1_grid_desc,
    const void __CONSTANT__* p_c_blockid_to_gm10_gn10_block_cluster_adaptor)
{
    constexpr auto I0 = Number<0>{};
    constexpr auto I1 = Number<1>{};
    constexpr auto I2 = Number<2>{};

    // Hard-coded problem shape (N=64, C=4, Hi=Wi=35; K=8, 3x3 filter; Ho=Wo=18).
    // NOTE(review): presumably these constants are substituted by -D macros when
    // this wrapper is compiled online — confirm against the host-side build step.
    // They are only used to derive TYPES; the runtime descriptor values come
    // from the pointers above.
    constexpr auto in_n_c_hi_wi_desc =
        make_dynamic_naive_tensor_descriptor_packed_v2(make_tuple(64, 4, 35, 35));
    constexpr auto wei_k_c_y_x_desc =
        make_dynamic_naive_tensor_descriptor_packed_v2(make_tuple(8, 4, 3, 3));
    constexpr auto out_n_k_ho_wo_desc =
        make_dynamic_naive_tensor_descriptor_packed_v2(make_tuple(64, 8, 18, 18));

    // Fold the convolution into a contraction problem. The four tuples are
    // strides, dilations, left pads, right pads — TODO(review) confirm the
    // order against the transform's signature.
    constexpr auto descs =
        transform_forward_convolution_into_contraction_v4r5_nchw_kcyx_nkhw_pad<N0>(
            wei_k_c_y_x_desc,
            in_n_c_hi_wi_desc,
            out_n_k_ho_wo_desc,
            make_tuple(2, 2),
            make_tuple(1, 1),
            make_tuple(1, 1),
            make_tuple(1, 1));

    constexpr auto a_gk_gm0_gm1_grid_desc      = descs[I0];
    constexpr auto b_gk_gn0_gn1_grid_desc      = descs[I1];
    constexpr auto c_gm0_gm1_gn0_gn1_grid_desc = descs[I2];

    using AGKGM0GM1GridDesc     = decltype(a_gk_gm0_gm1_grid_desc);
    using BGKGN0GN1GridDesc     = decltype(b_gk_gn0_gn1_grid_desc);
    using CGM0GM1GN0GN1GridDesc = decltype(c_gm0_gm1_gn0_gn1_grid_desc);

    // Compile-time "iterator hack" masks steering coordinate updates inside the
    // gridwise contraction (one Sequence per transform dimension).
    using AGridIteratorHacks = decltype(make_tuple(make_tuple(Sequence<0, 0, 0, 0, 0, 0>{},
                                                              Sequence<0, 0, 0, 0, 0, 0>{},
                                                              Sequence<0, 0, 0, 0, 0, 0>{},
                                                              Sequence<0, 0, 0, 0, 0, 0>{}),
                                                   make_tuple(Sequence<0, 0, 0, 0, 0, 0>{},
                                                              Sequence<0, 0, 0, 0, 0, 0>{},
                                                              Sequence<0, 0, 0, 0, 0, 0>{},
                                                              Sequence<0, 0, 0, 0, 0, 0>{})));

    using BGridIteratorHacks =
        decltype(make_tuple(make_tuple(Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0>{},
                                       Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{},
                                       Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0>{},
                                       Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0>{}),
                            make_tuple(Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0>{},
                                       Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{},
                                       Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0>{},
                                       Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0>{})));

    using CGridIteratorHacks = decltype(make_tuple(
        make_tuple(Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{},
                   Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0>{},
                   Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0>{},
                   Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{},
                   Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0>{},
                   Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0>{}),
        make_tuple(Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{},
                   Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0>{},
                   Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0>{},
                   Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{},
                   Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0>{},
                   Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0>{})));

    using AGridMoveSliceWindowIteratorHacks = Sequence<0, 0, 0, 0, 0, 0>;
    using BGridMoveSliceWindowIteratorHacks = Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 2, 0, 0, 0>;

    // Instantiate the gridwise contraction; the CK_PARAM_*/tunable values
    // (BlockSize, tile sizes, transfer orders, ...) are macro-defined by the
    // host at online-compile time.
    using GridwiseContraction = GridwiseDynamicContraction_km0m1_kn0n1_m0m1n0n1_v1r1<
        BlockSize,
        FloatAB,
        FloatAcc,
        FloatC,
        InMemoryDataOperation::Set, /* ToDo tunable */
        AGKGM0GM1GridDesc,
        BGKGN0GN1GridDesc,
        CGM0GM1GN0GN1GridDesc,
        GM1PerBlockGM11,
        GN1PerBlockGN11,
        KPerBlock,
        M1PerThread,
        N1PerThread,
        KPerThread,
        M1N1ThreadClusterM10,
        M1N1ThreadClusterN10,
        M1N1ThreadClusterM11,
        M1N1ThreadClusterN11,
        ABlockTransferThreadSliceLengths_GK_GM0_GM10_GM11,
        ABlockTransferThreadClusterLengths_GK_GM0_GM10_GM11,
        ABlockTransferThreadClusterArrangeOrder,
        ABlockTransferSrcAccessOrder,
        ABlockTransferSrcVectorDim,
        ABlockTransferSrcScalarPerVector,
        ABlockTransferDstScalarPerVector_GM11,
        AThreadTransferSrcResetCoordinateAfterRun,
        BBlockTransferThreadSliceLengths_GK_GN0_GN10_GN11,
        BBlockTransferThreadClusterLengths_GK_GN0_GN10_GN11,
        BBlockTransferThreadClusterArrangeOrder,
        BBlockTransferSrcAccessOrder,
        BBlockTransferSrcVectorDim,
        BBlockTransferSrcScalarPerVector,
        BBlockTransferDstScalarPerVector_GN11,
        BThreadTransferSrcResetCoordinateAfterRun,
        CThreadTransferSrcDstAccessOrder,
        CThreadTransferSrcDstVectorDim,
        CThreadTransferDstScalarPerVector,
        AGridIteratorHacks,
        BGridIteratorHacks,
        CGridIteratorHacks,
        AGridMoveSliceWindowIteratorHacks,
        BGridMoveSliceWindowIteratorHacks>;

    // Types of the block-partitioned descriptors as the prep kernel wrote them.
    using AGKGM0GM10GM11GridDesc =
        decltype(GridwiseContraction::MakeAGKGM0GM10GM11GridDescriptor(a_gk_gm0_gm1_grid_desc));
    using BGKGN0GN10GN11GridDesc =
        decltype(GridwiseContraction::MakeBGKGN0GN10GN11GridDescriptor(b_gk_gn0_gn1_grid_desc));
    using CGM10BM0BM1GN10BN0BN1GridDesc = decltype(
        GridwiseContraction::MakeCGM10BM0BM1GN10BN0BN1GridDescriptor(c_gm0_gm1_gn0_gn1_grid_desc));
    using CBlockIdToGM10GN10BlockClusterAdaptor =
        decltype(GridwiseContraction::MakeCBlockIdToGM10GN10BlockClusterAdaptor(
            c_gm0_gm1_gn0_gn1_grid_desc));

    // Copy the descriptor objects out of __CONSTANT__ memory by value.
    const auto a_gk_gm0_gm10_gm11_grid_desc = *reinterpret_cast<const AGKGM0GM10GM11GridDesc*>(
        (const void*)p_a_gk_gm0_gm10_gm11_grid_desc);
    const auto b_gk_gn0_gn10_gn11_grid_desc = *reinterpret_cast<const BGKGN0GN10GN11GridDesc*>(
        (const void*)p_b_gk_gn0_gn10_gn11_grid_desc);
    const auto c_gm10_bm0_bm1_gn10_bn0_bn1_grid_desc =
        *reinterpret_cast<const CGM10BM0BM1GN10BN0BN1GridDesc*>(
            (const void*)p_c_gm10_bm0_bm1_gn10_bn0_bn1_grid_desc);
    const auto c_blockid_to_gm10_gn10_block_cluster_adaptor =
        *reinterpret_cast<const CBlockIdToGM10GN10BlockClusterAdaptor*>(
            (const void*)p_c_blockid_to_gm10_gn10_block_cluster_adaptor);

    // LDS workspace, sized in FloatAB elements from the byte requirement.
    constexpr index_t shared_block_size =
        GridwiseContraction::GetSharedMemoryNumberOfByte() / sizeof(FloatAB);

    __shared__ FloatAB p_shared_block[shared_block_size];

    GridwiseContraction::Run(p_a_grid,
                             p_b_grid,
                             p_c_grid,
                             p_shared_block,
                             a_gk_gm0_gm10_gm11_grid_desc,
                             b_gk_gn0_gn10_gn11_grid_desc,
                             c_gm10_bm0_bm1_gn10_bn0_bn1_grid_desc,
                             c_blockid_to_gm10_gn10_block_cluster_adaptor,
                             integral_constant<bool, HasMainKBlockLoop>{},
                             integral_constant<bool, HasDoubleTailKBlockLoop>{});
};

View File

@@ -0,0 +1,8 @@
// Empty stub for the v4r1 kernel symbol — presumably a placeholder so the
// kernel file compiles stand-alone in this online-compiling setup; confirm
// intent against the build that consumes it.
extern "C" __global__ void
gridwise_convolution_forward_implicit_gemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer(
    const void* const __restrict__ p_in_global,
    const void* const __restrict__ p_wei_global,
    void* const __restrict__ p_out_global)
{
}

View File

@@ -0,0 +1,7 @@
// Empty stub for the v4r4 NCHW kernel symbol — presumably a placeholder so the
// kernel file compiles stand-alone in this online-compiling setup; confirm
// intent against the build that consumes it.
extern "C" __global__ void gridwise_convolution_forward_implicit_gemm_v4r4_nchw_kcyx_nkhw(
    const void* const __restrict__ p_in_global,
    const void* const __restrict__ p_wei_global,
    void* const __restrict__ p_out_global)
{
}

View File

@@ -0,0 +1,8 @@
// Empty stub for the v4r4 NHWC kernel symbol — presumably a placeholder so the
// kernel file compiles stand-alone in this online-compiling setup; confirm
// intent against the build that consumes it.
extern "C" __global__ void gridwise_convolution_forward_implicit_gemm_v4r4_nhwc_kyxc_nhwk(
    const void* const __restrict__ p_in_global,
    const void* const __restrict__ p_wei_global,
    void* const __restrict__ p_out_global)
{
}

View File

@@ -1,32 +1,201 @@
# NOTE(review): dangling "set(TENSOR_SOURCE" with no closing parenthesis —
# this looks like merge/diff residue; verify against the intended file.
set(TENSOR_SOURCE
# NOTE(review): overriding CMAKE_CXX_COMPILER after project() has run is not
# reliably honored by CMake; the compiler is normally fixed on the command line.
set(CMAKE_CXX_COMPILER /opt/rocm/llvm/bin/clang++)
## for online-compiling of HIP kernels
set(OLC_HIP_COMPILER ${CMAKE_CXX_COMPILER} CACHE PATH "")
## reset to avoid the C++ options from the parent project
set(CMAKE_CXX_FLAGS "")
# NOTE(review): CMAKE_CXX_FLAGS was just cleared, so this always prints empty.
message("Compiling options for library and kernels: ${CMAKE_CXX_FLAGS}")
# look for and register clang-offload-bundler
if(OLC_HIP_COMPILER MATCHES ".*clang\\+\\+$")
find_program(OLC_OFFLOADBUNDLER_BIN clang-offload-bundler
PATH_SUFFIXES bin
PATHS
/opt/rocm/llvm
${CMAKE_INSTALL_PREFIX}/llvm
)
endif()
if(OLC_OFFLOADBUNDLER_BIN)
message(STATUS "clang-offload-bundler found: ${OLC_OFFLOADBUNDLER_BIN}")
set(OLC_OFFLOADBUNDLER_BIN "${OLC_OFFLOADBUNDLER_BIN}")
else()
# fall back to the legacy extractkernel tool when the bundler is unavailable
# look for and register extractkernel
message(STATUS "clang-offload-bundler not found")
find_program(EXTRACTKERNEL_BIN extractkernel
PATH_SUFFIXES bin
PATHS
/opt/rocm/hip
/opt/rocm/hcc
/opt/rocm
${CMAKE_INSTALL_PREFIX}/hip
${CMAKE_INSTALL_PREFIX}/hcc
${CMAKE_INSTALL_PREFIX}
)
if(EXTRACTKERNEL_BIN)
message(STATUS "extractkernel found: ${EXTRACTKERNEL_BIN}")
set(EXTRACTKERNEL_BIN "${EXTRACTKERNEL_BIN}")
else()
message(FATAL_ERROR "extractkernel not found")
endif()
endif()
# Boost.Filesystem is required by the online-compiling utilities.
option(Boost_USE_STATIC_LIBS "Use boost static libraries" OFF)
set(BOOST_COMPONENTS filesystem)
add_definitions(-DBOOST_ALL_NO_LIB=1)
find_package(Boost REQUIRED COMPONENTS ${BOOST_COMPONENTS})
# HIP is always required
find_package(hip REQUIRED PATHS /opt/rocm)
message(STATUS "Build with HIP ${hip_VERSION}")
# Capture hip::device usage-requirement flags; they are replayed when kernels
# are compiled on-line at run time (see HIP_COMPILER_FLAGS define below).
target_flags(HIP_COMPILER_FLAGS hip::device)
# Remove cuda arch flags
string(REGEX REPLACE --cuda-gpu-arch=[a-z0-9]+ "" HIP_COMPILER_FLAGS "${HIP_COMPILER_FLAGS}")
string(REGEX REPLACE --offload-arch=[a-z0-9]+ "" HIP_COMPILER_FLAGS "${HIP_COMPILER_FLAGS}")
# Version components consumed by olCompiling/include/config.h.in below.
set(OLC_hip_VERSION_MAJOR "${hip_VERSION_MAJOR}")
set(OLC_hip_VERSION_MINOR "${hip_VERSION_MINOR}")
set(OLC_hip_VERSION_PATCH "${hip_VERSION_PATCH}")
option(ENABLE_DEBUG "Build to enable debugging" ON)
if(ENABLE_DEBUG)
set(OLC_DEBUG 1)
else()
set(OLC_DEBUG 0)
endif()
# NOTE(review): config.h is generated into the SOURCE tree; generated files
# normally belong in the binary dir.
configure_file("${CMAKE_CURRENT_SOURCE_DIR}/olCompiling/include/config.h.in" "${CMAKE_CURRENT_SOURCE_DIR}/olCompiling/include/config.h")
message(STATUS "Hip compiler flags: ${HIP_COMPILER_FLAGS}")
## HIP_COMPILER_FLAGS will be used for on-line compiling of the HIP kernels
add_definitions("-DHIP_COMPILER_FLAGS=${HIP_COMPILER_FLAGS}")
# Kernel headers that must accompany the kernel sources so the on-line compiler
# can resolve their #includes at run time.
file(GLOB COMPOSABLE_KERNEL_INCLUDE_1 "${PROJECT_SOURCE_DIR}/composable_kernel/include/kernel_algorithm/*.hpp")
file(GLOB COMPOSABLE_KERNEL_INCLUDE_2 "${PROJECT_SOURCE_DIR}/composable_kernel/include/tensor_description/*.hpp")
file(GLOB COMPOSABLE_KERNEL_INCLUDE_3 "${PROJECT_SOURCE_DIR}/composable_kernel/include/tensor_operation/*.hpp")
file(GLOB COMPOSABLE_KERNEL_INCLUDE_4 "${PROJECT_SOURCE_DIR}/composable_kernel/include/utility/*.hpp")
file(GLOB COMPOSABLE_KERNEL_INCLUDE_5 "${PROJECT_BINARY_DIR}/composable_kernel/include/utility/*.hpp")
file(GLOB COMPOSABLE_KERNEL_INCLUDE_6 "${PROJECT_SOURCE_DIR}/external/rocm/include/bfloat16_dev.hpp")
set(MCONV_KERNEL_INCLUDES
${COMPOSABLE_KERNEL_INCLUDE_1}
${COMPOSABLE_KERNEL_INCLUDE_2}
${COMPOSABLE_KERNEL_INCLUDE_3}
${COMPOSABLE_KERNEL_INCLUDE_4}
${COMPOSABLE_KERNEL_INCLUDE_5}
${COMPOSABLE_KERNEL_INCLUDE_6}
)
# Kernel-wrapper translation units that are compiled on-line at run time.
set(MCONV_KERNELS
../composable_kernel/src/kernel_wrapper/dynamic_convolution_forward_implicit_gemm_v4r4_nchw_kcyx_nkhw.cpp
../composable_kernel/src/kernel_wrapper/dynamic_convolution_forward_implicit_gemm_v4r5_nchw_kcyx_nkhw.cpp
)
# Register the kernels/includes with the inlining machinery (kernel.cpp /
# kernel_includes.cpp generated into the build tree).
add_kernels("olCompiling/" "${MCONV_KERNELS}")
add_kernel_includes("olCompiling/" "${MCONV_KERNEL_INCLUDES}")
# NOTE(review): the trailing ';' on each entry introduces empty list elements —
# harmless but unconventional.
set(MCONV_SOURCES
src/host_tensor.cpp;
src/device.cpp;
)
# NOTE(review): TENSOR_SOURCE is never visibly populated in this file (only a
# dangling set() near the top) — confirm this library actually gets sources.
add_library(host SHARED ${TENSOR_SOURCE})
# NOTE(review): target_compile_features with no feature listed is a no-op.
target_compile_features(host PUBLIC)
set_target_properties(host PROPERTIES POSITION_INDEPENDENT_CODE ON)
# Headers of the online-compiling (olc) utility layer.
set(OLC_HIP_UTILITY_HEADERS
olCompiling/include/config.h
olCompiling/include/logger.hpp
olCompiling/include/stringutils.hpp
olCompiling/include/tmp_dir.hpp
olCompiling/include/write_file.hpp
olCompiling/include/env.hpp
olCompiling/include/manage_ptr.hpp
olCompiling/include/md5.hpp
olCompiling/include/simple_hash.hpp
olCompiling/include/exec_utils.hpp
olCompiling/include/hipCheck.hpp
olCompiling/include/target_properties.hpp
olCompiling/include/handle.hpp
olCompiling/include/op_kernel_args.hpp
olCompiling/include/kernel.hpp
olCompiling/include/kernel_build_params.hpp
olCompiling/include/hip_build_utils.hpp
olCompiling/include/hipoc_program.hpp
olCompiling/include/hipoc_program_impl.hpp
olCompiling/include/hipoc_kernel.hpp
olCompiling/include/kernel_cache.hpp
olCompiling/include/binary_cache.hpp
)
if(DEVICE_BACKEND STREQUAL "NVIDIA")
# NOTE(review): keyword-less target_link_libraries signature; prefer PRIVATE.
target_link_libraries(host nvToolsExt cudart)
endif()
# Implementation files of the olc utility layer.
set(OLC_HIP_UTILITY_CPPS
olCompiling/hip_utility/logger.cpp
olCompiling/hip_utility/tmp_dir.cpp
olCompiling/hip_utility/md5.cpp
olCompiling/hip_utility/exec_utils.cpp
olCompiling/hip_utility/target_properties.cpp
olCompiling/hip_utility/handlehip.cpp
olCompiling/hip_utility/kernel_build_params.cpp
olCompiling/hip_utility/hip_build_utils.cpp
olCompiling/hip_utility/hipoc_program.cpp
olCompiling/hip_utility/hipoc_kernel.cpp
olCompiling/hip_utility/kernel_cache.cpp
olCompiling/hip_utility/binary_cache.cpp
)
install(TARGETS host LIBRARY DESTINATION lib)
list(APPEND OLC_SOURCES ${OLC_HIP_UTILITY_CPPS} ${OLC_HIP_UTILITY_HEADERS})
# kernel.cpp / kernel_includes.cpp are generated into the build tree by
# add_kernels()/add_kernel_includes() above.
list(INSERT MCONV_SOURCES 0
${PROJECT_BINARY_DIR}/kernel.cpp
${PROJECT_BINARY_DIR}/kernel_includes.cpp
)
# Driver entry points per backend; NVIDIA has no conv_driver_v2 equivalent.
if(DEVICE_BACKEND STREQUAL "AMD")
set(CONV_SOURCE src/conv_driver.cpp)
set(CONV_V2_SOURCE src/conv_driver_v2.cpp)
set(CONV_BWD_DATA_SOURCE src/conv_bwd_data_driver.cpp)
elseif(DEVICE_BACKEND STREQUAL "NVIDIA")
set(CONV_SOURCE src/conv_driver.cu)
set(CONV_BWD_DATA_SOURCE src/conv_bwd_data_driver.cu)
endif()
## addkernels provide the tool to create inlined kernels in one header
add_subdirectory(olCompiling/addkernels)
add_executable(conv_driver ${CONV_SOURCE})
add_executable(conv_driver_v2 ${CONV_V2_SOURCE})
add_executable(conv_bwd_data_driver ${CONV_BWD_DATA_SOURCE})
# inline_kernels_src(KERNELS KERNEL_INCLUDES)
#
# Embeds all HIP kernel sources as inlined strings for online compilation:
#   * runs the `addkernels` tool to produce batch_all.cpp.hpp in the build tree
#   * configures kernels_batch.cpp.in into batch_all.cpp
#   * appends both generated files to OLC_SOURCES in the caller's scope
function(inline_kernels_src KERNELS KERNEL_INCLUDES)
set(KERNEL_SRC_HPP_FILENAME batch_all.cpp.hpp)
set(KERNEL_SRC_HPP_PATH ${PROJECT_BINARY_DIR}/inlined_kernels/${KERNEL_SRC_HPP_FILENAME})
set(KERNEL_SRC_CPP_PATH ${PROJECT_BINARY_DIR}/inlined_kernels/batch_all.cpp)
# NOTE(review): these link commands are unrelated to kernel inlining and look
# misplaced inside this function; they still execute when it is called, so they
# are kept — confirm placement against the original file layout.
target_link_libraries(conv_driver PRIVATE host)
target_link_libraries(conv_driver_v2 PRIVATE host)
target_link_libraries(conv_bwd_data_driver PRIVATE host)
# VERBATIM guarantees platform-independent argument escaping for the tool
# invocation (recommended for every add_custom_command).
add_custom_command(
OUTPUT ${KERNEL_SRC_HPP_PATH}
WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}
DEPENDS addkernels ${KERNELS} ${KERNEL_INCLUDES}
COMMAND $<TARGET_FILE:addkernels> -target ${KERNEL_SRC_HPP_PATH} -extern -source ${KERNELS}
COMMENT "Inlining All kernels"
VERBATIM
)
configure_file(olCompiling/kernels_batch.cpp.in ${KERNEL_SRC_CPP_PATH})
list(APPEND OLC_SOURCES ${KERNEL_SRC_CPP_PATH} ${KERNEL_SRC_HPP_PATH})
# Export the augmented source list back to the caller.
set(OLC_SOURCES ${OLC_SOURCES} PARENT_SCOPE)
endfunction()
# Generate the inlined-kernels translation unit and the single header that
# embeds every kernel include file (olc_kernel_includes.h) into the build tree.
inline_kernels_src("${MCONV_KERNELS}" "${MCONV_KERNEL_INCLUDES}")
list(APPEND MCONV_SOURCES ${OLC_SOURCES} ${PROJECT_BINARY_DIR}/olc_kernel_includes.h)
# VERBATIM added for platform-independent escaping of the tool invocation.
add_custom_command(
OUTPUT ${PROJECT_BINARY_DIR}/olc_kernel_includes.h
WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}
DEPENDS addkernels ${MCONV_KERNEL_INCLUDES}
COMMAND $<TARGET_FILE:addkernels> -no-recurse -guard GUARD_OLC_KERNEL_INCLUDES_HPP_ -target ${PROJECT_BINARY_DIR}/olc_kernel_includes.h -source ${MCONV_KERNEL_INCLUDES}
COMMENT "Inlining HIP kernel includes"
VERBATIM
)
## the library target
# modConv bundles the host interfaces plus the inlined kernel sources used by
# the online-compiling path.
add_library(modConv SHARED ${MCONV_SOURCES})
target_include_directories(modConv PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/olCompiling/include/)
# Build tree holds the generated kernel.cpp / olc_kernel_includes.h headers.
target_include_directories(modConv PRIVATE ${PROJECT_BINARY_DIR})
target_include_directories(modConv PRIVATE ${PROJECT_SOURCE_DIR}/external/half/include/)
target_link_libraries(modConv PRIVATE hip::device)
target_link_libraries(modConv INTERFACE hip::host)
target_link_libraries(modConv PRIVATE Boost::filesystem)
target_compile_options(modConv PRIVATE -mfma)
# NOTE(review): target_compile_features with no feature listed is a no-op.
target_compile_features(modConv PUBLIC)
set_target_properties(modConv PROPERTIES POSITION_INDEPENDENT_CODE ON)
install(TARGETS modConv LIBRARY DESTINATION lib)

View File

@@ -18,9 +18,9 @@
#include "device_dynamic_convolution_forward_implicit_gemm_v5r1_nchw_kcyx_nkhw.hpp"
#define USE_DYNAMIC_MODE 1
#define USE_CONV_FWD_V4R4_NCHW 1
#define USE_CONV_FWD_V4R4_NCHW 0
#define USE_CONV_FWD_V4R4_NHWC 0
#define USE_CONV_FWD_V4R5_NCHW 0
#define USE_CONV_FWD_V4R5_NCHW 1
#define USE_CONV_FWD_V5R1_NCHW 0
enum ConvForwardAlgo
@@ -279,8 +279,6 @@ int main(int argc, char* argv[])
in_right_pads_dev);
};
const auto nhwc_desc = f_make_for_device_nhwc();
#if USE_CONV_FWD_V4R4_NCHW
if(algo == ConvForwardAlgo::V4R4NCHW)
{

View File

@@ -0,0 +1,288 @@
#include <iostream>
#include <numeric>
#include <initializer_list>
#include <cstdlib>
#include <stdlib.h>
#include <half.hpp>
#include "config.hpp"
#include "print.hpp"
#include "device.hpp"
#include "host_tensor.hpp"
#include "host_tensor_generator.hpp"
#include "conv_common.hpp"
#include "host_conv.hpp"
#include "device_tensor.hpp"
#include "olc_device_dynamic_convolution_forward_implicit_gemm_v4r4_nchw_kcyx_nkhw.hpp"
#include "olc_device_dynamic_convolution_forward_implicit_gemm_v4r5_nchw_kcyx_nkhw.hpp"
#define USE_CONV_FWD_V4R4_NCHW 1
#define USE_CONV_FWD_V4R5_NCHW 1
#include "conv_tunables.hpp"
#include "handle.hpp"
#include "hipCheck.hpp"
// Forward-convolution algorithm selector, taken from the command line
// (argv[2]). The numeric value of each enumerator is part of the CLI
// contract, so the implicit values are spelled out explicitly.
enum ConvForwardAlgo
{
    V4R4NCHW = 0,
    V4R4NHWC = 1,
    V4R5NCHW = 2,
    V5R1NCHW = 3
};
int main(int argc, char* argv[])
{
using namespace ck;
using size_t = std::size_t;
hipStream_t stream;
olCompile::Handle* handle;
MY_HIP_CHECK(hipStreamCreate(&stream));
handle = new olCompile::Handle(stream);
constexpr auto I0 = Number<0>{};
constexpr auto I1 = Number<1>{};
constexpr auto I2 = Number<2>{};
constexpr auto I3 = Number<3>{};
constexpr auto I4 = Number<4>{};
constexpr auto I5 = Number<5>{};
constexpr auto I6 = Number<6>{};
if(argc != 22)
{
printf("arg1 to 5: layout, algo, do_verification, init_method, do_log, nrepeat\n");
printf("rest: N, K, C, Y, X, Hi, Wi, Sy, Sx, Dy, Dx, LeftPy, LeftPx, RightPy, RightPx\n");
exit(1);
}
const ConvTensorLayout layout = static_cast<ConvTensorLayout>(atoi(argv[1]));
const ConvForwardAlgo algo = static_cast<ConvForwardAlgo>(atoi(argv[2]));
const bool do_verification = atoi(argv[3]);
const int init_method = atoi(argv[4]);
const bool do_log = atoi(argv[5]);
const int nrepeat = atoi(argv[6]);
const index_t N = atoi(argv[7]);
const index_t K = atoi(argv[8]);
const index_t C = atoi(argv[9]);
const index_t Y = atoi(argv[10]);
const index_t X = atoi(argv[11]);
const index_t Hi = atoi(argv[12]);
const index_t Wi = atoi(argv[13]);
const index_t conv_stride_h = atoi(argv[14]);
const index_t conv_stride_w = atoi(argv[15]);
const index_t conv_dilation_h = atoi(argv[16]);
const index_t conv_dilation_w = atoi(argv[17]);
const index_t in_left_pad_h = atoi(argv[18]);
const index_t in_left_pad_w = atoi(argv[19]);
const index_t in_right_pad_h = atoi(argv[20]);
const index_t in_right_pad_w = atoi(argv[21]);
const index_t YEff = (Y - 1) * conv_dilation_h + 1;
const index_t XEff = (X - 1) * conv_dilation_w + 1;
const index_t Ho = (Hi + in_left_pad_h + in_right_pad_h - YEff) / conv_stride_h + 1;
const index_t Wo = (Wi + in_left_pad_w + in_right_pad_w - XEff) / conv_stride_w + 1;
#if 1
constexpr index_t in_vector_size = 1;
using in_data_t = float;
using acc_data_t = float;
using out_data_t = float;
#elif 1
constexpr index_t in_vector_size = 16;
using in_data_t = int8_t;
using acc_data_t = int32_t;
using out_data_t = int8_t;
#endif
std::vector<std::size_t> in_lengths_host(4), wei_lengths_host(4), out_lengths_host(4);
switch(layout)
{
case ConvTensorLayout::NCHW:
// NCHW
in_lengths_host[0] = static_cast<std::size_t>(N);
in_lengths_host[1] = static_cast<std::size_t>(C);
in_lengths_host[2] = static_cast<std::size_t>(Hi);
in_lengths_host[3] = static_cast<std::size_t>(Wi);
wei_lengths_host[0] = static_cast<std::size_t>(K);
wei_lengths_host[1] = static_cast<std::size_t>(C);
wei_lengths_host[2] = static_cast<std::size_t>(Y);
wei_lengths_host[3] = static_cast<std::size_t>(X);
out_lengths_host[0] = static_cast<std::size_t>(N);
out_lengths_host[1] = static_cast<std::size_t>(K);
out_lengths_host[2] = static_cast<std::size_t>(Ho);
out_lengths_host[3] = static_cast<std::size_t>(Wo);
break;
case ConvTensorLayout::NHWC:
// NHWC
in_lengths_host[0] = static_cast<std::size_t>(N);
in_lengths_host[1] = static_cast<std::size_t>(Hi);
in_lengths_host[2] = static_cast<std::size_t>(Wi);
in_lengths_host[3] = static_cast<std::size_t>(C);
wei_lengths_host[0] = static_cast<std::size_t>(K);
wei_lengths_host[1] = static_cast<std::size_t>(Y);
wei_lengths_host[2] = static_cast<std::size_t>(X);
wei_lengths_host[3] = static_cast<std::size_t>(C);
out_lengths_host[0] = static_cast<std::size_t>(N);
out_lengths_host[1] = static_cast<std::size_t>(Ho);
out_lengths_host[2] = static_cast<std::size_t>(Wo);
out_lengths_host[3] = static_cast<std::size_t>(K);
break;
default: throw std::runtime_error("wrong! not implemented");
}
Tensor<in_data_t> in(in_lengths_host);
Tensor<in_data_t> wei(wei_lengths_host);
Tensor<out_data_t> out_host(out_lengths_host);
Tensor<out_data_t> out_device(out_lengths_host);
std::cout << "layout: " << layout << std::endl;
ostream_HostTensorDescriptor(in.mDesc, std::cout << "in: ");
ostream_HostTensorDescriptor(wei.mDesc, std::cout << "wei: ");
ostream_HostTensorDescriptor(out_host.mDesc, std::cout << "out: ");
print_array("InLeftPads", make_tuple(in_left_pad_h, in_left_pad_w));
print_array("InRightPads", make_tuple(in_right_pad_h, in_right_pad_w));
print_array("ConvStrides", make_tuple(conv_stride_h, conv_stride_w));
print_array("ConvDilations", make_tuple(conv_dilation_h, conv_dilation_w));
std::size_t num_thread = std::thread::hardware_concurrency();
if(do_verification)
{
switch(init_method)
{
case 0:
in.GenerateTensorValue(GeneratorTensor_1{}, num_thread);
wei.GenerateTensorValue(GeneratorTensor_1{}, num_thread);
break;
case 1:
in.GenerateTensorValue(GeneratorTensor_1{}, num_thread);
wei.GenerateTensorValue(GeneratorTensor_2{-5, 5}, num_thread);
break;
case 2:
in.GenerateTensorValue(GeneratorTensor_2{-5, 5}, num_thread);
wei.GenerateTensorValue(GeneratorTensor_1{}, num_thread);
break;
case 3:
in.GenerateTensorValue(GeneratorTensor_2{-5, 5}, num_thread);
wei.GenerateTensorValue(GeneratorTensor_2{-5, 5}, num_thread);
break;
default:
in.GenerateTensorValue(GeneratorTensor_2{1, 5}, num_thread);
auto gen_wei = [](auto... is) {
return GeneratorTensor_2{1, 5}(is...) * GeneratorTensor_Checkboard{}(is...);
};
wei.GenerateTensorValue(gen_wei, num_thread);
}
}
auto f_make_for_device_nchw = [&]() {
const auto in_lengths_dev = make_tuple(N, C, Hi, Wi);
const auto wei_lengths_dev = make_tuple(K, C, Y, X);
const auto out_lengths_dev = make_tuple(N, K, Ho, Wo);
return make_tuple(in_lengths_dev, wei_lengths_dev, out_lengths_dev);
};
auto f_make_for_device_nhwc = [&]() {
const auto in_lengths_dev = make_tuple(N, Hi, Wi, C);
const auto wei_lengths_dev = make_tuple(K, Y, X, C);
const auto out_lengths_dev = make_tuple(N, Ho, Wo, K);
return make_tuple(in_lengths_dev, wei_lengths_dev, out_lengths_dev);
};
const auto conv_strides = make_tuple(conv_stride_h, conv_stride_w);
const auto conv_dilations = make_tuple(conv_dilation_h, conv_dilation_w);
const auto in_left_pads = make_tuple(in_left_pad_h, in_left_pad_w);
const auto in_right_pads = make_tuple(in_right_pad_h, in_right_pad_w);
#if USE_CONV_FWD_V4R4_NCHW
if(algo == ConvForwardAlgo::V4R4NCHW)
{
if(layout != ConvTensorLayout::NCHW)
{
throw std::runtime_error("wrong! layout");
}
const auto tmp = f_make_for_device_nchw();
tunable_dyn_conv_fwd_v4r4_nchw_kcyx_nkhw* tunable =
&default_tunable_dyn_conv_fwd_v4r4_nchw_kcyx_nkhw;
device_dynamic_convolution_forward_implicit_gemm_v4r4_nchw_kcyx_nkhw_olc<in_data_t,
acc_data_t,
out_data_t>(
handle,
tmp[I0],
tmp[I1],
tmp[I2],
conv_strides,
conv_dilations,
in_left_pads,
in_right_pads,
in,
wei,
out_device,
tunable,
nrepeat);
}
#endif
#if USE_CONV_FWD_V4R5_NCHW
if(algo == ConvForwardAlgo::V4R5NCHW)
{
if(layout != ConvTensorLayout::NCHW)
{
throw std::runtime_error("wrong! layout");
}
const auto tmp = f_make_for_device_nchw();
tunable_dyn_conv_fwd_v4r5_nchw_kcyx_nkhw* tunable =
&default_tunable_dyn_conv_fwd_v4r5_nchw_kcyx_nkhw;
device_dynamic_convolution_forward_implicit_gemm_v4r5_nchw_kcyx_nkhw_olc<in_data_t,
acc_data_t,
out_data_t>(
handle,
tmp[I0],
tmp[I1],
tmp[I2],
conv_strides,
conv_dilations,
in_left_pads,
in_right_pads,
in,
wei,
out_device,
tunable,
nrepeat);
}
#endif
if(do_verification)
{
host_direct_convolution(
in, wei, out_host, conv_strides, conv_dilations, in_left_pads, in_right_pads);
check_error(out_host, out_device);
if(do_log)
{
LogRange(std::cout << "in : ", in.mData, ",") << std::endl;
LogRange(std::cout << "wei: ", wei.mData, ",") << std::endl;
LogRange(std::cout << "out_host : ", out_host.mData, ",") << std::endl;
LogRange(std::cout << "out_device: ", out_device.mData, ",") << std::endl;
}
}
delete handle;
MY_HIP_CHECK(hipStreamDestroy(stream));
}

View File

@@ -0,0 +1,131 @@
#ifndef CONV_TUNABLES_HPP
#define CONV_TUNABLES_HPP
#include "config.hpp"
// Runtime-tunable launch/blocking parameters for the v4r4 dynamic forward
// convolution (NCHW/KCYX/NKHW). The host turns these into -DCK_PARAM_* compiler
// options when the kernel is compiled on-line, so the field ORDER here is also
// the aggregate-initialization order used by the defaults below — do not
// reorder fields.
struct tunable_dyn_conv_fwd_v4r4_nchw_kcyx_nkhw
{
    ck::index_t BlockSize; // usually not tunable

    // GEMM tile sizes: per-workgroup (MPerBlock x NPerBlock, K step KPerBlock)
    // and per-thread (M1PerThread x N1PerThread, K step KPerThread).
    ck::index_t MPerBlock;
    ck::index_t NPerBlock;
    ck::index_t KPerBlock;

    ck::index_t M1PerThread;
    ck::index_t N1PerThread;
    ck::index_t KPerThread;

    // Thread-cluster decomposition of the block tile.
    ck::index_t M1N1ThreadClusterM10;
    ck::index_t M1N1ThreadClusterN10;
    ck::index_t M1N1ThreadClusterM11;
    ck::index_t M1N1ThreadClusterN11;

    // A-matrix blockwise-copy configuration (K, M0, M1 dimension order).
    std::array<ck::index_t, 3> ABlockTransferThreadSliceLengths_K_M0_M1;
    std::array<ck::index_t, 3> ABlockTransferThreadClusterLengths_K_M0_M1;
    std::array<ck::index_t, 3> ABlockTransferThreadClusterArrangeOrder;
    std::array<ck::index_t, 3> ABlockTransferSrcAccessOrder;
    ck::index_t ABlockTransferSrcVectorDim;
    ck::index_t ABlockTransferSrcScalarPerVector;
    ck::index_t ABlockTransferDstScalarPerVector_M1;
    bool AThreadTransferSrcResetCoordinateAfterRun;

    // B-matrix blockwise-copy configuration (K, N0, N1 dimension order).
    std::array<ck::index_t, 3> BBlockTransferThreadSliceLengths_K_N0_N1;
    std::array<ck::index_t, 3> BBlockTransferThreadClusterLengths_K_N0_N1;
    std::array<ck::index_t, 3> BBlockTransferThreadClusterArrangeOrder;
    std::array<ck::index_t, 3> BBlockTransferSrcAccessOrder;
    ck::index_t BBlockTransferSrcVectorDim;
    ck::index_t BBlockTransferSrcScalarPerVector;
    ck::index_t BBlockTransferDstScalarPerVector_N1;
    bool BThreadTransferSrcResetCoordinateAfterRun;

    // C-matrix threadwise-copy configuration (6-dimensional destination view).
    std::array<ck::index_t, 6> CThreadTransferSrcDstAccessOrder;
    ck::index_t CThreadTransferSrcDstVectorDim;
    ck::index_t CThreadTransferDstScalarPerVector;
};
// Default v4r4 tuning: one initializer per field, annotated with the field it
// initializes (declaration order of the struct above).
static tunable_dyn_conv_fwd_v4r4_nchw_kcyx_nkhw default_tunable_dyn_conv_fwd_v4r4_nchw_kcyx_nkhw = {
    256,                // BlockSize
    128,                // MPerBlock
    128,                // NPerBlock
    8,                  // KPerBlock
    4,                  // M1PerThread
    4,                  // N1PerThread
    1,                  // KPerThread
    8,                  // M1N1ThreadClusterM10
    8,                  // M1N1ThreadClusterN10
    2,                  // M1N1ThreadClusterM11
    2,                  // M1N1ThreadClusterN11
    {4, 1, 1},          // ABlockTransferThreadSliceLengths_K_M0_M1
    {2, 1, 128},        // ABlockTransferThreadClusterLengths_K_M0_M1
    {2, 1, 0},          // ABlockTransferThreadClusterArrangeOrder
    {2, 1, 0},          // ABlockTransferSrcAccessOrder
    0,                  // ABlockTransferSrcVectorDim
    4,                  // ABlockTransferSrcScalarPerVector
    1,                  // ABlockTransferDstScalarPerVector_M1
    false,              // AThreadTransferSrcResetCoordinateAfterRun
    {4, 1, 1},          // BBlockTransferThreadSliceLengths_K_N0_N1
    {2, 1, 128},        // BBlockTransferThreadClusterLengths_K_N0_N1
    {0, 1, 2},          // BBlockTransferThreadClusterArrangeOrder
    {0, 1, 2},          // BBlockTransferSrcAccessOrder
    2,                  // BBlockTransferSrcVectorDim
    1,                  // BBlockTransferSrcScalarPerVector
    1,                  // BBlockTransferDstScalarPerVector_N1
    false,              // BThreadTransferSrcResetCoordinateAfterRun
    {3, 4, 5, 0, 1, 2}, // CThreadTransferSrcDstAccessOrder
    5,                  // CThreadTransferSrcDstVectorDim
    1};                 // CThreadTransferDstScalarPerVector
// Runtime-tunable launch/blocking parameters for the v4r5 dynamic forward
// convolution (NCHW/KCYX/NKHW). As with the v4r4 variant, field ORDER is the
// aggregate-initialization contract for the defaults below — do not reorder.
struct tunable_dyn_conv_fwd_v4r5_nchw_kcyx_nkhw
{
    ck::index_t BlockSize;

    // Block-level tile sizes over the split GM1/GN1 dimensions plus K step.
    ck::index_t GM1PerBlockGM11;
    ck::index_t GN1PerBlockGN11;
    ck::index_t KPerBlock;

    // Per-thread tile sizes.
    ck::index_t M1PerThread;
    ck::index_t N1PerThread;
    ck::index_t KPerThread;

    // Thread-cluster decomposition of the block tile.
    ck::index_t M1N1ThreadClusterM10;
    ck::index_t M1N1ThreadClusterN10;
    ck::index_t M1N1ThreadClusterM11;
    ck::index_t M1N1ThreadClusterN11;

    // A-matrix blockwise-copy configuration (GK, GM0, GM10, GM11 order).
    std::array<ck::index_t, 4> ABlockTransferThreadSliceLengths_GK_GM0_GM10_GM11;
    std::array<ck::index_t, 4> ABlockTransferThreadClusterLengths_GK_GM0_GM10_GM11;
    std::array<ck::index_t, 4> ABlockTransferThreadClusterArrangeOrder;
    std::array<ck::index_t, 4> ABlockTransferSrcAccessOrder;
    ck::index_t ABlockTransferSrcVectorDim;
    ck::index_t ABlockTransferSrcScalarPerVector;
    ck::index_t ABlockTransferDstScalarPerVector_GM11;
    bool AThreadTransferSrcResetCoordinateAfterRun;

    // B-matrix blockwise-copy configuration (GK, GN0, GN10, GN11 order).
    std::array<ck::index_t, 4> BBlockTransferThreadSliceLengths_GK_GN0_GN10_GN11;
    std::array<ck::index_t, 4> BBlockTransferThreadClusterLengths_GK_GN0_GN10_GN11;
    std::array<ck::index_t, 4> BBlockTransferThreadClusterArrangeOrder;
    std::array<ck::index_t, 4> BBlockTransferSrcAccessOrder;
    ck::index_t BBlockTransferSrcVectorDim;
    ck::index_t BBlockTransferSrcScalarPerVector;
    ck::index_t BBlockTransferDstScalarPerVector_GN11;
    bool BThreadTransferSrcResetCoordinateAfterRun;

    // C-matrix threadwise-copy configuration (6-dimensional destination view).
    std::array<ck::index_t, 6> CThreadTransferSrcDstAccessOrder;
    ck::index_t CThreadTransferSrcDstVectorDim;
    ck::index_t CThreadTransferDstScalarPerVector;
};
// Default v4r5 tuning. One initializer per field, in the struct's declaration
// order; the trailing comments name the field each value initializes.
static tunable_dyn_conv_fwd_v4r5_nchw_kcyx_nkhw default_tunable_dyn_conv_fwd_v4r5_nchw_kcyx_nkhw = {
    256,                // BlockSize
    128,                // GM1PerBlockGM11
    32,                 // GN1PerBlockGN11
    8,                  // KPerBlock
    4,                  // M1PerThread
    4,                  // N1PerThread
    1,                  // KPerThread
    2,                  // M1N1ThreadClusterM10
    2,                  // M1N1ThreadClusterN10
    8,                  // M1N1ThreadClusterM11
    8,                  // M1N1ThreadClusterN11
    {4, 1, 1, 1},       // ABlockTransferThreadSliceLengths_GK_GM0_GM10_GM11
    {2, 1, 1, 128},     // ABlockTransferThreadClusterLengths_GK_GM0_GM10_GM11
    {3, 2, 1, 0},       // ABlockTransferThreadClusterArrangeOrder
    {3, 2, 1, 0},       // ABlockTransferSrcAccessOrder
    0,                  // ABlockTransferSrcVectorDim
    4,                  // ABlockTransferSrcScalarPerVector
    1,                  // ABlockTransferDstScalarPerVector_GM11
    false,              // AThreadTransferSrcResetCoordinateAfterRun
    {1, 4, 1, 1},       // BBlockTransferThreadSliceLengths_GK_GN0_GN10_GN11
    {8, 1, 1, 32},      // BBlockTransferThreadClusterLengths_GK_GN0_GN10_GN11
    {0, 3, 2, 1},       // BBlockTransferThreadClusterArrangeOrder
    {0, 3, 2, 1},       // BBlockTransferSrcAccessOrder
    3,                  // BBlockTransferSrcVectorDim
    1,                  // BBlockTransferSrcScalarPerVector
    1,                  // BBlockTransferDstScalarPerVector_GN11
    false,              // BThreadTransferSrcResetCoordinateAfterRun
    {3, 4, 5, 0, 1, 2}, // CThreadTransferSrcDstAccessOrder
    5,                  // CThreadTransferSrcDstVectorDim
    1};                 // CThreadTransferDstScalarPerVector
// Output spatial size of a convolution along one H/W axis:
//   out = floor((in + padL + padR - dilated_filter) / stride) + 1
// where dilated_filter = dilation * (yx_size - 1) + 1.
static inline int
conv_hw_out_size(int hw_in_size, int leftPad, int rightPad, int dilation, int yx_size, int stride)
{
    const int padded_in   = hw_in_size + leftPad + rightPad;
    const int eff_filter  = dilation * (yx_size - 1) + 1;
    return (padded_in - eff_filter) / stride + 1;
}
#endif

View File

@@ -0,0 +1,383 @@
#include "device.hpp"
#include "host_tensor.hpp"
#include "dynamic_tensor_descriptor.hpp"
#include "dynamic_tensor_descriptor_helper.hpp"
#include "transform_forward_convolution_into_gemm_v4r4_nchw_kcyx_nkhw.hpp"
#include "olc_driver_common.hpp"
#include "conv_tunables.hpp"
#include "handle.hpp"
namespace detail_dyn_conv_fwd_v4r4_nchw_kcyx_nkhw {
// Encodes the three data types (in/wei, accumulator, output) as a three-
// character string used in the kernel cache key.
//
// Bug fix: the original expression
//     out += static_cast<char>(a) + static_cast<char>(b) + static_cast<char>(c);
// promotes the chars to int and SUMS them, appending a single character whose
// value is the (possibly overflowed) sum — not three characters. Append each
// type id separately instead.
template <typename TInWei, typename TAcc, typename TOut>
static std::string get_network_config_string_from_types()
{
    std::string out;

    out += static_cast<char>(Driver::get_typeid_from_type<TInWei>());
    out += static_cast<char>(Driver::get_typeid_from_type<TAcc>());
    out += static_cast<char>(Driver::get_typeid_from_type<TOut>());

    return (out);
};
static std::string
get_network_config_string_from_tunable(const tunable_dyn_conv_fwd_v4r4_nchw_kcyx_nkhw* pt)
{
std::string out("TUN_");
out += std::to_string(pt->BlockSize) + "_";
out += std::to_string(pt->MPerBlock) + "x" + std::to_string(pt->NPerBlock) + "x" +
std::to_string(pt->KPerBlock) + "_";
out += std::to_string(pt->M1PerThread) + "x" + std::to_string(pt->N1PerThread) + "x" +
std::to_string(pt->KPerThread) + "_";
out += std::to_string(pt->M1N1ThreadClusterM10) + "x" +
std::to_string(pt->M1N1ThreadClusterN10) + "x" +
std::to_string(pt->M1N1ThreadClusterM11) + "x" +
std::to_string(pt->M1N1ThreadClusterN11) + "_";
out += std::to_string(pt->ABlockTransferThreadSliceLengths_K_M0_M1[0]) + "x" +
std::to_string(pt->ABlockTransferThreadSliceLengths_K_M0_M1[1]) + "x" +
std::to_string(pt->ABlockTransferThreadSliceLengths_K_M0_M1[2]) + "_";
out += std::to_string(pt->ABlockTransferThreadClusterLengths_K_M0_M1[0]) + "x" +
std::to_string(pt->ABlockTransferThreadClusterLengths_K_M0_M1[1]) + "x" +
std::to_string(pt->ABlockTransferThreadClusterLengths_K_M0_M1[2]) + "_";
out += std::to_string(pt->ABlockTransferThreadClusterArrangeOrder[0]) + "x" +
std::to_string(pt->ABlockTransferThreadClusterArrangeOrder[1]) + "x" +
std::to_string(pt->ABlockTransferThreadClusterArrangeOrder[2]) + "_";
out += std::to_string(pt->ABlockTransferSrcAccessOrder[0]) + "x" +
std::to_string(pt->ABlockTransferSrcAccessOrder[1]) + "x" +
std::to_string(pt->ABlockTransferSrcAccessOrder[2]) + "_";
out += std::to_string(pt->ABlockTransferSrcVectorDim) + "_";
out += std::to_string(pt->ABlockTransferSrcScalarPerVector) + "_";
out += std::to_string(pt->ABlockTransferDstScalarPerVector_M1) + "_";
out += std::to_string(pt->AThreadTransferSrcResetCoordinateAfterRun) + "_";
out += std::to_string(pt->BBlockTransferThreadSliceLengths_K_N0_N1[0]) + "x" +
std::to_string(pt->BBlockTransferThreadSliceLengths_K_N0_N1[1]) + "x" +
std::to_string(pt->BBlockTransferThreadSliceLengths_K_N0_N1[2]) + "_";
out += std::to_string(pt->BBlockTransferThreadClusterLengths_K_N0_N1[0]) + "x" +
std::to_string(pt->BBlockTransferThreadClusterLengths_K_N0_N1[1]) + "x" +
std::to_string(pt->BBlockTransferThreadClusterLengths_K_N0_N1[2]) + "_";
out += std::to_string(pt->BBlockTransferThreadClusterArrangeOrder[0]) + "x" +
std::to_string(pt->BBlockTransferThreadClusterArrangeOrder[1]) + "x" +
std::to_string(pt->BBlockTransferThreadClusterArrangeOrder[2]) + "_";
out += std::to_string(pt->BBlockTransferSrcAccessOrder[0]) + "x" +
std::to_string(pt->BBlockTransferSrcAccessOrder[1]) + "x" +
std::to_string(pt->BBlockTransferSrcAccessOrder[2]) + "_";
out += std::to_string(pt->BBlockTransferSrcVectorDim) + "_";
out += std::to_string(pt->BBlockTransferSrcScalarPerVector) + "_";
out += std::to_string(pt->BBlockTransferDstScalarPerVector_N1) + "_";
out += std::to_string(pt->BThreadTransferSrcResetCoordinateAfterRun) + "_";
out += std::to_string(pt->CThreadTransferSrcDstAccessOrder[0]) + "x" +
std::to_string(pt->CThreadTransferSrcDstAccessOrder[1]) + "x" +
std::to_string(pt->CThreadTransferSrcDstAccessOrder[2]) + "x" +
std::to_string(pt->CThreadTransferSrcDstAccessOrder[3]) + "x" +
std::to_string(pt->CThreadTransferSrcDstAccessOrder[4]) + "x" +
std::to_string(pt->CThreadTransferSrcDstAccessOrder[5]) + "_";
out += std::to_string(pt->CThreadTransferSrcDstVectorDim) + "_";
out += std::to_string(pt->CThreadTransferDstScalarPerVector);
return (out);
};
// Builds the -D compiler options that select the in/weight, accumulator and
// output data types for the online compilation of the kernel wrapper.
template <typename TInWei, typename TAcc, typename TOut>
static std::string get_definition_string_from_types()
{
    const std::string in_wei_id = std::to_string(Driver::get_typeid_from_type<TInWei>());
    const std::string acc_id    = std::to_string(Driver::get_typeid_from_type<TAcc>());
    const std::string out_id    = std::to_string(Driver::get_typeid_from_type<TOut>());

    return " -DCK_PARAM_IN_WEI_DATATYPE=" + in_wei_id + " -DCK_PARAM_CONV_COMPTYPE=" + acc_id +
           " -DCK_PARAM_OUT_DATATYPE=" + out_id;
};
static std::string
get_definition_string_from_tunable(const tunable_dyn_conv_fwd_v4r4_nchw_kcyx_nkhw* pt)
{
std::string out;
out += " -DCK_PARAM_BlockSize=" + std::to_string(pt->BlockSize);
out += " -DCK_PARAM_MPerBlock=" + std::to_string(pt->MPerBlock) +
" -DCK_PARAM_NPerBlock=" + std::to_string(pt->NPerBlock) +
" -DCK_PARAM_KPerBlock=" + std::to_string(pt->KPerBlock);
out += " -DCK_PARAM_M1PerThread=" + std::to_string(pt->M1PerThread) +
" -DCK_PARAM_N1PerThread=" + std::to_string(pt->N1PerThread) +
" -DCK_PARAM_KPerThread=" + std::to_string(pt->KPerThread);
out += " -DCK_PARAM_M1N1ThreadClusterM10=" + std::to_string(pt->M1N1ThreadClusterM10) +
" -DCK_PARAM_M1N1ThreadClusterN10=" + std::to_string(pt->M1N1ThreadClusterN10) +
" -DCK_PARAM_M1N1ThreadClusterM11=" + std::to_string(pt->M1N1ThreadClusterM11) +
" -DCK_PARAM_M1N1ThreadClusterN11=" + std::to_string(pt->M1N1ThreadClusterN11);
out += " -DCK_PARAM_ABlockTransferThreadSliceLengths_K_M0_M1=" +
std::to_string(pt->ABlockTransferThreadSliceLengths_K_M0_M1[0]) + "," +
std::to_string(pt->ABlockTransferThreadSliceLengths_K_M0_M1[1]) + "," +
std::to_string(pt->ABlockTransferThreadSliceLengths_K_M0_M1[2]);
out += " -DCK_PARAM_ABlockTransferThreadClusterLengths_K_M0_M1=" +
std::to_string(pt->ABlockTransferThreadClusterLengths_K_M0_M1[0]) + "," +
std::to_string(pt->ABlockTransferThreadClusterLengths_K_M0_M1[1]) + "," +
std::to_string(pt->ABlockTransferThreadClusterLengths_K_M0_M1[2]);
out += " -DCK_PARAM_ABlockTransferThreadClusterArrangeOrder=" +
std::to_string(pt->ABlockTransferThreadClusterArrangeOrder[0]) + "," +
std::to_string(pt->ABlockTransferThreadClusterArrangeOrder[1]) + "," +
std::to_string(pt->ABlockTransferThreadClusterArrangeOrder[2]);
out += " -DCK_PARAM_ABlockTransferSrcAccessOrder=" +
std::to_string(pt->ABlockTransferSrcAccessOrder[0]) + "," +
std::to_string(pt->ABlockTransferSrcAccessOrder[1]) + "," +
std::to_string(pt->ABlockTransferSrcAccessOrder[2]);
out +=
" -DCK_PARAM_ABlockTransferSrcVectorDim=" + std::to_string(pt->ABlockTransferSrcVectorDim);
out += " -DCK_PARAM_ABlockTransferSrcScalarPerVector=" +
std::to_string(pt->ABlockTransferSrcScalarPerVector);
out += " -DCK_PARAM_ABlockTransferDstScalarPerVector_M1=" +
std::to_string(pt->ABlockTransferDstScalarPerVector_M1);
out += " -DCK_PARAM_AThreadTransferSrcResetCoordinateAfterRun=" +
std::to_string(pt->AThreadTransferSrcResetCoordinateAfterRun);
out += " -DCK_PARAM_BBlockTransferThreadSliceLengths_K_N0_N1=" +
std::to_string(pt->BBlockTransferThreadSliceLengths_K_N0_N1[0]) + "," +
std::to_string(pt->BBlockTransferThreadSliceLengths_K_N0_N1[1]) + "," +
std::to_string(pt->BBlockTransferThreadSliceLengths_K_N0_N1[2]);
out += " -DCK_PARAM_BBlockTransferThreadClusterLengths_K_N0_N1=" +
std::to_string(pt->BBlockTransferThreadClusterLengths_K_N0_N1[0]) + "," +
std::to_string(pt->BBlockTransferThreadClusterLengths_K_N0_N1[1]) + "," +
std::to_string(pt->BBlockTransferThreadClusterLengths_K_N0_N1[2]);
out += " -DCK_PARAM_BBlockTransferThreadClusterArrangeOrder=" +
std::to_string(pt->BBlockTransferThreadClusterArrangeOrder[0]) + "," +
std::to_string(pt->BBlockTransferThreadClusterArrangeOrder[1]) + "," +
std::to_string(pt->BBlockTransferThreadClusterArrangeOrder[2]);
out += " -DCK_PARAM_BBlockTransferSrcAccessOrder=" +
std::to_string(pt->BBlockTransferSrcAccessOrder[0]) + "," +
std::to_string(pt->BBlockTransferSrcAccessOrder[1]) + "," +
std::to_string(pt->BBlockTransferSrcAccessOrder[2]);
out +=
" -DCK_PARAM_BBlockTransferSrcVectorDim=" + std::to_string(pt->BBlockTransferSrcVectorDim);
out += " -DCK_PARAM_BBlockTransferSrcScalarPerVector=" +
std::to_string(pt->BBlockTransferSrcScalarPerVector);
out += " -DCK_PARAM_BBlockTransferDstScalarPerVector_N1=" +
std::to_string(pt->BBlockTransferDstScalarPerVector_N1);
out += " -DCK_PARAM_BThreadTransferSrcResetCoordinateAfterRun=" +
std::to_string(pt->BThreadTransferSrcResetCoordinateAfterRun);
out += " -DCK_PARAM_CThreadTransferSrcDstAccessOrder=" +
std::to_string(pt->CThreadTransferSrcDstAccessOrder[0]) + "," +
std::to_string(pt->CThreadTransferSrcDstAccessOrder[1]) + "," +
std::to_string(pt->CThreadTransferSrcDstAccessOrder[2]) + "," +
std::to_string(pt->CThreadTransferSrcDstAccessOrder[3]) + "," +
std::to_string(pt->CThreadTransferSrcDstAccessOrder[4]) + "," +
std::to_string(pt->CThreadTransferSrcDstAccessOrder[5]);
out += " -DCK_PARAM_CThreadTransferSrcDstVectorDim=" +
std::to_string(pt->CThreadTransferSrcDstVectorDim);
out += " -DCK_PARAM_CThreadTransferDstScalarPerVector=" +
std::to_string(pt->CThreadTransferDstScalarPerVector);
return (out);
};
} // namespace detail_dyn_conv_fwd_v4r4_nchw_kcyx_nkhw
// Host-side driver for the online-compiled dynamic conv-fwd v4r4
// (input NCHW, weight KCYX, output NKHW) implicit-GEMM implementation.
//
// Per iteration two kernels are launched through the olCompile handle:
//   1) "<kernel>_prepare" computes the transformed grid descriptors on the
//      device and writes them into a 4 KiB workspace (four 1024-byte slots),
//   2) the convolution kernel itself, which consumes those descriptors.
// Kernel times are collected over 'nrepeat' iterations; the averages and the
// resulting TFlop/s are printed, and the result is copied back to the host.
template <typename TInWei,
          typename TAcc,
          typename TOut,
          typename InLengths,
          typename WeiLengths,
          typename OutLengths,
          typename ConvStrides,
          typename ConvDilations,
          typename InLeftPads,
          typename InRightPads>
void device_dynamic_convolution_forward_implicit_gemm_v4r4_nchw_kcyx_nkhw_olc(
    olCompile::Handle* handle,
    const InLengths& in_n_c_hi_wi_lengths,
    const WeiLengths& wei_k_c_y_x_lengths,
    const OutLengths& out_n_k_ho_wo_lengths,
    const ConvStrides& conv_strides,
    const ConvDilations& conv_dilations,
    const InLeftPads& in_left_pads,
    const InRightPads& in_right_pads,
    const Tensor<TInWei>& in_n_c_hi_wi,
    const Tensor<TInWei>& wei_k_c_y_x,
    Tensor<TOut>& out_n_k_ho_wo,
    const tunable_dyn_conv_fwd_v4r4_nchw_kcyx_nkhw* tunable,
    ck::index_t nrepeat)
{
    using namespace ck;
    using namespace detail_dyn_conv_fwd_v4r4_nchw_kcyx_nkhw;

    using size_t = std::size_t;

    /////////////////////////////////////////////////////////////////////////////////////////////////////////////
    // The follow codes are only used for computing the grid_size, hasMainKBlockLoop,
    // hasDoubleTailKBlockLoop
    constexpr auto I0 = Number<0>{};
    constexpr auto I1 = Number<1>{};
    constexpr auto I2 = Number<2>{};
    constexpr auto I3 = Number<3>{};

    const auto in_n_c_hi_wi_desc =
        make_dynamic_naive_tensor_descriptor_packed_v2(in_n_c_hi_wi_lengths);
    const auto wei_k_c_y_x_desc =
        make_dynamic_naive_tensor_descriptor_packed_v2(wei_k_c_y_x_lengths);
    const auto out_n_k_ho_wo_desc =
        make_dynamic_naive_tensor_descriptor_packed_v2(out_n_k_ho_wo_lengths);

    // Re-derive the GEMM view of the convolution on the host only to obtain
    // M, N, K; the device "prepare" kernel builds the same descriptors again.
    const auto descs =
        transform_forward_convolution_into_gemm_v4r4_nchw_kcyx_nkhw_pad(wei_k_c_y_x_desc,
                                                                        in_n_c_hi_wi_desc,
                                                                        out_n_k_ho_wo_desc,
                                                                        conv_strides,
                                                                        conv_dilations,
                                                                        in_left_pads,
                                                                        in_right_pads);

    const auto a_k_m_grid_desc = descs[I0];
    const auto c_m_n_grid_desc = descs[I2];

    const auto M = c_m_n_grid_desc.GetLength(I0);
    const auto N = c_m_n_grid_desc.GetLength(I1);
    const auto K = a_k_m_grid_desc.GetLength(I0);

    // NOTE(review): assumes M and N are divisible by the per-block tile sizes;
    // the integer division would silently truncate otherwise -- confirm callers
    // only pass compatible problem sizes.
    const index_t grid_size = (M / tunable->MPerBlock) * (N / tunable->NPerBlock);

    const bool hasMainKBlockLoop = ((K + tunable->KPerBlock) / (2 * tunable->KPerBlock) > 1);

    const bool hasDoubleTailKBlockLoop = ((K / tunable->KPerBlock) % 2 == 0);

    /////////////////////////////////////////////////////////////////////////////////////////////////////////////

    // these buffers are usually provided by the user application
    DeviceMem in_n_c_hi_wi_dev_buf(sizeof(TInWei) * in_n_c_hi_wi.mDesc.GetElementSpace());
    DeviceMem wei_k_c_y_x_dev_buf(sizeof(TInWei) * wei_k_c_y_x.mDesc.GetElementSpace());
    DeviceMem out_n_k_ho_wo_dev_buf(sizeof(TOut) * out_n_k_ho_wo.mDesc.GetElementSpace());

    in_n_c_hi_wi_dev_buf.ToDevice(in_n_c_hi_wi.mData.data());
    wei_k_c_y_x_dev_buf.ToDevice(wei_k_c_y_x.mData.data());
    out_n_k_ho_wo_dev_buf.ToDevice(out_n_k_ho_wo.mData.data());

    // these are workspace buffers that should be expressed to the user by the corresponding
    // workspace API
    // 4 KiB workspace split into four 1024-byte slots, one per device-side
    // descriptor object written by the "prepare" kernel.
    DeviceMem workspace_buf(4096);

    void* a_k_m0_m1_grid_desc_dev_buf = workspace_buf.GetDeviceBuffer();
    void* b_k_n0_n1_grid_desc_dev_buf =
        static_cast<void*>(static_cast<unsigned char*>(workspace_buf.GetDeviceBuffer()) + 1024);
    void* c_m0_m10_m11_n0_n10_n11_grid_desc_dev_buf =
        static_cast<void*>(static_cast<unsigned char*>(workspace_buf.GetDeviceBuffer()) + 2048);
    void* c_blockid_to_m0_n0_block_cluster_adaptor_dev_buf =
        static_cast<void*>(static_cast<unsigned char*>(workspace_buf.GetDeviceBuffer()) + 3072);

    // launch configuration: prepare kernel runs a single workgroup, the conv
    // kernel runs grid_size workgroups of BlockSize threads
    const std::vector<size_t> vld  = {static_cast<size_t>(tunable->BlockSize), 1, 1};
    const std::vector<size_t> vgd1 = {static_cast<size_t>(tunable->BlockSize), 1, 1};
    const std::vector<size_t> vgd2 = {static_cast<size_t>(grid_size * tunable->BlockSize), 1, 1};

    std::string program_name = "dynamic_convolution_forward_implicit_gemm_v4r4_nchw_kcyx_nkhw.cpp";
    std::string algo_name    = "implicit_gemm_conv_fwd_v4r4_nchw";

    std::string param = " -std=c++17 ";
    std::string network_config;

    // all type/tunable parameters are passed to the kernel as -D definitions;
    // network_config is the corresponding cache key
    param += get_definition_string_from_types<TInWei, TAcc, TOut>() + " " +
             get_definition_string_from_tunable(tunable) +
             " -DCK_PARAM_HAS_MAIN_KBLOCK_LOOP=" + std::to_string(hasMainKBlockLoop) +
             " -DCK_PARAM_HAS_DOUBLE_TAIL_KBLOCK_LOOP=" + std::to_string(hasDoubleTailKBlockLoop);

    network_config = get_network_config_string_from_types<TInWei, TAcc, TOut>() + "_" +
                     get_network_config_string_from_tunable(tunable) + "_" +
                     std::to_string(hasMainKBlockLoop) + "_" +
                     std::to_string(hasDoubleTailKBlockLoop);

    std::vector<float> kernel1_times;
    std::vector<float> kernel2_times;

    // NOTE(review): the first iteration presumably includes the online
    // compilation cost; get_effective_average() later discards the largest
    // sample, which compensates for that.
    for(index_t i = 0; i < nrepeat; ++i)
    {
        KernelTimer timer1, timer2;
        std::string kernel_name;

        // kernel 1: build the transformed descriptors on the device
        kernel_name = "dynamic_convolution_forward_implicit_gemm_v4r4_nchw_kcyx_nkhw_prepare";
        auto network_config_1 = network_config + "_1";

        timer1.Start();
        handle->AddKernel(algo_name, network_config_1, program_name, kernel_name, vld, vgd1, param)(
            static_cast<index_t>(in_n_c_hi_wi_lengths[I0]),
            static_cast<index_t>(in_n_c_hi_wi_lengths[I1]),
            static_cast<index_t>(in_n_c_hi_wi_lengths[I2]),
            static_cast<index_t>(in_n_c_hi_wi_lengths[I3]),
            static_cast<index_t>(wei_k_c_y_x_lengths[I0]),
            static_cast<index_t>(wei_k_c_y_x_lengths[I2]),
            static_cast<index_t>(wei_k_c_y_x_lengths[I3]),
            conv_strides[I0],
            conv_strides[I1],
            conv_dilations[I0],
            conv_dilations[I1],
            in_left_pads[I0],
            in_left_pads[I1],
            in_right_pads[I0],
            in_right_pads[I1],
            a_k_m0_m1_grid_desc_dev_buf,
            b_k_n0_n1_grid_desc_dev_buf,
            c_m0_m10_m11_n0_n10_n11_grid_desc_dev_buf,
            c_blockid_to_m0_n0_block_cluster_adaptor_dev_buf);
        timer1.End();

        // kernel 2: the implicit-GEMM convolution itself
        kernel_name = "dynamic_convolution_forward_implicit_gemm_v4r4_nchw_kcyx_nkhw";
        auto network_config_2 = network_config + "_2";

        timer2.Start();
        handle->AddKernel(algo_name, network_config_2, program_name, kernel_name, vld, vgd2, param)(
            reinterpret_cast<const TInWei*>(wei_k_c_y_x_dev_buf.GetDeviceBuffer()),
            reinterpret_cast<const TInWei*>(in_n_c_hi_wi_dev_buf.GetDeviceBuffer()),
            reinterpret_cast<TOut*>(out_n_k_ho_wo_dev_buf.GetDeviceBuffer()),
            (const void*)(a_k_m0_m1_grid_desc_dev_buf),
            (const void*)(b_k_n0_n1_grid_desc_dev_buf),
            (const void*)(c_m0_m10_m11_n0_n10_n11_grid_desc_dev_buf),
            (const void*)(c_blockid_to_m0_n0_block_cluster_adaptor_dev_buf));
        timer2.End();

        kernel1_times.push_back(timer1.GetElapsedTime());
        kernel2_times.push_back(timer2.GetElapsedTime());
    }

    {
        auto ave_time1 = Driver::get_effective_average(kernel1_times);
        auto ave_time2 = Driver::get_effective_average(kernel2_times);

        const auto N  = in_n_c_hi_wi_lengths[I0];
        const auto C  = in_n_c_hi_wi_lengths[I1];
        const auto K  = out_n_k_ho_wo_lengths[I1];
        const auto Ho = out_n_k_ho_wo_lengths[I2];
        const auto Wo = out_n_k_ho_wo_lengths[I3];
        const auto Y  = wei_k_c_y_x_lengths[I2];
        const auto X  = wei_k_c_y_x_lengths[I3];

        // FLOP count is 2*N*K*Ho*Wo*C*Y*X; dividing by 1e9 and by milliseconds
        // yields TFlop/s
        float perf = (float)(std::size_t(2) * N * K * Ho * Wo * C * Y * X) /
                     (std::size_t(1000) * 1000 * 1000) / (ave_time1 + ave_time2);

        std::cout << "Average time : " << ave_time1 + ave_time2 << " ms(" << ave_time1 << ", "
                  << ave_time2 << "), " << perf << " TFlop/s" << std::endl;
    };

    // copy result back to host
    out_n_k_ho_wo_dev_buf.FromDevice(out_n_k_ho_wo.mData.data());
}

View File

@@ -0,0 +1,404 @@
#include "device.hpp"
#include "host_tensor.hpp"
#include "dynamic_tensor_descriptor.hpp"
#include "dynamic_tensor_descriptor_helper.hpp"
#include "transform_forward_convolution_into_gemm_v4r5_nchw_kcyx_nkhw.hpp"
#include "olc_driver_common.hpp"
#include "conv_tunables.hpp"
#include "handle.hpp"
namespace detail_dyn_conv_fwd_v4r5_nchw_kcyx_nkhw {
// Produces a short per-type tag (one character per type-id, e.g. "FFF" for all
// float) that is used as part of the kernel cache / network-config key.
//
// Bug fix: the previous implementation wrote `charA + charB + charC`, which
// promotes to int and appends a SINGLE character holding the (truncated) sum of
// the three ids -- every type combination with the same id sum collided on one
// garbage tag. Each id must be appended individually.
template <typename TInWei, typename TAcc, typename TOut>
static std::string get_network_config_string_from_types()
{
    std::string out;

    out += static_cast<char>(Driver::get_typeid_from_type<TInWei>());
    out += static_cast<char>(Driver::get_typeid_from_type<TAcc>());
    out += static_cast<char>(Driver::get_typeid_from_type<TOut>());

    return (out);
};
// Encodes every field of the v4r5 tunable into a compact string used (together
// with the type tag) as the kernel cache / network-config key. Groups are
// separated by '_' and elements inside a group by 'x'.
//
// Bug fix: the BBlockTransferThreadSliceLengths group was missing its trailing
// '_' separator, so two different tunables could serialize to the same key and
// alias each other in the kernel cache.
static std::string
get_network_config_string_from_tunable(const tunable_dyn_conv_fwd_v4r5_nchw_kcyx_nkhw* pt)
{
    std::string out("TUN_");

    out += std::to_string(pt->BlockSize) + "_";

    out += std::to_string(pt->GM1PerBlockGM11) + "x" + std::to_string(pt->GN1PerBlockGN11) + "x" +
           std::to_string(pt->KPerBlock) + "_";

    out += std::to_string(pt->M1PerThread) + "x" + std::to_string(pt->N1PerThread) + "x" +
           std::to_string(pt->KPerThread) + "_";

    out += std::to_string(pt->M1N1ThreadClusterM10) + "x" +
           std::to_string(pt->M1N1ThreadClusterN10) + "x" +
           std::to_string(pt->M1N1ThreadClusterM11) + "x" +
           std::to_string(pt->M1N1ThreadClusterN11) + "_";

    // A-side (weight) block-transfer parameters
    out += std::to_string(pt->ABlockTransferThreadSliceLengths_GK_GM0_GM10_GM11[0]) + "x" +
           std::to_string(pt->ABlockTransferThreadSliceLengths_GK_GM0_GM10_GM11[1]) + "x" +
           std::to_string(pt->ABlockTransferThreadSliceLengths_GK_GM0_GM10_GM11[2]) + "x" +
           std::to_string(pt->ABlockTransferThreadSliceLengths_GK_GM0_GM10_GM11[3]) + "_";

    out += std::to_string(pt->ABlockTransferThreadClusterLengths_GK_GM0_GM10_GM11[0]) + "x" +
           std::to_string(pt->ABlockTransferThreadClusterLengths_GK_GM0_GM10_GM11[1]) + "x" +
           std::to_string(pt->ABlockTransferThreadClusterLengths_GK_GM0_GM10_GM11[2]) + "x" +
           std::to_string(pt->ABlockTransferThreadClusterLengths_GK_GM0_GM10_GM11[3]) + "_";

    out += std::to_string(pt->ABlockTransferThreadClusterArrangeOrder[0]) + "x" +
           std::to_string(pt->ABlockTransferThreadClusterArrangeOrder[1]) + "x" +
           std::to_string(pt->ABlockTransferThreadClusterArrangeOrder[2]) + "x" +
           std::to_string(pt->ABlockTransferThreadClusterArrangeOrder[3]) + "_";

    out += std::to_string(pt->ABlockTransferSrcAccessOrder[0]) + "x" +
           std::to_string(pt->ABlockTransferSrcAccessOrder[1]) + "x" +
           std::to_string(pt->ABlockTransferSrcAccessOrder[2]) + "x" +
           std::to_string(pt->ABlockTransferSrcAccessOrder[3]) + "_";

    out += std::to_string(pt->ABlockTransferSrcVectorDim) + "_";
    out += std::to_string(pt->ABlockTransferSrcScalarPerVector) + "_";
    out += std::to_string(pt->ABlockTransferDstScalarPerVector_GM11) + "_";
    out += std::to_string(pt->AThreadTransferSrcResetCoordinateAfterRun) + "_";

    // B-side (input) block-transfer parameters
    // (fixed: trailing "_" was missing after this group)
    out += std::to_string(pt->BBlockTransferThreadSliceLengths_GK_GN0_GN10_GN11[0]) + "x" +
           std::to_string(pt->BBlockTransferThreadSliceLengths_GK_GN0_GN10_GN11[1]) + "x" +
           std::to_string(pt->BBlockTransferThreadSliceLengths_GK_GN0_GN10_GN11[2]) + "x" +
           std::to_string(pt->BBlockTransferThreadSliceLengths_GK_GN0_GN10_GN11[3]) + "_";

    out += std::to_string(pt->BBlockTransferThreadClusterLengths_GK_GN0_GN10_GN11[0]) + "x" +
           std::to_string(pt->BBlockTransferThreadClusterLengths_GK_GN0_GN10_GN11[1]) + "x" +
           std::to_string(pt->BBlockTransferThreadClusterLengths_GK_GN0_GN10_GN11[2]) + "x" +
           std::to_string(pt->BBlockTransferThreadClusterLengths_GK_GN0_GN10_GN11[3]) + "_";

    out += std::to_string(pt->BBlockTransferThreadClusterArrangeOrder[0]) + "x" +
           std::to_string(pt->BBlockTransferThreadClusterArrangeOrder[1]) + "x" +
           std::to_string(pt->BBlockTransferThreadClusterArrangeOrder[2]) + "x" +
           std::to_string(pt->BBlockTransferThreadClusterArrangeOrder[3]) + "_";

    out += std::to_string(pt->BBlockTransferSrcAccessOrder[0]) + "x" +
           std::to_string(pt->BBlockTransferSrcAccessOrder[1]) + "x" +
           std::to_string(pt->BBlockTransferSrcAccessOrder[2]) + "x" +
           std::to_string(pt->BBlockTransferSrcAccessOrder[3]) + "_";

    out += std::to_string(pt->BBlockTransferSrcVectorDim) + "_";
    out += std::to_string(pt->BBlockTransferSrcScalarPerVector) + "_";
    out += std::to_string(pt->BBlockTransferDstScalarPerVector_GN11) + "_";
    out += std::to_string(pt->BThreadTransferSrcResetCoordinateAfterRun) + "_";

    // C-side (output) thread-transfer parameters
    out += std::to_string(pt->CThreadTransferSrcDstAccessOrder[0]) + "x" +
           std::to_string(pt->CThreadTransferSrcDstAccessOrder[1]) + "x" +
           std::to_string(pt->CThreadTransferSrcDstAccessOrder[2]) + "x" +
           std::to_string(pt->CThreadTransferSrcDstAccessOrder[3]) + "x" +
           std::to_string(pt->CThreadTransferSrcDstAccessOrder[4]) + "x" +
           std::to_string(pt->CThreadTransferSrcDstAccessOrder[5]) + "_";

    out += std::to_string(pt->CThreadTransferSrcDstVectorDim) + "_";
    out += std::to_string(pt->CThreadTransferDstScalarPerVector);

    return (out);
};
// Builds the "-D" compile definitions carrying the in/weight, accumulator and
// output data types (as integer type-ids) into the online-compiled kernel.
template <typename TInWei, typename TAcc, typename TOut>
static std::string get_definition_string_from_types()
{
    const std::string in_wei_id = std::to_string(Driver::get_typeid_from_type<TInWei>());
    const std::string acc_id    = std::to_string(Driver::get_typeid_from_type<TAcc>());
    const std::string out_id    = std::to_string(Driver::get_typeid_from_type<TOut>());

    return " -DCK_PARAM_IN_WEI_DATATYPE=" + in_wei_id + " -DCK_PARAM_CONV_COMPTYPE=" + acc_id +
           " -DCK_PARAM_OUT_DATATYPE=" + out_id;
};
// Translates every field of the v4r5 tunable into a "-DCK_PARAM_<name>=<value>"
// compile definition consumed by the online-compiled kernel source. Array-valued
// parameters are emitted as comma-separated lists.
static std::string
get_definition_string_from_tunable(const tunable_dyn_conv_fwd_v4r5_nchw_kcyx_nkhw* pt)
{
    std::string out;

    // block / thread tile sizes
    out += " -DCK_PARAM_BlockSize=" + std::to_string(pt->BlockSize);

    out += " -DCK_PARAM_GM1PerBlockGM11=" + std::to_string(pt->GM1PerBlockGM11) +
           " -DCK_PARAM_GN1PerBlockGN11=" + std::to_string(pt->GN1PerBlockGN11) +
           " -DCK_PARAM_KPerBlock=" + std::to_string(pt->KPerBlock);

    out += " -DCK_PARAM_M1PerThread=" + std::to_string(pt->M1PerThread) +
           " -DCK_PARAM_N1PerThread=" + std::to_string(pt->N1PerThread) +
           " -DCK_PARAM_KPerThread=" + std::to_string(pt->KPerThread);

    out += " -DCK_PARAM_M1N1ThreadClusterM10=" + std::to_string(pt->M1N1ThreadClusterM10) +
           " -DCK_PARAM_M1N1ThreadClusterN10=" + std::to_string(pt->M1N1ThreadClusterN10) +
           " -DCK_PARAM_M1N1ThreadClusterM11=" + std::to_string(pt->M1N1ThreadClusterM11) +
           " -DCK_PARAM_M1N1ThreadClusterN11=" + std::to_string(pt->M1N1ThreadClusterN11);

    // A-side (weight) block-transfer parameters
    out += " -DCK_PARAM_ABlockTransferThreadSliceLengths_GK_GM0_GM10_GM11=" +
           std::to_string(pt->ABlockTransferThreadSliceLengths_GK_GM0_GM10_GM11[0]) + "," +
           std::to_string(pt->ABlockTransferThreadSliceLengths_GK_GM0_GM10_GM11[1]) + "," +
           std::to_string(pt->ABlockTransferThreadSliceLengths_GK_GM0_GM10_GM11[2]) + "," +
           std::to_string(pt->ABlockTransferThreadSliceLengths_GK_GM0_GM10_GM11[3]);

    out += " -DCK_PARAM_ABlockTransferThreadClusterLengths_GK_GM0_GM10_GM11=" +
           std::to_string(pt->ABlockTransferThreadClusterLengths_GK_GM0_GM10_GM11[0]) + "," +
           std::to_string(pt->ABlockTransferThreadClusterLengths_GK_GM0_GM10_GM11[1]) + "," +
           std::to_string(pt->ABlockTransferThreadClusterLengths_GK_GM0_GM10_GM11[2]) + "," +
           std::to_string(pt->ABlockTransferThreadClusterLengths_GK_GM0_GM10_GM11[3]);

    out += " -DCK_PARAM_ABlockTransferThreadClusterArrangeOrder=" +
           std::to_string(pt->ABlockTransferThreadClusterArrangeOrder[0]) + "," +
           std::to_string(pt->ABlockTransferThreadClusterArrangeOrder[1]) + "," +
           std::to_string(pt->ABlockTransferThreadClusterArrangeOrder[2]) + "," +
           std::to_string(pt->ABlockTransferThreadClusterArrangeOrder[3]);

    out += " -DCK_PARAM_ABlockTransferSrcAccessOrder=" +
           std::to_string(pt->ABlockTransferSrcAccessOrder[0]) + "," +
           std::to_string(pt->ABlockTransferSrcAccessOrder[1]) + "," +
           std::to_string(pt->ABlockTransferSrcAccessOrder[2]) + "," +
           std::to_string(pt->ABlockTransferSrcAccessOrder[3]);

    out +=
        " -DCK_PARAM_ABlockTransferSrcVectorDim=" + std::to_string(pt->ABlockTransferSrcVectorDim);
    out += " -DCK_PARAM_ABlockTransferSrcScalarPerVector=" +
           std::to_string(pt->ABlockTransferSrcScalarPerVector);
    out += " -DCK_PARAM_ABlockTransferDstScalarPerVector_GM11=" +
           std::to_string(pt->ABlockTransferDstScalarPerVector_GM11);
    out += " -DCK_PARAM_AThreadTransferSrcResetCoordinateAfterRun=" +
           std::to_string(pt->AThreadTransferSrcResetCoordinateAfterRun);

    // B-side (input) block-transfer parameters
    out += " -DCK_PARAM_BBlockTransferThreadSliceLengths_GK_GN0_GN10_GN11=" +
           std::to_string(pt->BBlockTransferThreadSliceLengths_GK_GN0_GN10_GN11[0]) + "," +
           std::to_string(pt->BBlockTransferThreadSliceLengths_GK_GN0_GN10_GN11[1]) + "," +
           std::to_string(pt->BBlockTransferThreadSliceLengths_GK_GN0_GN10_GN11[2]) + "," +
           std::to_string(pt->BBlockTransferThreadSliceLengths_GK_GN0_GN10_GN11[3]);

    out += " -DCK_PARAM_BBlockTransferThreadClusterLengths_GK_GN0_GN10_GN11=" +
           std::to_string(pt->BBlockTransferThreadClusterLengths_GK_GN0_GN10_GN11[0]) + "," +
           std::to_string(pt->BBlockTransferThreadClusterLengths_GK_GN0_GN10_GN11[1]) + "," +
           std::to_string(pt->BBlockTransferThreadClusterLengths_GK_GN0_GN10_GN11[2]) + "," +
           std::to_string(pt->BBlockTransferThreadClusterLengths_GK_GN0_GN10_GN11[3]);

    out += " -DCK_PARAM_BBlockTransferThreadClusterArrangeOrder=" +
           std::to_string(pt->BBlockTransferThreadClusterArrangeOrder[0]) + "," +
           std::to_string(pt->BBlockTransferThreadClusterArrangeOrder[1]) + "," +
           std::to_string(pt->BBlockTransferThreadClusterArrangeOrder[2]) + "," +
           std::to_string(pt->BBlockTransferThreadClusterArrangeOrder[3]);

    out += " -DCK_PARAM_BBlockTransferSrcAccessOrder=" +
           std::to_string(pt->BBlockTransferSrcAccessOrder[0]) + "," +
           std::to_string(pt->BBlockTransferSrcAccessOrder[1]) + "," +
           std::to_string(pt->BBlockTransferSrcAccessOrder[2]) + "," +
           std::to_string(pt->BBlockTransferSrcAccessOrder[3]);

    out +=
        " -DCK_PARAM_BBlockTransferSrcVectorDim=" + std::to_string(pt->BBlockTransferSrcVectorDim);
    out += " -DCK_PARAM_BBlockTransferSrcScalarPerVector=" +
           std::to_string(pt->BBlockTransferSrcScalarPerVector);
    out += " -DCK_PARAM_BBlockTransferDstScalarPerVector_GN11=" +
           std::to_string(pt->BBlockTransferDstScalarPerVector_GN11);
    out += " -DCK_PARAM_BThreadTransferSrcResetCoordinateAfterRun=" +
           std::to_string(pt->BThreadTransferSrcResetCoordinateAfterRun);

    // C-side (output) thread-transfer parameters
    out += " -DCK_PARAM_CThreadTransferSrcDstAccessOrder=" +
           std::to_string(pt->CThreadTransferSrcDstAccessOrder[0]) + "," +
           std::to_string(pt->CThreadTransferSrcDstAccessOrder[1]) + "," +
           std::to_string(pt->CThreadTransferSrcDstAccessOrder[2]) + "," +
           std::to_string(pt->CThreadTransferSrcDstAccessOrder[3]) + "," +
           std::to_string(pt->CThreadTransferSrcDstAccessOrder[4]) + "," +
           std::to_string(pt->CThreadTransferSrcDstAccessOrder[5]);

    out += " -DCK_PARAM_CThreadTransferSrcDstVectorDim=" +
           std::to_string(pt->CThreadTransferSrcDstVectorDim);
    out += " -DCK_PARAM_CThreadTransferDstScalarPerVector=" +
           std::to_string(pt->CThreadTransferDstScalarPerVector);

    return (out);
};
} // namespace detail_dyn_conv_fwd_v4r5_nchw_kcyx_nkhw
// Host-side driver for the online-compiled dynamic conv-fwd v4r5
// (input NCHW, weight KCYX, output NKHW) implicit-GEMM/contraction
// implementation. Mirrors the v4r4 driver: kernel 1 ("..._prepare") builds the
// transformed grid descriptors on the device into a 4 KiB workspace, kernel 2
// performs the convolution; timings over 'nrepeat' iterations are reported.
//
// Bug fixes relative to the original:
//  - timer1 was never stopped (timer2.End() was called instead), so both
//    reported kernel times were wrong,
//  - algo_name was the v4r4 string, copy-pasted from the v4r4 driver.
template <typename TInWei,
          typename TAcc,
          typename TOut,
          typename InLengths,
          typename WeiLengths,
          typename OutLengths,
          typename ConvStrides,
          typename ConvDilations,
          typename InLeftPads,
          typename InRightPads>
void device_dynamic_convolution_forward_implicit_gemm_v4r5_nchw_kcyx_nkhw_olc(
    olCompile::Handle* handle,
    const InLengths& in_n_c_hi_wi_lengths,
    const WeiLengths& wei_k_c_y_x_lengths,
    const OutLengths& out_n_k_ho_wo_lengths,
    const ConvStrides& conv_strides,
    const ConvDilations& conv_dilations,
    const InLeftPads& in_left_pads,
    const InRightPads& in_right_pads,
    const Tensor<TInWei>& in_n_c_hi_wi,
    const Tensor<TInWei>& wei_k_c_y_x,
    Tensor<TOut>& out_n_k_ho_wo,
    const tunable_dyn_conv_fwd_v4r5_nchw_kcyx_nkhw* tunable,
    ck::index_t nrepeat)
{
    using namespace ck;
    using namespace detail_dyn_conv_fwd_v4r5_nchw_kcyx_nkhw;

    using size_t = std::size_t;

    constexpr index_t N0 = 4; // this could not be a tunable so far

    ////////////////////////////////////////////////////////////////////////////////////////////////////////////
    // The follow codes are only used for computing the grid_size, hasMainKBlockLoop,
    // hasDoubleTailKBlockLoop
    constexpr auto I0 = Number<0>{};
    constexpr auto I1 = Number<1>{};
    constexpr auto I2 = Number<2>{};
    constexpr auto I3 = Number<3>{};

    const auto in_n_c_hi_wi_desc =
        make_dynamic_naive_tensor_descriptor_packed_v2(in_n_c_hi_wi_lengths);
    const auto wei_k_c_y_x_desc =
        make_dynamic_naive_tensor_descriptor_packed_v2(wei_k_c_y_x_lengths);
    const auto out_n_k_ho_wo_desc =
        make_dynamic_naive_tensor_descriptor_packed_v2(out_n_k_ho_wo_lengths);

    const auto descs = transform_forward_convolution_into_contraction_v4r5_nchw_kcyx_nkhw_pad<N0>(
        wei_k_c_y_x_desc,
        in_n_c_hi_wi_desc,
        out_n_k_ho_wo_desc,
        conv_strides,
        conv_dilations,
        in_left_pads,
        in_right_pads);

    const auto a_gk_gm0_gm1_grid_desc     = descs[I0];
    const auto c_gm0_gm1_gn0_gn1_grid_desc = descs[I2];

    const auto GM1 = c_gm0_gm1_gn0_gn1_grid_desc.GetLength(I1);
    const auto GN1 = c_gm0_gm1_gn0_gn1_grid_desc.GetLength(I3);
    const auto GK  = a_gk_gm0_gm1_grid_desc.GetLength(I0);

    // NOTE(review): assumes GM1/GN1 are divisible by the per-block sizes
    const index_t grid_size = (GM1 / tunable->GM1PerBlockGM11) * (GN1 / tunable->GN1PerBlockGN11);

    const bool hasMainKBlockLoop = ((GK + tunable->KPerBlock) / (2 * tunable->KPerBlock) > 1);

    const bool hasDoubleTailKBlockLoop = ((GK / tunable->KPerBlock) % 2 == 0);

    ///////////////////////////////////////////////////////////////////////////////////////////////////////////

    // these buffers are usually provided by the user application
    DeviceMem in_n_c_hi_wi_dev_buf(sizeof(TInWei) * in_n_c_hi_wi.mDesc.GetElementSpace());
    DeviceMem wei_k_c_y_x_dev_buf(sizeof(TInWei) * wei_k_c_y_x.mDesc.GetElementSpace());
    DeviceMem out_n_k_ho_wo_dev_buf(sizeof(TOut) * out_n_k_ho_wo.mDesc.GetElementSpace());

    in_n_c_hi_wi_dev_buf.ToDevice(in_n_c_hi_wi.mData.data());
    wei_k_c_y_x_dev_buf.ToDevice(wei_k_c_y_x.mData.data());
    out_n_k_ho_wo_dev_buf.ToDevice(out_n_k_ho_wo.mData.data());

    // these are workspace buffers that should be expressed to the user by the corresponding
    // workspace API; 4 KiB split into four 1024-byte descriptor slots
    DeviceMem workspace_buf(4096);

    void* a_gk_gm0_gm10_gm11_grid_desc_dev_buf = workspace_buf.GetDeviceBuffer();
    void* b_gk_gn0_gn10_gn11_grid_desc_dev_buf =
        static_cast<void*>(static_cast<unsigned char*>(workspace_buf.GetDeviceBuffer()) + 1024);
    void* c_gm10_bm0_bm1_gn10_bn0_bn1_grid_desc_dev_buf =
        static_cast<void*>(static_cast<unsigned char*>(workspace_buf.GetDeviceBuffer()) + 2048);
    void* c_blockid_to_gm10_gn10_block_cluster_adaptor_dev_buf =
        static_cast<void*>(static_cast<unsigned char*>(workspace_buf.GetDeviceBuffer()) + 3072);

    const std::vector<size_t> vld  = {static_cast<size_t>(tunable->BlockSize), 1, 1};
    const std::vector<size_t> vgd1 = {static_cast<size_t>(tunable->BlockSize), 1, 1};
    const std::vector<size_t> vgd2 = {static_cast<size_t>(grid_size * tunable->BlockSize), 1, 1};

    std::string program_name = "dynamic_convolution_forward_implicit_gemm_v4r5_nchw_kcyx_nkhw.cpp";
    // was "implicit_gemm_conv_fwd_v4r4_nchw" -- copy-paste from the v4r4
    // driver; use the v4r5 name so cache entries of the two algorithms cannot
    // be confused
    std::string algo_name = "implicit_gemm_conv_fwd_v4r5_nchw";

    std::string param = " -std=c++17 ";
    std::string network_config;

    param += get_definition_string_from_types<TInWei, TAcc, TOut>() +
             " -DCK_PARAM_HAS_MAIN_KBLOCK_LOOP=" + std::to_string(hasMainKBlockLoop) +
             " -DCK_PARAM_HAS_DOUBLE_TAIL_KBLOCK_LOOP=" + std::to_string(hasDoubleTailKBlockLoop) +
             " -DCK_PARAM_N0=" + std::to_string(N0) + " " +
             get_definition_string_from_tunable(tunable);

    network_config = get_network_config_string_from_types<TInWei, TAcc, TOut>() + "_V" +
                     std::to_string(hasDoubleTailKBlockLoop) + "_" + std::to_string(N0) + "_" +
                     get_network_config_string_from_tunable(tunable);

    std::vector<float> kernel1_times;
    std::vector<float> kernel2_times;

    for(index_t i = 0; i < nrepeat; ++i)
    {
        KernelTimer timer1, timer2;
        std::string kernel_name;

        // kernel 1: build the transformed descriptors on the device
        kernel_name = "dynamic_convolution_forward_implicit_gemm_v4r5_nchw_kcyx_nkhw_prepare";
        auto network_config_1 = network_config + "_1";

        timer1.Start();
        handle->AddKernel(algo_name, network_config_1, program_name, kernel_name, vld, vgd1, param)(
            static_cast<index_t>(in_n_c_hi_wi_lengths[I0]),
            static_cast<index_t>(in_n_c_hi_wi_lengths[I1]),
            static_cast<index_t>(in_n_c_hi_wi_lengths[I2]),
            static_cast<index_t>(in_n_c_hi_wi_lengths[I3]),
            static_cast<index_t>(wei_k_c_y_x_lengths[I0]),
            static_cast<index_t>(wei_k_c_y_x_lengths[I2]),
            static_cast<index_t>(wei_k_c_y_x_lengths[I3]),
            conv_strides[I0],
            conv_strides[I1],
            conv_dilations[I0],
            conv_dilations[I1],
            in_left_pads[I0],
            in_left_pads[I1],
            in_right_pads[I0],
            in_right_pads[I1],
            a_gk_gm0_gm10_gm11_grid_desc_dev_buf,
            b_gk_gn0_gn10_gn11_grid_desc_dev_buf,
            c_gm10_bm0_bm1_gn10_bn0_bn1_grid_desc_dev_buf,
            c_blockid_to_gm10_gn10_block_cluster_adaptor_dev_buf);
        // was timer2.End(): timer1 was never stopped and timer2 was "ended"
        // before being started, corrupting both measurements
        timer1.End();

        // kernel 2: the contraction/convolution itself
        kernel_name = "dynamic_convolution_forward_implicit_gemm_v4r5_nchw_kcyx_nkhw";
        auto network_config_2 = network_config + "_2";

        timer2.Start();
        handle->AddKernel(algo_name, network_config_2, program_name, kernel_name, vld, vgd2, param)(
            reinterpret_cast<const TInWei*>(wei_k_c_y_x_dev_buf.GetDeviceBuffer()),
            reinterpret_cast<const TInWei*>(in_n_c_hi_wi_dev_buf.GetDeviceBuffer()),
            reinterpret_cast<TOut*>(out_n_k_ho_wo_dev_buf.GetDeviceBuffer()),
            (const void*)(a_gk_gm0_gm10_gm11_grid_desc_dev_buf),
            (const void*)(b_gk_gn0_gn10_gn11_grid_desc_dev_buf),
            (const void*)(c_gm10_bm0_bm1_gn10_bn0_bn1_grid_desc_dev_buf),
            (const void*)(c_blockid_to_gm10_gn10_block_cluster_adaptor_dev_buf));
        timer2.End();

        kernel1_times.push_back(timer1.GetElapsedTime());
        kernel2_times.push_back(timer2.GetElapsedTime());
    }

    {
        auto ave_time1 = Driver::get_effective_average(kernel1_times);
        auto ave_time2 = Driver::get_effective_average(kernel2_times);

        const auto N  = in_n_c_hi_wi_lengths[I0];
        const auto C  = in_n_c_hi_wi_lengths[I1];
        const auto K  = out_n_k_ho_wo_lengths[I1];
        const auto Ho = out_n_k_ho_wo_lengths[I2];
        const auto Wo = out_n_k_ho_wo_lengths[I3];
        const auto Y  = wei_k_c_y_x_lengths[I2];
        const auto X  = wei_k_c_y_x_lengths[I3];

        // FLOP count is 2*N*K*Ho*Wo*C*Y*X; dividing by 1e9 and by milliseconds
        // yields TFlop/s
        float perf = (float)(std::size_t(2) * N * K * Ho * Wo * C * Y * X) /
                     (std::size_t(1000) * 1000 * 1000) / (ave_time1 + ave_time2);

        std::cout << "Average time : " << ave_time1 + ave_time2 << " ms(" << ave_time1 << ", "
                  << ave_time2 << "), " << perf << " TFlop/s" << std::endl;
    };

    // copy result back to host
    out_n_k_ho_wo_dev_buf.FromDevice(out_n_k_ho_wo.mData.data());
}

View File

@@ -0,0 +1,114 @@
#ifndef OLC_DRIVER_COMMON_HPP
#define OLC_DRIVER_COMMON_HPP
#include <half.hpp>

#include <algorithm>
#include <cassert>
#include <numeric>
#include <stdexcept>
#include <vector>
// this enumerate should be synchronized with include/miopen.h
// Application-visible data-type enumeration. The numeric values must stay
// stable -- see the note above about keeping this in sync with
// include/miopen.h.
typedef enum {
    appHalf     = 0, // fp16 (half_float::half on the host)
    appFloat    = 1, // fp32
    appInt32    = 2,
    appInt8     = 3,
    appInt8x4   = 4, // packed 4x int8
    appBFloat16 = 5,
    appDouble   = 6,
} appDataType_t;
namespace Driver {
// Compile-time map from a data-type enum value to the corresponding host C++
// type. The unspecialized template falls back to float; only appHalf,
// appFloat, appDouble and appInt32 have dedicated specializations below
// (appInt8 / appInt8x4 / appBFloat16 therefore also resolve to float --
// NOTE(review): confirm that fallback is intentional).
template <appDataType_t typeNum>
struct get_type_from_type_enum
{
    using type = float;
};

template <>
struct get_type_from_type_enum<appHalf>
{
    using type = half_float::half;
};

template <>
struct get_type_from_type_enum<appFloat>
{
    using type = float;
};

template <>
struct get_type_from_type_enum<appDouble>
{
    using type = double;
};

template <>
struct get_type_from_type_enum<appInt32>
{
    using type = int;
};
// Maps a runtime data-type enum to the single-character type-id used in kernel
// compile definitions and cache keys. All integer flavors share the id 'O'.
//
// Throws std::runtime_error for unknown enum values. The previous error
// message wrongly claimed only float/half/bfloat16 were supported, and an
// unreachable `break` followed the throw.
static inline int get_typeid_from_type_enum(appDataType_t t)
{
    switch(t)
    {
    case appHalf: return (static_cast<int>('H'));
    case appFloat: return (static_cast<int>('F'));
    case appBFloat16: return (static_cast<int>('B'));
    case appDouble: return (static_cast<int>('D'));
    case appInt8:
    case appInt8x4:
    case appInt32: return (static_cast<int>('O'));
    default:
        throw std::runtime_error(
            "Only float, half, bfloat16, double and int8/int8x4/int32 data types are supported.");
    };
};
// Compile-time counterpart of get_typeid_from_type_enum: maps a host C++ type
// to its single-character type-id. Unspecialized types throw at runtime;
// specializations exist for float ('F'), half_float::half ('H') and double
// ('D') only -- there is no specialization for the integer types here.
template <typename T>
static inline int get_typeid_from_type()
{
    throw std::runtime_error("Unsupported typeid conversion for this type!");
};

template <>
inline int get_typeid_from_type<float>()
{
    return (static_cast<int>('F'));
};

template <>
inline int get_typeid_from_type<half_float::half>()
{
    return (static_cast<int>('H'));
};

template <>
inline int get_typeid_from_type<double>()
{
    return (static_cast<int>('D'));
};
// Returns the average of the measured samples with the single largest one
// excluded -- a cheap way to drop the warm-up / online-compilation outlier.
// A single-element vector returns that element unchanged; the input must not
// be empty.
//
// Fix: the running maximum used to start at 0.0f, which picked the wrong
// element whenever all samples were negative; std::max_element is exact.
static inline float get_effective_average(std::vector<float>& values)
{
    assert(!values.empty());

    if(values.size() == 1)
        return (values[0]);

    const float maxVal = *std::max_element(values.begin(), values.end());
    const float sum    = std::accumulate(values.begin(), values.end(), 0.0f);

    return ((sum - maxVal) / (values.size() - 1));
};
} // namespace Driver
#endif

View File

@@ -0,0 +1,30 @@
################################################################################
#
# MIT License
#
# Copyright (c) 2017 Advanced Micro Devices, Inc.
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.
#
################################################################################
# Host-side helper tool that inlines #include directives and embeds kernel
# sources as C byte arrays. Built only on demand (EXCLUDE_FROM_ALL).
# Sources are listed directly instead of through a single-use variable.
add_executable(addkernels EXCLUDE_FROM_ALL
    include_inliner.cpp
    addkernels.cpp
)

View File

@@ -0,0 +1,264 @@
/*******************************************************************************
*
* MIT License
*
* Copyright (c) 2021 Advanced Micro Devices, Inc.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in all
* copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*
*******************************************************************************/
#include "include_inliner.hpp"
#include <algorithm>
#include <fstream>
#include <iomanip>
#include <iostream>
#include <memory>
#include <sstream>
#include <string>
// Emit the bytes of `source` as a C array initializer.
//
// When `variable` is non-empty, full declarations for `<variable>` and
// `<variable>_SIZE` are generated around the byte rows; otherwise only the
// raw "0x..," rows are written. A single 0x00 entry is appended when
// `nullTerminate` is set so the embedded blob can be used as a C string.
// `bufferSize` controls the read chunk size, `lineSize` the bytes per row.
void Bin2Hex(std::istream& source,
             std::ostream& target,
             const std::string& variable,
             bool nullTerminate,
             size_t bufferSize,
             size_t lineSize)
{
    // Determine the total payload size up front.
    source.seekg(0, std::ios::end);
    const std::streamoff totalBytes = source.tellg();

    std::unique_ptr<unsigned char[]> chunk(new unsigned char[bufferSize]);
    const bool named = !variable.empty();

    if(named)
    {
        target << "extern const size_t " << variable << "_SIZE;" << std::endl;
        target << "extern const unsigned char " << variable << "[];" << std::endl;
        target << "const size_t " << variable << "_SIZE = " << std::setbase(10) << totalBytes
               << ";" << std::endl;
        target << "const unsigned char " << variable << "[] = {" << std::endl;
    }

    target << std::setbase(16) << std::setfill('0');
    source.seekg(0, std::ios::beg);

    std::streamoff consumed = 0;
    while(consumed < totalBytes)
    {
        source.read(reinterpret_cast<char*>(chunk.get()), bufferSize);
        // tellg() reports -1 once EOF was hit; in that case the final chunk
        // runs to the end of the stream.
        const std::streamoff pos   = source.tellg();
        const std::streamoff avail = (pos < 0 ? totalBytes : pos) - consumed;

        for(std::streamoff row = 0; row < avail;)
        {
            const size_t rowEnd = std::min<size_t>(row + lineSize, avail);
            for(size_t col = row; col < rowEnd; col++)
                target << "0x" << std::setw(2) << static_cast<unsigned>(chunk[col]) << ",";
            target << std::endl;
            row = rowEnd;
        }

        consumed += avail;
    }

    if(nullTerminate)
        target << "0x00," << std::endl;

    if(named)
    {
        target << "};" << std::endl;
    }
}
// Print the command-line usage summary to standard output.
// (Kept byte-identical: the option text below is user-facing output.)
void PrintHelp()
{
    std::cout << "Usage: bin2hex {<option>}" << std::endl;
    std::cout << "Option format: -<option name>[ <option value>]" << std::endl;
    std::cout << std::endl;
    std::cout << "Options:" << std::endl;
    std::cout
        << "[REQUIRED] -s[ource] {<path to file>}: files to be processed. Must be last argument."
        << std::endl;
    std::cout << " -t[arget] <path>: target file. Default: std out." << std::endl;
    std::cout << " -l[ine-size] <number>: bytes in one line. Default: 16." << std::endl;
    std::cout << " -b[uffer] <number>: read buffer size. Default: 512." << std::endl;
    std::cout << " -g[uard] <string>: guard name. Default: no guard" << std::endl;
    std::cout << " -n[o-recurse] : dont expand include files recursively. Default: off"
              << std::endl;
}
// Report a command-line error, print the usage summary and terminate the
// process with exit code 1. Never returns.
[[gnu::noreturn]] void WrongUsage(const std::string& error)
{
    std::cout << "Wrong usage: " << error << std::endl << std::endl;
    PrintHelp();
    std::exit(1);
}
// Report an unrecognized command-line option and terminate via WrongUsage().
[[gnu::noreturn]] void UnknownArgument(const std::string& arg)
{
    WrongUsage("unknown argument - " + arg);
}
// Inline the includes of `sourcePath` (when its extension calls for it) and
// append the resulting bytes to `target` as a C array definition.
//
// The array name is the uppercased file name without directory/extension,
// prefixed with "APP_KERNEL_" when `as_extern` is set. The process exits
// with code 1 on unreadable files or inliner failures.
void Process(const std::string& sourcePath,
             std::ostream& target,
             size_t bufferSize,
             size_t lineSize,
             bool recurse,
             bool as_extern)
{
    std::string fileName(sourcePath);
    std::string extension, root;
    std::stringstream inlinerTemp;

    // Split "<root>/<fileName>.<extension>".
    auto extPos   = fileName.rfind('.');
    auto slashPos = fileName.rfind('/');

    if(extPos != std::string::npos)
    {
        extension = fileName.substr(extPos + 1);
        fileName  = fileName.substr(0, extPos);
    }

    if(slashPos != std::string::npos)
    {
        root     = fileName.substr(0, slashPos + 1);
        fileName = fileName.substr(slashPos + 1);
    }

    std::string variable(fileName);
    std::ifstream sourceFile(sourcePath, std::ios::in | std::ios::binary);
    std::istream* source = &sourceFile;

    if(!sourceFile.good())
    {
        std::cerr << "File not found: " << sourcePath << std::endl;
        std::exit(1);
    }

    const auto is_asm    = extension == "s";
    const auto is_cl     = extension == "cl";
    const auto is_hip    = extension == "cpp";
    const auto is_header = extension == "hpp";

    if(is_asm || is_cl || is_hip || is_header)
    {
        IncludeInliner inliner;
        try
        {
            // Assembly uses ".include"; OpenCL sources and headers use
            // "#include". HIP sources are passed through unchanged: the
            // directive "<#not_include>" never matches a real line and
            // recursion is disabled.
            if(is_asm)
                inliner.Process(
                    sourceFile, inlinerTemp, root, sourcePath, ".include", false, recurse);
            else if(is_cl || is_header)
                inliner.Process(
                    sourceFile, inlinerTemp, root, sourcePath, "#include", true, recurse);
            else if(is_hip)
                inliner.Process(
                    sourceFile, inlinerTemp, root, sourcePath, "<#not_include>", true, false);
        }
        catch(const InlineException& ex)
        {
            std::cerr << ex.What() << std::endl;
            std::cerr << ex.GetTrace() << std::endl;
            std::exit(1);
        }
        source = &inlinerTemp;
    }

    std::transform(variable.begin(), variable.end(), variable.begin(), ::toupper);

    if(as_extern && variable.length() != 0)
    {
        variable = "APP_KERNEL_" + variable;
    }

    Bin2Hex(*source, target, variable, true, bufferSize, lineSize);
}
// Command-line entry point of the kernel-embedding tool.
//
// Parses "-<option>[ <value>]" arguments; the required -s[ource] option must
// come last since it consumes every remaining argument as an input file.
// Returns 0 on success, 2 when invoked without arguments; exits with code 1
// via WrongUsage() on malformed command lines.
int main(int argsn, char** args)
{
    if(argsn == 1)
    {
        PrintHelp();
        return 2;
    }

    std::string guard;
    size_t bufferSize = 512;
    size_t lineSize   = 16;
    std::ofstream targetFile;
    std::ostream* target = &std::cout;
    bool recurse         = true;
    bool as_extern       = false;

    int i = 0;
    // BUGFIX: the loop used to test **args — the first character of the
    // program name — on every iteration, so an argument lacking the leading
    // '-' was never detected. Test the current argument instead.
    while(++i < argsn && *args[i] == '-')
    {
        std::string arg(args[i] + 1); // strip the leading '-'
        std::transform(arg.begin(), arg.end(), arg.begin(), ::tolower);

        if(arg == "s" || arg == "source")
        {
            if(guard.length() > 0)
            {
                *target << "#ifndef " << guard << std::endl;
                *target << "#define " << guard << std::endl;
            }

            *target << "#include <cstddef>" << std::endl;

            while(++i < argsn)
            {
                Process(args[i], *target, bufferSize, lineSize, recurse, as_extern);
            }

            if(guard.length() > 0)
            {
                *target << "#endif" << std::endl;
            }

            return 0;
        }
        else if(arg == "t" || arg == "target")
        {
            // BUGFIX: guard against a missing option value before args[++i].
            if(++i >= argsn)
                WrongUsage("missing value for option -" + arg);
            targetFile.open(args[i], std::ios::out);
            target = &targetFile;
        }
        else if(arg == "l" || arg == "line-size")
        {
            if(++i >= argsn)
                WrongUsage("missing value for option -" + arg);
            lineSize = std::stol(args[i]);
        }
        else if(arg == "b" || arg == "buffer")
        {
            if(++i >= argsn)
                WrongUsage("missing value for option -" + arg);
            bufferSize = std::stol(args[i]);
        }
        else if(arg == "g" || arg == "guard")
        {
            if(++i >= argsn)
                WrongUsage("missing value for option -" + arg);
            guard = args[i];
        }
        else if(arg == "n" || arg == "no-recurse")
            recurse = false;
        else if(arg == "e" || arg == "extern")
            as_extern = true;
        else
            UnknownArgument(arg);
    }

    WrongUsage("source key is required");
}

View File

@@ -0,0 +1,213 @@
/*******************************************************************************
*
* MIT License
*
* Copyright (c) 2021 Advanced Micro Devices, Inc.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in all
* copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*
*******************************************************************************/
#include <algorithm>
#include <exception>
#include <fstream>
#include <sstream>
#ifdef _WIN32
#include <windows.h>
#endif
#ifdef __linux__
#include <linux/limits.h>
#include <cstdlib>
#endif // !WIN32
#include "include_inliner.hpp"
namespace PathHelpers {

// Maximum path length supported by the host OS.
static int GetMaxPath()
{
#ifdef _WIN32
    return MAX_PATH;
#else
    return PATH_MAX;
#endif
}

// Resolve a (possibly relative) path to a canonical absolute path.
// Returns an empty string when resolution fails (e.g. the path does not
// exist).
static std::string GetAbsolutePath(const std::string& path)
{
    std::string result(GetMaxPath(), '\0');
#ifdef _WIN32
    const auto retval = GetFullPathName(path.c_str(), result.size(), &result[0], nullptr);
    if(retval == 0)
        return "";
#else
    auto* const retval = realpath(path.c_str(), &result[0]);
    if(retval == nullptr)
        return "";
#endif
    // BUGFIX: the buffer was previously returned at its full GetMaxPath()
    // length, leaving the terminating '\0' and filler characters embedded
    // in the std::string; trim it to the actual path length.
    const auto len = result.find('\0');
    if(len != std::string::npos)
        result.resize(len);
    return result;
}
} // namespace PathHelpers
// Human-readable error description: "<specific message>: <file>".
std::string IncludeFileExceptionBase::What() const
{
    return GetMessage() + ": <" + _file + ">";
}
// Copy `input` to `output`, inlining files referenced by `directive` lines.
// Include paths are resolved relative to `root`; `file_name` names the
// stream for diagnostics. Thin public wrapper over ProcessCore() starting
// at include line 0.
void IncludeInliner::Process(std::istream& input,
                             std::ostream& output,
                             const std::string& root,
                             const std::string& file_name,
                             const std::string& directive,
                             bool allow_angle_brackets,
                             bool recurse)
{
    ProcessCore(input, output, root, file_name, 0, directive, allow_angle_brackets, recurse);
}
// Recursive worker behind Process(): copies `input` to `output`, replacing
// each `directive` line with the inlined content of the referenced file.
//
// `file_name`/`line_number` record where this stream was included from and
// feed the diagnostic include stack. `allow_angle_brackets` additionally
// accepts <file> syntax; when `recurse` is false, directive lines are
// copied through verbatim. Throws InlineException subclasses on malformed
// or missing includes, or when the depth limit is exceeded.
void IncludeInliner::ProcessCore(std::istream& input,
                                 std::ostream& output,
                                 const std::string& root,
                                 const std::string& file_name,
                                 int line_number,
                                 const std::string& directive,
                                 bool allow_angle_brackets,
                                 bool recurse)
{
    // Guard against include cycles / runaway recursion.
    if(_include_depth >= include_depth_limit)
        throw InlineStackOverflowException(GetIncludeStackTrace(0));

    _include_depth++;
    // Push this file onto the include stack used for diagnostics.
    _included_stack_head =
        std::make_shared<SourceFileDesc>(file_name, _included_stack_head, line_number);

    auto current_line          = 0;
    auto next_include_optional = false;

    while(!input.eof())
    {
        std::string line;
        std::string word;

        std::getline(input, line);

        // The first whitespace-delimited token (lower-cased) decides how
        // the line is treated.
        std::istringstream line_parser(line);
        line_parser >> word;
        current_line++;

        std::transform(word.begin(), word.end(), word.begin(), ::tolower);

        const auto include_optional = next_include_optional;
        next_include_optional       = false;

        // Marker comment: the NEXT line must be an include directive whose
        // target is allowed to be missing.
        if(!word.empty() && word == "//inliner-include-optional")
        {
            if(include_optional)
                throw IncludeExpectedException(GetIncludeStackTrace(current_line));
            next_include_optional = true;
            continue;
        }

        if(!word.empty() && word == directive && recurse)
        {
            // Extract the include path from "file" or, when allowed, <file>.
            auto first_quote_pos = line.find('"', static_cast<int>(line_parser.tellg()) + 1);
            std::string::size_type second_quote_pos;

            if(first_quote_pos != std::string::npos)
            {
                second_quote_pos = line.find('"', first_quote_pos + 1);
                if(second_quote_pos == std::string::npos)
                    throw WrongInlineDirectiveException(GetIncludeStackTrace(current_line));
            }
            else
            {
                if(!allow_angle_brackets)
                    throw WrongInlineDirectiveException(GetIncludeStackTrace(current_line));

                first_quote_pos = line.find('<', static_cast<int>(line_parser.tellg()) + 1);
                if(first_quote_pos == std::string::npos)
                    throw WrongInlineDirectiveException(GetIncludeStackTrace(current_line));
                second_quote_pos = line.find('>', first_quote_pos + 1);
                if(second_quote_pos == std::string::npos)
                    throw WrongInlineDirectiveException(GetIncludeStackTrace(current_line));
            }

            const std::string include_file_path =
                line.substr(first_quote_pos + 1, second_quote_pos - first_quote_pos - 1);
            // Includes are resolved relative to `root`.
            const std::string abs_include_file_path(
                PathHelpers::GetAbsolutePath(root + "/" + include_file_path)); // NOLINT

            if(abs_include_file_path.empty())
            {
                if(include_optional)
                    continue; // optional include that does not exist: skip silently
                throw IncludeNotFoundException(include_file_path,
                                               GetIncludeStackTrace(current_line));
            }

            std::ifstream include_file(abs_include_file_path, std::ios::in);
            if(!include_file.good())
                throw IncludeCantBeOpenedException(include_file_path,
                                                   GetIncludeStackTrace(current_line));

            // Recursively splice the included file into the output.
            ProcessCore(include_file,
                        output,
                        root,
                        include_file_path,
                        current_line,
                        directive,
                        allow_angle_brackets,
                        recurse);
        }
        else
        {
            if(include_optional)
                throw IncludeExpectedException(GetIncludeStackTrace(current_line));

            // Separate lines with a newline, but avoid emitting a leading
            // newline at the very start of the output.
            if(output.tellp() > 0)
                output << std::endl;
            output << line;
        }
    }

    // Pop this file from the diagnostic include stack.
    auto prev_file       = _included_stack_head->included_from;
    _included_stack_head = prev_file;
    _include_depth--;
}
// Render the current include chain as a human-readable backtrace; `line` is
// the line number within the innermost (current) file. Returns "" when no
// file is being processed.
std::string IncludeInliner::GetIncludeStackTrace(int line)
{
    std::ostringstream ss;
    if(_included_stack_head == nullptr)
        return "";

    auto item = _included_stack_head;
    ss << " " << item->path << ":" << line;

    // Walk outwards towards the root file.
    while(item->included_from != nullptr)
    {
        ss << std::endl << " from " << item->included_from->path << ":" << item->included_line;
        item = item->included_from;
    }
    return ss.str();
}

View File

@@ -0,0 +1,142 @@
/*******************************************************************************
*
* MIT License
*
* Copyright (c) 2021 Advanced Micro Devices, Inc.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in all
* copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*
*******************************************************************************/
#ifndef SOURCE_INLINER_HPP
#define SOURCE_INLINER_HPP
#include "source_file_desc.hpp"
#include <ostream>
#include <memory>
#include <stack>
// Base class of all include-inliner errors; carries the formatted include
// stack trace captured at the point of failure.
class InlineException : public std::exception
{
    public:
    InlineException(const std::string& trace) : _trace(trace) {}

    // Human-readable error description (distinct from std::exception::what()).
    virtual std::string What() const = 0;
    // Include stack at the error location, one frame per line.
    const std::string& GetTrace() const { return _trace; }

    private:
    std::string _trace;
};
// Thrown when the include depth limit is exceeded (likely circular includes).
class InlineStackOverflowException : public InlineException
{
    public:
    InlineStackOverflowException(const std::string& trace) : InlineException(trace) {}
    std::string What() const override
    {
        return "Include stack depth limit has been reached, possible circle includes";
    }
};
// Thrown when an "//inliner-include-optional" marker is not followed by an
// include directive.
class IncludeExpectedException : public InlineException
{
    public:
    IncludeExpectedException(const std::string& trace) : InlineException(trace) {}
    std::string What() const override { return "Include directive expected"; }
};
// Thrown when an include directive is malformed (unbalanced quotes/brackets
// or angle brackets where they are not allowed).
class WrongInlineDirectiveException : public InlineException
{
    public:
    WrongInlineDirectiveException(const std::string& trace) : InlineException(trace) {}
    std::string What() const override { return "Include directive has wrong format"; }
};
// Base class for errors tied to a specific include file; What() combines
// the subclass message with the offending file name.
class IncludeFileExceptionBase : public InlineException
{
    public:
    IncludeFileExceptionBase(const std::string& file, const std::string& trace)
        : InlineException(trace), _file(file)
    {
    }

    std::string What() const override;
    // Subclass-specific message, prepended to the file name by What().
    virtual std::string GetMessage() const = 0;

    private:
    std::string _file;
};
// Thrown when an include target cannot be resolved to an existing file.
class IncludeNotFoundException : public IncludeFileExceptionBase
{
    public:
    IncludeNotFoundException(const std::string& file, const std::string& trace)
        : IncludeFileExceptionBase(file, trace)
    {
    }
    std::string GetMessage() const override
    {
        return "Include file not found (if it is optional put //inliner-include-optional on line "
               "before it)";
    }
};
// Thrown when an include target exists but cannot be opened for reading.
class IncludeCantBeOpenedException : public IncludeFileExceptionBase
{
    public:
    IncludeCantBeOpenedException(const std::string& file, const std::string& trace)
        : IncludeFileExceptionBase(file, trace)
    {
    }
    std::string GetMessage() const override { return "Can not open include file"; }
};
// Expands include directives in a source stream, splicing referenced files
// inline (recursively, up to include_depth_limit). Tracks the active
// include chain so errors can report a full backtrace.
class IncludeInliner
{
    public:
    // Maximum include nesting before InlineStackOverflowException is thrown.
    int include_depth_limit = 256;

    // Inline `directive` lines of `input` into `output`; paths resolve
    // relative to `root`. See the .cpp for parameter details.
    void Process(std::istream& input,
                 std::ostream& output,
                 const std::string& root,
                 const std::string& file_name,
                 const std::string& directive,
                 bool allow_angle_brackets,
                 bool recurse);

    // Current include chain formatted for diagnostics; `line` is the line
    // number in the innermost file.
    std::string GetIncludeStackTrace(int line);

    private:
    int _include_depth = 0; // current recursion depth
    // Innermost frame of the include chain (linked list towards the root).
    std::shared_ptr<SourceFileDesc> _included_stack_head = nullptr;

    void ProcessCore(std::istream& input,
                     std::ostream& output,
                     const std::string& root,
                     const std::string& file_name,
                     int line_number,
                     const std::string& directive,
                     bool allow_angle_brackets,
                     bool recurse);
};
#endif // !SOURCE_INLINER_HPP

View File

@@ -0,0 +1,45 @@
/*******************************************************************************
*
* MIT License
*
* Copyright (c) 2021 Advanced Micro Devices, Inc.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in all
* copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*
*******************************************************************************/
#ifndef SOURCE_FILE_DESC_HPP
#define SOURCE_FILE_DESC_HPP
#include <string>
#include <memory>
// One frame of the include stack: a source file plus where it was included
// from. Frames form a singly linked list towards the root file.
class SourceFileDesc
{
    public:
    const std::string path; // path as written in the include directive
    int included_line;      // line in the including file
    std::shared_ptr<SourceFileDesc> included_from; // nullptr for the root file

    SourceFileDesc(const std::string& path_, std::shared_ptr<SourceFileDesc> from, int line)
        : path(path_), included_line(line), included_from(from)
    {
    }
};
#endif // SOURCE_FILE_DESC_HPP

View File

@@ -0,0 +1,112 @@
/*******************************************************************************
*
* MIT License
*
* Copyright (c) 2017 Advanced Micro Devices, Inc.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in all
* copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*
*******************************************************************************/
#include <binary_cache.hpp>
#include <handle.hpp>
#include <md5.hpp>
#include <env.hpp>
#include <stringutils.hpp>
#include <logger.hpp>
#include <target_properties.hpp>
#include <boost/filesystem.hpp>
#include <fstream>
#include <iostream>
namespace olCompile {
OLC_DECLARE_ENV_VAR(OLC_DISABLE_CACHE)
OLC_DECLARE_ENV_VAR(HOME)
// Determine the on-disk kernel binary cache root:
// $HOME/_hip_binary_kernels_, falling back to /tmp when HOME is unset,
// empty, or "/". Creates the directory if it does not exist.
static boost::filesystem::path ComputeCachePath()
{
    const char* home_dir = GetStringEnv(HOME{});
    if(home_dir == nullptr || home_dir == std::string("/") || home_dir == std::string(""))
    {
        home_dir = "/tmp";
    }

    auto p = boost::filesystem::path{home_dir} / "_hip_binary_kernels_";
    if(!boost::filesystem::exists(p))
        boost::filesystem::create_directories(p);
    return p;
}
// Cache root, computed once per process.
boost::filesystem::path GetCachePath()
{
    static const boost::filesystem::path user_path = ComputeCachePath();
    return user_path;
}

// Caching is disabled via the OLC_DISABLE_CACHE environment variable.
static bool IsCacheDisabled() { return olCompile::IsEnabled(OLC_DISABLE_CACHE{}); }

// Cache entry layout: <cache root>/md5(<device>:<args>)/<name>.o
boost::filesystem::path
GetCacheFile(const std::string& device, const std::string& name, const std::string& args)
{
    std::string filename = name + ".o";
    return GetCachePath() / olCompile::md5(device + ":" + args) / filename;
}
// Look up a previously compiled code object in the on-disk cache.
// Returns the cached file path, or an empty path on a cache miss or when
// caching is disabled. `num_cu` is currently unused in the lookup key.
boost::filesystem::path LoadBinary(const TargetProperties& target,
                                   const size_t num_cu,
                                   const std::string& name,
                                   const std::string& args)
{
    if(olCompile::IsCacheDisabled())
        return {};

    (void)num_cu; // not part of the cache key

    auto f = GetCacheFile(target.DbId(), name, args);
    if(boost::filesystem::exists(f))
    {
        return f.string();
    }
    else
    {
        return {};
    }
}
// Move a freshly built code object from `binary_path` into the cache
// (creating the entry's directory). When caching is disabled the temporary
// binary is simply deleted.
void SaveBinary(const boost::filesystem::path& binary_path,
                const TargetProperties& target,
                const std::string& name,
                const std::string& args)
{
    if(olCompile::IsCacheDisabled())
    {
        boost::filesystem::remove(binary_path);
    }
    else
    {
        auto p = GetCacheFile(target.DbId(), name, args);
        boost::filesystem::create_directories(p.parent_path());
        // NOTE(review): rename() fails across filesystems — assumes the
        // temporary binary lives on the same filesystem as the cache.
        boost::filesystem::rename(binary_path, p);
    }
}
} // namespace olCompile

View File

@@ -0,0 +1,93 @@
/*******************************************************************************
*
* MIT License
*
* Copyright (c) 2019 Advanced Micro Devices, Inc.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in all
* copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*
*******************************************************************************/
#include <exec_utils.hpp>
#include <manage_ptr.hpp>
#include <istream>
#include <ostream>
#include <string>
#include <cstdio>
#include <array>
#include <cassert>
#ifdef __linux__
#include <unistd.h>
#include <cstdio>
#include <sys/wait.h>
#endif // __linux__
namespace olCompile {
namespace exec {
// Run shell command `p` through popen(), optionally feeding it `in` on its
// stdin OR capturing its stdout into `out`. popen() pipes are
// unidirectional, so requesting both at once is rejected by the assert.
// Returns the child's exit status on Linux; always -1 on other platforms.
int Run(const std::string& p, std::istream* in, std::ostream* out)
{
#ifdef __linux__
    const auto redirect_stdin  = (in != nullptr);
    const auto redirect_stdout = (out != nullptr);
    assert(!(redirect_stdin && redirect_stdout));

    // "r": read the child's stdout; "w": write to the child's stdin.
    const auto file_mode = redirect_stdout ? "r" : "w";
    OLC_MANAGE_PTR(FILE*, pclose) pipe{popen(p.c_str(), file_mode)};

    if(!pipe)
        throw std::runtime_error("olCompile::exec::Run(): popen(" + p + ", " + file_mode +
                                 ") failed");

    if(redirect_stdin || redirect_stdout)
    {
        std::array<char, 1024> buffer{};
        if(redirect_stdout)
        {
            // Stream the child's stdout into `out` line-buffered via fgets.
            while(feof(pipe.get()) == 0)
                if(fgets(buffer.data(), buffer.size(), pipe.get()) != nullptr)
                    *out << buffer.data();
        }
        else
        {
            // Copy `in` to the child's stdin in NUL-terminated chunks.
            while(!in->eof())
            {
                in->read(buffer.data(), buffer.size() - 1);
                buffer[in->gcount()] = 0;

                if(fputs(buffer.data(), pipe.get()) == EOF)
                    throw std::runtime_error("olCompile::exec::Run(): fputs() failed");
            }
        }
    }

    // pclose() reports the child's wait status; release() hands ownership
    // to this explicit call.
    auto status = pclose(pipe.release());
    return WEXITSTATUS(status);
#else
    (void)p;
    (void)in;
    (void)out;
    return -1;
#endif // __linux__
}
} // namespace exec
} // namespace olCompile

View File

@@ -0,0 +1,285 @@
/*******************************************************************************
*
* MIT License
*
* Copyright (c) 2017-2020 Advanced Micro Devices, Inc.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in all
* copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*
*******************************************************************************/
#include <handle.hpp>
#include <binary_cache.hpp>
#include <env.hpp>
#include <kernel_cache.hpp>
#include <stringutils.hpp>
#include <target_properties.hpp>
#include <hipCheck.hpp>
#include <write_file.hpp>
#include <boost/filesystem.hpp>
#include <boost/lexical_cast.hpp>
#ifndef _WIN32
#include <unistd.h>
#endif
#include <algorithm>
#include <cassert>
#include <chrono>
#include <thread>
OLC_DECLARE_ENV_VAR(OLC_DEVICE_CU)
namespace olCompile {
// Bytes of device memory currently free on the active HIP device.
std::size_t GetAvailableMemory()
{
    size_t free, total;
    MY_HIP_CHECK(hipMemGetInfo(&free, &total));
    return free;
}

// Ordinal of the currently active HIP device.
int get_device_id()
{
    int device;
    MY_HIP_CHECK(hipGetDevice(&device));
    return device;
}

// Make device `id` the active HIP device for this thread.
void set_device(int id) { MY_HIP_CHECK(hipSetDevice(id)); }

// Spread processes across devices by selecting device (pid % device count);
// returns the chosen ordinal.
// NOTE(review): assumes at least one HIP device exists (divides by n).
int set_default_device()
{
    int n;
    MY_HIP_CHECK(hipGetDeviceCount(&n));

    // Pick device based on process id
    auto pid = ::getpid();
    assert(pid > 0);
    set_device(pid % n);
    return (pid % n);
}
// Internal state behind Handle: the HIP stream, the bound device ordinal,
// the program/kernel cache, and the resolved target (GPU) properties.
struct HandleImpl
{
    using StreamPtr = std::shared_ptr<typename std::remove_pointer<hipStream_t>::type>;

    HandleImpl() {}

    // Create an owning stream (destroyed via hipStreamDestroy).
    StreamPtr create_stream()
    {
        hipStream_t result;
        MY_HIP_CHECK(hipStreamCreate(&result));
        return StreamPtr{result, &hipStreamDestroy};
    }

    // Wrap an externally owned stream without taking ownership.
    static StreamPtr reference_stream(hipStream_t s) { return StreamPtr{s, null_deleter{}}; }

    // Device architecture string as reported by HIP (gcnArchName).
    std::string get_device_name() const
    {
        hipDeviceProp_t props;
        MY_HIP_CHECK(hipGetDeviceProperties(&props, device));
        const std::string name(props.gcnArchName);
        return name;
    }

    StreamPtr stream = nullptr;
    int device       = -1; // HIP device ordinal this handle is bound to
    KernelCache cache;
    TargetProperties target_properties;
};
// Construct a handle on the current device, wrapping an existing HIP stream
// (nullptr selects the null/default stream). The stream is referenced, not
// owned.
Handle::Handle(hipStream_t stream) : impl(new HandleImpl())
{
    this->impl->device = get_device_id();

    if(stream == nullptr)
        this->impl->stream = HandleImpl::reference_stream(nullptr);
    else
        this->impl->stream = HandleImpl::reference_stream(stream);

    this->impl->target_properties.Init(this);
}

// Construct a handle on the current device using the default stream.
Handle::Handle() : impl(new HandleImpl())
{
    this->impl->device = get_device_id();
    this->impl->stream = HandleImpl::reference_stream(nullptr);
    this->impl->target_properties.Init(this);
}

Handle::~Handle() {}

// Rebind the handle to another (externally owned) stream and re-resolve the
// target properties.
void Handle::SetStream(hipStream_t streamID) const
{
    this->impl->stream = HandleImpl::reference_stream(streamID);
    this->impl->target_properties.Init(this);
}

hipStream_t Handle::GetStream() const { return impl->stream.get(); }
// Build (or fetch from the kernel cache) the requested kernel and return an
// invoker bound to this handle's stream. The (algorithm, network_config)
// pair keys the cache entry; cache_index selects the slot within it.
KernelInvoke Handle::AddKernel(const std::string& algorithm,
                               const std::string& network_config,
                               const std::string& program_name,
                               const std::string& kernel_name,
                               const std::vector<size_t>& vld,
                               const std::vector<size_t>& vgd,
                               const std::string& params,
                               std::size_t cache_index) const
{
    auto obj = this->impl->cache.AddKernel(
        *this, algorithm, network_config, program_name, kernel_name, vld, vgd, params, cache_index);
    return this->Run(obj);
}

// Drop every cached kernel registered under (algorithm, network_config).
void Handle::ClearKernels(const std::string& algorithm, const std::string& network_config) const
{
    this->impl->cache.ClearKernels(algorithm, network_config);
}

// All cached kernels registered under (algorithm, network_config).
const std::vector<Kernel>& Handle::GetKernelsImpl(const std::string& algorithm,
                                                  const std::string& network_config) const
{
    return this->impl->cache.GetKernels(algorithm, network_config);
}

// Whether any kernel is cached under (algorithm, network_config).
bool Handle::HasKernel(const std::string& algorithm, const std::string& network_config) const
{
    return this->impl->cache.HasKernels(algorithm, network_config);
}

// Bind a previously built kernel to this handle's stream for invocation.
KernelInvoke Handle::Run(Kernel k) const { return k.Invoke(this->GetStream()); }
// Obtain the compiled code object for `program_name` built with `params`.
//
// For non-MLIR sources the target GPU name is appended via -mcpu. On a
// binary-cache miss the program is compiled with HIPOC, the resulting code
// object is copied into the cache, and the freshly built program is
// returned; on a hit the cached binary is loaded directly.
Program Handle::LoadProgram(const std::string& program_name, std::string params) const
{
    if((!olCompile::EndsWith(program_name, ".mlir-cpp")) &&
       (!olCompile::EndsWith(program_name, ".mlir")))
    {
        params += " -mcpu=" + this->GetTargetProperties().Name();
    }

    auto hsaco = olCompile::LoadBinary(
        this->GetTargetProperties(), this->GetMaxComputeUnits(), program_name, params);

    if(hsaco.empty())
    {
        // Cache miss: compile, then persist the code object for next time.
        auto p = HIPOCProgram{program_name, params, this->GetTargetProperties()};

        auto path = olCompile::GetCachePath() / boost::filesystem::unique_path();
        if(p.IsCodeObjectInMemory())
            olCompile::WriteFile(p.GetCodeObjectBlob(), path);
        else
            boost::filesystem::copy_file(p.GetCodeObjectPathname(), path);
        olCompile::SaveBinary(path, this->GetTargetProperties(), program_name, params);

        return p;
    }
    else
    {
        return HIPOCProgram{program_name, hsaco};
    }
}
// Whether a program compiled with `params` is already in the in-memory cache.
bool Handle::HasProgram(const std::string& program_name, const std::string& params) const
{
    return this->impl->cache.HasProgram(program_name, params);
}

// Register an already built program in the in-memory cache.
void Handle::AddProgram(Program prog,
                        const std::string& program_name,
                        const std::string& params) const
{
    this->impl->cache.AddProgram(prog, program_name, params);
}
// Block until all work queued on this handle's stream has completed.
void Handle::Finish() const { MY_HIP_CHECK(hipStreamSynchronize(this->GetStream())); }

// Shared (LDS) memory available per workgroup, in bytes.
std::size_t Handle::GetLocalMemorySize() const
{
    int result;
    MY_HIP_CHECK(hipDeviceGetAttribute(
        &result, hipDeviceAttributeMaxSharedMemoryPerBlock, this->impl->device));

    return result;
}

// Total device memory, in bytes.
std::size_t Handle::GetGlobalMemorySize() const
{
    size_t result;
    MY_HIP_CHECK(hipDeviceTotalMem(&result, this->impl->device));

    return result;
}

// Number of compute units on the device. Can be overridden through the
// OLC_DEVICE_CU environment variable (useful for tuning/testing).
std::size_t Handle::GetMaxComputeUnits() const
{
    int result;
    const char* const num_cu = olCompile::GetStringEnv(OLC_DEVICE_CU{});
    if(num_cu != nullptr && strlen(num_cu) > 0)
    {
        return boost::lexical_cast<std::size_t>(num_cu);
    }

    MY_HIP_CHECK(
        hipDeviceGetAttribute(&result, hipDeviceAttributeMultiprocessorCount, this->impl->device));

    return result;
}

// Wavefront (warp) width of the device, in lanes.
std::size_t Handle::GetWavefrontWidth() const
{
    hipDeviceProp_t props{};
    MY_HIP_CHECK(hipGetDeviceProperties(&props, this->impl->device));

    auto result = static_cast<size_t>(props.warpSize);
    return result;
}
// Raw device name string as reported by the runtime implementation.
std::string Handle::GetDeviceNameImpl() const { return this->impl->get_device_name(); }
// Canonical device name after target-properties normalization.
std::string Handle::GetDeviceName() const { return this->impl->target_properties.Name(); }
// Target (GPU architecture) properties resolved for this handle's device.
const TargetProperties& Handle::GetTargetProperties() const
{
    return this->impl->target_properties;
}
// Write a short, human-readable description of the handle to the stream.
std::ostream& Handle::Print(std::ostream& os) const
{
    os << "stream: " << this->impl->stream << ", device_id: " << this->impl->device;
    return os;
}
} // namespace olCompile

View File

@@ -0,0 +1,346 @@
/*******************************************************************************
*
* MIT License
*
* Copyright (c) 2019 Advanced Micro Devices, Inc.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in all
* copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*
*******************************************************************************/
#include <hip_build_utils.hpp>
#include <stringutils.hpp>
#include <tmp_dir.hpp>
#include <env.hpp>
#include <target_properties.hpp>
#include <write_file.hpp>
#include <exec_utils.hpp>
#include <logger.hpp>
#include <config.h>
#include <boost/optional.hpp>
#include <sstream>
#include <string>
#include <stdexcept>
#include <iostream>
OLC_DECLARE_ENV_VAR(OLC_DEBUG_HIP_VERBOSE)
OLC_DECLARE_ENV_VAR(OLC_DEBUG_HIP_DUMP)
#define OLC_HIP_COMPILER "/opt/rocm/llvm/bin/clang++"
namespace olCompile {
// The configured compiler is HCC when its path ends in "hcc".
bool IsHccCompiler()
{
    static const bool is_hcc = EndsWith(OLC_HIP_COMPILER, "hcc");
    return is_hcc;
}
// The configured compiler is hip-clang when its path ends in "clang++".
bool IsHipClangCompiler()
{
    static const bool is_hip_clang = EndsWith(OLC_HIP_COMPILER, "clang++");
    return is_hip_clang;
}
namespace {
// Decide whether code-object v3 generation should be requested.
inline bool ProduceCoV3()
{
    // Otherwise, let's enable CO v3 for HIP kernels since ROCm 3.0.
    // (-1 patch acts as a wildcard in the version comparison.)
    return (HipCompilerVersion() >= external_tool_version_t{3, 0, -1});
}
/// Returns option for enabling/disabling CO v3 generation for the compiler
/// that builds HIP kernels, depending on compiler version etc.
inline const std::string& GetCoV3Option(const bool enable)
{
    /// \note PR #2166 uses the "--hcc-cov3" option when isHCC is true.
    /// It's unclear why... HCC included in ROCm 2.8 does not support it,
    /// perhaps it suits for some older HCC?
    ///
    /// These options are Ok for ROCm 3.0:
    static const std::string enabled_option{"-mcode-object-v3"};
    static const std::string disabled_option{};
    return enable ? enabled_option : disabled_option;
}
} // namespace
// Compile one HIP kernel source into a GPU code object inside tmp_dir.
//
// \param tmp_dir   scratch directory receiving staged includes, the source
//                  file and the produced binary; also used to spawn commands.
// \param filename  kernel source file name (".cpp").
// \param src       kernel source text; unused when the sources already
//                  reside on the filesystem (e.g. MLIR-generated code).
// \param params    compiler options; compiler- and target-specific flags
//                  are appended here before invocation.
// \param target    GPU target the code object is built for.
// \param testing_mode  skip include staging and silence compiler output.
// \param sources_already_reside_on_filesystem  skip writing src to disk.
// \return path of the produced code object file.
// \throws std::runtime_error when compilation fails, or always off-Linux.
static boost::filesystem::path HipBuildImpl(boost::optional<TmpDir>& tmp_dir,
                                            const std::string& filename,
                                            std::string src,
                                            std::string params,
                                            const TargetProperties& target,
                                            const bool testing_mode,
                                            const bool sources_already_reside_on_filesystem)
{
#ifdef __linux__
    // Write out the include files
    // Let's assume includes are overkill for feature tests & optimize'em out.
    if(!testing_mode)
    {
        auto inc_list = GetHipKernelIncList();
        auto inc_path = tmp_dir->path;
        boost::filesystem::create_directories(inc_path);
        for(auto inc_file : inc_list)
        {
            auto inc_src = GetKernelInc(inc_file);
            WriteFile(inc_src, inc_path / inc_file);
        }
    }
    // Sources produced by MLIR-cpp already reside in tmp dir.
    if(!sources_already_reside_on_filesystem)
    {
        // A dummy host main() makes the translation unit self-contained.
        src += "\nint main() {}\n";
        WriteFile(src, tmp_dir->path / filename);
    }
    // cppcheck-suppress unreadVariable
    const LcOptionTargetStrings lots(target);
    auto env = std::string("");
    if(IsHccCompiler())
    {
        params += " -amdgpu-target=" + target.Name();
        params += " " + GetCoV3Option(ProduceCoV3());
    }
    else if(IsHipClangCompiler())
    {
        if(params.find("-std=") == std::string::npos)
            params += " --std=c++11";
        // hip-clang >= 4.1 encodes the xnack mode in the arch string.
        if(HipCompilerVersion() < external_tool_version_t{4, 1, 0})
            params += " --cuda-gpu-arch=" + lots.device;
        else
            params += " --cuda-gpu-arch=" + lots.device + lots.xnack;
        params += " --cuda-device-only";
        params += " -c";
        params += " -O3 ";
    }
    params += " -Wno-unused-command-line-argument -I. ";
    params += OLC_STRINGIZE(HIP_COMPILER_FLAGS);
    if(IsHccCompiler())
    {
        env += std::string("KMOPTLLC=\"-mattr=+enable-ds128 ");
        if(HipCompilerVersion() >= external_tool_version_t{2, 8, 0})
            env += " --amdgpu-spill-vgpr-to-agpr=0";
        env += '\"';
    }
    else if(IsHipClangCompiler())
    {
        params += " -mllvm --amdgpu-spill-vgpr-to-agpr=0";
        params += " -mllvm -amdgpu-early-inline-all=true";
        params += " -mllvm -amdgpu-function-calls=false";
    }
    if(olCompile::IsEnabled(OLC_DEBUG_HIP_VERBOSE{}))
    {
        params += " -v";
    }
    if(olCompile::IsEnabled(OLC_DEBUG_HIP_DUMP{}))
    {
        if(IsHccCompiler())
        {
            params += " -gline-tables-only";
            env += " KMDUMPISA=1";
            env += " KMDUMPLLVM=1";
        }
        else if(IsHipClangCompiler())
        {
            params += " -gline-tables-only";
            params += " -save-temps";
        }
    }
    // hip version
    params +=
        std::string(" -DHIP_PACKAGE_VERSION_FLAT=") + std::to_string(HIP_PACKAGE_VERSION_FLAT);
    params += " ";
    auto bin_file = tmp_dir->path / (filename + ".o");
    // Invoke the compiler; in testing mode all of its output is discarded.
    const std::string redirector = testing_mode ? " 1>/dev/null 2>&1" : "";
    tmp_dir->Execute(env + std::string(" ") + OLC_HIP_COMPILER,
                     params + filename + " -o " + bin_file.string() + redirector);
    if(!boost::filesystem::exists(bin_file))
        throw std::runtime_error(filename + " failed to compile");
#ifdef EXTRACTKERNEL_BIN
    if(IsHccCompiler())
    {
        // HCC embeds device code in the host binary; extract the *.hsaco.
        tmp_dir->Execute(EXTRACTKERNEL_BIN, " -i " + bin_file.string());
        auto hsaco =
            std::find_if(boost::filesystem::directory_iterator{tmp_dir->path}, {}, [](auto entry) {
                return (entry.path().extension() == ".hsaco");
            });
        if(hsaco == boost::filesystem::directory_iterator{})
        {
            fdt_log(LogLevel::Info, "HipBuild", "failed to find *.hsaco in ")
                << hsaco->path().string() << std::endl;
        }
        return hsaco->path();
    }
#endif
    return bin_file;
#else
    (void)tmp_dir;
    (void)filename;
    (void)src;
    (void)params;
    (void)target;
    (void)testing_mode;
    (void)sources_already_reside_on_filesystem;
    // Fixed: the original spelled this "std::runtimer_error", which fails to
    // compile on non-Linux platforms; also silence all unused parameters.
    throw std::runtime_error("HIP kernels are only supported in Linux");
#endif
}
// Public entry point: build a HIP kernel for real use (non-testing mode).
boost::filesystem::path HipBuild(boost::optional<TmpDir>& tmp_dir,
                                 const std::string& filename,
                                 std::string src,
                                 std::string params,
                                 const TargetProperties& target,
                                 const bool sources_already_reside_on_filesystem)
{
    constexpr bool testing_mode = false;
    return HipBuildImpl(
        tmp_dir, filename, src, params, target, testing_mode, sources_already_reside_on_filesystem);
}
// Slurp the entire binary file into buf.
void bin_file_to_str(const boost::filesystem::path& file, std::string& buf)
{
    std::ifstream input(file.string().c_str(), std::ios::binary);
    std::ostringstream contents;
    contents << input.rdbuf();
    buf = contents.str();
}
// Determine the version of the HIP compiler, once.
// For HCC the version is parsed from `<compiler> --version` output; for
// hip-clang it is taken from the HIP package macros detected by CMake.
// Unparseable fields remain -1 (HCC path) or fall back to 3.2.0.
static external_tool_version_t HipCompilerVersionImpl()
{
    external_tool_version_t version;
    if(IsHccCompiler())
    {
        const std::string path(OLC_HIP_COMPILER);
        const std::string mandatory_prefix("(based on HCC ");
        // do/while(false) gives structured early-exit via break.
        do
        {
            if(path.empty() || !std::ifstream(path).good())
                break;
            std::stringstream out;
            if(olCompile::exec::Run(path + " --version", nullptr, &out) != 0)
                break;
            std::string line;
            while(!out.eof())
            {
                std::getline(out, line);
                fdt_log() << line;
                auto begin = line.find(mandatory_prefix);
                if(begin == std::string::npos)
                    continue;
                begin += mandatory_prefix.size();
                // Quirky but valid: v3 and v2 are assigned -1 via the chained
                // initializer of v1; likewise c2/c1 start as 'X' sentinels.
                int v3, v2, v1 = v2 = v3 = -1;
                char c2, c1 = c2 = 'X';
                // Expect "major.minor.patch" after the prefix.
                std::istringstream iss(line.substr(begin));
                iss >> v1 >> c1 >> v2 >> c2 >> v3;
                // Only accept components that parsed and were '.'-separated.
                if(!iss.fail() && v1 >= 0)
                {
                    version.major = v1;
                    if(c1 == '.' && v2 >= 0)
                    {
                        version.minor = v2;
                        if(c2 == '.' && v3 >= 0)
                            version.patch = v3;
                    }
                }
                break;
            }
        } while(false);
    }
    else
    {
#ifdef HIP_PACKAGE_VERSION_MAJOR
        fdt_log(
            LogLevel::Info, "HipCompilerVersion", "Read version information from HIP package...");
        version.major = HIP_PACKAGE_VERSION_MAJOR;
#ifdef HIP_PACKAGE_VERSION_MINOR
        version.minor = HIP_PACKAGE_VERSION_MINOR;
#else
        version.minor = 0;
#endif
#ifdef HIP_PACKAGE_VERSION_PATCH
        version.patch = HIP_PACKAGE_VERSION_PATCH;
#else
        version.patch = 0;
#endif
#else // HIP_PACKAGE_VERSION_MAJOR is not defined. CMake failed to find HIP package.
        fdt_log(LogLevel::Info, "HipCompilerVersion", "...assuming 3.2.0 (hip-clang RC)");
        version.major = 3;
        version.minor = 2;
        version.patch = 0;
#endif
    }
    fdt_log() << version.major << '.' << version.minor << '.' << version.patch << std::endl;
    return version;
}
// Cached accessor: the expensive probe runs exactly once per process.
external_tool_version_t HipCompilerVersion()
{
    // NOLINTNEXTLINE (cppcoreguidelines-avoid-non-const-global-variables)
    static const external_tool_version_t cached_version = HipCompilerVersionImpl();
    return cached_version;
}
// Lexicographic (major, minor, patch) greater-than comparison.
bool operator>(const external_tool_version_t& lhs, const external_tool_version_t& rhs)
{
    if(lhs.major != rhs.major)
        return lhs.major > rhs.major;
    if(lhs.minor != rhs.minor)
        return lhs.minor > rhs.minor;
    return lhs.patch > rhs.patch;
}
// Derived from operator>: a < b iff b > a.
bool operator<(const external_tool_version_t& lhs, const external_tool_version_t& rhs)
{
    return operator>(rhs, lhs);
}
// Derived from operator<: a >= b iff not (a < b).
bool operator>=(const external_tool_version_t& lhs, const external_tool_version_t& rhs)
{
    return !operator<(lhs, rhs);
}
// Derived from operator>: a <= b iff not (a > b).
bool operator<=(const external_tool_version_t& lhs, const external_tool_version_t& rhs)
{
    return !operator>(lhs, rhs);
}
} // namespace olCompile

View File

@@ -0,0 +1,84 @@
/*******************************************************************************
*
* MIT License
*
* Copyright (c) 2017 Advanced Micro Devices, Inc.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in all
* copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*
*******************************************************************************/
#include <env.hpp>
#include <hipoc_kernel.hpp>
#include <hipCheck.hpp>
#include <hip/hip_ext.h>
#include <hip/hip_runtime.h>
#include <chrono>
#include <thread>
namespace olCompile {
// Launch the kernel with a raw, packed argument buffer.
// When a timing callback is registered, start/stop HIP events bracket the
// launch and the callback receives them after the kernel finishes.
void HIPOCKernelInvoke::run(void* args, std::size_t size) const
{
    HipEventPtr start = nullptr;
    HipEventPtr stop = nullptr;
    // Pass the argument blob via the HIP "extra" launch-config mechanism.
    void* config[] = {// HIP_LAUNCH_PARAM_* are macros that do horrible things
                      // NOLINTNEXTLINE cppcoreguidelines-pro-type-cstyle-cast
                      HIP_LAUNCH_PARAM_BUFFER_POINTER,
                      args,
                      // NOLINTNEXTLINE cppcoreguidelines-pro-type-cstyle-cast
                      HIP_LAUNCH_PARAM_BUFFER_SIZE,
                      &size,
                      // NOLINTNEXTLINE cppcoreguidelines-pro-type-cstyle-cast
                      HIP_LAUNCH_PARAM_END};
    // Events are only created when timing is requested; otherwise null
    // event handles are passed to the launch below.
    if(callback)
    {
        start = make_hip_event();
        stop = make_hip_event();
    }
    MY_HIP_CHECK(hipExtModuleLaunchKernel(fun,
                                          gdims[0],
                                          gdims[1],
                                          gdims[2],
                                          ldims[0],
                                          ldims[1],
                                          ldims[2],
                                          0,
                                          stream,
                                          nullptr,
                                          reinterpret_cast<void**>(&config),
                                          start.get(),
                                          stop.get()));
    if(callback)
    {
        // Wait for the stop event so the callback sees completed timings.
        MY_HIP_CHECK(hipEventSynchronize(stop.get()));
        callback(start.get(), stop.get());
    }
}
// Bundle this kernel's function handle and launch dimensions with a stream
// (and optional timing callback) into an invocable object.
HIPOCKernelInvoke HIPOCKernel::Invoke(hipStream_t stream,
                                      std::function<void(hipEvent_t, hipEvent_t)> callback) const
{
    return HIPOCKernelInvoke{stream, fun, ldims, gdims, name, callback};
}
} // namespace olCompile

View File

@@ -0,0 +1,139 @@
/*******************************************************************************
*
* MIT License
*
* Copyright (c) 2017 Advanced Micro Devices, Inc.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in all
* copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*
*******************************************************************************/
#include <hip_build_utils.hpp>
#include <hipoc_program.hpp>
#include <kernel.hpp>
#include <stringutils.hpp>
#include <target_properties.hpp>
#include <env.hpp>
#include <write_file.hpp>
#include <boost/optional.hpp>
#include <boost/filesystem/operations.hpp>
#include <cstring>
#include <mutex>
#include <sstream>
#include <unistd.h>
namespace olCompile {
// Load a code object from disk into a HIP module.
static hipModulePtr CreateModule(const boost::filesystem::path& hsaco_file)
{
    hipModule_t handle = nullptr;
    MY_HIP_CHECK(hipModuleLoad(&handle, hsaco_file.string().c_str()));
    return hipModulePtr{handle};
}
template <typename T> /// intended for std::string and std::vector<char>
// Load a code object directly from a memory buffer into a HIP module.
hipModulePtr CreateModuleInMem(const T& blob)
{
    hipModule_t handle = nullptr;
    MY_HIP_CHECK(hipModuleLoadData(&handle, reinterpret_cast<const void*>(blob.data())));
    return hipModulePtr{handle};
}
// Construct from an existing code-object file: no compilation, just load.
HIPOCProgramImpl::HIPOCProgramImpl(const std::string& program_name,
                                   const boost::filesystem::path& filespec)
    : program(program_name), hsaco_file(filespec)
{
    this->module = CreateModule(hsaco_file);
}
// Construct by compiling the named kernel source with the given options,
// then loading the result from memory (preferred) or from the built file.
HIPOCProgramImpl::HIPOCProgramImpl(const std::string& program_name,
                                   std::string params,
                                   const TargetProperties& target_)
    : program(program_name), target(target_)
{
    BuildCodeObject(params);
    if(!binary.empty())
    {
        module = CreateModuleInMem(this->binary);
    }
    else
    {
        module = CreateModule(this->hsaco_file);
    }
}
// Compile src (a .cpp HIP kernel) inside a fresh temporary directory and
// record the resulting code-object path in hsaco_file.
// \throws std::runtime_error for non-.cpp sources or a missing output file.
void HIPOCProgramImpl::BuildCodeObjectInFile(std::string& params,
                                             const std::string& src,
                                             const std::string& filename)
{
    this->dir.emplace(filename);
    hsaco_file = dir->path / (filename + ".o");
    if(olCompile::EndsWith(filename, ".cpp"))
    {
        hsaco_file = HipBuild(dir, filename, src, params, target);
    }
    else
        throw std::runtime_error("Only HIP kernel source of .cpp file is supported");
    if(!boost::filesystem::exists(hsaco_file))
        throw std::runtime_error("Cant find file: " + hsaco_file.string());
}
// Build this program's code object from its embedded kernel source.
void HIPOCProgramImpl::BuildCodeObject(std::string params)
{
    const std::string filename = program;
    // Kernel sources are not expected to be warning-clean; mute diagnostics.
    if(olCompile::EndsWith(filename, ".cpp"))
        params += " -Wno-everything";
    BuildCodeObjectInFile(params, GetKernelSrc(this->program), filename);
}
HIPOCProgram::HIPOCProgram() {}
// Build-from-source constructor; delegates all work to HIPOCProgramImpl.
HIPOCProgram::HIPOCProgram(const std::string& program_name,
                           std::string params,
                           const TargetProperties& target)
    : impl(std::make_shared<HIPOCProgramImpl>(program_name, params, target))
{
}
// Load-from-file constructor; wraps an already-compiled code object.
HIPOCProgram::HIPOCProgram(const std::string& program_name, const boost::filesystem::path& hsaco)
    : impl(std::make_shared<HIPOCProgramImpl>(program_name, hsaco))
{
}
// Raw HIP module handle (owned by impl).
hipModule_t HIPOCProgram::GetModule() const { return impl->module.get(); }
// Path of the code-object file (meaningful when built/loaded from disk).
boost::filesystem::path HIPOCProgram::GetCodeObjectPathname() const { return impl->hsaco_file; }
// Copy of the in-memory code-object bytes (empty when built to a file).
std::string HIPOCProgram::GetCodeObjectBlob() const
{
    return std::string(impl->binary.data(), impl->binary.size());
}
// Whether the code object lives in memory rather than on disk.
bool HIPOCProgram::IsCodeObjectInMemory() const { return !impl->binary.empty(); }
} // namespace olCompile

View File

@@ -0,0 +1,66 @@
/*******************************************************************************
*
* MIT License
*
* Copyright (c) 2019 Advanced Micro Devices, Inc.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in all
* copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*
*******************************************************************************/
#include <sstream>
#include <boost/range/adaptor/transformed.hpp>
#include <kernel_build_params.hpp>
#include <stringutils.hpp>
namespace olCompile {
// Render a list of build parameters as a single command-line fragment.
// Defines become "-<prefix><name>=<value>"; options become "-<name> <value>";
// parameters without a value omit the separator and value entirely.
static std::string GenerateDefines(const std::vector<KernelBuildParameter>& options,
                                   const std::string& prefix)
{
    const auto strs =
        options | boost::adaptors::transformed([&prefix](const KernelBuildParameter& define) {
            std::ostringstream ss;
            ss << '-';
            // Only #define-style parameters carry the prefix (e.g. "D").
            if(define.type == ParameterTypes::Define)
                ss << prefix;
            ss << define.name;
            if(!define.value.empty())
            {
                switch(define.type)
                {
                case ParameterTypes::Define: ss << '='; break;
                case ParameterTypes::Option: ss << ' '; break;
                }
                ss << define.value;
            }
            return ss.str();
        });
    return JoinStrings(strs, " ");
}
} // namespace olCompile

View File

@@ -0,0 +1,156 @@
/*******************************************************************************
*
* MIT License
*
* Copyright (c) 2017 Advanced Micro Devices, Inc.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in all
* copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*
*******************************************************************************/
/* ************************************************************************
* Copyright 2015 Vratis, Ltd.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
* ************************************************************************ */
#include <env.hpp>
#include <kernel_cache.hpp>
#include <stringutils.hpp>
#include <iostream>
#include <iterator>
namespace olCompile {
// Return the kernels cached for (algorithm, network_config), or a shared
// empty vector when no entry exists.
const std::vector<Kernel>& KernelCache::GetKernels(const std::string& algorithm,
                                                   const std::string& network_config)
{
    static const std::vector<Kernel> no_kernels{};
    const auto found = kernel_map.find(std::make_pair(algorithm, network_config));
    return (found != kernel_map.end()) ? found->second : no_kernels;
}
// True when the cache holds an entry for (algorithm, network_config).
// An existing but empty entry violates the cache invariant and throws.
bool KernelCache::HasKernels(const std::string& algorithm, const std::string& network_config) const
{
    const auto entry = kernel_map.find(std::make_pair(algorithm, network_config));
    if(entry == kernel_map.end())
        return false;
    if(entry->second.empty())
    {
        throw std::runtime_error(
            "There should be at least one kernel in kernel cache if an entry exists");
    }
    return true;
}
// Whether a program keyed by (name, params) is already cached.
bool KernelCache::HasProgram(const std::string& name, const std::string& params) const
{
    return program_map.find(std::make_pair(name, params)) != program_map.end();
}
// Insert (or overwrite) the cached program for (program_name, params).
void KernelCache::AddProgram(Program prog, const std::string& program_name, std::string params)
{
    const auto key = std::make_pair(program_name, params);
    program_map[key] = prog;
}
// Build (or reuse) the program for program_name/params, wrap it in a Kernel
// with the given launch dimensions, and cache the kernel under
// (algorithm, network_config) at cache_index unless either key is empty.
// \return the constructed kernel (cached or not).
Kernel KernelCache::AddKernel(const Handle& h,
                              const std::string& algorithm,
                              const std::string& network_config,
                              const std::string& program_name,
                              const std::string& kernel_name,
                              const std::vector<size_t>& vld,
                              const std::vector<size_t>& vgd,
                              std::string params,
                              std::size_t cache_index)
{
    const std::pair<std::string, std::string> key = std::make_pair(algorithm, network_config);
    // Reuse an already-built program when available; otherwise load/compile
    // it and remember it for subsequent kernels with the same params.
    const auto program_key = std::make_pair(program_name, params);
    Program program;
    auto program_it = program_map.find(program_key);
    if(program_it != program_map.end())
    {
        program = program_it->second;
    }
    else
    {
        program = h.LoadProgram(program_name, params);
        program_map[program_key] = program;
    }
    // Construct the kernel directly (the original default-constructed a
    // Kernel and immediately assigned over it, which was redundant).
    Kernel kernel{program, kernel_name, vld, vgd};
    // Anonymous kernels (empty algorithm or config) are returned but not cached.
    if(!network_config.empty() && !algorithm.empty())
    {
        this->AddKernel(key, kernel, cache_index);
    }
    return kernel;
}
// Store kernel k at slot cache_index under key, growing the slot vector
// as needed (intermediate slots stay default-constructed).
void KernelCache::AddKernel(Key key, Kernel k, std::size_t cache_index)
{
    auto&& slots = kernel_map[key];
    if(slots.size() <= cache_index)
        slots.resize(cache_index + 1);
    slots[cache_index] = k;
}
// Remove all cached kernels registered under (algorithm, network_config).
// \throws std::runtime_error when either key component is empty.
void KernelCache::ClearKernels(const std::string& algorithm, const std::string& network_config)
{
    if(network_config.empty() || algorithm.empty())
    {
        throw std::runtime_error("Network config or algorithm empty.");
    }
    const std::pair<std::string, std::string> key = std::make_pair(algorithm, network_config);
    // Note: operator[] creates an empty entry when none exists; clearing it
    // is then a no-op. (The original had a dead, empty if-block here that
    // has been removed.)
    this->kernel_map[key].clear();
}
KernelCache::KernelCache() {}
} // namespace olCompile

View File

@@ -0,0 +1,43 @@
#include <config.h>
#include <logger.hpp>
#include <iostream>
#include <string>
using namespace std;
namespace olCompile {
#if OLC_DEBUG
static LogLevel defLevel = LogLevel::Info2;
#else
static LogLevel defLevel = LogLevel::Error;
#endif
// Human-readable name for a log level ("Unknown" for out-of-range values).
string LogLevelString(LogLevel level)
{
    switch(level)
    {
    case LogLevel::Error: return "Error";
    case LogLevel::Warning: return "Warning";
    case LogLevel::Info: return "Info";
    case LogLevel::Info2: return "Info2";
    default: return "Unknown";
    }
}
// Emit a log line "<Level>:<header>, <content>" to stderr when the level is
// enabled, and return the stream so callers can append with "<<".
// Fixed: the original returned cerr even for suppressed levels, so chained
// "<<" output from callers leaked to stderr; a null-buffer stream (which
// silently discards writes) is returned for suppressed levels instead.
ostream& fdt_log(LogLevel level, const char* header, const char* content)
{
    if(level > olCompile::defLevel)
    {
        static ostream null_stream(nullptr); // null streambuf: discards all output
        return null_stream;
    }
    cerr << endl << LogLevelString(level) << ":" << header << ", " << content;
    return cerr;
}
// Unconditional log stream (no level filtering, no header).
ostream& fdt_log() { return (cerr); };
// Terminate the current log line.
void fdt_log_flush() { cerr << endl; }
};

View File

@@ -0,0 +1,319 @@
/*
* Derived from a public-domain MD5 implementation. Original license
* below.
*
* This is an OpenSSL-compatible implementation of the RSA Data Security, Inc.
* MD5 Message-Digest Algorithm (RFC 1321).
*
* Homepage:
* http://openwall.info/wiki/people/solar/software/public-domain-source-code/md5
*
* Author:
* Alexander Peslyak, better known as Solar Designer <solar at openwall.com>
*
* This software was written by Alexander Peslyak in 2001. No copyright is
* claimed, and the software is hereby placed in the public domain.
* In case this attempt to disclaim copyright and place the software in the
* public domain is deemed null and void, then the software is
* Copyright (c) 2001 Alexander Peslyak and it is hereby released to the
* general public under the following terms:
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted.
*
* There's ABSOLUTELY NO WARRANTY, express or implied.
*
* (This is a heavily cut-down "BSD license".)
*
* This differs from Colin Plumb's older public domain implementation in that
* no exactly 32-bit integer data type is required (any 32-bit or wider
* unsigned integer data type will do), there's no compile-time endianness
* configuration, and the function prototypes match OpenSSL's. No code from
* Colin Plumb's implementation has been reused; this comment merely compares
* the properties of the two independent implementations.
*
* The primary goals of this implementation are portability and ease of use.
* It is meant to be fast, but not as fast as possible. Some known
* optimizations are not included to reduce source code size and avoid
* compile-time configuration.
*/
#include <md5.hpp>
#include <array>
#include <cstring>
#include <cstdint>
#include <sstream>
#include <iomanip>
#define MD5_DIGEST_LENGTH 16
// MD5 hashing state (RFC 1321).
struct MD5_CTX
{
    // Message length counter in bits, as a 64-bit value split into lo/hi.
    uint32_t lo, hi;
    // Chaining variables A, B, C, D.
    uint32_t a, b, c, d;
    // Buffer for a partial (not yet 64-byte) input block.
    unsigned char buffer[64];
    // Decoded words of the current block (used by the big-endian SET/GET path).
    uint32_t block[MD5_DIGEST_LENGTH];
};
/*
* The basic MD5 functions.
*
* F and G are optimized compared to their RFC 1321 definitions for
* architectures that lack an AND-NOT instruction, just like in Colin Plumb's
* implementation.
*/
#define F(x, y, z) ((z) ^ ((x) & ((y) ^ (z))))
#define G(x, y, z) ((y) ^ ((z) & ((x) ^ (y))))
#define H(x, y, z) (((x) ^ (y)) ^ (z))
#define H2(x, y, z) ((x) ^ ((y) ^ (z)))
#define I(x, y, z) ((y) ^ ((x) | ~(z)))
/*
* The MD5 transformation for all four rounds.
*/
#define STEP(f, a, b, c, d, x, t, s) \
(a) += f((b), (c), (d)) + (x) + (t); \
(a) = (((a) << (s)) | (((a)&0xffffffff) >> (32 - (s)))); \
(a) += (b);
/*
* SET reads 4 input bytes in little-endian byte order and stores them in a
* properly aligned word in host byte order.
*
* The check for little-endian architectures that tolerate unaligned memory
* accesses is just an optimization. Nothing will break if it fails to detect
* a suitable architecture.
*
* Unfortunately, this optimization may be a C strict aliasing rules violation
* if the caller's data buffer has effective type that cannot be aliased by
* uint32_t. In practice, this problem may occur if these MD5 routines are
* inlined into a calling function, or with future and dangerously advanced
* link-time optimizations. For the time being, keeping these MD5 routines in
* their own translation unit avoids the problem.
*/
#if defined(__i386__) || defined(__x86_64__) || defined(__vax__)
#define SET(n) (*reinterpret_cast<const uint32_t*>(&ptr[(n)*4]))
#define GET(n) SET(n)
#else
#define SET(n) \
(ctx->block[(n)] = static_cast<uint32_t>(ptr[(n)*4]) | \
(static_cast<uint32_t>(ptr[(n)*4 + 1]) << 8) | \
(static_cast<uint32_t>(ptr[(n)*4 + 2]) << 16) | \
(static_cast<uint32_t>(ptr[(n)*4 + 3]) << 24))
#define GET(n) (ctx->block[(n)])
#endif
/*
* This processes one or more 64-byte data blocks, but does NOT update the bit
* counters. There are no alignment requirements.
*/
// MD5 compression function: processes size/64 consecutive 64-byte blocks,
// updating the chaining variables in ctx. Does NOT update the bit counters.
// Returns a pointer just past the last consumed byte.
// Precondition: size is a nonzero multiple of 64 (the do/while relies on it).
static const void* body(MD5_CTX* ctx, const void* data, size_t size)
{
    const unsigned char* ptr;
    uint32_t a, b, c, d;
    ptr = static_cast<const unsigned char*>(data);
    // Work on local copies of the chaining variables for speed.
    a = ctx->a;
    b = ctx->b;
    c = ctx->c;
    d = ctx->d;
    do
    {
        uint32_t saved_a = a, saved_b = b, saved_c = c, saved_d = d;
        /* Round 1 */
        STEP(F, a, b, c, d, SET(0), 0xd76aa478, 7)
        STEP(F, d, a, b, c, SET(1), 0xe8c7b756, 12)
        STEP(F, c, d, a, b, SET(2), 0x242070db, 17)
        STEP(F, b, c, d, a, SET(3), 0xc1bdceee, 22)
        STEP(F, a, b, c, d, SET(4), 0xf57c0faf, 7)
        STEP(F, d, a, b, c, SET(5), 0x4787c62a, 12)
        STEP(F, c, d, a, b, SET(6), 0xa8304613, 17)
        STEP(F, b, c, d, a, SET(7), 0xfd469501, 22)
        STEP(F, a, b, c, d, SET(8), 0x698098d8, 7)
        STEP(F, d, a, b, c, SET(9), 0x8b44f7af, 12)
        STEP(F, c, d, a, b, SET(10), 0xffff5bb1, 17)
        STEP(F, b, c, d, a, SET(11), 0x895cd7be, 22)
        STEP(F, a, b, c, d, SET(12), 0x6b901122, 7)
        STEP(F, d, a, b, c, SET(13), 0xfd987193, 12)
        STEP(F, c, d, a, b, SET(14), 0xa679438e, 17)
        STEP(F, b, c, d, a, SET(15), 0x49b40821, 22)
        /* Round 2 */
        STEP(G, a, b, c, d, GET(1), 0xf61e2562, 5)
        STEP(G, d, a, b, c, GET(6), 0xc040b340, 9)
        STEP(G, c, d, a, b, GET(11), 0x265e5a51, 14)
        STEP(G, b, c, d, a, GET(0), 0xe9b6c7aa, 20)
        STEP(G, a, b, c, d, GET(5), 0xd62f105d, 5)
        STEP(G, d, a, b, c, GET(10), 0x02441453, 9)
        STEP(G, c, d, a, b, GET(15), 0xd8a1e681, 14)
        STEP(G, b, c, d, a, GET(4), 0xe7d3fbc8, 20)
        STEP(G, a, b, c, d, GET(9), 0x21e1cde6, 5)
        STEP(G, d, a, b, c, GET(14), 0xc33707d6, 9)
        STEP(G, c, d, a, b, GET(3), 0xf4d50d87, 14)
        STEP(G, b, c, d, a, GET(8), 0x455a14ed, 20)
        STEP(G, a, b, c, d, GET(13), 0xa9e3e905, 5)
        STEP(G, d, a, b, c, GET(2), 0xfcefa3f8, 9)
        STEP(G, c, d, a, b, GET(7), 0x676f02d9, 14)
        STEP(G, b, c, d, a, GET(12), 0x8d2a4c8a, 20)
        /* Round 3 */
        STEP(H, a, b, c, d, GET(5), 0xfffa3942, 4)
        STEP(H2, d, a, b, c, GET(8), 0x8771f681, 11)
        STEP(H, c, d, a, b, GET(11), 0x6d9d6122, 16)
        STEP(H2, b, c, d, a, GET(14), 0xfde5380c, 23)
        STEP(H, a, b, c, d, GET(1), 0xa4beea44, 4)
        STEP(H2, d, a, b, c, GET(4), 0x4bdecfa9, 11)
        STEP(H, c, d, a, b, GET(7), 0xf6bb4b60, 16)
        STEP(H2, b, c, d, a, GET(10), 0xbebfbc70, 23)
        STEP(H, a, b, c, d, GET(13), 0x289b7ec6, 4)
        STEP(H2, d, a, b, c, GET(0), 0xeaa127fa, 11)
        STEP(H, c, d, a, b, GET(3), 0xd4ef3085, 16)
        STEP(H2, b, c, d, a, GET(6), 0x04881d05, 23)
        STEP(H, a, b, c, d, GET(9), 0xd9d4d039, 4)
        STEP(H2, d, a, b, c, GET(12), 0xe6db99e5, 11)
        STEP(H, c, d, a, b, GET(15), 0x1fa27cf8, 16)
        STEP(H2, b, c, d, a, GET(2), 0xc4ac5665, 23)
        /* Round 4 */
        STEP(I, a, b, c, d, GET(0), 0xf4292244, 6)
        STEP(I, d, a, b, c, GET(7), 0x432aff97, 10)
        STEP(I, c, d, a, b, GET(14), 0xab9423a7, 15)
        STEP(I, b, c, d, a, GET(5), 0xfc93a039, 21)
        STEP(I, a, b, c, d, GET(12), 0x655b59c3, 6)
        STEP(I, d, a, b, c, GET(3), 0x8f0ccc92, 10)
        STEP(I, c, d, a, b, GET(10), 0xffeff47d, 15)
        STEP(I, b, c, d, a, GET(1), 0x85845dd1, 21)
        STEP(I, a, b, c, d, GET(8), 0x6fa87e4f, 6)
        STEP(I, d, a, b, c, GET(15), 0xfe2ce6e0, 10)
        STEP(I, c, d, a, b, GET(6), 0xa3014314, 15)
        STEP(I, b, c, d, a, GET(13), 0x4e0811a1, 21)
        STEP(I, a, b, c, d, GET(4), 0xf7537e82, 6)
        STEP(I, d, a, b, c, GET(11), 0xbd3af235, 10)
        STEP(I, c, d, a, b, GET(2), 0x2ad7d2bb, 15)
        STEP(I, b, c, d, a, GET(9), 0xeb86d391, 21)
        // Davies-Meyer feed-forward: add the block's input state back in.
        a += saved_a;
        b += saved_b;
        c += saved_c;
        d += saved_d;
        ptr += 64;
    } while((size -= 64) != 0u);
    ctx->a = a;
    ctx->b = b;
    ctx->c = c;
    ctx->d = d;
    return ptr;
}
// Reset the context: RFC 1321 initial chaining values, zero length counters.
static void MD5_Init(MD5_CTX* ctx)
{
    ctx->a = 0x67452301;
    ctx->b = 0xefcdab89;
    ctx->c = 0x98badcfe;
    ctx->d = 0x10325476;
    ctx->lo = 0;
    ctx->hi = 0;
}
// Absorb `size` bytes of input, buffering any trailing partial 64-byte block.
static void MD5_Update(MD5_CTX* ctx, const void* data, size_t size)
{
    uint32_t saved_lo;
    size_t used;
    // Update the 64-bit message length (lo/hi), detecting lo overflow.
    saved_lo = ctx->lo;
    if((ctx->lo = (saved_lo + size) & 0x1fffffff) < saved_lo)
        ctx->hi++;
    ctx->hi += size >> 29;
    // Bytes already sitting in the partial-block buffer.
    used = saved_lo & 0x3f;
    if(used != 0u)
    {
        size_t available = 64 - used;
        if(size < available)
        {
            // Still not a full block: just append and return.
            memcpy(&ctx->buffer[used], data, size);
            return;
        }
        // Complete the buffered block and compress it.
        memcpy(&ctx->buffer[used], data, available);
        data = static_cast<const unsigned char*>(data) + available;
        size -= available;
        body(ctx, ctx->buffer, 64);
    }
    // Compress all remaining whole blocks directly from the input.
    if(size >= 64)
    {
        data = body(ctx, data, size & ~size_t{0x3f});
        size &= 0x3f;
    }
    // Stash the leftover tail (< 64 bytes) for the next call / finalization.
    memcpy(ctx->buffer, data, size);
}
// Store a 32-bit word into dst[0..3] in little-endian byte order, as
// required by the MD5 digest serialization.
#define OUT(dst, src) \
    (dst)[0] = static_cast<unsigned char>(src); \
    (dst)[1] = static_cast<unsigned char>((src) >> 8); \
    (dst)[2] = static_cast<unsigned char>((src) >> 16); \
    (dst)[3] = static_cast<unsigned char>((src) >> 24);
// Finish the digest: append the 0x80 terminator, zero-pad to 56 bytes
// (mod 64), append the 64-bit bit-length, process the final block(s),
// and write the 16-byte digest into `result` little-endian.
// The context is wiped before returning.
static void MD5_Final(unsigned char* result, MD5_CTX* ctx)
{
    size_t used, available;
    used = ctx->lo & 0x3f;
    ctx->buffer[used++] = 0x80;
    available = 64 - used;
    if(available < 8)
    {
        // No room left for the 8-byte length field; pad out this block
        // and start a fresh one.
        memset(&ctx->buffer[used], 0, available);
        body(ctx, ctx->buffer, 64);
        used = 0;
        available = 64;
    }
    memset(&ctx->buffer[used], 0, available - 8);
    // Convert the stored byte count into a bit count (RFC 1321).
    ctx->lo <<= 3;
    OUT(&ctx->buffer[56], ctx->lo)
    OUT(&ctx->buffer[60], ctx->hi)
    body(ctx, ctx->buffer, 64);
    OUT(&result[0], ctx->a)
    OUT(&result[4], ctx->b)
    OUT(&result[8], ctx->c)
    OUT(&result[12], ctx->d)
    // Scrub the context so no message state lingers in memory.
    memset(ctx, 0, sizeof(*ctx));
}
namespace olCompile {
std::string md5(std::string s)
{
std::array<unsigned char, MD5_DIGEST_LENGTH> result{};
MD5_CTX ctx{};
MD5_Init(&ctx);
MD5_Update(&ctx, s.data(), s.length());
MD5_Final(result.data(), &ctx);
std::ostringstream sout;
sout << std::hex << std::setfill('0');
for(auto c : result)
sout << std::setw(2) << int{c};
return sout.str();
}
} // namespace olCompile

View File

@@ -0,0 +1,119 @@
/*******************************************************************************
*
* MIT License
*
* Copyright (c) 2020 Advanced Micro Devices, Inc.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in all
* copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*
*******************************************************************************/
#include <env.hpp>
#include <handle.hpp>
#include <stringutils.hpp>
#include <target_properties.hpp>
#include <map>
#include <string>
OLC_DECLARE_ENV_VAR(OLC_DEBUG_ENFORCE_DEVICE)
namespace olCompile {
// Canonicalize a raw device name: an enforced override wins, then the
// legacy-name map, then the raw name with any ":feature" suffix removed.
static std::string GetDeviceNameFromMap(const std::string& in)
{
    // Legacy/marketing names mapped to canonical gfx ISA names.
    // NOLINTNEXTLINE (cppcoreguidelines-avoid-non-const-global-variables)
    static std::map<std::string, std::string> device_name_map = {
        {"Ellesmere", "gfx803"},
        {"Baffin", "gfx803"},
        {"RacerX", "gfx803"},
        {"Polaris10", "gfx803"},
        {"Polaris11", "gfx803"},
        {"Tonga", "gfx803"},
        {"Fiji", "gfx803"},
        {"gfx800", "gfx803"},
        {"gfx802", "gfx803"},
        {"gfx804", "gfx803"},
        {"Vega10", "gfx900"},
        {"gfx901", "gfx900"},
        {"10.3.0 Sienna_Cichlid 18", "gfx1030"},
    };
    // A non-empty OLC_DEBUG_ENFORCE_DEVICE overrides everything else.
    const char* const enforced = olCompile::GetStringEnv(OLC_DEBUG_ENFORCE_DEVICE{});
    if(enforced != nullptr && strlen(enforced) > 0)
        return {enforced};
    // Strip any ":feature" suffix, e.g. "gfx906:sramecc+".
    const auto name = in.substr(0, in.find(':')); // str.substr(0, npos) returns str.
    const auto match = device_name_map.find(name);
    if(match == device_name_map.end())
        return name; // NOLINT (performance-no-automatic-move)
    return match->second;
}
// Populate target properties from the device name reported by the
// handle (a raw name may carry feature suffixes such as ":sramecc+").
void TargetProperties::Init(const Handle* const handle)
{
    const auto rawName = [&]() -> std::string { return handle->GetDeviceNameImpl(); }();
    name = GetDeviceNameFromMap(rawName);
    // DKMS driver older than 5.9 may report incorrect state of SRAMECC feature.
    // Therefore we compute default SRAMECC and rely on it for now.
    sramecc = [&]() -> boost::optional<bool> {
        if(name == "gfx906" || name == "gfx908")
            return {true};
        return {};
    }();
    // However we need to store the reported state, even if it is incorrect,
    // to use together with COMGR.
    sramecc_reported = [&]() -> boost::optional<bool> {
        if(rawName.find(":sramecc+") != std::string::npos)
            return true;
        if(rawName.find(":sramecc-") != std::string::npos)
            return false;
        return sramecc; // default
    }();
    xnack = [&]() -> boost::optional<bool> {
        if(rawName.find(":xnack+") != std::string::npos)
            return true;
        if(rawName.find(":xnack-") != std::string::npos)
            return false;
        return {}; // default
    }();
    // Derive the database id from the properties just computed.
    InitDbId();
}
void TargetProperties::InitDbId()
{
dbId = name;
if(name == "gfx906" || name == "gfx908")
{
// Let's stay compatible with existing gfx906/908 databases.
// When feature equal to the default (SRAMECC ON), do not
// append feature suffix. This is for backward compatibility
// with legacy databases ONLY!
if(!sramecc || !(*sramecc))
dbId += "_nosramecc";
}
else
{
if(sramecc && *sramecc)
dbId += "_sramecc";
}
if(xnack && *xnack)
dbId += "_xnack";
}
} // namespace olCompile

View File

@@ -0,0 +1,66 @@
/*******************************************************************************
*
* MIT License
*
* Copyright (c) 2019 Advanced Micro Devices, Inc.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in all
* copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*
*******************************************************************************/
#include <tmp_dir.hpp>
#include <env.hpp>
#include <boost/filesystem.hpp>
#include <logger.hpp>
OLC_DECLARE_ENV_VAR(OLC_DEBUG_SAVE_TEMP_DIR)
namespace olCompile {
// Log and run a shell command, throwing when it exits with a non-zero
// status.
void SystemCmd(std::string cmd)
{
    // Log before executing so failed commands can still be traced.
    fdt_log(LogLevel::Info, "SystemCmd", cmd.c_str());
    fdt_log_flush();
    const int rc = std::system(cmd.c_str());
    if(rc != 0)
        throw std::runtime_error("Can't execute " + cmd);
}
// Create a unique temporary directory named
// "olCompile-<prefix>-XXXX-XXXX-XXXX-XXXX" under the system temp path.
TmpDir::TmpDir(std::string prefix)
    : path(boost::filesystem::temp_directory_path() /
           boost::filesystem::unique_path("olCompile-" + prefix + "-%%%%-%%%%-%%%%-%%%%"))
{
    boost::filesystem::create_directories(this->path);
}
// Run `exe args` from inside this temporary directory.
void TmpDir::Execute(std::string exe, std::string args) const
{
    const std::string cmd =
        "cd " + this->path.string() + "; " + exe + " " + args; // + " > /dev/null";
    SystemCmd(cmd);
}
// Remove the directory and its contents, unless the user asked to keep
// it for debugging via OLC_DEBUG_SAVE_TEMP_DIR.
TmpDir::~TmpDir()
{
    if(olCompile::IsEnabled(OLC_DEBUG_SAVE_TEMP_DIR{}))
        return; // keep the temporary directory around for inspection
    boost::filesystem::remove_all(this->path);
}
} // namespace olCompile

View File

@@ -0,0 +1,52 @@
/*******************************************************************************
*
* MIT License
*
* Copyright (c) 2017 Advanced Micro Devices, Inc.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in all
* copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*
*******************************************************************************/
#ifndef GUARD_OLC_BINARY_CACHE_HPP
#define GUARD_OLC_BINARY_CACHE_HPP
#include <target_properties.hpp>
#include <boost/filesystem/path.hpp>
#include <string>
namespace olCompile {
// On-disk cache of compiled code objects, keyed by device, program
// name, and build arguments.
// Path of the cache entry for (device, name, args).
boost::filesystem::path
GetCacheFile(const std::string& device, const std::string& name, const std::string& args);
// Root directory of the binary cache.
boost::filesystem::path GetCachePath();
// Look up a previously built code object for the given target/CU count.
// NOTE(review): behavior on a cache miss is defined in the .cpp — confirm there.
boost::filesystem::path LoadBinary(const TargetProperties& target,
                                   std::size_t num_cu,
                                   const std::string& name,
                                   const std::string& args);
// Store a freshly built code object into the cache.
void SaveBinary(const boost::filesystem::path& binary_path,
                const TargetProperties& target,
                const std::string& name,
                const std::string& args);
} // namespace olCompile
#endif

View File

@@ -0,0 +1,47 @@
/*******************************************************************************
*
* MIT License
*
* Copyright (c) 2017 Advanced Micro Devices, Inc.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in all
* copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*
*******************************************************************************/
#ifndef GUARD_CONFIG_H_IN
#define GUARD_CONFIG_H_IN
// "_PACKAGE_" is inserted into these names to avoid collisions: macros
// like HIP_VERSION_MAJOR are already defined in hip_version.h.
// clang-format off
#define HIP_PACKAGE_VERSION_MAJOR @OLC_hip_VERSION_MAJOR@
#define HIP_PACKAGE_VERSION_MINOR @OLC_hip_VERSION_MINOR@
#define HIP_PACKAGE_VERSION_PATCH @OLC_hip_VERSION_PATCH@
// clang-format on
// Single-integer encoding of the HIP package version for comparisons.
#define HIP_PACKAGE_VERSION_FLAT \
    ((HIP_PACKAGE_VERSION_MAJOR * 1000ULL + HIP_PACKAGE_VERSION_MINOR) * 1000000 + \
     HIP_PACKAGE_VERSION_PATCH)
// Values below are substituted by CMake at configure time.
#cmakedefine01 OLC_DEBUG
#cmakedefine OLC_HIP_COMPILER "@OLC_HIP_COMPILER@"
#cmakedefine EXTRACTKERNEL_BIN "@EXTRACTKERNEL_BIN@"
#cmakedefine OLC_OFFLOADBUNDLER_BIN "@OLC_OFFLOADBUNDLER_BIN@"
#endif

View File

@@ -0,0 +1,123 @@
/*******************************************************************************
*
* MIT License
*
* Copyright (c) 2017 Advanced Micro Devices, Inc.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in all
* copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*
*******************************************************************************/
#ifndef GUARD_OLC_ENV_HPP
#define GUARD_OLC_ENV_HPP
#include <cstdlib>
#include <cstring>
#include <string>
#include <vector>
namespace olCompile {
/// \todo Rework: Case-insensitive string compare, ODR, (?) move to .cpp
// Declare a cached environment variable
#define OLC_DECLARE_ENV_VAR(x) \
struct x \
{ \
static const char* value() { return #x; } \
};
/*
 * Returns true only when the named environment variable is set to one
 * of the recognized "off" values: disable, disabled, 0, no, false.
 * An unset variable (or any other value) yields false.
 */
inline bool IsEnvvarValueDisabled(const char* name)
{
    const char* value = std::getenv(name);
    if(value == nullptr)
        return false;
    return std::strcmp(value, "disable") == 0 || std::strcmp(value, "disabled") == 0 ||
           std::strcmp(value, "0") == 0 || std::strcmp(value, "no") == 0 ||
           std::strcmp(value, "false") == 0;
}
// Returns true only when the named environment variable is set to one
// of the recognized "on" values: enable, enabled, 1, yes, true.
// An unset variable (or any other value) yields false.
inline bool IsEnvvarValueEnabled(const char* name)
{
    const char* value = std::getenv(name);
    if(value == nullptr)
        return false;
    return std::strcmp(value, "enable") == 0 || std::strcmp(value, "enabled") == 0 ||
           std::strcmp(value, "1") == 0 || std::strcmp(value, "yes") == 0 ||
           std::strcmp(value, "true") == 0;
}
// Returns `fallback` when the variable is unset; otherwise parses its
// value with strtoul using base auto-detection (decimal, leading-0x
// hexadecimal, or leading-0 octal).
inline unsigned long int EnvvarValue(const char* name, unsigned long int fallback = 0)
{
    const char* value = std::getenv(name);
    return (value == nullptr) ? fallback : strtoul(value, nullptr, 0);
}
// Returns the variable's value wrapped in a single-element vector, or
// an empty vector when the variable is unset.
inline std::vector<std::string> GetEnv(const char* name)
{
    std::vector<std::string> result;
    if(const char* value = std::getenv(name))
        result.emplace_back(value);
    return result;
}
// Returns the value of the environment variable named by T::value() as
// a C string, or nullptr when unset. The environment is read once per
// type T and cached statically for the process lifetime, so later
// changes to the variable are not observed.
template <class T>
inline const char* GetStringEnv(T)
{
    static const std::vector<std::string> result = GetEnv(T::value());
    if(result.empty())
        return nullptr;
    else
        return result.front().c_str();
}
// True when environment variable T::value() is set to an "on" value
// (enable/enabled/1/yes/true). Evaluated once per type T and cached
// for the process lifetime.
template <class T>
inline bool IsEnabled(T)
{
    static const bool result = olCompile::IsEnvvarValueEnabled(T::value());
    return result;
}
// True when environment variable T::value() is set to an "off" value
// (disable/disabled/0/no/false). Evaluated once per type T and cached
// for the process lifetime.
template <class T>
inline bool IsDisabled(T)
{
    static const bool result = olCompile::IsEnvvarValueDisabled(T::value());
    return result;
}
// Numeric value of environment variable T::value() (decimal, 0x-hex,
// or leading-0 octal), or `fallback` when unset. Cached once per type
// T — note the fallback supplied on the FIRST call wins for all
// subsequent calls.
template <class T>
inline unsigned long int Value(T, unsigned long int fallback = 0)
{
    static const auto result = olCompile::EnvvarValue(T::value(), fallback);
    return result;
}
} // namespace olCompile
#endif

View File

@@ -0,0 +1,42 @@
/*******************************************************************************
*
* MIT License
*
* Copyright (c) 2019 Advanced Micro Devices, Inc.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in all
* copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*
*******************************************************************************/
#ifndef EXEC_OLC_UTILS_HPP
#define EXEC_OLC_UTILS_HPP
#include <istream>
#include <ostream>
#include <string>
namespace olCompile {
namespace exec {
/// Run command line `p`; when `in` is non-null it feeds the child's
/// stdin, when `out` is non-null it captures the child's stdout.
/// Returns the child's exit status.
/// Redirecting both input and output is not supported.
int Run(const std::string& p, std::istream* in, std::ostream* out);
} // namespace exec
} // namespace olCompile
#endif // EXEC_OLC_UTILS_HPP

View File

@@ -0,0 +1,145 @@
/*******************************************************************************
*
* MIT License
*
* Copyright (c) 2017 Advanced Micro Devices, Inc.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in all
* copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*
*******************************************************************************/
#ifndef GUARD_OLC_HANDLE_HPP_
#define GUARD_OLC_HANDLE_HPP_
#include <kernel.hpp>
#include <stringutils.hpp>
#include <target_properties.hpp>
#include <boost/range/adaptor/transformed.hpp>
#include <cstdio>
#include <cstring>
#include <ios>
#include <sstream>
#include <memory>
#include <vector>
#include <unordered_map>
namespace olCompile {
struct HandleImpl;
// Wrapper around a HIP stream plus caches of compiled programs and
// kernels (state lives behind the HandleImpl pimpl).
struct Handle
{
    friend struct TargetProperties;
    Handle();
    Handle(hipStream_t stream);
    Handle(Handle&&) noexcept;
    ~Handle();
    hipStream_t GetStream() const;
    void SetStream(hipStream_t streamID) const;
    // Build/register a kernel under (algorithm, network_config) with the
    // given local (vld) and global (vgd) launch dimensions; returns an
    // invoker bound to this handle.
    KernelInvoke AddKernel(const std::string& algorithm,
                           const std::string& network_config,
                           const std::string& program_name,
                           const std::string& kernel_name,
                           const std::vector<size_t>& vld,
                           const std::vector<size_t>& vgd,
                           const std::string& params,
                           std::size_t cache_index = 0) const;
    bool HasKernel(const std::string& algorithm, const std::string& network_config) const;
    void ClearKernels(const std::string& algorithm, const std::string& network_config) const;
    // Lazy range of invokers, one per kernel cached for the key.
    auto GetKernels(const std::string& algorithm, const std::string& network_config) const
    {
        return this->GetKernelsImpl(algorithm, network_config) |
               boost::adaptors::transformed([this](Kernel k) { return this->Run(k); });
    }
    // First cached kernel for the key; throws if none was registered.
    KernelInvoke GetKernel(const std::string& algorithm, const std::string& network_config) const
    {
        auto ks = this->GetKernelsImpl(algorithm, network_config);
        if(ks.empty())
        {
            throw std::runtime_error("looking for default kernel (does not exist): " + algorithm +
                                     ", " + network_config);
        }
        return this->Run(ks.front());
    }
    KernelInvoke Run(Kernel k) const;
    Program LoadProgram(const std::string& program_name, std::string params) const;
    bool HasProgram(const std::string& program_name, const std::string& params) const;
    void AddProgram(Program prog, const std::string& program_name, const std::string& params) const;
    void Finish() const;
    std::size_t GetLocalMemorySize() const;
    std::size_t GetGlobalMemorySize() const;
    std::size_t GetWavefrontWidth() const;
    std::size_t GetMaxComputeUnits() const;
    // gfx10+ devices report WGPs; each WGP holds two CUs, hence the
    // doubling for "gfx1*" names.
    std::size_t GetMaxHardwareComputeUnits() const
    {
        std::size_t num_cu = this->GetMaxComputeUnits();
        std::string name   = this->GetDeviceName();
        return StartsWith(name, "gfx1") ? num_cu * 2 /* CUs per WGP */ : num_cu;
    }
    std::string GetDeviceName() const;
    const TargetProperties& GetTargetProperties() const;

    private:
    std::string GetDeviceNameImpl() const;
    const std::vector<Kernel>& GetKernelsImpl(const std::string& algorithm,
                                              const std::string& network_config) const;

    public:
    std::ostream& Print(std::ostream& os) const;
    // Database file basename: target DbId plus the CU count — rendered
    // as "_<decimal>" when <= 64, bare hex otherwise.
    static std::string GetDbBasename(const TargetProperties& target, size_t num_cu)
    {
        auto ret = target.DbId() + [&]() {
            std::ostringstream ss;
            if(num_cu <= 64)
                ss << '_' << num_cu;
            else
                ss << std::hex << num_cu;
            return std::string(ss.str());
        }();
        return ret;
    }
    std::string GetDbBasename() const
    {
        return GetDbBasename(GetTargetProperties(), GetMaxComputeUnits());
    }
    std::unique_ptr<HandleImpl> impl;
};
inline std::ostream& operator<<(std::ostream& os, const Handle& handle) { return handle.Print(os); }
} // namespace olCompile
#endif // GUARD_OLC_HANDLE_HPP_

View File

@@ -0,0 +1,22 @@
#ifndef _HIP_OLC_CHECK_HPP_
#define _HIP_OLC_CHECK_HPP_
#include <hip/hip_runtime.h>
#include <sstream>
#include <vector>
// Evaluate a HIP runtime call once and throw std::runtime_error with
// file/line and the HIP error string when it does not return hipSuccess.
// Here flag can be a constant, variable or function call
#define MY_HIP_CHECK(flag) \
    do \
    { \
        hipError_t _tmpVal; \
        if((_tmpVal = flag) != hipSuccess) \
        { \
            std::ostringstream ostr; \
            ostr << "HIP Function Failed (" << __FILE__ << "," << __LINE__ << ") " \
                 << hipGetErrorString(_tmpVal); \
            throw std::runtime_error(ostr.str()); \
        } \
    } while(0)
#endif

View File

@@ -0,0 +1,97 @@
/*******************************************************************************
*
* MIT License
*
* Copyright (c) 2019 Advanced Micro Devices, Inc.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in all
* copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*
*******************************************************************************/
#ifndef OLC_GUARD_OLC_HIP_BUILD_UTILS_HPP
#define OLC_GUARD_OLC_HIP_BUILD_UTILS_HPP
#include <target_properties.hpp>
#include <kernel.hpp>
#include <boost/optional.hpp>
#include <string>
namespace olCompile {
// Compile HIP source (`src`, or `filename` already on disk when
// sources_already_reside_on_filesystem is true) for `target` and
// return the path of the resulting code object.
// NOTE(review): presumably the build happens inside `tmp_dir` — confirm
// against the implementation.
boost::filesystem::path HipBuild(boost::optional<olCompile::TmpDir>& tmp_dir,
                                 const std::string& filename,
                                 std::string src,
                                 std::string params,
                                 const TargetProperties& target,
                                 bool sources_already_reside_on_filesystem = false);
// Read the whole binary file into `buf`.
void bin_file_to_str(const boost::filesystem::path& file, std::string& buf);
// Major/minor/patch version of an external tool; -1 fields mean the
// version could not be determined.
struct external_tool_version_t
{
    int major = -1;
    int minor = -1;
    int patch = -1;
    friend bool operator>(const external_tool_version_t& lhs, const external_tool_version_t& rhs);
    friend bool operator<(const external_tool_version_t& lhs, const external_tool_version_t& rhs);
    friend bool operator>=(const external_tool_version_t& lhs, const external_tool_version_t& rhs);
    friend bool operator<=(const external_tool_version_t& lhs, const external_tool_version_t& rhs);
};
external_tool_version_t HipCompilerVersion();
bool IsHccCompiler();
bool IsHipClangCompiler();
// Pre-formats the target-id pieces (device name plus optional
// ":sramecc±" / ":xnack±" feature suffixes) from TargetProperties for
// use in compiler options.
class LcOptionTargetStrings
{
    public:
    const std::string& device;
    const std::string xnack;

    private:
    const std::string sramecc;          // from the computed default SRAMECC state
    const std::string sramecc_reported; // from the state reported by the runtime

    public:
    // Full target id: device + sramecc + xnack. The reported SRAMECC
    // value is kept separately and is not part of targetId.
    const std::string targetId;
    LcOptionTargetStrings(const TargetProperties& target)
        : device(target.Name()),
          xnack([&]() -> std::string {
              if(target.Xnack())
                  return std::string{":xnack"} + (*target.Xnack() ? "+" : "-");
              return {};
          }()),
          sramecc([&]() -> std::string {
              if(target.Sramecc())
                  return std::string{":sramecc"} + (*target.Sramecc() ? "+" : "-");
              return {};
          }()),
          sramecc_reported([&]() -> std::string {
              if(target.SrameccReported())
                  return std::string{":sramecc"} + (*target.SrameccReported() ? "+" : "-");
              return {};
          }()),
          targetId(device + sramecc + xnack)
    {
    }
};
} // namespace olCompile
#endif

View File

@@ -0,0 +1,174 @@
/*******************************************************************************
*
* MIT License
*
* Copyright (c) 2017 Advanced Micro Devices, Inc.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in all
* copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*
*******************************************************************************/
#ifndef GUARD_OLC_HIPOC_KERNEL_HPP
#define GUARD_OLC_HIPOC_KERNEL_HPP
#include <array>
#include <cassert>
#include <hipoc_program.hpp>
#include <stringutils.hpp>
#include <manage_ptr.hpp>
#include <op_kernel_args.hpp>
#include <hipCheck.hpp>
#include <vector>
#include <memory>
namespace olCompile {
// Owning pointer type for hipEvent_t that destroys the event on release.
using HipEventPtr = OLC_MANAGE_PTR(hipEvent_t, hipEventDestroy);
// Create a fresh HIP event; throws (via MY_HIP_CHECK) on failure.
inline HipEventPtr make_hip_event()
{
    hipEvent_t result = nullptr;
    MY_HIP_CHECK(hipEventCreate(&result));
    return HipEventPtr{result};
}
// Stores two kernel arguments back to back in one raw byte buffer,
// padding after T so that U starts at an offset that is a multiple of
// sizeof(U).
template <class T, class U>
struct KernelArgsPair
{
    static const int alignment    = sizeof(U);
    static const int padding      = (alignment - (sizeof(T) % alignment)) % alignment;
    static const int second_index = sizeof(T) + padding; // byte offset of U
    KernelArgsPair(T x, U y)
    {
        // Construct both values in place inside the flat buffer.
        new(buffer) T(x); // NOLINT (clang-analyzer-cplusplus.PlacementNew)
        new(buffer + second_index) U(y);
    }
    char buffer[second_index + sizeof(U)] = {};
};
// Recursively folds an argument list into nested KernelArgsPair
// objects, producing one contiguous, suitably padded byte region.
template <class... Ts>
struct KernelArgsPack;
template <class T, class U, class... Ts>
struct KernelArgsPack<T, U, Ts...>
{
    // Collapse the first two arguments into a pair, then recurse.
    using data_t = KernelArgsPack<KernelArgsPair<T, U>, Ts...>;
    KernelArgsPack(T x, U y, Ts... xs) : data(KernelArgsPair<T, U>(x, y), xs...) {}
    data_t data;
};
// Base case: a single remaining argument is stored directly.
template <class T>
struct KernelArgsPack<T>
{
    KernelArgsPack(T x) : head(x) {}
    T head;
};
// Complete kernel-argument block: the packed user arguments followed by
// six zeroed 64-bit trailing words.
// NOTE(review): `hidden` is already zero-initialized by its member
// initializer, so the std::fill is redundant (but harmless).
template <class... Ts>
struct KernelArgs
{
    KernelArgs(Ts... xs) : pack(xs...) { std::fill(std::begin(hidden), std::end(hidden), 0); }
    KernelArgsPack<Ts...> pack;
    uint64_t hidden[6] = {};
};
// Callable handle that launches a prepared HIP kernel function on a
// fixed stream with fixed local/global launch dimensions.
struct HIPOCKernelInvoke
{
    hipStream_t stream = nullptr;
    hipFunction_t fun = nullptr;
    std::array<size_t, 3> ldims = {};
    std::array<size_t, 3> gdims = {};
    std::string name;
    // Optional hook receiving the start/stop events of a launch.
    std::function<void(hipEvent_t, hipEvent_t)> callback;
    // Workaround for aggregate types in c++11
    HIPOCKernelInvoke() {}
    HIPOCKernelInvoke(hipStream_t pstream,
                      hipFunction_t pfun,
                      std::array<size_t, 3> pldims,
                      std::array<size_t, 3> pgdims,
                      std::string pname,
                      std::function<void(hipEvent_t, hipEvent_t)> pcallback)
        : stream(pstream), fun(pfun), ldims(pldims), gdims(pgdims), name(pname), callback(pcallback)
    {
    }
    // Launch with type-erased arguments: each argument is copied into a
    // flat buffer at an offset aligned to its own size (same padding
    // rule as KernelArgsPair).
    // NOTE(review): hip_args is a fixed 256-byte buffer; assumes the
    // packed arguments never exceed it — TODO confirm with callers.
    void operator()(std::vector<OpKernelArg>& any_args) const
    {
        char hip_args[256] = {0};
        auto sz_left = any_args[0].size();
        memcpy(hip_args, &(any_args[0].buffer[0]), any_args[0].size());
        for(unsigned long idx = 1; idx < any_args.size(); idx++)
        {
            auto& any_arg = any_args[idx];
            unsigned long alignment = any_arg.size();
            unsigned long padding = (alignment - (sz_left % alignment)) % alignment;
            unsigned long second_index = sz_left + padding;
            memcpy(hip_args + second_index, &(any_arg.buffer[0]), any_arg.size());
            sz_left = second_index + alignment;
        }
        run(hip_args, sz_left);
    }
    // Launch with statically typed arguments packed via KernelArgs.
    template <class... Ts>
    void operator()(Ts... xs) const
    {
        KernelArgs<Ts...> args{xs...};
        run(&args, sizeof(args));
    }
    void run(void* args, std::size_t size) const;
    const std::string& GetName() const { return name; }
};
// A kernel resolved from a compiled HIPOCProgram together with its
// launch configuration.
struct HIPOCKernel
{
    HIPOCProgram program;
    std::string name;
    std::array<size_t, 3> ldims = {};
    std::array<size_t, 3> gdims = {};
    std::string kernel_module;
    hipFunction_t fun = nullptr;
    HIPOCKernel() {}
    // Resolves `kernel_name` in the program's module; dimension vectors
    // may have rank 1..3 and are padded with 1 up to rank 3.
    // Throws (via MY_HIP_CHECK) when the symbol cannot be found.
    HIPOCKernel(HIPOCProgram p,
                const std::string kernel_name,
                std::vector<size_t> local_dims,
                std::vector<size_t> global_dims)
        : program(p), name(kernel_name)
    {
        assert(!local_dims.empty() && local_dims.size() <= 3);
        assert(!global_dims.empty() && global_dims.size() <= 3);
        ldims.fill(1);
        gdims.fill(1);
        std::copy(local_dims.begin(), local_dims.end(), ldims.begin());
        std::copy(global_dims.begin(), global_dims.end(), gdims.begin());
        kernel_module = name;
        MY_HIP_CHECK(hipModuleGetFunction(&fun, program.GetModule(), kernel_module.c_str()));
    }
    // Bind the kernel to a stream (and optional event callback).
    HIPOCKernelInvoke Invoke(hipStream_t stream,
                             std::function<void(hipEvent_t, hipEvent_t)> callback = nullptr) const;
};
} // namespace olCompile
#endif

View File

@@ -0,0 +1,64 @@
/*******************************************************************************
*
* MIT License
*
* Copyright (c) 2017 Advanced Micro Devices, Inc.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in all
* copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*
*******************************************************************************/
#ifndef GUARD_OLC_HIPOC_PROGRAM_HPP
#define GUARD_OLC_HIPOC_PROGRAM_HPP
#include <target_properties.hpp>
#include <manage_ptr.hpp>
#include <hipoc_program_impl.hpp>
#include <boost/filesystem/path.hpp>
#include <hip/hip_runtime_api.h>
#include <string>
namespace olCompile {
struct HIPOCProgramImpl;
// Shared-ownership handle to a compiled HIP program (code object).
struct HIPOCProgram
{
    HIPOCProgram();
    /// This ctor builds the program from source, initializes module.
    /// Also either CO pathname (typically if offline tools were used)
    /// or binary blob (if comgr was used to build the program)
    /// is initialized. GetModule(), GetCodeObjectPathname(),
    /// GetCodeObjectBlob() return appropriate data after this ctor.
    /// Other ctors only guarantee to initialize module.
    HIPOCProgram(const std::string& program_name,
                 std::string params,
                 const TargetProperties& target);
    /// Initializes the module from an already-built code object file.
    HIPOCProgram(const std::string& program_name, const boost::filesystem::path& hsaco);
    std::shared_ptr<const HIPOCProgramImpl> impl;
    hipModule_t GetModule() const;
    /// \return Pathname of CO file, if it resides on the filesystem.
    boost::filesystem::path GetCodeObjectPathname() const;
    /// \return Copy of in-memory CO blob.
    std::string GetCodeObjectBlob() const;
    /// \return True if CO blob resides in-memory.
    /// False if CO resides on filesystem.
    bool IsCodeObjectInMemory() const;
};
} // namespace olCompile
#endif

View File

@@ -0,0 +1,61 @@
/*******************************************************************************
*
* MIT License
*
* Copyright (c) 2021 Advanced Micro Devices, Inc.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in all
* copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*
*******************************************************************************/
#ifndef GUARD_OLC_HIPOC_PROGRAM_IMPL_HPP
#define GUARD_OLC_HIPOC_PROGRAM_IMPL_HPP
#include <target_properties.hpp>
#include <manage_ptr.hpp>
#include <tmp_dir.hpp>
#include <boost/filesystem/path.hpp>
#include <boost/optional.hpp>
#include <hip/hip_runtime_api.h>
namespace olCompile {
// RAII holder for a hipModule_t: unloads the module on destruction.
using hipModulePtr = OLC_MANAGE_PTR(hipModule_t, hipModuleUnload);
// Implementation state behind HIPOCProgram: the source name, build target,
// resulting code object (file and/or in-memory blob), and loaded HIP module.
struct HIPOCProgramImpl
{
    HIPOCProgramImpl(){};
    // Wrap an existing code-object file; no compilation is performed.
    HIPOCProgramImpl(const std::string& program_name, const boost::filesystem::path& filespec);
    // Build `program_name` with compiler flags `params` for the given target.
    HIPOCProgramImpl(const std::string& program_name,
                     std::string params,
                     const TargetProperties& target_);
    std::string program;                // kernel source/program name
    TargetProperties target;            // GPU target the CO is built for
    boost::filesystem::path hsaco_file; // path of the built code object, if on disk
    hipModulePtr module;                // loaded HIP module (owning)
    boost::optional<TmpDir> dir;        // temporary build directory, if one was used
    std::vector<char> binary;           // in-memory CO blob, if built in-memory
    void
    BuildCodeObjectInFile(std::string& params, const std::string& src, const std::string& filename);
    void BuildCodeObject(std::string params);
};
} // namespace olCompile
#endif // GUARD_OLC_HIPOC_PROGRAM_IMPL_HPP

View File

@@ -0,0 +1,45 @@
/*******************************************************************************
*
* MIT License
*
* Copyright (c) 2017 Advanced Micro Devices, Inc.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in all
* copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*
*******************************************************************************/
#ifndef GUARD_OLC_KERNEL_HPP
#define GUARD_OLC_KERNEL_HPP
#include <string>
#include <vector>
#include <hipoc_kernel.hpp>
namespace olCompile {
// Embedded source text for kernel file `name` (directory and extension are
// stripped and the stem upper-cased to form the lookup key).
std::string GetKernelSrc(std::string name);
// Embedded text of the include file registered under `key`.
std::string GetKernelInc(std::string key);
// All registered include-file keys.
std::vector<std::string> GetKernelIncList();
// Include-file keys ending in ".h"/".hpp", i.e. usable from HIP sources.
std::vector<std::string> GetHipKernelIncList();
// Convenience aliases: HIPOC types are the only kernel backend here.
using Kernel       = HIPOCKernel;
using KernelInvoke = HIPOCKernelInvoke;
using Program      = HIPOCProgram;
} // namespace olCompile
#endif

View File

@@ -0,0 +1,137 @@
/*******************************************************************************
*
* MIT License
*
* Copyright (c) 2019 Advanced Micro Devices, Inc.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in all
* copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*
*******************************************************************************/
#ifndef GUARD_OLC_KERNEL_BUILD_PARAMETERS_HPP_
#define GUARD_OLC_KERNEL_BUILD_PARAMETERS_HPP_
#include <algorithm>
#include <cassert>
#include <initializer_list>
#include <string>
#include <vector>
namespace olCompile {
namespace kbp {
// Tag type selecting the "raw compiler option" flavor of KBPInit (as
// opposed to the default "preprocessor define" flavor).
struct Option
{
};
} // namespace kbp
// Kind of a single build parameter: a -D style define or a plain option.
enum class ParameterTypes
{
    Define,
    Option,
};
// One build parameter: its kind, name, and (possibly empty) value.
struct KernelBuildParameter
{
    ParameterTypes type;
    std::string name;
    std::string value;
};
class KernelBuildParameters
{
public:
struct KBPInit
{
friend class KernelBuildParameters;
KBPInit(const std::string& name, const std::string& value = "")
: data{ParameterTypes::Define, name, value}
{
}
template <class TValue, class = decltype(std::to_string(std::declval<TValue>()))>
KBPInit(const std::string& name, const TValue& value) : KBPInit(name, std::to_string(value))
{
}
KBPInit(kbp::Option, const std::string& name, const std::string& value = "")
: data{ParameterTypes::Option, name, value}
{
}
template <class TValue, class = decltype(std::to_string(std::declval<TValue>()))>
KBPInit(kbp::Option, const std::string& name, const TValue& value)
: KBPInit(kbp::Option{}, name, std::to_string(value))
{
}
private:
KernelBuildParameter data{};
};
KernelBuildParameters() = default;
KernelBuildParameters(const std::initializer_list<KBPInit>& defines_)
{
options.reserve(defines_.size());
for(const auto& define : defines_)
{
assert(ValidateUniqueness(define.data.name));
options.push_back(define.data);
}
}
bool Empty() const { return options.empty(); }
void Define(const std::string& name, const std::string& value = "")
{
assert(ValidateUniqueness(name));
options.push_back({ParameterTypes::Define, name, value});
}
template <class TValue, class = decltype(std::to_string(std::declval<TValue>()))>
void Define(const std::string& name, const TValue& value)
{
Define(name, std::to_string(value));
}
KernelBuildParameters& operator<<(const KernelBuildParameters& other)
{
std::copy(other.options.begin(), other.options.end(), std::back_inserter(options));
return *this;
}
template <class TFor>
std::string GenerateFor(TFor&&) const
{
return TFor::Generate(options);
}
private:
std::vector<KernelBuildParameter> options = {};
bool ValidateUniqueness(const std::string& name) const
{
const auto eq = [=](const auto& item) { return item.name == name; };
return std::find_if(options.begin(), options.end(), eq) == options.end();
}
};
} // namespace olCompile
#endif

View File

@@ -0,0 +1,97 @@
/*******************************************************************************
*
* MIT License
*
* Copyright (c) 2017 Advanced Micro Devices, Inc.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in all
* copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*
*******************************************************************************/
/* ************************************************************************
* Copyright 2015 Vratis, Ltd.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
* ************************************************************************ */
#ifndef GUARD_OLC_KERNEL_CACHE_HPP_
#define GUARD_OLC_KERNEL_CACHE_HPP_
#include <handle.hpp>
#include <kernel.hpp>
#include <simple_hash.hpp>
#include <string>
#include <unordered_map>
#include <vector>
namespace olCompile {
/**
 * @brief The KernelCache class Build and cache kernels
 *
 */
class KernelCache
{
    public:
    // Cache key: (algorithm, network_config) for kernels,
    // (program_name, build params) for programs.
    using Key        = std::pair<std::string, std::string>;
    using KernelMap  = std::unordered_map<Key, std::vector<Kernel>, SimpleHash>;
    using ProgramMap = std::unordered_map<Key, Program, SimpleHash>;
    // Build a kernel (compiling its program if needed) and store it at slot
    // `cache_index` under the (algorithm, network_config) key.
    Kernel AddKernel(const Handle& h,
                     const std::string& algorithm,
                     const std::string& network_config,
                     const std::string& program_name,
                     const std::string& kernel_name,
                     const std::vector<size_t>& vld,
                     const std::vector<size_t>& vgd,
                     std::string params      = "",
                     std::size_t cache_index = 0);
    // Insert an already-built kernel at the given cache slot.
    void AddKernel(Key key, Kernel k, std::size_t cache_index);
    // Drop every kernel cached under (algorithm, network_config).
    void ClearKernels(const std::string& algorithm, const std::string& network_config);
    const std::vector<Kernel>& GetKernels(const std::string& algorithm,
                                          const std::string& network_config);
    bool HasKernels(const std::string& algorithm, const std::string& network_config) const;
    bool HasProgram(const std::string& name, const std::string& params) const;
    void AddProgram(Program prog, const std::string& program_name, std::string params);
    KernelCache();

    private:
    KernelMap kernel_map;   // built kernels, grouped per (algorithm, config)
    ProgramMap program_map; // compiled programs, keyed by (name, params)
};
} // namespace olCompile
#endif // GUARD_OLC_KERNEL_CACHE_HPP_

View File

@@ -0,0 +1,23 @@
#ifndef _OLC_LOGGER_HPP_
#define _OLC_LOGGER_HPP_
#include <fstream>
namespace olCompile {
// Severity levels for fdt_log; larger values are more verbose.
enum class LogLevel
{
    Quiet   = 1,
    Error   = 2,
    Warning = 3,
    Info    = 4,
    Info2   = 5
};
// Log `content` under `header` at the given severity; returns the stream so
// callers can append further output.
std::ostream& fdt_log(LogLevel level, const char* header, const char* content);
// Access the underlying log stream directly.
std::ostream& fdt_log();
// Flush any buffered log output.
void fdt_log_flush();
}; // namespace olCompile
#endif

View File

@@ -0,0 +1,76 @@
/*******************************************************************************
*
* MIT License
*
* Copyright (c) 2017 Advanced Micro Devices, Inc.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in all
* copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*
*******************************************************************************/
#ifndef GUARD_OLC_MANAGE_PTR_HPP
#define GUARD_OLC_MANAGE_PTR_HPP
#include <memory>
#include <type_traits>
namespace olCompile {
// Deleter that invokes the free function `f` (e.g. fclose, hipModuleUnload)
// on a non-null pointer; building block for unique_ptrs over C-style handles.
template <class F, F f>
struct manage_deleter
{
    template <class T>
    void operator()(T* x) const
    {
        if(x != nullptr)
        {
            // Return value of the cleanup function is deliberately ignored.
            (void)f(x); // NOLINT (cppcoreguidelines-owning-memory)
        }
    }
};
// Deleter that does nothing; for pointers whose lifetime is managed elsewhere.
struct null_deleter
{
    template <class T>
    void operator()(T* /*x*/) const
    {
    }
};
// unique_ptr whose deleter calls the free function `f`.
template <class T, class F, F f>
using manage_ptr = std::unique_ptr<T, manage_deleter<F, f>>;
// Trait yielding the element_type of a smart pointer.
template <class T>
struct element_type
{
    using type = typename T::element_type;
};
// Pointee type of T, whether T is a raw pointer or a smart pointer.
template <class T>
using remove_ptr = typename std::
    conditional<std::is_pointer<T>::value, std::remove_pointer<T>, element_type<T>>::type::type;
template <class T>
using shared = std::shared_ptr<remove_ptr<T>>;
} // namespace olCompile
// Declares a manage_ptr over handle type T whose deleter is the function F.
#define OLC_MANAGE_PTR(T, F) \
    olCompile::manage_ptr<typename std::remove_pointer<T>::type, decltype(&F), &F> // NOLINT
#endif

View File

@@ -0,0 +1,12 @@
#ifndef GUARD_OLC_MD5_HPP
#define GUARD_OLC_MD5_HPP
#include <string>
namespace olCompile {
// Returns the MD5 digest of `s` (implementation elsewhere in the project).
std::string md5(std::string s);
} // namespace olCompile
#endif

View File

@@ -0,0 +1,35 @@
#ifndef OLC_GUARD_MLOPEN_OP_KERNEL_ARGS_HPP
#define OLC_GUARD_MLOPEN_OP_KERNEL_ARGS_HPP
#include <algorithm>
#include <cstdint>
#include <cstring>
#include <type_traits>
#include <half.hpp>
#include <boost/container/small_vector.hpp>
// Type-erased kernel launch argument: holds the raw bytes handed to a HIP
// kernel launcher. Small arguments live inline in the small_vector; `is_ptr`
// marks pointer arguments (device buffer addresses).
struct OpKernelArg
{
    // A `sz`-byte argument with every byte set to `val` (e.g. zero padding).
    OpKernelArg(char val, size_t sz) : buffer(sz) { std::fill(buffer.begin(), buffer.end(), val); }

    // Copy a trivially-copyable value into the argument buffer. memcpy is
    // used instead of a reinterpret_cast store: the char buffer carries no
    // alignment guarantee for T, so a typed store would be undefined behavior.
    template <typename T>
    OpKernelArg(T arg) : buffer(sizeof(T))
    {
        static_assert(std::is_trivial<T>{} || std::is_same<T, half_float::half>{},
                      "Only for trivial types");
        std::memcpy(buffer.data(), &arg, sizeof(T));
    }

    // Store a raw pointer value (typically a device buffer address).
    template <typename T>
    OpKernelArg(T* arg) // NOLINT
        : buffer(sizeof(T*))
    {
        std::memcpy(buffer.data(), &arg, sizeof(T*));
        is_ptr = true;
    }

    std::size_t size() const { return buffer.size(); };
    boost::container::small_vector<char, 8> buffer;
    bool is_ptr = false;
};
#endif

View File

@@ -0,0 +1,44 @@
/*******************************************************************************
*
* MIT License
*
* Copyright (c) 2018 Advanced Micro Devices, Inc.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in all
* copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*
*******************************************************************************/
#ifndef GUARD_OLC_SIMPLE_HASH_HPP
#define GUARD_OLC_SIMPLE_HASH_HPP
#include <string>
namespace olCompile {
// Hash functor for the (string, string) cache keys: XOR of the two
// std::hash<std::string> values. XOR is symmetric, so (a, b) and (b, a)
// collide, which is acceptable for these small cache maps.
struct SimpleHash
{
    size_t operator()(const std::pair<std::string, std::string>& p) const
    {
        const auto first_hash  = std::hash<std::string>{}(p.first);
        const auto second_hash = std::hash<std::string>{}(p.second);
        return first_hash ^ second_hash;
    }
};
} // namespace olCompile
#endif

View File

@@ -0,0 +1,133 @@
/*******************************************************************************
*
* MIT License
*
* Copyright (c) 2017 Advanced Micro Devices, Inc.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in all
* copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*
*******************************************************************************/
#ifndef GUARD_OLC_STRINGUTILS_HPP
#define GUARD_OLC_STRINGUTILS_HPP
#include <algorithm>
#include <iterator>
#include <numeric>
#include <string>
#include <vector>
#include <sstream>
#define OLC_STRINGIZE_1(...) #__VA_ARGS__
#define OLC_STRINGIZE(...) OLC_STRINGIZE_1(__VA_ARGS__)
namespace olCompile {
// Replace every occurrence of `search` in `subject` with `replace`.
// Scanning resumes after the inserted text, so replacement text containing
// the search string is not expanded again.
inline std::string
ReplaceString(std::string subject, const std::string& search, const std::string& replace)
{
    for(auto at = subject.find(search); at != std::string::npos;
        at = subject.find(search, at + replace.length()))
    {
        subject.replace(at, search.length(), replace);
    }
    return subject;
}
// True when `value` is at least as long as `suffix` and its last
// suffix.size() characters compare equal to `suffix`.
inline bool EndsWith(const std::string& value, const std::string& suffix)
{
    return value.size() >= suffix.size() &&
           value.compare(value.size() - suffix.size(), suffix.size(), suffix) == 0;
}
// Concatenate the elements of `strings` separated by `delim`; empty input
// yields the empty string.
template <class Strings>
inline std::string JoinStrings(Strings strings, std::string delim)
{
    std::string joined;
    bool first = true;
    for(const auto& s : strings)
    {
        if(!first)
            joined += delim;
        joined += s;
        first = false;
    }
    return joined;
}
// Apply `f` to every character of `s` in place and return the result.
template <class F>
static inline std::string TransformString(std::string s, F f)
{
    for(auto& c : s)
        c = f(c);
    return s;
}

// Upper-case copy of `s` (per-character ::toupper).
inline std::string ToUpper(std::string s) { return TransformString(std::move(s), ::toupper); }
// True when the first prefix.size() characters of `value` equal `prefix`.
// compare() only examines the characters actually available, so a prefix
// longer than `value` compares unequal, matching the original behavior.
inline bool StartsWith(const std::string& value, const std::string& prefix)
{
    return value.compare(0, prefix.size(), prefix) == 0;
}
// Return `s` with a leading `prefix` stripped, or `s` unchanged when the
// prefix does not match.
inline std::string RemovePrefix(std::string s, std::string prefix)
{
    const bool has_prefix =
        s.size() >= prefix.size() && std::equal(prefix.begin(), prefix.end(), s.begin());
    return has_prefix ? s.substr(prefix.length()) : s;
}
// Tokenize `in` on whitespace; consecutive separators are collapsed.
inline std::vector<std::string> SplitSpaceSeparated(const std::string& in)
{
    std::vector<std::string> tokens;
    std::istringstream stream(in);
    std::string token;
    while(stream >> token)
        tokens.push_back(token);
    return tokens;
}
// Tokenize `in` on whitespace, but keep each token listed in
// `dontSplitAfter` glued (with exactly one space) to the token that follows
// it — e.g. "-mllvm <opt>" stays one element. Throws when such a token is
// the last one and has no follower.
inline std::vector<std::string> SplitSpaceSeparated(const std::string& in,
                                                    const std::vector<std::string>& dontSplitAfter)
{
    std::vector<std::string> tokens;
    std::istringstream stream(in);
    std::string token;
    while(stream >> token)
    {
        const bool keep_next =
            std::find(dontSplitAfter.begin(), dontSplitAfter.end(), token) != dontSplitAfter.end();
        if(keep_next)
        {
            std::string next;
            if(!(stream >> next))
                throw std::runtime_error("Error parsing string: '" + in + '\'');
            token += std::string(" ").append(next); // Exactly one space is important.
        }
        tokens.push_back(token);
    }
    return tokens;
}
} // namespace olCompile
#endif // GUARD_OLC_STRINGUTILS_HPP

View File

@@ -0,0 +1,56 @@
/*******************************************************************************
*
* MIT License
*
* Copyright (c) 2020 Advanced Micro Devices, Inc.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in all
* copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*
*******************************************************************************/
#ifndef GUARD_OLC_TARGET_PROPERTIES_HPP
#define GUARD_OLC_TARGET_PROPERTIES_HPP
#include <boost/optional.hpp>
#include <string>
namespace olCompile {
struct Handle;
// Properties of the GPU target code is compiled for: canonical name, a
// derived database id, and optional target features (xnack / sramecc).
struct TargetProperties
{
    // Canonical target name.
    const std::string& Name() const { return name; }
    // Identifier derived from the name (see InitDbId).
    const std::string& DbId() const { return dbId; }
    boost::optional<bool> Xnack() const { return xnack; }
    boost::optional<bool> Sramecc() const { return sramecc; }
    boost::optional<bool> SrameccReported() const { return sramecc_reported; }
    // Populate the fields by querying the device behind `Handle`.
    void Init(const Handle*);

    private:
    void InitDbId();
    std::string name;
    std::string dbId;
    // boost::none means "unknown / not reported".
    boost::optional<bool> xnack            = boost::none;
    boost::optional<bool> sramecc          = boost::none;
    boost::optional<bool> sramecc_reported = boost::none;
};
} // namespace olCompile
#endif // GUARD_OLC_TARGET_PROPERTIES_HPP

View File

@@ -0,0 +1,26 @@
#ifndef GUARD_OLC_TMP_DIR_HPP
#define GUARD_OLC_TMP_DIR_HPP
#include <string>
#include <boost/filesystem/path.hpp>
namespace olCompile {
// Execute a shell command line (implementation elsewhere in the project).
void SystemCmd(std::string cmd);
// Owns the path of a temporary directory created around `prefix`; the
// destructor is expected to clean it up (see tmp_dir.cpp). Non-copyable
// because ownership of the directory must be unique.
struct TmpDir
{
    boost::filesystem::path path;
    TmpDir(std::string prefix);
    TmpDir(TmpDir const&) = delete;
    TmpDir& operator=(TmpDir const&) = delete;
    // Run `exe args` in the context of this directory.
    void Execute(std::string exe, std::string args) const;
    ~TmpDir();
};
} // namespace olCompile
#endif

View File

@@ -0,0 +1,30 @@
#ifndef GUARD_OLC_WRITE_FILE_HPP
#define GUARD_OLC_WRITE_FILE_HPP
#include <boost/filesystem.hpp>
#include <manage_ptr.hpp>
#include <fstream>
namespace olCompile {
using FilePtr = OLC_MANAGE_PTR(FILE*, std::fclose);
// Write `content` to the file `name`, truncating any existing file.
// Throws std::runtime_error when the file cannot be opened or fully written.
inline void WriteFile(const std::string& content, const boost::filesystem::path& name)
{
    FilePtr f{std::fopen(name.string().c_str(), "w")};
    // Previously a failed fopen led to fwrite on a null FILE*; fail loudly.
    if(f == nullptr)
        throw std::runtime_error("Failed to open file: " + name.string());
    if(std::fwrite(content.c_str(), 1, content.size(), f.get()) != content.size())
        throw std::runtime_error("Failed to write to file");
}
// Write raw bytes to the file `name`. Opened in binary mode ("wb") so code
// object blobs are not corrupted by newline translation on platforms that
// perform it. Throws std::runtime_error on open or short-write failure.
inline void WriteFile(const std::vector<char>& content, const boost::filesystem::path& name)
{
    FilePtr f{std::fopen(name.string().c_str(), "wb")};
    // Previously a failed fopen led to fwrite on a null FILE*; fail loudly.
    if(f == nullptr)
        throw std::runtime_error("Failed to open file: " + name.string());
    // data() is valid even for an empty vector, unlike &content[0].
    if(std::fwrite(content.data(), 1, content.size(), f.get()) != content.size())
        throw std::runtime_error("Failed to write to file");
}
} // namespace olCompile
#endif

View File

@@ -0,0 +1,70 @@
/*******************************************************************************
*
* MIT License
*
* Copyright (c) 2021 Advanced Micro Devices, Inc.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in all
* copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*
*******************************************************************************/
#include <algorithm>
#include <map>
#include <stdexcept>
// clang-format off
${KERNELS_DECLS}
// clang-format on
namespace olCompile {
// Map from upper-cased kernel stem name to its embedded source text.
// ${KERNELS_DECLS}/${INIT_KERNELS} are substituted by CMake at build time.
const std::map<std::string, std::string>& kernels()
{
    static const std::map<std::string, std::string> data{${INIT_KERNELS}};
    return data;
}
// Map a kernel pathname to its embedded source: strip any directory part and
// extension, upper-case the stem, and look it up in kernels(). Throws
// std::runtime_error when the key is unknown.
std::string GetKernelSrc(std::string name)
{
    // Use the base name of the string. Unsigned string sizes are used
    // throughout: the previous int casts compared against npos only by
    // accident of sign extension.
    std::size_t start = 0;
    const auto slash  = name.find_last_of("/\\");
    if(slash != std::string::npos)
    {
        start = slash + 1;
    }
    std::size_t len = name.size();
    const auto ex   = name.rfind('.');
    // Only treat the dot as an extension when it is inside the base name.
    if(ex != std::string::npos && ex >= start)
    {
        len = ex - start;
    }
    auto key = name.substr(start, len);
    // Convert to uppercase
    std::transform(key.begin(), key.end(), key.begin(), ::toupper);
    auto it = kernels().find(key);
    if(it == kernels().end())
        throw std::runtime_error("Failed to load kernel source: " + key);
    return it->second;
}
} // namespace olCompile

View File

@@ -0,0 +1,80 @@
/*******************************************************************************
*
* MIT License
*
* Copyright (c) 2019 Advanced Micro Devices, Inc.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in all
* copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*
*******************************************************************************/
#include "olc_kernel_includes.h"
#include <algorithm>
#include <map>
#include <stdexcept>
#include <vector>
namespace olCompile {
// File-local helper: true when `value` ends with `suffix`.
static inline bool EndsWith(const std::string& value, const std::string& suffix)
{
    return value.size() >= suffix.size() &&
           value.compare(value.size() - suffix.size(), suffix.size(), suffix) == 0;
}
// Map from include-file name to its embedded text; ${INIT_KERNELS} is
// substituted by CMake at build time.
const std::map<std::string, std::string>& kernel_includes()
{
    static const std::map<std::string, std::string> data{${INIT_KERNELS}};
    return data;
}
// Return the embedded text of the include file registered under `key`.
// Throws std::runtime_error when the key is unknown.
std::string GetKernelInc(std::string key)
{
    auto it = kernel_includes().find(key);
    if(it == kernel_includes().end())
        // Message fixed: this lookup is for include files, not kernel
        // sources (the old text was copy-pasted from GetKernelSrc).
        throw std::runtime_error("Failed to load kernel include: " + key);
    return it->second;
}
// Collect the keys (include-file names) of the embedded header map.
std::vector<std::string> GetKernelIncList()
{
    // Bind by reference: `auto m = ...` copied the whole map on every call.
    const auto& m = kernel_includes();
    std::vector<std::string> keys;
    keys.reserve(m.size());
    std::transform(m.begin(),
                   m.end(),
                   std::back_inserter(keys),
                   [](const auto& pair) { return pair.first; });
    return keys;
}
std::vector<std::string> GetHipKernelIncList()
{
auto keys = GetKernelIncList();
keys.erase(std::remove_if(keys.begin(),
keys.end(),
[&](const auto& key) {
return !(EndsWith(key, ".hpp") || EndsWith(key, ".h"));
}),
keys.end());
return keys;
}
} // namespace olCompile

View File

@@ -0,0 +1 @@
#include "${KERNEL_SRC_HPP_FILENAME}"

View File

@@ -1,7 +1,18 @@
#!/bin/bash
# Build and run the convolution driver with online kernel compilation (OLC).
export LD_LIBRARY_PATH=/usr/local/lib:$LD_LIBRARY_PATH
# Verbose HIP builds, dump generated kernel sources, and keep the temporary
# build directories for post-run inspection.
export OLC_DEBUG_HIP_VERBOSE=1
export OLC_DEBUG_HIP_DUMP=1
export OLC_DEBUG_SAVE_TEMP_DIR=1
#make -j conv_driver
#make -j conv_driver_v2
make -j conv_driver_v2_olc
# Remove cached binary kernels and leftover online-compile temp dirs so the
# run below compiles from scratch.
rm -rf /root/_hip_binary_kernels_/
rm -rf /tmp/olCompile*
# Positional arguments forwarded to the driver.
LAYOUT=$1
ALGO=$2
@@ -11,7 +22,10 @@ LOG=$5
REPEAT=$6
###################### layout algo verify init log repeat N__ K__ C__ Y X Hi_ Wi__ Strides Dilations LeftPads RightPads
driver/conv_driver_v2 $LAYOUT $ALGO $VERIFY $INIT $LOG $REPEAT 128 128 192 3 3 71 71 2 2 1 1 1 1 1 1
#driver/conv_driver_v2 $LAYOUT $ALGO $VERIFY $INIT $LOG $REPEAT 128 128 192 3 3 71 71 2 2 1 1 1 1 1 1
#driver/conv_driver_v2 $LAYOUT $ALGO $VERIFY $INIT $LOG $REPEAT 128 384 192 3 3 35 35 2 2 1 1 0 0 0 0
#driver/conv_driver_v2 $LAYOUT $ALGO $VERIFY $INIT $LOG $REPEAT 128 128 128 1 7 17 17 1 1 1 1 0 3 0 3
#driver/conv_driver_v2 $LAYOUT $ALGO $VERIFY $INIT $LOG $REPEAT 128 256 256 3 3 14 14 1 1 1 1 1 1 1 1
#./conv_driver_v2 $LAYOUT $ALGO $VERIFY $INIT $LOG $REPEAT 128 128 192 3 3 71 71 2 2 1 1 1 1 1 1
# Run the online-compiling driver on the same problem configuration.
./conv_driver_v2_olc $LAYOUT $ALGO $VERIFY $INIT $LOG $REPEAT 128 128 192 3 3 71 71 2 2 1 1 1 1 1 1