diff --git a/CMakeLists.txt b/CMakeLists.txt index db48a26202..882a1af308 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1,6 +1,11 @@ cmake_minimum_required(VERSION 2.8.3) project(modular_convolution) +list(APPEND CMAKE_MODULE_PATH "${PROJECT_SOURCE_DIR}/cmake") + +include(TargetFlags) +include(AddKernels) + #c++ enable_language(CXX) set(CMAKE_CXX_STANDARD 17) @@ -8,15 +13,6 @@ set(CMAKE_CXX_STANDARD_REQUIRED ON) set(CMAKE_CXX_EXTENSIONS OFF) message("CMAKE_CXX_COMPILER_ID: ${CMAKE_CXX_COMPILER_ID}") -#boost -find_package(Boost REQUIRED) - -message("Boost_INCLUDE_DIRS: ${Boost_INCLUDE_DIRS}") -message("Boost_LIBRARY_DIRS: ${Boost_LIBRARY_DIRS}") - -include_directories(BEFORE ${Boost_INCLUDE_DIRS}) -link_directories(${Boost_LIBRARY_DIRS}) - #OpenMP if(CMAKE_CXX_COMPILER_ID MATCHES "Clang") # workaround issue hipcc in rocm3.5 cannot find openmp @@ -78,3 +74,31 @@ elseif(DEVICE_BACKEND STREQUAL "NVIDIA") endif() add_subdirectory(driver) + +set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS}") + +message("Compiling options for drivers: ${CMAKE_CXX_FLAGS}") + +if(DEVICE_BACKEND STREQUAL "AMD") + set(CONV_SOURCE driver/conv_driver.cpp) + set(CONV_V2_SOURCE driver/conv_driver_v2.cpp) + set(CONV_V2_OLC_SOURCE driver/conv_driver_v2_olc.cpp) + set(CONV_BWD_DATA_SOURCE driver/conv_bwd_data_driver.cpp) +elseif(DEVICE_BACKEND STREQUAL "NVIDIA") + set(CONV_SOURCE driver/conv_driver.cu) + set(CONV_BWD_DATA_SOURCE driver/conv_bwd_data_driver.cu) +endif() + +##add_executable(conv_driver ${CONV_SOURCE}) +add_executable(conv_driver_v2 ${CONV_V2_SOURCE}) +add_executable(conv_driver_v2_olc ${CONV_V2_OLC_SOURCE}) +##add_executable(conv_bwd_data_driver ${CONV_BWD_DATA_SOURCE}) + +target_include_directories(conv_driver_v2_olc PRIVATE driver/olCompiling/include/) + +##target_link_libraries(conv_driver PRIVATE modConv) +target_link_libraries(conv_driver_v2 PRIVATE modConv) +target_link_libraries(conv_driver_v2_olc PRIVATE modConv) +##target_link_libraries(conv_bwd_data_driver PRIVATE modConv) + + 
diff --git a/cmake/AddKernels.cmake b/cmake/AddKernels.cmake new file mode 100644 index 0000000000..429ecc47a9 --- /dev/null +++ b/cmake/AddKernels.cmake @@ -0,0 +1,40 @@ + +function(add_kernels SRC_DIR KERNEL_FILES) + set(INIT_KERNELS_LIST) + set(KERNELS_DECLS) + foreach(KERNEL_FILE ${KERNEL_FILES}) + if("${CMAKE_VERSION}" VERSION_LESS 3.0) + configure_file(${KERNEL_FILE} ${KERNEL_FILE}.delete) + else() + set_property(DIRECTORY APPEND PROPERTY CMAKE_CONFIGURE_DEPENDS ${KERNEL_FILE}) + endif() + get_filename_component(BASE_NAME ${KERNEL_FILE} NAME_WE) + string(TOUPPER "${BASE_NAME}" KEY_NAME) + string(MAKE_C_IDENTIFIER "${KEY_NAME}" VAR_NAME) + string(APPEND KERNELS_DECLS "extern const size_t APP_KERNEL_${VAR_NAME}_SIZE;\n") + string(APPEND KERNELS_DECLS "extern const unsigned char APP_KERNEL_${VAR_NAME}[];\n") + list(APPEND INIT_KERNELS_LIST " { \"${KEY_NAME}\", std::string(reinterpret_cast(APP_KERNEL_${VAR_NAME}), APP_KERNEL_${VAR_NAME}_SIZE) }") + endforeach() + string(REPLACE ";" ",\n" INIT_KERNELS "${INIT_KERNELS_LIST}") + configure_file(${SRC_DIR}/kernel.cpp.in ${PROJECT_BINARY_DIR}/kernel.cpp) +endfunction() + +function(add_kernel_includes SRC_DIR KERNEL_FILES) + set(INIT_KERNELS_LIST) + foreach(KERNEL_FILE ${KERNEL_FILES}) + if("${CMAKE_VERSION}" VERSION_LESS 3.0) + configure_file(${KERNEL_FILE} ${KERNEL_FILE}.delete) + else() + set_property(DIRECTORY APPEND PROPERTY CMAKE_CONFIGURE_DEPENDS ${KERNEL_FILE}) + endif() + get_filename_component(BASE_NAME ${KERNEL_FILE} NAME_WE) + get_filename_component(FILE_NAME ${KERNEL_FILE} NAME) + string(TOUPPER "${BASE_NAME}" KEY_NAME) + string(MAKE_C_IDENTIFIER "${KEY_NAME}" VAR_NAME) + list(APPEND INIT_KERNELS_LIST " { \"${FILE_NAME}\", std::string(reinterpret_cast(${VAR_NAME}), ${VAR_NAME}_SIZE) }") + endforeach() + string(REPLACE ";" ",\n" INIT_KERNELS "${INIT_KERNELS_LIST}") + configure_file(${SRC_DIR}/kernel_includes.cpp.in ${PROJECT_BINARY_DIR}/kernel_includes.cpp) +endfunction() + + diff --git 
a/cmake/TargetFlags.cmake b/cmake/TargetFlags.cmake new file mode 100644 index 0000000000..4f83fb5d39 --- /dev/null +++ b/cmake/TargetFlags.cmake @@ -0,0 +1,50 @@ + +function(get_target_property2 VAR TARGET PROPERTY) + get_target_property(_pflags ${TARGET} ${PROPERTY}) + if(_pflags) + set(${VAR} ${_pflags} PARENT_SCOPE) + else() + set(${VAR} "" PARENT_SCOPE) + endif() +endfunction() + + +macro(append_flags FLAGS TARGET PROPERTY PREFIX) + get_target_property2(_pflags ${TARGET} ${PROPERTY}) + foreach(FLAG ${_pflags}) + if(TARGET ${FLAG}) + target_flags(_pflags2 ${FLAG}) + string(APPEND ${FLAGS} " ${_pflags2}") + else() + string(APPEND ${FLAGS} " ${PREFIX}${FLAG}") + endif() + endforeach() +endmacro() + +macro(append_link_flags FLAGS TARGET PROPERTY) + get_target_property2(_pflags ${TARGET} ${PROPERTY}) + foreach(FLAG ${_pflags}) + if(TARGET ${FLAG}) + target_flags(_pflags2 ${FLAG}) + string(APPEND ${FLAGS} " ${_pflags2}") + elseif(FLAG MATCHES "^-.*") + string(APPEND ${FLAGS} " ${FLAG}") + elseif(EXISTS ${FLAG}) + string(APPEND ${FLAGS} " ${FLAG}") + else() + string(APPEND ${FLAGS} " -l${FLAG}") + endif() + endforeach() +endmacro() + +function(target_flags FLAGS TARGET) + set(_flags) + append_flags(_flags ${TARGET} "INTERFACE_COMPILE_OPTIONS" "") + append_flags(_flags ${TARGET} "INTERFACE_COMPILE_DEFINITIONS" "-D") + append_flags(_flags ${TARGET} "INTERFACE_INCLUDE_DIRECTORIES" "-isystem ") + append_flags(_flags ${TARGET} "INTERFACE_LINK_DIRECTORIES" "-L ") + append_flags(_flags ${TARGET} "INTERFACE_LINK_OPTIONS" "") + append_link_flags(_flags ${TARGET} "INTERFACE_LINK_LIBRARIES" "") + # message("_flags: ${_flags}") + set(${FLAGS} ${_flags} PARENT_SCOPE) +endfunction() diff --git a/composable_kernel/include/utility/config.amd.hpp.in b/composable_kernel/include/utility/config.amd.hpp.in index cadc821338..609ae2b212 100644 --- a/composable_kernel/include/utility/config.amd.hpp.in +++ b/composable_kernel/include/utility/config.amd.hpp.in @@ -112,8 +112,8 @@ #endif // 
pass tensor descriptor by value or void* -#define CK_EXPERIMENTAL_PASS_TENSOR_DESCRIPTOR_BY_VALUE 1 -#define CK_EXPERIMENTAL_PASS_TENSOR_DESCRIPTOR_BY_VOID_POINTER 0 +#define CK_EXPERIMENTAL_PASS_TENSOR_DESCRIPTOR_BY_VALUE 0 +#define CK_EXPERIMENTAL_PASS_TENSOR_DESCRIPTOR_BY_VOID_POINTER 1 // merge transformation use magic number division #define CK_EXPERIMENTAL_MERGE_USE_MAGIC_DIVISION 0 diff --git a/composable_kernel/include/utility/sequence_helper.hpp b/composable_kernel/include/utility/sequence_helper.hpp index ccedfc3e6f..88d7da63e8 100644 --- a/composable_kernel/include/utility/sequence_helper.hpp +++ b/composable_kernel/include/utility/sequence_helper.hpp @@ -1,7 +1,7 @@ #ifndef CK_SEQUENCE_HELPER_HPP #define CK_SEQUENCE_HELPER_HPP -#include "sequence_helper.hpp" +#include "tuple.hpp" namespace ck { diff --git a/composable_kernel/include/utility/type_helper.hpp b/composable_kernel/include/utility/type_helper.hpp new file mode 100644 index 0000000000..987f07e3f4 --- /dev/null +++ b/composable_kernel/include/utility/type_helper.hpp @@ -0,0 +1,34 @@ +#ifndef CK_TYPE_HELPER_HPP +#define CK_TYPE_HELPER_HPP + +#include "float_type.hpp" + +namespace ck { + +template +struct get_type_from_type_id +{ + using type = float; +}; + +template <> +struct get_type_from_type_id<'H'> +{ + using type = half_t; +}; + +template <> +struct get_type_from_type_id<'F'> +{ + using type = float; +}; + +template <> +struct get_type_from_type_id<'D'> +{ + using type = double; +}; + +} // namespace ck + +#endif diff --git a/composable_kernel/src/kernel_wrapper/dynamic_convolution_forward_implicit_gemm_v4r4_nchw_kcyx_nkhw.cpp b/composable_kernel/src/kernel_wrapper/dynamic_convolution_forward_implicit_gemm_v4r4_nchw_kcyx_nkhw.cpp new file mode 100644 index 0000000000..19501b3fcd --- /dev/null +++ b/composable_kernel/src/kernel_wrapper/dynamic_convolution_forward_implicit_gemm_v4r4_nchw_kcyx_nkhw.cpp @@ -0,0 +1,370 @@ +#include "common_header.hpp" +#include "type_helper.hpp" +#include 
"dynamic_tensor_descriptor.hpp" +#include "dynamic_tensor_descriptor_helper.hpp" +#include "gridwise_dynamic_gemm_v1r2.hpp" +#include "transform_forward_convolution_into_gemm_v4r4_nchw_kcyx_nkhw.hpp" + +using namespace ck; + +using FloatAB = typename get_type_from_type_id(CK_PARAM_IN_WEI_DATATYPE)>::type; +using FloatC = typename get_type_from_type_id(CK_PARAM_OUT_DATATYPE)>::type; +using FloatAcc = typename get_type_from_type_id(CK_PARAM_CONV_COMPTYPE)>::type; + +constexpr index_t BlockSize = CK_PARAM_BlockSize; + +constexpr index_t MPerBlock = CK_PARAM_MPerBlock; +constexpr index_t NPerBlock = CK_PARAM_NPerBlock; +constexpr index_t KPerBlock = CK_PARAM_KPerBlock; +constexpr index_t M1PerThread = CK_PARAM_M1PerThread; +constexpr index_t N1PerThread = CK_PARAM_N1PerThread; +constexpr index_t KPerThread = CK_PARAM_KPerThread; +constexpr index_t M1N1ThreadClusterM10 = CK_PARAM_M1N1ThreadClusterM10; +constexpr index_t M1N1ThreadClusterN10 = CK_PARAM_M1N1ThreadClusterN10; +constexpr index_t M1N1ThreadClusterM11 = CK_PARAM_M1N1ThreadClusterM11; +constexpr index_t M1N1ThreadClusterN11 = CK_PARAM_M1N1ThreadClusterN11; + +using ABlockTransferThreadSliceLengths_K_M0_M1 = + Sequence; +using ABlockTransferThreadClusterLengths_K_M0_M1 = + Sequence; +using ABlockTransferThreadClusterArrangeOrder = + Sequence; +using ABlockTransferSrcAccessOrder = Sequence; + +constexpr index_t ABlockTransferSrcVectorDim = CK_PARAM_ABlockTransferSrcVectorDim; +constexpr index_t ABlockTransferSrcScalarPerVector = CK_PARAM_ABlockTransferSrcScalarPerVector; +constexpr index_t ABlockTransferDstScalarPerVector_M1 = + CK_PARAM_ABlockTransferDstScalarPerVector_M1; +constexpr bool AThreadTransferSrcResetCoordinateAfterRun = + static_cast(CK_PARAM_AThreadTransferSrcResetCoordinateAfterRun); + +using BBlockTransferThreadSliceLengths_K_N0_N1 = + Sequence; +using BBlockTransferThreadClusterLengths_K_N0_N1 = + Sequence; +using BBlockTransferThreadClusterArrangeOrder = + Sequence; +using 
BBlockTransferSrcAccessOrder = Sequence; + +constexpr index_t BBlockTransferSrcVectorDim = CK_PARAM_BBlockTransferSrcVectorDim; +constexpr index_t BBlockTransferSrcScalarPerVector = CK_PARAM_BBlockTransferSrcScalarPerVector; +constexpr index_t BBlockTransferDstScalarPerVector_N1 = + CK_PARAM_BBlockTransferDstScalarPerVector_N1; +constexpr bool BThreadTransferSrcResetCoordinateAfterRun = + static_cast(CK_PARAM_BThreadTransferSrcResetCoordinateAfterRun); + +using CThreadTransferSrcDstAccessOrder = Sequence; +constexpr index_t CThreadTransferSrcDstVectorDim = CK_PARAM_CThreadTransferSrcDstVectorDim; +constexpr index_t CThreadTransferDstScalarPerVector = CK_PARAM_CThreadTransferDstScalarPerVector; + +constexpr bool HasMainKBlockLoop = static_cast(CK_PARAM_HAS_MAIN_KBLOCK_LOOP); +constexpr bool HasDoubleTailKBlockLoop = static_cast(CK_PARAM_HAS_DOUBLE_TAIL_KBLOCK_LOOP); + +extern "C" __global__ void dynamic_convolution_forward_implicit_gemm_v4r4_nchw_kcyx_nkhw_prepare( + int n, + int c, + int hi, + int wi, + int k, + int y, + int x, + int convStrideH, + int convStrideW, + int convDilationY, + int convDilationX, + int leftPadH, + int leftPadW, + int rightPadH, + int rightPadW, + void* p_a_k_m0_m1_grid_desc, + void* p_b_k_n0_n1_grid_desc, + void* p_c_m0_m10_m11_n0_n10_n11_grid_desc, + void* p_c_blockid_to_m0_n0_block_cluster_adaptor) +{ + constexpr auto I0 = Number<0>{}; + constexpr auto I1 = Number<1>{}; + constexpr auto I2 = Number<2>{}; + + const index_t ho = (hi + leftPadH + rightPadH - convDilationY * (y - 1) - 1) / convStrideH + 1; + const index_t wo = (wi + leftPadW + rightPadW - convDilationX * (x - 1) - 1) / convStrideW + 1; + + const auto in_n_c_hi_wi_desc = + make_dynamic_naive_tensor_descriptor_packed_v2(make_tuple(n, c, hi, wi)); + const auto wei_k_c_y_x_desc = + make_dynamic_naive_tensor_descriptor_packed_v2(make_tuple(k, c, y, x)); + const auto out_n_k_ho_wo_desc = + make_dynamic_naive_tensor_descriptor_packed_v2(make_tuple(n, k, ho, wo)); + + const auto 
descs = transform_forward_convolution_into_gemm_v4r4_nchw_kcyx_nkhw_pad( + wei_k_c_y_x_desc, + in_n_c_hi_wi_desc, + out_n_k_ho_wo_desc, + make_tuple(convStrideH, convStrideW), + make_tuple(convDilationY, convDilationX), + make_tuple(leftPadH, leftPadW), + make_tuple(rightPadH, rightPadW)); + + const auto a_k_m_grid_desc = descs[I0]; + const auto b_k_n_grid_desc = descs[I1]; + const auto c_m_n_grid_desc = descs[I2]; + + using AKMGridDesc = decltype(a_k_m_grid_desc); + using BKNGridDesc = decltype(b_k_n_grid_desc); + using CMNGridDesc = decltype(c_m_n_grid_desc); + + using AGridIteratorHacks = decltype(make_tuple(make_tuple(Sequence<0, 0, 0, 0, 0>{}, + Sequence<0, 0, 0, 0, 0>{}, + Sequence<0, 0, 0, 0, 0>{}, + Sequence<0, 0, 0, 0, 0>{}), + make_tuple(Sequence<0, 0, 0, 0, 0>{}, + Sequence<0, 0, 0, 0, 0>{}, + Sequence<0, 0, 0, 0, 0>{}, + Sequence<0, 0, 0, 0, 0>{}))); + + using BGridIteratorHacks = + decltype(make_tuple(make_tuple(Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0>{}, + Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0>{}, + Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0>{}), + make_tuple(Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0>{}, + Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0>{}, + Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0>{}))); + + using CGridIteratorHacks = decltype(make_tuple(make_tuple(Sequence<0, 0, 0, 0, 0>{}, + Sequence<0, 0, 0, 0, 0>{}, + Sequence<0, 0, 0, 0, 0>{}, + Sequence<0, 0, 1, 0, 0>{}, + Sequence<0, 0, 1, 0, 0>{}, + Sequence<0, 0, 1, 0, 0>{}), + make_tuple(Sequence<0, 0, 0, 0, 0>{}, + Sequence<0, 0, 0, 0, 0>{}, + Sequence<0, 0, 0, 0, 0>{}, + Sequence<0, 0, 2, 0, 0>{}, + Sequence<0, 0, 2, 0, 0>{}, + Sequence<0, 0, 2, 0, 0>{}))); + + using AGridMoveSliceWindowIteratorHacks = Sequence<0, 0, 0, 0, 0>; + using BGridMoveSliceWindowIteratorHacks = Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 0, 0>; + + using GridwiseGemm = + GridwiseDynamicGemm_km_kn_mn_v1r2; + + auto a_k_m0_m1_grid_desc = 
GridwiseGemm::MakeAKM0M1GridDescriptor(a_k_m_grid_desc); + auto b_k_n0_n1_grid_desc = GridwiseGemm::MakeBKN0N1GridDescriptor(b_k_n_grid_desc); + auto c_m0_m10_m11_n0_n10_n11_grid_desc = + GridwiseGemm::MakeCM0M10M11N0N10N11GridDescriptor(c_m_n_grid_desc); + auto c_blockid_to_m0_n0_block_cluster_adaptor = + GridwiseGemm::MakeCBlockIdToM0N0BlockClusterAdaptor(c_m_n_grid_desc); + + if(hipThreadIdx_x == 0) + { + *static_cast(p_a_k_m0_m1_grid_desc) = a_k_m0_m1_grid_desc; + *static_cast(p_b_k_n0_n1_grid_desc) = b_k_n0_n1_grid_desc; + *static_cast( + p_c_m0_m10_m11_n0_n10_n11_grid_desc) = c_m0_m10_m11_n0_n10_n11_grid_desc; + *static_cast( + p_c_blockid_to_m0_n0_block_cluster_adaptor) = c_blockid_to_m0_n0_block_cluster_adaptor; + }; +}; + +extern "C" __global__ void +#if CK_USE_LAUNCH_BOUNDS + __launch_bounds__(CK_MAX_THREAD_PER_BLOCK, CK_MIN_BLOCK_PER_CU) +#endif + dynamic_convolution_forward_implicit_gemm_v4r4_nchw_kcyx_nkhw( + const FloatAB* __restrict__ p_a_grid, + const FloatAB* __restrict__ p_b_grid, + FloatC* __restrict__ p_c_grid, + const void __CONSTANT__* p_a_k_m0_m1_grid_desc, + const void __CONSTANT__* p_b_k_n0_n1_grid_desc, + const void __CONSTANT__* p_c_m0_m10_m11_n0_n10_n11_grid_desc, + const void __CONSTANT__* p_c_blockid_to_m0_n0_block_cluster_adaptor) +{ + constexpr auto I0 = Number<0>{}; + constexpr auto I1 = Number<1>{}; + constexpr auto I2 = Number<2>{}; + + constexpr auto in_n_c_hi_wi_desc = + make_dynamic_naive_tensor_descriptor_packed_v2(make_tuple(64, 4, 35, 35)); + constexpr auto wei_k_c_y_x_desc = + make_dynamic_naive_tensor_descriptor_packed_v2(make_tuple(8, 4, 3, 3)); + constexpr auto out_n_k_ho_wo_desc = + make_dynamic_naive_tensor_descriptor_packed_v2(make_tuple(64, 8, 18, 18)); + + constexpr auto descs = + transform_forward_convolution_into_gemm_v4r4_nchw_kcyx_nkhw_pad(wei_k_c_y_x_desc, + in_n_c_hi_wi_desc, + out_n_k_ho_wo_desc, + make_tuple(2, 2), + make_tuple(1, 1), + make_tuple(1, 1), + make_tuple(1, 1)); + + constexpr auto 
a_k_m_grid_desc = descs[I0]; + constexpr auto b_k_n_grid_desc = descs[I1]; + constexpr auto c_m_n_grid_desc = descs[I2]; + + using AKMGridDesc = decltype(a_k_m_grid_desc); + using BKNGridDesc = decltype(b_k_n_grid_desc); + using CMNGridDesc = decltype(c_m_n_grid_desc); + + using AGridIteratorHacks = decltype(make_tuple(make_tuple(Sequence<0, 0, 0, 0, 0>{}, + Sequence<0, 0, 0, 0, 0>{}, + Sequence<0, 0, 0, 0, 0>{}, + Sequence<0, 0, 0, 0, 0>{}), + make_tuple(Sequence<0, 0, 0, 0, 0>{}, + Sequence<0, 0, 0, 0, 0>{}, + Sequence<0, 0, 0, 0, 0>{}, + Sequence<0, 0, 0, 0, 0>{}))); + + using BGridIteratorHacks = + decltype(make_tuple(make_tuple(Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0>{}, + Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0>{}, + Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0>{}), + make_tuple(Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0>{}, + Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0>{}, + Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0>{}))); + + using CGridIteratorHacks = decltype(make_tuple(make_tuple(Sequence<0, 0, 0, 0, 0>{}, + Sequence<0, 0, 0, 0, 0>{}, + Sequence<0, 0, 0, 0, 0>{}, + Sequence<0, 0, 1, 0, 0>{}, + Sequence<0, 0, 1, 0, 0>{}, + Sequence<0, 0, 1, 0, 0>{}), + make_tuple(Sequence<0, 0, 0, 0, 0>{}, + Sequence<0, 0, 0, 0, 0>{}, + Sequence<0, 0, 0, 0, 0>{}, + Sequence<0, 0, 2, 0, 0>{}, + Sequence<0, 0, 2, 0, 0>{}, + Sequence<0, 0, 2, 0, 0>{}))); + + using AGridMoveSliceWindowIteratorHacks = Sequence<0, 0, 0, 0, 0>; + using BGridMoveSliceWindowIteratorHacks = Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 0, 0>; + + using GridwiseGemm = + GridwiseDynamicGemm_km_kn_mn_v1r2; + + constexpr auto a_k_m0_m1_grid_desc_tmp = + GridwiseGemm::MakeAKM0M1GridDescriptor(a_k_m_grid_desc); + constexpr auto b_k_n0_n1_grid_desc_tmp = + GridwiseGemm::MakeBKN0N1GridDescriptor(b_k_n_grid_desc); + constexpr auto c_m0_m10_m11_n0_n10_n11_grid_desc_tmp = + GridwiseGemm::MakeCM0M10M11N0N10N11GridDescriptor(c_m_n_grid_desc); + constexpr auto 
c_blockid_to_m0_n0_block_cluster_adaptor_tmp = + GridwiseGemm::MakeCBlockIdToM0N0BlockClusterAdaptor(c_m_n_grid_desc); + + using AKM0M1GridDesc = decltype(a_k_m0_m1_grid_desc_tmp); + using BKN0N1GridDesc = decltype(b_k_n0_n1_grid_desc_tmp); + using CM0M10M11N0N10N11GridDesc = decltype(c_m0_m10_m11_n0_n10_n11_grid_desc_tmp); + using CBlockIdToM0N0BlockClusterAdaptor = + decltype(c_blockid_to_m0_n0_block_cluster_adaptor_tmp); + + const auto a_k_m0_m1_grid_desc = + *reinterpret_cast((const void*)p_a_k_m0_m1_grid_desc); + const auto b_k_n0_n1_grid_desc = + *reinterpret_cast((const void*)p_b_k_n0_n1_grid_desc); + const auto c_m0_m10_m11_n0_n10_n11_grid_desc = + *reinterpret_cast( + (const void*)p_c_m0_m10_m11_n0_n10_n11_grid_desc); + const auto c_blockid_to_m0_n0_block_cluster_adaptor = + *reinterpret_cast( + (const void*)p_c_blockid_to_m0_n0_block_cluster_adaptor); + + constexpr index_t shared_block_size = + GridwiseGemm::GetSharedMemoryNumberOfByte() / sizeof(FloatAB); + + __shared__ FloatAB p_shared_block[shared_block_size]; + + GridwiseGemm::Run(p_a_grid, + p_b_grid, + p_c_grid, + p_shared_block, + a_k_m0_m1_grid_desc, + b_k_n0_n1_grid_desc, + c_m0_m10_m11_n0_n10_n11_grid_desc, + c_blockid_to_m0_n0_block_cluster_adaptor, + integral_constant{}, + integral_constant{}); +}; diff --git a/composable_kernel/src/kernel_wrapper/dynamic_convolution_forward_implicit_gemm_v4r5_nchw_kcyx_nkhw.cpp b/composable_kernel/src/kernel_wrapper/dynamic_convolution_forward_implicit_gemm_v4r5_nchw_kcyx_nkhw.cpp new file mode 100644 index 0000000000..503da231e9 --- /dev/null +++ b/composable_kernel/src/kernel_wrapper/dynamic_convolution_forward_implicit_gemm_v4r5_nchw_kcyx_nkhw.cpp @@ -0,0 +1,379 @@ +#include "common_header.hpp" +#include "type_helper.hpp" +#include "dynamic_tensor_descriptor.hpp" +#include "dynamic_tensor_descriptor_helper.hpp" +#include "gridwise_dynamic_contraction_v1r1.hpp" +#include "transform_forward_convolution_into_gemm_v4r5_nchw_kcyx_nkhw.hpp" + +using namespace 
ck; + +using FloatAB = typename get_type_from_type_id(CK_PARAM_IN_WEI_DATATYPE)>::type; +using FloatC = typename get_type_from_type_id(CK_PARAM_OUT_DATATYPE)>::type; +using FloatAcc = typename get_type_from_type_id(CK_PARAM_CONV_COMPTYPE)>::type; + +constexpr index_t BlockSize = CK_PARAM_BlockSize; +constexpr index_t N0 = CK_PARAM_N0; + +constexpr index_t GM1PerBlockGM11 = CK_PARAM_GM1PerBlockGM11; +constexpr index_t GN1PerBlockGN11 = CK_PARAM_GN1PerBlockGN11; +constexpr index_t KPerBlock = CK_PARAM_KPerBlock; +constexpr index_t M1PerThread = CK_PARAM_M1PerThread; +constexpr index_t N1PerThread = CK_PARAM_N1PerThread; +constexpr index_t KPerThread = CK_PARAM_KPerThread; +constexpr index_t M1N1ThreadClusterM10 = CK_PARAM_M1N1ThreadClusterM10; +constexpr index_t M1N1ThreadClusterN10 = CK_PARAM_M1N1ThreadClusterN10; +constexpr index_t M1N1ThreadClusterM11 = CK_PARAM_M1N1ThreadClusterM11; +constexpr index_t M1N1ThreadClusterN11 = CK_PARAM_M1N1ThreadClusterN11; + +using ABlockTransferThreadSliceLengths_GK_GM0_GM10_GM11 = + Sequence; +using ABlockTransferThreadClusterLengths_GK_GM0_GM10_GM11 = + Sequence; +using ABlockTransferThreadClusterArrangeOrder = + Sequence; +using ABlockTransferSrcAccessOrder = Sequence; + +constexpr index_t ABlockTransferSrcVectorDim = CK_PARAM_ABlockTransferSrcVectorDim; +constexpr index_t ABlockTransferSrcScalarPerVector = CK_PARAM_ABlockTransferSrcScalarPerVector; +constexpr index_t ABlockTransferDstScalarPerVector_GM11 = + CK_PARAM_ABlockTransferDstScalarPerVector_GM11; +constexpr bool AThreadTransferSrcResetCoordinateAfterRun = + static_cast(CK_PARAM_AThreadTransferSrcResetCoordinateAfterRun); + +using BBlockTransferThreadSliceLengths_GK_GN0_GN10_GN11 = + Sequence; +using BBlockTransferThreadClusterLengths_GK_GN0_GN10_GN11 = + Sequence; +using BBlockTransferThreadClusterArrangeOrder = + Sequence; +using BBlockTransferSrcAccessOrder = Sequence; + +constexpr index_t BBlockTransferSrcVectorDim = CK_PARAM_BBlockTransferSrcVectorDim; +constexpr 
index_t BBlockTransferSrcScalarPerVector = CK_PARAM_BBlockTransferSrcScalarPerVector; +constexpr index_t BBlockTransferDstScalarPerVector_GN11 = + CK_PARAM_BBlockTransferDstScalarPerVector_GN11; +constexpr bool BThreadTransferSrcResetCoordinateAfterRun = + static_cast(CK_PARAM_BThreadTransferSrcResetCoordinateAfterRun); + +using CThreadTransferSrcDstAccessOrder = Sequence; +constexpr index_t CThreadTransferSrcDstVectorDim = CK_PARAM_CThreadTransferSrcDstVectorDim; +constexpr index_t CThreadTransferDstScalarPerVector = CK_PARAM_CThreadTransferDstScalarPerVector; + +constexpr bool HasMainKBlockLoop = static_cast(CK_PARAM_HAS_MAIN_KBLOCK_LOOP); +constexpr bool HasDoubleTailKBlockLoop = static_cast(CK_PARAM_HAS_DOUBLE_TAIL_KBLOCK_LOOP); + +extern "C" __global__ void dynamic_convolution_forward_implicit_gemm_v4r5_nchw_kcyx_nkhw_prepare( + int n, + int c, + int hi, + int wi, + int k, + int y, + int x, + int convStrideH, + int convStrideW, + int convDilationY, + int convDilationX, + int leftPadH, + int leftPadW, + int rightPadH, + int rightPadW, + void* p_a_gk_gm0_gm10_gm11_grid_desc, + void* p_b_gk_gn0_gn10_gn11_grid_desc, + void* p_c_gm10_bm0_bm1_gn10_bn0_bn1_grid_desc, + void* p_c_blockid_to_gm10_gn10_block_cluster_adaptor) +{ + constexpr auto I0 = Number<0>{}; + constexpr auto I1 = Number<1>{}; + constexpr auto I2 = Number<2>{}; + + const index_t ho = (hi + leftPadH + rightPadH - convDilationY * (y - 1) - 1) / convStrideH + 1; + const index_t wo = (wi + leftPadW + rightPadW - convDilationX * (x - 1) - 1) / convStrideW + 1; + + const auto in_n_c_hi_wi_desc = + make_dynamic_naive_tensor_descriptor_packed_v2(make_tuple(n, c, hi, wi)); + const auto wei_k_c_y_x_desc = + make_dynamic_naive_tensor_descriptor_packed_v2(make_tuple(k, c, y, x)); + const auto out_n_k_ho_wo_desc = + make_dynamic_naive_tensor_descriptor_packed_v2(make_tuple(n, k, ho, wo)); + + const auto descs = transform_forward_convolution_into_contraction_v4r5_nchw_kcyx_nkhw_pad( + wei_k_c_y_x_desc, + 
in_n_c_hi_wi_desc, + out_n_k_ho_wo_desc, + make_tuple(convStrideH, convStrideW), + make_tuple(convDilationY, convDilationX), + make_tuple(leftPadH, leftPadW), + make_tuple(rightPadH, rightPadW)); + + const auto a_gk_gm0_gm1_grid_desc = descs[I0]; + const auto b_gk_gn0_gn1_grid_desc = descs[I1]; + const auto c_gm0_gm1_gn0_gn1_grid_desc = descs[I2]; + + using AGKGM0GM1GridDesc = decltype(a_gk_gm0_gm1_grid_desc); + using BGKGN0GN1GridDesc = decltype(b_gk_gn0_gn1_grid_desc); + using CGM0GM1GN0GN1GridDesc = decltype(c_gm0_gm1_gn0_gn1_grid_desc); + + using AGridIteratorHacks = decltype(make_tuple(make_tuple(Sequence<0, 0, 0, 0, 0, 0>{}, + Sequence<0, 0, 0, 0, 0, 0>{}, + Sequence<0, 0, 0, 0, 0, 0>{}, + Sequence<0, 0, 0, 0, 0, 0>{}), + make_tuple(Sequence<0, 0, 0, 0, 0, 0>{}, + Sequence<0, 0, 0, 0, 0, 0>{}, + Sequence<0, 0, 0, 0, 0, 0>{}, + Sequence<0, 0, 0, 0, 0, 0>{}))); + + using BGridIteratorHacks = + decltype(make_tuple(make_tuple(Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0>{}, + Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{}, + Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0>{}, + Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0>{}), + make_tuple(Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0>{}, + Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{}, + Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0>{}, + Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0>{}))); + + using CGridIteratorHacks = decltype(make_tuple( + make_tuple(Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{}, + Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0>{}, + Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0>{}, + Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{}, + Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0>{}, + Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0>{}), + make_tuple(Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
0, 0, 0, 0, 0, 0>{}, + Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0>{}, + Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0>{}, + Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{}, + Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0>{}, + Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0>{}))); + + using AGridMoveSliceWindowIteratorHacks = Sequence<0, 0, 0, 0, 0, 0>; + + using BGridMoveSliceWindowIteratorHacks = Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 2, 0, 0, 0>; + + using GridwiseContraction = GridwiseDynamicContraction_km0m1_kn0n1_m0m1n0n1_v1r1< + BlockSize, + FloatAB, + FloatAcc, + FloatC, + InMemoryDataOperation::Set, /* ToDo tunable */ + AGKGM0GM1GridDesc, + BGKGN0GN1GridDesc, + CGM0GM1GN0GN1GridDesc, + GM1PerBlockGM11, + GN1PerBlockGN11, + KPerBlock, + M1PerThread, + N1PerThread, + KPerThread, + M1N1ThreadClusterM10, + M1N1ThreadClusterN10, + M1N1ThreadClusterM11, + M1N1ThreadClusterN11, + ABlockTransferThreadSliceLengths_GK_GM0_GM10_GM11, + ABlockTransferThreadClusterLengths_GK_GM0_GM10_GM11, + ABlockTransferThreadClusterArrangeOrder, + ABlockTransferSrcAccessOrder, + ABlockTransferSrcVectorDim, + ABlockTransferSrcScalarPerVector, + ABlockTransferDstScalarPerVector_GM11, + AThreadTransferSrcResetCoordinateAfterRun, + BBlockTransferThreadSliceLengths_GK_GN0_GN10_GN11, + BBlockTransferThreadClusterLengths_GK_GN0_GN10_GN11, + BBlockTransferThreadClusterArrangeOrder, + BBlockTransferSrcAccessOrder, + BBlockTransferSrcVectorDim, + BBlockTransferSrcScalarPerVector, + BBlockTransferDstScalarPerVector_GN11, + BThreadTransferSrcResetCoordinateAfterRun, + CThreadTransferSrcDstAccessOrder, + CThreadTransferSrcDstVectorDim, + CThreadTransferDstScalarPerVector, + AGridIteratorHacks, + BGridIteratorHacks, + CGridIteratorHacks, + AGridMoveSliceWindowIteratorHacks, + BGridMoveSliceWindowIteratorHacks>; + + auto a_gk_gm0_gm10_gm11_grid_desc = + 
GridwiseContraction::MakeAGKGM0GM10GM11GridDescriptor(a_gk_gm0_gm1_grid_desc); + auto b_gk_gn0_gn10_gn11_grid_desc = + GridwiseContraction::MakeBGKGN0GN10GN11GridDescriptor(b_gk_gn0_gn1_grid_desc); + auto c_gm10_bm0_bm1_gn10_bn0_bn1_grid_desc = + GridwiseContraction::MakeCGM10BM0BM1GN10BN0BN1GridDescriptor(c_gm0_gm1_gn0_gn1_grid_desc); + auto c_blockid_to_gm10_gn10_block_cluster_adaptor = + GridwiseContraction::MakeCBlockIdToGM10GN10BlockClusterAdaptor(c_gm0_gm1_gn0_gn1_grid_desc); + + if(hipThreadIdx_x == 0) + { + *static_cast(p_a_gk_gm0_gm10_gm11_grid_desc) = + a_gk_gm0_gm10_gm11_grid_desc; + *static_cast(p_b_gk_gn0_gn10_gn11_grid_desc) = + b_gk_gn0_gn10_gn11_grid_desc; + *static_cast( + p_c_gm10_bm0_bm1_gn10_bn0_bn1_grid_desc) = c_gm10_bm0_bm1_gn10_bn0_bn1_grid_desc; + *static_cast( + p_c_blockid_to_gm10_gn10_block_cluster_adaptor) = + c_blockid_to_gm10_gn10_block_cluster_adaptor; + }; +}; + +extern "C" __global__ void +#if CK_USE_LAUNCH_BOUNDS + __launch_bounds__(CK_MAX_THREAD_PER_BLOCK, CK_MIN_BLOCK_PER_CU) +#endif + dynamic_convolution_forward_implicit_gemm_v4r5_nchw_kcyx_nkhw( + const FloatAB* __restrict__ p_a_grid, + const FloatAB* __restrict__ p_b_grid, + FloatC* __restrict__ p_c_grid, + const void __CONSTANT__* p_a_gk_gm0_gm10_gm11_grid_desc, + const void __CONSTANT__* p_b_gk_gn0_gn10_gn11_grid_desc, + const void __CONSTANT__* p_c_gm10_bm0_bm1_gn10_bn0_bn1_grid_desc, + const void __CONSTANT__* p_c_blockid_to_gm10_gn10_block_cluster_adaptor) +{ + constexpr auto I0 = Number<0>{}; + constexpr auto I1 = Number<1>{}; + constexpr auto I2 = Number<2>{}; + + constexpr auto in_n_c_hi_wi_desc = + make_dynamic_naive_tensor_descriptor_packed_v2(make_tuple(64, 4, 35, 35)); + constexpr auto wei_k_c_y_x_desc = + make_dynamic_naive_tensor_descriptor_packed_v2(make_tuple(8, 4, 3, 3)); + constexpr auto out_n_k_ho_wo_desc = + make_dynamic_naive_tensor_descriptor_packed_v2(make_tuple(64, 8, 18, 18)); + + constexpr auto descs = + 
transform_forward_convolution_into_contraction_v4r5_nchw_kcyx_nkhw_pad( + wei_k_c_y_x_desc, + in_n_c_hi_wi_desc, + out_n_k_ho_wo_desc, + make_tuple(2, 2), + make_tuple(1, 1), + make_tuple(1, 1), + make_tuple(1, 1)); + + constexpr auto a_gk_gm0_gm1_grid_desc = descs[I0]; + constexpr auto b_gk_gn0_gn1_grid_desc = descs[I1]; + constexpr auto c_gm0_gm1_gn0_gn1_grid_desc = descs[I2]; + + using AGKGM0GM1GridDesc = decltype(a_gk_gm0_gm1_grid_desc); + using BGKGN0GN1GridDesc = decltype(b_gk_gn0_gn1_grid_desc); + using CGM0GM1GN0GN1GridDesc = decltype(c_gm0_gm1_gn0_gn1_grid_desc); + + using AGridIteratorHacks = decltype(make_tuple(make_tuple(Sequence<0, 0, 0, 0, 0, 0>{}, + Sequence<0, 0, 0, 0, 0, 0>{}, + Sequence<0, 0, 0, 0, 0, 0>{}, + Sequence<0, 0, 0, 0, 0, 0>{}), + make_tuple(Sequence<0, 0, 0, 0, 0, 0>{}, + Sequence<0, 0, 0, 0, 0, 0>{}, + Sequence<0, 0, 0, 0, 0, 0>{}, + Sequence<0, 0, 0, 0, 0, 0>{}))); + + using BGridIteratorHacks = + decltype(make_tuple(make_tuple(Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0>{}, + Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{}, + Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0>{}, + Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0>{}), + make_tuple(Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0>{}, + Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{}, + Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0>{}, + Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0>{}))); + + using CGridIteratorHacks = decltype(make_tuple( + make_tuple(Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{}, + Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0>{}, + Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0>{}, + Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{}, + Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0>{}, + Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0>{}), + make_tuple(Sequence<0, 0, 0, 
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{}, + Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0>{}, + Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0>{}, + Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{}, + Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0>{}, + Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0>{}))); + + using AGridMoveSliceWindowIteratorHacks = Sequence<0, 0, 0, 0, 0, 0>; + using BGridMoveSliceWindowIteratorHacks = Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 2, 0, 0, 0>; + + using GridwiseContraction = GridwiseDynamicContraction_km0m1_kn0n1_m0m1n0n1_v1r1< + BlockSize, + FloatAB, + FloatAcc, + FloatC, + InMemoryDataOperation::Set, /* ToDo tunable */ + AGKGM0GM1GridDesc, + BGKGN0GN1GridDesc, + CGM0GM1GN0GN1GridDesc, + GM1PerBlockGM11, + GN1PerBlockGN11, + KPerBlock, + M1PerThread, + N1PerThread, + KPerThread, + M1N1ThreadClusterM10, + M1N1ThreadClusterN10, + M1N1ThreadClusterM11, + M1N1ThreadClusterN11, + ABlockTransferThreadSliceLengths_GK_GM0_GM10_GM11, + ABlockTransferThreadClusterLengths_GK_GM0_GM10_GM11, + ABlockTransferThreadClusterArrangeOrder, + ABlockTransferSrcAccessOrder, + ABlockTransferSrcVectorDim, + ABlockTransferSrcScalarPerVector, + ABlockTransferDstScalarPerVector_GM11, + AThreadTransferSrcResetCoordinateAfterRun, + BBlockTransferThreadSliceLengths_GK_GN0_GN10_GN11, + BBlockTransferThreadClusterLengths_GK_GN0_GN10_GN11, + BBlockTransferThreadClusterArrangeOrder, + BBlockTransferSrcAccessOrder, + BBlockTransferSrcVectorDim, + BBlockTransferSrcScalarPerVector, + BBlockTransferDstScalarPerVector_GN11, + BThreadTransferSrcResetCoordinateAfterRun, + CThreadTransferSrcDstAccessOrder, + CThreadTransferSrcDstVectorDim, + CThreadTransferDstScalarPerVector, + AGridIteratorHacks, + BGridIteratorHacks, + CGridIteratorHacks, + AGridMoveSliceWindowIteratorHacks, + BGridMoveSliceWindowIteratorHacks>; + + using AGKGM0GM10GM11GridDesc = + 
decltype(GridwiseContraction::MakeAGKGM0GM10GM11GridDescriptor(a_gk_gm0_gm1_grid_desc)); + using BGKGN0GN10GN11GridDesc = + decltype(GridwiseContraction::MakeBGKGN0GN10GN11GridDescriptor(b_gk_gn0_gn1_grid_desc)); + using CGM10BM0BM1GN10BN0BN1GridDesc = decltype( + GridwiseContraction::MakeCGM10BM0BM1GN10BN0BN1GridDescriptor(c_gm0_gm1_gn0_gn1_grid_desc)); + using CBlockIdToGM10GN10BlockClusterAdaptor = + decltype(GridwiseContraction::MakeCBlockIdToGM10GN10BlockClusterAdaptor( + c_gm0_gm1_gn0_gn1_grid_desc)); + + const auto a_gk_gm0_gm10_gm11_grid_desc = *reinterpret_cast( + (const void*)p_a_gk_gm0_gm10_gm11_grid_desc); + const auto b_gk_gn0_gn10_gn11_grid_desc = *reinterpret_cast( + (const void*)p_b_gk_gn0_gn10_gn11_grid_desc); + const auto c_gm10_bm0_bm1_gn10_bn0_bn1_grid_desc = + *reinterpret_cast( + (const void*)p_c_gm10_bm0_bm1_gn10_bn0_bn1_grid_desc); + const auto c_blockid_to_gm10_gn10_block_cluster_adaptor = + *reinterpret_cast( + (const void*)p_c_blockid_to_gm10_gn10_block_cluster_adaptor); + + constexpr index_t shared_block_size = + GridwiseContraction::GetSharedMemoryNumberOfByte() / sizeof(FloatAB); + + __shared__ FloatAB p_shared_block[shared_block_size]; + + GridwiseContraction::Run(p_a_grid, + p_b_grid, + p_c_grid, + p_shared_block, + a_gk_gm0_gm10_gm11_grid_desc, + b_gk_gn0_gn10_gn11_grid_desc, + c_gm10_bm0_bm1_gn10_bn0_bn1_grid_desc, + c_blockid_to_gm10_gn10_block_cluster_adaptor, + integral_constant{}, + integral_constant{}); +}; diff --git a/composable_kernel/src/kernel_wrapper/gridwise_convolution_forward_implicit_gemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer.cpp b/composable_kernel/src/kernel_wrapper/gridwise_convolution_forward_implicit_gemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer.cpp new file mode 100644 index 0000000000..ecd3af822f --- /dev/null +++ b/composable_kernel/src/kernel_wrapper/gridwise_convolution_forward_implicit_gemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer.cpp @@ -0,0 +1,8 @@ + +extern "C" __global__ void 
+gridwise_convolution_forward_implicit_gemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer( + const void* const __restrict__ p_in_global, + const void* const __restrict__ p_wei_global, + void* const __restrict__ p_out_global){ + +}; diff --git a/composable_kernel/src/kernel_wrapper/gridwise_convolution_forward_implicit_gemm_v4r4_nchw_kcyx_nkhw.cpp b/composable_kernel/src/kernel_wrapper/gridwise_convolution_forward_implicit_gemm_v4r4_nchw_kcyx_nkhw.cpp new file mode 100644 index 0000000000..820a0515ee --- /dev/null +++ b/composable_kernel/src/kernel_wrapper/gridwise_convolution_forward_implicit_gemm_v4r4_nchw_kcyx_nkhw.cpp @@ -0,0 +1,7 @@ + +extern "C" __global__ void gridwise_convolution_forward_implicit_gemm_v4r4_nchw_kcyx_nkhw( + const void* const __restrict__ p_in_global, + const void* const __restrict__ p_wei_global, + void* const __restrict__ p_out_global){ + +}; diff --git a/composable_kernel/src/kernel_wrapper/gridwise_convolution_forward_implicit_gemm_v4r4_nhwc_kyxc_nhwk.cpp b/composable_kernel/src/kernel_wrapper/gridwise_convolution_forward_implicit_gemm_v4r4_nhwc_kyxc_nhwk.cpp new file mode 100644 index 0000000000..4f646adbb7 --- /dev/null +++ b/composable_kernel/src/kernel_wrapper/gridwise_convolution_forward_implicit_gemm_v4r4_nhwc_kyxc_nhwk.cpp @@ -0,0 +1,8 @@ + + +extern "C" __global__ void gridwise_convolution_forward_implicit_gemm_v4r4_nhwc_kyxc_nhwk( + const void* const __restrict__ p_in_global, + const void* const __restrict__ p_wei_global, + void* const __restrict__ p_out_global){ + +}; diff --git a/driver/CMakeLists.txt b/driver/CMakeLists.txt index 6b91ab986d..9800559fe9 100644 --- a/driver/CMakeLists.txt +++ b/driver/CMakeLists.txt @@ -1,32 +1,201 @@ -set(TENSOR_SOURCE + +set(CMAKE_CXX_COMPILER /opt/rocm/llvm/bin/clang++) + +## for online-compiling of HIP kernels +set(OLC_HIP_COMPILER ${CMAKE_CXX_COMPILER} CACHE PATH "") + +## reset to avoid the C++ options from the parent project +set(CMAKE_CXX_FLAGS "") +message("Compiling options for library and 
kernels: ${CMAKE_CXX_FLAGS}") + +# look for and register clang-offload-bundler +if(OLC_HIP_COMPILER MATCHES ".*clang\\+\\+$") + find_program(OLC_OFFLOADBUNDLER_BIN clang-offload-bundler + PATH_SUFFIXES bin + PATHS + /opt/rocm/llvm + ${CMAKE_INSTALL_PREFIX}/llvm + ) +endif() +if(OLC_OFFLOADBUNDLER_BIN) + message(STATUS "clang-offload-bundler found: ${OLC_OFFLOADBUNDLER_BIN}") + set(OLC_OFFLOADBUNDLER_BIN "${OLC_OFFLOADBUNDLER_BIN}") +else() + # look for and register extractkernel + message(STATUS "clang-offload-bundler not found") + + find_program(EXTRACTKERNEL_BIN extractkernel + PATH_SUFFIXES bin + PATHS + /opt/rocm/hip + /opt/rocm/hcc + /opt/rocm + ${CMAKE_INSTALL_PREFIX}/hip + ${CMAKE_INSTALL_PREFIX}/hcc + ${CMAKE_INSTALL_PREFIX} + + ) + if(EXTRACTKERNEL_BIN) + message(STATUS "extractkernel found: ${EXTRACTKERNEL_BIN}") + set(EXTRACTKERNEL_BIN "${EXTRACTKERNEL_BIN}") + else() + message(FATAL_ERROR "extractkernel not found") + endif() +endif() + +option(Boost_USE_STATIC_LIBS "Use boost static libraries" OFF) +set(BOOST_COMPONENTS filesystem) +add_definitions(-DBOOST_ALL_NO_LIB=1) +find_package(Boost REQUIRED COMPONENTS ${BOOST_COMPONENTS}) + +# HIP is always required +find_package(hip REQUIRED PATHS /opt/rocm) +message(STATUS "Build with HIP ${hip_VERSION}") +target_flags(HIP_COMPILER_FLAGS hip::device) +# Remove cuda arch flags +string(REGEX REPLACE --cuda-gpu-arch=[a-z0-9]+ "" HIP_COMPILER_FLAGS "${HIP_COMPILER_FLAGS}") +string(REGEX REPLACE --offload-arch=[a-z0-9]+ "" HIP_COMPILER_FLAGS "${HIP_COMPILER_FLAGS}") + +set(OLC_hip_VERSION_MAJOR "${hip_VERSION_MAJOR}") +set(OLC_hip_VERSION_MINOR "${hip_VERSION_MINOR}") +set(OLC_hip_VERSION_PATCH "${hip_VERSION_PATCH}") + +option(ENABLE_DEBUG "Build to enable debugging" ON) +if(ENABLE_DEBUG) + set(OLC_DEBUG 1) +else() + set(OLC_DEBUG 0) +endif() + +configure_file("${CMAKE_CURRENT_SOURCE_DIR}/olCompiling/include/config.h.in" "${CMAKE_CURRENT_SOURCE_DIR}/olCompiling/include/config.h") + +message(STATUS "Hip compiler 
flags: ${HIP_COMPILER_FLAGS}") + +## HIP_COMPILER_FLAGS will be used for on-line compiling of the HIP kernels +add_definitions("-DHIP_COMPILER_FLAGS=${HIP_COMPILER_FLAGS}") + +file(GLOB COMPOSABLE_KERNEL_INCLUDE_1 "${PROJECT_SOURCE_DIR}/composable_kernel/include/kernel_algorithm/*.hpp") +file(GLOB COMPOSABLE_KERNEL_INCLUDE_2 "${PROJECT_SOURCE_DIR}/composable_kernel/include/tensor_description/*.hpp") +file(GLOB COMPOSABLE_KERNEL_INCLUDE_3 "${PROJECT_SOURCE_DIR}/composable_kernel/include/tensor_operation/*.hpp") +file(GLOB COMPOSABLE_KERNEL_INCLUDE_4 "${PROJECT_SOURCE_DIR}/composable_kernel/include/utility/*.hpp") +file(GLOB COMPOSABLE_KERNEL_INCLUDE_5 "${PROJECT_BINARY_DIR}/composable_kernel/include/utility/*.hpp") +file(GLOB COMPOSABLE_KERNEL_INCLUDE_6 "${PROJECT_SOURCE_DIR}/external/rocm/include/bfloat16_dev.hpp") +set(MCONV_KERNEL_INCLUDES + ${COMPOSABLE_KERNEL_INCLUDE_1} + ${COMPOSABLE_KERNEL_INCLUDE_2} + ${COMPOSABLE_KERNEL_INCLUDE_3} + ${COMPOSABLE_KERNEL_INCLUDE_4} + ${COMPOSABLE_KERNEL_INCLUDE_5} + ${COMPOSABLE_KERNEL_INCLUDE_6} + ) + +set(MCONV_KERNELS + ../composable_kernel/src/kernel_wrapper/dynamic_convolution_forward_implicit_gemm_v4r4_nchw_kcyx_nkhw.cpp + ../composable_kernel/src/kernel_wrapper/dynamic_convolution_forward_implicit_gemm_v4r5_nchw_kcyx_nkhw.cpp + ) + +add_kernels("olCompiling/" "${MCONV_KERNELS}") +add_kernel_includes("olCompiling/" "${MCONV_KERNEL_INCLUDES}") + +set(MCONV_SOURCES src/host_tensor.cpp; src/device.cpp; ) -add_library(host SHARED ${TENSOR_SOURCE}) -target_compile_features(host PUBLIC) -set_target_properties(host PROPERTIES POSITION_INDEPENDENT_CODE ON) +set(OLC_HIP_UTILITY_HEADERS + olCompiling/include/config.h + olCompiling/include/logger.hpp + olCompiling/include/stringutils.hpp + olCompiling/include/tmp_dir.hpp + olCompiling/include/write_file.hpp + olCompiling/include/env.hpp + olCompiling/include/manage_ptr.hpp + olCompiling/include/md5.hpp + olCompiling/include/simple_hash.hpp + olCompiling/include/exec_utils.hpp + 
olCompiling/include/hipCheck.hpp + olCompiling/include/target_properties.hpp + olCompiling/include/handle.hpp + olCompiling/include/op_kernel_args.hpp + olCompiling/include/kernel.hpp + olCompiling/include/kernel_build_params.hpp + olCompiling/include/hip_build_utils.hpp + olCompiling/include/hipoc_program.hpp + olCompiling/include/hipoc_program_impl.hpp + olCompiling/include/hipoc_kernel.hpp + olCompiling/include/kernel_cache.hpp + olCompiling/include/binary_cache.hpp + ) -if(DEVICE_BACKEND STREQUAL "NVIDIA") - target_link_libraries(host nvToolsExt cudart) -endif() +set(OLC_HIP_UTILITY_CPPS + olCompiling/hip_utility/logger.cpp + olCompiling/hip_utility/tmp_dir.cpp + olCompiling/hip_utility/md5.cpp + olCompiling/hip_utility/exec_utils.cpp + olCompiling/hip_utility/target_properties.cpp + olCompiling/hip_utility/handlehip.cpp + olCompiling/hip_utility/kernel_build_params.cpp + olCompiling/hip_utility/hip_build_utils.cpp + olCompiling/hip_utility/hipoc_program.cpp + olCompiling/hip_utility/hipoc_kernel.cpp + olCompiling/hip_utility/kernel_cache.cpp + olCompiling/hip_utility/binary_cache.cpp + ) -install(TARGETS host LIBRARY DESTINATION lib) +list(APPEND OLC_SOURCES ${OLC_HIP_UTILITY_CPPS} ${OLC_HIP_UTILITY_HEADERS}) +list(INSERT MCONV_SOURCES 0 + ${PROJECT_BINARY_DIR}/kernel.cpp + ${PROJECT_BINARY_DIR}/kernel_includes.cpp + ) -if(DEVICE_BACKEND STREQUAL "AMD") - set(CONV_SOURCE src/conv_driver.cpp) - set(CONV_V2_SOURCE src/conv_driver_v2.cpp) - set(CONV_BWD_DATA_SOURCE src/conv_bwd_data_driver.cpp) -elseif(DEVICE_BACKEND STREQUAL "NVIDIA") - set(CONV_SOURCE src/conv_driver.cu) - set(CONV_BWD_DATA_SOURCE src/conv_bwd_data_driver.cu) -endif() +## addkernels provide the tool to create inlined kernels in one header +add_subdirectory(olCompiling/addkernels) -add_executable(conv_driver ${CONV_SOURCE}) -add_executable(conv_driver_v2 ${CONV_V2_SOURCE}) -add_executable(conv_bwd_data_driver ${CONV_BWD_DATA_SOURCE}) +function(inline_kernels_src KERNELS KERNEL_INCLUDES) + 
set(KERNEL_SRC_HPP_FILENAME batch_all.cpp.hpp) + set(KERNEL_SRC_HPP_PATH ${PROJECT_BINARY_DIR}/inlined_kernels/${KERNEL_SRC_HPP_FILENAME}) + set(KERNEL_SRC_CPP_PATH ${PROJECT_BINARY_DIR}/inlined_kernels/batch_all.cpp) -target_link_libraries(conv_driver PRIVATE host) -target_link_libraries(conv_driver_v2 PRIVATE host) -target_link_libraries(conv_bwd_data_driver PRIVATE host) + add_custom_command( + OUTPUT ${KERNEL_SRC_HPP_PATH} + WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} + DEPENDS addkernels ${KERNELS} ${KERNEL_INCLUDES} + COMMAND $ -target ${KERNEL_SRC_HPP_PATH} -extern -source ${KERNELS} + COMMENT "Inlining All kernels" + ) + configure_file(olCompiling/kernels_batch.cpp.in ${KERNEL_SRC_CPP_PATH}) + list(APPEND OLC_SOURCES ${KERNEL_SRC_CPP_PATH} ${KERNEL_SRC_HPP_PATH}) + + set(OLC_SOURCES ${OLC_SOURCES} PARENT_SCOPE) +endfunction() + +inline_kernels_src("${MCONV_KERNELS}" "${MCONV_KERNEL_INCLUDES}") + +list(APPEND MCONV_SOURCES ${OLC_SOURCES} ${PROJECT_BINARY_DIR}/olc_kernel_includes.h) + +add_custom_command( + OUTPUT ${PROJECT_BINARY_DIR}/olc_kernel_includes.h + WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} + DEPENDS addkernels ${MCONV_KERNEL_INCLUDES} + COMMAND $ -no-recurse -guard GUARD_OLC_KERNEL_INCLUDES_HPP_ -target ${PROJECT_BINARY_DIR}/olc_kernel_includes.h -source ${MCONV_KERNEL_INCLUDES} + COMMENT "Inlining HIP kernel includes" + ) + +## the library target +add_library(modConv SHARED ${MCONV_SOURCES}) + +target_include_directories(modConv PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/olCompiling/include/) +target_include_directories(modConv PRIVATE ${PROJECT_BINARY_DIR}) +target_include_directories(modConv PRIVATE ${PROJECT_SOURCE_DIR}/external/half/include/) + +target_link_libraries(modConv PRIVATE hip::device) +target_link_libraries(modConv INTERFACE hip::host) +target_link_libraries(modConv PRIVATE Boost::filesystem) + +target_compile_options(modConv PRIVATE -mfma) + +target_compile_features(modConv PUBLIC) +set_target_properties(modConv PROPERTIES 
POSITION_INDEPENDENT_CODE ON) + +install(TARGETS modConv LIBRARY DESTINATION lib) diff --git a/driver/src/conv_bwd_data_driver.cpp b/driver/conv_bwd_data_driver.cpp similarity index 100% rename from driver/src/conv_bwd_data_driver.cpp rename to driver/conv_bwd_data_driver.cpp diff --git a/driver/src/conv_driver.cpp b/driver/conv_driver.cpp similarity index 100% rename from driver/src/conv_driver.cpp rename to driver/conv_driver.cpp diff --git a/driver/src/conv_driver_v2.cpp b/driver/conv_driver_v2.cpp similarity index 99% rename from driver/src/conv_driver_v2.cpp rename to driver/conv_driver_v2.cpp index 1fb8fc7873..693448ac25 100644 --- a/driver/src/conv_driver_v2.cpp +++ b/driver/conv_driver_v2.cpp @@ -18,9 +18,9 @@ #include "device_dynamic_convolution_forward_implicit_gemm_v5r1_nchw_kcyx_nkhw.hpp" #define USE_DYNAMIC_MODE 1 -#define USE_CONV_FWD_V4R4_NCHW 1 +#define USE_CONV_FWD_V4R4_NCHW 0 #define USE_CONV_FWD_V4R4_NHWC 0 -#define USE_CONV_FWD_V4R5_NCHW 0 +#define USE_CONV_FWD_V4R5_NCHW 1 #define USE_CONV_FWD_V5R1_NCHW 0 enum ConvForwardAlgo @@ -279,8 +279,6 @@ int main(int argc, char* argv[]) in_right_pads_dev); }; - const auto nhwc_desc = f_make_for_device_nhwc(); - #if USE_CONV_FWD_V4R4_NCHW if(algo == ConvForwardAlgo::V4R4NCHW) { diff --git a/driver/conv_driver_v2_olc.cpp b/driver/conv_driver_v2_olc.cpp new file mode 100644 index 0000000000..0b1c91c81b --- /dev/null +++ b/driver/conv_driver_v2_olc.cpp @@ -0,0 +1,288 @@ +#include +#include +#include +#include +#include +#include +#include "config.hpp" +#include "print.hpp" +#include "device.hpp" +#include "host_tensor.hpp" +#include "host_tensor_generator.hpp" +#include "conv_common.hpp" +#include "host_conv.hpp" +#include "device_tensor.hpp" +#include "olc_device_dynamic_convolution_forward_implicit_gemm_v4r4_nchw_kcyx_nkhw.hpp" +#include "olc_device_dynamic_convolution_forward_implicit_gemm_v4r5_nchw_kcyx_nkhw.hpp" + +#define USE_CONV_FWD_V4R4_NCHW 1 +#define USE_CONV_FWD_V4R5_NCHW 1 + +#include 
"conv_tunables.hpp" +#include "handle.hpp" +#include "hipCheck.hpp" + +enum ConvForwardAlgo +{ + V4R4NCHW, + V4R4NHWC, + V4R5NCHW, + V5R1NCHW +}; + +int main(int argc, char* argv[]) +{ + using namespace ck; + using size_t = std::size_t; + + hipStream_t stream; + olCompile::Handle* handle; + + MY_HIP_CHECK(hipStreamCreate(&stream)); + + handle = new olCompile::Handle(stream); + + constexpr auto I0 = Number<0>{}; + constexpr auto I1 = Number<1>{}; + constexpr auto I2 = Number<2>{}; + constexpr auto I3 = Number<3>{}; + constexpr auto I4 = Number<4>{}; + constexpr auto I5 = Number<5>{}; + constexpr auto I6 = Number<6>{}; + + if(argc != 22) + { + printf("arg1 to 5: layout, algo, do_verification, init_method, do_log, nrepeat\n"); + printf("rest: N, K, C, Y, X, Hi, Wi, Sy, Sx, Dy, Dx, LeftPy, LeftPx, RightPy, RightPx\n"); + exit(1); + } + + const ConvTensorLayout layout = static_cast(atoi(argv[1])); + const ConvForwardAlgo algo = static_cast(atoi(argv[2])); + const bool do_verification = atoi(argv[3]); + const int init_method = atoi(argv[4]); + const bool do_log = atoi(argv[5]); + const int nrepeat = atoi(argv[6]); + + const index_t N = atoi(argv[7]); + const index_t K = atoi(argv[8]); + const index_t C = atoi(argv[9]); + const index_t Y = atoi(argv[10]); + const index_t X = atoi(argv[11]); + const index_t Hi = atoi(argv[12]); + const index_t Wi = atoi(argv[13]); + + const index_t conv_stride_h = atoi(argv[14]); + const index_t conv_stride_w = atoi(argv[15]); + const index_t conv_dilation_h = atoi(argv[16]); + const index_t conv_dilation_w = atoi(argv[17]); + const index_t in_left_pad_h = atoi(argv[18]); + const index_t in_left_pad_w = atoi(argv[19]); + const index_t in_right_pad_h = atoi(argv[20]); + const index_t in_right_pad_w = atoi(argv[21]); + + const index_t YEff = (Y - 1) * conv_dilation_h + 1; + const index_t XEff = (X - 1) * conv_dilation_w + 1; + + const index_t Ho = (Hi + in_left_pad_h + in_right_pad_h - YEff) / conv_stride_h + 1; + const index_t Wo = (Wi + 
in_left_pad_w + in_right_pad_w - XEff) / conv_stride_w + 1; + +#if 1 + constexpr index_t in_vector_size = 1; + using in_data_t = float; + using acc_data_t = float; + using out_data_t = float; +#elif 1 + constexpr index_t in_vector_size = 16; + using in_data_t = int8_t; + using acc_data_t = int32_t; + using out_data_t = int8_t; +#endif + + std::vector in_lengths_host(4), wei_lengths_host(4), out_lengths_host(4); + + switch(layout) + { + case ConvTensorLayout::NCHW: + // NCHW + in_lengths_host[0] = static_cast(N); + in_lengths_host[1] = static_cast(C); + in_lengths_host[2] = static_cast(Hi); + in_lengths_host[3] = static_cast(Wi); + wei_lengths_host[0] = static_cast(K); + wei_lengths_host[1] = static_cast(C); + wei_lengths_host[2] = static_cast(Y); + wei_lengths_host[3] = static_cast(X); + out_lengths_host[0] = static_cast(N); + out_lengths_host[1] = static_cast(K); + out_lengths_host[2] = static_cast(Ho); + out_lengths_host[3] = static_cast(Wo); + break; + case ConvTensorLayout::NHWC: + // NHWC + in_lengths_host[0] = static_cast(N); + in_lengths_host[1] = static_cast(Hi); + in_lengths_host[2] = static_cast(Wi); + in_lengths_host[3] = static_cast(C); + wei_lengths_host[0] = static_cast(K); + wei_lengths_host[1] = static_cast(Y); + wei_lengths_host[2] = static_cast(X); + wei_lengths_host[3] = static_cast(C); + out_lengths_host[0] = static_cast(N); + out_lengths_host[1] = static_cast(Ho); + out_lengths_host[2] = static_cast(Wo); + out_lengths_host[3] = static_cast(K); + break; + default: throw std::runtime_error("wrong! 
not implemented"); + } + + Tensor in(in_lengths_host); + Tensor wei(wei_lengths_host); + Tensor out_host(out_lengths_host); + Tensor out_device(out_lengths_host); + + std::cout << "layout: " << layout << std::endl; + ostream_HostTensorDescriptor(in.mDesc, std::cout << "in: "); + ostream_HostTensorDescriptor(wei.mDesc, std::cout << "wei: "); + ostream_HostTensorDescriptor(out_host.mDesc, std::cout << "out: "); + print_array("InLeftPads", make_tuple(in_left_pad_h, in_left_pad_w)); + print_array("InRightPads", make_tuple(in_right_pad_h, in_right_pad_w)); + print_array("ConvStrides", make_tuple(conv_stride_h, conv_stride_w)); + print_array("ConvDilations", make_tuple(conv_dilation_h, conv_dilation_w)); + + std::size_t num_thread = std::thread::hardware_concurrency(); + + if(do_verification) + { + switch(init_method) + { + case 0: + in.GenerateTensorValue(GeneratorTensor_1{}, num_thread); + wei.GenerateTensorValue(GeneratorTensor_1{}, num_thread); + break; + case 1: + in.GenerateTensorValue(GeneratorTensor_1{}, num_thread); + wei.GenerateTensorValue(GeneratorTensor_2{-5, 5}, num_thread); + break; + case 2: + in.GenerateTensorValue(GeneratorTensor_2{-5, 5}, num_thread); + wei.GenerateTensorValue(GeneratorTensor_1{}, num_thread); + break; + case 3: + in.GenerateTensorValue(GeneratorTensor_2{-5, 5}, num_thread); + wei.GenerateTensorValue(GeneratorTensor_2{-5, 5}, num_thread); + break; + default: + in.GenerateTensorValue(GeneratorTensor_2{1, 5}, num_thread); + + auto gen_wei = [](auto... is) { + return GeneratorTensor_2{1, 5}(is...) 
* GeneratorTensor_Checkboard{}(is...); + }; + wei.GenerateTensorValue(gen_wei, num_thread); + } + } + + auto f_make_for_device_nchw = [&]() { + const auto in_lengths_dev = make_tuple(N, C, Hi, Wi); + const auto wei_lengths_dev = make_tuple(K, C, Y, X); + const auto out_lengths_dev = make_tuple(N, K, Ho, Wo); + + return make_tuple(in_lengths_dev, wei_lengths_dev, out_lengths_dev); + }; + + auto f_make_for_device_nhwc = [&]() { + const auto in_lengths_dev = make_tuple(N, Hi, Wi, C); + const auto wei_lengths_dev = make_tuple(K, Y, X, C); + const auto out_lengths_dev = make_tuple(N, Ho, Wo, K); + + return make_tuple(in_lengths_dev, wei_lengths_dev, out_lengths_dev); + }; + + const auto conv_strides = make_tuple(conv_stride_h, conv_stride_w); + const auto conv_dilations = make_tuple(conv_dilation_h, conv_dilation_w); + const auto in_left_pads = make_tuple(in_left_pad_h, in_left_pad_w); + const auto in_right_pads = make_tuple(in_right_pad_h, in_right_pad_w); + +#if USE_CONV_FWD_V4R4_NCHW + if(algo == ConvForwardAlgo::V4R4NCHW) + { + if(layout != ConvTensorLayout::NCHW) + { + throw std::runtime_error("wrong! layout"); + } + + const auto tmp = f_make_for_device_nchw(); + + tunable_dyn_conv_fwd_v4r4_nchw_kcyx_nkhw* tunable = + &default_tunable_dyn_conv_fwd_v4r4_nchw_kcyx_nkhw; + + device_dynamic_convolution_forward_implicit_gemm_v4r4_nchw_kcyx_nkhw_olc( + handle, + tmp[I0], + tmp[I1], + tmp[I2], + conv_strides, + conv_dilations, + in_left_pads, + in_right_pads, + in, + wei, + out_device, + tunable, + nrepeat); + } +#endif + +#if USE_CONV_FWD_V4R5_NCHW + if(algo == ConvForwardAlgo::V4R5NCHW) + { + if(layout != ConvTensorLayout::NCHW) + { + throw std::runtime_error("wrong! 
layout"); + } + + const auto tmp = f_make_for_device_nchw(); + + tunable_dyn_conv_fwd_v4r5_nchw_kcyx_nkhw* tunable = + &default_tunable_dyn_conv_fwd_v4r5_nchw_kcyx_nkhw; + + device_dynamic_convolution_forward_implicit_gemm_v4r5_nchw_kcyx_nkhw_olc( + handle, + tmp[I0], + tmp[I1], + tmp[I2], + conv_strides, + conv_dilations, + in_left_pads, + in_right_pads, + in, + wei, + out_device, + tunable, + nrepeat); + } +#endif + + if(do_verification) + { + host_direct_convolution( + in, wei, out_host, conv_strides, conv_dilations, in_left_pads, in_right_pads); + + check_error(out_host, out_device); + + if(do_log) + { + LogRange(std::cout << "in : ", in.mData, ",") << std::endl; + LogRange(std::cout << "wei: ", wei.mData, ",") << std::endl; + LogRange(std::cout << "out_host : ", out_host.mData, ",") << std::endl; + LogRange(std::cout << "out_device: ", out_device.mData, ",") << std::endl; + } + } + + delete handle; + MY_HIP_CHECK(hipStreamDestroy(stream)); +} diff --git a/driver/include/conv_tunables.hpp b/driver/include/conv_tunables.hpp new file mode 100644 index 0000000000..33f791d289 --- /dev/null +++ b/driver/include/conv_tunables.hpp @@ -0,0 +1,131 @@ +#ifndef CONV_TUNABLES_HPP +#define CONV_TUNABLES_HPP + +#include "config.hpp" + +struct tunable_dyn_conv_fwd_v4r4_nchw_kcyx_nkhw +{ + ck::index_t BlockSize; // usually not tunable + + ck::index_t MPerBlock; + ck::index_t NPerBlock; + ck::index_t KPerBlock; + + ck::index_t M1PerThread; + ck::index_t N1PerThread; + ck::index_t KPerThread; + + ck::index_t M1N1ThreadClusterM10; + ck::index_t M1N1ThreadClusterN10; + ck::index_t M1N1ThreadClusterM11; + ck::index_t M1N1ThreadClusterN11; + + std::array ABlockTransferThreadSliceLengths_K_M0_M1; + std::array ABlockTransferThreadClusterLengths_K_M0_M1; + std::array ABlockTransferThreadClusterArrangeOrder; + std::array ABlockTransferSrcAccessOrder; + ck::index_t ABlockTransferSrcVectorDim; + ck::index_t ABlockTransferSrcScalarPerVector; + ck::index_t 
ABlockTransferDstScalarPerVector_M1; + bool AThreadTransferSrcResetCoordinateAfterRun; + + std::array BBlockTransferThreadSliceLengths_K_N0_N1; + std::array BBlockTransferThreadClusterLengths_K_N0_N1; + std::array BBlockTransferThreadClusterArrangeOrder; + std::array BBlockTransferSrcAccessOrder; + ck::index_t BBlockTransferSrcVectorDim; + ck::index_t BBlockTransferSrcScalarPerVector; + ck::index_t BBlockTransferDstScalarPerVector_N1; + bool BThreadTransferSrcResetCoordinateAfterRun; + + std::array CThreadTransferSrcDstAccessOrder; + ck::index_t CThreadTransferSrcDstVectorDim; + ck::index_t CThreadTransferDstScalarPerVector; +}; + +static tunable_dyn_conv_fwd_v4r4_nchw_kcyx_nkhw default_tunable_dyn_conv_fwd_v4r4_nchw_kcyx_nkhw = { + 256, 128, 128, 8, 4, 4, 1, + 8, 8, 2, 2, {4, 1, 1}, {2, 1, 128}, {2, 1, 0}, + {2, 1, 0}, 0, 4, 1, false, {4, 1, 1}, {2, 1, 128}, + {0, 1, 2}, {0, 1, 2}, 2, 1, 1, false, {3, 4, 5, 0, 1, 2}, + 5, 1}; + +struct tunable_dyn_conv_fwd_v4r5_nchw_kcyx_nkhw +{ + ck::index_t BlockSize; + + ck::index_t GM1PerBlockGM11; + ck::index_t GN1PerBlockGN11; + ck::index_t KPerBlock; + + ck::index_t M1PerThread; + ck::index_t N1PerThread; + ck::index_t KPerThread; + + ck::index_t M1N1ThreadClusterM10; + ck::index_t M1N1ThreadClusterN10; + ck::index_t M1N1ThreadClusterM11; + ck::index_t M1N1ThreadClusterN11; + + std::array ABlockTransferThreadSliceLengths_GK_GM0_GM10_GM11; + std::array ABlockTransferThreadClusterLengths_GK_GM0_GM10_GM11; + std::array ABlockTransferThreadClusterArrangeOrder; + std::array ABlockTransferSrcAccessOrder; + ck::index_t ABlockTransferSrcVectorDim; + ck::index_t ABlockTransferSrcScalarPerVector; + ck::index_t ABlockTransferDstScalarPerVector_GM11; + bool AThreadTransferSrcResetCoordinateAfterRun; + + std::array BBlockTransferThreadSliceLengths_GK_GN0_GN10_GN11; + std::array BBlockTransferThreadClusterLengths_GK_GN0_GN10_GN11; + std::array BBlockTransferThreadClusterArrangeOrder; + std::array BBlockTransferSrcAccessOrder; + 
ck::index_t BBlockTransferSrcVectorDim; + ck::index_t BBlockTransferSrcScalarPerVector; + ck::index_t BBlockTransferDstScalarPerVector_GN11; + bool BThreadTransferSrcResetCoordinateAfterRun; + + std::array CThreadTransferSrcDstAccessOrder; + ck::index_t CThreadTransferSrcDstVectorDim; + ck::index_t CThreadTransferDstScalarPerVector; +}; + +static tunable_dyn_conv_fwd_v4r5_nchw_kcyx_nkhw default_tunable_dyn_conv_fwd_v4r5_nchw_kcyx_nkhw = { + 256, + 128, + 32, + 8, + 4, + 4, + 1, + 2, + 2, + 8, + 8, + {4, 1, 1, 1}, + {2, 1, 1, 128}, + {3, 2, 1, 0}, + {3, 2, 1, 0}, + 0, + 4, + 1, + false, + {1, 4, 1, 1}, + {8, 1, 1, 32}, + {0, 3, 2, 1}, + {0, 3, 2, 1}, + 3, + 1, + 1, + false, + {3, 4, 5, 0, 1, 2}, + 5, + 1}; + +static inline int +conv_hw_out_size(int hw_in_size, int leftPad, int rightPad, int dilation, int yx_size, int stride) +{ + return (hw_in_size + leftPad + rightPad - dilation * (yx_size - 1) - 1) / stride + 1; +} + +#endif diff --git a/driver/include/olc_device_dynamic_convolution_forward_implicit_gemm_v4r4_nchw_kcyx_nkhw.hpp b/driver/include/olc_device_dynamic_convolution_forward_implicit_gemm_v4r4_nchw_kcyx_nkhw.hpp new file mode 100644 index 0000000000..94a9bcc06d --- /dev/null +++ b/driver/include/olc_device_dynamic_convolution_forward_implicit_gemm_v4r4_nchw_kcyx_nkhw.hpp @@ -0,0 +1,383 @@ +#include "device.hpp" +#include "host_tensor.hpp" +#include "dynamic_tensor_descriptor.hpp" +#include "dynamic_tensor_descriptor_helper.hpp" +#include "transform_forward_convolution_into_gemm_v4r4_nchw_kcyx_nkhw.hpp" + +#include "olc_driver_common.hpp" +#include "conv_tunables.hpp" + +#include "handle.hpp" + +namespace detail_dyn_conv_fwd_v4r4_nchw_kcyx_nkhw { + +template +static std::string get_network_config_string_from_types() +{ + std::string out; + + out += static_cast(Driver::get_typeid_from_type()) + + static_cast(Driver::get_typeid_from_type()) + + static_cast(Driver::get_typeid_from_type()); + + return (out); +}; + +static std::string 
+get_network_config_string_from_tunable(const tunable_dyn_conv_fwd_v4r4_nchw_kcyx_nkhw* pt) +{ + std::string out("TUN_"); + + out += std::to_string(pt->BlockSize) + "_"; + + out += std::to_string(pt->MPerBlock) + "x" + std::to_string(pt->NPerBlock) + "x" + + std::to_string(pt->KPerBlock) + "_"; + out += std::to_string(pt->M1PerThread) + "x" + std::to_string(pt->N1PerThread) + "x" + + std::to_string(pt->KPerThread) + "_"; + out += std::to_string(pt->M1N1ThreadClusterM10) + "x" + + std::to_string(pt->M1N1ThreadClusterN10) + "x" + + std::to_string(pt->M1N1ThreadClusterM11) + "x" + + std::to_string(pt->M1N1ThreadClusterN11) + "_"; + + out += std::to_string(pt->ABlockTransferThreadSliceLengths_K_M0_M1[0]) + "x" + + std::to_string(pt->ABlockTransferThreadSliceLengths_K_M0_M1[1]) + "x" + + std::to_string(pt->ABlockTransferThreadSliceLengths_K_M0_M1[2]) + "_"; + + out += std::to_string(pt->ABlockTransferThreadClusterLengths_K_M0_M1[0]) + "x" + + std::to_string(pt->ABlockTransferThreadClusterLengths_K_M0_M1[1]) + "x" + + std::to_string(pt->ABlockTransferThreadClusterLengths_K_M0_M1[2]) + "_"; + + out += std::to_string(pt->ABlockTransferThreadClusterArrangeOrder[0]) + "x" + + std::to_string(pt->ABlockTransferThreadClusterArrangeOrder[1]) + "x" + + std::to_string(pt->ABlockTransferThreadClusterArrangeOrder[2]) + "_"; + + out += std::to_string(pt->ABlockTransferSrcAccessOrder[0]) + "x" + + std::to_string(pt->ABlockTransferSrcAccessOrder[1]) + "x" + + std::to_string(pt->ABlockTransferSrcAccessOrder[2]) + "_"; + + out += std::to_string(pt->ABlockTransferSrcVectorDim) + "_"; + out += std::to_string(pt->ABlockTransferSrcScalarPerVector) + "_"; + out += std::to_string(pt->ABlockTransferDstScalarPerVector_M1) + "_"; + out += std::to_string(pt->AThreadTransferSrcResetCoordinateAfterRun) + "_"; + + out += std::to_string(pt->BBlockTransferThreadSliceLengths_K_N0_N1[0]) + "x" + + std::to_string(pt->BBlockTransferThreadSliceLengths_K_N0_N1[1]) + "x" + + 
std::to_string(pt->BBlockTransferThreadSliceLengths_K_N0_N1[2]) + "_"; + + out += std::to_string(pt->BBlockTransferThreadClusterLengths_K_N0_N1[0]) + "x" + + std::to_string(pt->BBlockTransferThreadClusterLengths_K_N0_N1[1]) + "x" + + std::to_string(pt->BBlockTransferThreadClusterLengths_K_N0_N1[2]) + "_"; + + out += std::to_string(pt->BBlockTransferThreadClusterArrangeOrder[0]) + "x" + + std::to_string(pt->BBlockTransferThreadClusterArrangeOrder[1]) + "x" + + std::to_string(pt->BBlockTransferThreadClusterArrangeOrder[2]) + "_"; + + out += std::to_string(pt->BBlockTransferSrcAccessOrder[0]) + "x" + + std::to_string(pt->BBlockTransferSrcAccessOrder[1]) + "x" + + std::to_string(pt->BBlockTransferSrcAccessOrder[2]) + "_"; + + out += std::to_string(pt->BBlockTransferSrcVectorDim) + "_"; + out += std::to_string(pt->BBlockTransferSrcScalarPerVector) + "_"; + out += std::to_string(pt->BBlockTransferDstScalarPerVector_N1) + "_"; + out += std::to_string(pt->BThreadTransferSrcResetCoordinateAfterRun) + "_"; + + out += std::to_string(pt->CThreadTransferSrcDstAccessOrder[0]) + "x" + + std::to_string(pt->CThreadTransferSrcDstAccessOrder[1]) + "x" + + std::to_string(pt->CThreadTransferSrcDstAccessOrder[2]) + "x" + + std::to_string(pt->CThreadTransferSrcDstAccessOrder[3]) + "x" + + std::to_string(pt->CThreadTransferSrcDstAccessOrder[4]) + "x" + + std::to_string(pt->CThreadTransferSrcDstAccessOrder[5]) + "_"; + + out += std::to_string(pt->CThreadTransferSrcDstVectorDim) + "_"; + out += std::to_string(pt->CThreadTransferDstScalarPerVector); + + return (out); +}; + +template +static std::string get_definition_string_from_types() +{ + std::string out; + + out += " -DCK_PARAM_IN_WEI_DATATYPE=" + std::to_string(Driver::get_typeid_from_type()) + + " -DCK_PARAM_CONV_COMPTYPE=" + std::to_string(Driver::get_typeid_from_type()) + + " -DCK_PARAM_OUT_DATATYPE=" + std::to_string(Driver::get_typeid_from_type()); + + return (out); +}; + +static std::string 
+get_definition_string_from_tunable(const tunable_dyn_conv_fwd_v4r4_nchw_kcyx_nkhw* pt) +{ + std::string out; + + out += " -DCK_PARAM_BlockSize=" + std::to_string(pt->BlockSize); + + out += " -DCK_PARAM_MPerBlock=" + std::to_string(pt->MPerBlock) + + " -DCK_PARAM_NPerBlock=" + std::to_string(pt->NPerBlock) + + " -DCK_PARAM_KPerBlock=" + std::to_string(pt->KPerBlock); + out += " -DCK_PARAM_M1PerThread=" + std::to_string(pt->M1PerThread) + + " -DCK_PARAM_N1PerThread=" + std::to_string(pt->N1PerThread) + + " -DCK_PARAM_KPerThread=" + std::to_string(pt->KPerThread); + + out += " -DCK_PARAM_M1N1ThreadClusterM10=" + std::to_string(pt->M1N1ThreadClusterM10) + + " -DCK_PARAM_M1N1ThreadClusterN10=" + std::to_string(pt->M1N1ThreadClusterN10) + + " -DCK_PARAM_M1N1ThreadClusterM11=" + std::to_string(pt->M1N1ThreadClusterM11) + + " -DCK_PARAM_M1N1ThreadClusterN11=" + std::to_string(pt->M1N1ThreadClusterN11); + + out += " -DCK_PARAM_ABlockTransferThreadSliceLengths_K_M0_M1=" + + std::to_string(pt->ABlockTransferThreadSliceLengths_K_M0_M1[0]) + "," + + std::to_string(pt->ABlockTransferThreadSliceLengths_K_M0_M1[1]) + "," + + std::to_string(pt->ABlockTransferThreadSliceLengths_K_M0_M1[2]); + + out += " -DCK_PARAM_ABlockTransferThreadClusterLengths_K_M0_M1=" + + std::to_string(pt->ABlockTransferThreadClusterLengths_K_M0_M1[0]) + "," + + std::to_string(pt->ABlockTransferThreadClusterLengths_K_M0_M1[1]) + "," + + std::to_string(pt->ABlockTransferThreadClusterLengths_K_M0_M1[2]); + + out += " -DCK_PARAM_ABlockTransferThreadClusterArrangeOrder=" + + std::to_string(pt->ABlockTransferThreadClusterArrangeOrder[0]) + "," + + std::to_string(pt->ABlockTransferThreadClusterArrangeOrder[1]) + "," + + std::to_string(pt->ABlockTransferThreadClusterArrangeOrder[2]); + + out += " -DCK_PARAM_ABlockTransferSrcAccessOrder=" + + std::to_string(pt->ABlockTransferSrcAccessOrder[0]) + "," + + std::to_string(pt->ABlockTransferSrcAccessOrder[1]) + "," + + 
std::to_string(pt->ABlockTransferSrcAccessOrder[2]); + + out += + " -DCK_PARAM_ABlockTransferSrcVectorDim=" + std::to_string(pt->ABlockTransferSrcVectorDim); + out += " -DCK_PARAM_ABlockTransferSrcScalarPerVector=" + + std::to_string(pt->ABlockTransferSrcScalarPerVector); + out += " -DCK_PARAM_ABlockTransferDstScalarPerVector_M1=" + + std::to_string(pt->ABlockTransferDstScalarPerVector_M1); + out += " -DCK_PARAM_AThreadTransferSrcResetCoordinateAfterRun=" + + std::to_string(pt->AThreadTransferSrcResetCoordinateAfterRun); + + out += " -DCK_PARAM_BBlockTransferThreadSliceLengths_K_N0_N1=" + + std::to_string(pt->BBlockTransferThreadSliceLengths_K_N0_N1[0]) + "," + + std::to_string(pt->BBlockTransferThreadSliceLengths_K_N0_N1[1]) + "," + + std::to_string(pt->BBlockTransferThreadSliceLengths_K_N0_N1[2]); + + out += " -DCK_PARAM_BBlockTransferThreadClusterLengths_K_N0_N1=" + + std::to_string(pt->BBlockTransferThreadClusterLengths_K_N0_N1[0]) + "," + + std::to_string(pt->BBlockTransferThreadClusterLengths_K_N0_N1[1]) + "," + + std::to_string(pt->BBlockTransferThreadClusterLengths_K_N0_N1[2]); + + out += " -DCK_PARAM_BBlockTransferThreadClusterArrangeOrder=" + + std::to_string(pt->BBlockTransferThreadClusterArrangeOrder[0]) + "," + + std::to_string(pt->BBlockTransferThreadClusterArrangeOrder[1]) + "," + + std::to_string(pt->BBlockTransferThreadClusterArrangeOrder[2]); + + out += " -DCK_PARAM_BBlockTransferSrcAccessOrder=" + + std::to_string(pt->BBlockTransferSrcAccessOrder[0]) + "," + + std::to_string(pt->BBlockTransferSrcAccessOrder[1]) + "," + + std::to_string(pt->BBlockTransferSrcAccessOrder[2]); + + out += + " -DCK_PARAM_BBlockTransferSrcVectorDim=" + std::to_string(pt->BBlockTransferSrcVectorDim); + out += " -DCK_PARAM_BBlockTransferSrcScalarPerVector=" + + std::to_string(pt->BBlockTransferSrcScalarPerVector); + out += " -DCK_PARAM_BBlockTransferDstScalarPerVector_N1=" + + std::to_string(pt->BBlockTransferDstScalarPerVector_N1); + out += " 
-DCK_PARAM_BThreadTransferSrcResetCoordinateAfterRun=" + + std::to_string(pt->BThreadTransferSrcResetCoordinateAfterRun); + + out += " -DCK_PARAM_CThreadTransferSrcDstAccessOrder=" + + std::to_string(pt->CThreadTransferSrcDstAccessOrder[0]) + "," + + std::to_string(pt->CThreadTransferSrcDstAccessOrder[1]) + "," + + std::to_string(pt->CThreadTransferSrcDstAccessOrder[2]) + "," + + std::to_string(pt->CThreadTransferSrcDstAccessOrder[3]) + "," + + std::to_string(pt->CThreadTransferSrcDstAccessOrder[4]) + "," + + std::to_string(pt->CThreadTransferSrcDstAccessOrder[5]); + + out += " -DCK_PARAM_CThreadTransferSrcDstVectorDim=" + + std::to_string(pt->CThreadTransferSrcDstVectorDim); + out += " -DCK_PARAM_CThreadTransferDstScalarPerVector=" + + std::to_string(pt->CThreadTransferDstScalarPerVector); + + return (out); +}; + +} // namespace detail_dyn_conv_fwd_v4r4_nchw_kcyx_nkhw + +template +void device_dynamic_convolution_forward_implicit_gemm_v4r4_nchw_kcyx_nkhw_olc( + olCompile::Handle* handle, + const InLengths& in_n_c_hi_wi_lengths, + const WeiLengths& wei_k_c_y_x_lengths, + const OutLengths& out_n_k_ho_wo_lengths, + const ConvStrides& conv_strides, + const ConvDilations& conv_dilations, + const InLeftPads& in_left_pads, + const InRightPads& in_right_pads, + const Tensor& in_n_c_hi_wi, + const Tensor& wei_k_c_y_x, + Tensor& out_n_k_ho_wo, + const tunable_dyn_conv_fwd_v4r4_nchw_kcyx_nkhw* tunable, + ck::index_t nrepeat) +{ + using namespace ck; + using namespace detail_dyn_conv_fwd_v4r4_nchw_kcyx_nkhw; + using size_t = std::size_t; + + ///////////////////////////////////////////////////////////////////////////////////////////////////////////// + // The follow codes are only used for computing the grid_size, hasMainKBlockLoop, + // hasDoubleTailKBlockLoop + + constexpr auto I0 = Number<0>{}; + constexpr auto I1 = Number<1>{}; + constexpr auto I2 = Number<2>{}; + constexpr auto I3 = Number<3>{}; + + const auto in_n_c_hi_wi_desc = + 
make_dynamic_naive_tensor_descriptor_packed_v2(in_n_c_hi_wi_lengths); + const auto wei_k_c_y_x_desc = + make_dynamic_naive_tensor_descriptor_packed_v2(wei_k_c_y_x_lengths); + const auto out_n_k_ho_wo_desc = + make_dynamic_naive_tensor_descriptor_packed_v2(out_n_k_ho_wo_lengths); + + const auto descs = + transform_forward_convolution_into_gemm_v4r4_nchw_kcyx_nkhw_pad(wei_k_c_y_x_desc, + in_n_c_hi_wi_desc, + out_n_k_ho_wo_desc, + conv_strides, + conv_dilations, + in_left_pads, + in_right_pads); + const auto a_k_m_grid_desc = descs[I0]; + const auto c_m_n_grid_desc = descs[I2]; + const auto M = c_m_n_grid_desc.GetLength(I0); + const auto N = c_m_n_grid_desc.GetLength(I1); + const auto K = a_k_m_grid_desc.GetLength(I0); + + const index_t grid_size = (M / tunable->MPerBlock) * (N / tunable->NPerBlock); + const bool hasMainKBlockLoop = ((K + tunable->KPerBlock) / (2 * tunable->KPerBlock) > 1); + const bool hasDoubleTailKBlockLoop = ((K / tunable->KPerBlock) % 2 == 0); + ///////////////////////////////////////////////////////////////////////////////////////////////////////////// + + // these buffers are usually provided by the user application + DeviceMem in_n_c_hi_wi_dev_buf(sizeof(TInWei) * in_n_c_hi_wi.mDesc.GetElementSpace()); + DeviceMem wei_k_c_y_x_dev_buf(sizeof(TInWei) * wei_k_c_y_x.mDesc.GetElementSpace()); + DeviceMem out_n_k_ho_wo_dev_buf(sizeof(TOut) * out_n_k_ho_wo.mDesc.GetElementSpace()); + + in_n_c_hi_wi_dev_buf.ToDevice(in_n_c_hi_wi.mData.data()); + wei_k_c_y_x_dev_buf.ToDevice(wei_k_c_y_x.mData.data()); + out_n_k_ho_wo_dev_buf.ToDevice(out_n_k_ho_wo.mData.data()); + + // these are workspace buffers that should be expressed to the user by the corresponding + // workspace API + DeviceMem workspace_buf(4096); + + void* a_k_m0_m1_grid_desc_dev_buf = workspace_buf.GetDeviceBuffer(); + void* b_k_n0_n1_grid_desc_dev_buf = + static_cast(static_cast(workspace_buf.GetDeviceBuffer()) + 1024); + void* c_m0_m10_m11_n0_n10_n11_grid_desc_dev_buf = + 
static_cast(static_cast(workspace_buf.GetDeviceBuffer()) + 2048); + void* c_blockid_to_m0_n0_block_cluster_adaptor_dev_buf = + static_cast(static_cast(workspace_buf.GetDeviceBuffer()) + 3072); + + const std::vector vld = {static_cast(tunable->BlockSize), 1, 1}; + const std::vector vgd1 = {static_cast(tunable->BlockSize), 1, 1}; + const std::vector vgd2 = {static_cast(grid_size * tunable->BlockSize), 1, 1}; + + std::string program_name = "dynamic_convolution_forward_implicit_gemm_v4r4_nchw_kcyx_nkhw.cpp"; + std::string algo_name = "implicit_gemm_conv_fwd_v4r4_nchw"; + + std::string param = " -std=c++17 "; + std::string network_config; + + param += get_definition_string_from_types() + " " + + get_definition_string_from_tunable(tunable) + + " -DCK_PARAM_HAS_MAIN_KBLOCK_LOOP=" + std::to_string(hasMainKBlockLoop) + + " -DCK_PARAM_HAS_DOUBLE_TAIL_KBLOCK_LOOP=" + std::to_string(hasDoubleTailKBlockLoop); + network_config = get_network_config_string_from_types() + "_" + + get_network_config_string_from_tunable(tunable) + "_" + + std::to_string(hasMainKBlockLoop) + "_" + + std::to_string(hasDoubleTailKBlockLoop); + + std::vector kernel1_times; + std::vector kernel2_times; + + for(index_t i = 0; i < nrepeat; ++i) + { + KernelTimer timer1, timer2; + std::string kernel_name; + + kernel_name = "dynamic_convolution_forward_implicit_gemm_v4r4_nchw_kcyx_nkhw_prepare"; + auto network_config_1 = network_config + "_1"; + + timer1.Start(); + handle->AddKernel(algo_name, network_config_1, program_name, kernel_name, vld, vgd1, param)( + static_cast(in_n_c_hi_wi_lengths[I0]), + static_cast(in_n_c_hi_wi_lengths[I1]), + static_cast(in_n_c_hi_wi_lengths[I2]), + static_cast(in_n_c_hi_wi_lengths[I3]), + static_cast(wei_k_c_y_x_lengths[I0]), + static_cast(wei_k_c_y_x_lengths[I2]), + static_cast(wei_k_c_y_x_lengths[I3]), + conv_strides[I0], + conv_strides[I1], + conv_dilations[I0], + conv_dilations[I1], + in_left_pads[I0], + in_left_pads[I1], + in_right_pads[I0], + in_right_pads[I1], + 
a_k_m0_m1_grid_desc_dev_buf, + b_k_n0_n1_grid_desc_dev_buf, + c_m0_m10_m11_n0_n10_n11_grid_desc_dev_buf, + c_blockid_to_m0_n0_block_cluster_adaptor_dev_buf); + timer1.End(); + + kernel_name = "dynamic_convolution_forward_implicit_gemm_v4r4_nchw_kcyx_nkhw"; + auto network_config_2 = network_config + "_2"; + + timer2.Start(); + handle->AddKernel(algo_name, network_config_2, program_name, kernel_name, vld, vgd2, param)( + reinterpret_cast(wei_k_c_y_x_dev_buf.GetDeviceBuffer()), + reinterpret_cast(in_n_c_hi_wi_dev_buf.GetDeviceBuffer()), + reinterpret_cast(out_n_k_ho_wo_dev_buf.GetDeviceBuffer()), + (const void*)(a_k_m0_m1_grid_desc_dev_buf), + (const void*)(b_k_n0_n1_grid_desc_dev_buf), + (const void*)(c_m0_m10_m11_n0_n10_n11_grid_desc_dev_buf), + (const void*)(c_blockid_to_m0_n0_block_cluster_adaptor_dev_buf)); + timer2.End(); + + kernel1_times.push_back(timer1.GetElapsedTime()); + kernel2_times.push_back(timer2.GetElapsedTime()); + } + + { + auto ave_time1 = Driver::get_effective_average(kernel1_times); + auto ave_time2 = Driver::get_effective_average(kernel2_times); + + const auto N = in_n_c_hi_wi_lengths[I0]; + const auto C = in_n_c_hi_wi_lengths[I1]; + + const auto K = out_n_k_ho_wo_lengths[I1]; + const auto Ho = out_n_k_ho_wo_lengths[I2]; + const auto Wo = out_n_k_ho_wo_lengths[I3]; + + const auto Y = wei_k_c_y_x_lengths[I2]; + const auto X = wei_k_c_y_x_lengths[I3]; + + float perf = (float)(std::size_t(2) * N * K * Ho * Wo * C * Y * X) / + (std::size_t(1000) * 1000 * 1000) / (ave_time1 + ave_time2); + + std::cout << "Average time : " << ave_time1 + ave_time2 << " ms(" << ave_time1 << ", " + << ave_time2 << "), " << perf << " TFlop/s" << std::endl; + }; + + // copy result back to host + out_n_k_ho_wo_dev_buf.FromDevice(out_n_k_ho_wo.mData.data()); +} diff --git a/driver/include/olc_device_dynamic_convolution_forward_implicit_gemm_v4r5_nchw_kcyx_nkhw.hpp b/driver/include/olc_device_dynamic_convolution_forward_implicit_gemm_v4r5_nchw_kcyx_nkhw.hpp new file mode 
100644 index 0000000000..914de6e81b --- /dev/null +++ b/driver/include/olc_device_dynamic_convolution_forward_implicit_gemm_v4r5_nchw_kcyx_nkhw.hpp @@ -0,0 +1,404 @@ +#include "device.hpp" +#include "host_tensor.hpp" +#include "dynamic_tensor_descriptor.hpp" +#include "dynamic_tensor_descriptor_helper.hpp" +#include "transform_forward_convolution_into_gemm_v4r5_nchw_kcyx_nkhw.hpp" + +#include "olc_driver_common.hpp" +#include "conv_tunables.hpp" + +#include "handle.hpp" + +namespace detail_dyn_conv_fwd_v4r5_nchw_kcyx_nkhw { + +template <typename TInWei, typename TAcc, typename TOut> +static std::string get_network_config_string_from_types() +{ + std::string out; + + // append each type id as its own character; chaining the casts with '+' would + // sum the three chars into an int and append a single truncated character, + // letting different type combinations collide in the kernel-cache key + out += static_cast<char>(Driver::get_typeid_from_type<TInWei>()); + out += static_cast<char>(Driver::get_typeid_from_type<TAcc>()); + out += static_cast<char>(Driver::get_typeid_from_type<TOut>()); + + return (out); +}; + +static std::string +get_network_config_string_from_tunable(const tunable_dyn_conv_fwd_v4r5_nchw_kcyx_nkhw* pt) +{ + std::string out("TUN_"); + + out += std::to_string(pt->BlockSize) + "_"; + + out += std::to_string(pt->GM1PerBlockGM11) + "x" + std::to_string(pt->GN1PerBlockGN11) + "x" + + std::to_string(pt->KPerBlock) + "_"; + out += std::to_string(pt->M1PerThread) + "x" + std::to_string(pt->N1PerThread) + "x" + + std::to_string(pt->KPerThread) + "_"; + out += std::to_string(pt->M1N1ThreadClusterM10) + "x" + + std::to_string(pt->M1N1ThreadClusterN10) + "x" + + std::to_string(pt->M1N1ThreadClusterM11) + "x" + + std::to_string(pt->M1N1ThreadClusterN11) + "_"; + + out += std::to_string(pt->ABlockTransferThreadSliceLengths_GK_GM0_GM10_GM11[0]) + "x" + + std::to_string(pt->ABlockTransferThreadSliceLengths_GK_GM0_GM10_GM11[1]) + "x" + + std::to_string(pt->ABlockTransferThreadSliceLengths_GK_GM0_GM10_GM11[2]) + "x" + + std::to_string(pt->ABlockTransferThreadSliceLengths_GK_GM0_GM10_GM11[3]) + "_"; + + out += std::to_string(pt->ABlockTransferThreadClusterLengths_GK_GM0_GM10_GM11[0]) + "x" + + std::to_string(pt->ABlockTransferThreadClusterLengths_GK_GM0_GM10_GM11[1]) + "x" + +
std::to_string(pt->ABlockTransferThreadClusterLengths_GK_GM0_GM10_GM11[2]) + "x" + + std::to_string(pt->ABlockTransferThreadClusterLengths_GK_GM0_GM10_GM11[3]) + "_"; + + out += std::to_string(pt->ABlockTransferThreadClusterArrangeOrder[0]) + "x" + + std::to_string(pt->ABlockTransferThreadClusterArrangeOrder[1]) + "x" + + std::to_string(pt->ABlockTransferThreadClusterArrangeOrder[2]) + "x" + + std::to_string(pt->ABlockTransferThreadClusterArrangeOrder[3]) + "_"; + + out += std::to_string(pt->ABlockTransferSrcAccessOrder[0]) + "x" + + std::to_string(pt->ABlockTransferSrcAccessOrder[1]) + "x" + + std::to_string(pt->ABlockTransferSrcAccessOrder[2]) + "x" + + std::to_string(pt->ABlockTransferSrcAccessOrder[3]) + "_"; + + out += std::to_string(pt->ABlockTransferSrcVectorDim) + "_"; + out += std::to_string(pt->ABlockTransferSrcScalarPerVector) + "_"; + out += std::to_string(pt->ABlockTransferDstScalarPerVector_GM11) + "_"; + out += std::to_string(pt->AThreadTransferSrcResetCoordinateAfterRun) + "_"; + + out += std::to_string(pt->BBlockTransferThreadSliceLengths_GK_GN0_GN10_GN11[0]) + "x" + + std::to_string(pt->BBlockTransferThreadSliceLengths_GK_GN0_GN10_GN11[1]) + "x" + + std::to_string(pt->BBlockTransferThreadSliceLengths_GK_GN0_GN10_GN11[2]) + "x" + + std::to_string(pt->BBlockTransferThreadSliceLengths_GK_GN0_GN10_GN11[3]) + "_"; + + out += std::to_string(pt->BBlockTransferThreadClusterLengths_GK_GN0_GN10_GN11[0]) + "x" + + std::to_string(pt->BBlockTransferThreadClusterLengths_GK_GN0_GN10_GN11[1]) + "x" + + std::to_string(pt->BBlockTransferThreadClusterLengths_GK_GN0_GN10_GN11[2]) + "x" + + std::to_string(pt->BBlockTransferThreadClusterLengths_GK_GN0_GN10_GN11[3]) + "_"; + + out += std::to_string(pt->BBlockTransferThreadClusterArrangeOrder[0]) + "x" + + std::to_string(pt->BBlockTransferThreadClusterArrangeOrder[1]) + "x" + + std::to_string(pt->BBlockTransferThreadClusterArrangeOrder[2]) + "x" + + std::to_string(pt->BBlockTransferThreadClusterArrangeOrder[3]) + "_"; + + out
+= std::to_string(pt->BBlockTransferSrcAccessOrder[0]) + "x" + + std::to_string(pt->BBlockTransferSrcAccessOrder[1]) + "x" + + std::to_string(pt->BBlockTransferSrcAccessOrder[2]) + "x" + + std::to_string(pt->BBlockTransferSrcAccessOrder[3]) + "_"; + + out += std::to_string(pt->BBlockTransferSrcVectorDim) + "_"; + out += std::to_string(pt->BBlockTransferSrcScalarPerVector) + "_"; + out += std::to_string(pt->BBlockTransferDstScalarPerVector_GN11) + "_"; + out += std::to_string(pt->BThreadTransferSrcResetCoordinateAfterRun) + "_"; + + out += std::to_string(pt->CThreadTransferSrcDstAccessOrder[0]) + "x" + + std::to_string(pt->CThreadTransferSrcDstAccessOrder[1]) + "x" + + std::to_string(pt->CThreadTransferSrcDstAccessOrder[2]) + "x" + + std::to_string(pt->CThreadTransferSrcDstAccessOrder[3]) + "x" + + std::to_string(pt->CThreadTransferSrcDstAccessOrder[4]) + "x" + + std::to_string(pt->CThreadTransferSrcDstAccessOrder[5]) + "_"; + + out += std::to_string(pt->CThreadTransferSrcDstVectorDim) + "_"; + out += std::to_string(pt->CThreadTransferDstScalarPerVector); + + return (out); +}; + +template +static std::string get_definition_string_from_types() +{ + std::string out; + + out += " -DCK_PARAM_IN_WEI_DATATYPE=" + std::to_string(Driver::get_typeid_from_type()) + + " -DCK_PARAM_CONV_COMPTYPE=" + std::to_string(Driver::get_typeid_from_type()) + + " -DCK_PARAM_OUT_DATATYPE=" + std::to_string(Driver::get_typeid_from_type()); + + return (out); +}; + +static std::string +get_definition_string_from_tunable(const tunable_dyn_conv_fwd_v4r5_nchw_kcyx_nkhw* pt) +{ + std::string out; + + out += " -DCK_PARAM_BlockSize=" + std::to_string(pt->BlockSize); + + out += " -DCK_PARAM_GM1PerBlockGM11=" + std::to_string(pt->GM1PerBlockGM11) + + " -DCK_PARAM_GN1PerBlockGN11=" + std::to_string(pt->GN1PerBlockGN11) + + " -DCK_PARAM_KPerBlock=" + std::to_string(pt->KPerBlock); + out += " -DCK_PARAM_M1PerThread=" + std::to_string(pt->M1PerThread) + + " -DCK_PARAM_N1PerThread=" + 
std::to_string(pt->N1PerThread) + + " -DCK_PARAM_KPerThread=" + std::to_string(pt->KPerThread); + + out += " -DCK_PARAM_M1N1ThreadClusterM10=" + std::to_string(pt->M1N1ThreadClusterM10) + + " -DCK_PARAM_M1N1ThreadClusterN10=" + std::to_string(pt->M1N1ThreadClusterN10) + + " -DCK_PARAM_M1N1ThreadClusterM11=" + std::to_string(pt->M1N1ThreadClusterM11) + + " -DCK_PARAM_M1N1ThreadClusterN11=" + std::to_string(pt->M1N1ThreadClusterN11); + + out += " -DCK_PARAM_ABlockTransferThreadSliceLengths_GK_GM0_GM10_GM11=" + + std::to_string(pt->ABlockTransferThreadSliceLengths_GK_GM0_GM10_GM11[0]) + "," + + std::to_string(pt->ABlockTransferThreadSliceLengths_GK_GM0_GM10_GM11[1]) + "," + + std::to_string(pt->ABlockTransferThreadSliceLengths_GK_GM0_GM10_GM11[2]) + "," + + std::to_string(pt->ABlockTransferThreadSliceLengths_GK_GM0_GM10_GM11[3]); + + out += " -DCK_PARAM_ABlockTransferThreadClusterLengths_GK_GM0_GM10_GM11=" + + std::to_string(pt->ABlockTransferThreadClusterLengths_GK_GM0_GM10_GM11[0]) + "," + + std::to_string(pt->ABlockTransferThreadClusterLengths_GK_GM0_GM10_GM11[1]) + "," + + std::to_string(pt->ABlockTransferThreadClusterLengths_GK_GM0_GM10_GM11[2]) + "," + + std::to_string(pt->ABlockTransferThreadClusterLengths_GK_GM0_GM10_GM11[3]); + + out += " -DCK_PARAM_ABlockTransferThreadClusterArrangeOrder=" + + std::to_string(pt->ABlockTransferThreadClusterArrangeOrder[0]) + "," + + std::to_string(pt->ABlockTransferThreadClusterArrangeOrder[1]) + "," + + std::to_string(pt->ABlockTransferThreadClusterArrangeOrder[2]) + "," + + std::to_string(pt->ABlockTransferThreadClusterArrangeOrder[3]); + + out += " -DCK_PARAM_ABlockTransferSrcAccessOrder=" + + std::to_string(pt->ABlockTransferSrcAccessOrder[0]) + "," + + std::to_string(pt->ABlockTransferSrcAccessOrder[1]) + "," + + std::to_string(pt->ABlockTransferSrcAccessOrder[2]) + "," + + std::to_string(pt->ABlockTransferSrcAccessOrder[3]); + + out += + " -DCK_PARAM_ABlockTransferSrcVectorDim=" + 
std::to_string(pt->ABlockTransferSrcVectorDim); + out += " -DCK_PARAM_ABlockTransferSrcScalarPerVector=" + + std::to_string(pt->ABlockTransferSrcScalarPerVector); + out += " -DCK_PARAM_ABlockTransferDstScalarPerVector_GM11=" + + std::to_string(pt->ABlockTransferDstScalarPerVector_GM11); + out += " -DCK_PARAM_AThreadTransferSrcResetCoordinateAfterRun=" + + std::to_string(pt->AThreadTransferSrcResetCoordinateAfterRun); + + out += " -DCK_PARAM_BBlockTransferThreadSliceLengths_GK_GN0_GN10_GN11=" + + std::to_string(pt->BBlockTransferThreadSliceLengths_GK_GN0_GN10_GN11[0]) + "," + + std::to_string(pt->BBlockTransferThreadSliceLengths_GK_GN0_GN10_GN11[1]) + "," + + std::to_string(pt->BBlockTransferThreadSliceLengths_GK_GN0_GN10_GN11[2]) + "," + + std::to_string(pt->BBlockTransferThreadSliceLengths_GK_GN0_GN10_GN11[3]); + + out += " -DCK_PARAM_BBlockTransferThreadClusterLengths_GK_GN0_GN10_GN11=" + + std::to_string(pt->BBlockTransferThreadClusterLengths_GK_GN0_GN10_GN11[0]) + "," + + std::to_string(pt->BBlockTransferThreadClusterLengths_GK_GN0_GN10_GN11[1]) + "," + + std::to_string(pt->BBlockTransferThreadClusterLengths_GK_GN0_GN10_GN11[2]) + "," + + std::to_string(pt->BBlockTransferThreadClusterLengths_GK_GN0_GN10_GN11[3]); + + out += " -DCK_PARAM_BBlockTransferThreadClusterArrangeOrder=" + + std::to_string(pt->BBlockTransferThreadClusterArrangeOrder[0]) + "," + + std::to_string(pt->BBlockTransferThreadClusterArrangeOrder[1]) + "," + + std::to_string(pt->BBlockTransferThreadClusterArrangeOrder[2]) + "," + + std::to_string(pt->BBlockTransferThreadClusterArrangeOrder[3]); + + out += " -DCK_PARAM_BBlockTransferSrcAccessOrder=" + + std::to_string(pt->BBlockTransferSrcAccessOrder[0]) + "," + + std::to_string(pt->BBlockTransferSrcAccessOrder[1]) + "," + + std::to_string(pt->BBlockTransferSrcAccessOrder[2]) + "," + + std::to_string(pt->BBlockTransferSrcAccessOrder[3]); + + out += + " -DCK_PARAM_BBlockTransferSrcVectorDim=" + std::to_string(pt->BBlockTransferSrcVectorDim); + out 
+= " -DCK_PARAM_BBlockTransferSrcScalarPerVector=" + + std::to_string(pt->BBlockTransferSrcScalarPerVector); + out += " -DCK_PARAM_BBlockTransferDstScalarPerVector_GN11=" + + std::to_string(pt->BBlockTransferDstScalarPerVector_GN11); + out += " -DCK_PARAM_BThreadTransferSrcResetCoordinateAfterRun=" + + std::to_string(pt->BThreadTransferSrcResetCoordinateAfterRun); + + out += " -DCK_PARAM_CThreadTransferSrcDstAccessOrder=" + + std::to_string(pt->CThreadTransferSrcDstAccessOrder[0]) + "," + + std::to_string(pt->CThreadTransferSrcDstAccessOrder[1]) + "," + + std::to_string(pt->CThreadTransferSrcDstAccessOrder[2]) + "," + + std::to_string(pt->CThreadTransferSrcDstAccessOrder[3]) + "," + + std::to_string(pt->CThreadTransferSrcDstAccessOrder[4]) + "," + + std::to_string(pt->CThreadTransferSrcDstAccessOrder[5]); + + out += " -DCK_PARAM_CThreadTransferSrcDstVectorDim=" + + std::to_string(pt->CThreadTransferSrcDstVectorDim); + out += " -DCK_PARAM_CThreadTransferDstScalarPerVector=" + + std::to_string(pt->CThreadTransferDstScalarPerVector); + + return (out); +}; + +} // namespace detail_dyn_conv_fwd_v4r5_nchw_kcyx_nkhw + +template +void device_dynamic_convolution_forward_implicit_gemm_v4r5_nchw_kcyx_nkhw_olc( + olCompile::Handle* handle, + const InLengths& in_n_c_hi_wi_lengths, + const WeiLengths& wei_k_c_y_x_lengths, + const OutLengths& out_n_k_ho_wo_lengths, + const ConvStrides& conv_strides, + const ConvDilations& conv_dilations, + const InLeftPads& in_left_pads, + const InRightPads& in_right_pads, + const Tensor& in_n_c_hi_wi, + const Tensor& wei_k_c_y_x, + Tensor& out_n_k_ho_wo, + const tunable_dyn_conv_fwd_v4r5_nchw_kcyx_nkhw* tunable, + ck::index_t nrepeat) +{ + using namespace ck; + using namespace detail_dyn_conv_fwd_v4r5_nchw_kcyx_nkhw; + using size_t = std::size_t; + + constexpr index_t N0 = 4; // this could not be a tunable so far + + //////////////////////////////////////////////////////////////////////////////////////////////////////////// + // The follow codes 
are only used for computing the grid_size, hasMainKBlockLoop, + // hasDoubleTailKBlockLoop + + constexpr auto I0 = Number<0>{}; + constexpr auto I1 = Number<1>{}; + constexpr auto I2 = Number<2>{}; + constexpr auto I3 = Number<3>{}; + + const auto in_n_c_hi_wi_desc = + make_dynamic_naive_tensor_descriptor_packed_v2(in_n_c_hi_wi_lengths); + const auto wei_k_c_y_x_desc = + make_dynamic_naive_tensor_descriptor_packed_v2(wei_k_c_y_x_lengths); + const auto out_n_k_ho_wo_desc = + make_dynamic_naive_tensor_descriptor_packed_v2(out_n_k_ho_wo_lengths); + + const auto descs = transform_forward_convolution_into_contraction_v4r5_nchw_kcyx_nkhw_pad( + wei_k_c_y_x_desc, + in_n_c_hi_wi_desc, + out_n_k_ho_wo_desc, + conv_strides, + conv_dilations, + in_left_pads, + in_right_pads); + + const auto a_gk_gm0_gm1_grid_desc = descs[I0]; + const auto c_gm0_gm1_gn0_gn1_grid_desc = descs[I2]; + + const auto GM1 = c_gm0_gm1_gn0_gn1_grid_desc.GetLength(I1); + const auto GN1 = c_gm0_gm1_gn0_gn1_grid_desc.GetLength(I3); + const auto GK = a_gk_gm0_gm1_grid_desc.GetLength(I0); + + const index_t grid_size = (GM1 / tunable->GM1PerBlockGM11) * (GN1 / tunable->GN1PerBlockGN11); + const bool hasMainKBlockLoop = ((GK + tunable->KPerBlock) / (2 * tunable->KPerBlock) > 1); + const bool hasDoubleTailKBlockLoop = ((GK / tunable->KPerBlock) % 2 == 0); + + /////////////////////////////////////////////////////////////////////////////////////////////////////////// + + // these buffers are usually provided by the user application + DeviceMem in_n_c_hi_wi_dev_buf(sizeof(TInWei) * in_n_c_hi_wi.mDesc.GetElementSpace()); + DeviceMem wei_k_c_y_x_dev_buf(sizeof(TInWei) * wei_k_c_y_x.mDesc.GetElementSpace()); + DeviceMem out_n_k_ho_wo_dev_buf(sizeof(TOut) * out_n_k_ho_wo.mDesc.GetElementSpace()); + + in_n_c_hi_wi_dev_buf.ToDevice(in_n_c_hi_wi.mData.data()); + wei_k_c_y_x_dev_buf.ToDevice(wei_k_c_y_x.mData.data()); + out_n_k_ho_wo_dev_buf.ToDevice(out_n_k_ho_wo.mData.data()); + + // these are workspace buffers that 
should be expressed to the user by the corresponding + // workspace API + DeviceMem workspace_buf(4096); + + void* a_gk_gm0_gm10_gm11_grid_desc_dev_buf = workspace_buf.GetDeviceBuffer(); + void* b_gk_gn0_gn10_gn11_grid_desc_dev_buf = + static_cast(static_cast(workspace_buf.GetDeviceBuffer()) + 1024); + void* c_gm10_bm0_bm1_gn10_bn0_bn1_grid_desc_dev_buf = + static_cast(static_cast(workspace_buf.GetDeviceBuffer()) + 2048); + void* c_blockid_to_gm10_gn10_block_cluster_adaptor_dev_buf = + static_cast(static_cast(workspace_buf.GetDeviceBuffer()) + 3072); + + const std::vector vld = {static_cast(tunable->BlockSize), 1, 1}; + const std::vector vgd1 = {static_cast(tunable->BlockSize), 1, 1}; + const std::vector vgd2 = {static_cast(grid_size * tunable->BlockSize), 1, 1}; + + std::string program_name = "dynamic_convolution_forward_implicit_gemm_v4r5_nchw_kcyx_nkhw.cpp"; + // NOTE(review): was "implicit_gemm_conv_fwd_v4r4_nchw" — copy-paste from the v4r4 driver; + // this string is part of the kernel-cache key, so the v4r5 driver must use its own name + std::string algo_name = "implicit_gemm_conv_fwd_v4r5_nchw"; + + std::string param = " -std=c++17 "; + std::string network_config; + + param += get_definition_string_from_types() + + " -DCK_PARAM_HAS_MAIN_KBLOCK_LOOP=" + std::to_string(hasMainKBlockLoop) + + " -DCK_PARAM_HAS_DOUBLE_TAIL_KBLOCK_LOOP=" + std::to_string(hasDoubleTailKBlockLoop) + + " -DCK_PARAM_N0=" + std::to_string(N0) + " " + + get_definition_string_from_tunable(tunable); + // network_config must encode every compile-time option baked into param, + // including hasMainKBlockLoop (previously omitted, so two builds differing + // only in that flag would collide on the same cached kernel) + network_config = get_network_config_string_from_types() + "_V" + + std::to_string(hasMainKBlockLoop) + "_" + + std::to_string(hasDoubleTailKBlockLoop) + "_" + std::to_string(N0) + "_" + + get_network_config_string_from_tunable(tunable); + + std::vector kernel1_times; + std::vector kernel2_times; + + for(index_t i = 0; i < nrepeat; ++i) + { + KernelTimer timer1, timer2; + std::string kernel_name; + + kernel_name = "dynamic_convolution_forward_implicit_gemm_v4r5_nchw_kcyx_nkhw_prepare"; + auto network_config_1 = network_config + "_1"; + + timer1.Start(); + handle->AddKernel(algo_name, network_config_1, program_name, kernel_name, vld, vgd1, param)( + static_cast(in_n_c_hi_wi_lengths[I0]), + 
static_cast(in_n_c_hi_wi_lengths[I1]), + static_cast(in_n_c_hi_wi_lengths[I2]), + static_cast(in_n_c_hi_wi_lengths[I3]), + static_cast(wei_k_c_y_x_lengths[I0]), + static_cast(wei_k_c_y_x_lengths[I2]), + static_cast(wei_k_c_y_x_lengths[I3]), + conv_strides[I0], + conv_strides[I1], + conv_dilations[I0], + conv_dilations[I1], + in_left_pads[I0], + in_left_pads[I1], + in_right_pads[I0], + in_right_pads[I1], + a_gk_gm0_gm10_gm11_grid_desc_dev_buf, + b_gk_gn0_gn10_gn11_grid_desc_dev_buf, + c_gm10_bm0_bm1_gn10_bn0_bn1_grid_desc_dev_buf, + c_blockid_to_gm10_gn10_block_cluster_adaptor_dev_buf); + timer1.End(); // was timer2.End() — copy-paste bug: timer1 was never stopped and + // timer2 was Ended before Start, so both reported times were garbage + + kernel_name = "dynamic_convolution_forward_implicit_gemm_v4r5_nchw_kcyx_nkhw"; + auto network_config_2 = network_config + "_2"; + + timer2.Start(); + handle->AddKernel(algo_name, network_config_2, program_name, kernel_name, vld, vgd2, param)( + reinterpret_cast(wei_k_c_y_x_dev_buf.GetDeviceBuffer()), + reinterpret_cast(in_n_c_hi_wi_dev_buf.GetDeviceBuffer()), + reinterpret_cast(out_n_k_ho_wo_dev_buf.GetDeviceBuffer()), + (const void*)(a_gk_gm0_gm10_gm11_grid_desc_dev_buf), + (const void*)(b_gk_gn0_gn10_gn11_grid_desc_dev_buf), + (const void*)(c_gm10_bm0_bm1_gn10_bn0_bn1_grid_desc_dev_buf), + (const void*)(c_blockid_to_gm10_gn10_block_cluster_adaptor_dev_buf)); + timer2.End(); + + kernel1_times.push_back(timer1.GetElapsedTime()); + kernel2_times.push_back(timer2.GetElapsedTime()); + } + + { + auto ave_time1 = Driver::get_effective_average(kernel1_times); + auto ave_time2 = Driver::get_effective_average(kernel2_times); + + const auto N = in_n_c_hi_wi_lengths[I0]; + const auto C = in_n_c_hi_wi_lengths[I1]; + + const auto K = out_n_k_ho_wo_lengths[I1]; + const auto Ho = out_n_k_ho_wo_lengths[I2]; + const auto Wo = out_n_k_ho_wo_lengths[I3]; + + const auto Y = wei_k_c_y_x_lengths[I2]; + const auto X = wei_k_c_y_x_lengths[I3]; + + float perf = (float)(std::size_t(2) * N * K * Ho * Wo * C * Y * X) / + (std::size_t(1000) * 1000 * 1000) / (ave_time1 + ave_time2); + + 
std::cout << "Average time : " << ave_time1 + ave_time2 << " ms(" << ave_time1 << ", " + << ave_time2 << "), " << perf << " TFlop/s" << std::endl; + }; + + // copy result back to host + out_n_k_ho_wo_dev_buf.FromDevice(out_n_k_ho_wo.mData.data()); +} diff --git a/driver/include/olc_driver_common.hpp b/driver/include/olc_driver_common.hpp new file mode 100644 index 0000000000..383bf4c6a4 --- /dev/null +++ b/driver/include/olc_driver_common.hpp @@ -0,0 +1,114 @@ +#ifndef OLC_DRIVER_COMMON_HPP +#define OLC_DRIVER_COMMON_HPP + +#include +#include +#include + +// this enumerate should be synchronized with include/miopen.h +typedef enum { + appHalf = 0, + appFloat = 1, + appInt32 = 2, + appInt8 = 3, + appInt8x4 = 4, + appBFloat16 = 5, + appDouble = 6, +} appDataType_t; + +namespace Driver { + +template +struct get_type_from_type_enum +{ + using type = float; +}; + +template <> +struct get_type_from_type_enum +{ + using type = half_float::half; +}; + +template <> +struct get_type_from_type_enum +{ + using type = float; +}; + +template <> +struct get_type_from_type_enum +{ + using type = double; +}; + +template <> +struct get_type_from_type_enum +{ + using type = int; +}; + +static inline int get_typeid_from_type_enum(appDataType_t t) +{ + switch(t) + { + case appHalf: return (static_cast('H')); + case appFloat: return (static_cast('F')); + case appBFloat16: return (static_cast('B')); + case appDouble: return (static_cast('D')); + case appInt8: + case appInt8x4: + case appInt32: return (static_cast('O')); + default: throw std::runtime_error("Only float, half, bfloat16 data type is supported."); break; + }; +}; + +template +static inline int get_typeid_from_type() +{ + throw std::runtime_error("Unsupported typeid conversion for this type!"); +}; + +template <> +inline int get_typeid_from_type() +{ + return (static_cast('F')); +}; + +template <> +inline int get_typeid_from_type() +{ + return (static_cast('H')); +}; + +template <> +inline int get_typeid_from_type() +{ + 
return (static_cast('D')); +}; + +static inline float get_effective_average(std::vector& values) +{ + assert(!values.empty()); + + if(values.size() == 1) + return (values[0]); + else + { + float sum = 0.0f; + float maxVal = 0.0f; + + for(const auto val : values) + { + if(maxVal < val) + maxVal = val; + sum += val; + }; + + return ((sum - maxVal) / (values.size() - 1)); + }; +}; + +} // namespace Driver + +#endif diff --git a/driver/olCompiling/addkernels/CMakeLists.txt b/driver/olCompiling/addkernels/CMakeLists.txt new file mode 100644 index 0000000000..874cba6a5e --- /dev/null +++ b/driver/olCompiling/addkernels/CMakeLists.txt @@ -0,0 +1,30 @@ +################################################################################ +# +# MIT License +# +# Copyright (c) 2017 Advanced Micro Devices, Inc. +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. 
+# +################################################################################ + +set(ADD_KERNELS_SOURCE include_inliner.cpp addkernels.cpp) + +add_executable(addkernels EXCLUDE_FROM_ALL ${ADD_KERNELS_SOURCE}) + diff --git a/driver/olCompiling/addkernels/addkernels.cpp b/driver/olCompiling/addkernels/addkernels.cpp new file mode 100644 index 0000000000..5be523d97b --- /dev/null +++ b/driver/olCompiling/addkernels/addkernels.cpp @@ -0,0 +1,264 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2021 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + *******************************************************************************/ +#include "include_inliner.hpp" +#include +#include +#include +#include +#include +#include +#include + +void Bin2Hex(std::istream& source, + std::ostream& target, + const std::string& variable, + bool nullTerminate, + size_t bufferSize, + size_t lineSize) +{ + source.seekg(0, std::ios::end); + std::unique_ptr buffer(new unsigned char[bufferSize]); + std::streamoff sourceSize = source.tellg(); + std::streamoff blockStart = 0; + + if(variable.length() != 0) + { + target << "extern const size_t " << variable << "_SIZE;" << std::endl; + target << "extern const unsigned char " << variable << "[];" << std::endl; + target << "const size_t " << variable << "_SIZE = " << std::setbase(10) << sourceSize << ";" + << std::endl; + target << "const unsigned char " << variable << "[] = {" << std::endl; + } + + target << std::setbase(16) << std::setfill('0'); + source.seekg(0, std::ios::beg); + + while(blockStart < sourceSize) + { + source.read(reinterpret_cast(buffer.get()), bufferSize); + + std::streamoff pos = source.tellg(); + std::streamoff blockSize = (pos < 0 ? sourceSize : pos) - blockStart; + std::streamoff i = 0; + + while(i < blockSize) + { + size_t j = i; + size_t end = std::min(i + lineSize, blockSize); + + for(; j < end; j++) + target << "0x" << std::setw(2) << static_cast(buffer[j]) << ","; + + target << std::endl; + i = end; + } + + blockStart += blockSize; + } + + if(nullTerminate) + target << "0x00," << std::endl; + + if(variable.length() != 0) + { + target << "};" << std::endl; + } +} + +void PrintHelp() +{ + std::cout << "Usage: bin2hex {