From ae98b52ad8be610bd6f8fdd1ffacc6ac70081379 Mon Sep 17 00:00:00 2001 From: Chao Liu Date: Sat, 7 Aug 2021 00:51:05 +0000 Subject: [PATCH] remove online compilation from CK --- CMakeLists.txt | 5 +- cmake/AddKernels.cmake | 40 -- cmake/TargetFlags.cmake | 50 -- host/CMakeLists.txt | 2 - host/driver_online/CMakeLists.txt | 22 - host/driver_online/conv_fwd_driver_online.cpp | 453 ------------------ ...mplicit_gemm_v4r4_dlops_nchw_kcyx_nkhw.hpp | 395 --------------- ...plicit_gemm_v4r4_xdlops_nchw_kcyx_nkhw.hpp | 386 --------------- ...plicit_gemm_v4r4_xdlops_nhwc_kyxc_nhwk.hpp | 389 --------------- ...mplicit_gemm_v6r1_dlops_nchw_kcyx_nkhw.hpp | 183 ------- host/online_compile/CMakeLists.txt | 168 ------- host/online_compile/addkernels/CMakeLists.txt | 30 -- host/online_compile/addkernels/addkernels.cpp | 264 ---------- .../addkernels/include_inliner.cpp | 213 -------- .../addkernels/include_inliner.hpp | 142 ------ .../addkernels/source_file_desc.hpp | 45 -- .../hip_utility/binary_cache.cpp | 112 ----- .../online_compile/hip_utility/exec_utils.cpp | 93 ---- host/online_compile/hip_utility/handlehip.cpp | 285 ----------- .../hip_utility/hip_build_utils.cpp | 346 ------------- .../hip_utility/hipoc_kernel.cpp | 84 ---- .../hip_utility/hipoc_program.cpp | 139 ------ .../hip_utility/kernel_build_params.cpp | 66 --- .../hip_utility/kernel_cache.cpp | 154 ------ host/online_compile/hip_utility/logger.cpp | 43 -- host/online_compile/hip_utility/md5.cpp | 319 ------------ .../hip_utility/target_properties.cpp | 119 ----- host/online_compile/hip_utility/tmp_dir.cpp | 66 --- host/online_compile/include/binary_cache.hpp | 52 -- host/online_compile/include/config.h.in | 47 -- host/online_compile/include/env.hpp | 123 ----- host/online_compile/include/exec_utils.hpp | 42 -- host/online_compile/include/handle.hpp | 145 ------ host/online_compile/include/hipCheck.hpp | 22 - .../include/hip_build_utils.hpp | 97 ---- host/online_compile/include/hipoc_kernel.hpp | 174 ------- host/online_compile/include/hipoc_program.hpp | 64 --- .../include/hipoc_program_impl.hpp | 61 --- host/online_compile/include/kernel.hpp | 45 -- .../include/kernel_build_params.hpp | 137 ------ host/online_compile/include/kernel_cache.hpp | 97 ---- host/online_compile/include/logger.hpp | 23 - host/online_compile/include/manage_ptr.hpp | 76 --- host/online_compile/include/md5.hpp | 12 - .../online_compile/include/op_kernel_args.hpp | 40 -- host/online_compile/include/simple_hash.hpp | 44 -- host/online_compile/include/stringutils.hpp | 133 ----- .../include/target_properties.hpp | 56 --- host/online_compile/include/tmp_dir.hpp | 26 - host/online_compile/include/write_file.hpp | 30 -- host/online_compile/kernel.cpp.in | 70 --- host/online_compile/kernel_includes.cpp.in | 80 ---- host/online_compile/kernels_batch.cpp.in | 1 - .../include/solver_common.hpp} | 11 +- 54 files changed, 3 insertions(+), 6318 deletions(-) delete mode 100644 cmake/AddKernels.cmake delete mode 100644 cmake/TargetFlags.cmake delete mode 100644 host/driver_online/CMakeLists.txt delete mode 100644 host/driver_online/conv_fwd_driver_online.cpp delete mode 100644 host/driver_online/include/online_device_dynamic_convolution_forward_implicit_gemm_v4r4_dlops_nchw_kcyx_nkhw.hpp delete mode 100644 host/driver_online/include/online_device_dynamic_convolution_forward_implicit_gemm_v4r4_xdlops_nchw_kcyx_nkhw.hpp delete mode 100644 host/driver_online/include/online_device_dynamic_convolution_forward_implicit_gemm_v4r4_xdlops_nhwc_kyxc_nhwk.hpp delete mode 100644 host/driver_online/include/online_device_dynamic_convolution_forward_implicit_gemm_v6r1_dlops_nchw_kcyx_nkhw.hpp delete mode 100644 host/online_compile/CMakeLists.txt delete mode 100644 host/online_compile/addkernels/CMakeLists.txt delete mode 100644 host/online_compile/addkernels/addkernels.cpp delete mode 100644 host/online_compile/addkernels/include_inliner.cpp delete mode 100644 host/online_compile/addkernels/include_inliner.hpp delete mode 100644 host/online_compile/addkernels/source_file_desc.hpp delete mode 100644 host/online_compile/hip_utility/binary_cache.cpp delete mode 100644 host/online_compile/hip_utility/exec_utils.cpp delete mode 100644 host/online_compile/hip_utility/handlehip.cpp delete mode 100644 host/online_compile/hip_utility/hip_build_utils.cpp delete mode 100644 host/online_compile/hip_utility/hipoc_kernel.cpp delete mode 100644 host/online_compile/hip_utility/hipoc_program.cpp delete mode 100644 host/online_compile/hip_utility/kernel_build_params.cpp delete mode 100644 host/online_compile/hip_utility/kernel_cache.cpp delete mode 100644 host/online_compile/hip_utility/logger.cpp delete mode 100644 host/online_compile/hip_utility/md5.cpp delete mode 100644 host/online_compile/hip_utility/target_properties.cpp delete mode 100644 host/online_compile/hip_utility/tmp_dir.cpp delete mode 100644 host/online_compile/include/binary_cache.hpp delete mode 100644 host/online_compile/include/config.h.in delete mode 100644 host/online_compile/include/env.hpp delete mode 100644 host/online_compile/include/exec_utils.hpp delete mode 100644 host/online_compile/include/handle.hpp delete mode 100644 host/online_compile/include/hipCheck.hpp delete mode 100644 host/online_compile/include/hip_build_utils.hpp delete mode 100644 host/online_compile/include/hipoc_kernel.hpp delete mode 100644 host/online_compile/include/hipoc_program.hpp delete mode 100644 host/online_compile/include/hipoc_program_impl.hpp delete mode 100644 host/online_compile/include/kernel.hpp delete mode 100644 host/online_compile/include/kernel_build_params.hpp delete mode 100644 host/online_compile/include/kernel_cache.hpp delete mode 100644 host/online_compile/include/logger.hpp delete mode 100644 host/online_compile/include/manage_ptr.hpp delete mode 100644 host/online_compile/include/md5.hpp delete mode 100644 host/online_compile/include/op_kernel_args.hpp delete mode 100644 host/online_compile/include/simple_hash.hpp delete mode 100644 host/online_compile/include/stringutils.hpp delete mode 100644 host/online_compile/include/target_properties.hpp delete mode 100644 host/online_compile/include/tmp_dir.hpp delete mode 100644 host/online_compile/include/write_file.hpp delete mode 100644 host/online_compile/kernel.cpp.in delete mode 100644 host/online_compile/kernel_includes.cpp.in delete mode 100644 host/online_compile/kernels_batch.cpp.in rename host/{driver_online/include/online_driver_common.hpp => solver/include/solver_common.hpp} (79%) diff --git a/CMakeLists.txt b/CMakeLists.txt index 0cf342bb45..fa5dcfe3ea 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1,11 +1,8 @@ cmake_minimum_required(VERSION 2.8.3) -project(modular_convolution) +project(composable_kernel) list(APPEND CMAKE_MODULE_PATH "${PROJECT_SOURCE_DIR}/cmake") -include(TargetFlags) -include(AddKernels) - ## C++ enable_language(CXX) set(CMAKE_CXX_STANDARD 17) diff --git a/cmake/AddKernels.cmake b/cmake/AddKernels.cmake deleted file mode 100644 index 429ecc47a9..0000000000 --- a/cmake/AddKernels.cmake +++ /dev/null @@ -1,40 +0,0 @@ - -function(add_kernels SRC_DIR KERNEL_FILES) - set(INIT_KERNELS_LIST) - set(KERNELS_DECLS) - foreach(KERNEL_FILE ${KERNEL_FILES}) - if("${CMAKE_VERSION}" VERSION_LESS 3.0) - configure_file(${KERNEL_FILE} ${KERNEL_FILE}.delete) - else() - set_property(DIRECTORY APPEND PROPERTY CMAKE_CONFIGURE_DEPENDS ${KERNEL_FILE}) - endif() - get_filename_component(BASE_NAME ${KERNEL_FILE} NAME_WE) - string(TOUPPER "${BASE_NAME}" KEY_NAME) - string(MAKE_C_IDENTIFIER "${KEY_NAME}" VAR_NAME) - string(APPEND KERNELS_DECLS "extern const size_t APP_KERNEL_${VAR_NAME}_SIZE;\n") - string(APPEND KERNELS_DECLS "extern const unsigned char APP_KERNEL_${VAR_NAME}[];\n") - list(APPEND INIT_KERNELS_LIST " { \"${KEY_NAME}\", std::string(reinterpret_cast(APP_KERNEL_${VAR_NAME}), APP_KERNEL_${VAR_NAME}_SIZE) }") - endforeach() - string(REPLACE ";" ",\n" INIT_KERNELS "${INIT_KERNELS_LIST}") - configure_file(${SRC_DIR}/kernel.cpp.in ${PROJECT_BINARY_DIR}/kernel.cpp) -endfunction() - -function(add_kernel_includes SRC_DIR KERNEL_FILES) - set(INIT_KERNELS_LIST) - foreach(KERNEL_FILE ${KERNEL_FILES}) - if("${CMAKE_VERSION}" VERSION_LESS 3.0) - configure_file(${KERNEL_FILE} ${KERNEL_FILE}.delete) - else() - set_property(DIRECTORY APPEND PROPERTY CMAKE_CONFIGURE_DEPENDS ${KERNEL_FILE}) - endif() - get_filename_component(BASE_NAME ${KERNEL_FILE} NAME_WE) - get_filename_component(FILE_NAME ${KERNEL_FILE} NAME) - string(TOUPPER "${BASE_NAME}" KEY_NAME) - string(MAKE_C_IDENTIFIER "${KEY_NAME}" VAR_NAME) - list(APPEND INIT_KERNELS_LIST " { \"${FILE_NAME}\", std::string(reinterpret_cast(${VAR_NAME}), ${VAR_NAME}_SIZE) }") - endforeach() - string(REPLACE ";" ",\n" INIT_KERNELS "${INIT_KERNELS_LIST}") - configure_file(${SRC_DIR}/kernel_includes.cpp.in ${PROJECT_BINARY_DIR}/kernel_includes.cpp) -endfunction() - - diff --git a/cmake/TargetFlags.cmake b/cmake/TargetFlags.cmake deleted file mode 100644 index 4f83fb5d39..0000000000 --- a/cmake/TargetFlags.cmake +++ /dev/null @@ -1,50 +0,0 @@ - -function(get_target_property2 VAR TARGET PROPERTY) - get_target_property(_pflags ${TARGET} ${PROPERTY}) - if(_pflags) - set(${VAR} ${_pflags} PARENT_SCOPE) - else() - set(${VAR} "" PARENT_SCOPE) - endif() -endfunction() - - -macro(append_flags FLAGS TARGET PROPERTY PREFIX) - get_target_property2(_pflags ${TARGET} ${PROPERTY}) - foreach(FLAG ${_pflags}) - if(TARGET ${FLAG}) - target_flags(_pflags2 ${FLAG}) - string(APPEND ${FLAGS} " ${_pflags2}") - else() - string(APPEND ${FLAGS} " ${PREFIX}${FLAG}") - endif() - endforeach() -endmacro() - -macro(append_link_flags FLAGS TARGET PROPERTY) - get_target_property2(_pflags ${TARGET} ${PROPERTY}) - foreach(FLAG ${_pflags}) - if(TARGET ${FLAG}) - target_flags(_pflags2 ${FLAG}) - string(APPEND ${FLAGS} " ${_pflags2}") - elseif(FLAG MATCHES "^-.*") - string(APPEND ${FLAGS} " ${FLAG}") - elseif(EXISTS ${FLAG}) - string(APPEND ${FLAGS} " ${FLAG}") - else() - string(APPEND ${FLAGS} " -l${FLAG}") - endif() - endforeach() -endmacro() - -function(target_flags FLAGS TARGET) - set(_flags) - append_flags(_flags ${TARGET} "INTERFACE_COMPILE_OPTIONS" "") - append_flags(_flags ${TARGET} "INTERFACE_COMPILE_DEFINITIONS" "-D") - append_flags(_flags ${TARGET} "INTERFACE_INCLUDE_DIRECTORIES" "-isystem ") - append_flags(_flags ${TARGET} "INTERFACE_LINK_DIRECTORIES" "-L ") - append_flags(_flags ${TARGET} "INTERFACE_LINK_OPTIONS" "") - append_link_flags(_flags ${TARGET} "INTERFACE_LINK_LIBRARIES" "") - # message("_flags: ${_flags}") - set(${FLAGS} ${_flags} PARENT_SCOPE) -endfunction() diff --git a/host/CMakeLists.txt b/host/CMakeLists.txt index 26739efe34..30cc14d8ca 100644 --- a/host/CMakeLists.txt +++ b/host/CMakeLists.txt @@ -1,4 +1,2 @@ add_subdirectory(host_tensor) -add_subdirectory(online_compile) add_subdirectory(driver_offline) -add_subdirectory(driver_online) diff --git a/host/driver_online/CMakeLists.txt b/host/driver_online/CMakeLists.txt deleted file mode 100644 index 077e3218a0..0000000000 --- a/host/driver_online/CMakeLists.txt +++ /dev/null @@ -1,22 +0,0 @@ -include_directories(BEFORE - include - ${PROJECT_BINARY_DIR}/host/online_compile/include - ${PROJECT_SOURCE_DIR}/host/online_compile/include - ${PROJECT_SOURCE_DIR}/host/host_tensor/include - ${PROJECT_SOURCE_DIR}/host/solver/include - ${PROJECT_SOURCE_DIR}/composable_kernel/include - ${PROJECT_SOURCE_DIR}/composable_kernel/include/utility - ${PROJECT_SOURCE_DIR}/composable_kernel/include/tensor_description - ${PROJECT_SOURCE_DIR}/composable_kernel/include/tensor_operation - ${PROJECT_SOURCE_DIR}/composable_kernel/include/problem_transform - ${PROJECT_SOURCE_DIR}/composable_kernel/include/driver - ${PROJECT_SOURCE_DIR}/external/rocm/include - ${PROJECT_SOURCE_DIR}/external/half/include -) - -set(CONV_FWD_DRIVER_ONLINE_SOURCE conv_fwd_driver_online.cpp) - -add_executable(conv_fwd_driver_online ${CONV_FWD_DRIVER_ONLINE_SOURCE}) - -target_link_libraries(conv_fwd_driver_online PRIVATE host_tensor) -target_link_libraries(conv_fwd_driver_online PRIVATE online_compile) diff --git a/host/driver_online/conv_fwd_driver_online.cpp b/host/driver_online/conv_fwd_driver_online.cpp deleted file mode 100644 index 53e6179aa6..0000000000 --- a/host/driver_online/conv_fwd_driver_online.cpp +++ /dev/null @@ -1,453 +0,0 @@ -#include -#include -#include -#include -#include -#include -#include "config.hpp" -#include "print.hpp" -#include "device.hpp" -#include "host_tensor.hpp" -#include "host_tensor_generator.hpp" -#include "conv_common.hpp" -#include "host_conv.hpp" -#include "device_tensor.hpp" -#include "handle.hpp" -#include "hipCheck.hpp" -#include "online_device_dynamic_convolution_forward_implicit_gemm_v4r4_dlops_nchw_kcyx_nkhw.hpp" -#include "online_device_dynamic_convolution_forward_implicit_gemm_v6r1_dlops_nchw_kcyx_nkhw.hpp" -#include "online_device_dynamic_convolution_forward_implicit_gemm_v4r4_xdlops_nchw_kcyx_nkhw.hpp" -#include "online_device_dynamic_convolution_forward_implicit_gemm_v4r4_xdlops_nhwc_kyxc_nhwk.hpp" - -#define USE_CONV_FWD_V4R4_NCHW 1 -#define USE_CONV_FWD_V6R1_NCHW 1 -#define USE_CONV_FWD_V4R4_XDLOPS_NCHW 1 -#define USE_CONV_FWD_V4R4_XDLOPS_NHWC 1 - -enum ConvForwardAlgo -{ - V4R4NCHW, // 0 - V6R1NCHW, // 1 - V4R4XDLNCHW, // 2 - V4R4XDLNHWC // 3 -}; - -int main(int argc, char* argv[]) -{ - using namespace ck; - using namespace ck::driver; - using size_t = std::size_t; - - hipStream_t stream; - online_compile::Handle* handle; - - MY_HIP_CHECK(hipStreamCreate(&stream)); - - handle = new online_compile::Handle(stream); - - constexpr auto I0 = Number<0>{}; - constexpr auto I1 = Number<1>{}; - constexpr auto I2 = Number<2>{}; - constexpr auto I3 = Number<3>{}; - constexpr auto I4 = Number<4>{}; - constexpr auto I5 = Number<5>{}; - constexpr auto I6 = Number<6>{}; - - if(argc != 22) - { - printf("arg1 to 5: layout, algo, do_verification, init_method, do_log, nrepeat\n"); - printf("rest: N, K, C, Y, X, Hi, Wi, Sy, Sx, Dy, Dx, LeftPy, LeftPx, RightPy, RightPx\n"); - exit(1); - } - - const ConvTensorLayout layout = static_cast(atoi(argv[1])); - const ConvForwardAlgo algo = static_cast(atoi(argv[2])); - const bool do_verification = atoi(argv[3]); - const int init_method = atoi(argv[4]); - const bool do_log = atoi(argv[5]); - const int nrepeat = atoi(argv[6]); - - const index_t N = atoi(argv[7]); - const index_t K = atoi(argv[8]); - const index_t C = atoi(argv[9]); - const index_t Y = atoi(argv[10]); - const index_t X = atoi(argv[11]); - const index_t Hi = atoi(argv[12]); - const index_t Wi = atoi(argv[13]); - - const index_t conv_stride_h = atoi(argv[14]); - const index_t conv_stride_w = atoi(argv[15]); - const index_t conv_dilation_h = atoi(argv[16]); - const index_t conv_dilation_w = atoi(argv[17]); - const index_t in_left_pad_h = atoi(argv[18]); - const index_t in_left_pad_w = atoi(argv[19]); - const index_t in_right_pad_h = atoi(argv[20]); - const index_t in_right_pad_w = atoi(argv[21]); - - const index_t YEff = (Y - 1) * conv_dilation_h + 1; - const index_t XEff = (X - 1) * conv_dilation_w + 1; - - const index_t Ho = (Hi + in_left_pad_h + in_right_pad_h - YEff) / conv_stride_h + 1; - const index_t Wo = (Wi + in_left_pad_w + in_right_pad_w - XEff) / conv_stride_w + 1; - -#if 1 - using in_data_t = float; - using acc_data_t = float; - using out_data_t = float; -#elif 0 - using in_data_t = half_t; - using acc_data_t = float; - using out_data_t = half_t; -#elif 1 - using in_data_t = int8_t; - using acc_data_t = int32_t; - using out_data_t = int8_t; -#endif - - std::vector in_lengths_host(4), wei_lengths_host(4), out_lengths_host(4); - - switch(layout) - { - case ConvTensorLayout::NCHW: - // NCHW - in_lengths_host[0] = static_cast(N); - in_lengths_host[1] = static_cast(C); - in_lengths_host[2] = static_cast(Hi); - in_lengths_host[3] = static_cast(Wi); - - wei_lengths_host[0] = static_cast(K); - wei_lengths_host[1] = static_cast(C); - wei_lengths_host[2] = static_cast(Y); - wei_lengths_host[3] = static_cast(X); - - out_lengths_host[0] = static_cast(N); - out_lengths_host[1] = static_cast(K); - out_lengths_host[2] = static_cast(Ho); - out_lengths_host[3] = static_cast(Wo); - break; - case ConvTensorLayout::NHWC: - // NHWC - in_lengths_host[0] = static_cast(N); - in_lengths_host[1] = static_cast(Hi); - in_lengths_host[2] = static_cast(Wi); - in_lengths_host[3] = static_cast(C); - - wei_lengths_host[0] = static_cast(K); - wei_lengths_host[1] = static_cast(Y); - wei_lengths_host[2] = static_cast(X); - wei_lengths_host[3] = static_cast(C); - - out_lengths_host[0] = static_cast(N); - out_lengths_host[1] = static_cast(Ho); - out_lengths_host[2] = static_cast(Wo); - out_lengths_host[3] = static_cast(K); - break; - default: throw std::runtime_error("wrong! not implemented"); - } - - Tensor in(in_lengths_host); - Tensor wei(wei_lengths_host); - Tensor out_host(out_lengths_host); - Tensor out_device(out_lengths_host); - - std::cout << "layout: " << layout << std::endl; - ostream_HostTensorDescriptor(in.mDesc, std::cout << "in: "); - ostream_HostTensorDescriptor(wei.mDesc, std::cout << "wei: "); - ostream_HostTensorDescriptor(out_host.mDesc, std::cout << "out: "); - print_array("InLeftPads", make_tuple(in_left_pad_h, in_left_pad_w)); - print_array("InRightPads", make_tuple(in_right_pad_h, in_right_pad_w)); - print_array("ConvStrides", make_tuple(conv_stride_h, conv_stride_w)); - print_array("ConvDilations", make_tuple(conv_dilation_h, conv_dilation_w)); - - std::size_t num_thread = std::thread::hardware_concurrency(); - - switch(init_method) - { - case 0: - // no initialization - break; - case 1: - in.GenerateTensorValue(GeneratorTensor_1{}, num_thread); - wei.GenerateTensorValue(GeneratorTensor_1{}, num_thread); - break; - case 2: - in.GenerateTensorValue(GeneratorTensor_1{}, num_thread); - wei.GenerateTensorValue(GeneratorTensor_2{-5, 5}, num_thread); - break; - case 3: - in.GenerateTensorValue(GeneratorTensor_2{-5, 5}, num_thread); - wei.GenerateTensorValue(GeneratorTensor_1{}, num_thread); - break; - case 4: - in.GenerateTensorValue(GeneratorTensor_2{-5, 5}, num_thread); - wei.GenerateTensorValue(GeneratorTensor_2{-5, 5}, num_thread); - break; - case 5: - in.GenerateTensorValue(GeneratorTensor_3{0.0, 1.0}, num_thread); - wei.GenerateTensorValue(GeneratorTensor_3{-0.5, 0.5}, num_thread); - break; - default: - in.GenerateTensorValue(GeneratorTensor_2{1, 5}, num_thread); - - auto gen_wei = [](auto... is) { - return GeneratorTensor_2{1, 5}(is...) * GeneratorTensor_Checkboard{}(is...); - }; - wei.GenerateTensorValue(gen_wei, num_thread); - } - - auto f_make_for_device_nchw = [&]() { - const auto in_lengths_dev = make_tuple(N, C, Hi, Wi); - const auto wei_lengths_dev = make_tuple(K, C, Y, X); - const auto out_lengths_dev = make_tuple(N, K, Ho, Wo); - - return make_tuple(in_lengths_dev, wei_lengths_dev, out_lengths_dev); - }; - - auto f_make_for_device_nhwc = [&]() { - const auto in_lengths_dev = make_tuple(N, Hi, Wi, C); - const auto wei_lengths_dev = make_tuple(K, Y, X, C); - const auto out_lengths_dev = make_tuple(N, Ho, Wo, K); - - return make_tuple(in_lengths_dev, wei_lengths_dev, out_lengths_dev); - }; - - const auto conv_strides = make_tuple(conv_stride_h, conv_stride_w); - const auto conv_dilations = make_tuple(conv_dilation_h, conv_dilation_w); - const auto in_left_pads = make_tuple(in_left_pad_h, in_left_pad_w); - const auto in_right_pads = make_tuple(in_right_pad_h, in_right_pad_w); - -#if USE_CONV_FWD_V4R4_NCHW - if(algo == ConvForwardAlgo::V4R4NCHW) - { - if(layout != ConvTensorLayout::NCHW) - { - throw std::runtime_error("wrong! layout"); - } - - const auto tmp = f_make_for_device_nchw(); - - tunable_dyn_conv_fwd_v4r4_dlops_nchw_kcyx_nkhw* tunable = - &default_tunable_dyn_conv_fwd_v4r4_dlops_nchw_kcyx_nkhw; - - online_device_dynamic_convolution_forward_implicit_gemm_v4r4_dlops_nchw_kcyx_nkhw< - in_data_t, - acc_data_t, - out_data_t>(handle, - tmp[I0], - tmp[I1], - tmp[I2], - conv_strides, - conv_dilations, - in_left_pads, - in_right_pads, - in, - wei, - out_device, - tunable, - nrepeat); - } -#endif - -#if USE_CONV_FWD_V6R1_NCHW - if(algo == ConvForwardAlgo::V6R1NCHW) - { - if(layout != ConvTensorLayout::NCHW) - { - throw std::runtime_error("wrong! layout"); - } - - const auto tmp = f_make_for_device_nchw(); - -#if 1 - const CompileParameterConvIgemmFwdV6r1DlopsNchwKcyxNkhw compile_param = { - get_datatype_enum_from_type::value, - get_datatype_enum_from_type::value, - get_datatype_enum_from_type::value, - 256, - 4, - 1, - 128, - 32, - 8, - 4, - 4, - 1, - {8, 2}, - {8, 2}, - {4, 1, 1, 1, 1}, - {2, 1, 1, 128, 1}, - {4, 1, 1, 1, 1}, - {1, 1, 1, 1, 1}, - {1, 4, 1, 1, 1}, - {8, 1, 1, 32, 1}, - {1, 1, 1, 1, 1}, - {1, 1, 1, 1, 1}, - 4, - true, - true}; -#elif 0 - const CompileParameterConvIgemmFwdV6r1DlopsNchwKcyxNkhw compile_param = { - get_datatype_enum_from_type::value, - get_datatype_enum_from_type::value, - get_datatype_enum_from_type::value, - 256, - 4, - 2, - 128, - 32, - 8, - 4, - 4, - 1, - {8, 2}, - {8, 2}, - {4, 1, 1, 1, 2}, - {2, 1, 1, 128, 1}, - {4, 1, 1, 1, 1}, - {1, 1, 1, 1, 1}, - {1, 4, 1, 1, 2}, - {8, 1, 1, 32, 1}, - {1, 1, 1, 1, 1}, - {1, 1, 1, 1, 1}, - 4, - true, - true}; -#elif 1 - const CompileParameterConvIgemmFwdV6r1DlopsNchwKcyxNkhw compile_param = { - get_datatype_enum_from_type::value, - get_datatype_enum_from_type::value, - get_datatype_enum_from_type::value, - 256, - 4, - 4, - 128, - 32, - 8, - 4, - 4, - 1, - {8, 2}, - {8, 2}, - {4, 1, 1, 1, 4}, - {2, 1, 1, 128, 1}, - {4, 1, 1, 1, 1}, - {1, 1, 1, 1, 1}, - {1, 4, 1, 1, 4}, - {8, 1, 1, 32, 1}, - {1, 1, 1, 1, 1}, - {1, 1, 1, 1, 1}, - 4, - true, - true}; -#endif - - online_device_dynamic_convolution_forward_implicit_gemm_v6r1_dlops_nchw_kcyx_nkhw< - in_data_t, - acc_data_t, - out_data_t>(handle, - tmp[I0], - tmp[I1], - tmp[I2], - conv_strides, - conv_dilations, - in_left_pads, - in_right_pads, - in, - wei, - out_device, - compile_param, - nrepeat); - } -#endif - -#if USE_CONV_FWD_V4R4_XDLOPS_NCHW - if(algo == ConvForwardAlgo::V4R4XDLNCHW) - { - if(layout != ConvTensorLayout::NCHW) - { - throw std::runtime_error("wrong! layout"); - } - - const auto tmp = f_make_for_device_nchw(); - - tunable_dyn_conv_fwd_v4r4_xdlops_nchw_kcyx_nkhw* tunable = - &default_tunable_dyn_conv_fwd_v4r4_xdlops_nchw_kcyx_nkhw; - - online_device_dynamic_convolution_forward_implicit_gemm_v4r4_xdlops_nchw_kcyx_nkhw< - in_data_t, - acc_data_t, - out_data_t>(handle, - tmp[I0], - tmp[I1], - tmp[I2], - conv_strides, - conv_dilations, - in_left_pads, - in_right_pads, - in, - wei, - out_device, - tunable, - nrepeat); - } -#endif - -#if USE_CONV_FWD_V4R4_XDLOPS_NHWC - if(algo == ConvForwardAlgo::V4R4XDLNHWC) - { - if(layout != ConvTensorLayout::NHWC) - { - throw std::runtime_error("wrong! layout"); - } - - const auto tmp = f_make_for_device_nhwc(); - - tunable_dyn_conv_fwd_v4r4_xdlops_nhwc_kyxc_nhwk* tunable = - &default_tunable_dyn_conv_fwd_v4r4_xdlops_nhwc_kyxc_nhwk; - - online_device_dynamic_convolution_forward_implicit_gemm_v4r4_xdlops_nhwc_kyxc_nhwk< - in_data_t, - acc_data_t, - out_data_t>(handle, - tmp[I0], - tmp[I1], - tmp[I2], - conv_strides, - conv_dilations, - in_left_pads, - in_right_pads, - in, - wei, - out_device, - tunable, - nrepeat); - } -#endif - - if(do_verification) - { - host_direct_convolution(in, - wei, - out_host, - make_tuple(conv_stride_h, conv_stride_w), - make_tuple(conv_dilation_h, conv_dilation_w), - make_tuple(in_left_pad_h, in_left_pad_w), - make_tuple(in_right_pad_h, in_right_pad_w), - layout); - - check_error(out_host, out_device); - -#if 0 - if(do_log) - { - LogRangeAsType(std::cout << "in : ", in.mData, ",") << std::endl; - LogRangeAsType(std::cout << "wei: ", wei.mData, ",") << std::endl; - LogRangeAsType(std::cout << "out_host : ", out_host.mData, ",") << std::endl; - LogRangeAsType(std::cout << "out_device: ", out_device.mData, ",") << std::endl; - } -#endif - } - - delete handle; - MY_HIP_CHECK(hipStreamDestroy(stream)); -} diff --git a/host/driver_online/include/online_device_dynamic_convolution_forward_implicit_gemm_v4r4_dlops_nchw_kcyx_nkhw.hpp b/host/driver_online/include/online_device_dynamic_convolution_forward_implicit_gemm_v4r4_dlops_nchw_kcyx_nkhw.hpp deleted file mode 100644 index 419b8ca95d..0000000000 --- a/host/driver_online/include/online_device_dynamic_convolution_forward_implicit_gemm_v4r4_dlops_nchw_kcyx_nkhw.hpp +++ /dev/null @@ -1,395 +0,0 @@ -#pragma once -#include "device.hpp" -#include "host_tensor.hpp" -#include "handle.hpp" -#include "online_driver_common.hpp" -#include "dynamic_tensor_descriptor.hpp" -#include "dynamic_tensor_descriptor_helper.hpp" -#include "transform_forward_convolution_into_gemm_v4r4_nchw_kcyx_nkhw.hpp" -#include "conv_tunable_fwd_v4r4_dlops_nchw_kcyx_nkhw.hpp" - -namespace detail_dyn_conv_fwd_v4r4_nchw_kcyx_nkhw { - -template -static std::string get_network_config_string_from_types() -{ - using namespace ck; - - std::string out; - - out += std::to_string(get_datatype_enum_from_type::value) + "_" + - std::to_string(get_datatype_enum_from_type::value) + "_" + - std::to_string(get_datatype_enum_from_type::value); - - return (out); -}; - -static std::string -get_network_config_string_from_tunable(const tunable_dyn_conv_fwd_v4r4_dlops_nchw_kcyx_nkhw* pt) -{ - std::string out("TUN_"); - - out += std::to_string(pt->BlockSize) + "_"; - - out += std::to_string(pt->MPerBlock) + "x" + std::to_string(pt->NPerBlock) + "x" + - std::to_string(pt->KPerBlock) + "_"; - out += std::to_string(pt->M1PerThread) + "x" + std::to_string(pt->N1PerThread) + "x" + - std::to_string(pt->KPerThread) + "_"; - out += std::to_string(pt->M1N1ThreadClusterM10) + "x" + - std::to_string(pt->M1N1ThreadClusterN10) + "x" + - std::to_string(pt->M1N1ThreadClusterM11) + "x" + - std::to_string(pt->M1N1ThreadClusterN11) + "_"; - - out += std::to_string(pt->ABlockTransferThreadSliceLengths_K_M0_M1[0]) + "x" + - std::to_string(pt->ABlockTransferThreadSliceLengths_K_M0_M1[1]) + "x" + - std::to_string(pt->ABlockTransferThreadSliceLengths_K_M0_M1[2]) + "_"; - - out += std::to_string(pt->ABlockTransferThreadClusterLengths_K_M0_M1[0]) + "x" + - std::to_string(pt->ABlockTransferThreadClusterLengths_K_M0_M1[1]) + "x" + - std::to_string(pt->ABlockTransferThreadClusterLengths_K_M0_M1[2]) + "_"; - - out += std::to_string(pt->ABlockTransferThreadClusterArrangeOrder[0]) + "x" + - std::to_string(pt->ABlockTransferThreadClusterArrangeOrder[1]) + "x" + - std::to_string(pt->ABlockTransferThreadClusterArrangeOrder[2]) + "_"; - - out += std::to_string(pt->ABlockTransferSrcAccessOrder[0]) + "x" + - std::to_string(pt->ABlockTransferSrcAccessOrder[1]) + "x" + - std::to_string(pt->ABlockTransferSrcAccessOrder[2]) + "_"; - - out += std::to_string(pt->ABlockTransferSrcVectorDim) + "_"; - out += std::to_string(pt->ABlockTransferSrcScalarPerVector) + "_"; - out += std::to_string(pt->ABlockTransferDstScalarPerVector_M1) + "_"; - out += std::to_string(pt->AThreadTransferSrcResetCoordinateAfterRun) + "_"; - - out += std::to_string(pt->BBlockTransferThreadSliceLengths_K_N0_N1[0]) + "x" + - std::to_string(pt->BBlockTransferThreadSliceLengths_K_N0_N1[1]) + "x" + - std::to_string(pt->BBlockTransferThreadSliceLengths_K_N0_N1[2]) + "_"; - - out += std::to_string(pt->BBlockTransferThreadClusterLengths_K_N0_N1[0]) + "x" + - std::to_string(pt->BBlockTransferThreadClusterLengths_K_N0_N1[1]) + "x" + - std::to_string(pt->BBlockTransferThreadClusterLengths_K_N0_N1[2]) + "_"; - - out += std::to_string(pt->BBlockTransferThreadClusterArrangeOrder[0]) + "x" + - std::to_string(pt->BBlockTransferThreadClusterArrangeOrder[1]) + "x" + - std::to_string(pt->BBlockTransferThreadClusterArrangeOrder[2]) + "_"; - - out += std::to_string(pt->BBlockTransferSrcAccessOrder[0]) + "x" + - std::to_string(pt->BBlockTransferSrcAccessOrder[1]) + "x" + - std::to_string(pt->BBlockTransferSrcAccessOrder[2]) + "_"; - - out += std::to_string(pt->BBlockTransferSrcVectorDim) + "_"; - out += std::to_string(pt->BBlockTransferSrcScalarPerVector) + "_"; - out += std::to_string(pt->BBlockTransferDstScalarPerVector_N1) + "_"; - out += std::to_string(pt->BThreadTransferSrcResetCoordinateAfterRun) + "_"; - - out += std::to_string(pt->CThreadTransferSrcDstAccessOrder[0]) + "x" + - std::to_string(pt->CThreadTransferSrcDstAccessOrder[1]) + "x" + - std::to_string(pt->CThreadTransferSrcDstAccessOrder[2]) + "x" + - std::to_string(pt->CThreadTransferSrcDstAccessOrder[3]) + "x" + - std::to_string(pt->CThreadTransferSrcDstAccessOrder[4]) + "x" + - std::to_string(pt->CThreadTransferSrcDstAccessOrder[5]) + "_"; - - out += std::to_string(pt->CThreadTransferSrcDstVectorDim) + "_"; - out += std::to_string(pt->CThreadTransferDstScalarPerVector); - - return (out); -}; - -template -static std::string get_definition_string_from_types() -{ - using namespace ck; - - std::string out; - - out += - " -DCK_PARAM_ABDataTypeEnum=" + std::to_string(get_datatype_enum_from_type::value) + - " -DCK_PARAM_AccDataTypeEnum=" + std::to_string(get_datatype_enum_from_type::value) + - " -DCK_PARAM_CDataTypeEnum=" + std::to_string(get_datatype_enum_from_type::value); - - return (out); -}; - -static std::string -get_definition_string_from_tunable(const tunable_dyn_conv_fwd_v4r4_dlops_nchw_kcyx_nkhw* pt) -{ - std::string out; - - out += " -DCK_PARAM_BlockSize=" + std::to_string(pt->BlockSize); - - out += " -DCK_PARAM_MPerBlock=" + std::to_string(pt->MPerBlock) + - " -DCK_PARAM_NPerBlock=" + std::to_string(pt->NPerBlock) + - " -DCK_PARAM_KPerBlock=" + std::to_string(pt->KPerBlock); - out += " -DCK_PARAM_M1PerThread=" + std::to_string(pt->M1PerThread) + - " -DCK_PARAM_N1PerThread=" + std::to_string(pt->N1PerThread) + - " -DCK_PARAM_KPerThread=" + std::to_string(pt->KPerThread); - - out += " -DCK_PARAM_M1N1ThreadClusterM10=" + std::to_string(pt->M1N1ThreadClusterM10) + - " -DCK_PARAM_M1N1ThreadClusterN10=" + std::to_string(pt->M1N1ThreadClusterN10) + - " -DCK_PARAM_M1N1ThreadClusterM11=" + std::to_string(pt->M1N1ThreadClusterM11) + - " -DCK_PARAM_M1N1ThreadClusterN11=" + std::to_string(pt->M1N1ThreadClusterN11); - - out += " -DCK_PARAM_ABlockTransferThreadSliceLengths_K_M0_M1=" + - std::to_string(pt->ABlockTransferThreadSliceLengths_K_M0_M1[0]) + "," + - std::to_string(pt->ABlockTransferThreadSliceLengths_K_M0_M1[1]) + "," + - std::to_string(pt->ABlockTransferThreadSliceLengths_K_M0_M1[2]); - - out += " -DCK_PARAM_ABlockTransferThreadClusterLengths_K_M0_M1=" + - std::to_string(pt->ABlockTransferThreadClusterLengths_K_M0_M1[0]) + "," + - std::to_string(pt->ABlockTransferThreadClusterLengths_K_M0_M1[1]) + "," + - std::to_string(pt->ABlockTransferThreadClusterLengths_K_M0_M1[2]); - - out += " -DCK_PARAM_ABlockTransferThreadClusterArrangeOrder=" + - std::to_string(pt->ABlockTransferThreadClusterArrangeOrder[0]) + "," + - std::to_string(pt->ABlockTransferThreadClusterArrangeOrder[1]) + "," + - std::to_string(pt->ABlockTransferThreadClusterArrangeOrder[2]); - - out += " -DCK_PARAM_ABlockTransferSrcAccessOrder=" + - std::to_string(pt->ABlockTransferSrcAccessOrder[0]) + "," + - std::to_string(pt->ABlockTransferSrcAccessOrder[1]) + "," + - std::to_string(pt->ABlockTransferSrcAccessOrder[2]); - - out += - " -DCK_PARAM_ABlockTransferSrcVectorDim=" + std::to_string(pt->ABlockTransferSrcVectorDim); - out += " -DCK_PARAM_ABlockTransferSrcScalarPerVector=" + - std::to_string(pt->ABlockTransferSrcScalarPerVector); - out += " -DCK_PARAM_ABlockTransferDstScalarPerVector_M1=" + - std::to_string(pt->ABlockTransferDstScalarPerVector_M1); - out += " -DCK_PARAM_AThreadTransferSrcResetCoordinateAfterRun=" + - std::to_string(pt->AThreadTransferSrcResetCoordinateAfterRun); - - out += " -DCK_PARAM_BBlockTransferThreadSliceLengths_K_N0_N1=" + - std::to_string(pt->BBlockTransferThreadSliceLengths_K_N0_N1[0]) + "," + - std::to_string(pt->BBlockTransferThreadSliceLengths_K_N0_N1[1]) + "," + - std::to_string(pt->BBlockTransferThreadSliceLengths_K_N0_N1[2]); - - out += " -DCK_PARAM_BBlockTransferThreadClusterLengths_K_N0_N1=" + - std::to_string(pt->BBlockTransferThreadClusterLengths_K_N0_N1[0]) + "," + - std::to_string(pt->BBlockTransferThreadClusterLengths_K_N0_N1[1]) + "," + - std::to_string(pt->BBlockTransferThreadClusterLengths_K_N0_N1[2]); - - out += " -DCK_PARAM_BBlockTransferThreadClusterArrangeOrder=" + - std::to_string(pt->BBlockTransferThreadClusterArrangeOrder[0]) + "," + - std::to_string(pt->BBlockTransferThreadClusterArrangeOrder[1]) + "," + - std::to_string(pt->BBlockTransferThreadClusterArrangeOrder[2]); - - out += " -DCK_PARAM_BBlockTransferSrcAccessOrder=" + - std::to_string(pt->BBlockTransferSrcAccessOrder[0]) + "," + - std::to_string(pt->BBlockTransferSrcAccessOrder[1]) + "," + - std::to_string(pt->BBlockTransferSrcAccessOrder[2]); - - out += - " -DCK_PARAM_BBlockTransferSrcVectorDim=" + std::to_string(pt->BBlockTransferSrcVectorDim); - out += " -DCK_PARAM_BBlockTransferSrcScalarPerVector=" + - std::to_string(pt->BBlockTransferSrcScalarPerVector); - out += " -DCK_PARAM_BBlockTransferDstScalarPerVector_N1=" + - std::to_string(pt->BBlockTransferDstScalarPerVector_N1); - out += " -DCK_PARAM_BThreadTransferSrcResetCoordinateAfterRun=" + - std::to_string(pt->BThreadTransferSrcResetCoordinateAfterRun); - - out += " -DCK_PARAM_CThreadTransferSrcDstAccessOrder=" + - std::to_string(pt->CThreadTransferSrcDstAccessOrder[0]) + "," + - std::to_string(pt->CThreadTransferSrcDstAccessOrder[1]) + "," + - std::to_string(pt->CThreadTransferSrcDstAccessOrder[2]) + "," + - std::to_string(pt->CThreadTransferSrcDstAccessOrder[3]) + "," + - std::to_string(pt->CThreadTransferSrcDstAccessOrder[4]) + "," + - std::to_string(pt->CThreadTransferSrcDstAccessOrder[5]); - - out += " -DCK_PARAM_CThreadTransferSrcDstVectorDim=" + - std::to_string(pt->CThreadTransferSrcDstVectorDim); - out += " -DCK_PARAM_CThreadTransferDstScalarPerVector=" + - std::to_string(pt->CThreadTransferDstScalarPerVector); - - return (out); -}; - -} // namespace detail_dyn_conv_fwd_v4r4_nchw_kcyx_nkhw - -template -void online_device_dynamic_convolution_forward_implicit_gemm_v4r4_dlops_nchw_kcyx_nkhw( - online_compile::Handle* handle, - const InLengths& in_n_c_hi_wi_lengths, - const WeiLengths& wei_k_c_y_x_lengths, - const OutLengths& out_n_k_ho_wo_lengths, - const ConvStrides& conv_strides, - const ConvDilations& conv_dilations, - const InLeftPads& in_left_pads, - const InRightPads& in_right_pads, - const Tensor& in_n_c_hi_wi, - const Tensor& wei_k_c_y_x, - Tensor& out_n_k_ho_wo, - const tunable_dyn_conv_fwd_v4r4_dlops_nchw_kcyx_nkhw* tunable, - ck::index_t nrepeat) -{ - using namespace ck; - using namespace ck::driver; - using namespace detail_dyn_conv_fwd_v4r4_nchw_kcyx_nkhw; - using size_t = std::size_t; - - ///////////////////////////////////////////////////////////////////////////////////////////////////////////// - // The follow codes are only used for computing the grid_size, hasMainKBlockLoop, - // hasDoubleTailKBlockLoop - - constexpr auto I0 = Number<0>{}; - constexpr auto I1 = Number<1>{}; - constexpr auto I2 = Number<2>{}; - constexpr auto I3 = Number<3>{}; - - const auto in_n_c_hi_wi_desc = - make_dynamic_naive_tensor_descriptor_packed_v2(in_n_c_hi_wi_lengths); - const auto wei_k_c_y_x_desc = - make_dynamic_naive_tensor_descriptor_packed_v2(wei_k_c_y_x_lengths); - const auto out_n_k_ho_wo_desc = - make_dynamic_naive_tensor_descriptor_packed_v2(out_n_k_ho_wo_lengths); - - const auto descs = - transform_forward_convolution_into_gemm_v4r4_nchw_kcyx_nkhw_pad(wei_k_c_y_x_desc, - in_n_c_hi_wi_desc, - out_n_k_ho_wo_desc, - conv_strides, - conv_dilations, - in_left_pads, - in_right_pads); - const auto a_k_m_grid_desc = descs[I0]; - const auto c_m_n_grid_desc = descs[I2]; - const auto M = c_m_n_grid_desc.GetLength(I0); - const auto N = c_m_n_grid_desc.GetLength(I1); - const auto K = a_k_m_grid_desc.GetLength(I0); - - const index_t grid_size = (M / tunable->MPerBlock) * (N / tunable->NPerBlock); - const bool hasMainKBlockLoop = ((K + tunable->KPerBlock) / (2 * tunable->KPerBlock) > 1); - const bool hasDoubleTailKBlockLoop = ((K / tunable->KPerBlock) % 2 == 0); - ///////////////////////////////////////////////////////////////////////////////////////////////////////////// - - // these buffers are usually provided by the user application - DeviceMem in_n_c_hi_wi_dev_buf(sizeof(TInWei) * in_n_c_hi_wi.mDesc.GetElementSpace()); - DeviceMem wei_k_c_y_x_dev_buf(sizeof(TInWei) * wei_k_c_y_x.mDesc.GetElementSpace()); - DeviceMem out_n_k_ho_wo_dev_buf(sizeof(TOut) * out_n_k_ho_wo.mDesc.GetElementSpace()); - - in_n_c_hi_wi_dev_buf.ToDevice(in_n_c_hi_wi.mData.data()); - wei_k_c_y_x_dev_buf.ToDevice(wei_k_c_y_x.mData.data()); - out_n_k_ho_wo_dev_buf.ToDevice(out_n_k_ho_wo.mData.data()); - - // these are workspace buffers that should be expressed to the user by the corresponding - // workspace API - DeviceMem workspace_buf(4096); - - void* a_k_m0_m1_grid_desc_dev_buf = workspace_buf.GetDeviceBuffer(); - void* b_k_n0_n1_grid_desc_dev_buf = - static_cast(static_cast(workspace_buf.GetDeviceBuffer()) + 1024); - void* c_m0_m10_m11_n0_n10_n11_grid_desc_dev_buf = - static_cast(static_cast(workspace_buf.GetDeviceBuffer()) + 2048); - void* c_blockid_to_m0_n0_block_cluster_adaptor_dev_buf = - static_cast(static_cast(workspace_buf.GetDeviceBuffer()) + 3072); - - const std::vector vld = {static_cast(tunable->BlockSize), 1, 1}; - const std::vector vgd1 = {static_cast(tunable->BlockSize), 1, 1}; - const std::vector vgd2 = {static_cast(grid_size * tunable->BlockSize), 1, 1}; - - std::string program_name = - "dynamic_convolution_forward_implicit_gemm_v4r4_dlops_nchw_kcyx_nkhw.cpp"; - std::string algo_name = "implicit_gemm_conv_fwd_v4r4_dlops_nchw"; - - std::string param = " -std=c++17 "; - std::string network_config; - - param += get_definition_string_from_types() + " " + - get_definition_string_from_tunable(tunable) + - " -DCK_PARAM_HAS_MAIN_KBLOCK_LOOP=" + std::to_string(hasMainKBlockLoop) + - " -DCK_PARAM_HAS_DOUBLE_TAIL_KBLOCK_LOOP=" + std::to_string(hasDoubleTailKBlockLoop); - network_config = get_network_config_string_from_types() + "_" + - get_network_config_string_from_tunable(tunable) + "_" + - std::to_string(hasMainKBlockLoop) + "_" + - std::to_string(hasDoubleTailKBlockLoop); - - std::vector kernel1_times; - std::vector kernel2_times; - - for(index_t i = 0; i < nrepeat; ++i) - { - KernelTimer timer1, timer2; - std::string kernel_name; - - kernel_name = "dynamic_convolution_forward_implicit_gemm_v4r4_dlops_nchw_kcyx_nkhw_prepare"; - auto network_config_1 = network_config + "_1"; - - timer1.Start(); - handle->AddKernel(algo_name, network_config_1, program_name, kernel_name, vld, vgd1, param)( - static_cast(in_n_c_hi_wi_lengths[I0]), - static_cast(in_n_c_hi_wi_lengths[I1]), - static_cast(in_n_c_hi_wi_lengths[I2]), - static_cast(in_n_c_hi_wi_lengths[I3]), - static_cast(wei_k_c_y_x_lengths[I0]), - static_cast(wei_k_c_y_x_lengths[I2]), - static_cast(wei_k_c_y_x_lengths[I3]), - conv_strides[I0], - conv_strides[I1], - conv_dilations[I0], - conv_dilations[I1], - in_left_pads[I0], - in_left_pads[I1], - in_right_pads[I0], - in_right_pads[I1], - a_k_m0_m1_grid_desc_dev_buf, - b_k_n0_n1_grid_desc_dev_buf, - c_m0_m10_m11_n0_n10_n11_grid_desc_dev_buf, - c_blockid_to_m0_n0_block_cluster_adaptor_dev_buf); - timer1.End(); - - kernel_name = "dynamic_convolution_forward_implicit_gemm_v4r4_dlops_nchw_kcyx_nkhw"; - auto network_config_2 = network_config + "_2"; - - timer2.Start(); - handle->AddKernel(algo_name, network_config_2, program_name, kernel_name, vld, vgd2, param)( - reinterpret_cast(wei_k_c_y_x_dev_buf.GetDeviceBuffer()), - reinterpret_cast(in_n_c_hi_wi_dev_buf.GetDeviceBuffer()), - reinterpret_cast(out_n_k_ho_wo_dev_buf.GetDeviceBuffer()), - (const void*)(a_k_m0_m1_grid_desc_dev_buf), - (const void*)(b_k_n0_n1_grid_desc_dev_buf), - (const void*)(c_m0_m10_m11_n0_n10_n11_grid_desc_dev_buf), - (const void*)(c_blockid_to_m0_n0_block_cluster_adaptor_dev_buf)); - timer2.End(); - - kernel1_times.push_back(timer1.GetElapsedTime()); - kernel2_times.push_back(timer2.GetElapsedTime()); - } - - { - auto ave_time1 = - std::accumulate( - std::next(kernel1_times.begin()), kernel1_times.end(), 0., std::plus{}) / - (nrepeat - 1); - auto ave_time2 = - std::accumulate( - std::next(kernel2_times.begin()), kernel2_times.end(), 0., std::plus{}) / - (nrepeat - 1); - - const auto N = in_n_c_hi_wi_lengths[I0]; - const auto C = in_n_c_hi_wi_lengths[I1]; - - const auto K = out_n_k_ho_wo_lengths[I1]; - const auto Ho = out_n_k_ho_wo_lengths[I2]; - const auto Wo = out_n_k_ho_wo_lengths[I3]; - - const auto Y = wei_k_c_y_x_lengths[I2]; - const auto X = wei_k_c_y_x_lengths[I3]; - - float perf = (float)(std::size_t(2) * N * K * Ho * Wo * C * Y * X) / - (std::size_t(1000) * 1000 * 1000) / (ave_time1 + ave_time2); - - std::cout << "Average time : " << ave_time1 + ave_time2 << " ms(" << ave_time1 << ", " - << ave_time2 << "), " << perf << " TFlop/s" << std::endl; - }; - - // copy result back to host - out_n_k_ho_wo_dev_buf.FromDevice(out_n_k_ho_wo.mData.data()); -} diff --git a/host/driver_online/include/online_device_dynamic_convolution_forward_implicit_gemm_v4r4_xdlops_nchw_kcyx_nkhw.hpp b/host/driver_online/include/online_device_dynamic_convolution_forward_implicit_gemm_v4r4_xdlops_nchw_kcyx_nkhw.hpp deleted file mode 100644 index 46d065f615..0000000000 --- a/host/driver_online/include/online_device_dynamic_convolution_forward_implicit_gemm_v4r4_xdlops_nchw_kcyx_nkhw.hpp +++ /dev/null @@ -1,386 +0,0 @@ -#include "device.hpp" -#include "host_tensor.hpp" -#include "handle.hpp" -#include "online_driver_common.hpp" -#include "dynamic_tensor_descriptor.hpp" -#include "dynamic_tensor_descriptor_helper.hpp" -#include "conv_tunable_fwd_v4r4_xdlops_nchw_kcyx_nkhw.hpp" - -namespace detail_dyn_conv_fwd_v4r4_xdlops_nchw_kcyx_nkhw { - -template -static std::string get_network_config_string_from_types() -{ - using namespace ck; - - std::string out; - - out += std::to_string(get_datatype_enum_from_type::value) + "_" + - std::to_string(get_datatype_enum_from_type::value) + "_" + - std::to_string(get_datatype_enum_from_type::value); - - return (out); -}; - -static std::string -get_network_config_string_from_tunable(const tunable_dyn_conv_fwd_v4r4_xdlops_nchw_kcyx_nkhw* pt) -{ - std::string out("TUN_"); - - out += std::to_string(pt->BlockSize) + "_"; - - out += std::to_string(pt->MPerBlock) + "x" + std::to_string(pt->NPerBlock) + "x" + - std::to_string(pt->KPerBlock) + "_"; - out += std::to_string(pt->MPerWave) + "x" + std::to_string(pt->NPerWave) + "x" + - std::to_string(pt->MRepeat) + "x" + std::to_string(pt->NRepeat) + "x" + - std::to_string(pt->K1) + "_"; - - out += std::to_string(pt->ABlockTransferThreadSliceLengths_K0_M_K1[0]) + "x" + - std::to_string(pt->ABlockTransferThreadSliceLengths_K0_M_K1[1]) + "x" + - std::to_string(pt->ABlockTransferThreadSliceLengths_K0_M_K1[2]) + "_"; - - out += std::to_string(pt->ABlockTransferThreadClusterLengths_K0_M_K1[0]) + "x" + - std::to_string(pt->ABlockTransferThreadClusterLengths_K0_M_K1[1]) + "x" + - std::to_string(pt->ABlockTransferThreadClusterLengths_K0_M_K1[2]) + "_"; - - out += std::to_string(pt->ABlockTransferThreadClusterArrangeOrder[0]) + "x" + - std::to_string(pt->ABlockTransferThreadClusterArrangeOrder[1]) + "x" + - std::to_string(pt->ABlockTransferThreadClusterArrangeOrder[2]) + "_"; - - out += std::to_string(pt->ABlockTransferSrcAccessOrder[0]) + "x" + - std::to_string(pt->ABlockTransferSrcAccessOrder[1]) + "x" + - std::to_string(pt->ABlockTransferSrcAccessOrder[2]) + "_"; - - out += std::to_string(pt->ABlockTransferSrcVectorDim) + "_"; - out += std::to_string(pt->ABlockTransferSrcScalarPerVector) + "_"; - out += std::to_string(pt->ABlockTransferDstScalarPerVector_K1) + "_"; - out += std::to_string(pt->AThreadTransferSrcResetCoordinateAfterRun) + "_"; - - out += std::to_string(pt->BBlockTransferThreadSliceLengths_K0_N_K1[0]) + "x" + - std::to_string(pt->BBlockTransferThreadSliceLengths_K0_N_K1[1]) + "x" + - std::to_string(pt->BBlockTransferThreadSliceLengths_K0_N_K1[2]) + "_"; - - out += std::to_string(pt->BBlockTransferThreadClusterLengths_K0_N_K1[0]) + "x" + - std::to_string(pt->BBlockTransferThreadClusterLengths_K0_N_K1[1]) + "x" + - std::to_string(pt->BBlockTransferThreadClusterLengths_K0_N_K1[2]) + "_"; - - out += std::to_string(pt->BBlockTransferThreadClusterArrangeOrder[0]) + "x" + - std::to_string(pt->BBlockTransferThreadClusterArrangeOrder[1]) + "x" + - std::to_string(pt->BBlockTransferThreadClusterArrangeOrder[2]) + "_"; - - out += std::to_string(pt->BBlockTransferSrcAccessOrder[0]) + "x" + - std::to_string(pt->BBlockTransferSrcAccessOrder[1]) + "x" + - std::to_string(pt->BBlockTransferSrcAccessOrder[2]) + "_"; - - out += std::to_string(pt->BBlockTransferSrcVectorDim) + "_"; - out += std::to_string(pt->BBlockTransferSrcScalarPerVector) + "_"; - out += std::to_string(pt->BBlockTransferDstScalarPerVector_K1) + "_"; - out += std::to_string(pt->BThreadTransferSrcResetCoordinateAfterRun) + "_"; - - out += std::to_string(pt->CThreadTransferSrcDstAccessOrder[0]) + "x" + - std::to_string(pt->CThreadTransferSrcDstAccessOrder[1]) + "x" + - std::to_string(pt->CThreadTransferSrcDstAccessOrder[2]) + "x" + - std::to_string(pt->CThreadTransferSrcDstAccessOrder[3]) + "x" + - std::to_string(pt->CThreadTransferSrcDstAccessOrder[4]) + "x" + - std::to_string(pt->CThreadTransferSrcDstAccessOrder[5]) + "x" + - std::to_string(pt->CThreadTransferSrcDstAccessOrder[6]) + "x" + - std::to_string(pt->CThreadTransferSrcDstAccessOrder[7]) + "_"; - - out += std::to_string(pt->CThreadTransferSrcDstVectorDim) + "_"; - out += std::to_string(pt->CThreadTransferDstScalarPerVector); - - return (out); -}; - -template -static std::string get_definition_string_from_types() -{ - using namespace ck; - - std::string out; - - out += - " -DCK_PARAM_ABDataTypeEnum=" + std::to_string(get_datatype_enum_from_type::value) + - " -DCK_PARAM_AccDataTypeEnum=" + std::to_string(get_datatype_enum_from_type::value) + - " -DCK_PARAM_CDataTypeEnum=" + std::to_string(get_datatype_enum_from_type::value); - - return (out); -}; - -static std::string -get_definition_string_from_tunable(const tunable_dyn_conv_fwd_v4r4_xdlops_nchw_kcyx_nkhw* pt) -{ - std::string out; - - out += " -DCK_PARAM_BlockSize=" + std::to_string(pt->BlockSize); - - out += " -DCK_PARAM_MPerBlock=" + std::to_string(pt->MPerBlock) + - " -DCK_PARAM_NPerBlock=" + std::to_string(pt->NPerBlock) + - " -DCK_PARAM_KPerBlock=" + std::to_string(pt->KPerBlock); - out += " -DCK_PARAM_MPerWave=" + std::to_string(pt->MPerWave) + - " -DCK_PARAM_NPerWave=" + std::to_string(pt->NPerWave) + - " -DCK_PARAM_K1=" + std::to_string(pt->K1) + - " -DCK_PARAM_MRepeat=" + std::to_string(pt->MRepeat) + - " -DCK_PARAM_NRepeat=" + std::to_string(pt->NRepeat); - - out += " -DCK_PARAM_ABlockTransferThreadSliceLengths_K0_M_K1=" + - std::to_string(pt->ABlockTransferThreadSliceLengths_K0_M_K1[0]) + "," + - std::to_string(pt->ABlockTransferThreadSliceLengths_K0_M_K1[1]) + "," + - std::to_string(pt->ABlockTransferThreadSliceLengths_K0_M_K1[2]); - - out += " -DCK_PARAM_ABlockTransferThreadClusterLengths_K0_M_K1=" + - std::to_string(pt->ABlockTransferThreadClusterLengths_K0_M_K1[0]) + "," + - std::to_string(pt->ABlockTransferThreadClusterLengths_K0_M_K1[1]) + "," + - std::to_string(pt->ABlockTransferThreadClusterLengths_K0_M_K1[2]); - - out += " -DCK_PARAM_ABlockTransferThreadClusterArrangeOrder=" + - std::to_string(pt->ABlockTransferThreadClusterArrangeOrder[0]) + "," + - std::to_string(pt->ABlockTransferThreadClusterArrangeOrder[1]) + "," + - std::to_string(pt->ABlockTransferThreadClusterArrangeOrder[2]); - - out += " -DCK_PARAM_ABlockTransferSrcAccessOrder=" + - std::to_string(pt->ABlockTransferSrcAccessOrder[0]) + "," + - std::to_string(pt->ABlockTransferSrcAccessOrder[1]) + "," + - std::to_string(pt->ABlockTransferSrcAccessOrder[2]); - - out += - " -DCK_PARAM_ABlockTransferSrcVectorDim=" + std::to_string(pt->ABlockTransferSrcVectorDim); - out += " -DCK_PARAM_ABlockTransferSrcScalarPerVector=" + - std::to_string(pt->ABlockTransferSrcScalarPerVector); - out += " -DCK_PARAM_ABlockTransferDstScalarPerVector_K1=" + - std::to_string(pt->ABlockTransferDstScalarPerVector_K1); - out += " -DCK_PARAM_AThreadTransferSrcResetCoordinateAfterRun=" + - std::to_string(pt->AThreadTransferSrcResetCoordinateAfterRun); - - out += " -DCK_PARAM_BBlockTransferThreadSliceLengths_K0_N_K1=" + - std::to_string(pt->BBlockTransferThreadSliceLengths_K0_N_K1[0]) + "," + - std::to_string(pt->BBlockTransferThreadSliceLengths_K0_N_K1[1]) + "," + - std::to_string(pt->BBlockTransferThreadSliceLengths_K0_N_K1[2]); - - out += " -DCK_PARAM_BBlockTransferThreadClusterLengths_K0_N_K1=" + - std::to_string(pt->BBlockTransferThreadClusterLengths_K0_N_K1[0]) + "," + - std::to_string(pt->BBlockTransferThreadClusterLengths_K0_N_K1[1]) + "," + - std::to_string(pt->BBlockTransferThreadClusterLengths_K0_N_K1[2]); - - out += " -DCK_PARAM_BBlockTransferThreadClusterArrangeOrder=" + - std::to_string(pt->BBlockTransferThreadClusterArrangeOrder[0]) + "," + - std::to_string(pt->BBlockTransferThreadClusterArrangeOrder[1]) + "," + - std::to_string(pt->BBlockTransferThreadClusterArrangeOrder[2]); - - out += " -DCK_PARAM_BBlockTransferSrcAccessOrder=" + - std::to_string(pt->BBlockTransferSrcAccessOrder[0]) + "," + - std::to_string(pt->BBlockTransferSrcAccessOrder[1]) + "," + - std::to_string(pt->BBlockTransferSrcAccessOrder[2]); - - out += - " -DCK_PARAM_BBlockTransferSrcVectorDim=" + std::to_string(pt->BBlockTransferSrcVectorDim); - out += " -DCK_PARAM_BBlockTransferSrcScalarPerVector=" + - std::to_string(pt->BBlockTransferSrcScalarPerVector); - out += " -DCK_PARAM_BBlockTransferDstScalarPerVector_K1=" + - std::to_string(pt->BBlockTransferDstScalarPerVector_K1); - out += " -DCK_PARAM_BThreadTransferSrcResetCoordinateAfterRun=" + - std::to_string(pt->BThreadTransferSrcResetCoordinateAfterRun); - - out += " -DCK_PARAM_CThreadTransferSrcDstAccessOrder=" + - std::to_string(pt->CThreadTransferSrcDstAccessOrder[0]) + "," + - std::to_string(pt->CThreadTransferSrcDstAccessOrder[1]) + "," + - std::to_string(pt->CThreadTransferSrcDstAccessOrder[2]) + "," + - std::to_string(pt->CThreadTransferSrcDstAccessOrder[3]) + "," + - std::to_string(pt->CThreadTransferSrcDstAccessOrder[4]) + "," + - std::to_string(pt->CThreadTransferSrcDstAccessOrder[5]) + "," + - std::to_string(pt->CThreadTransferSrcDstAccessOrder[6]) + "," + - std::to_string(pt->CThreadTransferSrcDstAccessOrder[7]); - - out += " -DCK_PARAM_CThreadTransferSrcDstVectorDim=" + - std::to_string(pt->CThreadTransferSrcDstVectorDim); - out += " -DCK_PARAM_CThreadTransferDstScalarPerVector=" + - std::to_string(pt->CThreadTransferDstScalarPerVector); - - return (out); -}; - -} // namespace detail_dyn_conv_fwd_v4r4_xdlops_nchw_kcyx_nkhw - -template -void online_device_dynamic_convolution_forward_implicit_gemm_v4r4_xdlops_nchw_kcyx_nkhw( - online_compile::Handle* handle, - const InLengths& in_n_c_hi_wi_lengths, - const WeiLengths& wei_k_c_y_x_lengths, - const OutLengths& out_n_k_ho_wo_lengths, - const ConvStrides& conv_strides, - const ConvDilations& conv_dilations, - const InLeftPads& in_left_pads, - const InRightPads& in_right_pads, - const Tensor& in_n_c_hi_wi, - const Tensor& wei_k_c_y_x, - Tensor& out_n_k_ho_wo, - const tunable_dyn_conv_fwd_v4r4_xdlops_nchw_kcyx_nkhw* tunable, - ck::index_t nrepeat) -{ - using namespace ck; - using namespace ck::driver; - using namespace detail_dyn_conv_fwd_v4r4_xdlops_nchw_kcyx_nkhw; - using size_t = std::size_t; - - ///////////////////////////////////////////////////////////////////////////////////////////////////////////// - - constexpr auto I0 = Number<0>{}; - constexpr auto I1 = Number<1>{}; - constexpr auto I2 = Number<2>{}; - constexpr auto I3 = Number<3>{}; - - const auto in_n_c_hi_wi_desc = - make_dynamic_naive_tensor_descriptor_packed_v2(in_n_c_hi_wi_lengths); - const auto wei_k_c_y_x_desc = - make_dynamic_naive_tensor_descriptor_packed_v2(wei_k_c_y_x_lengths); - const auto out_n_k_ho_wo_desc = - make_dynamic_naive_tensor_descriptor_packed_v2(out_n_k_ho_wo_lengths); - - const auto n = in_n_c_hi_wi_desc.GetLength(I0); - const auto c = in_n_c_hi_wi_desc.GetLength(I1); - const auto hi = in_n_c_hi_wi_desc.GetLength(I2); - const auto wi = in_n_c_hi_wi_desc.GetLength(I3); - const auto k = wei_k_c_y_x_desc.GetLength(I0); - const auto y = wei_k_c_y_x_desc.GetLength(I2); - const auto x = wei_k_c_y_x_desc.GetLength(I3); - const auto ho = out_n_k_ho_wo_desc.GetLength(I2); - const auto wo = out_n_k_ho_wo_desc.GetLength(I3); - - const auto M = k; - const auto N = n * ho * wo; - const auto K = c * y * x; - const auto K0 = K / tunable->K1; - - const index_t grid_size = (M / tunable->MPerBlock) * (N / tunable->NPerBlock); - ///////////////////////////////////////////////////////////////////////////////////////////////////////////// - - // these buffers are usually provided by the user application - DeviceMem in_n_c_hi_wi_dev_buf(sizeof(TInWei) * in_n_c_hi_wi.mDesc.GetElementSpace()); - DeviceMem wei_k_c_y_x_dev_buf(sizeof(TInWei) * wei_k_c_y_x.mDesc.GetElementSpace()); - DeviceMem out_n_k_ho_wo_dev_buf(sizeof(TOut) * out_n_k_ho_wo.mDesc.GetElementSpace()); - - in_n_c_hi_wi_dev_buf.ToDevice(in_n_c_hi_wi.mData.data()); - wei_k_c_y_x_dev_buf.ToDevice(wei_k_c_y_x.mData.data()); - out_n_k_ho_wo_dev_buf.ToDevice(out_n_k_ho_wo.mData.data()); - - // these are workspace buffers that should be expressed to the user by the corresponding - // workspace API - DeviceMem workspace_buf(4096); - - void* a_k_m0_m1_grid_desc_dev_buf = workspace_buf.GetDeviceBuffer(); - void* b_k_n0_n1_grid_desc_dev_buf = - static_cast(static_cast(workspace_buf.GetDeviceBuffer()) + 1024); - void* c_m0_m10_m11_n0_n10_n11_grid_desc_dev_buf = - static_cast(static_cast(workspace_buf.GetDeviceBuffer()) + 2048); - void* c_blockid_to_m0_n0_block_cluster_adaptor_dev_buf = - static_cast(static_cast(workspace_buf.GetDeviceBuffer()) + 3072); - - const std::vector vld = {static_cast(tunable->BlockSize), 1, 1}; - const std::vector vgd1 = {static_cast(tunable->BlockSize), 1, 1}; - const std::vector vgd2 = {static_cast(grid_size * tunable->BlockSize), 1, 1}; - - std::string program_name = - "dynamic_convolution_forward_implicit_gemm_v4r4_xdlops_nchw_kcyx_nkhw.cpp"; - std::string algo_name = "implicit_gemm_conv_fwd_v4r4_xdlops_nchw"; - - std::string param = " -std=c++17 "; - std::string network_config; - - param += get_definition_string_from_types() + " " + " -DCK_USE_AMD_XDLOPS" + - get_definition_string_from_tunable(tunable); - - network_config = get_network_config_string_from_types() + "_" + - get_network_config_string_from_tunable(tunable); - - std::vector kernel1_times; - std::vector kernel2_times; - - for(index_t i = 0; i < nrepeat; ++i) - { - KernelTimer timer1, timer2; - std::string kernel_name; - - kernel_name = - "dynamic_convolution_forward_implicit_gemm_v4r4_xdlops_nchw_kcyx_nkhw_prepare"; - auto network_config_1 = network_config + "_1"; - - timer1.Start(); - handle->AddKernel(algo_name, network_config_1, program_name, kernel_name, vld, vgd1, param)( - static_cast(in_n_c_hi_wi_lengths[I0]), - static_cast(in_n_c_hi_wi_lengths[I1]), - static_cast(in_n_c_hi_wi_lengths[I2]), - static_cast(in_n_c_hi_wi_lengths[I3]), - static_cast(wei_k_c_y_x_lengths[I0]), - static_cast(wei_k_c_y_x_lengths[I2]), - static_cast(wei_k_c_y_x_lengths[I3]), - conv_strides[I0], - conv_strides[I1], - conv_dilations[I0], - conv_dilations[I1], - in_left_pads[I0], - in_left_pads[I1], - in_right_pads[I0], - in_right_pads[I1], - a_k_m0_m1_grid_desc_dev_buf, - b_k_n0_n1_grid_desc_dev_buf, - c_m0_m10_m11_n0_n10_n11_grid_desc_dev_buf, - c_blockid_to_m0_n0_block_cluster_adaptor_dev_buf); - timer1.End(); - - kernel_name = "dynamic_convolution_forward_implicit_gemm_v4r4_xdlops_nchw_kcyx_nkhw"; - auto network_config_2 = network_config + "_2"; - - timer2.Start(); - handle->AddKernel(algo_name, network_config_2, program_name, kernel_name, vld, vgd2, param)( - reinterpret_cast(wei_k_c_y_x_dev_buf.GetDeviceBuffer()), - reinterpret_cast(in_n_c_hi_wi_dev_buf.GetDeviceBuffer()), - reinterpret_cast(out_n_k_ho_wo_dev_buf.GetDeviceBuffer()), - (const void*)(a_k_m0_m1_grid_desc_dev_buf), - (const void*)(b_k_n0_n1_grid_desc_dev_buf), - (const void*)(c_m0_m10_m11_n0_n10_n11_grid_desc_dev_buf), - (const void*)(c_blockid_to_m0_n0_block_cluster_adaptor_dev_buf)); - timer2.End(); - - kernel1_times.push_back(timer1.GetElapsedTime()); - kernel2_times.push_back(timer2.GetElapsedTime()); - } - - { - auto ave_time1 = - std::accumulate( - std::next(kernel1_times.begin()), kernel1_times.end(), 0., std::plus{}) / - (nrepeat - 1); - auto ave_time2 = - std::accumulate( - std::next(kernel2_times.begin()), kernel2_times.end(), 0., std::plus{}) / - (nrepeat - 1); - - const auto N = in_n_c_hi_wi_lengths[I0]; - const auto C = in_n_c_hi_wi_lengths[I1]; - - const auto K = out_n_k_ho_wo_lengths[I1]; - const auto Ho = out_n_k_ho_wo_lengths[I2]; - const auto Wo = out_n_k_ho_wo_lengths[I3]; - - const auto Y = wei_k_c_y_x_lengths[I2]; - const auto X = wei_k_c_y_x_lengths[I3]; - - float perf = (float)(std::size_t(2) * N * K * Ho * Wo * C * Y * X) / - (std::size_t(1000) * 1000 * 1000) / (ave_time1 + ave_time2); - - std::cout << "Average time : " << ave_time1 + ave_time2 << " ms(" << ave_time1 << ", " - << ave_time2 << "), " << perf << " TFlop/s" << std::endl; - }; - - // copy result back to host - out_n_k_ho_wo_dev_buf.FromDevice(out_n_k_ho_wo.mData.data()); -} diff --git a/host/driver_online/include/online_device_dynamic_convolution_forward_implicit_gemm_v4r4_xdlops_nhwc_kyxc_nhwk.hpp b/host/driver_online/include/online_device_dynamic_convolution_forward_implicit_gemm_v4r4_xdlops_nhwc_kyxc_nhwk.hpp deleted file mode 100644 index 57724c7612..0000000000 --- a/host/driver_online/include/online_device_dynamic_convolution_forward_implicit_gemm_v4r4_xdlops_nhwc_kyxc_nhwk.hpp +++ /dev/null @@ -1,389 +0,0 @@ -#include "device.hpp" -#include "host_tensor.hpp" -#include "handle.hpp" -#include "online_driver_common.hpp" -#include "dynamic_tensor_descriptor.hpp" -#include "dynamic_tensor_descriptor_helper.hpp" -#include "transform_forward_convolution_into_gemm_v4r4r4_nhwc_kyxc_nhwk.hpp" -#include "conv_tunable_fwd_v4r4_xdlops_nhwc_kyxc_nhwk.hpp" - -namespace detail_dyn_conv_fwd_v4r4_xdlops_nhwc_kyxc_nhwk { - -template -static std::string get_network_config_string_from_types() -{ - using namespace ck; - - std::string out; - - out += std::to_string(get_datatype_enum_from_type::value) + "_" + - std::to_string(get_datatype_enum_from_type::value) + "_" + - std::to_string(get_datatype_enum_from_type::value); - - return (out); -}; - -static std::string -get_network_config_string_from_tunable(const tunable_dyn_conv_fwd_v4r4_xdlops_nhwc_kyxc_nhwk* pt) -{ - std::string out("TUN_"); - - out += std::to_string(pt->BlockSize) + "_"; - - out += std::to_string(pt->MPerBlock) + "x" + std::to_string(pt->NPerBlock) + "x" + - std::to_string(pt->KPerBlock) + "_"; - out += std::to_string(pt->MPerWave) + "x" + std::to_string(pt->NPerWave) + "x" + - std::to_string(pt->MRepeat) + "x" + std::to_string(pt->NRepeat) + "x" + - std::to_string(pt->K1) + "_"; - - out += std::to_string(pt->ABlockTransferThreadSliceLengths_K0_M_K1[0]) + "x" + - std::to_string(pt->ABlockTransferThreadSliceLengths_K0_M_K1[1]) + "x" + - std::to_string(pt->ABlockTransferThreadSliceLengths_K0_M_K1[2]) + "_"; - - out += std::to_string(pt->ABlockTransferThreadClusterLengths_K0_M_K1[0]) + "x" + - std::to_string(pt->ABlockTransferThreadClusterLengths_K0_M_K1[1]) + "x" + - std::to_string(pt->ABlockTransferThreadClusterLengths_K0_M_K1[2]) + "_"; - - out += std::to_string(pt->ABlockTransferThreadClusterArrangeOrder[0]) + "x" + - std::to_string(pt->ABlockTransferThreadClusterArrangeOrder[1]) + "x" + - std::to_string(pt->ABlockTransferThreadClusterArrangeOrder[2]) + "_"; - - out += std::to_string(pt->ABlockTransferSrcAccessOrder[0]) + "x" + - std::to_string(pt->ABlockTransferSrcAccessOrder[1]) + "x" + - std::to_string(pt->ABlockTransferSrcAccessOrder[2]) + "_"; - - out += std::to_string(pt->ABlockTransferSrcVectorDim) + "_"; - out += std::to_string(pt->ABlockTransferSrcScalarPerVector) + "_"; - out += std::to_string(pt->ABlockTransferDstScalarPerVector_K1) + "_"; - out += std::to_string(pt->AThreadTransferSrcResetCoordinateAfterRun) + "_"; - - out += std::to_string(pt->BBlockTransferThreadSliceLengths_K0_N_K1[0]) + "x" + - std::to_string(pt->BBlockTransferThreadSliceLengths_K0_N_K1[1]) + "x" + - std::to_string(pt->BBlockTransferThreadSliceLengths_K0_N_K1[2]) + "_"; - - out += std::to_string(pt->BBlockTransferThreadClusterLengths_K0_N_K1[0]) + "x" + - std::to_string(pt->BBlockTransferThreadClusterLengths_K0_N_K1[1]) + "x" + - std::to_string(pt->BBlockTransferThreadClusterLengths_K0_N_K1[2]) + "_"; - - out += std::to_string(pt->BBlockTransferThreadClusterArrangeOrder[0]) + "x" + - std::to_string(pt->BBlockTransferThreadClusterArrangeOrder[1]) + "x" + - std::to_string(pt->BBlockTransferThreadClusterArrangeOrder[2]) + "_"; - - out += std::to_string(pt->BBlockTransferSrcAccessOrder[0]) + "x" + - std::to_string(pt->BBlockTransferSrcAccessOrder[1]) + "x" + - std::to_string(pt->BBlockTransferSrcAccessOrder[2]) + "_"; - - out += std::to_string(pt->BBlockTransferSrcVectorDim) + "_"; - out += std::to_string(pt->BBlockTransferSrcScalarPerVector) + "_"; - out += std::to_string(pt->BBlockTransferDstScalarPerVector_K1) + "_"; - out += std::to_string(pt->BThreadTransferSrcResetCoordinateAfterRun) + "_"; - - out += std::to_string(pt->CThreadTransferSrcDstAccessOrder[0]) + "x" + - std::to_string(pt->CThreadTransferSrcDstAccessOrder[1]) + "x" + - std::to_string(pt->CThreadTransferSrcDstAccessOrder[2]) + "x" + - std::to_string(pt->CThreadTransferSrcDstAccessOrder[3]) + "x" + - std::to_string(pt->CThreadTransferSrcDstAccessOrder[4]) + "x" + - std::to_string(pt->CThreadTransferSrcDstAccessOrder[5]) + "x" + - std::to_string(pt->CThreadTransferSrcDstAccessOrder[6]) + "x" + - std::to_string(pt->CThreadTransferSrcDstAccessOrder[7]) + "_"; - - out += std::to_string(pt->CThreadTransferSrcDstVectorDim) + "_"; - out += std::to_string(pt->CThreadTransferDstScalarPerVector); - - return (out); -}; - -template -static std::string get_definition_string_from_types() -{ - using namespace ck; - - std::string out; - - out += - " -DCK_PARAM_ABDataTypeEnum=" + std::to_string(get_datatype_enum_from_type::value) + - " -DCK_PARAM_AccDataTypeEnum=" + std::to_string(get_datatype_enum_from_type::value) + - " -DCK_PARAM_CDataTypeEnum=" + std::to_string(get_datatype_enum_from_type::value); - - return (out); -}; - -static std::string -get_definition_string_from_tunable(const tunable_dyn_conv_fwd_v4r4_xdlops_nhwc_kyxc_nhwk* pt) -{ - std::string out; - - out += " -DCK_PARAM_BlockSize=" + std::to_string(pt->BlockSize); - - out += " -DCK_PARAM_MPerBlock=" + std::to_string(pt->MPerBlock) + - " -DCK_PARAM_NPerBlock=" + std::to_string(pt->NPerBlock) + - " -DCK_PARAM_KPerBlock=" + std::to_string(pt->KPerBlock); - out += " -DCK_PARAM_MPerWave=" + std::to_string(pt->MPerWave) + - " -DCK_PARAM_NPerWave=" + std::to_string(pt->NPerWave) + - " -DCK_PARAM_K1=" + std::to_string(pt->K1) + - " -DCK_PARAM_MRepeat=" + std::to_string(pt->MRepeat) + - " -DCK_PARAM_NRepeat=" + std::to_string(pt->NRepeat); - - out += " -DCK_PARAM_ABlockTransferThreadSliceLengths_K0_M_K1=" + - std::to_string(pt->ABlockTransferThreadSliceLengths_K0_M_K1[0]) + "," + - std::to_string(pt->ABlockTransferThreadSliceLengths_K0_M_K1[1]) + "," + - std::to_string(pt->ABlockTransferThreadSliceLengths_K0_M_K1[2]); - - out += " -DCK_PARAM_ABlockTransferThreadClusterLengths_K0_M_K1=" + - std::to_string(pt->ABlockTransferThreadClusterLengths_K0_M_K1[0]) + "," + - std::to_string(pt->ABlockTransferThreadClusterLengths_K0_M_K1[1]) + "," + - std::to_string(pt->ABlockTransferThreadClusterLengths_K0_M_K1[2]); - - out += " -DCK_PARAM_ABlockTransferThreadClusterArrangeOrder=" + - std::to_string(pt->ABlockTransferThreadClusterArrangeOrder[0]) + "," + - std::to_string(pt->ABlockTransferThreadClusterArrangeOrder[1]) + "," + - std::to_string(pt->ABlockTransferThreadClusterArrangeOrder[2]); - - out += " -DCK_PARAM_ABlockTransferSrcAccessOrder=" + - std::to_string(pt->ABlockTransferSrcAccessOrder[0]) + "," + - std::to_string(pt->ABlockTransferSrcAccessOrder[1]) + "," + - std::to_string(pt->ABlockTransferSrcAccessOrder[2]); - - out += - " -DCK_PARAM_ABlockTransferSrcVectorDim=" + std::to_string(pt->ABlockTransferSrcVectorDim); - out += " -DCK_PARAM_ABlockTransferSrcScalarPerVector=" + - std::to_string(pt->ABlockTransferSrcScalarPerVector); - out += " -DCK_PARAM_ABlockTransferDstScalarPerVector_K1=" + - std::to_string(pt->ABlockTransferDstScalarPerVector_K1); - out += " -DCK_PARAM_AThreadTransferSrcResetCoordinateAfterRun=" + - std::to_string(pt->AThreadTransferSrcResetCoordinateAfterRun); - - out += " -DCK_PARAM_BBlockTransferThreadSliceLengths_K0_N_K1=" + - std::to_string(pt->BBlockTransferThreadSliceLengths_K0_N_K1[0]) + "," + - std::to_string(pt->BBlockTransferThreadSliceLengths_K0_N_K1[1]) + "," + - std::to_string(pt->BBlockTransferThreadSliceLengths_K0_N_K1[2]); - - out += " -DCK_PARAM_BBlockTransferThreadClusterLengths_K0_N_K1=" + - std::to_string(pt->BBlockTransferThreadClusterLengths_K0_N_K1[0]) + "," + - std::to_string(pt->BBlockTransferThreadClusterLengths_K0_N_K1[1]) + "," + - std::to_string(pt->BBlockTransferThreadClusterLengths_K0_N_K1[2]); - - out += " -DCK_PARAM_BBlockTransferThreadClusterArrangeOrder=" + - std::to_string(pt->BBlockTransferThreadClusterArrangeOrder[0]) + "," + - std::to_string(pt->BBlockTransferThreadClusterArrangeOrder[1]) + "," + - std::to_string(pt->BBlockTransferThreadClusterArrangeOrder[2]); - - out += " -DCK_PARAM_BBlockTransferSrcAccessOrder=" + - std::to_string(pt->BBlockTransferSrcAccessOrder[0]) + "," + - std::to_string(pt->BBlockTransferSrcAccessOrder[1]) + "," + - std::to_string(pt->BBlockTransferSrcAccessOrder[2]); - - out += - " -DCK_PARAM_BBlockTransferSrcVectorDim=" + std::to_string(pt->BBlockTransferSrcVectorDim); - out += " -DCK_PARAM_BBlockTransferSrcScalarPerVector=" + - std::to_string(pt->BBlockTransferSrcScalarPerVector); - out += " -DCK_PARAM_BBlockTransferDstScalarPerVector_K1=" + - std::to_string(pt->BBlockTransferDstScalarPerVector_K1); - out += " -DCK_PARAM_BThreadTransferSrcResetCoordinateAfterRun=" + - std::to_string(pt->BThreadTransferSrcResetCoordinateAfterRun); - - out += " -DCK_PARAM_CThreadTransferSrcDstAccessOrder=" + - std::to_string(pt->CThreadTransferSrcDstAccessOrder[0]) + "," + - std::to_string(pt->CThreadTransferSrcDstAccessOrder[1]) + "," + - std::to_string(pt->CThreadTransferSrcDstAccessOrder[2]) + "," + - std::to_string(pt->CThreadTransferSrcDstAccessOrder[3]) + "," + - std::to_string(pt->CThreadTransferSrcDstAccessOrder[4]) + "," + - std::to_string(pt->CThreadTransferSrcDstAccessOrder[5]) + "," + - std::to_string(pt->CThreadTransferSrcDstAccessOrder[6]) + "," + - std::to_string(pt->CThreadTransferSrcDstAccessOrder[7]); - - out += " -DCK_PARAM_CThreadTransferSrcDstVectorDim=" + - std::to_string(pt->CThreadTransferSrcDstVectorDim); - out += " -DCK_PARAM_CThreadTransferDstScalarPerVector=" + - std::to_string(pt->CThreadTransferDstScalarPerVector); - - return (out); -}; - -} // namespace detail_dyn_conv_fwd_v4r4_xdlops_nhwc_kyxc_nhwk - -template -void online_device_dynamic_convolution_forward_implicit_gemm_v4r4_xdlops_nhwc_kyxc_nhwk( - online_compile::Handle* handle, - const InLengths& in_n_hi_wi_c_lengths, - const WeiLengths& wei_k_y_x_c_lengths, - const OutLengths& out_n_ho_wo_k_lengths, - const ConvStrides& conv_strides, - const ConvDilations& conv_dilations, - const InLeftPads& in_left_pads, - const InRightPads& in_right_pads, - const Tensor& in_n_hi_wi_c, - const Tensor& wei_k_y_x_c, - Tensor& out_n_ho_wo_k, - const tunable_dyn_conv_fwd_v4r4_xdlops_nhwc_kyxc_nhwk* tunable, - ck::index_t nrepeat) -{ - using namespace ck; - using namespace detail_dyn_conv_fwd_v4r4_xdlops_nhwc_kyxc_nhwk; - using size_t = std::size_t; - - ///////////////////////////////////////////////////////////////////////////////////////////////////////////// - // The follow codes are only used for computing the grid_size, hasMainKBlockLoop, - // hasDoubleTailKBlockLoop - - constexpr auto I0 = Number<0>{}; - constexpr auto I1 = Number<1>{}; - constexpr auto I2 = Number<2>{}; - constexpr auto I3 = Number<3>{}; - - const auto in_n_hi_wi_c_desc = - make_dynamic_naive_tensor_descriptor_packed_v2(in_n_hi_wi_c_lengths); - const auto wei_k_y_x_c_desc = - make_dynamic_naive_tensor_descriptor_packed_v2(wei_k_y_x_c_lengths); - const auto out_n_ho_wo_k_desc = - make_dynamic_naive_tensor_descriptor_packed_v2(out_n_ho_wo_k_lengths); - - const auto n = in_n_hi_wi_c_desc.GetLength(I0); - const auto hi = in_n_hi_wi_c_desc.GetLength(I1); - const auto wi = in_n_hi_wi_c_desc.GetLength(I2); - const auto c = in_n_hi_wi_c_desc.GetLength(I3); - - const auto k = wei_k_y_x_c_desc.GetLength(I0); - const auto y = wei_k_y_x_c_desc.GetLength(I1); - const auto x = wei_k_y_x_c_desc.GetLength(I2); - - const auto ho = out_n_ho_wo_k_desc.GetLength(I1); - const auto wo = out_n_ho_wo_k_desc.GetLength(I2); - - const auto M = k; - const auto N = n * ho * wo; - const auto K = c * y * x; - const auto K0 = K / tunable->K1; - - const index_t grid_size = (M / tunable->MPerBlock) * (N / tunable->NPerBlock); - - // these buffers are usually provided by the user application - DeviceMem in_n_hi_wi_c_dev_buf(sizeof(TInWei) * in_n_hi_wi_c.mDesc.GetElementSpace()); - DeviceMem wei_k_y_x_c_dev_buf(sizeof(TInWei) * wei_k_y_x_c.mDesc.GetElementSpace()); - DeviceMem out_n_ho_wo_k_dev_buf(sizeof(TOut) * out_n_ho_wo_k.mDesc.GetElementSpace()); - - in_n_hi_wi_c_dev_buf.ToDevice(in_n_hi_wi_c.mData.data()); - wei_k_y_x_c_dev_buf.ToDevice(wei_k_y_x_c.mData.data()); - out_n_ho_wo_k_dev_buf.ToDevice(out_n_ho_wo_k.mData.data()); - - // these are workspace buffers that should be expressed to the user by the corresponding - // workspace API - DeviceMem workspace_buf(4096); - - void* a_k0_m_k1_grid_desc_dev_buf = workspace_buf.GetDeviceBuffer(); - void* b_k0_n_k1_grid_desc_dev_buf = - static_cast(static_cast(workspace_buf.GetDeviceBuffer()) + 1024); - void* c_m0_m1_m2_n_grid_desc_dev_buf = - static_cast(static_cast(workspace_buf.GetDeviceBuffer()) + 2048); - void* c_blockid_to_m0_n0_block_cluster_adaptor_dev_buf = - static_cast(static_cast(workspace_buf.GetDeviceBuffer()) + 3072); - - const std::vector vld = {static_cast(tunable->BlockSize), 1, 1}; - const std::vector vgd1 = {static_cast(tunable->BlockSize), 1, 1}; - const std::vector vgd2 = {static_cast(grid_size * tunable->BlockSize), 1, 1}; - - std::string program_name = - "dynamic_convolution_forward_implicit_gemm_v4r4_xdlops_nhwc_kyxc_nhwk.cpp"; - std::string algo_name = "implicit_gemm_conv_fwd_v4r4_xdlops_nhwc"; - - std::string param = " -std=c++17 "; - std::string network_config; - - param += get_definition_string_from_types() + " -DCK_USE_AMD_XDLOPS "; - param += get_definition_string_from_tunable(tunable); - - network_config = get_network_config_string_from_types() + "_" + - get_network_config_string_from_tunable(tunable); - - std::vector kernel1_times; - std::vector kernel2_times; - - for(index_t i = 0; i < nrepeat; ++i) - { - KernelTimer timer1, timer2; - std::string kernel_name; - - kernel_name = - "dynamic_convolution_forward_implicit_gemm_v4r4_xdlops_nhwc_kyxc_nhwk_prepare"; - auto network_config_1 = network_config + "_1"; - - timer1.Start(); - handle->AddKernel(algo_name, network_config_1, program_name, kernel_name, vld, vgd1, param)( - static_cast(in_n_hi_wi_c_lengths[I0]), - static_cast(in_n_hi_wi_c_lengths[I1]), - static_cast(in_n_hi_wi_c_lengths[I2]), - static_cast(in_n_hi_wi_c_lengths[I3]), - static_cast(wei_k_y_x_c_lengths[I0]), - static_cast(wei_k_y_x_c_lengths[I1]), - static_cast(wei_k_y_x_c_lengths[I2]), - conv_strides[I0], - conv_strides[I1], - conv_dilations[I0], - conv_dilations[I1], - in_left_pads[I0], - in_left_pads[I1], - in_right_pads[I0], - in_right_pads[I1], - a_k0_m_k1_grid_desc_dev_buf, - b_k0_n_k1_grid_desc_dev_buf, - c_m0_m1_m2_n_grid_desc_dev_buf, - c_blockid_to_m0_n0_block_cluster_adaptor_dev_buf); - timer1.End(); - - kernel_name = "dynamic_convolution_forward_implicit_gemm_v4r4_xdlops_nhwc_kyxc_nhwk"; - auto network_config_2 = network_config + "_2"; - - timer2.Start(); - handle->AddKernel(algo_name, network_config_2, program_name, kernel_name, vld, vgd2, param)( - reinterpret_cast(in_n_hi_wi_c_dev_buf.GetDeviceBuffer()), - reinterpret_cast(wei_k_y_x_c_dev_buf.GetDeviceBuffer()), - reinterpret_cast(out_n_ho_wo_k_dev_buf.GetDeviceBuffer()), - (const void*)(a_k0_m_k1_grid_desc_dev_buf), - (const void*)(b_k0_n_k1_grid_desc_dev_buf), - (const void*)(c_m0_m1_m2_n_grid_desc_dev_buf), - (const void*)(c_blockid_to_m0_n0_block_cluster_adaptor_dev_buf)); - timer2.End(); - - kernel1_times.push_back(timer1.GetElapsedTime()); - kernel2_times.push_back(timer2.GetElapsedTime()); - } - - { - auto ave_time1 = - std::accumulate( - std::next(kernel1_times.begin()), kernel1_times.end(), 0., std::plus{}) / - (nrepeat - 1); - auto ave_time2 = - std::accumulate( - std::next(kernel2_times.begin()), kernel2_times.end(), 0., std::plus{}) / - (nrepeat - 1); - - const auto N = in_n_hi_wi_c_lengths[I0]; - const auto C = in_n_hi_wi_c_lengths[I3]; - - const auto Ho = out_n_ho_wo_k_lengths[I1]; - const auto Wo = out_n_ho_wo_k_lengths[I2]; - const auto K = out_n_ho_wo_k_lengths[I3]; - - const auto Y = wei_k_y_x_c_lengths[I1]; - const auto X = wei_k_y_x_c_lengths[I2]; - - float perf = (float)(std::size_t(2) * N * K * Ho * Wo * C * Y * X) / - (std::size_t(1000) * 1000 * 1000) / ave_time2; - - std::cout << "Average time : " << ave_time1 + ave_time2 << " ms(" << ave_time1 << ", " - << ave_time2 << "), " << perf << " TFlop/s" << std::endl; - }; - - // copy result back to host - out_n_ho_wo_k_dev_buf.FromDevice(out_n_ho_wo_k.mData.data()); -} diff --git a/host/driver_online/include/online_device_dynamic_convolution_forward_implicit_gemm_v6r1_dlops_nchw_kcyx_nkhw.hpp b/host/driver_online/include/online_device_dynamic_convolution_forward_implicit_gemm_v6r1_dlops_nchw_kcyx_nkhw.hpp deleted file mode 100644 index 7b88ef02b4..0000000000 --- a/host/driver_online/include/online_device_dynamic_convolution_forward_implicit_gemm_v6r1_dlops_nchw_kcyx_nkhw.hpp +++ /dev/null @@ -1,183 +0,0 @@ -#pragma once -#include "device.hpp" -#include "host_tensor.hpp" -#include "handle.hpp" -#include "online_driver_common.hpp" -#include "convolution_problem_descriptor.hpp" -#include "dynamic_tensor_descriptor.hpp" -#include "dynamic_tensor_descriptor_helper.hpp" -#include "transform_forward_convolution_into_gemm_v6r1_nchw_kcyx_nkhw.hpp" -#include "conv_igemm_fwd_v6r1_dlops_nchw_kcyx_nkhw.hpp" - -template -void online_device_dynamic_convolution_forward_implicit_gemm_v6r1_dlops_nchw_kcyx_nkhw( - online_compile::Handle* handle, - const InLengths& in_n_c_hi_wi_lengths, - const WeiLengths& wei_k_c_y_x_lengths, - const OutLengths& out_n_k_ho_wo_lengths, - const ConvStrides& conv_strides, - const ConvDilations& conv_dilations, - const InLeftPads& in_left_pads, - const InRightPads& in_right_pads, - const Tensor& in_n_c_hi_wi, - const Tensor& wei_k_c_y_x, - Tensor& out_n_k_ho_wo, - const ck::driver::CompileParameterConvIgemmFwdV6r1DlopsNchwKcyxNkhw& compile_param, - ck::index_t nrepeat) -{ - using namespace ck; - using namespace ck::driver; - using size_t = std::size_t; - - std::cout << __func__ << std::endl; - - constexpr auto I0 = Number<0>{}; - constexpr auto I1 = Number<1>{}; - constexpr auto I2 = Number<2>{}; - constexpr auto I3 = Number<3>{}; - - ConvolutionProblemDescriptor conv_problem_desc{in_n_c_hi_wi_lengths[I0], - out_n_k_ho_wo_lengths[I1], - in_n_c_hi_wi_lengths[I1], - wei_k_c_y_x_lengths[I2], - wei_k_c_y_x_lengths[I3], - in_n_c_hi_wi_lengths[I2], - in_n_c_hi_wi_lengths[I3], - out_n_k_ho_wo_lengths[I2], - out_n_k_ho_wo_lengths[I3], - conv_strides[I0], - conv_strides[I1], - conv_dilations[I0], - conv_dilations[I1], - in_left_pads[I0], - in_left_pads[I1], - in_right_pads[I0], - in_right_pads[I1], - get_datatype_enum_from_type::value, - get_datatype_enum_from_type::value, - get_datatype_enum_from_type::value}; - - if(!ConvIgemmFwdV6r1DlopsNchwKcyxNkhw::IsValidCompileParameter(conv_problem_desc, - compile_param)) - { - throw std::runtime_error("wrong! IsValidCompileParameter fail"); - } - - DeviceMem in_n_c_hi_wi_dev_buf(sizeof(TInWei) * in_n_c_hi_wi.mDesc.GetElementSpace()); - DeviceMem wei_k_c_y_x_dev_buf(sizeof(TInWei) * wei_k_c_y_x.mDesc.GetElementSpace()); - DeviceMem out_n_k_ho_wo_dev_buf(sizeof(TOut) * out_n_k_ho_wo.mDesc.GetElementSpace()); - - in_n_c_hi_wi_dev_buf.ToDevice(in_n_c_hi_wi.mData.data()); - wei_k_c_y_x_dev_buf.ToDevice(wei_k_c_y_x.mData.data()); - out_n_k_ho_wo_dev_buf.ToDevice(out_n_k_ho_wo.mData.data()); - - // workspace is used for save transformed tensor descritpors created by prepare kernel - DeviceMem workspace_dev_buf( - ConvIgemmFwdV6r1DlopsNchwKcyxNkhw::GetWorkSpaceSize(conv_problem_desc, compile_param)); - - const auto block_size = std::size_t( - ConvIgemmFwdV6r1DlopsNchwKcyxNkhw::GetBlockSize(conv_problem_desc, compile_param)); - - const auto grid_size = std::size_t( - ConvIgemmFwdV6r1DlopsNchwKcyxNkhw::GetGridSize(conv_problem_desc, compile_param)); - - const std::vector vld1 = {1, 1, 1}; - const std::vector vgd1 = {1, 1, 1}; - - const std::vector vld2 = {static_cast(block_size), 1, 1}; - const std::vector vgd2 = {static_cast(grid_size * block_size), 1, 1}; - - std::string program_name = - "dynamic_convolution_forward_implicit_gemm_v6r1_dlops_nchw_kcyx_nkhw.cpp"; - std::string algo_name = "implicit_gemm_conv_fwd_v6r1_dlops_nchw"; - - std::string compile_param_string = - get_ck_hip_online_compile_common_flag() + compile_param.GetCompileParameterString(); - std::string network_config = compile_param_string; - - std::vector kernel1_times; - std::vector kernel2_times; - - for(index_t i = 0; i < nrepeat + 1; ++i) - { - KernelTimer timer1, timer2; - std::string kernel_name; - - kernel_name = "dynamic_convolution_forward_implicit_gemm_v6r1_dlops_nchw_kcyx_nkhw_prepare"; - auto network_config_1 = network_config + "_1"; - - timer1.Start(); - handle->AddKernel(algo_name, - network_config_1, - program_name, - kernel_name, - vld1, - vgd1, - compile_param_string)(static_cast(in_n_c_hi_wi_lengths[I0]), - static_cast(in_n_c_hi_wi_lengths[I1]), - static_cast(in_n_c_hi_wi_lengths[I2]), - static_cast(in_n_c_hi_wi_lengths[I3]), - static_cast(wei_k_c_y_x_lengths[I0]), - static_cast(wei_k_c_y_x_lengths[I2]), - static_cast(wei_k_c_y_x_lengths[I3]), - conv_strides[I0], - conv_strides[I1], - conv_dilations[I0], - conv_dilations[I1], - in_left_pads[I0], - in_left_pads[I1], - in_right_pads[I0], - in_right_pads[I1], - (void*)(workspace_dev_buf.GetDeviceBuffer())); - timer1.End(); - - kernel_name = "dynamic_convolution_forward_implicit_gemm_v6r1_dlops_nchw_kcyx_nkhw"; - auto network_config_2 = network_config + "_2"; - - timer2.Start(); - handle->AddKernel(algo_name, - network_config_2, - program_name, - kernel_name, - vld2, - vgd2, - compile_param_string)( - reinterpret_cast(wei_k_c_y_x_dev_buf.GetDeviceBuffer()), - reinterpret_cast(in_n_c_hi_wi_dev_buf.GetDeviceBuffer()), - reinterpret_cast(out_n_k_ho_wo_dev_buf.GetDeviceBuffer()), - (const void*)(workspace_dev_buf.GetDeviceBuffer())); - timer2.End(); - - kernel1_times.push_back(timer1.GetElapsedTime()); - kernel2_times.push_back(timer2.GetElapsedTime()); - } - - { - auto ave_time1 = - std::accumulate( - std::next(kernel1_times.begin()), kernel1_times.end(), 0., std::plus{}) / - nrepeat; - auto ave_time2 = - std::accumulate( - std::next(kernel2_times.begin()), kernel2_times.end(), 0., std::plus{}) / - nrepeat; - - float perf = (float)(conv_problem_desc.CalculateFlop()) / - (std::size_t(1000) * 1000 * 1000) / (ave_time1 + ave_time2); - - std::cout << "Average time : " << ave_time1 + ave_time2 << " ms(" << ave_time1 << ", " - << ave_time2 << "), " << perf << " TFlop/s" << std::endl; - }; - - // copy result back to host - out_n_k_ho_wo_dev_buf.FromDevice(out_n_k_ho_wo.mData.data()); -} diff --git a/host/online_compile/CMakeLists.txt b/host/online_compile/CMakeLists.txt deleted file mode 100644 index 1b66703fcd..0000000000 --- a/host/online_compile/CMakeLists.txt +++ /dev/null @@ -1,168 +0,0 @@ -set(CMAKE_CXX_COMPILER /opt/rocm/llvm/bin/clang++) - -## for online-compiling of HIP kernels -set(OLC_HIP_COMPILER ${CMAKE_CXX_COMPILER} CACHE PATH "") - -## reset to avoid the C++ options from the parent project -set(CMAKE_CXX_FLAGS "") -message("Compiling options for library and kernels: ${CMAKE_CXX_FLAGS}") - -# look for and register clang-offload-bundler -if(OLC_HIP_COMPILER MATCHES ".*clang\\+\\+$") - find_program(OLC_OFFLOADBUNDLER_BIN clang-offload-bundler - PATH_SUFFIXES bin - PATHS - /opt/rocm/llvm - ${CMAKE_INSTALL_PREFIX}/llvm - ) -endif() - -if(OLC_OFFLOADBUNDLER_BIN) - message(STATUS "clang-offload-bundler found: ${OLC_OFFLOADBUNDLER_BIN}") - set(OLC_OFFLOADBUNDLER_BIN "${OLC_OFFLOADBUNDLER_BIN}") -else() - # look for and register extractkernel - message(STATUS "clang-offload-bundler not found") - - find_program(EXTRACTKERNEL_BIN extractkernel - PATH_SUFFIXES bin - PATHS - /opt/rocm/hip - /opt/rocm/hcc - /opt/rocm - ${CMAKE_INSTALL_PREFIX}/hip - ${CMAKE_INSTALL_PREFIX}/hcc - ${CMAKE_INSTALL_PREFIX} - - ) - if(EXTRACTKERNEL_BIN) - message(STATUS "extractkernel found: ${EXTRACTKERNEL_BIN}") - set(EXTRACTKERNEL_BIN "${EXTRACTKERNEL_BIN}") - else() - message(FATAL_ERROR "extractkernel not found") - endif() -endif() - -option(Boost_USE_STATIC_LIBS "Use boost static libraries" OFF) -set(BOOST_COMPONENTS filesystem) -add_definitions(-DBOOST_ALL_NO_LIB=1) -find_package(Boost REQUIRED COMPONENTS ${BOOST_COMPONENTS}) - -# HIP is always required -find_package(hip REQUIRED PATHS /opt/rocm) -message(STATUS "Build with HIP ${hip_VERSION}") -target_flags(HIP_COMPILER_FLAGS hip::device) -# Remove cuda arch flags -string(REGEX REPLACE --cuda-gpu-arch=[a-z0-9]+ "" HIP_COMPILER_FLAGS "${HIP_COMPILER_FLAGS}") -string(REGEX REPLACE --offload-arch=[a-z0-9]+ "" HIP_COMPILER_FLAGS "${HIP_COMPILER_FLAGS}") - -set(OLC_hip_VERSION_MAJOR "${hip_VERSION_MAJOR}") -set(OLC_hip_VERSION_MINOR "${hip_VERSION_MINOR}") -set(OLC_hip_VERSION_PATCH "${hip_VERSION_PATCH}") - -option(ENABLE_DEBUG "Build to enable debugging" ON) -if(ENABLE_DEBUG) - set(OLC_DEBUG 1) -else() - set(OLC_DEBUG 0) -endif() - -configure_file("${PROJECT_SOURCE_DIR}/host/online_compile/include/config.h.in" "${PROJECT_BINARY_DIR}/host/online_compile/include/config.h") - -include_directories(BEFORE - ${PROJECT_BINARY_DIR}/host/online_compile/include -) - -message(STATUS "Hip compiler flags: ${HIP_COMPILER_FLAGS}") - -## HIP_COMPILER_FLAGS will be used for on-line compiling of the HIP kernels -set(HIP_COMPILER_FLAGS "${HIP_COMPILER_FLAGS} ${HIP_ONLINE_COMPILER_FLAGS}") -add_definitions("-DHIP_COMPILER_FLAGS=${HIP_COMPILER_FLAGS}") - -file(GLOB_RECURSE COMPOSABLE_KERNEL_INCLUDE_1 "${PROJECT_SOURCE_DIR}/composable_kernel/include/*/*.hpp") -file(GLOB COMPOSABLE_KERNEL_INCLUDE_2 "${PROJECT_SOURCE_DIR}/external/rocm/include/bfloat16_dev.hpp") -set(MCONV_KERNEL_INCLUDES - ${COMPOSABLE_KERNEL_INCLUDE_1} - ${COMPOSABLE_KERNEL_INCLUDE_2} - ) - -file(GLOB_RECURSE MCONV_KERNELS "${PROJECT_SOURCE_DIR}/composable_kernel/src/kernel_wrapper/*.cpp") - -add_kernels(${CMAKE_CURRENT_SOURCE_DIR} "${MCONV_KERNELS}") -add_kernel_includes(${CMAKE_CURRENT_SOURCE_DIR} "${MCONV_KERNEL_INCLUDES}") - -set(ONLINE_COMPILATION_SOURCE - ${PROJECT_BINARY_DIR}/kernel.cpp - ${PROJECT_BINARY_DIR}/kernel_includes.cpp -) - -include_directories(BEFORE - ${PROJECT_BINARY_DIR}/host/online_compile/include - include -) - -set(OLC_HIP_UTILITY_CPPS - hip_utility/logger.cpp - hip_utility/tmp_dir.cpp - hip_utility/md5.cpp - hip_utility/exec_utils.cpp - hip_utility/target_properties.cpp - hip_utility/handlehip.cpp - hip_utility/kernel_build_params.cpp - hip_utility/hip_build_utils.cpp - hip_utility/hipoc_program.cpp - hip_utility/hipoc_kernel.cpp - hip_utility/kernel_cache.cpp - hip_utility/binary_cache.cpp - ) - -list(APPEND OLC_SOURCES ${OLC_HIP_UTILITY_CPPS} ${OLC_HIP_UTILITY_HEADERS}) - -## addkernels provide the tool to create inlined kernels in one header -add_subdirectory(addkernels) - -function(inline_kernels_src KERNELS KERNEL_INCLUDES) - set(KERNEL_SRC_HPP_FILENAME batch_all.cpp.hpp) - set(KERNEL_SRC_HPP_PATH ${PROJECT_BINARY_DIR}/inlined_kernels/${KERNEL_SRC_HPP_FILENAME}) - set(KERNEL_SRC_CPP_PATH ${PROJECT_BINARY_DIR}/inlined_kernels/batch_all.cpp) - - add_custom_command( - OUTPUT ${KERNEL_SRC_HPP_PATH} - WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} - DEPENDS addkernels ${KERNELS} ${KERNEL_INCLUDES} - COMMAND $ -target ${KERNEL_SRC_HPP_PATH} -extern -source ${KERNELS} - COMMENT "Inlining All kernels" - ) - configure_file(kernels_batch.cpp.in ${KERNEL_SRC_CPP_PATH}) - list(APPEND OLC_SOURCES ${KERNEL_SRC_CPP_PATH} ${KERNEL_SRC_HPP_PATH}) - - set(OLC_SOURCES ${OLC_SOURCES} PARENT_SCOPE) -endfunction() - -inline_kernels_src("${MCONV_KERNELS}" "${MCONV_KERNEL_INCLUDES}") - -list(APPEND ONLINE_COMPILATION_SOURCE ${OLC_SOURCES} ${PROJECT_BINARY_DIR}/olc_kernel_includes.h) - -add_custom_command( - OUTPUT ${PROJECT_BINARY_DIR}/olc_kernel_includes.h - WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} - DEPENDS addkernels ${MCONV_KERNEL_INCLUDES} - COMMAND $ -no-recurse -guard GUARD_OLC_KERNEL_INCLUDES_HPP_ -target ${PROJECT_BINARY_DIR}/olc_kernel_includes.h -source ${MCONV_KERNEL_INCLUDES} - COMMENT "Inlining HIP kernel includes" - ) - -## the library target -add_library(online_compile SHARED ${ONLINE_COMPILATION_SOURCE}) - -target_include_directories(online_compile PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/online_compile/include/) -target_include_directories(online_compile PRIVATE ${PROJECT_BINARY_DIR}) -target_include_directories(online_compile PRIVATE ${PROJECT_SOURCE_DIR}/external/half/include/) - -target_link_libraries(online_compile PRIVATE hip::device) -target_link_libraries(online_compile INTERFACE hip::host) -target_link_libraries(online_compile PRIVATE Boost::filesystem) - -target_compile_features(online_compile PUBLIC) -set_target_properties(online_compile PROPERTIES POSITION_INDEPENDENT_CODE ON) - -install(TARGETS online_compile LIBRARY DESTINATION lib) diff --git a/host/online_compile/addkernels/CMakeLists.txt b/host/online_compile/addkernels/CMakeLists.txt deleted file mode 100644 index 874cba6a5e..0000000000 --- a/host/online_compile/addkernels/CMakeLists.txt +++ /dev/null @@ -1,30 +0,0 @@ -################################################################################ -# -# MIT License -# -# Copyright (c) 2017 Advanced Micro Devices, Inc. -# -# Permission is hereby granted, free of charge, to any person obtaining a copy -# of this software and associated documentation files (the "Software"), to deal -# in the Software without restriction, including without limitation the rights -# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -# copies of the Software, and to permit persons to whom the Software is -# furnished to do so, subject to the following conditions: -# -# The above copyright notice and this permission notice shall be included in all -# copies or substantial portions of the Software. -# -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -# SOFTWARE. -# -################################################################################ - -set(ADD_KERNELS_SOURCE include_inliner.cpp addkernels.cpp) - -add_executable(addkernels EXCLUDE_FROM_ALL ${ADD_KERNELS_SOURCE}) - diff --git a/host/online_compile/addkernels/addkernels.cpp b/host/online_compile/addkernels/addkernels.cpp deleted file mode 100644 index 5be523d97b..0000000000 --- a/host/online_compile/addkernels/addkernels.cpp +++ /dev/null @@ -1,264 +0,0 @@ -/******************************************************************************* - * - * MIT License - * - * Copyright (c) 2021 Advanced Micro Devices, Inc. - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - * - *******************************************************************************/ -#include "include_inliner.hpp" -#include -#include -#include -#include -#include -#include -#include - -void Bin2Hex(std::istream& source, - std::ostream& target, - const std::string& variable, - bool nullTerminate, - size_t bufferSize, - size_t lineSize) -{ - source.seekg(0, std::ios::end); - std::unique_ptr buffer(new unsigned char[bufferSize]); - std::streamoff sourceSize = source.tellg(); - std::streamoff blockStart = 0; - - if(variable.length() != 0) - { - target << "extern const size_t " << variable << "_SIZE;" << std::endl; - target << "extern const unsigned char " << variable << "[];" << std::endl; - target << "const size_t " << variable << "_SIZE = " << std::setbase(10) << sourceSize << ";" - << std::endl; - target << "const unsigned char " << variable << "[] = {" << std::endl; - } - - target << std::setbase(16) << std::setfill('0'); - source.seekg(0, std::ios::beg); - - while(blockStart < sourceSize) - { - source.read(reinterpret_cast(buffer.get()), bufferSize); - - std::streamoff pos = source.tellg(); - std::streamoff blockSize = (pos < 0 ? sourceSize : pos) - blockStart; - std::streamoff i = 0; - - while(i < blockSize) - { - size_t j = i; - size_t end = std::min(i + lineSize, blockSize); - - for(; j < end; j++) - target << "0x" << std::setw(2) << static_cast(buffer[j]) << ","; - - target << std::endl; - i = end; - } - - blockStart += blockSize; - } - - if(nullTerminate) - target << "0x00," << std::endl; - - if(variable.length() != 0) - { - target << "};" << std::endl; - } -} - -void PrintHelp() -{ - std::cout << "Usage: bin2hex {