From 3d32ae9404f6f22afc78e95e981806d8d388b96b Mon Sep 17 00:00:00 2001 From: Chao Liu Date: Fri, 30 Jul 2021 17:50:17 -0500 Subject: [PATCH 01/29] add solver ConvIgemmFwdV6r1DlopsNchwKcyxNkhw; rename static ck source files --- external/half/include/half.hpp | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/external/half/include/half.hpp b/external/half/include/half.hpp index b698aac39f..25f543881f 100644 --- a/external/half/include/half.hpp +++ b/external/half/include/half.hpp @@ -2404,8 +2404,7 @@ unsigned int gamma(unsigned int arg) 0.0114684895434781459556 }; double t = arg + 4.65, s = p[0]; for(unsigned int i=0; i<5; ++i) s += p[i+1] / (arg+i); return std::log(s) + (arg-0.5)*std::log(t) - t; -*/ static const f31 pi(0xC90FDAA2, 1), - lbe(0xB8AA3B29, 0); +*/ static const f31 pi(0xC90FDAA2, 1), lbe(0xB8AA3B29, 0); unsigned int abs = arg & 0x7FFF, sign = arg & 0x8000; bool bsign = sign != 0; f31 z(abs), x = sign ? (z + f31(0x80000000, 0)) : z, t = x + f31(0x94CCCCCD, 2), From d09ea4f4e5aca0aec89badff827639d998ee1f0b Mon Sep 17 00:00:00 2001 From: Chao Liu Date: Fri, 6 Aug 2021 16:11:15 -0500 Subject: [PATCH 02/29] Update develop (#5) * refactor --- external/half/include/half.hpp | 3 +-- host/CMakeLists.txt | 2 +- host/driver_offline/CMakeLists.txt | 1 + host/driver_online/CMakeLists.txt | 7 ++--- host/driver_online/conv_fwd_driver_online.cpp | 4 +-- ...mplicit_gemm_v4r4_dlops_nchw_kcyx_nkhw.hpp | 2 +- ...plicit_gemm_v4r4_xdlops_nchw_kcyx_nkhw.hpp | 2 +- ...plicit_gemm_v4r4_xdlops_nhwc_kyxc_nhwk.hpp | 2 +- ...mplicit_gemm_v6r1_dlops_nchw_kcyx_nkhw.hpp | 10 +++---- .../include/online_driver_common.hpp | 7 +++++ .../CMakeLists.txt | 26 +++++++++---------- .../addkernels/CMakeLists.txt | 0 .../addkernels/addkernels.cpp | 0 .../addkernels/include_inliner.cpp | 0 .../addkernels/include_inliner.hpp | 0 .../addkernels/source_file_desc.hpp | 0 .../hip_utility/binary_cache.cpp | 14 +++++----- .../hip_utility/exec_utils.cpp | 8 +++--- .../hip_utility/handlehip.cpp | 18 ++++++------- .../hip_utility/hip_build_utils.cpp | 10 +++---- .../hip_utility/hipoc_kernel.cpp | 4 +-- .../hip_utility/hipoc_program.cpp | 8 +++--- .../hip_utility/kernel_build_params.cpp | 4 +-- .../hip_utility/kernel_cache.cpp | 4 +-- .../hip_utility/logger.cpp | 6 ++--- .../hip_utility/md5.cpp | 4 +-- .../hip_utility/target_properties.cpp | 6 ++--- .../hip_utility/tmp_dir.cpp | 8 +++--- .../include/binary_cache.hpp | 4 +-- .../include/config.h.in | 0 .../include/env.hpp | 10 +++---- .../include/exec_utils.hpp | 4 +-- .../include/handle.hpp | 4 +-- .../include/hipCheck.hpp | 0 .../include/hip_build_utils.hpp | 6 ++--- .../include/hipoc_kernel.hpp | 4 +-- .../include/hipoc_program.hpp | 4 +-- .../include/hipoc_program_impl.hpp | 4 +-- .../include/kernel.hpp | 4 +-- .../include/kernel_build_params.hpp | 4 +-- .../include/kernel_cache.hpp | 4 +-- .../include/logger.hpp | 4 +-- .../include/manage_ptr.hpp | 6 ++--- .../include/md5.hpp | 4 +-- .../include/op_kernel_args.hpp | 5 ++++ .../include/simple_hash.hpp | 4 +-- .../include/stringutils.hpp | 4 +-- .../include/target_properties.hpp | 4 +-- .../include/tmp_dir.hpp | 4 +-- .../include/write_file.hpp | 4 +-- .../kernel.cpp.in | 4 +-- .../kernel_includes.cpp.in | 4 +-- .../kernels_batch.cpp.in | 0 ...nv_igemm_fwd_v6r1_dlops_nchw_kcyx_nkhw.hpp | 0 ..._tunable_fwd_v4r4_dlops_nchw_kcyx_nkhw.hpp | 0 ...tunable_fwd_v4r4_xdlops_nchw_kcyx_nkhw.hpp | 0 ...tunable_fwd_v4r4_xdlops_nhwc_kyxc_nhwk.hpp | 0 .../convolution_problem_descriptor.hpp | 0 58 files changed, 136 insertions(+), 123 deletions(-) rename host/{online_compilation => online_compile}/CMakeLists.txt (83%) rename host/{online_compilation => online_compile}/addkernels/CMakeLists.txt (100%) rename host/{online_compilation => online_compile}/addkernels/addkernels.cpp (100%) rename host/{online_compilation => online_compile}/addkernels/include_inliner.cpp (100%) rename host/{online_compilation => online_compile}/addkernels/include_inliner.hpp (100%) rename host/{online_compilation => online_compile}/addkernels/source_file_desc.hpp (100%) rename host/{online_compilation => online_compile}/hip_utility/binary_cache.cpp (89%) rename host/{online_compilation => online_compile}/hip_utility/exec_utils.cpp (91%) rename host/{online_compilation => online_compile}/hip_utility/handlehip.cpp (93%) rename host/{online_compilation => online_compile}/hip_utility/hip_build_utils.cpp (97%) rename host/{online_compilation => online_compile}/hip_utility/hipoc_kernel.cpp (98%) rename host/{online_compilation => online_compile}/hip_utility/hipoc_program.cpp (96%) rename host/{online_compilation => online_compile}/hip_utility/kernel_build_params.cpp (97%) rename host/{online_compilation => online_compile}/hip_utility/kernel_cache.cpp (98%) rename host/{online_compilation => online_compile}/hip_utility/logger.cpp (88%) rename host/{online_compilation => online_compile}/hip_utility/md5.cpp (99%) rename host/{online_compilation => online_compile}/hip_utility/target_properties.cpp (96%) rename host/{online_compilation => online_compile}/hip_utility/tmp_dir.cpp (90%) rename host/{online_compilation => online_compile}/include/binary_cache.hpp (97%) rename host/{online_compilation => online_compile}/include/config.h.in (100%) rename host/{online_compilation => online_compile}/include/env.hpp (92%) rename host/{online_compilation => online_compile}/include/exec_utils.hpp (96%) rename host/{online_compilation => online_compile}/include/handle.hpp (98%) rename host/{online_compilation => online_compile}/include/hipCheck.hpp (100%) rename host/{online_compilation => online_compile}/include/hip_build_utils.hpp (96%) rename host/{online_compilation => online_compile}/include/hipoc_kernel.hpp (99%) rename host/{online_compilation => online_compile}/include/hipoc_program.hpp (97%) rename host/{online_compilation => online_compile}/include/hipoc_program_impl.hpp (97%) rename host/{online_compilation => online_compile}/include/kernel.hpp (96%) rename host/{online_compilation => online_compile}/include/kernel_build_params.hpp (98%) rename host/{online_compilation => online_compile}/include/kernel_cache.hpp (98%) rename host/{online_compilation => online_compile}/include/logger.hpp (84%) rename host/{online_compilation => online_compile}/include/manage_ptr.hpp (93%) rename host/{online_compilation => online_compile}/include/md5.hpp (66%) rename host/{online_compilation => online_compile}/include/op_kernel_args.hpp (94%) rename host/{online_compilation => online_compile}/include/simple_hash.hpp (96%) rename host/{online_compilation => online_compile}/include/stringutils.hpp (98%) rename host/{online_compilation => online_compile}/include/target_properties.hpp (97%) rename host/{online_compilation => online_compile}/include/tmp_dir.hpp (87%) rename host/{online_compilation => online_compile}/include/write_file.hpp (94%) rename host/{online_compilation => online_compile}/kernel.cpp.in (97%) rename host/{online_compilation => online_compile}/kernel_includes.cpp.in (97%) rename host/{online_compilation => online_compile}/kernels_batch.cpp.in (100%) rename host/{driver_online => solver}/include/conv_igemm_fwd_v6r1_dlops_nchw_kcyx_nkhw.hpp (100%) rename host/{driver_online => solver}/include/conv_tunable_fwd_v4r4_dlops_nchw_kcyx_nkhw.hpp (100%) rename host/{driver_online => solver}/include/conv_tunable_fwd_v4r4_xdlops_nchw_kcyx_nkhw.hpp (100%) rename host/{driver_online => solver}/include/conv_tunable_fwd_v4r4_xdlops_nhwc_kyxc_nhwk.hpp (100%) rename host/{driver_online => solver}/include/convolution_problem_descriptor.hpp (100%) diff --git a/external/half/include/half.hpp b/external/half/include/half.hpp index b698aac39f..25f543881f 100644 --- a/external/half/include/half.hpp +++ b/external/half/include/half.hpp @@ -2404,8 +2404,7 @@ unsigned int gamma(unsigned int arg) 0.0114684895434781459556 }; double t = arg + 4.65, s = p[0]; for(unsigned int i=0; i<5; ++i) s += p[i+1] / (arg+i); return std::log(s) + (arg-0.5)*std::log(t) - t; -*/ static const f31 pi(0xC90FDAA2, 1), - lbe(0xB8AA3B29, 0); +*/ static const f31 pi(0xC90FDAA2, 1), lbe(0xB8AA3B29, 0); unsigned int abs = arg & 0x7FFF, sign = arg & 0x8000; bool bsign = sign != 0; f31 z(abs), x = sign ? (z + f31(0x80000000, 0)) : z, t = x + f31(0x94CCCCCD, 2), diff --git a/host/CMakeLists.txt b/host/CMakeLists.txt index c9779398a6..26739efe34 100644 --- a/host/CMakeLists.txt +++ b/host/CMakeLists.txt @@ -1,4 +1,4 @@ add_subdirectory(host_tensor) -add_subdirectory(online_compilation) +add_subdirectory(online_compile) add_subdirectory(driver_offline) add_subdirectory(driver_online) diff --git a/host/driver_offline/CMakeLists.txt b/host/driver_offline/CMakeLists.txt index 85bd31fbca..927975d449 100644 --- a/host/driver_offline/CMakeLists.txt +++ b/host/driver_offline/CMakeLists.txt @@ -1,6 +1,7 @@ include_directories(BEFORE include ${PROJECT_SOURCE_DIR}/host/host_tensor/include + ${PROJECT_SOURCE_DIR}/host/solver/include ${PROJECT_SOURCE_DIR}/composable_kernel/include ${PROJECT_SOURCE_DIR}/composable_kernel/include/utility ${PROJECT_SOURCE_DIR}/composable_kernel/include/tensor_description diff --git a/host/driver_online/CMakeLists.txt b/host/driver_online/CMakeLists.txt index 2ae05e0ba5..077e3218a0 100644 --- a/host/driver_online/CMakeLists.txt +++ b/host/driver_online/CMakeLists.txt @@ -1,8 +1,9 @@ include_directories(BEFORE include - ${PROJECT_BINARY_DIR}/host/online_compilation/include - ${PROJECT_SOURCE_DIR}/host/online_compilation/include + ${PROJECT_BINARY_DIR}/host/online_compile/include + ${PROJECT_SOURCE_DIR}/host/online_compile/include ${PROJECT_SOURCE_DIR}/host/host_tensor/include + ${PROJECT_SOURCE_DIR}/host/solver/include ${PROJECT_SOURCE_DIR}/composable_kernel/include ${PROJECT_SOURCE_DIR}/composable_kernel/include/utility ${PROJECT_SOURCE_DIR}/composable_kernel/include/tensor_description @@ -18,4 +19,4 @@ set(CONV_FWD_DRIVER_ONLINE_SOURCE conv_fwd_driver_online.cpp) add_executable(conv_fwd_driver_online ${CONV_FWD_DRIVER_ONLINE_SOURCE}) target_link_libraries(conv_fwd_driver_online PRIVATE host_tensor) -target_link_libraries(conv_fwd_driver_online PRIVATE online_compilation) +target_link_libraries(conv_fwd_driver_online PRIVATE online_compile) diff --git a/host/driver_online/conv_fwd_driver_online.cpp b/host/driver_online/conv_fwd_driver_online.cpp index c91f76fa24..29609d5474 100644 --- a/host/driver_online/conv_fwd_driver_online.cpp +++ b/host/driver_online/conv_fwd_driver_online.cpp @@ -39,11 +39,11 @@ int main(int argc, char* argv[]) using size_t = std::size_t; hipStream_t stream; - olCompile::Handle* handle; + online_compile::Handle* handle; MY_HIP_CHECK(hipStreamCreate(&stream)); - handle = new olCompile::Handle(stream); + handle = new online_compile::Handle(stream); constexpr auto I0 = Number<0>{}; constexpr auto I1 = Number<1>{}; diff --git a/host/driver_online/include/online_device_dynamic_convolution_forward_implicit_gemm_v4r4_dlops_nchw_kcyx_nkhw.hpp b/host/driver_online/include/online_device_dynamic_convolution_forward_implicit_gemm_v4r4_dlops_nchw_kcyx_nkhw.hpp index 628bb6d96d..06412fba0b 100644 --- a/host/driver_online/include/online_device_dynamic_convolution_forward_implicit_gemm_v4r4_dlops_nchw_kcyx_nkhw.hpp +++ b/host/driver_online/include/online_device_dynamic_convolution_forward_implicit_gemm_v4r4_dlops_nchw_kcyx_nkhw.hpp @@ -216,7 +216,7 @@ template void online_device_dynamic_convolution_forward_implicit_gemm_v4r4_dlops_nchw_kcyx_nkhw( - olCompile::Handle* handle, + online_compile::Handle* handle, const InLengths& in_n_c_hi_wi_lengths, const WeiLengths& wei_k_c_y_x_lengths, const OutLengths& out_n_k_ho_wo_lengths, diff --git a/host/driver_online/include/online_device_dynamic_convolution_forward_implicit_gemm_v4r4_xdlops_nchw_kcyx_nkhw.hpp b/host/driver_online/include/online_device_dynamic_convolution_forward_implicit_gemm_v4r4_xdlops_nchw_kcyx_nkhw.hpp index 1e213b92e1..61ce41fe84 100644 --- a/host/driver_online/include/online_device_dynamic_convolution_forward_implicit_gemm_v4r4_xdlops_nchw_kcyx_nkhw.hpp +++ b/host/driver_online/include/online_device_dynamic_convolution_forward_implicit_gemm_v4r4_xdlops_nchw_kcyx_nkhw.hpp @@ -212,7 +212,7 @@ template void online_device_dynamic_convolution_forward_implicit_gemm_v4r4_xdlops_nchw_kcyx_nkhw( - olCompile::Handle* handle, + online_compile::Handle* handle, const InLengths& in_n_c_hi_wi_lengths, const WeiLengths& wei_k_c_y_x_lengths, const OutLengths& out_n_k_ho_wo_lengths, diff --git a/host/driver_online/include/online_device_dynamic_convolution_forward_implicit_gemm_v4r4_xdlops_nhwc_kyxc_nhwk.hpp b/host/driver_online/include/online_device_dynamic_convolution_forward_implicit_gemm_v4r4_xdlops_nhwc_kyxc_nhwk.hpp index 8eed1a9934..57724c7612 100644 --- a/host/driver_online/include/online_device_dynamic_convolution_forward_implicit_gemm_v4r4_xdlops_nhwc_kyxc_nhwk.hpp +++ b/host/driver_online/include/online_device_dynamic_convolution_forward_implicit_gemm_v4r4_xdlops_nhwc_kyxc_nhwk.hpp @@ -213,7 +213,7 @@ template void online_device_dynamic_convolution_forward_implicit_gemm_v4r4_xdlops_nhwc_kyxc_nhwk( - olCompile::Handle* handle, + online_compile::Handle* handle, const InLengths& in_n_hi_wi_c_lengths, const WeiLengths& wei_k_y_x_c_lengths, const OutLengths& out_n_ho_wo_k_lengths, diff --git a/host/driver_online/include/online_device_dynamic_convolution_forward_implicit_gemm_v6r1_dlops_nchw_kcyx_nkhw.hpp b/host/driver_online/include/online_device_dynamic_convolution_forward_implicit_gemm_v6r1_dlops_nchw_kcyx_nkhw.hpp index 260c94ee0e..92467a7668 100644 --- a/host/driver_online/include/online_device_dynamic_convolution_forward_implicit_gemm_v6r1_dlops_nchw_kcyx_nkhw.hpp +++ b/host/driver_online/include/online_device_dynamic_convolution_forward_implicit_gemm_v6r1_dlops_nchw_kcyx_nkhw.hpp @@ -20,7 +20,7 @@ template void online_device_dynamic_convolution_forward_implicit_gemm_v6r1_dlops_nchw_kcyx_nkhw( - olCompile::Handle* handle, + online_compile::Handle* handle, const InLengths& in_n_c_hi_wi_lengths, const WeiLengths& wei_k_c_y_x_lengths, const OutLengths& out_n_k_ho_wo_lengths, @@ -100,13 +100,13 @@ void online_device_dynamic_convolution_forward_implicit_gemm_v6r1_dlops_nchw_kcy "dynamic_convolution_forward_implicit_gemm_v6r1_dlops_nchw_kcyx_nkhw.cpp"; std::string algo_name = "implicit_gemm_conv_fwd_v6r1_dlops_nchw"; - std::string compile_param_string = " -std=c++17 " + compile_param.GetCompileParameterString(); + std::string compile_param_string = get_ck_hip_online_compile_common_flag() + compile_param.GetCompileParameterString(); std::string network_config = compile_param_string; std::vector kernel1_times; std::vector kernel2_times; - for(index_t i = 0; i < nrepeat; ++i) + for(index_t i = 0; i < nrepeat + 1; ++i) { KernelTimer timer1, timer2; std::string kernel_name; @@ -164,11 +164,11 @@ void online_device_dynamic_convolution_forward_implicit_gemm_v6r1_dlops_nchw_kcy auto ave_time1 = std::accumulate( std::next(kernel1_times.begin()), kernel1_times.end(), 0., std::plus{}) / - (nrepeat - 1); + nrepeat; auto ave_time2 = std::accumulate( std::next(kernel2_times.begin()), kernel2_times.end(), 0., std::plus{}) / - (nrepeat - 1); + nrepeat; float perf = (float)(conv_problem_desc.CalculateFlop()) / (std::size_t(1000) * 1000 * 1000) / (ave_time1 + ave_time2); diff --git a/host/driver_online/include/online_driver_common.hpp b/host/driver_online/include/online_driver_common.hpp index 472ffb52dc..d05a156d89 100644 --- a/host/driver_online/include/online_driver_common.hpp +++ b/host/driver_online/include/online_driver_common.hpp @@ -3,6 +3,13 @@ namespace ck_driver { +inline auto get_ck_hip_online_compile_common_flag() +{ + std::string param = " -std=c++17"; + + return param; +} + // greatest common divisor, aka highest common factor inline int gcd(int x, int y) { diff --git a/host/online_compilation/CMakeLists.txt b/host/online_compile/CMakeLists.txt similarity index 83% rename from host/online_compilation/CMakeLists.txt rename to host/online_compile/CMakeLists.txt index 02f6795308..1b66703fcd 100644 --- a/host/online_compilation/CMakeLists.txt +++ b/host/online_compile/CMakeLists.txt @@ -67,10 +67,10 @@ else() set(OLC_DEBUG 0) endif() -configure_file("${PROJECT_SOURCE_DIR}/host/online_compilation/include/config.h.in" "${PROJECT_BINARY_DIR}/host/online_compilation/include/config.h") +configure_file("${PROJECT_SOURCE_DIR}/host/online_compile/include/config.h.in" "${PROJECT_BINARY_DIR}/host/online_compile/include/config.h") include_directories(BEFORE - ${PROJECT_BINARY_DIR}/host/online_compilation/include + ${PROJECT_BINARY_DIR}/host/online_compile/include ) message(STATUS "Hip compiler flags: ${HIP_COMPILER_FLAGS}") @@ -97,7 +97,7 @@ set(ONLINE_COMPILATION_SOURCE ) include_directories(BEFORE - ${PROJECT_BINARY_DIR}/host/online_compilation/include + ${PROJECT_BINARY_DIR}/host/online_compile/include include ) @@ -152,17 +152,17 @@ add_custom_command( ) ## the library target -add_library(online_compilation SHARED ${ONLINE_COMPILATION_SOURCE}) +add_library(online_compile SHARED ${ONLINE_COMPILATION_SOURCE}) -target_include_directories(online_compilation PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/online_compilation/include/) -target_include_directories(online_compilation PRIVATE ${PROJECT_BINARY_DIR}) -target_include_directories(online_compilation PRIVATE ${PROJECT_SOURCE_DIR}/external/half/include/) +target_include_directories(online_compile PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/online_compile/include/) +target_include_directories(online_compile PRIVATE ${PROJECT_BINARY_DIR}) +target_include_directories(online_compile PRIVATE ${PROJECT_SOURCE_DIR}/external/half/include/) -target_link_libraries(online_compilation PRIVATE hip::device) -target_link_libraries(online_compilation INTERFACE hip::host) -target_link_libraries(online_compilation PRIVATE Boost::filesystem) +target_link_libraries(online_compile PRIVATE hip::device) +target_link_libraries(online_compile INTERFACE hip::host) +target_link_libraries(online_compile PRIVATE Boost::filesystem) -target_compile_features(online_compilation PUBLIC) -set_target_properties(online_compilation PROPERTIES POSITION_INDEPENDENT_CODE ON) +target_compile_features(online_compile PUBLIC) +set_target_properties(online_compile PROPERTIES POSITION_INDEPENDENT_CODE ON) -install(TARGETS online_compilation LIBRARY DESTINATION lib) +install(TARGETS online_compile LIBRARY DESTINATION lib) diff --git a/host/online_compilation/addkernels/CMakeLists.txt b/host/online_compile/addkernels/CMakeLists.txt similarity index 100% rename from host/online_compilation/addkernels/CMakeLists.txt rename to host/online_compile/addkernels/CMakeLists.txt diff --git a/host/online_compilation/addkernels/addkernels.cpp b/host/online_compile/addkernels/addkernels.cpp similarity index 100% rename from host/online_compilation/addkernels/addkernels.cpp rename to host/online_compile/addkernels/addkernels.cpp diff --git a/host/online_compilation/addkernels/include_inliner.cpp b/host/online_compile/addkernels/include_inliner.cpp similarity index 100% rename from host/online_compilation/addkernels/include_inliner.cpp rename to host/online_compile/addkernels/include_inliner.cpp diff --git a/host/online_compilation/addkernels/include_inliner.hpp b/host/online_compile/addkernels/include_inliner.hpp similarity index 100% rename from host/online_compilation/addkernels/include_inliner.hpp rename to host/online_compile/addkernels/include_inliner.hpp diff --git a/host/online_compilation/addkernels/source_file_desc.hpp b/host/online_compile/addkernels/source_file_desc.hpp similarity index 100% rename from host/online_compilation/addkernels/source_file_desc.hpp rename to host/online_compile/addkernels/source_file_desc.hpp diff --git a/host/online_compilation/hip_utility/binary_cache.cpp b/host/online_compile/hip_utility/binary_cache.cpp similarity index 89% rename from host/online_compilation/hip_utility/binary_cache.cpp rename to host/online_compile/hip_utility/binary_cache.cpp index f2f47a1a31..b899d1e296 100644 --- a/host/online_compilation/hip_utility/binary_cache.cpp +++ b/host/online_compile/hip_utility/binary_cache.cpp @@ -35,7 +35,7 @@ #include #include -namespace olCompile { +namespace online_compile { OLC_DECLARE_ENV_VAR(OLC_DISABLE_CACHE) OLC_DECLARE_ENV_VAR(HOME) @@ -62,14 +62,14 @@ boost::filesystem::path GetCachePath() return user_path; } -static bool IsCacheDisabled() { return olCompile::IsEnabled(OLC_DISABLE_CACHE{}); } +static bool IsCacheDisabled() { return online_compile::IsEnabled(OLC_DISABLE_CACHE{}); } boost::filesystem::path GetCacheFile(const std::string& device, const std::string& name, const std::string& args) { - // std::string filename = (is_kernel_str ? olCompile::md5(name) : name) + ".o"; + // std::string filename = (is_kernel_str ? online_compile::md5(name) : name) + ".o"; std::string filename = name + ".o"; - return GetCachePath() / olCompile::md5(device + ":" + args) / filename; + return GetCachePath() / online_compile::md5(device + ":" + args) / filename; } boost::filesystem::path LoadBinary(const TargetProperties& target, @@ -77,7 +77,7 @@ boost::filesystem::path LoadBinary(const TargetProperties& target, const std::string& name, const std::string& args) { - if(olCompile::IsCacheDisabled()) + if(online_compile::IsCacheDisabled()) return {}; (void)num_cu; @@ -97,7 +97,7 @@ void SaveBinary(const boost::filesystem::path& binary_path, const std::string& name, const std::string& args) { - if(olCompile::IsCacheDisabled()) + if(online_compile::IsCacheDisabled()) { boost::filesystem::remove(binary_path); } @@ -109,4 +109,4 @@ void SaveBinary(const boost::filesystem::path& binary_path, } } -} // namespace olCompile +} // namespace online_compile diff --git a/host/online_compilation/hip_utility/exec_utils.cpp b/host/online_compile/hip_utility/exec_utils.cpp similarity index 91% rename from host/online_compilation/hip_utility/exec_utils.cpp rename to host/online_compile/hip_utility/exec_utils.cpp index 60168c1a54..ec305783f1 100644 --- a/host/online_compilation/hip_utility/exec_utils.cpp +++ b/host/online_compile/hip_utility/exec_utils.cpp @@ -38,7 +38,7 @@ #include #endif // __linux__ -namespace olCompile { +namespace online_compile { namespace exec { int Run(const std::string& p, std::istream* in, std::ostream* out) @@ -53,7 +53,7 @@ int Run(const std::string& p, std::istream* in, std::ostream* out) OLC_MANAGE_PTR(FILE*, pclose) pipe{popen(p.c_str(), file_mode)}; if(!pipe) - throw std::runtime_error("olCompile::exec::Run(): popen(" + p + ", " + file_mode + + throw std::runtime_error("online_compile::exec::Run(): popen(" + p + ", " + file_mode + ") failed"); if(redirect_stdin || redirect_stdout) @@ -74,7 +74,7 @@ int Run(const std::string& p, std::istream* in, std::ostream* out) buffer[in->gcount()] = 0; if(fputs(buffer.data(), pipe.get()) == EOF) - throw std::runtime_error("olCompile::exec::Run(): fputs() failed"); + throw std::runtime_error("online_compile::exec::Run(): fputs() failed"); } } } @@ -90,4 +90,4 @@ int Run(const std::string& p, std::istream* in, std::ostream* out) } } // namespace exec -} // namespace olCompile +} // namespace online_compile diff --git a/host/online_compilation/hip_utility/handlehip.cpp b/host/online_compile/hip_utility/handlehip.cpp similarity index 93% rename from host/online_compilation/hip_utility/handlehip.cpp rename to host/online_compile/hip_utility/handlehip.cpp index f403b040f4..843957b4ad 100644 --- a/host/online_compilation/hip_utility/handlehip.cpp +++ b/host/online_compile/hip_utility/handlehip.cpp @@ -50,7 +50,7 @@ OLC_DECLARE_ENV_VAR(OLC_DEVICE_CU) -namespace olCompile { +namespace online_compile { std::size_t GetAvailableMemory() { @@ -182,24 +182,24 @@ KernelInvoke Handle::Run(Kernel k) const { return k.Invoke(this->GetStream()); } Program Handle::LoadProgram(const std::string& program_name, std::string params) const { - if((!olCompile::EndsWith(program_name, ".mlir-cpp")) && - (!olCompile::EndsWith(program_name, ".mlir"))) + if((!online_compile::EndsWith(program_name, ".mlir-cpp")) && + (!online_compile::EndsWith(program_name, ".mlir"))) { params += " -mcpu=" + this->GetTargetProperties().Name(); } - auto hsaco = olCompile::LoadBinary( + auto hsaco = online_compile::LoadBinary( this->GetTargetProperties(), this->GetMaxComputeUnits(), program_name, params); if(hsaco.empty()) { auto p = HIPOCProgram{program_name, params, this->GetTargetProperties()}; - auto path = olCompile::GetCachePath() / boost::filesystem::unique_path(); + auto path = online_compile::GetCachePath() / boost::filesystem::unique_path(); if(p.IsCodeObjectInMemory()) - olCompile::WriteFile(p.GetCodeObjectBlob(), path); + online_compile::WriteFile(p.GetCodeObjectBlob(), path); else boost::filesystem::copy_file(p.GetCodeObjectPathname(), path); - olCompile::SaveBinary(path, this->GetTargetProperties(), program_name, params); + online_compile::SaveBinary(path, this->GetTargetProperties(), program_name, params); return p; } @@ -245,7 +245,7 @@ std::size_t Handle::GetGlobalMemorySize() const std::size_t Handle::GetMaxComputeUnits() const { int result; - const char* const num_cu = olCompile::GetStringEnv(OLC_DEVICE_CU{}); + const char* const num_cu = online_compile::GetStringEnv(OLC_DEVICE_CU{}); if(num_cu != nullptr && strlen(num_cu) > 0) { return boost::lexical_cast(num_cu); @@ -282,4 +282,4 @@ std::ostream& Handle::Print(std::ostream& os) const return os; } -} // namespace olCompile +} // namespace online_compile diff --git a/host/online_compilation/hip_utility/hip_build_utils.cpp b/host/online_compile/hip_utility/hip_build_utils.cpp similarity index 97% rename from host/online_compilation/hip_utility/hip_build_utils.cpp rename to host/online_compile/hip_utility/hip_build_utils.cpp index e73c345937..99b786e606 100644 --- a/host/online_compilation/hip_utility/hip_build_utils.cpp +++ b/host/online_compile/hip_utility/hip_build_utils.cpp @@ -45,7 +45,7 @@ OLC_DECLARE_ENV_VAR(OLC_DEBUG_HIP_DUMP) #define OLC_HIP_COMPILER "/opt/rocm/llvm/bin/clang++" -namespace olCompile { +namespace online_compile { bool IsHccCompiler() { @@ -155,12 +155,12 @@ static boost::filesystem::path HipBuildImpl(boost::optional& tmp_dir, params += " -mllvm -amdgpu-function-calls=false"; } - if(olCompile::IsEnabled(OLC_DEBUG_HIP_VERBOSE{})) + if(online_compile::IsEnabled(OLC_DEBUG_HIP_VERBOSE{})) { params += " -v"; } - if(olCompile::IsEnabled(OLC_DEBUG_HIP_DUMP{})) + if(online_compile::IsEnabled(OLC_DEBUG_HIP_DUMP{})) { if(IsHccCompiler()) { @@ -247,7 +247,7 @@ static external_tool_version_t HipCompilerVersionImpl() break; std::stringstream out; - if(olCompile::exec::Run(path + " --version", nullptr, &out) != 0) + if(online_compile::exec::Run(path + " --version", nullptr, &out) != 0) break; std::string line; @@ -343,4 +343,4 @@ bool operator<=(const external_tool_version_t& lhs, const external_tool_version_ return !(lhs > rhs); } -} // namespace olCompile +} // namespace online_compile diff --git a/host/online_compilation/hip_utility/hipoc_kernel.cpp b/host/online_compile/hip_utility/hipoc_kernel.cpp similarity index 98% rename from host/online_compilation/hip_utility/hipoc_kernel.cpp rename to host/online_compile/hip_utility/hipoc_kernel.cpp index 41fcd92c94..a07d736ac1 100644 --- a/host/online_compilation/hip_utility/hipoc_kernel.cpp +++ b/host/online_compile/hip_utility/hipoc_kernel.cpp @@ -34,7 +34,7 @@ #include #include -namespace olCompile { +namespace online_compile { void HIPOCKernelInvoke::run(void* args, std::size_t size) const { @@ -81,4 +81,4 @@ HIPOCKernelInvoke HIPOCKernel::Invoke(hipStream_t stream, { return HIPOCKernelInvoke{stream, fun, ldims, gdims, name, callback}; } -} // namespace olCompile +} // namespace online_compile diff --git a/host/online_compilation/hip_utility/hipoc_program.cpp b/host/online_compile/hip_utility/hipoc_program.cpp similarity index 96% rename from host/online_compilation/hip_utility/hipoc_program.cpp rename to host/online_compile/hip_utility/hipoc_program.cpp index d2ea1fcb0c..81e03b72ab 100644 --- a/host/online_compilation/hip_utility/hipoc_program.cpp +++ b/host/online_compile/hip_utility/hipoc_program.cpp @@ -39,7 +39,7 @@ #include -namespace olCompile { +namespace online_compile { static hipModulePtr CreateModule(const boost::filesystem::path& hsaco_file) { @@ -89,7 +89,7 @@ void HIPOCProgramImpl::BuildCodeObjectInFile(std::string& params, this->dir.emplace(filename); hsaco_file = dir->path / (filename + ".o"); - if(olCompile::EndsWith(filename, ".cpp")) + if(online_compile::EndsWith(filename, ".cpp")) { hsaco_file = HipBuild(dir, filename, src, params, target); } @@ -104,7 +104,7 @@ void HIPOCProgramImpl::BuildCodeObject(std::string params) { std::string filename = program; - if(olCompile::EndsWith(filename, ".cpp")) + if(online_compile::EndsWith(filename, ".cpp")) { params += " -Wno-everything"; } @@ -136,4 +136,4 @@ std::string HIPOCProgram::GetCodeObjectBlob() const bool HIPOCProgram::IsCodeObjectInMemory() const { return !impl->binary.empty(); }; -} // namespace olCompile +} // namespace online_compile diff --git a/host/online_compilation/hip_utility/kernel_build_params.cpp b/host/online_compile/hip_utility/kernel_build_params.cpp similarity index 97% rename from host/online_compilation/hip_utility/kernel_build_params.cpp rename to host/online_compile/hip_utility/kernel_build_params.cpp index f9474796bc..e37974b1a3 100644 --- a/host/online_compilation/hip_utility/kernel_build_params.cpp +++ b/host/online_compile/hip_utility/kernel_build_params.cpp @@ -31,7 +31,7 @@ #include #include -namespace olCompile { +namespace online_compile { static std::string GenerateDefines(const std::vector& options, const std::string& prefix) @@ -63,4 +63,4 @@ static std::string GenerateDefines(const std::vector& opti return JoinStrings(strs, " "); } -} // namespace olCompile +} // namespace online_compile diff --git a/host/online_compilation/hip_utility/kernel_cache.cpp b/host/online_compile/hip_utility/kernel_cache.cpp similarity index 98% rename from host/online_compilation/hip_utility/kernel_cache.cpp rename to host/online_compile/hip_utility/kernel_cache.cpp index fff57c194e..dceb8de94e 100644 --- a/host/online_compilation/hip_utility/kernel_cache.cpp +++ b/host/online_compile/hip_utility/kernel_cache.cpp @@ -46,7 +46,7 @@ #include #include -namespace olCompile { +namespace online_compile { const std::vector& KernelCache::GetKernels(const std::string& algorithm, const std::string& network_config) @@ -151,4 +151,4 @@ void KernelCache::ClearKernels(const std::string& algorithm, const std::string& KernelCache::KernelCache() {} -} // namespace olCompile +} // namespace online_compile diff --git a/host/online_compilation/hip_utility/logger.cpp b/host/online_compile/hip_utility/logger.cpp similarity index 88% rename from host/online_compilation/hip_utility/logger.cpp rename to host/online_compile/hip_utility/logger.cpp index e8d31562a5..d84bb20908 100644 --- a/host/online_compilation/hip_utility/logger.cpp +++ b/host/online_compile/hip_utility/logger.cpp @@ -5,7 +5,7 @@ using namespace std; -namespace olCompile { +namespace online_compile { #if OLC_DEBUG static LogLevel defLevel = LogLevel::Info2; @@ -27,7 +27,7 @@ string LogLevelString(LogLevel level) ostream& fdt_log(LogLevel level, const char* header, const char* content) { - if(level > olCompile::defLevel) + if(level > online_compile::defLevel) { return (cerr); }; @@ -40,4 +40,4 @@ ostream& fdt_log(LogLevel level, const char* header, const char* content) ostream& fdt_log() { return (cerr); }; void fdt_log_flush() { cerr << endl; } -}; // namespace olCompile +}; // namespace online_compile diff --git a/host/online_compilation/hip_utility/md5.cpp b/host/online_compile/hip_utility/md5.cpp similarity index 99% rename from host/online_compilation/hip_utility/md5.cpp rename to host/online_compile/hip_utility/md5.cpp index ad31292ea7..24166decba 100644 --- a/host/online_compilation/hip_utility/md5.cpp +++ b/host/online_compile/hip_utility/md5.cpp @@ -298,7 +298,7 @@ static void MD5_Final(unsigned char* result, MD5_CTX* ctx) memset(ctx, 0, sizeof(*ctx)); } -namespace olCompile { +namespace online_compile { std::string md5(std::string s) { @@ -316,4 +316,4 @@ std::string md5(std::string s) return sout.str(); } -} // namespace olCompile +} // namespace online_compile diff --git a/host/online_compilation/hip_utility/target_properties.cpp b/host/online_compile/hip_utility/target_properties.cpp similarity index 96% rename from host/online_compilation/hip_utility/target_properties.cpp rename to host/online_compile/hip_utility/target_properties.cpp index 1d2bdef1c1..1de2852c91 100644 --- a/host/online_compilation/hip_utility/target_properties.cpp +++ b/host/online_compile/hip_utility/target_properties.cpp @@ -32,7 +32,7 @@ OLC_DECLARE_ENV_VAR(OLC_DEBUG_ENFORCE_DEVICE) -namespace olCompile { +namespace online_compile { static std::string GetDeviceNameFromMap(const std::string& in) { @@ -53,7 +53,7 @@ static std::string GetDeviceNameFromMap(const std::string& in) {"10.3.0 Sienna_Cichlid 18", "gfx1030"}, }; - const char* const p_asciz = olCompile::GetStringEnv(OLC_DEBUG_ENFORCE_DEVICE{}); + const char* const p_asciz = online_compile::GetStringEnv(OLC_DEBUG_ENFORCE_DEVICE{}); if(p_asciz != nullptr && strlen(p_asciz) > 0) return {p_asciz}; @@ -116,4 +116,4 @@ void TargetProperties::InitDbId() dbId += "_xnack"; } -} // namespace olCompile +} // namespace online_compile diff --git a/host/online_compilation/hip_utility/tmp_dir.cpp b/host/online_compile/hip_utility/tmp_dir.cpp similarity index 90% rename from host/online_compilation/hip_utility/tmp_dir.cpp rename to host/online_compile/hip_utility/tmp_dir.cpp index 6e5de6935d..bdef7cad06 100644 --- a/host/online_compilation/hip_utility/tmp_dir.cpp +++ b/host/online_compile/hip_utility/tmp_dir.cpp @@ -31,7 +31,7 @@ OLC_DECLARE_ENV_VAR(OLC_DEBUG_SAVE_TEMP_DIR) -namespace olCompile { +namespace online_compile { void SystemCmd(std::string cmd) { @@ -43,7 +43,7 @@ void SystemCmd(std::string cmd) TmpDir::TmpDir(std::string prefix) : path(boost::filesystem::temp_directory_path() / - boost::filesystem::unique_path("olCompile-" + prefix + "-%%%%-%%%%-%%%%-%%%%")) + boost::filesystem::unique_path("online_compile-" + prefix + "-%%%%-%%%%-%%%%-%%%%")) { boost::filesystem::create_directories(this->path); } @@ -57,10 +57,10 @@ void TmpDir::Execute(std::string exe, std::string args) const TmpDir::~TmpDir() { - if(!olCompile::IsEnabled(OLC_DEBUG_SAVE_TEMP_DIR{})) + if(!online_compile::IsEnabled(OLC_DEBUG_SAVE_TEMP_DIR{})) { boost::filesystem::remove_all(this->path); } } -} // namespace olCompile +} // namespace online_compile diff --git a/host/online_compilation/include/binary_cache.hpp b/host/online_compile/include/binary_cache.hpp similarity index 97% rename from host/online_compilation/include/binary_cache.hpp rename to host/online_compile/include/binary_cache.hpp index 5ff9f81093..c146bb9758 100644 --- a/host/online_compilation/include/binary_cache.hpp +++ b/host/online_compile/include/binary_cache.hpp @@ -31,7 +31,7 @@ #include #include -namespace olCompile { +namespace online_compile { boost::filesystem::path GetCacheFile(const std::string& device, const std::string& name, const std::string& args); @@ -47,6 +47,6 @@ void SaveBinary(const boost::filesystem::path& binary_path, const std::string& name, const std::string& args); -} // namespace olCompile +} // namespace online_compile #endif diff --git a/host/online_compilation/include/config.h.in b/host/online_compile/include/config.h.in similarity index 100% rename from host/online_compilation/include/config.h.in rename to host/online_compile/include/config.h.in diff --git a/host/online_compilation/include/env.hpp b/host/online_compile/include/env.hpp similarity index 92% rename from host/online_compilation/include/env.hpp rename to host/online_compile/include/env.hpp index 1d519a44d7..057a863269 100644 --- a/host/online_compilation/include/env.hpp +++ b/host/online_compile/include/env.hpp @@ -31,7 +31,7 @@ #include #include -namespace olCompile { +namespace online_compile { /// \todo Rework: Case-insensitive string compare, ODR, (?) move to .cpp @@ -101,23 +101,23 @@ inline const char* GetStringEnv(T) template inline bool IsEnabled(T) { - static const bool result = olCompile::IsEnvvarValueEnabled(T::value()); + static const bool result = online_compile::IsEnvvarValueEnabled(T::value()); return result; } template inline bool IsDisabled(T) { - static const bool result = olCompile::IsEnvvarValueDisabled(T::value()); + static const bool result = online_compile::IsEnvvarValueDisabled(T::value()); return result; } template inline unsigned long int Value(T, unsigned long int fallback = 0) { - static const auto result = olCompile::EnvvarValue(T::value(), fallback); + static const auto result = online_compile::EnvvarValue(T::value(), fallback); return result; } -} // namespace olCompile +} // namespace online_compile #endif diff --git a/host/online_compilation/include/exec_utils.hpp b/host/online_compile/include/exec_utils.hpp similarity index 96% rename from host/online_compilation/include/exec_utils.hpp rename to host/online_compile/include/exec_utils.hpp index bbad128d96..e257133ca4 100644 --- a/host/online_compilation/include/exec_utils.hpp +++ b/host/online_compile/include/exec_utils.hpp @@ -30,13 +30,13 @@ #include #include -namespace olCompile { +namespace online_compile { namespace exec { /// Redirecting both input and output is not supported. int Run(const std::string& p, std::istream* in, std::ostream* out); } // namespace exec -} // namespace olCompile +} // namespace online_compile #endif // EXEC_UTILS_HPP diff --git a/host/online_compilation/include/handle.hpp b/host/online_compile/include/handle.hpp similarity index 98% rename from host/online_compilation/include/handle.hpp rename to host/online_compile/include/handle.hpp index db93ee1445..8eda802a43 100644 --- a/host/online_compilation/include/handle.hpp +++ b/host/online_compile/include/handle.hpp @@ -40,7 +40,7 @@ #include #include -namespace olCompile { +namespace online_compile { struct HandleImpl; @@ -140,6 +140,6 @@ struct Handle inline std::ostream& operator<<(std::ostream& os, const Handle& handle) { return handle.Print(os); } -} // namespace olCompile +} // namespace online_compile #endif // GUARD_OLC_HANDLE_HPP_ diff --git a/host/online_compilation/include/hipCheck.hpp b/host/online_compile/include/hipCheck.hpp similarity index 100% rename from host/online_compilation/include/hipCheck.hpp rename to host/online_compile/include/hipCheck.hpp diff --git a/host/online_compilation/include/hip_build_utils.hpp b/host/online_compile/include/hip_build_utils.hpp similarity index 96% rename from host/online_compilation/include/hip_build_utils.hpp rename to host/online_compile/include/hip_build_utils.hpp index af456f846b..f93993edef 100644 --- a/host/online_compilation/include/hip_build_utils.hpp +++ b/host/online_compile/include/hip_build_utils.hpp @@ -31,9 +31,9 @@ #include #include -namespace olCompile { +namespace online_compile { -boost::filesystem::path HipBuild(boost::optional& tmp_dir, +boost::filesystem::path HipBuild(boost::optional& tmp_dir, const std::string& filename, std::string src, std::string params, @@ -92,6 +92,6 @@ class LcOptionTargetStrings } }; -} // namespace olCompile +} // namespace online_compile #endif diff --git a/host/online_compilation/include/hipoc_kernel.hpp b/host/online_compile/include/hipoc_kernel.hpp similarity index 99% rename from host/online_compilation/include/hipoc_kernel.hpp rename to host/online_compile/include/hipoc_kernel.hpp index 3bcf88f526..f6c5e1adf5 100644 --- a/host/online_compilation/include/hipoc_kernel.hpp +++ b/host/online_compile/include/hipoc_kernel.hpp @@ -36,7 +36,7 @@ #include #include -namespace olCompile { +namespace online_compile { using HipEventPtr = OLC_MANAGE_PTR(hipEvent_t, hipEventDestroy); inline HipEventPtr make_hip_event() @@ -169,6 +169,6 @@ struct HIPOCKernel std::function callback = nullptr) const; }; -} // namespace olCompile +} // namespace online_compile #endif diff --git a/host/online_compilation/include/hipoc_program.hpp b/host/online_compile/include/hipoc_program.hpp similarity index 97% rename from host/online_compilation/include/hipoc_program.hpp rename to host/online_compile/include/hipoc_program.hpp index 5296003cb7..c388bb35bf 100644 --- a/host/online_compilation/include/hipoc_program.hpp +++ b/host/online_compile/include/hipoc_program.hpp @@ -33,7 +33,7 @@ #include #include -namespace olCompile { +namespace online_compile { struct HIPOCProgramImpl; struct HIPOCProgram @@ -59,6 +59,6 @@ struct HIPOCProgram /// False if CO resides on filesystem. bool IsCodeObjectInMemory() const; }; -} // namespace olCompile +} // namespace online_compile #endif diff --git a/host/online_compilation/include/hipoc_program_impl.hpp b/host/online_compile/include/hipoc_program_impl.hpp similarity index 97% rename from host/online_compilation/include/hipoc_program_impl.hpp rename to host/online_compile/include/hipoc_program_impl.hpp index 2d8706b2e8..4e6b59d265 100644 --- a/host/online_compilation/include/hipoc_program_impl.hpp +++ b/host/online_compile/include/hipoc_program_impl.hpp @@ -33,7 +33,7 @@ #include #include -namespace olCompile { +namespace online_compile { using hipModulePtr = OLC_MANAGE_PTR(hipModule_t, hipModuleUnload); @@ -57,5 +57,5 @@ struct HIPOCProgramImpl BuildCodeObjectInFile(std::string& params, const std::string& src, const std::string& filename); void BuildCodeObject(std::string params); }; -} // namespace olCompile +} // namespace online_compile #endif // GUARD_OLC_HIPOC_PROGRAM_IMPL_HPP diff --git a/host/online_compilation/include/kernel.hpp b/host/online_compile/include/kernel.hpp similarity index 96% rename from host/online_compilation/include/kernel.hpp rename to host/online_compile/include/kernel.hpp index 73d6be61ad..7d1fd81242 100644 --- a/host/online_compilation/include/kernel.hpp +++ b/host/online_compile/include/kernel.hpp @@ -30,7 +30,7 @@ #include #include -namespace olCompile { +namespace online_compile { std::string GetKernelSrc(std::string name); std::string GetKernelInc(std::string key); std::vector GetKernelIncList(); @@ -40,6 +40,6 @@ using Kernel = HIPOCKernel; using KernelInvoke = HIPOCKernelInvoke; using Program = HIPOCProgram; -} // namespace olCompile +} // namespace online_compile #endif diff --git a/host/online_compilation/include/kernel_build_params.hpp b/host/online_compile/include/kernel_build_params.hpp similarity index 98% rename from host/online_compilation/include/kernel_build_params.hpp rename to host/online_compile/include/kernel_build_params.hpp index c15769ed27..30315ac9b7 100644 --- a/host/online_compilation/include/kernel_build_params.hpp +++ b/host/online_compile/include/kernel_build_params.hpp @@ -32,7 +32,7 @@ #include #include -namespace olCompile { +namespace online_compile { namespace kbp { struct Option @@ -132,6 +132,6 @@ class KernelBuildParameters } }; -} // namespace olCompile +} // namespace online_compile #endif diff --git a/host/online_compilation/include/kernel_cache.hpp b/host/online_compile/include/kernel_cache.hpp similarity index 98% rename from host/online_compilation/include/kernel_cache.hpp rename to host/online_compile/include/kernel_cache.hpp index 9f88327858..20d26f6102 100644 --- a/host/online_compilation/include/kernel_cache.hpp +++ b/host/online_compile/include/kernel_cache.hpp @@ -49,7 +49,7 @@ #include #include -namespace olCompile { +namespace online_compile { /** * @brief The KernelCache class Build and cache kernels @@ -92,6 +92,6 @@ class KernelCache ProgramMap program_map; }; -} // namespace olCompile +} // namespace online_compile #endif // GUARD_OLC_KERNEL_CACHE_HPP_ diff --git a/host/online_compilation/include/logger.hpp b/host/online_compile/include/logger.hpp similarity index 84% rename from host/online_compilation/include/logger.hpp rename to host/online_compile/include/logger.hpp index cc420d6e34..a397a868ba 100644 --- a/host/online_compilation/include/logger.hpp +++ b/host/online_compile/include/logger.hpp @@ -3,7 +3,7 @@ #include -namespace olCompile { +namespace online_compile { enum class LogLevel { @@ -18,6 +18,6 @@ std::ostream& fdt_log(LogLevel level, const char* header, const char* content); std::ostream& fdt_log(); void fdt_log_flush(); -}; // namespace olCompile +}; // namespace online_compile #endif diff --git a/host/online_compilation/include/manage_ptr.hpp b/host/online_compile/include/manage_ptr.hpp similarity index 93% rename from host/online_compilation/include/manage_ptr.hpp rename to host/online_compile/include/manage_ptr.hpp index c02c712475..f23807686b 100644 --- a/host/online_compilation/include/manage_ptr.hpp +++ b/host/online_compile/include/manage_ptr.hpp @@ -29,7 +29,7 @@ #include #include -namespace olCompile { +namespace online_compile { template struct manage_deleter @@ -68,9 +68,9 @@ using remove_ptr = typename std:: template using shared = std::shared_ptr>; -} // namespace olCompile +} // namespace online_compile #define OLC_MANAGE_PTR(T, F) \ - olCompile::manage_ptr::type, decltype(&F), &F> // NOLINT + online_compile::manage_ptr::type, decltype(&F), &F> // NOLINT #endif diff --git a/host/online_compilation/include/md5.hpp b/host/online_compile/include/md5.hpp similarity index 66% rename from host/online_compilation/include/md5.hpp rename to host/online_compile/include/md5.hpp index 1f350766e7..0fa25849a5 100644 --- a/host/online_compilation/include/md5.hpp +++ b/host/online_compile/include/md5.hpp @@ -3,10 +3,10 @@ #include -namespace olCompile { +namespace online_compile { std::string md5(std::string s); -} // namespace olCompile +} // namespace online_compile #endif diff --git a/host/online_compilation/include/op_kernel_args.hpp b/host/online_compile/include/op_kernel_args.hpp similarity index 94% rename from host/online_compilation/include/op_kernel_args.hpp rename to host/online_compile/include/op_kernel_args.hpp index 7d0420e8f5..eb483265ea 100644 --- a/host/online_compilation/include/op_kernel_args.hpp +++ b/host/online_compile/include/op_kernel_args.hpp @@ -6,6 +6,9 @@ #include #include + +namespace online_compile { + struct OpKernelArg { @@ -32,4 +35,6 @@ struct OpKernelArg bool is_ptr = false; }; +} // namespace online_compile + #endif diff --git a/host/online_compilation/include/simple_hash.hpp b/host/online_compile/include/simple_hash.hpp similarity index 96% rename from host/online_compilation/include/simple_hash.hpp rename to host/online_compile/include/simple_hash.hpp index c7dac54cfc..1afa2e2066 100644 --- a/host/online_compilation/include/simple_hash.hpp +++ b/host/online_compile/include/simple_hash.hpp @@ -29,7 +29,7 @@ #include -namespace olCompile { +namespace online_compile { struct SimpleHash { size_t operator()(const std::pair& p) const @@ -39,6 +39,6 @@ struct SimpleHash } }; -} // namespace olCompile +} // namespace online_compile #endif diff --git a/host/online_compilation/include/stringutils.hpp b/host/online_compile/include/stringutils.hpp similarity index 98% rename from host/online_compilation/include/stringutils.hpp rename to host/online_compile/include/stringutils.hpp index 6175c36ff4..71975f430a 100644 --- a/host/online_compilation/include/stringutils.hpp +++ b/host/online_compile/include/stringutils.hpp @@ -36,7 +36,7 @@ #define OLC_STRINGIZE_1(...) #__VA_ARGS__ #define OLC_STRINGIZE(...) OLC_STRINGIZE_1(__VA_ARGS__) -namespace olCompile { +namespace online_compile { inline std::string ReplaceString(std::string subject, const std::string& search, const std::string& replace) @@ -128,6 +128,6 @@ inline std::vector SplitSpaceSeparated(const std::string& in, return rv; } -} // namespace olCompile +} // namespace online_compile #endif // GUARD_OLC_STRINGUTILS_HPP diff --git a/host/online_compilation/include/target_properties.hpp b/host/online_compile/include/target_properties.hpp similarity index 97% rename from host/online_compilation/include/target_properties.hpp rename to host/online_compile/include/target_properties.hpp index 7918728130..349a63fdd5 100644 --- a/host/online_compilation/include/target_properties.hpp +++ b/host/online_compile/include/target_properties.hpp @@ -29,7 +29,7 @@ #include #include -namespace olCompile { +namespace online_compile { struct Handle; @@ -51,6 +51,6 @@ struct TargetProperties boost::optional sramecc_reported = boost::none; }; -} // namespace olCompile +} // namespace online_compile #endif // GUARD_OLC_TARGET_PROPERTIES_HPP diff --git a/host/online_compilation/include/tmp_dir.hpp b/host/online_compile/include/tmp_dir.hpp similarity index 87% rename from host/online_compilation/include/tmp_dir.hpp rename to host/online_compile/include/tmp_dir.hpp index 099a18bf74..3221786061 100644 --- a/host/online_compilation/include/tmp_dir.hpp +++ b/host/online_compile/include/tmp_dir.hpp @@ -4,7 +4,7 @@ #include #include -namespace olCompile { +namespace online_compile { void SystemCmd(std::string cmd); @@ -21,6 +21,6 @@ struct TmpDir ~TmpDir(); }; -} // namespace olCompile +} // namespace online_compile #endif diff --git a/host/online_compilation/include/write_file.hpp b/host/online_compile/include/write_file.hpp similarity index 94% rename from host/online_compilation/include/write_file.hpp rename to host/online_compile/include/write_file.hpp index f1ddb85237..098ff17abf 100644 --- a/host/online_compilation/include/write_file.hpp +++ b/host/online_compile/include/write_file.hpp @@ -5,7 +5,7 @@ #include #include -namespace olCompile { +namespace online_compile { using FilePtr = OLC_MANAGE_PTR(FILE*, std::fclose); @@ -25,6 +25,6 @@ inline void WriteFile(const std::vector& content, const boost::filesystem: throw std::runtime_error("Failed to write to file"); } -} // namespace olCompile +} // namespace online_compile #endif diff --git a/host/online_compilation/kernel.cpp.in b/host/online_compile/kernel.cpp.in similarity index 97% rename from host/online_compilation/kernel.cpp.in rename to host/online_compile/kernel.cpp.in index f67e7d1c6e..b9a9805284 100644 --- a/host/online_compilation/kernel.cpp.in +++ b/host/online_compile/kernel.cpp.in @@ -31,7 +31,7 @@ ${KERNELS_DECLS} // clang-format on -namespace olCompile { +namespace online_compile { const std::map& kernels() { @@ -67,4 +67,4 @@ std::string GetKernelSrc(std::string name) return it->second; } -} // namespace olCompile +} // namespace online_compile diff --git a/host/online_compilation/kernel_includes.cpp.in b/host/online_compile/kernel_includes.cpp.in similarity index 97% rename from host/online_compilation/kernel_includes.cpp.in rename to host/online_compile/kernel_includes.cpp.in index 24dc09e9ff..a7e6bd689b 100644 --- a/host/online_compilation/kernel_includes.cpp.in +++ b/host/online_compile/kernel_includes.cpp.in @@ -29,7 +29,7 @@ #include #include -namespace olCompile { +namespace online_compile { static inline bool EndsWith(const std::string& value, const std::string& suffix) { @@ -77,4 +77,4 @@ std::vector GetHipKernelIncList() return keys; } -} // namespace olCompile +} // namespace online_compile diff --git a/host/online_compilation/kernels_batch.cpp.in b/host/online_compile/kernels_batch.cpp.in similarity index 100% rename from host/online_compilation/kernels_batch.cpp.in rename to host/online_compile/kernels_batch.cpp.in diff --git a/host/driver_online/include/conv_igemm_fwd_v6r1_dlops_nchw_kcyx_nkhw.hpp b/host/solver/include/conv_igemm_fwd_v6r1_dlops_nchw_kcyx_nkhw.hpp similarity index 100% rename from host/driver_online/include/conv_igemm_fwd_v6r1_dlops_nchw_kcyx_nkhw.hpp rename to host/solver/include/conv_igemm_fwd_v6r1_dlops_nchw_kcyx_nkhw.hpp diff --git a/host/driver_online/include/conv_tunable_fwd_v4r4_dlops_nchw_kcyx_nkhw.hpp b/host/solver/include/conv_tunable_fwd_v4r4_dlops_nchw_kcyx_nkhw.hpp similarity index 100% rename from host/driver_online/include/conv_tunable_fwd_v4r4_dlops_nchw_kcyx_nkhw.hpp rename to host/solver/include/conv_tunable_fwd_v4r4_dlops_nchw_kcyx_nkhw.hpp diff --git a/host/driver_online/include/conv_tunable_fwd_v4r4_xdlops_nchw_kcyx_nkhw.hpp b/host/solver/include/conv_tunable_fwd_v4r4_xdlops_nchw_kcyx_nkhw.hpp similarity index 100% rename from host/driver_online/include/conv_tunable_fwd_v4r4_xdlops_nchw_kcyx_nkhw.hpp rename to host/solver/include/conv_tunable_fwd_v4r4_xdlops_nchw_kcyx_nkhw.hpp diff --git a/host/driver_online/include/conv_tunable_fwd_v4r4_xdlops_nhwc_kyxc_nhwk.hpp b/host/solver/include/conv_tunable_fwd_v4r4_xdlops_nhwc_kyxc_nhwk.hpp similarity index 100% rename from host/driver_online/include/conv_tunable_fwd_v4r4_xdlops_nhwc_kyxc_nhwk.hpp rename to host/solver/include/conv_tunable_fwd_v4r4_xdlops_nhwc_kyxc_nhwk.hpp diff --git a/host/driver_online/include/convolution_problem_descriptor.hpp b/host/solver/include/convolution_problem_descriptor.hpp similarity index 100% rename from host/driver_online/include/convolution_problem_descriptor.hpp rename to host/solver/include/convolution_problem_descriptor.hpp From cb95421311dfc625edf5e0c59aa243aac1b00268 Mon Sep 17 00:00:00 2001 From: Chao Liu Date: Fri, 6 Aug 2021 22:17:51 +0000 Subject: [PATCH 03/29] refactor --- host/driver_online/conv_fwd_driver_online.cpp | 2 +- ...n_forward_implicit_gemm_v4r4_dlops_nchw_kcyx_nkhw.hpp | 2 +- ..._forward_implicit_gemm_v4r4_xdlops_nchw_kcyx_nkhw.hpp | 2 +- ...n_forward_implicit_gemm_v6r1_dlops_nchw_kcyx_nkhw.hpp | 9 +++++---- host/driver_online/include/online_driver_common.hpp | 6 ++++-- .../include/conv_igemm_fwd_v6r1_dlops_nchw_kcyx_nkhw.hpp | 6 ++++-- host/solver/include/convolution_problem_descriptor.hpp | 6 ++++-- 7 files changed, 20 insertions(+), 13 deletions(-) diff --git a/host/driver_online/conv_fwd_driver_online.cpp b/host/driver_online/conv_fwd_driver_online.cpp index 29609d5474..53e6179aa6 100644 --- a/host/driver_online/conv_fwd_driver_online.cpp +++ b/host/driver_online/conv_fwd_driver_online.cpp @@ -35,7 +35,7 @@ enum ConvForwardAlgo int main(int argc, char* argv[]) { using namespace ck; - using namespace ck_driver; + using namespace ck::driver; using size_t = std::size_t; hipStream_t stream; diff --git a/host/driver_online/include/online_device_dynamic_convolution_forward_implicit_gemm_v4r4_dlops_nchw_kcyx_nkhw.hpp b/host/driver_online/include/online_device_dynamic_convolution_forward_implicit_gemm_v4r4_dlops_nchw_kcyx_nkhw.hpp index 06412fba0b..419b8ca95d 100644 --- a/host/driver_online/include/online_device_dynamic_convolution_forward_implicit_gemm_v4r4_dlops_nchw_kcyx_nkhw.hpp +++ b/host/driver_online/include/online_device_dynamic_convolution_forward_implicit_gemm_v4r4_dlops_nchw_kcyx_nkhw.hpp @@ -231,7 +231,7 @@ void online_device_dynamic_convolution_forward_implicit_gemm_v4r4_dlops_nchw_kcy ck::index_t nrepeat) { using namespace ck; - using namespace ck_driver; + using namespace ck::driver; using namespace detail_dyn_conv_fwd_v4r4_nchw_kcyx_nkhw; using size_t = std::size_t; diff --git a/host/driver_online/include/online_device_dynamic_convolution_forward_implicit_gemm_v4r4_xdlops_nchw_kcyx_nkhw.hpp b/host/driver_online/include/online_device_dynamic_convolution_forward_implicit_gemm_v4r4_xdlops_nchw_kcyx_nkhw.hpp index 61ce41fe84..46d065f615 100644 --- a/host/driver_online/include/online_device_dynamic_convolution_forward_implicit_gemm_v4r4_xdlops_nchw_kcyx_nkhw.hpp +++ b/host/driver_online/include/online_device_dynamic_convolution_forward_implicit_gemm_v4r4_xdlops_nchw_kcyx_nkhw.hpp @@ -227,7 +227,7 @@ void online_device_dynamic_convolution_forward_implicit_gemm_v4r4_xdlops_nchw_kc ck::index_t nrepeat) { using namespace ck; - using namespace ck_driver; + using namespace ck::driver; using namespace detail_dyn_conv_fwd_v4r4_xdlops_nchw_kcyx_nkhw; using size_t = std::size_t; diff --git a/host/driver_online/include/online_device_dynamic_convolution_forward_implicit_gemm_v6r1_dlops_nchw_kcyx_nkhw.hpp b/host/driver_online/include/online_device_dynamic_convolution_forward_implicit_gemm_v6r1_dlops_nchw_kcyx_nkhw.hpp index 92467a7668..7b88ef02b4 100644 --- a/host/driver_online/include/online_device_dynamic_convolution_forward_implicit_gemm_v6r1_dlops_nchw_kcyx_nkhw.hpp +++ b/host/driver_online/include/online_device_dynamic_convolution_forward_implicit_gemm_v6r1_dlops_nchw_kcyx_nkhw.hpp @@ -31,11 +31,11 @@ void online_device_dynamic_convolution_forward_implicit_gemm_v6r1_dlops_nchw_kcy const Tensor& in_n_c_hi_wi, const Tensor& wei_k_c_y_x, Tensor& out_n_k_ho_wo, - const ck_driver::CompileParameterConvIgemmFwdV6r1DlopsNchwKcyxNkhw& compile_param, + const ck::driver::CompileParameterConvIgemmFwdV6r1DlopsNchwKcyxNkhw& compile_param, ck::index_t nrepeat) { using namespace ck; - using namespace ck_driver; + using namespace ck::driver; using size_t = std::size_t; std::cout << __func__ << std::endl; @@ -100,8 +100,9 @@ void online_device_dynamic_convolution_forward_implicit_gemm_v6r1_dlops_nchw_kcy "dynamic_convolution_forward_implicit_gemm_v6r1_dlops_nchw_kcyx_nkhw.cpp"; std::string algo_name = "implicit_gemm_conv_fwd_v6r1_dlops_nchw"; - std::string compile_param_string = get_ck_hip_online_compile_common_flag() + compile_param.GetCompileParameterString(); - std::string network_config = compile_param_string; + std::string compile_param_string = + get_ck_hip_online_compile_common_flag() + compile_param.GetCompileParameterString(); + std::string network_config = compile_param_string; std::vector kernel1_times; std::vector kernel2_times; diff --git a/host/driver_online/include/online_driver_common.hpp b/host/driver_online/include/online_driver_common.hpp index d05a156d89..508a3594cd 100644 --- a/host/driver_online/include/online_driver_common.hpp +++ b/host/driver_online/include/online_driver_common.hpp @@ -1,7 +1,8 @@ #ifndef ONLINE_DRIVER_COMMON_HPP #define ONLINE_DRIVER_COMMON_HPP -namespace ck_driver { +namespace ck { +namespace driver { inline auto get_ck_hip_online_compile_common_flag() { @@ -47,5 +48,6 @@ auto gcd(X x, Ys... ys) return gcd(x, gcd(ys...)); } -} // namespace ck_driver +} // namespace driver +} // namespace ck #endif diff --git a/host/solver/include/conv_igemm_fwd_v6r1_dlops_nchw_kcyx_nkhw.hpp b/host/solver/include/conv_igemm_fwd_v6r1_dlops_nchw_kcyx_nkhw.hpp index b0c4921019..a30c2720ee 100644 --- a/host/solver/include/conv_igemm_fwd_v6r1_dlops_nchw_kcyx_nkhw.hpp +++ b/host/solver/include/conv_igemm_fwd_v6r1_dlops_nchw_kcyx_nkhw.hpp @@ -3,7 +3,8 @@ #include -namespace ck_driver { +namespace ck { +namespace driver { struct CompileParameterConvIgemmFwdV6r1DlopsNchwKcyxNkhw { @@ -669,5 +670,6 @@ struct ConvIgemmFwdV6r1DlopsNchwKcyxNkhw } }; -} // namespace ck_driver +} // namespace driver +} // namespace ck #endif diff --git a/host/solver/include/convolution_problem_descriptor.hpp b/host/solver/include/convolution_problem_descriptor.hpp index df9c110e70..8c0ecbee80 100644 --- a/host/solver/include/convolution_problem_descriptor.hpp +++ b/host/solver/include/convolution_problem_descriptor.hpp @@ -1,7 +1,8 @@ #ifndef CONVOLUTION_PROBLEM_DESCRIPTOR #define CONVOLUTION_PROBLEM_DESCRIPTOR -namespace ck_driver { +namespace ck { +namespace driver { struct ConvolutionProblemDescriptor { @@ -75,5 +76,6 @@ struct ConvolutionProblemDescriptor std::size_t CalculateFlop() const { return 2L * N * K * C * Y * X * Ho * Wo; } }; -} // namespace ck_driver +} // namespace driver +} // namespace ck #endif From ae98b52ad8be610bd6f8fdd1ffacc6ac70081379 Mon Sep 17 00:00:00 2001 From: Chao Liu Date: Sat, 7 Aug 2021 00:51:05 +0000 Subject: [PATCH 04/29] remove online compilation from CK --- CMakeLists.txt | 5 +- cmake/AddKernels.cmake | 40 -- cmake/TargetFlags.cmake | 50 -- host/CMakeLists.txt | 2 - host/driver_online/CMakeLists.txt | 22 - host/driver_online/conv_fwd_driver_online.cpp | 453 ------------------ ...mplicit_gemm_v4r4_dlops_nchw_kcyx_nkhw.hpp | 395 --------------- ...plicit_gemm_v4r4_xdlops_nchw_kcyx_nkhw.hpp | 386 --------------- ...plicit_gemm_v4r4_xdlops_nhwc_kyxc_nhwk.hpp | 389 --------------- ...mplicit_gemm_v6r1_dlops_nchw_kcyx_nkhw.hpp | 183 ------- host/online_compile/CMakeLists.txt | 168 ------- host/online_compile/addkernels/CMakeLists.txt | 30 -- host/online_compile/addkernels/addkernels.cpp | 264 ---------- .../addkernels/include_inliner.cpp | 213 -------- .../addkernels/include_inliner.hpp | 142 ------ .../addkernels/source_file_desc.hpp | 45 -- .../hip_utility/binary_cache.cpp | 112 ----- .../online_compile/hip_utility/exec_utils.cpp | 93 ---- host/online_compile/hip_utility/handlehip.cpp | 285 ----------- .../hip_utility/hip_build_utils.cpp | 346 ------------- .../hip_utility/hipoc_kernel.cpp | 84 ---- .../hip_utility/hipoc_program.cpp | 139 ------ .../hip_utility/kernel_build_params.cpp | 66 --- .../hip_utility/kernel_cache.cpp | 154 ------ host/online_compile/hip_utility/logger.cpp | 43 -- host/online_compile/hip_utility/md5.cpp | 319 ------------ .../hip_utility/target_properties.cpp | 119 ----- host/online_compile/hip_utility/tmp_dir.cpp | 66 --- host/online_compile/include/binary_cache.hpp | 52 -- host/online_compile/include/config.h.in | 47 -- host/online_compile/include/env.hpp | 123 ----- host/online_compile/include/exec_utils.hpp | 42 -- host/online_compile/include/handle.hpp | 145 ------ host/online_compile/include/hipCheck.hpp | 22 - .../include/hip_build_utils.hpp | 97 ---- host/online_compile/include/hipoc_kernel.hpp | 174 ------- host/online_compile/include/hipoc_program.hpp | 64 --- .../include/hipoc_program_impl.hpp | 61 --- host/online_compile/include/kernel.hpp | 45 -- .../include/kernel_build_params.hpp | 137 ------ host/online_compile/include/kernel_cache.hpp | 97 ---- host/online_compile/include/logger.hpp | 23 - host/online_compile/include/manage_ptr.hpp | 76 --- host/online_compile/include/md5.hpp | 12 - .../online_compile/include/op_kernel_args.hpp | 40 -- host/online_compile/include/simple_hash.hpp | 44 -- host/online_compile/include/stringutils.hpp | 133 ----- .../include/target_properties.hpp | 56 --- host/online_compile/include/tmp_dir.hpp | 26 - host/online_compile/include/write_file.hpp | 30 -- host/online_compile/kernel.cpp.in | 70 --- host/online_compile/kernel_includes.cpp.in | 80 ---- host/online_compile/kernels_batch.cpp.in | 1 - .../include/solver_common.hpp} | 11 +- 54 files changed, 3 insertions(+), 6318 deletions(-) delete mode 100644 cmake/AddKernels.cmake delete mode 100644 cmake/TargetFlags.cmake delete mode 100644 host/driver_online/CMakeLists.txt delete mode 100644 host/driver_online/conv_fwd_driver_online.cpp delete mode 100644 host/driver_online/include/online_device_dynamic_convolution_forward_implicit_gemm_v4r4_dlops_nchw_kcyx_nkhw.hpp delete mode 100644 host/driver_online/include/online_device_dynamic_convolution_forward_implicit_gemm_v4r4_xdlops_nchw_kcyx_nkhw.hpp delete mode 100644 host/driver_online/include/online_device_dynamic_convolution_forward_implicit_gemm_v4r4_xdlops_nhwc_kyxc_nhwk.hpp delete mode 100644 host/driver_online/include/online_device_dynamic_convolution_forward_implicit_gemm_v6r1_dlops_nchw_kcyx_nkhw.hpp delete mode 100644 host/online_compile/CMakeLists.txt delete mode 100644 host/online_compile/addkernels/CMakeLists.txt delete mode 100644 host/online_compile/addkernels/addkernels.cpp delete mode 100644 host/online_compile/addkernels/include_inliner.cpp delete mode 100644 host/online_compile/addkernels/include_inliner.hpp delete mode 100644 host/online_compile/addkernels/source_file_desc.hpp delete mode 100644 host/online_compile/hip_utility/binary_cache.cpp delete mode 100644 host/online_compile/hip_utility/exec_utils.cpp delete mode 100644 host/online_compile/hip_utility/handlehip.cpp delete mode 100644 host/online_compile/hip_utility/hip_build_utils.cpp delete mode 100644 host/online_compile/hip_utility/hipoc_kernel.cpp delete mode 100644 host/online_compile/hip_utility/hipoc_program.cpp delete mode 100644 host/online_compile/hip_utility/kernel_build_params.cpp delete mode 100644 host/online_compile/hip_utility/kernel_cache.cpp delete mode 100644 host/online_compile/hip_utility/logger.cpp delete mode 100644 host/online_compile/hip_utility/md5.cpp delete mode 100644 host/online_compile/hip_utility/target_properties.cpp delete mode 100644 host/online_compile/hip_utility/tmp_dir.cpp delete mode 100644 host/online_compile/include/binary_cache.hpp delete mode 100644 host/online_compile/include/config.h.in delete mode 100644 host/online_compile/include/env.hpp delete mode 100644 host/online_compile/include/exec_utils.hpp delete mode 100644 host/online_compile/include/handle.hpp delete mode 100644 host/online_compile/include/hipCheck.hpp delete mode 100644 host/online_compile/include/hip_build_utils.hpp delete mode 100644 host/online_compile/include/hipoc_kernel.hpp delete mode 100644 host/online_compile/include/hipoc_program.hpp delete mode 100644 host/online_compile/include/hipoc_program_impl.hpp delete mode 100644 host/online_compile/include/kernel.hpp delete mode 100644 host/online_compile/include/kernel_build_params.hpp delete mode 100644 host/online_compile/include/kernel_cache.hpp delete mode 100644 host/online_compile/include/logger.hpp delete mode 100644 host/online_compile/include/manage_ptr.hpp delete mode 100644 host/online_compile/include/md5.hpp delete mode 100644 host/online_compile/include/op_kernel_args.hpp delete mode 100644 host/online_compile/include/simple_hash.hpp delete mode 100644 host/online_compile/include/stringutils.hpp delete mode 100644 host/online_compile/include/target_properties.hpp delete mode 100644 host/online_compile/include/tmp_dir.hpp delete mode 100644 host/online_compile/include/write_file.hpp delete mode 100644 host/online_compile/kernel.cpp.in delete mode 100644 host/online_compile/kernel_includes.cpp.in delete mode 100644 host/online_compile/kernels_batch.cpp.in rename host/{driver_online/include/online_driver_common.hpp => solver/include/solver_common.hpp} (79%) diff --git a/CMakeLists.txt b/CMakeLists.txt index 0cf342bb45..fa5dcfe3ea 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1,11 +1,8 @@ cmake_minimum_required(VERSION 2.8.3) -project(modular_convolution) +project(composable_kernel) list(APPEND CMAKE_MODULE_PATH "${PROJECT_SOURCE_DIR}/cmake") -include(TargetFlags) -include(AddKernels) - ## C++ enable_language(CXX) set(CMAKE_CXX_STANDARD 17) diff --git a/cmake/AddKernels.cmake b/cmake/AddKernels.cmake deleted file mode 100644 index 429ecc47a9..0000000000 --- a/cmake/AddKernels.cmake +++ /dev/null @@ -1,40 +0,0 @@ - -function(add_kernels SRC_DIR KERNEL_FILES) - set(INIT_KERNELS_LIST) - set(KERNELS_DECLS) - foreach(KERNEL_FILE ${KERNEL_FILES}) - if("${CMAKE_VERSION}" VERSION_LESS 3.0) - configure_file(${KERNEL_FILE} ${KERNEL_FILE}.delete) - else() - set_property(DIRECTORY APPEND PROPERTY CMAKE_CONFIGURE_DEPENDS ${KERNEL_FILE}) - endif() - get_filename_component(BASE_NAME ${KERNEL_FILE} NAME_WE) - string(TOUPPER "${BASE_NAME}" KEY_NAME) - string(MAKE_C_IDENTIFIER "${KEY_NAME}" VAR_NAME) - string(APPEND KERNELS_DECLS "extern const size_t APP_KERNEL_${VAR_NAME}_SIZE;\n") - string(APPEND KERNELS_DECLS "extern const unsigned char APP_KERNEL_${VAR_NAME}[];\n") - list(APPEND INIT_KERNELS_LIST " { \"${KEY_NAME}\", std::string(reinterpret_cast(APP_KERNEL_${VAR_NAME}), APP_KERNEL_${VAR_NAME}_SIZE) }") - endforeach() - string(REPLACE ";" ",\n" INIT_KERNELS "${INIT_KERNELS_LIST}") - configure_file(${SRC_DIR}/kernel.cpp.in ${PROJECT_BINARY_DIR}/kernel.cpp) -endfunction() - -function(add_kernel_includes SRC_DIR KERNEL_FILES) - set(INIT_KERNELS_LIST) - foreach(KERNEL_FILE ${KERNEL_FILES}) - if("${CMAKE_VERSION}" VERSION_LESS 3.0) - configure_file(${KERNEL_FILE} ${KERNEL_FILE}.delete) - else() - set_property(DIRECTORY APPEND PROPERTY CMAKE_CONFIGURE_DEPENDS ${KERNEL_FILE}) - endif() - get_filename_component(BASE_NAME ${KERNEL_FILE} NAME_WE) - get_filename_component(FILE_NAME ${KERNEL_FILE} NAME) - string(TOUPPER "${BASE_NAME}" KEY_NAME) - string(MAKE_C_IDENTIFIER "${KEY_NAME}" VAR_NAME) - list(APPEND INIT_KERNELS_LIST " { \"${FILE_NAME}\", std::string(reinterpret_cast(${VAR_NAME}), ${VAR_NAME}_SIZE) }") - endforeach() - string(REPLACE ";" ",\n" INIT_KERNELS "${INIT_KERNELS_LIST}") - configure_file(${SRC_DIR}/kernel_includes.cpp.in ${PROJECT_BINARY_DIR}/kernel_includes.cpp) -endfunction() - - diff --git a/cmake/TargetFlags.cmake b/cmake/TargetFlags.cmake deleted file mode 100644 index 4f83fb5d39..0000000000 --- a/cmake/TargetFlags.cmake +++ /dev/null @@ -1,50 +0,0 @@ - -function(get_target_property2 VAR TARGET PROPERTY) - get_target_property(_pflags ${TARGET} ${PROPERTY}) - if(_pflags) - set(${VAR} ${_pflags} PARENT_SCOPE) - else() - set(${VAR} "" PARENT_SCOPE) - endif() -endfunction() - - -macro(append_flags FLAGS TARGET PROPERTY PREFIX) - get_target_property2(_pflags ${TARGET} ${PROPERTY}) - foreach(FLAG ${_pflags}) - if(TARGET ${FLAG}) - target_flags(_pflags2 ${FLAG}) - string(APPEND ${FLAGS} " ${_pflags2}") - else() - string(APPEND ${FLAGS} " ${PREFIX}${FLAG}") - endif() - endforeach() -endmacro() - -macro(append_link_flags FLAGS TARGET PROPERTY) - get_target_property2(_pflags ${TARGET} ${PROPERTY}) - foreach(FLAG ${_pflags}) - if(TARGET ${FLAG}) - target_flags(_pflags2 ${FLAG}) - string(APPEND ${FLAGS} " ${_pflags2}") - elseif(FLAG MATCHES "^-.*") - string(APPEND ${FLAGS} " ${FLAG}") - elseif(EXISTS ${FLAG}) - string(APPEND ${FLAGS} " ${FLAG}") - else() - string(APPEND ${FLAGS} " -l${FLAG}") - endif() - endforeach() -endmacro() - -function(target_flags FLAGS TARGET) - set(_flags) - append_flags(_flags ${TARGET} "INTERFACE_COMPILE_OPTIONS" "") - append_flags(_flags ${TARGET} "INTERFACE_COMPILE_DEFINITIONS" "-D") - append_flags(_flags ${TARGET} "INTERFACE_INCLUDE_DIRECTORIES" "-isystem ") - append_flags(_flags ${TARGET} "INTERFACE_LINK_DIRECTORIES" "-L ") - append_flags(_flags ${TARGET} "INTERFACE_LINK_OPTIONS" "") - append_link_flags(_flags ${TARGET} "INTERFACE_LINK_LIBRARIES" "") - # message("_flags: ${_flags}") - set(${FLAGS} ${_flags} PARENT_SCOPE) -endfunction() diff --git a/host/CMakeLists.txt b/host/CMakeLists.txt index 26739efe34..30cc14d8ca 100644 --- a/host/CMakeLists.txt +++ b/host/CMakeLists.txt @@ -1,4 +1,2 @@ add_subdirectory(host_tensor) -add_subdirectory(online_compile) add_subdirectory(driver_offline) -add_subdirectory(driver_online) diff --git a/host/driver_online/CMakeLists.txt b/host/driver_online/CMakeLists.txt deleted file mode 100644 index 077e3218a0..0000000000 --- a/host/driver_online/CMakeLists.txt +++ /dev/null @@ -1,22 +0,0 @@ -include_directories(BEFORE - include - ${PROJECT_BINARY_DIR}/host/online_compile/include - ${PROJECT_SOURCE_DIR}/host/online_compile/include - ${PROJECT_SOURCE_DIR}/host/host_tensor/include - ${PROJECT_SOURCE_DIR}/host/solver/include - ${PROJECT_SOURCE_DIR}/composable_kernel/include - ${PROJECT_SOURCE_DIR}/composable_kernel/include/utility - ${PROJECT_SOURCE_DIR}/composable_kernel/include/tensor_description - ${PROJECT_SOURCE_DIR}/composable_kernel/include/tensor_operation - ${PROJECT_SOURCE_DIR}/composable_kernel/include/problem_transform - ${PROJECT_SOURCE_DIR}/composable_kernel/include/driver - ${PROJECT_SOURCE_DIR}/external/rocm/include - ${PROJECT_SOURCE_DIR}/external/half/include -) - -set(CONV_FWD_DRIVER_ONLINE_SOURCE conv_fwd_driver_online.cpp) - -add_executable(conv_fwd_driver_online ${CONV_FWD_DRIVER_ONLINE_SOURCE}) - -target_link_libraries(conv_fwd_driver_online PRIVATE host_tensor) -target_link_libraries(conv_fwd_driver_online PRIVATE online_compile) diff --git a/host/driver_online/conv_fwd_driver_online.cpp b/host/driver_online/conv_fwd_driver_online.cpp deleted file mode 100644 index 53e6179aa6..0000000000 --- a/host/driver_online/conv_fwd_driver_online.cpp +++ /dev/null @@ -1,453 +0,0 @@ -#include -#include -#include -#include -#include -#include -#include "config.hpp" -#include "print.hpp" -#include "device.hpp" -#include "host_tensor.hpp" -#include "host_tensor_generator.hpp" -#include "conv_common.hpp" -#include "host_conv.hpp" -#include "device_tensor.hpp" -#include "handle.hpp" -#include "hipCheck.hpp" -#include "online_device_dynamic_convolution_forward_implicit_gemm_v4r4_dlops_nchw_kcyx_nkhw.hpp" -#include "online_device_dynamic_convolution_forward_implicit_gemm_v6r1_dlops_nchw_kcyx_nkhw.hpp" -#include "online_device_dynamic_convolution_forward_implicit_gemm_v4r4_xdlops_nchw_kcyx_nkhw.hpp" -#include "online_device_dynamic_convolution_forward_implicit_gemm_v4r4_xdlops_nhwc_kyxc_nhwk.hpp" - -#define USE_CONV_FWD_V4R4_NCHW 1 -#define USE_CONV_FWD_V6R1_NCHW 1 -#define USE_CONV_FWD_V4R4_XDLOPS_NCHW 1 -#define USE_CONV_FWD_V4R4_XDLOPS_NHWC 1 - -enum ConvForwardAlgo -{ - V4R4NCHW, // 0 - V6R1NCHW, // 1 - V4R4XDLNCHW, // 2 - V4R4XDLNHWC // 3 -}; - -int main(int argc, char* argv[]) -{ - using namespace ck; - using namespace ck::driver; - using size_t = std::size_t; - - hipStream_t stream; - online_compile::Handle* handle; - - MY_HIP_CHECK(hipStreamCreate(&stream)); - - handle = new online_compile::Handle(stream); - - constexpr auto I0 = Number<0>{}; - constexpr auto I1 = Number<1>{}; - constexpr auto I2 = Number<2>{}; - constexpr auto I3 = Number<3>{}; - constexpr auto I4 = Number<4>{}; - constexpr auto I5 = Number<5>{}; - constexpr auto I6 = Number<6>{}; - - if(argc != 22) - { - printf("arg1 to 5: layout, algo, do_verification, init_method, do_log, nrepeat\n"); - printf("rest: N, K, C, Y, X, Hi, Wi, Sy, Sx, Dy, Dx, LeftPy, LeftPx, RightPy, RightPx\n"); - exit(1); - } - - const ConvTensorLayout layout = static_cast(atoi(argv[1])); - const ConvForwardAlgo algo = static_cast(atoi(argv[2])); - const bool do_verification = atoi(argv[3]); - const int init_method = atoi(argv[4]); - const bool do_log = atoi(argv[5]); - const int nrepeat = atoi(argv[6]); - - const index_t N = atoi(argv[7]); - const index_t K = atoi(argv[8]); - const index_t C = atoi(argv[9]); - const index_t Y = atoi(argv[10]); - const index_t X = atoi(argv[11]); - const index_t Hi = atoi(argv[12]); - const index_t Wi = atoi(argv[13]); - - const index_t conv_stride_h = atoi(argv[14]); - const index_t conv_stride_w = atoi(argv[15]); - const index_t conv_dilation_h = atoi(argv[16]); - const index_t conv_dilation_w = atoi(argv[17]); - const index_t in_left_pad_h = atoi(argv[18]); - const index_t in_left_pad_w = atoi(argv[19]); - const index_t in_right_pad_h = atoi(argv[20]); - const index_t in_right_pad_w = atoi(argv[21]); - - const index_t YEff = (Y - 1) * conv_dilation_h + 1; - const index_t XEff = (X - 1) * conv_dilation_w + 1; - - const index_t Ho = (Hi + in_left_pad_h + in_right_pad_h - YEff) / conv_stride_h + 1; - const index_t Wo = (Wi + in_left_pad_w + in_right_pad_w - XEff) / conv_stride_w + 1; - -#if 1 - using in_data_t = float; - using acc_data_t = float; - using out_data_t = float; -#elif 0 - using in_data_t = half_t; - using acc_data_t = float; - using out_data_t = half_t; -#elif 1 - using in_data_t = int8_t; - using acc_data_t = int32_t; - using out_data_t = int8_t; -#endif - - std::vector in_lengths_host(4), wei_lengths_host(4), out_lengths_host(4); - - switch(layout) - { - case ConvTensorLayout::NCHW: - // NCHW - in_lengths_host[0] = static_cast(N); - in_lengths_host[1] = static_cast(C); - in_lengths_host[2] = static_cast(Hi); - in_lengths_host[3] = static_cast(Wi); - - wei_lengths_host[0] = static_cast(K); - wei_lengths_host[1] = static_cast(C); - wei_lengths_host[2] = static_cast(Y); - wei_lengths_host[3] = static_cast(X); - - out_lengths_host[0] = static_cast(N); - out_lengths_host[1] = static_cast(K); - out_lengths_host[2] = static_cast(Ho); - out_lengths_host[3] = static_cast(Wo); - break; - case ConvTensorLayout::NHWC: - // NHWC - in_lengths_host[0] = static_cast(N); - in_lengths_host[1] = static_cast(Hi); - in_lengths_host[2] = static_cast(Wi); - in_lengths_host[3] = static_cast(C); - - wei_lengths_host[0] = static_cast(K); - wei_lengths_host[1] = static_cast(Y); - wei_lengths_host[2] = static_cast(X); - wei_lengths_host[3] = static_cast(C); - - out_lengths_host[0] = static_cast(N); - out_lengths_host[1] = static_cast(Ho); - out_lengths_host[2] = static_cast(Wo); - out_lengths_host[3] = static_cast(K); - break; - default: throw std::runtime_error("wrong! not implemented"); - } - - Tensor in(in_lengths_host); - Tensor wei(wei_lengths_host); - Tensor out_host(out_lengths_host); - Tensor out_device(out_lengths_host); - - std::cout << "layout: " << layout << std::endl; - ostream_HostTensorDescriptor(in.mDesc, std::cout << "in: "); - ostream_HostTensorDescriptor(wei.mDesc, std::cout << "wei: "); - ostream_HostTensorDescriptor(out_host.mDesc, std::cout << "out: "); - print_array("InLeftPads", make_tuple(in_left_pad_h, in_left_pad_w)); - print_array("InRightPads", make_tuple(in_right_pad_h, in_right_pad_w)); - print_array("ConvStrides", make_tuple(conv_stride_h, conv_stride_w)); - print_array("ConvDilations", make_tuple(conv_dilation_h, conv_dilation_w)); - - std::size_t num_thread = std::thread::hardware_concurrency(); - - switch(init_method) - { - case 0: - // no initialization - break; - case 1: - in.GenerateTensorValue(GeneratorTensor_1{}, num_thread); - wei.GenerateTensorValue(GeneratorTensor_1{}, num_thread); - break; - case 2: - in.GenerateTensorValue(GeneratorTensor_1{}, num_thread); - wei.GenerateTensorValue(GeneratorTensor_2{-5, 5}, num_thread); - break; - case 3: - in.GenerateTensorValue(GeneratorTensor_2{-5, 5}, num_thread); - wei.GenerateTensorValue(GeneratorTensor_1{}, num_thread); - break; - case 4: - in.GenerateTensorValue(GeneratorTensor_2{-5, 5}, num_thread); - wei.GenerateTensorValue(GeneratorTensor_2{-5, 5}, num_thread); - break; - case 5: - in.GenerateTensorValue(GeneratorTensor_3{0.0, 1.0}, num_thread); - wei.GenerateTensorValue(GeneratorTensor_3{-0.5, 0.5}, num_thread); - break; - default: - in.GenerateTensorValue(GeneratorTensor_2{1, 5}, num_thread); - - auto gen_wei = [](auto... is) { - return GeneratorTensor_2{1, 5}(is...) * GeneratorTensor_Checkboard{}(is...); - }; - wei.GenerateTensorValue(gen_wei, num_thread); - } - - auto f_make_for_device_nchw = [&]() { - const auto in_lengths_dev = make_tuple(N, C, Hi, Wi); - const auto wei_lengths_dev = make_tuple(K, C, Y, X); - const auto out_lengths_dev = make_tuple(N, K, Ho, Wo); - - return make_tuple(in_lengths_dev, wei_lengths_dev, out_lengths_dev); - }; - - auto f_make_for_device_nhwc = [&]() { - const auto in_lengths_dev = make_tuple(N, Hi, Wi, C); - const auto wei_lengths_dev = make_tuple(K, Y, X, C); - const auto out_lengths_dev = make_tuple(N, Ho, Wo, K); - - return make_tuple(in_lengths_dev, wei_lengths_dev, out_lengths_dev); - }; - - const auto conv_strides = make_tuple(conv_stride_h, conv_stride_w); - const auto conv_dilations = make_tuple(conv_dilation_h, conv_dilation_w); - const auto in_left_pads = make_tuple(in_left_pad_h, in_left_pad_w); - const auto in_right_pads = make_tuple(in_right_pad_h, in_right_pad_w); - -#if USE_CONV_FWD_V4R4_NCHW - if(algo == ConvForwardAlgo::V4R4NCHW) - { - if(layout != ConvTensorLayout::NCHW) - { - throw std::runtime_error("wrong! layout"); - } - - const auto tmp = f_make_for_device_nchw(); - - tunable_dyn_conv_fwd_v4r4_dlops_nchw_kcyx_nkhw* tunable = - &default_tunable_dyn_conv_fwd_v4r4_dlops_nchw_kcyx_nkhw; - - online_device_dynamic_convolution_forward_implicit_gemm_v4r4_dlops_nchw_kcyx_nkhw< - in_data_t, - acc_data_t, - out_data_t>(handle, - tmp[I0], - tmp[I1], - tmp[I2], - conv_strides, - conv_dilations, - in_left_pads, - in_right_pads, - in, - wei, - out_device, - tunable, - nrepeat); - } -#endif - -#if USE_CONV_FWD_V6R1_NCHW - if(algo == ConvForwardAlgo::V6R1NCHW) - { - if(layout != ConvTensorLayout::NCHW) - { - throw std::runtime_error("wrong! layout"); - } - - const auto tmp = f_make_for_device_nchw(); - -#if 1 - const CompileParameterConvIgemmFwdV6r1DlopsNchwKcyxNkhw compile_param = { - get_datatype_enum_from_type::value, - get_datatype_enum_from_type::value, - get_datatype_enum_from_type::value, - 256, - 4, - 1, - 128, - 32, - 8, - 4, - 4, - 1, - {8, 2}, - {8, 2}, - {4, 1, 1, 1, 1}, - {2, 1, 1, 128, 1}, - {4, 1, 1, 1, 1}, - {1, 1, 1, 1, 1}, - {1, 4, 1, 1, 1}, - {8, 1, 1, 32, 1}, - {1, 1, 1, 1, 1}, - {1, 1, 1, 1, 1}, - 4, - true, - true}; -#elif 0 - const CompileParameterConvIgemmFwdV6r1DlopsNchwKcyxNkhw compile_param = { - get_datatype_enum_from_type::value, - get_datatype_enum_from_type::value, - get_datatype_enum_from_type::value, - 256, - 4, - 2, - 128, - 32, - 8, - 4, - 4, - 1, - {8, 2}, - {8, 2}, - {4, 1, 1, 1, 2}, - {2, 1, 1, 128, 1}, - {4, 1, 1, 1, 1}, - {1, 1, 1, 1, 1}, - {1, 4, 1, 1, 2}, - {8, 1, 1, 32, 1}, - {1, 1, 1, 1, 1}, - {1, 1, 1, 1, 1}, - 4, - true, - true}; -#elif 1 - const CompileParameterConvIgemmFwdV6r1DlopsNchwKcyxNkhw compile_param = { - get_datatype_enum_from_type::value, - get_datatype_enum_from_type::value, - get_datatype_enum_from_type::value, - 256, - 4, - 4, - 128, - 32, - 8, - 4, - 4, - 1, - {8, 2}, - {8, 2}, - {4, 1, 1, 1, 4}, - {2, 1, 1, 128, 1}, - {4, 1, 1, 1, 1}, - {1, 1, 1, 1, 1}, - {1, 4, 1, 1, 4}, - {8, 1, 1, 32, 1}, - {1, 1, 1, 1, 1}, - {1, 1, 1, 1, 1}, - 4, - true, - true}; -#endif - - online_device_dynamic_convolution_forward_implicit_gemm_v6r1_dlops_nchw_kcyx_nkhw< - in_data_t, - acc_data_t, - out_data_t>(handle, - tmp[I0], - tmp[I1], - tmp[I2], - conv_strides, - conv_dilations, - in_left_pads, - in_right_pads, - in, - wei, - out_device, - compile_param, - nrepeat); - } -#endif - -#if USE_CONV_FWD_V4R4_XDLOPS_NCHW - if(algo == ConvForwardAlgo::V4R4XDLNCHW) - { - if(layout != ConvTensorLayout::NCHW) - { - throw std::runtime_error("wrong! layout"); - } - - const auto tmp = f_make_for_device_nchw(); - - tunable_dyn_conv_fwd_v4r4_xdlops_nchw_kcyx_nkhw* tunable = - &default_tunable_dyn_conv_fwd_v4r4_xdlops_nchw_kcyx_nkhw; - - online_device_dynamic_convolution_forward_implicit_gemm_v4r4_xdlops_nchw_kcyx_nkhw< - in_data_t, - acc_data_t, - out_data_t>(handle, - tmp[I0], - tmp[I1], - tmp[I2], - conv_strides, - conv_dilations, - in_left_pads, - in_right_pads, - in, - wei, - out_device, - tunable, - nrepeat); - } -#endif - -#if USE_CONV_FWD_V4R4_XDLOPS_NHWC - if(algo == ConvForwardAlgo::V4R4XDLNHWC) - { - if(layout != ConvTensorLayout::NHWC) - { - throw std::runtime_error("wrong! layout"); - } - - const auto tmp = f_make_for_device_nhwc(); - - tunable_dyn_conv_fwd_v4r4_xdlops_nhwc_kyxc_nhwk* tunable = - &default_tunable_dyn_conv_fwd_v4r4_xdlops_nhwc_kyxc_nhwk; - - online_device_dynamic_convolution_forward_implicit_gemm_v4r4_xdlops_nhwc_kyxc_nhwk< - in_data_t, - acc_data_t, - out_data_t>(handle, - tmp[I0], - tmp[I1], - tmp[I2], - conv_strides, - conv_dilations, - in_left_pads, - in_right_pads, - in, - wei, - out_device, - tunable, - nrepeat); - } -#endif - - if(do_verification) - { - host_direct_convolution(in, - wei, - out_host, - make_tuple(conv_stride_h, conv_stride_w), - make_tuple(conv_dilation_h, conv_dilation_w), - make_tuple(in_left_pad_h, in_left_pad_w), - make_tuple(in_right_pad_h, in_right_pad_w), - layout); - - check_error(out_host, out_device); - -#if 0 - if(do_log) - { - LogRangeAsType(std::cout << "in : ", in.mData, ",") << std::endl; - LogRangeAsType(std::cout << "wei: ", wei.mData, ",") << std::endl; - LogRangeAsType(std::cout << "out_host : ", out_host.mData, ",") << std::endl; - LogRangeAsType(std::cout << "out_device: ", out_device.mData, ",") << std::endl; - } -#endif - } - - delete handle; - MY_HIP_CHECK(hipStreamDestroy(stream)); -} diff --git a/host/driver_online/include/online_device_dynamic_convolution_forward_implicit_gemm_v4r4_dlops_nchw_kcyx_nkhw.hpp b/host/driver_online/include/online_device_dynamic_convolution_forward_implicit_gemm_v4r4_dlops_nchw_kcyx_nkhw.hpp deleted file mode 100644 index 419b8ca95d..0000000000 --- a/host/driver_online/include/online_device_dynamic_convolution_forward_implicit_gemm_v4r4_dlops_nchw_kcyx_nkhw.hpp +++ /dev/null @@ -1,395 +0,0 @@ -#pragma once -#include "device.hpp" -#include "host_tensor.hpp" -#include "handle.hpp" -#include "online_driver_common.hpp" -#include "dynamic_tensor_descriptor.hpp" -#include "dynamic_tensor_descriptor_helper.hpp" -#include "transform_forward_convolution_into_gemm_v4r4_nchw_kcyx_nkhw.hpp" -#include "conv_tunable_fwd_v4r4_dlops_nchw_kcyx_nkhw.hpp" - -namespace detail_dyn_conv_fwd_v4r4_nchw_kcyx_nkhw { - -template -static std::string get_network_config_string_from_types() -{ - using namespace ck; - - std::string out; - - out += std::to_string(get_datatype_enum_from_type::value) + "_" + - std::to_string(get_datatype_enum_from_type::value) + "_" + - std::to_string(get_datatype_enum_from_type::value); - - return (out); -}; - -static std::string -get_network_config_string_from_tunable(const tunable_dyn_conv_fwd_v4r4_dlops_nchw_kcyx_nkhw* pt) -{ - std::string out("TUN_"); - - out += std::to_string(pt->BlockSize) + "_"; - - out += std::to_string(pt->MPerBlock) + "x" + std::to_string(pt->NPerBlock) + "x" + - std::to_string(pt->KPerBlock) + "_"; - out += std::to_string(pt->M1PerThread) + "x" + std::to_string(pt->N1PerThread) + "x" + - std::to_string(pt->KPerThread) + "_"; - out += std::to_string(pt->M1N1ThreadClusterM10) + "x" + - std::to_string(pt->M1N1ThreadClusterN10) + "x" + - std::to_string(pt->M1N1ThreadClusterM11) + "x" + - std::to_string(pt->M1N1ThreadClusterN11) + "_"; - - out += std::to_string(pt->ABlockTransferThreadSliceLengths_K_M0_M1[0]) + "x" + - std::to_string(pt->ABlockTransferThreadSliceLengths_K_M0_M1[1]) + "x" + - std::to_string(pt->ABlockTransferThreadSliceLengths_K_M0_M1[2]) + "_"; - - out += std::to_string(pt->ABlockTransferThreadClusterLengths_K_M0_M1[0]) + "x" + - std::to_string(pt->ABlockTransferThreadClusterLengths_K_M0_M1[1]) + "x" + - std::to_string(pt->ABlockTransferThreadClusterLengths_K_M0_M1[2]) + "_"; - - out += std::to_string(pt->ABlockTransferThreadClusterArrangeOrder[0]) + "x" + - std::to_string(pt->ABlockTransferThreadClusterArrangeOrder[1]) + "x" + - std::to_string(pt->ABlockTransferThreadClusterArrangeOrder[2]) + "_"; - - out += std::to_string(pt->ABlockTransferSrcAccessOrder[0]) + "x" + - std::to_string(pt->ABlockTransferSrcAccessOrder[1]) + "x" + - std::to_string(pt->ABlockTransferSrcAccessOrder[2]) + "_"; - - out += std::to_string(pt->ABlockTransferSrcVectorDim) + "_"; - out += std::to_string(pt->ABlockTransferSrcScalarPerVector) + "_"; - out += std::to_string(pt->ABlockTransferDstScalarPerVector_M1) + "_"; - out += std::to_string(pt->AThreadTransferSrcResetCoordinateAfterRun) + "_"; - - out += std::to_string(pt->BBlockTransferThreadSliceLengths_K_N0_N1[0]) + "x" + - std::to_string(pt->BBlockTransferThreadSliceLengths_K_N0_N1[1]) + "x" + - std::to_string(pt->BBlockTransferThreadSliceLengths_K_N0_N1[2]) + "_"; - - out += std::to_string(pt->BBlockTransferThreadClusterLengths_K_N0_N1[0]) + "x" + - std::to_string(pt->BBlockTransferThreadClusterLengths_K_N0_N1[1]) + "x" + - std::to_string(pt->BBlockTransferThreadClusterLengths_K_N0_N1[2]) + "_"; - - out += std::to_string(pt->BBlockTransferThreadClusterArrangeOrder[0]) + "x" + - std::to_string(pt->BBlockTransferThreadClusterArrangeOrder[1]) + "x" + - std::to_string(pt->BBlockTransferThreadClusterArrangeOrder[2]) + "_"; - - out += std::to_string(pt->BBlockTransferSrcAccessOrder[0]) + "x" + - std::to_string(pt->BBlockTransferSrcAccessOrder[1]) + "x" + - std::to_string(pt->BBlockTransferSrcAccessOrder[2]) + "_"; - - out += std::to_string(pt->BBlockTransferSrcVectorDim) + "_"; - out += std::to_string(pt->BBlockTransferSrcScalarPerVector) + "_"; - out += std::to_string(pt->BBlockTransferDstScalarPerVector_N1) + "_"; - out += std::to_string(pt->BThreadTransferSrcResetCoordinateAfterRun) + "_"; - - out += std::to_string(pt->CThreadTransferSrcDstAccessOrder[0]) + "x" + - std::to_string(pt->CThreadTransferSrcDstAccessOrder[1]) + "x" + - std::to_string(pt->CThreadTransferSrcDstAccessOrder[2]) + "x" + - std::to_string(pt->CThreadTransferSrcDstAccessOrder[3]) + "x" + - std::to_string(pt->CThreadTransferSrcDstAccessOrder[4]) + "x" + - std::to_string(pt->CThreadTransferSrcDstAccessOrder[5]) + "_"; - - out += std::to_string(pt->CThreadTransferSrcDstVectorDim) + "_"; - out += std::to_string(pt->CThreadTransferDstScalarPerVector); - - return (out); -}; - -template -static std::string get_definition_string_from_types() -{ - using namespace ck; - - std::string out; - - out += - " -DCK_PARAM_ABDataTypeEnum=" + std::to_string(get_datatype_enum_from_type::value) + - " -DCK_PARAM_AccDataTypeEnum=" + std::to_string(get_datatype_enum_from_type::value) + - " -DCK_PARAM_CDataTypeEnum=" + std::to_string(get_datatype_enum_from_type::value); - - return (out); -}; - -static std::string -get_definition_string_from_tunable(const tunable_dyn_conv_fwd_v4r4_dlops_nchw_kcyx_nkhw* pt) -{ - std::string out; - - out += " -DCK_PARAM_BlockSize=" + std::to_string(pt->BlockSize); - - out += " -DCK_PARAM_MPerBlock=" + std::to_string(pt->MPerBlock) + - " -DCK_PARAM_NPerBlock=" + std::to_string(pt->NPerBlock) + - " -DCK_PARAM_KPerBlock=" + std::to_string(pt->KPerBlock); - out += " -DCK_PARAM_M1PerThread=" + std::to_string(pt->M1PerThread) + - " -DCK_PARAM_N1PerThread=" + std::to_string(pt->N1PerThread) + - " -DCK_PARAM_KPerThread=" + std::to_string(pt->KPerThread); - - out += " -DCK_PARAM_M1N1ThreadClusterM10=" + std::to_string(pt->M1N1ThreadClusterM10) + - " -DCK_PARAM_M1N1ThreadClusterN10=" + std::to_string(pt->M1N1ThreadClusterN10) + - " -DCK_PARAM_M1N1ThreadClusterM11=" + std::to_string(pt->M1N1ThreadClusterM11) + - " -DCK_PARAM_M1N1ThreadClusterN11=" + std::to_string(pt->M1N1ThreadClusterN11); - - out += " -DCK_PARAM_ABlockTransferThreadSliceLengths_K_M0_M1=" + - std::to_string(pt->ABlockTransferThreadSliceLengths_K_M0_M1[0]) + "," + - std::to_string(pt->ABlockTransferThreadSliceLengths_K_M0_M1[1]) + "," + - std::to_string(pt->ABlockTransferThreadSliceLengths_K_M0_M1[2]); - - out += " -DCK_PARAM_ABlockTransferThreadClusterLengths_K_M0_M1=" + - std::to_string(pt->ABlockTransferThreadClusterLengths_K_M0_M1[0]) + "," + - std::to_string(pt->ABlockTransferThreadClusterLengths_K_M0_M1[1]) + "," + - std::to_string(pt->ABlockTransferThreadClusterLengths_K_M0_M1[2]); - - out += " -DCK_PARAM_ABlockTransferThreadClusterArrangeOrder=" + - std::to_string(pt->ABlockTransferThreadClusterArrangeOrder[0]) + "," + - std::to_string(pt->ABlockTransferThreadClusterArrangeOrder[1]) + "," + - std::to_string(pt->ABlockTransferThreadClusterArrangeOrder[2]); - - out += " -DCK_PARAM_ABlockTransferSrcAccessOrder=" + - std::to_string(pt->ABlockTransferSrcAccessOrder[0]) + "," + - std::to_string(pt->ABlockTransferSrcAccessOrder[1]) + "," + - std::to_string(pt->ABlockTransferSrcAccessOrder[2]); - - out += - " -DCK_PARAM_ABlockTransferSrcVectorDim=" + std::to_string(pt->ABlockTransferSrcVectorDim); - out += " -DCK_PARAM_ABlockTransferSrcScalarPerVector=" + - std::to_string(pt->ABlockTransferSrcScalarPerVector); - out += " -DCK_PARAM_ABlockTransferDstScalarPerVector_M1=" + - std::to_string(pt->ABlockTransferDstScalarPerVector_M1); - out += " -DCK_PARAM_AThreadTransferSrcResetCoordinateAfterRun=" + - std::to_string(pt->AThreadTransferSrcResetCoordinateAfterRun); - - out += " -DCK_PARAM_BBlockTransferThreadSliceLengths_K_N0_N1=" + - std::to_string(pt->BBlockTransferThreadSliceLengths_K_N0_N1[0]) + "," + - std::to_string(pt->BBlockTransferThreadSliceLengths_K_N0_N1[1]) + "," + - std::to_string(pt->BBlockTransferThreadSliceLengths_K_N0_N1[2]); - - out += " -DCK_PARAM_BBlockTransferThreadClusterLengths_K_N0_N1=" + - std::to_string(pt->BBlockTransferThreadClusterLengths_K_N0_N1[0]) + "," + - std::to_string(pt->BBlockTransferThreadClusterLengths_K_N0_N1[1]) + "," + - std::to_string(pt->BBlockTransferThreadClusterLengths_K_N0_N1[2]); - - out += " -DCK_PARAM_BBlockTransferThreadClusterArrangeOrder=" + - std::to_string(pt->BBlockTransferThreadClusterArrangeOrder[0]) + "," + - std::to_string(pt->BBlockTransferThreadClusterArrangeOrder[1]) + "," + - std::to_string(pt->BBlockTransferThreadClusterArrangeOrder[2]); - - out += " -DCK_PARAM_BBlockTransferSrcAccessOrder=" + - std::to_string(pt->BBlockTransferSrcAccessOrder[0]) + "," + - std::to_string(pt->BBlockTransferSrcAccessOrder[1]) + "," + - std::to_string(pt->BBlockTransferSrcAccessOrder[2]); - - out += - " -DCK_PARAM_BBlockTransferSrcVectorDim=" + std::to_string(pt->BBlockTransferSrcVectorDim); - out += " -DCK_PARAM_BBlockTransferSrcScalarPerVector=" + - std::to_string(pt->BBlockTransferSrcScalarPerVector); - out += " -DCK_PARAM_BBlockTransferDstScalarPerVector_N1=" + - std::to_string(pt->BBlockTransferDstScalarPerVector_N1); - out += " -DCK_PARAM_BThreadTransferSrcResetCoordinateAfterRun=" + - std::to_string(pt->BThreadTransferSrcResetCoordinateAfterRun); - - out += " -DCK_PARAM_CThreadTransferSrcDstAccessOrder=" + - std::to_string(pt->CThreadTransferSrcDstAccessOrder[0]) + "," + - std::to_string(pt->CThreadTransferSrcDstAccessOrder[1]) + "," + - std::to_string(pt->CThreadTransferSrcDstAccessOrder[2]) + "," + - std::to_string(pt->CThreadTransferSrcDstAccessOrder[3]) + "," + - std::to_string(pt->CThreadTransferSrcDstAccessOrder[4]) + "," + - std::to_string(pt->CThreadTransferSrcDstAccessOrder[5]); - - out += " -DCK_PARAM_CThreadTransferSrcDstVectorDim=" + - std::to_string(pt->CThreadTransferSrcDstVectorDim); - out += " -DCK_PARAM_CThreadTransferDstScalarPerVector=" + - std::to_string(pt->CThreadTransferDstScalarPerVector); - - return (out); -}; - -} // namespace detail_dyn_conv_fwd_v4r4_nchw_kcyx_nkhw - -template -void online_device_dynamic_convolution_forward_implicit_gemm_v4r4_dlops_nchw_kcyx_nkhw( - online_compile::Handle* handle, - const InLengths& in_n_c_hi_wi_lengths, - const WeiLengths& wei_k_c_y_x_lengths, - const OutLengths& out_n_k_ho_wo_lengths, - const ConvStrides& conv_strides, - const ConvDilations& conv_dilations, - const InLeftPads& in_left_pads, - const InRightPads& in_right_pads, - const Tensor& in_n_c_hi_wi, - const Tensor& wei_k_c_y_x, - Tensor& out_n_k_ho_wo, - const tunable_dyn_conv_fwd_v4r4_dlops_nchw_kcyx_nkhw* tunable, - ck::index_t nrepeat) -{ - using namespace ck; - using namespace ck::driver; - using namespace detail_dyn_conv_fwd_v4r4_nchw_kcyx_nkhw; - using size_t = std::size_t; - - ///////////////////////////////////////////////////////////////////////////////////////////////////////////// - // The follow codes are only used for computing the grid_size, hasMainKBlockLoop, - // hasDoubleTailKBlockLoop - - constexpr auto I0 = Number<0>{}; - constexpr auto I1 = Number<1>{}; - constexpr auto I2 = Number<2>{}; - constexpr auto I3 = Number<3>{}; - - const auto in_n_c_hi_wi_desc = - make_dynamic_naive_tensor_descriptor_packed_v2(in_n_c_hi_wi_lengths); - const auto wei_k_c_y_x_desc = - make_dynamic_naive_tensor_descriptor_packed_v2(wei_k_c_y_x_lengths); - const auto out_n_k_ho_wo_desc = - make_dynamic_naive_tensor_descriptor_packed_v2(out_n_k_ho_wo_lengths); - - const auto descs = - transform_forward_convolution_into_gemm_v4r4_nchw_kcyx_nkhw_pad(wei_k_c_y_x_desc, - in_n_c_hi_wi_desc, - out_n_k_ho_wo_desc, - conv_strides, - conv_dilations, - in_left_pads, - in_right_pads); - const auto a_k_m_grid_desc = descs[I0]; - const auto c_m_n_grid_desc = descs[I2]; - const auto M = c_m_n_grid_desc.GetLength(I0); - const auto N = c_m_n_grid_desc.GetLength(I1); - const auto K = a_k_m_grid_desc.GetLength(I0); - - const index_t grid_size = (M / tunable->MPerBlock) * (N / tunable->NPerBlock); - const bool hasMainKBlockLoop = ((K + tunable->KPerBlock) / (2 * tunable->KPerBlock) > 1); - const bool hasDoubleTailKBlockLoop = ((K / tunable->KPerBlock) % 2 == 0); - ///////////////////////////////////////////////////////////////////////////////////////////////////////////// - - // these buffers are usually provided by the user application - DeviceMem in_n_c_hi_wi_dev_buf(sizeof(TInWei) * in_n_c_hi_wi.mDesc.GetElementSpace()); - DeviceMem wei_k_c_y_x_dev_buf(sizeof(TInWei) * wei_k_c_y_x.mDesc.GetElementSpace()); - DeviceMem out_n_k_ho_wo_dev_buf(sizeof(TOut) * out_n_k_ho_wo.mDesc.GetElementSpace()); - - in_n_c_hi_wi_dev_buf.ToDevice(in_n_c_hi_wi.mData.data()); - wei_k_c_y_x_dev_buf.ToDevice(wei_k_c_y_x.mData.data()); - out_n_k_ho_wo_dev_buf.ToDevice(out_n_k_ho_wo.mData.data()); - - // these are workspace buffers that should be expressed to the user by the corresponding - // workspace API - DeviceMem workspace_buf(4096); - - void* a_k_m0_m1_grid_desc_dev_buf = workspace_buf.GetDeviceBuffer(); - void* b_k_n0_n1_grid_desc_dev_buf = - static_cast(static_cast(workspace_buf.GetDeviceBuffer()) + 1024); - void* c_m0_m10_m11_n0_n10_n11_grid_desc_dev_buf = - static_cast(static_cast(workspace_buf.GetDeviceBuffer()) + 2048); - void* c_blockid_to_m0_n0_block_cluster_adaptor_dev_buf = - static_cast(static_cast(workspace_buf.GetDeviceBuffer()) + 3072); - - const std::vector vld = {static_cast(tunable->BlockSize), 1, 1}; - const std::vector vgd1 = {static_cast(tunable->BlockSize), 1, 1}; - const std::vector vgd2 = {static_cast(grid_size * tunable->BlockSize), 1, 1}; - - std::string program_name = - "dynamic_convolution_forward_implicit_gemm_v4r4_dlops_nchw_kcyx_nkhw.cpp"; - std::string algo_name = "implicit_gemm_conv_fwd_v4r4_dlops_nchw"; - - std::string param = " -std=c++17 "; - std::string network_config; - - param += get_definition_string_from_types() + " " + - get_definition_string_from_tunable(tunable) + - " -DCK_PARAM_HAS_MAIN_KBLOCK_LOOP=" + std::to_string(hasMainKBlockLoop) + - " -DCK_PARAM_HAS_DOUBLE_TAIL_KBLOCK_LOOP=" + std::to_string(hasDoubleTailKBlockLoop); - network_config = get_network_config_string_from_types() + "_" + - get_network_config_string_from_tunable(tunable) + "_" + - std::to_string(hasMainKBlockLoop) + "_" + - std::to_string(hasDoubleTailKBlockLoop); - - std::vector kernel1_times; - std::vector kernel2_times; - - for(index_t i = 0; i < nrepeat; ++i) - { - KernelTimer timer1, timer2; - std::string kernel_name; - - kernel_name = "dynamic_convolution_forward_implicit_gemm_v4r4_dlops_nchw_kcyx_nkhw_prepare"; - auto network_config_1 = network_config + "_1"; - - timer1.Start(); - handle->AddKernel(algo_name, network_config_1, program_name, kernel_name, vld, vgd1, param)( - static_cast(in_n_c_hi_wi_lengths[I0]), - static_cast(in_n_c_hi_wi_lengths[I1]), - static_cast(in_n_c_hi_wi_lengths[I2]), - static_cast(in_n_c_hi_wi_lengths[I3]), - static_cast(wei_k_c_y_x_lengths[I0]), - static_cast(wei_k_c_y_x_lengths[I2]), - static_cast(wei_k_c_y_x_lengths[I3]), - conv_strides[I0], - conv_strides[I1], - conv_dilations[I0], - conv_dilations[I1], - in_left_pads[I0], - in_left_pads[I1], - in_right_pads[I0], - in_right_pads[I1], - a_k_m0_m1_grid_desc_dev_buf, - b_k_n0_n1_grid_desc_dev_buf, - c_m0_m10_m11_n0_n10_n11_grid_desc_dev_buf, - c_blockid_to_m0_n0_block_cluster_adaptor_dev_buf); - timer1.End(); - - kernel_name = "dynamic_convolution_forward_implicit_gemm_v4r4_dlops_nchw_kcyx_nkhw"; - auto network_config_2 = network_config + "_2"; - - timer2.Start(); - handle->AddKernel(algo_name, network_config_2, program_name, kernel_name, vld, vgd2, param)( - reinterpret_cast(wei_k_c_y_x_dev_buf.GetDeviceBuffer()), - reinterpret_cast(in_n_c_hi_wi_dev_buf.GetDeviceBuffer()), - reinterpret_cast(out_n_k_ho_wo_dev_buf.GetDeviceBuffer()), - (const void*)(a_k_m0_m1_grid_desc_dev_buf), - (const void*)(b_k_n0_n1_grid_desc_dev_buf), - (const void*)(c_m0_m10_m11_n0_n10_n11_grid_desc_dev_buf), - (const void*)(c_blockid_to_m0_n0_block_cluster_adaptor_dev_buf)); - timer2.End(); - - kernel1_times.push_back(timer1.GetElapsedTime()); - kernel2_times.push_back(timer2.GetElapsedTime()); - } - - { - auto ave_time1 = - std::accumulate( - std::next(kernel1_times.begin()), kernel1_times.end(), 0., std::plus{}) / - (nrepeat - 1); - auto ave_time2 = - std::accumulate( - std::next(kernel2_times.begin()), kernel2_times.end(), 0., std::plus{}) / - (nrepeat - 1); - - const auto N = in_n_c_hi_wi_lengths[I0]; - const auto C = in_n_c_hi_wi_lengths[I1]; - - const auto K = out_n_k_ho_wo_lengths[I1]; - const auto Ho = out_n_k_ho_wo_lengths[I2]; - const auto Wo = out_n_k_ho_wo_lengths[I3]; - - const auto Y = wei_k_c_y_x_lengths[I2]; - const auto X = wei_k_c_y_x_lengths[I3]; - - float perf = (float)(std::size_t(2) * N * K * Ho * Wo * C * Y * X) / - (std::size_t(1000) * 1000 * 1000) / (ave_time1 + ave_time2); - - std::cout << "Average time : " << ave_time1 + ave_time2 << " ms(" << ave_time1 << ", " - << ave_time2 << "), " << perf << " TFlop/s" << std::endl; - }; - - // copy result back to host - out_n_k_ho_wo_dev_buf.FromDevice(out_n_k_ho_wo.mData.data()); -} diff --git a/host/driver_online/include/online_device_dynamic_convolution_forward_implicit_gemm_v4r4_xdlops_nchw_kcyx_nkhw.hpp b/host/driver_online/include/online_device_dynamic_convolution_forward_implicit_gemm_v4r4_xdlops_nchw_kcyx_nkhw.hpp deleted file mode 100644 index 46d065f615..0000000000 --- a/host/driver_online/include/online_device_dynamic_convolution_forward_implicit_gemm_v4r4_xdlops_nchw_kcyx_nkhw.hpp +++ /dev/null @@ -1,386 +0,0 @@ -#include "device.hpp" -#include "host_tensor.hpp" -#include "handle.hpp" -#include "online_driver_common.hpp" -#include "dynamic_tensor_descriptor.hpp" -#include "dynamic_tensor_descriptor_helper.hpp" -#include "conv_tunable_fwd_v4r4_xdlops_nchw_kcyx_nkhw.hpp" - -namespace detail_dyn_conv_fwd_v4r4_xdlops_nchw_kcyx_nkhw { - -template -static std::string get_network_config_string_from_types() -{ - using namespace ck; - - std::string out; - - out += std::to_string(get_datatype_enum_from_type::value) + "_" + - std::to_string(get_datatype_enum_from_type::value) + "_" + - std::to_string(get_datatype_enum_from_type::value); - - return (out); -}; - -static std::string -get_network_config_string_from_tunable(const tunable_dyn_conv_fwd_v4r4_xdlops_nchw_kcyx_nkhw* pt) -{ - std::string out("TUN_"); - - out += std::to_string(pt->BlockSize) + "_"; - - out += std::to_string(pt->MPerBlock) + "x" + std::to_string(pt->NPerBlock) + "x" + - std::to_string(pt->KPerBlock) + "_"; - out += std::to_string(pt->MPerWave) + "x" + std::to_string(pt->NPerWave) + "x" + - std::to_string(pt->MRepeat) + "x" + std::to_string(pt->NRepeat) + "x" + - std::to_string(pt->K1) + "_"; - - out += std::to_string(pt->ABlockTransferThreadSliceLengths_K0_M_K1[0]) + "x" + - std::to_string(pt->ABlockTransferThreadSliceLengths_K0_M_K1[1]) + "x" + - std::to_string(pt->ABlockTransferThreadSliceLengths_K0_M_K1[2]) + "_"; - - out += std::to_string(pt->ABlockTransferThreadClusterLengths_K0_M_K1[0]) + "x" + - std::to_string(pt->ABlockTransferThreadClusterLengths_K0_M_K1[1]) + "x" + - std::to_string(pt->ABlockTransferThreadClusterLengths_K0_M_K1[2]) + "_"; - - out += std::to_string(pt->ABlockTransferThreadClusterArrangeOrder[0]) + "x" + - std::to_string(pt->ABlockTransferThreadClusterArrangeOrder[1]) + "x" + - std::to_string(pt->ABlockTransferThreadClusterArrangeOrder[2]) + "_"; - - out += std::to_string(pt->ABlockTransferSrcAccessOrder[0]) + "x" + - std::to_string(pt->ABlockTransferSrcAccessOrder[1]) + "x" + - std::to_string(pt->ABlockTransferSrcAccessOrder[2]) + "_"; - - out += std::to_string(pt->ABlockTransferSrcVectorDim) + "_"; - out += std::to_string(pt->ABlockTransferSrcScalarPerVector) + "_"; - out += std::to_string(pt->ABlockTransferDstScalarPerVector_K1) + "_"; - out += std::to_string(pt->AThreadTransferSrcResetCoordinateAfterRun) + "_"; - - out += std::to_string(pt->BBlockTransferThreadSliceLengths_K0_N_K1[0]) + "x" + - std::to_string(pt->BBlockTransferThreadSliceLengths_K0_N_K1[1]) + "x" + - std::to_string(pt->BBlockTransferThreadSliceLengths_K0_N_K1[2]) + "_"; - - out += std::to_string(pt->BBlockTransferThreadClusterLengths_K0_N_K1[0]) + "x" + - std::to_string(pt->BBlockTransferThreadClusterLengths_K0_N_K1[1]) + "x" + - std::to_string(pt->BBlockTransferThreadClusterLengths_K0_N_K1[2]) + "_"; - - out += std::to_string(pt->BBlockTransferThreadClusterArrangeOrder[0]) + "x" + - std::to_string(pt->BBlockTransferThreadClusterArrangeOrder[1]) + "x" + - std::to_string(pt->BBlockTransferThreadClusterArrangeOrder[2]) + "_"; - - out += std::to_string(pt->BBlockTransferSrcAccessOrder[0]) + "x" + - std::to_string(pt->BBlockTransferSrcAccessOrder[1]) + "x" + - std::to_string(pt->BBlockTransferSrcAccessOrder[2]) + "_"; - - out += std::to_string(pt->BBlockTransferSrcVectorDim) + "_"; - out += std::to_string(pt->BBlockTransferSrcScalarPerVector) + "_"; - out += std::to_string(pt->BBlockTransferDstScalarPerVector_K1) + "_"; - out += std::to_string(pt->BThreadTransferSrcResetCoordinateAfterRun) + "_"; - - out += std::to_string(pt->CThreadTransferSrcDstAccessOrder[0]) + "x" + - std::to_string(pt->CThreadTransferSrcDstAccessOrder[1]) + "x" + - std::to_string(pt->CThreadTransferSrcDstAccessOrder[2]) + "x" + - std::to_string(pt->CThreadTransferSrcDstAccessOrder[3]) + "x" + - std::to_string(pt->CThreadTransferSrcDstAccessOrder[4]) + "x" + - std::to_string(pt->CThreadTransferSrcDstAccessOrder[5]) + "x" + - std::to_string(pt->CThreadTransferSrcDstAccessOrder[6]) + "x" + - std::to_string(pt->CThreadTransferSrcDstAccessOrder[7]) + "_"; - - out += std::to_string(pt->CThreadTransferSrcDstVectorDim) + "_"; - out += std::to_string(pt->CThreadTransferDstScalarPerVector); - - return (out); -}; - -template -static std::string get_definition_string_from_types() -{ - using namespace ck; - - std::string out; - - out += - " -DCK_PARAM_ABDataTypeEnum=" + std::to_string(get_datatype_enum_from_type::value) + - " -DCK_PARAM_AccDataTypeEnum=" + std::to_string(get_datatype_enum_from_type::value) + - " -DCK_PARAM_CDataTypeEnum=" + std::to_string(get_datatype_enum_from_type::value); - - return (out); -}; - -static std::string -get_definition_string_from_tunable(const tunable_dyn_conv_fwd_v4r4_xdlops_nchw_kcyx_nkhw* pt) -{ - std::string out; - - out += " -DCK_PARAM_BlockSize=" + std::to_string(pt->BlockSize); - - out += " -DCK_PARAM_MPerBlock=" + std::to_string(pt->MPerBlock) + - " -DCK_PARAM_NPerBlock=" + std::to_string(pt->NPerBlock) + - " -DCK_PARAM_KPerBlock=" + std::to_string(pt->KPerBlock); - out += " -DCK_PARAM_MPerWave=" + std::to_string(pt->MPerWave) + - " -DCK_PARAM_NPerWave=" + std::to_string(pt->NPerWave) + - " -DCK_PARAM_K1=" + std::to_string(pt->K1) + - " -DCK_PARAM_MRepeat=" + std::to_string(pt->MRepeat) + - " -DCK_PARAM_NRepeat=" + std::to_string(pt->NRepeat); - - out += " -DCK_PARAM_ABlockTransferThreadSliceLengths_K0_M_K1=" + - std::to_string(pt->ABlockTransferThreadSliceLengths_K0_M_K1[0]) + "," + - std::to_string(pt->ABlockTransferThreadSliceLengths_K0_M_K1[1]) + "," + - std::to_string(pt->ABlockTransferThreadSliceLengths_K0_M_K1[2]); - - out += " -DCK_PARAM_ABlockTransferThreadClusterLengths_K0_M_K1=" + - std::to_string(pt->ABlockTransferThreadClusterLengths_K0_M_K1[0]) + "," + - std::to_string(pt->ABlockTransferThreadClusterLengths_K0_M_K1[1]) + "," + - std::to_string(pt->ABlockTransferThreadClusterLengths_K0_M_K1[2]); - - out += " -DCK_PARAM_ABlockTransferThreadClusterArrangeOrder=" + - std::to_string(pt->ABlockTransferThreadClusterArrangeOrder[0]) + "," + - std::to_string(pt->ABlockTransferThreadClusterArrangeOrder[1]) + "," + - std::to_string(pt->ABlockTransferThreadClusterArrangeOrder[2]); - - out += " -DCK_PARAM_ABlockTransferSrcAccessOrder=" + - std::to_string(pt->ABlockTransferSrcAccessOrder[0]) + "," + - std::to_string(pt->ABlockTransferSrcAccessOrder[1]) + "," + - std::to_string(pt->ABlockTransferSrcAccessOrder[2]); - - out += - " -DCK_PARAM_ABlockTransferSrcVectorDim=" + std::to_string(pt->ABlockTransferSrcVectorDim); - out += " -DCK_PARAM_ABlockTransferSrcScalarPerVector=" + - std::to_string(pt->ABlockTransferSrcScalarPerVector); - out += " -DCK_PARAM_ABlockTransferDstScalarPerVector_K1=" + - std::to_string(pt->ABlockTransferDstScalarPerVector_K1); - out += " -DCK_PARAM_AThreadTransferSrcResetCoordinateAfterRun=" + - std::to_string(pt->AThreadTransferSrcResetCoordinateAfterRun); - - out += " -DCK_PARAM_BBlockTransferThreadSliceLengths_K0_N_K1=" + - std::to_string(pt->BBlockTransferThreadSliceLengths_K0_N_K1[0]) + "," + - std::to_string(pt->BBlockTransferThreadSliceLengths_K0_N_K1[1]) + "," + - std::to_string(pt->BBlockTransferThreadSliceLengths_K0_N_K1[2]); - - out += " -DCK_PARAM_BBlockTransferThreadClusterLengths_K0_N_K1=" + - std::to_string(pt->BBlockTransferThreadClusterLengths_K0_N_K1[0]) + "," + - std::to_string(pt->BBlockTransferThreadClusterLengths_K0_N_K1[1]) + "," + - std::to_string(pt->BBlockTransferThreadClusterLengths_K0_N_K1[2]); - - out += " -DCK_PARAM_BBlockTransferThreadClusterArrangeOrder=" + - std::to_string(pt->BBlockTransferThreadClusterArrangeOrder[0]) + "," + - std::to_string(pt->BBlockTransferThreadClusterArrangeOrder[1]) + "," + - std::to_string(pt->BBlockTransferThreadClusterArrangeOrder[2]); - - out += " -DCK_PARAM_BBlockTransferSrcAccessOrder=" + - std::to_string(pt->BBlockTransferSrcAccessOrder[0]) + "," + - std::to_string(pt->BBlockTransferSrcAccessOrder[1]) + "," + - std::to_string(pt->BBlockTransferSrcAccessOrder[2]); - - out += - " -DCK_PARAM_BBlockTransferSrcVectorDim=" + std::to_string(pt->BBlockTransferSrcVectorDim); - out += " -DCK_PARAM_BBlockTransferSrcScalarPerVector=" + - std::to_string(pt->BBlockTransferSrcScalarPerVector); - out += " -DCK_PARAM_BBlockTransferDstScalarPerVector_K1=" + - std::to_string(pt->BBlockTransferDstScalarPerVector_K1); - out += " -DCK_PARAM_BThreadTransferSrcResetCoordinateAfterRun=" + - std::to_string(pt->BThreadTransferSrcResetCoordinateAfterRun); - - out += " -DCK_PARAM_CThreadTransferSrcDstAccessOrder=" + - std::to_string(pt->CThreadTransferSrcDstAccessOrder[0]) + "," + - std::to_string(pt->CThreadTransferSrcDstAccessOrder[1]) + "," + - std::to_string(pt->CThreadTransferSrcDstAccessOrder[2]) + "," + - std::to_string(pt->CThreadTransferSrcDstAccessOrder[3]) + "," + - std::to_string(pt->CThreadTransferSrcDstAccessOrder[4]) + "," + - std::to_string(pt->CThreadTransferSrcDstAccessOrder[5]) + "," + - std::to_string(pt->CThreadTransferSrcDstAccessOrder[6]) + "," + - std::to_string(pt->CThreadTransferSrcDstAccessOrder[7]); - - out += " -DCK_PARAM_CThreadTransferSrcDstVectorDim=" + - std::to_string(pt->CThreadTransferSrcDstVectorDim); - out += " -DCK_PARAM_CThreadTransferDstScalarPerVector=" + - std::to_string(pt->CThreadTransferDstScalarPerVector); - - return (out); -}; - -} // namespace detail_dyn_conv_fwd_v4r4_xdlops_nchw_kcyx_nkhw - -template -void online_device_dynamic_convolution_forward_implicit_gemm_v4r4_xdlops_nchw_kcyx_nkhw( - online_compile::Handle* handle, - const InLengths& in_n_c_hi_wi_lengths, - const WeiLengths& wei_k_c_y_x_lengths, - const OutLengths& out_n_k_ho_wo_lengths, - const ConvStrides& conv_strides, - const ConvDilations& conv_dilations, - const InLeftPads& in_left_pads, - const InRightPads& in_right_pads, - const Tensor& in_n_c_hi_wi, - const Tensor& wei_k_c_y_x, - Tensor& out_n_k_ho_wo, - const tunable_dyn_conv_fwd_v4r4_xdlops_nchw_kcyx_nkhw* tunable, - ck::index_t nrepeat) -{ - using namespace ck; - using namespace ck::driver; - using namespace detail_dyn_conv_fwd_v4r4_xdlops_nchw_kcyx_nkhw; - using size_t = std::size_t; - - ///////////////////////////////////////////////////////////////////////////////////////////////////////////// - - constexpr auto I0 = Number<0>{}; - constexpr auto I1 = Number<1>{}; - constexpr auto I2 = Number<2>{}; - constexpr auto I3 = Number<3>{}; - - const auto in_n_c_hi_wi_desc = - make_dynamic_naive_tensor_descriptor_packed_v2(in_n_c_hi_wi_lengths); - const auto wei_k_c_y_x_desc = - make_dynamic_naive_tensor_descriptor_packed_v2(wei_k_c_y_x_lengths); - const auto out_n_k_ho_wo_desc = - make_dynamic_naive_tensor_descriptor_packed_v2(out_n_k_ho_wo_lengths); - - const auto n = in_n_c_hi_wi_desc.GetLength(I0); - const auto c = in_n_c_hi_wi_desc.GetLength(I1); - const auto hi = in_n_c_hi_wi_desc.GetLength(I2); - const auto wi = in_n_c_hi_wi_desc.GetLength(I3); - const auto k = wei_k_c_y_x_desc.GetLength(I0); - const auto y = wei_k_c_y_x_desc.GetLength(I2); - const auto x = wei_k_c_y_x_desc.GetLength(I3); - const auto ho = out_n_k_ho_wo_desc.GetLength(I2); - const auto wo = out_n_k_ho_wo_desc.GetLength(I3); - - const auto M = k; - const auto N = n * ho * wo; - const auto K = c * y * x; - const auto K0 = K / tunable->K1; - - const index_t grid_size = (M / tunable->MPerBlock) * (N / tunable->NPerBlock); - ///////////////////////////////////////////////////////////////////////////////////////////////////////////// - - // these buffers are usually provided by the user application - DeviceMem in_n_c_hi_wi_dev_buf(sizeof(TInWei) * in_n_c_hi_wi.mDesc.GetElementSpace()); - DeviceMem wei_k_c_y_x_dev_buf(sizeof(TInWei) * wei_k_c_y_x.mDesc.GetElementSpace()); - DeviceMem out_n_k_ho_wo_dev_buf(sizeof(TOut) * out_n_k_ho_wo.mDesc.GetElementSpace()); - - in_n_c_hi_wi_dev_buf.ToDevice(in_n_c_hi_wi.mData.data()); - wei_k_c_y_x_dev_buf.ToDevice(wei_k_c_y_x.mData.data()); - out_n_k_ho_wo_dev_buf.ToDevice(out_n_k_ho_wo.mData.data()); - - // these are workspace buffers that should be expressed to the user by the corresponding - // workspace API - DeviceMem workspace_buf(4096); - - void* a_k_m0_m1_grid_desc_dev_buf = workspace_buf.GetDeviceBuffer(); - void* b_k_n0_n1_grid_desc_dev_buf = - static_cast(static_cast(workspace_buf.GetDeviceBuffer()) + 1024); - void* c_m0_m10_m11_n0_n10_n11_grid_desc_dev_buf = - static_cast(static_cast(workspace_buf.GetDeviceBuffer()) + 2048); - void* c_blockid_to_m0_n0_block_cluster_adaptor_dev_buf = - static_cast(static_cast(workspace_buf.GetDeviceBuffer()) + 3072); - - const std::vector vld = {static_cast(tunable->BlockSize), 1, 1}; - const std::vector vgd1 = {static_cast(tunable->BlockSize), 1, 1}; - const std::vector vgd2 = {static_cast(grid_size * tunable->BlockSize), 1, 1}; - - std::string program_name = - "dynamic_convolution_forward_implicit_gemm_v4r4_xdlops_nchw_kcyx_nkhw.cpp"; - std::string algo_name = "implicit_gemm_conv_fwd_v4r4_xdlops_nchw"; - - std::string param = " -std=c++17 "; - std::string network_config; - - param += get_definition_string_from_types() + " " + " -DCK_USE_AMD_XDLOPS" + - get_definition_string_from_tunable(tunable); - - network_config = get_network_config_string_from_types() + "_" + - get_network_config_string_from_tunable(tunable); - - std::vector kernel1_times; - std::vector kernel2_times; - - for(index_t i = 0; i < nrepeat; ++i) - { - KernelTimer timer1, timer2; - std::string kernel_name; - - kernel_name = - "dynamic_convolution_forward_implicit_gemm_v4r4_xdlops_nchw_kcyx_nkhw_prepare"; - auto network_config_1 = network_config + "_1"; - - timer1.Start(); - handle->AddKernel(algo_name, network_config_1, program_name, kernel_name, vld, vgd1, param)( - static_cast(in_n_c_hi_wi_lengths[I0]), - static_cast(in_n_c_hi_wi_lengths[I1]), - static_cast(in_n_c_hi_wi_lengths[I2]), - static_cast(in_n_c_hi_wi_lengths[I3]), - static_cast(wei_k_c_y_x_lengths[I0]), - static_cast(wei_k_c_y_x_lengths[I2]), - static_cast(wei_k_c_y_x_lengths[I3]), - conv_strides[I0], - conv_strides[I1], - conv_dilations[I0], - conv_dilations[I1], - in_left_pads[I0], - in_left_pads[I1], - in_right_pads[I0], - in_right_pads[I1], - a_k_m0_m1_grid_desc_dev_buf, - b_k_n0_n1_grid_desc_dev_buf, - c_m0_m10_m11_n0_n10_n11_grid_desc_dev_buf, - c_blockid_to_m0_n0_block_cluster_adaptor_dev_buf); - timer1.End(); - - kernel_name = "dynamic_convolution_forward_implicit_gemm_v4r4_xdlops_nchw_kcyx_nkhw"; - auto network_config_2 = network_config + "_2"; - - timer2.Start(); - handle->AddKernel(algo_name, network_config_2, program_name, kernel_name, vld, vgd2, param)( - reinterpret_cast(wei_k_c_y_x_dev_buf.GetDeviceBuffer()), - reinterpret_cast(in_n_c_hi_wi_dev_buf.GetDeviceBuffer()), - reinterpret_cast(out_n_k_ho_wo_dev_buf.GetDeviceBuffer()), - (const void*)(a_k_m0_m1_grid_desc_dev_buf), - (const void*)(b_k_n0_n1_grid_desc_dev_buf), - (const void*)(c_m0_m10_m11_n0_n10_n11_grid_desc_dev_buf), - (const void*)(c_blockid_to_m0_n0_block_cluster_adaptor_dev_buf)); - timer2.End(); - - kernel1_times.push_back(timer1.GetElapsedTime()); - kernel2_times.push_back(timer2.GetElapsedTime()); - } - - { - auto ave_time1 = - std::accumulate( - std::next(kernel1_times.begin()), kernel1_times.end(), 0., std::plus{}) / - (nrepeat - 1); - auto ave_time2 = - std::accumulate( - std::next(kernel2_times.begin()), kernel2_times.end(), 0., std::plus{}) / - (nrepeat - 1); - - const auto N = in_n_c_hi_wi_lengths[I0]; - const auto C = in_n_c_hi_wi_lengths[I1]; - - const auto K = out_n_k_ho_wo_lengths[I1]; - const auto Ho = out_n_k_ho_wo_lengths[I2]; - const auto Wo = out_n_k_ho_wo_lengths[I3]; - - const auto Y = wei_k_c_y_x_lengths[I2]; - const auto X = wei_k_c_y_x_lengths[I3]; - - float perf = (float)(std::size_t(2) * N * K * Ho * Wo * C * Y * X) / - (std::size_t(1000) * 1000 * 1000) / (ave_time1 + ave_time2); - - std::cout << "Average time : " << ave_time1 + ave_time2 << " ms(" << ave_time1 << ", " - << ave_time2 << "), " << perf << " TFlop/s" << std::endl; - }; - - // copy result back to host - out_n_k_ho_wo_dev_buf.FromDevice(out_n_k_ho_wo.mData.data()); -} diff --git a/host/driver_online/include/online_device_dynamic_convolution_forward_implicit_gemm_v4r4_xdlops_nhwc_kyxc_nhwk.hpp b/host/driver_online/include/online_device_dynamic_convolution_forward_implicit_gemm_v4r4_xdlops_nhwc_kyxc_nhwk.hpp deleted file mode 100644 index 57724c7612..0000000000 --- a/host/driver_online/include/online_device_dynamic_convolution_forward_implicit_gemm_v4r4_xdlops_nhwc_kyxc_nhwk.hpp +++ /dev/null @@ -1,389 +0,0 @@ -#include "device.hpp" -#include "host_tensor.hpp" -#include "handle.hpp" -#include "online_driver_common.hpp" -#include "dynamic_tensor_descriptor.hpp" -#include "dynamic_tensor_descriptor_helper.hpp" -#include "transform_forward_convolution_into_gemm_v4r4r4_nhwc_kyxc_nhwk.hpp" -#include "conv_tunable_fwd_v4r4_xdlops_nhwc_kyxc_nhwk.hpp" - -namespace detail_dyn_conv_fwd_v4r4_xdlops_nhwc_kyxc_nhwk { - -template -static std::string get_network_config_string_from_types() -{ - using namespace ck; - - std::string out; - - out += std::to_string(get_datatype_enum_from_type::value) + "_" + - std::to_string(get_datatype_enum_from_type::value) + "_" + - std::to_string(get_datatype_enum_from_type::value); - - return (out); -}; - -static std::string -get_network_config_string_from_tunable(const tunable_dyn_conv_fwd_v4r4_xdlops_nhwc_kyxc_nhwk* pt) -{ - std::string out("TUN_"); - - out += std::to_string(pt->BlockSize) + "_"; - - out += std::to_string(pt->MPerBlock) + "x" + std::to_string(pt->NPerBlock) + "x" + - std::to_string(pt->KPerBlock) + "_"; - out += std::to_string(pt->MPerWave) + "x" + std::to_string(pt->NPerWave) + "x" + - std::to_string(pt->MRepeat) + "x" + std::to_string(pt->NRepeat) + "x" + - std::to_string(pt->K1) + "_"; - - out += std::to_string(pt->ABlockTransferThreadSliceLengths_K0_M_K1[0]) + "x" + - std::to_string(pt->ABlockTransferThreadSliceLengths_K0_M_K1[1]) + "x" + - std::to_string(pt->ABlockTransferThreadSliceLengths_K0_M_K1[2]) + "_"; - - out += std::to_string(pt->ABlockTransferThreadClusterLengths_K0_M_K1[0]) + "x" + - std::to_string(pt->ABlockTransferThreadClusterLengths_K0_M_K1[1]) + "x" + - std::to_string(pt->ABlockTransferThreadClusterLengths_K0_M_K1[2]) + "_"; - - out += std::to_string(pt->ABlockTransferThreadClusterArrangeOrder[0]) + "x" + - std::to_string(pt->ABlockTransferThreadClusterArrangeOrder[1]) + "x" + - std::to_string(pt->ABlockTransferThreadClusterArrangeOrder[2]) + "_"; - - out += std::to_string(pt->ABlockTransferSrcAccessOrder[0]) + "x" + - std::to_string(pt->ABlockTransferSrcAccessOrder[1]) + "x" + - std::to_string(pt->ABlockTransferSrcAccessOrder[2]) + "_"; - - out += std::to_string(pt->ABlockTransferSrcVectorDim) + "_"; - out += std::to_string(pt->ABlockTransferSrcScalarPerVector) + "_"; - out += std::to_string(pt->ABlockTransferDstScalarPerVector_K1) + "_"; - out += std::to_string(pt->AThreadTransferSrcResetCoordinateAfterRun) + "_"; - - out += std::to_string(pt->BBlockTransferThreadSliceLengths_K0_N_K1[0]) + "x" + - std::to_string(pt->BBlockTransferThreadSliceLengths_K0_N_K1[1]) + "x" + - std::to_string(pt->BBlockTransferThreadSliceLengths_K0_N_K1[2]) + "_"; - - out += std::to_string(pt->BBlockTransferThreadClusterLengths_K0_N_K1[0]) + "x" + - std::to_string(pt->BBlockTransferThreadClusterLengths_K0_N_K1[1]) + "x" + - std::to_string(pt->BBlockTransferThreadClusterLengths_K0_N_K1[2]) + "_"; - - out += std::to_string(pt->BBlockTransferThreadClusterArrangeOrder[0]) + "x" + - std::to_string(pt->BBlockTransferThreadClusterArrangeOrder[1]) + "x" + - std::to_string(pt->BBlockTransferThreadClusterArrangeOrder[2]) + "_"; - - out += std::to_string(pt->BBlockTransferSrcAccessOrder[0]) + "x" + - std::to_string(pt->BBlockTransferSrcAccessOrder[1]) + "x" + - std::to_string(pt->BBlockTransferSrcAccessOrder[2]) + "_"; - - out += std::to_string(pt->BBlockTransferSrcVectorDim) + "_"; - out += std::to_string(pt->BBlockTransferSrcScalarPerVector) + "_"; - out += std::to_string(pt->BBlockTransferDstScalarPerVector_K1) + "_"; - out += std::to_string(pt->BThreadTransferSrcResetCoordinateAfterRun) + "_"; - - out += std::to_string(pt->CThreadTransferSrcDstAccessOrder[0]) + "x" + - std::to_string(pt->CThreadTransferSrcDstAccessOrder[1]) + "x" + - std::to_string(pt->CThreadTransferSrcDstAccessOrder[2]) + "x" + - std::to_string(pt->CThreadTransferSrcDstAccessOrder[3]) + "x" + - std::to_string(pt->CThreadTransferSrcDstAccessOrder[4]) + "x" + - std::to_string(pt->CThreadTransferSrcDstAccessOrder[5]) + "x" + - std::to_string(pt->CThreadTransferSrcDstAccessOrder[6]) + "x" + - std::to_string(pt->CThreadTransferSrcDstAccessOrder[7]) + "_"; - - out += std::to_string(pt->CThreadTransferSrcDstVectorDim) + "_"; - out += std::to_string(pt->CThreadTransferDstScalarPerVector); - - return (out); -}; - -template -static std::string get_definition_string_from_types() -{ - using namespace ck; - - std::string out; - - out += - " -DCK_PARAM_ABDataTypeEnum=" + std::to_string(get_datatype_enum_from_type::value) + - " -DCK_PARAM_AccDataTypeEnum=" + std::to_string(get_datatype_enum_from_type::value) + - " -DCK_PARAM_CDataTypeEnum=" + std::to_string(get_datatype_enum_from_type::value); - - return (out); -}; - -static std::string -get_definition_string_from_tunable(const tunable_dyn_conv_fwd_v4r4_xdlops_nhwc_kyxc_nhwk* pt) -{ - std::string out; - - out += " -DCK_PARAM_BlockSize=" + std::to_string(pt->BlockSize); - - out += " -DCK_PARAM_MPerBlock=" + std::to_string(pt->MPerBlock) + - " -DCK_PARAM_NPerBlock=" + std::to_string(pt->NPerBlock) + - " -DCK_PARAM_KPerBlock=" + std::to_string(pt->KPerBlock); - out += " -DCK_PARAM_MPerWave=" + std::to_string(pt->MPerWave) + - " -DCK_PARAM_NPerWave=" + std::to_string(pt->NPerWave) + - " -DCK_PARAM_K1=" + std::to_string(pt->K1) + - " -DCK_PARAM_MRepeat=" + std::to_string(pt->MRepeat) + - " -DCK_PARAM_NRepeat=" + std::to_string(pt->NRepeat); - - out += " -DCK_PARAM_ABlockTransferThreadSliceLengths_K0_M_K1=" + - std::to_string(pt->ABlockTransferThreadSliceLengths_K0_M_K1[0]) + "," + - std::to_string(pt->ABlockTransferThreadSliceLengths_K0_M_K1[1]) + "," + - std::to_string(pt->ABlockTransferThreadSliceLengths_K0_M_K1[2]); - - out += " -DCK_PARAM_ABlockTransferThreadClusterLengths_K0_M_K1=" + - std::to_string(pt->ABlockTransferThreadClusterLengths_K0_M_K1[0]) + "," + - std::to_string(pt->ABlockTransferThreadClusterLengths_K0_M_K1[1]) + "," + - std::to_string(pt->ABlockTransferThreadClusterLengths_K0_M_K1[2]); - - out += " -DCK_PARAM_ABlockTransferThreadClusterArrangeOrder=" + - std::to_string(pt->ABlockTransferThreadClusterArrangeOrder[0]) + "," + - std::to_string(pt->ABlockTransferThreadClusterArrangeOrder[1]) + "," + - std::to_string(pt->ABlockTransferThreadClusterArrangeOrder[2]); - - out += " -DCK_PARAM_ABlockTransferSrcAccessOrder=" + - std::to_string(pt->ABlockTransferSrcAccessOrder[0]) + "," + - std::to_string(pt->ABlockTransferSrcAccessOrder[1]) + "," + - std::to_string(pt->ABlockTransferSrcAccessOrder[2]); - - out += - " -DCK_PARAM_ABlockTransferSrcVectorDim=" + std::to_string(pt->ABlockTransferSrcVectorDim); - out += " -DCK_PARAM_ABlockTransferSrcScalarPerVector=" + - std::to_string(pt->ABlockTransferSrcScalarPerVector); - out += " -DCK_PARAM_ABlockTransferDstScalarPerVector_K1=" + - std::to_string(pt->ABlockTransferDstScalarPerVector_K1); - out += " -DCK_PARAM_AThreadTransferSrcResetCoordinateAfterRun=" + - std::to_string(pt->AThreadTransferSrcResetCoordinateAfterRun); - - out += " -DCK_PARAM_BBlockTransferThreadSliceLengths_K0_N_K1=" + - std::to_string(pt->BBlockTransferThreadSliceLengths_K0_N_K1[0]) + "," + - std::to_string(pt->BBlockTransferThreadSliceLengths_K0_N_K1[1]) + "," + - std::to_string(pt->BBlockTransferThreadSliceLengths_K0_N_K1[2]); - - out += " -DCK_PARAM_BBlockTransferThreadClusterLengths_K0_N_K1=" + - std::to_string(pt->BBlockTransferThreadClusterLengths_K0_N_K1[0]) + "," + - std::to_string(pt->BBlockTransferThreadClusterLengths_K0_N_K1[1]) + "," + - std::to_string(pt->BBlockTransferThreadClusterLengths_K0_N_K1[2]); - - out += " -DCK_PARAM_BBlockTransferThreadClusterArrangeOrder=" + - std::to_string(pt->BBlockTransferThreadClusterArrangeOrder[0]) + "," + - std::to_string(pt->BBlockTransferThreadClusterArrangeOrder[1]) + "," + - std::to_string(pt->BBlockTransferThreadClusterArrangeOrder[2]); - - out += " -DCK_PARAM_BBlockTransferSrcAccessOrder=" + - std::to_string(pt->BBlockTransferSrcAccessOrder[0]) + "," + - std::to_string(pt->BBlockTransferSrcAccessOrder[1]) + "," + - std::to_string(pt->BBlockTransferSrcAccessOrder[2]); - - out += - " -DCK_PARAM_BBlockTransferSrcVectorDim=" + std::to_string(pt->BBlockTransferSrcVectorDim); - out += " -DCK_PARAM_BBlockTransferSrcScalarPerVector=" + - std::to_string(pt->BBlockTransferSrcScalarPerVector); - out += " -DCK_PARAM_BBlockTransferDstScalarPerVector_K1=" + - std::to_string(pt->BBlockTransferDstScalarPerVector_K1); - out += " -DCK_PARAM_BThreadTransferSrcResetCoordinateAfterRun=" + - std::to_string(pt->BThreadTransferSrcResetCoordinateAfterRun); - - out += " -DCK_PARAM_CThreadTransferSrcDstAccessOrder=" + - std::to_string(pt->CThreadTransferSrcDstAccessOrder[0]) + "," + - std::to_string(pt->CThreadTransferSrcDstAccessOrder[1]) + "," + - std::to_string(pt->CThreadTransferSrcDstAccessOrder[2]) + "," + - std::to_string(pt->CThreadTransferSrcDstAccessOrder[3]) + "," + - std::to_string(pt->CThreadTransferSrcDstAccessOrder[4]) + "," + - std::to_string(pt->CThreadTransferSrcDstAccessOrder[5]) + "," + - std::to_string(pt->CThreadTransferSrcDstAccessOrder[6]) + "," + - std::to_string(pt->CThreadTransferSrcDstAccessOrder[7]); - - out += " -DCK_PARAM_CThreadTransferSrcDstVectorDim=" + - std::to_string(pt->CThreadTransferSrcDstVectorDim); - out += " -DCK_PARAM_CThreadTransferDstScalarPerVector=" + - std::to_string(pt->CThreadTransferDstScalarPerVector); - - return (out); -}; - -} // namespace detail_dyn_conv_fwd_v4r4_xdlops_nhwc_kyxc_nhwk - -template -void online_device_dynamic_convolution_forward_implicit_gemm_v4r4_xdlops_nhwc_kyxc_nhwk( - online_compile::Handle* handle, - const InLengths& in_n_hi_wi_c_lengths, - const WeiLengths& wei_k_y_x_c_lengths, - const OutLengths& out_n_ho_wo_k_lengths, - const ConvStrides& conv_strides, - const ConvDilations& conv_dilations, - const InLeftPads& in_left_pads, - const InRightPads& in_right_pads, - const Tensor& in_n_hi_wi_c, - const Tensor& wei_k_y_x_c, - Tensor& out_n_ho_wo_k, - const tunable_dyn_conv_fwd_v4r4_xdlops_nhwc_kyxc_nhwk* tunable, - ck::index_t nrepeat) -{ - using namespace ck; - using namespace detail_dyn_conv_fwd_v4r4_xdlops_nhwc_kyxc_nhwk; - using size_t = std::size_t; - - ///////////////////////////////////////////////////////////////////////////////////////////////////////////// - // The follow codes are only used for computing the grid_size, hasMainKBlockLoop, - // hasDoubleTailKBlockLoop - - constexpr auto I0 = Number<0>{}; - constexpr auto I1 = Number<1>{}; - constexpr auto I2 = Number<2>{}; - constexpr auto I3 = Number<3>{}; - - const auto in_n_hi_wi_c_desc = - make_dynamic_naive_tensor_descriptor_packed_v2(in_n_hi_wi_c_lengths); - const auto wei_k_y_x_c_desc = - make_dynamic_naive_tensor_descriptor_packed_v2(wei_k_y_x_c_lengths); - const auto out_n_ho_wo_k_desc = - make_dynamic_naive_tensor_descriptor_packed_v2(out_n_ho_wo_k_lengths); - - const auto n = in_n_hi_wi_c_desc.GetLength(I0); - const auto hi = in_n_hi_wi_c_desc.GetLength(I1); - const auto wi = in_n_hi_wi_c_desc.GetLength(I2); - const auto c = in_n_hi_wi_c_desc.GetLength(I3); - - const auto k = wei_k_y_x_c_desc.GetLength(I0); - const auto y = wei_k_y_x_c_desc.GetLength(I1); - const auto x = wei_k_y_x_c_desc.GetLength(I2); - - const auto ho = out_n_ho_wo_k_desc.GetLength(I1); - const auto wo = out_n_ho_wo_k_desc.GetLength(I2); - - const auto M = k; - const auto N = n * ho * wo; - const auto K = c * y * x; - const auto K0 = K / tunable->K1; - - const index_t grid_size = (M / tunable->MPerBlock) * (N / tunable->NPerBlock); - - // these buffers are usually provided by the user application - DeviceMem in_n_hi_wi_c_dev_buf(sizeof(TInWei) * in_n_hi_wi_c.mDesc.GetElementSpace()); - DeviceMem wei_k_y_x_c_dev_buf(sizeof(TInWei) * wei_k_y_x_c.mDesc.GetElementSpace()); - DeviceMem out_n_ho_wo_k_dev_buf(sizeof(TOut) * out_n_ho_wo_k.mDesc.GetElementSpace()); - - in_n_hi_wi_c_dev_buf.ToDevice(in_n_hi_wi_c.mData.data()); - wei_k_y_x_c_dev_buf.ToDevice(wei_k_y_x_c.mData.data()); - out_n_ho_wo_k_dev_buf.ToDevice(out_n_ho_wo_k.mData.data()); - - // these are workspace buffers that should be expressed to the user by the corresponding - // workspace API - DeviceMem workspace_buf(4096); - - void* a_k0_m_k1_grid_desc_dev_buf = workspace_buf.GetDeviceBuffer(); - void* b_k0_n_k1_grid_desc_dev_buf = - static_cast(static_cast(workspace_buf.GetDeviceBuffer()) + 1024); - void* c_m0_m1_m2_n_grid_desc_dev_buf = - static_cast(static_cast(workspace_buf.GetDeviceBuffer()) + 2048); - void* c_blockid_to_m0_n0_block_cluster_adaptor_dev_buf = - static_cast(static_cast(workspace_buf.GetDeviceBuffer()) + 3072); - - const std::vector vld = {static_cast(tunable->BlockSize), 1, 1}; - const std::vector vgd1 = {static_cast(tunable->BlockSize), 1, 1}; - const std::vector vgd2 = {static_cast(grid_size * tunable->BlockSize), 1, 1}; - - std::string program_name = - "dynamic_convolution_forward_implicit_gemm_v4r4_xdlops_nhwc_kyxc_nhwk.cpp"; - std::string algo_name = "implicit_gemm_conv_fwd_v4r4_xdlops_nhwc"; - - std::string param = " -std=c++17 "; - std::string network_config; - - param += get_definition_string_from_types() + " -DCK_USE_AMD_XDLOPS "; - param += get_definition_string_from_tunable(tunable); - - network_config = get_network_config_string_from_types() + "_" + - get_network_config_string_from_tunable(tunable); - - std::vector kernel1_times; - std::vector kernel2_times; - - for(index_t i = 0; i < nrepeat; ++i) - { - KernelTimer timer1, timer2; - std::string kernel_name; - - kernel_name = - "dynamic_convolution_forward_implicit_gemm_v4r4_xdlops_nhwc_kyxc_nhwk_prepare"; - auto network_config_1 = network_config + "_1"; - - timer1.Start(); - handle->AddKernel(algo_name, network_config_1, program_name, kernel_name, vld, vgd1, param)( - static_cast(in_n_hi_wi_c_lengths[I0]), - static_cast(in_n_hi_wi_c_lengths[I1]), - static_cast(in_n_hi_wi_c_lengths[I2]), - static_cast(in_n_hi_wi_c_lengths[I3]), - static_cast(wei_k_y_x_c_lengths[I0]), - static_cast(wei_k_y_x_c_lengths[I1]), - static_cast(wei_k_y_x_c_lengths[I2]), - conv_strides[I0], - conv_strides[I1], - conv_dilations[I0], - conv_dilations[I1], - in_left_pads[I0], - in_left_pads[I1], - in_right_pads[I0], - in_right_pads[I1], - a_k0_m_k1_grid_desc_dev_buf, - b_k0_n_k1_grid_desc_dev_buf, - c_m0_m1_m2_n_grid_desc_dev_buf, - c_blockid_to_m0_n0_block_cluster_adaptor_dev_buf); - timer1.End(); - - kernel_name = "dynamic_convolution_forward_implicit_gemm_v4r4_xdlops_nhwc_kyxc_nhwk"; - auto network_config_2 = network_config + "_2"; - - timer2.Start(); - handle->AddKernel(algo_name, network_config_2, program_name, kernel_name, vld, vgd2, param)( - reinterpret_cast(in_n_hi_wi_c_dev_buf.GetDeviceBuffer()), - reinterpret_cast(wei_k_y_x_c_dev_buf.GetDeviceBuffer()), - reinterpret_cast(out_n_ho_wo_k_dev_buf.GetDeviceBuffer()), - (const void*)(a_k0_m_k1_grid_desc_dev_buf), - (const void*)(b_k0_n_k1_grid_desc_dev_buf), - (const void*)(c_m0_m1_m2_n_grid_desc_dev_buf), - (const void*)(c_blockid_to_m0_n0_block_cluster_adaptor_dev_buf)); - timer2.End(); - - kernel1_times.push_back(timer1.GetElapsedTime()); - kernel2_times.push_back(timer2.GetElapsedTime()); - } - - { - auto ave_time1 = - std::accumulate( - std::next(kernel1_times.begin()), kernel1_times.end(), 0., std::plus{}) / - (nrepeat - 1); - auto ave_time2 = - std::accumulate( - std::next(kernel2_times.begin()), kernel2_times.end(), 0., std::plus{}) / - (nrepeat - 1); - - const auto N = in_n_hi_wi_c_lengths[I0]; - const auto C = in_n_hi_wi_c_lengths[I3]; - - const auto Ho = out_n_ho_wo_k_lengths[I1]; - const auto Wo = out_n_ho_wo_k_lengths[I2]; - const auto K = out_n_ho_wo_k_lengths[I3]; - - const auto Y = wei_k_y_x_c_lengths[I1]; - const auto X = wei_k_y_x_c_lengths[I2]; - - float perf = (float)(std::size_t(2) * N * K * Ho * Wo * C * Y * X) / - (std::size_t(1000) * 1000 * 1000) / ave_time2; - - std::cout << "Average time : " << ave_time1 + ave_time2 << " ms(" << ave_time1 << ", " - << ave_time2 << "), " << perf << " TFlop/s" << std::endl; - }; - - // copy result back to host - out_n_ho_wo_k_dev_buf.FromDevice(out_n_ho_wo_k.mData.data()); -} diff --git a/host/driver_online/include/online_device_dynamic_convolution_forward_implicit_gemm_v6r1_dlops_nchw_kcyx_nkhw.hpp b/host/driver_online/include/online_device_dynamic_convolution_forward_implicit_gemm_v6r1_dlops_nchw_kcyx_nkhw.hpp deleted file mode 100644 index 7b88ef02b4..0000000000 --- a/host/driver_online/include/online_device_dynamic_convolution_forward_implicit_gemm_v6r1_dlops_nchw_kcyx_nkhw.hpp +++ /dev/null @@ -1,183 +0,0 @@ -#pragma once -#include "device.hpp" -#include "host_tensor.hpp" -#include "handle.hpp" -#include "online_driver_common.hpp" -#include "convolution_problem_descriptor.hpp" -#include "dynamic_tensor_descriptor.hpp" -#include "dynamic_tensor_descriptor_helper.hpp" -#include "transform_forward_convolution_into_gemm_v6r1_nchw_kcyx_nkhw.hpp" -#include "conv_igemm_fwd_v6r1_dlops_nchw_kcyx_nkhw.hpp" - -template -void online_device_dynamic_convolution_forward_implicit_gemm_v6r1_dlops_nchw_kcyx_nkhw( - online_compile::Handle* handle, - const InLengths& in_n_c_hi_wi_lengths, - const WeiLengths& wei_k_c_y_x_lengths, - const OutLengths& out_n_k_ho_wo_lengths, - const ConvStrides& conv_strides, - const ConvDilations& conv_dilations, - const InLeftPads& in_left_pads, - const InRightPads& in_right_pads, - const Tensor& in_n_c_hi_wi, - const Tensor& wei_k_c_y_x, - Tensor& out_n_k_ho_wo, - const ck::driver::CompileParameterConvIgemmFwdV6r1DlopsNchwKcyxNkhw& compile_param, - ck::index_t nrepeat) -{ - using namespace ck; - using namespace ck::driver; - using size_t = std::size_t; - - std::cout << __func__ << std::endl; - - constexpr auto I0 = Number<0>{}; - constexpr auto I1 = Number<1>{}; - constexpr auto I2 = Number<2>{}; - constexpr auto I3 = Number<3>{}; - - ConvolutionProblemDescriptor conv_problem_desc{in_n_c_hi_wi_lengths[I0], - out_n_k_ho_wo_lengths[I1], - in_n_c_hi_wi_lengths[I1], - wei_k_c_y_x_lengths[I2], - wei_k_c_y_x_lengths[I3], - in_n_c_hi_wi_lengths[I2], - in_n_c_hi_wi_lengths[I3], - out_n_k_ho_wo_lengths[I2], - out_n_k_ho_wo_lengths[I3], - conv_strides[I0], - conv_strides[I1], - conv_dilations[I0], - conv_dilations[I1], - in_left_pads[I0], - in_left_pads[I1], - in_right_pads[I0], - in_right_pads[I1], - get_datatype_enum_from_type::value, - get_datatype_enum_from_type::value, - get_datatype_enum_from_type::value}; - - if(!ConvIgemmFwdV6r1DlopsNchwKcyxNkhw::IsValidCompileParameter(conv_problem_desc, - compile_param)) - { - throw std::runtime_error("wrong! IsValidCompileParameter fail"); - } - - DeviceMem in_n_c_hi_wi_dev_buf(sizeof(TInWei) * in_n_c_hi_wi.mDesc.GetElementSpace()); - DeviceMem wei_k_c_y_x_dev_buf(sizeof(TInWei) * wei_k_c_y_x.mDesc.GetElementSpace()); - DeviceMem out_n_k_ho_wo_dev_buf(sizeof(TOut) * out_n_k_ho_wo.mDesc.GetElementSpace()); - - in_n_c_hi_wi_dev_buf.ToDevice(in_n_c_hi_wi.mData.data()); - wei_k_c_y_x_dev_buf.ToDevice(wei_k_c_y_x.mData.data()); - out_n_k_ho_wo_dev_buf.ToDevice(out_n_k_ho_wo.mData.data()); - - // workspace is used for save transformed tensor descritpors created by prepare kernel - DeviceMem workspace_dev_buf( - ConvIgemmFwdV6r1DlopsNchwKcyxNkhw::GetWorkSpaceSize(conv_problem_desc, compile_param)); - - const auto block_size = std::size_t( - ConvIgemmFwdV6r1DlopsNchwKcyxNkhw::GetBlockSize(conv_problem_desc, compile_param)); - - const auto grid_size = std::size_t( - ConvIgemmFwdV6r1DlopsNchwKcyxNkhw::GetGridSize(conv_problem_desc, compile_param)); - - const std::vector vld1 = {1, 1, 1}; - const std::vector vgd1 = {1, 1, 1}; - - const std::vector vld2 = {static_cast(block_size), 1, 1}; - const std::vector vgd2 = {static_cast(grid_size * block_size), 1, 1}; - - std::string program_name = - "dynamic_convolution_forward_implicit_gemm_v6r1_dlops_nchw_kcyx_nkhw.cpp"; - std::string algo_name = "implicit_gemm_conv_fwd_v6r1_dlops_nchw"; - - std::string compile_param_string = - get_ck_hip_online_compile_common_flag() + compile_param.GetCompileParameterString(); - std::string network_config = compile_param_string; - - std::vector kernel1_times; - std::vector kernel2_times; - - for(index_t i = 0; i < nrepeat + 1; ++i) - { - KernelTimer timer1, timer2; - std::string kernel_name; - - kernel_name = "dynamic_convolution_forward_implicit_gemm_v6r1_dlops_nchw_kcyx_nkhw_prepare"; - auto network_config_1 = network_config + "_1"; - - timer1.Start(); - handle->AddKernel(algo_name, - network_config_1, - program_name, - kernel_name, - vld1, - vgd1, - compile_param_string)(static_cast(in_n_c_hi_wi_lengths[I0]), - static_cast(in_n_c_hi_wi_lengths[I1]), - static_cast(in_n_c_hi_wi_lengths[I2]), - static_cast(in_n_c_hi_wi_lengths[I3]), - static_cast(wei_k_c_y_x_lengths[I0]), - static_cast(wei_k_c_y_x_lengths[I2]), - static_cast(wei_k_c_y_x_lengths[I3]), - conv_strides[I0], - conv_strides[I1], - conv_dilations[I0], - conv_dilations[I1], - in_left_pads[I0], - in_left_pads[I1], - in_right_pads[I0], - in_right_pads[I1], - (void*)(workspace_dev_buf.GetDeviceBuffer())); - timer1.End(); - - kernel_name = "dynamic_convolution_forward_implicit_gemm_v6r1_dlops_nchw_kcyx_nkhw"; - auto network_config_2 = network_config + "_2"; - - timer2.Start(); - handle->AddKernel(algo_name, - network_config_2, - program_name, - kernel_name, - vld2, - vgd2, - compile_param_string)( - reinterpret_cast(wei_k_c_y_x_dev_buf.GetDeviceBuffer()), - reinterpret_cast(in_n_c_hi_wi_dev_buf.GetDeviceBuffer()), - reinterpret_cast(out_n_k_ho_wo_dev_buf.GetDeviceBuffer()), - (const void*)(workspace_dev_buf.GetDeviceBuffer())); - timer2.End(); - - kernel1_times.push_back(timer1.GetElapsedTime()); - kernel2_times.push_back(timer2.GetElapsedTime()); - } - - { - auto ave_time1 = - std::accumulate( - std::next(kernel1_times.begin()), kernel1_times.end(), 0., std::plus{}) / - nrepeat; - auto ave_time2 = - std::accumulate( - std::next(kernel2_times.begin()), kernel2_times.end(), 0., std::plus{}) / - nrepeat; - - float perf = (float)(conv_problem_desc.CalculateFlop()) / - (std::size_t(1000) * 1000 * 1000) / (ave_time1 + ave_time2); - - std::cout << "Average time : " << ave_time1 + ave_time2 << " ms(" << ave_time1 << ", " - << ave_time2 << "), " << perf << " TFlop/s" << std::endl; - }; - - // copy result back to host - out_n_k_ho_wo_dev_buf.FromDevice(out_n_k_ho_wo.mData.data()); -} diff --git a/host/online_compile/CMakeLists.txt b/host/online_compile/CMakeLists.txt deleted file mode 100644 index 1b66703fcd..0000000000 --- a/host/online_compile/CMakeLists.txt +++ /dev/null @@ -1,168 +0,0 @@ -set(CMAKE_CXX_COMPILER /opt/rocm/llvm/bin/clang++) - -## for online-compiling of HIP kernels -set(OLC_HIP_COMPILER ${CMAKE_CXX_COMPILER} CACHE PATH "") - -## reset to avoid the C++ options from the parent project -set(CMAKE_CXX_FLAGS "") -message("Compiling options for library and kernels: ${CMAKE_CXX_FLAGS}") - -# look for and register clang-offload-bundler -if(OLC_HIP_COMPILER MATCHES ".*clang\\+\\+$") - find_program(OLC_OFFLOADBUNDLER_BIN clang-offload-bundler - PATH_SUFFIXES bin - PATHS - /opt/rocm/llvm - ${CMAKE_INSTALL_PREFIX}/llvm - ) -endif() - -if(OLC_OFFLOADBUNDLER_BIN) - message(STATUS "clang-offload-bundler found: ${OLC_OFFLOADBUNDLER_BIN}") - set(OLC_OFFLOADBUNDLER_BIN "${OLC_OFFLOADBUNDLER_BIN}") -else() - # look for and register extractkernel - message(STATUS "clang-offload-bundler not found") - - find_program(EXTRACTKERNEL_BIN extractkernel - PATH_SUFFIXES bin - PATHS - /opt/rocm/hip - /opt/rocm/hcc - /opt/rocm - ${CMAKE_INSTALL_PREFIX}/hip - ${CMAKE_INSTALL_PREFIX}/hcc - ${CMAKE_INSTALL_PREFIX} - - ) - if(EXTRACTKERNEL_BIN) - message(STATUS "extractkernel found: ${EXTRACTKERNEL_BIN}") - set(EXTRACTKERNEL_BIN "${EXTRACTKERNEL_BIN}") - else() - message(FATAL_ERROR "extractkernel not found") - endif() -endif() - -option(Boost_USE_STATIC_LIBS "Use boost static libraries" OFF) -set(BOOST_COMPONENTS filesystem) -add_definitions(-DBOOST_ALL_NO_LIB=1) -find_package(Boost REQUIRED COMPONENTS ${BOOST_COMPONENTS}) - -# HIP is always required -find_package(hip REQUIRED PATHS /opt/rocm) -message(STATUS "Build with HIP ${hip_VERSION}") -target_flags(HIP_COMPILER_FLAGS hip::device) -# Remove cuda arch flags -string(REGEX REPLACE --cuda-gpu-arch=[a-z0-9]+ "" HIP_COMPILER_FLAGS "${HIP_COMPILER_FLAGS}") -string(REGEX REPLACE --offload-arch=[a-z0-9]+ "" HIP_COMPILER_FLAGS "${HIP_COMPILER_FLAGS}") - -set(OLC_hip_VERSION_MAJOR "${hip_VERSION_MAJOR}") -set(OLC_hip_VERSION_MINOR "${hip_VERSION_MINOR}") -set(OLC_hip_VERSION_PATCH "${hip_VERSION_PATCH}") - -option(ENABLE_DEBUG "Build to enable debugging" ON) -if(ENABLE_DEBUG) - set(OLC_DEBUG 1) -else() - set(OLC_DEBUG 0) -endif() - -configure_file("${PROJECT_SOURCE_DIR}/host/online_compile/include/config.h.in" "${PROJECT_BINARY_DIR}/host/online_compile/include/config.h") - -include_directories(BEFORE - ${PROJECT_BINARY_DIR}/host/online_compile/include -) - -message(STATUS "Hip compiler flags: ${HIP_COMPILER_FLAGS}") - -## HIP_COMPILER_FLAGS will be used for on-line compiling of the HIP kernels -set(HIP_COMPILER_FLAGS "${HIP_COMPILER_FLAGS} ${HIP_ONLINE_COMPILER_FLAGS}") -add_definitions("-DHIP_COMPILER_FLAGS=${HIP_COMPILER_FLAGS}") - -file(GLOB_RECURSE COMPOSABLE_KERNEL_INCLUDE_1 "${PROJECT_SOURCE_DIR}/composable_kernel/include/*/*.hpp") -file(GLOB COMPOSABLE_KERNEL_INCLUDE_2 "${PROJECT_SOURCE_DIR}/external/rocm/include/bfloat16_dev.hpp") -set(MCONV_KERNEL_INCLUDES - ${COMPOSABLE_KERNEL_INCLUDE_1} - ${COMPOSABLE_KERNEL_INCLUDE_2} - ) - -file(GLOB_RECURSE MCONV_KERNELS "${PROJECT_SOURCE_DIR}/composable_kernel/src/kernel_wrapper/*.cpp") - -add_kernels(${CMAKE_CURRENT_SOURCE_DIR} "${MCONV_KERNELS}") -add_kernel_includes(${CMAKE_CURRENT_SOURCE_DIR} "${MCONV_KERNEL_INCLUDES}") - -set(ONLINE_COMPILATION_SOURCE - ${PROJECT_BINARY_DIR}/kernel.cpp - ${PROJECT_BINARY_DIR}/kernel_includes.cpp -) - -include_directories(BEFORE - ${PROJECT_BINARY_DIR}/host/online_compile/include - include -) - -set(OLC_HIP_UTILITY_CPPS - hip_utility/logger.cpp - hip_utility/tmp_dir.cpp - hip_utility/md5.cpp - hip_utility/exec_utils.cpp - hip_utility/target_properties.cpp - hip_utility/handlehip.cpp - hip_utility/kernel_build_params.cpp - hip_utility/hip_build_utils.cpp - hip_utility/hipoc_program.cpp - hip_utility/hipoc_kernel.cpp - hip_utility/kernel_cache.cpp - hip_utility/binary_cache.cpp - ) - -list(APPEND OLC_SOURCES ${OLC_HIP_UTILITY_CPPS} ${OLC_HIP_UTILITY_HEADERS}) - -## addkernels provide the tool to create inlined kernels in one header -add_subdirectory(addkernels) - -function(inline_kernels_src KERNELS KERNEL_INCLUDES) - set(KERNEL_SRC_HPP_FILENAME batch_all.cpp.hpp) - set(KERNEL_SRC_HPP_PATH ${PROJECT_BINARY_DIR}/inlined_kernels/${KERNEL_SRC_HPP_FILENAME}) - set(KERNEL_SRC_CPP_PATH ${PROJECT_BINARY_DIR}/inlined_kernels/batch_all.cpp) - - add_custom_command( - OUTPUT ${KERNEL_SRC_HPP_PATH} - WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} - DEPENDS addkernels ${KERNELS} ${KERNEL_INCLUDES} - COMMAND $ -target ${KERNEL_SRC_HPP_PATH} -extern -source ${KERNELS} - COMMENT "Inlining All kernels" - ) - configure_file(kernels_batch.cpp.in ${KERNEL_SRC_CPP_PATH}) - list(APPEND OLC_SOURCES ${KERNEL_SRC_CPP_PATH} ${KERNEL_SRC_HPP_PATH}) - - set(OLC_SOURCES ${OLC_SOURCES} PARENT_SCOPE) -endfunction() - -inline_kernels_src("${MCONV_KERNELS}" "${MCONV_KERNEL_INCLUDES}") - -list(APPEND ONLINE_COMPILATION_SOURCE ${OLC_SOURCES} ${PROJECT_BINARY_DIR}/olc_kernel_includes.h) - -add_custom_command( - OUTPUT ${PROJECT_BINARY_DIR}/olc_kernel_includes.h - WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} - DEPENDS addkernels ${MCONV_KERNEL_INCLUDES} - COMMAND $ -no-recurse -guard GUARD_OLC_KERNEL_INCLUDES_HPP_ -target ${PROJECT_BINARY_DIR}/olc_kernel_includes.h -source ${MCONV_KERNEL_INCLUDES} - COMMENT "Inlining HIP kernel includes" - ) - -## the library target -add_library(online_compile SHARED ${ONLINE_COMPILATION_SOURCE}) - -target_include_directories(online_compile PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/online_compile/include/) -target_include_directories(online_compile PRIVATE ${PROJECT_BINARY_DIR}) -target_include_directories(online_compile PRIVATE ${PROJECT_SOURCE_DIR}/external/half/include/) - -target_link_libraries(online_compile PRIVATE hip::device) -target_link_libraries(online_compile INTERFACE hip::host) -target_link_libraries(online_compile PRIVATE Boost::filesystem) - -target_compile_features(online_compile PUBLIC) -set_target_properties(online_compile PROPERTIES POSITION_INDEPENDENT_CODE ON) - -install(TARGETS online_compile LIBRARY DESTINATION lib) diff --git a/host/online_compile/addkernels/CMakeLists.txt b/host/online_compile/addkernels/CMakeLists.txt deleted file mode 100644 index 874cba6a5e..0000000000 --- a/host/online_compile/addkernels/CMakeLists.txt +++ /dev/null @@ -1,30 +0,0 @@ -################################################################################ -# -# MIT License -# -# Copyright (c) 2017 Advanced Micro Devices, Inc. -# -# Permission is hereby granted, free of charge, to any person obtaining a copy -# of this software and associated documentation files (the "Software"), to deal -# in the Software without restriction, including without limitation the rights -# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -# copies of the Software, and to permit persons to whom the Software is -# furnished to do so, subject to the following conditions: -# -# The above copyright notice and this permission notice shall be included in all -# copies or substantial portions of the Software. -# -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -# SOFTWARE. -# -################################################################################ - -set(ADD_KERNELS_SOURCE include_inliner.cpp addkernels.cpp) - -add_executable(addkernels EXCLUDE_FROM_ALL ${ADD_KERNELS_SOURCE}) - diff --git a/host/online_compile/addkernels/addkernels.cpp b/host/online_compile/addkernels/addkernels.cpp deleted file mode 100644 index 5be523d97b..0000000000 --- a/host/online_compile/addkernels/addkernels.cpp +++ /dev/null @@ -1,264 +0,0 @@ -/******************************************************************************* - * - * MIT License - * - * Copyright (c) 2021 Advanced Micro Devices, Inc. - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - * - *******************************************************************************/ -#include "include_inliner.hpp" -#include -#include -#include -#include -#include -#include -#include - -void Bin2Hex(std::istream& source, - std::ostream& target, - const std::string& variable, - bool nullTerminate, - size_t bufferSize, - size_t lineSize) -{ - source.seekg(0, std::ios::end); - std::unique_ptr buffer(new unsigned char[bufferSize]); - std::streamoff sourceSize = source.tellg(); - std::streamoff blockStart = 0; - - if(variable.length() != 0) - { - target << "extern const size_t " << variable << "_SIZE;" << std::endl; - target << "extern const unsigned char " << variable << "[];" << std::endl; - target << "const size_t " << variable << "_SIZE = " << std::setbase(10) << sourceSize << ";" - << std::endl; - target << "const unsigned char " << variable << "[] = {" << std::endl; - } - - target << std::setbase(16) << std::setfill('0'); - source.seekg(0, std::ios::beg); - - while(blockStart < sourceSize) - { - source.read(reinterpret_cast(buffer.get()), bufferSize); - - std::streamoff pos = source.tellg(); - std::streamoff blockSize = (pos < 0 ? sourceSize : pos) - blockStart; - std::streamoff i = 0; - - while(i < blockSize) - { - size_t j = i; - size_t end = std::min(i + lineSize, blockSize); - - for(; j < end; j++) - target << "0x" << std::setw(2) << static_cast(buffer[j]) << ","; - - target << std::endl; - i = end; - } - - blockStart += blockSize; - } - - if(nullTerminate) - target << "0x00," << std::endl; - - if(variable.length() != 0) - { - target << "};" << std::endl; - } -} - -void PrintHelp() -{ - std::cout << "Usage: bin2hex {