diff --git a/CMakeLists.txt b/CMakeLists.txt index db48a26202..882a1af308 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1,6 +1,11 @@ cmake_minimum_required(VERSION 2.8.3) project(modular_convolution) +list(APPEND CMAKE_MODULE_PATH "${PROJECT_SOURCE_DIR}/cmake") + +include(TargetFlags) +include(AddKernels) + #c++ enable_language(CXX) set(CMAKE_CXX_STANDARD 17) @@ -8,15 +13,6 @@ set(CMAKE_CXX_STANDARD_REQUIRED ON) set(CMAKE_CXX_EXTENSIONS OFF) message("CMAKE_CXX_COMPILER_ID: ${CMAKE_CXX_COMPILER_ID}") -#boost -find_package(Boost REQUIRED) - -message("Boost_INCLUDE_DIRS: ${Boost_INCLUDE_DIRS}") -message("Boost_LIBRARY_DIRS: ${Boost_LIBRARY_DIRS}") - -include_directories(BEFORE ${Boost_INCLUDE_DIRS}) -link_directories(${Boost_LIBRARY_DIRS}) - #OpenMP if(CMAKE_CXX_COMPILER_ID MATCHES "Clang") # workaround issue hipcc in rocm3.5 cannot find openmp @@ -78,3 +74,31 @@ elseif(DEVICE_BACKEND STREQUAL "NVIDIA") endif() add_subdirectory(driver) + +set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS}") + +message("Compiling options for drivers: ${CMAKE_CXX_FLAGS}") + +if(DEVICE_BACKEND STREQUAL "AMD") + set(CONV_SOURCE driver/conv_driver.cpp) + set(CONV_V2_SOURCE driver/conv_driver_v2.cpp) + set(CONV_V2_OLC_SOURCE driver/conv_driver_v2_olc.cpp) + set(CONV_BWD_DATA_SOURCE driver/conv_bwd_data_driver.cpp) +elseif(DEVICE_BACKEND STREQUAL "NVIDIA") + set(CONV_SOURCE driver/conv_driver.cu) + set(CONV_BWD_DATA_SOURCE driver/conv_bwd_data_driver.cu) +endif() + +##add_executable(conv_driver ${CONV_SOURCE}) +add_executable(conv_driver_v2 ${CONV_V2_SOURCE}) +add_executable(conv_driver_v2_olc ${CONV_V2_OLC_SOURCE}) +##add_executable(conv_bwd_data_driver ${CONV_BWD_DATA_SOURCE}) + +target_include_directories(conv_driver_v2_olc PRIVATE driver/olCompiling/include/) + +##target_link_libraries(conv_driver PRIVATE modConv) +target_link_libraries(conv_driver_v2 PRIVATE modConv) +target_link_libraries(conv_driver_v2_olc PRIVATE modConv) +##target_link_libraries(conv_bwd_data_driver PRIVATE modConv) + + 
diff --git a/cmake/AddKernels.cmake b/cmake/AddKernels.cmake new file mode 100644 index 0000000000..429ecc47a9 --- /dev/null +++ b/cmake/AddKernels.cmake @@ -0,0 +1,40 @@ + +function(add_kernels SRC_DIR KERNEL_FILES) + set(INIT_KERNELS_LIST) + set(KERNELS_DECLS) + foreach(KERNEL_FILE ${KERNEL_FILES}) + if("${CMAKE_VERSION}" VERSION_LESS 3.0) + configure_file(${KERNEL_FILE} ${KERNEL_FILE}.delete) + else() + set_property(DIRECTORY APPEND PROPERTY CMAKE_CONFIGURE_DEPENDS ${KERNEL_FILE}) + endif() + get_filename_component(BASE_NAME ${KERNEL_FILE} NAME_WE) + string(TOUPPER "${BASE_NAME}" KEY_NAME) + string(MAKE_C_IDENTIFIER "${KEY_NAME}" VAR_NAME) + string(APPEND KERNELS_DECLS "extern const size_t APP_KERNEL_${VAR_NAME}_SIZE;\n") + string(APPEND KERNELS_DECLS "extern const unsigned char APP_KERNEL_${VAR_NAME}[];\n") + list(APPEND INIT_KERNELS_LIST " { \"${KEY_NAME}\", std::string(reinterpret_cast(APP_KERNEL_${VAR_NAME}), APP_KERNEL_${VAR_NAME}_SIZE) }") + endforeach() + string(REPLACE ";" ",\n" INIT_KERNELS "${INIT_KERNELS_LIST}") + configure_file(${SRC_DIR}/kernel.cpp.in ${PROJECT_BINARY_DIR}/kernel.cpp) +endfunction() + +function(add_kernel_includes SRC_DIR KERNEL_FILES) + set(INIT_KERNELS_LIST) + foreach(KERNEL_FILE ${KERNEL_FILES}) + if("${CMAKE_VERSION}" VERSION_LESS 3.0) + configure_file(${KERNEL_FILE} ${KERNEL_FILE}.delete) + else() + set_property(DIRECTORY APPEND PROPERTY CMAKE_CONFIGURE_DEPENDS ${KERNEL_FILE}) + endif() + get_filename_component(BASE_NAME ${KERNEL_FILE} NAME_WE) + get_filename_component(FILE_NAME ${KERNEL_FILE} NAME) + string(TOUPPER "${BASE_NAME}" KEY_NAME) + string(MAKE_C_IDENTIFIER "${KEY_NAME}" VAR_NAME) + list(APPEND INIT_KERNELS_LIST " { \"${FILE_NAME}\", std::string(reinterpret_cast(${VAR_NAME}), ${VAR_NAME}_SIZE) }") + endforeach() + string(REPLACE ";" ",\n" INIT_KERNELS "${INIT_KERNELS_LIST}") + configure_file(${SRC_DIR}/kernel_includes.cpp.in ${PROJECT_BINARY_DIR}/kernel_includes.cpp) +endfunction() + + diff --git 
a/cmake/TargetFlags.cmake b/cmake/TargetFlags.cmake new file mode 100644 index 0000000000..4f83fb5d39 --- /dev/null +++ b/cmake/TargetFlags.cmake @@ -0,0 +1,50 @@ + +function(get_target_property2 VAR TARGET PROPERTY) + get_target_property(_pflags ${TARGET} ${PROPERTY}) + if(_pflags) + set(${VAR} ${_pflags} PARENT_SCOPE) + else() + set(${VAR} "" PARENT_SCOPE) + endif() +endfunction() + + +macro(append_flags FLAGS TARGET PROPERTY PREFIX) + get_target_property2(_pflags ${TARGET} ${PROPERTY}) + foreach(FLAG ${_pflags}) + if(TARGET ${FLAG}) + target_flags(_pflags2 ${FLAG}) + string(APPEND ${FLAGS} " ${_pflags2}") + else() + string(APPEND ${FLAGS} " ${PREFIX}${FLAG}") + endif() + endforeach() +endmacro() + +macro(append_link_flags FLAGS TARGET PROPERTY) + get_target_property2(_pflags ${TARGET} ${PROPERTY}) + foreach(FLAG ${_pflags}) + if(TARGET ${FLAG}) + target_flags(_pflags2 ${FLAG}) + string(APPEND ${FLAGS} " ${_pflags2}") + elseif(FLAG MATCHES "^-.*") + string(APPEND ${FLAGS} " ${FLAG}") + elseif(EXISTS ${FLAG}) + string(APPEND ${FLAGS} " ${FLAG}") + else() + string(APPEND ${FLAGS} " -l${FLAG}") + endif() + endforeach() +endmacro() + +function(target_flags FLAGS TARGET) + set(_flags) + append_flags(_flags ${TARGET} "INTERFACE_COMPILE_OPTIONS" "") + append_flags(_flags ${TARGET} "INTERFACE_COMPILE_DEFINITIONS" "-D") + append_flags(_flags ${TARGET} "INTERFACE_INCLUDE_DIRECTORIES" "-isystem ") + append_flags(_flags ${TARGET} "INTERFACE_LINK_DIRECTORIES" "-L ") + append_flags(_flags ${TARGET} "INTERFACE_LINK_OPTIONS" "") + append_link_flags(_flags ${TARGET} "INTERFACE_LINK_LIBRARIES" "") + # message("_flags: ${_flags}") + set(${FLAGS} ${_flags} PARENT_SCOPE) +endfunction() diff --git a/composable_kernel/include/utility/config.amd.hpp.in b/composable_kernel/include/utility/config.amd.hpp.in index cadc821338..609ae2b212 100644 --- a/composable_kernel/include/utility/config.amd.hpp.in +++ b/composable_kernel/include/utility/config.amd.hpp.in @@ -112,8 +112,8 @@ #endif // 
pass tensor descriptor by value or void* -#define CK_EXPERIMENTAL_PASS_TENSOR_DESCRIPTOR_BY_VALUE 1 -#define CK_EXPERIMENTAL_PASS_TENSOR_DESCRIPTOR_BY_VOID_POINTER 0 +#define CK_EXPERIMENTAL_PASS_TENSOR_DESCRIPTOR_BY_VALUE 0 +#define CK_EXPERIMENTAL_PASS_TENSOR_DESCRIPTOR_BY_VOID_POINTER 1 // merge transformation use magic number division #define CK_EXPERIMENTAL_MERGE_USE_MAGIC_DIVISION 0 diff --git a/composable_kernel/include/utility/sequence_helper.hpp b/composable_kernel/include/utility/sequence_helper.hpp index ccedfc3e6f..88d7da63e8 100644 --- a/composable_kernel/include/utility/sequence_helper.hpp +++ b/composable_kernel/include/utility/sequence_helper.hpp @@ -1,7 +1,7 @@ #ifndef CK_SEQUENCE_HELPER_HPP #define CK_SEQUENCE_HELPER_HPP -#include "sequence_helper.hpp" +#include "tuple.hpp" namespace ck { diff --git a/composable_kernel/include/utility/type_helper.hpp b/composable_kernel/include/utility/type_helper.hpp new file mode 100644 index 0000000000..987f07e3f4 --- /dev/null +++ b/composable_kernel/include/utility/type_helper.hpp @@ -0,0 +1,34 @@ +#ifndef CK_TYPE_HELPER_HPP +#define CK_TYPE_HELPER_HPP + +#include "float_type.hpp" + +namespace ck { + +template +struct get_type_from_type_id +{ + using type = float; +}; + +template <> +struct get_type_from_type_id<'H'> +{ + using type = half_t; +}; + +template <> +struct get_type_from_type_id<'F'> +{ + using type = float; +}; + +template <> +struct get_type_from_type_id<'D'> +{ + using type = double; +}; + +} // namespace ck + +#endif diff --git a/composable_kernel/src/kernel_wrapper/dynamic_convolution_forward_implicit_gemm_v4r4_nchw_kcyx_nkhw.cpp b/composable_kernel/src/kernel_wrapper/dynamic_convolution_forward_implicit_gemm_v4r4_nchw_kcyx_nkhw.cpp new file mode 100644 index 0000000000..19501b3fcd --- /dev/null +++ b/composable_kernel/src/kernel_wrapper/dynamic_convolution_forward_implicit_gemm_v4r4_nchw_kcyx_nkhw.cpp @@ -0,0 +1,370 @@ +#include "common_header.hpp" +#include "type_helper.hpp" +#include 
"dynamic_tensor_descriptor.hpp" +#include "dynamic_tensor_descriptor_helper.hpp" +#include "gridwise_dynamic_gemm_v1r2.hpp" +#include "transform_forward_convolution_into_gemm_v4r4_nchw_kcyx_nkhw.hpp" + +using namespace ck; + +using FloatAB = typename get_type_from_type_id(CK_PARAM_IN_WEI_DATATYPE)>::type; +using FloatC = typename get_type_from_type_id(CK_PARAM_OUT_DATATYPE)>::type; +using FloatAcc = typename get_type_from_type_id(CK_PARAM_CONV_COMPTYPE)>::type; + +constexpr index_t BlockSize = CK_PARAM_BlockSize; + +constexpr index_t MPerBlock = CK_PARAM_MPerBlock; +constexpr index_t NPerBlock = CK_PARAM_NPerBlock; +constexpr index_t KPerBlock = CK_PARAM_KPerBlock; +constexpr index_t M1PerThread = CK_PARAM_M1PerThread; +constexpr index_t N1PerThread = CK_PARAM_N1PerThread; +constexpr index_t KPerThread = CK_PARAM_KPerThread; +constexpr index_t M1N1ThreadClusterM10 = CK_PARAM_M1N1ThreadClusterM10; +constexpr index_t M1N1ThreadClusterN10 = CK_PARAM_M1N1ThreadClusterN10; +constexpr index_t M1N1ThreadClusterM11 = CK_PARAM_M1N1ThreadClusterM11; +constexpr index_t M1N1ThreadClusterN11 = CK_PARAM_M1N1ThreadClusterN11; + +using ABlockTransferThreadSliceLengths_K_M0_M1 = + Sequence; +using ABlockTransferThreadClusterLengths_K_M0_M1 = + Sequence; +using ABlockTransferThreadClusterArrangeOrder = + Sequence; +using ABlockTransferSrcAccessOrder = Sequence; + +constexpr index_t ABlockTransferSrcVectorDim = CK_PARAM_ABlockTransferSrcVectorDim; +constexpr index_t ABlockTransferSrcScalarPerVector = CK_PARAM_ABlockTransferSrcScalarPerVector; +constexpr index_t ABlockTransferDstScalarPerVector_M1 = + CK_PARAM_ABlockTransferDstScalarPerVector_M1; +constexpr bool AThreadTransferSrcResetCoordinateAfterRun = + static_cast(CK_PARAM_AThreadTransferSrcResetCoordinateAfterRun); + +using BBlockTransferThreadSliceLengths_K_N0_N1 = + Sequence; +using BBlockTransferThreadClusterLengths_K_N0_N1 = + Sequence; +using BBlockTransferThreadClusterArrangeOrder = + Sequence; +using 
BBlockTransferSrcAccessOrder = Sequence; + +constexpr index_t BBlockTransferSrcVectorDim = CK_PARAM_BBlockTransferSrcVectorDim; +constexpr index_t BBlockTransferSrcScalarPerVector = CK_PARAM_BBlockTransferSrcScalarPerVector; +constexpr index_t BBlockTransferDstScalarPerVector_N1 = + CK_PARAM_BBlockTransferDstScalarPerVector_N1; +constexpr bool BThreadTransferSrcResetCoordinateAfterRun = + static_cast(CK_PARAM_BThreadTransferSrcResetCoordinateAfterRun); + +using CThreadTransferSrcDstAccessOrder = Sequence; +constexpr index_t CThreadTransferSrcDstVectorDim = CK_PARAM_CThreadTransferSrcDstVectorDim; +constexpr index_t CThreadTransferDstScalarPerVector = CK_PARAM_CThreadTransferDstScalarPerVector; + +constexpr bool HasMainKBlockLoop = static_cast(CK_PARAM_HAS_MAIN_KBLOCK_LOOP); +constexpr bool HasDoubleTailKBlockLoop = static_cast(CK_PARAM_HAS_DOUBLE_TAIL_KBLOCK_LOOP); + +extern "C" __global__ void dynamic_convolution_forward_implicit_gemm_v4r4_nchw_kcyx_nkhw_prepare( + int n, + int c, + int hi, + int wi, + int k, + int y, + int x, + int convStrideH, + int convStrideW, + int convDilationY, + int convDilationX, + int leftPadH, + int leftPadW, + int rightPadH, + int rightPadW, + void* p_a_k_m0_m1_grid_desc, + void* p_b_k_n0_n1_grid_desc, + void* p_c_m0_m10_m11_n0_n10_n11_grid_desc, + void* p_c_blockid_to_m0_n0_block_cluster_adaptor) +{ + constexpr auto I0 = Number<0>{}; + constexpr auto I1 = Number<1>{}; + constexpr auto I2 = Number<2>{}; + + const index_t ho = (hi + leftPadH + rightPadH - convDilationY * (y - 1) - 1) / convStrideH + 1; + const index_t wo = (wi + leftPadW + rightPadW - convDilationX * (x - 1) - 1) / convStrideW + 1; + + const auto in_n_c_hi_wi_desc = + make_dynamic_naive_tensor_descriptor_packed_v2(make_tuple(n, c, hi, wi)); + const auto wei_k_c_y_x_desc = + make_dynamic_naive_tensor_descriptor_packed_v2(make_tuple(k, c, y, x)); + const auto out_n_k_ho_wo_desc = + make_dynamic_naive_tensor_descriptor_packed_v2(make_tuple(n, k, ho, wo)); + + const auto 
descs = transform_forward_convolution_into_gemm_v4r4_nchw_kcyx_nkhw_pad( + wei_k_c_y_x_desc, + in_n_c_hi_wi_desc, + out_n_k_ho_wo_desc, + make_tuple(convStrideH, convStrideW), + make_tuple(convDilationY, convDilationX), + make_tuple(leftPadH, leftPadW), + make_tuple(rightPadH, rightPadW)); + + const auto a_k_m_grid_desc = descs[I0]; + const auto b_k_n_grid_desc = descs[I1]; + const auto c_m_n_grid_desc = descs[I2]; + + using AKMGridDesc = decltype(a_k_m_grid_desc); + using BKNGridDesc = decltype(b_k_n_grid_desc); + using CMNGridDesc = decltype(c_m_n_grid_desc); + + using AGridIteratorHacks = decltype(make_tuple(make_tuple(Sequence<0, 0, 0, 0, 0>{}, + Sequence<0, 0, 0, 0, 0>{}, + Sequence<0, 0, 0, 0, 0>{}, + Sequence<0, 0, 0, 0, 0>{}), + make_tuple(Sequence<0, 0, 0, 0, 0>{}, + Sequence<0, 0, 0, 0, 0>{}, + Sequence<0, 0, 0, 0, 0>{}, + Sequence<0, 0, 0, 0, 0>{}))); + + using BGridIteratorHacks = + decltype(make_tuple(make_tuple(Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0>{}, + Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0>{}, + Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0>{}), + make_tuple(Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0>{}, + Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0>{}, + Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0>{}))); + + using CGridIteratorHacks = decltype(make_tuple(make_tuple(Sequence<0, 0, 0, 0, 0>{}, + Sequence<0, 0, 0, 0, 0>{}, + Sequence<0, 0, 0, 0, 0>{}, + Sequence<0, 0, 1, 0, 0>{}, + Sequence<0, 0, 1, 0, 0>{}, + Sequence<0, 0, 1, 0, 0>{}), + make_tuple(Sequence<0, 0, 0, 0, 0>{}, + Sequence<0, 0, 0, 0, 0>{}, + Sequence<0, 0, 0, 0, 0>{}, + Sequence<0, 0, 2, 0, 0>{}, + Sequence<0, 0, 2, 0, 0>{}, + Sequence<0, 0, 2, 0, 0>{}))); + + using AGridMoveSliceWindowIteratorHacks = Sequence<0, 0, 0, 0, 0>; + using BGridMoveSliceWindowIteratorHacks = Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 0, 0>; + + using GridwiseGemm = + GridwiseDynamicGemm_km_kn_mn_v1r2; + + auto a_k_m0_m1_grid_desc = 
GridwiseGemm::MakeAKM0M1GridDescriptor(a_k_m_grid_desc); + auto b_k_n0_n1_grid_desc = GridwiseGemm::MakeBKN0N1GridDescriptor(b_k_n_grid_desc); + auto c_m0_m10_m11_n0_n10_n11_grid_desc = + GridwiseGemm::MakeCM0M10M11N0N10N11GridDescriptor(c_m_n_grid_desc); + auto c_blockid_to_m0_n0_block_cluster_adaptor = + GridwiseGemm::MakeCBlockIdToM0N0BlockClusterAdaptor(c_m_n_grid_desc); + + if(hipThreadIdx_x == 0) + { + *static_cast(p_a_k_m0_m1_grid_desc) = a_k_m0_m1_grid_desc; + *static_cast(p_b_k_n0_n1_grid_desc) = b_k_n0_n1_grid_desc; + *static_cast( + p_c_m0_m10_m11_n0_n10_n11_grid_desc) = c_m0_m10_m11_n0_n10_n11_grid_desc; + *static_cast( + p_c_blockid_to_m0_n0_block_cluster_adaptor) = c_blockid_to_m0_n0_block_cluster_adaptor; + }; +}; + +extern "C" __global__ void +#if CK_USE_LAUNCH_BOUNDS + __launch_bounds__(CK_MAX_THREAD_PER_BLOCK, CK_MIN_BLOCK_PER_CU) +#endif + dynamic_convolution_forward_implicit_gemm_v4r4_nchw_kcyx_nkhw( + const FloatAB* __restrict__ p_a_grid, + const FloatAB* __restrict__ p_b_grid, + FloatC* __restrict__ p_c_grid, + const void __CONSTANT__* p_a_k_m0_m1_grid_desc, + const void __CONSTANT__* p_b_k_n0_n1_grid_desc, + const void __CONSTANT__* p_c_m0_m10_m11_n0_n10_n11_grid_desc, + const void __CONSTANT__* p_c_blockid_to_m0_n0_block_cluster_adaptor) +{ + constexpr auto I0 = Number<0>{}; + constexpr auto I1 = Number<1>{}; + constexpr auto I2 = Number<2>{}; + + constexpr auto in_n_c_hi_wi_desc = + make_dynamic_naive_tensor_descriptor_packed_v2(make_tuple(64, 4, 35, 35)); + constexpr auto wei_k_c_y_x_desc = + make_dynamic_naive_tensor_descriptor_packed_v2(make_tuple(8, 4, 3, 3)); + constexpr auto out_n_k_ho_wo_desc = + make_dynamic_naive_tensor_descriptor_packed_v2(make_tuple(64, 8, 18, 18)); + + constexpr auto descs = + transform_forward_convolution_into_gemm_v4r4_nchw_kcyx_nkhw_pad(wei_k_c_y_x_desc, + in_n_c_hi_wi_desc, + out_n_k_ho_wo_desc, + make_tuple(2, 2), + make_tuple(1, 1), + make_tuple(1, 1), + make_tuple(1, 1)); + + constexpr auto 
a_k_m_grid_desc = descs[I0]; + constexpr auto b_k_n_grid_desc = descs[I1]; + constexpr auto c_m_n_grid_desc = descs[I2]; + + using AKMGridDesc = decltype(a_k_m_grid_desc); + using BKNGridDesc = decltype(b_k_n_grid_desc); + using CMNGridDesc = decltype(c_m_n_grid_desc); + + using AGridIteratorHacks = decltype(make_tuple(make_tuple(Sequence<0, 0, 0, 0, 0>{}, + Sequence<0, 0, 0, 0, 0>{}, + Sequence<0, 0, 0, 0, 0>{}, + Sequence<0, 0, 0, 0, 0>{}), + make_tuple(Sequence<0, 0, 0, 0, 0>{}, + Sequence<0, 0, 0, 0, 0>{}, + Sequence<0, 0, 0, 0, 0>{}, + Sequence<0, 0, 0, 0, 0>{}))); + + using BGridIteratorHacks = + decltype(make_tuple(make_tuple(Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0>{}, + Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0>{}, + Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0>{}), + make_tuple(Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0>{}, + Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0>{}, + Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0>{}))); + + using CGridIteratorHacks = decltype(make_tuple(make_tuple(Sequence<0, 0, 0, 0, 0>{}, + Sequence<0, 0, 0, 0, 0>{}, + Sequence<0, 0, 0, 0, 0>{}, + Sequence<0, 0, 1, 0, 0>{}, + Sequence<0, 0, 1, 0, 0>{}, + Sequence<0, 0, 1, 0, 0>{}), + make_tuple(Sequence<0, 0, 0, 0, 0>{}, + Sequence<0, 0, 0, 0, 0>{}, + Sequence<0, 0, 0, 0, 0>{}, + Sequence<0, 0, 2, 0, 0>{}, + Sequence<0, 0, 2, 0, 0>{}, + Sequence<0, 0, 2, 0, 0>{}))); + + using AGridMoveSliceWindowIteratorHacks = Sequence<0, 0, 0, 0, 0>; + using BGridMoveSliceWindowIteratorHacks = Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 0, 0>; + + using GridwiseGemm = + GridwiseDynamicGemm_km_kn_mn_v1r2; + + constexpr auto a_k_m0_m1_grid_desc_tmp = + GridwiseGemm::MakeAKM0M1GridDescriptor(a_k_m_grid_desc); + constexpr auto b_k_n0_n1_grid_desc_tmp = + GridwiseGemm::MakeBKN0N1GridDescriptor(b_k_n_grid_desc); + constexpr auto c_m0_m10_m11_n0_n10_n11_grid_desc_tmp = + GridwiseGemm::MakeCM0M10M11N0N10N11GridDescriptor(c_m_n_grid_desc); + constexpr auto 
c_blockid_to_m0_n0_block_cluster_adaptor_tmp = + GridwiseGemm::MakeCBlockIdToM0N0BlockClusterAdaptor(c_m_n_grid_desc); + + using AKM0M1GridDesc = decltype(a_k_m0_m1_grid_desc_tmp); + using BKN0N1GridDesc = decltype(b_k_n0_n1_grid_desc_tmp); + using CM0M10M11N0N10N11GridDesc = decltype(c_m0_m10_m11_n0_n10_n11_grid_desc_tmp); + using CBlockIdToM0N0BlockClusterAdaptor = + decltype(c_blockid_to_m0_n0_block_cluster_adaptor_tmp); + + const auto a_k_m0_m1_grid_desc = + *reinterpret_cast((const void*)p_a_k_m0_m1_grid_desc); + const auto b_k_n0_n1_grid_desc = + *reinterpret_cast((const void*)p_b_k_n0_n1_grid_desc); + const auto c_m0_m10_m11_n0_n10_n11_grid_desc = + *reinterpret_cast( + (const void*)p_c_m0_m10_m11_n0_n10_n11_grid_desc); + const auto c_blockid_to_m0_n0_block_cluster_adaptor = + *reinterpret_cast( + (const void*)p_c_blockid_to_m0_n0_block_cluster_adaptor); + + constexpr index_t shared_block_size = + GridwiseGemm::GetSharedMemoryNumberOfByte() / sizeof(FloatAB); + + __shared__ FloatAB p_shared_block[shared_block_size]; + + GridwiseGemm::Run(p_a_grid, + p_b_grid, + p_c_grid, + p_shared_block, + a_k_m0_m1_grid_desc, + b_k_n0_n1_grid_desc, + c_m0_m10_m11_n0_n10_n11_grid_desc, + c_blockid_to_m0_n0_block_cluster_adaptor, + integral_constant{}, + integral_constant{}); +}; diff --git a/composable_kernel/src/kernel_wrapper/dynamic_convolution_forward_implicit_gemm_v4r5_nchw_kcyx_nkhw.cpp b/composable_kernel/src/kernel_wrapper/dynamic_convolution_forward_implicit_gemm_v4r5_nchw_kcyx_nkhw.cpp new file mode 100644 index 0000000000..503da231e9 --- /dev/null +++ b/composable_kernel/src/kernel_wrapper/dynamic_convolution_forward_implicit_gemm_v4r5_nchw_kcyx_nkhw.cpp @@ -0,0 +1,379 @@ +#include "common_header.hpp" +#include "type_helper.hpp" +#include "dynamic_tensor_descriptor.hpp" +#include "dynamic_tensor_descriptor_helper.hpp" +#include "gridwise_dynamic_contraction_v1r1.hpp" +#include "transform_forward_convolution_into_gemm_v4r5_nchw_kcyx_nkhw.hpp" + +using namespace 
ck; + +using FloatAB = typename get_type_from_type_id(CK_PARAM_IN_WEI_DATATYPE)>::type; +using FloatC = typename get_type_from_type_id(CK_PARAM_OUT_DATATYPE)>::type; +using FloatAcc = typename get_type_from_type_id(CK_PARAM_CONV_COMPTYPE)>::type; + +constexpr index_t BlockSize = CK_PARAM_BlockSize; +constexpr index_t N0 = CK_PARAM_N0; + +constexpr index_t GM1PerBlockGM11 = CK_PARAM_GM1PerBlockGM11; +constexpr index_t GN1PerBlockGN11 = CK_PARAM_GN1PerBlockGN11; +constexpr index_t KPerBlock = CK_PARAM_KPerBlock; +constexpr index_t M1PerThread = CK_PARAM_M1PerThread; +constexpr index_t N1PerThread = CK_PARAM_N1PerThread; +constexpr index_t KPerThread = CK_PARAM_KPerThread; +constexpr index_t M1N1ThreadClusterM10 = CK_PARAM_M1N1ThreadClusterM10; +constexpr index_t M1N1ThreadClusterN10 = CK_PARAM_M1N1ThreadClusterN10; +constexpr index_t M1N1ThreadClusterM11 = CK_PARAM_M1N1ThreadClusterM11; +constexpr index_t M1N1ThreadClusterN11 = CK_PARAM_M1N1ThreadClusterN11; + +using ABlockTransferThreadSliceLengths_GK_GM0_GM10_GM11 = + Sequence; +using ABlockTransferThreadClusterLengths_GK_GM0_GM10_GM11 = + Sequence; +using ABlockTransferThreadClusterArrangeOrder = + Sequence; +using ABlockTransferSrcAccessOrder = Sequence; + +constexpr index_t ABlockTransferSrcVectorDim = CK_PARAM_ABlockTransferSrcVectorDim; +constexpr index_t ABlockTransferSrcScalarPerVector = CK_PARAM_ABlockTransferSrcScalarPerVector; +constexpr index_t ABlockTransferDstScalarPerVector_GM11 = + CK_PARAM_ABlockTransferDstScalarPerVector_GM11; +constexpr bool AThreadTransferSrcResetCoordinateAfterRun = + static_cast(CK_PARAM_AThreadTransferSrcResetCoordinateAfterRun); + +using BBlockTransferThreadSliceLengths_GK_GN0_GN10_GN11 = + Sequence; +using BBlockTransferThreadClusterLengths_GK_GN0_GN10_GN11 = + Sequence; +using BBlockTransferThreadClusterArrangeOrder = + Sequence; +using BBlockTransferSrcAccessOrder = Sequence; + +constexpr index_t BBlockTransferSrcVectorDim = CK_PARAM_BBlockTransferSrcVectorDim; +constexpr 
index_t BBlockTransferSrcScalarPerVector = CK_PARAM_BBlockTransferSrcScalarPerVector; +constexpr index_t BBlockTransferDstScalarPerVector_GN11 = + CK_PARAM_BBlockTransferDstScalarPerVector_GN11; +constexpr bool BThreadTransferSrcResetCoordinateAfterRun = + static_cast(CK_PARAM_BThreadTransferSrcResetCoordinateAfterRun); + +using CThreadTransferSrcDstAccessOrder = Sequence; +constexpr index_t CThreadTransferSrcDstVectorDim = CK_PARAM_CThreadTransferSrcDstVectorDim; +constexpr index_t CThreadTransferDstScalarPerVector = CK_PARAM_CThreadTransferDstScalarPerVector; + +constexpr bool HasMainKBlockLoop = static_cast(CK_PARAM_HAS_MAIN_KBLOCK_LOOP); +constexpr bool HasDoubleTailKBlockLoop = static_cast(CK_PARAM_HAS_DOUBLE_TAIL_KBLOCK_LOOP); + +extern "C" __global__ void dynamic_convolution_forward_implicit_gemm_v4r5_nchw_kcyx_nkhw_prepare( + int n, + int c, + int hi, + int wi, + int k, + int y, + int x, + int convStrideH, + int convStrideW, + int convDilationY, + int convDilationX, + int leftPadH, + int leftPadW, + int rightPadH, + int rightPadW, + void* p_a_gk_gm0_gm10_gm11_grid_desc, + void* p_b_gk_gn0_gn10_gn11_grid_desc, + void* p_c_gm10_bm0_bm1_gn10_bn0_bn1_grid_desc, + void* p_c_blockid_to_gm10_gn10_block_cluster_adaptor) +{ + constexpr auto I0 = Number<0>{}; + constexpr auto I1 = Number<1>{}; + constexpr auto I2 = Number<2>{}; + + const index_t ho = (hi + leftPadH + rightPadH - convDilationY * (y - 1) - 1) / convStrideH + 1; + const index_t wo = (wi + leftPadW + rightPadW - convDilationX * (x - 1) - 1) / convStrideW + 1; + + const auto in_n_c_hi_wi_desc = + make_dynamic_naive_tensor_descriptor_packed_v2(make_tuple(n, c, hi, wi)); + const auto wei_k_c_y_x_desc = + make_dynamic_naive_tensor_descriptor_packed_v2(make_tuple(k, c, y, x)); + const auto out_n_k_ho_wo_desc = + make_dynamic_naive_tensor_descriptor_packed_v2(make_tuple(n, k, ho, wo)); + + const auto descs = transform_forward_convolution_into_contraction_v4r5_nchw_kcyx_nkhw_pad( + wei_k_c_y_x_desc, + 
in_n_c_hi_wi_desc, + out_n_k_ho_wo_desc, + make_tuple(convStrideH, convStrideW), + make_tuple(convDilationY, convDilationX), + make_tuple(leftPadH, leftPadW), + make_tuple(rightPadH, rightPadW)); + + const auto a_gk_gm0_gm1_grid_desc = descs[I0]; + const auto b_gk_gn0_gn1_grid_desc = descs[I1]; + const auto c_gm0_gm1_gn0_gn1_grid_desc = descs[I2]; + + using AGKGM0GM1GridDesc = decltype(a_gk_gm0_gm1_grid_desc); + using BGKGN0GN1GridDesc = decltype(b_gk_gn0_gn1_grid_desc); + using CGM0GM1GN0GN1GridDesc = decltype(c_gm0_gm1_gn0_gn1_grid_desc); + + using AGridIteratorHacks = decltype(make_tuple(make_tuple(Sequence<0, 0, 0, 0, 0, 0>{}, + Sequence<0, 0, 0, 0, 0, 0>{}, + Sequence<0, 0, 0, 0, 0, 0>{}, + Sequence<0, 0, 0, 0, 0, 0>{}), + make_tuple(Sequence<0, 0, 0, 0, 0, 0>{}, + Sequence<0, 0, 0, 0, 0, 0>{}, + Sequence<0, 0, 0, 0, 0, 0>{}, + Sequence<0, 0, 0, 0, 0, 0>{}))); + + using BGridIteratorHacks = + decltype(make_tuple(make_tuple(Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0>{}, + Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{}, + Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0>{}, + Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0>{}), + make_tuple(Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0>{}, + Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{}, + Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0>{}, + Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0>{}))); + + using CGridIteratorHacks = decltype(make_tuple( + make_tuple(Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{}, + Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0>{}, + Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0>{}, + Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{}, + Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0>{}, + Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0>{}), + make_tuple(Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
0, 0, 0, 0, 0, 0>{}, + Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0>{}, + Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0>{}, + Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{}, + Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0>{}, + Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0>{}))); + + using AGridMoveSliceWindowIteratorHacks = Sequence<0, 0, 0, 0, 0, 0>; + + using BGridMoveSliceWindowIteratorHacks = Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 2, 0, 0, 0>; + + using GridwiseContraction = GridwiseDynamicContraction_km0m1_kn0n1_m0m1n0n1_v1r1< + BlockSize, + FloatAB, + FloatAcc, + FloatC, + InMemoryDataOperation::Set, /* ToDo tunable */ + AGKGM0GM1GridDesc, + BGKGN0GN1GridDesc, + CGM0GM1GN0GN1GridDesc, + GM1PerBlockGM11, + GN1PerBlockGN11, + KPerBlock, + M1PerThread, + N1PerThread, + KPerThread, + M1N1ThreadClusterM10, + M1N1ThreadClusterN10, + M1N1ThreadClusterM11, + M1N1ThreadClusterN11, + ABlockTransferThreadSliceLengths_GK_GM0_GM10_GM11, + ABlockTransferThreadClusterLengths_GK_GM0_GM10_GM11, + ABlockTransferThreadClusterArrangeOrder, + ABlockTransferSrcAccessOrder, + ABlockTransferSrcVectorDim, + ABlockTransferSrcScalarPerVector, + ABlockTransferDstScalarPerVector_GM11, + AThreadTransferSrcResetCoordinateAfterRun, + BBlockTransferThreadSliceLengths_GK_GN0_GN10_GN11, + BBlockTransferThreadClusterLengths_GK_GN0_GN10_GN11, + BBlockTransferThreadClusterArrangeOrder, + BBlockTransferSrcAccessOrder, + BBlockTransferSrcVectorDim, + BBlockTransferSrcScalarPerVector, + BBlockTransferDstScalarPerVector_GN11, + BThreadTransferSrcResetCoordinateAfterRun, + CThreadTransferSrcDstAccessOrder, + CThreadTransferSrcDstVectorDim, + CThreadTransferDstScalarPerVector, + AGridIteratorHacks, + BGridIteratorHacks, + CGridIteratorHacks, + AGridMoveSliceWindowIteratorHacks, + BGridMoveSliceWindowIteratorHacks>; + + auto a_gk_gm0_gm10_gm11_grid_desc = + 
GridwiseContraction::MakeAGKGM0GM10GM11GridDescriptor(a_gk_gm0_gm1_grid_desc); + auto b_gk_gn0_gn10_gn11_grid_desc = + GridwiseContraction::MakeBGKGN0GN10GN11GridDescriptor(b_gk_gn0_gn1_grid_desc); + auto c_gm10_bm0_bm1_gn10_bn0_bn1_grid_desc = + GridwiseContraction::MakeCGM10BM0BM1GN10BN0BN1GridDescriptor(c_gm0_gm1_gn0_gn1_grid_desc); + auto c_blockid_to_gm10_gn10_block_cluster_adaptor = + GridwiseContraction::MakeCBlockIdToGM10GN10BlockClusterAdaptor(c_gm0_gm1_gn0_gn1_grid_desc); + + if(hipThreadIdx_x == 0) + { + *static_cast(p_a_gk_gm0_gm10_gm11_grid_desc) = + a_gk_gm0_gm10_gm11_grid_desc; + *static_cast(p_b_gk_gn0_gn10_gn11_grid_desc) = + b_gk_gn0_gn10_gn11_grid_desc; + *static_cast( + p_c_gm10_bm0_bm1_gn10_bn0_bn1_grid_desc) = c_gm10_bm0_bm1_gn10_bn0_bn1_grid_desc; + *static_cast( + p_c_blockid_to_gm10_gn10_block_cluster_adaptor) = + c_blockid_to_gm10_gn10_block_cluster_adaptor; + }; +}; + +extern "C" __global__ void +#if CK_USE_LAUNCH_BOUNDS + __launch_bounds__(CK_MAX_THREAD_PER_BLOCK, CK_MIN_BLOCK_PER_CU) +#endif + dynamic_convolution_forward_implicit_gemm_v4r5_nchw_kcyx_nkhw( + const FloatAB* __restrict__ p_a_grid, + const FloatAB* __restrict__ p_b_grid, + FloatC* __restrict__ p_c_grid, + const void __CONSTANT__* p_a_gk_gm0_gm10_gm11_grid_desc, + const void __CONSTANT__* p_b_gk_gn0_gn10_gn11_grid_desc, + const void __CONSTANT__* p_c_gm10_bm0_bm1_gn10_bn0_bn1_grid_desc, + const void __CONSTANT__* p_c_blockid_to_gm10_gn10_block_cluster_adaptor) +{ + constexpr auto I0 = Number<0>{}; + constexpr auto I1 = Number<1>{}; + constexpr auto I2 = Number<2>{}; + + constexpr auto in_n_c_hi_wi_desc = + make_dynamic_naive_tensor_descriptor_packed_v2(make_tuple(64, 4, 35, 35)); + constexpr auto wei_k_c_y_x_desc = + make_dynamic_naive_tensor_descriptor_packed_v2(make_tuple(8, 4, 3, 3)); + constexpr auto out_n_k_ho_wo_desc = + make_dynamic_naive_tensor_descriptor_packed_v2(make_tuple(64, 8, 18, 18)); + + constexpr auto descs = + 
transform_forward_convolution_into_contraction_v4r5_nchw_kcyx_nkhw_pad( + wei_k_c_y_x_desc, + in_n_c_hi_wi_desc, + out_n_k_ho_wo_desc, + make_tuple(2, 2), + make_tuple(1, 1), + make_tuple(1, 1), + make_tuple(1, 1)); + + constexpr auto a_gk_gm0_gm1_grid_desc = descs[I0]; + constexpr auto b_gk_gn0_gn1_grid_desc = descs[I1]; + constexpr auto c_gm0_gm1_gn0_gn1_grid_desc = descs[I2]; + + using AGKGM0GM1GridDesc = decltype(a_gk_gm0_gm1_grid_desc); + using BGKGN0GN1GridDesc = decltype(b_gk_gn0_gn1_grid_desc); + using CGM0GM1GN0GN1GridDesc = decltype(c_gm0_gm1_gn0_gn1_grid_desc); + + using AGridIteratorHacks = decltype(make_tuple(make_tuple(Sequence<0, 0, 0, 0, 0, 0>{}, + Sequence<0, 0, 0, 0, 0, 0>{}, + Sequence<0, 0, 0, 0, 0, 0>{}, + Sequence<0, 0, 0, 0, 0, 0>{}), + make_tuple(Sequence<0, 0, 0, 0, 0, 0>{}, + Sequence<0, 0, 0, 0, 0, 0>{}, + Sequence<0, 0, 0, 0, 0, 0>{}, + Sequence<0, 0, 0, 0, 0, 0>{}))); + + using BGridIteratorHacks = + decltype(make_tuple(make_tuple(Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0>{}, + Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{}, + Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0>{}, + Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0>{}), + make_tuple(Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0>{}, + Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{}, + Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0>{}, + Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0>{}))); + + using CGridIteratorHacks = decltype(make_tuple( + make_tuple(Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{}, + Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0>{}, + Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0>{}, + Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{}, + Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0>{}, + Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0>{}), + make_tuple(Sequence<0, 0, 0, 
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{}, + Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0>{}, + Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0>{}, + Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{}, + Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0>{}, + Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0>{}))); + + using AGridMoveSliceWindowIteratorHacks = Sequence<0, 0, 0, 0, 0, 0>; + using BGridMoveSliceWindowIteratorHacks = Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 2, 0, 0, 0>; + + using GridwiseContraction = GridwiseDynamicContraction_km0m1_kn0n1_m0m1n0n1_v1r1< + BlockSize, + FloatAB, + FloatAcc, + FloatC, + InMemoryDataOperation::Set, /* ToDo tunable */ + AGKGM0GM1GridDesc, + BGKGN0GN1GridDesc, + CGM0GM1GN0GN1GridDesc, + GM1PerBlockGM11, + GN1PerBlockGN11, + KPerBlock, + M1PerThread, + N1PerThread, + KPerThread, + M1N1ThreadClusterM10, + M1N1ThreadClusterN10, + M1N1ThreadClusterM11, + M1N1ThreadClusterN11, + ABlockTransferThreadSliceLengths_GK_GM0_GM10_GM11, + ABlockTransferThreadClusterLengths_GK_GM0_GM10_GM11, + ABlockTransferThreadClusterArrangeOrder, + ABlockTransferSrcAccessOrder, + ABlockTransferSrcVectorDim, + ABlockTransferSrcScalarPerVector, + ABlockTransferDstScalarPerVector_GM11, + AThreadTransferSrcResetCoordinateAfterRun, + BBlockTransferThreadSliceLengths_GK_GN0_GN10_GN11, + BBlockTransferThreadClusterLengths_GK_GN0_GN10_GN11, + BBlockTransferThreadClusterArrangeOrder, + BBlockTransferSrcAccessOrder, + BBlockTransferSrcVectorDim, + BBlockTransferSrcScalarPerVector, + BBlockTransferDstScalarPerVector_GN11, + BThreadTransferSrcResetCoordinateAfterRun, + CThreadTransferSrcDstAccessOrder, + CThreadTransferSrcDstVectorDim, + CThreadTransferDstScalarPerVector, + AGridIteratorHacks, + BGridIteratorHacks, + CGridIteratorHacks, + AGridMoveSliceWindowIteratorHacks, + BGridMoveSliceWindowIteratorHacks>; + + using AGKGM0GM10GM11GridDesc = + 
decltype(GridwiseContraction::MakeAGKGM0GM10GM11GridDescriptor(a_gk_gm0_gm1_grid_desc)); + using BGKGN0GN10GN11GridDesc = + decltype(GridwiseContraction::MakeBGKGN0GN10GN11GridDescriptor(b_gk_gn0_gn1_grid_desc)); + using CGM10BM0BM1GN10BN0BN1GridDesc = decltype( + GridwiseContraction::MakeCGM10BM0BM1GN10BN0BN1GridDescriptor(c_gm0_gm1_gn0_gn1_grid_desc)); + using CBlockIdToGM10GN10BlockClusterAdaptor = + decltype(GridwiseContraction::MakeCBlockIdToGM10GN10BlockClusterAdaptor( + c_gm0_gm1_gn0_gn1_grid_desc)); + + const auto a_gk_gm0_gm10_gm11_grid_desc = *reinterpret_cast( + (const void*)p_a_gk_gm0_gm10_gm11_grid_desc); + const auto b_gk_gn0_gn10_gn11_grid_desc = *reinterpret_cast( + (const void*)p_b_gk_gn0_gn10_gn11_grid_desc); + const auto c_gm10_bm0_bm1_gn10_bn0_bn1_grid_desc = + *reinterpret_cast( + (const void*)p_c_gm10_bm0_bm1_gn10_bn0_bn1_grid_desc); + const auto c_blockid_to_gm10_gn10_block_cluster_adaptor = + *reinterpret_cast( + (const void*)p_c_blockid_to_gm10_gn10_block_cluster_adaptor); + + constexpr index_t shared_block_size = + GridwiseContraction::GetSharedMemoryNumberOfByte() / sizeof(FloatAB); + + __shared__ FloatAB p_shared_block[shared_block_size]; + + GridwiseContraction::Run(p_a_grid, + p_b_grid, + p_c_grid, + p_shared_block, + a_gk_gm0_gm10_gm11_grid_desc, + b_gk_gn0_gn10_gn11_grid_desc, + c_gm10_bm0_bm1_gn10_bn0_bn1_grid_desc, + c_blockid_to_gm10_gn10_block_cluster_adaptor, + integral_constant{}, + integral_constant{}); +}; diff --git a/composable_kernel/src/kernel_wrapper/gridwise_convolution_forward_implicit_gemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer.cpp b/composable_kernel/src/kernel_wrapper/gridwise_convolution_forward_implicit_gemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer.cpp new file mode 100644 index 0000000000..ecd3af822f --- /dev/null +++ b/composable_kernel/src/kernel_wrapper/gridwise_convolution_forward_implicit_gemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer.cpp @@ -0,0 +1,8 @@ + +extern "C" __global__ void 
+gridwise_convolution_forward_implicit_gemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer( + const void* const __restrict__ p_in_global, + const void* const __restrict__ p_wei_global, + void* const __restrict__ p_out_global){ + +}; diff --git a/composable_kernel/src/kernel_wrapper/gridwise_convolution_forward_implicit_gemm_v4r4_nchw_kcyx_nkhw.cpp b/composable_kernel/src/kernel_wrapper/gridwise_convolution_forward_implicit_gemm_v4r4_nchw_kcyx_nkhw.cpp new file mode 100644 index 0000000000..820a0515ee --- /dev/null +++ b/composable_kernel/src/kernel_wrapper/gridwise_convolution_forward_implicit_gemm_v4r4_nchw_kcyx_nkhw.cpp @@ -0,0 +1,7 @@ + +extern "C" __global__ void gridwise_convolution_forward_implicit_gemm_v4r4_nchw_kcyx_nkhw( + const void* const __restrict__ p_in_global, + const void* const __restrict__ p_wei_global, + void* const __restrict__ p_out_global){ + +}; diff --git a/composable_kernel/src/kernel_wrapper/gridwise_convolution_forward_implicit_gemm_v4r4_nhwc_kyxc_nhwk.cpp b/composable_kernel/src/kernel_wrapper/gridwise_convolution_forward_implicit_gemm_v4r4_nhwc_kyxc_nhwk.cpp new file mode 100644 index 0000000000..4f646adbb7 --- /dev/null +++ b/composable_kernel/src/kernel_wrapper/gridwise_convolution_forward_implicit_gemm_v4r4_nhwc_kyxc_nhwk.cpp @@ -0,0 +1,8 @@ + + +extern "C" __global__ void gridwise_convolution_forward_implicit_gemm_v4r4_nhwc_kyxc_nhwk( + const void* const __restrict__ p_in_global, + const void* const __restrict__ p_wei_global, + void* const __restrict__ p_out_global){ + +}; diff --git a/driver/CMakeLists.txt b/driver/CMakeLists.txt index 6b91ab986d..9800559fe9 100644 --- a/driver/CMakeLists.txt +++ b/driver/CMakeLists.txt @@ -1,32 +1,201 @@ -set(TENSOR_SOURCE + +set(CMAKE_CXX_COMPILER /opt/rocm/llvm/bin/clang++) + +## for online-compiling of HIP kernels +set(OLC_HIP_COMPILER ${CMAKE_CXX_COMPILER} CACHE PATH "") + +## reset to avoid the C++ options from the parent project +set(CMAKE_CXX_FLAGS "") +message("Compiling options for library and 
kernels: ${CMAKE_CXX_FLAGS}") + +# look for and register clang-offload-bundler +if(OLC_HIP_COMPILER MATCHES ".*clang\\+\\+$") + find_program(OLC_OFFLOADBUNDLER_BIN clang-offload-bundler + PATH_SUFFIXES bin + PATHS + /opt/rocm/llvm + ${CMAKE_INSTALL_PREFIX}/llvm + ) +endif() +if(OLC_OFFLOADBUNDLER_BIN) + message(STATUS "clang-offload-bundler found: ${OLC_OFFLOADBUNDLER_BIN}") + set(OLC_OFFLOADBUNDLER_BIN "${OLC_OFFLOADBUNDLER_BIN}") +else() + # look for and register extractkernel + message(STATUS "clang-offload-bundler not found") + + find_program(EXTRACTKERNEL_BIN extractkernel + PATH_SUFFIXES bin + PATHS + /opt/rocm/hip + /opt/rocm/hcc + /opt/rocm + ${CMAKE_INSTALL_PREFIX}/hip + ${CMAKE_INSTALL_PREFIX}/hcc + ${CMAKE_INSTALL_PREFIX} + + ) + if(EXTRACTKERNEL_BIN) + message(STATUS "extractkernel found: ${EXTRACTKERNEL_BIN}") + set(EXTRACTKERNEL_BIN "${EXTRACTKERNEL_BIN}") + else() + message(FATAL_ERROR "extractkernel not found") + endif() +endif() + +option(Boost_USE_STATIC_LIBS "Use boost static libraries" OFF) +set(BOOST_COMPONENTS filesystem) +add_definitions(-DBOOST_ALL_NO_LIB=1) +find_package(Boost REQUIRED COMPONENTS ${BOOST_COMPONENTS}) + +# HIP is always required +find_package(hip REQUIRED PATHS /opt/rocm) +message(STATUS "Build with HIP ${hip_VERSION}") +target_flags(HIP_COMPILER_FLAGS hip::device) +# Remove cuda arch flags +string(REGEX REPLACE --cuda-gpu-arch=[a-z0-9]+ "" HIP_COMPILER_FLAGS "${HIP_COMPILER_FLAGS}") +string(REGEX REPLACE --offload-arch=[a-z0-9]+ "" HIP_COMPILER_FLAGS "${HIP_COMPILER_FLAGS}") + +set(OLC_hip_VERSION_MAJOR "${hip_VERSION_MAJOR}") +set(OLC_hip_VERSION_MINOR "${hip_VERSION_MINOR}") +set(OLC_hip_VERSION_PATCH "${hip_VERSION_PATCH}") + +option(ENABLE_DEBUG "Build to enable debugging" ON) +if(ENABLE_DEBUG) + set(OLC_DEBUG 1) +else() + set(OLC_DEBUG 0) +endif() + +configure_file("${CMAKE_CURRENT_SOURCE_DIR}/olCompiling/include/config.h.in" "${CMAKE_CURRENT_SOURCE_DIR}/olCompiling/include/config.h") + +message(STATUS "Hip compiler 
flags: ${HIP_COMPILER_FLAGS}") + +## HIP_COMPILER_FLAGS will be used for on-line compiling of the HIP kernels +add_definitions("-DHIP_COMPILER_FLAGS=${HIP_COMPILER_FLAGS}") + +file(GLOB COMPOSABLE_KERNEL_INCLUDE_1 "${PROJECT_SOURCE_DIR}/composable_kernel/include/kernel_algorithm/*.hpp") +file(GLOB COMPOSABLE_KERNEL_INCLUDE_2 "${PROJECT_SOURCE_DIR}/composable_kernel/include/tensor_description/*.hpp") +file(GLOB COMPOSABLE_KERNEL_INCLUDE_3 "${PROJECT_SOURCE_DIR}/composable_kernel/include/tensor_operation/*.hpp") +file(GLOB COMPOSABLE_KERNEL_INCLUDE_4 "${PROJECT_SOURCE_DIR}/composable_kernel/include/utility/*.hpp") +file(GLOB COMPOSABLE_KERNEL_INCLUDE_5 "${PROJECT_BINARY_DIR}/composable_kernel/include/utility/*.hpp") +file(GLOB COMPOSABLE_KERNEL_INCLUDE_6 "${PROJECT_SOURCE_DIR}/external/rocm/include/bfloat16_dev.hpp") +set(MCONV_KERNEL_INCLUDES + ${COMPOSABLE_KERNEL_INCLUDE_1} + ${COMPOSABLE_KERNEL_INCLUDE_2} + ${COMPOSABLE_KERNEL_INCLUDE_3} + ${COMPOSABLE_KERNEL_INCLUDE_4} + ${COMPOSABLE_KERNEL_INCLUDE_5} + ${COMPOSABLE_KERNEL_INCLUDE_6} + ) + +set(MCONV_KERNELS + ../composable_kernel/src/kernel_wrapper/dynamic_convolution_forward_implicit_gemm_v4r4_nchw_kcyx_nkhw.cpp + ../composable_kernel/src/kernel_wrapper/dynamic_convolution_forward_implicit_gemm_v4r5_nchw_kcyx_nkhw.cpp + ) + +add_kernels("olCompiling/" "${MCONV_KERNELS}") +add_kernel_includes("olCompiling/" "${MCONV_KERNEL_INCLUDES}") + +set(MCONV_SOURCES src/host_tensor.cpp; src/device.cpp; ) -add_library(host SHARED ${TENSOR_SOURCE}) -target_compile_features(host PUBLIC) -set_target_properties(host PROPERTIES POSITION_INDEPENDENT_CODE ON) +set(OLC_HIP_UTILITY_HEADERS + olCompiling/include/config.h + olCompiling/include/logger.hpp + olCompiling/include/stringutils.hpp + olCompiling/include/tmp_dir.hpp + olCompiling/include/write_file.hpp + olCompiling/include/env.hpp + olCompiling/include/manage_ptr.hpp + olCompiling/include/md5.hpp + olCompiling/include/simple_hash.hpp + olCompiling/include/exec_utils.hpp + 
olCompiling/include/hipCheck.hpp + olCompiling/include/target_properties.hpp + olCompiling/include/handle.hpp + olCompiling/include/op_kernel_args.hpp + olCompiling/include/kernel.hpp + olCompiling/include/kernel_build_params.hpp + olCompiling/include/hip_build_utils.hpp + olCompiling/include/hipoc_program.hpp + olCompiling/include/hipoc_program_impl.hpp + olCompiling/include/hipoc_kernel.hpp + olCompiling/include/kernel_cache.hpp + olCompiling/include/binary_cache.hpp + ) -if(DEVICE_BACKEND STREQUAL "NVIDIA") - target_link_libraries(host nvToolsExt cudart) -endif() +set(OLC_HIP_UTILITY_CPPS + olCompiling/hip_utility/logger.cpp + olCompiling/hip_utility/tmp_dir.cpp + olCompiling/hip_utility/md5.cpp + olCompiling/hip_utility/exec_utils.cpp + olCompiling/hip_utility/target_properties.cpp + olCompiling/hip_utility/handlehip.cpp + olCompiling/hip_utility/kernel_build_params.cpp + olCompiling/hip_utility/hip_build_utils.cpp + olCompiling/hip_utility/hipoc_program.cpp + olCompiling/hip_utility/hipoc_kernel.cpp + olCompiling/hip_utility/kernel_cache.cpp + olCompiling/hip_utility/binary_cache.cpp + ) -install(TARGETS host LIBRARY DESTINATION lib) +list(APPEND OLC_SOURCES ${OLC_HIP_UTILITY_CPPS} ${OLC_HIP_UTILITY_HEADERS}) +list(INSERT MCONV_SOURCES 0 + ${PROJECT_BINARY_DIR}/kernel.cpp + ${PROJECT_BINARY_DIR}/kernel_includes.cpp + ) -if(DEVICE_BACKEND STREQUAL "AMD") - set(CONV_SOURCE src/conv_driver.cpp) - set(CONV_V2_SOURCE src/conv_driver_v2.cpp) - set(CONV_BWD_DATA_SOURCE src/conv_bwd_data_driver.cpp) -elseif(DEVICE_BACKEND STREQUAL "NVIDIA") - set(CONV_SOURCE src/conv_driver.cu) - set(CONV_BWD_DATA_SOURCE src/conv_bwd_data_driver.cu) -endif() +## addkernels provide the tool to create inlined kernels in one header +add_subdirectory(olCompiling/addkernels) -add_executable(conv_driver ${CONV_SOURCE}) -add_executable(conv_driver_v2 ${CONV_V2_SOURCE}) -add_executable(conv_bwd_data_driver ${CONV_BWD_DATA_SOURCE}) +function(inline_kernels_src KERNELS KERNEL_INCLUDES) + 
set(KERNEL_SRC_HPP_FILENAME batch_all.cpp.hpp) + set(KERNEL_SRC_HPP_PATH ${PROJECT_BINARY_DIR}/inlined_kernels/${KERNEL_SRC_HPP_FILENAME}) + set(KERNEL_SRC_CPP_PATH ${PROJECT_BINARY_DIR}/inlined_kernels/batch_all.cpp) -target_link_libraries(conv_driver PRIVATE host) -target_link_libraries(conv_driver_v2 PRIVATE host) -target_link_libraries(conv_bwd_data_driver PRIVATE host) + add_custom_command( + OUTPUT ${KERNEL_SRC_HPP_PATH} + WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} + DEPENDS addkernels ${KERNELS} ${KERNEL_INCLUDES} + COMMAND $ -target ${KERNEL_SRC_HPP_PATH} -extern -source ${KERNELS} + COMMENT "Inlining All kernels" + ) + configure_file(olCompiling/kernels_batch.cpp.in ${KERNEL_SRC_CPP_PATH}) + list(APPEND OLC_SOURCES ${KERNEL_SRC_CPP_PATH} ${KERNEL_SRC_HPP_PATH}) + + set(OLC_SOURCES ${OLC_SOURCES} PARENT_SCOPE) +endfunction() + +inline_kernels_src("${MCONV_KERNELS}" "${MCONV_KERNEL_INCLUDES}") + +list(APPEND MCONV_SOURCES ${OLC_SOURCES} ${PROJECT_BINARY_DIR}/olc_kernel_includes.h) + +add_custom_command( + OUTPUT ${PROJECT_BINARY_DIR}/olc_kernel_includes.h + WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} + DEPENDS addkernels ${MCONV_KERNEL_INCLUDES} + COMMAND $ -no-recurse -guard GUARD_OLC_KERNEL_INCLUDES_HPP_ -target ${PROJECT_BINARY_DIR}/olc_kernel_includes.h -source ${MCONV_KERNEL_INCLUDES} + COMMENT "Inlining HIP kernel includes" + ) + +## the library target +add_library(modConv SHARED ${MCONV_SOURCES}) + +target_include_directories(modConv PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/olCompiling/include/) +target_include_directories(modConv PRIVATE ${PROJECT_BINARY_DIR}) +target_include_directories(modConv PRIVATE ${PROJECT_SOURCE_DIR}/external/half/include/) + +target_link_libraries(modConv PRIVATE hip::device) +target_link_libraries(modConv INTERFACE hip::host) +target_link_libraries(modConv PRIVATE Boost::filesystem) + +target_compile_options(modConv PRIVATE -mfma) + +target_compile_features(modConv PUBLIC) +set_target_properties(modConv PROPERTIES 
POSITION_INDEPENDENT_CODE ON) + +install(TARGETS modConv LIBRARY DESTINATION lib) diff --git a/driver/src/conv_bwd_data_driver.cpp b/driver/conv_bwd_data_driver.cpp similarity index 100% rename from driver/src/conv_bwd_data_driver.cpp rename to driver/conv_bwd_data_driver.cpp diff --git a/driver/src/conv_driver.cpp b/driver/conv_driver.cpp similarity index 100% rename from driver/src/conv_driver.cpp rename to driver/conv_driver.cpp diff --git a/driver/src/conv_driver_v2.cpp b/driver/conv_driver_v2.cpp similarity index 99% rename from driver/src/conv_driver_v2.cpp rename to driver/conv_driver_v2.cpp index 1fb8fc7873..693448ac25 100644 --- a/driver/src/conv_driver_v2.cpp +++ b/driver/conv_driver_v2.cpp @@ -18,9 +18,9 @@ #include "device_dynamic_convolution_forward_implicit_gemm_v5r1_nchw_kcyx_nkhw.hpp" #define USE_DYNAMIC_MODE 1 -#define USE_CONV_FWD_V4R4_NCHW 1 +#define USE_CONV_FWD_V4R4_NCHW 0 #define USE_CONV_FWD_V4R4_NHWC 0 -#define USE_CONV_FWD_V4R5_NCHW 0 +#define USE_CONV_FWD_V4R5_NCHW 1 #define USE_CONV_FWD_V5R1_NCHW 0 enum ConvForwardAlgo @@ -279,8 +279,6 @@ int main(int argc, char* argv[]) in_right_pads_dev); }; - const auto nhwc_desc = f_make_for_device_nhwc(); - #if USE_CONV_FWD_V4R4_NCHW if(algo == ConvForwardAlgo::V4R4NCHW) { diff --git a/driver/conv_driver_v2_olc.cpp b/driver/conv_driver_v2_olc.cpp new file mode 100644 index 0000000000..0b1c91c81b --- /dev/null +++ b/driver/conv_driver_v2_olc.cpp @@ -0,0 +1,288 @@ +#include +#include +#include +#include +#include +#include +#include "config.hpp" +#include "print.hpp" +#include "device.hpp" +#include "host_tensor.hpp" +#include "host_tensor_generator.hpp" +#include "conv_common.hpp" +#include "host_conv.hpp" +#include "device_tensor.hpp" +#include "olc_device_dynamic_convolution_forward_implicit_gemm_v4r4_nchw_kcyx_nkhw.hpp" +#include "olc_device_dynamic_convolution_forward_implicit_gemm_v4r5_nchw_kcyx_nkhw.hpp" + +#define USE_CONV_FWD_V4R4_NCHW 1 +#define USE_CONV_FWD_V4R5_NCHW 1 + +#include 
"conv_tunables.hpp" +#include "handle.hpp" +#include "hipCheck.hpp" + +enum ConvForwardAlgo +{ + V4R4NCHW, + V4R4NHWC, + V4R5NCHW, + V5R1NCHW +}; + +int main(int argc, char* argv[]) +{ + using namespace ck; + using size_t = std::size_t; + + hipStream_t stream; + olCompile::Handle* handle; + + MY_HIP_CHECK(hipStreamCreate(&stream)); + + handle = new olCompile::Handle(stream); + + constexpr auto I0 = Number<0>{}; + constexpr auto I1 = Number<1>{}; + constexpr auto I2 = Number<2>{}; + constexpr auto I3 = Number<3>{}; + constexpr auto I4 = Number<4>{}; + constexpr auto I5 = Number<5>{}; + constexpr auto I6 = Number<6>{}; + + if(argc != 22) + { + printf("arg1 to 5: layout, algo, do_verification, init_method, do_log, nrepeat\n"); + printf("rest: N, K, C, Y, X, Hi, Wi, Sy, Sx, Dy, Dx, LeftPy, LeftPx, RightPy, RightPx\n"); + exit(1); + } + + const ConvTensorLayout layout = static_cast(atoi(argv[1])); + const ConvForwardAlgo algo = static_cast(atoi(argv[2])); + const bool do_verification = atoi(argv[3]); + const int init_method = atoi(argv[4]); + const bool do_log = atoi(argv[5]); + const int nrepeat = atoi(argv[6]); + + const index_t N = atoi(argv[7]); + const index_t K = atoi(argv[8]); + const index_t C = atoi(argv[9]); + const index_t Y = atoi(argv[10]); + const index_t X = atoi(argv[11]); + const index_t Hi = atoi(argv[12]); + const index_t Wi = atoi(argv[13]); + + const index_t conv_stride_h = atoi(argv[14]); + const index_t conv_stride_w = atoi(argv[15]); + const index_t conv_dilation_h = atoi(argv[16]); + const index_t conv_dilation_w = atoi(argv[17]); + const index_t in_left_pad_h = atoi(argv[18]); + const index_t in_left_pad_w = atoi(argv[19]); + const index_t in_right_pad_h = atoi(argv[20]); + const index_t in_right_pad_w = atoi(argv[21]); + + const index_t YEff = (Y - 1) * conv_dilation_h + 1; + const index_t XEff = (X - 1) * conv_dilation_w + 1; + + const index_t Ho = (Hi + in_left_pad_h + in_right_pad_h - YEff) / conv_stride_h + 1; + const index_t Wo = (Wi + 
in_left_pad_w + in_right_pad_w - XEff) / conv_stride_w + 1; + +#if 1 + constexpr index_t in_vector_size = 1; + using in_data_t = float; + using acc_data_t = float; + using out_data_t = float; +#elif 1 + constexpr index_t in_vector_size = 16; + using in_data_t = int8_t; + using acc_data_t = int32_t; + using out_data_t = int8_t; +#endif + + std::vector in_lengths_host(4), wei_lengths_host(4), out_lengths_host(4); + + switch(layout) + { + case ConvTensorLayout::NCHW: + // NCHW + in_lengths_host[0] = static_cast(N); + in_lengths_host[1] = static_cast(C); + in_lengths_host[2] = static_cast(Hi); + in_lengths_host[3] = static_cast(Wi); + wei_lengths_host[0] = static_cast(K); + wei_lengths_host[1] = static_cast(C); + wei_lengths_host[2] = static_cast(Y); + wei_lengths_host[3] = static_cast(X); + out_lengths_host[0] = static_cast(N); + out_lengths_host[1] = static_cast(K); + out_lengths_host[2] = static_cast(Ho); + out_lengths_host[3] = static_cast(Wo); + break; + case ConvTensorLayout::NHWC: + // NHWC + in_lengths_host[0] = static_cast(N); + in_lengths_host[1] = static_cast(Hi); + in_lengths_host[2] = static_cast(Wi); + in_lengths_host[3] = static_cast(C); + wei_lengths_host[0] = static_cast(K); + wei_lengths_host[1] = static_cast(Y); + wei_lengths_host[2] = static_cast(X); + wei_lengths_host[3] = static_cast(C); + out_lengths_host[0] = static_cast(N); + out_lengths_host[1] = static_cast(Ho); + out_lengths_host[2] = static_cast(Wo); + out_lengths_host[3] = static_cast(K); + break; + default: throw std::runtime_error("wrong! 
not implemented"); + } + + Tensor in(in_lengths_host); + Tensor wei(wei_lengths_host); + Tensor out_host(out_lengths_host); + Tensor out_device(out_lengths_host); + + std::cout << "layout: " << layout << std::endl; + ostream_HostTensorDescriptor(in.mDesc, std::cout << "in: "); + ostream_HostTensorDescriptor(wei.mDesc, std::cout << "wei: "); + ostream_HostTensorDescriptor(out_host.mDesc, std::cout << "out: "); + print_array("InLeftPads", make_tuple(in_left_pad_h, in_left_pad_w)); + print_array("InRightPads", make_tuple(in_right_pad_h, in_right_pad_w)); + print_array("ConvStrides", make_tuple(conv_stride_h, conv_stride_w)); + print_array("ConvDilations", make_tuple(conv_dilation_h, conv_dilation_w)); + + std::size_t num_thread = std::thread::hardware_concurrency(); + + if(do_verification) + { + switch(init_method) + { + case 0: + in.GenerateTensorValue(GeneratorTensor_1{}, num_thread); + wei.GenerateTensorValue(GeneratorTensor_1{}, num_thread); + break; + case 1: + in.GenerateTensorValue(GeneratorTensor_1{}, num_thread); + wei.GenerateTensorValue(GeneratorTensor_2{-5, 5}, num_thread); + break; + case 2: + in.GenerateTensorValue(GeneratorTensor_2{-5, 5}, num_thread); + wei.GenerateTensorValue(GeneratorTensor_1{}, num_thread); + break; + case 3: + in.GenerateTensorValue(GeneratorTensor_2{-5, 5}, num_thread); + wei.GenerateTensorValue(GeneratorTensor_2{-5, 5}, num_thread); + break; + default: + in.GenerateTensorValue(GeneratorTensor_2{1, 5}, num_thread); + + auto gen_wei = [](auto... is) { + return GeneratorTensor_2{1, 5}(is...) 
* GeneratorTensor_Checkboard{}(is...); + }; + wei.GenerateTensorValue(gen_wei, num_thread); + } + } + + auto f_make_for_device_nchw = [&]() { + const auto in_lengths_dev = make_tuple(N, C, Hi, Wi); + const auto wei_lengths_dev = make_tuple(K, C, Y, X); + const auto out_lengths_dev = make_tuple(N, K, Ho, Wo); + + return make_tuple(in_lengths_dev, wei_lengths_dev, out_lengths_dev); + }; + + auto f_make_for_device_nhwc = [&]() { + const auto in_lengths_dev = make_tuple(N, Hi, Wi, C); + const auto wei_lengths_dev = make_tuple(K, Y, X, C); + const auto out_lengths_dev = make_tuple(N, Ho, Wo, K); + + return make_tuple(in_lengths_dev, wei_lengths_dev, out_lengths_dev); + }; + + const auto conv_strides = make_tuple(conv_stride_h, conv_stride_w); + const auto conv_dilations = make_tuple(conv_dilation_h, conv_dilation_w); + const auto in_left_pads = make_tuple(in_left_pad_h, in_left_pad_w); + const auto in_right_pads = make_tuple(in_right_pad_h, in_right_pad_w); + +#if USE_CONV_FWD_V4R4_NCHW + if(algo == ConvForwardAlgo::V4R4NCHW) + { + if(layout != ConvTensorLayout::NCHW) + { + throw std::runtime_error("wrong! layout"); + } + + const auto tmp = f_make_for_device_nchw(); + + tunable_dyn_conv_fwd_v4r4_nchw_kcyx_nkhw* tunable = + &default_tunable_dyn_conv_fwd_v4r4_nchw_kcyx_nkhw; + + device_dynamic_convolution_forward_implicit_gemm_v4r4_nchw_kcyx_nkhw_olc( + handle, + tmp[I0], + tmp[I1], + tmp[I2], + conv_strides, + conv_dilations, + in_left_pads, + in_right_pads, + in, + wei, + out_device, + tunable, + nrepeat); + } +#endif + +#if USE_CONV_FWD_V4R5_NCHW + if(algo == ConvForwardAlgo::V4R5NCHW) + { + if(layout != ConvTensorLayout::NCHW) + { + throw std::runtime_error("wrong! 
layout"); + } + + const auto tmp = f_make_for_device_nchw(); + + tunable_dyn_conv_fwd_v4r5_nchw_kcyx_nkhw* tunable = + &default_tunable_dyn_conv_fwd_v4r5_nchw_kcyx_nkhw; + + device_dynamic_convolution_forward_implicit_gemm_v4r5_nchw_kcyx_nkhw_olc( + handle, + tmp[I0], + tmp[I1], + tmp[I2], + conv_strides, + conv_dilations, + in_left_pads, + in_right_pads, + in, + wei, + out_device, + tunable, + nrepeat); + } +#endif + + if(do_verification) + { + host_direct_convolution( + in, wei, out_host, conv_strides, conv_dilations, in_left_pads, in_right_pads); + + check_error(out_host, out_device); + + if(do_log) + { + LogRange(std::cout << "in : ", in.mData, ",") << std::endl; + LogRange(std::cout << "wei: ", wei.mData, ",") << std::endl; + LogRange(std::cout << "out_host : ", out_host.mData, ",") << std::endl; + LogRange(std::cout << "out_device: ", out_device.mData, ",") << std::endl; + } + } + + delete handle; + MY_HIP_CHECK(hipStreamDestroy(stream)); +} diff --git a/driver/include/conv_tunables.hpp b/driver/include/conv_tunables.hpp new file mode 100644 index 0000000000..33f791d289 --- /dev/null +++ b/driver/include/conv_tunables.hpp @@ -0,0 +1,131 @@ +#ifndef CONV_TUNABLES_HPP +#define CONV_TUNABLES_HPP + +#include "config.hpp" + +struct tunable_dyn_conv_fwd_v4r4_nchw_kcyx_nkhw +{ + ck::index_t BlockSize; // usually not tunable + + ck::index_t MPerBlock; + ck::index_t NPerBlock; + ck::index_t KPerBlock; + + ck::index_t M1PerThread; + ck::index_t N1PerThread; + ck::index_t KPerThread; + + ck::index_t M1N1ThreadClusterM10; + ck::index_t M1N1ThreadClusterN10; + ck::index_t M1N1ThreadClusterM11; + ck::index_t M1N1ThreadClusterN11; + + std::array ABlockTransferThreadSliceLengths_K_M0_M1; + std::array ABlockTransferThreadClusterLengths_K_M0_M1; + std::array ABlockTransferThreadClusterArrangeOrder; + std::array ABlockTransferSrcAccessOrder; + ck::index_t ABlockTransferSrcVectorDim; + ck::index_t ABlockTransferSrcScalarPerVector; + ck::index_t 
ABlockTransferDstScalarPerVector_M1; + bool AThreadTransferSrcResetCoordinateAfterRun; + + std::array BBlockTransferThreadSliceLengths_K_N0_N1; + std::array BBlockTransferThreadClusterLengths_K_N0_N1; + std::array BBlockTransferThreadClusterArrangeOrder; + std::array BBlockTransferSrcAccessOrder; + ck::index_t BBlockTransferSrcVectorDim; + ck::index_t BBlockTransferSrcScalarPerVector; + ck::index_t BBlockTransferDstScalarPerVector_N1; + bool BThreadTransferSrcResetCoordinateAfterRun; + + std::array CThreadTransferSrcDstAccessOrder; + ck::index_t CThreadTransferSrcDstVectorDim; + ck::index_t CThreadTransferDstScalarPerVector; +}; + +static tunable_dyn_conv_fwd_v4r4_nchw_kcyx_nkhw default_tunable_dyn_conv_fwd_v4r4_nchw_kcyx_nkhw = { + 256, 128, 128, 8, 4, 4, 1, + 8, 8, 2, 2, {4, 1, 1}, {2, 1, 128}, {2, 1, 0}, + {2, 1, 0}, 0, 4, 1, false, {4, 1, 1}, {2, 1, 128}, + {0, 1, 2}, {0, 1, 2}, 2, 1, 1, false, {3, 4, 5, 0, 1, 2}, + 5, 1}; + +struct tunable_dyn_conv_fwd_v4r5_nchw_kcyx_nkhw +{ + ck::index_t BlockSize; + + ck::index_t GM1PerBlockGM11; + ck::index_t GN1PerBlockGN11; + ck::index_t KPerBlock; + + ck::index_t M1PerThread; + ck::index_t N1PerThread; + ck::index_t KPerThread; + + ck::index_t M1N1ThreadClusterM10; + ck::index_t M1N1ThreadClusterN10; + ck::index_t M1N1ThreadClusterM11; + ck::index_t M1N1ThreadClusterN11; + + std::array ABlockTransferThreadSliceLengths_GK_GM0_GM10_GM11; + std::array ABlockTransferThreadClusterLengths_GK_GM0_GM10_GM11; + std::array ABlockTransferThreadClusterArrangeOrder; + std::array ABlockTransferSrcAccessOrder; + ck::index_t ABlockTransferSrcVectorDim; + ck::index_t ABlockTransferSrcScalarPerVector; + ck::index_t ABlockTransferDstScalarPerVector_GM11; + bool AThreadTransferSrcResetCoordinateAfterRun; + + std::array BBlockTransferThreadSliceLengths_GK_GN0_GN10_GN11; + std::array BBlockTransferThreadClusterLengths_GK_GN0_GN10_GN11; + std::array BBlockTransferThreadClusterArrangeOrder; + std::array BBlockTransferSrcAccessOrder; + 
ck::index_t BBlockTransferSrcVectorDim; + ck::index_t BBlockTransferSrcScalarPerVector; + ck::index_t BBlockTransferDstScalarPerVector_GN11; + bool BThreadTransferSrcResetCoordinateAfterRun; + + std::array CThreadTransferSrcDstAccessOrder; + ck::index_t CThreadTransferSrcDstVectorDim; + ck::index_t CThreadTransferDstScalarPerVector; +}; + +static tunable_dyn_conv_fwd_v4r5_nchw_kcyx_nkhw default_tunable_dyn_conv_fwd_v4r5_nchw_kcyx_nkhw = { + 256, + 128, + 32, + 8, + 4, + 4, + 1, + 2, + 2, + 8, + 8, + {4, 1, 1, 1}, + {2, 1, 1, 128}, + {3, 2, 1, 0}, + {3, 2, 1, 0}, + 0, + 4, + 1, + false, + {1, 4, 1, 1}, + {8, 1, 1, 32}, + {0, 3, 2, 1}, + {0, 3, 2, 1}, + 3, + 1, + 1, + false, + {3, 4, 5, 0, 1, 2}, + 5, + 1}; + +static inline int +conv_hw_out_size(int hw_in_size, int leftPad, int rightPad, int dilation, int yx_size, int stride) +{ + return (hw_in_size + leftPad + rightPad - dilation * (yx_size - 1) - 1) / stride + 1; +} + +#endif diff --git a/driver/include/olc_device_dynamic_convolution_forward_implicit_gemm_v4r4_nchw_kcyx_nkhw.hpp b/driver/include/olc_device_dynamic_convolution_forward_implicit_gemm_v4r4_nchw_kcyx_nkhw.hpp new file mode 100644 index 0000000000..94a9bcc06d --- /dev/null +++ b/driver/include/olc_device_dynamic_convolution_forward_implicit_gemm_v4r4_nchw_kcyx_nkhw.hpp @@ -0,0 +1,383 @@ +#include "device.hpp" +#include "host_tensor.hpp" +#include "dynamic_tensor_descriptor.hpp" +#include "dynamic_tensor_descriptor_helper.hpp" +#include "transform_forward_convolution_into_gemm_v4r4_nchw_kcyx_nkhw.hpp" + +#include "olc_driver_common.hpp" +#include "conv_tunables.hpp" + +#include "handle.hpp" + +namespace detail_dyn_conv_fwd_v4r4_nchw_kcyx_nkhw { + +template +static std::string get_network_config_string_from_types() +{ + std::string out; + + out += static_cast(Driver::get_typeid_from_type()) + + static_cast(Driver::get_typeid_from_type()) + + static_cast(Driver::get_typeid_from_type()); + + return (out); +}; + +static std::string 
+get_network_config_string_from_tunable(const tunable_dyn_conv_fwd_v4r4_nchw_kcyx_nkhw* pt) +{ + std::string out("TUN_"); + + out += std::to_string(pt->BlockSize) + "_"; + + out += std::to_string(pt->MPerBlock) + "x" + std::to_string(pt->NPerBlock) + "x" + + std::to_string(pt->KPerBlock) + "_"; + out += std::to_string(pt->M1PerThread) + "x" + std::to_string(pt->N1PerThread) + "x" + + std::to_string(pt->KPerThread) + "_"; + out += std::to_string(pt->M1N1ThreadClusterM10) + "x" + + std::to_string(pt->M1N1ThreadClusterN10) + "x" + + std::to_string(pt->M1N1ThreadClusterM11) + "x" + + std::to_string(pt->M1N1ThreadClusterN11) + "_"; + + out += std::to_string(pt->ABlockTransferThreadSliceLengths_K_M0_M1[0]) + "x" + + std::to_string(pt->ABlockTransferThreadSliceLengths_K_M0_M1[1]) + "x" + + std::to_string(pt->ABlockTransferThreadSliceLengths_K_M0_M1[2]) + "_"; + + out += std::to_string(pt->ABlockTransferThreadClusterLengths_K_M0_M1[0]) + "x" + + std::to_string(pt->ABlockTransferThreadClusterLengths_K_M0_M1[1]) + "x" + + std::to_string(pt->ABlockTransferThreadClusterLengths_K_M0_M1[2]) + "_"; + + out += std::to_string(pt->ABlockTransferThreadClusterArrangeOrder[0]) + "x" + + std::to_string(pt->ABlockTransferThreadClusterArrangeOrder[1]) + "x" + + std::to_string(pt->ABlockTransferThreadClusterArrangeOrder[2]) + "_"; + + out += std::to_string(pt->ABlockTransferSrcAccessOrder[0]) + "x" + + std::to_string(pt->ABlockTransferSrcAccessOrder[1]) + "x" + + std::to_string(pt->ABlockTransferSrcAccessOrder[2]) + "_"; + + out += std::to_string(pt->ABlockTransferSrcVectorDim) + "_"; + out += std::to_string(pt->ABlockTransferSrcScalarPerVector) + "_"; + out += std::to_string(pt->ABlockTransferDstScalarPerVector_M1) + "_"; + out += std::to_string(pt->AThreadTransferSrcResetCoordinateAfterRun) + "_"; + + out += std::to_string(pt->BBlockTransferThreadSliceLengths_K_N0_N1[0]) + "x" + + std::to_string(pt->BBlockTransferThreadSliceLengths_K_N0_N1[1]) + "x" + + 
std::to_string(pt->BBlockTransferThreadSliceLengths_K_N0_N1[2]) + "_"; + + out += std::to_string(pt->BBlockTransferThreadClusterLengths_K_N0_N1[0]) + "x" + + std::to_string(pt->BBlockTransferThreadClusterLengths_K_N0_N1[1]) + "x" + + std::to_string(pt->BBlockTransferThreadClusterLengths_K_N0_N1[2]) + "_"; + + out += std::to_string(pt->BBlockTransferThreadClusterArrangeOrder[0]) + "x" + + std::to_string(pt->BBlockTransferThreadClusterArrangeOrder[1]) + "x" + + std::to_string(pt->BBlockTransferThreadClusterArrangeOrder[2]) + "_"; + + out += std::to_string(pt->BBlockTransferSrcAccessOrder[0]) + "x" + + std::to_string(pt->BBlockTransferSrcAccessOrder[1]) + "x" + + std::to_string(pt->BBlockTransferSrcAccessOrder[2]) + "_"; + + out += std::to_string(pt->BBlockTransferSrcVectorDim) + "_"; + out += std::to_string(pt->BBlockTransferSrcScalarPerVector) + "_"; + out += std::to_string(pt->BBlockTransferDstScalarPerVector_N1) + "_"; + out += std::to_string(pt->BThreadTransferSrcResetCoordinateAfterRun) + "_"; + + out += std::to_string(pt->CThreadTransferSrcDstAccessOrder[0]) + "x" + + std::to_string(pt->CThreadTransferSrcDstAccessOrder[1]) + "x" + + std::to_string(pt->CThreadTransferSrcDstAccessOrder[2]) + "x" + + std::to_string(pt->CThreadTransferSrcDstAccessOrder[3]) + "x" + + std::to_string(pt->CThreadTransferSrcDstAccessOrder[4]) + "x" + + std::to_string(pt->CThreadTransferSrcDstAccessOrder[5]) + "_"; + + out += std::to_string(pt->CThreadTransferSrcDstVectorDim) + "_"; + out += std::to_string(pt->CThreadTransferDstScalarPerVector); + + return (out); +}; + +template +static std::string get_definition_string_from_types() +{ + std::string out; + + out += " -DCK_PARAM_IN_WEI_DATATYPE=" + std::to_string(Driver::get_typeid_from_type()) + + " -DCK_PARAM_CONV_COMPTYPE=" + std::to_string(Driver::get_typeid_from_type()) + + " -DCK_PARAM_OUT_DATATYPE=" + std::to_string(Driver::get_typeid_from_type()); + + return (out); +}; + +static std::string 
+get_definition_string_from_tunable(const tunable_dyn_conv_fwd_v4r4_nchw_kcyx_nkhw* pt) +{ + std::string out; + + out += " -DCK_PARAM_BlockSize=" + std::to_string(pt->BlockSize); + + out += " -DCK_PARAM_MPerBlock=" + std::to_string(pt->MPerBlock) + + " -DCK_PARAM_NPerBlock=" + std::to_string(pt->NPerBlock) + + " -DCK_PARAM_KPerBlock=" + std::to_string(pt->KPerBlock); + out += " -DCK_PARAM_M1PerThread=" + std::to_string(pt->M1PerThread) + + " -DCK_PARAM_N1PerThread=" + std::to_string(pt->N1PerThread) + + " -DCK_PARAM_KPerThread=" + std::to_string(pt->KPerThread); + + out += " -DCK_PARAM_M1N1ThreadClusterM10=" + std::to_string(pt->M1N1ThreadClusterM10) + + " -DCK_PARAM_M1N1ThreadClusterN10=" + std::to_string(pt->M1N1ThreadClusterN10) + + " -DCK_PARAM_M1N1ThreadClusterM11=" + std::to_string(pt->M1N1ThreadClusterM11) + + " -DCK_PARAM_M1N1ThreadClusterN11=" + std::to_string(pt->M1N1ThreadClusterN11); + + out += " -DCK_PARAM_ABlockTransferThreadSliceLengths_K_M0_M1=" + + std::to_string(pt->ABlockTransferThreadSliceLengths_K_M0_M1[0]) + "," + + std::to_string(pt->ABlockTransferThreadSliceLengths_K_M0_M1[1]) + "," + + std::to_string(pt->ABlockTransferThreadSliceLengths_K_M0_M1[2]); + + out += " -DCK_PARAM_ABlockTransferThreadClusterLengths_K_M0_M1=" + + std::to_string(pt->ABlockTransferThreadClusterLengths_K_M0_M1[0]) + "," + + std::to_string(pt->ABlockTransferThreadClusterLengths_K_M0_M1[1]) + "," + + std::to_string(pt->ABlockTransferThreadClusterLengths_K_M0_M1[2]); + + out += " -DCK_PARAM_ABlockTransferThreadClusterArrangeOrder=" + + std::to_string(pt->ABlockTransferThreadClusterArrangeOrder[0]) + "," + + std::to_string(pt->ABlockTransferThreadClusterArrangeOrder[1]) + "," + + std::to_string(pt->ABlockTransferThreadClusterArrangeOrder[2]); + + out += " -DCK_PARAM_ABlockTransferSrcAccessOrder=" + + std::to_string(pt->ABlockTransferSrcAccessOrder[0]) + "," + + std::to_string(pt->ABlockTransferSrcAccessOrder[1]) + "," + + 
std::to_string(pt->ABlockTransferSrcAccessOrder[2]); + + out += + " -DCK_PARAM_ABlockTransferSrcVectorDim=" + std::to_string(pt->ABlockTransferSrcVectorDim); + out += " -DCK_PARAM_ABlockTransferSrcScalarPerVector=" + + std::to_string(pt->ABlockTransferSrcScalarPerVector); + out += " -DCK_PARAM_ABlockTransferDstScalarPerVector_M1=" + + std::to_string(pt->ABlockTransferDstScalarPerVector_M1); + out += " -DCK_PARAM_AThreadTransferSrcResetCoordinateAfterRun=" + + std::to_string(pt->AThreadTransferSrcResetCoordinateAfterRun); + + out += " -DCK_PARAM_BBlockTransferThreadSliceLengths_K_N0_N1=" + + std::to_string(pt->BBlockTransferThreadSliceLengths_K_N0_N1[0]) + "," + + std::to_string(pt->BBlockTransferThreadSliceLengths_K_N0_N1[1]) + "," + + std::to_string(pt->BBlockTransferThreadSliceLengths_K_N0_N1[2]); + + out += " -DCK_PARAM_BBlockTransferThreadClusterLengths_K_N0_N1=" + + std::to_string(pt->BBlockTransferThreadClusterLengths_K_N0_N1[0]) + "," + + std::to_string(pt->BBlockTransferThreadClusterLengths_K_N0_N1[1]) + "," + + std::to_string(pt->BBlockTransferThreadClusterLengths_K_N0_N1[2]); + + out += " -DCK_PARAM_BBlockTransferThreadClusterArrangeOrder=" + + std::to_string(pt->BBlockTransferThreadClusterArrangeOrder[0]) + "," + + std::to_string(pt->BBlockTransferThreadClusterArrangeOrder[1]) + "," + + std::to_string(pt->BBlockTransferThreadClusterArrangeOrder[2]); + + out += " -DCK_PARAM_BBlockTransferSrcAccessOrder=" + + std::to_string(pt->BBlockTransferSrcAccessOrder[0]) + "," + + std::to_string(pt->BBlockTransferSrcAccessOrder[1]) + "," + + std::to_string(pt->BBlockTransferSrcAccessOrder[2]); + + out += + " -DCK_PARAM_BBlockTransferSrcVectorDim=" + std::to_string(pt->BBlockTransferSrcVectorDim); + out += " -DCK_PARAM_BBlockTransferSrcScalarPerVector=" + + std::to_string(pt->BBlockTransferSrcScalarPerVector); + out += " -DCK_PARAM_BBlockTransferDstScalarPerVector_N1=" + + std::to_string(pt->BBlockTransferDstScalarPerVector_N1); + out += " 
-DCK_PARAM_BThreadTransferSrcResetCoordinateAfterRun=" + + std::to_string(pt->BThreadTransferSrcResetCoordinateAfterRun); + + out += " -DCK_PARAM_CThreadTransferSrcDstAccessOrder=" + + std::to_string(pt->CThreadTransferSrcDstAccessOrder[0]) + "," + + std::to_string(pt->CThreadTransferSrcDstAccessOrder[1]) + "," + + std::to_string(pt->CThreadTransferSrcDstAccessOrder[2]) + "," + + std::to_string(pt->CThreadTransferSrcDstAccessOrder[3]) + "," + + std::to_string(pt->CThreadTransferSrcDstAccessOrder[4]) + "," + + std::to_string(pt->CThreadTransferSrcDstAccessOrder[5]); + + out += " -DCK_PARAM_CThreadTransferSrcDstVectorDim=" + + std::to_string(pt->CThreadTransferSrcDstVectorDim); + out += " -DCK_PARAM_CThreadTransferDstScalarPerVector=" + + std::to_string(pt->CThreadTransferDstScalarPerVector); + + return (out); +}; + +} // namespace detail_dyn_conv_fwd_v4r4_nchw_kcyx_nkhw + +template +void device_dynamic_convolution_forward_implicit_gemm_v4r4_nchw_kcyx_nkhw_olc( + olCompile::Handle* handle, + const InLengths& in_n_c_hi_wi_lengths, + const WeiLengths& wei_k_c_y_x_lengths, + const OutLengths& out_n_k_ho_wo_lengths, + const ConvStrides& conv_strides, + const ConvDilations& conv_dilations, + const InLeftPads& in_left_pads, + const InRightPads& in_right_pads, + const Tensor& in_n_c_hi_wi, + const Tensor& wei_k_c_y_x, + Tensor& out_n_k_ho_wo, + const tunable_dyn_conv_fwd_v4r4_nchw_kcyx_nkhw* tunable, + ck::index_t nrepeat) +{ + using namespace ck; + using namespace detail_dyn_conv_fwd_v4r4_nchw_kcyx_nkhw; + using size_t = std::size_t; + + ///////////////////////////////////////////////////////////////////////////////////////////////////////////// + // The follow codes are only used for computing the grid_size, hasMainKBlockLoop, + // hasDoubleTailKBlockLoop + + constexpr auto I0 = Number<0>{}; + constexpr auto I1 = Number<1>{}; + constexpr auto I2 = Number<2>{}; + constexpr auto I3 = Number<3>{}; + + const auto in_n_c_hi_wi_desc = + 
make_dynamic_naive_tensor_descriptor_packed_v2(in_n_c_hi_wi_lengths); + const auto wei_k_c_y_x_desc = + make_dynamic_naive_tensor_descriptor_packed_v2(wei_k_c_y_x_lengths); + const auto out_n_k_ho_wo_desc = + make_dynamic_naive_tensor_descriptor_packed_v2(out_n_k_ho_wo_lengths); + + const auto descs = + transform_forward_convolution_into_gemm_v4r4_nchw_kcyx_nkhw_pad(wei_k_c_y_x_desc, + in_n_c_hi_wi_desc, + out_n_k_ho_wo_desc, + conv_strides, + conv_dilations, + in_left_pads, + in_right_pads); + const auto a_k_m_grid_desc = descs[I0]; + const auto c_m_n_grid_desc = descs[I2]; + const auto M = c_m_n_grid_desc.GetLength(I0); + const auto N = c_m_n_grid_desc.GetLength(I1); + const auto K = a_k_m_grid_desc.GetLength(I0); + + const index_t grid_size = (M / tunable->MPerBlock) * (N / tunable->NPerBlock); + const bool hasMainKBlockLoop = ((K + tunable->KPerBlock) / (2 * tunable->KPerBlock) > 1); + const bool hasDoubleTailKBlockLoop = ((K / tunable->KPerBlock) % 2 == 0); + ///////////////////////////////////////////////////////////////////////////////////////////////////////////// + + // these buffers are usually provided by the user application + DeviceMem in_n_c_hi_wi_dev_buf(sizeof(TInWei) * in_n_c_hi_wi.mDesc.GetElementSpace()); + DeviceMem wei_k_c_y_x_dev_buf(sizeof(TInWei) * wei_k_c_y_x.mDesc.GetElementSpace()); + DeviceMem out_n_k_ho_wo_dev_buf(sizeof(TOut) * out_n_k_ho_wo.mDesc.GetElementSpace()); + + in_n_c_hi_wi_dev_buf.ToDevice(in_n_c_hi_wi.mData.data()); + wei_k_c_y_x_dev_buf.ToDevice(wei_k_c_y_x.mData.data()); + out_n_k_ho_wo_dev_buf.ToDevice(out_n_k_ho_wo.mData.data()); + + // these are workspace buffers that should be expressed to the user by the corresponding + // workspace API + DeviceMem workspace_buf(4096); + + void* a_k_m0_m1_grid_desc_dev_buf = workspace_buf.GetDeviceBuffer(); + void* b_k_n0_n1_grid_desc_dev_buf = + static_cast(static_cast(workspace_buf.GetDeviceBuffer()) + 1024); + void* c_m0_m10_m11_n0_n10_n11_grid_desc_dev_buf = + 
static_cast(static_cast(workspace_buf.GetDeviceBuffer()) + 2048); + void* c_blockid_to_m0_n0_block_cluster_adaptor_dev_buf = + static_cast(static_cast(workspace_buf.GetDeviceBuffer()) + 3072); + + const std::vector vld = {static_cast(tunable->BlockSize), 1, 1}; + const std::vector vgd1 = {static_cast(tunable->BlockSize), 1, 1}; + const std::vector vgd2 = {static_cast(grid_size * tunable->BlockSize), 1, 1}; + + std::string program_name = "dynamic_convolution_forward_implicit_gemm_v4r4_nchw_kcyx_nkhw.cpp"; + std::string algo_name = "implicit_gemm_conv_fwd_v4r4_nchw"; + + std::string param = " -std=c++17 "; + std::string network_config; + + param += get_definition_string_from_types() + " " + + get_definition_string_from_tunable(tunable) + + " -DCK_PARAM_HAS_MAIN_KBLOCK_LOOP=" + std::to_string(hasMainKBlockLoop) + + " -DCK_PARAM_HAS_DOUBLE_TAIL_KBLOCK_LOOP=" + std::to_string(hasDoubleTailKBlockLoop); + network_config = get_network_config_string_from_types() + "_" + + get_network_config_string_from_tunable(tunable) + "_" + + std::to_string(hasMainKBlockLoop) + "_" + + std::to_string(hasDoubleTailKBlockLoop); + + std::vector kernel1_times; + std::vector kernel2_times; + + for(index_t i = 0; i < nrepeat; ++i) + { + KernelTimer timer1, timer2; + std::string kernel_name; + + kernel_name = "dynamic_convolution_forward_implicit_gemm_v4r4_nchw_kcyx_nkhw_prepare"; + auto network_config_1 = network_config + "_1"; + + timer1.Start(); + handle->AddKernel(algo_name, network_config_1, program_name, kernel_name, vld, vgd1, param)( + static_cast(in_n_c_hi_wi_lengths[I0]), + static_cast(in_n_c_hi_wi_lengths[I1]), + static_cast(in_n_c_hi_wi_lengths[I2]), + static_cast(in_n_c_hi_wi_lengths[I3]), + static_cast(wei_k_c_y_x_lengths[I0]), + static_cast(wei_k_c_y_x_lengths[I2]), + static_cast(wei_k_c_y_x_lengths[I3]), + conv_strides[I0], + conv_strides[I1], + conv_dilations[I0], + conv_dilations[I1], + in_left_pads[I0], + in_left_pads[I1], + in_right_pads[I0], + in_right_pads[I1], + 
a_k_m0_m1_grid_desc_dev_buf, + b_k_n0_n1_grid_desc_dev_buf, + c_m0_m10_m11_n0_n10_n11_grid_desc_dev_buf, + c_blockid_to_m0_n0_block_cluster_adaptor_dev_buf); + timer1.End(); + + kernel_name = "dynamic_convolution_forward_implicit_gemm_v4r4_nchw_kcyx_nkhw"; + auto network_config_2 = network_config + "_2"; + + timer2.Start(); + handle->AddKernel(algo_name, network_config_2, program_name, kernel_name, vld, vgd2, param)( + reinterpret_cast(wei_k_c_y_x_dev_buf.GetDeviceBuffer()), + reinterpret_cast(in_n_c_hi_wi_dev_buf.GetDeviceBuffer()), + reinterpret_cast(out_n_k_ho_wo_dev_buf.GetDeviceBuffer()), + (const void*)(a_k_m0_m1_grid_desc_dev_buf), + (const void*)(b_k_n0_n1_grid_desc_dev_buf), + (const void*)(c_m0_m10_m11_n0_n10_n11_grid_desc_dev_buf), + (const void*)(c_blockid_to_m0_n0_block_cluster_adaptor_dev_buf)); + timer2.End(); + + kernel1_times.push_back(timer1.GetElapsedTime()); + kernel2_times.push_back(timer2.GetElapsedTime()); + } + + { + auto ave_time1 = Driver::get_effective_average(kernel1_times); + auto ave_time2 = Driver::get_effective_average(kernel2_times); + + const auto N = in_n_c_hi_wi_lengths[I0]; + const auto C = in_n_c_hi_wi_lengths[I1]; + + const auto K = out_n_k_ho_wo_lengths[I1]; + const auto Ho = out_n_k_ho_wo_lengths[I2]; + const auto Wo = out_n_k_ho_wo_lengths[I3]; + + const auto Y = wei_k_c_y_x_lengths[I2]; + const auto X = wei_k_c_y_x_lengths[I3]; + + float perf = (float)(std::size_t(2) * N * K * Ho * Wo * C * Y * X) / + (std::size_t(1000) * 1000 * 1000) / (ave_time1 + ave_time2); + + std::cout << "Average time : " << ave_time1 + ave_time2 << " ms(" << ave_time1 << ", " + << ave_time2 << "), " << perf << " TFlop/s" << std::endl; + }; + + // copy result back to host + out_n_k_ho_wo_dev_buf.FromDevice(out_n_k_ho_wo.mData.data()); +} diff --git a/driver/include/olc_device_dynamic_convolution_forward_implicit_gemm_v4r5_nchw_kcyx_nkhw.hpp b/driver/include/olc_device_dynamic_convolution_forward_implicit_gemm_v4r5_nchw_kcyx_nkhw.hpp new file mode 
100644 index 0000000000..914de6e81b --- /dev/null +++ b/driver/include/olc_device_dynamic_convolution_forward_implicit_gemm_v4r5_nchw_kcyx_nkhw.hpp @@ -0,0 +1,404 @@ +#include "device.hpp" +#include "host_tensor.hpp" +#include "dynamic_tensor_descriptor.hpp" +#include "dynamic_tensor_descriptor_helper.hpp" +#include "transform_forward_convolution_into_gemm_v4r5_nchw_kcyx_nkhw.hpp" + +#include "olc_driver_common.hpp" +#include "conv_tunables.hpp" + +#include "handle.hpp" + +namespace detail_dyn_conv_fwd_v4r5_nchw_kcyx_nkhw { + +template <typename TInWei, typename TAcc, typename TOut> +static std::string get_network_config_string_from_types() +{ + std::string out; + + // append each type id as its own character; chaining the casts with '+' would + // sum the three chars into an int and append a single truncated character, + // letting different type combinations collide in the kernel-cache key + out += static_cast<char>(Driver::get_typeid_from_type<TInWei>()); + out += static_cast<char>(Driver::get_typeid_from_type<TAcc>()); + out += static_cast<char>(Driver::get_typeid_from_type<TOut>()); + + return (out); +}; + +static std::string +get_network_config_string_from_tunable(const tunable_dyn_conv_fwd_v4r5_nchw_kcyx_nkhw* pt) +{ + std::string out("TUN_"); + + out += std::to_string(pt->BlockSize) + "_"; + + out += std::to_string(pt->GM1PerBlockGM11) + "x" + std::to_string(pt->GN1PerBlockGN11) + "x" + + std::to_string(pt->KPerBlock) + "_"; + out += std::to_string(pt->M1PerThread) + "x" + std::to_string(pt->N1PerThread) + "x" + + std::to_string(pt->KPerThread) + "_"; + out += std::to_string(pt->M1N1ThreadClusterM10) + "x" + + std::to_string(pt->M1N1ThreadClusterN10) + "x" + + std::to_string(pt->M1N1ThreadClusterM11) + "x" + + std::to_string(pt->M1N1ThreadClusterN11) + "_"; + + out += std::to_string(pt->ABlockTransferThreadSliceLengths_GK_GM0_GM10_GM11[0]) + "x" + + std::to_string(pt->ABlockTransferThreadSliceLengths_GK_GM0_GM10_GM11[1]) + "x" + + std::to_string(pt->ABlockTransferThreadSliceLengths_GK_GM0_GM10_GM11[2]) + "x" + + std::to_string(pt->ABlockTransferThreadSliceLengths_GK_GM0_GM10_GM11[3]) + "_"; + + out += std::to_string(pt->ABlockTransferThreadClusterLengths_GK_GM0_GM10_GM11[0]) + "x" + + std::to_string(pt->ABlockTransferThreadClusterLengths_GK_GM0_GM10_GM11[1]) + "x" + +
std::to_string(pt->ABlockTransferThreadClusterLengths_GK_GM0_GM10_GM11[2]) + "x" + + std::to_string(pt->ABlockTransferThreadClusterLengths_GK_GM0_GM10_GM11[3]) + "_"; + + out += std::to_string(pt->ABlockTransferThreadClusterArrangeOrder[0]) + "x" + + std::to_string(pt->ABlockTransferThreadClusterArrangeOrder[1]) + "x" + + std::to_string(pt->ABlockTransferThreadClusterArrangeOrder[2]) + "x" + + std::to_string(pt->ABlockTransferThreadClusterArrangeOrder[3]) + "_"; + + out += std::to_string(pt->ABlockTransferSrcAccessOrder[0]) + "x" + + std::to_string(pt->ABlockTransferSrcAccessOrder[1]) + "x" + + std::to_string(pt->ABlockTransferSrcAccessOrder[2]) + "x" + + std::to_string(pt->ABlockTransferSrcAccessOrder[3]) + "_"; + + out += std::to_string(pt->ABlockTransferSrcVectorDim) + "_"; + out += std::to_string(pt->ABlockTransferSrcScalarPerVector) + "_"; + out += std::to_string(pt->ABlockTransferDstScalarPerVector_GM11) + "_"; + out += std::to_string(pt->AThreadTransferSrcResetCoordinateAfterRun) + "_"; + + out += std::to_string(pt->BBlockTransferThreadSliceLengths_GK_GN0_GN10_GN11[0]) + "x" + + std::to_string(pt->BBlockTransferThreadSliceLengths_GK_GN0_GN10_GN11[1]) + "x" + + std::to_string(pt->BBlockTransferThreadSliceLengths_GK_GN0_GN10_GN11[2]) + "x" + + std::to_string(pt->BBlockTransferThreadSliceLengths_GK_GN0_GN10_GN11[3]) + "_"; + + out += std::to_string(pt->BBlockTransferThreadClusterLengths_GK_GN0_GN10_GN11[0]) + "x" + + std::to_string(pt->BBlockTransferThreadClusterLengths_GK_GN0_GN10_GN11[1]) + "x" + + std::to_string(pt->BBlockTransferThreadClusterLengths_GK_GN0_GN10_GN11[2]) + "x" + + std::to_string(pt->BBlockTransferThreadClusterLengths_GK_GN0_GN10_GN11[3]) + "_"; + + out += std::to_string(pt->BBlockTransferThreadClusterArrangeOrder[0]) + "x" + + std::to_string(pt->BBlockTransferThreadClusterArrangeOrder[1]) + "x" + + std::to_string(pt->BBlockTransferThreadClusterArrangeOrder[2]) + "x" + + std::to_string(pt->BBlockTransferThreadClusterArrangeOrder[3]) + "_"; + + out
+= std::to_string(pt->BBlockTransferSrcAccessOrder[0]) + "x" + + std::to_string(pt->BBlockTransferSrcAccessOrder[1]) + "x" + + std::to_string(pt->BBlockTransferSrcAccessOrder[2]) + "x" + + std::to_string(pt->BBlockTransferSrcAccessOrder[3]) + "_"; + + out += std::to_string(pt->BBlockTransferSrcVectorDim) + "_"; + out += std::to_string(pt->BBlockTransferSrcScalarPerVector) + "_"; + out += std::to_string(pt->BBlockTransferDstScalarPerVector_GN11) + "_"; + out += std::to_string(pt->BThreadTransferSrcResetCoordinateAfterRun) + "_"; + + out += std::to_string(pt->CThreadTransferSrcDstAccessOrder[0]) + "x" + + std::to_string(pt->CThreadTransferSrcDstAccessOrder[1]) + "x" + + std::to_string(pt->CThreadTransferSrcDstAccessOrder[2]) + "x" + + std::to_string(pt->CThreadTransferSrcDstAccessOrder[3]) + "x" + + std::to_string(pt->CThreadTransferSrcDstAccessOrder[4]) + "x" + + std::to_string(pt->CThreadTransferSrcDstAccessOrder[5]) + "_"; + + out += std::to_string(pt->CThreadTransferSrcDstVectorDim) + "_"; + out += std::to_string(pt->CThreadTransferDstScalarPerVector); + + return (out); +}; + +template +static std::string get_definition_string_from_types() +{ + std::string out; + + out += " -DCK_PARAM_IN_WEI_DATATYPE=" + std::to_string(Driver::get_typeid_from_type()) + + " -DCK_PARAM_CONV_COMPTYPE=" + std::to_string(Driver::get_typeid_from_type()) + + " -DCK_PARAM_OUT_DATATYPE=" + std::to_string(Driver::get_typeid_from_type()); + + return (out); +}; + +static std::string +get_definition_string_from_tunable(const tunable_dyn_conv_fwd_v4r5_nchw_kcyx_nkhw* pt) +{ + std::string out; + + out += " -DCK_PARAM_BlockSize=" + std::to_string(pt->BlockSize); + + out += " -DCK_PARAM_GM1PerBlockGM11=" + std::to_string(pt->GM1PerBlockGM11) + + " -DCK_PARAM_GN1PerBlockGN11=" + std::to_string(pt->GN1PerBlockGN11) + + " -DCK_PARAM_KPerBlock=" + std::to_string(pt->KPerBlock); + out += " -DCK_PARAM_M1PerThread=" + std::to_string(pt->M1PerThread) + + " -DCK_PARAM_N1PerThread=" + 
std::to_string(pt->N1PerThread) + + " -DCK_PARAM_KPerThread=" + std::to_string(pt->KPerThread); + + out += " -DCK_PARAM_M1N1ThreadClusterM10=" + std::to_string(pt->M1N1ThreadClusterM10) + + " -DCK_PARAM_M1N1ThreadClusterN10=" + std::to_string(pt->M1N1ThreadClusterN10) + + " -DCK_PARAM_M1N1ThreadClusterM11=" + std::to_string(pt->M1N1ThreadClusterM11) + + " -DCK_PARAM_M1N1ThreadClusterN11=" + std::to_string(pt->M1N1ThreadClusterN11); + + out += " -DCK_PARAM_ABlockTransferThreadSliceLengths_GK_GM0_GM10_GM11=" + + std::to_string(pt->ABlockTransferThreadSliceLengths_GK_GM0_GM10_GM11[0]) + "," + + std::to_string(pt->ABlockTransferThreadSliceLengths_GK_GM0_GM10_GM11[1]) + "," + + std::to_string(pt->ABlockTransferThreadSliceLengths_GK_GM0_GM10_GM11[2]) + "," + + std::to_string(pt->ABlockTransferThreadSliceLengths_GK_GM0_GM10_GM11[3]); + + out += " -DCK_PARAM_ABlockTransferThreadClusterLengths_GK_GM0_GM10_GM11=" + + std::to_string(pt->ABlockTransferThreadClusterLengths_GK_GM0_GM10_GM11[0]) + "," + + std::to_string(pt->ABlockTransferThreadClusterLengths_GK_GM0_GM10_GM11[1]) + "," + + std::to_string(pt->ABlockTransferThreadClusterLengths_GK_GM0_GM10_GM11[2]) + "," + + std::to_string(pt->ABlockTransferThreadClusterLengths_GK_GM0_GM10_GM11[3]); + + out += " -DCK_PARAM_ABlockTransferThreadClusterArrangeOrder=" + + std::to_string(pt->ABlockTransferThreadClusterArrangeOrder[0]) + "," + + std::to_string(pt->ABlockTransferThreadClusterArrangeOrder[1]) + "," + + std::to_string(pt->ABlockTransferThreadClusterArrangeOrder[2]) + "," + + std::to_string(pt->ABlockTransferThreadClusterArrangeOrder[3]); + + out += " -DCK_PARAM_ABlockTransferSrcAccessOrder=" + + std::to_string(pt->ABlockTransferSrcAccessOrder[0]) + "," + + std::to_string(pt->ABlockTransferSrcAccessOrder[1]) + "," + + std::to_string(pt->ABlockTransferSrcAccessOrder[2]) + "," + + std::to_string(pt->ABlockTransferSrcAccessOrder[3]); + + out += + " -DCK_PARAM_ABlockTransferSrcVectorDim=" + 
std::to_string(pt->ABlockTransferSrcVectorDim); + out += " -DCK_PARAM_ABlockTransferSrcScalarPerVector=" + + std::to_string(pt->ABlockTransferSrcScalarPerVector); + out += " -DCK_PARAM_ABlockTransferDstScalarPerVector_GM11=" + + std::to_string(pt->ABlockTransferDstScalarPerVector_GM11); + out += " -DCK_PARAM_AThreadTransferSrcResetCoordinateAfterRun=" + + std::to_string(pt->AThreadTransferSrcResetCoordinateAfterRun); + + out += " -DCK_PARAM_BBlockTransferThreadSliceLengths_GK_GN0_GN10_GN11=" + + std::to_string(pt->BBlockTransferThreadSliceLengths_GK_GN0_GN10_GN11[0]) + "," + + std::to_string(pt->BBlockTransferThreadSliceLengths_GK_GN0_GN10_GN11[1]) + "," + + std::to_string(pt->BBlockTransferThreadSliceLengths_GK_GN0_GN10_GN11[2]) + "," + + std::to_string(pt->BBlockTransferThreadSliceLengths_GK_GN0_GN10_GN11[3]); + + out += " -DCK_PARAM_BBlockTransferThreadClusterLengths_GK_GN0_GN10_GN11=" + + std::to_string(pt->BBlockTransferThreadClusterLengths_GK_GN0_GN10_GN11[0]) + "," + + std::to_string(pt->BBlockTransferThreadClusterLengths_GK_GN0_GN10_GN11[1]) + "," + + std::to_string(pt->BBlockTransferThreadClusterLengths_GK_GN0_GN10_GN11[2]) + "," + + std::to_string(pt->BBlockTransferThreadClusterLengths_GK_GN0_GN10_GN11[3]); + + out += " -DCK_PARAM_BBlockTransferThreadClusterArrangeOrder=" + + std::to_string(pt->BBlockTransferThreadClusterArrangeOrder[0]) + "," + + std::to_string(pt->BBlockTransferThreadClusterArrangeOrder[1]) + "," + + std::to_string(pt->BBlockTransferThreadClusterArrangeOrder[2]) + "," + + std::to_string(pt->BBlockTransferThreadClusterArrangeOrder[3]); + + out += " -DCK_PARAM_BBlockTransferSrcAccessOrder=" + + std::to_string(pt->BBlockTransferSrcAccessOrder[0]) + "," + + std::to_string(pt->BBlockTransferSrcAccessOrder[1]) + "," + + std::to_string(pt->BBlockTransferSrcAccessOrder[2]) + "," + + std::to_string(pt->BBlockTransferSrcAccessOrder[3]); + + out += + " -DCK_PARAM_BBlockTransferSrcVectorDim=" + std::to_string(pt->BBlockTransferSrcVectorDim); + out 
+= " -DCK_PARAM_BBlockTransferSrcScalarPerVector=" + + std::to_string(pt->BBlockTransferSrcScalarPerVector); + out += " -DCK_PARAM_BBlockTransferDstScalarPerVector_GN11=" + + std::to_string(pt->BBlockTransferDstScalarPerVector_GN11); + out += " -DCK_PARAM_BThreadTransferSrcResetCoordinateAfterRun=" + + std::to_string(pt->BThreadTransferSrcResetCoordinateAfterRun); + + out += " -DCK_PARAM_CThreadTransferSrcDstAccessOrder=" + + std::to_string(pt->CThreadTransferSrcDstAccessOrder[0]) + "," + + std::to_string(pt->CThreadTransferSrcDstAccessOrder[1]) + "," + + std::to_string(pt->CThreadTransferSrcDstAccessOrder[2]) + "," + + std::to_string(pt->CThreadTransferSrcDstAccessOrder[3]) + "," + + std::to_string(pt->CThreadTransferSrcDstAccessOrder[4]) + "," + + std::to_string(pt->CThreadTransferSrcDstAccessOrder[5]); + + out += " -DCK_PARAM_CThreadTransferSrcDstVectorDim=" + + std::to_string(pt->CThreadTransferSrcDstVectorDim); + out += " -DCK_PARAM_CThreadTransferDstScalarPerVector=" + + std::to_string(pt->CThreadTransferDstScalarPerVector); + + return (out); +}; + +} // namespace detail_dyn_conv_fwd_v4r5_nchw_kcyx_nkhw + +template +void device_dynamic_convolution_forward_implicit_gemm_v4r5_nchw_kcyx_nkhw_olc( + olCompile::Handle* handle, + const InLengths& in_n_c_hi_wi_lengths, + const WeiLengths& wei_k_c_y_x_lengths, + const OutLengths& out_n_k_ho_wo_lengths, + const ConvStrides& conv_strides, + const ConvDilations& conv_dilations, + const InLeftPads& in_left_pads, + const InRightPads& in_right_pads, + const Tensor& in_n_c_hi_wi, + const Tensor& wei_k_c_y_x, + Tensor& out_n_k_ho_wo, + const tunable_dyn_conv_fwd_v4r5_nchw_kcyx_nkhw* tunable, + ck::index_t nrepeat) +{ + using namespace ck; + using namespace detail_dyn_conv_fwd_v4r5_nchw_kcyx_nkhw; + using size_t = std::size_t; + + constexpr index_t N0 = 4; // this could not be a tunable so far + + //////////////////////////////////////////////////////////////////////////////////////////////////////////// + // The follow codes 
are only used for computing the grid_size, hasMainKBlockLoop, + // hasDoubleTailKBlockLoop + + constexpr auto I0 = Number<0>{}; + constexpr auto I1 = Number<1>{}; + constexpr auto I2 = Number<2>{}; + constexpr auto I3 = Number<3>{}; + + const auto in_n_c_hi_wi_desc = + make_dynamic_naive_tensor_descriptor_packed_v2(in_n_c_hi_wi_lengths); + const auto wei_k_c_y_x_desc = + make_dynamic_naive_tensor_descriptor_packed_v2(wei_k_c_y_x_lengths); + const auto out_n_k_ho_wo_desc = + make_dynamic_naive_tensor_descriptor_packed_v2(out_n_k_ho_wo_lengths); + + const auto descs = transform_forward_convolution_into_contraction_v4r5_nchw_kcyx_nkhw_pad( + wei_k_c_y_x_desc, + in_n_c_hi_wi_desc, + out_n_k_ho_wo_desc, + conv_strides, + conv_dilations, + in_left_pads, + in_right_pads); + + const auto a_gk_gm0_gm1_grid_desc = descs[I0]; + const auto c_gm0_gm1_gn0_gn1_grid_desc = descs[I2]; + + const auto GM1 = c_gm0_gm1_gn0_gn1_grid_desc.GetLength(I1); + const auto GN1 = c_gm0_gm1_gn0_gn1_grid_desc.GetLength(I3); + const auto GK = a_gk_gm0_gm1_grid_desc.GetLength(I0); + + const index_t grid_size = (GM1 / tunable->GM1PerBlockGM11) * (GN1 / tunable->GN1PerBlockGN11); + const bool hasMainKBlockLoop = ((GK + tunable->KPerBlock) / (2 * tunable->KPerBlock) > 1); + const bool hasDoubleTailKBlockLoop = ((GK / tunable->KPerBlock) % 2 == 0); + + /////////////////////////////////////////////////////////////////////////////////////////////////////////// + + // these buffers are usually provided by the user application + DeviceMem in_n_c_hi_wi_dev_buf(sizeof(TInWei) * in_n_c_hi_wi.mDesc.GetElementSpace()); + DeviceMem wei_k_c_y_x_dev_buf(sizeof(TInWei) * wei_k_c_y_x.mDesc.GetElementSpace()); + DeviceMem out_n_k_ho_wo_dev_buf(sizeof(TOut) * out_n_k_ho_wo.mDesc.GetElementSpace()); + + in_n_c_hi_wi_dev_buf.ToDevice(in_n_c_hi_wi.mData.data()); + wei_k_c_y_x_dev_buf.ToDevice(wei_k_c_y_x.mData.data()); + out_n_k_ho_wo_dev_buf.ToDevice(out_n_k_ho_wo.mData.data()); + + // these are workspace buffers that 
should be expressed to the user by the corresponding + // workspace API + DeviceMem workspace_buf(4096); + + void* a_gk_gm0_gm10_gm11_grid_desc_dev_buf = workspace_buf.GetDeviceBuffer(); + void* b_gk_gn0_gn10_gn11_grid_desc_dev_buf = + static_cast(static_cast(workspace_buf.GetDeviceBuffer()) + 1024); + void* c_gm10_bm0_bm1_gn10_bn0_bn1_grid_desc_dev_buf = + static_cast(static_cast(workspace_buf.GetDeviceBuffer()) + 2048); + void* c_blockid_to_gm10_gn10_block_cluster_adaptor_dev_buf = + static_cast(static_cast(workspace_buf.GetDeviceBuffer()) + 3072); + + const std::vector vld = {static_cast(tunable->BlockSize), 1, 1}; + const std::vector vgd1 = {static_cast(tunable->BlockSize), 1, 1}; + const std::vector vgd2 = {static_cast(grid_size * tunable->BlockSize), 1, 1}; + + std::string program_name = "dynamic_convolution_forward_implicit_gemm_v4r5_nchw_kcyx_nkhw.cpp"; + // NOTE(review): was "implicit_gemm_conv_fwd_v4r4_nchw" — copy-paste from the v4r4 driver; + // this string is part of the kernel-cache key, so the v4r5 driver must use its own name + std::string algo_name = "implicit_gemm_conv_fwd_v4r5_nchw"; + + std::string param = " -std=c++17 "; + std::string network_config; + + param += get_definition_string_from_types() + + " -DCK_PARAM_HAS_MAIN_KBLOCK_LOOP=" + std::to_string(hasMainKBlockLoop) + + " -DCK_PARAM_HAS_DOUBLE_TAIL_KBLOCK_LOOP=" + std::to_string(hasDoubleTailKBlockLoop) + + " -DCK_PARAM_N0=" + std::to_string(N0) + " " + + get_definition_string_from_tunable(tunable); + // network_config must encode every compile-time option baked into param, + // including hasMainKBlockLoop (previously omitted, so two builds differing + // only in that flag would collide on the same cached kernel) + network_config = get_network_config_string_from_types() + "_V" + + std::to_string(hasMainKBlockLoop) + "_" + + std::to_string(hasDoubleTailKBlockLoop) + "_" + std::to_string(N0) + "_" + + get_network_config_string_from_tunable(tunable); + + std::vector kernel1_times; + std::vector kernel2_times; + + for(index_t i = 0; i < nrepeat; ++i) + { + KernelTimer timer1, timer2; + std::string kernel_name; + + kernel_name = "dynamic_convolution_forward_implicit_gemm_v4r5_nchw_kcyx_nkhw_prepare"; + auto network_config_1 = network_config + "_1"; + + timer1.Start(); + handle->AddKernel(algo_name, network_config_1, program_name, kernel_name, vld, vgd1, param)( + static_cast(in_n_c_hi_wi_lengths[I0]), + 
static_cast(in_n_c_hi_wi_lengths[I1]), + static_cast(in_n_c_hi_wi_lengths[I2]), + static_cast(in_n_c_hi_wi_lengths[I3]), + static_cast(wei_k_c_y_x_lengths[I0]), + static_cast(wei_k_c_y_x_lengths[I2]), + static_cast(wei_k_c_y_x_lengths[I3]), + conv_strides[I0], + conv_strides[I1], + conv_dilations[I0], + conv_dilations[I1], + in_left_pads[I0], + in_left_pads[I1], + in_right_pads[I0], + in_right_pads[I1], + a_gk_gm0_gm10_gm11_grid_desc_dev_buf, + b_gk_gn0_gn10_gn11_grid_desc_dev_buf, + c_gm10_bm0_bm1_gn10_bn0_bn1_grid_desc_dev_buf, + c_blockid_to_gm10_gn10_block_cluster_adaptor_dev_buf); + timer1.End(); // was timer2.End() — copy-paste bug: timer1 was never stopped and + // timer2 was Ended before Start, so both reported times were garbage + + kernel_name = "dynamic_convolution_forward_implicit_gemm_v4r5_nchw_kcyx_nkhw"; + auto network_config_2 = network_config + "_2"; + + timer2.Start(); + handle->AddKernel(algo_name, network_config_2, program_name, kernel_name, vld, vgd2, param)( + reinterpret_cast(wei_k_c_y_x_dev_buf.GetDeviceBuffer()), + reinterpret_cast(in_n_c_hi_wi_dev_buf.GetDeviceBuffer()), + reinterpret_cast(out_n_k_ho_wo_dev_buf.GetDeviceBuffer()), + (const void*)(a_gk_gm0_gm10_gm11_grid_desc_dev_buf), + (const void*)(b_gk_gn0_gn10_gn11_grid_desc_dev_buf), + (const void*)(c_gm10_bm0_bm1_gn10_bn0_bn1_grid_desc_dev_buf), + (const void*)(c_blockid_to_gm10_gn10_block_cluster_adaptor_dev_buf)); + timer2.End(); + + kernel1_times.push_back(timer1.GetElapsedTime()); + kernel2_times.push_back(timer2.GetElapsedTime()); + } + + { + auto ave_time1 = Driver::get_effective_average(kernel1_times); + auto ave_time2 = Driver::get_effective_average(kernel2_times); + + const auto N = in_n_c_hi_wi_lengths[I0]; + const auto C = in_n_c_hi_wi_lengths[I1]; + + const auto K = out_n_k_ho_wo_lengths[I1]; + const auto Ho = out_n_k_ho_wo_lengths[I2]; + const auto Wo = out_n_k_ho_wo_lengths[I3]; + + const auto Y = wei_k_c_y_x_lengths[I2]; + const auto X = wei_k_c_y_x_lengths[I3]; + + float perf = (float)(std::size_t(2) * N * K * Ho * Wo * C * Y * X) / + (std::size_t(1000) * 1000 * 1000) / (ave_time1 + ave_time2); + + 
std::cout << "Average time : " << ave_time1 + ave_time2 << " ms(" << ave_time1 << ", " + << ave_time2 << "), " << perf << " TFlop/s" << std::endl; + }; + + // copy result back to host + out_n_k_ho_wo_dev_buf.FromDevice(out_n_k_ho_wo.mData.data()); +} diff --git a/driver/include/olc_driver_common.hpp b/driver/include/olc_driver_common.hpp new file mode 100644 index 0000000000..383bf4c6a4 --- /dev/null +++ b/driver/include/olc_driver_common.hpp @@ -0,0 +1,114 @@ +#ifndef OLC_DRIVER_COMMON_HPP +#define OLC_DRIVER_COMMON_HPP + +#include +#include +#include + +// this enumerate should be synchronized with include/miopen.h +typedef enum { + appHalf = 0, + appFloat = 1, + appInt32 = 2, + appInt8 = 3, + appInt8x4 = 4, + appBFloat16 = 5, + appDouble = 6, +} appDataType_t; + +namespace Driver { + +template +struct get_type_from_type_enum +{ + using type = float; +}; + +template <> +struct get_type_from_type_enum +{ + using type = half_float::half; +}; + +template <> +struct get_type_from_type_enum +{ + using type = float; +}; + +template <> +struct get_type_from_type_enum +{ + using type = double; +}; + +template <> +struct get_type_from_type_enum +{ + using type = int; +}; + +static inline int get_typeid_from_type_enum(appDataType_t t) +{ + switch(t) + { + case appHalf: return (static_cast('H')); + case appFloat: return (static_cast('F')); + case appBFloat16: return (static_cast('B')); + case appDouble: return (static_cast('D')); + case appInt8: + case appInt8x4: + case appInt32: return (static_cast('O')); + default: throw std::runtime_error("Only float, half, bfloat16 data type is supported."); break; + }; +}; + +template +static inline int get_typeid_from_type() +{ + throw std::runtime_error("Unsupported typeid conversion for this type!"); +}; + +template <> +inline int get_typeid_from_type() +{ + return (static_cast('F')); +}; + +template <> +inline int get_typeid_from_type() +{ + return (static_cast('H')); +}; + +template <> +inline int get_typeid_from_type() +{ + 
return (static_cast('D')); +}; + +static inline float get_effective_average(std::vector& values) +{ + assert(!values.empty()); + + if(values.size() == 1) + return (values[0]); + else + { + float sum = 0.0f; + float maxVal = 0.0f; + + for(const auto val : values) + { + if(maxVal < val) + maxVal = val; + sum += val; + }; + + return ((sum - maxVal) / (values.size() - 1)); + }; +}; + +} // namespace Driver + +#endif diff --git a/driver/olCompiling/addkernels/CMakeLists.txt b/driver/olCompiling/addkernels/CMakeLists.txt new file mode 100644 index 0000000000..874cba6a5e --- /dev/null +++ b/driver/olCompiling/addkernels/CMakeLists.txt @@ -0,0 +1,30 @@ +################################################################################ +# +# MIT License +# +# Copyright (c) 2017 Advanced Micro Devices, Inc. +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. 
+# +################################################################################ + +set(ADD_KERNELS_SOURCE include_inliner.cpp addkernels.cpp) + +add_executable(addkernels EXCLUDE_FROM_ALL ${ADD_KERNELS_SOURCE}) + diff --git a/driver/olCompiling/addkernels/addkernels.cpp b/driver/olCompiling/addkernels/addkernels.cpp new file mode 100644 index 0000000000..5be523d97b --- /dev/null +++ b/driver/olCompiling/addkernels/addkernels.cpp @@ -0,0 +1,264 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2021 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + *******************************************************************************/ +#include "include_inliner.hpp" +#include +#include +#include +#include +#include +#include +#include + +void Bin2Hex(std::istream& source, + std::ostream& target, + const std::string& variable, + bool nullTerminate, + size_t bufferSize, + size_t lineSize) +{ + source.seekg(0, std::ios::end); + std::unique_ptr buffer(new unsigned char[bufferSize]); + std::streamoff sourceSize = source.tellg(); + std::streamoff blockStart = 0; + + if(variable.length() != 0) + { + target << "extern const size_t " << variable << "_SIZE;" << std::endl; + target << "extern const unsigned char " << variable << "[];" << std::endl; + target << "const size_t " << variable << "_SIZE = " << std::setbase(10) << sourceSize << ";" + << std::endl; + target << "const unsigned char " << variable << "[] = {" << std::endl; + } + + target << std::setbase(16) << std::setfill('0'); + source.seekg(0, std::ios::beg); + + while(blockStart < sourceSize) + { + source.read(reinterpret_cast(buffer.get()), bufferSize); + + std::streamoff pos = source.tellg(); + std::streamoff blockSize = (pos < 0 ? sourceSize : pos) - blockStart; + std::streamoff i = 0; + + while(i < blockSize) + { + size_t j = i; + size_t end = std::min(i + lineSize, blockSize); + + for(; j < end; j++) + target << "0x" << std::setw(2) << static_cast(buffer[j]) << ","; + + target << std::endl; + i = end; + } + + blockStart += blockSize; + } + + if(nullTerminate) + target << "0x00," << std::endl; + + if(variable.length() != 0) + { + target << "};" << std::endl; + } +} + +void PrintHelp() +{ + std::cout << "Usage: bin2hex {