reorganized files

[ROCm/composable_kernel commit: 1566b31736]
2026-06-30 11:47:48 +00:00 · 2019-06-13 15:12:12 -05:00
parent 11c6b2ab9a
commit 5f217ebda5
64 changed files with 254 additions and 218 deletions
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -46,8 +46,19 @@ endif()

 #
 include_directories(BEFORE
-    include
-    ${PROJECT_BINARY_DIR}/include
+    ${PROJECT_SOURCE_DIR}/composable_kernel/include
+    ${PROJECT_SOURCE_DIR}/composable_kernel/include/utility
+    ${PROJECT_SOURCE_DIR}/composable_kernel/include/tensor_description
+    ${PROJECT_SOURCE_DIR}/composable_kernel/include/tensor_operation
+    ${PROJECT_SOURCE_DIR}/composable_kernel/include/kernel_algorithm
+    ${PROJECT_SOURCE_DIR}/driver/include
+    ${PROJECT_BINARY_DIR}/composable_kernel/include/utility
 )
-add_subdirectory(src)
+
+if(DEVICE_BACKEND STREQUAL "AMD")
+    configure_file("${PROJECT_SOURCE_DIR}/composable_kernel/include/utility/config_amd.hpp.in" "${PROJECT_BINARY_DIR}/composable_kernel/include/utility/config.hpp")
+elseif(DEVICE_BACKEND STREQUAL "NVIDIA")
+    configure_file("${PROJECT_SOURCE_DIR}/composable_kernel/include/utility/config_nvidia.hpp.in" "${PROJECT_BINARY_DIR}/composable_kernel/include/utility/config.hpp")
+endif()
+
 add_subdirectory(driver)
--- a/composable_kernel/include/gridwise_convolution_kernel_wrapper.hpp
+++ b/composable_kernel/include/gridwise_convolution_kernel_wrapper.hpp
--- a/composable_kernel/include/kernel_algorithm/gridwise_convolution_direct_v2_nchw_kcyx_nkhw.hpp
+++ b/composable_kernel/include/kernel_algorithm/gridwise_convolution_direct_v2_nchw_kcyx_nkhw.hpp
@@ -1,12 +1,12 @@
 #ifndef CK_GRIDWISE_CONVOLUTION_DIRECT_V2_NCHW_KCYX_NKHW
 #define CK_GRIDWISE_CONVOLUTION_DIRECT_V2_NCHW_KCYX_NKHW

-#include "composable_kernel/utility/common.hpp"
-#include "composable_kernel/tensor_description/ConstantTensorDescriptor.hpp"
-#include "composable_kernel/tensor_operation/blockwise_2d_tensor_op.hpp"
-#include "composable_kernel/tensor_operation/blockwise_4d_tensor_op.hpp"
-#include "composable_kernel/tensor_operation/threadwise_tensor_slice_copy.hpp"
-#include "composable_kernel/tensor_operation/threadwise_direct_convolution.hpp"
+#include "common_header.hpp"
+#include "ConstantTensorDescriptor.hpp"
+#include "blockwise_2d_tensor_op.hpp"
+#include "blockwise_4d_tensor_op.hpp"
+#include "threadwise_tensor_slice_copy.hpp"
+#include "threadwise_direct_convolution.hpp"

 namespace ck {

--- a/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v1r1_chwn_cyxk_khwn.hpp
+++ b/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v1r1_chwn_cyxk_khwn.hpp
@@ -1,14 +1,14 @@
 #ifndef CK_GRIDWISE_CONVOLUTION_IMPLICIT_GEMM_V1R1_CHWN_CYXK_KHWN
 #define CK_GRIDWISE_CONVOLUTION_IMPLICIT_GEMM_V1R1_CHWN_CYXK_KHWN

-#include "composable_kernel/utility/common.hpp"
-#include "composable_kernel/tensor_description/ConstantTensorDescriptor.hpp"
-#include "composable_kernel/tensor_description/ConstantMatrixDescriptor.hpp"
-#include "composable_kernel/tensor_operation/blockwise_4d_tensor_op.hpp"
-#include "composable_kernel/tensor_operation/blockwise_2d_tensor_op.hpp"
-#include "composable_kernel/tensor_operation/threadwise_tensor_slice_copy.hpp"
-#include "composable_kernel/tensor_operation/threadwise_4d_tensor_op.hpp"
-#include "composable_kernel/tensor_operation/blockwise_batched_gemm.hpp"
+#include "common_header.hpp"
+#include "ConstantTensorDescriptor.hpp"
+#include "ConstantMatrixDescriptor.hpp"
+#include "blockwise_4d_tensor_op.hpp"
+#include "blockwise_2d_tensor_op.hpp"
+#include "threadwise_tensor_slice_copy.hpp"
+#include "threadwise_4d_tensor_op.hpp"
+#include "blockwise_batched_gemm.hpp"

 namespace ck {

--- a/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v1r2_chwn_cyxk_khwn.hpp
+++ b/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v1r2_chwn_cyxk_khwn.hpp
@@ -1,15 +1,15 @@
 #ifndef CK_GRIDWISE_CONVOLUTION_IMPLICIT_GEMM_V1R2_CHWN_CYXK_KHWN
 #define CK_GRIDWISE_CONVOLUTION_IMPLICIT_GEMM_V1R2_CHWN_CYXK_KHWN

-#include "composable_kernel/utility/common.hpp"
-#include "composable_kernel/tensor_description/ConstantTensorDescriptor.hpp"
-#include "composable_kernel/tensor_description/ConstantMatrixDescriptor.hpp"
-#include "composable_kernel/tensor_operation/blockwise_2d_tensor_op.hpp"
-#include "composable_kernel/tensor_operation/blockwise_3d_tensor_op.hpp"
-#include "composable_kernel/tensor_operation/blockwise_4d_tensor_op.hpp"
-#include "composable_kernel/tensor_operation/threadwise_tensor_slice_copy.hpp"
-#include "composable_kernel/tensor_operation/threadwise_4d_tensor_op.hpp"
-#include "composable_kernel/tensor_operation/blockwise_batched_gemm.hpp"
+#include "common_header.hpp"
+#include "ConstantTensorDescriptor.hpp"
+#include "ConstantMatrixDescriptor.hpp"
+#include "blockwise_2d_tensor_op.hpp"
+#include "blockwise_3d_tensor_op.hpp"
+#include "blockwise_4d_tensor_op.hpp"
+#include "threadwise_tensor_slice_copy.hpp"
+#include "threadwise_4d_tensor_op.hpp"
+#include "blockwise_batched_gemm.hpp"

 namespace ck {

--- a/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v1r3_chwn_cyxk_khwn.hpp
+++ b/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v1r3_chwn_cyxk_khwn.hpp
@@ -1,14 +1,14 @@
 #ifndef CK_GRIDWISE_CONVOLUTION_IMPLICIT_GEMM_V1R3_CHWN_CYXK_KHWN
 #define CK_GRIDWISE_CONVOLUTION_IMPLICIT_GEMM_V1R3_CHWN_CYXK_KHWN

-#include "composable_kernel/utility/common.hpp"
-#include "composable_kernel/tensor_description/ConstantTensorDescriptor.hpp"
-#include "composable_kernel/tensor_description/ConstantMatrixDescriptor.hpp"
-#include "composable_kernel/tensor_operation/blockwise_2d_tensor_op.hpp"
-#include "composable_kernel/tensor_operation/blockwise_4d_tensor_op.hpp"
-#include "composable_kernel/tensor_operation/threadwise_tensor_slice_copy.hpp"
-#include "composable_kernel/tensor_operation/threadwise_4d_tensor_op.hpp"
-#include "composable_kernel/tensor_operation/blockwise_batched_gemm.hpp"
+#include "common_header.hpp"
+#include "ConstantTensorDescriptor.hpp"
+#include "ConstantMatrixDescriptor.hpp"
+#include "blockwise_2d_tensor_op.hpp"
+#include "blockwise_4d_tensor_op.hpp"
+#include "threadwise_tensor_slice_copy.hpp"
+#include "threadwise_4d_tensor_op.hpp"
+#include "blockwise_batched_gemm.hpp"

 namespace ck {

--- a/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v1r3_chwn_cyxk_khwn_lds_double_buffer.hpp
+++ b/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v1r3_chwn_cyxk_khwn_lds_double_buffer.hpp
@@ -1,14 +1,14 @@
 #ifndef CK_GRIDWISE_CONVOLUTION_IMPLICIT_GEMM_V1R3_CHWN_CYXK_KHWN_LDS_DOUBLE_BUFFER
 #define CK_GRIDWISE_CONVOLUTION_IMPLICIT_GEMM_V1R3_CHWN_CYXK_KHWN_LDS_DOUBLE_BUFFER

-#include "composable_kernel/utility/common.hpp"
-#include "composable_kernel/tensor_description/ConstantTensorDescriptor.hpp"
-#include "composable_kernel/tensor_description/ConstantMatrixDescriptor.hpp"
-#include "composable_kernel/tensor_operation/blockwise_2d_tensor_op.hpp"
-#include "composable_kernel/tensor_operation/blockwise_4d_tensor_op.hpp"
-#include "composable_kernel/tensor_operation/threadwise_tensor_slice_copy.hpp"
-#include "composable_kernel/tensor_operation/threadwise_4d_tensor_op.hpp"
-#include "composable_kernel/tensor_operation/blockwise_batched_gemm.hpp"
+#include "common_header.hpp"
+#include "ConstantTensorDescriptor.hpp"
+#include "ConstantMatrixDescriptor.hpp"
+#include "blockwise_2d_tensor_op.hpp"
+#include "blockwise_4d_tensor_op.hpp"
+#include "threadwise_tensor_slice_copy.hpp"
+#include "threadwise_4d_tensor_op.hpp"
+#include "blockwise_batched_gemm.hpp"

 namespace ck {

--- a/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v1r3_nchw_cyxk_nkhw.hpp
+++ b/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v1r3_nchw_cyxk_nkhw.hpp
@@ -1,14 +1,14 @@
 #ifndef CK_GRIDWISE_CONVOLUTION_IMPLICIT_GEMM_V1R3_NCHW_CYXK_NKHW
 #define CK_GRIDWISE_CONVOLUTION_IMPLICIT_GEMM_V1R3_NCHW_CYXK_NKHW

-#include "composable_kernel/utility/common.hpp"
-#include "composable_kernel/tensor_description/ConstantTensorDescriptor.hpp"
-#include "composable_kernel/tensor_description/ConstantMatrixDescriptor.hpp"
-#include "composable_kernel/tensor_operation/blockwise_2d_tensor_op.hpp"
-#include "composable_kernel/tensor_operation/blockwise_tensor_slice_copy.hpp"
-#include "composable_kernel/tensor_operation/threadwise_tensor_slice_copy.hpp"
-#include "composable_kernel/tensor_operation/threadwise_generic_tensor_op.hpp"
-#include "composable_kernel/tensor_operation/blockwise_batched_gemm.hpp"
+#include "common_header.hpp"
+#include "ConstantTensorDescriptor.hpp"
+#include "ConstantMatrixDescriptor.hpp"
+#include "blockwise_2d_tensor_op.hpp"
+#include "blockwise_tensor_slice_copy.hpp"
+#include "threadwise_tensor_slice_copy.hpp"
+#include "threadwise_generic_tensor_op.hpp"
+#include "blockwise_batched_gemm.hpp"

 namespace ck {

--- a/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v1r3_nchw_cyxk_nkhw_lds_double_buffer.hpp
+++ b/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v1r3_nchw_cyxk_nkhw_lds_double_buffer.hpp
@@ -1,14 +1,14 @@
 #ifndef CK_GRIDWISE_CONVOLUTION_IMPLICIT_GEMM_V1R3_NCHW_CYXK_NKHW_LDS_DOUBLE_BUFFER
 #define CK_GRIDWISE_CONVOLUTION_IMPLICIT_GEMM_V1R3_NCHW_CYXK_NKHW_LDS_DOUBLE_BUFFER

-#include "composable_kernel/utility/common.hpp"
-#include "composable_kernel/tensor_description/ConstantTensorDescriptor.hpp"
-#include "composable_kernel/tensor_description/ConstantMatrixDescriptor.hpp"
-#include "composable_kernel/tensor_operation/blockwise_2d_tensor_op.hpp"
-#include "composable_kernel/tensor_operation/blockwise_tensor_slice_copy.hpp"
-#include "composable_kernel/tensor_operation/threadwise_tensor_slice_copy.hpp"
-#include "composable_kernel/tensor_operation/threadwise_generic_tensor_op.hpp"
-#include "composable_kernel/tensor_operation/blockwise_batched_gemm.hpp"
+#include "common_header.hpp"
+#include "ConstantTensorDescriptor.hpp"
+#include "ConstantMatrixDescriptor.hpp"
+#include "blockwise_2d_tensor_op.hpp"
+#include "blockwise_tensor_slice_copy.hpp"
+#include "threadwise_tensor_slice_copy.hpp"
+#include "threadwise_generic_tensor_op.hpp"
+#include "blockwise_batched_gemm.hpp"

 namespace ck {

--- a/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v2_chwn_cyxk_khwn.hpp
+++ b/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v2_chwn_cyxk_khwn.hpp
@@ -1,12 +1,12 @@
 #ifndef CK_GRIDWISE_CONVOLUTION_IMPLICIT_GEMM_V2_CHWN_CYXK_KHWN
 #define CK_GRIDWISE_CONVOLUTION_IMPLICIT_GEMM_V2_CHWN_CYXK_KHWN

-#include "composable_kernel/utility/common.hpp"
-#include "composable_kernel/tensor_description/ConstantTensorDescriptor.hpp"
-#include "composable_kernel/tensor_description/ConstantMatrixDescriptor.hpp"
-#include "composable_kernel/tensor_operation/blockwise_4d_tensor_op.hpp"
-#include "composable_kernel/tensor_operation/blockwise_2d_tensor_op.hpp"
-#include "composable_kernel/tensor_operation/blockwise_gemm.hpp"
+#include "common_header.hpp"
+#include "ConstantTensorDescriptor.hpp"
+#include "ConstantMatrixDescriptor.hpp"
+#include "blockwise_4d_tensor_op.hpp"
+#include "blockwise_2d_tensor_op.hpp"
+#include "blockwise_gemm.hpp"

 namespace ck {

--- a/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v2_chwn_cyxk_khwn_lds_double_buffer.hpp
+++ b/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v2_chwn_cyxk_khwn_lds_double_buffer.hpp
@@ -1,13 +1,13 @@
 #ifndef CK_GRIDWISE_CONVOLUTION_IMPLICIT_GEMM_V2_CHWN_CYXK_KHWN_LDS_DOUBLE_BUFFER
 #define CK_GRIDWISE_CONVOLUTION_IMPLICIT_GEMM_V2_CHWN_CYXK_KHWN_LDS_DOUBLE_BUFFER

-#include "composable_kernel/utility/common.hpp"
-#include "composable_kernel/tensor_description/ConstantTensorDescriptor.hpp"
-#include "composable_kernel/tensor_description/ConstantMatrixDescriptor.hpp"
-#include "composable_kernel/tensor_operation/blockwise_4d_tensor_op.hpp"
-#include "composable_kernel/tensor_operation/blockwise_2d_tensor_op.hpp"
-#include "composable_kernel/tensor_operation/threadwise_tensor_slice_copy.hpp"
-#include "composable_kernel/tensor_operation/blockwise_gemm.hpp"
+#include "common_header.hpp"
+#include "ConstantTensorDescriptor.hpp"
+#include "ConstantMatrixDescriptor.hpp"
+#include "blockwise_4d_tensor_op.hpp"
+#include "blockwise_2d_tensor_op.hpp"
+#include "threadwise_tensor_slice_copy.hpp"
+#include "blockwise_gemm.hpp"

 namespace ck {

--- a/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v3_nchw_cyxk_nkhw.hpp
+++ b/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v3_nchw_cyxk_nkhw.hpp
@@ -1,12 +1,12 @@
 #ifndef CK_GRIDWISE_CONVOLUTION_IMPLICIT_GEMM_V3_NCHW_CYXK_NKHW
 #define CK_GRIDWISE_CONVOLUTION_IMPLICIT_GEMM_V3_NCHW_CYXK_NKHW

-#include "composable_kernel/utility/common.hpp"
-#include "composable_kernel/tensor_description/ConstantTensorDescriptor.hpp"
-#include "composable_kernel/tensor_description/ConstantMergedTensorDescriptor.hpp"
-#include "composable_kernel/tensor_description/ConstantMatrixDescriptor.hpp"
-#include "composable_kernel/tensor_operation/blockwise_generic_tensor_slice_copy.hpp"
-#include "composable_kernel/tensor_operation/blockwise_gemm.hpp"
+#include "common_header.hpp"
+#include "ConstantTensorDescriptor.hpp"
+#include "ConstantMergedTensorDescriptor.hpp"
+#include "ConstantMatrixDescriptor.hpp"
+#include "blockwise_generic_tensor_slice_copy.hpp"
+#include "blockwise_gemm.hpp"

 namespace ck {

--- a/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v3_nchw_cyxk_nkhw_lds_double_buffer.hpp
+++ b/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v3_nchw_cyxk_nkhw_lds_double_buffer.hpp
@@ -1,12 +1,12 @@
 #ifndef CK_GRIDWISE_CONVOLUTION_IMPLICIT_GEMM_V3_NCHW_CYXK_NKHW_LDS_DOUBLE_BUFFER
 #define CK_GRIDWISE_CONVOLUTION_IMPLICIT_GEMM_V3_NCHW_CYXK_NKHW_LDS_DOUBLE_BUFFER

-#include "composable_kernel/utility/common.hpp"
-#include "composable_kernel/tensor_description/ConstantTensorDescriptor.hpp"
-#include "composable_kernel/tensor_description/ConstantMergedTensorDescriptor.hpp"
-#include "composable_kernel/tensor_description/ConstantMatrixDescriptor.hpp"
-#include "composable_kernel/tensor_operation/blockwise_generic_tensor_slice_copy.hpp"
-#include "composable_kernel/tensor_operation/blockwise_gemm.hpp"
+#include "common_header.hpp"
+#include "ConstantTensorDescriptor.hpp"
+#include "ConstantMergedTensorDescriptor.hpp"
+#include "ConstantMatrixDescriptor.hpp"
+#include "blockwise_generic_tensor_slice_copy.hpp"
+#include "blockwise_gemm.hpp"

 namespace ck {

--- a/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4_nchw_kcyx_nkhw.hpp
+++ b/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4_nchw_kcyx_nkhw.hpp
@@ -1,13 +1,13 @@
 #ifndef CK_GRIDWISE_CONVOLUTION_IMPLICIT_GEMM_V4_NCHW_KCYX_NKHW
 #define CK_GRIDWISE_CONVOLUTION_IMPLICIT_GEMM_V4_NCHW_KCYX_NKHW

-#include "composable_kernel/utility/common.hpp"
-#include "composable_kernel/tensor_description/ConstantTensorDescriptor.hpp"
-#include "composable_kernel/tensor_description/ConstantMergedTensorDescriptor.hpp"
-#include "composable_kernel/tensor_description/ConstantMatrixDescriptor.hpp"
-#include "composable_kernel/tensor_operation/blockwise_generic_tensor_slice_copy.hpp"
-#include "composable_kernel/tensor_operation/blockwise_gemm.hpp"
-#include "composable_kernel/tensor_operation/threadwise_generic_tensor_slice_copy.hpp"
+#include "common_header.hpp"
+#include "ConstantTensorDescriptor.hpp"
+#include "ConstantMergedTensorDescriptor.hpp"
+#include "ConstantMatrixDescriptor.hpp"
+#include "blockwise_generic_tensor_slice_copy.hpp"
+#include "blockwise_gemm.hpp"
+#include "threadwise_generic_tensor_slice_copy.hpp"

 namespace ck {

--- a/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4_nchw_kcyx_nkhw_lds_double_buffer.hpp
+++ b/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4_nchw_kcyx_nkhw_lds_double_buffer.hpp
@@ -1,13 +1,17 @@
 #ifndef CK_GRIDWISE_CONVOLUTION_IMPLICIT_GEMM_V4_NCHW_KCYX_NKHW_LDS_DOUBLE_BUFFER
 #define CK_GRIDWISE_CONVOLUTION_IMPLICIT_GEMM_V4_NCHW_KCYX_NKHW_LDS_DOUBLE_BUFFER

-#include "composable_kernel/utility/common.hpp"
-#include "composable_kernel/tensor_description/ConstantTensorDescriptor.hpp"
-#include "composable_kernel/tensor_description/ConstantMergedTensorDescriptor.hpp"
-#include "composable_kernel/tensor_description/ConstantMatrixDescriptor.hpp"
-#include "composable_kernel/tensor_operation/blockwise_generic_tensor_slice_copy.hpp"
-#include "composable_kernel/tensor_operation/blockwise_gemm.hpp"
-#include "composable_kernel/tensor_operation/threadwise_generic_tensor_slice_copy.hpp"
+#include "common_header.hpp"
+#include "ConstantTensorDescriptor.hpp"
+#include "ConstantMergedTensorDescriptor.hpp"
+#include "ConstantMatrixDescriptor.hpp"
+#include "blockwise_generic_tensor_slice_copy.hpp"
+#include "blockwise_gemm.hpp"
+#include "threadwise_generic_tensor_slice_copy.hpp"
+
+#ifndef CK_BLOCKWISE_GEMM_USE_AMD_INLINE_ASM
+#define CK_BLOCKWISE_GEMM_USE_AMD_INLINE_ASM 1
+#endif

 namespace ck {

@@ -233,10 +237,10 @@ struct GridwiseConvolutionImplicitGemm_v4_nchw_kcyx_nkhw_lds_double_buffer

        // choose GEMM implementation here
        const auto run_blockwise_gemm = [&](auto... Xs) {
-#if 1
-            return blockwise_gemm.Run(Xs...);
-#else
+#if CK_USE_AMD_INLINE_ASM && CK_BLOCKWISE_GEMM_USE_AMD_INLINE_ASM
            return blockwise_gemm.Run_asm(Xs...);
+#else
+            return blockwise_gemm.Run(Xs...);
 #endif
        };

--- a/composable_kernel/include/kernel_algorithm/gridwise_direct_convolution_2_vectorized_nchw_kcyx_nkhw.hpp
+++ b/composable_kernel/include/kernel_algorithm/gridwise_direct_convolution_2_vectorized_nchw_kcyx_nkhw.hpp
@@ -1,11 +1,11 @@
 #pragma once
-#include "composable_kernel/utility/common.hpp"
-#include "composable_kernel/tensor_description/ConstantTensorDescriptor.hpp"
-#include "composable_kernel/tensor_operation/blockwise_2d_tensor_op.hpp"
-#include "composable_kernel/tensor_operation/blockwise_4d_tensor_op.hpp"
-#include "composable_kernel/tensor_operation/blockwise_direct_convolution.hpp"
-#include "composable_kernel/tensor_operation/threadwise_4d_tensor_op.hpp"
-#include "composable_kernel/tensor_operation/threadwise_direct_convolution.hpp"
+#include "common_header.hpp"
+#include "ConstantTensorDescriptor.hpp"
+#include "blockwise_2d_tensor_op.hpp"
+#include "blockwise_4d_tensor_op.hpp"
+#include "blockwise_direct_convolution.hpp"
+#include "threadwise_4d_tensor_op.hpp"
+#include "threadwise_direct_convolution.hpp"

 namespace ck {

--- a/composable_kernel/include/kernel_algorithm/gridwise_implicit_gemm_convolution_1_chwn_cyxk_khwn_padded.hpp
+++ b/composable_kernel/include/kernel_algorithm/gridwise_implicit_gemm_convolution_1_chwn_cyxk_khwn_padded.hpp
@@ -1,11 +1,11 @@
 #pragma once
-#include "composable_kernel/utility/common.hpp"
-#include "composable_kernel/tensor_description/ConstantTensorDescriptor.hpp"
-#include "composable_kernel/tensor_description/ConstantMatrixDescriptor.hpp"
-#include "composable_kernel/tensor_operation/blockwise_4d_tensor_op.hpp"
-#include "composable_kernel/tensor_operation/blockwise_2d_tensor_op.hpp"
-#include "composable_kernel/tensor_operation/threadwise_4d_tensor_op.hpp"
-#include "composable_kernel/tensor_operation/blockwise_gemm.hpp"
+#include "common_header.hpp"
+#include "ConstantTensorDescriptor.hpp"
+#include "ConstantMatrixDescriptor.hpp"
+#include "blockwise_4d_tensor_op.hpp"
+#include "blockwise_2d_tensor_op.hpp"
+#include "threadwise_4d_tensor_op.hpp"
+#include "blockwise_gemm.hpp"

 namespace ck {

--- a/composable_kernel/include/tensor_description/ConstantMatrixDescriptor.hpp
+++ b/composable_kernel/include/tensor_description/ConstantMatrixDescriptor.hpp
@@ -1,7 +1,7 @@
 #ifndef CK_CONSTANT_MATRIX_DESCRIPTOR_HPP
 #define CK_CONSTANT_MATRIX_DESCRIPTOR_HPP

-#include "composable_kernel/utility/common.hpp"
+#include "common_header.hpp"

 namespace ck {

--- a/composable_kernel/include/tensor_description/ConstantMergedTensorDescriptor.hpp
+++ b/composable_kernel/include/tensor_description/ConstantMergedTensorDescriptor.hpp
@@ -1,8 +1,8 @@
 #ifndef CK_CONSTANT_MERGED_TENSOR_DESCRIPTOR_HPP
 #define CK_CONSTANT_MERGED_TENSOR_DESCRIPTOR_HPP

-#include "composable_kernel/utility/common.hpp"
-#include "composable_kernel/tensor_description/ConstantTensorDescriptor.hpp"
+#include "common_header.hpp"
+#include "ConstantTensorDescriptor.hpp"

 namespace ck {

--- a/composable_kernel/include/tensor_description/ConstantTensorDescriptor.hpp
+++ b/composable_kernel/include/tensor_description/ConstantTensorDescriptor.hpp
@@ -1,7 +1,7 @@
 #ifndef CK_CONSTANT_TENSOR_DESCRIPTOR_HPP
 #define CK_CONSTANT_TENSOR_DESCRIPTOR_HPP

-#include "composable_kernel/utility/common.hpp"
+#include "common_header.hpp"

 namespace ck {

--- a/composable_kernel/include/tensor_operation/blockwise_2d_tensor_op.hpp
+++ b/composable_kernel/include/tensor_operation/blockwise_2d_tensor_op.hpp
@@ -1,8 +1,8 @@
 #ifndef CK_BLOCKWISE_2D_TENSOR_OP_HPP
 #define CK_BLOCKWISE_2D_TENSOR_OP_HPP

-#include "composable_kernel/utility/common.hpp"
-#include "composable_kernel/tensor_description/ConstantTensorDescriptor.hpp"
+#include "common_header.hpp"
+#include "ConstantTensorDescriptor.hpp"

 namespace ck {

--- a/composable_kernel/include/tensor_operation/blockwise_3d_tensor_op.hpp
+++ b/composable_kernel/include/tensor_operation/blockwise_3d_tensor_op.hpp
@@ -1,8 +1,8 @@
 #ifndef CK_BLOCKWISE_3D_TENSOR_OP_HPP
 #define CK_BLOCKWISE_3D_TENSOR_OP_HPP

-#include "composable_kernel/utility/common.hpp"
-#include "composable_kernel/tensor_description/ConstantTensorDescriptor.hpp"
+#include "common_header.hpp"
+#include "ConstantTensorDescriptor.hpp"

 namespace ck {

--- a/composable_kernel/include/tensor_operation/blockwise_4d_tensor_op.hpp
+++ b/composable_kernel/include/tensor_operation/blockwise_4d_tensor_op.hpp
@@ -1,8 +1,9 @@
 #ifndef CK_BLOCKWISE_4D_TENSOR_OP_HPP
 #define CK_BLOCKWISE_4D_TENSOR_OP_HPP

-#include "composable_kernel/tensor_description/ConstantTensorDescriptor.hpp"
-#include "composable_kernel/tensor_operation/threadwise_tensor_slice_copy.hpp"
+#include "common_header.hpp"
+#include "ConstantTensorDescriptor.hpp"
+#include "threadwise_tensor_slice_copy.hpp"

 namespace ck {

--- a/composable_kernel/include/tensor_operation/blockwise_batched_gemm.hpp
+++ b/composable_kernel/include/tensor_operation/blockwise_batched_gemm.hpp
@@ -1,7 +1,9 @@
 #ifndef CK_BLOCKWISE_BATCHED_GEMM_HPP
 #define CK_BLOCKWISE_BATCHED_GEMM_HPP

-#include "composable_kernel/tensor_operation/threadwise_gemm.hpp"
+#include "common_header.hpp"
+#include "ConstantMatrixDescriptor.hpp"
+#include "threadwise_gemm.hpp"

 namespace ck {

--- a/composable_kernel/include/tensor_operation/blockwise_gemm.hpp
+++ b/composable_kernel/include/tensor_operation/blockwise_gemm.hpp
@@ -1,8 +1,9 @@
 #ifndef CK_BLOCKWISE_GEMM_HPP
 #define CK_BLOCKWISE_GEMM_HPP

-#include "composable_kernel/utility/common.hpp"
-#include "composable_kernel/tensor_operation/threadwise_gemm.hpp"
+#include "common_header.hpp"
+#include "ConstantMatrixDescriptor.hpp"
+#include "threadwise_gemm.hpp"

 namespace ck {

--- a/composable_kernel/include/tensor_operation/blockwise_generic_tensor_slice_copy.hpp
+++ b/composable_kernel/include/tensor_operation/blockwise_generic_tensor_slice_copy.hpp
@@ -1,7 +1,10 @@
 #ifndef CK_BLOCKWISE_GENERIC_TENSOR_SLICE_COPY_HPP
 #define CK_BLOCKWISE_GENERIC_TENSOR_SLICE_COPY_HPP

-#include "composable_kernel/tensor_operation/threadwise_generic_tensor_slice_copy.hpp"
+#include "common_header.hpp"
+#include "ConstantTensorDescriptor.hpp"
+#include "ConstantMergedTensorDescriptor.hpp"
+#include "threadwise_generic_tensor_slice_copy.hpp"

 namespace ck {

--- a/composable_kernel/include/tensor_operation/blockwise_tensor_slice_copy.hpp
+++ b/composable_kernel/include/tensor_operation/blockwise_tensor_slice_copy.hpp
@@ -1,7 +1,9 @@
 #ifndef CK_BLOCKWISE_TENSOR_SLICE_COPY_HPP
 #define CK_BLOCKWISE_TENSOR_SLICE_COPY_HPP

-#include "composable_kernel/tensor_operation/threadwise_tensor_slice_copy.hpp"
+#include "common_header.hpp"
+#include "ConstantTensorDescriptor.hpp"
+#include "threadwise_tensor_slice_copy.hpp"

 namespace ck {

--- a/composable_kernel/include/tensor_operation/threadwise_4d_tensor_op.hpp
+++ b/composable_kernel/include/tensor_operation/threadwise_4d_tensor_op.hpp
@@ -1,7 +1,8 @@
 #ifndef CK_THREADWISE_4D_TENSOR_OP_HPP
 #define CK_THREADWISE_4D_TENSOR_OP_HPP

-#include "composable_kernel/tensor_description/ConstantTensorDescriptor.hpp"
+#include "common_header.hpp"
+#include "ConstantTensorDescriptor.hpp"

 namespace ck {

--- a/composable_kernel/include/tensor_operation/threadwise_direct_convolution.hpp
+++ b/composable_kernel/include/tensor_operation/threadwise_direct_convolution.hpp
@@ -1,8 +1,9 @@
 #ifndef CK_THREADWISE_DIRECT_CONVOLUTION_HPP
 #define CK_THREADWISE_DIRECT_CONVOLUTION_HPP

-#include "composable_kernel/tensor_description/ConstantTensorDescriptor.hpp"
-#include "composable_kernel/tensor_operation/threadwise_tensor_slice_copy.hpp"
+#include "common_header.hpp"
+#include "ConstantTensorDescriptor.hpp"
+#include "threadwise_tensor_slice_copy.hpp"

 namespace ck {

--- a/composable_kernel/include/tensor_operation/threadwise_gemm.hpp
+++ b/composable_kernel/include/tensor_operation/threadwise_gemm.hpp
@@ -1,8 +1,8 @@
 #ifndef CK_THREADWISE_GEMM_HPP
 #define CK_THREADWISE_GEMM_HPP

-#include "composable_kernel/utility/common.hpp"
-#include "composable_kernel/tensor_description/ConstantMatrixDescriptor.hpp"
+#include "common_header.hpp"
+#include "ConstantMatrixDescriptor.hpp"

 namespace ck {

--- a/composable_kernel/include/tensor_operation/threadwise_generic_tensor_op.hpp
+++ b/composable_kernel/include/tensor_operation/threadwise_generic_tensor_op.hpp
@@ -1,8 +1,9 @@
 #ifndef CK_THREADWISE_GENERIC_TENSOR_OP_HPP
 #define CK_THREADWISE_GENERIC_TENSOR_OP_HPP

-#include "composable_kernel/tensor_description/ConstantTensorDescriptor.hpp"
-#include "composable_kernel/tensor_description/ConstantMergedTensorDescriptor.hpp"
+#include "common_header.hpp"
+#include "ConstantTensorDescriptor.hpp"
+#include "ConstantMergedTensorDescriptor.hpp"

 namespace ck {
 template <class Float, class TDesc>
--- a/composable_kernel/include/tensor_operation/threadwise_generic_tensor_slice_copy.hpp
+++ b/composable_kernel/include/tensor_operation/threadwise_generic_tensor_slice_copy.hpp
@@ -1,8 +1,9 @@
 #ifndef CK_THREADWISE_GENERIC_TENSOR_SLICE_COPY_HPP
 #define CK_THREADWISE_GENERIC_TENSOR_SLICE_COPY_HPP

-#include "composable_kernel/tensor_description/ConstantTensorDescriptor.hpp"
-#include "composable_kernel/tensor_description/ConstantMergedTensorDescriptor.hpp"
+#include "common_header.hpp"
+#include "ConstantTensorDescriptor.hpp"
+#include "ConstantMergedTensorDescriptor.hpp"

 namespace ck {

--- a/composable_kernel/include/tensor_operation/threadwise_tensor_slice_copy.hpp
+++ b/composable_kernel/include/tensor_operation/threadwise_tensor_slice_copy.hpp
@@ -1,7 +1,8 @@
 #ifndef CK_THREADWISE_TENSOR_SLICE_COPY_HPP
 #define CK_THREADWISE_TENSOR_SLICE_COPY_HPP

-#include "composable_kernel/tensor_description/ConstantTensorDescriptor.hpp"
+#include "common_header.hpp"
+#include "ConstantTensorDescriptor.hpp"

 namespace ck {

--- a/composable_kernel/include/utility/Array.hpp
+++ b/composable_kernel/include/utility/Array.hpp
@@ -1,8 +1,8 @@
 #ifndef CK_ARRAY_HPP
 #define CK_ARRAY_HPP

-#include "composable_kernel/utility/Sequence.hpp"
-#include "composable_kernel/utility/functional2.hpp"
+#include "Sequence.hpp"
+#include "functional2.hpp"

 namespace ck {

--- a/composable_kernel/include/utility/Sequence.hpp
+++ b/composable_kernel/include/utility/Sequence.hpp
@@ -1,8 +1,8 @@
 #ifndef CK_SEQUENCE_HPP
 #define CK_SEQUENCE_HPP

-#include "composable_kernel/utility/integral_constant.hpp"
-#include "composable_kernel/utility/functional.hpp"
+#include "integral_constant.hpp"
+#include "functional.hpp"

 namespace ck {

--- a/composable_kernel/include/utility/amd_inline_asm.hpp
+++ b/composable_kernel/include/utility/amd_inline_asm.hpp
@@ -1,7 +1,7 @@
 #ifndef CK_AMD_INLINE_ASM_HPP
 #define CK_AMD_INLINE_ASM_HPP

-#include "composable_kernel/utility/vector_type.hpp"
+#include "vector_type.hpp"

 #define NO_VM_WAIT 0
 #define NO_LGKM_WAIT 0
--- a/composable_kernel/include/utility/common_header.hpp
+++ b/composable_kernel/include/utility/common_header.hpp
@@ -0,0 +1,18 @@
+#ifndef CK_COMMON_HPP
+#define CK_COMMON_HPP
+
+#include "config.hpp"
+#include "utility.hpp"
+#include "vector_type.hpp"
+#include "integral_constant.hpp"
+#include "Sequence.hpp"
+#include "Array.hpp"
+#include "functional.hpp"
+#include "functional2.hpp"
+#include "functional3.hpp"
+
+#if CK_USE_AMD_INLINE_ASM
+#include "amd_inline_asm.hpp"
+#endif
+
+#endif
--- a/composable_kernel/include/utility/config_amd.hpp.in
+++ b/composable_kernel/include/utility/config_amd.hpp.in
@@ -1,5 +1,5 @@
-#ifndef CK_CONFIG_HPP
-#define CK_CONFIG_HPP
+#ifndef CK_CONFIG_AMD_HPP
+#define CK_CONFIG_AMD_HPP

 #cmakedefine01 CK_DEVICE_BACKEND_AMD

--- a/composable_kernel/include/utility/config_nvidia.hpp.in
+++ b/composable_kernel/include/utility/config_nvidia.hpp.in
@@ -1,5 +1,5 @@
-#ifndef CK_CONFIG_CUDA_HPP
-#define CK_CONFIG_CUDA_HPP
+#ifndef CK_CONFIG_NVIDIA_HPP
+#define CK_CONFIG_NVIDIA_HPP

 #cmakedefine01 CK_DEVICE_BACKEND_NVIDIA

--- a/composable_kernel/include/utility/functional.hpp
+++ b/composable_kernel/include/utility/functional.hpp
@@ -1,8 +1,8 @@
 #ifndef CK_FUNCTIONAL_HPP
 #define CK_FUNCTIONAL_HPP

-#include "composable_kernel/utility/integral_constant.hpp"
-#include "composable_kernel/utility/Sequence.hpp"
+#include "integral_constant.hpp"
+#include "Sequence.hpp"

 namespace ck {

--- a/composable_kernel/include/utility/functional2.hpp
+++ b/composable_kernel/include/utility/functional2.hpp
@@ -1,8 +1,8 @@
 #ifndef CK_FUNCTIONAL2_HPP
 #define CK_FUNCTIONAL2_HPP

-#include "composable_kernel/utility/functional.hpp"
-#include "composable_kernel/utility/Sequence.hpp"
+#include "functional.hpp"
+#include "Sequence.hpp"

 namespace ck {

--- a/composable_kernel/include/utility/functional3.hpp
+++ b/composable_kernel/include/utility/functional3.hpp
@@ -1,10 +1,10 @@
 #ifndef CK_FUNCTIONAL3_HPP
 #define CK_FUNCTIONAL3_HPP

-#include "composable_kernel/utility/functional.hpp"
-#include "composable_kernel/utility/functional2.hpp"
-#include "composable_kernel/utility/Sequence.hpp"
-#include "composable_kernel/utility/Array.hpp"
+#include "functional.hpp"
+#include "functional2.hpp"
+#include "Sequence.hpp"
+#include "Array.hpp"

 namespace ck {

--- a/composable_kernel/include/utility/integral_constant.hpp
+++ b/composable_kernel/include/utility/integral_constant.hpp
--- a/composable_kernel/include/utility/utility.hpp
+++ b/composable_kernel/include/utility/utility.hpp
@@ -1,6 +1,8 @@
 #ifndef CK_UTILITY_HPP
 #define CK_UTILITY_HPP

+#include "config.hpp"
+
 namespace ck {

 __device__ index_t get_thread_local_1d_id() { return threadIdx.x; }
--- a/composable_kernel/include/utility/vector_type.hpp
+++ b/composable_kernel/include/utility/vector_type.hpp
@@ -1,8 +1,8 @@
 #ifndef CK_VECTOR_TYPE_HPP
 #define CK_VECTOR_TYPE_HPP

-#include "composable_kernel/utility/config.hpp"
-#include "composable_kernel/utility/integral_constant.hpp"
+#include "config.hpp"
+#include "integral_constant.hpp"

 namespace ck {

--- a/driver/CMakeLists.txt
+++ b/driver/CMakeLists.txt
@@ -1,7 +1,26 @@
+# Shared host-side library built from the tensor and device sources.
+set(TENSOR_SOURCE
+    src/tensor.cpp
+    src/device.cpp
+)
+
+add_library(tensor SHARED ${TENSOR_SOURCE})
+set_target_properties(tensor PROPERTIES POSITION_INDEPENDENT_CODE ON)
+
+# The NVIDIA backend links NVTX (nvToolsExt) and the CUDA runtime (cudart).
+# PUBLIC keeps both transitive for consumers, as the keyword-less form did.
+if(DEVICE_BACKEND STREQUAL "NVIDIA")
+    target_link_libraries(tensor PUBLIC nvToolsExt cudart)
+endif()
+
+# Install only the shared library artifact, under <prefix>/lib.
+install(TARGETS tensor LIBRARY DESTINATION lib)
+
+
 if(DEVICE_BACKEND STREQUAL "AMD")
-    set(DRIVER_SOURCE driver.cpp)
+    set(DRIVER_SOURCE src/driver.cpp)
 elseif(DEVICE_BACKEND STREQUAL "NVIDIA")
-    set(DRIVER_SOURCE driver.cu)
+    set(DRIVER_SOURCE src/driver.cu)
 endif()

 add_executable(driver ${DRIVER_SOURCE}) 
--- a/driver/include/conv_common.hpp
+++ b/driver/include/conv_common.hpp
@@ -1,7 +1,7 @@
 #ifndef CK_CONV_COMMON_HPP
 #define CK_CONV_COMMON_HPP

-#include "composable_kernel/tensor_description/ConstantTensorDescriptor.hpp"
+#include "ConstantTensorDescriptor.hpp"

 using namespace ck;

--- a/driver/include/device.hpp
+++ b/driver/include/device.hpp
@@ -2,7 +2,7 @@
 #define CK_DEVICE_HPP

 #include <memory>
-#include "composable_kernel/utility/config.hpp"
+#include "config.hpp"

 using namespace ck;

--- a/driver/include/device_convolution_direct_v2_nchw_kcyx_nkhw.hpp
+++ b/driver/include/device_convolution_direct_v2_nchw_kcyx_nkhw.hpp
@@ -1,8 +1,9 @@
 #pragma once
 #include <unistd.h>
 #include "device.hpp"
+#include "tensor.hpp"
 #include "gridwise_convolution_kernel_wrapper.hpp"
-#include "composable_kernel/kernel_algorithm/gridwise_convolution_direct_v2_nchw_kcyx_nkhw.hpp"
+#include "gridwise_convolution_direct_v2_nchw_kcyx_nkhw.hpp"

 using namespace ck;

--- a/driver/include/device_convolution_implicit_gemm_v1_chwn_cyxk_khwn.hpp
+++ b/driver/include/device_convolution_implicit_gemm_v1_chwn_cyxk_khwn.hpp
@@ -1,11 +1,12 @@
 #pragma once
 #include <unistd.h>
 #include "device.hpp"
+#include "tensor.hpp"
 #include "gridwise_convolution_kernel_wrapper.hpp"
-#include "composable_kernel/kernel_algorithm/gridwise_convolution_implicit_gemm_v1r1_chwn_cyxk_khwn.hpp"
-#include "composable_kernel/kernel_algorithm/gridwise_convolution_implicit_gemm_v1r2_chwn_cyxk_khwn.hpp"
-#include "composable_kernel/kernel_algorithm/gridwise_convolution_implicit_gemm_v1r3_chwn_cyxk_khwn.hpp"
-#include "composable_kernel/kernel_algorithm/gridwise_convolution_implicit_gemm_v1r3_chwn_cyxk_khwn_lds_double_buffer.hpp"
+#include "gridwise_convolution_implicit_gemm_v1r1_chwn_cyxk_khwn.hpp"
+#include "gridwise_convolution_implicit_gemm_v1r2_chwn_cyxk_khwn.hpp"
+#include "gridwise_convolution_implicit_gemm_v1r3_chwn_cyxk_khwn.hpp"
+#include "gridwise_convolution_implicit_gemm_v1r3_chwn_cyxk_khwn_lds_double_buffer.hpp"

 using namespace ck;

--- a/driver/include/device_convolution_implicit_gemm_v1_nchw_cyxk_nkhw.hpp
+++ b/driver/include/device_convolution_implicit_gemm_v1_nchw_cyxk_nkhw.hpp
@@ -1,9 +1,10 @@
 #pragma once
 #include <unistd.h>
 #include "device.hpp"
+#include "tensor.hpp"
 #include "gridwise_convolution_kernel_wrapper.hpp"
-#include "composable_kernel/kernel_algorithm/gridwise_convolution_implicit_gemm_v1r3_nchw_cyxk_nkhw.hpp"
-#include "composable_kernel/kernel_algorithm/gridwise_convolution_implicit_gemm_v1r3_nchw_cyxk_nkhw_lds_double_buffer.hpp"
+#include "gridwise_convolution_implicit_gemm_v1r3_nchw_cyxk_nkhw.hpp"
+#include "gridwise_convolution_implicit_gemm_v1r3_nchw_cyxk_nkhw_lds_double_buffer.hpp"

 using namespace ck;

--- a/driver/include/device_convolution_implicit_gemm_v2_chwn_cyxk_khwn.hpp
+++ b/driver/include/device_convolution_implicit_gemm_v2_chwn_cyxk_khwn.hpp
@@ -1,9 +1,10 @@
 #pragma once
 #include <unistd.h>
 #include "device.hpp"
+#include "tensor.hpp"
 #include "gridwise_convolution_kernel_wrapper.hpp"
-#include "composable_kernel/kernel_algorithm/gridwise_convolution_implicit_gemm_v2_chwn_cyxk_khwn.hpp"
-#include "composable_kernel/kernel_algorithm/gridwise_convolution_implicit_gemm_v2_chwn_cyxk_khwn_lds_double_buffer.hpp"
+#include "gridwise_convolution_implicit_gemm_v2_chwn_cyxk_khwn.hpp"
+#include "gridwise_convolution_implicit_gemm_v2_chwn_cyxk_khwn_lds_double_buffer.hpp"

 using namespace ck;

--- a/driver/include/device_convolution_implicit_gemm_v3_nchw_cyxk_nkhw.hpp
+++ b/driver/include/device_convolution_implicit_gemm_v3_nchw_cyxk_nkhw.hpp
@@ -1,9 +1,10 @@
 #pragma once
 #include <unistd.h>
 #include "device.hpp"
+#include "tensor.hpp"
 #include "gridwise_convolution_kernel_wrapper.hpp"
-#include "composable_kernel/kernel_algorithm/gridwise_convolution_implicit_gemm_v3_nchw_cyxk_nkhw.hpp"
-#include "composable_kernel/kernel_algorithm/gridwise_convolution_implicit_gemm_v3_nchw_cyxk_nkhw_lds_double_buffer.hpp"
+#include "gridwise_convolution_implicit_gemm_v3_nchw_cyxk_nkhw.hpp"
+#include "gridwise_convolution_implicit_gemm_v3_nchw_cyxk_nkhw_lds_double_buffer.hpp"

 using namespace ck;

--- a/driver/include/device_convolution_implicit_gemm_v4_nchw_kcyx_nkhw.hpp
+++ b/driver/include/device_convolution_implicit_gemm_v4_nchw_kcyx_nkhw.hpp
@@ -1,9 +1,10 @@
 #pragma once
 #include <unistd.h>
 #include "device.hpp"
+#include "tensor.hpp"
 #include "gridwise_convolution_kernel_wrapper.hpp"
-#include "composable_kernel/kernel_algorithm/gridwise_convolution_implicit_gemm_v4_nchw_kcyx_nkhw.hpp"
-#include "composable_kernel/kernel_algorithm/gridwise_convolution_implicit_gemm_v4_nchw_kcyx_nkhw_lds_double_buffer.hpp"
+#include "gridwise_convolution_implicit_gemm_v4_nchw_kcyx_nkhw.hpp"
+#include "gridwise_convolution_implicit_gemm_v4_nchw_kcyx_nkhw_lds_double_buffer.hpp"

 using namespace ck;

--- a/driver/include/device_direct_convolution_2_vectorized_nchw_kcyx_nkhw.hpp
+++ b/driver/include/device_direct_convolution_2_vectorized_nchw_kcyx_nkhw.hpp
@@ -1,7 +1,8 @@
 #pragma once
 #include <unistd.h>
 #include "device.hpp"
-#include "composable_kernel/kernel_algorithm/gridwise_direct_convolution_2_vectorized_nchw_kcyx_nkhw.hpp"
+#include "tensor.hpp"
+#include "gridwise_direct_convolution_2_vectorized_nchw_kcyx_nkhw.hpp"

 using namespace ck;

--- a/driver/include/device_implicit_gemm_convolution_1_chwn_cyxk_khwn_padded.hpp
+++ b/driver/include/device_implicit_gemm_convolution_1_chwn_cyxk_khwn_padded.hpp
@@ -1,7 +1,8 @@
 #pragma once
 #include <unistd.h>
 #include "device.hpp"
-#include "composable_kernel/kernel_algorithm/gridwise_implicit_gemm_convolution_1_chwn_cyxk_khwn_padded.hpp"
+#include "tensor.hpp"
+#include "gridwise_implicit_gemm_convolution_1_chwn_cyxk_khwn_padded.hpp"

 using namespace ck;

--- a/driver/include/tensor.hpp
+++ b/driver/include/tensor.hpp
--- a/driver/src/CMakeLists.txt
+++ b/driver/src/CMakeLists.txt
--- a/driver/src/device.cpp
+++ b/driver/src/device.cpp
@@ -1,4 +1,4 @@
-#include "composable_kernel/utility/config.hpp"
+#include "config.hpp"
 #include "device.hpp"

 DeviceMem::DeviceMem(std::size_t mem_size) : mMemSize(mem_size)
--- a/driver/src/driver.cpp
+++ b/driver/src/driver.cpp
@@ -3,9 +3,9 @@
 #include <initializer_list>
 #include <cstdlib>
 #include <stdlib.h>
-#include "composable_kernel/utility/config.hpp"
-#include "composable_kernel/tensor_description/ConstantTensorDescriptor.hpp"
-#include "tensor.hpp"
+#include "config.hpp"
+#include "ConstantTensorDescriptor.hpp"
+#include "device.hpp"
 #include "conv_common.hpp"
 #include "device_convolution_direct_v2_nchw_kcyx_nkhw.hpp"
 #include "device_convolution_implicit_gemm_v1_chwn_cyxk_khwn.hpp"
--- a/driver/src/driver.cu
+++ b/driver/src/driver.cu
--- a/driver/src/tensor.cpp
+++ b/driver/src/tensor.cpp
--- a/include/composable_kernel/utility/common.hpp
+++ b/include/composable_kernel/utility/common.hpp
@@ -1,17 +0,0 @@
-#ifndef CK_COMMON_HPP
-#define CK_COMMON_HPP
-
-#include "composable_kernel/utility/utility.hpp"
-#include "composable_kernel/utility/vector_type.hpp"
-#include "composable_kernel/utility/integral_constant.hpp"
-#include "composable_kernel/utility/Sequence.hpp"
-#include "composable_kernel/utility/Array.hpp"
-#include "composable_kernel/utility/functional.hpp"
-#include "composable_kernel/utility/functional2.hpp"
-#include "composable_kernel/utility/functional3.hpp"
-
-#if CK_USE_AMD_INLINE_ASM
-#include "composable_kernel/utility/amd_inline_asm.hpp"
-#endif
-
-#endif
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -1,20 +0,0 @@
-if(DEVICE_BACKEND STREQUAL "AMD")
-    configure_file("${PROJECT_SOURCE_DIR}/include/composable_kernel/utility/config_amd.hpp.in" "${PROJECT_BINARY_DIR}/include/composable_kernel/utility/config.hpp")
-elseif(DEVICE_BACKEND STREQUAL "NVIDIA")
-    configure_file("${PROJECT_SOURCE_DIR}/include/composable_kernel/utility/config_nvidia.hpp.in" "${PROJECT_BINARY_DIR}/include/composable_kernel/utility/config.hpp")
-endif()
-
-set(TENSOR_SOURCE 
-    tensor.cpp;
-    device.cpp;
-)
-
-add_library(tensor SHARED ${TENSOR_SOURCE})
-target_compile_features(tensor PUBLIC)
-set_target_properties(tensor PROPERTIES POSITION_INDEPENDENT_CODE ON)
-
-if(DEVICE_BACKEND STREQUAL "NVIDIA")
-    target_link_libraries(tensor nvToolsExt cudart)
-endif()
-
-install(TARGETS tensor LIBRARY DESTINATION lib)