reorganize files

[ROCm/composable_kernel commit: 81497a93a0]
2026-05-14 02:02:46 +00:00 · 2019-06-11 23:49:51 -05:00
parent 3dc2ff612c
commit 14451df1fd
59 changed files with 197 additions and 198 deletions
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -45,6 +45,9 @@ elseif(DEVICE_BACKEND STREQUAL "CUDA")
 endif()

 #
-include_directories(BEFORE src/include ${PROJECT_BINARY_DIR}/src/include)
+include_directories(BEFORE
+    include
+    ${PROJECT_BINARY_DIR}/include
+)
 add_subdirectory(src)
 add_subdirectory(driver)
--- a/driver/device_convolution_direct_v2_nchw_kcyx_nkhw.hpp
+++ b/driver/device_convolution_direct_v2_nchw_kcyx_nkhw.hpp
@@ -2,7 +2,7 @@
 #include <unistd.h>
 #include "device.hpp"
 #include "gridwise_convolution_kernel_wrapper.hpp"
-#include "gridwise_convolution_direct_v2_nchw_kcyx_nkhw.hpp"
+#include "composable_kernel/kernel_algorithm/gridwise_convolution_direct_v2_nchw_kcyx_nkhw.hpp"

 using namespace ck;

--- a/driver/device_convolution_implicit_gemm_v1_chwn_cyxk_khwn.hpp
+++ b/driver/device_convolution_implicit_gemm_v1_chwn_cyxk_khwn.hpp
@@ -2,10 +2,10 @@
 #include <unistd.h>
 #include "device.hpp"
 #include "gridwise_convolution_kernel_wrapper.hpp"
-#include "gridwise_convolution_implicit_gemm_v1r1_chwn_cyxk_khwn.hpp"
-#include "gridwise_convolution_implicit_gemm_v1r2_chwn_cyxk_khwn.hpp"
-#include "gridwise_convolution_implicit_gemm_v1r3_chwn_cyxk_khwn.hpp"
-#include "gridwise_convolution_implicit_gemm_v1r3_chwn_cyxk_khwn_lds_double_buffer.hpp"
+#include "composable_kernel/kernel_algorithm/gridwise_convolution_implicit_gemm_v1r1_chwn_cyxk_khwn.hpp"
+#include "composable_kernel/kernel_algorithm/gridwise_convolution_implicit_gemm_v1r2_chwn_cyxk_khwn.hpp"
+#include "composable_kernel/kernel_algorithm/gridwise_convolution_implicit_gemm_v1r3_chwn_cyxk_khwn.hpp"
+#include "composable_kernel/kernel_algorithm/gridwise_convolution_implicit_gemm_v1r3_chwn_cyxk_khwn_lds_double_buffer.hpp"

 using namespace ck;

--- a/driver/device_convolution_implicit_gemm_v1_nchw_cyxk_nkhw.hpp
+++ b/driver/device_convolution_implicit_gemm_v1_nchw_cyxk_nkhw.hpp
@@ -2,8 +2,8 @@
 #include <unistd.h>
 #include "device.hpp"
 #include "gridwise_convolution_kernel_wrapper.hpp"
-#include "gridwise_convolution_implicit_gemm_v1r3_nchw_cyxk_nkhw.hpp"
-#include "gridwise_convolution_implicit_gemm_v1r3_nchw_cyxk_nkhw_lds_double_buffer.hpp"
+#include "composable_kernel/kernel_algorithm/gridwise_convolution_implicit_gemm_v1r3_nchw_cyxk_nkhw.hpp"
+#include "composable_kernel/kernel_algorithm/gridwise_convolution_implicit_gemm_v1r3_nchw_cyxk_nkhw_lds_double_buffer.hpp"

 using namespace ck;

--- a/driver/device_convolution_implicit_gemm_v2_chwn_cyxk_khwn.hpp
+++ b/driver/device_convolution_implicit_gemm_v2_chwn_cyxk_khwn.hpp
@@ -2,8 +2,8 @@
 #include <unistd.h>
 #include "device.hpp"
 #include "gridwise_convolution_kernel_wrapper.hpp"
-#include "gridwise_convolution_implicit_gemm_v2_chwn_cyxk_khwn.hpp"
-#include "gridwise_convolution_implicit_gemm_v2_chwn_cyxk_khwn_lds_double_buffer.hpp"
+#include "composable_kernel/kernel_algorithm/gridwise_convolution_implicit_gemm_v2_chwn_cyxk_khwn.hpp"
+#include "composable_kernel/kernel_algorithm/gridwise_convolution_implicit_gemm_v2_chwn_cyxk_khwn_lds_double_buffer.hpp"

 using namespace ck;

--- a/driver/device_convolution_implicit_gemm_v3_nchw_cyxk_nkhw.hpp
+++ b/driver/device_convolution_implicit_gemm_v3_nchw_cyxk_nkhw.hpp
@@ -2,8 +2,8 @@
 #include <unistd.h>
 #include "device.hpp"
 #include "gridwise_convolution_kernel_wrapper.hpp"
-#include "gridwise_convolution_implicit_gemm_v3_nchw_cyxk_nkhw.hpp"
-#include "gridwise_convolution_implicit_gemm_v3_nchw_cyxk_nkhw_lds_double_buffer.hpp"
+#include "composable_kernel/kernel_algorithm/gridwise_convolution_implicit_gemm_v3_nchw_cyxk_nkhw.hpp"
+#include "composable_kernel/kernel_algorithm/gridwise_convolution_implicit_gemm_v3_nchw_cyxk_nkhw_lds_double_buffer.hpp"

 using namespace ck;

--- a/driver/device_convolution_implicit_gemm_v4_nchw_kcyx_nkhw.hpp
+++ b/driver/device_convolution_implicit_gemm_v4_nchw_kcyx_nkhw.hpp
@@ -2,8 +2,8 @@
 #include <unistd.h>
 #include "device.hpp"
 #include "gridwise_convolution_kernel_wrapper.hpp"
-#include "gridwise_convolution_implicit_gemm_v4_nchw_kcyx_nkhw.hpp"
-#include "gridwise_convolution_implicit_gemm_v4_nchw_kcyx_nkhw_lds_double_buffer.hpp"
+#include "composable_kernel/kernel_algorithm/gridwise_convolution_implicit_gemm_v4_nchw_kcyx_nkhw.hpp"
+#include "composable_kernel/kernel_algorithm/gridwise_convolution_implicit_gemm_v4_nchw_kcyx_nkhw_lds_double_buffer.hpp"

 using namespace ck;

--- a/driver/device_direct_convolution_2_vectorized_nchw_kcyx_nkhw.hpp
+++ b/driver/device_direct_convolution_2_vectorized_nchw_kcyx_nkhw.hpp
@@ -1,7 +1,7 @@
 #pragma once
 #include <unistd.h>
 #include "device.hpp"
-#include "gridwise_direct_convolution_2_vectorized_nchw_kcyx_nkhw.hpp"
+#include "composable_kernel/kernel_algorithm/gridwise_direct_convolution_2_vectorized_nchw_kcyx_nkhw.hpp"

 using namespace ck;

--- a/driver/device_implicit_gemm_convolution_1_chwn_cyxk_khwn_padded.hpp
+++ b/driver/device_implicit_gemm_convolution_1_chwn_cyxk_khwn_padded.hpp
@@ -1,7 +1,7 @@
 #pragma once
 #include <unistd.h>
 #include "device.hpp"
-#include "gridwise_implicit_gemm_convolution_1_chwn_cyxk_khwn_padded.hpp"
+#include "composable_kernel/kernel_algorithm/gridwise_implicit_gemm_convolution_1_chwn_cyxk_khwn_padded.hpp"

 using namespace ck;

--- a/driver/driver.cpp
+++ b/driver/driver.cpp
@@ -3,9 +3,9 @@
 #include <initializer_list>
 #include <cstdlib>
 #include <stdlib.h>
-#include "config.hpp"
+#include "composable_kernel/utility/config.hpp"
+#include "composable_kernel/tensor_description/ConstantTensorDescriptor.hpp"
 #include "tensor.hpp"
-#include "ConstantTensorDescriptor.hpp"
 #include "conv_common.hpp"
 #include "device_convolution_direct_v2_nchw_kcyx_nkhw.hpp"
 #include "device_convolution_implicit_gemm_v1_chwn_cyxk_khwn.hpp"
--- a/include/composable_kernel/kernel_algorithm/gridwise_convolution_direct_v2_nchw_kcyx_nkhw.hpp
+++ b/include/composable_kernel/kernel_algorithm/gridwise_convolution_direct_v2_nchw_kcyx_nkhw.hpp
@@ -1,12 +1,12 @@
 #ifndef CK_GRIDWISE_CONVOLUTION_DIRECT_V2_NCHW_KCYX_NKHW
 #define CK_GRIDWISE_CONVOLUTION_DIRECT_V2_NCHW_KCYX_NKHW

-#include "common.hpp"
-#include "ConstantTensorDescriptor.hpp"
-#include "blockwise_2d_tensor_op.hpp"
-#include "blockwise_4d_tensor_op.hpp"
-#include "threadwise_tensor_slice_copy.hpp"
-#include "threadwise_direct_convolution.hpp"
+#include "composable_kernel/utility/common.hpp"
+#include "composable_kernel/tensor_description/ConstantTensorDescriptor.hpp"
+#include "composable_kernel/tensor_operation/blockwise_2d_tensor_op.hpp"
+#include "composable_kernel/tensor_operation/blockwise_4d_tensor_op.hpp"
+#include "composable_kernel/tensor_operation/threadwise_tensor_slice_copy.hpp"
+#include "composable_kernel/tensor_operation/threadwise_direct_convolution.hpp"

 namespace ck {

--- a/include/composable_kernel/kernel_algorithm/gridwise_convolution_implicit_gemm_v1r1_chwn_cyxk_khwn.hpp
+++ b/include/composable_kernel/kernel_algorithm/gridwise_convolution_implicit_gemm_v1r1_chwn_cyxk_khwn.hpp
@@ -1,14 +1,14 @@
 #ifndef CK_GRIDWISE_CONVOLUTION_IMPLICIT_GEMM_V1R1_CHWN_CYXK_KHWN
 #define CK_GRIDWISE_CONVOLUTION_IMPLICIT_GEMM_V1R1_CHWN_CYXK_KHWN

-#include "common.hpp"
-#include "ConstantTensorDescriptor.hpp"
-#include "ConstantMatrixDescriptor.hpp"
-#include "blockwise_4d_tensor_op.hpp"
-#include "blockwise_2d_tensor_op.hpp"
-#include "threadwise_tensor_slice_copy.hpp"
-#include "threadwise_4d_tensor_op.hpp"
-#include "blockwise_batched_gemm.hpp"
+#include "composable_kernel/utility/common.hpp"
+#include "composable_kernel/tensor_description/ConstantTensorDescriptor.hpp"
+#include "composable_kernel/tensor_description/ConstantMatrixDescriptor.hpp"
+#include "composable_kernel/tensor_operation/blockwise_4d_tensor_op.hpp"
+#include "composable_kernel/tensor_operation/blockwise_2d_tensor_op.hpp"
+#include "composable_kernel/tensor_operation/threadwise_tensor_slice_copy.hpp"
+#include "composable_kernel/tensor_operation/threadwise_4d_tensor_op.hpp"
+#include "composable_kernel/tensor_operation/blockwise_batched_gemm.hpp"

 namespace ck {

--- a/include/composable_kernel/kernel_algorithm/gridwise_convolution_implicit_gemm_v1r2_chwn_cyxk_khwn.hpp
+++ b/include/composable_kernel/kernel_algorithm/gridwise_convolution_implicit_gemm_v1r2_chwn_cyxk_khwn.hpp
@@ -1,15 +1,15 @@
 #ifndef CK_GRIDWISE_CONVOLUTION_IMPLICIT_GEMM_V1R2_CHWN_CYXK_KHWN
 #define CK_GRIDWISE_CONVOLUTION_IMPLICIT_GEMM_V1R2_CHWN_CYXK_KHWN

-#include "common.hpp"
-#include "ConstantTensorDescriptor.hpp"
-#include "ConstantMatrixDescriptor.hpp"
-#include "blockwise_2d_tensor_op.hpp"
-#include "blockwise_3d_tensor_op.hpp"
-#include "blockwise_4d_tensor_op.hpp"
-#include "threadwise_tensor_slice_copy.hpp"
-#include "threadwise_4d_tensor_op.hpp"
-#include "blockwise_batched_gemm.hpp"
+#include "composable_kernel/utility/common.hpp"
+#include "composable_kernel/tensor_description/ConstantTensorDescriptor.hpp"
+#include "composable_kernel/tensor_description/ConstantMatrixDescriptor.hpp"
+#include "composable_kernel/tensor_operation/blockwise_2d_tensor_op.hpp"
+#include "composable_kernel/tensor_operation/blockwise_3d_tensor_op.hpp"
+#include "composable_kernel/tensor_operation/blockwise_4d_tensor_op.hpp"
+#include "composable_kernel/tensor_operation/threadwise_tensor_slice_copy.hpp"
+#include "composable_kernel/tensor_operation/threadwise_4d_tensor_op.hpp"
+#include "composable_kernel/tensor_operation/blockwise_batched_gemm.hpp"

 namespace ck {

--- a/include/composable_kernel/kernel_algorithm/gridwise_convolution_implicit_gemm_v1r3_chwn_cyxk_khwn.hpp
+++ b/include/composable_kernel/kernel_algorithm/gridwise_convolution_implicit_gemm_v1r3_chwn_cyxk_khwn.hpp
@@ -1,14 +1,14 @@
 #ifndef CK_GRIDWISE_CONVOLUTION_IMPLICIT_GEMM_V1R3_CHWN_CYXK_KHWN
 #define CK_GRIDWISE_CONVOLUTION_IMPLICIT_GEMM_V1R3_CHWN_CYXK_KHWN

-#include "common.hpp"
-#include "ConstantTensorDescriptor.hpp"
-#include "ConstantMatrixDescriptor.hpp"
-#include "blockwise_2d_tensor_op.hpp"
-#include "blockwise_4d_tensor_op.hpp"
-#include "threadwise_tensor_slice_copy.hpp"
-#include "threadwise_4d_tensor_op.hpp"
-#include "blockwise_batched_gemm.hpp"
+#include "composable_kernel/utility/common.hpp"
+#include "composable_kernel/tensor_description/ConstantTensorDescriptor.hpp"
+#include "composable_kernel/tensor_description/ConstantMatrixDescriptor.hpp"
+#include "composable_kernel/tensor_operation/blockwise_2d_tensor_op.hpp"
+#include "composable_kernel/tensor_operation/blockwise_4d_tensor_op.hpp"
+#include "composable_kernel/tensor_operation/threadwise_tensor_slice_copy.hpp"
+#include "composable_kernel/tensor_operation/threadwise_4d_tensor_op.hpp"
+#include "composable_kernel/tensor_operation/blockwise_batched_gemm.hpp"

 namespace ck {

--- a/include/composable_kernel/kernel_algorithm/gridwise_convolution_implicit_gemm_v1r3_chwn_cyxk_khwn_lds_double_buffer.hpp
+++ b/include/composable_kernel/kernel_algorithm/gridwise_convolution_implicit_gemm_v1r3_chwn_cyxk_khwn_lds_double_buffer.hpp
@@ -1,14 +1,14 @@
 #ifndef CK_GRIDWISE_CONVOLUTION_IMPLICIT_GEMM_V1R3_CHWN_CYXK_KHWN_LDS_DOUBLE_BUFFER
 #define CK_GRIDWISE_CONVOLUTION_IMPLICIT_GEMM_V1R3_CHWN_CYXK_KHWN_LDS_DOUBLE_BUFFER

-#include "common.hpp"
-#include "ConstantTensorDescriptor.hpp"
-#include "ConstantMatrixDescriptor.hpp"
-#include "blockwise_2d_tensor_op.hpp"
-#include "blockwise_4d_tensor_op.hpp"
-#include "threadwise_tensor_slice_copy.hpp"
-#include "threadwise_4d_tensor_op.hpp"
-#include "blockwise_batched_gemm.hpp"
+#include "composable_kernel/utility/common.hpp"
+#include "composable_kernel/tensor_description/ConstantTensorDescriptor.hpp"
+#include "composable_kernel/tensor_description/ConstantMatrixDescriptor.hpp"
+#include "composable_kernel/tensor_operation/blockwise_2d_tensor_op.hpp"
+#include "composable_kernel/tensor_operation/blockwise_4d_tensor_op.hpp"
+#include "composable_kernel/tensor_operation/threadwise_tensor_slice_copy.hpp"
+#include "composable_kernel/tensor_operation/threadwise_4d_tensor_op.hpp"
+#include "composable_kernel/tensor_operation/blockwise_batched_gemm.hpp"

 namespace ck {

--- a/include/composable_kernel/kernel_algorithm/gridwise_convolution_implicit_gemm_v1r3_nchw_cyxk_nkhw.hpp
+++ b/include/composable_kernel/kernel_algorithm/gridwise_convolution_implicit_gemm_v1r3_nchw_cyxk_nkhw.hpp
@@ -1,14 +1,14 @@
 #ifndef CK_GRIDWISE_CONVOLUTION_IMPLICIT_GEMM_V1R3_NCHW_CYXK_NKHW
 #define CK_GRIDWISE_CONVOLUTION_IMPLICIT_GEMM_V1R3_NCHW_CYXK_NKHW

-#include "common.hpp"
-#include "ConstantTensorDescriptor.hpp"
-#include "ConstantMatrixDescriptor.hpp"
-#include "blockwise_2d_tensor_op.hpp"
-#include "blockwise_tensor_slice_copy.hpp"
-#include "threadwise_tensor_slice_copy.hpp"
-#include "threadwise_generic_tensor_op.hpp"
-#include "blockwise_batched_gemm.hpp"
+#include "composable_kernel/utility/common.hpp"
+#include "composable_kernel/tensor_description/ConstantTensorDescriptor.hpp"
+#include "composable_kernel/tensor_description/ConstantMatrixDescriptor.hpp"
+#include "composable_kernel/tensor_operation/blockwise_2d_tensor_op.hpp"
+#include "composable_kernel/tensor_operation/blockwise_tensor_slice_copy.hpp"
+#include "composable_kernel/tensor_operation/threadwise_tensor_slice_copy.hpp"
+#include "composable_kernel/tensor_operation/threadwise_generic_tensor_op.hpp"
+#include "composable_kernel/tensor_operation/blockwise_batched_gemm.hpp"

 namespace ck {

--- a/include/composable_kernel/kernel_algorithm/gridwise_convolution_implicit_gemm_v1r3_nchw_cyxk_nkhw_lds_double_buffer.hpp
+++ b/include/composable_kernel/kernel_algorithm/gridwise_convolution_implicit_gemm_v1r3_nchw_cyxk_nkhw_lds_double_buffer.hpp
@@ -1,14 +1,14 @@
 #ifndef CK_GRIDWISE_CONVOLUTION_IMPLICIT_GEMM_V1R3_NCHW_CYXK_NKHW_LDS_DOUBLE_BUFFER
 #define CK_GRIDWISE_CONVOLUTION_IMPLICIT_GEMM_V1R3_NCHW_CYXK_NKHW_LDS_DOUBLE_BUFFER

-#include "common.hpp"
-#include "ConstantTensorDescriptor.hpp"
-#include "ConstantMatrixDescriptor.hpp"
-#include "blockwise_2d_tensor_op.hpp"
-#include "blockwise_tensor_slice_copy.hpp"
-#include "threadwise_tensor_slice_copy.hpp"
-#include "threadwise_generic_tensor_op.hpp"
-#include "blockwise_batched_gemm.hpp"
+#include "composable_kernel/utility/common.hpp"
+#include "composable_kernel/tensor_description/ConstantTensorDescriptor.hpp"
+#include "composable_kernel/tensor_description/ConstantMatrixDescriptor.hpp"
+#include "composable_kernel/tensor_operation/blockwise_2d_tensor_op.hpp"
+#include "composable_kernel/tensor_operation/blockwise_tensor_slice_copy.hpp"
+#include "composable_kernel/tensor_operation/threadwise_tensor_slice_copy.hpp"
+#include "composable_kernel/tensor_operation/threadwise_generic_tensor_op.hpp"
+#include "composable_kernel/tensor_operation/blockwise_batched_gemm.hpp"

 namespace ck {

--- a/include/composable_kernel/kernel_algorithm/gridwise_convolution_implicit_gemm_v2_chwn_cyxk_khwn.hpp
+++ b/include/composable_kernel/kernel_algorithm/gridwise_convolution_implicit_gemm_v2_chwn_cyxk_khwn.hpp
@@ -1,12 +1,12 @@
 #ifndef CK_GRIDWISE_CONVOLUTION_IMPLICIT_GEMM_V2_CHWN_CYXK_KHWN
 #define CK_GRIDWISE_CONVOLUTION_IMPLICIT_GEMM_V2_CHWN_CYXK_KHWN

-#include "common.hpp"
-#include "ConstantTensorDescriptor.hpp"
-#include "ConstantMatrixDescriptor.hpp"
-#include "blockwise_4d_tensor_op.hpp"
-#include "blockwise_2d_tensor_op.hpp"
-#include "blockwise_gemm.hpp"
+#include "composable_kernel/utility/common.hpp"
+#include "composable_kernel/tensor_description/ConstantTensorDescriptor.hpp"
+#include "composable_kernel/tensor_description/ConstantMatrixDescriptor.hpp"
+#include "composable_kernel/tensor_operation/blockwise_4d_tensor_op.hpp"
+#include "composable_kernel/tensor_operation/blockwise_2d_tensor_op.hpp"
+#include "composable_kernel/tensor_operation/blockwise_gemm.hpp"

 namespace ck {

--- a/include/composable_kernel/kernel_algorithm/gridwise_convolution_implicit_gemm_v2_chwn_cyxk_khwn_lds_double_buffer.hpp
+++ b/include/composable_kernel/kernel_algorithm/gridwise_convolution_implicit_gemm_v2_chwn_cyxk_khwn_lds_double_buffer.hpp
@@ -1,13 +1,13 @@
 #ifndef CK_GRIDWISE_CONVOLUTION_IMPLICIT_GEMM_V2_CHWN_CYXK_KHWN_LDS_DOUBLE_BUFFER
 #define CK_GRIDWISE_CONVOLUTION_IMPLICIT_GEMM_V2_CHWN_CYXK_KHWN_LDS_DOUBLE_BUFFER

-#include "common.hpp"
-#include "ConstantTensorDescriptor.hpp"
-#include "ConstantMatrixDescriptor.hpp"
-#include "blockwise_4d_tensor_op.hpp"
-#include "blockwise_2d_tensor_op.hpp"
-#include "threadwise_tensor_slice_copy.hpp"
-#include "blockwise_gemm.hpp"
+#include "composable_kernel/utility/common.hpp"
+#include "composable_kernel/tensor_description/ConstantTensorDescriptor.hpp"
+#include "composable_kernel/tensor_description/ConstantMatrixDescriptor.hpp"
+#include "composable_kernel/tensor_operation/blockwise_4d_tensor_op.hpp"
+#include "composable_kernel/tensor_operation/blockwise_2d_tensor_op.hpp"
+#include "composable_kernel/tensor_operation/threadwise_tensor_slice_copy.hpp"
+#include "composable_kernel/tensor_operation/blockwise_gemm.hpp"

 namespace ck {

--- a/include/composable_kernel/kernel_algorithm/gridwise_convolution_implicit_gemm_v3_nchw_cyxk_nkhw.hpp
+++ b/include/composable_kernel/kernel_algorithm/gridwise_convolution_implicit_gemm_v3_nchw_cyxk_nkhw.hpp
@@ -1,12 +1,12 @@
 #ifndef CK_GRIDWISE_CONVOLUTION_IMPLICIT_GEMM_V3_NCHW_CYXK_NKHW
 #define CK_GRIDWISE_CONVOLUTION_IMPLICIT_GEMM_V3_NCHW_CYXK_NKHW

-#include "common.hpp"
-#include "ConstantTensorDescriptor.hpp"
-#include "ConstantMergedTensorDescriptor.hpp"
-#include "ConstantMatrixDescriptor.hpp"
-#include "blockwise_generic_tensor_slice_copy.hpp"
-#include "blockwise_gemm.hpp"
+#include "composable_kernel/utility/common.hpp"
+#include "composable_kernel/tensor_description/ConstantTensorDescriptor.hpp"
+#include "composable_kernel/tensor_description/ConstantMergedTensorDescriptor.hpp"
+#include "composable_kernel/tensor_description/ConstantMatrixDescriptor.hpp"
+#include "composable_kernel/tensor_operation/blockwise_generic_tensor_slice_copy.hpp"
+#include "composable_kernel/tensor_operation/blockwise_gemm.hpp"

 namespace ck {

--- a/include/composable_kernel/kernel_algorithm/gridwise_convolution_implicit_gemm_v3_nchw_cyxk_nkhw_lds_double_buffer.hpp
+++ b/include/composable_kernel/kernel_algorithm/gridwise_convolution_implicit_gemm_v3_nchw_cyxk_nkhw_lds_double_buffer.hpp
@@ -1,12 +1,12 @@
 #ifndef CK_GRIDWISE_CONVOLUTION_IMPLICIT_GEMM_V3_NCHW_CYXK_NKHW_LDS_DOUBLE_BUFFER
 #define CK_GRIDWISE_CONVOLUTION_IMPLICIT_GEMM_V3_NCHW_CYXK_NKHW_LDS_DOUBLE_BUFFER

-#include "common.hpp"
-#include "ConstantTensorDescriptor.hpp"
-#include "ConstantMergedTensorDescriptor.hpp"
-#include "ConstantMatrixDescriptor.hpp"
-#include "blockwise_generic_tensor_slice_copy.hpp"
-#include "blockwise_gemm.hpp"
+#include "composable_kernel/utility/common.hpp"
+#include "composable_kernel/tensor_description/ConstantTensorDescriptor.hpp"
+#include "composable_kernel/tensor_description/ConstantMergedTensorDescriptor.hpp"
+#include "composable_kernel/tensor_description/ConstantMatrixDescriptor.hpp"
+#include "composable_kernel/tensor_operation/blockwise_generic_tensor_slice_copy.hpp"
+#include "composable_kernel/tensor_operation/blockwise_gemm.hpp"

 namespace ck {

--- a/include/composable_kernel/kernel_algorithm/gridwise_convolution_implicit_gemm_v4_nchw_kcyx_nkhw.hpp
+++ b/include/composable_kernel/kernel_algorithm/gridwise_convolution_implicit_gemm_v4_nchw_kcyx_nkhw.hpp
@@ -1,13 +1,13 @@
 #ifndef CK_GRIDWISE_CONVOLUTION_IMPLICIT_GEMM_V4_NCHW_KCYX_NKHW
 #define CK_GRIDWISE_CONVOLUTION_IMPLICIT_GEMM_V4_NCHW_KCYX_NKHW

-#include "common.hpp"
-#include "ConstantTensorDescriptor.hpp"
-#include "ConstantMergedTensorDescriptor.hpp"
-#include "ConstantMatrixDescriptor.hpp"
-#include "blockwise_generic_tensor_slice_copy.hpp"
-#include "blockwise_gemm.hpp"
-#include "threadwise_generic_tensor_slice_copy.hpp"
+#include "composable_kernel/utility/common.hpp"
+#include "composable_kernel/tensor_description/ConstantTensorDescriptor.hpp"
+#include "composable_kernel/tensor_description/ConstantMergedTensorDescriptor.hpp"
+#include "composable_kernel/tensor_description/ConstantMatrixDescriptor.hpp"
+#include "composable_kernel/tensor_operation/blockwise_generic_tensor_slice_copy.hpp"
+#include "composable_kernel/tensor_operation/blockwise_gemm.hpp"
+#include "composable_kernel/tensor_operation/threadwise_generic_tensor_slice_copy.hpp"

 namespace ck {

--- a/include/composable_kernel/kernel_algorithm/gridwise_convolution_implicit_gemm_v4_nchw_kcyx_nkhw_lds_double_buffer.hpp
+++ b/include/composable_kernel/kernel_algorithm/gridwise_convolution_implicit_gemm_v4_nchw_kcyx_nkhw_lds_double_buffer.hpp
@@ -1,13 +1,13 @@
 #ifndef CK_GRIDWISE_CONVOLUTION_IMPLICIT_GEMM_V4_NCHW_KCYX_NKHW_LDS_DOUBLE_BUFFER
 #define CK_GRIDWISE_CONVOLUTION_IMPLICIT_GEMM_V4_NCHW_KCYX_NKHW_LDS_DOUBLE_BUFFER

-#include "common.hpp"
-#include "ConstantTensorDescriptor.hpp"
-#include "ConstantMergedTensorDescriptor.hpp"
-#include "ConstantMatrixDescriptor.hpp"
-#include "blockwise_generic_tensor_slice_copy.hpp"
-#include "blockwise_gemm.hpp"
-#include "threadwise_generic_tensor_slice_copy.hpp"
+#include "composable_kernel/utility/common.hpp"
+#include "composable_kernel/tensor_description/ConstantTensorDescriptor.hpp"
+#include "composable_kernel/tensor_description/ConstantMergedTensorDescriptor.hpp"
+#include "composable_kernel/tensor_description/ConstantMatrixDescriptor.hpp"
+#include "composable_kernel/tensor_operation/blockwise_generic_tensor_slice_copy.hpp"
+#include "composable_kernel/tensor_operation/blockwise_gemm.hpp"
+#include "composable_kernel/tensor_operation/threadwise_generic_tensor_slice_copy.hpp"

 namespace ck {

--- a/include/composable_kernel/kernel_algorithm/gridwise_direct_convolution_2_vectorized_nchw_kcyx_nkhw.hpp
+++ b/include/composable_kernel/kernel_algorithm/gridwise_direct_convolution_2_vectorized_nchw_kcyx_nkhw.hpp
@@ -1,11 +1,11 @@
 #pragma once
-#include "common.hpp"
-#include "ConstantTensorDescriptor.hpp"
-#include "blockwise_2d_tensor_op.hpp"
-#include "blockwise_4d_tensor_op.hpp"
-#include "blockwise_direct_convolution.hpp"
-#include "threadwise_4d_tensor_op.hpp"
-#include "threadwise_direct_convolution.hpp"
+#include "composable_kernel/utility/common.hpp"
+#include "composable_kernel/tensor_description/ConstantTensorDescriptor.hpp"
+#include "composable_kernel/tensor_operation/blockwise_2d_tensor_op.hpp"
+#include "composable_kernel/tensor_operation/blockwise_4d_tensor_op.hpp"
+#include "composable_kernel/tensor_operation/blockwise_direct_convolution.hpp"
+#include "composable_kernel/tensor_operation/threadwise_4d_tensor_op.hpp"
+#include "composable_kernel/tensor_operation/threadwise_direct_convolution.hpp"

 namespace ck {

--- a/include/composable_kernel/kernel_algorithm/gridwise_implicit_gemm_convolution_1_chwn_cyxk_khwn_padded.hpp
+++ b/include/composable_kernel/kernel_algorithm/gridwise_implicit_gemm_convolution_1_chwn_cyxk_khwn_padded.hpp
@@ -1,11 +1,11 @@
 #pragma once
-#include "common.hpp"
-#include "ConstantTensorDescriptor.hpp"
-#include "ConstantMatrixDescriptor.hpp"
-#include "blockwise_4d_tensor_op.hpp"
-#include "blockwise_2d_tensor_op.hpp"
-#include "threadwise_4d_tensor_op.hpp"
-#include "blockwise_gemm.hpp"
+#include "composable_kernel/utility/common.hpp"
+#include "composable_kernel/tensor_description/ConstantTensorDescriptor.hpp"
+#include "composable_kernel/tensor_description/ConstantMatrixDescriptor.hpp"
+#include "composable_kernel/tensor_operation/blockwise_4d_tensor_op.hpp"
+#include "composable_kernel/tensor_operation/blockwise_2d_tensor_op.hpp"
+#include "composable_kernel/tensor_operation/threadwise_4d_tensor_op.hpp"
+#include "composable_kernel/tensor_operation/blockwise_gemm.hpp"

 namespace ck {

--- a/include/composable_kernel/tensor_description/ConstantMatrixDescriptor.hpp
+++ b/include/composable_kernel/tensor_description/ConstantMatrixDescriptor.hpp
@@ -1,7 +1,7 @@
 #ifndef CK_CONSTANT_MATRIX_DESCRIPTOR_HPP
 #define CK_CONSTANT_MATRIX_DESCRIPTOR_HPP

-#include "common.hpp"
+#include "composable_kernel/utility/common.hpp"

 namespace ck {

--- a/include/composable_kernel/tensor_description/ConstantMergedTensorDescriptor.hpp
+++ b/include/composable_kernel/tensor_description/ConstantMergedTensorDescriptor.hpp
@@ -1,8 +1,8 @@
 #ifndef CK_CONSTANT_MERGED_TENSOR_DESCRIPTOR_HPP
 #define CK_CONSTANT_MERGED_TENSOR_DESCRIPTOR_HPP

-#include "common.hpp"
-#include "ConstantTensorDescriptor.hpp"
+#include "composable_kernel/utility/common.hpp"
+#include "composable_kernel/tensor_description/ConstantTensorDescriptor.hpp"

 namespace ck {

--- a/include/composable_kernel/tensor_description/ConstantTensorDescriptor.hpp
+++ b/include/composable_kernel/tensor_description/ConstantTensorDescriptor.hpp
@@ -1,7 +1,7 @@
 #ifndef CK_CONSTANT_TENSOR_DESCRIPTOR_HPP
 #define CK_CONSTANT_TENSOR_DESCRIPTOR_HPP

-#include "common.hpp"
+#include "composable_kernel/utility/common.hpp"

 namespace ck {

--- a/include/composable_kernel/tensor_operation/blockwise_2d_tensor_op.hpp
+++ b/include/composable_kernel/tensor_operation/blockwise_2d_tensor_op.hpp
@@ -1,8 +1,8 @@
 #ifndef CK_BLOCKWISE_2D_TENSOR_OP_HPP
 #define CK_BLOCKWISE_2D_TENSOR_OP_HPP

-#include "common.hpp"
-#include "ConstantTensorDescriptor.hpp"
+#include "composable_kernel/utility/common.hpp"
+#include "composable_kernel/tensor_description/ConstantTensorDescriptor.hpp"

 namespace ck {

--- a/include/composable_kernel/tensor_operation/blockwise_3d_tensor_op.hpp
+++ b/include/composable_kernel/tensor_operation/blockwise_3d_tensor_op.hpp
@@ -1,8 +1,8 @@
 #ifndef CK_BLOCKWISE_3D_TENSOR_OP_HPP
 #define CK_BLOCKWISE_3D_TENSOR_OP_HPP

-#include "common.hpp"
-#include "ConstantTensorDescriptor.hpp"
+#include "composable_kernel/utility/common.hpp"
+#include "composable_kernel/tensor_description/ConstantTensorDescriptor.hpp"

 namespace ck {

--- a/include/composable_kernel/tensor_operation/blockwise_4d_tensor_op.hpp
+++ b/include/composable_kernel/tensor_operation/blockwise_4d_tensor_op.hpp
@@ -1,8 +1,8 @@
 #ifndef CK_BLOCKWISE_4D_TENSOR_OP_HPP
 #define CK_BLOCKWISE_4D_TENSOR_OP_HPP

-#include "ConstantTensorDescriptor.hpp"
-#include "threadwise_tensor_slice_copy.hpp"
+#include "composable_kernel/tensor_description/ConstantTensorDescriptor.hpp"
+#include "composable_kernel/tensor_operation/threadwise_tensor_slice_copy.hpp"

 namespace ck {

--- a/include/composable_kernel/tensor_operation/blockwise_batched_gemm.hpp
+++ b/include/composable_kernel/tensor_operation/blockwise_batched_gemm.hpp
@@ -1,7 +1,7 @@
 #ifndef CK_BLOCKWISE_BATCHED_GEMM_HPP
 #define CK_BLOCKWISE_BATCHED_GEMM_HPP

-#include "threadwise_gemm.hpp"
+#include "composable_kernel/tensor_operation/threadwise_gemm.hpp"

 namespace ck {

--- a/include/composable_kernel/tensor_operation/blockwise_gemm.hpp
+++ b/include/composable_kernel/tensor_operation/blockwise_gemm.hpp
@@ -1,8 +1,8 @@
 #ifndef CK_BLOCKWISE_GEMM_HPP
 #define CK_BLOCKWISE_GEMM_HPP

-#include "common.hpp"
-#include "threadwise_gemm.hpp"
+#include "composable_kernel/utility/common.hpp"
+#include "composable_kernel/tensor_operation/threadwise_gemm.hpp"

 namespace ck {

--- a/include/composable_kernel/tensor_operation/blockwise_generic_tensor_slice_copy.hpp
+++ b/include/composable_kernel/tensor_operation/blockwise_generic_tensor_slice_copy.hpp
@@ -1,7 +1,7 @@
 #ifndef CK_BLOCKWISE_GENERIC_TENSOR_SLICE_COPY_HPP
 #define CK_BLOCKWISE_GENERIC_TENSOR_SLICE_COPY_HPP

-#include "threadwise_generic_tensor_slice_copy.hpp"
+#include "composable_kernel/tensor_operation/threadwise_generic_tensor_slice_copy.hpp"

 namespace ck {

--- a/include/composable_kernel/tensor_operation/blockwise_tensor_slice_copy.hpp
+++ b/include/composable_kernel/tensor_operation/blockwise_tensor_slice_copy.hpp
@@ -1,7 +1,7 @@
 #ifndef CK_BLOCKWISE_TENSOR_SLICE_COPY_HPP
 #define CK_BLOCKWISE_TENSOR_SLICE_COPY_HPP

-#include "threadwise_tensor_slice_copy.hpp"
+#include "composable_kernel/tensor_operation/threadwise_tensor_slice_copy.hpp"

 namespace ck {

--- a/include/composable_kernel/tensor_operation/threadwise_4d_tensor_op.hpp
+++ b/include/composable_kernel/tensor_operation/threadwise_4d_tensor_op.hpp
@@ -1,7 +1,7 @@
 #ifndef CK_THREADWISE_4D_TENSOR_OP_HPP
 #define CK_THREADWISE_4D_TENSOR_OP_HPP

-#include "ConstantTensorDescriptor.hpp"
+#include "composable_kernel/tensor_description/ConstantTensorDescriptor.hpp"

 namespace ck {

--- a/include/composable_kernel/tensor_operation/threadwise_direct_convolution.hpp
+++ b/include/composable_kernel/tensor_operation/threadwise_direct_convolution.hpp
@@ -1,8 +1,8 @@
 #ifndef CK_THREADWISE_DIRECT_CONVOLUTION_HPP
 #define CK_THREADWISE_DIRECT_CONVOLUTION_HPP

-#include "ConstantTensorDescriptor.hpp"
-#include "threadwise_tensor_slice_copy.hpp"
+#include "composable_kernel/tensor_description/ConstantTensorDescriptor.hpp"
+#include "composable_kernel/tensor_operation/threadwise_tensor_slice_copy.hpp"

 namespace ck {

--- a/include/composable_kernel/tensor_operation/threadwise_gemm.hpp
+++ b/include/composable_kernel/tensor_operation/threadwise_gemm.hpp
@@ -1,8 +1,8 @@
 #ifndef CK_THREADWISE_GEMM_HPP
 #define CK_THREADWISE_GEMM_HPP

-#include "common.hpp"
-#include "ConstantMatrixDescriptor.hpp"
+#include "composable_kernel/utility/common.hpp"
+#include "composable_kernel/tensor_description/ConstantMatrixDescriptor.hpp"

 namespace ck {

--- a/include/composable_kernel/tensor_operation/threadwise_generic_tensor_op.hpp
+++ b/include/composable_kernel/tensor_operation/threadwise_generic_tensor_op.hpp
@@ -1,8 +1,8 @@
 #ifndef CK_THREADWISE_GENERIC_TENSOR_OP_HPP
 #define CK_THREADWISE_GENERIC_TENSOR_OP_HPP

-#include "ConstantTensorDescriptor.hpp"
-#include "ConstantMergedTensorDescriptor.hpp"
+#include "composable_kernel/tensor_description/ConstantTensorDescriptor.hpp"
+#include "composable_kernel/tensor_description/ConstantMergedTensorDescriptor.hpp"

 namespace ck {
 template <class Float, class TDesc>
--- a/include/composable_kernel/tensor_operation/threadwise_generic_tensor_slice_copy.hpp
+++ b/include/composable_kernel/tensor_operation/threadwise_generic_tensor_slice_copy.hpp
@@ -1,8 +1,8 @@
 #ifndef CK_THREADWISE_GENERIC_TENSOR_SLICE_COPY_HPP
 #define CK_THREADWISE_GENERIC_TENSOR_SLICE_COPY_HPP

-#include "ConstantTensorDescriptor.hpp"
-#include "ConstantMergedTensorDescriptor.hpp"
+#include "composable_kernel/tensor_description/ConstantTensorDescriptor.hpp"
+#include "composable_kernel/tensor_description/ConstantMergedTensorDescriptor.hpp"

 namespace ck {

--- a/include/composable_kernel/tensor_operation/threadwise_tensor_slice_copy.hpp
+++ b/include/composable_kernel/tensor_operation/threadwise_tensor_slice_copy.hpp
@@ -1,7 +1,7 @@
 #ifndef CK_THREADWISE_TENSOR_SLICE_COPY_HPP
 #define CK_THREADWISE_TENSOR_SLICE_COPY_HPP

-#include "ConstantTensorDescriptor.hpp"
+#include "composable_kernel/tensor_description/ConstantTensorDescriptor.hpp"

 namespace ck {

--- a/include/composable_kernel/utility/Array.hpp
+++ b/include/composable_kernel/utility/Array.hpp
@@ -1,8 +1,8 @@
 #ifndef CK_ARRAY_HPP
 #define CK_ARRAY_HPP

-#include "Sequence.hpp"
-#include "functional2.hpp"
+#include "composable_kernel/utility/Sequence.hpp"
+#include "composable_kernel/utility/functional2.hpp"

 namespace ck {

--- a/include/composable_kernel/utility/Sequence.hpp
+++ b/include/composable_kernel/utility/Sequence.hpp
@@ -1,8 +1,8 @@
 #ifndef CK_SEQUENCE_HPP
 #define CK_SEQUENCE_HPP

-#include "integral_constant.hpp"
-#include "functional.hpp"
+#include "composable_kernel/utility/integral_constant.hpp"
+#include "composable_kernel/utility/functional.hpp"

 namespace ck {

--- a/include/composable_kernel/utility/amd_inline_asm.hpp
+++ b/include/composable_kernel/utility/amd_inline_asm.hpp
@@ -1,7 +1,7 @@
 #ifndef CK_AMD_INLINE_ASM_HPP
 #define CK_AMD_INLINE_ASM_HPP

-#include "common.hpp"
+#include "composable_kernel/utility/vector_type.hpp"

 #define NO_VM_WAIT 0
 #define NO_LGKM_WAIT 0
--- a/include/composable_kernel/utility/common.hpp
+++ b/include/composable_kernel/utility/common.hpp
@@ -0,0 +1,24 @@
+#ifndef CK_COMMON_HPP
+#define CK_COMMON_HPP
+
+// Umbrella header: includes the composable_kernel utility headers listed below.
+
+// config.hpp is included directly so the CK_USE_AMD_INLINE_ASM check at the
+// bottom does not rely on vector_type.hpp pulling it in transitively.
+// NOTE(review): assumes config.hpp is where CK_USE_AMD_INLINE_ASM is defined -- confirm.
+#include "composable_kernel/utility/config.hpp"
+#include "composable_kernel/utility/utility.hpp"
+#include "composable_kernel/utility/vector_type.hpp"
+#include "composable_kernel/utility/integral_constant.hpp"
+#include "composable_kernel/utility/Sequence.hpp"
+#include "composable_kernel/utility/Array.hpp"
+#include "composable_kernel/utility/functional.hpp"
+#include "composable_kernel/utility/functional2.hpp"
+#include "composable_kernel/utility/functional3.hpp"
+
+// The inline-asm helpers are only included when CK_USE_AMD_INLINE_ASM is non-zero.
+#if CK_USE_AMD_INLINE_ASM
+#include "composable_kernel/utility/amd_inline_asm.hpp"
+#endif
+
+#endif
--- a/include/composable_kernel/utility/config.hpp.in
+++ b/include/composable_kernel/utility/config.hpp.in
--- a/include/composable_kernel/utility/functional.hpp
+++ b/include/composable_kernel/utility/functional.hpp
@@ -1,8 +1,8 @@
 #ifndef CK_FUNCTIONAL_HPP
 #define CK_FUNCTIONAL_HPP

-#include "integral_constant.hpp"
-#include "Sequence.hpp"
+#include "composable_kernel/utility/integral_constant.hpp"
+#include "composable_kernel/utility/Sequence.hpp"

 namespace ck {

@@ -38,7 +38,7 @@ struct static_if<true>
    __host__ __device__ constexpr auto operator()(F f) const
    {
        // This is a trick for compiler:
-        //   Pass forwarder to lambda "f" as "auto" argument, and maks sure "f" will use it,
+        //   Pass forwarder to lambda "f" as "auto" argument, and make sure "f" will use it,
        //   this will make "f" a generic lambda, so that "f" won't be compiled until being
        //   instantiated here
        f(forwarder{});
@@ -67,7 +67,7 @@ struct static_if<false>
    __host__ __device__ static constexpr auto Else(F f)
    {
        // This is a trick for compiler:
-        //   Pass forwarder to lambda "f" as "auto" argument, and maks sure "f" will use it,
+        //   Pass forwarder to lambda "f" as "auto" argument, and make sure "f" will use it,
        //   this will make "f" a generic lambda, so that "f" won't be compiled until being
        //   instantiated here
        f(forwarder{});
--- a/include/composable_kernel/utility/functional2.hpp
+++ b/include/composable_kernel/utility/functional2.hpp
@@ -1,8 +1,8 @@
 #ifndef CK_FUNCTIONAL2_HPP
 #define CK_FUNCTIONAL2_HPP

-#include "functional.hpp"
-#include "Sequence.hpp"
+#include "composable_kernel/utility/functional.hpp"
+#include "composable_kernel/utility/Sequence.hpp"

 namespace ck {

--- a/include/composable_kernel/utility/functional3.hpp
+++ b/include/composable_kernel/utility/functional3.hpp
@@ -1,10 +1,10 @@
 #ifndef CK_FUNCTIONAL3_HPP
 #define CK_FUNCTIONAL3_HPP

-#include "functional.hpp"
-#include "functional2.hpp"
-#include "Sequence.hpp"
-#include "Array.hpp"
+#include "composable_kernel/utility/functional.hpp"
+#include "composable_kernel/utility/functional2.hpp"
+#include "composable_kernel/utility/Sequence.hpp"
+#include "composable_kernel/utility/Array.hpp"

 namespace ck {

--- a/include/composable_kernel/utility/integral_constant.hpp
+++ b/include/composable_kernel/utility/integral_constant.hpp
--- a/include/composable_kernel/utility/utility.hpp
+++ b/include/composable_kernel/utility/utility.hpp
@@ -1,5 +1,5 @@
-#ifndef CK_BASE_HPP
-#define CK_BASE_HPP
+#ifndef CK_UTILITY_HPP
+#define CK_UTILITY_HPP

 namespace ck {

@@ -25,7 +25,8 @@ __host__ __device__ constexpr bool is_same_type(X, Y)
    return is_same<X, Y>::value;
 }

-namespace math { // namespace math
+namespace math {
+
 template <class T, T s>
 struct scales
 {
@@ -106,7 +107,7 @@ __host__ __device__ constexpr T min(T x, Ts... xs)
 }

 // this is wrong
-// TODO: implement correct least common multiple, instead of calling max()
+// TODO: implement least common multiple properly, instead of calling max()
 template <class T, class... Ts>
 __host__ __device__ constexpr T lcm(T x, Ts... xs)
 {
--- a/include/composable_kernel/utility/vector_type.hpp
+++ b/include/composable_kernel/utility/vector_type.hpp
@@ -1,8 +1,8 @@
 #ifndef CK_VECTOR_TYPE_HPP
 #define CK_VECTOR_TYPE_HPP

-#include "config.hpp"
-#include "integral_constant.hpp"
+#include "composable_kernel/utility/config.hpp"
+#include "composable_kernel/utility/integral_constant.hpp"

 namespace ck {

--- a/src/include/conv_common.hpp
+++ b/src/include/conv_common.hpp
@@ -1,7 +1,7 @@
 #ifndef CK_CONV_COMMON_HPP
 #define CK_CONV_COMMON_HPP

-#include "ConstantTensorDescriptor.hpp"
+#include "composable_kernel/tensor_description/ConstantTensorDescriptor.hpp"

 using namespace ck;

--- a/src/include/device.hpp
+++ b/src/include/device.hpp
@@ -2,7 +2,7 @@
 #define CK_DEVICE_HPP

 #include <memory>
-#include "config.hpp"
+#include "composable_kernel/utility/config.hpp"

 using namespace ck;

--- a/src/include/gridwise_convolution_kernel_wrapper.hpp
+++ b/src/include/gridwise_convolution_kernel_wrapper.hpp
@@ -1,8 +1,6 @@
 #ifndef CK_GRIDWISE_CONVOLUTION_KERNEL_WRAPPER
 #define CK_GRIDWISE_CONVOLUTION_KERNEL_WRAPPER

-namespace ck {
-
 template <class GridwiseConvolution, class T>
 __global__ void run_gridwise_convolution_kernel(const T* const __restrict__ p_in_global,
                                                const T* const __restrict__ p_wei_global,
@@ -11,6 +9,4 @@ __global__ void run_gridwise_convolution_kernel(const T* const __restrict__ p_in
    GridwiseConvolution{}.Run(p_in_global, p_wei_global, p_out_global);
 }

-} // namespace ck
-
 #endif
--- a/src/include/tensor.hpp
+++ b/src/include/tensor.hpp
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -1,4 +1,4 @@
-configure_file("${PROJECT_SOURCE_DIR}/src/include/config.hpp.in" "${PROJECT_BINARY_DIR}/src/include/config.hpp")
+configure_file("${PROJECT_SOURCE_DIR}/include/composable_kernel/utility/config.hpp.in" "${PROJECT_BINARY_DIR}/include/composable_kernel/utility/config.hpp")

 set(TENSOR_SOURCE 
    tensor.cpp;
@@ -9,7 +9,6 @@ add_library(tensor SHARED ${TENSOR_SOURCE})
 target_compile_features(tensor PUBLIC)
 set_target_properties(tensor PROPERTIES POSITION_INDEPENDENT_CODE ON)

-
 if(DEVICE_BACKEND STREQUAL "CUDA")
    target_link_libraries(tensor nvToolsExt cudart)
 endif()
--- a/src/device.cpp
+++ b/src/device.cpp
@@ -1,4 +1,4 @@
-#include "config.hpp"
+#include "composable_kernel/utility/config.hpp"
 #include "device.hpp"

 DeviceMem::DeviceMem(std::size_t mem_size) : mMemSize(mem_size)
--- a/src/include/common.hpp
+++ b/src/include/common.hpp
@@ -1,17 +0,0 @@
-#ifndef CK_COMMON_HPP
-#define CK_COMMON_HPP
-
-#include "utility.hpp"
-#include "vector_type.hpp"
-#include "integral_constant.hpp"
-#include "Sequence.hpp"
-#include "Array.hpp"
-#include "functional.hpp"
-#include "functional2.hpp"
-#include "functional3.hpp"
-
-#if CK_USE_AMD_INLINE_ASM
-#include "amd_inline_asm.hpp"
-#endif
-
-#endif