diff --git a/CMakeLists.txt b/CMakeLists.txt
index 55727cc7c9..9798220caa 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -33,13 +33,9 @@ endif( NOT( ${CMAKE_CXX_COMPILER_ID} STREQUAL "AppleClang") )
 
 #GPU backend
 if(DEVICE_BACKEND STREQUAL "AMD")
-    set(CK_DEVICE_BACKEND_AMD 1)
-
     set(CMAKE_MODULE_PATH "/opt/rocm/hip/cmake" ${CMAKE_MODULE_PATH})
     find_package(HIP REQUIRED)
 elseif(DEVICE_BACKEND STREQUAL "NVIDIA")
-    set(CK_DEVICE_BACKEND_NVIDIA 1)
-
 	enable_language(CUDA)
 	include_directories(BEFORE ${CUDA_COMMON_INCLUDE_DIR})
 endif()
diff --git a/composable_kernel/include/tensor_operation/blockwise_gemm.hpp b/composable_kernel/include/tensor_operation/blockwise_gemm.hpp
index 2052747c65..923e8c7209 100644
--- a/composable_kernel/include/tensor_operation/blockwise_gemm.hpp
+++ b/composable_kernel/include/tensor_operation/blockwise_gemm.hpp
@@ -54,9 +54,9 @@ struct BlockwiseGemmBlockABlockBThreadCTransANormalBNormalC_v2
                           N % (NPerThreadSubC * NLevel0Cluster * NLevel1Cluster) == 0,
                       "wrong! Cannot evenly divide work among\n");
 
-        static_assert(std::is_same<decltype(ThreadMatrixC::GetLengths()),
-                                   decltype(GetThreadMatrixCLengths())>{},
-                      "wrong! ThreadMatrixC lengths is wrong");
+        static_assert(
+            is_same<decltype(ThreadMatrixC::GetLengths()), decltype(GetThreadMatrixCLengths())>{},
+            "wrong! ThreadMatrixC lengths is wrong");
 
         auto c_thread_mtx_index = GetBeginOfThreadMatrixC(get_thread_local_1d_id());
 
diff --git a/composable_kernel/include/utility/amd_inline_asm.hpp b/composable_kernel/include/utility/amd_inline_asm.hpp
index f973a41b0d..e7e20808e1 100644
--- a/composable_kernel/include/utility/amd_inline_asm.hpp
+++ b/composable_kernel/include/utility/amd_inline_asm.hpp
@@ -6,7 +6,7 @@
 namespace ck {
 
 // cast a pointer of LDS to its address
-extern "C" __attribute__((address_space(3))) void* __to_local(void* p)[[hc]];
+extern "C" __attribute__((address_space(3))) __device__ void* __to_local(void* p);
 
 __device__ void vmcnt(index_t cnt)
 {
diff --git a/composable_kernel/include/utility/common_header.hpp b/composable_kernel/include/utility/common_header.hpp
index 1c8dcbd521..339666ff91 100644
--- a/composable_kernel/include/utility/common_header.hpp
+++ b/composable_kernel/include/utility/common_header.hpp
@@ -1,18 +1,15 @@
-#ifndef CK_COMMON_HPP
-#define CK_COMMON_HPP
+#ifndef CK_COMMON_HEADER_HPP
+#define CK_COMMON_HEADER_HPP
 
 #include "config.hpp"
+#include "integral_constant.hpp"
+#include "math.hpp"
 #include "utility.hpp"
 #include "vector_type.hpp"
-#include "integral_constant.hpp"
 #include "Sequence.hpp"
 #include "Array.hpp"
 #include "functional.hpp"
 #include "functional2.hpp"
 #include "functional3.hpp"
 
-#if CK_USE_AMD_INLINE_ASM
-#include "amd_inline_asm.hpp"
-#endif
-
 #endif
diff --git a/composable_kernel/include/utility/config_amd.hpp.in b/composable_kernel/include/utility/config_amd.hpp.in
index ea8a3356d6..f9ae2e7830 100644
--- a/composable_kernel/include/utility/config_amd.hpp.in
+++ b/composable_kernel/include/utility/config_amd.hpp.in
@@ -1,12 +1,11 @@
 #ifndef CK_CONFIG_AMD_HPP
 #define CK_CONFIG_AMD_HPP
 
-#cmakedefine01 CK_DEVICE_BACKEND_AMD
-
 #include "hip/hip_runtime.h"
 #include "hip/hip_fp16.h"
-#define CK_USE_AMD_INLINE_ASM 1
 
+#define CK_DEVICE_BACKEND_AMD 1
+#define CK_USE_AMD_INLINE_ASM 1
 #define CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_BLOCKWISE_GENERIC_SLICE_COPY_V1 1
 #define CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V1 0
 
diff --git a/composable_kernel/include/utility/config_nvidia.hpp.in b/composable_kernel/include/utility/config_nvidia.hpp.in
index 038448e9d1..f077db991c 100644
--- a/composable_kernel/include/utility/config_nvidia.hpp.in
+++ b/composable_kernel/include/utility/config_nvidia.hpp.in
@@ -1,14 +1,13 @@
 #ifndef CK_CONFIG_NVIDIA_HPP
 #define CK_CONFIG_NVIDIA_HPP
 
-#cmakedefine01 CK_DEVICE_BACKEND_NVIDIA
-
 #include "cuda_runtime.h"
 #include "cuda_fp16.h"
 #include "nvToolsExt.h"
 #include "helper_cuda.h"
-#define CK_USE_AMD_INLINE_ASM 0
 
+#define CK_DEVICE_BACKEND_NVIDIA 1
+#define CK_USE_AMD_INLINE_ASM 0
 #define CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_BLOCKWISE_GENERIC_SLICE_COPY_V1 0
 #define CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V1 0
 
@@ -23,10 +22,6 @@ using float4_t = float4;
 
 using index_t = uint32_t;
 
-__device__ index_t get_thread_local_1d_id() { return threadIdx.x; }
-
-__device__ index_t get_block_1d_id() { return blockIdx.x; }
-
 __device__ void fused_multiply_accumulate(float& d, const float& s0, const float& s1)
 {
     d += s0 * s1;
@@ -52,9 +47,7 @@ __device__ void fused_multiply_accumulate(char& d, const char& s0, const char& s
 //  need to make a better interface
 __device__ void fused_multiply_accumulate(int32_t& d, const int32_t& s0, const int32_t& s1)
 {
-#if CK_DEVICE_BACKEND_NVIDIA
     d = __dp4a(s0, s1, d);
-#endif
 }
 #endif
 
diff --git a/composable_kernel/include/utility/integral_constant.hpp b/composable_kernel/include/utility/integral_constant.hpp
index 6038b1b203..b83d7843d5 100644
--- a/composable_kernel/include/utility/integral_constant.hpp
+++ b/composable_kernel/include/utility/integral_constant.hpp
@@ -1,12 +1,20 @@
 #ifndef CK_INTEGRAL_CONSTANT_HPP
 #define CK_INTEGRAL_CONSTANT_HPP
 
-#include <type_traits>
-
 namespace ck {
 
 template <class T, T v>
-using integral_constant = std::integral_constant<T, v>;
+struct integral_constant
+{
+    static constexpr T value = v;
+    typedef T value_type;
+    typedef integral_constant type; // using injected-class-name
+    __host__ __device__ constexpr operator value_type() const noexcept { return value; }
+    __host__ __device__ constexpr value_type operator()() const noexcept
+    {
+        return value;
+    } // since c++14
+};
 
 template <class T, T X, T Y>
 __host__ __device__ constexpr auto operator+(integral_constant<T, X>, integral_constant<T, Y>)
@@ -23,5 +31,15 @@ __host__ __device__ constexpr auto operator*(integral_constant<T, X>, integral_c
 template <index_t N>
 using Number = integral_constant<index_t, N>;
 
+template <class X, class Y>
+struct is_same : public integral_constant<bool, false>
+{
+};
+
+template <class X>
+struct is_same<X, X> : public integral_constant<bool, true>
+{
+};
+
 } // namespace ck
 #endif
diff --git a/composable_kernel/include/utility/math.hpp b/composable_kernel/include/utility/math.hpp
new file mode 100644
index 0000000000..b754ca173c
--- /dev/null
+++ b/composable_kernel/include/utility/math.hpp
@@ -0,0 +1,107 @@
+#ifndef CK_MATH_HPP
+#define CK_MATH_HPP
+
+#include "config.hpp"
+
+namespace ck {
+namespace math {
+
+template <class T, T s>
+struct scales
+{
+    __host__ __device__ constexpr T operator()(T a) const { return s * a; }
+};
+
+template <class T>
+struct plus
+{
+    __host__ __device__ constexpr T operator()(T a, T b) const { return a + b; }
+};
+
+template <class T>
+struct minus
+{
+    __host__ __device__ constexpr T operator()(T a, T b) const { return a - b; }
+};
+
+template <class T>
+struct multiplies
+{
+    __host__ __device__ constexpr T operator()(T a, T b) const { return a * b; }
+};
+
+template <class T>
+struct integer_divide_ceiler
+{
+    __host__ __device__ constexpr T operator()(T a, T b) const
+    {
+        static_assert(is_same<T, index_t>{} || is_same<T, int>{}, "wrong type");
+
+        return (a + b - 1) / b;
+    }
+};
+
+template <class T>
+__host__ __device__ constexpr T integer_divide_ceil(T a, T b)
+{
+    static_assert(is_same<T, index_t>{} || is_same<T, int>{}, "wrong type");
+
+    return (a + b - 1) / b;
+}
+
+template <class T>
+__host__ __device__ constexpr T integer_least_multiple(T a, T b)
+{
+    static_assert(is_same<T, index_t>{} || is_same<T, int>{}, "wrong type");
+
+    return b * integer_divide_ceil(a, b);
+}
+
+template <class T>
+__host__ __device__ constexpr T max(T x)
+{
+    return x;
+}
+
+template <class T, class... Ts>
+__host__ __device__ constexpr T max(T x, Ts... xs)
+{
+    static_assert(sizeof...(xs) > 0, "not enough argument");
+
+    auto y = max(xs...);
+
+    static_assert(is_same<decltype(y), T>{}, "not the same type");
+
+    return x > y ? x : y;
+}
+
+template <class T>
+__host__ __device__ constexpr T min(T x)
+{
+    return x;
+}
+
+template <class T, class... Ts>
+__host__ __device__ constexpr T min(T x, Ts... xs)
+{
+    static_assert(sizeof...(xs) > 0, "not enough argument");
+
+    auto y = min(xs...);
+
+    static_assert(is_same<decltype(y), T>{}, "not the same type");
+
+    return x < y ? x : y;
+}
+
+// this is WRONG
+// TODO: implement least common multiple properly, instead of calling max()
+template <class T, class... Ts>
+__host__ __device__ constexpr T lcm(T x, Ts... xs)
+{
+    return max(x, xs...);
+}
+
+} // namespace math
+} // namspace ck
+
+#endif
diff --git a/composable_kernel/include/utility/utility.hpp b/composable_kernel/include/utility/utility.hpp
index e873881561..4cb2daa7c8 100644
--- a/composable_kernel/include/utility/utility.hpp
+++ b/composable_kernel/include/utility/utility.hpp
@@ -1,112 +1,14 @@
 #ifndef CK_UTILITY_HPP
 #define CK_UTILITY_HPP
 
-#include <type_traits>
 #include "config.hpp"
 
 namespace ck {
 
-template <class X, class Y>
-using is_same = std::is_same<X, Y>;
+__device__ index_t get_thread_local_1d_id() { return threadIdx.x; }
 
-namespace math {
+__device__ index_t get_block_1d_id() { return blockIdx.x; }
 
-template <class T, T s>
-struct scales
-{
-    __host__ __device__ constexpr T operator()(T a) const { return s * a; }
-};
-
-template <class T>
-struct plus
-{
-    __host__ __device__ constexpr T operator()(T a, T b) const { return a + b; }
-};
-
-template <class T>
-struct minus
-{
-    __host__ __device__ constexpr T operator()(T a, T b) const { return a - b; }
-};
-
-template <class T>
-struct multiplies
-{
-    __host__ __device__ constexpr T operator()(T a, T b) const { return a * b; }
-};
-
-template <class T>
-struct integer_divide_ceiler
-{
-    __host__ __device__ constexpr T operator()(T a, T b) const
-    {
-        static_assert(is_same<T, index_t>{} || is_same<T, int>{}, "wrong type");
-
-        return (a + b - 1) / b;
-    }
-};
-
-template <class T>
-__host__ __device__ constexpr T integer_divide_ceil(T a, T b)
-{
-    static_assert(is_same<T, index_t>{} || is_same<T, int>{}, "wrong type");
-
-    return (a + b - 1) / b;
-}
-
-template <class T>
-__host__ __device__ constexpr T integer_least_multiple(T a, T b)
-{
-    static_assert(is_same<T, index_t>{} || is_same<T, int>{}, "wrong type");
-
-    return b * integer_divide_ceil(a, b);
-}
-
-template <class T>
-__host__ __device__ constexpr T max(T x)
-{
-    return x;
-}
-
-template <class T, class... Ts>
-__host__ __device__ constexpr T max(T x, Ts... xs)
-{
-    static_assert(sizeof...(xs) > 0, "not enough argument");
-
-    auto y = max(xs...);
-
-    static_assert(is_same<decltype(y), T>{}, "not the same type");
-
-    return x > y ? x : y;
-}
-
-template <class T>
-__host__ __device__ constexpr T min(T x)
-{
-    return x;
-}
-
-template <class T, class... Ts>
-__host__ __device__ constexpr T min(T x, Ts... xs)
-{
-    static_assert(sizeof...(xs) > 0, "not enough argument");
-
-    auto y = min(xs...);
-
-    static_assert(is_same<decltype(y), T>{}, "not the same type");
-
-    return x < y ? x : y;
-}
-
-// this is WRONG
-// TODO: implement least common multiple properly, instead of calling max()
-template <class T, class... Ts>
-__host__ __device__ constexpr T lcm(T x, Ts... xs)
-{
-    return max(x, xs...);
-}
-
-} // namespace math
 } // namspace ck
 
 #endif