diff --git a/src/include/ConstantMatrixDescriptor.hip.hpp b/src/include/ConstantMatrixDescriptor.hip.hpp
index 9cacf27553..c6ca3192e3 100644
--- a/src/include/ConstantMatrixDescriptor.hip.hpp
+++ b/src/include/ConstantMatrixDescriptor.hip.hpp
@@ -23,11 +23,7 @@ struct ConstantMatrixDescriptor
 
     __host__ __device__ index_t Get1dIndex(index_t irow, index_t icol) const
     {
-#if DEVICE_BACKEND_HIP
-        return __mul24(irow, RowStride_) + icol;
-#else
         return irow * RowStride_ + icol;
-#endif
     }
 
     template
diff --git a/src/include/blockwise_2d_tensor_op.hip.hpp b/src/include/blockwise_2d_tensor_op.hip.hpp
index 8d457056a2..a5c8f1ea9b 100644
--- a/src/include/blockwise_2d_tensor_op.hip.hpp
+++ b/src/include/blockwise_2d_tensor_op.hip.hpp
@@ -668,7 +668,7 @@ struct Blockwise2dTensorCopy3
         }
     }
 
-#if DEVICE_BACKEND_HIP
+#if USE_AMD_INLINE_ASM
     __device__ void RunLoadRegisterClipboard_asm(const Float* __restrict__ p_src,
                                                  Float* p_clipboard) const
     {
diff --git a/src/include/blockwise_batched_gemm.hip.hpp b/src/include/blockwise_batched_gemm.hip.hpp
index b50acfe0d4..98f36011f3 100644
--- a/src/include/blockwise_batched_gemm.hip.hpp
+++ b/src/include/blockwise_batched_gemm.hip.hpp
@@ -283,7 +283,7 @@ struct BlockwiseBatchGemmBlockABlockBThreadCTransANormalBNormalC_V2
         }
     }
 
-#if DEVICE_BACKEND_HIP
+#if USE_AMD_INLINE_ASM
     template <class FloatA, class FloatB, class FloatC>
     __device__ void Run_asm(const FloatA* __restrict__ p_a_block,
                             const FloatB* __restrict__ p_b_block,
diff --git a/src/include/blockwise_gemm.hip.hpp b/src/include/blockwise_gemm.hip.hpp
index 4870227f7e..582f45f7ea 100644
--- a/src/include/blockwise_gemm.hip.hpp
+++ b/src/include/blockwise_gemm.hip.hpp
@@ -126,7 +126,7 @@ struct BlockwiseGemmBlockABlockBThreadCTransANormalBNormalC_v2
             n_repeat * NPerLevel1Cluster + n_in_sub_c};
     }
 
-#if DEVICE_BACKEND_HIP
+#if USE_AMD_INLINE_ASM
     // TODO: this is not working correctly
     template <class FloatA, class FloatB, class FloatC>
    __device__ void Run_asm(const FloatA* __restrict__ p_a_block,
diff --git a/src/include/common.hip.hpp b/src/include/common.hip.hpp
index 5f99872d8e..bf7249ca70 100644
--- a/src/include/common.hip.hpp
+++ b/src/include/common.hip.hpp
@@ -1,12 +1,12 @@
 #pragma once
 
-#include "data_type.hip.hpp"
+#include "vector_type.hip.hpp"
 #include "constant_integral.hip.hpp"
 #include "Sequence.hip.hpp"
 #include "Array.hip.hpp"
 #include "functional.hip.hpp"
 #include "functional2.hip.hpp"
 
-#if DEVICE_BACKEND_HIP
+#if USE_AMD_INLINE_ASM
 #include "amd_inline_asm.hip.hpp"
 #endif
diff --git a/src/include/config.h.in b/src/include/config.h.in
index ce3232489d..c97d71def2 100644
--- a/src/include/config.h.in
+++ b/src/include/config.h.in
@@ -5,11 +5,56 @@
 #if DEVICE_BACKEND_HIP
 #include "hip/hip_runtime.h"
 #include "hip/hip_fp16.h"
+#define USE_AMD_INLINE_ASM 1
+
+// For some reason, the HIP compiler needs these definitions to generate optimal load and
+// store instructions
+typedef float float2_t __attribute__((ext_vector_type(2)));
+typedef float float4_t __attribute__((ext_vector_type(4)));
 #elif DEVICE_BACKEND_CUDA
 #include "cuda_runtime.h"
 #include "cuda_fp16.h"
 #include "nvToolsExt.h"
 #include "helper_cuda.h"
+#define USE_AMD_INLINE_ASM 0
+
+// For some reason, CUDA needs these definitions; otherwise the
+// compiler won't generate optimal load and store instructions, and the
+// kernel would produce wrong results, indicating the compiler failed to
+// generate correct instructions
+using float2_t = float2;
+using float4_t = float4;
 #endif
 
 using index_t = uint32_t;
+
+__device__ void fused_multiply_accumulate(float& d, const float& s0, const float& s1)
+{
+    d += s0 * s1;
+}
+
+#if 0
+__device__ void fused_multiply_accumulate(half& d, const half& s0, const half& s1) { d += s0 * s1; }
+
+__device__ void fused_multiply_accumulate(half& d, const half2& s0, const half2& s1)
+{
+    d += s0.x * s1.x;
+    d += s0.y * s1.y;
+}
+
+__device__ void fused_multiply_accumulate(float& d, const half2& s0, const half2& s1)
+{
+    d += s0.x * s1.x + s0.y * s1.y;
+}
+
+__device__ void fused_multiply_accumulate(char& d, const char& s0, const char& s1) { d += s0 * s1; }
+
+// TODO: this interface is misleading; s0 and s1 are actually int8x4.
+// Need a better interface.
+__device__ void fused_multiply_accumulate(int32_t& d, const int32_t& s0, const int32_t& s1)
+{
+#if DEVICE_BACKEND_CUDA
+    d = __dp4a(s0, s1, d);
+#endif
+}
+#endif
diff --git a/src/include/data_type.hip.hpp b/src/include/vector_type.hip.hpp
similarity index 59%
rename from src/include/data_type.hip.hpp
rename to src/include/vector_type.hip.hpp
index b341609a39..d2d3db92ec 100644
--- a/src/include/data_type.hip.hpp
+++ b/src/include/vector_type.hip.hpp
@@ -23,17 +23,7 @@ struct vector_type
 template <>
 struct vector_type<float, 2>
 {
-#if DEVICE_BACKEND_HIP
-    // For some reason, HIP compiler need this definition to generate optimal load and store
-    // instruction
-    typedef float MemoryType __attribute__((ext_vector_type(2)));
-#elif DEVICE_BACKEND_CUDA
-    // For some reason, CUDA need this definition, otherwise
-    // compiler won't generate optimal load and store instruction, and
-    // kernel would produce wrong result, indicating the compiler fail to generate correct
-    // instruction,
-    using MemoryType = float2;
-#endif
+    using MemoryType = float2_t;
 
     union Data
     {
@@ -60,17 +50,7 @@ struct vector_type
 template <>
 struct vector_type<float, 4>
 {
-#if DEVICE_BACKEND_HIP
-    // For some reason, HIP compiler need this definition to generate optimal load and store
-    // instruction
-    typedef float MemoryType __attribute__((ext_vector_type(4)));
-#elif DEVICE_BACKEND_CUDA
-    // For some reason, CUDA need this definition, otherwise
-    // compiler won't generate optimal load and store instruction, and
-    // kernel would produce wrong result, indicating the compiler fail to generate correct
-    // instruction,
-    using MemoryType = float4;
-#endif
+    using MemoryType = float4_t;
 
     template <index_t I>
     __host__ __device__ static void SetScalar(MemoryType& v, float s, Number<I>)
@@ -204,48 +184,3 @@ struct vector_type
     using MemoryType = int64_t;
 };
 #endif
-
-__device__ void fused_multiply_accumulate(float& d, const float& s0, const float& s1)
-{
-    d += s0 * s1;
-}
-
-__device__ void fused_multiply_accumulate(float& d, const float2& s0, const float2& s1)
-{
-    d += s0.x * s1.x;
-    d += s0.y * s1.y;
-}
-
-__device__ void fused_multiply_accumulate(float& d, const float4& s0, const float4& s1)
-{
-    d += s0.x * s1.x;
-    d += s0.y * s1.y;
-    d += s0.z * s1.z;
-    d += s0.w * s1.w;
-}
-
-#if 0
-__device__ void fused_multiply_accumulate(half& d, const half& s0, const half& s1) { d += s0 * s1; }
-
-__device__ void fused_multiply_accumulate(half& d, const half2& s0, const half2& s1)
-{
-    d += s0.x * s1.x;
-    d += s0.y * s1.y;
-}
-
-__device__ void fused_multiply_accumulate(float& d, const half2& s0, const half2& s1)
-{
-    d += s0.x * s1.x + s0.y * s1.y;
-}
-
-__device__ void fused_multiply_accumulate(char& d, const char& s0, const char& s1) { d += s0 * s1; }
-
-// TODO:: this interface is misleading, s0, s1 are actually int8x4
-// need to make a better interface
-__device__ void fused_multiply_accumulate(int32_t& d, const int32_t& s0, const int32_t& s1)
-{
-#if DEVICE_BACKEND_CUDA
-    d = __dp4a(s0, s1, d);
-#endif
-}
-#endif
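
Reviewer note: below is a minimal sketch, not part of the patch, of how the definitions consolidated into config.h.in are expected to compose after this change. The kernel name and the assumption that the generated config header is reachable through common.hip.hpp are illustrative only.

// Sketch for review purposes -- not part of this patch.
// Assumes the configured header generated from config.h.in is pulled in by
// common.hip.hpp, so float4_t, index_t, and fused_multiply_accumulate are
// all visible here.
#include "common.hip.hpp"

__global__ void vector_load_fma_sketch(const float4_t* __restrict__ p_a,
                                       const float4_t* __restrict__ p_b,
                                       float* __restrict__ p_c)
{
    const index_t i = blockIdx.x * blockDim.x + threadIdx.x;

    // float4_t is a Clang ext_vector_type on the HIP backend and a plain
    // float4 on the CUDA backend; either way, each assignment below should
    // lower to a single 128-bit global load.
    const float4_t a = p_a[i];
    const float4_t b = p_b[i];

    // Only the scalar float overload of fused_multiply_accumulate remains
    // active after this patch (the float2/float4 overloads were dropped),
    // so the per-thread dot product is accumulated component-wise.
    float c = p_c[i];
    fused_multiply_accumulate(c, a.x, b.x);
    fused_multiply_accumulate(c, a.y, b.y);
    fused_multiply_accumulate(c, a.z, b.z);
    fused_multiply_accumulate(c, a.w, b.w);

    p_c[i] = c;
}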