diff --git a/CMakeLists.txt b/CMakeLists.txt
index fbbafb877d..1cbca2bb77 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -32,13 +32,13 @@ if( NOT( ${CMAKE_CXX_COMPILER_ID} STREQUAL "AppleClang") )
 endif( NOT( ${CMAKE_CXX_COMPILER_ID} STREQUAL "AppleClang") )
 
 #GPU backend
-if(DEVICE_BACKEND STREQUAL "HIP")
-    set(DEVICE_BACKEND_HIP 1)
+if(DEVICE_BACKEND STREQUAL "AMD")
+    set(CK_DEVICE_BACKEND_AMD 1)
 
     set(CMAKE_MODULE_PATH "/opt/rocm/hip/cmake" ${CMAKE_MODULE_PATH})
     find_package(HIP REQUIRED)
-elseif(DEVICE_BACKEND STREQUAL "CUDA")
-    set(DEVICE_BACKEND_CUDA 1)
+elseif(DEVICE_BACKEND STREQUAL "NVIDIA")
+    set(CK_DEVICE_BACKEND_NVIDIA 1)
 
 	enable_language(CUDA)
 	include_directories(BEFORE ${CUDA_COMMON_INCLUDE_DIR})
diff --git a/driver/CMakeLists.txt b/driver/CMakeLists.txt
index 4a7a81a36a..f9528ae211 100644
--- a/driver/CMakeLists.txt
+++ b/driver/CMakeLists.txt
@@ -1,6 +1,6 @@
-if(DEVICE_BACKEND STREQUAL "HIP")
+if(DEVICE_BACKEND STREQUAL "AMD")
     set(DRIVER_SOURCE driver.cpp)
-elseif(DEVICE_BACKEND STREQUAL "CUDA")
+elseif(DEVICE_BACKEND STREQUAL "NVIDIA")
     set(DRIVER_SOURCE driver.cu)
 endif()
 
diff --git a/include/composable_kernel/utility/config_amd.hpp.in b/include/composable_kernel/utility/config_amd.hpp.in
new file mode 100644
index 0000000000..d6ac44ab24
--- /dev/null
+++ b/include/composable_kernel/utility/config_amd.hpp.in
@@ -0,0 +1,41 @@
+#ifndef CK_CONFIG_HPP
+#define CK_CONFIG_HPP
+
+#cmakedefine01 CK_DEVICE_BACKEND_AMD
+
+#include "hip/hip_runtime.h"
+#include "hip/hip_fp16.h"
+#define CK_USE_AMD_INLINE_ASM 1
+
+namespace ck {
+
+// For some reason, HIP compiler need this definition to generate optimal load and store
+// instruction
+typedef float float2_t __attribute__((ext_vector_type(2)));
+typedef float float4_t __attribute__((ext_vector_type(4)));
+
+using index_t = uint32_t;
+
+__device__ void fused_multiply_accumulate(float& d, const float& s0, const float& s1)
+{
+    d += s0 * s1;
+}
+
+#if 0
+__device__ void fused_multiply_accumulate(half& d, const half& s0, const half& s1) { d += s0 * s1; }
+
+__device__ void fused_multiply_accumulate(half& d, const half2& s0, const half2& s1)
+{
+    d += s0.x * s1.x;
+    d += s0.y * s1.y;
+}
+
+__device__ void fused_multiply_accumulate(float& d, const half2& s0, const half2& s1)
+{
+    d += s0.x * s1.x + s0.y * s1.y;
+}
+#endif
+
+} // namespace ck
+
+#endif
diff --git a/include/composable_kernel/utility/config.hpp.in b/include/composable_kernel/utility/config_nvidia.hpp.in
similarity index 70%
rename from include/composable_kernel/utility/config.hpp.in
rename to include/composable_kernel/utility/config_nvidia.hpp.in
index 1bd7500c12..a4d4bcaae7 100644
--- a/include/composable_kernel/utility/config.hpp.in
+++ b/include/composable_kernel/utility/config_nvidia.hpp.in
@@ -1,37 +1,22 @@
-#ifndef CK_CONFIG_HPP
-#define CK_CONFIG_HPP
+#ifndef CK_CONFIG_CUDA_HPP
+#define CK_CONFIG_CUDA_HPP
 
-#cmakedefine01 DEVICE_BACKEND_HIP
-#cmakedefine01 DEVICE_BACKEND_CUDA
+#cmakedefine01 CK_DEVICE_BACKEND_NVIDIA
 
-#if DEVICE_BACKEND_HIP
-#include "hip/hip_runtime.h"
-#include "hip/hip_fp16.h"
-#define CK_USE_AMD_INLINE_ASM 1
-
-#elif DEVICE_BACKEND_CUDA
 #include "cuda_runtime.h"
 #include "cuda_fp16.h"
 #include "nvToolsExt.h"
 #include "helper_cuda.h"
 #define CK_USE_AMD_INLINE_ASM 0
-#endif
 
 namespace ck {
 
-#if DEVICE_BACKEND_HIP
-// For some reason, HIP compiler need this definition to generate optimal load and store
-// instruction
-typedef float float2_t __attribute__((ext_vector_type(2)));
-typedef float float4_t __attribute__((ext_vector_type(4)));
-#else
 // For some reason, CUDA need this definition, otherwise
 //   compiler won't generate optimal load and store instruction, and
 //   kernel would produce wrong result, indicating the compiler fail to generate correct
 //   instruction,
 using float2_t = float2;
 using float4_t = float4;
-#endif
 
 using index_t = uint32_t;
 
@@ -60,7 +45,7 @@ __device__ void fused_multiply_accumulate(char& d, const char& s0, const char& s
 //  need to make a better interface
 __device__ void fused_multiply_accumulate(int32_t& d, const int32_t& s0, const int32_t& s1)
 {
-#if DEVICE_BACKEND_CUDA
+#if CK_DEVICE_BACKEND_NVIDIA
     d = __dp4a(s0, s1, d);
 #endif
 }
diff --git a/include/device.hpp b/include/device.hpp
index 2c982a4012..59ab0d0bcd 100644
--- a/include/device.hpp
+++ b/include/device.hpp
@@ -37,7 +37,7 @@ float launch_kernel(F kernel, dim3 grid_dim, dim3 block_dim, std::size_t lds_byt
 {
     KernelTimer timer;
 
-#if DEVICE_BACKEND_HIP
+#if CK_DEVICE_BACKEND_AMD
     timer.Start();
 
     hipLaunchKernelGGL(kernel, grid_dim, block_dim, lds_byte, 0, args...);
@@ -45,7 +45,7 @@ float launch_kernel(F kernel, dim3 grid_dim, dim3 block_dim, std::size_t lds_byt
     timer.End();
 
     hipGetErrorString(hipGetLastError());
-#elif DEVICE_BACKEND_CUDA
+#elif CK_DEVICE_BACKEND_NVIDIA
     const void* f  = reinterpret_cast<const void*>(kernel);
     void* p_args[] = {&args...};
 
diff --git a/script/cmake-cuda.sh b/script/cmake-cuda.sh
index 0e7d00c469..7feb67e213 100755
--- a/script/cmake-cuda.sh
+++ b/script/cmake-cuda.sh
@@ -12,7 +12,7 @@ cmake
 -D CMAKE_CXX_COMPILER=clang++                                                               \
 -D CMAKE_BUILD_TYPE=Release                                                                 \
 -D CMAKE_VERBOSE_MAKEFILE:BOOL=ON                                                           \
--D DEVICE_BACKEND=CUDA                                                                      \
+-D DEVICE_BACKEND=NVIDIA                                                                    \
 -D BOOST_ROOT="/package/install/boost_1.67.0"                                               \
 -D CUDA_COMMON_INCLUDE_DIR="/home/chao/code/test_feature/cuda_common/cuda_10.0_common/inc"  \
 -D CMAKE_CUDA_FLAGS="-ccbin clang++ -m64 -Xcompiler -fopenmp -lineinfo --source-in-ptx -keep -Xptxas -v -gencode=arch=compute_61,code=sm_61 -Xptxas -v -Xptxas -v -maxrregcount=128" \
diff --git a/script/cmake-hip.sh b/script/cmake-hip.sh
index 3880c67a17..959582ffcf 100755
--- a/script/cmake-hip.sh
+++ b/script/cmake-hip.sh
@@ -4,16 +4,16 @@ rm -f CMakeCache.txt
 rm -f *.cmake
 rm -rf CMakeFiles
 
-MY_PROJECT_SOURCE=/home/chao/code/modular_convolution
+MY_PROJECT_SOURCE=../../../
 MY_PROJECT_INSTALL=../install.dir
 
 cmake                                                                                       \
 -D CMAKE_INSTALL_PREFIX=${MY_PROJECT_INSTALL}                                               \
 -D CMAKE_BUILD_TYPE=Release                                                                 \
--D DEVICE_BACKEND="HIP"                                                                     \
--D HIP_HIPCC_FLAGS="${HIP_HIPCC_FLAGS} -gline-tables-only"                                  \
--D CMAKE_CXX_FLAGS="-gline-tables-only"                                                     \
--D CMAKE_CXX_COMPILER=/opt/rocm/bin/hipcc                                                   \
--D CMAKE_PREFIX_PATH="/opt/rocm;/home/package/build/mlopen_dep"                             \
+-D DEVICE_BACKEND="AMD"                                                                     \
+-D HIP_HIPCC_FLAGS="${HIP_HIPCC_FLAGS} -gline-tables-only -v"                               \
+-D CMAKE_CXX_FLAGS="-gline-tables-only --amdgpu-target=gfx906"                              \
+-D CMAKE_CXX_COMPILER=/opt/rocm/hip/bin/hipcc                                               \
+-D CMAKE_PREFIX_PATH="/opt/rocm"                                                            \
 -D CMAKE_VERBOSE_MAKEFILE:BOOL=ON                                                           \
 ${MY_PROJECT_SOURCE}
diff --git a/script/trace.sh b/script/trace.sh
new file mode 100755
index 0000000000..231a69de08
--- /dev/null
+++ b/script/trace.sh
@@ -0,0 +1,3 @@
+#!/bin/bash
+
+/root/workspace/rocprofiler_pkg/bin/rpl_run.sh --timestamp on -i /root/workspace/rocprofiler_pkg/input.xml -d ./trace ./driver/driver 0 10
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index a0b63a179d..ef8a781dd5 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -1,4 +1,8 @@
-configure_file("${PROJECT_SOURCE_DIR}/include/composable_kernel/utility/config.hpp.in" "${PROJECT_BINARY_DIR}/include/composable_kernel/utility/config.hpp")
+if(DEVICE_BACKEND STREQUAL "AMD")
+    configure_file("${PROJECT_SOURCE_DIR}/include/composable_kernel/utility/config_amd.hpp.in" "${PROJECT_BINARY_DIR}/include/composable_kernel/utility/config.hpp")
+elseif(DEVICE_BACKEND STREQUAL "NVIDIA")
+    configure_file("${PROJECT_SOURCE_DIR}/include/composable_kernel/utility/config_nvidia.hpp.in" "${PROJECT_BINARY_DIR}/include/composable_kernel/utility/config.hpp")
+endif()
 
 set(TENSOR_SOURCE 
     tensor.cpp;
@@ -9,7 +13,7 @@ add_library(tensor SHARED ${TENSOR_SOURCE})
 target_compile_features(tensor PUBLIC)
 set_target_properties(tensor PROPERTIES POSITION_INDEPENDENT_CODE ON)
 
-if(DEVICE_BACKEND STREQUAL "CUDA")
+if(DEVICE_BACKEND STREQUAL "NVIDIA")
     target_link_libraries(tensor nvToolsExt cudart)
 endif()
 
diff --git a/src/device.cpp b/src/device.cpp
index ab880b33da..ca5c22e0ab 100644
--- a/src/device.cpp
+++ b/src/device.cpp
@@ -3,9 +3,9 @@
 
 DeviceMem::DeviceMem(std::size_t mem_size) : mMemSize(mem_size)
 {
-#if DEVICE_BACKEND_HIP
+#if CK_DEVICE_BACKEND_AMD
     hipGetErrorString(hipMalloc(static_cast<void**>(&mpDeviceBuf), mMemSize));
-#elif DEVICE_BACKEND_CUDA
+#elif CK_DEVICE_BACKEND_NVIDIA
     checkCudaErrors(cudaMalloc(static_cast<void**>(&mpDeviceBuf), mMemSize));
 #endif
 }
@@ -14,10 +14,10 @@ void* DeviceMem::GetDeviceBuffer() { return mpDeviceBuf; }
 
 void DeviceMem::ToDevice(const void* p)
 {
-#if DEVICE_BACKEND_HIP
+#if CK_DEVICE_BACKEND_AMD
     hipGetErrorString(
         hipMemcpy(mpDeviceBuf, const_cast<void*>(p), mMemSize, hipMemcpyHostToDevice));
-#elif DEVICE_BACKEND_CUDA
+#elif CK_DEVICE_BACKEND_NVIDIA
     checkCudaErrors(
         cudaMemcpy(mpDeviceBuf, const_cast<void*>(p), mMemSize, cudaMemcpyHostToDevice));
 #endif
@@ -25,18 +25,18 @@ void DeviceMem::ToDevice(const void* p)
 
 void DeviceMem::FromDevice(void* p)
 {
-#if DEVICE_BACKEND_HIP
+#if CK_DEVICE_BACKEND_AMD
     hipGetErrorString(hipMemcpy(p, mpDeviceBuf, mMemSize, hipMemcpyDeviceToHost));
-#elif DEVICE_BACKEND_CUDA
+#elif CK_DEVICE_BACKEND_NVIDIA
     checkCudaErrors(cudaMemcpy(p, mpDeviceBuf, mMemSize, cudaMemcpyDeviceToHost));
 #endif
 }
 
 DeviceMem::~DeviceMem()
 {
-#if DEVICE_BACKEND_HIP
+#if CK_DEVICE_BACKEND_AMD
     hipGetErrorString(hipFree(mpDeviceBuf));
-#elif DEVICE_BACKEND_CUDA
+#elif CK_DEVICE_BACKEND_NVIDIA
     checkCudaErrors(cudaFree(mpDeviceBuf));
 #endif
 }
@@ -45,10 +45,10 @@ struct KernelTimerImpl
 {
     KernelTimerImpl()
     {
-#if DEVICE_BACKEND_HIP
+#if CK_DEVICE_BACKEND_AMD
         hipEventCreate(&mStart);
         hipEventCreate(&mEnd);
-#elif DEVICE_BACKEND_CUDA
+#elif CK_DEVICE_BACKEND_NVIDIA
         cudaEventCreate(&mStart);
         cudaEventCreate(&mEnd);
 #endif
@@ -56,10 +56,10 @@ struct KernelTimerImpl
 
     ~KernelTimerImpl()
     {
-#if DEVICE_BACKEND_HIP
+#if CK_DEVICE_BACKEND_AMD
         hipEventDestroy(mStart);
         hipEventDestroy(mEnd);
-#elif DEVICE_BACKEND_CUDA
+#elif CK_DEVICE_BACKEND_NVIDIA
         cudaEventDestroy(mStart);
         cudaEventDestroy(mEnd);
 #endif
@@ -67,19 +67,19 @@ struct KernelTimerImpl
 
     void Start()
     {
-#if DEVICE_BACKEND_HIP
+#if CK_DEVICE_BACKEND_AMD
         hipEventRecord(mStart, 0);
-#elif DEVICE_BACKEND_CUDA
+#elif CK_DEVICE_BACKEND_NVIDIA
         cudaEventRecord(mStart, 0);
 #endif
     }
 
     void End()
     {
-#if DEVICE_BACKEND_HIP
+#if CK_DEVICE_BACKEND_AMD
         hipEventRecord(mEnd, 0);
         hipEventSynchronize(mEnd);
-#elif DEVICE_BACKEND_CUDA
+#elif CK_DEVICE_BACKEND_NVIDIA
         cudaEventRecord(mEnd, 0);
         cudaEventSynchronize(mEnd);
 #endif
@@ -88,17 +88,17 @@ struct KernelTimerImpl
     float GetElapsedTime() const
     {
         float time;
-#if DEVICE_BACKEND_HIP
+#if CK_DEVICE_BACKEND_AMD
         hipEventElapsedTime(&time, mStart, mEnd);
-#elif DEVICE_BACKEND_CUDA
+#elif CK_DEVICE_BACKEND_NVIDIA
         cudaEventElapsedTime(&time, mStart, mEnd);
 #endif
         return time;
     }
 
-#if DEVICE_BACKEND_HIP
+#if CK_DEVICE_BACKEND_AMD
     hipEvent_t mStart, mEnd;
-#elif DEVICE_BACKEND_CUDA
+#elif CK_DEVICE_BACKEND_NVIDIA
     cudaEvent_t mStart, mEnd;
 #endif
 };