From 184c6e7d37f27afd45fa894559b45673649d14de Mon Sep 17 00:00:00 2001
From: Chao Liu
Date: Fri, 20 Sep 2019 21:45:03 -0500
Subject: [PATCH] nvidia build

---
 .../tensor_description/tensor_coordinate.hpp  |   4 +-
 .../tensor_coordinate_v2.hpp                  |   4 +-
 .../tensor_description/tensor_descriptor.hpp  |   2 +-
 .../threadwise_generic_tensor_slice_copy.hpp  |  68 +--
 .../include/utility/amd_inline_asm.hpp        | 501 ------------------
 .../include/utility/common_header.hpp         |   4 +
 .../include/utility/config_amd.hpp.in         |   6 +-
 .../include/utility/config_nvidia.hpp.in      |   6 +-
 8 files changed, 41 insertions(+), 554 deletions(-)

diff --git a/composable_kernel/include/tensor_description/tensor_coordinate.hpp b/composable_kernel/include/tensor_description/tensor_coordinate.hpp
index 223d0d5bed..5b4805e9ee 100644
--- a/composable_kernel/include/tensor_description/tensor_coordinate.hpp
+++ b/composable_kernel/include/tensor_description/tensor_coordinate.hpp
@@ -325,14 +325,14 @@ struct TensorCoordinate
     private:
     template
     __host__ __device__ static constexpr auto
-    MakeDummyTensorCoordinate(ConstantTensorDescriptor)
+    MakeDummyTensorCoordinate(ConstantTensorDescriptor)
     {
         return NormalTensorCoordinate>();
     }

     template
     __host__ __device__ static constexpr auto
-    MakeDummyTensorCoordinate(ConstantMergedTensorDescriptor)
+    MakeDummyTensorCoordinate(ConstantMergedTensorDescriptor)
     {
         return MergedTensorCoordinate>();
     }
diff --git a/composable_kernel/include/tensor_description/tensor_coordinate_v2.hpp b/composable_kernel/include/tensor_description/tensor_coordinate_v2.hpp
index 62dc8b4c9a..831088ab25 100644
--- a/composable_kernel/include/tensor_description/tensor_coordinate_v2.hpp
+++ b/composable_kernel/include/tensor_description/tensor_coordinate_v2.hpp
@@ -188,7 +188,7 @@ struct TensorCoordinate_v2
     private:
     template
     __host__ __device__ static constexpr auto
-    MakeDummyTensorCoordinate(NativeTensorDescriptor)
+    MakeDummyTensorCoordinate(NativeTensorDescriptor)
     {
         return NativeTensorCoordinate>(
             make_zero_array());
@@ -196,7 +196,7 @@
     template
     __host__ __device__ static constexpr auto
-    MakeDummyTensorCoordinate(TransformedTensorDescriptor)
+    MakeDummyTensorCoordinate(TransformedTensorDescriptor)
     {
         return TransformedTensorCoordinate>(
             make_zero_array());
diff --git a/composable_kernel/include/tensor_description/tensor_descriptor.hpp b/composable_kernel/include/tensor_description/tensor_descriptor.hpp
index 4cff9a45d1..4d0acd7993 100644
--- a/composable_kernel/include/tensor_description/tensor_descriptor.hpp
+++ b/composable_kernel/include/tensor_description/tensor_descriptor.hpp
@@ -346,7 +346,7 @@ struct TransformedTensorDescriptor
         return GetLowerTensorDescriptor().CalculateOffset(CalculateLowerIndex(idx_up));
     }

-#if 0
+#if 1
     struct lambda_sequence_logic_or
     {
         template
diff --git a/composable_kernel/include/tensor_operation/threadwise_generic_tensor_slice_copy.hpp b/composable_kernel/include/tensor_operation/threadwise_generic_tensor_slice_copy.hpp
index 68e6b5bb02..67b0969a0e 100644
--- a/composable_kernel/include/tensor_operation/threadwise_generic_tensor_slice_copy.hpp
+++ b/composable_kernel/include/tensor_operation/threadwise_generic_tensor_slice_copy.hpp
@@ -21,6 +21,10 @@
 #define CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V2R1 0
 #endif

+#ifndef CK_USE_AMD_INTRINSIC_BUFFER_LOAD_STORE_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V2R1
+#define CK_USE_AMD_INTRINSIC_BUFFER_LOAD_STORE_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V2R1 0
+#endif
+
 namespace ck {

 // This threadwise copy allows vector access of src and dst.
@@ -835,19 +839,15 @@ struct ThreadwiseGenericTensorSliceCopy_v2r1
             // 2. src_normal_offset must be calculated at compile time (guaranteed)
             // 3. src_merged_offset can be runtime value (no assumption imposed)
             static_if{}([&](auto) {
-#if 0 // source code
-                vector_data = *reinterpret_cast(
-                    &p_src[src_normal_offset + src_merged_offset]);
-#elif 0 // inline asm using global_load
-                vector_data = __global_load(
-                    p_src,
-                    static_cast(src_merged_offset),
-                    static_cast(src_normal_offset));
-#elif 1 // inline asm using buffer_load
+#if CK_USE_AMD_INTRINSIC && \
+    CK_USE_AMD_INTRINSIC_BUFFER_LOAD_STORE_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V2R1
                 vector_data = __buffer_load(
                     p_src,
                     static_cast(src_merged_offset),
                     static_cast(src_normal_offset));
+#else
+                vector_data = *reinterpret_cast(
+                    &p_src[src_normal_offset + src_merged_offset]);
 #endif
             }).Else([&](auto) {
                 // src can be all kinds of memory-space.
@@ -940,15 +940,13 @@
             // 2. dst_normal_offset must be calculated at compile time (guaranteed)
             // 3. dst_merged_offset can be runtime value (no assumption imposed)
             static_if{}([&](auto) {
-#if 0 // source code
-                *reinterpret_cast(
-                    &p_dst[dst_normal_offset + dst_merged_offset]) = vector_data;
-#elif 0 // inline asm using global_store
-                __global_store(
-                    vector_data, p_dst, dst_merged_offset, dst_normal_offset);
-#elif 1 // inline asm using buffer_store
+#if CK_USE_AMD_INTRINSIC && \
+    CK_USE_AMD_INTRINSIC_BUFFER_LOAD_STORE_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V2R1
                 __buffer_store(
                     vector_data, p_dst, dst_merged_offset, dst_normal_offset);
+#else
+                *reinterpret_cast(
+                    &p_dst[dst_normal_offset + dst_merged_offset]) = vector_data;
 #endif
             }).Else([&](auto) {
                 // dst can be all kinds of memory-space
@@ -1053,15 +1051,6 @@
         auto src_slice_vectorized =
             mSrcSlice.Vectorize(src_vector_access_dim, src_data_per_access);

-#if 0
-        if(get_thread_local_1d_id() == 0 && get_block_1d_id() == 0)
-        {
-            print_ConstantTensorDescriptor("mSrcSlice: ", typename decltype(mSrcSlice)::tensor_desc_type{});
-            print_ConstantTensorDescriptor("src_slice_vector: ", typename decltype(src_slice_vectorized)::tensor_desc_type{});
-        }
-#endif
-
-#if 1 // debug
         ford{}(
             [&](auto src_vector_id) {
                 // load vector from src
@@ -1080,7 +1069,6 @@
                         reinterpret_cast(&vector_data)[i];
                 }
             });
-#endif
     }

     // copy data from buffer into dst
@@ -1093,15 +1081,6 @@
        auto dst_slice_vectorized =
             mDstSlice.Vectorize(dst_vector_access_dim, dst_data_per_access);

-#if 0
-        if(get_thread_local_1d_id() == 0 && get_block_1d_id() == 0)
-        {
-            print_ConstantTensorDescriptor("mDstSlice: ", typename decltype(mDstSlice)::tensor_desc_type{});
-            print_ConstantTensorDescriptor("dst_slice_vector: ", typename decltype(dst_slice_vectorized)::tensor_desc_type{});
-        }
-#endif
-
-#if 1 // debug
         ford{}(
             [&](auto dst_vector_id) {

@@ -1122,7 +1101,6 @@
                 // write vector into dst
                 dst_slice_vectorized(dst_vector_id) = vector_data;
             });
-#endif
     }

 }
@@ -1330,13 +1308,14 @@ struct ThreadwiseGenericTensorSliceCopy_v4r2
                 const index_t buffer_offset = i * src_data_per_access;

                 static_if{}([&](auto) {
-#if 0 // source code
-                    *reinterpret_cast(&p_long_vector[buffer_offset]) =
-                        *reinterpret_cast(&p_src[src_offset]);
-#elif 1 // inline asm using buffer_load
+#if CK_USE_AMD_INTRINSIC && \
+    CK_USE_AMD_INTRINSIC_BUFFER_LOAD_STORE_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V2R1
                     *reinterpret_cast(&p_long_vector[buffer_offset]) =
                         __buffer_load(
                             p_src, static_cast(src_offset), static_cast(0));
+#else
+                    *reinterpret_cast(&p_long_vector[buffer_offset]) =
+                        *reinterpret_cast(&p_src[src_offset]);
 #endif
                 }).Else([&](auto) {
                     // src can be all kinds of memory-space.
@@ -1358,15 +1337,16 @@
                     (mDstSliceOrigin + (long_vector_data_begin_id + scalar_id)).GetOffset();

                 static_if{}([&](auto) {
-#if 0 // source code
-                    *reinterpret_cast(&p_dst[dst_offset]) =
-                        *reinterpret_cast(&p_long_vector[buffer_offset]);
-#elif 1 // inline asm using buffer_store
+#if CK_USE_AMD_INTRINSIC && \
+    CK_USE_AMD_INTRINSIC_BUFFER_LOAD_STORE_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V2R1
                     __buffer_store(
                         *reinterpret_cast(&p_long_vector[buffer_offset]),
                         p_dst,
                         dst_offset,
                         0);
+#else
+                    *reinterpret_cast(&p_dst[dst_offset]) =
+                        *reinterpret_cast(&p_long_vector[buffer_offset]);
 #endif
                 }).Else([&](auto) {
                     // dst can be all kinds of memory-space
diff --git a/composable_kernel/include/utility/amd_inline_asm.hpp b/composable_kernel/include/utility/amd_inline_asm.hpp
index fd8669256d..0a17b4bd3a 100644
--- a/composable_kernel/include/utility/amd_inline_asm.hpp
+++ b/composable_kernel/include/utility/amd_inline_asm.hpp
@@ -8,507 +8,6 @@ namespace ck {
 // cast a pointer of LDS to its address
 extern "C" __attribute__((address_space(3))) __device__ void* __to_local(void* p);

-__device__ float __llvm_amdgcn_buffer_load(int32x4_t rsrc,
-                                           uint32_t vindex,
-                                           uint32_t offset,
-                                           bool glc,
-                                           bool slc) __asm("llvm.amdgcn.buffer.load");
-
-__device__ vector_type::MemoryType
-__llvm_amdgcn_buffer_loadx2(int32x4_t rsrc,
-                            uint32_t vindex,
-                            uint32_t offset,
-                            bool glc,
-                            bool slc) __asm("llvm.amdgcn.buffer.load.dwordx2");
-
-__device__ vector_type::MemoryType
-__llvm_amdgcn_buffer_loadx4(int32x4_t rsrc,
-                            uint32_t vindex,
-                            uint32_t offset,
-                            bool glc,
-                            bool slc) __asm("llvm.amdgcn.buffer.load.dwordx4");
-
-__device__ void __llvm_amdgcn_buffer_store(float vdata,
-                                           int32x4_t rsrc,
-                                           uint32_t vindex,
-                                           uint32_t offset,
-                                           bool glc,
-                                           bool slc) __asm("llvm.amdgcn.buffer.store");
-
-__device__ void __llvm_amdgcn_buffer_storex2(vector_type::MemoryType vdata,
-                                             int32x4_t rsrc,
-                                             uint32_t vindex,
-                                             uint32_t offset,
-                                             bool glc,
-                                             bool slc) __asm("llvm.amdgcn.buffer.store.dwordx2");
-
-__device__ void __llvm_amdgcn_buffer_storex4(vector_type::MemoryType vdata,
-                                             int32x4_t rsrc,
-                                             uint32_t vindex,
-                                             uint32_t offset,
-                                             bool glc,
-                                             bool slc) __asm("llvm.amdgcn.buffer.store.dwordx4");
-
-// global_load and global_store
-template
-__device__ typename vector_type::MemoryType __global_load(
-    const T* p_src_block, uint32_t src_thread_data_offset, uint32_t src_const_data_offset);
-
-template
-__device__ void __global_store(const typename vector_type::MemoryType& src,
-                               T* p_dst_block,
-                               uint32_t dst_thread_data_offset,
-                               uint32_t dst_const_data_offset);
-
-template <>
-__device__ float __global_load(const float* p_src_block,
-                               uint32_t src_thread_data_offset,
-                               uint32_t src_const_data_offset)
-{
-    float dst;
-
-#if 0 // source code
-    dst = p_src_block[src_const_data_offset + src_thread_data_offset];
-#elif 0 // use VGPR only
-    const float* src_thread_addr_offset_u64 =
-        p_src_block + src_const_data_offset + src_thread_data_offset;
-
-    asm volatile("\n \
-    global_load_dword %0, %1 off offset:0 \n \
-    s_waitcnt 0 \n \
-    "
-                 : "=v"(dst)
-                 : "v"(src_thread_addr_offset_u64));
-#elif 0 // use VGPR and SGPR, do compute on VALU
-    uint64_t src_thread_addr_offset_u64 =
-        (src_thread_data_offset + src_const_data_offset) * sizeof(float);
-
-    asm volatile("\n \
-    global_load_dword %0, %1, %2, offset:0 \n \
-    s_waitcnt 0 \n \
-    "
-                 : "=v"(dst)
-                 : "v"(src_thread_addr_offset_u64), "s"(p_src_block));
-#elif 1 // use VGPR and SGPR, do compute on SALU
-    uint64_t src_thread_addr_offset_u64 =
-        static_cast(src_thread_data_offset * sizeof(float));
-
-    const float* p_src_block_with_offset = p_src_block + src_const_data_offset;
-
-    asm volatile("\n \
-    global_load_dword %0, %1, %2, offset:0 \n \
-    s_waitcnt 0 \n \
-    "
-                 : "=v"(dst)
-                 : "v"(src_thread_addr_offset_u64), "s"(p_src_block_with_offset));
-#endif
-
-    return dst;
-}
-
-template <>
-__device__ vector_type::MemoryType __global_load(
-    const float* p_src_block, uint32_t src_thread_data_offset, uint32_t src_const_data_offset)
-{
-    using vector_t = vector_type::MemoryType;
-
-    vector_t dst;
-
-#if 0 // source code
-    dst = *reinterpret_cast(&p_src_block[src_const_data_offset + src_thread_data_offset]);
-#elif 0 // use VGPR only
-    const float* src_thread_addr_offset_u64 =
-        p_src_block + src_const_data_offset + src_thread_data_offset;
-
-    asm volatile("\n \
-    global_load_dwordx2 %0, %1 off offset:0 \n \
-    s_waitcnt 0 \n \
-    "
-                 : "=v"(dst)
-                 : "v"(src_thread_addr_offset_u64));
-#elif 0 // use VGPR and SGPR, do compute on VALU
-    uint64_t src_thread_addr_offset_u64 =
-        (src_thread_data_offset + src_const_data_offset) * sizeof(float);
-
-    asm volatile("\n \
-    global_load_dwordx2 %0, %1, %2, offset:0 \n \
-    s_waitcnt 0 \n \
-    "
-                 : "=v"(dst)
-                 : "v"(src_thread_addr_offset_u64), "s"(p_src_block));
-#elif 1 // use VGPR and SGPR, do compute on SALU
-    uint64_t src_thread_addr_offset_u64 =
-        static_cast(src_thread_data_offset * sizeof(float));
-
-    const float* p_src_block_with_offset = p_src_block + src_const_data_offset;
-
-    asm volatile("\n \
-    global_load_dwordx2 %0, %1, %2, offset:0 \n \
-    s_waitcnt 0 \n \
-    "
-                 : "=v"(dst)
-                 : "v"(src_thread_addr_offset_u64), "s"(p_src_block_with_offset));
-#endif
-
-    return dst;
-}
-
-template <>
-__device__ vector_type::MemoryType __global_load(
-    const float* p_src_block, uint32_t src_thread_data_offset, uint32_t src_const_data_offset)
-{
-    using vector_t = vector_type::MemoryType;
-
-    vector_t dst;
-
-#if 0 // source code
-    dst = *reinterpret_cast(&p_src_block[src_const_data_offset + src_thread_data_offset]);
-#elif 0 // use VGPR only
-    const float* src_thread_addr_offset_u64 =
-        p_src_block + src_const_data_offset + src_thread_data_offset;
-
-    asm volatile("\n \
-    global_load_dwordx4 %0, %1 off offset:0 \n \
-    s_waitcnt 0 \n \
-    "
-                 : "=v"(dst)
-                 : "v"(src_thread_addr_offset_u64));
-#elif 0 // use VGPR and SGPR, do compute on VALU
-    uint64_t src_thread_addr_offset_u64 =
-        (src_thread_data_offset + src_const_data_offset) * sizeof(float);
-
-    asm volatile("\n \
-    global_load_dwordx4 %0, %1, %2, offset:0 \n \
-    s_waitcnt 0 \n \
-    "
-                 : "=v"(dst)
-                 : "v"(src_thread_addr_offset_u64), "s"(p_src_block));
-#elif 1 // use VGPR and SGPR, do compute on SALU
-    uint64_t src_thread_addr_offset_u64 =
-        static_cast(src_thread_data_offset * sizeof(float));
-
-    const float* p_src_block_with_offset = p_src_block + src_const_data_offset;
-
-    asm volatile("\n \
-    global_load_dwordx4 %0, %1, %2, offset:0 \n \
-    s_waitcnt 0 \n \
-    "
-                 : "=v"(dst)
-                 : "v"(src_thread_addr_offset_u64), "s"(p_src_block_with_offset));
-#endif
-
-    return dst;
-}
-
-template <>
-__device__ void __global_store(const float& src,
-                               float* p_dst_block,
-                               uint32_t dst_thread_data_offset,
-                               uint32_t dst_const_data_offset)
-{
-#if 0 // compute on VALU
-    uint64_t dst_thread_data_offset_u64 = (dst_thread_data_offset + dst_const_data_offset) * sizeof(float);
-
-    asm volatile("\n \
-    global_store_dword %0, %1, %2, offset:0 \n \
-    "
-                 :
-                 : "v"(dst_thread_data_offset_u64), "v"(src), "s"(p_dst_block));
-#else // compute on SALU
-    uint64_t dst_thread_data_offset_u64 = dst_thread_data_offset * sizeof(float);
-
-    float* p_dst_block_with_offset = p_dst_block + dst_const_data_offset;
-
-    asm volatile("\n \
-    global_store_dword %0, %1, %2, offset:0 \n \
-    "
-                 :
-                 : "v"(dst_thread_data_offset_u64), "v"(src), "s"(p_dst_block_with_offset));
-#endif
-}
-
-// buffer_load and buffer_store
-template
-__device__ typename vector_type::MemoryType __buffer_load(
-    const T* p_src_block, uint32_t src_thread_data_offset, uint32_t src_const_data_offset);
-
-template
-__device__ void __buffer_store(const typename vector_type::MemoryType& src,
-                               T* p_dst_block,
-                               uint32_t dst_thread_data_offset,
-                               uint32_t dst_const_data_offset);
-
-template <>
-__device__ float __buffer_load(const float* p_src_block,
-                               uint32_t src_thread_data_offset,
-                               uint32_t src_const_data_offset)
-{
-#if 0
-    float dst;
-
-    uint32_t src_thread_addr_offset = src_thread_data_offset * sizeof(float);
-    uint32_t src_const_addr_offset  = src_const_data_offset * sizeof(float);
-
-    int32x4_t src_block_setting{0};
-    // fill in byte 0 - 1
-    *reinterpret_cast(&src_block_setting) = const_cast(p_src_block);
-    // fill in byte 2
-    reinterpret_cast(&src_block_setting)[2] = -1;
-    // fill in byte 3
-    reinterpret_cast(&src_block_setting)[3] = 0x00027000;
-
-    asm volatile("\n \
-    buffer_load_dword %0, %1, %2, %3 offen offset:0 \n \
-    s_waitcnt 0 \n \
-    "
-                 : "=v"(dst)
-                 : "v"(src_thread_addr_offset), "s"(src_block_setting), "s"(src_const_addr_offset));
-
-    return dst;
-#else
-    float dst;
-
-    uint32_t src_thread_addr_offset = src_thread_data_offset * sizeof(float);
-    uint32_t src_const_addr_offset  = src_const_data_offset * sizeof(float);
-
-    int32x4_t src_block_setting{0};
-    // fill in byte 0 - 1
-    *reinterpret_cast(&src_block_setting) = const_cast(p_src_block);
-    // fill in byte 2
-    reinterpret_cast(&src_block_setting)[2] = -1;
-    // fill in byte 3
-    reinterpret_cast(&src_block_setting)[3] = 0x00027000;
-
-    dst = __llvm_amdgcn_buffer_load(
-        src_block_setting, 0, src_thread_addr_offset + src_const_addr_offset, false, false);
-
-    return dst;
-#endif
-}
-
-template <>
-__device__ vector_type::MemoryType __buffer_load(
-    const float* p_src_block, uint32_t src_thread_data_offset, uint32_t src_const_data_offset)
-{
-#if 0
-    vector_type::MemoryType dst;
-
-    uint32_t src_thread_addr_offset = src_thread_data_offset * sizeof(float);
-    uint32_t src_const_addr_offset  = src_const_data_offset * sizeof(float);
-
-    int32x4_t src_block_setting{0};
-    // fill in byte 0 - 1
-    *reinterpret_cast(&src_block_setting) = const_cast(p_src_block);
-    // fill in byte 2
-    reinterpret_cast(&src_block_setting)[2] = -1;
-    // fill in byte 3
-    reinterpret_cast(&src_block_setting)[3] = 0x00027000;
-
-    asm volatile("\n \
-    buffer_load_dwordx2 %0, %1, %2, %3 offen offset:0 \n \
-    s_waitcnt 0 \n \
-    "
-                 : "=v"(dst)
-                 : "v"(src_thread_addr_offset), "s"(src_block_setting), "s"(src_const_addr_offset));
-
-    return dst;
-#else
-    vector_type::MemoryType dst;
-
-    uint32_t src_thread_addr_offset = src_thread_data_offset * sizeof(float);
-    uint32_t src_const_addr_offset  = src_const_data_offset * sizeof(float);
-
-    int32x4_t src_block_setting{0};
-    // fill in byte 0 - 1
-    *reinterpret_cast(&src_block_setting) = const_cast(p_src_block);
-    // fill in byte 2
-    reinterpret_cast(&src_block_setting)[2] = -1;
-    // fill in byte 3
-    reinterpret_cast(&src_block_setting)[3] = 0x00027000;
-
-    dst = __llvm_amdgcn_buffer_loadx2(
-        src_block_setting, 0, src_thread_addr_offset + src_const_addr_offset, false, false);
-
-    return dst;
-#endif
-}
-
-template <>
-__device__ vector_type::MemoryType __buffer_load(
-    const float* p_src_block, uint32_t src_thread_data_offset, uint32_t src_const_data_offset)
-{
-#if 0
-    vector_type::MemoryType dst;
-
-    uint32_t src_thread_addr_offset = src_thread_data_offset * sizeof(float);
-    uint32_t src_const_addr_offset  = src_const_data_offset * sizeof(float);
-
-    int32x4_t src_block_setting{0};
-    // fill in byte 0 - 1
-    *reinterpret_cast(&src_block_setting) = const_cast(p_src_block);
-    // fill in byte 2
-    reinterpret_cast(&src_block_setting)[2] = -1;
-    // fill in byte 3
-    reinterpret_cast(&src_block_setting)[3] = 0x00027000;
-
-    asm volatile("\n \
-    buffer_load_dwordx4 %0, %1, %2, %3 offen offset:0 \n \
-    s_waitcnt 0 \n \
-    "
-                 : "=v"(dst)
-                 : "v"(src_thread_addr_offset), "s"(src_block_setting), "s"(src_const_addr_offset));
-
-    return dst;
-#elif 1
-    vector_type::MemoryType dst;
-
-    uint32_t src_thread_addr_offset = src_thread_data_offset * sizeof(float);
-    uint32_t src_const_addr_offset  = src_const_data_offset * sizeof(float);
-
-    int32x4_t src_block_setting{0};
-    // fill in byte 0 - 1
-    *reinterpret_cast(&src_block_setting) = const_cast(p_src_block);
-    // fill in byte 2
-    reinterpret_cast(&src_block_setting)[2] = -1;
-    // fill in byte 3
-    reinterpret_cast(&src_block_setting)[3] = 0x00027000;
-
-    dst = __llvm_amdgcn_buffer_loadx4(
-        src_block_setting, 0, src_thread_addr_offset + src_const_addr_offset, false, false);
-
-    return dst;
-#endif
-}
-
-template <>
-__device__ void __buffer_store(const float& src,
-                               float* p_dst_block,
-                               uint32_t dst_thread_data_offset,
-                               uint32_t dst_const_data_offset)
-{
-#if 0
-    uint32_t dst_thread_addr_offset = dst_thread_data_offset * sizeof(float);
-    uint32_t dst_const_addr_offset  = dst_const_data_offset * sizeof(float);
-
-    int32x4_t dst_block_setting{0};
-    // fill in byte 0 - 1
-    *reinterpret_cast(&dst_block_setting) = p_dst_block;
-    // fill in byte 2
-    reinterpret_cast(&dst_block_setting)[2] = -1;
-    // fill in byte 3
-    reinterpret_cast(&dst_block_setting)[3] = 0x00027000;
-
-    asm volatile("\n \
-    buffer_store_dword %1, %2, %0, %3 offen offset:0 \n \
-    "
-                 :
-                 : "s"(dst_block_setting),
-                   "v"(src),
-                   "v"(dst_thread_addr_offset),
-                   "s"(dst_const_addr_offset));
-#else
-    uint32_t dst_thread_addr_offset = dst_thread_data_offset * sizeof(float);
-    uint32_t dst_const_addr_offset  = dst_const_data_offset * sizeof(float);
-
-    int32x4_t dst_block_setting{0};
-    // fill in byte 0 - 1
-    *reinterpret_cast(&dst_block_setting) = p_dst_block;
-    // fill in byte 2
-    reinterpret_cast(&dst_block_setting)[2] = -1;
-    // fill in byte 3
-    reinterpret_cast(&dst_block_setting)[3] = 0x00027000;
-
-    __llvm_amdgcn_buffer_store(
-        src, dst_block_setting, 0, dst_thread_addr_offset + dst_const_addr_offset, false, false);
-#endif
-}
-
-template <>
-__device__ void __buffer_store(const vector_type::MemoryType& src,
-                               float* p_dst_block,
-                               uint32_t dst_thread_data_offset,
-                               uint32_t dst_const_data_offset)
-{
-#if 0
-    uint32_t dst_thread_addr_offset = dst_thread_data_offset * sizeof(float);
-    uint32_t dst_const_addr_offset  = dst_const_data_offset * sizeof(float);
-
-    int32x4_t dst_block_setting{0};
-    // fill in byte 0 - 1
-    *reinterpret_cast(&dst_block_setting) = p_dst_block;
-    // fill in byte 2
-    reinterpret_cast(&dst_block_setting)[2] = -1;
-    // fill in byte 3
-    reinterpret_cast(&dst_block_setting)[3] = 0x00027000;
-
-    asm volatile("\n \
-    buffer_store_dwordx2 %1, %2, %0, %3 offen offset:0 \n \
-    "
-                 :
-                 : "s"(dst_block_setting),
-                   "v"(src),
-                   "v"(dst_thread_addr_offset),
-                   "s"(dst_const_addr_offset));
-#else
-    uint32_t dst_thread_addr_offset = dst_thread_data_offset * sizeof(float);
-    uint32_t dst_const_addr_offset  = dst_const_data_offset * sizeof(float);
-
-    int32x4_t dst_block_setting{0};
-    // fill in byte 0 - 1
-    *reinterpret_cast(&dst_block_setting) = p_dst_block;
-    // fill in byte 2
-    reinterpret_cast(&dst_block_setting)[2] = -1;
-    // fill in byte 3
-    reinterpret_cast(&dst_block_setting)[3] = 0x00027000;
-
-    __llvm_amdgcn_buffer_storex2(
-        src, dst_block_setting, 0, dst_thread_addr_offset + dst_const_addr_offset, false, false);
-#endif
-}
-
-template <>
-__device__ void __buffer_store(const vector_type::MemoryType& src,
-                               float* p_dst_block,
-                               uint32_t dst_thread_data_offset,
-                               uint32_t dst_const_data_offset)
-{
-#if 0
-    uint32_t dst_thread_addr_offset = dst_thread_data_offset * sizeof(float);
-    uint32_t dst_const_addr_offset  = dst_const_data_offset * sizeof(float);
-
-    int32x4_t dst_block_setting{0};
-    // fill in byte 0 - 1
-    *reinterpret_cast(&dst_block_setting) = p_dst_block;
-    // fill in byte 2
-    reinterpret_cast(&dst_block_setting)[2] = -1;
-    // fill in byte 3
-    reinterpret_cast(&dst_block_setting)[3] = 0x00027000;
-
-    asm volatile("\n \
-    buffer_store_dwordx4 %1, %2, %0, %3 offen offset:0 \n \
-    "
-                 :
-                 : "s"(dst_block_setting),
-                   "v"(src),
-                   "v"(dst_thread_addr_offset),
-                   "s"(dst_const_addr_offset));
-#else
-    uint32_t dst_thread_addr_offset = dst_thread_data_offset * sizeof(float);
-    uint32_t dst_const_addr_offset  = dst_const_data_offset * sizeof(float);
-
-    int32x4_t dst_block_setting{0};
-    // fill in byte 0 - 1
-    *reinterpret_cast(&dst_block_setting) = p_dst_block;
-    // fill in byte 2
-    reinterpret_cast(&dst_block_setting)[2] = -1;
-    // fill in byte 3
-    reinterpret_cast(&dst_block_setting)[3] = 0x00027000;
-
-    __llvm_amdgcn_buffer_storex4(
-        src, dst_block_setting, 0, dst_thread_addr_offset + dst_const_addr_offset, false, false);
-#endif
-}
-
 __device__ void vmcnt(index_t cnt)
 {
     if(cnt == 0)
diff --git a/composable_kernel/include/utility/common_header.hpp b/composable_kernel/include/utility/common_header.hpp
index ed581c95c0..f8b52bba5b 100644
--- a/composable_kernel/include/utility/common_header.hpp
+++ b/composable_kernel/include/utility/common_header.hpp
@@ -22,4 +22,8 @@
 #include "amd_inline_asm.hpp"
 #endif

+#if CK_USE_AMD_INTRINSIC
+#include "amd_intrinsic.hpp"
+#endif
+
 #endif
diff --git a/composable_kernel/include/utility/config_amd.hpp.in b/composable_kernel/include/utility/config_amd.hpp.in
index 1b57256a00..0ee722f22d 100644
--- a/composable_kernel/include/utility/config_amd.hpp.in
+++ b/composable_kernel/include/utility/config_amd.hpp.in
@@ -4,9 +4,11 @@
 #include "hip/hip_runtime.h"
 #include "hip/hip_fp16.h"

+#define CK_UNSIGNED_INDEX_TYPE 0
 #define CK_DEVICE_BACKEND_AMD 1
-#define CK_USE_UNSIGNED_INDEX_TYPE 0
+#define CK_USE_AMD_INTRINSIC 1
 #define CK_USE_AMD_INLINE_ASM 1
+#define CK_USE_AMD_INTRINSIC_BUFFER_LOAD_STORE_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V2R1 1
 #define CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_BLOCKWISE_GENERIC_SLICE_COPY_V1 1
 #define CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V1R1 0
 #define CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V1R2 0
@@ -14,7 +16,7 @@

 namespace ck {

-#if CK_USE_UNSIGNED_INDEX_TYPE
+#if CK_UNSIGNED_INDEX_TYPE
 using index_t = uint32_t;
 #else
 using index_t = int32_t;
diff --git a/composable_kernel/include/utility/config_nvidia.hpp.in b/composable_kernel/include/utility/config_nvidia.hpp.in
index b2f8e3e43e..523b7be589 100644
--- a/composable_kernel/include/utility/config_nvidia.hpp.in
+++ b/composable_kernel/include/utility/config_nvidia.hpp.in
@@ -6,9 +6,11 @@
 #include "nvToolsExt.h"
 #include "helper_cuda.h"

+#define CK_UNSIGNED_INDEX_TYPE 0
 #define CK_DEVICE_BACKEND_NVIDIA 1
-#define CK_USE_UNSIGNED_INDEX_TYPE 0
+#define CK_USE_AMD_INTRINSIC 0
 #define CK_USE_AMD_INLINE_ASM 0
+#define CK_USE_AMD_INTRINSIC_BUFFER_LOAD_STORE_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V2R1 0
 #define CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_BLOCKWISE_GENERIC_SLICE_COPY_V1 0
 #define CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V1R1 0
 #define CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V1R2 0
@@ -16,7 +18,7 @@

 namespace ck {

-#if CK_USE_UNSIGNED_INDEX_TYPE
+#if CK_UNSIGNED_INDEX_TYPE
 using index_t = uint32_t;
 #else
 using index_t = int32_t;
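
Note: the new `amd_intrinsic.hpp` header that `common_header.hpp` now includes is not part of this diff. As a rough sketch of what its intrinsic-based buffer load presumably looks like, reconstructed from the `__buffer_load` implementation this patch deletes from `amd_inline_asm.hpp` (the helper name `buffer_load_float`, the `int32x4_t` typedef, and the descriptor-field comments are illustrative, not taken from the missing header):

```cpp
// Sketch only: mirrors the removed amd_inline_asm.hpp code path that calls
// llvm.amdgcn.buffer.load through a 128-bit buffer resource descriptor.
#include <cstdint>
#include "hip/hip_runtime.h"

typedef int32_t int32x4_t __attribute__((ext_vector_type(4))); // assumed to match CK's vector types

// intrinsic declaration, as it appeared in the removed amd_inline_asm.hpp
__device__ float __llvm_amdgcn_buffer_load(int32x4_t rsrc,
                                           uint32_t vindex,
                                           uint32_t offset,
                                           bool glc,
                                           bool slc) __asm("llvm.amdgcn.buffer.load");

__device__ float buffer_load_float(const float* p_src_block,
                                   uint32_t src_thread_data_offset,
                                   uint32_t src_const_data_offset)
{
    uint32_t src_thread_addr_offset = src_thread_data_offset * sizeof(float);
    uint32_t src_const_addr_offset  = src_const_data_offset * sizeof(float);

    // build the buffer resource descriptor:
    //   dwords 0-1: 64-bit base address of the buffer
    //   dword  2:   number of records; -1 disables range checking
    //   dword  3:   descriptor settings (0x00027000, as in the removed code)
    int32x4_t src_block_setting{0};
    *reinterpret_cast<float**>(&src_block_setting) = const_cast<float*>(p_src_block);
    reinterpret_cast<int*>(&src_block_setting)[2]  = -1;
    reinterpret_cast<int*>(&src_block_setting)[3]  = 0x00027000;

    // offsets are byte offsets; glc/slc cache modifiers are left off
    return __llvm_amdgcn_buffer_load(
        src_block_setting, 0, src_thread_addr_offset + src_const_addr_offset, false, false);
}
```

The `#if CK_USE_AMD_INTRINSIC && CK_USE_AMD_INTRINSIC_BUFFER_LOAD_STORE_...` guards above then select between this intrinsic path (AMD backend, where `config_amd.hpp.in` sets both macros to 1) and the plain `reinterpret_cast` pointer access (NVIDIA backend, where `config_nvidia.hpp.in` sets them to 0), which is what lets the nvidia build compile without any AMD-specific code.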