diff --git a/composable_kernel/include/tensor_operation/threadwise_generic_tensor_slice_copy.hpp b/composable_kernel/include/tensor_operation/threadwise_generic_tensor_slice_copy.hpp
index 2a0fffb34d..525b7a04f1 100644
--- a/composable_kernel/include/tensor_operation/threadwise_generic_tensor_slice_copy.hpp
+++ b/composable_kernel/include/tensor_operation/threadwise_generic_tensor_slice_copy.hpp
@@ -835,7 +835,7 @@ struct ThreadwiseGenericTensorSliceCopy_v2r1
                     //     2. src_normal_offset must be calculatd at compile time (guaranteed)
                     //     3. src_merged_offset can be runtime value (no assumption imposed)
                     static_if<SrcMemorySpace == 2>{}([&](auto) {
-#if 1 // source code
+#if 0   // source code
                         vector_data = *reinterpret_cast<const src_vector_t*>(
                             &p_src[src_normal_offset + src_merged_offset]);
 #elif 0 // inline asm using global_load
@@ -940,7 +940,7 @@ struct ThreadwiseGenericTensorSliceCopy_v2r1
                         //     2. dst_normal_offset must be calculatd at compile time (guaranteed)
                         //     3. dst_merged_offset can be runtime value (no assumption imposed)
                         static_if<DstMemorySpace == 2>{}([&](auto) {
-#if 1 // source code
+#if 0   // source code
                             *reinterpret_cast<dst_vector_t*>(
                                 &p_dst[dst_normal_offset + dst_merged_offset]) = vector_data;
 #elif 0 // inline asm using global_store
diff --git a/composable_kernel/include/utility/amd_inline_asm.hpp b/composable_kernel/include/utility/amd_inline_asm.hpp
index f3930f8733..fd8669256d 100644
--- a/composable_kernel/include/utility/amd_inline_asm.hpp
+++ b/composable_kernel/include/utility/amd_inline_asm.hpp
@@ -8,6 +8,47 @@ namespace ck {
 // cast a pointer of LDS to its address
 extern "C" __attribute__((address_space(3))) __device__ void* __to_local(void* p);
 
+__device__ float __llvm_amdgcn_buffer_load(int32x4_t rsrc,
+                                           uint32_t vindex,
+                                           uint32_t offset,
+                                           bool glc,
+                                           bool slc) __asm("llvm.amdgcn.buffer.load");
+
+__device__ vector_type<float, 2>::MemoryType
+__llvm_amdgcn_buffer_loadx2(int32x4_t rsrc,
+                            uint32_t vindex,
+                            uint32_t offset,
+                            bool glc,
+                            bool slc) __asm("llvm.amdgcn.buffer.load.dwordx2");
+
+__device__ vector_type<float, 4>::MemoryType
+__llvm_amdgcn_buffer_loadx4(int32x4_t rsrc,
+                            uint32_t vindex,
+                            uint32_t offset,
+                            bool glc,
+                            bool slc) __asm("llvm.amdgcn.buffer.load.dwordx4");
+
+__device__ void __llvm_amdgcn_buffer_store(float vdata,
+                                           int32x4_t rsrc,
+                                           uint32_t vindex,
+                                           uint32_t offset,
+                                           bool glc,
+                                           bool slc) __asm("llvm.amdgcn.buffer.store");
+
+__device__ void __llvm_amdgcn_buffer_storex2(vector_type<float, 2>::MemoryType vdata,
+                                             int32x4_t rsrc,
+                                             uint32_t vindex,
+                                             uint32_t offset,
+                                             bool glc,
+                                             bool slc) __asm("llvm.amdgcn.buffer.store.dwordx2");
+
+__device__ void __llvm_amdgcn_buffer_storex4(vector_type<float, 4>::MemoryType vdata,
+                                             int32x4_t rsrc,
+                                             uint32_t vindex,
+                                             uint32_t offset,
+                                             bool glc,
+                                             bool slc) __asm("llvm.amdgcn.buffer.store.dwordx4");
+
 // global_load and global_store
 template <typename T, index_t VectorSize>
 __device__ typename vector_type<T, VectorSize>::MemoryType __global_load(
@@ -186,7 +227,7 @@ __device__ void __global_store<float, 1>(const float& src,
 #endif
 }
 
-// __buffer_load and __buffer_store
+// buffer_load and buffer_store
 template <typename T, index_t VectorSize>
 __device__ typename vector_type<T, VectorSize>::MemoryType __buffer_load(
     const T* p_src_block, uint32_t src_thread_data_offset, uint32_t src_const_data_offset);
@@ -202,6 +243,7 @@ __device__ float __buffer_load<float, 1>(const float* p_src_block,
                                          uint32_t src_thread_data_offset,
                                          uint32_t src_const_data_offset)
 {
+#if 0
     float dst;
 
     uint32_t src_thread_addr_offset = src_thread_data_offset * sizeof(float);
@@ -223,12 +265,32 @@ __device__ float __buffer_load<float, 1>(const float* p_src_block,
                  : "v"(src_thread_addr_offset), "s"(src_block_setting), "s"(src_const_addr_offset));
 
     return dst;
+#else
+    float dst;
+
+    uint32_t src_thread_addr_offset = src_thread_data_offset * sizeof(float);
+    uint32_t src_const_addr_offset  = src_const_data_offset * sizeof(float);
+
+    int32x4_t src_block_setting{0};
+    // fill in byte 0 - 1
+    *reinterpret_cast<float**>(&src_block_setting) = const_cast<float*>(p_src_block);
+    // fill in byte 2
+    reinterpret_cast<int*>(&src_block_setting)[2] = -1;
+    // fill in byte 3
+    reinterpret_cast<int*>(&src_block_setting)[3] = 0x00027000;
+
+    dst = __llvm_amdgcn_buffer_load(
+        src_block_setting, 0, src_thread_addr_offset + src_const_addr_offset, false, false);
+
+    return dst;
+#endif
 }
 
 template <>
 __device__ vector_type<float, 2>::MemoryType __buffer_load<float, 2>(
     const float* p_src_block, uint32_t src_thread_data_offset, uint32_t src_const_data_offset)
 {
+#if 0
     vector_type<float, 2>::MemoryType dst;
 
     uint32_t src_thread_addr_offset = src_thread_data_offset * sizeof(float);
@@ -250,12 +312,32 @@ __device__ vector_type<float, 2>::MemoryType __buffer_load<float, 2>(
                  : "v"(src_thread_addr_offset), "s"(src_block_setting), "s"(src_const_addr_offset));
 
     return dst;
+#else
+    vector_type<float, 2>::MemoryType dst;
+
+    uint32_t src_thread_addr_offset = src_thread_data_offset * sizeof(float);
+    uint32_t src_const_addr_offset  = src_const_data_offset * sizeof(float);
+
+    int32x4_t src_block_setting{0};
+    // fill in byte 0 - 1
+    *reinterpret_cast<float**>(&src_block_setting) = const_cast<float*>(p_src_block);
+    // fill in byte 2
+    reinterpret_cast<int*>(&src_block_setting)[2] = -1;
+    // fill in byte 3
+    reinterpret_cast<int*>(&src_block_setting)[3] = 0x00027000;
+
+    dst = __llvm_amdgcn_buffer_loadx2(
+        src_block_setting, 0, src_thread_addr_offset + src_const_addr_offset, false, false);
+
+    return dst;
+#endif
 }
 
 template <>
 __device__ vector_type<float, 4>::MemoryType __buffer_load<float, 4>(
     const float* p_src_block, uint32_t src_thread_data_offset, uint32_t src_const_data_offset)
 {
+#if 0
     vector_type<float, 4>::MemoryType dst;
 
     uint32_t src_thread_addr_offset = src_thread_data_offset * sizeof(float);
@@ -277,6 +359,25 @@ __device__ vector_type<float, 4>::MemoryType __buffer_load<float, 4>(
                  : "v"(src_thread_addr_offset), "s"(src_block_setting), "s"(src_const_addr_offset));
 
     return dst;
+#elif 1
+    vector_type<float, 4>::MemoryType dst;
+
+    uint32_t src_thread_addr_offset = src_thread_data_offset * sizeof(float);
+    uint32_t src_const_addr_offset  = src_const_data_offset * sizeof(float);
+
+    int32x4_t src_block_setting{0};
+    // fill in byte 0 - 1
+    *reinterpret_cast<float**>(&src_block_setting) = const_cast<float*>(p_src_block);
+    // fill in byte 2
+    reinterpret_cast<int*>(&src_block_setting)[2] = -1;
+    // fill in byte 3
+    reinterpret_cast<int*>(&src_block_setting)[3] = 0x00027000;
+
+    dst = __llvm_amdgcn_buffer_loadx4(
+        src_block_setting, 0, src_thread_addr_offset + src_const_addr_offset, false, false);
+
+    return dst;
+#endif
 }
 
 template <>
@@ -285,6 +386,7 @@ __device__ void __buffer_store<float, 1>(const float& src,
                                          uint32_t dst_thread_data_offset,
                                          uint32_t dst_const_data_offset)
 {
+#if 0
     uint32_t dst_thread_addr_offset = dst_thread_data_offset * sizeof(float);
     uint32_t dst_const_addr_offset  = dst_const_data_offset * sizeof(float);
 
@@ -304,6 +406,107 @@ __device__ void __buffer_store<float, 1>(const float& src,
                    "v"(src),
                    "v"(dst_thread_addr_offset),
                    "s"(dst_const_addr_offset));
+#else
+    uint32_t dst_thread_addr_offset = dst_thread_data_offset * sizeof(float);
+    uint32_t dst_const_addr_offset  = dst_const_data_offset * sizeof(float);
+
+    int32x4_t dst_block_setting{0};
+    // fill in byte 0 - 1
+    *reinterpret_cast<float**>(&dst_block_setting) = p_dst_block;
+    // fill in byte 2
+    reinterpret_cast<int*>(&dst_block_setting)[2] = -1;
+    // fill in byte 3
+    reinterpret_cast<int*>(&dst_block_setting)[3] = 0x00027000;
+
+    __llvm_amdgcn_buffer_store(
+        src, dst_block_setting, 0, dst_thread_addr_offset + dst_const_addr_offset, false, false);
+#endif
+}
+
+template <>
+__device__ void __buffer_store<float, 2>(const vector_type<float, 2>::MemoryType& src,
+                                         float* p_dst_block,
+                                         uint32_t dst_thread_data_offset,
+                                         uint32_t dst_const_data_offset)
+{
+#if 0
+    uint32_t dst_thread_addr_offset = dst_thread_data_offset * sizeof(float);
+    uint32_t dst_const_addr_offset  = dst_const_data_offset * sizeof(float);
+
+    int32x4_t dst_block_setting{0};
+    // fill in byte 0 - 1
+    *reinterpret_cast<float**>(&dst_block_setting) = p_dst_block;
+    // fill in byte 2
+    reinterpret_cast<int*>(&dst_block_setting)[2] = -1;
+    // fill in byte 3
+    reinterpret_cast<int*>(&dst_block_setting)[3] = 0x00027000;
+
+    asm volatile("\n \
+    buffer_store_dwordx2 %1, %2, %0, %3 offen offset:0 \n \
+    "
+                 :
+                 : "s"(dst_block_setting),
+                   "v"(src),
+                   "v"(dst_thread_addr_offset),
+                   "s"(dst_const_addr_offset));
+#else
+    uint32_t dst_thread_addr_offset = dst_thread_data_offset * sizeof(float);
+    uint32_t dst_const_addr_offset  = dst_const_data_offset * sizeof(float);
+
+    int32x4_t dst_block_setting{0};
+    // fill in byte 0 - 1
+    *reinterpret_cast<float**>(&dst_block_setting) = p_dst_block;
+    // fill in byte 2
+    reinterpret_cast<int*>(&dst_block_setting)[2] = -1;
+    // fill in byte 3
+    reinterpret_cast<int*>(&dst_block_setting)[3] = 0x00027000;
+
+    __llvm_amdgcn_buffer_storex2(
+        src, dst_block_setting, 0, dst_thread_addr_offset + dst_const_addr_offset, false, false);
+#endif
+}
+
+template <>
+__device__ void __buffer_store<float, 4>(const vector_type<float, 4>::MemoryType& src,
+                                         float* p_dst_block,
+                                         uint32_t dst_thread_data_offset,
+                                         uint32_t dst_const_data_offset)
+{
+#if 0
+    uint32_t dst_thread_addr_offset = dst_thread_data_offset * sizeof(float);
+    uint32_t dst_const_addr_offset  = dst_const_data_offset * sizeof(float);
+
+    int32x4_t dst_block_setting{0};
+    // fill in byte 0 - 1
+    *reinterpret_cast<float**>(&dst_block_setting) = p_dst_block;
+    // fill in byte 2
+    reinterpret_cast<int*>(&dst_block_setting)[2] = -1;
+    // fill in byte 3
+    reinterpret_cast<int*>(&dst_block_setting)[3] = 0x00027000;
+
+    asm volatile("\n \
+    buffer_store_dwordx4 %1, %2, %0, %3 offen offset:0 \n \
+    "
+                 :
+                 : "s"(dst_block_setting),
+                   "v"(src),
+                   "v"(dst_thread_addr_offset),
+                   "s"(dst_const_addr_offset));
+#else
+    uint32_t dst_thread_addr_offset = dst_thread_data_offset * sizeof(float);
+    uint32_t dst_const_addr_offset  = dst_const_data_offset * sizeof(float);
+
+    int32x4_t dst_block_setting{0};
+    // fill in byte 0 - 1
+    *reinterpret_cast<float**>(&dst_block_setting) = p_dst_block;
+    // fill in byte 2
+    reinterpret_cast<int*>(&dst_block_setting)[2] = -1;
+    // fill in byte 3
+    reinterpret_cast<int*>(&dst_block_setting)[3] = 0x00027000;
+
+    __llvm_amdgcn_buffer_storex4(
+        src, dst_block_setting, 0, dst_thread_addr_offset + dst_const_addr_offset, false, false);
+#endif
 }
 
 __device__ void vmcnt(index_t cnt)
diff --git a/driver/include/device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw.hpp b/driver/include/device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw.hpp
index febd1625e2..7c1f142a8d 100644
--- a/driver/include/device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw.hpp
+++ b/driver/include/device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw.hpp
@@ -47,7 +47,7 @@ void device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw(InDesc,
     wei_kcyx_device_buf.ToDevice(wei_kcyx.mData.data());
     out_nkhw_device_buf.ToDevice(out_nkhw.mData.data());
 
-#if 0
+#if 1
     // BlockSize = 256, blockwise-GEMM 128x128, each thread hold 64 data
     constexpr index_t BlockSize = 256;