diff --git a/composable_kernel/include/tensor_operation/blockwise_generic_tensor_slice_copy.hpp b/composable_kernel/include/tensor_operation/blockwise_generic_tensor_slice_copy.hpp
index fba11478f8..0f9976f453 100644
--- a/composable_kernel/include/tensor_operation/blockwise_generic_tensor_slice_copy.hpp
+++ b/composable_kernel/include/tensor_operation/blockwise_generic_tensor_slice_copy.hpp
@@ -487,6 +487,7 @@ struct BlockwiseGenericTensorSliceCopy_v2
 #if 0
         mThreadwiseLoad.Run(p_src, p_buffer);
 #else
+        // hardcoded: global to register
         mThreadwiseLoad.template Run_amd_experiment(p_src, p_buffer);
 #endif
     }
@@ -497,7 +498,8 @@ struct BlockwiseGenericTensorSliceCopy_v2
 #if 0
         mThreadwiseStore.Run(p_buffer, p_dst);
 #else
-        mThreadwiseStore.template Run_amd_experiment(p_buffer, p_dst);
+        // hardcoded: register to LDS
+        mThreadwiseStore.template Run_amd_experiment(p_buffer, p_dst);
 #endif
     }
 
@@ -506,13 +508,8 @@ struct BlockwiseGenericTensorSliceCopy_v2
     {
         TData p_buffer[GetRegisterBufferSize()];
 
-#if 0
-        mThreadwiseLoad.Run(p_src, p_buffer);
-        mThreadwiseStore.Run(p_buffer, p_dst);
-#else
-        mThreadwiseLoad.template Run_amd_experiment(p_src, p_buffer);
-        mThreadwiseStore.template Run_amd_experiment(p_buffer, p_dst);
-#endif
+        RunLoadRegisterBuffer(p_src, p_buffer);
+        RunStoreRegisterBuffer(p_buffer, p_dst);
     }
 
     template
diff --git a/composable_kernel/include/tensor_operation/threadwise_generic_tensor_slice_copy.hpp b/composable_kernel/include/tensor_operation/threadwise_generic_tensor_slice_copy.hpp
index 097ac78a8a..e661264d86 100644
--- a/composable_kernel/include/tensor_operation/threadwise_generic_tensor_slice_copy.hpp
+++ b/composable_kernel/include/tensor_operation/threadwise_generic_tensor_slice_copy.hpp
@@ -819,38 +819,38 @@ struct ThreadwiseGenericTensorSliceCopy_v2r1
 
             src_vector_t vector_data;
 
+            // Read vector from src.
+            // 1. The source code version can take src of all kinds of memory-space
+            // 2. The inline asm versions using global_load or buffer_load can only take
+            //    src from global-memory
+            //
+            // Comment for loading from global-memory:
+            // When
+            //   1) using source code, in order for compiler to emit optimal
+            //      load instruction, or
+            //   2) using inline asm (global_load or buffer_load), in order
+            //      for inline asm to be valid,
+            // the following assumptions need to be satisfied:
+            //   1. p_src needs to be block-invariant (assumption)
+            //   2. src_normal_offset must be calculated at compile time (guaranteed)
+            //   3. src_merged_offset can be a runtime value (no assumption imposed)
             static_if{}([&](auto) {
 #if 1 // source code
-                // Load vector from src.
-                // src can be all kinds of memory-space.
-                // In order for optimized global_load to be emitted by compiler, need to
-                // assume:
-                //   1. p_src need to be block-invariant (assumption)
-                //   2. src_normal_offset must be calculatd at compile time (guaranteed)
-                //   3. src_merged_offset can be runtime value (no assumption imposed)
                 vector_data = *reinterpret_cast<const src_vector_t*>(
                     &p_src[src_normal_offset + src_merged_offset]);
-#else // inline asm using buffer_load
-                // Load vector from src
-                // src's memory-space can only be global-memory (buffer_load inline-asm is
-                // used)
-                // In order for buffer_load to be valid, need to assume:
-                //   1. p_src need to be block-invariant (assumption)
-                //   2. src_normal_offset must be calculatd at compile time (guaranteed)
-                //   3. src_merged_offset can be runtime value (no assumption imposed)
-                vector_data = buffer_load(
+#elif 1 // inline asm using global_load
+                vector_data = __global_load(
+                    p_src,
+                    static_cast<uint32_t>(src_merged_offset),
+                    static_cast<uint32_t>(src_normal_offset));
+#elif 1 // inline asm using buffer_load
+                vector_data = __buffer_load(
                     p_src,
                     static_cast<uint32_t>(src_merged_offset),
                     static_cast<uint32_t>(src_normal_offset));
 #endif
             }).Else([&](auto) {
-                // Load vector from src.
                 // src can be all kinds of memory-space.
-                // In order for optimized global_load to be emitted by compiler, need to
-                // assume:
-                //   1. p_src need to be block-invariant (assumption)
-                //   2. src_normal_offset must be calculatd at compile time (guaranteed)
-                //   3. src_merged_offset can be runtime value (no assumption imposed)
                 vector_data = *reinterpret_cast<const src_vector_t*>(
                     &p_src[src_normal_offset + src_merged_offset]);
             });
 
@@ -924,36 +924,34 @@ struct ThreadwiseGenericTensorSliceCopy_v2r1
             const index_t dst_normal_offset =
                 DstDesc::GetOffsetFromMultiIndex(dst_normal_dim_data_id);
 
+            // Write vector into dst.
+            // 1. The source code version can take dst of all kinds of memory-space
+            // 2. The inline asm versions using global_store or buffer_store can only take
+            //    dst from global-memory
+            //
+            // Comment for storing into global-memory:
+            // When
+            //   1) using source code, in order for compiler to emit optimal
+            //      store instruction, or
+            //   2) using inline asm (global_store or buffer_store), in order
+            //      for inline asm to be valid,
+            // the following assumptions need to be satisfied:
+            //   1. p_dst needs to be block-invariant (assumption)
+            //   2. dst_normal_offset must be calculated at compile time (guaranteed)
+            //   3. dst_merged_offset can be a runtime value (no assumption imposed)
             static_if{}([&](auto) {
 #if 1 // source code
-                // Write vector into dst.
-                // dst can be all kinds of memory-space
-                // In order for optmized global_store to be emitted by compiler, need to
-                // assume:
-                //   1. p_dst need to be block-invariant (assumption)
-                //   2. dst_normal_offset must be calculatd at compile time (guaranteed)
-                //   3. dst_merged_offset can be runtime value (no assumption imposed)
                 *reinterpret_cast<dst_vector_t*>(
                     &p_dst[dst_normal_offset + dst_merged_offset]) = vector_data;
-#else // inline asm using buffer_store
-                // Write vector into dst.
-                // dst's memory-space need to be global-memory (buffer_store is used)
-                // In order for optmized global_store to be emitted by compiler, need to
-                // assume:
-                //   1. p_dst need to be block-invariant (assumption)
-                //   2. dst_normal_offset must be calculatd at compile time (guaranteed)
-                //   3. dst_merged_offset can be runtime value (no assumption imposed)
-                buffer_store(
+#elif 1 // inline asm using global_store
+                __global_store(
+                    vector_data, p_dst, dst_merged_offset, dst_normal_offset);
+#elif 1 // inline asm using buffer_store
+                __buffer_store(
                     vector_data, p_dst, dst_merged_offset, dst_normal_offset);
 #endif
             }).Else([&](auto) {
-                // Write vector into dst.
                 // dst can be all kinds of memory-space
-                // In order for optmized global_store to be emitted by compiler, need to
-                // assume:
-                //   1. p_dst need to be block-invariant (assumption)
-                //   2. dst_normal_offset must be calculatd at compile time (guaranteed)
-                //   3. dst_merged_offset can be runtime value (no assumption imposed)
                 *reinterpret_cast<dst_vector_t*>(
                     &p_dst[dst_normal_offset + dst_merged_offset]) = vector_data;
             });
diff --git a/composable_kernel/include/utility/amd_inline_asm.hpp b/composable_kernel/include/utility/amd_inline_asm.hpp
index 307c96c4a4..04b6864fa4 100644
--- a/composable_kernel/include/utility/amd_inline_asm.hpp
+++ b/composable_kernel/include/utility/amd_inline_asm.hpp
@@ -8,21 +8,169 @@ namespace ck {
 // cast a pointer of LDS to its address
 extern "C" __attribute__((address_space(3))) __device__ void* __to_local(void* p);
 
-// buffer_load and buffer_store
+// global_load and global_store
 template
 __device__ typename vector_type::MemoryType
-buffer_load(const T* p_src_block, uint32_t src_thread_offset, uint32_t src_const_offset);
+__global_load(const T* p_src_block, uint32_t src_thread_offset, uint32_t src_const_offset);
 
 template
-__device__ void buffer_store(const typename vector_type::MemoryType& src,
-                             T* p_dst_block,
-                             uint32_t dst_thread_offset,
-                             uint32_t dst_const_offset);
+__device__ void __global_store(const typename vector_type::MemoryType& src,
+                               T* p_dst_block,
+                               uint32_t dst_thread_offset,
+                               uint32_t dst_const_offset);
 
 template <>
-__device__ float buffer_load(const float* p_src_block,
-                             uint32_t src_thread_offset,
-                             uint32_t src_const_offset)
+__device__ float __global_load(const float* p_src_block,
+                               uint32_t src_thread_offset,
+                               uint32_t src_const_offset)
+{
+#if 0 // compute on VALU
+    float dst;
+
+    uint64_t src_thread_offset_u64 = static_cast<uint64_t>(src_thread_offset + src_const_offset);
+
+    asm volatile("\n \
+    global_load_dword %0, %1, %2, offset:0 \n \
+    s_waitcnt 0 \n \
+    "
+                 : "=v"(dst)
+                 : "v"(src_thread_offset_u64), "s"(p_src_block));
+
+    return dst;
+#else // compute on SALU
+    float dst;
+
+    uint64_t src_thread_offset_u64 = static_cast<uint64_t>(src_thread_offset);
+
+    const float* p_src_block_with_offset = p_src_block + src_const_offset;
+
+    asm volatile("\n \
+    global_load_dword %0, %1, %2, offset:0 \n \
+    ;;s_waitcnt 0 \n \
+    "
+                 : "=v"(dst)
+                 : "v"(src_thread_offset_u64), "s"(p_src_block_with_offset));
+
+    return dst;
+#endif
+}
+
+template <>
+__device__ vector_type<float, 2>::MemoryType __global_load(const float* p_src_block,
+                                                           uint32_t src_thread_offset,
+                                                           uint32_t src_const_offset)
+{
+#if 0 // compute on VALU
+    vector_type<float, 2>::MemoryType dst;
+
+    uint64_t src_thread_offset_u64 = static_cast<uint64_t>(src_thread_offset + src_const_offset);
+
+    asm volatile("\n \
+    global_load_dwordx2 %0, %1, %2, offset:0 \n \
+    s_waitcnt 0 \n \
+    "
+                 : "=v"(dst)
+                 : "v"(src_thread_offset_u64), "s"(p_src_block));
+
+    return dst;
+#else // compute on SALU
+    vector_type<float, 2>::MemoryType dst;
+
+    uint64_t src_thread_offset_u64 = static_cast<uint64_t>(src_thread_offset);
+
+    const float* p_src_block_with_offset = p_src_block + src_const_offset;
+
+    asm volatile("\n \
+    global_load_dwordx2 %0, %1, %2, offset:0 \n \
+    ;;s_waitcnt 0 \n \
+    "
+                 : "=v"(dst)
+                 : "v"(src_thread_offset_u64), "s"(p_src_block_with_offset));
+
+    return dst;
+#endif
+}
+
+template <>
+__device__ vector_type<float, 4>::MemoryType __global_load(const float* p_src_block,
+                                                           uint32_t src_thread_offset,
+                                                           uint32_t src_const_offset)
+{
+#if 0 // compute on VALU
+    vector_type<float, 4>::MemoryType dst;
+
+    uint64_t src_thread_offset_u64 = static_cast<uint64_t>(src_thread_offset + src_const_offset);
+
+    asm volatile("\n \
+    global_load_dwordx4 %0, %1, %2, offset:0 \n \
+    s_waitcnt 0 \n \
+    "
+                 : "=v"(dst)
+                 : "v"(src_thread_offset_u64), "s"(p_src_block));
+
+    return dst;
+#else // compute on SALU
+    vector_type<float, 4>::MemoryType dst;
+
+    uint64_t src_thread_offset_u64 = static_cast<uint64_t>(src_thread_offset);
+
+    const float* p_src_block_with_offset = p_src_block + src_const_offset;
+
+    asm volatile("\n \
+    global_load_dwordx4 %0, %1, %2, offset:0 \n \
+    ;;s_waitcnt 0 \n \
+    "
+                 : "=v"(dst)
+                 : "v"(src_thread_offset_u64), "s"(p_src_block_with_offset));
+
+    return dst;
+#endif
+}
+
+template <>
+__device__ void __global_store(const float& src,
+                               float* p_dst_block,
+                               uint32_t dst_thread_offset,
+                               uint32_t dst_const_offset)
+{
+#if 0 // compute on VALU
+    uint64_t dst_thread_offset_u64 = static_cast<uint64_t>(dst_thread_offset + dst_const_offset);
+
+    asm volatile("\n \
+    global_store_dword %0, %1, %2, offset:0 \n \
+    s_waitcnt 0 \n \
+    "
+                 :
+                 : "v"(dst_thread_offset_u64), "v"(src), "s"(p_dst_block));
+#else // compute on SALU
+    uint64_t dst_thread_offset_u64 = static_cast<uint64_t>(dst_thread_offset);
+
+    float* p_dst_block_with_offset = p_dst_block + dst_const_offset;
+
+    asm volatile("\n \
+    global_store_dword %0, %1, %2, offset:0 \n \
+    ;;s_waitcnt 0 \n \
+    "
+                 :
+                 : "v"(dst_thread_offset_u64), "v"(src), "s"(p_dst_block_with_offset));
+#endif
+}
+
+// __buffer_load and __buffer_store
+template
+__device__ typename vector_type::MemoryType
+__buffer_load(const T* p_src_block, uint32_t src_thread_offset, uint32_t src_const_offset);
+
+template
+__device__ void __buffer_store(const typename vector_type::MemoryType& src,
+                               T* p_dst_block,
+                               uint32_t dst_thread_offset,
+                               uint32_t dst_const_offset);
+
+template <>
+__device__ float __buffer_load(const float* p_src_block,
+                               uint32_t src_thread_offset,
+                               uint32_t src_const_offset)
 {
     float dst;
 
@@ -35,7 +183,7 @@ __device__ float buffer_load(const float* p_src_block,
     reinterpret_cast<int32_t*>(&src_block_setting)[3] = 0x00027000;
 
     asm volatile("\n \
-    buffer_load_dword %0, %1, %2, %3 offen offset:0 \n \
+    __buffer_load_dword %0, %1, %2, %3 offen offset:0 \n \
     s_waitcnt 0 \n \
     "
                  : "=v"(dst)
@@ -45,9 +193,9 @@
 }
 
 template <>
-__device__ vector_type<float, 2>::MemoryType buffer_load(const float* p_src_block,
-                                                         uint32_t src_thread_offset,
-                                                         uint32_t src_const_offset)
+__device__ vector_type<float, 2>::MemoryType __buffer_load(const float* p_src_block,
+                                                           uint32_t src_thread_offset,
+                                                           uint32_t src_const_offset)
 {
     vector_type<float, 2>::MemoryType dst;
 
@@ -60,7 +208,7 @@ __device__ vector_type::MemoryType buffer_load(const float*
     reinterpret_cast<int32_t*>(&src_block_setting)[3] = 0x00027000;
 
     asm volatile("\n \
-    buffer_load_dwordx2 %0, %1, %2, %3 offen offset:0 \n \
+    __buffer_load_dwordx2 %0, %1, %2, %3 offen offset:0 \n \
     s_waitcnt 0 \n \
     "
                  : "=v"(dst)
@@ -70,9 +218,9 @@ __device__ vector_type::MemoryType buffer_load(const float*
 }
 
 template <>
-__device__ vector_type<float, 4>::MemoryType buffer_load(const float* p_src_block,
-                                                         uint32_t src_thread_offset,
-                                                         uint32_t src_const_offset)
+__device__ vector_type<float, 4>::MemoryType __buffer_load(const float* p_src_block,
+                                                           uint32_t src_thread_offset,
+                                                           uint32_t src_const_offset)
 {
     vector_type<float, 4>::MemoryType dst;
 
@@ -85,7 +233,7 @@ __device__ vector_type::MemoryType buffer_load(const float*
     reinterpret_cast<int32_t*>(&src_block_setting)[3] = 0x00027000;
 
    asm volatile("\n \
-    buffer_load_dwordx4 %0, %1, %2, %3 offen offset:0 \n \
+    __buffer_load_dwordx4 %0, %1, %2, %3 offen offset:0 \n \
     s_waitcnt 0 \n \
     "
                  : "=v"(dst)
@@ -95,10 +243,10 @@ __device__ vector_type::MemoryType buffer_load(const float*
 }
 
 template <>
-__device__ void buffer_store(const float& src,
-                             float* p_dst_block,
-                             uint32_t dst_thread_offset,
-                             uint32_t dst_const_offset)
+__device__ void __buffer_store(const float& src,
+                               float* p_dst_block,
+                               uint32_t dst_thread_offset,
+                               uint32_t dst_const_offset)
 {
     int32x4_t dst_block_setting{0};
     // fill in byte 0 - 1
@@ -109,7 +257,7 @@ __device__ void buffer_store(const float& src,
     reinterpret_cast<int32_t*>(&dst_block_setting)[3] = 0x00027000;
 
     asm volatile("\n \
-    buffer_store_dword %1, %2, %0, %3 offen offset:0 \n \
+    __buffer_store_dword %1, %2, %0, %3 offen offset:0 \n \
     s_waitcnt 0 \n \
     "
                  :
diff --git a/driver/include/device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw.hpp b/driver/include/device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw.hpp
index 4555ce873f..febd1625e2 100644
--- a/driver/include/device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw.hpp
+++ b/driver/include/device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw.hpp
@@ -47,7 +47,7 @@ void device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw(InDesc,
     wei_kcyx_device_buf.ToDevice(wei_kcyx.mData.data());
     out_nkhw_device_buf.ToDevice(out_nkhw.mData.data());
 
-#if 1
+#if 0
     // BlockSize = 256, blockwise-GEMM 128x128, each thread hold 64 data
     constexpr index_t BlockSize = 256;
 
@@ -82,7 +82,7 @@ void device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw(InDesc,
     using WeiBlockCopySrcAccessOrder = Sequence<1, 0>; // [K, E]
     using WeiBlockCopyDstAccessOrder = Sequence<0, 1>; // [E, K]
 
-    constexpr index_t WeiBlockCopySrcDataPerRead_E = 1;
+    constexpr index_t WeiBlockCopySrcDataPerRead_E = 4;
     constexpr index_t WeiBlockCopyDstDataPerWrite_K = 1;
 #elif 0
     // BlockSize = 64, blockwise-GEMM 64x64, each thread hold 64 data
@@ -156,7 +156,7 @@
     using WeiBlockCopySrcAccessOrder = Sequence<1, 0>; // [K, E]
     using WeiBlockCopyDstAccessOrder = Sequence<0, 1>; // [E, K]
 
-    constexpr index_t WeiBlockCopySrcDataPerRead_E = 1;
+    constexpr index_t WeiBlockCopySrcDataPerRead_E = 2;
     constexpr index_t WeiBlockCopyDstDataPerWrite_K = 1;
 
 #endif
diff --git a/driver/src/driver.cpp b/driver/src/driver.cpp
index 93a871b83b..166f9382fe 100644
--- a/driver/src/driver.cpp
+++ b/driver/src/driver.cpp
@@ -103,7 +103,7 @@ int main(int argc, char* argv[])
     using LeftPads  = Sequence<0, 0>;
     using RightPads = Sequence<0, 0>;
 
-#elif 0
+#elif 1
     // 1x1 filter, 8x8 image
     // cudnn@V100 68%, ck@V100 72%, ck@P100 52%, ck@VII 42%
     constexpr index_t N = 64;
@@ -295,7 +295,7 @@ int main(int argc, char* argv[])
     using LeftPads  = Sequence<0, 0>;
     using RightPads = Sequence<0, 0>;
 
-#elif 1
+#elif 0
     // 3x3 filter, 2x2 stride, 35x35 input, 17x17 output
     // cudnn@V100 90%, ck@V100 93%, ck@P100 83%, ck@VII 81%
     constexpr index_t N = 128;