diff --git a/driver/driver.hip.cpp b/driver/driver.hip.cpp index 0ea091e607..1607a9e802 100644 --- a/driver/driver.hip.cpp +++ b/driver/driver.hip.cpp @@ -592,7 +592,7 @@ int main(int argc, char* argv[]) constexpr index_t HPad = 0; constexpr index_t WPad = 0; -#elif 1 +#elif 0 // 1x1 filter, 14x14 image, C = 512 constexpr index_t N = 128; constexpr index_t C = 512; diff --git a/src/include/inline_asm.hpp b/src/include/amd_inline_asm.hip.hpp similarity index 87% rename from src/include/inline_asm.hpp rename to src/include/amd_inline_asm.hip.hpp index f7b1f217cb..38ac970981 100644 --- a/src/include/inline_asm.hpp +++ b/src/include/amd_inline_asm.hip.hpp @@ -1,8 +1,5 @@ #pragma once - -typedef float Float4 __attribute__((ext_vector_type(4))); - -extern "C" __attribute__((address_space(3))) void* __to_local(void* p)[[hc]]; +#include "common.hip.hpp" #define NO_VM_WAIT 0 #define NO_LGKM_WAIT 0 @@ -10,7 +7,10 @@ extern "C" __attribute__((address_space(3))) void* __to_local(void* p)[[hc]]; #define NO_DS_WRITE 0 #define NO_GLB_READ 0 -inline __device__ void vmcnt(index_t cnt) +// cast a pointer of LDS to its address +extern "C" __attribute__((address_space(3))) void* __to_local(void* p)[[hc]]; + +__device__ void vmcnt(index_t cnt) { #if !NO_VM_WAIT if(cnt == 0) @@ -44,7 +44,7 @@ inline __device__ void vmcnt(index_t cnt) #endif } -inline __device__ void lgkmcnt(index_t cnt) +__device__ void lgkmcnt(index_t cnt) { #if !NO_LGKM_WAIT if(cnt == 0) @@ -84,7 +84,7 @@ inline __device__ void lgkmcnt(index_t cnt) #endif } -inline __device__ void outerProduct1x4(const float* a, const float* b, float* c) +__device__ void outerProduct1x4(const float* a, const float* b, float* c) { asm volatile("\n \ v_mac_f32 %0, %4, %5 \n \ @@ -104,7 +104,9 @@ inline __device__ void outerProduct1x4(const float* a, const float* b, float* c) "3"(c[3])); } -inline __device__ void outerProduct1x4(const float& a, const Float4& b, Float4& c) +__device__ void outerProduct1x4(const float& a, + const 
vector_type<float, 4>::MemoryType& b, + vector_type<float, 4>::MemoryType& c) { #if 0 asm volatile( @@ -123,8 +125,12 @@ inline __device__ void outerProduct1x4(const float& a, const Float4& b, Float4& #endif } -inline __device__ void -outerProduct4x4(const Float4& a, const Float4& b, Float4& c0, Float4& c1, Float4& c2, Float4& c3) +__device__ void outerProduct4x4(const vector_type<float, 4>::MemoryType& a, + const vector_type<float, 4>::MemoryType& b, + vector_type<float, 4>::MemoryType& c0, + vector_type<float, 4>::MemoryType& c1, + vector_type<float, 4>::MemoryType& c2, + vector_type<float, 4>::MemoryType& c3) { #if 0 asm volatile( @@ -179,7 +185,9 @@ outerProduct4x4(const Float4& a, const Float4& b, Float4& c0, Float4& c1, Float4 #endif } -inline __device__ void outerProduct8x8(const Float4* a, const Float4* b, Float4* c) +__device__ void outerProduct8x8(const vector_type<float, 4>::MemoryType* a, + const vector_type<float, 4>::MemoryType* b, + vector_type<float, 4>::MemoryType* c) { outerProduct4x4(a[0], b[0], c[0], c[2], c[4], c[6]); outerProduct4x4(a[0], b[1], c[1], c[3], c[5], c[7]); @@ -187,7 +195,7 @@ inline __device__ void outerProduct8x8(const Float4* a, const Float4* b, Float4* outerProduct4x4(a[1], b[1], c[9], c[11], c[13], c[15]); } -inline __device__ void ds_read_b128(Float4& r, void* lds, index_t offset = 0) +__device__ void ds_read_b128(vector_type<float, 4>::MemoryType& r, void* lds, index_t offset = 0) { #if !NO_DS_READ if(offset == 0) @@ -413,7 +421,9 @@ inline __device__ void ds_read_b128(Float4& r, void* lds, index_t offset = 0) #endif } -inline __device__ void global_load(Float4& r, const Float4* ptr, index_t offset = 0) +__device__ void global_load(vector_type<float, 4>::MemoryType& r, + const vector_type<float, 4>::MemoryType* ptr, + index_t offset = 0) { #if !NO_GLB_READ if(offset == 0) @@ -431,7 +441,8 @@ inline __device__ void global_load(Float4& r, const Float4* ptr, index_t offset #endif } -inline __device__ void ds_write_b128(const Float4& r, void* lds, index_t offset = 0) +__device__ void +ds_write_b128(const vector_type<float, 4>::MemoryType& r, void* lds, index_t offset = 0) { 
#if !NO_DS_WRITE if(offset == 0) diff --git a/src/include/blockwise_gemm.hip.hpp b/src/include/blockwise_gemm.hip.hpp index 7f25a7322f..1ebc780bf8 100644 --- a/src/include/blockwise_gemm.hip.hpp +++ b/src/include/blockwise_gemm.hip.hpp @@ -130,6 +130,10 @@ struct BlockwiseGemmBlockABlockBThreadCTransANormalBNormalC_v2 const FloatB* __restrict__ p_b_block, FloatC* __restrict__ p_c_thread) const { + static_assert(is_same<FloatA, float>::value && is_same<FloatB, float>::value && + is_same<FloatC, float>::value, + "Run_asm only deal with float\n"); + constexpr auto True = integral_constant<bool, true>{}; constexpr auto False = integral_constant<bool, false>{}; @@ -158,6 +162,12 @@ struct BlockwiseGemmBlockABlockBThreadCTransANormalBNormalC_v2 constexpr auto b_thread_sub_mtx = make_ConstantMatrixDescriptor( Number{}, Number{}, Number{}); + static_assert(MPerThreadSubC == 4 && NPerThreadSubC == 4 && KPerThreadLoop == 1 && + MPerThread == 8 && NPerThread == 8, + "Run_asm cannot deal with this GEMM shape yet\n"); + + using Float4 = vector_type<float, 4>::MemoryType; + float p_thread[a_thread_mtx.GetElementSpace() + b_thread_mtx.GetElementSpace()]; FloatA* p_a_thread = p_thread; diff --git a/src/include/common.hip.hpp b/src/include/common.hip.hpp index 1c41d04feb..feb9060be7 100644 --- a/src/include/common.hip.hpp +++ b/src/include/common.hip.hpp @@ -5,8 +5,8 @@ #include "Array.hip.hpp" #include "functional.hip.hpp" -#if DEVICE_BACKEDN_HIP -#include "inline_asm.hpp" +#if DEVICE_BACKEND_HIP +#include "amd_inline_asm.hip.hpp" #endif __device__ index_t get_thread_local_1d_id() { return threadIdx.x; } @@ -67,8 +67,3 @@ __host__ __device__ constexpr T min(T x, Ts... xs) return x < y ? 
x : y; } } // namespace mod_conv - -#if DEVICE_BACKEND_HIP -// cast a pointer of LDS to its address -extern "C" __attribute__((address_space(3))) void* __to_local(void* p)[[hc]]; -#endif diff --git a/src/include/data_type.hip.hpp b/src/include/data_type.hip.hpp index ee44eebbe7..31791542ae 100644 --- a/src/include/data_type.hip.hpp +++ b/src/include/data_type.hip.hpp @@ -16,12 +16,14 @@ template <> struct vector_type<float, 2> { #if DEVICE_BACKEND_HIP - // For some reason, HIP compiler need this definition to generate optimal load and store instruction + // For some reason, HIP compiler need this definition to generate optimal load and store + // instruction typedef float MemoryType __attribute__((ext_vector_type(2))); #elif DEVICE_BACKEND_CUDA // For some reason, CUDA need this definition to, otherwise // compiler won't generate optimal load and store instruction, and - // kernel would produce wrong result, indicating the compiler fail to generate correct instruction, + // kernel would produce wrong result, indicating the compiler fail to generate correct + // instruction, using MemoryType = float2; #endif @@ -43,12 +45,14 @@ template <> struct vector_type<float, 4> { #if DEVICE_BACKEND_HIP - // For some reason, HIP compiler need this definition to generate optimal load and store instruction + // For some reason, HIP compiler need this definition to generate optimal load and store + // instruction typedef float MemoryType __attribute__((ext_vector_type(4))); #elif DEVICE_BACKEND_CUDA // For some reason, CUDA need this definition to, otherwise // compiler won't generate optimal load and store instruction, and - // kernel would produce wrong result, indicating the compiler fail to generate correct instruction, + // kernel would produce wrong result, indicating the compiler fail to generate correct + // instruction, using MemoryType = float4; #endif }; diff --git a/src/include/gridwise_convolution_implicit_gemm_v2_chwn_cyxk_khwn.hip.hpp 
b/src/include/gridwise_convolution_implicit_gemm_v2_chwn_cyxk_khwn.hip.hpp index acd4df9662..64d3c03970 100644 --- a/src/include/gridwise_convolution_implicit_gemm_v2_chwn_cyxk_khwn.hip.hpp +++ b/src/include/gridwise_convolution_implicit_gemm_v2_chwn_cyxk_khwn.hip.hpp @@ -239,9 +239,9 @@ struct GridwiseConvolutionImplicitGemm_v2_chwn_cyxk_khwn #elif 0 blockwise_gemm.Run_asm #endif - (p_wei_block + wei_cyxk_block_desc.Get1dIndex(0, y, x, 0), - p_in_block + y * Wi + x, - p_out_thread); + (p_wei_block + wei_cyxk_block_desc.Get1dIndex(0, y, x, 0), + p_in_block + y * Wi + x, + p_out_thread); } } } diff --git a/src/include/gridwise_convolution_implicit_gemm_v2_chwn_cyxk_khwn_lds_double_buffer.hip.hpp b/src/include/gridwise_convolution_implicit_gemm_v2_chwn_cyxk_khwn_lds_double_buffer.hip.hpp index f1cd81b32b..7c867b620f 100644 --- a/src/include/gridwise_convolution_implicit_gemm_v2_chwn_cyxk_khwn_lds_double_buffer.hip.hpp +++ b/src/include/gridwise_convolution_implicit_gemm_v2_chwn_cyxk_khwn_lds_double_buffer.hip.hpp @@ -34,11 +34,6 @@ template struct GridwiseConvolutionImplicitGemm_v2_chwn_cyxk_khwn_lds_double_buffer { - __host__ - __device__ constexpr GridwiseConvolutionImplicitGemm_v2_chwn_cyxk_khwn_lds_double_buffer() - { - } - __device__ void Run(const Float* const __restrict__ p_in_global, const Float* const __restrict__ p_wei_global, Float* const __restrict__ p_out_global) const @@ -203,7 +198,6 @@ struct GridwiseConvolutionImplicitGemm_v2_chwn_cyxk_khwn_lds_double_buffer // preload data into LDS { -#if 1 Float p_in_register_clipboard[blockwise_in_copy.GetRegisterClipboardSize()]; Float p_wei_register_clipboard[blockwise_wei_copy.GetRegisterClipboardSize()]; @@ -212,19 +206,11 @@ struct GridwiseConvolutionImplicitGemm_v2_chwn_cyxk_khwn_lds_double_buffer blockwise_wei_copy.RunLoadRegisterClipboard(p_wei_global_block_offset, p_wei_register_clipboard); - blockwise_in_copy.RunStoreRegisterClipboard(p_in_register_clipboard, - p_in_block_double); +#if 0 + 
blockwise_in_copy.RunStoreRegisterClipboard(p_in_register_clipboard, p_in_block_double); blockwise_wei_copy.RunStoreRegisterClipboard(p_wei_register_clipboard, - p_wei_block_double); -#elif 0 - Float p_in_register_clipboard[blockwise_in_copy.GetRegisterClipboardSize()]; - Float p_wei_register_clipboard[blockwise_wei_copy.GetRegisterClipboardSize()]; - - blockwise_in_copy.RunLoadRegisterClipboard(p_in_global_block_offset, - p_in_register_clipboard); - blockwise_wei_copy.RunLoadRegisterClipboard(p_wei_global_block_offset, - p_wei_register_clipboard); - + p_wei_block_double); +#elif 1 vmcnt(0); blockwise_in_copy.RunStoreRegisterClipboard_asm(p_in_register_clipboard, p_in_block_double); @@ -278,28 +264,26 @@ struct GridwiseConvolutionImplicitGemm_v2_chwn_cyxk_khwn_lds_double_buffer { for(index_t x = 0; x < X; ++x) { -#if 1 +#if 0 blockwise_gemm.Run #elif 0 blockwise_gemm.Run_RegisterDoubleBuffer #elif 1 blockwise_gemm.Run_asm #endif - (p_wei_block_now + wei_cyxk_block_desc.Get1dIndex(0, y, x, 0), - p_in_block_now + y * Wi + x, - p_out_thread); + (p_wei_block_now + wei_cyxk_block_desc.Get1dIndex(0, y, x, 0), + p_in_block_now + y * Wi + x, + p_out_thread); } } -#if 1 +#if 0 blockwise_in_copy.RunStoreRegisterClipboard(p_in_register_clipboard, p_in_block_next); blockwise_wei_copy.RunStoreRegisterClipboard(p_wei_register_clipboard, p_wei_block_next); -#elif 0 - // if work with RunLoadRegisterClipboard_asm, need to wait +#elif 1 vmcnt(0); - blockwise_in_copy.RunStoreRegisterClipboard_asm(p_in_register_clipboard, p_in_block_next); blockwise_wei_copy.RunStoreRegisterClipboard_asm(p_wei_register_clipboard, @@ -329,32 +313,29 @@ struct GridwiseConvolutionImplicitGemm_v2_chwn_cyxk_khwn_lds_double_buffer { for(index_t x = 0; x < X; ++x) { -#if 1 +#if 0 blockwise_gemm.Run #elif 0 blockwise_gemm.Run_RegisterDoubleBuffer #elif 1 blockwise_gemm.Run_asm #endif - (p_wei_block_double + wei_cyxk_block_desc.Get1dIndex(0, y, x, 0), - p_in_block_double + y * Wi + x, - p_out_thread); + 
(p_wei_block_double + wei_cyxk_block_desc.Get1dIndex(0, y, x, 0), + p_in_block_double + y * Wi + x, + p_out_thread); } } -#if 1 +#if 0 blockwise_in_copy.RunStoreRegisterClipboard(p_in_register_clipboard, p_in_block_double + in_block_space); blockwise_wei_copy.RunStoreRegisterClipboard(p_wei_register_clipboard, p_wei_block_double + wei_block_space); #else - // if work with RunLoadRegisterClipboard_asm, need to wait vmcnt(0); - blockwise_in_copy.RunStoreRegisterClipboard_asm(p_in_register_clipboard, p_in_block_double + in_block_space); - blockwise_wei_copy.RunStoreRegisterClipboard_asm(p_wei_register_clipboard, p_wei_block_double + wei_block_space); #endif @@ -366,17 +347,17 @@ struct GridwiseConvolutionImplicitGemm_v2_chwn_cyxk_khwn_lds_double_buffer { for(index_t x = 0; x < X; ++x) { -#if 1 +#if 0 blockwise_gemm.Run #elif 0 blockwise_gemm.Run_RegisterDoubleBuffer #elif 1 blockwise_gemm.Run_asm #endif - (p_wei_block_double + wei_block_space + - wei_cyxk_block_desc.Get1dIndex(0, y, x, 0), - p_in_block_double + in_block_space + y * Wi + x, - p_out_thread); + (p_wei_block_double + wei_block_space + + wei_cyxk_block_desc.Get1dIndex(0, y, x, 0), + p_in_block_double + in_block_space + y * Wi + x, + p_out_thread); } } }