From 33d1e0e2e54bed6f155dda50bd8d8796b5f20adf Mon Sep 17 00:00:00 2001 From: Chao Liu Date: Mon, 17 Jun 2019 14:58:44 -0500 Subject: [PATCH] refactoring for miopen --- ...tion_implicit_gemm_v1r1_chwn_cyxk_khwn.hpp | 2 +- ...tion_implicit_gemm_v1r3_chwn_cyxk_khwn.hpp | 2 +- ..._v1r3_chwn_cyxk_khwn_lds_double_buffer.hpp | 2 +- ...tion_implicit_gemm_v1r3_nchw_cyxk_nkhw.hpp | 2 +- ..._v1r3_nchw_cyxk_nkhw_lds_double_buffer.hpp | 2 +- ...lution_implicit_gemm_v2_chwn_cyxk_khwn.hpp | 2 +- ...mm_v2_chwn_cyxk_khwn_lds_double_buffer.hpp | 6 +- ...lution_implicit_gemm_v3_nchw_cyxk_nkhw.hpp | 2 +- ...mm_v3_nchw_cyxk_nkhw_lds_double_buffer.hpp | 2 +- ...lution_implicit_gemm_v4_nchw_kcyx_nkhw.hpp | 2 +- ...mm_v4_nchw_kcyx_nkhw_lds_double_buffer.hpp | 25 +-- .../ConstantTensorDescriptor.hpp | 11 +- .../blockwise_2d_tensor_op.hpp | 8 +- .../blockwise_3d_tensor_op.hpp | 2 +- .../blockwise_4d_tensor_op.hpp | 2 +- .../blockwise_batched_gemm.hpp | 38 ++-- .../tensor_operation/blockwise_gemm.hpp | 48 +++-- .../blockwise_generic_tensor_slice_copy.hpp | 99 ++++------- .../blockwise_tensor_slice_copy.hpp | 7 +- .../threadwise_4d_tensor_op.hpp | 8 +- .../tensor_operation/threadwise_gemm.hpp | 27 +-- .../threadwise_generic_tensor_slice_copy.hpp | 63 +++---- .../threadwise_tensor_slice_copy.hpp | 7 +- composable_kernel/include/utility/Array.hpp | 2 +- .../include/utility/Sequence.hpp | 87 +-------- .../include/utility/amd_inline_asm.hpp | 168 ------------------ .../include/utility/config_amd.hpp.in | 3 + .../include/utility/config_nvidia.hpp.in | 3 + .../include/utility/functional.hpp | 6 +- .../include/utility/integral_constant.hpp | 17 +- composable_kernel/include/utility/utility.hpp | 28 +-- .../include/utility/vector_type.hpp | 125 ------------- driver/include/tensor.hpp | 4 +- 33 files changed, 195 insertions(+), 617 deletions(-) diff --git a/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v1r1_chwn_cyxk_khwn.hpp b/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v1r1_chwn_cyxk_khwn.hpp index eca22ce632..5dcf4f415e 100644 --- a/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v1r1_chwn_cyxk_khwn.hpp +++ b/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v1r1_chwn_cyxk_khwn.hpp @@ -264,7 +264,7 @@ struct GridwiseConvolutionImplicitGemm_v1r1_chwn_cyxk_khwn #if 1 blockwise_batch_gemm.Run #else - blockwise_batch_gemm.Run_asm + blockwise_batch_gemm.Run_amd_asm #endif (p_wei_block + wei_c_y_x_k_block_desc.GetOffsetFromMultiIndex(0, y, x, 0), p_in_block + in_c_h_w_n_block_desc.GetOffsetFromMultiIndex(0, y, x, 0), diff --git a/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v1r3_chwn_cyxk_khwn.hpp b/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v1r3_chwn_cyxk_khwn.hpp index c54eb80dd9..1455e1797b 100644 --- a/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v1r3_chwn_cyxk_khwn.hpp +++ b/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v1r3_chwn_cyxk_khwn.hpp @@ -191,7 +191,7 @@ struct GridwiseConvolutionImplicitGemm_v1r3_chwn_cyxk_khwn #if 0 return blockwise_batch_gemm.Run(Xs...); #elif 0 - return blockwise_batch_gemm.Run_asm(Xs...); + return blockwise_batch_gemm.Run_amd_asm(Xs...); #else return blockwise_batch_gemm.Run_asm_v2(Xs...); #endif diff --git a/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v1r3_chwn_cyxk_khwn_lds_double_buffer.hpp 
b/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v1r3_chwn_cyxk_khwn_lds_double_buffer.hpp index f421cfa4c3..c5e1712332 100644 --- a/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v1r3_chwn_cyxk_khwn_lds_double_buffer.hpp +++ b/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v1r3_chwn_cyxk_khwn_lds_double_buffer.hpp @@ -206,7 +206,7 @@ struct GridwiseConvolutionImplicitGemm_v1r3_chwn_cyxk_khwn_lds_double_buffer #if 1 return blockwise_batch_gemm.Run(Xs...); #elif 0 - return blockwise_batch_gemm.Run_asm(Xs...); + return blockwise_batch_gemm.Run_amd_asm(Xs...); #else return blockwise_batch_gemm.Run_asm_v2(Xs...); #endif diff --git a/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v1r3_nchw_cyxk_nkhw.hpp b/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v1r3_nchw_cyxk_nkhw.hpp index 447ce4ce52..3636db9fa6 100644 --- a/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v1r3_nchw_cyxk_nkhw.hpp +++ b/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v1r3_nchw_cyxk_nkhw.hpp @@ -201,7 +201,7 @@ struct GridwiseConvolutionImplicitGemm_v1r3_nchw_cyxk_nkhw #if 1 return blockwise_batch_gemm.Run(Xs...); #elif 0 - return blockwise_batch_gemm.Run_asm(Xs...); + return blockwise_batch_gemm.Run_amd_asm(Xs...); #else return blockwise_batch_gemm.Run_asm_v2(Xs...); #endif diff --git a/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v1r3_nchw_cyxk_nkhw_lds_double_buffer.hpp b/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v1r3_nchw_cyxk_nkhw_lds_double_buffer.hpp index 3c71a5afc4..5a7267d75c 100644 --- a/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v1r3_nchw_cyxk_nkhw_lds_double_buffer.hpp +++ b/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v1r3_nchw_cyxk_nkhw_lds_double_buffer.hpp @@ -203,7 +203,7 @@ struct GridwiseConvolutionImplicitGemm_v1r3_nchw_cyxk_nkhw_lds_double_buffer #if 1 return blockwise_batch_gemm.Run(Xs...); #elif 0 - return blockwise_batch_gemm.Run_asm(Xs...); + return blockwise_batch_gemm.Run_amd_asm(Xs...); #else return blockwise_batch_gemm.Run_asm_v2(Xs...); #endif diff --git a/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v2_chwn_cyxk_khwn.hpp b/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v2_chwn_cyxk_khwn.hpp index a1525e74ee..b39bb66a2c 100644 --- a/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v2_chwn_cyxk_khwn.hpp +++ b/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v2_chwn_cyxk_khwn.hpp @@ -238,7 +238,7 @@ struct GridwiseConvolutionImplicitGemm_v2_chwn_cyxk_khwn #elif 0 blockwise_gemm.Run_RegisterDoubleBuffer #elif 1 - blockwise_gemm.Run_asm + blockwise_gemm.Run_amd_asm #endif (p_wei_block + wei_cyxk_block_desc.GetOffsetFromMultiIndex(0, y, x, 0), p_in_block + y * Wi + x, diff --git a/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v2_chwn_cyxk_khwn_lds_double_buffer.hpp b/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v2_chwn_cyxk_khwn_lds_double_buffer.hpp index 31832b2ef9..52abeab530 100644 --- a/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v2_chwn_cyxk_khwn_lds_double_buffer.hpp +++ 
b/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v2_chwn_cyxk_khwn_lds_double_buffer.hpp @@ -271,7 +271,7 @@ struct GridwiseConvolutionImplicitGemm_v2_chwn_cyxk_khwn_lds_double_buffer #elif 0 blockwise_gemm.Run_RegisterDoubleBuffer #elif 0 - blockwise_gemm.Run_asm + blockwise_gemm.Run_amd_asm #endif (p_wei_block_now + wei_cyxk_block_desc.GetOffsetFromMultiIndex(0, y, x, 0), @@ -313,7 +313,7 @@ struct GridwiseConvolutionImplicitGemm_v2_chwn_cyxk_khwn_lds_double_buffer #elif 0 blockwise_gemm.Run_RegisterDoubleBuffer #elif 0 - blockwise_gemm.Run_asm + blockwise_gemm.Run_amd_asm #endif (p_wei_block_double + wei_cyxk_block_desc.GetOffsetFromMultiIndex(0, y, x, 0), @@ -339,7 +339,7 @@ struct GridwiseConvolutionImplicitGemm_v2_chwn_cyxk_khwn_lds_double_buffer #elif 0 blockwise_gemm.Run_RegisterDoubleBuffer #elif 0 - blockwise_gemm.Run_asm + blockwise_gemm.Run_amd_asm #endif (p_wei_block_double + wei_block_space + wei_cyxk_block_desc.GetOffsetFromMultiIndex(0, y, x, 0), diff --git a/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v3_nchw_cyxk_nkhw.hpp b/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v3_nchw_cyxk_nkhw.hpp index d1b77f06e7..70737ebc6c 100644 --- a/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v3_nchw_cyxk_nkhw.hpp +++ b/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v3_nchw_cyxk_nkhw.hpp @@ -217,7 +217,7 @@ struct GridwiseConvolutionImplicitGemm_v3_nchw_cyxk_nkhw #if 1 return blockwise_gemm.Run(Xs...); #else - return blockwise_gemm.Run_asm(Xs...); + return blockwise_gemm.Run_amd_asm(Xs...); #endif }; diff --git a/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v3_nchw_cyxk_nkhw_lds_double_buffer.hpp b/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v3_nchw_cyxk_nkhw_lds_double_buffer.hpp index 0d5b520c53..ca8412355e 100644 --- a/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v3_nchw_cyxk_nkhw_lds_double_buffer.hpp +++ b/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v3_nchw_cyxk_nkhw_lds_double_buffer.hpp @@ -214,7 +214,7 @@ struct GridwiseConvolutionImplicitGemm_v3_nchw_cyxk_nkhw_lds_double_buffer #if 1 return blockwise_gemm.Run(Xs...); #else - return blockwise_gemm.Run_asm(Xs...); + return blockwise_gemm.Run_amd_asm(Xs...); #endif }; diff --git a/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4_nchw_kcyx_nkhw.hpp b/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4_nchw_kcyx_nkhw.hpp index f6535b3d7b..461db757bd 100644 --- a/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4_nchw_kcyx_nkhw.hpp +++ b/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4_nchw_kcyx_nkhw.hpp @@ -247,7 +247,7 @@ struct GridwiseConvolutionImplicitGemm_v4_nchw_kcyx_nkhw #if 1 return blockwise_gemm.Run(Xs...); #else - return blockwise_gemm.Run_asm(Xs...); + return blockwise_gemm.Run_amd_asm(Xs...); #endif }; diff --git a/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4_nchw_kcyx_nkhw_lds_double_buffer.hpp b/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4_nchw_kcyx_nkhw_lds_double_buffer.hpp index 433ba2d855..2b41e2640e 100644 --- a/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4_nchw_kcyx_nkhw_lds_double_buffer.hpp +++ 
b/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4_nchw_kcyx_nkhw_lds_double_buffer.hpp @@ -9,10 +9,6 @@ #include "blockwise_gemm.hpp" #include "threadwise_generic_tensor_slice_copy.hpp" -#ifndef CK_BLOCKWISE_GEMM_USE_AMD_INLINE_ASM -#define CK_BLOCKWISE_GEMM_USE_AMD_INLINE_ASM 1 -#endif - namespace ck { // define B = merge(N0, Ho, Wo) @@ -239,15 +235,6 @@ struct GridwiseConvolutionImplicitGemm_v4_nchw_kcyx_nkhw_lds_double_buffer GemmDataPerReadA, GemmDataPerReadB>{}; - // choose GEMM implementation here - const auto run_blockwise_gemm = [&](auto... Xs) { -#if CK_USE_AMD_INLINE_ASM && CK_BLOCKWISE_GEMM_USE_AMD_INLINE_ASM - return blockwise_gemm.Run_asm(Xs...); -#else - return blockwise_gemm.Run(Xs...); -#endif - }; - // LDS allocation for input and weight: be careful of alignment constexpr index_t max_align = math::lcm(InBlockCopyDstDataPerWrite_N2, WeiBlockCopyDstDataPerWrite_K, @@ -255,9 +242,11 @@ struct GridwiseConvolutionImplicitGemm_v4_nchw_kcyx_nkhw_lds_double_buffer GemmDataPerReadB); constexpr index_t in_block_space = - in_e_n1_b_n2_block_desc.GetElementSpace(Number{}); + math::integer_divide_ceil(in_e_n1_b_n2_block_desc.GetElementSpace(), max_align) * + max_align; - constexpr index_t wei_block_space = wei_e_k_block_desc.GetElementSpace(Number{}); + constexpr index_t wei_block_space = + math::integer_divide_ceil(wei_e_k_block_desc.GetElementSpace(), max_align) * max_align; __shared__ Float p_in_block_double[2 * in_block_space]; __shared__ Float p_wei_block_double[2 * wei_block_space]; @@ -309,7 +298,7 @@ struct GridwiseConvolutionImplicitGemm_v4_nchw_kcyx_nkhw_lds_double_buffer p_wei_register_clipboard); // LDS double buffer: GEMM on current data - run_blockwise_gemm(p_wei_block_now, p_in_block_now, p_out_thread); + blockwise_gemm.Run(p_wei_block_now, p_in_block_now, p_out_thread); // LDS double buffer: store next data to LDS blockwise_in_copy.RunStoreRegisterClipboard(p_in_register_clipboard, @@ -336,7 +325,7 @@ struct GridwiseConvolutionImplicitGemm_v4_nchw_kcyx_nkhw_lds_double_buffer p_wei_register_clipboard); // LDS double buffer: GEMM on current data - run_blockwise_gemm(p_wei_block_double, p_in_block_double, p_out_thread); + blockwise_gemm.Run(p_wei_block_double, p_in_block_double, p_out_thread); // LDS double buffer: store next data to LDS blockwise_in_copy.RunStoreRegisterClipboard(p_in_register_clipboard, @@ -348,7 +337,7 @@ struct GridwiseConvolutionImplicitGemm_v4_nchw_kcyx_nkhw_lds_double_buffer __syncthreads(); // LDS double buffer: GEMM on current data - run_blockwise_gemm(p_wei_block_double + wei_block_space, + blockwise_gemm.Run(p_wei_block_double + wei_block_space, p_in_block_double + in_block_space, p_out_thread); } diff --git a/composable_kernel/include/tensor_description/ConstantTensorDescriptor.hpp b/composable_kernel/include/tensor_description/ConstantTensorDescriptor.hpp index f93a5a60cd..a8295d6624 100644 --- a/composable_kernel/include/tensor_description/ConstantTensorDescriptor.hpp +++ b/composable_kernel/include/tensor_description/ConstantTensorDescriptor.hpp @@ -107,15 +107,12 @@ struct ConstantTensorDescriptor return accumulate_on_sequence(Lengths{}, math::multiplies{}, Number<1>{}); } - template > - __host__ __device__ static constexpr index_t GetElementSpace(Align align = Align{}) + __host__ __device__ static constexpr index_t GetElementSpace() { - // This is WRONG! 
align should be applied to the last memory rank, not the last tensor - // dimension constexpr index_t element_space_unaligned = accumulate_on_sequence( (GetLengths() - Number<1>{}) * GetStrides(), math::plus{}, Number<1>{}); - return align.Get() * ((element_space_unaligned + align.Get() - 1) / align.Get()); + return element_space_unaligned; } // emulate constexpr lambda @@ -234,7 +231,7 @@ struct ConstantTensorDescriptor // do carry check in reversed order, starting from lowest dimension // don't check the highest dimension static_for<0, nDim, 1>{}([&](auto IDimReverse) { - constexpr index_t idim = nDim - 1 - IDimReverse.Get(); + constexpr index_t idim = nDim - 1 - IDimReverse; constexpr auto IDim = Number<idim>{}; if(carry) @@ -260,7 +257,7 @@ struct ConstantTensorDescriptor // do borrow check in reversed order, starting from lowest dimension // don't check the highest dimension static_for<0, nDim, 1>{}([&](auto IDimReverse) { - constexpr index_t idim = nDim - 1 - IDimReverse.Get(); + constexpr index_t idim = nDim - 1 - IDimReverse; constexpr auto IDim = Number<idim>{}; if(borrow) diff --git a/composable_kernel/include/tensor_operation/blockwise_2d_tensor_op.hpp b/composable_kernel/include/tensor_operation/blockwise_2d_tensor_op.hpp index 9354da7861..527c84b67a 100644 --- a/composable_kernel/include/tensor_operation/blockwise_2d_tensor_op.hpp +++ b/composable_kernel/include/tensor_operation/blockwise_2d_tensor_op.hpp @@ -286,7 +286,7 @@ struct Blockwise2dTensorCopy2 __device__ void Run(const Float* __restrict__ p_src, Float* __restrict__ p_dst) const { - static_assert(is_same<Float, float>::value, "wrong! only support float!\n"); + static_assert(is_same<Float, float>{}, "wrong! only support float!\n"); using Float4 = float4; using Float2 = float2; @@ -565,7 +565,7 @@ struct Blockwise2dTensorCopy3 __device__ constexpr index_t GetRegisterClipboardSize() const { - static_assert(is_same<Float, float>::value, "wrong! only support float!\n"); + static_assert(is_same<Float, float>{}, "wrong! only support float!\n"); constexpr auto I0 = Number<0>{}; constexpr auto I1 = Number<1>{}; @@ -714,7 +714,7 @@ struct Blockwise2dTensorCopy3 *(reinterpret_cast(&p_src[mSrcMyThreadOffset + iloop * src_loop_stride])); #else - static_assert(is_same<Float, float>::value && DataPerRead == 4, + static_assert(is_same<Float, float>{} && DataPerRead == 4, "global_load is only for float4"); global_load(reinterpret_cast(p_clipboard[iloop * DataPerRead]), @@ -773,7 +773,7 @@ *(reinterpret_cast(&p_dst[mDstMyThreadOffset + iloop * dst_loop_stride]) = *(reinterpret_cast(&p_clipboard[iloop * DataPerRead]); #else - static_assert(is_same<Float, float>::value && DataPerRead == 4, + static_assert(is_same<Float, float>{} && DataPerRead == 4, "ds_write_b128 is only for float4"); ds_write_b128(reinterpret_cast(p_clipboard[iloop * DataPerRead]), diff --git a/composable_kernel/include/tensor_operation/blockwise_3d_tensor_op.hpp b/composable_kernel/include/tensor_operation/blockwise_3d_tensor_op.hpp index fb2eafa160..e328caf495 100644 --- a/composable_kernel/include/tensor_operation/blockwise_3d_tensor_op.hpp +++ b/composable_kernel/include/tensor_operation/blockwise_3d_tensor_op.hpp @@ -239,7 +239,7 @@ struct Blockwise3dTensorCopy3 __device__ static constexpr index_t GetRegisterClipboardSize() { - static_assert(is_same<Float, float>::value, "wrong! only support float!\n"); + static_assert(is_same<Float, float>{}, "wrong!
only support float!\n"); constexpr auto I0 = Number<0>{}; constexpr auto I1 = Number<1>{}; diff --git a/composable_kernel/include/tensor_operation/blockwise_4d_tensor_op.hpp b/composable_kernel/include/tensor_operation/blockwise_4d_tensor_op.hpp index 6f65ce077d..95fcd28023 100644 --- a/composable_kernel/include/tensor_operation/blockwise_4d_tensor_op.hpp +++ b/composable_kernel/include/tensor_operation/blockwise_4d_tensor_op.hpp @@ -598,7 +598,7 @@ struct Blockwise4dTensorCopy3 __device__ constexpr index_t GetRegisterClipboardSize() const { - static_assert(is_same::value, "wrong! only support float!\n"); + static_assert(is_same{}, "wrong! only support float!\n"); constexpr auto I0 = Number<0>{}; constexpr auto I1 = Number<1>{}; diff --git a/composable_kernel/include/tensor_operation/blockwise_batched_gemm.hpp b/composable_kernel/include/tensor_operation/blockwise_batched_gemm.hpp index 0b1c9e2a53..36d3060b23 100644 --- a/composable_kernel/include/tensor_operation/blockwise_batched_gemm.hpp +++ b/composable_kernel/include/tensor_operation/blockwise_batched_gemm.hpp @@ -295,9 +295,9 @@ struct BlockwiseBatchGemmBlockABlockBThreadCTransANormalBNormalC_V2 #if CK_USE_AMD_INLINE_ASM template - __device__ void Run_asm(const FloatA* __restrict__ p_a_block, - const FloatB* __restrict__ p_b_block, - FloatC* __restrict__ p_c_thread) const + __device__ void Run_amd_asm(const FloatA* __restrict__ p_a_block, + const FloatB* __restrict__ p_b_block, + FloatC* __restrict__ p_c_thread) const { constexpr auto a_block_mtx = BlockMatrixA{}; constexpr auto b_block_mtx = BlockMatrixB{}; @@ -330,19 +330,19 @@ struct BlockwiseBatchGemmBlockABlockBThreadCTransANormalBNormalC_V2 constexpr index_t NPerLevel1Cluster = NPerThreadSubC * NLevel0Cluster * NLevel1Cluster; // assertion for inline asm - static_assert(is_same::value && is_same::value && - is_same::value, - "Run_asm only deal with float\n"); + static_assert(is_same{} && is_same{} && + is_same{}, + "Run_amd_asm only deal with float\n"); static_assert(MPerThreadSubC == 4 && NPerThreadSubC == 4 && KPerThreadLoop == 1 && MPerThread == 8 && NPerThread == 8, - "Run_asm cannot deal with this GEMM shape yet\n"); + "Run_amd_asm cannot deal with this GEMM shape yet\n"); - static_assert(DataPerReadA == 4 && DataPerReadB == 4, "Run_asm only do float4 read\n"); + static_assert(DataPerReadA == 4 && DataPerReadB == 4, "Run_amd_asm only do float4 read\n"); - static_assert( - BlockMatrixStrideA == 0 && BatchPerThread == 1, - "Run_asm can only deal with BlockMatrixStrideA == 0 && BatchPerThread == 1 for now\n"); + static_assert(BlockMatrixStrideA == 0 && BatchPerThread == 1, + "Run_amd_asm can only deal with BlockMatrixStrideA == 0 && BatchPerThread == " + "1 for now\n"); using Float4 = vector_type::MemoryType; @@ -421,19 +421,19 @@ struct BlockwiseBatchGemmBlockABlockBThreadCTransANormalBNormalC_V2 constexpr index_t NPerLevel1Cluster = NPerThreadSubC * NLevel0Cluster * NLevel1Cluster; // assertion for inline asm - static_assert(is_same::value && is_same::value && - is_same::value, - "Run_asm only deal with float\n"); + static_assert(is_same{} && is_same{} && + is_same{}, + "Run_amd_asm only deal with float\n"); static_assert(MPerThreadSubC == 4 && NPerThreadSubC == 4 && KPerThreadLoop == 1 && MPerThread == 8 && NPerThread == 8, - "Run_asm cannot deal with this GEMM shape yet\n"); + "Run_amd_asm cannot deal with this GEMM shape yet\n"); - static_assert(DataPerReadA == 4 && DataPerReadB == 4, "Run_asm only do float4 read\n"); + static_assert(DataPerReadA == 4 && DataPerReadB == 
4, "Run_amd_asm only do float4 read\n"); - static_assert( - BlockMatrixStrideA == 0 && BatchPerThread == 1, - "Run_asm can only deal with BlockMatrixStrideA == 0 && BatchPerThread == 1 for now\n"); + static_assert(BlockMatrixStrideA == 0 && BatchPerThread == 1, + "Run_amd_asm can only deal with BlockMatrixStrideA == 0 && BatchPerThread == " + "1 for now\n"); using Float4 = vector_type::MemoryType; diff --git a/composable_kernel/include/tensor_operation/blockwise_gemm.hpp b/composable_kernel/include/tensor_operation/blockwise_gemm.hpp index f257137f3c..b1e6171f4e 100644 --- a/composable_kernel/include/tensor_operation/blockwise_gemm.hpp +++ b/composable_kernel/include/tensor_operation/blockwise_gemm.hpp @@ -5,6 +5,10 @@ #include "ConstantMatrixDescriptor.hpp" #include "threadwise_gemm.hpp" +#ifndef CK_BLOCKWISE_GEMM_USE_AMD_INLINE_ASM +#define CK_BLOCKWISE_GEMM_USE_AMD_INLINE_ASM 1 +#endif + namespace ck { // if following number are power of 2, index calculation shall be greatly reduced: @@ -51,7 +55,8 @@ struct BlockwiseGemmBlockABlockBThreadCTransANormalBNormalC_v2 N % (NPerThreadSubC * NLevel0Cluster * NLevel1Cluster) == 0, "wrong! Cannot evenly divide work among\n"); - static_assert(is_same_type(ThreadMatrixC::GetLengths(), GetThreadMatrixCLengths()), + static_assert(std::is_same{}, "wrong! ThreadMatrixC lengths is wrong"); auto c_thread_mtx_index = GetBeginOfThreadMatrixC(get_thread_local_1d_id()); @@ -115,11 +120,10 @@ struct BlockwiseGemmBlockABlockBThreadCTransANormalBNormalC_v2 } #if CK_USE_AMD_INLINE_ASM - // TODO: this is not working correctly template - __device__ void Run_asm(const FloatA* __restrict__ p_a_block, - const FloatB* __restrict__ p_b_block, - FloatC* __restrict__ p_c_thread) const + __device__ void Run_amd_asm(const FloatA* __restrict__ p_a_block, + const FloatB* __restrict__ p_b_block, + FloatC* __restrict__ p_c_thread) const { constexpr auto True = integral_constant{}; constexpr auto False = integral_constant{}; @@ -156,15 +160,15 @@ struct BlockwiseGemmBlockABlockBThreadCTransANormalBNormalC_v2 constexpr index_t NPerLevel1Cluster = NPerThreadSubC * NLevel0Cluster * NLevel1Cluster; // assertion for inline asm - static_assert(is_same::value && is_same::value && - is_same::value, - "Run_asm only deal with float\n"); + static_assert(is_same{} && is_same{} && + is_same{}, + "Run_amd_asm only deal with float"); static_assert(MPerThreadSubC == 4 && NPerThreadSubC == 4 && KPerThreadLoop == 1 && MPerThread == 8 && NPerThread == 8, - "Run_asm cannot deal with this GEMM shape yet\n"); + "Run_amd_asm cannot deal with this GEMM shape yet"); - static_assert(DataPerReadA == 4 && DataPerReadB == 4, "Run_asm only do float4 read\n"); + static_assert(DataPerReadA == 4 && DataPerReadB == 4, "Run_amd_asm only do float4 read"); using Float4 = vector_type::MemoryType; @@ -200,9 +204,9 @@ struct BlockwiseGemmBlockABlockBThreadCTransANormalBNormalC_v2 #endif template - __device__ void Run(const FloatA* const __restrict__ p_a_block, - const FloatB* const __restrict__ p_b_block, - FloatC* const __restrict__ p_c_thread) const + __device__ void Run_source(const FloatA* const __restrict__ p_a_block, + const FloatB* const __restrict__ p_b_block, + FloatC* const __restrict__ p_c_thread) const { constexpr auto True = integral_constant{}; constexpr auto False = integral_constant{}; @@ -291,9 +295,9 @@ struct BlockwiseGemmBlockABlockBThreadCTransANormalBNormalC_v2 } template - __device__ void Run_RegisterDoubleBuffer(FloatA* const p_a_block, - FloatB* const p_b_block, - FloatC* p_c_thread) const + 
__device__ void RunRegisterDoubleBuffer_source(FloatA* const p_a_block, + FloatB* const p_b_block, + FloatC* p_c_thread) const { constexpr auto True = integral_constant<bool, true>{}; constexpr auto False = integral_constant<bool, false>{}; @@ -427,6 +431,18 @@ struct BlockwiseGemmBlockABlockBThreadCTransANormalBNormalC_v2 p_c_thread); } } + template <class FloatA, class FloatB, class FloatC> + __device__ void Run(const FloatA* __restrict__ p_a_block, + const FloatB* __restrict__ p_b_block, + FloatC* __restrict__ p_c_thread) const + + { +#if CK_USE_AMD_INLINE_ASM && CK_BLOCKWISE_GEMM_USE_AMD_INLINE_ASM + Run_amd_asm(p_a_block, p_b_block, p_c_thread); +#else + Run_source(p_a_block, p_b_block, p_c_thread); +#endif + } }; } // namespace ck diff --git a/composable_kernel/include/tensor_operation/blockwise_generic_tensor_slice_copy.hpp b/composable_kernel/include/tensor_operation/blockwise_generic_tensor_slice_copy.hpp index 1496630543..b59d4120ca 100644 --- a/composable_kernel/include/tensor_operation/blockwise_generic_tensor_slice_copy.hpp +++ b/composable_kernel/include/tensor_operation/blockwise_generic_tensor_slice_copy.hpp @@ -6,6 +6,10 @@ #include "ConstantMergedTensorDescriptor.hpp" #include "threadwise_generic_tensor_slice_copy.hpp" +#ifndef CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_BLOCKWISE_GENERIC_SLICE_COPY_V1 +#define CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_BLOCKWISE_GENERIC_SLICE_COPY_V1 1 +#endif + namespace ck { // slice a (normal or merged) tensor, and copy it into another (normal or merged) tensor @@ -91,7 +95,7 @@ struct BlockwiseGenericTensorSliceCopy_v1 constexpr auto repeat_lengths = SliceLengths{} / data_per_cluster_per_dims; - // for now, only support SubLengths.Get() == 1 on a merged dimension that constains + // for now, only support SubLengths == 1 on a merged dimension that contains // multiple original dimensions static_for<0, nDim, 1>{}([&](auto IDim_) { constexpr auto IDim = decltype(IDim_){}; @@ -121,7 +125,7 @@ // partial offset on each dimension static_for<0, nDim, 1>{}([&](auto IDim_) { constexpr auto IDim = decltype(IDim_){}; - constexpr index_t idim = IDim.Get(); + constexpr index_t idim = IDim; constexpr auto src_partial_original_dims = SrcDesc::GetContainedOriginalDimensions(IDim); @@ -135,7 +139,7 @@ static_for<0, nDim, 1>{}([&](auto IDim_) { constexpr auto IDim = decltype(IDim_){}; - constexpr index_t idim = IDim.Get(); + constexpr index_t idim = IDim; constexpr auto dst_partial_original_dims = DstDesc::GetContainedOriginalDimensions(IDim); @@ -153,38 +157,6 @@ struct BlockwiseGenericTensorSliceCopy_v1 mThreadDstOffset = accumulate_on_array( mThreadDstPartialOffsets, math::plus<index_t>{}, static_cast<index_t>(0)); - -#if 0 - if(get_block_1d_id() == 0) - { - printf("id %5u %5u: " - "src_block_data_multi_id_begin: %u %u %u %u, " - "thread_cluster_multi_id: %u %u %u %u, " - "data_cluster_multi_id: %u %u %u %u, " - "thread_data_multi_id_begin: %u %u %u %u, " - "mThreadSrcOffset %u, mThreadDstOffset %u \n", - get_block_1d_id(), - get_thread_local_1d_id(), - src_block_data_multi_id_begin[0], - src_block_data_multi_id_begin[1], - src_block_data_multi_id_begin[2], - src_block_data_multi_id_begin[3], - thread_cluster_multi_id[0], - thread_cluster_multi_id[1], - thread_cluster_multi_id[2], - thread_cluster_multi_id[3], - data_cluster_multi_id[0], - data_cluster_multi_id[1], - data_cluster_multi_id[2], - data_cluster_multi_id[3], - thread_data_multi_id_begin[0], - thread_data_multi_id_begin[1], - thread_data_multi_id_begin[2], - thread_data_multi_id_begin[3],
mThreadSrcOffset, - mThreadDstOffset); - } -#endif } __device__ static constexpr index_t GetRegisterClipboardSize() @@ -210,19 +182,7 @@ struct BlockwiseGenericTensorSliceCopy_v1 make_ConstantTensorDescriptor_packed(thread_sub_tensor_lengths * repeat_lengths); static_ford{}([&](auto repeat_multi_id_) { -#if 0 - constexpr auto repeat_multi_id = sequence2array(decltype(repeat_multi_id_){}); - - const auto src_thread_data_multi_id_begin = repeat_multi_id * data_per_cluster_per_dims; - - const auto clipboard_data_multi_id_begin = repeat_multi_id * thread_sub_tensor_lengths; - - const index_t src_offset = - SrcDesc{}.GetOffsetFromMultiIndex(src_thread_data_multi_id_begin); - - const index_t clipboard_offset = - thread_tensor_desc.GetOffsetFromMultiIndex(clipboard_data_multi_id_begin); -#else // HIP compiler performs better with these codes +#if CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_BLOCKWISE_GENERIC_SLICE_COPY_V1 constexpr auto repeat_multi_id = decltype(repeat_multi_id_){}; constexpr auto src_thread_data_multi_id_begin = @@ -236,6 +196,18 @@ struct BlockwiseGenericTensorSliceCopy_v1 constexpr index_t clipboard_offset = thread_tensor_desc.GetOffsetFromMultiIndex(clipboard_data_multi_id_begin); +#else + constexpr auto repeat_multi_id = sequence2array(decltype(repeat_multi_id_){}); + + const auto src_thread_data_multi_id_begin = repeat_multi_id * data_per_cluster_per_dims; + + const auto clipboard_data_multi_id_begin = repeat_multi_id * thread_sub_tensor_lengths; + + const index_t src_offset = + SrcDesc{}.GetOffsetFromMultiIndex(src_thread_data_multi_id_begin); + + const index_t clipboard_offset = + thread_tensor_desc.GetOffsetFromMultiIndex(clipboard_data_multi_id_begin); #endif threadwise_generic_tensor_slice_copy_v1(SrcDesc{}, @@ -263,18 +235,7 @@ struct BlockwiseGenericTensorSliceCopy_v1 make_ConstantTensorDescriptor_packed(thread_sub_tensor_lengths * repeat_lengths); static_ford{}([&](auto repeat_multi_id_) { -#if 0 - constexpr auto repeat_multi_id = sequence2array(decltype(repeat_multi_id_){}); - - const auto clipboard_data_multi_id_begin = repeat_multi_id * thread_sub_tensor_lengths; - - const auto dst_data_multi_id_begin = repeat_multi_id * data_per_cluster_per_dims; - - const index_t clipboard_offset = - thread_tensor_desc.GetOffsetFromMultiIndex(clipboard_data_multi_id_begin); - - const index_t dst_offset = DstDesc{}.GetOffsetFromMultiIndex(dst_data_multi_id_begin); -#else // HIP compiler performs better with these codes +#if CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_BLOCKWISE_GENERIC_SLICE_COPY_V1 constexpr auto repeat_multi_id = decltype(repeat_multi_id_){}; constexpr auto clipboard_data_multi_id_begin = @@ -287,6 +248,17 @@ struct BlockwiseGenericTensorSliceCopy_v1 constexpr index_t dst_offset = DstDesc{}.GetOffsetFromMultiIndex(dst_data_multi_id_begin); +#else + constexpr auto repeat_multi_id = sequence2array(decltype(repeat_multi_id_){}); + + const auto clipboard_data_multi_id_begin = repeat_multi_id * thread_sub_tensor_lengths; + + const auto dst_data_multi_id_begin = repeat_multi_id * data_per_cluster_per_dims; + + const index_t clipboard_offset = + thread_tensor_desc.GetOffsetFromMultiIndex(clipboard_data_multi_id_begin); + + const index_t dst_offset = DstDesc{}.GetOffsetFromMultiIndex(dst_data_multi_id_begin); #endif threadwise_generic_tensor_slice_copy_v1(thread_tensor_desc, @@ -310,7 +282,7 @@ struct BlockwiseGenericTensorSliceCopy_v1 } // When moving the slicing windows along a merged dimension, if the strides of the - // contained (by the merged dimension) original 
dimensions are in descending order, + // contained (by the merged dimension) original dimensions are not in descending order, // then there is no guarantee that the new offset will be larger than the old offset // for movement in positive direction (vice versue for movement in negative direction). // As a result, there is the possiblity that the offset calculation may result in @@ -323,7 +295,7 @@ struct BlockwiseGenericTensorSliceCopy_v1 Number, Number, integral_constant direction) { constexpr auto IDim = Number{}; - constexpr index_t idim = IDim.Get(); + constexpr index_t idim = IDim; static_if{}([&](auto fwd) { // logic for a merged dimension, also works for non-merged dimension, but its logic may @@ -350,8 +322,7 @@ struct BlockwiseGenericTensorSliceCopy_v1 constexpr auto I = decltype(I_){}; constexpr index_t idim_original = src_partial_original_dims.Get(I); - mThreadSrcOriginalMultiId(idim_original) = - new_src_partial_original_multi_id[I.Get()]; + mThreadSrcOriginalMultiId(idim_original) = new_src_partial_original_multi_id[I]; }); // calculate new partial offset on this merged dimension diff --git a/composable_kernel/include/tensor_operation/blockwise_tensor_slice_copy.hpp b/composable_kernel/include/tensor_operation/blockwise_tensor_slice_copy.hpp index 2de6cf1d45..55e7fb8f69 100644 --- a/composable_kernel/include/tensor_operation/blockwise_tensor_slice_copy.hpp +++ b/composable_kernel/include/tensor_operation/blockwise_tensor_slice_copy.hpp @@ -49,7 +49,7 @@ struct BlockwiseTensorSliceReorderCopy_v3 make_ConstantTensorDescriptor_packed(thread_cluster_lengths); // sanity check: data type - static_assert(is_same::value, "wrong! only support float for now!\n"); + static_assert(is_same{}, "wrong! only support float for now!\n"); // sanity check: nDim static_assert(SrcDesc::GetNumOfDimension() == nDim && @@ -121,12 +121,11 @@ struct BlockwiseTensorSliceReorderCopy_v3 reorder_array_given_old2new(thread_multi_id, map_thread_cluster_2_src_cluster); static_for<0, nDim, 1>{}([&](auto IDim) { - constexpr auto I = decltype(IDim){}; - constexpr index_t i = I.Get(); + constexpr index_t idim = IDim; // compiler: will it really compute index here, or be merged with // GetOffsetFromMultiIndex and // optimized away??? - src_data_multi_id(i) *= src_sub_lengths.Get(I); + src_data_multi_id(idim) *= src_sub_lengths.Get(IDim); }); // compiler: will it really compute index here, or be merged with GetOffsetFromMultiIndex diff --git a/composable_kernel/include/tensor_operation/threadwise_4d_tensor_op.hpp b/composable_kernel/include/tensor_operation/threadwise_4d_tensor_op.hpp index a8d0398de1..7bf340f162 100644 --- a/composable_kernel/include/tensor_operation/threadwise_4d_tensor_op.hpp +++ b/composable_kernel/include/tensor_operation/threadwise_4d_tensor_op.hpp @@ -26,16 +26,16 @@ __device__ void threadwise_4d_tensor_shift_down(Desc, Float* __restrict__ p, IDi constexpr index_t nshift = NShift::mValue; constexpr index_t did0_end = - is_same::value ? desc.GetLength(I0) - nshift : desc.GetLength(I0); + is_same{} ? desc.GetLength(I0) - nshift : desc.GetLength(I0); constexpr index_t did1_end = - is_same::value ? desc.GetLength(I1) - nshift : desc.GetLength(I1); + is_same{} ? desc.GetLength(I1) - nshift : desc.GetLength(I1); constexpr index_t did2_end = - is_same::value ? desc.GetLength(I2) - nshift : desc.GetLength(I2); + is_same{} ? desc.GetLength(I2) - nshift : desc.GetLength(I2); constexpr index_t did3_end = - is_same::value ? desc.GetLength(I3) - nshift : desc.GetLength(I3); + is_same{} ? 
desc.GetLength(I3) - nshift : desc.GetLength(I3); for(index_t did0 = 0; did0 < did0_end; ++did0) { diff --git a/composable_kernel/include/tensor_operation/threadwise_gemm.hpp b/composable_kernel/include/tensor_operation/threadwise_gemm.hpp index ea77027c10..d77ad2bbee 100644 --- a/composable_kernel/include/tensor_operation/threadwise_gemm.hpp +++ b/composable_kernel/include/tensor_operation/threadwise_gemm.hpp @@ -71,24 +71,7 @@ __device__ void threadwise_gemm(MatrixA, integral_constant, FloatC* __restrict__ p_c_thread) { -#if 0 - if(get_thread_local_1d_id() == 0 && get_block_1d_id() == 0) - { - printf("p_a_thread: %f %f %f %f\n", - p_a_thread[0], - p_a_thread[1], - p_a_thread[2], - p_a_thread[3]); - printf("p_b_thread: %f %f %f %f\n", - p_b_thread[0], - p_b_thread[1], - p_b_thread[2], - p_b_thread[3]); - } -#endif - - if(TransA && (!TransB) && (!TransC)) - { + static_if{}([&](auto fwd) { constexpr auto a_mtx = MatrixA{}; constexpr auto b_mtx = MatrixB{}; constexpr auto c_mtx = MatrixC{}; @@ -111,12 +94,10 @@ __device__ void threadwise_gemm(MatrixA, } } } - } - else - { + }).Else([&](auto fwd) { // not implemented - assert(false); - } + static_assert(fwd(false), "wrong! support for this config is not implemented"); + }); } } // namespace ck diff --git a/composable_kernel/include/tensor_operation/threadwise_generic_tensor_slice_copy.hpp b/composable_kernel/include/tensor_operation/threadwise_generic_tensor_slice_copy.hpp index 2dd7b79ab2..16e1e7a153 100644 --- a/composable_kernel/include/tensor_operation/threadwise_generic_tensor_slice_copy.hpp +++ b/composable_kernel/include/tensor_operation/threadwise_generic_tensor_slice_copy.hpp @@ -5,6 +5,10 @@ #include "ConstantTensorDescriptor.hpp" #include "ConstantMergedTensorDescriptor.hpp" +#ifndef CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V1 +#define CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V1 0 +#endif + namespace ck { template ::value, "wrong! map is not valid"); -#if 0 - // doesn't compile, because merged-tensor reordering is not implemented - // TODO: implement tensor desc ops for merged-tensor - constexpr auto src_strides_in_access_order = - SrcDesc::ReorderGivenNew2Old(DimAccessOrder{}).GetStride(Number{}); + // TODO: do more sanity-check here, something like: + // constexpr auto src_strides_in_access_order = + // SrcDesc::ReorderGivenNew2Old(DimAccessOrder{}).GetStride(Number{}); - constexpr auto dst_strides_in_access_order = - SrcDesc::ReorderGivenNew2Old(DimAccessOrder{}).GetStride(Number{}); + // constexpr auto dst_strides_in_access_order = + // SrcDesc::ReorderGivenNew2Old(DimAccessOrder{}).GetStride(Number{}); - // check src/dst stride on the lowest access dimension - static_assert((DataPerAccess == 1 || src_strides_in_access_order.Back() == 1) && - (DataPerAccess == 1 || dst_strides_in_access_order.Back() == 1), - "wrong! src/dst stride on the lowest access dimension needs to be 1 for " - "vectorized read/write"); -#endif + // // check src/dst stride on the lowest access dimension + // static_assert((DataPerAccess == 1 || src_strides_in_access_order.Back() == 1) && + // (DataPerAccess == 1 || dst_strides_in_access_order.Back() == 1), + // "wrong! 
src/dst stride on the lowest access dimension needs to be 1 for " + // "vectorized read/write"); constexpr auto slice_lengths_in_access_order = SliceLengths::ReorderGivenNew2Old(DimAccessOrder{}); @@ -64,24 +65,7 @@ __device__ void threadwise_generic_tensor_slice_copy_v1( using vector_t = typename vector_type::MemoryType; -#if 1 - ford{}([&](auto access_multi_id) { - auto data_multi_id_in_access_order = access_multi_id; - data_multi_id_in_access_order(nDim - 1) = access_multi_id[nDim - 1] * DataPerAccess; - - const auto data_multi_id = - reorder_array_given_old2new(data_multi_id_in_access_order, DimAccessOrder{}); - - const index_t src_index = - SrcDesc::GetOffsetFromMultiIndex(src_multi_id_begin + data_multi_id); - - const index_t dst_index = - DstDesc::GetOffsetFromMultiIndex(dst_multi_id_begin + data_multi_id); - - *reinterpret_cast(&p_dst[dst_index]) = - *reinterpret_cast(&p_src[src_index]); - }); -#else +#if CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V1 static_ford{}([&](auto access_multi_id) { constexpr index_t itmp = access_multi_id.Back() * DataPerAccess; @@ -97,6 +81,23 @@ __device__ void threadwise_generic_tensor_slice_copy_v1( const index_t dst_index = DstDesc::GetOffsetFromMultiIndex(dst_multi_id_begin + data_multi_id); + *reinterpret_cast(&p_dst[dst_index]) = + *reinterpret_cast(&p_src[src_index]); + }); +#else + ford{}([&](auto access_multi_id) { + auto data_multi_id_in_access_order = access_multi_id; + data_multi_id_in_access_order(nDim - 1) = access_multi_id[nDim - 1] * DataPerAccess; + + const auto data_multi_id = + reorder_array_given_old2new(data_multi_id_in_access_order, DimAccessOrder{}); + + const index_t src_index = + SrcDesc::GetOffsetFromMultiIndex(src_multi_id_begin + data_multi_id); + + const index_t dst_index = + DstDesc::GetOffsetFromMultiIndex(dst_multi_id_begin + data_multi_id); + *reinterpret_cast(&p_dst[dst_index]) = *reinterpret_cast(&p_src[src_index]); }); diff --git a/composable_kernel/include/tensor_operation/threadwise_tensor_slice_copy.hpp b/composable_kernel/include/tensor_operation/threadwise_tensor_slice_copy.hpp index 8d0ce26f94..ac0b500b15 100644 --- a/composable_kernel/include/tensor_operation/threadwise_tensor_slice_copy.hpp +++ b/composable_kernel/include/tensor_operation/threadwise_tensor_slice_copy.hpp @@ -56,7 +56,7 @@ __device__ void threadwise_tensor_slice_copy(SrcDesc, static_ford{}([=](auto Ids) { static_for<0, nRead, 1>{}([&](auto IRead) { - constexpr auto multi_id = decltype(Ids){}.PushBack(Number{}); + constexpr auto multi_id = decltype(Ids){}.PushBack(Number{}); const index_t src_index = src_desc.GetOffsetFromMultiIndex(multi_id); @@ -177,8 +177,7 @@ threadwise_tensor_slice_copy_reorder_given_dst2src_v3(SrcDesc, // pack data static_for<0, DstDataPerWrite, 1>{}([&](auto IDstData) { - const auto dst_multi_id = - ids.PushBack(IWrite.Get() * DstDataPerWrite + IDstData.Get()); + const auto dst_multi_id = ids.PushBack(IWrite * DstDataPerWrite + IDstData); const auto src_multi_id = reorder_array_given_old2new(dst_multi_id, MapDst2Src{}); @@ -189,7 +188,7 @@ threadwise_tensor_slice_copy_reorder_given_dst2src_v3(SrcDesc, }); // write data - const auto dst_multi_id = ids.PushBack(IWrite.Get() * DstDataPerWrite); + const auto dst_multi_id = ids.PushBack(IWrite * DstDataPerWrite); const index_t dst_index = dst_desc.GetOffsetFromMultiIndex(dst_multi_id); diff --git a/composable_kernel/include/utility/Array.hpp b/composable_kernel/include/utility/Array.hpp index fcf87c5843..f33fa516e2 100644 --- 
a/composable_kernel/include/utility/Array.hpp +++ b/composable_kernel/include/utility/Array.hpp @@ -98,7 +98,7 @@ __host__ __device__ constexpr auto reorder_array_given_new2old(const Array>::value, "wrong! invalid reorder map"); + static_assert(is_valid_sequence_map>{}, "wrong! invalid reorder map"); return Array{old_array[IRs]...}; } diff --git a/composable_kernel/include/utility/Sequence.hpp b/composable_kernel/include/utility/Sequence.hpp index 09d67c13fa..5c566503a6 100644 --- a/composable_kernel/include/utility/Sequence.hpp +++ b/composable_kernel/include/utility/Sequence.hpp @@ -55,22 +55,6 @@ struct Sequence return Sequence{})...>{}; } -#if 0 // require sequence_sort, which is not implemented yet - template - __host__ __device__ static constexpr auto ReorderGivenOld2New(MapOld2New /*old2new*/) - { - static_assert(sizeof...(Is) == MapOld2New::GetSize(), - "wrong! reorder map should have the same size as Sequence to be rerodered"); - - static_assert(is_valid_sequence_map::value, - "wrong! invalid reorder map"); - - constexpr auto map_new2old = typename sequence_map_inverse::SeqMapType{}; - - return ReorderGivenNew2Old(map_new2old); - } -#endif - __host__ __device__ static constexpr auto Reverse(); __host__ __device__ static constexpr index_t Front() @@ -263,74 +247,15 @@ struct sequence_reverse> using SeqType = Sequence; }; -#if 0 // not fully implemented -template -struct sequence_sort_merge_impl; - -template -struct sequence_sort_merge_impl, - Sequence, - Sequence, - Sequence> -{ -}; - -template -struct sequence_sort; - -template -struct sequence_sort> -{ - using OriginalSeqType = Sequence; - using SortedSeqType = xxxxx; - using MapSorted2OriginalType = xxx; -}; - -template -struct sequence_map_inverse_impl; - -// impl for valid map, no impl for invalid map -template -struct sequence_map_inverse_impl, true> -{ - using SeqMapType = sequence_sort>::MapSorted2OriginalType; -}; - -template -struct sequence_map_inverse; - -template -struct sequence_map_inverse> -{ - // TODO: make sure the map to be inversed is valid: [0, sizeof...(Is)) - static constexpr bool is_valid_sequence_map = - is_same>::SortedSeqType, - typename arithmetic_sequence_gen<0, sizeof...(Is), 1>::SeqType>::value; - - // make compiler fails, if is_valid_map != true - using SeqMapType = - typename sequence_map_inverse_impl, is_valid_map>::SeqMapType; -}; - -#endif - template struct is_valid_sequence_map { - static constexpr bool value = -#if 0 // sequence_sort is not implemented yet - is_same::SeqType, - typename sequence_sort::SortedSeqType>::value; -#else - true; -#endif + static constexpr bool value = true; + + // TODO: add proper check for is_valid, something like: + // static constexpr bool value = + // is_same::SeqType, + // typename sequence_sort::SortedSeqType>{}; }; template diff --git a/composable_kernel/include/utility/amd_inline_asm.hpp b/composable_kernel/include/utility/amd_inline_asm.hpp index e82cd62db8..6400fb708f 100644 --- a/composable_kernel/include/utility/amd_inline_asm.hpp +++ b/composable_kernel/include/utility/amd_inline_asm.hpp @@ -3,91 +3,8 @@ #include "vector_type.hpp" -#define NO_VM_WAIT 0 -#define NO_LGKM_WAIT 0 -#define NO_DS_READ 0 -#define NO_DS_WRITE 0 -#define NO_GLB_READ 0 - namespace ck { -// cast a pointer of LDS to its address -extern "C" __attribute__((address_space(3))) void* __to_local(void* p)[[hc]]; - -__device__ void vmcnt(index_t cnt) -{ -#if !NO_VM_WAIT - if(cnt == 0) - { - asm volatile("\n \ - s_waitcnt vmcnt(0) \n \ - " ::); - } - else if(cnt == 1) - { - asm volatile("\n \ 
- s_waitcnt vmcnt(1) \n \ - " ::); - } - else if(cnt == 2) - { - asm volatile("\n \ - s_waitcnt vmcnt(2) \n \ - " ::); - } - else if(cnt == 4) - { - asm volatile("\n \ - s_waitcnt vmcnt(2) \n \ - " ::); - } - else - { - assert(false); - } -#endif -} - -__device__ void lgkmcnt(index_t cnt) -{ -#if !NO_LGKM_WAIT - if(cnt == 0) - { - asm volatile("\n \ - s_waitcnt lgkmcnt(0) \n \ - " ::); - } - else if(cnt == 1) - { - asm volatile("\n \ - s_waitcnt lgkmcnt(1) \n \ - " ::); - } - else if(cnt == 2) - { - asm volatile("\n \ - s_waitcnt lgkmcnt(2) \n \ - " ::); - } - else if(cnt == 3) - { - asm volatile("\n \ - s_waitcnt lgkmcnt(3) \n \ - " ::); - } - else if(cnt == 4) - { - asm volatile("\n \ - s_waitcnt lgkmcnt(4) \n \ - " ::); - } - else - { - assert(false); - } -#endif -} - __device__ void outerProduct1x4(const float* a, const float* b, float* c) { asm volatile("\n \ @@ -112,21 +29,7 @@ __device__ void outerProduct1x4(const float& a, const vector_type::MemoryType& b, vector_type::MemoryType& c) { -#if 0 - asm volatile( - "\n \ - v_mac_f32 %0, %4, %5 \n \ - v_mac_f32 %1, %4, %6 \n \ - v_mac_f32 %2, %4, %7 \n \ - v_mac_f32 %3, %4, %8 \n \ - " - : - :"v"(c.x),"v"(c.y),"v"(c.z),"v"(c.w), \ - "v"(a.x),"v"(b.x),"v"(b.y),"v"(b.z),"v"(b.w) - ); -#else outerProduct1x4(&a, (float*)&b, (float*)&c); -#endif } __device__ void outerProduct4x4(const vector_type::MemoryType& a, @@ -136,57 +39,10 @@ __device__ void outerProduct4x4(const vector_type::MemoryType& a, vector_type::MemoryType& c2, vector_type::MemoryType& c3) { -#if 0 - asm volatile( - "\n \ - v_mac_f32 %0, %4, %5 \n \ - v_mac_f32 %1, %4, %6 \n \ - v_mac_f32 %2, %4, %7 \n \ - v_mac_f32 %3, %4, %8 \n \ - " - : - :"v"(c0.x),"v"(c0.y),"v"(c0.z),"v"(c0.w), \ - "v"(a.x),"v"(b.x),"v"(b.y),"v"(b.z),"v"(b.w) - ); - asm volatile( - "\n \ - v_mac_f32 %0, %4, %5 \n \ - v_mac_f32 %1, %4, %6 \n \ - v_mac_f32 %2, %4, %7 \n \ - v_mac_f32 %3, %4, %8 \n \ - " - : - :"v"(c1.x),"v"(c1.y),"v"(c1.z),"v"(c1.w), \ - "v"(a.y),"v"(b.x),"v"(b.y),"v"(b.z),"v"(b.w) - ); - asm volatile( - "\n \ - v_mac_f32 %0, %4, %5 \n \ - v_mac_f32 %1, %4, %6 \n \ - v_mac_f32 %2, %4, %7 \n \ - v_mac_f32 %3, %4, %8 \n \ - " - : - :"v"(c2.x),"v"(c2.y),"v"(c2.z),"v"(c2.w), \ - "v"(a.z),"v"(b.x),"v"(b.y),"v"(b.z),"v"(b.w) - ); - asm volatile( - "\n \ - v_mac_f32 %0, %4, %5 \n \ - v_mac_f32 %1, %4, %6 \n \ - v_mac_f32 %2, %4, %7 \n \ - v_mac_f32 %3, %4, %8 \n \ - " - : - :"v"(c3.x),"v"(c3.y),"v"(c3.z),"v"(c3.w), \ - "v"(a.w),"v"(b.x),"v"(b.y),"v"(b.z),"v"(b.w) - ); -#else outerProduct1x4(a.x, b, c0); outerProduct1x4(a.y, b, c1); outerProduct1x4(a.z, b, c2); outerProduct1x4(a.w, b, c3); -#endif } __device__ void outerProduct8x8(const vector_type::MemoryType* a, @@ -201,7 +57,6 @@ __device__ void outerProduct8x8(const vector_type::MemoryType* a, __device__ void ds_read_b128(vector_type::MemoryType& r, void* lds, index_t offset = 0) { -#if !NO_DS_READ if(offset == 0) { asm volatile("\n \ @@ -722,33 +577,11 @@ __device__ void ds_read_b128(vector_type::MemoryType& r, void* lds, in : "=v"(r) : "v"(__to_local(lds))); } -#endif -} - -__device__ void global_load(vector_type::MemoryType& r, - const vector_type::MemoryType* ptr, - index_t offset = 0) -{ -#if !NO_GLB_READ - if(offset == 0) - { - asm volatile("\n \ - global_load_dwordx4 %0, %1, off \n \ - " - : "=v"(r) - : "v"(ptr)); - } - else - { - assert(false); - } -#endif } __device__ void ds_write_b128(const vector_type::MemoryType& r, void* lds, index_t offset = 0) { -#if !NO_DS_WRITE if(offset == 0) { asm volatile("\n \ @@ -761,7 +594,6 @@ 
ds_write_b128(const vector_type::MemoryType& r, void* lds, index_t off { assert(false); } -#endif } } // namespace ck diff --git a/composable_kernel/include/utility/config_amd.hpp.in b/composable_kernel/include/utility/config_amd.hpp.in index 4d840df7fe..babdbc47cf 100644 --- a/composable_kernel/include/utility/config_amd.hpp.in +++ b/composable_kernel/include/utility/config_amd.hpp.in @@ -7,6 +7,9 @@ #include "hip/hip_fp16.h" #define CK_USE_AMD_INLINE_ASM 1 +#define CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_BLOCKWISE_GENERIC_SLICE_COPY_V1 1 +#define CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V1 0 + namespace ck { // For some reason, HIP compiler need this definition to generate optimal load and store diff --git a/composable_kernel/include/utility/config_nvidia.hpp.in b/composable_kernel/include/utility/config_nvidia.hpp.in index 0f9e3441ea..39afc0b154 100644 --- a/composable_kernel/include/utility/config_nvidia.hpp.in +++ b/composable_kernel/include/utility/config_nvidia.hpp.in @@ -9,6 +9,9 @@ #include "helper_cuda.h" #define CK_USE_AMD_INLINE_ASM 0 +#define CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_BLOCKWISE_GENERIC_SLICE_COPY_V1 0 +#define CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V1 0 + namespace ck { // For some reason, CUDA need this definition, otherwise diff --git a/composable_kernel/include/utility/functional.hpp b/composable_kernel/include/utility/functional.hpp index 920f11af2a..56bdc4c06b 100644 --- a/composable_kernel/include/utility/functional.hpp +++ b/composable_kernel/include/utility/functional.hpp @@ -24,10 +24,8 @@ struct swallow }; // Emulate if constexpr -template -struct static_if -{ -}; +template +struct static_if; template <> struct static_if diff --git a/composable_kernel/include/utility/integral_constant.hpp b/composable_kernel/include/utility/integral_constant.hpp index 005d08e0c9..6038b1b203 100644 --- a/composable_kernel/include/utility/integral_constant.hpp +++ b/composable_kernel/include/utility/integral_constant.hpp @@ -1,15 +1,12 @@ #ifndef CK_INTEGRAL_CONSTANT_HPP #define CK_INTEGRAL_CONSTANT_HPP +#include + namespace ck { -template -struct integral_constant -{ - static const T value = N; - - __host__ __device__ constexpr T Get() const { return value; } -}; +template +using integral_constant = std::integral_constant; template __host__ __device__ constexpr auto operator+(integral_constant, integral_constant) @@ -17,6 +14,12 @@ __host__ __device__ constexpr auto operator+(integral_constant, integral_c return integral_constant{}; } +template +__host__ __device__ constexpr auto operator*(integral_constant, integral_constant) +{ + return integral_constant{}; +} + template using Number = integral_constant; diff --git a/composable_kernel/include/utility/utility.hpp b/composable_kernel/include/utility/utility.hpp index 9d32c2be55..c744e27ddd 100644 --- a/composable_kernel/include/utility/utility.hpp +++ b/composable_kernel/include/utility/utility.hpp @@ -1,6 +1,7 @@ #ifndef CK_UTILITY_HPP #define CK_UTILITY_HPP +#include #include "config.hpp" namespace ck { @@ -9,23 +10,8 @@ __device__ index_t get_thread_local_1d_id() { return threadIdx.x; } __device__ index_t get_block_1d_id() { return blockIdx.x; } -template -struct is_same -{ - static constexpr bool value = false; -}; - -template -struct is_same -{ - static constexpr bool value = true; -}; - template -__host__ __device__ constexpr bool is_same_type(X, Y) -{ - return is_same::value; -} +using is_same = std::is_same; namespace math { @@ -58,7 +44,7 
@@ struct integer_divide_ceiler { __host__ __device__ constexpr T operator()(T a, T b) const { - static_assert(is_same::value || is_same::value, "wrong type"); + static_assert(is_same{} || is_same{}, "wrong type"); return (a + b - 1) / b; } @@ -67,7 +53,7 @@ struct integer_divide_ceiler template __host__ __device__ constexpr T integer_divide_ceil(T a, T b) { - static_assert(is_same::value || is_same::value, "wrong type"); + static_assert(is_same{} || is_same{}, "wrong type"); return (a + b - 1) / b; } @@ -85,7 +71,7 @@ __host__ __device__ constexpr T max(T x, Ts... xs) auto y = max(xs...); - static_assert(is_same::value, "not the same type"); + static_assert(is_same{}, "not the same type"); return x > y ? x : y; } @@ -103,12 +89,12 @@ __host__ __device__ constexpr T min(T x, Ts... xs) auto y = min(xs...); - static_assert(is_same::value, "not the same type"); + static_assert(is_same{}, "not the same type"); return x < y ? x : y; } -// this is wrong +// this is WRONG // TODO: implement least common multiple properly, instead of calling max() template __host__ __device__ constexpr T lcm(T x, Ts... xs) diff --git a/composable_kernel/include/utility/vector_type.hpp b/composable_kernel/include/utility/vector_type.hpp index 72c73068e1..2b33887ffb 100644 --- a/composable_kernel/include/utility/vector_type.hpp +++ b/composable_kernel/include/utility/vector_type.hpp @@ -64,131 +64,6 @@ struct vector_type } }; -#if 0 -template <> -struct vector_type -{ - using MemoryType = half; - - __host__ __device__ static MemoryType Pack(half s) { return s; } -}; - -template <> -struct vector_type -{ - using MemoryType = half2; - - __host__ __device__ static MemoryType Pack(half s0, half s1) - { - union - { - MemoryType vector; - half scalar[2]; - } data; - - data.scalar[0] = s0; - data.scalar[1] = s1; - return data.vector; - } -}; - -template <> -struct vector_type -{ - using MemoryType = float2; -}; - -template <> -struct vector_type -{ - using MemoryType = float4; -}; - -template <> -struct vector_type -{ - using MemoryType = char; - - __host__ __device__ static MemoryType Pack(char s) { return s; } -}; - -template <> -struct vector_type -{ - using MemoryType = int16_t; - - __host__ __device__ static MemoryType Pack(char s0, char s1) - { - union - { - MemoryType vector; - char scalar[2]; - } data; - - data.scalar[0] = s0; - data.scalar[1] = s1; - return data.vector; - } -}; - -template <> -struct vector_type -{ - using MemoryType = int32_t; - - __host__ __device__ static MemoryType Pack(char s0, char s1, char s2, char s3) - { - union - { - MemoryType vector; - char scalar[4]; - } data; - - data.scalar[0] = s0; - data.scalar[1] = s1; - data.scalar[2] = s2; - data.scalar[3] = s3; - return data.vector; - } -}; - -template <> -struct vector_type -{ - using MemoryType = int64_t; -}; - -template <> -struct vector_type -{ - using MemoryType = int64_t; -}; - -template <> -struct vector_type -{ - using MemoryType = char4; -}; - -template <> -struct vector_type -{ - using MemoryType = int64_t; -}; - -template <> -struct vector_type -{ - using MemoryType = int; -}; - -template <> -struct vector_type -{ - using MemoryType = int64_t; -}; -#endif - } // namespace ck #endif diff --git a/driver/include/tensor.hpp b/driver/include/tensor.hpp index ff537e44fe..e7001b1022 100644 --- a/driver/include/tensor.hpp +++ b/driver/include/tensor.hpp @@ -46,7 +46,7 @@ auto call_f_unpack_args_impl(F f, T args, std::index_sequence) template auto call_f_unpack_args(F f, T args) { - constexpr std::size_t N = std::tuple_size::value; + 
constexpr std::size_t N = std::tuple_size<T>{}; return call_f_unpack_args_impl(f, args, std::make_index_sequence<N>{}); } @@ -60,7 +60,7 @@ auto construct_f_unpack_args(F, T args) { - constexpr std::size_t N = std::tuple_size<T>::value; + constexpr std::size_t N = std::tuple_size<T>{}; return construct_f_unpack_args_impl(args, std::make_index_sequence<N>{}); }