diff --git a/driver/device_direct_convolution_2_vectorized_nchw_kcyx_nkhw.hpp b/driver/device_direct_convolution_2_vectorized_nchw_kcyx_nkhw.hpp
index b25459c30d..d8bb3b768e 100644
--- a/driver/device_direct_convolution_2_vectorized_nchw_kcyx_nkhw.hpp
+++ b/driver/device_direct_convolution_2_vectorized_nchw_kcyx_nkhw.hpp
@@ -51,7 +51,7 @@ void device_direct_convolution_2_vectorized_nchw_kcyx_nkhw(InDesc,
         in_nchw_vec(n, c, h, w) =
             vector_t::Pack(in_nchw(n, 2 * c, h, w), in_nchw(n, 2 * c + 1, h, w));
 #elif 1
-        in_nchw_vec(n, c, h, w)  = vector_t::Pack(in_nchw(n, 4 * c, h, w),
+        in_nchw_vec(n, c, h, w) = vector_t::Pack(in_nchw(n, 4 * c, h, w),
                                                  in_nchw(n, 4 * c + 1, h, w),
                                                  in_nchw(n, 4 * c + 2, h, w),
                                                  in_nchw(n, 4 * c + 3, h, w));
@@ -113,37 +113,37 @@ void device_direct_convolution_2_vectorized_nchw_kcyx_nkhw(InDesc,
     constexpr unsigned BlockSize = 128;
 #elif 0
     // 3x3, 34x34, 128 thread, fp32, vector = 2
-    constexpr unsigned NPerBlock = 2;
-    constexpr unsigned KPerBlock = 32;
-    constexpr unsigned CPerBlock = 2;
+    constexpr unsigned NPerBlock  = 2;
+    constexpr unsigned KPerBlock  = 32;
+    constexpr unsigned CPerBlock  = 2;
     constexpr unsigned HoPerBlock = 2;
     constexpr unsigned WoPerBlock = 32;
 
-    constexpr unsigned NPerThread = 2;
-    constexpr unsigned KPerThread = 4;
-    constexpr unsigned CPerThread = 1;
+    constexpr unsigned NPerThread  = 2;
+    constexpr unsigned KPerThread  = 4;
+    constexpr unsigned CPerThread  = 1;
     constexpr unsigned HoPerThread = 2;
     constexpr unsigned WoPerThread = 2;
 
-    constexpr unsigned InBlockCopyDataPerRead = 2;
+    constexpr unsigned InBlockCopyDataPerRead  = 2;
     constexpr unsigned WeiBlockCopyDataPerRead = 2;
 
     constexpr unsigned BlockSize = 128;
 #elif 0
     // 3x3, 34x34, 128 thread, int8, vector = 4
-    constexpr unsigned NPerBlock = 2;
-    constexpr unsigned KPerBlock = 32;
-    constexpr unsigned CPerBlock = 8;
+    constexpr unsigned NPerBlock  = 2;
+    constexpr unsigned KPerBlock  = 32;
+    constexpr unsigned CPerBlock  = 8;
     constexpr unsigned HoPerBlock = 4;
     constexpr unsigned WoPerBlock = 32;
 
-    constexpr unsigned NPerThread = 1;
-    constexpr unsigned KPerThread = 8;
-    constexpr unsigned CPerThread = 2;
+    constexpr unsigned NPerThread  = 1;
+    constexpr unsigned KPerThread  = 8;
+    constexpr unsigned CPerThread  = 2;
     constexpr unsigned HoPerThread = 4;
     constexpr unsigned WoPerThread = 2;
 
-    constexpr unsigned InBlockCopyDataPerRead = 2;
+    constexpr unsigned InBlockCopyDataPerRead  = 2;
     constexpr unsigned WeiBlockCopyDataPerRead = 2;
 
     constexpr unsigned BlockSize = 128;
diff --git a/driver/device_implicit_gemm_convolution_1_chwn_cyxk_khwn.hpp b/driver/device_implicit_gemm_convolution_1_chwn_cyxk_khwn.hpp
index 2c27080670..99a6eb45c1 100644
--- a/driver/device_implicit_gemm_convolution_1_chwn_cyxk_khwn.hpp
+++ b/driver/device_implicit_gemm_convolution_1_chwn_cyxk_khwn.hpp
@@ -74,7 +74,7 @@ void device_implicit_gemm_convolution_1_chwn_cyxk_khwn(InDesc,
     wei_cyxk_device_buf.ToDevice(wei_cyxk.mData.data());
     out_khwn_device_buf.ToDevice(out_khwn.mData.data());
 
-#if 1
+#if 0
     // for 3x3, 34x34
     constexpr unsigned NPerBlock  = 16;
     constexpr unsigned KPerBlock  = 64;
@@ -213,7 +213,7 @@ void device_implicit_gemm_convolution_1_chwn_cyxk_khwn(InDesc,
     constexpr unsigned WoPerThread = 1;
 
     constexpr unsigned BlockSize = 128;
-#elif 1
+#elif 0
     // for 1x1, 28x28
     constexpr unsigned NPerBlock  = 16;
     constexpr unsigned KPerBlock  = 128;
@@ -245,6 +245,39 @@ void device_implicit_gemm_convolution_1_chwn_cyxk_khwn(InDesc,
 
     constexpr unsigned OutThreadCopyDataPerWrite = 2;
 
+    constexpr unsigned BlockSize = 128;
+#elif 1
+    // for 1x1, 14x14
+    constexpr unsigned NPerBlock  = 16;
+    constexpr unsigned KPerBlock  = 128;
+    constexpr unsigned CPerBlock  = 8;
+    constexpr unsigned HoPerBlock = 2;
+    constexpr unsigned WoPerBlock = 2;
+
+    constexpr unsigned NPerThread  = 4;
+    constexpr unsigned KPerThread  = 16;
+    constexpr unsigned CPerThread  = 1;
+    constexpr unsigned HoPerThread = 1;
+    constexpr unsigned WoPerThread = 1;
+
+    constexpr unsigned InBlockCopy_ThreadPerDimC = 8;
+    constexpr unsigned InBlockCopy_ThreadPerDimH = 2;
+    constexpr unsigned InBlockCopy_ThreadPerDimW = 2;
+    constexpr unsigned InBlockCopy_ThreadPerDimN = 4;
+    constexpr unsigned InBlockCopyDataPerRead    = 4;
+
+    constexpr unsigned WeiBlockCopyDataPerRead = 4;
+
+    constexpr unsigned GemmMPerThreadSubC = 4;
+    constexpr unsigned GemmNPerThreadSubC = 4;
+    constexpr unsigned GemmMLevel0Cluster = 4;
+    constexpr unsigned GemmNLevel0Cluster = 2;
+    constexpr unsigned GemmMLevel1Cluster = 2;
+    constexpr unsigned GemmNLevel1Cluster = 4;
+    constexpr unsigned GemmKPerThreadLoop = 1;
+
+    constexpr unsigned OutThreadCopyDataPerWrite = 2;
+
     constexpr unsigned BlockSize = 128;
 #endif
 
diff --git a/driver/driver.hip.cpp b/driver/driver.hip.cpp
index b1df58265e..d391ec5b5f 100644
--- a/driver/driver.hip.cpp
+++ b/driver/driver.hip.cpp
@@ -8,11 +8,11 @@
 #include "ConstantTensorDescriptor.hip.hpp"
 #include "conv_common.hip.hpp"
 //#include "device_direct_convolution_1.hpp"
-#include "device_direct_convolution_2_nchw_kcyx_nkhw.hpp"
-#include "device_direct_convolution_2_vectorized_nchw_kcyx_nkhw.hpp"
-//#include "device_implicit_gemm_convolution_1_chwn_cyxk_khwn.hpp"
+//#include "device_direct_convolution_2_nchw_kcyx_nkhw.hpp"
+//#include "device_direct_convolution_2_vectorized_nchw_kcyx_nkhw.hpp"
+#include "device_implicit_gemm_convolution_1_chwn_cyxk_khwn.hpp"
 //#include "device_implicit_gemm_convolution_1_chwn_cyxk_khwn_padded.hpp"
-//#include "device_implicit_gemm_convolution_2_chwn_cyxk_khwn.hpp"
+#include "device_implicit_gemm_convolution_2_chwn_cyxk_khwn.hpp"
 
 struct GeneratorTensor_1
 {
@@ -353,7 +353,7 @@ void host_winograd_3x3_convolution(const Tensor<TIn>& in_nchw,
             std::size_t ho = HoPerTile * htile + j;
             for(int i = 0; i < WoPerTile; ++i)
             {
-                std::size_t wo         = WoPerTile * wtile + i;
+                std::size_t wo = WoPerTile * wtile + i;
                 out_nkhw(n, k, ho, wo) = out_hold(n, k, htile, wtile, j, i);
             }
         }
@@ -568,7 +568,7 @@ int main(int argc, char* argv[])
 
     constexpr unsigned HPad = 2;
     constexpr unsigned WPad = 2;
-#elif 1
+#elif 0
     // 1x1 filter, 32x32 image
     constexpr unsigned N  = 64;
     constexpr unsigned C  = 256;
@@ -578,6 +578,18 @@ int main(int argc, char* argv[])
     constexpr unsigned Y  = 1;
     constexpr unsigned X  = 1;
 
+    constexpr unsigned HPad = 0;
+    constexpr unsigned WPad = 0;
+#elif 1
+    // 1x1 filter, 14x14 image
+    constexpr unsigned N  = 128;
+    constexpr unsigned C  = 2048;
+    constexpr unsigned HI = 14;
+    constexpr unsigned WI = 14;
+    constexpr unsigned K  = 512;
+    constexpr unsigned Y  = 1;
+    constexpr unsigned X  = 1;
+
     constexpr unsigned HPad = 0;
     constexpr unsigned WPad = 0;
 #endif
@@ -594,8 +606,8 @@ int main(int argc, char* argv[])
     ostream_ConstantTensorDescriptor(wei_kcyx_desc, std::cout << "wei_kcyx_desc: ");
     ostream_ConstantTensorDescriptor(out_nkhw_desc, std::cout << "out_nkhw_desc: ");
 
-    using in_data_t  = char;
-    using out_data_t = int32_t;
+    using in_data_t  = float;
+    using out_data_t = float;
     Tensor<in_data_t> in_nchw(make_TensorDescriptor(in_nchw_desc));
     Tensor<in_data_t> wei_kcyx(make_TensorDescriptor(wei_kcyx_desc));
     Tensor<out_data_t> out_nkhw_host(make_TensorDescriptor(out_nkhw_desc));
@@ -635,9 +647,9 @@ int main(int argc, char* argv[])
     device_direct_convolution_1
 #elif 0
     device_direct_convolution_2_nchw_kcyx_nkhw
-#elif 1
-    device_direct_convolution_2_vectorized_nchw_kcyx_nkhw
 #elif 0
+    device_direct_convolution_2_vectorized_nchw_kcyx_nkhw
+#elif 1
     device_implicit_gemm_convolution_1_chwn_cyxk_khwn
 #elif 0
     device_implicit_gemm_convolution_2_chwn_cyxk_khwn
diff --git a/src/include/Array.hip.hpp b/src/include/Array.hip.hpp
index 1caab6a4c9..89654cbc2b 100644
--- a/src/include/Array.hip.hpp
+++ b/src/include/Array.hip.hpp
@@ -10,7 +10,7 @@ struct Array
     unsigned mData[nSize];
 
     template <class... Xs>
-    __host__ __device__ Array(Xs... xs) : mData({static_cast<TData>(xs)...})
+    __host__ __device__ Array(Xs... xs) : mData{static_cast<TData>(xs)...}
     {
     }
 
diff --git a/src/include/blockwise_4d_tensor_op.hip.hpp b/src/include/blockwise_4d_tensor_op.hip.hpp
index fa5f36be51..0660c34ebb 100644
--- a/src/include/blockwise_4d_tensor_op.hip.hpp
+++ b/src/include/blockwise_4d_tensor_op.hip.hpp
@@ -340,10 +340,11 @@ struct BlockwiseChwnTensorCopyPadded
         constexpr unsigned NLoop = ref_desc.GetElementSize() / BlockSize;
 
         const Float* p_src_tmp =
-            p_src + src_desc.Get1dIndex(c_block_data_begin,
-                                        (ho_block_data_begin + h_block_pad_low) - h_global_pad_low,
-                                        (wo_block_data_begin + w_block_pad_low) - w_global_pad_low,
-                                        n_block_data_begin);
+            p_src +
+            src_desc.Get1dIndex(c_block_data_begin,
+                                (ho_block_data_begin + h_block_pad_low) - h_global_pad_low,
+                                (wo_block_data_begin + w_block_pad_low) - w_global_pad_low,
+                                n_block_data_begin);
 
 #if 0
         if(get_thread_local_1d_id() == 0)
diff --git a/src/include/blockwise_direct_convolution.hip.hpp b/src/include/blockwise_direct_convolution.hip.hpp
index 247ff219f6..7666607c9c 100644
--- a/src/include/blockwise_direct_convolution.hip.hpp
+++ b/src/include/blockwise_direct_convolution.hip.hpp
@@ -93,10 +93,11 @@ __device__ void blockwise_direct_convolution(InBlockDesc,
         Float p_out_thread[out_thread_desc.GetElementSpace()];
 
         threadwise_4d_tensor_copy(out_block_desc,
-                                  p_out_block + out_block_desc.Get1dIndex(n_thread_data_begin,
-                                                                          k_thread_data_begin,
-                                                                          ho_thread_data_begin,
-                                                                          wo_thread_data_begin),
+                                  p_out_block +
+                                      out_block_desc.Get1dIndex(n_thread_data_begin,
+                                                                k_thread_data_begin,
+                                                                ho_thread_data_begin,
+                                                                wo_thread_data_begin),
                                   out_thread_desc,
                                   p_out_thread,
                                   out_thread_desc.GetLengths());
@@ -107,10 +108,11 @@ __device__ void blockwise_direct_convolution(InBlockDesc,
             // threadwise convolution
             threadwise_direct_convolution_2(
                 in_thread_block_desc,
-                p_in_block + in_block_desc.Get1dIndex(n_thread_data_begin,
-                                                      c_thread_data_begin,
-                                                      hi_thread_data_begin,
-                                                      wi_thread_data_begin),
+                p_in_block +
+                    in_block_desc.Get1dIndex(n_thread_data_begin,
+                                             c_thread_data_begin,
+                                             hi_thread_data_begin,
+                                             wi_thread_data_begin),
                 wei_thread_block_desc,
                 p_wei_block +
                     wei_block_desc.Get1dIndex(k_thread_data_begin, c_thread_data_begin, 0, 0),
@@ -122,10 +124,11 @@ __device__ void blockwise_direct_convolution(InBlockDesc,
         threadwise_4d_tensor_copy(out_thread_desc,
                                   p_out_thread,
                                   out_block_desc,
-                                  p_out_block + out_block_desc.Get1dIndex(n_thread_data_begin,
-                                                                          k_thread_data_begin,
-                                                                          ho_thread_data_begin,
-                                                                          wo_thread_data_begin),
+                                  p_out_block +
+                                      out_block_desc.Get1dIndex(n_thread_data_begin,
+                                                                k_thread_data_begin,
+                                                                ho_thread_data_begin,
+                                                                wo_thread_data_begin),
                                   out_thread_desc.GetLengths());
     }
 }
diff --git a/src/include/blockwise_gemm.hip.hpp b/src/include/blockwise_gemm.hip.hpp
index 33556dde25..221a7153a2 100644
--- a/src/include/blockwise_gemm.hip.hpp
+++ b/src/include/blockwise_gemm.hip.hpp
@@ -431,12 +431,12 @@ struct BlockwiseBatchGemmBlockABlockBThreadCTransANormalBNormalC_V2
         constexpr unsigned MRepeat = MPerThread / MPerThreadSubC;
         constexpr unsigned NRepeat = NPerThread / NPerThreadSubC;
 
-        // loop over k
+// loop over k
 #pragma unroll
         for(unsigned k_begin = 0; k_begin < KPerBlock; k_begin += KPerThreadLoop)
         {
-            // read first batch of A, B
-            //   copy A-sub to form A
+// read first batch of A, B
+//   copy A-sub to form A
 #pragma unroll
             for(unsigned m_repeat = 0; m_repeat < MRepeat; ++m_repeat)
             {
@@ -449,7 +449,7 @@ struct BlockwiseBatchGemmBlockABlockBThreadCTransANormalBNormalC_V2
                     a_thread_sub_mtx.GetLengths());
             }
 
-            //   copy B-sub to form B
+//   copy B-sub to form B
 #pragma unroll
             for(unsigned n_repeat = 0; n_repeat < NRepeat; ++n_repeat)
             {
@@ -462,7 +462,7 @@ struct BlockwiseBatchGemmBlockABlockBThreadCTransANormalBNormalC_V2
                     b_thread_sub_mtx.GetLengths());
             }
 
-            // loop over batch
+// loop over batch
 #pragma unroll
             for(unsigned ib = 0; ib + 1 < BatchPerThread; ++ib)
             {
@@ -551,14 +551,15 @@ struct BlockwiseBatchGemmBlockABlockBThreadCTransANormalBNormalC_V2
             c_thread_mtx_begin.batch * BlockMatrixStrideC +
             c_block_mtx.Get1dIndex(c_thread_mtx_begin.row, c_thread_mtx_begin.col);
 
-        for(unsigned m_repeat = 0; m_repeat, MRepeat; ++m_repeat)
+        for(unsigned m_repeat = 0; m_repeat < MRepeat; ++m_repeat)
         {
-            for(unsigned n_repeat = 0; n_repeat, NRepeat; ++n_repeat)
+            for(unsigned n_repeat = 0; n_repeat < NRepeat; ++n_repeat)
             {
                 threadwise_matrix_copy(
                     c_thread_sub_mtx,
-                    p_c_thread + c_thread_sub_mtx.Get1dIndex(m_repeat * MPerLevel1Cluster,
-                                                             n_repeat * NPerLevel1Cluster),
+                    p_c_thread +
+                        c_thread_sub_mtx.Get1dIndex(m_repeat * MPerLevel1Cluster,
+                                                    n_repeat * NPerLevel1Cluster),
                     c_block_mtx,
                     p_c_block +
                         c_block_mtx.Get1dIndex(m_repeat * MPerLevel1Cluster,
@@ -656,8 +657,9 @@ struct BlockwiseGemmBlockABlockBThreadC
             constexpr unsigned NClusterWork =
                 (NPerBlock + NPerThread * NThreadPerCluster - 1) / (NPerThread * NThreadPerCluster);
 
-            static_assert(BlockSize == (MClusterWork * MThreadPerCluster) *
-                                           (NClusterWork * NThreadPerCluster),
+            static_assert(BlockSize ==
+                              (MClusterWork * MThreadPerCluster) *
+                                  (NClusterWork * NThreadPerCluster),
                           "wrong! wrong BlockSize");
 
             if(DistributeThreadAlongColumnFirst)
@@ -1256,8 +1258,9 @@ struct BlockwiseGemmBlockABlockBThreadCTransANormalBNormalC_v2
                             p_b_thread + b_thread_mtx.Get1dIndex(0, n_repeat * NPerThreadSubC),
                             c_thread_sub_mtx,
                             False,
-                            p_c_thread + c_thread_mtx.Get1dIndex(m_repeat * MPerThreadSubC,
-                                                                 n_repeat * NPerThreadSubC),
+                            p_c_thread +
+                                c_thread_mtx.Get1dIndex(m_repeat * MPerThreadSubC,
+                                                        n_repeat * NPerThreadSubC),
                             f_accum);
                     }
                 }
diff --git a/src/include/common.hip.hpp b/src/include/common.hip.hpp
index d5832dde9d..ba0a521fb3 100644
--- a/src/include/common.hip.hpp
+++ b/src/include/common.hip.hpp
@@ -21,6 +21,7 @@ struct is_same<T, T>
     static const bool value = true;
 };
 
+#if 0
 template <typename T>
 __host__ __device__ constexpr T max(T a, T b)
 {
@@ -32,6 +33,7 @@ __host__ __device__ constexpr T min(T a, T b)
 {
     return a < b ? a : b;
 }
+#endif
 
 __host__ __device__ constexpr unsigned integer_divide_ceil(unsigned a, unsigned b)
 {
diff --git a/src/include/config.h.in b/src/include/config.h.in
index 7b888c6951..bb4f6cb51d 100644
--- a/src/include/config.h.in
+++ b/src/include/config.h.in
@@ -4,8 +4,10 @@
 
 #if DEVICE_BACKEND_HIP
 #include "hip/hip_runtime.h"
+#include "hip/hip_fp16.h"
 #elif DEVICE_BACKEND_CUDA
 #include "cuda_runtime.h"
+#include "cuda_fp16.h"
 #include "nvToolsExt.h"
 #include "helper_cuda.h"
 #endif
diff --git a/src/include/data_type.hip.hpp b/src/include/data_type.hip.hpp
index ca1f4dcbae..95d5b0b33f 100644
--- a/src/include/data_type.hip.hpp
+++ b/src/include/data_type.hip.hpp
@@ -1,15 +1,6 @@
 #pragma once
 #include "config.h"
 
-#if DEVICE_BACKEND_CUDA
-namespace CUDA {
-#include "cuda_fp16.h"
-}
-#endif
-
-using half  = CUDA::half;
-using half2 = CUDA::half2;
-
 template <class T, unsigned N>
 struct vector_type
 {
@@ -52,6 +43,7 @@ struct vector_type<float2, 2>
     using MemoryType = float4;
 };
 
+#if 0
 template <>
 struct vector_type<half, 1>
 {
@@ -91,24 +83,6 @@ struct vector_type<half, 8>
     using MemoryType = float4;
 };
 
-template <>
-struct vector_type<half2, 1>
-{
-    using MemoryType = half2;
-};
-
-template <>
-struct vector_type<half2, 2>
-{
-    using MemoryType = float2;
-};
-
-template <>
-struct vector_type<half2, 4>
-{
-    using MemoryType = float4;
-};
-
 template <>
 struct vector_type<char, 1>
 {
@@ -169,7 +143,6 @@ struct vector_type<int32_t, 2>
     using MemoryType = int64_t;
 };
 
-#if 0
 template <>
 struct vector_type<char2, 2>
 {
@@ -214,6 +187,7 @@ __device__ void fused_multiply_accumulate(float& d, const float4& s0, const floa
     d += s0.w * s1.w;
 }
 
+#if 0
 __device__ void fused_multiply_accumulate(half& d, const half& s0, const half& s1) { d += s0 * s1; }
 
 __device__ void fused_multiply_accumulate(half& d, const half2& s0, const half2& s1)
@@ -222,12 +196,10 @@ __device__ void fused_multiply_accumulate(half& d, const half2& s0, const half2&
     d += s0.y * s1.y;
 }
 
-#if 0
 __device__ void fused_multiply_accumulate(float& d, const half2& s0, const half2& s1)
 {
     d += s0.x * s1.x + s0.y * s1.y;
 }
-#endif
 
 __device__ void fused_multiply_accumulate(char& d, const char& s0, const char& s1) { d += s0 * s1; }
 
@@ -239,3 +211,4 @@ __device__ void fused_multiply_accumulate(int32_t& d, const int32_t& s0, const i
     d = __dp4a(s0, s1, d);
 #endif
 }
+#endif
diff --git a/src/include/gridwise_direct_convolution_1.hip.hpp b/src/include/gridwise_direct_convolution_1.hip.hpp
index f4fe1809fc..edcfd6d38e 100644
--- a/src/include/gridwise_direct_convolution_1.hip.hpp
+++ b/src/include/gridwise_direct_convolution_1.hip.hpp
@@ -113,10 +113,11 @@ __global__ void gridwise_direct_convolution_1(const Float* const __restrict__ p_
         c_block_work_begin += CPerBlock)
     {
         // copy input tensor to LDS
-        blockwise_in_copy.Run(p_in_global + in_global_desc.Get1dIndex(n_block_work_begin,
-                                                                      c_block_work_begin,
-                                                                      hi_block_work_begin,
-                                                                      wi_block_work_begin),
+        blockwise_in_copy.Run(p_in_global +
+                                  in_global_desc.Get1dIndex(n_block_work_begin,
+                                                            c_block_work_begin,
+                                                            hi_block_work_begin,
+                                                            wi_block_work_begin),
                               p_in_block);
 
         // copy weight tensor to LDS
@@ -143,9 +144,9 @@ __global__ void gridwise_direct_convolution_1(const Float* const __restrict__ p_
     }
 
     // copy output tensor from LDS to device mem
-    blockwise_out_copy.Run(p_out_block,
-                           p_out_global + out_global_desc.Get1dIndex(n_block_work_begin,
-                                                                     k_block_work_begin,
-                                                                     ho_block_work_begin,
-                                                                     wo_block_work_begin));
+    blockwise_out_copy.Run(
+        p_out_block,
+        p_out_global +
+            out_global_desc.Get1dIndex(
+                n_block_work_begin, k_block_work_begin, ho_block_work_begin, wo_block_work_begin));
 }
diff --git a/src/include/gridwise_direct_convolution_2_nchw_kcyx_nkhw.hip.hpp b/src/include/gridwise_direct_convolution_2_nchw_kcyx_nkhw.hip.hpp
index 5761a22c16..1e6d3d24bd 100644
--- a/src/include/gridwise_direct_convolution_2_nchw_kcyx_nkhw.hip.hpp
+++ b/src/include/gridwise_direct_convolution_2_nchw_kcyx_nkhw.hip.hpp
@@ -176,16 +176,18 @@ gridwise_direct_convolution_2_nchw_kcyx_nkhw(const Float* const __restrict__ p_i
         c_block_data_begin += CPerBlock, __syncthreads())
     {
         // copy input tensor to LDS
-        blockwise_in_copy.Run(p_in_global + in_nchw_global_desc.Get1dIndex(n_block_data_begin,
-                                                                           c_block_data_begin,
-                                                                           hi_block_data_begin,
-                                                                           wi_block_data_begin),
+        blockwise_in_copy.Run(p_in_global +
+                                  in_nchw_global_desc.Get1dIndex(n_block_data_begin,
+                                                                 c_block_data_begin,
+                                                                 hi_block_data_begin,
+                                                                 wi_block_data_begin),
                               p_in_block);
 
         // copy weight tensor to LDS
-        blockwise_wei_copy.Run(p_wei_global + wei_kcyx_global_desc.Get1dIndex(
-                                                  k_block_data_begin, c_block_data_begin, 0, 0),
-                               p_wei_block);
+        blockwise_wei_copy.Run(
+            p_wei_global +
+                wei_kcyx_global_desc.Get1dIndex(k_block_data_begin, c_block_data_begin, 0, 0),
+            p_wei_block);
 
         __syncthreads();
 
@@ -195,10 +197,11 @@ gridwise_direct_convolution_2_nchw_kcyx_nkhw(const Float* const __restrict__ p_i
 #if 1
             threadwise_direct_convolution_2(
                 in_nchw_thread_block_desc,
-                p_in_block + in_nchw_block_desc.Get1dIndex(n_thread_data_begin,
-                                                           c_thread_data,
-                                                           hi_thread_data_begin,
-                                                           wi_thread_data_begin),
+                p_in_block +
+                    in_nchw_block_desc.Get1dIndex(n_thread_data_begin,
+                                                  c_thread_data,
+                                                  hi_thread_data_begin,
+                                                  wi_thread_data_begin),
                 wei_kcyx_thread_block_desc,
                 p_wei_block +
                     wei_kcyx_block_desc.Get1dIndex(k_thread_data_begin, c_thread_data, 0, 0),
@@ -207,10 +210,11 @@ gridwise_direct_convolution_2_nchw_kcyx_nkhw(const Float* const __restrict__ p_i
 #elif 0
             threadwise_direct_convolution_3(
                 in_nchw_thread_block_desc,
-                p_in_block + in_nchw_block_desc.Get1dIndex(n_thread_data_begin,
-                                                           c_thread_data,
-                                                           hi_thread_data_begin,
-                                                           wi_thread_data_begin),
+                p_in_block +
+                    in_nchw_block_desc.Get1dIndex(n_thread_data_begin,
+                                                  c_thread_data,
+                                                  hi_thread_data_begin,
+                                                  wi_thread_data_begin),
                 wei_kcyx_thread_block_desc,
                 p_wei_block +
                     wei_kcyx_block_desc.Get1dIndex(k_thread_data_begin, c_thread_data, 0, 0),
@@ -225,9 +229,10 @@ gridwise_direct_convolution_2_nchw_kcyx_nkhw(const Float* const __restrict__ p_i
         out_nkhw_thread_desc,
         p_out_thread,
         out_nkhw_global_desc,
-        p_out_global + out_nkhw_global_desc.Get1dIndex(n_block_data_begin + n_thread_data_begin,
-                                                       k_block_data_begin + k_thread_data_begin,
-                                                       ho_block_data_begin + ho_thread_data_begin,
-                                                       wo_block_data_begin + wo_thread_data_begin),
+        p_out_global +
+            out_nkhw_global_desc.Get1dIndex(n_block_data_begin + n_thread_data_begin,
+                                            k_block_data_begin + k_thread_data_begin,
+                                            ho_block_data_begin + ho_thread_data_begin,
+                                            wo_block_data_begin + wo_thread_data_begin),
         out_nkhw_thread_desc.GetLengths());
 }
diff --git a/src/include/gridwise_direct_convolution_2_vectorized_nchw_kcyx_nkhw.hip.hpp b/src/include/gridwise_direct_convolution_2_vectorized_nchw_kcyx_nkhw.hip.hpp
index 7ae594788b..4d72368b29 100644
--- a/src/include/gridwise_direct_convolution_2_vectorized_nchw_kcyx_nkhw.hip.hpp
+++ b/src/include/gridwise_direct_convolution_2_vectorized_nchw_kcyx_nkhw.hip.hpp
@@ -200,9 +200,10 @@ __global__ void gridwise_direct_convolution_2_vectorized_nchw_kcyx_nkhw(
                               p_in_vec_block);
 
         // copy weight tensor to LDS
-        blockwise_wei_copy.Run(p_wei_vec_global + wei_kcyx_vec_global_desc.Get1dIndex(
-                                                      k_block_data_begin, c_block_data_begin, 0, 0),
-                               p_wei_vec_block);
+        blockwise_wei_copy.Run(
+            p_wei_vec_global +
+                wei_kcyx_vec_global_desc.Get1dIndex(k_block_data_begin, c_block_data_begin, 0, 0),
+            p_wei_vec_block);
 
         __syncthreads();
 
@@ -212,10 +213,11 @@ __global__ void gridwise_direct_convolution_2_vectorized_nchw_kcyx_nkhw(
 #if 1
             threadwise_direct_convolution_2(
                 in_nchw_vec_thread_block_desc,
-                p_in_vec_block + in_nchw_vec_block_desc.Get1dIndex(n_thread_data_begin,
-                                                                   c_thread_data,
-                                                                   hi_thread_data_begin,
-                                                                   wi_thread_data_begin),
+                p_in_vec_block +
+                    in_nchw_vec_block_desc.Get1dIndex(n_thread_data_begin,
+                                                      c_thread_data,
+                                                      hi_thread_data_begin,
+                                                      wi_thread_data_begin),
                 wei_kcyx_vec_thread_block_desc,
                 p_wei_vec_block +
                     wei_kcyx_vec_block_desc.Get1dIndex(k_thread_data_begin, c_thread_data, 0, 0),
@@ -224,10 +226,11 @@ __global__ void gridwise_direct_convolution_2_vectorized_nchw_kcyx_nkhw(
 #elif 0
             threadwise_direct_convolution_3(
                 in_nchw_vec_thread_block_desc,
-                p_in_vec_block + in_nchw_vec_block_desc.Get1dIndex(n_thread_data_begin,
-                                                                   c_thread_data,
-                                                                   hi_thread_data_begin,
-                                                                   wi_thread_data_begin),
+                p_in_vec_block +
+                    in_nchw_vec_block_desc.Get1dIndex(n_thread_data_begin,
+                                                      c_thread_data,
+                                                      hi_thread_data_begin,
+                                                      wi_thread_data_begin),
                 wei_kcyx_vec_thread_block_desc,
                 p_wei_vec_block +
                     wei_kcyx_vec_block_desc.Get1dIndex(k_thread_data_begin, c_thread_data, 0, 0),
@@ -242,9 +245,10 @@ __global__ void gridwise_direct_convolution_2_vectorized_nchw_kcyx_nkhw(
         out_nkhw_thread_desc,
         p_out_thread,
         out_nkhw_global_desc,
-        p_out_global + out_nkhw_global_desc.Get1dIndex(n_block_data_begin + n_thread_data_begin,
-                                                       k_block_data_begin + k_thread_data_begin,
-                                                       ho_block_data_begin + ho_thread_data_begin,
-                                                       wo_block_data_begin + wo_thread_data_begin),
+        p_out_global +
+            out_nkhw_global_desc.Get1dIndex(n_block_data_begin + n_thread_data_begin,
+                                            k_block_data_begin + k_thread_data_begin,
+                                            ho_block_data_begin + ho_thread_data_begin,
+                                            wo_block_data_begin + wo_thread_data_begin),
         out_nkhw_thread_desc.GetLengths());
 }
diff --git a/src/include/gridwise_implicit_gemm_convolution_1_chwn_cyxk_khwn.hip.hpp b/src/include/gridwise_implicit_gemm_convolution_1_chwn_cyxk_khwn.hip.hpp
index 1caef669e9..99342d3ca1 100644
--- a/src/include/gridwise_implicit_gemm_convolution_1_chwn_cyxk_khwn.hip.hpp
+++ b/src/include/gridwise_implicit_gemm_convolution_1_chwn_cyxk_khwn.hip.hpp
@@ -184,8 +184,9 @@ gridwise_implicit_gemm_convolution_1_chwn_cyxk_khwn(const Float* const __restric
     threadwise_4d_tensor_set_zero(out_khwn_thread_desc, p_out_thread);
 
     const Float* p_in_global_block_begin =
-        p_in_global + in_chwn_global_desc.Get1dIndex(
-                          0, hi_block_data_begin, wi_block_data_begin, n_block_data_begin);
+        p_in_global +
+        in_chwn_global_desc.Get1dIndex(
+            0, hi_block_data_begin, wi_block_data_begin, n_block_data_begin);
 
     const Float* p_wei_global_block_begin =
         p_wei_global + wei_cyxk_global_desc.Get1dIndex(0, 0, 0, k_block_data_begin);
@@ -216,7 +217,7 @@ gridwise_implicit_gemm_convolution_1_chwn_cyxk_khwn(const Float* const __restric
         }
     }
 
-    // output: register to global mem,
+// output: register to global mem,
 #if 0
     const auto c_thread_mtx_begin =
         blockwise_batch_gemm.GetBeginOfThreadMatrixC(get_thread_local_1d_id());
@@ -286,16 +287,17 @@ gridwise_implicit_gemm_convolution_1_chwn_cyxk_khwn(const Float* const __restric
         }
 #endif
 
-        threadwise_8d_tensor_copy(out_8d_thread_desc,
-                                  p_out_thread,
-                                  out_8d_global_desc,
-                                  p_out_global + out_khwn_global_desc.Get1dIndex(
-                                                     k_block_data_begin + k_thread_data_begin,
-                                                     ho_block_data_begin + ho_thread_data_begin,
-                                                     wo_block_data_begin + wo_thread_data_begin,
-                                                     n_block_data_begin + n_thread_data_begin),
-                                  out_8d_thread_desc.GetLengths(),
-                                  Number<OutThreadCopyDataPerWrite>{});
+        threadwise_8d_tensor_copy(
+            out_8d_thread_desc,
+            p_out_thread,
+            out_8d_global_desc,
+            p_out_global +
+                out_khwn_global_desc.Get1dIndex(k_block_data_begin + k_thread_data_begin,
+                                                ho_block_data_begin + ho_thread_data_begin,
+                                                wo_block_data_begin + wo_thread_data_begin,
+                                                n_block_data_begin + n_thread_data_begin),
+            out_8d_thread_desc.GetLengths(),
+            Number<OutThreadCopyDataPerWrite>{});
     }
     else if(NPerThread == NPerBlock)
     {
diff --git a/src/include/gridwise_implicit_gemm_convolution_1_chwn_cyxk_khwn_padded.hip.hpp b/src/include/gridwise_implicit_gemm_convolution_1_chwn_cyxk_khwn_padded.hip.hpp
index a4904cdf58..790a006023 100644
--- a/src/include/gridwise_implicit_gemm_convolution_1_chwn_cyxk_khwn_padded.hip.hpp
+++ b/src/include/gridwise_implicit_gemm_convolution_1_chwn_cyxk_khwn_padded.hip.hpp
@@ -283,10 +283,11 @@ __global__ void gridwise_implicit_gemm_convolution_1_chwn_cyxk_khwn_padded(
         out_hkwn_thread_desc,
         p_out_thread,
         out_khwn_global_desc,
-        p_out_global + out_khwn_global_desc.Get1dIndex(k_block_data_begin + k_thread_data_begin,
-                                                       ho_block_data_begin + ho_thread_data_begin,
-                                                       wo_block_data_begin + wo_thread_data_begin,
-                                                       n_block_data_begin + n_thread_data_begin),
+        p_out_global +
+            out_khwn_global_desc.Get1dIndex(k_block_data_begin + k_thread_data_begin,
+                                            ho_block_data_begin + ho_thread_data_begin,
+                                            wo_block_data_begin + wo_thread_data_begin,
+                                            n_block_data_begin + n_thread_data_begin),
         out_hkwn_thread_desc.GetLengths(),
         reorder_khwn_from_hkwn);
 }
diff --git a/src/include/gridwise_implicit_gemm_convolution_2_chwn_cyxk_khwn.hip.hpp b/src/include/gridwise_implicit_gemm_convolution_2_chwn_cyxk_khwn.hip.hpp
index afa3d3ee90..f68b57b6a0 100644
--- a/src/include/gridwise_implicit_gemm_convolution_2_chwn_cyxk_khwn.hip.hpp
+++ b/src/include/gridwise_implicit_gemm_convolution_2_chwn_cyxk_khwn.hip.hpp
@@ -121,7 +121,7 @@ gridwise_implicit_gemm_convolution_2_chwn_cyxk_khwn(const Float* const __restric
                                decltype(in_cb_block_desc),
                                decltype(in_cb_block_desc.GetLengths())>{};
 #elif 0
-    const auto blockwise_in_copy  = Blockwise2dTensorCopy2<BlockSize,
+    const auto blockwise_in_copy = Blockwise2dTensorCopy2<BlockSize,
                                                           Float,
                                                           decltype(in_cb_global_desc),
                                                           decltype(in_cb_block_desc),
@@ -129,7 +129,7 @@ gridwise_implicit_gemm_convolution_2_chwn_cyxk_khwn(const Float* const __restric
                                                           InBlockCopyThreadPerDim0,
                                                           InBlockCopyThreadPerDim1>{};
 #elif 1
-    const auto blockwise_in_copy  = Blockwise2dTensorCopy3<BlockSize,
+    const auto blockwise_in_copy = Blockwise2dTensorCopy3<BlockSize,
                                                           Float,
                                                           decltype(in_cb_global_desc),
                                                           decltype(in_cb_block_desc),
diff --git a/src/include/gridwise_implicit_gemm_convolution_2_chwn_cyxk_khwn_lds_double_buffer.hip.hpp b/src/include/gridwise_implicit_gemm_convolution_2_chwn_cyxk_khwn_lds_double_buffer.hip.hpp
index 60d827293b..728a2d56ba 100644
--- a/src/include/gridwise_implicit_gemm_convolution_2_chwn_cyxk_khwn_lds_double_buffer.hip.hpp
+++ b/src/include/gridwise_implicit_gemm_convolution_2_chwn_cyxk_khwn_lds_double_buffer.hip.hpp
@@ -121,7 +121,7 @@ __global__ void gridwise_implicit_gemm_convolution_2_chwn_cyxk_khwn_lds_double_b
                                decltype(in_cb_block_desc),
                                decltype(in_cb_block_desc.GetLengths())>{};
 #elif 0
-    const auto blockwise_in_copy  = Blockwise2dTensorCopy2<BlockSize,
+    const auto blockwise_in_copy = Blockwise2dTensorCopy2<BlockSize,
                                                           Float,
                                                           decltype(in_cb_global_desc),
                                                           decltype(in_cb_block_desc),
@@ -129,7 +129,7 @@ __global__ void gridwise_implicit_gemm_convolution_2_chwn_cyxk_khwn_lds_double_b
                                                           InBlockCopyThreadPerDim0,
                                                           InBlockCopyThreadPerDim1>{};
 #elif 1
-    const auto blockwise_in_copy  = Blockwise2dTensorCopy3<BlockSize,
+    const auto blockwise_in_copy = Blockwise2dTensorCopy3<BlockSize,
                                                           Float,
                                                           decltype(in_cb_global_desc),
                                                           decltype(in_cb_block_desc),
diff --git a/src/include/tensor.hpp b/src/include/tensor.hpp
index d0c785c16e..1ebfef0c5d 100644
--- a/src/include/tensor.hpp
+++ b/src/include/tensor.hpp
@@ -22,8 +22,7 @@ std::ostream& LogRange(std::ostream& os, Range&& range, std::string delim)
     return os;
 }
 
-typedef enum
-{
+typedef enum {
     Half  = 0,
     Float = 1,
 } DataType_t;
diff --git a/src/include/threadwise_nd_tensor_op.hip.hpp b/src/include/threadwise_nd_tensor_op.hip.hpp
index c787afae77..510525db5d 100644
--- a/src/include/threadwise_nd_tensor_op.hip.hpp
+++ b/src/include/threadwise_nd_tensor_op.hip.hpp
@@ -10,7 +10,7 @@ __device__ void threadwise_6d_tensor_copy(SrcDesc,
                                           SrcOpLengths,
                                           Number<DataPerRead>)
 {
-    using vector_t = typename vector_type<Float, DataPerRead>::type;
+    using vector_t = typename vector_type<Float, DataPerRead>::MemoryType;
 
     static_assert(SrcDesc{}.GetDimension() == 6 && DstDesc{}.GetDimension() == 6 &&
                       SrcOpLengths::nDim == 6,
@@ -80,7 +80,7 @@ __device__ void threadwise_8d_tensor_copy(SrcDesc,
                                           SrcOpLengths,
                                           Number<DataPerRead>)
 {
-    using vector_t = typename vector_type<Float, DataPerRead>::type;
+    using vector_t = typename vector_type<Float, DataPerRead>::MemoryType;
 
     static_assert(SrcDesc{}.GetDimension() == 8 && DstDesc{}.GetDimension() == 8 &&
                       SrcOpLengths::nDim == 8,