diff --git a/driver/driver.hip.cpp b/driver/driver.hip.cpp
index f534e4eda9..3b18645c4b 100644
--- a/driver/driver.hip.cpp
+++ b/driver/driver.hip.cpp
@@ -633,6 +633,7 @@ int main(int argc, char* argv[])
 
     if(do_verification)
     {
+#if 1
         if(Y == 3 && X == 3)
         {
             host_winograd_3x3_convolution(in_nchw, wei_kcsr, out_nkhw_host, lower_pads, upper_pads);
@@ -642,6 +643,7 @@ int main(int argc, char* argv[])
             host_direct_convolution(in_nchw, wei_kcsr, out_nkhw_host, lower_pads, upper_pads);
         }
         check_error(out_nkhw_host, out_nkhw_device);
+#endif
 
 #if 0
         LogRange(std::cout << "in_nchw : ", in_nchw.mData, ",") << std::endl;
diff --git a/src/include/blockwise_2d_tensor_op.hip.hpp b/src/include/blockwise_2d_tensor_op.hip.hpp
index a178b5dade..b54c4d0c5f 100644
--- a/src/include/blockwise_2d_tensor_op.hip.hpp
+++ b/src/include/blockwise_2d_tensor_op.hip.hpp
@@ -373,6 +373,8 @@ template <unsigned BlockSize,
           unsigned DataPerRead>
 struct Blockwise2dTensorCopy3
 {
+    using vector_t = typename vector_type<Float, DataPerRead>::type;
+
     unsigned mSrcMyThreadOffset;
     unsigned mDstMyThreadOffset;
 
@@ -424,11 +426,6 @@ struct Blockwise2dTensorCopy3
 
     __device__ void Run(const Float* __restrict__ p_src, Float* __restrict__ p_dst) const
     {
-        static_assert(is_same<Float, float>::value, "wrong! only support float!\n");
-
-        using Float2 = float2;
-        using Float4 = float4;
-
         constexpr auto I0 = Number<0>{};
         constexpr auto I1 = Number<1>{};
 
@@ -454,27 +451,9 @@ struct Blockwise2dTensorCopy3
         constexpr unsigned dst_loop_stride = DstDesc{}.GetStride(I0) * thread_per_d0;
 
         auto f_copy = [&](unsigned iloop) {
-            if(DataPerRead == 1)
-            {
-                p_dst[mDstMyThreadOffset + iloop * dst_loop_stride] =
-                    p_src[mSrcMyThreadOffset + iloop * src_loop_stride];
-            }
-            else if(DataPerRead == 2)
-            {
-                *(reinterpret_cast<Float2*>(p_dst + mDstMyThreadOffset + iloop * dst_loop_stride)) =
-                    *(reinterpret_cast<const Float2*>(p_src + mSrcMyThreadOffset +
-                                                      iloop * src_loop_stride));
-            }
-            else if(DataPerRead == 4)
-            {
-                *(reinterpret_cast<Float4*>(p_dst + mDstMyThreadOffset + iloop * dst_loop_stride)) =
-                    *(reinterpret_cast<const Float4*>(p_src + mSrcMyThreadOffset +
-                                                      iloop * src_loop_stride));
-            }
-            else
-            {
-                assert(false);
-            }
+            *(reinterpret_cast<vector_t*>(p_dst + mDstMyThreadOffset + iloop * dst_loop_stride)) =
+                *(reinterpret_cast<const vector_t*>(p_src + mSrcMyThreadOffset +
+                                                    iloop * src_loop_stride));
         };
 
         for(unsigned iloop = 0; iloop < nloop_d0; ++iloop)
@@ -514,11 +493,6 @@ struct Blockwise2dTensorCopy3
     __device__ void RunLoadRegisterClipboard(const Float* __restrict__ p_src,
                                              Float* p_clipboard) const
     {
-        static_assert(is_same<Float, float>::value, "wrong! only support float!\n");
-
-        using Float2 = float2;
-        using Float4 = float4;
-
         constexpr auto I0 = Number<0>{};
         constexpr auto I1 = Number<1>{};
 
@@ -544,26 +518,9 @@ struct Blockwise2dTensorCopy3
         constexpr unsigned dst_loop_stride = DstDesc{}.GetStride(I0) * thread_per_d0;
 
         auto f_copy = [&](unsigned iloop) {
-            if(DataPerRead == 1)
-            {
-                p_clipboard[iloop] = p_src[mSrcMyThreadOffset + iloop * src_loop_stride];
-            }
-            else if(DataPerRead == 2)
-            {
-                *(reinterpret_cast<Float2*>(p_clipboard + iloop * 2)) =
-                    *(reinterpret_cast<const Float2*>(p_src + mSrcMyThreadOffset +
-                                                      iloop * src_loop_stride));
-            }
-            else if(DataPerRead == 4)
-            {
-                *(reinterpret_cast<Float4*>(p_clipboard + iloop * 4)) =
-                    *(reinterpret_cast<const Float4*>(p_src + mSrcMyThreadOffset +
-                                                      iloop * src_loop_stride));
-            }
-            else
-            {
-                assert(false);
-            }
+            *(reinterpret_cast<vector_t*>(p_clipboard + iloop * 4)) =
+                *(reinterpret_cast<const vector_t*>(p_src + mSrcMyThreadOffset +
+                                                    iloop * src_loop_stride));
         };
 
         for(unsigned iloop = 0; iloop < nloop_d0; ++iloop)
@@ -587,11 +544,6 @@ struct Blockwise2dTensorCopy3
     __device__ void RunStoreRegisterClipboard(const Float* __restrict__ p_clipboard,
                                               Float* __restrict__ p_dst) const
     {
-        static_assert(is_same<Float, float>::value, "wrong! only support float!\n");
-
-        using Float2 = float2;
-        using Float4 = float4;
-
         constexpr auto I0 = Number<0>{};
         constexpr auto I1 = Number<1>{};
 
@@ -617,24 +569,8 @@ struct Blockwise2dTensorCopy3
         constexpr unsigned dst_loop_stride = DstDesc{}.GetStride(I0) * thread_per_d0;
 
         auto f_copy = [&](unsigned iloop) {
-            if(DataPerRead == 1)
-            {
-                p_dst[mDstMyThreadOffset + iloop * dst_loop_stride] = p_clipboard[iloop];
-            }
-            else if(DataPerRead == 2)
-            {
-                *(reinterpret_cast<Float2*>(p_dst + mDstMyThreadOffset + iloop * dst_loop_stride)) =
-                    *(reinterpret_cast<const Float2*>(p_clipboard + iloop * 2));
-            }
-            else if(DataPerRead == 4)
-            {
-                *(reinterpret_cast<Float4*>(p_dst + mDstMyThreadOffset + iloop * dst_loop_stride)) =
-                    *(reinterpret_cast<const Float4*>(p_clipboard + iloop * 4));
-            }
-            else
-            {
-                assert(false);
-            }
+            *(reinterpret_cast<vector_t*>(p_dst + mDstMyThreadOffset + iloop * dst_loop_stride)) =
+                *(reinterpret_cast<const vector_t*>(p_clipboard + iloop * 4));
         };
 
         for(unsigned iloop = 0; iloop < nloop_d0; ++iloop)
diff --git a/src/include/blockwise_4d_tensor_op.hip.hpp b/src/include/blockwise_4d_tensor_op.hip.hpp
index 1b2f5e5d15..cc50d9eecd 100644
--- a/src/include/blockwise_4d_tensor_op.hip.hpp
+++ b/src/include/blockwise_4d_tensor_op.hip.hpp
@@ -349,6 +349,8 @@ template <unsigned BlockSize,
           unsigned DataPerRead>
 struct Blockwise4dTensorCopy3
 {
+    using vector_t = typename vector_type<Float, DataPerRead>::type;
+
     unsigned mSrcMyThreadOffset;
     unsigned mDstMyThreadOffset;
 
@@ -422,11 +424,6 @@ struct Blockwise4dTensorCopy3
 
     __device__ void Run(const Float* __restrict__ p_src, Float* __restrict__ p_dst) const
     {
-        static_assert(is_same<Float, float>::value, "wrong! only support float!\n");
-
-        using Float2 = float2;
-        using Float4 = float4;
-
         constexpr auto I0 = Number<0>{};
         constexpr auto I1 = Number<1>{};
         constexpr auto I2 = Number<2>{};
@@ -482,27 +479,9 @@ struct Blockwise4dTensorCopy3
                                                  iloop_d2 * thread_per_d2,
                                                  iloop_d3 * thread_per_d3 * DataPerRead);
 
-                        if(DataPerRead == 1)
-                        {
-                            p_dst[dst_offset + mDstMyThreadOffset] =
-                                p_src[src_offset + mSrcMyThreadOffset];
-                        }
-                        else if(DataPerRead == 2)
-                        {
-                            *(reinterpret_cast<Float2*>(p_dst + dst_offset + mDstMyThreadOffset)) =
-                                *(reinterpret_cast<const Float2*>(p_src + src_offset +
-                                                                  mSrcMyThreadOffset));
-                        }
-                        else if(DataPerRead == 4)
-                        {
-                            *(reinterpret_cast<Float4*>(p_dst + dst_offset + mDstMyThreadOffset)) =
-                                *(reinterpret_cast<const Float4*>(p_src + src_offset +
-                                                                  mSrcMyThreadOffset));
-                        }
-                        else
-                        {
-                            assert(false);
-                        }
+                        *(reinterpret_cast<vector_t*>(p_dst + dst_offset + mDstMyThreadOffset)) =
+                            *(reinterpret_cast<const vector_t*>(p_src + src_offset +
+                                                                mSrcMyThreadOffset));
                     }
                 }
             }
diff --git a/src/include/common.hip.hpp b/src/include/common.hip.hpp
index e2bc44943c..2df008fcad 100644
--- a/src/include/common.hip.hpp
+++ b/src/include/common.hip.hpp
@@ -16,6 +16,81 @@ struct is_same<T, T>
     static const bool value = true;
 };
 
+template <class T, unsigned N>
+struct vector_type
+{
+};
+
+template <>
+struct vector_type<float, 1>
+{
+    using type = float;
+};
+
+template <>
+struct vector_type<float, 2>
+{
+    using type = float2;
+};
+
+template <>
+struct vector_type<float, 4>
+{
+    using type = float4;
+};
+
+#if 0
+template <>
+struct vector_type<half_float::half, 1>
+{
+    using type = half_float::half;
+};
+
+template <>
+struct vector_type<half_float::half, 2>
+{
+    using type = float;
+};
+
+template <>
+struct vector_type<half_float::half, 4>
+{
+    using type = float2;
+};
+
+template <>
+struct vector_type<half_float::half, 8>
+{
+    using type = float4;
+};
+#endif
+
+#if 1
+template <>
+struct vector_type<half, 1>
+{
+    using type = half;
+};
+
+template <>
+struct vector_type<half, 2>
+{
+    using type = half2;
+};
+
+template <>
+struct vector_type<half, 4>
+{
+    using type = float2;
+};
+
+template <>
+struct vector_type<half, 8>
+{
+    using type = float4;
+};
+#endif
+
 template <class T, T N>
 struct integral_constant
 {
diff --git a/src/include/config.h.in b/src/include/config.h.in
index 7b888c6951..9ee0c41f80 100644
--- a/src/include/config.h.in
+++ b/src/include/config.h.in
@@ -4,8 +4,10 @@
 
 #if DEVICE_BACKEND_HIP
 #include "hip/hip_runtime.h"
+#include "half.hpp"
 #elif DEVICE_BACKEND_CUDA
 #include "cuda_runtime.h"
 #include "nvToolsExt.h"
 #include "helper_cuda.h"
+#include "cuda_fp16.h"
 #endif