refactor

2026-05-11 08:50:17 +00:00 · 2019-03-04 17:09:20 -06:00
parent 5fd40ad768
commit 04c5527d07
5 changed files with 94 additions and 100 deletions
--- a/driver/driver.hip.cpp
+++ b/driver/driver.hip.cpp
@@ -633,6 +633,7 @@ int main(int argc, char* argv[])

    if(do_verification)
    {
+#if 1
        if(Y == 3 && X == 3)
        {
            host_winograd_3x3_convolution(in_nchw, wei_kcsr, out_nkhw_host, lower_pads, upper_pads);
@@ -642,6 +643,7 @@ int main(int argc, char* argv[])
            host_direct_convolution(in_nchw, wei_kcsr, out_nkhw_host, lower_pads, upper_pads);
        }
        check_error(out_nkhw_host, out_nkhw_device);
+#endif

 #if 0
        LogRange(std::cout << "in_nchw : ", in_nchw.mData, ",") << std::endl;
--- a/src/include/blockwise_2d_tensor_op.hip.hpp
+++ b/src/include/blockwise_2d_tensor_op.hip.hpp
@@ -373,6 +373,8 @@ template <unsigned BlockSize,
          unsigned DataPerRead>
 struct Blockwise2dTensorCopy3
 {
+    using vector_t = typename vector_type<Float, DataPerRead>::type;
+
    unsigned mSrcMyThreadOffset;
    unsigned mDstMyThreadOffset;

@@ -424,11 +426,6 @@ struct Blockwise2dTensorCopy3

    __device__ void Run(const Float* __restrict__ p_src, Float* __restrict__ p_dst) const
    {
-        static_assert(is_same<Float, float>::value, "wrong! only support float!\n");
-
-        using Float2 = float2;
-        using Float4 = float4;
-
        constexpr auto I0 = Number<0>{};
        constexpr auto I1 = Number<1>{};

@@ -454,27 +451,9 @@ struct Blockwise2dTensorCopy3
        constexpr unsigned dst_loop_stride = DstDesc{}.GetStride(I0) * thread_per_d0;

        auto f_copy = [&](unsigned iloop) {
-            if(DataPerRead == 1)
-            {
-                p_dst[mDstMyThreadOffset + iloop * dst_loop_stride] =
-                    p_src[mSrcMyThreadOffset + iloop * src_loop_stride];
-            }
-            else if(DataPerRead == 2)
-            {
-                *(reinterpret_cast<Float2*>(p_dst + mDstMyThreadOffset + iloop * dst_loop_stride)) =
-                    *(reinterpret_cast<const Float2*>(p_src + mSrcMyThreadOffset +
-                                                      iloop * src_loop_stride));
-            }
-            else if(DataPerRead == 4)
-            {
-                *(reinterpret_cast<Float4*>(p_dst + mDstMyThreadOffset + iloop * dst_loop_stride)) =
-                    *(reinterpret_cast<const Float4*>(p_src + mSrcMyThreadOffset +
-                                                      iloop * src_loop_stride));
-            }
-            else
-            {
-                assert(false);
-            }
+            *(reinterpret_cast<vector_t*>(p_dst + mDstMyThreadOffset + iloop * dst_loop_stride)) =
+                *(reinterpret_cast<const vector_t*>(p_src + mSrcMyThreadOffset +
+                                                    iloop * src_loop_stride));
        };

        for(unsigned iloop = 0; iloop < nloop_d0; ++iloop)
@@ -514,11 +493,6 @@ struct Blockwise2dTensorCopy3
    __device__ void RunLoadRegisterClipboard(const Float* __restrict__ p_src,
                                             Float* p_clipboard) const
    {
-        static_assert(is_same<Float, float>::value, "wrong! only support float!\n");
-
-        using Float2 = float2;
-        using Float4 = float4;
-
        constexpr auto I0 = Number<0>{};
        constexpr auto I1 = Number<1>{};

@@ -544,26 +518,9 @@ struct Blockwise2dTensorCopy3
        constexpr unsigned dst_loop_stride = DstDesc{}.GetStride(I0) * thread_per_d0;

        auto f_copy = [&](unsigned iloop) {
-            if(DataPerRead == 1)
-            {
-                p_clipboard[iloop] = p_src[mSrcMyThreadOffset + iloop * src_loop_stride];
-            }
-            else if(DataPerRead == 2)
-            {
-                *(reinterpret_cast<Float2*>(p_clipboard + iloop * 2)) =
-                    *(reinterpret_cast<const Float2*>(p_src + mSrcMyThreadOffset +
-                                                      iloop * src_loop_stride));
-            }
-            else if(DataPerRead == 4)
-            {
-                *(reinterpret_cast<Float4*>(p_clipboard + iloop * 4)) =
-                    *(reinterpret_cast<const Float4*>(p_src + mSrcMyThreadOffset +
-                                                      iloop * src_loop_stride));
-            }
-            else
-            {
-                assert(false);
-            }
+            *(reinterpret_cast<vector_t*>(p_clipboard + iloop * 4)) =
+                *(reinterpret_cast<const vector_t*>(p_src + mSrcMyThreadOffset +
+                                                    iloop * src_loop_stride));
        };

        for(unsigned iloop = 0; iloop < nloop_d0; ++iloop)
@@ -587,11 +544,6 @@ struct Blockwise2dTensorCopy3
    __device__ void RunStoreRegisterClipboard(const Float* __restrict__ p_clipboard,
                                              Float* __restrict__ p_dst) const
    {
-        static_assert(is_same<Float, float>::value, "wrong! only support float!\n");
-
-        using Float2 = float2;
-        using Float4 = float4;
-
        constexpr auto I0 = Number<0>{};
        constexpr auto I1 = Number<1>{};

@@ -617,24 +569,8 @@ struct Blockwise2dTensorCopy3
        constexpr unsigned dst_loop_stride = DstDesc{}.GetStride(I0) * thread_per_d0;

        auto f_copy = [&](unsigned iloop) {
-            if(DataPerRead == 1)
-            {
-                p_dst[mDstMyThreadOffset + iloop * dst_loop_stride] = p_clipboard[iloop];
-            }
-            else if(DataPerRead == 2)
-            {
-                *(reinterpret_cast<Float2*>(p_dst + mDstMyThreadOffset + iloop * dst_loop_stride)) =
-                    *(reinterpret_cast<const Float2*>(p_clipboard + iloop * 2));
-            }
-            else if(DataPerRead == 4)
-            {
-                *(reinterpret_cast<Float4*>(p_dst + mDstMyThreadOffset + iloop * dst_loop_stride)) =
-                    *(reinterpret_cast<const Float4*>(p_clipboard + iloop * 4));
-            }
-            else
-            {
-                assert(false);
-            }
+            *(reinterpret_cast<vector_t*>(p_dst + mDstMyThreadOffset + iloop * dst_loop_stride)) =
+                *(reinterpret_cast<const vector_t*>(p_clipboard + iloop * 4));
        };

        for(unsigned iloop = 0; iloop < nloop_d0; ++iloop)
--- a/src/include/blockwise_4d_tensor_op.hip.hpp
+++ b/src/include/blockwise_4d_tensor_op.hip.hpp
@@ -349,6 +349,8 @@ template <unsigned BlockSize,
          unsigned DataPerRead>
 struct Blockwise4dTensorCopy3
 {
+    using vector_t = typename vector_type<Float, DataPerRead>::type;
+
    unsigned mSrcMyThreadOffset;
    unsigned mDstMyThreadOffset;

@@ -422,11 +424,6 @@ struct Blockwise4dTensorCopy3

    __device__ void Run(const Float* __restrict__ p_src, Float* __restrict__ p_dst) const
    {
-        static_assert(is_same<Float, float>::value, "wrong! only support float!\n");
-
-        using Float2 = float2;
-        using Float4 = float4;
-
        constexpr auto I0 = Number<0>{};
        constexpr auto I1 = Number<1>{};
        constexpr auto I2 = Number<2>{};
@@ -482,27 +479,9 @@ struct Blockwise4dTensorCopy3
                                                 iloop_d2 * thread_per_d2,
                                                 iloop_d3 * thread_per_d3 * DataPerRead);

-                        if(DataPerRead == 1)
-                        {
-                            p_dst[dst_offset + mDstMyThreadOffset] =
-                                p_src[src_offset + mSrcMyThreadOffset];
-                        }
-                        else if(DataPerRead == 2)
-                        {
-                            *(reinterpret_cast<Float2*>(p_dst + dst_offset + mDstMyThreadOffset)) =
-                                *(reinterpret_cast<const Float2*>(p_src + src_offset +
-                                                                  mSrcMyThreadOffset));
-                        }
-                        else if(DataPerRead == 4)
-                        {
-                            *(reinterpret_cast<Float4*>(p_dst + dst_offset + mDstMyThreadOffset)) =
-                                *(reinterpret_cast<const Float4*>(p_src + src_offset +
-                                                                  mSrcMyThreadOffset));
-                        }
-                        else
-                        {
-                            assert(false);
-                        }
+                        *(reinterpret_cast<vector_t*>(p_dst + dst_offset + mDstMyThreadOffset)) =
+                            *(reinterpret_cast<const vector_t*>(p_src + src_offset +
+                                                                mSrcMyThreadOffset));
                    }
                }
            }
--- a/src/include/common.hip.hpp
+++ b/src/include/common.hip.hpp
@@ -16,6 +16,81 @@ struct is_same<T, T>
    static const bool value = true;
 };

+template <class T, unsigned N>
+struct vector_type
+{
+};
+
+template <>
+struct vector_type<float, 1>
+{
+    using type = float;
+};
+
+template <>
+struct vector_type<float, 2>
+{
+    using type = float2;
+};
+
+template <>
+struct vector_type<float, 4>
+{
+    using type = float4;
+};
+
+#if 0
+template <>
+struct vector_type<half_float::half, 1>
+{
+    using type = half_float::half;
+};
+
+template <>
+struct vector_type<half_float::half, 2>
+{
+    using type = float;
+};
+
+template <>
+struct vector_type<half_float::half, 4>
+{
+    using type = float2;
+};
+
+template <>
+struct vector_type<half_float::half, 8>
+{
+    using type = float4;
+};
+#endif
+
+#if 1
+template <>
+struct vector_type<half, 1>
+{
+    using type = half;
+};
+
+template <>
+struct vector_type<half, 2>
+{
+    using type = half2;
+};
+
+template <>
+struct vector_type<half, 4>
+{
+    using type = float2;
+};
+
+template <>
+struct vector_type<half, 8>
+{
+    using type = float4;
+};
+#endif
+
 template <class T, T N>
 struct integral_constant
 {
--- a/src/include/config.h.in
+++ b/src/include/config.h.in
@@ -4,8 +4,10 @@

 #if DEVICE_BACKEND_HIP
 #include "hip/hip_runtime.h"
+#include "half.hpp"
 #elif DEVICE_BACKEND_CUDA
 #include "cuda_runtime.h"
 #include "nvToolsExt.h"
 #include "helper_cuda.h"
+#include "cuda_fp16.h"
 #endif