diff --git a/driver/driver.hip.cpp b/driver/driver.hip.cpp index f534e4eda9..3b18645c4b 100644 --- a/driver/driver.hip.cpp +++ b/driver/driver.hip.cpp @@ -633,6 +633,7 @@ int main(int argc, char* argv[]) if(do_verification) { +#if 1 if(Y == 3 && X == 3) { host_winograd_3x3_convolution(in_nchw, wei_kcsr, out_nkhw_host, lower_pads, upper_pads); @@ -642,6 +643,7 @@ int main(int argc, char* argv[]) host_direct_convolution(in_nchw, wei_kcsr, out_nkhw_host, lower_pads, upper_pads); } check_error(out_nkhw_host, out_nkhw_device); +#endif #if 0 LogRange(std::cout << "in_nchw : ", in_nchw.mData, ",") << std::endl; diff --git a/src/include/blockwise_2d_tensor_op.hip.hpp b/src/include/blockwise_2d_tensor_op.hip.hpp index a178b5dade..b54c4d0c5f 100644 --- a/src/include/blockwise_2d_tensor_op.hip.hpp +++ b/src/include/blockwise_2d_tensor_op.hip.hpp @@ -373,6 +373,8 @@ template struct Blockwise2dTensorCopy3 { + using vector_t = typename vector_type::type; + unsigned mSrcMyThreadOffset; unsigned mDstMyThreadOffset; @@ -424,11 +426,6 @@ struct Blockwise2dTensorCopy3 __device__ void Run(const Float* __restrict__ p_src, Float* __restrict__ p_dst) const { - static_assert(is_same::value, "wrong! only support float!\n"); - - using Float2 = float2; - using Float4 = float4; - constexpr auto I0 = Number<0>{}; constexpr auto I1 = Number<1>{}; @@ -454,27 +451,9 @@ struct Blockwise2dTensorCopy3 constexpr unsigned dst_loop_stride = DstDesc{}.GetStride(I0) * thread_per_d0; auto f_copy = [&](unsigned iloop) { - if(DataPerRead == 1) - { - p_dst[mDstMyThreadOffset + iloop * dst_loop_stride] = - p_src[mSrcMyThreadOffset + iloop * src_loop_stride]; - } - else if(DataPerRead == 2) - { - *(reinterpret_cast(p_dst + mDstMyThreadOffset + iloop * dst_loop_stride)) = - *(reinterpret_cast(p_src + mSrcMyThreadOffset + - iloop * src_loop_stride)); - } - else if(DataPerRead == 4) - { - *(reinterpret_cast(p_dst + mDstMyThreadOffset + iloop * dst_loop_stride)) = - *(reinterpret_cast(p_src + mSrcMyThreadOffset + - iloop * src_loop_stride)); - } - else - { - assert(false); - } + *(reinterpret_cast(p_dst + mDstMyThreadOffset + iloop * dst_loop_stride)) = + *(reinterpret_cast(p_src + mSrcMyThreadOffset + + iloop * src_loop_stride)); }; for(unsigned iloop = 0; iloop < nloop_d0; ++iloop) @@ -514,11 +493,6 @@ struct Blockwise2dTensorCopy3 __device__ void RunLoadRegisterClipboard(const Float* __restrict__ p_src, Float* p_clipboard) const { - static_assert(is_same::value, "wrong! only support float!\n"); - - using Float2 = float2; - using Float4 = float4; - constexpr auto I0 = Number<0>{}; constexpr auto I1 = Number<1>{}; @@ -544,26 +518,9 @@ struct Blockwise2dTensorCopy3 constexpr unsigned dst_loop_stride = DstDesc{}.GetStride(I0) * thread_per_d0; auto f_copy = [&](unsigned iloop) { - if(DataPerRead == 1) - { - p_clipboard[iloop] = p_src[mSrcMyThreadOffset + iloop * src_loop_stride]; - } - else if(DataPerRead == 2) - { - *(reinterpret_cast(p_clipboard + iloop * 2)) = - *(reinterpret_cast(p_src + mSrcMyThreadOffset + - iloop * src_loop_stride)); - } - else if(DataPerRead == 4) - { - *(reinterpret_cast(p_clipboard + iloop * 4)) = - *(reinterpret_cast(p_src + mSrcMyThreadOffset + - iloop * src_loop_stride)); - } - else - { - assert(false); - } + *(reinterpret_cast(p_clipboard + iloop * 4)) = + *(reinterpret_cast(p_src + mSrcMyThreadOffset + + iloop * src_loop_stride)); }; for(unsigned iloop = 0; iloop < nloop_d0; ++iloop) @@ -587,11 +544,6 @@ struct Blockwise2dTensorCopy3 __device__ void RunStoreRegisterClipboard(const Float* __restrict__ p_clipboard, Float* __restrict__ p_dst) const { - static_assert(is_same::value, "wrong! only support float!\n"); - - using Float2 = float2; - using Float4 = float4; - constexpr auto I0 = Number<0>{}; constexpr auto I1 = Number<1>{}; @@ -617,24 +569,8 @@ struct Blockwise2dTensorCopy3 constexpr unsigned dst_loop_stride = DstDesc{}.GetStride(I0) * thread_per_d0; auto f_copy = [&](unsigned iloop) { - if(DataPerRead == 1) - { - p_dst[mDstMyThreadOffset + iloop * dst_loop_stride] = p_clipboard[iloop]; - } - else if(DataPerRead == 2) - { - *(reinterpret_cast(p_dst + mDstMyThreadOffset + iloop * dst_loop_stride)) = - *(reinterpret_cast(p_clipboard + iloop * 2)); - } - else if(DataPerRead == 4) - { - *(reinterpret_cast(p_dst + mDstMyThreadOffset + iloop * dst_loop_stride)) = - *(reinterpret_cast(p_clipboard + iloop * 4)); - } - else - { - assert(false); - } + *(reinterpret_cast(p_dst + mDstMyThreadOffset + iloop * dst_loop_stride)) = + *(reinterpret_cast(p_clipboard + iloop * 4)); }; for(unsigned iloop = 0; iloop < nloop_d0; ++iloop) diff --git a/src/include/blockwise_4d_tensor_op.hip.hpp b/src/include/blockwise_4d_tensor_op.hip.hpp index 1b2f5e5d15..cc50d9eecd 100644 --- a/src/include/blockwise_4d_tensor_op.hip.hpp +++ b/src/include/blockwise_4d_tensor_op.hip.hpp @@ -349,6 +349,8 @@ template struct Blockwise4dTensorCopy3 { + using vector_t = typename vector_type::type; + unsigned mSrcMyThreadOffset; unsigned mDstMyThreadOffset; @@ -422,11 +424,6 @@ struct Blockwise4dTensorCopy3 __device__ void Run(const Float* __restrict__ p_src, Float* __restrict__ p_dst) const { - static_assert(is_same::value, "wrong! only support float!\n"); - - using Float2 = float2; - using Float4 = float4; - constexpr auto I0 = Number<0>{}; constexpr auto I1 = Number<1>{}; constexpr auto I2 = Number<2>{}; @@ -482,27 +479,9 @@ struct Blockwise4dTensorCopy3 iloop_d2 * thread_per_d2, iloop_d3 * thread_per_d3 * DataPerRead); - if(DataPerRead == 1) - { - p_dst[dst_offset + mDstMyThreadOffset] = - p_src[src_offset + mSrcMyThreadOffset]; - } - else if(DataPerRead == 2) - { - *(reinterpret_cast(p_dst + dst_offset + mDstMyThreadOffset)) = - *(reinterpret_cast(p_src + src_offset + - mSrcMyThreadOffset)); - } - else if(DataPerRead == 4) - { - *(reinterpret_cast(p_dst + dst_offset + mDstMyThreadOffset)) = - *(reinterpret_cast(p_src + src_offset + - mSrcMyThreadOffset)); - } - else - { - assert(false); - } + *(reinterpret_cast(p_dst + dst_offset + mDstMyThreadOffset)) = + *(reinterpret_cast(p_src + src_offset + + mSrcMyThreadOffset)); } } } diff --git a/src/include/common.hip.hpp b/src/include/common.hip.hpp index e2bc44943c..2df008fcad 100644 --- a/src/include/common.hip.hpp +++ b/src/include/common.hip.hpp @@ -16,6 +16,81 @@ struct is_same static const bool value = true; }; +template +struct vector_type +{ +}; + +template <> +struct vector_type +{ + using type = float; +}; + +template <> +struct vector_type +{ + using type = float2; +}; + +template <> +struct vector_type +{ + using type = float4; +}; + +#if 0 +template <> +struct vector_type +{ + using type = half_float::half; +}; + +template <> +struct vector_type +{ + using type = float; +}; + +template <> +struct vector_type +{ + using type = float2; +}; + +template <> +struct vector_type +{ + using type = float4; +}; +#endif + +#if 1 +template <> +struct vector_type +{ + using type = half; +}; + +template <> +struct vector_type +{ + using type = half2; +}; + +template <> +struct vector_type +{ + using type = float2; +}; + +template <> +struct vector_type +{ + using type = float4; +}; +#endif + template struct integral_constant { diff --git a/src/include/config.h.in b/src/include/config.h.in index 7b888c6951..9ee0c41f80 100644 --- a/src/include/config.h.in +++ b/src/include/config.h.in @@ -4,8 +4,10 @@ #if DEVICE_BACKEND_HIP #include "hip/hip_runtime.h" +#include "half.hpp" #elif DEVICE_BACKEND_CUDA #include "cuda_runtime.h" #include "nvToolsExt.h" #include "helper_cuda.h" +#include "cuda_fp16.h" #endif