From 4f0fc72e913965d92d5410eb2a1c4a0e6b1fce6f Mon Sep 17 00:00:00 2001 From: Chao Liu Date: Mon, 18 Mar 2019 15:03:17 -0500 Subject: [PATCH] adding fp16 direct that reads pre-vectorized data --- ...ce_direct_convolution_2_nchw_kcyx_nkhw.hpp | 74 ++++---- ...onvolution_2_vectorized_nchw_kcyx_nkhw.hpp | 160 ++++++++++++++++++ driver/driver.hip.cpp | 157 ++++++++--------- src/include/blockwise_2d_tensor_op.hip.hpp | 2 +- src/include/blockwise_4d_tensor_op.hip.hpp | 8 +- src/include/common.hip.hpp | 38 +++-- ...irect_convolution_2_nchw_kcyx_nkhw.hip.hpp | 8 +- ...lution_2_vectorized_nchw_kcyx_nkhw.hip.hpp | 97 ++++++----- 8 files changed, 369 insertions(+), 175 deletions(-) create mode 100644 driver/device_direct_convolution_2_vectorized_nchw_kcyx_nkhw.hpp diff --git a/driver/device_direct_convolution_2_nchw_kcyx_nkhw.hpp b/driver/device_direct_convolution_2_nchw_kcyx_nkhw.hpp index 2fc2264f6d..4a49ff9fc8 100644 --- a/driver/device_direct_convolution_2_nchw_kcyx_nkhw.hpp +++ b/driver/device_direct_convolution_2_nchw_kcyx_nkhw.hpp @@ -1,8 +1,7 @@ #pragma once #include #include "device.hpp" -//#include "gridwise_direct_convolution_2_nchw_kcyx_nkhw.hip.hpp" -#include "gridwise_direct_convolution_2_vectorized_nchw_kcyx_nkhw.hip.hpp" +#include "gridwise_direct_convolution_2_nchw_kcyx_nkhw.hip.hpp" template void device_direct_convolution_2_nchw_kcyx_nkhw(InDesc, @@ -50,6 +49,24 @@ void device_direct_convolution_2_nchw_kcyx_nkhw(InDesc, constexpr unsigned InBlockCopyDataPerRead = 2; constexpr unsigned WeiBlockCopyDataPerRead = 4; + constexpr unsigned BlockSize = 128; +#elif 1 + // 3x3, 34x34, 128 thread, fp16 + constexpr unsigned NPerBlock = 2; + constexpr unsigned KPerBlock = 32; + constexpr unsigned CPerBlock = 4; + constexpr unsigned HoPerBlock = 2; + constexpr unsigned WoPerBlock = 32; + + constexpr unsigned NPerThread = 2; + constexpr unsigned KPerThread = 4; + constexpr unsigned CPerThread = 2; + constexpr unsigned HoPerThread = 2; + constexpr unsigned WoPerThread = 2; + + constexpr unsigned InBlockCopyDataPerRead = 2; + constexpr unsigned WeiBlockCopyDataPerRead = 4; + constexpr unsigned BlockSize = 128; #endif @@ -61,35 +78,30 @@ void device_direct_convolution_2_nchw_kcyx_nkhw(InDesc, for(unsigned i = 0; i < nrepeat; ++i) { - float time = launch_kernel( -#if 0 - gridwise_direct_convolution_2_nchw_kcyx_nkhw -#else - gridwise_direct_convolution_2_vectorized_nchw_kcyx_nkhw -#endif - , - dim3(GridSize), - dim3(BlockSize), - static_cast(in_device_buf.GetDeviceBuffer()), - static_cast(wei_device_buf.GetDeviceBuffer()), - static_cast(out_device_buf.GetDeviceBuffer())); + float time = + launch_kernel(gridwise_direct_convolution_2_nchw_kcyx_nkhw, + dim3(GridSize), + dim3(BlockSize), + static_cast(in_device_buf.GetDeviceBuffer()), + static_cast(wei_device_buf.GetDeviceBuffer()), + static_cast(out_device_buf.GetDeviceBuffer())); printf("Elapsed time : %f ms\n", time); usleep(std::min(time * 1000, float(10000))); diff --git a/driver/device_direct_convolution_2_vectorized_nchw_kcyx_nkhw.hpp b/driver/device_direct_convolution_2_vectorized_nchw_kcyx_nkhw.hpp new file mode 100644 index 0000000000..19633f0462 --- /dev/null +++ b/driver/device_direct_convolution_2_vectorized_nchw_kcyx_nkhw.hpp @@ -0,0 +1,160 @@ +#pragma once +#include +#include "device.hpp" +#include "gridwise_direct_convolution_2_vectorized_nchw_kcyx_nkhw.hip.hpp" + +template +void device_direct_convolution_2_vectorized_nchw_kcyx_nkhw(InDesc, + const Tensor& in_nchw, + WeiDesc, + const Tensor& wei_kcyx, + OutDesc, + Tensor& out_nkhw, + unsigned nrepeat) +{ + constexpr unsigned NVector = 1; + using vector_type_t = vector_type; + using vector_t = typename vector_type_t::VectorType; + + constexpr auto I0 = Number<0>{}; + constexpr auto I1 = Number<1>{}; + constexpr auto I2 = Number<2>{}; + constexpr auto I3 = Number<3>{}; + + constexpr auto in_nchw_desc = InDesc{}; + constexpr auto wei_kcyx_desc = WeiDesc{}; + constexpr auto out_nkhw_desc = OutDesc{}; + + constexpr unsigned Hi = in_nchw_desc.GetLength(I2); + constexpr unsigned Wi = in_nchw_desc.GetLength(I3); + + constexpr unsigned N = out_nkhw_desc.GetLength(I0); + constexpr unsigned Ho = out_nkhw_desc.GetLength(I2); + constexpr unsigned Wo = out_nkhw_desc.GetLength(I3); + + constexpr unsigned K = wei_kcyx_desc.GetLength(I0); + constexpr unsigned C = wei_kcyx_desc.GetLength(I1); + constexpr unsigned Y = wei_kcyx_desc.GetLength(I2); + constexpr unsigned X = wei_kcyx_desc.GetLength(I3); + + // vectorized input + auto in_nchw_vec_desc = make_ConstantTensorDescriptor(Sequence{}); + ostream_ConstantTensorDescriptor(in_nchw_vec_desc, std::cout << "in_nchw_vec_desc: "); + + Tensor in_nchw_vec(make_TensorDescriptor(in_nchw_vec_desc)); + + auto f_vectorized_nchw = [&](auto n, auto c, auto h, auto w) { +#if 1 + in_nchw_vec(n, c, h, w) = in_nchw(n, c, h, w); +#else + in_nchw_vec(n, c, h, w) = + vector_type_t::pack(in_nchw(n, 2 * c, h, w), in_nchw(n, 2 * c + 1, h, w)); +#endif + }; + + make_ParallelTensorFunctor(f_vectorized_nchw, N, C, Hi, Wi)( + std::thread::hardware_concurrency()); + + // vectorize weight + auto wei_kcyx_vec_desc = make_ConstantTensorDescriptor(Sequence{}); + ostream_ConstantTensorDescriptor(wei_kcyx_vec_desc, std::cout << "wei_kcyx_vec_desc: "); + + Tensor wei_kcyx_vec(make_TensorDescriptor(wei_kcyx_vec_desc)); + + auto f_vectorized_kcyx = [&](auto k, auto c, auto y, auto x) { +#if 1 + wei_kcyx_vec(k, c, y, x) = wei_kcyx(k, c, y, x); +#else + wei_kcyx_vec(k, c, y, x) = + vector_type_t::pack(wei_kcyx(k, 2 * c, y, x), wei_kcyx(k, 2 * c + 1, y, x)); +#endif + }; + + make_ParallelTensorFunctor(f_vectorized_kcyx, K, C, Y, X)(std::thread::hardware_concurrency()); + + // + DeviceMem in_nchw_vec_device_buf(sizeof(vector_t) * in_nchw_vec.mDesc.GetElementSpace()); + DeviceMem wei_kcyx_vec_device_buf(sizeof(vector_t) * wei_kcyx_vec.mDesc.GetElementSpace()); + DeviceMem out_nkhw_device_buf(sizeof(T) * out_nkhw.mDesc.GetElementSpace()); + + in_nchw_vec_device_buf.ToDevice(in_nchw_vec.mData.data()); + wei_kcyx_vec_device_buf.ToDevice(wei_kcyx_vec.mData.data()); + out_nkhw_device_buf.ToDevice(out_nkhw.mData.data()); + +#if 1 + // 3x3, 34x34, 128 thread + constexpr unsigned NPerBlock = 2; + constexpr unsigned KPerBlock = 32; + constexpr unsigned CPerBlock = 4; + constexpr unsigned HoPerBlock = 2; + constexpr unsigned WoPerBlock = 32; + + constexpr unsigned NPerThread = 2; + constexpr unsigned KPerThread = 4; + constexpr unsigned CPerThread = 2; + constexpr unsigned HoPerThread = 2; + constexpr unsigned WoPerThread = 2; + + constexpr unsigned InBlockCopyDataPerRead = 2; + constexpr unsigned WeiBlockCopyDataPerRead = 4; + + constexpr unsigned BlockSize = 128; +#elif 1 + // 3x3, 34x34, 128 thread, fp16 + constexpr unsigned NPerBlock = 2; + constexpr unsigned KPerBlock = 32; + constexpr unsigned CPerBlock = 4; + constexpr unsigned HoPerBlock = 2; + constexpr unsigned WoPerBlock = 32; + + constexpr unsigned NPerThread = 2; + constexpr unsigned KPerThread = 4; + constexpr unsigned CPerThread = 2; + constexpr unsigned HoPerThread = 2; + constexpr unsigned WoPerThread = 2; + + constexpr unsigned InBlockCopyDataPerRead = 2; + constexpr unsigned WeiBlockCopyDataPerRead = 4; + + constexpr unsigned BlockSize = 128; +#endif + + constexpr unsigned GridSize = + (N / NPerBlock) * (K / KPerBlock) * (Ho / HoPerBlock) * (Wo / WoPerBlock); + + printf("%s: BlockSize %u, GridSize %u \n", __func__, BlockSize, GridSize); + + for(unsigned i = 0; i < nrepeat; ++i) + { + float time = launch_kernel( + gridwise_direct_convolution_2_vectorized_nchw_kcyx_nkhw, + dim3(GridSize), + dim3(BlockSize), + static_cast(in_nchw_vec_device_buf.GetDeviceBuffer()), + static_cast(wei_kcyx_vec_device_buf.GetDeviceBuffer()), + static_cast(out_nkhw_device_buf.GetDeviceBuffer())); + + printf("Elapsed time : %f ms\n", time); + usleep(std::min(time * 1000, float(10000))); + } + + out_nkhw_device_buf.FromDevice(out_nkhw.mData.data()); +} diff --git a/driver/driver.hip.cpp b/driver/driver.hip.cpp index a952b95380..47ff1b6882 100644 --- a/driver/driver.hip.cpp +++ b/driver/driver.hip.cpp @@ -9,6 +9,7 @@ #include "conv_common.hip.hpp" //#include "device_direct_convolution_1.hpp" #include "device_direct_convolution_2_nchw_kcyx_nkhw.hpp" +#include "device_direct_convolution_2_vectorized_nchw_kcyx_nkhw.hpp" //#include "device_implicit_gemm_convolution_1_chwn_cyxk_khwn.hpp" //#include "device_implicit_gemm_convolution_1_chwn_cyxk_khwn_padded.hpp" //#include "device_implicit_gemm_convolution_2_chwn_cyxk_khwn.hpp" @@ -34,25 +35,6 @@ struct GeneratorTensor_2 } }; -struct GeneratorTensor_3 -{ - template - double operator()(Is... is) - { -#if 0 - std::initializer_list ls = {static_cast(is)...}; - return std::accumulate(ls.begin(), ls.end(), std::size_t(0)); -#elif 1 - assert(sizeof...(Is) > 0); - std::initializer_list ids = {static_cast(is)...}; - std::vector lens(sizeof...(Is), 100); - std::vector strides(sizeof...(Is), 1); - std::partial_sum(lens.rbegin(), lens.rbegin() + (sizeof...(Is) - 1), strides.rbegin() + 1); - return std::inner_product(ids.begin(), ids.end(), strides.begin(), std::size_t(0)) + 1; -#endif - } -}; - struct GeneratorTensor_Checkboard { template @@ -129,7 +111,7 @@ void host_direct_convolution( if(hi >= 0 && hi < in_nchw.mDesc.GetLengths()[2] && wi >= 0 && wi < in_nchw.mDesc.GetLengths()[3]) { - v += in_nchw(n, c, hi, wi) * wei_kcyx(k, c, y, x); + v += double(in_nchw(n, c, hi, wi)) * double(wei_kcyx(k, c, y, x)); } } } @@ -177,11 +159,11 @@ void host_winograd_3x3_convolution( std::size_t HTile = (HO + HoPerTile - 1) / HoPerTile; std::size_t WTile = (WO + WoPerTile - 1) / WoPerTile; - Tensor in_hold({N, C, HTile, WTile, HiPerTile, WiPerTile}); - Tensor in_transform({N, C, HTile, WTile, HiPerTile, WiPerTile}); - Tensor wei_transform({K, C, HiPerTile, WiPerTile}); - Tensor out_transform({N, K, HTile, WTile, HiPerTile, HiPerTile}); - Tensor out_hold({N, K, HTile, WTile, HoPerTile, WoPerTile}); + Tensor in_hold({N, C, HTile, WTile, HiPerTile, WiPerTile}); + Tensor in_transform({N, C, HTile, WTile, HiPerTile, WiPerTile}); + Tensor wei_transform({K, C, HiPerTile, WiPerTile}); + Tensor out_transform({N, K, HTile, WTile, HiPerTile, HiPerTile}); + Tensor out_hold({N, K, HTile, WTile, HoPerTile, WoPerTile}); auto f_in_hold = [&](auto n, auto c, auto htile, auto wtile) { for(int j = 0; j < HiPerTile; ++j) @@ -259,49 +241,61 @@ void host_winograd_3x3_convolution( }; auto f_wei_transform = [&](auto k, auto c) { - wei_transform(k, c, 0, 0) = wei_kcyx(k, c, 0, 0); - wei_transform(k, c, 0, 1) = - 0.5 * wei_kcyx(k, c, 0, 0) + 0.5 * wei_kcyx(k, c, 0, 1) + 0.5 * wei_kcyx(k, c, 0, 2); - wei_transform(k, c, 0, 2) = - 0.5 * wei_kcyx(k, c, 0, 0) - 0.5 * wei_kcyx(k, c, 0, 1) + 0.5 * wei_kcyx(k, c, 0, 2); - wei_transform(k, c, 0, 3) = wei_kcyx(k, c, 0, 2); + wei_transform(k, c, 0, 0) = double(wei_kcyx(k, c, 0, 0)); + wei_transform(k, c, 0, 1) = 0.5 * double(wei_kcyx(k, c, 0, 0)) + + 0.5 * double(wei_kcyx(k, c, 0, 1)) + + 0.5 * double(wei_kcyx(k, c, 0, 2)); + wei_transform(k, c, 0, 2) = 0.5 * double(wei_kcyx(k, c, 0, 0)) - + 0.5 * double(wei_kcyx(k, c, 0, 1)) + + 0.5 * double(wei_kcyx(k, c, 0, 2)); + wei_transform(k, c, 0, 3) = double(wei_kcyx(k, c, 0, 2)); - wei_transform(k, c, 1, 0) = - 0.5 * wei_kcyx(k, c, 0, 0) + 0.5 * wei_kcyx(k, c, 1, 0) + 0.5 * wei_kcyx(k, c, 2, 0); - wei_transform(k, c, 1, 1) = 0.25 * wei_kcyx(k, c, 0, 0) + 0.25 * wei_kcyx(k, c, 0, 1) + - 0.25 * wei_kcyx(k, c, 0, 2) + 0.25 * wei_kcyx(k, c, 1, 0) + - 0.25 * wei_kcyx(k, c, 1, 1) + 0.25 * wei_kcyx(k, c, 1, 2) + - 0.25 * wei_kcyx(k, c, 2, 0) + 0.25 * wei_kcyx(k, c, 2, 1) + - 0.25 * wei_kcyx(k, c, 2, 2); - wei_transform(k, c, 1, 2) = 0.25 * wei_kcyx(k, c, 0, 0) - 0.25 * wei_kcyx(k, c, 0, 1) + - 0.25 * wei_kcyx(k, c, 0, 2) + 0.25 * wei_kcyx(k, c, 1, 0) - - 0.25 * wei_kcyx(k, c, 1, 1) + 0.25 * wei_kcyx(k, c, 1, 2) + - 0.25 * wei_kcyx(k, c, 2, 0) - 0.25 * wei_kcyx(k, c, 2, 1) + - 0.25 * wei_kcyx(k, c, 2, 2); - wei_transform(k, c, 1, 3) = - 0.5 * wei_kcyx(k, c, 0, 2) + 0.5 * wei_kcyx(k, c, 1, 2) + 0.5 * wei_kcyx(k, c, 2, 2); + wei_transform(k, c, 1, 0) = 0.5 * double(wei_kcyx(k, c, 0, 0)) + + 0.5 * double(wei_kcyx(k, c, 1, 0)) + + 0.5 * double(wei_kcyx(k, c, 2, 0)); + wei_transform(k, c, 1, 1) = + 0.25 * double(wei_kcyx(k, c, 0, 0)) + 0.25 * double(wei_kcyx(k, c, 0, 1)) + + 0.25 * double(wei_kcyx(k, c, 0, 2)) + 0.25 * double(wei_kcyx(k, c, 1, 0)) + + 0.25 * double(wei_kcyx(k, c, 1, 1)) + 0.25 * double(wei_kcyx(k, c, 1, 2)) + + 0.25 * double(wei_kcyx(k, c, 2, 0)) + 0.25 * double(wei_kcyx(k, c, 2, 1)) + + 0.25 * double(wei_kcyx(k, c, 2, 2)); + wei_transform(k, c, 1, 2) = + 0.25 * double(wei_kcyx(k, c, 0, 0)) - 0.25 * double(wei_kcyx(k, c, 0, 1)) + + 0.25 * double(wei_kcyx(k, c, 0, 2)) + 0.25 * double(wei_kcyx(k, c, 1, 0)) - + 0.25 * double(wei_kcyx(k, c, 1, 1)) + 0.25 * double(wei_kcyx(k, c, 1, 2)) + + 0.25 * double(wei_kcyx(k, c, 2, 0)) - 0.25 * double(wei_kcyx(k, c, 2, 1)) + + 0.25 * double(wei_kcyx(k, c, 2, 2)); + wei_transform(k, c, 1, 3) = 0.5 * double(wei_kcyx(k, c, 0, 2)) + + 0.5 * double(wei_kcyx(k, c, 1, 2)) + + 0.5 * double(wei_kcyx(k, c, 2, 2)); - wei_transform(k, c, 2, 0) = - 0.5 * wei_kcyx(k, c, 0, 0) - 0.5 * wei_kcyx(k, c, 1, 0) + 0.5 * wei_kcyx(k, c, 2, 0); - wei_transform(k, c, 2, 1) = 0.25 * wei_kcyx(k, c, 0, 0) + 0.25 * wei_kcyx(k, c, 0, 1) + - 0.25 * wei_kcyx(k, c, 0, 2) - 0.25 * wei_kcyx(k, c, 1, 0) - - 0.25 * wei_kcyx(k, c, 1, 1) - 0.25 * wei_kcyx(k, c, 1, 2) + - 0.25 * wei_kcyx(k, c, 2, 0) + 0.25 * wei_kcyx(k, c, 2, 1) + - 0.25 * wei_kcyx(k, c, 2, 2); - wei_transform(k, c, 2, 2) = 0.25 * wei_kcyx(k, c, 0, 0) - 0.25 * wei_kcyx(k, c, 0, 1) + - 0.25 * wei_kcyx(k, c, 0, 2) - 0.25 * wei_kcyx(k, c, 1, 0) + - 0.25 * wei_kcyx(k, c, 1, 1) - 0.25 * wei_kcyx(k, c, 1, 2) + - 0.25 * wei_kcyx(k, c, 2, 0) - 0.25 * wei_kcyx(k, c, 2, 1) + - 0.25 * wei_kcyx(k, c, 2, 2); - wei_transform(k, c, 2, 3) = - 0.5 * wei_kcyx(k, c, 0, 2) - 0.5 * wei_kcyx(k, c, 1, 2) + 0.5 * wei_kcyx(k, c, 2, 2); + wei_transform(k, c, 2, 0) = 0.5 * double(wei_kcyx(k, c, 0, 0)) - + 0.5 * double(wei_kcyx(k, c, 1, 0)) + + 0.5 * double(wei_kcyx(k, c, 2, 0)); + wei_transform(k, c, 2, 1) = + 0.25 * double(wei_kcyx(k, c, 0, 0)) + 0.25 * double(wei_kcyx(k, c, 0, 1)) + + 0.25 * double(wei_kcyx(k, c, 0, 2)) - 0.25 * double(wei_kcyx(k, c, 1, 0)) - + 0.25 * double(wei_kcyx(k, c, 1, 1)) - 0.25 * double(wei_kcyx(k, c, 1, 2)) + + 0.25 * double(wei_kcyx(k, c, 2, 0)) + 0.25 * double(wei_kcyx(k, c, 2, 1)) + + 0.25 * double(wei_kcyx(k, c, 2, 2)); + wei_transform(k, c, 2, 2) = + 0.25 * double(wei_kcyx(k, c, 0, 0)) - 0.25 * double(wei_kcyx(k, c, 0, 1)) + + 0.25 * double(wei_kcyx(k, c, 0, 2)) - 0.25 * double(wei_kcyx(k, c, 1, 0)) + + 0.25 * double(wei_kcyx(k, c, 1, 1)) - 0.25 * double(wei_kcyx(k, c, 1, 2)) + + 0.25 * double(wei_kcyx(k, c, 2, 0)) - 0.25 * double(wei_kcyx(k, c, 2, 1)) + + 0.25 * double(wei_kcyx(k, c, 2, 2)); + wei_transform(k, c, 2, 3) = 0.5 * double(wei_kcyx(k, c, 0, 2)) - + 0.5 * double(wei_kcyx(k, c, 1, 2)) + + 0.5 * double(wei_kcyx(k, c, 2, 2)); - wei_transform(k, c, 3, 0) = wei_kcyx(k, c, 2, 0); - wei_transform(k, c, 3, 1) = - 0.5 * wei_kcyx(k, c, 2, 0) + 0.5 * wei_kcyx(k, c, 2, 1) + 0.5 * wei_kcyx(k, c, 2, 2); - wei_transform(k, c, 3, 2) = - 0.5 * wei_kcyx(k, c, 2, 0) - 0.5 * wei_kcyx(k, c, 2, 1) + 0.5 * wei_kcyx(k, c, 2, 2); - wei_transform(k, c, 3, 3) = wei_kcyx(k, c, 2, 2); + wei_transform(k, c, 3, 0) = double(wei_kcyx(k, c, 2, 0)); + wei_transform(k, c, 3, 1) = 0.5 * double(wei_kcyx(k, c, 2, 0)) + + 0.5 * double(wei_kcyx(k, c, 2, 1)) + + 0.5 * double(wei_kcyx(k, c, 2, 2)); + wei_transform(k, c, 3, 2) = 0.5 * double(wei_kcyx(k, c, 2, 0)) - + 0.5 * double(wei_kcyx(k, c, 2, 1)) + + 0.5 * double(wei_kcyx(k, c, 2, 2)); + wei_transform(k, c, 3, 3) = double(wei_kcyx(k, c, 2, 2)); }; auto f_out_transform = [&](auto n, auto k, auto htile, auto wtile) { @@ -372,20 +366,25 @@ void host_winograd_3x3_convolution( template void check_error(const Tensor& ref, const Tensor& result) { + // printf("\n"); + float error = 0; float max_diff = -1; float ref_value = 0, result_value = 0; for(int i = 0; i < ref.mData.size(); ++i) { - error += std::abs(ref.mData[i] - result.mData[i]); - float diff = std::abs(ref.mData[i] - result.mData[i]); + error += std::abs(double(ref.mData[i]) - double(result.mData[i])); + float diff = std::abs(double(ref.mData[i]) - double(result.mData[i])); if(max_diff < diff) { max_diff = diff; ref_value = ref.mData[i]; result_value = result.mData[i]; } + + // printf("{%f, %f}", double(ref.mData[i]), double(result.mData[i])); } + // printf("\n"); std::cout << "error: " << error << std::endl; std::cout << "max_diff: " << max_diff << ", " << ref_value << ", " << result_value << std::endl; @@ -406,13 +405,13 @@ int main(int argc, char* argv[]) constexpr unsigned WPad = 0; #elif 1 // 3x3, 34x34 - constexpr unsigned N = 64; - constexpr unsigned C = 256; + constexpr unsigned N = 64; + constexpr unsigned C = 256; constexpr unsigned HI = 34; constexpr unsigned WI = 34; - constexpr unsigned K = 64; - constexpr unsigned Y = 3; - constexpr unsigned X = 3; + constexpr unsigned K = 64; + constexpr unsigned Y = 3; + constexpr unsigned X = 3; constexpr unsigned HPad = 0; constexpr unsigned WPad = 0; @@ -603,16 +602,22 @@ int main(int argc, char* argv[]) in_nchw.GenerateTensorValue(GeneratorTensor_2{-5, 5}, num_thread); wei_kcyx.GenerateTensorValue(GeneratorTensor_2{-5, 5}, num_thread); #elif 1 - in_nchw.GenerateTensorValue(GeneratorTensor_2{-2, 2}, num_thread); - wei_kcyx.GenerateTensorValue(GeneratorTensor_1{}, num_thread); + in_nchw.GenerateTensorValue(GeneratorTensor_2{1, 5}, num_thread); + + auto gen_wei = [](auto... is) { + return GeneratorTensor_2{1, 5}(is...) * GeneratorTensor_Checkboard{}(is...); + }; + wei_kcyx.GenerateTensorValue(gen_wei, num_thread); #endif } #if 1 #if 0 device_direct_convolution_1 -#elif 1 +#elif 0 device_direct_convolution_2_nchw_kcyx_nkhw +#elif 1 + device_direct_convolution_2_vectorized_nchw_kcyx_nkhw #elif 0 device_implicit_gemm_convolution_1_chwn_cyxk_khwn #elif 0 @@ -634,7 +639,6 @@ int main(int argc, char* argv[]) if(do_verification) { -#if 1 if(Y == 3 && X == 3) { host_winograd_3x3_convolution(in_nchw, wei_kcyx, out_nkhw_host, lower_pads, upper_pads); @@ -644,7 +648,6 @@ int main(int argc, char* argv[]) host_direct_convolution(in_nchw, wei_kcyx, out_nkhw_host, lower_pads, upper_pads); } check_error(out_nkhw_host, out_nkhw_device); -#endif #if 0 LogRange(std::cout << "in_nchw : ", in_nchw.mData, ",") << std::endl; diff --git a/src/include/blockwise_2d_tensor_op.hip.hpp b/src/include/blockwise_2d_tensor_op.hip.hpp index 761c32a370..969f18c4e8 100644 --- a/src/include/blockwise_2d_tensor_op.hip.hpp +++ b/src/include/blockwise_2d_tensor_op.hip.hpp @@ -373,7 +373,7 @@ template struct Blockwise2dTensorCopy3 { - using vector_t = typename vector_type::type; + using vector_t = typename vector_type::VectorType; unsigned mSrcMyThreadOffset; unsigned mDstMyThreadOffset; diff --git a/src/include/blockwise_4d_tensor_op.hip.hpp b/src/include/blockwise_4d_tensor_op.hip.hpp index 693b6fe9d5..e8829c0cbf 100644 --- a/src/include/blockwise_4d_tensor_op.hip.hpp +++ b/src/include/blockwise_4d_tensor_op.hip.hpp @@ -207,9 +207,9 @@ template struct Blockwise4dTensorCopy1 { - using vector_t = typename vector_type::type; + using vector_t = typename vector_type::VectorType; - __device__ void SanityCheck() const + __device__ constexpr Blockwise4dTensorCopy1() { constexpr auto I0 = Number<0>{}; constexpr auto I1 = Number<1>{}; @@ -239,8 +239,6 @@ struct Blockwise4dTensorCopy1 __device__ void Run(const Float* __restrict__ p_src, Float* __restrict__ p_dst) const { - SanityCheck(); - constexpr auto I0 = Number<0>{}; constexpr auto I1 = Number<1>{}; constexpr auto I2 = Number<2>{}; @@ -446,7 +444,7 @@ template struct Blockwise4dTensorCopy3 { - using vector_t = typename vector_type::type; + using vector_t = typename vector_type::VectorType; unsigned mSrcMyThreadOffset; unsigned mDstMyThreadOffset; diff --git a/src/include/common.hip.hpp b/src/include/common.hip.hpp index f447fce784..aa7e2269f6 100644 --- a/src/include/common.hip.hpp +++ b/src/include/common.hip.hpp @@ -28,44 +28,44 @@ struct vector_type template <> struct vector_type { - using type = float; + using VectorType = float; }; template <> struct vector_type { - using type = float2; + using VectorType = float2; }; template <> struct vector_type { - using type = float4; + using VectorType = float4; }; #if 0 template <> struct vector_type { - using type = half_float::half; + using VectorType = half_float::half; }; template <> struct vector_type { - using type = float; + using VectorType = float; }; template <> struct vector_type { - using type = float2; + using VectorType = float2; }; template <> struct vector_type { - using type = float4; + using VectorType = float4; }; #endif @@ -73,25 +73,41 @@ struct vector_type template <> struct vector_type { - using type = half; + using VectorType = half; + + __host__ __device__ static VectorType pack(half s) { return s; } }; template <> struct vector_type { - using type = half2; + using VectorType = half2; + + union Data + { + VectorType vector; + half scalar[2]; + }; + + __host__ __device__ static VectorType pack(half s0, half s1) + { + Data data; + data.scalar[0] = s0; + data.scalar[1] = s1; + return data.vector; + } }; template <> struct vector_type { - using type = float2; + using VectorType = float2; }; template <> struct vector_type { - using type = float4; + using VectorType = float4; }; #endif diff --git a/src/include/gridwise_direct_convolution_2_nchw_kcyx_nkhw.hip.hpp b/src/include/gridwise_direct_convolution_2_nchw_kcyx_nkhw.hip.hpp index da4542d9cb..5761a22c16 100644 --- a/src/include/gridwise_direct_convolution_2_nchw_kcyx_nkhw.hip.hpp +++ b/src/include/gridwise_direct_convolution_2_nchw_kcyx_nkhw.hip.hpp @@ -25,10 +25,10 @@ template -__global__ void gridwise_direct_convolution_2_vectorized_nchw_kcyx_nkhw( - const Float* const __restrict__ p_in_global, - const Float* const __restrict__ p_wei_global, - Float* const __restrict__ p_out_global) +__global__ void +gridwise_direct_convolution_2_nchw_kcyx_nkhw(const Float* const __restrict__ p_in_global, + const Float* const __restrict__ p_wei_global, + Float* const __restrict__ p_out_global) { constexpr auto I0 = Number<0>{}; constexpr auto I1 = Number<1>{}; diff --git a/src/include/gridwise_direct_convolution_2_vectorized_nchw_kcyx_nkhw.hip.hpp b/src/include/gridwise_direct_convolution_2_vectorized_nchw_kcyx_nkhw.hip.hpp index da4542d9cb..2b3cb03b78 100644 --- a/src/include/gridwise_direct_convolution_2_vectorized_nchw_kcyx_nkhw.hip.hpp +++ b/src/include/gridwise_direct_convolution_2_vectorized_nchw_kcyx_nkhw.hip.hpp @@ -11,6 +11,7 @@ template __global__ void gridwise_direct_convolution_2_vectorized_nchw_kcyx_nkhw( - const Float* const __restrict__ p_in_global, - const Float* const __restrict__ p_wei_global, + const typename vector_type::VectorType* const __restrict__ p_in_global, + const typename vector_type::VectorType* const __restrict__ p_wei_global, Float* const __restrict__ p_out_global) { + using scalar_t = Float; + using vector_t = typename vector_type::VectorType; + constexpr auto I0 = Number<0>{}; constexpr auto I1 = Number<1>{}; constexpr auto I2 = Number<2>{}; constexpr auto I3 = Number<3>{}; - constexpr auto in_nchw_global_desc = InGlobalDesc{}; - constexpr auto wei_kcyx_global_desc = WeiGlobalDesc{}; - constexpr auto out_nkhw_global_desc = OutGlobalDesc{}; + constexpr auto in_nchw_vec_global_desc = InGlobalDesc{}; + constexpr auto wei_kcyx_vec_global_desc = WeiGlobalDesc{}; + constexpr auto out_nkhw_global_desc = OutGlobalDesc{}; - constexpr unsigned N = in_nchw_global_desc.GetLength(I0); - constexpr unsigned K = wei_kcyx_global_desc.GetLength(I0); - constexpr unsigned C = wei_kcyx_global_desc.GetLength(I1); - constexpr unsigned Y = wei_kcyx_global_desc.GetLength(I2); - constexpr unsigned X = wei_kcyx_global_desc.GetLength(I3); + constexpr unsigned N = in_nchw_vec_global_desc.GetLength(I0); + constexpr unsigned K = wei_kcyx_vec_global_desc.GetLength(I0); + constexpr unsigned C = wei_kcyx_vec_global_desc.GetLength(I1); + constexpr unsigned Y = wei_kcyx_vec_global_desc.GetLength(I2); + constexpr unsigned X = wei_kcyx_vec_global_desc.GetLength(I3); - constexpr auto wei_ke_global_desc = make_ConstantTensorDescriptor( + constexpr auto wei_ke_vec_global_desc = make_ConstantTensorDescriptor( Sequence{}); // 2d view of wei for blockwise copy constexpr unsigned HiPerBlock = HoPerBlock + Y - 1; constexpr unsigned WiPerBlock = WoPerBlock + X - 1; - constexpr auto in_nchw_block_desc = make_ConstantTensorDescriptor_aligned( + constexpr auto in_nchw_vec_block_desc = make_ConstantTensorDescriptor_aligned( Sequence{}, Number{}); - constexpr auto wei_ke_block_desc = make_ConstantTensorDescriptor_aligned( + constexpr auto wei_ke_vec_block_desc = make_ConstantTensorDescriptor_aligned( Sequence{}, Number{}); // 2d view of wei for blockwise copy - constexpr auto wei_kcyx_block_desc = + constexpr auto wei_kcyx_vec_block_desc = make_ConstantTensorDescriptor(Sequence{}, - Sequence{}); + Sequence{}); // shared mem constexpr unsigned in_block_size = - in_nchw_block_desc.GetElementSpace(Number{}); + in_nchw_vec_block_desc.GetElementSpace(Number{}); constexpr unsigned wei_block_size = - wei_kcyx_block_desc.GetElementSpace(Number{}); + wei_kcyx_vec_block_desc.GetElementSpace(Number{}); constexpr unsigned max_align = InBlockCopyDataPerRead > WeiBlockCopyDataPerRead ? InBlockCopyDataPerRead @@ -81,10 +85,10 @@ __global__ void gridwise_direct_convolution_2_vectorized_nchw_kcyx_nkhw( constexpr auto in_nchw_thread_block_desc = make_ConstantTensorDescriptor(Sequence{}, - in_nchw_block_desc.GetStrides()); + in_nchw_vec_block_desc.GetStrides()); constexpr auto wei_kcyx_thread_block_desc = make_ConstantTensorDescriptor( - Sequence{}, wei_kcyx_block_desc.GetStrides()); + Sequence{}, wei_kcyx_vec_block_desc.GetStrides()); constexpr auto out_nkhw_thread_desc = get_convolution_output_default_4d_tensor_descriptor( in_nchw_thread_block_desc, wei_kcyx_thread_block_desc); @@ -147,26 +151,27 @@ __global__ void gridwise_direct_convolution_2_vectorized_nchw_kcyx_nkhw( constexpr auto blockwise_in_copy = Blockwise4dTensorCopy1{}; #if 0 constexpr auto blockwise_wei_copy = Blockwise4dTensorCopy1{}; #elif 1 - const auto blockwise_wei_copy = Blockwise2dTensorCopy3{}; + const auto blockwise_wei_copy = + Blockwise2dTensorCopy3{}; #endif // set threadwise output tensor to 0 @@ -176,14 +181,14 @@ __global__ void gridwise_direct_convolution_2_vectorized_nchw_kcyx_nkhw( c_block_data_begin += CPerBlock, __syncthreads()) { // copy input tensor to LDS - blockwise_in_copy.Run(p_in_global + in_nchw_global_desc.Get1dIndex(n_block_data_begin, - c_block_data_begin, - hi_block_data_begin, - wi_block_data_begin), + blockwise_in_copy.Run(p_in_global + in_nchw_vec_global_desc.Get1dIndex(n_block_data_begin, + c_block_data_begin, + hi_block_data_begin, + wi_block_data_begin), p_in_block); // copy weight tensor to LDS - blockwise_wei_copy.Run(p_wei_global + wei_kcyx_global_desc.Get1dIndex( + blockwise_wei_copy.Run(p_wei_global + wei_kcyx_vec_global_desc.Get1dIndex( k_block_data_begin, c_block_data_begin, 0, 0), p_wei_block); @@ -195,25 +200,25 @@ __global__ void gridwise_direct_convolution_2_vectorized_nchw_kcyx_nkhw( #if 1 threadwise_direct_convolution_2( in_nchw_thread_block_desc, - p_in_block + in_nchw_block_desc.Get1dIndex(n_thread_data_begin, - c_thread_data, - hi_thread_data_begin, - wi_thread_data_begin), + p_in_block + in_nchw_vec_block_desc.Get1dIndex(n_thread_data_begin, + c_thread_data, + hi_thread_data_begin, + wi_thread_data_begin), wei_kcyx_thread_block_desc, p_wei_block + - wei_kcyx_block_desc.Get1dIndex(k_thread_data_begin, c_thread_data, 0, 0), + wei_kcyx_vec_block_desc.Get1dIndex(k_thread_data_begin, c_thread_data, 0, 0), out_nkhw_thread_desc, p_out_thread); #elif 0 threadwise_direct_convolution_3( in_nchw_thread_block_desc, - p_in_block + in_nchw_block_desc.Get1dIndex(n_thread_data_begin, - c_thread_data, - hi_thread_data_begin, - wi_thread_data_begin), + p_in_block + in_nchw_vec_block_desc.Get1dIndex(n_thread_data_begin, + c_thread_data, + hi_thread_data_begin, + wi_thread_data_begin), wei_kcyx_thread_block_desc, p_wei_block + - wei_kcyx_block_desc.Get1dIndex(k_thread_data_begin, c_thread_data, 0, 0), + wei_kcyx_vec_block_desc.Get1dIndex(k_thread_data_begin, c_thread_data, 0, 0), out_nkhw_thread_desc, p_out_thread); #endif