From 220f5aea05ff4eb8bed3ef1a6cce864a326d4c68 Mon Sep 17 00:00:00 2001 From: Chao Liu Date: Tue, 8 Jan 2019 14:05:03 -0600 Subject: [PATCH] refactor [ROCm/composable_kernel commit: 0b8e67ef08b28447509fd3e0f26d8e276b06cbf0] --- src/include/blockwise_direct_convolution.cuh | 10 ++--- src/include/blockwise_tensor_op.cuh | 32 +++++++------- src/include/gridwise_direct_convolution_1.cuh | 16 +++---- src/include/gridwise_direct_convolution_2.cuh | 14 +++---- src/include/gridwise_winograd_convolution.cuh | 24 +++++------ src/include/threadwise_direct_convolution.cuh | 32 +++++++------- src/include/threadwise_tensor_op.cuh | 42 +++++++++---------- 7 files changed, 81 insertions(+), 89 deletions(-) diff --git a/src/include/blockwise_direct_convolution.cuh b/src/include/blockwise_direct_convolution.cuh index 3078e77b9d..0d6c749648 100644 --- a/src/include/blockwise_direct_convolution.cuh +++ b/src/include/blockwise_direct_convolution.cuh @@ -4,7 +4,7 @@ #include "threadwise_direct_convolution.cuh" template __device__ void blockwise_direct_convolution(InBlockDesc, - TFloat* const __restrict__ p_in_block, + Float* const __restrict__ p_in_block, WeiBlockDesc, - TFloat* const __restrict__ p_wei_block, + Float* const __restrict__ p_wei_block, OutBlockDesc, - TFloat* __restrict__ p_out_block) + Float* __restrict__ p_out_block) { constexpr auto I0 = Number<0>{}; constexpr auto I1 = Number<1>{}; @@ -92,7 +92,7 @@ __device__ void blockwise_direct_convolution(InBlockDesc, unsigned hi_thread_data_begin = ho_thread_data_begin; // minus padding unsigned wi_thread_data_begin = wo_thread_data_begin; // minus padding - TFloat p_out_thread[out_thread_desc.GetElementSpace()]; + Float p_out_thread[out_thread_desc.GetElementSpace()]; threadwise_4d_tensor_copy(out_block_desc, p_out_block + out_block_desc.Get1dIndex(n_thread_data_begin, diff --git a/src/include/blockwise_tensor_op.cuh b/src/include/blockwise_tensor_op.cuh index 9de53046bd..6cfff6023d 100644 --- a/src/include/blockwise_tensor_op.cuh +++ b/src/include/blockwise_tensor_op.cuh @@ -1,9 +1,9 @@ #pragma once #include "constant_tensor_descriptor.cuh" -template +template __device__ void -blockwise_4d_tensor_pointwise_operation_unary(DstDesc, TFloat* __restrict__ p_dst, F f) +blockwise_4d_tensor_pointwise_operation_unary(DstDesc, Float* __restrict__ p_dst, F f) { constexpr auto I0 = Number<0>{}; constexpr auto I1 = Number<1>{}; @@ -79,7 +79,7 @@ blockwise_4d_tensor_pointwise_operation_unary(DstDesc, TFloat* __restrict__ p_ds // TODO: in order to optimize mem access for different mem type, // need to write specialized version template __device__ void blockwise_4d_tensor_pointwise_operation_binary_reorder(SrcDesc, - TFloat* const __restrict__ p_src, + Float* const __restrict__ p_src, DstDesc, - TFloat* __restrict__ p_dst, + Float* __restrict__ p_dst, RefDesc, Reorder, F f) @@ -170,36 +170,32 @@ blockwise_4d_tensor_pointwise_operation_binary_reorder(SrcDesc, } } -template -__device__ void blockwise_4d_tensor_set_zero(DstDesc, TFloat* __restrict__ p_dst) +template +__device__ void blockwise_4d_tensor_set_zero(DstDesc, Float* __restrict__ p_dst) { - auto f_set_zero = [](TFloat& v) { v = TFloat(0); }; + auto f_set_zero = [](Float& v) { v = Float(0); }; blockwise_4d_tensor_pointwise_operation_unary(DstDesc{}, p_dst, f_set_zero); } template -__device__ void blockwise_4d_tensor_copy_reorder(SrcDesc, - TFloat* const __restrict__ p_src, - DstDesc, - TFloat* __restrict__ p_dst, - RefDesc, - Reorder) +__device__ void blockwise_4d_tensor_copy_reorder( + SrcDesc, Float* const __restrict__ p_src, DstDesc, Float* __restrict__ p_dst, RefDesc, Reorder) { - auto f_copy = [](const TFloat& src, TFloat& dst) { dst = src; }; + auto f_copy = [](const Float& src, Float& dst) { dst = src; }; blockwise_4d_tensor_pointwise_operation_binary_reorder( SrcDesc{}, p_src, DstDesc{}, p_dst, RefDesc{}, Reorder{}, f_copy); } -template +template __device__ void blockwise_4d_tensor_copy( - SrcDesc, TFloat* const __restrict__ p_src, DstDesc, TFloat* __restrict__ p_dst, RefDesc) + SrcDesc, Float* const __restrict__ p_src, DstDesc, Float* __restrict__ p_dst, RefDesc) { constexpr auto reorder = Sequence<0, 1, 2, 3>{}; diff --git a/src/include/gridwise_direct_convolution_1.cuh b/src/include/gridwise_direct_convolution_1.cuh index 1ec2cd83a8..4dfc6dfebc 100644 --- a/src/include/gridwise_direct_convolution_1.cuh +++ b/src/include/gridwise_direct_convolution_1.cuh @@ -3,7 +3,7 @@ #include "blockwise_tensor_op.cuh" #include "blockwise_direct_convolution.cuh" -template __global__ void gridwise_direct_convolution_1(InGlobalDesc, - TFloat* const __restrict__ p_in_global, + Float* const __restrict__ p_in_global, WeiGlobalDesc, - TFloat* const __restrict__ p_wei_global, + Float* const __restrict__ p_wei_global, OutGlobalDesc, - TFloat* __restrict__ p_out_global) + Float* __restrict__ p_out_global) { constexpr auto I0 = Number<0>{}; constexpr auto I1 = Number<1>{}; @@ -68,9 +68,9 @@ __global__ void gridwise_direct_convolution_1(InGlobalDesc, constexpr unsigned wei_block_size = wei_block_desc.GetElementSpace(); constexpr unsigned out_block_size = out_block_desc.GetElementSpace(); - __shared__ TFloat p_in_block[in_block_size]; - __shared__ TFloat p_wei_block[wei_block_size]; - __shared__ TFloat p_out_block[out_block_size]; + __shared__ Float p_in_block[in_block_size]; + __shared__ Float p_wei_block[wei_block_size]; + __shared__ Float p_out_block[out_block_size]; const unsigned block_id = blockIdx.x; @@ -150,7 +150,7 @@ __global__ void gridwise_direct_convolution_1(InGlobalDesc, // blockwise convolution blockwise_direct_convolution __global__ void gridwise_direct_convolution_2(InGlobalDesc, - TFloat* const __restrict__ p_in_global, + Float* const __restrict__ p_in_global, WeiGlobalDesc, - TFloat* const __restrict__ p_wei_global, + Float* const __restrict__ p_wei_global, OutGlobalDesc, - TFloat* __restrict__ p_out_global) + Float* __restrict__ p_out_global) { constexpr auto I0 = Number<0>{}; constexpr auto I1 = Number<1>{}; @@ -56,8 +56,8 @@ __global__ void gridwise_direct_convolution_2(InGlobalDesc, constexpr unsigned in_block_size = in_block_desc.GetElementSpace(); constexpr unsigned wei_block_size = wei_block_desc.GetElementSpace(); - __shared__ TFloat p_in_block[in_block_size]; - __shared__ TFloat p_wei_block[wei_block_size]; + __shared__ Float p_in_block[in_block_size]; + __shared__ Float p_wei_block[wei_block_size]; // threadwise tensors constexpr unsigned InTileSizeH = OutTileSizeH + S - 1; @@ -73,7 +73,7 @@ __global__ void gridwise_direct_convolution_2(InGlobalDesc, get_convolution_output_4d_tensor_descriptor(in_thread_block_desc, wei_thread_block_desc); // register - TFloat p_out_thread[out_thread_desc.GetElementSpace()]; + Float p_out_thread[out_thread_desc.GetElementSpace()]; // divide block work constexpr unsigned NBlockWork = (out_global_desc.GetLength(I0) + NPerBlock - 1) / NPerBlock; diff --git a/src/include/gridwise_winograd_convolution.cuh b/src/include/gridwise_winograd_convolution.cuh index 63a8595285..2797844bdf 100644 --- a/src/include/gridwise_winograd_convolution.cuh +++ b/src/include/gridwise_winograd_convolution.cuh @@ -3,7 +3,7 @@ #include "blockwise_winograd_transform.cuh" #include "threadwise_winograd_transform.cuh" -template __global__ void gridwise_winograd_convolution(InGlobalDesc, - TFloat* const __restrict__ p_in_global, + Float* const __restrict__ p_in_global, WeiGlobalDesc, - TFloat* const __restrict__ p_wei_global, + Float* const __restrict__ p_wei_global, OutGlobalDesc, - TFloat* __restrict__ p_out_global) + Float* __restrict__ p_out_global) { constexpr auto I0 = Number<0>{}; constexpr auto I1 = Number<1>{}; @@ -102,8 +102,8 @@ __global__ void gridwise_winograd_convolution(InGlobalDesc, constexpr auto wei_transform_block_desc = make_ConstantTensorDescriptor(Sequence{}); - __shared__ TFloat p_in_transform_block[in_transform_block_desc.GetElementSpace()]; - __shared__ TFloat p_wei_transform_block[wei_transform_block_desc.GetElementSpace()]; + __shared__ Float p_in_transform_block[in_transform_block_desc.GetElementSpace()]; + __shared__ Float p_wei_transform_block[wei_transform_block_desc.GetElementSpace()]; // thread data constexpr auto in_transform_thread_block_desc = @@ -123,8 +123,8 @@ __global__ void gridwise_winograd_convolution(InGlobalDesc, constexpr auto out_thread_global_desc = make_ConstantTensorDescriptor(out_thread_desc.GetLengths(), out_global_desc.GetStrides()); - TFloat p_out_transform_thread[out_transform_thread_desc.GetElementSpace()]; - TFloat p_out_thread[out_thread_desc.GetElementSpace()]; + Float p_out_transform_thread[out_transform_thread_desc.GetElementSpace()]; + Float p_out_thread[out_thread_desc.GetElementSpace()]; #if 0 if(blockIdx.x == 0 && threadIdx.x == 0) @@ -146,7 +146,7 @@ __global__ void gridwise_winograd_convolution(InGlobalDesc, { #if 0 // blockwise transform input - blockwise_winograd_transform_input +template __device__ void threadwise_direct_convolution_1(InDesc, - TFloat* const __restrict__ p_in, + Float* const __restrict__ p_in, WeiDesc, - TFloat* const __restrict__ p_wei, + Float* const __restrict__ p_wei, OutDesc, - TFloat* __restrict__ p_out) + Float* __restrict__ p_out) { constexpr auto I0 = Number<0>{}; constexpr auto I1 = Number<1>{}; @@ -81,13 +81,13 @@ __device__ void threadwise_direct_convolution_1(InDesc, // Optimized for scenario if p_in and p_wei are in LDS, p_out are in register // Copy in and wei into register before doing convolution -template +template __device__ void threadwise_direct_convolution_2(InDesc, - TFloat* const __restrict__ p_in, + Float* const __restrict__ p_in, WeiDesc, - TFloat* const __restrict__ p_wei, + Float* const __restrict__ p_wei, OutDesc, - TFloat* __restrict__ p_out) + Float* __restrict__ p_out) { constexpr auto in_desc = InDesc{}; constexpr auto wei_desc = WeiDesc{}; @@ -97,8 +97,8 @@ __device__ void threadwise_direct_convolution_2(InDesc, constexpr auto wei_reg_desc = make_ConstantTensorDescriptor(wei_desc.GetLengths()); // register - TFloat p_in_reg[in_reg_desc.GetElementSpace()]; - TFloat p_wei_reg[wei_reg_desc.GetElementSpace()]; + Float p_in_reg[in_reg_desc.GetElementSpace()]; + Float p_wei_reg[wei_reg_desc.GetElementSpace()]; // copy input tensor into register threadwise_4d_tensor_copy(in_desc, p_in, in_reg_desc, p_in_reg, in_reg_desc); @@ -114,13 +114,13 @@ __device__ void threadwise_direct_convolution_2(InDesc, // optimized for scenario where p_in and p_wei are in LDS, p_out is in register // break down a non-1x1 convolution into a sequence of 1x1 convolutions, // load 1x1 weight into register, and do 1x1 convolution in register. -template +template __device__ void threadwise_direct_convolution_3(InDesc, - TFloat* const __restrict__ p_in, + Float* const __restrict__ p_in, WeiDesc, - TFloat* const __restrict__ p_wei, + Float* const __restrict__ p_wei, OutDesc, - TFloat* __restrict__ p_out) + Float* __restrict__ p_out) { constexpr auto I0 = Number<0>{}; constexpr auto I1 = Number<1>{}; @@ -139,8 +139,8 @@ __device__ void threadwise_direct_convolution_3(InDesc, constexpr auto wei_reg_desc = make_ConstantTensorDescriptor( Sequence{}); - TFloat p_in_reg[in_reg_desc.GetElementSpace()]; - TFloat p_wei_reg[wei_reg_desc.GetElementSpace()]; + Float p_in_reg[in_reg_desc.GetElementSpace()]; + Float p_wei_reg[wei_reg_desc.GetElementSpace()]; constexpr unsigned in_w_new_read = 1; diff --git a/src/include/threadwise_tensor_op.cuh b/src/include/threadwise_tensor_op.cuh index 8d6bc917aa..7aeed4a764 100644 --- a/src/include/threadwise_tensor_op.cuh +++ b/src/include/threadwise_tensor_op.cuh @@ -1,8 +1,8 @@ #pragma once #include "constant_tensor_descriptor.cuh" -template -__device__ void threadwise_4d_tensor_pointwise_operation_unary(Desc, TFloat* __restrict__ p, F f) +template +__device__ void threadwise_4d_tensor_pointwise_operation_unary(Desc, Float* __restrict__ p, F f) { constexpr auto I0 = Number<0>{}; constexpr auto I1 = Number<1>{}; @@ -37,12 +37,12 @@ __device__ void threadwise_4d_tensor_pointwise_operation_unary(Desc, TFloat* __r // TODO: in order to optimize mem access for different mem type, // need to write specialized version -template +template __device__ void threadwise_4d_tensor_pointwise_operation_binary_reorder(SrcDesc, - TFloat* const __restrict__ p_src, + Float* const __restrict__ p_src, DstDesc, - TFloat* __restrict__ p_dst, + Float* __restrict__ p_dst, RefDesc, Reorder, F f) @@ -83,26 +83,22 @@ threadwise_4d_tensor_pointwise_operation_binary_reorder(SrcDesc, } } -template -__device__ void threadwise_4d_tensor_set_zero(Desc, TFloat* __restrict__ p) +template +__device__ void threadwise_4d_tensor_set_zero(Desc, Float* __restrict__ p) { - auto f_set_zero = [](TFloat& v) { v = TFloat(0); }; + auto f_set_zero = [](Float& v) { v = Float(0); }; - threadwise_4d_tensor_pointwise_operation_unary( + threadwise_4d_tensor_pointwise_operation_unary( Desc{}, p, f_set_zero); } -template -__device__ void threadwise_4d_tensor_copy_reorder(SrcDesc, - TFloat* const __restrict__ p_src, - DstDesc, - TFloat* __restrict__ p_dst, - RefDesc, - Reorder) +template +__device__ void threadwise_4d_tensor_copy_reorder( + SrcDesc, Float* const __restrict__ p_src, DstDesc, Float* __restrict__ p_dst, RefDesc, Reorder) { - auto f_copy = [](const TFloat& src, TFloat& dst) { dst = src; }; + auto f_copy = [](const Float& src, Float& dst) { dst = src; }; - threadwise_4d_tensor_pointwise_operation_binary_reorder +template __device__ void threadwise_4d_tensor_copy( - SrcDesc, TFloat* const __restrict__ p_src, DstDesc, TFloat* __restrict__ p_dst, RefDesc) + SrcDesc, Float* const __restrict__ p_src, DstDesc, Float* __restrict__ p_dst, RefDesc) { auto reorder = Sequence<0, 1, 2, 3>{}; - threadwise_4d_tensor_copy_reorder( + threadwise_4d_tensor_copy_reorder( SrcDesc{}, p_src, DstDesc{}, p_dst, RefDesc{}, reorder); } -template -__device__ void threadwise_4d_tensor_shift_down(Desc, TFloat* __restrict__ p, IDim, NShift) +template +__device__ void threadwise_4d_tensor_shift_down(Desc, Float* __restrict__ p, IDim, NShift) { constexpr auto I0 = Number<0>{}; constexpr auto I1 = Number<1>{};