From bf975428460a27b46912d1c4293b407febb92de0 Mon Sep 17 00:00:00 2001 From: Chao Liu Date: Sun, 15 Sep 2019 16:58:16 -0500 Subject: [PATCH] add lds doble buffer to nchw padded v4r1 and v4r4 --- ...plicit_gemm_v4r4_nchw_kcyx_nkhw_padded.hpp | 2 - .../blockwise_generic_tensor_slice_copy.hpp | 102 +++++++++--------- .../threadwise_generic_tensor_slice_copy.hpp | 12 ++- ...tion_implicit_gemm_v4r1_nchw_kcyx_nkhw.hpp | 2 +- ...plicit_gemm_v4r1_nchw_kcyx_nkhw_padded.hpp | 82 +++++++------- ...plicit_gemm_v4r4_nchw_kcyx_nkhw_padded.hpp | 80 +++++++------- driver/src/driver.cpp | 8 +- 7 files changed, 153 insertions(+), 135 deletions(-) diff --git a/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw_padded.hpp b/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw_padded.hpp index 0ebfa08f35..f3976d43bc 100644 --- a/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw_padded.hpp +++ b/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw_padded.hpp @@ -59,7 +59,6 @@ struct GridwiseConvolutionImplicitGemm_v4r4_nchw_kcyx_nkhw_padded constexpr auto I1 = Number<1>{}; constexpr auto I2 = Number<2>{}; constexpr auto I3 = Number<3>{}; - constexpr auto I5 = Number<5>{}; constexpr auto True = integral_constant{}; @@ -330,7 +329,6 @@ struct GridwiseConvolutionImplicitGemm_v4r4_nchw_kcyx_nkhw_padded constexpr auto I1 = Number<1>{}; constexpr auto I2 = Number<2>{}; constexpr auto I3 = Number<3>{}; - constexpr auto I5 = Number<5>{}; constexpr auto True = integral_constant{}; diff --git a/composable_kernel/include/tensor_operation/blockwise_generic_tensor_slice_copy.hpp b/composable_kernel/include/tensor_operation/blockwise_generic_tensor_slice_copy.hpp index cec6f08c4e..a3956dc713 100644 --- a/composable_kernel/include/tensor_operation/blockwise_generic_tensor_slice_copy.hpp +++ b/composable_kernel/include/tensor_operation/blockwise_generic_tensor_slice_copy.hpp @@ -25,14 +25,14 @@ namespace ck { // repeat-length on the merged dimension need to be 1. These sanity checks are performed // in constructor of BlockwiseGenericTensorSliceCopy_v1 template + template __device__ void RunLoadRegisterBuffer(const TData* __restrict__ p_src, TData* __restrict__ p_buffer) const { @@ -260,7 +260,7 @@ struct BlockwiseGenericTensorSliceCopy_v1 }); } - template + template __device__ void RunStoreRegisterBuffer(const TData* __restrict__ p_buffer, TData* __restrict__ p_dst) const { @@ -315,7 +315,7 @@ struct BlockwiseGenericTensorSliceCopy_v1 }); } - template + template __device__ void Run(const TData* __restrict__ p_src, TData* __restrict__ p_dst) const { TData p_buffer[GetRegisterBufferSize()]; @@ -406,7 +406,7 @@ struct BlockwiseGenericTensorSliceCopy_v1 }); } - template + template __device__ void MoveSrcSliceWindow(T step_sizes, integral_constant positive_direction) { @@ -423,14 +423,14 @@ struct BlockwiseGenericTensorSliceCopy_v1 // Slice a (normal or merged) tensor, and copy it into another (normal or merged) tensor // memory layout (ordering of dimensions) can be different between src and dst. template + template __device__ void RunLoadRegisterBuffer(const TData* p_src, TData* p_buffer) const { mThreadwiseLoad.Run(p_src, p_buffer); } - template + template __device__ void RunStoreRegisterBuffer(const TData* p_buffer, TData* p_dst) const { mThreadwiseStore.Run(p_buffer, p_dst); } - template + template __device__ void Run(const TData* p_src, TData* p_dst) const { TData p_buffer[GetRegisterBufferSize()]; @@ -503,14 +503,14 @@ struct BlockwiseGenericTensorSliceCopy_v2 mThreadwiseStore.Run(p_buffer, p_dst); } - template + template __device__ void MoveSrcSliceWindow(T step_sizes, integral_constant positive_direction) { mThreadwiseLoad.MoveSrcSliceWindow(step_sizes, positive_direction); } - template + template __device__ void MoveDstSliceWindow(T step_sizes, integral_constant positive_direction) { @@ -546,14 +546,14 @@ struct BlockwiseGenericTensorSliceCopy_v2 // this version use TensorView and TensorCoordinate template + template __device__ void MoveSrcSliceWindow(T step_sizes, integral_constant positive_direction) { mThreadwiseLoad.MoveSrcSliceWindow(step_sizes, positive_direction); } - template + template __device__ void MoveDstSliceWindow(T step_sizes, integral_constant positive_direction) { @@ -669,14 +669,14 @@ struct BlockwiseGenericTensorSliceCopy_v3 }; template + template __device__ void RunLoadRegisterBuffer(const TData* p_src, TData* p_buffer) const { mThreadwiseLoad.Run(p_src, p_buffer); } - template + template __device__ void RunStoreRegisterBuffer(const TData* p_buffer, TData* p_dst) const { mThreadwiseStore.Run(p_buffer, p_dst); } - template + template __device__ void Run(const TData* p_src, TData* p_dst) const { TData p_buffer[GetRegisterBufferSize()]; @@ -748,16 +748,18 @@ struct BlockwiseGenericTensorSliceCopy_v4 mThreadwiseStore.Run(p_buffer, p_dst); } - template + template __device__ void - MoveSrcSliceWindow(T step_sizes, integral_constant positive_direction) + MoveSrcSliceWindow(const T& step_sizes, + integral_constant positive_direction) { mThreadwiseLoad.MoveSrcSliceWindow(step_sizes, positive_direction); } - template + template __device__ void - MoveDstSliceWindow(T step_sizes, integral_constant positive_direction) + MoveDstSliceWindow(const T& step_sizes, + integral_constant positive_direction) { mThreadwiseStore.MoveDstSliceWindow(step_sizes, positive_direction); } diff --git a/composable_kernel/include/tensor_operation/threadwise_generic_tensor_slice_copy.hpp b/composable_kernel/include/tensor_operation/threadwise_generic_tensor_slice_copy.hpp index 56de4c6f66..110ef53bc5 100644 --- a/composable_kernel/include/tensor_operation/threadwise_generic_tensor_slice_copy.hpp +++ b/composable_kernel/include/tensor_operation/threadwise_generic_tensor_slice_copy.hpp @@ -1072,16 +1072,22 @@ struct ThreadwiseGenericTensorSliceCopy_v4r2 } template - __device__ void MoveSrcSliceWindow(T step_sizes, integral_constant) + __device__ void MoveSrcSliceWindow(const T& step_sizes_, + integral_constant) { + const auto step_sizes = to_array(step_sizes_); + static_if{}([&](auto) { - mSrcSliceOrigin += step_sizes; + mSrcSliceOrigin += to_array(step_sizes); }).Else([&](auto) { mSrcSliceOrigin -= step_sizes; }); } template - __device__ void MoveDstSliceWindow(T step_sizes, integral_constant) + __device__ void MoveDstSliceWindow(const T& step_sizes_, + integral_constant) { + const auto step_sizes = to_array(step_sizes_); + static_if{}([&](auto) { mDstSliceOrigin += step_sizes; }).Else([&](auto) { mDstSliceOrigin -= step_sizes; }); diff --git a/driver/include/device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw.hpp b/driver/include/device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw.hpp index fc8d0e7adc..36ca649aa4 100644 --- a/driver/include/device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw.hpp +++ b/driver/include/device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw.hpp @@ -136,7 +136,7 @@ void device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw(InDesc, for(index_t i = 0; i < nrepeat; ++i) { constexpr auto gridwise_conv = -#if 1 +#if 0 GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw #else GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer diff --git a/driver/include/device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_padded.hpp b/driver/include/device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_padded.hpp index 4140d45626..93f91873e7 100644 --- a/driver/include/device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_padded.hpp +++ b/driver/include/device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_padded.hpp @@ -4,6 +4,7 @@ #include "tensor.hpp" #include "gridwise_convolution_kernel_wrapper.hpp" #include "gridwise_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_padded.hpp" +#include "gridwise_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_padded_lds_double_buffer.hpp" template {}; + constexpr auto gridwise_conv = +#if 0 + GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_padded +#else + GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_padded_lds_double_buffer +#endif + {}; float time = launch_kernel(run_gridwise_convolution_kernel, dim3(GridSize), diff --git a/driver/include/device_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw_padded.hpp b/driver/include/device_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw_padded.hpp index d43008c0df..304fbe211d 100644 --- a/driver/include/device_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw_padded.hpp +++ b/driver/include/device_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw_padded.hpp @@ -4,6 +4,7 @@ #include "tensor.hpp" #include "gridwise_convolution_kernel_wrapper.hpp" #include "gridwise_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw_padded.hpp" +#include "gridwise_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw_padded_lds_double_buffer.hpp" template {}; + constexpr auto gridwise_conv = +#if 0 + GridwiseConvolutionImplicitGemm_v4r4_nchw_kcyx_nkhw_padded +#else + GridwiseConvolutionImplicitGemm_v4r4_nchw_kcyx_nkhw_padded_lds_double_buffer +#endif + {}; for(index_t i = 0; i < nrepeat; ++i) { diff --git a/driver/src/driver.cpp b/driver/src/driver.cpp index cd8325f0f8..e383f8e06f 100644 --- a/driver/src/driver.cpp +++ b/driver/src/driver.cpp @@ -92,8 +92,8 @@ int main(int argc, char* argv[]) // 3x3, 34x34 constexpr index_t N = 64; constexpr index_t C = 256; - constexpr index_t HI = 34; - constexpr index_t WI = 34; + constexpr index_t HI = 32; + constexpr index_t WI = 32; constexpr index_t K = 128; constexpr index_t Y = 3; constexpr index_t X = 3; @@ -101,8 +101,8 @@ int main(int argc, char* argv[]) using ConvStrides = Sequence<1, 1>; using ConvDilations = Sequence<1, 1>; - using LeftPads = Sequence<0, 0>; - using RightPads = Sequence<0, 0>; + using LeftPads = Sequence<1, 1>; + using RightPads = Sequence<1, 1>; #elif 0 // 1x1 filter, 8x8 image // cudnn@V100 68%, ck@V100 72%, ck@P100 52%, ck@VII 42%