From bf975428460a27b46912d1c4293b407febb92de0 Mon Sep 17 00:00:00 2001
From: Chao Liu <lc.roy86@gmail.com>
Date: Sun, 15 Sep 2019 16:58:16 -0500
Subject: [PATCH] add lds doble buffer to nchw padded v4r1 and v4r4

---
 ...plicit_gemm_v4r4_nchw_kcyx_nkhw_padded.hpp |   2 -
 .../blockwise_generic_tensor_slice_copy.hpp   | 102 +++++++++---------
 .../threadwise_generic_tensor_slice_copy.hpp  |  12 ++-
 ...tion_implicit_gemm_v4r1_nchw_kcyx_nkhw.hpp |   2 +-
 ...plicit_gemm_v4r1_nchw_kcyx_nkhw_padded.hpp |  82 +++++++-------
 ...plicit_gemm_v4r4_nchw_kcyx_nkhw_padded.hpp |  80 +++++++-------
 driver/src/driver.cpp                         |   8 +-
 7 files changed, 153 insertions(+), 135 deletions(-)
diff --git a/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw_padded.hpp b/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw_padded.hpp
index 0ebfa08f35..f3976d43bc 100644
--- a/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw_padded.hpp
+++ b/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw_padded.hpp
@@ -59,7 +59,6 @@ struct GridwiseConvolutionImplicitGemm_v4r4_nchw_kcyx_nkhw_padded
         constexpr auto I1 = Number<1>{};
         constexpr auto I2 = Number<2>{};
         constexpr auto I3 = Number<3>{};
-        constexpr auto I5 = Number<5>{};
 
         constexpr auto True = integral_constant<bool, true>{};
 
@@ -330,7 +329,6 @@ struct GridwiseConvolutionImplicitGemm_v4r4_nchw_kcyx_nkhw_padded
         constexpr auto I1 = Number<1>{};
         constexpr auto I2 = Number<2>{};
         constexpr auto I3 = Number<3>{};
-        constexpr auto I5 = Number<5>{};
 
         constexpr auto True = integral_constant<bool, true>{};
 
diff --git a/composable_kernel/include/tensor_operation/blockwise_generic_tensor_slice_copy.hpp b/composable_kernel/include/tensor_operation/blockwise_generic_tensor_slice_copy.hpp
index cec6f08c4e..a3956dc713 100644
--- a/composable_kernel/include/tensor_operation/blockwise_generic_tensor_slice_copy.hpp
+++ b/composable_kernel/include/tensor_operation/blockwise_generic_tensor_slice_copy.hpp
@@ -25,14 +25,14 @@ namespace ck {
 // repeat-length on the merged dimension need to be 1. These sanity checks are performed
 // in constructor of BlockwiseGenericTensorSliceCopy_v1
 template <index_t BlockSize,
-          class SrcDesc,
-          class DstDesc,
-          class SliceLengths,
-          class SubLengths,
-          class ThreadClusterLengths,
-          class ThreadClusterArrangeOrder,
-          class SrcDimAccessOrder,
-          class DstDimAccessOrder,
+          typename SrcDesc,
+          typename DstDesc,
+          typename SliceLengths,
+          typename SubLengths,
+          typename ThreadClusterLengths,
+          typename ThreadClusterArrangeOrder,
+          typename SrcDimAccessOrder,
+          typename DstDimAccessOrder,
           index_t SrcVectorAccessDim,
           index_t DstVectorAccessDim,
           index_t SrcDataPerAccess,
@@ -204,7 +204,7 @@ struct BlockwiseGenericTensorSliceCopy_v1
         return GetRegisterBufferDescriptor().GetElementSpace();
     }
 
-    template <class TData>
+    template <typename TData>
     __device__ void RunLoadRegisterBuffer(const TData* __restrict__ p_src,
                                           TData* __restrict__ p_buffer) const
     {
@@ -260,7 +260,7 @@ struct BlockwiseGenericTensorSliceCopy_v1
         });
     }
 
-    template <class TData>
+    template <typename TData>
     __device__ void RunStoreRegisterBuffer(const TData* __restrict__ p_buffer,
                                            TData* __restrict__ p_dst) const
     {
@@ -315,7 +315,7 @@ struct BlockwiseGenericTensorSliceCopy_v1
         });
     }
 
-    template <class TData>
+    template <typename TData>
     __device__ void Run(const TData* __restrict__ p_src, TData* __restrict__ p_dst) const
     {
         TData p_buffer[GetRegisterBufferSize()];
@@ -406,7 +406,7 @@ struct BlockwiseGenericTensorSliceCopy_v1
         });
     }
 
-    template <class T, bool PositiveDirection>
+    template <typename T, bool PositiveDirection>
     __device__ void
     MoveSrcSliceWindow(T step_sizes, integral_constant<bool, PositiveDirection> positive_direction)
     {
@@ -423,14 +423,14 @@ struct BlockwiseGenericTensorSliceCopy_v1
 // Slice a (normal or merged) tensor, and copy it into another (normal or merged) tensor
 // memory layout (ordering of dimensions) can be different between src and dst.
 template <index_t BlockSize,
-          class SrcDesc,
-          class DstDesc,
-          class SliceLengths,
-          class SubLengths,
-          class ThreadClusterLengths,
-          class ThreadClusterArrangeOrder,
-          class SrcDimAccessOrder,
-          class DstDimAccessOrder,
+          typename SrcDesc,
+          typename DstDesc,
+          typename SliceLengths,
+          typename SubLengths,
+          typename ThreadClusterLengths,
+          typename ThreadClusterArrangeOrder,
+          typename SrcDimAccessOrder,
+          typename DstDimAccessOrder,
           index_t SrcVectorAccessDim,
           index_t DstVectorAccessDim,
           index_t SrcDataPerAccess,
@@ -482,19 +482,19 @@ struct BlockwiseGenericTensorSliceCopy_v2
         return RegisterBufferDesc::GetElementSpace();
     }
 
-    template <class TData>
+    template <typename TData>
     __device__ void RunLoadRegisterBuffer(const TData* p_src, TData* p_buffer) const
     {
         mThreadwiseLoad.Run(p_src, p_buffer);
     }
 
-    template <class TData>
+    template <typename TData>
     __device__ void RunStoreRegisterBuffer(const TData* p_buffer, TData* p_dst) const
     {
         mThreadwiseStore.Run(p_buffer, p_dst);
     }
 
-    template <class TData>
+    template <typename TData>
     __device__ void Run(const TData* p_src, TData* p_dst) const
     {
         TData p_buffer[GetRegisterBufferSize()];
@@ -503,14 +503,14 @@ struct BlockwiseGenericTensorSliceCopy_v2
         mThreadwiseStore.Run(p_buffer, p_dst);
     }
 
-    template <class T, bool PositiveDirection>
+    template <typename T, bool PositiveDirection>
     __device__ void
     MoveSrcSliceWindow(T step_sizes, integral_constant<bool, PositiveDirection> positive_direction)
     {
         mThreadwiseLoad.MoveSrcSliceWindow(step_sizes, positive_direction);
     }
 
-    template <class T, bool PositiveDirection>
+    template <typename T, bool PositiveDirection>
     __device__ void
     MoveDstSliceWindow(T step_sizes, integral_constant<bool, PositiveDirection> positive_direction)
     {
@@ -546,14 +546,14 @@ struct BlockwiseGenericTensorSliceCopy_v2
 
 // this version use TensorView and TensorCoordinate
 template <index_t BlockSize,
-          class SrcTensor,
-          class DstTensor,
-          class SliceLengths,
-          class SubLengths,
-          class ThreadClusterLengths,
-          class ThreadClusterArrangeOrder,
-          class SrcDimAccessOrder,
-          class DstDimAccessOrder,
+          typename SrcTensor,
+          typename DstTensor,
+          typename SliceLengths,
+          typename SubLengths,
+          typename ThreadClusterLengths,
+          typename ThreadClusterArrangeOrder,
+          typename SrcDimAccessOrder,
+          typename DstDimAccessOrder,
           index_t SrcVectorAccessDim,
           index_t DstVectorAccessDim,
           index_t SrcDataPerAccess,
@@ -622,14 +622,14 @@ struct BlockwiseGenericTensorSliceCopy_v3
         mThreadwiseStore.Run();
     }
 
-    template <class T, bool PositiveDirection>
+    template <typename T, bool PositiveDirection>
     __device__ void
     MoveSrcSliceWindow(T step_sizes, integral_constant<bool, PositiveDirection> positive_direction)
     {
         mThreadwiseLoad.MoveSrcSliceWindow(step_sizes, positive_direction);
     }
 
-    template <class T, bool PositiveDirection>
+    template <typename T, bool PositiveDirection>
     __device__ void
     MoveDstSliceWindow(T step_sizes, integral_constant<bool, PositiveDirection> positive_direction)
     {
@@ -669,14 +669,14 @@ struct BlockwiseGenericTensorSliceCopy_v3
 };
 
 template <index_t BlockSize,
-          class SrcDesc,
-          class DstDesc,
-          class SliceLengths,
-          class SubLengths,
-          class ThreadClusterLengths,
-          class ThreadClusterArrangeOrder,
-          class SrcDimAccessOrder,
-          class DstDimAccessOrder,
+          typename SrcDesc,
+          typename DstDesc,
+          typename SliceLengths,
+          typename SubLengths,
+          typename ThreadClusterLengths,
+          typename ThreadClusterArrangeOrder,
+          typename SrcDimAccessOrder,
+          typename DstDimAccessOrder,
           index_t SrcVectorAccessDim,
           index_t DstVectorAccessDim,
           index_t SrcDataPerAccess,
@@ -727,19 +727,19 @@ struct BlockwiseGenericTensorSliceCopy_v4
         return RegisterBufferDesc::GetElementSpace();
     }
 
-    template <class TData>
+    template <typename TData>
     __device__ void RunLoadRegisterBuffer(const TData* p_src, TData* p_buffer) const
     {
         mThreadwiseLoad.Run(p_src, p_buffer);
     }
 
-    template <class TData>
+    template <typename TData>
     __device__ void RunStoreRegisterBuffer(const TData* p_buffer, TData* p_dst) const
     {
         mThreadwiseStore.Run(p_buffer, p_dst);
     }
 
-    template <class TData>
+    template <typename TData>
     __device__ void Run(const TData* p_src, TData* p_dst) const
     {
         TData p_buffer[GetRegisterBufferSize()];
@@ -748,16 +748,18 @@ struct BlockwiseGenericTensorSliceCopy_v4
         mThreadwiseStore.Run(p_buffer, p_dst);
     }
 
-    template <class T, bool PositiveDirection>
+    template <typename T, bool PositiveDirection>
     __device__ void
-    MoveSrcSliceWindow(T step_sizes, integral_constant<bool, PositiveDirection> positive_direction)
+    MoveSrcSliceWindow(const T& step_sizes,
+                       integral_constant<bool, PositiveDirection> positive_direction)
     {
         mThreadwiseLoad.MoveSrcSliceWindow(step_sizes, positive_direction);
     }
 
-    template <class T, bool PositiveDirection>
+    template <typename T, bool PositiveDirection>
     __device__ void
-    MoveDstSliceWindow(T step_sizes, integral_constant<bool, PositiveDirection> positive_direction)
+    MoveDstSliceWindow(const T& step_sizes,
+                       integral_constant<bool, PositiveDirection> positive_direction)
     {
         mThreadwiseStore.MoveDstSliceWindow(step_sizes, positive_direction);
     }
diff --git a/composable_kernel/include/tensor_operation/threadwise_generic_tensor_slice_copy.hpp b/composable_kernel/include/tensor_operation/threadwise_generic_tensor_slice_copy.hpp
index 56de4c6f66..110ef53bc5 100644
--- a/composable_kernel/include/tensor_operation/threadwise_generic_tensor_slice_copy.hpp
+++ b/composable_kernel/include/tensor_operation/threadwise_generic_tensor_slice_copy.hpp
@@ -1072,16 +1072,22 @@ struct ThreadwiseGenericTensorSliceCopy_v4r2
     }
 
     template <class T, bool PositiveDirection>
-    __device__ void MoveSrcSliceWindow(T step_sizes, integral_constant<bool, PositiveDirection>)
+    __device__ void MoveSrcSliceWindow(const T& step_sizes_,
+                                       integral_constant<bool, PositiveDirection>)
     {
+        const auto step_sizes = to_array(step_sizes_);
+
         static_if<PositiveDirection>{}([&](auto) {
-            mSrcSliceOrigin += step_sizes;
+            mSrcSliceOrigin += to_array(step_sizes);
         }).Else([&](auto) { mSrcSliceOrigin -= step_sizes; });
     }
 
     template <class T, bool PositiveDirection>
-    __device__ void MoveDstSliceWindow(T step_sizes, integral_constant<bool, PositiveDirection>)
+    __device__ void MoveDstSliceWindow(const T& step_sizes_,
+                                       integral_constant<bool, PositiveDirection>)
     {
+        const auto step_sizes = to_array(step_sizes_);
+
         static_if<PositiveDirection>{}([&](auto) {
             mDstSliceOrigin += step_sizes;
         }).Else([&](auto) { mDstSliceOrigin -= step_sizes; });
diff --git a/driver/include/device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw.hpp b/driver/include/device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw.hpp
index fc8d0e7adc..36ca649aa4 100644
--- a/driver/include/device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw.hpp
+++ b/driver/include/device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw.hpp
@@ -136,7 +136,7 @@ void device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw(InDesc,
     for(index_t i = 0; i < nrepeat; ++i)
     {
         constexpr auto gridwise_conv =
-#if 1
+#if 0
             GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw
 #else
             GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer
diff --git a/driver/include/device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_padded.hpp b/driver/include/device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_padded.hpp
index 4140d45626..93f91873e7 100644
--- a/driver/include/device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_padded.hpp
+++ b/driver/include/device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_padded.hpp
@@ -4,6 +4,7 @@
 #include "tensor.hpp"
 #include "gridwise_convolution_kernel_wrapper.hpp"
 #include "gridwise_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_padded.hpp"
+#include "gridwise_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_padded_lds_double_buffer.hpp"
 
 template <typename T,
           typename InDesc,
@@ -101,44 +102,49 @@ void device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_padded(InDesc,
 
     for(index_t i = 0; i < nrepeat; ++i)
     {
-        constexpr auto gridwise_conv = GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_padded<
-            GridSize,
-            BlockSize,
-            T,
-            decltype(in_nchw_desc),
-            decltype(wei_kcyx_desc),
-            decltype(out_nkhw_desc),
-            ConvStrides,
-            ConvDilations,
-            LeftPads,
-            RightPads,
-            BPerBlock,
-            KPerBlock,
-            EPerBlock,
-            GemmNRepeat,
-            GemmMPerThreadSubC,
-            GemmNPerThreadSubC,
-            GemmMLevel0Cluster,
-            GemmNLevel0Cluster,
-            GemmMLevel1Cluster,
-            GemmNLevel1Cluster,
-            GemmKPerThreadLoop,
-            GemmDataPerReadA,
-            GemmDataPerReadB,
-            InBlockCopySubLengths_E_N1_B_N2,
-            InBlockCopyClusterLengths_E_N1_B_N2,
-            InBlockCopyThreadClusterArrangeOrder,
-            InBlockCopySrcAccessOrder,
-            InBlockCopyDstAccessOrder,
-            InBlockCopySrcDataPerRead_B,
-            InBlockCopyDstDataPerWrite_N2,
-            WeiBlockCopySubLengths_E_K,
-            WeiBlockCopyClusterLengths_E_K,
-            WeiBlockCopyThreadClusterArrangeOrder,
-            WeiBlockCopySrcAccessOrder,
-            WeiBlockCopyDstAccessOrder,
-            WeiBlockCopySrcDataPerRead_E,
-            WeiBlockCopyDstDataPerWrite_K>{};
+        constexpr auto gridwise_conv =
+#if 0
+            GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_padded
+#else
+            GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_padded_lds_double_buffer
+#endif
+            <GridSize,
+             BlockSize,
+             T,
+             decltype(in_nchw_desc),
+             decltype(wei_kcyx_desc),
+             decltype(out_nkhw_desc),
+             ConvStrides,
+             ConvDilations,
+             LeftPads,
+             RightPads,
+             BPerBlock,
+             KPerBlock,
+             EPerBlock,
+             GemmNRepeat,
+             GemmMPerThreadSubC,
+             GemmNPerThreadSubC,
+             GemmMLevel0Cluster,
+             GemmNLevel0Cluster,
+             GemmMLevel1Cluster,
+             GemmNLevel1Cluster,
+             GemmKPerThreadLoop,
+             GemmDataPerReadA,
+             GemmDataPerReadB,
+             InBlockCopySubLengths_E_N1_B_N2,
+             InBlockCopyClusterLengths_E_N1_B_N2,
+             InBlockCopyThreadClusterArrangeOrder,
+             InBlockCopySrcAccessOrder,
+             InBlockCopyDstAccessOrder,
+             InBlockCopySrcDataPerRead_B,
+             InBlockCopyDstDataPerWrite_N2,
+             WeiBlockCopySubLengths_E_K,
+             WeiBlockCopyClusterLengths_E_K,
+             WeiBlockCopyThreadClusterArrangeOrder,
+             WeiBlockCopySrcAccessOrder,
+             WeiBlockCopyDstAccessOrder,
+             WeiBlockCopySrcDataPerRead_E,
+             WeiBlockCopyDstDataPerWrite_K>{};
 
         float time = launch_kernel(run_gridwise_convolution_kernel<decltype(gridwise_conv), T>,
                                    dim3(GridSize),
diff --git a/driver/include/device_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw_padded.hpp b/driver/include/device_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw_padded.hpp
index d43008c0df..304fbe211d 100644
--- a/driver/include/device_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw_padded.hpp
+++ b/driver/include/device_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw_padded.hpp
@@ -4,6 +4,7 @@
 #include "tensor.hpp"
 #include "gridwise_convolution_kernel_wrapper.hpp"
 #include "gridwise_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw_padded.hpp"
+#include "gridwise_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw_padded_lds_double_buffer.hpp"
 
 template <class T,
           class InDesc,
@@ -166,43 +167,48 @@ void device_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw_padded(InDesc,
 
     printf("%s: BlockSize %u, GridSize %u \n", __func__, BlockSize, GridSize);
 
-    constexpr auto gridwise_conv = GridwiseConvolutionImplicitGemm_v4r4_nchw_kcyx_nkhw_padded<
-        GridSize,
-        BlockSize,
-        T,
-        decltype(in_nchw_desc),
-        decltype(wei_kcyx_desc),
-        decltype(out_nkhw_desc),
-        ConvStrides,
-        ConvDilations,
-        LeftPads,
-        RightPads,
-        BPerBlock,
-        KPerBlock,
-        EPerBlock,
-        GemmMPerThreadSubC,
-        GemmNPerThreadSubC,
-        GemmMLevel0Cluster,
-        GemmNLevel0Cluster,
-        GemmMLevel1Cluster,
-        GemmNLevel1Cluster,
-        GemmKPerThreadLoop,
-        GemmDataPerReadA,
-        GemmDataPerReadB,
-        InBlockCopySubLengths_E_B,
-        InBlockCopyClusterLengths_E_B,
-        InBlockCopyThreadClusterArrangeOrder,
-        InBlockCopySrcAccessOrder,
-        InBlockCopyDstAccessOrder,
-        InBlockCopyDataPerAccess_B,
-        WeiBlockCopySubLengths_E_K,
-        WeiBlockCopyClusterLengths_E_K,
-        WeiBlockCopyThreadClusterArrangeOrder,
-        WeiBlockCopySrcAccessOrder,
-        WeiBlockCopyDstAccessOrder,
-        WeiBlockCopySrcDataPerRead_E,
-        WeiBlockCopyDstDataPerWrite_K,
-        OutThreadCopyDataPerAccess_B>{};
+    constexpr auto gridwise_conv =
+#if 0
+        GridwiseConvolutionImplicitGemm_v4r4_nchw_kcyx_nkhw_padded
+#else
+        GridwiseConvolutionImplicitGemm_v4r4_nchw_kcyx_nkhw_padded_lds_double_buffer
+#endif
+        <GridSize,
+         BlockSize,
+         T,
+         decltype(in_nchw_desc),
+         decltype(wei_kcyx_desc),
+         decltype(out_nkhw_desc),
+         ConvStrides,
+         ConvDilations,
+         LeftPads,
+         RightPads,
+         BPerBlock,
+         KPerBlock,
+         EPerBlock,
+         GemmMPerThreadSubC,
+         GemmNPerThreadSubC,
+         GemmMLevel0Cluster,
+         GemmNLevel0Cluster,
+         GemmMLevel1Cluster,
+         GemmNLevel1Cluster,
+         GemmKPerThreadLoop,
+         GemmDataPerReadA,
+         GemmDataPerReadB,
+         InBlockCopySubLengths_E_B,
+         InBlockCopyClusterLengths_E_B,
+         InBlockCopyThreadClusterArrangeOrder,
+         InBlockCopySrcAccessOrder,
+         InBlockCopyDstAccessOrder,
+         InBlockCopyDataPerAccess_B,
+         WeiBlockCopySubLengths_E_K,
+         WeiBlockCopyClusterLengths_E_K,
+         WeiBlockCopyThreadClusterArrangeOrder,
+         WeiBlockCopySrcAccessOrder,
+         WeiBlockCopyDstAccessOrder,
+         WeiBlockCopySrcDataPerRead_E,
+         WeiBlockCopyDstDataPerWrite_K,
+         OutThreadCopyDataPerAccess_B>{};
 
     for(index_t i = 0; i < nrepeat; ++i)
     {
diff --git a/driver/src/driver.cpp b/driver/src/driver.cpp
index cd8325f0f8..e383f8e06f 100644
--- a/driver/src/driver.cpp
+++ b/driver/src/driver.cpp
@@ -92,8 +92,8 @@ int main(int argc, char* argv[])
     // 3x3, 34x34
     constexpr index_t N  = 64;
     constexpr index_t C  = 256;
-    constexpr index_t HI = 34;
-    constexpr index_t WI = 34;
+    constexpr index_t HI = 32;
+    constexpr index_t WI = 32;
     constexpr index_t K  = 128;
     constexpr index_t Y  = 3;
     constexpr index_t X  = 3;
@@ -101,8 +101,8 @@ int main(int argc, char* argv[])
     using ConvStrides   = Sequence<1, 1>;
     using ConvDilations = Sequence<1, 1>;
 
-    using LeftPads  = Sequence<0, 0>;
-    using RightPads = Sequence<0, 0>;
+    using LeftPads  = Sequence<1, 1>;
+    using RightPads = Sequence<1, 1>;
 #elif 0
     // 1x1 filter, 8x8 image
     // cudnn@V100 68%, ck@V100 72%, ck@P100 52%, ck@VII 42%