mirror of
https://github.com/ROCm/composable_kernel.git
synced 2026-05-13 01:36:06 +00:00
add lds double buffer to nchw padded v4r1 and v4r4
This commit is contained in:
@@ -59,7 +59,6 @@ struct GridwiseConvolutionImplicitGemm_v4r4_nchw_kcyx_nkhw_padded
|
||||
constexpr auto I1 = Number<1>{};
|
||||
constexpr auto I2 = Number<2>{};
|
||||
constexpr auto I3 = Number<3>{};
|
||||
constexpr auto I5 = Number<5>{};
|
||||
|
||||
constexpr auto True = integral_constant<bool, true>{};
|
||||
|
||||
@@ -330,7 +329,6 @@ struct GridwiseConvolutionImplicitGemm_v4r4_nchw_kcyx_nkhw_padded
|
||||
constexpr auto I1 = Number<1>{};
|
||||
constexpr auto I2 = Number<2>{};
|
||||
constexpr auto I3 = Number<3>{};
|
||||
constexpr auto I5 = Number<5>{};
|
||||
|
||||
constexpr auto True = integral_constant<bool, true>{};
|
||||
|
||||
|
||||
@@ -25,14 +25,14 @@ namespace ck {
|
||||
// repeat-length on the merged dimension need to be 1. These sanity checks are performed
|
||||
// in constructor of BlockwiseGenericTensorSliceCopy_v1
|
||||
template <index_t BlockSize,
|
||||
class SrcDesc,
|
||||
class DstDesc,
|
||||
class SliceLengths,
|
||||
class SubLengths,
|
||||
class ThreadClusterLengths,
|
||||
class ThreadClusterArrangeOrder,
|
||||
class SrcDimAccessOrder,
|
||||
class DstDimAccessOrder,
|
||||
typename SrcDesc,
|
||||
typename DstDesc,
|
||||
typename SliceLengths,
|
||||
typename SubLengths,
|
||||
typename ThreadClusterLengths,
|
||||
typename ThreadClusterArrangeOrder,
|
||||
typename SrcDimAccessOrder,
|
||||
typename DstDimAccessOrder,
|
||||
index_t SrcVectorAccessDim,
|
||||
index_t DstVectorAccessDim,
|
||||
index_t SrcDataPerAccess,
|
||||
@@ -204,7 +204,7 @@ struct BlockwiseGenericTensorSliceCopy_v1
|
||||
return GetRegisterBufferDescriptor().GetElementSpace();
|
||||
}
|
||||
|
||||
template <class TData>
|
||||
template <typename TData>
|
||||
__device__ void RunLoadRegisterBuffer(const TData* __restrict__ p_src,
|
||||
TData* __restrict__ p_buffer) const
|
||||
{
|
||||
@@ -260,7 +260,7 @@ struct BlockwiseGenericTensorSliceCopy_v1
|
||||
});
|
||||
}
|
||||
|
||||
template <class TData>
|
||||
template <typename TData>
|
||||
__device__ void RunStoreRegisterBuffer(const TData* __restrict__ p_buffer,
|
||||
TData* __restrict__ p_dst) const
|
||||
{
|
||||
@@ -315,7 +315,7 @@ struct BlockwiseGenericTensorSliceCopy_v1
|
||||
});
|
||||
}
|
||||
|
||||
template <class TData>
|
||||
template <typename TData>
|
||||
__device__ void Run(const TData* __restrict__ p_src, TData* __restrict__ p_dst) const
|
||||
{
|
||||
TData p_buffer[GetRegisterBufferSize()];
|
||||
@@ -406,7 +406,7 @@ struct BlockwiseGenericTensorSliceCopy_v1
|
||||
});
|
||||
}
|
||||
|
||||
template <class T, bool PositiveDirection>
|
||||
template <typename T, bool PositiveDirection>
|
||||
__device__ void
|
||||
MoveSrcSliceWindow(T step_sizes, integral_constant<bool, PositiveDirection> positive_direction)
|
||||
{
|
||||
@@ -423,14 +423,14 @@ struct BlockwiseGenericTensorSliceCopy_v1
|
||||
// Slice a (normal or merged) tensor, and copy it into another (normal or merged) tensor
|
||||
// memory layout (ordering of dimensions) can be different between src and dst.
|
||||
template <index_t BlockSize,
|
||||
class SrcDesc,
|
||||
class DstDesc,
|
||||
class SliceLengths,
|
||||
class SubLengths,
|
||||
class ThreadClusterLengths,
|
||||
class ThreadClusterArrangeOrder,
|
||||
class SrcDimAccessOrder,
|
||||
class DstDimAccessOrder,
|
||||
typename SrcDesc,
|
||||
typename DstDesc,
|
||||
typename SliceLengths,
|
||||
typename SubLengths,
|
||||
typename ThreadClusterLengths,
|
||||
typename ThreadClusterArrangeOrder,
|
||||
typename SrcDimAccessOrder,
|
||||
typename DstDimAccessOrder,
|
||||
index_t SrcVectorAccessDim,
|
||||
index_t DstVectorAccessDim,
|
||||
index_t SrcDataPerAccess,
|
||||
@@ -482,19 +482,19 @@ struct BlockwiseGenericTensorSliceCopy_v2
|
||||
return RegisterBufferDesc::GetElementSpace();
|
||||
}
|
||||
|
||||
template <class TData>
|
||||
template <typename TData>
|
||||
__device__ void RunLoadRegisterBuffer(const TData* p_src, TData* p_buffer) const
|
||||
{
|
||||
mThreadwiseLoad.Run(p_src, p_buffer);
|
||||
}
|
||||
|
||||
template <class TData>
|
||||
template <typename TData>
|
||||
__device__ void RunStoreRegisterBuffer(const TData* p_buffer, TData* p_dst) const
|
||||
{
|
||||
mThreadwiseStore.Run(p_buffer, p_dst);
|
||||
}
|
||||
|
||||
template <class TData>
|
||||
template <typename TData>
|
||||
__device__ void Run(const TData* p_src, TData* p_dst) const
|
||||
{
|
||||
TData p_buffer[GetRegisterBufferSize()];
|
||||
@@ -503,14 +503,14 @@ struct BlockwiseGenericTensorSliceCopy_v2
|
||||
mThreadwiseStore.Run(p_buffer, p_dst);
|
||||
}
|
||||
|
||||
template <class T, bool PositiveDirection>
|
||||
template <typename T, bool PositiveDirection>
|
||||
__device__ void
|
||||
MoveSrcSliceWindow(T step_sizes, integral_constant<bool, PositiveDirection> positive_direction)
|
||||
{
|
||||
mThreadwiseLoad.MoveSrcSliceWindow(step_sizes, positive_direction);
|
||||
}
|
||||
|
||||
template <class T, bool PositiveDirection>
|
||||
template <typename T, bool PositiveDirection>
|
||||
__device__ void
|
||||
MoveDstSliceWindow(T step_sizes, integral_constant<bool, PositiveDirection> positive_direction)
|
||||
{
|
||||
@@ -546,14 +546,14 @@ struct BlockwiseGenericTensorSliceCopy_v2
|
||||
|
||||
// this version use TensorView and TensorCoordinate
|
||||
template <index_t BlockSize,
|
||||
class SrcTensor,
|
||||
class DstTensor,
|
||||
class SliceLengths,
|
||||
class SubLengths,
|
||||
class ThreadClusterLengths,
|
||||
class ThreadClusterArrangeOrder,
|
||||
class SrcDimAccessOrder,
|
||||
class DstDimAccessOrder,
|
||||
typename SrcTensor,
|
||||
typename DstTensor,
|
||||
typename SliceLengths,
|
||||
typename SubLengths,
|
||||
typename ThreadClusterLengths,
|
||||
typename ThreadClusterArrangeOrder,
|
||||
typename SrcDimAccessOrder,
|
||||
typename DstDimAccessOrder,
|
||||
index_t SrcVectorAccessDim,
|
||||
index_t DstVectorAccessDim,
|
||||
index_t SrcDataPerAccess,
|
||||
@@ -622,14 +622,14 @@ struct BlockwiseGenericTensorSliceCopy_v3
|
||||
mThreadwiseStore.Run();
|
||||
}
|
||||
|
||||
template <class T, bool PositiveDirection>
|
||||
template <typename T, bool PositiveDirection>
|
||||
__device__ void
|
||||
MoveSrcSliceWindow(T step_sizes, integral_constant<bool, PositiveDirection> positive_direction)
|
||||
{
|
||||
mThreadwiseLoad.MoveSrcSliceWindow(step_sizes, positive_direction);
|
||||
}
|
||||
|
||||
template <class T, bool PositiveDirection>
|
||||
template <typename T, bool PositiveDirection>
|
||||
__device__ void
|
||||
MoveDstSliceWindow(T step_sizes, integral_constant<bool, PositiveDirection> positive_direction)
|
||||
{
|
||||
@@ -669,14 +669,14 @@ struct BlockwiseGenericTensorSliceCopy_v3
|
||||
};
|
||||
|
||||
template <index_t BlockSize,
|
||||
class SrcDesc,
|
||||
class DstDesc,
|
||||
class SliceLengths,
|
||||
class SubLengths,
|
||||
class ThreadClusterLengths,
|
||||
class ThreadClusterArrangeOrder,
|
||||
class SrcDimAccessOrder,
|
||||
class DstDimAccessOrder,
|
||||
typename SrcDesc,
|
||||
typename DstDesc,
|
||||
typename SliceLengths,
|
||||
typename SubLengths,
|
||||
typename ThreadClusterLengths,
|
||||
typename ThreadClusterArrangeOrder,
|
||||
typename SrcDimAccessOrder,
|
||||
typename DstDimAccessOrder,
|
||||
index_t SrcVectorAccessDim,
|
||||
index_t DstVectorAccessDim,
|
||||
index_t SrcDataPerAccess,
|
||||
@@ -727,19 +727,19 @@ struct BlockwiseGenericTensorSliceCopy_v4
|
||||
return RegisterBufferDesc::GetElementSpace();
|
||||
}
|
||||
|
||||
template <class TData>
|
||||
template <typename TData>
|
||||
__device__ void RunLoadRegisterBuffer(const TData* p_src, TData* p_buffer) const
|
||||
{
|
||||
mThreadwiseLoad.Run(p_src, p_buffer);
|
||||
}
|
||||
|
||||
template <class TData>
|
||||
template <typename TData>
|
||||
__device__ void RunStoreRegisterBuffer(const TData* p_buffer, TData* p_dst) const
|
||||
{
|
||||
mThreadwiseStore.Run(p_buffer, p_dst);
|
||||
}
|
||||
|
||||
template <class TData>
|
||||
template <typename TData>
|
||||
__device__ void Run(const TData* p_src, TData* p_dst) const
|
||||
{
|
||||
TData p_buffer[GetRegisterBufferSize()];
|
||||
@@ -748,16 +748,18 @@ struct BlockwiseGenericTensorSliceCopy_v4
|
||||
mThreadwiseStore.Run(p_buffer, p_dst);
|
||||
}
|
||||
|
||||
template <class T, bool PositiveDirection>
|
||||
template <typename T, bool PositiveDirection>
|
||||
__device__ void
|
||||
MoveSrcSliceWindow(T step_sizes, integral_constant<bool, PositiveDirection> positive_direction)
|
||||
MoveSrcSliceWindow(const T& step_sizes,
|
||||
integral_constant<bool, PositiveDirection> positive_direction)
|
||||
{
|
||||
mThreadwiseLoad.MoveSrcSliceWindow(step_sizes, positive_direction);
|
||||
}
|
||||
|
||||
template <class T, bool PositiveDirection>
|
||||
template <typename T, bool PositiveDirection>
|
||||
__device__ void
|
||||
MoveDstSliceWindow(T step_sizes, integral_constant<bool, PositiveDirection> positive_direction)
|
||||
MoveDstSliceWindow(const T& step_sizes,
|
||||
integral_constant<bool, PositiveDirection> positive_direction)
|
||||
{
|
||||
mThreadwiseStore.MoveDstSliceWindow(step_sizes, positive_direction);
|
||||
}
|
||||
|
||||
@@ -1072,16 +1072,22 @@ struct ThreadwiseGenericTensorSliceCopy_v4r2
|
||||
}
|
||||
|
||||
template <class T, bool PositiveDirection>
|
||||
__device__ void MoveSrcSliceWindow(T step_sizes, integral_constant<bool, PositiveDirection>)
|
||||
__device__ void MoveSrcSliceWindow(const T& step_sizes_,
|
||||
integral_constant<bool, PositiveDirection>)
|
||||
{
|
||||
const auto step_sizes = to_array(step_sizes_);
|
||||
|
||||
static_if<PositiveDirection>{}([&](auto) {
|
||||
mSrcSliceOrigin += step_sizes;
|
||||
mSrcSliceOrigin += to_array(step_sizes);
|
||||
}).Else([&](auto) { mSrcSliceOrigin -= step_sizes; });
|
||||
}
|
||||
|
||||
template <class T, bool PositiveDirection>
|
||||
__device__ void MoveDstSliceWindow(T step_sizes, integral_constant<bool, PositiveDirection>)
|
||||
__device__ void MoveDstSliceWindow(const T& step_sizes_,
|
||||
integral_constant<bool, PositiveDirection>)
|
||||
{
|
||||
const auto step_sizes = to_array(step_sizes_);
|
||||
|
||||
static_if<PositiveDirection>{}([&](auto) {
|
||||
mDstSliceOrigin += step_sizes;
|
||||
}).Else([&](auto) { mDstSliceOrigin -= step_sizes; });
|
||||
|
||||
@@ -136,7 +136,7 @@ void device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw(InDesc,
|
||||
for(index_t i = 0; i < nrepeat; ++i)
|
||||
{
|
||||
constexpr auto gridwise_conv =
|
||||
#if 1
|
||||
#if 0
|
||||
GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw
|
||||
#else
|
||||
GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer
|
||||
|
||||
@@ -4,6 +4,7 @@
|
||||
#include "tensor.hpp"
|
||||
#include "gridwise_convolution_kernel_wrapper.hpp"
|
||||
#include "gridwise_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_padded.hpp"
|
||||
#include "gridwise_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_padded_lds_double_buffer.hpp"
|
||||
|
||||
template <typename T,
|
||||
typename InDesc,
|
||||
@@ -101,44 +102,49 @@ void device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_padded(InDesc,
|
||||
|
||||
for(index_t i = 0; i < nrepeat; ++i)
|
||||
{
|
||||
constexpr auto gridwise_conv = GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_padded<
|
||||
GridSize,
|
||||
BlockSize,
|
||||
T,
|
||||
decltype(in_nchw_desc),
|
||||
decltype(wei_kcyx_desc),
|
||||
decltype(out_nkhw_desc),
|
||||
ConvStrides,
|
||||
ConvDilations,
|
||||
LeftPads,
|
||||
RightPads,
|
||||
BPerBlock,
|
||||
KPerBlock,
|
||||
EPerBlock,
|
||||
GemmNRepeat,
|
||||
GemmMPerThreadSubC,
|
||||
GemmNPerThreadSubC,
|
||||
GemmMLevel0Cluster,
|
||||
GemmNLevel0Cluster,
|
||||
GemmMLevel1Cluster,
|
||||
GemmNLevel1Cluster,
|
||||
GemmKPerThreadLoop,
|
||||
GemmDataPerReadA,
|
||||
GemmDataPerReadB,
|
||||
InBlockCopySubLengths_E_N1_B_N2,
|
||||
InBlockCopyClusterLengths_E_N1_B_N2,
|
||||
InBlockCopyThreadClusterArrangeOrder,
|
||||
InBlockCopySrcAccessOrder,
|
||||
InBlockCopyDstAccessOrder,
|
||||
InBlockCopySrcDataPerRead_B,
|
||||
InBlockCopyDstDataPerWrite_N2,
|
||||
WeiBlockCopySubLengths_E_K,
|
||||
WeiBlockCopyClusterLengths_E_K,
|
||||
WeiBlockCopyThreadClusterArrangeOrder,
|
||||
WeiBlockCopySrcAccessOrder,
|
||||
WeiBlockCopyDstAccessOrder,
|
||||
WeiBlockCopySrcDataPerRead_E,
|
||||
WeiBlockCopyDstDataPerWrite_K>{};
|
||||
constexpr auto gridwise_conv =
|
||||
#if 0
|
||||
GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_padded
|
||||
#else
|
||||
GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_padded_lds_double_buffer
|
||||
#endif
|
||||
<GridSize,
|
||||
BlockSize,
|
||||
T,
|
||||
decltype(in_nchw_desc),
|
||||
decltype(wei_kcyx_desc),
|
||||
decltype(out_nkhw_desc),
|
||||
ConvStrides,
|
||||
ConvDilations,
|
||||
LeftPads,
|
||||
RightPads,
|
||||
BPerBlock,
|
||||
KPerBlock,
|
||||
EPerBlock,
|
||||
GemmNRepeat,
|
||||
GemmMPerThreadSubC,
|
||||
GemmNPerThreadSubC,
|
||||
GemmMLevel0Cluster,
|
||||
GemmNLevel0Cluster,
|
||||
GemmMLevel1Cluster,
|
||||
GemmNLevel1Cluster,
|
||||
GemmKPerThreadLoop,
|
||||
GemmDataPerReadA,
|
||||
GemmDataPerReadB,
|
||||
InBlockCopySubLengths_E_N1_B_N2,
|
||||
InBlockCopyClusterLengths_E_N1_B_N2,
|
||||
InBlockCopyThreadClusterArrangeOrder,
|
||||
InBlockCopySrcAccessOrder,
|
||||
InBlockCopyDstAccessOrder,
|
||||
InBlockCopySrcDataPerRead_B,
|
||||
InBlockCopyDstDataPerWrite_N2,
|
||||
WeiBlockCopySubLengths_E_K,
|
||||
WeiBlockCopyClusterLengths_E_K,
|
||||
WeiBlockCopyThreadClusterArrangeOrder,
|
||||
WeiBlockCopySrcAccessOrder,
|
||||
WeiBlockCopyDstAccessOrder,
|
||||
WeiBlockCopySrcDataPerRead_E,
|
||||
WeiBlockCopyDstDataPerWrite_K>{};
|
||||
|
||||
float time = launch_kernel(run_gridwise_convolution_kernel<decltype(gridwise_conv), T>,
|
||||
dim3(GridSize),
|
||||
|
||||
@@ -4,6 +4,7 @@
|
||||
#include "tensor.hpp"
|
||||
#include "gridwise_convolution_kernel_wrapper.hpp"
|
||||
#include "gridwise_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw_padded.hpp"
|
||||
#include "gridwise_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw_padded_lds_double_buffer.hpp"
|
||||
|
||||
template <class T,
|
||||
class InDesc,
|
||||
@@ -166,43 +167,48 @@ void device_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw_padded(InDesc,
|
||||
|
||||
printf("%s: BlockSize %u, GridSize %u \n", __func__, BlockSize, GridSize);
|
||||
|
||||
constexpr auto gridwise_conv = GridwiseConvolutionImplicitGemm_v4r4_nchw_kcyx_nkhw_padded<
|
||||
GridSize,
|
||||
BlockSize,
|
||||
T,
|
||||
decltype(in_nchw_desc),
|
||||
decltype(wei_kcyx_desc),
|
||||
decltype(out_nkhw_desc),
|
||||
ConvStrides,
|
||||
ConvDilations,
|
||||
LeftPads,
|
||||
RightPads,
|
||||
BPerBlock,
|
||||
KPerBlock,
|
||||
EPerBlock,
|
||||
GemmMPerThreadSubC,
|
||||
GemmNPerThreadSubC,
|
||||
GemmMLevel0Cluster,
|
||||
GemmNLevel0Cluster,
|
||||
GemmMLevel1Cluster,
|
||||
GemmNLevel1Cluster,
|
||||
GemmKPerThreadLoop,
|
||||
GemmDataPerReadA,
|
||||
GemmDataPerReadB,
|
||||
InBlockCopySubLengths_E_B,
|
||||
InBlockCopyClusterLengths_E_B,
|
||||
InBlockCopyThreadClusterArrangeOrder,
|
||||
InBlockCopySrcAccessOrder,
|
||||
InBlockCopyDstAccessOrder,
|
||||
InBlockCopyDataPerAccess_B,
|
||||
WeiBlockCopySubLengths_E_K,
|
||||
WeiBlockCopyClusterLengths_E_K,
|
||||
WeiBlockCopyThreadClusterArrangeOrder,
|
||||
WeiBlockCopySrcAccessOrder,
|
||||
WeiBlockCopyDstAccessOrder,
|
||||
WeiBlockCopySrcDataPerRead_E,
|
||||
WeiBlockCopyDstDataPerWrite_K,
|
||||
OutThreadCopyDataPerAccess_B>{};
|
||||
constexpr auto gridwise_conv =
|
||||
#if 0
|
||||
GridwiseConvolutionImplicitGemm_v4r4_nchw_kcyx_nkhw_padded
|
||||
#else
|
||||
GridwiseConvolutionImplicitGemm_v4r4_nchw_kcyx_nkhw_padded_lds_double_buffer
|
||||
#endif
|
||||
<GridSize,
|
||||
BlockSize,
|
||||
T,
|
||||
decltype(in_nchw_desc),
|
||||
decltype(wei_kcyx_desc),
|
||||
decltype(out_nkhw_desc),
|
||||
ConvStrides,
|
||||
ConvDilations,
|
||||
LeftPads,
|
||||
RightPads,
|
||||
BPerBlock,
|
||||
KPerBlock,
|
||||
EPerBlock,
|
||||
GemmMPerThreadSubC,
|
||||
GemmNPerThreadSubC,
|
||||
GemmMLevel0Cluster,
|
||||
GemmNLevel0Cluster,
|
||||
GemmMLevel1Cluster,
|
||||
GemmNLevel1Cluster,
|
||||
GemmKPerThreadLoop,
|
||||
GemmDataPerReadA,
|
||||
GemmDataPerReadB,
|
||||
InBlockCopySubLengths_E_B,
|
||||
InBlockCopyClusterLengths_E_B,
|
||||
InBlockCopyThreadClusterArrangeOrder,
|
||||
InBlockCopySrcAccessOrder,
|
||||
InBlockCopyDstAccessOrder,
|
||||
InBlockCopyDataPerAccess_B,
|
||||
WeiBlockCopySubLengths_E_K,
|
||||
WeiBlockCopyClusterLengths_E_K,
|
||||
WeiBlockCopyThreadClusterArrangeOrder,
|
||||
WeiBlockCopySrcAccessOrder,
|
||||
WeiBlockCopyDstAccessOrder,
|
||||
WeiBlockCopySrcDataPerRead_E,
|
||||
WeiBlockCopyDstDataPerWrite_K,
|
||||
OutThreadCopyDataPerAccess_B>{};
|
||||
|
||||
for(index_t i = 0; i < nrepeat; ++i)
|
||||
{
|
||||
|
||||
@@ -92,8 +92,8 @@ int main(int argc, char* argv[])
|
||||
// 3x3, 34x34
|
||||
constexpr index_t N = 64;
|
||||
constexpr index_t C = 256;
|
||||
constexpr index_t HI = 34;
|
||||
constexpr index_t WI = 34;
|
||||
constexpr index_t HI = 32;
|
||||
constexpr index_t WI = 32;
|
||||
constexpr index_t K = 128;
|
||||
constexpr index_t Y = 3;
|
||||
constexpr index_t X = 3;
|
||||
@@ -101,8 +101,8 @@ int main(int argc, char* argv[])
|
||||
using ConvStrides = Sequence<1, 1>;
|
||||
using ConvDilations = Sequence<1, 1>;
|
||||
|
||||
using LeftPads = Sequence<0, 0>;
|
||||
using RightPads = Sequence<0, 0>;
|
||||
using LeftPads = Sequence<1, 1>;
|
||||
using RightPads = Sequence<1, 1>;
|
||||
#elif 0
|
||||
// 1x1 filter, 8x8 image
|
||||
// cudnn@V100 68%, ck@V100 72%, ck@P100 52%, ck@VII 42%
|
||||
|
||||
Reference in New Issue
Block a user