rename files, added header guard, added namespace

2026-05-11 17:00:18 +00:00 · 2019-06-11 14:35:59 -05:00
parent 05e046654c
commit 88b77181aa
62 changed files with 580 additions and 1844 deletions
--- a/driver/device_convolution_direct_v2_nchw_kcyx_nkhw.hpp
+++ b/driver/device_convolution_direct_v2_nchw_kcyx_nkhw.hpp
@@ -1,9 +1,11 @@
 #pragma once
 #include <unistd.h>
 #include "device.hpp"
-#include "gridwise_convolution_wrapper.hpp"
+#include "gridwise_convolution_kernel_wrapper.hpp"
 #include "gridwise_convolution_direct_v2_nchw_kcyx_nkhw.hpp"

+using namespace ck;
+
 template <class T, class InDesc, class WeiDesc, class OutDesc>
 void device_convolution_direct_v2_nchw_kcyx_nkhw(InDesc,
                                                 const Tensor<T>& in,
@@ -79,7 +81,7 @@ void device_convolution_direct_v2_nchw_kcyx_nkhw(InDesc,
                                                                          WoPerThread,
                                                                          InBlockCopyDataPerRead,
                                                                          WeiBlockCopyDataPerRead>;
-        float time = launch_kernel(run_gridwise_convolution<gridwise_conv, T>,
+        float time = launch_kernel(run_gridwise_convolution_kernel<gridwise_conv, T>,
                                   dim3(GridSize),
                                   dim3(BlockSize),
                                   0,
--- a/driver/device_convolution_implicit_gemm_v1_chwn_cyxk_khwn.hpp
+++ b/driver/device_convolution_implicit_gemm_v1_chwn_cyxk_khwn.hpp
@@ -1,11 +1,13 @@
 #pragma once
 #include <unistd.h>
 #include "device.hpp"
-#include "gridwise_convolution_wrapper.hpp"
+#include "gridwise_convolution_kernel_wrapper.hpp"
 #include "gridwise_convolution_implicit_gemm_v1r1_chwn_cyxk_khwn.hpp"
 #include "gridwise_convolution_implicit_gemm_v1r2_chwn_cyxk_khwn.hpp"
 #include "gridwise_convolution_implicit_gemm_v1r3_chwn_cyxk_khwn.hpp"
-#include "gridwise_convolution_implicit_gemm_v1r3_lds_double_buffer_chwn_cyxk_khwn.hpp"
+#include "gridwise_convolution_implicit_gemm_v1r3_chwn_cyxk_khwn_lds_double_buffer.hpp"
+
+using namespace ck;

 template <class T, class InDesc, class WeiDesc, class OutDesc>
 void device_convolution_implicit_gemm_v1_chwn_cyxk_khwn(InDesc,
@@ -478,7 +480,7 @@ void device_convolution_implicit_gemm_v1_chwn_cyxk_khwn(InDesc,
 #elif 0
            GridwiseConvolutionImplicitGemm_v1r3_chwn_cyxk_khwn
 #elif 1
-            GridwiseConvolutionImplicitGemm_v1r3_lds_double_buffer_chwn_cyxk_khwn
+            GridwiseConvolutionImplicitGemm_v1r3_chwn_cyxk_khwn_lds_double_buffer
 #endif
            <GridSize,
             BlockSize,
@@ -509,7 +511,7 @@ void device_convolution_implicit_gemm_v1_chwn_cyxk_khwn(InDesc,
             WeiBlockCopyDataPerRead_K,
             OutThreadCopyDataPerWrite_N>{};

-        float time = launch_kernel(run_gridwise_convolution<decltype(gridwise_conv), T>,
+        float time = launch_kernel(run_gridwise_convolution_kernel<decltype(gridwise_conv), T>,
                                   dim3(GridSize),
                                   dim3(BlockSize),
                                   0,
--- a/driver/device_convolution_implicit_gemm_v1_nchw_cyxk_khwn.hpp
+++ b/driver/device_convolution_implicit_gemm_v1_nchw_cyxk_khwn.hpp
@@ -1,282 +0,0 @@
-#pragma once
-#include <unistd.h>
-#include "device.hpp"
-#include "gridwise_convolution_wrapper.hpp"
-#include "gridwise_convolution_implicit_gemm_v1r2_nchw_cyxk_khwn.hpp"
-#include "gridwise_convolution_implicit_gemm_v1r3_nchw_cyxk_khwn.hpp"
-#include "gridwise_convolution_implicit_gemm_v1r3_lds_double_buffer_nchw_cyxk_khwn.hpp"
-
-template <class T, class InDesc, class WeiDesc, class OutDesc>
-void device_convolution_implicit_gemm_v1_nchw_cyxk_khwn(InDesc,
-                                                        const Tensor<T>& in_nchw,
-                                                        WeiDesc,
-                                                        const Tensor<T>& wei_kcyx,
-                                                        OutDesc,
-                                                        Tensor<T>& out_nkhw,
-                                                        index_t nrepeat)
-{
-    constexpr auto I0 = Number<0>{};
-    constexpr auto I1 = Number<1>{};
-    constexpr auto I2 = Number<2>{};
-    constexpr auto I3 = Number<3>{};
-
-    constexpr auto in_nchw_desc  = InDesc{};
-    constexpr auto wei_kcyx_desc = WeiDesc{};
-    constexpr auto out_nkhw_desc = OutDesc{};
-
-    constexpr index_t Hi = in_nchw_desc.GetLength(I2);
-    constexpr index_t Wi = in_nchw_desc.GetLength(I3);
-
-    constexpr index_t N  = out_nkhw_desc.GetLength(I0);
-    constexpr index_t Ho = out_nkhw_desc.GetLength(I2);
-    constexpr index_t Wo = out_nkhw_desc.GetLength(I3);
-
-    constexpr index_t K = wei_kcyx_desc.GetLength(I0);
-    constexpr index_t C = wei_kcyx_desc.GetLength(I1);
-    constexpr index_t Y = wei_kcyx_desc.GetLength(I2);
-    constexpr index_t X = wei_kcyx_desc.GetLength(I3);
-
-    // reorder weight
-    auto wei_cyxk_desc = make_ConstantTensorDescriptor_packed(Sequence<C, Y, X, K>{});
-    ostream_ConstantTensorDescriptor(wei_cyxk_desc, std::cout << "wei_cyxk_desc: ");
-
-    Tensor<T> wei_cyxk(make_TensorDescriptor(wei_cyxk_desc));
-
-    auto f_reorder_kcyx2cyxk = [&](auto k, auto c, auto y, auto x) {
-        wei_cyxk(c, y, x, k) = wei_kcyx(k, c, y, x);
-    };
-
-    make_ParallelTensorFunctor(f_reorder_kcyx2cyxk, K, C, Y, X)(
-        std::thread::hardware_concurrency());
-
-    // output
-    auto out_khwn_desc = make_ConstantTensorDescriptor_packed(Sequence<K, Ho, Wo, N>{});
-    ostream_ConstantTensorDescriptor(out_khwn_desc, std::cout << "out_khwn_desc: ");
-
-    Tensor<T> out_khwn(make_TensorDescriptor(out_khwn_desc));
-
-    std::size_t data_sz = sizeof(T);
-    DeviceMem in_nchw_device_buf(data_sz * in_nchw.mDesc.GetElementSpace());
-    DeviceMem wei_cyxk_device_buf(data_sz * wei_cyxk.mDesc.GetElementSpace());
-    DeviceMem out_khwn_device_buf(data_sz * out_khwn.mDesc.GetElementSpace());
-
-    in_nchw_device_buf.ToDevice(in_nchw.mData.data());
-    wei_cyxk_device_buf.ToDevice(wei_cyxk.mData.data());
-    out_khwn_device_buf.ToDevice(out_khwn.mData.data());
-
-#if 1
-    // for 3x3, 34x34, v1r3, Pascal
-    constexpr index_t BlockSize = 128;
-
-    constexpr index_t NPerBlock  = 2;
-    constexpr index_t KPerBlock  = 128;
-    constexpr index_t CPerBlock  = 8;
-    constexpr index_t HoPerBlock = 2;
-    constexpr index_t WoPerBlock = 16;
-
-    constexpr index_t NPerThread  = 2;
-    constexpr index_t KPerThread  = 8;
-    constexpr index_t HoPerThread = 1;
-    constexpr index_t WoPerThread = 4;
-
-    constexpr index_t GemmMPerThreadSubC = 4;
-    constexpr index_t GemmNPerThreadSubC = 4;
-    constexpr index_t GemmMLevel0Cluster = 4;
-    constexpr index_t GemmNLevel0Cluster = 2;
-    constexpr index_t GemmMLevel1Cluster = 4;
-    constexpr index_t GemmNLevel1Cluster = 2;
-    constexpr index_t GemmKPerThreadLoop = 1;
-    constexpr index_t GemmDataPerReadA   = 4;
-    constexpr index_t GemmDataPerReadB   = 4;
-
-    using InBlockReorderSrcSubLengths_NCHW                    = Sequence<2, 1, 2, 1>;
-    using InBlockReorderSrcClusterLengths_NCHW                = Sequence<1, 8, 1, 16>;
-    using InBlockReorderMapThreadCluster2SrcCluster_CHNW2NCHW = Sequence<1, 2, 0, 3>;
-    constexpr index_t InBlockReorderDataPerRead_W  = 1; // v1r3 cannot do vector load input for NCHW
-    constexpr index_t InBlockReorderDataPerWrite_N = 2;
-
-    using WeiBlockCopyClusterLengths            = void;
-    constexpr index_t WeiBlockCopyDataPerRead_K = 4;
-
-    constexpr index_t OutThreadCopyDataPerWrite_N = 2;
-#elif 1
-    // for 3x3, 34x34, v1r3, Vega 20
-    constexpr index_t BlockSize = 256;
-
-    constexpr index_t NPerBlock  = 2;
-    constexpr index_t KPerBlock  = 128;
-    constexpr index_t CPerBlock  = 8;
-    constexpr index_t HoPerBlock = 4;
-    constexpr index_t WoPerBlock = 16;
-
-    constexpr index_t NPerThread  = 2;
-    constexpr index_t KPerThread  = 8;
-    constexpr index_t HoPerThread = 1;
-    constexpr index_t WoPerThread = 4;
-
-    constexpr index_t GemmMPerThreadSubC = 4;
-    constexpr index_t GemmNPerThreadSubC = 4;
-    constexpr index_t GemmMLevel0Cluster = 4;
-    constexpr index_t GemmNLevel0Cluster = 2;
-    constexpr index_t GemmMLevel1Cluster = 4;
-    constexpr index_t GemmNLevel1Cluster = 2;
-    constexpr index_t GemmKPerThreadLoop = 1;
-    constexpr index_t GemmDataPerReadA   = 4;
-    constexpr index_t GemmDataPerReadB   = 4;
-
-    using InBlockReorderSrcSubLengths_NCHW                    = Sequence<2, 1, 2, 1>;
-    using InBlockReorderSrcClusterLengths_NCHW                = Sequence<1, 8, 2, 16>;
-    using InBlockReorderMapThreadCluster2SrcCluster_CHNW2NCHW = Sequence<1, 2, 0, 3>;
-    constexpr index_t InBlockReorderDataPerRead_W  = 1; // v1r3 cannot do vector load input for NCHW
-    constexpr index_t InBlockReorderDataPerWrite_N = 2;
-
-    using WeiBlockCopyClusterLengths            = void;
-    constexpr index_t WeiBlockCopyDataPerRead_K = 4;
-
-    constexpr index_t OutThreadCopyDataPerWrite_N = 2;
-#elif 0
-    // for 3x3, 28x28, v1r2, Pascal
-    constexpr index_t BlockSize = 128;
-
-    constexpr index_t NPerBlock  = 16;
-    constexpr index_t KPerBlock  = 128;
-    constexpr index_t CPerBlock  = 8;
-    constexpr index_t HoPerBlock = 2;
-    constexpr index_t WoPerBlock = 2;
-
-    constexpr index_t NPerThread  = 4;
-    constexpr index_t KPerThread  = 8;
-    constexpr index_t HoPerThread = 1;
-    constexpr index_t WoPerThread = 2;
-
-    constexpr index_t GemmMPerThreadSubC = 4;
-    constexpr index_t GemmNPerThreadSubC = 4;
-    constexpr index_t GemmMLevel0Cluster = 4;
-    constexpr index_t GemmNLevel0Cluster = 2;
-    constexpr index_t GemmMLevel1Cluster = 4;
-    constexpr index_t GemmNLevel1Cluster = 2;
-    constexpr index_t GemmKPerThreadLoop = 1;
-    constexpr index_t GemmDataPerReadA   = 4;
-    constexpr index_t GemmDataPerReadB   = 4;
-
-    using InBlockReorderSrcSubLengths_NCHW                    = Sequence<4, 1, 1, 2>;
-    using InBlockReorderSrcClusterLengths_NCHW                = Sequence<4, 8, 2, 2>;
-    using InBlockReorderMapThreadCluster2SrcCluster_CHNW2NCHW = Sequence<1, 2, 0, 3>;
-    constexpr index_t InBlockReorderDataPerRead_W             = 2;
-    constexpr index_t InBlockReorderDataPerWrite_N            = 4;
-
-    using WeiBlockCopyClusterLengths            = Sequence<4, 1, 32>;
-    constexpr index_t WeiBlockCopyDataPerRead_K = 4;
-
-    constexpr index_t OutThreadCopyDataPerWrite_N = 2;
-#elif 0
-    // for 3x3, 28x28, v1r3, Pascal, bad
-    constexpr index_t BlockSize = 128;
-
-    constexpr index_t NPerBlock  = 16;
-    constexpr index_t KPerBlock  = 128;
-    constexpr index_t CPerBlock  = 8;
-    constexpr index_t HoPerBlock = 2;
-    constexpr index_t WoPerBlock = 2;
-
-    constexpr index_t NPerThread  = 4;
-    constexpr index_t KPerThread  = 8;
-    constexpr index_t HoPerThread = 1;
-    constexpr index_t WoPerThread = 2;
-
-    constexpr index_t GemmMPerThreadSubC = 4;
-    constexpr index_t GemmNPerThreadSubC = 4;
-    constexpr index_t GemmMLevel0Cluster = 4;
-    constexpr index_t GemmNLevel0Cluster = 2;
-    constexpr index_t GemmMLevel1Cluster = 4;
-    constexpr index_t GemmNLevel1Cluster = 2;
-    constexpr index_t GemmKPerThreadLoop = 1;
-    constexpr index_t GemmDataPerReadA   = 4;
-    constexpr index_t GemmDataPerReadB   = 4;
-
-    using InBlockReorderSrcSubLengths_NCHW                    = Sequence<4, 1, 1, 1>;
-    using InBlockReorderSrcClusterLengths_NCHW                = Sequence<4, 8, 2, 2>;
-    using InBlockReorderMapThreadCluster2SrcCluster_CHNW2NCHW = Sequence<1, 2, 0, 3>;
-    constexpr index_t InBlockReorderDataPerRead_W  = 1; // v1r3 cannot do vector load input for NCHW
-    constexpr index_t InBlockReorderDataPerWrite_N = 1;
-
-    using WeiBlockCopyClusterLengths            = void;
-    constexpr index_t WeiBlockCopyDataPerRead_K = 4;
-
-    constexpr index_t OutThreadCopyDataPerWrite_N = 2;
-#endif
-
-    constexpr index_t GridSize =
-        ((N + NPerBlock - 1) / NPerBlock) * ((K + KPerBlock - 1) / KPerBlock) *
-        ((Ho + HoPerBlock - 1) / HoPerBlock) * ((Wo + WoPerBlock - 1) / WoPerBlock);
-
-    printf("%s: BlockSize %u, GridSize %u \n", __func__, BlockSize, GridSize);
-
-    for(index_t i = 0; i < nrepeat; ++i)
-    {
-        constexpr auto gridwise_conv =
-#if 0
-            GridwiseConvolutionImplicitGemm_v1r2_nchw_cyxk_khwn
-#elif 0
-            GridwiseConvolutionImplicitGemm_v1r3_nchw_cyxk_khwn
-#elif 1
-            GridwiseConvolutionImplicitGemm_v1r3_lds_double_buffer_nchw_cyxk_khwn
-#endif
-            <GridSize,
-             BlockSize,
-             T,
-             decltype(in_nchw_desc),
-             decltype(wei_cyxk_desc),
-             decltype(out_khwn_desc),
-             NPerBlock,
-             KPerBlock,
-             CPerBlock,
-             HoPerBlock,
-             WoPerBlock,
-             NPerThread,
-             KPerThread,
-             HoPerThread,
-             WoPerThread,
-             GemmMPerThreadSubC,
-             GemmNPerThreadSubC,
-             GemmMLevel0Cluster,
-             GemmNLevel0Cluster,
-             GemmMLevel1Cluster,
-             GemmNLevel1Cluster,
-             GemmKPerThreadLoop,
-             GemmDataPerReadA,
-             GemmDataPerReadB,
-             InBlockReorderSrcSubLengths_NCHW,
-             InBlockReorderSrcClusterLengths_NCHW,
-             InBlockReorderMapThreadCluster2SrcCluster_CHNW2NCHW,
-             InBlockReorderDataPerRead_W,
-             InBlockReorderDataPerWrite_N,
-             WeiBlockCopyClusterLengths,
-             WeiBlockCopyDataPerRead_K,
-             OutThreadCopyDataPerWrite_N>{};
-
-        float time = launch_kernel(run_gridwise_convolution<decltype(gridwise_conv), T>,
-                                   dim3(GridSize),
-                                   dim3(BlockSize),
-                                   0,
-                                   static_cast<T*>(in_nchw_device_buf.GetDeviceBuffer()),
-                                   static_cast<T*>(wei_cyxk_device_buf.GetDeviceBuffer()),
-                                   static_cast<T*>(out_khwn_device_buf.GetDeviceBuffer()));
-
-        printf("Elapsed time : %f ms, %f TFlop/s\n",
-               time,
-               (float)calculate_convolution_flops(InDesc{}, WeiDesc{}, OutDesc{}) /
-                   (std::size_t(1000) * 1000 * 1000) / time);
-        usleep(std::min(time * 1000, float(10000)));
-    }
-
-    out_khwn_device_buf.FromDevice(out_khwn.mData.data());
-
-    // reorder output
-    auto f_reorder_khwn2nkhw = [&](auto k, auto ho, auto wo, auto n) {
-        out_nkhw(n, k, ho, wo) = out_khwn(k, ho, wo, n);
-    };
-
-    make_ParallelTensorFunctor(f_reorder_khwn2nkhw, K, Ho, Wo, N)(
-        std::thread::hardware_concurrency());
-}
--- a/driver/device_convolution_implicit_gemm_v1_nchw_cyxk_nkhw.hpp
+++ b/driver/device_convolution_implicit_gemm_v1_nchw_cyxk_nkhw.hpp
@@ -1,9 +1,11 @@
 #pragma once
 #include <unistd.h>
 #include "device.hpp"
-#include "gridwise_convolution_wrapper.hpp"
+#include "gridwise_convolution_kernel_wrapper.hpp"
 #include "gridwise_convolution_implicit_gemm_v1r3_nchw_cyxk_nkhw.hpp"
-#include "gridwise_convolution_implicit_gemm_v1r3_lds_double_buffer_nchw_cyxk_nkhw.hpp"
+#include "gridwise_convolution_implicit_gemm_v1r3_nchw_cyxk_nkhw_lds_double_buffer.hpp"
+
+using namespace ck;

 template <class T, class InDesc, class WeiDesc, class OutDesc>
 void device_convolution_implicit_gemm_v1_nchw_cyxk_nkhw(InDesc,
@@ -313,10 +315,10 @@ void device_convolution_implicit_gemm_v1_nchw_cyxk_nkhw(InDesc,
    for(index_t i = 0; i < nrepeat; ++i)
    {
        constexpr auto gridwise_conv =
-#if 1
+#if 0
            GridwiseConvolutionImplicitGemm_v1r3_nchw_cyxk_nkhw
 #else
-            GridwiseConvolutionImplicitGemm_v1r3_lds_double_buffer_nchw_cyxk_nkhw
+            GridwiseConvolutionImplicitGemm_v1r3_nchw_cyxk_nkhw_lds_double_buffer
 #endif
            <GridSize,
             BlockSize,
@@ -351,7 +353,7 @@ void device_convolution_implicit_gemm_v1_nchw_cyxk_nkhw(InDesc,
             WeiBlockCopyDataPerRead_K,
             OutThreadCopyDataPerWrite_W>{};

-        float time = launch_kernel(run_gridwise_convolution<decltype(gridwise_conv), T>,
+        float time = launch_kernel(run_gridwise_convolution_kernel<decltype(gridwise_conv), T>,
                                   dim3(GridSize),
                                   dim3(BlockSize),
                                   0,
--- a/driver/device_convolution_implicit_gemm_v2_chwn_cyxk_khwn.hpp
+++ b/driver/device_convolution_implicit_gemm_v2_chwn_cyxk_khwn.hpp
@@ -1,10 +1,12 @@
 #pragma once
 #include <unistd.h>
 #include "device.hpp"
-#include "gridwise_convolution_wrapper.hpp"
+#include "gridwise_convolution_kernel_wrapper.hpp"
 #include "gridwise_convolution_implicit_gemm_v2_chwn_cyxk_khwn.hpp"
 #include "gridwise_convolution_implicit_gemm_v2_chwn_cyxk_khwn_lds_double_buffer.hpp"

+using namespace ck;
+
 template <class T, class InDesc, class WeiDesc, class OutDesc>
 void device_convolution_implicit_gemm_v2_chwn_cyxk_khwn(InDesc,
                                                        const Tensor<T>& in_nchw,
@@ -303,7 +305,7 @@ void device_convolution_implicit_gemm_v2_chwn_cyxk_khwn(InDesc,
             WeiBlockCopyDataPerRead,
             OutThreadCopyDataPerWrite>{};

-        float time = launch_kernel(run_gridwise_convolution<decltype(gridwise_conv), T>,
+        float time = launch_kernel(run_gridwise_convolution_kernel<decltype(gridwise_conv), T>,
                                   dim3(GridSize),
                                   dim3(BlockSize),
                                   0,
--- a/driver/device_convolution_implicit_gemm_v3_nchw_cyxk_nkhw.hpp
+++ b/driver/device_convolution_implicit_gemm_v3_nchw_cyxk_nkhw.hpp
@@ -1,9 +1,11 @@
 #pragma once
 #include <unistd.h>
 #include "device.hpp"
-#include "gridwise_convolution_wrapper.hpp"
+#include "gridwise_convolution_kernel_wrapper.hpp"
 #include "gridwise_convolution_implicit_gemm_v3_nchw_cyxk_nkhw.hpp"
-#include "gridwise_convolution_implicit_gemm_v3_lds_double_buffer_nchw_cyxk_nkhw.hpp"
+#include "gridwise_convolution_implicit_gemm_v3_nchw_cyxk_nkhw_lds_double_buffer.hpp"
+
+using namespace ck;

 template <class T, class InDesc, class WeiDesc, class OutDesc>
 void device_convolution_implicit_gemm_v3_nchw_cyxk_nkhw(InDesc,
@@ -102,7 +104,7 @@ void device_convolution_implicit_gemm_v3_nchw_cyxk_nkhw(InDesc,
 #if 0
            GridwiseConvolutionImplicitGemm_v3_nchw_cyxk_nkhw
 #else
-            GridwiseConvolutionImplicitGemm_v3_lds_double_buffer_nchw_cyxk_nkhw
+            GridwiseConvolutionImplicitGemm_v3_nchw_cyxk_nkhw_lds_double_buffer
 #endif
            <GridSize,
             BlockSize,
@@ -133,7 +135,7 @@ void device_convolution_implicit_gemm_v3_nchw_cyxk_nkhw(InDesc,
             WeiBlockCopyDataPerAccess_K>{};

 #if 1
-        float time = launch_kernel(run_gridwise_convolution<decltype(gridwise_conv), T>,
+        float time = launch_kernel(run_gridwise_convolution_kernel<decltype(gridwise_conv), T>,
                                   dim3(GridSize),
                                   dim3(BlockSize),
                                   0,
--- a/driver/device_convolution_implicit_gemm_v4_nchw_kcyx_nkhw.hpp
+++ b/driver/device_convolution_implicit_gemm_v4_nchw_kcyx_nkhw.hpp
@@ -1,9 +1,11 @@
 #pragma once
 #include <unistd.h>
 #include "device.hpp"
-#include "gridwise_convolution_wrapper.hpp"
+#include "gridwise_convolution_kernel_wrapper.hpp"
 #include "gridwise_convolution_implicit_gemm_v4_nchw_kcyx_nkhw.hpp"
-#include "gridwise_convolution_implicit_gemm_v4_lds_double_buffer_nchw_kcyx_nkhw.hpp"
+#include "gridwise_convolution_implicit_gemm_v4_nchw_kcyx_nkhw_lds_double_buffer.hpp"
+
+using namespace ck;

 template <class T, class InDesc, class WeiDesc, class OutDesc>
 void device_convolution_implicit_gemm_v4_nchw_kcyx_nkhw(InDesc,
@@ -96,7 +98,7 @@ void device_convolution_implicit_gemm_v4_nchw_kcyx_nkhw(InDesc,
 #if 0
            GridwiseConvolutionImplicitGemm_v4_nchw_kcyx_nkhw
 #else
-            GridwiseConvolutionImplicitGemm_v4_lds_double_buffer_nchw_kcyx_nkhw
+            GridwiseConvolutionImplicitGemm_v4_nchw_kcyx_nkhw_lds_double_buffer
 #endif
            <GridSize,
             BlockSize,
@@ -133,7 +135,7 @@ void device_convolution_implicit_gemm_v4_nchw_kcyx_nkhw(InDesc,
             WeiBlockCopySrcDataPerRead_E,
             WeiBlockCopyDstDataPerWrite_K>{};

-        float time = launch_kernel(run_gridwise_convolution<decltype(gridwise_conv), T>,
+        float time = launch_kernel(run_gridwise_convolution_kernel<decltype(gridwise_conv), T>,
                                   dim3(GridSize),
                                   dim3(BlockSize),
                                   0,
--- a/driver/device_direct_convolution_2_vectorized_nchw_kcyx_nkhw.hpp
+++ b/driver/device_direct_convolution_2_vectorized_nchw_kcyx_nkhw.hpp
@@ -3,6 +3,8 @@
 #include "device.hpp"
 #include "gridwise_direct_convolution_2_vectorized_nchw_kcyx_nkhw.hpp"

+using namespace ck;
+
 template <class TInWei, class TOut, class InDesc, class WeiDesc, class OutDesc>
 void device_direct_convolution_2_vectorized_nchw_kcyx_nkhw(InDesc,
                                                           const Tensor<TInWei>& in_nchw,
--- a/driver/device_implicit_gemm_convolution_1_chwn_cyxk_khwn_padded.hpp
+++ b/driver/device_implicit_gemm_convolution_1_chwn_cyxk_khwn_padded.hpp
@@ -3,6 +3,8 @@
 #include "device.hpp"
 #include "gridwise_implicit_gemm_convolution_1_chwn_cyxk_khwn_padded.hpp"

+using namespace ck;
+
 template <class T, class InDesc, class WeiDesc, class OutDesc, class LowerPads, class UpperPads>
 void device_implicit_gemm_convolution_1_chwn_cyxk_khwn_padded(InDesc,
                                                              const Tensor<T>& in_nchw,
--- a/driver/driver.cpp
+++ b/driver/driver.cpp
@@ -3,19 +3,19 @@
 #include <initializer_list>
 #include <cstdlib>
 #include <stdlib.h>
-#include "config.h"
+#include "config.hpp"
 #include "tensor.hpp"
 #include "ConstantTensorDescriptor.hpp"
 #include "conv_common.hpp"
 #include "device_convolution_direct_v2_nchw_kcyx_nkhw.hpp"
-//#include "device_direct_convolution_2_vectorized_nchw_kcyx_nkhw.hpp"
 #include "device_convolution_implicit_gemm_v1_chwn_cyxk_khwn.hpp"
-#include "device_convolution_implicit_gemm_v1_nchw_cyxk_khwn.hpp"
 #include "device_convolution_implicit_gemm_v1_nchw_cyxk_nkhw.hpp"
 #include "device_convolution_implicit_gemm_v2_chwn_cyxk_khwn.hpp"
 #include "device_convolution_implicit_gemm_v3_nchw_cyxk_nkhw.hpp"
 #include "device_convolution_implicit_gemm_v4_nchw_kcyx_nkhw.hpp"

+using namespace ck;
+
 struct GeneratorTensor_1
 {
    template <class... Is>
@@ -419,7 +419,7 @@ int main(int argc, char* argv[])

    constexpr index_t HPad = 0;
    constexpr index_t WPad = 0;
-#elif 0
+#elif 1
    // 3x3, 34x34
    constexpr index_t N  = 64;
    constexpr index_t C  = 256;
@@ -633,15 +633,9 @@ int main(int argc, char* argv[])

 #if 1
 #if 0
-    device_direct_convolution_1
-#elif 0
    device_convolution_direct_v2_nchw_kcyx_nkhw
-#elif 0
-    device_direct_convolution_2_vectorized_nchw_kcyx_nkhw
 #elif 0
    device_convolution_implicit_gemm_v1_chwn_cyxk_khwn
-#elif 0
-    device_convolution_implicit_gemm_v1_nchw_cyxk_khwn
 #elif 0
    device_convolution_implicit_gemm_v1_nchw_cyxk_nkhw
 #elif 0