diff --git a/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v1r3_chwn_cyxk_khwn.hpp b/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v1r3_chwn_cyxk_khwn.hpp
index 1455e1797b..2d1cd532de 100644
--- a/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v1r3_chwn_cyxk_khwn.hpp
+++ b/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v1r3_chwn_cyxk_khwn.hpp
@@ -83,7 +83,7 @@ struct GridwiseConvolutionImplicitGemm_v1r3_chwn_cyxk_khwn
         constexpr index_t HBlockWork = math::integer_divide_ceil(Ho, HoPerBlock);
         constexpr index_t WBlockWork = math::integer_divide_ceil(Wo, WoPerBlock);
 
-        constexpr auto block_work_desc = make_ConstantTensorDescriptor(
+        constexpr auto block_work_desc = make_ConstantTensorDescriptor_packed(
             Sequence{});
 
         const auto block_work_multi_id =
@@ -109,8 +109,7 @@ struct GridwiseConvolutionImplicitGemm_v1r3_chwn_cyxk_khwn
             GemmDataPerReadB);
 
         constexpr auto in_c_h_w_n_block_desc = make_ConstantTensorDescriptor_aligned(
-            Sequence{},
-            Number{});
+            Sequence{}, Number{});
 
         // this check is ad-hoc
         // TODO: need to properly implement tensor descriptor with alignment
@@ -118,11 +117,10 @@ struct GridwiseConvolutionImplicitGemm_v1r3_chwn_cyxk_khwn
                       "GemmDataPerReadB alignment requirement is not meet");
 
         constexpr auto wei_c_k_block_desc = make_ConstantTensorDescriptor_aligned(
-            Sequence{},
-            Number{});
+            Sequence{}, Number{});
 
         // tensor view of threadwise output in register
-        constexpr auto out_k_h_w_n_thread_desc = make_ConstantTensorDescriptor(
+        constexpr auto out_k_h_w_n_thread_desc = make_ConstantTensorDescriptor_packed(
             Sequence{});
 
         // blockwise copy
@@ -144,7 +142,7 @@ struct GridwiseConvolutionImplicitGemm_v1r3_chwn_cyxk_khwn
                                                decltype(wei_c_k_global_desc),
                                                decltype(wei_c_k_block_desc),
                                                decltype(wei_c_k_block_desc.GetLengths()),
-                                               WeiBlockCopyDataPerRead_K>{};
+                                               WeiBlockCopyDataPerRead_K>({0, 0}, {0, 0});
 
         // a series of blockwise batched GEMM
         // C_matrix += transpose(A_matrix) * B_matrix
@@ -186,22 +184,9 @@ struct GridwiseConvolutionImplicitGemm_v1r3_chwn_cyxk_khwn
                                                           GemmDataPerReadA,
                                                           GemmDataPerReadB>{};
 
-        // choose GEMM implementation here
-        const auto run_blockwise_batch_gemm = [&](auto... Xs) {
-#if 0
-            return blockwise_batch_gemm.Run(Xs...);
-#elif 0
-            return blockwise_batch_gemm.Run_amd_asm(Xs...);
-#else
-            return blockwise_batch_gemm.Run_asm_v2(Xs...);
-#endif
-        };
-
-        // LDS: be careful of alignment
-        // TODO:: need to properly implement tensor descriptor with alignment
-        constexpr index_t in_block_space =
-            in_c_h_w_n_block_desc.GetElementSpace(Number{});
-        constexpr index_t wei_block_space = wei_c_k_block_desc.GetElementSpace(Number{});
+        constexpr index_t in_block_space = in_c_h_w_n_block_desc.GetElementSpace();
+        constexpr index_t wei_block_space = wei_c_k_block_desc.GetElementSpace();
 
         __shared__ Float p_in_block[in_block_space];
         __shared__ Float p_wei_block[wei_block_space];
 
@@ -225,7 +210,7 @@ struct GridwiseConvolutionImplicitGemm_v1r3_chwn_cyxk_khwn
 #endif
 
         // set threadwise output tensor to 0
-        threadwise_4d_tensor_set_zero(out_k_h_w_n_thread_desc, p_out_thread);
+        threadwise_matrix_set_zero(c_k_wn_thread_mtx_desc, p_out_thread);
 
 #if 1
         const Float* p_in_global_block_offset =
@@ -258,7 +243,7 @@ struct GridwiseConvolutionImplicitGemm_v1r3_chwn_cyxk_khwn
 
             __syncthreads();
 
-            run_blockwise_batch_gemm(p_wei_block, p_in_block, p_out_thread);
+            blockwise_batch_gemm.Run(p_wei_block, p_in_block, p_out_thread);
 
             __syncthreads();
         }
@@ -291,7 +276,7 @@ struct GridwiseConvolutionImplicitGemm_v1r3_chwn_cyxk_khwn
 
            __syncthreads();
 
-            run_blockwise_batch_gemm(p_wei_block, p_in_block, p_out_thread);
+            blockwise_batch_gemm.Run(p_wei_block, p_in_block, p_out_thread);
 
             __syncthreads();
         }
@@ -308,13 +293,12 @@ struct GridwiseConvolutionImplicitGemm_v1r3_chwn_cyxk_khwn
         const index_t wo_thread_data_begin = c_thread_mtx_begin.col / NPerBlock;
         const index_t n_thread_data_begin = c_thread_mtx_begin.col % NPerBlock;
 
-        static_if{}([&](auto f_dummy) { // f_dummy do nothing but
-                                        // perfect forwarding.
-                                        // Using this trick to
-            // make this lambda a generic lambda, so it won't be compiled until
-            // instantiated
+        static_if{}([&](auto fwd) {
+            // fwd does nothing but perfect forwarding.
+            // This trick makes this lambda a generic lambda, so it won't be compiled until
+            // it is instantiated here
             static_assert(
-                (f_dummy(GemmNPerThreadSubC) <= NPerBlock && NPerBlock % GemmNPerThreadSubC == 0),
+                (fwd(GemmNPerThreadSubC) <= NPerBlock && NPerBlock % GemmNPerThreadSubC == 0),
                 "wrong!");
 
             // output is a 10d tensor
@@ -322,38 +306,33 @@ struct GridwiseConvolutionImplicitGemm_v1r3_chwn_cyxk_khwn
             constexpr index_t N1 = NPerBlock / N2;
 
             constexpr index_t W2 =
-                (GemmNLevel0Cluster * GemmNLevel1Cluster) / f_dummy(NPerBlock / GemmNPerThreadSubC);
+                (GemmNLevel0Cluster * GemmNLevel1Cluster) / fwd(NPerBlock / GemmNPerThreadSubC);
             constexpr index_t W1 = WoPerBlock / W2;
 
             constexpr index_t K2 = GemmMPerThreadSubC;
             constexpr index_t K1 = KPerBlock / KPerThread;
 
-            constexpr auto out_10d_global_desc =
-                make_ConstantTensorDescriptor(Sequence{});
+            constexpr auto out_10d_global_desc = fwd(out_k_h_w_n_global_desc)
+                                                     .Fold(I3, Number{}, Number{})
+                                                     .Fold(I2, Number{}, Number{})
+                                                     .Fold(I0, Number{}, Number{});
 
-            constexpr auto out_10d_thread_desc = make_ConstantTensorDescriptor(
-                Sequence{});
+            constexpr auto out_10d_thread_desc = fwd(out_k_h_w_n_thread_desc)
+                                                     .Fold(I3, Number<1>{}, Number{})
+                                                     .Fold(I2, Number{}, Number<1>{})
+                                                     .Fold(I0, Number<1>{}, Number{});
 
 #if 0
-            if(get_thread_local_1d_id() == 0 && get_block_1d_id() == 0)
-            {
-                print_ConstantTensorDescriptor(out_k_h_w_n_thread_desc,
-                                               "out_k_h_w_n_thread_desc");
-                print_ConstantTensorDescriptor(out_10d_thread_desc, "out_10d_thread_desc");
+            if(get_thread_local_1d_id() == 0 && get_block_1d_id() == 0)
+            {
+                print_ConstantTensorDescriptor(out_k_h_w_n_thread_desc,
+                                               "a: out_k_h_w_n_thread_desc");
+                print_ConstantTensorDescriptor(out_10d_thread_desc, "a: out_10d_thread_desc");
 
-                print_ConstantTensorDescriptor(out_k_h_w_n_global_desc,
-                                               "out_k_h_w_n_global_desc");
-                print_ConstantTensorDescriptor(out_10d_global_desc, "out_10d_global_desc");
-            }
+                print_ConstantTensorDescriptor(out_k_h_w_n_global_desc,
+                                               "a: out_k_h_w_n_global_desc");
+                print_ConstantTensorDescriptor(out_10d_global_desc, "a: out_10d_global_desc");
+            }
 #endif
 
             threadwise_tensor_slice_copy(out_10d_thread_desc,
@@ -367,8 +346,8 @@ struct GridwiseConvolutionImplicitGemm_v1r3_chwn_cyxk_khwn
                                              n_block_data_begin + n_thread_data_begin),
                                          out_10d_thread_desc.GetLengths(),
                                          Number{});
-        }).Else([&](auto f_dummy) {
-            static_assert(f_dummy(GemmNPerThreadSubC) >= NPerBlock && NPerThread == NPerBlock &&
+        }).Else([&](auto fwd) {
+            static_assert(fwd(GemmNPerThreadSubC) >= NPerBlock && NPerThread == NPerBlock &&
                               GemmNPerThreadSubC % NPerThread == 0,
                           "wrong!");
 
@@ -377,33 +356,34 @@ struct GridwiseConvolutionImplicitGemm_v1r3_chwn_cyxk_khwn
             constexpr index_t W3 = GemmNPerThreadSubC / NPerBlock;
             constexpr index_t W2 = GemmNLevel0Cluster * GemmNLevel1Cluster;
-            constexpr index_t W1 = WoPerBlock / f_dummy(W2 * W3);
+            constexpr index_t W1 = WoPerBlock / fwd(W2 * W3);
 
             constexpr index_t K2 = GemmMPerThreadSubC;
             constexpr index_t K1 = KPerBlock / KPerThread;
 
-            constexpr auto out_10d_global_desc = make_ConstantTensorDescriptor(
-                Sequence{});
+            constexpr auto out_10d_global_desc =
+                fwd(out_k_h_w_n_global_desc)
+                    .Fold(I3, Number{})
+                    .Fold(I2, Number{}, Number{}, Number{})
+                    .Fold(I0, Number{}, Number{});
 
-            constexpr auto out_10d_thread_desc = make_ConstantTensorDescriptor(
-                Sequence{});
+            constexpr auto out_10d_thread_desc =
+                fwd(out_k_h_w_n_thread_desc)
+                    .Fold(I3, Number{})
+                    .Fold(I2, Number{}, Number<1>{}, Number{})
+                    .Fold(I0, Number<1>{}, Number{});
 
 #if 0
-            if(get_thread_local_1d_id() == 0 && get_block_1d_id() == 0)
-            {
-                print_ConstantTensorDescriptor(out_k_h_w_n_thread_desc,
-                                               "out_k_h_w_n_thread_desc");
-                print_ConstantTensorDescriptor(out_10d_thread_desc, "out_10d_thread_desc");
+            if(get_thread_local_1d_id() == 0 && get_block_1d_id() == 0)
+            {
+                print_ConstantTensorDescriptor(out_k_h_w_n_thread_desc,
+                                               "b: out_k_h_w_n_thread_desc");
+                print_ConstantTensorDescriptor(out_10d_thread_desc, "b: out_10d_thread_desc");
 
-                print_ConstantTensorDescriptor(out_k_h_w_n_global_desc,
-                                               "out_k_h_w_n_global_desc");
-                print_ConstantTensorDescriptor(out_10d_global_desc, "out_10d_global_desc");
-
-                for(index_t i = 0; i < 64; ++i)
-                {
-                    printf("out %f, ", p_out_thread[i]);
-                }
-            }
+                print_ConstantTensorDescriptor(out_k_h_w_n_global_desc,
+                                               "b: out_k_h_w_n_global_desc");
+                print_ConstantTensorDescriptor(out_10d_global_desc, "b: out_10d_global_desc");
+            }
 #endif
 
             threadwise_tensor_slice_copy(out_10d_thread_desc,
diff --git a/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v1r3_chwn_cyxk_khwn_lds_double_buffer.hpp b/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v1r3_chwn_cyxk_khwn_lds_double_buffer.hpp
index c5e1712332..0df27009ad 100644
--- a/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v1r3_chwn_cyxk_khwn_lds_double_buffer.hpp
+++ b/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v1r3_chwn_cyxk_khwn_lds_double_buffer.hpp
@@ -114,8 +114,7 @@ struct GridwiseConvolutionImplicitGemm_v1r3_chwn_cyxk_khwn_lds_double_buffer
             GemmDataPerReadB);
 
         constexpr auto in_c_h_w_n_block_desc = make_ConstantTensorDescriptor_aligned(
-            Sequence{},
-            Number{});
+            Sequence{}, Number{});
 
         // this check is ad-hoc
         // TODO: need to properly implement tensor descriptor with alignment
@@ -123,8 +122,7 @@ struct GridwiseConvolutionImplicitGemm_v1r3_chwn_cyxk_khwn_lds_double_buffer
                       "GemmDataPerReadB alignment requirement is not meet");
 
         constexpr auto wei_c_k_block_desc = make_ConstantTensorDescriptor_aligned(
-            Sequence{},
-            Number{});
+            Sequence{}, Number{});
 
         // tensor view of threadwise output in register
         constexpr auto out_k_h_w_n_thread_desc = make_ConstantTensorDescriptor_packed(
@@ -201,21 +199,9 @@ struct GridwiseConvolutionImplicitGemm_v1r3_chwn_cyxk_khwn_lds_double_buffer
                                                           GemmDataPerReadA,
                                                           GemmDataPerReadB>{};
 
-        // choose GEMM implementation here
-        const auto run_blockwise_batch_gemm = [&](auto... Xs) {
-#if 1
-            return blockwise_batch_gemm.Run(Xs...);
-#elif 0
-            return blockwise_batch_gemm.Run_amd_asm(Xs...);
-#else
-            return blockwise_batch_gemm.Run_asm_v2(Xs...);
-#endif
-        };
-
-        // LDS: be careful of alignment
-        constexpr index_t in_block_space =
-            in_c_h_w_n_block_desc.GetElementSpace(Number{});
-        constexpr index_t wei_block_space = wei_c_k_block_desc.GetElementSpace(Number{});
+        constexpr index_t in_block_space = in_c_h_w_n_block_desc.GetElementSpace();
+        constexpr index_t wei_block_space = wei_c_k_block_desc.GetElementSpace();
 
         // LDS double buffer
         __shared__ Float p_in_block_double[2 * in_block_space];
         __shared__ Float p_wei_block_double[2 * wei_block_space];
@@ -307,7 +293,7 @@ struct GridwiseConvolutionImplicitGemm_v1r3_chwn_cyxk_khwn_lds_double_buffer
             blockwise_wei_copy.RunLoadRegisterClipboard(p_wei_global_block_offset,
                                                         p_wei_register_clipboard);
 
-            run_blockwise_batch_gemm(p_wei_block_now, p_in_block_now, p_out_thread);
+            blockwise_batch_gemm.Run(p_wei_block_now, p_in_block_now, p_out_thread);
 
             // LDS double buffer: store next data to LDS
             blockwise_in_copy.RunStoreRegisterClipboard(p_in_register_clipboard,
@@ -335,7 +321,7 @@ struct GridwiseConvolutionImplicitGemm_v1r3_chwn_cyxk_khwn_lds_double_buffer
                                                         p_wei_register_clipboard);
 
             // LDS double buffer: GEMM on current data
-            run_blockwise_batch_gemm(p_wei_block_double, p_in_block_double, p_out_thread);
+            blockwise_batch_gemm.Run(p_wei_block_double, p_in_block_double, p_out_thread);
 
             // LDS double buffer: store next data to LDS
             blockwise_in_copy.RunStoreRegisterClipboard(p_in_register_clipboard,
@@ -347,7 +333,7 @@ struct GridwiseConvolutionImplicitGemm_v1r3_chwn_cyxk_khwn_lds_double_buffer
         __syncthreads();
 
         // LDS double buffer: GEMM on current data
-        run_blockwise_batch_gemm(p_wei_block_double + wei_block_space,
+        blockwise_batch_gemm.Run(p_wei_block_double + wei_block_space,
                                  p_in_block_double + in_block_space,
                                  p_out_thread);
     }
diff --git a/composable_kernel/include/tensor_operation/blockwise_batched_gemm.hpp b/composable_kernel/include/tensor_operation/blockwise_batched_gemm.hpp
index 36d3060b23..a6aebf0fd0 100644
--- a/composable_kernel/include/tensor_operation/blockwise_batched_gemm.hpp
+++ b/composable_kernel/include/tensor_operation/blockwise_batched_gemm.hpp
@@ -170,9 +170,9 @@ struct BlockwiseBatchGemmBlockABlockBThreadCTransANormalBNormalC_V2
     }
 
     template
-    __device__ void Run(const FloatA* __restrict__ p_a_block,
-                        const FloatB* __restrict__ p_b_block,
-                        FloatC* __restrict__ p_c_thread) const
+    __device__ void Run_source(const FloatA* __restrict__ p_a_block,
+                               const FloatB* __restrict__ p_b_block,
+                               FloatC* __restrict__ p_c_thread) const
     {
         constexpr auto True = integral_constant{};
         constexpr auto False = integral_constant{};
@@ -189,10 +189,10 @@ struct BlockwiseBatchGemmBlockABlockBThreadCTransANormalBNormalC_V2
         // thread A, B for GEMM
        // A is transposed, b is not
         constexpr auto a_thread_mtx =
-            make_ConstantMatrixDescriptor(Number{}, Number{});
+            make_ConstantMatrixDescriptor_packed(Number{}, Number{});
 
         constexpr auto b_thread_mtx =
-            make_ConstantMatrixDescriptor(Number{}, Number{});
+            make_ConstantMatrixDescriptor_packed(Number{}, Number{});
 
         // thread A-sub, B-sub for copy
         constexpr auto a_thread_sub_mtx = make_ConstantMatrixDescriptor(
@@ -480,6 +480,19 @@ struct BlockwiseBatchGemmBlockABlockBThreadCTransANormalBNormalC_V2
     }
 #endif
 
+    template
+    __device__ void Run(const FloatA* __restrict__ p_a_block,
+                        const FloatB* __restrict__ p_b_block,
+                        FloatC* __restrict__ p_c_thread) const
+    {
+#if CK_USE_AMD_INLINE_ASM && CK_BLOCKWISE_GEMM_USE_AMD_INLINE_ASM
+        Run_amd_asm(p_a_block, p_b_block, p_c_thread);
+#else
+        Run_source(p_a_block, p_b_block, p_c_thread);
+#endif
+    }
+
     template
     __device__ void CopyThreadMatrixCToBlockMatrixC(const FloatC* __restrict__ p_c_thread,
                                                     FloatC* __restrict__ p_c_block) const
diff --git a/driver/include/device_convolution_implicit_gemm_v1_chwn_cyxk_khwn.hpp b/driver/include/device_convolution_implicit_gemm_v1_chwn_cyxk_khwn.hpp
index de82858288..0f5305d196 100644
--- a/driver/include/device_convolution_implicit_gemm_v1_chwn_cyxk_khwn.hpp
+++ b/driver/include/device_convolution_implicit_gemm_v1_chwn_cyxk_khwn.hpp
@@ -143,7 +143,7 @@ void device_convolution_implicit_gemm_v1_chwn_cyxk_khwn(InDesc,
     constexpr index_t WeiBlockCopyDataPerRead_K = 4;
     constexpr index_t OutThreadCopyDataPerWrite_N = 2;
 
-#elif 0
+#elif 1
     // for 3x3, 34x34, v1r3, Pascal
     // for 3x3, 28x28, v1r3, Pascal
     // for 3x3, 14x14, v1r3, Pascal
@@ -478,9 +478,9 @@ void device_convolution_implicit_gemm_v1_chwn_cyxk_khwn(InDesc,
         GridwiseConvolutionImplicitGemm_v1r1_chwn_cyxk_khwn
 #elif 0
         GridwiseConvolutionImplicitGemm_v1r2_chwn_cyxk_khwn
-#elif 0
-        GridwiseConvolutionImplicitGemm_v1r3_chwn_cyxk_khwn
 #elif 1
+        GridwiseConvolutionImplicitGemm_v1r3_chwn_cyxk_khwn
+#elif 0
         GridwiseConvolutionImplicitGemm_v1r3_chwn_cyxk_khwn_lds_double_buffer
 #endif
         ;
 
+    using ConvStrides = Sequence<1, 1>;
     using ConvDilations = Sequence<1, 1>;
 
     constexpr index_t HPad = 0;
@@ -519,19 +519,19 @@ int main(int argc, char* argv[])
 #if 0
     device_convolution_direct_v2_nchw_kcyx_nkhw
+    (in_nchw_desc, in_nchw, wei_kcyx_desc, wei_kcyx, out_nkhw_desc, out_nkhw_device, nrepeat);
+#elif 1
+    device_convolution_implicit_gemm_v1_chwn_cyxk_khwn(
         in_nchw_desc, in_nchw, wei_kcyx_desc, wei_kcyx, out_nkhw_desc, out_nkhw_device, nrepeat);
 #elif 0
-    device_convolution_implicit_gemm_v1_chwn_cyxk_khwn
+    device_convolution_implicit_gemm_v1_nchw_cyxk_nkhw(
         in_nchw_desc, in_nchw, wei_kcyx_desc, wei_kcyx, out_nkhw_desc, out_nkhw_device, nrepeat);
 #elif 0
-    device_convolution_implicit_gemm_v1_nchw_cyxk_nkhw
-        in_nchw_desc, in_nchw, wei_kcyx_desc, wei_kcyx, out_nkhw_desc, out_nkhw_device, nrepeat);
-#elif 0
-    device_convolution_implicit_gemm_v2_chwn_cyxk_khwn
+    device_convolution_implicit_gemm_v2_chwn_cyxk_khwn(
         in_nchw_desc, in_nchw, wei_kcyx_desc, wei_kcyx, out_nkhw_desc, out_nkhw_device, nrepeat);
 #elif 0
     device_convolution_implicit_gemm_v3_nchw_cyxk_nkhw(
-        in_nchw_desc, in_nchw, wei_kcyx_desc, wei_kcyx, out_nkhw_desc, out_nkhw_device, nrepeat);
+        (in_nchw_desc, in_nchw, wei_kcyx_desc, wei_kcyx, out_nkhw_desc, out_nkhw_device, nrepeat);
 #elif 0
     device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw(in_nchw_desc,
                                                          in_nchw,
diff --git a/script/cmake-cuda.sh b/script/cmake-cuda.sh
index 7feb67e213..4828c22fc8 100755
--- a/script/cmake-cuda.sh
+++ b/script/cmake-cuda.sh
@@ -13,11 +13,11 @@
 cmake -D CMAKE_BUILD_TYPE=Release \
 -D CMAKE_VERBOSE_MAKEFILE:BOOL=ON \
 -D DEVICE_BACKEND=NVIDIA \
--D BOOST_ROOT="/package/install/boost_1.67.0" \
--D CUDA_COMMON_INCLUDE_DIR="/home/chao/code/test_feature/cuda_common/cuda_10.0_common/inc" \
--D CMAKE_CUDA_FLAGS="-ccbin clang++ -m64 -Xcompiler -fopenmp -lineinfo --source-in-ptx -keep -Xptxas -v -gencode=arch=compute_61,code=sm_61 -Xptxas -v -Xptxas -v -maxrregcount=128" \
+-D CUDA_COMMON_INCLUDE_DIR="/package/install/cuda/10.1/NVIDIA_CUDA-10.1_Samples/common/inc" \
+-D CMAKE_CUDA_FLAGS="-ccbin clang++ -m64 -Xcompiler -fopenmp -lineinfo --source-in-ptx -keep -Xptxas -v -gencode=arch=compute_61,code=sm_61" \
 ${MY_PROJECT_SOURCE}
+#-D BOOST_ROOT="/package/install/boost_1.67.0" \
 #-D CMAKE_CUDA_COMPILER="/package/install/cuda_10.0/bin/nvcc" \
 #-D CMAKE_CUDA_FLAGS="-ccbin clang++ -m64 -Xcompiler -fopenmp -lineinfo --source-in-ptx -keep -Xptxas -v -gencode=arch=compute_61,code=sm_61" \
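
Note on the `Run` dispatch change above: both gridwise kernels previously picked among `Run`, `Run_amd_asm`, and `Run_asm_v2` through a per-kernel `run_blockwise_batch_gemm` lambda with hard-coded `#if`/`#elif` branches, and the patch moves that choice into a single macro-guarded `Run` member in `blockwise_batched_gemm.hpp`, so every call site simply calls `blockwise_batch_gemm.Run(...)`. The following is a minimal standalone sketch of that pattern; it is illustrative only, and every name except the two `CK_*` macros is a stand-in rather than the real CK type or member (`__device__` is also dropped so the sketch runs as plain host C++).

// Sketch of the compile-time dispatch pattern introduced by this patch.
// Assumption: the CK_* macros default to 0 when the build system does not set them.
#include <cstdio>

#ifndef CK_USE_AMD_INLINE_ASM
#define CK_USE_AMD_INLINE_ASM 0
#endif
#ifndef CK_BLOCKWISE_GEMM_USE_AMD_INLINE_ASM
#define CK_BLOCKWISE_GEMM_USE_AMD_INLINE_ASM 0
#endif

struct BlockwiseGemmSketch // stand-in for the real blockwise GEMM struct
{
    template <class FloatA, class FloatB, class FloatC>
    void Run_amd_asm(const FloatA*, const FloatB*, FloatC*) const
    {
        std::printf("inline-asm path\n"); // placeholder for the hand-tuned path
    }

    template <class FloatA, class FloatB, class FloatC>
    void Run_source(const FloatA*, const FloatB*, FloatC*) const
    {
        std::printf("generic C++ path\n"); // placeholder for the portable path
    }

    // Single entry point: the implementation is selected once, at compile time,
    // instead of at every call site.
    template <class FloatA, class FloatB, class FloatC>
    void Run(const FloatA* p_a, const FloatB* p_b, FloatC* p_c) const
    {
#if CK_USE_AMD_INLINE_ASM && CK_BLOCKWISE_GEMM_USE_AMD_INLINE_ASM
        Run_amd_asm(p_a, p_b, p_c);
#else
        Run_source(p_a, p_b, p_c);
#endif
    }
};

int main()
{
    float a[4] = {}, b[4] = {}, c[4] = {};
    BlockwiseGemmSketch{}.Run(a, b, c); // prints "generic C++ path" unless both macros are 1
    return 0;
}

A side benefit of this arrangement is that the unselected branch is discarded by the preprocessor, so the inline-asm path does not even have to compile on non-AMD backends.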