diff --git a/driver/device_convolution_implicit_gemm_v1_nchw_cyxk_nkhw.hpp b/driver/device_convolution_implicit_gemm_v1_nchw_cyxk_nkhw.hpp
index 99a2ec9ee3..f74b05e750 100644
--- a/driver/device_convolution_implicit_gemm_v1_nchw_cyxk_nkhw.hpp
+++ b/driver/device_convolution_implicit_gemm_v1_nchw_cyxk_nkhw.hpp
@@ -87,7 +87,7 @@ void device_convolution_implicit_gemm_v1_nchw_cyxk_nkhw(InDesc,
     constexpr index_t InBlockReorderDataPerRead_W  = 1; // v1r3 cannot do vector load input for NCHW
     constexpr index_t InBlockReorderDataPerWrite_N = 1;
 
-    using WeiBlockCopyClusterLengths            = Sequence<0, 0>; // not used
+    using WeiBlockCopyClusterLengths            = void;
     constexpr index_t WeiBlockCopyDataPerRead_K = 4;
 
     constexpr index_t OutThreadCopyDataPerWrite_W = 2;
@@ -122,7 +122,7 @@ void device_convolution_implicit_gemm_v1_nchw_cyxk_nkhw(InDesc,
     constexpr index_t InBlockReorderDataPerRead_W  = 1; // v1r3 cannot do vector load NCHW
     constexpr index_t InBlockReorderDataPerWrite_N = 2;
 
-    using WeiBlockCopyClusterLengths            = Sequence<0, 0>; // not used
+    using WeiBlockCopyClusterLengths            = void;
     constexpr index_t WeiBlockCopyDataPerRead_K = 4;
 
     constexpr index_t OutThreadCopyDataPerWrite_W = 4;
@@ -136,10 +136,10 @@ void device_convolution_implicit_gemm_v1_nchw_cyxk_nkhw(InDesc,
     constexpr index_t HoPerBlock = 4;
     constexpr index_t WoPerBlock = 8;
 
-    constexpr index_t NPerThread  = 2;
+    constexpr index_t NPerThread  = 4;
     constexpr index_t KPerThread  = 8;
     constexpr index_t HoPerThread = 1;
-    constexpr index_t WoPerThread = 4;
+    constexpr index_t WoPerThread = 2;
 
     constexpr index_t GemmMPerThreadSubC = 4;
     constexpr index_t GemmNPerThreadSubC = 4;
@@ -155,14 +155,14 @@ void device_convolution_implicit_gemm_v1_nchw_cyxk_nkhw(InDesc,
     using InBlockReorderSrcClusterLengths_NCHW                = Sequence<1, 8, 4, 8>;
     using InBlockReorderMapThreadCluster2SrcCluster_CHNW2NCHW = Sequence<1, 2, 0, 3>;
     constexpr index_t InBlockReorderDataPerRead_W  = 1; // v1r3 cannot do vector load NCHW
-    constexpr index_t InBlockReorderDataPerWrite_N = 1;
+    constexpr index_t InBlockReorderDataPerWrite_N = 4;
 
-    using WeiBlockCopyClusterLengths            = Sequence<0, 0>; // not used
+    using WeiBlockCopyClusterLengths            = void;
     constexpr index_t WeiBlockCopyDataPerRead_K = 4;
 
-    constexpr index_t OutThreadCopyDataPerWrite_W = 1;
-#elif 0
-    // for 3x3, 28x28, v1r2, Pascal
+    constexpr index_t OutThreadCopyDataPerWrite_W = 2;
+#elif 1
+    // for 3x3, 28x28, v1r3, Pascal
     constexpr index_t BlockSize = 128;
 
     constexpr index_t NPerBlock  = 16;
@@ -186,13 +186,13 @@ void device_convolution_implicit_gemm_v1_nchw_cyxk_nkhw(InDesc,
     constexpr index_t GemmDataPerReadA   = 4;
     constexpr index_t GemmDataPerReadB   = 4;
 
-    using InBlockReorderSrcSubLengths_NCHW                    = Sequence<4, 1, 1, 2>;
+    using InBlockReorderSrcSubLengths_NCHW                    = Sequence<4, 1, 1, 1>;
     using InBlockReorderSrcClusterLengths_NCHW                = Sequence<4, 8, 2, 2>;
     using InBlockReorderMapThreadCluster2SrcCluster_CHNW2NCHW = Sequence<1, 2, 0, 3>;
-    constexpr index_t InBlockReorderDataPerRead_W             = 2;
-    constexpr index_t InBlockReorderDataPerWrite_N            = 4;
+    constexpr index_t InBlockReorderDataPerRead_W  = 1; // v1r3 cannot do vector load NCHW
+    constexpr index_t InBlockReorderDataPerWrite_N = 4;
 
-    using WeiBlockCopyClusterLengths            = Sequence<4, 1, 32>;
+    using WeiBlockCopyClusterLengths            = void;
     constexpr index_t WeiBlockCopyDataPerRead_K = 4;
 
     constexpr index_t OutThreadCopyDataPerWrite_W = 2;
diff --git a/driver/driver.hip.cpp b/driver/driver.hip.cpp
index fd6a3bbf8d..f25b99ca27 100644
--- a/driver/driver.hip.cpp
+++ b/driver/driver.hip.cpp
@@ -371,7 +371,7 @@ void host_winograd_3x3_convolution(const Tensor<TIn>& in_nchw,
             std::size_t ho = HoPerTile * htile + j;
             for(int i = 0; i < WoPerTile; ++i)
             {
-                std::size_t wo = WoPerTile * wtile + i;
+                std::size_t wo         = WoPerTile * wtile + i;
                 out_nkhw(n, k, ho, wo) = out_hold(n, k, htile, wtile, j, i);
             }
         }
@@ -413,13 +413,13 @@ int main(int argc, char* argv[])
 {
 #if 1
     // 3x3, 34x34
-    constexpr index_t N  = 64;
-    constexpr index_t C  = 256;
+    constexpr index_t N = 64;
+    constexpr index_t C = 256;
     constexpr index_t HI = 34;
     constexpr index_t WI = 34;
-    constexpr index_t K  = 128;
-    constexpr index_t Y  = 3;
-    constexpr index_t X  = 3;
+    constexpr index_t K = 128;
+    constexpr index_t Y = 3;
+    constexpr index_t X = 3;
 
     constexpr index_t HPad = 0;
     constexpr index_t WPad = 0;
@@ -597,6 +597,8 @@ int main(int argc, char* argv[])
         };
         wei_kcyx.GenerateTensorValue(gen_wei, num_thread);
 #endif
+
+        // out_nkhw_device.GenerateTensorValue(GeneratorTensor_1{}, num_thread);
     }
 
 #if 1
diff --git a/src/include/gridwise_convolution_implicit_gemm_v1r3_nchw_cyxk_nkhw.hip.hpp b/src/include/gridwise_convolution_implicit_gemm_v1r3_nchw_cyxk_nkhw.hip.hpp
index dcafa0f4c8..2a85725a50 100644
--- a/src/include/gridwise_convolution_implicit_gemm_v1r3_nchw_cyxk_nkhw.hip.hpp
+++ b/src/include/gridwise_convolution_implicit_gemm_v1r3_nchw_cyxk_nkhw.hip.hpp
@@ -359,19 +359,6 @@ struct GridwiseConvolutionImplicitGemm_v1r3_nchw_cyxk_nkhw
                 constexpr index_t K2 = GemmMPerThreadSubC;
                 constexpr index_t K1 = KPerBlock / KPerThread;
 
-#if 0
-                constexpr auto out_10d_global_desc =
-                    make_ConstantTensorDescriptor(Sequence<K / (K1 * K2),
-                                                           K1,
-                                                           K2,
-                                                           Ho,
-                                                           Wo / (W1 * W2),
-                                                           W1,
-                                                           W2,
-                                                           N / f_dummy(N1 * N2),
-                                                           N1,
-                                                           N2>{});
-#else
                 constexpr auto out_10d_global_desc =
                     make_ConstantTensorDescriptor(Sequence<N / f_dummy(N1 * N2),
                                                            N1,
@@ -383,7 +370,6 @@ struct GridwiseConvolutionImplicitGemm_v1r3_nchw_cyxk_nkhw
                                                            Wo / (W1 * W2),
                                                            W1,
                                                            W2>{});
-#endif
 
                 constexpr auto out_10d_thread_desc = make_ConstantTensorDescriptor(
                     Sequence<KPerThread / K2, 1, K2, HoPerThread, 1, W1, 1, 1, 1, N2>{});
@@ -401,20 +387,7 @@ struct GridwiseConvolutionImplicitGemm_v1r3_nchw_cyxk_nkhw
                 }
 #endif
 
-#if 0
-                threadwise_nd_tensor_copy(out_10d_thread_desc,
-                                          p_out_thread,
-                                          out_10d_global_desc,
-                                          p_out_global +
-                                              out_k_h_w_n_global_desc.Get1dIndex(
-                                                  k_block_data_begin + k_thread_data_begin,
-                                                  ho_block_data_begin + ho_thread_data_begin,
-                                                  wo_block_data_begin + wo_thread_data_begin,
-                                                  n_block_data_begin + n_thread_data_begin),
-                                          out_10d_thread_desc.GetLengths(),
-                                          Number<OutThreadCopyDataPerWrite_N>{});
-#else
-                constexpr auto map_out_global2thread = Sequence<7, 8, 9, 0, 1, 2, 6, 3, 4, 5>{};
+                constexpr auto map_out_global2thread = Sequence<7, 8, 9, 0, 1, 2, 3, 4, 5, 6>{};
 
                 threadwise_nd_tensor_copy_reorder_given_dst2src_v2(
                     out_10d_thread_desc,
@@ -428,8 +401,7 @@ struct GridwiseConvolutionImplicitGemm_v1r3_nchw_cyxk_nkhw
                             wo_block_data_begin + wo_thread_data_begin),
                     out_10d_thread_desc.GetLengths(),
                     map_out_global2thread);
-// Number<OutThreadCopyDataPerWrite_W>{});
-#endif
+                    // Number<OutThreadCopyDataPerWrite_W>{});
             })
             .else_([&](auto f_dummy) {
                 static_assert(f_dummy(GemmNPerThreadSubC) >= NPerBlock && NPerThread == NPerBlock &&
@@ -446,19 +418,6 @@ struct GridwiseConvolutionImplicitGemm_v1r3_nchw_cyxk_nkhw
                 constexpr index_t K2 = GemmMPerThreadSubC;
                 constexpr index_t K1 = KPerBlock / KPerThread;
 
-#if 0
-                constexpr auto out_10d_global_desc =
-                    make_ConstantTensorDescriptor(Sequence<K / (K1 * K2),
-                                                           K1,
-                                                           K2,
-                                                           Ho,
-                                                           Wo / (W1 * W2 * W3),
-                                                           W1,
-                                                           W2,
-                                                           W3,
-                                                           N / N1,
-                                                           N1>{});
-#else
                 constexpr auto out_10d_global_desc =
                     make_ConstantTensorDescriptor(Sequence<N / N1,
                                                            N1,
@@ -470,7 +429,6 @@ struct GridwiseConvolutionImplicitGemm_v1r3_nchw_cyxk_nkhw
                                                            W1,
                                                            W2,
                                                            W3>{});
-#endif
 
                 constexpr auto out_10d_thread_desc = make_ConstantTensorDescriptor(
                     Sequence<KPerThread / K2, 1, K2, HoPerThread, 1, W1, 1, W3, 1, N1>{});
@@ -486,26 +444,9 @@ struct GridwiseConvolutionImplicitGemm_v1r3_nchw_cyxk_nkhw
                                                    "out_k_h_w_n_global_desc");
                     print_ConstantTensorDescriptor(out_10d_global_desc, "out_10d_global_desc");
 
-                    for(index_t i = 0; i < 64; ++i)
-                    {
-                        printf("out %f, ", p_out_thread[i]);
-                    }
                 }
 #endif
 
-#if 0
-                threadwise_nd_tensor_copy(out_10d_thread_desc,
-                                          p_out_thread,
-                                          out_10d_global_desc,
-                                          p_out_global +
-                                              out_k_h_w_n_global_desc.Get1dIndex(
-                                                  k_block_data_begin + k_thread_data_begin,
-                                                  ho_block_data_begin + ho_thread_data_begin,
-                                                  wo_block_data_begin + wo_thread_data_begin,
-                                                  n_block_data_begin + n_thread_data_begin),
-                                          out_10d_thread_desc.GetLengths(),
-                                          Number<OutThreadCopyDataPerWrite_N>{});
-#else
                 constexpr auto map_out_global2thread = Sequence<8, 9, 0, 1, 2, 3, 4, 5, 6, 7>{};
 
                 threadwise_nd_tensor_copy_reorder_given_dst2src_v2(
@@ -520,8 +461,7 @@ struct GridwiseConvolutionImplicitGemm_v1r3_nchw_cyxk_nkhw
                             wo_block_data_begin + wo_thread_data_begin),
                     out_10d_thread_desc.GetLengths(),
                     map_out_global2thread);
-// Number<OutThreadCopyDataPerWrite_W>{});
-#endif
+                    // Number<OutThreadCopyDataPerWrite_W>{});
             });
     }
 };