diff --git a/driver/conv.cu b/driver/conv.cu
index 12ff9a0879..0265730949 100644
--- a/driver/conv.cu
+++ b/driver/conv.cu
@@ -391,7 +391,7 @@ int main()
 
     constexpr unsigned HPad = 0;
     constexpr unsigned WPad = 0;
-#elif 0
+#elif 1
     // 3x3, 34x34
     constexpr unsigned N = 64;
     constexpr unsigned C = 256;
@@ -484,7 +484,7 @@ int main()
 
     constexpr unsigned HPad = 1;
     constexpr unsigned WPad = 1;
-#elif 1
+#elif 0
     // 1x1 filter, 28x28 image
     constexpr unsigned N  = 16;
     constexpr unsigned C  = 256;
@@ -608,7 +608,7 @@ int main()
                                                              nrepeat);
 #endif
 
-#if 0
+#if 1
     if(S == 3 && R == 3)
     {
         host_winograd_3x3_convolution(in_nchw, wei_kcsr, out_nkhw_host, lower_pads, upper_pads);
diff --git a/driver/device_implicit_gemm_convolution_2_cnhw_csrk_knhw.cuh b/driver/device_implicit_gemm_convolution_2_cnhw_csrk_knhw.cuh
index 650ef70e06..c765f8aa58 100644
--- a/driver/device_implicit_gemm_convolution_2_cnhw_csrk_knhw.cuh
+++ b/driver/device_implicit_gemm_convolution_2_cnhw_csrk_knhw.cuh
@@ -67,7 +67,7 @@ void device_implicit_gemm_convolution_2_cnhw_csrk_knhw(InDesc,
 
     Tensor<T> out_knhw(make_TensorDescriptor(out_knhw_desc));
 
-#if 0
+#if 1
     // 3x3, 34x34
     constexpr unsigned BPerBlock = 128;
     constexpr unsigned KPerBlock = 64;
@@ -86,11 +86,11 @@ void device_implicit_gemm_convolution_2_cnhw_csrk_knhw(InDesc,
     constexpr unsigned WeiBlockCopyThreadPerDim0 = 4;
     constexpr unsigned WeiBlockCopyThreadPerDim1 = 16;
 
-    constexpr unsigned InBlockCopyDataPerRead  = 2;
+    constexpr unsigned InBlockCopyDataPerRead  = 4;
     constexpr unsigned WeiBlockCopyDataPerRead = 4;
 
     constexpr unsigned BlockSize = 128;
-#elif 1
+#elif 0
     // 1x1, 28x28
     constexpr unsigned BPerBlock = 64;
     constexpr unsigned KPerBlock = 64;
@@ -112,6 +112,29 @@ void device_implicit_gemm_convolution_2_cnhw_csrk_knhw(InDesc,
     constexpr unsigned InBlockCopyDataPerRead  = 4;
     constexpr unsigned WeiBlockCopyDataPerRead = 4;
 
+    constexpr unsigned BlockSize = 64;
+#elif 1
+    // 1x1, 28x28 try
+    constexpr unsigned BPerBlock = 64;
+    constexpr unsigned KPerBlock = 64;
+    constexpr unsigned CPerBlock = 8;
+
+    constexpr unsigned BPerThread = 4;
+    constexpr unsigned KPerThread = 16;
+    constexpr unsigned CPerThread = 1;
+
+    constexpr unsigned GemmThreadPerColumnPerCluster = 4;
+    constexpr unsigned GemmThreadPerRowPerCluster    = 8;
+
+    constexpr unsigned InBlockCopyThreadPerDim0 = 4;
+    constexpr unsigned InBlockCopyThreadPerDim1 = 16;
+
+    constexpr unsigned WeiBlockCopyThreadPerDim0 = 4;
+    constexpr unsigned WeiBlockCopyThreadPerDim1 = 16;
+
+    constexpr unsigned InBlockCopyDataPerRead  = 4;
+    constexpr unsigned WeiBlockCopyDataPerRead = 4;
+
     constexpr unsigned BlockSize = 64;
 #endif
 
diff --git a/src/include/ConstantTensorDescriptor.cuh b/src/include/ConstantTensorDescriptor.cuh
index 9b62832355..90bf761a94 100644
--- a/src/include/ConstantTensorDescriptor.cuh
+++ b/src/include/ConstantTensorDescriptor.cuh
@@ -33,15 +33,6 @@ __host__ __device__ constexpr auto calculate_default_strides_aligned(Sequence<L0
     return Sequence<L1 * L2 * L3_align, L2 * L3_align, L3_align, 1>{};
 }
 
-// this is ugly, only for 4d
-template <unsigned S0, unsigned S1, unsigned S2, unsigned S3>
-__host__ __device__ constexpr auto calculate_full_lengths(Sequence<S0, S1, S2, S3>)
-{
-    static_assert((S0 % S1 == 0) && (S1 % S2 == 0) && (S2 % S3 == 0), "cannot be evenly divided!");
-
-    return Sequence<1, S0 / S1, S1 / S2, S2 / S3>{};
-}
-
 template <class Lengths, class Strides>
 struct ConstantTensorDescriptor
 {
@@ -71,7 +62,6 @@ struct ConstantTensorDescriptor
         return Strides{}.Get(Number<I>{});
     }
 
-    // this is ugly, only for 4d
     __host__ __device__ constexpr unsigned GetElementSize() const
     {
         static_assert(nDim >= 2 && nDim <= 4, "nDim");
@@ -102,16 +92,20 @@ struct ConstantTensorDescriptor
         }
     }
 
-    __host__ __device__ constexpr unsigned GetElementSpace() const
+    template <class Align = Number<1>>
+    __host__ __device__ constexpr unsigned GetElementSpace(Align align = Align{}) const
     {
         static_assert(nDim >= 2 && nDim <= 4, "nDim");
 
+        constexpr unsigned align_size = align.Get();
+
         if(nDim == 2)
         {
             constexpr auto I0 = Number<0>{};
             constexpr auto I1 = Number<1>{};
 
-            return (GetLength(I0) - 1) * GetStride(I0) + (GetLength(I1) - 1) * GetStride(I1) + 1;
+            return (GetLength(I0) - 1) * GetStride(I0) + (GetLength(I1) - 1) * GetStride(I1) +
+                   align_size;
         }
         else if(nDim == 3)
         {
@@ -120,7 +114,7 @@ struct ConstantTensorDescriptor
             constexpr auto I2 = Number<2>{};
 
             return (GetLength(I0) - 1) * GetStride(I0) + (GetLength(I1) - 1) * GetStride(I1) +
-                   (GetLength(I2) - 1) * GetStride(I2) + 1;
+                   (GetLength(I2) - 1) * GetStride(I2) + align_size;
         }
         else if(nDim == 4)
         {
@@ -130,7 +124,8 @@ struct ConstantTensorDescriptor
             constexpr auto I3 = Number<3>{};
 
             return (GetLength(I0) - 1) * GetStride(I0) + (GetLength(I1) - 1) * GetStride(I1) +
-                   (GetLength(I2) - 1) * GetStride(I2) + (GetLength(I3) - 1) * GetStride(I3) + 1;
+                   (GetLength(I2) - 1) * GetStride(I2) + (GetLength(I3) - 1) * GetStride(I3) +
+                   align_size;
         }
     }
 
diff --git a/src/include/blockwise_2d_tensor_op.cuh b/src/include/blockwise_2d_tensor_op.cuh
index 0f7204b4cb..593b761e43 100644
--- a/src/include/blockwise_2d_tensor_op.cuh
+++ b/src/include/blockwise_2d_tensor_op.cuh
@@ -399,11 +399,21 @@ struct Blockwise2dTensorCopy3
 
         // we allow out-of-bound read from src in D1 dimension,
         //   but we need to make sure dst stride is big enough,
-        //   so that the out-of-bound write won't overwrite next line
+        //   so that the out-of-bound write won't contaminate next line in dst
         static_assert(thread_per_d1 * DataPerRead <= DstDesc{}.GetStride(I0),
-                      "wrong! out-of-bound write will overwrite next line!\n");
+                      "wrong! out-of-bound write will contaminate next line!\n");
 
-        static_assert(thread_per_d0 >= 1, "wrong! not enough threads to cover L1 dimension\n");
+        static_assert(thread_per_d0 >= 1, "wrong! not enough threads to cover one line\n");
+
+        constexpr unsigned num_active_thread = thread_per_d0 * thread_per_d1;
+
+        if(BlockSize > num_active_thread)
+        {
+            if(get_thread_local_1d_id() >= num_active_thread)
+            {
+                return;
+            }
+        }
 
         const unsigned thread_id_d0 = get_thread_local_1d_id() / thread_per_d1;
         const unsigned thread_id_d1 = get_thread_local_1d_id() - thread_id_d0 * thread_per_d1;
diff --git a/src/include/gridwise_implicit_gemm_convolution_2_cnhw_csrk_knhw.cuh b/src/include/gridwise_implicit_gemm_convolution_2_cnhw_csrk_knhw.cuh
index d33142028b..877c595ba5 100644
--- a/src/include/gridwise_implicit_gemm_convolution_2_cnhw_csrk_knhw.cuh
+++ b/src/include/gridwise_implicit_gemm_convolution_2_cnhw_csrk_knhw.cuh
@@ -75,27 +75,14 @@ gridwise_implicit_gemm_convolution_2_cnhw_csrk_knhw(InGlobalDesc,
     constexpr auto wei_ek_global_desc = make_ConstantTensorDescriptor(Sequence<C * S * R, K>{});
 
     // tensor view of blockwise input and weight
-#if 0
-    constexpr auto in_cb_block_desc =
-        make_ConstantTensorDescriptor(Sequence<CPerBlock, BPerBlock + BGhostRead>{});
-#else
     constexpr auto in_cb_block_desc = make_ConstantTensorDescriptor_aligned(
         Sequence<CPerBlock, BPerBlock + BGhostRead>{}, Number<InBlockCopyDataPerRead>{});
-#endif
 
-#if 0
-    constexpr auto wei_ek_block_desc =
-        make_ConstantTensorDescriptor(Sequence<CPerBlock * S * R, KPerBlock>{});
-
-    constexpr auto wei_csrk_block_desc =
-        make_ConstantTensorDescriptor(Sequence<CPerBlock, S, R, KPerBlock>{});
-#else
     constexpr auto wei_ek_block_desc = make_ConstantTensorDescriptor_aligned(
         Sequence<CPerBlock * S * R, KPerBlock>{}, Number<WeiBlockCopyDataPerRead>{});
 
     constexpr auto wei_csrk_block_desc = make_ConstantTensorDescriptor_aligned(
         Sequence<CPerBlock, S, R, KPerBlock>{}, Number<WeiBlockCopyDataPerRead>{});
-#endif
 
     // tensor view of threadwise output in register
     constexpr auto out_kb_thread_desc =
@@ -203,12 +190,19 @@ gridwise_implicit_gemm_convolution_2_cnhw_csrk_knhw(InGlobalDesc,
                                                                  GemmThreadPerRowPerCluster,
                                                                  true>{};
 
-    // LDS
-    constexpr unsigned in_block_size  = in_cb_block_desc.GetElementSpace();
-    constexpr unsigned wei_block_size = wei_csrk_block_desc.GetElementSpace();
+    // LDS: be careful of alignment
+    constexpr unsigned in_block_size =
+        in_cb_block_desc.GetElementSpace(Number<InBlockCopyDataPerRead>{});
 
-    __shared__ Float p_in_block[in_block_size];
-    __shared__ Float p_wei_block[wei_block_size];
+    constexpr unsigned wei_block_size =
+        wei_csrk_block_desc.GetElementSpace(Number<WeiBlockCopyDataPerRead>{});
+
+    constexpr unsigned max_align = InBlockCopyDataPerRead > WeiBlockCopyDataPerRead
+                                       ? InBlockCopyDataPerRead
+                                       : WeiBlockCopyDataPerRead;
+
+    __shared__ Float p_in_block[max_align * ((in_block_size + max_align - 1) / max_align)];
+    __shared__ Float p_wei_block[max_align * ((wei_block_size + max_align - 1) / max_align)];
 
     // register
     Float p_out_thread[out_kb_thread_desc.GetElementSpace()];