diff --git a/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_padded.hpp b/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_padded.hpp
index f0ed466eef..6f92b30617 100644
--- a/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_padded.hpp
+++ b/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_padded.hpp
@@ -185,6 +185,7 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_padded
                                                InBlockCopyDstDataPerWrite_N2>(
                 {0, 0, b_block_data_on_global, 0}, {0, 0, 0, 0});
 
+#if 0
         // weight tensor
         //     tensor descriptor in device memory, src of blockwise copy
         constexpr auto wei_e_k_global_desc =
@@ -192,6 +193,13 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_padded
                                         make_tuple(Merge<Sequence<C, Y, X>>{}, PassThrough<K>{}),
                                         make_tuple(Sequence<1, 2, 3>{}, Sequence<0>{}),
                                         make_tuple(Sequence<0>{}, Sequence<1>{}));
+#else // hack
+        constexpr auto wei_e_k_global_desc_old =
+            WeiGlobalDesc::Unfold(I1, I3).ReorderGivenNew2Old(Sequence<1, 0>{});
+
+        constexpr auto wei_e_k_global_desc = make_native_tensor_descriptor(
+            wei_e_k_global_desc_old.GetLengths(), wei_e_k_global_desc_old.GetStrides());
+#endif
 
         //     tensor descriptor in LDS, dst of blockwise copy
         //     be careful of LDS alignment
diff --git a/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_padded_lds_double_buffer.hpp b/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_padded_lds_double_buffer.hpp
index cce36456d8..011112e49f 100644
--- a/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_padded_lds_double_buffer.hpp
+++ b/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_padded_lds_double_buffer.hpp
@@ -184,23 +184,27 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_padded_lds_double_buf
                                                InBlockCopyDstDataPerWrite_N2>(
                 {0, 0, b_block_data_on_global, 0}, {0, 0, 0, 0});
 
+#if 0
         // weight tensor
         //     tensor descriptor in device memory, src of blockwise copy
         constexpr auto wei_e_k_global_desc =
-#if 0
             transform_tensor_descriptor(wei_k_c_y_x_global_desc,
                                         make_tuple(Merge<Sequence<C, Y, X>>{}, PassThrough<K>{}),
                                         make_tuple(Sequence<1, 2, 3>{}, Sequence<0>{}),
                                         make_tuple(Sequence<0>{}, Sequence<1>{}));
 #else // hack
-            make_native_tensor_descriptor_packed(Sequence<K, C * Y * X>{});
+        constexpr auto wei_e_k_global_desc_old =
+            WeiGlobalDesc::Unfold(I1, I3).ReorderGivenNew2Old(Sequence<1, 0>{});
+
+        constexpr auto wei_e_k_global_desc = make_native_tensor_descriptor(
+            wei_e_k_global_desc_old.GetLengths(), wei_e_k_global_desc_old.GetStrides());
 #endif
 
-            //     tensor descriptor in LDS, dst of blockwise copy
-            //     be careful of LDS alignment
-            constexpr auto wei_e_k_block_desc = make_native_tensor_descriptor_aligned(
-                Sequence<EPerBlock, KPerBlock>{},
-                Number<math::lcm(WeiBlockCopyDstDataPerWrite_K, GemmDataPerReadA)>{});
+        //     tensor descriptor in LDS, dst of blockwise copy
+        //     be careful of LDS alignment
+        constexpr auto wei_e_k_block_desc = make_native_tensor_descriptor_aligned(
+            Sequence<EPerBlock, KPerBlock>{},
+            Number<math::lcm(WeiBlockCopyDstDataPerWrite_K, GemmDataPerReadA)>{});
 
         // operator for blockwise copy of weight into LDS
         //     slice a tensor, and copy it into another tensor
diff --git a/composable_kernel/include/tensor_description/tensor_coordinate.hpp b/composable_kernel/include/tensor_description/tensor_coordinate.hpp
index 1a2774b589..4600b682ac 100644
--- a/composable_kernel/include/tensor_description/tensor_coordinate.hpp
+++ b/composable_kernel/include/tensor_description/tensor_coordinate.hpp
@@ -313,14 +313,14 @@ struct TensorCoordinate
     private:
     template <class... Ts>
     __host__ __device__ static constexpr auto
-        MakeDummyTensorCoordinate(ConstantTensorDescriptor<Ts...>)
+    MakeDummyTensorCoordinate(ConstantTensorDescriptor<Ts...>)
     {
         return NormalTensorCoordinate<ConstantTensorDescriptor<Ts...>>();
     }
 
     template <class... Ts>
     __host__ __device__ static constexpr auto
-        MakeDummyTensorCoordinate(ConstantMergedTensorDescriptor<Ts...>)
+    MakeDummyTensorCoordinate(ConstantMergedTensorDescriptor<Ts...>)
     {
         return MergedTensorCoordinate<ConstantMergedTensorDescriptor<Ts...>>();
     }
diff --git a/composable_kernel/include/tensor_description/tensor_coordinate_v2.hpp b/composable_kernel/include/tensor_description/tensor_coordinate_v2.hpp
index 62dc8b4c9a..831088ab25 100644
--- a/composable_kernel/include/tensor_description/tensor_coordinate_v2.hpp
+++ b/composable_kernel/include/tensor_description/tensor_coordinate_v2.hpp
@@ -188,7 +188,7 @@ struct TensorCoordinate_v2
     private:
     template <typename... Ts>
     __host__ __device__ static constexpr auto
-        MakeDummyTensorCoordinate(NativeTensorDescriptor<Ts...>)
+    MakeDummyTensorCoordinate(NativeTensorDescriptor<Ts...>)
     {
         return NativeTensorCoordinate<NativeTensorDescriptor<Ts...>>(
             make_zero_array<index_t, TensorDesc::GetNumOfDimension()>());
@@ -196,7 +196,7 @@ struct TensorCoordinate_v2
 
     template <typename... Ts>
     __host__ __device__ static constexpr auto
-        MakeDummyTensorCoordinate(TransformedTensorDescriptor<Ts...>)
+    MakeDummyTensorCoordinate(TransformedTensorDescriptor<Ts...>)
     {
         return TransformedTensorCoordinate<TransformedTensorDescriptor<Ts...>>(
             make_zero_array<index_t, TensorDesc::GetNumOfDimension()>());
diff --git a/composable_kernel/include/tensor_description/tensor_descriptor.hpp b/composable_kernel/include/tensor_description/tensor_descriptor.hpp
index a41dc1da67..df625c6ecb 100644
--- a/composable_kernel/include/tensor_description/tensor_descriptor.hpp
+++ b/composable_kernel/include/tensor_description/tensor_descriptor.hpp
@@ -85,12 +85,6 @@ struct NativeTensorDescriptor
         return offset;
     }
 
-    // TODO: remove this
-    __host__ __device__ static constexpr index_t GetOffsetFromMultiIndex(const Index& idx)
-    {
-        return CalculateOffset(idx);
-    }
-
     __host__ __device__ static constexpr index_t CalculateOffsetDiff(const Index& idx_diff)
     {
         index_t offset_diff = 0;
@@ -227,13 +221,6 @@ struct TransformedTensorDescriptor
         return LowTensorDescriptor{};
     }
 
-#if 0
-    __host__ __device__ static constexpr auto GetLowerLengths()
-    {
-        return GetLowerTensorDescriptor().GetLengths();
-    }
-#endif
-
     struct lambda_GetUpperLengths
     {
         template <typename Transform>
@@ -359,12 +346,6 @@ struct TransformedTensorDescriptor
         return GetLowerTensorDescriptor().CalculateOffset(CalculateLowerIndex(idx_up));
     }
 
-    // TODO: remove this
-    __host__ __device__ static constexpr index_t GetOffsetFromMultiIndex(const UpperIndex& idx_up)
-    {
-        return CalculateOffset(idx_up);
-    }
-
 #if 0
     template <index_t IDim>
     __host__ __device__ static constexpr bool IsLinearDimension(Number<IDim>)
diff --git a/driver/src/driver.cpp b/driver/src/driver.cpp
index 045e796c38..8ae1b3eb35 100644
--- a/driver/src/driver.cpp
+++ b/driver/src/driver.cpp
@@ -49,7 +49,7 @@ struct GeneratorTensor_3
     {
         std::array<index_t, sizeof...(Is)> dims = {{static_cast<index_t>(is)...}};
 
-        auto f_acc = [](auto a, auto b) { return 100 * a + b; };
+        auto f_acc = [](auto a, auto b) { return 10 * a + b; };
 
         return std::accumulate(dims.begin(), dims.end(), index_t(0), f_acc);
     }
@@ -75,19 +75,19 @@ int main(int argc, char* argv[])
     using namespace ck;
 
 #if 0
-    constexpr index_t N  = 256;
-    constexpr index_t C  = 64;
-    constexpr index_t HI = 17;
-    constexpr index_t WI = 17;
-    constexpr index_t K  = 256;
-    constexpr index_t Y  = 17;
-    constexpr index_t X  = 17;
+    constexpr index_t N  = 8;
+    constexpr index_t C  = 8;
+    constexpr index_t HI = 2;
+    constexpr index_t WI = 8;
+    constexpr index_t K  = 128;
+    constexpr index_t Y  = 1;
+    constexpr index_t X  = 1;
 
     using ConvStrides   = Sequence<1, 1>;
     using ConvDilations = Sequence<1, 1>;
 
-    using LeftPads  = Sequence<0, 3>;
-    using RightPads = Sequence<0, 3>;
+    using LeftPads  = Sequence<0, 0>;
+    using RightPads = Sequence<0, 0>;
 #elif 0
     // 3x3, 34x34
     constexpr index_t N  = 64;
@@ -347,7 +347,7 @@ int main(int argc, char* argv[])
         wei_kcyx.GenerateTensorValue(GeneratorTensor_1{}, num_thread);
 #elif 0
         in_nchw.GenerateTensorValue(GeneratorTensor_1{}, num_thread);
-        wei_kcyx.GenerateTensorValue(GeneratorTensor_2{-5, 5}, num_thread);
+        wei_kcyx.GenerateTensorValue(GeneratorTensor_3{}, num_thread);
 #elif 0
         in_nchw.GenerateTensorValue(GeneratorTensor_3{}, num_thread);
         wei_kcyx.GenerateTensorValue(GeneratorTensor_1{}, num_thread);