diff --git a/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_padded.hpp b/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_padded.hpp
index f73557d438..f0ed466eef 100644
--- a/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_padded.hpp
+++ b/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_padded.hpp
@@ -51,7 +51,7 @@ template <index_t GridSize,
           index_t WeiBlockCopyDstDataPerWrite_K>
 struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_padded
 {
-#if 0
+#if 1
     __device__ void Run(const Float* const __restrict__ p_in_global,
                         const Float* const __restrict__ p_wei_global,
                         Float* const __restrict__ p_out_global) const
diff --git a/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_padded_lds_double_buffer.hpp b/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_padded_lds_double_buffer.hpp
index b14720eb88..cce36456d8 100644
--- a/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_padded_lds_double_buffer.hpp
+++ b/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_padded_lds_double_buffer.hpp
@@ -187,16 +187,20 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_padded_lds_double_buf
         // weight tensor
         //     tensor descriptor in device memory, src of blockwise copy
         constexpr auto wei_e_k_global_desc =
+#if 0
             transform_tensor_descriptor(wei_k_c_y_x_global_desc,
                                         make_tuple(Merge<Sequence<C, Y, X>>{}, PassThrough<K>{}),
                                         make_tuple(Sequence<1, 2, 3>{}, Sequence<0>{}),
                                         make_tuple(Sequence<0>{}, Sequence<1>{}));
+#else // hack
+            make_native_tensor_descriptor_packed(Sequence<K, C * Y * X>{});
+#endif
 
-        //     tensor descriptor in LDS, dst of blockwise copy
-        //     be careful of LDS alignment
-        constexpr auto wei_e_k_block_desc = make_native_tensor_descriptor_aligned(
-            Sequence<EPerBlock, KPerBlock>{},
-            Number<math::lcm(WeiBlockCopyDstDataPerWrite_K, GemmDataPerReadA)>{});
+            //     tensor descriptor in LDS, dst of blockwise copy
+            //     be careful of LDS alignment
+            constexpr auto wei_e_k_block_desc = make_native_tensor_descriptor_aligned(
+                Sequence<EPerBlock, KPerBlock>{},
+                Number<math::lcm(WeiBlockCopyDstDataPerWrite_K, GemmDataPerReadA)>{});
 
         // operator for blockwise copy of weight into LDS
         //     slice a tensor, and copy it into another tensor
diff --git a/composable_kernel/include/tensor_description/tensor_coordinate_v2.hpp b/composable_kernel/include/tensor_description/tensor_coordinate_v2.hpp
index 0330b22438..62dc8b4c9a 100644
--- a/composable_kernel/include/tensor_description/tensor_coordinate_v2.hpp
+++ b/composable_kernel/include/tensor_description/tensor_coordinate_v2.hpp
@@ -47,7 +47,7 @@ struct NativeTensorCoordinate
         // mIndex is updated here, but some (or all) of its entries may never be used
         mIndex += idx_diff;
 
-        mOffset += tensor_desc_type::CalculateOffset(idx_diff);
+        mOffset += tensor_desc_type::CalculateOffsetDiff(idx_diff);
 
         return *this;
     }
@@ -57,7 +57,7 @@ struct NativeTensorCoordinate
         // mIndex is updated here, but some (or all) of its entries may never be used
         mIndex -= idx_diff;
 
-        mOffset -= tensor_desc_type::CalculateOffset(idx_diff);
+        mOffset -= tensor_desc_type::CalculateOffsetDiff(idx_diff);
 
         return *this;
     }
diff --git a/composable_kernel/include/tensor_operation/blockwise_generic_tensor_slice_copy.hpp b/composable_kernel/include/tensor_operation/blockwise_generic_tensor_slice_copy.hpp
index a3956dc713..c9a7ec85b2 100644
--- a/composable_kernel/include/tensor_operation/blockwise_generic_tensor_slice_copy.hpp
+++ b/composable_kernel/include/tensor_operation/blockwise_generic_tensor_slice_copy.hpp
@@ -684,12 +684,10 @@ template <index_t BlockSize,
 struct BlockwiseGenericTensorSliceCopy_v4
 {
     static constexpr index_t nDim = SrcDesc::GetNumOfDimension();
+    using Index                   = MultiIndex<nDim>;
 
-    using SrcCoord = typename TensorCoordinate_v2<SrcDesc>::type;
-    using DstCoord = typename TensorCoordinate_v2<DstDesc>::type;
-
-    __device__ constexpr BlockwiseGenericTensorSliceCopy_v4(SrcCoord src_block_slice_origin,
-                                                            DstCoord dst_block_slice_origin)
+    __device__ constexpr BlockwiseGenericTensorSliceCopy_v4(const Index& src_block_slice_origin,
+                                                            const Index& dst_block_slice_origin)
     {
         static_assert(nDim == SrcDesc::GetNumOfDimension() &&
                           nDim == DstDesc::GetNumOfDimension() && nDim == SliceLengths::Size() &&
diff --git a/composable_kernel/include/tensor_operation/threadwise_generic_tensor_slice_copy.hpp b/composable_kernel/include/tensor_operation/threadwise_generic_tensor_slice_copy.hpp
index 110ef53bc5..c7e084c882 100644
--- a/composable_kernel/include/tensor_operation/threadwise_generic_tensor_slice_copy.hpp
+++ b/composable_kernel/include/tensor_operation/threadwise_generic_tensor_slice_copy.hpp
@@ -966,8 +966,8 @@ struct ThreadwiseGenericTensorSliceCopy_v4r2
     using SrcCoord = typename TensorCoordinate_v2<SrcDesc>::type;
     using DstCoord = typename TensorCoordinate_v2<DstDesc>::type;
 
-    __device__ constexpr ThreadwiseGenericTensorSliceCopy_v4r2(SrcCoord src_slice_origin,
-                                                               DstCoord dst_slice_origin)
+    __device__ constexpr ThreadwiseGenericTensorSliceCopy_v4r2(const Index& src_slice_origin,
+                                                               const Index& dst_slice_origin)
         : mSrcSliceOrigin(src_slice_origin), mDstSliceOrigin(dst_slice_origin)
     {
         static_assert(nDim == SrcDesc::GetNumOfDimension() &&