adding implicit gemm v3

2026-07-17 17:19:12 +00:00 · 2019-05-22 19:39:56 -05:00
parent 2a48812edb
commit 8a4b59785b
26 changed files with 373 additions and 259 deletions
--- a/driver/device_convolution_implicit_gemm_v1_chwn_cyxk_khwn.hpp
+++ b/driver/device_convolution_implicit_gemm_v1_chwn_cyxk_khwn.hpp
@@ -38,7 +38,7 @@ void device_convolution_implicit_gemm_v1_chwn_cyxk_khwn(InDesc,
    constexpr index_t X = wei_kcyx_desc.GetLength(I3);

    // reorder weight
-    auto wei_cyxk_desc = make_packed_ConstantTensorDescriptor(Sequence<C, Y, X, K>{});
+    auto wei_cyxk_desc = make_ConstantTensorDescriptor_default_rank_packed(Sequence<C, Y, X, K>{});
    ostream_ConstantTensorDescriptor(wei_cyxk_desc, std::cout << "wei_cyxk_desc: ");

    Tensor<T> wei_cyxk(make_TensorDescriptor(wei_cyxk_desc));
@@ -51,7 +51,7 @@ void device_convolution_implicit_gemm_v1_chwn_cyxk_khwn(InDesc,
        std::thread::hardware_concurrency());

    // reorder input
-    auto in_chwn_desc = make_packed_ConstantTensorDescriptor(Sequence<C, Hi, Wi, N>{});
+    auto in_chwn_desc = make_ConstantTensorDescriptor_default_rank_packed(Sequence<C, Hi, Wi, N>{});
    ostream_ConstantTensorDescriptor(in_chwn_desc, std::cout << "in_chwn_desc: ");

    Tensor<T> in_chwn(make_TensorDescriptor(in_chwn_desc));
@@ -64,7 +64,8 @@ void device_convolution_implicit_gemm_v1_chwn_cyxk_khwn(InDesc,
        std::thread::hardware_concurrency());

    // output
-    auto out_khwn_desc = make_packed_ConstantTensorDescriptor(Sequence<K, Ho, Wo, N>{});
+    auto out_khwn_desc =
+        make_ConstantTensorDescriptor_default_rank_packed(Sequence<K, Ho, Wo, N>{});
    ostream_ConstantTensorDescriptor(out_khwn_desc, std::cout << "out_khwn_desc: ");

    Tensor<T> out_khwn(make_TensorDescriptor(out_khwn_desc));
--- a/driver/device_convolution_implicit_gemm_v1_nchw_cyxk_khwn.hpp
+++ b/driver/device_convolution_implicit_gemm_v1_nchw_cyxk_khwn.hpp
@@ -37,7 +37,7 @@ void device_convolution_implicit_gemm_v1_nchw_cyxk_khwn(InDesc,
    constexpr index_t X = wei_kcyx_desc.GetLength(I3);

    // reorder weight
-    auto wei_cyxk_desc = make_packed_ConstantTensorDescriptor(Sequence<C, Y, X, K>{});
+    auto wei_cyxk_desc = make_ConstantTensorDescriptor_default_rank_packed(Sequence<C, Y, X, K>{});
    ostream_ConstantTensorDescriptor(wei_cyxk_desc, std::cout << "wei_cyxk_desc: ");

    Tensor<T> wei_cyxk(make_TensorDescriptor(wei_cyxk_desc));
@@ -50,7 +50,8 @@ void device_convolution_implicit_gemm_v1_nchw_cyxk_khwn(InDesc,
        std::thread::hardware_concurrency());

    // output
-    auto out_khwn_desc = make_packed_ConstantTensorDescriptor(Sequence<K, Ho, Wo, N>{});
+    auto out_khwn_desc =
+        make_ConstantTensorDescriptor_default_rank_packed(Sequence<K, Ho, Wo, N>{});
    ostream_ConstantTensorDescriptor(out_khwn_desc, std::cout << "out_khwn_desc: ");

    Tensor<T> out_khwn(make_TensorDescriptor(out_khwn_desc));
--- a/driver/device_convolution_implicit_gemm_v1_nchw_cyxk_nkhw.hpp
+++ b/driver/device_convolution_implicit_gemm_v1_nchw_cyxk_nkhw.hpp
@@ -36,7 +36,7 @@ void device_convolution_implicit_gemm_v1_nchw_cyxk_nkhw(InDesc,
    constexpr index_t X = wei_kcyx_desc.GetLength(I3);

    // reorder weight
-    auto wei_cyxk_desc = make_packed_ConstantTensorDescriptor(Sequence<C, Y, X, K>{});
+    auto wei_cyxk_desc = make_ConstantTensorDescriptor_default_rank_packed(Sequence<C, Y, X, K>{});
    ostream_ConstantTensorDescriptor(wei_cyxk_desc, std::cout << "wei_cyxk_desc: ");

    Tensor<T> wei_cyxk(make_TensorDescriptor(wei_cyxk_desc));
@@ -57,7 +57,7 @@ void device_convolution_implicit_gemm_v1_nchw_cyxk_nkhw(InDesc,
    wei_cyxk_device_buf.ToDevice(wei_cyxk.mData.data());
    out_nkhw_device_buf.ToDevice(out_nkhw.mData.data());

-#if 0
+#if 1
    // for 3x3, 34x34, v1r3, Pascal
    constexpr index_t BlockSize = 128;

@@ -92,7 +92,7 @@ void device_convolution_implicit_gemm_v1_nchw_cyxk_nkhw(InDesc,
    constexpr index_t WeiBlockCopyDataPerRead_K = 4;

    constexpr index_t OutThreadCopyDataPerWrite_W = 2;
-#elif 0
+#elif 1
    // for 3x3, 34x34, v1r3, Vega 20, WoPerBlock = 32
    constexpr index_t BlockSize = 256;

@@ -162,7 +162,7 @@ void device_convolution_implicit_gemm_v1_nchw_cyxk_nkhw(InDesc,
    constexpr index_t WeiBlockCopyDataPerRead_K = 4;

    constexpr index_t OutThreadCopyDataPerWrite_W = 2;
-#elif 1
+#elif 0
    // for 3x3, 34x34, v1r3, Vega 20, WoPerBlock = 8
    constexpr index_t BlockSize = 256;

--- a/driver/device_convolution_implicit_gemm_v3_nchw_cyxk_nkhw.hpp
+++ b/driver/device_convolution_implicit_gemm_v3_nchw_cyxk_nkhw.hpp
@@ -35,7 +35,7 @@ void device_convolution_implicit_gemm_v3_nchw_cyxk_nkhw(InDesc,
    constexpr index_t X = wei_kcyx_desc.GetLength(I3);

    // reorder weight
-    auto wei_cyxk_desc = make_ConstantTensorDescriptor(Sequence<C, Y, X, K>{});
+    auto wei_cyxk_desc = make_ConstantTensorDescriptor_default_rank_packed(Sequence<C, Y, X, K>{});
    ostream_ConstantTensorDescriptor(wei_cyxk_desc, std::cout << "wei_cyxk_desc: ");

    Tensor<T> wei_cyxk(make_TensorDescriptor(wei_cyxk_desc));
@@ -56,37 +56,40 @@ void device_convolution_implicit_gemm_v3_nchw_cyxk_nkhw(InDesc,
    wei_cyxk_device_buf.ToDevice(wei_cyxk.mData.data());
    out_nkhw_device_buf.ToDevice(out_nkhw.mData.data());

+    constexpr index_t N1 = 2;
+    constexpr index_t N2 = 4;
+
+    constexpr index_t B = (N * Ho * Wo) / (N1 * N2);
+
 #if 1
-    // for 3x3, 28x28, v3, Pascal
-    constexpr index_t BlockSize = 128;
+    // for 3x3, 28x28, v3
+    constexpr index_t BlockSize = 256;

    constexpr index_t BPerBlock = 16;
    constexpr index_t KPerBlock = 128;
    constexpr index_t CPerBlock = 8;

-    constexpr index_t BPerThread = 1;
-    constexpr index_t KPerThread = 8;
-
    constexpr index_t GemmMPerThreadSubC = 4;
    constexpr index_t GemmNPerThreadSubC = 4;
    constexpr index_t GemmMLevel0Cluster = 4;
-    constexpr index_t GemmNLevel0Cluster = 2;
+    constexpr index_t GemmNLevel0Cluster = 4;
    constexpr index_t GemmMLevel1Cluster = 4;
-    constexpr index_t GemmNLevel1Cluster = 2;
+    constexpr index_t GemmNLevel1Cluster = 4;
    constexpr index_t GemmKPerThreadLoop = 1;
    constexpr index_t GemmDataPerReadA   = 4;
    constexpr index_t GemmDataPerReadB   = 4;

-    using InBlockReorderSrcSubLengths_NCHW                    = Sequence<4, 1, 1, 1>;
-    using InBlockReorderSrcClusterLengths_NCHW                = Sequence<4, 8, 2, 2>;
-    using InBlockReorderMapThreadCluster2SrcCluster_CHNW2NCHW = Sequence<1, 2, 0, 3>;
+    using InBlockCopySubLengths_N1_N2_C_B     = Sequence<1, 4, 1, 1>;
+    using InBlockCopyClusterLengths_N1_N2_C_B = Sequence<2, 1, 8, 16>;

-    constexpr index_t WeiBlockCopyDataPerRead_K = 4;
+    constexpr index_t InBlockCopySrcDataPerRead_B   = 1;
+    constexpr index_t InBlockCopyDstDataPerWrite_N2 = 4;
+
+    constexpr index_t WeiBlockCopyDataPerAccess_K = 4;
 #endif

    constexpr index_t GridSize =
-        ((N + NPerBlock - 1) / NPerBlock) * ((K + KPerBlock - 1) / KPerBlock) *
-        ((Ho + HoPerBlock - 1) / HoPerBlock) * ((Wo + WoPerBlock - 1) / WoPerBlock);
+        ((B + BPerBlock - 1) / BPerBlock) * ((K + KPerBlock - 1) / KPerBlock);

    printf("%s: BlockSize %u, GridSize %u \n", __func__, BlockSize, GridSize);

@@ -102,15 +105,11 @@ void device_convolution_implicit_gemm_v3_nchw_cyxk_nkhw(InDesc,
             decltype(in_nchw_desc),
             decltype(wei_cyxk_desc),
             decltype(out_nkhw_desc),
-             NPerBlock,
+             BPerBlock,
             KPerBlock,
             CPerBlock,
-             HoPerBlock,
-             WoPerBlock,
-             NPerThread,
-             KPerThread,
-             HoPerThread,
-             WoPerThread,
+             N1,
+             N2,
             GemmMPerThreadSubC,
             GemmNPerThreadSubC,
             GemmMLevel0Cluster,
@@ -120,14 +119,11 @@ void device_convolution_implicit_gemm_v3_nchw_cyxk_nkhw(InDesc,
             GemmKPerThreadLoop,
             GemmDataPerReadA,
             GemmDataPerReadB,
-             InBlockReorderSrcSubLengths_NCHW,
-             InBlockReorderSrcClusterLengths_NCHW,
-             InBlockReorderMapThreadCluster2SrcCluster_CHNW2NCHW,
-             InBlockReorderDataPerRead_W,
-             InBlockReorderDataPerWrite_N,
-             WeiBlockCopyClusterLengths,
-             WeiBlockCopyDataPerRead_K,
-             OutThreadCopyDataPerWrite_W>{};
+             InBlockCopySubLengths_N1_N2_C_B,
+             InBlockCopyClusterLengths_N1_N2_C_B,
+             InBlockCopySrcDataPerRead_B,
+             InBlockCopyDstDataPerWrite_N2,
+             WeiBlockCopyDataPerAccess_K>{};

        float time = launch_kernel(run_gridwise_convolution<decltype(gridwise_conv), T>,
                                   dim3(GridSize),
--- a/driver/driver.hip.cpp
+++ b/driver/driver.hip.cpp
@@ -13,7 +13,7 @@
 #include "device_convolution_implicit_gemm_v1_nchw_cyxk_khwn.hpp"
 #include "device_convolution_implicit_gemm_v1_nchw_cyxk_nkhw.hpp"
 #include "device_convolution_implicit_gemm_v2_chwn_cyxk_khwn.hpp"
-//#include "device_convolution_implicit_gemm_v3_nchw_cyxk_nkhw.hpp"
+#include "device_convolution_implicit_gemm_v3_nchw_cyxk_nkhw.hpp"

 struct GeneratorTensor_1
 {
@@ -548,8 +548,8 @@ int main(int argc, char* argv[])
    auto lower_pads = Sequence<HPad, WPad>{};
    auto upper_pads = Sequence<HPad, WPad>{};

-    auto in_nchw_desc  = make_packed_ConstantTensorDescriptor(Sequence<N, C, HI, WI>{});
-    auto wei_kcyx_desc = make_packed_ConstantTensorDescriptor(Sequence<K, C, Y, X>{});
+    auto in_nchw_desc = make_ConstantTensorDescriptor_default_rank_packed(Sequence<N, C, HI, WI>{});
+    auto wei_kcyx_desc = make_ConstantTensorDescriptor_default_rank_packed(Sequence<K, C, Y, X>{});
    auto out_nkhw_desc = get_convolution_with_padding_output_default_4d_tensor_descriptor(
        in_nchw_desc, wei_kcyx_desc, lower_pads, upper_pads);

@@ -612,11 +612,11 @@ int main(int argc, char* argv[])
    device_convolution_implicit_gemm_v1_chwn_cyxk_khwn
 #elif 0
    device_convolution_implicit_gemm_v1_nchw_cyxk_khwn
-#elif 1
+#elif 0
    device_convolution_implicit_gemm_v1_nchw_cyxk_nkhw
 #elif 0
    device_convolution_implicit_gemm_v2_chwn_cyxk_khwn
-#elif 0
+#elif 1
    device_convolution_implicit_gemm_v3_nchw_cyxk_nkhw
 #endif
    (in_nchw_desc, in_nchw, wei_kcyx_desc, wei_kcyx, out_nkhw_desc, out_nkhw_device, nrepeat);
--- a/src/include/Array.hip.hpp
+++ b/src/include/Array.hip.hpp
@@ -12,7 +12,7 @@ struct Array
    index_t mData[nSize];

    template <class... Xs>
-    __host__ __device__ Array(Xs... xs) : mData{static_cast<TData>(xs)...}
+    __host__ __device__ constexpr Array(Xs... xs) : mData{static_cast<TData>(xs)...}
    {
    }

@@ -37,6 +37,25 @@ struct Array
    }
 };

+template <index_t... Is>
+__host__ __device__ constexpr auto sequence2array(Sequence<Is...>)
+{
+    return Array<index_t, sizeof...(Is)>{Is...};
+}
+
+template <class TData, index_t NSize>
+__host__ __device__ constexpr auto make_zero_array()
+{
+    Array<TData, NSize> a;
+
+    static_for<0, NSize, 1>{}([&](auto I) {
+        constexpr index_t i = I.Get();
+        a[i]                = static_cast<TData>(0);
+    });
+
+    return a;
+}
+
 template <class TData, index_t NSize, index_t... IRs>
 __host__ __device__ auto reorder_array_given_new2old(const Array<TData, NSize>& old_array,
                                                     Sequence<IRs...> new2old)
@@ -80,15 +99,14 @@ __host__ __device__ auto extract_array(const Array<TData, NSize>& old_array, Ext

    static_for<0, new_size, 1>{}([&](auto I) {
        constexpr index_t i = I.Get();
-        new_array[i]        = old_array[ExtractSeq{}.Get(I)];
+        new_array[i]        = old_array[ExtractSeq::Get(I)];
    });

    return new_array;
 }

 template <class TData, index_t NSize>
-__host__ __device__ constexpr auto operator+(const Array<TData, NSize>& a,
-                                             const Array<TData, NSize>& b)
+__host__ __device__ constexpr auto operator+(Array<TData, NSize> a, Array<TData, NSize> b)
 {
    Array<TData, NSize> result;

@@ -99,3 +117,20 @@ __host__ __device__ constexpr auto operator+(const Array<TData, NSize>& a,

    return result;
 }
+
+// Array = Array * Sequence
+template <class TData, index_t NSize, index_t... Is>
+__host__ __device__ constexpr auto operator*(Array<TData, NSize> a, Sequence<Is...> b)
+{
+    static_assert(sizeof...(Is) == NSize, "wrong! size not the same");
+
+    Array<TData, NSize> result;
+
+    static_for<0, NSize, 1>{}([&](auto I) {
+        constexpr index_t i = I.Get();
+
+        result[i] = a[i] + b.Get(I);
+    });
+
+    return result;
+}
--- a/src/include/ConstantMatrixDescriptor.hip.hpp
+++ b/src/include/ConstantMatrixDescriptor.hip.hpp
@@ -9,26 +9,26 @@ struct ConstantMatrixDescriptor
        static_assert(NCol_ <= RowStride_, "wrong! NCol > RowStride!");
    }

-    __host__ __device__ constexpr index_t NRow() const { return NRow_; }
+    __host__ __device__ static constexpr index_t NRow() { return NRow_; }

-    __host__ __device__ constexpr index_t NCol() const { return NCol_; }
+    __host__ __device__ static constexpr index_t NCol() { return NCol_; }

-    __host__ __device__ constexpr index_t RowStride() const { return RowStride_; }
+    __host__ __device__ static constexpr index_t RowStride() { return RowStride_; }

-    __host__ __device__ constexpr auto GetLengths() const { return Sequence<NRow_, NCol_>{}; }
+    __host__ __device__ static constexpr auto GetLengths() { return Sequence<NRow_, NCol_>{}; }

-    __host__ __device__ constexpr index_t GetElementSize() const { return NRow_ * NCol_; }
+    __host__ __device__ static constexpr index_t GetElementSize() { return NRow_ * NCol_; }

-    __host__ __device__ constexpr index_t GetElementSpace() const { return NRow_ * RowStride_; }
+    __host__ __device__ static constexpr index_t GetElementSpace() { return NRow_ * RowStride_; }

-    __host__ __device__ index_t GetOffsetFromMultiIndex(index_t irow, index_t icol) const
+    __host__ __device__ static index_t GetOffsetFromMultiIndex(index_t irow, index_t icol)
    {
        return irow * RowStride_ + icol;
    }

    template <index_t SubNRow, index_t SubNCol>
-    __host__ __device__ constexpr auto MakeSubMatrixDescriptor(Number<SubNRow>,
-                                                               Number<SubNCol>) const
+    __host__ __device__ static constexpr auto MakeSubMatrixDescriptor(Number<SubNRow>,
+                                                                      Number<SubNCol>)
    {
        return ConstantMatrixDescriptor<SubNRow, SubNCol, RowStride_>{};
    }
--- a/src/include/ConstantMergedTensorDescriptor.hip.hpp
+++ b/src/include/ConstantMergedTensorDescriptor.hip.hpp
@@ -11,8 +11,8 @@ struct ConstantMergedTensorDescriptor
 {
    static constexpr auto mOriginalDimMergeSeqs = std::tuple<OriginalDimMergeSeqs...>{};

-    static constexpr index_t nDim         = std::tuple_size<mOriginalDimMergeSeqs>::value;
-    static constexpr index_t nOriginalDim = OriginalDesc::GetNumOfDimension();
+    static constexpr index_t nDim         = sizeof...(OriginalDimMergeSeqs);
+    static constexpr index_t nOriginalDim = OriginalTensorDesc::GetNumOfDimension();

    __host__ __device__ constexpr ConstantMergedTensorDescriptor()
    {
@@ -21,25 +21,28 @@ struct ConstantMergedTensorDescriptor
        // TODO: check each of OriginalDimMergeSeqs contains at least 1, and at most
        // OriginalTensorDesc::nDim number of dimensions

-        // TODO: check there is no duplication in OriginalDimMergeSeqs
-
        // TODO: check OriginalDimMergeSeqs contains all original dimensions
+
+        // TODO: check there is no duplication in OriginalDimMergeSeqs
    }

    __host__ __device__ static constexpr index_t GetNumOfDimension() { return nDim; }

-    __host__ __device__ static constexpr index_t GetNumOfOriginalDimension() { return nOriginalDim }
+    __host__ __device__ static constexpr index_t GetNumOfOriginalDimension()
+    {
+        return nOriginalDim;
+    }

    template <index_t IDim>
    __host__ __device__ static constexpr bool ContainMultipleOriginalDimensions(Number<IDim>)
    {
-        return (std::Get<IDIM>(mOriginalDimMergeSeqs).GetSize() > 1);
+        return (std::get<IDim>(mOriginalDimMergeSeqs).GetSize() > 1);
    }

    template <index_t IDim>
    __host__ __device__ static constexpr index_t GetLength(Number<IDim>)
    {
-        constexpr auto original_dims_partial = std::Get<IDim>(mOriginalDimMergeSeqs);
+        constexpr auto original_dims_partial = std::get<IDim>(mOriginalDimMergeSeqs);

        return OriginalTensorDesc::Extract(original_dims_partial).GetElementSize();
    }
@@ -50,14 +53,14 @@ struct ConstantMergedTensorDescriptor
        static_assert(!ContainMultipleOriginalDimensions(Number<IDim>{}),
                      "wrong! stride of a merged dimension is undefined");

-        constexpr auto idim_original = std::Get<IDim>(mOriginalDimMergeSeqs).Front();
+        constexpr auto idim_original = std::get<IDim>(mOriginalDimMergeSeqs).Front();

        return OriginalTensorDesc::GetStride(Number<idim_original>{});
    }

    __host__ __device__ static constexpr auto GetLengths()
    {
-        return Sequence<OriginalTensorDesc::Extract(OriginalDimMergeSeqs).GetElementSize()...>{};
+        return Sequence<OriginalTensorDesc::Extract(OriginalDimMergeSeqs{}).GetElementSize()...>{};
    }

    __host__ __device__ static constexpr index_t GetElementSize()
@@ -75,17 +78,16 @@ struct ConstantMergedTensorDescriptor
            constexpr auto original_dims_partial = std::get<idim>(mOriginalDimMergeSeqs);

            // get partial original-multi-id corresponding to this merged dimension
-            constexpr auto original_multi_id_partial =
+            const auto original_multi_id_partial =
                OriginalTensorDesc::Extract(original_dims_partial)
                    .GetMultiIndexFrom1dIndex(multi_id[idim]);

-            // make sure compiler unroll this loop and propagate all the constants
-            for(index_t i = 0; i < original_dims_partial.GetSize(); ++i)
-            {
-                index_t idim_original = original_dims_partial[i];
+            static_for<0, original_dims_partial.GetSize(), 1>{}([&](auto I_) {
+                constexpr auto I                = decltype(I_){};
+                constexpr index_t idim_original = original_dims_partial.Get(I);

-                original_multi_id[idim_original] = original_multi_id_partial[i]
-            }
+                original_multi_id[idim_original] = original_multi_id_partial[I.Get()];
+            });
        });

        return original_multi_id;
@@ -95,10 +97,10 @@ struct ConstantMergedTensorDescriptor
    {
        const auto original_multi_id = GetOriginalMultiIndexFromMultiIndex(multi_id);

-        return OriginalTensorDesc::GetOffsetFromMultiIndex(orginal_multi_id);
+        return OriginalTensorDesc::GetOffsetFromMultiIndex(original_multi_id);
    }

-    template <index_t... Is>
+    template <class... Is>
    __host__ __device__ static index_t GetOffsetFromMultiIndex(Is... is)
    {
        return GetOffsetFromMultiIndex(Array<index_t, nDim>{is...});
@@ -106,14 +108,15 @@ struct ConstantMergedTensorDescriptor

    __host__ __device__ static Array<index_t, nDim> GetMultiIndexFrom1dIndex(index_t id)
    {
-        constexpr auto dummy_desc = make_packed_ConstantTensorDescriptor(GetLengths());
+        constexpr auto dummy_desc = make_ConstantTensorDescriptor_default_rank_packed(GetLengths());

        return dummy_desc.GetMultiIndexFrom1dIndex(id);
    }
 };

 template <class OriginalTensorDesc, class... OriginalDimMergeSeqs>
-constexpr auto make_ConstantMergedTensorDescriptor(OriginalTensorDesc, OriginalDimMergeSeqs...)
+__host__ __device__ constexpr auto make_ConstantMergedTensorDescriptor(OriginalTensorDesc,
+                                                                       OriginalDimMergeSeqs...)
 {
    return ConstantMergedTensorDescriptor<OriginalTensorDesc, OriginalDimMergeSeqs...>{};
 }
--- a/src/include/ConstantTensorDescriptor.hip.hpp
+++ b/src/include/ConstantTensorDescriptor.hip.hpp
@@ -2,20 +2,20 @@
 #include "common.hip.hpp"

 template <class Lengths>
-__host__ __device__ constexpr auto calculate_packed_tensor_strides(Lengths)
+__host__ __device__ constexpr auto calculate_tensor_strides_default_rank_packed(Lengths)
 {
    return reverse_inclusive_scan_sequence(Lengths{}.PopFront(), std::multiplies<index_t>{})
        .PushBack(Number<1>{});
 }

 template <class Lengths, index_t Align>
-__host__ __device__ constexpr auto
-    calculate_rank_tensor_default_strides_with_alignment(Lengths, Number<Align>)
+__host__ __device__ constexpr auto calculate_tensor_strides_default_rank_aligned(Lengths,
+                                                                                 Number<Align>)
 {
    constexpr index_t L_back_align =
        Align * mod_conv::integer_divide_ceiler<index_t>{}(Lengths{}.Back(), Align);

-    return calculate_packed_tensor_strides(
+    return calculate_tensor_strides_default_rank_packed(
        Lengths{}.Modify(Number<Lengths{}.GetSize() - 1>{}, Number<L_back_align>{}));
 }

@@ -66,6 +66,12 @@ struct ConstantTensorDescriptor
        return MemoryRanks{}.Get(Number<I>{});
    }

+    template <class T>
+    __host__ __device__ static constexpr bool ContainMultipleOriginalDimensions(T)
+    {
+        return false;
+    }
+
    __host__ __device__ static constexpr index_t GetElementSize()
    {
        return accumulate_on_sequence(Lengths{}, std::multiplies<index_t>{}, Number<1>{});
@@ -146,7 +152,7 @@ struct ConstantTensorDescriptor
    {
        Array<index_t, nDim> multi_id;

-        constexpr auto dummy_strides = calculate_packed_tensor_strides(GetLengths());
+        constexpr auto dummy_strides = calculate_tensor_strides_default_rank_packed(GetLengths());

        // calculate index in each of the dimensions in the order of their dimension (not rank)
        static_for<0, nDim - 1, 1>{}([&](auto IDim) {
@@ -181,6 +187,12 @@ struct ConstantTensorDescriptor
        return ConstantTensorDescriptor<extract_lengths, extract_strides, new_ranks>{};
    }

+    template <index_t... IDims>
+    __host__ __device__ static constexpr auto Extract(Sequence<IDims...>)
+    {
+        return Extract(Number<IDims>{}...);
+    }
+
    template <index_t IDim, index_t SliceLen>
    __host__ __device__ static constexpr auto Slice(Number<IDim>, Number<SliceLen>)
    {
@@ -271,9 +283,11 @@ struct ConstantTensorDescriptor
                          FirstUnfoldDim <= LastUnfoldDim,
                      "wrong! should have FirstUnfoldDim <= LastUnfoldDim!");

+#if 0 // cannot compile: compiler complain about constexpr
        // dimensions to be unfold need to be in descending order (w.r.t. strides), and need to be
        // packed in memory, otherwise, unfolding is invalid
-        static_for<FirstUnfoldDim, LastUnfoldDim, 1>{}([&](auto IDim) {
+        static_for<FirstUnfoldDim, LastUnfoldDim, 1>{}([&](auto IDim_) {
+            constexpr auto IDim    = decltype(IDim_){};
            constexpr auto IDim_p1 = IDim + Number<1>{};

            // check stride
@@ -285,11 +299,12 @@ struct ConstantTensorDescriptor
            static_assert(GetStride(IDim_p1) * GetLength(IDim_p1) == GetStride(IDim),
                          "wrong! dimensions to be unfolded need to be packed");

-            // checkt ranks
+            // check ranks
            static_assert(GetMemoryRank(IDim_p1) == GetMemoryRank(IDim) + 1,
                          "wrong! ranks of dimensions to be unfolded need to be in increasing and "
                          "continuous ranks");
        });
+#endif

        // left and right
        constexpr auto left = typename arithmetic_sequence_gen<0, FirstUnfoldDim, 1>::SeqType{};
@@ -308,9 +323,9 @@ struct ConstantTensorDescriptor

        // decrease the ranks that are larger than the rank of LastUnfoldDim
        constexpr auto tmp_ranks =
-            transform_sequences(GetMemoryRanks(),
-                                f_unfold_impl<GetMemoryRank(Number<LastUnfoldDim>{}),
-                                              LastUnfoldDim - FirstUnfoldDim + 1>{});
+            transform_sequences(f_unfold_impl<GetMemoryRank(Number<LastUnfoldDim>{}),
+                                              LastUnfoldDim - FirstUnfoldDim + 1>{},
+                                GetMemoryRanks());

        // new lengths, strides and ranks
        constexpr auto new_lengths = GetLengths()
@@ -354,26 +369,26 @@ struct ConstantTensorDescriptor
 };

 template <class Lengths>
-__host__ __device__ constexpr auto make_packed_ConstantTensorDescriptor(Lengths)
+__host__ __device__ constexpr auto make_ConstantTensorDescriptor_default_rank_packed(Lengths)
 {
-    using Strides     = decltype(calculate_packed_tensor_strides(Lengths{}));
+    using Strides     = decltype(calculate_tensor_strides_default_rank_packed(Lengths{}));
    using MemoryRanks = typename arithmetic_sequence_gen<0, Lengths::GetSize(), 1>::SeqType;
    return ConstantTensorDescriptor<Lengths, Strides, MemoryRanks>{};
 }

 template <class Lengths, class Strides>
-__host__ __device__ constexpr auto make_ranked_ConstantTensorDescriptor(Lengths, Strides)
+__host__ __device__ constexpr auto make_ConstantTensorDescriptor_default_rank(Lengths, Strides)
 {
    using MemoryRanks = typename arithmetic_sequence_gen<0, Lengths::GetSize(), 1>::SeqType;
    return ConstantTensorDescriptor<Lengths, Strides, MemoryRanks>{};
 }

 template <class Lengths, index_t Align>
-__host__ __device__ constexpr auto
-    make_ranked_ConstantTensorDescriptor_with_alignment(Lengths, Number<Align>)
+__host__ __device__ constexpr auto make_ConstantTensorDescriptor_default_rank_aligned(Lengths,
+                                                                                      Number<Align>)
 {
    using Strides =
-        decltype(calculate_rank_tensor_default_strides_with_alignment(Lengths{}, Number<Align>{}));
+        decltype(calculate_tensor_strides_default_rank_aligned(Lengths{}, Number<Align>{}));
    using MemoryRanks = typename arithmetic_sequence_gen<0, Lengths::GetSize(), 1>::SeqType;
    return ConstantTensorDescriptor<Lengths, Strides, MemoryRanks>{};
 }
--- a/src/include/Sequence.hip.hpp
+++ b/src/include/Sequence.hip.hpp
@@ -1,5 +1,5 @@
 #pragma once
-#include "constant_integral.hip.hpp"
+#include "integral_constant.hip.hpp"
 #include "functional.hip.hpp"

 template <index_t... Is>
@@ -21,12 +21,6 @@ struct Sequence
        return mData[I];
    }

-    __host__ __device__ index_t operator[](index_t i) const
-    {
-        const index_t mData[mSize + 1] = {Is..., 0};
-        return mData[i];
-    }
-
    template <index_t... IRs>
    __host__ __device__ static constexpr auto ReorderGivenNew2Old(Sequence<IRs...> /*new2old*/)
    {
@@ -164,6 +158,12 @@ struct sequence_reverse_inclusive_scan<Sequence<I>, Reduce>
    using SeqType = Sequence<I>;
 };

+template <class Reduce>
+struct sequence_reverse_inclusive_scan<Sequence<>, Reduce>
+{
+    using SeqType = Sequence<>;
+};
+
 template <class, class>
 struct sequence_extract;

--- a/src/include/blockwise_2d_tensor_op.hip.hpp
+++ b/src/include/blockwise_2d_tensor_op.hip.hpp
@@ -457,7 +457,8 @@ struct Blockwise2dTensorCopy3
    index_t mSrcMyThreadOffset;
    index_t mDstMyThreadOffset;

-    __device__ Blockwise2dTensorCopy3()
+    __device__ Blockwise2dTensorCopy3(Array<index_t, 2> src_block_data_multi_id_begin,
+                                      Array<index_t, 2> dst_block_data_multi_id_begin)
    {
        constexpr auto I0 = Number<0>{};
        constexpr auto I1 = Number<1>{};
@@ -499,10 +500,13 @@ struct Blockwise2dTensorCopy3
        const index_t thread_id_d0 = get_thread_local_1d_id() / thread_per_d1;
        const index_t thread_id_d1 = get_thread_local_1d_id() - thread_id_d0 * thread_per_d1;

-        mSrcMyThreadOffset =
-            SrcDesc{}.GetOffsetFromMultiIndex(thread_id_d0, thread_id_d1 * DataPerRead);
-        mDstMyThreadOffset =
-            DstDesc{}.GetOffsetFromMultiIndex(thread_id_d0, thread_id_d1 * DataPerRead);
+        mSrcMyThreadOffset = SrcDesc{}.GetOffsetFromMultiIndex(
+            src_block_data_multi_id_begin +
+            Array<index_t, 2>{thread_id_d0, thread_id_d1 * DataPerRead});
+
+        mDstMyThreadOffset = DstDesc{}.GetOffsetFromMultiIndex(
+            dst_block_data_multi_id_begin +
+            Array<index_t, 2>{thread_id_d0, thread_id_d1 * DataPerRead});
    }

    __device__ void Run(const Float* __restrict__ p_src, Float* __restrict__ p_dst) const
--- a/src/include/blockwise_4d_tensor_op.hip.hpp
+++ b/src/include/blockwise_4d_tensor_op.hip.hpp
@@ -13,7 +13,7 @@ blockwise_4d_tensor_pointwise_operation_unary(DstDesc, Float* __restrict__ p_dst

    constexpr auto dst_desc = DstDesc{};

-    constexpr auto desc = make_packed_ConstantTensorDescriptor(dst_desc.GetLengths());
+    constexpr auto desc = make_ConstantTensorDescriptor_default_rank_packed(dst_desc.GetLengths());

 #if 0
    if(get_thread_local_1d_id() == 0)
@@ -108,7 +108,7 @@ __device__ void blockwise_4d_tensor_pointwise_operation_binary_reorder_by_get_ds

    constexpr auto src_desc = SrcDesc{};
    constexpr auto dst_desc = DstDesc{};
-    constexpr auto ref_desc = make_packed_ConstantTensorDescriptor(SrcOpLengths{});
+    constexpr auto ref_desc = make_ConstantTensorDescriptor_default_rank_packed(SrcOpLengths{});

    constexpr index_t NLoop = ref_desc.GetElementSize() / BlockSize;

@@ -259,7 +259,7 @@ struct Blockwise4dTensorCopy1
        constexpr index_t read_per_d3 = mod_conv::integer_divide_ceil(L3, DataPerRead);

        constexpr auto ref_desc =
-            make_packed_ConstantTensorDescriptor(Sequence<L0, L1, L2, read_per_d3>{});
+            make_ConstantTensorDescriptor_default_rank_packed(Sequence<L0, L1, L2, read_per_d3>{});

        constexpr index_t NLoop = ref_desc.GetElementSize() / BlockSize;

@@ -336,7 +336,7 @@ struct BlockwiseChwnTensorCopyPadded

        constexpr auto src_desc = SrcDesc{};
        constexpr auto dst_desc = DstDesc{};
-        constexpr auto ref_desc = make_packed_ConstantTensorDescriptor(DstOpLengths{});
+        constexpr auto ref_desc = make_ConstantTensorDescriptor_default_rank_packed(DstOpLengths{});

        constexpr auto h_global_pad_low = GlobalLowerPads{}.Get(I0);
        constexpr auto w_global_pad_low = GlobalLowerPads{}.Get(I1);
@@ -510,7 +510,8 @@ struct Blockwise4dTensorCopy3
            }
        }

-        constexpr auto thread_cluster_desc = make_packed_ConstantTensorDescriptor(ThreadPerDims{});
+        constexpr auto thread_cluster_desc =
+            make_ConstantTensorDescriptor_default_rank_packed(ThreadPerDims{});
        const auto thread_multi_id =
            thread_cluster_desc.GetMultiIndexFrom1dIndex(get_thread_local_1d_id());

@@ -652,7 +653,7 @@ struct Blockwise4dTensorCopy3
        constexpr index_t nloop_d2 = L2 / thread_per_d2;
        constexpr index_t nloop_d3 = mod_conv::integer_divide_ceil(L3, thread_per_d3 * DataPerRead);

-        constexpr auto clipboard_desc = make_packed_ConstantTensorDescriptor(
+        constexpr auto clipboard_desc = make_ConstantTensorDescriptor_default_rank_packed(
            Sequence<nloop_d0, nloop_d1, nloop_d2, nloop_d3 * DataPerRead>{});

 #pragma unroll
@@ -719,7 +720,7 @@ struct Blockwise4dTensorCopy3
        constexpr index_t nloop_d2 = L2 / thread_per_d2;
        constexpr index_t nloop_d3 = mod_conv::integer_divide_ceil(L3, thread_per_d3 * DataPerRead);

-        constexpr auto clipboard_desc = make_packed_ConstantTensorDescriptor(
+        constexpr auto clipboard_desc = make_ConstantTensorDescriptor_default_rank_packed(
            Sequence<nloop_d0, nloop_d1, nloop_d2, nloop_d3 * DataPerRead>{});

 #pragma unroll
--- a/src/include/blockwise_gemm.hip.hpp
+++ b/src/include/blockwise_gemm.hip.hpp
@@ -46,7 +46,7 @@ struct BlockwiseGemmBlockABlockBThreadCTransANormalBNormalC_v2
                          N % (NPerThreadSubC * NLevel0Cluster * NLevel1Cluster) == 0,
                      "wrong! Cannot evenly divide work among\n");

-        static_assert(ThreadMatrixC::GetLengths() == GetThreadMatrixCLengths(),
+        static_assert(is_same_type(ThreadMatrixC::GetLengths(), GetThreadMatrixCLengths()),
                      "wrong! ThreadMatrixC lengths is wrong");

        auto c_thread_mtx_index = GetBeginOfThreadMatrixC(get_thread_local_1d_id());
@@ -55,7 +55,7 @@ struct BlockwiseGemmBlockABlockBThreadCTransANormalBNormalC_v2
        mMyThreadOffsetB = BlockMatrixB::GetOffsetFromMultiIndex(0, c_thread_mtx_index.col);
    }

-    __device__ static auto GetThreadMatrixCLengths()
+    __device__ static constexpr auto GetThreadMatrixCLengths()
    {
        constexpr index_t M = BlockMatrixA::NCol(); // A is transposed
        constexpr index_t N = BlockMatrixB::NCol();
--- a/src/include/blockwise_merged_tensor_slice_op.hip.hpp
+++ b/src/include/blockwise_merged_tensor_slice_op.hip.hpp
@@ -1,19 +1,19 @@
 #pragma once
 #include "threadwise_tensor_slice_op.hip.hpp"

-// slice a merged tensor, reorder and copy it into a normal tensor
-// src: a merged tensor,
-// dst: a normal tensor
+// slice a (normal or merged) tensor, reorder and copy it into another (normal or merged) tensor
 template <index_t BlockSize,
          class Float,
          class SrcDesc,
          class DstDesc,
          class SliceLengths,
          class SubLengths,
-          class ClusterLengths,
+          class DataClusterLengths,
          class ThreadClusterArrangeOrder,
          class SrcAccessOrder,
-          class DstAccessOrder>
+          class DstAccessOrder,
+          index_t SrcDataPerRead,
+          index_t DstDataPerRead>
 struct BlockwiseTensorSliceCopy_generic_v1
 {
    static constexpr index_t nDim = SrcDesc::GetNumOfDimension();
@@ -21,39 +21,44 @@ struct BlockwiseTensorSliceCopy_generic_v1
    index_t mSrcMyThreadOffset;
    index_t mDstMyThreadOffset;

-    __device__ BlockwiseTensorSliceCopy_generic_v1(Array<index_t, nDim> src_block_multi_offset,
-                                                   Array<index_t, nDim> dst_block_multi_offset)
+    __device__
+    BlockwiseTensorSliceCopy_generic_v1(Array<index_t, nDim> src_block_data_multi_id_begin,
+                                        Array<index_t, nDim> dst_block_data_multi_id_begin)
    {
        // check NDim consistent
        static_assert(SrcDesc::GetNumOfDimension() == DstDesc::GetNumOfDimension(), "wrong");

-        constexpr auto thread_cluster_desc = make_packed_ConstantTensorDescriptor(
-            ClusterLengths{}.ReorderGivenNew2Old(ThreadClusterArrangeOrder{}));
+        // thread cluster
+        constexpr auto thread_cluster_desc = make_ConstantTensorDescriptor_default_rank_packed(
+            DataClusterLengths{}.ReorderGivenNew2Old(ThreadClusterArrangeOrder{}));

        // BlockSize
        static_assert(BlockSize == thread_cluster_desc.GetElementSize(), "wrong! BlockSize");

        // divide work
-        static_for<0, nDim, 1>{}([&](auto IDim) {
-            static_assert(SliceLengths{}.Get(IDim) % SubLenghs{}.Get(IDim) == 0,
+        constexpr auto data_per_cluster_per_dims = SubLengths{} * DataClusterLengths{};
+
+        static_for<0, nDim, 1>{}([&](auto IDim_) {
+            constexpr auto IDim = decltype(IDim_){};
+
+            static_assert(SliceLengths::Get(IDim) % SubLengths::Get(IDim) == 0,
                          "wrong! cannot evenly divide sliced tensor into sub-tensor");
+
+            static_assert(SliceLengths::Get(IDim) % data_per_cluster_per_dims.Get(IDim) == 0,
+                          "wrong! cannot evenly divide sliced tensor into cluster");
        });

-        constexpr auto thread_work_desc =
-            make_packed_ConstantTensorDescriptor(SliceLengths{} / SliceSubLengths{});
+        constexpr auto repeat_lengths = SliceLengths{} / data_per_cluster_per_dims;

-        static_for<0, nDim, 1>{}([&](auto IDim) {
-            static_assert(thread_work_desc.GetLength(IDim) % thread_cluster_desc.Get(IDim) == 0,
-                          "wrong! cannot evenly divide work to cluster");
-        });
+        // for now, only support SubLengths.Get() == 1 on a merged dimension that is merge from
+        // multiple dimensions
+        static_for<0, nDim, 1>{}([&](auto IDim_) {
+            constexpr auto IDim = decltype(IDim_){};

-        // only support SubLengths.Get() == 1 on merged dimension, for now
-        static_for<0, nDim, 1>{}([&](auto IDim) {
-            static_if<(SrcDesc::ContainMultipleOriginalDimensions(IDim) ||
-                       DstDesc::ContainMultipleOriginalDimensions(IDim))>{}([&](auto fwd) {
-                static_assert(fwd(SubLengths{}).Get(IDim) == 1,
-                              "wrong! Sub-Lengths on merged dimension should be 1");
-            });
+            static_assert(SubLengths::Get(IDim) == 1 ||
+                              (!SrcDesc::ContainMultipleOriginalDimensions(IDim) &&
+                               !DstDesc::ContainMultipleOriginalDimensions(IDim)),
+                          "wrong! only surpport Sub-Length == 1 on a merged dimension");
        });

        // calculate mSrcMyThreadOffset, mDstMyThreadOffset
@@ -63,22 +68,23 @@ struct BlockwiseTensorSliceCopy_generic_v1
        const auto data_cluster_multi_id =
            reorder_array_given_old2new(thread_cluster_multi_id, ThreadClusterArrangeOrder{});

-        const auto thread_data_multi_offset = data_cluster_multi_id * SubLengths{};
+        const auto thread_data_multi_id_begin = data_cluster_multi_id * SubLengths{};

-        mSrcMythreadOffset =
-            SrcDesc::GetOffsetFromMultiIndex(src_block_multi_offset + thread_data_multi_offset);
-        mSrcMythreadOffset =
-            DstDesc::GetOffsetFromMultiIndex(dst_block_multi_offset + thread_data_multi_offset);
+        mSrcMyThreadOffset = SrcDesc::GetOffsetFromMultiIndex(src_block_data_multi_id_begin +
+                                                              thread_data_multi_id_begin);
+
+        mSrcMyThreadOffset = DstDesc::GetOffsetFromMultiIndex(dst_block_data_multi_id_begin +
+                                                              thread_data_multi_id_begin);
    }

    __device__ static constexpr index_t GetRegisterClipboardSize()
    {
-        constexpr auto repeat_lengths = SliceLengths{} / (SubLengths{} * ClusterLengths{});
+        constexpr auto repeat_lengths = SliceLengths{} / (SubLengths{} * DataClusterLengths{});

        constexpr auto thread_tensor_desc =
-            make_packed_ConstantTensorDescriptor(SubLengths{} * repeat_lengths);
+            make_ConstantTensorDescriptor_default_rank_packed(SubLengths{} * repeat_lengths);

-        return thread_tensor_desc.GetElementSpaceSize();
+        return thread_tensor_desc.GetElementSpace();
    }

    __device__ void RunLoadRegisterClipboard(const Float* __restrict__ p_src,
@@ -86,32 +92,34 @@ struct BlockwiseTensorSliceCopy_generic_v1
    {
        constexpr auto thread_sub_tensor_lengths = SubLengths{};

-        constexpr auto data_per_cluster_per_dims = thread_sub_tensor_lengths * ClusterLengths{};
+        constexpr auto data_per_cluster_per_dims = thread_sub_tensor_lengths * DataClusterLengths{};

-        constexpr auto repeat_lengths = SliceLengths{} / (SubLengths{} * ClusterLengths{});
+        constexpr auto repeat_lengths = SliceLengths{} / (SubLengths{} * DataClusterLengths{});

-        constexpr auto thread_tensor_desc =
-            make_packed_ConstantTensorDescriptor(thread_sub_tensor_lengths * repeat_lengths);
+        constexpr auto thread_tensor_desc = make_ConstantTensorDescriptor_default_rank_packed(
+            thread_sub_tensor_lengths * repeat_lengths);

        static_ford<decltype(repeat_lengths)>{}([&](auto repeat_multi_id_) {
-            constexpr auto repeat_multi_id = decltype(repeat_multi_id_){};
+            constexpr auto repeat_multi_id = sequence2array(decltype(repeat_multi_id_){});

-            constexpr auto src_data_multi_offset = repeat_multi_id * data_per_cluster_per_dims;
+            const auto src_thread_data_multi_id_begin =
+                repeat_multi_id * data_per_cluster_per_dims; // cannot not constexpr, why?

-            constexpr auto clipboard_data_multi_offset =
-                repeat_multi_id * thread_sub_tensor_lengths;
+            const auto clipboard_data_multi_id_begin =
+                repeat_multi_id * thread_sub_tensor_lengths; // cannot not constexpr, why?

-            constexpr index_t src_offset = SrcDesc{}.GetOffsetFromMultiIndex(src_data_multi_id);
-            constexpr index_t clipboard_offset =
-                thread_tensor_desc.GetOffsetFromMultiIndex(clipboard_data_multi_id);
+            const index_t src_offset = SrcDesc{}.GetOffsetFromMultiIndex(
+                src_thread_data_multi_id_begin); // cannot not constexpr, why?
+
+            const index_t clipboard_offset = thread_tensor_desc.GetOffsetFromMultiIndex(
+                clipboard_data_multi_id_begin); // cannot not constexpr, why?

            threadwise_tensor_slice_copy_generic(SrcDesc{},
                                                 p_src + src_offset + mSrcMyThreadOffset,
-                                                 thread_tensor_desc,
-                                                 zero_array<index_t, nDim>{},
+                                                 make_zero_array<index_t, nDim>(),
                                                 thread_tensor_desc,
                                                 p_clipboard + clipboard_offset,
-                                                 zero_array<index_t, nDim>{},
+                                                 make_zero_array<index_t, nDim>(),
                                                 thread_sub_tensor_lengths,
                                                 SrcAccessOrder{});
        });
@@ -122,41 +130,44 @@ struct BlockwiseTensorSliceCopy_generic_v1
    {
        constexpr auto thread_sub_tensor_lengths = SubLengths{};

-        constexpr auto data_per_cluster_per_dims = thread_sub_tensor_lengths * ClusterLengths{};
+        constexpr auto data_per_cluster_per_dims = thread_sub_tensor_lengths * DataClusterLengths{};

-        constexpr auto repeat_lengths = SliceLengths{} / (SubLengths{} * ClusterLengths{});
+        constexpr auto repeat_lengths = SliceLengths{} / (SubLengths{} * DataClusterLengths{});

-        constexpr auto thread_tensor_desc =
-            make_packed_ConstantTensorDescriptor(thread_sub_tensor_lengths * repeat_lengths);
+        constexpr auto thread_tensor_desc = make_ConstantTensorDescriptor_default_rank_packed(
+            thread_sub_tensor_lengths * repeat_lengths);

        static_ford<decltype(repeat_lengths)>{}([&](auto repeat_multi_id_) {
-            constexpr auto repeat_multi_id = decltype(repeat_multi_id_){};
+            constexpr auto repeat_multi_id = sequence2array(decltype(repeat_multi_id_){});

-            constexpr auto clipboard_data_multi_offset =
-                repeat_multi_id * thread_sub_tensor_lengths;
+            const auto clipboard_data_multi_id_begin =
+                repeat_multi_id * thread_sub_tensor_lengths; // cannot not constexpr, why?

-            constexpr auto dst_data_multi_offset = repeat_multi_id * data_per_cluster_per_dims;
+            const auto dst_data_multi_id_begin =
+                repeat_multi_id * data_per_cluster_per_dims; // cannot not constexpr, why?

-            constexpr index_t clipboard_offset =
-                thread_tensor_desc.GetOffsetFromMultiIndex(clipboard_data_multi_offset);
+            const index_t clipboard_offset = thread_tensor_desc.GetOffsetFromMultiIndex(
+                clipboard_data_multi_id_begin); // cannot not constexpr, why?

-            constexpr index_t dst_offset = DstDesc{}.GetOffsetFromMultiIndex(dst_data_multi_offset);
+            const index_t dst_offset = DstDesc{}.GetOffsetFromMultiIndex(
+                dst_data_multi_id_begin); // cannot not constexpr, why?

            threadwise_tensor_slice_copy_generic(thread_tensor_desc,
                                                 p_clipboard + clipboard_offset,
-                                                 zero_array<index_t, nDim>{},
+                                                 make_zero_array<index_t, nDim>(),
                                                 DstDesc{},
                                                 p_dst + dst_offset + mDstMyThreadOffset,
-                                                 zero_array<index_t, nDim>{},
+                                                 make_zero_array<index_t, nDim>(),
                                                 thread_sub_tensor_lengths,
                                                 DstAccessOrder{});
+        });
    }

    __device__ void Run(const Float* __restrict__ p_src, Float* __restrict__ p_dst) const
    {
-            Float p_clipboard[GetRegisterClipboardSize()];
+        Float p_clipboard[GetRegisterClipboardSize()];

-            RunLoadRegisterClipboard(p_src, p_clipboard);
-            RunStoreRegisterClipboard(p_clipboard, p_dst);
+        RunLoadRegisterClipboard(p_src, p_clipboard);
+        RunStoreRegisterClipboard(p_clipboard, p_dst);
    }
-    };
+};
--- a/src/include/blockwise_tensor_slice_op.hip.hpp
+++ b/src/include/blockwise_tensor_slice_op.hip.hpp
@@ -40,7 +40,7 @@ struct BlockwiseTensorSliceReorderCopy_v3
            src_cluster_lengths.ReorderGivenNew2Old(map_thread_cluster_2_src_cluster);

        constexpr auto thread_cluster_desc =
-            make_packed_ConstantTensorDescriptor(thread_cluster_lengths);
+            make_ConstantTensorDescriptor_default_rank_packed(thread_cluster_lengths);

        // sanity check: data type
        static_assert(is_same<Float, float>::value, "wrong! only support float for now!\n");
@@ -149,7 +149,7 @@ struct BlockwiseTensorSliceReorderCopy_v3
        constexpr auto thread_tensor_lengths = thread_sub_tensor_lengths * repeat_lengths;

        constexpr auto thread_tensor_desc =
-            make_packed_ConstantTensorDescriptor(thread_tensor_lengths);
+            make_ConstantTensorDescriptor_default_rank_packed(thread_tensor_lengths);

        return thread_tensor_desc.GetElementSpace();
    }
@@ -170,7 +170,7 @@ struct BlockwiseTensorSliceReorderCopy_v3
        constexpr auto thread_tensor_lengths = thread_sub_tensor_lengths * repeat_lengths;

        constexpr auto thread_tensor_desc =
-            make_packed_ConstantTensorDescriptor(thread_tensor_lengths);
+            make_ConstantTensorDescriptor_default_rank_packed(thread_tensor_lengths);

        static_ford<decltype(repeat_lengths)>{}([&](auto repeat_multi_id_) {
            constexpr auto repeat_multi_id = decltype(repeat_multi_id_){};
@@ -208,7 +208,7 @@ struct BlockwiseTensorSliceReorderCopy_v3
        constexpr auto thread_tensor_lengths = thread_sub_tensor_lengths * repeat_lengths;

        constexpr auto thread_tensor_desc =
-            make_packed_ConstantTensorDescriptor(thread_tensor_lengths);
+            make_ConstantTensorDescriptor_default_rank_packed(thread_tensor_lengths);

        static_ford<decltype(repeat_lengths)>{}([&](auto repeat_multi_id_) {
            constexpr auto repeat_multi_id = decltype(repeat_multi_id_){};
--- a/src/include/common.hip.hpp
+++ b/src/include/common.hip.hpp
@@ -1,6 +1,6 @@
 #pragma once
 #include "vector_type.hip.hpp"
-#include "constant_integral.hip.hpp"
+#include "integral_constant.hip.hpp"
 #include "Sequence.hip.hpp"
 #include "Array.hip.hpp"
 #include "functional.hip.hpp"
@@ -17,15 +17,21 @@ __device__ index_t get_block_1d_id() { return blockIdx.x; }
 template <class T1, class T2>
 struct is_same
 {
-    static const bool value = false;
+    static constexpr bool value = false;
 };

 template <class T>
 struct is_same<T, T>
 {
-    static const bool value = true;
+    static constexpr bool value = true;
 };

+template <class X, class Y>
+__host__ __device__ constexpr bool is_same_type(X, Y)
+{
+    return is_same<X, Y>::value;
+}
+
 namespace mod_conv { // namespace mod_conv
 template <class T, T s>
 struct scales
--- a/src/include/conv_common.hip.hpp
+++ b/src/include/conv_common.hip.hpp
@@ -30,7 +30,7 @@ __host__ __device__ constexpr auto get_convolution_output_default_4d_tensor_desc
    constexpr auto HO = HI + 1 - Y;
    constexpr auto WO = WI + 1 - X;

-    return make_packed_ConstantTensorDescriptor(Sequence<N, K, HO, WO>{});
+    return make_ConstantTensorDescriptor_default_rank_packed(Sequence<N, K, HO, WO>{});
 }

 template <class InDesc, class WeiDesc, class LowerPads, class UpperPads>
@@ -67,7 +67,7 @@ __host__ __device__ constexpr auto get_convolution_with_padding_output_default_4
    constexpr auto HO = HI + HPadLow + HPadUp + 1 - Y;
    constexpr auto WO = WI + WPadLow + WPadUp + 1 - X;

-    return make_packed_ConstantTensorDescriptor(Sequence<N, K, HO, WO>{});
+    return make_ConstantTensorDescriptor_default_rank_packed(Sequence<N, K, HO, WO>{});
 }

 template <class InDesc, class WeiDesc, class OutDesc>
--- a/src/include/functional.hip.hpp
+++ b/src/include/functional.hip.hpp
@@ -1,5 +1,5 @@
 #pragma once
-#include "constant_integral.hip.hpp"
+#include "integral_constant.hip.hpp"

 struct forwarder
 {
--- a/src/include/gridwise_convolution_implicit_gemm_v1r3_lds_double_buffer_chwn_cyxk_khwn.hip.hpp
+++ b/src/include/gridwise_convolution_implicit_gemm_v1r3_lds_double_buffer_chwn_cyxk_khwn.hip.hpp
@@ -85,7 +85,7 @@ struct GridwiseConvolutionImplicitGemm_v1r3_lds_double_buffer_chwn_cyxk_khwn
        constexpr index_t WBlockWork = mod_conv::integer_divide_ceil(Wo, WoPerBlock);
        constexpr index_t NBlockWork = mod_conv::integer_divide_ceil(N, NPerBlock);

-        constexpr auto block_work_desc = make_packed_ConstantTensorDescriptor(
+        constexpr auto block_work_desc = make_ConstantTensorDescriptor_default_rank_packed(
            Sequence<KBlockWork, HBlockWork, WBlockWork, NBlockWork>{});

        const auto block_work_multi_id =
@@ -109,7 +109,7 @@ struct GridwiseConvolutionImplicitGemm_v1r3_lds_double_buffer_chwn_cyxk_khwn
                                                    GemmDataPerReadA,
                                                    GemmDataPerReadB);

-        constexpr auto in_c_h_w_n_block_desc = make_ranked_ConstantTensorDescriptor_with_alignment(
+        constexpr auto in_c_h_w_n_block_desc = make_ConstantTensorDescriptor_default_rank_aligned(
            Sequence<CPerBlock, HoPerBlock, WoPerBlock, NPerBlock>{},
            Number<InBlockCopyDataPerRead_N>{});

@@ -118,12 +118,12 @@ struct GridwiseConvolutionImplicitGemm_v1r3_lds_double_buffer_chwn_cyxk_khwn
        static_assert(in_c_h_w_n_block_desc.GetStride(I1) % GemmDataPerReadB == 0,
                      "GemmDataPerReadB alignment requirement is not meet");

-        constexpr auto wei_c_k_block_desc = make_ranked_ConstantTensorDescriptor_with_alignment(
+        constexpr auto wei_c_k_block_desc = make_ConstantTensorDescriptor_default_rank_aligned(
            Sequence<CPerBlock, KPerBlock>{},
            Number<mod_conv::max(WeiBlockCopyDataPerRead_K, GemmDataPerReadA)>{});

        // tensor view of threadwise output in register
-        constexpr auto out_k_h_w_n_thread_desc = make_packed_ConstantTensorDescriptor(
+        constexpr auto out_k_h_w_n_thread_desc = make_ConstantTensorDescriptor_default_rank_packed(
            Sequence<KPerThread, HoPerThread, WoPerThread, NPerThread>{});

 // blockwise copy
--- a/src/include/gridwise_convolution_implicit_gemm_v1r3_lds_double_buffer_nchw_cyxk_khwn.hip.hpp
+++ b/src/include/gridwise_convolution_implicit_gemm_v1r3_lds_double_buffer_nchw_cyxk_khwn.hip.hpp
@@ -86,7 +86,7 @@ struct GridwiseConvolutionImplicitGemm_v1r3_lds_double_buffer_nchw_cyxk_khwn
        constexpr index_t HBlockWork = mod_conv::integer_divide_ceil(Ho, HoPerBlock);
        constexpr index_t WBlockWork = mod_conv::integer_divide_ceil(Wo, WoPerBlock);

-        constexpr auto block_work_desc = make_packed_ConstantTensorDescriptor(
+        constexpr auto block_work_desc = make_ConstantTensorDescriptor_default_rank_packed(
            Sequence<NBlockWork, KBlockWork, HBlockWork, WBlockWork>{});

        const auto block_work_multi_id =
@@ -102,7 +102,7 @@ struct GridwiseConvolutionImplicitGemm_v1r3_lds_double_buffer_nchw_cyxk_khwn

        // global tensor view
        constexpr auto wei_c_k_global_desc =
-            make_ranked_ConstantTensorDescriptor(Sequence<C, K>{}, Sequence<Y * X * K, 1>{});
+            make_ConstantTensorDescriptor_default_rank(Sequence<C, K>{}, Sequence<Y * X * K, 1>{});

        // LDS tensor view
        //   be careful of alignment
@@ -111,7 +111,7 @@ struct GridwiseConvolutionImplicitGemm_v1r3_lds_double_buffer_nchw_cyxk_khwn
                                                    GemmDataPerReadA,
                                                    GemmDataPerReadB);

-        constexpr auto in_c_h_w_n_block_desc = make_ranked_ConstantTensorDescriptor_with_alignment(
+        constexpr auto in_c_h_w_n_block_desc = make_ConstantTensorDescriptor_default_rank_aligned(
            Sequence<CPerBlock, HoPerBlock, WoPerBlock, NPerBlock>{},
            Number<InBlockReorderDataPerWrite_N>{});

@@ -120,12 +120,12 @@ struct GridwiseConvolutionImplicitGemm_v1r3_lds_double_buffer_nchw_cyxk_khwn
        static_assert(in_c_h_w_n_block_desc.GetStride(I1) % GemmDataPerReadB == 0,
                      "GemmDataPerReadB alignment requirement is not meet");

-        constexpr auto wei_c_k_block_desc = make_ranked_ConstantTensorDescriptor_with_alignment(
+        constexpr auto wei_c_k_block_desc = make_ConstantTensorDescriptor_default_rank_aligned(
            Sequence<CPerBlock, KPerBlock>{},
            Number<mod_conv::max(WeiBlockCopyDataPerRead_K, GemmDataPerReadA)>{});

        // tensor view of threadwise output in register
-        constexpr auto out_k_h_w_n_thread_desc = make_packed_ConstantTensorDescriptor(
+        constexpr auto out_k_h_w_n_thread_desc = make_ConstantTensorDescriptor_default_rank_packed(
            Sequence<KPerThread, HoPerThread, WoPerThread, NPerThread>{});

        // blockwise copy
@@ -448,10 +448,10 @@ struct GridwiseConvolutionImplicitGemm_v1r3_lds_double_buffer_nchw_cyxk_khwn
            constexpr index_t K1 = KPerBlock / KPerThread;

 #if 0
-            constexpr auto out_10d_global_desc = make_packed_ConstantTensorDescriptor(
+            constexpr auto out_10d_global_desc = make_ConstantTensorDescriptor_default_rank_packed(
                Sequence<K / (K1 * K2), K1, K2, Ho, Wo / (W1 * W2 * W3), W1, W2, W3, N / N1, N1>{});

-            constexpr auto out_10d_thread_desc = make_packed_ConstantTensorDescriptor(
+            constexpr auto out_10d_thread_desc = make_ConstantTensorDescriptor_default_rank_packed(
                Sequence<KPerThread / K2, 1, K2, HoPerThread, 1, W1, 1, W3, 1, N1>{});
 #else
            constexpr auto out_10d_global_desc =
--- a/src/include/gridwise_convolution_implicit_gemm_v1r3_lds_double_buffer_nchw_cyxk_nkhw.hip.hpp
+++ b/src/include/gridwise_convolution_implicit_gemm_v1r3_lds_double_buffer_nchw_cyxk_nkhw.hip.hpp
@@ -86,7 +86,7 @@ struct GridwiseConvolutionImplicitGemm_v1r3_lds_double_buffer_nchw_cyxk_nkhw
        constexpr index_t HBlockWork = mod_conv::integer_divide_ceil(Ho, HoPerBlock);
        constexpr index_t WBlockWork = mod_conv::integer_divide_ceil(Wo, WoPerBlock);

-        constexpr auto block_work_desc = make_packed_ConstantTensorDescriptor(
+        constexpr auto block_work_desc = make_ConstantTensorDescriptor_default_rank_packed(
            Sequence<NBlockWork, KBlockWork, HBlockWork, WBlockWork>{});

        const auto block_work_multi_id =
@@ -110,7 +110,7 @@ struct GridwiseConvolutionImplicitGemm_v1r3_lds_double_buffer_nchw_cyxk_nkhw
                                                    GemmDataPerReadA,
                                                    GemmDataPerReadB);

-        constexpr auto in_c_h_w_n_block_desc = make_ranked_ConstantTensorDescriptor_with_alignment(
+        constexpr auto in_c_h_w_n_block_desc = make_ConstantTensorDescriptor_default_rank_aligned(
            Sequence<CPerBlock, HoPerBlock, WoPerBlock, NPerBlock>{},
            Number<InBlockReorderDataPerWrite_N>{});

@@ -119,12 +119,12 @@ struct GridwiseConvolutionImplicitGemm_v1r3_lds_double_buffer_nchw_cyxk_nkhw
        static_assert(in_c_h_w_n_block_desc.GetStride(I1) % GemmDataPerReadB == 0,
                      "GemmDataPerReadB alignment requirement is not meet");

-        constexpr auto wei_c_k_block_desc = make_ranked_ConstantTensorDescriptor_with_alignment(
+        constexpr auto wei_c_k_block_desc = make_ConstantTensorDescriptor_default_rank_aligned(
            Sequence<CPerBlock, KPerBlock>{},
            Number<mod_conv::max(WeiBlockCopyDataPerRead_K, GemmDataPerReadA)>{});

        // tensor view of threadwise output in register
-        constexpr auto out_k_h_w_n_thread_desc = make_packed_ConstantTensorDescriptor(
+        constexpr auto out_k_h_w_n_thread_desc = make_ConstantTensorDescriptor_default_rank_packed(
            Sequence<KPerThread, HoPerThread, WoPerThread, NPerThread>{});

        // blockwise copy
@@ -152,7 +152,7 @@ struct GridwiseConvolutionImplicitGemm_v1r3_lds_double_buffer_nchw_cyxk_nkhw
                                   decltype(wei_c_k_global_desc),
                                   decltype(wei_c_k_block_desc),
                                   decltype(wei_c_k_block_desc.GetLengths()),
-                                   WeiBlockCopyDataPerRead_K>{};
+                                   WeiBlockCopyDataPerRead_K>({0, 0}, {0, 0});

        // a series of blockwise batched GEMM
        // C_matrix += transpose(A_matrix) * B_matrix
@@ -196,7 +196,7 @@ struct GridwiseConvolutionImplicitGemm_v1r3_lds_double_buffer_nchw_cyxk_nkhw

        // choose GEMM implementation here
        const auto run_blockwise_batch_gemm = [&](auto... Xs) {
-#if 0
+#if 1
            return blockwise_batch_gemm.Run(Xs...);
 #elif 0
            return blockwise_batch_gemm.Run_asm(Xs...);
--- a/src/include/gridwise_convolution_implicit_gemm_v3_nchw_cyxk_nkhw.hip.hpp
+++ b/src/include/gridwise_convolution_implicit_gemm_v3_nchw_cyxk_nkhw.hip.hpp
@@ -1,8 +1,11 @@
 #pragma once
 #include "common.hip.hpp"
 #include "ConstantTensorDescriptor.hip.hpp"
+#include "ConstantMergedTensorDescriptor.hip.hpp"
 #include "ConstantMatrixDescriptor.hip.hpp"
+#include "blockwise_merged_tensor_slice_op.hip.hpp"
 #include "blockwise_gemm.hip.hpp"
+#include "threadwise_tensor_slice_op.hip.hpp"

 // define B = merge(N, Ho, Wo)
 template <index_t GridSize,
@@ -24,7 +27,12 @@ template <index_t GridSize,
          index_t GemmNLevel1Cluster,
          index_t GemmKPerThreadLoop,
          index_t GemmDataPerReadA,
-          index_t GemmDataPerReadB>
+          index_t GemmDataPerReadB,
+          class InBlockCopySubLengths_N1_N2_C_B,
+          class InBlockCopyClusterLengths_N1_N2_C_B,
+          index_t InBlockCopySrcDataPerRead_B,
+          index_t InBlockCopyDstDataPerWrite_N2,
+          index_t WeiBlockCopyDataPerAccess_K>
 struct GridwiseConvolutionImplicitGemm_v3_nchw_cyxk_nkhw
 {
    __device__ void Run(const Float* const __restrict__ p_in_global,
@@ -34,12 +42,10 @@ struct GridwiseConvolutionImplicitGemm_v3_nchw_cyxk_nkhw
        // this is a mess
        // TODO: more elegent way of specifying (or calculating) performance variables
        static_assert(N2 == GemmNPerThreadSubC, "wrong!");
-        static_assert(KPerBlock ==
-                          N1 * GemmNPerThreadSubC * GemmNLevel0Cluster * GemmNLevel1Cluster,
+        static_assert((N1 * N2 * BPerBlock) %
+                              (GemmNPerThreadSubC * GemmNLevel0Cluster * GemmNLevel1Cluster) ==
+                          0,
                      "wrong!");
-        static_assert(
-            KPerBlock % (N1 * GemmNPerThreadSubC * GemmNLevel0Cluster * GemmNLevel1Cluster) == 0,
-            "wrong!");

        constexpr auto I0 = Number<0>{};
        constexpr auto I1 = Number<1>{};
@@ -73,15 +79,14 @@ struct GridwiseConvolutionImplicitGemm_v3_nchw_cyxk_nkhw
        constexpr index_t B = N0 * Ho * Wo;

        // divide block work by [K, B]
-        static_assert(K % KPerBlock == 0 && B % BPerBlock == 0,
-                      C % CPerBlock == 0,
+        static_assert(K % KPerBlock == 0 && B % BPerBlock == 0 && C % CPerBlock == 0,
                      "wrong! cannot divide work evenly among block");

        constexpr index_t KBlockWork = K / KPerBlock;
        constexpr index_t BBlockWork = B / BPerBlock;

        constexpr auto block_work_desc =
-            make_ConstantTensorDescriptor(Sequence<KBlockWork, BBlockWork>{});
+            make_ConstantTensorDescriptor_default_rank_packed(Sequence<KBlockWork, BBlockWork>{});

        const auto block_work_multi_id =
            block_work_desc.GetMultiIndexFrom1dIndex(get_block_1d_id());
@@ -95,16 +100,20 @@ struct GridwiseConvolutionImplicitGemm_v3_nchw_cyxk_nkhw
            in_n_c_h_w_global_desc.Fold(I0, Number<N1>{}, Number<N2>{});

        //     merged tensor descriptor in device memory [N1, N2, C, B], src of blockwise copy
-        constexpr auto in_n1_n2_c_b_global_merged_desc =
-            in_n0_n1_n2_c_h_w_global_desc.ReorderGivenNew2Old(Sequence<1, 2, 3, 0, 4, 5>{})
+        constexpr auto in_n1_n2_c_b_global_merged_desc = make_ConstantMergedTensorDescriptor(
+            in_n0_n1_n2_c_h_w_global_mem_desc.ReorderGivenNew2Old(Sequence<1, 2, 3, 0, 4, 5>{})
                .Slice(I4, Number<Ho>{})
-                .Slice(I5, Number<Wo>{})
-                .Merge(I3, I5);
+                .Slice(I5, Number<Wo>{}),
+            Sequence<0>{},
+            Sequence<1>{},
+            Sequence<2>{},
+            Sequence<3, 4, 5>{});

        //     memory layout descriptor in LDS [C, N1, B, N2]
        //     be careful of LDS alignment
-        constexpr auto in_c_n1_b_n2_block_mem_desc = make_ConstantTensorDescriptor_aligned(
-            Sequence<CPerBlock, N1, BPerBlock, N2>{}, Number<InBlockCopyDstDataPerWrite_N2>{});
+        constexpr auto in_c_n1_b_n2_block_mem_desc =
+            make_ConstantTensorDescriptor_default_rank_aligned(
+                Sequence<CPerBlock, N1, BPerBlock, N2>{}, Number<InBlockCopyDstDataPerWrite_N2>{});

        //    tensor descriptor in LDS [N1, N2, C, B], dst of blockwise copy
        constexpr auto in_n1_n2_c_b_block_desc =
@@ -112,7 +121,7 @@ struct GridwiseConvolutionImplicitGemm_v3_nchw_cyxk_nkhw

        //     this check is ad-hoc
        //     TODO: need to properly implement tensor descriptor with alignment
-        static_assert(in_c_n1_b_n2_block_desc.GetStride(I1) % GemmDataPerReadB == 0,
+        static_assert(in_c_n1_b_n2_block_mem_desc.GetStride(I1) % GemmDataPerReadB == 0,
                      "GemmDataPerReadB alignment requirement is not satisfied");

        // input blockwise copy
@@ -129,7 +138,8 @@ struct GridwiseConvolutionImplicitGemm_v3_nchw_cyxk_nkhw
            Sequence<2, 0, 1, 3>, // thread_arrange_order [C, N1, N2, B]
            Sequence<0, 1, 2, 3>, // src_access_order [N1, N2, C, B]
            Sequence<2, 0, 3, 1>, // dst_access_order [C, N1, B, N2]
-            >({0, 0, 0, b_block_data_on_global}, {0, 0, 0, 0});
+            InBlockCopySrcDataPerRead_B,
+            InBlockCopyDstDataPerWrite_N2>({0, 0, 0, b_block_data_on_global}, {0, 0, 0, 0});

        // weight tensor
        //     tensor descriptor in device memory, src of blockwise copy
@@ -137,9 +147,9 @@ struct GridwiseConvolutionImplicitGemm_v3_nchw_cyxk_nkhw

        //     tensor descriptor in LDS, dst of blockwise copy
        //     be careful of LDS alignment
-        constexpr auto wei_c_k_block_desc = make_ConstantTensorDescriptor_aligned(
+        constexpr auto wei_c_k_block_desc = make_ConstantTensorDescriptor_default_rank_aligned(
            Sequence<CPerBlock, KPerBlock>{},
-            Number<mod_conv::max(WeiBlockCopyDataPerRead_K, GemmDataPerReadA)>{});
+            Number<mod_conv::max(WeiBlockCopyDataPerAccess_K, GemmDataPerReadA)>{});

        // operator for blockwise copy of weight into LDS
        //     slicing a tensor
@@ -150,7 +160,8 @@ struct GridwiseConvolutionImplicitGemm_v3_nchw_cyxk_nkhw
                                   decltype(wei_c_k_global_desc),
                                   decltype(wei_c_k_block_desc),
                                   decltype(wei_c_k_block_desc.GetLengths()),
-                                   WeiBlockCopyDataPerRead_K>({0, k_block_data_on_global}, {0, 0});
+                                   WeiBlockCopyDataPerAccess_K>({0, k_block_data_on_global},
+                                                                {0, 0});

        // GEMM definition
        // c_mtx += transpose(a_mtx) * b_mtx
@@ -167,7 +178,8 @@ struct GridwiseConvolutionImplicitGemm_v3_nchw_cyxk_nkhw
                                          Number<in_c_n1_b_n2_block_mem_desc.GetStride(I0)>{});

        // sanity check
-        static_assert(KPerBlock % (GemmMPerThreadSubC * GemmMLevel0Cluster * GemmMLevel1Cluster),
+        static_assert(KPerBlock % (GemmMPerThreadSubC * GemmMLevel0Cluster * GemmMLevel1Cluster) ==
+                          0,
                      "wrong!");

        constexpr index_t GemmMRepeat =
@@ -194,8 +206,8 @@ struct GridwiseConvolutionImplicitGemm_v3_nchw_cyxk_nkhw
            GemmDataPerReadB>{};

        // LDS allocation for input and weight: be careful of alignment
-        constexpr index_t max_align = mod_conv::max(InBlockReorderDataPerWrite_N,
-                                                    WeiBlockCopyDataPerRead_K,
+        constexpr index_t max_align = mod_conv::max(InBlockCopyDstDataPerWrite_N2,
+                                                    WeiBlockCopyDataPerAccess_K,
                                                    GemmDataPerReadA,
                                                    GemmDataPerReadB);

@@ -211,7 +223,7 @@ struct GridwiseConvolutionImplicitGemm_v3_nchw_cyxk_nkhw
        Float p_out_thread[c_k0k2_n1n2_thread_mtx_desc.GetElementSpace()];

        // zero out threadwise output
-        threadwise_matrix_set_zero(out_k0_k1_k2_n1_n0_h_w_n2_thread_desc, p_out_thread);
+        threadwise_matrix_set_zero(c_k0k2_n1n2_thread_mtx_desc, p_out_thread);

        // do work
        for(index_t y = 0; y < Y; ++y)
@@ -229,15 +241,15 @@ struct GridwiseConvolutionImplicitGemm_v3_nchw_cyxk_nkhw
                        c_block_data_on_global = 0;
                    c_block_data_on_global < C;
                    c_block_data_on_global += CPerBlock,
-                        p_in_block_ont_global += CPerBlock * in_n_c_h_w_global_desc.GetStride(I1),
+                        p_in_block_on_global += CPerBlock * in_n_c_h_w_global_desc.GetStride(I1),
                        p_wei_block_on_global += CPerBlock * wei_c_y_x_k_global_desc.GetStride(I0))
                {
-                    blockwise_in_copy.run(p_in_block_on_global, p_in_block);
-                    blockwise_wei_copy.run(p_wei_block_on_global, p_wei_block);
+                    blockwise_in_copy.Run(p_in_block_on_global, p_in_block);
+                    blockwise_wei_copy.Run(p_wei_block_on_global, p_wei_block);

                    __syncthreads();

-                    blockwise_gemm.run(p_wei_block, p_in_block, p_out_thread);
+                    blockwise_gemm.Run(p_wei_block, p_in_block, p_out_thread);

                    __syncthreads();
                }
@@ -253,19 +265,26 @@ struct GridwiseConvolutionImplicitGemm_v3_nchw_cyxk_nkhw
            // define tensor descriptor for threadwise copy
            //     output tensor (also, memory layout) descriptor in register, src of threadwise
            //     copy
-            constexpr auto out_k0_k1_k2_n1_b_n2_thread_mem_desc = make_ConstantTensorDescriptor(
-                Sequence<KPerBlock / (K1 * K2), 1, K2, N1, 1, 1, 1, N2>{});
+            constexpr auto out_k0_k1_k2_n1_b_n2_thread_mem_desc =
+                make_ConstantTensorDescriptor_default_rank_packed(
+                    Sequence<KPerBlock / (K1 * K2), 1, K2, N1, 1, N2>{});

            //     output memory layout descriptor in device memory
            constexpr auto out_n0_n1_n2_k0_k1_k2_h_w_global_mem_desc =
-                out_n_k_h_w_global.Fold(I1, Number<K1>{}, Number<K2>{})
+                out_n_k_h_w_global_desc.Fold(I1, Number<K1>{}, Number<K2>{})
                    .Fold(I0, Number<N1>{}, Number<N2>{});

            //     output merged tensor descriptor in device memory, dst of threadwise copy
            constexpr auto out_k0_k1_k2_n1_b_n2_global_merged_desc =
-                out_n0_n1_n2_k0_k1_k2_h_w_global_mem_desc
-                    .ReorderGivenNew2Old(Sequence<3, 4, 5, 1, 0, 6, 7, 2>{})
-                    .Merge(I4, I6);
+                make_ConstantMergedTensorDescriptor(
+                    out_n0_n1_n2_k0_k1_k2_h_w_global_mem_desc.ReorderGivenNew2Old(
+                        Sequence<3, 4, 5, 1, 0, 6, 7, 2>{}),
+                    Sequence<0>{},
+                    Sequence<1>{},
+                    Sequence<2>{},
+                    Sequence<3>{},
+                    Sequence<4, 5, 6>{},
+                    Sequence<7>{});

            // calculate origin of thread output tensor on global memory
            //     blockwise GEMM c matrix starting index
@@ -273,18 +292,30 @@ struct GridwiseConvolutionImplicitGemm_v3_nchw_cyxk_nkhw
                blockwise_gemm.GetBeginOfThreadMatrixC(get_thread_local_1d_id());

            //     origin of thread tensor on global
-            const index_t k_thread_data_on_global k_block_data_on_global +
-                c_thread_mtx_on_block.row;
+            const index_t k_thread_data_on_global =
+                k_block_data_on_global + c_thread_mtx_on_block.row;
+
            const index_t b_thread_data_on_global =
                b_block_data_on_global + c_thread_mtx_on_block.col;

-            //     output merged global tensor descriptor, for calculating origin of thread tensor
-            //     in global memory
+//     output merged global tensor descriptor, for calculating origin of thread tensor
+//     in global memory
+#if 0 // unfold a merged tensor is not implemented yet
            constexpr auto out_k_n1_b_n2_global_merged_desc =
-                out_k0_k1_k2_n1_b_n2_global_merged_desc.Unfold(I1, I2);
+                out_k0_k1_k2_n1_b_n2_global_merged_desc.Unfold(I0, I2);
+#else
+            constexpr auto out_k_n1_b_n2_global_merged_desc = make_ConstantMergedTensorDescriptor(
+                out_n0_n1_n2_k0_k1_k2_h_w_global_mem_desc
+                    .ReorderGivenNew2Old(Sequence<3, 4, 5, 1, 0, 6, 7, 2>{})
+                    .Unfold(I0, I2),
+                Sequence<0>{},
+                Sequence<1>{},
+                Sequence<2, 3, 4>{},
+                Sequence<5>{});
+#endif

            //     origin of thread tensor in global memory
-            const index_t p_out_thread_on_global =
+            Float* p_out_thread_on_global =
                p_out_global +
                out_k_n1_b_n2_global_merged_desc.GetOffsetFromMultiIndex(
                    k_thread_data_on_global, 0, 0, 0); // dst origin on merged global tensor
@@ -303,8 +334,8 @@ struct GridwiseConvolutionImplicitGemm_v3_nchw_cyxk_nkhw
                 0,
                 b_thread_data_on_global,
                 0}, // starting point of slice w.r.t. origin of dst
-                out_k0_k1_k2_n1_b_n2_thread_desc.GetLengths(), // slice lengths
-                Sequence<2, 3, 4, 0, 5, 1>{}                   // order of dimension access
+                out_k0_k1_k2_n1_b_n2_thread_mem_desc.GetLengths(), // slice lengths
+                Sequence<2, 3, 4, 0, 5, 1>{}                       // order of dimension access
                );
        }
    }
--- a/src/include/integral_constant.hip.hpp
+++ b/src/include/integral_constant.hip.hpp
@@ -8,5 +8,11 @@ struct integral_constant
    __host__ __device__ constexpr T Get() const { return value; }
 };

+template <class T, index_t X, index_t Y>
+__host__ __device__ constexpr auto operator+(integral_constant<T, X>, integral_constant<T, Y>)
+{
+    return integral_constant<T, X + Y>{};
+}
+
 template <index_t N>
 using Number = integral_constant<index_t, N>;
--- a/src/include/threadwise_gemm.hip.hpp
+++ b/src/include/threadwise_gemm.hip.hpp
@@ -10,7 +10,7 @@ __device__ void threadwise_matrix_set_zero(Matrix, Float* __restrict__ p_thread)
        for(index_t j = 0; j < Matrix::NCol(); ++j)
        {
            const index_t id = Matrix::GetOffsetFromMultiIndex(i, j);
-            p_thread[id]     = 0;
+            p_thread[id]     = Float(0);
        }
    }
 }
--- a/src/include/threadwise_tensor_slice_op.hip.hpp
+++ b/src/include/threadwise_tensor_slice_op.hip.hpp
@@ -19,7 +19,7 @@ __device__ void threadwise_tensor_slice_copy(SrcDesc,

    constexpr auto src_desc = SrcDesc{};
    constexpr auto dst_desc = DstDesc{};
-    constexpr auto ref_desc = make_packed_ConstantTensorDescriptor(SrcOpLengths{});
+    constexpr auto ref_desc = make_ConstantTensorDescriptor_default_rank_packed(SrcOpLengths{});

 #if 0
    if(get_thread_local_1d_id() == 0 && get_block_1d_id() == 0)
@@ -194,16 +194,19 @@ threadwise_tensor_slice_copy_reorder_given_dst2src_v3(SrcDesc,
 }

 template <class Float, class SrcDesc, class DstDesc, class SliceLengths, class DimAccessOrder>
-__device__ void
-threadwise_tensor_slice_copy_generic(SrcDesc,
-                                     const Float* __restrict__ p_src,
-                                     Array<index_t, SrcDesc::GetNumOfDimension()> src_multi_offset,
-                                     DstDesc,
-                                     Float* __restrict__ p_dst,
-                                     Array<index_t, DstDesc::GetNumOfDimension()> dst_multi_offset,
-                                     SliceLengths,
-                                     DimAccessOrder)
+__device__ void threadwise_tensor_slice_copy_generic(
+    SrcDesc,
+    const Float* __restrict__ p_src,
+    Array<index_t, SrcDesc::GetNumOfDimension()> src_multi_id_begin,
+    DstDesc,
+    Float* __restrict__ p_dst,
+    Array<index_t, DstDesc::GetNumOfDimension()> dst_multi_id_begin,
+    SliceLengths,
+    DimAccessOrder)
 {
+    static_assert(SrcDesc::GetNumOfDimension() == DstDesc::GetNumOfDimension(),
+                  "wrong! # of dimensions not the same");
+
    constexpr auto src_desc = SrcDesc{};
    constexpr auto dst_desc = DstDesc{};

@@ -215,9 +218,10 @@ threadwise_tensor_slice_copy_generic(SrcDesc,
            reorder_array_given_old2new(data_multi_id_in_access_order, DimAccessOrder{});

        const index_t dst_index =
-            dst_desc.GetOffsetFromMultiIndex(src_multi_offset + data_multi_id);
+            dst_desc.GetOffsetFromMultiIndex(src_multi_id_begin + data_multi_id);
+
        const index_t src_index =
-            src_desc.GetOffsetFromMultiIndex(dst_multi_offset + data_multi_id);
+            src_desc.GetOffsetFromMultiIndex(dst_multi_id_begin + data_multi_id);

        p_dst[dst_index] = p_src[src_index];
    });
--- a/src/include/vector_type.hip.hpp
+++ b/src/include/vector_type.hip.hpp
@@ -1,6 +1,6 @@
 #pragma once
 #include "config.h"
-#include "constant_integral.hip.hpp"
+#include "integral_constant.hip.hpp"

 template <class T, index_t N>
 struct vector_type