adding implicit gemm v4 (nchw, kcyx)
@@ -44,11 +44,7 @@ struct GeneratorTensor_3
     {
         std::array<index_t, sizeof...(Is)> dims = {{static_cast<index_t>(is)...}};
 
-#if 0
-        auto f_acc = std::plus<index_t>{};
-#else
         auto f_acc = [](auto a, auto b) { return 100 * a + b; };
-#endif
 
         return std::accumulate(dims.begin(), dims.end(), index_t(0), f_acc);
     }
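For reference, this is what the new accumulator computes: instead of summing the indices, it packs each one into two decimal digits, so the generated value encodes the element's multi-index. A standalone host-side sketch (assumption: `index_t` is a plain integer type):

```cpp
// Fold {1, 2, 3} with 100*a + b -> 10203: each index occupies two decimal
// digits, so a dumped tensor value reveals which element it came from.
#include <array>
#include <cstdio>
#include <numeric>

using index_t = int; // assumption: matches the library's index_t

int main()
{
    std::array<index_t, 3> dims = {{1, 2, 3}};
    auto f_acc = [](auto a, auto b) { return 100 * a + b; };
    index_t v  = std::accumulate(dims.begin(), dims.end(), index_t(0), f_acc);
    std::printf("%d\n", v); // prints 10203
    return 0;
}
```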
@@ -447,7 +443,7 @@ int main(int argc, char* argv[])
 
     constexpr index_t HPad = 0;
     constexpr index_t WPad = 0;
-#elif 1
+#elif 0
     // 3x3 filter, 28x28 image
     constexpr index_t N = 128;
     constexpr index_t C = 256;
@@ -543,7 +539,7 @@ int main(int argc, char* argv[])
 
     constexpr index_t HPad = 0;
     constexpr index_t WPad = 0;
-#elif 0
+#elif 1
     // 1x1 filter, 14x14 image
     constexpr index_t N = 128;
     constexpr index_t C = 512;
@@ -553,6 +549,18 @@ int main(int argc, char* argv[])
     constexpr index_t Y = 1;
     constexpr index_t X = 1;
 
     constexpr index_t HPad = 0;
     constexpr index_t WPad = 0;
+#elif 1
+    // 1x1 filter, 73x73 image
+    constexpr index_t N = 128;
+    constexpr index_t C = 64;
+    constexpr index_t HI = 73;
+    constexpr index_t WI = 73;
+    constexpr index_t K = 128;
+    constexpr index_t Y = 1;
+    constexpr index_t X = 1;
+
+    constexpr index_t HPad = 0;
+    constexpr index_t WPad = 0;
 #endif
@@ -609,8 +617,6 @@ int main(int argc, char* argv[])
     };
     wei_kcyx.GenerateTensorValue(gen_wei, num_thread);
 #endif
-
-    // out_nkhw_device.GenerateTensorValue(GeneratorTensor_1{}, num_thread);
 }
 
 #if 1
@@ -649,7 +655,7 @@ int main(int argc, char* argv[])
 
     if(do_verification)
     {
-#if 1
+#if 0
         if(Y == 3 && X == 3)
         {
            host_winograd_3x3_convolution(in_nchw, wei_kcyx, out_nkhw_host, lower_pads, upper_pads);
@@ -105,6 +105,7 @@ __host__ __device__ constexpr auto extract_array(const Array<TData, NSize>& old_
     return new_array;
 }
 
+// Array = Array + Array
 template <class TData, index_t NSize>
 __host__ __device__ constexpr auto operator+(Array<TData, NSize> a, Array<TData, NSize> b)
 {
@@ -119,6 +120,55 @@ __host__ __device__ constexpr auto operator+(Array<TData, NSize> a, Array<TData,
     return result;
 }
 
+// Array = Array - Array
+template <class TData, index_t NSize>
+__host__ __device__ constexpr auto operator-(Array<TData, NSize> a, Array<TData, NSize> b)
+{
+    Array<TData, NSize> result;
+
+    static_for<0, NSize, 1>{}([&](auto I) {
+        constexpr index_t i = I.Get();
+
+        result[i] = a[i] - b[i];
+    });
+
+    return result;
+}
+
+// Array = Array + Sequence
+template <class TData, index_t NSize, index_t... Is>
+__host__ __device__ constexpr auto operator+(Array<TData, NSize> a, Sequence<Is...> b)
+{
+    static_assert(sizeof...(Is) == NSize, "wrong! size not the same");
+
+    Array<TData, NSize> result;
+
+    static_for<0, NSize, 1>{}([&](auto I) {
+        constexpr index_t i = I.Get();
+
+        result[i] = a[i] + b.Get(I);
+    });
+
+    return result;
+}
+
+// Array = Array - Sequence
+template <class TData, index_t NSize, index_t... Is>
+__host__ __device__ constexpr auto operator-(Array<TData, NSize> a, Sequence<Is...> b)
+{
+    static_assert(sizeof...(Is) == NSize, "wrong! size not the same");
+
+    Array<TData, NSize> result;
+
+    static_for<0, NSize, 1>{}([&](auto I) {
+        constexpr index_t i = I.Get();
+
+        result[i] = a[i] - b.Get(I);
+    });
+
+    return result;
+}
+
 // Array = Array * Sequence
 template <class TData, index_t NSize, index_t... Is>
 __host__ __device__ constexpr auto operator*(Array<TData, NSize> a, Sequence<Is...> b)
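All of the operators added above share one pattern: a compile-time loop (`static_for`) over the fixed size, combining a runtime Array element-wise with a compile-time Sequence. A loose host-only sketch of that pattern; these minimal Sequence/Array stand-ins are not the library's real types:

```cpp
#include <cstdio>

template <int... Is>
struct Sequence // stand-in for the library's compile-time integer list
{
    static int Get(int i)
    {
        const int data[] = {Is...};
        return data[i];
    }
};

template <class T, int N>
struct Array // stand-in for the library's fixed-size value array
{
    T data[N];
};

// Array = Array + Sequence, with the same size check the real operator uses
template <class T, int N, int... Is>
Array<T, N> operator+(const Array<T, N>& a, Sequence<Is...>)
{
    static_assert(sizeof...(Is) == N, "wrong! size not the same");
    Array<T, N> r{};
    for(int i = 0; i < N; ++i) // the library unrolls this with static_for
        r.data[i] = a.data[i] + Sequence<Is...>::Get(i);
    return r;
}

int main()
{
    Array<int, 3> a{{1, 2, 3}};
    auto r = a + Sequence<10, 20, 30>{};
    std::printf("%d %d %d\n", r.data[0], r.data[1], r.data[2]); // 11 22 33
}
```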
@@ -136,15 +186,119 @@ __host__ __device__ constexpr auto operator*(Array<TData, NSize> a, Sequence<Is.
     return result;
 }
 
-template <class TData, index_t NSize, class F>
-__host__ __device__ constexpr TData reduce_on_array(Array<TData, NSize> a, F f)
-{
-    TData result = a[0];
-
-    static_for<1, NSize, 1>{}([&](auto I) {
-        constexpr index_t i = I.Get();
-
-        result = f(result, a[i]);
-    });
-
-    return result;
-}
+// Array = Sequence - Array
+template <class TData, index_t NSize, index_t... Is>
+__host__ __device__ constexpr auto operator-(Sequence<Is...> a, Array<TData, NSize> b)
+{
+    static_assert(sizeof...(Is) == NSize, "wrong! size not the same");
+
+    Array<TData, NSize> result;
+
+    static_for<0, NSize, 1>{}([&](auto I) {
+        constexpr index_t i = I.Get();
+
+        result[i] = a.Get(I) - b[i];
+    });
+
+    return result;
+}
+
+template <class TData, index_t NSize, class Reduce>
+__host__ __device__ constexpr TData
+accumulate_on_array(const Array<TData, NSize>& a, Reduce f, TData init)
+{
+    TData result = init;
+
+    static_assert(NSize > 0, "wrong");
+
+    static_for<0, NSize, 1>{}([&](auto I) {
+        constexpr index_t i = I.Get();
+        result = f(result, a[i]);
+    });
+
+    return result;
+}
+
+template <class T, index_t NSize>
+__host__ __device__ void print_Array(const char* s, Array<T, NSize> a)
+{
+    constexpr index_t nsize = a.GetSize();
+
+    static_assert(nsize > 0 && nsize <= 10, "wrong!");
+
+    static_if<nsize == 1>{}([&](auto) { printf("%s size %u, {%u}\n", s, nsize, a[0]); });
+
+    static_if<nsize == 2>{}([&](auto) { printf("%s size %u, {%u %u}\n", s, nsize, a[0], a[1]); });
+
+    static_if<nsize == 3>{}(
+        [&](auto) { printf("%s size %u, {%u %u %u}\n", s, nsize, a[0], a[1], a[2]); });
+
+    static_if<nsize == 4>{}(
+        [&](auto) { printf("%s size %u, {%u %u %u %u}\n", s, nsize, a[0], a[1], a[2], a[3]); });
+
+    static_if<nsize == 5>{}([&](auto) {
+        printf("%s size %u, {%u %u %u %u %u}\n", s, nsize, a[0], a[1], a[2], a[3], a[4]);
+    });
+
+    static_if<nsize == 6>{}([&](auto) {
+        printf("%s size %u, {%u %u %u %u %u %u}\n", s, nsize, a[0], a[1], a[2], a[3], a[4], a[5]);
+    });
+
+    static_if<nsize == 7>{}([&](auto) {
+        printf("%s size %u, {%u %u %u %u %u %u %u}\n",
+               s, nsize, a[0], a[1], a[2], a[3], a[4], a[5], a[6]);
+    });
+
+    static_if<nsize == 8>{}([&](auto) {
+        printf("%s size %u, {%u %u %u %u %u %u %u %u}\n",
+               s, nsize, a[0], a[1], a[2], a[3], a[4], a[5], a[6], a[7]);
+    });
+
+    static_if<nsize == 9>{}([&](auto) {
+        printf("%s size %u, {%u %u %u %u %u %u %u %u %u}\n",
+               s, nsize, a[0], a[1], a[2], a[3], a[4], a[5], a[6], a[7], a[8]);
+    });
+
+    static_if<nsize == 10>{}([&](auto) {
+        printf("%s size %u, {%u %u %u %u %u %u %u %u %u %u}\n",
+               s, nsize, a[0], a[1], a[2], a[3], a[4], a[5], a[6], a[7], a[8], a[9]);
+    });
+}
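`accumulate_on_array` replaces the old `reduce_on_array` with std::accumulate-style semantics: the caller supplies the initial value instead of the reduction silently starting from `a[0]`. A host-side sketch of the assumed contract:

```cpp
#include <cstdio>

// Same shape as accumulate_on_array, with a built-in array standing in for
// Array<TData, NSize>; the library unrolls this loop with static_for.
template <class T, int N, class Reduce>
T accumulate_on_array(const T (&a)[N], Reduce f, T init)
{
    T result = init;
    for(int i = 0; i < N; ++i)
        result = f(result, a[i]);
    return result;
}

struct plus_t // stands in for mod_conv::plus<index_t>
{
    int operator()(int a, int b) const { return a + b; }
};

int main()
{
    int partial_offsets[4] = {3, 5, 7, 9};
    std::printf("%d\n", accumulate_on_array(partial_offsets, plus_t{}, 0)); // 24
}
```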
@@ -99,15 +99,7 @@ struct ConstantMergedTensorDescriptor
         return original_multi_id;
     }
 
-#if 0 // not needed
-    __host__ __device__ static index_t
-    GetOffsetFromOriginalMultiIndex(Array<index_t, nOriginalDim> original_multi_id)
-    {
-        return OriginalTensorDesc::GetOffsetFromMultiIndex(original_multi_id);
-    }
-#endif
-
-    __host__ __device__ static index_t GetOffsetFromMultiIndexA(Array<index_t, nDim> multi_id)
+    __host__ __device__ static index_t GetOffsetFromMultiIndex(Array<index_t, nDim> multi_id)
     {
         const auto original_multi_id = GetOriginalMultiIndexFromMultiIndex(multi_id);
 
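In miniature, here is what a merged dimension does (illustrative numbers, not the class's API): an index along the merged dimension is first unpacked into the original per-dimension indices, and the offset then comes from the original strides:

```cpp
#include <cstdio>

int main()
{
    // merge original dims {H, W} of a packed tensor into one dimension B
    const int H = 4, W = 5;
    const int stride_h = 5, stride_w = 1; // packed strides of the original dims

    int b = 13;               // index along the merged dimension, in [0, H*W)
    int h = b / W, w = b % W; // GetOriginalMultiIndexFromMultiIndex, in spirit
    int offset = h * stride_h + w * stride_w; // then the usual stride dot-product
    std::printf("b=%d -> (h=%d, w=%d), offset=%d\n", b, h, w, offset); // (2, 3), 13
}
```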
@@ -126,38 +118,6 @@ struct ConstantMergedTensorDescriptor
 
         return dummy_desc.GetMultiIndexFrom1dIndex(id);
     }
 
-#if 0 // not needed
-    template <index_t IDim>
-    __host__ __device__ static index_t GetNewOriginalMultiIndexAfterMovingAlongOneDimension(
-        Array<index_t, nOriginalDim> old_original_multi_id, Number<IDim>, index_t step_size)
-    {
-        auto new_original_multi_id = old_original_multi_id;
-
-        // get partial-original-multi-id corresponding to this merged dimension
-        constexpr auto original_partial_dims = std::get<IDim>(mOriginalDimMergeSeqs);
-
-        constexpr auto original_partial_tensor_desc =
-            OriginalTensorDesc::Extract(original_partial_dims);
-
-        auto old_original_partial_multi_id =
-            extract_array(old_original_multi_id, original_partial_dims);
-
-        auto new_original_partial_multi_id =
-            original_partial_tensor_desc.GetNewMultiIndexGivenStepSizeOf1dIndex(
-                old_original_partial_multi_id, step_size);
-
-        // update original-multi-id
-        static_for<0, original_partial_dims.GetSize(), 1>{}([&](auto I_) {
-            constexpr auto I = decltype(I_){};
-            constexpr index_t idim_original = original_partial_dims.Get(I);
-
-            new_original_multi_id[idim_original] = new_original_partial_multi_id[I.Get()];
-        });
-
-        return new_original_multi_id;
-    }
-#endif
 };
 
 template <class OriginalTensorDesc, class... OriginalDimMergeSeqs>
@@ -4,7 +4,7 @@
 template <class Lengths>
 __host__ __device__ constexpr auto calculate_tensor_strides_default_rank_packed(Lengths)
 {
-    return reverse_inclusive_scan_sequence(Lengths{}.PopFront(), std::multiplies<index_t>{})
+    return reverse_inclusive_scan_sequence(Lengths{}.PopFront(), mod_conv::multiplies<index_t>{})
         .PushBack(Number<1>{});
 }
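"Default rank packed" strides are the reverse inclusive product scan of the trailing lengths, with a 1 appended for the innermost dimension. A worked host-side example (runtime loops standing in for the Sequence machinery):

```cpp
#include <cstdio>

int main()
{
    const int lengths[4] = {128, 256, 14, 14}; // e.g. N, C, H, W
    int strides[4];
    strides[3] = 1;             // the PushBack(Number<1>{})
    for(int i = 2; i >= 0; --i) // the reverse inclusive scan with multiplies
        strides[i] = strides[i + 1] * lengths[i + 1];
    std::printf("%d %d %d %d\n", strides[0], strides[1], strides[2], strides[3]);
    // 50176 196 14 1
}
```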
@@ -95,7 +95,7 @@ struct ConstantTensorDescriptor
 
     __host__ __device__ static constexpr index_t GetElementSize()
     {
-        return accumulate_on_sequence(Lengths{}, std::multiplies<index_t>{}, Number<1>{});
+        return accumulate_on_sequence(Lengths{}, mod_conv::multiplies<index_t>{}, Number<1>{});
     }
 
     // WRONG! ReorderGivenOld2New is broken
@@ -107,10 +107,10 @@ struct ConstantTensorDescriptor
     constexpr auto strides_in_rank = GetStrides().ReorderGivenOld2new(MemoryRank{});
 
     constexpr index_t element_space_unaligned = accumulate_on_sequence(
-        (lengths_in_rank - Number<1>{}) * strides_in_rank, std::plus<index_t>{}, Number<1>{});
+        (lengths_in_rank - Number<1>{}) * strides_in_rank, mod_conv::plus<index_t>{}, Number<1>{});
 #else // WRONG! align should be applied to the last memory rank, not the last tensor dimension
     constexpr index_t element_space_unaligned = accumulate_on_sequence(
-        (GetLengths() - Number<1>{}) * GetStrides(), std::plus<index_t>{}, Number<1>{});
+        (GetLengths() - Number<1>{}) * GetStrides(), mod_conv::plus<index_t>{}, Number<1>{});
 #endif
 
     return align.Get() * ((element_space_unaligned + align.Get() - 1) / align.Get());
@@ -144,7 +144,8 @@ struct ConstantTensorDescriptor
 
     constexpr auto multi_id = Sequence<Is...>{};
 
-    return accumulate_on_sequence(multi_id * GetStrides(), std::plus<index_t>{}, Number<0>{});
+    return accumulate_on_sequence(
+        multi_id * GetStrides(), mod_conv::plus<index_t>{}, Number<0>{});
 }
 
 #if 0 // ReorderGivenOld2new is broken
@@ -197,32 +198,70 @@ struct ConstantTensorDescriptor
     // This function doesn't do carry check on the highest dimension, for performance reason.
     // It is the user's responsibility to make sure the result "new_multi_id" is not out-of-bound
     // on the highest dimension
+    template <bool PositiveDirection>
     __host__ __device__ static Array<index_t, nDim>
     UpdateMultiIndexGivenStepSizeOf1dIndex(Array<index_t, nDim> old_multi_id,
-                                           index_t step_size_of_1d_index)
+                                           index_t step_size_of_1d_index,
+                                           integral_constant<bool, PositiveDirection>)
     {
-        auto new_multi_id = old_multi_id + GetMultiIndexFrom1dIndex(step_size_of_1d_index);
-
-        bool carry = false;
-
-        // do carry check in reversed order, starting from lowest dimension
-        // don't check the highest dimension
-        static_for<0, nDim - 1, 1>{}([&](auto IDimReverse) {
-            constexpr index_t idim = nDim - 1 - IDimReverse.Get();
-            constexpr auto IDim = Number<idim>{};
-
-            if(carry)
-            {
-                ++new_multi_id[idim];
-            }
-
-            carry = false;
-
-            if(new_multi_id[idim] >= GetLength(IDim))
-            {
-                new_multi_id[idim] -= GetLength(IDim);
-                carry = true;
-            }
-        });
+        Array<index_t, nDim> new_multi_id;
+
+        const auto step_sizes = GetMultiIndexFrom1dIndex(step_size_of_1d_index);
+
+        static_if<PositiveDirection>{}([&](auto) {
+            new_multi_id = old_multi_id + step_sizes;
+
+            bool carry = false;
+
+            // do carry check in reversed order, starting from lowest dimension
+            // don't check the highest dimension
+            static_for<0, nDim - 1, 1>{}([&](auto IDimReverse) {
+                constexpr index_t idim = nDim - 1 - IDimReverse.Get();
+                constexpr auto IDim = Number<idim>{};
+
+                if(carry)
+                {
+                    ++new_multi_id[idim];
+                }
+
+                carry = false;
+
+                if(new_multi_id[idim] >= GetLength(IDim))
+                {
+                    new_multi_id[idim] -= GetLength(IDim);
+                    carry = true;
+                }
+            });
+        }).Else([&](auto) {
+            // shift up multi-id to avoid unsigned integer underflow during intermediate
+            // calculations. After the shift, should have new_multi_id[...] >= 1
+            new_multi_id = old_multi_id + (GetLengths() - step_sizes);
+
+            bool borrow = false;
+
+            // do borrow check in reversed order, starting from lowest dimension
+            // don't check the highest dimension
+            static_for<0, nDim - 1, 1>{}([&](auto IDimReverse) {
+                constexpr index_t idim = nDim - 1 - IDimReverse.Get();
+                constexpr auto IDim = Number<idim>{};
+
+                if(borrow)
+                {
+                    --new_multi_id[idim];
+                }
+
+                borrow = false;
+
+                if(new_multi_id[idim] < GetLength(IDim))
+                {
+                    new_multi_id[idim] += GetLength(IDim);
+                    borrow = true;
+                }
+            });
+
+            // shift back down multi-id
+            // here, should have new_multi_id[...] >= GetLengths()
+            new_multi_id = new_multi_id - GetLengths();
+        });
 
         return new_multi_id;
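The carry logic above is mixed-radix addition over the tensor lengths, and the new `.Else` branch is its mirror image: shift up by `GetLengths() - step_sizes` (this is what the new `operator-(Sequence, Array)` is for), propagate borrows, then shift back down. A host-side sketch of the positive direction, with runtime loops standing in for `static_for`:

```cpp
#include <cstdio>

int main()
{
    const int lengths[3] = {2, 3, 4};
    int multi_id[3]      = {0, 2, 3}; // 1d index 11
    const int step[3]    = {0, 0, 1}; // decomposed step_size_of_1d_index = 1

    bool carry = false;
    for(int idim = 2; idim >= 1; --idim) // lowest dim first; skip the highest
    {
        multi_id[idim] += step[idim];
        if(carry)
            ++multi_id[idim];
        carry = false;
        if(multi_id[idim] >= lengths[idim])
        {
            multi_id[idim] -= lengths[idim];
            carry = true;
        }
    }
    // highest dimension: add without a bound check, as the kernel does
    multi_id[0] += step[0] + (carry ? 1 : 0);

    std::printf("%d %d %d\n", multi_id[0], multi_id[1], multi_id[2]); // 1 0 0
}
```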
@@ -255,7 +294,7 @@ struct ConstantTensorDescriptor
     }
 
     template <class... Ts>
-    __host__ __device__ static constexpr auto Inject(ConstantTensorDescriptor<Ts...>)
+    __host__ __device__ static constexpr auto Embed(ConstantTensorDescriptor<Ts...>)
     {
         using leaf_tensor = ConstantTensorDescriptor<Ts...>;
 
@@ -290,7 +329,7 @@ struct ConstantTensorDescriptor
     constexpr auto fold_intervals = Sequence<FoldIntervals...>{};
 
     constexpr index_t fold_intervals_product =
-        accumulate_on_sequence(fold_intervals, std::multiplies<index_t>{}, Number<1>{});
+        accumulate_on_sequence(fold_intervals, mod_conv::multiplies<index_t>{}, Number<1>{});
 
     constexpr auto unfold_length = GetLength(Number<IDim>{});
     constexpr auto unfold_stride = GetStride(Number<IDim>{});
@@ -309,7 +348,7 @@ struct ConstantTensorDescriptor
     constexpr auto fold_strides =
         Number<unfold_stride>{} *
         reverse_inclusive_scan_sequence(fold_intervals.PushBack(Number<1>{}),
-                                        std::multiplies<index_t>{});
+                                        mod_conv::multiplies<index_t>{});
 
     // folded_ranks
     constexpr auto fold_ranks =
@@ -389,7 +428,7 @@ struct ConstantTensorDescriptor
 
     // unfolded length, stride and rank
     constexpr index_t unfold_length = accumulate_on_sequence(
-        GetLengths().Extract(middle), std::multiplies<index_t>{}, Number<1>{});
+        GetLengths().Extract(middle), mod_conv::multiplies<index_t>{}, Number<1>{});
 
     constexpr index_t unfold_stride = GetStride(Number<LastUnfoldDim>{});
 
@@ -472,7 +511,20 @@ __host__ __device__ void print_ConstantTensorDescriptor(TDesc, const char* s)
 {
     constexpr index_t ndim = TDesc::GetNumOfDimension();
 
-    static_assert(ndim >= 2 && ndim <= 10, "wrong!");
+    static_assert(ndim >= 1 && ndim <= 10, "wrong!");
+
+    static_if<ndim == 1>{}([&](auto fwd) {
+        constexpr auto I0 = Number<0>{};
+
+        constexpr auto desc = fwd(TDesc{});
+
+        printf("%s dim %u, lengths {%u}, strides {%u}, ranks {%u}\n",
+               s,
+               desc.GetNumOfDimension(),
+               desc.GetLength(I0),
+               desc.GetStride(I0),
+               desc.GetMemoryRank(I0));
+    });
 
     static_if<ndim == 2>{}([&](auto fwd) {
         constexpr auto I0 = Number<0>{};
 
@@ -495,3 +495,39 @@ __host__ __device__ constexpr auto Sequence<Is...>::Modify(Number<I>, Number<X>)
 
     return seq_left.PushBack(Number<X>{}).Append(seq_right);
 }
+
+template <index_t... Xs>
+__host__ __device__ void print_Sequence(const char* s, Sequence<Xs...>)
+{
+    constexpr index_t nsize = Sequence<Xs...>::GetSize();
+
+    static_assert(nsize <= 10, "wrong!");
+
+    static_if<nsize == 0>{}([&](auto) { printf("%s size %u, {}\n", s, nsize, Xs...); });
+
+    static_if<nsize == 1>{}([&](auto) { printf("%s size %u, {%u}\n", s, nsize, Xs...); });
+
+    static_if<nsize == 2>{}([&](auto) { printf("%s size %u, {%u %u}\n", s, nsize, Xs...); });
+
+    static_if<nsize == 3>{}([&](auto) { printf("%s size %u, {%u %u %u}\n", s, nsize, Xs...); });
+
+    static_if<nsize == 4>{}([&](auto) { printf("%s size %u, {%u %u %u %u}\n", s, nsize, Xs...); });
+
+    static_if<nsize == 5>{}(
+        [&](auto) { printf("%s size %u, {%u %u %u %u %u}\n", s, nsize, Xs...); });
+
+    static_if<nsize == 6>{}(
+        [&](auto) { printf("%s size %u, {%u %u %u %u %u %u}\n", s, nsize, Xs...); });
+
+    static_if<nsize == 7>{}(
+        [&](auto) { printf("%s size %u, {%u %u %u %u %u %u %u}\n", s, nsize, Xs...); });
+
+    static_if<nsize == 8>{}(
+        [&](auto) { printf("%s size %u, {%u %u %u %u %u %u %u %u}\n", s, nsize, Xs...); });
+
+    static_if<nsize == 9>{}(
+        [&](auto) { printf("%s size %u, {%u %u %u %u %u %u %u %u %u}\n", s, nsize, Xs...); });
+
+    static_if<nsize == 10>{}(
+        [&](auto) { printf("%s size %u, {%u %u %u %u %u %u %u %u %u %u}\n", s, nsize, Xs...); });
+}
@@ -158,7 +158,7 @@ struct Blockwise3dTensorCopy3
                   "wrong! BlockSize is not big enough for ThreadPerDims!");
 
     constexpr index_t num_active_thread =
-        accumulate_on_sequence(ThreadPerDims{}, std::multiplies<index_t>{}, Number<1>{});
+        accumulate_on_sequence(ThreadPerDims{}, mod_conv::multiplies<index_t>{}, Number<1>{});
 
     if(BlockSize > num_active_thread)
     {
@@ -500,7 +500,7 @@ struct Blockwise4dTensorCopy3
                   "wrong! BlockSize is not big enough for ThreadPerDims!");
 
     constexpr index_t num_active_thread =
-        accumulate_on_sequence(ThreadPerDims{}, std::multiplies<index_t>{}, Number<1>{});
+        accumulate_on_sequence(ThreadPerDims{}, mod_conv::multiplies<index_t>{}, Number<1>{});
 
     if(BlockSize > num_active_thread)
    {
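For context, the arithmetic behind the num_active_thread check in both hunks (the ThreadPerDims values here are made up): the product of the per-dimension thread counts is the number of threads the copy actually uses, and any surplus threads in the block must early-out.

```cpp
#include <cstdio>

int main()
{
    const int thread_per_dims[3] = {2, 4, 8}; // illustrative only
    int num_active_thread = 1;
    for(int d : thread_per_dims) // accumulate_on_sequence with multiplies
        num_active_thread *= d;

    const int BlockSize = 128;
    if(BlockSize > num_active_thread)
        std::printf("%d threads copy, %d idle\n",
                    num_active_thread, BlockSize - num_active_thread); // 64, 64
}
```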
@@ -3,6 +3,7 @@
 
 // slice a (normal or merged) tensor, and copy it into another (normal or merged) tensor
 // memory layout (ordering of dimensions) can be different between src and dst
+// For now, only support SubLengths == 1 on a merged dimension
 template <index_t BlockSize,
           class Float,
           class SrcDesc,
@@ -47,7 +48,7 @@ struct BlockwiseGenericTensorSliceCopy_v1
     BlockwiseGenericTensorSliceCopy_v1(Array<index_t, nDim> src_block_data_multi_id_begin,
                                        Array<index_t, nDim> dst_block_data_multi_id_begin)
     {
-        // check NDim consistent
+        // check NDim consistency
         static_assert(nDim == SrcDesc::GetNumOfDimension() &&
                       nDim == DstDesc::GetNumOfDimension() && nDim == SliceLengths::GetSize() &&
                       nDim == SubLengths::GetSize() && nDim == DataClusterLengths::GetSize() &&
@@ -55,7 +56,7 @@ struct BlockwiseGenericTensorSliceCopy_v1
                       nDim == SrcAccessOrder::GetSize() && nDim == DstAccessOrder::GetSize(),
                       "wrong");
 
-        // check
+        // check thread arrange order and read/write access order are valid
         static_assert(is_valid_sequence_map<ThreadClusterArrangeOrder>::value &&
                       is_valid_sequence_map<SrcAccessOrder>::value &&
                       is_valid_sequence_map<DstAccessOrder>::value,
@@ -140,10 +141,14 @@ struct BlockwiseGenericTensorSliceCopy_v1
         });
 
         // complete offset
-        mThreadSrcOffset = reduce_on_array(mThreadSrcPartialOffsets, std::plus<index_t>{});
-        mThreadDstOffset = reduce_on_array(mThreadDstPartialOffsets, std::plus<index_t>{});
+        mThreadSrcOffset = accumulate_on_array(
+            mThreadSrcPartialOffsets, mod_conv::plus<index_t>{}, static_cast<index_t>(0));
+
+        mThreadDstOffset = accumulate_on_array(
+            mThreadDstPartialOffsets, mod_conv::plus<index_t>{}, static_cast<index_t>(0));
 
 #if 0
         if(get_block_1d_id() == 0)
         {
             printf("id %5u %5u: "
                    "src_block_data_multi_id_begin: %u %u %u %u, "
@@ -279,13 +284,9 @@ struct BlockwiseGenericTensorSliceCopy_v1
     // the boundary of the tensor being sliced. This function doesn't do runtime sanity
     // check on out-of-bound slicing window, for performance reason
     template <index_t IDim_, index_t StepSize, bool PositiveDirection>
-    __device__ void MoveSlicingWindowOnSourceTensor(Number<IDim_>,
-                                                    Number<StepSize>,
-                                                    integral_constant<bool, PositiveDirection>)
+    __device__ void MoveSlicingWindowOnSourceTensor(
+        Number<IDim_>, Number<StepSize>, integral_constant<bool, PositiveDirection> direction)
     {
-        static_assert(PositiveDirection,
-                      "wrong! only support movement in positive direction for now");
-
         constexpr auto IDim = Number<IDim_>{};
         constexpr index_t idim = IDim.Get();
 
@@ -306,7 +307,7 @@ struct BlockwiseGenericTensorSliceCopy_v1
 
             auto new_src_partial_original_multi_id =
                 src_partial_original_desc.UpdateMultiIndexGivenStepSizeOf1dIndex(
-                    old_src_partial_original_multi_id, StepSize);
+                    old_src_partial_original_multi_id, StepSize, direction);
 
             // update "mThreadSrcOriginalMultiId"
             static_for<0, src_partial_original_dims.GetSize(), 1>{}([&](auto I_) {
@@ -328,7 +329,7 @@ struct BlockwiseGenericTensorSliceCopy_v1
             mThreadSrcPartialOffsets[idim] = new_src_partial_offset;
 
             // update "mThreadSrcOffset", do "+" before "-" to avoid underflow
-            mThreadSrcOffset = mThreadSrcOffset + new_src_partial_offset - old_src_partial_offset;
+            mThreadSrcOffset = (mThreadSrcOffset + new_src_partial_offset) - old_src_partial_offset;
         }).Else([&](auto fwd) {
             // Logic for non-merged dimension. If you are never going to move the slicing window on
             // a merged dimension, then "mThreadSrcOriginalMultiId" and "mThreadSrcPartialOffsets",
@@ -336,13 +337,25 @@ struct BlockwiseGenericTensorSliceCopy_v1
             // should be able to remove these calculations.
             // TODO: make sure compiler would actually remove them in this case.
 
+            // It is the user's responsibility to make sure the slicing window will not be moved
+            // out of the boundary of the tensor being sliced. Otherwise, there might be hazards
+            // like unsigned integer underflow. That is, NO runtime sanity check is done to
+            // prevent the hazard.
+
             constexpr index_t idim_original = SrcDesc::GetContainedOriginalDimensions(IDim).Front();
 
-            mThreadSrcOffset += StepSize * SrcDesc::GetStride(IDim);
-
-            mThreadSrcOriginalMultiId[idim_original] += StepSize;
-
-            mThreadSrcPartialOffsets[idim] += StepSize * SrcDesc::GetStride(IDim);
+            static_if<PositiveDirection>{}([&](auto) {
+                mThreadSrcOffset += StepSize * SrcDesc::GetStride(IDim);
+
+                mThreadSrcOriginalMultiId[idim_original] += StepSize;
+
+                mThreadSrcPartialOffsets[idim] += StepSize * SrcDesc::GetStride(IDim);
+            }).Else([&](auto) {
+                mThreadSrcOffset -= StepSize * SrcDesc::GetStride(IDim);
+
+                mThreadSrcOriginalMultiId[idim_original] -= StepSize;
+
+                mThreadSrcPartialOffsets[idim] -= StepSize * SrcDesc::GetStride(IDim);
+            });
         });
     }
 };
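In the non-merged branch, moving the slicing window is pure offset bookkeeping: the flat offset (and the per-dimension records) shift by StepSize * stride, with the sign chosen by the direction flag. A sketch in plain C++ (the names paraphrase the members involved; they are not the class's API):

```cpp
#include <cstdio>

int main()
{
    int thread_src_offset = 1000; // flat offset into the source tensor
    const int stride      = 14;   // stride of the dimension being moved
    const int step        = 8;

    bool positive_direction = true;
    thread_src_offset += positive_direction ? step * stride : -step * stride; // 1112

    positive_direction = false;
    thread_src_offset += positive_direction ? step * stride : -step * stride; // 1000 again

    std::printf("%d\n", thread_src_offset);
}
```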
@@ -39,6 +39,18 @@ struct scales
     __host__ __device__ constexpr T operator()(T a) const { return s * a; }
 };
 
+template <class T>
+struct plus
+{
+    __host__ __device__ constexpr T operator()(T a, T b) const { return a + b; }
+};
+
+template <class T>
+struct multiplies
+{
+    __host__ __device__ constexpr T operator()(T a, T b) const { return a * b; }
+};
+
 template <class T>
 struct integer_divide_ceiler
 {
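These functors exist because std::plus<T>::operator() carries no __host__ __device__ attribute, so HIP/CUDA device code cannot call it; the mod_conv versions are annotated and constexpr, usable on both sides. A plain-C++ sketch of the same shape (attributes omitted here so it builds anywhere):

```cpp
namespace mod_conv_sketch { // hypothetical namespace, just for this sketch

template <class T>
struct plus
{
    constexpr T operator()(T a, T b) const { return a + b; }
};

template <class T>
struct multiplies
{
    constexpr T operator()(T a, T b) const { return a * b; }
};

} // namespace mod_conv_sketch

// usable in constant expressions, which std::plus only guarantees since C++14
static_assert(mod_conv_sketch::plus<int>{}(2, 3) == 5, "");
static_assert(mod_conv_sketch::multiplies<int>{}(4, 5) == 20, "");

int main() { return 0; }
```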
@@ -58,6 +58,9 @@ struct GridwiseConvolutionImplicitGemm_v3_nchw_cyxk_nkhw
     constexpr auto I6 = Number<6>{};
     constexpr auto I7 = Number<7>{};
 
+    constexpr auto True  = integral_constant<bool, true>{};
+    constexpr auto False = integral_constant<bool, false>{};
+
     constexpr auto in_n_c_h_w_global_desc  = InGlobalDesc{};
     constexpr auto wei_c_y_x_k_global_desc = WeiGlobalDesc{};
     constexpr auto out_n_k_h_w_global_desc = OutGlobalDesc{};
@@ -123,7 +126,7 @@ struct GridwiseConvolutionImplicitGemm_v3_nchw_cyxk_nkhw
     // input blockwise copy
     // slice a merged tensor, reorder and copy to a normal tensor
     // this copy operator already has blockwise offset built-in
-    const auto blockwise_in_copy = BlockwiseGenericTensorSliceCopy_v1<
+    auto blockwise_in_copy = BlockwiseGenericTensorSliceCopy_v1<
         BlockSize,
         Float,
         decltype(in_c_n1_b_n2_global_merged_desc),
@@ -150,20 +153,20 @@ struct GridwiseConvolutionImplicitGemm_v3_nchw_cyxk_nkhw
     // operator for blockwise copy of weight into LDS
     // slice a tensor, and copy it into another tensor
     // this copy operator already has blockwise offset built-in
-    const auto blockwise_wei_copy =
-#if 0
+    auto blockwise_wei_copy =
+#if 1
         BlockwiseGenericTensorSliceCopy_v1<BlockSize,
                                            Float,
                                            decltype(wei_c_k_global_desc),
                                            decltype(wei_c_k_block_desc),
                                            decltype(wei_c_k_block_desc.GetLengths()),
                                            WeiBlockCopySubLengths_C_K,
                                            WeiBlockCopyClusterLengths_C_K,
                                            Sequence<0, 1>, // thread_arrange_order [C, K]
                                            Sequence<0, 1>, // src_access_order [C, K]
                                            Sequence<0, 1>, // dst_access_order [C, K]
                                            WeiBlockCopyDataPerAccess_K,
                                            WeiBlockCopyDataPerAccess_K>(
             {0, k_block_data_on_global}, {0, 0});
 #else
         Blockwise2dTensorCopy3<BlockSize,
@@ -175,16 +178,14 @@ struct GridwiseConvolutionImplicitGemm_v3_nchw_cyxk_nkhw
             {0, 0});
 #endif
 
     // GEMM definition
     // c_mtx += transpose(a_mtx) * b_mtx
     // a_mtx[CPerBlock, KPerBlock] is in LDS
     // b_mtx[CPerBlock, N1 * BPerBlock * N2] is in LDS
     // c_mtx[KPerBlock, N1 * BPerBlock * N2] is distributed among threads, and saved in
     // register
-    constexpr auto a_c_k_block_mtx_desc =
-        make_ConstantMatrixDescriptor(Number<CPerBlock>{},
-                                      Number<KPerBlock>{},
-                                      Number<wei_c_k_block_desc.GetStride(I0)>{});
+    constexpr auto a_c_k_block_mtx_desc = make_ConstantMatrixDescriptor(
+        Number<CPerBlock>{}, Number<KPerBlock>{}, Number<wei_c_k_block_desc.GetStride(I0)>{});
 
     constexpr auto b_c_n1bn2_block_mtx_desc =
         make_ConstantMatrixDescriptor(Number<CPerBlock>{},
@@ -239,6 +240,7 @@ struct GridwiseConvolutionImplicitGemm_v3_nchw_cyxk_nkhw
     // zero out threadwise output
     threadwise_matrix_set_zero(c_k0k2_n1n2_thread_mtx_desc, p_out_thread);
 
+#if 0
     // do work
     for(index_t y = 0; y < Y; ++y)
     {
@@ -269,6 +271,45 @@ struct GridwiseConvolutionImplicitGemm_v3_nchw_cyxk_nkhw
             }
         }
     }
+#else
+    for(index_t y = 0; y < Y; ++y)
+    {
+        for(index_t x = 0; x < X; ++x)
+        {
+            // calculate origin of block input and weight tensor on global memory
+            const Float* p_in_block_on_global =
+                p_in_global + in_n_c_h_w_global_desc.GetOffsetFromMultiIndex(0, 0, y, x);
+
+            const Float* p_wei_block_on_global =
+                p_wei_global + wei_c_y_x_k_global_desc.GetOffsetFromMultiIndex(0, y, x, 0);
+
+            for(index_t c_block_data_on_global = 0; c_block_data_on_global < C;
+                c_block_data_on_global += CPerBlock)
+            {
+                blockwise_in_copy.Run(p_in_block_on_global, p_in_block);
+                blockwise_wei_copy.Run(p_wei_block_on_global, p_wei_block);
+
+                __syncthreads();
+
+                blockwise_gemm.Run(p_wei_block, p_in_block, p_out_thread);
+
+                __syncthreads();
+
+                // move on C: C_N1_B_N2, C_K
+                blockwise_in_copy.MoveSlicingWindowOnSourceTensor(
+                    I0, Number<CPerBlock>{}, True);
+
+                blockwise_wei_copy.MoveSlicingWindowOnSourceTensor(
+                    I0, Number<CPerBlock>{}, True);
+            }
+
+            // reset C
+            blockwise_in_copy.MoveSlicingWindowOnSourceTensor(I0, Number<C>{}, False);
+
+            blockwise_wei_copy.MoveSlicingWindowOnSourceTensor(I0, Number<C>{}, False);
+        }
+    }
+#endif
 
     // copy output: register to global memory
     {
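The #else branch above gives the v3 kernel its main-loop shape: for each filter tap (y, x), walk C in CPerBlock chunks around the blockwise GEMM, then rewind the slicing window by C before the next tap. A host-side skeleton of that control flow (all names are placeholders, not the kernel's API):

```cpp
// Skeleton only: the commented calls mark where the kernel does LDS copies,
// barriers, the blockwise GEMM, and the slicing-window moves.
void v3_main_loop(int Y, int X, int C, int CPerBlock)
{
    for(int y = 0; y < Y; ++y)
    {
        for(int x = 0; x < X; ++x)
        {
            for(int c = 0; c < C; c += CPerBlock)
            {
                // copy_input();  copy_weight();   // global -> LDS
                // barrier();  gemm();  barrier();
                // move_window(+CPerBlock);        // advance along C (True)
            }
            // move_window(-C);                    // reset C (False)
        }
    }
}

int main() { v3_main_loop(3, 3, 256, 8); }
```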
@@ -59,7 +59,7 @@ struct GridwiseConvolutionImplicitGemm_v4_nchw_kcyx_nkhw
     constexpr auto I6 = Number<6>{};
     constexpr auto I7 = Number<7>{};
 
-    constexpr auto TRUE = integral_constant<bool, true>{};
+    constexpr auto True = integral_constant<bool, true>{};
 
     constexpr auto in_n_c_h_w_global_desc  = InGlobalDesc{};
     constexpr auto wei_k_c_y_x_global_desc = WeiGlobalDesc{};
@@ -102,9 +102,9 @@ struct GridwiseConvolutionImplicitGemm_v4_nchw_kcyx_nkhw
     const index_t b_block_data_on_global = block_work_multi_id[1] * BPerBlock;
 
     // input tensor
-    // tensor descriptor in device memory [N0, N1, N2, H, W]
-    constexpr auto in_n0_n1_n2_h_w_global_desc = in_n_c_h_w_global_desc.Slice(I2, Number<Hi>{})
-                                                     .Slice(I3, Number<Wi>{})
+    // tensor descriptor in device memory [N0, N1, N2, Ho, Wo]
+    constexpr auto in_n0_n1_n2_h_w_global_desc = in_n_c_h_w_global_desc.Slice(I2, Number<Ho>{})
+                                                     .Slice(I3, Number<Wo>{})
                                                      .Fold(I0, Number<N1>{}, Number<N2>{})
                                                      .Extract(Sequence<0, 1, 2, 4, 5>{});
 
@@ -115,12 +115,23 @@ struct GridwiseConvolutionImplicitGemm_v4_nchw_kcyx_nkhw
 
     // merged tensor descriptor in device memory [E, N1, B, N2], src of blockwise copy
     constexpr auto in_e_n1_b_n2_global_merged_desc = make_ConstantMergedTensorDescriptor(
-        in_c_y_x_global_desc.Inject(in_n0_n1_n2_h_w_global_desc),
+        in_c_y_x_global_desc.Embed(in_n0_n1_n2_h_w_global_desc),
         Sequence<0, 1, 2>{},
         Sequence<4>{},
         Sequence<3, 6, 7>{},
         Sequence<5>{});
 
+#if 0
+    if(get_block_1d_id() == 0 && get_thread_local_1d_id() == 0)
+    {
+        print_ConstantTensorDescriptor(in_n0_n1_n2_h_w_global_desc,
+                                       "in_n0_n1_n2_h_w_global_desc: ");
+        print_ConstantTensorDescriptor(in_c_y_x_global_desc, "in_c_y_x_global_desc: ");
+        print_ConstantMergedTensorDescriptor(in_e_n1_b_n2_global_merged_desc,
+                                             "in_e_n1_b_n2_global_merged_desc: ");
+    }
+#endif
+
     // memory layout descriptor in LDS [E, N1, B, N2], dst of blockwise copy
     // be careful of LDS alignment
     constexpr auto in_e_n1_b_n2_block_desc = make_ConstantTensorDescriptor_default_rank_aligned(
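The merged [E, N1, B, N2] view is what makes the GEMM "implicit": E merges (C, Y, X) into the GEMM's reduction dimension, so a flat index e unpacks mixed-radix style into a channel and a filter tap. An illustrative sketch (sizes made up, not taken from the kernel):

```cpp
#include <cstdio>

int main()
{
    const int C = 64, Y = 3, X = 3; // E = C * Y * X = 576
    for(int e = 0; e < C * Y * X; e += 100)
    {
        int c = e / (Y * X); // slowest: channel
        int y = (e / X) % Y; // then filter row
        int x = e % X;       // fastest: filter column
        std::printf("e=%3d -> c=%2d y=%d x=%d\n", e, c, y, x);
    }
}
```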
@@ -243,6 +254,31 @@ struct GridwiseConvolutionImplicitGemm_v4_nchw_kcyx_nkhw
     // do work
     for(index_t e = 0; e < E; e += EPerBlock)
     {
+#if 0
+        if(e == 1 * EPerBlock && get_block_1d_id() == 0)
+        {
+            printf("id %5u %5u: "
+                   "mThreadSrcOriginalMultiId %u %u %u %u %u %u %u %u, "
+                   "mThreadSrcPartialOffsets %u %u %u %u, "
+                   "mThreadSrcOffset %u, mThreadDstOffset %u \n",
+                   get_block_1d_id(),
+                   get_thread_local_1d_id(),
+                   blockwise_in_copy.mThreadSrcOriginalMultiId[0],
+                   blockwise_in_copy.mThreadSrcOriginalMultiId[1],
+                   blockwise_in_copy.mThreadSrcOriginalMultiId[2],
+                   blockwise_in_copy.mThreadSrcOriginalMultiId[3],
+                   blockwise_in_copy.mThreadSrcOriginalMultiId[4],
+                   blockwise_in_copy.mThreadSrcOriginalMultiId[5],
+                   blockwise_in_copy.mThreadSrcOriginalMultiId[6],
+                   blockwise_in_copy.mThreadSrcOriginalMultiId[7],
+                   blockwise_in_copy.mThreadSrcPartialOffsets[0],
+                   blockwise_in_copy.mThreadSrcPartialOffsets[1],
+                   blockwise_in_copy.mThreadSrcPartialOffsets[2],
+                   blockwise_in_copy.mThreadSrcPartialOffsets[3],
+                   blockwise_in_copy.mThreadSrcOffset,
+                   blockwise_in_copy.mThreadDstOffset);
+        }
+#endif
         // march the slicing window
         blockwise_in_copy.Run(p_in_global, p_in_block);
         blockwise_wei_copy.Run(p_wei_global, p_wei_block);
@@ -253,8 +289,8 @@ struct GridwiseConvolutionImplicitGemm_v4_nchw_kcyx_nkhw
 
         __syncthreads();
 
-        blockwise_in_copy.MoveSlicingWindowOnSourceTensor(I0, Number<EPerBlock>{}, TRUE);
-        blockwise_wei_copy.MoveSlicingWindowOnSourceTensor(I0, Number<EPerBlock>{}, TRUE);
+        blockwise_in_copy.MoveSlicingWindowOnSourceTensor(I0, Number<EPerBlock>{}, True);
+        blockwise_wei_copy.MoveSlicingWindowOnSourceTensor(I0, Number<EPerBlock>{}, True);
     }
 
     // copy output: register to global memory