From 33b5a8556b68c2cbeba555088b647310779d17e4 Mon Sep 17 00:00:00 2001 From: Chao Liu Date: Thu, 16 May 2019 22:23:18 -0500 Subject: [PATCH] adding implicit gemm v3 --- ...lution_implicit_gemm_v1_chwn_cyxk_khwn.hpp | 2 +- ...lution_implicit_gemm_v1_nchw_cyxk_khwn.hpp | 2 +- ...lution_implicit_gemm_v1_nchw_cyxk_nkhw.hpp | 4 +- driver/driver.hip.cpp | 10 +- src/include/Array.hip.hpp | 14 ++ src/include/ConstantTensorDescriptor.hip.hpp | 60 ++--- src/include/Sequence.hip.hpp | 223 +++++++++--------- src/include/blockwise_gemm.hip.hpp | 3 +- src/include/blockwise_tensor_slice_op.hip.hpp | 33 +-- src/include/functional.hip.hpp | 8 +- ...3_lds_double_buffer_chwn_cyxk_khwn.hip.hpp | 2 +- ...3_lds_double_buffer_nchw_cyxk_khwn.hip.hpp | 4 +- ...3_lds_double_buffer_nchw_cyxk_nkhw.hip.hpp | 4 +- 13 files changed, 172 insertions(+), 197 deletions(-) diff --git a/driver/device_convolution_implicit_gemm_v1_chwn_cyxk_khwn.hpp b/driver/device_convolution_implicit_gemm_v1_chwn_cyxk_khwn.hpp index 938fba037a..4e739a001e 100644 --- a/driver/device_convolution_implicit_gemm_v1_chwn_cyxk_khwn.hpp +++ b/driver/device_convolution_implicit_gemm_v1_chwn_cyxk_khwn.hpp @@ -140,7 +140,7 @@ void device_convolution_implicit_gemm_v1_chwn_cyxk_khwn(InDesc, constexpr index_t WeiBlockCopyDataPerRead_K = 4; constexpr index_t OutThreadCopyDataPerWrite_N = 2; -#elif 0 +#elif 1 // for 3x3, 34x34, v1r3, Pascal // for 3x3, 28x28, v1r3, Pascal // for 3x3, 14x14, v1r3, Pascal diff --git a/driver/device_convolution_implicit_gemm_v1_nchw_cyxk_khwn.hpp b/driver/device_convolution_implicit_gemm_v1_nchw_cyxk_khwn.hpp index e5c20994f2..5ea9b8f030 100644 --- a/driver/device_convolution_implicit_gemm_v1_nchw_cyxk_khwn.hpp +++ b/driver/device_convolution_implicit_gemm_v1_nchw_cyxk_khwn.hpp @@ -64,7 +64,7 @@ void device_convolution_implicit_gemm_v1_nchw_cyxk_khwn(InDesc, wei_cyxk_device_buf.ToDevice(wei_cyxk.mData.data()); out_khwn_device_buf.ToDevice(out_khwn.mData.data()); -#if 0 +#if 1 // for 3x3, 34x34, v1r3, Pascal constexpr index_t BlockSize = 128; diff --git a/driver/device_convolution_implicit_gemm_v1_nchw_cyxk_nkhw.hpp b/driver/device_convolution_implicit_gemm_v1_nchw_cyxk_nkhw.hpp index f366f6664c..34ed48229f 100644 --- a/driver/device_convolution_implicit_gemm_v1_nchw_cyxk_nkhw.hpp +++ b/driver/device_convolution_implicit_gemm_v1_nchw_cyxk_nkhw.hpp @@ -57,7 +57,7 @@ void device_convolution_implicit_gemm_v1_nchw_cyxk_nkhw(InDesc, wei_cyxk_device_buf.ToDevice(wei_cyxk.mData.data()); out_nkhw_device_buf.ToDevice(out_nkhw.mData.data()); -#if 0 +#if 1 // for 3x3, 34x34, v1r3, Pascal constexpr index_t BlockSize = 128; @@ -162,7 +162,7 @@ void device_convolution_implicit_gemm_v1_nchw_cyxk_nkhw(InDesc, constexpr index_t WeiBlockCopyDataPerRead_K = 4; constexpr index_t OutThreadCopyDataPerWrite_W = 2; -#elif 1 +#elif 0 // for 3x3, 34x34, v1r3, Vega 20, WoPerBlock = 8 constexpr index_t BlockSize = 256; diff --git a/driver/driver.hip.cpp b/driver/driver.hip.cpp index 0b75e0083a..3e032a2333 100644 --- a/driver/driver.hip.cpp +++ b/driver/driver.hip.cpp @@ -13,7 +13,7 @@ #include "device_convolution_implicit_gemm_v1_nchw_cyxk_khwn.hpp" #include "device_convolution_implicit_gemm_v1_nchw_cyxk_nkhw.hpp" #include "device_convolution_implicit_gemm_v2_chwn_cyxk_khwn.hpp" -#include "device_convolution_implicit_gemm_v3_nchw_cyxk_nkhw.hpp" +//#include "device_convolution_implicit_gemm_v3_nchw_cyxk_nkhw.hpp" struct GeneratorTensor_1 { @@ -411,7 +411,7 @@ void check_error(const Tensor& ref, const Tensor& result) int main(int argc, char* argv[]) { -#if 0 
+#if 1 // 3x3, 34x34 constexpr index_t N = 64; constexpr index_t C = 256; @@ -435,7 +435,7 @@ int main(int argc, char* argv[]) constexpr index_t HPad = 0; constexpr index_t WPad = 0; -#elif 1 +#elif 0 // 3x3 filter, 28x28 image constexpr index_t N = 128; constexpr index_t C = 256; @@ -608,7 +608,7 @@ int main(int argc, char* argv[]) device_convolution_direct_v2_nchw_kcyx_nkhw #elif 0 device_direct_convolution_2_vectorized_nchw_kcyx_nkhw -#elif 0 +#elif 1 device_convolution_implicit_gemm_v1_chwn_cyxk_khwn #elif 0 device_convolution_implicit_gemm_v1_nchw_cyxk_khwn @@ -616,7 +616,7 @@ int main(int argc, char* argv[]) device_convolution_implicit_gemm_v1_nchw_cyxk_nkhw #elif 0 device_convolution_implicit_gemm_v2_chwn_cyxk_khwn -#elif 1 +#elif 0 device_convolution_implicit_gemm_v3_nchw_cyxk_nkhw #endif (in_nchw_desc, in_nchw, wei_kcyx_desc, wei_kcyx, out_nkhw_desc, out_nkhw_device, nrepeat); diff --git a/src/include/Array.hip.hpp b/src/include/Array.hip.hpp index 2370e4ed32..30e3bd0b7c 100644 --- a/src/include/Array.hip.hpp +++ b/src/include/Array.hip.hpp @@ -66,3 +66,17 @@ __host__ __device__ auto reorder_array_given_old2new(const Array& return new_array; } + +template +__host__ __device__ constexpr auto operator+(const Array& a, + const Array& b) +{ + Array result; + + static_for<0, NSize, 1>{}([&](auto I) { + constexpr index_t i = I.Get(); + result[i] = a[i] + b[i]; + }); + + return result; +} diff --git a/src/include/ConstantTensorDescriptor.hip.hpp b/src/include/ConstantTensorDescriptor.hip.hpp index 64c7f4408d..d61632a389 100644 --- a/src/include/ConstantTensorDescriptor.hip.hpp +++ b/src/include/ConstantTensorDescriptor.hip.hpp @@ -88,32 +88,11 @@ struct ConstantTensorDescriptor return accumulate_on_sequence(Lengths{}, std::multiplies{}, Number<1>{}); } -#if 0 - // c++14 doesn't support constexpr lambdas, has to use this trick instead - struct f_GetElementSpace_impl - { - template - __host__ __device__ constexpr index_t operator()(IDim idim) const - { - return (Type{}.GetLength(idim) - 1) * Type{}.GetStride(idim); - } - __host__ __device__ constexpr index_t operator()(index_t length, index_t stride) const - { - return (length - 1) * stride; - } - }; -#endif - template > __host__ __device__ static constexpr index_t GetElementSpace(Align align = Align{}) { -#if 0 - index_t element_space_unaligned = - static_const_reduce_n{}(f_GetElementSpace_impl{}, std::plus{}) + 1; -#else constexpr index_t element_space_unaligned = accumulate_on_sequence( (GetLengths() - Number<1>{}) * GetStrides(), std::plus{}, Number<1>{}); -#endif return align.Get() * ((element_space_unaligned + align.Get() - 1) / align.Get()); } @@ -150,10 +129,7 @@ struct ConstantTensorDescriptor constexpr auto multi_id = Sequence{}; - constexpr auto seq_tmp = - transform_sequences(std::multiplies{}, multi_id, GetStrides()); - - return accumulate_on_sequence(seq_tmp, std::plus{}, Number<0>{}); + return accumulate_on_sequence(multi_id * GetStrides(), std::plus{}, Number<0>{}); } __host__ __device__ static Array GetMultiIndex(index_t id) @@ -177,14 +153,14 @@ struct ConstantTensorDescriptor return ConstantTensorDescriptor{}; } - template + template __host__ __device__ static constexpr auto Extract(Number... extract_dims) { static_assert(sizeof...(IDims) <= GetNumOfDimension(), "wrong! 
too many number of dimensions to be extracted"); - return make_ConstantTensorDescriptor(Lengths{}.Extract(extract_dims), - Strides{}.Extract(extract_dims)); + return make_ConstantTensorDescriptor(Lengths{}.Extract(extract_dims...), + Strides{}.Extract(extract_dims...)); } template @@ -195,11 +171,11 @@ struct ConstantTensorDescriptor } template - __host__ device__ static constexpr auto Fold(Number, Number...) + __host__ __device__ static constexpr auto Fold(Number, Number...) { constexpr auto fold_intervals = Sequence{}; - constexpr fold_intervals_product = + constexpr index_t fold_intervals_product = accumulate_on_sequence(fold_intervals, std::multiplies{}, Number<1>{}); constexpr auto unfold_length = GetLength(Number{}); @@ -207,29 +183,31 @@ struct ConstantTensorDescriptor // length of the dimension to be folded needs to be dividable by fold_interval_product, // otherwise, folding is invalid - static_assert(unfold_length % fold_interval_product == 0, + static_assert(unfold_length % fold_intervals_product == 0, "wrong! length on the dimension to be folded cannot be evenly divided!"); // folded lengths constexpr auto fold_lengths = - Sequence{}.Append(fold_intervals); + Sequence{}.Append(fold_intervals); // folded strides - constexpr auto fold_strides = transform_sequences(mod_conv::scales{}, + constexpr auto fold_strides = + Number{} * reverse_scan_sequence(fold_intervals.PushBack(Number<1>{}), std::multiplies{}); // left and right lengths - constexpr auto lengths_pair = GetLengths().Split(Number{}); + constexpr auto lengths_pair = GetLengths().Split(Number{}); constexpr auto left_lengths = lengths_pair.first; constexpr auto right_lengths = lengths_pair.second.PopFront(); // left and right strides - constexpr auto strides_pair = GetStrides().Split(Number{}); + constexpr auto strides_pair = GetStrides().Split(Number{}); constexpr auto left_strides = strides_pair.first; constexpr auto right_strides = strides_pair.second.PopFront(); - return make_ConstantTensorDescriptor(left_lengths.Append(fold_lengths).Append(right_lengths), - left_strides.Append(fold_strides).Append(right_strides)); + return make_ConstantTensorDescriptor( + left_lengths.Append(fold_lengths).Append(right_lengths), + left_strides.Append(fold_strides).Append(right_strides)); } template @@ -264,8 +242,8 @@ struct ConstantTensorDescriptor constexpr index_t unfold_length = accumulate_on_sequence(fold_lengths, std::multiplies{}, Number<1>{}); - constexpr auto new_strides = - left_strides.PopBack(Number{}).Append(right_strides); + constexpr auto new_lengths = + left_lengths.PopBack(Number{}).Append(right_lengths); // strides constexpr auto strides_pair1 = Strides{}.Split(Number{}); @@ -281,7 +259,7 @@ struct ConstantTensorDescriptor constexpr index_t unfold_stride = fold_strides.Back(); constexpr auto new_strides = - left_strides.PushBack(Number{}).Append(right_strides); + left_strides.PushBack(Number{}).Append(right_strides); return make_ConstantTensorDescriptor(new_lengths, new_strides); } @@ -289,7 +267,7 @@ struct ConstantTensorDescriptor template __host__ __device__ static constexpr auto ReorderGivenNew2Old(Sequence /*new2old*/) { - static_assert(sizeof...(IRs) == GetNumberOfDimension(), "wrong! dimension is wrong"); + static_assert(sizeof...(IRs) == GetNumOfDimension(), "wrong! 
dimension is wrong"); constexpr auto map_new2old = Sequence{}; return make_ConstantTensorDescriptor(Lengths{}.ReorderGivenNew2Old(map_new2old), Strides{}.ReorderGivenNew2Old(map_new2old)); diff --git a/src/include/Sequence.hip.hpp b/src/include/Sequence.hip.hpp index 6b87885780..ad9010fc0f 100644 --- a/src/include/Sequence.hip.hpp +++ b/src/include/Sequence.hip.hpp @@ -2,14 +2,7 @@ #include "constant_integral.hip.hpp" #include "functional.hip.hpp" -struct EmptySequence -{ - template - __host__ __device__ constexpr Seq Append(Seq) const - { - return {}; - } -}; +struct EmptySequence; template struct Sequence @@ -73,18 +66,18 @@ struct Sequence __host__ __device__ constexpr auto PopBack() const; - template + template __host__ __device__ constexpr auto Append(Sequence) const { return Sequence{}; } - __host__ __device__ constexpr auto Append(EmptySequence) const { return Type{}; } + __host__ __device__ constexpr auto Append(EmptySequence) const; template __host__ __device__ constexpr auto Extract(Number...) const { - return Sequence)...>{}; + return Sequence{})...>{}; } template @@ -93,8 +86,8 @@ struct Sequence template __host__ __device__ constexpr auto operator()(FirstSeq, SecondSeq) const { - constexpr new_first = FirstSeq{}.PushBack(Number{}); - constexpr new_second = SecondSeq{}.PopFront(); + constexpr index_t new_first = FirstSeq{}.PushBack(Number{}); + constexpr index_t new_second = SecondSeq{}.PopFront(); static_if<(N > 0)>{}([&](auto fwd) { return split_impl{}(new_first, fwd(new_second)); @@ -102,26 +95,10 @@ struct Sequence } }; - // split one sequence to two sequnces: [0, I) and [I, nSize) + // split one sequence to two sequnces: [0, I) and [I, mSize) // return type is std::pair template - __host__ __device__ constexpr auto Split(Number) const - { - static_assert(I <= nSize, "wrong! split position is too high!"); - - static_if<(I == 0)>{}( - [&](auto fwd) { return std::make_pair(EmptySequence<>{}, fwd(Type{})); }); - - static_if<(I == nSize)>{}( - [&](auto fwd) { return std::make_pair(Type<>{}, fwd(EmptySequence<>{})); }); - - static_if<(I > 0 && I < nSize)>{}([&](auto fforwader) { - constexpr auto first = Sequence {} - constexpr auto second = Type{}.PopFront(); - - return split_impl{}(first, fwd(second)); - }); - } + __host__ __device__ constexpr auto Split(Number) const; template __host__ __device__ constexpr auto Modify(Number, Number) const @@ -135,6 +112,53 @@ struct Sequence } }; +struct EmptySequence +{ + __host__ __device__ static constexpr index_t GetSize() { return 0; } + + template + __host__ __device__ constexpr auto PushFront(Number) const + { + return Sequence{}; + } + + template + __host__ __device__ constexpr auto PushBack(Number) const + { + return Sequence{}; + } + + template + __host__ __device__ constexpr Seq Append(Seq) const + { + return Seq{}; + } +}; + +template +__host__ __device__ constexpr auto Sequence::Append(EmptySequence) const +{ + return Type{}; +} + +// split one sequence to two sequnces: [0, I) and [I, mSize) +// return type is std::pair +template +template +__host__ __device__ constexpr auto Sequence::Split(Number) const +{ + static_assert(I <= GetSize(), "wrong! 
split position is too high!"); + + static_if<(I == 0)>{}([&](auto fwd) { return std::make_pair(EmptySequence{}, fwd(Type{})); }); + + static_if<(I == GetSize())>{}( + [&](auto fwd) { return std::make_pair(Type{}, fwd(EmptySequence{})); }); + + static_if<(I > 0 && I < GetSize())>{}( + [&](auto fwd) { return split_impl{}(EmptySequence{}, fwd(Type{})); }); +} + +#if 0 template __host__ __device__ auto make_increasing_sequence(Number, Number, Number) { @@ -142,15 +166,10 @@ __host__ __device__ auto make_increasing_sequence(Number, Number, // not implemented } - -template -__host__ __device__ auto make_uniform_sequence(Number, Number); -{ - // not implemented -} +#endif template -__host__ __device__ constexpr auto operator+(Sequence, Sequence) const +__host__ __device__ constexpr auto operator+(Sequence, Sequence) { static_assert(sizeof...(Xs) == sizeof...(Ys), "wrong! inconsistent size"); @@ -158,17 +177,18 @@ __host__ __device__ constexpr auto operator+(Sequence, Sequence) c } template -__host__ __device__ constexpr auto operator-(Sequence seq_x, Sequence seq_y) const +__host__ __device__ constexpr auto operator-(Sequence seq_x, Sequence seq_y) { static_assert(sizeof...(Xs) == sizeof...(Ys), "wrong! inconsistent size"); - static_for<0, xs.GetSize(), 1>{}([&](auto I) { static_assert(seq_x.Get(I) >= seq_y.Get(I)); }); + static_for<0, seq_x.GetSize(), 1>{}( + [&](auto I) { static_assert(seq_x.Get(I) >= seq_y.Get(I), "wrong! going to undeflow"); }); return Sequence<(Xs - Ys)...>{}; } template -__host__ __device__ constexpr auto operator*(Sequence, Sequence)const +__host__ __device__ constexpr auto operator*(Sequence, Sequence) { static_assert(sizeof...(Xs) == sizeof...(Ys), "wrong! inconsistent size"); @@ -176,7 +196,7 @@ __host__ __device__ constexpr auto operator*(Sequence, Sequence)co } template -__host__ __device__ constexpr auto operator/(Sequence, Sequence) const +__host__ __device__ constexpr auto operator/(Sequence, Sequence) { static_assert(sizeof...(Xs) == sizeof...(Ys), "wrong! inconsistent size"); @@ -184,15 +204,7 @@ __host__ __device__ constexpr auto operator/(Sequence, Sequence) c } template -__host__ __device__ constexpr auto operator%(Sequence, Sequence) const -{ - static_assert(sizeof...(Xs) == sizeof...(Ys), "wrong! inconsistent size"); - - return Sequence<(Xs % Ys)...>{}; -} - -template -__host__ __device__ constexpr auto operator%(Sequence, Sequence) const +__host__ __device__ constexpr auto operator%(Sequence, Sequence) { static_assert(sizeof...(Xs) == sizeof...(Ys), "wrong! inconsistent size"); @@ -200,63 +212,79 @@ __host__ __device__ constexpr auto operator%(Sequence, Sequence) c } template -__host__ __device__ constexpr auto operator+(Sequence, Number) const +__host__ __device__ constexpr auto operator+(Sequence, Number) { - return seq_x + make_uniform_sequence(Number, Number{}); + return Sequence<(Xs + Y)...>{}; } template -__host__ __device__ constexpr auto operator-(Sequence, Number) const +__host__ __device__ constexpr auto operator-(Sequence, Number) { - return seq_x - make_uniform_sequence(Number, Number{}); + constexpr auto seq_x = Sequence{}; + +#if 0 + static_for<0, sizeof...(Xs), 1>{}([&](auto Iter) { + constexpr auto I = decltype(Iter){}; + static_assert(seq_x.Get(I) >= Y, "wrong! 
going to underflow"); + }); +#endif + + return Sequence<(Xs - Y)...>{}; } template -__host__ __device__ constexpr auto operator*(Sequence, Number)const +__host__ __device__ constexpr auto operator*(Sequence, Number) { - return seq_x * make_uniform_sequence(Number, Number{}); + return Sequence<(Xs * Y)...>{}; } template -__host__ __device__ constexpr auto operator/(Sequence, Number) const +__host__ __device__ constexpr auto operator/(Sequence, Number) { - return seq_x / make_uniform_sequence(Number, Number{}); + return Sequence<(Xs / Y)...>{}; } template -__host__ __device__ constexpr auto operator%(Sequence seq_x, Number y) const +__host__ __device__ constexpr auto operator%(Sequence, Number) { - return seq_x % make_uniform_sequence(Number, Number{}); + return Sequence<(Xs % Y)...>{}; } -template -__host__ __device__ constexpr auto operator+(Number, Sequence) const +template +__host__ __device__ constexpr auto operator+(Number, Sequence) { - return make_uniform_sequence(Number{}, Number{}) + Sequence{}; + return Sequence<(Y + Xs)...>{}; } -template -__host__ __device__ constexpr auto operator-(Number, Sequence) const +template +__host__ __device__ constexpr auto operator-(Number, Sequence) { - return make_uniform_sequence(Number{}, Number{}) - Sequence{}; + constexpr auto seq_x = Sequence{}; + + static_for<0, sizeof...(Xs), 1>{}([&](auto Iter) { + constexpr auto I = decltype(Iter){}; + static_assert(seq_x.Get(I) <= Y, "wrong! going to underflow"); + }); + + return Sequence<(Y - Xs)...>{}; } -template -__host__ __device__ constexpr auto operator*(Number, Sequence)const +template +__host__ __device__ constexpr auto operator*(Number, Sequence) { - return make_uniform_sequence(Number{}, Number{}) * Sequence{}; + return Sequence<(Y * Xs)...>{}; } -template -__host__ __device__ constexpr auto operator/(Number, Sequence) const +template +__host__ __device__ constexpr auto operator/(Number, Sequence) { - return make_uniform_sequence(Number{}, Number{}) / Sequence{}; + return Sequence<(Y / Xs)...>{}; } -template -__host__ __device__ constexpr auto operator%(Number, Sequence) const +template +__host__ __device__ constexpr auto operator%(Number, Sequence) { - return make_uniform_sequence(Number{}, Number{}) % Sequence{}; + return Sequence<(Y % Xs)...>{}; } template @@ -268,7 +296,7 @@ __host__ __device__ constexpr auto sequence_pop_front(Sequence) #if 0 // TODO: for some reason, compiler cannot instantiate this template -template +template __host__ __device__ constexpr auto sequence_pop_back(Sequence) { static_assert(sizeof...(Is) > 0, "empty Sequence!"); @@ -356,8 +384,6 @@ __host__ __device__ constexpr auto } #endif -#if 1 -// TODO: fix these mess template __host__ __device__ constexpr auto transform_sequences(F f, Sequence) { @@ -382,45 +408,6 @@ transform_sequences(F f, Sequence, Sequence, Sequence) return Sequence{}; } -#else -// TODO:: these doesn't compile -template -struct transform_sequences_impl -{ - template - __host__ __device__ constexpr auto operator()(F f, Y y, Xs... xs) const - { - static_assert(NRemain > 1, "wrong! should have NRemain > 1"); - - constexpr index_t N = f(Xs{}.Get(Number<0>{})...); - constexpr auto y_new = y.PushBack(Number{}); - - return transform_sequences_impl{}(f, y_new, xs.PopFront()...); - } -}; - -template <> -struct transform_sequences_impl<1> -{ - template - __host__ __device__ constexpr auto operator()(F f, Y, Xs...) 
const - { - constexpr index_t N = f(Xs{}.Get(Number<0>{})...); - return Y{}.PushBack(Number{}); - } -}; - -template -__host__ __device__ constexpr auto transform_sequences(F f, X x, Xs... xs) -{ - constexpr index_t nSize = X::GetSize(); - constexpr auto I0 = Number<0>{}; - - constexpr auto y0 = Sequence{}; - - return transform_sequences_impl{}(f, y0, x.PopFront(), xs.PopFront()...); -} -#endif template __host__ __device__ constexpr auto Sequence::PopFront() const diff --git a/src/include/blockwise_gemm.hip.hpp b/src/include/blockwise_gemm.hip.hpp index 722c1ae9bb..3159eb5ae2 100644 --- a/src/include/blockwise_gemm.hip.hpp +++ b/src/include/blockwise_gemm.hip.hpp @@ -7,6 +7,7 @@ template {}, thread_sub_tensor_lengths, SrcClusterLengths{}); + constexpr auto src_data_per_cluster_per_dims = + thread_sub_tensor_lengths * SrcClusterLengths{}; constexpr auto repeat_lengths = transform_sequences(mod_conv::integer_divide_ceiler{}, SrcLengths{}, src_data_per_cluster_per_dims); - constexpr auto thread_tensor_lengths = transform_sequences( - std::multiplies{}, thread_sub_tensor_lengths, repeat_lengths); + constexpr auto thread_tensor_lengths = thread_sub_tensor_lengths * repeat_lengths; constexpr auto thread_tensor_desc = make_ConstantTensorDescriptor(thread_tensor_lengths); @@ -153,27 +152,24 @@ struct BlockwiseTensorSliceReorderCopy_v3 { constexpr auto thread_sub_tensor_lengths = SrcSubLengths{}; - constexpr auto src_data_per_cluster_per_dims = transform_sequences( - std::multiplies{}, thread_sub_tensor_lengths, SrcClusterLengths{}); + constexpr auto src_data_per_cluster_per_dims = + thread_sub_tensor_lengths * SrcClusterLengths{}; constexpr auto repeat_lengths = transform_sequences(mod_conv::integer_divide_ceiler{}, SrcLengths{}, src_data_per_cluster_per_dims); - constexpr auto thread_tensor_lengths = transform_sequences( - std::multiplies{}, thread_sub_tensor_lengths, repeat_lengths); + constexpr auto thread_tensor_lengths = thread_sub_tensor_lengths * repeat_lengths; constexpr auto thread_tensor_desc = make_ConstantTensorDescriptor(thread_tensor_lengths); static_ford{}([&](auto repeat_multi_id_) { constexpr auto repeat_multi_id = decltype(repeat_multi_id_){}; - constexpr auto src_data_multi_id = transform_sequences( - std::multiplies{}, repeat_multi_id, src_data_per_cluster_per_dims); + constexpr auto src_data_multi_id = repeat_multi_id * src_data_per_cluster_per_dims; - constexpr auto clipboard_data_multi_id = transform_sequences( - std::multiplies{}, repeat_multi_id, thread_sub_tensor_lengths); + constexpr auto clipboard_data_multi_id = repeat_multi_id * thread_sub_tensor_lengths; constexpr index_t src_offset = SrcDesc{}.Get1dIndex(src_data_multi_id); constexpr index_t clipboard_offset = @@ -193,27 +189,24 @@ struct BlockwiseTensorSliceReorderCopy_v3 { constexpr auto thread_sub_tensor_lengths = SrcSubLengths{}; - constexpr auto src_data_per_cluster_per_dims = transform_sequences( - std::multiplies{}, thread_sub_tensor_lengths, SrcClusterLengths{}); + constexpr auto src_data_per_cluster_per_dims = + thread_sub_tensor_lengths * SrcClusterLengths{}; constexpr auto repeat_lengths = transform_sequences(mod_conv::integer_divide_ceiler{}, SrcLengths{}, src_data_per_cluster_per_dims); - constexpr auto thread_tensor_lengths = transform_sequences( - std::multiplies{}, thread_sub_tensor_lengths, repeat_lengths); + constexpr auto thread_tensor_lengths = thread_sub_tensor_lengths * repeat_lengths; constexpr auto thread_tensor_desc = make_ConstantTensorDescriptor(thread_tensor_lengths); 
static_ford{}([&](auto repeat_multi_id_) { constexpr auto repeat_multi_id = decltype(repeat_multi_id_){}; - constexpr auto clipboard_data_multi_id = transform_sequences( - std::multiplies{}, repeat_multi_id, thread_sub_tensor_lengths); + constexpr auto clipboard_data_multi_id = repeat_multi_id * thread_sub_tensor_lengths; - constexpr auto src_data_multi_id = transform_sequences( - std::multiplies{}, repeat_multi_id, src_data_per_cluster_per_dims); + constexpr auto src_data_multi_id = repeat_multi_id * src_data_per_cluster_per_dims; // reorder src_data_multi_id to get dst_data_multi_id constexpr auto dst_data_multi_id = src_data_multi_id.ReorderGivenNew2Old(MapDst2Src{}); diff --git a/src/include/functional.hip.hpp b/src/include/functional.hip.hpp index 79b5b22cc3..e25dffa8c7 100644 --- a/src/include/functional.hip.hpp +++ b/src/include/functional.hip.hpp @@ -37,7 +37,8 @@ struct static_if { // This is a trick for compiler: // Pass forwarder to lambda "f" as "auto" argument, and maks sure "f" will use it, - // this will make "f" a generic lambda, so that "f" won't be compiled until here + // this will make "f" a generic lambda, so that "f" won't be compiled until being + // instantiated here f(forwarder{}); return Type{}; } @@ -65,7 +66,8 @@ struct static_if { // This is a trick for compiler: // Pass forwarder to lambda "f" as "auto" argument, and maks sure "f" will use it, - // this will make "f" a generic lambda, so that "f" won't be compiled until here + // this will make "f" a generic lambda, so that "f" won't be compiled until being + // instantiated here f(forwarder{}); return Type{}; } @@ -105,7 +107,7 @@ struct static_for static_assert((NEnd - NBegin) % Increment == 0, "Wrong! should satisfy (NEnd - NBegin) % Increment == 0"); - static_if<(NBegin < End)>{}( + static_if<(NBegin < NEnd)>{}( [&](auto fwd) { static_for_impl{}(f); }); } }; diff --git a/src/include/gridwise_convolution_implicit_gemm_v1r3_lds_double_buffer_chwn_cyxk_khwn.hip.hpp b/src/include/gridwise_convolution_implicit_gemm_v1r3_lds_double_buffer_chwn_cyxk_khwn.hip.hpp index 3e2ea73bd9..1cf033bef5 100644 --- a/src/include/gridwise_convolution_implicit_gemm_v1r3_lds_double_buffer_chwn_cyxk_khwn.hip.hpp +++ b/src/include/gridwise_convolution_implicit_gemm_v1r3_lds_double_buffer_chwn_cyxk_khwn.hip.hpp @@ -201,7 +201,7 @@ struct GridwiseConvolutionImplicitGemm_v1r3_lds_double_buffer_chwn_cyxk_khwn // choose GEMM implementation here const auto run_blockwise_batch_gemm = [&](auto... 
Xs) { -#if 0 +#if 1 return blockwise_batch_gemm.Run(Xs...); #elif 0 return blockwise_batch_gemm.Run_asm(Xs...); diff --git a/src/include/gridwise_convolution_implicit_gemm_v1r3_lds_double_buffer_nchw_cyxk_khwn.hip.hpp b/src/include/gridwise_convolution_implicit_gemm_v1r3_lds_double_buffer_nchw_cyxk_khwn.hip.hpp index e33b25d429..45933c6bc2 100644 --- a/src/include/gridwise_convolution_implicit_gemm_v1r3_lds_double_buffer_nchw_cyxk_khwn.hip.hpp +++ b/src/include/gridwise_convolution_implicit_gemm_v1r3_lds_double_buffer_nchw_cyxk_khwn.hip.hpp @@ -142,7 +142,7 @@ struct GridwiseConvolutionImplicitGemm_v1r3_lds_double_buffer_nchw_cyxk_khwn decltype(map_chwn2nchw), InBlockReorderMapThreadCluster2SrcCluster_CHNW2NCHW, InBlockReorderDataPerRead_W, - InBlockReorderDataPerWrite_N>{}; + InBlockReorderDataPerWrite_N>({0, 0, 0, 0}, {0, 0, 0, 0}); // blockwise wei copy // format is [CPerBlock, KPerBlock] @@ -196,7 +196,7 @@ struct GridwiseConvolutionImplicitGemm_v1r3_lds_double_buffer_nchw_cyxk_khwn // choose GEMM implementation here const auto run_blockwise_batch_gemm = [&](auto... Xs) { -#if 0 +#if 1 return blockwise_batch_gemm.Run(Xs...); #elif 0 return blockwise_batch_gemm.Run_asm(Xs...); diff --git a/src/include/gridwise_convolution_implicit_gemm_v1r3_lds_double_buffer_nchw_cyxk_nkhw.hip.hpp b/src/include/gridwise_convolution_implicit_gemm_v1r3_lds_double_buffer_nchw_cyxk_nkhw.hip.hpp index 605496d1c8..33673dbaa4 100644 --- a/src/include/gridwise_convolution_implicit_gemm_v1r3_lds_double_buffer_nchw_cyxk_nkhw.hip.hpp +++ b/src/include/gridwise_convolution_implicit_gemm_v1r3_lds_double_buffer_nchw_cyxk_nkhw.hip.hpp @@ -142,7 +142,7 @@ struct GridwiseConvolutionImplicitGemm_v1r3_lds_double_buffer_nchw_cyxk_nkhw decltype(map_chwn2nchw), InBlockReorderMapThreadCluster2SrcCluster_CHNW2NCHW, InBlockReorderDataPerRead_W, - InBlockReorderDataPerWrite_N>{}; + InBlockReorderDataPerWrite_N>({0, 0, 0, 0}, {0, 0, 0, 0}); // blockwise wei copy // format is [CPerBlock, KPerBlock] @@ -196,7 +196,7 @@ struct GridwiseConvolutionImplicitGemm_v1r3_lds_double_buffer_nchw_cyxk_nkhw // choose GEMM implementation here const auto run_blockwise_batch_gemm = [&](auto... Xs) { -#if 0 +#if 1 return blockwise_batch_gemm.Run(Xs...); #elif 0 return blockwise_batch_gemm.Run_asm(Xs...);
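
The recurring change in blockwise_tensor_slice_op.hip.hpp above is swapping `transform_sequences(std::multiplies<index_t>{}, a, b)` for the element-wise `operator*` that this patch adds to Sequence. Below is a minimal, standalone sketch (not part of the patch) of that usage pattern: the stripped-down `Sequence` here drops the `__host__ __device__` qualifiers and the other members of the real class in src/include/Sequence.hip.hpp, and the variable names and lengths are made-up illustrative values.

```cpp
#include <cstdio>

using index_t = int;

// simplified stand-in for the Sequence in src/include/Sequence.hip.hpp
template <index_t... Is>
struct Sequence
{
    static constexpr index_t GetSize() { return sizeof...(Is); }
};

// element-wise product of two compile-time sequences, as introduced by this patch
template <index_t... Xs, index_t... Ys>
constexpr auto operator*(Sequence<Xs...>, Sequence<Ys...>)
{
    static_assert(sizeof...(Xs) == sizeof...(Ys), "wrong! inconsistent size");
    return Sequence<(Xs * Ys)...>{};
}

int main()
{
    // hypothetical per-thread sub-tensor lengths and thread-cluster lengths
    constexpr auto sub_lengths     = Sequence<1, 2, 1, 4>{};
    constexpr auto cluster_lengths = Sequence<4, 4, 8, 2>{};

    // replaces transform_sequences(std::multiplies<index_t>{}, sub_lengths, cluster_lengths);
    // result is Sequence<4, 8, 8, 8>, computed entirely at compile time
    constexpr auto data_per_cluster_per_dims = sub_lengths * cluster_lengths;

    printf("result has %d dimensions\n", data_per_cluster_per_dims.GetSize());
    return 0;
}
```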