diff --git a/composable_kernel/include/tensor_operation/blockwise_generic_tensor_slice_copy.hpp b/composable_kernel/include/tensor_operation/blockwise_generic_tensor_slice_copy.hpp index ed3049341e..0c1e5af052 100644 --- a/composable_kernel/include/tensor_operation/blockwise_generic_tensor_slice_copy.hpp +++ b/composable_kernel/include/tensor_operation/blockwise_generic_tensor_slice_copy.hpp @@ -199,7 +199,7 @@ struct BlockwiseGenericTensorSliceCopy_v1 } __device__ void RunLoadRegisterBuffer(const Float* __restrict__ p_src, - Float* __restrict__ p_Buffer) const + Float* __restrict__ p_buffer) const { constexpr auto thread_sub_tensor_lengths = SubLengths{}; @@ -216,24 +216,24 @@ struct BlockwiseGenericTensorSliceCopy_v1 constexpr auto src_thread_data_multi_id_begin = repeat_multi_id * data_per_cluster_per_dims; - constexpr auto Buffer_data_multi_id_begin = repeat_multi_id * thread_sub_tensor_lengths; + constexpr auto buffer_data_multi_id_begin = repeat_multi_id * thread_sub_tensor_lengths; constexpr index_t src_offset = SrcDesc::GetOffsetFromMultiIndex(src_thread_data_multi_id_begin); - constexpr index_t Buffer_offset = - thread_tensor_desc.GetOffsetFromMultiIndex(Buffer_data_multi_id_begin); + constexpr index_t buffer_offset = + thread_tensor_desc.GetOffsetFromMultiIndex(buffer_data_multi_id_begin); #else ford{}([&](auto repeat_multi_id) { const auto src_thread_data_multi_id_begin = repeat_multi_id * data_per_cluster_per_dims; - const auto Buffer_data_multi_id_begin = repeat_multi_id * thread_sub_tensor_lengths; + const auto buffer_data_multi_id_begin = repeat_multi_id * thread_sub_tensor_lengths; const index_t src_offset = SrcDesc::GetOffsetFromMultiIndex(src_thread_data_multi_id_begin); - const index_t Buffer_offset = - thread_tensor_desc.GetOffsetFromMultiIndex(Buffer_data_multi_id_begin); + const index_t buffer_offset = + thread_tensor_desc.GetOffsetFromMultiIndex(buffer_data_multi_id_begin); #endif // By position the origin of the per-thread window at the 
point, where multi-index @@ -247,7 +247,7 @@ struct BlockwiseGenericTensorSliceCopy_v1 p_src + src_offset + mThreadSrcOffset, make_zero_array(), thread_tensor_desc, - p_Buffer + Buffer_offset, + p_buffer + buffer_offset, make_zero_array(), thread_sub_tensor_lengths, SrcAccessOrder{}, @@ -255,7 +255,7 @@ struct BlockwiseGenericTensorSliceCopy_v1 }); } - __device__ void RunStoreRegisterBuffer(const Float* __restrict__ p_Buffer, + __device__ void RunStoreRegisterBuffer(const Float* __restrict__ p_buffer, Float* __restrict__ p_dst) const { constexpr auto thread_sub_tensor_lengths = SubLengths{}; @@ -270,23 +270,23 @@ struct BlockwiseGenericTensorSliceCopy_v1 #if CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_BLOCKWISE_GENERIC_SLICE_COPY_V1 static_ford{}([&](auto repeat_multi_id) { - constexpr auto Buffer_data_multi_id_begin = repeat_multi_id * thread_sub_tensor_lengths; + constexpr auto buffer_data_multi_id_begin = repeat_multi_id * thread_sub_tensor_lengths; constexpr auto dst_data_multi_id_begin = repeat_multi_id * data_per_cluster_per_dims; - constexpr index_t Buffer_offset = - thread_tensor_desc.GetOffsetFromMultiIndex(Buffer_data_multi_id_begin); + constexpr index_t buffer_offset = + thread_tensor_desc.GetOffsetFromMultiIndex(buffer_data_multi_id_begin); constexpr index_t dst_offset = DstDesc::GetOffsetFromMultiIndex(dst_data_multi_id_begin); #else ford{}([&](auto repeat_multi_id) { - const auto Buffer_data_multi_id_begin = repeat_multi_id * thread_sub_tensor_lengths; + const auto buffer_data_multi_id_begin = repeat_multi_id * thread_sub_tensor_lengths; const auto dst_data_multi_id_begin = repeat_multi_id * data_per_cluster_per_dims; - const index_t Buffer_offset = - thread_tensor_desc.GetOffsetFromMultiIndex(Buffer_data_multi_id_begin); + const index_t buffer_offset = + thread_tensor_desc.GetOffsetFromMultiIndex(buffer_data_multi_id_begin); const index_t dst_offset = DstDesc::GetOffsetFromMultiIndex(dst_data_multi_id_begin); #endif @@ -299,7 +299,7 @@ struct 
BlockwiseGenericTensorSliceCopy_v1 // If in the future, you want to enable SubLengths > 1 at the merged dimension, // special care in implementation is needed threadwise_generic_tensor_slice_copy_v1(thread_tensor_desc, - p_Buffer + Buffer_offset, + p_buffer + buffer_offset, make_zero_array(), DstDesc{}, p_dst + dst_offset + mThreadDstOffset, @@ -312,10 +312,10 @@ struct BlockwiseGenericTensorSliceCopy_v1 __device__ void Run(const Float* __restrict__ p_src, Float* __restrict__ p_dst) const { - Float p_Buffer[GetRegisterBufferSize()]; + Float p_buffer[GetRegisterBufferSize()]; - RunLoadRegisterBuffer(p_src, p_Buffer); - RunStoreRegisterBuffer(p_Buffer, p_dst); + RunLoadRegisterBuffer(p_src, p_buffer); + RunStoreRegisterBuffer(p_buffer, p_dst); } // When moving the slicing windows along a merged dimension, if the strides of the diff --git a/composable_kernel/include/utility/functional3.hpp b/composable_kernel/include/utility/functional3.hpp index 73674aa039..7bd2b8d0cd 100644 --- a/composable_kernel/include/utility/functional3.hpp +++ b/composable_kernel/include/utility/functional3.hpp @@ -24,105 +24,120 @@ struct is_static> : integral_constant }; // RemainLengths: Sequence<...> -template +// Orders: Sequence<...> +template struct static_ford_impl { - // F signature: F(Sequence<...> multi_id) - // CurrentMultiIndex: Sequence<...> - template - __host__ __device__ constexpr void operator()(F f, CurrentMultiIndex) const + __host__ __device__ constexpr static_ford_impl() { static_assert(RemainLengths::GetSize() > 0, "wrong! 
should not get here"); + } + // F signature: F(Sequence<...>) + // CurrentOrderedId: Sequence<...> + template + __host__ __device__ constexpr void operator()(F f, CurrentOrderedId) const + { static_for<0, RemainLengths::Front(), 1>{}([=](auto I) { - static_ford_impl{}(f, - CurrentMultiIndex::PushBack(I)); + static_ford_impl{}( + f, CurrentOrderedId::PushBack(I)); }); } }; -template <> -struct static_ford_impl> +template +struct static_ford_impl, Orders> { - // F signature: F(Sequence<...> multi_id) - // CurrentMultiIndex: Sequence<...> - template - __host__ __device__ constexpr void operator()(F f, CurrentMultiIndex) const + // F signature: F(Sequence<...>) + // OrderedId: Sequence<...> + template + __host__ __device__ constexpr void operator()(F f, OrderedId) const { - f(CurrentMultiIndex{}); + // retrieve unordered Id + f(OrderedId::ReorderGivenOld2New(Orders{})); } }; -// Lengths is Sequence<...> -template +// Lengths is Sequence<...>, it is the length of each dimension for N-dimensional loop +// Orders is Sequence<...>, it is the order of dimension in which static_ford will loop over each +// dimension +template ::type> struct static_ford { - // F signature: F(Sequence<...> multi_id) - template - __host__ __device__ constexpr void operator()(F f) const + __host__ __device__ constexpr static_ford() { static_assert(Lengths::GetSize() > 0, "wrong! Lengths is empty"); - - static_ford_impl{}(f, Sequence<>{}); + static_assert(Lengths::GetSize() == Orders::GetSize(), "wrong!
inconsistent size"); } -}; -template -struct ford_impl -{ - // F signature: F(Array<...> multi_id) - // CurrentMultiIndex: Array<...> - // RemainLengths: Sequence<...> - template - __host__ __device__ constexpr void - operator()(F f, CurrentMultiIndex current_multi_id, RemainLengths) const - { - static_assert(RemainLengths::GetSize() == RemainDim, "wrong!"); - static_assert(RemainDim > 1, "wrong!"); - - constexpr auto next_length = RemainLengths{}.Front(); - - for(index_t i = 0; i < next_length; ++i) - { - ford_impl{}(f, current_multi_id.PushBack(i), RemainLengths{}.PopFront()); - } - } -}; - -template <> -struct ford_impl<1> -{ - // F signature: F(Array<...> multi_id) - // CurrentMultiIndex: Array<...> - // RemainLengths: Sequence<...> - template - __host__ __device__ constexpr void - operator()(F f, CurrentMultiIndex current_multi_id, RemainLengths) const - { - static_assert(RemainLengths::GetSize() == 1, "wrong!"); - - constexpr index_t last_length = RemainLengths{}.Front(); - - for(index_t i = 0; i < last_length; ++i) - { - f(current_multi_id.PushBack(i)); - } - } -}; - -// Lengths is Sequence<...> -template -struct ford -{ - // F signature: F(Array<...> multi_id) + // F signature: F(Sequence<...> multi_id) + // multi_id is the unordered multi-index template __host__ __device__ constexpr void operator()(F f) const { - constexpr index_t first_length = Lengths{}.Front(); + constexpr auto ordered_lengths = Lengths::ReorderGivenNew2Old(Orders{}); + static_ford_impl{}(f, Sequence<>{}); + } +}; - for(index_t i = 0; i < first_length; ++i) +// RemainLengths: Sequence<...> +// Orders: Sequence<...> +template +struct ford_impl +{ + __host__ __device__ constexpr ford_impl() + { + static_assert(RemainLengths::GetSize() > 0, "wrong! 
should not get here"); + } + + // F signature: F(Array<...> multi_id) + // CurrentOrderedId: Array<...> + template + __host__ __device__ constexpr void operator()(F f, CurrentOrderedId current_ordered_id) const + { + for(index_t i = 0; i < RemainLengths::Front(); ++i) { - ford_impl{}(f, Array{i}, Lengths{}.PopFront()); + ford_impl{}( + f, current_ordered_id.PushBack(i)); + } + } +}; + +template +struct ford_impl, Orders> +{ + // F signature: F(Array<...> multi_id) + // CurrentOrderedId: Array<...> + template + __host__ __device__ constexpr void operator()(F f, CurrentOrderedId current_ordered_id) const + { + // retrieve unordered Id + f(reorder_array_given_old2new(current_ordered_id, Orders{})); + } +}; + +// Lengths is Sequence<...>, it is the length of each dimension for N-dimensional loop +// Orders is Sequence<...>, it is the order of dimension in which ford will loop over each +// dimension +template ::type> +struct ford +{ + __host__ __device__ constexpr ford() + { + static_assert(Lengths::GetSize() > 0, "wrong! Lengths is empty"); + static_assert(Lengths::GetSize() == Orders::GetSize(), "wrong! inconsistent size"); + } + + // F signature: F(Array<...> multi_id) + // multi_id is the unordered multi-index + template + __host__ __device__ constexpr void operator()(F f) const + { + for(index_t i = 0; i < Lengths::Front(); ++i) + { + ford_impl{}(f, Array{i}); + } + } +};