diff --git a/example/12_reduce/README.md b/example/12_reduce/README.md
index fca8205ca6..20e1b5aa6a 100644
--- a/example/12_reduce/README.md
+++ b/example/12_reduce/README.md
@@ -37,7 +37,7 @@ cmake \
 ```bash
 # -D : input 4-d tensor lengths
 # -v : verification (0=no, 1=yes)
-#arg1: initialization (0=no init, 1=integer value, 2=decimal value)
+#arg1: initialization (0=no init, 1=single integer value, 2=scoped integer value, 3=decimal value)
 #arg2: run kernel # of times (>1)
 ./bin/reduce_blockwise -D 16,64,32,960 -v 1 1 10
 ```
diff --git a/example/12_reduce/reduce_blockwise.cpp b/example/12_reduce/reduce_blockwise.cpp
index 6a5864ede0..e41a961103 100644
--- a/example/12_reduce/reduce_blockwise.cpp
+++ b/example/12_reduce/reduce_blockwise.cpp
@@ -13,7 +13,7 @@
 #include "device_base.hpp"
 #include "device_reduce_blockwise.hpp"
 #include "host_reduce_util.hpp"
-#include "host_generic_reduction.hpp"
+#include "host_reduction.hpp"
 #include "reduction_enums.hpp"
 #include "reduction_operator_mapping.hpp"
 
@@ -21,13 +21,13 @@
 using namespace ck;
 using namespace ck::tensor_operation::device;
 
-using InDataType  = half_float::half;
-using OutDataType = half_float::half;
+using InDataType  = ck::half_t;
+using OutDataType = ck::half_t;
 using AccDataType = float;
 
-using kInDataType  = ck::half_t;
-using kOutDataType = ck::half_t;
-using kAccDataType = float;
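+// the host-side reference reduction below keeps using half_float::half, while the
+// device instance now works on ck::half_t directly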
+using HostInDataType  = half_float::half;
+using HostOutDataType = half_float::half;
+using HostAccDataType = float;
 
 constexpr int Rank         = 4;
 constexpr int NumReduceDim = 3;
@@ -43,9 +43,9 @@ using InElementwiseOperation =
 using AccElementwiseOperation =
     typename reduce_unary_operator::AccElementwiseOperation;
 
-using DeviceReduceInstance = DeviceReduceBlockWise{}, num_thread);
-        if(beta != 0.0f)
-            out_ref.GenerateTensorValue(GeneratorTensor_1<OutDataType>{}, num_thread);
-        break;
+    case 0: break;
     case 1:
+        in.GenerateTensorValue(GeneratorTensor_1<InDataType>{1}, num_thread);
+        if(beta != 0.0f)
+            out_ref.GenerateTensorValue(GeneratorTensor_1<OutDataType>{1}, num_thread);
+        break;
+    case 2:
         in.GenerateTensorValue(GeneratorTensor_2<InDataType>{-5, 5}, num_thread);
         if(beta != 0.0f)
             out_ref.GenerateTensorValue(GeneratorTensor_2<OutDataType>{-5, 5}, num_thread);
         break;
     default:
-        in.GenerateTensorValue(GeneratorTensor_2<InDataType>{1, 5}, num_thread);
+        in.GenerateTensorValue(GeneratorTensor_3<InDataType>{-5.0, 5.0}, num_thread);
         if(beta != 0.0f)
-            out_ref.GenerateTensorValue(GeneratorTensor_2<OutDataType>{1, 5}, num_thread);
+            out_ref.GenerateTensorValue(GeneratorTensor_3<OutDataType>{-5.0, 5.0}, num_thread);
     }
 
     if(beta != 0.0f)
@@ -293,17 +298,27 @@ int main(int argc, char* argv[])
     if(beta != 0.0f)
         out_dev.ToDevice(out.mData.data());
 
-    size_t indicesSizeInBytes = NeedIndices ? out.mDesc.GetElementSize() * sizeof(int) : 0;
+    size_t indicesSizeInBytes = NeedIndices ? out.mDesc.GetElementSize() * sizeof(int32_t) : 0;
 
     DeviceMem out_indices_dev(indicesSizeInBytes);
 
     if(args.do_verification)
    {
-        ReductionHost<InDataType, AccDataType, OutDataType, ReduceOpId, Rank, NumReduceDim, PropagateNan, NeedIndices>
+        ReductionHost<HostInDataType, HostAccDataType, HostOutDataType, ReduceOpId, Rank, NumReduceDim, PropagateNan, NeedIndices>
             hostReduce(in.mDesc, out_ref.mDesc, invariantDims, reduceDims);
 
-        hostReduce.Run(
-            alpha, in.mData.data(), beta, out_ref.mData.data(), out_indices_ref.mData.data());
+        hostReduce.Run(alpha,
+                       reinterpret_cast<const HostInDataType*>(in.mData.data()),
+                       beta,
+                       reinterpret_cast<HostOutDataType*>(out_ref.mData.data()),
+                       out_indices_ref.mData.data());
     };
 
     const auto i_inLengths  = to_int_vector(args.inLengths);
@@ -313,7 +328,7 @@ int main(int argc, char* argv[])
 
     auto reduce = DeviceReduceInstance{};
 
-    auto wsSizeInBytes = reduce.GetWorkspaceSizeInBytes(i_inLengths);
+    auto wsSizeInBytes = reduce.GetWorkspaceSizeInBytes(i_inLengths, reduceDims);
 
     DeviceMem ws_dev(wsSizeInBytes);
diff --git a/example/13_pool2d_fwd/README.md b/example/13_pool2d_fwd/README.md
index 1f8cc4cfbd..4b994e7382 100644
--- a/example/13_pool2d_fwd/README.md
+++ b/example/13_pool2d_fwd/README.md
@@ -36,7 +36,7 @@ cmake \
 ## Run ```pool2d_fwd```
 ```bash
 #arg1: verification (0=no, 1=yes)
-#arg2: initialization (0=no init, 1=integer value, 2=decimal value)
+#arg2: initialization (0=no init, 1=single integer value, 2=scoped integer value, 3=decimal value)
 #arg3: run kernel # of times (>1)
 #arg4 to 15: N, C, Y, X, Hi, Wi, Sy, Sx, LeftPy, LeftPx, RightPy, RightPx
 ./example/pool2d_fwd 1 1 10
diff --git a/example/13_pool2d_fwd/pool2d_fwd.cpp b/example/13_pool2d_fwd/pool2d_fwd.cpp
index a0cb61136f..0b4aba3af1 100644
--- a/example/13_pool2d_fwd/pool2d_fwd.cpp
+++ b/example/13_pool2d_fwd/pool2d_fwd.cpp
@@ -236,8 +236,9 @@ int main(int argc, char* argv[])
     switch(init_method)
     {
     case 0: break;
-    case 1: in_n_c_hi_wi.GenerateTensorValue(GeneratorTensor_2<InDataType>{-5, 5}); break;
-    default: in_n_c_hi_wi.GenerateTensorValue(GeneratorTensor_3<InDataType>{0.0, 1.0});
+    case 1: in_n_c_hi_wi.GenerateTensorValue(GeneratorTensor_1<InDataType>{1}); break;
+    case 2: in_n_c_hi_wi.GenerateTensorValue(GeneratorTensor_2<InDataType>{-5, 5}); break;
+    default: in_n_c_hi_wi.GenerateTensorValue(GeneratorTensor_3<InDataType>{-5.0, 5.0});
     }
 
     DeviceMem in_device_buf(sizeof(InDataType) * in_n_c_hi_wi.mDesc.GetElementSpace());
diff --git a/include/ck/tensor_operation/gpu/device/device_reduce.hpp b/include/ck/tensor_operation/gpu/device/device_reduce.hpp
index 11fd58a2ff..50fa64dab8 100644
--- a/include/ck/tensor_operation/gpu/device/device_reduce.hpp
+++ b/include/ck/tensor_operation/gpu/device/device_reduce.hpp
@@ -16,9 +16,11 @@ namespace device {
 template <typename InElementwiseOperation, typename AccElementwiseOperation>
 struct DeviceReduce : public BaseOperator
 {
-    virtual size_t GetWorkspaceSizeInBytes(const std::vector<int>& inLengths)
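+    // the workspace layout depends on which dimensions get reduced, so the base
+    // interface now takes reduceDims alongside the (still unshuffled) input lengths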
+    virtual long_index_t GetWorkspaceSizeInBytes(const std::vector<int> inLengths,
+                                                 const std::vector<int> reduceDims)
     {
         (void)inLengths;
+        (void)reduceDims;
         return (0);
     };
 
@@ -32,19 +34,19 @@ struct DeviceReduce : public BaseOperator
     };
 
     virtual std::unique_ptr<BaseArgument>
-    MakeArgumentPointer(const std::vector<int>& inLengths,
-                        const std::vector<int>& inStrides,
-                        const std::vector<int>& outLengths,
-                        const std::vector<int>& outStrides,
-                        const std::vector<int>& reduceDims,
+    MakeArgumentPointer(const std::vector<int> inLengths,
+                        const std::vector<int> inStrides,
+                        const std::vector<int> outLengths,
+                        const std::vector<int> outStrides,
+                        const std::vector<int> reduceDims,
                         float alpha,
                         float beta,
                         const void* in_dev,
                         void* out_dev,
                         void* out_indices_dev,
                         void* workspace_dev,
-                        const InElementwiseOperation& in_elementwise_op,
-                        const AccElementwiseOperation& acc_elementwise_op) = 0;
+                        const InElementwiseOperation in_elementwise_op,
+                        const AccElementwiseOperation acc_elementwise_op) = 0;
 
     virtual std::unique_ptr<BaseInvoker> MakeInvokerPointer() = 0;
 };
diff --git a/include/ck/tensor_operation/gpu/device/device_reduce_blockwise.hpp b/include/ck/tensor_operation/gpu/device/device_reduce_blockwise.hpp
index cc1919ab81..4f17989b53 100644
--- a/include/ck/tensor_operation/gpu/device/device_reduce_blockwise.hpp
+++ b/include/ck/tensor_operation/gpu/device/device_reduce_blockwise.hpp
@@ -36,20 +36,20 @@ struct DeviceReduceBlockWise : public DeviceReduce
 
     static constexpr index_t NumInvariantDim = Rank - NumReduceDim;
 
-    using InvariantDims =
-        typename conditional<NumInvariantDim == 0,
-                             Sequence<>,
-                             typename arithmetic_sequence_gen<0, NumInvariantDim, 1>::type>::type;
-    using ReduceDims = typename arithmetic_sequence_gen<NumInvariantDim, Rank, 1>::type;
 
-    static constexpr index_t srcDims = Rank;
-    static constexpr index_t dstDims = (InvariantDims::Size() == 0) ? 1 : InvariantDims::Size();
-    static constexpr bool reduceAllDims = (InvariantDims::Size() == 0);
+    static constexpr index_t numSrcDim = Rank;
+    static constexpr index_t numDstDim = (NumInvariantDim == 0) ? 1 : NumInvariantDim;
+    static constexpr bool reduceAllDim = (NumInvariantDim == 0);
 
     static constexpr int M_BlockTileSize = MThreadClusterSize * MThreadSliceSize;
     static constexpr int K_BlockTileSize = KThreadClusterSize * KThreadSliceSize;
@@ -57,18 +57,18 @@ struct DeviceReduceBlockWise : public DeviceReduce
     static auto MakeSrc2dDescriptor(const std::vector<int>& inLengths,
                                     const std::vector<int>& inStrides)
     {
-        const auto tupleSrcLengths = make_tuple_from_array(inLengths, Number<srcDims>{});
-        const auto tupleSrcStrides = make_tuple_from_array(inStrides, Number<srcDims>{});
+        const auto tupleSrcLengths = make_tuple_from_array(inLengths, Number<numSrcDim>{});
+        const auto tupleSrcStrides = make_tuple_from_array(inStrides, Number<numSrcDim>{});
 
         const auto inDesc = make_naive_tensor_descriptor(tupleSrcLengths, tupleSrcStrides);
 
         const auto in_grid_desc_m_k = [&]() {
-            if constexpr(reduceAllDims)
+            if constexpr(reduceAllDim)
             {
                 const auto one_dim_inDesc = transform_tensor_descriptor(
                     inDesc,
                     make_tuple(make_merge_transform(tupleSrcLengths)),
-                    make_tuple(typename arithmetic_sequence_gen<0, srcDims, 1>::type{}),
+                    make_tuple(typename arithmetic_sequence_gen<0, numSrcDim, 1>::type{}),
                     make_tuple(Sequence<0>{}));
 
                 return transform_tensor_descriptor(one_dim_inDesc,
@@ -79,6 +79,9 @@ struct DeviceReduceBlockWise : public DeviceReduce
             }
             else
             {
+                using InvariantDims = typename arithmetic_sequence_gen<0, NumInvariantDim, 1>::type;
+                using ReduceDims = typename arithmetic_sequence_gen<NumInvariantDim, Rank, 1>::type;
+
                 const auto reduceDimLengths =
                     make_tuple_from_array_and_index_seq(inLengths, ReduceDims{});
                 const auto invariantDimLengths =
@@ -93,18 +96,20 @@ struct DeviceReduceBlockWise : public DeviceReduce
             }
         }();
 
-        const auto outerLen = in_grid_desc_m_k.GetLength(Number<0>{});
-        const auto innerLen = in_grid_desc_m_k.GetLength(Number<1>{});
+        const auto invariantLength = in_grid_desc_m_k.GetLength(Number<0>{});
+        const auto reduceLength    = in_grid_desc_m_k.GetLength(Number<1>{});
 
-        const auto inPad_M = math::integer_least_multiple(outerLen, M_BlockTileSize) - outerLen;
-        const auto inPad_K = math::integer_least_multiple(innerLen, K_BlockTileSize) - innerLen;
+        const auto inPad_M =
+            math::integer_least_multiple(invariantLength, M_BlockTileSize) - invariantLength;
+        const auto inPad_K =
+            math::integer_least_multiple(reduceLength, K_BlockTileSize) - reduceLength;
 
-        auto in_grid_desc_m_k_padded =
-            transform_tensor_descriptor(in_grid_desc_m_k,
-                                        make_tuple(make_right_pad_transform(outerLen, inPad_M),
-                                                   make_right_pad_transform(innerLen, inPad_K)),
-                                        make_tuple(Sequence<0>{}, Sequence<1>{}),
-                                        make_tuple(Sequence<0>{}, Sequence<1>{}));
+        auto in_grid_desc_m_k_padded = transform_tensor_descriptor(
+            in_grid_desc_m_k,
+            make_tuple(make_right_pad_transform(invariantLength, inPad_M),
+                       make_right_pad_transform(reduceLength, inPad_K)),
+            make_tuple(Sequence<0>{}, Sequence<1>{}),
+            make_tuple(Sequence<0>{}, Sequence<1>{}));
 
         return (in_grid_desc_m_k_padded);
     };
 
@@ -112,44 +117,45 @@ struct DeviceReduceBlockWise : public DeviceReduce
     static auto MakeDst1dDescriptor(const std::vector<int>& outLengths,
                                     const std::vector<int>& outStrides)
     {
-        const auto tupleDstLengths = make_tuple_from_array(outLengths, Number<dstDims>{});
-        const auto tupleDstStrides = make_tuple_from_array(outStrides, Number<dstDims>{});
+        const auto tupleDstLengths = make_tuple_from_array(outLengths, Number<numDstDim>{});
+        const auto tupleDstStrides = make_tuple_from_array(outStrides, Number<numDstDim>{});
 
         auto outDesc = make_naive_tensor_descriptor(tupleDstLengths, tupleDstStrides);
 
         auto out_grid_desc_m = transform_tensor_descriptor(
             outDesc,
             make_tuple(make_merge_transform(tupleDstLengths)),
-            make_tuple(typename arithmetic_sequence_gen<0, dstDims, 1>::type{}),
+            make_tuple(typename arithmetic_sequence_gen<0, numDstDim, 1>::type{}),
             make_tuple(Sequence<0>{}));
 
-        const auto outerLen = out_grid_desc_m.GetLength(Number<0>{});
+        const auto invariantLength = out_grid_desc_m.GetLength(Number<0>{});
 
-        const auto inPad = math::integer_least_multiple(outerLen, M_BlockTileSize) - outerLen;
+        const auto inPad =
+            math::integer_least_multiple(invariantLength, M_BlockTileSize) - invariantLength;
 
-        auto out_grid_desc_m_padded =
-            transform_tensor_descriptor(out_grid_desc_m,
-                                        make_tuple(make_right_pad_transform(outerLen, inPad)),
-                                        make_tuple(Sequence<0>{}),
-                                        make_tuple(Sequence<0>{}));
+        auto out_grid_desc_m_padded = transform_tensor_descriptor(
+            out_grid_desc_m,
+            make_tuple(make_right_pad_transform(invariantLength, inPad)),
+            make_tuple(Sequence<0>{}),
+            make_tuple(Sequence<0>{}));
 
         return (out_grid_desc_m_padded);
     };
 
     struct Argument : public BaseArgument
     {
-        Argument(const std::vector<int>& inLengths,
-                 const std::vector<int>& inStrides,
-                 const std::vector<int>& outLengths,
-                 const std::vector<int>& outStrides,
-                 const std::vector<int>& reduceDims,
+        Argument(const std::vector<int> inLengths,
+                 const std::vector<int> inStrides,
+                 const std::vector<int> outLengths,
+                 const std::vector<int> outStrides,
+                 const std::vector<int> reduceDims,
                  float alpha,
                  float beta,
                  const InDataType* in_dev,
                  OutDataType* out_dev,
                  IndexDataType* out_indices_dev,
                  AccDataType* workspace_dev,
-                 const InElementwiseOperation& in_elementwise_op,
-                 const AccElementwiseOperation& acc_elementwise_op)
+                 const InElementwiseOperation in_elementwise_op,
+                 const AccElementwiseOperation acc_elementwise_op)
             : outLengths_{outLengths},
              outStrides_{outStrides},
              in_dev_{in_dev},
@@ -160,21 +166,21 @@ struct DeviceReduceBlockWise : public DeviceReduce
         {
-            std::tie(inLengths_, inStrides_) =
-                shuffle_tensor_dimensions<Rank, NumReduceDim>(inLengths, inStrides, reduceDims);
+            inLengths_ = shuffle_tensor_dimensions<Rank, NumReduceDim>(inLengths, reduceDims);
+            inStrides_ = shuffle_tensor_dimensions<Rank, NumReduceDim>(inStrides, reduceDims);
 
-            alpha_ = static_cast<AccDataType>(alpha);
-            beta_  = static_cast<OutDataType>(beta);
+            alpha_ = type_convert<AccDataType>(alpha);
+            beta_  = type_convert<AccDataType>(beta);
 
             std::tie(invariant_total_length, reduce_total_length) =
-                get_2d_lengths<Rank, ReduceDims>(inLengths_);
+                get_2d_lengths<Rank, NumReduceDim>(inLengths_);
 
-            if constexpr(InvariantDims::Size() == 0)
+            if constexpr(NumInvariantDim == 0)
                 invariant_lowest_length = 1;
             else
-                invariant_lowest_length = inLengths_[InvariantDims::At(InvariantDims::Size() - 1)];
+                invariant_lowest_length = inLengths_[NumInvariantDim - 1];
 
-            reduce_lowest_length = inLengths_[ReduceDims::At(ReduceDims::Size() - 1)];
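+            // after shuffle_tensor_dimensions the innermost reduce dimension sits at
+            // index Rank - 1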
+            reduce_lowest_length = inLengths_[Rank - 1];
 
             gridSize = math::integer_least_multiple(invariant_total_length, M_BlockTileSize) /
                        M_BlockTileSize;
@@ -186,7 +192,7 @@ struct DeviceReduceBlockWise : public DeviceReduce
         std::vector<int> outStrides_;
 
         AccDataType alpha_;
-        OutDataType beta_;
+        AccDataType beta_;
 
         const InDataType* in_dev_;
         OutDataType* out_dev_;
@@ -278,18 +284,22 @@ struct DeviceReduceBlockWise : public DeviceReduce
         if constexpr(InSrcVectorDim == 0)
         {
-            if constexpr(InvariantDims::Size() == 0)
+            if constexpr(NumInvariantDim == 0)
+            {
                 return (false);
+            }
+            else
+            {
+                if(pArg->inStrides_[NumInvariantDim - 1] != 1)
+                    return (false);
 
-            if(pArg->inStrides_[InvariantDims::At(InvariantDims::Size() - 1)] != 1)
-                return (false);
-
-            if(pArg->invariant_lowest_length % InSrcVectorSize != 0)
-                return (false);
+                if(pArg->invariant_lowest_length % InSrcVectorSize != 0)
+                    return (false);
+            };
         }
         else
         {
-            if(pArg->inStrides_[ReduceDims::At(ReduceDims::Size() - 1)] != 1)
+            if(pArg->inStrides_[Rank - 1] != 1)
                 return (false);
 
             if(pArg->reduce_lowest_length % InSrcVectorSize != 0)
@@ -308,19 +318,19 @@ struct DeviceReduceBlockWise : public DeviceReduce
 
     std::unique_ptr<BaseArgument>
-    MakeArgumentPointer(const std::vector<int>& inLengths,
-                        const std::vector<int>& inStrides,
-                        const std::vector<int>& outLengths,
-                        const std::vector<int>& outStrides,
-                        const std::vector<int>& reduceDims,
+    MakeArgumentPointer(const std::vector<int> inLengths,
+                        const std::vector<int> inStrides,
+                        const std::vector<int> outLengths,
+                        const std::vector<int> outStrides,
+                        const std::vector<int> reduceDims,
                         float alpha,
                         float beta,
                        const void* in_dev,
                         void* out_dev,
                         void* out_indices_dev,
                         void* workspace_dev,
-                        const InElementwiseOperation& in_elementwise_op,
-                        const AccElementwiseOperation& acc_elementwise_op) override
+                        const InElementwiseOperation in_elementwise_op,
+                        const AccElementwiseOperation acc_elementwise_op) override
     {
         return std::make_unique<Argument>(inLengths,
                                           inStrides,
diff --git a/include/ck/tensor_operation/gpu/device/device_reduce_blockwise_second_call.hpp b/include/ck/tensor_operation/gpu/device/device_reduce_blockwise_second_call.hpp
index 1647b3d84c..d3b1b4b5c3 100644
--- a/include/ck/tensor_operation/gpu/device/device_reduce_blockwise_second_call.hpp
+++ b/include/ck/tensor_operation/gpu/device/device_reduce_blockwise_second_call.hpp
@@ -37,6 +37,10 @@ struct DeviceReduceBlockWiseSecondCall
     static_assert(BlockSize == MThreadClusterSize * KThreadClusterSize,
                   "Invalid thread cluster size assignments!");
 
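+    // the second call always reads the partial-result workspace along the reduced (K)
+    // dimension, so only InSrcVectorDim == 1 is meaningful here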
+    static_assert((InSrcVectorDim == 1 && KThreadSliceSize % InSrcVectorSize == 0) &&
+                      (MThreadSliceSize % OutDstVectorSize == 0),
+                  "Invalid thread slice sizes and/or vector sizes configuration, please check!");
+
     using IndexDataType = int32_t;
 
     static constexpr bool BetaIsZero = NeedIndices;
 
@@ -46,12 +50,8 @@ struct DeviceReduceBlockWiseSecondCall
         "InDataType and AccDataType should be the same to use DeviceReduceBlockWiseSecondCall!");
 
     static constexpr index_t NumInvariantDim = Rank - NumReduceDim;
 
-    using InvariantDims =
-        typename conditional<NumInvariantDim == 0,
-                             Sequence<>,
-                             typename arithmetic_sequence_gen<0, NumInvariantDim, 1>::type>::type;
 
-    static constexpr index_t dstDims = (InvariantDims::Size() == 0) ? 1 : InvariantDims::Size();
+    static constexpr index_t numDstDim = (NumInvariantDim == 0) ? 1 : NumInvariantDim;
 
     static constexpr int M_BlockTileSize = MThreadClusterSize * MThreadSliceSize;
     static constexpr int K_BlockTileSize = KThreadClusterSize * KThreadSliceSize;
@@ -65,18 +65,20 @@ struct DeviceReduceBlockWiseSecondCall
         const auto in_grid_desc_m_k =
             make_naive_tensor_descriptor(tupleSrcLengths, tupleSrcStrides);
 
-        const auto outerLen = in_grid_desc_m_k.GetLength(Number<0>{});
-        const auto innerLen = in_grid_desc_m_k.GetLength(Number<1>{});
+        const auto invariantLength = in_grid_desc_m_k.GetLength(Number<0>{});
+        const auto reduceLength    = in_grid_desc_m_k.GetLength(Number<1>{});
 
-        const auto inPad_M = math::integer_least_multiple(outerLen, M_BlockTileSize) - outerLen;
-        const auto inPad_K = math::integer_least_multiple(innerLen, K_BlockTileSize) - innerLen;
+        const auto inPad_M =
+            math::integer_least_multiple(invariantLength, M_BlockTileSize) - invariantLength;
+        const auto inPad_K =
+            math::integer_least_multiple(reduceLength, K_BlockTileSize) - reduceLength;
 
-        auto in_grid_desc_m_k_padded =
-            transform_tensor_descriptor(in_grid_desc_m_k,
-                                        make_tuple(make_right_pad_transform(outerLen, inPad_M),
-                                                   make_right_pad_transform(innerLen, inPad_K)),
-                                        make_tuple(Sequence<0>{}, Sequence<1>{}),
-                                        make_tuple(Sequence<0>{}, Sequence<1>{}));
+        auto in_grid_desc_m_k_padded = transform_tensor_descriptor(
+            in_grid_desc_m_k,
+            make_tuple(make_right_pad_transform(invariantLength, inPad_M),
+                       make_right_pad_transform(reduceLength, inPad_K)),
+            make_tuple(Sequence<0>{}, Sequence<1>{}),
+            make_tuple(Sequence<0>{}, Sequence<1>{}));
 
         return (in_grid_desc_m_k_padded);
     };
 
@@ -84,26 +86,27 @@ struct DeviceReduceBlockWiseSecondCall
     static auto MakeDst1dDescriptor(const std::vector<int>& outLengths,
                                     const std::vector<int>& outStrides)
     {
-        const auto tupleDstLengths = make_tuple_from_array(outLengths, Number<dstDims>{});
-        const auto tupleDstStrides = make_tuple_from_array(outStrides, Number<dstDims>{});
+        const auto tupleDstLengths = make_tuple_from_array(outLengths, Number<numDstDim>{});
+        const auto tupleDstStrides = make_tuple_from_array(outStrides, Number<numDstDim>{});
 
         auto outDesc = make_naive_tensor_descriptor(tupleDstLengths, tupleDstStrides);
 
         auto out_grid_desc_m = transform_tensor_descriptor(
             outDesc,
             make_tuple(make_merge_transform(tupleDstLengths)),
-            make_tuple(typename arithmetic_sequence_gen<0, dstDims, 1>::type{}),
+            make_tuple(typename arithmetic_sequence_gen<0, numDstDim, 1>::type{}),
             make_tuple(Sequence<0>{}));
 
-        const auto outerLen = out_grid_desc_m.GetLength(Number<0>{});
+        const auto invariantLength = out_grid_desc_m.GetLength(Number<0>{});
 
-        const auto outPad = math::integer_least_multiple(outerLen, M_BlockTileSize) - outerLen;
+        const auto outPad =
+            math::integer_least_multiple(invariantLength, M_BlockTileSize) - invariantLength;
 
-        auto out_grid_desc_m_padded =
-            transform_tensor_descriptor(out_grid_desc_m,
-                                        make_tuple(make_right_pad_transform(outerLen, outPad)),
-                                        make_tuple(Sequence<0>{}),
-                                        make_tuple(Sequence<0>{}));
+        auto out_grid_desc_m_padded = transform_tensor_descriptor(
+            out_grid_desc_m,
+            make_tuple(make_right_pad_transform(invariantLength, outPad)),
+            make_tuple(Sequence<0>{}),
+            make_tuple(Sequence<0>{}));
 
         return (out_grid_desc_m_padded);
     };
 
@@ -131,8 +134,8 @@ struct DeviceReduceBlockWiseSecondCall
           in_elementwise_op_(in_elementwise_op),
           acc_elementwise_op_(acc_elementwise_op)
         {
-            alpha_ = static_cast<AccDataType>(alpha);
-            beta_  = static_cast<OutDataType>(beta);
+            alpha_ = type_convert<AccDataType>(alpha);
+            beta_  = type_convert<AccDataType>(beta);
 
             invariant_total_length = inLengths[0];
             reduce_total_length    = inLengths[1];
 
@@ -159,7 +162,7 @@ struct DeviceReduceBlockWiseSecondCall
         std::vector<int> outStrides_;
 
         AccDataType alpha_;
-        OutDataType beta_;
+        AccDataType beta_;
 
         const InDataType* in_dev_;
         OutDataType* out_dev_;
@@ -268,19 +271,19 @@ struct DeviceReduceBlockWiseSecondCall
     };
 
     std::unique_ptr<BaseArgument>
-    MakeArgumentPointer(const std::vector<int>& inLengths,
-                        const std::vector<int>& inStrides,
-                        const std::vector<int>& outLengths,
-                        const std::vector<int>& outStrides,
-                        const std::vector<int>& reduceDims,
+    MakeArgumentPointer(const std::vector<int> inLengths,
+                        const std::vector<int> inStrides,
+                        const std::vector<int> outLengths,
+                        const std::vector<int> outStrides,
+                        const std::vector<int> reduceDims,
                        float alpha,
                         float beta,
                         const void* in_dev,
                         void* out_dev,
                         void* out_indices_dev,
                         void* workspace_dev,
-                        const InElementwiseOperation& in_elementwise_op,
-                        const AccElementwiseOperation& acc_elementwise_op) override
+                        const InElementwiseOperation in_elementwise_op,
+                        const AccElementwiseOperation acc_elementwise_op) override
     {
         (void)reduceDims;
diff --git a/include/ck/tensor_operation/gpu/device/device_reduce_common.hpp b/include/ck/tensor_operation/gpu/device/device_reduce_common.hpp
index 85e0eb1197..038c754722 100644
--- a/include/ck/tensor_operation/gpu/device/device_reduce_common.hpp
+++ b/include/ck/tensor_operation/gpu/device/device_reduce_common.hpp
@@ -12,38 +12,30 @@
 namespace ck {
 namespace tensor_operation {
 namespace device {
 
-// template <typename InElementwiseOperation, typename AccElementwiseOperation>
-// using DeviceReducePtr = std::unique_ptr<DeviceReduce<InElementwiseOperation, AccElementwiseOperation>>;
-
-template <index_t Rank, typename ReduceDims>
+// here, inLengths[] is already shuffled so that lengths of invariant dims are included before those
+// of reduce dims
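+// e.g. with Rank = 4, NumReduceDim = 2 and shuffled lengths {L0, L1, R0, R1}, the
+// returned pair is {L0 * L1, R0 * R1}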
+template <index_t Rank, index_t NumReduceDim>
 std::pair<size_t, size_t> get_2d_lengths(const std::vector<int>& inLengths)
 {
     static_assert(Rank <= 6, "bigger Rank size not supported!");
 
-    size_t tensor_total_length = 1;
-    size_t reduce_total_length = 1;
+    size_t invariant_total_length = 1;
+    size_t reduce_total_length    = 1;
 
-    static_for<0, ReduceDims::Size(), 1>{}(
-        [&](auto i) { reduce_total_length *= inLengths[ReduceDims::At(i)]; });
+    constexpr int NumInvariantDim = Rank - NumReduceDim;
 
-    static_for<0, Rank, 1>{}([&](auto i) { tensor_total_length *= inLengths[i.value]; });
+    for(int i = NumInvariantDim; i < Rank; i++)
+        reduce_total_length *= inLengths[i];
 
-    return std::make_pair(tensor_total_length / reduce_total_length, reduce_total_length);
-};
+    for(int i = 0; i < NumInvariantDim; i++)
+        invariant_total_length *= inLengths[i];
 
-template <typename Seq, index_t x>
-constexpr bool belong()
-{
-    bool inside = false;
-
-    static_for<0, Seq::Size(), 1>{}([&](auto i) { inside = (inside || (x == Seq::At(i))); });
-
-    return (inside);
+    return std::make_pair(invariant_total_length, reduce_total_length);
 };
 
 // helper functions using variadic template arguments
 template <index_t... Ns>
-static auto make_tuple_from_array_and_index_seq(const std::vector<int>& lengths, Sequence<Ns...>)
+auto make_tuple_from_array_and_index_seq(const std::vector<int>& lengths, Sequence<Ns...>)
 {
     return make_tuple(static_cast<index_t>(lengths[Ns])...);
 };
 
@@ -59,16 +51,12 @@ static auto make_tuple_from_array(const std::vector<int>& lengths, Number<arraySize>)
 
 template <index_t Rank, index_t NumReduceDim>
-static inline std::pair<std::vector<int>, std::vector<int>>
-shuffle_tensor_dimensions(const std::vector<int>& dimLengths,
-                          const std::vector<int>& dimStrides,
-                          const std::vector<int>& reduceDims)
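+// e.g. Rank = 4, reduceDims = {0, 1, 2}: lengths {x0, x1, x2, x3} become {x3, x0, x1, x2}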
+std::vector<int> shuffle_tensor_dimensions(const std::vector<int>& origLengthsStrides,
+                                           const std::vector<int>& reduceDims)
 {
-    std::vector<int> newDimLengths;
-    std::vector<int> newDimStrides;
+    std::vector<int> newLengthsStrides;
 
-    assert(Rank == dimLengths.size() && Rank == dimStrides.size() &&
-           NumReduceDim == reduceDims.size());
+    assert(Rank == origLengthsStrides.size() && NumReduceDim == reduceDims.size());
 
     int reduceFlag = 0;
 
@@ -82,19 +70,17 @@ shuffle_tensor_dimensions(const std::vector<int>& dimLengths,
 
     for(int i = 0; i < Rank; i++)
         if((reduceFlag & (1 << i)) == 0)
         {
-            newDimLengths.push_back(dimLengths[i]);
-            newDimStrides.push_back(dimStrides[i]);
+            newLengthsStrides.push_back(origLengthsStrides[i]);
         };
 
     // collect reduce dimensions
     for(int i = 0; i < Rank; i++)
         if((reduceFlag & (1 << i)) > 0)
         {
-            newDimLengths.push_back(dimLengths[i]);
-            newDimStrides.push_back(dimStrides[i]);
+            newLengthsStrides.push_back(origLengthsStrides[i]);
         };
 
-    return std::make_pair(newDimLengths, newDimStrides);
+    return newLengthsStrides;
 };
 
 } // namespace device
diff --git a/include/ck/tensor_operation/gpu/device/device_reduce_multiblock_atomic_add.hpp b/include/ck/tensor_operation/gpu/device/device_reduce_multiblock_atomic_add.hpp
index 5bf3c1d7d1..889c366875 100644
--- a/include/ck/tensor_operation/gpu/device/device_reduce_multiblock_atomic_add.hpp
+++ b/include/ck/tensor_operation/gpu/device/device_reduce_multiblock_atomic_add.hpp
@@ -39,18 +39,18 @@ struct DeviceReduceMultiBlockAtomicAdd
     static_assert(BlockSize == MThreadClusterSize * KThreadClusterSize,
                   "Invalid thread cluster size assignments!");
 
+    static_assert(((InSrcVectorDim == 0 && MThreadSliceSize % InSrcVectorSize == 0) ||
+                   (InSrcVectorDim == 1 && KThreadSliceSize % InSrcVectorSize == 0)) &&
+                      (MThreadSliceSize % OutDstVectorSize == 0),
+                  "Invalid thread slice sizes and/or vector sizes configuration, please check!");
+
     using IndexDataType = int32_t;
 
     static constexpr index_t NumInvariantDim = Rank - NumReduceDim;
 
-    using InvariantDims =
-        typename conditional<NumInvariantDim == 0,
-                             Sequence<>,
-                             typename arithmetic_sequence_gen<0, NumInvariantDim, 1>::type>::type;
-    using ReduceDims = typename arithmetic_sequence_gen<NumInvariantDim, Rank, 1>::type;
 
-    static constexpr index_t srcDims = Rank;
-    static constexpr index_t dstDims = (InvariantDims::Size() == 0) ? 1 : InvariantDims::Size();
-    static constexpr bool reduceAllDims = (InvariantDims::Size() == 0);
+    static constexpr index_t numSrcDim = Rank;
+    static constexpr index_t numDstDim = (NumInvariantDim == 0) ? 1 : NumInvariantDim;
+    static constexpr bool reduceAllDim = (NumInvariantDim == 0);
 
     static constexpr bool support_AtomicAdd =
         std::is_same<OutDataType, float>::value || std::is_same<OutDataType, double>::value;
 
@@ -67,18 +67,18 @@ struct DeviceReduceMultiBlockAtomicAdd
                                     int blkGroupSize,
                                     int kBlockTileIterations)
     {
-        const auto tupleSrcLengths = make_tuple_from_array(inLengths, Number<srcDims>{});
-        const auto tupleSrcStrides = make_tuple_from_array(inStrides, Number<srcDims>{});
+        const auto tupleSrcLengths = make_tuple_from_array(inLengths, Number<numSrcDim>{});
+        const auto tupleSrcStrides = make_tuple_from_array(inStrides, Number<numSrcDim>{});
 
         const auto inDesc = make_naive_tensor_descriptor(tupleSrcLengths, tupleSrcStrides);
 
         const auto in_grid_desc_m_k = [&]() {
-            if constexpr(reduceAllDims)
+            if constexpr(reduceAllDim)
             {
                 const auto one_dim_inDesc = transform_tensor_descriptor(
                     inDesc,
                     make_tuple(make_merge_transform(tupleSrcLengths)),
-                    make_tuple(typename arithmetic_sequence_gen<0, srcDims, 1>::type{}),
+                    make_tuple(typename arithmetic_sequence_gen<0, numSrcDim, 1>::type{}),
                     make_tuple(Sequence<0>{}));
 
                 return transform_tensor_descriptor(one_dim_inDesc,
@@ -89,6 +89,9 @@ struct DeviceReduceMultiBlockAtomicAdd
             }
             else
             {
+                using InvariantDims = typename arithmetic_sequence_gen<0, NumInvariantDim, 1>::type;
+                using ReduceDims = typename arithmetic_sequence_gen<NumInvariantDim, Rank, 1>::type;
+
                 const auto reduceDimLengths =
                     make_tuple_from_array_and_index_seq(inLengths, ReduceDims{});
                 const auto invariantDimLengths =
@@ -103,19 +106,20 @@ struct DeviceReduceMultiBlockAtomicAdd
             }
         }();
 
-        const auto outerLen = in_grid_desc_m_k.GetLength(Number<0>{});
-        const auto innerLen = in_grid_desc_m_k.GetLength(Number<1>{});
+        const auto invariantLength = in_grid_desc_m_k.GetLength(Number<0>{});
+        const auto reduceLength    = in_grid_desc_m_k.GetLength(Number<1>{});
 
         const int reduceSizePerBlock = K_BlockTileSize * kBlockTileIterations;
 
-        const auto inPad_M = math::integer_least_multiple(outerLen, M_BlockTileSize) - outerLen;
-        const auto inPad_K = reduceSizePerBlock * blkGroupSize - innerLen;
+        const auto inPad_M =
+            math::integer_least_multiple(invariantLength, M_BlockTileSize) - invariantLength;
+        const auto inPad_K = reduceSizePerBlock * blkGroupSize - reduceLength;
 
-        auto in_grid_desc_m_k_padded =
-            transform_tensor_descriptor(in_grid_desc_m_k,
-                                        make_tuple(make_right_pad_transform(outerLen, inPad_M),
-                                                   make_right_pad_transform(innerLen, inPad_K)),
-                                        make_tuple(Sequence<0>{}, Sequence<1>{}),
-                                        make_tuple(Sequence<0>{}, Sequence<1>{}));
+        auto in_grid_desc_m_k_padded = transform_tensor_descriptor(
+            in_grid_desc_m_k,
+            make_tuple(make_right_pad_transform(invariantLength, inPad_M),
+                       make_right_pad_transform(reduceLength, inPad_K)),
+            make_tuple(Sequence<0>{}, Sequence<1>{}),
+            make_tuple(Sequence<0>{}, Sequence<1>{}));
 
         return (in_grid_desc_m_k_padded);
     };
 
@@ -123,44 +127,45 @@ struct DeviceReduceMultiBlockAtomicAdd
     static auto MakeDst1dDescriptor(const std::vector<int>& outLengths,
                                     const std::vector<int>& outStrides)
     {
-        const auto tupleDstLengths = make_tuple_from_array(outLengths, Number<dstDims>{});
-        const auto tupleDstStrides = make_tuple_from_array(outStrides, Number<dstDims>{});
+        const auto tupleDstLengths = make_tuple_from_array(outLengths, Number<numDstDim>{});
+        const auto tupleDstStrides = make_tuple_from_array(outStrides, Number<numDstDim>{});
 
         auto outDesc = make_naive_tensor_descriptor(tupleDstLengths, tupleDstStrides);
 
         auto out_grid_desc_m = transform_tensor_descriptor(
             outDesc,
             make_tuple(make_merge_transform(tupleDstLengths)),
-            make_tuple(typename arithmetic_sequence_gen<0, dstDims, 1>::type{}),
+            make_tuple(typename arithmetic_sequence_gen<0, numDstDim, 1>::type{}),
             make_tuple(Sequence<0>{}));
 
-        const auto outerLen = out_grid_desc_m.GetLength(Number<0>{});
+        const auto invariantLength = out_grid_desc_m.GetLength(Number<0>{});
 
-        const auto outPad = math::integer_least_multiple(outerLen, M_BlockTileSize) - outerLen;
+        const auto outPad =
+            math::integer_least_multiple(invariantLength, M_BlockTileSize) - invariantLength;
 
-        auto out_grid_desc_m_padded =
-            transform_tensor_descriptor(out_grid_desc_m,
-                                        make_tuple(make_right_pad_transform(outerLen, outPad)),
-                                        make_tuple(Sequence<0>{}),
-                                        make_tuple(Sequence<0>{}));
+        auto out_grid_desc_m_padded = transform_tensor_descriptor(
+            out_grid_desc_m,
+            make_tuple(make_right_pad_transform(invariantLength, outPad)),
+            make_tuple(Sequence<0>{}),
+            make_tuple(Sequence<0>{}));
 
         return (out_grid_desc_m_padded);
     };
 
     struct Argument : public BaseArgument
     {
-        Argument(const std::vector<int>& inLengths,
-                 const std::vector<int>& inStrides,
-                 const std::vector<int>& outLengths,
-                 const std::vector<int>& outStrides,
-                 const std::vector<int>& reduceDims,
+        Argument(const std::vector<int> inLengths,
+                 const std::vector<int> inStrides,
+                 const std::vector<int> outLengths,
+                 const std::vector<int> outStrides,
+                 const std::vector<int> reduceDims,
                  float alpha,
                  float beta,
                  const InDataType* in_dev,
                  OutDataType* out_dev,
                  IndexDataType* out_indices_dev,
                  AccDataType* workspace_dev,
-                 const InElementwiseOperation& in_elementwise_op,
-                 const AccElementwiseOperation& acc_elementwise_op)
+                 const InElementwiseOperation in_elementwise_op,
+                 const AccElementwiseOperation acc_elementwise_op)
             : outLengths_{outLengths},
              outStrides_{outStrides},
              in_dev_{in_dev},
@@ -171,21 +176,21 @@ struct DeviceReduceMultiBlockAtomicAdd
             (void)out_indices_dev;
             (void)workspace_dev;
 
-            std::tie(inLengths_, inStrides_) =
-                shuffle_tensor_dimensions<Rank, NumReduceDim>(inLengths, inStrides, reduceDims);
+            inLengths_ = shuffle_tensor_dimensions<Rank, NumReduceDim>(inLengths, reduceDims);
+            inStrides_ = shuffle_tensor_dimensions<Rank, NumReduceDim>(inStrides, reduceDims);
 
-            alpha_ = static_cast<AccDataType>(alpha);
-            beta_  = static_cast<OutDataType>(beta);
+            alpha_ = type_convert<AccDataType>(alpha);
+            beta_  = type_convert<AccDataType>(beta);
 
             std::tie(invariant_total_length, reduce_total_length) =
-                get_2d_lengths<Rank, ReduceDims>(inLengths_);
+                get_2d_lengths<Rank, NumReduceDim>(inLengths_);
 
-            if constexpr(InvariantDims::Size() == 0)
+            if constexpr(NumInvariantDim == 0)
                 invariant_lowest_length = 1;
             else
-                invariant_lowest_length = inLengths_[InvariantDims::At(InvariantDims::Size() - 1)];
+                invariant_lowest_length = inLengths_[NumInvariantDim - 1];
 
-            reduce_lowest_length = inLengths_[ReduceDims::At(ReduceDims::Size() - 1)];
+            reduce_lowest_length = inLengths_[Rank - 1];
 
             int iterations = 1;
             while(true)
@@ -218,7 +223,7 @@ struct DeviceReduceMultiBlockAtomicAdd
         std::vector<int> outStrides_;
 
         AccDataType alpha_;
-        OutDataType beta_;
+        AccDataType beta_;
 
         const InDataType* in_dev_;
         OutDataType* out_dev_;
@@ -334,18 +339,22 @@ struct DeviceReduceMultiBlockAtomicAdd
 
         if constexpr(InSrcVectorDim == 0)
         {
-            if constexpr(InvariantDims::Size() == 0)
+            if constexpr(NumInvariantDim == 0)
+            {
                 return (false);
+            }
+            else
+            {
+                if(pArg->inStrides_[NumInvariantDim - 1] != 1)
+                    return (false);
 
-            if(pArg->inStrides_[InvariantDims::At(InvariantDims::Size() - 1)] != 1)
-                return (false);
-
-            if(pArg->invariant_lowest_length % InSrcVectorSize != 0)
-                return (false);
+                if(pArg->invariant_lowest_length % InSrcVectorSize != 0)
+                    return (false);
+            };
         }
         else
         {
-            if(pArg->inStrides_[ReduceDims::At(ReduceDims::Size() - 1)] != 1)
+            if(pArg->inStrides_[Rank - 1] != 1)
                 return (false);
 
             if(pArg->reduce_lowest_length % InSrcVectorSize != 0)
@@ -371,19 +380,19 @@ struct DeviceReduceMultiBlockAtomicAdd
     };
 
     std::unique_ptr<BaseArgument>
-    MakeArgumentPointer(const std::vector<int>& inLengths,
-                        const std::vector<int>& inStrides,
-                        const std::vector<int>& outLengths,
-                        const std::vector<int>& outStrides,
-                        const std::vector<int>& reduceDims,
+    MakeArgumentPointer(const std::vector<int> inLengths,
+                        const std::vector<int> inStrides,
+                        const std::vector<int> outLengths,
+                        const std::vector<int> outStrides,
+                        const std::vector<int> reduceDims,
                        float alpha,
                         float beta,
                         const void* in_dev,
                         void* out_dev,
                         void* out_indices_dev,
                         void* workspace_dev,
-                        const InElementwiseOperation& in_elementwise_op,
-                        const AccElementwiseOperation& acc_elementwise_op) override
+                        const InElementwiseOperation in_elementwise_op,
+                        const AccElementwiseOperation acc_elementwise_op) override
     {
         return std::make_unique<Argument>(inLengths,
                                           inStrides,
diff --git a/include/ck/tensor_operation/gpu/device/device_reduce_multiblock_partial_reduce.hpp b/include/ck/tensor_operation/gpu/device/device_reduce_multiblock_partial_reduce.hpp
index 5b69afa5d8..d583f7f1b8 100644
--- a/include/ck/tensor_operation/gpu/device/device_reduce_multiblock_partial_reduce.hpp
+++ b/include/ck/tensor_operation/gpu/device/device_reduce_multiblock_partial_reduce.hpp
@@ -37,31 +37,35 @@ struct DeviceReduceMultiBlockPartialReduce
     static_assert(BlockSize == MThreadClusterSize * KThreadClusterSize,
                   "Invalid thread cluster size assignments!");
 
+    static_assert((InSrcVectorDim == 0 && MThreadSliceSize % InSrcVectorSize == 0) ||
+                      (InSrcVectorDim == 1 && KThreadSliceSize % InSrcVectorSize == 0),
+                  "Invalid thread slice sizes and/or vector sizes configuration, please check!");
+
+    static_assert(OutDstVectorSize == 1, "OutDstVectorSize must be 1 for MultiBlockPartialReduce!");
+
     using IndexDataType = int32_t;
 
     static constexpr index_t NumInvariantDim = Rank - NumReduceDim;
 
-    using InvariantDims =
-        typename conditional<NumInvariantDim == 0,
-                             Sequence<>,
-                             typename arithmetic_sequence_gen<0, NumInvariantDim, 1>::type>::type;
-    using ReduceDims = typename arithmetic_sequence_gen<NumInvariantDim, Rank, 1>::type;
 
-    static constexpr index_t srcDims = Rank;
-    static constexpr index_t dstDims = (InvariantDims::Size() == 0) ? 1 : InvariantDims::Size();
-    static constexpr bool reduceAllDims = (InvariantDims::Size() == 0);
+    static constexpr index_t numSrcDim = Rank;
+    static constexpr index_t numDstDim = (NumInvariantDim == 0) ? 1 : NumInvariantDim;
+    static constexpr bool reduceAllDim = (NumInvariantDim == 0);
 
     static constexpr int M_BlockTileSize = MThreadClusterSize * MThreadSliceSize;
     static constexpr int K_BlockTileSize = KThreadClusterSize * KThreadSliceSize;
 
-    size_t GetWorkspaceSizeInBytes(const std::vector<int>& inLengths) override
+    static constexpr int MaxBlockGroupSize = 256;
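+    // blkGroupSize is capped by searching for the smallest iteration count with
+    // ceil(reduce_total_length / (K_BlockTileSize * iterations)) <= MaxBlockGroupSize;
+    // e.g. reduce_total_length = 100000 with K_BlockTileSize = 256 settles at
+    // iterations = 2 and blkGroupSize = 196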
+
+    long_index_t GetWorkspaceSizeInBytes(const std::vector<int> inLengths,
+                                         const std::vector<int> reduceDims) override
     {
         size_t invariant_total_length;
         size_t reduce_total_length;
 
+        auto inLengths_ = shuffle_tensor_dimensions<Rank, NumReduceDim>(inLengths, reduceDims);
+
         std::tie(invariant_total_length, reduce_total_length) =
-            get_2d_lengths<Rank, ReduceDims>(inLengths);
+            get_2d_lengths<Rank, NumReduceDim>(inLengths_);
 
         int iterations = 1;
         while(true)
@@ -69,8 +73,7 @@ struct DeviceReduceMultiBlockPartialReduce
             int testBlkGroupSize = (reduce_total_length + (K_BlockTileSize * iterations) - 1) /
                                    (K_BlockTileSize * iterations);
 
-            // we want the blkGroupSize be not more than 128
-            if(testBlkGroupSize <= 128)
+            if(testBlkGroupSize <= MaxBlockGroupSize)
                 break;
 
             iterations++;
@@ -79,11 +82,12 @@ struct DeviceReduceMultiBlockPartialReduce
         int blkGroupSize = (reduce_total_length + (K_BlockTileSize * iterations) - 1) /
                            (K_BlockTileSize * iterations);
 
-        size_t workspace_size = invariant_total_length * blkGroupSize;
+        long_index_t workspace_size = invariant_total_length * blkGroupSize;
 
-        size_t wsSizeInBytes =
-            !NeedIndices ? workspace_size * sizeof(AccDataType)
-                         : workspace_size * (sizeof(AccDataType) + sizeof(int)) + 64 + sizeof(int);
+        long_index_t wsSizeInBytes =
+            !NeedIndices
+                ? workspace_size * sizeof(AccDataType)
+                : workspace_size * (sizeof(AccDataType) + sizeof(int32_t)) + 64 + sizeof(int32_t);
 
         return (wsSizeInBytes);
     };
 
@@ -95,18 +99,18 @@ struct DeviceReduceMultiBlockPartialReduce
                                     int blkGroupSize,
                                     int kBlockTileIterations)
     {
-        const auto tupleSrcLengths = make_tuple_from_array(inLengths, Number<srcDims>{});
-        const auto tupleSrcStrides = make_tuple_from_array(inStrides, Number<srcDims>{});
+        const auto tupleSrcLengths = make_tuple_from_array(inLengths, Number<numSrcDim>{});
+        const auto tupleSrcStrides = make_tuple_from_array(inStrides, Number<numSrcDim>{});
 
         const auto inDesc = make_naive_tensor_descriptor(tupleSrcLengths, tupleSrcStrides);
 
         const auto in_grid_desc_m_k = [&]() {
-            if constexpr(reduceAllDims)
+            if constexpr(reduceAllDim)
             {
                 const auto one_dim_inDesc = transform_tensor_descriptor(
                     inDesc,
                     make_tuple(make_merge_transform(tupleSrcLengths)),
-                    make_tuple(typename arithmetic_sequence_gen<0, srcDims, 1>::type{}),
+                    make_tuple(typename arithmetic_sequence_gen<0, numSrcDim, 1>::type{}),
                     make_tuple(Sequence<0>{}));
 
                 return transform_tensor_descriptor(one_dim_inDesc,
@@ -117,6 +121,9 @@ struct DeviceReduceMultiBlockPartialReduce
             }
             else
             {
+                using InvariantDims = typename arithmetic_sequence_gen<0, NumInvariantDim, 1>::type;
+                using ReduceDims = typename arithmetic_sequence_gen<NumInvariantDim, Rank, 1>::type;
+
                 const auto reduceDimLengths =
                     make_tuple_from_array_and_index_seq(inLengths, ReduceDims{});
                 const auto invariantDimLengths =
@@ -131,32 +138,35 @@ struct DeviceReduceMultiBlockPartialReduce
             }
         }();
 
-        const auto outerLen = in_grid_desc_m_k.GetLength(Number<0>{});
-        const auto innerLen = in_grid_desc_m_k.GetLength(Number<1>{});
+        const auto invariantLength = in_grid_desc_m_k.GetLength(Number<0>{});
+        const auto reduceLength    = in_grid_desc_m_k.GetLength(Number<1>{});
 
         const int reduceSizePerBlock = K_BlockTileSize * kBlockTileIterations;
 
-        const auto inPad_M = math::integer_least_multiple(outerLen, M_BlockTileSize) - outerLen;
-        const auto inPad_K = reduceSizePerBlock * blkGroupSize - innerLen;
+        const auto inPad_M =
+            math::integer_least_multiple(invariantLength, M_BlockTileSize) - invariantLength;
+        const auto inPad_K = reduceSizePerBlock * blkGroupSize - reduceLength;
 
-        auto in_grid_desc_m_k_padded =
-            transform_tensor_descriptor(in_grid_desc_m_k,
-                                        make_tuple(make_right_pad_transform(outerLen, inPad_M),
-                                                   make_right_pad_transform(innerLen, inPad_K)),
-                                        make_tuple(Sequence<0>{}, Sequence<1>{}),
-                                        make_tuple(Sequence<0>{}, Sequence<1>{}));
+        auto in_grid_desc_m_k_padded = transform_tensor_descriptor(
+            in_grid_desc_m_k,
+            make_tuple(make_right_pad_transform(invariantLength, inPad_M),
+                       make_right_pad_transform(reduceLength, inPad_K)),
+            make_tuple(Sequence<0>{}, Sequence<1>{}),
+            make_tuple(Sequence<0>{}, Sequence<1>{}));
 
         return (in_grid_desc_m_k_padded);
     };
 
-    static auto MakeWorkspace2dDescriptor(int outerLen, int blkGroupSize)
+    static auto MakeWorkspace2dDescriptor(int invariantLength, int blkGroupSize)
     {
-        auto ws_desc_m_k = make_naive_tensor_descriptor_packed(make_tuple(outerLen, blkGroupSize));
+        auto ws_desc_m_k =
+            make_naive_tensor_descriptor_packed(make_tuple(invariantLength, blkGroupSize));
 
-        const auto wsPad = math::integer_least_multiple(outerLen, M_BlockTileSize) - outerLen;
+        const auto wsPad =
+            math::integer_least_multiple(invariantLength, M_BlockTileSize) - invariantLength;
 
         auto ws_desc_m_k_padded =
             transform_tensor_descriptor(ws_desc_m_k,
-                                        make_tuple(make_right_pad_transform(outerLen, wsPad),
+                                        make_tuple(make_right_pad_transform(invariantLength, wsPad),
                                                    make_pass_through_transform(blkGroupSize)),
                                         make_tuple(Sequence<0>{}, Sequence<1>{}),
                                         make_tuple(Sequence<0>{}, Sequence<1>{}));
@@ -166,19 +176,19 @@ struct DeviceReduceMultiBlockPartialReduce
 
     struct Argument : public BaseArgument
     {
-        Argument(const std::vector<int>& inLengths,
-                 const std::vector<int>& inStrides,
-                 const std::vector<int>& outLengths,
-                 const std::vector<int>& outStrides,
-                 const std::vector<int>& reduceDims,
+        Argument(const std::vector<int> inLengths,
+                 const std::vector<int> inStrides,
+                 const std::vector<int> outLengths,
+                 const std::vector<int> outStrides,
+                 const std::vector<int> reduceDims,
                  float alpha,
                  float beta,
                  const InDataType* in_dev,
                  OutDataType* out_dev,
                  IndexDataType* out_indices_dev,
                  AccDataType* workspace_dev,
-                 const InElementwiseOperation& in_elementwise_op,
-                 const AccElementwiseOperation& acc_elementwise_op)
+                 const InElementwiseOperation in_elementwise_op,
+                 const AccElementwiseOperation acc_elementwise_op)
            : outLengths_{outLengths},
              outStrides_{outStrides},
              in_dev_{in_dev},
@@ -188,21 +198,21 @@ struct DeviceReduceMultiBlockPartialReduce
           in_elementwise_op_{in_elementwise_op},
           acc_elementwise_op_{acc_elementwise_op}
         {
-            std::tie(inLengths_, inStrides_) =
-                shuffle_tensor_dimensions<Rank, NumReduceDim>(inLengths, inStrides, reduceDims);
+            inLengths_ = shuffle_tensor_dimensions<Rank, NumReduceDim>(inLengths, reduceDims);
+            inStrides_ = shuffle_tensor_dimensions<Rank, NumReduceDim>(inStrides, reduceDims);
 
-            alpha_ = static_cast<AccDataType>(alpha);
-            beta_  = static_cast<OutDataType>(beta);
+            alpha_ = type_convert<AccDataType>(alpha);
+            beta_  = type_convert<AccDataType>(beta);
 
             std::tie(invariant_total_length, reduce_total_length) =
-                get_2d_lengths<Rank, ReduceDims>(inLengths_);
+                get_2d_lengths<Rank, NumReduceDim>(inLengths_);
 
-            if constexpr(InvariantDims::Size() == 0)
+            if constexpr(NumInvariantDim == 0)
                 invariant_lowest_length = 1;
             else
-                invariant_lowest_length = inLengths_[InvariantDims::At(InvariantDims::Size() - 1)];
+                invariant_lowest_length = inLengths_[NumInvariantDim - 1];
 
-            reduce_lowest_length = inLengths_[ReduceDims::At(ReduceDims::Size() - 1)];
+            reduce_lowest_length = inLengths_[Rank - 1];
 
             int iterations = 1;
             while(true)
@@ -210,8 +220,7 @@ struct DeviceReduceMultiBlockPartialReduce
                 int testBlkGroupSize = (reduce_total_length + (K_BlockTileSize * iterations) - 1) /
                                        (K_BlockTileSize * iterations);
 
-                // we want the blkGroupSize be not more than 128
-                if(testBlkGroupSize <= 128)
+                if(testBlkGroupSize <= MaxBlockGroupSize)
                     break;
 
                 iterations++;
@@ -241,7 +250,7 @@ struct DeviceReduceMultiBlockPartialReduce
         std::vector<int> outStrides_;
 
         AccDataType alpha_;
-        OutDataType beta_;
+        AccDataType beta_;
 
         const InDataType* in_dev_;
         OutDataType* out_dev_;
@@ -337,18 +346,22 @@ struct DeviceReduceMultiBlockPartialReduce
 
         if constexpr(InSrcVectorDim == 0)
         {
-            if constexpr(InvariantDims::Size() == 0)
+            if constexpr(NumInvariantDim == 0)
+            {
                 return (false);
+            }
+            else
+            {
+                if(pArg->inStrides_[NumInvariantDim - 1] != 1)
+                    return (false);
 
-            if(pArg->inStrides_[InvariantDims::At(InvariantDims::Size() - 1)] != 1)
-                return (false);
-
-            if(pArg->invariant_lowest_length % InSrcVectorSize != 0)
-                return (false);
+                if(pArg->invariant_lowest_length % InSrcVectorSize != 0)
+                    return (false);
+            };
         }
         else
        {
-            if(pArg->inStrides_[ReduceDims::At(ReduceDims::Size() - 1)] != 1)
+            if(pArg->inStrides_[Rank - 1] != 1)
                 return (false);
 
             if(pArg->reduce_lowest_length % InSrcVectorSize != 0)
@@ -371,19 +384,19 @@ struct DeviceReduceMultiBlockPartialReduce
     };
 
     std::unique_ptr<BaseArgument>
-    MakeArgumentPointer(const std::vector<int>& inLengths,
-                        const std::vector<int>& inStrides,
-                        const std::vector<int>& outLengths,
-                        const std::vector<int>& outStrides,
-                        const std::vector<int>& reduceDims,
+    MakeArgumentPointer(const std::vector<int> inLengths,
+                        const std::vector<int> inStrides,
+                        const std::vector<int> outLengths,
+                        const std::vector<int> outStrides,
+                        const std::vector<int> reduceDims,
                        float alpha,
                         float beta,
                         const void* in_dev,
                         void* out_dev,
                         void* out_indices_dev,
                         void* workspace_dev,
-                        const InElementwiseOperation& in_elementwise_op,
-                        const AccElementwiseOperation& acc_elementwise_op) override
+                        const InElementwiseOperation in_elementwise_op,
+                        const AccElementwiseOperation acc_elementwise_op) override
     {
         return std::make_unique<Argument>(inLengths,
                                           inStrides,
diff --git a/include/ck/tensor_operation/gpu/device/device_reduce_threadwise.hpp b/include/ck/tensor_operation/gpu/device/device_reduce_threadwise.hpp
index e975a10d71..bf4088a96b 100644
--- a/include/ck/tensor_operation/gpu/device/device_reduce_threadwise.hpp
+++ b/include/ck/tensor_operation/gpu/device/device_reduce_threadwise.hpp
@@ -36,20 +36,20 @@ struct DeviceReduceThreadWise : public DeviceReduce
 
     static constexpr index_t NumInvariantDim = Rank - NumReduceDim;
 
-    using InvariantDims =
-        typename conditional<NumInvariantDim == 0,
-                             Sequence<>,
-                             typename arithmetic_sequence_gen<0, NumInvariantDim, 1>::type>::type;
-    using ReduceDims = typename arithmetic_sequence_gen<NumInvariantDim, Rank, 1>::type;
 
-    static constexpr index_t srcDims = Rank;
-    static constexpr index_t dstDims = (InvariantDims::Size() == 0) ? 1 : InvariantDims::Size();
-    static constexpr bool reduceAllDims = (InvariantDims::Size() == 0);
+    static constexpr index_t numSrcDim = Rank;
+    static constexpr index_t numDstDim = (NumInvariantDim == 0) ? 1 : NumInvariantDim;
+    static constexpr bool reduceAllDim = (NumInvariantDim == 0);
 
     static constexpr int M_BlockTileSize = MThreadClusterSize * MThreadSliceSize;
     static constexpr int K_BlockTileSize = KThreadClusterSize * KThreadSliceSize;
@@ -57,18 +57,18 @@ struct DeviceReduceThreadWise : public DeviceReduce
     static auto MakeSrc2dDescriptor(const std::vector<int>& inLengths,
                                     const std::vector<int>& inStrides)
     {
-        const auto tupleSrcLengths = make_tuple_from_array(inLengths, Number<srcDims>{});
-        const auto tupleSrcStrides = make_tuple_from_array(inStrides, Number<srcDims>{});
+        const auto tupleSrcLengths = make_tuple_from_array(inLengths, Number<numSrcDim>{});
+        const auto tupleSrcStrides = make_tuple_from_array(inStrides, Number<numSrcDim>{});
 
         const auto inDesc = make_naive_tensor_descriptor(tupleSrcLengths, tupleSrcStrides);
 
         const auto in_grid_desc_m_k = [&]() {
-            if constexpr(reduceAllDims)
+            if constexpr(reduceAllDim)
            {
                 const auto one_dim_inDesc = transform_tensor_descriptor(
                     inDesc,
                     make_tuple(make_merge_transform(tupleSrcLengths)),
-                    make_tuple(typename arithmetic_sequence_gen<0, srcDims, 1>::type{}),
+                    make_tuple(typename arithmetic_sequence_gen<0, numSrcDim, 1>::type{}),
                     make_tuple(Sequence<0>{}));
 
                 return transform_tensor_descriptor(one_dim_inDesc,
@@ -79,6 +79,9 @@ struct DeviceReduceThreadWise : public DeviceReduce
             }
             else
             {
+                using InvariantDims = typename arithmetic_sequence_gen<0, NumInvariantDim, 1>::type;
+                using ReduceDims = typename arithmetic_sequence_gen<NumInvariantDim, Rank, 1>::type;
+
                 const auto reduceDimLengths =
                     make_tuple_from_array_and_index_seq(inLengths, ReduceDims{});
                 const auto invariantDimLengths =
@@ -93,18 +96,20 @@ struct DeviceReduceThreadWise : public DeviceReduce
             }
         }();
 
-        const auto outerLen = in_grid_desc_m_k.GetLength(Number<0>{});
-        const auto innerLen = in_grid_desc_m_k.GetLength(Number<1>{});
+        const auto invariantLength = in_grid_desc_m_k.GetLength(Number<0>{});
+        const auto reduceLength    = in_grid_desc_m_k.GetLength(Number<1>{});
 
-        const auto inPad_M = math::integer_least_multiple(outerLen, M_BlockTileSize) - outerLen;
-        const auto inPad_K = math::integer_least_multiple(innerLen, K_BlockTileSize) - innerLen;
+        const auto inPad_M =
+            math::integer_least_multiple(invariantLength, M_BlockTileSize) - invariantLength;
+        const auto inPad_K =
+            math::integer_least_multiple(reduceLength, K_BlockTileSize) - reduceLength;
 
-        auto in_grid_desc_m_k_padded =
-            transform_tensor_descriptor(in_grid_desc_m_k,
-                                        make_tuple(make_right_pad_transform(outerLen, inPad_M),
-                                                   make_right_pad_transform(innerLen, inPad_K)),
-                                        make_tuple(Sequence<0>{}, Sequence<1>{}),
-                                        make_tuple(Sequence<0>{}, Sequence<1>{}));
+        auto in_grid_desc_m_k_padded = transform_tensor_descriptor(
+            in_grid_desc_m_k,
+            make_tuple(make_right_pad_transform(invariantLength, inPad_M),
+                       make_right_pad_transform(reduceLength, inPad_K)),
+            make_tuple(Sequence<0>{}, Sequence<1>{}),
+            make_tuple(Sequence<0>{}, Sequence<1>{}));
 
         return (in_grid_desc_m_k_padded);
     };
 
@@ -112,44 +117,45 @@ struct DeviceReduceThreadWise : public DeviceReduce
     static auto MakeDst1dDescriptor(const std::vector<int>& outLengths,
                                     const std::vector<int>& outStrides)
     {
-        const auto tupleDstLengths = make_tuple_from_array(outLengths, Number<dstDims>{});
-        const auto tupleDstStrides = make_tuple_from_array(outStrides, Number<dstDims>{});
+        const auto tupleDstLengths = make_tuple_from_array(outLengths, Number<numDstDim>{});
+        const auto tupleDstStrides = make_tuple_from_array(outStrides, Number<numDstDim>{});
 
         auto outDesc = make_naive_tensor_descriptor(tupleDstLengths, tupleDstStrides);
 
         auto out_grid_desc_m = transform_tensor_descriptor(
             outDesc,
             make_tuple(make_merge_transform(tupleDstLengths)),
-            make_tuple(typename arithmetic_sequence_gen<0, dstDims, 1>::type{}),
+            make_tuple(typename arithmetic_sequence_gen<0, numDstDim, 1>::type{}),
             make_tuple(Sequence<0>{}));
 
-        const auto outerLen = out_grid_desc_m.GetLength(Number<0>{});
+        const auto invariantLength = out_grid_desc_m.GetLength(Number<0>{});
 
-        const auto outPad = math::integer_least_multiple(outerLen, M_BlockTileSize) - outerLen;
+        const auto outPad =
+            math::integer_least_multiple(invariantLength, M_BlockTileSize) - invariantLength;
 
-        auto out_grid_desc_m_padded =
-            transform_tensor_descriptor(out_grid_desc_m,
-                                        make_tuple(make_right_pad_transform(outerLen, outPad)),
-                                        make_tuple(Sequence<0>{}),
-                                        make_tuple(Sequence<0>{}));
+        auto out_grid_desc_m_padded = transform_tensor_descriptor(
+            out_grid_desc_m,
+            make_tuple(make_right_pad_transform(invariantLength, outPad)),
+            make_tuple(Sequence<0>{}),
+            make_tuple(Sequence<0>{}));
 
         return (out_grid_desc_m_padded);
     };
 
     struct Argument : public BaseArgument
     {
-        Argument(const std::vector<int>& inLengths,
-                 const std::vector<int>& inStrides,
-                 const std::vector<int>& outLengths,
-                 const std::vector<int>& outStrides,
-                 const std::vector<int>& reduceDims,
+        Argument(const std::vector<int> inLengths,
+                 const std::vector<int> inStrides,
+                 const std::vector<int> outLengths,
+                 const std::vector<int> outStrides,
+                 const std::vector<int> reduceDims,
                  float alpha,
                  float beta,
                  const InDataType* in_dev,
                  OutDataType* out_dev,
                  IndexDataType* out_indices_dev,
                  AccDataType* workspace_dev,
-                 const InElementwiseOperation& in_elementwise_op,
-                 const OutElementwiseOperation& acc_elementwise_op)
+                 const InElementwiseOperation in_elementwise_op,
+                 const OutElementwiseOperation acc_elementwise_op)
            : outLengths_{outLengths},
              outStrides_{outStrides},
              in_dev_{in_dev},
@@ -161,21 +167,21 @@ struct DeviceReduceThreadWise : public DeviceReduce
         {
-            std::tie(inLengths_, inStrides_) =
-                shuffle_tensor_dimensions<Rank, NumReduceDim>(inLengths, inStrides, reduceDims);
+            inLengths_ = shuffle_tensor_dimensions<Rank, NumReduceDim>(inLengths, reduceDims);
+            inStrides_ = shuffle_tensor_dimensions<Rank, NumReduceDim>(inStrides, reduceDims);
 
-            alpha_ = static_cast<AccDataType>(alpha);
-            beta_  = static_cast<OutDataType>(beta);
+            alpha_ = type_convert<AccDataType>(alpha);
+            beta_  = type_convert<AccDataType>(beta);
 
            std::tie(invariant_total_length, reduce_total_length) =
-                get_2d_lengths<Rank, ReduceDims>(inLengths_);
+                get_2d_lengths<Rank, NumReduceDim>(inLengths_);
 
-            if constexpr(InvariantDims::Size() == 0)
+            if constexpr(NumInvariantDim == 0)
                 invariant_lowest_length = 1;
             else
-                invariant_lowest_length = inLengths_[InvariantDims::At(InvariantDims::Size() - 1)];
+                invariant_lowest_length = inLengths_[NumInvariantDim - 1];
 
-            reduce_lowest_length = inLengths_[ReduceDims::At(ReduceDims::Size() - 1)];
+            reduce_lowest_length = inLengths_[Rank - 1];
 
             gridSize = math::integer_least_multiple(invariant_total_length, M_BlockTileSize) /
                        M_BlockTileSize;
@@ -187,7 +193,7 @@ struct DeviceReduceThreadWise : public DeviceReduce
         std::vector<int> outStrides_;
 
         AccDataType alpha_;
-        OutDataType beta_;
+        AccDataType beta_;
 
         const InDataType* in_dev_;
         OutDataType* out_dev_;
@@ -278,18 +284,22 @@ struct DeviceReduceThreadWise : public DeviceReduce
 
         if constexpr(InSrcVectorDim == 0)
        {
-            if constexpr(InvariantDims::Size() == 0)
+            if constexpr(NumInvariantDim == 0)
+            {
                 return (false);
+            }
+            else
+            {
+                if(pArg->inStrides_[NumInvariantDim - 1] != 1)
+                    return (false);
 
-            if(pArg->inStrides_[InvariantDims::At(InvariantDims::Size() - 1)] != 1)
-                return (false);
-
-            if(pArg->invariant_lowest_length % InSrcVectorSize != 0)
-                return (false);
+                if(pArg->invariant_lowest_length % InSrcVectorSize != 0)
+                    return (false);
+            };
         }
         else
         {
-            if(pArg->inStrides_[ReduceDims::At(ReduceDims::Size() - 1)] != 1)
+            if(pArg->inStrides_[Rank - 1] != 1)
                 return (false);
 
             if(pArg->reduce_lowest_length % InSrcVectorSize != 0)
@@ -310,19 +320,19 @@ struct DeviceReduceThreadWise : public DeviceReduce
 
     std::unique_ptr<BaseArgument>
-    MakeArgumentPointer(const std::vector<int>& inLengths,
-                        const std::vector<int>& inStrides,
-                        const std::vector<int>& outLengths,
-                        const std::vector<int>& outStrides,
-                        const std::vector<int>& reduceDims,
+    MakeArgumentPointer(const std::vector<int> inLengths,
+                        const std::vector<int> inStrides,
+                        const std::vector<int> outLengths,
+                        const std::vector<int> outStrides,
+                        const std::vector<int> reduceDims,
                        float alpha,
                         float beta,
                         const void* in_dev,
                         void* out_dev,
                         void* out_indices_dev,
                         void* workspace_dev,
-                        const InElementwiseOperation& in_elementwise_op,
-                        const OutElementwiseOperation& acc_elementwise_op) override
+                        const InElementwiseOperation in_elementwise_op,
+                        const OutElementwiseOperation acc_elementwise_op) override
     {
         return std::make_unique<Argument>(inLengths,
                                           inStrides,
diff --git a/include/ck/tensor_operation/gpu/element/element_wise_operation.hpp b/include/ck/tensor_operation/gpu/element/element_wise_operation.hpp
index 2c45d1f544..fcc775e900 100644
--- a/include/ck/tensor_operation/gpu/element/element_wise_operation.hpp
+++ b/include/ck/tensor_operation/gpu/element/element_wise_operation.hpp
@@ -1,6 +1,5 @@
 #ifndef CK_ELEMENT_WISE_OPERATION_HPP
 #define CK_ELEMENT_WISE_OPERATION_HPP
-#include "data_type.hpp"
 #include "data_type.hpp"
 
@@ -19,6 +18,8 @@ struct PassThrough
     __host__ __device__ void operator()(int32_t& y, const int32_t& x) const { y = x; }
 
     __host__ __device__ void operator()(int8_t& y, const int8_t& x) const { y = x; }
+
+    __host__ __device__ void operator()(double& y, const double& x) const { y = x; }
 };
 
 struct Add
@@ -239,6 +240,24 @@ struct UnaryIdentic
     __host__ __device__ void operator()(int32_t& y, const int32_t& x) const { y = x; };
 };
 
+template <>
+struct UnaryIdentic<int32_t, int32_t, true>
+{
+    __host__ __device__ UnaryIdentic(const int32_t divider = 1) { divider_ = divider; };
+
+    __host__ __device__ void operator()(int32_t& y, const int32_t& x) const { y = x / divider_; };
+
+    int32_t divider_ = 1;
+};
+
+template <>
+struct UnaryIdentic<int8_t, int8_t>
+{
+    __host__ __device__ UnaryIdentic(const int8_t divider = 1) { (void)divider; };
+
+    __host__ __device__ void operator()(int8_t& y, const int8_t& x) const { y = x; };
+};
+
 template <typename Y, typename X, bool HasDividing = false>
 struct UnarySquare;
 
@@ -311,6 +330,19 @@ struct UnaryAbs
     __host__ __device__ void operator()(double& y, const double& x) const { y = abs(x); };
 };
 
+template <>
+struct UnaryAbs<int8_t, int8_t>
+{
+    __host__ __device__ UnaryAbs(const int32_t divider = 1) { (void)divider; };
+
+    __host__ __device__ void operator()(int8_t& y, const int8_t& x) const
+    {
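+        // branchless abs: sgn is 0 for non-negative x and -1 (all ones bits) for
+        // negative x, so (x ^ sgn) - sgn yields x or -x without a conditional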
+        int8_t sgn = x >> (8 - 1);
+
+        y = (x ^ sgn) - sgn;
+    };
+};
+
 template <typename Y, typename X>
 struct UnarySqrt;
diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_2d_reduction_blockwise.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_2d_reduction_blockwise.hpp
index d68a217434..14fe0818a5 100644
--- a/include/ck/tensor_operation/gpu/grid/gridwise_2d_reduction_blockwise.hpp
+++ b/include/ck/tensor_operation/gpu/grid/gridwise_2d_reduction_blockwise.hpp
@@ -33,6 +33,7 @@
 #include "reduction_functions_blockwise.hpp"
 #include "threadwise_tensor_slice_transfer.hpp"
 #include "cluster_descriptor.hpp"
+#include "element_wise_operation.hpp"
 
 namespace ck {
 
@@ -52,23 +53,25 @@ __global__ void kernel_reduce_blockwise(const InGridDesc_M_K in_grid_desc_m_k,
                                         const OutElementwiseOperation acc_elementwise_op,
                                         AccDataType alpha,
                                         const InDataType* const __restrict__ p_in_global,
-                                        OutDataType beta,
+                                        AccDataType beta,
                                         OutDataType* const __restrict__ p_out_global,
                                         const IndexDataType* const __restrict__ p_ws_indices_global,
                                         IndexDataType* const __restrict__ p_indices_global)
 {
     if constexpr(!NeedIndices)
     {
-        GridwiseReduction::Run(in_grid_desc_m_k,
-                               out_grid_desc_m,
-                               in_elementwise_op,
-                               acc_elementwise_op,
-                               alpha,
-                               p_in_global,
-                               beta,
-                               p_out_global,
-                               p_ws_indices_global,
-                               p_indices_global);
+        constexpr bool IsSecondCall = false;
+
+        GridwiseReduction::template Run<IsSecondCall>(in_grid_desc_m_k,
+                                                      out_grid_desc_m,
+                                                      in_elementwise_op,
+                                                      acc_elementwise_op,
+                                                      alpha,
+                                                      p_in_global,
+                                                      beta,
+                                                      p_out_global,
+                                                      p_ws_indices_global,
+                                                      p_indices_global);
     }
     else
     {
@@ -102,23 +105,25 @@ kernel_reduce_blockwise_second_call(const InGridDesc_M_K in_grid_desc_m_k,
                                     const OutElementwiseOperation acc_elementwise_op,
                                     AccDataType alpha,
                                     const InDataType* const __restrict__ p_in_global,
-                                    OutDataType beta,
+                                    AccDataType beta,
                                     OutDataType* const __restrict__ p_out_global,
                                     const IndexDataType* const __restrict__ p_ws_indices_global,
                                     IndexDataType* const __restrict__ p_indices_global)
 {
     if constexpr(!NeedIndices)
     {
-        GridwiseReduction::Run(in_grid_desc_m_k,
-                               out_grid_desc_m,
-                               in_elementwise_op,
-                               acc_elementwise_op,
-                               alpha,
-                               p_in_global,
-                               beta,
-                               p_out_global,
-                               p_ws_indices_global,
-                               p_indices_global);
+        constexpr bool IsSecondCall = true;
+
+        GridwiseReduction::template Run<IsSecondCall>(in_grid_desc_m_k,
+                                                      out_grid_desc_m,
+                                                      in_elementwise_op,
+                                                      acc_elementwise_op,
+                                                      alpha,
+                                                      p_in_global,
+                                                      beta,
+                                                      p_out_global,
+                                                      p_ws_indices_global,
+                                                      p_indices_global);
     }
     else
     {
@@ -156,6 +161,11 @@
 template
 struct GridwiseReduction_mk_to_m_blockwise
 {
+    static_assert(((InSrcVectorDim == 0 && MThreadSliceSize % InSrcVectorSize == 0) ||
+                   (InSrcVectorDim == 1 && KThreadSliceSize % InSrcVectorSize == 0)) &&
+                      (MThreadSliceSize % OutDstVectorSize == 0),
+                  "Invalid thread slice sizes and/or vector sizes configuration, please check!");
+
     static constexpr bool reorder_thread_cluster = (InSrcVectorDim == 0);
 
     using ThreadClusterLengths_M_K = Sequence<MThreadClusterSize, KThreadClusterSize>;
 
@@ -174,8 +184,7 @@ struct GridwiseReduction_mk_to_m_blockwise
     static constexpr auto block_buf_desc_m_k = make_naive_tensor_descriptor_packed(
         make_tuple(Number<MThreadClusterSize * MThreadSliceSize>{}, Number<KThreadClusterSize * KThreadSliceSize>{}));
 
-    template <typename T>
-    using PassThroughOp = tensor_operation::element_wise::UnaryIdentic<T, T>;
+    using PassThroughOp = tensor_operation::element_wise::PassThrough;
 
     static constexpr auto I0 = Number<0>{};
     static constexpr auto I1 = Number<1>{};
@@ -183,17 +192,24 @@ struct GridwiseReduction_mk_to_m_blockwise
     static constexpr index_t M_BlockTileSize = MThreadClusterSize * MThreadSliceSize;
     static constexpr index_t K_BlockTileSize = KThreadClusterSize * KThreadSliceSize;
 
+    template <bool IsSecondCall>
     __device__ static void Run(const InGridDesc_M_K& in_grid_desc_m_k,
                                const OutGridDesc_M& out_grid_desc_m,
                               const InElementwiseOperation& in_elementwise_op,
                                const OutElementwiseOperation& acc_elementwise_op,
                                AccDataType alpha,
                                const InDataType* const __restrict__ p_in_global,
-                               OutDataType beta,
+                               AccDataType beta,
                                OutDataType* const __restrict__ p_out_global,
                                const IndexDataType* const __restrict__ p_ws_indices_global,
                                IndexDataType* const __restrict__ p_indices_global)
     {
+        if constexpr(IsSecondCall)
+        {
+            static_assert(InSrcVectorDim == 1,
+                          "InSrcVectorDim must be 1 for BlockwiseSecondCall, please check!");
+        };
+
         using BlockwiseReduce = PartitionedBlockwiseReduction{}([&](auto I) {
-            accu_value_buf(I) += type_convert<AccDataType>(priorDstValueBuf[I] * beta);
+            accu_value_buf(I) += type_convert<AccDataType>(priorDstValueBuf[I]) * beta;
             });
         };
     };
@@ -355,7 +371,7 @@ struct GridwiseReduction_mk_to_m_blockwise
                                                OutDataType,
                                                decltype(reduced_data_desc),
                                                OutGridDesc_M,
-                                               PassThroughOp<OutDataType>,
+                                               PassThroughOp,
                                                Sequence<MThreadSliceSize>,
                                                Sequence<0>,
                                                0,
@@ -366,7 +382,7 @@ struct GridwiseReduction_mk_to_m_blockwise
                 out_grid_desc_m,
                 make_multi_index(block_global_1d_id * M_BlockTileSize +
                                  thread_m_cluster_id * MThreadSliceSize),
-                PassThroughOp<OutDataType>{});
+                PassThroughOp{});
 
        threadwise_dst_store.Run(
out_grid_desc_m, out_global_buf); @@ -379,7 +395,7 @@ struct GridwiseReduction_mk_to_m_blockwise const OutElementwiseOperation& acc_elementwise_op, AccDataType alpha, const InDataType* const __restrict__ p_in_global, - OutDataType beta, + AccDataType beta, OutDataType* const __restrict__ p_out_global, const IndexDataType* const __restrict__ p_ws_indices_global, IndexDataType* const __restrict__ p_indices_global) @@ -570,7 +586,7 @@ struct GridwiseReduction_mk_to_m_blockwise priorDstValueBuf); static_for<0, MThreadSliceSize, 1>{}([&](auto I) { - accu_value_buf(I) += type_convert(priorDstValueBuf[I] * beta); + accu_value_buf(I) += type_convert(priorDstValueBuf[I]) * beta; }); }; }; @@ -580,7 +596,7 @@ struct GridwiseReduction_mk_to_m_blockwise OutDataType, decltype(reduced_data_desc), OutGridDesc_M, - PassThroughOp, + PassThroughOp, Sequence, Sequence<0>, 0, @@ -591,14 +607,14 @@ struct GridwiseReduction_mk_to_m_blockwise out_grid_desc_m, make_multi_index(block_global_1d_id * M_BlockTileSize + thread_m_cluster_id * MThreadSliceSize), - PassThroughOp{}); + PassThroughOp{}); auto threadwise_dst_idx_store = ThreadwiseTensorSliceTransfer_v1r3, + PassThroughOp, Sequence, Sequence<0>, 0, @@ -609,7 +625,7 @@ struct GridwiseReduction_mk_to_m_blockwise out_grid_desc_m, make_multi_index(block_global_1d_id * M_BlockTileSize + thread_m_cluster_id * MThreadSliceSize), - PassThroughOp{}); + PassThroughOp{}); threadwise_dst_val_store.Run(reduced_data_desc, make_tuple(I0), @@ -631,11 +647,14 @@ struct GridwiseReduction_mk_to_m_blockwise const OutElementwiseOperation acc_elementwise_op, AccDataType alpha, const InDataType* const __restrict__ p_ws_values_global, - OutDataType beta, + AccDataType beta, OutDataType* const __restrict__ p_out_global, const IndexDataType* const __restrict__ p_ws_indices_global, IndexDataType* const __restrict__ p_indices_global) { + static_assert(InSrcVectorDim == 1, + "InSrcVectorDim must be 1 for BlockwiseSecondCall, please check!"); + using BlockwiseReduceWithIndex = PartitionedBlockwiseReductionWithIndex{}([&](auto I) { - accu_value_buf(I) += type_convert(priorDstValueBuf[I] * beta); + accu_value_buf(I) += type_convert(priorDstValueBuf[I]) * beta; }); }; }; @@ -851,7 +870,7 @@ struct GridwiseReduction_mk_to_m_blockwise OutDataType, decltype(reduced_data_desc), OutGridDesc_M, - PassThroughOp, + PassThroughOp, Sequence, Sequence<0>, 0, @@ -862,14 +881,14 @@ struct GridwiseReduction_mk_to_m_blockwise out_grid_desc_m, make_multi_index(block_global_1d_id * M_BlockTileSize + thread_m_cluster_id * MThreadSliceSize), - PassThroughOp{}); + PassThroughOp{}); auto threadwise_dst_idx_store = ThreadwiseTensorSliceTransfer_v1r3, + PassThroughOp, Sequence, Sequence<0>, 0, @@ -880,7 +899,7 @@ struct GridwiseReduction_mk_to_m_blockwise out_grid_desc_m, make_multi_index(block_global_1d_id * M_BlockTileSize + thread_m_cluster_id * MThreadSliceSize), - PassThroughOp{}); + PassThroughOp{}); threadwise_dst_val_store.Run(reduced_data_desc, make_tuple(I0), diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_2d_reduction_multiblock_atomic_add.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_2d_reduction_multiblock_atomic_add.hpp index 8527aee827..6a46135a33 100644 --- a/include/ck/tensor_operation/gpu/grid/gridwise_2d_reduction_multiblock_atomic_add.hpp +++ b/include/ck/tensor_operation/gpu/grid/gridwise_2d_reduction_multiblock_atomic_add.hpp @@ -32,6 +32,7 @@ #include "reduction_functions_blockwise.hpp" #include "threadwise_tensor_slice_transfer.hpp" +#include "element_wise_operation.hpp" 
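// ---- editorial aside (annotation, not part of the patch) ----
// "element_wise_operation.hpp" is included wherever PassThroughOp is
// re-aliased from UnaryIdentic to tensor_operation::element_wise::PassThrough.
// UnaryIdentic carries a divider parameter (the int32_t specialization added
// earlier in this patch even divides by it, presumably for AVG), so the old
// alias needed a type argument; PassThrough is stateless, which is why every
// "PassThroughOp{}" construction site in this patch loses its template
// argument. A minimal generic sketch of such a functor follows; the name is
// hypothetical, and the in-tree PassThrough instead enumerates per-type
// operator() overloads, as the element_wise_operation.hpp hunk above shows.
struct PassThroughSketch
{
    template <typename T>
    __host__ __device__ void operator()(T& y, const T& x) const
    {
        y = x; // identity: forward the value unchanged on store
    }
};
// The same header also gains a branchless UnaryAbs for int8_t: sgn = x >> 7
// is 0 for non-negative x and -1 (all ones) for negative x, so (x ^ sgn) - sgn
// either leaves x unchanged or flips all bits and adds one, i.e.
// two's-complement negation. Caveat: x = -128 has no positive int8_t
// counterpart and wraps back to -128.
// ---- end aside ----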
namespace ck { @@ -84,6 +85,11 @@ template struct GridwiseReduction_mk_to_m_multiblock_atomic_add { + static_assert(((InSrcVectorDim == 0 && MThreadSliceSize % InSrcVectorSize == 0) || + (InSrcVectorDim == 1 && KThreadSliceSize % InSrcVectorSize == 0)) && + (MThreadSliceSize % OutDstVectorSize == 0), + "Invalid thread slice sizes and/or vector sizes configuration, please check!"); + static constexpr bool reorder_thread_cluster = (InSrcVectorDim == 0); using ThreadClusterLengths_M_K = Sequence; @@ -109,8 +115,7 @@ struct GridwiseReduction_mk_to_m_multiblock_atomic_add ReduceOperation, PropagateNan>; - template - using PassThroughOp = tensor_operation::element_wise::UnaryIdentic; + using PassThroughOp = tensor_operation::element_wise::PassThrough; static constexpr auto I0 = Number<0>{}; static constexpr auto I1 = Number<1>{}; @@ -249,7 +254,7 @@ struct GridwiseReduction_mk_to_m_multiblock_atomic_add OutDataType, decltype(reduced_data_desc), OutGridDesc_M, - PassThroughOp, + PassThroughOp, Sequence, Sequence<0>, 0, @@ -260,7 +265,7 @@ struct GridwiseReduction_mk_to_m_multiblock_atomic_add out_grid_desc_m, make_multi_index(blkgroup_id * M_BlockTileSize + thread_m_cluster_id * MThreadSliceSize), - PassThroughOp{}); + PassThroughOp{}); threadwise_dst_store.Run( reduced_data_desc, make_tuple(I0), accu_value_buf, out_grid_desc_m, out_global_buf); diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_2d_reduction_multiblock_partial_reduce.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_2d_reduction_multiblock_partial_reduce.hpp index d47e4ed078..0c76794754 100644 --- a/include/ck/tensor_operation/gpu/grid/gridwise_2d_reduction_multiblock_partial_reduce.hpp +++ b/include/ck/tensor_operation/gpu/grid/gridwise_2d_reduction_multiblock_partial_reduce.hpp @@ -23,8 +23,8 @@ * SOFTWARE. 
* *******************************************************************************/ -#ifndef CK_GRIDWISE_2D_REDUCTION_MULTIBLOCK_TWO_CALL_HPP -#define CK_GRIDWISE_2D_REDUCTION_MULTIBLOCK_TWO_CALL_HPP +#ifndef CK_GRIDWISE_2D_REDUCTION_MULTIBLOCK_PARTIAL_REDUCE_HPP +#define CK_GRIDWISE_2D_REDUCTION_MULTIBLOCK_PARTIAL_REDUCE_HPP #include "reduction_common.hpp" #include "reduction_operator.hpp" @@ -32,6 +32,7 @@ #include "reduction_functions_blockwise.hpp" #include "threadwise_tensor_slice_transfer.hpp" #include "cluster_descriptor.hpp" +#include "element_wise_operation.hpp" namespace ck { @@ -101,6 +102,12 @@ template struct GridwiseReduction_mk_to_mk_multiblock_partial_reduce { + static_assert((InSrcVectorDim == 0 && MThreadSliceSize % InSrcVectorSize == 0) || + (InSrcVectorDim == 1 && KThreadSliceSize % InSrcVectorSize == 0), + "Invalid thread slice sizes and/or vector sizes configuration, please check!"); + + static_assert(OutDstVectorSize == 1, "OutDstVectorSize must be 1 for MultiBlockPartialReduce!"); + static constexpr bool reorder_thread_cluster = (InSrcVectorDim == 0); using ThreadClusterLengths_M_K = Sequence; @@ -119,8 +126,7 @@ struct GridwiseReduction_mk_to_mk_multiblock_partial_reduce static constexpr auto block_buf_desc_m_k = make_naive_tensor_descriptor_packed( make_tuple(Number{}, Number{})); - template - using PassThroughOp = tensor_operation::element_wise::UnaryIdentic; + using PassThroughOp = tensor_operation::element_wise::PassThrough; static constexpr auto I0 = Number<0>{}; static constexpr auto I1 = Number<1>{}; @@ -238,9 +244,6 @@ struct GridwiseReduction_mk_to_mk_multiblock_partial_reduce reducedTiles++; } while(reducedTiles < num_k_block_tile_iteration); - constexpr auto reduced_data_desc = make_naive_tensor_descriptor_packed( - make_tuple(Number{}, Number<1>{})); - // Each block executes multiple parallel reductions on the LDS, and due to the use of // vector_load, each block/thread is involved in multiple invariant dimensions.
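// ---- editorial aside (annotation, not part of the patch) ----
// The static_asserts added at the top of this struct spell out the tiling
// contract behind these loops: vector loads along InSrcVectorDim must evenly
// tile a thread's slice, i.e.
//   InSrcVectorDim == 0  ->  MThreadSliceSize % InSrcVectorSize == 0
//   InSrcVectorDim == 1  ->  KThreadSliceSize % InSrcVectorSize == 0
// For example, KThreadSliceSize = 8 with InSrcVectorSize = 4 means exactly two
// 4-wide loads per slice, while InSrcVectorSize = 3 is rejected at compile
// time. OutDstVectorSize must be 1 for MultiBlockPartialReduce because each
// block stores a (MThreadSliceSize x 1) column of partial results into the
// workspace at its own block_local_id, leaving nothing contiguous to
// vectorize on the store side. Moving reduced_data_desc below the reduction
// loop (next hunk) is a pure reordering: it is constexpr and is only needed
// by the workspace store.
// ---- end aside ----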
static_for<0, MThreadSliceSize, 1>{}([&](auto I) { @@ -254,6 +257,9 @@ struct GridwiseReduction_mk_to_mk_multiblock_partial_reduce BlockwiseReduce::Reduce(block_reduce_buf, accu_value_buf(I)); }); + constexpr auto reduced_data_desc = make_naive_tensor_descriptor_packed( + make_tuple(Number{}, Number<1>{})); + if(thread_k_cluster_id == 0) { auto threadwise_workspace_store = @@ -261,7 +267,7 @@ struct GridwiseReduction_mk_to_mk_multiblock_partial_reduce AccDataType, decltype(reduced_data_desc), WorkspaceDesc_M_K, - PassThroughOp, + PassThroughOp, Sequence, Sequence<0, 1>, 1, @@ -273,7 +279,7 @@ struct GridwiseReduction_mk_to_mk_multiblock_partial_reduce make_multi_index(blkgroup_id * M_BlockTileSize + thread_m_cluster_id * MThreadSliceSize, block_local_id), - PassThroughOp{}); + PassThroughOp{}); threadwise_workspace_store.Run(reduced_data_desc, make_tuple(I0, I0), @@ -450,7 +456,7 @@ struct GridwiseReduction_mk_to_mk_multiblock_partial_reduce AccDataType, decltype(reduced_data_desc), WorkspaceDesc_M_K, - PassThroughOp, + PassThroughOp, Sequence, Sequence<0, 1>, 1, @@ -462,14 +468,14 @@ struct GridwiseReduction_mk_to_mk_multiblock_partial_reduce make_multi_index(blkgroup_id * M_BlockTileSize + thread_m_cluster_id * MThreadSliceSize, block_local_id), - PassThroughOp{}); + PassThroughOp{}); auto threadwise_workspace_idx_store = ThreadwiseTensorSliceTransfer_v1r3, + PassThroughOp, Sequence, Sequence<0, 1>, 1, @@ -481,7 +487,7 @@ struct GridwiseReduction_mk_to_mk_multiblock_partial_reduce make_multi_index(blkgroup_id * M_BlockTileSize + thread_m_cluster_id * MThreadSliceSize, block_local_id), - PassThroughOp{}); + PassThroughOp{}); threadwise_workspace_val_store.Run(reduced_data_desc, make_tuple(I0, I0), diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_2d_reduction_threadwise.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_2d_reduction_threadwise.hpp index 3afa99c470..86caea2a92 100644 --- a/include/ck/tensor_operation/gpu/grid/gridwise_2d_reduction_threadwise.hpp +++ b/include/ck/tensor_operation/gpu/grid/gridwise_2d_reduction_threadwise.hpp @@ -31,6 +31,7 @@ #include "reduction_operator.hpp" #include "reduction_functions_accumulate.hpp" #include "threadwise_tensor_slice_transfer.hpp" +#include "element_wise_operation.hpp" namespace ck { @@ -50,7 +51,7 @@ __global__ void kernel_reduce_threadwise(const InGridDesc_M_K in_grid_desc_m_k, const AccElementwiseOperation acc_elementwise_op, AccDataType alpha, const InDataType* const __restrict__ p_in_global, - OutDataType beta, + AccDataType beta, OutDataType* const __restrict__ p_out_global, IndexDataType* const __restrict__ p_indices_global) { @@ -101,11 +102,15 @@ template struct GridwiseReduction_mk_to_m_threadwise { + static_assert(((InSrcVectorDim == 0 && MThreadSliceSize % InSrcVectorSize == 0) || + (InSrcVectorDim == 1 && KThreadSliceSize % InSrcVectorSize == 0)) && + (MThreadSliceSize % OutDstVectorSize == 0), + "Invalid thread slice sizes and/or vector sizes configuration, please check!"); + using ThreadBufferDimAccessOrder = typename conditional, Sequence<0, 1>>::type; - template - using PassThroughOp = tensor_operation::element_wise::UnaryIdentic; + using PassThroughOp = tensor_operation::element_wise::PassThrough; static constexpr auto I0 = Number<0>{}; @@ -115,7 +120,7 @@ struct GridwiseReduction_mk_to_m_threadwise const AccElementwiseOperation& acc_elementwise_op, AccDataType alpha, const InDataType* const __restrict__ p_in_global, - OutDataType beta, + AccDataType beta, OutDataType* const __restrict__ p_out_global, 
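// ---- editorial aside (annotation, not part of the patch) ----
// beta is retyped from OutDataType to AccDataType across all of these kernels
// so the blend with the prior output happens in accumulator precision. The
// matching fix in the hunks below changes
//   accu_value_buf(I) += type_convert(priorDstValue_buf[I] * beta);   // old
// to
//   accu_value_buf(I) += type_convert(priorDstValue_buf[I]) * beta;   // new
// (type_convert's target type, presumably AccDataType, was lost in
// extraction): convert the prior value up first, then multiply, so that with
// e.g. OutDataType = half_t and AccDataType = float the product is not
// rounded to half precision before it reaches the accumulator.
// ---- end aside ----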
IndexDataType* const __restrict__ p_indices_global) { @@ -228,7 +233,7 @@ struct GridwiseReduction_mk_to_m_threadwise priorDstValue_buf); static_for<0, MThreadSliceSize, 1>{}([&](auto I) { - accu_value_buf(I) += type_convert(priorDstValue_buf[I] * beta); + accu_value_buf(I) += type_convert(priorDstValue_buf[I]) * beta; }); }; }; @@ -238,7 +243,7 @@ struct GridwiseReduction_mk_to_m_threadwise OutDataType, decltype(reduced_data_desc), OutGridDesc_M, - PassThroughOp, + PassThroughOp, Sequence, Sequence<0>, 0, @@ -248,7 +253,7 @@ struct GridwiseReduction_mk_to_m_threadwise false>( out_grid_desc_m, make_multi_index(thread_global_1d_id * MThreadSliceSize), - PassThroughOp{}); + PassThroughOp{}); threadwise_dst_store.Run( reduced_data_desc, make_tuple(I0), accu_value_buf, out_grid_desc_m, dst_global_buf); @@ -260,7 +265,7 @@ struct GridwiseReduction_mk_to_m_threadwise const AccElementwiseOperation& acc_elementwise_op, AccDataType alpha, const InDataType* const __restrict__ p_in_global, - OutDataType beta, + AccDataType beta, OutDataType* const __restrict__ p_out_global, IndexDataType* const __restrict__ p_indices_global) { @@ -387,7 +392,7 @@ struct GridwiseReduction_mk_to_m_threadwise priorDstValue_buf); static_for<0, MThreadSliceSize, 1>{}([&](auto I) { - accu_value_buf(I) += type_convert(priorDstValue_buf[I] * beta); + accu_value_buf(I) += type_convert(priorDstValue_buf[I]) * beta; }); }; }; @@ -397,7 +402,7 @@ struct GridwiseReduction_mk_to_m_threadwise OutDataType, decltype(reduced_data_desc), OutGridDesc_M, - PassThroughOp, + PassThroughOp, Sequence, Sequence<0>, 0, @@ -407,14 +412,14 @@ struct GridwiseReduction_mk_to_m_threadwise false>( out_grid_desc_m, make_multi_index(thread_global_1d_id * MThreadSliceSize), - PassThroughOp{}); + PassThroughOp{}); auto threadwise_dst_idx_store = ThreadwiseTensorSliceTransfer_v1r3, + PassThroughOp, Sequence, Sequence<0>, 0, @@ -424,7 +429,7 @@ struct GridwiseReduction_mk_to_m_threadwise false>( out_grid_desc_m, make_multi_index(thread_global_1d_id * MThreadSliceSize), - PassThroughOp{}); + PassThroughOp{}); threadwise_dst_val_store.Run( reduced_data_desc, make_tuple(I0), accu_value_buf, out_grid_desc_m, out_global_val_buf); diff --git a/include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer.hpp b/include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer.hpp index 524da47e24..2ce64a9840 100644 --- a/include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer.hpp +++ b/include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer.hpp @@ -79,6 +79,8 @@ struct ThreadwiseTensorSliceTransfer_v1r3 { static_assert(SrcDesc::IsKnownAtCompileTime(), "wrong! SrcDesc need to known at compile-time"); + static_assert(SliceLengths::At(Number{}) % DstScalarPerVector == 0, + "wrong! Not divisible"); } __device__ void SetDstSliceOrigin(const DstDesc& dst_desc, const Index& dst_slice_origin_idx) @@ -250,6 +252,8 @@ struct ThreadwiseTensorSliceTransfer_v2 { static_assert(DstDesc::IsKnownAtCompileTime(), "wrong! SrcDesc need to known at compile-time"); + static_assert(SliceLengths::At(Number{}) % SrcScalarPerVector == 0, + "wrong! 
Not divisible"); } __device__ void SetSrcSliceOrigin(const SrcDesc& src_desc, const Index& src_slice_origin_idx) @@ -313,7 +317,8 @@ struct ThreadwiseTensorSliceTransfer_v2 dst_desc.CalculateOffset(to_multi_index(dst_slice_origin_idx) + src_data_idx + i * src_scalar_step_in_vector); - dst_buf(Number{}) = src_vector.template AsType()[i]; + dst_buf(Number{}) = + type_convert(src_vector.template AsType()[i]); }); if constexpr(idx_1d.value != num_access - 1) @@ -439,6 +444,10 @@ struct ThreadwiseTensorSliceTransfer_v3 : src_coord_(make_tensor_coordinate(src_desc, src_slice_origin)), dst_coord_(make_tensor_coordinate(dst_desc, dst_slice_origin)) { + static_assert(SliceLengths::At(Number{}) % SrcScalarPerVector == 0, + "wrong! Not divisible"); + static_assert(SliceLengths::At(Number{}) % DstScalarPerVector == 0, + "wrong! Not divisible"); } __device__ void SetSrcSliceOrigin(const SrcDesc& src_desc, const Index& src_slice_origin_idx) @@ -1016,7 +1025,8 @@ struct ThreadwiseTensorSliceTransfer_v4 static_assert(SrcDesc::IsKnownAtCompileTime() && DstDesc::IsKnownAtCompileTime(), "wrong! SrcDesc and DstDesc need to known at compile-time"); - static_assert(SliceLengths::At(Number{}) % SrcScalarPerVector == 0, "wrong!"); + static_assert(SliceLengths::At(Number{}) % SrcScalarPerVector == 0, + "wrong! Not divisible"); } template ::type src } else if constexpr(N == 2) { - llvm_amdgcn_raw_buffer_store_fp16x2(src_thread_data, - dst_wave_buffer_resource, - dst_thread_addr_offset, - dst_wave_addr_offset, - 0); + llvm_amdgcn_raw_buffer_store_i16x2(src_thread_data, + dst_wave_buffer_resource, + dst_thread_addr_offset, + dst_wave_addr_offset, + 0); } else if constexpr(N == 4) { - llvm_amdgcn_raw_buffer_store_fp16x4(src_thread_data, - dst_wave_buffer_resource, - dst_thread_addr_offset, - dst_wave_addr_offset, - 0); + llvm_amdgcn_raw_buffer_store_i16x4(src_thread_data, + dst_wave_buffer_resource, + dst_thread_addr_offset, + dst_wave_addr_offset, + 0); } else if constexpr(N == 8) { diff --git a/include/ck/utility/sequence.hpp b/include/ck/utility/sequence.hpp index b35999d56f..c2adfc5063 100644 --- a/include/ck/utility/sequence.hpp +++ b/include/ck/utility/sequence.hpp @@ -606,6 +606,12 @@ struct sequence_map_inverse SeqMap::Size()>::type; }; +template +__host__ __device__ constexpr bool operator==(Sequence, Sequence) +{ + return ((Xs == Ys) && ...); +} + template __host__ __device__ constexpr auto operator+(Sequence, Sequence) { diff --git a/include/ck/utility/tensor_space_filling_curve.hpp b/include/ck/utility/tensor_space_filling_curve.hpp index c5cbe461f0..62b68559bf 100644 --- a/include/ck/utility/tensor_space_filling_curve.hpp +++ b/include/ck/utility/tensor_space_filling_curve.hpp @@ -37,6 +37,10 @@ struct SpaceFillingCurve __host__ __device__ static constexpr index_t GetNumOfAccess() { + static_assert(TensorLengths::Size() == ScalarsPerAccess::Size()); + static_assert(TensorLengths{} % ScalarsPerAccess{} == + typename uniform_sequence_gen::type{}); + return reduce_on_sequence(TensorLengths{}, math::multiplies{}, Number<1>{}) / ScalarPerVector; } diff --git a/library/include/ck/library/host_tensor/host_generic_reduction.hpp b/library/include/ck/library/host_tensor/host_generic_reduction.hpp deleted file mode 100644 index d10184aaf6..0000000000 --- a/library/include/ck/library/host_tensor/host_generic_reduction.hpp +++ /dev/null @@ -1,424 +0,0 @@ - -/******************************************************************************* - * - * MIT License - * - * Copyright (c) 2020 Advanced Micro Devices, Inc. 
- * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - * - *******************************************************************************/ -#ifndef HOST_GENERIC_REDUCTION_HPP_ -#define HOST_GENERIC_REDUCTION_HPP_ - -#include -#include -#include -#include -#include -#include - -#include "reduction_enums.hpp" -#include "host_reduce_util.hpp" - -using float16 = half_float::half; - -namespace ck { - -namespace host_reduce { - -template -static void -get_all_indexes(const std::vector& dimLengths, int dim, std::vector>& indexes) -{ - if(dim < dimLengths.size()) - { - std::vector> updated_indexes; - - if(dim == 0) - { - assert(indexes.size() == 0); - assert(dimLengths[dim] > 0); - for(T i = 0; i < dimLengths[dim]; i++) - { - std::vector index = {i}; - - updated_indexes.push_back(index); - }; - } - else - { - // go through all the current indexes - for(const auto& index : indexes) - for(T i = 0; i < dimLengths[dim]; i++) - { - auto index_new = index; - index_new.push_back(i); - - updated_indexes.push_back(index_new); - }; - }; - - // update to the indexes (output) - indexes = updated_indexes; - - // further to construct the indexes from the updated status - get_all_indexes(dimLengths, dim + 1, indexes); - }; -}; - -template -static T get_offset_from_index(const std::vector& strides, const std::vector& index) -{ - T offset = 0; - - assert(strides.size() == index.size()); - - for(int i = 0; i < index.size(); i++) - offset += strides[i] * static_cast(index[i]); - - return (offset); -}; - -template -static inline T get_flatten_offset(const std::vector& lengths, const std::vector& index) -{ - T offset = 0; - - assert(lengths.size() == index.size() && lengths.size() > 0); - - int len = lengths.size(); - T stride = 1; - - // for len==1, the loop is not executed - for(int i = len - 1; i > 0; i--) - { - offset += stride * static_cast(index[i]); - - stride *= lengths[i]; - }; - - offset += stride * static_cast(index[0]); - - return (offset); -}; - -template -class ReductionHost -{ - public: - ReductionHost() = default; - ReductionHost(HostTensorDescriptor& inDesc, - HostTensorDescriptor& outDesc, - const std::vector& invariantDims_, - const std::vector& toReduceDims_) - { - this->inLengths = to_int_vector(inDesc.GetLengths()); - this->outLengths = to_int_vector(outDesc.GetLengths()); - this->inStrides = to_int_vector(inDesc.GetStrides()); - this->outStrides = to_int_vector(outDesc.GetStrides()); - - this->invariantDims = invariantDims_; - this->toReduceDims = toReduceDims_; - - 
assert(this->inLengths.size() == this->outLengths.size()); - assert(!this->toReduceDims.empty()); - - for(const auto dim : this->invariantDims) - this->invariantLengths.push_back(this->inLengths[dim]); - - for(const auto dim : this->toReduceDims) - toReduceLengths.push_back(this->inLengths[dim]); - - this->reduceAllDims = this->invariantDims.empty(); - }; - - ~ReductionHost(){}; - - void - Run(float alpha, const InDataType* in_data, float beta, OutDataType* out_data, int* indices) - { - if constexpr(NeedIndices) - RunImpl_with_indices(alpha, in_data, beta, out_data, indices); - else - RunImpl_no_indices(alpha, in_data, beta, out_data); - }; - - private: - std::vector inLengths; - std::vector outLengths; - std::vector inStrides; - std::vector outStrides; - - std::vector invariantLengths; - std::vector toReduceLengths; - - std::vector invariantDims; - std::vector toReduceDims; - - bool reduceAllDims; - - void RunImpl_with_indices( - float alpha, const InDataType* in_data, float beta, OutDataType* out_data, int* indices) - { - using ck::host_reduce::binop_with_nan_check; - using ck::host_reduce::binop_with_nan_check2; - using ck::host_reduce::float_equal_one; - using ck::host_reduce::float_equal_zero; - using ck::host_reduce::PosUnaryOpFn; - using ck::host_reduce::PreUnaryOpFn; - using ck::host_reduce::ReduceOpFn2; - using ck::host_reduce::ReduceOpZeroVal; - - auto opReduce = ReduceOpFn2(); - - int divider = 1; - for(int i = 0; i < toReduceLengths.size(); i++) - divider *= toReduceLengths[i]; - - auto PreUnaryOp = PreUnaryOpFn(divider); - auto PosUnaryOp = PosUnaryOpFn(divider); - - if(reduceAllDims) - { - std::vector> indexes_1; - - get_all_indexes(inLengths, 0, indexes_1); // generate the input indexes space - - auto accuVal = ReduceOpZeroVal(); - int accuIndex = 0; - - // go through indexes of the invariant dimensions - for(const auto& src_index : indexes_1) - { - auto src_offset = get_offset_from_index(this->inStrides, src_index); - - auto currVal = static_cast(in_data[src_offset]); - - // unary operation before reducing, needed by AMAX. 
For MIN/MAX, nothing is actually - // done - PreUnaryOp(currVal); - - auto currIndex = get_flatten_offset(inLengths, src_index); - binop_with_nan_check2( - opReduce, accuVal, currVal, accuIndex, currIndex); - }; - - // scale the accumulated value - if(!float_equal_one(alpha)) - accuVal *= static_cast(alpha); - - // scale the prior dst value and add it to the accumulated value - if(!float_equal_zero(beta)) - accuVal += static_cast(out_data[0]) * static_cast(beta); - - // store the reduced value to dst location - out_data[0] = static_cast(accuVal); - indices[0] = accuIndex; - } - else - { - std::vector> indexes_1, indexes_2; - - get_all_indexes( - this->invariantLengths, 0, indexes_1); // generate the invariant indexes space - get_all_indexes( - this->toReduceLengths, 0, indexes_2); // generate the toReduce indexes space - - // go through indexes of the invariant dimensions - for(const auto& index_1 : indexes_1) - { - std::vector src_index; - std::vector dst_index; - - src_index.resize(this->inLengths.size()); - - // generate the part of src index belonging to invariant dims - for(int k = 0; k < invariantDims.size(); k++) - src_index[invariantDims[k]] = index_1[k]; - - for(int k = 0; k < invariantDims.size(); k++) - dst_index.push_back(index_1[k]); - - int dst_offset = get_offset_from_index(this->outStrides, dst_index); - - AccDataType accuVal = ReduceOpZeroVal(); - int accuIndex = 0; - - // go through indexes of the toReduce dimensions - for(const auto& index_2 : indexes_2) - { - // generate the part of src index belonging to toReduce dims - for(int k = 0; k < toReduceDims.size(); k++) - src_index[toReduceDims[k]] = index_2[k]; - - auto src_offset = get_offset_from_index(this->inStrides, src_index); - - auto currVal = static_cast(in_data[src_offset]); - // unary operation before reducing, needed by AMAX. 
For MIN/MAX, nothing is - // actually done - PreUnaryOp(currVal); - - auto currIndex = get_flatten_offset(toReduceLengths, index_2); - binop_with_nan_check2( - opReduce, accuVal, currVal, accuIndex, currIndex); - }; - - // scale the accumulated value - if(!float_equal_one(alpha)) - accuVal *= static_cast(alpha); - - // scale the prior dst value and add it to the accumulated value - if(!float_equal_zero(beta)) - accuVal += static_cast(out_data[dst_offset]) * - static_cast(beta); - - // store the reduced value to dst location - out_data[dst_offset] = static_cast(accuVal); - indices[dst_offset] = accuIndex; - }; - }; - }; // end of RunImpl_with_indices() - - void - RunImpl_no_indices(float alpha, const InDataType* in_data, float beta, OutDataType* out_data) - { - using ck::host_reduce::binop_with_nan_check; - using ck::host_reduce::binop_with_nan_check2; - using ck::host_reduce::float_equal_one; - using ck::host_reduce::float_equal_zero; - using ck::host_reduce::PosUnaryOpFn; - using ck::host_reduce::PreUnaryOpFn; - using ck::host_reduce::ReduceOpFn; - using ck::host_reduce::ReduceOpZeroVal; - - auto opReduce = ReduceOpFn(); - - int divider = 1; - for(int i = 0; i < toReduceLengths.size(); i++) - divider *= toReduceLengths[i]; - - auto PreUnaryOp = PreUnaryOpFn(divider); - auto PosUnaryOp = PosUnaryOpFn(divider); - - if(reduceAllDims) - { - std::vector> indexes_1; - - get_all_indexes(inLengths, 0, indexes_1); // generate the input indexes space - - auto accuVal = ReduceOpZeroVal(); - - // go through indexes of the invariant dimensions - for(const auto& src_index : indexes_1) - { - auto src_offset = get_offset_from_index(this->inStrides, src_index); - - auto currVal = static_cast(in_data[src_offset]); - - PreUnaryOp(currVal); - - binop_with_nan_check(opReduce, accuVal, currVal); - }; - - PosUnaryOp(accuVal); - - // scale the accumulated value - if(!float_equal_one(alpha)) - accuVal *= static_cast(alpha); - - // scale the prior dst value and add it to the accumulated value - if(!float_equal_zero(beta)) - accuVal += static_cast(out_data[0]) * static_cast(beta); - - // store the reduced value to dst location - out_data[0] = static_cast(accuVal); - } - else - { - std::vector> indexes_1, indexes_2; - - get_all_indexes( - this->invariantLengths, 0, indexes_1); // generate the invariant indexes space - get_all_indexes( - this->toReduceLengths, 0, indexes_2); // generate the toReduce indexes space - - // go through indexes of the invariant dimensions - for(const auto& index_1 : indexes_1) - { - std::vector src_index; - std::vector dst_index; - - src_index.resize(this->inLengths.size()); - - for(int k = 0; k < invariantDims.size(); k++) - dst_index.push_back(index_1[k]); - - int dst_offset = get_offset_from_index(this->outStrides, dst_index); - - // generate the part of src index belonging to invariant dims - for(int k = 0; k < invariantDims.size(); k++) - src_index[invariantDims[k]] = index_1[k]; - - AccDataType accuVal = ReduceOpZeroVal(); - - // go through indexes of the toReduce dimensions - for(const auto& index_2 : indexes_2) - { - // generate the part of src index belonging to toReduce dims - for(int k = 0; k < toReduceDims.size(); k++) - src_index[toReduceDims[k]] = index_2[k]; - - auto src_offset = get_offset_from_index(this->inStrides, src_index); - - auto currVal = static_cast(in_data[src_offset]); - - PreUnaryOp(currVal); - - binop_with_nan_check(opReduce, accuVal, currVal); - }; - - PosUnaryOp(accuVal); - - // scale the accumulated value - if(!float_equal_one(alpha)) - accuVal *= 
static_cast(alpha); - - // scale the prior dst value and add it to the accumulated value - if(!float_equal_zero(beta)) - accuVal += static_cast(out_data[dst_offset]) * - static_cast(beta); - - // store the reduced value to dst location - out_data[dst_offset] = static_cast(accuVal); - }; - }; - }; // end of RunImpl_no_indices() -}; - -}; // end of namespace host_reduce - -}; // end of namespace ck - -#endif diff --git a/library/include/ck/library/host_tensor/host_reduce_util.hpp b/library/include/ck/library/host_tensor/host_reduce_util.hpp index a176962bb1..f5e01ccc94 100644 --- a/library/include/ck/library/host_tensor/host_reduce_util.hpp +++ b/library/include/ck/library/host_tensor/host_reduce_util.hpp @@ -66,22 +66,22 @@ static inline bool float_equal_zero(half_float::half x) return x == static_cast(0.0f); }; -template -__host__ static inline std::function PreUnaryOpFn(int) +template +__host__ static inline std::function PreUnaryOpFn(int) { using std::abs; if constexpr(ReduceOpId == ReduceTensorOp_t::NORM1) { - return ([&](compType& a_) { a_ = abs(a_); }); + return ([&](AccDataType& a_) { a_ = abs(a_); }); } else if constexpr(ReduceOpId == ReduceTensorOp_t::NORM2) { - return ([&](compType& a_) { a_ = a_ * a_; }); + return ([&](AccDataType& a_) { a_ = a_ * a_; }); } else if constexpr(ReduceOpId == ReduceTensorOp_t::AMAX) { - return ([&](compType& a_) { a_ = abs(a_); }); + return ([&](AccDataType& a_) { a_ = abs(a_); }); } else { @@ -90,23 +90,23 @@ __host__ static inline std::function PreUnaryOpFn(int) // ReduceTensorOp_t::MUL: // ReduceTensorOp_t::MIN: // ReduceTensorOp_t::MAX: - return ([&](compType&) {}); + return ([&](AccDataType&) {}); }; }; -template -__host__ static inline std::function PosUnaryOpFn(int divider) +template +__host__ static inline std::function PosUnaryOpFn(int32_t divider) { using std::sqrt; if constexpr(ReduceOpId == ReduceTensorOp_t::NORM2) { - return ([&](compType& a_) { a_ = sqrt(a_); }); + return ([&](AccDataType& a_) { a_ = sqrt(a_); }); } else if constexpr(ReduceOpId == ReduceTensorOp_t::AVG) { - return ([&, divider](compType& a_) { - a_ = a_ / static_cast(static_cast(divider)); + return ([&, divider](AccDataType& a_) { + a_ = a_ / static_cast(static_cast(divider)); }); } else @@ -117,44 +117,44 @@ __host__ static inline std::function PosUnaryOpFn(int divider) // ReduceTensorOp_t::MIN: // ReduceTensorOp_t::MAX: // ReduceTensorOp_t::AMAX: - return ([&](compType&) {}); + return ([&](AccDataType&) {}); } }; -template -__host__ static inline std::function ReduceOpFn() +template +__host__ static inline std::function ReduceOpFn() { if constexpr(ReduceOpId == ReduceTensorOp_t::ADD || ReduceOpId == ReduceTensorOp_t::AVG || ReduceOpId == ReduceTensorOp_t::NORM1 || ReduceOpId == ReduceTensorOp_t::NORM2) { - return ([&](compType& a_, compType b_) { a_ = a_ + b_; }); + return ([&](AccDataType& a_, AccDataType b_) { a_ = a_ + b_; }); } else if constexpr(ReduceOpId == ReduceTensorOp_t::MUL) { - return ([&](compType& a_, compType b_) { a_ = a_ * b_; }); + return ([&](AccDataType& a_, AccDataType b_) { a_ = a_ * b_; }); } else if constexpr(ReduceOpId == ReduceTensorOp_t::MIN) { - return ([&](compType& a_, compType b_) { + return ([&](AccDataType& a_, AccDataType b_) { if(a_ > b_) a_ = b_; }); } else if constexpr(ReduceOpId == ReduceTensorOp_t::MAX || ReduceOpId == ReduceTensorOp_t::AMAX) { - return ([&](compType& a_, compType b_) { + return ([&](AccDataType& a_, AccDataType b_) { if(a_ < b_) a_ = b_; }); } }; -template -__host__ static inline std::function ReduceOpFn2() 
+template +__host__ static inline std::function ReduceOpFn2() { if constexpr(ReduceOpId == ReduceTensorOp_t::MIN) { - return ([&](compType& a_, compType b_, bool& changed) { + return ([&](AccDataType& a_, AccDataType b_, bool& changed) { if(a_ > b_) { a_ = b_; @@ -166,7 +166,7 @@ __host__ static inline std::function R } else if constexpr(ReduceOpId == ReduceTensorOp_t::MAX || ReduceOpId == ReduceTensorOp_t::AMAX) { - return ([&](compType& a_, compType b_, bool& changed) { + return ([&](AccDataType& a_, AccDataType b_, bool& changed) { if(a_ < b_) { a_ = b_; @@ -183,28 +183,28 @@ __host__ static inline std::function R // ReduceTensorOp_t::AVG: // ReduceTensorOp_t::NORM1: // ReduceTensorOp_t::NORM2: - return (std::function{}); + return (std::function{}); }; }; -template -__host__ static inline compType ReduceOpZeroVal() +template +__host__ static inline AccDataType ReduceOpZeroVal() { if constexpr(ReduceOpId == ReduceTensorOp_t::MUL) { - return (static_cast(1.0f)); + return (static_cast(1.0f)); } else if constexpr(ReduceOpId == ReduceTensorOp_t::MIN) { - return (std::numeric_limits::max()); + return (std::numeric_limits::max()); } else if constexpr(ReduceOpId == ReduceTensorOp_t::MAX) { - return (std::numeric_limits::lowest()); + return (std::numeric_limits::lowest()); } else if constexpr(ReduceOpId == ReduceTensorOp_t::AMAX) { - return (static_cast(0.0f)); + return (static_cast(0.0f)); } else { @@ -212,14 +212,15 @@ __host__ static inline compType ReduceOpZeroVal() // ReduceTensorOp_t::AVG // ReduceTensorOp_t::NORM1 // ReduceTensorOp_t::NORM2 - return (static_cast(0.0f)); + return (static_cast(0.0f)); }; }; -template -__host__ static inline void binop_with_nan_check(std::function opReduce, - compType& accuVal, - compType currVal) +template +__host__ static inline void +binop_with_nan_check(std::function opReduce, + AccDataType& accuVal, + AccDataType currVal) { using std::isnan; @@ -236,11 +237,11 @@ __host__ static inline void binop_with_nan_check(std::function +template __host__ static inline void -binop_with_nan_check2(std::function opReduce, - compType& accuVal, - compType currVal, +binop_with_nan_check2(std::function opReduce, + AccDataType& accuVal, + AccDataType currVal, int& accuIndex, int currIndex) { diff --git a/library/include/ck/library/host_tensor/host_reduction.hpp b/library/include/ck/library/host_tensor/host_reduction.hpp new file mode 100644 index 0000000000..fe9fba6121 --- /dev/null +++ b/library/include/ck/library/host_tensor/host_reduction.hpp @@ -0,0 +1,402 @@ + +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2020 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + *******************************************************************************/ +#ifndef HOST_REDUCTION_HPP_ +#define HOST_REDUCTION_HPP_ + +#include +#include +#include + +#include "reduction_enums.hpp" +#include "host_reduce_util.hpp" +#include "host_tensor.hpp" +#include "data_type.hpp" + +template +static void get_all_indexes(const std::array& dimLengths, + std::vector>& indexes) +{ + static_assert(NDim >= 1, "NDim >= 1 is required to use this function!"); + + if constexpr(NDim == 1) + { + for(size_t i = 0; i < dimLengths[0]; i++) + { + std::array index{i}; + + indexes.push_back(index); + }; + } + else + { + std::array partial_dim_lengths; + + for(int i = 0; i < NDim - 1; i++) + partial_dim_lengths[i] = dimLengths[i + 1]; + + std::vector> partial_indexes; + + get_all_indexes(partial_dim_lengths, partial_indexes); + + for(size_t i = 0; i < dimLengths[0]; i++) + for(const auto& index : partial_indexes) + { + std::array extIndex; + + extIndex[0] = i; + + for(int k = 0; k < NDim - 1; k++) + extIndex[k + 1] = index[k]; + + indexes.push_back(extIndex); + }; + }; +}; + +template +static size_t get_offset_from_index(const std::array& strides, + const std::array& index) +{ + size_t offset = 0; + + for(int i = 0; i < NDim; i++) + offset += strides[i] * index[i]; + + return (offset); +}; + +template +static size_t get_offset_from_index(const std::vector& strides, + const std::array& index) +{ + size_t offset = 0; + + for(int i = 0; i < NDim; i++) + offset += strides[i] * index[i]; + + return (offset); +}; + +template +struct ReductionHost +{ + using IndexDataType = int32_t; + + static constexpr int NumInvariantDim = Rank - NumReduceDim; + + std::vector outStrides; + std::vector invariantDims; + std::vector reduceDims; + + IndexDataType divider; + std::function preUnaryOp; + std::function posUnaryOp; + std::array reduceLengths; + std::array reduceStrides; + std::array invariantLengths; + std::array invariantStrides; + + std::vector> reduce_dim_indexes; + std::vector> invariant_dim_indexes; + + ReductionHost(HostTensorDescriptor& inDesc, + HostTensorDescriptor& outDesc, + const std::vector& invariantDims_, + const std::vector& reduceDims_) + { + using ck::host_reduce::PosUnaryOpFn; + using ck::host_reduce::PreUnaryOpFn; + + // this->outLengths = to_int_vector(outDesc.GetLengths()); + this->outStrides = outDesc.GetStrides(); + + this->invariantDims = invariantDims_; + this->reduceDims = reduceDims_; + + int product = 1; + + for(int i = 0; i < NumReduceDim; i++) + { + reduceLengths[i] = inDesc.GetLengths()[reduceDims[i]]; + reduceStrides[i] = inDesc.GetStrides()[reduceDims[i]]; + product *= inDesc.GetLengths()[reduceDims[i]]; + }; + + divider = product; + + for(int i = 0; i < NumInvariantDim; i++) + { + invariantLengths[i] = inDesc.GetLengths()[invariantDims[i]]; + invariantStrides[i] = inDesc.GetStrides()[invariantDims[i]]; + }; + + reduce_dim_indexes.clear(); + get_all_indexes(reduceLengths, reduce_dim_indexes); + + if constexpr(NumInvariantDim > 0) + { + invariant_dim_indexes.clear(); + get_all_indexes(invariantLengths, invariant_dim_indexes); + }; + + preUnaryOp = PreUnaryOpFn(divider); + posUnaryOp = PosUnaryOpFn(divider); + }; + + void Run(float alpha, + const InDataType* in_data, + float beta, + OutDataType* 
out_data, + IndexDataType* out_indices) + { + if constexpr(NeedIndices) + { + RunImpl_with_index(alpha, in_data, beta, out_data, out_indices); + } + else + { + RunImpl_no_index(alpha, in_data, beta, out_data); + }; + }; + + void RunImpl_with_index(float alpha, + const InDataType* in_data, + float beta, + OutDataType* out_data, + IndexDataType* out_indices) + { + using ck::type_convert; + using ck::host_reduce::binop_with_nan_check2; + using ck::host_reduce::float_equal_one; + using ck::host_reduce::float_equal_zero; + using ck::host_reduce::ReduceOpFn2; + using ck::host_reduce::ReduceOpZeroVal; + + auto opReduce2 = ReduceOpFn2(); + + if constexpr(NumInvariantDim == 0) + { + AccDataType accuVal = ReduceOpZeroVal(); + IndexDataType accuIndex = 0; + + for(IndexDataType i = 0; i < reduce_dim_indexes.size(); i++) + { + auto offset_reduce = + get_offset_from_index(reduceStrides, reduce_dim_indexes[i]); + + auto currVal = type_convert(in_data[offset_reduce]); + + preUnaryOp(currVal); + + auto currIndex = i; + + binop_with_nan_check2( + opReduce2, accuVal, currVal, accuIndex, currIndex); + }; + + posUnaryOp(accuVal); + + if(!float_equal_one(alpha)) + accuVal *= type_convert(alpha); + + if(!float_equal_zero(beta)) + accuVal += type_convert(out_data[0]) * type_convert(beta); + + out_data[0] = type_convert(accuVal); + out_indices[0] = accuIndex; + } + else + { + auto thread_reduce_func = [&](auto invariant_index) { + AccDataType accuVal = ReduceOpZeroVal(); + IndexDataType accuIndex = 0; + + auto offset_invariant = + get_offset_from_index(invariantStrides, invariant_index); + + for(IndexDataType i = 0; i < reduce_dim_indexes.size(); i++) + { + auto offset_reduce = + get_offset_from_index(reduceStrides, reduce_dim_indexes[i]); + + auto currVal = + type_convert(in_data[offset_invariant + offset_reduce]); + + preUnaryOp(currVal); + + auto currIndex = i; + + binop_with_nan_check2( + opReduce2, accuVal, currVal, accuIndex, currIndex); + }; + + posUnaryOp(accuVal); + + if(!float_equal_one(alpha)) + accuVal *= type_convert(alpha); + + auto dst_offset = + get_offset_from_index(outStrides, invariant_index); + + if(!float_equal_zero(beta)) + accuVal += type_convert(out_data[dst_offset]) * + type_convert(beta); + + out_data[dst_offset] = type_convert(accuVal); + out_indices[dst_offset] = accuIndex; + }; + + std::size_t num_thread = std::thread::hardware_concurrency(); + std::size_t work_per_thread = + (invariant_dim_indexes.size() + num_thread - 1) / num_thread; + + std::vector threads(num_thread); + + for(std::size_t it = 0; it < num_thread; ++it) + { + std::size_t iw_begin = it * work_per_thread; + std::size_t iw_end = + std::min((it + 1) * work_per_thread, invariant_dim_indexes.size()); + + auto f = [=] { + for(std::size_t iw = iw_begin; iw < iw_end; ++iw) + { + thread_reduce_func(invariant_dim_indexes[iw]); + } + }; + + threads[it] = joinable_thread(f); + } + }; + }; + + void RunImpl_no_index(float alpha, const InDataType* in_data, float beta, OutDataType* out_data) + { + using ck::type_convert; + using ck::host_reduce::binop_with_nan_check; + using ck::host_reduce::float_equal_one; + using ck::host_reduce::float_equal_zero; + using ck::host_reduce::ReduceOpFn; + using ck::host_reduce::ReduceOpZeroVal; + + auto opReduce = ReduceOpFn(); + + if constexpr(NumInvariantDim == 0) + { + AccDataType accuVal = ReduceOpZeroVal(); + + for(const auto& reduce_index : reduce_dim_indexes) + { + auto offset_reduce = + get_offset_from_index(reduceStrides, reduce_index); + + auto currVal = 
type_convert(in_data[offset_reduce]); + + preUnaryOp(currVal); + + binop_with_nan_check(opReduce, accuVal, currVal); + }; + + posUnaryOp(accuVal); + + if(!float_equal_one(alpha)) + accuVal *= type_convert(alpha); + + if(!float_equal_zero(beta)) + accuVal += type_convert(out_data[0]) * type_convert(beta); + + out_data[0] = type_convert(accuVal); + } + else + { + auto thread_reduce_func = [&](auto invariant_index) { + AccDataType accuVal = ReduceOpZeroVal(); + + auto offset_invariant = + get_offset_from_index(invariantStrides, invariant_index); + + for(const auto& reduce_index : reduce_dim_indexes) + { + auto offset_reduce = + get_offset_from_index(reduceStrides, reduce_index); + + auto currVal = + type_convert(in_data[offset_invariant + offset_reduce]); + + preUnaryOp(currVal); + + binop_with_nan_check(opReduce, accuVal, currVal); + }; + + posUnaryOp(accuVal); + + if(!float_equal_one(alpha)) + accuVal *= type_convert(alpha); + + auto dst_offset = + get_offset_from_index(outStrides, invariant_index); + + if(!float_equal_zero(beta)) + accuVal += type_convert(out_data[dst_offset]) * + type_convert(beta); + + out_data[dst_offset] = type_convert(accuVal); + }; + + std::size_t num_thread = std::thread::hardware_concurrency(); + std::size_t work_per_thread = + (invariant_dim_indexes.size() + num_thread - 1) / num_thread; + + std::vector threads(num_thread); + + for(std::size_t it = 0; it < num_thread; ++it) + { + std::size_t iw_begin = it * work_per_thread; + std::size_t iw_end = + std::min((it + 1) * work_per_thread, invariant_dim_indexes.size()); + + auto f = [=] { + for(std::size_t iw = iw_begin; iw < iw_end; ++iw) + { + thread_reduce_func(invariant_dim_indexes[iw]); + } + }; + + threads[it] = joinable_thread(f); + } + }; + }; +}; + +#endif diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance.hpp index 6fd30b7cb6..fafbe120b9 100644 --- a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance.hpp +++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance.hpp @@ -6,23 +6,36 @@ #include "device_reduce_instance_blockwise_f32_f32_f32.hpp" #include "device_reduce_instance_blockwise_f32_f64_f32.hpp" #include "device_reduce_instance_blockwise_f64_f64_f64.hpp" +#include "device_reduce_instance_blockwise_i8_i8_i8.hpp" +#include "device_reduce_instance_blockwise_i8_i32_i8.hpp" +#include "device_reduce_instance_blockwise_b16_f32_b16.hpp" #include "device_reduce_instance_blockwise_second_call_f16_f16_f16.hpp" #include "device_reduce_instance_blockwise_second_call_f32_f32_f16.hpp" #include "device_reduce_instance_blockwise_second_call_f32_f32_f32.hpp" #include "device_reduce_instance_blockwise_second_call_f64_f64_f32.hpp" #include "device_reduce_instance_blockwise_second_call_f64_f64_f64.hpp" +#include "device_reduce_instance_blockwise_second_call_i8_i8_i8.hpp" +#include "device_reduce_instance_blockwise_second_call_i32_i32_i8.hpp" +#include "device_reduce_instance_blockwise_second_call_f32_f32_b16.hpp" #include "device_reduce_instance_multiblock_atomic_add_f16_f32_f32.hpp" #include "device_reduce_instance_multiblock_atomic_add_f32_f32_f32.hpp" #include "device_reduce_instance_multiblock_atomic_add_f32_f64_f32.hpp" +#include "device_reduce_instance_multiblock_atomic_add_b16_f32_f32.hpp" #include "device_reduce_instance_multiblock_partial_reduce_f16_f16_f16.hpp" #include 
"device_reduce_instance_multiblock_partial_reduce_f16_f32_f16.hpp" #include "device_reduce_instance_multiblock_partial_reduce_f32_f32_f32.hpp" #include "device_reduce_instance_multiblock_partial_reduce_f32_f64_f32.hpp" #include "device_reduce_instance_multiblock_partial_reduce_f64_f64_f64.hpp" +#include "device_reduce_instance_multiblock_partial_reduce_i8_i8_i8.hpp" +#include "device_reduce_instance_multiblock_partial_reduce_i8_i32_i8.hpp" +#include "device_reduce_instance_multiblock_partial_reduce_b16_f32_b16.hpp" #include "device_reduce_instance_threadwise_f16_f16_f16.hpp" #include "device_reduce_instance_threadwise_f16_f32_f16.hpp" #include "device_reduce_instance_threadwise_f32_f32_f32.hpp" #include "device_reduce_instance_threadwise_f32_f64_f32.hpp" #include "device_reduce_instance_threadwise_f64_f64_f64.hpp" +#include "device_reduce_instance_threadwise_i8_i8_i8.hpp" +#include "device_reduce_instance_threadwise_i8_i32_i8.hpp" +#include "device_reduce_instance_threadwise_b16_f32_b16.hpp" #endif diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise.hpp index b71707294c..64d89e41b0 100644 --- a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise.hpp +++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise.hpp @@ -17,7 +17,6 @@ using reduce_configuration_2_instances_blockwise = std::tuple< ReductionConfiguration_2<0, 2, 2, 2, 1>, ReductionConfiguration_2<0, 1, 1, 2, 1>, ReductionConfiguration_2<1, 2, 1, 1, 2>, - ReductionConfiguration_2<1, 2, 2, 1, 2>, ReductionConfiguration_2<0, 1, 1, 3, 1>, ReductionConfiguration_2<1, 1, 1, 1, 3> // clang-format on diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_b16_f32_b16.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_b16_f32_b16.hpp new file mode 100644 index 0000000000..0ae3289a0d --- /dev/null +++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_b16_f32_b16.hpp @@ -0,0 +1,60 @@ +#ifndef DEVICE_REDUCE_INSTANCE_BLOCKWISE_B16_F32_B16_HPP +#define DEVICE_REDUCE_INSTANCE_BLOCKWISE_B16_F32_B16_HPP + +#include "reduction_enums.hpp" +#include "reduction_operator_mapping.hpp" +#include "device_reduce_instance_blockwise.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace device_reduce_instance { + +// clang-format off +// InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | NumReduceDim +ADD_BLOCKWISE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 0, 0, 0, 4, 3); // for ADD +ADD_BLOCKWISE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 0, 0, 0, 4, 4); +ADD_BLOCKWISE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 0, 0, 0, 4, 1); +ADD_BLOCKWISE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 0, 0, 0, 2, 1); +ADD_BLOCKWISE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 5, 0, 0, 4, 3); // for AVG +ADD_BLOCKWISE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 5, 0, 0, 4, 4); +ADD_BLOCKWISE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 5, 0, 0, 4, 1); +ADD_BLOCKWISE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 5, 0, 0, 2, 1); +ADD_BLOCKWISE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 7, 0, 0, 4, 3); // for NORM2 +ADD_BLOCKWISE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 7, 0, 0, 4, 4); +ADD_BLOCKWISE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 7, 0, 0, 4, 
1);
+ADD_BLOCKWISE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 7, 0, 0, 2, 1);
+
+ADD_BLOCKWISE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 2, 0, 0, 4, 3); // for MIN
+ADD_BLOCKWISE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 2, 0, 0, 4, 4);
+ADD_BLOCKWISE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 2, 0, 0, 4, 1);
+ADD_BLOCKWISE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 2, 0, 0, 2, 1);
+ADD_BLOCKWISE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 3, 0, 0, 4, 3); // for MAX
+ADD_BLOCKWISE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 3, 0, 0, 4, 4);
+ADD_BLOCKWISE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 3, 0, 0, 4, 1);
+ADD_BLOCKWISE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 3, 0, 0, 2, 1);
+ADD_BLOCKWISE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 4, 0, 0, 4, 3); // for AMAX
+ADD_BLOCKWISE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 4, 0, 0, 4, 4);
+ADD_BLOCKWISE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 4, 0, 0, 4, 1);
+ADD_BLOCKWISE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 4, 0, 0, 2, 1);
+ADD_BLOCKWISE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 2, 0, 1, 4, 3); // for MIN
+ADD_BLOCKWISE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 2, 0, 1, 4, 4);
+ADD_BLOCKWISE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 2, 0, 1, 4, 1);
+ADD_BLOCKWISE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 2, 0, 1, 2, 1);
+ADD_BLOCKWISE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 3, 0, 1, 4, 3); // for MAX
+ADD_BLOCKWISE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 3, 0, 1, 4, 4);
+ADD_BLOCKWISE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 3, 0, 1, 4, 1);
+ADD_BLOCKWISE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 3, 0, 1, 2, 1);
+ADD_BLOCKWISE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 4, 0, 1, 4, 3); // for AMAX
+ADD_BLOCKWISE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 4, 0, 1, 4, 4);
+ADD_BLOCKWISE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 4, 0, 1, 4, 1);
+ADD_BLOCKWISE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 4, 0, 1, 2, 1);
+// clang-format on
+
+} // namespace device_reduce_instance
+} // namespace device
+} // namespace tensor_operation
+
+} // namespace ck
+
+#endif
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f16_f16_f16.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f16_f16_f16.hpp
index 42b2482085..e7bdb15d92 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f16_f16_f16.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f16_f16_f16.hpp
@@ -13,21 +13,27 @@ namespace device_reduce_instance {
 // clang-format off
 // InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | NumReduceDim
 ADD_BLOCKWISE_INST_REF_BY_ID(half_t, half_t, half_t, 2, 0, 0, 4, 3); // for MIN
+ADD_BLOCKWISE_INST_REF_BY_ID(half_t, half_t, half_t, 2, 0, 0, 4, 4);
 ADD_BLOCKWISE_INST_REF_BY_ID(half_t, half_t, half_t, 2, 0, 0, 4, 1);
 ADD_BLOCKWISE_INST_REF_BY_ID(half_t, half_t, half_t, 2, 0, 0, 2, 1);
 ADD_BLOCKWISE_INST_REF_BY_ID(half_t, half_t, half_t, 3, 0, 0, 4, 3); // for MAX
+ADD_BLOCKWISE_INST_REF_BY_ID(half_t, half_t, half_t, 3, 0, 0, 4, 4);
 ADD_BLOCKWISE_INST_REF_BY_ID(half_t, half_t, half_t, 3, 0, 0, 4, 1);
 ADD_BLOCKWISE_INST_REF_BY_ID(half_t, half_t, half_t, 3, 0, 0, 2, 1);
 ADD_BLOCKWISE_INST_REF_BY_ID(half_t, half_t, half_t, 4, 0, 0, 4, 3); // for AMAX
+ADD_BLOCKWISE_INST_REF_BY_ID(half_t, half_t, half_t, 4, 0, 0, 4, 4);
 ADD_BLOCKWISE_INST_REF_BY_ID(half_t, half_t, half_t, 4, 0, 0, 4, 1);
 ADD_BLOCKWISE_INST_REF_BY_ID(half_t, half_t, half_t, 4, 0, 0, 2, 1);
 ADD_BLOCKWISE_INST_REF_BY_ID(half_t, half_t, half_t, 2, 0, 1, 4, 3); // for MIN
+ADD_BLOCKWISE_INST_REF_BY_ID(half_t, half_t, half_t, 2, 0, 1, 4, 4);
 ADD_BLOCKWISE_INST_REF_BY_ID(half_t, half_t, half_t, 2, 0, 1, 4, 1);
 ADD_BLOCKWISE_INST_REF_BY_ID(half_t, half_t, half_t, 2, 0, 1, 2, 1);
 ADD_BLOCKWISE_INST_REF_BY_ID(half_t, half_t, half_t, 3, 0, 1, 4, 3); // for MAX
+ADD_BLOCKWISE_INST_REF_BY_ID(half_t, half_t, half_t, 3, 0, 1, 4, 4);
 ADD_BLOCKWISE_INST_REF_BY_ID(half_t, half_t, half_t, 3, 0, 1, 4, 1);
 ADD_BLOCKWISE_INST_REF_BY_ID(half_t, half_t, half_t, 3, 0, 1, 2, 1);
 ADD_BLOCKWISE_INST_REF_BY_ID(half_t, half_t, half_t, 4, 0, 1, 4, 3); // for AMAX
+ADD_BLOCKWISE_INST_REF_BY_ID(half_t, half_t, half_t, 4, 0, 1, 4, 4);
 ADD_BLOCKWISE_INST_REF_BY_ID(half_t, half_t, half_t, 4, 0, 1, 4, 1);
 ADD_BLOCKWISE_INST_REF_BY_ID(half_t, half_t, half_t, 4, 0, 1, 2, 1);
 // clang-format on
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f16_f32_f16.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f16_f32_f16.hpp
index fdf2f8b587..dad0d86350 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f16_f32_f16.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f16_f32_f16.hpp
@@ -13,12 +13,15 @@ namespace device_reduce_instance {
 // clang-format off
 // InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | NumReduceDim
 ADD_BLOCKWISE_INST_REF_BY_ID(half_t, float, half_t, 0, 0, 0, 4, 3); // for ADD
+ADD_BLOCKWISE_INST_REF_BY_ID(half_t, float, half_t, 0, 0, 0, 4, 4);
 ADD_BLOCKWISE_INST_REF_BY_ID(half_t, float, half_t, 0, 0, 0, 4, 1);
 ADD_BLOCKWISE_INST_REF_BY_ID(half_t, float, half_t, 0, 0, 0, 2, 1);
 ADD_BLOCKWISE_INST_REF_BY_ID(half_t, float, half_t, 5, 0, 0, 4, 3); // for AVG
+ADD_BLOCKWISE_INST_REF_BY_ID(half_t, float, half_t, 5, 0, 0, 4, 4);
 ADD_BLOCKWISE_INST_REF_BY_ID(half_t, float, half_t, 5, 0, 0, 4, 1);
 ADD_BLOCKWISE_INST_REF_BY_ID(half_t, float, half_t, 5, 0, 0, 2, 1);
 ADD_BLOCKWISE_INST_REF_BY_ID(half_t, float, half_t, 7, 0, 0, 4, 3); // for NORM2
+ADD_BLOCKWISE_INST_REF_BY_ID(half_t, float, half_t, 7, 0, 0, 4, 4);
 ADD_BLOCKWISE_INST_REF_BY_ID(half_t, float, half_t, 7, 0, 0, 4, 1);
 ADD_BLOCKWISE_INST_REF_BY_ID(half_t, float, half_t, 7, 0, 0, 2, 1);
 // clang-format on
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f32_f32.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f32_f32.hpp
index 877b687d24..34ec15db2b 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f32_f32.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f32_f32.hpp
@@ -13,30 +13,39 @@ namespace device_reduce_instance {
 // clang-format off
 // InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | NumReduceDim
 ADD_BLOCKWISE_INST_REF_BY_ID(float, float, float, 0, 0, 0, 4, 3); // for ADD
+ADD_BLOCKWISE_INST_REF_BY_ID(float, float, float, 0, 0, 0, 4, 4);
 ADD_BLOCKWISE_INST_REF_BY_ID(float, float, float, 0, 0, 0, 4, 1);
 ADD_BLOCKWISE_INST_REF_BY_ID(float, float, float, 0, 0, 0, 2, 1);
 ADD_BLOCKWISE_INST_REF_BY_ID(float, float, float, 5, 0, 0, 4, 3); // for AVG
+ADD_BLOCKWISE_INST_REF_BY_ID(float, float, float, 5, 0, 0, 4, 4);
 ADD_BLOCKWISE_INST_REF_BY_ID(float, float, float, 5, 0, 0, 4, 1);
 ADD_BLOCKWISE_INST_REF_BY_ID(float, float, float, 5, 0, 0, 2, 1);
 ADD_BLOCKWISE_INST_REF_BY_ID(float, float, float, 7, 0, 0, 4, 3); // for NORM2
+ADD_BLOCKWISE_INST_REF_BY_ID(float, float, float, 7, 0, 0, 4, 4);
 ADD_BLOCKWISE_INST_REF_BY_ID(float, float, float, 7, 0, 0, 4, 1);
 ADD_BLOCKWISE_INST_REF_BY_ID(float, float, float, 7, 0, 0, 2, 1);
 ADD_BLOCKWISE_INST_REF_BY_ID(float, float, float, 2, 0, 0, 4, 3); // for MIN
+ADD_BLOCKWISE_INST_REF_BY_ID(float, float, float, 2, 0, 0, 4, 4);
 ADD_BLOCKWISE_INST_REF_BY_ID(float, float, float, 2, 0, 0, 4, 1);
 ADD_BLOCKWISE_INST_REF_BY_ID(float, float, float, 2, 0, 0, 2, 1);
 ADD_BLOCKWISE_INST_REF_BY_ID(float, float, float, 3, 0, 0, 4, 3); // for MAX
+ADD_BLOCKWISE_INST_REF_BY_ID(float, float, float, 3, 0, 0, 4, 4);
 ADD_BLOCKWISE_INST_REF_BY_ID(float, float, float, 3, 0, 0, 4, 1);
 ADD_BLOCKWISE_INST_REF_BY_ID(float, float, float, 3, 0, 0, 2, 1);
 ADD_BLOCKWISE_INST_REF_BY_ID(float, float, float, 4, 0, 0, 4, 3); // for AMAX
+ADD_BLOCKWISE_INST_REF_BY_ID(float, float, float, 4, 0, 0, 4, 4);
 ADD_BLOCKWISE_INST_REF_BY_ID(float, float, float, 4, 0, 0, 4, 1);
 ADD_BLOCKWISE_INST_REF_BY_ID(float, float, float, 4, 0, 0, 2, 1);
 ADD_BLOCKWISE_INST_REF_BY_ID(float, float, float, 2, 0, 1, 4, 3); // for MIN
+ADD_BLOCKWISE_INST_REF_BY_ID(float, float, float, 2, 0, 1, 4, 4);
 ADD_BLOCKWISE_INST_REF_BY_ID(float, float, float, 2, 0, 1, 4, 1);
 ADD_BLOCKWISE_INST_REF_BY_ID(float, float, float, 2, 0, 1, 2, 1);
 ADD_BLOCKWISE_INST_REF_BY_ID(float, float, float, 3, 0, 1, 4, 3); // for MAX
+ADD_BLOCKWISE_INST_REF_BY_ID(float, float, float, 3, 0, 1, 4, 4);
 ADD_BLOCKWISE_INST_REF_BY_ID(float, float, float, 3, 0, 1, 4, 1);
 ADD_BLOCKWISE_INST_REF_BY_ID(float, float, float, 3, 0, 1, 2, 1);
 ADD_BLOCKWISE_INST_REF_BY_ID(float, float, float, 4, 0, 1, 4, 3); // for AMAX
+ADD_BLOCKWISE_INST_REF_BY_ID(float, float, float, 4, 0, 1, 4, 4);
 ADD_BLOCKWISE_INST_REF_BY_ID(float, float, float, 4, 0, 1, 4, 1);
 ADD_BLOCKWISE_INST_REF_BY_ID(float, float, float, 4, 0, 1, 2, 1);
 // clang-format on
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f64_f32.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f64_f32.hpp
index 48f3ab567f..b08f35ad09 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f64_f32.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f64_f32.hpp
@@ -13,12 +13,15 @@ namespace device_reduce_instance {
 // clang-format off
 // InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | NumReduceDim
 ADD_BLOCKWISE_INST_REF_BY_ID(float, double, float, 0, 0, 0, 4, 3); // for ADD
+ADD_BLOCKWISE_INST_REF_BY_ID(float, double, float, 0, 0, 0, 4, 4);
 ADD_BLOCKWISE_INST_REF_BY_ID(float, double, float, 0, 0, 0, 4, 1);
 ADD_BLOCKWISE_INST_REF_BY_ID(float, double, float, 0, 0, 0, 2, 1);
 ADD_BLOCKWISE_INST_REF_BY_ID(float, double, float, 5, 0, 0, 4, 3); // for AVG
+ADD_BLOCKWISE_INST_REF_BY_ID(float, double, float, 5, 0, 0, 4, 4);
 ADD_BLOCKWISE_INST_REF_BY_ID(float, double, float, 5, 0, 0, 4, 1);
 ADD_BLOCKWISE_INST_REF_BY_ID(float, double, float, 5, 0, 0, 2, 1);
 ADD_BLOCKWISE_INST_REF_BY_ID(float, double, float, 7, 0, 0, 4, 3); // for NORM2
+ADD_BLOCKWISE_INST_REF_BY_ID(float, double, float, 7, 0, 0, 4, 4);
 ADD_BLOCKWISE_INST_REF_BY_ID(float, double, float, 7, 0, 0, 4, 1);
 ADD_BLOCKWISE_INST_REF_BY_ID(float, double, float, 7, 0, 0, 2, 1);
 // clang-format on
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f64_f64_f64.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f64_f64_f64.hpp
index d88bd341a2..65cdd45340 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f64_f64_f64.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f64_f64_f64.hpp
@@ -13,30 +13,39 @@ namespace device_reduce_instance {
 // clang-format off
 // InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | NumReduceDim
 ADD_BLOCKWISE_INST_REF_BY_ID(double, double, double, 0, 0, 0, 4, 3); // for ADD
+ADD_BLOCKWISE_INST_REF_BY_ID(double, double, double, 0, 0, 0, 4, 4);
 ADD_BLOCKWISE_INST_REF_BY_ID(double, double, double, 0, 0, 0, 4, 1);
 ADD_BLOCKWISE_INST_REF_BY_ID(double, double, double, 0, 0, 0, 2, 1);
 ADD_BLOCKWISE_INST_REF_BY_ID(double, double, double, 5, 0, 0, 4, 3); // for AVG
+ADD_BLOCKWISE_INST_REF_BY_ID(double, double, double, 5, 0, 0, 4, 4);
 ADD_BLOCKWISE_INST_REF_BY_ID(double, double, double, 5, 0, 0, 4, 1);
 ADD_BLOCKWISE_INST_REF_BY_ID(double, double, double, 5, 0, 0, 2, 1);
 ADD_BLOCKWISE_INST_REF_BY_ID(double, double, double, 7, 0, 0, 4, 3); // for NORM2
+ADD_BLOCKWISE_INST_REF_BY_ID(double, double, double, 7, 0, 0, 4, 4);
 ADD_BLOCKWISE_INST_REF_BY_ID(double, double, double, 7, 0, 0, 4, 1);
 ADD_BLOCKWISE_INST_REF_BY_ID(double, double, double, 7, 0, 0, 2, 1);
 ADD_BLOCKWISE_INST_REF_BY_ID(double, double, double, 2, 0, 0, 4, 3); // for MIN
+ADD_BLOCKWISE_INST_REF_BY_ID(double, double, double, 2, 0, 0, 4, 4);
 ADD_BLOCKWISE_INST_REF_BY_ID(double, double, double, 2, 0, 0, 4, 1);
 ADD_BLOCKWISE_INST_REF_BY_ID(double, double, double, 2, 0, 0, 2, 1);
 ADD_BLOCKWISE_INST_REF_BY_ID(double, double, double, 3, 0, 0, 4, 3); // for MAX
+ADD_BLOCKWISE_INST_REF_BY_ID(double, double, double, 3, 0, 0, 4, 4);
 ADD_BLOCKWISE_INST_REF_BY_ID(double, double, double, 3, 0, 0, 4, 1);
 ADD_BLOCKWISE_INST_REF_BY_ID(double, double, double, 3, 0, 0, 2, 1);
 ADD_BLOCKWISE_INST_REF_BY_ID(double, double, double, 4, 0, 0, 4, 3); // for AMAX
+ADD_BLOCKWISE_INST_REF_BY_ID(double, double, double, 4, 0, 0, 4, 4);
 ADD_BLOCKWISE_INST_REF_BY_ID(double, double, double, 4, 0, 0, 4, 1);
 ADD_BLOCKWISE_INST_REF_BY_ID(double, double, double, 4, 0, 0, 2, 1);
 ADD_BLOCKWISE_INST_REF_BY_ID(double, double, double, 2, 0, 1, 4, 3); // for MIN
+ADD_BLOCKWISE_INST_REF_BY_ID(double, double, double, 2, 0, 1, 4, 4);
 ADD_BLOCKWISE_INST_REF_BY_ID(double, double, double, 2, 0, 1, 4, 1);
 ADD_BLOCKWISE_INST_REF_BY_ID(double, double, double, 2, 0, 1, 2, 1);
 ADD_BLOCKWISE_INST_REF_BY_ID(double, double, double, 3, 0, 1, 4, 3); // for MAX
+ADD_BLOCKWISE_INST_REF_BY_ID(double, double, double, 3, 0, 1, 4, 4);
 ADD_BLOCKWISE_INST_REF_BY_ID(double, double, double, 3, 0, 1, 4, 1);
 ADD_BLOCKWISE_INST_REF_BY_ID(double, double, double, 3, 0, 1, 2, 1);
 ADD_BLOCKWISE_INST_REF_BY_ID(double, double, double, 4, 0, 1, 4, 3); // for AMAX
+ADD_BLOCKWISE_INST_REF_BY_ID(double, double, double, 4, 0, 1, 4, 4);
 ADD_BLOCKWISE_INST_REF_BY_ID(double, double, double, 4, 0, 1, 4, 1);
 ADD_BLOCKWISE_INST_REF_BY_ID(double, double, double, 4, 0, 1, 2, 1);
 // clang-format on
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_i8_i32_i8.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_i8_i32_i8.hpp
new file mode 100644
index 0000000000..8d222d53dc
--- /dev/null
+++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_i8_i32_i8.hpp
@@ -0,0 +1,31 @@
+#ifndef DEVICE_REDUCE_INSTANCE_BLOCKWISE_I8_I32_I8_HPP
+#define DEVICE_REDUCE_INSTANCE_BLOCKWISE_I8_I32_I8_HPP
+
+#include "reduction_enums.hpp"
+#include "reduction_operator_mapping.hpp"
+#include "device_reduce_instance_blockwise.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace device_reduce_instance {
+
+// clang-format off
+// InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | NumReduceDim
+ADD_BLOCKWISE_INST_REF_BY_ID(int8_t, int32_t, int8_t, 0, 0, 0, 4, 3); // for ADD
+ADD_BLOCKWISE_INST_REF_BY_ID(int8_t, int32_t, int8_t, 0, 0, 0, 4, 4);
+ADD_BLOCKWISE_INST_REF_BY_ID(int8_t, int32_t, int8_t, 0, 0, 0, 4, 1);
+ADD_BLOCKWISE_INST_REF_BY_ID(int8_t, int32_t, int8_t, 0, 0, 0, 2, 1);
+ADD_BLOCKWISE_INST_REF_BY_ID(int8_t, int32_t, int8_t, 5, 0, 0, 4, 3); // for AVG
+ADD_BLOCKWISE_INST_REF_BY_ID(int8_t, int32_t, int8_t, 5, 0, 0, 4, 4);
+ADD_BLOCKWISE_INST_REF_BY_ID(int8_t, int32_t, int8_t, 5, 0, 0, 4, 1);
+ADD_BLOCKWISE_INST_REF_BY_ID(int8_t, int32_t, int8_t, 5, 0, 0, 2, 1);
+// clang-format on
+
+} // namespace device_reduce_instance
+} // namespace device
+} // namespace tensor_operation
+
+} // namespace ck
+
+#endif
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_i8_i8_i8.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_i8_i8_i8.hpp
new file mode 100644
index 0000000000..7f67138e6b
--- /dev/null
+++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_i8_i8_i8.hpp
@@ -0,0 +1,47 @@
+#ifndef DEVICE_REDUCE_INSTANCE_BLOCKWISE_I8_I8_I8_HPP
+#define DEVICE_REDUCE_INSTANCE_BLOCKWISE_I8_I8_I8_HPP
+
+#include "reduction_enums.hpp"
+#include "reduction_operator_mapping.hpp"
+#include "device_reduce_instance_blockwise.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace device_reduce_instance {
+
+// clang-format off
+// InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | NumReduceDim
+ADD_BLOCKWISE_INST_REF_BY_ID(int8_t, int8_t, int8_t, 2, 0, 0, 4, 3); // for MIN
+ADD_BLOCKWISE_INST_REF_BY_ID(int8_t, int8_t, int8_t, 2, 0, 0, 4, 4);
+ADD_BLOCKWISE_INST_REF_BY_ID(int8_t, int8_t, int8_t, 2, 0, 0, 4, 1);
+ADD_BLOCKWISE_INST_REF_BY_ID(int8_t, int8_t, int8_t, 2, 0, 0, 2, 1);
+ADD_BLOCKWISE_INST_REF_BY_ID(int8_t, int8_t, int8_t, 3, 0, 0, 4, 3); // for MAX
+ADD_BLOCKWISE_INST_REF_BY_ID(int8_t, int8_t, int8_t, 3, 0, 0, 4, 4);
+ADD_BLOCKWISE_INST_REF_BY_ID(int8_t, int8_t, int8_t, 3, 0, 0, 4, 1);
+ADD_BLOCKWISE_INST_REF_BY_ID(int8_t, int8_t, int8_t, 3, 0, 0, 2, 1);
+ADD_BLOCKWISE_INST_REF_BY_ID(int8_t, int8_t, int8_t, 4, 0, 0, 4, 3); // for AMAX
+ADD_BLOCKWISE_INST_REF_BY_ID(int8_t, int8_t, int8_t, 4, 0, 0, 4, 4);
+ADD_BLOCKWISE_INST_REF_BY_ID(int8_t, int8_t, int8_t, 4, 0, 0, 4, 1);
+ADD_BLOCKWISE_INST_REF_BY_ID(int8_t, int8_t, int8_t, 4, 0, 0, 2, 1);
+ADD_BLOCKWISE_INST_REF_BY_ID(int8_t, int8_t, int8_t, 2, 0, 1, 4, 3); // for MIN
+ADD_BLOCKWISE_INST_REF_BY_ID(int8_t, int8_t, int8_t, 2, 0, 1, 4, 4);
+ADD_BLOCKWISE_INST_REF_BY_ID(int8_t, int8_t, int8_t, 2, 0, 1, 4, 1);
+ADD_BLOCKWISE_INST_REF_BY_ID(int8_t, int8_t, int8_t, 2, 0, 1, 2, 1);
+ADD_BLOCKWISE_INST_REF_BY_ID(int8_t, int8_t, int8_t, 3, 0, 1, 4, 3); // for MAX
+ADD_BLOCKWISE_INST_REF_BY_ID(int8_t, int8_t, int8_t, 3, 0, 1, 4, 4);
+ADD_BLOCKWISE_INST_REF_BY_ID(int8_t, int8_t, int8_t, 3, 0, 1, 4, 1);
+ADD_BLOCKWISE_INST_REF_BY_ID(int8_t, int8_t, int8_t, 3, 0, 1, 2, 1);
+ADD_BLOCKWISE_INST_REF_BY_ID(int8_t, int8_t, int8_t, 4, 0, 1, 4, 3); // for AMAX
+ADD_BLOCKWISE_INST_REF_BY_ID(int8_t, int8_t, int8_t, 4, 0, 1, 4, 4);
+ADD_BLOCKWISE_INST_REF_BY_ID(int8_t, int8_t, int8_t, 4, 0, 1, 4, 1);
+ADD_BLOCKWISE_INST_REF_BY_ID(int8_t, int8_t, int8_t, 4, 0, 1, 2, 1);
+// clang-format on
+
+} // namespace device_reduce_instance
+} // namespace device
+} // namespace tensor_operation
+
+} // namespace ck
+
+#endif
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_second_call.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_second_call.hpp
index 6ffe22ec0c..5a0c18e7a3 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_second_call.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_second_call.hpp
@@ -15,9 +15,7 @@ using reduce_configuration_2_instances_blockwise_second_call = std::tuple<
 // clang-format off
 // InSrcVectorDim | InSrcVectorSize | OutDstVectorSize | MThreadSliceSize | KThreadSliceSize
 ReductionConfiguration_2<1, 2, 1, 1, 2>,
-ReductionConfiguration_2<1, 2, 2, 1, 2>,
-ReductionConfiguration_2<1, 1, 1, 1, 3>,
-ReductionConfiguration_2<1, 1, 2, 1, 3>
+ReductionConfiguration_2<1, 1, 1, 1, 3>
 // clang-format on
 >;
 #else
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_second_call_f16_f16_f16.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_second_call_f16_f16_f16.hpp
index bf78feb552..4ce19c7d0c 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_second_call_f16_f16_f16.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_second_call_f16_f16_f16.hpp
@@ -13,21 +13,27 @@ namespace device_reduce_instance {
 // clang-format off
 // InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | NumReduceDim
 ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(half_t, half_t, half_t, 2, 0, 0, 4, 3); // for MIN
+ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(half_t, half_t, half_t, 2, 0, 0, 4, 4);
 ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(half_t, half_t, half_t, 2, 0, 0, 4, 1);
 ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(half_t, half_t, half_t, 2, 0, 0, 2, 1);
 ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(half_t, half_t, half_t, 3, 0, 0, 4, 3); // for MAX
+ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(half_t, half_t, half_t, 3, 0, 0, 4, 4);
 ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(half_t, half_t, half_t, 3, 0, 0, 4, 1);
 ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(half_t, half_t, half_t, 3, 0, 0, 2, 1);
 ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(half_t, half_t, half_t, 4, 0, 0, 4, 3); // for AMAX
+ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(half_t, half_t, half_t, 4, 0, 0, 4, 4);
 ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(half_t, half_t, half_t, 4, 0, 0, 4, 1);
 ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(half_t, half_t, half_t, 4, 0, 0, 2, 1);
 ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(half_t, half_t, half_t, 2, 0, 1, 4, 3); // for MIN
+ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(half_t, half_t, half_t, 2, 0, 1, 4, 4);
 ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(half_t, half_t, half_t, 2, 0, 1, 4, 1);
 ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(half_t, half_t, half_t, 2, 0, 1, 2, 1);
 ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(half_t, half_t, half_t, 3, 0, 1, 4, 3); // for MAX
+ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(half_t, half_t, half_t, 3, 0, 1, 4, 4);
 ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(half_t, half_t, half_t, 3, 0, 1, 4, 1);
 ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(half_t, half_t, half_t, 3, 0, 1, 2, 1);
 ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(half_t, half_t, half_t, 4, 0, 1, 4, 3); // for AMAX
+ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(half_t, half_t, half_t, 4, 0, 1, 4, 4);
 ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(half_t, half_t, half_t, 4, 0, 1, 4, 1);
 ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(half_t, half_t, half_t, 4, 0, 1, 2, 1);
 // clang-format on
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_second_call_f32_f32_b16.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_second_call_f32_f32_b16.hpp
new file mode 100644
index 0000000000..c85419befc
--- /dev/null
+++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_second_call_f32_f32_b16.hpp
@@ -0,0 +1,60 @@
+#ifndef DEVICE_REDUCE_INSTANCE_BLOCKWISE_SECOND_CALL_F32_F32_B16_HPP
+#define DEVICE_REDUCE_INSTANCE_BLOCKWISE_SECOND_CALL_F32_F32_B16_HPP
+
+#include "reduction_enums.hpp"
+#include "reduction_operator_mapping.hpp"
+#include "device_reduce_instance_blockwise_second_call.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace device_reduce_instance {
+
+// clang-format off
+// InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | NumReduceDim
+ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(float, float, bhalf_t, 0, 0, 0, 4, 3); // for ADD
+ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(float, float, bhalf_t, 0, 0, 0, 4, 4);
+ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(float, float, bhalf_t, 0, 0, 0, 4, 1);
+ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(float, float, bhalf_t, 0, 0, 0, 2, 1);
+ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(float, float, bhalf_t, 5, 0, 0, 4, 3); // for AVG
+ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(float, float, bhalf_t, 5, 0, 0, 4, 4);
+ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(float, float, bhalf_t, 5, 0, 0, 4, 1);
+ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(float, float, bhalf_t, 5, 0, 0, 2, 1);
+ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(float, float, bhalf_t, 7, 0, 0, 4, 3); // for NORM2
+ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(float, float, bhalf_t, 7, 0, 0, 4, 4);
+ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(float, float, bhalf_t, 7, 0, 0, 4, 1);
+ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(float, float, bhalf_t, 7, 0, 0, 2, 1);
+
+ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(float, float, bhalf_t, 2, 0, 0, 4, 3); // for MIN
+ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(float, float, bhalf_t, 2, 0, 0, 4, 4);
+ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(float, float, bhalf_t, 2, 0, 0, 4, 1);
+ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(float, float, bhalf_t, 2, 0, 0, 2, 1);
+ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(float, float, bhalf_t, 3, 0, 0, 4, 3); // for MAX
+ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(float, float, bhalf_t, 3, 0, 0, 4, 4);
+ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(float, float, bhalf_t, 3, 0, 0, 4, 1);
+ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(float, float, bhalf_t, 3, 0, 0, 2, 1);
+ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(float, float, bhalf_t, 4, 0, 0, 4, 3); // for AMAX
+ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(float, float, bhalf_t, 4, 0, 0, 4, 4);
+ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(float, float, bhalf_t, 4, 0, 0, 4, 1);
+ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(float, float, bhalf_t, 4, 0, 0, 2, 1);
+ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(float, float, bhalf_t, 2, 0, 1, 4, 3); // for MIN
+ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(float, float, bhalf_t, 2, 0, 1, 4, 4);
+ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(float, float, bhalf_t, 2, 0, 1, 4, 1);
+ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(float, float, bhalf_t, 2, 0, 1, 2, 1);
+ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(float, float, bhalf_t, 3, 0, 1, 4, 3); // for MAX
+ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(float, float, bhalf_t, 3, 0, 1, 4, 4);
+ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(float, float, bhalf_t, 3, 0, 1, 4, 1);
+ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(float, float, bhalf_t, 3, 0, 1, 2, 1);
+ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(float, float, bhalf_t, 4, 0, 1, 4, 3); // for AMAX
+ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(float, float, bhalf_t, 4, 0, 1, 4, 4);
+ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(float, float, bhalf_t, 4, 0, 1, 4, 1);
+ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(float, float, bhalf_t, 4, 0, 1, 2, 1);
+// clang-format on
+
+} // namespace device_reduce_instance
+} // namespace device
+} // namespace tensor_operation
+
+} // namespace ck
+
+#endif
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_second_call_f32_f32_f16.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_second_call_f32_f32_f16.hpp
index 3e880b6929..d42e7e020f 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_second_call_f32_f32_f16.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_second_call_f32_f32_f16.hpp
@@ -13,12 +13,15 @@ namespace device_reduce_instance {
 // clang-format off
 // InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | NumReduceDim
 ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(float, float, half_t, 0, 0, 0, 4, 3); // for ADD
+ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(float, float, half_t, 0, 0, 0, 4, 4);
 ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(float, float, half_t, 0, 0, 0, 4, 1);
 ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(float, float, half_t, 0, 0, 0, 2, 1);
 ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(float, float, half_t, 5, 0, 0, 4, 3); // for AVG
+ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(float, float, half_t, 5, 0, 0, 4, 4);
 ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(float, float, half_t, 5, 0, 0, 4, 1);
 ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(float, float, half_t, 5, 0, 0, 2, 1);
 ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(float, float, half_t, 7, 0, 0, 4, 3); // for NORM2
+ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(float, float, half_t, 7, 0, 0, 4, 4);
 ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(float, float, half_t, 7, 0, 0, 4, 1);
 ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(float, float, half_t, 7, 0, 0, 2, 1);
 // clang-format on
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_second_call_f32_f32_f32.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_second_call_f32_f32_f32.hpp
index 01b1a3103a..fcf244d1d3 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_second_call_f32_f32_f32.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_second_call_f32_f32_f32.hpp
@@ -13,30 +13,39 @@ namespace device_reduce_instance {
 // clang-format off
 // InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | NumReduceDim
 ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(float, float, float, 0, 0, 0, 4, 3); // for ADD
+ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(float, float, float, 0, 0, 0, 4, 4);
 ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(float, float, float, 0, 0, 0, 4, 1);
 ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(float, float, float, 0, 0, 0, 2, 1);
 ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(float, float, float, 5, 0, 0, 4, 3); // for AVG
+ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(float, float, float, 5, 0, 0, 4, 4);
 ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(float, float, float, 5, 0, 0, 4, 1);
 ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(float, float, float, 5, 0, 0, 2, 1);
 ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(float, float, float, 7, 0, 0, 4, 3); // for NORM2
+ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(float, float, float, 7, 0, 0, 4, 4);
 ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(float, float, float, 7, 0, 0, 4, 1);
 ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(float, float, float, 7, 0, 0, 2, 1);
 ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(float, float, float, 2, 0, 0, 4, 3); // for MIN
+ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(float, float, float, 2, 0, 0, 4, 4);
 ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(float, float, float, 2, 0, 0, 4, 1);
 ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(float, float, float, 2, 0, 0, 2, 1);
 ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(float, float, float, 3, 0, 0, 4, 3); // for MAX
+ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(float, float, float, 3, 0, 0, 4, 4);
 ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(float, float, float, 3, 0, 0, 4, 1);
 ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(float, float, float, 3, 0, 0, 2, 1);
 ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(float, float, float, 4, 0, 0, 4, 3); // for AMAX
+ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(float, float, float, 4, 0, 0, 4, 4);
 ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(float, float, float, 4, 0, 0, 4, 1);
 ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(float, float, float, 4, 0, 0, 2, 1);
 ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(float, float, float, 2, 0, 1, 4, 3); // for MIN
+ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(float, float, float, 2, 0, 1, 4, 4);
 ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(float, float, float, 2, 0, 1, 4, 1);
 ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(float, float, float, 2, 0, 1, 2, 1);
 ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(float, float, float, 3, 0, 1, 4, 3); // for MAX
+ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(float, float, float, 3, 0, 1, 4, 4);
 ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(float, float, float, 3, 0, 1, 4, 1);
 ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(float, float, float, 3, 0, 1, 2, 1);
 ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(float, float, float, 4, 0, 1, 4, 3); // for AMAX
+ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(float, float, float, 4, 0, 1, 4, 4);
 ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(float, float, float, 4, 0, 1, 4, 1);
 ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(float, float, float, 4, 0, 1, 2, 1);
 // clang-format on
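Across all of these instance headers, the fourth macro argument is a numeric ReduceOpId, annotated by the `// for ADD` style comments: 0 = ADD, 2 = MIN, 3 = MAX, 4 = AMAX, 5 = AVG, 7 = NORM2. As a reading aid, here is that mapping as a minimal sketch; the names are illustrative, and the library's actual enum lives in reduction_enums.hpp and may spell them differently.

```cpp
// Illustrative mapping of the numeric ReduceOpId codes used by the
// ADD_*_INST_REF_BY_ID macros above; not the library's actual enum.
enum class ReduceOpId : int
{
    ADD   = 0, // plain sum
    MIN   = 2, // minimum; may also return the winning index (IndicesOpt = 1)
    MAX   = 3, // maximum; may also return the winning index
    AMAX  = 4, // maximum absolute value; may also return the winning index
    AVG   = 5, // arithmetic mean
    NORM2 = 7  // square root of the sum of squares
};
```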
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_second_call_f64_f64_f32.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_second_call_f64_f64_f32.hpp
index 46908a4c56..72e806ee60 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_second_call_f64_f64_f32.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_second_call_f64_f64_f32.hpp
@@ -13,12 +13,15 @@ namespace device_reduce_instance {
 // clang-format off
 // InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | NumReduceDim
 ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(double, double, float, 0, 0, 0, 4, 3); // for ADD
+ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(double, double, float, 0, 0, 0, 4, 4);
 ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(double, double, float, 0, 0, 0, 4, 1);
 ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(double, double, float, 0, 0, 0, 2, 1);
 ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(double, double, float, 5, 0, 0, 4, 3); // for AVG
+ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(double, double, float, 5, 0, 0, 4, 4);
 ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(double, double, float, 5, 0, 0, 4, 1);
 ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(double, double, float, 5, 0, 0, 2, 1);
 ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(double, double, float, 7, 0, 0, 4, 3); // for NORM2
+ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(double, double, float, 7, 0, 0, 4, 4);
 ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(double, double, float, 7, 0, 0, 4, 1);
 ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(double, double, float, 7, 0, 0, 2, 1);
 // clang-format on
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_second_call_f64_f64_f64.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_second_call_f64_f64_f64.hpp
index 2182c2eac2..476c3a7d8f 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_second_call_f64_f64_f64.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_second_call_f64_f64_f64.hpp
@@ -13,30 +13,39 @@ namespace device_reduce_instance {
 // clang-format off
 // InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | NumReduceDim
 ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(double, double, double, 0, 0, 0, 4, 3); // for ADD
+ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(double, double, double, 0, 0, 0, 4, 4);
 ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(double, double, double, 0, 0, 0, 4, 1);
 ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(double, double, double, 0, 0, 0, 2, 1);
 ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(double, double, double, 5, 0, 0, 4, 3); // for AVG
+ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(double, double, double, 5, 0, 0, 4, 4);
 ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(double, double, double, 5, 0, 0, 4, 1);
 ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(double, double, double, 5, 0, 0, 2, 1);
 ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(double, double, double, 7, 0, 0, 4, 3); // for NORM2
+ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(double, double, double, 7, 0, 0, 4, 4);
 ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(double, double, double, 7, 0, 0, 4, 1);
 ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(double, double, double, 7, 0, 0, 2, 1);
 ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(double, double, double, 2, 0, 0, 4, 3); // for MIN
+ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(double, double, double, 2, 0, 0, 4, 4);
 ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(double, double, double, 2, 0, 0, 4, 1);
 ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(double, double, double, 2, 0, 0, 2, 1);
 ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(double, double, double, 3, 0, 0, 4, 3); // for MAX
+ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(double, double, double, 3, 0, 0, 4, 4);
 ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(double, double, double, 3, 0, 0, 4, 1);
 ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(double, double, double, 3, 0, 0, 2, 1);
 ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(double, double, double, 4, 0, 0, 4, 3); // for AMAX
+ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(double, double, double, 4, 0, 0, 4, 4);
 ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(double, double, double, 4, 0, 0, 4, 1);
 ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(double, double, double, 4, 0, 0, 2, 1);
 ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(double, double, double, 2, 0, 1, 4, 3); // for MIN
+ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(double, double, double, 2, 0, 1, 4, 4);
 ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(double, double, double, 2, 0, 1, 4, 1);
 ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(double, double, double, 2, 0, 1, 2, 1);
 ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(double, double, double, 3, 0, 1, 4, 3); // for MAX
+ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(double, double, double, 3, 0, 1, 4, 4);
 ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(double, double, double, 3, 0, 1, 4, 1);
 ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(double, double, double, 3, 0, 1, 2, 1);
 ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(double, double, double, 4, 0, 1, 4, 3); // for AMAX
+ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(double, double, double, 4, 0, 1, 4, 4);
 ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(double, double, double, 4, 0, 1, 4, 1);
 ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(double, double, double, 4, 0, 1, 2, 1);
 // clang-format on
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_second_call_i32_i32_i8.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_second_call_i32_i32_i8.hpp
new file mode 100644
index 0000000000..d46780483b
--- /dev/null
+++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_second_call_i32_i32_i8.hpp
@@ -0,0 +1,31 @@
+#ifndef DEVICE_REDUCE_INSTANCE_BLOCKWISE_SECOND_CALL_I32_I32_I8_HPP
+#define DEVICE_REDUCE_INSTANCE_BLOCKWISE_SECOND_CALL_I32_I32_I8_HPP
+
+#include "reduction_enums.hpp"
+#include "reduction_operator_mapping.hpp"
+#include "device_reduce_instance_blockwise_second_call.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace device_reduce_instance {
+
+// clang-format off
+// InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | NumReduceDim
+ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(int32_t, int32_t, int8_t, 0, 0, 0, 4, 3); // for ADD
+ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(int32_t, int32_t, int8_t, 0, 0, 0, 4, 4);
+ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(int32_t, int32_t, int8_t, 0, 0, 0, 4, 1);
+ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(int32_t, int32_t, int8_t, 0, 0, 0, 2, 1);
+ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(int32_t, int32_t, int8_t, 5, 0, 0, 4, 3); // for AVG
+ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(int32_t, int32_t, int8_t, 5, 0, 0, 4, 4);
+ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(int32_t, int32_t, int8_t, 5, 0, 0, 4, 1);
+ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(int32_t, int32_t, int8_t, 5, 0, 0, 2, 1);
+// clang-format on
+
+} // namespace device_reduce_instance
+} // namespace device
+} // namespace tensor_operation
+
+} // namespace ck
+
+#endif
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_second_call_i8_i8_i8.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_second_call_i8_i8_i8.hpp
new file mode 100644
index 0000000000..7b020fb439
--- /dev/null
+++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_second_call_i8_i8_i8.hpp
@@ -0,0 +1,47 @@
+#ifndef DEVICE_REDUCE_INSTANCE_BLOCKWISE_SECOND_CALL_I8_I8_I8_HPP
+#define DEVICE_REDUCE_INSTANCE_BLOCKWISE_SECOND_CALL_I8_I8_I8_HPP
+
+#include "reduction_enums.hpp"
+#include "reduction_operator_mapping.hpp"
+#include "device_reduce_instance_blockwise_second_call.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace device_reduce_instance {
+
+// clang-format off
+// InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | NumReduceDim
+ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(int8_t, int8_t, int8_t, 2, 0, 0, 4, 3); // for MIN
+ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(int8_t, int8_t, int8_t, 2, 0, 0, 4, 4);
+ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(int8_t, int8_t, int8_t, 2, 0, 0, 4, 1);
+ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(int8_t, int8_t, int8_t, 2, 0, 0, 2, 1);
+ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(int8_t, int8_t, int8_t, 3, 0, 0, 4, 3); // for MAX
+ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(int8_t, int8_t, int8_t, 3, 0, 0, 4, 4);
+ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(int8_t, int8_t, int8_t, 3, 0, 0, 4, 1);
+ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(int8_t, int8_t, int8_t, 3, 0, 0, 2, 1);
+ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(int8_t, int8_t, int8_t, 4, 0, 0, 4, 3); // for AMAX
+ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(int8_t, int8_t, int8_t, 4, 0, 0, 4, 4);
+ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(int8_t, int8_t, int8_t, 4, 0, 0, 4, 1);
+ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(int8_t, int8_t, int8_t, 4, 0, 0, 2, 1);
+ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(int8_t, int8_t, int8_t, 2, 0, 1, 4, 3); // for MIN
+ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(int8_t, int8_t, int8_t, 2, 0, 1, 4, 4);
+ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(int8_t, int8_t, int8_t, 2, 0, 1, 4, 1);
+ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(int8_t, int8_t, int8_t, 2, 0, 1, 2, 1);
+ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(int8_t, int8_t, int8_t, 3, 0, 1, 4, 3); // for MAX
+ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(int8_t, int8_t, int8_t, 3, 0, 1, 4, 4);
+ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(int8_t, int8_t, int8_t, 3, 0, 1, 4, 1);
+ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(int8_t, int8_t, int8_t, 3, 0, 1, 2, 1);
+ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(int8_t, int8_t, int8_t, 4, 0, 1, 4, 3); // for AMAX
+ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(int8_t, int8_t, int8_t, 4, 0, 1, 4, 4);
+ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(int8_t, int8_t, int8_t, 4, 0, 1, 4, 1);
+ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(int8_t, int8_t, int8_t, 4, 0, 1, 2, 1);
+// clang-format on
+
+} // namespace device_reduce_instance
+} // namespace device
+} // namespace tensor_operation
+
+} // namespace ck
+
+#endif
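The *_second_call instance files above mirror the multiblock partial-reduce instances further down: a multiblock kernel first leaves one partial result per workgroup in the workspace, held in the accumulation type (hence pairings such as f32_f32_b16 or i32_i32_i8, where the second call reads the wide workspace type and writes the narrow output type), and a blockwise "second call" then folds those partials into the final output. Below is a minimal host-side sketch of the same two-stage idea, assuming a non-empty input; the library of course runs both stages on the GPU.

```cpp
#include <algorithm>
#include <cstddef>
#include <vector>

// Stage 1: each "block" reduces its slice to one partial result (the
// multiblock partial reduce). Stage 2: one pass over the partials (the
// blockwise second call) produces the final value.
float two_stage_max(const std::vector<float>& data, std::size_t block_size)
{
    std::vector<float> partials; // plays the role of the workspace buffer
    for(std::size_t i = 0; i < data.size(); i += block_size)
    {
        const std::size_t last = std::min(i + block_size, data.size());
        partials.push_back(*std::max_element(data.begin() + i, data.begin() + last));
    }
    return *std::max_element(partials.begin(), partials.end());
}
```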
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add.hpp
index d3f62e4050..3b317e1d80 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add.hpp
@@ -17,7 +17,6 @@ using reduce_configuration_2_instances_multiblock_atomic_add = std::tuple<
 ReductionConfiguration_2<0, 2, 2, 2, 1>,
 ReductionConfiguration_2<0, 1, 1, 2, 1>,
 ReductionConfiguration_2<1, 2, 1, 1, 2>,
-ReductionConfiguration_2<1, 2, 2, 1, 2>,
 ReductionConfiguration_2<0, 1, 1, 3, 1>,
 ReductionConfiguration_2<1, 1, 1, 1, 3>
 // clang-format on
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_b16_f32_f32.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_b16_f32_f32.hpp
new file mode 100644
index 0000000000..58f90bb94f
--- /dev/null
+++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_b16_f32_f32.hpp
@@ -0,0 +1,31 @@
+#ifndef DEVICE_REDUCE_INSTANCE_MULTIBLOCK_ATOMIC_ADD_B16_F32_F32_HPP
+#define DEVICE_REDUCE_INSTANCE_MULTIBLOCK_ATOMIC_ADD_B16_F32_F32_HPP
+
+#include "reduction_enums.hpp"
+#include "reduction_operator_mapping.hpp"
+#include "device_reduce_instance_multiblock_atomic_add.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace device_reduce_instance {
+
+// clang-format off
+// InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | NumReduceDim
+ADD_MULTIBLOCK_ATOMIC_ADD_INST_REF_BY_ID(bhalf_t, float, float, 0, 0, 0, 4, 3); // for ADD
+ADD_MULTIBLOCK_ATOMIC_ADD_INST_REF_BY_ID(bhalf_t, float, float, 0, 0, 0, 4, 4);
+ADD_MULTIBLOCK_ATOMIC_ADD_INST_REF_BY_ID(bhalf_t, float, float, 0, 0, 0, 4, 1);
+ADD_MULTIBLOCK_ATOMIC_ADD_INST_REF_BY_ID(bhalf_t, float, float, 0, 0, 0, 2, 1);
+ADD_MULTIBLOCK_ATOMIC_ADD_INST_REF_BY_ID(bhalf_t, float, float, 5, 0, 0, 4, 3); // for AVG
+ADD_MULTIBLOCK_ATOMIC_ADD_INST_REF_BY_ID(bhalf_t, float, float, 5, 0, 0, 4, 4);
+ADD_MULTIBLOCK_ATOMIC_ADD_INST_REF_BY_ID(bhalf_t, float, float, 5, 0, 0, 4, 1);
+ADD_MULTIBLOCK_ATOMIC_ADD_INST_REF_BY_ID(bhalf_t, float, float, 5, 0, 0, 2, 1);
+// clang-format on
+
+} // namespace device_reduce_instance
+} // namespace device
+} // namespace tensor_operation
+
+} // namespace ck
+
+#endif
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f16_f32_f32.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f16_f32_f32.hpp
index f1c53b9bce..f4c766ca03 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f16_f32_f32.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f16_f32_f32.hpp
@@ -13,9 +13,11 @@ namespace device_reduce_instance {
 // clang-format off
 // InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | NumReduceDim
 ADD_MULTIBLOCK_ATOMIC_ADD_INST_REF_BY_ID(half_t, float, float, 0, 0, 0, 4, 3); // for ADD
+ADD_MULTIBLOCK_ATOMIC_ADD_INST_REF_BY_ID(half_t, float, float, 0, 0, 0, 4, 4);
 ADD_MULTIBLOCK_ATOMIC_ADD_INST_REF_BY_ID(half_t, float, float, 0, 0, 0, 4, 1);
 ADD_MULTIBLOCK_ATOMIC_ADD_INST_REF_BY_ID(half_t, float, float, 0, 0, 0, 2, 1);
 ADD_MULTIBLOCK_ATOMIC_ADD_INST_REF_BY_ID(half_t, float, float, 5, 0, 0, 4, 3); // for AVG
+ADD_MULTIBLOCK_ATOMIC_ADD_INST_REF_BY_ID(half_t, float, float, 5, 0, 0, 4, 4);
 ADD_MULTIBLOCK_ATOMIC_ADD_INST_REF_BY_ID(half_t, float, float, 5, 0, 0, 4, 1);
 ADD_MULTIBLOCK_ATOMIC_ADD_INST_REF_BY_ID(half_t, float, float, 5, 0, 0, 2, 1);
 // clang-format on
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f32_f32_f32.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f32_f32_f32.hpp
index 07258be297..c2f2564fc9 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f32_f32_f32.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f32_f32_f32.hpp
@@ -13,9 +13,11 @@ namespace device_reduce_instance {
 // clang-format off
 // InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | NumReduceDim
 ADD_MULTIBLOCK_ATOMIC_ADD_INST_REF_BY_ID(float, float, float, 0, 0, 0, 4, 3); // for ADD
+ADD_MULTIBLOCK_ATOMIC_ADD_INST_REF_BY_ID(float, float, float, 0, 0, 0, 4, 4);
 ADD_MULTIBLOCK_ATOMIC_ADD_INST_REF_BY_ID(float, float, float, 0, 0, 0, 4, 1);
 ADD_MULTIBLOCK_ATOMIC_ADD_INST_REF_BY_ID(float, float, float, 0, 0, 0, 2, 1);
 ADD_MULTIBLOCK_ATOMIC_ADD_INST_REF_BY_ID(float, float, float, 5, 0, 0, 4, 3); // for AVG
+ADD_MULTIBLOCK_ATOMIC_ADD_INST_REF_BY_ID(float, float, float, 5, 0, 0, 4, 4);
 ADD_MULTIBLOCK_ATOMIC_ADD_INST_REF_BY_ID(float, float, float, 5, 0, 0, 4, 1);
 ADD_MULTIBLOCK_ATOMIC_ADD_INST_REF_BY_ID(float, float, float, 5, 0, 0, 2, 1);
 // clang-format on
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f32_f64_f32.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f32_f64_f32.hpp
index 7cd5bc778e..830dcf9407 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f32_f64_f32.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f32_f64_f32.hpp
@@ -13,9 +13,11 @@ namespace device_reduce_instance {
 // clang-format off
 // InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | NumReduceDim
 ADD_MULTIBLOCK_ATOMIC_ADD_INST_REF_BY_ID(float, double, float, 0, 0, 0, 4, 3); // for ADD
+ADD_MULTIBLOCK_ATOMIC_ADD_INST_REF_BY_ID(float, double, float, 0, 0, 0, 4, 4);
 ADD_MULTIBLOCK_ATOMIC_ADD_INST_REF_BY_ID(float, double, float, 0, 0, 0, 4, 1);
 ADD_MULTIBLOCK_ATOMIC_ADD_INST_REF_BY_ID(float, double, float, 0, 0, 0, 2, 1);
 ADD_MULTIBLOCK_ATOMIC_ADD_INST_REF_BY_ID(float, double, float, 5, 0, 0, 4, 3); // for AVG
+ADD_MULTIBLOCK_ATOMIC_ADD_INST_REF_BY_ID(float, double, float, 5, 0, 0, 4, 4);
 ADD_MULTIBLOCK_ATOMIC_ADD_INST_REF_BY_ID(float, double, float, 5, 0, 0, 4, 1);
 ADD_MULTIBLOCK_ATOMIC_ADD_INST_REF_BY_ID(float, double, float, 5, 0, 0, 2, 1);
 // clang-format on
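Note that the multiblock atomic-add headers instantiate only ReduceOpId 0 (ADD) and 5 (AVG). A plausible reading: each workgroup can commit its partial sum straight into the destination with an atomic add, so no workspace and no second call are needed, but that shortcut only exists for reductions whose cross-block combine is itself an addition; MIN/MAX/AMAX and NORM2 stay on the partial-reduce path. A toy host-side illustration of the idea with std::atomic follows; the real kernels use GPU atomics.

```cpp
#include <atomic>
#include <numeric>
#include <vector>

// Each "workgroup" folds its partial sum into the shared result with an
// atomic update. Valid for ADD/AVG because the cross-block combine is an
// addition; the commit order is nondeterministic, so floating-point
// results may differ slightly between runs.
void block_commit(std::atomic<double>& out, const std::vector<double>& slice)
{
    const double partial = std::accumulate(slice.begin(), slice.end(), 0.0);
    double cur = out.load();
    while(!out.compare_exchange_weak(cur, cur + partial)) {}
}
```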
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_partial_reduce_b16_f32_b16.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_partial_reduce_b16_f32_b16.hpp
new file mode 100644
index 0000000000..d25645ad1e
--- /dev/null
+++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_partial_reduce_b16_f32_b16.hpp
@@ -0,0 +1,60 @@
+#ifndef DEVICE_REDUCE_INSTANCE_MULTIBLOCK_PARTIAL_REDUCE_B16_F32_B16_HPP
+#define DEVICE_REDUCE_INSTANCE_MULTIBLOCK_PARTIAL_REDUCE_B16_F32_B16_HPP
+
+#include "reduction_enums.hpp"
+#include "reduction_operator_mapping.hpp"
+#include "device_reduce_instance_multiblock_partial_reduce.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace device_reduce_instance {
+
+// clang-format off
+// InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | NumReduceDim
+ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 0, 0, 0, 4, 3); // for ADD
+ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 0, 0, 0, 4, 4);
+ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 0, 0, 0, 4, 1);
+ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 0, 0, 0, 2, 1);
+ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 5, 0, 0, 4, 3); // for AVG
+ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 5, 0, 0, 4, 4);
+ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 5, 0, 0, 4, 1);
+ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 5, 0, 0, 2, 1);
+ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 7, 0, 0, 4, 3); // for NORM2
+ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 7, 0, 0, 4, 4);
+ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 7, 0, 0, 4, 1);
+ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 7, 0, 0, 2, 1);
+
+ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 2, 0, 0, 4, 3); // for MIN
+ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 2, 0, 0, 4, 4);
+ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 2, 0, 0, 4, 1);
+ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 2, 0, 0, 2, 1);
+ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 3, 0, 0, 4, 3); // for MAX
+ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 3, 0, 0, 4, 4);
+ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 3, 0, 0, 4, 1);
+ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 3, 0, 0, 2, 1);
+ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 4, 0, 0, 4, 3); // for AMAX
+ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 4, 0, 0, 4, 4);
+ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 4, 0, 0, 4, 1);
+ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 4, 0, 0, 2, 1);
+ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 2, 0, 1, 4, 3); // for MIN
+ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 2, 0, 1, 4, 4);
+ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 2, 0, 1, 4, 1);
+ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 2, 0, 1, 2, 1);
+ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 3, 0, 1, 4, 3); // for MAX
+ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 3, 0, 1, 4, 4);
+ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 3, 0, 1, 4, 1);
+ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 3, 0, 1, 2, 1);
+ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 4, 0, 1, 4, 3); // for AMAX
+ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 4, 0, 1, 4, 4);
+ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 4, 0, 1, 4, 1);
+ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 4, 0, 1, 2, 1);
+// clang-format on
+
+} // namespace device_reduce_instance
+} // namespace device
+} // namespace tensor_operation
+
+} // namespace ck
+
+#endif
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_partial_reduce_f16_f16_f16.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_partial_reduce_f16_f16_f16.hpp
index d58acf14ca..05549fc702 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_partial_reduce_f16_f16_f16.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_partial_reduce_f16_f16_f16.hpp
@@ -13,21 +13,27 @@ namespace device_reduce_instance {
 // clang-format off
 // InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | NumReduceDim
 ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(half_t, half_t, half_t, 2, 0, 0, 4, 3); // for MIN
+ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(half_t, half_t, half_t, 2, 0, 0, 4, 4);
 ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(half_t, half_t, half_t, 2, 0, 0, 4, 1);
 ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(half_t, half_t, half_t, 2, 0, 0, 2, 1);
 ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(half_t, half_t, half_t, 3, 0, 0, 4, 3); // for MAX
+ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(half_t, half_t, half_t, 3, 0, 0, 4, 4);
 ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(half_t, half_t, half_t, 3, 0, 0, 4, 1);
 ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(half_t, half_t, half_t, 3, 0, 0, 2, 1);
 ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(half_t, half_t, half_t, 4, 0, 0, 4, 3); // for AMAX
+ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(half_t, half_t, half_t, 4, 0, 0, 4, 4);
 ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(half_t, half_t, half_t, 4, 0, 0, 4, 1);
 ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(half_t, half_t, half_t, 4, 0, 0, 2, 1);
 ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(half_t, half_t, half_t, 2, 0, 1, 4, 3); // for MIN
+ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(half_t, half_t, half_t, 2, 0, 1, 4, 4);
 ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(half_t, half_t, half_t, 2, 0, 1, 4, 1);
 ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(half_t, half_t, half_t, 2, 0, 1, 2, 1);
 ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(half_t, half_t, half_t, 3, 0, 1, 4, 3); // for MAX
+ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(half_t, half_t, half_t, 3, 0, 1, 4, 4);
 ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(half_t, half_t, half_t, 3, 0, 1, 4, 1);
 ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(half_t, half_t, half_t, 3, 0, 1, 2, 1);
 ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(half_t, half_t, half_t, 4, 0, 1, 4, 3); // for AMAX
+ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(half_t, half_t, half_t, 4, 0, 1, 4, 4);
 ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(half_t, half_t, half_t, 4, 0, 1, 4, 1);
 ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(half_t, half_t, half_t, 4, 0, 1, 2, 1);
 // clang-format on
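In every file of this patch, the IndicesOpt column (sixth argument) is 1 only on MIN/MAX/AMAX rows: those are the operations for which the caller may request the index of the winning element alongside its value, so the accumulator conceptually carries a (value, index) pair. A hedged sketch of that accumulator, assuming a non-empty input and int32 indices:

```cpp
#include <cstdint>
#include <vector>

// Value-plus-index accumulator, as used conceptually when IndicesOpt == 1.
struct ValueIndex
{
    float value;
    int32_t index;
};

ValueIndex arg_max(const std::vector<float>& x) // assumes x is non-empty
{
    ValueIndex best{x[0], 0};
    for(int32_t i = 1; i < static_cast<int32_t>(x.size()); ++i)
        if(x[i] > best.value)
            best = ValueIndex{x[i], i};
    return best;
}
```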
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_partial_reduce_f16_f32_f16.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_partial_reduce_f16_f32_f16.hpp
index 54c5b853b1..3e4aaef51b 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_partial_reduce_f16_f32_f16.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_partial_reduce_f16_f32_f16.hpp
@@ -13,12 +13,15 @@ namespace device_reduce_instance {
 // clang-format off
 // InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | NumReduceDim
 ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(half_t, float, half_t, 0, 0, 0, 4, 3); // for ADD
+ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(half_t, float, half_t, 0, 0, 0, 4, 4);
 ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(half_t, float, half_t, 0, 0, 0, 4, 1);
 ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(half_t, float, half_t, 0, 0, 0, 2, 1);
 ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(half_t, float, half_t, 5, 0, 0, 4, 3); // for AVG
+ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(half_t, float, half_t, 5, 0, 0, 4, 4);
 ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(half_t, float, half_t, 5, 0, 0, 4, 1);
 ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(half_t, float, half_t, 5, 0, 0, 2, 1);
 ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(half_t, float, half_t, 7, 0, 0, 4, 3); // for NORM2
+ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(half_t, float, half_t, 7, 0, 0, 4, 4);
 ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(half_t, float, half_t, 7, 0, 0, 4, 1);
 ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(half_t, float, half_t, 7, 0, 0, 2, 1);
 // clang-format on
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_partial_reduce_f32_f32_f32.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_partial_reduce_f32_f32_f32.hpp
index f7f476abc1..2a1e4e7bf0 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_partial_reduce_f32_f32_f32.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_partial_reduce_f32_f32_f32.hpp
@@ -13,25 +13,32 @@ namespace device_reduce_instance {
 // clang-format off
 // InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | NumReduceDim
 ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(float, float, float, 2, 0, 0, 4, 3); // for MIN
+ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(float, float, float, 2, 0, 0, 4, 4);
 ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(float, float, float, 2, 0, 0, 4, 1);
 ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(float, float, float, 2, 0, 0, 2, 1);
 ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(float, float, float, 3, 0, 0, 4, 3); // for MAX
+ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(float, float, float, 3, 0, 0, 4, 4);
 ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(float, float, float, 3, 0, 0, 4, 1);
 ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(float, float, float, 3, 0, 0, 2, 1);
 ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(float, float, float, 4, 0, 0, 4, 3); // for AMAX
+ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(float, float, float, 4, 0, 0, 4, 4);
 ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(float, float, float, 4, 0, 0, 4, 1);
 ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(float, float, float, 4, 0, 0, 2, 1);
 ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(float, float, float, 2, 0, 1, 4, 3); // for MIN
+ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(float, float, float, 2, 0, 1, 4, 4);
 ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(float, float, float, 2, 0, 1, 4, 1);
 ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(float, float, float, 2, 0, 1, 2, 1);
 ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(float, float, float, 3, 0, 1, 4, 3); // for MAX
+ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(float, float, float, 3, 0, 1, 4, 4);
 ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(float, float, float, 3, 0, 1, 4, 1);
 ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(float, float, float, 3, 0, 1, 2, 1);
 ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(float, float, float, 4, 0, 1, 4, 3); // for AMAX
+ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(float, float, float, 4, 0, 1, 4, 4);
 ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(float, float, float, 4, 0, 1, 4, 1);
 ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(float, float, float, 4, 0, 1, 2, 1);
 ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(float, float, float, 7, 0, 0, 4, 3); // for NORM2
+ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(float, float, float, 7, 0, 0, 4, 4);
 ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(float, float, float, 7, 0, 0, 4, 1);
 ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(float, float, float, 7, 0, 0, 2, 1);
 // clang-format on
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_partial_reduce_f32_f64_f32.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_partial_reduce_f32_f64_f32.hpp
index 86455fd913..f95e3001ee 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_partial_reduce_f32_f64_f32.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_partial_reduce_f32_f64_f32.hpp
@@ -13,6 +13,7 @@ namespace device_reduce_instance {
 // clang-format off
 // InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | NumReduceDim
 ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(float, double, float, 7, 0, 0, 4, 3); // for NORM2
+ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(float, double, float, 7, 0, 0, 4, 4);
 ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(float, double, float, 7, 0, 0, 4, 1);
 ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(float, double, float, 7, 0, 0, 2, 1);
 // clang-format on
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_partial_reduce_f64_f64_f64.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_partial_reduce_f64_f64_f64.hpp
index 55b69257b6..fac65128b6 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_partial_reduce_f64_f64_f64.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_partial_reduce_f64_f64_f64.hpp
@@ -13,33 +13,42 @@ namespace device_reduce_instance {
 // clang-format off
 // InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | NumReduceDim
 ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(double, double, double, 2, 0, 0, 4, 3); // for MIN
+ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(double, double, double, 2, 0, 0, 4, 4);
 ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(double, double, double, 2, 0, 0, 4, 1);
 ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(double, double, double, 2, 0, 0, 2, 1);
 ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(double, double, double, 3, 0, 0, 4, 3); // for MAX
+ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(double, double, double, 3, 0, 0, 4, 4);
 ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(double, double, double, 3, 0, 0, 4, 1);
 ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(double, double, double, 3, 0, 0, 2, 1);
 ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(double, double, double, 4, 0, 0, 4, 3); // for AMAX
+ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(double, double, double, 4, 0, 0, 4, 4);
 ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(double, double, double, 4, 0, 0, 4, 1);
 ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(double, double, double, 4, 0, 0, 2, 1);
 ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(double, double, double, 2, 0, 1, 4, 3); // for MIN
+ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(double, double, double, 2, 0, 1, 4, 4);
 ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(double, double, double, 2, 0, 1, 4, 1);
 ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(double, double, double, 2, 0, 1, 2, 1);
 ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(double, double, double, 3, 0, 1, 4, 3); // for MAX
+ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(double, double, double, 3, 0, 1, 4, 4);
 ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(double, double, double, 3, 0, 1, 4, 1);
 ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(double, double, double, 3, 0, 1, 2, 1);
 ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(double, double, double, 4, 0, 1, 4, 3); // for AMAX
+ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(double, double, double, 4, 0, 1, 4, 4);
 ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(double, double, double, 4, 0, 1, 4, 1);
 ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(double, double, double, 4, 0, 1, 2, 1);
 ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(double, double, double, 7, 0, 0, 4, 3); // for NORM2
+ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(double, double, double, 7, 0, 0, 4, 4);
 ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(double, double, double, 7, 0, 0, 4, 1);
 ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(double, double, double, 7, 0, 0, 2, 1);
 // Will be moved to use MultiBlockAtomicAdd
 ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(double, double, double, 0, 0, 0, 4, 3); // for ADD
+ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(double, double, double, 0, 0, 0, 4, 4);
 ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(double, double, double, 0, 0, 0, 4, 1);
 ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(double, double, double, 0, 0, 0, 2, 1);
 ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(double, double, double, 5, 0, 0, 4, 3); // for AVG
+ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(double, double, double, 5, 0, 0, 4, 4);
 ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(double, double, double, 5, 0, 0, 4, 1);
 ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(double, double, double, 5, 0, 0, 2, 1);
 // clang-format on
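The int8 instance files added next pair int8_t data with an int32_t accumulator for ADD and AVG, while MIN/MAX/AMAX keep an int8_t accumulator (a minimum or maximum never leaves the input's range). The reason for the wide accumulator is overflow, as sketched below:

```cpp
#include <cstdint>
#include <vector>

// Why ADD/AVG over int8 accumulate in int32: even two int8 values can
// already overflow the int8 range [-128, 127] when summed.
int32_t sum_i8(const std::vector<int8_t>& x)
{
    int32_t acc = 0; // wide accumulator, mirroring the i8_i32_i8 instances
    for(const int8_t v : x)
        acc += v; // safe for roughly 16 million worst-case elements in int32
    return acc;
}
```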
DEVICE_REDUCE_INSTANCE_MULTIBLOCK_PARTIAL_REDUCE_I8_I32_I8_HPP +#define DEVICE_REDUCE_INSTANCE_MULTIBLOCK_PARTIAL_REDUCE_I8_I32_I8_HPP + +#include "reduction_enums.hpp" +#include "reduction_operator_mapping.hpp" +#include "device_reduce_instance_multiblock_partial_reduce.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace device_reduce_instance { + +// clang-format off +// InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | NumReduceDim +ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(int8_t, int32_t, int8_t, 0, 0, 0, 4, 3); // for ADD +ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(int8_t, int32_t, int8_t, 0, 0, 0, 4, 4); +ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(int8_t, int32_t, int8_t, 0, 0, 0, 4, 1); +ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(int8_t, int32_t, int8_t, 0, 0, 0, 2, 1); +ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(int8_t, int32_t, int8_t, 5, 0, 0, 4, 3); // for AVG +ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(int8_t, int32_t, int8_t, 5, 0, 0, 4, 4); +ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(int8_t, int32_t, int8_t, 5, 0, 0, 4, 1); +ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(int8_t, int32_t, int8_t, 5, 0, 0, 2, 1); +// clang-format on + +} // namespace device_reduce_instance +} // namespace device +} // namespace tensor_operation + +} // namespace ck + +#endif diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_partial_reduce_i8_i8_i8.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_partial_reduce_i8_i8_i8.hpp new file mode 100644 index 0000000000..d6bee57fcd --- /dev/null +++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_partial_reduce_i8_i8_i8.hpp @@ -0,0 +1,47 @@ +#ifndef DEVICE_REDUCE_INSTANCE_MULTIBLOCK_PARTIAL_REDUCE_I8_I8_I8_HPP +#define DEVICE_REDUCE_INSTANCE_MULTIBLOCK_PARTIAL_REDUCE_I8_I8_I8_HPP + +#include "reduction_enums.hpp" +#include "reduction_operator_mapping.hpp" +#include "device_reduce_instance_multiblock_partial_reduce.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace device_reduce_instance { + +// clang-format off +// InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | NumReduceDim +ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(int8_t, int8_t, int8_t, 2, 0, 0, 4, 3); // for MIN +ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(int8_t, int8_t, int8_t, 2, 0, 0, 4, 4); +ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(int8_t, int8_t, int8_t, 2, 0, 0, 4, 1); +ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(int8_t, int8_t, int8_t, 2, 0, 0, 2, 1); +ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(int8_t, int8_t, int8_t, 3, 0, 0, 4, 3); // for MAX +ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(int8_t, int8_t, int8_t, 3, 0, 0, 4, 4); +ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(int8_t, int8_t, int8_t, 3, 0, 0, 4, 1); +ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(int8_t, int8_t, int8_t, 3, 0, 0, 2, 1); +ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(int8_t, int8_t, int8_t, 4, 0, 0, 4, 3); // for AMAX +ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(int8_t, int8_t, int8_t, 4, 0, 0, 4, 4); +ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(int8_t, int8_t, int8_t, 4, 0, 0, 4, 1); +ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(int8_t, int8_t, int8_t, 4, 0, 0, 2, 1); +ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(int8_t, int8_t, int8_t, 2, 0, 1, 4, 3); // for MIN 
+ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(int8_t, int8_t, int8_t, 2, 0, 1, 4, 4);
+ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(int8_t, int8_t, int8_t, 2, 0, 1, 4, 1);
+ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(int8_t, int8_t, int8_t, 2, 0, 1, 2, 1);
+ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(int8_t, int8_t, int8_t, 3, 0, 1, 4, 3); // for MAX
+ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(int8_t, int8_t, int8_t, 3, 0, 1, 4, 4);
+ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(int8_t, int8_t, int8_t, 3, 0, 1, 4, 1);
+ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(int8_t, int8_t, int8_t, 3, 0, 1, 2, 1);
+ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(int8_t, int8_t, int8_t, 4, 0, 1, 4, 3); // for AMAX
+ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(int8_t, int8_t, int8_t, 4, 0, 1, 4, 4);
+ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(int8_t, int8_t, int8_t, 4, 0, 1, 4, 1);
+ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(int8_t, int8_t, int8_t, 4, 0, 1, 2, 1);
+// clang-format on
+
+} // namespace device_reduce_instance
+} // namespace device
+} // namespace tensor_operation
+
+} // namespace ck
+
+#endif
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise.hpp
index 3321791207..9371672a54 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise.hpp
@@ -17,7 +17,6 @@ using reduce_configuration_2_instances_threadwise = std::tuple<
 ReductionConfiguration_2<0, 2, 2, 2, 1>,
 ReductionConfiguration_2<0, 1, 1, 2, 1>,
 ReductionConfiguration_2<1, 2, 1, 1, 2>,
-ReductionConfiguration_2<1, 2, 2, 1, 2>,
 ReductionConfiguration_2<0, 1, 1, 3, 1>,
 ReductionConfiguration_2<1, 1, 1, 1, 3>
 // clang-format on
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_b16_f32_b16.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_b16_f32_b16.hpp
new file mode 100644
index 0000000000..f11d9118c9
--- /dev/null
+++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_b16_f32_b16.hpp
@@ -0,0 +1,60 @@
+#ifndef DEVICE_REDUCE_INSTANCE_THREADWISE_B16_F32_B16_HPP
+#define DEVICE_REDUCE_INSTANCE_THREADWISE_B16_F32_B16_HPP
+
+#include "reduction_enums.hpp"
+#include "reduction_operator_mapping.hpp"
+#include "device_reduce_instance_threadwise.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace device_reduce_instance {
+
+// clang-format off
+// InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | NumReduceDim
+ADD_THREADWISE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 0, 0, 0, 4, 3); // for ADD
+ADD_THREADWISE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 0, 0, 0, 4, 4);
+ADD_THREADWISE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 0, 0, 0, 4, 1);
+ADD_THREADWISE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 0, 0, 0, 2, 1);
+ADD_THREADWISE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 5, 0, 0, 4, 3); // for AVG
+ADD_THREADWISE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 5, 0, 0, 4, 4);
+ADD_THREADWISE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 5, 0, 0, 4, 1);
+ADD_THREADWISE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 5, 0, 0, 2, 1);
+ADD_THREADWISE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 7, 0, 0, 4, 3); // for NORM2
+ADD_THREADWISE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 7, 0, 0, 4, 4);
+ADD_THREADWISE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 7, 0, 0, 4, 1);
+ADD_THREADWISE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 7, 0, 0, 2, 1);
+
+ADD_THREADWISE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 2, 0, 0, 4, 3); // for MIN
+ADD_THREADWISE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 2, 0, 0, 4, 4);
+ADD_THREADWISE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 2, 0, 0, 4, 1);
+ADD_THREADWISE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 2, 0, 0, 2, 1);
+ADD_THREADWISE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 3, 0, 0, 4, 3); // for MAX
+ADD_THREADWISE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 3, 0, 0, 4, 4);
+ADD_THREADWISE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 3, 0, 0, 4, 1);
+ADD_THREADWISE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 3, 0, 0, 2, 1);
+ADD_THREADWISE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 4, 0, 0, 4, 3); // for AMAX
+ADD_THREADWISE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 4, 0, 0, 4, 4);
+ADD_THREADWISE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 4, 0, 0, 4, 1);
+ADD_THREADWISE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 4, 0, 0, 2, 1);
+ADD_THREADWISE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 2, 0, 1, 4, 3); // for MIN
+ADD_THREADWISE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 2, 0, 1, 4, 4);
+ADD_THREADWISE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 2, 0, 1, 4, 1);
+ADD_THREADWISE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 2, 0, 1, 2, 1);
+ADD_THREADWISE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 3, 0, 1, 4, 3); // for MAX
+ADD_THREADWISE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 3, 0, 1, 4, 4);
+ADD_THREADWISE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 3, 0, 1, 4, 1);
+ADD_THREADWISE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 3, 0, 1, 2, 1);
+ADD_THREADWISE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 4, 0, 1, 4, 3); // for AMAX
+ADD_THREADWISE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 4, 0, 1, 4, 4);
+ADD_THREADWISE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 4, 0, 1, 4, 1);
+ADD_THREADWISE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 4, 0, 1, 2, 1);
+// clang-format on
+
+} // namespace device_reduce_instance
+} // namespace device
+} // namespace tensor_operation
+
+} // namespace ck
+
+#endif
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f16_f16_f16.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f16_f16_f16.hpp
index 5d8a037cb4..fe220335c5 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f16_f16_f16.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f16_f16_f16.hpp
@@ -13,21 +13,27 @@ namespace device_reduce_instance {
 // clang-format off
 // InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | NumReduceDim
 ADD_THREADWISE_INST_REF_BY_ID(half_t, half_t, half_t, 2, 0, 0, 4, 3); // for MIN
+ADD_THREADWISE_INST_REF_BY_ID(half_t, half_t, half_t, 2, 0, 0, 4, 4);
 ADD_THREADWISE_INST_REF_BY_ID(half_t, half_t, half_t, 2, 0, 0, 4, 1);
 ADD_THREADWISE_INST_REF_BY_ID(half_t, half_t, half_t, 2, 0, 0, 2, 1);
 ADD_THREADWISE_INST_REF_BY_ID(half_t, half_t, half_t, 3, 0, 0, 4, 3); // for MAX
+ADD_THREADWISE_INST_REF_BY_ID(half_t, half_t, half_t, 3, 0, 0, 4, 4);
 ADD_THREADWISE_INST_REF_BY_ID(half_t, half_t, half_t, 3, 0, 0, 4, 1);
 ADD_THREADWISE_INST_REF_BY_ID(half_t, half_t, half_t, 3, 0, 0, 2, 1);
 ADD_THREADWISE_INST_REF_BY_ID(half_t, half_t, half_t, 4, 0, 0, 4, 3); // for AMAX
+ADD_THREADWISE_INST_REF_BY_ID(half_t, half_t, half_t, 4, 0, 0, 4, 4);
 ADD_THREADWISE_INST_REF_BY_ID(half_t, half_t, half_t, 4, 0, 0, 4, 1);
 ADD_THREADWISE_INST_REF_BY_ID(half_t, half_t, half_t, 4, 0, 0, 2, 1);
 ADD_THREADWISE_INST_REF_BY_ID(half_t, half_t, half_t, 2, 0, 1, 4, 3); // for MIN
+ADD_THREADWISE_INST_REF_BY_ID(half_t, half_t, half_t, 2, 0, 1, 4, 4);
 ADD_THREADWISE_INST_REF_BY_ID(half_t, half_t, half_t, 2, 0, 1, 4, 1);
 ADD_THREADWISE_INST_REF_BY_ID(half_t, half_t, half_t, 2, 0, 1, 2, 1);
 ADD_THREADWISE_INST_REF_BY_ID(half_t, half_t, half_t, 3, 0, 1, 4, 3); // for MAX
+ADD_THREADWISE_INST_REF_BY_ID(half_t, half_t, half_t, 3, 0, 1, 4, 4);
 ADD_THREADWISE_INST_REF_BY_ID(half_t, half_t, half_t, 3, 0, 1, 4, 1);
 ADD_THREADWISE_INST_REF_BY_ID(half_t, half_t, half_t, 3, 0, 1, 2, 1);
 ADD_THREADWISE_INST_REF_BY_ID(half_t, half_t, half_t, 4, 0, 1, 4, 3); // for AMAX
+ADD_THREADWISE_INST_REF_BY_ID(half_t, half_t, half_t, 4, 0, 1, 4, 4);
 ADD_THREADWISE_INST_REF_BY_ID(half_t, half_t, half_t, 4, 0, 1, 4, 1);
 ADD_THREADWISE_INST_REF_BY_ID(half_t, half_t, half_t, 4, 0, 1, 2, 1);
 // clang-format on
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f16_f32_f16.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f16_f32_f16.hpp
index 8a50074054..970559cfac 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f16_f32_f16.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f16_f32_f16.hpp
@@ -13,12 +13,15 @@ namespace device_reduce_instance {
 // clang-format off
 // InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | NumReduceDim
 ADD_THREADWISE_INST_REF_BY_ID(half_t, float, half_t, 0, 0, 0, 4, 3); // for ADD
+ADD_THREADWISE_INST_REF_BY_ID(half_t, float, half_t, 0, 0, 0, 4, 4);
 ADD_THREADWISE_INST_REF_BY_ID(half_t, float, half_t, 0, 0, 0, 4, 1);
 ADD_THREADWISE_INST_REF_BY_ID(half_t, float, half_t, 0, 0, 0, 2, 1);
 ADD_THREADWISE_INST_REF_BY_ID(half_t, float, half_t, 5, 0, 0, 4, 3); // for AVG
+ADD_THREADWISE_INST_REF_BY_ID(half_t, float, half_t, 5, 0, 0, 4, 4);
 ADD_THREADWISE_INST_REF_BY_ID(half_t, float, half_t, 5, 0, 0, 4, 1);
 ADD_THREADWISE_INST_REF_BY_ID(half_t, float, half_t, 5, 0, 0, 2, 1);
 ADD_THREADWISE_INST_REF_BY_ID(half_t, float, half_t, 7, 0, 0, 4, 3); // for NORM2
+ADD_THREADWISE_INST_REF_BY_ID(half_t, float, half_t, 7, 0, 0, 4, 4);
 ADD_THREADWISE_INST_REF_BY_ID(half_t, float, half_t, 7, 0, 0, 4, 1);
 ADD_THREADWISE_INST_REF_BY_ID(half_t, float, half_t, 7, 0, 0, 2, 1);
 // clang-format on
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f32_f32.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f32_f32.hpp
index 2ad2535523..66c33a72a4 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f32_f32.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f32_f32.hpp
@@ -13,30 +13,39 @@ namespace device_reduce_instance {
 // clang-format off
 // InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | NumReduceDim
 ADD_THREADWISE_INST_REF_BY_ID(float, float, float, 0, 0, 0, 4, 3); // for ADD
+ADD_THREADWISE_INST_REF_BY_ID(float, float, float, 0, 0, 0, 4, 4);
 ADD_THREADWISE_INST_REF_BY_ID(float, float, float, 0, 0, 0, 4, 1);
 ADD_THREADWISE_INST_REF_BY_ID(float, float, float, 0, 0, 0, 2, 1);
 ADD_THREADWISE_INST_REF_BY_ID(float, float, float, 5, 0, 0, 4, 3); // for AVG
+ADD_THREADWISE_INST_REF_BY_ID(float, float, float, 5, 0, 0, 4, 4);
 ADD_THREADWISE_INST_REF_BY_ID(float, float, float, 5, 0, 0, 4, 1);
 ADD_THREADWISE_INST_REF_BY_ID(float, float, float, 5, 0, 0, 2, 1);
 ADD_THREADWISE_INST_REF_BY_ID(float, float, float, 7, 0, 0, 4, 3); // for NORM2
+ADD_THREADWISE_INST_REF_BY_ID(float, float, float, 7, 0, 0, 4, 4);
 ADD_THREADWISE_INST_REF_BY_ID(float, float, float, 7, 0, 0, 4, 1);
 ADD_THREADWISE_INST_REF_BY_ID(float, float, float, 7, 0, 0, 2, 1);
 ADD_THREADWISE_INST_REF_BY_ID(float, float, float, 2, 0, 0, 4, 3); // for MIN
+ADD_THREADWISE_INST_REF_BY_ID(float, float, float, 2, 0, 0, 4, 4);
 ADD_THREADWISE_INST_REF_BY_ID(float, float, float, 2, 0, 0, 4, 1);
 ADD_THREADWISE_INST_REF_BY_ID(float, float, float, 2, 0, 0, 2, 1);
 ADD_THREADWISE_INST_REF_BY_ID(float, float, float, 3, 0, 0, 4, 3); // for MAX
+ADD_THREADWISE_INST_REF_BY_ID(float, float, float, 3, 0, 0, 4, 4);
 ADD_THREADWISE_INST_REF_BY_ID(float, float, float, 3, 0, 0, 4, 1);
 ADD_THREADWISE_INST_REF_BY_ID(float, float, float, 3, 0, 0, 2, 1);
 ADD_THREADWISE_INST_REF_BY_ID(float, float, float, 4, 0, 0, 4, 3); // for AMAX
+ADD_THREADWISE_INST_REF_BY_ID(float, float, float, 4, 0, 0, 4, 4);
 ADD_THREADWISE_INST_REF_BY_ID(float, float, float, 4, 0, 0, 4, 1);
 ADD_THREADWISE_INST_REF_BY_ID(float, float, float, 4, 0, 0, 2, 1);
 ADD_THREADWISE_INST_REF_BY_ID(float, float, float, 2, 0, 1, 4, 3); // for MIN
+ADD_THREADWISE_INST_REF_BY_ID(float, float, float, 2, 0, 1, 4, 4);
 ADD_THREADWISE_INST_REF_BY_ID(float, float, float, 2, 0, 1, 4, 1);
 ADD_THREADWISE_INST_REF_BY_ID(float, float, float, 2, 0, 1, 2, 1);
 ADD_THREADWISE_INST_REF_BY_ID(float, float, float, 3, 0, 1, 4, 3); // for MAX
+ADD_THREADWISE_INST_REF_BY_ID(float, float, float, 3, 0, 1, 4, 4);
 ADD_THREADWISE_INST_REF_BY_ID(float, float, float, 3, 0, 1, 4, 1);
 ADD_THREADWISE_INST_REF_BY_ID(float, float, float, 3, 0, 1, 2, 1);
 ADD_THREADWISE_INST_REF_BY_ID(float, float, float, 4, 0, 1, 4, 3); // for AMAX
+ADD_THREADWISE_INST_REF_BY_ID(float, float, float, 4, 0, 1, 4, 4);
 ADD_THREADWISE_INST_REF_BY_ID(float, float, float, 4, 0, 1, 4, 1);
 ADD_THREADWISE_INST_REF_BY_ID(float, float, float, 4, 0, 1, 2, 1);
 // clang-format on
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f64_f32.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f64_f32.hpp
index 2dca1e40df..196f142dbf 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f64_f32.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f64_f32.hpp
@@ -13,12 +13,15 @@ namespace device_reduce_instance {
 // clang-format off
 // InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | NumReduceDim
 ADD_THREADWISE_INST_REF_BY_ID(float, double, float, 0, 0, 0, 4, 3); // for ADD
+ADD_THREADWISE_INST_REF_BY_ID(float, double, float, 0, 0, 0, 4, 4);
 ADD_THREADWISE_INST_REF_BY_ID(float, double, float, 0, 0, 0, 4, 1);
 ADD_THREADWISE_INST_REF_BY_ID(float, double, float, 0, 0, 0, 2, 1);
 ADD_THREADWISE_INST_REF_BY_ID(float, double, float, 5, 0, 0, 4, 3); // for AVG
+ADD_THREADWISE_INST_REF_BY_ID(float, double, float, 5, 0, 0, 4, 4);
 ADD_THREADWISE_INST_REF_BY_ID(float, double, float, 5, 0, 0, 4, 1);
 ADD_THREADWISE_INST_REF_BY_ID(float, double, float, 5, 0, 0, 2, 1);
 ADD_THREADWISE_INST_REF_BY_ID(float, double, float, 7, 0, 0, 4, 3); // for NORM2
+ADD_THREADWISE_INST_REF_BY_ID(float, double, float, 7, 0, 0, 4, 4);
 ADD_THREADWISE_INST_REF_BY_ID(float, double, float, 7, 0, 0, 4, 1);
 ADD_THREADWISE_INST_REF_BY_ID(float, double, float, 7, 0, 0, 2, 1);
 // clang-format on
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f64_f64_f64.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f64_f64_f64.hpp
index 8fcfaa38f8..4f3e1448d0 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f64_f64_f64.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f64_f64_f64.hpp
@@ -13,30 +13,39 @@ namespace device_reduce_instance {
 // clang-format off
 // InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | NumReduceDim
 ADD_THREADWISE_INST_REF_BY_ID(double, double, double, 0, 0, 0, 4, 3); // for ADD
+ADD_THREADWISE_INST_REF_BY_ID(double, double, double, 0, 0, 0, 4, 4);
 ADD_THREADWISE_INST_REF_BY_ID(double, double, double, 0, 0, 0, 4, 1);
 ADD_THREADWISE_INST_REF_BY_ID(double, double, double, 0, 0, 0, 2, 1);
 ADD_THREADWISE_INST_REF_BY_ID(double, double, double, 5, 0, 0, 4, 3); // for AVG
+ADD_THREADWISE_INST_REF_BY_ID(double, double, double, 5, 0, 0, 4, 4);
 ADD_THREADWISE_INST_REF_BY_ID(double, double, double, 5, 0, 0, 4, 1);
 ADD_THREADWISE_INST_REF_BY_ID(double, double, double, 5, 0, 0, 2, 1);
 ADD_THREADWISE_INST_REF_BY_ID(double, double, double, 7, 0, 0, 4, 3); // for NORM2
+ADD_THREADWISE_INST_REF_BY_ID(double, double, double, 7, 0, 0, 4, 4);
 ADD_THREADWISE_INST_REF_BY_ID(double, double, double, 7, 0, 0, 4, 1);
 ADD_THREADWISE_INST_REF_BY_ID(double, double, double, 7, 0, 0, 2, 1);
 ADD_THREADWISE_INST_REF_BY_ID(double, double, double, 2, 0, 0, 4, 3); // for MIN
+ADD_THREADWISE_INST_REF_BY_ID(double, double, double, 2, 0, 0, 4, 4);
 ADD_THREADWISE_INST_REF_BY_ID(double, double, double, 2, 0, 0, 4, 1);
 ADD_THREADWISE_INST_REF_BY_ID(double, double, double, 2, 0, 0, 2, 1);
 ADD_THREADWISE_INST_REF_BY_ID(double, double, double, 3, 0, 0, 4, 3); // for MAX
+ADD_THREADWISE_INST_REF_BY_ID(double, double, double, 3, 0, 0, 4, 4);
 ADD_THREADWISE_INST_REF_BY_ID(double, double, double, 3, 0, 0, 4, 1);
 ADD_THREADWISE_INST_REF_BY_ID(double, double, double, 3, 0, 0, 2, 1);
 ADD_THREADWISE_INST_REF_BY_ID(double, double, double, 4, 0, 0, 4, 3); // for AMAX
+ADD_THREADWISE_INST_REF_BY_ID(double, double, double, 4, 0, 0, 4, 4);
 ADD_THREADWISE_INST_REF_BY_ID(double, double, double, 4, 0, 0, 4, 1);
 ADD_THREADWISE_INST_REF_BY_ID(double, double, double, 4, 0, 0, 2, 1);
 ADD_THREADWISE_INST_REF_BY_ID(double, double, double, 2, 0, 1, 4, 3); // for MIN
+ADD_THREADWISE_INST_REF_BY_ID(double, double, double, 2, 0, 1, 4, 4);
 ADD_THREADWISE_INST_REF_BY_ID(double, double, double, 2, 0, 1, 4, 1);
 ADD_THREADWISE_INST_REF_BY_ID(double, double, double, 2, 0, 1, 2, 1);
 ADD_THREADWISE_INST_REF_BY_ID(double, double, double, 3, 0, 1, 4, 3); // for MAX
+ADD_THREADWISE_INST_REF_BY_ID(double, double, double, 3, 0, 1, 4, 4);
 ADD_THREADWISE_INST_REF_BY_ID(double, double, double, 3, 0, 1, 4, 1);
 ADD_THREADWISE_INST_REF_BY_ID(double, double, double, 3, 0, 1, 2, 1);
 ADD_THREADWISE_INST_REF_BY_ID(double, double, double, 4, 0, 1, 4, 3); // for AMAX
+ADD_THREADWISE_INST_REF_BY_ID(double, double, double, 4, 0, 1, 4, 4);
 ADD_THREADWISE_INST_REF_BY_ID(double, double, double, 4, 0, 1, 4, 1);
 ADD_THREADWISE_INST_REF_BY_ID(double, double, double, 4, 0, 1, 2, 1);
 // clang-format on
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_i8_i32_i8.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_i8_i32_i8.hpp
new file mode 100644
index 0000000000..8f19a5d0a2
--- /dev/null
+++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_i8_i32_i8.hpp
@@ -0,0 +1,31 @@
+#ifndef DEVICE_REDUCE_INSTANCE_THREADWISE_I8_I32_I8_HPP
+#define DEVICE_REDUCE_INSTANCE_THREADWISE_I8_I32_I8_HPP
+
+#include "reduction_enums.hpp"
+#include "reduction_operator_mapping.hpp"
+#include "device_reduce_instance_threadwise.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace device_reduce_instance {
+
+// clang-format off
+// InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | NumReduceDim
+ADD_THREADWISE_INST_REF_BY_ID(int8_t, int32_t, int8_t, 0, 0, 0, 4, 3); // for ADD
+ADD_THREADWISE_INST_REF_BY_ID(int8_t, int32_t, int8_t, 0, 0, 0, 4, 4);
+ADD_THREADWISE_INST_REF_BY_ID(int8_t, int32_t, int8_t, 0, 0, 0, 4, 1);
+ADD_THREADWISE_INST_REF_BY_ID(int8_t, int32_t, int8_t, 0, 0, 0, 2, 1);
+ADD_THREADWISE_INST_REF_BY_ID(int8_t, int32_t, int8_t, 5, 0, 0, 4, 3); // for AVG
+ADD_THREADWISE_INST_REF_BY_ID(int8_t, int32_t, int8_t, 5, 0, 0, 4, 4);
+ADD_THREADWISE_INST_REF_BY_ID(int8_t, int32_t, int8_t, 5, 0, 0, 4, 1);
+ADD_THREADWISE_INST_REF_BY_ID(int8_t, int32_t, int8_t, 5, 0, 0, 2, 1);
+// clang-format on
+
+} // namespace device_reduce_instance
+} // namespace device
+} // namespace tensor_operation
+
+} // namespace ck
+
+#endif
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_i8_i8_i8.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_i8_i8_i8.hpp
new file mode 100644
index 0000000000..83bd48cd3f
--- /dev/null
+++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_i8_i8_i8.hpp
@@ -0,0 +1,47 @@
+#ifndef DEVICE_REDUCE_INSTANCE_THREADWISE_I8_I8_I8_HPP
+#define DEVICE_REDUCE_INSTANCE_THREADWISE_I8_I8_I8_HPP
+
+#include "reduction_enums.hpp"
+#include "reduction_operator_mapping.hpp"
+#include "device_reduce_instance_threadwise.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace device_reduce_instance {
+
+// clang-format off
+// InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | NumReduceDim
+ADD_THREADWISE_INST_REF_BY_ID(int8_t, int8_t, int8_t, 2, 0, 0, 4, 3); // for MIN
+ADD_THREADWISE_INST_REF_BY_ID(int8_t, int8_t, int8_t, 2, 0, 0, 4, 4);
+ADD_THREADWISE_INST_REF_BY_ID(int8_t, int8_t, int8_t, 2, 0, 0, 4, 1);
+ADD_THREADWISE_INST_REF_BY_ID(int8_t, int8_t, int8_t, 2, 0, 0, 2, 1);
+ADD_THREADWISE_INST_REF_BY_ID(int8_t, int8_t, int8_t, 3, 0, 0, 4, 3); // for MAX
+ADD_THREADWISE_INST_REF_BY_ID(int8_t, int8_t, int8_t, 3, 0, 0, 4, 4);
+ADD_THREADWISE_INST_REF_BY_ID(int8_t, int8_t, int8_t, 3, 0, 0, 4, 1);
+ADD_THREADWISE_INST_REF_BY_ID(int8_t, int8_t, int8_t, 3, 0, 0, 2, 1);
+ADD_THREADWISE_INST_REF_BY_ID(int8_t, int8_t, int8_t, 4, 0, 0, 4, 3); // for AMAX
+ADD_THREADWISE_INST_REF_BY_ID(int8_t, int8_t, int8_t, 4, 0, 0, 4, 4);
+ADD_THREADWISE_INST_REF_BY_ID(int8_t, int8_t, int8_t, 4, 0, 0, 4, 1);
+ADD_THREADWISE_INST_REF_BY_ID(int8_t, int8_t, int8_t, 4, 0, 0, 2, 1);
+ADD_THREADWISE_INST_REF_BY_ID(int8_t, int8_t, int8_t, 2, 0, 1, 4, 3); // for MIN
+ADD_THREADWISE_INST_REF_BY_ID(int8_t, int8_t, int8_t, 2, 0, 1, 4, 4);
+ADD_THREADWISE_INST_REF_BY_ID(int8_t, int8_t, int8_t, 2, 0, 1, 4, 1);
+ADD_THREADWISE_INST_REF_BY_ID(int8_t, int8_t, int8_t, 2, 0, 1, 2, 1);
+ADD_THREADWISE_INST_REF_BY_ID(int8_t, int8_t, int8_t, 3, 0, 1, 4, 3); // for MAX
+ADD_THREADWISE_INST_REF_BY_ID(int8_t, int8_t, int8_t, 3, 0, 1, 4, 4);
+ADD_THREADWISE_INST_REF_BY_ID(int8_t, int8_t, int8_t, 3, 0, 1, 4, 1);
+ADD_THREADWISE_INST_REF_BY_ID(int8_t, int8_t, int8_t, 3, 0, 1, 2, 1);
+ADD_THREADWISE_INST_REF_BY_ID(int8_t, int8_t, int8_t, 4, 0, 1, 4, 3); // for AMAX
+ADD_THREADWISE_INST_REF_BY_ID(int8_t, int8_t, int8_t, 4, 0, 1, 4, 4);
+ADD_THREADWISE_INST_REF_BY_ID(int8_t, int8_t, int8_t, 4, 0, 1, 4, 1);
+ADD_THREADWISE_INST_REF_BY_ID(int8_t, int8_t, int8_t, 4, 0, 1, 2, 1);
+// clang-format on
+
+} // namespace device_reduce_instance
+} // namespace device
+} // namespace tensor_operation
+
+} // namespace ck
+
+#endif
diff --git a/library/src/tensor_operation_instance/gpu/reduce/CMakeLists.txt b/library/src/tensor_operation_instance/gpu/reduce/CMakeLists.txt
index c64d8b1361..cced3a4b76 100644
--- a/library/src/tensor_operation_instance/gpu/reduce/CMakeLists.txt
+++ b/library/src/tensor_operation_instance/gpu/reduce/CMakeLists.txt
@@ -5,24 +5,37 @@ set(DEVICE_REDUCE_INSTANCE_SOURCE
 device_reduce_instance_blockwise_f32_f32_f32.cpp;
 device_reduce_instance_blockwise_f32_f64_f32.cpp;
 device_reduce_instance_blockwise_f64_f64_f64.cpp;
+device_reduce_instance_blockwise_i8_i32_i8.cpp;
+device_reduce_instance_blockwise_i8_i8_i8.cpp;
+device_reduce_instance_blockwise_b16_f32_b16.cpp;
 device_reduce_instance_threadwise_f16_f16_f16.cpp;
 device_reduce_instance_threadwise_f16_f32_f16.cpp;
 device_reduce_instance_threadwise_f32_f32_f32.cpp;
 device_reduce_instance_threadwise_f32_f64_f32.cpp;
 device_reduce_instance_threadwise_f64_f64_f64.cpp;
+device_reduce_instance_threadwise_i8_i32_i8.cpp;
+device_reduce_instance_threadwise_i8_i8_i8.cpp;
+device_reduce_instance_threadwise_b16_f32_b16.cpp;
 device_reduce_instance_blockwise_second_call_f16_f16_f16.cpp;
 device_reduce_instance_blockwise_second_call_f32_f32_f16.cpp;
 device_reduce_instance_blockwise_second_call_f32_f32_f32.cpp;
 device_reduce_instance_blockwise_second_call_f64_f64_f32.cpp;
 device_reduce_instance_blockwise_second_call_f64_f64_f64.cpp;
+device_reduce_instance_blockwise_second_call_i32_i32_i8.cpp;
+device_reduce_instance_blockwise_second_call_i8_i8_i8.cpp;
+device_reduce_instance_blockwise_second_call_f32_f32_b16.cpp;
 device_reduce_instance_multiblock_atomic_add_f16_f32_f32.cpp;
 device_reduce_instance_multiblock_atomic_add_f32_f32_f32.cpp;
 device_reduce_instance_multiblock_atomic_add_f32_f64_f32.cpp;
+device_reduce_instance_multiblock_atomic_add_b16_f32_f32.cpp;
 device_reduce_instance_multiblock_partial_reduce_f16_f16_f16.cpp;
 device_reduce_instance_multiblock_partial_reduce_f16_f32_f16.cpp;
 device_reduce_instance_multiblock_partial_reduce_f32_f32_f32.cpp;
 device_reduce_instance_multiblock_partial_reduce_f32_f64_f32.cpp;
 device_reduce_instance_multiblock_partial_reduce_f64_f64_f64.cpp;
+device_reduce_instance_multiblock_partial_reduce_i8_i32_i8.cpp;
+device_reduce_instance_multiblock_partial_reduce_i8_i8_i8.cpp;
+device_reduce_instance_multiblock_partial_reduce_b16_f32_b16.cpp;
 )
 
 add_library(device_reduce_instance SHARED ${DEVICE_REDUCE_INSTANCE_SOURCE})
diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_b16_f32_b16.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_b16_f32_b16.cpp
new file mode 100644
index 0000000000..0274d89fc9
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_b16_f32_b16.cpp
@@ -0,0 +1,53 @@
+#include "device_reduce_instance_blockwise.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace device_reduce_instance {
+
+// clang-format off
+// InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | NumReduceDim
+ADD_BLOCKWISE_INST_BY_ID(bhalf_t, float, bhalf_t, 0, 0, 0, 4, 3); // for ADD
+ADD_BLOCKWISE_INST_BY_ID(bhalf_t, float, bhalf_t, 0, 0, 0, 4, 4);
+ADD_BLOCKWISE_INST_BY_ID(bhalf_t, float, bhalf_t, 0, 0, 0, 4, 1);
+ADD_BLOCKWISE_INST_BY_ID(bhalf_t, float, bhalf_t, 0, 0, 0, 2, 1);
+ADD_BLOCKWISE_INST_BY_ID(bhalf_t, float, bhalf_t, 5, 0, 0, 4, 3); // for AVG
+ADD_BLOCKWISE_INST_BY_ID(bhalf_t, float, bhalf_t, 5, 0, 0, 4, 4);
+ADD_BLOCKWISE_INST_BY_ID(bhalf_t, float, bhalf_t, 5, 0, 0, 4, 1);
+ADD_BLOCKWISE_INST_BY_ID(bhalf_t, float, bhalf_t, 5, 0, 0, 2, 1);
+ADD_BLOCKWISE_INST_BY_ID(bhalf_t, float, bhalf_t, 7, 0, 0, 4, 3); // for NORM2
+ADD_BLOCKWISE_INST_BY_ID(bhalf_t, float, bhalf_t, 7, 0, 0, 4, 4);
+ADD_BLOCKWISE_INST_BY_ID(bhalf_t, float, bhalf_t, 7, 0, 0, 4, 1);
+ADD_BLOCKWISE_INST_BY_ID(bhalf_t, float, bhalf_t, 7, 0, 0, 2, 1);
+
+ADD_BLOCKWISE_INST_BY_ID(bhalf_t, float, bhalf_t, 2, 0, 0, 4, 3); // for MIN
+ADD_BLOCKWISE_INST_BY_ID(bhalf_t, float, bhalf_t, 2, 0, 0, 4, 4);
+ADD_BLOCKWISE_INST_BY_ID(bhalf_t, float, bhalf_t, 2, 0, 0, 4, 1);
+ADD_BLOCKWISE_INST_BY_ID(bhalf_t, float, bhalf_t, 2, 0, 0, 2, 1);
+ADD_BLOCKWISE_INST_BY_ID(bhalf_t, float, bhalf_t, 3, 0, 0, 4, 3); // for MAX
+ADD_BLOCKWISE_INST_BY_ID(bhalf_t, float, bhalf_t, 3, 0, 0, 4, 4);
+ADD_BLOCKWISE_INST_BY_ID(bhalf_t, float, bhalf_t, 3, 0, 0, 4, 1);
+ADD_BLOCKWISE_INST_BY_ID(bhalf_t, float, bhalf_t, 3, 0, 0, 2, 1);
+ADD_BLOCKWISE_INST_BY_ID(bhalf_t, float, bhalf_t, 4, 0, 0, 4, 3); // for AMAX
+ADD_BLOCKWISE_INST_BY_ID(bhalf_t, float, bhalf_t, 4, 0, 0, 4, 4);
+ADD_BLOCKWISE_INST_BY_ID(bhalf_t, float, bhalf_t, 4, 0, 0, 4, 1);
+ADD_BLOCKWISE_INST_BY_ID(bhalf_t, float, bhalf_t, 4, 0, 0, 2, 1);
+ADD_BLOCKWISE_INST_BY_ID(bhalf_t, float, bhalf_t, 2, 0, 1, 4, 3); // for MIN
+ADD_BLOCKWISE_INST_BY_ID(bhalf_t, float, bhalf_t, 2, 0, 1, 4, 4);
+ADD_BLOCKWISE_INST_BY_ID(bhalf_t, float, bhalf_t, 2, 0, 1, 4, 1);
+ADD_BLOCKWISE_INST_BY_ID(bhalf_t, float, bhalf_t, 2, 0, 1, 2, 1);
+ADD_BLOCKWISE_INST_BY_ID(bhalf_t, float, bhalf_t, 3, 0, 1, 4, 3); // for MAX
+ADD_BLOCKWISE_INST_BY_ID(bhalf_t, float, bhalf_t, 3, 0, 1, 4, 4);
+ADD_BLOCKWISE_INST_BY_ID(bhalf_t, float, bhalf_t, 3, 0, 1, 4, 1);
+ADD_BLOCKWISE_INST_BY_ID(bhalf_t, float, bhalf_t, 3, 0, 1, 2, 1);
+ADD_BLOCKWISE_INST_BY_ID(bhalf_t, float, bhalf_t, 4, 0, 1, 4, 3); // for AMAX
+ADD_BLOCKWISE_INST_BY_ID(bhalf_t, float, bhalf_t, 4, 0, 1, 4, 4);
+ADD_BLOCKWISE_INST_BY_ID(bhalf_t, float, bhalf_t, 4, 0, 1, 4, 1);
+ADD_BLOCKWISE_INST_BY_ID(bhalf_t, float, bhalf_t, 4, 0, 1, 2, 1);
+// clang-format on
+
+} // namespace device_reduce_instance
+} // namespace device
+} // namespace tensor_operation
+
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f16_f16_f16.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f16_f16_f16.cpp
index aa7c69e362..8a43d860ea 100644
--- a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f16_f16_f16.cpp
+++ b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f16_f16_f16.cpp
@@ -8,21 +8,27 @@ namespace device_reduce_instance {
 // clang-format off
 // InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | NumReduceDim
 ADD_BLOCKWISE_INST_BY_ID(half_t, half_t, half_t, 2, 0, 0, 4, 3); // for MIN
+ADD_BLOCKWISE_INST_BY_ID(half_t, half_t, half_t, 2, 0, 0, 4, 4);
 ADD_BLOCKWISE_INST_BY_ID(half_t, half_t, half_t, 2, 0, 0, 4, 1);
 ADD_BLOCKWISE_INST_BY_ID(half_t, half_t, half_t, 2, 0, 0, 2, 1);
 ADD_BLOCKWISE_INST_BY_ID(half_t, half_t, half_t, 3, 0, 0, 4, 3); // for MAX
+ADD_BLOCKWISE_INST_BY_ID(half_t, half_t, half_t, 3, 0, 0, 4, 4);
 ADD_BLOCKWISE_INST_BY_ID(half_t, half_t, half_t, 3, 0, 0, 4, 1);
 ADD_BLOCKWISE_INST_BY_ID(half_t, half_t, half_t, 3, 0, 0, 2, 1);
 ADD_BLOCKWISE_INST_BY_ID(half_t, half_t, half_t, 4, 0, 0, 4, 3); // for AMAX
+ADD_BLOCKWISE_INST_BY_ID(half_t, half_t, half_t, 4, 0, 0, 4, 4);
 ADD_BLOCKWISE_INST_BY_ID(half_t, half_t, half_t, 4, 0, 0, 4, 1);
 ADD_BLOCKWISE_INST_BY_ID(half_t, half_t, half_t, 4, 0, 0, 2, 1);
 ADD_BLOCKWISE_INST_BY_ID(half_t, half_t, half_t, 2, 0, 1, 4, 3); // for MIN
+ADD_BLOCKWISE_INST_BY_ID(half_t, half_t, half_t, 2, 0, 1, 4, 4);
 ADD_BLOCKWISE_INST_BY_ID(half_t, half_t, half_t, 2, 0, 1, 4, 1);
 ADD_BLOCKWISE_INST_BY_ID(half_t, half_t, half_t, 2, 0, 1, 2, 1);
 ADD_BLOCKWISE_INST_BY_ID(half_t, half_t, half_t, 3, 0, 1, 4, 3); // for MAX
+ADD_BLOCKWISE_INST_BY_ID(half_t, half_t, half_t, 3, 0, 1, 4, 4);
 ADD_BLOCKWISE_INST_BY_ID(half_t, half_t, half_t, 3, 0, 1, 4, 1);
 ADD_BLOCKWISE_INST_BY_ID(half_t, half_t, half_t, 3, 0, 1, 2, 1);
 ADD_BLOCKWISE_INST_BY_ID(half_t, half_t, half_t, 4, 0, 1, 4, 3); // for AMAX
+ADD_BLOCKWISE_INST_BY_ID(half_t, half_t, half_t, 4, 0, 1, 4, 4);
 ADD_BLOCKWISE_INST_BY_ID(half_t, half_t, half_t, 4, 0, 1, 4, 1);
 ADD_BLOCKWISE_INST_BY_ID(half_t, half_t, half_t, 4, 0, 1, 2, 1);
 // clang-format on
diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f16_f32_f16.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f16_f32_f16.cpp
index 5a8e5eb625..3e0b8ba59c 100644
--- a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f16_f32_f16.cpp
+++ b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f16_f32_f16.cpp
@@ -8,12 +8,15 @@ namespace device_reduce_instance {
 // clang-format off
 // InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | NumReduceDim
 ADD_BLOCKWISE_INST_BY_ID(half_t, float, half_t, 0, 0, 0, 4, 3); // for ADD
+ADD_BLOCKWISE_INST_BY_ID(half_t, float, half_t, 0, 0, 0, 4, 4);
 ADD_BLOCKWISE_INST_BY_ID(half_t, float, half_t, 0, 0, 0, 4, 1);
 ADD_BLOCKWISE_INST_BY_ID(half_t, float, half_t, 0, 0, 0, 2, 1);
 ADD_BLOCKWISE_INST_BY_ID(half_t, float, half_t, 5, 0, 0, 4, 3); // for AVG
+ADD_BLOCKWISE_INST_BY_ID(half_t, float, half_t, 5, 0, 0, 4, 4);
 ADD_BLOCKWISE_INST_BY_ID(half_t, float, half_t, 5, 0, 0, 4, 1);
 ADD_BLOCKWISE_INST_BY_ID(half_t, float, half_t, 5, 0, 0, 2, 1);
 ADD_BLOCKWISE_INST_BY_ID(half_t, float, half_t, 7, 0, 0, 4, 3); // for NORM2
+ADD_BLOCKWISE_INST_BY_ID(half_t, float, half_t, 7, 0, 0, 4, 4);
 ADD_BLOCKWISE_INST_BY_ID(half_t, float, half_t, 7, 0, 0, 4, 1);
 ADD_BLOCKWISE_INST_BY_ID(half_t, float, half_t, 7, 0, 0, 2, 1);
 // clang-format on
diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f32_f32.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f32_f32.cpp
index cfe7cd86e9..ee96311f8c 100644
--- a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f32_f32.cpp
+++ b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f32_f32.cpp
@@ -8,30 +8,39 @@ namespace device_reduce_instance {
 // clang-format off
 // InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | NumReduceDim
 ADD_BLOCKWISE_INST_BY_ID(float, float, float, 0, 0, 0, 4, 3); // for ADD
+ADD_BLOCKWISE_INST_BY_ID(float, float, float, 0, 0, 0, 4, 4);
 ADD_BLOCKWISE_INST_BY_ID(float, float, float, 0, 0, 0, 4, 1);
 ADD_BLOCKWISE_INST_BY_ID(float, float, float, 0, 0, 0, 2, 1);
 ADD_BLOCKWISE_INST_BY_ID(float, float, float, 5, 0, 0, 4, 3); // for AVG
+ADD_BLOCKWISE_INST_BY_ID(float, float, float, 5, 0, 0, 4, 4);
 ADD_BLOCKWISE_INST_BY_ID(float, float, float, 5, 0, 0, 4, 1);
 ADD_BLOCKWISE_INST_BY_ID(float, float, float, 5, 0, 0, 2, 1);
 ADD_BLOCKWISE_INST_BY_ID(float, float, float, 7, 0, 0, 4, 3); // for NORM2
+ADD_BLOCKWISE_INST_BY_ID(float, float, float, 7, 0, 0, 4, 4);
 ADD_BLOCKWISE_INST_BY_ID(float, float, float, 7, 0, 0, 4, 1);
 ADD_BLOCKWISE_INST_BY_ID(float, float, float, 7, 0, 0, 2, 1);
 ADD_BLOCKWISE_INST_BY_ID(float, float, float, 2, 0, 0, 4, 3); // for MIN
+ADD_BLOCKWISE_INST_BY_ID(float, float, float, 2, 0, 0, 4, 4);
 ADD_BLOCKWISE_INST_BY_ID(float, float, float, 2, 0, 0, 4, 1);
 ADD_BLOCKWISE_INST_BY_ID(float, float, float, 2, 0, 0, 2, 1);
 ADD_BLOCKWISE_INST_BY_ID(float, float, float, 3, 0, 0, 4, 3); // for MAX
+ADD_BLOCKWISE_INST_BY_ID(float, float, float, 3, 0, 0, 4, 4);
 ADD_BLOCKWISE_INST_BY_ID(float, float, float, 3, 0, 0, 4, 1);
 ADD_BLOCKWISE_INST_BY_ID(float, float, float, 3, 0, 0, 2, 1);
 ADD_BLOCKWISE_INST_BY_ID(float, float, float, 4, 0, 0, 4, 3); // for AMAX
+ADD_BLOCKWISE_INST_BY_ID(float, float, float, 4, 0, 0, 4, 4);
 ADD_BLOCKWISE_INST_BY_ID(float, float, float, 4, 0, 0, 4, 1);
 ADD_BLOCKWISE_INST_BY_ID(float, float, float, 4, 0, 0, 2, 1);
 ADD_BLOCKWISE_INST_BY_ID(float, float, float, 2, 0, 1, 4, 3); // for MIN
+ADD_BLOCKWISE_INST_BY_ID(float, float, float, 2, 0, 1, 4, 4);
 ADD_BLOCKWISE_INST_BY_ID(float, float, float, 2, 0, 1, 4, 1);
 ADD_BLOCKWISE_INST_BY_ID(float, float, float, 2, 0, 1, 2, 1);
 ADD_BLOCKWISE_INST_BY_ID(float, float, float, 3, 0, 1, 4, 3); // for MAX
+ADD_BLOCKWISE_INST_BY_ID(float, float, float, 3, 0, 1, 4, 4);
 ADD_BLOCKWISE_INST_BY_ID(float, float, float, 3, 0, 1, 4, 1);
 ADD_BLOCKWISE_INST_BY_ID(float, float, float, 3, 0, 1, 2, 1);
 ADD_BLOCKWISE_INST_BY_ID(float, float, float, 4, 0, 1, 4, 3); // for AMAX
+ADD_BLOCKWISE_INST_BY_ID(float, float, float, 4, 0, 1, 4, 4);
 ADD_BLOCKWISE_INST_BY_ID(float, float, float, 4, 0, 1, 4, 1);
 ADD_BLOCKWISE_INST_BY_ID(float, float, float, 4, 0, 1, 2, 1);
 // clang-format on
diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f64_f32.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f64_f32.cpp
index 453a2c6437..b0ae95e82d 100644
--- a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f64_f32.cpp
+++ b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f64_f32.cpp
@@ -8,12 +8,15 @@ namespace device_reduce_instance {
 // clang-format off
 // InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | NumReduceDim
 ADD_BLOCKWISE_INST_BY_ID(float, double, float, 0, 0, 0, 4, 3); // for ADD
+ADD_BLOCKWISE_INST_BY_ID(float, double, float, 0, 0, 0, 4, 4);
 ADD_BLOCKWISE_INST_BY_ID(float, double, float, 0, 0, 0, 4, 1);
 ADD_BLOCKWISE_INST_BY_ID(float, double, float, 0, 0, 0, 2, 1);
 ADD_BLOCKWISE_INST_BY_ID(float, double, float, 5, 0, 0, 4, 3); // for AVG
+ADD_BLOCKWISE_INST_BY_ID(float, double, float, 5, 0, 0, 4, 4);
 ADD_BLOCKWISE_INST_BY_ID(float, double, float, 5, 0, 0, 4, 1);
 ADD_BLOCKWISE_INST_BY_ID(float, double, float, 5, 0, 0, 2, 1);
 ADD_BLOCKWISE_INST_BY_ID(float, double, float, 7, 0, 0, 4, 3); // for NORM2
+ADD_BLOCKWISE_INST_BY_ID(float, double, float, 7, 0, 0, 4, 4);
 ADD_BLOCKWISE_INST_BY_ID(float, double, float, 7, 0, 0, 4, 1);
 ADD_BLOCKWISE_INST_BY_ID(float, double, float, 7, 0, 0, 2, 1);
 // clang-format on
diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f64_f64_f64.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f64_f64_f64.cpp
index 0499bd3987..9cca2dbbeb 100644
--- a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f64_f64_f64.cpp
+++ b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f64_f64_f64.cpp
@@ -8,30 +8,39 @@ namespace device_reduce_instance {
 // clang-format off
 // InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | NumReduceDim
 ADD_BLOCKWISE_INST_BY_ID(double, double, double, 0, 0, 0, 4, 3); // for ADD
+ADD_BLOCKWISE_INST_BY_ID(double, double, double, 0, 0, 0, 4, 4);
 ADD_BLOCKWISE_INST_BY_ID(double, double, double, 0, 0, 0, 4, 1);
 ADD_BLOCKWISE_INST_BY_ID(double, double, double, 0, 0, 0, 2, 1);
 ADD_BLOCKWISE_INST_BY_ID(double, double, double, 5, 0, 0, 4, 3); // for AVG
+ADD_BLOCKWISE_INST_BY_ID(double, double, double, 5, 0, 0, 4, 4);
 ADD_BLOCKWISE_INST_BY_ID(double, double, double, 5, 0, 0, 4, 1);
 ADD_BLOCKWISE_INST_BY_ID(double, double, double, 5, 0, 0, 2, 1);
 ADD_BLOCKWISE_INST_BY_ID(double, double, double, 7, 0, 0, 4, 3); // for NORM2
+ADD_BLOCKWISE_INST_BY_ID(double, double, double, 7, 0, 0, 4, 4);
 ADD_BLOCKWISE_INST_BY_ID(double, double, double, 7, 0, 0, 4, 1);
 ADD_BLOCKWISE_INST_BY_ID(double, double, double, 7, 0, 0, 2, 1);
 ADD_BLOCKWISE_INST_BY_ID(double, double, double, 2, 0, 0, 4, 3); // for MIN
+ADD_BLOCKWISE_INST_BY_ID(double, double, double, 2, 0, 0, 4, 4);
 ADD_BLOCKWISE_INST_BY_ID(double, double, double, 2, 0, 0, 4, 1);
 ADD_BLOCKWISE_INST_BY_ID(double, double, double, 2, 0, 0, 2, 1);
 ADD_BLOCKWISE_INST_BY_ID(double, double, double, 3, 0, 0, 4, 3); // for MAX
+ADD_BLOCKWISE_INST_BY_ID(double, double, double, 3, 0, 0, 4, 4);
 ADD_BLOCKWISE_INST_BY_ID(double, double, double, 3, 0, 0, 4, 1);
 ADD_BLOCKWISE_INST_BY_ID(double, double, double, 3, 0, 0, 2, 1);
 ADD_BLOCKWISE_INST_BY_ID(double, double, double, 4, 0, 0, 4, 3); // for AMAX
+ADD_BLOCKWISE_INST_BY_ID(double, double, double, 4, 0, 0, 4, 4);
 ADD_BLOCKWISE_INST_BY_ID(double, double, double, 4, 0, 0, 4, 1);
 ADD_BLOCKWISE_INST_BY_ID(double, double, double, 4, 0, 0, 2, 1);
 ADD_BLOCKWISE_INST_BY_ID(double, double, double, 2, 0, 1, 4, 3); // for MIN
+ADD_BLOCKWISE_INST_BY_ID(double, double, double, 2, 0, 1, 4, 4);
 ADD_BLOCKWISE_INST_BY_ID(double, double, double, 2, 0, 1, 4, 1);
 ADD_BLOCKWISE_INST_BY_ID(double, double, double, 2, 0, 1, 2, 1);
 ADD_BLOCKWISE_INST_BY_ID(double, double, double, 3, 0, 1, 4, 3); // for MAX
+ADD_BLOCKWISE_INST_BY_ID(double, double, double, 3, 0, 1, 4, 4);
 ADD_BLOCKWISE_INST_BY_ID(double, double, double, 3, 0, 1, 4, 1);
 ADD_BLOCKWISE_INST_BY_ID(double, double, double, 3, 0, 1, 2, 1);
 ADD_BLOCKWISE_INST_BY_ID(double, double, double, 4, 0, 1, 4, 3); // for AMAX
+ADD_BLOCKWISE_INST_BY_ID(double, double, double, 4, 0, 1, 4, 4);
 ADD_BLOCKWISE_INST_BY_ID(double, double, double, 4, 0, 1, 4, 1);
 ADD_BLOCKWISE_INST_BY_ID(double, double, double, 4, 0, 1, 2, 1);
 // clang-format on
diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_i8_i32_i8.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_i8_i32_i8.cpp
new file mode 100644
index 0000000000..05cd1921ee
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_i8_i32_i8.cpp
@@ -0,0 +1,24 @@
+#include "device_reduce_instance_blockwise.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace device_reduce_instance {
+
+// clang-format off
+// InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | NumReduceDim
+ADD_BLOCKWISE_INST_BY_ID(int8_t, int32_t, int8_t, 0, 0, 0, 4, 3); // for ADD
+ADD_BLOCKWISE_INST_BY_ID(int8_t, int32_t, int8_t, 0, 0, 0, 4, 4);
+ADD_BLOCKWISE_INST_BY_ID(int8_t, int32_t, int8_t, 0, 0, 0, 4, 1);
+ADD_BLOCKWISE_INST_BY_ID(int8_t, int32_t, int8_t, 0, 0, 0, 2, 1);
+ADD_BLOCKWISE_INST_BY_ID(int8_t, int32_t, int8_t, 5, 0, 0, 4, 3); // for AVG
+ADD_BLOCKWISE_INST_BY_ID(int8_t, int32_t, int8_t, 5, 0, 0, 4, 4);
+ADD_BLOCKWISE_INST_BY_ID(int8_t, int32_t, int8_t, 5, 0, 0, 4, 1);
+ADD_BLOCKWISE_INST_BY_ID(int8_t, int32_t, int8_t, 5, 0, 0, 2, 1);
+// clang-format on
+
+} // namespace device_reduce_instance
+} // namespace device
+} // namespace tensor_operation
+
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_i8_i8_i8.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_i8_i8_i8.cpp
new file mode 100644
index 0000000000..66ef017864
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_i8_i8_i8.cpp
@@ -0,0 +1,40 @@
+#include "device_reduce_instance_blockwise.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace device_reduce_instance {
+
+// clang-format off
+// InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | NumReduceDim
+ADD_BLOCKWISE_INST_BY_ID(int8_t, int8_t, int8_t, 2, 0, 0, 4, 3); // for MIN
+ADD_BLOCKWISE_INST_BY_ID(int8_t, int8_t, int8_t, 2, 0, 0, 4, 4);
+ADD_BLOCKWISE_INST_BY_ID(int8_t, int8_t, int8_t, 2, 0, 0, 4, 1);
+ADD_BLOCKWISE_INST_BY_ID(int8_t, int8_t, int8_t, 2, 0, 0, 2, 1);
+ADD_BLOCKWISE_INST_BY_ID(int8_t, int8_t, int8_t, 3, 0, 0, 4, 3); // for MAX
+ADD_BLOCKWISE_INST_BY_ID(int8_t, int8_t, int8_t, 3, 0, 0, 4, 4);
+ADD_BLOCKWISE_INST_BY_ID(int8_t, int8_t, int8_t, 3, 0, 0, 4, 1);
+ADD_BLOCKWISE_INST_BY_ID(int8_t, int8_t, int8_t, 3, 0, 0, 2, 1);
+ADD_BLOCKWISE_INST_BY_ID(int8_t, int8_t, int8_t, 4, 0, 0, 4, 3); // for AMAX
+ADD_BLOCKWISE_INST_BY_ID(int8_t, int8_t, int8_t, 4, 0, 0, 4, 4);
+ADD_BLOCKWISE_INST_BY_ID(int8_t, int8_t, int8_t, 4, 0, 0, 4, 1);
+ADD_BLOCKWISE_INST_BY_ID(int8_t, int8_t, int8_t, 4, 0, 0, 2, 1);
+ADD_BLOCKWISE_INST_BY_ID(int8_t, int8_t, int8_t, 2, 0, 1, 4, 3); // for MIN
+ADD_BLOCKWISE_INST_BY_ID(int8_t, int8_t, int8_t, 2, 0, 1, 4, 4);
+ADD_BLOCKWISE_INST_BY_ID(int8_t, int8_t, int8_t, 2, 0, 1, 4, 1);
+ADD_BLOCKWISE_INST_BY_ID(int8_t, int8_t, int8_t, 2, 0, 1, 2, 1);
+ADD_BLOCKWISE_INST_BY_ID(int8_t, int8_t, int8_t, 3, 0, 1, 4, 3); // for MAX
+ADD_BLOCKWISE_INST_BY_ID(int8_t, int8_t, int8_t, 3, 0, 1, 4, 4);
+ADD_BLOCKWISE_INST_BY_ID(int8_t, int8_t, int8_t, 3, 0, 1, 4, 1);
+ADD_BLOCKWISE_INST_BY_ID(int8_t, int8_t, int8_t, 3, 0, 1, 2, 1);
+ADD_BLOCKWISE_INST_BY_ID(int8_t, int8_t, int8_t, 4, 0, 1, 4, 3); // for AMAX
+ADD_BLOCKWISE_INST_BY_ID(int8_t, int8_t, int8_t, 4, 0, 1, 4, 4);
+ADD_BLOCKWISE_INST_BY_ID(int8_t, int8_t, int8_t, 4, 0, 1, 4, 1);
+ADD_BLOCKWISE_INST_BY_ID(int8_t, int8_t, int8_t, 4, 0, 1, 2, 1);
+// clang-format on
+
+} // namespace device_reduce_instance
+} // namespace device
+} // namespace tensor_operation
+
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_second_call_f16_f16_f16.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_second_call_f16_f16_f16.cpp
index dd5514daca..82a9c11413 100644
--- a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_second_call_f16_f16_f16.cpp
+++ b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_second_call_f16_f16_f16.cpp
@@ -8,21 +8,27 @@ namespace device_reduce_instance {
 // clang-format off
 // InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | NumReduceDim
 ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(half_t, half_t, half_t, 2, 0, 0, 4, 3); // for MIN
+ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(half_t, half_t, half_t, 2, 0, 0, 4, 4);
 ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(half_t, half_t, half_t, 2, 0, 0, 4, 1);
 ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(half_t, half_t, half_t, 2, 0, 0, 2, 1);
 ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(half_t, half_t, half_t, 3, 0, 0, 4, 3); // for MAX
+ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(half_t, half_t, half_t, 3, 0, 0, 4, 4);
 ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(half_t, half_t, half_t, 3, 0, 0, 4, 1);
 ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(half_t, half_t, half_t, 3, 0, 0, 2, 1);
 ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(half_t, half_t, half_t, 4, 0, 0, 4, 3); // for AMAX
+ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(half_t, half_t, half_t, 4, 0, 0, 4, 4);
 ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(half_t, half_t, half_t, 4, 0, 0, 4, 1);
 ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(half_t, half_t, half_t, 4, 0, 0, 2, 1);
 ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(half_t, half_t, half_t, 2, 0, 1, 4, 3); // for MIN
+ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(half_t, half_t, half_t, 2, 0, 1, 4, 4);
 ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(half_t, half_t, half_t, 2, 0, 1, 4, 1);
 ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(half_t, half_t, half_t, 2, 0, 1, 2, 1);
 ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(half_t, half_t, half_t, 3, 0, 1, 4, 3); // for MAX
+ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(half_t, half_t, half_t, 3, 0, 1, 4, 4);
 ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(half_t, half_t, half_t, 3, 0, 1, 4, 1);
 ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(half_t, half_t, half_t, 3, 0, 1, 2, 1);
 ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(half_t, half_t, half_t, 4, 0, 1, 4, 3); // for AMAX
+ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(half_t, half_t, half_t, 4, 0, 1, 4, 4);
 ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(half_t, half_t, half_t, 4, 0, 1, 4, 1);
 ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(half_t, half_t, half_t, 4, 0, 1, 2, 1);
 // clang-format on
diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_second_call_f32_f32_b16.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_second_call_f32_f32_b16.cpp
new file mode 100644
index 0000000000..6b8139c32c
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_second_call_f32_f32_b16.cpp
@@ -0,0 +1,53 @@
+#include "device_reduce_instance_blockwise_second_call.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace device_reduce_instance {
+
+// clang-format off
+// InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | NumReduceDim
+ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(float, float, bhalf_t, 0, 0, 0, 4, 3); // for ADD
+ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(float, float, bhalf_t, 0, 0, 0, 4, 4);
+ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(float, float, bhalf_t, 0, 0, 0, 4, 1);
+ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(float, float, bhalf_t, 0, 0, 0, 2, 1);
+ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(float, float, bhalf_t, 5, 0, 0, 4, 3); // for AVG
+ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(float, float, bhalf_t, 5, 0, 0, 4, 4);
+ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(float, float, bhalf_t, 5, 0, 0, 4, 1);
+ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(float, float, bhalf_t, 5, 0, 0, 2, 1);
+ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(float, float, bhalf_t, 7, 0, 0, 4, 3); // for NORM2
+ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(float, float, bhalf_t, 7, 0, 0, 4, 4);
+ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(float, float, bhalf_t, 7, 0, 0, 4, 1);
+ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(float, float, bhalf_t, 7, 0, 0, 2, 1);
+
+ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(float, float, bhalf_t, 2, 0, 0, 4, 3); // for MIN
+ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(float, float, bhalf_t, 2, 0, 0, 4, 4);
+ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(float, float, bhalf_t, 2, 0, 0, 4, 1);
+ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(float, float, bhalf_t, 2, 0, 0, 2, 1);
+ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(float, float, bhalf_t, 3, 0, 0, 4, 3); // for MAX
+ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(float, float, bhalf_t, 3, 0, 0, 4, 4);
+ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(float, float, bhalf_t, 3, 0, 0, 4, 1);
+ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(float, float, bhalf_t, 3, 0, 0, 2, 1);
+ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(float, float, bhalf_t, 4, 0, 0, 4, 3); // for AMAX
+ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(float, float, bhalf_t, 4, 0, 0, 4, 4);
+ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(float, float, bhalf_t, 4, 0, 0, 4, 1);
+ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(float, float, bhalf_t, 4, 0, 0, 2, 1);
+ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(float, float, bhalf_t, 2, 0, 1, 4, 3); // for MIN
+ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(float, float, bhalf_t, 2, 0, 1, 4, 4);
+ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(float, float, bhalf_t, 2, 0, 1, 4, 1);
+ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(float, float, bhalf_t, 2, 0, 1, 2, 1);
+ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(float, float, bhalf_t, 3, 0, 1, 4, 3); // for MAX
+ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(float, float, bhalf_t, 3, 0, 1, 4, 4);
+ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(float, float, bhalf_t, 3, 0, 1, 4, 1);
+ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(float, float, bhalf_t, 3, 0, 1, 2, 1);
+ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(float, float, bhalf_t, 4, 0, 1, 4, 3); // for AMAX
+ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(float, float, bhalf_t, 4, 0, 1, 4, 4);
+ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(float, float, bhalf_t, 4, 0, 1, 4, 1);
+ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(float, float, bhalf_t, 4, 0, 1, 2, 1);
+// clang-format on
+
+} // namespace device_reduce_instance
+} // namespace device
+} // namespace tensor_operation
+
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_second_call_f32_f32_f16.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_second_call_f32_f32_f16.cpp
index 295b31f629..267b9d4d9d 100644
--- a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_second_call_f32_f32_f16.cpp
+++ b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_second_call_f32_f32_f16.cpp
@@ -8,12 +8,15 @@ namespace device_reduce_instance {
 // clang-format off
 // InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | NumReduceDim
 ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(float, float, half_t, 0, 0, 0, 4, 3); // for ADD
+ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(float, float, half_t, 0, 0, 0, 4, 4);
 ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(float, float, half_t, 0, 0, 0, 4, 1);
 ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(float, float, half_t, 0, 0, 0, 2, 1);
 ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(float, float, half_t, 5, 0, 0, 4, 3); // for AVG
+ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(float, float, half_t, 5, 0, 0, 4, 4);
 ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(float, float, half_t, 5, 0, 0, 4, 1);
 ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(float, float, half_t, 5, 0, 0, 2, 1);
 ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(float, float, half_t, 7, 0, 0, 4, 3); // for NORM2
+ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(float, float, half_t, 7, 0, 0, 4, 4);
 ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(float, float, half_t, 7, 0, 0, 4, 1);
 ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(float, float, half_t, 7, 0, 0, 2, 1);
 // clang-format on
diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_second_call_f32_f32_f32.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_second_call_f32_f32_f32.cpp
index 08b1592eab..0036a89542 100644
--- a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_second_call_f32_f32_f32.cpp
+++ b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_second_call_f32_f32_f32.cpp
@@ -8,30 +8,39 @@ namespace device_reduce_instance {
 // clang-format off
 // InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | NumReduceDim
 ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(float, float, float, 0, 0, 0, 4, 3); // for ADD
+ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(float, float, float, 0, 0, 0, 4, 4);
 ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(float, float, float, 0, 0, 0, 4, 1);
 ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(float, float, float, 0, 0, 0, 2, 1);
 ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(float, float, float, 5, 0, 0, 4, 3); // for AVG
+ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(float, float, float, 5, 0, 0, 4, 4);
 ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(float, float, float, 5, 0, 0, 4, 1);
 ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(float, float, float, 5, 0, 0, 2, 1);
 ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(float, float, float, 7, 0, 0, 4, 3); // for NORM2
+ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(float, float, float, 7, 0, 0, 4, 4);
 ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(float, float, float, 7, 0, 0, 4, 1);
 ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(float, float, float, 7, 0, 0, 2, 1);
 ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(float, float, float, 2, 0, 0, 4, 3); // for MIN
+ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(float, float, float, 2, 0, 0, 4, 4);
 ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(float, float, float, 2, 0, 0, 4, 1);
 ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(float, float, float, 2, 0, 0, 2, 1);
 ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(float, float, float, 3, 0, 0, 4, 3); // for MAX
+ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(float, float, float, 3, 0, 0, 4, 4);
 ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(float, float, float, 3, 0, 0, 4, 1);
 ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(float, float, float, 3, 0, 0, 2, 1);
 ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(float, float, float, 4, 0, 0, 4, 3); // for AMAX
+ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(float, float, float, 4, 0, 0, 4, 4);
 ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(float, float, float, 4, 0, 0, 4, 1);
 ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(float, float, float, 4, 0, 0, 2, 1);
 ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(float, float, float, 2, 0, 1, 4, 3); // for MIN
+ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(float, float, float, 2, 0, 1, 4, 4);
 ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(float, float, float, 2, 0, 1, 4, 1);
 ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(float, float, float, 2, 0, 1, 2, 1);
 ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(float, float, float, 3, 0, 1, 4, 3); // for MAX
+ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(float, float, float, 3, 0, 1, 4, 4);
 ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(float, float, float, 3, 0, 1, 4, 1);
 ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(float, float, float, 3, 0, 1, 2, 1);
 ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(float, float, float, 4, 0, 1, 4, 3); // for AMAX
+ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(float, float, float, 4, 0, 1, 4, 4);
 ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(float, float, float, 4, 0, 1, 4, 1);
 ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(float, float, float, 4, 0, 1, 2, 1);
 // clang-format on
diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_second_call_f64_f64_f32.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_second_call_f64_f64_f32.cpp
index ba46891d0e..0512fa4158 100644
--- a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_second_call_f64_f64_f32.cpp
+++ b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_second_call_f64_f64_f32.cpp
@@ -8,12 +8,15 @@ namespace device_reduce_instance {
 // clang-format off
 // InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | NumReduceDim
 ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(double, double, float, 0, 0, 0, 4, 3); // for ADD
+ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(double, double, float, 0, 0, 0, 4, 4);
 ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(double, double, float, 0, 0, 0, 4, 1);
 ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(double, double, float, 0, 0, 0, 2, 1);
 ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(double, double, float, 5, 0, 0, 4, 3); // for AVG
+ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(double, double, float, 5, 0, 0, 4, 4);
 ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(double, double, float, 5, 0, 0, 4, 1);
 ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(double, double, float, 5, 0, 0, 2, 1);
 ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(double, double, float, 7, 0, 0, 4, 3); // for NORM2
+ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(double, double, float, 7, 0, 0, 4, 4);
 ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(double, double, float, 7, 0, 0, 4, 1);
 ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(double, double, float, 7, 0, 0, 2, 1);
 // clang-format on
diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_second_call_f64_f64_f64.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_second_call_f64_f64_f64.cpp
index 3a8ddadb2e..afe7f0752e 100644
--- a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_second_call_f64_f64_f64.cpp
+++ b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_second_call_f64_f64_f64.cpp
@@ -8,30 +8,39 @@ namespace device_reduce_instance {
 // clang-format off
 // InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | NumReduceDim
 ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(double, double, double, 0, 0, 0, 4, 3); // for ADD
+ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(double, double, double, 0, 0, 0, 4, 4);
 ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(double, double, double, 0, 0, 0, 4, 1);
 ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(double, double, double, 0, 0, 0, 2, 1);
 ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(double, double, double, 5, 0, 0, 4, 3); // for AVG
+ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(double, double, double, 5, 0, 0, 4, 4);
 ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(double, double, double, 5, 0, 0, 4, 1);
 ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(double, double, double, 5, 0, 0, 2, 1);
 ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(double, double, double, 7, 0, 0, 4, 3); // for NORM2
+ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(double, double, double, 7, 0, 0, 4, 4);
 ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(double, double, double, 7, 0, 0, 4, 1);
 ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(double, double, double, 7, 0, 0, 2, 1);
 ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(double, double, double, 2, 0, 0, 4, 3); // for MIN
+ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(double, double, double, 2, 0, 0, 4, 4);
 ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(double, double, double, 2, 0, 0, 4, 1);
 ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(double, double, double, 2, 0, 0, 2, 1);
 ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(double, double, double, 3, 0, 0, 4, 3); // for MAX
+ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(double, double, double, 3, 0, 0, 4, 4);
 ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(double, double, double, 3, 0, 0, 4, 1);
 ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(double, double, double, 3, 0, 0, 2, 1);
 ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(double, double, double, 4, 0, 0, 4, 3); // for AMAX
+ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(double, double, double, 4, 0, 0, 4, 4);
 ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(double, double, double, 4, 0, 0, 4, 1);
 ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(double, double, double, 4, 0, 0, 2, 1);
 ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(double, double, double, 2, 0, 1, 4, 3); // for MIN
+ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(double, double, double, 2, 0, 1, 4, 4);
 ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(double, double, double, 2, 0, 1, 4, 1);
 ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(double, double, double, 2, 0, 1, 2, 1);
 ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(double, double, double, 3, 0, 1, 4, 3); // for MAX
+ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(double, double, double, 3, 0, 1, 4, 4);
 ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(double, double, double, 3, 0, 1, 4, 1);
 ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(double, double, double, 3, 0, 1, 2, 1);
 ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(double, double, double, 4, 0, 1, 4, 3); // for AMAX
+ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(double, double, double, 4, 0, 1, 4, 4);
 ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(double, double, double, 4, 0, 1, 4, 1);
 ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(double, double, double, 4, 0, 1, 2, 1);
// clang-format on diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_second_call_i32_i32_i8.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_second_call_i32_i32_i8.cpp new file mode 100644 index 0000000000..9cb3b8684f --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_second_call_i32_i32_i8.cpp @@ -0,0 +1,24 @@ +#include "device_reduce_instance_blockwise_second_call.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace device_reduce_instance { + +// clang-format off +// InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | NumReduceDim +ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(int32_t, int32_t, int8_t, 0, 0, 0, 4, 3); // for ADD +ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(int32_t, int32_t, int8_t, 0, 0, 0, 4, 4); +ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(int32_t, int32_t, int8_t, 0, 0, 0, 4, 1); +ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(int32_t, int32_t, int8_t, 0, 0, 0, 2, 1); +ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(int32_t, int32_t, int8_t, 5, 0, 0, 4, 3); // for AVG +ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(int32_t, int32_t, int8_t, 5, 0, 0, 4, 4); +ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(int32_t, int32_t, int8_t, 5, 0, 0, 4, 1); +ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(int32_t, int32_t, int8_t, 5, 0, 0, 2, 1); +// clang-format on + +} // namespace device_reduce_instance +} // namespace device +} // namespace tensor_operation + +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_second_call_i8_i8_i8.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_second_call_i8_i8_i8.cpp new file mode 100644 index 0000000000..8783a75486 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_second_call_i8_i8_i8.cpp @@ -0,0 +1,40 @@ +#include "device_reduce_instance_blockwise_second_call.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace device_reduce_instance { + +// clang-format off +// InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | NumReduceDim +ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(int8_t, int8_t, int8_t, 2, 0, 0, 4, 3); // for MIN +ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(int8_t, int8_t, int8_t, 2, 0, 0, 4, 4); +ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(int8_t, int8_t, int8_t, 2, 0, 0, 4, 1); +ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(int8_t, int8_t, int8_t, 2, 0, 0, 2, 1); +ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(int8_t, int8_t, int8_t, 3, 0, 0, 4, 3); // for MAX +ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(int8_t, int8_t, int8_t, 3, 0, 0, 4, 4); +ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(int8_t, int8_t, int8_t, 3, 0, 0, 4, 1); +ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(int8_t, int8_t, int8_t, 3, 0, 0, 2, 1); +ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(int8_t, int8_t, int8_t, 4, 0, 0, 4, 3); // for AMAX +ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(int8_t, int8_t, int8_t, 4, 0, 0, 4, 4); +ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(int8_t, int8_t, int8_t, 4, 0, 0, 4, 1); +ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(int8_t, int8_t, int8_t, 4, 0, 0, 2, 1); +ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(int8_t, int8_t, int8_t, 2, 0, 1, 4, 3); // for MIN +ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(int8_t, int8_t, int8_t, 2, 0, 1, 4, 4); +ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(int8_t, int8_t, int8_t, 2, 0, 1, 4, 1); +ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(int8_t, int8_t, int8_t, 2, 0, 1, 2, 1); 
+ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(int8_t, int8_t, int8_t, 3, 0, 1, 4, 3); // for MAX +ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(int8_t, int8_t, int8_t, 3, 0, 1, 4, 4); +ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(int8_t, int8_t, int8_t, 3, 0, 1, 4, 1); +ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(int8_t, int8_t, int8_t, 3, 0, 1, 2, 1); +ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(int8_t, int8_t, int8_t, 4, 0, 1, 4, 3); // for AMAX +ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(int8_t, int8_t, int8_t, 4, 0, 1, 4, 4); +ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(int8_t, int8_t, int8_t, 4, 0, 1, 4, 1); +ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(int8_t, int8_t, int8_t, 4, 0, 1, 2, 1); +// clang-format on + +} // namespace device_reduce_instance +} // namespace device +} // namespace tensor_operation + +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_b16_f32_f32.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_b16_f32_f32.cpp new file mode 100644 index 0000000000..9b2b7f5d8c --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_b16_f32_f32.cpp @@ -0,0 +1,24 @@ +#include "device_reduce_instance_multiblock_atomic_add.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace device_reduce_instance { + +// clang-format off +// InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | NumReduceDim +ADD_MULTIBLOCK_ATOMIC_ADD_INST_BY_ID(bhalf_t, float, float, 0, 0, 0, 4, 3); // for ADD +ADD_MULTIBLOCK_ATOMIC_ADD_INST_BY_ID(bhalf_t, float, float, 0, 0, 0, 4, 4); +ADD_MULTIBLOCK_ATOMIC_ADD_INST_BY_ID(bhalf_t, float, float, 0, 0, 0, 4, 1); +ADD_MULTIBLOCK_ATOMIC_ADD_INST_BY_ID(bhalf_t, float, float, 0, 0, 0, 2, 1); +ADD_MULTIBLOCK_ATOMIC_ADD_INST_BY_ID(bhalf_t, float, float, 5, 0, 0, 4, 3); // for AVG +ADD_MULTIBLOCK_ATOMIC_ADD_INST_BY_ID(bhalf_t, float, float, 5, 0, 0, 4, 4); +ADD_MULTIBLOCK_ATOMIC_ADD_INST_BY_ID(bhalf_t, float, float, 5, 0, 0, 4, 1); +ADD_MULTIBLOCK_ATOMIC_ADD_INST_BY_ID(bhalf_t, float, float, 5, 0, 0, 2, 1); +// clang-format on + +} // namespace device_reduce_instance +} // namespace device +} // namespace tensor_operation + +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f16_f32_f32.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f16_f32_f32.cpp index 847a3b6ac9..fc956aa04b 100644 --- a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f16_f32_f32.cpp +++ b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f16_f32_f32.cpp @@ -8,9 +8,11 @@ namespace device_reduce_instance { // clang-format off // InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | NumReduceDim ADD_MULTIBLOCK_ATOMIC_ADD_INST_BY_ID(half_t, float, float, 0, 0, 0, 4, 3); // for ADD +ADD_MULTIBLOCK_ATOMIC_ADD_INST_BY_ID(half_t, float, float, 0, 0, 0, 4, 4); ADD_MULTIBLOCK_ATOMIC_ADD_INST_BY_ID(half_t, float, float, 0, 0, 0, 4, 1); ADD_MULTIBLOCK_ATOMIC_ADD_INST_BY_ID(half_t, float, float, 0, 0, 0, 2, 1); ADD_MULTIBLOCK_ATOMIC_ADD_INST_BY_ID(half_t, float, float, 5, 0, 0, 4, 3); // for AVG +ADD_MULTIBLOCK_ATOMIC_ADD_INST_BY_ID(half_t, float, float, 5, 0, 0, 4, 4); ADD_MULTIBLOCK_ATOMIC_ADD_INST_BY_ID(half_t, float, float, 5, 0, 0, 4, 1); 
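The numeric `ReduceOpId` column in these tables is easier to read with the operation names spelled out. A minimal decoding aid follows; it is not part of the diff, and the values are simply inferred from the inline `// for ...` comments above (the authoritative definitions live in `reduction_enums.hpp` as `ReduceTensorOp_t`):

```cpp
// Hypothetical reading aid only. IDs 1 and 6 are simply not exercised by
// these instance files, hence the gaps.
enum class ReduceOpId : int
{
    ADD   = 0,
    MIN   = 2,
    MAX   = 3,
    AMAX  = 4, // max of absolute value
    AVG   = 5,
    NORM2 = 7, // square root of the sum of squares
};
```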
diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_b16_f32_f32.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_b16_f32_f32.cpp
new file mode 100644
index 0000000000..9b2b7f5d8c
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_b16_f32_f32.cpp
@@ -0,0 +1,24 @@
+#include "device_reduce_instance_multiblock_atomic_add.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace device_reduce_instance {
+
+// clang-format off
+// InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | NumReduceDim
+ADD_MULTIBLOCK_ATOMIC_ADD_INST_BY_ID(bhalf_t, float, float, 0, 0, 0, 4, 3); // for ADD
+ADD_MULTIBLOCK_ATOMIC_ADD_INST_BY_ID(bhalf_t, float, float, 0, 0, 0, 4, 4);
+ADD_MULTIBLOCK_ATOMIC_ADD_INST_BY_ID(bhalf_t, float, float, 0, 0, 0, 4, 1);
+ADD_MULTIBLOCK_ATOMIC_ADD_INST_BY_ID(bhalf_t, float, float, 0, 0, 0, 2, 1);
+ADD_MULTIBLOCK_ATOMIC_ADD_INST_BY_ID(bhalf_t, float, float, 5, 0, 0, 4, 3); // for AVG
+ADD_MULTIBLOCK_ATOMIC_ADD_INST_BY_ID(bhalf_t, float, float, 5, 0, 0, 4, 4);
+ADD_MULTIBLOCK_ATOMIC_ADD_INST_BY_ID(bhalf_t, float, float, 5, 0, 0, 4, 1);
+ADD_MULTIBLOCK_ATOMIC_ADD_INST_BY_ID(bhalf_t, float, float, 5, 0, 0, 2, 1);
+// clang-format on
+
+} // namespace device_reduce_instance
+} // namespace device
+} // namespace tensor_operation
+
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f16_f32_f32.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f16_f32_f32.cpp
index 847a3b6ac9..fc956aa04b 100644
--- a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f16_f32_f32.cpp
+++ b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f16_f32_f32.cpp
@@ -8,9 +8,11 @@ namespace device_reduce_instance {
 // clang-format off
 // InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | NumReduceDim
 ADD_MULTIBLOCK_ATOMIC_ADD_INST_BY_ID(half_t, float, float, 0, 0, 0, 4, 3); // for ADD
+ADD_MULTIBLOCK_ATOMIC_ADD_INST_BY_ID(half_t, float, float, 0, 0, 0, 4, 4);
 ADD_MULTIBLOCK_ATOMIC_ADD_INST_BY_ID(half_t, float, float, 0, 0, 0, 4, 1);
 ADD_MULTIBLOCK_ATOMIC_ADD_INST_BY_ID(half_t, float, float, 0, 0, 0, 2, 1);
 ADD_MULTIBLOCK_ATOMIC_ADD_INST_BY_ID(half_t, float, float, 5, 0, 0, 4, 3); // for AVG
+ADD_MULTIBLOCK_ATOMIC_ADD_INST_BY_ID(half_t, float, float, 5, 0, 0, 4, 4);
 ADD_MULTIBLOCK_ATOMIC_ADD_INST_BY_ID(half_t, float, float, 5, 0, 0, 4, 1);
 ADD_MULTIBLOCK_ATOMIC_ADD_INST_BY_ID(half_t, float, float, 5, 0, 0, 2, 1);
 // clang-format on
diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f32_f32_f32.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f32_f32_f32.cpp
index 77fe2d8a05..e5ffd9f976 100644
--- a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f32_f32_f32.cpp
+++ b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f32_f32_f32.cpp
@@ -8,9 +8,11 @@ namespace device_reduce_instance {
 // clang-format off
 // InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | NumReduceDim
 ADD_MULTIBLOCK_ATOMIC_ADD_INST_BY_ID(float, float, float, 0, 0, 0, 4, 3); // for ADD
+ADD_MULTIBLOCK_ATOMIC_ADD_INST_BY_ID(float, float, float, 0, 0, 0, 4, 4);
 ADD_MULTIBLOCK_ATOMIC_ADD_INST_BY_ID(float, float, float, 0, 0, 0, 4, 1);
 ADD_MULTIBLOCK_ATOMIC_ADD_INST_BY_ID(float, float, float, 0, 0, 0, 2, 1);
 ADD_MULTIBLOCK_ATOMIC_ADD_INST_BY_ID(float, float, float, 5, 0, 0, 4, 3); // for AVG
+ADD_MULTIBLOCK_ATOMIC_ADD_INST_BY_ID(float, float, float, 5, 0, 0, 4, 4);
 ADD_MULTIBLOCK_ATOMIC_ADD_INST_BY_ID(float, float, float, 5, 0, 0, 4, 1);
 ADD_MULTIBLOCK_ATOMIC_ADD_INST_BY_ID(float, float, float, 5, 0, 0, 2, 1);
 // clang-format on
diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f32_f64_f32.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f32_f64_f32.cpp
index a748dc263c..229829b889 100644
--- a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f32_f64_f32.cpp
+++ b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f32_f64_f32.cpp
@@ -8,9 +8,11 @@ namespace device_reduce_instance {
 // clang-format off
 // InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | NumReduceDim
 ADD_MULTIBLOCK_ATOMIC_ADD_INST_BY_ID(float, double, float, 0, 0, 0, 4, 3); // for ADD
+ADD_MULTIBLOCK_ATOMIC_ADD_INST_BY_ID(float, double, float, 0, 0, 0, 4, 4);
 ADD_MULTIBLOCK_ATOMIC_ADD_INST_BY_ID(float, double, float, 0, 0, 0, 4, 1);
 ADD_MULTIBLOCK_ATOMIC_ADD_INST_BY_ID(float, double, float, 0, 0, 0, 2, 1);
 ADD_MULTIBLOCK_ATOMIC_ADD_INST_BY_ID(float, double, float, 5, 0, 0, 4, 3); // for AVG
+ADD_MULTIBLOCK_ATOMIC_ADD_INST_BY_ID(float, double, float, 5, 0, 0, 4, 4);
 ADD_MULTIBLOCK_ATOMIC_ADD_INST_BY_ID(float, double, float, 5, 0, 0, 4, 1);
 ADD_MULTIBLOCK_ATOMIC_ADD_INST_BY_ID(float, double, float, 5, 0, 0, 2, 1);
 // clang-format on
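Only ADD and AVG (ReduceOpId 0 and 5) appear in the MultiBlockAtomicAdd tables: on this path each workgroup folds its partial result into the output with a single atomic add, which is only valid when the final combine is itself an addition (AVG is an addition followed by an elementwise scale). A minimal sketch of the idea, using std::atomic in place of the HIP atomics the real kernels use; everything here is illustrative, not CK API:

```cpp
#include <atomic>

// Each "workgroup" computes a partial sum over its slice of the reduced
// dimensions, then commits it with one atomic add. This is correct for
// ADD/AVG because addition is commutative and associative; MIN/MAX/AMAX
// need a different final combine, hence the partial-reduce path below.
// Note: fetch_add on std::atomic<float> requires C++20.
void commit_partial_sum(std::atomic<float>& out, float partial_sum)
{
    out.fetch_add(partial_sum, std::memory_order_relaxed);
}
```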
diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_partial_reduce_b16_f32_b16.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_partial_reduce_b16_f32_b16.cpp
new file mode 100644
index 0000000000..d740fcfa8f
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_partial_reduce_b16_f32_b16.cpp
@@ -0,0 +1,53 @@
+#include "device_reduce_instance_multiblock_partial_reduce.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace device_reduce_instance {
+
+// clang-format off
+// InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | NumReduceDim
+ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(bhalf_t, float, bhalf_t, 0, 0, 0, 4, 3); // for ADD
+ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(bhalf_t, float, bhalf_t, 0, 0, 0, 4, 4);
+ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(bhalf_t, float, bhalf_t, 0, 0, 0, 4, 1);
+ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(bhalf_t, float, bhalf_t, 0, 0, 0, 2, 1);
+ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(bhalf_t, float, bhalf_t, 5, 0, 0, 4, 3); // for AVG
+ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(bhalf_t, float, bhalf_t, 5, 0, 0, 4, 4);
+ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(bhalf_t, float, bhalf_t, 5, 0, 0, 4, 1);
+ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(bhalf_t, float, bhalf_t, 5, 0, 0, 2, 1);
+ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(bhalf_t, float, bhalf_t, 7, 0, 0, 4, 3); // for NORM2
+ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(bhalf_t, float, bhalf_t, 7, 0, 0, 4, 4);
+ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(bhalf_t, float, bhalf_t, 7, 0, 0, 4, 1);
+ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(bhalf_t, float, bhalf_t, 7, 0, 0, 2, 1);
+
+ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(bhalf_t, float, bhalf_t, 2, 0, 0, 4, 3); // for MIN
+ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(bhalf_t, float, bhalf_t, 2, 0, 0, 4, 4);
+ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(bhalf_t, float, bhalf_t, 2, 0, 0, 4, 1);
+ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(bhalf_t, float, bhalf_t, 2, 0, 0, 2, 1);
+ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(bhalf_t, float, bhalf_t, 3, 0, 0, 4, 3); // for MAX
+ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(bhalf_t, float, bhalf_t, 3, 0, 0, 4, 4);
+ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(bhalf_t, float, bhalf_t, 3, 0, 0, 4, 1);
+ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(bhalf_t, float, bhalf_t, 3, 0, 0, 2, 1);
+ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(bhalf_t, float, bhalf_t, 4, 0, 0, 4, 3); // for AMAX
+ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(bhalf_t, float, bhalf_t, 4, 0, 0, 4, 4);
+ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(bhalf_t, float, bhalf_t, 4, 0, 0, 4, 1);
+ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(bhalf_t, float, bhalf_t, 4, 0, 0, 2, 1);
+ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(bhalf_t, float, bhalf_t, 2, 0, 1, 4, 3); // for MIN
+ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(bhalf_t, float, bhalf_t, 2, 0, 1, 4, 4);
+ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(bhalf_t, float, bhalf_t, 2, 0, 1, 4, 1);
+ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(bhalf_t, float, bhalf_t, 2, 0, 1, 2, 1);
+ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(bhalf_t, float, bhalf_t, 3, 0, 1, 4, 3); // for MAX
+ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(bhalf_t, float, bhalf_t, 3, 0, 1, 4, 4);
+ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(bhalf_t, float, bhalf_t, 3, 0, 1, 4, 1);
+ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(bhalf_t, float, bhalf_t, 3, 0, 1, 2, 1);
+ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(bhalf_t, float, bhalf_t, 4, 0, 1, 4, 3); // for AMAX
+ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(bhalf_t, float, bhalf_t, 4, 0, 1, 4, 4);
+ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(bhalf_t, float, bhalf_t, 4, 0, 1, 4, 1);
+ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(bhalf_t, float, bhalf_t, 4, 0, 1, 2, 1);
+// clang-format on
+
+} // namespace device_reduce_instance
+} // namespace device
+} // namespace tensor_operation
+
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_partial_reduce_f16_f16_f16.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_partial_reduce_f16_f16_f16.cpp
index 527ebc5386..f57ed5ad86 100644
--- a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_partial_reduce_f16_f16_f16.cpp
+++ b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_partial_reduce_f16_f16_f16.cpp
@@ -8,21 +8,27 @@ namespace device_reduce_instance {
 // clang-format off
 // InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | NumReduceDim
 ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(half_t, half_t, half_t, 2, 0, 0, 4, 3); // for MIN
+ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(half_t, half_t, half_t, 2, 0, 0, 4, 4);
 ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(half_t, half_t, half_t, 2, 0, 0, 4, 1);
 ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(half_t, half_t, half_t, 2, 0, 0, 2, 1);
 ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(half_t, half_t, half_t, 3, 0, 0, 4, 3); // for MAX
+ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(half_t, half_t, half_t, 3, 0, 0, 4, 4);
 ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(half_t, half_t, half_t, 3, 0, 0, 4, 1);
 ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(half_t, half_t, half_t, 3, 0, 0, 2, 1);
 ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(half_t, half_t, half_t, 4, 0, 0, 4, 3); // for AMAX
+ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(half_t, half_t, half_t, 4, 0, 0, 4, 4);
 ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(half_t, half_t, half_t, 4, 0, 0, 4, 1);
 ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(half_t, half_t, half_t, 4, 0, 0, 2, 1);
 ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(half_t, half_t, half_t, 2, 0, 1, 4, 3); // for MIN
+ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(half_t, half_t, half_t, 2, 0, 1, 4, 4);
 ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(half_t, half_t, half_t, 2, 0, 1, 4, 1);
 ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(half_t, half_t, half_t, 2, 0, 1, 2, 1);
 ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(half_t, half_t, half_t, 3, 0, 1, 4, 3); // for MAX
+ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(half_t, half_t, half_t, 3, 0, 1, 4, 4);
 ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(half_t, half_t, half_t, 3, 0, 1, 4, 1);
 ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(half_t, half_t, half_t, 3, 0, 1, 2, 1);
 ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(half_t, half_t, half_t, 4, 0, 1, 4, 3); // for AMAX
+ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(half_t, half_t, half_t, 4, 0, 1, 4, 4);
 ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(half_t, half_t, half_t, 4, 0, 1, 4, 1);
 ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(half_t, half_t, half_t, 4, 0, 1, 2, 1);
 // clang-format on
diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_partial_reduce_f16_f32_f16.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_partial_reduce_f16_f32_f16.cpp
index ace76f4675..724b364104 100644
--- a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_partial_reduce_f16_f32_f16.cpp
+++ b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_partial_reduce_f16_f32_f16.cpp
@@ -8,12 +8,15 @@ namespace device_reduce_instance {
 // clang-format off
 // InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | NumReduceDim
 ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(half_t, float, half_t, 0, 0, 0, 4, 3); // for ADD
+ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(half_t, float, half_t, 0, 0, 0, 4, 4);
 ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(half_t, float, half_t, 0, 0, 0, 4, 1);
 ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(half_t, float, half_t, 0, 0, 0, 2, 1);
 ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(half_t, float, half_t, 5, 0, 0, 4, 3); // for AVG
+ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(half_t, float, half_t, 5, 0, 0, 4, 4);
 ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(half_t, float, half_t, 5, 0, 0, 4, 1);
 ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(half_t, float, half_t, 5, 0, 0, 2, 1);
 ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(half_t, float, half_t, 7, 0, 0, 4, 3); // for NORM2
+ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(half_t, float, half_t, 7, 0, 0, 4, 4);
 ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(half_t, float, half_t, 7, 0, 0, 4, 1);
 ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(half_t, float, half_t, 7, 0, 0, 2, 1);
 // clang-format on
diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_partial_reduce_f32_f32_f32.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_partial_reduce_f32_f32_f32.cpp
index 767dca99bd..15028a0b4c 100644
--- a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_partial_reduce_f32_f32_f32.cpp
+++ b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_partial_reduce_f32_f32_f32.cpp
@@ -8,25 +8,32 @@ namespace device_reduce_instance {
 // clang-format off
 // InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | NumReduceDim
 ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(float, float, float, 2, 0, 0, 4, 3); // for MIN
+ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(float, float, float, 2, 0, 0, 4, 4);
 ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(float, float, float, 2, 0, 0, 4, 1);
 ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(float, float, float, 2, 0, 0, 2, 1);
 ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(float, float, float, 3, 0, 0, 4, 3); // for MAX
+ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(float, float, float, 3, 0, 0, 4, 4);
 ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(float, float, float, 3, 0, 0, 4, 1);
 ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(float, float, float, 3, 0, 0, 2, 1);
 ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(float, float, float, 4, 0, 0, 4, 3); // for AMAX
+ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(float, float, float, 4, 0, 0, 4, 4);
 ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(float, float, float, 4, 0, 0, 4, 1);
 ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(float, float, float, 4, 0, 0, 2, 1);
 ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(float, float, float, 2, 0, 1, 4, 3); // for MIN
+ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(float, float, float, 2, 0, 1, 4, 4);
 ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(float, float, float, 2, 0, 1, 4, 1);
 ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(float, float, float, 2, 0, 1, 2, 1);
 ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(float, float, float, 3, 0, 1, 4, 3); // for MAX
+ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(float, float, float, 3, 0, 1, 4, 4);
 ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(float, float, float, 3, 0, 1, 4, 1);
 ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(float, float, float, 3, 0, 1, 2, 1);
 ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(float, float, float, 4, 0, 1, 4, 3); // for AMAX
+ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(float, float, float, 4, 0, 1, 4, 4);
 ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(float, float, float, 4, 0, 1, 4, 1);
 ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(float, float, float, 4, 0, 1, 2, 1);
 ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(float, float, float, 7, 0, 0, 4, 3); // for NORM2
+ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(float, float, float, 7, 0, 0, 4, 4);
 ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(float, float, float, 7, 0, 0, 4, 1);
 ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(float, float, float, 7, 0, 0, 2, 1);
 // clang-format on
diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_partial_reduce_f32_f64_f32.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_partial_reduce_f32_f64_f32.cpp
index 2ed21e74e8..ec0ba3cf8e 100644
--- a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_partial_reduce_f32_f64_f32.cpp
+++ b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_partial_reduce_f32_f64_f32.cpp
@@ -8,6 +8,7 @@ namespace device_reduce_instance {
 // clang-format off
 // InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | NumReduceDim
 ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(float, double, float, 7, 0, 0, 4, 3); // for NORM2
+ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(float, double, float, 7, 0, 0, 4, 4);
 ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(float, double, float, 7, 0, 0, 4, 1);
 ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(float, double, float, 7, 0, 0, 2, 1);
 // clang-format on
diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_partial_reduce_f64_f64_f64.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_partial_reduce_f64_f64_f64.cpp
index 95bd1daa8f..9ff2dcd93b 100644
--- a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_partial_reduce_f64_f64_f64.cpp
+++ b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_partial_reduce_f64_f64_f64.cpp
@@ -8,33 +8,42 @@ namespace device_reduce_instance {
 // clang-format off
 // InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | NumReduceDim
 ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(double, double, double, 2, 0, 0, 4, 3); // for MIN
+ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(double, double, double, 2, 0, 0, 4, 4);
 ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(double, double, double, 2, 0, 0, 4, 1);
 ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(double, double, double, 2, 0, 0, 2, 1);
 ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(double, double, double, 3, 0, 0, 4, 3); // for MAX
+ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(double, double, double, 3, 0, 0, 4, 4);
 ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(double, double, double, 3, 0, 0, 4, 1);
 ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(double, double, double, 3, 0, 0, 2, 1);
 ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(double, double, double, 4, 0, 0, 4, 3); // for AMAX
+ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(double, double, double, 4, 0, 0, 4, 4);
 ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(double, double, double, 4, 0, 0, 4, 1);
 ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(double, double, double, 4, 0, 0, 2, 1);
 ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(double, double, double, 2, 0, 1, 4, 3); // for MIN
+ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(double, double, double, 2, 0, 1, 4, 4);
 ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(double, double, double, 2, 0, 1, 4, 1);
 ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(double, double, double, 2, 0, 1, 2, 1);
 ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(double, double, double, 3, 0, 1, 4, 3); // for MAX
+ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(double, double, double, 3, 0, 1, 4, 4);
 ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(double, double, double, 3, 0, 1, 4, 1);
 ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(double, double, double, 3, 0, 1, 2, 1);
 ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(double, double, double, 4, 0, 1, 4, 3); // for AMAX
+ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(double, double, double, 4, 0, 1, 4, 4);
 ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(double, double, double, 4, 0, 1, 4, 1);
 ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(double, double, double, 4, 0, 1, 2, 1);
 ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(double, double, double, 7, 0, 0, 4, 3); // for NORM2
+ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(double, double, double, 7, 0, 0, 4, 4);
 ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(double, double, double, 7, 0, 0, 4, 1);
 ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(double, double, double, 7, 0, 0, 2, 1);
 
 // Will be moved to use MultiBlockAtomicAdd
 ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(double, double, double, 0, 0, 0, 4, 3); // for ADD
+ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(double, double, double, 0, 0, 0, 4, 4);
 ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(double, double, double, 0, 0, 0, 4, 1);
 ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(double, double, double, 0, 0, 0, 2, 1);
 ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(double, double, double, 5, 0, 0, 4, 3); // for AVG
+ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(double, double, double, 5, 0, 0, 4, 4);
 ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(double, double, double, 5, 0, 0, 4, 1);
 ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(double, double, double, 5, 0, 0, 2, 1);
 // clang-format on
diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_partial_reduce_i8_i32_i8.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_partial_reduce_i8_i32_i8.cpp
new file mode 100644
index 0000000000..0e37c2947f
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_partial_reduce_i8_i32_i8.cpp
@@ -0,0 +1,24 @@
+#include "device_reduce_instance_multiblock_partial_reduce.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace device_reduce_instance {
+
+// clang-format off
+// InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | NumReduceDim
+ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(int8_t, int32_t, int8_t, 0, 0, 0, 4, 3); // for ADD
+ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(int8_t, int32_t, int8_t, 0, 0, 0, 4, 4);
+ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(int8_t, int32_t, int8_t, 0, 0, 0, 4, 1);
+ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(int8_t, int32_t, int8_t, 0, 0, 0, 2, 1);
+ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(int8_t, int32_t, int8_t, 5, 0, 0, 4, 3); // for AVG
+ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(int8_t, int32_t, int8_t, 5, 0, 0, 4, 4);
+ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(int8_t, int32_t, int8_t, 5, 0, 0, 4, 1);
+ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(int8_t, int32_t, int8_t, 5, 0, 0, 2, 1);
+// clang-format on
+
+} // namespace device_reduce_instance
+} // namespace device
+} // namespace tensor_operation
+
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_partial_reduce_i8_i8_i8.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_partial_reduce_i8_i8_i8.cpp
new file mode 100644
index 0000000000..4634faed06
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_partial_reduce_i8_i8_i8.cpp
@@ -0,0 +1,40 @@
+#include "device_reduce_instance_multiblock_partial_reduce.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace device_reduce_instance {
+
+// clang-format off
+// InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | NumReduceDim
+ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(int8_t, int8_t, int8_t, 2, 0, 0, 4, 3); // for MIN
+ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(int8_t, int8_t, int8_t, 2, 0, 0, 4, 4);
+ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(int8_t, int8_t, int8_t, 2, 0, 0, 4, 1);
+ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(int8_t, int8_t, int8_t, 2, 0, 0, 2, 1);
+ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(int8_t, int8_t, int8_t, 3, 0, 0, 4, 3); // for MAX
+ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(int8_t, int8_t, int8_t, 3, 0, 0, 4, 4);
+ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(int8_t, int8_t, int8_t, 3, 0, 0, 4, 1);
+ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(int8_t, int8_t, int8_t, 3, 0, 0, 2, 1);
+ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(int8_t, int8_t, int8_t, 4, 0, 0, 4, 3); // for AMAX
+ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(int8_t, int8_t, int8_t, 4, 0, 0, 4, 4);
+ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(int8_t, int8_t, int8_t, 4, 0, 0, 4, 1);
+ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(int8_t, int8_t, int8_t, 4, 0, 0, 2, 1);
+ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(int8_t, int8_t, int8_t, 2, 0, 1, 4, 3); // for MIN
+ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(int8_t, int8_t, int8_t, 2, 0, 1, 4, 4);
+ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(int8_t, int8_t, int8_t, 2, 0, 1, 4, 1);
+ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(int8_t, int8_t, int8_t, 2, 0, 1, 2, 1);
+ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(int8_t, int8_t, int8_t, 3, 0, 1, 4, 3); // for MAX
+ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(int8_t, int8_t, int8_t, 3, 0, 1, 4, 4);
+ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(int8_t, int8_t, int8_t, 3, 0, 1, 4, 1);
+ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(int8_t, int8_t, int8_t, 3, 0, 1, 2, 1);
+ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(int8_t, int8_t, int8_t, 4, 0, 1, 4, 3); // for AMAX
+ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(int8_t, int8_t, int8_t, 4, 0, 1, 4, 4);
+ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(int8_t, int8_t, int8_t, 4, 0, 1, 4, 1);
+ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(int8_t, int8_t, int8_t, 4, 0, 1, 2, 1);
+// clang-format on
+
+} // namespace device_reduce_instance
+} // namespace device
+} // namespace tensor_operation
+
+} // namespace ck
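The MultiBlockPartialReduce instances never produce the final value on their own: each workgroup writes one partial accumulation into a workspace buffer, and a BlockWiseSecondCall instance (registered above) re-reduces that workspace, which is why `GetWorkspaceSizeInBytes` in this PR now also takes `reduceDims`. A plain-C++ illustration of the two-stage scheme, with no CK types involved (the real driver code is in `profile_reduce_impl.hpp` later in this diff):

```cpp
#include <cstdio>
#include <numeric>
#include <vector>

// Stage 1 splits the reduced elements across "workgroups" that each emit one
// partial result into a workspace; stage 2 reduces the workspace to the final
// value. On the GPU the workspace lives in ws_dev and stage 2 is the
// BlockWiseSecondCall kernel.
int main()
{
    std::vector<float> input(1024, 1.0f);
    const int num_groups = 8;
    const int per_group  = static_cast<int>(input.size()) / num_groups;

    std::vector<float> workspace(num_groups); // stage-1 output, one slot per group
    for(int g = 0; g < num_groups; ++g)
        workspace[g] = std::accumulate(input.begin() + g * per_group,
                                       input.begin() + (g + 1) * per_group,
                                       0.0f);

    float result = std::accumulate(workspace.begin(), workspace.end(), 0.0f); // stage 2
    std::printf("sum = %f\n", result); // prints 1024.000000
}
```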
diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_b16_f32_b16.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_b16_f32_b16.cpp
new file mode 100644
index 0000000000..02fc4b4c01
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_b16_f32_b16.cpp
@@ -0,0 +1,53 @@
+#include "device_reduce_instance_threadwise.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace device_reduce_instance {
+
+// clang-format off
+// InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | NumReduceDim
+ADD_THREADWISE_INST_BY_ID(bhalf_t, float, bhalf_t, 0, 0, 0, 4, 3); // for ADD
+ADD_THREADWISE_INST_BY_ID(bhalf_t, float, bhalf_t, 0, 0, 0, 4, 4);
+ADD_THREADWISE_INST_BY_ID(bhalf_t, float, bhalf_t, 0, 0, 0, 4, 1);
+ADD_THREADWISE_INST_BY_ID(bhalf_t, float, bhalf_t, 0, 0, 0, 2, 1);
+ADD_THREADWISE_INST_BY_ID(bhalf_t, float, bhalf_t, 5, 0, 0, 4, 3); // for AVG
+ADD_THREADWISE_INST_BY_ID(bhalf_t, float, bhalf_t, 5, 0, 0, 4, 4);
+ADD_THREADWISE_INST_BY_ID(bhalf_t, float, bhalf_t, 5, 0, 0, 4, 1);
+ADD_THREADWISE_INST_BY_ID(bhalf_t, float, bhalf_t, 5, 0, 0, 2, 1);
+ADD_THREADWISE_INST_BY_ID(bhalf_t, float, bhalf_t, 7, 0, 0, 4, 3); // for NORM2
+ADD_THREADWISE_INST_BY_ID(bhalf_t, float, bhalf_t, 7, 0, 0, 4, 4);
+ADD_THREADWISE_INST_BY_ID(bhalf_t, float, bhalf_t, 7, 0, 0, 4, 1);
+ADD_THREADWISE_INST_BY_ID(bhalf_t, float, bhalf_t, 7, 0, 0, 2, 1);
+
+ADD_THREADWISE_INST_BY_ID(bhalf_t, float, bhalf_t, 2, 0, 0, 4, 3); // for MIN
+ADD_THREADWISE_INST_BY_ID(bhalf_t, float, bhalf_t, 2, 0, 0, 4, 4);
+ADD_THREADWISE_INST_BY_ID(bhalf_t, float, bhalf_t, 2, 0, 0, 4, 1);
+ADD_THREADWISE_INST_BY_ID(bhalf_t, float, bhalf_t, 2, 0, 0, 2, 1);
+ADD_THREADWISE_INST_BY_ID(bhalf_t, float, bhalf_t, 3, 0, 0, 4, 3); // for MAX
+ADD_THREADWISE_INST_BY_ID(bhalf_t, float, bhalf_t, 3, 0, 0, 4, 4);
+ADD_THREADWISE_INST_BY_ID(bhalf_t, float, bhalf_t, 3, 0, 0, 4, 1);
+ADD_THREADWISE_INST_BY_ID(bhalf_t, float, bhalf_t, 3, 0, 0, 2, 1);
+ADD_THREADWISE_INST_BY_ID(bhalf_t, float, bhalf_t, 4, 0, 0, 4, 3); // for AMAX
+ADD_THREADWISE_INST_BY_ID(bhalf_t, float, bhalf_t, 4, 0, 0, 4, 4);
+ADD_THREADWISE_INST_BY_ID(bhalf_t, float, bhalf_t, 4, 0, 0, 4, 1);
+ADD_THREADWISE_INST_BY_ID(bhalf_t, float, bhalf_t, 4, 0, 0, 2, 1);
+ADD_THREADWISE_INST_BY_ID(bhalf_t, float, bhalf_t, 2, 0, 1, 4, 3); // for MIN
+ADD_THREADWISE_INST_BY_ID(bhalf_t, float, bhalf_t, 2, 0, 1, 4, 4);
+ADD_THREADWISE_INST_BY_ID(bhalf_t, float, bhalf_t, 2, 0, 1, 4, 1);
+ADD_THREADWISE_INST_BY_ID(bhalf_t, float, bhalf_t, 2, 0, 1, 2, 1);
+ADD_THREADWISE_INST_BY_ID(bhalf_t, float, bhalf_t, 3, 0, 1, 4, 3); // for MAX
+ADD_THREADWISE_INST_BY_ID(bhalf_t, float, bhalf_t, 3, 0, 1, 4, 4);
+ADD_THREADWISE_INST_BY_ID(bhalf_t, float, bhalf_t, 3, 0, 1, 4, 1);
+ADD_THREADWISE_INST_BY_ID(bhalf_t, float, bhalf_t, 3, 0, 1, 2, 1);
+ADD_THREADWISE_INST_BY_ID(bhalf_t, float, bhalf_t, 4, 0, 1, 4, 3); // for AMAX
+ADD_THREADWISE_INST_BY_ID(bhalf_t, float, bhalf_t, 4, 0, 1, 4, 4);
+ADD_THREADWISE_INST_BY_ID(bhalf_t, float, bhalf_t, 4, 0, 1, 4, 1);
+ADD_THREADWISE_INST_BY_ID(bhalf_t, float, bhalf_t, 4, 0, 1, 2, 1);
+// clang-format on
+
+} // namespace device_reduce_instance
+} // namespace device
+} // namespace tensor_operation
+
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f16_f16_f16.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f16_f16_f16.cpp
index 70b667e7d2..0984cdc46b 100644
--- a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f16_f16_f16.cpp
+++ b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f16_f16_f16.cpp
@@ -8,21 +8,27 @@ namespace device_reduce_instance {
 // clang-format off
 // InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | NumReduceDim
 ADD_THREADWISE_INST_BY_ID(half_t, half_t, half_t, 2, 0, 0, 4, 3); // for MIN
+ADD_THREADWISE_INST_BY_ID(half_t, half_t, half_t, 2, 0, 0, 4, 4);
 ADD_THREADWISE_INST_BY_ID(half_t, half_t, half_t, 2, 0, 0, 4, 1);
 ADD_THREADWISE_INST_BY_ID(half_t, half_t, half_t, 2, 0, 0, 2, 1);
 ADD_THREADWISE_INST_BY_ID(half_t, half_t, half_t, 3, 0, 0, 4, 3); // for MAX
+ADD_THREADWISE_INST_BY_ID(half_t, half_t, half_t, 3, 0, 0, 4, 4);
 ADD_THREADWISE_INST_BY_ID(half_t, half_t, half_t, 3, 0, 0, 4, 1);
 ADD_THREADWISE_INST_BY_ID(half_t, half_t, half_t, 3, 0, 0, 2, 1);
 ADD_THREADWISE_INST_BY_ID(half_t, half_t, half_t, 4, 0, 0, 4, 3); // for AMAX
+ADD_THREADWISE_INST_BY_ID(half_t, half_t, half_t, 4, 0, 0, 4, 4);
 ADD_THREADWISE_INST_BY_ID(half_t, half_t, half_t, 4, 0, 0, 4, 1);
 ADD_THREADWISE_INST_BY_ID(half_t, half_t, half_t, 4, 0, 0, 2, 1);
 ADD_THREADWISE_INST_BY_ID(half_t, half_t, half_t, 2, 0, 1, 4, 3); // for MIN
+ADD_THREADWISE_INST_BY_ID(half_t, half_t, half_t, 2, 0, 1, 4, 4);
 ADD_THREADWISE_INST_BY_ID(half_t, half_t, half_t, 2, 0, 1, 4, 1);
 ADD_THREADWISE_INST_BY_ID(half_t, half_t, half_t, 2, 0, 1, 2, 1);
 ADD_THREADWISE_INST_BY_ID(half_t, half_t, half_t, 3, 0, 1, 4, 3); // for MAX
+ADD_THREADWISE_INST_BY_ID(half_t, half_t, half_t, 3, 0, 1, 4, 4);
 ADD_THREADWISE_INST_BY_ID(half_t, half_t, half_t, 3, 0, 1, 4, 1);
 ADD_THREADWISE_INST_BY_ID(half_t, half_t, half_t, 3, 0, 1, 2, 1);
 ADD_THREADWISE_INST_BY_ID(half_t, half_t, half_t, 4, 0, 1, 4, 3); // for AMAX
+ADD_THREADWISE_INST_BY_ID(half_t, half_t, half_t, 4, 0, 1, 4, 4);
 ADD_THREADWISE_INST_BY_ID(half_t, half_t, half_t, 4, 0, 1, 4, 1);
 ADD_THREADWISE_INST_BY_ID(half_t, half_t, half_t, 4, 0, 1, 2, 1);
 // clang-format on
diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f16_f32_f16.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f16_f32_f16.cpp
index 6b81513c27..64f14bd4e7 100644
--- a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f16_f32_f16.cpp
+++ b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f16_f32_f16.cpp
@@ -8,12 +8,15 @@ namespace device_reduce_instance {
 // clang-format off
 // InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | NumReduceDim
 ADD_THREADWISE_INST_BY_ID(half_t, float, half_t, 0, 0, 0, 4, 3); // for ADD
+ADD_THREADWISE_INST_BY_ID(half_t, float, half_t, 0, 0, 0, 4, 4);
 ADD_THREADWISE_INST_BY_ID(half_t, float, half_t, 0, 0, 0, 4, 1);
 ADD_THREADWISE_INST_BY_ID(half_t, float, half_t, 0, 0, 0, 2, 1);
 ADD_THREADWISE_INST_BY_ID(half_t, float, half_t, 5, 0, 0, 4, 3); // for AVG
+ADD_THREADWISE_INST_BY_ID(half_t, float, half_t, 5, 0, 0, 4, 4);
 ADD_THREADWISE_INST_BY_ID(half_t, float, half_t, 5, 0, 0, 4, 1);
 ADD_THREADWISE_INST_BY_ID(half_t, float, half_t, 5, 0, 0, 2, 1);
 ADD_THREADWISE_INST_BY_ID(half_t, float, half_t, 7, 0, 0, 4, 3); // for NORM2
+ADD_THREADWISE_INST_BY_ID(half_t, float, half_t, 7, 0, 0, 4, 4);
 ADD_THREADWISE_INST_BY_ID(half_t, float, half_t, 7, 0, 0, 4, 1);
 ADD_THREADWISE_INST_BY_ID(half_t, float, half_t, 7, 0, 0, 2, 1);
 // clang-format on
diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f32_f32.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f32_f32.cpp
index 27076415e6..69ed303b17 100644
--- a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f32_f32.cpp
+++ b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f32_f32.cpp
@@ -8,30 +8,39 @@ namespace device_reduce_instance {
 // clang-format off
 // InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | NumReduceDim
 ADD_THREADWISE_INST_BY_ID(float, float, float, 0, 0, 0, 4, 3); // for ADD
+ADD_THREADWISE_INST_BY_ID(float, float, float, 0, 0, 0, 4, 4);
 ADD_THREADWISE_INST_BY_ID(float, float, float, 0, 0, 0, 4, 1);
 ADD_THREADWISE_INST_BY_ID(float, float, float, 0, 0, 0, 2, 1);
 ADD_THREADWISE_INST_BY_ID(float, float, float, 5, 0, 0, 4, 3); // for AVG
+ADD_THREADWISE_INST_BY_ID(float, float, float, 5, 0, 0, 4, 4);
 ADD_THREADWISE_INST_BY_ID(float, float, float, 5, 0, 0, 4, 1);
 ADD_THREADWISE_INST_BY_ID(float, float, float, 5, 0, 0, 2, 1);
 ADD_THREADWISE_INST_BY_ID(float, float, float, 7, 0, 0, 4, 3); // for NORM2
+ADD_THREADWISE_INST_BY_ID(float, float, float, 7, 0, 0, 4, 4);
 ADD_THREADWISE_INST_BY_ID(float, float, float, 7, 0, 0, 4, 1);
 ADD_THREADWISE_INST_BY_ID(float, float, float, 7, 0, 0, 2, 1);
 ADD_THREADWISE_INST_BY_ID(float, float, float, 2, 0, 0, 4, 3); // for MIN
+ADD_THREADWISE_INST_BY_ID(float, float, float, 2, 0, 0, 4, 4);
 ADD_THREADWISE_INST_BY_ID(float, float, float, 2, 0, 0, 4, 1);
 ADD_THREADWISE_INST_BY_ID(float, float, float, 2, 0, 0, 2, 1);
 ADD_THREADWISE_INST_BY_ID(float, float, float, 3, 0, 0, 4, 3); // for MAX
+ADD_THREADWISE_INST_BY_ID(float, float, float, 3, 0, 0, 4, 4);
 ADD_THREADWISE_INST_BY_ID(float, float, float, 3, 0, 0, 4, 1);
 ADD_THREADWISE_INST_BY_ID(float, float, float, 3, 0, 0, 2, 1);
 ADD_THREADWISE_INST_BY_ID(float, float, float, 4, 0, 0, 4, 3); // for AMAX
+ADD_THREADWISE_INST_BY_ID(float, float, float, 4, 0, 0, 4, 4);
 ADD_THREADWISE_INST_BY_ID(float, float, float, 4, 0, 0, 4, 1);
 ADD_THREADWISE_INST_BY_ID(float, float, float, 4, 0, 0, 2, 1);
 ADD_THREADWISE_INST_BY_ID(float, float, float, 2, 0, 1, 4, 3); // for MIN
+ADD_THREADWISE_INST_BY_ID(float, float, float, 2, 0, 1, 4, 4);
 ADD_THREADWISE_INST_BY_ID(float, float, float, 2, 0, 1, 4, 1);
 ADD_THREADWISE_INST_BY_ID(float, float, float, 2, 0, 1, 2, 1);
 ADD_THREADWISE_INST_BY_ID(float, float, float, 3, 0, 1, 4, 3); // for MAX
+ADD_THREADWISE_INST_BY_ID(float, float, float, 3, 0, 1, 4, 4);
 ADD_THREADWISE_INST_BY_ID(float, float, float, 3, 0, 1, 4, 1);
 ADD_THREADWISE_INST_BY_ID(float, float, float, 3, 0, 1, 2, 1);
 ADD_THREADWISE_INST_BY_ID(float, float, float, 4, 0, 1, 4, 3); // for AMAX
+ADD_THREADWISE_INST_BY_ID(float, float, float, 4, 0, 1, 4, 4);
 ADD_THREADWISE_INST_BY_ID(float, float, float, 4, 0, 1, 4, 1);
 ADD_THREADWISE_INST_BY_ID(float, float, float, 4, 0, 1, 2, 1);
 // clang-format on
diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f64_f32.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f64_f32.cpp
index 52c84a4278..5d791cec41 100644
--- a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f64_f32.cpp
+++ b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f64_f32.cpp
@@ -8,12 +8,15 @@ namespace device_reduce_instance {
 // clang-format off
 // InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | NumReduceDim
 ADD_THREADWISE_INST_BY_ID(float, double, float, 0, 0, 0, 4, 3); // for ADD
+ADD_THREADWISE_INST_BY_ID(float, double, float, 0, 0, 0, 4, 4);
 ADD_THREADWISE_INST_BY_ID(float, double, float, 0, 0, 0, 4, 1);
 ADD_THREADWISE_INST_BY_ID(float, double, float, 0, 0, 0, 2, 1);
 ADD_THREADWISE_INST_BY_ID(float, double, float, 5, 0, 0, 4, 3); // for AVG
+ADD_THREADWISE_INST_BY_ID(float, double, float, 5, 0, 0, 4, 4);
 ADD_THREADWISE_INST_BY_ID(float, double, float, 5, 0, 0, 4, 1);
 ADD_THREADWISE_INST_BY_ID(float, double, float, 5, 0, 0, 2, 1);
 ADD_THREADWISE_INST_BY_ID(float, double, float, 7, 0, 0, 4, 3); // for NORM2
+ADD_THREADWISE_INST_BY_ID(float, double, float, 7, 0, 0, 4, 4);
 ADD_THREADWISE_INST_BY_ID(float, double, float, 7, 0, 0, 4, 1);
 ADD_THREADWISE_INST_BY_ID(float, double, float, 7, 0, 0, 2, 1);
 // clang-format on
diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f64_f64_f64.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f64_f64_f64.cpp
index f77122d5a0..16c0409134 100644
--- a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f64_f64_f64.cpp
+++ b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f64_f64_f64.cpp
@@ -8,30 +8,39 @@ namespace device_reduce_instance {
 // clang-format off
 // InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | NumReduceDim
 ADD_THREADWISE_INST_BY_ID(double, double, double, 0, 0, 0, 4, 3); // for ADD
+ADD_THREADWISE_INST_BY_ID(double, double, double, 0, 0, 0, 4, 4);
 ADD_THREADWISE_INST_BY_ID(double, double, double, 0, 0, 0, 4, 1);
 ADD_THREADWISE_INST_BY_ID(double, double, double, 0, 0, 0, 2, 1);
 ADD_THREADWISE_INST_BY_ID(double, double, double, 5, 0, 0, 4, 3); // for AVG
+ADD_THREADWISE_INST_BY_ID(double, double, double, 5, 0, 0, 4, 4);
 ADD_THREADWISE_INST_BY_ID(double, double, double, 5, 0, 0, 4, 1);
 ADD_THREADWISE_INST_BY_ID(double, double, double, 5, 0, 0, 2, 1);
 ADD_THREADWISE_INST_BY_ID(double, double, double, 7, 0, 0, 4, 3); // for NORM2
+ADD_THREADWISE_INST_BY_ID(double, double, double, 7, 0, 0, 4, 4);
 ADD_THREADWISE_INST_BY_ID(double, double, double, 7, 0, 0, 4, 1);
 ADD_THREADWISE_INST_BY_ID(double, double, double, 7, 0, 0, 2, 1);
 ADD_THREADWISE_INST_BY_ID(double, double, double, 2, 0, 0, 4, 3); // for MIN
+ADD_THREADWISE_INST_BY_ID(double, double, double, 2, 0, 0, 4, 4);
 ADD_THREADWISE_INST_BY_ID(double, double, double, 2, 0, 0, 4, 1);
 ADD_THREADWISE_INST_BY_ID(double, double, double, 2, 0, 0, 2, 1);
 ADD_THREADWISE_INST_BY_ID(double, double, double, 3, 0, 0, 4, 3); // for MAX
+ADD_THREADWISE_INST_BY_ID(double, double, double, 3, 0, 0, 4, 4);
 ADD_THREADWISE_INST_BY_ID(double, double, double, 3, 0, 0, 4, 1);
 ADD_THREADWISE_INST_BY_ID(double, double, double, 3, 0, 0, 2, 1);
 ADD_THREADWISE_INST_BY_ID(double, double, double, 4, 0, 0, 4, 3); // for AMAX
+ADD_THREADWISE_INST_BY_ID(double, double, double, 4, 0, 0, 4, 4);
 ADD_THREADWISE_INST_BY_ID(double, double, double, 4, 0, 0, 4, 1);
 ADD_THREADWISE_INST_BY_ID(double, double, double, 4, 0, 0, 2, 1);
 ADD_THREADWISE_INST_BY_ID(double, double, double, 2, 0, 1, 4, 3); // for MIN
+ADD_THREADWISE_INST_BY_ID(double, double, double, 2, 0, 1, 4, 4);
 ADD_THREADWISE_INST_BY_ID(double, double, double, 2, 0, 1, 4, 1);
 ADD_THREADWISE_INST_BY_ID(double, double, double, 2, 0, 1, 2, 1);
 ADD_THREADWISE_INST_BY_ID(double, double, double, 3, 0, 1, 4, 3); // for MAX
+ADD_THREADWISE_INST_BY_ID(double, double, double, 3, 0, 1, 4, 4);
 ADD_THREADWISE_INST_BY_ID(double, double, double, 3, 0, 1, 4, 1);
 ADD_THREADWISE_INST_BY_ID(double, double, double, 3, 0, 1, 2, 1);
 ADD_THREADWISE_INST_BY_ID(double, double, double, 4, 0, 1, 4, 3); // for AMAX
+ADD_THREADWISE_INST_BY_ID(double, double, double, 4, 0, 1, 4, 4);
 ADD_THREADWISE_INST_BY_ID(double, double, double, 4, 0, 1, 4, 1);
 ADD_THREADWISE_INST_BY_ID(double, double, double, 4, 0, 1, 2, 1);
 // clang-format on
diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_i8_i32_i8.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_i8_i32_i8.cpp
new file mode 100644
index 0000000000..7af7bc03f2
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_i8_i32_i8.cpp
@@ -0,0 +1,24 @@
+#include "device_reduce_instance_threadwise.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace device_reduce_instance {
+
+// clang-format off
+// InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | NumReduceDim
+ADD_THREADWISE_INST_BY_ID(int8_t, int32_t, int8_t, 0, 0, 0, 4, 3); // for ADD
+ADD_THREADWISE_INST_BY_ID(int8_t, int32_t, int8_t, 0, 0, 0, 4, 4);
+ADD_THREADWISE_INST_BY_ID(int8_t, int32_t, int8_t, 0, 0, 0, 4, 1);
+ADD_THREADWISE_INST_BY_ID(int8_t, int32_t, int8_t, 0, 0, 0, 2, 1);
+ADD_THREADWISE_INST_BY_ID(int8_t, int32_t, int8_t, 5, 0, 0, 4, 3); // for AVG
+ADD_THREADWISE_INST_BY_ID(int8_t, int32_t, int8_t, 5, 0, 0, 4, 4);
+ADD_THREADWISE_INST_BY_ID(int8_t, int32_t, int8_t, 5, 0, 0, 4, 1);
+ADD_THREADWISE_INST_BY_ID(int8_t, int32_t, int8_t, 5, 0, 0, 2, 1);
+// clang-format on
+
+} // namespace device_reduce_instance
+} // namespace device
+} // namespace tensor_operation
+
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_i8_i8_i8.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_i8_i8_i8.cpp
new file mode 100644
index 0000000000..9580aae057
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_i8_i8_i8.cpp
@@ -0,0 +1,40 @@
+#include "device_reduce_instance_threadwise.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace device_reduce_instance {
+
+// clang-format off
+// InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | NumReduceDim
+ADD_THREADWISE_INST_BY_ID(int8_t, int8_t, int8_t, 2, 0, 0, 4, 3); // for MIN
+ADD_THREADWISE_INST_BY_ID(int8_t, int8_t, int8_t, 2, 0, 0, 4, 4);
+ADD_THREADWISE_INST_BY_ID(int8_t, int8_t, int8_t, 2, 0, 0, 4, 1);
+ADD_THREADWISE_INST_BY_ID(int8_t, int8_t, int8_t, 2, 0, 0, 2, 1);
+ADD_THREADWISE_INST_BY_ID(int8_t, int8_t, int8_t, 3, 0, 0, 4, 3); // for MAX
+ADD_THREADWISE_INST_BY_ID(int8_t, int8_t, int8_t, 3, 0, 0, 4, 4);
+ADD_THREADWISE_INST_BY_ID(int8_t, int8_t, int8_t, 3, 0, 0, 4, 1);
+ADD_THREADWISE_INST_BY_ID(int8_t, int8_t, int8_t, 3, 0, 0, 2, 1);
+ADD_THREADWISE_INST_BY_ID(int8_t, int8_t, int8_t, 4, 0, 0, 4, 3); // for AMAX
+ADD_THREADWISE_INST_BY_ID(int8_t, int8_t, int8_t, 4, 0, 0, 4, 4);
+ADD_THREADWISE_INST_BY_ID(int8_t, int8_t, int8_t, 4, 0, 0, 4, 1);
+ADD_THREADWISE_INST_BY_ID(int8_t, int8_t, int8_t, 4, 0, 0, 2, 1);
+ADD_THREADWISE_INST_BY_ID(int8_t, int8_t, int8_t, 2, 0, 1, 4, 3); // for MIN
+ADD_THREADWISE_INST_BY_ID(int8_t, int8_t, int8_t, 2, 0, 1, 4, 4);
+ADD_THREADWISE_INST_BY_ID(int8_t, int8_t, int8_t, 2, 0, 1, 4, 1);
+ADD_THREADWISE_INST_BY_ID(int8_t, int8_t, int8_t, 2, 0, 1, 2, 1);
+ADD_THREADWISE_INST_BY_ID(int8_t, int8_t, int8_t, 3, 0, 1, 4, 3); // for MAX
+ADD_THREADWISE_INST_BY_ID(int8_t, int8_t, int8_t, 3, 0, 1, 4, 4);
+ADD_THREADWISE_INST_BY_ID(int8_t, int8_t, int8_t, 3, 0, 1, 4, 1);
+ADD_THREADWISE_INST_BY_ID(int8_t, int8_t, int8_t, 3, 0, 1, 2, 1);
+ADD_THREADWISE_INST_BY_ID(int8_t, int8_t, int8_t, 4, 0, 1, 4, 3); // for AMAX
+ADD_THREADWISE_INST_BY_ID(int8_t, int8_t, int8_t, 4, 0, 1, 4, 4);
+ADD_THREADWISE_INST_BY_ID(int8_t, int8_t, int8_t, 4, 0, 1, 4, 1);
+ADD_THREADWISE_INST_BY_ID(int8_t, int8_t, int8_t, 4, 0, 1, 2, 1);
+// clang-format on
+
+} // namespace device_reduce_instance
+} // namespace device
+} // namespace tensor_operation
+
+} // namespace ck
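Every MIN/MAX/AMAX table in the instance files above appears twice, once with IndicesOpt = 0 and once with IndicesOpt = 1, because only those three operations can also return the index of the winning element; profile_reduce_impl.hpp below encodes the same distinction as `op_support_indices`. A self-contained sketch of that predicate, with the operation IDs taken from the tables (illustration only, not the CK definition):

```cpp
#include <cstdio>

// Mirrors the op_support_indices idea from profile_reduce_impl.hpp: only
// MIN (2), MAX (3) and AMAX (4) have a meaningful "index of the result";
// ADD/AVG/NORM2 fold all elements together, so no single index exists.
constexpr bool op_support_indices(int reduce_op_id)
{
    return reduce_op_id == 2 || reduce_op_id == 3 || reduce_op_id == 4;
}

int main()
{
    std::printf("ADD supports indices?  %d\n", op_support_indices(0)); // prints 0
    std::printf("AMAX supports indices? %d\n", op_support_indices(4)); // prints 1
}
```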
{ namespace tensor_operation { @@ -20,34 +20,43 @@ struct ReduceDescription }; using reduce_description_instances = std::tuple, // for ADD + ReduceDescription<4, 4, 0, 0, 0>, ReduceDescription<4, 1, 0, 0, 0>, ReduceDescription<2, 1, 0, 0, 0>, ReduceDescription<4, 3, 5, 0, 0>, // for AVG + ReduceDescription<4, 4, 5, 0, 0>, ReduceDescription<4, 1, 5, 0, 0>, ReduceDescription<2, 1, 5, 0, 0>, ReduceDescription<4, 3, 7, 0, 0>, // for NORM2 + ReduceDescription<4, 4, 7, 0, 0>, ReduceDescription<4, 1, 7, 0, 0>, ReduceDescription<2, 1, 7, 0, 0>, ReduceDescription<4, 3, 2, 0, 0>, // for MIN + ReduceDescription<4, 4, 2, 0, 0>, ReduceDescription<4, 1, 2, 0, 0>, ReduceDescription<2, 1, 2, 0, 0>, ReduceDescription<4, 3, 3, 0, 0>, // for MAX + ReduceDescription<4, 4, 3, 0, 0>, ReduceDescription<4, 1, 3, 0, 0>, ReduceDescription<2, 1, 3, 0, 0>, ReduceDescription<4, 3, 4, 0, 0>, // for AMAX + ReduceDescription<4, 4, 4, 0, 0>, ReduceDescription<4, 1, 4, 0, 0>, ReduceDescription<2, 1, 4, 0, 0>, ReduceDescription<4, 3, 2, 0, 1>, // for MIN + ReduceDescription<4, 4, 2, 0, 1>, ReduceDescription<4, 1, 2, 0, 1>, ReduceDescription<2, 1, 2, 0, 1>, ReduceDescription<4, 3, 3, 0, 1>, // for MAX + ReduceDescription<4, 4, 3, 0, 1>, ReduceDescription<4, 1, 3, 0, 1>, ReduceDescription<2, 1, 3, 0, 1>, ReduceDescription<4, 3, 4, 0, 1>, // for AMAX + ReduceDescription<4, 4, 4, 0, 1>, ReduceDescription<4, 1, 4, 0, 1>, ReduceDescription<2, 1, 4, 0, 1>>; @@ -122,16 +131,16 @@ static void dumpBufferToFile(const char* fileName, T* data, size_t dataNumItems) }; // map the data type used by the GPU kernels to the corresponding type used by the host codes -template +template struct type_mapping { - using outDataType = inDataType; + using OutType = InType; }; template <> struct type_mapping { - using outDataType = half_float::half; + using OutType = half_float::half; }; template ::value && + ((!op_support_indices && !std::is_same::value) || + (op_support_indices && !std::is_same::value)); + + // 1) If InDataType is int8_t, the supported operation must be either indexable operations or + // ADD/AVG + constexpr bool invalid_reduce_5 = std::is_same::value && + (!op_support_indices && ReduceOpId != ReduceTensorOp_t::ADD && + ReduceOpId != ReduceTensorOp_t::AVG); + + // 1) If InDataType is bhalf_t, must use float as AccDataType for all reduction operations + constexpr bool invalid_reduce_6 = + std::is_same::value && !std::is_same::value; + + constexpr bool invalid_reduce = (invalid_reduce_1 || invalid_reduce_2 || invalid_reduce_3 || + invalid_reduce_4 || invalid_reduce_5 || invalid_reduce_6); if constexpr(!invalid_reduce) { @@ -205,8 +233,8 @@ void profile_reduce_impl_impl(bool do_verification, Tensor out_ref(outLengths); Tensor out(outLengths); - Tensor out_indices_ref(outLengths); - Tensor out_indices(outLengths); + Tensor out_indices_ref(outLengths); + Tensor out_indices(outLengths); auto inStrides = in.mDesc.GetStrides(); auto outStrides = out.mDesc.GetStrides(); @@ -220,20 +248,22 @@ void profile_reduce_impl_impl(bool do_verification, { switch(init_method) { - case 0: - in.GenerateTensorValue(GeneratorTensor_1{}, num_thread); - if(beta != 0.0f) - out_ref.GenerateTensorValue(GeneratorTensor_1{}, num_thread); - break; + case 0: break; case 1: + in.GenerateTensorValue(GeneratorTensor_1{1}, num_thread); + if(beta != 0.0f) + out_ref.GenerateTensorValue(GeneratorTensor_1{1}, num_thread); + break; + case 2: in.GenerateTensorValue(GeneratorTensor_2{-5, 5}, num_thread); if(beta != 0.0f) out_ref.GenerateTensorValue(GeneratorTensor_2{-5, 5}, 
num_thread); break; default: - in.GenerateTensorValue(GeneratorTensor_2{1, 5}, num_thread); + in.GenerateTensorValue(GeneratorTensor_3{-5.0, 5.0}, num_thread); if(beta != 0.0f) - out_ref.GenerateTensorValue(GeneratorTensor_2{1, 5}, num_thread); + out_ref.GenerateTensorValue(GeneratorTensor_3{-5.0, 5.0}, + num_thread); } if(beta != 0.0f) @@ -306,6 +336,7 @@ void profile_reduce_impl_impl(bool do_verification, IndicesOpt>(reduce0_ptrs); if constexpr(use_atomic_add) + { add_device_reduce_instance_multiblock_atomic_add(reduce0_ptrs); + } else + { add_device_reduce_instance_multiblock_partial_reduce(reduce1_ptrs); + }; // used for secondary reduction if constexpr(!use_atomic_add) + { add_device_reduce_instance_blockwise_second_call(reduce2_ptrs); + }; if(reduce0_ptrs.empty() && reduce1_ptrs.empty()) { @@ -342,17 +378,24 @@ void profile_reduce_impl_impl(bool do_verification, if(do_verification) { - using hInType = typename type_mapping::outDataType; - using hOutType = typename type_mapping::outDataType; - using hCompType = typename type_mapping::outDataType; + using HostInDataType = typename type_mapping::OutType; + using HostOutDataType = typename type_mapping::OutType; + using HostAccDataType = typename type_mapping::OutType; - ReductionHost + ReductionHost hostReduce(in.mDesc, out_ref.mDesc, invariantDims, reduceDims); hostReduce.Run(alpha, - reinterpret_cast(in.mData.data()), + reinterpret_cast(in.mData.data()), beta, - reinterpret_cast(out_ref.mData.data()), + reinterpret_cast(out_ref.mData.data()), out_indices_ref.mData.data()); }; @@ -363,24 +406,27 @@ void profile_reduce_impl_impl(bool do_verification, for(auto& reduce_ptr : reduce0_ptrs) { - auto wsSizeInBytes = reduce_ptr->GetWorkspaceSizeInBytes(i_inLengths); + auto wsSizeInBytes = reduce_ptr->GetWorkspaceSizeInBytes(i_inLengths, reduceDims); DeviceMem ws_dev(wsSizeInBytes); - auto argument_ptr = reduce_ptr->MakeArgumentPointer( - i_inLengths, - i_inStrides, - i_outLengths, - i_outStrides, - reduceDims, - alpha, - beta, - in_dev.GetDeviceBuffer(), - out_dev.GetDeviceBuffer(), - out_indices_dev.GetDeviceBuffer(), - ws_dev.GetDeviceBuffer(), - InElementwiseOperation_0{static_cast(reduce_total_length)}, - AccElementwiseOperation_0{static_cast(reduce_total_length)}); + InElementwiseOperation_0 in_elementwise_op_0(static_cast(reduce_total_length)); + AccElementwiseOperation_0 acc_elementwise_op_0( + static_cast(reduce_total_length)); + + auto argument_ptr = reduce_ptr->MakeArgumentPointer(i_inLengths, + i_inStrides, + i_outLengths, + i_outStrides, + reduceDims, + alpha, + beta, + in_dev.GetDeviceBuffer(), + out_dev.GetDeviceBuffer(), + out_indices_dev.GetDeviceBuffer(), + ws_dev.GetDeviceBuffer(), + in_elementwise_op_0, + acc_elementwise_op_0); if(!reduce_ptr->IsSupportedArgument(argument_ptr.get())) continue; @@ -445,24 +491,27 @@ void profile_reduce_impl_impl(bool do_verification, for(auto& reduce_ptr : reduce1_ptrs) { - auto wsSizeInBytes = reduce_ptr->GetWorkspaceSizeInBytes(i_inLengths); + auto wsSizeInBytes = reduce_ptr->GetWorkspaceSizeInBytes(i_inLengths, reduceDims); DeviceMem ws_dev(wsSizeInBytes); - auto argument_ptr = reduce_ptr->MakeArgumentPointer( - i_inLengths, - i_inStrides, - i_outLengths, - i_outStrides, - reduceDims, - alpha, - beta, - in_dev.GetDeviceBuffer(), - out_dev.GetDeviceBuffer(), - out_indices_dev.GetDeviceBuffer(), - ws_dev.GetDeviceBuffer(), - InElementwiseOperation_1{static_cast(reduce_total_length)}, - AccElementwiseOperation_1{static_cast(reduce_total_length)}); + InElementwiseOperation_1 
in_elementwise_op_1(static_cast(reduce_total_length)); + AccElementwiseOperation_1 acc_elementwise_op_1( + static_cast(reduce_total_length)); + + auto argument_ptr = reduce_ptr->MakeArgumentPointer(i_inLengths, + i_inStrides, + i_outLengths, + i_outStrides, + reduceDims, + alpha, + beta, + in_dev.GetDeviceBuffer(), + out_dev.GetDeviceBuffer(), + out_indices_dev.GetDeviceBuffer(), + ws_dev.GetDeviceBuffer(), + in_elementwise_op_1, + acc_elementwise_op_1); if(!reduce_ptr->IsSupportedArgument(argument_ptr.get())) continue; @@ -482,20 +531,25 @@ void profile_reduce_impl_impl(bool do_verification, for(auto& reduce2_ptr : reduce2_ptrs) { - auto argument2_ptr = reduce2_ptr->MakeArgumentPointer( - inLengths2, - inStrides2, - i_outLengths, - i_outStrides, - reduceDims, - alpha, - beta, - ws_dev.GetDeviceBuffer(), - out_dev.GetDeviceBuffer(), - out_indices_dev.GetDeviceBuffer(), - ws_dev.GetDeviceBuffer(), - InElementwiseOperation_2{static_cast(reduce_total_length)}, - AccElementwiseOperation_2{static_cast(reduce_total_length)}); + InElementwiseOperation_2 in_elementwise_op_2( + static_cast(reduce_total_length)); + AccElementwiseOperation_2 acc_elementwise_op_2( + static_cast(reduce_total_length)); + + auto argument2_ptr = + reduce2_ptr->MakeArgumentPointer(inLengths2, + inStrides2, + i_outLengths, + i_outStrides, + reduceDims, + alpha, + beta, + ws_dev.GetDeviceBuffer(), + out_dev.GetDeviceBuffer(), + out_indices_dev.GetDeviceBuffer(), + ws_dev.GetDeviceBuffer(), + in_elementwise_op_2, + acc_elementwise_op_2); if(!reduce2_ptr->IsSupportedArgument(argument2_ptr.get())) continue; diff --git a/profiler/src/profile_reduce.cpp b/profiler/src/profile_reduce.cpp index ef8fd1115b..4ae1eeda8b 100644 --- a/profiler/src/profile_reduce.cpp +++ b/profiler/src/profile_reduce.cpp @@ -34,6 +34,8 @@ static struct option long_options[] = {{"inLengths", required_argument, nullptr, {"scales", required_argument, nullptr, 'S'}, {"half", no_argument, nullptr, '?'}, {"double", no_argument, nullptr, '?'}, + {"int8", no_argument, nullptr, '?'}, + {"bf16", no_argument, nullptr, '?'}, {"dumpout", required_argument, nullptr, 'o'}, {"verify", required_argument, nullptr, 'v'}, {"log", required_argument, nullptr, 'l'}, @@ -119,6 +121,8 @@ class AppArgs public: bool use_half = false; bool use_double = false; + bool use_int8 = false; + bool use_bf16 = false; std::vector inLengths; std::vector outLengths; @@ -169,6 +173,8 @@ class AppArgs << std::endl; std::cout << "--half, use fp16 for the input and output tensor data types" << std::endl; std::cout << "--double, use fp64 for the input and output tensor data types" << std::endl; + std::cout << "--int8, use int8 for the input and output tensor data types" << std::endl; + std::cout << "--bf16, use bfloat16 for the input and output tensor data types" << std::endl; std::cout << "--verify or -v, 1/0 to indicate whether to verify the reduction result by " "comparing with the host-based reduction" << std::endl; @@ -267,6 +273,10 @@ class AppArgs use_half = true; else if(std::string(long_options[option_index].name) == "double") use_double = true; + else if(std::string(long_options[option_index].name) == "int8") + use_int8 = true; + else if(std::string(long_options[option_index].name) == "bf16") + use_bf16 = true; else if(std::string(long_options[option_index].name) == "help") { show_usage(argv[0]); @@ -385,6 +395,71 @@ int profile_reduce(int argc, char* argv[]) args.scales[0], args.scales[1]); } + else if(args.use_int8) + { + if(!args.compType_assigned) + args.compTypeId = appInt8; + + 
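// Editor's note: each --<type> flag below maps a runtime choice onto a
// compile-time instantiation. Condensed sketch of the dispatch (the template
// parameter order <InDataType, AccDataType, OutDataType> is an assumption):
//
//     --half   -> profile_reduce_impl<half_t,  float,   half_t>
//     --double -> profile_reduce_impl<double,  double,  double>
//     --int8   -> profile_reduce_impl<int8_t,  int8_t,  int8_t>   // compTypeId == appInt8
//                 profile_reduce_impl<int8_t,  int32_t, int8_t>   // compTypeId == appInt32
//     --bf16   -> profile_reduce_impl<bhalf_t, float,   bhalf_t>  // bf16 always accumulates in fp32
//
// Only int8 exposes two accumulator choices: int8 partial sums of ADD/AVG over
// long reductions would overflow, hence the int32 option.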
+        if(args.outType_assigned && (args.outTypeId != appInt8 && args.outTypeId != appInt32))
+            args.outTypeId = appInt32;
+
+        if(!args.outType_assigned)
+            args.outTypeId = appInt8;
+
+        if(args.compTypeId == appInt8)
+        {
+            profile_reduce_impl<int8_t, int8_t, int8_t>(args.do_verification,
+                                                        args.init_method,
+                                                        args.do_log,
+                                                        args.do_dumpout,
+                                                        args.nrepeat,
+                                                        args.inLengths,
+                                                        args.reduceDims,
+                                                        args.reduceOp,
+                                                        args.nanOpt,
+                                                        args.indicesOpt,
+                                                        args.scales[0],
+                                                        args.scales[1]);
+        }
+        else if(args.compTypeId == appInt32)
+        {
+            profile_reduce_impl<int8_t, int32_t, int8_t>(args.do_verification,
+                                                         args.init_method,
+                                                         args.do_log,
+                                                         args.do_dumpout,
+                                                         args.nrepeat,
+                                                         args.inLengths,
+                                                         args.reduceDims,
+                                                         args.reduceOp,
+                                                         args.nanOpt,
+                                                         args.indicesOpt,
+                                                         args.scales[0],
+                                                         args.scales[1]);
+        }
+        else
+            throw std::runtime_error("Invalid compType assignment!");
+    }
+    else if(args.use_bf16)
+    {
+        if(args.outType_assigned && (args.outTypeId != appBFloat16 && args.outTypeId != appFloat))
+            args.outTypeId = appFloat;
+
+        if(!args.outType_assigned)
+            args.outTypeId = appBFloat16;
+
+        profile_reduce_impl<ck::bhalf_t, float, ck::bhalf_t>(args.do_verification,
+                                                             args.init_method,
+                                                             args.do_log,
+                                                             args.do_dumpout,
+                                                             args.nrepeat,
+                                                             args.inLengths,
+                                                             args.reduceDims,
+                                                             args.reduceOp,
+                                                             args.nanOpt,
+                                                             args.indicesOpt,
+                                                             args.scales[0],
+                                                             args.scales[1]);
+    }
     else
     {
         if(args.compTypeId == appFloat)

diff --git a/script/cmake-rocm.sh b/script/cmake-rocm.sh
index fcfe6c960b..0e8424f940 100755
--- a/script/cmake-rocm.sh
+++ b/script/cmake-rocm.sh
@@ -3,14 +3,14 @@
 rm -f CMakeCache.txt
 rm -f *.cmake
 rm -rf CMakeFiles
 
-MY_PROJECT_SOURCE=../../..
+MY_PROJECT_SOURCE=../
 MY_PROJECT_INSTALL=../install.dir
 
 cmake \
 -D CMAKE_INSTALL_PREFIX=${MY_PROJECT_INSTALL} \
 -D BUILD_DEV=OFF \
 -D CMAKE_BUILD_TYPE=Release \
--D CMAKE_CXX_FLAGS="-DCK_AMD_GPU_GFX908 --amdgpu-target=gfx908 -O3 -ftemplate-backtrace-limit=0 -mllvm --amdgpu-spill-vgpr-to-agpr=0 -gline-tables-only -save-temps=$PWD" \
+-D CMAKE_CXX_FLAGS="-DCK_AMD_GPU_GFX908 --amdgpu-target=gfx908 -O3 -ftemplate-backtrace-limit=0 -mllvm --amdgpu-spill-vgpr-to-agpr=0 -gline-tables-only" \
 -D CMAKE_CXX_COMPILER=/opt/rocm/bin/hipcc \
 -D CMAKE_PREFIX_PATH=/opt/rocm \
 -D CMAKE_VERBOSE_MAKEFILE:BOOL=ON \

diff --git a/script/profile_reduce_no_index.sh b/script/profile_reduce_no_index.sh
index a038f3f285..580a7ca1ee 100755
--- a/script/profile_reduce_no_index.sh
+++ b/script/profile_reduce_no_index.sh
@@ -3,13 +3,16 @@
 PRECISION=
 ##PRECISION=--half
 ##PRECISION=--double
+##PRECISION=--int8
+##PRECISION=--bf16
 
-if test -n $PRECISION && test "$PRECISION" = "--half"; then
+if [ "$PRECISION" = "--half" ] || [ "$PRECISION" = "--bf16" ]; then
     ACCTYPE="-C 1"
-else
-    ACCTYPE=""
+elif [ "$PRECISION" = "--int8" ]; then
+    ACCTYPE="-C 2"
+else
+    ACCTYPE=""
 fi
 
 driver="./bin/ckProfiler"
 
 VERIFY="-v $1"
@@ -20,10 +23,16 @@
 NREPEAT=$3
 
 #### 0 - ADD, 5 - AVG, 7 - NORM2
 Operations="0 5 7"
 
+#### int8 supports only 0 - ADD and 5 - AVG (no NORM2)
+if [ "$PRECISION" = "--int8" ]; then
+    Operations="0 5"
+fi
+
 ## for generic validation
 for op in $Operations; do
 set -x
 #######              datatype  layout  reduce dims  op  acctype  verify  init  repeats
+$driver reduce $PRECISION -D 64,4,280,82 -R 0,1,2,3 -O $op $ACCTYPE $VERIFY $INIT $NREPEAT
 $driver reduce $PRECISION -D 64,4,280,82 -R 0 -O $op $ACCTYPE $VERIFY $INIT $NREPEAT
 $driver reduce $PRECISION -D 64,4,280,82 -R 1 -O $op $ACCTYPE $VERIFY $INIT $NREPEAT
 $driver reduce $PRECISION -D 64,4,280,82 -R 2 -O $op $ACCTYPE $VERIFY $INIT $NREPEAT
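The ACCTYPE variable above pins the accumulator data type that goes with each precision flag. A sketch of the convention, under the assumption that the `-C` codes follow the profiler's type enumeration (1: fp32, 2: int32); `acc_type_flag_for` is a hypothetical helper, not part of the repository:

```cpp
#include <string>

// Map a ckProfiler precision flag to the matching accumulator flag (sketch).
std::string acc_type_flag_for(const std::string& precision)
{
    if(precision == "--half" || precision == "--bf16")
        return "-C 1"; // fp16/bf16 inputs accumulate in fp32
    if(precision == "--int8")
        return "-C 2"; // int8 ADD/AVG accumulates in int32
    return "";         // fp32/fp64: the accumulator matches the input type
}
```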
diff --git a/script/profile_reduce_with_index.sh b/script/profile_reduce_with_index.sh
index 5e6a61748a..d4671e3981 100755
--- a/script/profile_reduce_with_index.sh
+++ b/script/profile_reduce_with_index.sh
@@ -3,6 +3,8 @@
 PRECISION=
 ##PRECISION=--half
 ##PRECISION=--double
+##PRECISION=--int8
+##PRECISION=--bf16
 
 driver="./bin/ckProfiler"
@@ -18,6 +20,7 @@
 for op in $Operations; do
 for use_idx in 0 1; do
 set -x
 #######              datatype  layout  reduce dims  op  use index  verify  init  repeats
+$driver reduce $PRECISION -D 64,4,280,82 -R 0,1,2,3 -O $op -I $use_idx $VERIFY $INIT $NREPEAT
 $driver reduce $PRECISION -D 64,4,280,82 -R 0 -O $op -I $use_idx $VERIFY $INIT $NREPEAT
 $driver reduce $PRECISION -D 64,4,280,82 -R 1 -O $op -I $use_idx $VERIFY $INIT $NREPEAT
 $driver reduce $PRECISION -D 64,4,280,82 -R 2 -O $op -I $use_idx $VERIFY $INIT $NREPEAT

diff --git a/script/test_reduce_no_index.sh b/script/test_reduce_no_index.sh
new file mode 100755
index 0000000000..95e563c93c
--- /dev/null
+++ b/script/test_reduce_no_index.sh
@@ -0,0 +1,52 @@
+#!/bin/bash
+
+## The following will be used for CI
+
+set -x
+
+## for float16 (data type 0)
+bin/test_reduce_no_index -D 64,4,280,82 -R 0,1,2,3 0 2
+bin/test_reduce_no_index -D 64,4,280,82 -R 0,1,2 0 2
+bin/test_reduce_no_index -D 64,4,280,82 -R 0,1,3 0 2
+bin/test_reduce_no_index -D 64,4,280,82 -R 0,2,3 0 2
+bin/test_reduce_no_index -D 64,4,280,82 -R 1,2,3 0 2
+bin/test_reduce_no_index -D 64,4,280,82 -R 0 0 2
+bin/test_reduce_no_index -D 64,4,280,82 -R 1 0 2
+bin/test_reduce_no_index -D 64,4,280,82 -R 2 0 2
+bin/test_reduce_no_index -D 64,4,280,82 -R 3 0 2
+
+## for float (data type 1)
+bin/test_reduce_no_index -D 64,4,280,82 -R 0,1,2,3 1 2
+bin/test_reduce_no_index -D 64,4,280,82 -R 0,1,2 1 2
+bin/test_reduce_no_index -D 64,4,280,82 -R 0,1,3 1 2
+bin/test_reduce_no_index -D 64,4,280,82 -R 0,2,3 1 2
+bin/test_reduce_no_index -D 64,4,280,82 -R 1,2,3 1 2
+bin/test_reduce_no_index -D 64,4,280,82 -R 0 1 2
+bin/test_reduce_no_index -D 64,4,280,82 -R 1 1 2
+bin/test_reduce_no_index -D 64,4,280,82 -R 2 1 2
+bin/test_reduce_no_index -D 64,4,280,82 -R 3 1 2
+
+## for int8_t
+bin/test_reduce_no_index -D 64,4,280,82 -R 0,1,2,3 3 2
+bin/test_reduce_no_index -D 64,4,280,82 -R 0,1,2 3 2
+bin/test_reduce_no_index -D 64,4,280,82 -R 0,1,3 3 2
+bin/test_reduce_no_index -D 64,4,280,82 -R 0,2,3 3 2
+bin/test_reduce_no_index -D 64,4,280,82 -R 1,2,3 3 2
+bin/test_reduce_no_index -D 64,4,280,82 -R 0 3 2
+bin/test_reduce_no_index -D 64,4,280,82 -R 1 3 2
+bin/test_reduce_no_index -D 64,4,280,82 -R 2 3 2
+bin/test_reduce_no_index -D 64,4,280,82 -R 3 3 2
+
+## for bfloat16
+bin/test_reduce_no_index -D 64,4,280,82 -R 0,1,2,3 5 2
+bin/test_reduce_no_index -D 64,4,280,82 -R 0,1,2 5 2
+bin/test_reduce_no_index -D 64,4,280,82 -R 0,1,3 5 2
+bin/test_reduce_no_index -D 64,4,280,82 -R 0,2,3 5 2
+bin/test_reduce_no_index -D 64,4,280,82 -R 1,2,3 5 2
+bin/test_reduce_no_index -D 64,4,280,82 -R 0 5 2
+bin/test_reduce_no_index -D 64,4,280,82 -R 1 5 2
+bin/test_reduce_no_index -D 64,4,280,82 -R 2 5 2
+bin/test_reduce_no_index -D 64,4,280,82 -R 3 5 2
+
+set +x
+
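The two trailing positional values in each command above are read after getopt's options are consumed. A hedged sketch of that decoding; `parse_positionals` is a hypothetical helper mirroring the tests' own `data_type = std::atoi(argv[optind++]); init_method = std::atoi(argv[optind]);`, with codes taken from the usage text added below (data type 0=fp16, 1=fp32, 3=int8, 5=bf16; init method 2 draws integers from [-5, 5)):

```cpp
#include <cstdlib>

struct PositionalArgs
{
    int data_type;   // e.g. 3 selects the int8_t path
    int init_method; // e.g. 2 selects GeneratorTensor_2{-5, 5}
};

PositionalArgs parse_positionals(char* argv[], int optind)
{
    return PositionalArgs{std::atoi(argv[optind]), std::atoi(argv[optind + 1])};
}
```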
diff --git a/script/test_reduce_with_index.sh b/script/test_reduce_with_index.sh
new file mode 100755
index 0000000000..8e7ed33847
--- /dev/null
+++ b/script/test_reduce_with_index.sh
@@ -0,0 +1,52 @@
+#!/bin/bash
+
+## The following will be used for CI
+
+set -x
+
+## for float16 (data type 0)
+bin/test_reduce_with_index -D 64,4,280,82 -R 0,1,2,3 0 2
+bin/test_reduce_with_index -D 64,4,280,82 -R 0,1,2 0 2
+bin/test_reduce_with_index -D 64,4,280,82 -R 0,1,3 0 2
+bin/test_reduce_with_index -D 64,4,280,82 -R 0,2,3 0 2
+bin/test_reduce_with_index -D 64,4,280,82 -R 1,2,3 0 2
+bin/test_reduce_with_index -D 64,4,280,82 -R 0 0 2
+bin/test_reduce_with_index -D 64,4,280,82 -R 1 0 2
+bin/test_reduce_with_index -D 64,4,280,82 -R 2 0 2
+bin/test_reduce_with_index -D 64,4,280,82 -R 3 0 2
+
+## for float (data type 1)
+bin/test_reduce_with_index -D 64,4,280,82 -R 0,1,2,3 1 2
+bin/test_reduce_with_index -D 64,4,280,82 -R 0,1,2 1 2
+bin/test_reduce_with_index -D 64,4,280,82 -R 0,1,3 1 2
+bin/test_reduce_with_index -D 64,4,280,82 -R 0,2,3 1 2
+bin/test_reduce_with_index -D 64,4,280,82 -R 1,2,3 1 2
+bin/test_reduce_with_index -D 64,4,280,82 -R 0 1 2
+bin/test_reduce_with_index -D 64,4,280,82 -R 1 1 2
+bin/test_reduce_with_index -D 64,4,280,82 -R 2 1 2
+bin/test_reduce_with_index -D 64,4,280,82 -R 3 1 2
+
+## for int8_t
+bin/test_reduce_with_index -D 64,4,280,82 -R 0,1,2,3 3 2
+bin/test_reduce_with_index -D 64,4,280,82 -R 0,1,2 3 2
+bin/test_reduce_with_index -D 64,4,280,82 -R 0,1,3 3 2
+bin/test_reduce_with_index -D 64,4,280,82 -R 0,2,3 3 2
+bin/test_reduce_with_index -D 64,4,280,82 -R 1,2,3 3 2
+bin/test_reduce_with_index -D 64,4,280,82 -R 0 3 2
+bin/test_reduce_with_index -D 64,4,280,82 -R 1 3 2
+bin/test_reduce_with_index -D 64,4,280,82 -R 2 3 2
+bin/test_reduce_with_index -D 64,4,280,82 -R 3 3 2
+
+## for bfloat16
+bin/test_reduce_with_index -D 64,4,280,82 -R 0,1,2,3 5 2
+bin/test_reduce_with_index -D 64,4,280,82 -R 0,1,2 5 2
+bin/test_reduce_with_index -D 64,4,280,82 -R 0,1,3 5 2
+bin/test_reduce_with_index -D 64,4,280,82 -R 0,2,3 5 2
+bin/test_reduce_with_index -D 64,4,280,82 -R 1,2,3 5 2
+bin/test_reduce_with_index -D 64,4,280,82 -R 0 5 2
+bin/test_reduce_with_index -D 64,4,280,82 -R 1 5 2
+bin/test_reduce_with_index -D 64,4,280,82 -R 2 5 2
+bin/test_reduce_with_index -D 64,4,280,82 -R 3 5 2
+
+set +x
+

diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt
index 4901c84813..13289443fa 100644
--- a/test/CMakeLists.txt
+++ b/test/CMakeLists.txt
@@ -40,3 +40,4 @@
 add_subdirectory(conv2d_fwd)
 add_subdirectory(convnd_fwd)
 add_subdirectory(conv2d_bwd_data)
 add_subdirectory(batched_gemm)
+add_subdirectory(reduce)

diff --git a/test/reduce/CMakeLists.txt b/test/reduce/CMakeLists.txt
new file mode 100644
index 0000000000..4e11b049a8
--- /dev/null
+++ b/test/reduce/CMakeLists.txt
@@ -0,0 +1,7 @@
+add_test_executable(test_reduce_no_index reduce_no_index.cpp)
+add_test_executable(test_reduce_with_index reduce_with_index.cpp)
+target_link_libraries(test_reduce_no_index PRIVATE host_tensor)
+target_link_libraries(test_reduce_no_index PRIVATE device_reduce_instance)
+target_link_libraries(test_reduce_with_index PRIVATE host_tensor)
+target_link_libraries(test_reduce_with_index PRIVATE device_reduce_instance)
+

diff --git a/test/reduce/reduce_no_index.cpp b/test/reduce/reduce_no_index.cpp
new file mode 100644
index 0000000000..911bdf0bb1
--- /dev/null
+++ b/test/reduce/reduce_no_index.cpp
@@ -0,0 +1,666 @@
+#include "getopt.h"
+#include "device_reduce_instance.hpp"
+#include "reduction_enums.hpp"
+#include "host_tensor.hpp"
+#include "host_tensor_generator.hpp"
+#include "host_reduction.hpp"
+#include "test_util.hpp"
+#include "reduce_util.hpp"
+
+using namespace ck;
+
+namespace {
+
+template <int Rank, int NumReduceDim>
+static inline std::vector<int> get_invariant_dims(const std::vector<int>& reduceDims)
+{
+    assert(NumReduceDim == reduceDims.size());
+
+    int reduceFlag = 0;
+
+    // flag the bits for the reduceDims
+    for(int i = 0; i < NumReduceDim; i++)
+    {
+        reduceFlag |= 1 << reduceDims[i];
+    };
+
+    std::vector<int> invariantDims;
+
+    // collect invariant dimensions
+    for(int i = 0; i < Rank; i++)
+        if((reduceFlag & (1 << i)) == 0)
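// Editor's note: reduceFlag is a bitmask with bit d set for every reduced
// dimension d; the test above keeps exactly the unmarked dimensions.
// Worked example for Rank = 4, reduceDims = {0, 2, 3}:
//     reduceFlag = 0b1101, and only bit 1 is clear,
//     so invariantDims becomes {1} and the output keeps just that one length.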
+ { + invariantDims.push_back(i); + }; + + return invariantDims; +}; + +// map the data type used by the GPU kernels to the corresponding type used by the host codes +template +struct type_mapping +{ + using OutType = InType; +}; + +template <> +struct type_mapping +{ + using OutType = half_float::half; +}; + +constexpr int Rank = 4; + +constexpr ReduceTensorOp_t ReduceOpId = ReduceTensorOp_t::AVG; +constexpr NanPropagation_t NanOpt = NanPropagation_t::PROPAGATE_NAN; +constexpr bool PropagateNan = false; +constexpr ReduceTensorIndices_t IndicesOpt = ReduceTensorIndices_t::NO_INDICES; +constexpr bool NeedIndices = false; + +template +bool test_reduce_no_index_impl(int init_method, + const std::vector& inLengths, + const std::vector& reduceDims, + float alpha, + float beta) +{ + using namespace ck::tensor_operation::device; + using namespace ck::tensor_operation::device::device_reduce_instance; + using namespace ck::host_reduce; + + constexpr bool out_support_atomic_add = std::is_same::value; + constexpr bool op_support_atomic_add = true; + constexpr bool use_atomic_add = (out_support_atomic_add && op_support_atomic_add); + + Tensor in(inLengths); + + std::vector outLengths; + + const auto invariantDims = get_invariant_dims(reduceDims); + + if(reduceDims.size() == Rank) + outLengths.push_back(1); + else + for(auto dim : invariantDims) + outLengths.push_back(inLengths[dim]); + + Tensor out_ref(outLengths); + Tensor out(outLengths); + + // only used when the OutDataType is bhalf_t + Tensor out_ref_fp32(outLengths); + Tensor out_fp32(outLengths); + + auto inStrides = in.mDesc.GetStrides(); + auto outStrides = out.mDesc.GetStrides(); + + size_t invariant_total_length = out.mDesc.GetElementSize(); + size_t reduce_total_length = in.mDesc.GetElementSize() / invariant_total_length; + + std::size_t num_thread = std::thread::hardware_concurrency(); + + switch(init_method) + { + case 0: break; + case 1: + in.GenerateTensorValue(GeneratorTensor_1{1}, num_thread); + if(beta != 0.0f) + out_ref.GenerateTensorValue(GeneratorTensor_1{1}, num_thread); + break; + case 2: + in.GenerateTensorValue(GeneratorTensor_2{-5, 5}, num_thread); + if(beta != 0.0f) + out_ref.GenerateTensorValue(GeneratorTensor_2{-5, 5}, num_thread); + break; + default: + in.GenerateTensorValue(GeneratorTensor_3{-5.0, 5.0}, num_thread); + if(beta != 0.0f) + out_ref.GenerateTensorValue(GeneratorTensor_3{-5.0, 5.0}, num_thread); + } + + if(beta != 0.0f) + for(size_t i = 0; i < out_ref.mDesc.GetElementSpace(); i++) + out.mData[i] = out_ref.mData[i]; + + // these buffers are usually provided by the user application + DeviceMem in_dev(sizeof(InDataType) * in.mDesc.GetElementSpace()); + DeviceMem out_dev(sizeof(OutDataType) * out.mDesc.GetElementSpace()); + + in_dev.ToDevice(in.mData.data()); + + if(beta != 0.0f) + out_dev.ToDevice(out.mData.data()); + + using InElementwiseOperation_0 = + typename reduce_unary_operator::InElementwiseOperation; + using AccElementwiseOperation_0 = + typename reduce_unary_operator:: + AccElementwiseOperation; + using InElementwiseOperation_1 = + typename reduce_unary_operator:: + InElementwiseOperation; + using AccElementwiseOperation_1 = + typename reduce_unary_operator:: + AccElementwiseOperation; + using InElementwiseOperation_2 = + typename reduce_unary_operator:: + InElementwiseOperation; + using AccElementwiseOperation_2 = + typename reduce_unary_operator:: + AccElementwiseOperation; + + using DeviceReduceInstPtr0 = + DeviceReducePtr; + using DeviceReduceInstPtr1 = + DeviceReducePtr; + using 
DeviceReduceInstPtr2 = + DeviceReducePtr; + + std::vector reduce0_ptrs; + std::vector reduce1_ptrs; + std::vector reduce2_ptrs; + + add_device_reduce_instance_threadwise(reduce0_ptrs); + + add_device_reduce_instance_blockwise(reduce0_ptrs); + + if constexpr(use_atomic_add) + { + add_device_reduce_instance_multiblock_atomic_add(reduce0_ptrs); + } + else + { + add_device_reduce_instance_multiblock_partial_reduce(reduce1_ptrs); + }; + + // used for secondary reduction + if constexpr(!use_atomic_add) + { + add_device_reduce_instance_blockwise_second_call(reduce2_ptrs); + }; + + if(reduce0_ptrs.empty() && reduce1_ptrs.empty()) + { + throw std::runtime_error("Wrong! No device REDUCE instance found"); + }; + + bool result = true; + + using HostInDataType = typename type_mapping::OutType; + using HostOutDataType = typename type_mapping::OutType; + using HostAccDataType = typename type_mapping::OutType; + + ReductionHost + hostReduce(in.mDesc, out_ref.mDesc, invariantDims, reduceDims); + + hostReduce.Run(alpha, + reinterpret_cast(in.mData.data()), + beta, + reinterpret_cast(out_ref.mData.data()), + nullptr); + + const auto i_inLengths = to_int_vector(inLengths); + const auto i_inStrides = to_int_vector(inStrides); + const auto i_outLengths = to_int_vector(outLengths); + const auto i_outStrides = to_int_vector(outStrides); + + for(auto& reduce_ptr : reduce0_ptrs) + { + auto wsSizeInBytes = reduce_ptr->GetWorkspaceSizeInBytes(i_inLengths, reduceDims); + + DeviceMem ws_dev(wsSizeInBytes); + + InElementwiseOperation_0 in_elementwise_op_0(static_cast(reduce_total_length)); + AccElementwiseOperation_0 acc_elementwise_op_0(static_cast(reduce_total_length)); + + auto argument_ptr = reduce_ptr->MakeArgumentPointer(i_inLengths, + i_inStrides, + i_outLengths, + i_outStrides, + reduceDims, + alpha, + beta, + in_dev.GetDeviceBuffer(), + out_dev.GetDeviceBuffer(), + nullptr, + ws_dev.GetDeviceBuffer(), + in_elementwise_op_0, + acc_elementwise_op_0); + + if(!reduce_ptr->IsSupportedArgument(argument_ptr.get())) + continue; + + auto invoker_ptr = reduce_ptr->MakeInvokerPointer(); + + (void)invoker_ptr->Run(argument_ptr.get()); + + out_dev.FromDevice(out.mData.data()); + + bool single_result = true; + + if constexpr(std::is_same::value || + std::is_same::value) + { + reduce_util::to_f32_vector(out, out_fp32); + reduce_util::to_f32_vector(out_ref, out_ref_fp32); + single_result = test_util::check_err( + out_fp32.mData, out_ref_fp32.mData, "Error: incorrect data result!"); + } + else + { + single_result = + test_util::check_err(out.mData, out_ref.mData, "Error: incorrect data result!"); + }; + + if(!single_result) + { + std::cout << "Fail Info: " << reduce_ptr->GetTypeString() << std::endl; + result = false; + } + }; + + for(auto& reduce_ptr : reduce1_ptrs) + { + auto wsSizeInBytes = reduce_ptr->GetWorkspaceSizeInBytes(i_inLengths, reduceDims); + + DeviceMem ws_dev(wsSizeInBytes); + + InElementwiseOperation_1 in_elementwise_op_1(static_cast(reduce_total_length)); + AccElementwiseOperation_1 acc_elementwise_op_1(static_cast(reduce_total_length)); + + auto argument_ptr = reduce_ptr->MakeArgumentPointer(i_inLengths, + i_inStrides, + i_outLengths, + i_outStrides, + reduceDims, + alpha, + beta, + in_dev.GetDeviceBuffer(), + out_dev.GetDeviceBuffer(), + nullptr, + ws_dev.GetDeviceBuffer(), + in_elementwise_op_1, + acc_elementwise_op_1); + + if(!reduce_ptr->IsSupportedArgument(argument_ptr.get())) + continue; + + auto invoker_ptr = reduce_ptr->MakeInvokerPointer(); + + (void)invoker_ptr->Run(argument_ptr.get()); + + 
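// Editor's note: instances in reduce1_ptrs only produce a partial reduction in
// the workspace buffer, so a second, blockwise pass (reduce2_ptrs) finishes the
// job. The workspace is then treated as a 2-D tensor whose lengths come from
// GetWorkspace2dLengths(); sketch of the flow:
//
//     pass 1: in[...]            -> ws_dev   (multiblock partial reduce)
//     pass 2: ws_dev[len0, len1] -> out_dev  (blockwise, strides {len1, 1})
//
// This matches the row-major inStrides2{inLengths2[1], 1} setup built just below.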
+        std::vector<int> inLengths2 = reduce_ptr->GetWorkspace2dLengths(argument_ptr.get());
+        std::vector<int> inStrides2{inLengths2[1], 1};
+
+        for(auto& reduce2_ptr : reduce2_ptrs)
+        {
+            InElementwiseOperation_2 in_elementwise_op_2(static_cast<int32_t>(reduce_total_length));
+            AccElementwiseOperation_2 acc_elementwise_op_2(
+                static_cast<int32_t>(reduce_total_length));
+
+            auto argument2_ptr = reduce2_ptr->MakeArgumentPointer(inLengths2,
+                                                                  inStrides2,
+                                                                  i_outLengths,
+                                                                  i_outStrides,
+                                                                  reduceDims,
+                                                                  alpha,
+                                                                  beta,
+                                                                  ws_dev.GetDeviceBuffer(),
+                                                                  out_dev.GetDeviceBuffer(),
+                                                                  nullptr,
+                                                                  ws_dev.GetDeviceBuffer(),
+                                                                  in_elementwise_op_2,
+                                                                  acc_elementwise_op_2);
+
+            if(!reduce2_ptr->IsSupportedArgument(argument2_ptr.get()))
+                continue;
+
+            std::string reduce2_name = reduce2_ptr->GetTypeString();
+
+            auto invoker2_ptr = reduce2_ptr->MakeInvokerPointer();
+
+            (void)invoker2_ptr->Run(argument2_ptr.get());
+
+            out_dev.FromDevice(out.mData.data());
+
+            bool single_result = true;
+
+            if constexpr(std::is_same<OutDataType, bhalf_t>::value ||
+                         std::is_same<OutDataType, half_t>::value)
+            {
+                reduce_util::to_f32_vector(out, out_fp32);
+                reduce_util::to_f32_vector(out_ref, out_ref_fp32);
+                single_result = test_util::check_err(
+                    out_fp32.mData, out_ref_fp32.mData, "Error: incorrect data result!");
+            }
+            else
+            {
+                single_result =
+                    test_util::check_err(out.mData, out_ref.mData, "Error: incorrect data result!");
+            };
+
+            if(!single_result)
+            {
+                std::cout << "Fail Info: " << reduce_ptr->GetTypeString() << " => "
+                          << reduce2_ptr->GetTypeString() << std::endl;
+                result = false;
+            }
+        };
+    };
+
+    return (result);
+};
+
+} // anonymous namespace
+
+static struct option long_options[] = {{"inLengths", required_argument, nullptr, 'D'},
+                                       {"reduceDimensions", required_argument, nullptr, 'R'},
+                                       {"scales", required_argument, nullptr, 'S'},
+                                       {"help", no_argument, nullptr, '?'},
+                                       {nullptr, 0, nullptr, 0}};
+
+class SimpleAppArgs
+{
+    template <typename T>
+    static T getSingleValueFromString(const std::string& valueStr)
+    {
+        std::istringstream iss(valueStr);
+
+        T ret;
+
+        iss >> ret;
+
+        return (ret);
+    };
+
+    template <typename T>
+    static std::vector<T> getTypeValuesFromString(const char* cstr_values)
+    {
+        std::string valuesStr(cstr_values);
+
+        std::vector<T> values;
+        std::size_t pos = 0;
+        std::size_t new_pos;
+
+        new_pos = valuesStr.find(',', pos);
+        while(new_pos != std::string::npos)
+        {
+            const std::string sliceStr = valuesStr.substr(pos, new_pos - pos);
+
+            T val = getSingleValueFromString<T>(sliceStr);
+
+            values.push_back(val);
+
+            pos     = new_pos + 1;
+            new_pos = valuesStr.find(',', pos);
+        };
+
+        std::string sliceStr = valuesStr.substr(pos);
+        T val                = getSingleValueFromString<T>(sliceStr);
+
+        values.push_back(val);
+
+        return (values);
+    };
+
+    private:
+    int option_index = 0;
+
+    public:
+    std::vector<size_t> inLengths;
+    std::vector<int> reduceDims;
+    std::vector<float> scales;
+
+    int data_type;
+    int init_method = 1;
+
+    public:
+    void show_usage(const char* cmd)
+    {
+        std::cout << "Usage of " << cmd << std::endl;
+        std::cout << "--inLengths or -D, comma separated list of input tensor dimension lengths "
+                     "(only 4-d tensor supported)"
+                  << std::endl;
+        std::cout << "--reduceDimensions or -R, comma separated list of dimension indexes to reduce "
+                     "(only 1 or 3 or 4 dimensions supported)"
+                  << std::endl;
+        std::cout << "--scales or -S, comma separated two float values for alpha and beta"
+                  << std::endl;
+        std::cout << "Arg1 -- data type (0: fp16, 1: fp32, 3: int8, 5: bf16, 6: fp64)" << std::endl;
+        std::cout << "Arg2 -- init method (0=no init, 1=single integer value, 2=scope integer "
+                     "value, 3=decimal value)"
+                  << std::endl;
+    };
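// Editor's note: putting the usage text together, a typical CI invocation from
// script/test_reduce_no_index.sh decodes as:
//
//     bin/test_reduce_no_index -D 64,4,280,82 -R 0,1,3 3 2
//                               |              |       | |
//                               |              |       | +-- init: integers in [-5, 5)
//                               |              |       +---- data type 3: int8_t
//                               |              +------------ reduce dims {0, 1, 3}
//                               +--------------------------- 4-d input lengths
//
// alpha/beta default to 1.0/0.0 unless -S overrides them.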
+    int processArgs(int argc, char* argv[])
+    {
+        int ch;
+
+        while(1)
+        {
+            ch = getopt_long(argc, argv, "D:R:S:", long_options, &option_index);
+            if(ch == -1)
+                break;
+            switch(ch)
+            {
+            case 'D':
+                if(!optarg)
+                    throw std::runtime_error("Invalid option format!");
+
+                inLengths = getTypeValuesFromString<size_t>(optarg);
+                break;
+            case 'R':
+                if(!optarg)
+                    throw std::runtime_error("Invalid option format!");
+
+                reduceDims = getTypeValuesFromString<int>(optarg);
+                break;
+            case 'S':
+                if(!optarg)
+                    throw std::runtime_error("Invalid option format!");
+
+                scales = getTypeValuesFromString<float>(optarg);
+                break;
+            case '?':
+                if(std::string(long_options[option_index].name) == "help")
+                {
+                    show_usage(argv[0]);
+                    return (-1);
+                };
+                break;
+            default: show_usage(argv[0]); return (-1);
+            };
+        };
+
+        if(optind + 2 > argc)
+            throw std::runtime_error("Invalid cmd-line arguments, more arguments are needed!");
+
+        data_type   = std::atoi(argv[optind++]);
+        init_method = std::atoi(argv[optind]);
+
+        if(scales.empty())
+        {
+            scales.push_back(1.0f);
+            scales.push_back(0.0f);
+        };
+
+        if(inLengths.size() != 4 ||
+           (reduceDims.size() != 1 && reduceDims.size() != 3 && reduceDims.size() != 4))
+            return (-1);
+
+        if(data_type != 0 && data_type != 1 && data_type != 3 && data_type != 5)
+            return (-1);
+
+        return (0);
+    };
+};
+
+bool test_reduce_no_index(int data_type,
+                          int init_method,
+                          std::vector<int> reduceDims,
+                          std::vector<size_t> inLengths,
+                          float alpha,
+                          float beta)
+{
+    bool result = true;
+
+    if(data_type == 0)
+    {
+        switch(reduceDims.size())
+        {
+        case 1:
+            result = test_reduce_no_index_impl<half_t, float, half_t, Rank, 1>(
+                init_method, inLengths, reduceDims, alpha, beta);
+            break;
+        case 3:
+            result = test_reduce_no_index_impl<half_t, float, half_t, Rank, 3>(
+                init_method, inLengths, reduceDims, alpha, beta);
+            break;
+        case 4:
+            result = test_reduce_no_index_impl<half_t, float, half_t, Rank, 4>(
+                init_method, inLengths, reduceDims, alpha, beta);
+            break;
+        };
+    }
+    else if(data_type == 1)
+    {
+        switch(reduceDims.size())
+        {
+        case 1:
+            result = test_reduce_no_index_impl<float, float, float, Rank, 1>(
+                init_method, inLengths, reduceDims, alpha, beta);
+            break;
+        case 3:
+            result = test_reduce_no_index_impl<float, float, float, Rank, 3>(
+                init_method, inLengths, reduceDims, alpha, beta);
+            break;
+        case 4:
+            result = test_reduce_no_index_impl<float, float, float, Rank, 4>(
+                init_method, inLengths, reduceDims, alpha, beta);
+            break;
+        };
+    }
+    else if(data_type == 3)
+    {
+        switch(reduceDims.size())
+        {
+        case 1:
+            result = test_reduce_no_index_impl<int8_t, int32_t, int8_t, Rank, 1>(
+                init_method, inLengths, reduceDims, alpha, beta);
+            break;
+        case 3:
+            result = test_reduce_no_index_impl<int8_t, int32_t, int8_t, Rank, 3>(
+                init_method, inLengths, reduceDims, alpha, beta);
+            break;
+        case 4:
+            result = test_reduce_no_index_impl<int8_t, int32_t, int8_t, Rank, 4>(
+                init_method, inLengths, reduceDims, alpha, beta);
+            break;
+        };
+    }
+    else if(data_type == 5)
+    {
+        switch(reduceDims.size())
+        {
+        case 1:
+            result = test_reduce_no_index_impl<bhalf_t, float, bhalf_t, Rank, 1>(
+                init_method, inLengths, reduceDims, alpha, beta);
+            break;
+        case 3:
+            result = test_reduce_no_index_impl<bhalf_t, float, bhalf_t, Rank, 3>(
+                init_method, inLengths, reduceDims, alpha, beta);
+            break;
+        case 4:
+            result = test_reduce_no_index_impl<bhalf_t, float, bhalf_t, Rank, 4>(
+                init_method, inLengths, reduceDims, alpha, beta);
+            break;
+        };
+    }
+
+    return (result);
+};
+
+int main(int argc, char* argv[])
+{
+    SimpleAppArgs args;
+
+    bool result = true;
+
+    if(argc == 1)
+    {
+        int data_type   = 1;
+        int init_method = 2;
+        std::vector<size_t> inLengths{64, 4, 280, 80};
+        std::vector<std::vector<int>> v_reduceDims{
+            {0, 1, 2, 3}, {0, 1, 2}, {1, 2, 3}, {0, 1, 3}, {0, 2, 3}, {0}, {1}, {2}, {3}};
+
+        for(auto& reduceDims : v_reduceDims)
+            result = result && test_reduce_no_index(
+                                   data_type, init_method, reduceDims, inLengths, 1.0f, 0.0f);
+    }
else + { + if(args.processArgs(argc, argv) < 0) + { + throw std::runtime_error( + "Invalid input arguments, test_reduce_no_index could not be executed!"); + }; + + result = test_reduce_no_index(args.data_type, + args.init_method, + args.reduceDims, + args.inLengths, + args.scales[0], + args.scales[1]); + } + + std::cout << "test_reduce_no_index ..... " << (result ? "SUCCESS" : "FAILURE") << std::endl; + + return (result ? 0 : -1); +} diff --git a/test/reduce/reduce_util.hpp b/test/reduce/reduce_util.hpp new file mode 100644 index 0000000000..e9a7b4896e --- /dev/null +++ b/test/reduce/reduce_util.hpp @@ -0,0 +1,19 @@ +#ifndef REDUCE_UTILS_HPP +#define REDUCE_UTILS_HPP + +#include "data_type.hpp" + +namespace ck { +namespace reduce_util { + +template +void to_f32_vector(const Tensor& src, Tensor& dst) +{ + for(int i = 0; i < src.mData.size(); ++i) + dst.mData[i] = type_convert(src.mData[i]); +} + +} // namespace reduce_util + +} // namespace ck +#endif diff --git a/test/reduce/reduce_with_index.cpp b/test/reduce/reduce_with_index.cpp new file mode 100644 index 0000000000..4c51fad550 --- /dev/null +++ b/test/reduce/reduce_with_index.cpp @@ -0,0 +1,669 @@ +#include "getopt.h" +#include "device_reduce_instance.hpp" +#include "reduction_enums.hpp" +#include "host_tensor.hpp" +#include "host_tensor_generator.hpp" +#include "host_reduction.hpp" +#include "test_util.hpp" +#include "reduce_util.hpp" + +using namespace ck; + +namespace { + +template +static inline std::vector get_invariant_dims(const std::vector& reduceDims) +{ + assert(NumReduceDim == reduceDims.size()); + + int reduceFlag = 0; + + // flag the bits for the reduceDims + for(int i = 0; i < NumReduceDim; i++) + { + reduceFlag |= 1 << reduceDims[i]; + }; + + std::vector invariantDims; + + // collect invariant dimensions + for(int i = 0; i < Rank; i++) + if((reduceFlag & (1 << i)) == 0) + { + invariantDims.push_back(i); + }; + + return invariantDims; +}; + +// map the data type used by the GPU kernels to the corresponding type used by the host codes +template +struct type_mapping +{ + using OutType = InType; +}; + +template <> +struct type_mapping +{ + using OutType = half_float::half; +}; + +constexpr int Rank = 4; + +constexpr ReduceTensorOp_t ReduceOpId = ReduceTensorOp_t::AMAX; +constexpr NanPropagation_t NanOpt = NanPropagation_t::PROPAGATE_NAN; +constexpr bool PropagateNan = false; +constexpr ReduceTensorIndices_t IndicesOpt = ReduceTensorIndices_t::FLATTENED_INDICES; +constexpr bool NeedIndices = true; + +template +bool test_reduce_with_index_impl(int init_method, + const std::vector& inLengths, + const std::vector& reduceDims, + float alpha, + float beta) +{ + using namespace ck::tensor_operation::device; + using namespace ck::tensor_operation::device::device_reduce_instance; + using namespace ck::host_reduce; + + Tensor in(inLengths); + + std::vector outLengths; + + const auto invariantDims = get_invariant_dims(reduceDims); + + if(reduceDims.size() == Rank) + outLengths.push_back(1); + else + for(auto dim : invariantDims) + outLengths.push_back(inLengths[dim]); + + Tensor out_ref(outLengths); + Tensor out(outLengths); + Tensor out_indices_ref(outLengths); + Tensor out_indices(outLengths); + + // only used when the OutDataType is bhalf_t + Tensor out_ref_fp32(outLengths); + Tensor out_fp32(outLengths); + + auto inStrides = in.mDesc.GetStrides(); + auto outStrides = out.mDesc.GetStrides(); + + size_t invariant_total_length = out.mDesc.GetElementSize(); + size_t reduce_total_length = in.mDesc.GetElementSize() / 
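// Editor's note: the two lengths below factor the element count of the input:
// invariant_total_length is the product of the kept dimensions (the output
// size) and reduce_total_length the product of the reduced ones. For
// -D 64,4,280,82 -R 1: invariant = 64 * 280 * 82 = 1,469,440 and reduce = 4.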
invariant_total_length; + + std::size_t num_thread = std::thread::hardware_concurrency(); + + switch(init_method) + { + case 0: break; + case 1: + in.GenerateTensorValue(GeneratorTensor_1{1}, num_thread); + if(beta != 0.0f) + out_ref.GenerateTensorValue(GeneratorTensor_1{1}, num_thread); + break; + case 2: + in.GenerateTensorValue(GeneratorTensor_2{-5, 5}, num_thread); + if(beta != 0.0f) + out_ref.GenerateTensorValue(GeneratorTensor_2{-5, 5}, num_thread); + break; + default: + in.GenerateTensorValue(GeneratorTensor_3{-5.0, 5.0}, num_thread); + if(beta != 0.0f) + out_ref.GenerateTensorValue(GeneratorTensor_3{-5.0, 5.0}, num_thread); + } + + if(beta != 0.0f) + for(size_t i = 0; i < out_ref.mDesc.GetElementSpace(); i++) + out.mData[i] = out_ref.mData[i]; + + // these buffers are usually provided by the user application + DeviceMem in_dev(sizeof(InDataType) * in.mDesc.GetElementSpace()); + DeviceMem out_dev(sizeof(OutDataType) * out.mDesc.GetElementSpace()); + + in_dev.ToDevice(in.mData.data()); + + if(beta != 0.0f) + out_dev.ToDevice(out.mData.data()); + + size_t indicesSizeInBytes = NeedIndices ? out.mDesc.GetElementSize() * sizeof(int) : 0; + + DeviceMem out_indices_dev(indicesSizeInBytes); + + using InElementwiseOperation_0 = + typename reduce_unary_operator::InElementwiseOperation; + using AccElementwiseOperation_0 = + typename reduce_unary_operator:: + AccElementwiseOperation; + using InElementwiseOperation_1 = + typename reduce_unary_operator:: + InElementwiseOperation; + using AccElementwiseOperation_1 = + typename reduce_unary_operator:: + AccElementwiseOperation; + using InElementwiseOperation_2 = + typename reduce_unary_operator:: + InElementwiseOperation; + using AccElementwiseOperation_2 = + typename reduce_unary_operator:: + AccElementwiseOperation; + + using DeviceReduceInstPtr0 = + DeviceReducePtr; + using DeviceReduceInstPtr1 = + DeviceReducePtr; + using DeviceReduceInstPtr2 = + DeviceReducePtr; + + std::vector reduce0_ptrs; + std::vector reduce1_ptrs; + std::vector reduce2_ptrs; + + add_device_reduce_instance_threadwise(reduce0_ptrs); + + add_device_reduce_instance_blockwise(reduce0_ptrs); + + add_device_reduce_instance_multiblock_partial_reduce(reduce1_ptrs); + + add_device_reduce_instance_blockwise_second_call(reduce2_ptrs); + + if(reduce0_ptrs.empty() && reduce1_ptrs.empty()) + { + throw std::runtime_error("Wrong! 
No device REDUCE instance found"); + }; + + bool result = true; + + using HostInDataType = typename type_mapping::OutType; + using HostOutDataType = typename type_mapping::OutType; + using HostAccDataType = typename type_mapping::OutType; + + ReductionHost + hostReduce(in.mDesc, out_ref.mDesc, invariantDims, reduceDims); + + hostReduce.Run(alpha, + reinterpret_cast(in.mData.data()), + beta, + reinterpret_cast(out_ref.mData.data()), + out_indices_ref.mData.data()); + + const auto i_inLengths = to_int_vector(inLengths); + const auto i_inStrides = to_int_vector(inStrides); + const auto i_outLengths = to_int_vector(outLengths); + const auto i_outStrides = to_int_vector(outStrides); + + for(auto& reduce_ptr : reduce0_ptrs) + { + auto wsSizeInBytes = reduce_ptr->GetWorkspaceSizeInBytes(i_inLengths, reduceDims); + + DeviceMem ws_dev(wsSizeInBytes); + + InElementwiseOperation_0 in_elementwise_op_0(static_cast(reduce_total_length)); + AccElementwiseOperation_0 acc_elementwise_op_0(static_cast(reduce_total_length)); + + auto argument_ptr = reduce_ptr->MakeArgumentPointer(i_inLengths, + i_inStrides, + i_outLengths, + i_outStrides, + reduceDims, + alpha, + beta, + in_dev.GetDeviceBuffer(), + out_dev.GetDeviceBuffer(), + out_indices_dev.GetDeviceBuffer(), + ws_dev.GetDeviceBuffer(), + in_elementwise_op_0, + acc_elementwise_op_0); + + if(!reduce_ptr->IsSupportedArgument(argument_ptr.get())) + continue; + + auto invoker_ptr = reduce_ptr->MakeInvokerPointer(); + + (void)invoker_ptr->Run(argument_ptr.get()); + + out_dev.FromDevice(out.mData.data()); + + bool single_result = true; + + if constexpr(std::is_same::value || + std::is_same::value) + { + reduce_util::to_f32_vector(out, out_fp32); + reduce_util::to_f32_vector(out_ref, out_ref_fp32); + single_result = test_util::check_err( + out_fp32.mData, out_ref_fp32.mData, "Error: incorrect data result!"); + } + else + { + single_result = + test_util::check_err(out.mData, out_ref.mData, "Error: incorrect data result!"); + }; + + if(NeedIndices) + { + out_indices_dev.FromDevice(out_indices.mData.data()); + single_result = single_result && test_util::check_err(out_indices_ref.mData, + out_indices.mData, + "Error: incorrect index result!"); + }; + + if(!single_result) + { + std::cout << "Fail Info: " << reduce_ptr->GetTypeString() << std::endl; + result = false; + } + }; + + for(auto& reduce_ptr : reduce1_ptrs) + { + auto wsSizeInBytes = reduce_ptr->GetWorkspaceSizeInBytes(i_inLengths, reduceDims); + + DeviceMem ws_dev(wsSizeInBytes); + + InElementwiseOperation_1 in_elementwise_op_1(static_cast(reduce_total_length)); + AccElementwiseOperation_1 acc_elementwise_op_1(static_cast(reduce_total_length)); + + auto argument_ptr = reduce_ptr->MakeArgumentPointer(i_inLengths, + i_inStrides, + i_outLengths, + i_outStrides, + reduceDims, + alpha, + beta, + in_dev.GetDeviceBuffer(), + out_dev.GetDeviceBuffer(), + out_indices_dev.GetDeviceBuffer(), + ws_dev.GetDeviceBuffer(), + in_elementwise_op_1, + acc_elementwise_op_1); + + if(!reduce_ptr->IsSupportedArgument(argument_ptr.get())) + continue; + + std::string reduce_name = reduce_ptr->GetTypeString(); + + auto invoker_ptr = reduce_ptr->MakeInvokerPointer(); + + (void)invoker_ptr->Run(argument_ptr.get()); + + std::vector inLengths2 = reduce_ptr->GetWorkspace2dLengths(argument_ptr.get()); + std::vector inStrides2{inLengths2[1], 1}; + + for(auto& reduce2_ptr : reduce2_ptrs) + { + InElementwiseOperation_2 in_elementwise_op_2(static_cast(reduce_total_length)); + AccElementwiseOperation_2 acc_elementwise_op_2( + 
+                static_cast<int32_t>(reduce_total_length));
+
+            auto argument2_ptr = reduce2_ptr->MakeArgumentPointer(inLengths2,
+                                                                  inStrides2,
+                                                                  i_outLengths,
+                                                                  i_outStrides,
+                                                                  reduceDims,
+                                                                  alpha,
+                                                                  beta,
+                                                                  ws_dev.GetDeviceBuffer(),
+                                                                  out_dev.GetDeviceBuffer(),
+                                                                  out_indices_dev.GetDeviceBuffer(),
+                                                                  ws_dev.GetDeviceBuffer(),
+                                                                  in_elementwise_op_2,
+                                                                  acc_elementwise_op_2);
+
+            if(!reduce2_ptr->IsSupportedArgument(argument2_ptr.get()))
+                continue;
+
+            std::string reduce2_name = reduce2_ptr->GetTypeString();
+
+            auto invoker2_ptr = reduce2_ptr->MakeInvokerPointer();
+
+            (void)invoker2_ptr->Run(argument2_ptr.get());
+
+            out_dev.FromDevice(out.mData.data());
+
+            bool single_result = true;
+
+            if constexpr(std::is_same<OutDataType, bhalf_t>::value ||
+                         std::is_same<OutDataType, half_t>::value)
+            {
+                reduce_util::to_f32_vector(out, out_fp32);
+                reduce_util::to_f32_vector(out_ref, out_ref_fp32);
+                single_result = test_util::check_err(
+                    out_fp32.mData, out_ref_fp32.mData, "Error: incorrect data result!");
+            }
+            else
+            {
+                single_result =
+                    test_util::check_err(out.mData, out_ref.mData, "Error: incorrect data result!");
+            };
+
+            if(NeedIndices)
+            {
+                out_indices_dev.FromDevice(out_indices.mData.data());
+                single_result =
+                    single_result && test_util::check_err(out_indices_ref.mData,
+                                                          out_indices.mData,
+                                                          "Error: incorrect index result!");
+            };
+
+            if(!single_result)
+            {
+                std::cout << "Fail Info: " << reduce_ptr->GetTypeString() << " => "
+                          << reduce2_ptr->GetTypeString() << std::endl;
+                result = false;
+            }
+        };
+    };
+
+    return (result);
+};
+
+} // anonymous namespace
+
+static struct option long_options[] = {{"inLengths", required_argument, nullptr, 'D'},
+                                       {"reduceDimensions", required_argument, nullptr, 'R'},
+                                       {"scales", required_argument, nullptr, 'S'},
+                                       {"help", no_argument, nullptr, '?'},
+                                       {nullptr, 0, nullptr, 0}};
+
+class SimpleAppArgs
+{
+    template <typename T>
+    static T getSingleValueFromString(const std::string& valueStr)
+    {
+        std::istringstream iss(valueStr);
+
+        T ret;
+
+        iss >> ret;
+
+        return (ret);
+    };
+
+    template <typename T>
+    static std::vector<T> getTypeValuesFromString(const char* cstr_values)
+    {
+        std::string valuesStr(cstr_values);
+
+        std::vector<T> values;
+        std::size_t pos = 0;
+        std::size_t new_pos;
+
+        new_pos = valuesStr.find(',', pos);
+        while(new_pos != std::string::npos)
+        {
+            const std::string sliceStr = valuesStr.substr(pos, new_pos - pos);
+
+            T val = getSingleValueFromString<T>(sliceStr);
+
+            values.push_back(val);
+
+            pos     = new_pos + 1;
+            new_pos = valuesStr.find(',', pos);
+        };
+
+        std::string sliceStr = valuesStr.substr(pos);
+        T val                = getSingleValueFromString<T>(sliceStr);
+
+        values.push_back(val);
+
+        return (values);
+    };
+
+    private:
+    int option_index = 0;
+
+    public:
+    std::vector<size_t> inLengths;
+    std::vector<int> reduceDims;
+    std::vector<float> scales;
+
+    int data_type;
+    int init_method = 1;
+
+    public:
+    void show_usage(const char* cmd)
+    {
+        std::cout << "Usage of " << cmd << std::endl;
+        std::cout << "--inLengths or -D, comma separated list of input tensor dimension lengths "
+                     "(only 4-d tensor supported)"
+                  << std::endl;
+        std::cout << "--reduceDimensions or -R, comma separated list of dimension indexes to reduce "
+                     "(only 1 or 3 or 4 dimensions supported)"
+                  << std::endl;
+        std::cout << "--scales or -S, comma separated two float values for alpha and beta"
+                  << std::endl;
+        std::cout << "Arg1 -- data type (0: fp16, 1: fp32, 3: int8, 5: bf16, 6: fp64)" << std::endl;
+        std::cout << "Arg2 -- init method (0=no init, 1=single integer value, 2=scope integer "
+                     "value, 3=decimal value)"
+                  << std::endl;
+    };
+
+    int processArgs(int argc, char* argv[])
+    {
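// Editor's note: the option handling below matches reduce_no_index.cpp:
// getopt_long consumes -D/-R/-S (all required_argument), and the two trailing
// positional values are then read from argv[optind]. A condensed equivalent:
//
//     int ch;
//     while((ch = getopt_long(argc, argv, "D:R:S:", long_options, &option_index)) != -1)
//     {
//         switch(ch) { case 'D': /* lengths */ break; /* 'R', 'S', '?' ... */ }
//     }
//     data_type   = std::atoi(argv[optind++]);
//     init_method = std::atoi(argv[optind]);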
+        int ch;
+
+        while(1)
+        {
+            ch = getopt_long(argc, argv, "D:R:S:", long_options, &option_index);
+            if(ch == -1)
+                break;
+            switch(ch)
+            {
+            case 'D':
+                if(!optarg)
+                    throw std::runtime_error("Invalid option format!");
+
+                inLengths = getTypeValuesFromString<size_t>(optarg);
+                break;
+            case 'R':
+                if(!optarg)
+                    throw std::runtime_error("Invalid option format!");
+
+                reduceDims = getTypeValuesFromString<int>(optarg);
+                break;
+            case 'S':
+                if(!optarg)
+                    throw std::runtime_error("Invalid option format!");
+
+                scales = getTypeValuesFromString<float>(optarg);
+                break;
+            case '?':
+                if(std::string(long_options[option_index].name) == "help")
+                {
+                    show_usage(argv[0]);
+                    return (-1);
+                };
+                break;
+            default: show_usage(argv[0]); return (-1);
+            };
+        };
+
+        if(optind + 2 > argc)
+            throw std::runtime_error("Invalid cmd-line arguments, more arguments are needed!");
+
+        data_type   = std::atoi(argv[optind++]);
+        init_method = std::atoi(argv[optind]);
+
+        if(scales.empty())
+        {
+            scales.push_back(1.0f);
+            scales.push_back(0.0f);
+        };
+
+        if(inLengths.size() != 4 ||
+           (reduceDims.size() != 1 && reduceDims.size() != 3 && reduceDims.size() != 4))
+            return (-1);
+
+        if(data_type != 0 && data_type != 1 && data_type != 3 && data_type != 5)
+            return (-1);
+
+        return (0);
+    };
+};
+
+bool test_reduce_with_index(int data_type,
+                            int init_method,
+                            std::vector<int> reduceDims,
+                            std::vector<size_t> inLengths,
+                            float alpha,
+                            float beta)
+{
+    bool result = true;
+
+    if(data_type == 0)
+    {
+        switch(reduceDims.size())
+        {
+        case 1:
+            result = test_reduce_with_index_impl<half_t, half_t, half_t, Rank, 1>(
+                init_method, inLengths, reduceDims, alpha, beta);
+            break;
+        case 3:
+            result = test_reduce_with_index_impl<half_t, half_t, half_t, Rank, 3>(
+                init_method, inLengths, reduceDims, alpha, beta);
+            break;
+        case 4:
+            result = test_reduce_with_index_impl<half_t, half_t, half_t, Rank, 4>(
+                init_method, inLengths, reduceDims, alpha, beta);
+            break;
+        };
+    }
+    else if(data_type == 1)
+    {
+        switch(reduceDims.size())
+        {
+        case 1:
+            result = test_reduce_with_index_impl<float, float, float, Rank, 1>(
+                init_method, inLengths, reduceDims, alpha, beta);
+            break;
+        case 3:
+            result = test_reduce_with_index_impl<float, float, float, Rank, 3>(
+                init_method, inLengths, reduceDims, alpha, beta);
+            break;
+        case 4:
+            result = test_reduce_with_index_impl<float, float, float, Rank, 4>(
+                init_method, inLengths, reduceDims, alpha, beta);
+            break;
+        };
+    }
+    else if(data_type == 3)
+    {
+        switch(reduceDims.size())
+        {
+        case 1:
+            result = test_reduce_with_index_impl<int8_t, int8_t, int8_t, Rank, 1>(
+                init_method, inLengths, reduceDims, alpha, beta);
+            break;
+        case 3:
+            result = test_reduce_with_index_impl<int8_t, int8_t, int8_t, Rank, 3>(
+                init_method, inLengths, reduceDims, alpha, beta);
+            break;
+        case 4:
+            result = test_reduce_with_index_impl<int8_t, int8_t, int8_t, Rank, 4>(
+                init_method, inLengths, reduceDims, alpha, beta);
+            break;
+        };
+    }
+    else if(data_type == 5)
+    {
+        switch(reduceDims.size())
+        {
+        case 1:
+            result = test_reduce_with_index_impl<bhalf_t, float, bhalf_t, Rank, 1>(
+                init_method, inLengths, reduceDims, alpha, beta);
+            break;
+        case 3:
+            result = test_reduce_with_index_impl<bhalf_t, float, bhalf_t, Rank, 3>(
+                init_method, inLengths, reduceDims, alpha, beta);
+            break;
+        case 4:
+            result = test_reduce_with_index_impl<bhalf_t, float, bhalf_t, Rank, 4>(
+                init_method, inLengths, reduceDims, alpha, beta);
+            break;
+        };
+    }
+
+    return (result);
+};
+
+int main(int argc, char* argv[])
+{
+    SimpleAppArgs args;
+
+    bool result = true;
+
+    if(argc == 1)
+    {
+        int data_type   = 1;
+        int init_method = 2;
+        std::vector<size_t> inLengths{64, 4, 280, 80};
+        std::vector<std::vector<int>> v_reduceDims{
+            {0, 1, 2, 3}, {0, 1, 2}, {1, 2, 3}, {0, 1, 3}, {0, 2, 3}, {0}, {1}, {2}, {3}};
+
+        for(auto& reduceDims : v_reduceDims)
+            result = result && test_reduce_with_index(
+                                   data_type, init_method, reduceDims, inLengths, 1.0f, 0.0f);
+    }
+    else
+    {
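// Editor's note: the argc == 1 branch above gives the CI a zero-configuration
// smoke test: fp32 (data_type 1), random-integer init, nine reduce-dim subsets
// of a 64x4x280x80 input. With arguments, exactly one configuration runs, which
// is how script/test_reduce_with_index.sh drives it; either way main() returns
// 0 on success and -1 on failure, so callers can rely on the exit status.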
if(args.processArgs(argc, argv) < 0) + { + throw std::runtime_error( + "Invalid input arguments, test_reduce_with_index could not be executed!"); + }; + + result = test_reduce_with_index(args.data_type, + args.init_method, + args.reduceDims, + args.inLengths, + args.scales[0], + args.scales[1]); + } + + std::cout << "test_reduce_with_index ..... " << (result ? "SUCCESS" : "FAILURE") << std::endl; + + return (result ? 0 : -1); +}
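The with-index tests request flattened indices: each reduced output element also records where the winning value sat inside its reduced slice. A small, self-contained sketch of how a consumer might decode such an offset back into per-dimension coordinates, assuming the reduced dimensions are linearized row-major (an assumption about the layout, not something this patch states); `decode_flattened_index` is a hypothetical helper:

```cpp
#include <cstdint>
#include <vector>

// Decode a flattened index into coordinates over the reduced dimensions,
// assuming the last reduced dimension varies fastest (row-major).
std::vector<int64_t> decode_flattened_index(int64_t flat,
                                            const std::vector<int64_t>& reduceLengths)
{
    std::vector<int64_t> coord(reduceLengths.size());
    for(size_t i = reduceLengths.size(); i-- > 0;)
    {
        coord[i] = flat % reduceLengths[i];
        flat /= reduceLengths[i];
    }
    return coord;
}

// e.g. with reduceLengths {4, 280, 82} (the -R 1,2,3 case), flat == 23000
// decodes to {1, 0, 40}:  23000 = (1 * 280 + 0) * 82 + 40.
```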