Add NumReduceDim template parameter to DeviceSoftmax and the Softmax client API to simplify instance collection

2026-06-06 05:55:39 +00:00 · 2023-05-30 21:17:54 +00:00
parent ac9e01e2cc
commit 240900b1c0
57 changed files with 441 additions and 398 deletions
--- a/client_example/06_softmax/softmax4d.cpp
+++ b/client_example/06_softmax/softmax4d.cpp
@@ -53,8 +53,13 @@ int main(int argc, char* argv[])
    SimpleDeviceMem in(sizeof(InDataType) * num_elements);
    SimpleDeviceMem out(sizeof(OutDataType) * num_elements);

-    using DeviceOp = ck::tensor_operation::device::
-        DeviceSoftmax<InDataType, AccDataType, OutDataType, PassThrough, PassThrough, Rank>;
+    using DeviceOp = ck::tensor_operation::device::DeviceSoftmax<InDataType,
+                                                                 AccDataType,
+                                                                 OutDataType,
+                                                                 PassThrough,
+                                                                 PassThrough,
+                                                                 Rank,
+                                                                 NumReduceDim>;
    // get device op instances
    const auto op_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory<
        DeviceOp>::GetInstances();
@@ -74,11 +79,6 @@ int main(int argc, char* argv[])
    {
        auto& op_ptr = op_ptrs[i];

-        if(op_ptr->GetRank() != Rank || op_ptr->GetNumReduceDim() != NumReduceDim)
-        {
-            continue;
-        }
-
        auto argument_ptr   = op_ptr->MakeArgumentPointer(in_lengths,
                                                        in_strides,
                                                        reduce_dims,
--- a/include/ck/tensor_operation/gpu/device/device_softmax.hpp
+++ b/include/ck/tensor_operation/gpu/device/device_softmax.hpp
@@ -18,7 +18,8 @@ template <typename InDataType,
          typename OutDataType,
          typename InElementwiseOp,
          typename AccElementwiseOp,
-          index_t Rank>
+          index_t Rank,
+          index_t NumReduceDim>
 struct DeviceSoftmax : public BaseOperator
 {
    //
@@ -49,8 +50,6 @@ struct DeviceSoftmax : public BaseOperator
                        AccElementwiseOp acc_elementwise_op) = 0;

    virtual std::unique_ptr<BaseInvoker> MakeInvokerPointer() = 0;
-    virtual index_t GetRank() const                           = 0;
-    virtual index_t GetNumReduceDim() const                   = 0;
 };

 template <typename InDataType,
@@ -58,9 +57,15 @@ template <typename InDataType,
          typename OutDataType,
          typename InElementwiseOp,
          typename AccElementwiseOp,
-          index_t Rank>
-using DeviceSoftmaxPtr = std::unique_ptr<
-    DeviceSoftmax<InDataType, AccDataType, OutDataType, InElementwiseOp, AccElementwiseOp, Rank>>;
+          index_t Rank,
+          index_t NumReduceDim>
+using DeviceSoftmaxPtr = std::unique_ptr<DeviceSoftmax<InDataType,
+                                                       AccDataType,
+                                                       OutDataType,
+                                                       InElementwiseOp,
+                                                       AccElementwiseOp,
+                                                       Rank,
+                                                       NumReduceDim>>;

 } // namespace device
 } // namespace tensor_operation
--- a/include/ck/tensor_operation/gpu/device/impl/device_softmax_impl.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_softmax_impl.hpp
@@ -38,16 +38,9 @@ struct DeviceSoftmaxImpl : public DeviceSoftmax<InDataType,
                                                OutDataType,
                                                InElementwiseOp,
                                                AccElementwiseOp,
-                                                Rank>
+                                                Rank,
+                                                NumReduceDim>
 {
-    static constexpr index_t kRank            = Rank;
-    static constexpr index_t kNumReduceDim    = NumReduceDim;
-    static constexpr index_t kNumInvariantDim = Rank - NumReduceDim;
-
-    virtual index_t GetRank() const override { return kRank; }
-
-    virtual index_t GetNumReduceDim() const override { return kNumReduceDim; }
-
    static constexpr index_t NumInvariantDim = Rank - NumReduceDim;

    static constexpr index_t NumSrcDim = Rank;
@@ -287,13 +280,13 @@ struct DeviceSoftmaxImpl : public DeviceSoftmax<InDataType,
    {
        if constexpr(InSrcVectorDim == 0)
        {
-            if constexpr(kNumInvariantDim == 0)
+            if constexpr(NumInvariantDim == 0)
            {
                return false;
            }
            else
            {
-                if(arg.inStrides_[kNumInvariantDim - 1] != 1 && InSrcVectorSize != 1)
+                if(arg.inStrides_[NumInvariantDim - 1] != 1 && InSrcVectorSize != 1)
                {
                    return false;
                }
@@ -316,7 +309,7 @@ struct DeviceSoftmaxImpl : public DeviceSoftmax<InDataType,
        }

        // To improve
-        if(kNumInvariantDim > 0 && arg.invariant_lowest_length_ % OutDstVectorSize != 0)
+        if(NumInvariantDim > 0 && arg.invariant_lowest_length_ % OutDstVectorSize != 0)
        {
            return false;
        }
--- a/library/include/ck/library/tensor_operation_instance/gpu/softmax.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/softmax.hpp
@@ -9,34 +9,33 @@
 #include "ck/ck.hpp"
 #include "ck/library/tensor_operation_instance/device_operation_instance_factory.hpp"
 #include "ck/tensor_operation/gpu/device/device_softmax.hpp"
+#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_instance.hpp"

 namespace ck {
 namespace tensor_operation {
 namespace device {
 namespace instance {

-void add_device_softmax_f16_f16_rank3_instances(
-    std::vector<DeviceSoftmaxPtr<F16, F32, F16, PassThrough, PassThrough, 3>>&);
-void add_device_softmax_f16_f16_rank4_instances(
-    std::vector<DeviceSoftmaxPtr<F16, F32, F16, PassThrough, PassThrough, 4>>&);
-
-void add_device_softmax_f32_f32_rank3_instances(
-    std::vector<DeviceSoftmaxPtr<F32, F32, F32, PassThrough, PassThrough, 3>>&);
-void add_device_softmax_f32_f32_rank4_instances(
-    std::vector<DeviceSoftmaxPtr<F32, F32, F32, PassThrough, PassThrough, 4>>&);
-
-void add_device_softmax_i8_i8_rank3_instances(
-    std::vector<DeviceSoftmaxPtr<I8, F32, I8, PassThrough, PassThrough, 3>>&);
-void add_device_softmax_i8_i8_rank4_instances(
-    std::vector<DeviceSoftmaxPtr<I8, F32, I8, PassThrough, PassThrough, 4>>&);
-
-template <typename InDataType, typename AccDataType, typename OutDataType, index_t Rank>
-struct DeviceOperationInstanceFactory<
-    ck::tensor_operation::device::
-        DeviceSoftmax<InDataType, AccDataType, OutDataType, PassThrough, PassThrough, Rank>>
+template <typename InDataType,
+          typename AccDataType,
+          typename OutDataType,
+          index_t Rank,
+          index_t NumReduceDim>
+struct DeviceOperationInstanceFactory<ck::tensor_operation::device::DeviceSoftmax<InDataType,
+                                                                                  AccDataType,
+                                                                                  OutDataType,
+                                                                                  PassThrough,
+                                                                                  PassThrough,
+                                                                                  Rank,
+                                                                                  NumReduceDim>>
 {
-    using DeviceOp =
-        DeviceSoftmax<InDataType, AccDataType, OutDataType, PassThrough, PassThrough, Rank>;
+    using DeviceOp = DeviceSoftmax<InDataType,
+                                   AccDataType,
+                                   OutDataType,
+                                   PassThrough,
+                                   PassThrough,
+                                   Rank,
+                                   NumReduceDim>;

    static auto GetInstances()
    {
@@ -46,25 +45,73 @@ struct DeviceOperationInstanceFactory<
                     std::is_same_v<OutDataType, F16>)
        {
            if constexpr(Rank == 3)
-                add_device_softmax_f16_f16_rank3_instances(op_ptrs);
+            {
+                if constexpr(NumReduceDim == 1)
+                    add_device_softmax_f16_f16_rank3_reduce1_instances(op_ptrs);
+                else if constexpr(NumReduceDim == 2)
+                    add_device_softmax_f16_f16_rank3_reduce2_instances(op_ptrs);
+                else if constexpr(NumReduceDim == 3)
+                    add_device_softmax_f16_f16_rank3_reduce3_instances(op_ptrs);
+            }
            else if constexpr(Rank == 4)
-                add_device_softmax_f16_f16_rank4_instances(op_ptrs);
+            {
+                if constexpr(NumReduceDim == 1)
+                    add_device_softmax_f16_f16_rank4_reduce1_instances(op_ptrs);
+                else if constexpr(NumReduceDim == 2)
+                    add_device_softmax_f16_f16_rank4_reduce2_instances(op_ptrs);
+                else if constexpr(NumReduceDim == 3)
+                    add_device_softmax_f16_f16_rank4_reduce3_instances(op_ptrs);
+                else if constexpr(NumReduceDim == 4)
+                    add_device_softmax_f16_f16_rank4_reduce4_instances(op_ptrs);
+            }
        }
        else if constexpr(std::is_same_v<InDataType, F32> && std::is_same_v<AccDataType, F32> &&
                          std::is_same_v<OutDataType, F32>)
        {
            if constexpr(Rank == 3)
-                add_device_softmax_f32_f32_rank3_instances(op_ptrs);
+            {
+                if constexpr(NumReduceDim == 1)
+                    add_device_softmax_f32_f32_rank3_reduce1_instances(op_ptrs);
+                else if constexpr(NumReduceDim == 2)
+                    add_device_softmax_f32_f32_rank3_reduce2_instances(op_ptrs);
+                else if constexpr(NumReduceDim == 3)
+                    add_device_softmax_f32_f32_rank3_reduce3_instances(op_ptrs);
+            }
            else if constexpr(Rank == 4)
-                add_device_softmax_f32_f32_rank4_instances(op_ptrs);
+            {
+                if constexpr(NumReduceDim == 1)
+                    add_device_softmax_f32_f32_rank4_reduce1_instances(op_ptrs);
+                else if constexpr(NumReduceDim == 2)
+                    add_device_softmax_f32_f32_rank4_reduce2_instances(op_ptrs);
+                else if constexpr(NumReduceDim == 3)
+                    add_device_softmax_f32_f32_rank4_reduce3_instances(op_ptrs);
+                else if constexpr(NumReduceDim == 4)
+                    add_device_softmax_f32_f32_rank4_reduce4_instances(op_ptrs);
+            }
        }
        else if constexpr(std::is_same_v<InDataType, I8> && std::is_same_v<AccDataType, F32> &&
                          std::is_same_v<OutDataType, I8>)
        {
            if constexpr(Rank == 3)
-                add_device_softmax_i8_i8_rank3_instances(op_ptrs);
+            {
+                if constexpr(NumReduceDim == 1)
+                    add_device_softmax_i8_i8_rank3_reduce1_instances(op_ptrs);
+                else if constexpr(NumReduceDim == 2)
+                    add_device_softmax_i8_i8_rank3_reduce2_instances(op_ptrs);
+                else if constexpr(NumReduceDim == 3)
+                    add_device_softmax_i8_i8_rank3_reduce3_instances(op_ptrs);
+            }
            else if constexpr(Rank == 4)
-                add_device_softmax_i8_i8_rank4_instances(op_ptrs);
+            {
+                if constexpr(NumReduceDim == 1)
+                    add_device_softmax_i8_i8_rank4_reduce1_instances(op_ptrs);
+                else if constexpr(NumReduceDim == 2)
+                    add_device_softmax_i8_i8_rank4_reduce2_instances(op_ptrs);
+                else if constexpr(NumReduceDim == 3)
+                    add_device_softmax_i8_i8_rank4_reduce3_instances(op_ptrs);
+                else if constexpr(NumReduceDim == 4)
+                    add_device_softmax_i8_i8_rank4_reduce4_instances(op_ptrs);
+            }
        }

        return op_ptrs;
--- a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance.hpp
@@ -1,22 +0,0 @@
-// SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
-
-#pragma once
-
-#include "ck/library/tensor_operation_instance/device_operation_instance_factory.hpp"
-#include "ck/tensor_operation/gpu/device/device_softmax.hpp"
-
-namespace ck {
-namespace tensor_operation {
-namespace device {
-namespace instance {
-
-void add_device_softmax_f16_f16_rank3_instances(
-    std::vector<DeviceSoftmaxPtr<F16, F32, F16, PassThrough, PassThrough, 3>>& instances);
-void add_device_softmax_f16_f16_rank4_instances(
-    std::vector<DeviceSoftmaxPtr<F16, F32, F16, PassThrough, PassThrough, 4>>& instances);
-
-} // namespace instance
-} // namespace device
-} // namespace tensor_operation
-} // namespace ck
--- a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank3_reduce1.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank3_reduce1.hpp
@@ -14,7 +14,7 @@ namespace device {
 namespace instance {

 void add_device_softmax_f16_f16_rank3_reduce1_instances(
-    std::vector<DeviceSoftmaxPtr<F16, F32, F16, PassThrough, PassThrough, 3>>& instances);
+    std::vector<DeviceSoftmaxPtr<F16, F32, F16, PassThrough, PassThrough, 3, 1>>& instances);

 } // namespace instance
 } // namespace device
--- a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank3_reduce2.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank3_reduce2.hpp
@@ -14,7 +14,7 @@ namespace device {
 namespace instance {

 void add_device_softmax_f16_f16_rank3_reduce2_instances(
-    std::vector<DeviceSoftmaxPtr<F16, F32, F16, PassThrough, PassThrough, 3>>& instances);
+    std::vector<DeviceSoftmaxPtr<F16, F32, F16, PassThrough, PassThrough, 3, 2>>& instances);

 } // namespace instance
 } // namespace device
--- a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank3_reduce3.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank3_reduce3.hpp
@@ -14,7 +14,7 @@ namespace device {
 namespace instance {

 void add_device_softmax_f16_f16_rank3_reduce3_instances(
-    std::vector<DeviceSoftmaxPtr<F16, F32, F16, PassThrough, PassThrough, 3>>& instances);
+    std::vector<DeviceSoftmaxPtr<F16, F32, F16, PassThrough, PassThrough, 3, 3>>& instances);

 } // namespace instance
 } // namespace device
--- a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank4_reduce1.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank4_reduce1.hpp
@@ -14,7 +14,7 @@ namespace device {
 namespace instance {

 void add_device_softmax_f16_f16_rank4_reduce1_instances(
-    std::vector<DeviceSoftmaxPtr<F16, F32, F16, PassThrough, PassThrough, 4>>& instances);
+    std::vector<DeviceSoftmaxPtr<F16, F32, F16, PassThrough, PassThrough, 4, 1>>& instances);

 } // namespace instance
 } // namespace device
--- a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank4_reduce2.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank4_reduce2.hpp
@@ -14,7 +14,7 @@ namespace device {
 namespace instance {

 void add_device_softmax_f16_f16_rank4_reduce2_instances(
-    std::vector<DeviceSoftmaxPtr<F16, F32, F16, PassThrough, PassThrough, 4>>& instances);
+    std::vector<DeviceSoftmaxPtr<F16, F32, F16, PassThrough, PassThrough, 4, 2>>& instances);

 } // namespace instance
 } // namespace device
--- a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank4_reduce3.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank4_reduce3.hpp
@@ -14,7 +14,7 @@ namespace device {
 namespace instance {

 void add_device_softmax_f16_f16_rank4_reduce3_instances(
-    std::vector<DeviceSoftmaxPtr<F16, F32, F16, PassThrough, PassThrough, 4>>& instances);
+    std::vector<DeviceSoftmaxPtr<F16, F32, F16, PassThrough, PassThrough, 4, 3>>& instances);

 } // namespace instance
 } // namespace device
--- a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank4_reduce4.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank4_reduce4.hpp
@@ -14,7 +14,7 @@ namespace device {
 namespace instance {

 void add_device_softmax_f16_f16_rank4_reduce4_instances(
-    std::vector<DeviceSoftmaxPtr<F16, F32, F16, PassThrough, PassThrough, 4>>& instances);
+    std::vector<DeviceSoftmaxPtr<F16, F32, F16, PassThrough, PassThrough, 4, 4>>& instances);

 } // namespace instance
 } // namespace device
--- a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance.hpp
@@ -1,22 +0,0 @@
-// SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
-
-#pragma once
-
-#include "ck/library/tensor_operation_instance/device_operation_instance_factory.hpp"
-#include "ck/tensor_operation/gpu/device/device_softmax.hpp"
-
-namespace ck {
-namespace tensor_operation {
-namespace device {
-namespace instance {
-
-void add_device_softmax_f32_f32_rank3_instances(
-    std::vector<DeviceSoftmaxPtr<F32, F32, F32, PassThrough, PassThrough, 3>>& instances);
-void add_device_softmax_f32_f32_rank4_instances(
-    std::vector<DeviceSoftmaxPtr<F32, F32, F32, PassThrough, PassThrough, 4>>& instances);
-
-} // namespace instance
-} // namespace device
-} // namespace tensor_operation
-} // namespace ck
--- a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank3_reduce1.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank3_reduce1.hpp
@@ -14,7 +14,7 @@ namespace device {
 namespace instance {

 void add_device_softmax_f32_f32_rank3_reduce1_instances(
-    std::vector<DeviceSoftmaxPtr<F32, F32, F32, PassThrough, PassThrough, 3>>& instances);
+    std::vector<DeviceSoftmaxPtr<F32, F32, F32, PassThrough, PassThrough, 3, 1>>& instances);

 } // namespace instance
 } // namespace device
--- a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank3_reduce2.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank3_reduce2.hpp
@@ -14,7 +14,7 @@ namespace device {
 namespace instance {

 void add_device_softmax_f32_f32_rank3_reduce2_instances(
-    std::vector<DeviceSoftmaxPtr<F32, F32, F32, PassThrough, PassThrough, 3>>& instances);
+    std::vector<DeviceSoftmaxPtr<F32, F32, F32, PassThrough, PassThrough, 3, 2>>& instances);

 } // namespace instance
 } // namespace device
--- a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank3_reduce3.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank3_reduce3.hpp
@@ -14,7 +14,7 @@ namespace device {
 namespace instance {

 void add_device_softmax_f32_f32_rank3_reduce3_instances(
-    std::vector<DeviceSoftmaxPtr<F32, F32, F32, PassThrough, PassThrough, 3>>& instances);
+    std::vector<DeviceSoftmaxPtr<F32, F32, F32, PassThrough, PassThrough, 3, 3>>& instances);

 } // namespace instance
 } // namespace device
--- a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank4_reduce1.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank4_reduce1.hpp
@@ -14,7 +14,7 @@ namespace device {
 namespace instance {

 void add_device_softmax_f32_f32_rank4_reduce1_instances(
-    std::vector<DeviceSoftmaxPtr<F32, F32, F32, PassThrough, PassThrough, 4>>& instances);
+    std::vector<DeviceSoftmaxPtr<F32, F32, F32, PassThrough, PassThrough, 4, 1>>& instances);

 } // namespace instance
 } // namespace device
--- a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank4_reduce2.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank4_reduce2.hpp
@@ -14,7 +14,7 @@ namespace device {
 namespace instance {

 void add_device_softmax_f32_f32_rank4_reduce2_instances(
-    std::vector<DeviceSoftmaxPtr<F32, F32, F32, PassThrough, PassThrough, 4>>& instances);
+    std::vector<DeviceSoftmaxPtr<F32, F32, F32, PassThrough, PassThrough, 4, 2>>& instances);

 } // namespace instance
 } // namespace device
--- a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank4_reduce3.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank4_reduce3.hpp
@@ -14,7 +14,7 @@ namespace device {
 namespace instance {

 void add_device_softmax_f32_f32_rank4_reduce3_instances(
-    std::vector<DeviceSoftmaxPtr<F32, F32, F32, PassThrough, PassThrough, 4>>& instances);
+    std::vector<DeviceSoftmaxPtr<F32, F32, F32, PassThrough, PassThrough, 4, 3>>& instances);

 } // namespace instance
 } // namespace device
--- a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank4_reduce4.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank4_reduce4.hpp
@@ -14,7 +14,7 @@ namespace device {
 namespace instance {

 void add_device_softmax_f32_f32_rank4_reduce4_instances(
-    std::vector<DeviceSoftmaxPtr<F32, F32, F32, PassThrough, PassThrough, 4>>& instances);
+    std::vector<DeviceSoftmaxPtr<F32, F32, F32, PassThrough, PassThrough, 4, 4>>& instances);

 } // namespace instance
 } // namespace device
--- a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance.hpp
@@ -1,22 +0,0 @@
-// SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
-
-#pragma once
-
-#include "ck/library/tensor_operation_instance/device_operation_instance_factory.hpp"
-#include "ck/tensor_operation/gpu/device/device_softmax.hpp"
-
-namespace ck {
-namespace tensor_operation {
-namespace device {
-namespace instance {
-
-void add_device_softmax_i8_i8_rank3_instances(
-    std::vector<DeviceSoftmaxPtr<F16, F32, F16, PassThrough, PassThrough, 3>>& instances);
-void add_device_softmax_i8_i8_rank4_instances(
-    std::vector<DeviceSoftmaxPtr<F16, F32, F16, PassThrough, PassThrough, 4>>& instances);
-
-} // namespace instance
-} // namespace device
-} // namespace tensor_operation
-} // namespace ck
--- a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank3_reduce1.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank3_reduce1.hpp
@@ -14,7 +14,7 @@ namespace device {
 namespace instance {

 void add_device_softmax_i8_i8_rank3_reduce1_instances(
-    std::vector<DeviceSoftmaxPtr<I8, F32, I8, PassThrough, PassThrough, 3>>& instances);
+    std::vector<DeviceSoftmaxPtr<I8, F32, I8, PassThrough, PassThrough, 3, 1>>& instances);

 } // namespace instance
 } // namespace device
--- a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank3_reduce2.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank3_reduce2.hpp
@@ -14,7 +14,7 @@ namespace device {
 namespace instance {

 void add_device_softmax_i8_i8_rank3_reduce2_instances(
-    std::vector<DeviceSoftmaxPtr<I8, F32, I8, PassThrough, PassThrough, 3>>& instances);
+    std::vector<DeviceSoftmaxPtr<I8, F32, I8, PassThrough, PassThrough, 3, 2>>& instances);

 } // namespace instance
 } // namespace device
--- a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank3_reduce3.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank3_reduce3.hpp
@@ -14,7 +14,7 @@ namespace device {
 namespace instance {

 void add_device_softmax_i8_i8_rank3_reduce3_instances(
-    std::vector<DeviceSoftmaxPtr<I8, F32, I8, PassThrough, PassThrough, 3>>& instances);
+    std::vector<DeviceSoftmaxPtr<I8, F32, I8, PassThrough, PassThrough, 3, 3>>& instances);

 } // namespace instance
 } // namespace device
--- a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce1.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce1.hpp
@@ -14,7 +14,7 @@ namespace device {
 namespace instance {

 void add_device_softmax_i8_i8_rank4_reduce1_instances(
-    std::vector<DeviceSoftmaxPtr<I8, F32, I8, PassThrough, PassThrough, 4>>& instances);
+    std::vector<DeviceSoftmaxPtr<I8, F32, I8, PassThrough, PassThrough, 4, 1>>& instances);

 } // namespace instance
 } // namespace device
--- a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce2.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce2.hpp
@@ -14,7 +14,7 @@ namespace device {
 namespace instance {

 void add_device_softmax_i8_i8_rank4_reduce2_instances(
-    std::vector<DeviceSoftmaxPtr<I8, F32, I8, PassThrough, PassThrough, 4>>& instances);
+    std::vector<DeviceSoftmaxPtr<I8, F32, I8, PassThrough, PassThrough, 4, 2>>& instances);

 } // namespace instance
 } // namespace device
--- a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce3.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce3.hpp
@@ -14,7 +14,7 @@ namespace device {
 namespace instance {

 void add_device_softmax_i8_i8_rank4_reduce3_instances(
-    std::vector<DeviceSoftmaxPtr<I8, F32, I8, PassThrough, PassThrough, 4>>& instances);
+    std::vector<DeviceSoftmaxPtr<I8, F32, I8, PassThrough, PassThrough, 4, 3>>& instances);

 } // namespace instance
 } // namespace device
--- a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce4.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce4.hpp
@@ -14,7 +14,7 @@ namespace device {
 namespace instance {

 void add_device_softmax_i8_i8_rank4_reduce4_instances(
-    std::vector<DeviceSoftmaxPtr<I8, F32, I8, PassThrough, PassThrough, 4>>& instances);
+    std::vector<DeviceSoftmaxPtr<I8, F32, I8, PassThrough, PassThrough, 4, 4>>& instances);

 } // namespace instance
 } // namespace device
--- a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_instance.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_instance.hpp
@@ -3,6 +3,24 @@

 #pragma once

-#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance.hpp"
-#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance.hpp"
-#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance.hpp"
+#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank3_reduce1.hpp"
+#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank3_reduce2.hpp"
+#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank3_reduce3.hpp"
+#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank4_reduce1.hpp"
+#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank4_reduce2.hpp"
+#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank4_reduce3.hpp"
+#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank4_reduce4.hpp"
+#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank3_reduce1.hpp"
+#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank3_reduce2.hpp"
+#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank3_reduce3.hpp"
+#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank4_reduce1.hpp"
+#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank4_reduce2.hpp"
+#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank4_reduce3.hpp"
+#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank4_reduce4.hpp"
+#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank3_reduce1.hpp"
+#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank3_reduce2.hpp"
+#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank3_reduce3.hpp"
+#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce1.hpp"
+#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce2.hpp"
+#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce3.hpp"
+#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce4.hpp"
--- a/library/src/tensor_operation_instance/gpu/softmax/CMakeLists.txt
+++ b/library/src/tensor_operation_instance/gpu/softmax/CMakeLists.txt
@@ -1,5 +1,4 @@
 add_instance_library(device_softmax_instance
-    device_softmax_i8_i8_instance.cpp
    device_softmax_i8_i8_instance_rank3_reduce1.cpp
    device_softmax_i8_i8_instance_rank3_reduce2.cpp
    device_softmax_i8_i8_instance_rank3_reduce3.cpp
@@ -7,7 +6,6 @@ add_instance_library(device_softmax_instance
    device_softmax_i8_i8_instance_rank4_reduce2.cpp
    device_softmax_i8_i8_instance_rank4_reduce3.cpp
    device_softmax_i8_i8_instance_rank4_reduce4.cpp
-    device_softmax_f16_f16_instance.cpp
    device_softmax_f16_f16_instance_rank3_reduce1.cpp
    device_softmax_f16_f16_instance_rank3_reduce2.cpp
    device_softmax_f16_f16_instance_rank3_reduce3.cpp
@@ -15,7 +13,6 @@ add_instance_library(device_softmax_instance
    device_softmax_f16_f16_instance_rank4_reduce2.cpp
    device_softmax_f16_f16_instance_rank4_reduce3.cpp
    device_softmax_f16_f16_instance_rank4_reduce4.cpp
-    device_softmax_f32_f32_instance.cpp
    device_softmax_f32_f32_instance_rank3_reduce1.cpp
    device_softmax_f32_f32_instance_rank3_reduce2.cpp
    device_softmax_f32_f32_instance_rank3_reduce3.cpp
--- a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance.cpp
@@ -1,40 +0,0 @@
-// SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
-
-#include <vector>
-
-#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank3_reduce1.hpp"
-#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank3_reduce2.hpp"
-#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank3_reduce3.hpp"
-
-#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank4_reduce1.hpp"
-#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank4_reduce2.hpp"
-#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank4_reduce3.hpp"
-#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank4_reduce4.hpp"
-
-namespace ck {
-namespace tensor_operation {
-namespace device {
-namespace instance {
-
-void add_device_softmax_f16_f16_rank3_instances(
-    std::vector<DeviceSoftmaxPtr<F16, F32, F16, PassThrough, PassThrough, 3>>& instances)
-{
-    add_device_softmax_f16_f16_rank3_reduce1_instances(instances);
-    add_device_softmax_f16_f16_rank3_reduce2_instances(instances);
-    add_device_softmax_f16_f16_rank3_reduce3_instances(instances);
-}
-
-void add_device_softmax_f16_f16_rank4_instances(
-    std::vector<DeviceSoftmaxPtr<F16, F32, F16, PassThrough, PassThrough, 4>>& instances)
-{
-    add_device_softmax_f16_f16_rank4_reduce1_instances(instances);
-    add_device_softmax_f16_f16_rank4_reduce2_instances(instances);
-    add_device_softmax_f16_f16_rank4_reduce3_instances(instances);
-    add_device_softmax_f16_f16_rank4_reduce4_instances(instances);
-}
-
-} // namespace instance
-} // namespace device
-} // namespace tensor_operation
-} // namespace ck
--- a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank3_reduce1.cpp
+++ b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank3_reduce1.cpp
@@ -13,12 +13,10 @@ namespace tensor_operation {
 namespace device {
 namespace instance {

-static constexpr index_t RANK = 3;
-
 void add_device_softmax_f16_f16_rank3_reduce1_instances(
-    std::vector<DeviceSoftmaxPtr<F16, F32, F16, PassThrough, PassThrough, RANK>>& instances)
+    std::vector<DeviceSoftmaxPtr<F16, F32, F16, PassThrough, PassThrough, 3, 1>>& instances)
 {
-    add_device_operation_instances(instances, device_softmax_f16_f16_instances<RANK, 1>{});
+    add_device_operation_instances(instances, device_softmax_f16_f16_instances<3, 1>{});
 }

 } // namespace instance
--- a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank3_reduce2.cpp
+++ b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank3_reduce2.cpp
@@ -13,12 +13,10 @@ namespace tensor_operation {
 namespace device {
 namespace instance {

-static constexpr index_t RANK = 3;
-
 void add_device_softmax_f16_f16_rank3_reduce2_instances(
-    std::vector<DeviceSoftmaxPtr<F16, F32, F16, PassThrough, PassThrough, RANK>>& instances)
+    std::vector<DeviceSoftmaxPtr<F16, F32, F16, PassThrough, PassThrough, 3, 2>>& instances)
 {
-    add_device_operation_instances(instances, device_softmax_f16_f16_instances<RANK, 2>{});
+    add_device_operation_instances(instances, device_softmax_f16_f16_instances<3, 2>{});
 }

 } // namespace instance
--- a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank3_reduce3.cpp
+++ b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank3_reduce3.cpp
@@ -13,12 +13,10 @@ namespace tensor_operation {
 namespace device {
 namespace instance {

-static constexpr index_t RANK = 3;
-
 void add_device_softmax_f16_f16_rank3_reduce3_instances(
-    std::vector<DeviceSoftmaxPtr<F16, F32, F16, PassThrough, PassThrough, RANK>>& instances)
+    std::vector<DeviceSoftmaxPtr<F16, F32, F16, PassThrough, PassThrough, 3, 3>>& instances)
 {
-    add_device_operation_instances(instances, device_softmax_f16_f16_instances<RANK, 3>{});
+    add_device_operation_instances(instances, device_softmax_f16_f16_instances<3, 3>{});
 }

 } // namespace instance
--- a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank4_reduce1.cpp
+++ b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank4_reduce1.cpp
@@ -13,12 +13,10 @@ namespace tensor_operation {
 namespace device {
 namespace instance {

-static constexpr index_t RANK = 4;
-
 void add_device_softmax_f16_f16_rank4_reduce1_instances(
-    std::vector<DeviceSoftmaxPtr<F16, F32, F16, PassThrough, PassThrough, RANK>>& instances)
+    std::vector<DeviceSoftmaxPtr<F16, F32, F16, PassThrough, PassThrough, 4, 1>>& instances)
 {
-    add_device_operation_instances(instances, device_softmax_f16_f16_instances<RANK, 1>{});
+    add_device_operation_instances(instances, device_softmax_f16_f16_instances<4, 1>{});
 }

 } // namespace instance
--- a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank4_reduce2.cpp
+++ b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank4_reduce2.cpp
@@ -13,12 +13,10 @@ namespace tensor_operation {
 namespace device {
 namespace instance {

-static constexpr index_t RANK = 4;
-
 void add_device_softmax_f16_f16_rank4_reduce2_instances(
-    std::vector<DeviceSoftmaxPtr<F16, F32, F16, PassThrough, PassThrough, RANK>>& instances)
+    std::vector<DeviceSoftmaxPtr<F16, F32, F16, PassThrough, PassThrough, 4, 2>>& instances)
 {
-    add_device_operation_instances(instances, device_softmax_f16_f16_instances<RANK, 2>{});
+    add_device_operation_instances(instances, device_softmax_f16_f16_instances<4, 2>{});
 }

 } // namespace instance
--- a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank4_reduce3.cpp
+++ b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank4_reduce3.cpp
@@ -13,12 +13,10 @@ namespace tensor_operation {
 namespace device {
 namespace instance {

-static constexpr index_t RANK = 4;
-
 void add_device_softmax_f16_f16_rank4_reduce3_instances(
-    std::vector<DeviceSoftmaxPtr<F16, F32, F16, PassThrough, PassThrough, RANK>>& instances)
+    std::vector<DeviceSoftmaxPtr<F16, F32, F16, PassThrough, PassThrough, 4, 3>>& instances)
 {
-    add_device_operation_instances(instances, device_softmax_f16_f16_instances<RANK, 3>{});
+    add_device_operation_instances(instances, device_softmax_f16_f16_instances<4, 3>{});
 }

 } // namespace instance
--- a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank4_reduce4.cpp
+++ b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank4_reduce4.cpp
@@ -13,12 +13,10 @@ namespace tensor_operation {
 namespace device {
 namespace instance {

-static constexpr index_t RANK = 4;
-
 void add_device_softmax_f16_f16_rank4_reduce4_instances(
-    std::vector<DeviceSoftmaxPtr<F16, F32, F16, PassThrough, PassThrough, RANK>>& instances)
+    std::vector<DeviceSoftmaxPtr<F16, F32, F16, PassThrough, PassThrough, 4, 4>>& instances)
 {
-    add_device_operation_instances(instances, device_softmax_f16_f16_instances<RANK, 4>{});
+    add_device_operation_instances(instances, device_softmax_f16_f16_instances<4, 4>{});
 }

 } // namespace instance
--- a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance.cpp
@@ -1,40 +0,0 @@
-// SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
-
-#include <vector>
-
-#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank3_reduce1.hpp"
-#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank3_reduce2.hpp"
-#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank3_reduce3.hpp"
-
-#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank4_reduce1.hpp"
-#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank4_reduce2.hpp"
-#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank4_reduce3.hpp"
-#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank4_reduce4.hpp"
-
-namespace ck {
-namespace tensor_operation {
-namespace device {
-namespace instance {
-
-void add_device_softmax_f32_f32_rank3_instances(
-    std::vector<DeviceSoftmaxPtr<F32, F32, F32, PassThrough, PassThrough, 3>>& instances)
-{
-    add_device_softmax_f32_f32_rank3_reduce1_instances(instances);
-    add_device_softmax_f32_f32_rank3_reduce2_instances(instances);
-    add_device_softmax_f32_f32_rank3_reduce3_instances(instances);
-}
-
-void add_device_softmax_f32_f32_rank4_instances(
-    std::vector<DeviceSoftmaxPtr<F32, F32, F32, PassThrough, PassThrough, 4>>& instances)
-{
-    add_device_softmax_f32_f32_rank4_reduce1_instances(instances);
-    add_device_softmax_f32_f32_rank4_reduce2_instances(instances);
-    add_device_softmax_f32_f32_rank4_reduce3_instances(instances);
-    add_device_softmax_f32_f32_rank4_reduce4_instances(instances);
-}
-
-} // namespace instance
-} // namespace device
-} // namespace tensor_operation
-} // namespace ck
--- a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank3_reduce1.cpp
+++ b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank3_reduce1.cpp
@@ -13,12 +13,10 @@ namespace tensor_operation {
 namespace device {
 namespace instance {

-static constexpr index_t RANK = 3;
-
 void add_device_softmax_f32_f32_rank3_reduce1_instances(
-    std::vector<DeviceSoftmaxPtr<F32, F32, F32, PassThrough, PassThrough, RANK>>& instances)
+    std::vector<DeviceSoftmaxPtr<F32, F32, F32, PassThrough, PassThrough, 3, 1>>& instances)
 {
-    add_device_operation_instances(instances, device_softmax_f32_f32_instances<RANK, 1>{});
+    add_device_operation_instances(instances, device_softmax_f32_f32_instances<3, 1>{});
 }

 } // namespace instance
--- a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank3_reduce2.cpp
+++ b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank3_reduce2.cpp
@@ -13,12 +13,10 @@ namespace tensor_operation {
 namespace device {
 namespace instance {

-static constexpr index_t RANK = 3;
-
 void add_device_softmax_f32_f32_rank3_reduce2_instances(
-    std::vector<DeviceSoftmaxPtr<F32, F32, F32, PassThrough, PassThrough, RANK>>& instances)
+    std::vector<DeviceSoftmaxPtr<F32, F32, F32, PassThrough, PassThrough, 3, 2>>& instances)
 {
-    add_device_operation_instances(instances, device_softmax_f32_f32_instances<RANK, 2>{});
+    add_device_operation_instances(instances, device_softmax_f32_f32_instances<3, 2>{});
 }

 } // namespace instance
--- a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank3_reduce3.cpp
+++ b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank3_reduce3.cpp
@@ -13,12 +13,10 @@ namespace tensor_operation {
 namespace device {
 namespace instance {

-static constexpr index_t RANK = 3;
-
 void add_device_softmax_f32_f32_rank3_reduce3_instances(
-    std::vector<DeviceSoftmaxPtr<F32, F32, F32, PassThrough, PassThrough, RANK>>& instances)
+    std::vector<DeviceSoftmaxPtr<F32, F32, F32, PassThrough, PassThrough, 3, 3>>& instances)
 {
-    add_device_operation_instances(instances, device_softmax_f32_f32_instances<RANK, 3>{});
+    add_device_operation_instances(instances, device_softmax_f32_f32_instances<3, 3>{});
 }

 } // namespace instance
--- a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank4_reduce1.cpp
+++ b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank4_reduce1.cpp
@@ -13,12 +13,10 @@ namespace tensor_operation {
 namespace device {
 namespace instance {

-static constexpr index_t RANK = 4;
-
 void add_device_softmax_f32_f32_rank4_reduce1_instances(
-    std::vector<DeviceSoftmaxPtr<F32, F32, F32, PassThrough, PassThrough, RANK>>& instances)
+    std::vector<DeviceSoftmaxPtr<F32, F32, F32, PassThrough, PassThrough, 4, 1>>& instances)
 {
-    add_device_operation_instances(instances, device_softmax_f32_f32_instances<RANK, 1>{});
+    add_device_operation_instances(instances, device_softmax_f32_f32_instances<4, 1>{});
 }

 } // namespace instance
--- a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank4_reduce2.cpp
+++ b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank4_reduce2.cpp
@@ -13,12 +13,10 @@ namespace tensor_operation {
 namespace device {
 namespace instance {

-static constexpr index_t RANK = 4;
-
 void add_device_softmax_f32_f32_rank4_reduce2_instances(
-    std::vector<DeviceSoftmaxPtr<F32, F32, F32, PassThrough, PassThrough, RANK>>& instances)
+    std::vector<DeviceSoftmaxPtr<F32, F32, F32, PassThrough, PassThrough, 4, 2>>& instances)
 {
-    add_device_operation_instances(instances, device_softmax_f32_f32_instances<RANK, 2>{});
+    add_device_operation_instances(instances, device_softmax_f32_f32_instances<4, 2>{});
 }

 } // namespace instance
--- a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank4_reduce3.cpp
+++ b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank4_reduce3.cpp
@@ -13,12 +13,10 @@ namespace tensor_operation {
 namespace device {
 namespace instance {

-static constexpr index_t RANK = 4;
-
 void add_device_softmax_f32_f32_rank4_reduce3_instances(
-    std::vector<DeviceSoftmaxPtr<F32, F32, F32, PassThrough, PassThrough, RANK>>& instances)
+    std::vector<DeviceSoftmaxPtr<F32, F32, F32, PassThrough, PassThrough, 4, 3>>& instances)
 {
-    add_device_operation_instances(instances, device_softmax_f32_f32_instances<RANK, 3>{});
+    add_device_operation_instances(instances, device_softmax_f32_f32_instances<4, 3>{});
 }

 } // namespace instance
--- a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank4_reduce4.cpp
+++ b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank4_reduce4.cpp
@@ -13,12 +13,10 @@ namespace tensor_operation {
 namespace device {
 namespace instance {

-static constexpr index_t RANK = 4;
-
 void add_device_softmax_f32_f32_rank4_reduce4_instances(
-    std::vector<DeviceSoftmaxPtr<F32, F32, F32, PassThrough, PassThrough, RANK>>& instances)
+    std::vector<DeviceSoftmaxPtr<F32, F32, F32, PassThrough, PassThrough, 4, 4>>& instances)
 {
-    add_device_operation_instances(instances, device_softmax_f32_f32_instances<RANK, 4>{});
+    add_device_operation_instances(instances, device_softmax_f32_f32_instances<4, 4>{});
 }

 } // namespace instance
--- a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance.cpp
@@ -1,40 +0,0 @@
-// SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
-
-#include <vector>
-
-#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank3_reduce1.hpp"
-#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank3_reduce2.hpp"
-#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank3_reduce3.hpp"
-
-#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce1.hpp"
-#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce2.hpp"
-#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce3.hpp"
-#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce4.hpp"
-
-namespace ck {
-namespace tensor_operation {
-namespace device {
-namespace instance {
-
-void add_device_softmax_i8_i8_rank3_instances(
-    std::vector<DeviceSoftmaxPtr<I8, F32, I8, PassThrough, PassThrough, 3>>& instances)
-{
-    add_device_softmax_i8_i8_rank3_reduce1_instances(instances);
-    add_device_softmax_i8_i8_rank3_reduce2_instances(instances);
-    add_device_softmax_i8_i8_rank3_reduce3_instances(instances);
-}
-
-void add_device_softmax_i8_i8_rank4_instances(
-    std::vector<DeviceSoftmaxPtr<I8, F32, I8, PassThrough, PassThrough, 4>>& instances)
-{
-    add_device_softmax_i8_i8_rank4_reduce1_instances(instances);
-    add_device_softmax_i8_i8_rank4_reduce2_instances(instances);
-    add_device_softmax_i8_i8_rank4_reduce3_instances(instances);
-    add_device_softmax_i8_i8_rank4_reduce4_instances(instances);
-}
-
-} // namespace instance
-} // namespace device
-} // namespace tensor_operation
-} // namespace ck
--- a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank3_reduce1.cpp
+++ b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank3_reduce1.cpp
@@ -13,12 +13,10 @@ namespace tensor_operation {
 namespace device {
 namespace instance {

-static constexpr index_t RANK = 3;
-
 void add_device_softmax_i8_i8_rank3_reduce1_instances(
-    std::vector<DeviceSoftmaxPtr<I8, F32, I8, PassThrough, PassThrough, RANK>>& instances)
+    std::vector<DeviceSoftmaxPtr<I8, F32, I8, PassThrough, PassThrough, 3, 1>>& instances)
 {
-    add_device_operation_instances(instances, device_softmax_i8_i8_instances<RANK, 1>{});
+    add_device_operation_instances(instances, device_softmax_i8_i8_instances<3, 1>{});
 }

 } // namespace instance
--- a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank3_reduce2.cpp
+++ b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank3_reduce2.cpp
@@ -13,12 +13,10 @@ namespace tensor_operation {
 namespace device {
 namespace instance {

-static constexpr index_t RANK = 3;
-
 void add_device_softmax_i8_i8_rank3_reduce2_instances(
-    std::vector<DeviceSoftmaxPtr<I8, F32, I8, PassThrough, PassThrough, RANK>>& instances)
+    std::vector<DeviceSoftmaxPtr<I8, F32, I8, PassThrough, PassThrough, 3, 2>>& instances)
 {
-    add_device_operation_instances(instances, device_softmax_i8_i8_instances<RANK, 2>{});
+    add_device_operation_instances(instances, device_softmax_i8_i8_instances<3, 2>{});
 }

 } // namespace instance
--- a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank3_reduce3.cpp
+++ b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank3_reduce3.cpp
@@ -13,12 +13,10 @@ namespace tensor_operation {
 namespace device {
 namespace instance {

-static constexpr index_t RANK = 3;
-
 void add_device_softmax_i8_i8_rank3_reduce3_instances(
-    std::vector<DeviceSoftmaxPtr<I8, F32, I8, PassThrough, PassThrough, RANK>>& instances)
+    std::vector<DeviceSoftmaxPtr<I8, F32, I8, PassThrough, PassThrough, 3, 3>>& instances)
 {
-    add_device_operation_instances(instances, device_softmax_i8_i8_instances<RANK, 3>{});
+    add_device_operation_instances(instances, device_softmax_i8_i8_instances<3, 3>{});
 }

 } // namespace instance
--- a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce1.cpp
+++ b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce1.cpp
@@ -13,12 +13,10 @@ namespace tensor_operation {
 namespace device {
 namespace instance {

-static constexpr index_t RANK = 4;
-
 void add_device_softmax_i8_i8_rank4_reduce1_instances(
-    std::vector<DeviceSoftmaxPtr<I8, F32, I8, PassThrough, PassThrough, RANK>>& instances)
+    std::vector<DeviceSoftmaxPtr<I8, F32, I8, PassThrough, PassThrough, 4, 1>>& instances)
 {
-    add_device_operation_instances(instances, device_softmax_i8_i8_instances<RANK, 1>{});
+    add_device_operation_instances(instances, device_softmax_i8_i8_instances<4, 1>{});
 }

 } // namespace instance
--- a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce2.cpp
+++ b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce2.cpp
@@ -13,12 +13,10 @@ namespace tensor_operation {
 namespace device {
 namespace instance {

-static constexpr index_t RANK = 4;
-
 void add_device_softmax_i8_i8_rank4_reduce2_instances(
-    std::vector<DeviceSoftmaxPtr<I8, F32, I8, PassThrough, PassThrough, RANK>>& instances)
+    std::vector<DeviceSoftmaxPtr<I8, F32, I8, PassThrough, PassThrough, 4, 2>>& instances)
 {
-    add_device_operation_instances(instances, device_softmax_i8_i8_instances<RANK, 2>{});
+    add_device_operation_instances(instances, device_softmax_i8_i8_instances<4, 2>{});
 }

 } // namespace instance
--- a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce3.cpp
+++ b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce3.cpp
@@ -13,12 +13,10 @@ namespace tensor_operation {
 namespace device {
 namespace instance {

-static constexpr index_t RANK = 4;
-
 void add_device_softmax_i8_i8_rank4_reduce3_instances(
-    std::vector<DeviceSoftmaxPtr<I8, F32, I8, PassThrough, PassThrough, RANK>>& instances)
+    std::vector<DeviceSoftmaxPtr<I8, F32, I8, PassThrough, PassThrough, 4, 3>>& instances)
 {
-    add_device_operation_instances(instances, device_softmax_i8_i8_instances<RANK, 3>{});
+    add_device_operation_instances(instances, device_softmax_i8_i8_instances<4, 3>{});
 }

 } // namespace instance
--- a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce4.cpp
+++ b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce4.cpp
@@ -13,12 +13,10 @@ namespace tensor_operation {
 namespace device {
 namespace instance {

-static constexpr index_t RANK = 4;
-
 void add_device_softmax_i8_i8_rank4_reduce4_instances(
-    std::vector<DeviceSoftmaxPtr<I8, F32, I8, PassThrough, PassThrough, RANK>>& instances)
+    std::vector<DeviceSoftmaxPtr<I8, F32, I8, PassThrough, PassThrough, 4, 4>>& instances)
 {
-    add_device_operation_instances(instances, device_softmax_i8_i8_instances<RANK, 4>{});
+    add_device_operation_instances(instances, device_softmax_i8_i8_instances<4, 4>{});
 }

 } // namespace instance
--- a/profiler/include/profiler/profile_softmax_impl.hpp
+++ b/profiler/include/profiler/profile_softmax_impl.hpp
@@ -40,7 +40,11 @@ template <> std::string type_to_string<int8_t>()  { return "int8"; }
 template <> std::string type_to_string<int32_t>() { return "int32"; }
 // clang-format on

-template <typename InDataType, typename AccDataType, typename OutDataType, index_t Rank>
+template <typename InDataType,
+          typename AccDataType,
+          typename OutDataType,
+          index_t Rank,
+          index_t NumReduceDim>
 bool profile_softmax_impl(int do_verification,
                          int init_method,
                          bool do_log,
@@ -54,7 +58,13 @@ bool profile_softmax_impl(int do_verification,
    if(Rank != in_length.size())
    {
        throw std::runtime_error("Input tensor rank is different from template argument Rank!");
    }
+
+    if(NumReduceDim != reduce_dims.size())
+    {
+        throw std::runtime_error(
+            "Input reduce_dims rank is different from template argument NumReduceDim!");
+    }

    Tensor<InDataType> in = in_strides.empty() ? Tensor<InDataType>(in_length)
                                               : Tensor<InDataType>(in_length, in_strides);
@@ -92,8 +102,13 @@ bool profile_softmax_impl(int do_verification,

    // add device softmax instances
    using PassThrough = ck::tensor_operation::element_wise::PassThrough;
-    using DeviceOp    = tensor_operation::device::
-        DeviceSoftmax<InDataType, AccDataType, OutDataType, PassThrough, PassThrough, Rank>;
+    using DeviceOp    = tensor_operation::device::DeviceSoftmax<InDataType,
+                                                             AccDataType,
+                                                             OutDataType,
+                                                             PassThrough,
+                                                             PassThrough,
+                                                             Rank,
+                                                             NumReduceDim>;

    // get device op instances
    const auto instances = tensor_operation::device::instance::DeviceOperationInstanceFactory<
@@ -112,13 +127,6 @@ bool profile_softmax_impl(int do_verification,

    for(auto& inst_ptr : instances)
    {
-        // Is this user's responsibility to check if problem mismatches kernel instance (ie. rank 3
-        // problem to rank 4 kernel) other than invoking IsSupportedArgument()?
-        if(!(inst_ptr->GetNumReduceDim() == static_cast<index_t>(reduce_dims.size())))
-        {
-            continue;
-        }
-
        auto argument_ptr = inst_ptr->MakeArgumentPointer(in_tensor_lengths,
                                                          in_tensor_strides,
                                                          reduce_dims,
--- a/profiler/src/profile_softmax.cpp
+++ b/profiler/src/profile_softmax.cpp
@@ -92,27 +92,76 @@ int profile_softmax(int argc, char* argv[])
    {
        if(data_type == SoftmaxDataType::F16_F16)
        {
-            ck::profiler::profile_softmax_impl<ck::half_t, float, ck::half_t, 3>(do_verification,
-                                                                                 init_method,
-                                                                                 do_log,
-                                                                                 time_kernel,
-                                                                                 length,
-                                                                                 stride,
-                                                                                 reduce,
-                                                                                 double(alpha),
-                                                                                 double(beta));
+            if(reduce.size() == 1)
+                ck::profiler::profile_softmax_impl<ck::half_t, float, ck::half_t, 3, 1>(
+                    do_verification,
+                    init_method,
+                    do_log,
+                    time_kernel,
+                    length,
+                    stride,
+                    reduce,
+                    double(alpha),
+                    double(beta));
+            else if(reduce.size() == 2)
+                ck::profiler::profile_softmax_impl<ck::half_t, float, ck::half_t, 3, 2>(
+                    do_verification,
+                    init_method,
+                    do_log,
+                    time_kernel,
+                    length,
+                    stride,
+                    reduce,
+                    double(alpha),
+                    double(beta));
+            else if(reduce.size() == 3)
+                ck::profiler::profile_softmax_impl<ck::half_t, float, ck::half_t, 3, 3>(
+                    do_verification,
+                    init_method,
+                    do_log,
+                    time_kernel,
+                    length,
+                    stride,
+                    reduce,
+                    double(alpha),
+                    double(beta));
+            else
+                throw std::runtime_error("invalid number of dimensions to reduce");
        }
        else if(data_type == SoftmaxDataType::F32_F32)
        {
-            ck::profiler::profile_softmax_impl<float, float, float, 3>(do_verification,
-                                                                       init_method,
-                                                                       do_log,
-                                                                       time_kernel,
-                                                                       length,
-                                                                       stride,
-                                                                       reduce,
-                                                                       double(alpha),
-                                                                       double(beta));
+            if(reduce.size() == 1)
+                ck::profiler::profile_softmax_impl<float, float, float, 3, 1>(do_verification,
+                                                                              init_method,
+                                                                              do_log,
+                                                                              time_kernel,
+                                                                              length,
+                                                                              stride,
+                                                                              reduce,
+                                                                              double(alpha),
+                                                                              double(beta));
+            else if(reduce.size() == 2)
+                ck::profiler::profile_softmax_impl<float, float, float, 3, 2>(do_verification,
+                                                                              init_method,
+                                                                              do_log,
+                                                                              time_kernel,
+                                                                              length,
+                                                                              stride,
+                                                                              reduce,
+                                                                              double(alpha),
+                                                                              double(beta));
+            else if(reduce.size() == 3)
+                ck::profiler::profile_softmax_impl<float, float, float, 3, 3>(do_verification,
+                                                                              init_method,
+                                                                              do_log,
+                                                                              time_kernel,
+                                                                              length,
+                                                                              stride,
+                                                                              reduce,
+                                                                              double(alpha),
+                                                                              double(beta));
+            else
+                throw std::runtime_error("invalid number of dimensions to reduce");
        }
        else
        {
@@ -124,27 +173,97 @@ int profile_softmax(int argc, char* argv[])
    {
        if(data_type == SoftmaxDataType::F16_F16)
        {
-            ck::profiler::profile_softmax_impl<ck::half_t, float, ck::half_t, 4>(do_verification,
-                                                                                 init_method,
-                                                                                 do_log,
-                                                                                 time_kernel,
-                                                                                 length,
-                                                                                 stride,
-                                                                                 reduce,
-                                                                                 double(alpha),
-                                                                                 double(beta));
+            if(reduce.size() == 1)
+                ck::profiler::profile_softmax_impl<ck::half_t, float, ck::half_t, 4, 1>(
+                    do_verification,
+                    init_method,
+                    do_log,
+                    time_kernel,
+                    length,
+                    stride,
+                    reduce,
+                    double(alpha),
+                    double(beta));
+            else if(reduce.size() == 2)
+                ck::profiler::profile_softmax_impl<ck::half_t, float, ck::half_t, 4, 2>(
+                    do_verification,
+                    init_method,
+                    do_log,
+                    time_kernel,
+                    length,
+                    stride,
+                    reduce,
+                    double(alpha),
+                    double(beta));
+            else if(reduce.size() == 3)
+                ck::profiler::profile_softmax_impl<ck::half_t, float, ck::half_t, 4, 3>(
+                    do_verification,
+                    init_method,
+                    do_log,
+                    time_kernel,
+                    length,
+                    stride,
+                    reduce,
+                    double(alpha),
+                    double(beta));
+            else if(reduce.size() == 4)
+                ck::profiler::profile_softmax_impl<ck::half_t, float, ck::half_t, 4, 4>(
+                    do_verification,
+                    init_method,
+                    do_log,
+                    time_kernel,
+                    length,
+                    stride,
+                    reduce,
+                    double(alpha),
+                    double(beta));
+            else
+                throw std::runtime_error("invalid number of dimensions to reduce");
        }
        else if(data_type == SoftmaxDataType::F32_F32)
        {
-            ck::profiler::profile_softmax_impl<float, float, float, 4>(do_verification,
-                                                                       init_method,
-                                                                       do_log,
-                                                                       time_kernel,
-                                                                       length,
-                                                                       stride,
-                                                                       reduce,
-                                                                       double(alpha),
-                                                                       double(beta));
+            if(reduce.size() == 1)
+                ck::profiler::profile_softmax_impl<float, float, float, 4, 1>(do_verification,
+                                                                              init_method,
+                                                                              do_log,
+                                                                              time_kernel,
+                                                                              length,
+                                                                              stride,
+                                                                              reduce,
+                                                                              double(alpha),
+                                                                              double(beta));
+            else if(reduce.size() == 2)
+                ck::profiler::profile_softmax_impl<float, float, float, 4, 2>(do_verification,
+                                                                              init_method,
+                                                                              do_log,
+                                                                              time_kernel,
+                                                                              length,
+                                                                              stride,
+                                                                              reduce,
+                                                                              double(alpha),
+                                                                              double(beta));
+            else if(reduce.size() == 3)
+                ck::profiler::profile_softmax_impl<float, float, float, 4, 3>(do_verification,
+                                                                              init_method,
+                                                                              do_log,
+                                                                              time_kernel,
+                                                                              length,
+                                                                              stride,
+                                                                              reduce,
+                                                                              double(alpha),
+                                                                              double(beta));
+            else if(reduce.size() == 4)
+                ck::profiler::profile_softmax_impl<float, float, float, 4, 4>(do_verification,
+                                                                              init_method,
+                                                                              do_log,
+                                                                              time_kernel,
+                                                                              length,
+                                                                              stride,
+                                                                              reduce,
+                                                                              double(alpha),
+                                                                              double(beta));
+            else
+                throw std::runtime_error("invalid number of dimensions to reduce");
        }
        else
        {
--- a/test/softmax/test_softmax_util.hpp
+++ b/test/softmax/test_softmax_util.hpp
@@ -61,8 +61,92 @@ class TestSoftmax : public ::testing::Test
        int init_method = 1; // integer value initialization
        bool log        = false;
        std::vector<ck::index_t> strides; // intentionally empty, to get packed layout.
-        bool pass = ck::profiler::profile_softmax_impl<InDataType, AccDataType, OutDataType, Rank>(
-            verify_, init_method, log, bench_, in_length, strides, reduce_dims, alpha, beta);
+        bool pass = false;
+
+        if constexpr(Rank == 3)
+        {
+            if(reduce_dims.size() == 1)
+                pass = ck::profiler::
+                    profile_softmax_impl<InDataType, AccDataType, OutDataType, Rank, 1>(verify_,
+                                                                                        init_method,
+                                                                                        log,
+                                                                                        bench_,
+                                                                                        in_length,
+                                                                                        strides,
+                                                                                        reduce_dims,
+                                                                                        alpha,
+                                                                                        beta);
+            else if(reduce_dims.size() == 2)
+                pass = ck::profiler::
+                    profile_softmax_impl<InDataType, AccDataType, OutDataType, Rank, 2>(verify_,
+                                                                                        init_method,
+                                                                                        log,
+                                                                                        bench_,
+                                                                                        in_length,
+                                                                                        strides,
+                                                                                        reduce_dims,
+                                                                                        alpha,
+                                                                                        beta);
+            else if(reduce_dims.size() == 3)
+                pass = ck::profiler::
+                    profile_softmax_impl<InDataType, AccDataType, OutDataType, Rank, 3>(verify_,
+                                                                                        init_method,
+                                                                                        log,
+                                                                                        bench_,
+                                                                                        in_length,
+                                                                                        strides,
+                                                                                        reduce_dims,
+                                                                                        alpha,
+                                                                                        beta);
+        }
+        else if constexpr(Rank == 4)
+        {
+            if(reduce_dims.size() == 1)
+                pass = ck::profiler::
+                    profile_softmax_impl<InDataType, AccDataType, OutDataType, Rank, 1>(verify_,
+                                                                                        init_method,
+                                                                                        log,
+                                                                                        bench_,
+                                                                                        in_length,
+                                                                                        strides,
+                                                                                        reduce_dims,
+                                                                                        alpha,
+                                                                                        beta);
+            else if(reduce_dims.size() == 2)
+                pass = ck::profiler::
+                    profile_softmax_impl<InDataType, AccDataType, OutDataType, Rank, 2>(verify_,
+                                                                                        init_method,
+                                                                                        log,
+                                                                                        bench_,
+                                                                                        in_length,
+                                                                                        strides,
+                                                                                        reduce_dims,
+                                                                                        alpha,
+                                                                                        beta);
+            else if(reduce_dims.size() == 3)
+                pass = ck::profiler::
+                    profile_softmax_impl<InDataType, AccDataType, OutDataType, Rank, 3>(verify_,
+                                                                                        init_method,
+                                                                                        log,
+                                                                                        bench_,
+                                                                                        in_length,
+                                                                                        strides,
+                                                                                        reduce_dims,
+                                                                                        alpha,
+                                                                                        beta);
+            else if(reduce_dims.size() == 4)
+                pass = ck::profiler::
+                    profile_softmax_impl<InDataType, AccDataType, OutDataType, Rank, 4>(verify_,
+                                                                                        init_method,
+                                                                                        log,
+                                                                                        bench_,
+                                                                                        in_length,
+                                                                                        strides,
+                                                                                        reduce_dims,
+                                                                                        alpha,
+                                                                                        beta);
+        };
+
        EXPECT_TRUE(pass);
    }