Mirror of https://github.com/ROCm/composable_kernel.git (synced 2026-05-14 02:02:46 +00:00)
Softmax client example (#396)
* Update Softmax device operation interface.
* Update ckProfiler.
* Update Softmax UT.
* Update example.
* Client example.
* Clang format
Co-authored-by: Adam Osewski <aosewski@amd.com>
[ROCm/composable_kernel commit: 3da5c19e62]
client_example/06_softmax/CMakeLists.txt (new file, +2)
@@ -0,0 +1,2 @@
add_executable(client_softmax4d softmax4d.cpp)
target_link_libraries(client_softmax4d PRIVATE composable_kernel::device_operations)

client_example/06_softmax/softmax4d.cpp (new file, +150)
@@ -0,0 +1,150 @@
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.

#include <functional>
#include <numeric>
#include <iomanip>
#include <iostream>
#include <vector>

#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/tensor_operation/gpu/device/device_softmax.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"

#include "ck/library/tensor_operation_instance/gpu/softmax.hpp"

using InDataType  = ck::half_t;
using OutDataType = ck::half_t;
using AccDataType = float;
using PassThrough = ck::tensor_operation::element_wise::PassThrough;

constexpr int Rank         = 4;
constexpr int NumReduceDim = 2;

struct SimpleDeviceMem
{
    SimpleDeviceMem() = delete;

    SimpleDeviceMem(std::size_t mem_size) : p_mem_{}
    {
        (void)hipMalloc(static_cast<void**>(&p_mem_), mem_size);
    }

    void* GetDeviceBuffer() { return p_mem_; }

    ~SimpleDeviceMem() { (void)hipFree(p_mem_); }

    void* p_mem_;
};

int main(int argc, char* argv[])
{
    std::vector<ck::index_t> in_lengths{2, 8, 128, 1024};
    std::vector<ck::index_t> in_strides{8 * 128 * 1024, 128 * 1024, 1024, 1};
    std::vector<ck::index_t> reduce_dims{2, 3};

    ck::index_t num_elements =
        std::accumulate(in_lengths.begin(), in_lengths.end(), 1, std::multiplies<ck::index_t>());

    AccDataType alpha{2.0f};
    AccDataType beta{2.0f};

    SimpleDeviceMem in(sizeof(InDataType) * num_elements);
    SimpleDeviceMem out(sizeof(OutDataType) * num_elements);

    using DeviceOp = ck::tensor_operation::device::
        DeviceSoftmax<InDataType, AccDataType, OutDataType, PassThrough, PassThrough, Rank>;
    // get device op instances
    const auto op_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory<
        DeviceOp>::GetInstances();

    std::cout << "found " << op_ptrs.size() << " instances" << std::endl;

    std::string best_op_name;
    bool found            = false;
    int best_op_id        = -1;
    float best_ave_time   = std::numeric_limits<float>::max();
    float best_gb_per_sec = 0;

    // profile device operation instances
    std::cout << "Run all instances and do timing" << std::endl;

    for(int i = 0; i < op_ptrs.size(); ++i)
    {
        auto& op_ptr = op_ptrs[i];

        if(op_ptr->GetRank() != Rank || op_ptr->GetNumReduceDim() != NumReduceDim)
        {
            continue;
        }

        auto argument_ptr = op_ptr->MakeArgumentPointer(in_lengths,
                                                        in_strides,
                                                        reduce_dims,
                                                        &alpha,
                                                        &beta,
                                                        in.GetDeviceBuffer(),
                                                        out.GetDeviceBuffer(),
                                                        PassThrough{},
                                                        PassThrough{});
        auto invoker_ptr    = op_ptr->MakeInvokerPointer();
        std::string op_name = op_ptr->GetTypeString();

        if(op_ptr->IsSupportedArgument(argument_ptr.get()))
        {
            float ave_time = invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, true});

            std::size_t num_bytes = num_elements * sizeof(InDataType) +
                                    (beta == 0.0f ? 1 : 2) * num_elements * sizeof(OutDataType);

            float gb_per_sec = num_bytes / 1.E6 / ave_time;

            std::cout << "Perf: " << std::setw(10) << ave_time << " ms, " << gb_per_sec << " GB/s, "
                      << op_name << std::endl;

            if(ave_time < best_ave_time)
            {
                found           = true;
                best_op_id      = i;
                best_op_name    = op_name;
                best_ave_time   = ave_time;
                best_gb_per_sec = gb_per_sec;
            }
        }
        else
        {
            std::cout << op_name << " does not support this problem" << std::endl;
        }
    }

    std::cout << "Best Perf: " << best_ave_time << " ms, " << best_gb_per_sec << " GB/s, "
              << best_op_name << std::endl;

    // run the best instance
    {
        auto& op_ptr = op_ptrs[best_op_id];
        std::cout << "Run the best instance without timing: " << op_ptr->GetTypeString()
                  << std::endl;
        auto argument_ptr = op_ptr->MakeArgumentPointer(in_lengths,
                                                        in_strides,
                                                        reduce_dims,
                                                        &alpha,
                                                        &beta,
                                                        in.GetDeviceBuffer(),
                                                        out.GetDeviceBuffer(),
                                                        PassThrough{},
                                                        PassThrough{});

        auto invoker_ptr = op_ptr->MakeInvokerPointer();

        if(op_ptr->IsSupportedArgument(argument_ptr.get()))
        {
            invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, false});
        }

        std::cout << "Done" << std::endl;
    }

    return 0;
}

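Note on the example above: the num_bytes formula counts one extra pass over `out` when beta is nonzero, which matches the usual CK convention that the instances compute out = alpha * softmax(in) + beta * out_prev, with the softmax taken jointly over reduce_dims {2, 3}. A minimal host-side sketch of that per-row math, assuming this alpha/beta convention (illustration only, not part of the CK API):

// Reference softmax for one reduced row; y carries the prior output that beta blends in.
#include <algorithm>
#include <cmath>
#include <vector>

void softmax_row(const std::vector<float>& x, std::vector<float>& y, float alpha, float beta)
{
    const float m = *std::max_element(x.begin(), x.end()); // subtract the row max for stability
    std::vector<float> e(x.size());
    float sum = 0.f;
    for(std::size_t i = 0; i < x.size(); ++i)
    {
        e[i] = std::exp(x[i] - m);
        sum += e[i];
    }
    for(std::size_t i = 0; i < x.size(); ++i)
        y[i] = alpha * (e[i] / sum) + beta * y[i];
}
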
@@ -11,3 +11,4 @@ add_subdirectory(02_gemm_add_add_fastgelu)
add_subdirectory(03_gemm_layernorm)
add_subdirectory(04_contraction)
add_subdirectory(05_layernorm)
add_subdirectory(06_softmax)

@@ -9,37 +9,41 @@

#include "ck/ck.hpp"
#include "ck/utility/reduction_enums.hpp"
#include "ck/tensor_operation/gpu/device/device_softmax.hpp"
#include "ck/tensor_operation/gpu/device/impl/device_softmax_impl.hpp"
#include "ck/tensor_operation/gpu/device/reduction_operator_mapping.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"

#include "ck/library/utility/check_err.hpp"
#include "ck/library/utility/device_memory.hpp"
#include "ck/library/utility/host_common_util.hpp"
#include "ck/library/reference_tensor_operation/cpu/reference_softmax.hpp"

using namespace ck;
using namespace ck::tensor_operation::device;

using InDataType  = ck::half_t;
using OutDataType = ck::half_t;
using AccDataType = float;

using PassThrough = ck::tensor_operation::element_wise::PassThrough;

constexpr int Rank         = 3;
constexpr int NumReduceDim = 1;

using DeviceInstance = DeviceSoftmax<InDataType,
                                     AccDataType,
                                     OutDataType,
                                     Rank,
                                     NumReduceDim,
                                     256, // BlockSize
                                     8,   // ClusterM
                                     32,  // ClusterK
                                     1,   // SliceM
                                     8,   // SliceK
                                     1,   // SrcVecDim (0=M, 1=K)
                                     8,   // SrcScalarPerVector
                                     8>;  // OutScalarPerVector
using DeviceInstance = DeviceSoftmaxImpl<InDataType,
                                         AccDataType,
                                         OutDataType,
                                         PassThrough, // InElementwiseOperation
                                         PassThrough, // AccElementwiseOperation
                                         Rank,
                                         NumReduceDim,
                                         256, // BlockSize
                                         8,   // ClusterM
                                         32,  // ClusterK
                                         1,   // SliceM
                                         8,   // SliceK
                                         1,   // SrcVecDim (0=M, 1=K)
                                         8,   // SrcScalarPerVector
                                         8>;  // OutScalarPerVector

static struct option long_options[] = {{"inLengths", required_argument, nullptr, 'D'},
                                       {"verify", required_argument, nullptr, 'v'},
@@ -196,7 +200,7 @@ int main(int argc, char* argv[])
    if(args.do_verification)
    {
        using ReferenceInstance =
            tensor_operation::host::ReferenceSoftmax<InDataType, OutDataType, AccDataType>;
            ck::tensor_operation::host::ReferenceSoftmax<InDataType, OutDataType, AccDataType>;
        ReferenceInstance ref;
        auto ref_arg = ref.MakeArgument(in, out_ref, alpha, beta, reduceDims);
        auto invoker = ref.MakeInvoker();
@@ -220,7 +224,9 @@ int main(int argc, char* argv[])
                                                        &alpha,
                                                        &beta,
                                                        in_dev.GetDeviceBuffer(),
                                                        out_dev.GetDeviceBuffer());
                                                        out_dev.GetDeviceBuffer(),
                                                        PassThrough{},
                                                        PassThrough{});

    if(!device_instance.IsSupportedArgument(argument_ptr.get()))
    {

@@ -3,19 +3,10 @@

#pragma once

#include <iostream>
#include <sstream>
#include <memory>
#include <vector>

#include "ck/utility/reduction_operator.hpp"
#include "ck/tensor_operation/gpu/device/device_base.hpp"
#include "ck/tensor_operation/gpu/device/device_reduce.hpp"
#include "ck/tensor_operation/gpu/device/device_normalization.hpp"
#include "ck/tensor_operation/gpu/device/device_reduce_multiblock.hpp"
#include "ck/tensor_operation/gpu/device/device_reduce_common.hpp"
#include "ck/tensor_operation/gpu/grid/gridwise_softmax.hpp"
#include "ck/tensor_operation/gpu/grid/gridwise_set_buffer_value.hpp"
#include "ck/host_utility/device_prop.hpp"
#include "ck/host_utility/kernel_launch.hpp"

namespace ck {
namespace tensor_operation {
@@ -24,227 +15,54 @@ namespace device {
template <typename InDataType,
          typename AccDataType,
          typename OutDataType,
          index_t Rank,
          index_t NumReduceDim,
          index_t BlockSize,
          index_t MThreadClusterSize,
          index_t KThreadClusterSize,
          index_t MThreadSliceSize,
          index_t KThreadSliceSize,
          index_t InSrcVectorDim,
          index_t InSrcVectorSize,
          index_t OutDstVectorSize>
struct DeviceSoftmax : public DeviceNormalization
          typename InElementwiseOp,
          typename AccElementwiseOp,
          index_t Rank>
struct DeviceSoftmax : public BaseOperator
{
    static constexpr index_t kRank         = Rank;
    static constexpr index_t kNumReduceDim = NumReduceDim;
    //
    // @brief      Makes a pointer to Argument class.
    //
    // @param[in]  inLengths           Input tensor extent(s) from high to low dimension
    // @param[in]  inStrides           Input tensor stride(s) from high to low dimension
    // @param[in]  reduceDims          The dimension(s) the normalization operation is applied to
    // @param[in]  alpha               Typeless pointer in host memory storing the alpha scaling
    //                                 value as type AccDataType
    // @param[in]  beta                Typeless pointer in host memory storing the beta scaling
    //                                 value as type AccDataType
    // @param[in]  in_dev              Typeless const pointer in device memory storing the input
    //                                 tensor
    // @param      out_dev             Typeless pointer in device memory storing the output tensor
    // @param[in]  in_elementwise_op   The input elementwise operation.
    // @param[in]  acc_elementwise_op  The accumulation elementwise operation.
    //
    // @return     Unique pointer to the Argument class.
    //
    virtual std::unique_ptr<BaseArgument>
    MakeArgumentPointer(const std::vector<index_t> inLengths,
                        const std::vector<index_t> inStrides,
                        const std::vector<int> reduceDims,
                        const void* alpha,
                        const void* beta,
                        const void* in_dev,
                        void* out_dev,
                        InElementwiseOp in_elementwise_op,
                        AccElementwiseOp acc_elementwise_op) = 0;

    virtual index_t GetRank() const override { return kRank; }

    virtual index_t GetNumReduceDim() const override { return kNumReduceDim; }

    using PassThrough = tensor_operation::element_wise::PassThrough;

    // Used for freeloading of some handy functions from DeviceReduceMultiBlock
    using Reduction = DeviceReduceMultiBlock<InDataType,
                                             AccDataType,
                                             OutDataType,
                                             Rank,
                                             NumReduceDim,
                                             reduce::Add,
                                             PassThrough, // InElementwiseOperation
                                             PassThrough, // AccElementwiseOperation
                                             InMemoryDataOperationEnum::Set,
                                             false, // PropagateNan
                                             false, // OutputIndex
                                             false, // HaveIndexInputIfOutputIndex
                                             BlockSize,
                                             MThreadClusterSize,
                                             KThreadClusterSize,
                                             MThreadSliceSize,
                                             KThreadSliceSize,
                                             InSrcVectorDim,
                                             InSrcVectorSize,
                                             1>; // OutDstVectorSize

    using GridDesc_M_K = decltype(Reduction::MakeSrc2dDescriptor({1}, {1}, 1, 1));

    using GridwiseSoftmaxGeneric = GridwiseSoftmax_mk_to_mk<InDataType,
                                                            OutDataType,
                                                            AccDataType,
                                                            GridDesc_M_K,
                                                            BlockSize,
                                                            MThreadClusterSize,
                                                            KThreadClusterSize,
                                                            MThreadSliceSize,
                                                            KThreadSliceSize,
                                                            InSrcVectorDim,
                                                            InSrcVectorSize,
                                                            OutDstVectorSize,
                                                            false>;

    using GridwiseSoftmaxSweepOnce = GridwiseSoftmax_mk_to_mk<InDataType,
                                                              OutDataType,
                                                              AccDataType,
                                                              GridDesc_M_K,
                                                              BlockSize,
                                                              MThreadClusterSize,
                                                              KThreadClusterSize,
                                                              MThreadSliceSize,
                                                              KThreadSliceSize,
                                                              InSrcVectorDim,
                                                              InSrcVectorSize,
                                                              OutDstVectorSize,
                                                              true>;

    struct Argument : public Reduction::Argument
    {
        Argument(const std::vector<index_t> inLengths,
                 const std::vector<index_t> inStrides,
                 const std::vector<index_t> reduceDims,
                 AccDataType alpha,
                 AccDataType beta,
                 const InDataType* in_dev,
                 OutDataType* out_dev)
            : Reduction::Argument(inLengths,
                                  inStrides,
                                  {},
                                  {},
                                  reduceDims,
                                  0.0f, // alpha
                                  0.0f, // beta
                                  in_dev,
                                  nullptr,
                                  out_dev,
                                  nullptr,
                                  PassThrough{},
                                  PassThrough{}),
              // FIXME: The base class DeviceReduceMultiBlock::Argument only supports alpha/beta of
              // float32 precision. Make it support any data type so the fields can be removed.
              alpha_(alpha),
              beta_(beta)
        {
            // std::cout << "blkGroupSize= " << this->blkGroupSize
            //           << ", numBlockTileIteration= " << this->numBlockTileIteration
            //           << ", gridSize=" << this->gridSize
            //           << ", invariant_total_length=" << this->invariant_total_length <<
            //           std::endl;
        }

        AccDataType alpha_;
        AccDataType beta_;
    };

    struct Invoker : public BaseInvoker
    {
        float Run(const Argument& arg, const StreamConfig& stream_config = StreamConfig{})
        {
            const auto in_grid_desc_m_k = Reduction::MakeSrc2dDescriptor(
                arg.inLengths_, arg.inStrides_, arg.blkGroupSize, arg.numBlockTileIteration);
            const auto out_grid_desc_m_k = Reduction::MakeSrc2dDescriptor(
                arg.inLengths_, arg.inStrides_, arg.blkGroupSize, arg.numBlockTileIteration);

            bool sweep_once =
                in_grid_desc_m_k.GetLength(Number<1>{}) <= KThreadClusterSize * KThreadSliceSize;

            const auto kernel_main = sweep_once ? kernel_softmax<GridwiseSoftmaxSweepOnce,
                                                                 InDataType,
                                                                 OutDataType,
                                                                 AccDataType,
                                                                 GridDesc_M_K>
                                                : kernel_softmax<GridwiseSoftmaxGeneric,
                                                                 InDataType,
                                                                 OutDataType,
                                                                 AccDataType,
                                                                 GridDesc_M_K>;

            float avg_time = 0;

            avg_time += launch_and_time_kernel(stream_config,
                                               kernel_main,
                                               dim3(arg.gridSize),
                                               dim3(BlockSize),
                                               0,
                                               in_grid_desc_m_k,
                                               out_grid_desc_m_k,
                                               arg.blkGroupSize,
                                               arg.numBlockTileIteration,
                                               arg.alpha_,
                                               arg.in_dev_,
                                               arg.beta_,
                                               arg.out_dev_);

            return (avg_time);
        };

        float Run(const BaseArgument* p_arg,
                  const StreamConfig& stream_config = StreamConfig{}) override
        {
            return Run(*dynamic_cast<const Argument*>(p_arg), stream_config);
        };
    };

    bool IsSupportedArgument(const BaseArgument* p_arg) override
    {
        const Argument* p_arg_ = dynamic_cast<const Argument*>(p_arg);

        if(!Reduction::IsSupportedArgument(p_arg_))
        {
            return false;
        }

        if(p_arg_->inLengths_[Rank - 1] % OutDstVectorSize != 0)
        {
            return false;
        }

        return true;
    };

    // inLengths: input tensor extent(s) from high to low dimension
    // inStrides: input tensor stride(s) from high to low dimension
    // reduceDims: the dimension(s) the softmax normalization operates on
    // alpha: typeless pointer in host memory storing the alpha scaling value as type AccDataType
    // beta: typeless pointer in host memory storing the beta scaling value as type AccDataType
    // in_dev: typeless const pointer in device memory storing the input tensor
    // out_dev: typeless pointer in device memory storing the output tensor
    std::unique_ptr<BaseArgument> MakeArgumentPointer(const std::vector<index_t> inLengths,
                                                      const std::vector<index_t> inStrides,
                                                      const std::vector<int> reduceDims,
                                                      const void* alpha,
                                                      const void* beta,
                                                      const void* in_dev,
                                                      void* out_dev) override
    {
        return std::make_unique<Argument>(inLengths,
                                          inStrides,
                                          reduceDims,
                                          *static_cast<const AccDataType*>(alpha),
                                          *static_cast<const AccDataType*>(beta),
                                          static_cast<const InDataType*>(in_dev),
                                          static_cast<OutDataType*>(out_dev));
    };

    std::unique_ptr<BaseInvoker> MakeInvokerPointer() override
    {
        return std::make_unique<Invoker>();
    };

    std::string GetTypeString() const override
    {
        auto str = std::stringstream();

        // clang-format off
        str << "DeviceReduceSoftmax<" << BlockSize << ",";
        str << "M_C" << MThreadClusterSize << "_S" << MThreadSliceSize << ",";
        str << "K_C" << KThreadClusterSize << "_S" << KThreadSliceSize << ",";
        str << "InSrcVectorDim_" << InSrcVectorDim << "_InSrcVectorSize_" << InSrcVectorSize << "_OutDstVectorSize_" << OutDstVectorSize << ">";
        // clang-format on

        return str.str();
    }
    virtual std::unique_ptr<BaseInvoker> MakeInvokerPointer() = 0;
    virtual index_t GetRank() const           = 0;
    virtual index_t GetNumReduceDim() const   = 0;
};

template <typename InDataType,
          typename AccDataType,
          typename OutDataType,
          typename InElementwiseOp,
          typename AccElementwiseOp,
          index_t Rank>
using DeviceSoftmaxPtr = std::unique_ptr<
    DeviceSoftmax<InDataType, AccDataType, OutDataType, InElementwiseOp, AccElementwiseOp, Rank>>;

} // namespace device
} // namespace tensor_operation
} // namespace ck

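The hunk above reduces device_softmax.hpp to a thin interface: DeviceSoftmax now derives from BaseOperator and declares only pure virtuals (MakeArgumentPointer, MakeInvokerPointer, GetRank, GetNumReduceDim), while all tuning parameters move into DeviceSoftmaxImpl below. Callers can then be written against the interface alone; a sketch of the resulting call pattern, reusing the signatures declared above (the helper function itself is hypothetical, not repository code):

using PassThrough = ck::tensor_operation::element_wise::PassThrough;
using DeviceOp    = ck::tensor_operation::device::
    DeviceSoftmax<ck::half_t, float, ck::half_t, PassThrough, PassThrough, 4>;

// Runs whichever DeviceSoftmaxImpl specialization stands behind `op`.
float run(DeviceOp& op,
          const std::vector<ck::index_t>& lengths,
          const std::vector<ck::index_t>& strides,
          const std::vector<int>& reduce_dims,
          const float* alpha,
          const float* beta,
          const void* in_dev,
          void* out_dev)
{
    auto arg = op.MakeArgumentPointer(
        lengths, strides, reduce_dims, alpha, beta, in_dev, out_dev, PassThrough{}, PassThrough{});

    if(!op.IsSupportedArgument(arg.get()))
        return -1.f; // caller should fall back to the next instance

    return op.MakeInvokerPointer()->Run(arg.get(), StreamConfig{nullptr, false});
}
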
@@ -0,0 +1,272 @@
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.

#pragma once

#include <iostream>
#include <sstream>

#include "ck/utility/reduction_operator.hpp"
#include "ck/tensor_operation/gpu/device/device_base.hpp"
#include "ck/tensor_operation/gpu/device/device_reduce.hpp"
#include "ck/tensor_operation/gpu/device/device_softmax.hpp"
#include "ck/tensor_operation/gpu/device/device_reduce_multiblock.hpp"
#include "ck/tensor_operation/gpu/device/device_reduce_common.hpp"
#include "ck/tensor_operation/gpu/grid/gridwise_softmax.hpp"
#include "ck/tensor_operation/gpu/grid/gridwise_set_buffer_value.hpp"
#include "ck/host_utility/device_prop.hpp"
#include "ck/host_utility/kernel_launch.hpp"

namespace ck {
namespace tensor_operation {
namespace device {

template <typename InDataType,
          typename AccDataType,
          typename OutDataType,
          typename InElementwiseOp,
          typename AccElementwiseOp,
          index_t Rank,
          index_t NumReduceDim,
          index_t BlockSize,
          index_t MThreadClusterSize,
          index_t KThreadClusterSize,
          index_t MThreadSliceSize,
          index_t KThreadSliceSize,
          index_t InSrcVectorDim,
          index_t InSrcVectorSize,
          index_t OutDstVectorSize>
struct DeviceSoftmaxImpl : public DeviceSoftmax<InDataType,
                                                AccDataType,
                                                OutDataType,
                                                InElementwiseOp,
                                                AccElementwiseOp,
                                                Rank>
{
    static constexpr index_t kRank         = Rank;
    static constexpr index_t kNumReduceDim = NumReduceDim;

    virtual index_t GetRank() const override { return kRank; }

    virtual index_t GetNumReduceDim() const override { return kNumReduceDim; }

    // Used for freeloading of some handy functions from DeviceReduceMultiBlock
    using Reduction = DeviceReduceMultiBlock<InDataType,
                                             AccDataType,
                                             OutDataType,
                                             Rank,
                                             NumReduceDim,
                                             reduce::Add,
                                             InElementwiseOp,
                                             AccElementwiseOp,
                                             InMemoryDataOperationEnum::Set,
                                             false, // PropagateNan
                                             false, // OutputIndex
                                             false, // HaveIndexInputIfOutputIndex
                                             BlockSize,
                                             MThreadClusterSize,
                                             KThreadClusterSize,
                                             MThreadSliceSize,
                                             KThreadSliceSize,
                                             InSrcVectorDim,
                                             InSrcVectorSize,
                                             1>; // OutDstVectorSize

    using GridDesc_M_K = decltype(Reduction::MakeSrc2dDescriptor({1}, {1}, 1, 1));

    using GridwiseSoftmaxGeneric = GridwiseSoftmax_mk_to_mk<InDataType,
                                                            OutDataType,
                                                            AccDataType,
                                                            GridDesc_M_K,
                                                            BlockSize,
                                                            MThreadClusterSize,
                                                            KThreadClusterSize,
                                                            MThreadSliceSize,
                                                            KThreadSliceSize,
                                                            InSrcVectorDim,
                                                            InSrcVectorSize,
                                                            OutDstVectorSize,
                                                            false>;

    using GridwiseSoftmaxSweepOnce = GridwiseSoftmax_mk_to_mk<InDataType,
                                                              OutDataType,
                                                              AccDataType,
                                                              GridDesc_M_K,
                                                              BlockSize,
                                                              MThreadClusterSize,
                                                              KThreadClusterSize,
                                                              MThreadSliceSize,
                                                              KThreadSliceSize,
                                                              InSrcVectorDim,
                                                              InSrcVectorSize,
                                                              OutDstVectorSize,
                                                              true>;

    struct Argument : public Reduction::Argument
    {
        Argument(const std::vector<index_t> inLengths,
                 const std::vector<index_t> inStrides,
                 const std::vector<index_t> reduceDims,
                 AccDataType alpha,
                 AccDataType beta,
                 const InDataType* in_dev,
                 OutDataType* out_dev,
                 InElementwiseOp in_elementwise_op,
                 AccElementwiseOp acc_elementwise_op)
            : Reduction::Argument(inLengths,
                                  inStrides,
                                  {},
                                  {},
                                  reduceDims,
                                  0.0f, // alpha
                                  0.0f, // beta
                                  in_dev,
                                  nullptr,
                                  out_dev,
                                  nullptr,
                                  in_elementwise_op,
                                  acc_elementwise_op),
              // FIXME: The base class DeviceReduceMultiBlock::Argument only supports alpha/beta of
              // float32 precision. Make it support any data type so the fields can be removed.
              alpha_(alpha),
              beta_(beta)
        {
            // std::cout << "blkGroupSize= " << this->blkGroupSize
            //           << ", numBlockTileIteration= " << this->numBlockTileIteration
            //           << ", gridSize=" << this->gridSize
            //           << ", invariant_total_length=" << this->invariant_total_length <<
            //           std::endl;
        }

        AccDataType alpha_;
        AccDataType beta_;
    };

    struct Invoker : public BaseInvoker
    {
        float Run(const Argument& arg, const StreamConfig& stream_config = StreamConfig{})
        {
            const auto in_grid_desc_m_k = Reduction::MakeSrc2dDescriptor(
                arg.inLengths_, arg.inStrides_, arg.blkGroupSize, arg.numBlockTileIteration);
            const auto out_grid_desc_m_k = Reduction::MakeSrc2dDescriptor(
                arg.inLengths_, arg.inStrides_, arg.blkGroupSize, arg.numBlockTileIteration);

            bool sweep_once =
                in_grid_desc_m_k.GetLength(Number<1>{}) <= KThreadClusterSize * KThreadSliceSize;

            const auto kernel_main = sweep_once ? kernel_softmax<GridwiseSoftmaxSweepOnce,
                                                                 InDataType,
                                                                 OutDataType,
                                                                 AccDataType,
                                                                 GridDesc_M_K>
                                                : kernel_softmax<GridwiseSoftmaxGeneric,
                                                                 InDataType,
                                                                 OutDataType,
                                                                 AccDataType,
                                                                 GridDesc_M_K>;

            float avg_time = 0;

            avg_time += launch_and_time_kernel(stream_config,
                                               kernel_main,
                                               dim3(arg.gridSize),
                                               dim3(BlockSize),
                                               0,
                                               in_grid_desc_m_k,
                                               out_grid_desc_m_k,
                                               arg.blkGroupSize,
                                               arg.numBlockTileIteration,
                                               arg.alpha_,
                                               arg.in_dev_,
                                               arg.beta_,
                                               arg.out_dev_);

            return (avg_time);
        };

        float Run(const BaseArgument* p_arg,
                  const StreamConfig& stream_config = StreamConfig{}) override
        {
            return Run(*dynamic_cast<const Argument*>(p_arg), stream_config);
        };
    };

    bool IsSupportedArgument(const BaseArgument* p_arg) override
    {
        const Argument* p_arg_ = dynamic_cast<const Argument*>(p_arg);

        if(!Reduction::IsSupportedArgument(p_arg_))
        {
            return false;
        }

        if(p_arg_->inLengths_[Rank - 1] % OutDstVectorSize != 0)
        {
            return false;
        }

        return true;
    };

    //
    // @brief      Makes a pointer to Argument class.
    //
    // @param[in]  inLengths           Input tensor extent(s) from high to low dimension
    // @param[in]  inStrides           Input tensor stride(s) from high to low dimension
    // @param[in]  reduceDims          The dimension(s) the normalization operation is applied to
    // @param[in]  alpha               Typeless pointer in host memory storing the alpha scaling
    //                                 value as type AccDataType
    // @param[in]  beta                Typeless pointer in host memory storing the beta scaling
    //                                 value as type AccDataType
    // @param[in]  in_dev              Typeless const pointer in device memory storing the input
    //                                 tensor
    // @param      out_dev             Typeless pointer in device memory storing the output tensor
    // @param[in]  in_elementwise_op   The input elementwise operation.
    // @param[in]  acc_elementwise_op  The accumulation elementwise operation.
    //
    // @return     Unique pointer to the Argument class.
    //
    std::unique_ptr<BaseArgument> MakeArgumentPointer(const std::vector<index_t> inLengths,
                                                      const std::vector<index_t> inStrides,
                                                      const std::vector<int> reduceDims,
                                                      const void* alpha,
                                                      const void* beta,
                                                      const void* in_dev,
                                                      void* out_dev,
                                                      InElementwiseOp in_elementwise_op,
                                                      AccElementwiseOp acc_elementwise_op) override
    {
        return std::make_unique<Argument>(inLengths,
                                          inStrides,
                                          reduceDims,
                                          *static_cast<const AccDataType*>(alpha),
                                          *static_cast<const AccDataType*>(beta),
                                          static_cast<const InDataType*>(in_dev),
                                          static_cast<OutDataType*>(out_dev),
                                          in_elementwise_op,
                                          acc_elementwise_op);
    };

    std::unique_ptr<BaseInvoker> MakeInvokerPointer() override
    {
        return std::make_unique<Invoker>();
    };

    std::string GetTypeString() const override
    {
        auto str = std::stringstream();

        // clang-format off
        str << "DeviceReduceSoftmax<" << BlockSize << ",";
        str << "M_C" << MThreadClusterSize << "_S" << MThreadSliceSize << ",";
        str << "K_C" << KThreadClusterSize << "_S" << KThreadSliceSize << ",";
        str << "InSrcVectorDim_" << InSrcVectorDim << "_InSrcVectorSize_" << InSrcVectorSize << "_OutDstVectorSize_" << OutDstVectorSize << ">";
        // clang-format on

        return str.str();
    }
};

} // namespace device
} // namespace tensor_operation
} // namespace ck

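A note on the kernel selection in Invoker::Run above: sweep_once holds when the reduced length K (the second dimension of GridDesc_M_K) fits in one block tile of KThreadClusterSize * KThreadSliceSize elements, so each row is loaded once and the max, exp-sum, and normalization steps complete in a single pass; otherwise the generic kernel must re-traverse the input. The predicate for the 256-thread instances registered below, written out as a sketch (numbers taken from the instance list):

constexpr ck::index_t k_cluster = 32;                  // KThreadClusterSize
constexpr ck::index_t k_slice   = 8;                   // KThreadSliceSize
constexpr ck::index_t k_tile    = k_cluster * k_slice; // 256 elements covered per pass

bool sweep_once_for(ck::index_t k) { return k <= k_tile; }
// sweep_once_for(256)        -> true  (single-pass kernel)
// sweep_once_for(128 * 1024) -> false (the client example's flattened reduced length)
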
@@ -0,0 +1,71 @@
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.

#pragma once

#include <type_traits>
#include <vector>

#include "ck/ck.hpp"
#include "ck/library/tensor_operation_instance/device_operation_instance_factory.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/tensor_operation/gpu/device/device_softmax.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
#include "ck/utility/data_type.hpp"

namespace ck {
namespace tensor_operation {
namespace device {
namespace instance {

using F16 = ck::half_t;
using F32 = float;
using PassThrough = ck::tensor_operation::element_wise::PassThrough;

void add_device_softmax_f16_f16_rank3_instances(
    std::vector<DeviceSoftmaxPtr<F16, F32, F16, PassThrough, PassThrough, 3>>&);
void add_device_softmax_f16_f16_rank4_instances(
    std::vector<DeviceSoftmaxPtr<F16, F32, F16, PassThrough, PassThrough, 4>>&);

void add_device_softmax_f32_f32_rank3_instances(
    std::vector<DeviceSoftmaxPtr<F32, F32, F32, PassThrough, PassThrough, 3>>&);
void add_device_softmax_f32_f32_rank4_instances(
    std::vector<DeviceSoftmaxPtr<F32, F32, F32, PassThrough, PassThrough, 4>>&);

template <typename InDataType, typename AccDataType, typename OutDataType, index_t Rank>
struct DeviceOperationInstanceFactory<
    ck::tensor_operation::device::
        DeviceSoftmax<InDataType, AccDataType, OutDataType, PassThrough, PassThrough, Rank>>
{
    using DeviceOp =
        DeviceSoftmax<InDataType, AccDataType, OutDataType, PassThrough, PassThrough, Rank>;

    static auto GetInstances()
    {
        std::vector<std::unique_ptr<DeviceOp>> op_ptrs;

        if constexpr(std::is_same_v<InDataType, F16> && std::is_same_v<AccDataType, F32> &&
                     std::is_same_v<OutDataType, F16>)
        {
            if constexpr(Rank == 3)
                add_device_softmax_f16_f16_rank3_instances(op_ptrs);
            else if constexpr(Rank == 4)
                add_device_softmax_f16_f16_rank4_instances(op_ptrs);
        }
        else if constexpr(std::is_same_v<InDataType, F32> && std::is_same_v<AccDataType, F32> &&
                          std::is_same_v<OutDataType, F32>)
        {
            if constexpr(Rank == 3)
                add_device_softmax_f32_f32_rank3_instances(op_ptrs);
            else if constexpr(Rank == 4)
                add_device_softmax_f32_f32_rank4_instances(op_ptrs);
        }

        return op_ptrs;
    }
};

} // namespace instance
} // namespace device
} // namespace tensor_operation
} // namespace ck

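This factory specialization is what the client example's GetInstances() call resolves to. Requesting instances directly looks like the sketch below; a type/rank combination with no matching branch in the if-constexpr chain simply returns an empty vector:

using ck::tensor_operation::device::DeviceSoftmax;
using ck::tensor_operation::device::instance::DeviceOperationInstanceFactory;
using PassThrough = ck::tensor_operation::element_wise::PassThrough;

// Vector of std::unique_ptr to the abstract DeviceSoftmax interface,
// filled with the registered rank-3 f32 instances.
auto op_ptrs = DeviceOperationInstanceFactory<
    DeviceSoftmax<float, float, float, PassThrough, PassThrough, 3>>::GetInstances();
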
@@ -78,6 +78,7 @@ target_include_directories(device_operations PUBLIC
    $<INSTALL_INTERFACE:${CMAKE_INSTALL_INCLUDEDIR}/ck/tensor>
    $<INSTALL_INTERFACE:${CMAKE_INSTALL_INCLUDEDIR}/ck/problem_transform>
    $<INSTALL_INTERFACE:${CMAKE_INSTALL_INCLUDEDIR}/ck/tensor_operation/gpu/device>
    $<INSTALL_INTERFACE:${CMAKE_INSTALL_INCLUDEDIR}/ck/tensor_operation/gpu/device/impl>
    $<INSTALL_INTERFACE:${CMAKE_INSTALL_INCLUDEDIR}/ck/tensor_operation/gpu/grid>
    $<INSTALL_INTERFACE:${CMAKE_INSTALL_INCLUDEDIR}/ck/tensor_operation/gpu/block>
    $<INSTALL_INTERFACE:${CMAKE_INSTALL_INCLUDEDIR}/ck/tensor_operation/gpu/warp>

@@ -1,43 +1,51 @@
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.

#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/device_softmax.hpp"
#include "ck/utility/data_type.hpp"
#include <tuple>
#include <vector>

#include "ck/ck.hpp"
#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
#include "ck/tensor_operation/gpu/device/impl/device_softmax_impl.hpp"
#include "ck/tensor_operation/gpu/element/unary_element_wise_operation.hpp"
#include "ck/utility/data_type.hpp"

namespace ck {
namespace tensor_operation {
namespace device {
namespace instance {

using F16 = ck::half_t;
using F32 = float;
namespace {
using F16  = ck::half_t;
using F32  = float;
using Pass = ck::tensor_operation::element_wise::PassThrough;
} // namespace

template <index_t Rank, index_t Reduce>
using device_softmax_f16_f16_instances = std::tuple<
    // clang-format off
    //            InDataType, AccDataType, OutDataType, Rank, NumReduceDim, BlockSize, MThreadClusterSize, KThreadClusterSize, MThreadSliceSize, KThreadSliceSize, InSrcVectorDim, InSrcVectorSize, OutDstVectorSize>
    DeviceSoftmax<F16, F32, F16, Rank, Reduce, 256, 8, 32, 1, 8, 1, 1, 1>, // fallback kernel
    DeviceSoftmax<F16, F32, F16, Rank, Reduce, 256, 8, 32, 1, 8, 1, 8, 8>,
    DeviceSoftmax<F16, F32, F16, Rank, Reduce, 256, 4, 64, 1, 8, 1, 8, 8>,
    DeviceSoftmax<F16, F32, F16, Rank, Reduce, 256, 2, 128, 1, 8, 1, 8, 8>,
    DeviceSoftmax<F16, F32, F16, Rank, Reduce, 256, 2, 128, 1, 16, 1, 8, 8>,
    DeviceSoftmax<F16, F32, F16, Rank, Reduce, 256, 2, 128, 1, 32, 1, 8, 8>,
    DeviceSoftmax<F16, F32, F16, Rank, Reduce, 256, 1, 256, 1, 8, 1, 8, 8>,
    DeviceSoftmax<F16, F32, F16, Rank, Reduce, 256, 1, 256, 1, 16, 1, 8, 8>,
    DeviceSoftmax<F16, F32, F16, Rank, Reduce, 256, 1, 256, 1, 32, 1, 8, 8>
    //                InDataType, AccDataType, OutDataType, InElementwiseOp, AccElementwiseOp, Rank, NumReduceDim, BlockSize, MThreadClusterSize, KThreadClusterSize, MThreadSliceSize, KThreadSliceSize, InSrcVectorDim, InSrcVectorSize, OutDstVectorSize>
    DeviceSoftmaxImpl< F16, F32, F16, Pass, Pass, Rank, Reduce, 256, 8, 32, 1, 8, 1, 1, 1>, // fallback kernel
    DeviceSoftmaxImpl< F16, F32, F16, Pass, Pass, Rank, Reduce, 256, 8, 32, 1, 8, 1, 8, 8>,
    DeviceSoftmaxImpl< F16, F32, F16, Pass, Pass, Rank, Reduce, 256, 4, 64, 1, 8, 1, 8, 8>,
    DeviceSoftmaxImpl< F16, F32, F16, Pass, Pass, Rank, Reduce, 256, 2, 128, 1, 8, 1, 8, 8>,
    DeviceSoftmaxImpl< F16, F32, F16, Pass, Pass, Rank, Reduce, 256, 2, 128, 1, 16, 1, 8, 8>,
    DeviceSoftmaxImpl< F16, F32, F16, Pass, Pass, Rank, Reduce, 256, 2, 128, 1, 32, 1, 8, 8>,
    DeviceSoftmaxImpl< F16, F32, F16, Pass, Pass, Rank, Reduce, 256, 1, 256, 1, 8, 1, 8, 8>,
    DeviceSoftmaxImpl< F16, F32, F16, Pass, Pass, Rank, Reduce, 256, 1, 256, 1, 16, 1, 8, 8>,
    DeviceSoftmaxImpl< F16, F32, F16, Pass, Pass, Rank, Reduce, 256, 1, 256, 1, 32, 1, 8, 8>
    // clang-format on
    >;

void add_device_softmax_f16_f16_rank3_instances(std::vector<DeviceNormalizationPtr>& instances)
void add_device_softmax_f16_f16_rank3_instances(
    std::vector<DeviceSoftmaxPtr<F16, F32, F16, Pass, Pass, 3>>& instances)
{
    add_device_operation_instances(instances, device_softmax_f16_f16_instances<3, 1>{});
    add_device_operation_instances(instances, device_softmax_f16_f16_instances<3, 2>{});
}

void add_device_softmax_f16_f16_rank4_instances(std::vector<DeviceNormalizationPtr>& instances)
void add_device_softmax_f16_f16_rank4_instances(
    std::vector<DeviceSoftmaxPtr<F16, F32, F16, Pass, Pass, 4>>& instances)
{
    add_device_operation_instances(instances, device_softmax_f16_f16_instances<4, 1>{});
    add_device_operation_instances(instances, device_softmax_f16_f16_instances<4, 2>{});

@@ -1,41 +1,49 @@
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.

#include <tuple>
#include <vector>

#include "ck/ck.hpp"
#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
#include "ck/tensor_operation/gpu/device/device_softmax.hpp"
#include "ck/utility/data_type.hpp"
#include "ck/tensor_operation/gpu/device/impl/device_softmax_impl.hpp"
#include "ck/tensor_operation/gpu/element/unary_element_wise_operation.hpp"

namespace ck {
namespace tensor_operation {
namespace device {
namespace instance {

using F32 = float;
namespace {
using F32  = float;
using Pass = ck::tensor_operation::element_wise::PassThrough;
} // namespace

template <index_t Rank, index_t Reduce>
using device_softmax_f32_f32_instances = std::tuple<
    // clang-format off
    //            InDataType, AccDataType, OutDataType, Rank, NumReduceDim, BlockSize, MThreadClusterSize, KThreadClusterSize, MThreadSliceSize, KThreadSliceSize, InSrcVectorDim, InSrcVectorSize, OutDstVectorSize>
    DeviceSoftmax<F32, F32, F32, Rank, Reduce, 256, 8, 32, 1, 8, 1, 1, 1>, // fallback kernel
    DeviceSoftmax<F32, F32, F32, Rank, Reduce, 256, 8, 32, 1, 8, 1, 4, 4>,
    DeviceSoftmax<F32, F32, F32, Rank, Reduce, 256, 4, 64, 1, 8, 1, 4, 4>,
    DeviceSoftmax<F32, F32, F32, Rank, Reduce, 256, 2, 128, 1, 8, 1, 4, 4>,
    DeviceSoftmax<F32, F32, F32, Rank, Reduce, 256, 2, 128, 1, 16, 1, 4, 4>,
    DeviceSoftmax<F32, F32, F32, Rank, Reduce, 256, 2, 128, 1, 32, 1, 4, 4>,
    DeviceSoftmax<F32, F32, F32, Rank, Reduce, 256, 1, 256, 1, 8, 1, 4, 4>,
    DeviceSoftmax<F32, F32, F32, Rank, Reduce, 256, 1, 256, 1, 16, 1, 4, 4>,
    DeviceSoftmax<F32, F32, F32, Rank, Reduce, 256, 1, 256, 1, 32, 1, 4, 4>
    //                InDataType, AccDataType, OutDataType, InElementwiseOp, AccElementwiseOp, Rank, NumReduceDim, BlockSize, MThreadClusterSize, KThreadClusterSize, MThreadSliceSize, KThreadSliceSize, InSrcVectorDim, InSrcVectorSize, OutDstVectorSize>
    DeviceSoftmaxImpl< F32, F32, F32, Pass, Pass, Rank, Reduce, 256, 8, 32, 1, 8, 1, 1, 1>, // fallback kernel
    DeviceSoftmaxImpl< F32, F32, F32, Pass, Pass, Rank, Reduce, 256, 8, 32, 1, 8, 1, 4, 4>,
    DeviceSoftmaxImpl< F32, F32, F32, Pass, Pass, Rank, Reduce, 256, 4, 64, 1, 8, 1, 4, 4>,
    DeviceSoftmaxImpl< F32, F32, F32, Pass, Pass, Rank, Reduce, 256, 2, 128, 1, 8, 1, 4, 4>,
    DeviceSoftmaxImpl< F32, F32, F32, Pass, Pass, Rank, Reduce, 256, 2, 128, 1, 16, 1, 4, 4>,
    DeviceSoftmaxImpl< F32, F32, F32, Pass, Pass, Rank, Reduce, 256, 2, 128, 1, 32, 1, 4, 4>,
    DeviceSoftmaxImpl< F32, F32, F32, Pass, Pass, Rank, Reduce, 256, 1, 256, 1, 8, 1, 4, 4>,
    DeviceSoftmaxImpl< F32, F32, F32, Pass, Pass, Rank, Reduce, 256, 1, 256, 1, 16, 1, 4, 4>,
    DeviceSoftmaxImpl< F32, F32, F32, Pass, Pass, Rank, Reduce, 256, 1, 256, 1, 32, 1, 4, 4>
    // clang-format on
    >;

void add_device_softmax_f32_f32_rank3_instances(std::vector<DeviceNormalizationPtr>& instances)
void add_device_softmax_f32_f32_rank3_instances(
    std::vector<DeviceSoftmaxPtr<F32, F32, F32, Pass, Pass, 3>>& instances)
{
    add_device_operation_instances(instances, device_softmax_f32_f32_instances<3, 1>{});
    add_device_operation_instances(instances, device_softmax_f32_f32_instances<3, 2>{});
}

void add_device_softmax_f32_f32_rank4_instances(std::vector<DeviceNormalizationPtr>& instances)
void add_device_softmax_f32_f32_rank4_instances(
    std::vector<DeviceSoftmaxPtr<F32, F32, F32, Pass, Pass, 4>>& instances)
{
    add_device_operation_instances(instances, device_softmax_f32_f32_instances<4, 1>{});
    add_device_operation_instances(instances, device_softmax_f32_f32_instances<4, 2>{});

@@ -6,25 +6,36 @@
#include <iomanip>

#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/device_softmax.hpp"

#include "ck/library/utility/check_err.hpp"
#include "ck/library/utility/convolution_parameter.hpp"
#include "ck/library/utility/device_memory.hpp"
#include "ck/library/utility/host_tensor.hpp"
#include "ck/library/utility/host_tensor_generator.hpp"
#include "ck/library/reference_tensor_operation/cpu/reference_softmax.hpp"
#include "ck/tensor_operation/gpu/device/device_softmax.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
#include "ck/utility/data_type.hpp"

namespace ck {
namespace tensor_operation {
namespace device {
namespace instance {

void add_device_softmax_f16_f16_rank3_instances(std::vector<DeviceNormalizationPtr>&);
void add_device_softmax_f16_f16_rank4_instances(std::vector<DeviceNormalizationPtr>&);
namespace {
using F16 = ck::half_t;
using F32 = float;
using PassThrough = ck::tensor_operation::element_wise::PassThrough;
} // namespace

void add_device_softmax_f32_f32_rank3_instances(std::vector<DeviceNormalizationPtr>&);
void add_device_softmax_f32_f32_rank4_instances(std::vector<DeviceNormalizationPtr>&);
void add_device_softmax_f16_f16_rank3_instances(
    std::vector<DeviceSoftmaxPtr<F16, F32, F16, PassThrough, PassThrough, 3>>&);
void add_device_softmax_f16_f16_rank4_instances(
    std::vector<DeviceSoftmaxPtr<F16, F32, F16, PassThrough, PassThrough, 4>>&);

void add_device_softmax_f32_f32_rank3_instances(
    std::vector<DeviceSoftmaxPtr<F32, F32, F32, PassThrough, PassThrough, 3>>&);
void add_device_softmax_f32_f32_rank4_instances(
    std::vector<DeviceSoftmaxPtr<F32, F32, F32, PassThrough, PassThrough, 4>>&);

} // namespace instance
} // namespace device

@@ -57,7 +68,7 @@ template <> std::string type_to_string<int8_t>() { return "int8"; }
template <> std::string type_to_string<int32_t>() { return "int32"; }
// clang-format on

template <typename InDataType, typename AccDataType, typename OutDataType>
template <typename InDataType, typename AccDataType, typename OutDataType, index_t Rank>
void profile_normalization_impl(int do_verification,
                                int init_method,
                                bool do_log,
@@ -69,6 +80,11 @@ void profile_normalization_impl(int do_verification,
                                AccDataType beta,
                                NormType norm_type)
{
    if(Rank != in_length.size())
    {
        throw std::runtime_error("Input tensor rank is different from template argument Rank!");
    }

    Tensor<InDataType> in = in_strides.empty() ? Tensor<InDataType>(in_length)
                                               : Tensor<InDataType>(in_length, in_strides);
    Tensor<OutDataType> out(in.mDesc);
@@ -99,30 +115,31 @@ void profile_normalization_impl(int do_verification,
    std::vector<index_t> i_in_lengths(in.mDesc.GetLengths().begin(), in.mDesc.GetLengths().end());
    std::vector<index_t> i_in_strides(in.mDesc.GetStrides().begin(), in.mDesc.GetStrides().end());

    // add device normalization instances
    std::vector<tensor_operation::device::DeviceNormalizationPtr> instances;
    // add device softmax instances
    using PassThrough = ck::tensor_operation::element_wise::PassThrough;
    using DeviceOpPtr = tensor_operation::device::
        DeviceSoftmaxPtr<InDataType, AccDataType, OutDataType, PassThrough, PassThrough, Rank>;
    std::vector<DeviceOpPtr> instances;

    if(norm_type == NormType::SOFTMAX)
    {
        if constexpr(is_same<InDataType, half_t>::value && is_same<OutDataType, half_t>::value &&
                     is_same<AccDataType, float>::value)
        {
            if(in_length.size() == 3)
            if constexpr(Rank == 3)
                tensor_operation::device::instance::add_device_softmax_f16_f16_rank3_instances(
                    instances);

            if(in_length.size() == 4)
            else if constexpr(Rank == 4)
                tensor_operation::device::instance::add_device_softmax_f16_f16_rank4_instances(
                    instances);
        }
        else if constexpr(is_same<InDataType, float>::value && is_same<OutDataType, float>::value &&
                          is_same<AccDataType, float>::value)
        {
            if(in_length.size() == 3)
            if constexpr(Rank == 3)
                tensor_operation::device::instance::add_device_softmax_f32_f32_rank3_instances(
                    instances);

            if(in_length.size() == 4)
            else if constexpr(Rank == 4)
                tensor_operation::device::instance::add_device_softmax_f32_f32_rank4_instances(
                    instances);
        }
@@ -137,6 +154,8 @@ void profile_normalization_impl(int do_verification,
    float best_avg_time   = std::numeric_limits<float>::max();
    float best_gb_per_sec = 0;

    using PassThrough = ck::tensor_operation::element_wise::PassThrough;

    for(auto& inst_ptr : instances)
    {
        // Is this user's responsibility to check if problem mismatches kernel instance (ie. rank 3
@@ -153,7 +172,9 @@ void profile_normalization_impl(int do_verification,
                                                          &alpha,
                                                          &beta,
                                                          in_dev.GetDeviceBuffer(),
                                                          out_dev.GetDeviceBuffer());
                                                          out_dev.GetDeviceBuffer(),
                                                          PassThrough{},
                                                          PassThrough{});

        if(!inst_ptr->IsSupportedArgument(argument_ptr.get()))
        {

@@ -50,7 +50,7 @@ struct ArgParser

    void print_help()
    {
        std::cout << "arg1: tensor operation (layernorm/batchnorm/softmax)\n"
        std::cout << "arg1: tensor operation (batchnorm/softmax)\n"
                  << "arg2: data type (0: fp32; 1: fp16; 2: bf16; 3: int8)\n"
                  << "arg3: verification (0: no; 1: yes)\n"
                  << "arg4: initialization (0: no init; 1: integer value; 2: decimal value)\n"
@@ -91,31 +91,73 @@ int profile_normalization(int argc, char* argv[])
        arg_parser.long_opts["alpha"].empty() ? 1 : arg_parser.long_opts["alpha"][0];
    const index_t beta = arg_parser.long_opts["beta"].empty() ? 0 : arg_parser.long_opts["beta"][0];

    if(data_type == NormDataType::F16_F16)
    if(length.size() == 3)
    {
        ck::profiler::profile_normalization_impl<ck::half_t, float, ck::half_t>(do_verification,
                                                                                init_method,
                                                                                do_log,
                                                                                time_kernel,
                                                                                length,
                                                                                stride,
                                                                                reduce,
                                                                                float(alpha),
                                                                                float(beta),
                                                                                norm_type);
        if(data_type == NormDataType::F16_F16)
        {
            ck::profiler::profile_normalization_impl<ck::half_t, float, ck::half_t, 3>(
                do_verification,
                init_method,
                do_log,
                time_kernel,
                length,
                stride,
                reduce,
                float(alpha),
                float(beta),
                norm_type);
        }
        else if(data_type == NormDataType::F32_F32)
        {
            ck::profiler::profile_normalization_impl<float, float, float, 3>(do_verification,
                                                                              init_method,
                                                                              do_log,
                                                                              time_kernel,
                                                                              length,
                                                                              stride,
                                                                              reduce,
                                                                              float(alpha),
                                                                              float(beta),
                                                                              norm_type);
        }
        else
        {
            throw std::runtime_error("not implemented yet");
        }
    }
    else if(data_type == NormDataType::F32_F32)
    else if(length.size() == 4)
    {
        ck::profiler::profile_normalization_impl<float, float, float>(do_verification,
                                                                      init_method,
                                                                      do_log,
                                                                      time_kernel,
                                                                      length,
                                                                      stride,
                                                                      reduce,
                                                                      float(alpha),
                                                                      float(beta),
                                                                      norm_type);
        if(data_type == NormDataType::F16_F16)
        {
            ck::profiler::profile_normalization_impl<ck::half_t, float, ck::half_t, 4>(
                do_verification,
                init_method,
                do_log,
                time_kernel,
                length,
                stride,
                reduce,
                float(alpha),
                float(beta),
                norm_type);
        }
        else if(data_type == NormDataType::F32_F32)
        {
            ck::profiler::profile_normalization_impl<float, float, float, 4>(do_verification,
                                                                              init_method,
                                                                              do_log,
                                                                              time_kernel,
                                                                              length,
                                                                              stride,
                                                                              reduce,
                                                                              float(alpha),
                                                                              float(beta),
                                                                              norm_type);
        }
        else
        {
            throw std::runtime_error("not implemented yet");
        }
    }
    else
    {

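Because Rank is now a compile-time parameter of profile_normalization_impl, the profiler bridges the runtime tensor rank with the explicit length.size() dispatch above; each branch instantiates the template for one rank, and unsupported ranks fall through to the final else. The skeleton of the dispatch (a sketch with the argument list elided):

if(length.size() == 3)
    ck::profiler::profile_normalization_impl<ck::half_t, float, ck::half_t, 3>(/* ... */);
else if(length.size() == 4)
    ck::profiler::profile_normalization_impl<ck::half_t, float, ck::half_t, 4>(/* ... */);
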
@@ -9,7 +9,8 @@

#include "ck/ck.hpp"
#include "ck/utility/number.hpp"
#include "ck/tensor_operation/gpu/device/device_softmax.hpp"
#include "ck/tensor_operation/gpu/device/impl/device_softmax_impl.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"

#include "ck/library/utility/check_err.hpp"
#include "ck/library/utility/host_tensor.hpp"
@@ -51,19 +52,23 @@ class TestSoftmax : public ::testing::Test
    using ReferenceInstance =
        tensor_operation::host::ReferenceSoftmax<InDataType, OutDataType, AccDataType>;

    using DeviceInstance = tensor_operation::device::DeviceSoftmax<InDataType,
                                                                   AccDataType,
                                                                   OutDataType,
                                                                   Rank,
                                                                   NumReduceDim,
                                                                   BlockSize,
                                                                   MThreadClusterSize,
                                                                   KThreadClusterSize,
                                                                   MThreadSliceSize,
                                                                   KThreadSliceSize,
                                                                   InSrcVectorDim,
                                                                   InSrcVectorSize,
                                                                   OutDstVectorSize>;
    using PassThrough = ck::tensor_operation::element_wise::PassThrough;

    using DeviceInstance = tensor_operation::device::DeviceSoftmaxImpl<InDataType,
                                                                       AccDataType,
                                                                       OutDataType,
                                                                       PassThrough,
                                                                       PassThrough,
                                                                       Rank,
                                                                       NumReduceDim,
                                                                       BlockSize,
                                                                       MThreadClusterSize,
                                                                       KThreadClusterSize,
                                                                       MThreadSliceSize,
                                                                       KThreadSliceSize,
                                                                       InSrcVectorDim,
                                                                       InSrcVectorSize,
                                                                       OutDstVectorSize>;

    TestSoftmax() : ref_instance_invoker_(ReferenceInstance{}.MakeInvoker()) {}

@@ -97,7 +102,9 @@ class TestSoftmax : public ::testing::Test
                                                          &alpha,
                                                          &beta,
                                                          in_dev.GetDeviceBuffer(),
                                                          out_dev.GetDeviceBuffer());
                                                          out_dev.GetDeviceBuffer(),
                                                          PassThrough{},
                                                          PassThrough{});

    if(!device_instance.IsSupportedArgument(argument_ptr.get()))
    {