From d41b1a7c2c696ccd0d561996053fcf186f3ff78f Mon Sep 17 00:00:00 2001
From: Anthony Chang <ac.chang@outlook.com>
Date: Fri, 1 Jul 2022 01:08:50 +0800
Subject: [PATCH] Standalone sweep once softmax kernel w/ ckProfiler (#295)

* use 'sweep once' softmax kernel where applicable

* threadwise copy's dst buffer can specify invalid element value

* add int8 in/out float compute softmax support

give a bit of leeway for int absolute tolerance as there's a single data point of all test cases showing off-by-1 error

* format

* softmax inherits DeviceNormalization

* softmax profiler stub

* tighten up reference softmax interface

* example prints tensor dimension

* add fp32 to softmax profiler

* rename header

* hook with ckProfiler

* format

* resolve merge conflict

* resolve merge conflicts

* update normalization profiler help string

* resolve conflict

* typo

* remove residual

* softmax profiler: address feedback

* test for mixed precision input/output

* fully qualify ck::math::isnan

* add comment for device normalization interface

* revise wording

* constness for alpha/beta scaler pointer

[ROCm/composable_kernel commit: 93c99f3d8701f7c88e7e5389850328f830701017]
---
 example/23_softmax/softmax_blockwise.cpp      |   9 +-
 .../gpu/device/device_normalization.hpp       |  43 ++++
 .../gpu/device/device_softmax.hpp             |  86 +++++--
 .../gpu/grid/gridwise_softmax.hpp             | 143 ++++++-----
 .../threadwise_tensor_slice_transfer.hpp      |  19 +-
 include/ck/utility/math.hpp                   |   2 +
 .../reduction_functions_accumulate.hpp        |   2 +-
 .../ck/library/host_tensor/host_tensor.hpp    |   6 +
 .../cpu/reference_softmax.hpp                 |   7 +-
 .../device_operation_instance.hpp             |   1 +
 .../include/ck/library/utility/check_err.hpp  |   4 +-
 .../gpu/CMakeLists.txt                        |   1 +
 .../gpu/normalization/CMakeLists.txt          |  10 +
 .../device_softmax_f16_f16_instance.cpp       |  49 ++++
 .../device_softmax_f32_f32_instance.cpp       |  48 ++++
 profiler/CMakeLists.txt                       |   2 +
 .../include/profile_normalization_impl.hpp    | 243 ++++++++++++++++++
 profiler/src/profile_normalization.cpp        | 134 ++++++++++
 profiler/src/profiler.cpp                     |   6 +
 test/softmax/CMakeLists.txt                   |   5 +-
 test/softmax/test_softmax_fp16.cpp            |   7 +-
 test/softmax/test_softmax_fp32.cpp            |   7 +-
 test/softmax/test_softmax_int8.cpp            |  30 +++
 test/softmax/test_softmax_util.hpp            |  51 +++-
 24 files changed, 809 insertions(+), 106 deletions(-)
 create mode 100644 include/ck/tensor_operation/gpu/device/device_normalization.hpp
 create mode 100644 library/src/tensor_operation_instance/gpu/normalization/CMakeLists.txt
 create mode 100644 library/src/tensor_operation_instance/gpu/normalization/device_softmax_f16_f16_instance.cpp
 create mode 100644 library/src/tensor_operation_instance/gpu/normalization/device_softmax_f32_f32_instance.cpp
 create mode 100644 profiler/include/profile_normalization_impl.hpp
 create mode 100644 profiler/src/profile_normalization.cpp
 create mode 100644 test/softmax/test_softmax_int8.cpp

diff --git a/example/23_softmax/softmax_blockwise.cpp b/example/23_softmax/softmax_blockwise.cpp
index 32570e19c3..6df3155e80 100644
--- a/example/23_softmax/softmax_blockwise.cpp
+++ b/example/23_softmax/softmax_blockwise.cpp
@@ -150,6 +150,9 @@ int main(int argc, char* argv[])
     AccDataType alpha = args.scales[0];
     AccDataType beta  = args.scales[1];
 
+    std::cout << "in: " << in.mDesc << std::endl;
+    std::cout << "out: " << out.mDesc << std::endl;
+
     std::size_t num_thread = 1;
 
     if(args.do_verification)
@@ -195,7 +198,7 @@ int main(int argc, char* argv[])
         using ReferenceInstance =
             tensor_operation::host::ReferenceSoftmax<InDataType, OutDataType, AccDataType>;
         ReferenceInstance ref;
-        auto ref_arg = ref.MakeArgument(in, out_ref, alpha, beta, Rank, reduceDims);
+        auto ref_arg = ref.MakeArgument(in, out_ref, alpha, beta, reduceDims);
         auto invoker = ref.MakeInvoker();
         invoker.Run(ref_arg);
         // LogRangeAsType<float>(std::cout << "tensor out_ref: ", out_ref.mData, ",") << std::endl;
@@ -212,8 +215,8 @@ int main(int argc, char* argv[])
     auto argument_ptr = device_instance.MakeArgumentPointer(i_inLengths,
                                                             i_inStrides,
                                                             reduceDims,
-                                                            alpha,
-                                                            beta,
+                                                            &alpha,
+                                                            &beta,
                                                             in_dev.GetDeviceBuffer(),
                                                             out_dev.GetDeviceBuffer());
 
diff --git a/include/ck/tensor_operation/gpu/device/device_normalization.hpp b/include/ck/tensor_operation/gpu/device/device_normalization.hpp
new file mode 100644
index 0000000000..0e4313f17d
--- /dev/null
+++ b/include/ck/tensor_operation/gpu/device/device_normalization.hpp
@@ -0,0 +1,43 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+
+#pragma once
+
+#include <iostream>
+#include <vector>
+
+#include "ck/tensor_operation/gpu/device/device_base.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+
+struct DeviceNormalization : public BaseOperator
+{
+    // inLengths: input tensor extent(s) from high to low dimension
+    // inStrides: input tensor stride(s) from high to low dimension
+    // reduceDims: the dimension(s) the normalization operation is applied
+    // alpha: typeless pointer in host memory storing the alpha scaling value of type AccDataType
+    // beta: typeless pointer in host memory storing the beta scaling value of type AccDataType
+    // in_dev: typeless const pointer in device memory storing the input tensor
+    // out_dev: typeless pointer in device memory storing the output tensor
+    virtual std::unique_ptr<BaseArgument> MakeArgumentPointer(const std::vector<index_t> inLengths,
+                                                              const std::vector<index_t> inStrides,
+                                                              const std::vector<int> reduceDims,
+                                                              const void* alpha,
+                                                              const void* beta,
+                                                              const void* in_dev,
+                                                              void* out_dev) = 0;
+
+    virtual std::unique_ptr<BaseInvoker> MakeInvokerPointer() = 0;
+
+    virtual index_t GetRank() const = 0;
+
+    virtual index_t GetNumReduceDim() const = 0;
+};
+
+using DeviceNormalizationPtr = std::unique_ptr<DeviceNormalization>;
+
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/include/ck/tensor_operation/gpu/device/device_softmax.hpp b/include/ck/tensor_operation/gpu/device/device_softmax.hpp
index 1aa24c0e55..6a5dfc4da4 100644
--- a/include/ck/tensor_operation/gpu/device/device_softmax.hpp
+++ b/include/ck/tensor_operation/gpu/device/device_softmax.hpp
@@ -9,6 +9,7 @@
 #include "ck/utility/reduction_operator.hpp"
 #include "ck/tensor_operation/gpu/device/device_base.hpp"
 #include "ck/tensor_operation/gpu/device/device_reduce.hpp"
+#include "ck/tensor_operation/gpu/device/device_normalization.hpp"
 #include "ck/tensor_operation/gpu/device/device_reduce_multiblock.hpp"
 #include "ck/tensor_operation/gpu/device/device_reduce_common.hpp"
 #include "ck/tensor_operation/gpu/grid/gridwise_softmax.hpp"
@@ -33,8 +34,15 @@ template <typename InDataType,
           index_t InSrcVectorDim,
           index_t InSrcVectorSize,
           index_t OutDstVectorSize>
-struct DeviceSoftmax : public BaseOperator
+struct DeviceSoftmax : public DeviceNormalization
 {
+    static constexpr index_t kRank         = Rank;
+    static constexpr index_t kNumReduceDim = NumReduceDim;
+
+    virtual index_t GetRank() const override { return kRank; }
+
+    virtual index_t GetNumReduceDim() const override { return kNumReduceDim; }
+
     using PassThrough = tensor_operation::element_wise::PassThrough;
 
     // Used for freeloading of some handy functions from DeviceReduceMultiBlock
@@ -61,18 +69,33 @@ struct DeviceSoftmax : public BaseOperator
 
     using GridDesc_M_K = decltype(Reduction::MakeSrc2dDescriptor({1}, {1}, 1, 1));
 
-    using GridwiseReduce = GridwiseSoftmax_mk_to_mk<InDataType,
-                                                    OutDataType,
-                                                    AccDataType,
-                                                    GridDesc_M_K,
-                                                    BlockSize,
-                                                    MThreadClusterSize,
-                                                    KThreadClusterSize,
-                                                    MThreadSliceSize,
-                                                    KThreadSliceSize,
-                                                    InSrcVectorDim,
-                                                    InSrcVectorSize,
-                                                    OutDstVectorSize>;
+    using GridwiseSoftmaxGeneric = GridwiseSoftmax_mk_to_mk<InDataType,
+                                                            OutDataType,
+                                                            AccDataType,
+                                                            GridDesc_M_K,
+                                                            BlockSize,
+                                                            MThreadClusterSize,
+                                                            KThreadClusterSize,
+                                                            MThreadSliceSize,
+                                                            KThreadSliceSize,
+                                                            InSrcVectorDim,
+                                                            InSrcVectorSize,
+                                                            OutDstVectorSize,
+                                                            false>;
+
+    using GridwiseSoftmaxSweepOnce = GridwiseSoftmax_mk_to_mk<InDataType,
+                                                              OutDataType,
+                                                              AccDataType,
+                                                              GridDesc_M_K,
+                                                              BlockSize,
+                                                              MThreadClusterSize,
+                                                              KThreadClusterSize,
+                                                              MThreadSliceSize,
+                                                              KThreadSliceSize,
+                                                              InSrcVectorDim,
+                                                              InSrcVectorSize,
+                                                              OutDstVectorSize,
+                                                              true>;
 
     struct Argument : public Reduction::Argument
     {
@@ -121,8 +144,19 @@ struct DeviceSoftmax : public BaseOperator
             const auto out_grid_desc_m_k = Reduction::MakeSrc2dDescriptor(
                 arg.inLengths_, arg.inStrides_, arg.blkGroupSize, arg.numBlockTileIteration);
 
-            const auto kernel_main =
-                kernel_softmax<GridwiseReduce, InDataType, OutDataType, AccDataType, GridDesc_M_K>;
+            bool sweep_once =
+                in_grid_desc_m_k.GetLength(Number<1>{}) <= KThreadClusterSize * KThreadSliceSize;
+
+            const auto kernel_main = sweep_once ? kernel_softmax<GridwiseSoftmaxSweepOnce,
+                                                                 InDataType,
+                                                                 OutDataType,
+                                                                 AccDataType,
+                                                                 GridDesc_M_K>
+                                                : kernel_softmax<GridwiseSoftmaxGeneric,
+                                                                 InDataType,
+                                                                 OutDataType,
+                                                                 AccDataType,
+                                                                 GridDesc_M_K>;
 
             float avg_time = 0;
 
@@ -167,24 +201,34 @@ struct DeviceSoftmax : public BaseOperator
         return true;
     };
 
+    // inLengths: input tensor extent(s) from high to low dimension
+    // inStrides: input tensor stride(s) from high to low dimension
+    // reduceDims: the dimension(s) the softmax normalization operate on
+    // alpha: typeless pointer in host memory storing the alpha scaling value as type AccDataType
+    // beta: typeless pointer in host memory storing the beta scaling value as type AccDataType
+    // in_dev: typeless const pointer in device memory storing the input tensor
+    // out_dev: typeless pointer in device memory storing the output tensor
     std::unique_ptr<BaseArgument> MakeArgumentPointer(const std::vector<index_t> inLengths,
                                                       const std::vector<index_t> inStrides,
                                                       const std::vector<int> reduceDims,
-                                                      AccDataType alpha,
-                                                      AccDataType beta,
+                                                      const void* alpha,
+                                                      const void* beta,
                                                       const void* in_dev,
-                                                      void* out_dev)
+                                                      void* out_dev) override
     {
         return std::make_unique<Argument>(inLengths,
                                           inStrides,
                                           reduceDims,
-                                          alpha,
-                                          beta,
+                                          *static_cast<const AccDataType*>(alpha),
+                                          *static_cast<const AccDataType*>(beta),
                                           static_cast<const InDataType*>(in_dev),
                                           static_cast<OutDataType*>(out_dev));
     };
 
-    std::unique_ptr<BaseInvoker> MakeInvokerPointer() { return std::make_unique<Invoker>(); };
+    std::unique_ptr<BaseInvoker> MakeInvokerPointer() override
+    {
+        return std::make_unique<Invoker>();
+    };
 
     std::string GetTypeString() const override
     {
diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_softmax.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_softmax.hpp
index 3a457b2c79..98b29ff82e 100644
--- a/include/ck/tensor_operation/gpu/grid/gridwise_softmax.hpp
+++ b/include/ck/tensor_operation/gpu/grid/gridwise_softmax.hpp
@@ -49,7 +49,8 @@ template <typename InDataType,
           index_t KThreadSliceSize,
           index_t InSrcVectorDim,
           index_t InSrcVectorSize,
-          index_t OutDstVectorSize>
+          index_t OutDstVectorSize,
+          bool SweepOnce>
 struct GridwiseSoftmax_mk_to_mk
 {
     static_assert(((InSrcVectorDim == 0 && MThreadSliceSize % InSrcVectorSize == 0) ||
@@ -75,19 +76,6 @@ struct GridwiseSoftmax_mk_to_mk
     using ThreadReduceDstDesc_M =
         decltype(make_naive_tensor_descriptor_packed(make_tuple(Number<MThreadSliceSize>{})));
 
-    using BlockwiseMaxReduce = PartitionedBlockwiseReduction<AccDataType,
-                                                             BlockSize,
-                                                             ThreadClusterLengths_M_K,
-                                                             ThreadClusterArrangeOrder,
-                                                             reduce::Max,
-                                                             false>; // PropagateNan
-
-    using ThreadwiseMaxReduce = ThreadwiseReduction<AccDataType,
-                                                    ThreadReduceSrcDesc_M_K,
-                                                    ThreadReduceDstDesc_M,
-                                                    reduce::Max,
-                                                    false>; // PropagateNan
-
     using PassThroughOp = tensor_operation::element_wise::PassThrough;
 
     static constexpr auto I0 = Number<0>{};
@@ -105,6 +93,11 @@ struct GridwiseSoftmax_mk_to_mk
                                AccDataType beta,
                                OutDataType* const __restrict__ p_out_value_global)
     {
+        if constexpr(SweepOnce)
+        {
+            num_k_block_tile_iteration = 1;
+        }
+
         // LDS
         __shared__ AccDataType p_reduce_work_buffer[BlockSize];
 
@@ -149,6 +142,20 @@ struct GridwiseSoftmax_mk_to_mk
         constexpr auto thread_buffer_desc = make_naive_tensor_descriptor_packed(
             make_tuple(Number<MThreadSliceSize>{}, Number<KThreadSliceSize>{}));
 
+        // Normally, 0 as invalid element value is adequate since 0 makes no contribution to
+        // accumulated result. However, in stable softmax, all values 0s or not are subtracted by
+        // another value_max. As numbers become non-zero, effectively it allows invalid values to
+        // slip through and contribute to the accumulated result.
+        //
+        // The trick here is leveraging the fact that many math functions (add, sub, exp, ...)
+        // propagate NaNs when operands have NaNs involved. By initialiing invalid element value
+        // with NaN, an invalid value doing math manipulations is still NaN, which in turn can still
+        // be identified as an invalid value. We can then discard the invalid values which
+        // originally failed the bound check during accumulation. This allows to ignore values that
+        // failed bound check even after multiple math manipulations.
+        //
+        // NOTE: reset coordinate after every step because the same threadwise copy will sweep
+        // through global memory 3 times back and forth
         auto threadwise_src_load = ThreadwiseTensorSliceTransfer_v2<InDataType,
                                                                     AccDataType,
                                                                     GridDesc_M_K,
@@ -158,7 +165,8 @@ struct GridwiseSoftmax_mk_to_mk
                                                                     InSrcVectorDim,
                                                                     InSrcVectorSize,
                                                                     1,
-                                                                    false>(
+                                                                    true /* ResetCoordAfterRun */,
+                                                                    true /* InvalidElementAsNaN */>(
             in_grid_desc_m_k,
             make_multi_index(blkgroup_id * M_BlockTileSize + thread_m_cluster_id * MThreadSliceSize,
                              block_local_id * reduceSizePerBlock +
@@ -198,21 +206,39 @@ struct GridwiseSoftmax_mk_to_mk
                     block_local_id * reduceSizePerBlock + thread_k_cluster_id * KThreadSliceSize),
                 PassThroughOp{});
 
-        constexpr auto in_thread_copy_fwd_step = make_multi_index(0, K_BlockTileSize);
-        constexpr auto in_thread_copy_bwd_step = make_multi_index(0, -K_BlockTileSize);
+        constexpr auto in_thread_copy_fwd_step =
+            make_multi_index(0, SweepOnce ? 0 : K_BlockTileSize);
+        constexpr auto in_thread_copy_bwd_step =
+            make_multi_index(0, SweepOnce ? 0 : -K_BlockTileSize);
 
         ///
         /// max(x)
         ///
-        const auto in_global_val_buf_oob_non_zero = make_dynamic_buffer<AddressSpaceEnum::Global>(
-            p_in_value_global,
-            in_grid_desc_m_k.GetElementSpaceSize(),
-            reduce::Max::template GetIdentityValue<InDataType>());
+        using BlockwiseMaxReduce = PartitionedBlockwiseReduction<
+            AccDataType,
+            BlockSize,
+            ThreadClusterLengths_M_K,
+            ThreadClusterArrangeOrder,
+            reduce::Max,
+            false, // param ignored
+            detail::AccumulateWithNanIgnore<reduce::Max, AccDataType>>;
+
+        using ThreadwiseMaxReduce =
+            ThreadwiseReduction<AccDataType,
+                                ThreadReduceSrcDesc_M_K,
+                                ThreadReduceDstDesc_M,
+                                reduce::Max,
+                                false, // param ignored
+                                detail::AccumulateWithNanIgnore<reduce::Max, AccDataType>>;
+
+        const auto in_global_val_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
+            p_in_value_global, in_grid_desc_m_k.GetElementSpaceSize());
+
         index_t reducedTiles = 0;
         do
         {
             threadwise_src_load.Run(in_grid_desc_m_k,
-                                    in_global_val_buf_oob_non_zero,
+                                    in_global_val_buf,
                                     thread_buffer_desc,
                                     make_tuple(I0, I0),
                                     in_thread_buf);
@@ -232,26 +258,6 @@ struct GridwiseSoftmax_mk_to_mk
         ///
         /// sum(exp(x - max(x)))
         ///
-        static_for<0, MThreadSliceSize, 1>{}([&](auto I) {
-            accu_value_buf(I) = reduce::Add::template GetIdentityValue<AccDataType>();
-        });
-
-        // Normally, 0 as invalid element value is adequate since 0 makes no contribution to
-        // accumulated result. However, in stable softmax, all values 0s or not are subtracted by
-        // another value_max. As numbers become non-zero, effectively it allows invalid values to
-        // slip through and contribute to the accumulated result.
-        //
-        // The trick here is leveraging the fact that many math functions (add, sub, exp, ...)
-        // propagate NaNs when operands have NaNs involved. By initialiing invalid element value
-        // with NaN, an invalid value doing math manipulations is still NaN, which in turn can still
-        // be identified as an invalid value. We can then discard the invalid values which
-        // originally failed the bound check during accumulation. This allows to ignore values that
-        // failed bound check even after multiple math manipulations.
-        const auto in_global_val_buf_oob_nan =
-            make_dynamic_buffer<AddressSpaceEnum::Global>(p_in_value_global,
-                                                          in_grid_desc_m_k.GetElementSpaceSize(),
-                                                          NumericLimits<InDataType>::QuietNaN());
-
         using BlockwiseSumReduce = PartitionedBlockwiseReduction<
             AccDataType,
             BlockSize,
@@ -272,22 +278,25 @@ struct GridwiseSoftmax_mk_to_mk
         reducedTiles = 0;
         do
         {
-            threadwise_src_load.Run(in_grid_desc_m_k,
-                                    in_global_val_buf_oob_nan,
-                                    thread_buffer_desc,
-                                    make_tuple(I0, I0),
-                                    in_thread_buf);
+            if constexpr(!SweepOnce)
+            {
+                threadwise_src_load.Run(in_grid_desc_m_k,
+                                        in_global_val_buf,
+                                        thread_buffer_desc,
+                                        make_tuple(I0, I0),
+                                        in_thread_buf);
+            }
 
             // do element-wise pre-reduction operation
             static_for<0, MThreadSliceSize, 1>{}([&](auto iM) {
                 static_for<0, KThreadSliceSize, 1>{}([&](auto iK) {
                     constexpr auto offset = thread_buffer_desc.CalculateOffset(make_tuple(iM, iK));
-                    in_thread_buf(Number<offset>{}) =
+                    out_thread_buf(Number<offset>{}) =
                         math::exp(in_thread_buf(Number<offset>{}) - max_value_buf(iM));
                 });
             });
 
-            ThreadwiseSumReduce::Reduce(in_thread_buf, accu_value_buf);
+            ThreadwiseSumReduce::Reduce(out_thread_buf, accu_value_buf);
 
             threadwise_src_load.MoveSrcSliceWindow(in_grid_desc_m_k, in_thread_copy_bwd_step);
 
@@ -309,11 +318,14 @@ struct GridwiseSoftmax_mk_to_mk
         {
             do
             {
-                threadwise_src_load.Run(in_grid_desc_m_k,
-                                        in_global_val_buf_oob_nan,
-                                        thread_buffer_desc,
-                                        make_tuple(I0, I0),
-                                        in_thread_buf);
+                if constexpr(!SweepOnce)
+                {
+                    threadwise_src_load.Run(in_grid_desc_m_k,
+                                            in_global_val_buf,
+                                            thread_buffer_desc,
+                                            make_tuple(I0, I0),
+                                            in_thread_buf);
+                }
 
                 static_for<0, MThreadSliceSize, 1>{}([&](auto iM) {
                     // out = alpha * exp(x - max(x)) / sum(exp(x - max(x)))
@@ -340,18 +352,27 @@ struct GridwiseSoftmax_mk_to_mk
         }
         else
         {
+            StaticBuffer<AddressSpaceEnum::Vgpr,
+                         AccDataType,
+                         MThreadSliceSize * KThreadSliceSize,
+                         true>
+                in_prior_dst_buf;
             do
             {
-                threadwise_src_load.Run(in_grid_desc_m_k,
-                                        in_global_val_buf_oob_nan,
-                                        thread_buffer_desc,
-                                        make_tuple(I0, I0),
-                                        in_thread_buf);
+                if constexpr(!SweepOnce)
+                {
+                    threadwise_src_load.Run(in_grid_desc_m_k,
+                                            in_global_val_buf,
+                                            thread_buffer_desc,
+                                            make_tuple(I0, I0),
+                                            in_thread_buf);
+                }
                 threadwise_dst_load.Run(out_grid_desc_m_k,
                                         out_global_val_buf,
                                         thread_buffer_desc,
                                         make_tuple(I0, I0),
-                                        out_thread_buf);
+                                        in_prior_dst_buf);
+
                 static_for<0, MThreadSliceSize, 1>{}([&](auto iM) {
                     // out = alpha * exp(x - max(x)) / sum(exp(x - max(x))) + beta * prior_out
                     static_for<0, KThreadSliceSize, 1>{}([&](auto iK) {
@@ -360,7 +381,7 @@ struct GridwiseSoftmax_mk_to_mk
                         out_thread_buf(Number<offset>{}) =
                             alpha * math::exp(in_thread_buf(Number<offset>{}) - max_value_buf(iM)) /
                                 accu_value_buf(iM) +
-                            beta * out_thread_buf(Number<offset>{});
+                            beta * in_prior_dst_buf(Number<offset>{});
                     });
                 });
 
diff --git a/include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer.hpp b/include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer.hpp
index 6bc0745466..a50bb851fe 100644
--- a/include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer.hpp
+++ b/include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer.hpp
@@ -236,9 +236,14 @@ template <typename SrcData,
           index_t SrcScalarPerVector,
           index_t SrcScalarStrideInVector,
           bool SrcResetCoordinateAfterRun,
+          bool InvalidElementAsNaN                                        = false,
           typename enable_if<DstDesc::IsKnownAtCompileTime(), bool>::type = false>
 struct ThreadwiseTensorSliceTransfer_v2
 {
+    static_assert((InvalidElementAsNaN && !std::is_integral<DstData>::value) ||
+                      (!InvalidElementAsNaN),
+                  "Filling invalid element as NaN is only for floating point types");
+
     static constexpr index_t nDim = SliceLengths::Size();
 
     using Index = MultiIndex<nDim>;
@@ -318,8 +323,18 @@ struct ThreadwiseTensorSliceTransfer_v2
                     dst_desc.CalculateOffset(to_multi_index(dst_slice_origin_idx) + src_data_idx +
                                              i * src_scalar_step_in_vector);
 
-                dst_buf(Number<dst_offset>{}) =
-                    type_convert<DstData>(src_vector.template AsType<SrcData>()[i]);
+                if constexpr(InvalidElementAsNaN)
+                {
+                    dst_buf(Number<dst_offset>{}) =
+                        is_src_valid
+                            ? type_convert<DstData>(src_vector.template AsType<SrcData>()[i])
+                            : NumericLimits<DstData>::QuietNaN();
+                }
+                else
+                {
+                    dst_buf(Number<dst_offset>{}) =
+                        type_convert<DstData>(src_vector.template AsType<SrcData>()[i]);
+                }
             });
 
             if constexpr(idx_1d.value != num_access - 1)
diff --git a/include/ck/utility/math.hpp b/include/ck/utility/math.hpp
index 9cf47fb5d2..0cfc2f7da4 100644
--- a/include/ck/utility/math.hpp
+++ b/include/ck/utility/math.hpp
@@ -148,6 +148,8 @@ __host__ __device__ constexpr auto min(X x, Ys... ys)
 template <typename T>
 __device__ T exp(T x);
 
+// TODO: add f16 support using v_exp_f16
+
 template <>
 __device__ float exp<float>(float x)
 {
diff --git a/include/ck/utility/reduction_functions_accumulate.hpp b/include/ck/utility/reduction_functions_accumulate.hpp
index fca7e6107d..724e5599d6 100644
--- a/include/ck/utility/reduction_functions_accumulate.hpp
+++ b/include/ck/utility/reduction_functions_accumulate.hpp
@@ -17,7 +17,7 @@ struct AccumulateWithNanIgnore
 {
     __device__ static inline void Calculate(AccDataType& accuVal, AccDataType currVal)
     {
-        if(!isnan(currVal))
+        if(!ck::math::isnan(currVal))
         {
             ReduceOperation{}(accuVal, currVal);
         }
diff --git a/library/include/ck/library/host_tensor/host_tensor.hpp b/library/include/ck/library/host_tensor/host_tensor.hpp
index 87e98f6e54..cf982c80f7 100644
--- a/library/include/ck/library/host_tensor/host_tensor.hpp
+++ b/library/include/ck/library/host_tensor/host_tensor.hpp
@@ -222,6 +222,12 @@ struct Tensor
 
     Tensor(const Tensor& other) : mDesc(other.mDesc), mData(other.mData) {}
 
+    Tensor& operator=(const Tensor& other)
+    {
+        mDesc = other.mDesc;
+        mData = other.mData;
+    }
+
     template <typename F>
     void ForEach_impl(F&& f, std::vector<size_t>& idx, size_t rank)
     {
diff --git a/library/include/ck/library/reference_tensor_operation/cpu/reference_softmax.hpp b/library/include/ck/library/reference_tensor_operation/cpu/reference_softmax.hpp
index 738373be4e..5d9e90f71a 100644
--- a/library/include/ck/library/reference_tensor_operation/cpu/reference_softmax.hpp
+++ b/library/include/ck/library/reference_tensor_operation/cpu/reference_softmax.hpp
@@ -26,12 +26,11 @@ struct ReferenceSoftmax : public device::BaseOperator
                  Tensor<OutDataType>& out,
                  AccDataType alpha,
                  AccDataType beta,
-                 const index_t rank,
                  const std::vector<index_t> sm_reduce_dims)
             : in_(in), out_(out), alpha_(alpha), beta_(beta), sm_reduce_dims_(sm_reduce_dims)
         {
             // std::cout << "debug: scalar dims: ";
-            for(int i = 0; i < rank; i++)
+            for(size_t i = 0; i < in.mDesc.GetNumOfDimension(); i++)
             {
                 if(std::find(sm_reduce_dims.begin(), sm_reduce_dims.end(), i) ==
                    sm_reduce_dims.end())
@@ -47,7 +46,6 @@ struct ReferenceSoftmax : public device::BaseOperator
         Tensor<OutDataType>& out_;
         AccDataType alpha_;
         AccDataType beta_;
-        index_t rank_;
         std::vector<index_t> sm_reduce_dims_;
         std::vector<index_t> sm_scalar_dims_; // dim after internal max/sum reduction
     };
@@ -136,10 +134,9 @@ struct ReferenceSoftmax : public device::BaseOperator
                              Tensor<OutDataType>& out,
                              AccDataType alpha,
                              AccDataType beta,
-                             const index_t rank,
                              const std::vector<index_t> sm_reduce_dims)
     {
-        return Argument{in, out, alpha, beta, rank, sm_reduce_dims};
+        return Argument{in, out, alpha, beta, sm_reduce_dims};
     }
 
     static auto MakeInvoker() { return Invoker{}; }
diff --git a/library/include/ck/library/tensor_operation_instance/device_operation_instance.hpp b/library/include/ck/library/tensor_operation_instance/device_operation_instance.hpp
index cc6b36869a..60343a17b8 100644
--- a/library/include/ck/library/tensor_operation_instance/device_operation_instance.hpp
+++ b/library/include/ck/library/tensor_operation_instance/device_operation_instance.hpp
@@ -4,6 +4,7 @@
 #pragma once
 
 #include <vector>
+#include "ck/utility/functional2.hpp"
 
 namespace ck {
 namespace tensor_operation {
diff --git a/library/include/ck/library/utility/check_err.hpp b/library/include/ck/library/utility/check_err.hpp
index 4ea2c63cad..0b82ba4357 100644
--- a/library/include/ck/library/utility/check_err.hpp
+++ b/library/include/ck/library/utility/check_err.hpp
@@ -159,7 +159,7 @@ check_err(const std::vector<T>& out,
           const std::vector<T>& ref,
           const std::string& msg = "Error: Incorrect results!",
           double                 = 0,
-          double                 = 0)
+          double atol            = 0)
 {
     if(out.size() != ref.size())
     {
@@ -179,7 +179,7 @@ check_err(const std::vector<T>& out,
         int64_t r = ref[i];
         err       = std::abs(o - r);
 
-        if(err > 0)
+        if(err > atol)
         {
             max_err = err > max_err ? err : max_err;
             err_count++;
diff --git a/library/src/tensor_operation_instance/gpu/CMakeLists.txt b/library/src/tensor_operation_instance/gpu/CMakeLists.txt
index 7be2a1b75b..28cd1923e3 100644
--- a/library/src/tensor_operation_instance/gpu/CMakeLists.txt
+++ b/library/src/tensor_operation_instance/gpu/CMakeLists.txt
@@ -25,6 +25,7 @@ add_subdirectory(conv2d_fwd_bias_relu_add)
 add_subdirectory(conv2d_bwd_data)
 add_subdirectory(convnd_bwd_data)
 add_subdirectory(conv2d_bwd_weight)
+add_subdirectory(normalization)
 add_subdirectory(reduce)
 
 add_library(device_operations STATIC
diff --git a/library/src/tensor_operation_instance/gpu/normalization/CMakeLists.txt b/library/src/tensor_operation_instance/gpu/normalization/CMakeLists.txt
new file mode 100644
index 0000000000..a6ae07bab9
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/normalization/CMakeLists.txt
@@ -0,0 +1,10 @@
+# device_normalization_instance
+set(DEVICE_NORMALIZATION_INSTANCE_SOURCE
+    device_softmax_f32_f32_instance.cpp
+    device_softmax_f16_f16_instance.cpp
+)
+
+add_library(device_normalization_instance OBJECT ${DEVICE_NORMALIZATION_INSTANCE_SOURCE})
+set_target_properties(device_normalization_instance PROPERTIES POSITION_INDEPENDENT_CODE ON)
+
+clang_tidy_check(device_normalization_instance)
diff --git a/library/src/tensor_operation_instance/gpu/normalization/device_softmax_f16_f16_instance.cpp b/library/src/tensor_operation_instance/gpu/normalization/device_softmax_f16_f16_instance.cpp
new file mode 100644
index 0000000000..c5019c690d
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/normalization/device_softmax_f16_f16_instance.cpp
@@ -0,0 +1,49 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "ck/ck.hpp"
+#include "ck/library/tensor_operation_instance/device_operation_instance.hpp"
+#include "ck/tensor_operation/gpu/device/device_softmax.hpp"
+#include "ck/utility/data_type.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace device_normalization_instance {
+
+using F16 = ck::half_t;
+using F32 = float;
+
+template <index_t Rank, index_t Reduce>
+using device_softmax_f16_f16_instances = std::tuple<
+    // clang-format off
+        // InDataType, AccDataType, OutDataType, Rank, NumReduceDim, BlockSize, MThreadClusterSize, KThreadClusterSize, MThreadSliceSize, KThreadSliceSize, InSrcVectorDim, InSrcVectorSize, OutDstVectorSize>
+        DeviceSoftmax<F16, F32, F16, Rank, Reduce, 256, 8, 32, 1, 8, 1, 1, 1>, // fallback kernel
+        DeviceSoftmax<F16, F32, F16, Rank, Reduce, 256, 8, 32, 1, 8, 1, 8, 8>,
+        DeviceSoftmax<F16, F32, F16, Rank, Reduce, 256, 4, 64, 1, 8, 1, 8, 8>,
+        DeviceSoftmax<F16, F32, F16, Rank, Reduce, 256, 2, 128, 1, 8, 1, 8, 8>,
+        DeviceSoftmax<F16, F32, F16, Rank, Reduce, 256, 2, 128, 1, 16, 1, 8, 8>,
+        DeviceSoftmax<F16, F32, F16, Rank, Reduce, 256, 2, 128, 1, 32, 1, 8, 8>,
+        DeviceSoftmax<F16, F32, F16, Rank, Reduce, 256, 1, 256, 1, 8, 1, 8, 8>,
+        DeviceSoftmax<F16, F32, F16, Rank, Reduce, 256, 1, 256, 1, 16, 1, 8, 8>,
+        DeviceSoftmax<F16, F32, F16, Rank, Reduce, 256, 1, 256, 1, 32, 1, 8, 8>
+    // clang-format on
+    >;
+
+void add_device_softmax_f16_f16_rank3_instances(std::vector<DeviceNormalizationPtr>& instances)
+{
+    add_device_operation_instances(instances, device_softmax_f16_f16_instances<3, 1>{});
+    add_device_operation_instances(instances, device_softmax_f16_f16_instances<3, 2>{});
+}
+
+void add_device_softmax_f16_f16_rank4_instances(std::vector<DeviceNormalizationPtr>& instances)
+{
+    add_device_operation_instances(instances, device_softmax_f16_f16_instances<4, 1>{});
+    add_device_operation_instances(instances, device_softmax_f16_f16_instances<4, 2>{});
+    add_device_operation_instances(instances, device_softmax_f16_f16_instances<4, 3>{});
+}
+
+} // namespace device_normalization_instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/normalization/device_softmax_f32_f32_instance.cpp b/library/src/tensor_operation_instance/gpu/normalization/device_softmax_f32_f32_instance.cpp
new file mode 100644
index 0000000000..985f17012e
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/normalization/device_softmax_f32_f32_instance.cpp
@@ -0,0 +1,48 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "ck/ck.hpp"
+#include "ck/library/tensor_operation_instance/device_operation_instance.hpp"
+#include "ck/tensor_operation/gpu/device/device_softmax.hpp"
+#include "ck/utility/data_type.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace device_normalization_instance {
+
+using F32 = float;
+
+template <index_t Rank, index_t Reduce>
+using device_softmax_f32_f32_instances = std::tuple<
+    // clang-format off
+        // InDataType, AccDataType, OutDataType, Rank, NumReduceDim, BlockSize, MThreadClusterSize, KThreadClusterSize, MThreadSliceSize, KThreadSliceSize, InSrcVectorDim, InSrcVectorSize, OutDstVectorSize>
+        DeviceSoftmax<F32, F32, F32, Rank, Reduce, 256, 8, 32, 1, 8, 1, 1, 1>, // fallback kernel
+        DeviceSoftmax<F32, F32, F32, Rank, Reduce, 256, 8, 32, 1, 8, 1, 4, 4>,
+        DeviceSoftmax<F32, F32, F32, Rank, Reduce, 256, 4, 64, 1, 8, 1, 4, 4>,
+        DeviceSoftmax<F32, F32, F32, Rank, Reduce, 256, 2, 128, 1, 8, 1, 4, 4>,
+        DeviceSoftmax<F32, F32, F32, Rank, Reduce, 256, 2, 128, 1, 16, 1, 4, 4>,
+        DeviceSoftmax<F32, F32, F32, Rank, Reduce, 256, 2, 128, 1, 32, 1, 4, 4>,
+        DeviceSoftmax<F32, F32, F32, Rank, Reduce, 256, 1, 256, 1, 8, 1, 4, 4>,
+        DeviceSoftmax<F32, F32, F32, Rank, Reduce, 256, 1, 256, 1, 16, 1, 4, 4>,
+        DeviceSoftmax<F32, F32, F32, Rank, Reduce, 256, 1, 256, 1, 32, 1, 4, 4>
+    // clang-format on
+    >;
+
+void add_device_softmax_f32_f32_rank3_instances(std::vector<DeviceNormalizationPtr>& instances)
+{
+    add_device_operation_instances(instances, device_softmax_f32_f32_instances<3, 1>{});
+    add_device_operation_instances(instances, device_softmax_f32_f32_instances<3, 2>{});
+}
+
+void add_device_softmax_f32_f32_rank4_instances(std::vector<DeviceNormalizationPtr>& instances)
+{
+    add_device_operation_instances(instances, device_softmax_f32_f32_instances<4, 1>{});
+    add_device_operation_instances(instances, device_softmax_f32_f32_instances<4, 2>{});
+    add_device_operation_instances(instances, device_softmax_f32_f32_instances<4, 3>{});
+}
+
+} // namespace device_normalization_instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/profiler/CMakeLists.txt b/profiler/CMakeLists.txt
index b5d341095b..57f83b2a63 100644
--- a/profiler/CMakeLists.txt
+++ b/profiler/CMakeLists.txt
@@ -22,6 +22,7 @@ set(PROFILER_SOURCE
     src/profile_conv_bwd_weight.cpp
     src/profile_batched_gemm_reduce.cpp
     src/profile_gemm_add_add_fastgelu.cpp
+    src/profile_normalization.cpp
 )
 
 add_executable(ckProfiler ${PROFILER_SOURCE})
@@ -46,4 +47,5 @@ target_link_libraries(ckProfiler PRIVATE device_conv2d_fwd_bias_relu_instance)
 target_link_libraries(ckProfiler PRIVATE device_conv2d_fwd_bias_relu_add_instance)
 target_link_libraries(ckProfiler PRIVATE device_convnd_bwd_data_instance)
 target_link_libraries(ckProfiler PRIVATE device_conv2d_bwd_weight_instance)
+target_link_libraries(ckProfiler PRIVATE device_normalization_instance)
 target_link_libraries(ckProfiler PRIVATE device_reduce_instance)
diff --git a/profiler/include/profile_normalization_impl.hpp b/profiler/include/profile_normalization_impl.hpp
new file mode 100644
index 0000000000..f7ecea43d5
--- /dev/null
+++ b/profiler/include/profile_normalization_impl.hpp
@@ -0,0 +1,243 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+
+#pragma once
+
+#include <iomanip>
+
+#include "ck/ck.hpp"
+#include "ck/tensor_operation/gpu/device/device_softmax.hpp"
+
+#include "ck/library/utility/check_err.hpp"
+#include "ck/library/utility/conv_util.hpp"
+#include "ck/library/host_tensor/device_memory.hpp"
+#include "ck/library/host_tensor/host_tensor.hpp"
+#include "ck/library/host_tensor/host_tensor_generator.hpp"
+#include "ck/library/reference_tensor_operation/cpu/reference_softmax.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace device_normalization_instance {
+
+void add_device_softmax_f16_f16_rank3_instances(std::vector<DeviceNormalizationPtr>&);
+void add_device_softmax_f16_f16_rank4_instances(std::vector<DeviceNormalizationPtr>&);
+
+void add_device_softmax_f32_f32_rank3_instances(std::vector<DeviceNormalizationPtr>&);
+void add_device_softmax_f32_f32_rank4_instances(std::vector<DeviceNormalizationPtr>&);
+
+} // namespace device_normalization_instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
+
+namespace ck {
+namespace profiler {
+
+enum struct NormType
+{
+    LAYERNORM,
+    BATCHNORM,
+    SOFTMAX,
+};
+
+enum struct NormDataType
+{
+    F32_F32, // in, out
+    F16_F16,
+    BF16_BF16,
+    INT8_INT8,
+};
+
+// clang-format off
+template <typename NormDataType> std::string type_to_string();
+template <> std::string type_to_string<float>()   { return "f32"; }
+template <> std::string type_to_string<half_t>()  { return "f16"; }
+template <> std::string type_to_string<bhalf_t>() { return "bf16"; }
+template <> std::string type_to_string<int8_t>()  { return "int8"; }
+template <> std::string type_to_string<int32_t>() { return "int32"; }
+// clang-format on
+
+template <typename InDataType, typename AccDataType, typename OutDataType>
+void profile_normalization_impl(int do_verification,
+                                int init_method,
+                                bool do_log,
+                                bool time_kernel,
+                                std::vector<index_t> in_length,
+                                std::vector<index_t> in_strides,
+                                std::vector<index_t> reduce_dims,
+                                AccDataType alpha,
+                                AccDataType beta,
+                                NormType norm_type)
+{
+    Tensor<InDataType> in = in_strides.empty() ? Tensor<InDataType>(in_length)
+                                               : Tensor<InDataType>(in_length, in_strides);
+    Tensor<OutDataType> out(in.mDesc);
+
+    switch(init_method)
+    {
+    // case 0: break;
+    case 0:
+        in.GenerateTensorValue(GeneratorTensor_1<InDataType>{});
+        out.GenerateTensorValue(GeneratorTensor_1<OutDataType>{});
+        break;
+    case 1:
+        in.GenerateTensorValue(GeneratorTensor_2<InDataType>{-5, 5});
+        out.GenerateTensorValue(GeneratorTensor_2<OutDataType>{-5, 5});
+        break;
+    default:
+        in.GenerateTensorValue(GeneratorTensor_3<InDataType>{0.0, 1.0});
+        out.GenerateTensorValue(GeneratorTensor_3<OutDataType>{-0.5, 0.5});
+    }
+
+    Tensor<OutDataType> out_ref(out);
+
+    DeviceMem in_dev(sizeof(InDataType) * in.mDesc.GetElementSpace());
+    DeviceMem out_dev(sizeof(OutDataType) * out.mDesc.GetElementSpace());
+    in_dev.ToDevice(in.mData.data());
+    out_dev.ToDevice(out.mData.data());
+
+    std::vector<index_t> i_in_lengths(in.mDesc.GetLengths().begin(), in.mDesc.GetLengths().end());
+    std::vector<index_t> i_in_strides(in.mDesc.GetStrides().begin(), in.mDesc.GetStrides().end());
+
+    // add device normalization instances
+    std::vector<tensor_operation::device::DeviceNormalizationPtr> instances;
+
+    if(norm_type == NormType::SOFTMAX)
+    {
+        if constexpr(is_same<InDataType, half_t>::value && is_same<OutDataType, half_t>::value &&
+                     is_same<AccDataType, float>::value)
+        {
+            if(in_length.size() == 3)
+                tensor_operation::device::device_normalization_instance::
+                    add_device_softmax_f16_f16_rank3_instances(instances);
+
+            if(in_length.size() == 4)
+                tensor_operation::device::device_normalization_instance::
+                    add_device_softmax_f16_f16_rank4_instances(instances);
+        }
+        else if constexpr(is_same<InDataType, float>::value && is_same<OutDataType, float>::value &&
+                          is_same<AccDataType, float>::value)
+        {
+            if(in_length.size() == 3)
+                tensor_operation::device::device_normalization_instance::
+                    add_device_softmax_f32_f32_rank3_instances(instances);
+
+            if(in_length.size() == 4)
+                tensor_operation::device::device_normalization_instance::
+                    add_device_softmax_f32_f32_rank4_instances(instances);
+        }
+    }
+
+    if(instances.size() <= 0)
+    {
+        throw std::runtime_error("wrong! no device normalization instance found");
+    }
+
+    std::string best_instance_name;
+    float best_avg_time   = std::numeric_limits<float>::max();
+    float best_gb_per_sec = 0;
+
+    for(auto& inst_ptr : instances)
+    {
+        // Is this user's responsibility to check if problem mismatches kernel instance (ie. rank 3
+        // problem to rank 4 kernel) other than invoking IsSupportedArgument()?
+        if(!(inst_ptr->GetRank() == static_cast<index_t>(i_in_lengths.size()) &&
+             inst_ptr->GetNumReduceDim() == static_cast<index_t>(reduce_dims.size())))
+        {
+            continue;
+        }
+
+        auto argument_ptr = inst_ptr->MakeArgumentPointer(i_in_lengths,
+                                                          i_in_strides,
+                                                          reduce_dims,
+                                                          &alpha,
+                                                          &beta,
+                                                          in_dev.GetDeviceBuffer(),
+                                                          out_dev.GetDeviceBuffer());
+
+        if(!inst_ptr->IsSupportedArgument(argument_ptr.get()))
+        {
+            std::cout << inst_ptr->GetTypeString() << " skipped due to unsupported argument: ";
+            LogRange(std::cout << "input lengths = [", in_length, ", ")
+                << "], "
+                << "scaler = [" << alpha << ", " << beta << "]." << std::endl;
+            return;
+        }
+
+        auto invoker_ptr = inst_ptr->MakeInvokerPointer();
+
+        float avg_time = invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, time_kernel});
+
+        std::size_t num_bytes =
+            in.mDesc.GetElementSize() * sizeof(InDataType) +
+            (beta == 0.0f ? 1 : 2) * out.mDesc.GetElementSize() * sizeof(OutDataType);
+
+        float gb_per_sec = num_bytes / 1.E6 / avg_time;
+
+        std::cout << "Perf: " << std::setw(10) << avg_time << " ms, " << gb_per_sec << " GB/s, "
+                  << inst_ptr->GetTypeString() << std::endl;
+
+        if(avg_time < best_avg_time)
+        {
+            best_instance_name = inst_ptr->GetTypeString();
+            best_avg_time      = avg_time;
+            best_gb_per_sec    = gb_per_sec;
+        }
+
+        if(do_verification)
+        {
+            // TODO: factory method to dynamically switch between different reference normalizations
+            using ReferenceFactory =
+                tensor_operation::host::ReferenceSoftmax<InDataType, OutDataType, AccDataType>;
+
+            ReferenceFactory{}.MakeInvoker().Run({in, out_ref, alpha, beta, reduce_dims});
+
+            out_dev.FromDevice(out.mData.data());
+
+            bool pass;
+            if(std::is_same<InDataType, int8_t>::value)
+            {
+                pass = ck::utils::check_err(
+                    out.mData, out_ref.mData, "Error: Incorrect results!", 0, 1);
+                if(do_log)
+                {
+                    LogRangeAsType<int>(std::cout << "in  : ", in.mData, ",") << std::endl;
+                    LogRangeAsType<int>(std::cout << "out_ref  : ", out_ref.mData, ",")
+                        << std::endl;
+                    LogRangeAsType<int>(std::cout << "out  : ", out.mData, ",") << std::endl;
+                }
+            }
+            else
+            {
+                pass = ck::utils::check_err(out.mData, out_ref.mData);
+                if(do_log)
+                {
+                    LogRangeAsType<float>(std::cout << "in  : ", in.mData, ",") << std::endl;
+                    LogRangeAsType<float>(std::cout << "out_ref  : ", out_ref.mData, ",")
+                        << std::endl;
+                    LogRangeAsType<float>(std::cout << "out  : ", out.mData, ",") << std::endl;
+                }
+            }
+
+            if(!pass)
+            {
+                std::cout << inst_ptr->GetTypeString() << " failed verification: ";
+                LogRange(std::cout << "input lengths = [", in_length, ", ")
+                    << "], "
+                    << "scaler = [" << alpha << ", " << beta << "]." << std::endl;
+            }
+        }
+    }
+    std::cout << "Best Perf for datatype = " << type_to_string<InDataType>() << "_"
+              << type_to_string<OutDataType>() << ", ";
+    LogRange(std::cout << "length = ", i_in_lengths, ",") << ", ";
+    LogRange(std::cout << "stride = ", i_in_strides, ",") << ", ";
+    LogRange(std::cout << "reduce dims ", reduce_dims, ",") << ", ";
+    std::cout << "alpha = " << alpha << ", "
+              << "beta = " << beta << ", " << best_avg_time << " ms, " << best_gb_per_sec
+              << " GB/s, " << best_instance_name << std::endl;
+}
+
+} // namespace profiler
+} // namespace ck
diff --git a/profiler/src/profile_normalization.cpp b/profiler/src/profile_normalization.cpp
new file mode 100644
index 0000000000..277a78a669
--- /dev/null
+++ b/profiler/src/profile_normalization.cpp
@@ -0,0 +1,134 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+
+#include <iostream>
+#include <vector>
+#include <unordered_map>
+
+#include "profiler/include/profile_normalization_impl.hpp"
+
+using ck::index_t;
+using ck::profiler::NormDataType;
+using ck::profiler::NormType;
+
+struct ArgParser
+{
+    std::unordered_map<std::string, NormType> norm_dict = {{"layernorm", NormType::LAYERNORM},
+                                                           {"batchnorm", NormType::BATCHNORM},
+                                                           {"softmax", NormType::SOFTMAX}};
+
+    std::unordered_map<std::string, std::vector<int>> long_opts = {
+        {"length", {}}, {"stride", {}}, {"reduce", {}}, {"alpha", {}}, {"beta", {}}};
+
+    bool parse_opt(int argc, char* argv[], const std::string& key, int i)
+    {
+        if(std::string("--") + key == argv[i])
+        {
+            int pos = i;
+            while(++i < argc && argv[i][0] != '-') {}
+            int end = i;
+            for(int j = pos + 1; j < end; j++)
+            {
+                long_opts[key].push_back(std::stoi(argv[j]));
+            }
+            return true;
+        }
+        return false;
+    }
+
+    void operator()(int argc, char* argv[])
+    {
+        for(auto& kv : long_opts)
+        {
+            for(int i = 1; i < argc; i++)
+            {
+                if(parse_opt(argc, argv, kv.first, i))
+                    break;
+            }
+        }
+    }
+};
+
+void print_help()
+{
+    std::cout << "arg1: tensor operation (layernorm/batchnorm/softmax)\n"
+              << "arg2: data type (0: fp32; 1: fp16; 2: bf16; 3: int8)\n"
+              << "arg3: verification (0: no; 1: yes)\n"
+              << "arg4: initialization (0: no init; 1: integer value; 2: decimal value)\n"
+              << "arg5: print tensor value (0: no; 1: yes)\n"
+              << "arg6: time kernel (0=n0, 1=yes)\n"
+              << "--length: tensor extents (e.g, --length 8 4 256) \n"
+              << "--stride: tensor strides (e.g, --stride 1024 256 1)\n"
+              << "--reduce: to-reduce dimensions (e.g, --reduce 2)\n"
+              << "--alpha: alpha scaling value\n"
+              << "--beta: beta scaling value\n"
+              << std::endl;
+}
+
+int profile_normalization(int argc, char* argv[])
+{
+    if(argc <= 2)
+    {
+        print_help();
+        return 0;
+    }
+
+    ArgParser arg_parser;
+
+    // short unnamed options
+    const NormType norm_type     = arg_parser.norm_dict[argv[1]];
+    const NormDataType data_type = static_cast<NormDataType>(std::stoi(argv[2]));
+    const bool do_verification   = std::stoi(argv[3]);
+    const int init_method        = std::stoi(argv[4]);
+    const bool do_log            = std::stoi(argv[5]);
+    const bool time_kernel       = std::stoi(argv[6]);
+
+    // parse the long options
+    arg_parser(argc, argv);
+    const std::vector<index_t> length = arg_parser.long_opts["length"];
+    const std::vector<index_t> stride = arg_parser.long_opts["stride"];
+    const std::vector<index_t> reduce = arg_parser.long_opts["reduce"];
+    const index_t alpha =
+        arg_parser.long_opts["alpha"].empty() ? 1 : arg_parser.long_opts["alpha"][0];
+    const index_t beta = arg_parser.long_opts["beta"].empty() ? 0 : arg_parser.long_opts["beta"][0];
+
+    if(data_type == NormDataType::F16_F16)
+    {
+        ck::profiler::profile_normalization_impl<ck::half_t, float, ck::half_t>(do_verification,
+                                                                                init_method,
+                                                                                do_log,
+                                                                                time_kernel,
+                                                                                length,
+                                                                                stride,
+                                                                                reduce,
+                                                                                float(alpha),
+                                                                                float(beta),
+                                                                                norm_type);
+    }
+    else if(data_type == NormDataType::F32_F32)
+    {
+        ck::profiler::profile_normalization_impl<float, float, float>(do_verification,
+                                                                      init_method,
+                                                                      do_log,
+                                                                      time_kernel,
+                                                                      length,
+                                                                      stride,
+                                                                      reduce,
+                                                                      float(alpha),
+                                                                      float(beta),
+                                                                      norm_type);
+    }
+    else
+    {
+        throw std::runtime_error("not implemented yet");
+    }
+
+    return 0;
+}
+
+// hijack main() for quick debugging
+// int main(int argc, char* argv[])
+// {
+//     profile_normalization(argc, argv);
+//     return 0;
+// }
diff --git a/profiler/src/profiler.cpp b/profiler/src/profiler.cpp
index e30d921da2..e30d06d0c7 100644
--- a/profiler/src/profiler.cpp
+++ b/profiler/src/profiler.cpp
@@ -20,6 +20,7 @@ int profile_conv_fwd_bias_relu_add(int, char*[]);
 int profile_convnd_fwd(int argc, char* argv[]);
 int profile_convnd_bwd_data(int, char*[], int);
 int profile_conv_bwd_weight(int, char*[]);
+int profile_normalization(int, char*[]);
 int profile_reduce(int, char*[]);
 
 static void print_helper_message()
@@ -130,6 +131,11 @@ int main(int argc, char* argv[])
     {
         return profile_gemm_add_add_fastgelu(argc, argv);
     }
+    else if(strcmp(argv[1], "batchnorm") == 0 || strcmp(argv[1], "layernorm") == 0 ||
+            strcmp(argv[1], "softmax") == 0)
+    {
+        return profile_normalization(argc, argv);
+    }
     else
     {
         print_helper_message();
diff --git a/test/softmax/CMakeLists.txt b/test/softmax/CMakeLists.txt
index 50ec04f9e4..da80e372ea 100644
--- a/test/softmax/CMakeLists.txt
+++ b/test/softmax/CMakeLists.txt
@@ -2,7 +2,10 @@ add_custom_target(test_softmax)
 
 add_gtest_executable(test_softmax_fp32 test_softmax_fp32.cpp)
 add_gtest_executable(test_softmax_fp16 test_softmax_fp16.cpp)
+add_gtest_executable(test_softmax_int8 test_softmax_int8.cpp)
 target_link_libraries(test_softmax_fp32 PRIVATE host_tensor)
 target_link_libraries(test_softmax_fp16 PRIVATE host_tensor)
+target_link_libraries(test_softmax_int8 PRIVATE host_tensor)
 add_dependencies(test_softmax test_softmax_fp32)
-add_dependencies(test_softmax test_softmax_fp16)
\ No newline at end of file
+add_dependencies(test_softmax test_softmax_fp16)
+add_dependencies(test_softmax test_softmax_int8)
\ No newline at end of file
diff --git a/test/softmax/test_softmax_fp16.cpp b/test/softmax/test_softmax_fp16.cpp
index 8eca9a20a3..cce6a422b6 100644
--- a/test/softmax/test_softmax_fp16.cpp
+++ b/test/softmax/test_softmax_fp16.cpp
@@ -15,14 +15,19 @@ class TestSoftmaxFP16 : public ck::TestSoftmax<Tuple>
 // clang-format off
 using KernelTypes = ::testing::Types<
 // InDataType, AccDataType, OutDataType, Rank, NumReduceDim, BlockSize, MThreadClusterSize, KThreadClusterSize, MThreadSliceSize, KThreadSliceSize, InSrcVectorDim, InSrcVectorSize, OutDstVectorSize>
+    std::tuple<ck::half_t, float, float, I<3>, I<1>, I<256>, I<8>, I<32>, I<1>, I<8>, I<1>, I<8>, I<4>>, // mixed precision
     std::tuple<ck::half_t, float, ck::half_t, I<3>, I<1>, I<256>, I<8>, I<32>, I<1>, I<8>, I<1>, I<8>, I<8>>,
     std::tuple<ck::half_t, float, ck::half_t, I<3>, I<1>, I<256>, I<4>, I<64>, I<1>, I<8>, I<1>, I<8>, I<8>>,
     std::tuple<ck::half_t, float, ck::half_t, I<3>, I<1>, I<256>, I<2>, I<128>, I<1>, I<8>, I<1>, I<8>, I<8>>,
     std::tuple<ck::half_t, float, ck::half_t, I<3>, I<1>, I<256>, I<1>, I<256>, I<1>, I<8>, I<1>, I<8>, I<8>>,
+    std::tuple<ck::half_t, float, ck::half_t, I<3>, I<1>, I<256>, I<1>, I<256>, I<1>, I<16>, I<1>, I<8>, I<8>>,
+    std::tuple<ck::half_t, float, ck::half_t, I<3>, I<1>, I<256>, I<1>, I<256>, I<1>, I<32>, I<1>, I<8>, I<8>>,
     std::tuple<ck::half_t, float, ck::half_t, I<3>, I<2>, I<256>, I<8>, I<32>, I<1>, I<8>, I<1>, I<8>, I<8>>,
     std::tuple<ck::half_t, float, ck::half_t, I<3>, I<2>, I<256>, I<4>, I<64>, I<1>, I<8>, I<1>, I<8>, I<8>>,
     std::tuple<ck::half_t, float, ck::half_t, I<3>, I<2>, I<256>, I<2>, I<128>, I<1>, I<8>, I<1>, I<8>, I<8>>,
-    std::tuple<ck::half_t, float, ck::half_t, I<3>, I<2>, I<256>, I<1>, I<256>, I<1>, I<8>, I<1>, I<8>, I<8>>
+    std::tuple<ck::half_t, float, ck::half_t, I<3>, I<2>, I<256>, I<1>, I<256>, I<1>, I<8>, I<1>, I<8>, I<8>>,
+    std::tuple<ck::half_t, float, ck::half_t, I<3>, I<2>, I<256>, I<1>, I<256>, I<1>, I<16>, I<1>, I<8>, I<8>>,
+    std::tuple<ck::half_t, float, ck::half_t, I<3>, I<2>, I<256>, I<1>, I<256>, I<1>, I<32>, I<1>, I<8>, I<8>>
     >;
 // clang-format on
 TYPED_TEST_SUITE(TestSoftmaxFP16, KernelTypes);
diff --git a/test/softmax/test_softmax_fp32.cpp b/test/softmax/test_softmax_fp32.cpp
index b0db3cec75..4301a5ae2f 100644
--- a/test/softmax/test_softmax_fp32.cpp
+++ b/test/softmax/test_softmax_fp32.cpp
@@ -15,14 +15,19 @@ class TestSoftmaxFP32 : public ck::TestSoftmax<Tuple>
 // clang-format off
 using KernelTypes = ::testing::Types<
 // InDataType, AccDataType, OutDataType, Rank, NumReduceDim, BlockSize, MThreadClusterSize, KThreadClusterSize, MThreadSliceSize, KThreadSliceSize, InSrcVectorDim, InSrcVectorSize, OutDstVectorSize>
+    std::tuple<float, float, ck::half_t, I<3>, I<2>, I<256>, I<1>, I<256>, I<1>, I<8>, I<1>, I<4>, I<8>>, // mixed precision
     std::tuple<float, float, float, I<3>, I<1>, I<256>, I<8>, I<32>, I<1>, I<4>, I<1>, I<4>, I<4>>,
     std::tuple<float, float, float, I<3>, I<1>, I<256>, I<4>, I<64>, I<1>, I<4>, I<1>, I<4>, I<4>>,
     std::tuple<float, float, float, I<3>, I<1>, I<256>, I<2>, I<128>, I<1>, I<4>, I<1>, I<4>, I<4>>,
     std::tuple<float, float, float, I<3>, I<1>, I<256>, I<1>, I<256>, I<1>, I<4>, I<1>, I<4>, I<4>>,
+    std::tuple<float, float, float, I<3>, I<1>, I<256>, I<1>, I<256>, I<1>, I<8>, I<1>, I<4>, I<4>>,
+    std::tuple<float, float, float, I<3>, I<1>, I<256>, I<1>, I<256>, I<1>, I<16>, I<1>, I<4>, I<4>>,
     std::tuple<float, float, float, I<3>, I<2>, I<256>, I<8>, I<32>, I<1>, I<4>, I<1>, I<4>, I<4>>,
     std::tuple<float, float, float, I<3>, I<2>, I<256>, I<4>, I<64>, I<1>, I<4>, I<1>, I<4>, I<4>>,
     std::tuple<float, float, float, I<3>, I<2>, I<256>, I<2>, I<128>, I<1>, I<4>, I<1>, I<4>, I<4>>,
-    std::tuple<float, float, float, I<3>, I<2>, I<256>, I<1>, I<256>, I<1>, I<4>, I<1>, I<4>, I<4>>
+    std::tuple<float, float, float, I<3>, I<2>, I<256>, I<1>, I<256>, I<1>, I<4>, I<1>, I<4>, I<4>>,
+    std::tuple<float, float, float, I<3>, I<2>, I<256>, I<1>, I<256>, I<1>, I<8>, I<1>, I<4>, I<4>>,
+    std::tuple<float, float, float, I<3>, I<2>, I<256>, I<1>, I<256>, I<1>, I<16>, I<1>, I<4>, I<4>>
     >;
 // clang-format on
 TYPED_TEST_SUITE(TestSoftmaxFP32, KernelTypes);
diff --git a/test/softmax/test_softmax_int8.cpp b/test/softmax/test_softmax_int8.cpp
new file mode 100644
index 0000000000..dde165295e
--- /dev/null
+++ b/test/softmax/test_softmax_int8.cpp
@@ -0,0 +1,30 @@
+#include "gtest/gtest.h"
+#include "test_softmax_util.hpp"
+
+template <ck::index_t N>
+using I = ck::Number<N>;
+
+template <typename Tuple>
+class TestSoftmaxINT8 : public ck::TestSoftmax<Tuple>
+{
+};
+
+// clang-format off
+using KernelTypes = ::testing::Types<
+// InDataType, AccDataType, OutDataType, Rank, NumReduceDim, BlockSize, MThreadClusterSize, KThreadClusterSize, MThreadSliceSize, KThreadSliceSize, InSrcVectorDim, InSrcVectorSize, OutDstVectorSize>
+    std::tuple<int8_t, float, int8_t, I<3>, I<1>, I<256>, I<8>, I<32>, I<1>, I<16>, I<1>, I<16>, I<16>>,
+    std::tuple<int8_t, float, int8_t, I<3>, I<1>, I<256>, I<4>, I<64>, I<1>, I<16>, I<1>, I<16>, I<16>>,
+    std::tuple<int8_t, float, int8_t, I<3>, I<1>, I<256>, I<2>, I<128>, I<1>, I<16>, I<1>, I<16>, I<16>>,
+    std::tuple<int8_t, float, int8_t, I<3>, I<1>, I<256>, I<1>, I<256>, I<1>, I<16>, I<1>, I<16>, I<16>>,
+    std::tuple<int8_t, float, int8_t, I<3>, I<1>, I<256>, I<1>, I<256>, I<1>, I<32>, I<1>, I<16>, I<16>>,
+    std::tuple<int8_t, float, int8_t, I<3>, I<1>, I<256>, I<1>, I<256>, I<1>, I<64>, I<1>, I<16>, I<16>>,
+    std::tuple<int8_t, float, int8_t, I<3>, I<2>, I<256>, I<8>, I<32>, I<1>, I<16>, I<1>, I<16>, I<16>>,
+    std::tuple<int8_t, float, int8_t, I<3>, I<2>, I<256>, I<4>, I<64>, I<1>, I<16>, I<1>, I<16>, I<16>>,
+    std::tuple<int8_t, float, int8_t, I<3>, I<2>, I<256>, I<2>, I<128>, I<1>, I<16>, I<1>, I<16>, I<16>>,
+    std::tuple<int8_t, float, int8_t, I<3>, I<2>, I<256>, I<1>, I<256>, I<1>, I<16>, I<1>, I<16>, I<16>>,
+    std::tuple<int8_t, float, int8_t, I<3>, I<2>, I<256>, I<1>, I<256>, I<1>, I<32>, I<1>, I<16>, I<16>>,
+    std::tuple<int8_t, float, int8_t, I<3>, I<2>, I<256>, I<1>, I<256>, I<1>, I<64>, I<1>, I<16>, I<16>>
+    >;
+// clang-format on
+TYPED_TEST_SUITE(TestSoftmaxINT8, KernelTypes);
+TYPED_TEST(TestSoftmaxINT8, Test_INT8) { this->Run(); }
diff --git a/test/softmax/test_softmax_util.hpp b/test/softmax/test_softmax_util.hpp
index d54cf10225..2ca3b47abc 100644
--- a/test/softmax/test_softmax_util.hpp
+++ b/test/softmax/test_softmax_util.hpp
@@ -1,6 +1,8 @@
 // SPDX-License-Identifier: MIT
 // Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
 
+#pragma once
+
 #include <vector>
 #include <iostream>
 #include <gtest/gtest.h>
@@ -16,6 +18,18 @@
 
 namespace ck {
 
+template <typename Range>
+std::string serialize_range(const Range& range)
+{
+    std::stringstream ss;
+    for(auto& r : range)
+    {
+        ss << r << ", ";
+    }
+    std::string str = ss.str();
+    return std::string(str.begin(), str.end() - 2);
+}
+
 template <typename Tuple>
 class TestSoftmax : public ::testing::Test
 {
@@ -80,23 +94,43 @@ class TestSoftmax : public ::testing::Test
         auto argument_ptr    = device_instance.MakeArgumentPointer(i_in_lengths,
                                                                 i_in_strides,
                                                                 reduce_dims,
-                                                                alpha,
-                                                                beta,
+                                                                &alpha,
+                                                                &beta,
                                                                 in_dev.GetDeviceBuffer(),
                                                                 out_dev.GetDeviceBuffer());
 
         if(!device_instance.IsSupportedArgument(argument_ptr.get()))
         {
-            FAIL() << "Unsupported argument";
+            // std::cout << "Skipped due to unsupported argument: "
+            //           << "input lengths = [" << serialize_range(in_length) << "], "
+            //           << "scaler = [" << alpha << ", " << beta << "]." << std::endl;
+            return;
         }
 
         auto invoker_ptr = device_instance.MakeInvokerPointer();
         invoker_ptr->Run(argument_ptr.get());
 
-        ref_instance_invoker_.Run({in, out_ref, alpha, beta, Rank, reduce_dims});
+        ref_instance_invoker_.Run({in, out_ref, alpha, beta, reduce_dims});
 
         out_dev.FromDevice(out.mData.data());
-        EXPECT_TRUE(ck::utils::check_err(out.mData, out_ref.mData));
+
+        bool pass;
+
+        if(std::is_same<InDataType, int8_t>::value)
+        {
+            EXPECT_TRUE(pass = ck::utils::check_err(
+                            out.mData, out_ref.mData, "Error: Incorrect results!", 0, 1));
+        }
+        else
+        {
+            EXPECT_TRUE(pass = ck::utils::check_err(out.mData, out_ref.mData));
+        }
+
+        if(!pass)
+        {
+            FAIL() << "Failure in input lengths = [" << serialize_range(in_length) << "], "
+                   << "scaler = [" << alpha << ", " << beta << "].";
+        }
     }
 
     void Run()
@@ -105,13 +139,14 @@ class TestSoftmax : public ::testing::Test
         {
             for(auto scale : this->scales_)
             {
-                this->RunSingle(in_length, std::get<0>(scale), std::get<1>(scale));
+                this->RunSingle(in_length, scale[0], scale[1]);
             }
         }
     }
 
-    std::vector<std::vector<index_t>> in_lengths_ = {{1, 8, 128}, {2, 128, 1024}, {3, 9, 1032}};
-    std::vector<std::tuple<AccDataType, AccDataType>> scales_ = {{1, 0}, {2, 2}, {0, 1}};
+    std::vector<std::vector<index_t>> in_lengths_ = {
+        {1, 8, 128}, {2, 128, 1024}, {3, 9, 1032}, {4, 4, 2048}, {8, 1, 8192}};
+    std::vector<std::vector<AccDataType>> scales_ = {{1, 0}, {1, 1}, {0, 1}, {2, 2}};
 
     typename ReferenceInstance::Invoker ref_instance_invoker_;
 };