Softmax client example (#396)

* Update Softmax device operation interface. * Update ckProfiler. * Update Softmax UT. * Update example. * Client example. * Clang format Co-authored-by: Adam Osewski <aosewski@amd.com>
2026-05-11 17:00:18 +00:00 · 2022-09-06 19:22:48 +02:00
parent 7589116121
commit 3da5c19e62
13 changed files with 738 additions and 331 deletions
--- a/library/include/ck/library/tensor_operation_instance/gpu/softmax.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/softmax.hpp
@@ -0,0 +1,71 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+
+#pragma once
+
+#include <type_traits>
+#include <vector>
+
+#include "ck/ck.hpp"
+#include "ck/library/tensor_operation_instance/device_operation_instance_factory.hpp"
+#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
+#include "ck/tensor_operation/gpu/device/device_softmax.hpp"
+#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
+#include "ck/utility/data_type.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+using F16         = ck::half_t;
+using F32         = float;
+using PassThrough = ck::tensor_operation::element_wise::PassThrough;
+
+void add_device_softmax_f16_f16_rank3_instances(
+    std::vector<DeviceSoftmaxPtr<F16, F32, F16, PassThrough, PassThrough, 3>>&);
+void add_device_softmax_f16_f16_rank4_instances(
+    std::vector<DeviceSoftmaxPtr<F16, F32, F16, PassThrough, PassThrough, 4>>&);
+
+void add_device_softmax_f32_f32_rank3_instances(
+    std::vector<DeviceSoftmaxPtr<F32, F32, F32, PassThrough, PassThrough, 3>>&);
+void add_device_softmax_f32_f32_rank4_instances(
+    std::vector<DeviceSoftmaxPtr<F32, F32, F32, PassThrough, PassThrough, 4>>&);
+
+template <typename InDataType, typename AccDataType, typename OutDataType, index_t Rank>
+struct DeviceOperationInstanceFactory<
+    ck::tensor_operation::device::
+        DeviceSoftmax<InDataType, AccDataType, OutDataType, PassThrough, PassThrough, Rank>>
+{
+    using DeviceOp =
+        DeviceSoftmax<InDataType, AccDataType, OutDataType, PassThrough, PassThrough, Rank>;
+
+    static auto GetInstances()
+    {
+        std::vector<std::unique_ptr<DeviceOp>> op_ptrs;
+
+        if constexpr(std::is_same_v<InDataType, F16> && std::is_same_v<AccDataType, F32> &&
+                     std::is_same_v<OutDataType, F16>)
+        {
+            if constexpr(Rank == 3)
+                add_device_softmax_f16_f16_rank3_instances(op_ptrs);
+            else if constexpr(Rank == 4)
+                add_device_softmax_f16_f16_rank4_instances(op_ptrs);
+        }
+        else if constexpr(std::is_same_v<InDataType, F32> && std::is_same_v<AccDataType, F32> &&
+                          std::is_same_v<OutDataType, F32>)
+        {
+            if constexpr(Rank == 3)
+                add_device_softmax_f32_f32_rank3_instances(op_ptrs);
+            else if constexpr(Rank == 4)
+                add_device_softmax_f32_f32_rank4_instances(op_ptrs);
+        }
+
+        return op_ptrs;
+    }
+};
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
--- a/library/src/tensor_operation_instance/gpu/CMakeLists.txt
+++ b/library/src/tensor_operation_instance/gpu/CMakeLists.txt
@@ -78,6 +78,7 @@ target_include_directories(device_operations PUBLIC
    $<INSTALL_INTERFACE:${CMAKE_INSTALL_INCLUDEDIR}/ck/tensor>
    $<INSTALL_INTERFACE:${CMAKE_INSTALL_INCLUDEDIR}/ck/problem_transform>
    $<INSTALL_INTERFACE:${CMAKE_INSTALL_INCLUDEDIR}/ck/tensor_operation/gpu/device>
+    $<INSTALL_INTERFACE:${CMAKE_INSTALL_INCLUDEDIR}/ck/tensor_operation/gpu/device/impl>
    $<INSTALL_INTERFACE:${CMAKE_INSTALL_INCLUDEDIR}/ck/tensor_operation/gpu/grid>
    $<INSTALL_INTERFACE:${CMAKE_INSTALL_INCLUDEDIR}/ck/tensor_operation/gpu/block>
    $<INSTALL_INTERFACE:${CMAKE_INSTALL_INCLUDEDIR}/ck/tensor_operation/gpu/warp>
--- a/library/src/tensor_operation_instance/gpu/normalization/device_softmax_f16_f16_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/normalization/device_softmax_f16_f16_instance.cpp
@@ -1,43 +1,51 @@
 // SPDX-License-Identifier: MIT
 // Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.

-#include "ck/ck.hpp"
-#include "ck/tensor_operation/gpu/device/device_softmax.hpp"
-#include "ck/utility/data_type.hpp"
+#include <tuple>
+#include <vector>

+#include "ck/ck.hpp"
 #include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+#include "ck/tensor_operation/gpu/device/impl/device_softmax_impl.hpp"
+#include "ck/tensor_operation/gpu/element/unary_element_wise_operation.hpp"
+#include "ck/utility/data_type.hpp"

 namespace ck {
 namespace tensor_operation {
 namespace device {
 namespace instance {

-using F16 = ck::half_t;
-using F32 = float;
+namespace {
+using F16  = ck::half_t;
+using F32  = float;
+using Pass = ck::tensor_operation::element_wise::PassThrough;
+} // namespace

 template <index_t Rank, index_t Reduce>
 using device_softmax_f16_f16_instances = std::tuple<
    // clang-format off
-        // InDataType, AccDataType, OutDataType, Rank, NumReduceDim, BlockSize, MThreadClusterSize, KThreadClusterSize, MThreadSliceSize, KThreadSliceSize, InSrcVectorDim, InSrcVectorSize, OutDstVectorSize>
-        DeviceSoftmax<F16, F32, F16, Rank, Reduce, 256, 8, 32, 1, 8, 1, 1, 1>, // fallback kernel
-        DeviceSoftmax<F16, F32, F16, Rank, Reduce, 256, 8, 32, 1, 8, 1, 8, 8>,
-        DeviceSoftmax<F16, F32, F16, Rank, Reduce, 256, 4, 64, 1, 8, 1, 8, 8>,
-        DeviceSoftmax<F16, F32, F16, Rank, Reduce, 256, 2, 128, 1, 8, 1, 8, 8>,
-        DeviceSoftmax<F16, F32, F16, Rank, Reduce, 256, 2, 128, 1, 16, 1, 8, 8>,
-        DeviceSoftmax<F16, F32, F16, Rank, Reduce, 256, 2, 128, 1, 32, 1, 8, 8>,
-        DeviceSoftmax<F16, F32, F16, Rank, Reduce, 256, 1, 256, 1, 8, 1, 8, 8>,
-        DeviceSoftmax<F16, F32, F16, Rank, Reduce, 256, 1, 256, 1, 16, 1, 8, 8>,
-        DeviceSoftmax<F16, F32, F16, Rank, Reduce, 256, 1, 256, 1, 32, 1, 8, 8>
+    //                InDataType, AccDataType, OutDataType, InElementwiseOp, AccElementwiseOp, Rank, NumReduceDim, BlockSize, MThreadClusterSize, KThreadClusterSize, MThreadSliceSize, KThreadSliceSize, InSrcVectorDim, InSrcVectorSize, OutDstVectorSize>
+    DeviceSoftmaxImpl<       F16,         F32,         F16,            Pass,             Pass, Rank,       Reduce,       256,                  8,                 32,                1,                8,              1,               1,              1>, // fallback kernel
+    DeviceSoftmaxImpl<       F16,         F32,         F16,            Pass,             Pass, Rank,       Reduce,       256,                  8,                 32,                1,                8,              1,               8,              8>,
+    DeviceSoftmaxImpl<       F16,         F32,         F16,            Pass,             Pass, Rank,       Reduce,       256,                  4,                 64,                1,                8,              1,               8,              8>,
+    DeviceSoftmaxImpl<       F16,         F32,         F16,            Pass,             Pass, Rank,       Reduce,       256,                  2,                128,                1,                8,              1,               8,              8>,
+    DeviceSoftmaxImpl<       F16,         F32,         F16,            Pass,             Pass, Rank,       Reduce,       256,                  2,                128,                1,               16,              1,               8,              8>,
+    DeviceSoftmaxImpl<       F16,         F32,         F16,            Pass,             Pass, Rank,       Reduce,       256,                  2,                128,                1,               32,              1,               8,              8>,
+    DeviceSoftmaxImpl<       F16,         F32,         F16,            Pass,             Pass, Rank,       Reduce,       256,                  1,                256,                1,                8,              1,               8,              8>,
+    DeviceSoftmaxImpl<       F16,         F32,         F16,            Pass,             Pass, Rank,       Reduce,       256,                  1,                256,                1,               16,              1,               8,              8>,
+    DeviceSoftmaxImpl<       F16,         F32,         F16,            Pass,             Pass, Rank,       Reduce,       256,                  1,                256,                1,               32,              1,               8,              8>
    // clang-format on
    >;

-void add_device_softmax_f16_f16_rank3_instances(std::vector<DeviceNormalizationPtr>& instances)
+void add_device_softmax_f16_f16_rank3_instances(
+    std::vector<DeviceSoftmaxPtr<F16, F32, F16, Pass, Pass, 3>>& instances)
 {
    add_device_operation_instances(instances, device_softmax_f16_f16_instances<3, 1>{});
    add_device_operation_instances(instances, device_softmax_f16_f16_instances<3, 2>{});
 }

-void add_device_softmax_f16_f16_rank4_instances(std::vector<DeviceNormalizationPtr>& instances)
+void add_device_softmax_f16_f16_rank4_instances(
+    std::vector<DeviceSoftmaxPtr<F16, F32, F16, Pass, Pass, 4>>& instances)
 {
    add_device_operation_instances(instances, device_softmax_f16_f16_instances<4, 1>{});
    add_device_operation_instances(instances, device_softmax_f16_f16_instances<4, 2>{});
--- a/library/src/tensor_operation_instance/gpu/normalization/device_softmax_f32_f32_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/normalization/device_softmax_f32_f32_instance.cpp
@@ -1,41 +1,49 @@
 // SPDX-License-Identifier: MIT
 // Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.

+#include <tuple>
+#include <vector>
+
 #include "ck/ck.hpp"
 #include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
-#include "ck/tensor_operation/gpu/device/device_softmax.hpp"
-#include "ck/utility/data_type.hpp"
+#include "ck/tensor_operation/gpu/device/impl/device_softmax_impl.hpp"
+#include "ck/tensor_operation/gpu/element/unary_element_wise_operation.hpp"

 namespace ck {
 namespace tensor_operation {
 namespace device {
 namespace instance {

-using F32 = float;
+namespace {
+using F32  = float;
+using Pass = ck::tensor_operation::element_wise::PassThrough;
+} // namespace

 template <index_t Rank, index_t Reduce>
 using device_softmax_f32_f32_instances = std::tuple<
    // clang-format off
-        // InDataType, AccDataType, OutDataType, Rank, NumReduceDim, BlockSize, MThreadClusterSize, KThreadClusterSize, MThreadSliceSize, KThreadSliceSize, InSrcVectorDim, InSrcVectorSize, OutDstVectorSize>
-        DeviceSoftmax<F32, F32, F32, Rank, Reduce, 256, 8, 32, 1, 8, 1, 1, 1>, // fallback kernel
-        DeviceSoftmax<F32, F32, F32, Rank, Reduce, 256, 8, 32, 1, 8, 1, 4, 4>,
-        DeviceSoftmax<F32, F32, F32, Rank, Reduce, 256, 4, 64, 1, 8, 1, 4, 4>,
-        DeviceSoftmax<F32, F32, F32, Rank, Reduce, 256, 2, 128, 1, 8, 1, 4, 4>,
-        DeviceSoftmax<F32, F32, F32, Rank, Reduce, 256, 2, 128, 1, 16, 1, 4, 4>,
-        DeviceSoftmax<F32, F32, F32, Rank, Reduce, 256, 2, 128, 1, 32, 1, 4, 4>,
-        DeviceSoftmax<F32, F32, F32, Rank, Reduce, 256, 1, 256, 1, 8, 1, 4, 4>,
-        DeviceSoftmax<F32, F32, F32, Rank, Reduce, 256, 1, 256, 1, 16, 1, 4, 4>,
-        DeviceSoftmax<F32, F32, F32, Rank, Reduce, 256, 1, 256, 1, 32, 1, 4, 4>
+    //                InDataType, AccDataType, OutDataType, InElementwiseOp, AccElementwiseOp, Rank, NumReduceDim, BlockSize, MThreadClusterSize, KThreadClusterSize, MThreadSliceSize, KThreadSliceSize, InSrcVectorDim, InSrcVectorSize, OutDstVectorSize>
+    DeviceSoftmaxImpl<       F32,         F32,         F32,            Pass,             Pass, Rank,       Reduce,       256,                  8,                 32,                1,                8,              1,               1,               1>, // fallback kernel
+    DeviceSoftmaxImpl<       F32,         F32,         F32,            Pass,             Pass, Rank,       Reduce,       256,                  8,                 32,                1,                8,              1,               4,               4>,
+    DeviceSoftmaxImpl<       F32,         F32,         F32,            Pass,             Pass, Rank,       Reduce,       256,                  4,                 64,                1,                8,              1,               4,               4>,
+    DeviceSoftmaxImpl<       F32,         F32,         F32,            Pass,             Pass, Rank,       Reduce,       256,                  2,                128,                1,                8,              1,               4,               4>,
+    DeviceSoftmaxImpl<       F32,         F32,         F32,            Pass,             Pass, Rank,       Reduce,       256,                  2,                128,                1,               16,              1,               4,               4>,
+    DeviceSoftmaxImpl<       F32,         F32,         F32,            Pass,             Pass, Rank,       Reduce,       256,                  2,                128,                1,               32,              1,               4,               4>,
+    DeviceSoftmaxImpl<       F32,         F32,         F32,            Pass,             Pass, Rank,       Reduce,       256,                  1,                256,                1,                8,              1,               4,               4>,
+    DeviceSoftmaxImpl<       F32,         F32,         F32,            Pass,             Pass, Rank,       Reduce,       256,                  1,                256,                1,               16,              1,               4,               4>,
+    DeviceSoftmaxImpl<       F32,         F32,         F32,            Pass,             Pass, Rank,       Reduce,       256,                  1,                256,                1,               32,              1,               4,               4>
    // clang-format on
    >;

-void add_device_softmax_f32_f32_rank3_instances(std::vector<DeviceNormalizationPtr>& instances)
+void add_device_softmax_f32_f32_rank3_instances(
+    std::vector<DeviceSoftmaxPtr<F32, F32, F32, Pass, Pass, 3>>& instances)
 {
    add_device_operation_instances(instances, device_softmax_f32_f32_instances<3, 1>{});
    add_device_operation_instances(instances, device_softmax_f32_f32_instances<3, 2>{});
 }

-void add_device_softmax_f32_f32_rank4_instances(std::vector<DeviceNormalizationPtr>& instances)
+void add_device_softmax_f32_f32_rank4_instances(
+    std::vector<DeviceSoftmaxPtr<F32, F32, F32, Pass, Pass, 4>>& instances)
 {
    add_device_operation_instances(instances, device_softmax_f32_f32_instances<4, 1>{});
    add_device_operation_instances(instances, device_softmax_f32_f32_instances<4, 2>{});