Standalone sweep once softmax kernel w/ ckProfiler (#295)

* use 'sweep once' softmax kernel where applicable * threadwise copy's dst buffer can specify invalid element value * add int8 in/out float compute softmax support give a bit of leeway for int absolute tolerance as there's a single data point of all test cases showing off-by-1 error * format * softmax inherits DeviceNormalization * softmax profiler stub * tighten up reference softmax interface * example prints tensor dimension * add fp32 to softmax profiler * rename header * hook with ckProfiler * format * resolve merge conflict * resolve merge conflicts * update normalization profiler help string * resolve conflict * typo * remove residual * softmax profiler: address feedback * test for mixed precision input/output * fully qualify ck::math::isnan * add comment for device normalization interface * revise wording * constness for alpha/beta scaler pointer
2026-04-20 06:49:15 +00:00 · 2022-07-01 01:08:50 +08:00
parent eccf8773a6
commit 93c99f3d87
24 changed files with 809 additions and 106 deletions
--- a/test/softmax/CMakeLists.txt
+++ b/test/softmax/CMakeLists.txt
@@ -2,7 +2,10 @@ add_custom_target(test_softmax)

 add_gtest_executable(test_softmax_fp32 test_softmax_fp32.cpp)
 add_gtest_executable(test_softmax_fp16 test_softmax_fp16.cpp)
+add_gtest_executable(test_softmax_int8 test_softmax_int8.cpp)
 target_link_libraries(test_softmax_fp32 PRIVATE host_tensor)
 target_link_libraries(test_softmax_fp16 PRIVATE host_tensor)
+target_link_libraries(test_softmax_int8 PRIVATE host_tensor)
 add_dependencies(test_softmax test_softmax_fp32)
-add_dependencies(test_softmax test_softmax_fp16)
+add_dependencies(test_softmax test_softmax_fp16)
+add_dependencies(test_softmax test_softmax_int8)
--- a/test/softmax/test_softmax_fp16.cpp
+++ b/test/softmax/test_softmax_fp16.cpp
@@ -15,14 +15,19 @@ class TestSoftmaxFP16 : public ck::TestSoftmax<Tuple>
 // clang-format off
 using KernelTypes = ::testing::Types<
 // InDataType, AccDataType, OutDataType, Rank, NumReduceDim, BlockSize, MThreadClusterSize, KThreadClusterSize, MThreadSliceSize, KThreadSliceSize, InSrcVectorDim, InSrcVectorSize, OutDstVectorSize>
+    std::tuple<ck::half_t, float, float, I<3>, I<1>, I<256>, I<8>, I<32>, I<1>, I<8>, I<1>, I<8>, I<4>>, // mixed precision
    std::tuple<ck::half_t, float, ck::half_t, I<3>, I<1>, I<256>, I<8>, I<32>, I<1>, I<8>, I<1>, I<8>, I<8>>,
    std::tuple<ck::half_t, float, ck::half_t, I<3>, I<1>, I<256>, I<4>, I<64>, I<1>, I<8>, I<1>, I<8>, I<8>>,
    std::tuple<ck::half_t, float, ck::half_t, I<3>, I<1>, I<256>, I<2>, I<128>, I<1>, I<8>, I<1>, I<8>, I<8>>,
    std::tuple<ck::half_t, float, ck::half_t, I<3>, I<1>, I<256>, I<1>, I<256>, I<1>, I<8>, I<1>, I<8>, I<8>>,
+    std::tuple<ck::half_t, float, ck::half_t, I<3>, I<1>, I<256>, I<1>, I<256>, I<1>, I<16>, I<1>, I<8>, I<8>>,
+    std::tuple<ck::half_t, float, ck::half_t, I<3>, I<1>, I<256>, I<1>, I<256>, I<1>, I<32>, I<1>, I<8>, I<8>>,
    std::tuple<ck::half_t, float, ck::half_t, I<3>, I<2>, I<256>, I<8>, I<32>, I<1>, I<8>, I<1>, I<8>, I<8>>,
    std::tuple<ck::half_t, float, ck::half_t, I<3>, I<2>, I<256>, I<4>, I<64>, I<1>, I<8>, I<1>, I<8>, I<8>>,
    std::tuple<ck::half_t, float, ck::half_t, I<3>, I<2>, I<256>, I<2>, I<128>, I<1>, I<8>, I<1>, I<8>, I<8>>,
-    std::tuple<ck::half_t, float, ck::half_t, I<3>, I<2>, I<256>, I<1>, I<256>, I<1>, I<8>, I<1>, I<8>, I<8>>
+    std::tuple<ck::half_t, float, ck::half_t, I<3>, I<2>, I<256>, I<1>, I<256>, I<1>, I<8>, I<1>, I<8>, I<8>>,
+    std::tuple<ck::half_t, float, ck::half_t, I<3>, I<2>, I<256>, I<1>, I<256>, I<1>, I<16>, I<1>, I<8>, I<8>>,
+    std::tuple<ck::half_t, float, ck::half_t, I<3>, I<2>, I<256>, I<1>, I<256>, I<1>, I<32>, I<1>, I<8>, I<8>>
    >;
 // clang-format on
 TYPED_TEST_SUITE(TestSoftmaxFP16, KernelTypes);
--- a/test/softmax/test_softmax_fp32.cpp
+++ b/test/softmax/test_softmax_fp32.cpp
@@ -15,14 +15,19 @@ class TestSoftmaxFP32 : public ck::TestSoftmax<Tuple>
 // clang-format off
 using KernelTypes = ::testing::Types<
 // InDataType, AccDataType, OutDataType, Rank, NumReduceDim, BlockSize, MThreadClusterSize, KThreadClusterSize, MThreadSliceSize, KThreadSliceSize, InSrcVectorDim, InSrcVectorSize, OutDstVectorSize>
+    std::tuple<float, float, ck::half_t, I<3>, I<2>, I<256>, I<1>, I<256>, I<1>, I<8>, I<1>, I<4>, I<8>>, // mixed precision
    std::tuple<float, float, float, I<3>, I<1>, I<256>, I<8>, I<32>, I<1>, I<4>, I<1>, I<4>, I<4>>,
    std::tuple<float, float, float, I<3>, I<1>, I<256>, I<4>, I<64>, I<1>, I<4>, I<1>, I<4>, I<4>>,
    std::tuple<float, float, float, I<3>, I<1>, I<256>, I<2>, I<128>, I<1>, I<4>, I<1>, I<4>, I<4>>,
    std::tuple<float, float, float, I<3>, I<1>, I<256>, I<1>, I<256>, I<1>, I<4>, I<1>, I<4>, I<4>>,
+    std::tuple<float, float, float, I<3>, I<1>, I<256>, I<1>, I<256>, I<1>, I<8>, I<1>, I<4>, I<4>>,
+    std::tuple<float, float, float, I<3>, I<1>, I<256>, I<1>, I<256>, I<1>, I<16>, I<1>, I<4>, I<4>>,
    std::tuple<float, float, float, I<3>, I<2>, I<256>, I<8>, I<32>, I<1>, I<4>, I<1>, I<4>, I<4>>,
    std::tuple<float, float, float, I<3>, I<2>, I<256>, I<4>, I<64>, I<1>, I<4>, I<1>, I<4>, I<4>>,
    std::tuple<float, float, float, I<3>, I<2>, I<256>, I<2>, I<128>, I<1>, I<4>, I<1>, I<4>, I<4>>,
-    std::tuple<float, float, float, I<3>, I<2>, I<256>, I<1>, I<256>, I<1>, I<4>, I<1>, I<4>, I<4>>
+    std::tuple<float, float, float, I<3>, I<2>, I<256>, I<1>, I<256>, I<1>, I<4>, I<1>, I<4>, I<4>>,
+    std::tuple<float, float, float, I<3>, I<2>, I<256>, I<1>, I<256>, I<1>, I<8>, I<1>, I<4>, I<4>>,
+    std::tuple<float, float, float, I<3>, I<2>, I<256>, I<1>, I<256>, I<1>, I<16>, I<1>, I<4>, I<4>>
    >;
 // clang-format on
 TYPED_TEST_SUITE(TestSoftmaxFP32, KernelTypes);
--- a/test/softmax/test_softmax_int8.cpp
+++ b/test/softmax/test_softmax_int8.cpp
@@ -0,0 +1,30 @@
+#include "gtest/gtest.h"
+#include "test_softmax_util.hpp"
+
+template <ck::index_t N>
+using I = ck::Number<N>;
+
+template <typename Tuple>
+class TestSoftmaxINT8 : public ck::TestSoftmax<Tuple>
+{
+};
+
+// clang-format off
+using KernelTypes = ::testing::Types<
+// InDataType, AccDataType, OutDataType, Rank, NumReduceDim, BlockSize, MThreadClusterSize, KThreadClusterSize, MThreadSliceSize, KThreadSliceSize, InSrcVectorDim, InSrcVectorSize, OutDstVectorSize>
+    std::tuple<int8_t, float, int8_t, I<3>, I<1>, I<256>, I<8>, I<32>, I<1>, I<16>, I<1>, I<16>, I<16>>,
+    std::tuple<int8_t, float, int8_t, I<3>, I<1>, I<256>, I<4>, I<64>, I<1>, I<16>, I<1>, I<16>, I<16>>,
+    std::tuple<int8_t, float, int8_t, I<3>, I<1>, I<256>, I<2>, I<128>, I<1>, I<16>, I<1>, I<16>, I<16>>,
+    std::tuple<int8_t, float, int8_t, I<3>, I<1>, I<256>, I<1>, I<256>, I<1>, I<16>, I<1>, I<16>, I<16>>,
+    std::tuple<int8_t, float, int8_t, I<3>, I<1>, I<256>, I<1>, I<256>, I<1>, I<32>, I<1>, I<16>, I<16>>,
+    std::tuple<int8_t, float, int8_t, I<3>, I<1>, I<256>, I<1>, I<256>, I<1>, I<64>, I<1>, I<16>, I<16>>,
+    std::tuple<int8_t, float, int8_t, I<3>, I<2>, I<256>, I<8>, I<32>, I<1>, I<16>, I<1>, I<16>, I<16>>,
+    std::tuple<int8_t, float, int8_t, I<3>, I<2>, I<256>, I<4>, I<64>, I<1>, I<16>, I<1>, I<16>, I<16>>,
+    std::tuple<int8_t, float, int8_t, I<3>, I<2>, I<256>, I<2>, I<128>, I<1>, I<16>, I<1>, I<16>, I<16>>,
+    std::tuple<int8_t, float, int8_t, I<3>, I<2>, I<256>, I<1>, I<256>, I<1>, I<16>, I<1>, I<16>, I<16>>,
+    std::tuple<int8_t, float, int8_t, I<3>, I<2>, I<256>, I<1>, I<256>, I<1>, I<32>, I<1>, I<16>, I<16>>,
+    std::tuple<int8_t, float, int8_t, I<3>, I<2>, I<256>, I<1>, I<256>, I<1>, I<64>, I<1>, I<16>, I<16>>
+    >;
+// clang-format on
+TYPED_TEST_SUITE(TestSoftmaxINT8, KernelTypes);
+TYPED_TEST(TestSoftmaxINT8, Test_INT8) { this->Run(); }
--- a/test/softmax/test_softmax_util.hpp
+++ b/test/softmax/test_softmax_util.hpp
@@ -1,6 +1,8 @@
 // SPDX-License-Identifier: MIT
 // Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.

+#pragma once
+
 #include <vector>
 #include <iostream>
 #include <gtest/gtest.h>
@@ -16,6 +18,18 @@

 namespace ck {

+template <typename Range>
+std::string serialize_range(const Range& range)
+{
+    std::stringstream ss;
+    for(auto& r : range)
+    {
+        ss << r << ", ";
+    }
+    std::string str = ss.str();
+    return std::string(str.begin(), str.end() - 2);
+}
+
 template <typename Tuple>
 class TestSoftmax : public ::testing::Test
 {
@@ -80,23 +94,43 @@ class TestSoftmax : public ::testing::Test
        auto argument_ptr    = device_instance.MakeArgumentPointer(i_in_lengths,
                                                                i_in_strides,
                                                                reduce_dims,
-                                                                alpha,
-                                                                beta,
+                                                                &alpha,
+                                                                &beta,
                                                                in_dev.GetDeviceBuffer(),
                                                                out_dev.GetDeviceBuffer());

        if(!device_instance.IsSupportedArgument(argument_ptr.get()))
        {
-            FAIL() << "Unsupported argument";
+            // std::cout << "Skipped due to unsupported argument: "
+            //           << "input lengths = [" << serialize_range(in_length) << "], "
+            //           << "scaler = [" << alpha << ", " << beta << "]." << std::endl;
+            return;
        }

        auto invoker_ptr = device_instance.MakeInvokerPointer();
        invoker_ptr->Run(argument_ptr.get());

-        ref_instance_invoker_.Run({in, out_ref, alpha, beta, Rank, reduce_dims});
+        ref_instance_invoker_.Run({in, out_ref, alpha, beta, reduce_dims});

        out_dev.FromDevice(out.mData.data());
-        EXPECT_TRUE(ck::utils::check_err(out.mData, out_ref.mData));
+
+        bool pass;
+
+        if(std::is_same<InDataType, int8_t>::value)
+        {
+            EXPECT_TRUE(pass = ck::utils::check_err(
+                            out.mData, out_ref.mData, "Error: Incorrect results!", 0, 1));
+        }
+        else
+        {
+            EXPECT_TRUE(pass = ck::utils::check_err(out.mData, out_ref.mData));
+        }
+
+        if(!pass)
+        {
+            FAIL() << "Failure in input lengths = [" << serialize_range(in_length) << "], "
+                   << "scaler = [" << alpha << ", " << beta << "].";
+        }
    }

    void Run()
@@ -105,13 +139,14 @@ class TestSoftmax : public ::testing::Test
        {
            for(auto scale : this->scales_)
            {
-                this->RunSingle(in_length, std::get<0>(scale), std::get<1>(scale));
+                this->RunSingle(in_length, scale[0], scale[1]);
            }
        }
    }

-    std::vector<std::vector<index_t>> in_lengths_ = {{1, 8, 128}, {2, 128, 1024}, {3, 9, 1032}};
-    std::vector<std::tuple<AccDataType, AccDataType>> scales_ = {{1, 0}, {2, 2}, {0, 1}};
+    std::vector<std::vector<index_t>> in_lengths_ = {
+        {1, 8, 128}, {2, 128, 1024}, {3, 9, 1032}, {4, 4, 2048}, {8, 1, 8192}};
+    std::vector<std::vector<AccDataType>> scales_ = {{1, 0}, {1, 1}, {0, 1}, {2, 2}};

    typename ReferenceInstance::Invoker ref_instance_invoker_;
 };