Standalone sweep once softmax kernel w/ ckProfiler (#295)

* use 'sweep once' softmax kernel where applicable * threadwise copy's dst buffer can specify invalid element value * add int8 in/out float compute softmax support give a bit of leeway for int absolute tolerance as there's a single data point of all test cases showing off-by-1 error * format * softmax inherits DeviceNormalization * softmax profiler stub * tighten up reference softmax interface * example prints tensor dimension * add fp32 to softmax profiler * rename header * hook with ckProfiler * format * resolve merge conflict * resolve merge conflicts * update normalization profiler help string * resolve conflict * typo * remove residual * softmax profiler: address feedback * test for mixed precision input/output * fully qualify ck::math::isnan * add comment for device normalization interface * revise wording * constness for alpha/beta scaler pointer [ROCm/composable_kernel commit: 93c99f3d87]
2026-05-18 12:00:07 +00:00 · 2022-07-01 01:08:50 +08:00
parent ca34ce4450
commit d41b1a7c2c
24 changed files with 809 additions and 106 deletions
--- a/test/softmax/test_softmax_fp32.cpp
+++ b/test/softmax/test_softmax_fp32.cpp
@@ -15,14 +15,19 @@ class TestSoftmaxFP32 : public ck::TestSoftmax<Tuple>
 // clang-format off
 using KernelTypes = ::testing::Types<
 // InDataType, AccDataType, OutDataType, Rank, NumReduceDim, BlockSize, MThreadClusterSize, KThreadClusterSize, MThreadSliceSize, KThreadSliceSize, InSrcVectorDim, InSrcVectorSize, OutDstVectorSize>
+    std::tuple<float, float, ck::half_t, I<3>, I<2>, I<256>, I<1>, I<256>, I<1>, I<8>, I<1>, I<4>, I<8>>, // mixed precision
    std::tuple<float, float, float, I<3>, I<1>, I<256>, I<8>, I<32>, I<1>, I<4>, I<1>, I<4>, I<4>>,
    std::tuple<float, float, float, I<3>, I<1>, I<256>, I<4>, I<64>, I<1>, I<4>, I<1>, I<4>, I<4>>,
    std::tuple<float, float, float, I<3>, I<1>, I<256>, I<2>, I<128>, I<1>, I<4>, I<1>, I<4>, I<4>>,
    std::tuple<float, float, float, I<3>, I<1>, I<256>, I<1>, I<256>, I<1>, I<4>, I<1>, I<4>, I<4>>,
+    std::tuple<float, float, float, I<3>, I<1>, I<256>, I<1>, I<256>, I<1>, I<8>, I<1>, I<4>, I<4>>,
+    std::tuple<float, float, float, I<3>, I<1>, I<256>, I<1>, I<256>, I<1>, I<16>, I<1>, I<4>, I<4>>,
    std::tuple<float, float, float, I<3>, I<2>, I<256>, I<8>, I<32>, I<1>, I<4>, I<1>, I<4>, I<4>>,
    std::tuple<float, float, float, I<3>, I<2>, I<256>, I<4>, I<64>, I<1>, I<4>, I<1>, I<4>, I<4>>,
    std::tuple<float, float, float, I<3>, I<2>, I<256>, I<2>, I<128>, I<1>, I<4>, I<1>, I<4>, I<4>>,
-    std::tuple<float, float, float, I<3>, I<2>, I<256>, I<1>, I<256>, I<1>, I<4>, I<1>, I<4>, I<4>>
+    std::tuple<float, float, float, I<3>, I<2>, I<256>, I<1>, I<256>, I<1>, I<4>, I<1>, I<4>, I<4>>,
+    std::tuple<float, float, float, I<3>, I<2>, I<256>, I<1>, I<256>, I<1>, I<8>, I<1>, I<4>, I<4>>,
+    std::tuple<float, float, float, I<3>, I<2>, I<256>, I<1>, I<256>, I<1>, I<16>, I<1>, I<4>, I<4>>
    >;
 // clang-format on
 TYPED_TEST_SUITE(TestSoftmaxFP32, KernelTypes);