Standalone sweep once softmax kernel w/ ckProfiler (#295)

* use 'sweep once' softmax kernel where applicable * threadwise copy's dst buffer can specify invalid element value * add int8 in/out float compute softmax support give a bit of leeway for int absolute tolerance as there's a single data point of all test cases showing off-by-1 error * format * softmax inherits DeviceNormalization * softmax profiler stub * tighten up reference softmax interface * example prints tensor dimension * add fp32 to softmax profiler * rename header * hook with ckProfiler * format * resolve merge conflict * resolve merge conflicts * update normalization profiler help string * resolve conflict * typo * remove residual * softmax profiler: address feedback * test for mixed precision input/output * fully qualify ck::math::isnan * add comment for device normalization interface * revise wording * constness for alpha/beta scaler pointer [ROCm/composable_kernel commit: 93c99f3d87]
2026-05-18 20:09:25 +00:00 · 2022-07-01 01:08:50 +08:00
parent ca34ce4450
commit d41b1a7c2c
24 changed files with 809 additions and 106 deletions
--- a/library/src/tensor_operation_instance/gpu/CMakeLists.txt
+++ b/library/src/tensor_operation_instance/gpu/CMakeLists.txt
@@ -25,6 +25,7 @@ add_subdirectory(conv2d_fwd_bias_relu_add)
 add_subdirectory(conv2d_bwd_data)
 add_subdirectory(convnd_bwd_data)
 add_subdirectory(conv2d_bwd_weight)
+add_subdirectory(normalization)
 add_subdirectory(reduce)

 add_library(device_operations STATIC
--- a/library/src/tensor_operation_instance/gpu/normalization/CMakeLists.txt
+++ b/library/src/tensor_operation_instance/gpu/normalization/CMakeLists.txt
@@ -0,0 +1,10 @@
+# device_normalization_instance
+set(DEVICE_NORMALIZATION_INSTANCE_SOURCE
+    device_softmax_f32_f32_instance.cpp
+    device_softmax_f16_f16_instance.cpp
+)
+
+add_library(device_normalization_instance OBJECT ${DEVICE_NORMALIZATION_INSTANCE_SOURCE})
+set_target_properties(device_normalization_instance PROPERTIES POSITION_INDEPENDENT_CODE ON)
+
+clang_tidy_check(device_normalization_instance)
--- a/library/src/tensor_operation_instance/gpu/normalization/device_softmax_f16_f16_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/normalization/device_softmax_f16_f16_instance.cpp
@@ -0,0 +1,49 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "ck/ck.hpp"
+#include "ck/library/tensor_operation_instance/device_operation_instance.hpp"
+#include "ck/tensor_operation/gpu/device/device_softmax.hpp"
+#include "ck/utility/data_type.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace device_normalization_instance {
+
+using F16 = ck::half_t;
+using F32 = float;
+
+template <index_t Rank, index_t Reduce>
+using device_softmax_f16_f16_instances = std::tuple<
+    // clang-format off
+        // InDataType, AccDataType, OutDataType, Rank, NumReduceDim, BlockSize, MThreadClusterSize, KThreadClusterSize, MThreadSliceSize, KThreadSliceSize, InSrcVectorDim, InSrcVectorSize, OutDstVectorSize>
+        DeviceSoftmax<F16, F32, F16, Rank, Reduce, 256, 8, 32, 1, 8, 1, 1, 1>, // fallback kernel
+        DeviceSoftmax<F16, F32, F16, Rank, Reduce, 256, 8, 32, 1, 8, 1, 8, 8>,
+        DeviceSoftmax<F16, F32, F16, Rank, Reduce, 256, 4, 64, 1, 8, 1, 8, 8>,
+        DeviceSoftmax<F16, F32, F16, Rank, Reduce, 256, 2, 128, 1, 8, 1, 8, 8>,
+        DeviceSoftmax<F16, F32, F16, Rank, Reduce, 256, 2, 128, 1, 16, 1, 8, 8>,
+        DeviceSoftmax<F16, F32, F16, Rank, Reduce, 256, 2, 128, 1, 32, 1, 8, 8>,
+        DeviceSoftmax<F16, F32, F16, Rank, Reduce, 256, 1, 256, 1, 8, 1, 8, 8>,
+        DeviceSoftmax<F16, F32, F16, Rank, Reduce, 256, 1, 256, 1, 16, 1, 8, 8>,
+        DeviceSoftmax<F16, F32, F16, Rank, Reduce, 256, 1, 256, 1, 32, 1, 8, 8>
+    // clang-format on
+    >;
+
+void add_device_softmax_f16_f16_rank3_instances(std::vector<DeviceNormalizationPtr>& instances)
+{
+    add_device_operation_instances(instances, device_softmax_f16_f16_instances<3, 1>{});
+    add_device_operation_instances(instances, device_softmax_f16_f16_instances<3, 2>{});
+}
+
+void add_device_softmax_f16_f16_rank4_instances(std::vector<DeviceNormalizationPtr>& instances)
+{
+    add_device_operation_instances(instances, device_softmax_f16_f16_instances<4, 1>{});
+    add_device_operation_instances(instances, device_softmax_f16_f16_instances<4, 2>{});
+    add_device_operation_instances(instances, device_softmax_f16_f16_instances<4, 3>{});
+}
+
+} // namespace device_normalization_instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
--- a/library/src/tensor_operation_instance/gpu/normalization/device_softmax_f32_f32_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/normalization/device_softmax_f32_f32_instance.cpp
@@ -0,0 +1,48 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "ck/ck.hpp"
+#include "ck/library/tensor_operation_instance/device_operation_instance.hpp"
+#include "ck/tensor_operation/gpu/device/device_softmax.hpp"
+#include "ck/utility/data_type.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace device_normalization_instance {
+
+using F32 = float;
+
+template <index_t Rank, index_t Reduce>
+using device_softmax_f32_f32_instances = std::tuple<
+    // clang-format off
+        // InDataType, AccDataType, OutDataType, Rank, NumReduceDim, BlockSize, MThreadClusterSize, KThreadClusterSize, MThreadSliceSize, KThreadSliceSize, InSrcVectorDim, InSrcVectorSize, OutDstVectorSize>
+        DeviceSoftmax<F32, F32, F32, Rank, Reduce, 256, 8, 32, 1, 8, 1, 1, 1>, // fallback kernel
+        DeviceSoftmax<F32, F32, F32, Rank, Reduce, 256, 8, 32, 1, 8, 1, 4, 4>,
+        DeviceSoftmax<F32, F32, F32, Rank, Reduce, 256, 4, 64, 1, 8, 1, 4, 4>,
+        DeviceSoftmax<F32, F32, F32, Rank, Reduce, 256, 2, 128, 1, 8, 1, 4, 4>,
+        DeviceSoftmax<F32, F32, F32, Rank, Reduce, 256, 2, 128, 1, 16, 1, 4, 4>,
+        DeviceSoftmax<F32, F32, F32, Rank, Reduce, 256, 2, 128, 1, 32, 1, 4, 4>,
+        DeviceSoftmax<F32, F32, F32, Rank, Reduce, 256, 1, 256, 1, 8, 1, 4, 4>,
+        DeviceSoftmax<F32, F32, F32, Rank, Reduce, 256, 1, 256, 1, 16, 1, 4, 4>,
+        DeviceSoftmax<F32, F32, F32, Rank, Reduce, 256, 1, 256, 1, 32, 1, 4, 4>
+    // clang-format on
+    >;
+
+void add_device_softmax_f32_f32_rank3_instances(std::vector<DeviceNormalizationPtr>& instances)
+{
+    add_device_operation_instances(instances, device_softmax_f32_f32_instances<3, 1>{});
+    add_device_operation_instances(instances, device_softmax_f32_f32_instances<3, 2>{});
+}
+
+void add_device_softmax_f32_f32_rank4_instances(std::vector<DeviceNormalizationPtr>& instances)
+{
+    add_device_operation_instances(instances, device_softmax_f32_f32_instances<4, 1>{});
+    add_device_operation_instances(instances, device_softmax_f32_f32_instances<4, 2>{});
+    add_device_operation_instances(instances, device_softmax_f32_f32_instances<4, 3>{});
+}
+
+} // namespace device_normalization_instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck