Layernorm welford (#346)

* Add threadwise and blockwise welford * Rename gridwise op, prepare to add welford version * Implement welford and integrate welford into layernorm * Take care of tail loop * Fix bug when ThreadSliceK > 1 * Fix bug when merging two empty sets * Rename clip to clamp * 1. Fix type of count 2. Remove useless static_assert * Do not inherit Reduction::Argument * [What] replace __syncthreads() with block_sync_lds() [Why] __syncthreads might wait on both lgkmcnt(0) and vmcnt(0) * Add y stride * Rename: DeviceLayernorm -> DeviceLayernormImpl, DeviceNormalization2 -> DeviceLayernorm * Move literals ""_uz & ""_zu into namespace 'literals' * Move namespace 'literals' to 'ck::literals' Co-authored-by: Po-Yen, Chen <PoYen.Chen@amd.com> Co-authored-by: Chao Liu <chao.liu2@amd.com> [ROCm/composable_kernel commit: 0bd6b842b9]
2026-05-20 21:09:08 +00:00 · 2022-08-13 22:43:18 +08:00
parent 37f61cbba5
commit 8ff56b19d5
13 changed files with 1097 additions and 476 deletions
--- a/library/src/tensor_operation_instance/gpu/normalization/device_layernorm_f16_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/normalization/device_layernorm_f16_instance.cpp
@@ -2,7 +2,7 @@
 // Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.

 #include "ck/ck.hpp"
-#include "ck/tensor_operation/gpu/device/device_layernorm.hpp"
+#include "ck/tensor_operation/gpu/device/device_layernorm_impl.hpp"
 #include "ck/utility/data_type.hpp"

 #include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
@@ -21,28 +21,28 @@ template <index_t Rank, index_t Reduce>
 using device_layernorm_f16_instances = std::tuple<
    // clang-format off
        // XDataType, GammaDataType, BetaDataType, AccDataType, YDataType, Rank, NumReduceDim, BlockSize, MThreadClusterSize, KThreadClusterSize, MThreadSliceSize, KThreadSliceSize, XYSrcVectorDim, XSrcVectorSize, GammaSrcVectorSize, BetaSrcVectorSize, YDstVectorSize>
-        DeviceLayernorm<F16, F16, F16, F32, F16, Pass, Rank, Reduce, 256, 8, 32, 1, 8, 1, 1, 1, 1, 1>, // fallback kernel
-        DeviceLayernorm<F16, F16, F16, F32, F16, Pass, Rank, Reduce, 256, 8, 32, 1, 8, 1, 2, 2, 2, 2>, // fallback kernel
-        DeviceLayernorm<F16, F16, F16, F32, F16, Pass, Rank, Reduce, 256, 8, 32, 1, 8, 1, 4, 4, 4, 4>, // fallback kernel
-        DeviceLayernorm<F16, F16, F16, F32, F16, Pass, Rank, Reduce, 256, 8, 32, 1, 8, 1, 8, 8, 8, 8>,
-        DeviceLayernorm<F16, F16, F16, F32, F16, Pass, Rank, Reduce, 256, 4, 64, 1, 8, 1, 8, 8, 8, 8>,
-        DeviceLayernorm<F16, F16, F16, F32, F16, Pass, Rank, Reduce, 256, 2, 128, 1, 8, 1, 8, 8, 8, 8>,
-        DeviceLayernorm<F16, F16, F16, F32, F16, Pass, Rank, Reduce, 256, 2, 128, 1, 16, 1, 8, 8, 8, 8>,
-        DeviceLayernorm<F16, F16, F16, F32, F16, Pass, Rank, Reduce, 256, 2, 128, 1, 32, 1, 8, 8, 8, 8>,
-        DeviceLayernorm<F16, F16, F16, F32, F16, Pass, Rank, Reduce, 256, 1, 256, 1, 8, 1, 8, 8, 8, 8>,
-        DeviceLayernorm<F16, F16, F16, F32, F16, Pass, Rank, Reduce, 256, 1, 256, 1, 16, 1, 8, 8, 8, 8>,
-        DeviceLayernorm<F16, F16, F16, F32, F16, Pass, Rank, Reduce, 256, 1, 256, 1, 32, 1, 8, 8, 8, 8>
+        DeviceLayernormImpl<F16, F16, F16, F32, F16, Pass, Rank, Reduce, 256, 8, 32, 1, 8, 1, 1, 1, 1, 1>, // fallback kernel
+        DeviceLayernormImpl<F16, F16, F16, F32, F16, Pass, Rank, Reduce, 256, 8, 32, 1, 8, 1, 2, 2, 2, 2>, // fallback kernel
+        DeviceLayernormImpl<F16, F16, F16, F32, F16, Pass, Rank, Reduce, 256, 8, 32, 1, 8, 1, 4, 4, 4, 4>, // fallback kernel
+        DeviceLayernormImpl<F16, F16, F16, F32, F16, Pass, Rank, Reduce, 256, 8, 32, 1, 8, 1, 8, 8, 8, 8>,
+        DeviceLayernormImpl<F16, F16, F16, F32, F16, Pass, Rank, Reduce, 256, 4, 64, 1, 8, 1, 8, 8, 8, 8>,
+        DeviceLayernormImpl<F16, F16, F16, F32, F16, Pass, Rank, Reduce, 256, 2, 128, 1, 8, 1, 8, 8, 8, 8>,
+        DeviceLayernormImpl<F16, F16, F16, F32, F16, Pass, Rank, Reduce, 256, 2, 128, 1, 16, 1, 8, 8, 8, 8>,
+        DeviceLayernormImpl<F16, F16, F16, F32, F16, Pass, Rank, Reduce, 256, 2, 128, 1, 32, 1, 8, 8, 8, 8>,
+        DeviceLayernormImpl<F16, F16, F16, F32, F16, Pass, Rank, Reduce, 256, 1, 256, 1, 8, 1, 8, 8, 8, 8>,
+        DeviceLayernormImpl<F16, F16, F16, F32, F16, Pass, Rank, Reduce, 256, 1, 256, 1, 16, 1, 8, 8, 8, 8>,
+        DeviceLayernormImpl<F16, F16, F16, F32, F16, Pass, Rank, Reduce, 256, 1, 256, 1, 32, 1, 8, 8, 8, 8>
    // clang-format on
    >;

 void add_device_layernorm_f16_rank2_instances(
-    std::vector<DeviceNormalization2Ptr<F16, F16, F16, F32, F16, Pass, 2, 1>>& instances)
+    std::vector<DeviceLayernormPtr<F16, F16, F16, F32, F16, Pass, 2, 1>>& instances)
 {
    add_device_operation_instances(instances, device_layernorm_f16_instances<2, 1>{});
 }

 void add_device_layernorm_f16_rank4_instances(
-    std::vector<DeviceNormalization2Ptr<F16, F16, F16, F32, F16, Pass, 4, 3>>& instances)
+    std::vector<DeviceLayernormPtr<F16, F16, F16, F32, F16, Pass, 4, 3>>& instances)
 {
    add_device_operation_instances(instances, device_layernorm_f16_instances<4, 3>{});
 }
--- a/library/src/tensor_operation_instance/gpu/normalization/device_layernorm_f32_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/normalization/device_layernorm_f32_instance.cpp
@@ -2,7 +2,7 @@
 // Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.

 #include "ck/ck.hpp"
-#include "ck/tensor_operation/gpu/device/device_layernorm.hpp"
+#include "ck/tensor_operation/gpu/device/device_layernorm_impl.hpp"
 #include "ck/utility/data_type.hpp"

 #include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
@@ -20,27 +20,27 @@ template <index_t Rank, index_t Reduce>
 using device_layernorm_f32_instances = std::tuple<
    // clang-format off
        // XDataType, GammaDataType, BetaDataType, AccDataType, YDataType, Rank, NumReduceDim, BlockSize, MThreadClusterSize, KThreadClusterSize, MThreadSliceSize, KThreadSliceSize, XYSrcVectorDim, XSrcVectorSize, GammaSrcVectorSize, BetaSrcVectorSize, YDstVectorSize>
-        DeviceLayernorm<F32, F32, F32, F32, F32, Pass, Rank, Reduce, 256, 8, 32, 1, 8, 1, 1, 1, 1, 1>, // fallback kernel
-        DeviceLayernorm<F32, F32, F32, F32, F32, Pass, Rank, Reduce, 256, 8, 32, 1, 8, 1, 2, 2, 2, 2>, // fallback kernel
-        DeviceLayernorm<F32, F32, F32, F32, F32, Pass, Rank, Reduce, 256, 8, 32, 1, 8, 1, 4, 4, 4, 4>,
-        DeviceLayernorm<F32, F32, F32, F32, F32, Pass, Rank, Reduce, 256, 4, 64, 1, 8, 1, 4, 4, 4, 4>,
-        DeviceLayernorm<F32, F32, F32, F32, F32, Pass, Rank, Reduce, 256, 2, 128, 1, 8, 1, 4, 4, 4, 4>,
-        DeviceLayernorm<F32, F32, F32, F32, F32, Pass, Rank, Reduce, 256, 2, 128, 1, 16, 1, 4, 4, 4, 4>,
-        DeviceLayernorm<F32, F32, F32, F32, F32, Pass, Rank, Reduce, 256, 2, 128, 1, 32, 1, 4, 4, 4, 4>,
-        DeviceLayernorm<F32, F32, F32, F32, F32, Pass, Rank, Reduce, 256, 1, 256, 1, 8, 1, 4, 4, 4, 4>,
-        DeviceLayernorm<F32, F32, F32, F32, F32, Pass, Rank, Reduce, 256, 1, 256, 1, 16, 1, 4, 4, 4, 4>,
-        DeviceLayernorm<F32, F32, F32, F32, F32, Pass, Rank, Reduce, 256, 1, 256, 1, 32, 1, 4, 4, 4, 4>
+        DeviceLayernormImpl<F32, F32, F32, F32, F32, Pass, Rank, Reduce, 256, 8, 32, 1, 8, 1, 1, 1, 1, 1>, // fallback kernel
+        DeviceLayernormImpl<F32, F32, F32, F32, F32, Pass, Rank, Reduce, 256, 8, 32, 1, 8, 1, 2, 2, 2, 2>, // fallback kernel
+        DeviceLayernormImpl<F32, F32, F32, F32, F32, Pass, Rank, Reduce, 256, 8, 32, 1, 8, 1, 4, 4, 4, 4>,
+        DeviceLayernormImpl<F32, F32, F32, F32, F32, Pass, Rank, Reduce, 256, 4, 64, 1, 8, 1, 4, 4, 4, 4>,
+        DeviceLayernormImpl<F32, F32, F32, F32, F32, Pass, Rank, Reduce, 256, 2, 128, 1, 8, 1, 4, 4, 4, 4>,
+        DeviceLayernormImpl<F32, F32, F32, F32, F32, Pass, Rank, Reduce, 256, 2, 128, 1, 16, 1, 4, 4, 4, 4>,
+        DeviceLayernormImpl<F32, F32, F32, F32, F32, Pass, Rank, Reduce, 256, 2, 128, 1, 32, 1, 4, 4, 4, 4>,
+        DeviceLayernormImpl<F32, F32, F32, F32, F32, Pass, Rank, Reduce, 256, 1, 256, 1, 8, 1, 4, 4, 4, 4>,
+        DeviceLayernormImpl<F32, F32, F32, F32, F32, Pass, Rank, Reduce, 256, 1, 256, 1, 16, 1, 4, 4, 4, 4>,
+        DeviceLayernormImpl<F32, F32, F32, F32, F32, Pass, Rank, Reduce, 256, 1, 256, 1, 32, 1, 4, 4, 4, 4>
    // clang-format on
    >;

 void add_device_layernorm_f32_rank2_instances(
-    std::vector<DeviceNormalization2Ptr<F32, F32, F32, F32, F32, Pass, 2, 1>>& instances)
+    std::vector<DeviceLayernormPtr<F32, F32, F32, F32, F32, Pass, 2, 1>>& instances)
 {
    add_device_operation_instances(instances, device_layernorm_f32_instances<2, 1>{});
 }

 void add_device_layernorm_f32_rank4_instances(
-    std::vector<DeviceNormalization2Ptr<F32, F32, F32, F32, F32, Pass, 4, 3>>& instances)
+    std::vector<DeviceLayernormPtr<F32, F32, F32, F32, F32, Pass, 4, 3>>& instances)
 {
    add_device_operation_instances(instances, device_layernorm_f32_instances<4, 3>{});
 }