From 1dcaa3991fffcd07e6e717c44ccde4ccc6beba44 Mon Sep 17 00:00:00 2001 From: rocking5566 Date: Thu, 13 Oct 2022 10:06:39 +0800 Subject: [PATCH] Fix bug of layernorm ckProfiler and refine code (#448) * Fix bug of profiler for layernorm * 1. Rename layernorm into normalization 2. Decouple softmax from normalization * clang-format [ROCm/composable_kernel commit: 1b62bfaa2a42ed83da2692f6797a5f929c39946f] --- client_example/05_layernorm/layernorm2d.cpp | 18 +-- example/27_layernorm/layernorm_blockwise.cpp | 42 +++---- .../42_groupnorm/groupnorm_sigmoid_fp16.cpp | 42 +++---- .../gpu/device/device_normalization.hpp | 45 ++------ ...impl.hpp => device_normalization_impl.hpp} | 18 +-- .../gpu/layernorm.hpp | 109 ------------------ .../gpu/normalization.hpp | 109 ++++++++++++++++++ .../gpu/CMakeLists.txt | 1 - .../gpu/normalization/CMakeLists.txt | 6 +- .../device_layernorm_f16_instance.cpp | 61 ---------- .../device_layernorm_f32_instance.cpp | 57 --------- .../device_normalization_f16_instance.cpp | 65 +++++++++++ .../device_normalization_f32_instance.cpp | 60 ++++++++++ .../gpu/softmax/CMakeLists.txt | 4 + .../device_softmax_f16_f16_instance.cpp | 0 .../device_softmax_f32_f32_instance.cpp | 0 profiler/CMakeLists.txt | 3 +- profiler/include/profile_groupnorm_impl.hpp | 18 +-- profiler/include/profile_layernorm_impl.hpp | 42 +++---- ...tion_impl.hpp => profile_softmax_impl.hpp} | 20 ++-- profiler/src/profile_layernorm.cpp | 31 +---- ..._normalization.cpp => profile_softmax.cpp} | 84 +++++++------- test/CMakeLists.txt | 7 +- .../CMakeLists.txt | 0 .../test_groupnorm_fp16.cpp | 0 .../test_groupnorm_fp32.cpp | 0 .../test_layernorm2d_fp16.cpp | 0 .../test_layernorm2d_fp32.cpp | 0 .../test_layernorm2d_util.hpp | 42 +++---- 29 files changed, 423 insertions(+), 461 deletions(-) rename include/ck/tensor_operation/gpu/device/{device_layernorm_impl.hpp => device_normalization_impl.hpp} (96%) delete mode 100644 library/include/ck/library/tensor_operation_instance/gpu/layernorm.hpp create mode 100644 library/include/ck/library/tensor_operation_instance/gpu/normalization.hpp delete mode 100644 library/src/tensor_operation_instance/gpu/normalization/device_layernorm_f16_instance.cpp delete mode 100644 library/src/tensor_operation_instance/gpu/normalization/device_layernorm_f32_instance.cpp create mode 100644 library/src/tensor_operation_instance/gpu/normalization/device_normalization_f16_instance.cpp create mode 100644 library/src/tensor_operation_instance/gpu/normalization/device_normalization_f32_instance.cpp create mode 100644 library/src/tensor_operation_instance/gpu/softmax/CMakeLists.txt rename library/src/tensor_operation_instance/gpu/{normalization => softmax}/device_softmax_f16_f16_instance.cpp (100%) rename library/src/tensor_operation_instance/gpu/{normalization => softmax}/device_softmax_f32_f32_instance.cpp (100%) rename profiler/include/{profile_normalization_impl.hpp => profile_softmax_impl.hpp} (94%) rename profiler/src/{profile_normalization.cpp => profile_softmax.cpp} (67%) rename test/{layernorm => normalization}/CMakeLists.txt (100%) rename test/{layernorm => normalization}/test_groupnorm_fp16.cpp (100%) rename test/{layernorm => normalization}/test_groupnorm_fp32.cpp (100%) rename test/{layernorm => normalization}/test_layernorm2d_fp16.cpp (100%) rename test/{layernorm => normalization}/test_layernorm2d_fp32.cpp (100%) rename test/{layernorm => normalization}/test_layernorm2d_util.hpp (91%) diff --git a/client_example/05_layernorm/layernorm2d.cpp b/client_example/05_layernorm/layernorm2d.cpp index c58a21da03..bdc6c2bd31 100644 --- a/client_example/05_layernorm/layernorm2d.cpp +++ b/client_example/05_layernorm/layernorm2d.cpp @@ -10,7 +10,7 @@ #include "ck/tensor_operation/gpu/device/device_normalization.hpp" #include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" -#include "ck/library/tensor_operation_instance/gpu/layernorm.hpp" +#include "ck/library/tensor_operation_instance/gpu/normalization.hpp" using XDataType = ck::half_t; using GammaDataType = ck::half_t; @@ -51,14 +51,14 @@ int main(int argc, char* argv[]) SimpleDeviceMem beta_device_buf(sizeof(BetaDataType) * N); SimpleDeviceMem y_device_buf(sizeof(YDataType) * xy_size); - using DeviceOp = ck::tensor_operation::device::DeviceLayernorm; + using DeviceOp = ck::tensor_operation::device::DeviceNormalization; // get device op instances const auto op_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory< diff --git a/example/27_layernorm/layernorm_blockwise.cpp b/example/27_layernorm/layernorm_blockwise.cpp index 6e8679cbe1..e8a1af9c25 100644 --- a/example/27_layernorm/layernorm_blockwise.cpp +++ b/example/27_layernorm/layernorm_blockwise.cpp @@ -9,7 +9,7 @@ #include "ck/ck.hpp" #include "ck/utility/reduction_enums.hpp" -#include "ck/tensor_operation/gpu/device/device_layernorm_impl.hpp" +#include "ck/tensor_operation/gpu/device/device_normalization_impl.hpp" #include "ck/tensor_operation/gpu/device/reduction_operator_mapping.hpp" #include "ck/library/utility/check_err.hpp" @@ -30,26 +30,26 @@ constexpr int Rank = 2; constexpr int NumReduceDim = 1; using DeviceInstance = - ck::tensor_operation::device::DeviceLayernormImpl; // OutScalarPerVector + ck::tensor_operation::device::DeviceNormalizationImpl; // OutScalarPerVector int main() { diff --git a/example/42_groupnorm/groupnorm_sigmoid_fp16.cpp b/example/42_groupnorm/groupnorm_sigmoid_fp16.cpp index 0748131340..e0924ec3aa 100644 --- a/example/42_groupnorm/groupnorm_sigmoid_fp16.cpp +++ b/example/42_groupnorm/groupnorm_sigmoid_fp16.cpp @@ -9,7 +9,7 @@ #include "ck/ck.hpp" #include "ck/utility/reduction_enums.hpp" -#include "ck/tensor_operation/gpu/device/device_layernorm_impl.hpp" +#include "ck/tensor_operation/gpu/device/device_normalization_impl.hpp" #include "ck/tensor_operation/gpu/device/reduction_operator_mapping.hpp" #include "ck/library/utility/fill.hpp" @@ -47,26 +47,26 @@ struct YElementOp }; using DeviceInstance = - ck::tensor_operation::device::DeviceLayernormImpl; // OutScalarPerVector + ck::tensor_operation::device::DeviceNormalizationImpl; // OutScalarPerVector int main(int argc, char* argv[]) { diff --git a/include/ck/tensor_operation/gpu/device/device_normalization.hpp b/include/ck/tensor_operation/gpu/device/device_normalization.hpp index 7032b2858b..f1a3133c94 100644 --- a/include/ck/tensor_operation/gpu/device/device_normalization.hpp +++ b/include/ck/tensor_operation/gpu/device/device_normalization.hpp @@ -11,33 +11,6 @@ namespace ck { namespace tensor_operation { namespace device { - -struct DeviceNormalization : public BaseOperator -{ - // inLengths: input tensor extent(s) from high to low dimension - // inStrides: input tensor stride(s) from high to low dimension - // reduceDims: the dimension(s) the normalization operation is applied - // alpha: typeless pointer in host memory storing the alpha scaling value of type AccDataType - // beta: typeless pointer in host memory storing the beta scaling value of type AccDataType - // in_dev: typeless const pointer in device memory storing the input tensor - // out_dev: typeless pointer in device memory storing the output tensor - virtual std::unique_ptr MakeArgumentPointer(const std::vector inLengths, - const std::vector inStrides, - const std::vector reduceDims, - const void* alpha, - const void* beta, - const void* in_dev, - void* out_dev) = 0; - - virtual std::unique_ptr MakeInvokerPointer() = 0; - - virtual index_t GetRank() const = 0; - - virtual index_t GetNumReduceDim() const = 0; -}; - -using DeviceNormalizationPtr = std::unique_ptr; - template -struct DeviceLayernorm : public BaseOperator +struct DeviceNormalization : public BaseOperator { virtual std::unique_ptr MakeArgumentPointer(const std::vector lengths, @@ -73,14 +46,14 @@ template -using DeviceLayernormPtr = std::unique_ptr>; +using DeviceNormalizationPtr = std::unique_ptr>; } // namespace device } // namespace tensor_operation diff --git a/include/ck/tensor_operation/gpu/device/device_layernorm_impl.hpp b/include/ck/tensor_operation/gpu/device/device_normalization_impl.hpp similarity index 96% rename from include/ck/tensor_operation/gpu/device/device_layernorm_impl.hpp rename to include/ck/tensor_operation/gpu/device/device_normalization_impl.hpp index 4b89d3eacf..31d77149e1 100644 --- a/include/ck/tensor_operation/gpu/device/device_layernorm_impl.hpp +++ b/include/ck/tensor_operation/gpu/device/device_normalization_impl.hpp @@ -75,14 +75,14 @@ template -struct DeviceLayernormImpl : public DeviceLayernorm +struct DeviceNormalizationImpl : public DeviceNormalization { static_assert( ((GammaSrcVectorDim == 0 && MThreadSliceSize % GammaSrcVectorSize == 0) || @@ -452,7 +452,7 @@ struct DeviceLayernormImpl : public DeviceLayernorm - -#include "ck/ck.hpp" -#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" -#include "ck/tensor_operation/gpu/device/device_normalization.hpp" -#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" - -#include "ck/library/tensor_operation_instance/device_operation_instance_factory.hpp" - -namespace ck { -namespace tensor_operation { -namespace device { -namespace instance { - -// FP16 -void add_device_layernorm_rank_2_1_f16_instances( - std::vector>>&); - -void add_device_layernorm_rank_4_3_f16_instances( - std::vector>>&); - -void add_device_layernorm_rank_5_3_f16_instances( - std::vector>>&); - -// FP32 -void add_device_layernorm_rank_2_1_f32_instances( - std::vector>>&); - -void add_device_layernorm_rank_4_3_f32_instances( - std::vector>>&); - -void add_device_layernorm_rank_5_3_f32_instances( - std::vector>>&); - -template -struct DeviceOperationInstanceFactory< - ck::tensor_operation::device::DeviceLayernorm> -{ - using DeviceOp = DeviceLayernorm; - - static auto GetInstances() - { - std::vector> op_ptrs; - - if constexpr(is_same_v && is_same_v && - is_same_v && is_same_v) - { - if constexpr(Rank == 2 && NumReduceDim == 1) - { - add_device_layernorm_rank_2_1_f16_instances(op_ptrs); - } - else if constexpr(Rank == 4 && NumReduceDim == 3) - { - add_device_layernorm_rank_4_3_f16_instances(op_ptrs); - } - else if constexpr(Rank == 5 && NumReduceDim == 3) - { - add_device_layernorm_rank_5_3_f16_instances(op_ptrs); - } - } - else if constexpr(is_same_v && is_same_v && - is_same_v && is_same_v) - { - if constexpr(Rank == 2 && NumReduceDim == 1) - { - add_device_layernorm_rank_2_1_f32_instances(op_ptrs); - } - else if constexpr(Rank == 4 && NumReduceDim == 3) - { - add_device_layernorm_rank_4_3_f32_instances(op_ptrs); - } - else if constexpr(Rank == 5 && NumReduceDim == 3) - { - add_device_layernorm_rank_5_3_f32_instances(op_ptrs); - } - } - - return op_ptrs; - } -}; - -} // namespace instance -} // namespace device -} // namespace tensor_operation -} // namespace ck diff --git a/library/include/ck/library/tensor_operation_instance/gpu/normalization.hpp b/library/include/ck/library/tensor_operation_instance/gpu/normalization.hpp new file mode 100644 index 0000000000..55c67b7623 --- /dev/null +++ b/library/include/ck/library/tensor_operation_instance/gpu/normalization.hpp @@ -0,0 +1,109 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +#include + +#include "ck/ck.hpp" +#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" +#include "ck/tensor_operation/gpu/device/device_normalization.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +#include "ck/library/tensor_operation_instance/device_operation_instance_factory.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +// FP16 +void add_device_normalization_rank_2_1_f16_instances( + std::vector>>&); + +void add_device_normalization_rank_4_3_f16_instances( + std::vector>>&); + +void add_device_normalization_rank_5_3_f16_instances( + std::vector>>&); + +// FP32 +void add_device_normalization_rank_2_1_f32_instances( + std::vector>>&); + +void add_device_normalization_rank_4_3_f32_instances( + std::vector>>&); + +void add_device_normalization_rank_5_3_f32_instances( + std::vector>>&); + +template +struct DeviceOperationInstanceFactory> +{ + using DeviceOp = DeviceNormalization; + + static auto GetInstances() + { + std::vector> op_ptrs; + + if constexpr(is_same_v && is_same_v && + is_same_v && is_same_v) + { + if constexpr(Rank == 2 && NumReduceDim == 1) + { + add_device_normalization_rank_2_1_f16_instances(op_ptrs); + } + else if constexpr(Rank == 4 && NumReduceDim == 3) + { + add_device_normalization_rank_4_3_f16_instances(op_ptrs); + } + else if constexpr(Rank == 5 && NumReduceDim == 3) + { + add_device_normalization_rank_5_3_f16_instances(op_ptrs); + } + } + else if constexpr(is_same_v && is_same_v && + is_same_v && is_same_v) + { + if constexpr(Rank == 2 && NumReduceDim == 1) + { + add_device_normalization_rank_2_1_f32_instances(op_ptrs); + } + else if constexpr(Rank == 4 && NumReduceDim == 3) + { + add_device_normalization_rank_4_3_f32_instances(op_ptrs); + } + else if constexpr(Rank == 5 && NumReduceDim == 3) + { + add_device_normalization_rank_5_3_f32_instances(op_ptrs); + } + } + + return op_ptrs; + } +}; + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/CMakeLists.txt b/library/src/tensor_operation_instance/gpu/CMakeLists.txt index 230ff5362c..d660f28493 100644 --- a/library/src/tensor_operation_instance/gpu/CMakeLists.txt +++ b/library/src/tensor_operation_instance/gpu/CMakeLists.txt @@ -17,7 +17,6 @@ IF(IS_DIRECTORY "${subdir_path}") ENDIF() ENDFOREACH() - add_library(device_operations STATIC ${CK_DEVICE_INSTANCES}) add_library(composablekernels::device_operations ALIAS device_operations) diff --git a/library/src/tensor_operation_instance/gpu/normalization/CMakeLists.txt b/library/src/tensor_operation_instance/gpu/normalization/CMakeLists.txt index 17159fc9e4..aa0cc11480 100644 --- a/library/src/tensor_operation_instance/gpu/normalization/CMakeLists.txt +++ b/library/src/tensor_operation_instance/gpu/normalization/CMakeLists.txt @@ -1,6 +1,4 @@ add_instance_library(device_normalization_instance - device_layernorm_f16_instance.cpp - device_layernorm_f32_instance.cpp - device_softmax_f32_f32_instance.cpp - device_softmax_f16_f16_instance.cpp + device_normalization_f16_instance.cpp + device_normalization_f32_instance.cpp ) diff --git a/library/src/tensor_operation_instance/gpu/normalization/device_layernorm_f16_instance.cpp b/library/src/tensor_operation_instance/gpu/normalization/device_layernorm_f16_instance.cpp deleted file mode 100644 index 89bdf9438c..0000000000 --- a/library/src/tensor_operation_instance/gpu/normalization/device_layernorm_f16_instance.cpp +++ /dev/null @@ -1,61 +0,0 @@ -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. - -#include "ck/ck.hpp" -#include "ck/tensor_operation/gpu/device/device_layernorm_impl.hpp" -#include "ck/utility/data_type.hpp" - -#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" - -namespace ck { -namespace tensor_operation { -namespace device { -namespace instance { - -using F16 = ck::half_t; -using F32 = float; - -using Pass = ck::tensor_operation::element_wise::PassThrough; - -template -using device_layernorm_f16_instances = std::tuple< - // clang-format off - // XDataType, GammaDataType, BetaDataType, AccDataType, YDataType, Rank, NumReduceDim, BlockSize, MThreadClusterSize, KThreadClusterSize, MThreadSliceSize, KThreadSliceSize, XYSrcVectorDim, XSrcVectorSize, GammaSrcVectorDim, GammaSrcVectorSize, BetaSrcVectorDim, BetaSrcVectorSize, YDstVectorSize> - DeviceLayernormImpl, // fallback kernel - DeviceLayernormImpl, // fallback kernel - DeviceLayernormImpl, // fallback kernel - DeviceLayernormImpl, - DeviceLayernormImpl, - DeviceLayernormImpl, - DeviceLayernormImpl, - DeviceLayernormImpl, - DeviceLayernormImpl, - DeviceLayernormImpl, - DeviceLayernormImpl, - DeviceLayernormImpl, - DeviceLayernormImpl - // clang-format on - >; - -void add_device_layernorm_rank_2_1_f16_instances( - std::vector>>& instances) -{ - add_device_operation_instances(instances, device_layernorm_f16_instances{}); -} - -void add_device_layernorm_rank_4_3_f16_instances( - std::vector>>& instances) -{ - add_device_operation_instances(instances, device_layernorm_f16_instances{}); -} - -void add_device_layernorm_rank_5_3_f16_instances( - std::vector>>& instances) -{ - add_device_operation_instances(instances, device_layernorm_f16_instances{}); -} - -} // namespace instance -} // namespace device -} // namespace tensor_operation -} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/normalization/device_layernorm_f32_instance.cpp b/library/src/tensor_operation_instance/gpu/normalization/device_layernorm_f32_instance.cpp deleted file mode 100644 index 1b35f275ad..0000000000 --- a/library/src/tensor_operation_instance/gpu/normalization/device_layernorm_f32_instance.cpp +++ /dev/null @@ -1,57 +0,0 @@ -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. - -#include "ck/ck.hpp" -#include "ck/tensor_operation/gpu/device/device_layernorm_impl.hpp" -#include "ck/utility/data_type.hpp" - -#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" - -namespace ck { -namespace tensor_operation { -namespace device { -namespace instance { - -using F32 = float; - -using Pass = ck::tensor_operation::element_wise::PassThrough; - -template -using device_layernorm_f32_instances = std::tuple< - // clang-format off - // XDataType, GammaDataType, BetaDataType, AccDataType, YDataType, Rank, NumReduceDim, BlockSize, MThreadClusterSize, KThreadClusterSize, MThreadSliceSize, KThreadSliceSize, XYSrcVectorDim, XSrcVectorSize, GammaSrcVectorSize, BetaSrcVectorSize, YDstVectorSize> - DeviceLayernormImpl, // fallback kernel - DeviceLayernormImpl, // fallback kernel - DeviceLayernormImpl, - DeviceLayernormImpl, - DeviceLayernormImpl, - DeviceLayernormImpl, - DeviceLayernormImpl, - DeviceLayernormImpl, - DeviceLayernormImpl, - DeviceLayernormImpl - // clang-format on - >; - -void add_device_layernorm_rank_2_1_f32_instances( - std::vector>>& instances) -{ - add_device_operation_instances(instances, device_layernorm_f32_instances{}); -} - -void add_device_layernorm_rank_4_3_f32_instances( - std::vector>>& instances) -{ - add_device_operation_instances(instances, device_layernorm_f32_instances{}); -} - -void add_device_layernorm_rank_5_3_f32_instances( - std::vector>>& instances) -{ - add_device_operation_instances(instances, device_layernorm_f32_instances{}); -} - -} // namespace instance -} // namespace device -} // namespace tensor_operation -} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/normalization/device_normalization_f16_instance.cpp b/library/src/tensor_operation_instance/gpu/normalization/device_normalization_f16_instance.cpp new file mode 100644 index 0000000000..97582403a4 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/normalization/device_normalization_f16_instance.cpp @@ -0,0 +1,65 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#include "ck/ck.hpp" +#include "ck/tensor_operation/gpu/device/device_normalization_impl.hpp" +#include "ck/utility/data_type.hpp" + +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +using F16 = ck::half_t; +using F32 = float; + +using Pass = ck::tensor_operation::element_wise::PassThrough; + +template +// clang-format off +using device_normalization_f16_instances = + std::tuple < + // XDataType, GammaDataType, BetaDataType, AccDataType, YDataType, Rank, NumReduceDim, BlockSize, MThreadClusterSize, KThreadClusterSize, MThreadSliceSize, KThreadSliceSize, XYSrcVectorDim, XSrcVectorSize, GammaSrcVectorDim, GammaSrcVectorSize, BetaSrcVectorDim, BetaSrcVectorSize, YDstVectorSize> + DeviceNormalizationImpl, // fallback kernel + DeviceNormalizationImpl, // fallback kernel + DeviceNormalizationImpl, // fallback kernel + DeviceNormalizationImpl, + DeviceNormalizationImpl, + DeviceNormalizationImpl, + DeviceNormalizationImpl, + DeviceNormalizationImpl, + DeviceNormalizationImpl, + DeviceNormalizationImpl, + DeviceNormalizationImpl, + DeviceNormalizationImpl, + DeviceNormalizationImpl + >; +// clang-format on + +void add_device_normalization_rank_2_1_f16_instances( + std::vector>>& + instances) +{ + add_device_operation_instances(instances, device_normalization_f16_instances{}); +} + +void add_device_normalization_rank_4_3_f16_instances( + std::vector>>& + instances) +{ + add_device_operation_instances(instances, device_normalization_f16_instances{}); +} + +void add_device_normalization_rank_5_3_f16_instances( + std::vector>>& + instances) +{ + add_device_operation_instances(instances, device_normalization_f16_instances{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/normalization/device_normalization_f32_instance.cpp b/library/src/tensor_operation_instance/gpu/normalization/device_normalization_f32_instance.cpp new file mode 100644 index 0000000000..75e9fafe6e --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/normalization/device_normalization_f32_instance.cpp @@ -0,0 +1,60 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#include "ck/ck.hpp" +#include "ck/tensor_operation/gpu/device/device_normalization_impl.hpp" +#include "ck/utility/data_type.hpp" + +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +using F32 = float; + +using Pass = ck::tensor_operation::element_wise::PassThrough; + +template +using device_layernorm_f32_instances = std::tuple< + // clang-format off + // XDataType, GammaDataType, BetaDataType, AccDataType, YDataType, Rank, NumReduceDim, BlockSize, MThreadClusterSize, KThreadClusterSize, MThreadSliceSize, KThreadSliceSize, XYSrcVectorDim, XSrcVectorSize, GammaSrcVectorSize, BetaSrcVectorSize, YDstVectorSize> + DeviceNormalizationImpl, // fallback kernel + DeviceNormalizationImpl, // fallback kernel + DeviceNormalizationImpl, + DeviceNormalizationImpl, + DeviceNormalizationImpl, + DeviceNormalizationImpl, + DeviceNormalizationImpl, + DeviceNormalizationImpl, + DeviceNormalizationImpl, + DeviceNormalizationImpl + // clang-format on + >; + +void add_device_normalization_rank_2_1_f32_instances( + std::vector>>& + instances) +{ + add_device_operation_instances(instances, device_layernorm_f32_instances{}); +} + +void add_device_normalization_rank_4_3_f32_instances( + std::vector>>& + instances) +{ + add_device_operation_instances(instances, device_layernorm_f32_instances{}); +} + +void add_device_normalization_rank_5_3_f32_instances( + std::vector>>& + instances) +{ + add_device_operation_instances(instances, device_layernorm_f32_instances{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/softmax/CMakeLists.txt b/library/src/tensor_operation_instance/gpu/softmax/CMakeLists.txt new file mode 100644 index 0000000000..081cb23b23 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/softmax/CMakeLists.txt @@ -0,0 +1,4 @@ +add_instance_library(device_softmax_instance + device_softmax_f16_f16_instance.cpp + device_softmax_f32_f32_instance.cpp +) diff --git a/library/src/tensor_operation_instance/gpu/normalization/device_softmax_f16_f16_instance.cpp b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance.cpp similarity index 100% rename from library/src/tensor_operation_instance/gpu/normalization/device_softmax_f16_f16_instance.cpp rename to library/src/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance.cpp diff --git a/library/src/tensor_operation_instance/gpu/normalization/device_softmax_f32_f32_instance.cpp b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance.cpp similarity index 100% rename from library/src/tensor_operation_instance/gpu/normalization/device_softmax_f32_f32_instance.cpp rename to library/src/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance.cpp diff --git a/profiler/CMakeLists.txt b/profiler/CMakeLists.txt index 53a26af890..bb0547933c 100644 --- a/profiler/CMakeLists.txt +++ b/profiler/CMakeLists.txt @@ -25,7 +25,7 @@ set(PROFILER_SOURCE src/profile_reduce.cpp src/profile_groupnorm.cpp src/profile_layernorm.cpp - src/profile_normalization.cpp + src/profile_softmax.cpp ) add_executable(ckProfiler ${PROFILER_SOURCE}) @@ -55,4 +55,5 @@ target_link_libraries(ckProfiler PRIVATE device_conv3d_bwd_weight_instance) target_link_libraries(ckProfiler PRIVATE device_conv2d_fwd_bias_relu_instance) target_link_libraries(ckProfiler PRIVATE device_conv2d_fwd_bias_relu_add_instance) target_link_libraries(ckProfiler PRIVATE device_normalization_instance) +target_link_libraries(ckProfiler PRIVATE device_softmax_instance) target_link_libraries(ckProfiler PRIVATE device_reduce_instance) diff --git a/profiler/include/profile_groupnorm_impl.hpp b/profiler/include/profile_groupnorm_impl.hpp index 44aa1d0e3c..05966ed412 100644 --- a/profiler/include/profile_groupnorm_impl.hpp +++ b/profiler/include/profile_groupnorm_impl.hpp @@ -7,7 +7,7 @@ #include "ck/ck.hpp" -#include "ck/library/tensor_operation_instance/gpu/layernorm.hpp" +#include "ck/library/tensor_operation_instance/gpu/normalization.hpp" #include "ck/library/utility/check_err.hpp" #include "ck/library/utility/device_memory.hpp" @@ -75,14 +75,14 @@ bool profile_groupnorm_impl(int do_verification, beta_dev.ToDevice(beta.mData.data()); // add device normalization instances - using DeviceOp = ck::tensor_operation::device::DeviceLayernorm; + using DeviceOp = ck::tensor_operation::device::DeviceNormalization; // get device op instances const auto instance_ptrs = diff --git a/profiler/include/profile_layernorm_impl.hpp b/profiler/include/profile_layernorm_impl.hpp index b0b4a73ab8..bff0321355 100644 --- a/profiler/include/profile_layernorm_impl.hpp +++ b/profiler/include/profile_layernorm_impl.hpp @@ -7,7 +7,7 @@ #include "ck/ck.hpp" -#include "ck/library/tensor_operation_instance/gpu/layernorm.hpp" +#include "ck/library/tensor_operation_instance/gpu/normalization.hpp" #include "ck/library/utility/check_err.hpp" #include "ck/library/utility/device_memory.hpp" @@ -28,27 +28,29 @@ void profile_layernorm_impl(int do_verification, int init_method, bool do_log, bool time_kernel, - std::vector length, - std::vector strideXY, - std::vector strideGamma, - std::vector strideBeta) + std::vector length) { using PassThrough = ck::tensor_operation::element_wise::PassThrough; if(length.size() < 2) return; - // Assume normalize dimension except for first dimension + // Assume normalize dimension except for batch (first) dimension std::vector reduce_length{length.begin() + 1, length.end()}; std::vector reduce_dim; for(int i = 1; i < Rank; ++i) reduce_dim.push_back(i); Tensor x(length); - Tensor gamma(reduce_length, strideGamma); - Tensor beta(reduce_length, strideBeta); - Tensor y(length, strideXY); - Tensor host_y(length, strideXY); + Tensor gamma(reduce_length); + Tensor beta(reduce_length); + Tensor y(length); + Tensor host_y(length); + + std::vector strideXY = + std::vector{x.mDesc.GetStrides().begin(), x.mDesc.GetStrides().end()}; + std::vector strideGammaBeta = strideXY; + strideGammaBeta[0] = 0; switch(init_method) { @@ -84,14 +86,14 @@ void profile_layernorm_impl(int do_verification, constexpr int NumReduceDim = Rank - 1; // add device normalization instances - using DeviceOp = ck::tensor_operation::device::DeviceLayernorm; + using DeviceOp = ck::tensor_operation::device::DeviceNormalization; // get device op instances const auto instance_ptrs = @@ -126,8 +128,8 @@ void profile_layernorm_impl(int do_verification, { auto argument_ptr = inst_ptr->MakeArgumentPointer(length, strideXY, - strideGamma, - strideBeta, + strideGammaBeta, + strideGammaBeta, strideXY, reduce_dim, 1e-4, diff --git a/profiler/include/profile_normalization_impl.hpp b/profiler/include/profile_softmax_impl.hpp similarity index 94% rename from profiler/include/profile_normalization_impl.hpp rename to profiler/include/profile_softmax_impl.hpp index 9f6d7e3d88..8394a58453 100644 --- a/profiler/include/profile_normalization_impl.hpp +++ b/profiler/include/profile_softmax_impl.hpp @@ -69,16 +69,16 @@ template <> std::string type_to_string() { return "int32"; } // clang-format on template -void profile_normalization_impl(int do_verification, - int init_method, - bool do_log, - bool time_kernel, - std::vector in_length, - std::vector in_strides, - std::vector reduce_dims, - AccDataType alpha, - AccDataType beta, - NormType norm_type) +void profile_softmax_impl(int do_verification, + int init_method, + bool do_log, + bool time_kernel, + std::vector in_length, + std::vector in_strides, + std::vector reduce_dims, + AccDataType alpha, + AccDataType beta, + NormType norm_type) { if(Rank != in_length.size()) { diff --git a/profiler/src/profile_layernorm.cpp b/profiler/src/profile_layernorm.cpp index 9e31342cca..b090a4e1c8 100644 --- a/profiler/src/profile_layernorm.cpp +++ b/profiler/src/profile_layernorm.cpp @@ -12,8 +12,7 @@ using ck::index_t; struct LayernormArgParser { - std::unordered_map> long_opts = { - {"length", {}}, {"strideXY", {}}, {"strideGamma", {}}, {"strideBeta", {}}}; + std::unordered_map> long_opts = {{"length", {}}}; bool parse_opt(int argc, char* argv[], const std::string& key, int i) { @@ -52,9 +51,6 @@ void print_help_layernorm() << "arg4: print tensor value (0: no; 1: yes)\n" << "arg5: time kernel (0=no, 1=yes)\n" << "--length: tensor extents (e.g, --length 1024 1024) \n" - << "--strideXY: tensor strides (e.g, --strideXY 1024 1)\n" - << "--strideGamma: tensor strides (e.g, --strideGamma 1)\n" - << "--strideBeta: tensor strides (e.g, --strideBeta 1)\n" << std::endl; } @@ -77,10 +73,7 @@ int profile_layernorm(int argc, char* argv[]) // parse the long options arg_parser(argc, argv); - const std::vector length = arg_parser.long_opts["length"]; - const std::vector strideXY = arg_parser.long_opts["strideXY"]; - const std::vector strideGamma = arg_parser.long_opts["strideGamma"]; - const std::vector strideBeta = arg_parser.long_opts["strideBeta"]; + const std::vector length = arg_parser.long_opts["length"]; using F16 = ck::half_t; using F32 = float; @@ -88,25 +81,13 @@ int profile_layernorm(int argc, char* argv[]) if(data_type == ck::DataTypeEnum::Half) { - ck::profiler::profile_layernorm_impl(do_verification, - init_method, - do_log, - time_kernel, - length, - strideXY, - strideGamma, - strideBeta); + ck::profiler::profile_layernorm_impl( + do_verification, init_method, do_log, time_kernel, length); } else if(data_type == ck::DataTypeEnum::Float) { - ck::profiler::profile_layernorm_impl(do_verification, - init_method, - do_log, - time_kernel, - length, - strideXY, - strideGamma, - strideBeta); + ck::profiler::profile_layernorm_impl( + do_verification, init_method, do_log, time_kernel, length); } else { diff --git a/profiler/src/profile_normalization.cpp b/profiler/src/profile_softmax.cpp similarity index 67% rename from profiler/src/profile_normalization.cpp rename to profiler/src/profile_softmax.cpp index 0e95a989a7..622d1c5673 100644 --- a/profiler/src/profile_normalization.cpp +++ b/profiler/src/profile_softmax.cpp @@ -5,7 +5,7 @@ #include #include -#include "profiler/include/profile_normalization_impl.hpp" +#include "profiler/include/profile_softmax_impl.hpp" using ck::index_t; using ck::profiler::NormDataType; @@ -95,30 +95,29 @@ int profile_normalization(int argc, char* argv[]) { if(data_type == NormDataType::F16_F16) { - ck::profiler::profile_normalization_impl( - do_verification, - init_method, - do_log, - time_kernel, - length, - stride, - reduce, - float(alpha), - float(beta), - norm_type); + ck::profiler::profile_softmax_impl(do_verification, + init_method, + do_log, + time_kernel, + length, + stride, + reduce, + float(alpha), + float(beta), + norm_type); } else if(data_type == NormDataType::F32_F32) { - ck::profiler::profile_normalization_impl(do_verification, - init_method, - do_log, - time_kernel, - length, - stride, - reduce, - float(alpha), - float(beta), - norm_type); + ck::profiler::profile_softmax_impl(do_verification, + init_method, + do_log, + time_kernel, + length, + stride, + reduce, + float(alpha), + float(beta), + norm_type); } else { @@ -129,30 +128,29 @@ int profile_normalization(int argc, char* argv[]) { if(data_type == NormDataType::F16_F16) { - ck::profiler::profile_normalization_impl( - do_verification, - init_method, - do_log, - time_kernel, - length, - stride, - reduce, - float(alpha), - float(beta), - norm_type); + ck::profiler::profile_softmax_impl(do_verification, + init_method, + do_log, + time_kernel, + length, + stride, + reduce, + float(alpha), + float(beta), + norm_type); } else if(data_type == NormDataType::F32_F32) { - ck::profiler::profile_normalization_impl(do_verification, - init_method, - do_log, - time_kernel, - length, - stride, - reduce, - float(alpha), - float(beta), - norm_type); + ck::profiler::profile_softmax_impl(do_verification, + init_method, + do_log, + time_kernel, + length, + stride, + reduce, + float(alpha), + float(beta), + norm_type); } else { diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index 306a311226..e1b0b9c6e6 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -6,11 +6,10 @@ include(googletest) add_custom_target(tests) - function(add_test_executable TEST_NAME) message("adding test ${TEST_NAME}") add_executable(${TEST_NAME} ${ARGN}) - add_test(NAME ${TEST_NAME} COMMAND $ ) + add_test(NAME ${TEST_NAME} COMMAND $) add_dependencies(tests ${TEST_NAME}) add_dependencies(check ${TEST_NAME}) rocm_install(TARGETS ${TEST_NAME} COMPONENT tests) @@ -23,6 +22,7 @@ function(add_gtest_executable TEST_NAME) add_executable(${TEST_NAME} ${ARGN}) add_dependencies(tests ${TEST_NAME}) add_dependencies(check ${TEST_NAME}) + # suppress gtest warnings target_compile_options(${TEST_NAME} PRIVATE -Wno-global-constructors -Wno-undef) target_link_libraries(${TEST_NAME} PRIVATE gtest_main) @@ -30,7 +30,6 @@ function(add_gtest_executable TEST_NAME) rocm_install(TARGETS ${TEST_NAME} COMPONENT tests) endfunction(add_gtest_executable TEST_NAME) - add_subdirectory(magic_number_division) add_subdirectory(space_filling_curve) add_subdirectory(conv_util) @@ -51,5 +50,5 @@ add_subdirectory(convnd_bwd_data) add_subdirectory(grouped_convnd_fwd) add_subdirectory(block_to_ctile_map) add_subdirectory(softmax) -add_subdirectory(layernorm) +add_subdirectory(normalization) add_subdirectory(data_type) diff --git a/test/layernorm/CMakeLists.txt b/test/normalization/CMakeLists.txt similarity index 100% rename from test/layernorm/CMakeLists.txt rename to test/normalization/CMakeLists.txt diff --git a/test/layernorm/test_groupnorm_fp16.cpp b/test/normalization/test_groupnorm_fp16.cpp similarity index 100% rename from test/layernorm/test_groupnorm_fp16.cpp rename to test/normalization/test_groupnorm_fp16.cpp diff --git a/test/layernorm/test_groupnorm_fp32.cpp b/test/normalization/test_groupnorm_fp32.cpp similarity index 100% rename from test/layernorm/test_groupnorm_fp32.cpp rename to test/normalization/test_groupnorm_fp32.cpp diff --git a/test/layernorm/test_layernorm2d_fp16.cpp b/test/normalization/test_layernorm2d_fp16.cpp similarity index 100% rename from test/layernorm/test_layernorm2d_fp16.cpp rename to test/normalization/test_layernorm2d_fp16.cpp diff --git a/test/layernorm/test_layernorm2d_fp32.cpp b/test/normalization/test_layernorm2d_fp32.cpp similarity index 100% rename from test/layernorm/test_layernorm2d_fp32.cpp rename to test/normalization/test_layernorm2d_fp32.cpp diff --git a/test/layernorm/test_layernorm2d_util.hpp b/test/normalization/test_layernorm2d_util.hpp similarity index 91% rename from test/layernorm/test_layernorm2d_util.hpp rename to test/normalization/test_layernorm2d_util.hpp index 6112c7f5bf..3998d08b03 100644 --- a/test/layernorm/test_layernorm2d_util.hpp +++ b/test/normalization/test_layernorm2d_util.hpp @@ -9,7 +9,7 @@ #include "ck/ck.hpp" #include "ck/utility/number.hpp" -#include "ck/tensor_operation/gpu/device/device_layernorm_impl.hpp" +#include "ck/tensor_operation/gpu/device/device_normalization_impl.hpp" #include "ck/library/utility/check_err.hpp" #include "ck/library/utility/host_tensor.hpp" @@ -65,26 +65,26 @@ class TestLayernorm2d : public ::testing::Test Rank, NumReduceDim>; - using DeviceInstance = tensor_operation::device::DeviceLayernormImpl; + using DeviceInstance = tensor_operation::device::DeviceNormalizationImpl; TestLayernorm2d() : ref_instance_invoker_(ReferenceInstance{}.MakeInvoker()) {}