From b64f30e7338dab2262eae03033dfc7bb72d0d62e Mon Sep 17 00:00:00 2001 From: rocking Date: Thu, 9 Nov 2023 08:34:51 +0800 Subject: [PATCH] Layernorm4d (#1022) * Rename folder * Add layernorm 4d fwd example * Rename original layernorm example * Add layernorm 4d f16 test * Add layernorm4d_fwd client example * Support layernorm4D in ckProfiler * Rename groupnorm to groupnorm fwd in example * Rename layernorm and group fwd in test * Rename normalization to normalization_fwd (instances) * Add fwd to DeviceNormalization * Rename external api header * Rename folder, because we can also add bwd in this folder * Add fwd in layernorm and groupnorm (profiler * Fix compile error --------- Co-authored-by: Po Yen Chen [ROCm/composable_kernel commit: a3d9a2cd42ae2405a9265553d05a706ae61ade7e] --- client_example/05_layernorm/CMakeLists.txt | 7 +- .../{layernorm2d.cpp => layernorm2d_fwd.cpp} | 20 +- .../05_layernorm/layernorm4d_fwd.cpp | 201 ++++++++++++++++++ .../18_groupnorm/groupnorm_swish.cpp | 20 +- example/27_layernorm/CMakeLists.txt | 2 - example/27_layernorm/layernorm_fp16.cpp | 44 ---- .../27_layernorm/layernorm_splitk_fp16.cpp | 45 ---- example/27_layernorm2d_fwd/CMakeLists.txt | 2 + .../common.hpp | 4 +- .../layernorm2d_fwd_fp16.cpp | 44 ++++ .../layernorm2d_fwd_splitk_fp16.cpp | 45 ++++ .../run_layernorm_example.inc | 6 +- example/42_groupnorm/CMakeLists.txt | 3 - .../groupnorm_sigmoid_mul_fp16.cpp | 65 ------ .../42_groupnorm/groupnorm_splitk_fp16.cpp | 45 ---- example/42_groupnorm/groupnorm_swish_fp16.cpp | 45 ---- example/42_groupnorm_fwd/CMakeLists.txt | 3 + .../common.hpp | 4 +- .../groupnorm_fwd_sigmoid_mul_fp16.cpp | 65 ++++++ .../groupnorm_fwd_splitk_fp16.cpp | 45 ++++ .../groupnorm_fwd_swish_fp16.cpp | 45 ++++ .../run_groupnorm_fwd_example.inc} | 6 +- example/63_layernorm4d_fwd/CMakeLists.txt | 2 + example/63_layernorm4d_fwd/common.hpp | 22 ++ .../layernorm4d_fwd_fp16.cpp | 44 ++++ .../layernorm4d_fwd_splitk_fp16.cpp | 45 ++++ .../run_layernorm4d_fwd_example.inc | 124 +++++++++++ ...ation.hpp => device_normalization_fwd.hpp} | 18 +- ....hpp => device_normalization_fwd_impl.hpp} | 20 +- ... device_normalization_fwd_splitk_impl.hpp} | 20 +- .../cpu/reference_layernorm.hpp | 87 +++++++- ...ormalization.hpp => normalization_fwd.hpp} | 62 +++--- ..._swish.hpp => normalization_fwd_swish.hpp} | 52 ++--- .../gpu/normalization/CMakeLists.txt | 14 -- .../normalization_instance_common.hpp | 201 ------------------ .../gpu/normalization_fwd/CMakeLists.txt | 14 ++ .../device_groupnorm_fwd_f16_instance.cpp} | 6 +- .../device_groupnorm_fwd_f32_instance.cpp} | 6 +- ...rm_fwd_swish_f16_f32_f32_f16_instance.cpp} | 6 +- ...vice_groupnorm_fwd_swish_f16_instance.cpp} | 6 +- ...vice_groupnorm_fwd_swish_f32_instance.cpp} | 6 +- .../device_layernorm2d_fwd_f16_instance.cpp} | 6 +- .../device_layernorm2d_fwd_f32_instance.cpp} | 6 +- .../device_layernorm4d_fwd_f16_instance.cpp} | 6 +- .../device_layernorm4d_fwd_f32_instance.cpp} | 6 +- .../normalization_fwd_instance_common.hpp | 201 ++++++++++++++++++ ...mpl.hpp => profile_groupnorm_fwd_impl.hpp} | 18 +- ...mpl.hpp => profile_layernorm_fwd_impl.hpp} | 18 +- profiler/src/CMakeLists.txt | 6 +- ...roupnorm.cpp => profile_groupnorm_fwd.cpp} | 2 +- ...ayernorm.cpp => profile_layernorm_fwd.cpp} | 47 +++- test/CMakeLists.txt | 2 +- test/normalization/CMakeLists.txt | 21 -- test/normalization_fwd/CMakeLists.txt | 30 +++ .../test_groupnorm_fwd_fp16.cpp} | 2 +- .../test_groupnorm_fwd_fp32.cpp} | 2 +- .../test_layernorm2d_fwd_fp16.cpp} | 2 +- .../test_layernorm2d_fwd_fp32.cpp} | 2 +- .../test_layernorm4d_fwd_fp16.cpp | 48 +++++ 59 files changed, 1271 insertions(+), 675 deletions(-) rename client_example/05_layernorm/{layernorm2d.cpp => layernorm2d_fwd.cpp} (96%) create mode 100644 client_example/05_layernorm/layernorm4d_fwd.cpp delete mode 100644 example/27_layernorm/CMakeLists.txt delete mode 100644 example/27_layernorm/layernorm_fp16.cpp delete mode 100644 example/27_layernorm/layernorm_splitk_fp16.cpp create mode 100644 example/27_layernorm2d_fwd/CMakeLists.txt rename example/{27_layernorm => 27_layernorm2d_fwd}/common.hpp (94%) create mode 100644 example/27_layernorm2d_fwd/layernorm2d_fwd_fp16.cpp create mode 100644 example/27_layernorm2d_fwd/layernorm2d_fwd_splitk_fp16.cpp rename example/{27_layernorm => 27_layernorm2d_fwd}/run_layernorm_example.inc (96%) delete mode 100644 example/42_groupnorm/CMakeLists.txt delete mode 100644 example/42_groupnorm/groupnorm_sigmoid_mul_fp16.cpp delete mode 100644 example/42_groupnorm/groupnorm_splitk_fp16.cpp delete mode 100644 example/42_groupnorm/groupnorm_swish_fp16.cpp create mode 100644 example/42_groupnorm_fwd/CMakeLists.txt rename example/{42_groupnorm => 42_groupnorm_fwd}/common.hpp (95%) create mode 100644 example/42_groupnorm_fwd/groupnorm_fwd_sigmoid_mul_fp16.cpp create mode 100644 example/42_groupnorm_fwd/groupnorm_fwd_splitk_fp16.cpp create mode 100644 example/42_groupnorm_fwd/groupnorm_fwd_swish_fp16.cpp rename example/{42_groupnorm/run_groupnorm_example.inc => 42_groupnorm_fwd/run_groupnorm_fwd_example.inc} (96%) create mode 100644 example/63_layernorm4d_fwd/CMakeLists.txt create mode 100644 example/63_layernorm4d_fwd/common.hpp create mode 100644 example/63_layernorm4d_fwd/layernorm4d_fwd_fp16.cpp create mode 100644 example/63_layernorm4d_fwd/layernorm4d_fwd_splitk_fp16.cpp create mode 100644 example/63_layernorm4d_fwd/run_layernorm4d_fwd_example.inc rename include/ck/tensor_operation/gpu/device/{device_normalization.hpp => device_normalization_fwd.hpp} (88%) rename include/ck/tensor_operation/gpu/device/impl/{device_normalization_impl.hpp => device_normalization_fwd_impl.hpp} (96%) rename include/ck/tensor_operation/gpu/device/impl/{device_normalization_splitk_impl.hpp => device_normalization_fwd_splitk_impl.hpp} (98%) rename library/include/ck/library/tensor_operation_instance/gpu/{normalization.hpp => normalization_fwd.hpp} (52%) rename library/include/ck/library/tensor_operation_instance/gpu/{normalization_swish.hpp => normalization_fwd_swish.hpp} (51%) delete mode 100644 library/src/tensor_operation_instance/gpu/normalization/CMakeLists.txt delete mode 100644 library/src/tensor_operation_instance/gpu/normalization/normalization_instance_common.hpp create mode 100644 library/src/tensor_operation_instance/gpu/normalization_fwd/CMakeLists.txt rename library/src/tensor_operation_instance/gpu/{normalization/device_groupnorm_f16_instance.cpp => normalization_fwd/device_groupnorm_fwd_f16_instance.cpp} (78%) rename library/src/tensor_operation_instance/gpu/{normalization/device_groupnorm_f32_instance.cpp => normalization_fwd/device_groupnorm_fwd_f32_instance.cpp} (78%) rename library/src/tensor_operation_instance/gpu/{normalization/device_groupnorm_swish_f16_f32_f32_f16_instance.cpp => normalization_fwd/device_groupnorm_fwd_swish_f16_f32_f32_f16_instance.cpp} (77%) rename library/src/tensor_operation_instance/gpu/{normalization/device_groupnorm_swish_f16_instance.cpp => normalization_fwd/device_groupnorm_fwd_swish_f16_instance.cpp} (78%) rename library/src/tensor_operation_instance/gpu/{normalization/device_groupnorm_swish_f32_instance.cpp => normalization_fwd/device_groupnorm_fwd_swish_f32_instance.cpp} (78%) rename library/src/tensor_operation_instance/gpu/{normalization/device_layernorm2d_f16_instance.cpp => normalization_fwd/device_layernorm2d_fwd_f16_instance.cpp} (78%) rename library/src/tensor_operation_instance/gpu/{normalization/device_layernorm2d_f32_instance.cpp => normalization_fwd/device_layernorm2d_fwd_f32_instance.cpp} (78%) rename library/src/tensor_operation_instance/gpu/{normalization/device_layernorm4d_f16_instance.cpp => normalization_fwd/device_layernorm4d_fwd_f16_instance.cpp} (78%) rename library/src/tensor_operation_instance/gpu/{normalization/device_layernorm4d_f32_instance.cpp => normalization_fwd/device_layernorm4d_fwd_f32_instance.cpp} (78%) create mode 100644 library/src/tensor_operation_instance/gpu/normalization_fwd/normalization_fwd_instance_common.hpp rename profiler/include/profiler/{profile_groupnorm_impl.hpp => profile_groupnorm_fwd_impl.hpp} (98%) rename profiler/include/profiler/{profile_layernorm_impl.hpp => profile_layernorm_fwd_impl.hpp} (98%) rename profiler/src/{profile_groupnorm.cpp => profile_groupnorm_fwd.cpp} (98%) rename profiler/src/{profile_layernorm.cpp => profile_layernorm_fwd.cpp} (63%) delete mode 100644 test/normalization/CMakeLists.txt create mode 100644 test/normalization_fwd/CMakeLists.txt rename test/{normalization/test_groupnorm_fp16.cpp => normalization_fwd/test_groupnorm_fwd_fp16.cpp} (97%) rename test/{normalization/test_groupnorm_fp32.cpp => normalization_fwd/test_groupnorm_fwd_fp32.cpp} (97%) rename test/{normalization/test_layernorm2d_fp16.cpp => normalization_fwd/test_layernorm2d_fwd_fp16.cpp} (97%) rename test/{normalization/test_layernorm2d_fp32.cpp => normalization_fwd/test_layernorm2d_fwd_fp32.cpp} (97%) create mode 100644 test/normalization_fwd/test_layernorm4d_fwd_fp16.cpp diff --git a/client_example/05_layernorm/CMakeLists.txt b/client_example/05_layernorm/CMakeLists.txt index b582b485d4..642eae16d3 100644 --- a/client_example/05_layernorm/CMakeLists.txt +++ b/client_example/05_layernorm/CMakeLists.txt @@ -1,2 +1,5 @@ -add_executable(client_layernorm2d layernorm2d.cpp) -target_link_libraries(client_layernorm2d PRIVATE composable_kernel::device_operations) +add_executable(client_layernorm2d_fwd layernorm2d_fwd.cpp) +target_link_libraries(client_layernorm2d_fwd PRIVATE composable_kernel::device_operations) + +add_executable(client_layernorm4d_fwd layernorm4d_fwd.cpp) +target_link_libraries(client_layernorm4d_fwd PRIVATE composable_kernel::device_operations) diff --git a/client_example/05_layernorm/layernorm2d.cpp b/client_example/05_layernorm/layernorm2d_fwd.cpp similarity index 96% rename from client_example/05_layernorm/layernorm2d.cpp rename to client_example/05_layernorm/layernorm2d_fwd.cpp index 7a8e5fec99..19ddd614de 100644 --- a/client_example/05_layernorm/layernorm2d.cpp +++ b/client_example/05_layernorm/layernorm2d_fwd.cpp @@ -7,10 +7,10 @@ #include "ck/ck.hpp" #include "ck/tensor_operation/gpu/device/tensor_layout.hpp" -#include "ck/tensor_operation/gpu/device/device_normalization.hpp" +#include "ck/tensor_operation/gpu/device/device_normalization_fwd.hpp" #include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" -#include "ck/library/tensor_operation_instance/gpu/normalization.hpp" +#include "ck/library/tensor_operation_instance/gpu/normalization_fwd.hpp" using XDataType = ck::half_t; using GammaDataType = ck::half_t; @@ -57,14 +57,14 @@ int main(int argc, char* argv[]) SimpleDeviceMem save_inv_std_device_buf(sizeof(SaveMeanInvStdDataType) * M); #endif - using DeviceOp = ck::tensor_operation::device::DeviceNormalization; + using DeviceOp = ck::tensor_operation::device::DeviceNormalizationFwd; // get device op instances const auto op_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory< diff --git a/client_example/05_layernorm/layernorm4d_fwd.cpp b/client_example/05_layernorm/layernorm4d_fwd.cpp new file mode 100644 index 0000000000..9a7ecfd87e --- /dev/null +++ b/client_example/05_layernorm/layernorm4d_fwd.cpp @@ -0,0 +1,201 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. + +#include +#include +#include + +#include "ck/ck.hpp" +#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" +#include "ck/tensor_operation/gpu/device/device_normalization_fwd.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +#include "ck/library/tensor_operation_instance/gpu/normalization_fwd.hpp" + +using XDataType = ck::half_t; +using GammaDataType = ck::half_t; +using BetaDataType = ck::half_t; +using YDataType = ck::half_t; +using SaveMeanInvStdDataType = float; +using PassThrough = ck::tensor_operation::element_wise::PassThrough; + +#define SAVE_MEAN_INV_STD + +constexpr int Rank = 4; +constexpr int NumReduceDim = 3; + +struct SimpleDeviceMem +{ + SimpleDeviceMem() = delete; + + SimpleDeviceMem(std::size_t mem_size) : p_mem_{} + { + (void)hipMalloc(static_cast(&p_mem_), mem_size); + } + + void* GetDeviceBuffer() { return p_mem_; } + + ~SimpleDeviceMem() { (void)hipFree(p_mem_); } + + void* p_mem_; +}; + +int main(int argc, char* argv[]) +{ + ck::index_t N = 256; + ck::index_t H = 16; + ck::index_t W = 16; + ck::index_t C = 8; + + std::vector strideXY = {H * W * C, W * C, C, 1}; + std::vector strideGammaBeta = {0, W * C, C, 1}; + std::vector strideSaveMeanInvStd = {1}; + + SimpleDeviceMem x_device_buf(sizeof(XDataType) * N * H * W * C); + SimpleDeviceMem gamma_device_buf(sizeof(GammaDataType) * H * W * C); + SimpleDeviceMem beta_device_buf(sizeof(BetaDataType) * H * W * C); + SimpleDeviceMem y_device_buf(sizeof(YDataType) * N * H * W * C); +#ifdef SAVE_MEAN_INV_STD + SimpleDeviceMem save_mean_device_buf(sizeof(SaveMeanInvStdDataType) * N); + SimpleDeviceMem save_inv_std_device_buf(sizeof(SaveMeanInvStdDataType) * N); +#endif + + using DeviceOp = ck::tensor_operation::device::DeviceNormalizationFwd; + + // get device op instances + const auto op_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory< + DeviceOp>::GetInstances(); + + std::cout << "found " << op_ptrs.size() << " instances" << std::endl; + + std::string best_op_name; + bool found = false; + int best_op_id = -1; + float best_ave_time = std::numeric_limits::max(); + float best_gb_per_sec = 0; + + // profile device operation instances + std::cout << "Run all instances and do timing" << std::endl; + + for(int i = 0; i < op_ptrs.size(); ++i) + { + auto& op_ptr = op_ptrs[i]; + + auto argument_ptr = + op_ptr->MakeArgumentPointer({N, H, W, C}, // lengths + strideXY, // xStrides + strideGammaBeta, // gammaStrides + strideGammaBeta, // betaStrides + strideXY, // yStrides + strideSaveMeanInvStd, // save_mean Strides + strideSaveMeanInvStd, // save_inv_std Strides + {1, 2, 3}, // reduceDims + 1e-4, + x_device_buf.GetDeviceBuffer(), + gamma_device_buf.GetDeviceBuffer(), + beta_device_buf.GetDeviceBuffer(), + y_device_buf.GetDeviceBuffer(), +#ifdef SAVE_MEAN_INV_STD + save_mean_device_buf.GetDeviceBuffer(), + save_inv_std_device_buf.GetDeviceBuffer(), +#else + nullptr, + nullptr, +#endif + PassThrough{}); + + auto invoker_ptr = op_ptr->MakeInvokerPointer(); + + std::string op_name = op_ptr->GetTypeString(); + + if(op_ptr->IsSupportedArgument(argument_ptr.get())) + { + size_t workspace_sz = op_ptr->GetWorkSpaceSize(argument_ptr.get()); + SimpleDeviceMem workspace(workspace_sz); + op_ptr->SetWorkSpacePointer(argument_ptr.get(), workspace.GetDeviceBuffer()); + + float ave_time = invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, true}); + + std::size_t num_byte = + sizeof(XDataType) * N * H * W * C + sizeof(GammaDataType) * H * W * C + + sizeof(BetaDataType) * H * W * C + sizeof(YDataType) * N * H * W * C; + +#ifdef SAVE_MEAN_INV_STD + num_byte += sizeof(SaveMeanInvStdDataType) * N * 2; +#endif + + float gb_per_sec = num_byte / 1.E6 / ave_time; + + std::cout << "Perf: " << std::setw(10) << ave_time << " ms, " << gb_per_sec << " GB/s, " + << op_name << std::endl; + + if(ave_time < best_ave_time) + { + found = true; + best_op_id = i; + best_op_name = op_name; + best_ave_time = ave_time; + best_gb_per_sec = gb_per_sec; + } + } + else + { + std::cout << op_name << " does not support this problem" << std::endl; + } + } + + std::cout << "Best Perf: " << best_ave_time << " ms, " << best_gb_per_sec << " GB/s, " + << best_op_name << std::endl; + + // run the best intance + { + auto& op_ptr = op_ptrs[best_op_id]; + std::cout << "Run the best instance without timing: " << op_ptr->GetTypeString() + << std::endl; + + auto argument_ptr = + op_ptr->MakeArgumentPointer({N, H, W, C}, // lengths + strideXY, // xStrides + strideGammaBeta, // gammaStrides + strideGammaBeta, // betaStrides + strideXY, // yStrides + strideSaveMeanInvStd, // save_mean Strides + strideSaveMeanInvStd, // save_inv_std Strides + {1, 2, 3}, // reduceDims + 1e-4, + x_device_buf.GetDeviceBuffer(), + gamma_device_buf.GetDeviceBuffer(), + beta_device_buf.GetDeviceBuffer(), + y_device_buf.GetDeviceBuffer(), +#ifdef SAVE_MEAN_INV_STD + save_mean_device_buf.GetDeviceBuffer(), + save_inv_std_device_buf.GetDeviceBuffer(), +#else + nullptr, + nullptr, +#endif + PassThrough{}); + + auto invoker_ptr = op_ptr->MakeInvokerPointer(); + + if(op_ptr->IsSupportedArgument(argument_ptr.get())) + { + size_t workspace_sz = op_ptr->GetWorkSpaceSize(argument_ptr.get()); + SimpleDeviceMem workspace(workspace_sz); + op_ptr->SetWorkSpacePointer(argument_ptr.get(), workspace.GetDeviceBuffer()); + + invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, false}); + } + + std::cout << "Done" << std::endl; + } + + return 0; +} diff --git a/client_example/18_groupnorm/groupnorm_swish.cpp b/client_example/18_groupnorm/groupnorm_swish.cpp index abe7492c65..d10d16bf9d 100644 --- a/client_example/18_groupnorm/groupnorm_swish.cpp +++ b/client_example/18_groupnorm/groupnorm_swish.cpp @@ -7,10 +7,10 @@ #include "ck/ck.hpp" #include "ck/tensor_operation/gpu/device/tensor_layout.hpp" -#include "ck/tensor_operation/gpu/device/device_normalization.hpp" +#include "ck/tensor_operation/gpu/device/device_normalization_fwd.hpp" #include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" -#include "ck/library/tensor_operation_instance/gpu/normalization_swish.hpp" +#include "ck/library/tensor_operation_instance/gpu/normalization_fwd_swish.hpp" using XDataType = ck::half_t; using GammaDataType = float; @@ -64,14 +64,14 @@ int main(int argc, char* argv[]) SimpleDeviceMem save_inv_std_device_buf(sizeof(SaveMeanInvStdDataType) * N * G); #endif - using DeviceOp = ck::tensor_operation::device::DeviceNormalization; + using DeviceOp = ck::tensor_operation::device::DeviceNormalizationFwd; // get device op instances const auto op_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory< diff --git a/example/27_layernorm/CMakeLists.txt b/example/27_layernorm/CMakeLists.txt deleted file mode 100644 index 94c23ce774..0000000000 --- a/example/27_layernorm/CMakeLists.txt +++ /dev/null @@ -1,2 +0,0 @@ -add_example_executable(example_layernorm_fp16 layernorm_fp16.cpp) -add_example_executable(example_layernorm_splitk_fp16 layernorm_splitk_fp16.cpp) diff --git a/example/27_layernorm/layernorm_fp16.cpp b/example/27_layernorm/layernorm_fp16.cpp deleted file mode 100644 index 255452e769..0000000000 --- a/example/27_layernorm/layernorm_fp16.cpp +++ /dev/null @@ -1,44 +0,0 @@ -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. - -#include "common.hpp" - -using XDataType = ck::half_t; -using GammaDataType = ck::half_t; -using BetaDataType = ck::half_t; -using YDataType = ck::half_t; -using SaveMeanInvStdDataType = float; -using ComputeDataType = float; -using PassThrough = ck::tensor_operation::element_wise::PassThrough; - -#define SAVE_MEAN_INV_STD - -constexpr int Rank = 2; -constexpr int NumReduceDim = 1; - -using DeviceInstance = - ck::tensor_operation::device::DeviceNormalizationImpl; // SaveMeanInvStdScalarPerVector -#include "run_layernorm_example.inc" - -int main() { return run_groupnorm_example(); } diff --git a/example/27_layernorm/layernorm_splitk_fp16.cpp b/example/27_layernorm/layernorm_splitk_fp16.cpp deleted file mode 100644 index e2a85bddc5..0000000000 --- a/example/27_layernorm/layernorm_splitk_fp16.cpp +++ /dev/null @@ -1,45 +0,0 @@ -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. - -#include "common.hpp" - -using XDataType = ck::half_t; -using GammaDataType = ck::half_t; -using BetaDataType = ck::half_t; -using YDataType = ck::half_t; -using SaveMeanInvStdDataType = float; -using ComputeDataType = float; -using PassThrough = ck::tensor_operation::element_wise::PassThrough; - -#define SAVE_MEAN_INV_STD - -constexpr int Rank = 2; -constexpr int NumReduceDim = 1; - -using DeviceInstance = - ck::tensor_operation::device::DeviceNormalizationSplitKImpl; // SaveMeanInvStdScalarPerVector - -#include "run_layernorm_example.inc" - -int main() { return run_groupnorm_example(); } diff --git a/example/27_layernorm2d_fwd/CMakeLists.txt b/example/27_layernorm2d_fwd/CMakeLists.txt new file mode 100644 index 0000000000..639bd9c400 --- /dev/null +++ b/example/27_layernorm2d_fwd/CMakeLists.txt @@ -0,0 +1,2 @@ +add_example_executable(example_layernorm2d_fwd_fp16 layernorm2d_fwd_fp16.cpp) +add_example_executable(example_layernorm2d_fwd_splitk_fp16 layernorm2d_fwd_splitk_fp16.cpp) diff --git a/example/27_layernorm/common.hpp b/example/27_layernorm2d_fwd/common.hpp similarity index 94% rename from example/27_layernorm/common.hpp rename to example/27_layernorm2d_fwd/common.hpp index 62a71713df..6c7b99f89f 100644 --- a/example/27_layernorm/common.hpp +++ b/example/27_layernorm2d_fwd/common.hpp @@ -10,8 +10,8 @@ #include #include "ck/ck.hpp" -#include "ck/tensor_operation/gpu/device/impl/device_normalization_impl.hpp" -#include "ck/tensor_operation/gpu/device/impl/device_normalization_splitk_impl.hpp" +#include "ck/tensor_operation/gpu/device/impl/device_normalization_fwd_impl.hpp" +#include "ck/tensor_operation/gpu/device/impl/device_normalization_fwd_splitk_impl.hpp" #include "ck/library/utility/check_err.hpp" #include "ck/library/utility/device_memory.hpp" diff --git a/example/27_layernorm2d_fwd/layernorm2d_fwd_fp16.cpp b/example/27_layernorm2d_fwd/layernorm2d_fwd_fp16.cpp new file mode 100644 index 0000000000..20db24f56d --- /dev/null +++ b/example/27_layernorm2d_fwd/layernorm2d_fwd_fp16.cpp @@ -0,0 +1,44 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. + +#include "common.hpp" + +using XDataType = ck::half_t; +using GammaDataType = ck::half_t; +using BetaDataType = ck::half_t; +using YDataType = ck::half_t; +using SaveMeanInvStdDataType = float; +using ComputeDataType = float; +using PassThrough = ck::tensor_operation::element_wise::PassThrough; + +#define SAVE_MEAN_INV_STD + +constexpr int Rank = 2; +constexpr int NumReduceDim = 1; + +using DeviceInstance = + ck::tensor_operation::device::DeviceNormalizationFwdImpl; // SaveMeanInvStdScalarPerVector +#include "run_layernorm_example.inc" + +int main() { return run_layernorm2d_fwd_example(); } diff --git a/example/27_layernorm2d_fwd/layernorm2d_fwd_splitk_fp16.cpp b/example/27_layernorm2d_fwd/layernorm2d_fwd_splitk_fp16.cpp new file mode 100644 index 0000000000..5a57082c79 --- /dev/null +++ b/example/27_layernorm2d_fwd/layernorm2d_fwd_splitk_fp16.cpp @@ -0,0 +1,45 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. + +#include "common.hpp" + +using XDataType = ck::half_t; +using GammaDataType = ck::half_t; +using BetaDataType = ck::half_t; +using YDataType = ck::half_t; +using SaveMeanInvStdDataType = float; +using ComputeDataType = float; +using PassThrough = ck::tensor_operation::element_wise::PassThrough; + +#define SAVE_MEAN_INV_STD + +constexpr int Rank = 2; +constexpr int NumReduceDim = 1; + +using DeviceInstance = ck::tensor_operation::device::DeviceNormalizationFwdSplitKImpl< + XDataType, + GammaDataType, + BetaDataType, + ComputeDataType, + YDataType, + SaveMeanInvStdDataType, + PassThrough, + Rank, + NumReduceDim, + 256, // BlockSize + 8, // ClusterM + 32, // ClusterK + 1, // SliceM + 8, // SliceK + 1, // XYVectorDim (0=M, 1=K) + 8, // XScalarPerVector + 1, // GammaVecDim (0=M, 1=K) + 8, // GammaScalarPerVector + 1, // BetaVecDim (0=M, 1=K) + 8, // BetaScalarPerVector + 8, // YScalarPerVector + 1>; // SaveMeanInvStdScalarPerVector + +#include "run_layernorm_example.inc" + +int main() { return run_layernorm2d_fwd_example(); } diff --git a/example/27_layernorm/run_layernorm_example.inc b/example/27_layernorm2d_fwd/run_layernorm_example.inc similarity index 96% rename from example/27_layernorm/run_layernorm_example.inc rename to example/27_layernorm2d_fwd/run_layernorm_example.inc index 399165c36e..02b60fe548 100644 --- a/example/27_layernorm/run_layernorm_example.inc +++ b/example/27_layernorm2d_fwd/run_layernorm_example.inc @@ -4,7 +4,7 @@ #pragma once template -int run_groupnorm_example() +int run_layernorm2d_fwd_example() { bool time_kernel = false; @@ -44,9 +44,9 @@ int run_groupnorm_example() {0, 1}, std::vector{y.mDesc.GetStrides().begin(), y.mDesc.GetStrides().end()}, std::vector{save_mean.mDesc.GetStrides().begin(), - save_mean.mDesc.GetStrides().end()}, + save_mean.mDesc.GetStrides().end()}, std::vector{save_mean.mDesc.GetStrides().begin(), - save_mean.mDesc.GetStrides().end()}, + save_mean.mDesc.GetStrides().end()}, {1}, 1e-4, x_dev.GetDeviceBuffer(), diff --git a/example/42_groupnorm/CMakeLists.txt b/example/42_groupnorm/CMakeLists.txt deleted file mode 100644 index e8c306ac58..0000000000 --- a/example/42_groupnorm/CMakeLists.txt +++ /dev/null @@ -1,3 +0,0 @@ -add_example_executable(example_groupnorm_sigmoid_mul_fp16 groupnorm_sigmoid_mul_fp16.cpp) -add_example_executable(example_groupnorm_splitk_fp16 groupnorm_splitk_fp16.cpp) -add_example_executable(example_groupnorm_swish_fp16 groupnorm_swish_fp16.cpp) diff --git a/example/42_groupnorm/groupnorm_sigmoid_mul_fp16.cpp b/example/42_groupnorm/groupnorm_sigmoid_mul_fp16.cpp deleted file mode 100644 index 0ede570e62..0000000000 --- a/example/42_groupnorm/groupnorm_sigmoid_mul_fp16.cpp +++ /dev/null @@ -1,65 +0,0 @@ -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. - -#include "common.hpp" - -constexpr int Rank = 5; -constexpr int NumReduceDim = 3; - -using XDataType = ck::half_t; -using GammaDataType = ck::half_t; -using BetaDataType = ck::half_t; -using YDataType = ck::half_t; -using SaveMeanInvStdDataType = float; -using ComputeDataType = float; - -#define SAVE_MEAN_INV_STD - -struct YElementOp -{ - template - __host__ __device__ void operator()(Y& y, const X& x) const - { - static_assert(ck::is_same::value || ck::is_same::value || - ck::is_same::value, - "Data type is not supported by this operation!"); - - static_assert(ck::is_same::value || ck::is_same::value || - ck::is_same::value, - "Data type is not supported by this operation!"); - - X a; - - ck::tensor_operation::element_wise::Sigmoid{}(a, x); - - y = ck::type_convert(x * a); - }; -}; - -using DeviceInstance = - ck::tensor_operation::device::DeviceNormalizationImpl; // SaveMeanInvStdScalarPerVector - -#include "run_groupnorm_example.inc" - -int main(int argc, char* argv[]) { run_groupnorm_example(argc, argv); } diff --git a/example/42_groupnorm/groupnorm_splitk_fp16.cpp b/example/42_groupnorm/groupnorm_splitk_fp16.cpp deleted file mode 100644 index 5f56268e02..0000000000 --- a/example/42_groupnorm/groupnorm_splitk_fp16.cpp +++ /dev/null @@ -1,45 +0,0 @@ -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. - -#include "common.hpp" - -constexpr int Rank = 5; -constexpr int NumReduceDim = 3; - -using XDataType = ck::half_t; -using GammaDataType = ck::half_t; -using BetaDataType = ck::half_t; -using YDataType = ck::half_t; -using SaveMeanInvStdDataType = float; -using ComputeDataType = float; -using YElementOp = ck::tensor_operation::element_wise::Swish; - -#define SAVE_MEAN_INV_STD - -using DeviceInstance = - ck::tensor_operation::device::DeviceNormalizationSplitKImpl; // SaveMeanInvStdScalarPerVector - -#include "run_groupnorm_example.inc" - -int main(int argc, char* argv[]) { run_groupnorm_example(argc, argv); } diff --git a/example/42_groupnorm/groupnorm_swish_fp16.cpp b/example/42_groupnorm/groupnorm_swish_fp16.cpp deleted file mode 100644 index 97cd4698aa..0000000000 --- a/example/42_groupnorm/groupnorm_swish_fp16.cpp +++ /dev/null @@ -1,45 +0,0 @@ -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. - -#include "common.hpp" - -constexpr int Rank = 5; -constexpr int NumReduceDim = 3; - -using XDataType = ck::half_t; -using GammaDataType = ck::half_t; -using BetaDataType = ck::half_t; -using YDataType = ck::half_t; -using SaveMeanInvStdDataType = float; -using ComputeDataType = float; -using YElementOp = ck::tensor_operation::element_wise::Swish; - -#define SAVE_MEAN_INV_STD - -using DeviceInstance = - ck::tensor_operation::device::DeviceNormalizationImpl; // SaveMeanInvStdScalarPerVector - -#include "run_groupnorm_example.inc" - -int main(int argc, char* argv[]) { run_groupnorm_example(argc, argv); } diff --git a/example/42_groupnorm_fwd/CMakeLists.txt b/example/42_groupnorm_fwd/CMakeLists.txt new file mode 100644 index 0000000000..7d08baccd0 --- /dev/null +++ b/example/42_groupnorm_fwd/CMakeLists.txt @@ -0,0 +1,3 @@ +add_example_executable(example_groupnorm_fwd_sigmoid_mul_fp16 groupnorm_fwd_sigmoid_mul_fp16.cpp) +add_example_executable(example_groupnorm_fwd_splitk_fp16 groupnorm_fwd_splitk_fp16.cpp) +add_example_executable(example_groupnorm_fwd_swish_fp16 groupnorm_fwd_swish_fp16.cpp) diff --git a/example/42_groupnorm/common.hpp b/example/42_groupnorm_fwd/common.hpp similarity index 95% rename from example/42_groupnorm/common.hpp rename to example/42_groupnorm_fwd/common.hpp index c8f91eb53b..038a8c0f1f 100644 --- a/example/42_groupnorm/common.hpp +++ b/example/42_groupnorm_fwd/common.hpp @@ -11,8 +11,8 @@ #include "ck/ck.hpp" #include "ck/utility/reduction_enums.hpp" -#include "ck/tensor_operation/gpu/device/impl/device_normalization_impl.hpp" -#include "ck/tensor_operation/gpu/device/impl/device_normalization_splitk_impl.hpp" +#include "ck/tensor_operation/gpu/device/impl/device_normalization_fwd_impl.hpp" +#include "ck/tensor_operation/gpu/device/impl/device_normalization_fwd_splitk_impl.hpp" #include "ck/tensor_operation/gpu/device/reduction_operator_mapping.hpp" #include "ck/library/utility/fill.hpp" diff --git a/example/42_groupnorm_fwd/groupnorm_fwd_sigmoid_mul_fp16.cpp b/example/42_groupnorm_fwd/groupnorm_fwd_sigmoid_mul_fp16.cpp new file mode 100644 index 0000000000..15c02b8213 --- /dev/null +++ b/example/42_groupnorm_fwd/groupnorm_fwd_sigmoid_mul_fp16.cpp @@ -0,0 +1,65 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. + +#include "common.hpp" + +constexpr int Rank = 5; +constexpr int NumReduceDim = 3; + +using XDataType = ck::half_t; +using GammaDataType = ck::half_t; +using BetaDataType = ck::half_t; +using YDataType = ck::half_t; +using SaveMeanInvStdDataType = float; +using ComputeDataType = float; + +#define SAVE_MEAN_INV_STD + +struct YElementOp +{ + template + __host__ __device__ void operator()(Y& y, const X& x) const + { + static_assert(ck::is_same::value || ck::is_same::value || + ck::is_same::value, + "Data type is not supported by this operation!"); + + static_assert(ck::is_same::value || ck::is_same::value || + ck::is_same::value, + "Data type is not supported by this operation!"); + + X a; + + ck::tensor_operation::element_wise::Sigmoid{}(a, x); + + y = ck::type_convert(x * a); + }; +}; + +using DeviceInstance = + ck::tensor_operation::device::DeviceNormalizationFwdImpl; // SaveMeanInvStdScalarPerVector + +#include "run_groupnorm_fwd_example.inc" + +int main(int argc, char* argv[]) { run_groupnorm_fwd_example(argc, argv); } diff --git a/example/42_groupnorm_fwd/groupnorm_fwd_splitk_fp16.cpp b/example/42_groupnorm_fwd/groupnorm_fwd_splitk_fp16.cpp new file mode 100644 index 0000000000..37fdf1b253 --- /dev/null +++ b/example/42_groupnorm_fwd/groupnorm_fwd_splitk_fp16.cpp @@ -0,0 +1,45 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. + +#include "common.hpp" + +constexpr int Rank = 5; +constexpr int NumReduceDim = 3; + +using XDataType = ck::half_t; +using GammaDataType = ck::half_t; +using BetaDataType = ck::half_t; +using YDataType = ck::half_t; +using SaveMeanInvStdDataType = float; +using ComputeDataType = float; +using YElementOp = ck::tensor_operation::element_wise::Swish; + +#define SAVE_MEAN_INV_STD + +using DeviceInstance = ck::tensor_operation::device::DeviceNormalizationFwdSplitKImpl< + XDataType, + GammaDataType, + BetaDataType, + ComputeDataType, + YDataType, + SaveMeanInvStdDataType, + YElementOp, + Rank, + NumReduceDim, + 256, // BlockSize + 1, // ClusterM + 256, // ClusterK + 1, // SliceM + 16, // SliceK + 1, // SrcVecDim (0=M, 1=K) + 2, // SrcScalarPerVector + 1, // GammaVecDim (0=M, 1=K) + 2, // GammaScalarPerVector + 1, // BetaVecDim (0=M, 1=K) + 2, // BetaScalarPerVector + 2, // YScalarPerVector + 1>; // SaveMeanInvStdScalarPerVector + +#include "run_groupnorm_fwd_example.inc" + +int main(int argc, char* argv[]) { run_groupnorm_fwd_example(argc, argv); } diff --git a/example/42_groupnorm_fwd/groupnorm_fwd_swish_fp16.cpp b/example/42_groupnorm_fwd/groupnorm_fwd_swish_fp16.cpp new file mode 100644 index 0000000000..6d17264cef --- /dev/null +++ b/example/42_groupnorm_fwd/groupnorm_fwd_swish_fp16.cpp @@ -0,0 +1,45 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. + +#include "common.hpp" + +constexpr int Rank = 5; +constexpr int NumReduceDim = 3; + +using XDataType = ck::half_t; +using GammaDataType = ck::half_t; +using BetaDataType = ck::half_t; +using YDataType = ck::half_t; +using SaveMeanInvStdDataType = float; +using ComputeDataType = float; +using YElementOp = ck::tensor_operation::element_wise::Swish; + +#define SAVE_MEAN_INV_STD + +using DeviceInstance = + ck::tensor_operation::device::DeviceNormalizationFwdImpl; // SaveMeanInvStdScalarPerVector + +#include "run_groupnorm_fwd_example.inc" + +int main(int argc, char* argv[]) { run_groupnorm_fwd_example(argc, argv); } diff --git a/example/42_groupnorm/run_groupnorm_example.inc b/example/42_groupnorm_fwd/run_groupnorm_fwd_example.inc similarity index 96% rename from example/42_groupnorm/run_groupnorm_example.inc rename to example/42_groupnorm_fwd/run_groupnorm_fwd_example.inc index 89117a9b94..ab6f317bc6 100644 --- a/example/42_groupnorm/run_groupnorm_example.inc +++ b/example/42_groupnorm_fwd/run_groupnorm_fwd_example.inc @@ -3,7 +3,7 @@ #pragma once -int run_groupnorm_example(int argc, char* argv[]) +int run_groupnorm_fwd_example(int argc, char* argv[]) { ck::index_t N = 32; ck::index_t H = 16; @@ -65,9 +65,9 @@ int run_groupnorm_example(int argc, char* argv[]) {0, 0, 0, C, 1}, std::vector{y.mDesc.GetStrides().begin(), y.mDesc.GetStrides().end()}, std::vector{save_mean.mDesc.GetStrides().begin(), - save_mean.mDesc.GetStrides().end()}, + save_mean.mDesc.GetStrides().end()}, std::vector{save_mean.mDesc.GetStrides().begin(), - save_mean.mDesc.GetStrides().end()}, + save_mean.mDesc.GetStrides().end()}, {1, 2, 4}, // reduction dimension: [H, W, C] 1e-6, x_dev.GetDeviceBuffer(), diff --git a/example/63_layernorm4d_fwd/CMakeLists.txt b/example/63_layernorm4d_fwd/CMakeLists.txt new file mode 100644 index 0000000000..3f8c679ab8 --- /dev/null +++ b/example/63_layernorm4d_fwd/CMakeLists.txt @@ -0,0 +1,2 @@ +add_example_executable(example_layernorm4d_fwd_fp16 layernorm4d_fwd_fp16.cpp) +add_example_executable(example_layernorm4d_fwd_splitk_fp16 layernorm4d_fwd_splitk_fp16.cpp) diff --git a/example/63_layernorm4d_fwd/common.hpp b/example/63_layernorm4d_fwd/common.hpp new file mode 100644 index 0000000000..6c7b99f89f --- /dev/null +++ b/example/63_layernorm4d_fwd/common.hpp @@ -0,0 +1,22 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +#include +#include +#include +#include +#include + +#include "ck/ck.hpp" +#include "ck/tensor_operation/gpu/device/impl/device_normalization_fwd_impl.hpp" +#include "ck/tensor_operation/gpu/device/impl/device_normalization_fwd_splitk_impl.hpp" + +#include "ck/library/utility/check_err.hpp" +#include "ck/library/utility/device_memory.hpp" +#include "ck/library/utility/host_common_util.hpp" +#include "ck/library/utility/host_tensor.hpp" +#include "ck/library/utility/host_tensor_generator.hpp" +#include "ck/library/utility/literals.hpp" +#include "ck/library/reference_tensor_operation/cpu/reference_layernorm.hpp" diff --git a/example/63_layernorm4d_fwd/layernorm4d_fwd_fp16.cpp b/example/63_layernorm4d_fwd/layernorm4d_fwd_fp16.cpp new file mode 100644 index 0000000000..659cc3554d --- /dev/null +++ b/example/63_layernorm4d_fwd/layernorm4d_fwd_fp16.cpp @@ -0,0 +1,44 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. + +#include "common.hpp" + +using XDataType = ck::half_t; +using GammaDataType = ck::half_t; +using BetaDataType = ck::half_t; +using YDataType = ck::half_t; +using SaveMeanInvStdDataType = float; +using ComputeDataType = float; +using PassThrough = ck::tensor_operation::element_wise::PassThrough; + +#define SAVE_MEAN_INV_STD + +constexpr int Rank = 4; +constexpr int NumReduceDim = 3; + +using DeviceInstance = + ck::tensor_operation::device::DeviceNormalizationFwdImpl; // SaveMeanInvStdScalarPerVector +#include "run_layernorm4d_fwd_example.inc" + +int main() { return run_layernorm4d_fwd_example(); } diff --git a/example/63_layernorm4d_fwd/layernorm4d_fwd_splitk_fp16.cpp b/example/63_layernorm4d_fwd/layernorm4d_fwd_splitk_fp16.cpp new file mode 100644 index 0000000000..415635a0e3 --- /dev/null +++ b/example/63_layernorm4d_fwd/layernorm4d_fwd_splitk_fp16.cpp @@ -0,0 +1,45 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. + +#include "common.hpp" + +using XDataType = ck::half_t; +using GammaDataType = ck::half_t; +using BetaDataType = ck::half_t; +using YDataType = ck::half_t; +using SaveMeanInvStdDataType = float; +using ComputeDataType = float; +using PassThrough = ck::tensor_operation::element_wise::PassThrough; + +#define SAVE_MEAN_INV_STD + +constexpr int Rank = 4; +constexpr int NumReduceDim = 3; + +using DeviceInstance = ck::tensor_operation::device::DeviceNormalizationFwdSplitKImpl< + XDataType, + GammaDataType, + BetaDataType, + ComputeDataType, + YDataType, + SaveMeanInvStdDataType, + PassThrough, + Rank, + NumReduceDim, + 256, // BlockSize + 8, // ClusterM + 32, // ClusterK + 1, // SliceM + 8, // SliceK + 1, // XYVectorDim (0=M, 1=K) + 8, // XScalarPerVector + 1, // GammaVecDim (0=M, 1=K) + 8, // GammaScalarPerVector + 1, // BetaVecDim (0=M, 1=K) + 8, // BetaScalarPerVector + 8, // YScalarPerVector + 1>; // SaveMeanInvStdScalarPerVector + +#include "run_layernorm4d_fwd_example.inc" + +int main() { return run_layernorm4d_fwd_example(); } diff --git a/example/63_layernorm4d_fwd/run_layernorm4d_fwd_example.inc b/example/63_layernorm4d_fwd/run_layernorm4d_fwd_example.inc new file mode 100644 index 0000000000..f75c01ec61 --- /dev/null +++ b/example/63_layernorm4d_fwd/run_layernorm4d_fwd_example.inc @@ -0,0 +1,124 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +template +int run_layernorm4d_fwd_example() +{ + bool time_kernel = false; + + ck::index_t N = 256; + ck::index_t H = 16; + ck::index_t W = 16; + ck::index_t C = 8; + + Tensor x({N, H, W, C}); + Tensor gamma({H, W, C}); + Tensor beta({H, W, C}); + Tensor y({N, H, W, C}); + Tensor save_mean({N}); + Tensor save_inv_std({N}); + + x.GenerateTensorValue(GeneratorTensor_3{0.0, 1.0}); + gamma.GenerateTensorValue(GeneratorTensor_3{0.0, 1.0}); + beta.GenerateTensorValue(GeneratorTensor_3{0.0, 1.0}); + + DeviceMem x_dev(sizeof(XDataType) * x.mDesc.GetElementSpaceSize()); + DeviceMem gamma_dev(sizeof(GammaDataType) * gamma.mDesc.GetElementSpaceSize()); + DeviceMem beta_dev(sizeof(BetaDataType) * beta.mDesc.GetElementSpaceSize()); + DeviceMem y_dev(sizeof(YDataType) * y.mDesc.GetElementSpaceSize()); +#ifdef SAVE_MEAN_INV_STD + DeviceMem save_mean_dev(sizeof(SaveMeanInvStdDataType) * save_mean.mDesc.GetElementSpaceSize()); + DeviceMem save_inv_std_dev(sizeof(SaveMeanInvStdDataType) * + save_inv_std.mDesc.GetElementSpaceSize()); +#endif + + x_dev.ToDevice(x.mData.data()); + gamma_dev.ToDevice(gamma.mData.data()); + beta_dev.ToDevice(beta.mData.data()); + + auto device_instance = DeviceInstance{}; + auto argument_ptr = device_instance.MakeArgumentPointer( + {N, H, W, C}, + std::vector{x.mDesc.GetStrides().begin(), x.mDesc.GetStrides().end()}, + {0, W * C, C, 1}, + {0, W * C, C, 1}, + std::vector{y.mDesc.GetStrides().begin(), y.mDesc.GetStrides().end()}, + std::vector{save_mean.mDesc.GetStrides().begin(), + save_mean.mDesc.GetStrides().end()}, + std::vector{save_mean.mDesc.GetStrides().begin(), + save_mean.mDesc.GetStrides().end()}, + {1, 2, 3}, + 1e-4, + x_dev.GetDeviceBuffer(), + gamma_dev.GetDeviceBuffer(), + beta_dev.GetDeviceBuffer(), + y_dev.GetDeviceBuffer(), +#ifdef SAVE_MEAN_INV_STD + save_mean_dev.GetDeviceBuffer(), + save_inv_std_dev.GetDeviceBuffer(), +#else + nullptr, + nullptr, +#endif + PassThrough{}); + + if(!device_instance.IsSupportedArgument(argument_ptr.get())) + { + std::cout << "The runtime parameters are not supported" << std::endl; + return 1; + }; + + size_t workspace_sz = device_instance.GetWorkSpaceSize(argument_ptr.get()); + DeviceMem workspace_dev(workspace_sz); + device_instance.SetWorkSpacePointer(argument_ptr.get(), workspace_dev.GetDeviceBuffer()); + + auto invoker_ptr = device_instance.MakeInvokerPointer(); + invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, time_kernel}); + + bool pass = true; + { + Tensor host_y({N, H, W, C}); + Tensor host_save_mean({N}); + Tensor host_save_inv_std({N}); + + using ReferenceInstance = + ck::tensor_operation::host::ReferenceLayernorm; + + ReferenceInstance ref; + auto ref_argument = ref.MakeArgument(x, + gamma, + beta, + host_y, + host_save_mean, + host_save_inv_std, + PassThrough{}, + {N, H, W, C}, + {1, 2, 3}, + 1e-4); + auto ref_invoker = ref.MakeInvoker(); + ref_invoker.Run(ref_argument); + + y_dev.FromDevice(y.mData.data()); + pass &= ck::utils::check_err(y, host_y, "Error: Incorrect results (y)", 1e-3, 1e-3); +#ifdef SAVE_MEAN_INV_STD + save_mean_dev.FromDevice(save_mean.mData.data()); + save_inv_std_dev.FromDevice(save_inv_std.mData.data()); + pass &= ck::utils::check_err( + save_mean, host_save_mean, "Error: Incorrect results (mean)", 1e-3, 1e-3); + pass &= ck::utils::check_err( + save_inv_std, host_save_inv_std, "Error: Incorrect results (inv_std)", 1e-3, 1e-3); +#endif + } + + return (pass ? 0 : 1); +} diff --git a/include/ck/tensor_operation/gpu/device/device_normalization.hpp b/include/ck/tensor_operation/gpu/device/device_normalization_fwd.hpp similarity index 88% rename from include/ck/tensor_operation/gpu/device/device_normalization.hpp rename to include/ck/tensor_operation/gpu/device/device_normalization_fwd.hpp index 97e83ebab2..d252ad1d98 100644 --- a/include/ck/tensor_operation/gpu/device/device_normalization.hpp +++ b/include/ck/tensor_operation/gpu/device/device_normalization_fwd.hpp @@ -19,7 +19,7 @@ template -struct DeviceNormalization : public BaseOperator +struct DeviceNormalizationFwd : public BaseOperator { virtual std::unique_ptr MakeArgumentPointer(const std::vector lengths, @@ -50,14 +50,14 @@ template -using DeviceNormalizationPtr = std::unique_ptr>; +using DeviceNormalizationFwdPtr = std::unique_ptr>; } // namespace device } // namespace tensor_operation diff --git a/include/ck/tensor_operation/gpu/device/impl/device_normalization_impl.hpp b/include/ck/tensor_operation/gpu/device/impl/device_normalization_fwd_impl.hpp similarity index 96% rename from include/ck/tensor_operation/gpu/device/impl/device_normalization_impl.hpp rename to include/ck/tensor_operation/gpu/device/impl/device_normalization_fwd_impl.hpp index 1ef3350185..254d60ea38 100644 --- a/include/ck/tensor_operation/gpu/device/impl/device_normalization_impl.hpp +++ b/include/ck/tensor_operation/gpu/device/impl/device_normalization_fwd_impl.hpp @@ -7,7 +7,7 @@ #include #include "ck/utility/reduction_operator.hpp" -#include "ck/tensor_operation/gpu/device/device_normalization.hpp" +#include "ck/tensor_operation/gpu/device/device_normalization_fwd.hpp" #include "ck/tensor_operation/gpu/device/device_reduce.hpp" #include "ck/tensor_operation/gpu/device/impl/device_reduce_common.hpp" #include "ck/tensor_operation/gpu/grid/normalization/gridwise_normalization_selector.hpp" @@ -46,14 +46,14 @@ template -struct DeviceNormalizationImpl : public DeviceNormalization +struct DeviceNormalizationFwdImpl : public DeviceNormalizationFwd { static_assert(BlockSize == MThreadClusterSize * KThreadClusterSize); static_assert( @@ -461,7 +461,7 @@ struct DeviceNormalizationImpl : public DeviceNormalization -struct DeviceNormalizationSplitKImpl : public DeviceNormalization +struct DeviceNormalizationFwdSplitKImpl : public DeviceNormalizationFwd { using WorkspaceMeanVarDataType = SaveMeanInvStdDataType; @@ -732,7 +732,7 @@ struct DeviceNormalizationSplitKImpl : public DeviceNormalization mean({N}); + Tensor var({N}); + + int reduce_length = H * W * C; + + for(int n = 0; n < N; ++n) + { + mean(n) = 0; + var(n) = 0; + + for(int h = 0; h < H; ++h) + for(int w = 0; w < W; ++w) + for(int c = 0; c < C; ++c) + { + auto x_val = ck::type_convert(arg.x_m_n_(n, h, w, c)); + mean(n) += x_val; + var(n) += x_val * x_val; + } + + mean(n) = mean(n) / reduce_length; + var(n) = (var(n) / reduce_length) - (mean(n) * mean(n)); + } + + for(int n = 0; n < N; ++n) + { + ComputeDataType divisor = + static_cast(1) / ck::math::sqrt(var(n) + arg.epsilon_); + + for(int h = 0; h < H; ++h) + for(int w = 0; w < W; ++w) + for(int c = 0; c < C; ++c) + { + auto x_val = ck::type_convert(arg.x_m_n_(n, h, w, c)); + auto gamma_val = + ck::type_convert(arg.gamma_n_(h, w, c)); + auto beta_val = ck::type_convert(arg.beta_n_(h, w, c)); + auto y_val = (x_val - mean(n)) * divisor; + y_val = (y_val * gamma_val) + beta_val; + arg.y_elementwise_op_(y_val, y_val); + arg.y_m_n_(n, h, w, c) = ck::type_convert(y_val); + } + arg.save_mean_m_(n) = ck::type_convert(mean(n)); + arg.save_inv_std_m_(n) = ck::type_convert(divisor); + } + + return 0; + } + + float Run(const Argument& arg) + { + if(arg.lengths_.size() == 2) + return Run2D(arg); + else if(arg.lengths_.size() == 4) + return Run4D(arg); + + return 0; + } + float Run(const device::BaseArgument* p_arg, const StreamConfig& /* stream_config */ = StreamConfig{}) override { @@ -134,17 +200,16 @@ struct ReferenceLayernorm : public device::BaseOperator { const Argument* p_arg_ = dynamic_cast(p_arg); - // TODO - support generic layernorm - if(p_arg_->lengths_.size() != 2) - return false; + if(p_arg_->lengths_.size() == 2 && p_arg_->reduceDims_.size() == 1 && + p_arg_->reduceDims_[0] == 1) + return true; - if(p_arg_->reduceDims_.size() != 1) - return false; + else if(p_arg_->lengths_.size() == 4 && p_arg_->reduceDims_.size() == 3 && + p_arg_->reduceDims_[0] == 1 && p_arg_->reduceDims_[1] == 2 && + p_arg_->reduceDims_[2] == 3) + return true; - if(p_arg_->reduceDims_[0] != 1) - return false; - - return true; + return false; } static auto MakeArgument(const Tensor& x_m_n, diff --git a/library/include/ck/library/tensor_operation_instance/gpu/normalization.hpp b/library/include/ck/library/tensor_operation_instance/gpu/normalization_fwd.hpp similarity index 52% rename from library/include/ck/library/tensor_operation_instance/gpu/normalization.hpp rename to library/include/ck/library/tensor_operation_instance/gpu/normalization_fwd.hpp index 229de41b5e..29c9f8b2c0 100644 --- a/library/include/ck/library/tensor_operation_instance/gpu/normalization.hpp +++ b/library/include/ck/library/tensor_operation_instance/gpu/normalization_fwd.hpp @@ -7,7 +7,7 @@ #include #include "ck/ck.hpp" #include "ck/tensor_operation/gpu/device/tensor_layout.hpp" -#include "ck/tensor_operation/gpu/device/device_normalization.hpp" +#include "ck/tensor_operation/gpu/device/device_normalization_fwd.hpp" #include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" #include "ck/library/tensor_operation_instance/device_operation_instance_factory.hpp" @@ -18,25 +18,31 @@ namespace device { namespace instance { #ifdef CK_ENABLE_FP16 // FP16 -void add_device_normalization_rank_2_1_f16_instances( - std::vector>>&); +void add_device_normalization_fwd_rank_2_1_f16_instances( + std::vector< + std::unique_ptr>>&); -void add_device_normalization_rank_4_3_f16_instances( - std::vector>>&); +void add_device_normalization_fwd_rank_4_3_f16_instances( + std::vector< + std::unique_ptr>>&); -void add_device_normalization_rank_5_3_f16_instances( - std::vector>>&); +void add_device_normalization_fwd_rank_5_3_f16_instances( + std::vector< + std::unique_ptr>>&); #endif #ifdef CK_ENABLE_FP32 // FP32 -void add_device_normalization_rank_2_1_f32_instances( - std::vector>>&); +void add_device_normalization_fwd_rank_2_1_f32_instances( + std::vector< + std::unique_ptr>>&); -void add_device_normalization_rank_4_3_f32_instances( - std::vector>>&); +void add_device_normalization_fwd_rank_4_3_f32_instances( + std::vector< + std::unique_ptr>>&); -void add_device_normalization_rank_5_3_f32_instances( - std::vector>>&); +void add_device_normalization_fwd_rank_5_3_f32_instances( + std::vector< + std::unique_ptr>>&); #endif template -struct DeviceOperationInstanceFactory> { - using DeviceOp = DeviceNormalization; + using DeviceOp = DeviceNormalizationFwd; static auto GetInstances() { @@ -74,15 +80,15 @@ struct DeviceOperationInstanceFactory>>&); +void add_device_normalization_fwd_rank_5_3_swish_f16_instances( + std::vector>>&); // FP32 -void add_device_normalization_rank_5_3_swish_f32_instances( - std::vector>>&); +void add_device_normalization_fwd_rank_5_3_swish_f32_instances( + std::vector>>&); // [x, gamma, beta, y] = [f16, f32, f32, f16] -void add_device_normalization_rank_5_3_swish_f16_f32_f32_f16_instances( - std::vector>>&); +void add_device_normalization_fwd_rank_5_3_swish_f16_f32_f32_f16_instances( + std::vector>>&); template struct DeviceOperationInstanceFactory< - ck::tensor_operation::device::DeviceNormalization> + ck::tensor_operation::device::DeviceNormalizationFwd> { - using DeviceOp = DeviceNormalization; + using DeviceOp = DeviceNormalizationFwd; static auto GetInstances() { @@ -65,7 +65,7 @@ struct DeviceOperationInstanceFactory< { if constexpr(Rank == 5 && NumReduceDim == 3) { - add_device_normalization_rank_5_3_swish_f16_instances(op_ptrs); + add_device_normalization_fwd_rank_5_3_swish_f16_instances(op_ptrs); } } else if constexpr(is_same_v && is_same_v && @@ -74,7 +74,7 @@ struct DeviceOperationInstanceFactory< { if constexpr(Rank == 5 && NumReduceDim == 3) { - add_device_normalization_rank_5_3_swish_f32_instances(op_ptrs); + add_device_normalization_fwd_rank_5_3_swish_f32_instances(op_ptrs); } } else if constexpr(is_same_v && is_same_v && @@ -83,7 +83,7 @@ struct DeviceOperationInstanceFactory< { if constexpr(Rank == 5 && NumReduceDim == 3) { - add_device_normalization_rank_5_3_swish_f16_f32_f32_f16_instances(op_ptrs); + add_device_normalization_fwd_rank_5_3_swish_f16_f32_f32_f16_instances(op_ptrs); } } diff --git a/library/src/tensor_operation_instance/gpu/normalization/CMakeLists.txt b/library/src/tensor_operation_instance/gpu/normalization/CMakeLists.txt deleted file mode 100644 index 19818a523b..0000000000 --- a/library/src/tensor_operation_instance/gpu/normalization/CMakeLists.txt +++ /dev/null @@ -1,14 +0,0 @@ -set(DEVICE_NORMALIZATION_INSTANCES) - -list(APPEND DEVICE_NORMALIZATION_INSTANCES - device_layernorm2d_f16_instance.cpp - device_layernorm4d_f16_instance.cpp - device_groupnorm_f16_instance.cpp - device_groupnorm_swish_f16_instance.cpp - device_groupnorm_swish_f16_f32_f32_f16_instance.cpp - device_layernorm2d_f32_instance.cpp - device_layernorm4d_f32_instance.cpp - device_groupnorm_f32_instance.cpp - device_groupnorm_swish_f32_instance.cpp) - -add_instance_library(device_normalization_instance ${DEVICE_NORMALIZATION_INSTANCES}) diff --git a/library/src/tensor_operation_instance/gpu/normalization/normalization_instance_common.hpp b/library/src/tensor_operation_instance/gpu/normalization/normalization_instance_common.hpp deleted file mode 100644 index 488f34b4b3..0000000000 --- a/library/src/tensor_operation_instance/gpu/normalization/normalization_instance_common.hpp +++ /dev/null @@ -1,201 +0,0 @@ -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. - -#pragma once - -#include "ck/ck.hpp" -#include "ck/tensor_operation/gpu/device/impl/device_normalization_impl.hpp" -#include "ck/tensor_operation/gpu/device/impl/device_normalization_splitk_impl.hpp" -#include "ck/utility/data_type.hpp" - -#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" - -namespace ck { -namespace tensor_operation { -namespace device { -namespace instance { - -using F16 = ck::half_t; -using F32 = float; - -template -using device_normalization_f16_instances = - // clang-format off - std::tuple < - // XDataType, GammaDataType, BetaDataType, ComputeDataType, YDataType, SaveMeanInvStdDataType, Rank, NumReduceDim, BlockSize, MThreadClusterSize, KThreadClusterSize, MThreadSliceSize, KThreadSliceSize, XYSrcVectorDim, XSrcVectorSize, GammaSrcVectorDim, GammaSrcVectorSize, BetaSrcVectorDim, BetaSrcVectorSize, YDstVectorSize, SaveMeanInvStdScalarPerVector> - DeviceNormalizationImpl, // irregular size - DeviceNormalizationImpl, // irregular size - DeviceNormalizationImpl, // irregular size - DeviceNormalizationImpl, // irregular size - DeviceNormalizationImpl, // irregular size - DeviceNormalizationImpl, // irregular size - DeviceNormalizationImpl, - DeviceNormalizationImpl, - DeviceNormalizationImpl, - DeviceNormalizationImpl, - DeviceNormalizationImpl, - DeviceNormalizationImpl, - DeviceNormalizationImpl, - DeviceNormalizationImpl, - DeviceNormalizationImpl, - DeviceNormalizationImpl, - DeviceNormalizationImpl, - DeviceNormalizationImpl - // clang-format on - >; - -template -using device_normalization_splitk_f16_instances = - // clang-format off - std::tuple < - // XDataType, GammaDataType, BetaDataType, ComputeDataType, YDataType, SaveMeanInvStdDataType, Rank, NumReduceDim, BlockSize, MThreadClusterSize, KThreadClusterSize, MThreadSliceSize, KThreadSliceSize, XYSrcVectorDim, XSrcVectorSize, GammaSrcVectorDim, GammaSrcVectorSize, BetaSrcVectorDim, BetaSrcVectorSize, YDstVectorSize, SaveMeanInvStdScalarPerVector> - DeviceNormalizationSplitKImpl, // irregular size - DeviceNormalizationSplitKImpl, // irregular size - DeviceNormalizationSplitKImpl, // irregular size - DeviceNormalizationSplitKImpl, // irregular size - DeviceNormalizationSplitKImpl, // irregular size - DeviceNormalizationSplitKImpl, // irregular size - DeviceNormalizationSplitKImpl, - DeviceNormalizationSplitKImpl, - DeviceNormalizationSplitKImpl, - DeviceNormalizationSplitKImpl, - DeviceNormalizationSplitKImpl, - DeviceNormalizationSplitKImpl, - DeviceNormalizationSplitKImpl, - DeviceNormalizationSplitKImpl, - DeviceNormalizationSplitKImpl, - DeviceNormalizationSplitKImpl, - DeviceNormalizationSplitKImpl, - DeviceNormalizationSplitKImpl - // clang-format on - >; - -template -using device_normalization_f16_generic_instance = std::tuple< - // clang-format off - DeviceNormalizationImpl - // clang-format on - >; - -template -using device_normalization_f32_instances = std::tuple< - // clang-format off - // XDataType, GammaDataType, BetaDataType, ComputeDataType, YDataType, SaveMeanInvStdDataType, Rank, NumReduceDim, BlockSize, MThreadClusterSize, KThreadClusterSize, MThreadSliceSize, KThreadSliceSize, XYSrcVectorDim, XSrcVectorSize, GammaSrcVectorDim, GammaSrcVectorSize, BetaSrcVectorDim, BetaSrcVectorSize, YDstVectorSize, SaveMeanInvStdScalarPerVector> - DeviceNormalizationImpl, // irregular size - DeviceNormalizationImpl, // irregular size - DeviceNormalizationImpl, // irregular size - DeviceNormalizationImpl, // irregular size - DeviceNormalizationImpl, // irregular size - DeviceNormalizationImpl, - DeviceNormalizationImpl, - DeviceNormalizationImpl, - DeviceNormalizationImpl, - DeviceNormalizationImpl, - DeviceNormalizationImpl, - DeviceNormalizationImpl, - DeviceNormalizationImpl, - DeviceNormalizationImpl, - DeviceNormalizationImpl, - DeviceNormalizationImpl, - DeviceNormalizationImpl, - DeviceNormalizationImpl, - DeviceNormalizationImpl - // clang-format on - >; - -template -using device_normalization_splitk_f32_instances = std::tuple< - // clang-format off - // XDataType, GammaDataType, BetaDataType, ComputeDataType, YDataType, SaveMeanInvStdDataType, Rank, NumReduceDim, BlockSize, MThreadClusterSize, KThreadClusterSize, MThreadSliceSize, KThreadSliceSize, XYSrcVectorDim, XSrcVectorSize, GammaSrcVectorDim, GammaSrcVectorSize, BetaSrcVectorDim, BetaSrcVectorSize, YDstVectorSize, SaveMeanInvStdScalarPerVector> - DeviceNormalizationSplitKImpl, // irregular size - DeviceNormalizationSplitKImpl, // irregular size - DeviceNormalizationSplitKImpl, // irregular size - DeviceNormalizationSplitKImpl, // irregular size - DeviceNormalizationSplitKImpl, // irregular size - DeviceNormalizationSplitKImpl, - DeviceNormalizationSplitKImpl, - DeviceNormalizationSplitKImpl, - DeviceNormalizationSplitKImpl, - DeviceNormalizationSplitKImpl, - DeviceNormalizationSplitKImpl, - DeviceNormalizationSplitKImpl, - DeviceNormalizationSplitKImpl, - DeviceNormalizationSplitKImpl, - DeviceNormalizationSplitKImpl, - DeviceNormalizationSplitKImpl, - DeviceNormalizationSplitKImpl, - DeviceNormalizationSplitKImpl, - DeviceNormalizationSplitKImpl - // clang-format on - >; - -template -using device_normalization_f32_generic_instance = std::tuple< - // clang-format off - DeviceNormalizationImpl - // clang-format on - >; - -template -using device_normalization_f16_f32_f32_f16_instances = std::tuple< - // clang-format off - // XDataType, GammaDataType, BetaDataType, ComputeDataType, YDataType, SaveMeanInvStdDataType, Rank, NumReduceDim, BlockSize, MThreadClusterSize, KThreadClusterSize, MThreadSliceSize, KThreadSliceSize, XYSrcVectorDim, XSrcVectorSize, GammaSrcVectorDim, GammaSrcVectorSize, BetaSrcVectorDim, BetaSrcVectorSize, YDstVectorSize, SaveMeanInvStdScalarPerVector> - DeviceNormalizationImpl, // irregular size - DeviceNormalizationImpl, // irregular size - DeviceNormalizationImpl, // irregular size - DeviceNormalizationImpl, // irregular size - DeviceNormalizationImpl, // irregular size - DeviceNormalizationImpl, - DeviceNormalizationImpl, - DeviceNormalizationImpl, - DeviceNormalizationImpl, - DeviceNormalizationImpl, - DeviceNormalizationImpl, - DeviceNormalizationImpl, - DeviceNormalizationImpl, - DeviceNormalizationImpl, - DeviceNormalizationImpl, - DeviceNormalizationImpl, - DeviceNormalizationImpl, - DeviceNormalizationImpl, - DeviceNormalizationImpl - // clang-format on - >; - -template -using device_normalization_splitk_f16_f32_f32_f16_instances = std::tuple< - // clang-format off - // XDataType, GammaDataType, BetaDataType, ComputeDataType, YDataType, SaveMeanInvStdDataType, Rank, NumReduceDim, BlockSize, MThreadClusterSize, KThreadClusterSize, MThreadSliceSize, KThreadSliceSize, XYSrcVectorDim, XSrcVectorSize, GammaSrcVectorDim, GammaSrcVectorSize, BetaSrcVectorDim, BetaSrcVectorSize, YDstVectorSize, SaveMeanInvStdScalarPerVector> - DeviceNormalizationSplitKImpl, // irregular size - DeviceNormalizationSplitKImpl, // irregular size - DeviceNormalizationSplitKImpl, // irregular size - DeviceNormalizationSplitKImpl, // irregular size - DeviceNormalizationSplitKImpl, // irregular size - DeviceNormalizationSplitKImpl, - DeviceNormalizationSplitKImpl, - DeviceNormalizationSplitKImpl, - DeviceNormalizationSplitKImpl, - DeviceNormalizationSplitKImpl, - DeviceNormalizationSplitKImpl, - DeviceNormalizationSplitKImpl, - DeviceNormalizationSplitKImpl, - DeviceNormalizationSplitKImpl, - DeviceNormalizationSplitKImpl, - DeviceNormalizationSplitKImpl, - DeviceNormalizationSplitKImpl, - DeviceNormalizationSplitKImpl, - DeviceNormalizationSplitKImpl - // clang-format on - >; - -template -using device_normalization_f16_f32_f32_f16_generic_instance = std::tuple< - // clang-format off - DeviceNormalizationImpl - // clang-format on - >; - -} // namespace instance -} // namespace device -} // namespace tensor_operation -} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/normalization_fwd/CMakeLists.txt b/library/src/tensor_operation_instance/gpu/normalization_fwd/CMakeLists.txt new file mode 100644 index 0000000000..ce4c80943e --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/normalization_fwd/CMakeLists.txt @@ -0,0 +1,14 @@ +set(DEVICE_NORMALIZATION_FWD_INSTANCES) + +list(APPEND DEVICE_NORMALIZATION_FWD_INSTANCES + device_layernorm2d_fwd_f16_instance.cpp + device_layernorm4d_fwd_f16_instance.cpp + device_groupnorm_fwd_f16_instance.cpp + device_groupnorm_fwd_swish_f16_instance.cpp + device_groupnorm_fwd_swish_f16_f32_f32_f16_instance.cpp + device_layernorm2d_fwd_f32_instance.cpp + device_layernorm4d_fwd_f32_instance.cpp + device_groupnorm_fwd_f32_instance.cpp + device_groupnorm_fwd_swish_f32_instance.cpp) + +add_instance_library(device_normalization_fwd_instance ${DEVICE_NORMALIZATION_FWD_INSTANCES}) diff --git a/library/src/tensor_operation_instance/gpu/normalization/device_groupnorm_f16_instance.cpp b/library/src/tensor_operation_instance/gpu/normalization_fwd/device_groupnorm_fwd_f16_instance.cpp similarity index 78% rename from library/src/tensor_operation_instance/gpu/normalization/device_groupnorm_f16_instance.cpp rename to library/src/tensor_operation_instance/gpu/normalization_fwd/device_groupnorm_fwd_f16_instance.cpp index 439e724199..0f8bab973e 100644 --- a/library/src/tensor_operation_instance/gpu/normalization/device_groupnorm_f16_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/normalization_fwd/device_groupnorm_fwd_f16_instance.cpp @@ -1,7 +1,7 @@ // SPDX-License-Identifier: MIT // Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. -#include "normalization_instance_common.hpp" +#include "normalization_fwd_instance_common.hpp" namespace ck { namespace tensor_operation { @@ -10,8 +10,8 @@ namespace instance { using Pass = ck::tensor_operation::element_wise::PassThrough; -void add_device_normalization_rank_5_3_f16_instances( - std::vector>>& +void add_device_normalization_fwd_rank_5_3_f16_instances( + std::vector>>& instances) { add_device_operation_instances(instances, diff --git a/library/src/tensor_operation_instance/gpu/normalization/device_groupnorm_f32_instance.cpp b/library/src/tensor_operation_instance/gpu/normalization_fwd/device_groupnorm_fwd_f32_instance.cpp similarity index 78% rename from library/src/tensor_operation_instance/gpu/normalization/device_groupnorm_f32_instance.cpp rename to library/src/tensor_operation_instance/gpu/normalization_fwd/device_groupnorm_fwd_f32_instance.cpp index 44b553bd16..b5de03638f 100644 --- a/library/src/tensor_operation_instance/gpu/normalization/device_groupnorm_f32_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/normalization_fwd/device_groupnorm_fwd_f32_instance.cpp @@ -1,7 +1,7 @@ // SPDX-License-Identifier: MIT // Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. -#include "normalization_instance_common.hpp" +#include "normalization_fwd_instance_common.hpp" namespace ck { namespace tensor_operation { @@ -10,8 +10,8 @@ namespace instance { using Pass = ck::tensor_operation::element_wise::PassThrough; -void add_device_normalization_rank_5_3_f32_instances( - std::vector>>& +void add_device_normalization_fwd_rank_5_3_f32_instances( + std::vector>>& instances) { add_device_operation_instances(instances, diff --git a/library/src/tensor_operation_instance/gpu/normalization/device_groupnorm_swish_f16_f32_f32_f16_instance.cpp b/library/src/tensor_operation_instance/gpu/normalization_fwd/device_groupnorm_fwd_swish_f16_f32_f32_f16_instance.cpp similarity index 77% rename from library/src/tensor_operation_instance/gpu/normalization/device_groupnorm_swish_f16_f32_f32_f16_instance.cpp rename to library/src/tensor_operation_instance/gpu/normalization_fwd/device_groupnorm_fwd_swish_f16_f32_f32_f16_instance.cpp index 5f42d073ff..1d2ca7898d 100644 --- a/library/src/tensor_operation_instance/gpu/normalization/device_groupnorm_swish_f16_f32_f32_f16_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/normalization_fwd/device_groupnorm_fwd_swish_f16_f32_f32_f16_instance.cpp @@ -1,7 +1,7 @@ // SPDX-License-Identifier: MIT // Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. -#include "normalization_instance_common.hpp" +#include "normalization_fwd_instance_common.hpp" namespace ck { namespace tensor_operation { @@ -10,8 +10,8 @@ namespace instance { using Swish = ck::tensor_operation::element_wise::Swish; -void add_device_normalization_rank_5_3_swish_f16_f32_f32_f16_instances( - std::vector>>& +void add_device_normalization_fwd_rank_5_3_swish_f16_f32_f32_f16_instances( + std::vector>>& instances) { add_device_operation_instances( diff --git a/library/src/tensor_operation_instance/gpu/normalization/device_groupnorm_swish_f16_instance.cpp b/library/src/tensor_operation_instance/gpu/normalization_fwd/device_groupnorm_fwd_swish_f16_instance.cpp similarity index 78% rename from library/src/tensor_operation_instance/gpu/normalization/device_groupnorm_swish_f16_instance.cpp rename to library/src/tensor_operation_instance/gpu/normalization_fwd/device_groupnorm_fwd_swish_f16_instance.cpp index 63aea024da..9fbbab64e7 100644 --- a/library/src/tensor_operation_instance/gpu/normalization/device_groupnorm_swish_f16_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/normalization_fwd/device_groupnorm_fwd_swish_f16_instance.cpp @@ -1,7 +1,7 @@ // SPDX-License-Identifier: MIT // Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. -#include "normalization_instance_common.hpp" +#include "normalization_fwd_instance_common.hpp" namespace ck { namespace tensor_operation { @@ -10,8 +10,8 @@ namespace instance { using Swish = ck::tensor_operation::element_wise::Swish; -void add_device_normalization_rank_5_3_swish_f16_instances( - std::vector>>& +void add_device_normalization_fwd_rank_5_3_swish_f16_instances( + std::vector>>& instances) { add_device_operation_instances(instances, diff --git a/library/src/tensor_operation_instance/gpu/normalization/device_groupnorm_swish_f32_instance.cpp b/library/src/tensor_operation_instance/gpu/normalization_fwd/device_groupnorm_fwd_swish_f32_instance.cpp similarity index 78% rename from library/src/tensor_operation_instance/gpu/normalization/device_groupnorm_swish_f32_instance.cpp rename to library/src/tensor_operation_instance/gpu/normalization_fwd/device_groupnorm_fwd_swish_f32_instance.cpp index 4b2ab33570..0cd3e6d8bc 100644 --- a/library/src/tensor_operation_instance/gpu/normalization/device_groupnorm_swish_f32_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/normalization_fwd/device_groupnorm_fwd_swish_f32_instance.cpp @@ -1,7 +1,7 @@ // SPDX-License-Identifier: MIT // Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. -#include "normalization_instance_common.hpp" +#include "normalization_fwd_instance_common.hpp" namespace ck { namespace tensor_operation { @@ -10,8 +10,8 @@ namespace instance { using Swish = ck::tensor_operation::element_wise::Swish; -void add_device_normalization_rank_5_3_swish_f32_instances( - std::vector>>& +void add_device_normalization_fwd_rank_5_3_swish_f32_instances( + std::vector>>& instances) { add_device_operation_instances(instances, diff --git a/library/src/tensor_operation_instance/gpu/normalization/device_layernorm2d_f16_instance.cpp b/library/src/tensor_operation_instance/gpu/normalization_fwd/device_layernorm2d_fwd_f16_instance.cpp similarity index 78% rename from library/src/tensor_operation_instance/gpu/normalization/device_layernorm2d_f16_instance.cpp rename to library/src/tensor_operation_instance/gpu/normalization_fwd/device_layernorm2d_fwd_f16_instance.cpp index e15ff4b6d0..bfc2e465df 100644 --- a/library/src/tensor_operation_instance/gpu/normalization/device_layernorm2d_f16_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/normalization_fwd/device_layernorm2d_fwd_f16_instance.cpp @@ -1,7 +1,7 @@ // SPDX-License-Identifier: MIT // Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. -#include "normalization_instance_common.hpp" +#include "normalization_fwd_instance_common.hpp" namespace ck { namespace tensor_operation { @@ -10,8 +10,8 @@ namespace instance { using Pass = ck::tensor_operation::element_wise::PassThrough; -void add_device_normalization_rank_2_1_f16_instances( - std::vector>>& +void add_device_normalization_fwd_rank_2_1_f16_instances( + std::vector>>& instances) { add_device_operation_instances(instances, diff --git a/library/src/tensor_operation_instance/gpu/normalization/device_layernorm2d_f32_instance.cpp b/library/src/tensor_operation_instance/gpu/normalization_fwd/device_layernorm2d_fwd_f32_instance.cpp similarity index 78% rename from library/src/tensor_operation_instance/gpu/normalization/device_layernorm2d_f32_instance.cpp rename to library/src/tensor_operation_instance/gpu/normalization_fwd/device_layernorm2d_fwd_f32_instance.cpp index 00039531e1..fefcf8bec3 100644 --- a/library/src/tensor_operation_instance/gpu/normalization/device_layernorm2d_f32_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/normalization_fwd/device_layernorm2d_fwd_f32_instance.cpp @@ -1,7 +1,7 @@ // SPDX-License-Identifier: MIT // Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. -#include "normalization_instance_common.hpp" +#include "normalization_fwd_instance_common.hpp" namespace ck { namespace tensor_operation { @@ -10,8 +10,8 @@ namespace instance { using Pass = ck::tensor_operation::element_wise::PassThrough; -void add_device_normalization_rank_2_1_f32_instances( - std::vector>>& +void add_device_normalization_fwd_rank_2_1_f32_instances( + std::vector>>& instances) { add_device_operation_instances(instances, diff --git a/library/src/tensor_operation_instance/gpu/normalization/device_layernorm4d_f16_instance.cpp b/library/src/tensor_operation_instance/gpu/normalization_fwd/device_layernorm4d_fwd_f16_instance.cpp similarity index 78% rename from library/src/tensor_operation_instance/gpu/normalization/device_layernorm4d_f16_instance.cpp rename to library/src/tensor_operation_instance/gpu/normalization_fwd/device_layernorm4d_fwd_f16_instance.cpp index 4152c6ebbf..690489bbad 100644 --- a/library/src/tensor_operation_instance/gpu/normalization/device_layernorm4d_f16_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/normalization_fwd/device_layernorm4d_fwd_f16_instance.cpp @@ -1,7 +1,7 @@ // SPDX-License-Identifier: MIT // Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. -#include "normalization_instance_common.hpp" +#include "normalization_fwd_instance_common.hpp" namespace ck { namespace tensor_operation { @@ -10,8 +10,8 @@ namespace instance { using Pass = ck::tensor_operation::element_wise::PassThrough; -void add_device_normalization_rank_4_3_f16_instances( - std::vector>>& +void add_device_normalization_fwd_rank_4_3_f16_instances( + std::vector>>& instances) { add_device_operation_instances(instances, diff --git a/library/src/tensor_operation_instance/gpu/normalization/device_layernorm4d_f32_instance.cpp b/library/src/tensor_operation_instance/gpu/normalization_fwd/device_layernorm4d_fwd_f32_instance.cpp similarity index 78% rename from library/src/tensor_operation_instance/gpu/normalization/device_layernorm4d_f32_instance.cpp rename to library/src/tensor_operation_instance/gpu/normalization_fwd/device_layernorm4d_fwd_f32_instance.cpp index b387dc2f3f..2df2ff9415 100644 --- a/library/src/tensor_operation_instance/gpu/normalization/device_layernorm4d_f32_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/normalization_fwd/device_layernorm4d_fwd_f32_instance.cpp @@ -1,7 +1,7 @@ // SPDX-License-Identifier: MIT // Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. -#include "normalization_instance_common.hpp" +#include "normalization_fwd_instance_common.hpp" namespace ck { namespace tensor_operation { @@ -10,8 +10,8 @@ namespace instance { using Pass = ck::tensor_operation::element_wise::PassThrough; -void add_device_normalization_rank_4_3_f32_instances( - std::vector>>& +void add_device_normalization_fwd_rank_4_3_f32_instances( + std::vector>>& instances) { add_device_operation_instances(instances, diff --git a/library/src/tensor_operation_instance/gpu/normalization_fwd/normalization_fwd_instance_common.hpp b/library/src/tensor_operation_instance/gpu/normalization_fwd/normalization_fwd_instance_common.hpp new file mode 100644 index 0000000000..60a55dd6e1 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/normalization_fwd/normalization_fwd_instance_common.hpp @@ -0,0 +1,201 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +#include "ck/ck.hpp" +#include "ck/tensor_operation/gpu/device/impl/device_normalization_fwd_impl.hpp" +#include "ck/tensor_operation/gpu/device/impl/device_normalization_fwd_splitk_impl.hpp" +#include "ck/utility/data_type.hpp" + +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +using F16 = ck::half_t; +using F32 = float; + +template +using device_normalization_f16_instances = + // clang-format off + std::tuple < + // XDataType, GammaDataType, BetaDataType, ComputeDataType, YDataType, SaveMeanInvStdDataType, Rank, NumReduceDim, BlockSize, MThreadClusterSize, KThreadClusterSize, MThreadSliceSize, KThreadSliceSize, XYSrcVectorDim, XSrcVectorSize, GammaSrcVectorDim, GammaSrcVectorSize, BetaSrcVectorDim, BetaSrcVectorSize, YDstVectorSize, SaveMeanInvStdScalarPerVector> + DeviceNormalizationFwdImpl, // irregular size + DeviceNormalizationFwdImpl, // irregular size + DeviceNormalizationFwdImpl, // irregular size + DeviceNormalizationFwdImpl, // irregular size + DeviceNormalizationFwdImpl, // irregular size + DeviceNormalizationFwdImpl, // irregular size + DeviceNormalizationFwdImpl, + DeviceNormalizationFwdImpl, + DeviceNormalizationFwdImpl, + DeviceNormalizationFwdImpl, + DeviceNormalizationFwdImpl, + DeviceNormalizationFwdImpl, + DeviceNormalizationFwdImpl, + DeviceNormalizationFwdImpl, + DeviceNormalizationFwdImpl, + DeviceNormalizationFwdImpl, + DeviceNormalizationFwdImpl, + DeviceNormalizationFwdImpl + // clang-format on + >; + +template +using device_normalization_splitk_f16_instances = + // clang-format off + std::tuple < + // XDataType, GammaDataType, BetaDataType, ComputeDataType, YDataType, SaveMeanInvStdDataType, Rank, NumReduceDim, BlockSize, MThreadClusterSize, KThreadClusterSize, MThreadSliceSize, KThreadSliceSize, XYSrcVectorDim, XSrcVectorSize, GammaSrcVectorDim, GammaSrcVectorSize, BetaSrcVectorDim, BetaSrcVectorSize, YDstVectorSize, SaveMeanInvStdScalarPerVector> + DeviceNormalizationFwdSplitKImpl, // irregular size + DeviceNormalizationFwdSplitKImpl, // irregular size + DeviceNormalizationFwdSplitKImpl, // irregular size + DeviceNormalizationFwdSplitKImpl, // irregular size + DeviceNormalizationFwdSplitKImpl, // irregular size + DeviceNormalizationFwdSplitKImpl, // irregular size + DeviceNormalizationFwdSplitKImpl, + DeviceNormalizationFwdSplitKImpl, + DeviceNormalizationFwdSplitKImpl, + DeviceNormalizationFwdSplitKImpl, + DeviceNormalizationFwdSplitKImpl, + DeviceNormalizationFwdSplitKImpl, + DeviceNormalizationFwdSplitKImpl, + DeviceNormalizationFwdSplitKImpl, + DeviceNormalizationFwdSplitKImpl, + DeviceNormalizationFwdSplitKImpl, + DeviceNormalizationFwdSplitKImpl, + DeviceNormalizationFwdSplitKImpl + // clang-format on + >; + +template +using device_normalization_f16_generic_instance = std::tuple< + // clang-format off + DeviceNormalizationFwdImpl + // clang-format on + >; + +template +using device_normalization_f32_instances = std::tuple< + // clang-format off + // XDataType, GammaDataType, BetaDataType, ComputeDataType, YDataType, SaveMeanInvStdDataType, Rank, NumReduceDim, BlockSize, MThreadClusterSize, KThreadClusterSize, MThreadSliceSize, KThreadSliceSize, XYSrcVectorDim, XSrcVectorSize, GammaSrcVectorDim, GammaSrcVectorSize, BetaSrcVectorDim, BetaSrcVectorSize, YDstVectorSize, SaveMeanInvStdScalarPerVector> + DeviceNormalizationFwdImpl, // irregular size + DeviceNormalizationFwdImpl, // irregular size + DeviceNormalizationFwdImpl, // irregular size + DeviceNormalizationFwdImpl, // irregular size + DeviceNormalizationFwdImpl, // irregular size + DeviceNormalizationFwdImpl, + DeviceNormalizationFwdImpl, + DeviceNormalizationFwdImpl, + DeviceNormalizationFwdImpl, + DeviceNormalizationFwdImpl, + DeviceNormalizationFwdImpl, + DeviceNormalizationFwdImpl, + DeviceNormalizationFwdImpl, + DeviceNormalizationFwdImpl, + DeviceNormalizationFwdImpl, + DeviceNormalizationFwdImpl, + DeviceNormalizationFwdImpl, + DeviceNormalizationFwdImpl, + DeviceNormalizationFwdImpl + // clang-format on + >; + +template +using device_normalization_splitk_f32_instances = std::tuple< + // clang-format off + // XDataType, GammaDataType, BetaDataType, ComputeDataType, YDataType, SaveMeanInvStdDataType, Rank, NumReduceDim, BlockSize, MThreadClusterSize, KThreadClusterSize, MThreadSliceSize, KThreadSliceSize, XYSrcVectorDim, XSrcVectorSize, GammaSrcVectorDim, GammaSrcVectorSize, BetaSrcVectorDim, BetaSrcVectorSize, YDstVectorSize, SaveMeanInvStdScalarPerVector> + DeviceNormalizationFwdSplitKImpl, // irregular size + DeviceNormalizationFwdSplitKImpl, // irregular size + DeviceNormalizationFwdSplitKImpl, // irregular size + DeviceNormalizationFwdSplitKImpl, // irregular size + DeviceNormalizationFwdSplitKImpl, // irregular size + DeviceNormalizationFwdSplitKImpl, + DeviceNormalizationFwdSplitKImpl, + DeviceNormalizationFwdSplitKImpl, + DeviceNormalizationFwdSplitKImpl, + DeviceNormalizationFwdSplitKImpl, + DeviceNormalizationFwdSplitKImpl, + DeviceNormalizationFwdSplitKImpl, + DeviceNormalizationFwdSplitKImpl, + DeviceNormalizationFwdSplitKImpl, + DeviceNormalizationFwdSplitKImpl, + DeviceNormalizationFwdSplitKImpl, + DeviceNormalizationFwdSplitKImpl, + DeviceNormalizationFwdSplitKImpl, + DeviceNormalizationFwdSplitKImpl + // clang-format on + >; + +template +using device_normalization_f32_generic_instance = std::tuple< + // clang-format off + DeviceNormalizationFwdImpl + // clang-format on + >; + +template +using device_normalization_f16_f32_f32_f16_instances = std::tuple< + // clang-format off + // XDataType, GammaDataType, BetaDataType, ComputeDataType, YDataType, SaveMeanInvStdDataType, Rank, NumReduceDim, BlockSize, MThreadClusterSize, KThreadClusterSize, MThreadSliceSize, KThreadSliceSize, XYSrcVectorDim, XSrcVectorSize, GammaSrcVectorDim, GammaSrcVectorSize, BetaSrcVectorDim, BetaSrcVectorSize, YDstVectorSize, SaveMeanInvStdScalarPerVector> + DeviceNormalizationFwdImpl, // irregular size + DeviceNormalizationFwdImpl, // irregular size + DeviceNormalizationFwdImpl, // irregular size + DeviceNormalizationFwdImpl, // irregular size + DeviceNormalizationFwdImpl, // irregular size + DeviceNormalizationFwdImpl, + DeviceNormalizationFwdImpl, + DeviceNormalizationFwdImpl, + DeviceNormalizationFwdImpl, + DeviceNormalizationFwdImpl, + DeviceNormalizationFwdImpl, + DeviceNormalizationFwdImpl, + DeviceNormalizationFwdImpl, + DeviceNormalizationFwdImpl, + DeviceNormalizationFwdImpl, + DeviceNormalizationFwdImpl, + DeviceNormalizationFwdImpl, + DeviceNormalizationFwdImpl, + DeviceNormalizationFwdImpl + // clang-format on + >; + +template +using device_normalization_splitk_f16_f32_f32_f16_instances = std::tuple< + // clang-format off + // XDataType, GammaDataType, BetaDataType, ComputeDataType, YDataType, SaveMeanInvStdDataType, Rank, NumReduceDim, BlockSize, MThreadClusterSize, KThreadClusterSize, MThreadSliceSize, KThreadSliceSize, XYSrcVectorDim, XSrcVectorSize, GammaSrcVectorDim, GammaSrcVectorSize, BetaSrcVectorDim, BetaSrcVectorSize, YDstVectorSize, SaveMeanInvStdScalarPerVector> + DeviceNormalizationFwdSplitKImpl, // irregular size + DeviceNormalizationFwdSplitKImpl, // irregular size + DeviceNormalizationFwdSplitKImpl, // irregular size + DeviceNormalizationFwdSplitKImpl, // irregular size + DeviceNormalizationFwdSplitKImpl, // irregular size + DeviceNormalizationFwdSplitKImpl, + DeviceNormalizationFwdSplitKImpl, + DeviceNormalizationFwdSplitKImpl, + DeviceNormalizationFwdSplitKImpl, + DeviceNormalizationFwdSplitKImpl, + DeviceNormalizationFwdSplitKImpl, + DeviceNormalizationFwdSplitKImpl, + DeviceNormalizationFwdSplitKImpl, + DeviceNormalizationFwdSplitKImpl, + DeviceNormalizationFwdSplitKImpl, + DeviceNormalizationFwdSplitKImpl, + DeviceNormalizationFwdSplitKImpl, + DeviceNormalizationFwdSplitKImpl, + DeviceNormalizationFwdSplitKImpl + // clang-format on + >; + +template +using device_normalization_f16_f32_f32_f16_generic_instance = std::tuple< + // clang-format off + DeviceNormalizationFwdImpl + // clang-format on + >; + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/profiler/include/profiler/profile_groupnorm_impl.hpp b/profiler/include/profiler/profile_groupnorm_fwd_impl.hpp similarity index 98% rename from profiler/include/profiler/profile_groupnorm_impl.hpp rename to profiler/include/profiler/profile_groupnorm_fwd_impl.hpp index 4715853d2a..d0a5032bff 100644 --- a/profiler/include/profiler/profile_groupnorm_impl.hpp +++ b/profiler/include/profiler/profile_groupnorm_fwd_impl.hpp @@ -7,7 +7,7 @@ #include "ck/ck.hpp" -#include "ck/library/tensor_operation_instance/gpu/normalization.hpp" +#include "ck/library/tensor_operation_instance/gpu/normalization_fwd.hpp" #include "ck/library/utility/check_err.hpp" #include "ck/library/utility/device_memory.hpp" @@ -88,14 +88,14 @@ bool profile_groupnorm_impl(int do_verification, beta_dev.ToDevice(beta.mData.data()); // add device normalization instances - using DeviceOp = ck::tensor_operation::device::DeviceNormalization; + using DeviceOp = ck::tensor_operation::device::DeviceNormalizationFwd; // get device op instances const auto instance_ptrs = diff --git a/profiler/include/profiler/profile_layernorm_impl.hpp b/profiler/include/profiler/profile_layernorm_fwd_impl.hpp similarity index 98% rename from profiler/include/profiler/profile_layernorm_impl.hpp rename to profiler/include/profiler/profile_layernorm_fwd_impl.hpp index 7c214af019..66272b6eff 100644 --- a/profiler/include/profiler/profile_layernorm_impl.hpp +++ b/profiler/include/profiler/profile_layernorm_fwd_impl.hpp @@ -6,7 +6,7 @@ #include #include "ck/ck.hpp" -#include "ck/library/tensor_operation_instance/gpu/normalization.hpp" +#include "ck/library/tensor_operation_instance/gpu/normalization_fwd.hpp" #include "ck/library/utility/check_err.hpp" #include "ck/library/utility/device_memory.hpp" #include "ck/library/utility/host_tensor.hpp" @@ -94,14 +94,14 @@ bool profile_layernorm_impl(int do_verification, constexpr int NumReduceDim = Rank - 1; // add device normalization instances - using DeviceOp = ck::tensor_operation::device::DeviceNormalization; + using DeviceOp = ck::tensor_operation::device::DeviceNormalizationFwd; // get device op instances const auto instance_ptrs = diff --git a/profiler/src/CMakeLists.txt b/profiler/src/CMakeLists.txt index 61b890de55..0af3107157 100644 --- a/profiler/src/CMakeLists.txt +++ b/profiler/src/CMakeLists.txt @@ -16,8 +16,8 @@ set(PROFILER_SOURCES profile_grouped_conv_fwd.cpp profile_grouped_conv_bwd_weight.cpp profile_reduce.cpp - profile_groupnorm.cpp - profile_layernorm.cpp + profile_groupnorm_fwd.cpp + profile_layernorm_fwd.cpp profile_max_pool3d_fwd.cpp profile_avg_pool3d_bwd.cpp profile_max_pool3d_bwd.cpp @@ -77,7 +77,7 @@ target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_grouped_conv2d_bwd_w target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_grouped_conv3d_bwd_weight_instance) target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_conv2d_fwd_bias_relu_instance) target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_conv2d_fwd_bias_relu_add_instance) -target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_normalization_instance) +target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_normalization_fwd_instance) target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_softmax_instance) target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_reduce_instance) target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_batchnorm_instance) diff --git a/profiler/src/profile_groupnorm.cpp b/profiler/src/profile_groupnorm_fwd.cpp similarity index 98% rename from profiler/src/profile_groupnorm.cpp rename to profiler/src/profile_groupnorm_fwd.cpp index 079f6f0db7..3ba2f751cc 100644 --- a/profiler/src/profile_groupnorm.cpp +++ b/profiler/src/profile_groupnorm_fwd.cpp @@ -6,7 +6,7 @@ #include #include "profiler/data_type_enum.hpp" -#include "profiler/profile_groupnorm_impl.hpp" +#include "profiler/profile_groupnorm_fwd_impl.hpp" #include "profiler_operation_registry.hpp" using ck::index_t; diff --git a/profiler/src/profile_layernorm.cpp b/profiler/src/profile_layernorm_fwd.cpp similarity index 63% rename from profiler/src/profile_layernorm.cpp rename to profiler/src/profile_layernorm_fwd.cpp index fdeaa036b2..9bd66e0cb8 100644 --- a/profiler/src/profile_layernorm.cpp +++ b/profiler/src/profile_layernorm_fwd.cpp @@ -6,7 +6,7 @@ #include #include "profiler/data_type_enum.hpp" -#include "profiler/profile_layernorm_impl.hpp" +#include "profiler/profile_layernorm_fwd_impl.hpp" #include "profiler_operation_registry.hpp" using ck::index_t; @@ -76,19 +76,46 @@ int profile_layernorm(int argc, char* argv[]) arg_parser(argc, argv); const std::vector length = arg_parser.long_opts["length"]; - using F16 = ck::half_t; - using F32 = float; - constexpr int rank = 2; + using F16 = ck::half_t; + using F32 = float; - if(data_type == ck::DataTypeEnum::Half) + if(length.size() == 2) { - ck::profiler::profile_layernorm_impl( - do_verification, init_method, do_log, time_kernel, length); + constexpr int rank = 2; + + if(data_type == ck::DataTypeEnum::Half) + { + ck::profiler::profile_layernorm_impl( + do_verification, init_method, do_log, time_kernel, length); + } + else if(data_type == ck::DataTypeEnum::Float) + { + ck::profiler::profile_layernorm_impl( + do_verification, init_method, do_log, time_kernel, length); + } + else + { + throw std::runtime_error("not implemented yet"); + } } - else if(data_type == ck::DataTypeEnum::Float) + else if(length.size() == 4) { - ck::profiler::profile_layernorm_impl( - do_verification, init_method, do_log, time_kernel, length); + constexpr int rank = 4; + + if(data_type == ck::DataTypeEnum::Half) + { + ck::profiler::profile_layernorm_impl( + do_verification, init_method, do_log, time_kernel, length); + } + else if(data_type == ck::DataTypeEnum::Float) + { + ck::profiler::profile_layernorm_impl( + do_verification, init_method, do_log, time_kernel, length); + } + else + { + throw std::runtime_error("not implemented yet"); + } } else { diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index 1567d8bc69..5ac04837d9 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -139,7 +139,7 @@ add_subdirectory(grouped_convnd_fwd) add_subdirectory(grouped_convnd_bwd_weight) add_subdirectory(block_to_ctile_map) add_subdirectory(softmax) -add_subdirectory(normalization) +add_subdirectory(normalization_fwd) add_subdirectory(data_type) add_subdirectory(elementwise_normalization) add_subdirectory(batchnorm) diff --git a/test/normalization/CMakeLists.txt b/test/normalization/CMakeLists.txt deleted file mode 100644 index fe4b1ad82e..0000000000 --- a/test/normalization/CMakeLists.txt +++ /dev/null @@ -1,21 +0,0 @@ -add_custom_target(test_normalization) -add_gtest_executable(test_layernorm2d_fp32 test_layernorm2d_fp32.cpp) -if(result EQUAL 0) - target_link_libraries(test_layernorm2d_fp32 PRIVATE utility device_normalization_instance) - add_dependencies(test_normalization test_layernorm2d_fp32) -endif() -add_gtest_executable(test_groupnorm_fp32 test_groupnorm_fp32.cpp) -if(result EQUAL 0) - target_link_libraries(test_groupnorm_fp32 PRIVATE utility device_normalization_instance) - add_dependencies(test_normalization test_groupnorm_fp32) -endif() -add_gtest_executable(test_layernorm2d_fp16 test_layernorm2d_fp16.cpp) -if(result EQUAL 0) - target_link_libraries(test_layernorm2d_fp16 PRIVATE utility device_normalization_instance) - add_dependencies(test_normalization test_layernorm2d_fp16) -endif() -add_gtest_executable(test_groupnorm_fp16 test_groupnorm_fp16.cpp) -if(result EQUAL 0) - target_link_libraries(test_groupnorm_fp16 PRIVATE utility device_normalization_instance) - add_dependencies(test_normalization test_groupnorm_fp16) -endif() diff --git a/test/normalization_fwd/CMakeLists.txt b/test/normalization_fwd/CMakeLists.txt new file mode 100644 index 0000000000..c309149deb --- /dev/null +++ b/test/normalization_fwd/CMakeLists.txt @@ -0,0 +1,30 @@ +add_custom_target(test_normalization_fwd) +add_gtest_executable(test_layernorm2d_fwd_fp32 test_layernorm2d_fwd_fp32.cpp) +if(result EQUAL 0) + target_link_libraries(test_layernorm2d_fwd_fp32 PRIVATE utility device_normalization_fwd_instance) + add_dependencies(test_normalization_fwd test_layernorm2d_fwd_fp32) +endif() + +add_gtest_executable(test_groupnorm_fwd_fp32 test_groupnorm_fwd_fp32.cpp) +if(result EQUAL 0) + target_link_libraries(test_groupnorm_fwd_fp32 PRIVATE utility device_normalization_fwd_instance) + add_dependencies(test_normalization_fwd test_groupnorm_fwd_fp32) +endif() + +add_gtest_executable(test_layernorm2d_fwd_fp16 test_layernorm2d_fwd_fp16.cpp) +if(result EQUAL 0) + target_link_libraries(test_layernorm2d_fwd_fp16 PRIVATE utility device_normalization_fwd_instance) + add_dependencies(test_normalization_fwd test_layernorm2d_fwd_fp16) +endif() + +add_gtest_executable(test_layernorm4d_fwd_fp16 test_layernorm4d_fwd_fp16.cpp) +if(result EQUAL 0) + target_link_libraries(test_layernorm4d_fwd_fp16 PRIVATE utility device_normalization_fwd_instance) + add_dependencies(test_normalization_fwd test_layernorm4d_fwd_fp16) +endif() + +add_gtest_executable(test_groupnorm_fwd_fp16 test_groupnorm_fwd_fp16.cpp) +if(result EQUAL 0) + target_link_libraries(test_groupnorm_fwd_fp16 PRIVATE utility device_normalization_fwd_instance) + add_dependencies(test_normalization_fwd test_groupnorm_fwd_fp16) +endif() diff --git a/test/normalization/test_groupnorm_fp16.cpp b/test/normalization_fwd/test_groupnorm_fwd_fp16.cpp similarity index 97% rename from test/normalization/test_groupnorm_fp16.cpp rename to test/normalization_fwd/test_groupnorm_fwd_fp16.cpp index 67387ad40b..143c725257 100644 --- a/test/normalization/test_groupnorm_fp16.cpp +++ b/test/normalization_fwd/test_groupnorm_fwd_fp16.cpp @@ -2,7 +2,7 @@ // Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. #include "gtest/gtest.h" -#include "profiler/profile_groupnorm_impl.hpp" +#include "profiler/profile_groupnorm_fwd_impl.hpp" using F16 = ck::half_t; using F32 = float; diff --git a/test/normalization/test_groupnorm_fp32.cpp b/test/normalization_fwd/test_groupnorm_fwd_fp32.cpp similarity index 97% rename from test/normalization/test_groupnorm_fp32.cpp rename to test/normalization_fwd/test_groupnorm_fwd_fp32.cpp index 136079f104..84a833c793 100644 --- a/test/normalization/test_groupnorm_fp32.cpp +++ b/test/normalization_fwd/test_groupnorm_fwd_fp32.cpp @@ -2,7 +2,7 @@ // Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. #include "gtest/gtest.h" -#include "profiler/profile_groupnorm_impl.hpp" +#include "profiler/profile_groupnorm_fwd_impl.hpp" using F16 = ck::half_t; using F32 = float; diff --git a/test/normalization/test_layernorm2d_fp16.cpp b/test/normalization_fwd/test_layernorm2d_fwd_fp16.cpp similarity index 97% rename from test/normalization/test_layernorm2d_fp16.cpp rename to test/normalization_fwd/test_layernorm2d_fwd_fp16.cpp index 54bab25257..cc49ebe0ae 100644 --- a/test/normalization/test_layernorm2d_fp16.cpp +++ b/test/normalization_fwd/test_layernorm2d_fwd_fp16.cpp @@ -2,7 +2,7 @@ // Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. #include "gtest/gtest.h" -#include "profiler/profile_layernorm_impl.hpp" +#include "profiler/profile_layernorm_fwd_impl.hpp" using F16 = ck::half_t; using F32 = float; diff --git a/test/normalization/test_layernorm2d_fp32.cpp b/test/normalization_fwd/test_layernorm2d_fwd_fp32.cpp similarity index 97% rename from test/normalization/test_layernorm2d_fp32.cpp rename to test/normalization_fwd/test_layernorm2d_fwd_fp32.cpp index ee9646a4d5..b46715d96a 100644 --- a/test/normalization/test_layernorm2d_fp32.cpp +++ b/test/normalization_fwd/test_layernorm2d_fwd_fp32.cpp @@ -2,7 +2,7 @@ // Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. #include "gtest/gtest.h" -#include "profiler/profile_layernorm_impl.hpp" +#include "profiler/profile_layernorm_fwd_impl.hpp" using F16 = ck::half_t; using F32 = float; diff --git a/test/normalization_fwd/test_layernorm4d_fwd_fp16.cpp b/test/normalization_fwd/test_layernorm4d_fwd_fp16.cpp new file mode 100644 index 0000000000..a3bd388f7f --- /dev/null +++ b/test/normalization_fwd/test_layernorm4d_fwd_fp16.cpp @@ -0,0 +1,48 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. + +#include "gtest/gtest.h" +#include "profiler/profile_layernorm_fwd_impl.hpp" + +using F16 = ck::half_t; +using F32 = float; +using ck::index_t; + +template +class TestLayernorm4d : public ::testing::Test +{ + protected: + using XDataType = std::tuple_element_t<0, Tuple>; + using GammaDataType = std::tuple_element_t<1, Tuple>; + using BetaDataType = std::tuple_element_t<2, Tuple>; + using ComputeDataType = std::tuple_element_t<3, Tuple>; + using YDataType = std::tuple_element_t<4, Tuple>; + using SaveMeanInvStdDataType = std::tuple_element_t<5, Tuple>; + + void Run() + { + // [N, D], reduce D + std::vector> lengths = { + {1, 1, 1, 1}, {7, 7, 7, 7}, {256, 16, 16, 8}}; + + for(auto length : lengths) + { + bool success = ck::profiler::profile_layernorm_impl(true, 2, false, false, length); + EXPECT_TRUE(success); + } + } +}; + +using KernelTypes = ::testing::Types< + // XDataType, GammaDataType, BetaDataType, ComputeDataType, YDataType> + std::tuple>; + +TYPED_TEST_SUITE(TestLayernorm4d, KernelTypes); +TYPED_TEST(TestLayernorm4d, Test_FP16) { this->Run(); }