From 218a6a62d2393e0cf8ea24d614a1c7aea80c836e Mon Sep 17 00:00:00 2001 From: rocking Date: Thu, 25 Jan 2024 19:53:15 +0800 Subject: [PATCH] layernorm & groupnorm bwd gamma beta (#1133) * Add layernorm bwd gamma beta external api * Add groupnorm external api * Add layernorm bwd gamma beta profiler * Add groupnorm bwd gamma beta ckProfiler * Add layernorm & groupnorm bwd gamma beta test * Fix groupnorm bwd gamma beta profiler bug * Layernorm bwd weight client example * Groupnorm bwd weight client example * clang format * Remove useless header * Let inv_std be positive * Rename to num_bytes and move this calculation outside the loop [ROCm/composable_kernel commit: 28f68a5a99a689f11f351aa3d496505d10491d0c] --- client_example/05_layernorm/CMakeLists.txt | 3 + .../layernorm2d_bwd_gamma_beta.cpp | 171 ++++++++++++ client_example/18_groupnorm/CMakeLists.txt | 3 + .../18_groupnorm/groupnorm_bwd_gamma_beta.cpp | 180 ++++++++++++ .../gpu/groupnorm_bwd_gamma_beta.hpp | 64 +++++ .../gpu/layernorm_bwd_gamma_beta.hpp | 83 ++++++ ...ayernorm2d_bwd_gamma_beta_f16_instance.cpp | 2 +- ...ayernorm2d_bwd_gamma_beta_f32_instance.cpp | 2 +- .../profile_groupnorm_bwd_gamma_beta_impl.hpp | 261 +++++++++++++++++ .../profile_layernorm_bwd_gamma_beta_impl.hpp | 263 ++++++++++++++++++ profiler/src/CMakeLists.txt | 3 + .../src/profile_groupnorm_bwd_gamma_beta.cpp | 104 +++++++ .../src/profile_layernorm_bwd_gamma_beta.cpp | 112 ++++++++ test/CMakeLists.txt | 1 + .../CMakeLists.txt | 13 + .../test_groupnorm_bwd_gamma_beta_fp32.cpp | 51 ++++ .../test_layernorm2d_bwd_gamma_beta_fp32.cpp | 48 ++++ 17 files changed, 1362 insertions(+), 2 deletions(-) create mode 100644 client_example/05_layernorm/layernorm2d_bwd_gamma_beta.cpp create mode 100644 client_example/18_groupnorm/groupnorm_bwd_gamma_beta.cpp create mode 100644 library/include/ck/library/tensor_operation_instance/gpu/groupnorm_bwd_gamma_beta.hpp create mode 100644 library/include/ck/library/tensor_operation_instance/gpu/layernorm_bwd_gamma_beta.hpp create mode 100644 profiler/include/profiler/profile_groupnorm_bwd_gamma_beta_impl.hpp create mode 100644 profiler/include/profiler/profile_layernorm_bwd_gamma_beta_impl.hpp create mode 100644 profiler/src/profile_groupnorm_bwd_gamma_beta.cpp create mode 100644 profiler/src/profile_layernorm_bwd_gamma_beta.cpp create mode 100644 test/normalization_bwd_gamma_beta/CMakeLists.txt create mode 100644 test/normalization_bwd_gamma_beta/test_groupnorm_bwd_gamma_beta_fp32.cpp create mode 100644 test/normalization_bwd_gamma_beta/test_layernorm2d_bwd_gamma_beta_fp32.cpp diff --git a/client_example/05_layernorm/CMakeLists.txt b/client_example/05_layernorm/CMakeLists.txt index 246f877cde..b7b3c830ed 100644 --- a/client_example/05_layernorm/CMakeLists.txt +++ b/client_example/05_layernorm/CMakeLists.txt @@ -1,6 +1,9 @@ add_executable(client_layernorm2d_bwd_data layernorm2d_bwd_data.cpp) target_link_libraries(client_layernorm2d_bwd_data PRIVATE composable_kernel::device_other_operations) +add_executable(client_layernorm2d_bwd_gamma_beta layernorm2d_bwd_gamma_beta.cpp) +target_link_libraries(client_layernorm2d_bwd_gamma_beta PRIVATE composable_kernel::device_other_operations) + add_executable(client_layernorm2d_fwd layernorm2d_fwd.cpp) target_link_libraries(client_layernorm2d_fwd PRIVATE composable_kernel::device_other_operations) diff --git a/client_example/05_layernorm/layernorm2d_bwd_gamma_beta.cpp b/client_example/05_layernorm/layernorm2d_bwd_gamma_beta.cpp new file mode 100644 index 0000000000..98b394add6 --- /dev/null +++ b/client_example/05_layernorm/layernorm2d_bwd_gamma_beta.cpp @@ -0,0 +1,171 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. + +#include +#include +#include + +#include "ck/ck.hpp" +#include "ck/tensor_operation/gpu/device/device_normalization_bwd_gamma_beta.hpp" + +#include "ck/library/tensor_operation_instance/gpu/layernorm_bwd_gamma_beta.hpp" + +using DYDataType = float; +using XDataType = float; +using GammaDataType = float; +using MeanInvStdDataType = float; +using DGammaDataType = float; +using DBetaDataType = float; + +constexpr int Rank = 2; +constexpr int NumReduceDim = 1; + +struct SimpleDeviceMem +{ + SimpleDeviceMem() = delete; + + SimpleDeviceMem(std::size_t mem_size) : p_mem_{} + { + (void)hipMalloc(static_cast(&p_mem_), mem_size); + } + + void* GetDeviceBuffer() { return p_mem_; } + + ~SimpleDeviceMem() { (void)hipFree(p_mem_); } + + void* p_mem_; +}; + +int main(int argc, char* argv[]) +{ + ck::index_t M = 1024; + ck::index_t N = 1024; + + SimpleDeviceMem dy_dev(sizeof(DYDataType) * M * N); + SimpleDeviceMem x_dev(sizeof(XDataType) * M * N); + SimpleDeviceMem mean_dev(sizeof(MeanInvStdDataType) * M); + SimpleDeviceMem inv_std_dev(sizeof(MeanInvStdDataType) * M); + SimpleDeviceMem dgamma_dev(sizeof(DGammaDataType) * N); + SimpleDeviceMem dbeta_dev(sizeof(DBetaDataType) * N); + + using DeviceOp = + ck::tensor_operation::device::DeviceNormalizationBwdGammaBeta; + + // get device op instances + const auto op_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory< + DeviceOp>::GetInstances(); + + std::cout << "found " << op_ptrs.size() << " instances" << std::endl; + + std::string best_op_name; + bool found = false; + int best_op_id = -1; + float best_ave_time = std::numeric_limits::max(); + float best_gb_per_sec = 0; + + // profile device operation instances + std::cout << "Run all instances and do timing" << std::endl; + + std::size_t num_bytes = sizeof(DYDataType) * M * N + sizeof(XDataType) * M * N + + sizeof(MeanInvStdDataType) * M * 2 + sizeof(DGammaDataType) * N + + sizeof(DBetaDataType) * N; + + for(int i = 0; i < op_ptrs.size(); ++i) + { + auto& op_ptr = op_ptrs[i]; + + auto argument_ptr = op_ptr->MakeArgumentPointer({M, N}, // inLengths + {N, 1}, // dyStrides + {N, 1}, // xStrides + {1, 0}, // meanStrides + {1, 0}, // invStdStrides + {N}, // outLengths + {1}, // dgammaStrides + {1}, // dbetaStrides + {0}, // reduceDims + dy_dev.GetDeviceBuffer(), + x_dev.GetDeviceBuffer(), + mean_dev.GetDeviceBuffer(), + inv_std_dev.GetDeviceBuffer(), + dgamma_dev.GetDeviceBuffer(), + dbeta_dev.GetDeviceBuffer()); + + auto invoker_ptr = op_ptr->MakeInvokerPointer(); + + std::string op_name = op_ptr->GetTypeString(); + + if(op_ptr->IsSupportedArgument(argument_ptr.get())) + { + size_t workspace_sz = op_ptr->GetWorkSpaceSize(argument_ptr.get()); + SimpleDeviceMem workspace(workspace_sz); + op_ptr->SetWorkSpacePointer(argument_ptr.get(), workspace.GetDeviceBuffer()); + + float ave_time = invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, true}); + float gb_per_sec = num_bytes / 1.E6 / ave_time; + + std::cout << "Perf: " << std::setw(10) << ave_time << " ms, " << gb_per_sec << " GB/s, " + << op_name << std::endl; + + if(ave_time < best_ave_time) + { + found = true; + best_op_id = i; + best_op_name = op_name; + best_ave_time = ave_time; + best_gb_per_sec = gb_per_sec; + } + } + else + { + std::cout << op_name << " does not support this problem" << std::endl; + } + } + + std::cout << "Best Perf: " << best_ave_time << " ms, " << best_gb_per_sec << " GB/s, " + << best_op_name << std::endl; + + // run the best intance + if(found) + { + auto& op_ptr = op_ptrs[best_op_id]; + std::cout << "Run the best instance without timing: " << op_ptr->GetTypeString() + << std::endl; + + auto argument_ptr = op_ptr->MakeArgumentPointer({M, N}, // inLengths + {N, 1}, // dyStrides + {N, 1}, // xStrides + {1, 0}, // meanStrides + {1, 0}, // invStdStrides + {N}, // outLengths + {1}, // dgammaStrides + {1}, // dbetaStrides + {0}, // reduceDims + dy_dev.GetDeviceBuffer(), + x_dev.GetDeviceBuffer(), + mean_dev.GetDeviceBuffer(), + inv_std_dev.GetDeviceBuffer(), + dgamma_dev.GetDeviceBuffer(), + dbeta_dev.GetDeviceBuffer()); + + auto invoker_ptr = op_ptr->MakeInvokerPointer(); + + if(op_ptr->IsSupportedArgument(argument_ptr.get())) + { + size_t workspace_sz = op_ptr->GetWorkSpaceSize(argument_ptr.get()); + SimpleDeviceMem workspace(workspace_sz); + op_ptr->SetWorkSpacePointer(argument_ptr.get(), workspace.GetDeviceBuffer()); + + invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, false}); + } + + std::cout << "Done" << std::endl; + } + + return 0; +} diff --git a/client_example/18_groupnorm/CMakeLists.txt b/client_example/18_groupnorm/CMakeLists.txt index deb50f6fce..e04c26d8e7 100644 --- a/client_example/18_groupnorm/CMakeLists.txt +++ b/client_example/18_groupnorm/CMakeLists.txt @@ -1,5 +1,8 @@ add_executable(client_groupnorm_bwd_data groupnorm_bwd_data.cpp) target_link_libraries(client_groupnorm_bwd_data PRIVATE composable_kernel::device_other_operations) +add_executable(client_groupnorm_bwd_gamma_beta groupnorm_bwd_gamma_beta.cpp) +target_link_libraries(client_groupnorm_bwd_gamma_beta PRIVATE composable_kernel::device_other_operations) + add_executable(client_groupnorm_swish_fwd groupnorm_swish_fwd.cpp) target_link_libraries(client_groupnorm_swish_fwd PRIVATE composable_kernel::device_other_operations) diff --git a/client_example/18_groupnorm/groupnorm_bwd_gamma_beta.cpp b/client_example/18_groupnorm/groupnorm_bwd_gamma_beta.cpp new file mode 100644 index 0000000000..c2fbe285df --- /dev/null +++ b/client_example/18_groupnorm/groupnorm_bwd_gamma_beta.cpp @@ -0,0 +1,180 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. + +#include +#include +#include + +#include "ck/ck.hpp" +#include "ck/tensor_operation/gpu/device/device_normalization_bwd_gamma_beta.hpp" + +#include "ck/library/tensor_operation_instance/gpu/groupnorm_bwd_gamma_beta.hpp" + +using DYDataType = float; +using XDataType = float; +using GammaDataType = float; +using MeanInvStdDataType = float; +using DGammaDataType = float; +using DBetaDataType = float; + +constexpr int Rank = 5; +constexpr int NumReduceDim = 3; + +struct SimpleDeviceMem +{ + SimpleDeviceMem() = delete; + + SimpleDeviceMem(std::size_t mem_size) : p_mem_{} + { + (void)hipMalloc(static_cast(&p_mem_), mem_size); + } + + void* GetDeviceBuffer() { return p_mem_; } + + ~SimpleDeviceMem() { (void)hipFree(p_mem_); } + + void* p_mem_; +}; + +int main(int argc, char* argv[]) +{ + ck::index_t N = 32; + ck::index_t H = 16; + ck::index_t W = 16; + ck::index_t G = 64; + ck::index_t C = 128; + + std::size_t length = N * H * W * G * C; + + std::vector strideDy = {H * W * G * C, W * G * C, G * C, C, 1}; + std::vector strideX = strideDy; + std::vector strideMeanInvStd = {G, 0, 0, 1, 0}; + std::vector strideDGammaBeta = {C, 1}; + + SimpleDeviceMem dy_dev(sizeof(DYDataType) * length); + SimpleDeviceMem x_dev(sizeof(XDataType) * length); + SimpleDeviceMem mean_dev(sizeof(MeanInvStdDataType) * N * G); + SimpleDeviceMem inv_std_dev(sizeof(MeanInvStdDataType) * N * G); + SimpleDeviceMem dgamma_dev(sizeof(DGammaDataType) * G * C); + SimpleDeviceMem dbeta_dev(sizeof(DBetaDataType) * G * C); + + using DeviceOp = + ck::tensor_operation::device::DeviceNormalizationBwdGammaBeta; + + // get device op instances + const auto op_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory< + DeviceOp>::GetInstances(); + + std::cout << "found " << op_ptrs.size() << " instances" << std::endl; + + std::string best_op_name; + bool found = false; + int best_op_id = -1; + float best_ave_time = std::numeric_limits::max(); + float best_gb_per_sec = 0; + + // profile device operation instances + std::cout << "Run all instances and do timing" << std::endl; + + std::size_t num_bytes = sizeof(DYDataType) * length + sizeof(XDataType) * length + + sizeof(GammaDataType) * G * C + sizeof(MeanInvStdDataType) * N * G * 2 + + sizeof(DGammaDataType) * G * C + sizeof(DBetaDataType) * G * C; + + for(int i = 0; i < op_ptrs.size(); ++i) + { + auto& op_ptr = op_ptrs[i]; + auto argument_ptr = op_ptr->MakeArgumentPointer({N, H, W, G, C}, + strideDy, + strideX, + strideMeanInvStd, + strideMeanInvStd, + {G, C}, + strideDGammaBeta, + strideDGammaBeta, + {0, 1, 2}, // reduceDims + dy_dev.GetDeviceBuffer(), + x_dev.GetDeviceBuffer(), + mean_dev.GetDeviceBuffer(), + inv_std_dev.GetDeviceBuffer(), + dgamma_dev.GetDeviceBuffer(), + dbeta_dev.GetDeviceBuffer()); + + auto invoker_ptr = op_ptr->MakeInvokerPointer(); + + std::string op_name = op_ptr->GetTypeString(); + + if(op_ptr->IsSupportedArgument(argument_ptr.get())) + { + size_t workspace_sz = op_ptr->GetWorkSpaceSize(argument_ptr.get()); + SimpleDeviceMem workspace(workspace_sz); + op_ptr->SetWorkSpacePointer(argument_ptr.get(), workspace.GetDeviceBuffer()); + + float ave_time = invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, true}); + float gb_per_sec = num_bytes / 1.E6 / ave_time; + + std::cout << "Perf: " << std::setw(10) << ave_time << " ms, " << gb_per_sec << " GB/s, " + << op_name << std::endl; + + if(ave_time < best_ave_time) + { + found = true; + best_op_id = i; + best_op_name = op_name; + best_ave_time = ave_time; + best_gb_per_sec = gb_per_sec; + } + } + else + { + std::cout << op_name << " does not support this problem" << std::endl; + } + } + + // run the best intance + if(found) + { + std::cout << "Best Perf: " << best_ave_time << " ms, " << best_gb_per_sec << " GB/s, " + << best_op_name << std::endl; + + auto& op_ptr = op_ptrs[best_op_id]; + std::cout << "Run the best instance without timing: " << op_ptr->GetTypeString() + << std::endl; + + auto argument_ptr = op_ptr->MakeArgumentPointer({N, H, W, G, C}, + strideDy, + strideX, + strideMeanInvStd, + strideMeanInvStd, + {G, C}, + strideDGammaBeta, + strideDGammaBeta, + {0, 1, 2}, // reduceDims + dy_dev.GetDeviceBuffer(), + x_dev.GetDeviceBuffer(), + mean_dev.GetDeviceBuffer(), + inv_std_dev.GetDeviceBuffer(), + dgamma_dev.GetDeviceBuffer(), + dbeta_dev.GetDeviceBuffer()); + + auto invoker_ptr = op_ptr->MakeInvokerPointer(); + + if(op_ptr->IsSupportedArgument(argument_ptr.get())) + { + size_t workspace_sz = op_ptr->GetWorkSpaceSize(argument_ptr.get()); + SimpleDeviceMem workspace(workspace_sz); + op_ptr->SetWorkSpacePointer(argument_ptr.get(), workspace.GetDeviceBuffer()); + + invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, false}); + } + + std::cout << "Done" << std::endl; + } + + return 0; +} diff --git a/library/include/ck/library/tensor_operation_instance/gpu/groupnorm_bwd_gamma_beta.hpp b/library/include/ck/library/tensor_operation_instance/gpu/groupnorm_bwd_gamma_beta.hpp new file mode 100644 index 0000000000..3f888d5c67 --- /dev/null +++ b/library/include/ck/library/tensor_operation_instance/gpu/groupnorm_bwd_gamma_beta.hpp @@ -0,0 +1,64 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +#include +#include +#include "ck/ck.hpp" +#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" +#include "ck/tensor_operation/gpu/device/device_normalization_bwd_gamma_beta.hpp" + +#include "ck/library/tensor_operation_instance/device_operation_instance_factory.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { +#ifdef CK_ENABLE_FP32 +// FP32 +void add_device_groupnorm_bwd_gamma_beta_f32_instances( + std::vector>>&); +#endif +template +struct DeviceOperationInstanceFactory< + ck::tensor_operation::device::DeviceNormalizationBwdGammaBeta> +{ + using DeviceOp = DeviceNormalizationBwdGammaBeta; + + static auto GetInstances() + { + std::vector> op_ptrs; + +#ifdef CK_ENABLE_FP32 + if constexpr(is_same_v && is_same_v && + is_same_v && is_same_v && + is_same_v) + { + add_device_groupnorm_bwd_gamma_beta_f32_instances(op_ptrs); + } +#endif + return op_ptrs; + } +}; + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/include/ck/library/tensor_operation_instance/gpu/layernorm_bwd_gamma_beta.hpp b/library/include/ck/library/tensor_operation_instance/gpu/layernorm_bwd_gamma_beta.hpp new file mode 100644 index 0000000000..e2736ac77e --- /dev/null +++ b/library/include/ck/library/tensor_operation_instance/gpu/layernorm_bwd_gamma_beta.hpp @@ -0,0 +1,83 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +#include +#include +#include "ck/ck.hpp" +#include "ck/tensor_operation/gpu/device/device_normalization_bwd_gamma_beta.hpp" + +#include "ck/library/tensor_operation_instance/device_operation_instance_factory.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { +#ifdef CK_ENABLE_FP16 +// FP16 +void add_device_layernorm2d_bwd_gamma_beta_f16_instances( + std::vector>>&); +#endif +#ifdef CK_ENABLE_FP32 +// FP32 +void add_device_layernorm2d_bwd_gamma_beta_f32_instances( + std::vector>>&); +#endif +template +struct DeviceOperationInstanceFactory< + ck::tensor_operation::device::DeviceNormalizationBwdGammaBeta> +{ + using DeviceOp = DeviceNormalizationBwdGammaBeta; + + static auto GetInstances() + { + std::vector> op_ptrs; +#ifdef CK_ENABLE_FP16 + if constexpr(is_same_v && is_same_v && + is_same_v && is_same_v && + is_same_v) + { + if constexpr(Rank == 2 && NumReduceDim == 1) + { + add_device_layernorm2d_bwd_gamma_beta_f16_instances(op_ptrs); + } + } +#endif +#ifdef CK_ENABLE_FP32 + if constexpr(is_same_v && is_same_v && + is_same_v && is_same_v && + is_same_v) + { + if constexpr(Rank == 2 && NumReduceDim == 1) + { + add_device_layernorm2d_bwd_gamma_beta_f32_instances(op_ptrs); + } + } +#endif + return op_ptrs; + } +}; + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/normalization_bwd_gamma_beta/device_layernorm2d_bwd_gamma_beta_f16_instance.cpp b/library/src/tensor_operation_instance/gpu/normalization_bwd_gamma_beta/device_layernorm2d_bwd_gamma_beta_f16_instance.cpp index aa399f56ec..160bcb4ace 100644 --- a/library/src/tensor_operation_instance/gpu/normalization_bwd_gamma_beta/device_layernorm2d_bwd_gamma_beta_f16_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/normalization_bwd_gamma_beta/device_layernorm2d_bwd_gamma_beta_f16_instance.cpp @@ -8,7 +8,7 @@ namespace tensor_operation { namespace device { namespace instance { -void add_device_layernorm2d_bwd_gamma_beta_rank_2_1_f16_instances( +void add_device_layernorm2d_bwd_gamma_beta_f16_instances( std::vector>>& instances) { diff --git a/library/src/tensor_operation_instance/gpu/normalization_bwd_gamma_beta/device_layernorm2d_bwd_gamma_beta_f32_instance.cpp b/library/src/tensor_operation_instance/gpu/normalization_bwd_gamma_beta/device_layernorm2d_bwd_gamma_beta_f32_instance.cpp index ba2966ba37..6f42eca0b9 100644 --- a/library/src/tensor_operation_instance/gpu/normalization_bwd_gamma_beta/device_layernorm2d_bwd_gamma_beta_f32_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/normalization_bwd_gamma_beta/device_layernorm2d_bwd_gamma_beta_f32_instance.cpp @@ -8,7 +8,7 @@ namespace tensor_operation { namespace device { namespace instance { -void add_device_layernorm2d_bwd_gamma_beta_rank_2_1_f32_instances( +void add_device_layernorm2d_bwd_gamma_beta_f32_instances( std::vector>>& instances) { diff --git a/profiler/include/profiler/profile_groupnorm_bwd_gamma_beta_impl.hpp b/profiler/include/profiler/profile_groupnorm_bwd_gamma_beta_impl.hpp new file mode 100644 index 0000000000..5e9d3df1b1 --- /dev/null +++ b/profiler/include/profiler/profile_groupnorm_bwd_gamma_beta_impl.hpp @@ -0,0 +1,261 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +#include + +#include "ck/ck.hpp" +#include "ck/library/tensor_operation_instance/gpu/groupnorm_bwd_gamma_beta.hpp" +#include "ck/library/utility/check_err.hpp" +#include "ck/library/utility/device_memory.hpp" +#include "ck/library/utility/host_tensor.hpp" +#include "ck/library/utility/host_tensor_generator.hpp" +#include "ck/library/reference_tensor_operation/cpu/reference_groupnorm_bwd.hpp" + +namespace ck { +namespace profiler { + +template +bool profile_groupnorm_bwd_gamma_beta_impl(int do_verification, + int init_method, + bool do_log, + bool time_kernel, + std::vector length) +{ + // we don't need GammaDataType and DXDataType here, just for reference class + using GammaDataType = DYDataType; + using DXDataType = DYDataType; + + if(length.size() != 5) + return false; + + index_t N = length[0]; + index_t G = length[3]; + index_t C = length[4]; + + std::vector reduce_dim = {0, 1, 2}; + std::vector gamma_beta_length = {G, C}; + + Tensor dy(length); + Tensor x(length); + Tensor gamma(gamma_beta_length); // dummy tensor, for reference + Tensor mean({N, G}); + Tensor inv_std({N, G}); + Tensor dgamma(gamma_beta_length); + Tensor dbeta(gamma_beta_length); + + Tensor host_dx(length); // dummy tensor, for reference + Tensor host_dgamma(gamma_beta_length); + Tensor host_dbeta(gamma_beta_length); + + std::vector strideDy = + std::vector{dy.mDesc.GetStrides().begin(), dy.mDesc.GetStrides().end()}; + std::vector strideX = + std::vector{x.mDesc.GetStrides().begin(), x.mDesc.GetStrides().end()}; + + std::vector strideDGamma{dgamma.mDesc.GetStrides().begin(), + dgamma.mDesc.GetStrides().end()}; + + std::vector strideDBeta{dbeta.mDesc.GetStrides().begin(), + dbeta.mDesc.GetStrides().end()}; + + std::vector strideMeanInvStd = {G, 0, 0, 1, 0}; + + switch(init_method) + { + case 0: + dy.GenerateTensorValue(GeneratorTensor_1{}); + x.GenerateTensorValue(GeneratorTensor_1{}); + mean.GenerateTensorValue(GeneratorTensor_1{}); + inv_std.GenerateTensorValue(GeneratorTensor_1{}); + dgamma.GenerateTensorValue(GeneratorTensor_1{}); + dbeta.GenerateTensorValue(GeneratorTensor_1{}); + break; + case 1: + dy.GenerateTensorValue(GeneratorTensor_2{-5, 5}); + x.GenerateTensorValue(GeneratorTensor_2{-5, 5}); + mean.GenerateTensorValue(GeneratorTensor_2{-5, 5}); + inv_std.GenerateTensorValue(GeneratorTensor_2{0, 5}); + dgamma.GenerateTensorValue(GeneratorTensor_2{-5, 5}); + dbeta.GenerateTensorValue(GeneratorTensor_2{-5, 5}); + break; + default: + dy.GenerateTensorValue(GeneratorTensor_3{0, 1}); + x.GenerateTensorValue(GeneratorTensor_3{0, 1}); + mean.GenerateTensorValue(GeneratorTensor_3{-0.5, 0.5}); + inv_std.GenerateTensorValue(GeneratorTensor_3{0, 0.5}); + dgamma.GenerateTensorValue(GeneratorTensor_3{-0.5, 0.5}); + dbeta.GenerateTensorValue(GeneratorTensor_3{-0.5, 0.5}); + } + + DeviceMem dy_dev(sizeof(DYDataType) * dy.mDesc.GetElementSpaceSize()); + DeviceMem x_dev(sizeof(XDataType) * x.mDesc.GetElementSpaceSize()); + DeviceMem mean_dev(sizeof(MeanInvStdDataType) * mean.mDesc.GetElementSpaceSize()); + DeviceMem inv_std_dev(sizeof(MeanInvStdDataType) * inv_std.mDesc.GetElementSpaceSize()); + DeviceMem dgamma_dev(sizeof(DGammaDataType) * dgamma.mDesc.GetElementSpaceSize()); + DeviceMem dbeta_dev(sizeof(DBetaDataType) * dbeta.mDesc.GetElementSpaceSize()); + + dy_dev.ToDevice(dy.mData.data()); + x_dev.ToDevice(x.mData.data()); + mean_dev.ToDevice(mean.mData.data()); + inv_std_dev.ToDevice(inv_std.mData.data()); + + // add device normalization instances + using DeviceOp = + ck::tensor_operation::device::DeviceNormalizationBwdGammaBeta; + + // get device op instances + const auto instance_ptrs = + ck::tensor_operation::device::instance::DeviceOperationInstanceFactory< + DeviceOp>::GetInstances(); + + std::cout << "found " << instance_ptrs.size() << " instances" << std::endl; + + std::string best_instance_name; + float best_avg_time = std::numeric_limits::max(); + float best_gb_per_sec = 0; + + if(do_verification) + { + using ReferenceInstance = + ck::tensor_operation::host::ReferenceGroupnormBwd; + + ReferenceInstance ref; + auto ref_argument = + ref.MakeArgument(dy, x, gamma, mean, inv_std, host_dgamma, host_dbeta, host_dx, length); + auto ref_invoker = ref.MakeInvoker(); + ref_invoker.Run(ref_argument); + } + + std::size_t num_bytes = dy.mDesc.GetElementSize() * sizeof(DYDataType) + + x.mDesc.GetElementSize() * sizeof(XDataType) + + mean.mDesc.GetElementSize() * sizeof(MeanInvStdDataType) + + inv_std.mDesc.GetElementSize() * sizeof(MeanInvStdDataType) + + dgamma.mDesc.GetElementSize() * sizeof(DGammaDataType) + + dbeta.mDesc.GetElementSize() * sizeof(DBetaDataType); + + int num_kernel = 0; + + for(auto& inst_ptr : instance_ptrs) + { + auto argument_ptr = inst_ptr->MakeArgumentPointer(length, + strideDy, + strideX, + strideMeanInvStd, + strideMeanInvStd, + gamma_beta_length, + strideDGamma, + strideDBeta, + reduce_dim, + dy_dev.GetDeviceBuffer(), + x_dev.GetDeviceBuffer(), + mean_dev.GetDeviceBuffer(), + inv_std_dev.GetDeviceBuffer(), + dgamma_dev.GetDeviceBuffer(), + dbeta_dev.GetDeviceBuffer()); + + if(inst_ptr->IsSupportedArgument(argument_ptr.get())) + { + ++num_kernel; + } + else + { + if(time_kernel) + { + std::cout << inst_ptr->GetTypeString() << " skipped due to unsupported argument: "; + LogRange(std::cout << "input lengths = ", length, ", ") << std::endl; + } + + continue; + } + + size_t workspace_sz = inst_ptr->GetWorkSpaceSize(argument_ptr.get()); + DeviceMem workspace_dev(workspace_sz); + inst_ptr->SetWorkSpacePointer(argument_ptr.get(), workspace_dev.GetDeviceBuffer()); + + auto invoker_ptr = inst_ptr->MakeInvokerPointer(); + + float avg_time = invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, time_kernel}); + + float gb_per_sec = num_bytes / 1.E6 / avg_time; + + if(time_kernel) + std::cout << "Perf: " << std::setw(10) << avg_time << " ms, " << gb_per_sec << " GB/s, " + << inst_ptr->GetTypeString() << std::endl; + + if(avg_time < best_avg_time) + { + best_instance_name = inst_ptr->GetTypeString(); + best_avg_time = avg_time; + best_gb_per_sec = gb_per_sec; + } + + if(do_verification) + { + dgamma_dev.FromDevice(dgamma.mData.data()); + dbeta_dev.FromDevice(dbeta.mData.data()); + bool pass = + ck::utils::check_err(dgamma, host_dgamma, "Error: Incorrect dgamma", 1e-3, 1e-3); + + pass &= ck::utils::check_err(dbeta, host_dbeta, "Error: Incorrect dbeta", 1e-3, 1e-3); + + if(do_log) + { + LogRangeAsType(std::cout << "dy : ", dy.mData, ",") << std::endl; + LogRangeAsType(std::cout << "host_dgamma : ", host_dgamma.mData, ",") + << std::endl; + LogRangeAsType(std::cout << "dgamma : ", dgamma.mData, ",") << std::endl; + } + + if(!pass) + { + std::cout << inst_ptr->GetTypeString() << " failed verification: "; + LogRange(std::cout << "lengths = [", length, ", ") << "]." << std::endl; + return false; + } + else + { + if(time_kernel) + std::cout << "pass" << std::endl; + } + } + } + + if(time_kernel) + { + LogRange(std::cout << "length = ", length, ",") << ", "; + LogRange(std::cout << "reduce dims ", reduce_dim, ",") << std::endl; + std::cout << "best perf = " << best_avg_time << " ms, " << best_gb_per_sec << " GB/s," + << best_instance_name << std::endl; + } + + if(num_kernel == 0) + { + std::cout << "Error: No kernel is applicable" << std::endl; + return false; + } + + return true; +} + +} // namespace profiler +} // namespace ck diff --git a/profiler/include/profiler/profile_layernorm_bwd_gamma_beta_impl.hpp b/profiler/include/profiler/profile_layernorm_bwd_gamma_beta_impl.hpp new file mode 100644 index 0000000000..10fa9c86d5 --- /dev/null +++ b/profiler/include/profiler/profile_layernorm_bwd_gamma_beta_impl.hpp @@ -0,0 +1,263 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +#include + +#include "ck/ck.hpp" +#include "ck/library/tensor_operation_instance/gpu/layernorm_bwd_gamma_beta.hpp" +#include "ck/library/utility/check_err.hpp" +#include "ck/library/utility/device_memory.hpp" +#include "ck/library/utility/host_tensor.hpp" +#include "ck/library/utility/host_tensor_generator.hpp" +#include "ck/library/reference_tensor_operation/cpu/reference_layernorm_bwd.hpp" + +namespace ck { +namespace profiler { + +template +bool profile_layernorm_bwd_gamma_beta_impl(int do_verification, + int init_method, + bool do_log, + bool time_kernel, + std::vector length) +{ + // we don't need GammaDataType and DXDataType here, just for reference class + using GammaDataType = DYDataType; + using DXDataType = DYDataType; + + if(length.size() != Rank || Rank < 2) + return false; + + // Assume normalize dimension for first dimension + // Layernorm 2D, input = [M, K], reduce on M axis + // Layernorm 4D, input = [N, H, W, C], redice on N axis + constexpr int NumReduceDim = Rank - 1; + + std::vector reduce_dim = {0}; + std::vector invarient_length{length.begin() + 1, length.end()}; + + Tensor dy(length); + Tensor x(length); + Tensor gamma(invarient_length); // dummy tensor, for reference + Tensor mean({length[0]}); + Tensor inv_std({length[0]}); + Tensor dgamma(invarient_length); + Tensor dbeta(invarient_length); + + Tensor host_dx(length); // dummy tensor, for reference + Tensor host_dgamma(invarient_length); + Tensor host_dbeta(invarient_length); + + std::vector strideDy = + std::vector{dy.mDesc.GetStrides().begin(), dy.mDesc.GetStrides().end()}; + std::vector strideX = strideDy; + + std::vector strideDGamma{dgamma.mDesc.GetStrides().begin(), + dgamma.mDesc.GetStrides().end()}; + + std::vector strideDBeta{dbeta.mDesc.GetStrides().begin(), + dbeta.mDesc.GetStrides().end()}; + + std::vector strideMeanInvStd{Rank, 0}; + strideMeanInvStd[0] = 1; + + switch(init_method) + { + case 0: + dy.GenerateTensorValue(GeneratorTensor_1{}); + x.GenerateTensorValue(GeneratorTensor_1{}); + mean.GenerateTensorValue(GeneratorTensor_1{}); + inv_std.GenerateTensorValue(GeneratorTensor_1{}); + dgamma.GenerateTensorValue(GeneratorTensor_1{}); + dbeta.GenerateTensorValue(GeneratorTensor_1{}); + break; + case 1: + dy.GenerateTensorValue(GeneratorTensor_2{-5, 5}); + x.GenerateTensorValue(GeneratorTensor_2{-5, 5}); + mean.GenerateTensorValue(GeneratorTensor_2{-5, 5}); + inv_std.GenerateTensorValue(GeneratorTensor_2{0, 5}); + dgamma.GenerateTensorValue(GeneratorTensor_2{-5, 5}); + dbeta.GenerateTensorValue(GeneratorTensor_2{-5, 5}); + break; + default: + dy.GenerateTensorValue(GeneratorTensor_3{0, 1}); + x.GenerateTensorValue(GeneratorTensor_3{0, 1}); + mean.GenerateTensorValue(GeneratorTensor_3{-0.5, 0.5}); + inv_std.GenerateTensorValue(GeneratorTensor_3{0, 0.5}); + dgamma.GenerateTensorValue(GeneratorTensor_3{-0.5, 0.5}); + dbeta.GenerateTensorValue(GeneratorTensor_3{-0.5, 0.5}); + } + + DeviceMem dy_dev(sizeof(DYDataType) * dy.mDesc.GetElementSpaceSize()); + DeviceMem x_dev(sizeof(XDataType) * x.mDesc.GetElementSpaceSize()); + DeviceMem mean_dev(sizeof(MeanInvStdDataType) * mean.mDesc.GetElementSpaceSize()); + DeviceMem inv_std_dev(sizeof(MeanInvStdDataType) * inv_std.mDesc.GetElementSpaceSize()); + DeviceMem dgamma_dev(sizeof(DGammaDataType) * dgamma.mDesc.GetElementSpaceSize()); + DeviceMem dbeta_dev(sizeof(DBetaDataType) * dbeta.mDesc.GetElementSpaceSize()); + + dy_dev.ToDevice(dy.mData.data()); + x_dev.ToDevice(x.mData.data()); + mean_dev.ToDevice(mean.mData.data()); + inv_std_dev.ToDevice(inv_std.mData.data()); + + // add device normalization instances + using DeviceOp = + ck::tensor_operation::device::DeviceNormalizationBwdGammaBeta; + + // get device op instances + const auto instance_ptrs = + ck::tensor_operation::device::instance::DeviceOperationInstanceFactory< + DeviceOp>::GetInstances(); + + std::cout << "found " << instance_ptrs.size() << " instances" << std::endl; + + std::string best_instance_name; + float best_avg_time = std::numeric_limits::max(); + float best_gb_per_sec = 0; + + if(do_verification) + { + using ReferenceInstance = + ck::tensor_operation::host::ReferenceLayernormBwd; + + ReferenceInstance ref; + auto ref_argument = + ref.MakeArgument(dy, x, gamma, mean, inv_std, host_dgamma, host_dbeta, host_dx, length); + auto ref_invoker = ref.MakeInvoker(); + ref_invoker.Run(ref_argument); + } + + std::size_t num_bytes = dy.mDesc.GetElementSize() * sizeof(DYDataType) + + x.mDesc.GetElementSize() * sizeof(XDataType) + + mean.mDesc.GetElementSize() * sizeof(MeanInvStdDataType) + + inv_std.mDesc.GetElementSize() * sizeof(MeanInvStdDataType) + + dgamma.mDesc.GetElementSize() * sizeof(DGammaDataType) + + dbeta.mDesc.GetElementSize() * sizeof(DBetaDataType); + + int num_kernel = 0; + + for(auto& inst_ptr : instance_ptrs) + { + auto argument_ptr = inst_ptr->MakeArgumentPointer(length, + strideDy, + strideX, + strideMeanInvStd, + strideMeanInvStd, + invarient_length, + strideDGamma, + strideDBeta, + reduce_dim, + dy_dev.GetDeviceBuffer(), + x_dev.GetDeviceBuffer(), + mean_dev.GetDeviceBuffer(), + inv_std_dev.GetDeviceBuffer(), + dgamma_dev.GetDeviceBuffer(), + dbeta_dev.GetDeviceBuffer()); + + if(inst_ptr->IsSupportedArgument(argument_ptr.get())) + { + ++num_kernel; + } + else + { + if(time_kernel) + { + std::cout << inst_ptr->GetTypeString() << " skipped due to unsupported argument: "; + LogRange(std::cout << "input lengths = ", length, ", ") << std::endl; + } + + continue; + } + + size_t workspace_sz = inst_ptr->GetWorkSpaceSize(argument_ptr.get()); + DeviceMem workspace_dev(workspace_sz); + inst_ptr->SetWorkSpacePointer(argument_ptr.get(), workspace_dev.GetDeviceBuffer()); + + auto invoker_ptr = inst_ptr->MakeInvokerPointer(); + + float avg_time = invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, time_kernel}); + + float gb_per_sec = num_bytes / 1.E6 / avg_time; + + if(time_kernel) + std::cout << "Perf: " << std::setw(10) << avg_time << " ms, " << gb_per_sec << " GB/s, " + << inst_ptr->GetTypeString() << std::endl; + + if(avg_time < best_avg_time) + { + best_instance_name = inst_ptr->GetTypeString(); + best_avg_time = avg_time; + best_gb_per_sec = gb_per_sec; + } + + if(do_verification) + { + dgamma_dev.FromDevice(dgamma.mData.data()); + dbeta_dev.FromDevice(dbeta.mData.data()); + bool pass = + ck::utils::check_err(dgamma, host_dgamma, "Error: Incorrect dgamma", 1e-3, 1e-3); + + pass &= ck::utils::check_err(dbeta, host_dbeta, "Error: Incorrect dbeta", 1e-3, 1e-3); + + if(do_log) + { + LogRangeAsType(std::cout << "dy : ", dy.mData, ",") << std::endl; + LogRangeAsType(std::cout << "host_dgamma : ", host_dgamma.mData, ",") + << std::endl; + LogRangeAsType(std::cout << "dgamma : ", dgamma.mData, ",") << std::endl; + } + + if(!pass) + { + std::cout << inst_ptr->GetTypeString() << " failed verification: "; + LogRange(std::cout << "lengths = [", length, ", ") << "]." << std::endl; + return false; + } + else + { + if(time_kernel) + std::cout << "pass" << std::endl; + } + } + } + + if(time_kernel) + { + LogRange(std::cout << "length = ", length, ",") << ", "; + LogRange(std::cout << "reduce dims ", reduce_dim, ",") << std::endl; + std::cout << "best perf = " << best_avg_time << " ms, " << best_gb_per_sec << " GB/s," + << best_instance_name << std::endl; + } + + if(num_kernel == 0) + { + std::cout << "Error: No kernel is applicable" << std::endl; + return false; + } + + return true; +} + +} // namespace profiler +} // namespace ck diff --git a/profiler/src/CMakeLists.txt b/profiler/src/CMakeLists.txt index 68ef04ed11..e9cf6eecfb 100644 --- a/profiler/src/CMakeLists.txt +++ b/profiler/src/CMakeLists.txt @@ -19,6 +19,8 @@ set(PROFILER_SOURCES profile_groupnorm_bwd_data.cpp profile_groupnorm_fwd.cpp profile_layernorm_bwd_data.cpp + profile_layernorm_bwd_gamma_beta.cpp + profile_groupnorm_bwd_gamma_beta.cpp profile_layernorm_fwd.cpp profile_max_pool3d_fwd.cpp profile_avg_pool3d_bwd.cpp @@ -82,6 +84,7 @@ target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_conv2d_fwd_bias_relu target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_conv2d_fwd_bias_relu_add_instance) target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_normalization_fwd_instance) target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_normalization_bwd_data_instance) +target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_normalization_bwd_gamma_beta_instance) target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_softmax_instance) target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_reduce_instance) target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_batchnorm_instance) diff --git a/profiler/src/profile_groupnorm_bwd_gamma_beta.cpp b/profiler/src/profile_groupnorm_bwd_gamma_beta.cpp new file mode 100644 index 0000000000..7fcef3a4e2 --- /dev/null +++ b/profiler/src/profile_groupnorm_bwd_gamma_beta.cpp @@ -0,0 +1,104 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. + +#include +#include +#include + +#include "profiler/data_type_enum.hpp" +#include "profiler/profile_groupnorm_bwd_gamma_beta_impl.hpp" +#include "profiler_operation_registry.hpp" + +using ck::index_t; + +struct groupnormBwdGammaBetaArgParser +{ + std::unordered_map> long_opts = {{"length", {}}}; + + bool parse_opt(int argc, char* argv[], const std::string& key, int i) + { + if(std::string("--") + key == argv[i]) + { + int pos = i; + while(++i < argc && argv[i][0] != '-') {} + int end = i; + for(int j = pos + 1; j < end; j++) + { + long_opts[key].push_back(std::stoi(argv[j])); + } + return true; + } + return false; + } + + void operator()(int argc, char* argv[]) + { + for(auto& kv : long_opts) + { + for(int i = 1; i < argc; i++) + { + if(parse_opt(argc, argv, kv.first, i)) + break; + } + } + } +}; + +void print_help_groupnorm_bwd_gamma_beta() +{ + // eg: ckProfiler groupnorm_bwd_gamma_beta 1 0 2 0 1 --length 1 16 16 32 40 + std::cout << "arg1: data type (0: fp16; 1: fp32)\n" + << "arg2: verification (0: no; 1: yes)\n" + << "arg3: initialization (0: no init; 1: integer value; 2: decimal value)\n" + << "arg4: print tensor value (0: no; 1: yes)\n" + << "arg5: time kernel (0=no, 1=yes)\n" + << "--length: tensor extents (e.g, --length 1 16 16 32 40) \n" + << std::endl; +} + +int profile_groupnorm_bwd_gamma_beta(int argc, char* argv[]) +{ + if(argc <= 2) + { + print_help_groupnorm_bwd_gamma_beta(); + return 0; + } + + groupnormBwdGammaBetaArgParser arg_parser; + + // short unnamed options + const ck::DataTypeEnum data_type = static_cast(std::stoi(argv[2])); + const bool do_verification = std::stoi(argv[3]); + const int init_method = std::stoi(argv[4]); + const bool do_log = std::stoi(argv[5]); + const bool time_kernel = std::stoi(argv[6]); + + // parse the long options + arg_parser(argc, argv); + const std::vector length = arg_parser.long_opts["length"]; + + using F32 = float; + + if(length.size() == 5) + { + if(data_type == ck::DataTypeEnum::Float) + { + ck::profiler::profile_groupnorm_bwd_gamma_beta_impl( + do_verification, init_method, do_log, time_kernel, length); + } + else + { + throw std::runtime_error("not implemented yet"); + } + } + else + { + throw std::runtime_error("length should be 5"); + } + + return 0; +} + +REGISTER_PROFILER_OPERATION("groupnorm_bwd_gamma_beta", + "Group Normalization", + profile_groupnorm_bwd_gamma_beta); diff --git a/profiler/src/profile_layernorm_bwd_gamma_beta.cpp b/profiler/src/profile_layernorm_bwd_gamma_beta.cpp new file mode 100644 index 0000000000..0f3436c663 --- /dev/null +++ b/profiler/src/profile_layernorm_bwd_gamma_beta.cpp @@ -0,0 +1,112 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. + +#include +#include +#include + +#include "profiler/data_type_enum.hpp" +#include "profiler/profile_layernorm_bwd_gamma_beta_impl.hpp" +#include "profiler_operation_registry.hpp" + +using ck::index_t; + +struct layernormBwdGammaBetaArgParser +{ + std::unordered_map> long_opts = {{"length", {}}}; + + bool parse_opt(int argc, char* argv[], const std::string& key, int i) + { + if(std::string("--") + key == argv[i]) + { + int pos = i; + while(++i < argc && argv[i][0] != '-') {} + int end = i; + for(int j = pos + 1; j < end; j++) + { + long_opts[key].push_back(std::stoi(argv[j])); + } + return true; + } + return false; + } + + void operator()(int argc, char* argv[]) + { + for(auto& kv : long_opts) + { + for(int i = 1; i < argc; i++) + { + if(parse_opt(argc, argv, kv.first, i)) + break; + } + } + } +}; + +void print_help_layernorm_bwd_gamma_beta() +{ + // eg: ckProfiler layernorm_bwd_gamma_beta 0 0 2 0 1 --length 1502 4096 + std::cout << "arg1: data type (0: fp16; 1: fp32)\n" + << "arg2: verification (0: no; 1: yes)\n" + << "arg3: initialization (0: no init; 1: integer value; 2: decimal value)\n" + << "arg4: print tensor value (0: no; 1: yes)\n" + << "arg5: time kernel (0=no, 1=yes)\n" + << "--length: tensor extents (e.g, --length 1024 1024) \n" + << std::endl; +} + +int profile_layernorm_bwd_gamma_beta(int argc, char* argv[]) +{ + if(argc <= 2) + { + print_help_layernorm_bwd_gamma_beta(); + return 0; + } + + layernormBwdGammaBetaArgParser arg_parser; + + // short unnamed options + const ck::DataTypeEnum data_type = static_cast(std::stoi(argv[2])); + const bool do_verification = std::stoi(argv[3]); + const int init_method = std::stoi(argv[4]); + const bool do_log = std::stoi(argv[5]); + const bool time_kernel = std::stoi(argv[6]); + + // parse the long options + arg_parser(argc, argv); + const std::vector length = arg_parser.long_opts["length"]; + + using F16 = ck::half_t; + using F32 = float; + + if(length.size() == 2) + { + constexpr int rank = 2; + + if(data_type == ck::DataTypeEnum::Half) + { + ck::profiler::profile_layernorm_bwd_gamma_beta_impl( + do_verification, init_method, do_log, time_kernel, length); + } + else if(data_type == ck::DataTypeEnum::Float) + { + ck::profiler::profile_layernorm_bwd_gamma_beta_impl( + do_verification, init_method, do_log, time_kernel, length); + } + else + { + throw std::runtime_error("not implemented yet"); + } + } + else + { + throw std::runtime_error("not implemented yet"); + } + + return 0; +} + +REGISTER_PROFILER_OPERATION("layernorm_bwd_gamma_beta", + "Layer Normalization", + profile_layernorm_bwd_gamma_beta); diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index 90140659f6..fa5f8583af 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -140,6 +140,7 @@ add_subdirectory(block_to_ctile_map) add_subdirectory(softmax) add_subdirectory(normalization_fwd) add_subdirectory(normalization_bwd_data) +add_subdirectory(normalization_bwd_gamma_beta) add_subdirectory(data_type) add_subdirectory(elementwise_normalization) add_subdirectory(batchnorm) diff --git a/test/normalization_bwd_gamma_beta/CMakeLists.txt b/test/normalization_bwd_gamma_beta/CMakeLists.txt new file mode 100644 index 0000000000..f3579aad08 --- /dev/null +++ b/test/normalization_bwd_gamma_beta/CMakeLists.txt @@ -0,0 +1,13 @@ +add_custom_target(test_normalization_bwd_gamma_beta) +add_gtest_executable(test_layernorm2d_bwd_gamma_beta_fp32 test_layernorm2d_bwd_gamma_beta_fp32.cpp) +if(result EQUAL 0) + target_link_libraries(test_layernorm2d_bwd_gamma_beta_fp32 PRIVATE utility device_normalization_bwd_gamma_beta_instance) + add_dependencies(test_normalization_bwd_gamma_beta test_layernorm2d_bwd_gamma_beta_fp32) +endif() + +add_gtest_executable(test_groupnorm_bwd_gamma_beta_fp32 test_groupnorm_bwd_gamma_beta_fp32.cpp) +if(result EQUAL 0) + target_link_libraries(test_groupnorm_bwd_gamma_beta_fp32 PRIVATE utility device_normalization_bwd_gamma_beta_instance) + add_dependencies(test_normalization_bwd_gamma_beta test_groupnorm_bwd_gamma_beta_fp32) +endif() + diff --git a/test/normalization_bwd_gamma_beta/test_groupnorm_bwd_gamma_beta_fp32.cpp b/test/normalization_bwd_gamma_beta/test_groupnorm_bwd_gamma_beta_fp32.cpp new file mode 100644 index 0000000000..ab9cb29891 --- /dev/null +++ b/test/normalization_bwd_gamma_beta/test_groupnorm_bwd_gamma_beta_fp32.cpp @@ -0,0 +1,51 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. + +#include "gtest/gtest.h" +#include "profiler/profile_groupnorm_bwd_gamma_beta_impl.hpp" + +using F16 = ck::half_t; +using F32 = float; +using ck::index_t; + +template +class TestgroupnormBwdGammaBeta : public ::testing::Test +{ + protected: + using DYDataType = std::tuple_element_t<0, Tuple>; + using XDataType = std::tuple_element_t<1, Tuple>; + using MeanInvStdDataType = std::tuple_element_t<2, Tuple>; + using ComputeDataType = std::tuple_element_t<3, Tuple>; + using DGammaDataType = std::tuple_element_t<4, Tuple>; + using DBetaDataType = std::tuple_element_t<5, Tuple>; + + void Run() + { + // Bwd data: [N, H, W, G, C], reduce H, W, C + std::vector> lengths = {{1, 1, 1, 1, 1}, + {1, 2, 3, 4, 5}, + {256, 9, 9, 9, 9}, + {1, 64, 64, 32, 10}, + {1, 32, 32, 32, 20}, + {1, 16, 16, 32, 40}}; + + for(auto length : lengths) + { + bool success = ck::profiler::profile_groupnorm_bwd_gamma_beta_impl( + true, 2, false, false, length); + EXPECT_TRUE(success); + } + } +}; + +using KernelTypes = ::testing::Types< + // DYDataType XDataType, MeanInvStdDataType, ComputeDataType, DGammaDataType, DBetaDataType> + std::tuple>; + +TYPED_TEST_SUITE(TestgroupnormBwdGammaBeta, KernelTypes); +TYPED_TEST(TestgroupnormBwdGammaBeta, Test_FP32) { this->Run(); } diff --git a/test/normalization_bwd_gamma_beta/test_layernorm2d_bwd_gamma_beta_fp32.cpp b/test/normalization_bwd_gamma_beta/test_layernorm2d_bwd_gamma_beta_fp32.cpp new file mode 100644 index 0000000000..53c92413b1 --- /dev/null +++ b/test/normalization_bwd_gamma_beta/test_layernorm2d_bwd_gamma_beta_fp32.cpp @@ -0,0 +1,48 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. + +#include "gtest/gtest.h" +#include "profiler/profile_layernorm_bwd_gamma_beta_impl.hpp" + +using F16 = ck::half_t; +using F32 = float; +using ck::index_t; + +template +class TestLayernorm2dBwdGammaBeta : public ::testing::Test +{ + protected: + using DYDataType = std::tuple_element_t<0, Tuple>; + using XDataType = std::tuple_element_t<1, Tuple>; + using MeanInvStdDataType = std::tuple_element_t<2, Tuple>; + using ComputeDataType = std::tuple_element_t<3, Tuple>; + using DGammaDataType = std::tuple_element_t<4, Tuple>; + using DBetaDataType = std::tuple_element_t<5, Tuple>; + + void Run() + { + // Bwd data: [N, D], reduce D + std::vector> lengths = { + {4, 256}, {8, 511}, {9, 1032}, {4, 2048}, {1, 8192}, {4000, 2000}}; + + for(auto length : lengths) + { + bool success = ck::profiler::profile_layernorm_bwd_gamma_beta_impl( + true, 2, false, false, length); + EXPECT_TRUE(success); + } + } +}; + +using KernelTypes = ::testing::Types< + // DYDataType XDataType, MeanInvStdDataType, ComputeDataType, DGammaDataType, DBetaDataType> + std::tuple>; + +TYPED_TEST_SUITE(TestLayernorm2dBwdGammaBeta, KernelTypes); +TYPED_TEST(TestLayernorm2dBwdGammaBeta, Test_FP32) { this->Run(); }