From f09ecf09f61e525e2f3b0cc087b6a51440acdbce Mon Sep 17 00:00:00 2001
From: rocking5566
Date: Tue, 20 Sep 2022 11:30:46 +0800
Subject: [PATCH] Group norm (#417)

* Add groupnorm example by layernorm
1. Reference is not ready
2. shape of gamma and beta needs to be fixed
* Let shape of gamma and beta be the same as x
* Modify test, instance and client example
* [What] Fix bug of layernorm for more than 2 dimensions.
[Why] We need to get the upper length from the merge transform instead of the embed transform.
* Add reference for groupnorm
* Fuse sigmoid after groupnorm
* [What] Rename original layernorm into layernorm2d
[Why] Prepare to add groupnorm using layernorm5d
* clang-format
* Add groupnorm test
* Refine error message
* Add groupnorm ckProfiler
* Test groupnorm kernel from device_instance
* update example
* update profiler
* Fix test naming
* Fix argc number
* Move descriptor and sweeponce to argument for quick debugging

Co-authored-by: Chao Liu
[ROCm/composable_kernel commit: 4eba345f6e4b68a5969a90d1eb44d63c696fe51e]
---
 client_example/05_layernorm/layernorm2d.cpp | 4 +-
 example/27_layernorm/layernorm_blockwise.cpp | 43 ++--
 example/42_groupnorm/CMakeLists.txt | 1 +
 .../42_groupnorm/groupnorm_sigmoid_fp16.cpp | 172 +++++++++++++++
 .../gpu/device/device_layernorm_impl.hpp | 193 ++++++++--------
 .../element/unary_element_wise_operation.hpp | 15 ++
 .../gridwise_layernorm_naive_variance.hpp | 106 ++++-----
 .../gridwise_layernorm_welford_variance.hpp | 100 ++++-----
 .../cpu/reference_groupnorm.hpp | 191 ++++++++++++++++
 .../gpu/layernorm.hpp | 48 +++-
 .../device_layernorm_f16_instance.cpp | 44 ++--
 .../device_layernorm_f32_instance.cpp | 40 ++--
 profiler/CMakeLists.txt | 1 +
 profiler/include/profile_groupnorm_impl.hpp | 207 ++++++++++++++++++
 profiler/include/profile_layernorm_impl.hpp | 75 ++-----
 profiler/src/profile_groupnorm.cpp | 106 +++++++++
 profiler/src/profile_layernorm.cpp | 10 +-
 profiler/src/profiler.cpp | 45 ++--
 test/layernorm/CMakeLists.txt | 19 +-
 test/layernorm/test_groupnorm_fp16.cpp | 56 +++++
 test/layernorm/test_groupnorm_fp32.cpp | 56 +++++
 ...orm_fp16.cpp => test_layernorm2d_fp16.cpp} | 26 +--
 ...orm_fp32.cpp => test_layernorm2d_fp32.cpp} | 26 +--
 ...orm_util.hpp => test_layernorm2d_util.hpp} | 50 ++---
 24 files changed, 1218 insertions(+), 416 deletions(-)
 create mode 100644 example/42_groupnorm/CMakeLists.txt
 create mode 100644 example/42_groupnorm/groupnorm_sigmoid_fp16.cpp
 create mode 100644 library/include/ck/library/reference_tensor_operation/cpu/reference_groupnorm.hpp
 create mode 100644 profiler/include/profile_groupnorm_impl.hpp
 create mode 100644 profiler/src/profile_groupnorm.cpp
 create mode 100644 test/layernorm/test_groupnorm_fp16.cpp
 create mode 100644 test/layernorm/test_groupnorm_fp32.cpp
 rename test/layernorm/{test_layernorm_fp16.cpp => test_layernorm2d_fp16.cpp} (73%)
 rename test/layernorm/{test_layernorm_fp32.cpp => test_layernorm2d_fp32.cpp} (52%)
 rename test/layernorm/{test_layernorm_util.hpp => test_layernorm2d_util.hpp} (85%)

diff --git a/client_example/05_layernorm/layernorm2d.cpp b/client_example/05_layernorm/layernorm2d.cpp index 657f2248f3..c58a21da03 100644 --- a/client_example/05_layernorm/layernorm2d.cpp +++ b/client_example/05_layernorm/layernorm2d.cpp @@ -81,8 +81,8 @@ int main(int argc, char* argv[]) auto argument_ptr = op_ptr->MakeArgumentPointer({M, N}, // lengths {Stride, 1}, // xStrides - {1}, // gammaStrides - {1}, // betaStrides + {0, 1}, // gammaStrides + {0, 1}, // betaStrides {Stride, 1}, // yStrides {1}, // reduceDims 1e-4, diff --git
a/example/27_layernorm/layernorm_blockwise.cpp b/example/27_layernorm/layernorm_blockwise.cpp index 7166cae5d3..6e8679cbe1 100644 --- a/example/27_layernorm/layernorm_blockwise.cpp +++ b/example/27_layernorm/layernorm_blockwise.cpp @@ -29,24 +29,27 @@ using PassThrough = ck::tensor_operation::element_wise::PassThrough; constexpr int Rank = 2; constexpr int NumReduceDim = 1; -using DeviceInstance = ck::tensor_operation::device::DeviceLayernormImpl; // OutScalarPerVector +using DeviceInstance = + ck::tensor_operation::device::DeviceLayernormImpl; // OutScalarPerVector int main() { @@ -88,8 +91,8 @@ int main() auto argument_ptr = device_instance.MakeArgumentPointer( {M, N}, std::vector{x.mDesc.GetStrides().begin(), x.mDesc.GetStrides().end()}, - std::vector{gamma.mDesc.GetStrides().begin(), gamma.mDesc.GetStrides().end()}, - std::vector{beta.mDesc.GetStrides().begin(), beta.mDesc.GetStrides().end()}, + {0, 1}, + {0, 1}, std::vector{y.mDesc.GetStrides().begin(), y.mDesc.GetStrides().end()}, {1}, 1e-4, diff --git a/example/42_groupnorm/CMakeLists.txt b/example/42_groupnorm/CMakeLists.txt new file mode 100644 index 0000000000..c3b7b82592 --- /dev/null +++ b/example/42_groupnorm/CMakeLists.txt @@ -0,0 +1 @@ +add_example_executable(example_groupnorm_sigmoid_fp16 groupnorm_sigmoid_fp16.cpp) diff --git a/example/42_groupnorm/groupnorm_sigmoid_fp16.cpp b/example/42_groupnorm/groupnorm_sigmoid_fp16.cpp new file mode 100644 index 0000000000..e05b02ad18 --- /dev/null +++ b/example/42_groupnorm/groupnorm_sigmoid_fp16.cpp @@ -0,0 +1,172 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#include +#include +#include +#include +#include + +#include "ck/ck.hpp" +#include "ck/utility/reduction_enums.hpp" +#include "ck/tensor_operation/gpu/device/device_layernorm_impl.hpp" +#include "ck/tensor_operation/gpu/device/reduction_operator_mapping.hpp" + +#include "ck/library/utility/fill.hpp" +#include "ck/library/utility/check_err.hpp" +#include "ck/library/utility/device_memory.hpp" +#include "ck/library/utility/host_common_util.hpp" +#include "ck/library/utility/host_tensor.hpp" +#include "ck/library/utility/host_tensor_generator.hpp" +#include "ck/library/reference_tensor_operation/cpu/reference_groupnorm.hpp" + +constexpr int Rank = 5; +constexpr int NumReduceDim = 3; + +using XDataType = ck::half_t; +using GammaDataType = ck::half_t; +using BetaDataType = ck::half_t; +using YDataType = ck::half_t; +using AccDataType = float; + +struct YElementOp +{ + template + __host__ __device__ void operator()(T& y, const T& x) const + { + static_assert(ck::is_same::value || ck::is_same::value || + ck::is_same::value, + "Data type is not supported by this operation!"); + + T a; + + ck::tensor_operation::element_wise::Sigmoid{}(a, x); + + y = x * a; + }; +}; + +using DeviceInstance = + ck::tensor_operation::device::DeviceLayernormImpl; // OutScalarPerVector + +int main(int argc, char* argv[]) +{ + ck::index_t N = 128; + ck::index_t H = 16; + ck::index_t W = 16; + ck::index_t G = 32; + ck::index_t C = 40; + + if(argc == 1) + { + // use default case + } + else if(argc == 6) + { + N = std::stoi(argv[1]); + H = std::stoi(argv[2]); + W = std::stoi(argv[3]); + G = std::stoi(argv[4]); + C = std::stoi(argv[5]); + } + else + { + std::cerr << "arg1 to 5: N, H, W, G, C" << std::endl; + + return 1; + } + + Tensor x({N, H, W, G, C}); + Tensor y({N, H, W, G, C}); + Tensor gamma({G, C}); + Tensor beta({G, C}); + + ck::utils::FillUniformDistribution{0.f, 1.f}(x.begin(), 
x.end()); + ck::utils::FillUniformDistribution{0.f, 1.f}(gamma.begin(), gamma.end()); + ck::utils::FillUniformDistribution{0.f, 1.f}(beta.begin(), beta.end()); + + DeviceMem x_dev(sizeof(XDataType) * x.mDesc.GetElementSpaceSize()); + DeviceMem gamma_dev(sizeof(GammaDataType) * gamma.mDesc.GetElementSpaceSize()); + DeviceMem beta_dev(sizeof(BetaDataType) * beta.mDesc.GetElementSpaceSize()); + DeviceMem y_dev(sizeof(YDataType) * y.mDesc.GetElementSpaceSize()); + + x_dev.ToDevice(x.mData.data()); + gamma_dev.ToDevice(gamma.mData.data()); + beta_dev.ToDevice(beta.mData.data()); + + const auto y_element_op = YElementOp{}; + + auto device_instance = DeviceInstance{}; + auto argument_ptr = device_instance.MakeArgumentPointer( + {N, H, W, G, C}, + std::vector{x.mDesc.GetStrides().begin(), x.mDesc.GetStrides().end()}, + {0, 0, 0, C, 1}, + {0, 0, 0, C, 1}, + std::vector{y.mDesc.GetStrides().begin(), y.mDesc.GetStrides().end()}, + {1, 2, 4}, // reduction dimension: [H, W, C] + 1e-6, + x_dev.GetDeviceBuffer(), + gamma_dev.GetDeviceBuffer(), + beta_dev.GetDeviceBuffer(), + y_dev.GetDeviceBuffer(), + y_element_op); + + if(!device_instance.IsSupportedArgument(argument_ptr.get())) + { + std::cout << "The runtime parameters are not supported" << std::endl; + return 1; + }; + + auto invoker_ptr = device_instance.MakeInvokerPointer(); + float ave_time = invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, true, true}); + + std::size_t num_btype = sizeof(XDataType) * N * H * W * G * C + + sizeof(YDataType) * N * H * W * G * C + sizeof(GammaDataType) * G * C + + sizeof(BetaDataType) * G * C; + + float gb_per_sec = num_btype / 1.E6 / ave_time; + + std::cout << "Perf: " << ave_time << " ms, " << gb_per_sec << " GB/s, " + << device_instance.GetTypeString() << std::endl; + + bool pass = true; + { + Tensor host_y({N, H, W, G, C}); + using ReferenceInstance = ck::tensor_operation::host::ReferenceGroupnorm; + + ReferenceInstance ref; + auto ref_argument = + ref.MakeArgument(x, gamma, beta, host_y, y_element_op, {N, H, W, G, C}, 1e-6); + auto ref_invoker = ref.MakeInvoker(); + ref_invoker.Run(ref_argument); + + y_dev.FromDevice(y.mData.data()); + pass &= ck::utils::check_err(y.mData, host_y.mData, "Error: Incorrect results", 1e-3, 1e-3); + } + + return (pass ? 
0 : 1); +} diff --git a/include/ck/tensor_operation/gpu/device/device_layernorm_impl.hpp b/include/ck/tensor_operation/gpu/device/device_layernorm_impl.hpp index 7852209c3a..4b89d3eacf 100644 --- a/include/ck/tensor_operation/gpu/device/device_layernorm_impl.hpp +++ b/include/ck/tensor_operation/gpu/device/device_layernorm_impl.hpp @@ -23,11 +23,10 @@ template + typename GridDesc_M_K> __global__ void kernel_layernorm(const GridDesc_M_K x_grid_desc_m_k, - const GridDesc_K gamma_grid_desc_k, - const GridDesc_K beta_grid_desc_k, + const GridDesc_M_K gamma_grid_desc_m_k, + const GridDesc_M_K beta_grid_desc_m_k, const GridDesc_M_K y_grid_desc_m_k, index_t num_k_block_tile_iteration, AccDataType epsilon, @@ -38,8 +37,8 @@ __global__ void kernel_layernorm(const GridDesc_M_K x_grid_desc_m_k, const AccElementwiseOperation acc_elementwise_op) { GridwiseReduction::Run(x_grid_desc_m_k, - gamma_grid_desc_k, - beta_grid_desc_k, + gamma_grid_desc_m_k, + beta_grid_desc_m_k, y_grid_desc_m_k, num_k_block_tile_iteration, epsilon, @@ -71,7 +70,9 @@ template struct DeviceLayernormImpl : public DeviceLayernorm { static_assert( - (KThreadSliceSize % GammaSrcVectorSize == 0), + ((GammaSrcVectorDim == 0 && MThreadSliceSize % GammaSrcVectorSize == 0) || + (GammaSrcVectorDim == 1 && KThreadSliceSize % GammaSrcVectorSize == 0)), "Invalid thread slice sizes and/or gamma vector sizes configuration, please check!"); static_assert( - (KThreadSliceSize % BetaSrcVectorSize == 0), + ((BetaSrcVectorDim == 0 && MThreadSliceSize % BetaSrcVectorSize == 0) || + (BetaSrcVectorDim == 1 && KThreadSliceSize % BetaSrcVectorSize == 0)), "Invalid thread slice sizes and/or beta vector sizes configuration, please check!"); using PassThrough = tensor_operation::element_wise::PassThrough; @@ -162,38 +165,7 @@ struct DeviceLayernormImpl : public DeviceLayernorm& Lengths, - const std::vector& Strides, - int blkGroupSize, - int numBlockTileIteration) - { - const auto tupleLengths = make_tuple_from_array(Lengths, Number{}); - const auto tupleStrides = make_tuple_from_array(Strides, Number{}); - - auto desc = make_naive_tensor_descriptor(tupleLengths, tupleStrides); - - auto grid_desc_k = transform_tensor_descriptor( - desc, - make_tuple(make_merge_transform(tupleLengths)), - make_tuple(typename arithmetic_sequence_gen<0, NumReduceDim, 1>::type{}), - make_tuple(Sequence<0>{})); - - const auto reduceTotalLength = grid_desc_k.GetLength(Number<0>{}); - const int reduceSizePerBlock = K_BlockTileSize * numBlockTileIteration; - - const auto Pad_K = reduceSizePerBlock * blkGroupSize - reduceTotalLength; - - auto grid_desc_k_padded = transform_tensor_descriptor( - grid_desc_k, - make_tuple(make_right_pad_transform(reduceTotalLength, Pad_K)), - make_tuple(Sequence<0>{}), - make_tuple(Sequence<0>{})); - - return (grid_desc_k_padded); - }; - using GridDesc_M_K = decltype(MakeSrc2dDescriptor({1}, {1}, 1, 1)); - using GridDesc_K = decltype(MakeAffine1dDescriptor({1}, {1}, 1, 1)); using GridwiseReduceLayernormGeneric = GridwiseLayernormWelfordVariance_mk_to_mk; - using GridwiseReduceLayernormSweepOnce = GridwiseLayernormWelfordVariance_mk_to_mk(lengths, reduceDims); - xStrides_ = shuffle_tensor_dimensions(xStrides, reduceDims); - yStrides_ = shuffle_tensor_dimensions(yStrides, reduceDims); + Lengths_ = shuffle_tensor_dimensions(lengths, reduceDims); + xStrides_ = shuffle_tensor_dimensions(xStrides, reduceDims); + yStrides_ = shuffle_tensor_dimensions(yStrides, reduceDims); + gammaStrides_ = shuffle_tensor_dimensions(gammaStrides, reduceDims); + betaStrides_ 
= shuffle_tensor_dimensions(betaStrides, reduceDims); long_index_t invariant_total_length; long_index_t reduce_total_length; @@ -278,12 +251,17 @@ struct DeviceLayernormImpl : public DeviceLayernorm{}) <= KThreadClusterSize * KThreadSliceSize; } AccDataType epsilon_; @@ -295,7 +273,6 @@ struct DeviceLayernormImpl : public DeviceLayernorm Lengths_; std::vector xStrides_; - std::vector reduceLengths_; std::vector gammaStrides_; std::vector betaStrides_; std::vector yStrides_; @@ -305,46 +282,35 @@ struct DeviceLayernormImpl : public DeviceLayernorm{}) <= KThreadClusterSize * KThreadSliceSize; - - const auto kernel_main = sweep_once ? kernel_layernorm - : kernel_layernorm; + const auto kernel_main = arg.isSweeponce_ + ? kernel_layernorm + : kernel_layernorm; float avg_time = 0; avg_time += launch_and_time_kernel(stream_config, @@ -352,10 +318,10 @@ struct DeviceLayernormImpl : public DeviceLayernormgammaStrides_.size() != NumReduceDim || - p_arg_->betaStrides_.size() != NumReduceDim) - return false; + // if fastest dim is not reduced + if constexpr(GammaSrcVectorDim == 0) + { + if(p_arg_->gammaStrides_[NumInvariantDim - 1] != 1) + return (false); - auto IsScalarPerVectorValid = [](bool isLastDimensionCoalesced, int scalarPerVector) { - bool ret = true; + if(p_arg_->Lengths_[Rank - 1] % GammaSrcVectorSize != 0) + return (false); + } + else // if fastest dim is reduced + { + if(p_arg_->gammaStrides_[Rank - 1] != 1) + return (false); - if(!isLastDimensionCoalesced) - ret = scalarPerVector == 1; - else - ret = KThreadSliceSize % scalarPerVector == 0; + if(p_arg_->Lengths_[Rank - 1] % GammaSrcVectorSize != 0) + return (false); + } - return ret; - }; + // if fastest dim is not reduced + if constexpr(BetaSrcVectorDim == 0) + { + if(p_arg_->betaStrides_[NumInvariantDim - 1] != 1) + return (false); - if(!IsScalarPerVectorValid(p_arg_->gammaStrides_.back() == 1, GammaSrcVectorSize)) - return false; + if(p_arg_->invariant_lowest_length % BetaSrcVectorSize != 0) + return (false); + } + else // if fastest dim is reduced + { + if(p_arg_->betaStrides_[Rank - 1] != 1) + return (false); - if(!IsScalarPerVectorValid(p_arg_->betaStrides_.back() == 1, BetaSrcVectorSize)) - return false; + if(p_arg_->Lengths_[Rank - 1] % BetaSrcVectorSize != 0) + return (false); + } return true; }; diff --git a/include/ck/tensor_operation/gpu/element/unary_element_wise_operation.hpp b/include/ck/tensor_operation/gpu/element/unary_element_wise_operation.hpp index bcbce5bc41..699b05fe3c 100644 --- a/include/ck/tensor_operation/gpu/element/unary_element_wise_operation.hpp +++ b/include/ck/tensor_operation/gpu/element/unary_element_wise_operation.hpp @@ -232,6 +232,21 @@ struct Gelu } }; +struct Sigmoid +{ + template + __host__ __device__ void operator()(T& y, const T& x) const + { + static_assert(is_same::value || is_same::value || + is_same::value, + "Data type is not supported by this operation!"); + + y = 1 / (ck::type_convert(1) + exp(-x)); + }; + + int32_t divider_ = 1; +}; + } // namespace element_wise } // namespace tensor_operation } // namespace ck diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_layernorm_naive_variance.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_layernorm_naive_variance.hpp index 99061328b6..f90739eaec 100644 --- a/include/ck/tensor_operation/gpu/grid/gridwise_layernorm_naive_variance.hpp +++ b/include/ck/tensor_operation/gpu/grid/gridwise_layernorm_naive_variance.hpp @@ -22,7 +22,6 @@ template {}; static constexpr auto I1 = Number<1>{}; + static constexpr auto I2 = Number<2>{}; static 
constexpr index_t M_BlockTileSize = MThreadClusterSize * MThreadSliceSize; static constexpr index_t K_BlockTileSize = KThreadClusterSize * KThreadSliceSize; __device__ static void Run(const GridDesc_M_K& x_grid_desc_m_k, - const GridDesc_K& gamma_grid_desc_k, - const GridDesc_K& beta_grid_desc_k, + const GridDesc_M_K& gamma_grid_desc_m_k, + const GridDesc_M_K& beta_grid_desc_m_k, const GridDesc_M_K& y_grid_desc_m_k, index_t num_k_block_tile_iteration, AccDataType epsilon, @@ -111,11 +113,14 @@ struct GridwiseLayernormNaiveVariance_mk_to_mk StaticBuffer x_thread_buf; - StaticBuffer gamma_thread_buf; - - StaticBuffer& beta_thread_buf = + StaticBuffer gamma_thread_buf; + StaticBuffer& beta_thread_buf = gamma_thread_buf; + StaticBuffer y_thread_buf; @@ -127,7 +132,7 @@ struct GridwiseLayernormNaiveVariance_mk_to_mk StaticBuffer mean_thread_buf; StaticBuffer mean_square_thread_buf; - StaticBuffer& var_value_buf = + StaticBuffer& var_thread_buf = mean_square_thread_buf; static_for<0, MThreadSliceSize, 1>{}([&](auto I) { @@ -145,11 +150,8 @@ struct GridwiseLayernormNaiveVariance_mk_to_mk const auto thread_k_cluster_id = thread_cluster_idx[I1]; using ThreadBufferLengths_M_K = Sequence; - using ThreadBufferLengths_K = Sequence; constexpr auto thread_buffer_desc_m_k = make_naive_tensor_descriptor_packed( make_tuple(Number{}, Number{})); - constexpr auto thread_buffer_desc_k = - make_naive_tensor_descriptor_packed(make_tuple(Number{})); auto threadwise_x_load = ThreadwiseTensorSliceTransfer_v2, - 0, + GridDesc_M_K, + decltype(thread_buffer_desc_m_k), + ThreadBufferLengths_M_K, + ThreadBufferDimAccessOrder, + GammaSrcVectorDim, GammaSrcVectorSize, 1, true>( - gamma_grid_desc_k, make_multi_index(thread_k_cluster_id * KThreadSliceSize)); + gamma_grid_desc_m_k, + make_multi_index(block_global_id * M_BlockTileSize + + thread_m_cluster_id * MThreadSliceSize, + thread_k_cluster_id * KThreadSliceSize)); - auto threadwise_beta_load = ThreadwiseTensorSliceTransfer_v2, - 0, - BetaSrcVectorSize, - 1, - true>( - beta_grid_desc_k, make_multi_index(thread_k_cluster_id * KThreadSliceSize)); + auto threadwise_beta_load = + ThreadwiseTensorSliceTransfer_v2( + beta_grid_desc_m_k, + make_multi_index(block_global_id * M_BlockTileSize + + thread_m_cluster_id * MThreadSliceSize, + thread_k_cluster_id * KThreadSliceSize)); auto threadwise_y_store = ThreadwiseTensorSliceTransfer_v1r3( - p_gamma_global, gamma_grid_desc_k.GetElementSpaceSize()); + p_gamma_global, gamma_grid_desc_m_k.GetElementSpaceSize()); const auto beta_global_val_buf = make_dynamic_buffer( - p_beta_global, beta_grid_desc_k.GetElementSpaceSize()); + p_beta_global, beta_grid_desc_m_k.GetElementSpaceSize()); // E(x), E[x^2], var(x) - int reduce_length = x_grid_desc_m_k.GetTransforms()[I0].GetUpperLengths()[I1]; + // FIXME: Should not hack the transform from deviceOP + int reduce_length = x_grid_desc_m_k.GetTransforms()[I2].GetUpperLengths()[I0]; index_t reducedTiles = 0; do @@ -271,17 +278,16 @@ struct GridwiseLayernormNaiveVariance_mk_to_mk mean_square_thread_buf(I) = mean_square_thread_buf(I) / reduce_length; // var(x) = E[x^2] - E[x]^2 - var_value_buf(I) = + var_thread_buf(I) = mean_square_thread_buf(I) - (mean_thread_buf(I) * mean_thread_buf(I)); }); // y = (x - E[x]) / sqrt(var[x] + epsilon) auto thread_copy_tail_m_k = (num_k_block_tile_iteration - 1) * thread_copy_fwd_step_m_k; - auto thread_copy_tail_k = (num_k_block_tile_iteration - 1) * thread_copy_fwd_step_k; threadwise_x_load.MoveSrcSliceWindow(x_grid_desc_m_k, thread_copy_bwd_step_m_k); - 
threadwise_gamma_load.MoveSrcSliceWindow(gamma_grid_desc_k, thread_copy_tail_k); - threadwise_beta_load.MoveSrcSliceWindow(beta_grid_desc_k, thread_copy_tail_k); + threadwise_gamma_load.MoveSrcSliceWindow(gamma_grid_desc_m_k, thread_copy_tail_m_k); + threadwise_beta_load.MoveSrcSliceWindow(beta_grid_desc_m_k, thread_copy_tail_m_k); threadwise_y_store.MoveDstSliceWindow(y_grid_desc_m_k, thread_copy_tail_m_k); reducedTiles = 0; @@ -296,10 +302,10 @@ struct GridwiseLayernormNaiveVariance_mk_to_mk x_thread_buf); } - threadwise_gamma_load.Run(gamma_grid_desc_k, + threadwise_gamma_load.Run(gamma_grid_desc_m_k, gamma_global_val_buf, - thread_buffer_desc_k, - make_tuple(I0), + thread_buffer_desc_m_k, + make_tuple(I0, I0), gamma_thread_buf); static_for<0, MThreadSliceSize, 1>{}([&](auto iM) { @@ -307,23 +313,21 @@ struct GridwiseLayernormNaiveVariance_mk_to_mk constexpr auto offset_m_k = thread_buffer_desc_m_k.CalculateOffset(make_tuple(iM, iK)); - constexpr auto offset_k = thread_buffer_desc_k.CalculateOffset(make_tuple(iK)); - // normalize y_thread_buf(Number{}) = (x_thread_buf(Number{}) - mean_thread_buf(iM)) / - sqrt(var_value_buf(iM) + epsilon); + sqrt(var_thread_buf(iM) + epsilon); // gamma y_thread_buf(Number{}) = - y_thread_buf(Number{}) * gamma_thread_buf(Number{}); + y_thread_buf(Number{}) * gamma_thread_buf(Number{}); }); }); - threadwise_beta_load.Run(beta_grid_desc_k, + threadwise_beta_load.Run(beta_grid_desc_m_k, beta_global_val_buf, - thread_buffer_desc_k, - make_tuple(I0), + thread_buffer_desc_m_k, + make_tuple(I0, I0), beta_thread_buf); static_for<0, MThreadSliceSize, 1>{}([&](auto iM) { @@ -331,11 +335,9 @@ struct GridwiseLayernormNaiveVariance_mk_to_mk constexpr auto offset_m_k = thread_buffer_desc_m_k.CalculateOffset(make_tuple(iM, iK)); - constexpr auto offset_k = thread_buffer_desc_k.CalculateOffset(make_tuple(iK)); - // beta y_thread_buf(Number{}) = - y_thread_buf(Number{}) + beta_thread_buf(Number{}); + y_thread_buf(Number{}) + beta_thread_buf(Number{}); }); }); @@ -346,8 +348,8 @@ struct GridwiseLayernormNaiveVariance_mk_to_mk y_global_val_buf); threadwise_x_load.MoveSrcSliceWindow(x_grid_desc_m_k, thread_copy_bwd_step_m_k); - threadwise_gamma_load.MoveSrcSliceWindow(gamma_grid_desc_k, thread_copy_bwd_step_k); - threadwise_beta_load.MoveSrcSliceWindow(beta_grid_desc_k, thread_copy_bwd_step_k); + threadwise_gamma_load.MoveSrcSliceWindow(gamma_grid_desc_m_k, thread_copy_bwd_step_m_k); + threadwise_beta_load.MoveSrcSliceWindow(beta_grid_desc_m_k, thread_copy_bwd_step_m_k); threadwise_y_store.MoveDstSliceWindow(y_grid_desc_m_k, thread_copy_bwd_step_m_k); ++reducedTiles; diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_layernorm_welford_variance.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_layernorm_welford_variance.hpp index a81c501e61..8d17178649 100644 --- a/include/ck/tensor_operation/gpu/grid/gridwise_layernorm_welford_variance.hpp +++ b/include/ck/tensor_operation/gpu/grid/gridwise_layernorm_welford_variance.hpp @@ -19,7 +19,6 @@ template {}; static constexpr auto I1 = Number<1>{}; + static constexpr auto I2 = Number<2>{}; static constexpr index_t M_BlockTileSize = MThreadClusterSize * MThreadSliceSize; static constexpr index_t K_BlockTileSize = KThreadClusterSize * KThreadSliceSize; @@ -77,7 +79,8 @@ struct GridwiseLayernormWelfordVariance_mk_to_mk __device__ static int GetKPerThread(const GridDesc_M_K& x_grid_desc_m_k, int thread_k_cluster_id) { - int kPerBlock = x_grid_desc_m_k.GetTransforms()[I0].GetUpperLengths()[I1]; + // FIXME: Should not hack the 
transform from deviceOP + int kPerBlock = x_grid_desc_m_k.GetTransforms()[I2].GetUpperLengths()[I0]; int kPerThread = kPerBlock < K_BlockTileSize ? 0 : KThreadSliceSize * (kPerBlock / K_BlockTileSize); int kPerBlockTail = kPerBlock - kPerThread * KThreadClusterSize; @@ -94,8 +97,8 @@ struct GridwiseLayernormWelfordVariance_mk_to_mk } __device__ static void Run(const GridDesc_M_K& x_grid_desc_m_k, - const GridDesc_K& gamma_grid_desc_k, - const GridDesc_K& beta_grid_desc_k, + const GridDesc_M_K& gamma_grid_desc_m_k, + const GridDesc_M_K& beta_grid_desc_m_k, const GridDesc_M_K& y_grid_desc_m_k, index_t num_k_block_tile_iteration, AccDataType epsilon, @@ -116,11 +119,14 @@ struct GridwiseLayernormWelfordVariance_mk_to_mk StaticBuffer x_thread_buf; - StaticBuffer gamma_thread_buf; - - StaticBuffer& beta_thread_buf = + StaticBuffer gamma_thread_buf; + StaticBuffer& beta_thread_buf = gamma_thread_buf; + StaticBuffer y_thread_buf; @@ -137,11 +143,8 @@ struct GridwiseLayernormWelfordVariance_mk_to_mk const auto thread_k_cluster_id = thread_cluster_idx[I1]; using ThreadBufferLengths_M_K = Sequence; - using ThreadBufferLengths_K = Sequence; constexpr auto thread_buffer_desc_m_k = make_naive_tensor_descriptor_packed( make_tuple(Number{}, Number{})); - constexpr auto thread_buffer_desc_k = - make_naive_tensor_descriptor_packed(make_tuple(Number{})); auto threadwise_x_load = ThreadwiseTensorSliceTransfer_v2, - 0, + GridDesc_M_K, + decltype(thread_buffer_desc_m_k), + ThreadBufferLengths_M_K, + ThreadBufferDimAccessOrder, + GammaSrcVectorDim, GammaSrcVectorSize, 1, true>( - gamma_grid_desc_k, make_multi_index(thread_k_cluster_id * KThreadSliceSize)); + gamma_grid_desc_m_k, + make_multi_index(block_global_id * M_BlockTileSize + + thread_m_cluster_id * MThreadSliceSize, + thread_k_cluster_id * KThreadSliceSize)); - auto threadwise_beta_load = ThreadwiseTensorSliceTransfer_v2, - 0, - BetaSrcVectorSize, - 1, - true>( - beta_grid_desc_k, make_multi_index(thread_k_cluster_id * KThreadSliceSize)); + auto threadwise_beta_load = + ThreadwiseTensorSliceTransfer_v2( + beta_grid_desc_m_k, + make_multi_index(block_global_id * M_BlockTileSize + + thread_m_cluster_id * MThreadSliceSize, + thread_k_cluster_id * KThreadSliceSize)); auto threadwise_y_store = ThreadwiseTensorSliceTransfer_v1r3( - p_gamma_global, gamma_grid_desc_k.GetElementSpaceSize()); + p_gamma_global, gamma_grid_desc_m_k.GetElementSpaceSize()); const auto beta_global_val_buf = make_dynamic_buffer( - p_beta_global, beta_grid_desc_k.GetElementSpaceSize()); + p_beta_global, beta_grid_desc_m_k.GetElementSpaceSize()); auto threadwise_welford = ThreadwiseWelford(); threadwise_welford.max_count_ = GetKPerThread(x_grid_desc_m_k, thread_k_cluster_id); @@ -250,11 +257,10 @@ struct GridwiseLayernormWelfordVariance_mk_to_mk }); auto thread_copy_tail_m_k = (num_k_block_tile_iteration - 1) * thread_copy_fwd_step_m_k; - auto thread_copy_tail_k = (num_k_block_tile_iteration - 1) * thread_copy_fwd_step_k; threadwise_x_load.MoveSrcSliceWindow(x_grid_desc_m_k, thread_copy_bwd_step_m_k); - threadwise_gamma_load.MoveSrcSliceWindow(gamma_grid_desc_k, thread_copy_tail_k); - threadwise_beta_load.MoveSrcSliceWindow(beta_grid_desc_k, thread_copy_tail_k); + threadwise_gamma_load.MoveSrcSliceWindow(gamma_grid_desc_m_k, thread_copy_tail_m_k); + threadwise_beta_load.MoveSrcSliceWindow(beta_grid_desc_m_k, thread_copy_tail_m_k); threadwise_y_store.MoveDstSliceWindow(y_grid_desc_m_k, thread_copy_tail_m_k); for(index_t reducedTiles = 0; reducedTiles < num_k_block_tile_iteration; 
++reducedTiles) @@ -268,10 +274,10 @@ struct GridwiseLayernormWelfordVariance_mk_to_mk x_thread_buf); } - threadwise_gamma_load.Run(gamma_grid_desc_k, + threadwise_gamma_load.Run(gamma_grid_desc_m_k, gamma_global_val_buf, - thread_buffer_desc_k, - make_tuple(I0), + thread_buffer_desc_m_k, + make_tuple(I0, I0), gamma_thread_buf); static_for<0, MThreadSliceSize, 1>{}([&](auto iM) { @@ -279,8 +285,6 @@ struct GridwiseLayernormWelfordVariance_mk_to_mk constexpr auto offset_m_k = thread_buffer_desc_m_k.CalculateOffset(make_tuple(iM, iK)); - constexpr auto offset_k = thread_buffer_desc_k.CalculateOffset(make_tuple(iK)); - // normalize y_thread_buf(Number{}) = (x_thread_buf(Number{}) - mean_thread_buf(iM)) / @@ -288,14 +292,14 @@ struct GridwiseLayernormWelfordVariance_mk_to_mk // gamma y_thread_buf(Number{}) = - y_thread_buf(Number{}) * gamma_thread_buf(Number{}); + y_thread_buf(Number{}) * gamma_thread_buf(Number{}); }); }); - threadwise_beta_load.Run(beta_grid_desc_k, + threadwise_beta_load.Run(beta_grid_desc_m_k, beta_global_val_buf, - thread_buffer_desc_k, - make_tuple(I0), + thread_buffer_desc_m_k, + make_tuple(I0, I0), beta_thread_buf); static_for<0, MThreadSliceSize, 1>{}([&](auto iM) { @@ -303,11 +307,9 @@ struct GridwiseLayernormWelfordVariance_mk_to_mk constexpr auto offset_m_k = thread_buffer_desc_m_k.CalculateOffset(make_tuple(iM, iK)); - constexpr auto offset_k = thread_buffer_desc_k.CalculateOffset(make_tuple(iK)); - // beta y_thread_buf(Number{}) = - y_thread_buf(Number{}) + beta_thread_buf(Number{}); + y_thread_buf(Number{}) + beta_thread_buf(Number{}); }); }); @@ -318,8 +320,8 @@ struct GridwiseLayernormWelfordVariance_mk_to_mk y_global_val_buf); threadwise_x_load.MoveSrcSliceWindow(x_grid_desc_m_k, thread_copy_bwd_step_m_k); - threadwise_gamma_load.MoveSrcSliceWindow(gamma_grid_desc_k, thread_copy_bwd_step_k); - threadwise_beta_load.MoveSrcSliceWindow(beta_grid_desc_k, thread_copy_bwd_step_k); + threadwise_gamma_load.MoveSrcSliceWindow(gamma_grid_desc_m_k, thread_copy_bwd_step_m_k); + threadwise_beta_load.MoveSrcSliceWindow(beta_grid_desc_m_k, thread_copy_bwd_step_m_k); threadwise_y_store.MoveDstSliceWindow(y_grid_desc_m_k, thread_copy_bwd_step_m_k); } } diff --git a/library/include/ck/library/reference_tensor_operation/cpu/reference_groupnorm.hpp b/library/include/ck/library/reference_tensor_operation/cpu/reference_groupnorm.hpp new file mode 100644 index 0000000000..fedd4dce62 --- /dev/null +++ b/library/include/ck/library/reference_tensor_operation/cpu/reference_groupnorm.hpp @@ -0,0 +1,191 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
+ +#pragma once + +#include +#include +#include +#include + +#include "ck/tensor_operation/gpu/device/device_base.hpp" +#include "ck/library/utility/host_tensor.hpp" +#include "ck/library/utility/host_tensor_generator.hpp" + +namespace ck { +namespace tensor_operation { +namespace host { + +template +struct ReferenceGroupnorm : public device::BaseOperator +{ + // x = [N, H, W, G, C] + // y = [N, H, W, G, C] + // reduce dim [H, W, C], mean, var = [N, G] + // gamma, beta = [G, C] + // beta: [G, C] + struct Argument : public device::BaseArgument + { + Argument(const Tensor& x, + const Tensor& gamma, + const Tensor& beta, + Tensor& y, + AccElementwiseOperation acc_elementwise_op, + const std::vector lengths, + AccDataType epsilon) + : x_(x), + gamma_(gamma), + beta_(beta), + y_(y), + acc_elementwise_op_(acc_elementwise_op), + lengths_(lengths), + epsilon_(epsilon) + { + } + + const Tensor x_; + const Tensor gamma_; + const Tensor beta_; + Tensor& y_; + AccElementwiseOperation acc_elementwise_op_; + std::vector lengths_; + AccDataType epsilon_; + }; + + // Invoker + struct Invoker : public device::BaseInvoker + { + float Run(const Argument& arg) + { + int N = arg.lengths_[0]; + int H = arg.lengths_[1]; + int W = arg.lengths_[2]; + int G = arg.lengths_[3]; + int C = arg.lengths_[4]; + + Tensor mean({N, G}); + Tensor var({N, G}); + + // Compute mean & var in [H, W, C] by Welford Algorithm + // TODO - parallel for each HWC + // TODO - address calculation + for(int n = 0; n < N; ++n) + { + for(int g = 0; g < G; ++g) + { + AccDataType mean_val = type_convert(0.0f); + AccDataType var_val = type_convert(0.0f); + int32_t curr_count = 0; + + for(int h = 0; h < H; ++h) + { + for(int w = 0; w < W; ++w) + { + for(int c = 0; c < C; ++c) + { + curr_count++; + AccDataType x = type_convert(arg.x_(n, h, w, g, c)); + AccDataType delta = x - mean_val; + mean_val += delta / curr_count; + AccDataType delta2 = x - mean_val; + var_val += delta * delta2; + } + } + } + + mean(n, g) = mean_val; + var(n, g) = var_val / curr_count; + } + } + + // Normalization + for(int n = 0; n < N; ++n) + { + for(int h = 0; h < H; ++h) + { + for(int w = 0; w < W; ++w) + { + for(int g = 0; g < G; ++g) + { + for(int c = 0; c < C; ++c) + { + AccDataType x = type_convert(arg.x_(n, h, w, g, c)); + AccDataType gamma = type_convert(arg.gamma_(g, c)); + AccDataType beta = type_convert(arg.beta_(g, c)); + AccDataType mean_val = type_convert(mean(n, g)); + AccDataType var_val = type_convert(var(n, g)); + AccDataType y = gamma * (x - mean_val) / + ck::math::sqrt(arg.epsilon_ + var_val) + + beta; + arg.acc_elementwise_op_(y, y); + arg.y_(n, h, w, g, c) = type_convert(y); + } + } + } + } + } + + return 0; + } + + float Run(const device::BaseArgument* p_arg, + const StreamConfig& /* stream_config */ = StreamConfig{}) override + { + return Run(*dynamic_cast(p_arg)); + } + }; + + static constexpr bool IsValidCompilationParameter() + { + // TODO: properly implement this check + return true; + } + + bool IsSupportedArgument(const device::BaseArgument* p_arg) override + { + const Argument* p_arg_ = dynamic_cast(p_arg); + if(p_arg_->lengths_.size() != 5) + return false; + + return true; + } + + static auto MakeArgument(const Tensor& x, + const Tensor& gamma, + const Tensor& beta, + Tensor& y, + AccElementwiseOperation acc_elementwise_op, + const std::vector lengths, + AccDataType epsilon) + { + return Argument{x, gamma, beta, y, acc_elementwise_op, lengths, epsilon}; + } + + static auto MakeInvoker() { return Invoker{}; } + + virtual std::unique_ptr 
MakeInvokerPointer() + { + return std::make_unique(Invoker{}); + } + + std::string GetTypeString() const override + { + auto str = std::stringstream(); + + // clang-format off + str << "ReferenceLayernorm" + << std::endl; + // clang-format on + + return str.str(); + } +}; + +} // namespace host +} // namespace tensor_operation +} // namespace ck diff --git a/library/include/ck/library/tensor_operation_instance/gpu/layernorm.hpp b/library/include/ck/library/tensor_operation_instance/gpu/layernorm.hpp index a73c8c5c43..ae60038163 100644 --- a/library/include/ck/library/tensor_operation_instance/gpu/layernorm.hpp +++ b/library/include/ck/library/tensor_operation_instance/gpu/layernorm.hpp @@ -17,17 +17,25 @@ namespace tensor_operation { namespace device { namespace instance { -void add_device_layernorm_f16_rank2_instances( - std::vector>&); +// FP16 +void add_device_layernorm_rank_2_1_f16_instances( + std::vector>>&); -void add_device_layernorm_f16_rank4_instances( - std::vector>&); +void add_device_layernorm_rank_4_3_f16_instances( + std::vector>>&); -void add_device_layernorm_f32_rank2_instances( - std::vector>&); +void add_device_layernorm_rank_5_3_f16_instances( + std::vector>>&); -void add_device_layernorm_f32_rank4_instances( - std::vector>&); +// FP32 +void add_device_layernorm_rank_2_1_f32_instances( + std::vector>>&); + +void add_device_layernorm_rank_4_3_f32_instances( + std::vector>>&); + +void add_device_layernorm_rank_5_3_f32_instances( + std::vector>>&); template && is_same_v) { if constexpr(Rank == 2 && NumReduceDim == 1) - add_device_layernorm_f16_rank2_instances(op_ptrs); + { + add_device_layernorm_rank_2_1_f16_instances(op_ptrs); + } else if constexpr(Rank == 4 && NumReduceDim == 3) - add_device_layernorm_f16_rank4_instances(op_ptrs); + { + add_device_layernorm_rank_4_3_f16_instances(op_ptrs); + } + else if constexpr(Rank == 5 && NumReduceDim == 3) + { + add_device_layernorm_rank_5_3_f16_instances(op_ptrs); + } } else if constexpr(is_same_v && is_same_v && is_same_v && is_same_v) { if constexpr(Rank == 2 && NumReduceDim == 1) - add_device_layernorm_f32_rank2_instances(op_ptrs); + { + add_device_layernorm_rank_2_1_f32_instances(op_ptrs); + } else if constexpr(Rank == 4 && NumReduceDim == 3) - add_device_layernorm_f32_rank4_instances(op_ptrs); + { + add_device_layernorm_rank_4_3_f32_instances(op_ptrs); + } + else if constexpr(Rank == 5 && NumReduceDim == 3) + { + add_device_layernorm_rank_5_3_f32_instances(op_ptrs); + } } return op_ptrs; diff --git a/library/src/tensor_operation_instance/gpu/normalization/device_layernorm_f16_instance.cpp b/library/src/tensor_operation_instance/gpu/normalization/device_layernorm_f16_instance.cpp index ddcde996f7..bf0f7a3d2c 100644 --- a/library/src/tensor_operation_instance/gpu/normalization/device_layernorm_f16_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/normalization/device_layernorm_f16_instance.cpp @@ -17,34 +17,40 @@ using F32 = float; using Pass = ck::tensor_operation::element_wise::PassThrough; -template +template using device_layernorm_f16_instances = std::tuple< // clang-format off - // XDataType, GammaDataType, BetaDataType, AccDataType, YDataType, Rank, NumReduceDim, BlockSize, MThreadClusterSize, KThreadClusterSize, MThreadSliceSize, KThreadSliceSize, XYSrcVectorDim, XSrcVectorSize, GammaSrcVectorSize, BetaSrcVectorSize, YDstVectorSize> - DeviceLayernormImpl, // fallback kernel - DeviceLayernormImpl, // fallback kernel - DeviceLayernormImpl, // fallback kernel - DeviceLayernormImpl, - DeviceLayernormImpl, - 
DeviceLayernormImpl, - DeviceLayernormImpl, - DeviceLayernormImpl, - DeviceLayernormImpl, - DeviceLayernormImpl, - DeviceLayernormImpl + // XDataType, GammaDataType, BetaDataType, AccDataType, YDataType, Rank, NumReduceDim, BlockSize, MThreadClusterSize, KThreadClusterSize, MThreadSliceSize, KThreadSliceSize, XYSrcVectorDim, XSrcVectorSize, GammaSrcVectorDim, GammaSrcVectorSize, BetaSrcVectorDim, BetaSrcVectorSize, YDstVectorSize> + DeviceLayernormImpl, // fallback kernel + DeviceLayernormImpl, // fallback kernel + DeviceLayernormImpl, // fallback kernel + DeviceLayernormImpl, + DeviceLayernormImpl, + DeviceLayernormImpl, + DeviceLayernormImpl, + DeviceLayernormImpl, + DeviceLayernormImpl, + DeviceLayernormImpl, + DeviceLayernormImpl // clang-format on >; -void add_device_layernorm_f16_rank2_instances( - std::vector>& instances) +void add_device_layernorm_rank_2_1_f16_instances( + std::vector>>& instances) { - add_device_operation_instances(instances, device_layernorm_f16_instances<2, 1>{}); + add_device_operation_instances(instances, device_layernorm_f16_instances{}); } -void add_device_layernorm_f16_rank4_instances( - std::vector>& instances) +void add_device_layernorm_rank_4_3_f16_instances( + std::vector>>& instances) { - add_device_operation_instances(instances, device_layernorm_f16_instances<4, 3>{}); + add_device_operation_instances(instances, device_layernorm_f16_instances{}); +} + +void add_device_layernorm_rank_5_3_f16_instances( + std::vector>>& instances) +{ + add_device_operation_instances(instances, device_layernorm_f16_instances{}); } } // namespace instance diff --git a/library/src/tensor_operation_instance/gpu/normalization/device_layernorm_f32_instance.cpp b/library/src/tensor_operation_instance/gpu/normalization/device_layernorm_f32_instance.cpp index 313d876807..1b35f275ad 100644 --- a/library/src/tensor_operation_instance/gpu/normalization/device_layernorm_f32_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/normalization/device_layernorm_f32_instance.cpp @@ -16,33 +16,39 @@ using F32 = float; using Pass = ck::tensor_operation::element_wise::PassThrough; -template +template using device_layernorm_f32_instances = std::tuple< // clang-format off // XDataType, GammaDataType, BetaDataType, AccDataType, YDataType, Rank, NumReduceDim, BlockSize, MThreadClusterSize, KThreadClusterSize, MThreadSliceSize, KThreadSliceSize, XYSrcVectorDim, XSrcVectorSize, GammaSrcVectorSize, BetaSrcVectorSize, YDstVectorSize> - DeviceLayernormImpl, // fallback kernel - DeviceLayernormImpl, // fallback kernel - DeviceLayernormImpl, - DeviceLayernormImpl, - DeviceLayernormImpl, - DeviceLayernormImpl, - DeviceLayernormImpl, - DeviceLayernormImpl, - DeviceLayernormImpl, - DeviceLayernormImpl + DeviceLayernormImpl, // fallback kernel + DeviceLayernormImpl, // fallback kernel + DeviceLayernormImpl, + DeviceLayernormImpl, + DeviceLayernormImpl, + DeviceLayernormImpl, + DeviceLayernormImpl, + DeviceLayernormImpl, + DeviceLayernormImpl, + DeviceLayernormImpl // clang-format on >; -void add_device_layernorm_f32_rank2_instances( - std::vector>& instances) +void add_device_layernorm_rank_2_1_f32_instances( + std::vector>>& instances) { - add_device_operation_instances(instances, device_layernorm_f32_instances<2, 1>{}); + add_device_operation_instances(instances, device_layernorm_f32_instances{}); } -void add_device_layernorm_f32_rank4_instances( - std::vector>& instances) +void add_device_layernorm_rank_4_3_f32_instances( + std::vector>>& instances) { - add_device_operation_instances(instances, 
device_layernorm_f32_instances<4, 3>{}); + add_device_operation_instances(instances, device_layernorm_f32_instances{}); +} + +void add_device_layernorm_rank_5_3_f32_instances( + std::vector>>& instances) +{ + add_device_operation_instances(instances, device_layernorm_f32_instances{}); } } // namespace instance diff --git a/profiler/CMakeLists.txt b/profiler/CMakeLists.txt index e3d950c68a..53a26af890 100644 --- a/profiler/CMakeLists.txt +++ b/profiler/CMakeLists.txt @@ -23,6 +23,7 @@ set(PROFILER_SOURCE src/profile_conv_bwd_weight.cpp src/profile_grouped_conv_fwd.cpp src/profile_reduce.cpp + src/profile_groupnorm.cpp src/profile_layernorm.cpp src/profile_normalization.cpp ) diff --git a/profiler/include/profile_groupnorm_impl.hpp b/profiler/include/profile_groupnorm_impl.hpp new file mode 100644 index 0000000000..44aa1d0e3c --- /dev/null +++ b/profiler/include/profile_groupnorm_impl.hpp @@ -0,0 +1,207 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +#include + +#include "ck/ck.hpp" + +#include "ck/library/tensor_operation_instance/gpu/layernorm.hpp" + +#include "ck/library/utility/check_err.hpp" +#include "ck/library/utility/device_memory.hpp" +#include "ck/library/utility/host_tensor.hpp" +#include "ck/library/utility/host_tensor_generator.hpp" +#include "ck/library/reference_tensor_operation/cpu/reference_groupnorm.hpp" + +namespace ck { +namespace profiler { + +template +bool profile_groupnorm_impl(int do_verification, + int init_method, + bool do_log, + bool time_kernel, + std::vector length) +{ + using PassThrough = ck::tensor_operation::element_wise::PassThrough; + + if(length.size() != 5) + return false; + + index_t G = length[3]; + index_t C = length[4]; + + std::vector reduce_dim = {1, 2, 4}; + std::vector gammaBetaLength = {G, C}; + std::vector gammaBetaStride = {0, 0, 0, C, 1}; + + Tensor x(length); + Tensor gamma(gammaBetaLength); + Tensor beta(gammaBetaLength); + Tensor y(length); + Tensor host_y(length); + + switch(init_method) + { + case 0: + x.GenerateTensorValue(GeneratorTensor_1{}); + gamma.GenerateTensorValue(GeneratorTensor_1{}); + beta.GenerateTensorValue(GeneratorTensor_1{}); + break; + case 1: + x.GenerateTensorValue(GeneratorTensor_2{-5, 5}); + gamma.GenerateTensorValue(GeneratorTensor_2{-5, 5}); + beta.GenerateTensorValue(GeneratorTensor_2{-5, 5}); + break; + default: + x.GenerateTensorValue(GeneratorTensor_3{0, 1}); + gamma.GenerateTensorValue(GeneratorTensor_3{-0.5, 0.5}); + beta.GenerateTensorValue(GeneratorTensor_3{-0.5, 0.5}); + } + + DeviceMem x_dev(sizeof(XDataType) * x.mDesc.GetElementSpaceSize()); + DeviceMem gamma_dev(sizeof(GammaDataType) * gamma.mDesc.GetElementSpaceSize()); + DeviceMem beta_dev(sizeof(BetaDataType) * beta.mDesc.GetElementSpaceSize()); + DeviceMem y_dev(sizeof(YDataType) * y.mDesc.GetElementSpaceSize()); + + x_dev.ToDevice(x.mData.data()); + gamma_dev.ToDevice(gamma.mData.data()); + beta_dev.ToDevice(beta.mData.data()); + + // add device normalization instances + using DeviceOp = ck::tensor_operation::device::DeviceLayernorm; + + // get device op instances + const auto instance_ptrs = + ck::tensor_operation::device::instance::DeviceOperationInstanceFactory< + DeviceOp>::GetInstances(); + + std::cout << "found " << instance_ptrs.size() << " instances" << std::endl; + + std::string best_instance_name; + float best_avg_time = std::numeric_limits::max(); + float best_gb_per_sec = 0; + + if(do_verification) + { + using ReferenceInstance = 
ck::tensor_operation::host::ReferenceGroupnorm; + + ReferenceInstance ref; + auto ref_argument = ref.MakeArgument(x, gamma, beta, host_y, PassThrough{}, length, 1e-6); + auto ref_invoker = ref.MakeInvoker(); + ref_invoker.Run(ref_argument); + } + + int num_kernel = 0; + + for(auto& inst_ptr : instance_ptrs) + { + auto argument_ptr = inst_ptr->MakeArgumentPointer( + length, + std::vector{x.mDesc.GetStrides().begin(), x.mDesc.GetStrides().end()}, + gammaBetaStride, + gammaBetaStride, + std::vector{y.mDesc.GetStrides().begin(), y.mDesc.GetStrides().end()}, + reduce_dim, + 1e-6, + x_dev.GetDeviceBuffer(), + gamma_dev.GetDeviceBuffer(), + beta_dev.GetDeviceBuffer(), + y_dev.GetDeviceBuffer(), + PassThrough{}); + + if(inst_ptr->IsSupportedArgument(argument_ptr.get())) + { + ++num_kernel; + } + else + { + continue; + } + + auto invoker_ptr = inst_ptr->MakeInvokerPointer(); + + float avg_time = invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, time_kernel}); + + std::size_t num_bytes = x.mDesc.GetElementSize() * sizeof(XDataType) + + gamma.mDesc.GetElementSize() * sizeof(GammaDataType) + + beta.mDesc.GetElementSize() * sizeof(BetaDataType) + + y.mDesc.GetElementSize() * sizeof(YDataType); + + float gb_per_sec = num_bytes / 1.E6 / avg_time; + + if(time_kernel) + std::cout << "Perf: " << std::setw(10) << avg_time << " ms, " << gb_per_sec << " GB/s, " + << inst_ptr->GetTypeString() << std::endl; + + if(avg_time < best_avg_time) + { + best_instance_name = inst_ptr->GetTypeString(); + best_avg_time = avg_time; + best_gb_per_sec = gb_per_sec; + } + + if(do_verification) + { + y_dev.FromDevice(y.mData.data()); + + bool pass = + ck::utils::check_err(y.mData, host_y.mData, "Error: Incorrect results", 1e-3, 1e-3); + + if(do_log) + { + LogRangeAsType(std::cout << "x : ", x.mData, ",") << std::endl; + LogRangeAsType(std::cout << "host_y : ", host_y.mData, ",") << std::endl; + LogRangeAsType(std::cout << "y : ", y.mData, ",") << std::endl; + } + + if(!pass) + { + std::cout << inst_ptr->GetTypeString() << " failed verification: "; + LogRange(std::cout << "lengths = [", length, ", ") << "]." 
<< std::endl; + return false; + } + else + { + if(time_kernel) + std::cout << "pass" << std::endl; + } + } + } + + if(time_kernel) + { + LogRange(std::cout << "length = ", length, ",") << ", "; + std::cout << "num_kernel = " << num_kernel << ", best perf = " << best_avg_time << " ms, " + << best_gb_per_sec << " GB/s, " << best_instance_name << std::endl; + } + + if(num_kernel == 0) + { + std::cout << "Error: No kernel is tested" << std::endl; + return false; + } + + return true; +} + +} // namespace profiler +} // namespace ck diff --git a/profiler/include/profile_layernorm_impl.hpp b/profiler/include/profile_layernorm_impl.hpp index b5d994c129..b0b4a73ab8 100644 --- a/profiler/include/profile_layernorm_impl.hpp +++ b/profiler/include/profile_layernorm_impl.hpp @@ -6,8 +6,8 @@ #include #include "ck/ck.hpp" -#include "profiler/include/data_type_enum.hpp" -#include "ck/tensor_operation/gpu/device/device_layernorm_impl.hpp" + +#include "ck/library/tensor_operation_instance/gpu/layernorm.hpp" #include "ck/library/utility/check_err.hpp" #include "ck/library/utility/device_memory.hpp" @@ -15,26 +15,6 @@ #include "ck/library/utility/host_tensor_generator.hpp" #include "ck/library/reference_tensor_operation/cpu/reference_layernorm.hpp" -namespace ck { -namespace tensor_operation { -namespace device { -namespace instance { - -using F16 = ck::half_t; -using F32 = float; -using PassThrough = ck::tensor_operation::element_wise::PassThrough; - -void add_device_layernorm_f16_rank2_instances( - std::vector>&); - -void add_device_layernorm_f32_rank2_instances( - std::vector>&); - -} // namespace instance -} // namespace device -} // namespace tensor_operation -} // namespace ck - namespace ck { namespace profiler { @@ -53,8 +33,6 @@ void profile_layernorm_impl(int do_verification, std::vector strideGamma, std::vector strideBeta) { - using F16 = ck::half_t; - using F32 = float; using PassThrough = ck::tensor_operation::element_wise::PassThrough; if(length.size() < 2) @@ -103,37 +81,24 @@ void profile_layernorm_impl(int do_verification, gamma_dev.ToDevice(gamma.mData.data()); beta_dev.ToDevice(beta.mData.data()); - // add device normalization instances constexpr int NumReduceDim = Rank - 1; - std::vector> - instances; - if constexpr(is_same::value && is_same::value && - is_same::value && is_same::value && - is_same::value) - { - if(length.size() == 2) - tensor_operation::device::instance::add_device_layernorm_f16_rank2_instances(instances); - } - else if constexpr(is_same::value && is_same::value && - is_same::value && is_same::value && - is_same::value) - { - if(length.size() == 2) - tensor_operation::device::instance::add_device_layernorm_f32_rank2_instances(instances); - } + // add device normalization instances + using DeviceOp = ck::tensor_operation::device::DeviceLayernorm; - if(instances.size() <= 0) - { - throw std::runtime_error("wrong! 
no device normalization instance found"); - } + // get device op instances + const auto instance_ptrs = + ck::tensor_operation::device::instance::DeviceOperationInstanceFactory< + DeviceOp>::GetInstances(); + + std::cout << "found " << instance_ptrs.size() << " instances" << std::endl; std::string best_instance_name; float best_avg_time = std::numeric_limits::max(); @@ -157,7 +122,7 @@ void profile_layernorm_impl(int do_verification, ref_invoker.Run(ref_argument); } - for(auto& inst_ptr : instances) + for(auto& inst_ptr : instance_ptrs) { auto argument_ptr = inst_ptr->MakeArgumentPointer(length, strideXY, @@ -175,9 +140,9 @@ void profile_layernorm_impl(int do_verification, if(!inst_ptr->IsSupportedArgument(argument_ptr.get())) { std::cout << inst_ptr->GetTypeString() << " skipped due to unsupported argument: "; - LogRange(std::cout << "input lengths = [", length, "], ") << std::endl; + LogRange(std::cout << "input lengths = ", length, ", ") << std::endl; - return; + continue; } auto invoker_ptr = inst_ptr->MakeInvokerPointer(); diff --git a/profiler/src/profile_groupnorm.cpp b/profiler/src/profile_groupnorm.cpp new file mode 100644 index 0000000000..7eeaca7d45 --- /dev/null +++ b/profiler/src/profile_groupnorm.cpp @@ -0,0 +1,106 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#include +#include +#include + +#include "profiler/include/data_type_enum.hpp" +#include "profiler/include/profile_groupnorm_impl.hpp" + +using ck::index_t; + +struct GroupnormArgParser +{ + std::unordered_map> long_opts = {{"length", {}}}; + + bool parse_opt(int argc, char* argv[], const std::string& key, int i) + { + if(std::string("--") + key == argv[i]) + { + int pos = i; + while(++i < argc && argv[i][0] != '-') {} + int end = i; + for(int j = pos + 1; j < end; j++) + { + long_opts[key].push_back(std::stoi(argv[j])); + } + return true; + } + return false; + } + + void operator()(int argc, char* argv[]) + { + for(auto& kv : long_opts) + { + for(int i = 1; i < argc; i++) + { + if(parse_opt(argc, argv, kv.first, i)) + break; + } + } + } +}; + +void print_help_groupnorm() +{ + std::cout << "arg1: tensor operation (groupnorm: Group normalization)\n" + << "arg2: data type (0: fp16; 1: fp32)\n" + << "arg3: verification (0: no; 1: yes)\n" + << "arg4: initialization (0: no init; 1: integer value; 2: decimal value)\n" + << "arg5: print tensor value (0: no; 1: yes)\n" + << "arg6: time kernel (0=no, 1=yes)\n" + << "--length: tensor extents (e.g, --length 1 16 16 32 40) \n" + << std::endl; +} + +int profile_groupnorm(int argc, char* argv[]) +{ + ck::DataTypeEnum data_type = ck::DataTypeEnum::Half; + bool do_verification = false; + int init_method = 0; + bool do_log = 0; + bool time_kernel = 1; + std::vector length = {64, 16, 16, 32, 40}; + + if(argc != 1 && argc != 13) + { + print_help_groupnorm(); + return 0; + } + + if(argc == 13) + { + data_type = static_cast(std::stoi(argv[2])); + do_verification = std::stoi(argv[3]); + init_method = std::stoi(argv[4]); + do_log = std::stoi(argv[5]); + time_kernel = std::stoi(argv[6]); + + // parse the long options + GroupnormArgParser arg_parser; + arg_parser(argc, argv); + length = arg_parser.long_opts["length"]; + } + + using F16 = ck::half_t; + using F32 = float; + + if(data_type == ck::DataTypeEnum::Float) + { + ck::profiler::profile_groupnorm_impl( + do_verification, init_method, do_log, time_kernel, length); + } + else if(data_type == ck::DataTypeEnum::Half) + { + ck::profiler::profile_groupnorm_impl( + 
do_verification, init_method, do_log, time_kernel, length); + } + else + { + throw std::runtime_error("not implemented yet"); + } + + return 0; +} diff --git a/profiler/src/profile_layernorm.cpp b/profiler/src/profile_layernorm.cpp index f4cffb33d1..9e31342cca 100644 --- a/profiler/src/profile_layernorm.cpp +++ b/profiler/src/profile_layernorm.cpp @@ -5,6 +5,7 @@ #include #include +#include "profiler/include/data_type_enum.hpp" #include "profiler/include/profile_layernorm_impl.hpp" using ck::index_t; @@ -49,7 +50,7 @@ void print_help_layernorm() << "arg2: verification (0: no; 1: yes)\n" << "arg3: initialization (0: no init; 1: integer value; 2: decimal value)\n" << "arg4: print tensor value (0: no; 1: yes)\n" - << "arg5: time kernel (0=n0, 1=yes)\n" + << "arg5: time kernel (0=no, 1=yes)\n" << "--length: tensor extents (e.g, --length 1024 1024) \n" << "--strideXY: tensor strides (e.g, --strideXY 1024 1)\n" << "--strideGamma: tensor strides (e.g, --strideGamma 1)\n" @@ -114,10 +115,3 @@ int profile_layernorm(int argc, char* argv[]) return 0; } - -// hijack main() for quick debugging -// int main(int argc, char* argv[]) -// { -// profile_layernorm(argc, argv); -// return 0; -// } diff --git a/profiler/src/profiler.cpp b/profiler/src/profiler.cpp index 93e8e997e0..2c8cd5b56f 100644 --- a/profiler/src/profiler.cpp +++ b/profiler/src/profiler.cpp @@ -3,26 +3,27 @@ #include -int profile_gemm(int, char*[]); -int profile_gemm_splitk(int, char*[]); -int profile_gemm_bilinear(int, char*[]); -int profile_gemm_add_add_fastgelu(int, char*[]); -int profile_gemm_reduce(int, char*[]); -int profile_gemm_bias_add_reduce(int, char*[]); -int profile_batched_gemm(int, char*[]); -int profile_batched_gemm_gemm(int, char*[]); -int profile_batched_gemm_add_relu_gemm_add(int, char*[]); -int profile_batched_gemm_reduce(int, char*[]); -int profile_grouped_gemm(int, char*[]); -int profile_conv_fwd(int, char*[]); -int profile_conv_fwd_bias_relu(int, char*[]); -int profile_conv_fwd_bias_relu_add(int, char*[]); -int profile_conv_bwd_data(int, char*[]); -int profile_conv_bwd_weight(int, char*[]); -int profile_grouped_conv_fwd(int, char*[]); -int profile_normalization(int, char*[]); +// int profile_gemm(int, char*[]); +// int profile_gemm_splitk(int, char*[]); +// int profile_gemm_bilinear(int, char*[]); +// int profile_gemm_add_add_fastgelu(int, char*[]); +// int profile_gemm_reduce(int, char*[]); +// int profile_gemm_bias_add_reduce(int, char*[]); +// int profile_batched_gemm(int, char*[]); +// int profile_batched_gemm_gemm(int, char*[]); +// int profile_batched_gemm_add_relu_gemm_add(int, char*[]); +// int profile_batched_gemm_reduce(int, char*[]); +// int profile_grouped_gemm(int, char*[]); +// int profile_conv_fwd(int, char*[]); +// int profile_conv_fwd_bias_relu(int, char*[]); +// int profile_conv_fwd_bias_relu_add(int, char*[]); +// int profile_conv_bwd_data(int, char*[]); +// int profile_conv_bwd_weight(int, char*[]); +// int profile_grouped_conv_fwd(int, char*[]); +// int profile_normalization(int, char*[]); int profile_layernorm(int, char*[]); -int profile_reduce(int, char*[]); +int profile_groupnorm(int, char*[]); +// int profile_reduce(int, char*[]); static void print_helper_message() { @@ -56,6 +57,7 @@ int main(int argc, char* argv[]) return 0; } +#if 0 else if(strcmp(argv[1], "gemm") == 0) { return profile_gemm(argc, argv); @@ -132,10 +134,15 @@ int main(int argc, char* argv[]) { return profile_normalization(argc, argv); } +#endif else if(strcmp(argv[1], "layernorm") == 0) { return profile_layernorm(argc, 
argv); } + else if(strcmp(argv[1], "groupnorm") == 0) + { + return profile_groupnorm(argc, argv); + } else { print_helper_message(); diff --git a/test/layernorm/CMakeLists.txt b/test/layernorm/CMakeLists.txt index ad681583d1..ab6e2d1cd1 100644 --- a/test/layernorm/CMakeLists.txt +++ b/test/layernorm/CMakeLists.txt @@ -1,10 +1,17 @@ add_custom_target(test_layernorm) -add_gtest_executable(test_layernorm_fp32 test_layernorm_fp32.cpp) -add_gtest_executable(test_layernorm_fp16 test_layernorm_fp16.cpp) +add_gtest_executable(test_layernorm2d_fp32 test_layernorm2d_fp32.cpp) +add_gtest_executable(test_layernorm2d_fp16 test_layernorm2d_fp16.cpp) +add_gtest_executable(test_groupnorm_fp16 test_groupnorm_fp16.cpp) +add_gtest_executable(test_groupnorm_fp32 test_groupnorm_fp32.cpp) -target_link_libraries(test_layernorm_fp32 PRIVATE utility) -target_link_libraries(test_layernorm_fp16 PRIVATE utility) +target_link_libraries(test_layernorm2d_fp32 PRIVATE utility) +target_link_libraries(test_layernorm2d_fp16 PRIVATE utility) +target_link_libraries(test_groupnorm_fp16 PRIVATE utility device_normalization_instance) +target_link_libraries(test_groupnorm_fp32 PRIVATE utility device_normalization_instance) + +add_dependencies(test_layernorm test_layernorm2d_fp32) +add_dependencies(test_layernorm test_layernorm2d_fp16) +add_dependencies(test_layernorm test_groupnorm_fp16) +add_dependencies(test_layernorm test_groupnorm_fp32) -add_dependencies(test_layernorm test_layernorm_fp32) -add_dependencies(test_layernorm test_layernorm_fp16) diff --git a/test/layernorm/test_groupnorm_fp16.cpp b/test/layernorm/test_groupnorm_fp16.cpp new file mode 100644 index 0000000000..235ebca3d1 --- /dev/null +++ b/test/layernorm/test_groupnorm_fp16.cpp @@ -0,0 +1,56 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#include "gtest/gtest.h" +#include "profiler/include/profile_groupnorm_impl.hpp" + +using F16 = ck::half_t; +using F32 = float; +using ck::index_t; + +template +class TestGroupnorm : public ::testing::Test +{ + protected: + using XDataType = std::tuple_element_t<0, Tuple>; + using GammaDataType = std::tuple_element_t<1, Tuple>; + using BetaDataType = std::tuple_element_t<2, Tuple>; + using AccDataType = std::tuple_element_t<3, Tuple>; + using YDataType = std::tuple_element_t<4, Tuple>; + + void Run() + { + // N, H, W, G, C + std::vector> lengths = {{1, 1, 1, 1, 1}, + {1, 2, 3, 4, 5}, + {256, 9, 9, 9, 9}, + {1, 64, 64, 32, 10}, + {1, 32, 32, 32, 20}, + {1, 16, 16, 32, 40}}; + + for(auto length : lengths) + { + bool success = + ck::profiler::profile_groupnorm_impl(true, 2, false, false, length); + EXPECT_TRUE(success); + } + } +}; + +using KernelTypes = ::testing::Types< + // XDataType, GammaDataType, BetaDataType, AccDataType, YDataType> + std::tuple, + std::tuple, + std::tuple, + std::tuple, + std::tuple, + std::tuple, + std::tuple, + std::tuple>; + +TYPED_TEST_SUITE(TestGroupnorm, KernelTypes); +TYPED_TEST(TestGroupnorm, Test_FP16) { this->Run(); } diff --git a/test/layernorm/test_groupnorm_fp32.cpp b/test/layernorm/test_groupnorm_fp32.cpp new file mode 100644 index 0000000000..8abec91fee --- /dev/null +++ b/test/layernorm/test_groupnorm_fp32.cpp @@ -0,0 +1,56 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include "gtest/gtest.h" +#include "profiler/include/profile_groupnorm_impl.hpp" + +using F16 = ck::half_t; +using F32 = float; +using ck::index_t; + +template +class TestGroupnorm : public ::testing::Test +{ + protected: + using XDataType = std::tuple_element_t<0, Tuple>; + using GammaDataType = std::tuple_element_t<1, Tuple>; + using BetaDataType = std::tuple_element_t<2, Tuple>; + using AccDataType = std::tuple_element_t<3, Tuple>; + using YDataType = std::tuple_element_t<4, Tuple>; + + void Run() + { + // N, H, W, G, C + std::vector> lengths = {{1, 1, 1, 1, 1}, + {1, 2, 3, 4, 5}, + {256, 9, 9, 9, 9}, + {1, 64, 64, 32, 10}, + {1, 32, 32, 32, 20}, + {1, 16, 16, 32, 40}}; + + for(auto length : lengths) + { + bool success = + ck::profiler::profile_groupnorm_impl(true, 2, false, false, length); + EXPECT_TRUE(success); + } + } +}; + +using KernelTypes = ::testing::Types< + // XDataType, GammaDataType, BetaDataType, AccDataType, YDataType> + std::tuple, + std::tuple, + std::tuple, + std::tuple, + std::tuple, + std::tuple, + std::tuple, + std::tuple>; + +TYPED_TEST_SUITE(TestGroupnorm, KernelTypes); +TYPED_TEST(TestGroupnorm, Test_FP32) { this->Run(); } diff --git a/test/layernorm/test_layernorm_fp16.cpp b/test/layernorm/test_layernorm2d_fp16.cpp similarity index 73% rename from test/layernorm/test_layernorm_fp16.cpp rename to test/layernorm/test_layernorm2d_fp16.cpp index 39b28c902c..ccc6472660 100644 --- a/test/layernorm/test_layernorm_fp16.cpp +++ b/test/layernorm/test_layernorm2d_fp16.cpp @@ -2,28 +2,28 @@ // Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. #include "gtest/gtest.h" -#include "test_layernorm_util.hpp" +#include "test_layernorm2d_util.hpp" template using I = ck::Number; template -class TestLayernormFP16 : public ck::TestLayernorm +class TestLayernorm2dFP16 : public ck::TestLayernorm2d { }; // clang-format off using KernelTypes = ::testing::Types< -// XDataType, GammaDataType, BetaDataType, AccDataType, YDataType, Rank, NumReduceDim, BlockSize, MThreadClusterSize, KThreadClusterSize, MThreadSliceSize, KThreadSliceSize, XYSrcVectorDim, XSrcVectorSize, , GammaSrcVectorSize, BetaSrcVectorSize, YDstVectorSize> - std::tuple, I<1>, I<256>, I<8>, I<32>, I<1>, I<8>, I<1>, I<8>, I<8>, I<8>, I<8>>, - std::tuple, I<1>, I<256>, I<8>, I<32>, I<2>, I<8>, I<1>, I<8>, I<8>, I<8>, I<8>>, - std::tuple, I<1>, I<256>, I<4>, I<64>, I<1>, I<8>, I<1>, I<8>, I<8>, I<8>, I<8>>, - std::tuple, I<1>, I<256>, I<4>, I<64>, I<2>, I<8>, I<1>, I<8>, I<8>, I<8>, I<8>>, - std::tuple, I<1>, I<256>, I<2>, I<128>, I<1>, I<8>, I<1>, I<8>, I<8>, I<8>, I<8>>, - std::tuple, I<1>, I<256>, I<2>, I<128>, I<2>, I<8>, I<1>, I<8>, I<8>, I<8>, I<8>>, - std::tuple, I<1>, I<256>, I<1>, I<256>, I<1>, I<8>, I<1>, I<8>, I<8>, I<8>, I<8>>, - std::tuple, I<1>, I<256>, I<1>, I<256>, I<2>, I<8>, I<1>, I<8>, I<8>, I<8>, I<8>> +// XDataType, GammaDataType, BetaDataType, AccDataType, YDataType, Rank, NumReduceDim, BlockSize, MThreadClusterSize, KThreadClusterSize, MThreadSliceSize, KThreadSliceSize, XYSrcVectorDim, XSrcVectorSize, GammaSrcVectorDim , GammaSrcVectorSize, BetaSrcVectorDim, BetaSrcVectorSize, YDstVectorSize> + std::tuple, I<1>, I<256>, I<8>, I<32>, I<1>, I<8>, I<1>, I<8>, I<1>, I<8>, I<1>, I<8>, I<8>>, + std::tuple, I<1>, I<256>, I<8>, I<32>, I<2>, I<8>, I<1>, I<8>, I<1>, I<8>, I<1>, I<8>, I<8>>, + std::tuple, I<1>, I<256>, I<4>, I<64>, I<1>, I<8>, I<1>, I<8>, I<1>, I<8>, I<1>, I<8>, I<8>>, + std::tuple, I<1>, I<256>, I<4>, I<64>, I<2>, I<8>, I<1>, I<8>, I<1>, I<8>, I<1>, I<8>, I<8>>, + 
std::tuple, I<1>, I<256>, I<2>, I<128>, I<1>, I<8>, I<1>, I<8>, I<1>, I<8>, I<1>, I<8>, I<8>>, + std::tuple, I<1>, I<256>, I<2>, I<128>, I<2>, I<8>, I<1>, I<8>, I<1>, I<8>, I<1>, I<8>, I<8>>, + std::tuple, I<1>, I<256>, I<1>, I<256>, I<1>, I<8>, I<1>, I<8>, I<1>, I<8>, I<1>, I<8>, I<8>>, + std::tuple, I<1>, I<256>, I<1>, I<256>, I<2>, I<8>, I<1>, I<8>, I<1>, I<8>, I<1>, I<8>, I<8>> >; // clang-format on -TYPED_TEST_SUITE(TestLayernormFP16, KernelTypes); -TYPED_TEST(TestLayernormFP16, Test_FP16) { this->Run(); } +TYPED_TEST_SUITE(TestLayernorm2dFP16, KernelTypes); +TYPED_TEST(TestLayernorm2dFP16, Test_FP16) { this->Run(); } diff --git a/test/layernorm/test_layernorm_fp32.cpp b/test/layernorm/test_layernorm2d_fp32.cpp similarity index 52% rename from test/layernorm/test_layernorm_fp32.cpp rename to test/layernorm/test_layernorm2d_fp32.cpp index 655e11d2c9..47cf1641e3 100644 --- a/test/layernorm/test_layernorm_fp32.cpp +++ b/test/layernorm/test_layernorm2d_fp32.cpp @@ -2,28 +2,28 @@ // Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. #include "gtest/gtest.h" -#include "test_layernorm_util.hpp" +#include "test_layernorm2d_util.hpp" template using I = ck::Number; template -class TestLayernormFP32 : public ck::TestLayernorm +class TestLayernorm2dFP32 : public ck::TestLayernorm2d { }; // clang-format off using KernelTypes = ::testing::Types< -// XDataType, GammaDataType, BetaDataType, AccDataType, YDataType, Rank, NumReduceDim, BlockSize, MThreadClusterSize, KThreadClusterSize, MThreadSliceSize, KThreadSliceSize, XYSrcVectorDim, XSrcVectorSize, , GammaSrcVectorSize, BetaSrcVectorSize, YDstVectorSize> - std::tuple, I<1>, I<256>, I<8>, I<32>, I<1>, I<8>, I<1>, I<4>, I<4>, I<4>, I<4>>, - std::tuple, I<1>, I<256>, I<8>, I<32>, I<2>, I<8>, I<1>, I<4>, I<4>, I<4>, I<4>>, - std::tuple, I<1>, I<256>, I<4>, I<64>, I<1>, I<8>, I<1>, I<4>, I<4>, I<4>, I<4>>, - std::tuple, I<1>, I<256>, I<4>, I<64>, I<2>, I<8>, I<1>, I<4>, I<4>, I<4>, I<4>>, - std::tuple, I<1>, I<256>, I<2>, I<128>, I<1>, I<8>, I<1>, I<4>, I<4>, I<4>, I<4>>, - std::tuple, I<1>, I<256>, I<2>, I<128>, I<2>, I<8>, I<1>, I<4>, I<4>, I<4>, I<4>>, - std::tuple, I<1>, I<256>, I<1>, I<256>, I<1>, I<8>, I<1>, I<4>, I<4>, I<4>, I<4>>, - std::tuple, I<1>, I<256>, I<1>, I<256>, I<2>, I<8>, I<1>, I<4>, I<4>, I<4>, I<4>> +// XDataType, GammaDataType, BetaDataType, AccDataType, YDataType, Rank, NumReduceDim, BlockSize, MThreadClusterSize, KThreadClusterSize, MThreadSliceSize, KThreadSliceSize, XYSrcVectorDim, XSrcVectorSize, GammaSrcVectorDim, GammaSrcVectorSize, BetaSrcVectorDim, BetaSrcVectorSize, YDstVectorSize> + std::tuple, I<1>, I<256>, I<8>, I<32>, I<1>, I<8>, I<1>, I<4>, I<1>, I<4>, I<1>, I<4>, I<4>>, + std::tuple, I<1>, I<256>, I<8>, I<32>, I<2>, I<8>, I<1>, I<4>, I<1>, I<4>, I<1>, I<4>, I<4>>, + std::tuple, I<1>, I<256>, I<4>, I<64>, I<1>, I<8>, I<1>, I<4>, I<1>, I<4>, I<1>, I<4>, I<4>>, + std::tuple, I<1>, I<256>, I<4>, I<64>, I<2>, I<8>, I<1>, I<4>, I<1>, I<4>, I<1>, I<4>, I<4>>, + std::tuple, I<1>, I<256>, I<2>, I<128>, I<1>, I<8>, I<1>, I<4>, I<1>, I<4>, I<1>, I<4>, I<4>>, + std::tuple, I<1>, I<256>, I<2>, I<128>, I<2>, I<8>, I<1>, I<4>, I<1>, I<4>, I<1>, I<4>, I<4>>, + std::tuple, I<1>, I<256>, I<1>, I<256>, I<1>, I<8>, I<1>, I<4>, I<1>, I<4>, I<1>, I<4>, I<4>>, + std::tuple, I<1>, I<256>, I<1>, I<256>, I<2>, I<8>, I<1>, I<4>, I<1>, I<4>, I<1>, I<4>, I<4>> >; // clang-format on -TYPED_TEST_SUITE(TestLayernormFP32, KernelTypes); -TYPED_TEST(TestLayernormFP32, Test_FP32) { this->Run(); } 
+TYPED_TEST_SUITE(TestLayernorm2dFP32, KernelTypes); +TYPED_TEST(TestLayernorm2dFP32, Test_FP32) { this->Run(); } diff --git a/test/layernorm/test_layernorm_util.hpp b/test/layernorm/test_layernorm2d_util.hpp similarity index 85% rename from test/layernorm/test_layernorm_util.hpp rename to test/layernorm/test_layernorm2d_util.hpp index 707fe36f86..6112c7f5bf 100644 --- a/test/layernorm/test_layernorm_util.hpp +++ b/test/layernorm/test_layernorm2d_util.hpp @@ -31,7 +31,7 @@ std::string serialize_range(const Range& range) } template -class TestLayernorm : public ::testing::Test +class TestLayernorm2d : public ::testing::Test { protected: using XDataType = std::tuple_element_t<0, Tuple>; @@ -48,9 +48,11 @@ class TestLayernorm : public ::testing::Test static constexpr index_t KThreadSliceSize = std::tuple_element_t<11, Tuple>{}.value; static constexpr index_t XYSrcVectorDim = std::tuple_element_t<12, Tuple>{}.value; static constexpr index_t XSrcVectorSize = std::tuple_element_t<13, Tuple>{}.value; - static constexpr index_t GammaSrcVectorSize = std::tuple_element_t<14, Tuple>{}.value; - static constexpr index_t BetaSrcVectorSize = std::tuple_element_t<15, Tuple>{}.value; - static constexpr index_t YDstVectorSize = std::tuple_element_t<16, Tuple>{}.value; + static constexpr index_t GammaSrcVectorDim = std::tuple_element_t<14, Tuple>{}.value; + static constexpr index_t GammaSrcVectorSize = std::tuple_element_t<15, Tuple>{}.value; + static constexpr index_t BetaSrcVectorDim = std::tuple_element_t<16, Tuple>{}.value; + static constexpr index_t BetaSrcVectorSize = std::tuple_element_t<17, Tuple>{}.value; + static constexpr index_t YDstVectorSize = std::tuple_element_t<18, Tuple>{}.value; using PassThrough = ck::tensor_operation::element_wise::PassThrough; @@ -78,23 +80,24 @@ class TestLayernorm : public ::testing::Test KThreadSliceSize, XYSrcVectorDim, XSrcVectorSize, + GammaSrcVectorDim, GammaSrcVectorSize, + BetaSrcVectorDim, BetaSrcVectorSize, YDstVectorSize>; - TestLayernorm() : ref_instance_invoker_(ReferenceInstance{}.MakeInvoker()) {} + TestLayernorm2d() : ref_instance_invoker_(ReferenceInstance{}.MakeInvoker()) {} - void RunSingle(std::vector lengths, std::vector reduceDims) + void RunSingle(const std::vector& lengths, + const std::vector& reduceDims, + const std::vector& GammaLength, + const std::vector& GammaStride, + const std::vector& BetaLength, + const std::vector& BetaStride) { - std::vector reduceLength(reduceDims.size()); - for(int i = 0; i < NumReduceDim; ++i) - { - reduceLength[i] = lengths[reduceDims[i]]; - } - Tensor x(lengths); - Tensor gamma(reduceLength); - Tensor beta(reduceLength); + Tensor gamma(GammaLength); + Tensor beta(BetaLength); Tensor y(lengths); Tensor y_ref(lengths); @@ -115,10 +118,8 @@ class TestLayernorm : public ::testing::Test auto argument_ptr = device_instance.MakeArgumentPointer( lengths, std::vector{x.mDesc.GetStrides().begin(), x.mDesc.GetStrides().end()}, - std::vector{gamma.mDesc.GetStrides().begin(), - gamma.mDesc.GetStrides().end()}, - std::vector{beta.mDesc.GetStrides().begin(), - beta.mDesc.GetStrides().end()}, + GammaStride, + BetaStride, std::vector{y.mDesc.GetStrides().begin(), y.mDesc.GetStrides().end()}, reduceDims, 1e-4, @@ -163,17 +164,16 @@ class TestLayernorm : public ::testing::Test void Run() { - for(auto length : this->lengths_) + std::vector> lengths = { + {4, 256}, {8, 511}, {9, 1032}, {4, 2048}, {1, 8192}, {4000, 2000}}; + + for(auto length : lengths) { - this->RunSingle(length, reduceDims_[0]); + this->RunSingle(length, {1}, 
{length[1]}, {0, 1}, {length[1]}, {0, 1}); } } - std::vector<std::vector<index_t>> lengths_ = { - {4, 256}, {8, 511}, {9, 1032}, {4, 2048}, {1, 8192}, {4000, 2000}}; - - std::vector<std::vector<index_t>> reduceDims_ = {{1}}; - typename ReferenceInstance::Invoker ref_instance_invoker_; }; + } // namespace ck
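
Note on the {0, 1} gamma/beta strides passed by the updated test util above: gamma and beta are length-N tensors whose stride along the non-reduced M dimension is 0, so one gamma/beta row is broadcast to every row of the [M, N] input. The host-side sketch below is not part of the patch; it is a minimal illustration, under the assumption of a row-major [M, N] layout and with a hypothetical helper name (naive_layernorm2d), of how a zero stride realizes that broadcast.

// Standalone sketch (not from the patch): naive 2D layernorm with gamma/beta
// addressed through explicit strides. A zero stride along M reuses the same
// length-N gamma/beta row for every m, which is what gammaStrides = {0, 1}
// and betaStrides = {0, 1} request in the tests above.
#include <cmath>
#include <cstddef>
#include <vector>

void naive_layernorm2d(const std::vector<float>& x,     // [M, N], row-major
                       const std::vector<float>& gamma, // length N
                       const std::vector<float>& beta,  // length N
                       std::vector<float>& y,           // [M, N], row-major
                       std::size_t M,
                       std::size_t N,
                       std::size_t gammaStrideM, // 0 => broadcast over M
                       std::size_t gammaStrideN,
                       std::size_t betaStrideM,  // 0 => broadcast over M
                       std::size_t betaStrideN,
                       float epsilon = 1e-4f)
{
    for(std::size_t m = 0; m < M; ++m)
    {
        // Mean and biased variance over the reduced dimension N.
        float mean = 0.f, mean_sq = 0.f;
        for(std::size_t n = 0; n < N; ++n)
        {
            const float v = x[m * N + n];
            mean += v;
            mean_sq += v * v;
        }
        mean /= N;
        mean_sq /= N;
        const float inv_std = 1.f / std::sqrt(mean_sq - mean * mean + epsilon);

        for(std::size_t n = 0; n < N; ++n)
        {
            // gamma/beta are indexed through their own strides, not the x/y layout.
            const float g = gamma[m * gammaStrideM + n * gammaStrideN];
            const float b = beta[m * betaStrideM + n * betaStrideN];
            y[m * N + n] = (x[m * N + n] - mean) * inv_std * g + b;
        }
    }
}

Called with gammaStrideM = betaStrideM = 0 and the N strides set to 1, this reproduces the broadcast described by the {0, 1} stride vectors in the tests. The same idea extends to higher-rank cases: any dimension that gamma/beta should be broadcast over is given a zero stride.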