diff --git a/example/27_layernorm/CMakeLists.txt b/example/27_layernorm/CMakeLists.txt
index d96deae45e..b2ca59c5e2 100644
--- a/example/27_layernorm/CMakeLists.txt
+++ b/example/27_layernorm/CMakeLists.txt
@@ -1 +1 @@
-add_example_executable(example_layernorm_blockwise layernorm_blockwise.cpp)
+add_example_executable(example_layernorm_blockwise layernorm_blockwise.cpp)
\ No newline at end of file
diff --git a/example/45_elementwise_normalization/CMakeLists.txt b/example/45_elementwise_normalization/CMakeLists.txt
deleted file mode 100644
index 8f5b9d4d87..0000000000
--- a/example/45_elementwise_normalization/CMakeLists.txt
+++ /dev/null
@@ -1 +0,0 @@
-add_example_executable(example_elementwise_layernorm_blockwise elementwise_layernorm_blockwise.cpp)
diff --git a/example/45_elementwise_normalization/elementwise_layernorm_blockwise.cpp b/example/45_elementwise_normalization/elementwise_layernorm_blockwise.cpp
deleted file mode 100644
index 7d6ff12eea..0000000000
--- a/example/45_elementwise_normalization/elementwise_layernorm_blockwise.cpp
+++ /dev/null
@@ -1,195 +0,0 @@
-// SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
-
-#include <iostream>
-#include <numeric>
-#include <initializer_list>
-#include <cstdlib>
-#include <array>
-
-#include "ck/ck.hpp"
-#include "ck/utility/reduction_enums.hpp"
-#include "ck/tensor_operation/gpu/device/impl/device_elementwise_normalization_impl.hpp"
-#include "ck/tensor_operation/gpu/device/reduction_operator_mapping.hpp"
-
-#include "ck/library/utility/check_err.hpp"
-#include "ck/library/utility/device_memory.hpp"
-#include "ck/library/utility/host_common_util.hpp"
-#include "ck/library/utility/host_tensor.hpp"
-#include "ck/library/utility/host_tensor_generator.hpp"
-#include "ck/library/reference_tensor_operation/cpu/reference_layernorm.hpp"
-
-using ADataType     = ck::half_t; // Input 1
-using BDataType     = ck::half_t; // Input 2
-using XDataType     = ck::half_t;
-using GammaDataType = ck::half_t;
-using BetaDataType  = ck::half_t;
-using YDataType     = ck::half_t;
-using AccDataType   = float;
-using XElementwiseOperation = ck::tensor_operation::element_wise::Add;
-using YElementwiseOperation = ck::tensor_operation::element_wise::PassThrough;
-
-constexpr int Rank         = 2;
-constexpr int NumReduceDim = 1;
-
-// X = Elementwise(input1, input2, input3, ...)
-// Y = Layernorm(X, beta, gamma)
-using DeviceInstance = ck::tensor_operation::device::DeviceElementwiseNormalizationImpl<
-    ck::Tuple<ADataType, BDataType>,
-    GammaDataType,
-    BetaDataType,
-    AccDataType,
-    YDataType,
-    XElementwiseOperation,
-    YElementwiseOperation,
-    Rank,
-    NumReduceDim,
-    256, // BlockSize
-    8,   // ClusterM
-    32,  // ClusterK
-    1,   // SliceM
-    32,  // SliceK
-    1,   // SrcVecDim (0=M, 1=K)
-    8,   // SrcScalarPerVector
-    1,   // GammaVecDim (0=M, 1=K)
-    8,   // GammaScalarPerVector
-    1,   // BetaVecDim (0=M, 1=K)
-    8,   // BetaScalarPerVector
-    8>;  // OutScalarPerVector
-
-template <typename HostTensorC, typename HostTensorA, typename HostTensorB, typename Functor>
-void host_elementwise2D(HostTensorC& C,
-                        const HostTensorA& A,
-                        const HostTensorB& B,
-                        const std::vector<std::size_t>& shape,
-                        Functor functor)
-{
-    using ctype = ck::remove_reference_t<decltype(C(0, 0))>;
-
-    for(std::size_t m = 0; m < shape[0]; ++m)
-        for(std::size_t n = 0; n < shape[1]; ++n)
-        {
-            auto a_val  = A(m, n);
-            auto b_val  = B(m, n);
-            ctype c_val = 0;
-            functor(c_val, a_val, b_val);
-            C(m, n) = c_val;
-        }
-}
-
-int main()
-{
-    bool time_kernel = true;
-
-    ck::index_t M      = 48 * 256;
-    ck::index_t N      = 1024;
-    ck::index_t Stride = N;
-
-    auto f_host_tensor_descriptor1d = [](std::size_t len, std::size_t stride) {
-        return HostTensorDescriptor(std::vector<std::size_t>({len}),
-                                    std::vector<std::size_t>({stride}));
-    };
-
-    auto f_host_tensor_descriptor2d = [](std::size_t row, std::size_t col, std::size_t stride) {
-        return HostTensorDescriptor(std::vector<std::size_t>({row, col}),
-                                    std::vector<std::size_t>({stride, 1}));
-    };
-
-    Tensor<ADataType> a(f_host_tensor_descriptor2d(M, N, Stride));
-    Tensor<BDataType> b(f_host_tensor_descriptor2d(M, N, Stride));
-    Tensor<GammaDataType> gamma(f_host_tensor_descriptor1d(N, 1));
-    Tensor<BetaDataType> beta(f_host_tensor_descriptor1d(N, 1));
-    Tensor<YDataType> y(f_host_tensor_descriptor2d(M, N, Stride));
-
-    a.GenerateTensorValue(GeneratorTensor_2<ADataType>{-5, 5});
-    b.GenerateTensorValue(GeneratorTensor_2<BDataType>{-5, 5});
-    gamma.GenerateTensorValue(GeneratorTensor_2<GammaDataType>{-5, 5});
-    beta.GenerateTensorValue(GeneratorTensor_2<BetaDataType>{-5, 5});
-
-    DeviceMem a_dev(sizeof(ADataType) * a.mDesc.GetElementSpaceSize());
-    DeviceMem b_dev(sizeof(BDataType) * b.mDesc.GetElementSpaceSize());
-    DeviceMem gamma_dev(sizeof(GammaDataType) * gamma.mDesc.GetElementSpaceSize());
-    DeviceMem beta_dev(sizeof(BetaDataType) * beta.mDesc.GetElementSpaceSize());
-    DeviceMem y_dev(sizeof(YDataType) * y.mDesc.GetElementSpaceSize());
-
-    a_dev.ToDevice(a.mData.data());
-    b_dev.ToDevice(b.mData.data());
-    gamma_dev.ToDevice(gamma.mData.data());
-    beta_dev.ToDevice(beta.mData.data());
-
-    std::array<const void*, 2> input = {a_dev.GetDeviceBuffer(), b_dev.GetDeviceBuffer()};
-
-    auto device_instance = DeviceInstance{};
-    auto argument_ptr    = device_instance.MakeArgumentPointer(
-        {M, N},
-        {
-            std::vector<ck::index_t>{a.mDesc.GetStrides().begin(), a.mDesc.GetStrides().end()},
-            std::vector<ck::index_t>{b.mDesc.GetStrides().begin(), b.mDesc.GetStrides().end()},
-        },
-        {0, 1},
-        {0, 1},
-        std::vector<ck::index_t>{y.mDesc.GetStrides().begin(), y.mDesc.GetStrides().end()},
-        {1},
-        1e-4,
-        input,
-        gamma_dev.GetDeviceBuffer(),
-        beta_dev.GetDeviceBuffer(),
-        y_dev.GetDeviceBuffer(),
-        XElementwiseOperation{},
-        YElementwiseOperation{});
-
-    if(!device_instance.IsSupportedArgument(argument_ptr.get()))
-    {
-        std::cout << "The runtime parameters are not supported" << std::endl;
-        return 1;
-    };
-
-    auto invoker_ptr = device_instance.MakeInvokerPointer();
-    float ela_time   = 0;
-    ela_time         = invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, time_kernel});
-
-    float data_mem_size = M * N * sizeof(ADataType) + M * N * sizeof(BDataType) +
-                          M * N * sizeof(YDataType) + N * sizeof(GammaDataType) +
-                          N * sizeof(BetaDataType);
-    float bandwidth = data_mem_size * 1000 / ela_time / 1024 / 1024 / 1024;
-
-    std::cout << "Bandwidth is: " << bandwidth << " GB/s." << std::endl;
-    std::cout << "Time elapsed is: " << ela_time << " ms." << std::endl;
-
-    bool pass = true;
-    {
-        std::vector<std::size_t> mn = {static_cast<std::size_t>(M),
-                                       static_cast<std::size_t>(N)};
-        Tensor<XDataType> x(f_host_tensor_descriptor2d(M, N, Stride));
-        host_elementwise2D<Tensor<XDataType>,
-                           Tensor<ADataType>,
-                           Tensor<BDataType>,
-                           XElementwiseOperation>(x, a, b, mn, XElementwiseOperation{});
-
-        Tensor<YDataType> host_y(f_host_tensor_descriptor2d(M, N, Stride));
-        using ReferenceInstance =
-            ck::tensor_operation::host::ReferenceLayernorm<XDataType,
-                                                           GammaDataType,
-                                                           BetaDataType,
-                                                           YDataType,
-                                                           AccDataType,
-                                                           YElementwiseOperation,
-                                                           Rank,
-                                                           NumReduceDim>;
-
-        ReferenceInstance ref;
-        auto ref_argument =
-            ref.MakeArgument(x, gamma, beta, host_y, YElementwiseOperation{}, {M, N}, {1}, 1e-4);
-        auto ref_invoker = ref.MakeInvoker();
-        ref_invoker.Run(ref_argument);
-
-        y_dev.FromDevice(y.mData.data());
-        pass &=
-            ck::utils::check_err(y.mData, host_y.mData, "Error: Incorrect results d1", 1e-3, 1e-3);
-        if(!(pass))
-        {
-            std::cout << "layernorm wrong" << std::endl;
-        }
-    }
-    return (pass ? 0 : 1);
-}
diff --git a/include/ck/tensor_operation/gpu/device/device_elementwise_normalization.hpp b/include/ck/tensor_operation/gpu/device/device_elementwise_normalization.hpp
deleted file mode 100644
index d8a791c322..0000000000
--- a/include/ck/tensor_operation/gpu/device/device_elementwise_normalization.hpp
+++ /dev/null
@@ -1,68 +0,0 @@
-// SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
-
-#pragma once
-
-#include <memory>
-#include <vector>
-
-#include "ck/tensor_operation/gpu/device/device_base.hpp"
-
-namespace ck {
-namespace tensor_operation {
-namespace device {
-
-template <typename InDataTypeTuple,
-          typename GammaDataType,
-          typename BetaDataType,
-          typename AccDataType,
-          typename YDataType,
-          typename XElementwiseOperation,
-          typename YElementwiseOperation,
-          index_t Rank,
-          index_t NumReduceDim>
-struct DeviceElementwiseNormalization : public BaseOperator
-{
-    static constexpr int NumInput = InDataTypeTuple::Size();
-
-    virtual std::unique_ptr<BaseArgument>
-    MakeArgumentPointer(const std::vector<index_t> lengths,
-                        const std::array<std::vector<index_t>, NumInput> inStridesArray,
-                        const std::vector<index_t> gammaStrides,
-                        const std::vector<index_t> betaStrides,
-                        const std::vector<index_t> yStrides,
-                        const std::vector<index_t> reduceDims,
-                        AccDataType epsilon,
-                        const std::array<const void*, NumInput> in_dev_buffers,
-                        const void* p_gamma,
-                        const void* p_beta,
-                        void* p_y,
-                        XElementwiseOperation x_elementwise_op,
-                        YElementwiseOperation y_elementwise_op) = 0;
-
-    virtual std::unique_ptr<BaseInvoker> MakeInvokerPointer() = 0;
-};
-
-template <typename InDataTypeTuple,
-          typename GammaDataType,
-          typename BetaDataType,
-          typename AccDataType,
-          typename YDataType,
-          typename XElementwiseOperation,
-          typename YElementwiseOperation,
-          index_t Rank,
-          index_t NumReduceDim>
-using DeviceElementwiseNormalizationPtr =
-    std::unique_ptr<DeviceElementwiseNormalization<InDataTypeTuple,
-                                                   GammaDataType,
-                                                   BetaDataType,
-                                                   AccDataType,
-                                                   YDataType,
-                                                   XElementwiseOperation,
-                                                   YElementwiseOperation,
-                                                   Rank,
-                                                   NumReduceDim>>;
-
-} // namespace device
-} // namespace tensor_operation
-} // namespace ck
diff --git a/include/ck/tensor_operation/gpu/device/impl/device_elementwise_normalization_impl.hpp b/include/ck/tensor_operation/gpu/device/impl/device_elementwise_normalization_impl.hpp
deleted file mode 100644
index 8ffc5ef9fb..0000000000
--- a/include/ck/tensor_operation/gpu/device/impl/device_elementwise_normalization_impl.hpp
+++ /dev/null
@@ -1,592 +0,0 @@
-// SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
- -#pragma once - -#include -#include - -#include "ck/utility/math.hpp" -#include "ck/utility/sequence.hpp" -#include "ck/utility/reduction_operator.hpp" - -#include "ck/tensor_operation/gpu/device/device_elementwise_normalization.hpp" -#include "ck/tensor_operation/gpu/device/device_reduce.hpp" -#include "ck/tensor_operation/gpu/device/impl/device_reduce_common.hpp" -#include "ck/tensor_operation/gpu/grid/gridwise_elementwise_layernorm_welford_variance.hpp" -#include "ck/tensor_operation/gpu/grid/gridwise_set_buffer_value.hpp" -#include "ck/host_utility/device_prop.hpp" -#include "ck/host_utility/kernel_launch.hpp" - -// X = Elementwise(input1, input2, input3, ...) -// Y = Normalization(X, beta, gamma) -namespace ck { -template // Descriptor of inputs, Gamma, Beta -__global__ void kernel_elementwise_layernorm( - const InGrid2dDescTuple in_grid_2d_desc_tuple, // Descriptor tuple of inputs - const GridDesc_M_K x_grid_desc_m_k, // Descriptor of X - const GridDesc_M_K gamma_grid_desc_m_k, // Descriptor of gamma - const GridDesc_M_K beta_grid_desc_m_k, // Descriptor of beta - const GridDesc_M_K y_grid_desc_m_k, // Descriptor of Y - index_t num_k_block_tile_iteration, // - AccDataType epsilon, // Datatype of epsilon - const InDataTypePointerTuple p_in_global_tuple, // Ptr tuple of input matrixs - const GammaDataType* const __restrict__ p_gamma_global, // Ptr of gamma - const BetaDataType* const __restrict__ p_beta_global, // Ptr of beta - YDataType* const __restrict__ p_y_global, // Ptr of y - const XElementwiseOperation x_elementwise_op, // Operation of input - const YElementwiseOperation y_elementwise_op) // Operation of output of normalization -{ - extern __shared__ XDataType p_x_lds[]; - GridwiseElementwiseReduction::Run(in_grid_2d_desc_tuple, // Descriptor tuple of inputs - x_grid_desc_m_k, // Descriptor of X - gamma_grid_desc_m_k, // Descriptor of Gamma - beta_grid_desc_m_k, // Descriptor of Beta - y_grid_desc_m_k, // Descriptor of Y - num_k_block_tile_iteration, // - epsilon, // epsilon - p_in_global_tuple, // Ptr tuple of inputs - p_x_lds, // Ptr of X - p_gamma_global, // Ptr of gamma - p_beta_global, // Ptr of beta - p_y_global, // Ptr of Y - x_elementwise_op, // Operation of input - y_elementwise_op); // Operation of output of normalization -}; -} // namespace ck - -namespace ck { -namespace tensor_operation { -namespace device { - -// Y = LayerNorm(A + B, Beta, Gamma) -template // Size to write destination Y -struct DeviceElementwiseNormalizationImpl - : public DeviceElementwiseNormalization -{ - static constexpr int NumInput = InDataTypeTuple::Size(); - - using XDataType = YDataType; - - static_assert( - (KThreadSliceSize % GammaSrcVectorSize == 0), - "Invalid thread slice sizes and/or gamma vector sizes configuration, please check!"); - - static_assert( - (KThreadSliceSize % BetaSrcVectorSize == 0), - "Invalid thread slice sizes and/or beta vector sizes configuration, please check!"); - - static constexpr index_t M_BlockTileSize = - MThreadClusterSize * MThreadSliceSize; // num of rows calculated in a block - static constexpr index_t K_BlockTileSize = - KThreadClusterSize * KThreadSliceSize; // num of columns calculated in a block - - static auto GenerateInDataTypePointerTuple() - { - return generate_tuple( - [&](auto I) { - using DataType = remove_cvref_t; - return static_cast(nullptr); - }, - Number{}); - }; - - using InDataTypePointerTuple = decltype(GenerateInDataTypePointerTuple()); - - static auto MakeSrc2dDescriptor(const std::vector& inLengths, - const std::vector& 
inStrides, - int blkGroupSize, - int numBlockTileIteration) - { - constexpr index_t NumInvariantDim = Rank - NumReduceDim; - static constexpr index_t numSrcDim = Rank; - static constexpr bool reduceAllDim = (NumInvariantDim == 0); - - const auto tupleSrcLengths = make_tuple_from_array(inLengths, Number{}); - const auto tupleSrcStrides = make_tuple_from_array(inStrides, Number{}); - - const auto inDesc = make_naive_tensor_descriptor(tupleSrcLengths, tupleSrcStrides); - - const auto in_grid_desc_m_k = [&]() { - if constexpr(reduceAllDim) - { - const auto one_dim_inDesc = transform_tensor_descriptor( - inDesc, - make_tuple(make_merge_transform(tupleSrcLengths)), - make_tuple(typename arithmetic_sequence_gen<0, numSrcDim, 1>::type{}), - make_tuple(Sequence<0>{})); - - return transform_tensor_descriptor(one_dim_inDesc, - make_tuple(make_unmerge_transform(make_tuple( - 1, one_dim_inDesc.GetLength(Number<0>{})))), - make_tuple(Sequence<0>{}), - make_tuple(Sequence<0, 1>{})); - } - else - { - using InvariantDims = typename arithmetic_sequence_gen<0, NumInvariantDim, 1>::type; - using ReduceDims = typename arithmetic_sequence_gen::type; - - const auto reduceDimLengths = - make_tuple_from_array_and_index_seq(inLengths, ReduceDims{}); - const auto invariantDimLengths = - make_tuple_from_array_and_index_seq(inLengths, InvariantDims{}); - - return transform_tensor_descriptor( - inDesc, - make_tuple(make_merge_transform(invariantDimLengths), - make_merge_transform(reduceDimLengths)), - make_tuple(InvariantDims{}, ReduceDims{}), - make_tuple(Sequence<0>{}, Sequence<1>{})); - } - }(); - - const auto invariantLength = in_grid_desc_m_k.GetLength(Number<0>{}); - const auto reduceLength = in_grid_desc_m_k.GetLength(Number<1>{}); - - const int reduceSizePerBlock = K_BlockTileSize * numBlockTileIteration; - const auto inPad_M = - math::integer_least_multiple(invariantLength, M_BlockTileSize) - invariantLength; - const auto inPad_K = reduceSizePerBlock * blkGroupSize - reduceLength; - - auto in_grid_desc_m_k_padded = transform_tensor_descriptor( - in_grid_desc_m_k, - make_tuple(make_right_pad_transform(invariantLength, inPad_M), - make_right_pad_transform(reduceLength, inPad_K)), - make_tuple(Sequence<0>{}, Sequence<1>{}), - make_tuple(Sequence<0>{}, Sequence<1>{})); - - return (in_grid_desc_m_k_padded); - }; - - template - static auto GenerateSrcGrid2dDescTuple(Number) - { - return generate_tuple([&](auto) { return MakeSrc2dDescriptor({1}, {1}, 1, 1); }, - Number{}); - }; - - using InGrid2dDescTuple = decltype(GenerateSrcGrid2dDescTuple(Number{})); - - using GridDesc_M_K = decltype(MakeSrc2dDescriptor({1}, {1}, 1, 1)); - - using GridwiseReduceLayernormGeneric = - GridwiseElementwiseLayernormWelfordVariance_mk_to_mk; - - using GridwiseReduceLayernormSweepOnce = - GridwiseElementwiseLayernormWelfordVariance_mk_to_mk; - - struct Argument : public BaseArgument - { - Argument(const std::vector lengths, - const std::array, NumInput> inStridesArray, - const std::vector gammaStrides, - const std::vector betaStrides, - const std::vector yStrides, - const std::vector reduceDims, - XElementwiseOperation x_elementwise_op, - YElementwiseOperation y_elementwise_op, - AccDataType epsilon, - const std::array in_dev_buffers, - const GammaDataType* p_gamma, - const BetaDataType* p_beta, - YDataType* p_y) - : epsilon_(epsilon), - p_gamma_(p_gamma), - p_beta_(p_beta), - p_y_(p_y), - x_elementwise_op_(x_elementwise_op), - y_elementwise_op_(y_elementwise_op) - { - - Lengths_ = shuffle_tensor_dimensions(lengths, reduceDims); - 
for(int i = 0; i < NumInput; i++) - { - inStridesArray_[i] = - shuffle_tensor_dimensions(inStridesArray[i], reduceDims); - } - - yStrides_ = shuffle_tensor_dimensions(yStrides, reduceDims); - xStrides_ = shuffle_tensor_dimensions(yStrides, reduceDims); - - gammaStrides_ = shuffle_tensor_dimensions(gammaStrides, reduceDims); - betaStrides_ = shuffle_tensor_dimensions(betaStrides, reduceDims); - - in_dev_buffers_ = generate_tuple( - [&](auto I) { - using DataType = remove_cvref_t; - return static_cast(in_dev_buffers[I.value]); - }, - Number{}); - - long_index_t invariant_total_length; - long_index_t reduce_total_length; - - std::tie(invariant_total_length, reduce_total_length) = - get_2d_lengths(Lengths_); - - blkGroupSize_ = 1; - numBlockTileIteration_ = (reduce_total_length + K_BlockTileSize - 1) / K_BlockTileSize; - - gridSize_ = math::integer_least_multiple(invariant_total_length, M_BlockTileSize) / - M_BlockTileSize * blkGroupSize_; - - in_grid_2d_desc_tuple_ = generate_tuple( - [&](auto I) { - return MakeSrc2dDescriptor( - Lengths_, inStridesArray_[I.value], blkGroupSize_, numBlockTileIteration_); - }, - Number{}); - - x_grid_desc_m_k_ = - MakeSrc2dDescriptor(Lengths_, xStrides_, blkGroupSize_, numBlockTileIteration_); - - gamma_grid_desc_m_k_ = - MakeSrc2dDescriptor(Lengths_, gammaStrides_, blkGroupSize_, numBlockTileIteration_); - - beta_grid_desc_m_k_ = - MakeSrc2dDescriptor(Lengths_, betaStrides_, blkGroupSize_, numBlockTileIteration_); - - y_grid_desc_m_k_ = - MakeSrc2dDescriptor(Lengths_, yStrides_, blkGroupSize_, numBlockTileIteration_); - - sweep_once_ = - x_grid_desc_m_k_.GetLength(Number<1>{}) <= KThreadClusterSize * KThreadSliceSize; - - if(!sweep_once_) // if not sweep once, compute memory size for matrix X in lds for - // store Intermediate results - { - int block_TileSize = M_BlockTileSize * reduce_total_length; - x_lds_size_ = block_TileSize * sizeof(XDataType); - } - else - x_lds_size_ = 0; - } - - AccDataType epsilon_; - - InDataTypePointerTuple in_dev_buffers_; - const GammaDataType* p_gamma_; - const BetaDataType* p_beta_; - YDataType* p_y_; - - std::vector Lengths_; - std::array, NumInput> inStridesArray_; - std::vector xStrides_; - std::vector gammaStrides_; - std::vector betaStrides_; - std::vector yStrides_; - - XElementwiseOperation x_elementwise_op_; - YElementwiseOperation y_elementwise_op_; - - int blkGroupSize_; - int numBlockTileIteration_; - size_t gridSize_; - - InGrid2dDescTuple in_grid_2d_desc_tuple_; - GridDesc_M_K x_grid_desc_m_k_; - GridDesc_M_K gamma_grid_desc_m_k_; - GridDesc_M_K beta_grid_desc_m_k_; - GridDesc_M_K y_grid_desc_m_k_; - bool sweep_once_; - int x_lds_size_; - }; - - struct Invoker : public BaseInvoker - { - float Run(const Argument& arg, const StreamConfig& stream_config = StreamConfig{}) - { - const auto kernel_main = - arg.sweep_once_ ? 
kernel_elementwise_layernorm - : kernel_elementwise_layernorm; - - float avg_time = 0; - avg_time += launch_and_time_kernel(stream_config, - kernel_main, - dim3(arg.gridSize_), - dim3(BlockSize), - arg.x_lds_size_, - arg.in_grid_2d_desc_tuple_, - arg.x_grid_desc_m_k_, - arg.gamma_grid_desc_m_k_, - arg.beta_grid_desc_m_k_, - arg.y_grid_desc_m_k_, - arg.numBlockTileIteration_, - arg.epsilon_, - arg.in_dev_buffers_, - arg.p_gamma_, - arg.p_beta_, - arg.p_y_, - arg.x_elementwise_op_, - arg.y_elementwise_op_); - - return (avg_time); - }; - - float Run(const BaseArgument* p_arg, - const StreamConfig& stream_config = StreamConfig{}) override - { - return Run(*dynamic_cast(p_arg), stream_config); - }; - }; - - bool IsSupportedArgument(const BaseArgument* p_arg) override - { - const Argument* p_arg_ = dynamic_cast(p_arg); - - constexpr index_t NumInvariantDim = Rank - NumReduceDim; - - if constexpr(XYSrcVectorDim == 0) - { - if constexpr(NumInvariantDim == 0) - { - return false; - } - else - { - for(int i = 0; i < NumInput; i++) - { - if(p_arg_->inStridesArray_[i][NumInvariantDim - 1] != 1) - return false; - } - - if(p_arg_->inStridesArray_[0][NumInvariantDim - 1] != 1 && - p_arg_->inStridesArray_[1][NumInvariantDim - 1] != 1) - return false; - - if(p_arg_->invariant_lowest_length % XSrcVectorSize != 0) - return false; - }; - } - else - { - for(int i = 0; i < NumInput; i++) - { - if(p_arg_->inStridesArray_[i][Rank - 1] != 1) - return false; - } - - if(p_arg_->Lengths_[Rank - 1] % XSrcVectorSize != 0) - return false; - }; - - if(p_arg_->Lengths_[Rank - 1] % YDstVectorSize != 0) - { - return false; - } - - auto IsScalarPerVectorValid = [](bool isLastDimensionCoalesced, int scalarPerVector) { - bool ret = true; - - if(!isLastDimensionCoalesced) - ret = scalarPerVector == 1; - else - ret = KThreadSliceSize % scalarPerVector == 0; - - return ret; - }; - - if(!IsScalarPerVectorValid(p_arg_->gammaStrides_.back() == 1, GammaSrcVectorSize)) - return false; - - if(!IsScalarPerVectorValid(p_arg_->betaStrides_.back() == 1, BetaSrcVectorSize)) - return false; - - // if fastest dim is not reduced - if constexpr(XYSrcVectorDim == 0) // - { - if(p_arg_->gammaStrides_[NumInvariantDim - 1] != 1) - return (false); - - if(p_arg_->Lengths_[Rank - 1] % GammaSrcVectorSize != 0) - return (false); - } - else // if fastest dim is reduced - { - if(p_arg_->gammaStrides_[Rank - 1] != 1) - return (false); - - if(p_arg_->Lengths_[Rank - 1] % GammaSrcVectorSize != 0) - return (false); - } - - // if fastest dim is not reduced - if constexpr(XYSrcVectorDim == 0) - { - if(p_arg_->betaStrides_[NumInvariantDim - 1] != 1) - return (false); - - if(p_arg_->invariant_lowest_length % BetaSrcVectorSize != 0) - return (false); - } - else // if fastest dim is reduced - { - if(p_arg_->betaStrides_[Rank - 1] != 1) - return (false); - - if(p_arg_->Lengths_[Rank - 1] % BetaSrcVectorSize != 0) - return (false); - } - - return true; - }; - - std::unique_ptr - MakeArgumentPointer(const std::vector lengths, - const std::array, NumInput> inStridesArray, - const std::vector gammaStrides, - const std::vector betaStrides, - const std::vector yStrides, - const std::vector reduceDims, - AccDataType epsilon, - const std::array in_dev_buffers, - const void* p_gamma, - const void* p_beta, - void* p_y, - XElementwiseOperation x_elementwise_op, - YElementwiseOperation y_elementwise_op) override - { - return std::make_unique(lengths, - inStridesArray, - gammaStrides, - betaStrides, - yStrides, - reduceDims, - x_elementwise_op, - y_elementwise_op, - epsilon, - 
in_dev_buffers, - static_cast(p_gamma), - static_cast(p_beta), - static_cast(p_y)); - }; - - std::unique_ptr MakeInvokerPointer() override - { - return std::make_unique(); - }; - - std::string GetTypeString() const override - { - auto str = std::stringstream(); - - // clang-format off - str << "DeviceElementwiseNormalizationImpl<" << BlockSize << ","; - str << "M_C" << MThreadClusterSize << "_S" << MThreadSliceSize << ","; - str << "K_C" << KThreadClusterSize << "_S" << KThreadSliceSize << ","; - str << "XYSrcVectorDim_" << XYSrcVectorDim << ","; - str << "VectorSize_X" << XSrcVectorSize << "_Gamma" << GammaSrcVectorSize << "_Beta" << BetaSrcVectorSize << "_Y" << YDstVectorSize << ">"; - // clang-format on - - return str.str(); - } -}; - -} // namespace device -} // namespace tensor_operation -} // namespace ck diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_elementwise_layernorm_welford_variance.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_elementwise_layernorm_welford_variance.hpp deleted file mode 100644 index 40d75e05a1..0000000000 --- a/include/ck/tensor_operation/gpu/grid/gridwise_elementwise_layernorm_welford_variance.hpp +++ /dev/null @@ -1,500 +0,0 @@ -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. - -#pragma once - -#include "ck/utility/data_type.hpp" -#include "ck/tensor_operation/gpu/element/binary_element_wise_operation.hpp" -#include "ck/tensor_operation/gpu/block/blockwise_welford.hpp" -#include "ck/tensor_operation/gpu/thread/threadwise_welford.hpp" -#include "ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer.hpp" -#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" - -namespace ck { - -// X = Elementwise(input1, input2, input3, ...) 
-// Y = Normalization(X, beta, gamma) -template -struct GridwiseElementwiseLayernormWelfordVariance_mk_to_mk -{ - static_assert((XSrcVectorDim == 0 && MThreadSliceSize % XSrcVectorSize == 0) || - (XSrcVectorDim == 1 && KThreadSliceSize % XSrcVectorSize == 0), - "Invalid thread slice sizes and/or vector sizes configuration, please check!"); - - static_assert((YDstVectorDim == 0 && MThreadSliceSize % YDstVectorSize == 0) || - (YDstVectorDim == 1 && KThreadSliceSize % YDstVectorSize == 0), - "Invalid thread slice sizes and/or vector sizes configuration, please check!"); - - static constexpr index_t NumInput = InDataTypePointerTuple::Size(); - - static constexpr bool reorder_thread_cluster = (XSrcVectorDim == 0); - - using ThreadClusterLengths_M_K = Sequence; - - using ThreadBufferDimAccessOrder = - typename conditional, Sequence<0, 1>>::type; - - using ThreadClusterArrangeOrder = - typename conditional, Sequence<0, 1>>::type; - - static constexpr auto thread_cluster_desc = - make_cluster_descriptor(ThreadClusterLengths_M_K{}, ThreadClusterArrangeOrder{}); - - using ThreadReduceSrcDesc_M_K = decltype(make_naive_tensor_descriptor_packed( - make_tuple(Number{}, Number{}))); - using ThreadReduceDstDesc_M = - decltype(make_naive_tensor_descriptor_packed(make_tuple(Number{}))); - - using ThreadwiseWelford = - ThreadwiseWelford; - - using BlockwiseWelford = BlockwiseWelford; - - static constexpr auto I0 = Number<0>{}; - static constexpr auto I1 = Number<1>{}; - static constexpr auto I2 = Number<2>{}; - - static constexpr index_t M_BlockTileSize = MThreadClusterSize * MThreadSliceSize; - static constexpr index_t K_BlockTileSize = KThreadClusterSize * KThreadSliceSize; - static constexpr index_t K_BlockTileStepSize = KThreadClusterSize * XSrcVectorSize; - - static constexpr auto XThreadBufferNumber = Number{}; - static constexpr auto GammaThreadBufferNumber = Number{}; - static constexpr auto BetaThreadBufferNumber = Number{}; - static constexpr auto YThreadBufferNumber = Number{}; - - __device__ static int GetKPerThread(const GridDesc_M_K& x_grid_desc_m_k, - int thread_k_cluster_id) - { - int kPerBlock = x_grid_desc_m_k.GetTransforms()[I2].GetUpperLengths()[I0]; - int kPerThread = - kPerBlock < K_BlockTileSize ? 
0 : KThreadSliceSize * (kPerBlock / K_BlockTileSize); - int kPerBlockTail = kPerBlock - kPerThread * KThreadClusterSize; - - if(kPerBlockTail > 0) - { - static_for<0, XThreadBufferNumber, 1>{}([&](auto i) { - int thread_max_len = - (thread_k_cluster_id + 1) * XSrcVectorSize + K_BlockTileStepSize * i; - int delta = thread_max_len - kPerBlockTail; - delta = math::clamp(thread_max_len - kPerBlockTail, 0, XSrcVectorSize); - kPerThread += XSrcVectorSize - delta; - }); - } - - return kPerThread; - } - - __device__ static void Run(const InGrid2dDescTuple in_grid_2d_desc_tuple, - const GridDesc_M_K& x_grid_desc_m_k, - const GridDesc_M_K& gamma_grid_desc_m_k, - const GridDesc_M_K& beta_grid_desc_m_k, - const GridDesc_M_K& y_grid_desc_m_k, - index_t num_k_block_tile_iteration, - AccDataType epsilon, - const InDataTypePointerTuple p_in_global_tuple, - XDataType* const __restrict__ p_x_lds, - const GammaDataType* const __restrict__ p_gamma_global, - const BetaDataType* const __restrict__ p_beta_global, - YDataType* const __restrict__ p_y_global, - const XElementwiseOperation x_elementwise_op, - const YElementwiseOperation y_elementwise_op) - { - if constexpr(SweepOnce) - { - num_k_block_tile_iteration = 1; - } - - const index_t thread_local_id = get_thread_local_1d_id(); - const index_t block_global_id = get_block_1d_id(); - const index_t grid_size = get_grid_size(); - - auto in_global_buf_tuple = generate_tuple( - [&](auto I) { - static_assert(in_grid_2d_desc_tuple[I].GetNumOfDimension() == - 2); // matrix dimension - - return make_dynamic_buffer( - p_in_global_tuple[I], in_grid_2d_desc_tuple[I].GetElementSpaceSize()); - }, - Number{}); - - auto y_global_val_buf = make_dynamic_buffer( - p_y_global, y_grid_desc_m_k.GetElementSpaceSize()); - - auto x_lds_val_buf = make_dynamic_buffer( - p_x_lds, x_grid_desc_m_k.GetElementSpaceSize() / grid_size); - - auto in_thread_buf_tuple = generate_tuple( - [&](auto) { - return generate_tuple( - [&](auto) { - return StaticBuffer{}; - }, - Number{}); - }, - Number{}); - - auto x_thread_buf = generate_tuple( - [&](auto) { - return StaticBuffer{}; - }, - Number{}); - - auto gamma_thread_buf = generate_tuple( - [&](auto) { - return StaticBuffer{}; - }, - Number{}); - - auto beta_thread_buf = generate_tuple( - [&](auto) { - return StaticBuffer{}; - }, - Number{}); - - auto y_thread_buf = generate_tuple( - [&](auto) { - return StaticBuffer{}; - }, - Number{}); - - StaticBuffer mean_thread_buf; - StaticBuffer var_thread_buf; - - const auto thread_cluster_idx = - thread_cluster_desc.CalculateBottomIndex(make_multi_index(thread_local_id)); - - const auto thread_m_cluster_id = thread_cluster_idx[I0]; - const auto thread_k_cluster_id = thread_cluster_idx[I1]; - - using ThreadBufferLengths_M_K = Sequence; - - constexpr auto thread_buffer_desc_m_k = make_naive_tensor_descriptor_packed( - make_tuple(Number{}, Number{})); - - auto in_global_load_tuple = generate_tuple( - [&](auto I) { - using DataTypePointer = remove_cvref_t; - using DataType = remove_cv_t>; - - return ThreadwiseTensorSliceTransfer_v2{ - in_grid_2d_desc_tuple[I], - make_multi_index(block_global_id * M_BlockTileSize + - thread_m_cluster_id * MThreadSliceSize, - thread_k_cluster_id * XSrcVectorSize)}; - }, - Number{}); - - auto threadwise_x_load = ThreadwiseTensorSliceTransfer_v2( - x_grid_desc_m_k, - make_multi_index(thread_m_cluster_id * MThreadSliceSize, - thread_k_cluster_id * XSrcVectorSize)); - - auto threadwise_gamma_load = - ThreadwiseTensorSliceTransfer_v2( - gamma_grid_desc_m_k, - 
make_multi_index(block_global_id * M_BlockTileSize + - thread_m_cluster_id * MThreadSliceSize, - thread_k_cluster_id * GammaSrcVectorSize)); - - auto threadwise_beta_load = - ThreadwiseTensorSliceTransfer_v2( - beta_grid_desc_m_k, - make_multi_index(block_global_id * M_BlockTileSize + - thread_m_cluster_id * MThreadSliceSize, - thread_k_cluster_id * BetaSrcVectorSize)); - - using PassThrough = tensor_operation::element_wise::PassThrough; - PassThrough pass_through_op; - auto threadwise_x_store = - ThreadwiseTensorSliceTransfer_v1r3( - x_grid_desc_m_k, - make_multi_index(thread_m_cluster_id * MThreadSliceSize, - thread_k_cluster_id * XSrcVectorSize), - pass_through_op); - - auto threadwise_y_store = - ThreadwiseTensorSliceTransfer_v1r3( - y_grid_desc_m_k, - make_multi_index(block_global_id * M_BlockTileSize + - thread_m_cluster_id * MThreadSliceSize, - thread_k_cluster_id * YDstVectorSize), - y_elementwise_op); - - // Copy x from Cache - // one pass: fwd, second pass: bwd - constexpr auto thread_copy_fwd_step_m_k = make_multi_index(0, K_BlockTileStepSize); - constexpr auto thread_copy_bwd_step_m_k = - make_multi_index(0, SweepOnce ? 0 : -K_BlockTileSize); - - const auto gamma_global_val_buf = make_dynamic_buffer( - p_gamma_global, gamma_grid_desc_m_k.GetElementSpaceSize()); - - const auto beta_global_val_buf = make_dynamic_buffer( - p_beta_global, beta_grid_desc_m_k.GetElementSpaceSize()); - - auto threadwise_welford = ThreadwiseWelford(); - threadwise_welford.max_count_ = GetKPerThread(x_grid_desc_m_k, thread_k_cluster_id); - - static_for<0, MThreadSliceSize, 1>{}([&](auto I) { - mean_thread_buf(I) = type_convert(0.0f); - var_thread_buf(I) = type_convert(0.0f); - }); - - for(index_t reducedTiles = 0; reducedTiles < num_k_block_tile_iteration; ++reducedTiles) - { - static_for<0, XThreadBufferNumber, 1>{}([&](auto iK0) { - static_for<0, NumInput, 1>{}([&](auto I) { // input load loop - in_global_load_tuple(I).Run(in_grid_2d_desc_tuple[I], - in_global_buf_tuple[I], - thread_buffer_desc_m_k, - make_tuple(I0, I0), - in_thread_buf_tuple(iK0)(I)); - - in_global_load_tuple(I).MoveSrcSliceWindow(in_grid_2d_desc_tuple[I], - thread_copy_fwd_step_m_k); - }); - - static_for<0, MThreadSliceSize, 1>{}([&](auto iM) { // input add loop - static_for<0, XSrcVectorSize, 1>{}([&](auto iK1) { - constexpr auto offset_m_k = - thread_buffer_desc_m_k.CalculateOffset(make_tuple(iM, iK1)); - - // get reference to in data - const auto in_data_refs = generate_tie( - // return type should be lvalue - [&](auto I) -> const auto& { - return in_thread_buf_tuple(iK0)(I)(Number{}); - }, - Number{}); - - // get reference to dst data - auto out_data_refs = generate_tie( - // return type should be lvalue - [&](auto) -> auto& { return x_thread_buf(iK0)(Number{}); }, - I1); - - unpack2(x_elementwise_op, out_data_refs, in_data_refs); - }); - }); - threadwise_welford.Run(x_thread_buf[iK0], mean_thread_buf, var_thread_buf); - - if constexpr(!SweepOnce) - { - threadwise_x_store.Run(thread_buffer_desc_m_k, - make_tuple(I0, I0), - x_thread_buf(iK0), - x_grid_desc_m_k, - x_lds_val_buf); - threadwise_x_store.MoveDstSliceWindow(x_grid_desc_m_k, - thread_copy_fwd_step_m_k); - } - }); - } - - static_for<0, MThreadSliceSize, 1>{}([&](auto I) { - if constexpr(I > 0) - block_sync_lds(); - - int count = threadwise_welford.cur_count_; - BlockwiseWelford::Run(mean_thread_buf(I), var_thread_buf(I), count); - }); - - auto thread_copy_tail_m_k = - (num_k_block_tile_iteration - 1) * XThreadBufferNumber * thread_copy_fwd_step_m_k; - - if 
constexpr(!SweepOnce) - threadwise_x_load.MoveSrcSliceWindow(x_grid_desc_m_k, thread_copy_tail_m_k); - threadwise_gamma_load.MoveSrcSliceWindow(gamma_grid_desc_m_k, thread_copy_tail_m_k); - threadwise_beta_load.MoveSrcSliceWindow(beta_grid_desc_m_k, thread_copy_tail_m_k); - threadwise_y_store.MoveDstSliceWindow(y_grid_desc_m_k, thread_copy_tail_m_k); - - for(index_t reducedTiles = 0; reducedTiles < num_k_block_tile_iteration; ++reducedTiles) - { - if constexpr(!SweepOnce) - { - static_for<0, XThreadBufferNumber, 1>{}([&](auto i) { - threadwise_x_load.Run(x_grid_desc_m_k, - x_lds_val_buf, - thread_buffer_desc_m_k, - make_tuple(I0, I0), - x_thread_buf(i)); - threadwise_x_load.MoveSrcSliceWindow(x_grid_desc_m_k, thread_copy_fwd_step_m_k); - }); - } - - static_for<0, GammaThreadBufferNumber, 1>{}([&](auto i) { - threadwise_gamma_load.Run(gamma_grid_desc_m_k, - gamma_global_val_buf, - thread_buffer_desc_m_k, - make_tuple(I0, I0), - gamma_thread_buf(i)); - threadwise_gamma_load.MoveSrcSliceWindow(gamma_grid_desc_m_k, - thread_copy_fwd_step_m_k); - }); - - static_for<0, MThreadSliceSize, 1>{}([&](auto iM) { - auto divisor = 1 / __builtin_amdgcn_sqrtf(var_thread_buf(iM) + epsilon); - static_for<0, XThreadBufferNumber, 1>{}([&](auto iK0) { - static_for<0, XSrcVectorSize, 1>{}([&](auto iK1) { - constexpr auto offset_m_k = - thread_buffer_desc_m_k.CalculateOffset(make_tuple(iM, iK1)); - - // normalize - y_thread_buf(iK0)(Number{}) = - (x_thread_buf(iK0)(Number{}) - mean_thread_buf(iM)) * - divisor; - - // gamma - y_thread_buf(iK0)(Number{}) = - y_thread_buf(iK0)(Number{}) * - gamma_thread_buf(iK0)(Number{}); - }); - }); - }); - - static_for<0, BetaThreadBufferNumber, 1>{}([&](auto i) { - threadwise_beta_load.Run(beta_grid_desc_m_k, - beta_global_val_buf, - thread_buffer_desc_m_k, - make_tuple(I0, I0), - beta_thread_buf(i)); - threadwise_beta_load.MoveSrcSliceWindow(beta_grid_desc_m_k, - thread_copy_fwd_step_m_k); - }); - - static_for<0, MThreadSliceSize, 1>{}([&](auto iM) { - static_for<0, XThreadBufferNumber, 1>{}([&](auto iK0) { - static_for<0, XSrcVectorSize, 1>{}([&](auto iK1) { - constexpr auto offset_m_k = - thread_buffer_desc_m_k.CalculateOffset(make_tuple(iM, iK1)); - - // beta - y_thread_buf(iK0)(Number{}) = - y_thread_buf(iK0)(Number{}) + - beta_thread_buf(iK0)(Number{}); - }); - }); - }); - - static_for<0, YThreadBufferNumber, 1>{}([&](auto i) { - threadwise_y_store.Run(thread_buffer_desc_m_k, - make_tuple(I0, I0), - y_thread_buf(i), - y_grid_desc_m_k, - y_global_val_buf); - threadwise_y_store.MoveDstSliceWindow(y_grid_desc_m_k, thread_copy_fwd_step_m_k); - }); - - if constexpr(!SweepOnce) - threadwise_x_load.MoveSrcSliceWindow(x_grid_desc_m_k, 2 * thread_copy_bwd_step_m_k); - threadwise_gamma_load.MoveSrcSliceWindow(gamma_grid_desc_m_k, - 2 * thread_copy_bwd_step_m_k); - threadwise_beta_load.MoveSrcSliceWindow(beta_grid_desc_m_k, - 2 * thread_copy_bwd_step_m_k); - threadwise_y_store.MoveDstSliceWindow(y_grid_desc_m_k, 2 * thread_copy_bwd_step_m_k); - } - } -}; - -} // namespace ck diff --git a/library/include/ck/library/tensor_operation_instance/gpu/elementwise_normalization.hpp b/library/include/ck/library/tensor_operation_instance/gpu/elementwise_normalization.hpp deleted file mode 100644 index c87ae159be..0000000000 --- a/library/include/ck/library/tensor_operation_instance/gpu/elementwise_normalization.hpp +++ /dev/null @@ -1,79 +0,0 @@ -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
-
-#pragma once
-
-#include <vector>
-
-#include "ck/ck.hpp"
-#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
-#include "ck/tensor_operation/gpu/device/device_elementwise_normalization.hpp"
-#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
-
-#include "ck/library/tensor_operation_instance/device_operation_instance_factory.hpp"
-
-namespace ck {
-namespace tensor_operation {
-namespace device {
-namespace instance {
-
-// FP16
-void add_device_elementwise_normalization_rank_2_1_f16_instances(
-    std::vector<std::unique_ptr<DeviceElementwiseNormalization<ck::Tuple<F16, F16>,
-                                                               F16,
-                                                               F16,
-                                                               F32,
-                                                               F16,
-                                                               element_wise::Add,
-                                                               PassThrough,
-                                                               2,
-                                                               1>>>&);
-
-template <typename InDataTypeTuple,
-          typename GammaDataType,
-          typename BetaDataType,
-          typename AccDataType,
-          typename YDataType,
-          typename XElementwiseOperation,
-          typename YElementwiseOperation,
-          index_t Rank,
-          index_t NumReduceDim>
-struct DeviceOperationInstanceFactory<
-    ck::tensor_operation::device::DeviceElementwiseNormalization<InDataTypeTuple,
-                                                                 GammaDataType,
-                                                                 BetaDataType,
-                                                                 AccDataType,
-                                                                 YDataType,
-                                                                 XElementwiseOperation,
-                                                                 YElementwiseOperation,
-                                                                 Rank,
-                                                                 NumReduceDim>>
-{
-    using DeviceOp = DeviceElementwiseNormalization<InDataTypeTuple,
-                                                    GammaDataType,
-                                                    BetaDataType,
-                                                    AccDataType,
-                                                    YDataType,
-                                                    XElementwiseOperation,
-                                                    YElementwiseOperation,
-                                                    Rank,
-                                                    NumReduceDim>;
-
-    static auto GetInstances()
-    {
-        std::vector<std::unique_ptr<DeviceOp>> op_ptrs;
-
-        if constexpr(is_same_v<InDataTypeTuple, ck::Tuple<F16, F16>> &&
-                     is_same_v<AccDataType, F32> && is_same_v<YDataType, F16>)
-        {
-            if constexpr(Rank == 2 && NumReduceDim == 1)
-            {
-                add_device_elementwise_normalization_rank_2_1_f16_instances(op_ptrs);
-            }
-        }
-
-        return op_ptrs;
-    }
-};
-
-} // namespace instance
-} // namespace device
-} // namespace tensor_operation
-} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/elementwise_normalization/CMakeLists.txt b/library/src/tensor_operation_instance/gpu/elementwise_normalization/CMakeLists.txt
deleted file mode 100644
index 0c7cc2cd31..0000000000
--- a/library/src/tensor_operation_instance/gpu/elementwise_normalization/CMakeLists.txt
+++ /dev/null
@@ -1,3 +0,0 @@
-add_instance_library(device_elementwise_normalization_instance
-    device_elementwise_normalization_f16_instance.cpp
-)
diff --git a/library/src/tensor_operation_instance/gpu/elementwise_normalization/device_elementwise_normalization_f16_instance.cpp b/library/src/tensor_operation_instance/gpu/elementwise_normalization/device_elementwise_normalization_f16_instance.cpp
deleted file mode 100644
index 7f15372ed9..0000000000
--- a/library/src/tensor_operation_instance/gpu/elementwise_normalization/device_elementwise_normalization_f16_instance.cpp
+++ /dev/null
@@ -1,54 +0,0 @@
-// SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
-
-#include "ck/ck.hpp"
-#include "ck/tensor_operation/gpu/device/impl/device_elementwise_normalization_impl.hpp"
-#include "ck/utility/data_type.hpp"
-
-#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
-
-namespace ck {
-namespace tensor_operation {
-namespace device {
-namespace instance {
-
-using F16 = ck::half_t;
-using F32 = float;
-
-using Add  = ck::tensor_operation::element_wise::Add;
-using Pass = ck::tensor_operation::element_wise::PassThrough;
-
-template <typename XElementwise, typename YElementwise, index_t Rank, index_t Reduce>
-// clang-format off
-using device_elementwise_normalization_f16_instances =
-    std::tuple <
-        // XDataType, GammaDataType, BetaDataType, AccDataType, YDataType, Rank, NumReduceDim, BlockSize, MThreadClusterSize, KThreadClusterSize, MThreadSliceSize, KThreadSliceSize, XYSrcVectorDim, XSrcVectorSize, GammaSrcVectorDim, GammaSrcVectorSize, BetaSrcVectorDim, BetaSrcVectorSize, YDstVectorSize>
-        DeviceElementwiseNormalizationImpl<ck::Tuple<F16, F16>, F16, F16, F32, F16, XElementwise, YElementwise, Rank, Reduce, 256, 8, 32, 1, 8, 1, 1, 1, 1, 1, 1, 1>, // fallback kernel
-        DeviceElementwiseNormalizationImpl<ck::Tuple<F16, F16>, F16, F16, F32, F16, XElementwise, YElementwise, Rank, Reduce, 256, 8, 32, 1, 8, 1, 2, 1, 2, 1, 2, 2>, // fallback kernel
-        DeviceElementwiseNormalizationImpl<ck::Tuple<F16, F16>, F16, F16, F32, F16, XElementwise, YElementwise, Rank, Reduce, 256, 8, 32, 1, 8, 1, 4, 1, 4, 1, 4, 4>, // fallback kernel
-        DeviceElementwiseNormalizationImpl<ck::Tuple<F16, F16>, F16, F16, F32, F16, XElementwise, YElementwise, Rank, Reduce, 256, 8, 32, 1, 8, 1, 8, 1, 8, 1, 8, 8>,
-        DeviceElementwiseNormalizationImpl<ck::Tuple<F16, F16>, F16, F16, F32, F16, XElementwise, YElementwise, Rank, Reduce, 256, 4, 64, 1, 8, 1, 8, 1, 8, 1, 8, 8>,
-        DeviceElementwiseNormalizationImpl<ck::Tuple<F16, F16>, F16, F16, F32, F16, XElementwise, YElementwise, Rank, Reduce, 256, 2, 128, 1, 8, 1, 8, 1, 8, 1, 8, 8>,
-        DeviceElementwiseNormalizationImpl<ck::Tuple<F16, F16>, F16, F16, F32, F16, XElementwise, YElementwise, Rank, Reduce, 256, 2, 128, 1, 16, 1, 8, 1, 8, 1, 8, 8>,
-        DeviceElementwiseNormalizationImpl<ck::Tuple<F16, F16>, F16, F16, F32, F16, XElementwise, YElementwise, Rank, Reduce, 256, 2, 128, 1, 32, 1, 8, 1, 8, 1, 8, 8>,
-        DeviceElementwiseNormalizationImpl<ck::Tuple<F16, F16>, F16, F16, F32, F16, XElementwise, YElementwise, Rank, Reduce, 256, 1, 256, 1, 8, 1, 8, 1, 8, 1, 8, 8>,
-        DeviceElementwiseNormalizationImpl<ck::Tuple<F16, F16>, F16, F16, F32, F16, XElementwise, YElementwise, Rank, Reduce, 256, 1, 256, 1, 16, 1, 8, 1, 8, 1, 8, 8>,
-        DeviceElementwiseNormalizationImpl<ck::Tuple<F16, F16>, F16, F16, F32, F16, XElementwise, YElementwise, Rank, Reduce, 256, 1, 256, 1, 32, 1, 8, 1, 8, 1, 8, 8>,
-        DeviceElementwiseNormalizationImpl<ck::Tuple<F16, F16>, F16, F16, F32, F16, XElementwise, YElementwise, Rank, Reduce, 1024, 1, 1024, 1, 32, 1, 8, 1, 8, 1, 8, 8>,
-        DeviceElementwiseNormalizationImpl<ck::Tuple<F16, F16>, F16, F16, F32, F16, XElementwise, YElementwise, Rank, Reduce, 1024, 1, 1024, 1, 8, 1, 2, 1, 2, 1, 2, 2>
-    >;
-// clang-format on
-
-void add_device_elementwise_normalization_rank_2_1_f16_instances(
-    std::vector<std::unique_ptr<
-        DeviceElementwiseNormalization<ck::Tuple<F16, F16>, F16, F16, F32, F16, Add, Pass, 2, 1>>>&
-        instances)
-{
-    add_device_operation_instances(
-        instances, device_elementwise_normalization_f16_instances<Add, Pass, 2, 1>{});
-}
-
-} // namespace instance
-} // namespace device
-} // namespace tensor_operation
-} // namespace ck
diff --git a/profiler/include/profile_elementwise_layernorm_impl.hpp b/profiler/include/profile_elementwise_layernorm_impl.hpp
deleted file mode 100644
index f5135005f2..0000000000
--- a/profiler/include/profile_elementwise_layernorm_impl.hpp
+++ /dev/null
@@ -1,264 +0,0 @@
-// SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
-
-#pragma once
-
-#include <iomanip>
-
-#include "ck/ck.hpp"
-
-#include "ck/library/tensor_operation_instance/gpu/elementwise_normalization.hpp"
-
-#include "ck/library/utility/check_err.hpp"
-#include "ck/library/utility/device_memory.hpp"
-#include "ck/library/utility/host_tensor.hpp"
-#include "ck/library/utility/host_tensor_generator.hpp"
-#include "ck/library/reference_tensor_operation/cpu/reference_layernorm.hpp"
-
-namespace ck {
-namespace profiler {
-
-template <typename HostTensorC, typename HostTensorA, typename HostTensorB, typename Functor>
-void host_elementwise2D(HostTensorC& C,
-                        const HostTensorA& A,
-                        const HostTensorB& B,
-                        const std::vector<std::size_t>& shape,
-                        Functor functor)
-{
-    using ctype = ck::remove_reference_t<decltype(C(0, 0))>;
-
-    for(std::size_t m = 0; m < shape[0]; ++m)
-        for(std::size_t n = 0; n < shape[1]; ++n)
-        {
-            auto a_val  = A(m, n);
-            auto b_val  = B(m, n);
-            ctype c_val = 0;
-            functor(c_val, a_val, b_val);
-            C(m, n) = c_val;
-        }
-}
-
-template <typename ADataType,
-          typename BDataType,
-          typename GammaDataType,
-          typename BetaDataType,
-          typename AccDataType,
-          typename YDataType>
-bool profile_elementwise_layernorm_impl(int do_verification,
-                                        int init_method,
-                                        bool do_log,
-                                        bool time_kernel,
-                                        std::vector<index_t> length)
-{
-    using Add         = ck::tensor_operation::element_wise::Add;
-    using PassThrough = ck::tensor_operation::element_wise::PassThrough;
-
-    if(length.size() != 2)
-        return false;
-
-    index_t M      = length[0];
-    index_t N      = length[1];
-    index_t Stride = N;
-
-    constexpr int Rank         = 2;
-    constexpr int NumReduceDim = 1;
-
-    std::vector<index_t> reduce_dim      = {1};
-    std::vector<index_t> gammaBetaLength = {N};
-    std::vector<index_t> gammaBetaStride = {0, 1};
-
-    auto f_host_tensor_descriptor2d = [](std::size_t row, std::size_t col, std::size_t stride) {
-        return HostTensorDescriptor(std::vector<std::size_t>({row, col}),
-                                    std::vector<std::size_t>({stride, 1}));
-    };
-
-    Tensor<ADataType> a(length);
-    Tensor<BDataType> b(length);
-    Tensor<GammaDataType> gamma(gammaBetaLength);
-    Tensor<BetaDataType> beta(gammaBetaLength);
-    Tensor<YDataType> y(length);
-    Tensor<YDataType> host_y(length);
-
-    switch(init_method)
-    {
-    case 0:
-        a.GenerateTensorValue(GeneratorTensor_1<ADataType>{});
-        b.GenerateTensorValue(GeneratorTensor_1<BDataType>{});
-        gamma.GenerateTensorValue(GeneratorTensor_1<GammaDataType>{});
-        beta.GenerateTensorValue(GeneratorTensor_1<BetaDataType>{});
-        break;
-    case 1:
-        a.GenerateTensorValue(GeneratorTensor_2<ADataType>{-5, 5});
-        b.GenerateTensorValue(GeneratorTensor_2<BDataType>{-5, 5});
-        gamma.GenerateTensorValue(GeneratorTensor_2<GammaDataType>{-5, 5});
-        beta.GenerateTensorValue(GeneratorTensor_2<BetaDataType>{-5, 5});
-        break;
-    default:
-        a.GenerateTensorValue(GeneratorTensor_3<ADataType>{0, 1});
-        b.GenerateTensorValue(GeneratorTensor_3<BDataType>{0, 1});
-        gamma.GenerateTensorValue(GeneratorTensor_3<GammaDataType>{-0.5, 0.5});
-        beta.GenerateTensorValue(GeneratorTensor_3<BetaDataType>{-0.5, 0.5});
-    }
-
-    DeviceMem a_dev(sizeof(ADataType) * a.mDesc.GetElementSpaceSize());
-    DeviceMem b_dev(sizeof(BDataType) * b.mDesc.GetElementSpaceSize());
-    DeviceMem gamma_dev(sizeof(GammaDataType) * gamma.mDesc.GetElementSpaceSize());
-    DeviceMem beta_dev(sizeof(BetaDataType) * beta.mDesc.GetElementSpaceSize());
-    DeviceMem y_dev(sizeof(YDataType) * y.mDesc.GetElementSpaceSize());
-
-    a_dev.ToDevice(a.mData.data());
-    b_dev.ToDevice(b.mData.data());
-    gamma_dev.ToDevice(gamma.mData.data());
-    beta_dev.ToDevice(beta.mData.data());
-
-    std::array<const void*, 2> input = {a_dev.GetDeviceBuffer(), b_dev.GetDeviceBuffer()};
-
-    // add device normalization instances
-    using DeviceOp = ck::tensor_operation::device::DeviceElementwiseNormalization<
-        ck::Tuple<ADataType, BDataType>,
-        GammaDataType,
-        BetaDataType,
-        AccDataType,
-        YDataType,
-        Add,
-        PassThrough,
-        2,
-        1>;
-
-    // get device op instances
-    const auto instance_ptrs =
-        ck::tensor_operation::device::instance::DeviceOperationInstanceFactory<
-            DeviceOp>::GetInstances();
-
-    std::cout << "found " << instance_ptrs.size() << " instances" << std::endl;
-
-    std::string best_instance_name;
-    float best_avg_time   = std::numeric_limits<float>::max();
-    float best_gb_per_sec = 0;
-
-    if(do_verification)
-    {
-        using XDataType = ADataType;
-        std::vector<std::size_t> mn = {static_cast<std::size_t>(M),
-                                       static_cast<std::size_t>(N)};
-        Tensor<XDataType> x(f_host_tensor_descriptor2d(M, N, Stride));
-        host_elementwise2D<Tensor<XDataType>, Tensor<ADataType>, Tensor<BDataType>, Add>(
-            x, a, b, mn, Add{});
-
-        using ReferenceInstance = ck::tensor_operation::host::ReferenceLayernorm<XDataType,
-                                                                                 GammaDataType,
-                                                                                 BetaDataType,
-                                                                                 YDataType,
-                                                                                 AccDataType,
-                                                                                 PassThrough,
-                                                                                 Rank,
-                                                                                 NumReduceDim>;
-
-        ReferenceInstance ref;
-        auto ref_argument =
-            ref.MakeArgument(x, gamma, beta, host_y, PassThrough{}, {M, N}, {1}, 1e-4);
-        auto ref_invoker = ref.MakeInvoker();
-        ref_invoker.Run(ref_argument);
-    }
-
-    int num_kernel = 0;
-
-    for(auto& inst_ptr : instance_ptrs)
-    {
-        auto argument_ptr = inst_ptr->MakeArgumentPointer(
-            length,
-            {
-                std::vector<index_t>{a.mDesc.GetStrides().begin(), a.mDesc.GetStrides().end()},
-                std::vector<index_t>{b.mDesc.GetStrides().begin(), b.mDesc.GetStrides().end()},
-            },
-            gammaBetaStride,
-            gammaBetaStride,
-            std::vector<index_t>{y.mDesc.GetStrides().begin(), y.mDesc.GetStrides().end()},
-            reduce_dim,
-            1e-4,
-            input,
-            gamma_dev.GetDeviceBuffer(),
-            beta_dev.GetDeviceBuffer(),
-            y_dev.GetDeviceBuffer(),
-            Add{},
-            PassThrough{});
-
-        if(inst_ptr->IsSupportedArgument(argument_ptr.get()))
-        {
-            ++num_kernel;
-        }
-        else
-        {
-            continue;
-        }
-
-        auto invoker_ptr = inst_ptr->MakeInvokerPointer();
-
-        float avg_time = invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, time_kernel});
-
-        std::size_t num_bytes = a.mDesc.GetElementSize() * sizeof(ADataType) +
-                                b.mDesc.GetElementSize() * sizeof(BDataType) +
-                                gamma.mDesc.GetElementSize() * sizeof(GammaDataType) +
-                                beta.mDesc.GetElementSize() * sizeof(BetaDataType) +
-                                y.mDesc.GetElementSize() * sizeof(YDataType);
-
-        float gb_per_sec = num_bytes / 1.E6 / avg_time;
-
-        if(time_kernel)
-            std::cout << "Perf: " << std::setw(10) << avg_time << " ms, " << gb_per_sec
-                      << " GB/s, " << inst_ptr->GetTypeString() << std::endl;
-
-        if(avg_time < best_avg_time)
-        {
-            best_instance_name = inst_ptr->GetTypeString();
-            best_avg_time      = avg_time;
-            best_gb_per_sec    = gb_per_sec;
-        }
-
-        if(do_verification)
-        {
-            y_dev.FromDevice(y.mData.data());
-
-            bool pass =
-                ck::utils::check_err(y.mData, host_y.mData, "Error: Incorrect results", 1e-3, 1e-3);
-
-            if(do_log)
-            {
-                LogRangeAsType<float>(std::cout << "a : ", a.mData, ",") << std::endl;
-                LogRangeAsType<float>(std::cout << "b : ", b.mData, ",") << std::endl;
-                LogRangeAsType<float>(std::cout << "host_y : ", host_y.mData, ",") << std::endl;
-                LogRangeAsType<float>(std::cout << "y : ", y.mData, ",") << std::endl;
-            }
-
-            if(!pass)
-            {
-                std::cout << inst_ptr->GetTypeString() << " failed verification: ";
-                LogRange(std::cout << "lengths = [", length, ", ") << "]." << std::endl;
-                return false;
-            }
-            else
-            {
-                if(time_kernel)
-                    std::cout << "pass" << std::endl;
-            }
-        }
-    }
-
-    if(time_kernel)
-    {
-        LogRange(std::cout << "length = ", length, ",") << ", ";
-        std::cout << "num_kernel = " << num_kernel << ", best perf = " << best_avg_time << " ms, "
-                  << best_gb_per_sec << " GB/s, " << best_instance_name << std::endl;
-    }
-
-    if(num_kernel == 0)
-    {
-        std::cout << "Error: No kernel is tested" << std::endl;
-        return false;
-    }
-
-    return true;
-}
-
-} // namespace profiler
-} // namespace ck
diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt
index cbe2937ef4..e1b0b9c6e6 100644
--- a/test/CMakeLists.txt
+++ b/test/CMakeLists.txt
@@ -52,4 +52,3 @@ add_subdirectory(block_to_ctile_map)
 add_subdirectory(softmax)
 add_subdirectory(normalization)
 add_subdirectory(data_type)
-add_subdirectory(elementwise_normalization)
diff --git a/test/elementwise_normalization/CMakeLists.txt b/test/elementwise_normalization/CMakeLists.txt
deleted file mode 100644
index a20eb26325..0000000000
--- a/test/elementwise_normalization/CMakeLists.txt
+++ /dev/null
@@ -1,7 +0,0 @@
-add_custom_target(test_elementwise_normalization)
-
-add_gtest_executable(test_elementwise_layernorm_fp16 test_elementwise_layernorm_fp16.cpp)
-
-target_link_libraries(test_elementwise_layernorm_fp16 PRIVATE utility device_elementwise_normalization_instance)
-
-add_dependencies(test_elementwise_normalization test_elementwise_layernorm_fp16)
diff --git a/test/elementwise_normalization/test_elementwise_layernorm_fp16.cpp b/test/elementwise_normalization/test_elementwise_layernorm_fp16.cpp
deleted file mode 100644
index f01e963bdb..0000000000
--- a/test/elementwise_normalization/test_elementwise_layernorm_fp16.cpp
+++ /dev/null
@@ -1,47 +0,0 @@
-// SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
-
-#include "gtest/gtest.h"
-#include "profiler/include/profile_elementwise_layernorm_impl.hpp"
-
-using F16 = ck::half_t;
-using F32 = float;
-using ck::index_t;
-
-template <typename Tuple>
-class TestElementwiseLayernorm : public ::testing::Test
-{
-    protected:
-    using ADataType     = std::tuple_element_t<0, Tuple>;
-    using BDataType     = std::tuple_element_t<1, Tuple>;
-    using GammaDataType = std::tuple_element_t<2, Tuple>;
-    using BetaDataType  = std::tuple_element_t<3, Tuple>;
-    using AccDataType   = std::tuple_element_t<4, Tuple>;
-    using YDataType     = std::tuple_element_t<5, Tuple>;
-
-    void Run()
-    {
-        // M, N
-        std::vector<std::vector<index_t>> lengths = {
-            {1, 1}, {25, 16}, {39, 777}, {100, 200}, {1024, 1024}, {48 * 256, 2048}};
-
-        for(auto length : lengths)
-        {
-            bool success = ck::profiler::profile_elementwise_layernorm_impl<ADataType,
-                                                                            BDataType,
-                                                                            GammaDataType,
-                                                                            BetaDataType,
-                                                                            AccDataType,
-                                                                            YDataType>(
-                true, 2, false, false, length);
-            EXPECT_TRUE(success);
-        }
-    }
-};
-
-using KernelTypes = ::testing::Types<
-    // ADataType, BDataType, GammaDataType, BetaDataType, AccDataType, YDataType>
-    std::tuple<F16, F16, F16, F16, F32, F16>>;
-
-TYPED_TEST_SUITE(TestElementwiseLayernorm, KernelTypes);
-TYPED_TEST(TestElementwiseLayernorm, Test_FP16) { this->Run(); }
diff --git a/test/normalization/CMakeLists.txt b/test/normalization/CMakeLists.txt
index 4890f2f751..ab6e2d1cd1 100644
--- a/test/normalization/CMakeLists.txt
+++ b/test/normalization/CMakeLists.txt
@@ -3,9 +3,9 @@ add_custom_target(test_layernorm)
 add_gtest_executable(test_layernorm2d_fp32 test_layernorm2d_fp32.cpp)
 add_gtest_executable(test_layernorm2d_fp16 test_layernorm2d_fp16.cpp)
 add_gtest_executable(test_groupnorm_fp16 test_groupnorm_fp16.cpp)
-add_gtest_executable(test_groupnorm_fp32 test_groupnorm_fp32.cpp)
+add_gtest_executable(test_groupnorm_fp32 test_groupnorm_fp32.cpp)
 
-target_link_libraries(test_layernorm2d_fp32 PRIVATE utility)
+target_link_libraries(test_layernorm2d_fp32 PRIVATE utility)
 target_link_libraries(test_layernorm2d_fp16 PRIVATE utility)
 target_link_libraries(test_groupnorm_fp16 PRIVATE utility device_normalization_instance)
 target_link_libraries(test_groupnorm_fp32 PRIVATE utility device_normalization_instance)
@@ -14,3 +14,4 @@ add_dependencies(test_layernorm test_layernorm2d_fp32)
 add_dependencies(test_layernorm test_layernorm2d_fp16)
 add_dependencies(test_layernorm test_groupnorm_fp16)
 add_dependencies(test_layernorm test_groupnorm_fp32)
+
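
Reviewer note on the algorithm being removed: the gridwise kernel deleted above accumulates each row's mean and variance with Welford's one-pass update (via ThreadwiseWelford/BlockwiseWelford) before applying gamma and beta. For reference, a minimal host-side sketch of that update rule follows. This is an illustration only, not code from this repository; WelfordState, welford_update, and the other names are hypothetical.

#include <cmath>
#include <cstddef>

// Running Welford state: sample count, running mean, and the sum of
// squared deviations from the current mean (commonly called M2).
struct WelfordState
{
    std::size_t count = 0;
    float mean        = 0.f;
    float m2          = 0.f;
};

// One Welford step; numerically stabler than accumulating sum and sum-of-squares.
inline void welford_update(WelfordState& s, float x)
{
    s.count += 1;
    const float delta = x - s.mean;
    s.mean += delta / static_cast<float>(s.count);
    s.m2 += delta * (x - s.mean);
}

// Population variance (divide by N, not N - 1), as layernorm uses.
inline float welford_variance(const WelfordState& s)
{
    return s.count > 0 ? s.m2 / static_cast<float>(s.count) : 0.f;
}

// Per-element normalization matching the kernel's final step:
// y = (x - mean) / sqrt(var + epsilon) * gamma + beta.
inline float layernorm_one(float x, const WelfordState& s, float eps, float gamma, float beta)
{
    return (x - s.mean) / std::sqrt(welford_variance(s) + eps) * gamma + beta;
}

The block-level version in the deleted code merges such per-thread states across a thread cluster; the same merge rule (combining counts, means, and M2 terms) is what BlockwiseWelford::Run performs before the normalization pass.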