Gemm multiple d multiple r (#335)

* Imitate XXX_gemm_multiple_d, add XXX_gemm_multiple_d_multiple_r for gemm + reduction

* Implement run of kernel

* Add example

* Fix parameter of typo

* Rewrite the reduceMax example

* Rewrite the reduceMean + reduceMeanSquare example

* Refine naming

* Refine folder name

* refine naming

* Rewrite the gemm + bias + relu + add + layernorm example

* Rewrite the gemm + layernorm example

* clang-format

* Fix bug if sync lds

* Fix compile error
This commit is contained in:
rocking5566
2022-08-13 14:07:12 +08:00
committed by GitHub
parent cac014f173
commit 6c3c06bf1f
14 changed files with 2940 additions and 902 deletions

View File

@@ -9,7 +9,7 @@
#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
#include "ck/tensor_operation/gpu/device/device_gemm_bias_add_reduce_xdl_cshuffle.hpp"
#include "ck/tensor_operation/gpu/device/device_gemm_multiple_d_multiple_r_xdl_cshuffle.hpp"
#include "ck/tensor_operation/gpu/device/device_5ary_elementwise.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
@@ -28,57 +28,64 @@ using F32 = float;
using Row = ck::tensor_layout::gemm::RowMajor;
using Col = ck::tensor_layout::gemm::ColumnMajor;
// DataType
using ADataType = F16;
using BDataType = F16;
using CDataType = F16;
using BiasDataType = F32;
using D0DataType = F16;
using GemmAccDataType = F32;
using CShuffleDataType = F32;
using D0DataType = F16;
using D1DataType = F16;
using DsDataType = ck::Tuple<D0DataType, D1DataType>;
using EDataType = F16;
using ReduceAccDataType = F32;
using ReduceDataType = F32;
using ReducePtrsGlobal = ck::Tuple<ReduceDataType*, ReduceDataType*>;
using R0DataType = F32;
using R1DataType = F32;
using RsDataType = ck::Tuple<R0DataType, R1DataType>;
using GammaDataType = F16;
using BetaDataType = F16;
using LayerNormOutDataType = F16;
using NormalizeComputeDataType = F32;
using ALayout = ck::tensor_layout::gemm::RowMajor;
using BLayout = ck::tensor_layout::gemm::ColumnMajor;
using CLayout = ck::tensor_layout::gemm::RowMajor;
// Layout
using ALayout = Row;
using BLayout = Col;
using D1Layout = Row;
using ELayout = D1Layout;
using PassThrough = ck::tensor_operation::element_wise::PassThrough;
using AElementOp = PassThrough;
using BElementOp = PassThrough;
using CElementOp = ck::tensor_operation::element_wise::Relu;
using D0ElementOp = PassThrough;
using ReduceSumOp = ck::reduce::Add;
using ReduceOps = ck::Tuple<ReduceSumOp, ReduceSumOp>;
// Elementwise op
using PassThrough = ck::tensor_operation::element_wise::PassThrough;
using AddReluAdd = ck::tensor_operation::element_wise::AddReluAdd;
using Square = ck::tensor_operation::element_wise::UnarySquare;
using Div = ck::tensor_operation::element_wise::UnaryDivide;
using AElementOp = PassThrough;
using BElementOp = PassThrough;
using CDEElementOp = AddReluAdd;
using QsElementOp = ck::Tuple<PassThrough, Square>;
using RsElementOp = ck::Tuple<Div, Div>;
using UnaryIdenticElementOp = ck::tensor_operation::element_wise::PassThrough;
using UnaryDivElementOp = ck::tensor_operation::element_wise::UnaryDivide;
using UnarySquareElementOp = ck::tensor_operation::element_wise::UnarySquare;
using ReduceInElementOps = ck::Tuple<UnaryIdenticElementOp, UnarySquareElementOp>;
using ReduceOutElementOps = ck::Tuple<UnaryDivElementOp, UnaryDivElementOp>;
// ReduceOp
using R0ThreadReduceOp = ck::reduce::Add;
using R1ThreadReduceOp = ck::reduce::Add;
using RsThreadReduceOp = ck::Tuple<R0ThreadReduceOp, R1ThreadReduceOp>;
using ReduceGlobalMemOps =
ck::InMemoryDataOperationEnumSequence<ck::InMemoryDataOperationEnum::AtomicAdd,
ck::InMemoryDataOperationEnum::AtomicAdd>;
static constexpr auto R0GlobalReduceOp = ck::InMemoryDataOperationEnum::AtomicAdd;
static constexpr auto R1GlobalReduceOp = ck::InMemoryDataOperationEnum::AtomicAdd;
using RsGlobalReduceOp = ck::InMemoryDataOperationEnumSequence<R0GlobalReduceOp, R1GlobalReduceOp>;
static constexpr auto GemmSpecialization =
ck::tensor_operation::device::GemmSpecialization::Default;
static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization::Default;
// clang-format off
using DeviceGemmBiasAddReduceInstance = ck::tensor_operation::device::DeviceGemmBiasAddReduce_Xdl_CShuffle
//######| ALayout| BLayout| CLayout|AData| BData| CData|C0Data|C1Data| GemmAcc| CShuffle| ReduceAcc| ReduceData| A| B| C| C1| Reduce| ReduceInEleOp| ReduceAccEleOp| Reduce| GEMM| NumGemmK| Block| MPer| NPer| KPer| AK1| BK1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer| CReduce| CReduceThreadLds2VGprCopy| CReduceThreadVgpr2GlobalCopy|
//######| | | | Type| Type| Type| Type| Type| DataType| DataType| DataType| Type Tuple| Elementwise| Elementwise| Elementwise| Elementwise| Operation| | | MemoryData| Spacialization| Prefetch| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| ExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| ExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MPerBlock| ScalarPerVector| ThreadClusterLengths| SrcDstScalarPerVector| SrcDstScalarPerVector|
//######| | | | | | | | | | | | | Operation| Operation| Operation| Operation| | | | Operation| | Stage| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NPerBlock| _NPerBlock| _MPerBlock_NPerBlock| _NPerBlock| _MPerBlock|
//######| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | |
< Row, Col, Row, F16, F16, F16, F32, F16, F32, F32, F32, ReducePtrsGlobal, AElementOp, BElementOp, CElementOp, D0ElementOp, ReduceOps,ReduceInElementOps, ReduceOutElementOps, ReduceGlobalMemOps, GemmSpecialization, 1, 256, 256, 128, 32, 8, 8, 32, 32, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8, S<64, 4>, 4, 1>;
using DeviceOpInstance = ck::tensor_operation::device::DeviceGemmMultipleDMultipleR_Xdl_CShuffle
//######| ALayout| BLayout| ELayout| AData| BData| GemmAccData| CShuffle| DsData| EData| ReduceAccData| RsData| A| B| CDE| Qs| Rs| Thread| Global| GEMM| NumGemmK| Block| MPer| NPer| KPer| AK1| BK1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CDRThreadTransfer| CDE| RThreadTransfer|
//######| | | | Type| Type| Type| DataType| Type| Type| Type| Type| Elementwise| Elementwise| Elementwise| Elementwise| Elementwise| Reduce| Reduce| Spacialization| Prefetch| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| ClusterLengths| ReduceThreadTransfer| DstScalarPerVector|
//######| | | | | | | | | | | | Operation| Operation| Operation| Operation| Operation| Operation| Operation| | Stage| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _MPerBlock_NPerBlock| ScalarPerVector| _MPerBlock|
//######| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | _NPerBlock| |
< ALayout, BLayout, ELayout, ADataType, BDataType, GemmAccDataType, CShuffleDataType, DsDataType, EDataType, ReduceAccDataType, RsDataType, AElementOp, BElementOp, CDEElementOp, QsElementOp, RsElementOp, RsThreadReduceOp, RsGlobalReduceOp, GemmDefault, 1, 256, 256, 128, 32, 8, 8, 32, 32, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<64, 4>, 4, 1>;
// clang-format on
using ReferenceGemmInstance = ck::tensor_operation::host::ReferenceGemm<ADataType,
BDataType,
CDataType,
EDataType,
GemmAccDataType,
AElementOp,
BElementOp,
@@ -88,9 +95,9 @@ using NormalizeFunctor = ck::tensor_operation::element_wise::Normalize;
// A:x, B:E[x], C:E[x^2], D:Gamma, E:Beta , F:y
using DeviceNormalizeInstance =
ck::tensor_operation::device::Device5AryElementwise<CDataType,
ReduceDataType,
ReduceDataType,
ck::tensor_operation::device::Device5AryElementwise<EDataType,
R0DataType,
R1DataType,
GammaDataType,
BetaDataType,
LayerNormOutDataType,
@@ -124,41 +131,31 @@ auto f_host_tensor_descriptor2d =
}
};
template <typename CDataType,
typename ReduceDataType,
typename AccDataType,
typename BiasDataType,
typename D0DataType,
typename A_functor,
typename B_functor,
typename C_functor,
typename C1_functor>
void host_gemm_layernorm(Tensor<LayerNormOutDataType>& out_m_n,
const Tensor<ADataType>& a_m_k,
const Tensor<ADataType>& b_k_n,
const Tensor<BiasDataType>& bias_n,
const Tensor<D0DataType>& c1_m_n,
const Tensor<BDataType>& b_k_n,
const Tensor<D0DataType>& bias_n,
const Tensor<D1DataType>& d1_m_n,
const Tensor<GammaDataType>& gamma_n,
const Tensor<GammaDataType>& beta_n,
A_functor a_element_op,
B_functor b_element_op,
C_functor c_element_op,
C1_functor c1_element_op,
const Tensor<BetaDataType>& beta_n,
AElementOp a_element_op,
BElementOp b_element_op,
CDEElementOp cde_element_op,
int M,
int N)
{
int StrideC = N;
Tensor<CDataType> c_m_n(f_host_tensor_descriptor2d(M, N, StrideC, CLayout{}));
Tensor<ReduceDataType> mean_m(f_host_tensor_descriptor1d(M, 1));
Tensor<ReduceDataType> meanSquare_m(f_host_tensor_descriptor1d(M, 1));
auto averageOpInst = UnaryDivElementOp{N};
int StrideE = N;
Tensor<EDataType> e_m_n(f_host_tensor_descriptor2d(M, N, StrideE, ELayout{}));
Tensor<R0DataType> mean_m(f_host_tensor_descriptor1d(M, 1));
Tensor<R1DataType> meanSquare_m(f_host_tensor_descriptor1d(M, 1));
auto averageOpInst = Div{N};
auto ref_gemm = ReferenceGemmInstance{};
auto ref_invoker = ref_gemm.MakeInvoker();
auto ref_argument =
ref_gemm.MakeArgument(a_m_k, b_k_n, c_m_n, a_element_op, b_element_op, PassThrough{});
ref_gemm.MakeArgument(a_m_k, b_k_n, e_m_n, a_element_op, b_element_op, PassThrough{});
ref_invoker.Run(ref_argument);
@@ -166,38 +163,32 @@ void host_gemm_layernorm(Tensor<LayerNormOutDataType>& out_m_n,
for(int m = 0; m < M; ++m)
for(int n = 0; n < N; ++n)
{
AccDataType acc = ck::type_convert<AccDataType>(c_m_n(m, n)) +
ck::type_convert<AccDataType>(bias_n(n));
AccDataType c1 = ck::type_convert<AccDataType>(c1_m_n(m, n));
c_element_op(acc, acc);
c1_element_op(c1, c1);
acc += c1;
c_m_n(m, n) = ck::type_convert<CDataType>(acc);
auto acc = ck::type_convert<GemmAccDataType>(e_m_n(m, n));
cde_element_op(e_m_n(m, n), acc, bias_n(n), d1_m_n(m, n));
}
// reduce_mean and reduce_square_mean
auto reduceSumOpInst = ReduceSumOp{};
auto r0Op = R0ThreadReduceOp{};
auto r1Op = R1ThreadReduceOp{};
for(int m = 0; m < M; ++m)
{
auto mean_acc = reduceSumOpInst.GetIdentityValue<AccDataType>();
auto square_mean_acc = reduceSumOpInst.GetIdentityValue<AccDataType>();
auto mean_acc = r0Op.GetIdentityValue<ReduceAccDataType>();
auto mean_square_acc = r1Op.GetIdentityValue<ReduceAccDataType>();
for(int n = 0; n < N; ++n)
{
AccDataType c_val = ck::type_convert<AccDataType>(c_m_n(m, n));
AccDataType square_c_val = 0;
UnarySquareElementOp{}(square_c_val, c_val);
auto e_val = ck::type_convert<ReduceAccDataType>(e_m_n(m, n));
ReduceAccDataType square_e_val = 0;
Square{}(square_e_val, e_val);
reduceSumOpInst(mean_acc, c_val);
reduceSumOpInst(square_mean_acc, square_c_val);
r0Op(mean_acc, e_val);
r1Op(mean_square_acc, square_e_val);
}
averageOpInst(mean_acc, mean_acc);
averageOpInst(square_mean_acc, square_mean_acc);
mean_m(m) = ck::type_convert<ReduceDataType>(mean_acc);
meanSquare_m(m) = ck::type_convert<ReduceDataType>(square_mean_acc);
averageOpInst(mean_square_acc, mean_square_acc);
mean_m(m) = ck::type_convert<R0DataType>(mean_acc);
meanSquare_m(m) = ck::type_convert<R1DataType>(mean_square_acc);
}
// LayerNorm
@@ -206,24 +197,25 @@ void host_gemm_layernorm(Tensor<LayerNormOutDataType>& out_m_n,
{
for(int n = 0; n < N; ++n)
{
AccDataType out_acc = 0;
NormalizeComputeDataType out_acc = 0;
layerNormInst(out_acc,
ck::type_convert<AccDataType>(c_m_n(m, n)),
ck::type_convert<AccDataType>(mean_m(m)),
ck::type_convert<AccDataType>(meanSquare_m(m)),
ck::type_convert<AccDataType>(gamma_n(n)),
ck::type_convert<AccDataType>(beta_n(n)));
out_m_n(m, n) = ck::type_convert<ReduceDataType>(out_acc);
ck::type_convert<NormalizeComputeDataType>(e_m_n(m, n)),
ck::type_convert<NormalizeComputeDataType>(mean_m(m)),
ck::type_convert<NormalizeComputeDataType>(meanSquare_m(m)),
ck::type_convert<NormalizeComputeDataType>(gamma_n(n)),
ck::type_convert<NormalizeComputeDataType>(beta_n(n)));
out_m_n(m, n) = ck::type_convert<LayerNormOutDataType>(out_acc);
}
}
}
template <typename ADataType,
typename BDataType,
typename CDataType,
typename BiasDataType,
typename EDataType,
typename D0DataType,
typename ReduceDataType,
typename D1DataType,
typename R0DataType,
typename R1DataType,
typename GammaDataType,
typename BetaDataType,
typename NormalizeDataType>
@@ -231,12 +223,12 @@ void DumpGemmLayerNormPerf(float gemm_reduce_time, float normalize_time, int M,
{
std::size_t gemm_flop = std::size_t(2) * M * N * K + std::size_t(2) * M * N;
std::size_t gemm_num_byte = sizeof(ADataType) * M * K + sizeof(BDataType) * K * N +
sizeof(CDataType) * M * N + sizeof(BiasDataType) * M * N +
sizeof(D0DataType) * M * N + sizeof(ReduceDataType) * M +
sizeof(ReduceDataType) * M;
sizeof(EDataType) * M * N + sizeof(D0DataType) * M * N +
sizeof(D0DataType) * M * N + sizeof(R0DataType) * M +
sizeof(R1DataType) * M;
std::size_t normalize_num_byte = sizeof(CDataType) * M * N + sizeof(ReduceDataType) * M +
sizeof(ReduceDataType) * M + sizeof(GammaDataType) * N +
std::size_t normalize_num_byte = sizeof(EDataType) * M * N + sizeof(R0DataType) * M +
sizeof(R1DataType) * M + sizeof(GammaDataType) * N +
sizeof(BetaDataType) * N + sizeof(NormalizeDataType) * M * N;
float tflops = static_cast<float>(gemm_flop) / 1.E9 / gemm_reduce_time;
@@ -259,37 +251,37 @@ int main()
ck::index_t StrideA = 1024;
ck::index_t StrideB = 1024;
ck::index_t StrideC = 1024;
ck::index_t StrideD0 = 1024;
ck::index_t StrideD0 = 0;
ck::index_t StrideD1 = 1024;
ck::index_t StrideE = 1024;
Tensor<ADataType> a_m_k(f_host_tensor_descriptor2d(M, K, StrideA, ALayout{}));
Tensor<BDataType> b_k_n(f_host_tensor_descriptor2d(K, N, StrideB, BLayout{}));
Tensor<CDataType> c_m_n(f_host_tensor_descriptor2d(M, N, StrideC, CLayout{}));
Tensor<BiasDataType> bias_n(f_host_tensor_descriptor1d(N, 1));
Tensor<D0DataType> c1_m_n(f_host_tensor_descriptor2d(M, N, StrideC, CLayout{}));
Tensor<ReduceDataType> reduceMean_m(f_host_tensor_descriptor1d(M, 1));
Tensor<ReduceDataType> reduceMeanSquare_m(f_host_tensor_descriptor1d(M, 1));
Tensor<D0DataType> bias_n(f_host_tensor_descriptor1d(N, 1));
Tensor<D1DataType> d1_m_n(f_host_tensor_descriptor2d(M, N, StrideD1, ELayout{}));
Tensor<EDataType> e_m_n(f_host_tensor_descriptor2d(M, N, StrideE, ELayout{}));
Tensor<R0DataType> r0_Mean_m(f_host_tensor_descriptor1d(M, 1));
Tensor<R1DataType> r1_MeanSquare_m(f_host_tensor_descriptor1d(M, 1));
Tensor<GammaDataType> gamma_n(f_host_tensor_descriptor1d(N, 1));
Tensor<BetaDataType> beta_n(f_host_tensor_descriptor1d(N, 1));
Tensor<LayerNormOutDataType> layerNorm_m_n(
f_host_tensor_descriptor2d(M, N, StrideC, CLayout{}));
f_host_tensor_descriptor2d(M, N, StrideE, ELayout{}));
a_m_k.GenerateTensorValue(GeneratorTensor_3<ADataType>{-1, 1});
b_k_n.GenerateTensorValue(GeneratorTensor_3<BDataType>{-1, 1});
bias_n.GenerateTensorValue(GeneratorTensor_3<BiasDataType>{-1, 1});
c1_m_n.GenerateTensorValue(GeneratorTensor_3<D0DataType>{-5, 5});
bias_n.GenerateTensorValue(GeneratorTensor_3<D0DataType>{-1, 1});
d1_m_n.GenerateTensorValue(GeneratorTensor_3<D1DataType>{-5, 5});
gamma_n.GenerateTensorValue(GeneratorTensor_3<GammaDataType>{-1, 1});
beta_n.GenerateTensorValue(GeneratorTensor_3<BetaDataType>{-1, 1});
DeviceMem a_device_buf(sizeof(ADataType) * a_m_k.mDesc.GetElementSpaceSize());
DeviceMem b_device_buf(sizeof(BDataType) * b_k_n.mDesc.GetElementSpaceSize());
DeviceMem c_device_buf(sizeof(CDataType) * c_m_n.mDesc.GetElementSpaceSize());
DeviceMem bias_device_buf(sizeof(BiasDataType) * bias_n.mDesc.GetElementSpaceSize());
DeviceMem d0_device_buf(sizeof(D0DataType) * c1_m_n.mDesc.GetElementSpaceSize());
DeviceMem reduceMean_device_buf(sizeof(ReduceDataType) *
reduceMean_m.mDesc.GetElementSpaceSize());
DeviceMem reduceMeanSquare_device_buf(sizeof(ReduceDataType) *
reduceMeanSquare_m.mDesc.GetElementSpaceSize());
DeviceMem bias_device_buf(sizeof(D0DataType) * bias_n.mDesc.GetElementSpaceSize());
DeviceMem d1_device_buf(sizeof(D1DataType) * d1_m_n.mDesc.GetElementSpaceSize());
DeviceMem e_device_buf(sizeof(EDataType) * e_m_n.mDesc.GetElementSpaceSize());
DeviceMem r0_Mean_device_buf(sizeof(R0DataType) * r0_Mean_m.mDesc.GetElementSpaceSize());
DeviceMem r1_MeanSquare_device_buf(sizeof(R1DataType) *
r1_MeanSquare_m.mDesc.GetElementSpaceSize());
DeviceMem gamma_device_buf(sizeof(GammaDataType) * gamma_n.mDesc.GetElementSpaceSize());
DeviceMem beta_device_buf(sizeof(BetaDataType) * beta_n.mDesc.GetElementSpaceSize());
DeviceMem layerNorm_device_buf(sizeof(LayerNormOutDataType) *
@@ -298,60 +290,51 @@ int main()
a_device_buf.ToDevice(a_m_k.mData.data());
b_device_buf.ToDevice(b_k_n.mData.data());
bias_device_buf.ToDevice(bias_n.mData.data());
d0_device_buf.ToDevice(c1_m_n.mData.data());
d1_device_buf.ToDevice(d1_m_n.mData.data());
gamma_device_buf.ToDevice(gamma_n.mData.data());
beta_device_buf.ToDevice(beta_n.mData.data());
auto a_element_op = AElementOp{};
auto b_element_op = BElementOp{};
auto c_element_op = CElementOp{};
auto d_element_op = D0ElementOp{};
std::array<void*, 3> gemm_element_ops = {&a_element_op, &b_element_op, &c_element_op};
auto a_element_op = AElementOp{};
auto b_element_op = BElementOp{};
auto cde_element_op = CDEElementOp{};
auto qs_element_op = QsElementOp{};
auto rs_element_op = RsElementOp{N, N};
auto passthrough = UnaryIdenticElementOp{};
auto square = UnarySquareElementOp{};
auto div = UnaryDivElementOp{N};
std::array<void*, 2> reduce_in_element_ops = {&passthrough, &square};
std::array<void*, 2> reduce_out_element_ops = {&div, &div};
std::array<void*, 2> p_reduces = {reduceMean_device_buf.GetDeviceBuffer(),
reduceMeanSquare_device_buf.GetDeviceBuffer()};
// Prepare GEMM, reduce_mean, reduce_mean_square
auto gemmReduce = DeviceGemmBiasAddReduceInstance{};
// Prepare GEMM, mean, mean_square
auto gemmReduce = DeviceOpInstance{};
auto gemmReduce_invoker = gemmReduce.MakeInvoker();
auto gemmReduce_argument = gemmReduce.MakeArgument(a_device_buf.GetDeviceBuffer(),
b_device_buf.GetDeviceBuffer(),
bias_device_buf.GetDeviceBuffer(),
{d0_device_buf.GetDeviceBuffer()},
c_device_buf.GetDeviceBuffer(),
p_reduces,
M,
N,
K,
StrideA,
StrideB,
StrideC,
{StrideD0},
gemm_element_ops,
{&d_element_op},
reduce_in_element_ops,
reduce_out_element_ops);
auto gemmReduce_argument = gemmReduce.MakeArgument(
a_device_buf.GetDeviceBuffer(),
b_device_buf.GetDeviceBuffer(),
{bias_device_buf.GetDeviceBuffer(), d1_device_buf.GetDeviceBuffer()},
e_device_buf.GetDeviceBuffer(),
{r0_Mean_device_buf.GetDeviceBuffer(), r1_MeanSquare_device_buf.GetDeviceBuffer()},
M,
N,
K,
StrideA,
StrideB,
{StrideD0, StrideD1},
StrideE,
a_element_op,
b_element_op,
cde_element_op,
qs_element_op,
rs_element_op);
if(!gemmReduce.IsSupportedArgument(gemmReduce_argument))
{
throw std::runtime_error(
"wrong! device_gemm with the specified compilation parameters does "
"not support this GEMM problem");
throw std::runtime_error("wrong! this device_op instance does not support this problem");
}
reduceMean_device_buf.SetZero();
reduceMeanSquare_device_buf.SetZero();
// init reducetion buffer to 0
r0_Mean_device_buf.SetZero();
r1_MeanSquare_device_buf.SetZero();
// Prepare LayerNorm
std::array<const void*, 5> input = {c_device_buf.GetDeviceBuffer(),
reduceMean_device_buf.GetDeviceBuffer(),
reduceMeanSquare_device_buf.GetDeviceBuffer(),
std::array<const void*, 5> input = {e_device_buf.GetDeviceBuffer(),
r0_Mean_device_buf.GetDeviceBuffer(),
r1_MeanSquare_device_buf.GetDeviceBuffer(),
gamma_device_buf.GetDeviceBuffer(),
beta_device_buf.GetDeviceBuffer()};
std::array<void*, 1> output = {layerNorm_device_buf.GetDeviceBuffer()};
@@ -361,12 +344,12 @@ int main()
auto normalize_argument = normalize.MakeArgument(input,
output,
{M, N},
{StrideC, 1},
{StrideE, 1},
{1, 0},
{1, 0},
{0, 1},
{0, 1},
{StrideC, 1},
{StrideE, 1},
NormalizeFunctor{});
if(!normalize.IsSupportedArgument(normalize_argument))
@@ -383,21 +366,20 @@ int main()
{
// verification
Tensor<LayerNormOutDataType> host_layerNorm_m_n(
f_host_tensor_descriptor2d(M, N, StrideC, CLayout{}));
f_host_tensor_descriptor2d(M, N, StrideE, ELayout{}));
host_gemm_layernorm<CDataType, ReduceDataType, ReduceAccDataType>(host_layerNorm_m_n,
a_m_k,
b_k_n,
bias_n,
c1_m_n,
gamma_n,
beta_n,
a_element_op,
b_element_op,
c_element_op,
d_element_op,
M,
N);
host_gemm_layernorm(host_layerNorm_m_n,
a_m_k,
b_k_n,
bias_n,
d1_m_n,
gamma_n,
beta_n,
a_element_op,
b_element_op,
cde_element_op,
M,
N);
layerNorm_device_buf.FromDevice(layerNorm_m_n.mData.data());
pass &= ck::utils::check_err(layerNorm_m_n.mData,
@@ -419,10 +401,11 @@ int main()
if(time_kernel)
DumpGemmLayerNormPerf<ADataType,
BDataType,
CDataType,
BiasDataType,
EDataType,
D0DataType,
ReduceDataType,
D1DataType,
R0DataType,
R1DataType,
GammaDataType,
BetaDataType,
LayerNormOutDataType>(

View File

@@ -9,7 +9,7 @@
#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
#include "ck/tensor_operation/gpu/device/device_gemm_reduce_xdl_cshuffle.hpp"
#include "ck/tensor_operation/gpu/device/device_gemm_multiple_d_multiple_r_xdl_cshuffle.hpp"
#include "ck/tensor_operation/gpu/device/device_5ary_elementwise.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
@@ -28,65 +28,73 @@ using F32 = float;
using Row = ck::tensor_layout::gemm::RowMajor;
using Col = ck::tensor_layout::gemm::ColumnMajor;
// DataType
using ADataType = F16;
using BDataType = F16;
using CDataType = F16;
using GemmAccDataType = F32;
using CShuffleDataType = F32;
using DsDataType = ck::Tuple<>;
using EDataType = F16;
using ReduceAccDataType = F32;
using ReduceDataType = F32;
using ReducePtrsGlobal = ck::Tuple<ReduceDataType*, ReduceDataType*>;
using R0DataType = F32;
using R1DataType = F32;
using RsDataType = ck::Tuple<R0DataType, R1DataType>;
using GammaDataType = F16;
using BetaDataType = F16;
using LayerNormOutDataType = F16;
using NormalizeComputeDataType = F32;
using ALayout = ck::tensor_layout::gemm::RowMajor;
using BLayout = ck::tensor_layout::gemm::ColumnMajor;
using CLayout = ck::tensor_layout::gemm::RowMajor;
// Layout
using ALayout = Row;
using BLayout = Col;
using D1Layout = Row;
using ELayout = D1Layout;
using AElementOp = ck::tensor_operation::element_wise::PassThrough;
using BElementOp = ck::tensor_operation::element_wise::PassThrough;
using CElementOp = ck::tensor_operation::element_wise::PassThrough;
using ReduceSumOp = ck::reduce::Add;
using ReduceOps = ck::Tuple<ReduceSumOp, ReduceSumOp>;
// Elementwise op
using PassThrough = ck::tensor_operation::element_wise::PassThrough;
using Square = ck::tensor_operation::element_wise::UnarySquare;
using Div = ck::tensor_operation::element_wise::UnaryDivide;
using AElementOp = PassThrough;
using BElementOp = PassThrough;
using CDEElementOp = PassThrough;
using QsElementOp = ck::Tuple<PassThrough, Square>;
using RsElementOp = ck::Tuple<Div, Div>;
using UnaryIdenticElementOp = ck::tensor_operation::element_wise::PassThrough;
using UnaryDivElementOp = ck::tensor_operation::element_wise::UnaryDivide;
using UnarySquareElementOp = ck::tensor_operation::element_wise::UnarySquare;
using ReduceInElementOps = ck::Tuple<UnaryIdenticElementOp, UnarySquareElementOp>;
using ReduceOutElementOps = ck::Tuple<UnaryDivElementOp, UnaryDivElementOp>;
// ReduceOp
using R0ThreadReduceOp = ck::reduce::Add;
using R1ThreadReduceOp = ck::reduce::Add;
using RsThreadReduceOp = ck::Tuple<R0ThreadReduceOp, R1ThreadReduceOp>;
using ReduceGlobalMemOps =
ck::InMemoryDataOperationEnumSequence<ck::InMemoryDataOperationEnum::AtomicAdd,
ck::InMemoryDataOperationEnum::AtomicAdd>;
static constexpr auto R0GlobalReduceOp = ck::InMemoryDataOperationEnum::AtomicAdd;
static constexpr auto R1GlobalReduceOp = ck::InMemoryDataOperationEnum::AtomicAdd;
using RsGlobalReduceOp = ck::InMemoryDataOperationEnumSequence<R0GlobalReduceOp, R1GlobalReduceOp>;
static constexpr auto GemmSpecialization =
ck::tensor_operation::device::GemmSpecialization::Default;
static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization::Default;
// clang-format off
using DeviceGemmReduceInstance = ck::tensor_operation::device::DeviceGemmReduce_Xdl_CShuffle
//######| ALayout| BLayout| CLayout|AData| BData| CData| GemmAcc| CShuffle| ReduceAcc| ReduceData| A| B| C| Reduce| ReduceInEleOp| ReduceAccEleOp| Reduce| GEMM| NumGemmK| Block| MPer| NPer| KPer| AK1| BK1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer| CReduce| CReduceThreadLds2VGprCopy| CReduceThreadVgpr2GlobalCopy|
//######| | | | Type| Type| Type| DataType| DataType| DataType| Type Tuple| Elementwise| Elementwise| Elementwise| Operation| | | MemoryData| Spacialization| Prefetch| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| ExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| ExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MPerBlock| ScalarPerVector| ThreadClusterLengths| SrcDstScalarPerVector| SrcDstScalarPerVector|
//######| | | | | | | | | | | Operation| Operation| Operation| | | | Operation| | Stage| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NPerBlock| _NPerBlock| _MPerBlock_NPerBlock| _NPerBlock| _MPerBlock|
//######| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | |
< Row, Col, Row, F16, F16, F16, F32, F32, F32, ReducePtrsGlobal, AElementOp, BElementOp, CElementOp, ReduceOps,ReduceInElementOps, ReduceOutElementOps, ReduceGlobalMemOps, GemmSpecialization, 1, 256, 256, 128, 32, 8, 8, 32, 32, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8, S<64, 4>, 4, 1>;
using DeviceOpInstance = ck::tensor_operation::device::DeviceGemmMultipleDMultipleR_Xdl_CShuffle
//######| ALayout| BLayout| ELayout| AData| BData| GemmAccData| CShuffle| DsData| EData| ReduceAccData| RsData| A| B| CDE| Qs| Rs| Thread| Global| GEMM| NumGemmK| Block| MPer| NPer| KPer| AK1| BK1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CDRThreadTransfer| CDE| RThreadTransfer|
//######| | | | Type| Type| Type| DataType| Type| Type| Type| Type| Elementwise| Elementwise| Elementwise| Elementwise| Elementwise| Reduce| Reduce| Spacialization| Prefetch| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| ClusterLengths| ReduceThreadTransfer| DstScalarPerVector|
//######| | | | | | | | | | | | Operation| Operation| Operation| Operation| Operation| Operation| Operation| | Stage| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _MPerBlock_NPerBlock| ScalarPerVector| _MPerBlock|
//######| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | _NPerBlock| |
< ALayout, BLayout, ELayout, ADataType, BDataType, GemmAccDataType, CShuffleDataType, DsDataType, EDataType, ReduceAccDataType, RsDataType, AElementOp, BElementOp, CDEElementOp, QsElementOp, RsElementOp, RsThreadReduceOp, RsGlobalReduceOp, GemmDefault, 1, 256, 256, 128, 32, 8, 8, 32, 32, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<64, 4>, 4, 1>;
// clang-format on
using ReferenceGemmInstance = ck::tensor_operation::host::ReferenceGemm<ADataType,
BDataType,
CDataType,
EDataType,
GemmAccDataType,
AElementOp,
BElementOp,
CElementOp>;
PassThrough>;
using NormalizeFunctor = ck::tensor_operation::element_wise::Normalize;
// A:x, B:E[x], C:E[x^2], D:Gamma, E:Beta , F:y
using DeviceNormalizeInstance =
ck::tensor_operation::device::Device5AryElementwise<CDataType,
ReduceDataType,
ReduceDataType,
ck::tensor_operation::device::Device5AryElementwise<EDataType,
R0DataType,
R1DataType,
GammaDataType,
BetaDataType,
LayerNormOutDataType,
@@ -120,60 +128,54 @@ auto f_host_tensor_descriptor2d =
}
};
template <typename CDataType,
typename ReduceDataType,
typename A_functor,
typename B_functor,
typename C_functor>
void host_gemm_layernorm(Tensor<LayerNormOutDataType>& out_m_n,
const Tensor<ADataType>& a_m_k,
const Tensor<ADataType>& b_k_n,
const Tensor<BDataType>& b_k_n,
const Tensor<GammaDataType>& gamma_n,
const Tensor<BetaDataType>& beta_n,
A_functor a_element_op,
B_functor b_element_op,
C_functor c_element_op,
AElementOp a_element_op,
BElementOp b_element_op,
CDEElementOp c_element_op,
int M,
int N)
{
using out_type = ck::remove_reference_t<decltype(out_m_n(0, 0))>;
int StrideC = N;
Tensor<CDataType> c_m_n(f_host_tensor_descriptor2d(M, N, StrideC, CLayout{}));
Tensor<ReduceDataType> mean_m(f_host_tensor_descriptor1d(M, 1));
Tensor<ReduceDataType> meanSquare_m(f_host_tensor_descriptor1d(M, 1));
auto averageOpInst = UnaryDivElementOp{N};
int StrideE = N;
Tensor<EDataType> e_m_n(f_host_tensor_descriptor2d(M, N, StrideE, ELayout{}));
Tensor<R0DataType> mean_m(f_host_tensor_descriptor1d(M, 1));
Tensor<R1DataType> meanSquare_m(f_host_tensor_descriptor1d(M, 1));
auto averageOpInst = Div{N};
auto ref_gemm = ReferenceGemmInstance{};
auto ref_invoker = ref_gemm.MakeInvoker();
auto ref_argument =
ref_gemm.MakeArgument(a_m_k, b_k_n, c_m_n, a_element_op, b_element_op, c_element_op);
ref_gemm.MakeArgument(a_m_k, b_k_n, e_m_n, a_element_op, b_element_op, c_element_op);
ref_invoker.Run(ref_argument);
// reduce_mean and reduce_square_mean
auto reduceSumOpInst = ReduceSumOp{};
auto r0Op = R0ThreadReduceOp{};
auto r1Op = R1ThreadReduceOp{};
for(int m = 0; m < M; ++m)
{
auto mean_acc = reduceSumOpInst.GetIdentityValue<ReduceAccDataType>();
auto square_mean_acc = reduceSumOpInst.GetIdentityValue<ReduceAccDataType>();
auto mean_acc = r0Op.GetIdentityValue<ReduceAccDataType>();
auto mean_square_acc = r1Op.GetIdentityValue<ReduceAccDataType>();
for(int n = 0; n < N; ++n)
{
auto c_val = ck::type_convert<ReduceAccDataType>(c_m_n(m, n));
auto square_c_val = reduceSumOpInst.GetIdentityValue<ReduceAccDataType>();
auto e_val = ck::type_convert<ReduceAccDataType>(e_m_n(m, n));
ReduceAccDataType square_e_val = 0;
Square{}(square_e_val, e_val);
UnarySquareElementOp{}(square_c_val, c_val);
reduceSumOpInst(mean_acc, c_val);
reduceSumOpInst(square_mean_acc, square_c_val);
r0Op(mean_acc, e_val);
r1Op(mean_square_acc, square_e_val);
}
averageOpInst(mean_acc, mean_acc);
averageOpInst(square_mean_acc, square_mean_acc);
mean_m(m) = ck::type_convert<ReduceDataType>(mean_acc);
meanSquare_m(m) = ck::type_convert<ReduceDataType>(square_mean_acc);
averageOpInst(mean_square_acc, mean_square_acc);
mean_m(m) = ck::type_convert<R0DataType>(mean_acc);
meanSquare_m(m) = ck::type_convert<R1DataType>(mean_square_acc);
}
// LayerNorm
@@ -182,22 +184,23 @@ void host_gemm_layernorm(Tensor<LayerNormOutDataType>& out_m_n,
{
for(int n = 0; n < N; ++n)
{
float out_f32 = 0;
layerNormInst(out_f32,
static_cast<float>(c_m_n(m, n)),
static_cast<float>(mean_m(m)),
static_cast<float>(meanSquare_m(m)),
static_cast<float>(gamma_n(n)),
static_cast<float>(beta_n(n)));
out_m_n(m, n) = static_cast<out_type>(out_f32);
NormalizeComputeDataType out_acc = 0;
layerNormInst(out_acc,
ck::type_convert<NormalizeComputeDataType>(e_m_n(m, n)),
ck::type_convert<NormalizeComputeDataType>(mean_m(m)),
ck::type_convert<NormalizeComputeDataType>(meanSquare_m(m)),
ck::type_convert<NormalizeComputeDataType>(gamma_n(n)),
ck::type_convert<NormalizeComputeDataType>(beta_n(n)));
out_m_n(m, n) = ck::type_convert<LayerNormOutDataType>(out_acc);
}
}
}
template <typename ADataType,
typename BDataType,
typename CDataType,
typename ReduceDataType,
typename EDataType,
typename R0DataType,
typename R1DataType,
typename GammaDataType,
typename BetaDataType,
typename NormalizeDataType>
@@ -205,11 +208,11 @@ void DumpGemmLayerNormPerf(float gemm_reduce_time, float normalize_time, int M,
{
std::size_t gemm_flop = std::size_t(2) * M * N * K;
std::size_t gemm_num_byte = sizeof(ADataType) * M * K + sizeof(BDataType) * K * N +
sizeof(CDataType) * M * N + sizeof(ReduceDataType) * M +
sizeof(ReduceDataType) * M;
sizeof(EDataType) * M * N + sizeof(R0DataType) * M +
sizeof(R1DataType) * M;
std::size_t normalize_num_btye = sizeof(CDataType) * M * N + sizeof(ReduceDataType) * M +
sizeof(ReduceDataType) * M + sizeof(GammaDataType) * N +
std::size_t normalize_num_btye = sizeof(EDataType) * M * N + sizeof(R0DataType) * M +
sizeof(R1DataType) * M + sizeof(GammaDataType) * N +
sizeof(BetaDataType) * N + sizeof(NormalizeDataType) * M * N;
float tflops = static_cast<float>(gemm_flop) / 1.E9 / gemm_reduce_time;
@@ -232,17 +235,17 @@ int main()
ck::index_t StrideA = 1024;
ck::index_t StrideB = 1024;
ck::index_t StrideC = 1024;
ck::index_t StrideE = 1024;
Tensor<ADataType> a_m_k(f_host_tensor_descriptor2d(M, K, StrideA, ALayout{}));
Tensor<BDataType> b_k_n(f_host_tensor_descriptor2d(K, N, StrideB, BLayout{}));
Tensor<CDataType> c_m_n(f_host_tensor_descriptor2d(M, N, StrideC, CLayout{}));
Tensor<ReduceDataType> reduceMean_m(f_host_tensor_descriptor1d(M, 1));
Tensor<ReduceDataType> reduceMeanSquare_m(f_host_tensor_descriptor1d(M, 1));
Tensor<EDataType> e_m_n(f_host_tensor_descriptor2d(M, N, StrideE, ELayout{}));
Tensor<R0DataType> r0_Mean_m(f_host_tensor_descriptor1d(M, 1));
Tensor<R1DataType> r1_MeanSquare_m(f_host_tensor_descriptor1d(M, 1));
Tensor<GammaDataType> gamma_n(f_host_tensor_descriptor1d(N, 1));
Tensor<BetaDataType> beta_n(f_host_tensor_descriptor1d(N, 1));
Tensor<LayerNormOutDataType> layerNorm_m_n(
f_host_tensor_descriptor2d(M, N, StrideC, CLayout{}));
f_host_tensor_descriptor2d(M, N, StrideE, ELayout{}));
a_m_k.GenerateTensorValue(GeneratorTensor_3<ADataType>{-1, 1});
b_k_n.GenerateTensorValue(GeneratorTensor_3<BDataType>{-1, 1});
@@ -251,11 +254,10 @@ int main()
DeviceMem a_device_buf(sizeof(ADataType) * a_m_k.mDesc.GetElementSpaceSize());
DeviceMem b_device_buf(sizeof(BDataType) * b_k_n.mDesc.GetElementSpaceSize());
DeviceMem c_device_buf(sizeof(CDataType) * c_m_n.mDesc.GetElementSpaceSize());
DeviceMem reduceMean_device_buf(sizeof(ReduceDataType) *
reduceMean_m.mDesc.GetElementSpaceSize());
DeviceMem reduceMeanSquare_device_buf(sizeof(ReduceDataType) *
reduceMeanSquare_m.mDesc.GetElementSpaceSize());
DeviceMem e_device_buf(sizeof(EDataType) * e_m_n.mDesc.GetElementSpaceSize());
DeviceMem r0_Mean_device_buf(sizeof(R0DataType) * r0_Mean_m.mDesc.GetElementSpaceSize());
DeviceMem r1_MeanSquare_device_buf(sizeof(R1DataType) *
r1_MeanSquare_m.mDesc.GetElementSpaceSize());
DeviceMem gamma_device_buf(sizeof(GammaDataType) * gamma_n.mDesc.GetElementSpaceSize());
DeviceMem beta_device_buf(sizeof(BetaDataType) * beta_n.mDesc.GetElementSpaceSize());
DeviceMem layerNorm_device_buf(sizeof(LayerNormOutDataType) *
@@ -266,40 +268,33 @@ int main()
gamma_device_buf.ToDevice(gamma_n.mData.data());
beta_device_buf.ToDevice(beta_n.mData.data());
auto a_element_op = AElementOp{};
auto b_element_op = BElementOp{};
auto c_element_op = CElementOp{};
std::array<void*, 3> gemm_element_ops = {&a_element_op, &b_element_op, &c_element_op};
auto a_element_op = AElementOp{};
auto b_element_op = BElementOp{};
auto cde_element_op = CDEElementOp{};
auto qs_element_op = QsElementOp{};
auto rs_element_op = RsElementOp{N, N};
auto passthrough = UnaryIdenticElementOp{};
auto square = UnarySquareElementOp{};
auto div = UnaryDivElementOp{N};
std::array<void*, 2> reduce_in_element_ops = {&passthrough, &square};
std::array<void*, 2> reduce_out_element_ops = {&div, &div};
std::array<void*, 2> p_reduces = {reduceMean_device_buf.GetDeviceBuffer(),
reduceMeanSquare_device_buf.GetDeviceBuffer()};
// Prepare GEMM, reduce_mean, reduce_mean_square
auto gemmReduce = DeviceGemmReduceInstance{};
// Prepare GEMM, mean, mean_square
auto gemmReduce = DeviceOpInstance{};
auto gemmReduce_invoker = gemmReduce.MakeInvoker();
auto gemmReduce_argument = gemmReduce.MakeArgument(a_device_buf.GetDeviceBuffer(),
b_device_buf.GetDeviceBuffer(),
nullptr,
{},
c_device_buf.GetDeviceBuffer(),
p_reduces,
M,
N,
K,
StrideA,
StrideB,
StrideC,
{},
gemm_element_ops,
{},
reduce_in_element_ops,
reduce_out_element_ops);
auto gemmReduce_argument = gemmReduce.MakeArgument(
a_device_buf.GetDeviceBuffer(),
b_device_buf.GetDeviceBuffer(),
{},
e_device_buf.GetDeviceBuffer(),
{r0_Mean_device_buf.GetDeviceBuffer(), r1_MeanSquare_device_buf.GetDeviceBuffer()},
M,
N,
K,
StrideA,
StrideB,
{},
StrideE,
a_element_op,
b_element_op,
cde_element_op,
qs_element_op,
rs_element_op);
if(!gemmReduce.IsSupportedArgument(gemmReduce_argument))
{
@@ -308,13 +303,13 @@ int main()
"not support this GEMM problem");
}
reduceMean_device_buf.SetZero();
reduceMeanSquare_device_buf.SetZero();
r0_Mean_device_buf.SetZero();
r1_MeanSquare_device_buf.SetZero();
// Prepare LayerNorm
std::array<const void*, 5> input = {c_device_buf.GetDeviceBuffer(),
reduceMean_device_buf.GetDeviceBuffer(),
reduceMeanSquare_device_buf.GetDeviceBuffer(),
std::array<const void*, 5> input = {e_device_buf.GetDeviceBuffer(),
r0_Mean_device_buf.GetDeviceBuffer(),
r1_MeanSquare_device_buf.GetDeviceBuffer(),
gamma_device_buf.GetDeviceBuffer(),
beta_device_buf.GetDeviceBuffer()};
std::array<void*, 1> output = {layerNorm_device_buf.GetDeviceBuffer()};
@@ -324,12 +319,12 @@ int main()
auto normalize_argument = normalize.MakeArgument(input,
output,
{M, N},
{StrideC, 1},
{StrideE, 1},
{1, 0},
{1, 0},
{0, 1},
{0, 1},
{StrideC, 1},
{StrideE, 1},
NormalizeFunctor{});
if(!normalize.IsSupportedArgument(normalize_argument))
@@ -346,18 +341,18 @@ int main()
{
// verification
Tensor<LayerNormOutDataType> host_layerNorm_m_n(
f_host_tensor_descriptor2d(M, N, StrideC, CLayout{}));
f_host_tensor_descriptor2d(M, N, StrideE, ELayout{}));
host_gemm_layernorm<CDataType, ReduceDataType>(host_layerNorm_m_n,
a_m_k,
b_k_n,
gamma_n,
beta_n,
a_element_op,
b_element_op,
c_element_op,
M,
N);
host_gemm_layernorm(host_layerNorm_m_n,
a_m_k,
b_k_n,
gamma_n,
beta_n,
a_element_op,
b_element_op,
cde_element_op,
M,
N);
layerNorm_device_buf.FromDevice(layerNorm_m_n.mData.data());
pass &= ck::utils::check_err(layerNorm_m_n.mData,
@@ -379,8 +374,9 @@ int main()
if(time_kernel)
DumpGemmLayerNormPerf<ADataType,
BDataType,
CDataType,
ReduceDataType,
EDataType,
R0DataType,
R1DataType,
GammaDataType,
BetaDataType,
LayerNormOutDataType>(