Batchnorm-forward and Batchnorm-infer Implemented using generic kernels (#320)

* Implement multiple-reduction in one kernel (kernels, device ops, examples)

* Add generic elementwise kernel and device interface

* Add generator for normal-distributed data initialization

* Add host refer implementation of batchnorm-forward and batchnorm-infer

* Add examples for implementing batchnorm-forward and batchnorm-infer using generic kernels

* Remove un-needed including in batchnorm example

* Renaming generic_elementwise to elementiwise in kernel and device classes/functions

* Change in gemm_layernorm examples to use DeviceElementwise instead of Device5AryElementwise

* Change in exampe 19_binary_elementwise to use DeviceElementwise instead of DeviceBinaryElementwise

* Change in device_cgemm_4gemm_xdl_cshuffle.hpp to use kernel_elementwise instead of kernel_binary_elementwise

* Add DeviceElementwiseBase and use it in device_normalize_instance.cpp

* Removing and renaming files

* Update to synchronize gemm_layernorm client example to the generic element-wise device op API

* Update to synchronize with the latest headers directory and HostTensorDescriptor interface renaming

* Merge two static member functions in device_elementwise.hpp

* Remove unary_elementwise_1d kernel and device
This commit is contained in:
Qianfeng
2022-08-15 23:11:02 +08:00
committed by GitHub
parent 5ee304595c
commit 53ea4713af
47 changed files with 5195 additions and 1707 deletions

View File

@@ -5,7 +5,7 @@
#include <cstdlib>
#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/device_binary_elementwise.hpp"
#include "ck/tensor_operation/gpu/device/device_elementwise.hpp"
#include "ck/tensor_operation/gpu/element/binary_element_wise_operation.hpp"
#include "ck/library/utility/check_err.hpp"
#include "ck/library/utility/device_memory.hpp"
@@ -15,29 +15,21 @@
using F16 = ck::half_t;
using F32 = float;
using ABDataType = F16;
using CDataType = F16;
using EltwiseComputeDataType = F32;
using ABDataType = F16;
using CDataType = F16;
using Add = ck::tensor_operation::element_wise::Add;
using DeviceElementwiseAddInstance =
ck::tensor_operation::device::DeviceBinaryElementwise<ABDataType,
ABDataType,
CDataType,
EltwiseComputeDataType,
Add,
1,
8,
8,
8,
8>;
ck::tensor_operation::device::DeviceElementwise<ck::Tuple<ABDataType, ABDataType>,
ck::Tuple<CDataType>,
Add,
1,
8,
ck::Sequence<8, 8>,
ck::Sequence<8>>;
template <typename HostTensorA,
typename HostTensorB,
typename HostTensorC,
typename ComputeDataType,
typename Functor>
template <typename HostTensorA, typename HostTensorB, typename HostTensorC, typename Functor>
void host_elementwise1D(
HostTensorC& C, const HostTensorA& A, const HostTensorB& B, int M, Functor functor)
{
@@ -45,11 +37,11 @@ void host_elementwise1D(
for(int m = 0; m < M; ++m)
{
ComputeDataType Am = ck::type_convert<ComputeDataType>(A(m));
ComputeDataType Bm = ck::type_convert<ComputeDataType>(B(m));
ComputeDataType Cm = 0;
auto Am = A(m);
auto Bm = B(m);
ctype Cm = 0;
functor(Cm, Am, Bm);
C(m) = ck::type_convert<ctype>(Cm);
C(m) = Cm;
}
}
@@ -83,18 +75,19 @@ int main()
b_m_device_buf.GetDeviceBuffer()};
std::array<void*, 1> output = {c_m_device_buf.GetDeviceBuffer()};
std::vector<ck::index_t> a_strides = {1};
std::vector<ck::index_t> b_strides = {1};
std::vector<ck::index_t> c_strides = {1};
std::array<ck::index_t, 1> abc_lengths = {M};
std::array<ck::index_t, 1> a_strides = {1};
std::array<ck::index_t, 1> b_strides = {1};
std::array<ck::index_t, 1> c_strides = {1};
auto broadcastAdd = DeviceElementwiseAddInstance{};
auto argument = broadcastAdd.MakeArgumentPointer(
input, output, {M}, {{a_strides}, b_strides}, {c_strides}, Add{});
abc_lengths, {a_strides, b_strides}, {c_strides}, input, output, Add{});
if(!broadcastAdd.IsSupportedArgument(argument.get()))
{
throw std::runtime_error("The runtime parameters seems not supported by the "
"DeviceBinaryElementwise instance, exiting!");
throw std::runtime_error(
"The runtime parameters seems not supported by the device instance, exiting!");
};
auto broadcastAdd_invoker_ptr = broadcastAdd.MakeInvokerPointer();
@@ -109,11 +102,8 @@ int main()
c_m_device_buf.FromDevice(c_m.mData.data());
Tensor<CDataType> host_c_m(f_host_tensor_descriptor1d(M, 1));
host_elementwise1D<Tensor<ABDataType>,
Tensor<ABDataType>,
Tensor<CDataType>,
EltwiseComputeDataType,
Add>(host_c_m, a_m, b_m, M, Add{});
host_elementwise1D<Tensor<ABDataType>, Tensor<ABDataType>, Tensor<CDataType>, Add>(
host_c_m, a_m, b_m, M, Add{});
pass &= ck::utils::check_err(
c_m.mData, host_c_m.mData, "Error: Incorrect results c", 1e-3, 1e-3);