diff --git a/example/19_binary_elementwise/broadcast_add_2d.cpp b/example/19_binary_elementwise/broadcast_add_2d.cpp index 0f67499984..181d0e6a2d 100644 --- a/example/19_binary_elementwise/broadcast_add_2d.cpp +++ b/example/19_binary_elementwise/broadcast_add_2d.cpp @@ -1,10 +1,5 @@ #include -#include -#include #include -#include -#include -#include #include "check_err.hpp" #include "config.hpp" #include "device.hpp" @@ -13,7 +8,6 @@ #include "device_tensor.hpp" #include "binary_element_wise_operation.hpp" - #include "device_binary_elementwise.hpp" using F16 = ck::half_t; @@ -26,7 +20,7 @@ using EltwiseComputeDataType = F32; using Add = ck::tensor_operation::binary_element_wise::Add; using DeviceElementwiseAddInstance = ck::tensor_operation::device:: - DeviceBinaryElementwise; + DeviceBinaryElementwise; template ; + for(int m = 0; m < M; ++m) { for(int n = 0; n < N; ++n) @@ -53,7 +49,7 @@ void host_broadcast2D( ComputeDataType Bm = static_cast(B(m)); functor(Cmn, Amn, Bm); } - C(m, n) = static_cast(Cmn); + C(m, n) = static_cast(Cmn); } } } diff --git a/example/19_binary_elementwise/elementwise_add_1d.cpp b/example/19_binary_elementwise/elementwise_add_1d.cpp index 602e055290..f94c19f1d1 100644 --- a/example/19_binary_elementwise/elementwise_add_1d.cpp +++ b/example/19_binary_elementwise/elementwise_add_1d.cpp @@ -1,10 +1,5 @@ #include -#include -#include #include -#include -#include -#include #include "check_err.hpp" #include "config.hpp" #include "device.hpp" @@ -13,7 +8,6 @@ #include "device_tensor.hpp" #include "binary_element_wise_operation.hpp" - #include "device_binary_elementwise.hpp" using F16 = ck::half_t; @@ -26,7 +20,7 @@ using EltwiseComputeDataType = F32; using Add = ck::tensor_operation::binary_element_wise::Add; using DeviceElementwiseAddInstance = ck::tensor_operation::device:: - DeviceBinaryElementwise; + DeviceBinaryElementwise; template ; + for(int m = 0; m < M; ++m) { ComputeDataType Am = static_cast(A(m)); ComputeDataType Bm = static_cast(B(m)); ComputeDataType Cm = 0; functor(Cm, Am, Bm); - C(m) = static_cast(Cm); + C(m) = static_cast(Cm); } } diff --git a/example/19_binary_elementwise/elementwise_add_4d.cpp b/example/19_binary_elementwise/elementwise_add_4d.cpp index 9d468771f2..e358e993b0 100644 --- a/example/19_binary_elementwise/elementwise_add_4d.cpp +++ b/example/19_binary_elementwise/elementwise_add_4d.cpp @@ -1,20 +1,14 @@ #include -#include -#include #include -#include -#include -#include #include "check_err.hpp" #include "config.hpp" #include "device.hpp" -#include "host_reduce_util.hpp" #include "host_tensor.hpp" #include "host_tensor_generator.hpp" +#include "host_utility.hpp" #include "device_tensor.hpp" #include "binary_element_wise_operation.hpp" - #include "device_binary_elementwise.hpp" using F16 = ck::half_t; @@ -27,7 +21,7 @@ using EltwiseComputeDataType = F32; using Add = ck::tensor_operation::binary_element_wise::Add; using DeviceElementwiseAddInstance = ck::tensor_operation::device:: - DeviceBinaryElementwise; + DeviceBinaryElementwise; template & shape, Functor functor) { + using ctype = ck::remove_reference_t; + for(std::size_t n = 0; n < shape[0]; ++n) for(std::size_t c = 0; c < shape[1]; ++c) for(std::size_t h = 0; h < shape[2]; ++h) @@ -49,7 +45,7 @@ void host_elementwise4D(HostTensorC& C, ComputeDataType b_val = static_cast(B(n, c, h, w)); ComputeDataType c_val = 0; functor(c_val, a_val, b_val); - C(n, c, h, w) = static_cast(c_val); + C(n, c, h, w) = static_cast(c_val); } } @@ -75,14 +71,15 @@ int main() b_m_device_buf.ToDevice(b_m.mData.data()); auto broadcastAdd = DeviceElementwiseAddInstance{}; - auto argument = broadcastAdd.MakeArgumentPointer(a_m_device_buf.GetDeviceBuffer(), - b_m_device_buf.GetDeviceBuffer(), - c_m_device_buf.GetDeviceBuffer(), - ck::to_int_vector(nchw), - ck::to_int_vector(a_m.mDesc.GetStrides()), - ck::to_int_vector(b_m.mDesc.GetStrides()), - ck::to_int_vector(c_m.mDesc.GetStrides()), - Add{}); + auto argument = broadcastAdd.MakeArgumentPointer( + a_m_device_buf.GetDeviceBuffer(), + b_m_device_buf.GetDeviceBuffer(), + c_m_device_buf.GetDeviceBuffer(), + ck::convert_vector_element_type(nchw), + ck::convert_vector_element_type(a_m.mDesc.GetStrides()), + ck::convert_vector_element_type(b_m.mDesc.GetStrides()), + ck::convert_vector_element_type(c_m.mDesc.GetStrides()), + Add{}); if(!broadcastAdd.IsSupportedArgument(argument.get())) { diff --git a/include/ck/tensor_operation/gpu/device/device_binary_elementwise.hpp b/include/ck/tensor_operation/gpu/device/device_binary_elementwise.hpp index a3a2c89eb7..8bf6604f18 100644 --- a/include/ck/tensor_operation/gpu/device/device_binary_elementwise.hpp +++ b/include/ck/tensor_operation/gpu/device/device_binary_elementwise.hpp @@ -19,18 +19,15 @@ template struct DeviceBinaryElementwise : public BaseOperator { - DeviceBinaryElementwise(index_t threadPerBlock = 256) - : BaseOperator(), threadPerBlock_(threadPerBlock) - { - } + DeviceBinaryElementwise(index_t blockSize = 256) : BaseOperator(), blockSize_(blockSize) {} static constexpr auto I0 = Number<0>{}; template - static auto PadDescriptor_M0_1d(Desc_M0 desc_m0, index_t gridSize, index_t threadPerBlock) + static auto PadDescriptor_M0_1d(Desc_M0 desc_m0, index_t gridSize, index_t blockSize) { const auto m0 = desc_m0.GetLength(I0); - const index_t loop_step = gridSize * threadPerBlock * ScalarPerVector; + const index_t loop_step = gridSize * blockSize * ScalarPerVector; const auto pad = math::integer_least_multiple(m0, loop_step) - m0; const auto desc_m0_pad = transform_tensor_descriptor(desc_m0, @@ -40,10 +37,10 @@ struct DeviceBinaryElementwise : public BaseOperator return desc_m0_pad; } - static auto MakeDescriptor_M0(const std::vector& shape, - const std::vector& stride, + static auto MakeDescriptor_M0(const std::vector& shape, + const std::vector& stride, index_t gridSize, - index_t threadPerBlock) + index_t blockSize) { auto tupleOfShape = generate_tuple([&](auto I) { return shape[I]; }, Number{}); auto tupleOfStride = generate_tuple([&](auto I) { return stride[I]; }, Number{}); @@ -60,10 +57,10 @@ struct DeviceBinaryElementwise : public BaseOperator make_tuple(generate_sequence_v2([&](auto I) { return I; }, Number{})), make_tuple(Sequence<0>{})); - return PadDescriptor_M0_1d(desc_m0, gridSize, threadPerBlock); + return PadDescriptor_M0_1d(desc_m0, gridSize, blockSize); } else - return PadDescriptor_M0_1d(desc, gridSize, threadPerBlock); + return PadDescriptor_M0_1d(desc, gridSize, blockSize); } using GridDesc_M0 = decltype(MakeDescriptor_M0({1, 1}, {1, 1}, 1, 1)); @@ -80,26 +77,28 @@ struct DeviceBinaryElementwise : public BaseOperator Argument(const ADataType* p_a, const BDataType* p_b, CDataType* p_c, - const std::vector& shape, - const std::vector& stride_a, - const std::vector& stride_b, - const std::vector& stride_c, + const std::vector& shape, + const std::vector& stride_a, + const std::vector& stride_b, + const std::vector& stride_c, ElementwiseFunctor functor, - index_t threadPerBlock) + index_t blockSize) : p_a_(p_a), p_b_(p_b), p_c_(p_c), + shape_(shape), functor_(functor), gridSize_(120) // FIXME - Calculate the grid size by number of CU in the future { - a_grid_desc_m0_ = MakeDescriptor_M0(shape, stride_a, gridSize_, threadPerBlock); - b_grid_desc_m0_ = MakeDescriptor_M0(shape, stride_b, gridSize_, threadPerBlock); - c_grid_desc_m0_ = MakeDescriptor_M0(shape, stride_c, gridSize_, threadPerBlock); + a_grid_desc_m0_ = MakeDescriptor_M0(shape, stride_a, gridSize_, blockSize); + b_grid_desc_m0_ = MakeDescriptor_M0(shape, stride_b, gridSize_, blockSize); + c_grid_desc_m0_ = MakeDescriptor_M0(shape, stride_c, gridSize_, blockSize); } const ADataType* p_a_; const BDataType* p_b_; CDataType* p_c_; + std::vector shape_; GridDesc_M0 a_grid_desc_m0_; GridDesc_M0 b_grid_desc_m0_; GridDesc_M0 c_grid_desc_m0_; @@ -109,21 +108,21 @@ struct DeviceBinaryElementwise : public BaseOperator struct Invoker : public BaseInvoker { - Invoker(index_t threadPerBlock) : BaseInvoker(), threadPerBlock_(threadPerBlock) {} + Invoker(index_t blockSize) : BaseInvoker(), blockSize_(blockSize) {} float Run(const Argument& arg, const StreamConfig& stream_config = StreamConfig{}) { - const auto kernel = kernel_elementwise_1d; + const auto kernel = kernel_binary_elementwise_1d; float elapsed_time = launch_and_time_kernel(stream_config, kernel, dim3(arg.gridSize_), - dim3(threadPerBlock_), + dim3(blockSize_), 0, arg.p_a_, arg.p_b_, @@ -142,7 +141,7 @@ struct DeviceBinaryElementwise : public BaseOperator return Run(*dynamic_cast(p_arg), stream_config); } - index_t threadPerBlock_; + index_t blockSize_; }; bool IsSupportedArgument(const BaseArgument* p_arg) override @@ -152,10 +151,7 @@ struct DeviceBinaryElementwise : public BaseOperator if(pArg == nullptr) return false; - // shape[0] * shape[1] * shape[2] * ... - const auto m0 = pArg->c_grid_desc_m0_.GetLength(I0); - - if(m0 % ScalarPerVector != 0) + if(pArg->shape_.back() % ScalarPerVector != 0) return false; return true; @@ -164,10 +160,10 @@ struct DeviceBinaryElementwise : public BaseOperator std::unique_ptr MakeArgumentPointer(const void* p_a, const void* p_b, void* p_c, - std::vector shape, - std::vector stride_a, - std::vector stride_b, - std::vector stride_c, + std::vector shape, + std::vector stride_a, + std::vector stride_b, + std::vector stride_c, ElementwiseFunctor functor) { return std::make_unique(static_cast(p_a), @@ -178,12 +174,12 @@ struct DeviceBinaryElementwise : public BaseOperator stride_b, stride_c, functor, - threadPerBlock_); + blockSize_); } std::unique_ptr MakeInvokerPointer() { - return std::make_unique(Invoker{threadPerBlock_}); + return std::make_unique(Invoker{blockSize_}); } std::string GetTypeString() const override @@ -200,7 +196,7 @@ struct DeviceBinaryElementwise : public BaseOperator return str.str(); } - index_t threadPerBlock_; + index_t blockSize_; }; } // namespace device diff --git a/include/ck/tensor_operation/gpu/device/device_cgemm_4gemm_xdl_cshuffle.hpp b/include/ck/tensor_operation/gpu/device/device_cgemm_4gemm_xdl_cshuffle.hpp index 53271f0802..ce3f5991ec 100644 --- a/include/ck/tensor_operation/gpu/device/device_cgemm_4gemm_xdl_cshuffle.hpp +++ b/include/ck/tensor_operation/gpu/device/device_cgemm_4gemm_xdl_cshuffle.hpp @@ -71,10 +71,10 @@ struct DeviceCGemm_4Gemm_Xdl_CShuffle static constexpr auto ScalarPerVector = Number<4>{}; template - static auto PadDescriptor_M0_1d(Desc_M0 desc_m0, index_t gridSize, index_t threadPerBlock) + static auto PadDescriptor_M0_1d(Desc_M0 desc_m0, index_t gridSize, index_t blockSize) { const auto m0 = desc_m0.GetLength(I0); - const index_t loop_step = gridSize * threadPerBlock * ScalarPerVector; + const index_t loop_step = gridSize * blockSize * ScalarPerVector; const auto pad = math::integer_least_multiple(m0, loop_step) - m0; const auto desc_m0_pad = transform_tensor_descriptor(desc_m0, @@ -87,7 +87,7 @@ struct DeviceCGemm_4Gemm_Xdl_CShuffle static auto MakeDescriptor_M0(const std::vector& shape, const std::vector& stride, index_t gridSize, - index_t threadPerBlock) + index_t blockSize) { auto tupleOfShape = generate_tuple([&](auto I) { return shape[I]; }, Number<2>{}); auto tupleOfStride = generate_tuple([&](auto I) { return stride[I]; }, Number<2>{}); @@ -100,7 +100,7 @@ struct DeviceCGemm_4Gemm_Xdl_CShuffle make_tuple(generate_sequence_v2([&](auto I) { return I; }, Number<2>{})), make_tuple(Sequence<0>{})); - return PadDescriptor_M0_1d(desc_m0, gridSize, threadPerBlock); + return PadDescriptor_M0_1d(desc_m0, gridSize,blockSize); } static auto MakeAGridDescriptor_AK0_M_AK1(index_t MRaw, index_t KRaw, index_t StrideA) @@ -536,18 +536,18 @@ struct DeviceCGemm_4Gemm_Xdl_CShuffle GridDesc_M0, Substract, ScalarPerVector>; - const auto add_kernel = kernel_elementwise_1d; - const auto substract_kernel = kernel_elementwise_1d; + const auto add_kernel = kernel_binary_elementwise_1d; + const auto substract_kernel = kernel_binary_elementwise_1d; if(GridwiseGemm::CalculateHasMainKBlockLoop(K)) { diff --git a/include/ck/tensor_operation/gpu/element/binary_element_wise_operation.hpp b/include/ck/tensor_operation/gpu/element/binary_element_wise_operation.hpp index 5ab1f89ed3..6b9b90b809 100644 --- a/include/ck/tensor_operation/gpu/element/binary_element_wise_operation.hpp +++ b/include/ck/tensor_operation/gpu/element/binary_element_wise_operation.hpp @@ -7,6 +7,12 @@ namespace binary_element_wise { struct Add { + __host__ __device__ constexpr void + operator()(double& dst, const double& src1, const double& src2) const + { + dst = src1 + src2; + } + __host__ __device__ constexpr void operator()(float& dst, const float& src1, const float& src2) const { @@ -32,6 +38,12 @@ struct Add struct Substract { __host__ __device__ constexpr void + operator()(double& dst, const double& src1, const double& src2) const + { + dst = src1 - src2; + } + + __host__ __device__ constexpr void operator()(float& dst, const float& src1, const float& src2) const { dst = src1 - src2; @@ -43,7 +55,6 @@ struct Substract dst = src1 - src2; } - // TO FIX!!! __host__ __device__ constexpr void operator()(bhalf_t& dst, const bhalf_t& src1, const bhalf_t& src2) const { diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_binary_elementwise_1d.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_binary_elementwise_1d.hpp index aea54ff53c..c77d49ae94 100644 --- a/include/ck/tensor_operation/gpu/grid/gridwise_binary_elementwise_1d.hpp +++ b/include/ck/tensor_operation/gpu/grid/gridwise_binary_elementwise_1d.hpp @@ -13,13 +13,13 @@ template -__global__ void kernel_elementwise_1d(const ADataType* __restrict__ p_a_global, - const BDataType* __restrict__ p_b_global, - CDataType* __restrict__ p_c_global, - const GridDesc_M0 a_grid_desc_m0, - const GridDesc_M0 b_grid_desc_m0, - const GridDesc_M0 c_grid_desc_m0, - const ElementwiseFunctor functor) +__global__ void kernel_binary_elementwise_1d(const ADataType* __restrict__ p_a_global, + const BDataType* __restrict__ p_b_global, + CDataType* __restrict__ p_c_global, + const GridDesc_M0 a_grid_desc_m0, + const GridDesc_M0 b_grid_desc_m0, + const GridDesc_M0 c_grid_desc_m0, + const ElementwiseFunctor functor) { GridwiseBinEltwise::Run(p_a_global, p_b_global, @@ -45,7 +45,7 @@ struct GridwiseBinaryElementwise_1D using PassThrough = tensor_operation::element_wise::PassThrough; - static __device__ __host__ auto CalculateElementwiseIndex() + static __device__ auto CalculateElementwiseIndex() { const index_t global_thread_id = get_thread_global_1d_id(); return make_multi_index(global_thread_id * ScalarPerVector); @@ -70,7 +70,7 @@ struct GridwiseBinaryElementwise_1D StaticBuffer b_thread_buf; StaticBuffer c_thread_buf; - const auto thread_to_global_offset = CalculateElementwiseIndex(); + const auto thread_store_global_offset = CalculateElementwiseIndex(); auto a_global_load = ThreadwiseTensorSliceTransfer_v2{a_grid_desc_m0, thread_to_global_offset}; + false>{a_grid_desc_m0, thread_store_global_offset}; auto b_global_load = ThreadwiseTensorSliceTransfer_v2{b_grid_desc_m0, thread_to_global_offset}; + false>{b_grid_desc_m0, thread_store_global_offset}; auto c_global_write = ThreadwiseTensorSliceTransfer_v1r3{ - c_grid_desc_m0, thread_to_global_offset, PassThrough{}}; + c_grid_desc_m0, thread_store_global_offset, PassThrough{}}; - const index_t threadPerBlock = get_block_size(); - const index_t blockPerGrid = get_grid_size(); - const auto m0 = c_grid_desc_m0.GetLength(I0); - const index_t loop_step = blockPerGrid * threadPerBlock * ScalarPerVector; - const auto loop_step_index = make_multi_index(loop_step); + const index_t blockSize = get_block_size(); + const index_t blockPerGrid = get_grid_size(); + const auto m0 = c_grid_desc_m0.GetLength(I0); + const index_t loop_step = blockPerGrid * blockSize * ScalarPerVector; + const auto loop_step_index = make_multi_index(loop_step); index_t num_iter = m0 / (loop_step); do diff --git a/library/include/ck/library/host_tensor/host_utility.hpp b/library/include/ck/library/host_tensor/host_utility.hpp new file mode 100644 index 0000000000..2ff76e58c3 --- /dev/null +++ b/library/include/ck/library/host_tensor/host_utility.hpp @@ -0,0 +1,17 @@ +#pragma once +#include + +namespace ck { + +template +inline std::vector convert_vector_element_type(const std::vector& inData) +{ + std::vector outData; + + for(auto elem : inData) + outData.push_back(static_cast(elem)); + + return (outData); +}; + +}; // namespace ck