From 6521ccba67f1d0be908177960eeba59cf39fa05a Mon Sep 17 00:00:00 2001 From: Chao Liu Date: Fri, 19 Oct 2018 01:26:21 -0500 Subject: [PATCH] cpu direct conv [ROCm/composable_kernel commit: d51b81588ff6102dbde9c9d91810c1bb8f709cfc] --- CMakeLists.txt | 2 + driver/conv.cpp | 82 +++++++++---- src/CMakeLists.txt | 4 +- src/include/tensor.hpp | 261 +++++++++++++++++++++-------------------- 4 files changed, 197 insertions(+), 152 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 02b72e1dad..150632d790 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -41,5 +41,7 @@ link_libraries(${PYTHON_LIBRARIES}) include_directories(BEFORE ${CUDA_COMMON_INCLUDE_DIR}) # +include_directories(BEFORE src/include) add_subdirectory(src) add_subdirectory(driver) + diff --git a/driver/conv.cpp b/driver/conv.cpp index 4e10bce265..6889ba1c15 100644 --- a/driver/conv.cpp +++ b/driver/conv.cpp @@ -1,31 +1,67 @@ +#include #include "tensor.hpp" +template +void direct_convolution(const Tensor& in, + const Tensor& wei, + Tensor& out, + std::size_t num_thread) +{ + auto f = [&](auto n, auto k, auto ho, auto wo) { + double v = 0; + for(int c = 0; c < wei.mDesc.GetLengths()[1]; ++c) + { + for(int y = 0; y < wei.mDesc.GetLengths()[2]; ++y) + { + int hi = ho + y; + for(int x = 0; x < wei.mDesc.GetLengths()[3]; ++x) + { + int wi = wo + x; + v += in(n, c, hi, wi) * wei(k, c, y, x); + } + } + } + out(n, k, ho, wo) = v; + }; + + auto f_par = make_ParallelTensorFunctor(f, + out.mDesc.GetLengths()[0], + out.mDesc.GetLengths()[1], + out.mDesc.GetLengths()[2], + out.mDesc.GetLengths()[3]); + + f_par(num_thread); +} + +template +struct Generator +{ + + template + T operator()(Is... is) + { + return 1; + } +}; + int main() { + Tensor in({3, 16, 128, 128}); + Tensor wei({4, 16, 3, 3}); + Tensor out({3, 4, 126, 126}); - int len_in = 100; - int len_wei = 3; - int len_out = len_in - len_wei + 1; + int num_thread = std::thread::hardware_concurrency(); - std::vector in(len_in, 1); - std::vector wei(len_wei, 1); - std::vector out(len_out, 1); + std::cout << __func__ << ": num_thread " << num_thread << std::endl; - direct_convolution(in.data(), wei.data(), out.data(), len_in, len_wei); -} - -template -void direct_convolution(const T* in, const T* wei, T* out, const int len_in, const int len_wei) -{ - int len_out = len_in - len_wei + 1; - - for(int i_out = 0; i_out < len_out++ i_out) - { - double acc = 0; - for(int i_wei = 0; i_wei < len_wei; ++i_wei) - { - acc += in[i_out + i_wei] * *wei[i_wei]; - } - out[i_out] = acc; - } + in.GenerateTensorValue(Generator{}, num_thread); + wei.GenerateTensorValue(Generator{}, num_thread); + + direct_convolution(in, wei, out, num_thread); + + std::cout << __func__ << ": done" << std::endl; + + LogRange(std::cout, in.mData, ",") << std::endl; + LogRange(std::cout, wei.mData, ",") << std::endl; + LogRange(std::cout, out.mData, ",") << std::endl; } diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 6165ed605a..b88c759ac9 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -1,5 +1,3 @@ -include_directories(BEFORE include) - set(SOURCE tensor.cpp; ) @@ -11,7 +9,7 @@ set_target_properties(convolution PROPERTIES PREFIX "") target_link_libraries(convolution boost_python3) # cuda -target_link_libraries(convolution nvToolsExt) +target_link_libraries(convolution nvToolsExt cudart) target_compile_features(convolution PUBLIC) set_target_properties(convolution PROPERTIES POSITION_INDEPENDENT_CODE ON) set_target_properties(convolution PROPERTIES CUDA_SEPARABLE_COMPILATION OFF) diff --git a/src/include/tensor.hpp b/src/include/tensor.hpp index 3961a40b80..e232c74516 100644 --- a/src/include/tensor.hpp +++ b/src/include/tensor.hpp @@ -1,10 +1,28 @@ #include #include #include +#include #include +#include +#include #include "cuda_runtime.h" #include "helper_cuda.h" +template +std::ostream& LogRange(std::ostream& os, Range&& r, std::string delim) +{ + bool first = true; + for(auto&& x : r) + { + if(first) + first = false; + else + os << delim; + os << x; + } + return os; +} + typedef enum { Half = 0, @@ -19,6 +37,34 @@ struct DataType : std::integral_constant { }; +template +auto call_f_unpack_args_impl(F f, T args, std::index_sequence) +{ + return f(std::get(args)...); +} + +template +auto call_f_unpack_args(F f, T args) +{ + constexpr std::size_t N = std::tuple_size::value; + + return call_f_unpack_args_impl(f, args, std::make_index_sequence{}); +} + +template +auto construct_f_unpack_args_impl(T args, std::index_sequence) +{ + return F(std::get(args)...); +} + +template +auto construct_f_unpack_args(F, T args) +{ + constexpr std::size_t N = std::tuple_size::value; + + return construct_f_unpack_args_impl(args, std::make_index_sequence{}); +} + struct TensorDescriptor { TensorDescriptor() = delete; @@ -50,12 +96,12 @@ struct TensorDescriptor const std::vector& GetLengths() const; const std::vector& GetStrides() const; - template - std::size_t Get1dIndex(Xs... xs) const + template + std::size_t Get1dIndex(Is... is) const { - assert(sizeof...(Xs) == this->GetDimension()); - std::initializer_list is{xs...}; - return std::inner_product(is.begin(), is.end(), mStrides.begin(), std::size_t{0}); + assert(sizeof...(Is) == this->GetDimension()); + std::initializer_list iss{static_cast(is)...}; + return std::inner_product(iss.begin(), iss.end(), mStrides.begin(), std::size_t{0}); } private: @@ -65,90 +111,6 @@ struct TensorDescriptor DataType_t mDataType; }; -template -struct Tensor -{ - template - Tensor(std::initializer_list lens) - : mDesc(DataType{}, lens), mData(mDesc.GetElementSpace()) - { - } - - template - Tensor(std::vector lens) : mDesc(DataType{}, lens), mData(mDesc.GetElementSpace()) - { - } - - template - Tensor(std::vector lens, std::vector strides) - : mDesc(DataType{}, lens, strides), mData(mDesc.GetElementSpace()) - { - } - - template - void GenerateTensorValue(G g) - { - // ParallelTensorFunctor([&](Xs... xs) { mData(mDesc.Get1dIndex(xs...)) = g(xs...); }, - // mDesc.mLens)(); - switch(mDesc.GetDimension()) - { - case 1: - { - ParallelTensorFunctor([&](auto i) { mData(mDesc.Get1dIndex(i)) = g(i); }, - mDesc.GetLengths()[0])(); - break; - } - case 2: - { - ParallelTensorFunctor( - [&](auto i0, auto i1) { mData(mDesc.Get1dIndex(i0, i1)) = g(i0, i1); }, - mDesc.GetLengths()[0], - mDesc.GetLengths()[1])(); - break; - } - case 3: - { - ParallelTensorFunctor( - [&](auto i0, auto i1, auto i2) { - mData(mDesc.Get1dIndex(i0, i1, i2)) = g(i0, i1, i2); - }, - mDesc.GetLengths()[0], - mDesc.GetLengths()[1], - mDesc.GetLengths()[2])(); - break; - } - case 4: - { - ParallelTensorFunctor( - [&](auto i0, auto i1, auto i2, auto i3) { - mData(mDesc.Get1dIndex(i0, i1, i2, i3)) = g(i0, i1, i2, i3); - }, - mDesc.GetLengths()[0], - mDesc.GetLengths()[1], - mDesc.GetLengths()[3], - mDesc.GetLengths()[4])(); - break; - } - default: throw std::runtime_error("unspported dimension"); - } - } - - T& operator[](std::size_t i) { return mData.at(i); } - - const T& operator[](std::size_t i) const { return mData.at(i); } - - typename std::vector::iterator begin() { return mData.begin(); } - - typename std::vector::iterator end() { return mData.end(); } - - typename std::vector::const_iterator begin() const { return mData.begin(); } - - typename std::vector::const_iterator end() const { return mData.end(); } - - TensorDescriptor mDesc; - std::vector mData; -}; - struct GpuMem { GpuMem() = delete; @@ -194,12 +156,6 @@ struct joinable_thread : std::thread template struct ParallelTensorFunctor { - enum ParallelMethod_t - { - Serial = 0, - Parallel = 1, - }; - F mF; static constexpr std::size_t NDIM = sizeof...(Xs); std::array mLens; @@ -229,16 +185,7 @@ struct ParallelTensorFunctor return indices; } - void operator()(std::integral_constant) - { - for(std::size_t i = 0; i < mN1d; ++i) - { - call_f_unpack_args(mF, GetNdIndices(i)); - } - } - - void operator()(std::integral_constant, - std::size_t num_thread) + void operator()(std::size_t num_thread) const { std::size_t work_per_thread = (mN1d + num_thread - 1) / num_thread; @@ -247,7 +194,7 @@ struct ParallelTensorFunctor for(std::size_t it = 0; it < num_thread; ++it) { std::size_t iw_begin = it * work_per_thread; - std::size_t iw_end = std::min(((it + 1) * work_per_thread, mN1d)); + std::size_t iw_end = std::min((it + 1) * work_per_thread, mN1d); auto f = [=] { for(std::size_t iw = iw_begin; iw < iw_end; ++iw) @@ -260,30 +207,92 @@ struct ParallelTensorFunctor } }; -template -auto call_f_unpack_args(F f, T args) +template +auto make_ParallelTensorFunctor(F f, Xs... xs) { - static constexpr std::size_t N = std::tuple_size::value; - - return call_f_unpack_args_impl(f, args, std::make_index_sequence{}); + return ParallelTensorFunctor(f, xs...); } -template -auto call_f_unpack_args_impl(F f, T args, std::integer_sequence) +template +struct Tensor { - return f(std::get(args)...); -} + template + Tensor(std::initializer_list lens) + : mDesc(DataType{}, lens), mData(mDesc.GetElementSpace()) + { + } -template -auto construct_f_unpack_args_impl(T args, std::integer_sequence) -{ - return F(std::get(args)...); -} + template + Tensor(std::vector lens) : mDesc(DataType{}, lens), mData(mDesc.GetElementSpace()) + { + } -template -auto construct_f_unpack_args(F, T args) -{ - static constexpr std::size_t N = std::tuple_size::value; + template + Tensor(std::vector lens, std::vector strides) + : mDesc(DataType{}, lens, strides), mData(mDesc.GetElementSpace()) + { + } - return construct_f_unpack_args_impl(args, std::make_index_sequence{}); -} + template + void GenerateTensorValue(G g, std::size_t num_thread = 1) + { + switch(mDesc.GetDimension()) + { + case 1: + { + auto f = [&](auto i) { (*this)(i) = g(i); }; + make_ParallelTensorFunctor(f, mDesc.GetLengths()[0])(num_thread); + break; + } + case 2: + { + auto f = [&](auto i0, auto i1) { (*this)(i0, i1) = g(i0, i1); }; + make_ParallelTensorFunctor(f, mDesc.GetLengths()[0], mDesc.GetLengths()[1])(num_thread); + break; + } + case 3: + { + auto f = [&](auto i0, auto i1, auto i2) { (*this)(i0, i1, i2) = g(i0, i1, i2); }; + make_ParallelTensorFunctor( + f, mDesc.GetLengths()[0], mDesc.GetLengths()[1], mDesc.GetLengths()[2])(num_thread); + break; + } + case 4: + { + auto f = [&](auto i0, auto i1, auto i2, auto i3) { + (*this)(i0, i1, i2, i3) = g(i0, i1, i2, i3); + }; + make_ParallelTensorFunctor(f, + mDesc.GetLengths()[0], + mDesc.GetLengths()[1], + mDesc.GetLengths()[2], + mDesc.GetLengths()[3])(num_thread); + break; + } + default: throw std::runtime_error("unspported dimension"); + } + } + + template + T& operator()(Is... is) + { + return mData[mDesc.Get1dIndex(is...)]; + } + + template + const T& operator()(Is... is) const + { + return mData[mDesc.Get1dIndex(is...)]; + } + + typename std::vector::iterator begin() { return mData.begin(); } + + typename std::vector::iterator end() { return mData.end(); } + + typename std::vector::const_iterator begin() const { return mData.begin(); } + + typename std::vector::const_iterator end() const { return mData.end(); } + + TensorDescriptor mDesc; + std::vector mData; +};