diff --git a/CMakeLists.txt b/CMakeLists.txt
index f247ca581e..02b72e1dad 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -3,6 +3,7 @@ project(convolution LANGUAGES CXX CUDA)
 #c++
 message("CMAKE_CXX_COMPILER_ID: ${CMAKE_CXX_COMPILER_ID}")
+add_compile_options(-std=c++14)
 
 #boost
 find_package(Boost REQUIRED)
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index 9586666cef..6165ed605a 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -12,7 +12,7 @@ target_link_libraries(convolution boost_python3)
 # cuda
 target_link_libraries(convolution nvToolsExt)
-target_compile_features(convolution PUBLIC cxx_std_11)
+target_compile_features(convolution PUBLIC)
 set_target_properties(convolution PROPERTIES POSITION_INDEPENDENT_CODE ON)
 set_target_properties(convolution PROPERTIES CUDA_SEPARABLE_COMPILATION OFF)
diff --git a/src/include/tensor.hpp b/src/include/tensor.hpp
index dd5d13b4cf..3961a40b80 100644
--- a/src/include/tensor.hpp
+++ b/src/include/tensor.hpp
@@ -1,6 +1,9 @@
 #include <cassert>
 #include <vector>
 #include <cstddef>
+#include <thread>
+#include "cuda_runtime.h"
+#include "helper_cuda.h"
 
 typedef enum {
@@ -34,17 +37,21 @@ struct TensorDescriptor
         this->CalculateStrides();
     }
 
-    template <class Range1, class Range2>
+    template <typename Range1, typename Range2>
     TensorDescriptor(DataType_t t, const Range1& lens, const Range2& strides)
         : mLens(lens.begin(), lens.end()), mStrides(strides.begin(), strides.end()), mDataType(t)
-    {}
+    {
+    }
 
     std::size_t GetDimension() const;
     std::size_t GetElementSize() const;
     std::size_t GetElementSpace() const;
 
-    template <typename... Xs>
-    std::size_t GetIndex(Xs... xs) const
+    const std::vector<std::size_t>& GetLengths() const;
+    const std::vector<std::size_t>& GetStrides() const;
+
+    template <typename... Xs>
+    std::size_t Get1dIndex(Xs... xs) const
     {
         assert(sizeof...(Xs) == this->GetDimension());
         std::initializer_list<std::size_t> is{xs...};
@@ -81,7 +88,49 @@ struct Tensor
     template <typename G>
     void GenerateTensorValue(G g)
     {
-        parallel_for([&](Xs... xs) { mData(mDesc.GetIndex(xs...)) = g(xs...); }, mDesc.mLens);
+        // ParallelTensorFunctor([&](Xs... 
xs) { mData(mDesc.Get1dIndex(xs...)) = g(xs...); },
+        //                      mDesc.mLens)();
+        switch(mDesc.GetDimension())
+        {
+        case 1:
+        {
+            ParallelTensorFunctor([&](auto i) { mData(mDesc.Get1dIndex(i)) = g(i); },
+                                  mDesc.GetLengths()[0])();
+            break;
+        }
+        case 2:
+        {
+            ParallelTensorFunctor(
+                [&](auto i0, auto i1) { mData(mDesc.Get1dIndex(i0, i1)) = g(i0, i1); },
+                mDesc.GetLengths()[0],
+                mDesc.GetLengths()[1])();
+            break;
+        }
+        case 3:
+        {
+            ParallelTensorFunctor(
+                [&](auto i0, auto i1, auto i2) {
+                    mData(mDesc.Get1dIndex(i0, i1, i2)) = g(i0, i1, i2);
+                },
+                mDesc.GetLengths()[0],
+                mDesc.GetLengths()[1],
+                mDesc.GetLengths()[2])();
+            break;
+        }
+        case 4:
+        {
+            ParallelTensorFunctor(
+                [&](auto i0, auto i1, auto i2, auto i3) {
+                    mData(mDesc.Get1dIndex(i0, i1, i2, i3)) = g(i0, i1, i2, i3);
+                },
+                mDesc.GetLengths()[0],
+                mDesc.GetLengths()[1],
+                mDesc.GetLengths()[2],
+                mDesc.GetLengths()[3])();
+            break;
+        }
+        default: throw std::runtime_error("unsupported dimension");
+        }
     }
 
     T& operator[](std::size_t i) { return mData.at(i); }
@@ -103,42 +152,44 @@ struct Tensor
 struct GpuMem
 {
     GpuMem() = delete;
-    GpuMem(std::size_t sz, std::size_t data_sz) : mSz(sz), mDataSz(data_sz)
+    GpuMem(std::size_t size, std::size_t data_size) : mSize(size), mDataSize(data_size)
     {
-        cudaMalloc(statci_cast<void**>(&GpuBuf), mDataSize * mSz);
+        cudaMalloc(static_cast<void**>(&mGpuBuf), mDataSize * mSize);
     }
 
     int ToGpu(void* p)
     {
-        return static_cast<int>(cudaMemcpy(mGpuBuf, p, mDataSz * mSz, cudaMemCpyHostToDevice));
+        return static_cast<int>(cudaMemcpy(mGpuBuf, p, mDataSize * mSize, cudaMemcpyHostToDevice));
     }
 
-    int FromGpu(void* p) { return static_cast<int>(cuadMemCpy(p, mGpuBuf, mDataSz * mSz)); }
+    int FromGpu(void* p)
+    {
+        return static_cast<int>(cudaMemcpy(p, mGpuBuf, mDataSize * mSize, cudaMemcpyDeviceToHost));
+    }
 
     ~GpuMem() { cudaFree(mGpuBuf); }
 
     void* mGpuBuf;
-    std::size_t mSz;
-    std::size_t mDataSz;
+    std::size_t mSize;
+    std::size_t mDataSize;
 };
 
-void dummy()
+struct joinable_thread : std::thread
 {
-    auto f1 = [](int n, int c, 
int h, int w) { do_f1(n, c, h, w); };
-    auto f2 = [](int n, int c, int h, int w) { do_f2(n, c, h, w); };
+    template <typename... Xs>
+    joinable_thread(Xs&&... xs) : std::thread(std::forward<Xs>(xs)...)
+    {
+    }
 
-    auto par_f1 = generate_ParallelTensorFunctor(f1, 3, 3, 3, 3, 3);
-    auto par_f2 = generate_ParallelTensorFunctor(f2, 4, 4, 4);
+    joinable_thread(joinable_thread&&) = default;
+    joinable_thread& operator=(joinable_thread&&) = default;
 
-    auto r1 = par_f1();
-    auto r2 = par_f2();
-}
-
-template <typename F, typename... Xs>
-auto generate_parallel_tensor_functor(F f, Xs... xs)
-{
-    return ParallelTensorFunctor(f, xs...);
-}
+    ~joinable_thread()
+    {
+        if(this->joinable())
+            this->join();
+    }
+};
 
 template <typename F, typename... Xs>
 struct ParallelTensorFunctor
@@ -150,7 +201,7 @@ struct ParallelTensorFunctor
     };
 
     F mF;
-    constexpr std::size_t DIM = sizeof...(Xs);
+    static constexpr std::size_t NDIM = sizeof...(Xs);
     std::array<std::size_t, sizeof...(Xs)> mLens;
     std::array<std::size_t, sizeof...(Xs)> mStrides;
     std::size_t mN1d;
@@ -165,16 +216,29 @@ struct ParallelTensorFunctor
         mN1d = mStrides[0] * mLens[0];
     }
 
+    std::array<std::size_t, NDIM> GetNdIndices(std::size_t i) const
+    {
+        std::array<std::size_t, NDIM> indices;
+
+        for(int idim = 0; idim < NDIM; ++idim)
+        {
+            indices[idim] = i / mStrides[idim];
+            i -= indices[idim] * mStrides[idim];
+        }
+
+        return indices;
+    }
+
     void operator()(std::integral_constant<std::size_t, 1>)
     {
         for(std::size_t i = 0; i < mN1d; ++i)
         {
-            call_f_unpack_indices(mF, GetNdIndices(i));
+            call_f_unpack_args(mF, GetNdIndices(i));
         }
     }
 
     void operator()(std::integral_constant<std::size_t, 2>,
-                    std::size_t::num_thread)
+                    std::size_t num_thread)
     {
         std::size_t work_per_thread = (mN1d + num_thread - 1) / num_thread;
 
@@ -183,42 +247,43 @@ struct ParallelTensorFunctor
         for(std::size_t it = 0; it < num_thread; ++it)
         {
             std::size_t iw_begin = it * work_per_thread;
-            std::size_t iw_end   = std::min(((it+1)*work_per_thread, mN1d));
+            std::size_t iw_end   = std::min((it + 1) * work_per_thread, mN1d);
 
             auto f = [=] {
                 for(std::size_t iw = iw_begin; iw < iw_end; ++iw)
-                    call_f_unpack_indices(mF, GetNdIndices(iw);
+                {
+                    call_f_unpack_args(mF, GetNdIndices(iw));
+                }
            };
 
            threads[it] = joinable_thread(f);
        }
    }
};
 
-struct joinable_thread : std::thread
-{
-    template <typename... Xs>
-    joinable_thread(Xs&&... xs) : std::thread(std::forward<Xs>(xs)...)
-    {
-    }
-
-    ~joinable_thread()
-    {
-        if(this->joinable())
-            this->join;
-    }
-}
-
 template <typename F, typename T>
-auto call_f_unpack_indices(F f, T indices)
+auto call_f_unpack_args(F f, T args)
 {
-    constexpr std::size_t N = std::tuple_size<T>::value;
-    using NSeq              = std::make_integer_sequence<std::size_t, N>;
+    static constexpr std::size_t N = std::tuple_size<T>::value;
 
-    return call_f_unpack_indices_impl(f, indices, NSeq{});
+    return call_f_unpack_args_impl(f, args, std::make_index_sequence<N>{});
 }
 
 template <typename F, typename T, std::size_t... Is>
-auto call_f_unpack_indices_impl(F f, T indices, std::integer_sequence<std::size_t, Is...>)
+auto call_f_unpack_args_impl(F f, T args, std::integer_sequence<std::size_t, Is...>)
 {
-    return f(std::get<Is>(indices)...);
+    return f(std::get<Is>(args)...);
+}
+
+template <typename F, typename T, std::size_t... Is>
+auto construct_f_unpack_args_impl(T args, std::integer_sequence<std::size_t, Is...>)
+{
+    return F(std::get<Is>(args)...);
+}
+
+template <typename F, typename T>
+auto construct_f_unpack_args(F, T args)
+{
+    static constexpr std::size_t N = std::tuple_size<T>::value;
+
+    return construct_f_unpack_args_impl<F>(args, std::make_index_sequence<N>{});
 }
diff --git a/src/tensor.cpp b/src/tensor.cpp
index b8b6f8c516..b94677a073 100644
--- a/src/tensor.cpp
+++ b/src/tensor.cpp
@@ -3,8 +3,6 @@
 
 #include "tensor.hpp"
 
-TensorDescriptor::TensorDescriptor() {}
-
 TensorDescriptor::TensorDescriptor(DataType_t t, std::initializer_list<std::size_t> lens)
     : mLens(lens), mDataType(t)
 {
@@ -22,7 +20,7 @@ void TensorDescriptor::CalculateStrides()
 {
     mStrides.clear();
     mStrides.resize(mLens.size(), 0);
-    if(strides.empty())
+    if(mStrides.empty())
         return;
 
     mStrides.back() = 1;
@@ -41,6 +39,10 @@ std::size_t TensorDescriptor::GetElementSize() const
 
 std::size_t TensorDescriptor::GetElementSpace() const
 {
-    auto ls = mLens | boost::adaptor::transformed([](auto v) { return v - 1; });
+    auto ls = mLens | boost::adaptors::transformed([](std::size_t v) { return v - 1; });
     return std::inner_product(ls.begin(), ls.end(), 
mStrides.begin(), std::size_t{0}) + 1;
 }
+
+const std::vector<std::size_t>& TensorDescriptor::GetLengths() const { return mLens; }
+
+const std::vector<std::size_t>& TensorDescriptor::GetStrides() const { return mStrides; }