commit 8bfafec5542d4f72d3b9112e7db99697e436f751 Author: Chao Liu Date: Mon Oct 8 22:49:58 2018 -0500 start adding convolution [ROCm/composable_kernel commit: fc98757acd68219eebecb16b15ac472172f6dd55] diff --git a/.clang-format b/.clang-format new file mode 100644 index 0000000000..22f2674966 --- /dev/null +++ b/.clang-format @@ -0,0 +1,90 @@ +--- +Language: Cpp +AccessModifierOffset: 0 +AlignAfterOpenBracket: Align +AlignConsecutiveAssignments: true +AlignConsecutiveDeclarations: false +AlignEscapedNewlinesLeft: true +AlignOperands: true +AlignTrailingComments: true +AllowAllParametersOfDeclarationOnNextLine: true +AllowShortBlocksOnASingleLine: true +AllowShortCaseLabelsOnASingleLine: true +AllowShortFunctionsOnASingleLine: All +AllowShortIfStatementsOnASingleLine: false +AllowShortLoopsOnASingleLine: false +AlwaysBreakAfterDefinitionReturnType: None +AlwaysBreakAfterReturnType: None +AlwaysBreakBeforeMultilineStrings: false +AlwaysBreakTemplateDeclarations: true +BinPackArguments: false +BinPackParameters: false +BraceWrapping: + AfterClass: true + AfterControlStatement: true + AfterEnum: true + AfterFunction: true + AfterNamespace: false + AfterObjCDeclaration: true + AfterStruct: true + AfterUnion: true + BeforeCatch: true + BeforeElse: true + IndentBraces: false +BreakBeforeBinaryOperators: None +BreakBeforeBraces: Custom +BreakBeforeTernaryOperators: true +BreakConstructorInitializersBeforeComma: false +ColumnLimit: 100 +CommentPragmas: '^ IWYU pragma:' +ConstructorInitializerAllOnOneLineOrOnePerLine: true +ConstructorInitializerIndentWidth: 4 +ContinuationIndentWidth: 4 +Cpp11BracedListStyle: true +DerivePointerAlignment: false +DisableFormat: false +ExperimentalAutoDetectBinPacking: false +ForEachMacros: [ foreach, Q_FOREACH, BOOST_FOREACH ] +IncludeCategories: + - Regex: '^"(llvm|llvm-c|clang|clang-c)/' + Priority: 2 + - Regex: '^(<|"(gtest|isl|json)/)' + Priority: 3 + - Regex: '.*' + Priority: 1 +IndentCaseLabels: false +IndentWidth: 4 +IndentWrappedFunctionNames: false +KeepEmptyLinesAtTheStartOfBlocks: true +MacroBlockBegin: '' +MacroBlockEnd: '' +MaxEmptyLinesToKeep: 1 +NamespaceIndentation: None +ObjCBlockIndentWidth: 2 +ObjCSpaceAfterProperty: false +ObjCSpaceBeforeProtocolList: true +PenaltyBreakBeforeFirstCallParameter: 19 +PenaltyBreakComment: 300 +PenaltyBreakFirstLessLess: 120 +PenaltyBreakString: 1000 +PenaltyExcessCharacter: 1000000 +PenaltyReturnTypeOnItsOwnLine: 60 +PointerAlignment: Left +ReflowComments: true +SortIncludes: false +SpaceAfterCStyleCast: false +# SpaceAfterTemplateKeyword: true +SpaceBeforeAssignmentOperators: true +SpaceBeforeParens: Never +SpaceInEmptyParentheses: false +SpacesBeforeTrailingComments: 1 +SpacesInAngles: false +SpacesInContainerLiterals: true +SpacesInCStyleCastParentheses: false +SpacesInParentheses: false +SpacesInSquareBrackets: false +Standard: Cpp11 +TabWidth: 8 +UseTab: Never +... + diff --git a/CMakeLists.txt b/CMakeLists.txt new file mode 100644 index 0000000000..f247ca581e --- /dev/null +++ b/CMakeLists.txt @@ -0,0 +1,44 @@ +cmake_minimum_required(VERSION 3.9) +project(convolution LANGUAGES CXX CUDA) + +#c++ +message("CMAKE_CXX_COMPILER_ID: ${CMAKE_CXX_COMPILER_ID}") + +#boost +find_package(Boost REQUIRED) + +message("Boost_INCLUDE_DIRS: ${Boost_INCLUDE_DIRS}") +message("Boost_LIBRARY_DIRS: ${Boost_LIBRARY_DIRS}") + +include_directories(BEFORE ${Boost_INCLUDE_DIRS}) +link_directories(${Boost_LIBRARY_DIRS}) + +#openMP +if( NOT( ${CMAKE_CXX_COMPILER_ID} STREQUAL "AppleClang") ) + find_package(OpenMP REQUIRED) + + message("OpenMP_CXX_LIB_NAMES: ${OpenMP_CXX_LIB_NAMES}") + message("OpenMP_gomp_LIBRARY: ${OpenMP_gomp_LIBRARY}") + message("OpenMP_pthread_LIBRARY: ${OpenMP_pthread_LIBRARY}") + message("OpenMP_CXX_FLAGS: ${OpenMP_CXX_FLAGS}") + + set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OpenMP_CXX_FLAGS}") + link_libraries(${OpenMP_gomp_LIBRARY}) + link_libraries(${OpenMP_pthread_LIBRARY}) +endif( NOT( ${CMAKE_CXX_COMPILER_ID} STREQUAL "AppleClang") ) + +#python +find_package(PythonLibs 3 REQUIRED) + +message("PYTHON_INCLUDE_DIRS: ${PYTHON_INCLUDE_DIRS}") +message("PYTHON_LIBRARIES: ${PYTHON_LIBRARIES}") + +include_directories(BEFORE ${PYTHON_INCLUDE_DIRS}) +link_libraries(${PYTHON_LIBRARIES}) + +#cuda +include_directories(BEFORE ${CUDA_COMMON_INCLUDE_DIR}) + +# +add_subdirectory(src) +add_subdirectory(driver) diff --git a/cmake.sh b/cmake.sh new file mode 100755 index 0000000000..456abe9e91 --- /dev/null +++ b/cmake.sh @@ -0,0 +1,22 @@ +#!/bin/bash + +rm -f CMakeCache.txt +rm -f *.cmake +rm -rf CMakeFiles + +MY_PROJECT_SOURCE=/package/code/github/test_feature/SpMV +MY_PROJECT_INSTALL=../install.dir + +cmake \ +-D CMAKE_INSTALL_PREFIX=${MY_PROJECT_INSTALL} \ +-D CMAKE_CXX_FLAGS="${CMAKE_CXX_FLAGS} -std=c++11" \ +-D CMAKE_BUILD_TYPE=Release \ +-D CMAKE_VERBOSE_MAKEFILE:BOOL=ON \ +-D BOOST_ROOT="/package/install/boost_1.66.0-mpich_3.2" \ +-D CMAKE_CUDA_COMPILER="/package/install/cuda_9.0/bin/nvcc" \ +-D CUDA_COMMON_INCLUDE_DIR="/package/code/github/test_feature/cuda_9.0_common/inc" \ +-D CMAKE_CUDA_FLAGS="-ccbin g++ -m64 -Xcompiler -fopenmp -lineinfo --source-in-ptx -keep -Xptxas -v -arch=sm_35 -Xptxas -v -maxrregcount=40" \ +${MY_PROJECT_SOURCE} + +#-D CMAKE_CUDA_FLAGS="-lineinfo --source-in-ptx -keep -Xptxas -v -arch=sm_35 -Xptxas -v -maxrregcount=32" \ +#-D CMAKE_CUDA_FLAGS="-G -lineinfo --source-in-ptx -keep -Xptxas -v -arch=sm_35" \ diff --git a/driver/CMakeLists.txt b/driver/CMakeLists.txt new file mode 100644 index 0000000000..dae6f62835 --- /dev/null +++ b/driver/CMakeLists.txt @@ -0,0 +1,2 @@ +add_executable(conv EXCLUDE_FROM_ALL conv.cpp) +target_link_libraries(conv convolution) diff --git a/driver/conv.cpp b/driver/conv.cpp new file mode 100644 index 0000000000..4e10bce265 --- /dev/null +++ b/driver/conv.cpp @@ -0,0 +1,31 @@ +#include "tensor.hpp" + +int main() +{ + + int len_in = 100; + int len_wei = 3; + int len_out = len_in - len_wei + 1; + + std::vector in(len_in, 1); + std::vector wei(len_wei, 1); + std::vector out(len_out, 1); + + direct_convolution(in.data(), wei.data(), out.data(), len_in, len_wei); +} + +template +void direct_convolution(const T* in, const T* wei, T* out, const int len_in, const int len_wei) +{ + int len_out = len_in - len_wei + 1; + + for(int i_out = 0; i_out < len_out++ i_out) + { + double acc = 0; + for(int i_wei = 0; i_wei < len_wei; ++i_wei) + { + acc += in[i_out + i_wei] * *wei[i_wei]; + } + out[i_out] = acc; + } +} diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt new file mode 100644 index 0000000000..9586666cef --- /dev/null +++ b/src/CMakeLists.txt @@ -0,0 +1,19 @@ +include_directories(BEFORE include) + +set(SOURCE + tensor.cpp; +) + +add_library(convolution SHARED ${SOURCE}) +set_target_properties(convolution PROPERTIES PREFIX "") + +# boost.python +target_link_libraries(convolution boost_python3) + +# cuda +target_link_libraries(convolution nvToolsExt) +target_compile_features(convolution PUBLIC cxx_std_11) +set_target_properties(convolution PROPERTIES POSITION_INDEPENDENT_CODE ON) +set_target_properties(convolution PROPERTIES CUDA_SEPARABLE_COMPILATION OFF) + +install(TARGETS convolution LIBRARY DESTINATION lib) diff --git a/src/include/tensor.hpp b/src/include/tensor.hpp new file mode 100644 index 0000000000..dd5d13b4cf --- /dev/null +++ b/src/include/tensor.hpp @@ -0,0 +1,224 @@ +#include +#include +#include + +typedef enum +{ + Half = 0, + Float = 1, +} DataType_t; + +template +struct DataType; + +template <> +struct DataType : std::integral_constant +{ +}; + +struct TensorDescriptor +{ + TensorDescriptor() = delete; + TensorDescriptor(DataType_t t, std::initializer_list lens); + TensorDescriptor(DataType_t t, + std::initializer_list lens, + std::initializer_list strides); + TensorDescriptor(DataType_t t, std::vector lens, std::vector strides); + + void CalculateStrides(); + + template + TensorDescriptor(DataType_t t, const Range& lens) + : mLens(lens.begin(), lens.end()), mDataType(t) + { + this->CalculateStrides(); + } + + template + TensorDescriptor(DataType_t t, const Range1& lens, const Range2& strides) + : mLens(lens.begin(), lens.end()), mStrides(strides.begin(), strides.end()), mDataType(t) + {} + + std::size_t GetDimension() const; + std::size_t GetElementSize() const; + std::size_t GetElementSpace() const; + + template + std::size_t GetIndex(Xs... xs) const + { + assert(sizeof...(Xs) == this->GetDimension()); + std::initializer_list is{xs...}; + return std::inner_product(is.begin(), is.end(), mStrides.begin(), std::size_t{0}); + } + + private: + std::vector mLens; + std::vector mStrides; + + DataType_t mDataType; +}; + +template +struct Tensor +{ + template + Tensor(std::initializer_list lens) + : mDesc(DataType{}, lens), mData(mDesc.GetElementSpace()) + { + } + + template + Tensor(std::vector lens) : mDesc(DataType{}, lens), mData(mDesc.GetElementSpace()) + { + } + + template + Tensor(std::vector lens, std::vector strides) + : mDesc(DataType{}, lens, strides), mData(mDesc.GetElementSpace()) + { + } + + template + void GenerateTensorValue(G g) + { + parallel_for([&](Xs... xs) { mData(mDesc.GetIndex(xs...)) = g(xs...); }, mDesc.mLens); + } + + T& operator[](std::size_t i) { return mData.at(i); } + + const T& operator[](std::size_t i) const { return mData.at(i); } + + typename std::vector::iterator begin() { return mData.begin(); } + + typename std::vector::iterator end() { return mData.end(); } + + typename std::vector::const_iterator begin() const { return mData.begin(); } + + typename std::vector::const_iterator end() const { return mData.end(); } + + TensorDescriptor mDesc; + std::vector mData; +}; + +struct GpuMem +{ + GpuMem() = delete; + GpuMem(std::size_t sz, std::size_t data_sz) : mSz(sz), mDataSz(data_sz) + { + cudaMalloc(statci_cast(&GpuBuf), mDataSize * mSz); + } + + int ToGpu(void* p) + { + return static_cast(cudaMemcpy(mGpuBuf, p, mDataSz * mSz, cudaMemCpyHostToDevice)); + } + + int FromGpu(void* p) { return static_cast(cuadMemCpy(p, mGpuBuf, mDataSz * mSz)); } + + ~GpuMem() { cudaFree(mGpuBuf); } + + void* mGpuBuf; + std::size_t mSz; + std::size_t mDataSz; +}; + +void dummy() +{ + auto f1 = [](int n, int c, int h, int w) { do_f1(n, c, h, w); }; + auto f2 = [](int n, int c, int h, int w) { do_f2(n, c, h, w); }; + + auto par_f1 = generate_ParallelTensorFunctor(f1, 3, 3, 3, 3, 3); + auto par_f2 = generate_ParallelTensorFunctor(f2, 4, 4, 4); + + auto r1 = par_f1(); + auto r2 = par_f2(); +} + +template +auto generate_parallel_tensor_functor(F f, Xs... xs) +{ + return ParallelTensorFunctor(f, xs...); +} + +template +struct ParallelTensorFunctor +{ + enum ParallelMethod_t + { + Serial = 0, + Parallel = 1, + }; + + F mF; + constexpr std::size_t DIM = sizeof...(Xs); + std::array mLens; + std::array mStrides; + std::size_t mN1d; + + ParallelTensorFunctor(F f, Xs... xs) : mF(f), mLens({static_cast(xs)...}) + { + mStrides.back() = 1; + std::partial_sum(mLens.rbegin(), + mLens.rend() - 1, + mStrides.rbegin() + 1, + std::multiplies()); + mN1d = mStrides[0] * mLens[0]; + } + + void operator()(std::integral_constant) + { + for(std::size_t i = 0; i < mN1d; ++i) + { + call_f_unpack_indices(mF, GetNdIndices(i)); + } + } + + void operator()(std::integral_constant, + std::size_t::num_thread) + { + std::size_t work_per_thread = (mN1d + num_thread - 1) / num_thread; + + std::vector threads(num_thread); + + for(std::size_t it = 0; it < num_thread; ++it) + { + std::size_t iw_begin = it * work_per_thread; + std::size_t iw_end = std::min(((it+1)*work_per_thread, mN1d)); + + auto f = [=] { + for(std::size_t iw = iw_begin; iw < iw_end; ++iw) + call_f_unpack_indices(mF, GetNdIndices(iw); + }; + threads[it] = joinable_thread(f); + } + } +}; + +struct joinable_thread : std::thread +{ + template + joinable_thread(Xs&&... xs) : std::thread(std::forward(xs)...) + { + } + + ~joinable_thread() + { + if(this->joinable()) + this->join; + } +} + +template +auto call_f_unpack_indices(F f, T indices) +{ + constexpr std::size_t N = std::tuple_size::value; + using NSeq = std::make_integer_sequence; + + return call_f_unpack_indices_impl(f, indices, NSeq{}); +} + +template +auto call_f_unpack_indices_impl(F f, T indices, std::integer_sequence) +{ + return f(std::get(indices)...); +} diff --git a/src/tensor.cpp b/src/tensor.cpp new file mode 100644 index 0000000000..b8b6f8c516 --- /dev/null +++ b/src/tensor.cpp @@ -0,0 +1,46 @@ +#include +#include + +#include "tensor.hpp" + +TensorDescriptor::TensorDescriptor() {} + +TensorDescriptor::TensorDescriptor(DataType_t t, std::initializer_list lens) + : mLens(lens), mDataType(t) +{ + this->CalculateStrides(); +} + +TensorDescriptor::TensorDescriptor(DataType_t t, + std::vector lens, + std::vector strides) + : mLens(lens), mStrides(strides), mDataType(t) +{ +} + +void TensorDescriptor::CalculateStrides() +{ + mStrides.clear(); + mStrides.resize(mLens.size(), 0); + if(strides.empty()) + return; + + mStrides.back() = 1; + std::partial_sum( + mLens.rbegin(), mLens.rend() - 1, mStrides.rbegin() + 1, std::multiplies()); +} + +std::size_t TensorDescriptor::GetDimension() const { return mLens.size(); } + +std::size_t TensorDescriptor::GetElementSize() const +{ + assert(mLens.size() == mStrides.size()); + return std::accumulate( + mLens.begin(), mLens.end(), std::size_t{1}, std::multiplies()); +} + +std::size_t TensorDescriptor::GetElementSpace() const +{ + auto ls = mLens | boost::adaptor::transformed([](auto v) { return v - 1; }); + return std::inner_product(ls.begin(), ls.end(), mStrides.begin(), std::size_t{0}) + 1; +}