mirror of
https://github.com/ROCm/composable_kernel.git
synced 2026-05-13 17:55:48 +00:00
90
.clang-format
Normal file
90
.clang-format
Normal file
@@ -0,0 +1,90 @@
|
||||
---
|
||||
Language: Cpp
|
||||
AccessModifierOffset: 0
|
||||
AlignAfterOpenBracket: Align
|
||||
AlignConsecutiveAssignments: true
|
||||
AlignConsecutiveDeclarations: false
|
||||
AlignEscapedNewlinesLeft: true
|
||||
AlignOperands: true
|
||||
AlignTrailingComments: true
|
||||
AllowAllParametersOfDeclarationOnNextLine: true
|
||||
AllowShortBlocksOnASingleLine: true
|
||||
AllowShortCaseLabelsOnASingleLine: true
|
||||
AllowShortFunctionsOnASingleLine: All
|
||||
AllowShortIfStatementsOnASingleLine: false
|
||||
AllowShortLoopsOnASingleLine: false
|
||||
AlwaysBreakAfterDefinitionReturnType: None
|
||||
AlwaysBreakAfterReturnType: None
|
||||
AlwaysBreakBeforeMultilineStrings: false
|
||||
AlwaysBreakTemplateDeclarations: true
|
||||
BinPackArguments: false
|
||||
BinPackParameters: false
|
||||
BraceWrapping:
|
||||
AfterClass: true
|
||||
AfterControlStatement: true
|
||||
AfterEnum: true
|
||||
AfterFunction: true
|
||||
AfterNamespace: false
|
||||
AfterObjCDeclaration: true
|
||||
AfterStruct: true
|
||||
AfterUnion: true
|
||||
BeforeCatch: true
|
||||
BeforeElse: true
|
||||
IndentBraces: false
|
||||
BreakBeforeBinaryOperators: None
|
||||
BreakBeforeBraces: Custom
|
||||
BreakBeforeTernaryOperators: true
|
||||
BreakConstructorInitializersBeforeComma: false
|
||||
ColumnLimit: 100
|
||||
CommentPragmas: '^ IWYU pragma:'
|
||||
ConstructorInitializerAllOnOneLineOrOnePerLine: true
|
||||
ConstructorInitializerIndentWidth: 4
|
||||
ContinuationIndentWidth: 4
|
||||
Cpp11BracedListStyle: true
|
||||
DerivePointerAlignment: false
|
||||
DisableFormat: false
|
||||
ExperimentalAutoDetectBinPacking: false
|
||||
ForEachMacros: [ foreach, Q_FOREACH, BOOST_FOREACH ]
|
||||
IncludeCategories:
|
||||
- Regex: '^"(llvm|llvm-c|clang|clang-c)/'
|
||||
Priority: 2
|
||||
- Regex: '^(<|"(gtest|isl|json)/)'
|
||||
Priority: 3
|
||||
- Regex: '.*'
|
||||
Priority: 1
|
||||
IndentCaseLabels: false
|
||||
IndentWidth: 4
|
||||
IndentWrappedFunctionNames: false
|
||||
KeepEmptyLinesAtTheStartOfBlocks: true
|
||||
MacroBlockBegin: ''
|
||||
MacroBlockEnd: ''
|
||||
MaxEmptyLinesToKeep: 1
|
||||
NamespaceIndentation: None
|
||||
ObjCBlockIndentWidth: 2
|
||||
ObjCSpaceAfterProperty: false
|
||||
ObjCSpaceBeforeProtocolList: true
|
||||
PenaltyBreakBeforeFirstCallParameter: 19
|
||||
PenaltyBreakComment: 300
|
||||
PenaltyBreakFirstLessLess: 120
|
||||
PenaltyBreakString: 1000
|
||||
PenaltyExcessCharacter: 1000000
|
||||
PenaltyReturnTypeOnItsOwnLine: 60
|
||||
PointerAlignment: Left
|
||||
ReflowComments: true
|
||||
SortIncludes: false
|
||||
SpaceAfterCStyleCast: false
|
||||
# SpaceAfterTemplateKeyword: true
|
||||
SpaceBeforeAssignmentOperators: true
|
||||
SpaceBeforeParens: Never
|
||||
SpaceInEmptyParentheses: false
|
||||
SpacesBeforeTrailingComments: 1
|
||||
SpacesInAngles: false
|
||||
SpacesInContainerLiterals: true
|
||||
SpacesInCStyleCastParentheses: false
|
||||
SpacesInParentheses: false
|
||||
SpacesInSquareBrackets: false
|
||||
Standard: Cpp11
|
||||
TabWidth: 8
|
||||
UseTab: Never
|
||||
...
|
||||
|
||||
44
CMakeLists.txt
Normal file
44
CMakeLists.txt
Normal file
@@ -0,0 +1,44 @@
|
||||
cmake_minimum_required(VERSION 3.9)
|
||||
project(convolution LANGUAGES CXX CUDA)
|
||||
|
||||
#c++
|
||||
message("CMAKE_CXX_COMPILER_ID: ${CMAKE_CXX_COMPILER_ID}")
|
||||
|
||||
#boost
|
||||
find_package(Boost REQUIRED)
|
||||
|
||||
message("Boost_INCLUDE_DIRS: ${Boost_INCLUDE_DIRS}")
|
||||
message("Boost_LIBRARY_DIRS: ${Boost_LIBRARY_DIRS}")
|
||||
|
||||
include_directories(BEFORE ${Boost_INCLUDE_DIRS})
|
||||
link_directories(${Boost_LIBRARY_DIRS})
|
||||
|
||||
#openMP
|
||||
if( NOT( ${CMAKE_CXX_COMPILER_ID} STREQUAL "AppleClang") )
|
||||
find_package(OpenMP REQUIRED)
|
||||
|
||||
message("OpenMP_CXX_LIB_NAMES: ${OpenMP_CXX_LIB_NAMES}")
|
||||
message("OpenMP_gomp_LIBRARY: ${OpenMP_gomp_LIBRARY}")
|
||||
message("OpenMP_pthread_LIBRARY: ${OpenMP_pthread_LIBRARY}")
|
||||
message("OpenMP_CXX_FLAGS: ${OpenMP_CXX_FLAGS}")
|
||||
|
||||
set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OpenMP_CXX_FLAGS}")
|
||||
link_libraries(${OpenMP_gomp_LIBRARY})
|
||||
link_libraries(${OpenMP_pthread_LIBRARY})
|
||||
endif( NOT( ${CMAKE_CXX_COMPILER_ID} STREQUAL "AppleClang") )
|
||||
|
||||
#python
|
||||
find_package(PythonLibs 3 REQUIRED)
|
||||
|
||||
message("PYTHON_INCLUDE_DIRS: ${PYTHON_INCLUDE_DIRS}")
|
||||
message("PYTHON_LIBRARIES: ${PYTHON_LIBRARIES}")
|
||||
|
||||
include_directories(BEFORE ${PYTHON_INCLUDE_DIRS})
|
||||
link_libraries(${PYTHON_LIBRARIES})
|
||||
|
||||
#cuda
|
||||
include_directories(BEFORE ${CUDA_COMMON_INCLUDE_DIR})
|
||||
|
||||
#
|
||||
add_subdirectory(src)
|
||||
add_subdirectory(driver)
|
||||
22
cmake.sh
Executable file
22
cmake.sh
Executable file
@@ -0,0 +1,22 @@
|
||||
#!/bin/bash
|
||||
|
||||
rm -f CMakeCache.txt
|
||||
rm -f *.cmake
|
||||
rm -rf CMakeFiles
|
||||
|
||||
MY_PROJECT_SOURCE=/package/code/github/test_feature/SpMV
|
||||
MY_PROJECT_INSTALL=../install.dir
|
||||
|
||||
cmake \
|
||||
-D CMAKE_INSTALL_PREFIX=${MY_PROJECT_INSTALL} \
|
||||
-D CMAKE_CXX_FLAGS="${CMAKE_CXX_FLAGS} -std=c++11" \
|
||||
-D CMAKE_BUILD_TYPE=Release \
|
||||
-D CMAKE_VERBOSE_MAKEFILE:BOOL=ON \
|
||||
-D BOOST_ROOT="/package/install/boost_1.66.0-mpich_3.2" \
|
||||
-D CMAKE_CUDA_COMPILER="/package/install/cuda_9.0/bin/nvcc" \
|
||||
-D CUDA_COMMON_INCLUDE_DIR="/package/code/github/test_feature/cuda_9.0_common/inc" \
|
||||
-D CMAKE_CUDA_FLAGS="-ccbin g++ -m64 -Xcompiler -fopenmp -lineinfo --source-in-ptx -keep -Xptxas -v -arch=sm_35 -Xptxas -v -maxrregcount=40" \
|
||||
${MY_PROJECT_SOURCE}
|
||||
|
||||
#-D CMAKE_CUDA_FLAGS="-lineinfo --source-in-ptx -keep -Xptxas -v -arch=sm_35 -Xptxas -v -maxrregcount=32" \
|
||||
#-D CMAKE_CUDA_FLAGS="-G -lineinfo --source-in-ptx -keep -Xptxas -v -arch=sm_35" \
|
||||
2
driver/CMakeLists.txt
Normal file
2
driver/CMakeLists.txt
Normal file
@@ -0,0 +1,2 @@
|
||||
add_executable(conv EXCLUDE_FROM_ALL conv.cpp)
|
||||
target_link_libraries(conv convolution)
|
||||
31
driver/conv.cpp
Normal file
31
driver/conv.cpp
Normal file
@@ -0,0 +1,31 @@
|
||||
#include "tensor.hpp"
|
||||
|
||||
int main()
|
||||
{
|
||||
|
||||
int len_in = 100;
|
||||
int len_wei = 3;
|
||||
int len_out = len_in - len_wei + 1;
|
||||
|
||||
std::vector<float> in(len_in, 1);
|
||||
std::vector<float> wei(len_wei, 1);
|
||||
std::vector<float> out(len_out, 1);
|
||||
|
||||
direct_convolution(in.data(), wei.data(), out.data(), len_in, len_wei);
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
void direct_convolution(const T* in, const T* wei, T* out, const int len_in, const int len_wei)
|
||||
{
|
||||
int len_out = len_in - len_wei + 1;
|
||||
|
||||
for(int i_out = 0; i_out < len_out++ i_out)
|
||||
{
|
||||
double acc = 0;
|
||||
for(int i_wei = 0; i_wei < len_wei; ++i_wei)
|
||||
{
|
||||
acc += in[i_out + i_wei] * *wei[i_wei];
|
||||
}
|
||||
out[i_out] = acc;
|
||||
}
|
||||
}
|
||||
19
src/CMakeLists.txt
Normal file
19
src/CMakeLists.txt
Normal file
@@ -0,0 +1,19 @@
|
||||
include_directories(BEFORE include)
|
||||
|
||||
set(SOURCE
|
||||
tensor.cpp;
|
||||
)
|
||||
|
||||
add_library(convolution SHARED ${SOURCE})
|
||||
set_target_properties(convolution PROPERTIES PREFIX "")
|
||||
|
||||
# boost.python
|
||||
target_link_libraries(convolution boost_python3)
|
||||
|
||||
# cuda
|
||||
target_link_libraries(convolution nvToolsExt)
|
||||
target_compile_features(convolution PUBLIC cxx_std_11)
|
||||
set_target_properties(convolution PROPERTIES POSITION_INDEPENDENT_CODE ON)
|
||||
set_target_properties(convolution PROPERTIES CUDA_SEPARABLE_COMPILATION OFF)
|
||||
|
||||
install(TARGETS convolution LIBRARY DESTINATION lib)
|
||||
224
src/include/tensor.hpp
Normal file
224
src/include/tensor.hpp
Normal file
@@ -0,0 +1,224 @@
|
||||
#include <thread>
|
||||
#include <vector>
|
||||
#include <numeric>
|
||||
|
||||
typedef enum
|
||||
{
|
||||
Half = 0,
|
||||
Float = 1,
|
||||
} DataType_t;
|
||||
|
||||
template <class T>
|
||||
struct DataType;
|
||||
|
||||
template <>
|
||||
struct DataType<float> : std::integral_constant<DataType_t, DataType_t::Float>
|
||||
{
|
||||
};
|
||||
|
||||
struct TensorDescriptor
|
||||
{
|
||||
TensorDescriptor() = delete;
|
||||
TensorDescriptor(DataType_t t, std::initializer_list<std::size_t> lens);
|
||||
TensorDescriptor(DataType_t t,
|
||||
std::initializer_list<std::size_t> lens,
|
||||
std::initializer_list<std::size_t> strides);
|
||||
TensorDescriptor(DataType_t t, std::vector<std::size_t> lens, std::vector<std::size_t> strides);
|
||||
|
||||
void CalculateStrides();
|
||||
|
||||
template <class Range>
|
||||
TensorDescriptor(DataType_t t, const Range& lens)
|
||||
: mLens(lens.begin(), lens.end()), mDataType(t)
|
||||
{
|
||||
this->CalculateStrides();
|
||||
}
|
||||
|
||||
template<class Range1, class Range2>
|
||||
TensorDescriptor(DataType_t t, const Range1& lens, const Range2& strides)
|
||||
: mLens(lens.begin(), lens.end()), mStrides(strides.begin(), strides.end()), mDataType(t)
|
||||
{}
|
||||
|
||||
std::size_t GetDimension() const;
|
||||
std::size_t GetElementSize() const;
|
||||
std::size_t GetElementSpace() const;
|
||||
|
||||
template<class... Xs>
|
||||
std::size_t GetIndex(Xs... xs) const
|
||||
{
|
||||
assert(sizeof...(Xs) == this->GetDimension());
|
||||
std::initializer_list<std::size_t> is{xs...};
|
||||
return std::inner_product(is.begin(), is.end(), mStrides.begin(), std::size_t{0});
|
||||
}
|
||||
|
||||
private:
|
||||
std::vector<std::size_t> mLens;
|
||||
std::vector<std::size_t> mStrides;
|
||||
|
||||
DataType_t mDataType;
|
||||
};
|
||||
|
||||
template <class T>
|
||||
struct Tensor
|
||||
{
|
||||
template <class X>
|
||||
Tensor(std::initializer_list<X> lens)
|
||||
: mDesc(DataType<T>{}, lens), mData(mDesc.GetElementSpace())
|
||||
{
|
||||
}
|
||||
|
||||
template <class X>
|
||||
Tensor(std::vector<X> lens) : mDesc(DataType<T>{}, lens), mData(mDesc.GetElementSpace())
|
||||
{
|
||||
}
|
||||
|
||||
template <class X, class Y>
|
||||
Tensor(std::vector<X> lens, std::vector<Y> strides)
|
||||
: mDesc(DataType<T>{}, lens, strides), mData(mDesc.GetElementSpace())
|
||||
{
|
||||
}
|
||||
|
||||
template <class G>
|
||||
void GenerateTensorValue(G g)
|
||||
{
|
||||
parallel_for([&](Xs... xs) { mData(mDesc.GetIndex(xs...)) = g(xs...); }, mDesc.mLens);
|
||||
}
|
||||
|
||||
T& operator[](std::size_t i) { return mData.at(i); }
|
||||
|
||||
const T& operator[](std::size_t i) const { return mData.at(i); }
|
||||
|
||||
typename std::vector<T>::iterator begin() { return mData.begin(); }
|
||||
|
||||
typename std::vector<T>::iterator end() { return mData.end(); }
|
||||
|
||||
typename std::vector<T>::const_iterator begin() const { return mData.begin(); }
|
||||
|
||||
typename std::vector<T>::const_iterator end() const { return mData.end(); }
|
||||
|
||||
TensorDescriptor mDesc;
|
||||
std::vector<T> mData;
|
||||
};
|
||||
|
||||
struct GpuMem
|
||||
{
|
||||
GpuMem() = delete;
|
||||
GpuMem(std::size_t sz, std::size_t data_sz) : mSz(sz), mDataSz(data_sz)
|
||||
{
|
||||
cudaMalloc(statci_cast<void**>(&GpuBuf), mDataSize * mSz);
|
||||
}
|
||||
|
||||
int ToGpu(void* p)
|
||||
{
|
||||
return static_cast<int>(cudaMemcpy(mGpuBuf, p, mDataSz * mSz, cudaMemCpyHostToDevice));
|
||||
}
|
||||
|
||||
int FromGpu(void* p) { return static_cast<int>(cuadMemCpy(p, mGpuBuf, mDataSz * mSz)); }
|
||||
|
||||
~GpuMem() { cudaFree(mGpuBuf); }
|
||||
|
||||
void* mGpuBuf;
|
||||
std::size_t mSz;
|
||||
std::size_t mDataSz;
|
||||
};
|
||||
|
||||
void dummy()
|
||||
{
|
||||
auto f1 = [](int n, int c, int h, int w) { do_f1(n, c, h, w); };
|
||||
auto f2 = [](int n, int c, int h, int w) { do_f2(n, c, h, w); };
|
||||
|
||||
auto par_f1 = generate_ParallelTensorFunctor(f1, 3, 3, 3, 3, 3);
|
||||
auto par_f2 = generate_ParallelTensorFunctor(f2, 4, 4, 4);
|
||||
|
||||
auto r1 = par_f1();
|
||||
auto r2 = par_f2();
|
||||
}
|
||||
|
||||
template <class F, class... Xs>
|
||||
auto generate_parallel_tensor_functor(F f, Xs... xs)
|
||||
{
|
||||
return ParallelTensorFunctor(f, xs...);
|
||||
}
|
||||
|
||||
template <class F, class... Xs>
|
||||
struct ParallelTensorFunctor
|
||||
{
|
||||
enum ParallelMethod_t
|
||||
{
|
||||
Serial = 0,
|
||||
Parallel = 1,
|
||||
};
|
||||
|
||||
F mF;
|
||||
constexpr std::size_t DIM = sizeof...(Xs);
|
||||
std::array<std::size_t, NDIM> mLens;
|
||||
std::array<std::size_t, NDIM> mStrides;
|
||||
std::size_t mN1d;
|
||||
|
||||
ParallelTensorFunctor(F f, Xs... xs) : mF(f), mLens({static_cast<std::size_t>(xs)...})
|
||||
{
|
||||
mStrides.back() = 1;
|
||||
std::partial_sum(mLens.rbegin(),
|
||||
mLens.rend() - 1,
|
||||
mStrides.rbegin() + 1,
|
||||
std::multiplies<std::size_t>());
|
||||
mN1d = mStrides[0] * mLens[0];
|
||||
}
|
||||
|
||||
void operator()(std::integral_constant<ParallelMethod_t, ParallelMethod_t::Serial>)
|
||||
{
|
||||
for(std::size_t i = 0; i < mN1d; ++i)
|
||||
{
|
||||
call_f_unpack_indices(mF, GetNdIndices(i));
|
||||
}
|
||||
}
|
||||
|
||||
void operator()(std::integral_constant<ParallelMethod_t, ParallelMethod_t::Parallel>,
|
||||
std::size_t::num_thread)
|
||||
{
|
||||
std::size_t work_per_thread = (mN1d + num_thread - 1) / num_thread;
|
||||
|
||||
std::vector<joinable_thread> threads(num_thread);
|
||||
|
||||
for(std::size_t it = 0; it < num_thread; ++it)
|
||||
{
|
||||
std::size_t iw_begin = it * work_per_thread;
|
||||
std::size_t iw_end = std::min(((it+1)*work_per_thread, mN1d));
|
||||
|
||||
auto f = [=] {
|
||||
for(std::size_t iw = iw_begin; iw < iw_end; ++iw)
|
||||
call_f_unpack_indices(mF, GetNdIndices(iw);
|
||||
};
|
||||
threads[it] = joinable_thread(f);
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
struct joinable_thread : std::thread
|
||||
{
|
||||
template <class... Xs>
|
||||
joinable_thread(Xs&&... xs) : std::thread(std::forward<Xs>(xs)...)
|
||||
{
|
||||
}
|
||||
|
||||
~joinable_thread()
|
||||
{
|
||||
if(this->joinable())
|
||||
this->join;
|
||||
}
|
||||
}
|
||||
|
||||
template <class F, class T>
|
||||
auto call_f_unpack_indices(F f, T indices)
|
||||
{
|
||||
constexpr std::size_t N = std::tuple_size<T>::value;
|
||||
using NSeq = std::make_integer_sequence<std::size_t, N>;
|
||||
|
||||
return call_f_unpack_indices_impl(f, indices, NSeq{});
|
||||
}
|
||||
|
||||
template <class F, class T, class... Is>
|
||||
auto call_f_unpack_indices_impl(F f, T indices, std::integer_sequence<std::size_t, Is...>)
|
||||
{
|
||||
return f(std::get<Is>(indices)...);
|
||||
}
|
||||
46
src/tensor.cpp
Normal file
46
src/tensor.cpp
Normal file
@@ -0,0 +1,46 @@
|
||||
#include <boost/range/adaptor/transformed.hpp>
|
||||
#include <cassert>
|
||||
|
||||
#include "tensor.hpp"
|
||||
|
||||
TensorDescriptor::TensorDescriptor() {}
|
||||
|
||||
TensorDescriptor::TensorDescriptor(DataType_t t, std::initializer_list<std::size_t> lens)
|
||||
: mLens(lens), mDataType(t)
|
||||
{
|
||||
this->CalculateStrides();
|
||||
}
|
||||
|
||||
TensorDescriptor::TensorDescriptor(DataType_t t,
|
||||
std::vector<std::size_t> lens,
|
||||
std::vector<std::size_t> strides)
|
||||
: mLens(lens), mStrides(strides), mDataType(t)
|
||||
{
|
||||
}
|
||||
|
||||
void TensorDescriptor::CalculateStrides()
|
||||
{
|
||||
mStrides.clear();
|
||||
mStrides.resize(mLens.size(), 0);
|
||||
if(strides.empty())
|
||||
return;
|
||||
|
||||
mStrides.back() = 1;
|
||||
std::partial_sum(
|
||||
mLens.rbegin(), mLens.rend() - 1, mStrides.rbegin() + 1, std::multiplies<std::size_t>());
|
||||
}
|
||||
|
||||
std::size_t TensorDescriptor::GetDimension() const { return mLens.size(); }
|
||||
|
||||
std::size_t TensorDescriptor::GetElementSize() const
|
||||
{
|
||||
assert(mLens.size() == mStrides.size());
|
||||
return std::accumulate(
|
||||
mLens.begin(), mLens.end(), std::size_t{1}, std::multiplies<std::size_t>());
|
||||
}
|
||||
|
||||
std::size_t TensorDescriptor::GetElementSpace() const
|
||||
{
|
||||
auto ls = mLens | boost::adaptor::transformed([](auto v) { return v - 1; });
|
||||
return std::inner_product(ls.begin(), ls.end(), mStrides.begin(), std::size_t{0}) + 1;
|
||||
}
|
||||
Reference in New Issue
Block a user