mirror of
https://github.com/ROCm/composable_kernel.git
synced 2026-05-14 18:17:44 +00:00
@@ -41,5 +41,7 @@ link_libraries(${PYTHON_LIBRARIES})
|
||||
include_directories(BEFORE ${CUDA_COMMON_INCLUDE_DIR})
|
||||
|
||||
#
|
||||
include_directories(BEFORE src/include)
|
||||
add_subdirectory(src)
|
||||
add_subdirectory(driver)
|
||||
|
||||
|
||||
@@ -1,31 +1,67 @@
|
||||
#include <iostream>
|
||||
#include "tensor.hpp"
|
||||
|
||||
template <typename T>
|
||||
void direct_convolution(const Tensor<T>& in,
|
||||
const Tensor<T>& wei,
|
||||
Tensor<T>& out,
|
||||
std::size_t num_thread)
|
||||
{
|
||||
auto f = [&](auto n, auto k, auto ho, auto wo) {
|
||||
double v = 0;
|
||||
for(int c = 0; c < wei.mDesc.GetLengths()[1]; ++c)
|
||||
{
|
||||
for(int y = 0; y < wei.mDesc.GetLengths()[2]; ++y)
|
||||
{
|
||||
int hi = ho + y;
|
||||
for(int x = 0; x < wei.mDesc.GetLengths()[3]; ++x)
|
||||
{
|
||||
int wi = wo + x;
|
||||
v += in(n, c, hi, wi) * wei(k, c, y, x);
|
||||
}
|
||||
}
|
||||
}
|
||||
out(n, k, ho, wo) = v;
|
||||
};
|
||||
|
||||
auto f_par = make_ParallelTensorFunctor(f,
|
||||
out.mDesc.GetLengths()[0],
|
||||
out.mDesc.GetLengths()[1],
|
||||
out.mDesc.GetLengths()[2],
|
||||
out.mDesc.GetLengths()[3]);
|
||||
|
||||
f_par(num_thread);
|
||||
}
|
||||
|
||||
template <class T>
|
||||
struct Generator
|
||||
{
|
||||
|
||||
template <class... Is>
|
||||
T operator()(Is... is)
|
||||
{
|
||||
return 1;
|
||||
}
|
||||
};
|
||||
|
||||
int main()
|
||||
{
|
||||
Tensor<float> in({3, 16, 128, 128});
|
||||
Tensor<float> wei({4, 16, 3, 3});
|
||||
Tensor<float> out({3, 4, 126, 126});
|
||||
|
||||
int len_in = 100;
|
||||
int len_wei = 3;
|
||||
int len_out = len_in - len_wei + 1;
|
||||
int num_thread = std::thread::hardware_concurrency();
|
||||
|
||||
std::vector<float> in(len_in, 1);
|
||||
std::vector<float> wei(len_wei, 1);
|
||||
std::vector<float> out(len_out, 1);
|
||||
std::cout << __func__ << ": num_thread " << num_thread << std::endl;
|
||||
|
||||
direct_convolution(in.data(), wei.data(), out.data(), len_in, len_wei);
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
void direct_convolution(const T* in, const T* wei, T* out, const int len_in, const int len_wei)
|
||||
{
|
||||
int len_out = len_in - len_wei + 1;
|
||||
|
||||
for(int i_out = 0; i_out < len_out++ i_out)
|
||||
{
|
||||
double acc = 0;
|
||||
for(int i_wei = 0; i_wei < len_wei; ++i_wei)
|
||||
{
|
||||
acc += in[i_out + i_wei] * *wei[i_wei];
|
||||
}
|
||||
out[i_out] = acc;
|
||||
}
|
||||
in.GenerateTensorValue(Generator<float>{}, num_thread);
|
||||
wei.GenerateTensorValue(Generator<float>{}, num_thread);
|
||||
|
||||
direct_convolution(in, wei, out, num_thread);
|
||||
|
||||
std::cout << __func__ << ": done" << std::endl;
|
||||
|
||||
LogRange(std::cout, in.mData, ",") << std::endl;
|
||||
LogRange(std::cout, wei.mData, ",") << std::endl;
|
||||
LogRange(std::cout, out.mData, ",") << std::endl;
|
||||
}
|
||||
|
||||
@@ -1,5 +1,3 @@
|
||||
include_directories(BEFORE include)
|
||||
|
||||
set(SOURCE
|
||||
tensor.cpp;
|
||||
)
|
||||
@@ -11,7 +9,7 @@ set_target_properties(convolution PROPERTIES PREFIX "")
|
||||
target_link_libraries(convolution boost_python3)
|
||||
|
||||
# cuda
|
||||
target_link_libraries(convolution nvToolsExt)
|
||||
target_link_libraries(convolution nvToolsExt cudart)
|
||||
target_compile_features(convolution PUBLIC)
|
||||
set_target_properties(convolution PROPERTIES POSITION_INDEPENDENT_CODE ON)
|
||||
set_target_properties(convolution PROPERTIES CUDA_SEPARABLE_COMPILATION OFF)
|
||||
|
||||
@@ -1,10 +1,28 @@
|
||||
#include <thread>
|
||||
#include <vector>
|
||||
#include <numeric>
|
||||
#include <algorithm>
|
||||
#include <utility>
|
||||
#include <cassert>
|
||||
#include <iostream>
|
||||
#include "cuda_runtime.h"
|
||||
#include "helper_cuda.h"
|
||||
|
||||
template <class Range>
|
||||
std::ostream& LogRange(std::ostream& os, Range&& r, std::string delim)
|
||||
{
|
||||
bool first = true;
|
||||
for(auto&& x : r)
|
||||
{
|
||||
if(first)
|
||||
first = false;
|
||||
else
|
||||
os << delim;
|
||||
os << x;
|
||||
}
|
||||
return os;
|
||||
}
|
||||
|
||||
typedef enum
|
||||
{
|
||||
Half = 0,
|
||||
@@ -19,6 +37,34 @@ struct DataType<float> : std::integral_constant<DataType_t, DataType_t::Float>
|
||||
{
|
||||
};
|
||||
|
||||
template <class F, class T, std::size_t... Is>
|
||||
auto call_f_unpack_args_impl(F f, T args, std::index_sequence<Is...>)
|
||||
{
|
||||
return f(std::get<Is>(args)...);
|
||||
}
|
||||
|
||||
template <class F, class T>
|
||||
auto call_f_unpack_args(F f, T args)
|
||||
{
|
||||
constexpr std::size_t N = std::tuple_size<T>::value;
|
||||
|
||||
return call_f_unpack_args_impl(f, args, std::make_index_sequence<N>{});
|
||||
}
|
||||
|
||||
template <class F, class T, std::size_t... Is>
|
||||
auto construct_f_unpack_args_impl(T args, std::index_sequence<Is...>)
|
||||
{
|
||||
return F(std::get<Is>(args)...);
|
||||
}
|
||||
|
||||
template <class F, class T>
|
||||
auto construct_f_unpack_args(F, T args)
|
||||
{
|
||||
constexpr std::size_t N = std::tuple_size<T>::value;
|
||||
|
||||
return construct_f_unpack_args_impl<F>(args, std::make_index_sequence<N>{});
|
||||
}
|
||||
|
||||
struct TensorDescriptor
|
||||
{
|
||||
TensorDescriptor() = delete;
|
||||
@@ -50,12 +96,12 @@ struct TensorDescriptor
|
||||
const std::vector<std::size_t>& GetLengths() const;
|
||||
const std::vector<std::size_t>& GetStrides() const;
|
||||
|
||||
template <class... Xs>
|
||||
std::size_t Get1dIndex(Xs... xs) const
|
||||
template <class... Is>
|
||||
std::size_t Get1dIndex(Is... is) const
|
||||
{
|
||||
assert(sizeof...(Xs) == this->GetDimension());
|
||||
std::initializer_list<std::size_t> is{xs...};
|
||||
return std::inner_product(is.begin(), is.end(), mStrides.begin(), std::size_t{0});
|
||||
assert(sizeof...(Is) == this->GetDimension());
|
||||
std::initializer_list<std::size_t> iss{static_cast<std::size_t>(is)...};
|
||||
return std::inner_product(iss.begin(), iss.end(), mStrides.begin(), std::size_t{0});
|
||||
}
|
||||
|
||||
private:
|
||||
@@ -65,90 +111,6 @@ struct TensorDescriptor
|
||||
DataType_t mDataType;
|
||||
};
|
||||
|
||||
template <class T>
|
||||
struct Tensor
|
||||
{
|
||||
template <class X>
|
||||
Tensor(std::initializer_list<X> lens)
|
||||
: mDesc(DataType<T>{}, lens), mData(mDesc.GetElementSpace())
|
||||
{
|
||||
}
|
||||
|
||||
template <class X>
|
||||
Tensor(std::vector<X> lens) : mDesc(DataType<T>{}, lens), mData(mDesc.GetElementSpace())
|
||||
{
|
||||
}
|
||||
|
||||
template <class X, class Y>
|
||||
Tensor(std::vector<X> lens, std::vector<Y> strides)
|
||||
: mDesc(DataType<T>{}, lens, strides), mData(mDesc.GetElementSpace())
|
||||
{
|
||||
}
|
||||
|
||||
template <class G>
|
||||
void GenerateTensorValue(G g)
|
||||
{
|
||||
// ParallelTensorFunctor([&](Xs... xs) { mData(mDesc.Get1dIndex(xs...)) = g(xs...); },
|
||||
// mDesc.mLens)();
|
||||
switch(mDesc.GetDimension())
|
||||
{
|
||||
case 1:
|
||||
{
|
||||
ParallelTensorFunctor([&](auto i) { mData(mDesc.Get1dIndex(i)) = g(i); },
|
||||
mDesc.GetLengths()[0])();
|
||||
break;
|
||||
}
|
||||
case 2:
|
||||
{
|
||||
ParallelTensorFunctor(
|
||||
[&](auto i0, auto i1) { mData(mDesc.Get1dIndex(i0, i1)) = g(i0, i1); },
|
||||
mDesc.GetLengths()[0],
|
||||
mDesc.GetLengths()[1])();
|
||||
break;
|
||||
}
|
||||
case 3:
|
||||
{
|
||||
ParallelTensorFunctor(
|
||||
[&](auto i0, auto i1, auto i2) {
|
||||
mData(mDesc.Get1dIndex(i0, i1, i2)) = g(i0, i1, i2);
|
||||
},
|
||||
mDesc.GetLengths()[0],
|
||||
mDesc.GetLengths()[1],
|
||||
mDesc.GetLengths()[2])();
|
||||
break;
|
||||
}
|
||||
case 4:
|
||||
{
|
||||
ParallelTensorFunctor(
|
||||
[&](auto i0, auto i1, auto i2, auto i3) {
|
||||
mData(mDesc.Get1dIndex(i0, i1, i2, i3)) = g(i0, i1, i2, i3);
|
||||
},
|
||||
mDesc.GetLengths()[0],
|
||||
mDesc.GetLengths()[1],
|
||||
mDesc.GetLengths()[3],
|
||||
mDesc.GetLengths()[4])();
|
||||
break;
|
||||
}
|
||||
default: throw std::runtime_error("unspported dimension");
|
||||
}
|
||||
}
|
||||
|
||||
T& operator[](std::size_t i) { return mData.at(i); }
|
||||
|
||||
const T& operator[](std::size_t i) const { return mData.at(i); }
|
||||
|
||||
typename std::vector<T>::iterator begin() { return mData.begin(); }
|
||||
|
||||
typename std::vector<T>::iterator end() { return mData.end(); }
|
||||
|
||||
typename std::vector<T>::const_iterator begin() const { return mData.begin(); }
|
||||
|
||||
typename std::vector<T>::const_iterator end() const { return mData.end(); }
|
||||
|
||||
TensorDescriptor mDesc;
|
||||
std::vector<T> mData;
|
||||
};
|
||||
|
||||
struct GpuMem
|
||||
{
|
||||
GpuMem() = delete;
|
||||
@@ -194,12 +156,6 @@ struct joinable_thread : std::thread
|
||||
template <class F, class... Xs>
|
||||
struct ParallelTensorFunctor
|
||||
{
|
||||
enum ParallelMethod_t
|
||||
{
|
||||
Serial = 0,
|
||||
Parallel = 1,
|
||||
};
|
||||
|
||||
F mF;
|
||||
static constexpr std::size_t NDIM = sizeof...(Xs);
|
||||
std::array<std::size_t, NDIM> mLens;
|
||||
@@ -229,16 +185,7 @@ struct ParallelTensorFunctor
|
||||
return indices;
|
||||
}
|
||||
|
||||
void operator()(std::integral_constant<ParallelMethod_t, ParallelMethod_t::Serial>)
|
||||
{
|
||||
for(std::size_t i = 0; i < mN1d; ++i)
|
||||
{
|
||||
call_f_unpack_args(mF, GetNdIndices(i));
|
||||
}
|
||||
}
|
||||
|
||||
void operator()(std::integral_constant<ParallelMethod_t, ParallelMethod_t::Parallel>,
|
||||
std::size_t num_thread)
|
||||
void operator()(std::size_t num_thread) const
|
||||
{
|
||||
std::size_t work_per_thread = (mN1d + num_thread - 1) / num_thread;
|
||||
|
||||
@@ -247,7 +194,7 @@ struct ParallelTensorFunctor
|
||||
for(std::size_t it = 0; it < num_thread; ++it)
|
||||
{
|
||||
std::size_t iw_begin = it * work_per_thread;
|
||||
std::size_t iw_end = std::min(((it + 1) * work_per_thread, mN1d));
|
||||
std::size_t iw_end = std::min((it + 1) * work_per_thread, mN1d);
|
||||
|
||||
auto f = [=] {
|
||||
for(std::size_t iw = iw_begin; iw < iw_end; ++iw)
|
||||
@@ -260,30 +207,92 @@ struct ParallelTensorFunctor
|
||||
}
|
||||
};
|
||||
|
||||
template <class F, class T>
|
||||
auto call_f_unpack_args(F f, T args)
|
||||
template <class F, class... Xs>
|
||||
auto make_ParallelTensorFunctor(F f, Xs... xs)
|
||||
{
|
||||
static constexpr std::size_t N = std::tuple_size<T>::value;
|
||||
|
||||
return call_f_unpack_args_impl(f, args, std::make_index_sequence<N>{});
|
||||
return ParallelTensorFunctor<F, Xs...>(f, xs...);
|
||||
}
|
||||
|
||||
template <class F, class T, class... Is>
|
||||
auto call_f_unpack_args_impl(F f, T args, std::integer_sequence<Is...>)
|
||||
template <class T>
|
||||
struct Tensor
|
||||
{
|
||||
return f(std::get<Is>(args)...);
|
||||
}
|
||||
template <class X>
|
||||
Tensor(std::initializer_list<X> lens)
|
||||
: mDesc(DataType<T>{}, lens), mData(mDesc.GetElementSpace())
|
||||
{
|
||||
}
|
||||
|
||||
template <class F, class T, class... Is>
|
||||
auto construct_f_unpack_args_impl(T args, std::integer_sequence<Is...>)
|
||||
{
|
||||
return F(std::get<Is>(args)...);
|
||||
}
|
||||
template <class X>
|
||||
Tensor(std::vector<X> lens) : mDesc(DataType<T>{}, lens), mData(mDesc.GetElementSpace())
|
||||
{
|
||||
}
|
||||
|
||||
template <class F, class T>
|
||||
auto construct_f_unpack_args(F, T args)
|
||||
{
|
||||
static constexpr std::size_t N = std::tuple_size<T>::value;
|
||||
template <class X, class Y>
|
||||
Tensor(std::vector<X> lens, std::vector<Y> strides)
|
||||
: mDesc(DataType<T>{}, lens, strides), mData(mDesc.GetElementSpace())
|
||||
{
|
||||
}
|
||||
|
||||
return construct_f_unpack_args_impl<F>(args, std::make_index_sequence<N>{});
|
||||
}
|
||||
template <class G>
|
||||
void GenerateTensorValue(G g, std::size_t num_thread = 1)
|
||||
{
|
||||
switch(mDesc.GetDimension())
|
||||
{
|
||||
case 1:
|
||||
{
|
||||
auto f = [&](auto i) { (*this)(i) = g(i); };
|
||||
make_ParallelTensorFunctor(f, mDesc.GetLengths()[0])(num_thread);
|
||||
break;
|
||||
}
|
||||
case 2:
|
||||
{
|
||||
auto f = [&](auto i0, auto i1) { (*this)(i0, i1) = g(i0, i1); };
|
||||
make_ParallelTensorFunctor(f, mDesc.GetLengths()[0], mDesc.GetLengths()[1])(num_thread);
|
||||
break;
|
||||
}
|
||||
case 3:
|
||||
{
|
||||
auto f = [&](auto i0, auto i1, auto i2) { (*this)(i0, i1, i2) = g(i0, i1, i2); };
|
||||
make_ParallelTensorFunctor(
|
||||
f, mDesc.GetLengths()[0], mDesc.GetLengths()[1], mDesc.GetLengths()[2])(num_thread);
|
||||
break;
|
||||
}
|
||||
case 4:
|
||||
{
|
||||
auto f = [&](auto i0, auto i1, auto i2, auto i3) {
|
||||
(*this)(i0, i1, i2, i3) = g(i0, i1, i2, i3);
|
||||
};
|
||||
make_ParallelTensorFunctor(f,
|
||||
mDesc.GetLengths()[0],
|
||||
mDesc.GetLengths()[1],
|
||||
mDesc.GetLengths()[2],
|
||||
mDesc.GetLengths()[3])(num_thread);
|
||||
break;
|
||||
}
|
||||
default: throw std::runtime_error("unspported dimension");
|
||||
}
|
||||
}
|
||||
|
||||
template <class... Is>
|
||||
T& operator()(Is... is)
|
||||
{
|
||||
return mData[mDesc.Get1dIndex(is...)];
|
||||
}
|
||||
|
||||
template <class... Is>
|
||||
const T& operator()(Is... is) const
|
||||
{
|
||||
return mData[mDesc.Get1dIndex(is...)];
|
||||
}
|
||||
|
||||
typename std::vector<T>::iterator begin() { return mData.begin(); }
|
||||
|
||||
typename std::vector<T>::iterator end() { return mData.end(); }
|
||||
|
||||
typename std::vector<T>::const_iterator begin() const { return mData.begin(); }
|
||||
|
||||
typename std::vector<T>::const_iterator end() const { return mData.end(); }
|
||||
|
||||
TensorDescriptor mDesc;
|
||||
std::vector<T> mData;
|
||||
};
|
||||
|
||||
Reference in New Issue
Block a user