reorganized files

[ROCm/composable_kernel commit: 1566b31736]
This commit is contained in:
Chao Liu
2019-06-13 15:12:12 -05:00
parent 11c6b2ab9a
commit 5f217ebda5
64 changed files with 254 additions and 218 deletions

View File

@@ -1,7 +1,23 @@
# Host-side tensor utility library shared by the driver executable.
set(TENSOR_SOURCE
src/tensor.cpp;
src/device.cpp;
)
add_library(tensor SHARED ${TENSOR_SOURCE})
target_compile_features(tensor PUBLIC)
set_target_properties(tensor PROPERTIES POSITION_INDEPENDENT_CODE ON)
# The NVIDIA backend links the CUDA runtime and NVTX into the tensor library.
if(DEVICE_BACKEND STREQUAL "NVIDIA")
target_link_libraries(tensor nvToolsExt cudart)
endif()
install(TARGETS tensor LIBRARY DESTINATION lib)
# Select the driver translation unit per backend.
# NOTE(review): each branch sets DRIVER_SOURCE twice and only the second
# value (src/driver.*) survives — this looks like residue of the rename from
# driver.* to src/driver.*; confirm the first set() of each pair can be dropped.
if(DEVICE_BACKEND STREQUAL "AMD")
set(DRIVER_SOURCE driver.cpp)
set(DRIVER_SOURCE src/driver.cpp)
elseif(DEVICE_BACKEND STREQUAL "NVIDIA")
set(DRIVER_SOURCE driver.cu)
set(DRIVER_SOURCE src/driver.cu)
endif()
add_executable(driver ${DRIVER_SOURCE})

View File

@@ -0,0 +1,125 @@
#ifndef CK_CONV_COMMON_HPP
#define CK_CONV_COMMON_HPP
#include "ConstantTensorDescriptor.hpp"
using namespace ck;
// Deduce the packed 4d output descriptor (N, K, Ho, Wo) of a non-padded,
// unit-stride convolution of input (N, C, Hi, Wi) with weights (K, C, Y, X).
// NOTE: hard-coded to 4 dimensions (see static_asserts below).
template <class InDesc, class WeiDesc>
constexpr auto get_convolution_output_default_4d_tensor_descriptor(InDesc, WeiDesc)
{
    constexpr auto I0 = Number<0>{};
    constexpr auto I1 = Number<1>{};
    constexpr auto I2 = Number<2>{};
    constexpr auto I3 = Number<3>{};

    static_assert(InDesc{}.GetNumOfDimension() == 4, "input nDim is not 4");
    static_assert(WeiDesc{}.GetNumOfDimension() == 4, "weight nDim is not 4");
    static_assert(InDesc{}.GetLength(I1) == WeiDesc{}.GetLength(I1),
                  "input & weight dimension not consistent");

    // batch size and output-channel count carry over from input / weight
    constexpr auto N = InDesc{}.GetLength(I0);
    constexpr auto K = WeiDesc{}.GetLength(I0);

    // valid-convolution spatial sizes: Ho = Hi + 1 - Y, Wo = Wi + 1 - X
    constexpr auto HO = InDesc{}.GetLength(I2) + 1 - WeiDesc{}.GetLength(I2);
    constexpr auto WO = InDesc{}.GetLength(I3) + 1 - WeiDesc{}.GetLength(I3);

    return make_ConstantTensorDescriptor_packed(Sequence<N, K, HO, WO>{});
}
// Deduce the packed 4d output descriptor (N, K, Ho, Wo) of a unit-stride
// convolution with explicit lower/upper spatial padding.
// LowerPads/UpperPads hold (H, W) pad sizes accessed via Get(I0)/Get(I1).
template <class InDesc, class WeiDesc, class LowerPads, class UpperPads>
constexpr auto get_convolution_with_padding_output_default_4d_tensor_descriptor(InDesc,
                                                                                WeiDesc,
                                                                                LowerPads,
                                                                                UpperPads)
{
    constexpr auto I0 = Number<0>{};
    constexpr auto I1 = Number<1>{};
    constexpr auto I2 = Number<2>{};
    constexpr auto I3 = Number<3>{};

    static_assert(InDesc{}.GetNumOfDimension() == 4, "input nDim is not 4");
    static_assert(WeiDesc{}.GetNumOfDimension() == 4, "weight nDim is not 4");
    static_assert(InDesc{}.GetLength(I1) == WeiDesc{}.GetLength(I1),
                  "input & weight dimension not consistent");

    constexpr auto N = InDesc{}.GetLength(I0);
    constexpr auto K = WeiDesc{}.GetLength(I0);

    // pads come in as (H, W) pairs
    constexpr auto HPadLow = LowerPads{}.Get(I0);
    constexpr auto WPadLow = LowerPads{}.Get(I1);
    constexpr auto HPadUp  = UpperPads{}.Get(I0);
    constexpr auto WPadUp  = UpperPads{}.Get(I1);

    // Ho = Hi + pad_low + pad_up + 1 - Y (likewise for Wo with X)
    constexpr auto HO =
        InDesc{}.GetLength(I2) + HPadLow + HPadUp + 1 - WeiDesc{}.GetLength(I2);
    constexpr auto WO =
        InDesc{}.GetLength(I3) + WPadLow + WPadUp + 1 - WeiDesc{}.GetLength(I3);

    return make_ConstantTensorDescriptor_packed(Sequence<N, K, HO, WO>{});
}
template <class InDesc, class WeiDesc, class OutDesc>
constexpr std::size_t calculate_convolution_flops(InDesc, WeiDesc, OutDesc)
{
constexpr auto wei_desc = WeiDesc{};
constexpr auto out_desc = OutDesc{};
constexpr auto I0 = Number<0>{};
constexpr auto I1 = Number<1>{};
constexpr auto I2 = Number<2>{};
constexpr auto I3 = Number<3>{};
constexpr index_t N = out_desc.GetLength(I0);
constexpr index_t K = out_desc.GetLength(I1);
constexpr index_t Ho = out_desc.GetLength(I2);
constexpr index_t Wo = out_desc.GetLength(I3);
constexpr index_t C = wei_desc.GetLength(I1);
constexpr index_t Y = wei_desc.GetLength(I2);
constexpr index_t X = wei_desc.GetLength(I3);
return std::size_t(2) * N * K * Ho * Wo * C * Y * X;
}
// Total memory footprint, in bytes, of the input, weight and output tensors
// when each element has type Float.
//
// Only the element space (highest reachable offset + 1) of each descriptor
// is needed; the original body also derived N/K/Ho/Wo/C/Y/X and the Number
// constants but never used them — those dead locals are removed here.
template <class Float, class InDesc, class WeiDesc, class OutDesc>
constexpr std::size_t calculate_convolution_memory_size(Float, InDesc, WeiDesc, OutDesc)
{
    return sizeof(Float) *
           (InDesc::GetElementSpace() + WeiDesc::GetElementSpace() + OutDesc::GetElementSpace());
}
#endif

64
driver/include/device.hpp Normal file
View File

@@ -0,0 +1,64 @@
#ifndef CK_DEVICE_HPP
#define CK_DEVICE_HPP
#include <memory>
#include "config.hpp"
using namespace ck;
// RAII owner of a raw device (GPU) allocation. Member definitions live in
// driver/src/device.cpp and dispatch on CK_DEVICE_BACKEND_AMD / _NVIDIA.
struct DeviceMem
{
    DeviceMem() = delete;
    // Allocates mem_size bytes on the device.
    DeviceMem(std::size_t mem_size);
    // Raw device pointer (non-owning view of the allocation).
    void* GetDeviceBuffer();
    // Copies mMemSize bytes from host memory at p to the device buffer.
    void ToDevice(const void* p);
    // Copies mMemSize bytes from the device buffer to host memory at p.
    void FromDevice(void* p);
    // Frees the device allocation.
    ~DeviceMem();
    void* mpDeviceBuf;    // owning device pointer
    std::size_t mMemSize; // allocation size in bytes
};
// Opaque backend-specific timer state (a device-event pair); defined in
// driver/src/device.cpp.
struct KernelTimerImpl;
// Measures elapsed device time between Start() and End() using GPU events.
struct KernelTimer
{
    KernelTimer();
    ~KernelTimer();
    // Records the start event.
    void Start();
    // Records the end event and waits for it to complete.
    void End();
    // Elapsed time between the recorded events, in milliseconds
    // (units per hip/cudaEventElapsedTime).
    float GetElapsedTime() const;
    std::unique_ptr<KernelTimerImpl> impl;
};
// Launches `kernel` on the default stream with the given grid/block
// dimensions and `lds_byte` bytes of dynamic shared memory, returning the
// event-measured elapsed time from KernelTimer.
// NOTE(review): on the AMD path the launch error is converted to a string
// and then discarded (hipGetErrorString(hipGetLastError())), so a failed
// launch is silently ignored — confirm whether it should be checked like
// the NVIDIA path's checkCudaErrors.
template <typename... Args, typename F>
float launch_kernel(F kernel, dim3 grid_dim, dim3 block_dim, std::size_t lds_byte, Args... args)
{
    KernelTimer timer;
#if CK_DEVICE_BACKEND_AMD
    timer.Start();
    hipLaunchKernelGGL(kernel, grid_dim, block_dim, lds_byte, 0, args...);
    timer.End();
    hipGetErrorString(hipGetLastError());
#elif CK_DEVICE_BACKEND_NVIDIA
    // cudaLaunchKernel takes an array of pointers to the arguments
    const void* f = reinterpret_cast<const void*>(kernel);
    void* p_args[] = {&args...};
    timer.Start();
    cudaError_t error = cudaLaunchKernel(f, grid_dim, block_dim, p_args, lds_byte, 0);
    timer.End();
    checkCudaErrors(error);
#endif
    return timer.GetElapsedTime();
}
#endif

View File

@@ -1,8 +1,9 @@
#pragma once
#include <unistd.h>
#include "device.hpp"
#include "tensor.hpp"
#include "gridwise_convolution_kernel_wrapper.hpp"
#include "composable_kernel/kernel_algorithm/gridwise_convolution_direct_v2_nchw_kcyx_nkhw.hpp"
#include "gridwise_convolution_direct_v2_nchw_kcyx_nkhw.hpp"
using namespace ck;

View File

@@ -1,11 +1,12 @@
#pragma once
#include <unistd.h>
#include "device.hpp"
#include "tensor.hpp"
#include "gridwise_convolution_kernel_wrapper.hpp"
#include "composable_kernel/kernel_algorithm/gridwise_convolution_implicit_gemm_v1r1_chwn_cyxk_khwn.hpp"
#include "composable_kernel/kernel_algorithm/gridwise_convolution_implicit_gemm_v1r2_chwn_cyxk_khwn.hpp"
#include "composable_kernel/kernel_algorithm/gridwise_convolution_implicit_gemm_v1r3_chwn_cyxk_khwn.hpp"
#include "composable_kernel/kernel_algorithm/gridwise_convolution_implicit_gemm_v1r3_chwn_cyxk_khwn_lds_double_buffer.hpp"
#include "gridwise_convolution_implicit_gemm_v1r1_chwn_cyxk_khwn.hpp"
#include "gridwise_convolution_implicit_gemm_v1r2_chwn_cyxk_khwn.hpp"
#include "gridwise_convolution_implicit_gemm_v1r3_chwn_cyxk_khwn.hpp"
#include "gridwise_convolution_implicit_gemm_v1r3_chwn_cyxk_khwn_lds_double_buffer.hpp"
using namespace ck;

View File

@@ -1,9 +1,10 @@
#pragma once
#include <unistd.h>
#include "device.hpp"
#include "tensor.hpp"
#include "gridwise_convolution_kernel_wrapper.hpp"
#include "composable_kernel/kernel_algorithm/gridwise_convolution_implicit_gemm_v1r3_nchw_cyxk_nkhw.hpp"
#include "composable_kernel/kernel_algorithm/gridwise_convolution_implicit_gemm_v1r3_nchw_cyxk_nkhw_lds_double_buffer.hpp"
#include "gridwise_convolution_implicit_gemm_v1r3_nchw_cyxk_nkhw.hpp"
#include "gridwise_convolution_implicit_gemm_v1r3_nchw_cyxk_nkhw_lds_double_buffer.hpp"
using namespace ck;

View File

@@ -1,9 +1,10 @@
#pragma once
#include <unistd.h>
#include "device.hpp"
#include "tensor.hpp"
#include "gridwise_convolution_kernel_wrapper.hpp"
#include "composable_kernel/kernel_algorithm/gridwise_convolution_implicit_gemm_v2_chwn_cyxk_khwn.hpp"
#include "composable_kernel/kernel_algorithm/gridwise_convolution_implicit_gemm_v2_chwn_cyxk_khwn_lds_double_buffer.hpp"
#include "gridwise_convolution_implicit_gemm_v2_chwn_cyxk_khwn.hpp"
#include "gridwise_convolution_implicit_gemm_v2_chwn_cyxk_khwn_lds_double_buffer.hpp"
using namespace ck;

View File

@@ -1,9 +1,10 @@
#pragma once
#include <unistd.h>
#include "device.hpp"
#include "tensor.hpp"
#include "gridwise_convolution_kernel_wrapper.hpp"
#include "composable_kernel/kernel_algorithm/gridwise_convolution_implicit_gemm_v3_nchw_cyxk_nkhw.hpp"
#include "composable_kernel/kernel_algorithm/gridwise_convolution_implicit_gemm_v3_nchw_cyxk_nkhw_lds_double_buffer.hpp"
#include "gridwise_convolution_implicit_gemm_v3_nchw_cyxk_nkhw.hpp"
#include "gridwise_convolution_implicit_gemm_v3_nchw_cyxk_nkhw_lds_double_buffer.hpp"
using namespace ck;

View File

@@ -1,9 +1,10 @@
#pragma once
#include <unistd.h>
#include "device.hpp"
#include "tensor.hpp"
#include "gridwise_convolution_kernel_wrapper.hpp"
#include "composable_kernel/kernel_algorithm/gridwise_convolution_implicit_gemm_v4_nchw_kcyx_nkhw.hpp"
#include "composable_kernel/kernel_algorithm/gridwise_convolution_implicit_gemm_v4_nchw_kcyx_nkhw_lds_double_buffer.hpp"
#include "gridwise_convolution_implicit_gemm_v4_nchw_kcyx_nkhw.hpp"
#include "gridwise_convolution_implicit_gemm_v4_nchw_kcyx_nkhw_lds_double_buffer.hpp"
using namespace ck;

View File

@@ -1,7 +1,8 @@
#pragma once
#include <unistd.h>
#include "device.hpp"
#include "composable_kernel/kernel_algorithm/gridwise_direct_convolution_2_vectorized_nchw_kcyx_nkhw.hpp"
#include "tensor.hpp"
#include "gridwise_direct_convolution_2_vectorized_nchw_kcyx_nkhw.hpp"
using namespace ck;

View File

@@ -1,7 +1,8 @@
#pragma once
#include <unistd.h>
#include "device.hpp"
#include "composable_kernel/kernel_algorithm/gridwise_implicit_gemm_convolution_1_chwn_cyxk_khwn_padded.hpp"
#include "tensor.hpp"
#include "gridwise_implicit_gemm_convolution_1_chwn_cyxk_khwn_padded.hpp"
using namespace ck;

272
driver/include/tensor.hpp Normal file
View File

@@ -0,0 +1,272 @@
#ifndef CK_TENSOR_HPP
#define CK_TENSOR_HPP
#include <algorithm>
#include <array>
#include <cassert>
#include <functional>
#include <initializer_list>
#include <iostream>
#include <numeric>
#include <stdexcept>
#include <thread>
#include <utility>
#include <vector>
// Streams the elements of `range` to `os`, separated by `delim`
// (no leading or trailing delimiter). Returns `os` for chaining.
template <class Range>
std::ostream& LogRange(std::ostream& os, Range&& range, std::string delim)
{
    bool need_delim = false;
    for(auto&& value : range)
    {
        if(need_delim)
            os << delim;
        os << value;
        need_delim = true;
    }
    return os;
}
// Runtime tag identifying an element data type.
typedef enum {
Half = 0,
Float = 1,
} DataType_t;
// Maps a C++ element type to its DataType_t tag at compile time.
template <class T>
struct DataType;
// float -> DataType_t::Float.
// NOTE(review): no specialization for a half type is visible in this chunk.
template <>
struct DataType<float> : std::integral_constant<DataType_t, DataType_t::Float>
{
};
// Expands the tuple elements selected by the index pack into a call of func.
template <class F, class T, std::size_t... Is>
auto call_f_unpack_args_impl(F func, T tup, std::index_sequence<Is...>)
{
    return func(std::get<Is>(tup)...);
}

// Invokes func with the elements of the tuple-like `tup` as its arguments.
template <class F, class T>
auto call_f_unpack_args(F func, T tup)
{
    return call_f_unpack_args_impl(
        func, tup, std::make_index_sequence<std::tuple_size<T>::value>{});
}
// Expands the tuple elements selected by the index pack into F's constructor.
template <class F, class T, std::size_t... Is>
auto construct_f_unpack_args_impl(T tup, std::index_sequence<Is...>)
{
    return F(std::get<Is>(tup)...);
}

// Constructs an F from a tuple of constructor arguments; the first parameter
// is used only to deduce F.
template <class F, class T>
auto construct_f_unpack_args(F, T tup)
{
    return construct_f_unpack_args_impl<F>(
        tup, std::make_index_sequence<std::tuple_size<T>::value>{});
}
// Shape + stride metadata for a host-side dense tensor. Non-template
// members are defined in driver/src/tensor.cpp.
struct TensorDescriptor
{
    TensorDescriptor() = delete;
    TensorDescriptor(std::initializer_list<std::size_t> lens);
    TensorDescriptor(std::initializer_list<std::size_t> lens,
                     std::initializer_list<std::size_t> strides);
    TensorDescriptor(std::vector<std::size_t> lens, std::vector<std::size_t> strides);
    // Derives packed (row-major) strides from mLens.
    void CalculateStrides();
    // Lengths-only construction from any range; strides become packed.
    template <class Range>
    TensorDescriptor(const Range& lens) : mLens(lens.begin(), lens.end())
    {
        this->CalculateStrides();
    }
    // Explicit lengths + strides from any two ranges (no validation here).
    template <class Range1, class Range2>
    TensorDescriptor(const Range1& lens, const Range2& strides)
        : mLens(lens.begin(), lens.end()), mStrides(strides.begin(), strides.end())
    {
    }
    std::size_t GetNumOfDimension() const;
    std::size_t GetElementSize() const;
    std::size_t GetElementSpace() const;
    const std::vector<std::size_t>& GetLengths() const;
    const std::vector<std::size_t>& GetStrides() const;
    // Flat offset of a multi-index: sum_i is_i * stride_i.
    template <class... Is>
    std::size_t GetOffsetFromMultiIndex(Is... is) const
    {
        assert(sizeof...(Is) == this->GetNumOfDimension());
        std::initializer_list<std::size_t> iss{static_cast<std::size_t>(is)...};
        return std::inner_product(iss.begin(), iss.end(), mStrides.begin(), std::size_t{0});
    }
    private:
    std::vector<std::size_t> mLens;
    std::vector<std::size_t> mStrides;
};
// std::thread that automatically joins on destruction (pre-C++20 stand-in
// for std::jthread). Move-only, like std::thread itself.
struct joinable_thread : std::thread
{
    template <class... Xs>
    joinable_thread(Xs&&... xs) : std::thread(std::forward<Xs>(xs)...)
    {
    }

    joinable_thread(joinable_thread&&) = default;
    joinable_thread& operator=(joinable_thread&&) = default;

    ~joinable_thread()
    {
        if(joinable())
        {
            join();
        }
    }
};
// Applies an elementwise functor F over an N-d index space whose lengths are
// given by Xs..., optionally split across several host threads.
template <class F, class... Xs>
struct ParallelTensorFunctor
{
    F mF;
    static constexpr std::size_t NDIM = sizeof...(Xs);
    std::array<std::size_t, NDIM> mLens;
    std::array<std::size_t, NDIM> mStrides;
    std::size_t mN1d; // total number of points in the index space

    ParallelTensorFunctor(F f, Xs... xs) : mF(f), mLens({static_cast<std::size_t>(xs)...})
    {
        // packed row-major strides: mStrides[NDIM-1] = 1,
        // mStrides[i] = mStrides[i+1] * mLens[i+1]
        mStrides.back() = 1;
        std::partial_sum(mLens.rbegin(),
                         mLens.rend() - 1,
                         mStrides.rbegin() + 1,
                         std::multiplies<std::size_t>());
        mN1d = mStrides[0] * mLens[0];
    }

    // Decomposes a flat index into per-dimension indices.
    std::array<std::size_t, NDIM> GetNdIndices(std::size_t i) const
    {
        std::array<std::size_t, NDIM> indices;
        // std::size_t to match NDIM (original used int, a signed/unsigned
        // comparison in the loop condition)
        for(std::size_t idim = 0; idim < NDIM; ++idim)
        {
            indices[idim] = i / mStrides[idim];
            i -= indices[idim] * mStrides[idim];
        }
        return indices;
    }

    // Splits the flat index range evenly over num_thread workers. The worker
    // threads are joined when `threads` (a vector of joinable_thread) is
    // destroyed at the end of this call, so the operator blocks until done.
    void operator()(std::size_t num_thread) const
    {
        std::size_t work_per_thread = (mN1d + num_thread - 1) / num_thread;

        std::vector<joinable_thread> threads(num_thread);

        for(std::size_t it = 0; it < num_thread; ++it)
        {
            std::size_t iw_begin = it * work_per_thread;
            std::size_t iw_end   = std::min((it + 1) * work_per_thread, mN1d);

            auto f = [=] {
                for(std::size_t iw = iw_begin; iw < iw_end; ++iw)
                {
                    call_f_unpack_args(mF, GetNdIndices(iw));
                }
            };
            threads[it] = joinable_thread(f);
        }
    }
};
// Factory that deduces ParallelTensorFunctor's template arguments.
template <class F, class... Xs>
auto make_ParallelTensorFunctor(F f, Xs... xs)
{
    return ParallelTensorFunctor<F, Xs...>(f, xs...);
}
// Host-side dense tensor of element type T: a TensorDescriptor (shape and
// strides) plus flat storage sized to the descriptor's element space.
template <class T>
struct Tensor
{
    template <class X>
    Tensor(std::initializer_list<X> lens) : mDesc(lens), mData(mDesc.GetElementSpace())
    {
    }

    template <class X>
    Tensor(std::vector<X> lens) : mDesc(lens), mData(mDesc.GetElementSpace())
    {
    }

    template <class X, class Y>
    Tensor(std::vector<X> lens, std::vector<Y> strides)
        : mDesc(lens, strides), mData(mDesc.GetElementSpace())
    {
    }

    Tensor(const TensorDescriptor& desc) : mDesc(desc), mData(mDesc.GetElementSpace()) {}

    // Fills the tensor by evaluating g(i0, ..., i{rank-1}) at every index,
    // optionally in parallel over num_thread host threads.
    // Throws std::runtime_error for ranks other than 1-4.
    template <class G>
    void GenerateTensorValue(G g, std::size_t num_thread = 1)
    {
        switch(mDesc.GetNumOfDimension())
        {
        case 1:
        {
            auto f = [&](auto i) { (*this)(i) = g(i); };
            make_ParallelTensorFunctor(f, mDesc.GetLengths()[0])(num_thread);
            break;
        }
        case 2:
        {
            auto f = [&](auto i0, auto i1) { (*this)(i0, i1) = g(i0, i1); };
            make_ParallelTensorFunctor(f, mDesc.GetLengths()[0], mDesc.GetLengths()[1])(num_thread);
            break;
        }
        case 3:
        {
            auto f = [&](auto i0, auto i1, auto i2) { (*this)(i0, i1, i2) = g(i0, i1, i2); };
            make_ParallelTensorFunctor(
                f, mDesc.GetLengths()[0], mDesc.GetLengths()[1], mDesc.GetLengths()[2])(num_thread);
            break;
        }
        case 4:
        {
            auto f = [&](auto i0, auto i1, auto i2, auto i3) {
                (*this)(i0, i1, i2, i3) = g(i0, i1, i2, i3);
            };
            make_ParallelTensorFunctor(f,
                                       mDesc.GetLengths()[0],
                                       mDesc.GetLengths()[1],
                                       mDesc.GetLengths()[2],
                                       mDesc.GetLengths()[3])(num_thread);
            break;
        }
        // fixed typo in the error message: "unspported" -> "unsupported"
        default: throw std::runtime_error("unsupported dimension");
        }
    }

    // Element access by multi-index; offset comes from descriptor strides.
    template <class... Is>
    T& operator()(Is... is)
    {
        return mData[mDesc.GetOffsetFromMultiIndex(is...)];
    }

    template <class... Is>
    const T& operator()(Is... is) const
    {
        return mData[mDesc.GetOffsetFromMultiIndex(is...)];
    }

    typename std::vector<T>::iterator begin() { return mData.begin(); }
    typename std::vector<T>::iterator end() { return mData.end(); }
    typename std::vector<T>::const_iterator begin() const { return mData.begin(); }
    typename std::vector<T>::const_iterator end() const { return mData.end(); }

    TensorDescriptor mDesc; // shape + strides
    std::vector<T> mData;   // flat storage, size == mDesc.GetElementSpace()
};
#endif

View File

114
driver/src/device.cpp Normal file
View File

@@ -0,0 +1,114 @@
#include "config.hpp"
#include "device.hpp"
// Allocates mem_size bytes on the device.
// NOTE(review): on the AMD path every hip* error code below is converted to
// a string by hipGetErrorString and then discarded, so failures (including
// allocation failure) are silently ignored — confirm whether that is intended.
DeviceMem::DeviceMem(std::size_t mem_size) : mMemSize(mem_size)
{
#if CK_DEVICE_BACKEND_AMD
    hipGetErrorString(hipMalloc(static_cast<void**>(&mpDeviceBuf), mMemSize));
#elif CK_DEVICE_BACKEND_NVIDIA
    checkCudaErrors(cudaMalloc(static_cast<void**>(&mpDeviceBuf), mMemSize));
#endif
}
// Non-owning view of the device allocation.
void* DeviceMem::GetDeviceBuffer() { return mpDeviceBuf; }
// Host -> device copy of mMemSize bytes starting at p.
void DeviceMem::ToDevice(const void* p)
{
#if CK_DEVICE_BACKEND_AMD
    hipGetErrorString(
        hipMemcpy(mpDeviceBuf, const_cast<void*>(p), mMemSize, hipMemcpyHostToDevice));
#elif CK_DEVICE_BACKEND_NVIDIA
    checkCudaErrors(
        cudaMemcpy(mpDeviceBuf, const_cast<void*>(p), mMemSize, cudaMemcpyHostToDevice));
#endif
}
// Device -> host copy of mMemSize bytes into p.
void DeviceMem::FromDevice(void* p)
{
#if CK_DEVICE_BACKEND_AMD
    hipGetErrorString(hipMemcpy(p, mpDeviceBuf, mMemSize, hipMemcpyDeviceToHost));
#elif CK_DEVICE_BACKEND_NVIDIA
    checkCudaErrors(cudaMemcpy(p, mpDeviceBuf, mMemSize, cudaMemcpyDeviceToHost));
#endif
}
// Releases the device allocation.
DeviceMem::~DeviceMem()
{
#if CK_DEVICE_BACKEND_AMD
    hipGetErrorString(hipFree(mpDeviceBuf));
#elif CK_DEVICE_BACKEND_NVIDIA
    checkCudaErrors(cudaFree(mpDeviceBuf));
#endif
}
// Backend-specific timer state: a start/end device-event pair created on
// construction and destroyed on destruction.
struct KernelTimerImpl
{
    KernelTimerImpl()
    {
#if CK_DEVICE_BACKEND_AMD
        hipEventCreate(&mStart);
        hipEventCreate(&mEnd);
#elif CK_DEVICE_BACKEND_NVIDIA
        cudaEventCreate(&mStart);
        cudaEventCreate(&mEnd);
#endif
    }
    ~KernelTimerImpl()
    {
#if CK_DEVICE_BACKEND_AMD
        hipEventDestroy(mStart);
        hipEventDestroy(mEnd);
#elif CK_DEVICE_BACKEND_NVIDIA
        cudaEventDestroy(mStart);
        cudaEventDestroy(mEnd);
#endif
    }
    // Records the start event on the default stream.
    void Start()
    {
#if CK_DEVICE_BACKEND_AMD
        hipEventRecord(mStart, 0);
#elif CK_DEVICE_BACKEND_NVIDIA
        cudaEventRecord(mStart, 0);
#endif
    }
    // Records the end event and blocks until it has completed.
    void End()
    {
#if CK_DEVICE_BACKEND_AMD
        hipEventRecord(mEnd, 0);
        hipEventSynchronize(mEnd);
#elif CK_DEVICE_BACKEND_NVIDIA
        cudaEventRecord(mEnd, 0);
        cudaEventSynchronize(mEnd);
#endif
    }
    // Elapsed time between the two recorded events, in milliseconds.
    float GetElapsedTime() const
    {
        float time;
#if CK_DEVICE_BACKEND_AMD
        hipEventElapsedTime(&time, mStart, mEnd);
#elif CK_DEVICE_BACKEND_NVIDIA
        cudaEventElapsedTime(&time, mStart, mEnd);
#endif
        return time;
    }
#if CK_DEVICE_BACKEND_AMD
    hipEvent_t mStart, mEnd;
#elif CK_DEVICE_BACKEND_NVIDIA
    cudaEvent_t mStart, mEnd;
#endif
};
// KernelTimer forwards everything to the backend-specific KernelTimerImpl.
KernelTimer::KernelTimer() : impl(new KernelTimerImpl()) {}
// Defined out-of-line so unique_ptr deletes a complete KernelTimerImpl here.
KernelTimer::~KernelTimer() {}
void KernelTimer::Start() { impl->Start(); }
void KernelTimer::End() { impl->End(); }
float KernelTimer::GetElapsedTime() const { return impl->GetElapsedTime(); }

View File

@@ -3,9 +3,9 @@
#include <initializer_list>
#include <cstdlib>
#include <stdlib.h>
#include "composable_kernel/utility/config.hpp"
#include "composable_kernel/tensor_description/ConstantTensorDescriptor.hpp"
#include "tensor.hpp"
#include "config.hpp"
#include "ConstantTensorDescriptor.hpp"
#include "device.hpp"
#include "conv_common.hpp"
#include "device_convolution_direct_v2_nchw_kcyx_nkhw.hpp"
#include "device_convolution_implicit_gemm_v1_chwn_cyxk_khwn.hpp"

45
driver/src/tensor.cpp Normal file
View File

@@ -0,0 +1,45 @@
#include <boost/range/adaptor/transformed.hpp>
#include <cassert>
#include "tensor.hpp"
// Lengths-only construction; strides are derived as packed (row-major).
TensorDescriptor::TensorDescriptor(std::initializer_list<std::size_t> lens) : mLens(lens)
{
    this->CalculateStrides();
}
// Explicit lengths + strides (stored as given, no validation).
// NOTE(review): the two-initializer_list overload declared in tensor.hpp has
// no definition in this chunk — confirm it is defined elsewhere.
TensorDescriptor::TensorDescriptor(std::vector<std::size_t> lens, std::vector<std::size_t> strides)
    : mLens(lens), mStrides(strides)
{
}
// Recomputes packed (row-major) strides from mLens: the last dimension is
// contiguous and each earlier stride is the product of all faster-varying
// lengths.
void TensorDescriptor::CalculateStrides()
{
    mStrides.assign(mLens.size(), 0);
    if(mStrides.empty())
    {
        return;
    }
    mStrides.back() = 1;
    for(std::size_t i = mStrides.size() - 1; i > 0; --i)
    {
        mStrides[i - 1] = mStrides[i] * mLens[i];
    }
}
// Number of tensor dimensions.
std::size_t TensorDescriptor::GetNumOfDimension() const { return mLens.size(); }

// Number of logical elements: the product of all lengths.
std::size_t TensorDescriptor::GetElementSize() const
{
    assert(mLens.size() == mStrides.size());
    return std::accumulate(
        mLens.begin(), mLens.end(), std::size_t{1}, std::multiplies<std::size_t>());
}

// Size (in elements) of the smallest buffer holding the tensor: the highest
// reachable offset plus one, i.e. 1 + sum_i (len_i - 1) * stride_i.
// Rewritten with a plain loop — the boost::adaptors::transformed pipeline the
// original used was an unnecessary third-party dependency for this arithmetic.
std::size_t TensorDescriptor::GetElementSpace() const
{
    std::size_t space = 1;
    for(std::size_t i = 0; i < mLens.size(); ++i)
    {
        space += (mLens[i] - 1) * mStrides[i];
    }
    return space;
}

const std::vector<std::size_t>& TensorDescriptor::GetLengths() const { return mLens; }
const std::vector<std::size_t>& TensorDescriptor::GetStrides() const { return mStrides; }