mirror of
https://github.com/ROCm/composable_kernel.git
synced 2026-05-17 11:30:02 +00:00
@@ -1,7 +1,23 @@
|
||||
# Host-side tensor utility library (tensor descriptors + device memory glue).
set(TENSOR_SOURCE
src/tensor.cpp;
src/device.cpp;
)

add_library(tensor SHARED ${TENSOR_SOURCE})
# NOTE(review): target_compile_features with an empty feature list is normally
# an error in CMake — confirm the intended feature set (e.g. cxx_std_14).
target_compile_features(tensor PUBLIC)
set_target_properties(tensor PROPERTIES POSITION_INDEPENDENT_CODE ON)

# CUDA backend needs the runtime and NVTX for profiling markers.
if(DEVICE_BACKEND STREQUAL "NVIDIA")
target_link_libraries(tensor nvToolsExt cudart)
endif()

install(TARGETS tensor LIBRARY DESTINATION lib)

# Select the driver translation unit for the active backend.
# NOTE(review): each branch sets DRIVER_SOURCE twice; the second set()
# overrides the first, so the bare driver.cpp/driver.cu lines look like
# leftovers — confirm and remove.
if(DEVICE_BACKEND STREQUAL "AMD")
set(DRIVER_SOURCE driver.cpp)
set(DRIVER_SOURCE src/driver.cpp)
elseif(DEVICE_BACKEND STREQUAL "NVIDIA")
set(DRIVER_SOURCE driver.cu)
set(DRIVER_SOURCE src/driver.cu)
endif()

add_executable(driver ${DRIVER_SOURCE})
|
||||
|
||||
125
driver/include/conv_common.hpp
Normal file
125
driver/include/conv_common.hpp
Normal file
@@ -0,0 +1,125 @@
|
||||
#ifndef CK_CONV_COMMON_HPP
|
||||
#define CK_CONV_COMMON_HPP
|
||||
|
||||
#include "ConstantTensorDescriptor.hpp"
|
||||
|
||||
using namespace ck;
|
||||
|
||||
// this is ugly, only for 4d
|
||||
template <class InDesc, class WeiDesc>
|
||||
constexpr auto get_convolution_output_default_4d_tensor_descriptor(InDesc, WeiDesc)
|
||||
{
|
||||
constexpr auto in_desc = InDesc{};
|
||||
constexpr auto wei_desc = WeiDesc{};
|
||||
|
||||
constexpr auto I0 = Number<0>{};
|
||||
constexpr auto I1 = Number<1>{};
|
||||
constexpr auto I2 = Number<2>{};
|
||||
constexpr auto I3 = Number<3>{};
|
||||
|
||||
static_assert(in_desc.GetNumOfDimension() == 4, "input nDim is not 4");
|
||||
static_assert(wei_desc.GetNumOfDimension() == 4, "weight nDim is not 4");
|
||||
static_assert(in_desc.GetLength(I1) == wei_desc.GetLength(I1),
|
||||
"input & weight dimension not consistent");
|
||||
|
||||
constexpr auto N = in_desc.GetLength(I0);
|
||||
constexpr auto HI = in_desc.GetLength(I2);
|
||||
constexpr auto WI = in_desc.GetLength(I3);
|
||||
|
||||
constexpr auto K = wei_desc.GetLength(I0);
|
||||
constexpr auto Y = wei_desc.GetLength(I2);
|
||||
constexpr auto X = wei_desc.GetLength(I3);
|
||||
|
||||
constexpr auto HO = HI + 1 - Y;
|
||||
constexpr auto WO = WI + 1 - X;
|
||||
|
||||
return make_ConstantTensorDescriptor_packed(Sequence<N, K, HO, WO>{});
|
||||
}
|
||||
|
||||
template <class InDesc, class WeiDesc, class LowerPads, class UpperPads>
|
||||
constexpr auto get_convolution_with_padding_output_default_4d_tensor_descriptor(InDesc,
|
||||
WeiDesc,
|
||||
LowerPads,
|
||||
UpperPads)
|
||||
{
|
||||
constexpr auto in_desc = InDesc{};
|
||||
constexpr auto wei_desc = WeiDesc{};
|
||||
|
||||
constexpr auto I0 = Number<0>{};
|
||||
constexpr auto I1 = Number<1>{};
|
||||
constexpr auto I2 = Number<2>{};
|
||||
constexpr auto I3 = Number<3>{};
|
||||
|
||||
static_assert(in_desc.GetNumOfDimension() == 4, "input nDim is not 4");
|
||||
static_assert(wei_desc.GetNumOfDimension() == 4, "weight nDim is not 4");
|
||||
static_assert(in_desc.GetLength(I1) == wei_desc.GetLength(I1),
|
||||
"input & weight dimension not consistent");
|
||||
|
||||
constexpr auto N = in_desc.GetLength(I0);
|
||||
constexpr auto HI = in_desc.GetLength(I2);
|
||||
constexpr auto WI = in_desc.GetLength(I3);
|
||||
|
||||
constexpr auto K = wei_desc.GetLength(I0);
|
||||
constexpr auto Y = wei_desc.GetLength(I2);
|
||||
constexpr auto X = wei_desc.GetLength(I3);
|
||||
|
||||
constexpr auto HPadLow = LowerPads{}.Get(I0);
|
||||
constexpr auto WPadLow = LowerPads{}.Get(I1);
|
||||
|
||||
constexpr auto HPadUp = UpperPads{}.Get(I0);
|
||||
constexpr auto WPadUp = UpperPads{}.Get(I1);
|
||||
|
||||
constexpr auto HO = HI + HPadLow + HPadUp + 1 - Y;
|
||||
constexpr auto WO = WI + WPadLow + WPadUp + 1 - X;
|
||||
|
||||
return make_ConstantTensorDescriptor_packed(Sequence<N, K, HO, WO>{});
|
||||
}
|
||||
|
||||
template <class InDesc, class WeiDesc, class OutDesc>
|
||||
constexpr std::size_t calculate_convolution_flops(InDesc, WeiDesc, OutDesc)
|
||||
{
|
||||
constexpr auto wei_desc = WeiDesc{};
|
||||
constexpr auto out_desc = OutDesc{};
|
||||
|
||||
constexpr auto I0 = Number<0>{};
|
||||
constexpr auto I1 = Number<1>{};
|
||||
constexpr auto I2 = Number<2>{};
|
||||
constexpr auto I3 = Number<3>{};
|
||||
|
||||
constexpr index_t N = out_desc.GetLength(I0);
|
||||
constexpr index_t K = out_desc.GetLength(I1);
|
||||
constexpr index_t Ho = out_desc.GetLength(I2);
|
||||
constexpr index_t Wo = out_desc.GetLength(I3);
|
||||
|
||||
constexpr index_t C = wei_desc.GetLength(I1);
|
||||
constexpr index_t Y = wei_desc.GetLength(I2);
|
||||
constexpr index_t X = wei_desc.GetLength(I3);
|
||||
|
||||
return std::size_t(2) * N * K * Ho * Wo * C * Y * X;
|
||||
}
|
||||
|
||||
// Total memory footprint (in bytes) of a convolution problem: the input,
// weight and output tensors with element type Float.
//
// GetElementSpace() (not element count) is used so strided/padded descriptor
// layouts are accounted for.
//
// Cleanup: the previous version materialized wei_desc/out_desc and N, K, Ho,
// Wo, C, Y, X but never used them — all dead code, removed here. The
// signature and the returned value are unchanged.
template <class Float, class InDesc, class WeiDesc, class OutDesc>
constexpr std::size_t calculate_convolution_memory_size(Float, InDesc, WeiDesc, OutDesc)
{
    return sizeof(Float) *
           (InDesc::GetElementSpace() + WeiDesc::GetElementSpace() + OutDesc::GetElementSpace());
}
|
||||
|
||||
#endif
|
||||
64
driver/include/device.hpp
Normal file
64
driver/include/device.hpp
Normal file
@@ -0,0 +1,64 @@
|
||||
#ifndef CK_DEVICE_HPP
|
||||
#define CK_DEVICE_HPP
|
||||
|
||||
#include <memory>
|
||||
#include "config.hpp"
|
||||
|
||||
using namespace ck;
|
||||
|
||||
// RAII wrapper around a raw device (GPU) allocation of mMemSize bytes.
// Member definitions live in src/device.cpp and dispatch on the compile-time
// device backend (HIP or CUDA).
struct DeviceMem
{
    DeviceMem() = delete;
    // Allocates mem_size bytes on the device.
    DeviceMem(std::size_t mem_size);
    // Raw device pointer; ownership stays with this object.
    void* GetDeviceBuffer();
    // Copies mMemSize bytes from host memory at p into the device buffer.
    void ToDevice(const void* p);
    // Copies mMemSize bytes from the device buffer into host memory at p.
    void FromDevice(void* p);
    ~DeviceMem();

    void* mpDeviceBuf;
    std::size_t mMemSize;
};
|
||||
|
||||
// Opaque backend-specific timer state (defined in src/device.cpp).
struct KernelTimerImpl;

// Measures elapsed device time between Start() and End() using backend
// events; pimpl keeps HIP/CUDA headers out of this header.
struct KernelTimer
{
    KernelTimer();
    ~KernelTimer();
    void Start();
    void End();
    // Time between the Start() and End() events, as reported by the
    // backend's event-elapsed-time API (milliseconds for hip/cuda events).
    float GetElapsedTime() const;

    std::unique_ptr<KernelTimerImpl> impl;
};
|
||||
|
||||
// Launches `kernel` with the given grid/block dimensions and dynamic LDS
// byte count on the default stream, forwarding `args...`, and returns the
// measured kernel time (see KernelTimer::GetElapsedTime). The backend is
// selected at compile time.
template <typename... Args, typename F>
float launch_kernel(F kernel, dim3 grid_dim, dim3 block_dim, std::size_t lds_byte, Args... args)
{
    KernelTimer timer;

#if CK_DEVICE_BACKEND_AMD
    timer.Start();

    hipLaunchKernelGGL(kernel, grid_dim, block_dim, lds_byte, 0, args...);

    timer.End();

    // NOTE(review): the returned string is discarded — this only fetches and
    // drops the last error rather than reporting launch failures.
    hipGetErrorString(hipGetLastError());
#elif CK_DEVICE_BACKEND_NVIDIA
    // cudaLaunchKernel takes the kernel as an untyped pointer plus an array
    // of pointers to the individual arguments.
    const void* f = reinterpret_cast<const void*>(kernel);
    void* p_args[] = {&args...};

    timer.Start();

    cudaError_t error = cudaLaunchKernel(f, grid_dim, block_dim, p_args, lds_byte, 0);

    timer.End();

    checkCudaErrors(error);
#endif

    return timer.GetElapsedTime();
}
|
||||
|
||||
#endif
|
||||
@@ -1,8 +1,9 @@
|
||||
#pragma once
|
||||
#include <unistd.h>
|
||||
#include "device.hpp"
|
||||
#include "tensor.hpp"
|
||||
#include "gridwise_convolution_kernel_wrapper.hpp"
|
||||
#include "composable_kernel/kernel_algorithm/gridwise_convolution_direct_v2_nchw_kcyx_nkhw.hpp"
|
||||
#include "gridwise_convolution_direct_v2_nchw_kcyx_nkhw.hpp"
|
||||
|
||||
using namespace ck;
|
||||
|
||||
@@ -1,11 +1,12 @@
|
||||
#pragma once
|
||||
#include <unistd.h>
|
||||
#include "device.hpp"
|
||||
#include "tensor.hpp"
|
||||
#include "gridwise_convolution_kernel_wrapper.hpp"
|
||||
#include "composable_kernel/kernel_algorithm/gridwise_convolution_implicit_gemm_v1r1_chwn_cyxk_khwn.hpp"
|
||||
#include "composable_kernel/kernel_algorithm/gridwise_convolution_implicit_gemm_v1r2_chwn_cyxk_khwn.hpp"
|
||||
#include "composable_kernel/kernel_algorithm/gridwise_convolution_implicit_gemm_v1r3_chwn_cyxk_khwn.hpp"
|
||||
#include "composable_kernel/kernel_algorithm/gridwise_convolution_implicit_gemm_v1r3_chwn_cyxk_khwn_lds_double_buffer.hpp"
|
||||
#include "gridwise_convolution_implicit_gemm_v1r1_chwn_cyxk_khwn.hpp"
|
||||
#include "gridwise_convolution_implicit_gemm_v1r2_chwn_cyxk_khwn.hpp"
|
||||
#include "gridwise_convolution_implicit_gemm_v1r3_chwn_cyxk_khwn.hpp"
|
||||
#include "gridwise_convolution_implicit_gemm_v1r3_chwn_cyxk_khwn_lds_double_buffer.hpp"
|
||||
|
||||
using namespace ck;
|
||||
|
||||
@@ -1,9 +1,10 @@
|
||||
#pragma once
|
||||
#include <unistd.h>
|
||||
#include "device.hpp"
|
||||
#include "tensor.hpp"
|
||||
#include "gridwise_convolution_kernel_wrapper.hpp"
|
||||
#include "composable_kernel/kernel_algorithm/gridwise_convolution_implicit_gemm_v1r3_nchw_cyxk_nkhw.hpp"
|
||||
#include "composable_kernel/kernel_algorithm/gridwise_convolution_implicit_gemm_v1r3_nchw_cyxk_nkhw_lds_double_buffer.hpp"
|
||||
#include "gridwise_convolution_implicit_gemm_v1r3_nchw_cyxk_nkhw.hpp"
|
||||
#include "gridwise_convolution_implicit_gemm_v1r3_nchw_cyxk_nkhw_lds_double_buffer.hpp"
|
||||
|
||||
using namespace ck;
|
||||
|
||||
@@ -1,9 +1,10 @@
|
||||
#pragma once
|
||||
#include <unistd.h>
|
||||
#include "device.hpp"
|
||||
#include "tensor.hpp"
|
||||
#include "gridwise_convolution_kernel_wrapper.hpp"
|
||||
#include "composable_kernel/kernel_algorithm/gridwise_convolution_implicit_gemm_v2_chwn_cyxk_khwn.hpp"
|
||||
#include "composable_kernel/kernel_algorithm/gridwise_convolution_implicit_gemm_v2_chwn_cyxk_khwn_lds_double_buffer.hpp"
|
||||
#include "gridwise_convolution_implicit_gemm_v2_chwn_cyxk_khwn.hpp"
|
||||
#include "gridwise_convolution_implicit_gemm_v2_chwn_cyxk_khwn_lds_double_buffer.hpp"
|
||||
|
||||
using namespace ck;
|
||||
|
||||
@@ -1,9 +1,10 @@
|
||||
#pragma once
|
||||
#include <unistd.h>
|
||||
#include "device.hpp"
|
||||
#include "tensor.hpp"
|
||||
#include "gridwise_convolution_kernel_wrapper.hpp"
|
||||
#include "composable_kernel/kernel_algorithm/gridwise_convolution_implicit_gemm_v3_nchw_cyxk_nkhw.hpp"
|
||||
#include "composable_kernel/kernel_algorithm/gridwise_convolution_implicit_gemm_v3_nchw_cyxk_nkhw_lds_double_buffer.hpp"
|
||||
#include "gridwise_convolution_implicit_gemm_v3_nchw_cyxk_nkhw.hpp"
|
||||
#include "gridwise_convolution_implicit_gemm_v3_nchw_cyxk_nkhw_lds_double_buffer.hpp"
|
||||
|
||||
using namespace ck;
|
||||
|
||||
@@ -1,9 +1,10 @@
|
||||
#pragma once
|
||||
#include <unistd.h>
|
||||
#include "device.hpp"
|
||||
#include "tensor.hpp"
|
||||
#include "gridwise_convolution_kernel_wrapper.hpp"
|
||||
#include "composable_kernel/kernel_algorithm/gridwise_convolution_implicit_gemm_v4_nchw_kcyx_nkhw.hpp"
|
||||
#include "composable_kernel/kernel_algorithm/gridwise_convolution_implicit_gemm_v4_nchw_kcyx_nkhw_lds_double_buffer.hpp"
|
||||
#include "gridwise_convolution_implicit_gemm_v4_nchw_kcyx_nkhw.hpp"
|
||||
#include "gridwise_convolution_implicit_gemm_v4_nchw_kcyx_nkhw_lds_double_buffer.hpp"
|
||||
|
||||
using namespace ck;
|
||||
|
||||
@@ -1,7 +1,8 @@
|
||||
#pragma once
|
||||
#include <unistd.h>
|
||||
#include "device.hpp"
|
||||
#include "composable_kernel/kernel_algorithm/gridwise_direct_convolution_2_vectorized_nchw_kcyx_nkhw.hpp"
|
||||
#include "tensor.hpp"
|
||||
#include "gridwise_direct_convolution_2_vectorized_nchw_kcyx_nkhw.hpp"
|
||||
|
||||
using namespace ck;
|
||||
|
||||
@@ -1,7 +1,8 @@
|
||||
#pragma once
|
||||
#include <unistd.h>
|
||||
#include "device.hpp"
|
||||
#include "composable_kernel/kernel_algorithm/gridwise_implicit_gemm_convolution_1_chwn_cyxk_khwn_padded.hpp"
|
||||
#include "tensor.hpp"
|
||||
#include "gridwise_implicit_gemm_convolution_1_chwn_cyxk_khwn_padded.hpp"
|
||||
|
||||
using namespace ck;
|
||||
|
||||
272
driver/include/tensor.hpp
Normal file
272
driver/include/tensor.hpp
Normal file
@@ -0,0 +1,272 @@
|
||||
#ifndef CK_TENSOR_HPP
|
||||
#define CK_TENSOR_HPP
|
||||
|
||||
#include <thread>
|
||||
#include <vector>
|
||||
#include <numeric>
|
||||
#include <algorithm>
|
||||
#include <utility>
|
||||
#include <cassert>
|
||||
#include <iostream>
|
||||
|
||||
// Streams the elements of `range` to `os`, separated by `delim` (no leading
// or trailing delimiter; an empty range prints nothing). Returns `os` so
// calls can be chained.
//
// Fix: `delim` was previously taken by value, copying the string on every
// call (clang-tidy performance-unnecessary-value-param); it is now a const
// reference. All existing call sites remain valid.
template <class Range>
std::ostream& LogRange(std::ostream& os, Range&& range, const std::string& delim)
{
    bool first = true;
    for(auto&& v : range)
    {
        if(first)
            first = false;
        else
            os << delim;
        os << v;
    }
    return os;
}
|
||||
|
||||
// Runtime tag identifying a tensor's element type.
typedef enum {
    Half  = 0,
    Float = 1,
} DataType_t;

// Compile-time map from a C++ element type to its DataType_t tag, e.g.
// DataType<float>::value == DataType_t::Float. Only float is specialized in
// this header; other types are left undefined on purpose.
// NOTE(review): std::integral_constant needs <type_traits>, which is not
// included directly here — presumably pulled in transitively; confirm.
template <class T>
struct DataType;

template <>
struct DataType<float> : std::integral_constant<DataType_t, DataType_t::Float>
{
};
|
||||
|
||||
// std::apply-style helper: expands the tuple-like `args` into the call
// f(get<0>(args), ..., get<N-1>(args)).
template <class F, class T, std::size_t... Idx>
auto call_f_unpack_args_impl(F f, T args, std::index_sequence<Idx...>)
{
    return f(std::get<Idx>(args)...);
}

// Invokes f with the elements of the tuple-like `args` as individual
// arguments and returns the result.
template <class F, class T>
auto call_f_unpack_args(F f, T args)
{
    return call_f_unpack_args_impl(
        f, args, std::make_index_sequence<std::tuple_size<T>::value>{});
}

// Constructs an F from the elements of a tuple-like pack:
// F(get<0>(args), ..., get<N-1>(args)).
template <class F, class T, std::size_t... Idx>
auto construct_f_unpack_args_impl(T args, std::index_sequence<Idx...>)
{
    return F(std::get<Idx>(args)...);
}

// Constructs an F from the elements of the tuple-like `args`; the first
// parameter is used only to deduce the type to construct.
template <class F, class T>
auto construct_f_unpack_args(F, T args)
{
    return construct_f_unpack_args_impl<F>(
        args, std::make_index_sequence<std::tuple_size<T>::value>{});
}
|
||||
|
||||
// Runtime (host-side) tensor shape: lengths plus strides, both in elements.
// Non-template members are defined in src/tensor.cpp.
struct TensorDescriptor
{
    TensorDescriptor() = delete;
    // Packed (dense, row-major) layout: strides derived from the lengths.
    TensorDescriptor(std::initializer_list<std::size_t> lens);
    TensorDescriptor(std::initializer_list<std::size_t> lens,
                     std::initializer_list<std::size_t> strides);
    TensorDescriptor(std::vector<std::size_t> lens, std::vector<std::size_t> strides);

    // Recomputes mStrides from mLens assuming a packed row-major layout.
    void CalculateStrides();

    // Packed layout from any range of lengths.
    template <class Range>
    TensorDescriptor(const Range& lens) : mLens(lens.begin(), lens.end())
    {
        this->CalculateStrides();
    }

    // Explicit lengths + strides from any two ranges (no consistency check).
    template <class Range1, class Range2>
    TensorDescriptor(const Range1& lens, const Range2& strides)
        : mLens(lens.begin(), lens.end()), mStrides(strides.begin(), strides.end())
    {
    }

    std::size_t GetNumOfDimension() const;
    // Number of logical elements (product of the lengths).
    std::size_t GetElementSize() const;
    // Size in elements of the smallest buffer that holds the tensor,
    // accounting for strides.
    std::size_t GetElementSpace() const;

    const std::vector<std::size_t>& GetLengths() const;
    const std::vector<std::size_t>& GetStrides() const;

    // Flat offset of a multi-index: sum over d of is[d] * strides[d].
    // The index count must equal the tensor rank (asserted).
    template <class... Is>
    std::size_t GetOffsetFromMultiIndex(Is... is) const
    {
        assert(sizeof...(Is) == this->GetNumOfDimension());
        std::initializer_list<std::size_t> iss{static_cast<std::size_t>(is)...};
        return std::inner_product(iss.begin(), iss.end(), mStrides.begin(), std::size_t{0});
    }

    private:
    std::vector<std::size_t> mLens;
    std::vector<std::size_t> mStrides;
};
|
||||
|
||||
// std::thread that joins in its destructor (and before move-assignment), so
// destroying or overwriting a still-running thread can never call
// std::terminate.
//
// Fix: the move-assignment operator was `= default`, which inherits
// std::thread::operator= and calls std::terminate() if *this is still
// joinable at the time of assignment. It now joins the current thread first.
struct joinable_thread : std::thread
{
    // Forwards any constructor arguments to std::thread.
    template <class... Xs>
    joinable_thread(Xs&&... xs) : std::thread(std::forward<Xs>(xs)...)
    {
    }

    joinable_thread(joinable_thread&&) = default;

    joinable_thread& operator=(joinable_thread&& other)
    {
        if(this != &other)
        {
            // std::thread::operator= terminates if the target is joinable;
            // finish our own thread before taking over the other one.
            if(this->joinable())
                this->join();
            std::thread::operator=(std::move(other));
        }
        return *this;
    }

    ~joinable_thread()
    {
        if(this->joinable())
            this->join();
    }
};
|
||||
|
||||
// Applies functor mF over an NDIM-dimensional index space whose lengths are
// given by Xs..., flattening the space to [0, mN1d) and splitting that flat
// range across worker threads.
template <class F, class... Xs>
struct ParallelTensorFunctor
{
    F mF;
    static constexpr std::size_t NDIM = sizeof...(Xs);
    std::array<std::size_t, NDIM> mLens;
    std::array<std::size_t, NDIM> mStrides; // row-major strides of mLens
    std::size_t mN1d;                       // total number of index points

    ParallelTensorFunctor(F f, Xs... xs) : mF(f), mLens({static_cast<std::size_t>(xs)...})
    {
        // Row-major strides: the innermost (last) dimension is contiguous.
        mStrides.back() = 1;
        std::partial_sum(mLens.rbegin(),
                         mLens.rend() - 1,
                         mStrides.rbegin() + 1,
                         std::multiplies<std::size_t>());
        mN1d = mStrides[0] * mLens[0];
    }

    // Converts a flat index in [0, mN1d) back into an NDIM multi-index.
    std::array<std::size_t, NDIM> GetNdIndices(std::size_t i) const
    {
        std::array<std::size_t, NDIM> indices;

        for(int idim = 0; idim < NDIM; ++idim)
        {
            indices[idim] = i / mStrides[idim];
            i -= indices[idim] * mStrides[idim];
        }

        return indices;
    }

    // Runs mF at every index point, dividing the flat range evenly across
    // num_thread workers; threads join when `threads` goes out of scope
    // (joinable_thread joins in its destructor).
    void operator()(std::size_t num_thread) const
    {
        std::size_t work_per_thread = (mN1d + num_thread - 1) / num_thread;

        std::vector<joinable_thread> threads(num_thread);

        for(std::size_t it = 0; it < num_thread; ++it)
        {
            std::size_t iw_begin = it * work_per_thread;
            std::size_t iw_end = std::min((it + 1) * work_per_thread, mN1d);

            // Capture by value: iw_begin/iw_end are per-thread bounds and the
            // lambda outlives this loop iteration.
            auto f = [=] {
                for(std::size_t iw = iw_begin; iw < iw_end; ++iw)
                {
                    call_f_unpack_args(mF, GetNdIndices(iw));
                }
            };
            threads[it] = joinable_thread(f);
        }
    }
};
|
||||
|
||||
template <class F, class... Xs>
|
||||
auto make_ParallelTensorFunctor(F f, Xs... xs)
|
||||
{
|
||||
return ParallelTensorFunctor<F, Xs...>(f, xs...);
|
||||
}
|
||||
|
||||
// Host-side tensor: a TensorDescriptor (shape + strides) plus an owning
// std::vector<T> sized to the descriptor's element space.
template <class T>
struct Tensor
{
    // Packed layout from a braced list of lengths.
    template <class X>
    Tensor(std::initializer_list<X> lens) : mDesc(lens), mData(mDesc.GetElementSpace())
    {
    }

    // Packed layout from a vector of lengths.
    template <class X>
    Tensor(std::vector<X> lens) : mDesc(lens), mData(mDesc.GetElementSpace())
    {
    }

    // Explicit lengths + strides.
    template <class X, class Y>
    Tensor(std::vector<X> lens, std::vector<Y> strides)
        : mDesc(lens, strides), mData(mDesc.GetElementSpace())
    {
    }

    Tensor(const TensorDescriptor& desc) : mDesc(desc), mData(mDesc.GetElementSpace()) {}

    // Fills the tensor by evaluating g(indices...) at every position, split
    // across num_thread workers. Only ranks 1-4 are supported; anything else
    // throws std::runtime_error.
    template <class G>
    void GenerateTensorValue(G g, std::size_t num_thread = 1)
    {
        switch(mDesc.GetNumOfDimension())
        {
        case 1:
        {
            auto f = [&](auto i) { (*this)(i) = g(i); };
            make_ParallelTensorFunctor(f, mDesc.GetLengths()[0])(num_thread);
            break;
        }
        case 2:
        {
            auto f = [&](auto i0, auto i1) { (*this)(i0, i1) = g(i0, i1); };
            make_ParallelTensorFunctor(f, mDesc.GetLengths()[0], mDesc.GetLengths()[1])(num_thread);
            break;
        }
        case 3:
        {
            auto f = [&](auto i0, auto i1, auto i2) { (*this)(i0, i1, i2) = g(i0, i1, i2); };
            make_ParallelTensorFunctor(
                f, mDesc.GetLengths()[0], mDesc.GetLengths()[1], mDesc.GetLengths()[2])(num_thread);
            break;
        }
        case 4:
        {
            auto f = [&](auto i0, auto i1, auto i2, auto i3) {
                (*this)(i0, i1, i2, i3) = g(i0, i1, i2, i3);
            };
            make_ParallelTensorFunctor(f,
                                       mDesc.GetLengths()[0],
                                       mDesc.GetLengths()[1],
                                       mDesc.GetLengths()[2],
                                       mDesc.GetLengths()[3])(num_thread);
            break;
        }
        default: throw std::runtime_error("unspported dimension"); // (sic)
        }
    }

    // Element access by multi-index; offset computed via the descriptor's
    // strides.
    template <class... Is>
    T& operator()(Is... is)
    {
        return mData[mDesc.GetOffsetFromMultiIndex(is...)];
    }

    template <class... Is>
    const T& operator()(Is... is) const
    {
        return mData[mDesc.GetOffsetFromMultiIndex(is...)];
    }

    // Iteration over the raw (element-space) storage.
    typename std::vector<T>::iterator begin() { return mData.begin(); }

    typename std::vector<T>::iterator end() { return mData.end(); }

    typename std::vector<T>::const_iterator begin() const { return mData.begin(); }

    typename std::vector<T>::const_iterator end() const { return mData.end(); }

    TensorDescriptor mDesc;
    std::vector<T> mData;
};
|
||||
|
||||
#endif
|
||||
0
driver/src/CMakeLists.txt
Normal file
0
driver/src/CMakeLists.txt
Normal file
114
driver/src/device.cpp
Normal file
114
driver/src/device.cpp
Normal file
@@ -0,0 +1,114 @@
|
||||
#include "config.hpp"
|
||||
#include "device.hpp"
|
||||
|
||||
// Allocates mem_size bytes on the active device (HIP or CUDA backend).
DeviceMem::DeviceMem(std::size_t mem_size) : mMemSize(mem_size)
{
#if CK_DEVICE_BACKEND_AMD
    // NOTE(review): hipGetErrorString only stringifies the status and the
    // result is discarded, so allocation failures are silently ignored.
    hipGetErrorString(hipMalloc(static_cast<void**>(&mpDeviceBuf), mMemSize));
#elif CK_DEVICE_BACKEND_NVIDIA
    checkCudaErrors(cudaMalloc(static_cast<void**>(&mpDeviceBuf), mMemSize));
#endif
}

// Non-owning raw pointer to the device allocation.
void* DeviceMem::GetDeviceBuffer() { return mpDeviceBuf; }

// Host -> device copy of the full buffer (mMemSize bytes).
void DeviceMem::ToDevice(const void* p)
{
#if CK_DEVICE_BACKEND_AMD
    hipGetErrorString(
        hipMemcpy(mpDeviceBuf, const_cast<void*>(p), mMemSize, hipMemcpyHostToDevice));
#elif CK_DEVICE_BACKEND_NVIDIA
    checkCudaErrors(
        cudaMemcpy(mpDeviceBuf, const_cast<void*>(p), mMemSize, cudaMemcpyHostToDevice));
#endif
}

// Device -> host copy of the full buffer (mMemSize bytes).
void DeviceMem::FromDevice(void* p)
{
#if CK_DEVICE_BACKEND_AMD
    hipGetErrorString(hipMemcpy(p, mpDeviceBuf, mMemSize, hipMemcpyDeviceToHost));
#elif CK_DEVICE_BACKEND_NVIDIA
    checkCudaErrors(cudaMemcpy(p, mpDeviceBuf, mMemSize, cudaMemcpyDeviceToHost));
#endif
}

// Releases the device allocation.
DeviceMem::~DeviceMem()
{
#if CK_DEVICE_BACKEND_AMD
    hipGetErrorString(hipFree(mpDeviceBuf));
#elif CK_DEVICE_BACKEND_NVIDIA
    checkCudaErrors(cudaFree(mpDeviceBuf));
#endif
}
|
||||
|
||||
// Backend event-based timer: records a start and an end event on the default
// stream (stream 0) and reports the elapsed time between them.
struct KernelTimerImpl
{
    KernelTimerImpl()
    {
#if CK_DEVICE_BACKEND_AMD
        hipEventCreate(&mStart);
        hipEventCreate(&mEnd);
#elif CK_DEVICE_BACKEND_NVIDIA
        cudaEventCreate(&mStart);
        cudaEventCreate(&mEnd);
#endif
    }

    ~KernelTimerImpl()
    {
#if CK_DEVICE_BACKEND_AMD
        hipEventDestroy(mStart);
        hipEventDestroy(mEnd);
#elif CK_DEVICE_BACKEND_NVIDIA
        cudaEventDestroy(mStart);
        cudaEventDestroy(mEnd);
#endif
    }

    // Records the start event on stream 0 (asynchronous).
    void Start()
    {
#if CK_DEVICE_BACKEND_AMD
        hipEventRecord(mStart, 0);
#elif CK_DEVICE_BACKEND_NVIDIA
        cudaEventRecord(mStart, 0);
#endif
    }

    // Records the end event on stream 0 and blocks until it completes.
    void End()
    {
#if CK_DEVICE_BACKEND_AMD
        hipEventRecord(mEnd, 0);
        hipEventSynchronize(mEnd);
#elif CK_DEVICE_BACKEND_NVIDIA
        cudaEventRecord(mEnd, 0);
        cudaEventSynchronize(mEnd);
#endif
    }

    // Elapsed time between the recorded events (milliseconds per the
    // hip/cudaEventElapsedTime API).
    float GetElapsedTime() const
    {
        float time;
#if CK_DEVICE_BACKEND_AMD
        hipEventElapsedTime(&time, mStart, mEnd);
#elif CK_DEVICE_BACKEND_NVIDIA
        cudaEventElapsedTime(&time, mStart, mEnd);
#endif
        return time;
    }

#if CK_DEVICE_BACKEND_AMD
    hipEvent_t mStart, mEnd;
#elif CK_DEVICE_BACKEND_NVIDIA
    cudaEvent_t mStart, mEnd;
#endif
};
|
||||
|
||||
// Pimpl forwarding: KernelTimer owns a KernelTimerImpl and delegates to it.
KernelTimer::KernelTimer() : impl(new KernelTimerImpl()) {}

// Defined here (not in the header) so unique_ptr's deleter sees the complete
// KernelTimerImpl type.
KernelTimer::~KernelTimer() {}

void KernelTimer::Start() { impl->Start(); }

void KernelTimer::End() { impl->End(); }

float KernelTimer::GetElapsedTime() const { return impl->GetElapsedTime(); }
|
||||
@@ -3,9 +3,9 @@
|
||||
#include <initializer_list>
|
||||
#include <cstdlib>
|
||||
#include <stdlib.h>
|
||||
#include "composable_kernel/utility/config.hpp"
|
||||
#include "composable_kernel/tensor_description/ConstantTensorDescriptor.hpp"
|
||||
#include "tensor.hpp"
|
||||
#include "config.hpp"
|
||||
#include "ConstantTensorDescriptor.hpp"
|
||||
#include "device.hpp"
|
||||
#include "conv_common.hpp"
|
||||
#include "device_convolution_direct_v2_nchw_kcyx_nkhw.hpp"
|
||||
#include "device_convolution_implicit_gemm_v1_chwn_cyxk_khwn.hpp"
|
||||
45
driver/src/tensor.cpp
Normal file
45
driver/src/tensor.cpp
Normal file
@@ -0,0 +1,45 @@
|
||||
#include <boost/range/adaptor/transformed.hpp>
|
||||
#include <cassert>
|
||||
|
||||
#include "tensor.hpp"
|
||||
|
||||
// Packed layout: derive row-major strides from the given lengths.
TensorDescriptor::TensorDescriptor(std::initializer_list<std::size_t> lens) : mLens(lens)
{
    this->CalculateStrides();
}

// Explicit lengths + strides; no validation that the two sizes match here
// (GetElementSize asserts it later).
TensorDescriptor::TensorDescriptor(std::vector<std::size_t> lens, std::vector<std::size_t> strides)
    : mLens(lens), mStrides(strides)
{
}
|
||||
|
||||
// Recomputes mStrides for a dense row-major layout: the last dimension is
// contiguous (stride 1) and each earlier stride is the product of all
// faster-varying lengths.
void TensorDescriptor::CalculateStrides()
{
    mStrides.clear();
    mStrides.resize(mLens.size(), 0);
    // Rank-0 descriptor: nothing to compute.
    if(mStrides.empty())
        return;

    mStrides.back() = 1;
    // Reverse partial product of the lengths fills the remaining strides.
    std::partial_sum(
        mLens.rbegin(), mLens.rend() - 1, mStrides.rbegin() + 1, std::multiplies<std::size_t>());
}
|
||||
|
||||
std::size_t TensorDescriptor::GetNumOfDimension() const { return mLens.size(); }

// Number of logical elements: product of all lengths (1 for an empty rank).
std::size_t TensorDescriptor::GetElementSize() const
{
    assert(mLens.size() == mStrides.size());
    return std::accumulate(
        mLens.begin(), mLens.end(), std::size_t{1}, std::multiplies<std::size_t>());
}

// Smallest buffer size (in elements) that contains the tensor:
// 1 + sum((len - 1) * stride), which is correct for strided/padded layouts.
// NOTE(review): for a rank-0 descriptor this returns 1 — confirm callers
// expect that rather than 0.
std::size_t TensorDescriptor::GetElementSpace() const
{
    auto ls = mLens | boost::adaptors::transformed([](std::size_t v) { return v - 1; });
    return std::inner_product(ls.begin(), ls.end(), mStrides.begin(), std::size_t{0}) + 1;
}

const std::vector<std::size_t>& TensorDescriptor::GetLengths() const { return mLens; }

const std::vector<std::size_t>& TensorDescriptor::GetStrides() const { return mStrides; }
|
||||
Reference in New Issue
Block a user