From 2f2cf35bf4fdf71fc7f306e51ee8b34066e32829 Mon Sep 17 00:00:00 2001 From: Chao Liu Date: Mon, 22 Oct 2018 11:51:10 -0500 Subject: [PATCH] initial cuda build --- driver/CMakeLists.txt | 2 +- driver/conv.cpp | 67 ------------------ driver/conv.cu | 108 +++++++++++++++++++++++++++++ src/include/device_tensor.cuh | 39 +++++++++++ src/include/direct_convolution.cuh | 12 ++++ src/include/tensor.hpp | 31 +++++---- src/tensor.cpp | 2 + 7 files changed, 179 insertions(+), 82 deletions(-) delete mode 100644 driver/conv.cpp create mode 100644 driver/conv.cu create mode 100644 src/include/device_tensor.cuh create mode 100644 src/include/direct_convolution.cuh diff --git a/driver/CMakeLists.txt b/driver/CMakeLists.txt index dae6f62835..25c23dea5a 100644 --- a/driver/CMakeLists.txt +++ b/driver/CMakeLists.txt @@ -1,2 +1,2 @@ -add_executable(conv EXCLUDE_FROM_ALL conv.cpp) +add_executable(conv EXCLUDE_FROM_ALL conv.cu) target_link_libraries(conv convolution) diff --git a/driver/conv.cpp b/driver/conv.cpp deleted file mode 100644 index 6889ba1c15..0000000000 --- a/driver/conv.cpp +++ /dev/null @@ -1,67 +0,0 @@ -#include -#include "tensor.hpp" - -template -void direct_convolution(const Tensor& in, - const Tensor& wei, - Tensor& out, - std::size_t num_thread) -{ - auto f = [&](auto n, auto k, auto ho, auto wo) { - double v = 0; - for(int c = 0; c < wei.mDesc.GetLengths()[1]; ++c) - { - for(int y = 0; y < wei.mDesc.GetLengths()[2]; ++y) - { - int hi = ho + y; - for(int x = 0; x < wei.mDesc.GetLengths()[3]; ++x) - { - int wi = wo + x; - v += in(n, c, hi, wi) * wei(k, c, y, x); - } - } - } - out(n, k, ho, wo) = v; - }; - - auto f_par = make_ParallelTensorFunctor(f, - out.mDesc.GetLengths()[0], - out.mDesc.GetLengths()[1], - out.mDesc.GetLengths()[2], - out.mDesc.GetLengths()[3]); - - f_par(num_thread); -} - -template -struct Generator -{ - - template - T operator()(Is... is) - { - return 1; - } -}; - -int main() -{ - Tensor in({3, 16, 128, 128}); - Tensor wei({4, 16, 3, 3}); - Tensor out({3, 4, 126, 126}); - - int num_thread = std::thread::hardware_concurrency(); - - std::cout << __func__ << ": num_thread " << num_thread << std::endl; - - in.GenerateTensorValue(Generator{}, num_thread); - wei.GenerateTensorValue(Generator{}, num_thread); - - direct_convolution(in, wei, out, num_thread); - - std::cout << __func__ << ": done" << std::endl; - - LogRange(std::cout, in.mData, ",") << std::endl; - LogRange(std::cout, wei.mData, ",") << std::endl; - LogRange(std::cout, out.mData, ",") << std::endl; -} diff --git a/driver/conv.cu b/driver/conv.cu new file mode 100644 index 0000000000..40851c9dc8 --- /dev/null +++ b/driver/conv.cu @@ -0,0 +1,108 @@ +#include +#include "nvToolsExt.h" +#include "tensor.hpp" +#include "device_tensor.cuh" +#include "direct_convolution.cuh" + +template +void host_convolution(const Tensor& in, + const Tensor& wei, + Tensor& out, + std::size_t num_thread) +{ + auto f = [&](auto n, auto k, auto ho, auto wo) { + double v = 0; + for(int c = 0; c < wei.mDesc.GetLengths()[1]; ++c) + { + for(int y = 0; y < wei.mDesc.GetLengths()[2]; ++y) + { + int hi = ho + y; + for(int x = 0; x < wei.mDesc.GetLengths()[3]; ++x) + { + int wi = wo + x; + v += in(n, c, hi, wi) * wei(k, c, y, x); + } + } + } + out(n, k, ho, wo) = v; + }; + + auto f_par = make_ParallelTensorFunctor(f, + out.mDesc.GetLengths()[0], + out.mDesc.GetLengths()[1], + out.mDesc.GetLengths()[2], + out.mDesc.GetLengths()[3]); + + f_par(num_thread); +} + +template +void device_convolution(const Tensor& in, const Tensor& wei, Tensor& out) + +{ + DeviceTensorDescriptor in_desc_device(in.mDesc); + DeviceTensorDescriptor wei_desc_device(wei.mDesc); + DeviceTensorDescriptor out_desc_device(out.mDesc); + + std::size_t data_sz = sizeof(T); + DeviceMem in_device_buf(data_sz * in.mDesc.GetElementSpace()); + DeviceMem wei_device_buf(data_sz * wei.mDesc.GetElementSpace()); + DeviceMem out_device_buf(data_sz * out.mDesc.GetElementSpace()); + + in_device_buf.ToDevice(in.mData.data()); + wei_device_buf.ToDevice(wei.mData.data()); + + dim3 block_dim(256, 1, 1); + dim3 grid_dim(1, 1, 1); + direct_convolution + <<>>(in_desc_device, + static_cast(in_device_buf.GetDeviceBuffer()), + wei_desc_device, + static_cast(wei_device_buf.GetDeviceBuffer()), + out_desc_device, + static_cast(out_device_buf.GetDeviceBuffer())); + + out_device_buf.FromDevice(out.mData.data()); +} + +template +struct Generator +{ + + template + T operator()(Is... is) + { + return 1; + } +}; + +int main() +{ +#if 0 + Tensor in({3, 16, 128, 128}); + Tensor wei({4, 16, 3, 3}); + Tensor out_host({3, 4, 126, 126}); +#else + Tensor in({1, 1, 4, 4}); + Tensor wei({1, 1, 3, 3}); + Tensor out_host({1, 1, 2, 2}); +#endif + Tensor out_device = out_host; + + int num_thread = std::thread::hardware_concurrency(); + + std::cout << __func__ << ": num_thread " << num_thread << std::endl; + + in.GenerateTensorValue(Generator{}, num_thread); + wei.GenerateTensorValue(Generator{}, num_thread); + + host_convolution(in, wei, out_host, num_thread); + device_convolution(in, wei, out_device); + + std::cout << __func__ << ": done" << std::endl; + + LogRange(std::cout, in.mData, ",") << std::endl; + LogRange(std::cout, wei.mData, ",") << std::endl; + LogRange(std::cout, out_host.mData, ",") << std::endl; + LogRange(std::cout, out_device.mData, ",") << std::endl; +} diff --git a/src/include/device_tensor.cuh b/src/include/device_tensor.cuh new file mode 100644 index 0000000000..87865b53ad --- /dev/null +++ b/src/include/device_tensor.cuh @@ -0,0 +1,39 @@ +#pragma once +#include "helper_cuda.h" +#include "tensor.hpp" + +struct DeviceTensorDescriptor +{ + DeviceTensorDescriptor() = delete; + + __host__ DeviceTensorDescriptor(const TensorDescriptor& host_desc) + : mDataType(host_desc.GetDataType()), mDim(host_desc.GetDimension()) + { + std::size_t data_sz = host_desc.GetDataType() == DataType_t::Float ? 4 : 2; + + checkCudaErrors(cudaMalloc(&mpLengths, data_sz * mDim)); + checkCudaErrors(cudaMalloc(&mpStrides, data_sz * mDim)); + + checkCudaErrors( + cudaMemcpy(const_cast(static_cast(host_desc.GetLengths().data())), + mpLengths, + data_sz * mDim, + cudaMemcpyHostToDevice)); + checkCudaErrors( + cudaMemcpy(const_cast(static_cast(host_desc.GetStrides().data())), + mpStrides, + data_sz * mDim, + cudaMemcpyHostToDevice)); + } + + __host__ ~DeviceTensorDescriptor() + { + checkCudaErrors(cudaFree(mpLengths)); + checkCudaErrors(cudaFree(mpStrides)); + } + + DataType_t mDataType; + unsigned long mDim; + unsigned long* mpLengths; + unsigned long* mpStrides; +}; diff --git a/src/include/direct_convolution.cuh b/src/include/direct_convolution.cuh new file mode 100644 index 0000000000..2816028ac8 --- /dev/null +++ b/src/include/direct_convolution.cuh @@ -0,0 +1,12 @@ +#pragma once +#include "device_tensor.cuh" + +template +__global__ void direct_convolution(DeviceTensorDescriptor in_desc, + TFloat* const in, + DeviceTensorDescriptor wei_desc, + TFloat* const wei, + DeviceTensorDescriptor out_desc, + TFloat* out) +{ +} diff --git a/src/include/tensor.hpp b/src/include/tensor.hpp index e232c74516..4bcd81b122 100644 --- a/src/include/tensor.hpp +++ b/src/include/tensor.hpp @@ -1,3 +1,4 @@ +#pragma once #include #include #include @@ -89,6 +90,7 @@ struct TensorDescriptor { } + DataType_t GetDataType() const; std::size_t GetDimension() const; std::size_t GetElementSize() const; std::size_t GetElementSpace() const; @@ -105,35 +107,36 @@ struct TensorDescriptor } private: + DataType_t mDataType; std::vector mLens; std::vector mStrides; - - DataType_t mDataType; }; -struct GpuMem +struct DeviceMem { - GpuMem() = delete; - GpuMem(std::size_t size, std::size_t data_size) : mSize(size), mDataSize(data_size) + DeviceMem() = delete; + DeviceMem(std::size_t mem_size) : mMemSize(mem_size) { - cudaMalloc(static_cast(&mGpuBuf), mDataSize * mSize); + cudaMalloc(static_cast(&mpDeviceBuf), mMemSize); } - int ToGpu(void* p) + void* GetDeviceBuffer() { return mpDeviceBuf; } + + int ToDevice(const void* p) { - return static_cast(cudaMemcpy(mGpuBuf, p, mDataSize * mSize, cudaMemcpyHostToDevice)); + return static_cast( + cudaMemcpy(mpDeviceBuf, const_cast(p), mMemSize, cudaMemcpyHostToDevice)); } - int FromGpu(void* p) + int FromDevice(void* p) { - return static_cast(cudaMemcpy(p, mGpuBuf, mDataSize * mSize, cudaMemcpyDeviceToHost)); + return static_cast(cudaMemcpy(p, mpDeviceBuf, mMemSize, cudaMemcpyDeviceToHost)); } - ~GpuMem() { cudaFree(mGpuBuf); } + ~DeviceMem() { cudaFree(mpDeviceBuf); } - void* mGpuBuf; - std::size_t mSize; - std::size_t mDataSize; + void* mpDeviceBuf; + std::size_t mMemSize; }; struct joinable_thread : std::thread diff --git a/src/tensor.cpp b/src/tensor.cpp index b94677a073..852d0b8046 100644 --- a/src/tensor.cpp +++ b/src/tensor.cpp @@ -28,6 +28,8 @@ void TensorDescriptor::CalculateStrides() mLens.rbegin(), mLens.rend() - 1, mStrides.rbegin() + 1, std::multiplies()); } +DataType_t TensorDescriptor::GetDataType() const { return mDataType; } + std::size_t TensorDescriptor::GetDimension() const { return mLens.size(); } std::size_t TensorDescriptor::GetElementSize() const