Initial CUDA build

This commit is contained in:
Chao Liu
2018-10-22 11:51:10 -05:00
parent d51b81588f
commit 2f2cf35bf4
7 changed files with 179 additions and 82 deletions

View File

@@ -0,0 +1,39 @@
#pragma once
#include "helper_cuda.h"
#include "tensor.hpp"
// Device-side mirror of a host TensorDescriptor: owns device copies of the
// length and stride arrays so kernels can index into the tensor.
struct DeviceTensorDescriptor
{
    DeviceTensorDescriptor() = delete;

    // Allocates device arrays for the lengths and strides and copies them
    // over from the host descriptor.
    __host__ DeviceTensorDescriptor(const TensorDescriptor& host_desc)
        : mDataType(host_desc.GetDataType()), mDim(host_desc.GetDimension())
    {
        // The length/stride arrays store unsigned long elements, so their
        // byte size depends only on mDim — not on the tensor's element type.
        // (The original used the Float/Half element size here, which is
        // wrong for these index arrays.)
        // NOTE(review): the host vectors hold std::size_t; this assumes
        // sizeof(std::size_t) == sizeof(unsigned long) — confirm on all
        // targeted platforms.
        const std::size_t array_sz = sizeof(unsigned long) * mDim;

        checkCudaErrors(cudaMalloc(&mpLengths, array_sz));
        checkCudaErrors(cudaMalloc(&mpStrides, array_sz));

        // cudaMemcpy's signature is (dst, src, count, kind): the freshly
        // allocated device buffer is the destination. (The original had dst
        // and src swapped, copying uninitialized device memory over the host
        // vectors instead.) cudaMemcpy's src parameter is const void*, so no
        // const_cast is needed.
        checkCudaErrors(cudaMemcpy(mpLengths,
                                   host_desc.GetLengths().data(),
                                   array_sz,
                                   cudaMemcpyHostToDevice));
        checkCudaErrors(cudaMemcpy(mpStrides,
                                   host_desc.GetStrides().data(),
                                   array_sz,
                                   cudaMemcpyHostToDevice));
    }

    // Frees the owned device arrays.
    // NOTE(review): the struct is copyable but owns raw device memory, so a
    // copy double-frees; it is also passed by value to kernels (see
    // direct_convolution), where this host-only destructor cannot run.
    // Consider a non-owning view type for kernel parameters.
    __host__ ~DeviceTensorDescriptor()
    {
        checkCudaErrors(cudaFree(mpLengths));
        checkCudaErrors(cudaFree(mpStrides));
    }

    DataType_t mDataType;     // element type of the described tensor
    unsigned long mDim;       // number of dimensions
    unsigned long* mpLengths; // device array [mDim]: per-dimension lengths
    unsigned long* mpStrides; // device array [mDim]: per-dimension strides
};

View File

@@ -0,0 +1,12 @@
#pragma once
#include "device_tensor.cuh"
// Direct (naive) convolution kernel — currently an empty stub: this commit
// only wires up the CUDA build; the convolution itself is not implemented.
//
// NOTE(review): `TFloat* const in` / `TFloat* const wei` make the POINTERS
// const, not the pointed-to data; `const TFloat*` is presumably intended for
// the read-only inputs — confirm.
// NOTE(review): DeviceTensorDescriptor is passed by value, copying an object
// whose host-only destructor frees device memory — potential double-free once
// the kernel is actually launched; confirm ownership semantics.
template <class TFloat, int NBlockDim>
__global__ void direct_convolution(DeviceTensorDescriptor in_desc,
TFloat* const in,
DeviceTensorDescriptor wei_desc,
TFloat* const wei,
DeviceTensorDescriptor out_desc,
TFloat* out)
{
}

View File

@@ -1,3 +1,4 @@
#pragma once
#include <thread>
#include <vector>
#include <numeric>
@@ -89,6 +90,7 @@ struct TensorDescriptor
{
}
DataType_t GetDataType() const;
std::size_t GetDimension() const;
std::size_t GetElementSize() const;
std::size_t GetElementSpace() const;
@@ -105,35 +107,36 @@ struct TensorDescriptor
}
private:
DataType_t mDataType;
std::vector<std::size_t> mLens;
std::vector<std::size_t> mStrides;
DataType_t mDataType;
};
// Owner of a raw, untyped device allocation of mMemSize bytes.
// (Clean post-rename version: the diff residue interleaved the old GpuMem
// lines with the new DeviceMem ones; only the DeviceMem form is kept.)
struct DeviceMem
{
    DeviceMem() = delete;

    // Allocates mem_size bytes of device memory.
    // NOTE(review): the cudaMalloc result is not checked here, unlike
    // DeviceTensorDescriptor — consider routing through checkCudaErrors.
    DeviceMem(std::size_t mem_size) : mMemSize(mem_size)
    {
        cudaMalloc(static_cast<void**>(&mpDeviceBuf), mMemSize);
    }

    // Raw device pointer, suitable for passing to kernels.
    void* GetDeviceBuffer() { return mpDeviceBuf; }

    // Copies mMemSize bytes from host pointer p into the device buffer.
    // Returns the cudaError_t as int (0 == cudaSuccess).
    // cudaMemcpy's src parameter is already const void*, so the original's
    // const_cast was redundant and has been dropped.
    int ToDevice(const void* p)
    {
        return static_cast<int>(cudaMemcpy(mpDeviceBuf, p, mMemSize, cudaMemcpyHostToDevice));
    }

    // Copies mMemSize bytes from the device buffer to host pointer p.
    // Returns the cudaError_t as int (0 == cudaSuccess).
    int FromDevice(void* p)
    {
        return static_cast<int>(cudaMemcpy(p, mpDeviceBuf, mMemSize, cudaMemcpyDeviceToHost));
    }

    ~DeviceMem() { cudaFree(mpDeviceBuf); }

    // NOTE(review): copyable while owning raw device memory — a copy would
    // double-free mpDeviceBuf; consider deleting copy operations.
    void* mpDeviceBuf;      // owned device allocation
    std::size_t mMemSize;   // size of the allocation in bytes
};
struct joinable_thread : std::thread

View File

@@ -28,6 +28,8 @@ void TensorDescriptor::CalculateStrides()
mLens.rbegin(), mLens.rend() - 1, mStrides.rbegin() + 1, std::multiplies<std::size_t>());
}
// Element data type recorded for this tensor descriptor.
DataType_t TensorDescriptor::GetDataType() const
{
    return mDataType;
}
// Number of dimensions, i.e. the number of entries in the lengths vector.
std::size_t TensorDescriptor::GetDimension() const
{
    return mLens.size();
}
std::size_t TensorDescriptor::GetElementSize() const