sucess cuda run

2026-05-11 17:00:18 +00:00 · 2018-11-04 14:55:55 -06:00
parent 6a45afba95
commit 84b36bc18d
3 changed files with 212 additions and 186 deletions
--- a/driver/conv.cu
+++ b/driver/conv.cu
@@ -1,6 +1,7 @@
 #include <iostream>
 #include <numeric>
 #include <initializer_list>
+#include <cstdlib>
 #include "nvToolsExt.h"
 #include "tensor.hpp"
 #include "constant_tensor_descriptor.cuh"
@@ -30,7 +31,10 @@ struct GeneratorTensor
    template <class... Is>
    T operator()(Is... is)
    {
-#if 0
+#if 1
+        return std::rand() / RAND_MAX;
+#elif 0
+
        std::initializer_list<std::size_t> ls = {static_cast<std::size_t>(is)...};
        return std::accumulate(ls.begin(), ls.end(), std::size_t(0));
 #else
@@ -111,53 +115,6 @@ void host_convolution(const Tensor<T>& in, const Tensor<T>& wei, Tensor<T>& out)
    f_par(std::thread::hardware_concurrency());
 }

-#if 0
-template <class T>
-void device_convolution(
-    const Tensor<T>& in, const Tensor<T>& wei, Tensor<T>& out)
-{
-    DeviceTensorDescriptor<4> in_desc_device(in.mDesc);
-    DeviceTensorDescriptor<4> wei_desc_device(wei.mDesc);
-    DeviceTensorDescriptor<4> out_desc_device(out.mDesc);
-
-    printf("__func__: in_desc_device: {%u %u %u %u}, {%u %u %u %u}\n",
-           in_desc_device.GetLength(0),
-           in_desc_device.GetLength(1),
-           in_desc_device.GetLength(2),
-           in_desc_device.GetLength(3),
-           in_desc_device.GetStride(0),
-           in_desc_device.GetStride(1),
-           in_desc_device.GetStride(2),
-           in_desc_device.GetStride(3));
-
-    std::size_t data_sz = sizeof(T);
-    DeviceMem in_device_buf(data_sz * in.mDesc.GetElementSpace());
-    DeviceMem wei_device_buf(data_sz * wei.mDesc.GetElementSpace());
-    DeviceMem out_device_buf(data_sz * out.mDesc.GetElementSpace());
-
-    int num_thread = std::thread::hardware_concurrency();
-
-    out.GenerateTensorValue(GeneratorConstant<float>{0}, num_thread);
-
-    in_device_buf.ToDevice(in.mData.data());
-    wei_device_buf.ToDevice(wei.mData.data());
-    out_device_buf.ToDevice(out.mData.data());
-
-    dim3 block_dim(64, 1, 1);
-    dim3 grid_dim(1, 1, 1);
-
-    gridwise_convolution<T, 3, 3, 4, 4, 2, 2, 1, 1, 8, 8, 1>
-        <<<grid_dim, block_dim>>>(in_desc_device,
-                                  static_cast<T*>(in_device_buf.GetDeviceBuffer()),
-                                  wei_desc_device,
-                                  static_cast<T*>(wei_device_buf.GetDeviceBuffer()),
-                                  out_desc_device,
-                                  static_cast<T*>(out_device_buf.GetDeviceBuffer()));
-
-    checkCudaErrors(cudaGetLastError());
-    out_device_buf.FromDevice(out.mData.data());
-}
-#else
 template <class T, class InDesc, class WeiDesc, class OutDesc>
 void const_device_convolution(
    InDesc, const Tensor<T>& in, WeiDesc, const Tensor<T>& wei, OutDesc, Tensor<T>& out)
@@ -169,36 +126,45 @@ void const_device_convolution(

    int num_thread = std::thread::hardware_concurrency();

+#if 0
    out.GenerateTensorValue(GeneratorConstant<float>{0}, num_thread);
+#endif

    in_device_buf.ToDevice(in.mData.data());
    wei_device_buf.ToDevice(wei.mData.data());
    out_device_buf.ToDevice(out.mData.data());

-    dim3 block_dim(64, 1, 1);
-    dim3 grid_dim(1, 1, 1);
-
    constexpr auto I0 = Index<0>{};
    constexpr auto I1 = Index<1>{};
    constexpr auto I2 = Index<2>{};
    constexpr auto I3 = Index<3>{};

-    constexpr auto in_desc = InDesc{};
-    constexpr auto wei_desc = WeiDesc{};
-    constexpr auto out_desc = OutDesc{};
-    constexpr unsigned NPerBlock = 1;
-    constexpr unsigned KPerBlock = 1;
+    constexpr auto in_desc           = InDesc{};
+    constexpr auto wei_desc          = WeiDesc{};
+    constexpr auto out_desc          = OutDesc{};
+    constexpr unsigned NPerBlock     = 1;
+    constexpr unsigned KPerBlock     = 1;
    constexpr unsigned CPerBlockLoop = 1;
-    constexpr unsigned OutTileSizeH = 2;
-    constexpr unsigned OutTileSizeW = 2;
-    constexpr unsigned YPerBlock = (out_desc.GetLength(I2) + OutTileSizeH - 1) / OutTileSizeH;
-    constexpr unsigned XPerBlock = (out_desc.GetLength(I3) + OutTileSizeW - 1) / OutTileSizeW;
+    constexpr unsigned OutTileSizeH  = 2;
+    constexpr unsigned OutTileSizeW  = 2;
+    constexpr unsigned YPerBlock     = 16;
+    constexpr unsigned XPerBlock     = 16;

    constexpr unsigned NBlockCopyLen0 = 1;
    constexpr unsigned NBlockCopyLen1 = 1;
    constexpr unsigned NBlockCopyLen2 = 1;
    constexpr unsigned NBlockCopyLen3 = 64;

+    constexpr unsigned nblock = (out_desc.GetLength(I0) / NPerBlock) *
+                                (out_desc.GetLength(I1) / KPerBlock) *
+                                (out_desc.GetLength(I2) / (OutTileSizeH * YPerBlock)) *
+                                (out_desc.GetLength(I3) / (OutTileSizeW * XPerBlock));
+
+    dim3 block_dim(32);
+    dim3 grid_dim(nblock);
+
+    printf("__func__: nblock %u \n", nblock);
+
    gridwise_convolution<T,
                         InDesc,
                         WeiDesc,
@@ -224,11 +190,10 @@ void const_device_convolution(
    checkCudaErrors(cudaGetLastError());
    out_device_buf.FromDevice(out.mData.data());
 }
-#endif

 int main()
 {
-#if 1
+#if 0
    constexpr unsigned N  = 1;
    constexpr unsigned C  = 1;
    constexpr unsigned HI = 18;
@@ -237,21 +202,21 @@ int main()
    constexpr unsigned S  = 3;
    constexpr unsigned R  = 3;
 #elif 1
-    constexpr unsigned N = 1;
-    constexpr unsigned C = 1;
-    constexpr unsigned HI = 36;
-    constexpr unsigned WI = 36;
-    constexpr unsigned K = 1;
+    constexpr unsigned N = 64;
+    constexpr unsigned C = 256;
+    constexpr unsigned HI = 34;
+    constexpr unsigned WI = 34;
+    constexpr unsigned K = 56;
    constexpr unsigned S = 3;
    constexpr unsigned R = 3;
 #elif 0
-    constexpr unsigned N  = 1;
-    constexpr unsigned C  = 1;
+    constexpr unsigned N = 2;
+    constexpr unsigned C = 3;
    constexpr unsigned HI = 130;
    constexpr unsigned WI = 130;
-    constexpr unsigned K  = 1;
-    constexpr unsigned S  = 3;
-    constexpr unsigned R  = 3;
+    constexpr unsigned K = 5;
+    constexpr unsigned S = 3;
+    constexpr unsigned R = 3;
 #elif 0
    constexpr unsigned N  = 3;
    constexpr unsigned C  = 16;
@@ -278,28 +243,43 @@ int main()

    int num_thread = std::thread::hardware_concurrency();

+#if 0
    in.GenerateTensorValue(GeneratorTensor<float>{}, num_thread);
    wei.GenerateTensorValue(GeneratorTensor<float>{}, num_thread);
-
-    host_convolution(in, wei, out_host);
+#endif

 #if 0
-    device_convolution(in, wei, out_device);
-#else
-    const_device_convolution(in_desc, in, wei_desc, wei, out_desc, out_device);
+    host_convolution(in, wei, out_host);
 #endif

+    const_device_convolution(in_desc, in, wei_desc, wei, out_desc, out_device);
+
    std::cout << __func__ << ": done" << std::endl;

+#if 0
    LogRange(std::cout << __func__ << "in : ", in.mData, ",") << std::endl;
    LogRange(std::cout << __func__ << "wei: ", wei.mData, ",") << std::endl;
    LogRange(std::cout, out_host.mData, ",") << std::endl;
    LogRange(std::cout, out_device.mData, ",") << std::endl;
+#endif

-    float error = 0;
+#if 0
+    float error      = 0;
+    float max_diff   = 0;
+    float host_value = 0, device_value = 0;
    for(int i = 0; i < out_host.mData.size(); ++i)
    {
        error += std::abs(out_host.mData[i] - out_device.mData[i]);
+        float diff = std::abs(out_host.mData[i] - out_device.mData[i]);
+        if(max_diff < diff)
+        {
+            max_diff     = diff;
+            host_value   = out_host.mData[i];
+            device_value = out_device.mData[i];
+        }
    }
    std::cout << "error: " << error << std::endl;
+    std::cout << "max_diff: " << max_diff << ", " << host_value << ", " << device_value
+              << std::endl;
+#endif
 }