From 6521ccba67f1d0be908177960eeba59cf39fa05a Mon Sep 17 00:00:00 2001
From: Chao Liu <lc.roy86@gmail.com>
Date: Fri, 19 Oct 2018 01:26:21 -0500
Subject: [PATCH] cpu direct conv

[ROCm/composable_kernel commit: d51b81588ff6102dbde9c9d91810c1bb8f709cfc]
---
 CMakeLists.txt         |   2 +
 driver/conv.cpp        |  82 +++++++++----
 src/CMakeLists.txt     |   4 +-
 src/include/tensor.hpp | 261 +++++++++++++++++++++--------------------
 4 files changed, 197 insertions(+), 152 deletions(-)
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 02b72e1dad..150632d790 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -41,5 +41,7 @@ link_libraries(${PYTHON_LIBRARIES})
 include_directories(BEFORE ${CUDA_COMMON_INCLUDE_DIR})
 
 #
+include_directories(BEFORE src/include)
 add_subdirectory(src)
 add_subdirectory(driver)
+
diff --git a/driver/conv.cpp b/driver/conv.cpp
index 4e10bce265..6889ba1c15 100644
--- a/driver/conv.cpp
+++ b/driver/conv.cpp
@@ -1,31 +1,67 @@
+#include <iostream>
 #include "tensor.hpp"
 
+template <typename T>
+void direct_convolution(const Tensor<T>& in,
+                        const Tensor<T>& wei,
+                        Tensor<T>& out,
+                        std::size_t num_thread)
+{
+    auto f = [&](auto n, auto k, auto ho, auto wo) {
+        double v = 0;
+        for(int c = 0; c < wei.mDesc.GetLengths()[1]; ++c)
+        {
+            for(int y = 0; y < wei.mDesc.GetLengths()[2]; ++y)
+            {
+                int hi = ho + y;
+                for(int x = 0; x < wei.mDesc.GetLengths()[3]; ++x)
+                {
+                    int wi = wo + x;
+                    v += in(n, c, hi, wi) * wei(k, c, y, x);
+                }
+            }
+        }
+        out(n, k, ho, wo) = v;
+    };
+
+    auto f_par = make_ParallelTensorFunctor(f,
+                                            out.mDesc.GetLengths()[0],
+                                            out.mDesc.GetLengths()[1],
+                                            out.mDesc.GetLengths()[2],
+                                            out.mDesc.GetLengths()[3]);
+
+    f_par(num_thread);
+}
+
+template <class T>
+struct Generator
+{
+
+    template <class... Is>
+    T operator()(Is... is)
+    {
+        return 1;
+    }
+};
+
 int main()
 {
+    Tensor<float> in({3, 16, 128, 128});
+    Tensor<float> wei({4, 16, 3, 3});
+    Tensor<float> out({3, 4, 126, 126});
 
-    int len_in  = 100;
-    int len_wei = 3;
-    int len_out = len_in - len_wei + 1;
+    int num_thread = std::thread::hardware_concurrency();
 
-    std::vector<float> in(len_in, 1);
-    std::vector<float> wei(len_wei, 1);
-    std::vector<float> out(len_out, 1);
+    std::cout << __func__ << ": num_thread " << num_thread << std::endl;
 
-    direct_convolution(in.data(), wei.data(), out.data(), len_in, len_wei);
-}
-
-template <typename T>
-void direct_convolution(const T* in, const T* wei, T* out, const int len_in, const int len_wei)
-{
-    int len_out = len_in - len_wei + 1;
-
-    for(int i_out = 0; i_out < len_out++ i_out)
-    {
-        double acc = 0;
-        for(int i_wei = 0; i_wei < len_wei; ++i_wei)
-        {
-            acc += in[i_out + i_wei] * *wei[i_wei];
-        }
-        out[i_out] = acc;
-    }
+    in.GenerateTensorValue(Generator<float>{}, num_thread);
+    wei.GenerateTensorValue(Generator<float>{}, num_thread);
+
+    direct_convolution(in, wei, out, num_thread);
+
+    std::cout << __func__ << ": done" << std::endl;
+
+    LogRange(std::cout, in.mData, ",") << std::endl;
+    LogRange(std::cout, wei.mData, ",") << std::endl;
+    LogRange(std::cout, out.mData, ",") << std::endl;
 }
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index 6165ed605a..b88c759ac9 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -1,5 +1,3 @@
-include_directories(BEFORE include)
-
 set(SOURCE 
     tensor.cpp;
 )
@@ -11,7 +9,7 @@ set_target_properties(convolution PROPERTIES PREFIX "")
 target_link_libraries(convolution boost_python3)
 
 # cuda
-target_link_libraries(convolution nvToolsExt)
+target_link_libraries(convolution nvToolsExt cudart)
 target_compile_features(convolution PUBLIC)
 set_target_properties(convolution PROPERTIES POSITION_INDEPENDENT_CODE ON)
 set_target_properties(convolution PROPERTIES CUDA_SEPARABLE_COMPILATION OFF)
diff --git a/src/include/tensor.hpp b/src/include/tensor.hpp
index 3961a40b80..e232c74516 100644
--- a/src/include/tensor.hpp
+++ b/src/include/tensor.hpp
@@ -1,10 +1,28 @@
 #include <thread>
 #include <vector>
 #include <numeric>
+#include <algorithm>
 #include <utility>
+#include <cassert>
+#include <iostream>
 #include "cuda_runtime.h"
 #include "helper_cuda.h"
 
+template <class Range>
+std::ostream& LogRange(std::ostream& os, Range&& r, std::string delim)
+{
+    bool first = true;
+    for(auto&& x : r)
+    {
+        if(first)
+            first = false;
+        else
+            os << delim;
+        os << x;
+    }
+    return os;
+}
+
 typedef enum
 {
     Half  = 0,
@@ -19,6 +37,34 @@ struct DataType<float> : std::integral_constant<DataType_t, DataType_t::Float>
 {
 };
 
+template <class F, class T, std::size_t... Is>
+auto call_f_unpack_args_impl(F f, T args, std::index_sequence<Is...>)
+{
+    return f(std::get<Is>(args)...);
+}
+
+template <class F, class T>
+auto call_f_unpack_args(F f, T args)
+{
+    constexpr std::size_t N = std::tuple_size<T>::value;
+
+    return call_f_unpack_args_impl(f, args, std::make_index_sequence<N>{});
+}
+
+template <class F, class T, std::size_t... Is>
+auto construct_f_unpack_args_impl(T args, std::index_sequence<Is...>)
+{
+    return F(std::get<Is>(args)...);
+}
+
+template <class F, class T>
+auto construct_f_unpack_args(F, T args)
+{
+    constexpr std::size_t N = std::tuple_size<T>::value;
+
+    return construct_f_unpack_args_impl<F>(args, std::make_index_sequence<N>{});
+}
+
 struct TensorDescriptor
 {
     TensorDescriptor() = delete;
@@ -50,12 +96,12 @@ struct TensorDescriptor
     const std::vector<std::size_t>& GetLengths() const;
     const std::vector<std::size_t>& GetStrides() const;
 
-    template <class... Xs>
-    std::size_t Get1dIndex(Xs... xs) const
+    template <class... Is>
+    std::size_t Get1dIndex(Is... is) const
     {
-        assert(sizeof...(Xs) == this->GetDimension());
-        std::initializer_list<std::size_t> is{xs...};
-        return std::inner_product(is.begin(), is.end(), mStrides.begin(), std::size_t{0});
+        assert(sizeof...(Is) == this->GetDimension());
+        std::initializer_list<std::size_t> iss{static_cast<std::size_t>(is)...};
+        return std::inner_product(iss.begin(), iss.end(), mStrides.begin(), std::size_t{0});
     }
 
     private:
@@ -65,90 +111,6 @@ struct TensorDescriptor
     DataType_t mDataType;
 };
 
-template <class T>
-struct Tensor
-{
-    template <class X>
-    Tensor(std::initializer_list<X> lens)
-        : mDesc(DataType<T>{}, lens), mData(mDesc.GetElementSpace())
-    {
-    }
-
-    template <class X>
-    Tensor(std::vector<X> lens) : mDesc(DataType<T>{}, lens), mData(mDesc.GetElementSpace())
-    {
-    }
-
-    template <class X, class Y>
-    Tensor(std::vector<X> lens, std::vector<Y> strides)
-        : mDesc(DataType<T>{}, lens, strides), mData(mDesc.GetElementSpace())
-    {
-    }
-
-    template <class G>
-    void GenerateTensorValue(G g)
-    {
-        // ParallelTensorFunctor([&](Xs... xs) { mData(mDesc.Get1dIndex(xs...)) = g(xs...); },
-        // mDesc.mLens)();
-        switch(mDesc.GetDimension())
-        {
-        case 1:
-        {
-            ParallelTensorFunctor([&](auto i) { mData(mDesc.Get1dIndex(i)) = g(i); },
-                                  mDesc.GetLengths()[0])();
-            break;
-        }
-        case 2:
-        {
-            ParallelTensorFunctor(
-                [&](auto i0, auto i1) { mData(mDesc.Get1dIndex(i0, i1)) = g(i0, i1); },
-                mDesc.GetLengths()[0],
-                mDesc.GetLengths()[1])();
-            break;
-        }
-        case 3:
-        {
-            ParallelTensorFunctor(
-                [&](auto i0, auto i1, auto i2) {
-                    mData(mDesc.Get1dIndex(i0, i1, i2)) = g(i0, i1, i2);
-                },
-                mDesc.GetLengths()[0],
-                mDesc.GetLengths()[1],
-                mDesc.GetLengths()[2])();
-            break;
-        }
-        case 4:
-        {
-            ParallelTensorFunctor(
-                [&](auto i0, auto i1, auto i2, auto i3) {
-                    mData(mDesc.Get1dIndex(i0, i1, i2, i3)) = g(i0, i1, i2, i3);
-                },
-                mDesc.GetLengths()[0],
-                mDesc.GetLengths()[1],
-                mDesc.GetLengths()[3],
-                mDesc.GetLengths()[4])();
-            break;
-        }
-        default: throw std::runtime_error("unspported dimension");
-        }
-    }
-
-    T& operator[](std::size_t i) { return mData.at(i); }
-
-    const T& operator[](std::size_t i) const { return mData.at(i); }
-
-    typename std::vector<T>::iterator begin() { return mData.begin(); }
-
-    typename std::vector<T>::iterator end() { return mData.end(); }
-
-    typename std::vector<T>::const_iterator begin() const { return mData.begin(); }
-
-    typename std::vector<T>::const_iterator end() const { return mData.end(); }
-
-    TensorDescriptor mDesc;
-    std::vector<T> mData;
-};
-
 struct GpuMem
 {
     GpuMem() = delete;
@@ -194,12 +156,6 @@ struct joinable_thread : std::thread
 template <class F, class... Xs>
 struct ParallelTensorFunctor
 {
-    enum ParallelMethod_t
-    {
-        Serial   = 0,
-        Parallel = 1,
-    };
-
     F mF;
     static constexpr std::size_t NDIM = sizeof...(Xs);
     std::array<std::size_t, NDIM> mLens;
@@ -229,16 +185,7 @@ struct ParallelTensorFunctor
         return indices;
     }
 
-    void operator()(std::integral_constant<ParallelMethod_t, ParallelMethod_t::Serial>)
-    {
-        for(std::size_t i = 0; i < mN1d; ++i)
-        {
-            call_f_unpack_args(mF, GetNdIndices(i));
-        }
-    }
-
-    void operator()(std::integral_constant<ParallelMethod_t, ParallelMethod_t::Parallel>,
-                    std::size_t num_thread)
+    void operator()(std::size_t num_thread) const
     {
         std::size_t work_per_thread = (mN1d + num_thread - 1) / num_thread;
 
@@ -247,7 +194,7 @@ struct ParallelTensorFunctor
         for(std::size_t it = 0; it < num_thread; ++it)
         {
             std::size_t iw_begin = it * work_per_thread;
-            std::size_t iw_end   = std::min(((it + 1) * work_per_thread, mN1d));
+            std::size_t iw_end   = std::min((it + 1) * work_per_thread, mN1d);
 
             auto f = [=] {
                 for(std::size_t iw = iw_begin; iw < iw_end; ++iw)
@@ -260,30 +207,92 @@ struct ParallelTensorFunctor
     }
 };
 
-template <class F, class T>
-auto call_f_unpack_args(F f, T args)
+template <class F, class... Xs>
+auto make_ParallelTensorFunctor(F f, Xs... xs)
 {
-    static constexpr std::size_t N = std::tuple_size<T>::value;
-
-    return call_f_unpack_args_impl(f, args, std::make_index_sequence<N>{});
+    return ParallelTensorFunctor<F, Xs...>(f, xs...);
 }
 
-template <class F, class T, class... Is>
-auto call_f_unpack_args_impl(F f, T args, std::integer_sequence<Is...>)
+template <class T>
+struct Tensor
 {
-    return f(std::get<Is>(args)...);
-}
+    template <class X>
+    Tensor(std::initializer_list<X> lens)
+        : mDesc(DataType<T>{}, lens), mData(mDesc.GetElementSpace())
+    {
+    }
 
-template <class F, class T, class... Is>
-auto construct_f_unpack_args_impl(T args, std::integer_sequence<Is...>)
-{
-    return F(std::get<Is>(args)...);
-}
+    template <class X>
+    Tensor(std::vector<X> lens) : mDesc(DataType<T>{}, lens), mData(mDesc.GetElementSpace())
+    {
+    }
 
-template <class F, class T>
-auto construct_f_unpack_args(F, T args)
-{
-    static constexpr std::size_t N = std::tuple_size<T>::value;
+    template <class X, class Y>
+    Tensor(std::vector<X> lens, std::vector<Y> strides)
+        : mDesc(DataType<T>{}, lens, strides), mData(mDesc.GetElementSpace())
+    {
+    }
 
-    return construct_f_unpack_args_impl<F>(args, std::make_index_sequence<N>{});
-}
+    template <class G>
+    void GenerateTensorValue(G g, std::size_t num_thread = 1)
+    {
+        switch(mDesc.GetDimension())
+        {
+        case 1:
+        {
+            auto f = [&](auto i) { (*this)(i) = g(i); };
+            make_ParallelTensorFunctor(f, mDesc.GetLengths()[0])(num_thread);
+            break;
+        }
+        case 2:
+        {
+            auto f = [&](auto i0, auto i1) { (*this)(i0, i1) = g(i0, i1); };
+            make_ParallelTensorFunctor(f, mDesc.GetLengths()[0], mDesc.GetLengths()[1])(num_thread);
+            break;
+        }
+        case 3:
+        {
+            auto f = [&](auto i0, auto i1, auto i2) { (*this)(i0, i1, i2) = g(i0, i1, i2); };
+            make_ParallelTensorFunctor(
+                f, mDesc.GetLengths()[0], mDesc.GetLengths()[1], mDesc.GetLengths()[2])(num_thread);
+            break;
+        }
+        case 4:
+        {
+            auto f = [&](auto i0, auto i1, auto i2, auto i3) {
+                (*this)(i0, i1, i2, i3) = g(i0, i1, i2, i3);
+            };
+            make_ParallelTensorFunctor(f,
+                                       mDesc.GetLengths()[0],
+                                       mDesc.GetLengths()[1],
+                                       mDesc.GetLengths()[2],
+                                       mDesc.GetLengths()[3])(num_thread);
+            break;
+        }
+        default: throw std::runtime_error("unspported dimension");
+        }
+    }
+
+    template <class... Is>
+    T& operator()(Is... is)
+    {
+        return mData[mDesc.Get1dIndex(is...)];
+    }
+
+    template <class... Is>
+    const T& operator()(Is... is) const
+    {
+        return mData[mDesc.Get1dIndex(is...)];
+    }
+
+    typename std::vector<T>::iterator begin() { return mData.begin(); }
+
+    typename std::vector<T>::iterator end() { return mData.end(); }
+
+    typename std::vector<T>::const_iterator begin() const { return mData.begin(); }
+
+    typename std::vector<T>::const_iterator end() const { return mData.end(); }
+
+    TensorDescriptor mDesc;
+    std::vector<T> mData;
+};