mirror of
https://github.com/ROCm/composable_kernel.git
synced 2026-05-17 11:30:02 +00:00
@@ -1,7 +1,23 @@
|
||||
# Host-side tensor utility library (tensor descriptors + device memory glue).
set(TENSOR_SOURCE
src/tensor.cpp;
src/device.cpp;
)

add_library(tensor SHARED ${TENSOR_SOURCE})
# NOTE(review): target_compile_features with an empty feature list is normally
# an error in CMake — confirm the intended feature set (e.g. cxx_std_14).
target_compile_features(tensor PUBLIC)
set_target_properties(tensor PROPERTIES POSITION_INDEPENDENT_CODE ON)

# CUDA backend needs the runtime and NVTX for profiling markers.
if(DEVICE_BACKEND STREQUAL "NVIDIA")
target_link_libraries(tensor nvToolsExt cudart)
endif()

install(TARGETS tensor LIBRARY DESTINATION lib)

# Select the driver translation unit for the active backend.
# NOTE(review): each branch sets DRIVER_SOURCE twice; the second set()
# overrides the first, so the bare driver.cpp/driver.cu lines look like
# leftovers — confirm and remove.
if(DEVICE_BACKEND STREQUAL "AMD")
set(DRIVER_SOURCE driver.cpp)
set(DRIVER_SOURCE src/driver.cpp)
elseif(DEVICE_BACKEND STREQUAL "NVIDIA")
set(DRIVER_SOURCE driver.cu)
set(DRIVER_SOURCE src/driver.cu)
endif()

add_executable(driver ${DRIVER_SOURCE})
|
||||
|
||||
125
driver/include/conv_common.hpp
Normal file
125
driver/include/conv_common.hpp
Normal file
@@ -0,0 +1,125 @@
|
||||
#ifndef CK_CONV_COMMON_HPP
|
||||
#define CK_CONV_COMMON_HPP
|
||||
|
||||
#include "ConstantTensorDescriptor.hpp"
|
||||
|
||||
using namespace ck;
|
||||
|
||||
// this is ugly, only for 4d
|
||||
template <class InDesc, class WeiDesc>
|
||||
constexpr auto get_convolution_output_default_4d_tensor_descriptor(InDesc, WeiDesc)
|
||||
{
|
||||
constexpr auto in_desc = InDesc{};
|
||||
constexpr auto wei_desc = WeiDesc{};
|
||||
|
||||
constexpr auto I0 = Number<0>{};
|
||||
constexpr auto I1 = Number<1>{};
|
||||
constexpr auto I2 = Number<2>{};
|
||||
constexpr auto I3 = Number<3>{};
|
||||
|
||||
static_assert(in_desc.GetNumOfDimension() == 4, "input nDim is not 4");
|
||||
static_assert(wei_desc.GetNumOfDimension() == 4, "weight nDim is not 4");
|
||||
static_assert(in_desc.GetLength(I1) == wei_desc.GetLength(I1),
|
||||
"input & weight dimension not consistent");
|
||||
|
||||
constexpr auto N = in_desc.GetLength(I0);
|
||||
constexpr auto HI = in_desc.GetLength(I2);
|
||||
constexpr auto WI = in_desc.GetLength(I3);
|
||||
|
||||
constexpr auto K = wei_desc.GetLength(I0);
|
||||
constexpr auto Y = wei_desc.GetLength(I2);
|
||||
constexpr auto X = wei_desc.GetLength(I3);
|
||||
|
||||
constexpr auto HO = HI + 1 - Y;
|
||||
constexpr auto WO = WI + 1 - X;
|
||||
|
||||
return make_ConstantTensorDescriptor_packed(Sequence<N, K, HO, WO>{});
|
||||
}
|
||||
|
||||
template <class InDesc, class WeiDesc, class LowerPads, class UpperPads>
|
||||
constexpr auto get_convolution_with_padding_output_default_4d_tensor_descriptor(InDesc,
|
||||
WeiDesc,
|
||||
LowerPads,
|
||||
UpperPads)
|
||||
{
|
||||
constexpr auto in_desc = InDesc{};
|
||||
constexpr auto wei_desc = WeiDesc{};
|
||||
|
||||
constexpr auto I0 = Number<0>{};
|
||||
constexpr auto I1 = Number<1>{};
|
||||
constexpr auto I2 = Number<2>{};
|
||||
constexpr auto I3 = Number<3>{};
|
||||
|
||||
static_assert(in_desc.GetNumOfDimension() == 4, "input nDim is not 4");
|
||||
static_assert(wei_desc.GetNumOfDimension() == 4, "weight nDim is not 4");
|
||||
static_assert(in_desc.GetLength(I1) == wei_desc.GetLength(I1),
|
||||
"input & weight dimension not consistent");
|
||||
|
||||
constexpr auto N = in_desc.GetLength(I0);
|
||||
constexpr auto HI = in_desc.GetLength(I2);
|
||||
constexpr auto WI = in_desc.GetLength(I3);
|
||||
|
||||
constexpr auto K = wei_desc.GetLength(I0);
|
||||
constexpr auto Y = wei_desc.GetLength(I2);
|
||||
constexpr auto X = wei_desc.GetLength(I3);
|
||||
|
||||
constexpr auto HPadLow = LowerPads{}.Get(I0);
|
||||
constexpr auto WPadLow = LowerPads{}.Get(I1);
|
||||
|
||||
constexpr auto HPadUp = UpperPads{}.Get(I0);
|
||||
constexpr auto WPadUp = UpperPads{}.Get(I1);
|
||||
|
||||
constexpr auto HO = HI + HPadLow + HPadUp + 1 - Y;
|
||||
constexpr auto WO = WI + WPadLow + WPadUp + 1 - X;
|
||||
|
||||
return make_ConstantTensorDescriptor_packed(Sequence<N, K, HO, WO>{});
|
||||
}
|
||||
|
||||
template <class InDesc, class WeiDesc, class OutDesc>
|
||||
constexpr std::size_t calculate_convolution_flops(InDesc, WeiDesc, OutDesc)
|
||||
{
|
||||
constexpr auto wei_desc = WeiDesc{};
|
||||
constexpr auto out_desc = OutDesc{};
|
||||
|
||||
constexpr auto I0 = Number<0>{};
|
||||
constexpr auto I1 = Number<1>{};
|
||||
constexpr auto I2 = Number<2>{};
|
||||
constexpr auto I3 = Number<3>{};
|
||||
|
||||
constexpr index_t N = out_desc.GetLength(I0);
|
||||
constexpr index_t K = out_desc.GetLength(I1);
|
||||
constexpr index_t Ho = out_desc.GetLength(I2);
|
||||
constexpr index_t Wo = out_desc.GetLength(I3);
|
||||
|
||||
constexpr index_t C = wei_desc.GetLength(I1);
|
||||
constexpr index_t Y = wei_desc.GetLength(I2);
|
||||
constexpr index_t X = wei_desc.GetLength(I3);
|
||||
|
||||
return std::size_t(2) * N * K * Ho * Wo * C * Y * X;
|
||||
}
|
||||
|
||||
// Total memory footprint (in bytes) of a convolution problem: the input,
// weight and output tensors with element type Float.
//
// GetElementSpace() (not element count) is used so strided/padded descriptor
// layouts are accounted for.
//
// Cleanup: the previous version materialized wei_desc/out_desc and N, K, Ho,
// Wo, C, Y, X but never used them — all dead code, removed here. The
// signature and the returned value are unchanged.
template <class Float, class InDesc, class WeiDesc, class OutDesc>
constexpr std::size_t calculate_convolution_memory_size(Float, InDesc, WeiDesc, OutDesc)
{
    return sizeof(Float) *
           (InDesc::GetElementSpace() + WeiDesc::GetElementSpace() + OutDesc::GetElementSpace());
}
|
||||
|
||||
#endif
|
||||
64
driver/include/device.hpp
Normal file
64
driver/include/device.hpp
Normal file
@@ -0,0 +1,64 @@
|
||||
#ifndef CK_DEVICE_HPP
|
||||
#define CK_DEVICE_HPP
|
||||
|
||||
#include <memory>
|
||||
#include "config.hpp"
|
||||
|
||||
using namespace ck;
|
||||
|
||||
// RAII wrapper around a raw device (GPU) allocation of mMemSize bytes.
// Member definitions live in src/device.cpp and dispatch on the compile-time
// device backend (HIP or CUDA).
struct DeviceMem
{
    DeviceMem() = delete;
    // Allocates mem_size bytes on the device.
    DeviceMem(std::size_t mem_size);
    // Raw device pointer; ownership stays with this object.
    void* GetDeviceBuffer();
    // Copies mMemSize bytes from host memory at p into the device buffer.
    void ToDevice(const void* p);
    // Copies mMemSize bytes from the device buffer into host memory at p.
    void FromDevice(void* p);
    ~DeviceMem();

    void* mpDeviceBuf;
    std::size_t mMemSize;
};
|
||||
|
||||
// Opaque backend-specific timer state (defined in src/device.cpp).
struct KernelTimerImpl;

// Measures elapsed device time between Start() and End() using backend
// events; pimpl keeps HIP/CUDA headers out of this header.
struct KernelTimer
{
    KernelTimer();
    ~KernelTimer();
    void Start();
    void End();
    // Time between the Start() and End() events, as reported by the
    // backend's event-elapsed-time API (milliseconds for hip/cuda events).
    float GetElapsedTime() const;

    std::unique_ptr<KernelTimerImpl> impl;
};
|
||||
|
||||
// Launches `kernel` with the given grid/block dimensions and dynamic LDS
// byte count on the default stream, forwarding `args...`, and returns the
// measured kernel time (see KernelTimer::GetElapsedTime). The backend is
// selected at compile time.
template <typename... Args, typename F>
float launch_kernel(F kernel, dim3 grid_dim, dim3 block_dim, std::size_t lds_byte, Args... args)
{
    KernelTimer timer;

#if CK_DEVICE_BACKEND_AMD
    timer.Start();

    hipLaunchKernelGGL(kernel, grid_dim, block_dim, lds_byte, 0, args...);

    timer.End();

    // NOTE(review): the returned string is discarded — this only fetches and
    // drops the last error rather than reporting launch failures.
    hipGetErrorString(hipGetLastError());
#elif CK_DEVICE_BACKEND_NVIDIA
    // cudaLaunchKernel takes the kernel as an untyped pointer plus an array
    // of pointers to the individual arguments.
    const void* f = reinterpret_cast<const void*>(kernel);
    void* p_args[] = {&args...};

    timer.Start();

    cudaError_t error = cudaLaunchKernel(f, grid_dim, block_dim, p_args, lds_byte, 0);

    timer.End();

    checkCudaErrors(error);
#endif

    return timer.GetElapsedTime();
}
|
||||
|
||||
#endif
|
||||
@@ -1,8 +1,9 @@
|
||||
#pragma once
|
||||
#include <unistd.h>
|
||||
#include "device.hpp"
|
||||
#include "tensor.hpp"
|
||||
#include "gridwise_convolution_kernel_wrapper.hpp"
|
||||
#include "composable_kernel/kernel_algorithm/gridwise_convolution_direct_v2_nchw_kcyx_nkhw.hpp"
|
||||
#include "gridwise_convolution_direct_v2_nchw_kcyx_nkhw.hpp"
|
||||
|
||||
using namespace ck;
|
||||
|
||||
@@ -1,11 +1,12 @@
|
||||
#pragma once
|
||||
#include <unistd.h>
|
||||
#include "device.hpp"
|
||||
#include "tensor.hpp"
|
||||
#include "gridwise_convolution_kernel_wrapper.hpp"
|
||||
#include "composable_kernel/kernel_algorithm/gridwise_convolution_implicit_gemm_v1r1_chwn_cyxk_khwn.hpp"
|
||||
#include "composable_kernel/kernel_algorithm/gridwise_convolution_implicit_gemm_v1r2_chwn_cyxk_khwn.hpp"
|
||||
#include "composable_kernel/kernel_algorithm/gridwise_convolution_implicit_gemm_v1r3_chwn_cyxk_khwn.hpp"
|
||||
#include "composable_kernel/kernel_algorithm/gridwise_convolution_implicit_gemm_v1r3_chwn_cyxk_khwn_lds_double_buffer.hpp"
|
||||
#include "gridwise_convolution_implicit_gemm_v1r1_chwn_cyxk_khwn.hpp"
|
||||
#include "gridwise_convolution_implicit_gemm_v1r2_chwn_cyxk_khwn.hpp"
|
||||
#include "gridwise_convolution_implicit_gemm_v1r3_chwn_cyxk_khwn.hpp"
|
||||
#include "gridwise_convolution_implicit_gemm_v1r3_chwn_cyxk_khwn_lds_double_buffer.hpp"
|
||||
|
||||
using namespace ck;
|
||||
|
||||
@@ -1,9 +1,10 @@
|
||||
#pragma once
|
||||
#include <unistd.h>
|
||||
#include "device.hpp"
|
||||
#include "tensor.hpp"
|
||||
#include "gridwise_convolution_kernel_wrapper.hpp"
|
||||
#include "composable_kernel/kernel_algorithm/gridwise_convolution_implicit_gemm_v1r3_nchw_cyxk_nkhw.hpp"
|
||||
#include "composable_kernel/kernel_algorithm/gridwise_convolution_implicit_gemm_v1r3_nchw_cyxk_nkhw_lds_double_buffer.hpp"
|
||||
#include "gridwise_convolution_implicit_gemm_v1r3_nchw_cyxk_nkhw.hpp"
|
||||
#include "gridwise_convolution_implicit_gemm_v1r3_nchw_cyxk_nkhw_lds_double_buffer.hpp"
|
||||
|
||||
using namespace ck;
|
||||
|
||||
@@ -1,9 +1,10 @@
|
||||
#pragma once
|
||||
#include <unistd.h>
|
||||
#include "device.hpp"
|
||||
#include "tensor.hpp"
|
||||
#include "gridwise_convolution_kernel_wrapper.hpp"
|
||||
#include "composable_kernel/kernel_algorithm/gridwise_convolution_implicit_gemm_v2_chwn_cyxk_khwn.hpp"
|
||||
#include "composable_kernel/kernel_algorithm/gridwise_convolution_implicit_gemm_v2_chwn_cyxk_khwn_lds_double_buffer.hpp"
|
||||
#include "gridwise_convolution_implicit_gemm_v2_chwn_cyxk_khwn.hpp"
|
||||
#include "gridwise_convolution_implicit_gemm_v2_chwn_cyxk_khwn_lds_double_buffer.hpp"
|
||||
|
||||
using namespace ck;
|
||||
|
||||
@@ -1,9 +1,10 @@
|
||||
#pragma once
|
||||
#include <unistd.h>
|
||||
#include "device.hpp"
|
||||
#include "tensor.hpp"
|
||||
#include "gridwise_convolution_kernel_wrapper.hpp"
|
||||
#include "composable_kernel/kernel_algorithm/gridwise_convolution_implicit_gemm_v3_nchw_cyxk_nkhw.hpp"
|
||||
#include "composable_kernel/kernel_algorithm/gridwise_convolution_implicit_gemm_v3_nchw_cyxk_nkhw_lds_double_buffer.hpp"
|
||||
#include "gridwise_convolution_implicit_gemm_v3_nchw_cyxk_nkhw.hpp"
|
||||
#include "gridwise_convolution_implicit_gemm_v3_nchw_cyxk_nkhw_lds_double_buffer.hpp"
|
||||
|
||||
using namespace ck;
|
||||
|
||||
@@ -1,9 +1,10 @@
|
||||
#pragma once
|
||||
#include <unistd.h>
|
||||
#include "device.hpp"
|
||||
#include "tensor.hpp"
|
||||
#include "gridwise_convolution_kernel_wrapper.hpp"
|
||||
#include "composable_kernel/kernel_algorithm/gridwise_convolution_implicit_gemm_v4_nchw_kcyx_nkhw.hpp"
|
||||
#include "composable_kernel/kernel_algorithm/gridwise_convolution_implicit_gemm_v4_nchw_kcyx_nkhw_lds_double_buffer.hpp"
|
||||
#include "gridwise_convolution_implicit_gemm_v4_nchw_kcyx_nkhw.hpp"
|
||||
#include "gridwise_convolution_implicit_gemm_v4_nchw_kcyx_nkhw_lds_double_buffer.hpp"
|
||||
|
||||
using namespace ck;
|
||||
|
||||
@@ -1,7 +1,8 @@
|
||||
#pragma once
|
||||
#include <unistd.h>
|
||||
#include "device.hpp"
|
||||
#include "composable_kernel/kernel_algorithm/gridwise_direct_convolution_2_vectorized_nchw_kcyx_nkhw.hpp"
|
||||
#include "tensor.hpp"
|
||||
#include "gridwise_direct_convolution_2_vectorized_nchw_kcyx_nkhw.hpp"
|
||||
|
||||
using namespace ck;
|
||||
|
||||
@@ -1,7 +1,8 @@
|
||||
#pragma once
|
||||
#include <unistd.h>
|
||||
#include "device.hpp"
|
||||
#include "composable_kernel/kernel_algorithm/gridwise_implicit_gemm_convolution_1_chwn_cyxk_khwn_padded.hpp"
|
||||
#include "tensor.hpp"
|
||||
#include "gridwise_implicit_gemm_convolution_1_chwn_cyxk_khwn_padded.hpp"
|
||||
|
||||
using namespace ck;
|
||||
|
||||
272
driver/include/tensor.hpp
Normal file
272
driver/include/tensor.hpp
Normal file
@@ -0,0 +1,272 @@
|
||||
#ifndef CK_TENSOR_HPP
|
||||
#define CK_TENSOR_HPP
|
||||
|
||||
#include <thread>
|
||||
#include <vector>
|
||||
#include <numeric>
|
||||
#include <algorithm>
|
||||
#include <utility>
|
||||
#include <cassert>
|
||||
#include <iostream>
|
||||
|
||||
// Streams the elements of `range` to `os`, separated by `delim` (no leading
// or trailing delimiter; an empty range prints nothing). Returns `os` so
// calls can be chained.
//
// Fix: `delim` was previously taken by value, copying the string on every
// call (clang-tidy performance-unnecessary-value-param); it is now a const
// reference. All existing call sites remain valid.
template <class Range>
std::ostream& LogRange(std::ostream& os, Range&& range, const std::string& delim)
{
    bool first = true;
    for(auto&& v : range)
    {
        if(first)
            first = false;
        else
            os << delim;
        os << v;
    }
    return os;
}
|
||||
|
||||
// Runtime tag identifying a tensor's element type.
typedef enum {
    Half  = 0,
    Float = 1,
} DataType_t;

// Compile-time map from a C++ element type to its DataType_t tag, e.g.
// DataType<float>::value == DataType_t::Float. Only float is specialized in
// this header; other types are left undefined on purpose.
// NOTE(review): std::integral_constant needs <type_traits>, which is not
// included directly here — presumably pulled in transitively; confirm.
template <class T>
struct DataType;

template <>
struct DataType<float> : std::integral_constant<DataType_t, DataType_t::Float>
{
};
|
||||
|
||||
// std::apply-style helper: expands the tuple-like `args` into the call
// f(get<0>(args), ..., get<N-1>(args)).
template <class F, class T, std::size_t... Idx>
auto call_f_unpack_args_impl(F f, T args, std::index_sequence<Idx...>)
{
    return f(std::get<Idx>(args)...);
}

// Invokes f with the elements of the tuple-like `args` as individual
// arguments and returns the result.
template <class F, class T>
auto call_f_unpack_args(F f, T args)
{
    return call_f_unpack_args_impl(
        f, args, std::make_index_sequence<std::tuple_size<T>::value>{});
}

// Constructs an F from the elements of a tuple-like pack:
// F(get<0>(args), ..., get<N-1>(args)).
template <class F, class T, std::size_t... Idx>
auto construct_f_unpack_args_impl(T args, std::index_sequence<Idx...>)
{
    return F(std::get<Idx>(args)...);
}

// Constructs an F from the elements of the tuple-like `args`; the first
// parameter is used only to deduce the type to construct.
template <class F, class T>
auto construct_f_unpack_args(F, T args)
{
    return construct_f_unpack_args_impl<F>(
        args, std::make_index_sequence<std::tuple_size<T>::value>{});
}
|
||||
|
||||
// Runtime (host-side) tensor shape: lengths plus strides, both in elements.
// Non-template members are defined in src/tensor.cpp.
struct TensorDescriptor
{
    TensorDescriptor() = delete;
    // Packed (dense, row-major) layout: strides derived from the lengths.
    TensorDescriptor(std::initializer_list<std::size_t> lens);
    TensorDescriptor(std::initializer_list<std::size_t> lens,
                     std::initializer_list<std::size_t> strides);
    TensorDescriptor(std::vector<std::size_t> lens, std::vector<std::size_t> strides);

    // Recomputes mStrides from mLens assuming a packed row-major layout.
    void CalculateStrides();

    // Packed layout from any range of lengths.
    template <class Range>
    TensorDescriptor(const Range& lens) : mLens(lens.begin(), lens.end())
    {
        this->CalculateStrides();
    }

    // Explicit lengths + strides from any two ranges (no consistency check).
    template <class Range1, class Range2>
    TensorDescriptor(const Range1& lens, const Range2& strides)
        : mLens(lens.begin(), lens.end()), mStrides(strides.begin(), strides.end())
    {
    }

    std::size_t GetNumOfDimension() const;
    // Number of logical elements (product of the lengths).
    std::size_t GetElementSize() const;
    // Size in elements of the smallest buffer that holds the tensor,
    // accounting for strides.
    std::size_t GetElementSpace() const;

    const std::vector<std::size_t>& GetLengths() const;
    const std::vector<std::size_t>& GetStrides() const;

    // Flat offset of a multi-index: sum over d of is[d] * strides[d].
    // The index count must equal the tensor rank (asserted).
    template <class... Is>
    std::size_t GetOffsetFromMultiIndex(Is... is) const
    {
        assert(sizeof...(Is) == this->GetNumOfDimension());
        std::initializer_list<std::size_t> iss{static_cast<std::size_t>(is)...};
        return std::inner_product(iss.begin(), iss.end(), mStrides.begin(), std::size_t{0});
    }

    private:
    std::vector<std::size_t> mLens;
    std::vector<std::size_t> mStrides;
};
|
||||
|
||||
// std::thread that joins in its destructor (and before move-assignment), so
// destroying or overwriting a still-running thread can never call
// std::terminate.
//
// Fix: the move-assignment operator was `= default`, which inherits
// std::thread::operator= and calls std::terminate() if *this is still
// joinable at the time of assignment. It now joins the current thread first.
struct joinable_thread : std::thread
{
    // Forwards any constructor arguments to std::thread.
    template <class... Xs>
    joinable_thread(Xs&&... xs) : std::thread(std::forward<Xs>(xs)...)
    {
    }

    joinable_thread(joinable_thread&&) = default;

    joinable_thread& operator=(joinable_thread&& other)
    {
        if(this != &other)
        {
            // std::thread::operator= terminates if the target is joinable;
            // finish our own thread before taking over the other one.
            if(this->joinable())
                this->join();
            std::thread::operator=(std::move(other));
        }
        return *this;
    }

    ~joinable_thread()
    {
        if(this->joinable())
            this->join();
    }
};
|
||||
|
||||
// Applies functor mF over an NDIM-dimensional index space whose lengths are
// given by Xs..., flattening the space to [0, mN1d) and splitting that flat
// range across worker threads.
template <class F, class... Xs>
struct ParallelTensorFunctor
{
    F mF;
    static constexpr std::size_t NDIM = sizeof...(Xs);
    std::array<std::size_t, NDIM> mLens;
    std::array<std::size_t, NDIM> mStrides; // row-major strides of mLens
    std::size_t mN1d;                       // total number of index points

    ParallelTensorFunctor(F f, Xs... xs) : mF(f), mLens({static_cast<std::size_t>(xs)...})
    {
        // Row-major strides: the innermost (last) dimension is contiguous.
        mStrides.back() = 1;
        std::partial_sum(mLens.rbegin(),
                         mLens.rend() - 1,
                         mStrides.rbegin() + 1,
                         std::multiplies<std::size_t>());
        mN1d = mStrides[0] * mLens[0];
    }

    // Converts a flat index in [0, mN1d) back into an NDIM multi-index.
    std::array<std::size_t, NDIM> GetNdIndices(std::size_t i) const
    {
        std::array<std::size_t, NDIM> indices;

        for(int idim = 0; idim < NDIM; ++idim)
        {
            indices[idim] = i / mStrides[idim];
            i -= indices[idim] * mStrides[idim];
        }

        return indices;
    }

    // Runs mF at every index point, dividing the flat range evenly across
    // num_thread workers; threads join when `threads` goes out of scope
    // (joinable_thread joins in its destructor).
    void operator()(std::size_t num_thread) const
    {
        std::size_t work_per_thread = (mN1d + num_thread - 1) / num_thread;

        std::vector<joinable_thread> threads(num_thread);

        for(std::size_t it = 0; it < num_thread; ++it)
        {
            std::size_t iw_begin = it * work_per_thread;
            std::size_t iw_end = std::min((it + 1) * work_per_thread, mN1d);

            // Capture by value: iw_begin/iw_end are per-thread bounds and the
            // lambda outlives this loop iteration.
            auto f = [=] {
                for(std::size_t iw = iw_begin; iw < iw_end; ++iw)
                {
                    call_f_unpack_args(mF, GetNdIndices(iw));
                }
            };
            threads[it] = joinable_thread(f);
        }
    }
};
|
||||
|
||||
template <class F, class... Xs>
|
||||
auto make_ParallelTensorFunctor(F f, Xs... xs)
|
||||
{
|
||||
return ParallelTensorFunctor<F, Xs...>(f, xs...);
|
||||
}
|
||||
|
||||
// Host-side tensor: a TensorDescriptor (shape + strides) plus an owning
// std::vector<T> sized to the descriptor's element space.
template <class T>
struct Tensor
{
    // Packed layout from a braced list of lengths.
    template <class X>
    Tensor(std::initializer_list<X> lens) : mDesc(lens), mData(mDesc.GetElementSpace())
    {
    }

    // Packed layout from a vector of lengths.
    template <class X>
    Tensor(std::vector<X> lens) : mDesc(lens), mData(mDesc.GetElementSpace())
    {
    }

    // Explicit lengths + strides.
    template <class X, class Y>
    Tensor(std::vector<X> lens, std::vector<Y> strides)
        : mDesc(lens, strides), mData(mDesc.GetElementSpace())
    {
    }

    Tensor(const TensorDescriptor& desc) : mDesc(desc), mData(mDesc.GetElementSpace()) {}

    // Fills the tensor by evaluating g(indices...) at every position, split
    // across num_thread workers. Only ranks 1-4 are supported; anything else
    // throws std::runtime_error.
    template <class G>
    void GenerateTensorValue(G g, std::size_t num_thread = 1)
    {
        switch(mDesc.GetNumOfDimension())
        {
        case 1:
        {
            auto f = [&](auto i) { (*this)(i) = g(i); };
            make_ParallelTensorFunctor(f, mDesc.GetLengths()[0])(num_thread);
            break;
        }
        case 2:
        {
            auto f = [&](auto i0, auto i1) { (*this)(i0, i1) = g(i0, i1); };
            make_ParallelTensorFunctor(f, mDesc.GetLengths()[0], mDesc.GetLengths()[1])(num_thread);
            break;
        }
        case 3:
        {
            auto f = [&](auto i0, auto i1, auto i2) { (*this)(i0, i1, i2) = g(i0, i1, i2); };
            make_ParallelTensorFunctor(
                f, mDesc.GetLengths()[0], mDesc.GetLengths()[1], mDesc.GetLengths()[2])(num_thread);
            break;
        }
        case 4:
        {
            auto f = [&](auto i0, auto i1, auto i2, auto i3) {
                (*this)(i0, i1, i2, i3) = g(i0, i1, i2, i3);
            };
            make_ParallelTensorFunctor(f,
                                       mDesc.GetLengths()[0],
                                       mDesc.GetLengths()[1],
                                       mDesc.GetLengths()[2],
                                       mDesc.GetLengths()[3])(num_thread);
            break;
        }
        default: throw std::runtime_error("unspported dimension"); // (sic)
        }
    }

    // Element access by multi-index; offset computed via the descriptor's
    // strides.
    template <class... Is>
    T& operator()(Is... is)
    {
        return mData[mDesc.GetOffsetFromMultiIndex(is...)];
    }

    template <class... Is>
    const T& operator()(Is... is) const
    {
        return mData[mDesc.GetOffsetFromMultiIndex(is...)];
    }

    // Iteration over the raw (element-space) storage.
    typename std::vector<T>::iterator begin() { return mData.begin(); }

    typename std::vector<T>::iterator end() { return mData.end(); }

    typename std::vector<T>::const_iterator begin() const { return mData.begin(); }

    typename std::vector<T>::const_iterator end() const { return mData.end(); }

    TensorDescriptor mDesc;
    std::vector<T> mData;
};
|
||||
|
||||
#endif
|
||||
0
driver/src/CMakeLists.txt
Normal file
0
driver/src/CMakeLists.txt
Normal file
114
driver/src/device.cpp
Normal file
114
driver/src/device.cpp
Normal file
@@ -0,0 +1,114 @@
|
||||
#include "config.hpp"
|
||||
#include "device.hpp"
|
||||
|
||||
// Allocates mem_size bytes on the active device (HIP or CUDA backend).
DeviceMem::DeviceMem(std::size_t mem_size) : mMemSize(mem_size)
{
#if CK_DEVICE_BACKEND_AMD
    // NOTE(review): hipGetErrorString only stringifies the status and the
    // result is discarded, so allocation failures are silently ignored.
    hipGetErrorString(hipMalloc(static_cast<void**>(&mpDeviceBuf), mMemSize));
#elif CK_DEVICE_BACKEND_NVIDIA
    checkCudaErrors(cudaMalloc(static_cast<void**>(&mpDeviceBuf), mMemSize));
#endif
}

// Non-owning raw pointer to the device allocation.
void* DeviceMem::GetDeviceBuffer() { return mpDeviceBuf; }

// Host -> device copy of the full buffer (mMemSize bytes).
void DeviceMem::ToDevice(const void* p)
{
#if CK_DEVICE_BACKEND_AMD
    hipGetErrorString(
        hipMemcpy(mpDeviceBuf, const_cast<void*>(p), mMemSize, hipMemcpyHostToDevice));
#elif CK_DEVICE_BACKEND_NVIDIA
    checkCudaErrors(
        cudaMemcpy(mpDeviceBuf, const_cast<void*>(p), mMemSize, cudaMemcpyHostToDevice));
#endif
}

// Device -> host copy of the full buffer (mMemSize bytes).
void DeviceMem::FromDevice(void* p)
{
#if CK_DEVICE_BACKEND_AMD
    hipGetErrorString(hipMemcpy(p, mpDeviceBuf, mMemSize, hipMemcpyDeviceToHost));
#elif CK_DEVICE_BACKEND_NVIDIA
    checkCudaErrors(cudaMemcpy(p, mpDeviceBuf, mMemSize, cudaMemcpyDeviceToHost));
#endif
}

// Releases the device allocation.
DeviceMem::~DeviceMem()
{
#if CK_DEVICE_BACKEND_AMD
    hipGetErrorString(hipFree(mpDeviceBuf));
#elif CK_DEVICE_BACKEND_NVIDIA
    checkCudaErrors(cudaFree(mpDeviceBuf));
#endif
}
|
||||
|
||||
// Backend event-based timer: records a start and an end event on the default
// stream (stream 0) and reports the elapsed time between them.
struct KernelTimerImpl
{
    KernelTimerImpl()
    {
#if CK_DEVICE_BACKEND_AMD
        hipEventCreate(&mStart);
        hipEventCreate(&mEnd);
#elif CK_DEVICE_BACKEND_NVIDIA
        cudaEventCreate(&mStart);
        cudaEventCreate(&mEnd);
#endif
    }

    ~KernelTimerImpl()
    {
#if CK_DEVICE_BACKEND_AMD
        hipEventDestroy(mStart);
        hipEventDestroy(mEnd);
#elif CK_DEVICE_BACKEND_NVIDIA
        cudaEventDestroy(mStart);
        cudaEventDestroy(mEnd);
#endif
    }

    // Records the start event on stream 0 (asynchronous).
    void Start()
    {
#if CK_DEVICE_BACKEND_AMD
        hipEventRecord(mStart, 0);
#elif CK_DEVICE_BACKEND_NVIDIA
        cudaEventRecord(mStart, 0);
#endif
    }

    // Records the end event on stream 0 and blocks until it completes.
    void End()
    {
#if CK_DEVICE_BACKEND_AMD
        hipEventRecord(mEnd, 0);
        hipEventSynchronize(mEnd);
#elif CK_DEVICE_BACKEND_NVIDIA
        cudaEventRecord(mEnd, 0);
        cudaEventSynchronize(mEnd);
#endif
    }

    // Elapsed time between the recorded events (milliseconds per the
    // hip/cudaEventElapsedTime API).
    float GetElapsedTime() const
    {
        float time;
#if CK_DEVICE_BACKEND_AMD
        hipEventElapsedTime(&time, mStart, mEnd);
#elif CK_DEVICE_BACKEND_NVIDIA
        cudaEventElapsedTime(&time, mStart, mEnd);
#endif
        return time;
    }

#if CK_DEVICE_BACKEND_AMD
    hipEvent_t mStart, mEnd;
#elif CK_DEVICE_BACKEND_NVIDIA
    cudaEvent_t mStart, mEnd;
#endif
};
|
||||
|
||||
// Pimpl forwarding: KernelTimer owns a KernelTimerImpl and delegates to it.
KernelTimer::KernelTimer() : impl(new KernelTimerImpl()) {}

// Defined here (not in the header) so unique_ptr's deleter sees the complete
// KernelTimerImpl type.
KernelTimer::~KernelTimer() {}

void KernelTimer::Start() { impl->Start(); }

void KernelTimer::End() { impl->End(); }

float KernelTimer::GetElapsedTime() const { return impl->GetElapsedTime(); }
|
||||
@@ -3,9 +3,9 @@
|
||||
#include <initializer_list>
|
||||
#include <cstdlib>
|
||||
#include <stdlib.h>
|
||||
#include "composable_kernel/utility/config.hpp"
|
||||
#include "composable_kernel/tensor_description/ConstantTensorDescriptor.hpp"
|
||||
#include "tensor.hpp"
|
||||
#include "config.hpp"
|
||||
#include "ConstantTensorDescriptor.hpp"
|
||||
#include "device.hpp"
|
||||
#include "conv_common.hpp"
|
||||
#include "device_convolution_direct_v2_nchw_kcyx_nkhw.hpp"
|
||||
#include "device_convolution_implicit_gemm_v1_chwn_cyxk_khwn.hpp"
|
||||
45
driver/src/tensor.cpp
Normal file
45
driver/src/tensor.cpp
Normal file
@@ -0,0 +1,45 @@
|
||||
#include <boost/range/adaptor/transformed.hpp>
|
||||
#include <cassert>
|
||||
|
||||
#include "tensor.hpp"
|
||||
|
||||
// Packed layout: derive row-major strides from the given lengths.
TensorDescriptor::TensorDescriptor(std::initializer_list<std::size_t> lens) : mLens(lens)
{
    this->CalculateStrides();
}

// Explicit lengths + strides; no validation that the two sizes match here
// (GetElementSize asserts it later).
TensorDescriptor::TensorDescriptor(std::vector<std::size_t> lens, std::vector<std::size_t> strides)
    : mLens(lens), mStrides(strides)
{
}
|
||||
|
||||
// Recomputes mStrides for a dense row-major layout: the last dimension is
// contiguous (stride 1) and each earlier stride is the product of all
// faster-varying lengths.
void TensorDescriptor::CalculateStrides()
{
    mStrides.clear();
    mStrides.resize(mLens.size(), 0);
    // Rank-0 descriptor: nothing to compute.
    if(mStrides.empty())
        return;

    mStrides.back() = 1;
    // Reverse partial product of the lengths fills the remaining strides.
    std::partial_sum(
        mLens.rbegin(), mLens.rend() - 1, mStrides.rbegin() + 1, std::multiplies<std::size_t>());
}
|
||||
|
||||
std::size_t TensorDescriptor::GetNumOfDimension() const { return mLens.size(); }

// Number of logical elements: product of all lengths (1 for an empty rank).
std::size_t TensorDescriptor::GetElementSize() const
{
    assert(mLens.size() == mStrides.size());
    return std::accumulate(
        mLens.begin(), mLens.end(), std::size_t{1}, std::multiplies<std::size_t>());
}

// Smallest buffer size (in elements) that contains the tensor:
// 1 + sum((len - 1) * stride), which is correct for strided/padded layouts.
// NOTE(review): for a rank-0 descriptor this returns 1 — confirm callers
// expect that rather than 0.
std::size_t TensorDescriptor::GetElementSpace() const
{
    auto ls = mLens | boost::adaptors::transformed([](std::size_t v) { return v - 1; });
    return std::inner_product(ls.begin(), ls.end(), mStrides.begin(), std::size_t{0}) + 1;
}

const std::vector<std::size_t>& TensorDescriptor::GetLengths() const { return mLens; }

const std::vector<std::size_t>& TensorDescriptor::GetStrides() const { return mStrides; }
|
||||
Reference in New Issue
Block a user