Add host API (#220)

* Add host API

* manually rebase on develop

* clean

* manually rebase on develop

* exclude tests from all target

* address review comments

* update client app name

* fix missing lib name

* clang-format update

* refactor

* refactor

* refactor

* refactor

* refactor

* fix test issue

* refactor

* refactor

* refactor

* upate cmake and readme

Co-authored-by: Chao Liu <chao.liu2@amd.com>
This commit is contained in:
JD
2022-05-12 09:21:01 -05:00
committed by GitHub
parent 0f912e205e
commit cec69bc3bc
131 changed files with 1666 additions and 1089 deletions

View File

@@ -0,0 +1,54 @@
#pragma once
#include <memory>
#include <string>
#include "stream_config.hpp"
#include "config.hpp"
#include "device_base.hpp"
struct DeviceConvFwdPtr_t
{
using BaseArgument = ck::tensor_operation::device::BaseArgument;
using BaseInvoker = ck::tensor_operation::device::BaseInvoker;
struct DeviceConvFwdPtrImpl;
std::unique_ptr<DeviceConvFwdPtrImpl> pImpl;
DeviceConvFwdPtr_t();
~DeviceConvFwdPtr_t();
DeviceConvFwdPtr_t(DeviceConvFwdPtr_t&&);
DeviceConvFwdPtr_t(DeviceConvFwdPtrImpl&);
DeviceConvFwdPtr_t& operator=(DeviceConvFwdPtr_t&) = delete;
DeviceConvFwdPtr_t& operator=(const DeviceConvFwdPtr_t&) = delete;
std::unique_ptr<BaseArgument>
MakeArgumentPointer(void* in_ptr,
void* wei_ptr,
void* out_ptr,
size_t N,
size_t K,
size_t C,
std::vector<ck::index_t> input_spatial_lengths,
std::vector<ck::index_t> filter_spatial_lengths,
std::vector<ck::index_t> output_spatial_lengths,
std::vector<ck::index_t> conv_filter_strides,
std::vector<ck::index_t> conv_filter_dilations,
std::vector<ck::index_t> input_left_pads,
std::vector<ck::index_t> input_right_pads)
const; // in,wei and out element ops are ignored for now since even if we change them, they
// cant be linked
std::unique_ptr<BaseInvoker>
MakeInvokerPointer() const; // requires including BaseInvoker headers
std::string GetTypeString();
bool IsSupportedArgument(const BaseArgument* arg_ptr);
};
void add_device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_f32_instances_t(
std::vector<DeviceConvFwdPtr_t>& instances);
void add_device_conv2d_fwd_xdl_c_shuffle_nhwc_kyxc_nhwk_f16_instances_t(
std::vector<DeviceConvFwdPtr_t>& instances);
void add_device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_bf16_instances_t(
std::vector<DeviceConvFwdPtr_t>& instances);
void add_device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_f16_instances_t(
std::vector<DeviceConvFwdPtr_t>& instances);
void add_device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_int8_instances_t(
std::vector<DeviceConvFwdPtr_t>& instances);

View File

@@ -1,12 +1,25 @@
#ifndef DEVICE_HPP
#define DEVICE_HPP
#pragma once
#include <memory>
#include <functional>
#include <thread>
#include <chrono>
#include "hip/hip_runtime.h"
#include "hip/hip_fp16.h"
#include <hip/hip_runtime.h>
#include <hip/hip_fp16.h>
#include "stream_config.hpp"
#include "ck/options.hpp"
inline void hip_check_error(hipError_t x)
{
if(x != hipSuccess)
{
std::ostringstream ss;
ss << "HIP runtime error: " << hipGetErrorString(x) << ". " << __FILE__ << ": " << __LINE__
<< "in function: " << __func__;
throw std::runtime_error(ss.str());
}
}
struct DeviceMem
{
@@ -36,49 +49,59 @@ struct KernelTimer
std::unique_ptr<KernelTimerImpl> impl;
};
using device_stream_t = hipStream_t;
template <typename... Args, typename F>
void launch_kernel(F kernel, dim3 grid_dim, dim3 block_dim, std::size_t lds_byte, Args... args)
float launch_and_time_kernel(const StreamConfig& stream_config,
F kernel,
dim3 grid_dim,
dim3 block_dim,
std::size_t lds_byte,
Args... args)
{
hipStream_t stream_id = nullptr;
hipLaunchKernelGGL(kernel, grid_dim, block_dim, lds_byte, stream_id, args...);
}
template <typename... Args, typename F>
float launch_and_time_kernel(
F kernel, int nrepeat, dim3 grid_dim, dim3 block_dim, std::size_t lds_byte, Args... args)
{
KernelTimer timer;
printf("%s: grid_dim {%d, %d, %d}, block_dim {%d, %d, %d} \n",
__func__,
grid_dim.x,
grid_dim.y,
grid_dim.z,
block_dim.x,
block_dim.y,
block_dim.z);
printf("Warm up\n");
hipStream_t stream_id = nullptr;
// warm up
hipLaunchKernelGGL(kernel, grid_dim, block_dim, lds_byte, stream_id, args...);
printf("Start running %d times...\n", nrepeat);
timer.Start();
for(int i = 0; i < nrepeat; ++i)
#if CK_TIME_KERNEL
if(stream_config.time_kernel_)
{
hipLaunchKernelGGL(kernel, grid_dim, block_dim, lds_byte, stream_id, args...);
printf("%s: grid_dim {%d, %d, %d}, block_dim {%d, %d, %d} \n",
__func__,
grid_dim.x,
grid_dim.y,
grid_dim.z,
block_dim.x,
block_dim.y,
block_dim.z);
const int nrepeat = 10;
printf("Warm up 1 time\n");
// warm up
hipLaunchKernelGGL(
kernel, grid_dim, block_dim, lds_byte, stream_config.stream_id_, args...);
printf("Start running %d times...\n", nrepeat);
KernelTimer timer;
timer.Start();
for(int i = 0; i < nrepeat; ++i)
{
hipLaunchKernelGGL(
kernel, grid_dim, block_dim, lds_byte, stream_config.stream_id_, args...);
}
timer.End();
return timer.GetElapsedTime() / nrepeat;
}
else
{
hipLaunchKernelGGL(
kernel, grid_dim, block_dim, lds_byte, stream_config.stream_id_, args...);
timer.End();
return 0;
}
#else
hipLaunchKernelGGL(kernel, grid_dim, block_dim, lds_byte, stream_config.stream_id_, args...);
return timer.GetElapsedTime() / nrepeat;
}
return 0;
#endif
}

View File

@@ -84,7 +84,8 @@ struct ReferenceBatchedGemm : public device::BaseOperator
return 0;
}
float Run(const device::BaseArgument* p_arg, int) override
float Run(const device::BaseArgument* p_arg,
const StreamConfig& /* stream_config */ = StreamConfig{}) override
{
return Run(*dynamic_cast<const Argument*>(p_arg));
}

View File

@@ -121,7 +121,8 @@ struct ReferenceConvBwdWeight : public device::BaseOperator
return 0;
}
float Run(const device::BaseArgument* p_arg, int) override
float Run(const device::BaseArgument* p_arg,
const StreamConfig& /*stream_config*/ = StreamConfig{}) override
{
return Run(*dynamic_cast<const Argument*>(p_arg));
}

View File

@@ -291,7 +291,8 @@ struct ReferenceConvBwdData : public device::BaseOperator
}
}
float Run(const device::BaseArgument* p_arg, int) override
float Run(const device::BaseArgument* p_arg,
const StreamConfig& /* stream_config */ = StreamConfig{}) override
{
return Run(*dynamic_cast<const Argument*>(p_arg));
}

View File

@@ -1,9 +1,10 @@
#ifndef REFERENCE_CONV_FWD_HPP
#define REFERENCE_CONV_FWD_HPP
#pragma once
#include <iostream>
#include <type_traits>
#include <sstream>
#include "stream_config.hpp"
#include "device_base.hpp"
#include "host_tensor.hpp"
@@ -251,7 +252,8 @@ struct ReferenceConvFwd : public device::BaseOperator
}
}
float Run(const device::BaseArgument* p_arg, int) override
float Run(const device::BaseArgument* p_arg,
const StreamConfig& /*stream_config*/ = StreamConfig{}) override
{
return Run(*dynamic_cast<const Argument*>(p_arg));
}
@@ -311,4 +313,3 @@ struct ReferenceConvFwd : public device::BaseOperator
} // namespace host
} // namespace tensor_operation
} // namespace ck
#endif

View File

@@ -124,7 +124,8 @@ struct ReferenceConvFwd_Bias_Activation : public device::BaseOperator
return 0;
}
float Run(const device::BaseArgument* p_arg, int) override
float Run(const device::BaseArgument* p_arg,
const StreamConfig& /* stream_config */ = StreamConfig{}) override
{
return Run(*dynamic_cast<const Argument*>(p_arg));
}

View File

@@ -130,7 +130,8 @@ struct ReferenceConvFwd_Bias_Activation_Add : public device::BaseOperator
return 0;
}
float Run(const device::BaseArgument* p_arg, int) override
float Run(const device::BaseArgument* p_arg,
const StreamConfig& /*stream_config*/ = StreamConfig{}) override
{
return Run(*dynamic_cast<const Argument*>(p_arg));
}

View File

@@ -80,7 +80,8 @@ struct ReferenceGemm : public device::BaseOperator
return 0;
}
float Run(const device::BaseArgument* p_arg, int) override
float Run(const device::BaseArgument* p_arg,
const StreamConfig& /* stream_config */ = StreamConfig{}) override
{
return Run(*dynamic_cast<const Argument*>(p_arg));
}

View File

@@ -82,7 +82,8 @@ struct ReferenceGemmBias2D : public device::BaseOperator
return 0;
}
float Run(const device::BaseArgument* p_arg, int) override
float Run(const device::BaseArgument* p_arg,
const StreamConfig& /* stream_config */ = StreamConfig{}) override
{
return Run(*dynamic_cast<const Argument*>(p_arg));
}

View File

@@ -85,7 +85,8 @@ struct ReferenceGemmBiasActivation : public device::BaseOperator
return 0;
}
float Run(const device::BaseArgument* p_arg, int) override
float Run(const device::BaseArgument* p_arg,
const StreamConfig& /* stream_config */ = StreamConfig{}) override
{
return Run(*dynamic_cast<const Argument*>(p_arg));
}

View File

@@ -91,7 +91,8 @@ struct ReferenceGemmBiasActivationAdd : public device::BaseOperator
return 0;
}
float Run(const device::BaseArgument* p_arg, int) override
float Run(const device::BaseArgument* p_arg,
const StreamConfig& /* stream_config */ = StreamConfig{}) override
{
return Run(*dynamic_cast<const Argument*>(p_arg));
}

View File

@@ -128,7 +128,7 @@ class OpInstanceRunEngine
template <typename OpInstancePtr>
ProfileBestConfig Profile(const std::vector<OpInstancePtr>& op_ptrs,
int nrepeat = 100,
bool time_kernel = false,
bool do_verification = false,
bool do_log = false)
{
@@ -143,7 +143,7 @@ class OpInstanceRunEngine
if(op_ptr->IsSupportedArgument(argument.get()))
{
std::string op_name = op_ptr->GetTypeString();
float avg_time = invoker->Run(argument.get(), nrepeat);
float avg_time = invoker->Run(argument.get(), StreamConfig{nullptr, time_kernel});
std::size_t flops = op_instance_.GetFlops();
std::size_t num_btype = op_instance_.GetBtype();