mirror of
https://github.com/ROCm/composable_kernel.git
synced 2026-05-12 09:16:52 +00:00
Add host API (#220)
* Add host API * manually rebase on develop * clean * manually rebase on develop * exclude tests from all target * address review comments * update client app name * fix missing lib name * clang-format update * refactor * refactor * refactor * refactor * refactor * fix test issue * refactor * refactor * refactor * update cmake and readme Co-authored-by: Chao Liu <chao.liu2@amd.com>
This commit is contained in:
54
library/include/ck/library/host/host_interface.hpp
Normal file
54
library/include/ck/library/host/host_interface.hpp
Normal file
@@ -0,0 +1,54 @@
|
||||
#pragma once

#include <cstddef>
#include <memory>
#include <string>
#include <vector>

#include "stream_config.hpp"
#include "config.hpp"
#include "device_base.hpp"
|
||||
|
||||
struct DeviceConvFwdPtr_t
|
||||
{
|
||||
using BaseArgument = ck::tensor_operation::device::BaseArgument;
|
||||
using BaseInvoker = ck::tensor_operation::device::BaseInvoker;
|
||||
|
||||
struct DeviceConvFwdPtrImpl;
|
||||
std::unique_ptr<DeviceConvFwdPtrImpl> pImpl;
|
||||
DeviceConvFwdPtr_t();
|
||||
~DeviceConvFwdPtr_t();
|
||||
DeviceConvFwdPtr_t(DeviceConvFwdPtr_t&&);
|
||||
DeviceConvFwdPtr_t(DeviceConvFwdPtrImpl&);
|
||||
DeviceConvFwdPtr_t& operator=(DeviceConvFwdPtr_t&) = delete;
|
||||
DeviceConvFwdPtr_t& operator=(const DeviceConvFwdPtr_t&) = delete;
|
||||
std::unique_ptr<BaseArgument>
|
||||
MakeArgumentPointer(void* in_ptr,
|
||||
void* wei_ptr,
|
||||
void* out_ptr,
|
||||
size_t N,
|
||||
size_t K,
|
||||
size_t C,
|
||||
std::vector<ck::index_t> input_spatial_lengths,
|
||||
std::vector<ck::index_t> filter_spatial_lengths,
|
||||
std::vector<ck::index_t> output_spatial_lengths,
|
||||
std::vector<ck::index_t> conv_filter_strides,
|
||||
std::vector<ck::index_t> conv_filter_dilations,
|
||||
std::vector<ck::index_t> input_left_pads,
|
||||
std::vector<ck::index_t> input_right_pads)
|
||||
const; // in,wei and out element ops are ignored for now since even if we change them, they
|
||||
// cant be linked
|
||||
std::unique_ptr<BaseInvoker>
|
||||
MakeInvokerPointer() const; // requires including BaseInvoker headers
|
||||
std::string GetTypeString();
|
||||
bool IsSupportedArgument(const BaseArgument* arg_ptr);
|
||||
};
|
||||
|
||||
// Instance factories implemented inside the library: each call appends every
// available XDL conv2d-forward instance (NHWC input / KYXC weight / NHWK
// output layout) of the named datatype to `instances`.

void add_device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_f32_instances_t(
    std::vector<DeviceConvFwdPtr_t>& instances);

void add_device_conv2d_fwd_xdl_c_shuffle_nhwc_kyxc_nhwk_f16_instances_t(
    std::vector<DeviceConvFwdPtr_t>& instances);

void add_device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_bf16_instances_t(
    std::vector<DeviceConvFwdPtr_t>& instances);

void add_device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_f16_instances_t(
    std::vector<DeviceConvFwdPtr_t>& instances);

void add_device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_int8_instances_t(
    std::vector<DeviceConvFwdPtr_t>& instances);
|
||||
@@ -1,12 +1,25 @@
|
||||
#ifndef DEVICE_HPP
|
||||
#define DEVICE_HPP
|
||||
#pragma once
|
||||
|
||||
#include <memory>
|
||||
#include <functional>
|
||||
#include <thread>
|
||||
#include <chrono>
|
||||
#include "hip/hip_runtime.h"
|
||||
#include "hip/hip_fp16.h"
|
||||
#include <hip/hip_runtime.h>
|
||||
#include <hip/hip_fp16.h>
|
||||
|
||||
#include "stream_config.hpp"
|
||||
#include "ck/options.hpp"
|
||||
|
||||
/// Throws std::runtime_error with a descriptive message when a HIP runtime
/// call did not return hipSuccess; no-op otherwise.
///
/// NOTE(review): __FILE__/__LINE__/__func__ expand at this helper's location,
/// not the failing call site — a macro wrapper would be needed to report the
/// caller; left as-is to keep the interface unchanged.
inline void hip_check_error(hipError_t x)
{
    if(x != hipSuccess)
    {
        std::ostringstream ss;
        // Space before "in function" added: the message previously rendered
        // as e.g. "...: 42in function: ...".
        ss << "HIP runtime error: " << hipGetErrorString(x) << ". " << __FILE__ << ": "
           << __LINE__ << " in function: " << __func__;
        throw std::runtime_error(ss.str());
    }
}
|
||||
|
||||
struct DeviceMem
|
||||
{
|
||||
@@ -36,49 +49,59 @@ struct KernelTimer
|
||||
std::unique_ptr<KernelTimerImpl> impl;
|
||||
};
|
||||
|
||||
using device_stream_t = hipStream_t;
|
||||
|
||||
template <typename... Args, typename F>
|
||||
void launch_kernel(F kernel, dim3 grid_dim, dim3 block_dim, std::size_t lds_byte, Args... args)
|
||||
float launch_and_time_kernel(const StreamConfig& stream_config,
|
||||
F kernel,
|
||||
dim3 grid_dim,
|
||||
dim3 block_dim,
|
||||
std::size_t lds_byte,
|
||||
Args... args)
|
||||
{
|
||||
hipStream_t stream_id = nullptr;
|
||||
|
||||
hipLaunchKernelGGL(kernel, grid_dim, block_dim, lds_byte, stream_id, args...);
|
||||
}
|
||||
|
||||
template <typename... Args, typename F>
|
||||
float launch_and_time_kernel(
|
||||
F kernel, int nrepeat, dim3 grid_dim, dim3 block_dim, std::size_t lds_byte, Args... args)
|
||||
{
|
||||
KernelTimer timer;
|
||||
|
||||
printf("%s: grid_dim {%d, %d, %d}, block_dim {%d, %d, %d} \n",
|
||||
__func__,
|
||||
grid_dim.x,
|
||||
grid_dim.y,
|
||||
grid_dim.z,
|
||||
block_dim.x,
|
||||
block_dim.y,
|
||||
block_dim.z);
|
||||
|
||||
printf("Warm up\n");
|
||||
|
||||
hipStream_t stream_id = nullptr;
|
||||
|
||||
// warm up
|
||||
hipLaunchKernelGGL(kernel, grid_dim, block_dim, lds_byte, stream_id, args...);
|
||||
|
||||
printf("Start running %d times...\n", nrepeat);
|
||||
|
||||
timer.Start();
|
||||
|
||||
for(int i = 0; i < nrepeat; ++i)
|
||||
#if CK_TIME_KERNEL
|
||||
if(stream_config.time_kernel_)
|
||||
{
|
||||
hipLaunchKernelGGL(kernel, grid_dim, block_dim, lds_byte, stream_id, args...);
|
||||
printf("%s: grid_dim {%d, %d, %d}, block_dim {%d, %d, %d} \n",
|
||||
__func__,
|
||||
grid_dim.x,
|
||||
grid_dim.y,
|
||||
grid_dim.z,
|
||||
block_dim.x,
|
||||
block_dim.y,
|
||||
block_dim.z);
|
||||
|
||||
const int nrepeat = 10;
|
||||
|
||||
printf("Warm up 1 time\n");
|
||||
|
||||
// warm up
|
||||
hipLaunchKernelGGL(
|
||||
kernel, grid_dim, block_dim, lds_byte, stream_config.stream_id_, args...);
|
||||
|
||||
printf("Start running %d times...\n", nrepeat);
|
||||
|
||||
KernelTimer timer;
|
||||
timer.Start();
|
||||
|
||||
for(int i = 0; i < nrepeat; ++i)
|
||||
{
|
||||
hipLaunchKernelGGL(
|
||||
kernel, grid_dim, block_dim, lds_byte, stream_config.stream_id_, args...);
|
||||
}
|
||||
|
||||
timer.End();
|
||||
|
||||
return timer.GetElapsedTime() / nrepeat;
|
||||
}
|
||||
else
|
||||
{
|
||||
hipLaunchKernelGGL(
|
||||
kernel, grid_dim, block_dim, lds_byte, stream_config.stream_id_, args...);
|
||||
|
||||
timer.End();
|
||||
return 0;
|
||||
}
|
||||
#else
|
||||
hipLaunchKernelGGL(kernel, grid_dim, block_dim, lds_byte, stream_config.stream_id_, args...);
|
||||
|
||||
return timer.GetElapsedTime() / nrepeat;
|
||||
}
|
||||
return 0;
|
||||
#endif
|
||||
}
|
||||
|
||||
@@ -84,7 +84,8 @@ struct ReferenceBatchedGemm : public device::BaseOperator
|
||||
return 0;
|
||||
}
|
||||
|
||||
/// Type-erased entry point: downcasts and delegates to the typed Run.
/// The StreamConfig is unused by this host reference implementation.
float Run(const device::BaseArgument* p_arg,
          const StreamConfig& /* stream_config */ = StreamConfig{}) override
{
    // Reference cast throws std::bad_cast on a mismatched argument type
    // instead of dereferencing a null pointer (UB) as the original did.
    return Run(dynamic_cast<const Argument&>(*p_arg));
}
|
||||
|
||||
@@ -121,7 +121,8 @@ struct ReferenceConvBwdWeight : public device::BaseOperator
|
||||
return 0;
|
||||
}
|
||||
|
||||
/// Type-erased entry point: downcasts and delegates to the typed Run.
/// The StreamConfig is unused by this host reference implementation.
float Run(const device::BaseArgument* p_arg,
          const StreamConfig& /*stream_config*/ = StreamConfig{}) override
{
    // Reference cast throws std::bad_cast on a mismatched argument type
    // instead of dereferencing a null pointer (UB) as the original did.
    return Run(dynamic_cast<const Argument&>(*p_arg));
}
|
||||
|
||||
@@ -291,7 +291,8 @@ struct ReferenceConvBwdData : public device::BaseOperator
|
||||
}
|
||||
}
|
||||
|
||||
/// Type-erased entry point: downcasts and delegates to the typed Run.
/// The StreamConfig is unused by this host reference implementation.
float Run(const device::BaseArgument* p_arg,
          const StreamConfig& /* stream_config */ = StreamConfig{}) override
{
    // Reference cast throws std::bad_cast on a mismatched argument type
    // instead of dereferencing a null pointer (UB) as the original did.
    return Run(dynamic_cast<const Argument&>(*p_arg));
}
|
||||
|
||||
@@ -1,9 +1,10 @@
|
||||
#ifndef REFERENCE_CONV_FWD_HPP
|
||||
#define REFERENCE_CONV_FWD_HPP
|
||||
#pragma once
|
||||
|
||||
#include <iostream>
|
||||
#include <type_traits>
|
||||
#include <sstream>
|
||||
|
||||
#include "stream_config.hpp"
|
||||
#include "device_base.hpp"
|
||||
#include "host_tensor.hpp"
|
||||
|
||||
@@ -251,7 +252,8 @@ struct ReferenceConvFwd : public device::BaseOperator
|
||||
}
|
||||
}
|
||||
|
||||
/// Type-erased entry point: downcasts and delegates to the typed Run.
/// The StreamConfig is unused by this host reference implementation.
float Run(const device::BaseArgument* p_arg,
          const StreamConfig& /*stream_config*/ = StreamConfig{}) override
{
    // Reference cast throws std::bad_cast on a mismatched argument type
    // instead of dereferencing a null pointer (UB) as the original did.
    return Run(dynamic_cast<const Argument&>(*p_arg));
}
|
||||
@@ -311,4 +313,3 @@ struct ReferenceConvFwd : public device::BaseOperator
|
||||
} // namespace host
|
||||
} // namespace tensor_operation
|
||||
} // namespace ck
|
||||
#endif
|
||||
|
||||
@@ -124,7 +124,8 @@ struct ReferenceConvFwd_Bias_Activation : public device::BaseOperator
|
||||
return 0;
|
||||
}
|
||||
|
||||
/// Type-erased entry point: downcasts and delegates to the typed Run.
/// The StreamConfig is unused by this host reference implementation.
float Run(const device::BaseArgument* p_arg,
          const StreamConfig& /* stream_config */ = StreamConfig{}) override
{
    // Reference cast throws std::bad_cast on a mismatched argument type
    // instead of dereferencing a null pointer (UB) as the original did.
    return Run(dynamic_cast<const Argument&>(*p_arg));
}
|
||||
|
||||
@@ -130,7 +130,8 @@ struct ReferenceConvFwd_Bias_Activation_Add : public device::BaseOperator
|
||||
return 0;
|
||||
}
|
||||
|
||||
/// Type-erased entry point: downcasts and delegates to the typed Run.
/// The StreamConfig is unused by this host reference implementation.
float Run(const device::BaseArgument* p_arg,
          const StreamConfig& /*stream_config*/ = StreamConfig{}) override
{
    // Reference cast throws std::bad_cast on a mismatched argument type
    // instead of dereferencing a null pointer (UB) as the original did.
    return Run(dynamic_cast<const Argument&>(*p_arg));
}
|
||||
|
||||
@@ -80,7 +80,8 @@ struct ReferenceGemm : public device::BaseOperator
|
||||
return 0;
|
||||
}
|
||||
|
||||
/// Type-erased entry point: downcasts and delegates to the typed Run.
/// The StreamConfig is unused by this host reference implementation.
float Run(const device::BaseArgument* p_arg,
          const StreamConfig& /* stream_config */ = StreamConfig{}) override
{
    // Reference cast throws std::bad_cast on a mismatched argument type
    // instead of dereferencing a null pointer (UB) as the original did.
    return Run(dynamic_cast<const Argument&>(*p_arg));
}
|
||||
|
||||
@@ -82,7 +82,8 @@ struct ReferenceGemmBias2D : public device::BaseOperator
|
||||
return 0;
|
||||
}
|
||||
|
||||
/// Type-erased entry point: downcasts and delegates to the typed Run.
/// The StreamConfig is unused by this host reference implementation.
float Run(const device::BaseArgument* p_arg,
          const StreamConfig& /* stream_config */ = StreamConfig{}) override
{
    // Reference cast throws std::bad_cast on a mismatched argument type
    // instead of dereferencing a null pointer (UB) as the original did.
    return Run(dynamic_cast<const Argument&>(*p_arg));
}
|
||||
|
||||
@@ -85,7 +85,8 @@ struct ReferenceGemmBiasActivation : public device::BaseOperator
|
||||
return 0;
|
||||
}
|
||||
|
||||
/// Type-erased entry point: downcasts and delegates to the typed Run.
/// The StreamConfig is unused by this host reference implementation.
float Run(const device::BaseArgument* p_arg,
          const StreamConfig& /* stream_config */ = StreamConfig{}) override
{
    // Reference cast throws std::bad_cast on a mismatched argument type
    // instead of dereferencing a null pointer (UB) as the original did.
    return Run(dynamic_cast<const Argument&>(*p_arg));
}
|
||||
|
||||
@@ -91,7 +91,8 @@ struct ReferenceGemmBiasActivationAdd : public device::BaseOperator
|
||||
return 0;
|
||||
}
|
||||
|
||||
/// Type-erased entry point: downcasts and delegates to the typed Run.
/// The StreamConfig is unused by this host reference implementation.
float Run(const device::BaseArgument* p_arg,
          const StreamConfig& /* stream_config */ = StreamConfig{}) override
{
    // Reference cast throws std::bad_cast on a mismatched argument type
    // instead of dereferencing a null pointer (UB) as the original did.
    return Run(dynamic_cast<const Argument&>(*p_arg));
}
|
||||
|
||||
@@ -128,7 +128,7 @@ class OpInstanceRunEngine
|
||||
|
||||
template <typename OpInstancePtr>
|
||||
ProfileBestConfig Profile(const std::vector<OpInstancePtr>& op_ptrs,
|
||||
int nrepeat = 100,
|
||||
bool time_kernel = false,
|
||||
bool do_verification = false,
|
||||
bool do_log = false)
|
||||
{
|
||||
@@ -143,7 +143,7 @@ class OpInstanceRunEngine
|
||||
if(op_ptr->IsSupportedArgument(argument.get()))
|
||||
{
|
||||
std::string op_name = op_ptr->GetTypeString();
|
||||
float avg_time = invoker->Run(argument.get(), nrepeat);
|
||||
float avg_time = invoker->Run(argument.get(), StreamConfig{nullptr, time_kernel});
|
||||
|
||||
std::size_t flops = op_instance_.GetFlops();
|
||||
std::size_t num_btype = op_instance_.GetBtype();
|
||||
|
||||
Reference in New Issue
Block a user