mirror of
https://github.com/ROCm/composable_kernel.git
synced 2026-05-12 01:10:17 +00:00
Add host API (#220)
* Add host API * manually rebase on develop * clean * manually rebase on develop * exclude tests from all target * address review comments * update client app name * fix missing lib name * clang-format update * refactor * refactor * refactor * refactor * refactor * fix test issue * refactor * refactor * refactor * update cmake and readme Co-authored-by: Chao Liu <chao.liu2@amd.com>
This commit is contained in:
@@ -10,10 +10,31 @@ set(HOST_TENSOR_SOURCE
|
||||
host_tensor.cpp
|
||||
)
|
||||
|
||||
add_library(host_tensor SHARED ${HOST_TENSOR_SOURCE})
|
||||
add_library(host_tensor STATIC ${HOST_TENSOR_SOURCE})
|
||||
add_library(composable_kernel::host_tensor ALIAS host_tensor)
|
||||
|
||||
target_compile_features(host_tensor PUBLIC)
|
||||
set_target_properties(host_tensor PROPERTIES POSITION_INDEPENDENT_CODE ON)
|
||||
target_include_directories(host_tensor SYSTEM PUBLIC $<BUILD_INTERFACE:${HALF_INCLUDE_DIR}>)
|
||||
install(TARGETS host_tensor LIBRARY DESTINATION lib)
|
||||
|
||||
target_include_directories(host_tensor PUBLIC
|
||||
"$<INSTALL_INTERFACE:${CMAKE_INSTALL_INCLUDEDIR}/ck>"
|
||||
"$<INSTALL_INTERFACE:${CMAKE_INSTALL_INCLUDEDIR}/ck/utility>"
|
||||
"$<INSTALL_INTERFACE:${CMAKE_INSTALL_INCLUDEDIR}/ck/library/host_tensor>"
|
||||
)
|
||||
|
||||
install(TARGETS host_tensor
|
||||
EXPORT host_tensorTargets
|
||||
LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR}
|
||||
ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR}
|
||||
RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR}
|
||||
INCLUDES DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}
|
||||
)
|
||||
|
||||
install(EXPORT host_tensorTargets
|
||||
FILE composable_kernelhost_tensorTargets.cmake
|
||||
NAMESPACE composable_kernel::
|
||||
DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/composable_kernel
|
||||
)
|
||||
|
||||
clang_tidy_check(host_tensor)
|
||||
|
||||
@@ -2,7 +2,7 @@
|
||||
|
||||
DeviceMem::DeviceMem(std::size_t mem_size) : mMemSize(mem_size)
|
||||
{
|
||||
hipGetErrorString(hipMalloc(static_cast<void**>(&mpDeviceBuf), mMemSize));
|
||||
hip_check_error(hipMalloc(static_cast<void**>(&mpDeviceBuf), mMemSize));
|
||||
}
|
||||
|
||||
void* DeviceMem::GetDeviceBuffer() { return mpDeviceBuf; }
|
||||
@@ -11,49 +11,48 @@ std::size_t DeviceMem::GetBufferSize() { return mMemSize; }
|
||||
|
||||
void DeviceMem::ToDevice(const void* p)
|
||||
{
|
||||
hipGetErrorString(
|
||||
hipMemcpy(mpDeviceBuf, const_cast<void*>(p), mMemSize, hipMemcpyHostToDevice));
|
||||
hip_check_error(hipMemcpy(mpDeviceBuf, const_cast<void*>(p), mMemSize, hipMemcpyHostToDevice));
|
||||
}
|
||||
|
||||
void DeviceMem::FromDevice(void* p)
|
||||
{
|
||||
hipGetErrorString(hipMemcpy(p, mpDeviceBuf, mMemSize, hipMemcpyDeviceToHost));
|
||||
hip_check_error(hipMemcpy(p, mpDeviceBuf, mMemSize, hipMemcpyDeviceToHost));
|
||||
}
|
||||
|
||||
void DeviceMem::SetZero() { hipGetErrorString(hipMemset(mpDeviceBuf, 0, mMemSize)); }
|
||||
void DeviceMem::SetZero() { hip_check_error(hipMemset(mpDeviceBuf, 0, mMemSize)); }
|
||||
|
||||
DeviceMem::~DeviceMem() { hipGetErrorString(hipFree(mpDeviceBuf)); }
|
||||
DeviceMem::~DeviceMem() { hip_check_error(hipFree(mpDeviceBuf)); }
|
||||
|
||||
struct KernelTimerImpl
|
||||
{
|
||||
KernelTimerImpl()
|
||||
{
|
||||
hipGetErrorString(hipEventCreate(&mStart));
|
||||
hipGetErrorString(hipEventCreate(&mEnd));
|
||||
hip_check_error(hipEventCreate(&mStart));
|
||||
hip_check_error(hipEventCreate(&mEnd));
|
||||
}
|
||||
|
||||
~KernelTimerImpl()
|
||||
{
|
||||
hipGetErrorString(hipEventDestroy(mStart));
|
||||
hipGetErrorString(hipEventDestroy(mEnd));
|
||||
hip_check_error(hipEventDestroy(mStart));
|
||||
hip_check_error(hipEventDestroy(mEnd));
|
||||
}
|
||||
|
||||
void Start()
|
||||
{
|
||||
hipGetErrorString(hipDeviceSynchronize());
|
||||
hipGetErrorString(hipEventRecord(mStart, nullptr));
|
||||
hip_check_error(hipDeviceSynchronize());
|
||||
hip_check_error(hipEventRecord(mStart, nullptr));
|
||||
}
|
||||
|
||||
void End()
|
||||
{
|
||||
hipGetErrorString(hipEventRecord(mEnd, nullptr));
|
||||
hipGetErrorString(hipEventSynchronize(mEnd));
|
||||
hip_check_error(hipEventRecord(mEnd, nullptr));
|
||||
hip_check_error(hipEventSynchronize(mEnd));
|
||||
}
|
||||
|
||||
float GetElapsedTime() const
|
||||
{
|
||||
float time;
|
||||
hipGetErrorString(hipEventElapsedTime(&time, mStart, mEnd));
|
||||
hip_check_error(hipEventElapsedTime(&time, mStart, mEnd));
|
||||
return time;
|
||||
}
|
||||
|
||||
|
||||
@@ -11,6 +11,7 @@ include_directories(BEFORE
|
||||
${PROJECT_SOURCE_DIR}/include/ck/tensor_operation/gpu/thread
|
||||
${PROJECT_SOURCE_DIR}/include/ck/tensor_operation/gpu/element
|
||||
${PROJECT_SOURCE_DIR}/library/include/ck/library/host_tensor
|
||||
${PROJECT_SOURCE_DIR}/library/include/ck/library/host
|
||||
${PROJECT_SOURCE_DIR}/library/include/ck/library/tensor_operation_instance
|
||||
${PROJECT_SOURCE_DIR}/library/include/ck/library/tensor_operation_instance/gpu/reduce
|
||||
${PROJECT_SOURCE_DIR}/external/include/half
|
||||
@@ -18,7 +19,7 @@ include_directories(BEFORE
|
||||
|
||||
function(add_instance_library INSTANCE_NAME)
|
||||
message("adding instance ${INSTANCE_NAME}")
|
||||
add_library(${INSTANCE_NAME} SHARED ${ARGN})
|
||||
add_library(${INSTANCE_NAME} OBJECT ${ARGN})
|
||||
target_compile_features(${INSTANCE_NAME} PUBLIC)
|
||||
set_target_properties(${INSTANCE_NAME} PROPERTIES POSITION_INDEPENDENT_CODE ON)
|
||||
endfunction(add_instance_library INSTANCE_NAME)
|
||||
@@ -41,3 +42,73 @@ add_subdirectory(convnd_bwd_data)
|
||||
add_subdirectory(grouped_gemm)
|
||||
add_subdirectory(conv2d_bwd_weight)
|
||||
add_subdirectory(batched_gemm_reduce)
|
||||
|
||||
# Aggregate static library: links the object files produced by every
# per-operation instance target together with the host-API wrapper source
# device_conv2d.cpp into a single archive.
add_library(device_operations STATIC
    $<TARGET_OBJECTS:device_conv1d_fwd_instance>
    $<TARGET_OBJECTS:device_batched_gemm_instance>
    $<TARGET_OBJECTS:device_conv2d_bwd_data_instance>
    $<TARGET_OBJECTS:device_conv2d_fwd_instance>
    $<TARGET_OBJECTS:device_conv2d_fwd_bias_relu_instance>
    $<TARGET_OBJECTS:device_conv2d_fwd_bias_relu_add_instance>
    $<TARGET_OBJECTS:device_conv2d_fwd_bias_relu_atomic_add_instance>
    $<TARGET_OBJECTS:device_gemm_instance>
    $<TARGET_OBJECTS:device_gemm_bias_relu_instance>
    $<TARGET_OBJECTS:device_gemm_bias_relu_add_instance>
    $<TARGET_OBJECTS:device_gemm_bias2d_instance>
    $<TARGET_OBJECTS:device_reduce_instance>
    $<TARGET_OBJECTS:device_convnd_bwd_data_instance>
    $<TARGET_OBJECTS:device_grouped_gemm_instance>
    $<TARGET_OBJECTS:device_conv2d_bwd_weight_instance>
    $<TARGET_OBJECTS:device_batched_gemm_reduce_instance>
    $<TARGET_OBJECTS:device_conv3d_fwd_instance>
    device_conv2d.cpp
)

# Original alias, kept so that existing in-tree consumers keep working.
add_library(composablekernels::device_operations ALIAS device_operations)

# Alias spelled with the same `composable_kernel::` namespace under which this
# target is exported, so in-tree and installed consumers can use one name.
add_library(composable_kernel::device_operations ALIAS device_operations)
|
||||
|
||||
|
||||
# Header directories handed to install(DIRECTORY ...) for device_operations.
# Each path ends with a slash; per the CMake install(DIRECTORY) rules that
# installs the directory's contents rather than the directory itself.
set(DEV_OPS_INC_DIRS
    "${PROJECT_SOURCE_DIR}/include/ck/"
    "${PROJECT_SOURCE_DIR}/library/include/ck/"
    "${PROJECT_SOURCE_DIR}/external/include/"
)
|
||||
# No compile feature is listed here, so this call adds nothing to the target.
target_compile_features(device_operations PUBLIC)

# Build the archive as position-independent code.
set_property(TARGET device_operations PROPERTY POSITION_INDEPENDENT_CODE ON)

# Include directories seen by consumers of the *installed* package only
# (INSTALL_INTERFACE); relative paths are resolved against the install prefix.
target_include_directories(device_operations
    PUBLIC
        "$<INSTALL_INTERFACE:${CMAKE_INSTALL_INCLUDEDIR}/ck>"
        "$<INSTALL_INTERFACE:${CMAKE_INSTALL_INCLUDEDIR}/ck/utility>"
        "$<INSTALL_INTERFACE:${CMAKE_INSTALL_INCLUDEDIR}/ck/tensor_description>"
        "$<INSTALL_INTERFACE:${CMAKE_INSTALL_INCLUDEDIR}/ck/tensor>"
        "$<INSTALL_INTERFACE:${CMAKE_INSTALL_INCLUDEDIR}/ck/problem_transform>"
        "$<INSTALL_INTERFACE:${CMAKE_INSTALL_INCLUDEDIR}/ck/tensor_operation/gpu/device>"
        "$<INSTALL_INTERFACE:${CMAKE_INSTALL_INCLUDEDIR}/ck/tensor_operation/gpu/grid>"
        "$<INSTALL_INTERFACE:${CMAKE_INSTALL_INCLUDEDIR}/ck/tensor_operation/gpu/block>"
        "$<INSTALL_INTERFACE:${CMAKE_INSTALL_INCLUDEDIR}/ck/tensor_operation/gpu/warp>"
        "$<INSTALL_INTERFACE:${CMAKE_INSTALL_INCLUDEDIR}/ck/tensor_operation/gpu/thread>"
        "$<INSTALL_INTERFACE:${CMAKE_INSTALL_INCLUDEDIR}/ck/tensor_operation/gpu/element>"
        "$<INSTALL_INTERFACE:${CMAKE_INSTALL_INCLUDEDIR}/ck/library/host_tensor>"
        "$<INSTALL_INTERFACE:${CMAKE_INSTALL_INCLUDEDIR}/ck/library/host>"
        "$<INSTALL_INTERFACE:${CMAKE_INSTALL_INCLUDEDIR}/ck/library/tensor_operation_instance>"
        "$<INSTALL_INTERFACE:${CMAKE_INSTALL_INCLUDEDIR}/ck/library/tensor_operation_instance/gpu/reduce>"
        "$<INSTALL_INTERFACE:${CMAKE_INSTALL_INCLUDEDIR}/ck/half>"
)
|
||||
|
||||
# The GPU architecture is fixed to gfx908 for now. Once more architectures are
# enabled, turn this into an option in the top-level CMake file and pass it
# down here so that it can be exported.
target_compile_options(device_operations PRIVATE --offload-arch=gfx908)
|
||||
# Install the device_operations artifact and record it in the
# device_operationsTargets export set.
install(
    TARGETS device_operations
    EXPORT device_operationsTargets
    LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR}
    ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR}
    RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR}
    INCLUDES DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}
)

# Copy the header trees listed in DEV_OPS_INC_DIRS below <includedir>/ck.
install(
    DIRECTORY ${DEV_OPS_INC_DIRS}
    DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/ck
)

# Write the export set as a targets file; the exported target is prefixed with
# the composable_kernel:: namespace.
install(
    EXPORT device_operationsTargets
    FILE composable_kerneldevice_operationsTargets.cmake
    NAMESPACE composable_kernel::
    DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/composable_kernel
)
|
||||
|
||||
@@ -18,9 +18,9 @@ set(DEVICE_BATCHED_GEMM_INSTANCE_SOURCE
|
||||
device_batched_gemm_xdl_int8_int8_int8_gkm_gnk_gmn_instance.cpp;
|
||||
)
|
||||
|
||||
add_library(device_batched_gemm_instance SHARED ${DEVICE_BATCHED_GEMM_INSTANCE_SOURCE})
|
||||
target_compile_features(device_batched_gemm_instance PUBLIC)
|
||||
add_library(device_batched_gemm_instance OBJECT ${DEVICE_BATCHED_GEMM_INSTANCE_SOURCE})
|
||||
# target_compile_features(device_batched_gemm_instance PUBLIC)
|
||||
set_target_properties(device_batched_gemm_instance PROPERTIES POSITION_INDEPENDENT_CODE ON)
|
||||
install(TARGETS device_batched_gemm_instance LIBRARY DESTINATION lib)
|
||||
# install(TARGETS device_batched_gemm_instance LIBRARY DESTINATION lib)
|
||||
|
||||
clang_tidy_check(device_batched_gemm_instance)
|
||||
|
||||
@@ -5,7 +5,8 @@ set(DEVICE_BATCHED_GEMM_REDUCE_INSTANCE_SOURCE
|
||||
device_batched_gemm_reduce_xdl_cshuffle_f16_f16_f16_f32_f32_gkm_gnk_gmn_instance.cpp
|
||||
)
|
||||
|
||||
add_instance_library(device_batched_gemm_reduce_instance ${DEVICE_BATCHED_GEMM_REDUCE_INSTANCE_SOURCE})
|
||||
install(TARGETS device_batched_gemm_reduce_instance LIBRARY DESTINATION lib)
|
||||
add_instance_library(device_batched_gemm_reduce_instance OBJECT ${DEVICE_BATCHED_GEMM_REDUCE_INSTANCE_SOURCE})
|
||||
target_compile_features(device_batched_gemm_reduce_instance PUBLIC)
|
||||
set_target_properties(device_batched_gemm_reduce_instance PROPERTIES POSITION_INDEPENDENT_CODE ON)
|
||||
clang_tidy_check(device_batched_gemm_reduce_instance)
|
||||
|
||||
|
||||
@@ -6,9 +6,9 @@ set(DEVICE_CONV1D_FWD_INSTANCE_SOURCE
|
||||
device_conv1d_fwd_xdl_nwc_kxc_nwk_int8_instance.cpp;
|
||||
)
|
||||
|
||||
add_library(device_conv1d_fwd_instance SHARED ${DEVICE_CONV1D_FWD_INSTANCE_SOURCE})
|
||||
target_compile_features(device_conv1d_fwd_instance PUBLIC)
|
||||
add_library(device_conv1d_fwd_instance OBJECT ${DEVICE_CONV1D_FWD_INSTANCE_SOURCE})
|
||||
# target_compile_features(device_conv1d_fwd_instance PUBLIC)
|
||||
set_target_properties(device_conv1d_fwd_instance PROPERTIES POSITION_INDEPENDENT_CODE ON)
|
||||
install(TARGETS device_conv1d_fwd_instance LIBRARY DESTINATION lib)
|
||||
# install(TARGETS device_conv1d_fwd_instance LIBRARY DESTINATION lib)
|
||||
|
||||
clang_tidy_check(device_conv1d_fwd_instance)
|
||||
|
||||
@@ -6,9 +6,7 @@ set(DEVICE_CONV2D_BWD_DATA_INSTANCE_SOURCE
|
||||
device_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk_int8_instance.cpp;
|
||||
)
|
||||
|
||||
add_library(device_conv2d_bwd_data_instance SHARED ${DEVICE_CONV2D_BWD_DATA_INSTANCE_SOURCE})
|
||||
target_compile_features(device_conv2d_bwd_data_instance PUBLIC)
|
||||
add_library(device_conv2d_bwd_data_instance OBJECT ${DEVICE_CONV2D_BWD_DATA_INSTANCE_SOURCE})
|
||||
set_target_properties(device_conv2d_bwd_data_instance PROPERTIES POSITION_INDEPENDENT_CODE ON)
|
||||
install(TARGETS device_conv2d_bwd_data_instance LIBRARY DESTINATION lib)
|
||||
|
||||
clang_tidy_check(device_conv2d_bwd_data_instance)
|
||||
|
||||
@@ -3,7 +3,7 @@ set(DEVICE_CONV2D_BWD_WEIGHT_INSTANCE_SOURCE
|
||||
device_conv2d_bwd_weight_xdl_nhwc_kyxc_nhwk_f16_instance.cpp;
|
||||
device_conv2d_bwd_weight_xdl_nhwc_kyxc_nhwk_f32_instance.cpp;
|
||||
)
|
||||
add_library(device_conv2d_bwd_weight_instance SHARED ${DEVICE_CONV2D_BWD_WEIGHT_INSTANCE_SOURCE})
|
||||
add_library(device_conv2d_bwd_weight_instance OBJECT ${DEVICE_CONV2D_BWD_WEIGHT_INSTANCE_SOURCE})
|
||||
target_compile_features(device_conv2d_bwd_weight_instance PUBLIC)
|
||||
set_target_properties(device_conv2d_bwd_weight_instance PROPERTIES POSITION_INDEPENDENT_CODE ON)
|
||||
install(TARGETS device_conv2d_bwd_weight_instance LIBRARY DESTINATION lib)
|
||||
|
||||
@@ -6,9 +6,7 @@ set(DEVICE_CONV2D_FWD_INSTANCE_SOURCE
|
||||
device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_int8_instance.cpp;
|
||||
device_conv2d_fwd_xdl_c_shuffle_nhwc_kyxc_nhwk_f16_instance.cpp;
|
||||
)
|
||||
add_library(device_conv2d_fwd_instance SHARED ${DEVICE_CONV2D_FWD_INSTANCE_SOURCE})
|
||||
target_compile_features(device_conv2d_fwd_instance PUBLIC)
|
||||
add_library(device_conv2d_fwd_instance OBJECT ${DEVICE_CONV2D_FWD_INSTANCE_SOURCE})
|
||||
set_target_properties(device_conv2d_fwd_instance PROPERTIES POSITION_INDEPENDENT_CODE ON)
|
||||
install(TARGETS device_conv2d_fwd_instance LIBRARY DESTINATION lib)
|
||||
|
||||
clang_tidy_check(device_conv2d_fwd_instance)
|
||||
|
||||
@@ -2,9 +2,7 @@
|
||||
set(DEVICE_CONV2D_FWD_BIAS_RELU_INSTANCE_SOURCE
|
||||
device_conv2d_fwd_xdl_c_shuffle_bias_relu_nhwc_kyxc_nhwk_f16_instance.cpp;
|
||||
)
|
||||
add_library(device_conv2d_fwd_bias_relu_instance SHARED ${DEVICE_CONV2D_FWD_BIAS_RELU_INSTANCE_SOURCE})
|
||||
target_compile_features(device_conv2d_fwd_bias_relu_instance PUBLIC)
|
||||
add_library(device_conv2d_fwd_bias_relu_instance OBJECT ${DEVICE_CONV2D_FWD_BIAS_RELU_INSTANCE_SOURCE})
|
||||
set_target_properties(device_conv2d_fwd_bias_relu_instance PROPERTIES POSITION_INDEPENDENT_CODE ON)
|
||||
install(TARGETS device_conv2d_fwd_bias_relu_instance LIBRARY DESTINATION lib)
|
||||
|
||||
clang_tidy_check(device_conv2d_fwd_bias_relu_instance)
|
||||
|
||||
@@ -2,9 +2,7 @@
|
||||
set(DEVICE_CONV2D_FWD_BIAS_RELU_ADD_INSTANCE_SOURCE
|
||||
device_conv2d_fwd_xdl_c_shuffle_bias_relu_add_nhwc_kyxc_nhwk_f16_instance.cpp;
|
||||
)
|
||||
add_library(device_conv2d_fwd_bias_relu_add_instance SHARED ${DEVICE_CONV2D_FWD_BIAS_RELU_ADD_INSTANCE_SOURCE})
|
||||
target_compile_features(device_conv2d_fwd_bias_relu_add_instance PUBLIC)
|
||||
add_library(device_conv2d_fwd_bias_relu_add_instance OBJECT ${DEVICE_CONV2D_FWD_BIAS_RELU_ADD_INSTANCE_SOURCE})
|
||||
set_target_properties(device_conv2d_fwd_bias_relu_add_instance PROPERTIES POSITION_INDEPENDENT_CODE ON)
|
||||
install(TARGETS device_conv2d_fwd_bias_relu_add_instance LIBRARY DESTINATION lib)
|
||||
|
||||
clang_tidy_check(device_conv2d_fwd_bias_relu_add_instance)
|
||||
|
||||
@@ -3,9 +3,7 @@ set(DEVICE_CONV2D_FWD_BIAS_RELU_ATOMIC_ADD_INSTANCE_SOURCE
|
||||
device_conv2d_fwd_xdl_c_shuffle_bias_relu_atomic_add_nhwc_kyxc_nhwk_f16_instance.cpp;
|
||||
)
|
||||
|
||||
add_library(device_conv2d_fwd_bias_relu_atomic_add_instance SHARED ${DEVICE_CONV2D_FWD_BIAS_RELU_ATOMIC_ADD_INSTANCE_SOURCE})
|
||||
target_compile_features(device_conv2d_fwd_bias_relu_atomic_add_instance PUBLIC)
|
||||
add_library(device_conv2d_fwd_bias_relu_atomic_add_instance OBJECT ${DEVICE_CONV2D_FWD_BIAS_RELU_ATOMIC_ADD_INSTANCE_SOURCE})
|
||||
set_target_properties(device_conv2d_fwd_bias_relu_atomic_add_instance PROPERTIES POSITION_INDEPENDENT_CODE ON)
|
||||
install(TARGETS device_conv2d_fwd_bias_relu_atomic_add_instance LIBRARY DESTINATION lib)
|
||||
|
||||
clang_tidy_check(device_conv2d_fwd_bias_relu_atomic_add_instance)
|
||||
|
||||
@@ -5,9 +5,8 @@ set(DEVICE_CONV3D_FWD_INSTANCE_SOURCE
|
||||
device_conv3d_fwd_xdl_ndhwc_kzyxc_ndhwk_bf16_instance.cpp;
|
||||
device_conv3d_fwd_xdl_ndhwc_kzyxc_ndhwk_int8_instance.cpp;
|
||||
)
|
||||
add_library(device_conv3d_fwd_instance SHARED ${DEVICE_CONV3D_FWD_INSTANCE_SOURCE})
|
||||
add_library(device_conv3d_fwd_instance OBJECT ${DEVICE_CONV3D_FWD_INSTANCE_SOURCE})
|
||||
target_compile_features(device_conv3d_fwd_instance PUBLIC)
|
||||
set_target_properties(device_conv3d_fwd_instance PROPERTIES POSITION_INDEPENDENT_CODE ON)
|
||||
install(TARGETS device_conv3d_fwd_instance LIBRARY DESTINATION lib)
|
||||
|
||||
clang_tidy_check(device_conv3d_fwd_instance)
|
||||
|
||||
@@ -14,7 +14,7 @@ set(DEVICE_CONVND_BWD_DATA_INSTANCE_SOURCE
|
||||
device_conv3d_bwd_data_xdl_ndhwc_kzyxc_ndhwk_int8_instance.cpp;
|
||||
)
|
||||
|
||||
add_library(device_convnd_bwd_data_instance SHARED ${DEVICE_CONVND_BWD_DATA_INSTANCE_SOURCE})
|
||||
add_library(device_convnd_bwd_data_instance OBJECT ${DEVICE_CONVND_BWD_DATA_INSTANCE_SOURCE})
|
||||
target_compile_features(device_convnd_bwd_data_instance PUBLIC)
|
||||
set_target_properties(device_convnd_bwd_data_instance PROPERTIES POSITION_INDEPENDENT_CODE ON)
|
||||
install(TARGETS device_convnd_bwd_data_instance LIBRARY DESTINATION lib)
|
||||
|
||||
201
library/src/tensor_operation_instance/gpu/device_conv2d.cpp
Normal file
201
library/src/tensor_operation_instance/gpu/device_conv2d.cpp
Normal file
@@ -0,0 +1,201 @@
|
||||
#include <stdlib.h>
|
||||
#include "config.hpp"
|
||||
#include "device_conv2d_fwd_xdl_c_shuffle_nhwc_kyxc_nhwk.hpp"
|
||||
#include "element_wise_operation.hpp"
|
||||
#include "device_operation_instance.hpp"
|
||||
#include "host_interface.hpp"
|
||||
|
||||
namespace ck {
|
||||
namespace tensor_operation {
|
||||
namespace device {
|
||||
namespace device_conv2d_fwd_instance {
|
||||
using PassThrough = ck::tensor_operation::element_wise::PassThrough;
|
||||
void add_device_conv2d_fwd_xdl_c_shuffle_nhwc_kyxc_nhwk_f16_instances(
|
||||
std::vector<DeviceConvFwdPtr<PassThrough, PassThrough, PassThrough>>& instances);
|
||||
void add_device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_f32_instances(
|
||||
std::vector<DeviceConvFwdPtr<PassThrough, PassThrough, PassThrough>>& instances);
|
||||
void add_device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_bf16_instances(
|
||||
std::vector<DeviceConvFwdPtr<PassThrough, PassThrough, PassThrough>>& instances);
|
||||
void add_device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_f16_instances(
|
||||
std::vector<DeviceConvFwdPtr<PassThrough, PassThrough, PassThrough>>& instances);
|
||||
void add_device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_int8_instances(
|
||||
std::vector<DeviceConvFwdPtr<PassThrough, PassThrough, PassThrough>>& instances);
|
||||
|
||||
} // namespace device_conv2d_fwd_instance
|
||||
} // namespace device
|
||||
} // namespace tensor_operation
|
||||
} // namespace ck
|
||||
|
||||
using PassThrough = ck::tensor_operation::element_wise::PassThrough;
|
||||
struct DeviceConvFwdPtr_t::DeviceConvFwdPtrImpl
|
||||
{
|
||||
std::unique_ptr<DeviceConvFwdPtr_t::BaseArgument>
|
||||
MakeArgumentPointer(void* in_ptr,
|
||||
void* wei_ptr,
|
||||
void* out_ptr,
|
||||
size_t N,
|
||||
size_t K,
|
||||
size_t C,
|
||||
std::vector<ck::index_t> input_spatial_lengths,
|
||||
std::vector<ck::index_t> filter_spatial_lengths,
|
||||
std::vector<ck::index_t> output_spatial_lengths,
|
||||
std::vector<ck::index_t> conv_filter_strides,
|
||||
std::vector<ck::index_t> conv_filter_dilations,
|
||||
std::vector<ck::index_t> input_left_pads,
|
||||
std::vector<ck::index_t> input_right_pads) const
|
||||
{
|
||||
return el->MakeArgumentPointer(in_ptr,
|
||||
wei_ptr,
|
||||
out_ptr,
|
||||
N,
|
||||
K,
|
||||
C,
|
||||
input_spatial_lengths,
|
||||
filter_spatial_lengths,
|
||||
output_spatial_lengths,
|
||||
conv_filter_strides,
|
||||
conv_filter_dilations,
|
||||
input_left_pads,
|
||||
input_right_pads,
|
||||
PassThrough{},
|
||||
PassThrough{},
|
||||
PassThrough{});
|
||||
}
|
||||
std::unique_ptr<DeviceConvFwdPtr_t::BaseInvoker> MakeInvokerPointer() const
|
||||
{
|
||||
return el->MakeInvokerPointer();
|
||||
}
|
||||
|
||||
std::string GetTypeString() { return el->GetTypeString(); }
|
||||
bool IsSupportedArgument(const DeviceConvFwdPtr_t::BaseArgument* arg)
|
||||
{
|
||||
return el->IsSupportedArgument(arg);
|
||||
}
|
||||
|
||||
ck::tensor_operation::device::DeviceConvFwdPtr<PassThrough, PassThrough, PassThrough> el;
|
||||
};
|
||||
|
||||
DeviceConvFwdPtr_t::DeviceConvFwdPtr_t() : pImpl(nullptr) {}
|
||||
DeviceConvFwdPtr_t::~DeviceConvFwdPtr_t() = default;
|
||||
DeviceConvFwdPtr_t::DeviceConvFwdPtr_t(DeviceConvFwdPtr_t&&) = default;
|
||||
DeviceConvFwdPtr_t::DeviceConvFwdPtr_t(DeviceConvFwdPtr_t::DeviceConvFwdPtrImpl& other)
|
||||
: pImpl(std::make_unique<DeviceConvFwdPtr_t::DeviceConvFwdPtrImpl>(std::move(other)))
|
||||
{
|
||||
}
|
||||
|
||||
std::unique_ptr<DeviceConvFwdPtr_t::BaseArgument>
|
||||
DeviceConvFwdPtr_t::MakeArgumentPointer(void* in_ptr,
|
||||
void* wei_ptr,
|
||||
void* out_ptr,
|
||||
size_t N,
|
||||
size_t K,
|
||||
size_t C,
|
||||
std::vector<ck::index_t> input_spatial_lengths,
|
||||
std::vector<ck::index_t> filter_spatial_lengths,
|
||||
std::vector<ck::index_t> output_spatial_lengths,
|
||||
std::vector<ck::index_t> conv_filter_strides,
|
||||
std::vector<ck::index_t> conv_filter_dilations,
|
||||
std::vector<ck::index_t> input_left_pads,
|
||||
std::vector<ck::index_t> input_right_pads) const
|
||||
{
|
||||
return pImpl->MakeArgumentPointer(in_ptr,
|
||||
wei_ptr,
|
||||
out_ptr,
|
||||
N,
|
||||
K,
|
||||
C,
|
||||
input_spatial_lengths,
|
||||
filter_spatial_lengths,
|
||||
output_spatial_lengths,
|
||||
conv_filter_strides,
|
||||
conv_filter_dilations,
|
||||
input_left_pads,
|
||||
input_right_pads);
|
||||
}
|
||||
|
||||
std::unique_ptr<DeviceConvFwdPtr_t::BaseInvoker> DeviceConvFwdPtr_t::MakeInvokerPointer() const
|
||||
{
|
||||
return pImpl->MakeInvokerPointer();
|
||||
}
|
||||
|
||||
// Returns the type string reported by the implementation held in pImpl.
// pImpl is dereferenced without a null check.
auto DeviceConvFwdPtr_t::GetTypeString() -> std::string
{
    return pImpl->GetTypeString();
}
|
||||
// Asks the implementation held in pImpl whether it supports `arg_ptr` and
// returns its answer. pImpl is dereferenced without a null check.
auto DeviceConvFwdPtr_t::IsSupportedArgument(const DeviceConvFwdPtr_t::BaseArgument* arg_ptr)
    -> bool
{
    return pImpl->IsSupportedArgument(arg_ptr);
}
|
||||
|
||||
using namespace ck::tensor_operation::device::device_conv2d_fwd_instance;
|
||||
void add_device_conv2d_fwd_xdl_c_shuffle_nhwc_kyxc_nhwk_f16_instances_t(
|
||||
std::vector<DeviceConvFwdPtr_t>& instances)
|
||||
{
|
||||
std::vector<
|
||||
ck::tensor_operation::device::DeviceConvFwdPtr<PassThrough, PassThrough, PassThrough>>
|
||||
local_instances;
|
||||
add_device_conv2d_fwd_xdl_c_shuffle_nhwc_kyxc_nhwk_f16_instances(local_instances);
|
||||
for(auto& kinder : local_instances)
|
||||
{
|
||||
DeviceConvFwdPtr_t::DeviceConvFwdPtrImpl tmp{std::move(kinder)};
|
||||
instances.emplace_back(tmp);
|
||||
}
|
||||
return;
|
||||
}
|
||||
|
||||
void add_device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_f32_instances_t(
|
||||
std::vector<DeviceConvFwdPtr_t>& instances)
|
||||
{
|
||||
std::vector<
|
||||
ck::tensor_operation::device::DeviceConvFwdPtr<PassThrough, PassThrough, PassThrough>>
|
||||
local_instances;
|
||||
add_device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_f32_instances(local_instances);
|
||||
for(auto& kinder : local_instances)
|
||||
{
|
||||
DeviceConvFwdPtr_t::DeviceConvFwdPtrImpl tmp{std::move(kinder)};
|
||||
instances.emplace_back(tmp); // Perhaps we can do better
|
||||
}
|
||||
return;
|
||||
}
|
||||
|
||||
void add_device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_bf16_instances_t(
|
||||
std::vector<DeviceConvFwdPtr_t>& instances)
|
||||
{
|
||||
std::vector<
|
||||
ck::tensor_operation::device::DeviceConvFwdPtr<PassThrough, PassThrough, PassThrough>>
|
||||
local_instances;
|
||||
add_device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_bf16_instances(local_instances);
|
||||
for(auto& kinder : local_instances)
|
||||
{
|
||||
DeviceConvFwdPtr_t::DeviceConvFwdPtrImpl tmp{std::move(kinder)};
|
||||
instances.emplace_back(tmp); // Perhaps we can do better
|
||||
}
|
||||
return;
|
||||
}
|
||||
|
||||
void add_device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_f16_instances_t(
|
||||
std::vector<DeviceConvFwdPtr_t>& instances)
|
||||
{
|
||||
std::vector<
|
||||
ck::tensor_operation::device::DeviceConvFwdPtr<PassThrough, PassThrough, PassThrough>>
|
||||
local_instances;
|
||||
add_device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_f16_instances(local_instances);
|
||||
for(auto& kinder : local_instances)
|
||||
{
|
||||
DeviceConvFwdPtr_t::DeviceConvFwdPtrImpl tmp{std::move(kinder)};
|
||||
instances.emplace_back(tmp); // Perhaps we can do better
|
||||
}
|
||||
return;
|
||||
}
|
||||
|
||||
void add_device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_int8_instances_t(
|
||||
std::vector<DeviceConvFwdPtr_t>& instances)
|
||||
{
|
||||
std::vector<
|
||||
ck::tensor_operation::device::DeviceConvFwdPtr<PassThrough, PassThrough, PassThrough>>
|
||||
local_instances;
|
||||
add_device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_int8_instances(local_instances);
|
||||
for(auto& kinder : local_instances)
|
||||
{
|
||||
DeviceConvFwdPtr_t::DeviceConvFwdPtrImpl tmp{std::move(kinder)};
|
||||
instances.emplace_back(tmp);
|
||||
}
|
||||
return;
|
||||
}
|
||||
@@ -35,10 +35,9 @@ set(DEVICE_GEMM_INSTANCE_SOURCE
|
||||
device_gemm_xdl_splitk_f16_f16_f16_km_nk_mn_instance.cpp;
|
||||
)
|
||||
|
||||
add_library(device_gemm_instance SHARED ${DEVICE_GEMM_INSTANCE_SOURCE})
|
||||
add_library(device_gemm_instance OBJECT ${DEVICE_GEMM_INSTANCE_SOURCE})
|
||||
|
||||
target_compile_features(device_gemm_instance PUBLIC)
|
||||
set_target_properties(device_gemm_instance PROPERTIES POSITION_INDEPENDENT_CODE ON)
|
||||
install(TARGETS device_gemm_instance LIBRARY DESTINATION lib)
|
||||
|
||||
clang_tidy_check(device_gemm_instance)
|
||||
|
||||
@@ -10,9 +10,7 @@ set(DEVICE_GEMM_BIAS2D_INSTANCE_SOURCE
|
||||
device_gemm_xdl_c_shuffle_bias_2d_f16_f16_f16_mk_nk_mn_instance.cpp;
|
||||
)
|
||||
|
||||
add_library(device_gemm_bias2d_instance SHARED ${DEVICE_GEMM_BIAS2D_INSTANCE_SOURCE})
|
||||
target_compile_features(device_gemm_bias2d_instance PUBLIC)
|
||||
add_library(device_gemm_bias2d_instance OBJECT ${DEVICE_GEMM_BIAS2D_INSTANCE_SOURCE})
|
||||
set_target_properties(device_gemm_bias2d_instance PROPERTIES POSITION_INDEPENDENT_CODE ON)
|
||||
install(TARGETS device_gemm_bias2d_instance LIBRARY DESTINATION lib)
|
||||
|
||||
clang_tidy_check(device_gemm_bias2d_instance)
|
||||
|
||||
@@ -6,9 +6,7 @@ set(DEVICE_GEMM_BIAS_RELU_INSTANCE_SOURCE
|
||||
device_gemm_xdl_c_shuffle_bias_relu_f16_f16_f16_km_nk_mn_instance.cpp;
|
||||
)
|
||||
|
||||
add_library(device_gemm_bias_relu_instance SHARED ${DEVICE_GEMM_BIAS_RELU_INSTANCE_SOURCE})
|
||||
target_compile_features(device_gemm_bias_relu_instance PUBLIC)
|
||||
add_library(device_gemm_bias_relu_instance OBJECT ${DEVICE_GEMM_BIAS_RELU_INSTANCE_SOURCE})
|
||||
set_target_properties(device_gemm_bias_relu_instance PROPERTIES POSITION_INDEPENDENT_CODE ON)
|
||||
install(TARGETS device_gemm_bias_relu_instance LIBRARY DESTINATION lib)
|
||||
|
||||
clang_tidy_check(device_gemm_bias_relu_instance)
|
||||
|
||||
@@ -6,9 +6,7 @@ set(DEVICE_GEMM_BIAS_RELU_ADD_INSTANCE_SOURCE
|
||||
device_gemm_xdl_c_shuffle_bias_relu_add_f16_f16_f16_km_nk_mn_instance.cpp;
|
||||
)
|
||||
|
||||
add_library(device_gemm_bias_relu_add_instance SHARED ${DEVICE_GEMM_BIAS_RELU_ADD_INSTANCE_SOURCE})
|
||||
target_compile_features(device_gemm_bias_relu_add_instance PUBLIC)
|
||||
add_library(device_gemm_bias_relu_add_instance OBJECT ${DEVICE_GEMM_BIAS_RELU_ADD_INSTANCE_SOURCE})
|
||||
set_target_properties(device_gemm_bias_relu_add_instance PROPERTIES POSITION_INDEPENDENT_CODE ON)
|
||||
install(TARGETS device_gemm_bias_relu_add_instance LIBRARY DESTINATION lib)
|
||||
|
||||
clang_tidy_check(device_gemm_bias_relu_add_instance)
|
||||
|
||||
@@ -6,7 +6,7 @@ set(DEVICE_GROUPED_GEMM_INSTANCE_SOURCE
|
||||
device_grouped_gemm_xdl_f16_f16_f16_km_nk_mn_instance.cpp;
|
||||
)
|
||||
|
||||
add_library(device_grouped_gemm_instance SHARED ${DEVICE_GROUPED_GEMM_INSTANCE_SOURCE})
|
||||
add_library(device_grouped_gemm_instance OBJECT ${DEVICE_GROUPED_GEMM_INSTANCE_SOURCE})
|
||||
|
||||
target_compile_features(device_grouped_gemm_instance PUBLIC)
|
||||
set_target_properties(device_grouped_gemm_instance PROPERTIES POSITION_INDEPENDENT_CODE ON)
|
||||
|
||||
@@ -38,9 +38,7 @@ set(DEVICE_REDUCE_INSTANCE_SOURCE
|
||||
device_reduce_instance_multiblock_partial_reduce_b16_f32_b16.cpp;
|
||||
)
|
||||
|
||||
add_library(device_reduce_instance SHARED ${DEVICE_REDUCE_INSTANCE_SOURCE})
|
||||
target_compile_features(device_reduce_instance PUBLIC)
|
||||
add_library(device_reduce_instance OBJECT ${DEVICE_REDUCE_INSTANCE_SOURCE})
|
||||
set_target_properties(device_reduce_instance PROPERTIES POSITION_INDEPENDENT_CODE ON)
|
||||
install(TARGETS device_reduce_instance LIBRARY DESTINATION lib)
|
||||
|
||||
clang_tidy_check(device_reduce_instance)
|
||||
|
||||
Reference in New Issue
Block a user