mirror of
https://github.com/ROCm/composable_kernel.git
synced 2026-05-13 09:45:56 +00:00
* add DeviceGemmXdl * update script * fix naming issue * fix comment * output HostTensorDescriptor * rename * padded GEMM for fwd v4r4r4 nhwc * refactor * refactor * refactor * adding ckProfiler * adding ckProfiler * refactor * fix tuning parameter bug * add more gemm instances * add more fp16 GEMM instances * fix profiler driver * fix bug in tuning parameter * add fp32 gemm instances * small fix * refactor * rename * refactor gemm profiler; adding DeviceConv and conv profiler * refactor * fix * add conv profiler * refactor * adding more GEMM and Conv instance * Create README.md Add build instruction for ckProfiler * Create README.md Add Readme for gemm_xdl example * Update README.md Remove build instruction from top most folder * Update README.md * clean up
59 lines
2.0 KiB
C++
59 lines
2.0 KiB
C++
#ifndef DEVICE_CONV_FWD_XDL_HPP
#define DEVICE_CONV_FWD_XDL_HPP

#include <iostream>
#include "device.hpp"
#include "device_base.hpp"
#include "device_conv.hpp"
#include "common_header.hpp"
#include "tensor_layout.hpp"
#include "tensor_descriptor.hpp"
#include "tensor_descriptor_helper.hpp"
#include "gridwise_gemm_xdlops_v2r3.hpp"

namespace ck {
namespace tensor_operation {
namespace device {

// Primary template of DeviceConvFwdXdl.
//
// This header only *declares* the template; no definition is provided here,
// so the type is incomplete unless a definition or specialization is supplied
// elsewhere (presumably in other headers -- verify before instantiating).
//
// NOTE(review): the parameter roles below are inferred from the parameter
// names only; confirm them against the actual specializations.
//   - NDimSpatial ............................ number of spatial dimensions
//   - In/Wei/Out/AccDataType ................. element types of the tensors
//   - In/Wei/OutLayout ....................... tensor layout tags
//   - BlockSize .. NXdlPerWave ............... tiling / tuning parameters
//   - ABlockTransfer* / BBlockTransfer* ...... transfer settings for the two
//                                              input operands
//   - CThreadTransfer* ....................... transfer settings for the output
//   - ABlockLdsAddExtraM / BBlockLdsAddExtraN  boolean tuning switches
template <ck::index_t NDimSpatial,
          typename InDataType,
          typename WeiDataType,
          typename OutDataType,
          typename AccDataType,
          typename InLayout,
          typename WeiLayout,
          typename OutLayout,
          ck::index_t BlockSize,
          ck::index_t MPerBlock,
          ck::index_t NPerBlock,
          ck::index_t K0PerBlock,
          ck::index_t K1,
          ck::index_t MPerXDL,
          ck::index_t NPerXDL,
          ck::index_t MXdlPerWave,
          ck::index_t NXdlPerWave,
          typename ABlockTransferThreadSliceLengths_K0_M_K1,
          typename ABlockTransferThreadClusterLengths_K0_M_K1,
          typename ABlockTransferThreadClusterArrangeOrder,
          typename ABlockTransferSrcAccessOrder,
          ck::index_t ABlockTransferSrcVectorDim,
          ck::index_t ABlockTransferSrcScalarPerVector,
          ck::index_t ABlockTransferDstScalarPerVector_K1,
          typename BBlockTransferThreadSliceLengths_K0_N_K1,
          typename BBlockTransferThreadClusterLengths_K0_N_K1,
          typename BBlockTransferThreadClusterArrangeOrder,
          typename BBlockTransferSrcAccessOrder,
          ck::index_t BBlockTransferSrcVectorDim,
          ck::index_t BBlockTransferSrcScalarPerVector,
          ck::index_t BBlockTransferDstScalarPerVector_K1,
          ck::index_t CThreadTransferSrcDstVectorDim,
          ck::index_t CThreadTransferDstScalarPerVector,
          bool ABlockLdsAddExtraM,
          bool BBlockLdsAddExtraN>
struct DeviceConvFwdXdl;

} // namespace device
} // namespace tensor_operation
} // namespace ck
#endif