mirror of
https://github.com/ROCm/composable_kernel.git
synced 2026-05-14 10:09:41 +00:00
Add support for groups in Img2Col/Col2Img (#1007)
* Add support for groups in Img2Col/Col2Img
* Fix interface test
* Fix interface test G to N
* Improve performance
* Change gemm layout to 3d
* Fixes
[ROCm/composable_kernel commit: 2e824c6d46]
This commit is contained in:
@@ -16,10 +16,10 @@
|
||||
using InDataType = ck::half_t;
|
||||
using OutDataType = ck::half_t;
|
||||
|
||||
using ImageLayout = ck::tensor_layout::convolution::GNHWC;
|
||||
using ImageLayout = ck::tensor_layout::convolution::NHWGC;
|
||||
|
||||
static constexpr ck::index_t NumDimSpatial = 2;
|
||||
static constexpr ck::index_t G = 1;
|
||||
static constexpr ck::index_t G = 2;
|
||||
static constexpr ck::index_t N = 32; // batch size
|
||||
static constexpr ck::index_t C = 32; // input channel (per group)
|
||||
static constexpr ck::index_t Y = 3; // filter H
|
||||
@@ -52,18 +52,18 @@ int main()
|
||||
std::array<ck::index_t, 2> wei_spatial_lengths{Y, X};
|
||||
std::array<ck::index_t, 2> out_spatial_lengths{Ho, Wo};
|
||||
|
||||
// We have NHWGC in memory space (G is dummy)
|
||||
// However, CK's API only accept length and stride with order of GNCHW
|
||||
// Hence, we need to adjust the order of stride
|
||||
// We have NHWGC in memory space
|
||||
// However, CK's API only accepts lengths and strides with order of GNCHW.
|
||||
// Hence, we need to adjust the order of strides.
|
||||
std::array<ck::index_t, 5> image_strides{C, Hi * Wi * G * C, 1, Wi * G * C, G * C};
|
||||
std::array<ck::index_t, 2> gemm_strides{Y * X * C, 1};
|
||||
std::array<ck::index_t, 3> gemm_strides{Y * X * C, G * Y * X * C, 1};
|
||||
|
||||
std::array<ck::index_t, NumDimSpatial> filter_strides{1, 1};
|
||||
std::array<ck::index_t, NumDimSpatial> filter_dilations{1, 1};
|
||||
std::array<ck::index_t, NumDimSpatial> input_left_pads{1, 1};
|
||||
std::array<ck::index_t, NumDimSpatial> input_right_pads{1, 1};
|
||||
|
||||
SimpleDeviceMem in(sizeof(InDataType) * N * Ho * Wo * Y * X * C);
|
||||
SimpleDeviceMem in(sizeof(InDataType) * G * N * Ho * Wo * Y * X * C);
|
||||
SimpleDeviceMem out(sizeof(OutDataType) * N * Hi * Wi * G * C);
|
||||
|
||||
using namespace ck::conv_tensor_rearrange_op;
|
||||
@@ -93,6 +93,7 @@ int main()
|
||||
auto& op_ptr = op_ptrs[i];
|
||||
auto argument_ptr = op_ptr->MakeArgumentPointer(in.GetDeviceBuffer(),
|
||||
out.GetDeviceBuffer(),
|
||||
G,
|
||||
N,
|
||||
C,
|
||||
in_spatial_lengths,
|
||||
@@ -112,7 +113,7 @@ int main()
|
||||
float avg_time = invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, true});
|
||||
|
||||
std::size_t num_bytes = sizeof(InDataType) * N * Hi * Wi * G * C +
|
||||
sizeof(OutDataType) * N * Ho * Wo * Y * X * C;
|
||||
sizeof(OutDataType) * G * N * Ho * Wo * Y * X * C;
|
||||
|
||||
float gb_per_sec = num_bytes / 1.E6 / avg_time;
|
||||
|
||||
@@ -149,6 +150,7 @@ int main()
|
||||
<< std::endl;
|
||||
auto argument_ptr = op_ptr->MakeArgumentPointer(in.GetDeviceBuffer(),
|
||||
out.GetDeviceBuffer(),
|
||||
G,
|
||||
N,
|
||||
C,
|
||||
in_spatial_lengths,
|
||||
|
||||
@@ -16,10 +16,10 @@
|
||||
using InDataType = ck::half_t;
|
||||
using OutDataType = ck::half_t;
|
||||
|
||||
using ImageLayout = ck::tensor_layout::convolution::GNHWC;
|
||||
using ImageLayout = ck::tensor_layout::convolution::NHWGC;
|
||||
|
||||
static constexpr ck::index_t NumDimSpatial = 2;
|
||||
static constexpr ck::index_t G = 1;
|
||||
static constexpr ck::index_t G = 2;
|
||||
static constexpr ck::index_t N = 32; // batch size
|
||||
static constexpr ck::index_t C = 32; // input channel (per group)
|
||||
static constexpr ck::index_t Y = 3; // filter H
|
||||
@@ -52,11 +52,11 @@ int main()
|
||||
std::array<ck::index_t, 2> wei_spatial_lengths{Y, X};
|
||||
std::array<ck::index_t, 2> out_spatial_lengths{Ho, Wo};
|
||||
|
||||
// We have NHWGC in memory space (G is dummy)
|
||||
// However, CK's API only accept length and stride with order of GNCHW
|
||||
// Hence, we need to adjust the order of stride
|
||||
// We have NHWGC in memory space
|
||||
// However, CK's API only accepts lengths and strides with order of GNCHW.
|
||||
// Hence, we need to adjust the order of strides.
|
||||
std::array<ck::index_t, 5> image_strides{C, Hi * Wi * G * C, 1, Wi * G * C, G * C};
|
||||
std::array<ck::index_t, 2> gemm_strides{Y * X * C, 1};
|
||||
std::array<ck::index_t, 3> gemm_strides{Y * X * C, G * Y * X * C, 1};
|
||||
|
||||
std::array<ck::index_t, NumDimSpatial> filter_strides{1, 1};
|
||||
std::array<ck::index_t, NumDimSpatial> filter_dilations{1, 1};
|
||||
@@ -64,7 +64,7 @@ int main()
|
||||
std::array<ck::index_t, NumDimSpatial> input_right_pads{1, 1};
|
||||
|
||||
SimpleDeviceMem in(sizeof(InDataType) * N * Hi * Wi * G * C);
|
||||
SimpleDeviceMem out(sizeof(OutDataType) * N * Ho * Wo * Y * X * C);
|
||||
SimpleDeviceMem out(sizeof(OutDataType) * G * N * Ho * Wo * Y * X * C);
|
||||
|
||||
using namespace ck::conv_tensor_rearrange_op;
|
||||
|
||||
@@ -93,6 +93,7 @@ int main()
|
||||
auto& op_ptr = op_ptrs[i];
|
||||
auto argument_ptr = op_ptr->MakeArgumentPointer(in.GetDeviceBuffer(),
|
||||
out.GetDeviceBuffer(),
|
||||
G,
|
||||
N,
|
||||
C,
|
||||
in_spatial_lengths,
|
||||
@@ -112,7 +113,7 @@ int main()
|
||||
float avg_time = invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, true});
|
||||
|
||||
std::size_t num_bytes = sizeof(InDataType) * N * Hi * Wi * G * C +
|
||||
sizeof(OutDataType) * N * Ho * Wo * Y * X * C;
|
||||
sizeof(OutDataType) * G * N * Ho * Wo * Y * X * C;
|
||||
|
||||
float gb_per_sec = num_bytes / 1.E6 / avg_time;
|
||||
|
||||
@@ -149,6 +150,7 @@ int main()
|
||||
<< std::endl;
|
||||
auto argument_ptr = op_ptr->MakeArgumentPointer(in.GetDeviceBuffer(),
|
||||
out.GetDeviceBuffer(),
|
||||
G,
|
||||
N,
|
||||
C,
|
||||
in_spatial_lengths,
|
||||
|
||||
@@ -20,7 +20,7 @@ using DeviceColToImgInstance = ck::tensor_operation::device::DeviceColumnToImage
|
||||
|
||||
bool RunColumnToImage(const ExecutionConfig& config, const ck::utils::conv::ConvParam& conv_params)
|
||||
{
|
||||
|
||||
const auto G = conv_params.G_;
|
||||
const auto N = conv_params.N_;
|
||||
const auto C = conv_params.C_;
|
||||
|
||||
@@ -31,7 +31,7 @@ bool RunColumnToImage(const ExecutionConfig& config, const ck::utils::conv::Conv
|
||||
C * ck::accumulate_n<ck::index_t>(
|
||||
conv_params.filter_spatial_lengths_.begin(), NDimSpatial, 1, std::multiplies<>());
|
||||
|
||||
const auto in_desc = HostTensorDescriptor({NDoHoWo, CZYX});
|
||||
const auto in_desc = HostTensorDescriptor({G, NDoHoWo, CZYX});
|
||||
const auto out_desc =
|
||||
ck::utils::conv::make_input_host_tensor_descriptor_g_n_c_wis_packed<ImLayout>(conv_params);
|
||||
|
||||
@@ -39,7 +39,7 @@ bool RunColumnToImage(const ExecutionConfig& config, const ck::utils::conv::Conv
|
||||
std::array<ck::index_t, NDimSpatial> filter_spatial_lengths{};
|
||||
std::array<ck::index_t, NDimSpatial> output_spatial_lengths{};
|
||||
std::array<ck::index_t, NDimSpatial + 3> image_g_n_c_wis_strides{};
|
||||
std::array<ck::index_t, 2> gemm_m_k_strides{};
|
||||
std::array<ck::index_t, 3> gemm_g_m_k_strides{};
|
||||
std::array<ck::index_t, NDimSpatial> conv_filter_strides{};
|
||||
std::array<ck::index_t, NDimSpatial> conv_filter_dilations{};
|
||||
std::array<ck::index_t, NDimSpatial> input_left_pads{};
|
||||
@@ -50,7 +50,7 @@ bool RunColumnToImage(const ExecutionConfig& config, const ck::utils::conv::Conv
|
||||
copy(conv_params.input_spatial_lengths_, input_spatial_lengths);
|
||||
copy(conv_params.filter_spatial_lengths_, filter_spatial_lengths);
|
||||
copy(conv_params.output_spatial_lengths_, output_spatial_lengths);
|
||||
copy(in_desc.GetStrides(), gemm_m_k_strides);
|
||||
copy(in_desc.GetStrides(), gemm_g_m_k_strides);
|
||||
copy(out_desc.GetStrides(), image_g_n_c_wis_strides);
|
||||
copy(conv_params.conv_filter_strides_, conv_filter_strides);
|
||||
copy(conv_params.conv_filter_dilations_, conv_filter_dilations);
|
||||
@@ -86,13 +86,14 @@ bool RunColumnToImage(const ExecutionConfig& config, const ck::utils::conv::Conv
|
||||
auto invoker = col2img.MakeInvoker();
|
||||
auto argument = col2img.MakeArgument(in_device_buf.GetDeviceBuffer(),
|
||||
out_device_buf.GetDeviceBuffer(),
|
||||
G,
|
||||
N,
|
||||
C,
|
||||
input_spatial_lengths,
|
||||
filter_spatial_lengths,
|
||||
output_spatial_lengths,
|
||||
image_g_n_c_wis_strides,
|
||||
gemm_m_k_strides,
|
||||
gemm_g_m_k_strides,
|
||||
conv_filter_strides,
|
||||
conv_filter_dilations,
|
||||
input_left_pads,
|
||||
@@ -108,7 +109,7 @@ bool RunColumnToImage(const ExecutionConfig& config, const ck::utils::conv::Conv
|
||||
}
|
||||
|
||||
float ave_time = invoker.Run(argument, StreamConfig{nullptr, config.time_kernel});
|
||||
std::size_t num_btype = NDoHoWo * CZYX * (sizeof(OutDataType) + sizeof(InDataType));
|
||||
std::size_t num_btype = G * NDoHoWo * CZYX * (sizeof(OutDataType) + sizeof(InDataType));
|
||||
float gb_per_sec = num_btype / 1.E6 / ave_time;
|
||||
std::cout << "Perf: " << ave_time << " ms, " << gb_per_sec << " GB/s" << std::endl;
|
||||
|
||||
|
||||
@@ -20,7 +20,7 @@ using DeviceImgToColInstance = ck::tensor_operation::device::DeviceImageToColumn
|
||||
|
||||
bool RunImageToColumn(const ExecutionConfig& config, const ck::utils::conv::ConvParam& conv_params)
|
||||
{
|
||||
|
||||
const auto G = conv_params.G_;
|
||||
const auto N = conv_params.N_;
|
||||
const auto C = conv_params.C_;
|
||||
|
||||
@@ -33,13 +33,13 @@ bool RunImageToColumn(const ExecutionConfig& config, const ck::utils::conv::Conv
|
||||
|
||||
const auto in_desc =
|
||||
ck::utils::conv::make_input_host_tensor_descriptor_g_n_c_wis_packed<ImLayout>(conv_params);
|
||||
const auto out_desc = HostTensorDescriptor({NDoHoWo, CZYX});
|
||||
const auto out_desc = HostTensorDescriptor({G, NDoHoWo, CZYX});
|
||||
|
||||
std::array<ck::index_t, NDimSpatial> input_spatial_lengths{};
|
||||
std::array<ck::index_t, NDimSpatial> filter_spatial_lengths{};
|
||||
std::array<ck::index_t, NDimSpatial> output_spatial_lengths{};
|
||||
std::array<ck::index_t, NDimSpatial + 3> image_g_n_c_wis_strides{};
|
||||
std::array<ck::index_t, 2> gemm_m_k_strides{};
|
||||
std::array<ck::index_t, 3> gemm_g_m_k_strides{};
|
||||
std::array<ck::index_t, NDimSpatial> conv_filter_strides{};
|
||||
std::array<ck::index_t, NDimSpatial> conv_filter_dilations{};
|
||||
std::array<ck::index_t, NDimSpatial> input_left_pads{};
|
||||
@@ -51,7 +51,7 @@ bool RunImageToColumn(const ExecutionConfig& config, const ck::utils::conv::Conv
|
||||
copy(conv_params.filter_spatial_lengths_, filter_spatial_lengths);
|
||||
copy(conv_params.output_spatial_lengths_, output_spatial_lengths);
|
||||
copy(in_desc.GetStrides(), image_g_n_c_wis_strides);
|
||||
copy(out_desc.GetStrides(), gemm_m_k_strides);
|
||||
copy(out_desc.GetStrides(), gemm_g_m_k_strides);
|
||||
copy(conv_params.conv_filter_strides_, conv_filter_strides);
|
||||
copy(conv_params.conv_filter_dilations_, conv_filter_dilations);
|
||||
copy(conv_params.input_left_pads_, input_left_pads);
|
||||
@@ -86,13 +86,14 @@ bool RunImageToColumn(const ExecutionConfig& config, const ck::utils::conv::Conv
|
||||
auto invoker = img2col.MakeInvoker();
|
||||
auto argument = img2col.MakeArgument(in_device_buf.GetDeviceBuffer(),
|
||||
out_device_buf.GetDeviceBuffer(),
|
||||
G,
|
||||
N,
|
||||
C,
|
||||
input_spatial_lengths,
|
||||
filter_spatial_lengths,
|
||||
output_spatial_lengths,
|
||||
image_g_n_c_wis_strides,
|
||||
gemm_m_k_strides,
|
||||
gemm_g_m_k_strides,
|
||||
conv_filter_strides,
|
||||
conv_filter_dilations,
|
||||
input_left_pads,
|
||||
@@ -108,7 +109,7 @@ bool RunImageToColumn(const ExecutionConfig& config, const ck::utils::conv::Conv
|
||||
}
|
||||
|
||||
float ave_time = invoker.Run(argument, StreamConfig{nullptr, config.time_kernel});
|
||||
std::size_t num_btype = NDoHoWo * CZYX * (sizeof(OutDataType) + sizeof(InDataType));
|
||||
std::size_t num_btype = G * NDoHoWo * CZYX * (sizeof(OutDataType) + sizeof(InDataType));
|
||||
float gb_per_sec = num_btype / 1.E6 / ave_time;
|
||||
std::cout << "Perf: " << ave_time << " ms, " << gb_per_sec << " GB/s" << std::endl;
|
||||
|
||||
|
||||
@@ -14,11 +14,12 @@ namespace device {
|
||||
/**
|
||||
* \brief Convolution Tensor Rearrange.
|
||||
*
|
||||
* This Device operator supports conversion image ([G, N, Di, Hi, Wi, C]) to
|
||||
* the gemm problem([N * Do * Ho * Wo, Z * Y * X * C]) (Image to Column) and
|
||||
* conversion gemm form to the image (Column to Image).
|
||||
*
|
||||
* Note that G must be equal to 1.
|
||||
* This Device operator supports converting an image to
|
||||
* the GEMM representation (Image to Column) and
|
||||
* converting a GEMM form to the image (Column to Image).
|
||||
* Supported layouts:
|
||||
* [G, N, Di, Hi, Wi, C] <-> [G, N * Do * Ho * Wo, Z * Y * X * C]
|
||||
* [N, Di, Hi, Wi, G, C] <-> [N * Do * Ho * Wo, G, Z * Y * X * C]
|
||||
*
|
||||
* \tparam NDimSpatial Number of spatial dimensions.
|
||||
* \tparam ImageLayout Input Layout.
|
||||
@@ -39,13 +40,14 @@ struct DeviceConvTensorRearrange : public BaseOperator
|
||||
*
|
||||
* \param p_in A pointer to the device memory of the input image.
|
||||
* \param p_out A pointer to the device memory of the output.
|
||||
* \param G Convolution number of groups.
|
||||
* \param N Convolution batch size.
|
||||
* \param C Convolution number of channels.
|
||||
* \param input_spatial_lengths Input spatial lengths.
|
||||
* \param filter_spatial_lengths Filter spatial lengths.
|
||||
* \param output_spatial_lengths Output spatial lengths.
|
||||
* \param image_g_n_c_wis_strides Image strides in order [G, N, C, D, H, W].
|
||||
* \param gemm_m_k_strides Gemm form strides.
|
||||
* \param gemm_g_m_k_strides Gemm form strides.
|
||||
* \param conv_filter_strides Convolution filter strides.
|
||||
* \param conv_filter_dilations Convolution filter dilations.
|
||||
* \param input_left_pads Convolution left pads.
|
||||
@@ -55,13 +57,14 @@ struct DeviceConvTensorRearrange : public BaseOperator
|
||||
virtual std::unique_ptr<BaseArgument>
|
||||
MakeArgumentPointer(const void* p_in,
|
||||
void* p_out,
|
||||
const ck::index_t G,
|
||||
const ck::index_t N,
|
||||
const ck::index_t C,
|
||||
const std::array<index_t, NDimSpatial>& input_spatial_lengths,
|
||||
const std::array<index_t, NDimSpatial>& filter_spatial_lengths,
|
||||
const std::array<index_t, NDimSpatial>& output_spatial_lengths,
|
||||
const std::array<index_t, NDimSpatial + 3>& image_g_n_c_wis_strides,
|
||||
const std::array<index_t, 2>& gemm_m_k_strides,
|
||||
const std::array<index_t, 3>& gemm_g_m_k_strides,
|
||||
const std::array<index_t, NDimSpatial>& conv_filter_strides,
|
||||
const std::array<index_t, NDimSpatial>& conv_filter_dilations,
|
||||
const std::array<index_t, NDimSpatial>& input_left_pads,
|
||||
|
||||
@@ -17,15 +17,18 @@
|
||||
#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/conv_tensor_rearrange_op.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/impl/device_grouped_conv_utils.hpp"
|
||||
#include "ck/host_utility/io.hpp"
|
||||
|
||||
namespace ck {
|
||||
namespace tensor_operation {
|
||||
namespace device {
|
||||
|
||||
// Image to column for input layout NDHWC:
|
||||
// input : image converted to the gemm problem [N * Do * Ho * Wo, Z * Y * X * C]
|
||||
// output : image [N, Di, Hi, Wi, C]
|
||||
// Column to Image:
|
||||
// input : gemm form [G, N * Do * Ho * Wo, Z * Y * X * C]
|
||||
// output : input image [G, N, Di, Hi, Wi, C]
|
||||
// input : gemm form [N * Do * Ho * Wo, G, Z * Y * X * C]
|
||||
// output : input image [N, Di, Hi, Wi, G, C]
|
||||
template <index_t NDimSpatial,
|
||||
typename ImageLayout,
|
||||
typename InputDataType,
|
||||
@@ -43,6 +46,14 @@ struct DeviceColumnToImageImpl
|
||||
OutputDataType,
|
||||
conv_tensor_rearrange_op::ColumnToImage>
|
||||
{
|
||||
static constexpr bool is_NSpatialGC =
|
||||
std::is_same_v<ImageLayout, tensor_layout::convolution::NWGC> ||
|
||||
std::is_same_v<ImageLayout, tensor_layout::convolution::NHWGC> ||
|
||||
std::is_same_v<ImageLayout, tensor_layout::convolution::NDHWGC>;
|
||||
static constexpr bool is_GNSpatialC =
|
||||
std::is_same_v<ImageLayout, tensor_layout::convolution::GNWC> ||
|
||||
std::is_same_v<ImageLayout, tensor_layout::convolution::GNHWC> ||
|
||||
std::is_same_v<ImageLayout, tensor_layout::convolution::GNDHWC>;
|
||||
|
||||
static constexpr auto I0 = Number<0>{};
|
||||
static constexpr auto I1 = Number<1>{};
|
||||
@@ -90,7 +101,7 @@ struct DeviceColumnToImageImpl
|
||||
const std::array<index_t, NDimSpatial>& filter_spatial_lengths,
|
||||
const std::array<index_t, NDimSpatial>& output_spatial_lengths,
|
||||
const std::array<index_t, NDimSpatial>& conv_filter_strides,
|
||||
const std::array<index_t, 2>& gemm_m_k_strides,
|
||||
const std::array<index_t, 3>& gemm_g_m_k_strides,
|
||||
const std::array<index_t, NDimSpatial>& independent_filters,
|
||||
const std::array<index_t, NDimSpatial>& effs)
|
||||
{
|
||||
@@ -100,23 +111,23 @@ struct DeviceColumnToImageImpl
|
||||
C * ck::accumulate_n<index_t>(
|
||||
filter_spatial_lengths.begin(), NDimSpatial, 1, std::multiplies<>());
|
||||
|
||||
const index_t NStride = DoHoWo * gemm_m_k_strides[I0] * gemm_m_k_strides[I1];
|
||||
const index_t NStride = DoHoWo * gemm_g_m_k_strides[I1] * gemm_g_m_k_strides[I2];
|
||||
// Calculate the appropriate stride for each set of independent filters
|
||||
// in each dimension
|
||||
const index_t WStride =
|
||||
math::integer_divide_ceil(effs[XIdx], conv_filter_strides[XIdx]) * gemm_m_k_strides[I0];
|
||||
const index_t WStride = math::integer_divide_ceil(effs[XIdx], conv_filter_strides[XIdx]) *
|
||||
gemm_g_m_k_strides[I1];
|
||||
const index_t HStride = math::integer_divide_ceil(effs[YIdx], conv_filter_strides[YIdx]) *
|
||||
output_spatial_lengths[XIdx] * gemm_m_k_strides[I0];
|
||||
output_spatial_lengths[XIdx] * gemm_g_m_k_strides[I1];
|
||||
const index_t DStride = math::integer_divide_ceil(effs[ZIdx], conv_filter_strides[ZIdx]) *
|
||||
output_spatial_lengths[YIdx] * output_spatial_lengths[XIdx] *
|
||||
gemm_m_k_strides[I0];
|
||||
gemm_g_m_k_strides[I1];
|
||||
// Create descriptor for independent filters in each dimension and
|
||||
// then merge them into column form
|
||||
if constexpr(NDimSpatial == 1)
|
||||
{
|
||||
const auto desc_gemm_form =
|
||||
make_naive_tensor_descriptor(make_tuple(N, independent_filters[XIdx], CZYX),
|
||||
make_tuple(NStride, WStride, gemm_m_k_strides[I1]));
|
||||
make_tuple(NStride, WStride, gemm_g_m_k_strides[I2]));
|
||||
const auto desc_gemm_form_merged_filters = transform_tensor_descriptor(
|
||||
desc_gemm_form,
|
||||
make_tuple(make_merge_transform(make_tuple(N, independent_filters[XIdx])),
|
||||
@@ -130,7 +141,7 @@ struct DeviceColumnToImageImpl
|
||||
{
|
||||
const auto desc_gemm_form = make_naive_tensor_descriptor(
|
||||
make_tuple(N, independent_filters[YIdx], independent_filters[XIdx], CZYX),
|
||||
make_tuple(NStride, HStride, WStride, gemm_m_k_strides[I1]));
|
||||
make_tuple(NStride, HStride, WStride, gemm_g_m_k_strides[I2]));
|
||||
const auto desc_gemm_form_merged_filters = transform_tensor_descriptor(
|
||||
desc_gemm_form,
|
||||
make_tuple(make_merge_transform(
|
||||
@@ -149,7 +160,7 @@ struct DeviceColumnToImageImpl
|
||||
independent_filters[YIdx],
|
||||
independent_filters[XIdx],
|
||||
CZYX),
|
||||
make_tuple(NStride, DStride, HStride, WStride, gemm_m_k_strides[I1]));
|
||||
make_tuple(NStride, DStride, HStride, WStride, gemm_g_m_k_strides[I2]));
|
||||
const auto desc_gemm_form_merged_filters = transform_tensor_descriptor(
|
||||
desc_gemm_form,
|
||||
make_tuple(make_merge_transform(make_tuple(N,
|
||||
@@ -252,34 +263,38 @@ struct DeviceColumnToImageImpl
|
||||
decltype(BlockToCTileMap_M00_N0_M01Adapt<MPerBlock, KPerBlock, InputGridDesc>(
|
||||
InputGridDesc{}))>;
|
||||
|
||||
using GridwiseTensorRearrangeKernel = GridwiseTensorRearrange<InputGridDesc,
|
||||
InputDataType,
|
||||
OutputGridDesc,
|
||||
OutputDataType,
|
||||
BlockSize,
|
||||
MPerBlock,
|
||||
KPerBlock,
|
||||
ThreadClusterLengths,
|
||||
ScalarPerVector,
|
||||
InMemoryDataOperationEnum::Add,
|
||||
Block2ETileMap>;
|
||||
using GridwiseTensorRearrangeKernel =
|
||||
GridwiseTensorRearrange<InputGridDesc,
|
||||
InputDataType,
|
||||
OutputGridDesc,
|
||||
OutputDataType,
|
||||
BlockSize,
|
||||
MPerBlock,
|
||||
KPerBlock,
|
||||
ThreadClusterLengths,
|
||||
ScalarPerVector,
|
||||
InMemoryDataOperationEnum::Add,
|
||||
Block2ETileMap,
|
||||
ComputePtrOffsetOfStridedBatch<I0>>;
|
||||
|
||||
struct Argument : public BaseArgument
|
||||
{
|
||||
Argument(const void* p_in, // input image
|
||||
void* p_out, // output image
|
||||
const ck::index_t G,
|
||||
const ck::index_t N,
|
||||
const ck::index_t C,
|
||||
const std::array<index_t, NDimSpatial>& input_spatial_lengths,
|
||||
const std::array<index_t, NDimSpatial>& filter_spatial_lengths,
|
||||
const std::array<index_t, NDimSpatial>& output_spatial_lengths,
|
||||
const std::array<index_t, NDimSpatial + 3>& image_g_n_c_wis_strides,
|
||||
const std::array<index_t, 2>& gemm_m_k_strides,
|
||||
const std::array<index_t, 3>& gemm_g_m_k_strides,
|
||||
const std::array<index_t, NDimSpatial>& conv_filter_strides,
|
||||
const std::array<index_t, NDimSpatial>& conv_filter_dilations,
|
||||
const std::array<index_t, NDimSpatial>& input_left_pads,
|
||||
const std::array<index_t, NDimSpatial>& input_right_pads)
|
||||
: C_(C),
|
||||
: G_(G),
|
||||
C_(C),
|
||||
X_(filter_spatial_lengths[NDimSpatial - I1]),
|
||||
p_in_{static_cast<const InputDataType*>(p_in)},
|
||||
p_out_{static_cast<OutputDataType*>(p_out)},
|
||||
@@ -289,6 +304,9 @@ struct DeviceColumnToImageImpl
|
||||
input_left_pads_{input_left_pads},
|
||||
input_right_pads_{input_right_pads}
|
||||
{
|
||||
compute_ptr_offset_of_batch_.BatchStrideA_ = gemm_g_m_k_strides[I0];
|
||||
compute_ptr_offset_of_batch_.BatchStrideC_ = image_g_n_c_wis_strides[I0];
|
||||
|
||||
const index_t x_eff =
|
||||
(filter_spatial_lengths[XIdx] - 1) * conv_filter_dilations[XIdx] + 1;
|
||||
const index_t y_eff =
|
||||
@@ -354,7 +372,7 @@ struct DeviceColumnToImageImpl
|
||||
filter_spatial_lengths,
|
||||
output_spatial_lengths,
|
||||
conv_filter_strides,
|
||||
gemm_m_k_strides,
|
||||
gemm_g_m_k_strides,
|
||||
independent_filters,
|
||||
effs);
|
||||
const auto out_grid_desc_m_k =
|
||||
@@ -387,10 +405,9 @@ struct DeviceColumnToImageImpl
|
||||
// Memory offsets to next set of independent filters,
|
||||
// move to independent filters in each dimension
|
||||
const index_t in_offset =
|
||||
x_idx * gemm_m_k_strides[0] +
|
||||
y_idx * gemm_m_k_strides[0] * output_spatial_lengths[XIdx] +
|
||||
z_idx * gemm_m_k_strides[0] * output_spatial_lengths[YIdx] *
|
||||
output_spatial_lengths[XIdx];
|
||||
(x_idx + y_idx * output_spatial_lengths[XIdx] +
|
||||
z_idx * output_spatial_lengths[YIdx] * output_spatial_lengths[XIdx]) *
|
||||
gemm_g_m_k_strides[I1];
|
||||
// Move to independent filters in appropriate dimensions
|
||||
const index_t out_offset =
|
||||
x_offset_with_pad * image_g_n_c_wis_strides[spatial_offset + XIdx] +
|
||||
@@ -417,6 +434,7 @@ struct DeviceColumnToImageImpl
|
||||
}
|
||||
}
|
||||
|
||||
const ck::index_t G_;
|
||||
const ck::index_t C_;
|
||||
const ck::index_t X_;
|
||||
|
||||
@@ -434,6 +452,8 @@ struct DeviceColumnToImageImpl
|
||||
|
||||
std::vector<const InputDataType*> p_in_container_;
|
||||
std::vector<OutputDataType*> p_out_container_;
|
||||
|
||||
ComputePtrOffsetOfStridedBatch<I0> compute_ptr_offset_of_batch_;
|
||||
};
|
||||
|
||||
struct Invoker : public BaseInvoker
|
||||
@@ -451,6 +471,7 @@ struct DeviceColumnToImageImpl
|
||||
OutputGridDesc,
|
||||
OutputDataType,
|
||||
Block2ETileMap,
|
||||
ComputePtrOffsetOfStridedBatch<I0>,
|
||||
GridwiseTensorRearrangeKernel>;
|
||||
|
||||
// Execute each set of independent filters
|
||||
@@ -460,7 +481,7 @@ struct DeviceColumnToImageImpl
|
||||
BlockToCTileMap_M00_N0_M01Adapt<MPerBlock, KPerBlock, InputGridDesc>(
|
||||
arg.out_grid_desc_m_k_container_[i]);
|
||||
const index_t grid_size =
|
||||
block_2_tile_map.CalculateGridSize(arg.in_grid_desc_m_k_container_[i]);
|
||||
block_2_tile_map.CalculateGridSize(arg.in_grid_desc_m_k_container_[i]) * arg.G_;
|
||||
elapsed_time += launch_and_time_kernel(stream_config,
|
||||
kernel,
|
||||
dim3(grid_size),
|
||||
@@ -470,7 +491,9 @@ struct DeviceColumnToImageImpl
|
||||
arg.p_in_container_[i],
|
||||
arg.out_grid_desc_m_k_container_[i],
|
||||
arg.p_out_container_[i],
|
||||
block_2_tile_map);
|
||||
arg.G_,
|
||||
block_2_tile_map,
|
||||
arg.compute_ptr_offset_of_batch_);
|
||||
}
|
||||
return elapsed_time;
|
||||
}
|
||||
@@ -485,8 +508,7 @@ struct DeviceColumnToImageImpl
|
||||
bool IsSupportedArgument(const Argument& arg)
|
||||
{
|
||||
using namespace tensor_layout::convolution;
|
||||
if constexpr(!(std::is_same_v<ImageLayout, GNWC> || std::is_same_v<ImageLayout, GNHWC> ||
|
||||
std::is_same_v<ImageLayout, GNDHWC>))
|
||||
if constexpr(!(is_NSpatialGC || is_GNSpatialC))
|
||||
{
|
||||
return false;
|
||||
}
|
||||
@@ -534,13 +556,14 @@ struct DeviceColumnToImageImpl
|
||||
|
||||
static auto MakeArgument(const void* p_in, // input image
|
||||
void* p_out, // output image
|
||||
const ck::index_t G,
|
||||
const ck::index_t N,
|
||||
const ck::index_t C,
|
||||
const std::array<index_t, NDimSpatial>& input_spatial_lengths,
|
||||
const std::array<index_t, NDimSpatial>& filter_spatial_lengths,
|
||||
const std::array<index_t, NDimSpatial>& output_spatial_lengths,
|
||||
const std::array<index_t, NDimSpatial + 3>& image_g_n_c_wis_strides,
|
||||
const std::array<index_t, 2>& gemm_m_k_strides,
|
||||
const std::array<index_t, 3>& gemm_g_m_k_strides,
|
||||
const std::array<index_t, NDimSpatial>& conv_filter_strides,
|
||||
const std::array<index_t, NDimSpatial>& conv_filter_dilations,
|
||||
const std::array<index_t, NDimSpatial>& input_left_pads,
|
||||
@@ -548,13 +571,14 @@ struct DeviceColumnToImageImpl
|
||||
{
|
||||
return Argument{static_cast<const InputDataType*>(p_in),
|
||||
static_cast<OutputDataType*>(p_out),
|
||||
G,
|
||||
N,
|
||||
C,
|
||||
input_spatial_lengths,
|
||||
filter_spatial_lengths,
|
||||
output_spatial_lengths,
|
||||
image_g_n_c_wis_strides,
|
||||
gemm_m_k_strides,
|
||||
gemm_g_m_k_strides,
|
||||
conv_filter_strides,
|
||||
conv_filter_dilations,
|
||||
input_left_pads,
|
||||
@@ -566,13 +590,14 @@ struct DeviceColumnToImageImpl
|
||||
std::unique_ptr<BaseArgument>
|
||||
MakeArgumentPointer(const void* p_in, // input image
|
||||
void* p_out, // output image
|
||||
const ck::index_t G,
|
||||
const ck::index_t N,
|
||||
const ck::index_t C,
|
||||
const std::array<index_t, NDimSpatial>& input_spatial_lengths,
|
||||
const std::array<index_t, NDimSpatial>& filter_spatial_lengths,
|
||||
const std::array<index_t, NDimSpatial>& output_spatial_lengths,
|
||||
const std::array<index_t, NDimSpatial + 3>& image_g_n_c_wis_strides,
|
||||
const std::array<index_t, 2>& gemm_m_k_strides,
|
||||
const std::array<index_t, 3>& gemm_g_m_k_strides,
|
||||
const std::array<index_t, NDimSpatial>& conv_filter_strides,
|
||||
const std::array<index_t, NDimSpatial>& conv_filter_dilations,
|
||||
const std::array<index_t, NDimSpatial>& input_left_pads,
|
||||
@@ -580,13 +605,14 @@ struct DeviceColumnToImageImpl
|
||||
{
|
||||
return std::make_unique<Argument>(static_cast<const InputDataType*>(p_in),
|
||||
static_cast<OutputDataType*>(p_out),
|
||||
G,
|
||||
N,
|
||||
C,
|
||||
input_spatial_lengths,
|
||||
filter_spatial_lengths,
|
||||
output_spatial_lengths,
|
||||
image_g_n_c_wis_strides,
|
||||
gemm_m_k_strides,
|
||||
gemm_g_m_k_strides,
|
||||
conv_filter_strides,
|
||||
conv_filter_dilations,
|
||||
input_left_pads,
|
||||
|
||||
@@ -15,15 +15,18 @@
|
||||
#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/conv_tensor_rearrange_op.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/impl/device_grouped_conv_utils.hpp"
|
||||
#include "ck/host_utility/io.hpp"
|
||||
|
||||
namespace ck {
|
||||
namespace tensor_operation {
|
||||
namespace device {
|
||||
|
||||
// Image to column for input layout NDHWC:
|
||||
// input : input image [N, Di, Hi, Wi, C]
|
||||
// output : gemm form [N * Do * Ho * Wo, Z * Y * X * C]
|
||||
// Image to column:
|
||||
// input : input image [G, N, Di, Hi, Wi, C]
|
||||
// output : gemm form [G * N * Do * Ho * Wo, Z * Y * X * C]
|
||||
// input : input image [N, Di, Hi, Wi, G, C]
|
||||
// output : gemm form [N * Do * Ho * Wo * G, Z * Y * X * C]
|
||||
template <index_t NDimSpatial,
|
||||
typename ImageLayout,
|
||||
typename InputDataType,
|
||||
@@ -41,6 +44,14 @@ struct DeviceImageToColumnImpl
|
||||
OutputDataType,
|
||||
conv_tensor_rearrange_op::ImageToColumn>
|
||||
{
|
||||
static constexpr bool is_NSpatialGC =
|
||||
std::is_same_v<ImageLayout, tensor_layout::convolution::NWGC> ||
|
||||
std::is_same_v<ImageLayout, tensor_layout::convolution::NHWGC> ||
|
||||
std::is_same_v<ImageLayout, tensor_layout::convolution::NDHWGC>;
|
||||
static constexpr bool is_GNSpatialC =
|
||||
std::is_same_v<ImageLayout, tensor_layout::convolution::GNWC> ||
|
||||
std::is_same_v<ImageLayout, tensor_layout::convolution::GNHWC> ||
|
||||
std::is_same_v<ImageLayout, tensor_layout::convolution::GNDHWC>;
|
||||
|
||||
static constexpr auto I0 = Number<0>{};
|
||||
static constexpr auto I1 = Number<1>{};
|
||||
@@ -109,7 +120,7 @@ struct DeviceImageToColumnImpl
|
||||
const ck::index_t C,
|
||||
const std::array<index_t, NDimSpatial>& filter_spatial_lengths,
|
||||
const std::array<index_t, NDimSpatial>& output_spatial_lengths,
|
||||
const std::array<index_t, 2>& gemm_m_k_strides)
|
||||
const std::array<index_t, 3>& gemm_g_m_k_strides)
|
||||
{
|
||||
const index_t NDoHoWo =
|
||||
N * ck::accumulate_n<index_t>(
|
||||
@@ -117,11 +128,10 @@ struct DeviceImageToColumnImpl
|
||||
const index_t CZYX =
|
||||
C * ck::accumulate_n<index_t>(
|
||||
filter_spatial_lengths.begin(), NDimSpatial, 1, std::multiplies<>());
|
||||
const auto desc_mraw_kraw = make_naive_tensor_descriptor(
|
||||
make_tuple(NDoHoWo, CZYX), make_tuple(gemm_m_k_strides[I0], gemm_m_k_strides[I1]));
|
||||
|
||||
const auto desc_m_k = matrix_padder.PadADescriptor_M_K(desc_mraw_kraw);
|
||||
return desc_m_k;
|
||||
const auto desc_mraw_kraw = make_naive_tensor_descriptor(
|
||||
make_tuple(NDoHoWo, CZYX), make_tuple(gemm_g_m_k_strides[I1], gemm_g_m_k_strides[I2]));
|
||||
return matrix_padder.PadADescriptor_M_K(desc_mraw_kraw);
|
||||
}
|
||||
|
||||
using InputGridDesc =
|
||||
@@ -132,34 +142,38 @@ struct DeviceImageToColumnImpl
|
||||
decltype(BlockToCTileMap_M00_N0_M01Adapt<MPerBlock, KPerBlock, OutputGridDesc>(
|
||||
OutputGridDesc{}))>;
|
||||
|
||||
using GridwiseTensorRearrangeKernel = GridwiseTensorRearrange<InputGridDesc,
|
||||
InputDataType,
|
||||
OutputGridDesc,
|
||||
OutputDataType,
|
||||
BlockSize,
|
||||
MPerBlock,
|
||||
KPerBlock,
|
||||
ThreadClusterLengths,
|
||||
ScalarPerVector,
|
||||
InMemoryDataOperationEnum::Set,
|
||||
Block2ETileMap>;
|
||||
using GridwiseTensorRearrangeKernel =
|
||||
GridwiseTensorRearrange<InputGridDesc,
|
||||
InputDataType,
|
||||
OutputGridDesc,
|
||||
OutputDataType,
|
||||
BlockSize,
|
||||
MPerBlock,
|
||||
KPerBlock,
|
||||
ThreadClusterLengths,
|
||||
ScalarPerVector,
|
||||
InMemoryDataOperationEnum::Set,
|
||||
Block2ETileMap,
|
||||
ComputePtrOffsetOfStridedBatch<I0>>;
|
||||
|
||||
struct Argument : public BaseArgument
|
||||
{
|
||||
Argument(const void* p_in, // input image
|
||||
void* p_out, // gemm form
|
||||
const ck::index_t G,
|
||||
const ck::index_t N,
|
||||
const ck::index_t C,
|
||||
const std::array<index_t, NDimSpatial>& input_spatial_lengths,
|
||||
const std::array<index_t, NDimSpatial>& filter_spatial_lengths,
|
||||
const std::array<index_t, NDimSpatial>& output_spatial_lengths,
|
||||
const std::array<index_t, NDimSpatial + 3>& image_g_n_c_wis_strides,
|
||||
const std::array<index_t, 2>& gemm_m_k_strides,
|
||||
const std::array<index_t, 3>& gemm_g_m_k_strides,
|
||||
const std::array<index_t, NDimSpatial>& conv_filter_strides,
|
||||
const std::array<index_t, NDimSpatial>& conv_filter_dilations,
|
||||
const std::array<index_t, NDimSpatial>& input_left_pads,
|
||||
const std::array<index_t, NDimSpatial>& input_right_pads)
|
||||
: C_(C),
|
||||
: G_(G),
|
||||
C_(C),
|
||||
X_(filter_spatial_lengths[NDimSpatial - I1]),
|
||||
p_in_{static_cast<const InputDataType*>(p_in)},
|
||||
p_out_{static_cast<OutputDataType*>(p_out)},
|
||||
@@ -176,14 +190,16 @@ struct DeviceImageToColumnImpl
|
||||
filter_spatial_lengths,
|
||||
output_spatial_lengths,
|
||||
image_g_n_c_wis_strides,
|
||||
|
||||
conv_filter_strides,
|
||||
conv_filter_dilations,
|
||||
input_left_pads,
|
||||
input_right_pads);
|
||||
|
||||
out_grid_desc_m_k_ = MakeOutDescriptor_M_K(
|
||||
N, C, filter_spatial_lengths, output_spatial_lengths, gemm_m_k_strides);
|
||||
N, C, filter_spatial_lengths, output_spatial_lengths, gemm_g_m_k_strides);
|
||||
|
||||
compute_ptr_offset_of_batch_.BatchStrideA_ = image_g_n_c_wis_strides[I0];
|
||||
compute_ptr_offset_of_batch_.BatchStrideC_ = gemm_g_m_k_strides[I0];
|
||||
}
|
||||
|
||||
void Print() const
|
||||
@@ -192,6 +208,7 @@ struct DeviceImageToColumnImpl
|
||||
std::cout << out_grid_desc_m_k_ << std::endl;
|
||||
}
|
||||
|
||||
const ck::index_t G_;
|
||||
const ck::index_t C_;
|
||||
const ck::index_t X_;
|
||||
|
||||
@@ -206,6 +223,8 @@ struct DeviceImageToColumnImpl
|
||||
|
||||
InputGridDesc in_grid_desc_m_k_;
|
||||
OutputGridDesc out_grid_desc_m_k_;
|
||||
|
||||
ComputePtrOffsetOfStridedBatch<I0> compute_ptr_offset_of_batch_;
|
||||
};
|
||||
|
||||
struct Invoker : public BaseInvoker
|
||||
@@ -220,12 +239,14 @@ struct DeviceImageToColumnImpl
|
||||
const auto block_2_tile_map =
|
||||
BlockToCTileMap_M00_N0_M01Adapt<MPerBlock, KPerBlock, OutputGridDesc>(
|
||||
arg.out_grid_desc_m_k_);
|
||||
const index_t grid_size = block_2_tile_map.CalculateGridSize(arg.out_grid_desc_m_k_);
|
||||
const auto kernel = kernel_tensor_rearrange<InputGridDesc,
|
||||
const index_t grid_size =
|
||||
block_2_tile_map.CalculateGridSize(arg.out_grid_desc_m_k_) * arg.G_;
|
||||
const auto kernel = kernel_tensor_rearrange<InputGridDesc,
|
||||
InputDataType,
|
||||
OutputGridDesc,
|
||||
OutputDataType,
|
||||
Block2ETileMap,
|
||||
ComputePtrOffsetOfStridedBatch<I0>,
|
||||
GridwiseTensorRearrangeKernel>;
|
||||
|
||||
float elapsed_time = launch_and_time_kernel(stream_config,
|
||||
@@ -237,7 +258,9 @@ struct DeviceImageToColumnImpl
|
||||
arg.p_in_,
|
||||
arg.out_grid_desc_m_k_,
|
||||
arg.p_out_,
|
||||
block_2_tile_map);
|
||||
arg.G_,
|
||||
block_2_tile_map,
|
||||
arg.compute_ptr_offset_of_batch_);
|
||||
return elapsed_time;
|
||||
}
|
||||
|
||||
@@ -250,9 +273,7 @@ struct DeviceImageToColumnImpl
|
||||
|
||||
bool IsSupportedArgument(const Argument& arg)
|
||||
{
|
||||
using namespace tensor_layout::convolution;
|
||||
if constexpr(!(std::is_same_v<ImageLayout, GNWC> || std::is_same_v<ImageLayout, GNHWC> ||
|
||||
std::is_same_v<ImageLayout, GNDHWC>))
|
||||
if constexpr(!(is_NSpatialGC || is_GNSpatialC))
|
||||
{
|
||||
return false;
|
||||
}
|
||||
@@ -295,13 +316,14 @@ struct DeviceImageToColumnImpl
|
||||
|
||||
static auto MakeArgument(const void* p_in, // input image
|
||||
void* p_out, // gemm form
|
||||
const ck::index_t G,
|
||||
const ck::index_t N,
|
||||
const ck::index_t C,
|
||||
const std::array<index_t, NDimSpatial>& input_spatial_lengths,
|
||||
const std::array<index_t, NDimSpatial>& filter_spatial_lengths,
|
||||
const std::array<index_t, NDimSpatial>& output_spatial_lengths,
|
||||
const std::array<index_t, NDimSpatial + 3>& image_g_n_c_wis_strides,
|
||||
const std::array<index_t, 2>& gemm_m_k_strides,
|
||||
const std::array<index_t, 3>& gemm_g_m_k_strides,
|
||||
const std::array<index_t, NDimSpatial>& conv_filter_strides,
|
||||
const std::array<index_t, NDimSpatial>& conv_filter_dilations,
|
||||
const std::array<index_t, NDimSpatial>& input_left_pads,
|
||||
@@ -309,13 +331,14 @@ struct DeviceImageToColumnImpl
|
||||
{
|
||||
return Argument{static_cast<const InputDataType*>(p_in),
|
||||
static_cast<OutputDataType*>(p_out),
|
||||
G,
|
||||
N,
|
||||
C,
|
||||
input_spatial_lengths,
|
||||
filter_spatial_lengths,
|
||||
output_spatial_lengths,
|
||||
image_g_n_c_wis_strides,
|
||||
gemm_m_k_strides,
|
||||
gemm_g_m_k_strides,
|
||||
conv_filter_strides,
|
||||
conv_filter_dilations,
|
||||
input_left_pads,
|
||||
@@ -327,13 +350,14 @@ struct DeviceImageToColumnImpl
|
||||
std::unique_ptr<BaseArgument>
|
||||
MakeArgumentPointer(const void* p_in, // input image
|
||||
void* p_out, // gemm form
|
||||
const ck::index_t G,
|
||||
const ck::index_t N,
|
||||
const ck::index_t C,
|
||||
const std::array<index_t, NDimSpatial>& input_spatial_lengths,
|
||||
const std::array<index_t, NDimSpatial>& filter_spatial_lengths,
|
||||
const std::array<index_t, NDimSpatial>& output_spatial_lengths,
|
||||
const std::array<index_t, NDimSpatial + 3>& image_g_n_c_wis_strides,
|
||||
const std::array<index_t, 2>& gemm_m_k_strides,
|
||||
const std::array<index_t, 3>& gemm_g_m_k_strides,
|
||||
const std::array<index_t, NDimSpatial>& conv_filter_strides,
|
||||
const std::array<index_t, NDimSpatial>& conv_filter_dilations,
|
||||
const std::array<index_t, NDimSpatial>& input_left_pads,
|
||||
@@ -341,13 +365,14 @@ struct DeviceImageToColumnImpl
|
||||
{
|
||||
return std::make_unique<Argument>(static_cast<const InputDataType*>(p_in),
|
||||
static_cast<OutputDataType*>(p_out),
|
||||
G,
|
||||
N,
|
||||
C,
|
||||
input_spatial_lengths,
|
||||
filter_spatial_lengths,
|
||||
output_spatial_lengths,
|
||||
image_g_n_c_wis_strides,
|
||||
gemm_m_k_strides,
|
||||
gemm_g_m_k_strides,
|
||||
conv_filter_strides,
|
||||
conv_filter_dilations,
|
||||
input_left_pads,
|
||||
|
||||
@@ -21,6 +21,7 @@ template <typename InputGridDesc,
|
||||
typename OutputGridDesc,
|
||||
typename OutputDataType,
|
||||
typename Block2ETileMap,
|
||||
typename ComputePtrOffsetOfStridedBatch,
|
||||
typename GridwiseTensorRearrangeKernel>
|
||||
__global__ void
|
||||
#if CK_USE_LAUNCH_BOUNDS
|
||||
@@ -30,13 +31,20 @@ __global__ void
|
||||
const InputDataType* __restrict__ p_in_global,
|
||||
const OutputGridDesc out_grid_desc,
|
||||
OutputDataType* __restrict__ p_out_global,
|
||||
const Block2ETileMap block_2_tile_map)
|
||||
const index_t batch_count,
|
||||
const Block2ETileMap block_2_tile_map,
|
||||
const ComputePtrOffsetOfStridedBatch compute_ptr_offset_of_batch)
|
||||
{
|
||||
#if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx906__) || defined(__gfx908__) || \
|
||||
defined(__gfx90a__) || defined(__gfx940__) || defined(__gfx1030__) || defined(__gfx1100__) || \
|
||||
defined(__gfx1101__) || defined(__gfx1102__) || defined(__gfx941__) || defined(__gfx942__))
|
||||
GridwiseTensorRearrangeKernel::Run(
|
||||
in_grid_desc, p_in_global, out_grid_desc, p_out_global, block_2_tile_map);
|
||||
GridwiseTensorRearrangeKernel::Run(in_grid_desc,
|
||||
p_in_global,
|
||||
out_grid_desc,
|
||||
p_out_global,
|
||||
batch_count,
|
||||
block_2_tile_map,
|
||||
compute_ptr_offset_of_batch);
|
||||
#else
|
||||
ignore = in_grid_desc;
|
||||
ignore = p_in_global;
|
||||
@@ -56,7 +64,8 @@ template <typename InputGridDesc,
|
||||
typename ThreadClusterLengths,
|
||||
index_t ScalarPerVector,
|
||||
InMemoryDataOperationEnum DstInMemOp,
|
||||
typename Block2ETileMap>
|
||||
typename Block2ETileMap,
|
||||
typename ComputePtrOffsetOfStridedBatch>
|
||||
struct GridwiseTensorRearrange
|
||||
{
|
||||
|
||||
@@ -69,7 +78,9 @@ struct GridwiseTensorRearrange
|
||||
const InputDataType* __restrict__ p_in_global,
|
||||
const OutputGridDesc& out_grid_desc,
|
||||
OutputDataType* __restrict__ p_out_global,
|
||||
const Block2ETileMap& block_2_tile_map)
|
||||
const index_t batch_count,
|
||||
const Block2ETileMap& block_2_tile_map,
|
||||
const ComputePtrOffsetOfStridedBatch& compute_ptr_offset_of_batch)
|
||||
{
|
||||
const auto block_work_idx =
|
||||
block_2_tile_map.CalculateBottomIndex(make_multi_index(get_block_1d_id()));
|
||||
@@ -80,12 +91,6 @@ struct GridwiseTensorRearrange
|
||||
const index_t k_block_data_idx_on_grid =
|
||||
__builtin_amdgcn_readfirstlane(block_work_idx[I1] * KPerBlock);
|
||||
|
||||
// Global Memory
|
||||
const auto in_global_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
|
||||
p_in_global, in_grid_desc.GetElementSpaceSize());
|
||||
auto out_global_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
|
||||
p_out_global, out_grid_desc.GetElementSpaceSize());
|
||||
|
||||
auto copy_global_to_global =
|
||||
ThreadGroupTensorSliceTransfer_v7<ThisThreadBlock,
|
||||
Tuple<InputDataType>,
|
||||
@@ -108,6 +113,22 @@ struct GridwiseTensorRearrange
|
||||
make_tuple(make_multi_index(m_block_data_idx_on_grid, k_block_data_idx_on_grid)),
|
||||
tensor_operation::element_wise::PassThrough{}};
|
||||
|
||||
const index_t num_blocks_per_batch =
|
||||
__builtin_amdgcn_readfirstlane(get_grid_size() / batch_count);
|
||||
const index_t g_idx =
|
||||
__builtin_amdgcn_readfirstlane(get_block_1d_id() / num_blocks_per_batch);
|
||||
|
||||
// Global Memory
|
||||
const index_t a_batch_offset =
|
||||
__builtin_amdgcn_readfirstlane(compute_ptr_offset_of_batch.GetAPtrOffset(g_idx));
|
||||
const index_t c_batch_offset =
|
||||
__builtin_amdgcn_readfirstlane(compute_ptr_offset_of_batch.GetCPtrOffset(g_idx));
|
||||
|
||||
const auto in_global_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
|
||||
p_in_global + a_batch_offset, in_grid_desc.GetElementSpaceSize());
|
||||
auto out_global_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
|
||||
p_out_global + c_batch_offset, out_grid_desc.GetElementSpaceSize());
|
||||
|
||||
copy_global_to_global.Run(
|
||||
tie(in_grid_desc), tie(in_global_buf), tie(out_grid_desc), tie(out_global_buf));
|
||||
}
|
||||
|
||||
@@ -19,9 +19,7 @@ namespace host {
|
||||
* \brief Reference implementation for column to image.
|
||||
*
|
||||
* Input tensor descriptor has [N * Do * Ho * Wo, Z * Y * X * C] data layout.
|
||||
* Memory layout is the same.
|
||||
* Output tensor descriptor has [G, N, C, Di, Hi, Wi] data layout.
|
||||
* G must be equal to 1. Memory layout is [G, N, Di, Hi, Wi, C].
|
||||
*
|
||||
* \tparam NDimSpatial Number of spatial dimensions.
|
||||
* \tparam ImageLayout Image Layout.
|
||||
@@ -95,18 +93,19 @@ struct ReferenceColumnToImage : public device::BaseOperator
|
||||
float Run(const Argument& arg)
|
||||
{
|
||||
if(!(arg.output_.GetNumOfDimension() == NDimSpatial + 3 &&
|
||||
arg.input_.GetNumOfDimension() == 2))
|
||||
arg.input_.GetNumOfDimension() == 3))
|
||||
{
|
||||
throw std::runtime_error("wrong! inconsistent dimension");
|
||||
}
|
||||
|
||||
const index_t G = arg.output_.GetLengths()[0];
|
||||
const index_t N = arg.output_.GetLengths()[1];
|
||||
const index_t C = arg.output_.GetLengths()[2];
|
||||
|
||||
if constexpr(NDimSpatial == 1)
|
||||
{
|
||||
const index_t Wo = arg.output_spatial_lengths_[0];
|
||||
auto func = [&](auto n) {
|
||||
auto func = [&](auto g, auto n) {
|
||||
for(index_t wo = 0; wo < Wo; ++wo)
|
||||
{
|
||||
index_t row = n * Wo + wo;
|
||||
@@ -123,9 +122,10 @@ struct ReferenceColumnToImage : public device::BaseOperator
|
||||
if(wi >= 0 &&
|
||||
ck::type_convert<std::size_t>(wi) < arg.output_.GetLengths()[3])
|
||||
{
|
||||
float v_in = ck::type_convert<float>(arg.input_(row, column));
|
||||
float v_out = ck::type_convert<float>(arg.output_(0, n, c, wi));
|
||||
arg.output_(0, n, c, wi) =
|
||||
float v_in =
|
||||
ck::type_convert<float>(arg.input_(g, row, column));
|
||||
float v_out = ck::type_convert<float>(arg.output_(g, n, c, wi));
|
||||
arg.output_(g, n, c, wi) =
|
||||
ck::type_convert<OutDataType>(v_in + v_out);
|
||||
}
|
||||
column++;
|
||||
@@ -134,7 +134,7 @@ struct ReferenceColumnToImage : public device::BaseOperator
|
||||
}
|
||||
};
|
||||
|
||||
make_ParallelTensorFunctor(func, N)(std::thread::hardware_concurrency());
|
||||
make_ParallelTensorFunctor(func, G, N)(std::thread::hardware_concurrency());
|
||||
|
||||
return 0;
|
||||
}
|
||||
@@ -143,7 +143,7 @@ struct ReferenceColumnToImage : public device::BaseOperator
|
||||
const index_t Ho = arg.output_spatial_lengths_[0];
|
||||
const index_t Wo = arg.output_spatial_lengths_[1];
|
||||
|
||||
auto func = [&](auto n) {
|
||||
auto func = [&](auto g, auto n) {
|
||||
for(index_t ho = 0; ho < Ho; ++ho)
|
||||
{
|
||||
for(index_t wo = 0; wo < Wo; ++wo)
|
||||
@@ -176,10 +176,10 @@ struct ReferenceColumnToImage : public device::BaseOperator
|
||||
arg.output_.GetLengths()[4])
|
||||
{
|
||||
float v_in =
|
||||
ck::type_convert<float>(arg.input_(row, column));
|
||||
ck::type_convert<float>(arg.input_(g, row, column));
|
||||
float v_out = ck::type_convert<float>(
|
||||
arg.output_(0, n, c, hi, wi));
|
||||
arg.output_(0, n, c, hi, wi) =
|
||||
arg.output_(g, n, c, hi, wi));
|
||||
arg.output_(g, n, c, hi, wi) =
|
||||
ck::type_convert<OutDataType>(v_in + v_out);
|
||||
}
|
||||
column++;
|
||||
@@ -190,7 +190,7 @@ struct ReferenceColumnToImage : public device::BaseOperator
|
||||
}
|
||||
};
|
||||
|
||||
make_ParallelTensorFunctor(func, N)(std::thread::hardware_concurrency());
|
||||
make_ParallelTensorFunctor(func, G, N)(std::thread::hardware_concurrency());
|
||||
|
||||
return 0;
|
||||
}
|
||||
@@ -200,7 +200,7 @@ struct ReferenceColumnToImage : public device::BaseOperator
|
||||
const index_t Ho = arg.output_spatial_lengths_[1];
|
||||
const index_t Wo = arg.output_spatial_lengths_[2];
|
||||
|
||||
auto func = [&](auto n) {
|
||||
auto func = [&](auto g, auto n) {
|
||||
for(index_t d_o = 0; d_o < Do; ++d_o)
|
||||
{
|
||||
for(index_t ho = 0; ho < Ho; ++ho)
|
||||
@@ -245,10 +245,10 @@ struct ReferenceColumnToImage : public device::BaseOperator
|
||||
arg.output_.GetLengths()[5])
|
||||
{
|
||||
float v_in = ck::type_convert<float>(
|
||||
arg.input_(row, column));
|
||||
arg.input_(g, row, column));
|
||||
float v_out = ck::type_convert<float>(
|
||||
arg.output_(0, n, c, di, hi, wi));
|
||||
arg.output_(0, n, c, di, hi, wi) =
|
||||
arg.output_(g, n, c, di, hi, wi));
|
||||
arg.output_(g, n, c, di, hi, wi) =
|
||||
ck::type_convert<OutDataType>(v_in + v_out);
|
||||
}
|
||||
column++;
|
||||
@@ -261,7 +261,7 @@ struct ReferenceColumnToImage : public device::BaseOperator
|
||||
}
|
||||
};
|
||||
|
||||
make_ParallelTensorFunctor(func, N)(std::thread::hardware_concurrency());
|
||||
make_ParallelTensorFunctor(func, G, N)(std::thread::hardware_concurrency());
|
||||
|
||||
return 0;
|
||||
}
|
||||
@@ -303,8 +303,9 @@ struct ReferenceColumnToImage : public device::BaseOperator
|
||||
C * ck::accumulate_n<index_t>(
|
||||
arg.filter_spatial_lengths_.begin(), NDimSpatial, 1, std::multiplies<>());
|
||||
|
||||
if(!(arg.input_.GetLengths()[0] == static_cast<std::size_t>(NDoHoWo) &&
|
||||
arg.input_.GetLengths()[1] == static_cast<std::size_t>(CZYX)))
|
||||
if(!(arg.input_.GetLengths()[0] == static_cast<std::size_t>(G) &&
|
||||
arg.input_.GetLengths()[1] == static_cast<std::size_t>(NDoHoWo) &&
|
||||
arg.input_.GetLengths()[2] == static_cast<std::size_t>(CZYX)))
|
||||
{
|
||||
return false;
|
||||
}
|
||||
|
||||
@@ -19,9 +19,7 @@ namespace host {
|
||||
* \brief Reference implementation for image to column.
|
||||
*
|
||||
* Input tensor descriptor has [G, N, C, Di, Hi, Wi] data layout.
|
||||
* G must be equal to 1. Memory layout is [G, N, Di, Hi, Wi, C].
|
||||
* Output tensor descriptor has [N * Do * Ho * Wo, Z * Y * X * C] data layout.
|
||||
* Memory layout is the same.
|
||||
* Output tensor descriptor has [G * N * Do * Ho * Wo, Z * Y * X * C] data layout.
|
||||
*
|
||||
* \tparam NDimSpatial Number of spatial dimensions.
|
||||
* \tparam ImageLayout Image Layout.
|
||||
@@ -95,18 +93,19 @@ struct ReferenceImageToColumn : public device::BaseOperator
|
||||
float Run(const Argument& arg)
|
||||
{
|
||||
if(!(arg.input_.GetNumOfDimension() == NDimSpatial + 3 &&
|
||||
arg.output_.GetNumOfDimension() == 2))
|
||||
arg.output_.GetNumOfDimension() == 3))
|
||||
{
|
||||
throw std::runtime_error("wrong! inconsistent dimension");
|
||||
}
|
||||
|
||||
const index_t G = arg.input_.GetLengths()[0];
|
||||
const index_t N = arg.input_.GetLengths()[1];
|
||||
const index_t C = arg.input_.GetLengths()[2];
|
||||
|
||||
if constexpr(NDimSpatial == 1)
|
||||
{
|
||||
const index_t Wo = arg.output_spatial_lengths_[0];
|
||||
auto func = [&](auto n, auto wo) {
|
||||
auto func = [&](auto g, auto n, auto wo) {
|
||||
index_t row = n * Wo + wo;
|
||||
index_t column = 0;
|
||||
|
||||
@@ -121,15 +120,15 @@ struct ReferenceImageToColumn : public device::BaseOperator
|
||||
if(wi >= 0 &&
|
||||
ck::type_convert<std::size_t>(wi) < arg.input_.GetLengths()[3])
|
||||
{
|
||||
InDataType v_in = arg.input_(0, n, c, wi);
|
||||
arg.output_(row, column) = ck::type_convert<OutDataType>(v_in);
|
||||
InDataType v_in = arg.input_(g, n, c, wi);
|
||||
arg.output_(g, row, column) = ck::type_convert<OutDataType>(v_in);
|
||||
}
|
||||
column++;
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
make_ParallelTensorFunctor(func, N, Wo)(std::thread::hardware_concurrency());
|
||||
make_ParallelTensorFunctor(func, G, N, Wo)(std::thread::hardware_concurrency());
|
||||
|
||||
return 0;
|
||||
}
|
||||
@@ -138,7 +137,7 @@ struct ReferenceImageToColumn : public device::BaseOperator
|
||||
const index_t Ho = arg.output_spatial_lengths_[0];
|
||||
const index_t Wo = arg.output_spatial_lengths_[1];
|
||||
|
||||
auto func = [&](auto n, auto ho, auto wo) {
|
||||
auto func = [&](auto g, auto n, auto ho, auto wo) {
|
||||
index_t row = n * Ho * Wo + ho * Wo + wo;
|
||||
index_t column = 0;
|
||||
|
||||
@@ -162,8 +161,9 @@ struct ReferenceImageToColumn : public device::BaseOperator
|
||||
wi >= 0 &&
|
||||
ck::type_convert<std::size_t>(wi) < arg.input_.GetLengths()[4])
|
||||
{
|
||||
InDataType v_in = arg.input_(0, n, c, hi, wi);
|
||||
arg.output_(row, column) = ck::type_convert<OutDataType>(v_in);
|
||||
InDataType v_in = arg.input_(g, n, c, hi, wi);
|
||||
arg.output_(g, row, column) =
|
||||
ck::type_convert<OutDataType>(v_in);
|
||||
}
|
||||
column++;
|
||||
}
|
||||
@@ -171,7 +171,7 @@ struct ReferenceImageToColumn : public device::BaseOperator
|
||||
}
|
||||
};
|
||||
|
||||
make_ParallelTensorFunctor(func, N, Ho, Wo)(std::thread::hardware_concurrency());
|
||||
make_ParallelTensorFunctor(func, G, N, Ho, Wo)(std::thread::hardware_concurrency());
|
||||
|
||||
return 0;
|
||||
}
|
||||
@@ -181,7 +181,7 @@ struct ReferenceImageToColumn : public device::BaseOperator
|
||||
const index_t Ho = arg.output_spatial_lengths_[1];
|
||||
const index_t Wo = arg.output_spatial_lengths_[2];
|
||||
|
||||
auto func = [&](auto n, auto d_o, auto ho, auto wo) {
|
||||
auto func = [&](auto g, auto n, auto d_o, auto ho, auto wo) {
|
||||
index_t row = n * Do * Ho * Wo + d_o * Ho * Wo + ho * Wo + wo;
|
||||
index_t column = 0;
|
||||
|
||||
@@ -213,8 +213,8 @@ struct ReferenceImageToColumn : public device::BaseOperator
|
||||
ck::type_convert<std::size_t>(wi) <
|
||||
arg.input_.GetLengths()[5])
|
||||
{
|
||||
InDataType v_in = arg.input_(0, n, c, di, hi, wi);
|
||||
arg.output_(row, column) =
|
||||
InDataType v_in = arg.input_(g, n, c, di, hi, wi);
|
||||
arg.output_(g, row, column) =
|
||||
ck::type_convert<OutDataType>(v_in);
|
||||
}
|
||||
column++;
|
||||
@@ -224,7 +224,7 @@ struct ReferenceImageToColumn : public device::BaseOperator
|
||||
}
|
||||
};
|
||||
|
||||
make_ParallelTensorFunctor(func, N, Do, Ho, Wo)(
|
||||
make_ParallelTensorFunctor(func, G, N, Do, Ho, Wo)(
|
||||
std::thread::hardware_concurrency());
|
||||
|
||||
return 0;
|
||||
@@ -267,8 +267,9 @@ struct ReferenceImageToColumn : public device::BaseOperator
|
||||
C * ck::accumulate_n<index_t>(
|
||||
arg.filter_spatial_lengths_.begin(), NDimSpatial, 1, std::multiplies<>());
|
||||
|
||||
if(!(arg.output_.GetLengths()[0] == static_cast<std::size_t>(NDoHoWo) &&
|
||||
arg.output_.GetLengths()[1] == static_cast<std::size_t>(CZYX)))
|
||||
if(!(arg.output_.GetLengths()[0] == static_cast<std::size_t>(G) &&
|
||||
arg.output_.GetLengths()[1] == static_cast<std::size_t>(NDoHoWo) &&
|
||||
arg.output_.GetLengths()[2] == static_cast<std::size_t>(CZYX)))
|
||||
{
|
||||
return false;
|
||||
}
|
||||
|
||||
@@ -19,109 +19,214 @@ namespace instance {
|
||||
|
||||
using namespace ck::conv_tensor_rearrange_op;
|
||||
|
||||
// GNWC/GNHWC/GNDHWC
|
||||
// Image to Column
|
||||
// nhwc, 1d
|
||||
void add_device_image_to_column_nwc_1d_bf16_instances(
|
||||
// GNWC, 1d
|
||||
void add_device_image_to_column_gnwc_1d_bf16_instances(
|
||||
std::vector<std::unique_ptr<DeviceConvTensorRearrange<1, GNWC, BF16, BF16, ImageToColumn>>>&
|
||||
instances);
|
||||
|
||||
void add_device_image_to_column_nwc_1d_f16_instances(
|
||||
void add_device_image_to_column_gnwc_1d_f16_instances(
|
||||
std::vector<std::unique_ptr<DeviceConvTensorRearrange<1, GNWC, F16, F16, ImageToColumn>>>&
|
||||
instances);
|
||||
|
||||
void add_device_image_to_column_nwc_1d_f32_instances(
|
||||
void add_device_image_to_column_gnwc_1d_f32_instances(
|
||||
std::vector<std::unique_ptr<DeviceConvTensorRearrange<1, GNWC, F32, F32, ImageToColumn>>>&
|
||||
instances);
|
||||
|
||||
void add_device_image_to_column_nwc_1d_i8_instances(
|
||||
void add_device_image_to_column_gnwc_1d_i8_instances(
|
||||
std::vector<std::unique_ptr<DeviceConvTensorRearrange<1, GNWC, int8_t, int8_t, ImageToColumn>>>&
|
||||
instances);
|
||||
// nhwc, 2d
|
||||
void add_device_image_to_column_nhwc_2d_bf16_instances(
|
||||
// GNHWC, 2d
|
||||
void add_device_image_to_column_gnhwc_2d_bf16_instances(
|
||||
std::vector<std::unique_ptr<DeviceConvTensorRearrange<2, GNHWC, BF16, BF16, ImageToColumn>>>&
|
||||
instances);
|
||||
|
||||
void add_device_image_to_column_nhwc_2d_f16_instances(
|
||||
void add_device_image_to_column_gnhwc_2d_f16_instances(
|
||||
std::vector<std::unique_ptr<DeviceConvTensorRearrange<2, GNHWC, F16, F16, ImageToColumn>>>&
|
||||
instances);
|
||||
|
||||
void add_device_image_to_column_nhwc_2d_f32_instances(
|
||||
void add_device_image_to_column_gnhwc_2d_f32_instances(
|
||||
std::vector<std::unique_ptr<DeviceConvTensorRearrange<2, GNHWC, F32, F32, ImageToColumn>>>&
|
||||
instances);
|
||||
|
||||
void add_device_image_to_column_nhwc_2d_i8_instances(
|
||||
void add_device_image_to_column_gnhwc_2d_i8_instances(
|
||||
std::vector<
|
||||
std::unique_ptr<DeviceConvTensorRearrange<2, GNHWC, int8_t, int8_t, ImageToColumn>>>&
|
||||
instances);
|
||||
// nhwc, 3d
|
||||
void add_device_image_to_column_ndhwc_3d_bf16_instances(
|
||||
// GNDHWC, 3d
|
||||
void add_device_image_to_column_gndhwc_3d_bf16_instances(
|
||||
std::vector<std::unique_ptr<DeviceConvTensorRearrange<3, GNDHWC, BF16, BF16, ImageToColumn>>>&
|
||||
instances);
|
||||
|
||||
void add_device_image_to_column_ndhwc_3d_f16_instances(
|
||||
void add_device_image_to_column_gndhwc_3d_f16_instances(
|
||||
std::vector<std::unique_ptr<DeviceConvTensorRearrange<3, GNDHWC, F16, F16, ImageToColumn>>>&
|
||||
instances);
|
||||
|
||||
void add_device_image_to_column_ndhwc_3d_f32_instances(
|
||||
void add_device_image_to_column_gndhwc_3d_f32_instances(
|
||||
std::vector<std::unique_ptr<DeviceConvTensorRearrange<3, GNDHWC, F32, F32, ImageToColumn>>>&
|
||||
instances);
|
||||
|
||||
void add_device_image_to_column_ndhwc_3d_i8_instances(
|
||||
void add_device_image_to_column_gndhwc_3d_i8_instances(
|
||||
std::vector<
|
||||
std::unique_ptr<DeviceConvTensorRearrange<3, GNDHWC, int8_t, int8_t, ImageToColumn>>>&
|
||||
instances);
|
||||
|
||||
// Column to Image
|
||||
// nhwc, 1d
|
||||
void add_device_column_to_image_nwc_1d_bf16_instances(
|
||||
// GNWC, 1d
|
||||
void add_device_column_to_image_gnwc_1d_bf16_instances(
|
||||
std::vector<std::unique_ptr<DeviceConvTensorRearrange<1, GNWC, BF16, BF16, ColumnToImage>>>&
|
||||
instances);
|
||||
|
||||
void add_device_column_to_image_nwc_1d_f16_instances(
|
||||
void add_device_column_to_image_gnwc_1d_f16_instances(
|
||||
std::vector<std::unique_ptr<DeviceConvTensorRearrange<1, GNWC, F16, F16, ColumnToImage>>>&
|
||||
instances);
|
||||
|
||||
void add_device_column_to_image_nwc_1d_f32_instances(
|
||||
void add_device_column_to_image_gnwc_1d_f32_instances(
|
||||
std::vector<std::unique_ptr<DeviceConvTensorRearrange<1, GNWC, F32, F32, ColumnToImage>>>&
|
||||
instances);
|
||||
|
||||
void add_device_column_to_image_nwc_1d_i8_instances(
|
||||
void add_device_column_to_image_gnwc_1d_i8_instances(
|
||||
std::vector<std::unique_ptr<DeviceConvTensorRearrange<1, GNWC, int8_t, int8_t, ColumnToImage>>>&
|
||||
instances);
|
||||
// nhwc, 2d
|
||||
void add_device_column_to_image_nhwc_2d_bf16_instances(
|
||||
// GNHWC, 2d
|
||||
void add_device_column_to_image_gnhwc_2d_bf16_instances(
|
||||
std::vector<std::unique_ptr<DeviceConvTensorRearrange<2, GNHWC, BF16, BF16, ColumnToImage>>>&
|
||||
instances);
|
||||
|
||||
void add_device_column_to_image_nhwc_2d_f16_instances(
|
||||
void add_device_column_to_image_gnhwc_2d_f16_instances(
|
||||
std::vector<std::unique_ptr<DeviceConvTensorRearrange<2, GNHWC, F16, F16, ColumnToImage>>>&
|
||||
instances);
|
||||
|
||||
void add_device_column_to_image_nhwc_2d_f32_instances(
|
||||
void add_device_column_to_image_gnhwc_2d_f32_instances(
|
||||
std::vector<std::unique_ptr<DeviceConvTensorRearrange<2, GNHWC, F32, F32, ColumnToImage>>>&
|
||||
instances);
|
||||
|
||||
void add_device_column_to_image_nhwc_2d_i8_instances(
|
||||
void add_device_column_to_image_gnhwc_2d_i8_instances(
|
||||
std::vector<
|
||||
std::unique_ptr<DeviceConvTensorRearrange<2, GNHWC, int8_t, int8_t, ColumnToImage>>>&
|
||||
instances);
|
||||
// nhwc, 3d
|
||||
void add_device_column_to_image_ndhwc_3d_bf16_instances(
|
||||
// GNDHWC, 3d
|
||||
void add_device_column_to_image_gndhwc_3d_bf16_instances(
|
||||
std::vector<std::unique_ptr<DeviceConvTensorRearrange<3, GNDHWC, BF16, BF16, ColumnToImage>>>&
|
||||
instances);
|
||||
|
||||
void add_device_column_to_image_ndhwc_3d_f16_instances(
|
||||
void add_device_column_to_image_gndhwc_3d_f16_instances(
|
||||
std::vector<std::unique_ptr<DeviceConvTensorRearrange<3, GNDHWC, F16, F16, ColumnToImage>>>&
|
||||
instances);
|
||||
|
||||
void add_device_column_to_image_ndhwc_3d_f32_instances(
|
||||
void add_device_column_to_image_gndhwc_3d_f32_instances(
|
||||
std::vector<std::unique_ptr<DeviceConvTensorRearrange<3, GNDHWC, F32, F32, ColumnToImage>>>&
|
||||
instances);
|
||||
|
||||
void add_device_column_to_image_ndhwc_3d_i8_instances(
|
||||
void add_device_column_to_image_gndhwc_3d_i8_instances(
|
||||
std::vector<
|
||||
std::unique_ptr<DeviceConvTensorRearrange<3, GNDHWC, int8_t, int8_t, ColumnToImage>>>&
|
||||
instances);
|
||||
// NWGC/NHWGC/NDHWGC
|
||||
// Image to Column
|
||||
// NWGC, 1d
|
||||
void add_device_image_to_column_nwgc_1d_bf16_instances(
|
||||
std::vector<std::unique_ptr<DeviceConvTensorRearrange<1, NWGC, BF16, BF16, ImageToColumn>>>&
|
||||
instances);
|
||||
|
||||
void add_device_image_to_column_nwgc_1d_f16_instances(
|
||||
std::vector<std::unique_ptr<DeviceConvTensorRearrange<1, NWGC, F16, F16, ImageToColumn>>>&
|
||||
instances);
|
||||
|
||||
void add_device_image_to_column_nwgc_1d_f32_instances(
|
||||
std::vector<std::unique_ptr<DeviceConvTensorRearrange<1, NWGC, F32, F32, ImageToColumn>>>&
|
||||
instances);
|
||||
|
||||
void add_device_image_to_column_nwgc_1d_i8_instances(
|
||||
std::vector<std::unique_ptr<DeviceConvTensorRearrange<1, NWGC, int8_t, int8_t, ImageToColumn>>>&
|
||||
instances);
|
||||
// NHWGC, 2d
|
||||
void add_device_image_to_column_nhwgc_2d_bf16_instances(
|
||||
std::vector<std::unique_ptr<DeviceConvTensorRearrange<2, NHWGC, BF16, BF16, ImageToColumn>>>&
|
||||
instances);
|
||||
|
||||
void add_device_image_to_column_nhwgc_2d_f16_instances(
|
||||
std::vector<std::unique_ptr<DeviceConvTensorRearrange<2, NHWGC, F16, F16, ImageToColumn>>>&
|
||||
instances);
|
||||
|
||||
void add_device_image_to_column_nhwgc_2d_f32_instances(
|
||||
std::vector<std::unique_ptr<DeviceConvTensorRearrange<2, NHWGC, F32, F32, ImageToColumn>>>&
|
||||
instances);
|
||||
|
||||
void add_device_image_to_column_nhwgc_2d_i8_instances(
|
||||
std::vector<
|
||||
std::unique_ptr<DeviceConvTensorRearrange<2, NHWGC, int8_t, int8_t, ImageToColumn>>>&
|
||||
instances);
|
||||
// NDHWGC, 3d
|
||||
void add_device_image_to_column_ndhwgc_3d_bf16_instances(
|
||||
std::vector<std::unique_ptr<DeviceConvTensorRearrange<3, NDHWGC, BF16, BF16, ImageToColumn>>>&
|
||||
instances);
|
||||
|
||||
void add_device_image_to_column_ndhwgc_3d_f16_instances(
|
||||
std::vector<std::unique_ptr<DeviceConvTensorRearrange<3, NDHWGC, F16, F16, ImageToColumn>>>&
|
||||
instances);
|
||||
|
||||
void add_device_image_to_column_ndhwgc_3d_f32_instances(
|
||||
std::vector<std::unique_ptr<DeviceConvTensorRearrange<3, NDHWGC, F32, F32, ImageToColumn>>>&
|
||||
instances);
|
||||
|
||||
void add_device_image_to_column_ndhwgc_3d_i8_instances(
|
||||
std::vector<
|
||||
std::unique_ptr<DeviceConvTensorRearrange<3, NDHWGC, int8_t, int8_t, ImageToColumn>>>&
|
||||
instances);
|
||||
|
||||
// Column to Image
|
||||
// NWGC, 1d
|
||||
void add_device_column_to_image_nwgc_1d_bf16_instances(
|
||||
std::vector<std::unique_ptr<DeviceConvTensorRearrange<1, NWGC, BF16, BF16, ColumnToImage>>>&
|
||||
instances);
|
||||
|
||||
void add_device_column_to_image_nwgc_1d_f16_instances(
|
||||
std::vector<std::unique_ptr<DeviceConvTensorRearrange<1, NWGC, F16, F16, ColumnToImage>>>&
|
||||
instances);
|
||||
|
||||
void add_device_column_to_image_nwgc_1d_f32_instances(
|
||||
std::vector<std::unique_ptr<DeviceConvTensorRearrange<1, NWGC, F32, F32, ColumnToImage>>>&
|
||||
instances);
|
||||
|
||||
void add_device_column_to_image_nwgc_1d_i8_instances(
|
||||
std::vector<std::unique_ptr<DeviceConvTensorRearrange<1, NWGC, int8_t, int8_t, ColumnToImage>>>&
|
||||
instances);
|
||||
// NHWGC, 2d
|
||||
void add_device_column_to_image_nhwgc_2d_bf16_instances(
|
||||
std::vector<std::unique_ptr<DeviceConvTensorRearrange<2, NHWGC, BF16, BF16, ColumnToImage>>>&
|
||||
instances);
|
||||
|
||||
void add_device_column_to_image_nhwgc_2d_f16_instances(
|
||||
std::vector<std::unique_ptr<DeviceConvTensorRearrange<2, NHWGC, F16, F16, ColumnToImage>>>&
|
||||
instances);
|
||||
|
||||
void add_device_column_to_image_nhwgc_2d_f32_instances(
|
||||
std::vector<std::unique_ptr<DeviceConvTensorRearrange<2, NHWGC, F32, F32, ColumnToImage>>>&
|
||||
instances);
|
||||
|
||||
void add_device_column_to_image_nhwgc_2d_i8_instances(
|
||||
std::vector<
|
||||
std::unique_ptr<DeviceConvTensorRearrange<2, NHWGC, int8_t, int8_t, ColumnToImage>>>&
|
||||
instances);
|
||||
// NDHWGC, 3d
|
||||
void add_device_column_to_image_ndhwgc_3d_bf16_instances(
|
||||
std::vector<std::unique_ptr<DeviceConvTensorRearrange<3, NDHWGC, BF16, BF16, ColumnToImage>>>&
|
||||
instances);
|
||||
|
||||
void add_device_column_to_image_ndhwgc_3d_f16_instances(
|
||||
std::vector<std::unique_ptr<DeviceConvTensorRearrange<3, NDHWGC, F16, F16, ColumnToImage>>>&
|
||||
instances);
|
||||
|
||||
void add_device_column_to_image_ndhwgc_3d_f32_instances(
|
||||
std::vector<std::unique_ptr<DeviceConvTensorRearrange<3, NDHWGC, F32, F32, ColumnToImage>>>&
|
||||
instances);
|
||||
|
||||
void add_device_column_to_image_ndhwgc_3d_i8_instances(
|
||||
std::vector<
|
||||
std::unique_ptr<DeviceConvTensorRearrange<3, NDHWGC, int8_t, int8_t, ColumnToImage>>>&
|
||||
instances);
|
||||
|
||||
template <ck::index_t NumDimSpatial,
|
||||
typename ImageLayout,
|
||||
@@ -151,60 +256,120 @@ struct DeviceOperationInstanceFactory<
|
||||
{
|
||||
if constexpr(is_same_v<InDataType, float> && is_same_v<OutDataType, float>)
|
||||
{
|
||||
add_device_image_to_column_nwc_1d_f32_instances(op_ptrs);
|
||||
add_device_image_to_column_gnwc_1d_f32_instances(op_ptrs);
|
||||
}
|
||||
else if constexpr(is_same_v<InDataType, half_t> && is_same_v<OutDataType, half_t>)
|
||||
{
|
||||
add_device_image_to_column_nwc_1d_f16_instances(op_ptrs);
|
||||
add_device_image_to_column_gnwc_1d_f16_instances(op_ptrs);
|
||||
}
|
||||
else if constexpr(is_same_v<InDataType, ck::bhalf_t> &&
|
||||
is_same_v<OutDataType, ck::bhalf_t>)
|
||||
{
|
||||
add_device_image_to_column_nwc_1d_bf16_instances(op_ptrs);
|
||||
add_device_image_to_column_gnwc_1d_bf16_instances(op_ptrs);
|
||||
}
|
||||
else if constexpr(is_same_v<InDataType, int8_t> && is_same_v<OutDataType, int8_t>)
|
||||
{
|
||||
add_device_image_to_column_nwc_1d_i8_instances(op_ptrs);
|
||||
add_device_image_to_column_gnwc_1d_i8_instances(op_ptrs);
|
||||
}
|
||||
}
|
||||
else if constexpr(NumDimSpatial == 2 && is_same_v<ImageLayout, GNHWC>)
|
||||
{
|
||||
if constexpr(is_same_v<InDataType, float> && is_same_v<OutDataType, float>)
|
||||
{
|
||||
add_device_image_to_column_nhwc_2d_f32_instances(op_ptrs);
|
||||
add_device_image_to_column_gnhwc_2d_f32_instances(op_ptrs);
|
||||
}
|
||||
else if constexpr(is_same_v<InDataType, half_t> && is_same_v<OutDataType, half_t>)
|
||||
{
|
||||
add_device_image_to_column_nhwc_2d_f16_instances(op_ptrs);
|
||||
add_device_image_to_column_gnhwc_2d_f16_instances(op_ptrs);
|
||||
}
|
||||
else if constexpr(is_same_v<InDataType, ck::bhalf_t> &&
|
||||
is_same_v<OutDataType, ck::bhalf_t>)
|
||||
{
|
||||
add_device_image_to_column_nhwc_2d_bf16_instances(op_ptrs);
|
||||
add_device_image_to_column_gnhwc_2d_bf16_instances(op_ptrs);
|
||||
}
|
||||
else if constexpr(is_same_v<InDataType, int8_t> && is_same_v<OutDataType, int8_t>)
|
||||
{
|
||||
add_device_image_to_column_nhwc_2d_i8_instances(op_ptrs);
|
||||
add_device_image_to_column_gnhwc_2d_i8_instances(op_ptrs);
|
||||
}
|
||||
}
|
||||
else if constexpr(NumDimSpatial == 3 && is_same_v<ImageLayout, GNDHWC>)
|
||||
{
|
||||
if constexpr(is_same_v<InDataType, float> && is_same_v<OutDataType, float>)
|
||||
{
|
||||
add_device_image_to_column_ndhwc_3d_f32_instances(op_ptrs);
|
||||
add_device_image_to_column_gndhwc_3d_f32_instances(op_ptrs);
|
||||
}
|
||||
else if constexpr(is_same_v<InDataType, half_t> && is_same_v<OutDataType, half_t>)
|
||||
{
|
||||
add_device_image_to_column_ndhwc_3d_f16_instances(op_ptrs);
|
||||
add_device_image_to_column_gndhwc_3d_f16_instances(op_ptrs);
|
||||
}
|
||||
else if constexpr(is_same_v<InDataType, ck::bhalf_t> &&
|
||||
is_same_v<OutDataType, ck::bhalf_t>)
|
||||
{
|
||||
add_device_image_to_column_ndhwc_3d_bf16_instances(op_ptrs);
|
||||
add_device_image_to_column_gndhwc_3d_bf16_instances(op_ptrs);
|
||||
}
|
||||
else if constexpr(is_same_v<InDataType, int8_t> && is_same_v<OutDataType, int8_t>)
|
||||
{
|
||||
add_device_image_to_column_ndhwc_3d_i8_instances(op_ptrs);
|
||||
add_device_image_to_column_gndhwc_3d_i8_instances(op_ptrs);
|
||||
}
|
||||
}
|
||||
else if constexpr(NumDimSpatial == 1 && is_same_v<ImageLayout, NWGC>)
|
||||
{
|
||||
if constexpr(is_same_v<InDataType, float> && is_same_v<OutDataType, float>)
|
||||
{
|
||||
add_device_image_to_column_nwgc_1d_f32_instances(op_ptrs);
|
||||
}
|
||||
else if constexpr(is_same_v<InDataType, half_t> && is_same_v<OutDataType, half_t>)
|
||||
{
|
||||
add_device_image_to_column_nwgc_1d_f16_instances(op_ptrs);
|
||||
}
|
||||
else if constexpr(is_same_v<InDataType, ck::bhalf_t> &&
|
||||
is_same_v<OutDataType, ck::bhalf_t>)
|
||||
{
|
||||
add_device_image_to_column_nwgc_1d_bf16_instances(op_ptrs);
|
||||
}
|
||||
else if constexpr(is_same_v<InDataType, int8_t> && is_same_v<OutDataType, int8_t>)
|
||||
{
|
||||
add_device_image_to_column_nwgc_1d_i8_instances(op_ptrs);
|
||||
}
|
||||
}
|
||||
else if constexpr(NumDimSpatial == 2 && is_same_v<ImageLayout, NHWGC>)
|
||||
{
|
||||
if constexpr(is_same_v<InDataType, float> && is_same_v<OutDataType, float>)
|
||||
{
|
||||
add_device_image_to_column_nhwgc_2d_f32_instances(op_ptrs);
|
||||
}
|
||||
else if constexpr(is_same_v<InDataType, half_t> && is_same_v<OutDataType, half_t>)
|
||||
{
|
||||
add_device_image_to_column_nhwgc_2d_f16_instances(op_ptrs);
|
||||
}
|
||||
else if constexpr(is_same_v<InDataType, ck::bhalf_t> &&
|
||||
is_same_v<OutDataType, ck::bhalf_t>)
|
||||
{
|
||||
add_device_image_to_column_nhwgc_2d_bf16_instances(op_ptrs);
|
||||
}
|
||||
else if constexpr(is_same_v<InDataType, int8_t> && is_same_v<OutDataType, int8_t>)
|
||||
{
|
||||
add_device_image_to_column_nhwgc_2d_i8_instances(op_ptrs);
|
||||
}
|
||||
}
|
||||
else if constexpr(NumDimSpatial == 3 && is_same_v<ImageLayout, NDHWGC>)
|
||||
{
|
||||
if constexpr(is_same_v<InDataType, float> && is_same_v<OutDataType, float>)
|
||||
{
|
||||
add_device_image_to_column_ndhwgc_3d_f32_instances(op_ptrs);
|
||||
}
|
||||
else if constexpr(is_same_v<InDataType, half_t> && is_same_v<OutDataType, half_t>)
|
||||
{
|
||||
add_device_image_to_column_ndhwgc_3d_f16_instances(op_ptrs);
|
||||
}
|
||||
else if constexpr(is_same_v<InDataType, ck::bhalf_t> &&
|
||||
is_same_v<OutDataType, ck::bhalf_t>)
|
||||
{
|
||||
add_device_image_to_column_ndhwgc_3d_bf16_instances(op_ptrs);
|
||||
}
|
||||
else if constexpr(is_same_v<InDataType, int8_t> && is_same_v<OutDataType, int8_t>)
|
||||
{
|
||||
add_device_image_to_column_ndhwgc_3d_i8_instances(op_ptrs);
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -214,60 +379,120 @@ struct DeviceOperationInstanceFactory<
|
||||
{
|
||||
if constexpr(is_same_v<InDataType, float> && is_same_v<OutDataType, float>)
|
||||
{
|
||||
add_device_column_to_image_nwc_1d_f32_instances(op_ptrs);
|
||||
add_device_column_to_image_gnwc_1d_f32_instances(op_ptrs);
|
||||
}
|
||||
else if constexpr(is_same_v<InDataType, half_t> && is_same_v<OutDataType, half_t>)
|
||||
{
|
||||
add_device_column_to_image_nwc_1d_f16_instances(op_ptrs);
|
||||
add_device_column_to_image_gnwc_1d_f16_instances(op_ptrs);
|
||||
}
|
||||
else if constexpr(is_same_v<InDataType, ck::bhalf_t> &&
|
||||
is_same_v<OutDataType, ck::bhalf_t>)
|
||||
{
|
||||
add_device_column_to_image_nwc_1d_bf16_instances(op_ptrs);
|
||||
add_device_column_to_image_gnwc_1d_bf16_instances(op_ptrs);
|
||||
}
|
||||
else if constexpr(is_same_v<InDataType, int8_t> && is_same_v<OutDataType, int8_t>)
|
||||
{
|
||||
add_device_column_to_image_nwc_1d_i8_instances(op_ptrs);
|
||||
add_device_column_to_image_gnwc_1d_i8_instances(op_ptrs);
|
||||
}
|
||||
}
|
||||
else if constexpr(NumDimSpatial == 2 && is_same_v<ImageLayout, GNHWC>)
|
||||
{
|
||||
if constexpr(is_same_v<InDataType, float> && is_same_v<OutDataType, float>)
|
||||
{
|
||||
add_device_column_to_image_nhwc_2d_f32_instances(op_ptrs);
|
||||
add_device_column_to_image_gnhwc_2d_f32_instances(op_ptrs);
|
||||
}
|
||||
else if constexpr(is_same_v<InDataType, half_t> && is_same_v<OutDataType, half_t>)
|
||||
{
|
||||
add_device_column_to_image_nhwc_2d_f16_instances(op_ptrs);
|
||||
add_device_column_to_image_gnhwc_2d_f16_instances(op_ptrs);
|
||||
}
|
||||
else if constexpr(is_same_v<InDataType, ck::bhalf_t> &&
|
||||
is_same_v<OutDataType, ck::bhalf_t>)
|
||||
{
|
||||
add_device_column_to_image_nhwc_2d_bf16_instances(op_ptrs);
|
||||
add_device_column_to_image_gnhwc_2d_bf16_instances(op_ptrs);
|
||||
}
|
||||
else if constexpr(is_same_v<InDataType, int8_t> && is_same_v<OutDataType, int8_t>)
|
||||
{
|
||||
add_device_column_to_image_nhwc_2d_i8_instances(op_ptrs);
|
||||
add_device_column_to_image_gnhwc_2d_i8_instances(op_ptrs);
|
||||
}
|
||||
}
|
||||
else if constexpr(NumDimSpatial == 3 && is_same_v<ImageLayout, GNDHWC>)
|
||||
{
|
||||
if constexpr(is_same_v<InDataType, float> && is_same_v<OutDataType, float>)
|
||||
{
|
||||
add_device_column_to_image_ndhwc_3d_f32_instances(op_ptrs);
|
||||
add_device_column_to_image_gndhwc_3d_f32_instances(op_ptrs);
|
||||
}
|
||||
else if constexpr(is_same_v<InDataType, half_t> && is_same_v<OutDataType, half_t>)
|
||||
{
|
||||
add_device_column_to_image_ndhwc_3d_f16_instances(op_ptrs);
|
||||
add_device_column_to_image_gndhwc_3d_f16_instances(op_ptrs);
|
||||
}
|
||||
else if constexpr(is_same_v<InDataType, ck::bhalf_t> &&
|
||||
is_same_v<OutDataType, ck::bhalf_t>)
|
||||
{
|
||||
add_device_column_to_image_ndhwc_3d_bf16_instances(op_ptrs);
|
||||
add_device_column_to_image_gndhwc_3d_bf16_instances(op_ptrs);
|
||||
}
|
||||
else if constexpr(is_same_v<InDataType, int8_t> && is_same_v<OutDataType, int8_t>)
|
||||
{
|
||||
add_device_column_to_image_ndhwc_3d_i8_instances(op_ptrs);
|
||||
add_device_column_to_image_gndhwc_3d_i8_instances(op_ptrs);
|
||||
}
|
||||
}
|
||||
else if constexpr(NumDimSpatial == 1 && is_same_v<ImageLayout, NWGC>)
|
||||
{
|
||||
if constexpr(is_same_v<InDataType, float> && is_same_v<OutDataType, float>)
|
||||
{
|
||||
add_device_column_to_image_nwgc_1d_f32_instances(op_ptrs);
|
||||
}
|
||||
else if constexpr(is_same_v<InDataType, half_t> && is_same_v<OutDataType, half_t>)
|
||||
{
|
||||
add_device_column_to_image_nwgc_1d_f16_instances(op_ptrs);
|
||||
}
|
||||
else if constexpr(is_same_v<InDataType, ck::bhalf_t> &&
|
||||
is_same_v<OutDataType, ck::bhalf_t>)
|
||||
{
|
||||
add_device_column_to_image_nwgc_1d_bf16_instances(op_ptrs);
|
||||
}
|
||||
else if constexpr(is_same_v<InDataType, int8_t> && is_same_v<OutDataType, int8_t>)
|
||||
{
|
||||
add_device_column_to_image_nwgc_1d_i8_instances(op_ptrs);
|
||||
}
|
||||
}
|
||||
else if constexpr(NumDimSpatial == 2 && is_same_v<ImageLayout, NHWGC>)
|
||||
{
|
||||
if constexpr(is_same_v<InDataType, float> && is_same_v<OutDataType, float>)
|
||||
{
|
||||
add_device_column_to_image_nhwgc_2d_f32_instances(op_ptrs);
|
||||
}
|
||||
else if constexpr(is_same_v<InDataType, half_t> && is_same_v<OutDataType, half_t>)
|
||||
{
|
||||
add_device_column_to_image_nhwgc_2d_f16_instances(op_ptrs);
|
||||
}
|
||||
else if constexpr(is_same_v<InDataType, ck::bhalf_t> &&
|
||||
is_same_v<OutDataType, ck::bhalf_t>)
|
||||
{
|
||||
add_device_column_to_image_nhwgc_2d_bf16_instances(op_ptrs);
|
||||
}
|
||||
else if constexpr(is_same_v<InDataType, int8_t> && is_same_v<OutDataType, int8_t>)
|
||||
{
|
||||
add_device_column_to_image_nhwgc_2d_i8_instances(op_ptrs);
|
||||
}
|
||||
}
|
||||
else if constexpr(NumDimSpatial == 3 && is_same_v<ImageLayout, NDHWGC>)
|
||||
{
|
||||
if constexpr(is_same_v<InDataType, float> && is_same_v<OutDataType, float>)
|
||||
{
|
||||
add_device_column_to_image_ndhwgc_3d_f32_instances(op_ptrs);
|
||||
}
|
||||
else if constexpr(is_same_v<InDataType, half_t> && is_same_v<OutDataType, half_t>)
|
||||
{
|
||||
add_device_column_to_image_ndhwgc_3d_f16_instances(op_ptrs);
|
||||
}
|
||||
else if constexpr(is_same_v<InDataType, ck::bhalf_t> &&
|
||||
is_same_v<OutDataType, ck::bhalf_t>)
|
||||
{
|
||||
add_device_column_to_image_ndhwgc_3d_bf16_instances(op_ptrs);
|
||||
}
|
||||
else if constexpr(is_same_v<InDataType, int8_t> && is_same_v<OutDataType, int8_t>)
|
||||
{
|
||||
add_device_column_to_image_ndhwgc_3d_i8_instances(op_ptrs);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1,5 +1,8 @@
|
||||
add_instance_library(device_column_to_image_instance
|
||||
device_column_to_image_nhwc_1d_instance.cpp
|
||||
device_column_to_image_nhwc_2d_instance.cpp
|
||||
device_column_to_image_nhwc_3d_instance.cpp
|
||||
device_column_to_image_gnwc_1d_instance.cpp
|
||||
device_column_to_image_gnhwc_2d_instance.cpp
|
||||
device_column_to_image_gndhwc_3d_instance.cpp
|
||||
device_column_to_image_nwgc_1d_instance.cpp
|
||||
device_column_to_image_nhwgc_2d_instance.cpp
|
||||
device_column_to_image_ndhwgc_3d_instance.cpp
|
||||
)
|
||||
|
||||
@@ -11,7 +11,7 @@ namespace instance {
|
||||
|
||||
using namespace ck::conv_tensor_rearrange_op;
|
||||
|
||||
void add_device_column_to_image_ndhwc_3d_bf16_instances(
|
||||
void add_device_column_to_image_gndhwc_3d_bf16_instances(
|
||||
std::vector<std::unique_ptr<DeviceConvTensorRearrange<3, GNDHWC, BF16, BF16, ColumnToImage>>>&
|
||||
instances)
|
||||
{
|
||||
@@ -22,7 +22,7 @@ void add_device_column_to_image_ndhwc_3d_bf16_instances(
|
||||
#endif
|
||||
}
|
||||
|
||||
void add_device_column_to_image_ndhwc_3d_f16_instances(
|
||||
void add_device_column_to_image_gndhwc_3d_f16_instances(
|
||||
std::vector<std::unique_ptr<DeviceConvTensorRearrange<3, GNDHWC, F16, F16, ColumnToImage>>>&
|
||||
instances)
|
||||
{
|
||||
@@ -33,7 +33,7 @@ void add_device_column_to_image_ndhwc_3d_f16_instances(
|
||||
#endif
|
||||
}
|
||||
|
||||
void add_device_column_to_image_ndhwc_3d_f32_instances(
|
||||
void add_device_column_to_image_gndhwc_3d_f32_instances(
|
||||
std::vector<std::unique_ptr<DeviceConvTensorRearrange<3, GNDHWC, F32, F32, ColumnToImage>>>&
|
||||
instances)
|
||||
{
|
||||
@@ -44,7 +44,7 @@ void add_device_column_to_image_ndhwc_3d_f32_instances(
|
||||
#endif
|
||||
}
|
||||
|
||||
void add_device_column_to_image_ndhwc_3d_i8_instances(
|
||||
void add_device_column_to_image_gndhwc_3d_i8_instances(
|
||||
std::vector<
|
||||
std::unique_ptr<DeviceConvTensorRearrange<3, GNDHWC, int8_t, int8_t, ColumnToImage>>>&
|
||||
instances)
|
||||
@@ -11,7 +11,7 @@ namespace instance {
|
||||
|
||||
using namespace ck::conv_tensor_rearrange_op;
|
||||
|
||||
void add_device_column_to_image_nhwc_2d_bf16_instances(
|
||||
void add_device_column_to_image_gnhwc_2d_bf16_instances(
|
||||
std::vector<std::unique_ptr<DeviceConvTensorRearrange<2, GNHWC, BF16, BF16, ColumnToImage>>>&
|
||||
instances)
|
||||
{
|
||||
@@ -22,7 +22,7 @@ void add_device_column_to_image_nhwc_2d_bf16_instances(
|
||||
#endif
|
||||
}
|
||||
|
||||
void add_device_column_to_image_nhwc_2d_f16_instances(
|
||||
void add_device_column_to_image_gnhwc_2d_f16_instances(
|
||||
std::vector<std::unique_ptr<DeviceConvTensorRearrange<2, GNHWC, F16, F16, ColumnToImage>>>&
|
||||
instances)
|
||||
{
|
||||
@@ -33,7 +33,7 @@ void add_device_column_to_image_nhwc_2d_f16_instances(
|
||||
#endif
|
||||
}
|
||||
|
||||
void add_device_column_to_image_nhwc_2d_f32_instances(
|
||||
void add_device_column_to_image_gnhwc_2d_f32_instances(
|
||||
std::vector<std::unique_ptr<DeviceConvTensorRearrange<2, GNHWC, F32, F32, ColumnToImage>>>&
|
||||
instances)
|
||||
{
|
||||
@@ -44,7 +44,7 @@ void add_device_column_to_image_nhwc_2d_f32_instances(
|
||||
#endif
|
||||
}
|
||||
|
||||
void add_device_column_to_image_nhwc_2d_i8_instances(
|
||||
void add_device_column_to_image_gnhwc_2d_i8_instances(
|
||||
std::vector<
|
||||
std::unique_ptr<DeviceConvTensorRearrange<2, GNHWC, int8_t, int8_t, ColumnToImage>>>&
|
||||
instances)
|
||||
@@ -11,7 +11,7 @@ namespace instance {
|
||||
|
||||
using namespace ck::conv_tensor_rearrange_op;
|
||||
|
||||
void add_device_column_to_image_nwc_1d_bf16_instances(
|
||||
void add_device_column_to_image_gnwc_1d_bf16_instances(
|
||||
std::vector<std::unique_ptr<DeviceConvTensorRearrange<1, GNWC, BF16, BF16, ColumnToImage>>>&
|
||||
instances)
|
||||
{
|
||||
@@ -22,7 +22,7 @@ void add_device_column_to_image_nwc_1d_bf16_instances(
|
||||
#endif
|
||||
}
|
||||
|
||||
void add_device_column_to_image_nwc_1d_f16_instances(
|
||||
void add_device_column_to_image_gnwc_1d_f16_instances(
|
||||
std::vector<std::unique_ptr<DeviceConvTensorRearrange<1, GNWC, F16, F16, ColumnToImage>>>&
|
||||
instances)
|
||||
{
|
||||
@@ -33,7 +33,7 @@ void add_device_column_to_image_nwc_1d_f16_instances(
|
||||
#endif
|
||||
}
|
||||
|
||||
void add_device_column_to_image_nwc_1d_f32_instances(
|
||||
void add_device_column_to_image_gnwc_1d_f32_instances(
|
||||
std::vector<std::unique_ptr<DeviceConvTensorRearrange<1, GNWC, F32, F32, ColumnToImage>>>&
|
||||
instances)
|
||||
{
|
||||
@@ -44,7 +44,7 @@ void add_device_column_to_image_nwc_1d_f32_instances(
|
||||
#endif
|
||||
}
|
||||
|
||||
void add_device_column_to_image_nwc_1d_i8_instances(
|
||||
void add_device_column_to_image_gnwc_1d_i8_instances(
|
||||
std::vector<std::unique_ptr<DeviceConvTensorRearrange<1, GNWC, int8_t, int8_t, ColumnToImage>>>&
|
||||
instances)
|
||||
{
|
||||
@@ -0,0 +1,62 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
#include "ck/library/tensor_operation_instance/gpu/conv_tensor_rearrange/device_column_to_image_instance.hpp"
|
||||
#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
|
||||
|
||||
namespace ck {
|
||||
namespace tensor_operation {
|
||||
namespace device {
|
||||
namespace instance {
|
||||
|
||||
using namespace ck::conv_tensor_rearrange_op;
|
||||
|
||||
void add_device_column_to_image_ndhwgc_3d_bf16_instances(
|
||||
std::vector<std::unique_ptr<DeviceConvTensorRearrange<3, NDHWGC, BF16, BF16, ColumnToImage>>>&
|
||||
instances)
|
||||
{
|
||||
#ifdef CK_ENABLE_BF16
|
||||
add_device_operation_instances(instances, device_column_to_image_bf16_instances<3, NDHWGC>{});
|
||||
#else
|
||||
ignore = instances;
|
||||
#endif
|
||||
}
|
||||
|
||||
void add_device_column_to_image_ndhwgc_3d_f16_instances(
|
||||
std::vector<std::unique_ptr<DeviceConvTensorRearrange<3, NDHWGC, F16, F16, ColumnToImage>>>&
|
||||
instances)
|
||||
{
|
||||
#ifdef CK_ENABLE_FP16
|
||||
add_device_operation_instances(instances, device_column_to_image_f16_instances<3, NDHWGC>{});
|
||||
#else
|
||||
ignore = instances;
|
||||
#endif
|
||||
}
|
||||
|
||||
void add_device_column_to_image_ndhwgc_3d_f32_instances(
|
||||
std::vector<std::unique_ptr<DeviceConvTensorRearrange<3, NDHWGC, F32, F32, ColumnToImage>>>&
|
||||
instances)
|
||||
{
|
||||
#ifdef CK_ENABLE_FP32
|
||||
add_device_operation_instances(instances, device_column_to_image_f32_instances<3, NDHWGC>{});
|
||||
#else
|
||||
ignore = instances;
|
||||
#endif
|
||||
}
|
||||
|
||||
void add_device_column_to_image_ndhwgc_3d_i8_instances(
|
||||
std::vector<
|
||||
std::unique_ptr<DeviceConvTensorRearrange<3, NDHWGC, int8_t, int8_t, ColumnToImage>>>&
|
||||
instances)
|
||||
{
|
||||
#ifdef CK_ENABLE_INT8
|
||||
add_device_operation_instances(instances, device_column_to_image_i8_instances<3, NDHWGC>{});
|
||||
#else
|
||||
ignore = instances;
|
||||
#endif
|
||||
}
|
||||
|
||||
} // namespace instance
|
||||
} // namespace device
|
||||
} // namespace tensor_operation
|
||||
} // namespace ck
|
||||
@@ -0,0 +1,62 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
#include "ck/library/tensor_operation_instance/gpu/conv_tensor_rearrange/device_column_to_image_instance.hpp"
|
||||
#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
|
||||
|
||||
namespace ck {
|
||||
namespace tensor_operation {
|
||||
namespace device {
|
||||
namespace instance {
|
||||
|
||||
using namespace ck::conv_tensor_rearrange_op;
|
||||
|
||||
void add_device_column_to_image_nhwgc_2d_bf16_instances(
|
||||
std::vector<std::unique_ptr<DeviceConvTensorRearrange<2, NHWGC, BF16, BF16, ColumnToImage>>>&
|
||||
instances)
|
||||
{
|
||||
#ifdef CK_ENABLE_BF16
|
||||
add_device_operation_instances(instances, device_column_to_image_bf16_instances<2, NHWGC>{});
|
||||
#else
|
||||
ignore = instances;
|
||||
#endif
|
||||
}
|
||||
|
||||
void add_device_column_to_image_nhwgc_2d_f16_instances(
|
||||
std::vector<std::unique_ptr<DeviceConvTensorRearrange<2, NHWGC, F16, F16, ColumnToImage>>>&
|
||||
instances)
|
||||
{
|
||||
#ifdef CK_ENABLE_FP16
|
||||
add_device_operation_instances(instances, device_column_to_image_f16_instances<2, NHWGC>{});
|
||||
#else
|
||||
ignore = instances;
|
||||
#endif
|
||||
}
|
||||
|
||||
void add_device_column_to_image_nhwgc_2d_f32_instances(
|
||||
std::vector<std::unique_ptr<DeviceConvTensorRearrange<2, NHWGC, F32, F32, ColumnToImage>>>&
|
||||
instances)
|
||||
{
|
||||
#ifdef CK_ENABLE_FP32
|
||||
add_device_operation_instances(instances, device_column_to_image_f32_instances<2, NHWGC>{});
|
||||
#else
|
||||
ignore = instances;
|
||||
#endif
|
||||
}
|
||||
|
||||
void add_device_column_to_image_nhwgc_2d_i8_instances(
|
||||
std::vector<
|
||||
std::unique_ptr<DeviceConvTensorRearrange<2, NHWGC, int8_t, int8_t, ColumnToImage>>>&
|
||||
instances)
|
||||
{
|
||||
#ifdef CK_ENABLE_INT8
|
||||
add_device_operation_instances(instances, device_column_to_image_i8_instances<2, NHWGC>{});
|
||||
#else
|
||||
ignore = instances;
|
||||
#endif
|
||||
}
|
||||
|
||||
} // namespace instance
|
||||
} // namespace device
|
||||
} // namespace tensor_operation
|
||||
} // namespace ck
|
||||
@@ -0,0 +1,61 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
#include "ck/library/tensor_operation_instance/gpu/conv_tensor_rearrange/device_column_to_image_instance.hpp"
|
||||
#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
|
||||
|
||||
namespace ck {
|
||||
namespace tensor_operation {
|
||||
namespace device {
|
||||
namespace instance {
|
||||
|
||||
using namespace ck::conv_tensor_rearrange_op;
|
||||
|
||||
void add_device_column_to_image_nwgc_1d_bf16_instances(
|
||||
std::vector<std::unique_ptr<DeviceConvTensorRearrange<1, NWGC, BF16, BF16, ColumnToImage>>>&
|
||||
instances)
|
||||
{
|
||||
#ifdef CK_ENABLE_BF16
|
||||
add_device_operation_instances(instances, device_column_to_image_bf16_instances<1, NWGC>{});
|
||||
#else
|
||||
ignore = instances;
|
||||
#endif
|
||||
}
|
||||
|
||||
void add_device_column_to_image_nwgc_1d_f16_instances(
|
||||
std::vector<std::unique_ptr<DeviceConvTensorRearrange<1, NWGC, F16, F16, ColumnToImage>>>&
|
||||
instances)
|
||||
{
|
||||
#ifdef CK_ENABLE_FP16
|
||||
add_device_operation_instances(instances, device_column_to_image_f16_instances<1, NWGC>{});
|
||||
#else
|
||||
ignore = instances;
|
||||
#endif
|
||||
}
|
||||
|
||||
void add_device_column_to_image_nwgc_1d_f32_instances(
|
||||
std::vector<std::unique_ptr<DeviceConvTensorRearrange<1, NWGC, F32, F32, ColumnToImage>>>&
|
||||
instances)
|
||||
{
|
||||
#ifdef CK_ENABLE_FP32
|
||||
add_device_operation_instances(instances, device_column_to_image_f32_instances<1, NWGC>{});
|
||||
#else
|
||||
ignore = instances;
|
||||
#endif
|
||||
}
|
||||
|
||||
void add_device_column_to_image_nwgc_1d_i8_instances(
|
||||
std::vector<std::unique_ptr<DeviceConvTensorRearrange<1, NWGC, int8_t, int8_t, ColumnToImage>>>&
|
||||
instances)
|
||||
{
|
||||
#ifdef CK_ENABLE_INT8
|
||||
add_device_operation_instances(instances, device_column_to_image_i8_instances<1, NWGC>{});
|
||||
#else
|
||||
ignore = instances;
|
||||
#endif
|
||||
}
|
||||
|
||||
} // namespace instance
|
||||
} // namespace device
|
||||
} // namespace tensor_operation
|
||||
} // namespace ck
|
||||
@@ -1,5 +1,8 @@
|
||||
add_instance_library(device_image_to_column_instance
|
||||
device_image_to_column_nhwc_1d_instance.cpp
|
||||
device_image_to_column_nhwc_2d_instance.cpp
|
||||
device_image_to_column_nhwc_3d_instance.cpp
|
||||
device_image_to_column_gnwc_1d_instance.cpp
|
||||
device_image_to_column_gnhwc_2d_instance.cpp
|
||||
device_image_to_column_gndhwc_3d_instance.cpp
|
||||
device_image_to_column_nwgc_1d_instance.cpp
|
||||
device_image_to_column_nhwgc_2d_instance.cpp
|
||||
device_image_to_column_ndhwgc_3d_instance.cpp
|
||||
)
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
|
||||
// Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
#include "ck/library/tensor_operation_instance/gpu/conv_tensor_rearrange/device_image_to_column_instance.hpp"
|
||||
#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
|
||||
@@ -11,7 +11,7 @@ namespace instance {
|
||||
|
||||
using namespace ck::conv_tensor_rearrange_op;
|
||||
|
||||
void add_device_image_to_column_ndhwc_3d_bf16_instances(
|
||||
void add_device_image_to_column_gndhwc_3d_bf16_instances(
|
||||
std::vector<std::unique_ptr<DeviceConvTensorRearrange<3, GNDHWC, BF16, BF16, ImageToColumn>>>&
|
||||
instances)
|
||||
{
|
||||
@@ -22,7 +22,7 @@ void add_device_image_to_column_ndhwc_3d_bf16_instances(
|
||||
#endif
|
||||
}
|
||||
|
||||
void add_device_image_to_column_ndhwc_3d_f16_instances(
|
||||
void add_device_image_to_column_gndhwc_3d_f16_instances(
|
||||
std::vector<std::unique_ptr<DeviceConvTensorRearrange<3, GNDHWC, F16, F16, ImageToColumn>>>&
|
||||
instances)
|
||||
{
|
||||
@@ -33,7 +33,7 @@ void add_device_image_to_column_ndhwc_3d_f16_instances(
|
||||
#endif
|
||||
}
|
||||
|
||||
void add_device_image_to_column_ndhwc_3d_f32_instances(
|
||||
void add_device_image_to_column_gndhwc_3d_f32_instances(
|
||||
std::vector<std::unique_ptr<DeviceConvTensorRearrange<3, GNDHWC, F32, F32, ImageToColumn>>>&
|
||||
instances)
|
||||
{
|
||||
@@ -44,7 +44,7 @@ void add_device_image_to_column_ndhwc_3d_f32_instances(
|
||||
#endif
|
||||
}
|
||||
|
||||
void add_device_image_to_column_ndhwc_3d_i8_instances(
|
||||
void add_device_image_to_column_gndhwc_3d_i8_instances(
|
||||
std::vector<
|
||||
std::unique_ptr<DeviceConvTensorRearrange<3, GNDHWC, int8_t, int8_t, ImageToColumn>>>&
|
||||
instances)
|
||||
@@ -1,5 +1,5 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
|
||||
// Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
#include "ck/library/tensor_operation_instance/gpu/conv_tensor_rearrange/device_image_to_column_instance.hpp"
|
||||
#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
|
||||
@@ -11,7 +11,7 @@ namespace instance {
|
||||
|
||||
using namespace ck::conv_tensor_rearrange_op;
|
||||
|
||||
void add_device_image_to_column_nhwc_2d_bf16_instances(
|
||||
void add_device_image_to_column_gnhwc_2d_bf16_instances(
|
||||
std::vector<std::unique_ptr<DeviceConvTensorRearrange<2, GNHWC, BF16, BF16, ImageToColumn>>>&
|
||||
instances)
|
||||
{
|
||||
@@ -22,7 +22,7 @@ void add_device_image_to_column_nhwc_2d_bf16_instances(
|
||||
#endif
|
||||
}
|
||||
|
||||
void add_device_image_to_column_nhwc_2d_f16_instances(
|
||||
void add_device_image_to_column_gnhwc_2d_f16_instances(
|
||||
std::vector<std::unique_ptr<DeviceConvTensorRearrange<2, GNHWC, F16, F16, ImageToColumn>>>&
|
||||
instances)
|
||||
{
|
||||
@@ -33,7 +33,7 @@ void add_device_image_to_column_nhwc_2d_f16_instances(
|
||||
#endif
|
||||
}
|
||||
|
||||
void add_device_image_to_column_nhwc_2d_f32_instances(
|
||||
void add_device_image_to_column_gnhwc_2d_f32_instances(
|
||||
std::vector<std::unique_ptr<DeviceConvTensorRearrange<2, GNHWC, F32, F32, ImageToColumn>>>&
|
||||
instances)
|
||||
{
|
||||
@@ -44,7 +44,7 @@ void add_device_image_to_column_nhwc_2d_f32_instances(
|
||||
#endif
|
||||
}
|
||||
|
||||
void add_device_image_to_column_nhwc_2d_i8_instances(
|
||||
void add_device_image_to_column_gnhwc_2d_i8_instances(
|
||||
std::vector<
|
||||
std::unique_ptr<DeviceConvTensorRearrange<2, GNHWC, int8_t, int8_t, ImageToColumn>>>&
|
||||
instances)
|
||||
@@ -1,5 +1,5 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
|
||||
// Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
#include "ck/library/tensor_operation_instance/gpu/conv_tensor_rearrange/device_image_to_column_instance.hpp"
|
||||
#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
|
||||
@@ -11,7 +11,7 @@ namespace instance {
|
||||
|
||||
using namespace ck::conv_tensor_rearrange_op;
|
||||
|
||||
void add_device_image_to_column_nwc_1d_bf16_instances(
|
||||
void add_device_image_to_column_gnwc_1d_bf16_instances(
|
||||
std::vector<std::unique_ptr<DeviceConvTensorRearrange<1, GNWC, BF16, BF16, ImageToColumn>>>&
|
||||
instances)
|
||||
{
|
||||
@@ -22,7 +22,7 @@ void add_device_image_to_column_nwc_1d_bf16_instances(
|
||||
#endif
|
||||
}
|
||||
|
||||
void add_device_image_to_column_nwc_1d_f16_instances(
|
||||
void add_device_image_to_column_gnwc_1d_f16_instances(
|
||||
std::vector<std::unique_ptr<DeviceConvTensorRearrange<1, GNWC, F16, F16, ImageToColumn>>>&
|
||||
instances)
|
||||
{
|
||||
@@ -33,7 +33,7 @@ void add_device_image_to_column_nwc_1d_f16_instances(
|
||||
#endif
|
||||
}
|
||||
|
||||
void add_device_image_to_column_nwc_1d_f32_instances(
|
||||
void add_device_image_to_column_gnwc_1d_f32_instances(
|
||||
std::vector<std::unique_ptr<DeviceConvTensorRearrange<1, GNWC, F32, F32, ImageToColumn>>>&
|
||||
instances)
|
||||
{
|
||||
@@ -44,7 +44,7 @@ void add_device_image_to_column_nwc_1d_f32_instances(
|
||||
#endif
|
||||
}
|
||||
|
||||
void add_device_image_to_column_nwc_1d_i8_instances(
|
||||
void add_device_image_to_column_gnwc_1d_i8_instances(
|
||||
std::vector<std::unique_ptr<DeviceConvTensorRearrange<1, GNWC, int8_t, int8_t, ImageToColumn>>>&
|
||||
instances)
|
||||
{
|
||||
@@ -0,0 +1,62 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
#include "ck/library/tensor_operation_instance/gpu/conv_tensor_rearrange/device_image_to_column_instance.hpp"
|
||||
#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
|
||||
|
||||
namespace ck {
|
||||
namespace tensor_operation {
|
||||
namespace device {
|
||||
namespace instance {
|
||||
|
||||
using namespace ck::conv_tensor_rearrange_op;
|
||||
|
||||
void add_device_image_to_column_ndhwgc_3d_bf16_instances(
|
||||
std::vector<std::unique_ptr<DeviceConvTensorRearrange<3, NDHWGC, BF16, BF16, ImageToColumn>>>&
|
||||
instances)
|
||||
{
|
||||
#ifdef CK_ENABLE_BF16
|
||||
add_device_operation_instances(instances, device_image_to_column_bf16_instances<3, NDHWGC>{});
|
||||
#else
|
||||
ignore = instances;
|
||||
#endif
|
||||
}
|
||||
|
||||
void add_device_image_to_column_ndhwgc_3d_f16_instances(
|
||||
std::vector<std::unique_ptr<DeviceConvTensorRearrange<3, NDHWGC, F16, F16, ImageToColumn>>>&
|
||||
instances)
|
||||
{
|
||||
#ifdef CK_ENABLE_FP16
|
||||
add_device_operation_instances(instances, device_image_to_column_f16_instances<3, NDHWGC>{});
|
||||
#else
|
||||
ignore = instances;
|
||||
#endif
|
||||
}
|
||||
|
||||
void add_device_image_to_column_ndhwgc_3d_f32_instances(
|
||||
std::vector<std::unique_ptr<DeviceConvTensorRearrange<3, NDHWGC, F32, F32, ImageToColumn>>>&
|
||||
instances)
|
||||
{
|
||||
#ifdef CK_ENABLE_FP32
|
||||
add_device_operation_instances(instances, device_image_to_column_f32_instances<3, NDHWGC>{});
|
||||
#else
|
||||
ignore = instances;
|
||||
#endif
|
||||
}
|
||||
|
||||
void add_device_image_to_column_ndhwgc_3d_i8_instances(
|
||||
std::vector<
|
||||
std::unique_ptr<DeviceConvTensorRearrange<3, NDHWGC, int8_t, int8_t, ImageToColumn>>>&
|
||||
instances)
|
||||
{
|
||||
#ifdef CK_ENABLE_INT8
|
||||
add_device_operation_instances(instances, device_image_to_column_i8_instances<3, NDHWGC>{});
|
||||
#else
|
||||
ignore = instances;
|
||||
#endif
|
||||
}
|
||||
|
||||
} // namespace instance
|
||||
} // namespace device
|
||||
} // namespace tensor_operation
|
||||
} // namespace ck
|
||||
@@ -0,0 +1,62 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
#include "ck/library/tensor_operation_instance/gpu/conv_tensor_rearrange/device_image_to_column_instance.hpp"
|
||||
#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
|
||||
|
||||
namespace ck {
|
||||
namespace tensor_operation {
|
||||
namespace device {
|
||||
namespace instance {
|
||||
|
||||
using namespace ck::conv_tensor_rearrange_op;
|
||||
|
||||
void add_device_image_to_column_nhwgc_2d_bf16_instances(
|
||||
std::vector<std::unique_ptr<DeviceConvTensorRearrange<2, NHWGC, BF16, BF16, ImageToColumn>>>&
|
||||
instances)
|
||||
{
|
||||
#ifdef CK_ENABLE_BF16
|
||||
add_device_operation_instances(instances, device_image_to_column_bf16_instances<2, NHWGC>{});
|
||||
#else
|
||||
ignore = instances;
|
||||
#endif
|
||||
}
|
||||
|
||||
void add_device_image_to_column_nhwgc_2d_f16_instances(
|
||||
std::vector<std::unique_ptr<DeviceConvTensorRearrange<2, NHWGC, F16, F16, ImageToColumn>>>&
|
||||
instances)
|
||||
{
|
||||
#ifdef CK_ENABLE_FP16
|
||||
add_device_operation_instances(instances, device_image_to_column_f16_instances<2, NHWGC>{});
|
||||
#else
|
||||
ignore = instances;
|
||||
#endif
|
||||
}
|
||||
|
||||
void add_device_image_to_column_nhwgc_2d_f32_instances(
|
||||
std::vector<std::unique_ptr<DeviceConvTensorRearrange<2, NHWGC, F32, F32, ImageToColumn>>>&
|
||||
instances)
|
||||
{
|
||||
#ifdef CK_ENABLE_FP32
|
||||
add_device_operation_instances(instances, device_image_to_column_f32_instances<2, NHWGC>{});
|
||||
#else
|
||||
ignore = instances;
|
||||
#endif
|
||||
}
|
||||
|
||||
void add_device_image_to_column_nhwgc_2d_i8_instances(
|
||||
std::vector<
|
||||
std::unique_ptr<DeviceConvTensorRearrange<2, NHWGC, int8_t, int8_t, ImageToColumn>>>&
|
||||
instances)
|
||||
{
|
||||
#ifdef CK_ENABLE_INT8
|
||||
add_device_operation_instances(instances, device_image_to_column_i8_instances<2, NHWGC>{});
|
||||
#else
|
||||
ignore = instances;
|
||||
#endif
|
||||
}
|
||||
|
||||
} // namespace instance
|
||||
} // namespace device
|
||||
} // namespace tensor_operation
|
||||
} // namespace ck
|
||||
@@ -0,0 +1,61 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
#include "ck/library/tensor_operation_instance/gpu/conv_tensor_rearrange/device_image_to_column_instance.hpp"
|
||||
#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
|
||||
|
||||
namespace ck {
|
||||
namespace tensor_operation {
|
||||
namespace device {
|
||||
namespace instance {
|
||||
|
||||
using namespace ck::conv_tensor_rearrange_op;
|
||||
|
||||
void add_device_image_to_column_nwgc_1d_bf16_instances(
|
||||
std::vector<std::unique_ptr<DeviceConvTensorRearrange<1, NWGC, BF16, BF16, ImageToColumn>>>&
|
||||
instances)
|
||||
{
|
||||
#ifdef CK_ENABLE_BF16
|
||||
add_device_operation_instances(instances, device_image_to_column_bf16_instances<1, NWGC>{});
|
||||
#else
|
||||
ignore = instances;
|
||||
#endif
|
||||
}
|
||||
|
||||
void add_device_image_to_column_nwgc_1d_f16_instances(
|
||||
std::vector<std::unique_ptr<DeviceConvTensorRearrange<1, NWGC, F16, F16, ImageToColumn>>>&
|
||||
instances)
|
||||
{
|
||||
#ifdef CK_ENABLE_FP16
|
||||
add_device_operation_instances(instances, device_image_to_column_f16_instances<1, NWGC>{});
|
||||
#else
|
||||
ignore = instances;
|
||||
#endif
|
||||
}
|
||||
|
||||
void add_device_image_to_column_nwgc_1d_f32_instances(
|
||||
std::vector<std::unique_ptr<DeviceConvTensorRearrange<1, NWGC, F32, F32, ImageToColumn>>>&
|
||||
instances)
|
||||
{
|
||||
#ifdef CK_ENABLE_FP32
|
||||
add_device_operation_instances(instances, device_image_to_column_f32_instances<1, NWGC>{});
|
||||
#else
|
||||
ignore = instances;
|
||||
#endif
|
||||
}
|
||||
|
||||
void add_device_image_to_column_nwgc_1d_i8_instances(
|
||||
std::vector<std::unique_ptr<DeviceConvTensorRearrange<1, NWGC, int8_t, int8_t, ImageToColumn>>>&
|
||||
instances)
|
||||
{
|
||||
#ifdef CK_ENABLE_INT8
|
||||
add_device_operation_instances(instances, device_image_to_column_i8_instances<1, NWGC>{});
|
||||
#else
|
||||
ignore = instances;
|
||||
#endif
|
||||
}
|
||||
|
||||
} // namespace instance
|
||||
} // namespace device
|
||||
} // namespace tensor_operation
|
||||
} // namespace ck
|
||||
@@ -194,7 +194,8 @@ Note: This kernel use atomic add, this will cause output buffer to be accumulate
|
||||
# 1: Input fp16, Weight fp16, Output fp16
|
||||
# 2: Input bf16, Weight bf16, Output bf16
|
||||
# 3: Input int8, Weight int8, Output int8)
|
||||
# arg3: tensor layout (0: Input[N, Hi, Wi, C], Output[N * Ho * Wo, Y * X * C])
|
||||
# arg3: tensor layout (0: Input[G, N, Hi, Wi, C], Output[G * N * Ho * Wo, Y * X * C],
|
||||
# 1: Input[N, Hi, Wi, G, C], Output[N * Ho * Wo * G, Y * X * C])
|
||||
# arg4: verification (0: no, 1: yes)
|
||||
# arg5: initialization (0: no init, 1: integer value, 2: decimal value)
|
||||
# arg6: print tensor value (0: no; 1: yes)
|
||||
|
||||
@@ -93,6 +93,26 @@ static auto make_ref_op()
|
||||
}
|
||||
}
|
||||
|
||||
template <typename InputLayout>
|
||||
static auto create_gemm_desc(const ck::index_t G, const ck::index_t NDoHoWo, const ck::index_t CZYX)
|
||||
{
|
||||
using namespace ck::tensor_layout::convolution;
|
||||
if constexpr(std::is_same_v<InputLayout, GNWC> || std::is_same_v<InputLayout, GNHWC> ||
|
||||
std::is_same_v<InputLayout, GNDHWC>)
|
||||
{
|
||||
return HostTensorDescriptor({G, NDoHoWo, CZYX});
|
||||
}
|
||||
else if constexpr(std::is_same_v<InputLayout, NWGC> || std::is_same_v<InputLayout, NHWGC> ||
|
||||
std::is_same_v<InputLayout, NDHWGC>)
|
||||
{
|
||||
return HostTensorDescriptor({G, NDoHoWo, CZYX}, {CZYX, CZYX * G, 1});
|
||||
}
|
||||
else
|
||||
{
|
||||
throw std::runtime_error("Unsupported layout!");
|
||||
}
|
||||
}
|
||||
|
||||
template <index_t NDimSpatial,
|
||||
typename InputLayout,
|
||||
typename InputDataType,
|
||||
@@ -116,13 +136,13 @@ bool profile_conv_tensor_rearrange_impl(int do_verification,
|
||||
const auto image_desc =
|
||||
ck::utils::conv::make_input_host_tensor_descriptor_g_n_c_wis_packed<InputLayout>(
|
||||
conv_param);
|
||||
const auto gemm_desc = HostTensorDescriptor({NDoHoWo, CZYX});
|
||||
const auto gemm_desc = create_gemm_desc<InputLayout>(conv_param.G_, NDoHoWo, CZYX);
|
||||
|
||||
std::array<ck::index_t, NDimSpatial> input_spatial_lengths{};
|
||||
std::array<ck::index_t, NDimSpatial> filter_spatial_lengths{};
|
||||
std::array<ck::index_t, NDimSpatial> output_spatial_lengths{};
|
||||
std::array<ck::index_t, NDimSpatial + 3> image_g_n_c_wis_strides{};
|
||||
std::array<ck::index_t, 2> gemm_m_k_strides{};
|
||||
std::array<ck::index_t, 3> gemm_g_m_k_strides{};
|
||||
std::array<ck::index_t, NDimSpatial> conv_filter_strides{};
|
||||
std::array<ck::index_t, NDimSpatial> conv_filter_dilations{};
|
||||
std::array<ck::index_t, NDimSpatial> input_left_pads{};
|
||||
@@ -134,7 +154,7 @@ bool profile_conv_tensor_rearrange_impl(int do_verification,
|
||||
copy(conv_param.filter_spatial_lengths_, filter_spatial_lengths);
|
||||
copy(conv_param.output_spatial_lengths_, output_spatial_lengths);
|
||||
copy(image_desc.GetStrides(), image_g_n_c_wis_strides);
|
||||
copy(gemm_desc.GetStrides(), gemm_m_k_strides);
|
||||
copy(gemm_desc.GetStrides(), gemm_g_m_k_strides);
|
||||
copy(conv_param.conv_filter_strides_, conv_filter_strides);
|
||||
copy(conv_param.conv_filter_dilations_, conv_filter_dilations);
|
||||
copy(conv_param.input_left_pads_, input_left_pads);
|
||||
@@ -212,13 +232,14 @@ bool profile_conv_tensor_rearrange_impl(int do_verification,
|
||||
auto argument_ptr = op_ptr->MakeArgumentPointer(
|
||||
static_cast<InputDataType*>(in_device_buf.GetDeviceBuffer()),
|
||||
static_cast<OutputDataType*>(out_device_buf.GetDeviceBuffer()),
|
||||
conv_param.G_,
|
||||
conv_param.N_,
|
||||
conv_param.C_,
|
||||
input_spatial_lengths,
|
||||
filter_spatial_lengths,
|
||||
output_spatial_lengths,
|
||||
image_g_n_c_wis_strides,
|
||||
gemm_m_k_strides,
|
||||
gemm_g_m_k_strides,
|
||||
conv_filter_strides,
|
||||
conv_filter_dilations,
|
||||
input_left_pads,
|
||||
@@ -234,7 +255,7 @@ bool profile_conv_tensor_rearrange_impl(int do_verification,
|
||||
float avg_time =
|
||||
invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, time_kernel});
|
||||
std::size_t num_btype =
|
||||
NDoHoWo * CZYX * (sizeof(OutputDataType) + sizeof(InputDataType));
|
||||
conv_param.G_ * NDoHoWo * CZYX * (sizeof(OutputDataType) + sizeof(InputDataType));
|
||||
float gb_per_sec = num_btype / 1.E6 / avg_time;
|
||||
std::cout << "Perf: " << std::setw(10) << avg_time << " ms, " << gb_per_sec << " GB/s, "
|
||||
<< op_name << std::endl;
|
||||
|
||||
@@ -19,7 +19,8 @@ enum struct RearrangeOp
|
||||
|
||||
enum struct ConvLayout
|
||||
{
|
||||
NHWC, // 0
|
||||
GNHWC, // 0
|
||||
NHWGC, // 1
|
||||
};
|
||||
|
||||
enum struct DataType
|
||||
@@ -42,7 +43,8 @@ static void print_helper_msg()
|
||||
<< " 1: Input fp16, Weight fp16, Output fp16\n"
|
||||
<< " 2: Input bf16, Weight bf16, Output bf16\n"
|
||||
<< " 3: Input int8, Weight int8, Output int8)\n"
|
||||
<< "arg3: tensor layout (0: Input[N, Hi, Wi, C], Output[N * Ho * Wo, Y * X * C])\n"
|
||||
<< "arg3: tensor layout (0: Input[G, N, Hi, Wi, C], Output[G * N * Ho * Wo, Y * X * C],\n"
|
||||
<< " 1: Input[N, Hi, Wi, G, C], Output[N * Ho * Wo * G, Y * X * C])\n"
|
||||
<< "arg4: verification (0: no, 1: yes)\n"
|
||||
<< "arg5: initialization (0: no init, 1: integer value, 2: decimal value)\n"
|
||||
<< "arg6: print tensor value (0: no; 1: yes)\n"
|
||||
@@ -114,11 +116,9 @@ int profile_conv_tensor_rearrange(int argc, char* argv[])
|
||||
return pass ? 0 : 1;
|
||||
};
|
||||
|
||||
// Image To Column
|
||||
if(rearrange_op == RearrangeOp::ImageToColumn)
|
||||
{
|
||||
// NHWC
|
||||
if(layout == ConvLayout::NHWC)
|
||||
if(layout == ConvLayout::GNHWC)
|
||||
{
|
||||
if(num_dim_spatial == 1)
|
||||
{
|
||||
@@ -178,11 +178,70 @@ int profile_conv_tensor_rearrange(int argc, char* argv[])
|
||||
}
|
||||
}
|
||||
}
|
||||
else if(layout == ConvLayout::NHWGC)
|
||||
{
|
||||
if(num_dim_spatial == 1)
|
||||
{
|
||||
if(data_type == DataType::F32_F32)
|
||||
{
|
||||
return profile(I1, NWGC{}, F32{}, F32{}, ImageToColumn{});
|
||||
}
|
||||
else if(data_type == DataType::F16_F16)
|
||||
{
|
||||
return profile(I1, NWGC{}, F16{}, F16{}, ImageToColumn{});
|
||||
}
|
||||
else if(data_type == DataType::BF16_BF16)
|
||||
{
|
||||
return profile(I1, NWGC{}, BF16{}, BF16{}, ImageToColumn{});
|
||||
}
|
||||
else if(data_type == DataType::INT8_INT8)
|
||||
{
|
||||
return profile(I1, NWGC{}, INT8{}, INT8{}, ImageToColumn{});
|
||||
}
|
||||
}
|
||||
else if(num_dim_spatial == 2)
|
||||
{
|
||||
if(data_type == DataType::F32_F32)
|
||||
{
|
||||
return profile(I2, NHWGC{}, F32{}, F32{}, ImageToColumn{});
|
||||
}
|
||||
else if(data_type == DataType::F16_F16)
|
||||
{
|
||||
return profile(I2, NHWGC{}, F16{}, F16{}, ImageToColumn{});
|
||||
}
|
||||
else if(data_type == DataType::BF16_BF16)
|
||||
{
|
||||
return profile(I2, NHWGC{}, BF16{}, BF16{}, ImageToColumn{});
|
||||
}
|
||||
else if(data_type == DataType::INT8_INT8)
|
||||
{
|
||||
return profile(I2, NHWGC{}, INT8{}, INT8{}, ImageToColumn{});
|
||||
}
|
||||
}
|
||||
else if(num_dim_spatial == 3)
|
||||
{
|
||||
if(data_type == DataType::F32_F32)
|
||||
{
|
||||
return profile(I3, NDHWGC{}, F32{}, F32{}, ImageToColumn{});
|
||||
}
|
||||
else if(data_type == DataType::F16_F16)
|
||||
{
|
||||
return profile(I3, NDHWGC{}, F16{}, F16{}, ImageToColumn{});
|
||||
}
|
||||
else if(data_type == DataType::BF16_BF16)
|
||||
{
|
||||
return profile(I3, NDHWGC{}, BF16{}, BF16{}, ImageToColumn{});
|
||||
}
|
||||
else if(data_type == DataType::INT8_INT8)
|
||||
{
|
||||
return profile(I3, NDHWGC{}, INT8{}, INT8{}, ImageToColumn{});
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
else if(rearrange_op == RearrangeOp::ColumnToImage)
|
||||
{
|
||||
// NHWC
|
||||
if(layout == ConvLayout::NHWC)
|
||||
if(layout == ConvLayout::GNHWC)
|
||||
{
|
||||
if(num_dim_spatial == 1)
|
||||
{
|
||||
@@ -242,6 +301,66 @@ int profile_conv_tensor_rearrange(int argc, char* argv[])
|
||||
}
|
||||
}
|
||||
}
|
||||
else if(layout == ConvLayout::NHWGC)
|
||||
{
|
||||
if(num_dim_spatial == 1)
|
||||
{
|
||||
if(data_type == DataType::F32_F32)
|
||||
{
|
||||
return profile(I1, NWGC{}, F32{}, F32{}, ColumnToImage{});
|
||||
}
|
||||
else if(data_type == DataType::F16_F16)
|
||||
{
|
||||
return profile(I1, NWGC{}, F16{}, F16{}, ColumnToImage{});
|
||||
}
|
||||
else if(data_type == DataType::BF16_BF16)
|
||||
{
|
||||
return profile(I1, NWGC{}, BF16{}, BF16{}, ColumnToImage{});
|
||||
}
|
||||
else if(data_type == DataType::INT8_INT8)
|
||||
{
|
||||
return profile(I1, NWGC{}, INT8{}, INT8{}, ColumnToImage{});
|
||||
}
|
||||
}
|
||||
else if(num_dim_spatial == 2)
|
||||
{
|
||||
if(data_type == DataType::F32_F32)
|
||||
{
|
||||
return profile(I2, NHWGC{}, F32{}, F32{}, ColumnToImage{});
|
||||
}
|
||||
else if(data_type == DataType::F16_F16)
|
||||
{
|
||||
return profile(I2, NHWGC{}, F16{}, F16{}, ColumnToImage{});
|
||||
}
|
||||
else if(data_type == DataType::BF16_BF16)
|
||||
{
|
||||
return profile(I2, NHWGC{}, BF16{}, BF16{}, ColumnToImage{});
|
||||
}
|
||||
else if(data_type == DataType::INT8_INT8)
|
||||
{
|
||||
return profile(I2, NHWGC{}, INT8{}, INT8{}, ColumnToImage{});
|
||||
}
|
||||
}
|
||||
else if(num_dim_spatial == 3)
|
||||
{
|
||||
if(data_type == DataType::F32_F32)
|
||||
{
|
||||
return profile(I3, NDHWGC{}, F32{}, F32{}, ColumnToImage{});
|
||||
}
|
||||
else if(data_type == DataType::F16_F16)
|
||||
{
|
||||
return profile(I3, NDHWGC{}, F16{}, F16{}, ColumnToImage{});
|
||||
}
|
||||
else if(data_type == DataType::BF16_BF16)
|
||||
{
|
||||
return profile(I3, NDHWGC{}, BF16{}, BF16{}, ColumnToImage{});
|
||||
}
|
||||
else if(data_type == DataType::INT8_INT8)
|
||||
{
|
||||
return profile(I3, NDHWGC{}, INT8{}, INT8{}, ColumnToImage{});
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
std::cout << "this data_type & layout is not implemented" << std::endl;
|
||||
|
||||
@@ -45,14 +45,20 @@ class TestConvTensorRearrange : public ::testing::Test
|
||||
using namespace ck::tensor_layout::convolution;
|
||||
using namespace ck::conv_tensor_rearrange_op;
|
||||
|
||||
using KernelTypes1d =
|
||||
::testing::Types<std::tuple<GNWC, ImageToColumn>, std::tuple<GNWC, ColumnToImage>>;
|
||||
using KernelTypes1d = ::testing::Types<std::tuple<GNWC, ImageToColumn>,
|
||||
std::tuple<GNWC, ColumnToImage>,
|
||||
std::tuple<NWGC, ImageToColumn>,
|
||||
std::tuple<NWGC, ColumnToImage>>;
|
||||
|
||||
using KernelTypes2d =
|
||||
::testing::Types<std::tuple<GNHWC, ImageToColumn>, std::tuple<GNHWC, ColumnToImage>>;
|
||||
using KernelTypes2d = ::testing::Types<std::tuple<GNHWC, ImageToColumn>,
|
||||
std::tuple<GNHWC, ColumnToImage>,
|
||||
std::tuple<NHWGC, ImageToColumn>,
|
||||
std::tuple<NHWGC, ColumnToImage>>;
|
||||
|
||||
using KernelTypes3d =
|
||||
::testing::Types<std::tuple<GNDHWC, ImageToColumn>, std::tuple<GNDHWC, ColumnToImage>>;
|
||||
using KernelTypes3d = ::testing::Types<std::tuple<GNDHWC, ImageToColumn>,
|
||||
std::tuple<GNDHWC, ColumnToImage>,
|
||||
std::tuple<NDHWGC, ImageToColumn>,
|
||||
std::tuple<NDHWGC, ColumnToImage>>;
|
||||
|
||||
template <typename Tuple>
|
||||
class TestConvTensorRearrange1d : public TestConvTensorRearrange<Tuple>
|
||||
@@ -77,16 +83,16 @@ TYPED_TEST(TestConvTensorRearrange1d, Test1D)
|
||||
{
|
||||
this->conv_params.clear();
|
||||
|
||||
this->conv_params.push_back({1, 1, 4, 1, 192, {3}, {28}, {1}, {1}, {1}, {1}});
|
||||
this->conv_params.push_back({1, 1, 64, 1, 64, {3}, {14}, {1}, {1}, {1}, {1}});
|
||||
this->conv_params.push_back({1, 1, 64, 1, 64, {1}, {7}, {3}, {1}, {0}, {0}});
|
||||
this->conv_params.push_back({1, 1, 64, 1, 64, {1}, {3}, {1}, {1}, {0}, {0}});
|
||||
this->conv_params.push_back({1, 2, 4, 1, 192, {3}, {28}, {1}, {1}, {1}, {1}});
|
||||
this->conv_params.push_back({1, 2, 64, 1, 64, {3}, {14}, {1}, {1}, {1}, {1}});
|
||||
this->conv_params.push_back({1, 2, 64, 1, 64, {1}, {7}, {3}, {1}, {0}, {0}});
|
||||
this->conv_params.push_back({1, 2, 64, 1, 64, {1}, {3}, {1}, {1}, {0}, {0}});
|
||||
// ScalarPerVector should be 1
|
||||
this->conv_params.push_back({1, 1, 4, 1, 1, {3}, {28}, {1}, {1}, {1}, {1}});
|
||||
this->conv_params.push_back({1, 2, 4, 1, 1, {3}, {28}, {1}, {1}, {1}, {1}});
|
||||
// stride != 1
|
||||
this->conv_params.push_back({1, 1, 1, 1, 4, {3}, {28}, {2}, {1}, {1}, {1}});
|
||||
this->conv_params.push_back({1, 2, 1, 1, 4, {3}, {28}, {2}, {1}, {1}, {1}});
|
||||
// dilation != 1
|
||||
this->conv_params.push_back({1, 1, 1, 1, 4, {3}, {28}, {1}, {2}, {1}, {1}});
|
||||
this->conv_params.push_back({1, 2, 1, 1, 4, {3}, {28}, {1}, {2}, {1}, {1}});
|
||||
#ifdef CK_ENABLE_FP32
|
||||
this->template Run<1, float, float>();
|
||||
#endif
|
||||
@@ -106,13 +112,13 @@ TYPED_TEST(TestConvTensorRearrange2d, Test2D)
|
||||
this->conv_params.clear();
|
||||
|
||||
this->conv_params.push_back(
|
||||
{2, 1, 4, 1, 192, {3, 3}, {28, 28}, {1, 1}, {1, 1}, {1, 1}, {1, 1}});
|
||||
{2, 2, 4, 1, 192, {3, 3}, {28, 28}, {1, 1}, {1, 1}, {1, 1}, {1, 1}});
|
||||
this->conv_params.push_back(
|
||||
{2, 1, 64, 1, 64, {3, 3}, {14, 14}, {1, 1}, {1, 1}, {1, 1}, {1, 1}});
|
||||
{2, 2, 64, 1, 64, {3, 3}, {14, 14}, {1, 1}, {1, 1}, {1, 1}, {1, 1}});
|
||||
this->conv_params.push_back({2, 1, 64, 1, 64, {1, 1}, {7, 7}, {3, 3}, {1, 1}, {0, 0}, {0, 0}});
|
||||
this->conv_params.push_back({2, 1, 64, 1, 64, {1, 1}, {3, 3}, {1, 1}, {1, 1}, {0, 0}, {0, 0}});
|
||||
this->conv_params.push_back(
|
||||
{2, 1, 64, 1, 64, {3, 3}, {28, 28}, {2, 2}, {2, 2}, {1, 1}, {1, 1}});
|
||||
{2, 2, 64, 1, 64, {3, 3}, {28, 28}, {2, 2}, {2, 2}, {1, 1}, {1, 1}});
|
||||
#ifdef CK_ENABLE_FP32
|
||||
this->template Run<2, float, float>();
|
||||
#endif
|
||||
@@ -131,13 +137,13 @@ TYPED_TEST(TestConvTensorRearrange3d, Test3D)
|
||||
{
|
||||
this->conv_params.clear();
|
||||
this->conv_params.push_back(
|
||||
{3, 1, 16, 1, 64, {1, 1, 1}, {7, 7, 7}, {2, 2, 2}, {3, 3, 3}, {0, 0, 0}, {0, 0, 0}});
|
||||
{3, 2, 16, 1, 64, {1, 1, 1}, {7, 7, 7}, {2, 2, 2}, {3, 3, 3}, {0, 0, 0}, {0, 0, 0}});
|
||||
this->conv_params.push_back(
|
||||
{3, 1, 2, 1, 64, {3, 3, 3}, {14, 14, 3}, {1, 1, 1}, {1, 1, 1}, {1, 1, 1}, {1, 1, 1}});
|
||||
{3, 2, 2, 1, 64, {3, 3, 3}, {14, 14, 3}, {1, 1, 1}, {1, 1, 1}, {1, 1, 1}, {1, 1, 1}});
|
||||
this->conv_params.push_back(
|
||||
{3, 1, 32, 1, 64, {1, 1, 1}, {3, 3, 3}, {1, 1, 1}, {1, 1, 1}, {0, 0, 0}, {0, 0, 0}});
|
||||
{3, 2, 32, 1, 64, {1, 1, 1}, {3, 3, 3}, {1, 1, 1}, {1, 1, 1}, {0, 0, 0}, {0, 0, 0}});
|
||||
this->conv_params.push_back(
|
||||
{3, 1, 64, 1, 64, {3, 3, 3}, {14, 14, 14}, {2, 2, 2}, {2, 2, 2}, {1, 1, 1}, {1, 1, 1}});
|
||||
{3, 2, 64, 1, 64, {3, 3, 3}, {14, 14, 14}, {2, 2, 2}, {2, 2, 2}, {1, 1, 1}, {1, 1, 1}});
|
||||
#ifdef CK_ENABLE_FP32
|
||||
this->template Run<3, float, float>();
|
||||
#endif
|
||||
|
||||
@@ -53,7 +53,7 @@ class TestConvTensorRearrangeInterface : public ::testing::Test
|
||||
template <typename ConvTensorRearrangeOp>
|
||||
bool Run()
|
||||
{
|
||||
|
||||
const auto G = conv_param.G_;
|
||||
const auto N = conv_param.N_;
|
||||
const auto C = conv_param.C_;
|
||||
const auto FakeC =
|
||||
@@ -71,13 +71,13 @@ class TestConvTensorRearrangeInterface : public ::testing::Test
|
||||
const auto image_desc =
|
||||
ck::utils::conv::make_input_host_tensor_descriptor_g_n_c_wis_packed<ImLayout>(
|
||||
conv_param);
|
||||
const auto gemm_desc = HostTensorDescriptor({NDoHoWo, CZYX});
|
||||
const auto gemm_desc = HostTensorDescriptor({G, NDoHoWo, CZYX});
|
||||
|
||||
std::array<ck::index_t, NDimSpatial> input_spatial_lengths{};
|
||||
std::array<ck::index_t, NDimSpatial> filter_spatial_lengths{};
|
||||
std::array<ck::index_t, NDimSpatial> output_spatial_lengths{};
|
||||
std::array<ck::index_t, NDimSpatial + 3> input_g_n_c_wis_strides{};
|
||||
std::array<ck::index_t, 2> output_m_k_strides{};
|
||||
std::array<ck::index_t, 3> output_g_m_k_strides{};
|
||||
std::array<ck::index_t, NDimSpatial> conv_filter_strides{};
|
||||
std::array<ck::index_t, NDimSpatial> conv_filter_dilations{};
|
||||
std::array<ck::index_t, NDimSpatial> input_left_pads{};
|
||||
@@ -89,7 +89,7 @@ class TestConvTensorRearrangeInterface : public ::testing::Test
|
||||
copy(conv_param.filter_spatial_lengths_, filter_spatial_lengths);
|
||||
copy(conv_param.output_spatial_lengths_, output_spatial_lengths);
|
||||
copy(image_desc.GetStrides(), input_g_n_c_wis_strides);
|
||||
copy(gemm_desc.GetStrides(), output_m_k_strides);
|
||||
copy(gemm_desc.GetStrides(), output_g_m_k_strides);
|
||||
copy(conv_param.conv_filter_strides_, conv_filter_strides);
|
||||
copy(conv_param.conv_filter_dilations_, conv_filter_dilations);
|
||||
copy(conv_param.input_left_pads_, input_left_pads);
|
||||
@@ -100,13 +100,14 @@ class TestConvTensorRearrangeInterface : public ::testing::Test
|
||||
auto img2col = DeviceImgToColInstance{};
|
||||
auto argument = img2col.MakeArgument(nullptr,
|
||||
nullptr,
|
||||
G,
|
||||
N,
|
||||
IsCPacked ? C : FakeC,
|
||||
input_spatial_lengths,
|
||||
filter_spatial_lengths,
|
||||
output_spatial_lengths,
|
||||
input_g_n_c_wis_strides,
|
||||
output_m_k_strides,
|
||||
output_g_m_k_strides,
|
||||
conv_filter_strides,
|
||||
conv_filter_dilations,
|
||||
input_left_pads,
|
||||
@@ -119,13 +120,14 @@ class TestConvTensorRearrangeInterface : public ::testing::Test
|
||||
auto col2img = DeviceColToimgInstance{};
|
||||
auto argument = col2img.MakeArgument(nullptr,
|
||||
nullptr,
|
||||
G,
|
||||
N,
|
||||
IsCPacked ? C : FakeC,
|
||||
input_spatial_lengths,
|
||||
filter_spatial_lengths,
|
||||
output_spatial_lengths,
|
||||
input_g_n_c_wis_strides,
|
||||
output_m_k_strides,
|
||||
output_g_m_k_strides,
|
||||
conv_filter_strides,
|
||||
conv_filter_dilations,
|
||||
input_left_pads,
|
||||
|
||||
Reference in New Issue
Block a user