Introduce the multiABD API and deprecate multiD (#1035)

* Introduce the multiABD API and deprecate multiD

* Replace multiD with multiABD

* Mark structures as deprecated

* Change the Doxygen `deprecated` command to `note` to avoid warnings

[ROCm/composable_kernel commit: f2398f612d]
This commit is contained in:
Bartłomiej Kocot
2023-11-14 17:00:40 +01:00
committed by GitHub
parent 9866e150da
commit c80803505f
120 changed files with 3821 additions and 3538 deletions

View File

@@ -8,18 +8,18 @@ namespace tensor_operation {
namespace device {
namespace instance {
void add_device_conv2d_dl_bias_perchannel_quantization_int8_instances(
std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleD<NDimSpatial,
NHWGC,
GKYXC,
GK_GK_Tuple,
NHWGK,
int8_t,
int8_t,
I32_F32_Tuple,
int8_t,
PassThrough,
PassThrough,
Add_Mul2_Clamp>>>& instances)
std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<NDimSpatial,
NHWGC,
GKYXC,
GK_GK_Tuple,
NHWGK,
int8_t,
int8_t,
I32_F32_Tuple,
int8_t,
PassThrough,
PassThrough,
Add_Mul2_Clamp>>>& instances)
{
// dl
add_device_operation_instances(instances,
@@ -52,18 +52,18 @@ void add_device_conv2d_dl_bias_perchannel_quantization_int8_instances(
}
void add_device_conv2d_dl_bias_relu_perchannel_quantization_int8_instances(
std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleD<NDimSpatial,
NHWGC,
GKYXC,
GK_GK_Tuple,
NHWGK,
int8_t,
int8_t,
I32_F32_Tuple,
int8_t,
PassThrough,
PassThrough,
Add_Relu_Mul2_Clamp>>>& instances)
std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<NDimSpatial,
NHWGC,
GKYXC,
GK_GK_Tuple,
NHWGK,
int8_t,
int8_t,
I32_F32_Tuple,
int8_t,
PassThrough,
PassThrough,
Add_Relu_Mul2_Clamp>>>& instances)
{
// dl
add_device_operation_instances(instances,
@@ -96,18 +96,19 @@ void add_device_conv2d_dl_bias_relu_perchannel_quantization_int8_instances(
}
void add_device_conv2d_dl_bias_tanh_perchannel_quantization_int8_instances(
std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleD<NDimSpatial,
NHWGC,
GKYXC,
GK_GK_Tuple,
NHWGK,
int8_t,
int8_t,
I32_F32_Tuple,
int8_t,
PassThrough,
PassThrough,
Add_Mul2_TanH_Mul_Clamp>>>& instances)
std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<NDimSpatial,
NHWGC,
GKYXC,
GK_GK_Tuple,
NHWGK,
int8_t,
int8_t,
I32_F32_Tuple,
int8_t,
PassThrough,
PassThrough,
Add_Mul2_TanH_Mul_Clamp>>>&
instances)
{
// dl
add_device_operation_instances(instances,

View File

@@ -8,18 +8,18 @@ namespace tensor_operation {
namespace device {
namespace instance {
void add_device_conv2d_dl_bias_perlayer_quantization_int8_instances(
std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleD<NDimSpatial,
NHWGC,
GKYXC,
GK_Tuple,
NHWGK,
int8_t,
int8_t,
I32_Tuple,
int8_t,
PassThrough,
PassThrough,
Add_Mul_Clamp>>>& instances)
std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<NDimSpatial,
NHWGC,
GKYXC,
GK_Tuple,
NHWGK,
int8_t,
int8_t,
I32_Tuple,
int8_t,
PassThrough,
PassThrough,
Add_Mul_Clamp>>>& instances)
{
add_device_operation_instances(instances,
device_grouped_conv2d_dl_int8_instances<NHWGC,
@@ -51,18 +51,18 @@ void add_device_conv2d_dl_bias_perlayer_quantization_int8_instances(
}
void add_device_conv2d_dl_bias_relu_perlayer_quantization_int8_instances(
std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleD<NDimSpatial,
NHWGC,
GKYXC,
GK_Tuple,
NHWGK,
int8_t,
int8_t,
I32_Tuple,
int8_t,
PassThrough,
PassThrough,
Add_Relu_Mul_Clamp>>>& instances)
std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<NDimSpatial,
NHWGC,
GKYXC,
GK_Tuple,
NHWGK,
int8_t,
int8_t,
I32_Tuple,
int8_t,
PassThrough,
PassThrough,
Add_Relu_Mul_Clamp>>>& instances)
{
add_device_operation_instances(instances,
device_grouped_conv2d_dl_int8_instances<NHWGC,
@@ -96,18 +96,19 @@ void add_device_conv2d_dl_bias_relu_perlayer_quantization_int8_instances(
}
void add_device_conv2d_dl_bias_tanh_perlayer_quantization_int8_instances(
std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleD<NDimSpatial,
NHWGC,
GKYXC,
GK_Tuple,
NHWGK,
int8_t,
int8_t,
I32_Tuple,
int8_t,
PassThrough,
PassThrough,
Add_Mul_TanH_Mul_Clamp>>>& instances)
std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<NDimSpatial,
NHWGC,
GKYXC,
GK_Tuple,
NHWGK,
int8_t,
int8_t,
I32_Tuple,
int8_t,
PassThrough,
PassThrough,
Add_Mul_TanH_Mul_Clamp>>>&
instances)
{
add_device_operation_instances(instances,
device_grouped_conv2d_dl_int8_instances<NHWGC,

View File

@@ -8,18 +8,18 @@ namespace tensor_operation {
namespace device {
namespace instance {
void add_device_conv2d_dl_perchannel_quantization_int8_instances(
std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleD<NDimSpatial,
NHWGC,
GKYXC,
GK_Tuple,
NHWGK,
int8_t,
int8_t,
F32_Tuple,
int8_t,
PassThrough,
PassThrough,
Mul2_Clamp>>>& instances)
std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<NDimSpatial,
NHWGC,
GKYXC,
GK_Tuple,
NHWGK,
int8_t,
int8_t,
F32_Tuple,
int8_t,
PassThrough,
PassThrough,
Mul2_Clamp>>>& instances)
{
add_device_operation_instances(instances,
device_grouped_conv2d_dl_int8_instances<NHWGC,
@@ -51,18 +51,18 @@ void add_device_conv2d_dl_perchannel_quantization_int8_instances(
}
void add_device_conv2d_dl_relu_perchannel_quantization_int8_instances(
std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleD<NDimSpatial,
NHWGC,
GKYXC,
GK_Tuple,
NHWGK,
int8_t,
int8_t,
F32_Tuple,
int8_t,
PassThrough,
PassThrough,
Relu_Mul2_Clamp>>>& instances)
std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<NDimSpatial,
NHWGC,
GKYXC,
GK_Tuple,
NHWGK,
int8_t,
int8_t,
F32_Tuple,
int8_t,
PassThrough,
PassThrough,
Relu_Mul2_Clamp>>>& instances)
{
add_device_operation_instances(instances,
device_grouped_conv2d_dl_int8_instances<NHWGC,

View File

@@ -8,18 +8,18 @@ namespace tensor_operation {
namespace device {
namespace instance {
void add_device_conv2d_dl_perlayer_quantization_int8_instances(
std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleD<NDimSpatial,
NHWGC,
GKYXC,
Empty_Tuple,
NHWGK,
int8_t,
int8_t,
Empty_Tuple,
int8_t,
PassThrough,
PassThrough,
Mul_Clamp>>>& instances)
std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<NDimSpatial,
NHWGC,
GKYXC,
Empty_Tuple,
NHWGK,
int8_t,
int8_t,
Empty_Tuple,
int8_t,
PassThrough,
PassThrough,
Mul_Clamp>>>& instances)
{
add_device_operation_instances(instances,
device_grouped_conv2d_dl_int8_instances<NHWGC,
@@ -51,18 +51,18 @@ void add_device_conv2d_dl_perlayer_quantization_int8_instances(
}
void add_device_conv2d_dl_relu_perlayer_quantization_int8_instances(
std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleD<NDimSpatial,
NHWGC,
GKYXC,
Empty_Tuple,
NHWGK,
int8_t,
int8_t,
Empty_Tuple,
int8_t,
PassThrough,
PassThrough,
Relu_Mul_Clamp>>>& instances)
std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<NDimSpatial,
NHWGC,
GKYXC,
Empty_Tuple,
NHWGK,
int8_t,
int8_t,
Empty_Tuple,
int8_t,
PassThrough,
PassThrough,
Relu_Mul_Clamp>>>& instances)
{
add_device_operation_instances(instances,
device_grouped_conv2d_dl_int8_instances<NHWGC,

View File

@@ -8,18 +8,18 @@ namespace tensor_operation {
namespace device {
namespace instance {
void add_device_conv2d_xdl_bias_perchannel_quantization_int8_instances(
std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleD<NDimSpatial,
NHWGC,
GKYXC,
GK_GK_Tuple,
NHWGK,
int8_t,
int8_t,
I32_F32_Tuple,
int8_t,
PassThrough,
PassThrough,
Add_Mul2_Clamp>>>& instances)
std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<NDimSpatial,
NHWGC,
GKYXC,
GK_GK_Tuple,
NHWGK,
int8_t,
int8_t,
I32_F32_Tuple,
int8_t,
PassThrough,
PassThrough,
Add_Mul2_Clamp>>>& instances)
{
add_device_operation_instances(instances,
device_grouped_conv2d_xdl_int8_instances<NHWGC,
@@ -51,18 +51,18 @@ void add_device_conv2d_xdl_bias_perchannel_quantization_int8_instances(
}
void add_device_conv2d_xdl_bias_relu_perchannel_quantization_int8_instances(
std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleD<NDimSpatial,
NHWGC,
GKYXC,
GK_GK_Tuple,
NHWGK,
int8_t,
int8_t,
I32_F32_Tuple,
int8_t,
PassThrough,
PassThrough,
Add_Relu_Mul2_Clamp>>>& instances)
std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<NDimSpatial,
NHWGC,
GKYXC,
GK_GK_Tuple,
NHWGK,
int8_t,
int8_t,
I32_F32_Tuple,
int8_t,
PassThrough,
PassThrough,
Add_Relu_Mul2_Clamp>>>& instances)
{
add_device_operation_instances(instances,
device_grouped_conv2d_xdl_int8_instances<NHWGC,
@@ -94,18 +94,19 @@ void add_device_conv2d_xdl_bias_relu_perchannel_quantization_int8_instances(
}
void add_device_conv2d_xdl_bias_tanh_perchannel_quantization_int8_instances(
std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleD<NDimSpatial,
NHWGC,
GKYXC,
GK_GK_Tuple,
NHWGK,
int8_t,
int8_t,
I32_F32_Tuple,
int8_t,
PassThrough,
PassThrough,
Add_Mul2_TanH_Mul_Clamp>>>& instances)
std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<NDimSpatial,
NHWGC,
GKYXC,
GK_GK_Tuple,
NHWGK,
int8_t,
int8_t,
I32_F32_Tuple,
int8_t,
PassThrough,
PassThrough,
Add_Mul2_TanH_Mul_Clamp>>>&
instances)
{
add_device_operation_instances(instances,
device_grouped_conv2d_xdl_int8_instances<NHWGC,

View File

@@ -8,18 +8,18 @@ namespace tensor_operation {
namespace device {
namespace instance {
void add_device_conv2d_xdl_bias_perlayer_quantization_int8_instances(
std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleD<NDimSpatial,
NHWGC,
GKYXC,
GK_Tuple,
NHWGK,
int8_t,
int8_t,
I32_Tuple,
int8_t,
PassThrough,
PassThrough,
Add_Mul_Clamp>>>& instances)
std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<NDimSpatial,
NHWGC,
GKYXC,
GK_Tuple,
NHWGK,
int8_t,
int8_t,
I32_Tuple,
int8_t,
PassThrough,
PassThrough,
Add_Mul_Clamp>>>& instances)
{
add_device_operation_instances(instances,
device_grouped_conv2d_xdl_int8_instances<NHWGC,
@@ -51,18 +51,18 @@ void add_device_conv2d_xdl_bias_perlayer_quantization_int8_instances(
}
void add_device_conv2d_xdl_bias_relu_perlayer_quantization_int8_instances(
std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleD<NDimSpatial,
NHWGC,
GKYXC,
GK_Tuple,
NHWGK,
int8_t,
int8_t,
I32_Tuple,
int8_t,
PassThrough,
PassThrough,
Add_Relu_Mul_Clamp>>>& instances)
std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<NDimSpatial,
NHWGC,
GKYXC,
GK_Tuple,
NHWGK,
int8_t,
int8_t,
I32_Tuple,
int8_t,
PassThrough,
PassThrough,
Add_Relu_Mul_Clamp>>>& instances)
{
add_device_operation_instances(instances,
device_grouped_conv2d_xdl_int8_instances<NHWGC,
@@ -96,18 +96,19 @@ void add_device_conv2d_xdl_bias_relu_perlayer_quantization_int8_instances(
}
void add_device_conv2d_xdl_bias_tanh_perlayer_quantization_int8_instances(
std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleD<NDimSpatial,
NHWGC,
GKYXC,
GK_Tuple,
NHWGK,
int8_t,
int8_t,
I32_Tuple,
int8_t,
PassThrough,
PassThrough,
Add_Mul_TanH_Mul_Clamp>>>& instances)
std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<NDimSpatial,
NHWGC,
GKYXC,
GK_Tuple,
NHWGK,
int8_t,
int8_t,
I32_Tuple,
int8_t,
PassThrough,
PassThrough,
Add_Mul_TanH_Mul_Clamp>>>&
instances)
{
add_device_operation_instances(instances,
device_grouped_conv2d_xdl_int8_instances<NHWGC,

View File

@@ -4,7 +4,7 @@
#pragma once
#include "conv2d_quantization_common.hpp"
#include "ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_d_xdl_cshuffle.hpp"
#include "ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_abd_xdl_cshuffle.hpp"
namespace ck {
namespace tensor_operation {
@@ -26,19 +26,19 @@ using device_grouped_conv2d_xdl_int8_instances =
//########################################| Spatial| Layout| Layout| Layout| Layout| Type| Type| Type| DataType| DataType| Type| Elementwise| Elementwise| Elementwise| Specialization| Specialization| Prefetch| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MWaveMPerXdl| ScalarPerVector|
//########################################| | | | | | | | | | | | Operation| Operation| Operation| | | Stage| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| _NWaveNPerXdl|
//########################################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | |
DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<NDimSpatial, ALayout, BLayout, DsLayout, ELayout, int8_t, int8_t, int32_t, int32_t, DsDatatype, int8_t, PassThrough, PassThrough, OutElementOp, ConvSpec, GemmSpec, 1, 256, 256, 128, 64, 16, 16, 32, 32, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 1, 1, 1, S<1, 64, 1, 4>, DstScalarPerVector>,
DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<NDimSpatial, ALayout, BLayout, DsLayout, ELayout, int8_t, int8_t, int32_t, int32_t, DsDatatype, int8_t, PassThrough, PassThrough, OutElementOp, ConvSpec, GemmSpec, 1, 256, 128, 256, 64, 16, 16, 32, 32, 2, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 1, 1, 1, S<1, 64, 1, 4>, DstScalarPerVector>,
DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<NDimSpatial, ALayout, BLayout, DsLayout, ELayout, int8_t, int8_t, int32_t, int32_t, DsDatatype, int8_t, PassThrough, PassThrough, OutElementOp, ConvSpec, GemmSpec, 1, 128, 128, 128, 64, 16, 16, 32, 32, 4, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 1, 1, 1, S<1, 32, 1, 4>, DstScalarPerVector>,
DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<NDimSpatial, ALayout, BLayout, DsLayout, ELayout, int8_t, int8_t, int32_t, int32_t, DsDatatype, int8_t, PassThrough, PassThrough, OutElementOp, ConvSpec, GemmSpec, 1, 256, 128, 128, 64, 16, 16, 32, 32, 2, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 1, 1, 1, S<1, 64, 1, 4>, DstScalarPerVector>,
DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<NDimSpatial, ALayout, BLayout, DsLayout, ELayout, int8_t, int8_t, int32_t, int32_t, DsDatatype, int8_t, PassThrough, PassThrough, OutElementOp, ConvSpec, GemmSpec, 1, 128, 128, 64, 64, 16, 16, 32, 32, 2, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 1, 1, 1, S<1, 64, 1, 2>, DstScalarPerVector>,
DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<NDimSpatial, ALayout, BLayout, DsLayout, ELayout, int8_t, int8_t, int32_t, int32_t, DsDatatype, int8_t, PassThrough, PassThrough, OutElementOp, ConvSpec, GemmSpec, 1, 128, 64, 128, 64, 16, 16, 32, 32, 2, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 1, 1, 1, S<1, 32, 1, 4>, DstScalarPerVector>,
DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<NDimSpatial, ALayout, BLayout, DsLayout, ELayout, int8_t, int8_t, int32_t, int32_t, DsDatatype, int8_t, PassThrough, PassThrough, OutElementOp, ConvSpec, GemmSpec, 1, 64, 64, 64, 64, 16, 16, 32, 32, 2, 2, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 1, 1, 1, S<1, 32, 1, 2>, DstScalarPerVector>,
DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<NDimSpatial, ALayout, BLayout, DsLayout, ELayout, int8_t, int8_t, int32_t, int32_t, DsDatatype, int8_t, PassThrough, PassThrough, OutElementOp, ConvSpec, GemmSpec, 1, 256, 128, 64, 64, 16, 16, 32, 32, 2, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 1, 1, 1, S<1, 64, 1, 4>, DstScalarPerVector>,
DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<NDimSpatial, ALayout, BLayout, DsLayout, ELayout, int8_t, int8_t, int32_t, int32_t, DsDatatype, int8_t, PassThrough, PassThrough, OutElementOp, ConvSpec, GemmSpec, 1, 256, 64, 128, 64, 16, 16, 32, 32, 1, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 1, 1, 1, S<1, 64, 1, 4>, DstScalarPerVector>,
DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<NDimSpatial, ALayout, BLayout, DsLayout, ELayout, int8_t, int8_t, int32_t, int32_t, DsDatatype, int8_t, PassThrough, PassThrough, OutElementOp, ConvSpec, GemmSpec, 1, 128, 128, 32, 64, 16, 16, 32, 32, 2, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 1, 1, 1, S<1, 64, 1, 2>, DstScalarPerVector>,
DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<NDimSpatial, ALayout, BLayout, DsLayout, ELayout, int8_t, int8_t, int32_t, int32_t, DsDatatype, int8_t, PassThrough, PassThrough, OutElementOp, ConvSpec, GemmSpec, 1, 128, 32, 128, 64, 16, 16, 32, 32, 1, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 1, 1, 1, S<1, 32, 1, 4>, DstScalarPerVector>,
DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<NDimSpatial, ALayout, BLayout, DsLayout, ELayout, int8_t, int8_t, int32_t, int32_t, DsDatatype, int8_t, PassThrough, PassThrough, OutElementOp, ConvSpec, GemmSpec, 1, 64, 64, 32, 64, 16, 16, 32, 32, 2, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 1, 1, 1, S<1, 32, 1, 2>, DstScalarPerVector>,
DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<NDimSpatial, ALayout, BLayout, DsLayout, ELayout, int8_t, int8_t, int32_t, int32_t, DsDatatype, int8_t, PassThrough, PassThrough, OutElementOp, ConvSpec, GemmSpec, 1, 64, 32, 64, 64, 16, 16, 32, 32, 1, 2, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 1, 1, 1, S<1, 32, 1, 2>, DstScalarPerVector>
DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<NDimSpatial, ALayout, BLayout, DsLayout, ELayout, int8_t, int8_t, int32_t, int32_t, DsDatatype, int8_t, PassThrough, PassThrough, OutElementOp, ConvSpec, GemmSpec, 1, 256, 256, 128, 64, 16, 16, 32, 32, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 1, 1, 1, S<1, 64, 1, 4>, DstScalarPerVector>,
DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<NDimSpatial, ALayout, BLayout, DsLayout, ELayout, int8_t, int8_t, int32_t, int32_t, DsDatatype, int8_t, PassThrough, PassThrough, OutElementOp, ConvSpec, GemmSpec, 1, 256, 128, 256, 64, 16, 16, 32, 32, 2, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 1, 1, 1, S<1, 64, 1, 4>, DstScalarPerVector>,
DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<NDimSpatial, ALayout, BLayout, DsLayout, ELayout, int8_t, int8_t, int32_t, int32_t, DsDatatype, int8_t, PassThrough, PassThrough, OutElementOp, ConvSpec, GemmSpec, 1, 128, 128, 128, 64, 16, 16, 32, 32, 4, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 1, 1, 1, S<1, 32, 1, 4>, DstScalarPerVector>,
DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<NDimSpatial, ALayout, BLayout, DsLayout, ELayout, int8_t, int8_t, int32_t, int32_t, DsDatatype, int8_t, PassThrough, PassThrough, OutElementOp, ConvSpec, GemmSpec, 1, 256, 128, 128, 64, 16, 16, 32, 32, 2, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 1, 1, 1, S<1, 64, 1, 4>, DstScalarPerVector>,
DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<NDimSpatial, ALayout, BLayout, DsLayout, ELayout, int8_t, int8_t, int32_t, int32_t, DsDatatype, int8_t, PassThrough, PassThrough, OutElementOp, ConvSpec, GemmSpec, 1, 128, 128, 64, 64, 16, 16, 32, 32, 2, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 1, 1, 1, S<1, 64, 1, 2>, DstScalarPerVector>,
DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<NDimSpatial, ALayout, BLayout, DsLayout, ELayout, int8_t, int8_t, int32_t, int32_t, DsDatatype, int8_t, PassThrough, PassThrough, OutElementOp, ConvSpec, GemmSpec, 1, 128, 64, 128, 64, 16, 16, 32, 32, 2, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 1, 1, 1, S<1, 32, 1, 4>, DstScalarPerVector>,
DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<NDimSpatial, ALayout, BLayout, DsLayout, ELayout, int8_t, int8_t, int32_t, int32_t, DsDatatype, int8_t, PassThrough, PassThrough, OutElementOp, ConvSpec, GemmSpec, 1, 64, 64, 64, 64, 16, 16, 32, 32, 2, 2, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 1, 1, 1, S<1, 32, 1, 2>, DstScalarPerVector>,
DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<NDimSpatial, ALayout, BLayout, DsLayout, ELayout, int8_t, int8_t, int32_t, int32_t, DsDatatype, int8_t, PassThrough, PassThrough, OutElementOp, ConvSpec, GemmSpec, 1, 256, 128, 64, 64, 16, 16, 32, 32, 2, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 1, 1, 1, S<1, 64, 1, 4>, DstScalarPerVector>,
DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<NDimSpatial, ALayout, BLayout, DsLayout, ELayout, int8_t, int8_t, int32_t, int32_t, DsDatatype, int8_t, PassThrough, PassThrough, OutElementOp, ConvSpec, GemmSpec, 1, 256, 64, 128, 64, 16, 16, 32, 32, 1, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 1, 1, 1, S<1, 64, 1, 4>, DstScalarPerVector>,
DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<NDimSpatial, ALayout, BLayout, DsLayout, ELayout, int8_t, int8_t, int32_t, int32_t, DsDatatype, int8_t, PassThrough, PassThrough, OutElementOp, ConvSpec, GemmSpec, 1, 128, 128, 32, 64, 16, 16, 32, 32, 2, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 1, 1, 1, S<1, 64, 1, 2>, DstScalarPerVector>,
DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<NDimSpatial, ALayout, BLayout, DsLayout, ELayout, int8_t, int8_t, int32_t, int32_t, DsDatatype, int8_t, PassThrough, PassThrough, OutElementOp, ConvSpec, GemmSpec, 1, 128, 32, 128, 64, 16, 16, 32, 32, 1, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 1, 1, 1, S<1, 32, 1, 4>, DstScalarPerVector>,
DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<NDimSpatial, ALayout, BLayout, DsLayout, ELayout, int8_t, int8_t, int32_t, int32_t, DsDatatype, int8_t, PassThrough, PassThrough, OutElementOp, ConvSpec, GemmSpec, 1, 64, 64, 32, 64, 16, 16, 32, 32, 2, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 1, 1, 1, S<1, 32, 1, 2>, DstScalarPerVector>,
DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<NDimSpatial, ALayout, BLayout, DsLayout, ELayout, int8_t, int8_t, int32_t, int32_t, DsDatatype, int8_t, PassThrough, PassThrough, OutElementOp, ConvSpec, GemmSpec, 1, 64, 32, 64, 64, 16, 16, 32, 32, 1, 2, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 1, 1, 1, S<1, 32, 1, 2>, DstScalarPerVector>
>;
// clang-format on

View File

@@ -8,18 +8,18 @@ namespace tensor_operation {
namespace device {
namespace instance {
void add_device_conv2d_xdl_perchannel_quantization_int8_instances(
std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleD<NDimSpatial,
NHWGC,
GKYXC,
GK_Tuple,
NHWGK,
int8_t,
int8_t,
F32_Tuple,
int8_t,
PassThrough,
PassThrough,
Mul2_Clamp>>>& instances)
std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<NDimSpatial,
NHWGC,
GKYXC,
GK_Tuple,
NHWGK,
int8_t,
int8_t,
F32_Tuple,
int8_t,
PassThrough,
PassThrough,
Mul2_Clamp>>>& instances)
{
add_device_operation_instances(instances,
device_grouped_conv2d_xdl_int8_instances<NHWGC,
@@ -51,18 +51,18 @@ void add_device_conv2d_xdl_perchannel_quantization_int8_instances(
}
void add_device_conv2d_xdl_relu_perchannel_quantization_int8_instances(
std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleD<NDimSpatial,
NHWGC,
GKYXC,
GK_Tuple,
NHWGK,
int8_t,
int8_t,
F32_Tuple,
int8_t,
PassThrough,
PassThrough,
Relu_Mul2_Clamp>>>& instances)
std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<NDimSpatial,
NHWGC,
GKYXC,
GK_Tuple,
NHWGK,
int8_t,
int8_t,
F32_Tuple,
int8_t,
PassThrough,
PassThrough,
Relu_Mul2_Clamp>>>& instances)
{
add_device_operation_instances(instances,
device_grouped_conv2d_xdl_int8_instances<NHWGC,

View File

@@ -8,18 +8,18 @@ namespace tensor_operation {
namespace device {
namespace instance {
void add_device_conv2d_xdl_perlayer_quantization_int8_instances(
std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleD<NDimSpatial,
NHWGC,
GKYXC,
Empty_Tuple,
NHWGK,
int8_t,
int8_t,
Empty_Tuple,
int8_t,
PassThrough,
PassThrough,
Mul_Clamp>>>& instances)
std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<NDimSpatial,
NHWGC,
GKYXC,
Empty_Tuple,
NHWGK,
int8_t,
int8_t,
Empty_Tuple,
int8_t,
PassThrough,
PassThrough,
Mul_Clamp>>>& instances)
{
add_device_operation_instances(instances,
device_grouped_conv2d_xdl_int8_instances<NHWGC,
@@ -51,18 +51,18 @@ void add_device_conv2d_xdl_perlayer_quantization_int8_instances(
}
void add_device_conv2d_xdl_relu_perlayer_quantization_int8_instances(
std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleD<NDimSpatial,
NHWGC,
GKYXC,
Empty_Tuple,
NHWGK,
int8_t,
int8_t,
Empty_Tuple,
int8_t,
PassThrough,
PassThrough,
Relu_Mul_Clamp>>>& instances)
std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<NDimSpatial,
NHWGC,
GKYXC,
Empty_Tuple,
NHWGK,
int8_t,
int8_t,
Empty_Tuple,
int8_t,
PassThrough,
PassThrough,
Relu_Mul_Clamp>>>& instances)
{
add_device_operation_instances(instances,
device_grouped_conv2d_xdl_int8_instances<NHWGC,