Added working fp16 and int8 instances.

This commit is contained in:
Ville Pietilä
2025-12-11 03:58:34 -05:00
parent cb83826b52
commit b4c3a1bbcf
4 changed files with 146 additions and 9 deletions

View File

@@ -116,7 +116,7 @@ bool profile_grouped_conv_fwd_impl(int do_verification,
// get device op instances
const auto ops = ck_tile::ops::DeviceOperationInstanceFactory<DeviceOp>::GetInstances();
std::cout << "found " << ops.size() << " instances" << std::endl;
std::cout << "Found " << ops.size() << " instances\n" << std::endl;
std::string best_op_name;
float best_avg_time = 0;
@@ -149,13 +149,14 @@ bool profile_grouped_conv_fwd_impl(int do_verification,
if(op->IsSupportedArgument(args))
{
num_kernel++;
std::string op_name = op->GetName(args);
if((instance_index != -1) && (instance_index + 1 != num_kernel))
{
// an instance_index was specified and this supported instance is not the requested one; skip it
continue;
}
std::string op_name = op->GetName(args);
std::cout << op_name << " is profiled..." << std::endl;
// Run verification first. If it doesn't pass, no need to do performance measurement.
@@ -228,7 +229,7 @@ bool profile_grouped_conv_fwd_impl(int do_verification,
}
else
{
// std::cout << op->GetName(args) << " does not support this problem." << std::endl;
std::cout << op->GetName(args) << " does not support this problem." << std::endl;
}
}

View File

@@ -32,7 +32,9 @@ using tile_grouped_conv_fwd_fp16_instances = std::tuple<
//##############################| Num| InLayout| WeiLayout| OutLayout| InData| WeiData| OutData| In| Wei| Out| Conv| K-block| M-tile| N-tile | K-tile | M-warp| N-warp| K-warp| M-warp| N-warp| K-warp| Vector| Vector| Vector| Double| GEMM|
//##############################| Dim| | | | Type| Type| Type| Elementwise| Elementwise| Elementwise| Spec| per| | | | | | | tile| tile| tile| size| size| size| smem| pipeline|
//##############################| Spatial| | | | | | | Operation| Operation| Operation| | CU| | | | | | | size| size| size| A| B| C| buffer| version|
GroupedConvolutionForwardInvoker<NDimSpatial, ALayout, BLayout, ELayout, F16, F16, F16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter1x1Stride1Pad0, 1, 64, 64, 32, 1, 1, 1, 32, 32, 16, 1, 1, 1, false, CK_TILE_PIPELINE_MEMORY>
GroupedConvolutionForwardInvoker<NDimSpatial, ALayout, BLayout, ELayout, F16, F16, F16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter1x1Stride1Pad0, 1, 64, 64, 32, 1, 1, 1, 32, 32, 16, 1, 1, 1, false, CK_TILE_PIPELINE_MEMORY>,
GroupedConvolutionForwardInvoker<NDimSpatial, ALayout, BLayout, ELayout, F16, F16, F16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Default, 1, 64, 64, 32, 1, 1, 1, 32, 32, 16, 1, 1, 1, false, CK_TILE_PIPELINE_MEMORY>
// clang-format on
// clang-format on
>;

View File

@@ -22,6 +22,8 @@ using DeviceOpFwd2DINT8 = GroupedConvolutionForwardBaseInvoker<2,
PassThrough,
INT8,
INT8>;
//using enum ConvSpecDef = ConvolutionSpecialization::Default;
template <ck_tile::index_t NDimSpatial,
typename ALayout,
@@ -32,7 +34,139 @@ using tile_grouped_conv_fwd_int8_instances = std::tuple<
//##############################| Num| InLayout| WeiLayout| OutLayout| InData| WeiData| OutData| In| Wei| Out| Conv| K-block| M-tile| N-tile | K-tile | M-warp| N-warp| K-warp| M-warp| N-warp| K-warp| Vector| Vector| Vector| Double| GEMM|
//##############################| Dim| | | | Type| Type| Type| Elementwise| Elementwise| Elementwise| Spec| per| | | | | | | tile| tile| tile| size| size| size| smem| pipeline|
//##############################| Spatial| | | | | | | Operation| Operation| Operation| | CU| | | | | | | size| size| size| A| B| C| buffer| version|
GroupedConvolutionForwardInvoker<NDimSpatial, ALayout, BLayout, ELayout, INT8, INT8, INT8, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter1x1Stride1Pad0, 1, 64, 64, 32, 1, 1, 1, 32, 32, 16, 1, 1, 1, false, CK_TILE_PIPELINE_MEMORY>
GroupedConvolutionForwardInvoker<NDimSpatial, ALayout, BLayout, ELayout, INT8, INT8, INT8, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter1x1Stride1Pad0, 1, 64, 64, 32, 1, 1, 1, 32, 32, 16, 1, 1, 1, false, CK_TILE_PIPELINE_MEMORY>,
GroupedConvolutionForwardInvoker<NDimSpatial, ALayout, BLayout, ELayout, INT8, INT8, INT8, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Default, 1, 64, 64, 32, 1, 1, 1, 32, 32, 16, 1, 1, 1, false, CK_TILE_PIPELINE_MEMORY>
// Converted from device_grouped_conv_fwd_xdl_int8_comp_instances_2x
// GroupedConvolutionForwardInvoker<NDimSpatial, ALayout, BLayout, ELayout, INT8, INT8, INT8, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter1x1Stride1Pad0, 1, 128, 128, 128, 2, 2, 1, 32, 32, 32, 16, 16, 16, false, CK_TILE_PIPELINE_MEMORY>,
// GroupedConvolutionForwardInvoker<NDimSpatial, ALayout, BLayout, ELayout, INT8, INT8, INT8, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter1x1Stride1Pad0, 1, 128, 128, 128, 2, 2, 1, 32, 32, 32, 16, 16, 16, false, CK_TILE_PIPELINE_COMPUTE_V3>,
// GroupedConvolutionForwardInvoker<NDimSpatial, ALayout, BLayout, ELayout, INT8, INT8, INT8, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter1x1Stride1Pad0, 1, 128, 128, 128, 2, 2, 1, 32, 32, 32, 16, 16, 16, true, CK_TILE_PIPELINE_COMPUTE_V4>,
// GroupedConvolutionForwardInvoker<NDimSpatial, ALayout, BLayout, ELayout, INT8, INT8, INT8, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter1x1Pad0, 1, 128, 128, 128, 2, 2, 1, 32, 32, 32, 16, 16, 16, false, CK_TILE_PIPELINE_MEMORY>,
// GroupedConvolutionForwardInvoker<NDimSpatial, ALayout, BLayout, ELayout, INT8, INT8, INT8, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter1x1Pad0, 1, 128, 128, 128, 2, 2, 1, 32, 32, 32, 16, 16, 16, false, CK_TILE_PIPELINE_COMPUTE_V3>,
// GroupedConvolutionForwardInvoker<NDimSpatial, ALayout, BLayout, ELayout, INT8, INT8, INT8, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter1x1Pad0, 1, 128, 128, 128, 2, 2, 1, 32, 32, 32, 16, 16, 16, true, CK_TILE_PIPELINE_COMPUTE_V4>,
// GroupedConvolutionForwardInvoker<NDimSpatial, ALayout, BLayout, ELayout, INT8, INT8, INT8, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter3x3, 1, 128, 128, 128, 2, 2, 1, 32, 32, 32, 16, 16, 16, false, CK_TILE_PIPELINE_MEMORY>,
// GroupedConvolutionForwardInvoker<NDimSpatial, ALayout, BLayout, ELayout, INT8, INT8, INT8, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter3x3, 1, 128, 128, 128, 2, 2, 1, 32, 32, 32, 16, 16, 16, false, CK_TILE_PIPELINE_COMPUTE_V3>,
// GroupedConvolutionForwardInvoker<NDimSpatial, ALayout, BLayout, ELayout, INT8, INT8, INT8, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter3x3, 1, 128, 128, 128, 2, 2, 1, 32, 32, 32, 16, 16, 16, true, CK_TILE_PIPELINE_COMPUTE_V4>,
// GroupedConvolutionForwardInvoker<NDimSpatial, ALayout, BLayout, ELayout, INT8, INT8, INT8, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Default, 1, 128, 128, 128, 2, 2, 1, 32, 32, 32, 16, 16, 16, false, CK_TILE_PIPELINE_MEMORY>,
// GroupedConvolutionForwardInvoker<NDimSpatial, ALayout, BLayout, ELayout, INT8, INT8, INT8, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Default, 1, 128, 128, 128, 2, 2, 1, 32, 32, 32, 16, 16, 16, false, CK_TILE_PIPELINE_COMPUTE_V3>,
// GroupedConvolutionForwardInvoker<NDimSpatial, ALayout, BLayout, ELayout, INT8, INT8, INT8, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Default, 1, 128, 128, 128, 2, 2, 1, 32, 32, 32, 16, 16, 16, true, CK_TILE_PIPELINE_COMPUTE_V4>,
// Converted from device_grouped_conv_fwd_xdl_int8_comp_instances
// GroupedConvolutionForwardInvoker<NDimSpatial, ALayout, BLayout, ELayout, INT8, INT8, INT8, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter1x1Stride1Pad0, 1, 128, 128, 32, 2, 2, 1, 32, 32, 32, 8, 8, 8, false, CK_TILE_PIPELINE_MEMORY>,
// GroupedConvolutionForwardInvoker<NDimSpatial, ALayout, BLayout, ELayout, INT8, INT8, INT8, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter1x1Stride1Pad0, 1, 128, 128, 32, 2, 2, 1, 32, 32, 32, 8, 8, 8, false, CK_TILE_PIPELINE_COMPUTE_V3>,
// GroupedConvolutionForwardInvoker<NDimSpatial, ALayout, BLayout, ELayout, INT8, INT8, INT8, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter1x1Stride1Pad0, 1, 128, 128, 32, 2, 2, 1, 32, 32, 32, 8, 8, 8, true, CK_TILE_PIPELINE_COMPUTE_V4>,
// GroupedConvolutionForwardInvoker<NDimSpatial, ALayout, BLayout, ELayout, INT8, INT8, INT8, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter1x1Pad0, 1, 128, 128, 32, 2, 2, 1, 32, 32, 32, 8, 8, 8, false, CK_TILE_PIPELINE_MEMORY>,
// GroupedConvolutionForwardInvoker<NDimSpatial, ALayout, BLayout, ELayout, INT8, INT8, INT8, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter1x1Pad0, 1, 128, 128, 32, 2, 2, 1, 32, 32, 32, 8, 8, 8, false, CK_TILE_PIPELINE_COMPUTE_V3>,
// GroupedConvolutionForwardInvoker<NDimSpatial, ALayout, BLayout, ELayout, INT8, INT8, INT8, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter1x1Pad0, 1, 128, 128, 32, 2, 2, 1, 32, 32, 32, 8, 8, 8, true, CK_TILE_PIPELINE_COMPUTE_V4>,
// GroupedConvolutionForwardInvoker<NDimSpatial, ALayout, BLayout, ELayout, INT8, INT8, INT8, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter3x3, 1, 128, 128, 32, 2, 2, 1, 32, 32, 32, 8, 8, 8, false, CK_TILE_PIPELINE_MEMORY>,
// GroupedConvolutionForwardInvoker<NDimSpatial, ALayout, BLayout, ELayout, INT8, INT8, INT8, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter3x3, 1, 128, 128, 32, 2, 2, 1, 32, 32, 32, 8, 8, 8, false, CK_TILE_PIPELINE_COMPUTE_V3>,
// GroupedConvolutionForwardInvoker<NDimSpatial, ALayout, BLayout, ELayout, INT8, INT8, INT8, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter3x3, 1, 128, 128, 32, 2, 2, 1, 32, 32, 32, 8, 8, 8, true, CK_TILE_PIPELINE_COMPUTE_V4>,
// GroupedConvolutionForwardInvoker<NDimSpatial, ALayout, BLayout, ELayout, INT8, INT8, INT8, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Default, 1, 128, 128, 32, 2, 2, 1, 32, 32, 32, 8, 8, 8, false, CK_TILE_PIPELINE_MEMORY>,
// GroupedConvolutionForwardInvoker<NDimSpatial, ALayout, BLayout, ELayout, INT8, INT8, INT8, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Default, 1, 128, 128, 32, 2, 2, 1, 32, 32, 32, 8, 8, 8, false, CK_TILE_PIPELINE_COMPUTE_V3>,
// GroupedConvolutionForwardInvoker<NDimSpatial, ALayout, BLayout, ELayout, INT8, INT8, INT8, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Default, 1, 128, 128, 32, 2, 2, 1, 32, 32, 32, 8, 8, 8, true, CK_TILE_PIPELINE_COMPUTE_V4>,
// Converted from device_grouped_conv_fwd_xdl_int8_comp_instances_part2 - Instance 1
// GroupedConvolutionForwardInvoker<NDimSpatial, ALayout, BLayout, ELayout, INT8, INT8, INT8, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter1x1Stride1Pad0, 1, 256, 256, 32, 4, 4, 1, 32, 32, 32, 8, 8, 8, false, CK_TILE_PIPELINE_MEMORY>,
// GroupedConvolutionForwardInvoker<NDimSpatial, ALayout, BLayout, ELayout, INT8, INT8, INT8, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter1x1Stride1Pad0, 1, 256, 256, 32, 4, 4, 1, 32, 32, 32, 8, 8, 8, false, CK_TILE_PIPELINE_COMPUTE_V3>,
// GroupedConvolutionForwardInvoker<NDimSpatial, ALayout, BLayout, ELayout, INT8, INT8, INT8, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter1x1Stride1Pad0, 1, 256, 256, 32, 4, 4, 1, 32, 32, 32, 8, 8, 8, true, CK_TILE_PIPELINE_COMPUTE_V4>,
// GroupedConvolutionForwardInvoker<NDimSpatial, ALayout, BLayout, ELayout, INT8, INT8, INT8, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter1x1Pad0, 1, 256, 256, 32, 4, 4, 1, 32, 32, 32, 8, 8, 8, false, CK_TILE_PIPELINE_MEMORY>,
// GroupedConvolutionForwardInvoker<NDimSpatial, ALayout, BLayout, ELayout, INT8, INT8, INT8, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter1x1Pad0, 1, 256, 256, 32, 4, 4, 1, 32, 32, 32, 8, 8, 8, false, CK_TILE_PIPELINE_COMPUTE_V3>,
// GroupedConvolutionForwardInvoker<NDimSpatial, ALayout, BLayout, ELayout, INT8, INT8, INT8, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter1x1Pad0, 1, 256, 256, 32, 4, 4, 1, 32, 32, 32, 8, 8, 8, true, CK_TILE_PIPELINE_COMPUTE_V4>,
// GroupedConvolutionForwardInvoker<NDimSpatial, ALayout, BLayout, ELayout, INT8, INT8, INT8, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter3x3, 1, 256, 256, 32, 4, 4, 1, 32, 32, 32, 8, 8, 8, false, CK_TILE_PIPELINE_MEMORY>,
// GroupedConvolutionForwardInvoker<NDimSpatial, ALayout, BLayout, ELayout, INT8, INT8, INT8, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter3x3, 1, 256, 256, 32, 4, 4, 1, 32, 32, 32, 8, 8, 8, false, CK_TILE_PIPELINE_COMPUTE_V3>,
// GroupedConvolutionForwardInvoker<NDimSpatial, ALayout, BLayout, ELayout, INT8, INT8, INT8, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter3x3, 1, 256, 256, 32, 4, 4, 1, 32, 32, 32, 8, 8, 8, true, CK_TILE_PIPELINE_COMPUTE_V4>,
// GroupedConvolutionForwardInvoker<NDimSpatial, ALayout, BLayout, ELayout, INT8, INT8, INT8, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Default, 1, 256, 256, 32, 4, 4, 1, 32, 32, 32, 8, 8, 8, false, CK_TILE_PIPELINE_MEMORY>,
// GroupedConvolutionForwardInvoker<NDimSpatial, ALayout, BLayout, ELayout, INT8, INT8, INT8, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Default, 1, 256, 256, 32, 4, 4, 1, 32, 32, 32, 8, 8, 8, false, CK_TILE_PIPELINE_COMPUTE_V3>,
// GroupedConvolutionForwardInvoker<NDimSpatial, ALayout, BLayout, ELayout, INT8, INT8, INT8, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Default, 1, 256, 256, 32, 4, 4, 1, 32, 32, 32, 8, 8, 8, true, CK_TILE_PIPELINE_COMPUTE_V4>,
// Converted from device_grouped_conv_fwd_xdl_int8_comp_instances_part2 - Instance 2
// GroupedConvolutionForwardInvoker<NDimSpatial, ALayout, BLayout, ELayout, INT8, INT8, INT8, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter1x1Stride1Pad0, 1, 256, 256, 32, 4, 4, 1, 32, 32, 32, 8, 8, 8, false, CK_TILE_PIPELINE_MEMORY>,
// GroupedConvolutionForwardInvoker<NDimSpatial, ALayout, BLayout, ELayout, INT8, INT8, INT8, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter1x1Stride1Pad0, 1, 256, 256, 32, 4, 4, 1, 32, 32, 32, 8, 8, 8, false, CK_TILE_PIPELINE_COMPUTE_V3>,
// GroupedConvolutionForwardInvoker<NDimSpatial, ALayout, BLayout, ELayout, INT8, INT8, INT8, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter1x1Stride1Pad0, 1, 256, 256, 32, 4, 4, 1, 32, 32, 32, 8, 8, 8, true, CK_TILE_PIPELINE_COMPUTE_V4>,
// GroupedConvolutionForwardInvoker<NDimSpatial, ALayout, BLayout, ELayout, INT8, INT8, INT8, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter1x1Pad0, 1, 256, 256, 32, 4, 4, 1, 32, 32, 32, 8, 8, 8, false, CK_TILE_PIPELINE_MEMORY>,
// GroupedConvolutionForwardInvoker<NDimSpatial, ALayout, BLayout, ELayout, INT8, INT8, INT8, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter1x1Pad0, 1, 256, 256, 32, 4, 4, 1, 32, 32, 32, 8, 8, 8, false, CK_TILE_PIPELINE_COMPUTE_V3>,
// GroupedConvolutionForwardInvoker<NDimSpatial, ALayout, BLayout, ELayout, INT8, INT8, INT8, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter1x1Pad0, 1, 256, 256, 32, 4, 4, 1, 32, 32, 32, 8, 8, 8, true, CK_TILE_PIPELINE_COMPUTE_V4>,
// GroupedConvolutionForwardInvoker<NDimSpatial, ALayout, BLayout, ELayout, INT8, INT8, INT8, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter3x3, 1, 256, 256, 32, 4, 4, 1, 32, 32, 32, 8, 8, 8, false, CK_TILE_PIPELINE_MEMORY>,
// GroupedConvolutionForwardInvoker<NDimSpatial, ALayout, BLayout, ELayout, INT8, INT8, INT8, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter3x3, 1, 256, 256, 32, 4, 4, 1, 32, 32, 32, 8, 8, 8, false, CK_TILE_PIPELINE_COMPUTE_V3>,
// GroupedConvolutionForwardInvoker<NDimSpatial, ALayout, BLayout, ELayout, INT8, INT8, INT8, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter3x3, 1, 256, 256, 32, 4, 4, 1, 32, 32, 32, 8, 8, 8, true, CK_TILE_PIPELINE_COMPUTE_V4>,
// GroupedConvolutionForwardInvoker<NDimSpatial, ALayout, BLayout, ELayout, INT8, INT8, INT8, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Default, 1, 256, 256, 32, 4, 4, 1, 32, 32, 32, 8, 8, 8, false, CK_TILE_PIPELINE_MEMORY>,
// GroupedConvolutionForwardInvoker<NDimSpatial, ALayout, BLayout, ELayout, INT8, INT8, INT8, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Default, 1, 256, 256, 32, 4, 4, 1, 32, 32, 32, 8, 8, 8, false, CK_TILE_PIPELINE_COMPUTE_V3>,
// GroupedConvolutionForwardInvoker<NDimSpatial, ALayout, BLayout, ELayout, INT8, INT8, INT8, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Default, 1, 256, 256, 32, 4, 4, 1, 32, 32, 32, 8, 8, 8, true, CK_TILE_PIPELINE_COMPUTE_V4>,
// Converted from device_grouped_conv_fwd_xdl_int8_comp_instances_part2 - Instance 3
// GroupedConvolutionForwardInvoker<NDimSpatial, ALayout, BLayout, ELayout, INT8, INT8, INT8, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter1x1Stride1Pad0, 1, 256, 256, 32, 4, 4, 1, 32, 32, 32, 8, 8, 8, false, CK_TILE_PIPELINE_MEMORY>,
// GroupedConvolutionForwardInvoker<NDimSpatial, ALayout, BLayout, ELayout, INT8, INT8, INT8, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter1x1Stride1Pad0, 1, 256, 256, 32, 4, 4, 1, 32, 32, 32, 8, 8, 8, false, CK_TILE_PIPELINE_COMPUTE_V3>,
// GroupedConvolutionForwardInvoker<NDimSpatial, ALayout, BLayout, ELayout, INT8, INT8, INT8, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter1x1Stride1Pad0, 1, 256, 256, 32, 4, 4, 1, 32, 32, 32, 8, 8, 8, true, CK_TILE_PIPELINE_COMPUTE_V4>,
// GroupedConvolutionForwardInvoker<NDimSpatial, ALayout, BLayout, ELayout, INT8, INT8, INT8, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter1x1Pad0, 1, 256, 256, 32, 4, 4, 1, 32, 32, 32, 8, 8, 8, false, CK_TILE_PIPELINE_MEMORY>,
// GroupedConvolutionForwardInvoker<NDimSpatial, ALayout, BLayout, ELayout, INT8, INT8, INT8, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter1x1Pad0, 1, 256, 256, 32, 4, 4, 1, 32, 32, 32, 8, 8, 8, false, CK_TILE_PIPELINE_COMPUTE_V3>,
// GroupedConvolutionForwardInvoker<NDimSpatial, ALayout, BLayout, ELayout, INT8, INT8, INT8, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter1x1Pad0, 1, 256, 256, 32, 4, 4, 1, 32, 32, 32, 8, 8, 8, true, CK_TILE_PIPELINE_COMPUTE_V4>,
// GroupedConvolutionForwardInvoker<NDimSpatial, ALayout, BLayout, ELayout, INT8, INT8, INT8, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter3x3, 1, 256, 256, 32, 4, 4, 1, 32, 32, 32, 8, 8, 8, false, CK_TILE_PIPELINE_MEMORY>,
// GroupedConvolutionForwardInvoker<NDimSpatial, ALayout, BLayout, ELayout, INT8, INT8, INT8, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter3x3, 1, 256, 256, 32, 4, 4, 1, 32, 32, 32, 8, 8, 8, false, CK_TILE_PIPELINE_COMPUTE_V3>,
// GroupedConvolutionForwardInvoker<NDimSpatial, ALayout, BLayout, ELayout, INT8, INT8, INT8, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter3x3, 1, 256, 256, 32, 4, 4, 1, 32, 32, 32, 8, 8, 8, true, CK_TILE_PIPELINE_COMPUTE_V4>,
// GroupedConvolutionForwardInvoker<NDimSpatial, ALayout, BLayout, ELayout, INT8, INT8, INT8, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Default, 1, 256, 256, 32, 4, 4, 1, 32, 32, 32, 8, 8, 8, false, CK_TILE_PIPELINE_MEMORY>,
// GroupedConvolutionForwardInvoker<NDimSpatial, ALayout, BLayout, ELayout, INT8, INT8, INT8, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Default, 1, 256, 256, 32, 4, 4, 1, 32, 32, 32, 8, 8, 8, false, CK_TILE_PIPELINE_COMPUTE_V3>,
// GroupedConvolutionForwardInvoker<NDimSpatial, ALayout, BLayout, ELayout, INT8, INT8, INT8, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Default, 1, 256, 256, 32, 4, 4, 1, 32, 32, 32, 8, 8, 8, true, CK_TILE_PIPELINE_COMPUTE_V4>,
// Converted from device_grouped_conv_fwd_xdl_int8_comp_instances_part2 - Instance 4 (MPerXDL=16)
// GroupedConvolutionForwardInvoker<NDimSpatial, ALayout, BLayout, ELayout, INT8, INT8, INT8, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter1x1Stride1Pad0, 1, 256, 256, 32, 8, 8, 1, 16, 16, 16, 8, 8, 8, false, CK_TILE_PIPELINE_MEMORY>,
// GroupedConvolutionForwardInvoker<NDimSpatial, ALayout, BLayout, ELayout, INT8, INT8, INT8, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter1x1Stride1Pad0, 1, 256, 256, 32, 8, 8, 1, 16, 16, 16, 8, 8, 8, false, CK_TILE_PIPELINE_COMPUTE_V3>,
// GroupedConvolutionForwardInvoker<NDimSpatial, ALayout, BLayout, ELayout, INT8, INT8, INT8, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter1x1Stride1Pad0, 1, 256, 256, 32, 8, 8, 1, 16, 16, 16, 8, 8, 8, true, CK_TILE_PIPELINE_COMPUTE_V4>,
// GroupedConvolutionForwardInvoker<NDimSpatial, ALayout, BLayout, ELayout, INT8, INT8, INT8, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter1x1Pad0, 1, 256, 256, 32, 8, 8, 1, 16, 16, 16, 8, 8, 8, false, CK_TILE_PIPELINE_MEMORY>,
// GroupedConvolutionForwardInvoker<NDimSpatial, ALayout, BLayout, ELayout, INT8, INT8, INT8, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter1x1Pad0, 1, 256, 256, 32, 8, 8, 1, 16, 16, 16, 8, 8, 8, false, CK_TILE_PIPELINE_COMPUTE_V3>,
// GroupedConvolutionForwardInvoker<NDimSpatial, ALayout, BLayout, ELayout, INT8, INT8, INT8, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter1x1Pad0, 1, 256, 256, 32, 8, 8, 1, 16, 16, 16, 8, 8, 8, true, CK_TILE_PIPELINE_COMPUTE_V4>,
// GroupedConvolutionForwardInvoker<NDimSpatial, ALayout, BLayout, ELayout, INT8, INT8, INT8, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter3x3, 1, 256, 256, 32, 8, 8, 1, 16, 16, 16, 8, 8, 8, false, CK_TILE_PIPELINE_MEMORY>,
// GroupedConvolutionForwardInvoker<NDimSpatial, ALayout, BLayout, ELayout, INT8, INT8, INT8, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter3x3, 1, 256, 256, 32, 8, 8, 1, 16, 16, 16, 8, 8, 8, false, CK_TILE_PIPELINE_COMPUTE_V3>,
// GroupedConvolutionForwardInvoker<NDimSpatial, ALayout, BLayout, ELayout, INT8, INT8, INT8, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter3x3, 1, 256, 256, 32, 8, 8, 1, 16, 16, 16, 8, 8, 8, true, CK_TILE_PIPELINE_COMPUTE_V4>,
// GroupedConvolutionForwardInvoker<NDimSpatial, ALayout, BLayout, ELayout, INT8, INT8, INT8, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Default, 1, 256, 256, 32, 8, 8, 1, 16, 16, 16, 8, 8, 8, false, CK_TILE_PIPELINE_MEMORY>,
// GroupedConvolutionForwardInvoker<NDimSpatial, ALayout, BLayout, ELayout, INT8, INT8, INT8, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Default, 1, 256, 256, 32, 8, 8, 1, 16, 16, 16, 8, 8, 8, false, CK_TILE_PIPELINE_COMPUTE_V3>,
// GroupedConvolutionForwardInvoker<NDimSpatial, ALayout, BLayout, ELayout, INT8, INT8, INT8, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Default, 1, 256, 256, 32, 8, 8, 1, 16, 16, 16, 8, 8, 8, true, CK_TILE_PIPELINE_COMPUTE_V4>,
// Converted from device_grouped_conv_fwd_xdl_int8_comp_instances_part2 - Instance 5
// GroupedConvolutionForwardInvoker<NDimSpatial, ALayout, BLayout, ELayout, INT8, INT8, INT8, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter1x1Stride1Pad0, 1, 128, 128, 64, 2, 2, 1, 32, 32, 32, 8, 8, 8, false, CK_TILE_PIPELINE_MEMORY>,
// GroupedConvolutionForwardInvoker<NDimSpatial, ALayout, BLayout, ELayout, INT8, INT8, INT8, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter1x1Stride1Pad0, 1, 128, 128, 64, 2, 2, 1, 32, 32, 32, 8, 8, 8, false, CK_TILE_PIPELINE_COMPUTE_V3>,
// GroupedConvolutionForwardInvoker<NDimSpatial, ALayout, BLayout, ELayout, INT8, INT8, INT8, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter1x1Stride1Pad0, 1, 128, 128, 64, 2, 2, 1, 32, 32, 32, 8, 8, 8, true, CK_TILE_PIPELINE_COMPUTE_V4>,
// GroupedConvolutionForwardInvoker<NDimSpatial, ALayout, BLayout, ELayout, INT8, INT8, INT8, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter1x1Pad0, 1, 128, 128, 64, 2, 2, 1, 32, 32, 32, 8, 8, 8, false, CK_TILE_PIPELINE_MEMORY>,
// GroupedConvolutionForwardInvoker<NDimSpatial, ALayout, BLayout, ELayout, INT8, INT8, INT8, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter1x1Pad0, 1, 128, 128, 64, 2, 2, 1, 32, 32, 32, 8, 8, 8, false, CK_TILE_PIPELINE_COMPUTE_V3>,
// GroupedConvolutionForwardInvoker<NDimSpatial, ALayout, BLayout, ELayout, INT8, INT8, INT8, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter1x1Pad0, 1, 128, 128, 64, 2, 2, 1, 32, 32, 32, 8, 8, 8, true, CK_TILE_PIPELINE_COMPUTE_V4>,
// GroupedConvolutionForwardInvoker<NDimSpatial, ALayout, BLayout, ELayout, INT8, INT8, INT8, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter3x3, 1, 128, 128, 64, 2, 2, 1, 32, 32, 32, 8, 8, 8, false, CK_TILE_PIPELINE_MEMORY>,
// GroupedConvolutionForwardInvoker<NDimSpatial, ALayout, BLayout, ELayout, INT8, INT8, INT8, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter3x3, 1, 128, 128, 64, 2, 2, 1, 32, 32, 32, 8, 8, 8, false, CK_TILE_PIPELINE_COMPUTE_V3>,
// GroupedConvolutionForwardInvoker<NDimSpatial, ALayout, BLayout, ELayout, INT8, INT8, INT8, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter3x3, 1, 128, 128, 64, 2, 2, 1, 32, 32, 32, 8, 8, 8, true, CK_TILE_PIPELINE_COMPUTE_V4>,
// GroupedConvolutionForwardInvoker<NDimSpatial, ALayout, BLayout, ELayout, INT8, INT8, INT8, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Default, 1, 128, 128, 64, 2, 2, 1, 32, 32, 32, 8, 8, 8, false, CK_TILE_PIPELINE_MEMORY>,
// GroupedConvolutionForwardInvoker<NDimSpatial, ALayout, BLayout, ELayout, INT8, INT8, INT8, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Default, 1, 128, 128, 64, 2, 2, 1, 32, 32, 32, 8, 8, 8, false, CK_TILE_PIPELINE_COMPUTE_V3>,
// GroupedConvolutionForwardInvoker<NDimSpatial, ALayout, BLayout, ELayout, INT8, INT8, INT8, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Default, 1, 128, 128, 64, 2, 2, 1, 32, 32, 32, 8, 8, 8, true, CK_TILE_PIPELINE_COMPUTE_V4>,
// Converted from device_grouped_conv_fwd_xdl_int8_comp_instances_part2 - Instance 6
// GroupedConvolutionForwardInvoker<NDimSpatial, ALayout, BLayout, ELayout, INT8, INT8, INT8, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter1x1Stride1Pad0, 1, 128, 256, 32, 2, 4, 1, 32, 32, 32, 8, 8, 8, false, CK_TILE_PIPELINE_MEMORY>,
// GroupedConvolutionForwardInvoker<NDimSpatial, ALayout, BLayout, ELayout, INT8, INT8, INT8, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter1x1Stride1Pad0, 1, 128, 256, 32, 2, 4, 1, 32, 32, 32, 8, 8, 8, false, CK_TILE_PIPELINE_COMPUTE_V3>,
// GroupedConvolutionForwardInvoker<NDimSpatial, ALayout, BLayout, ELayout, INT8, INT8, INT8, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter1x1Stride1Pad0, 1, 128, 256, 32, 2, 4, 1, 32, 32, 32, 8, 8, 8, true, CK_TILE_PIPELINE_COMPUTE_V4>,
// GroupedConvolutionForwardInvoker<NDimSpatial, ALayout, BLayout, ELayout, INT8, INT8, INT8, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter1x1Pad0, 1, 128, 256, 32, 2, 4, 1, 32, 32, 32, 8, 8, 8, false, CK_TILE_PIPELINE_MEMORY>,
// GroupedConvolutionForwardInvoker<NDimSpatial, ALayout, BLayout, ELayout, INT8, INT8, INT8, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter1x1Pad0, 1, 128, 256, 32, 2, 4, 1, 32, 32, 32, 8, 8, 8, false, CK_TILE_PIPELINE_COMPUTE_V3>,
// GroupedConvolutionForwardInvoker<NDimSpatial, ALayout, BLayout, ELayout, INT8, INT8, INT8, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter1x1Pad0, 1, 128, 256, 32, 2, 4, 1, 32, 32, 32, 8, 8, 8, true, CK_TILE_PIPELINE_COMPUTE_V4>,
// GroupedConvolutionForwardInvoker<NDimSpatial, ALayout, BLayout, ELayout, INT8, INT8, INT8, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter3x3, 1, 128, 256, 32, 2, 4, 1, 32, 32, 32, 8, 8, 8, false, CK_TILE_PIPELINE_MEMORY>,
// GroupedConvolutionForwardInvoker<NDimSpatial, ALayout, BLayout, ELayout, INT8, INT8, INT8, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter3x3, 1, 128, 256, 32, 2, 4, 1, 32, 32, 32, 8, 8, 8, false, CK_TILE_PIPELINE_COMPUTE_V3>,
// GroupedConvolutionForwardInvoker<NDimSpatial, ALayout, BLayout, ELayout, INT8, INT8, INT8, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter3x3, 1, 128, 256, 32, 2, 4, 1, 32, 32, 32, 8, 8, 8, true, CK_TILE_PIPELINE_COMPUTE_V4>,
// GroupedConvolutionForwardInvoker<NDimSpatial, ALayout, BLayout, ELayout, INT8, INT8, INT8, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Default, 1, 128, 256, 32, 2, 4, 1, 32, 32, 32, 8, 8, 8, false, CK_TILE_PIPELINE_MEMORY>,
// GroupedConvolutionForwardInvoker<NDimSpatial, ALayout, BLayout, ELayout, INT8, INT8, INT8, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Default, 1, 128, 256, 32, 2, 4, 1, 32, 32, 32, 8, 8, 8, false, CK_TILE_PIPELINE_COMPUTE_V3>,
// GroupedConvolutionForwardInvoker<NDimSpatial, ALayout, BLayout, ELayout, INT8, INT8, INT8, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Default, 1, 128, 256, 32, 2, 4, 1, 32, 32, 32, 8, 8, 8, true, CK_TILE_PIPELINE_COMPUTE_V4>,
// Converted from device_grouped_conv_fwd_xdl_int8_comp_instances_part2 - Instance 7
// GroupedConvolutionForwardInvoker<NDimSpatial, ALayout, BLayout, ELayout, INT8, INT8, INT8, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter1x1Stride1Pad0, 1, 256, 128, 32, 4, 2, 1, 32, 32, 32, 8, 8, 8, false, CK_TILE_PIPELINE_MEMORY>,
// GroupedConvolutionForwardInvoker<NDimSpatial, ALayout, BLayout, ELayout, INT8, INT8, INT8, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter1x1Stride1Pad0, 1, 256, 128, 32, 4, 2, 1, 32, 32, 32, 8, 8, 8, false, CK_TILE_PIPELINE_COMPUTE_V3>,
// GroupedConvolutionForwardInvoker<NDimSpatial, ALayout, BLayout, ELayout, INT8, INT8, INT8, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter1x1Stride1Pad0, 1, 256, 128, 32, 4, 2, 1, 32, 32, 32, 8, 8, 8, true, CK_TILE_PIPELINE_COMPUTE_V4>,
// GroupedConvolutionForwardInvoker<NDimSpatial, ALayout, BLayout, ELayout, INT8, INT8, INT8, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter1x1Pad0, 1, 256, 128, 32, 4, 2, 1, 32, 32, 32, 8, 8, 8, false, CK_TILE_PIPELINE_MEMORY>,
// GroupedConvolutionForwardInvoker<NDimSpatial, ALayout, BLayout, ELayout, INT8, INT8, INT8, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter1x1Pad0, 1, 256, 128, 32, 4, 2, 1, 32, 32, 32, 8, 8, 8, false, CK_TILE_PIPELINE_COMPUTE_V3>,
// GroupedConvolutionForwardInvoker<NDimSpatial, ALayout, BLayout, ELayout, INT8, INT8, INT8, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter1x1Pad0, 1, 256, 128, 32, 4, 2, 1, 32, 32, 32, 8, 8, 8, true, CK_TILE_PIPELINE_COMPUTE_V4>,
// GroupedConvolutionForwardInvoker<NDimSpatial, ALayout, BLayout, ELayout, INT8, INT8, INT8, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter3x3, 1, 256, 128, 32, 4, 2, 1, 32, 32, 32, 8, 8, 8, false, CK_TILE_PIPELINE_MEMORY>,
// GroupedConvolutionForwardInvoker<NDimSpatial, ALayout, BLayout, ELayout, INT8, INT8, INT8, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter3x3, 1, 256, 128, 32, 4, 2, 1, 32, 32, 32, 8, 8, 8, false, CK_TILE_PIPELINE_COMPUTE_V3>,
// GroupedConvolutionForwardInvoker<NDimSpatial, ALayout, BLayout, ELayout, INT8, INT8, INT8, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter3x3, 1, 256, 128, 32, 4, 2, 1, 32, 32, 32, 8, 8, 8, true, CK_TILE_PIPELINE_COMPUTE_V4>,
// GroupedConvolutionForwardInvoker<NDimSpatial, ALayout, BLayout, ELayout, INT8, INT8, INT8, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Default, 1, 256, 128, 32, 4, 2, 1, 32, 32, 32, 8, 8, 8, false, CK_TILE_PIPELINE_MEMORY>,
// GroupedConvolutionForwardInvoker<NDimSpatial, ALayout, BLayout, ELayout, INT8, INT8, INT8, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Default, 1, 256, 128, 32, 4, 2, 1, 32, 32, 32, 8, 8, 8, false, CK_TILE_PIPELINE_COMPUTE_V3>,
// GroupedConvolutionForwardInvoker<NDimSpatial, ALayout, BLayout, ELayout, INT8, INT8, INT8, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Default, 1, 256, 128, 32, 4, 2, 1, 32, 32, 32, 8, 8, 8, true, CK_TILE_PIPELINE_COMPUTE_V4>,
// Converted from device_grouped_conv_fwd_xdl_int8_comp_instances_part2 - Instance 8
// GroupedConvolutionForwardInvoker<NDimSpatial, ALayout, BLayout, ELayout, INT8, INT8, INT8, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter1x1Stride1Pad0, 1, 128, 128, 64, 2, 2, 1, 32, 32, 32, 8, 8, 8, false, CK_TILE_PIPELINE_MEMORY>,
// GroupedConvolutionForwardInvoker<NDimSpatial, ALayout, BLayout, ELayout, INT8, INT8, INT8, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter1x1Stride1Pad0, 1, 128, 128, 64, 2, 2, 1, 32, 32, 32, 8, 8, 8, false, CK_TILE_PIPELINE_COMPUTE_V3>,
// GroupedConvolutionForwardInvoker<NDimSpatial, ALayout, BLayout, ELayout, INT8, INT8, INT8, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter1x1Stride1Pad0, 1, 128, 128, 64, 2, 2, 1, 32, 32, 32, 8, 8, 8, true, CK_TILE_PIPELINE_COMPUTE_V4>,
// GroupedConvolutionForwardInvoker<NDimSpatial, ALayout, BLayout, ELayout, INT8, INT8, INT8, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter1x1Pad0, 1, 128, 128, 64, 2, 2, 1, 32, 32, 32, 8, 8, 8, false, CK_TILE_PIPELINE_MEMORY>,
// GroupedConvolutionForwardInvoker<NDimSpatial, ALayout, BLayout, ELayout, INT8, INT8, INT8, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter1x1Pad0, 1, 128, 128, 64, 2, 2, 1, 32, 32, 32, 8, 8, 8, false, CK_TILE_PIPELINE_COMPUTE_V3>,
// GroupedConvolutionForwardInvoker<NDimSpatial, ALayout, BLayout, ELayout, INT8, INT8, INT8, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter1x1Pad0, 1, 128, 128, 64, 2, 2, 1, 32, 32, 32, 8, 8, 8, true, CK_TILE_PIPELINE_COMPUTE_V4>,
// GroupedConvolutionForwardInvoker<NDimSpatial, ALayout, BLayout, ELayout, INT8, INT8, INT8, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter3x3, 1, 128, 128, 64, 2, 2, 1, 32, 32, 32, 8, 8, 8, false, CK_TILE_PIPELINE_MEMORY>,
// GroupedConvolutionForwardInvoker<NDimSpatial, ALayout, BLayout, ELayout, INT8, INT8, INT8, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter3x3, 1, 128, 128, 64, 2, 2, 1, 32, 32, 32, 8, 8, 8, false, CK_TILE_PIPELINE_COMPUTE_V3>,
// GroupedConvolutionForwardInvoker<NDimSpatial, ALayout, BLayout, ELayout, INT8, INT8, INT8, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter3x3, 1, 128, 128, 64, 2, 2, 1, 32, 32, 32, 8, 8, 8, true, CK_TILE_PIPELINE_COMPUTE_V4>,
// GroupedConvolutionForwardInvoker<NDimSpatial, ALayout, BLayout, ELayout, INT8, INT8, INT8, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Default, 1, 128, 128, 64, 2, 2, 1, 32, 32, 32, 8, 8, 8, false, CK_TILE_PIPELINE_MEMORY>,
// GroupedConvolutionForwardInvoker<NDimSpatial, ALayout, BLayout, ELayout, INT8, INT8, INT8, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Default, 1, 128, 128, 64, 2, 2, 1, 32, 32, 32, 8, 8, 8, false, CK_TILE_PIPELINE_COMPUTE_V3>,
// GroupedConvolutionForwardInvoker<NDimSpatial, ALayout, BLayout, ELayout, INT8, INT8, INT8, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Default, 1, 128, 128, 64, 2, 2, 1, 32, 32, 32, 8, 8, 8, true, CK_TILE_PIPELINE_COMPUTE_V4>
// clang-format on
>;

View File

@@ -1,7 +1,7 @@
FP16
data_type layout verify init_method print time_kernel spat_dim G N K C Y X Hi Wi Sy Sx Dy Dx Left_pad_y Left_pad_x Right_pad_y Right_pad_x
1 1 0 1 0 1 2 1 1 32 32 3 3 800 1280 1 1 1 1 1 1 1 1
op data_type layout verify init_method print time_kernel spat_dim G N K C Y X Hi Wi Sy Sx Dy Dx Left_pad_y Left_pad_x Right_pad_y Right_pad_x
grouped_conv_fwd 1 1 0 1 0 1 2 1 1 32 32 3 3 800 1280 1 1 1 1 1 1 1 1
INT8
data_type layout verify init_method print time_kernel spat_dim G N K C Y X Hi Wi Sy Sx Dy Dx Left_pad_y Left_pad_x Right_pad_y Right_pad_x
3 1 0 1 0 1 2 1 1 32 32 3 3 800 1280 1 1 1 1 1 1 1 1
op data_type layout verify init_method print time_kernel spat_dim G N K C Y X Hi Wi Sy Sx Dy Dx Left_pad_y Left_pad_x Right_pad_y Right_pad_x
grouped_conv_fwd 3 1 0 1 0 1 2 1 1 32 32 3 3 800 1280 1 1 1 1 1 1 1 1