Add more fp16 instances. Align ckProfiler and ckTileProfiler.

This commit is contained in:
Ville Pietilä
2025-12-11 10:11:49 -05:00
parent 1ccc76e92d
commit 47c2e95b20
6 changed files with 88 additions and 50 deletions

View File

@@ -85,6 +85,8 @@ int tile_profile_grouped_conv_fwd(int argc, char* argv[])
return 1;
}
std::cout << argv[0] << " " << argv[1] << " " << argv[2] << " " << argv[3] << " " << argv[4] << std::endl;
const auto data_type = static_cast<ConvDataType>(std::stoi(argv[2]));
const auto layout = static_cast<ConvLayout>(std::stoi(argv[3]));
const bool do_verification = std::stoi(argv[4]);
@@ -93,8 +95,8 @@ int tile_profile_grouped_conv_fwd(int argc, char* argv[])
const bool time_kernel = std::stoi(argv[7]);
const int num_dim_spatial = std::stoi(argv[8]);
// 7 for control, 1 for num_dim_spatial, 4 for G/N/K/C, and 6 * num_dim_spatial
const int expected_num_args = 7 + 1 + 4 + 6 * num_dim_spatial + 1;
// program name, 7 for control, 1 for num_dim_spatial, 4 for G/N/K/C, and 6 * num_dim_spatial
const int expected_num_args = 1 + 7 + 1 + 4 + 6 * num_dim_spatial;
if(argc != expected_num_args)
{
std::cout << "Received " << argc << " args"<< std::endl;

View File

@@ -32,9 +32,41 @@ using tile_grouped_conv_fwd_fp16_instances = std::tuple<
//##############################| Num| InLayout| WeiLayout| OutLayout| InData| WeiData| OutData| In| Wei| Out| Conv| K-block| M-tile| N-tile | K-tile | M-warp| N-warp| K-warp| M-warp| N-warp| K-warp| Vector| Vector| Vector| Double| GEMM|
//##############################| Dim| | | | Type| Type| Type| Elementwise| Elementwise| Elementwise| Spec| per| | | | | | | tile| tile| tile| size| size| size| smem| pipeline|
//##############################| Spatial| | | | | | | Operation| Operation| Operation| | CU| | | | | | | size| size| size| A| B| C| buffer| version|
// These instances do not compile on Navi4x cards
#if defined(__gfx9__)
GroupedConvolutionForwardInvoker<NDimSpatial, ALayout, BLayout, ELayout, F16, F16, F16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter1x1Stride1Pad0, 1, 128, 128, 128, 2, 2, 1, 32, 32, 16, 16, 16, 16, true, CK_TILE_PIPELINE_COMPUTE_V4>,
GroupedConvolutionForwardInvoker<NDimSpatial, ALayout, BLayout, ELayout, F16, F16, F16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter1x1Pad0, 1, 128, 128, 128, 2, 2, 1, 32, 32, 16, 16, 16, 16, true, CK_TILE_PIPELINE_COMPUTE_V4>,
GroupedConvolutionForwardInvoker<NDimSpatial, ALayout, BLayout, ELayout, F16, F16, F16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter3x3, 1, 128, 128, 128, 2, 2, 1, 32, 32, 16, 16, 16, 16, true, CK_TILE_PIPELINE_COMPUTE_V4>,
GroupedConvolutionForwardInvoker<NDimSpatial, ALayout, BLayout, ELayout, F16, F16, F16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Default, 1, 128, 128, 128, 2, 2, 1, 32, 32, 16, 16, 16, 16, true, CK_TILE_PIPELINE_COMPUTE_V4>,
#endif
GroupedConvolutionForwardInvoker<NDimSpatial, ALayout, BLayout, ELayout, F16, F16, F16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter1x1Stride1Pad0, 1, 64, 64, 32, 1, 1, 1, 32, 32, 16, 1, 1, 1, false, CK_TILE_PIPELINE_MEMORY>,
GroupedConvolutionForwardInvoker<NDimSpatial, ALayout, BLayout, ELayout, F16, F16, F16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Default, 1, 64, 64, 32, 1, 1, 1, 32, 32, 16, 1, 1, 1, false, CK_TILE_PIPELINE_MEMORY>
// clang-format on
GroupedConvolutionForwardInvoker<NDimSpatial, ALayout, BLayout, ELayout, F16, F16, F16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Default, 1, 64, 64, 32, 1, 1, 1, 32, 32, 16, 1, 1, 1, false, CK_TILE_PIPELINE_MEMORY>,
GroupedConvolutionForwardInvoker<NDimSpatial, ALayout, BLayout, ELayout, F16, F16, F16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter1x1Stride1Pad0, 1, 64, 64, 32, 1, 1, 1, 32, 32, 16, 1, 1, 1, false, CK_TILE_PIPELINE_MEMORY>,
GroupedConvolutionForwardInvoker<NDimSpatial, ALayout, BLayout, ELayout, F16, F16, F16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Default, 1, 64, 64, 32, 1, 1, 1, 32, 32, 16, 1, 1, 1, false, CK_TILE_PIPELINE_MEMORY>,
GroupedConvolutionForwardInvoker<NDimSpatial, ALayout, BLayout, ELayout, F16, F16, F16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter1x1Stride1Pad0, 1, 128, 128, 128, 2, 2, 1, 32, 32, 16, 16, 16, 16, false, CK_TILE_PIPELINE_MEMORY>,
GroupedConvolutionForwardInvoker<NDimSpatial, ALayout, BLayout, ELayout, F16, F16, F16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter1x1Stride1Pad0, 1, 128, 128, 128, 2, 2, 1, 32, 32, 16, 16, 16, 16, false, CK_TILE_PIPELINE_COMPUTE_V3>,
GroupedConvolutionForwardInvoker<NDimSpatial, ALayout, BLayout, ELayout, F16, F16, F16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter1x1Pad0, 1, 128, 128, 128, 2, 2, 1, 32, 32, 16, 16, 16, 16, false, CK_TILE_PIPELINE_MEMORY>,
GroupedConvolutionForwardInvoker<NDimSpatial, ALayout, BLayout, ELayout, F16, F16, F16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter1x1Pad0, 1, 128, 128, 128, 2, 2, 1, 32, 32, 16, 16, 16, 16, false, CK_TILE_PIPELINE_COMPUTE_V3>,
GroupedConvolutionForwardInvoker<NDimSpatial, ALayout, BLayout, ELayout, F16, F16, F16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter3x3, 1, 128, 128, 128, 2, 2, 1, 32, 32, 16, 16, 16, 16, false, CK_TILE_PIPELINE_MEMORY>,
GroupedConvolutionForwardInvoker<NDimSpatial, ALayout, BLayout, ELayout, F16, F16, F16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter3x3, 1, 128, 128, 128, 2, 2, 1, 32, 32, 16, 16, 16, 16, false, CK_TILE_PIPELINE_COMPUTE_V3>,
GroupedConvolutionForwardInvoker<NDimSpatial, ALayout, BLayout, ELayout, F16, F16, F16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Default, 1, 128, 128, 128, 2, 2, 1, 32, 32, 16, 16, 16, 16, false, CK_TILE_PIPELINE_MEMORY>,
GroupedConvolutionForwardInvoker<NDimSpatial, ALayout, BLayout, ELayout, F16, F16, F16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Default, 1, 128, 128, 128, 2, 2, 1, 32, 32, 16, 16, 16, 16, false, CK_TILE_PIPELINE_COMPUTE_V3>,
GroupedConvolutionForwardInvoker<NDimSpatial, ALayout, BLayout, ELayout, F16, F16, F16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter1x1Stride1Pad0, 1, 128, 128, 32, 2, 2, 1, 32, 32, 16, 8, 8, 8, false, CK_TILE_PIPELINE_MEMORY>,
GroupedConvolutionForwardInvoker<NDimSpatial, ALayout, BLayout, ELayout, F16, F16, F16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter1x1Stride1Pad0, 1, 128, 128, 32, 2, 2, 1, 32, 32, 16, 8, 8, 8, false, CK_TILE_PIPELINE_COMPUTE_V3>,
GroupedConvolutionForwardInvoker<NDimSpatial, ALayout, BLayout, ELayout, F16, F16, F16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter1x1Stride1Pad0, 1, 128, 128, 32, 2, 2, 1, 32, 32, 16, 8, 8, 8, true, CK_TILE_PIPELINE_COMPUTE_V4>,
GroupedConvolutionForwardInvoker<NDimSpatial, ALayout, BLayout, ELayout, F16, F16, F16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter1x1Pad0, 1, 128, 128, 32, 2, 2, 1, 32, 32, 16, 8, 8, 8, false, CK_TILE_PIPELINE_MEMORY>,
GroupedConvolutionForwardInvoker<NDimSpatial, ALayout, BLayout, ELayout, F16, F16, F16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter1x1Pad0, 1, 128, 128, 32, 2, 2, 1, 32, 32, 16, 8, 8, 8, false, CK_TILE_PIPELINE_COMPUTE_V3>,
GroupedConvolutionForwardInvoker<NDimSpatial, ALayout, BLayout, ELayout, F16, F16, F16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter1x1Pad0, 1, 128, 128, 32, 2, 2, 1, 32, 32, 16, 8, 8, 8, true, CK_TILE_PIPELINE_COMPUTE_V4>,
GroupedConvolutionForwardInvoker<NDimSpatial, ALayout, BLayout, ELayout, F16, F16, F16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter3x3, 1, 128, 128, 32, 2, 2, 1, 32, 32, 16, 8, 8, 8, false, CK_TILE_PIPELINE_MEMORY>,
GroupedConvolutionForwardInvoker<NDimSpatial, ALayout, BLayout, ELayout, F16, F16, F16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter3x3, 1, 128, 128, 32, 2, 2, 1, 32, 32, 16, 8, 8, 8, false, CK_TILE_PIPELINE_COMPUTE_V3>,
GroupedConvolutionForwardInvoker<NDimSpatial, ALayout, BLayout, ELayout, F16, F16, F16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter3x3, 1, 128, 128, 32, 2, 2, 1, 32, 32, 16, 8, 8, 8, true, CK_TILE_PIPELINE_COMPUTE_V4>,
GroupedConvolutionForwardInvoker<NDimSpatial, ALayout, BLayout, ELayout, F16, F16, F16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Default, 1, 128, 128, 32, 2, 2, 1, 32, 32, 16, 8, 8, 8, false, CK_TILE_PIPELINE_MEMORY>,
GroupedConvolutionForwardInvoker<NDimSpatial, ALayout, BLayout, ELayout, F16, F16, F16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Default, 1, 128, 128, 32, 2, 2, 1, 32, 32, 16, 8, 8, 8, false, CK_TILE_PIPELINE_COMPUTE_V3>,
GroupedConvolutionForwardInvoker<NDimSpatial, ALayout, BLayout, ELayout, F16, F16, F16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Default, 1, 128, 128, 32, 2, 2, 1, 32, 32, 16, 8, 8, 8, true, CK_TILE_PIPELINE_COMPUTE_V4>,
GroupedConvolutionForwardInvoker<NDimSpatial, ALayout, BLayout, ELayout, F16, F16, F16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter1x1Stride1Pad0, 1, 256, 256, 32, 4, 4, 1, 32, 32, 16, 8, 8, 8, false, CK_TILE_PIPELINE_MEMORY>
// clang-format on
>;

View File

@@ -34,38 +34,39 @@ using tile_grouped_conv_fwd_int8_instances = std::tuple<
//##############################| Num| InLayout| WeiLayout| OutLayout| InData| WeiData| OutData| In| Wei| Out| Conv| K-block| M-tile| N-tile | K-tile | M-warp| N-warp| K-warp| M-warp| N-warp| K-warp| Vector| Vector| Vector| Double| GEMM|
//##############################| Dim| | | | Type| Type| Type| Elementwise| Elementwise| Elementwise| Spec| per| | | | | | | tile| tile| tile| size| size| size| smem| pipeline|
//##############################| Spatial| | | | | | | Operation| Operation| Operation| | CU| | | | | | | size| size| size| A| B| C| buffer| version|
GroupedConvolutionForwardInvoker<NDimSpatial, ALayout, BLayout, ELayout, INT8, INT8, INT8, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter1x1Stride1Pad0, 1, 64, 64, 32, 1, 1, 1, 32, 32, 16, 1, 1, 1, false, CK_TILE_PIPELINE_MEMORY>,
GroupedConvolutionForwardInvoker<NDimSpatial, ALayout, BLayout, ELayout, INT8, INT8, INT8, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Default, 1, 64, 64, 32, 1, 1, 1, 32, 32, 16, 1, 1, 1, false, CK_TILE_PIPELINE_MEMORY>
// Converted from device_grouped_conv_fwd_xdl_int8_comp_instances_2x
// GroupedConvolutionForwardInvoker<NDimSpatial, ALayout, BLayout, ELayout, INT8, INT8, INT8, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter1x1Stride1Pad0, 1, 128, 128, 128, 2, 2, 1, 32, 32, 32, 16, 16, 16, false, CK_TILE_PIPELINE_MEMORY>,
// GroupedConvolutionForwardInvoker<NDimSpatial, ALayout, BLayout, ELayout, INT8, INT8, INT8, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter1x1Stride1Pad0, 1, 128, 128, 128, 2, 2, 1, 32, 32, 32, 16, 16, 16, false, CK_TILE_PIPELINE_COMPUTE_V3>,
// GroupedConvolutionForwardInvoker<NDimSpatial, ALayout, BLayout, ELayout, INT8, INT8, INT8, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter1x1Stride1Pad0, 1, 128, 128, 128, 2, 2, 1, 32, 32, 32, 16, 16, 16, true, CK_TILE_PIPELINE_COMPUTE_V4>,
// GroupedConvolutionForwardInvoker<NDimSpatial, ALayout, BLayout, ELayout, INT8, INT8, INT8, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter1x1Pad0, 1, 128, 128, 128, 2, 2, 1, 32, 32, 32, 16, 16, 16, false, CK_TILE_PIPELINE_MEMORY>,
// GroupedConvolutionForwardInvoker<NDimSpatial, ALayout, BLayout, ELayout, INT8, INT8, INT8, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter1x1Pad0, 1, 128, 128, 128, 2, 2, 1, 32, 32, 32, 16, 16, 16, false, CK_TILE_PIPELINE_COMPUTE_V3>,
// GroupedConvolutionForwardInvoker<NDimSpatial, ALayout, BLayout, ELayout, INT8, INT8, INT8, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter1x1Pad0, 1, 128, 128, 128, 2, 2, 1, 32, 32, 32, 16, 16, 16, true, CK_TILE_PIPELINE_COMPUTE_V4>,
// GroupedConvolutionForwardInvoker<NDimSpatial, ALayout, BLayout, ELayout, INT8, INT8, INT8, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter3x3, 1, 128, 128, 128, 2, 2, 1, 32, 32, 32, 16, 16, 16, false, CK_TILE_PIPELINE_MEMORY>,
// GroupedConvolutionForwardInvoker<NDimSpatial, ALayout, BLayout, ELayout, INT8, INT8, INT8, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter3x3, 1, 128, 128, 128, 2, 2, 1, 32, 32, 32, 16, 16, 16, false, CK_TILE_PIPELINE_COMPUTE_V3>,
// GroupedConvolutionForwardInvoker<NDimSpatial, ALayout, BLayout, ELayout, INT8, INT8, INT8, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter3x3, 1, 128, 128, 128, 2, 2, 1, 32, 32, 32, 16, 16, 16, true, CK_TILE_PIPELINE_COMPUTE_V4>,
// GroupedConvolutionForwardInvoker<NDimSpatial, ALayout, BLayout, ELayout, INT8, INT8, INT8, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Default, 1, 128, 128, 128, 2, 2, 1, 32, 32, 32, 16, 16, 16, false, CK_TILE_PIPELINE_MEMORY>,
// GroupedConvolutionForwardInvoker<NDimSpatial, ALayout, BLayout, ELayout, INT8, INT8, INT8, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Default, 1, 128, 128, 128, 2, 2, 1, 32, 32, 32, 16, 16, 16, false, CK_TILE_PIPELINE_COMPUTE_V3>,
// GroupedConvolutionForwardInvoker<NDimSpatial, ALayout, BLayout, ELayout, INT8, INT8, INT8, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Default, 1, 128, 128, 128, 2, 2, 1, 32, 32, 32, 16, 16, 16, true, CK_TILE_PIPELINE_COMPUTE_V4>,
// Converted from device_grouped_conv_fwd_xdl_int8_comp_instances
// GroupedConvolutionForwardInvoker<NDimSpatial, ALayout, BLayout, ELayout, INT8, INT8, INT8, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter1x1Stride1Pad0, 1, 128, 128, 32, 2, 2, 1, 32, 32, 32, 8, 8, 8, false, CK_TILE_PIPELINE_MEMORY>,
// GroupedConvolutionForwardInvoker<NDimSpatial, ALayout, BLayout, ELayout, INT8, INT8, INT8, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter1x1Stride1Pad0, 1, 128, 128, 32, 2, 2, 1, 32, 32, 32, 8, 8, 8, false, CK_TILE_PIPELINE_COMPUTE_V3>,
// GroupedConvolutionForwardInvoker<NDimSpatial, ALayout, BLayout, ELayout, INT8, INT8, INT8, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter1x1Stride1Pad0, 1, 128, 128, 32, 2, 2, 1, 32, 32, 32, 8, 8, 8, true, CK_TILE_PIPELINE_COMPUTE_V4>,
// GroupedConvolutionForwardInvoker<NDimSpatial, ALayout, BLayout, ELayout, INT8, INT8, INT8, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter1x1Pad0, 1, 128, 128, 32, 2, 2, 1, 32, 32, 32, 8, 8, 8, false, CK_TILE_PIPELINE_MEMORY>,
// GroupedConvolutionForwardInvoker<NDimSpatial, ALayout, BLayout, ELayout, INT8, INT8, INT8, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter1x1Pad0, 1, 128, 128, 32, 2, 2, 1, 32, 32, 32, 8, 8, 8, false, CK_TILE_PIPELINE_COMPUTE_V3>,
// GroupedConvolutionForwardInvoker<NDimSpatial, ALayout, BLayout, ELayout, INT8, INT8, INT8, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter1x1Pad0, 1, 128, 128, 32, 2, 2, 1, 32, 32, 32, 8, 8, 8, true, CK_TILE_PIPELINE_COMPUTE_V4>,
// GroupedConvolutionForwardInvoker<NDimSpatial, ALayout, BLayout, ELayout, INT8, INT8, INT8, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter3x3, 1, 128, 128, 32, 2, 2, 1, 32, 32, 32, 8, 8, 8, false, CK_TILE_PIPELINE_MEMORY>,
// GroupedConvolutionForwardInvoker<NDimSpatial, ALayout, BLayout, ELayout, INT8, INT8, INT8, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter3x3, 1, 128, 128, 32, 2, 2, 1, 32, 32, 32, 8, 8, 8, false, CK_TILE_PIPELINE_COMPUTE_V3>,
// GroupedConvolutionForwardInvoker<NDimSpatial, ALayout, BLayout, ELayout, INT8, INT8, INT8, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter3x3, 1, 128, 128, 32, 2, 2, 1, 32, 32, 32, 8, 8, 8, true, CK_TILE_PIPELINE_COMPUTE_V4>,
// GroupedConvolutionForwardInvoker<NDimSpatial, ALayout, BLayout, ELayout, INT8, INT8, INT8, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Default, 1, 128, 128, 32, 2, 2, 1, 32, 32, 32, 8, 8, 8, false, CK_TILE_PIPELINE_MEMORY>,
// GroupedConvolutionForwardInvoker<NDimSpatial, ALayout, BLayout, ELayout, INT8, INT8, INT8, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Default, 1, 128, 128, 32, 2, 2, 1, 32, 32, 32, 8, 8, 8, false, CK_TILE_PIPELINE_COMPUTE_V3>,
// GroupedConvolutionForwardInvoker<NDimSpatial, ALayout, BLayout, ELayout, INT8, INT8, INT8, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Default, 1, 128, 128, 32, 2, 2, 1, 32, 32, 32, 8, 8, 8, true, CK_TILE_PIPELINE_COMPUTE_V4>,
// Converted from device_grouped_conv_fwd_xdl_int8_comp_instances_part2 - Instance 1
// GroupedConvolutionForwardInvoker<NDimSpatial, ALayout, BLayout, ELayout, INT8, INT8, INT8, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter1x1Stride1Pad0, 1, 256, 256, 32, 4, 4, 1, 32, 32, 32, 8, 8, 8, false, CK_TILE_PIPELINE_MEMORY>,
// GroupedConvolutionForwardInvoker<NDimSpatial, ALayout, BLayout, ELayout, INT8, INT8, INT8, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter1x1Stride1Pad0, 1, 256, 256, 32, 4, 4, 1, 32, 32, 32, 8, 8, 8, false, CK_TILE_PIPELINE_COMPUTE_V3>,
// These instances do not compile on Navi cards for FP16 due to high memory usage
GroupedConvolutionForwardInvoker<NDimSpatial, ALayout, BLayout, ELayout, INT8, INT8, INT8, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter1x1Stride1Pad0, 1, 128, 128, 128, 2, 2, 1, 32, 32, 16, 16, 16, 16, true, CK_TILE_PIPELINE_COMPUTE_V4>,
GroupedConvolutionForwardInvoker<NDimSpatial, ALayout, BLayout, ELayout, INT8, INT8, INT8, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter1x1Pad0, 1, 128, 128, 128, 2, 2, 1, 32, 32, 16, 16, 16, 16, true, CK_TILE_PIPELINE_COMPUTE_V4>,
GroupedConvolutionForwardInvoker<NDimSpatial, ALayout, BLayout, ELayout, INT8, INT8, INT8, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter3x3, 1, 128, 128, 128, 2, 2, 1, 32, 32, 16, 16, 16, 16, true, CK_TILE_PIPELINE_COMPUTE_V4>,
GroupedConvolutionForwardInvoker<NDimSpatial, ALayout, BLayout, ELayout, INT8, INT8, INT8, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Default, 1, 128, 128, 128, 2, 2, 1, 32, 32, 16, 16, 16, 16, true, CK_TILE_PIPELINE_COMPUTE_V4>,
GroupedConvolutionForwardInvoker<NDimSpatial, ALayout, BLayout, ELayout, INT8, INT8, INT8, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter1x1Stride1Pad0, 1, 64, 64, 32, 1, 1, 1, 32, 32, 16, 1, 1, 1, false, CK_TILE_PIPELINE_MEMORY>,
GroupedConvolutionForwardInvoker<NDimSpatial, ALayout, BLayout, ELayout, INT8, INT8, INT8, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Default, 1, 64, 64, 32, 1, 1, 1, 32, 32, 16, 1, 1, 1, false, CK_TILE_PIPELINE_MEMORY>,
GroupedConvolutionForwardInvoker<NDimSpatial, ALayout, BLayout, ELayout, INT8, INT8, INT8, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter1x1Stride1Pad0, 1, 128, 128, 128, 2, 2, 1, 32, 32, 16, 16, 16, 16, false, CK_TILE_PIPELINE_MEMORY>,
GroupedConvolutionForwardInvoker<NDimSpatial, ALayout, BLayout, ELayout, INT8, INT8, INT8, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter1x1Stride1Pad0, 1, 128, 128, 128, 2, 2, 1, 32, 32, 16, 16, 16, 16, false, CK_TILE_PIPELINE_COMPUTE_V3>,
GroupedConvolutionForwardInvoker<NDimSpatial, ALayout, BLayout, ELayout, INT8, INT8, INT8, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter1x1Pad0, 1, 128, 128, 128, 2, 2, 1, 32, 32, 16, 16, 16, 16, false, CK_TILE_PIPELINE_MEMORY>,
GroupedConvolutionForwardInvoker<NDimSpatial, ALayout, BLayout, ELayout, INT8, INT8, INT8, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter1x1Pad0, 1, 128, 128, 128, 2, 2, 1, 32, 32, 16, 16, 16, 16, false, CK_TILE_PIPELINE_COMPUTE_V3>,
GroupedConvolutionForwardInvoker<NDimSpatial, ALayout, BLayout, ELayout, INT8, INT8, INT8, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter3x3, 1, 128, 128, 128, 2, 2, 1, 32, 32, 16, 16, 16, 16, false, CK_TILE_PIPELINE_MEMORY>,
GroupedConvolutionForwardInvoker<NDimSpatial, ALayout, BLayout, ELayout, INT8, INT8, INT8, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter3x3, 1, 128, 128, 128, 2, 2, 1, 32, 32, 16, 16, 16, 16, false, CK_TILE_PIPELINE_COMPUTE_V3>,
GroupedConvolutionForwardInvoker<NDimSpatial, ALayout, BLayout, ELayout, INT8, INT8, INT8, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Default, 1, 128, 128, 128, 2, 2, 1, 32, 32, 16, 16, 16, 16, false, CK_TILE_PIPELINE_MEMORY>,
GroupedConvolutionForwardInvoker<NDimSpatial, ALayout, BLayout, ELayout, INT8, INT8, INT8, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Default, 1, 128, 128, 128, 2, 2, 1, 32, 32, 16, 16, 16, 16, false, CK_TILE_PIPELINE_COMPUTE_V3>,
GroupedConvolutionForwardInvoker<NDimSpatial, ALayout, BLayout, ELayout, INT8, INT8, INT8, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter1x1Stride1Pad0, 1, 128, 128, 32, 2, 2, 1, 32, 32, 16, 8, 8, 8, false, CK_TILE_PIPELINE_MEMORY>,
GroupedConvolutionForwardInvoker<NDimSpatial, ALayout, BLayout, ELayout, INT8, INT8, INT8, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter1x1Stride1Pad0, 1, 128, 128, 32, 2, 2, 1, 32, 32, 16, 8, 8, 8, false, CK_TILE_PIPELINE_COMPUTE_V3>,
GroupedConvolutionForwardInvoker<NDimSpatial, ALayout, BLayout, ELayout, INT8, INT8, INT8, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter1x1Stride1Pad0, 1, 128, 128, 32, 2, 2, 1, 32, 32, 16, 8, 8, 8, true, CK_TILE_PIPELINE_COMPUTE_V4>,
GroupedConvolutionForwardInvoker<NDimSpatial, ALayout, BLayout, ELayout, INT8, INT8, INT8, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter1x1Pad0, 1, 128, 128, 32, 2, 2, 1, 32, 32, 16, 8, 8, 8, false, CK_TILE_PIPELINE_MEMORY>,
GroupedConvolutionForwardInvoker<NDimSpatial, ALayout, BLayout, ELayout, INT8, INT8, INT8, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter1x1Pad0, 1, 128, 128, 32, 2, 2, 1, 32, 32, 16, 8, 8, 8, false, CK_TILE_PIPELINE_COMPUTE_V3>,
GroupedConvolutionForwardInvoker<NDimSpatial, ALayout, BLayout, ELayout, INT8, INT8, INT8, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter1x1Pad0, 1, 128, 128, 32, 2, 2, 1, 32, 32, 16, 8, 8, 8, true, CK_TILE_PIPELINE_COMPUTE_V4>,
GroupedConvolutionForwardInvoker<NDimSpatial, ALayout, BLayout, ELayout, INT8, INT8, INT8, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter3x3, 1, 128, 128, 32, 2, 2, 1, 32, 32, 16, 8, 8, 8, false, CK_TILE_PIPELINE_MEMORY>,
GroupedConvolutionForwardInvoker<NDimSpatial, ALayout, BLayout, ELayout, INT8, INT8, INT8, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter3x3, 1, 128, 128, 32, 2, 2, 1, 32, 32, 16, 8, 8, 8, false, CK_TILE_PIPELINE_COMPUTE_V3>,
GroupedConvolutionForwardInvoker<NDimSpatial, ALayout, BLayout, ELayout, INT8, INT8, INT8, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter3x3, 1, 128, 128, 32, 2, 2, 1, 32, 32, 16, 8, 8, 8, true, CK_TILE_PIPELINE_COMPUTE_V4>,
GroupedConvolutionForwardInvoker<NDimSpatial, ALayout, BLayout, ELayout, INT8, INT8, INT8, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Default, 1, 128, 128, 32, 2, 2, 1, 32, 32, 16, 8, 8, 8, false, CK_TILE_PIPELINE_MEMORY>,
GroupedConvolutionForwardInvoker<NDimSpatial, ALayout, BLayout, ELayout, INT8, INT8, INT8, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Default, 1, 128, 128, 32, 2, 2, 1, 32, 32, 16, 8, 8, 8, false, CK_TILE_PIPELINE_COMPUTE_V3>,
GroupedConvolutionForwardInvoker<NDimSpatial, ALayout, BLayout, ELayout, INT8, INT8, INT8, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Default, 1, 128, 128, 32, 2, 2, 1, 32, 32, 16, 8, 8, 8, true, CK_TILE_PIPELINE_COMPUTE_V4>,
GroupedConvolutionForwardInvoker<NDimSpatial, ALayout, BLayout, ELayout, INT8, INT8, INT8, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter1x1Stride1Pad0, 1, 256, 256, 32, 4, 4, 1, 32, 32, 16, 8, 8, 8, false, CK_TILE_PIPELINE_MEMORY>
// GroupedConvolutionForwardInvoker<NDimSpatial, ALayout, BLayout, ELayout, INT8, INT8, INT8, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter1x1Stride1Pad0, 1, 256, 256, 32, 4, 4, 1, 32, 32, 16, 8, 8, 8, false, CK_TILE_PIPELINE_COMPUTE_V3>,
// GroupedConvolutionForwardInvoker<NDimSpatial, ALayout, BLayout, ELayout, INT8, INT8, INT8, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter1x1Stride1Pad0, 1, 256, 256, 32, 4, 4, 1, 32, 32, 32, 8, 8, 8, true, CK_TILE_PIPELINE_COMPUTE_V4>,
// GroupedConvolutionForwardInvoker<NDimSpatial, ALayout, BLayout, ELayout, INT8, INT8, INT8, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter1x1Pad0, 1, 256, 256, 32, 4, 4, 1, 32, 32, 32, 8, 8, 8, false, CK_TILE_PIPELINE_MEMORY>,
// GroupedConvolutionForwardInvoker<NDimSpatial, ALayout, BLayout, ELayout, INT8, INT8, INT8, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter1x1Pad0, 1, 256, 256, 32, 4, 4, 1, 32, 32, 32, 8, 8, 8, false, CK_TILE_PIPELINE_COMPUTE_V3>,

View File

@@ -97,8 +97,8 @@ if(SUPPORTED_GPU_TARGETS MATCHES "gfx9" OR SUPPORTED_GPU_TARGETS MATCHES "gfx1[1
list(APPEND PROFILER_OPS profile_grouped_conv_fwd.cpp)
# list(APPEND PROFILER_OPS profile_grouped_conv_fwd_bias_clamp.cpp)
# list(APPEND PROFILER_OPS profile_grouped_conv_fwd_clamp.cpp)
list(APPEND PROFILER_OPS profile_grouped_conv_bwd_data.cpp)
list(APPEND PROFILER_OPS profile_grouped_conv_bwd_weight.cpp)
# list(APPEND PROFILER_OPS profile_grouped_conv_bwd_data.cpp)
# list(APPEND PROFILER_OPS profile_grouped_conv_bwd_weight.cpp)
# list(APPEND PROFILER_OPS profile_gemm_multi_abd.cpp)
# if(DTYPES MATCHES "fp16" OR NOT DEFINED DTYPES)
# list(APPEND PROFILER_OPS profile_gemm_add_multiply.cpp)
@@ -114,7 +114,7 @@ endif()
if(DL_KERNELS)
#list(APPEND PROFILER_OPS profile_batched_gemm_multi_d.cpp)
list(APPEND PROFILER_OPS profile_grouped_conv_bwd_weight.cpp)
#list(APPEND PROFILER_OPS profile_grouped_conv_bwd_weight.cpp)
endif()
# if(CK_ENABLE_INT8)
@@ -208,9 +208,9 @@ if(SUPPORTED_GPU_TARGETS MATCHES "gfx9" OR SUPPORTED_GPU_TARGETS MATCHES "gfx1[1
# list(APPEND DEVICE_INSTANCES device_conv1d_bwd_data_instance)
# list(APPEND DEVICE_INSTANCES device_conv3d_bwd_data_instance)
# list(APPEND DEVICE_INSTANCES device_conv2d_bwd_data_instance)
list(APPEND DEVICE_INSTANCES device_grouped_conv1d_bwd_weight_instance)
list(APPEND DEVICE_INSTANCES device_grouped_conv2d_bwd_weight_instance)
list(APPEND DEVICE_INSTANCES device_grouped_convnd_bwd_weight_instance)
# list(APPEND DEVICE_INSTANCES device_grouped_conv1d_bwd_weight_instance)
# list(APPEND DEVICE_INSTANCES device_grouped_conv2d_bwd_weight_instance)
# list(APPEND DEVICE_INSTANCES device_grouped_convnd_bwd_weight_instance)
# list(APPEND DEVICE_INSTANCES device_grouped_conv3d_fwd_convscale_instance)
# list(APPEND DEVICE_INSTANCES device_grouped_conv3d_fwd_convinvscale_instance)
# list(APPEND DEVICE_INSTANCES device_grouped_conv2d_fwd_clamp_instance)
@@ -233,10 +233,10 @@ if(SUPPORTED_GPU_TARGETS MATCHES "gfx9" OR SUPPORTED_GPU_TARGETS MATCHES "gfx1[1
# list(APPEND DEVICE_INSTANCES device_gemm_b_scale_instance)
# list(APPEND DEVICE_INSTANCES device_gemm_universal_reduce_instance)
list(APPEND DEVICE_INSTANCES device_grouped_conv3d_fwd_instance)
list(APPEND DEVICE_INSTANCES device_grouped_conv2d_bwd_data_instance)
list(APPEND DEVICE_INSTANCES device_grouped_conv3d_bwd_data_instance)
# list(APPEND DEVICE_INSTANCES device_grouped_conv2d_bwd_data_instance)
# list(APPEND DEVICE_INSTANCES device_grouped_conv3d_bwd_data_instance)
list(APPEND DEVICE_INSTANCES device_grouped_conv2d_fwd_instance)
list(APPEND DEVICE_INSTANCES device_grouped_conv3d_bwd_weight_instance)
# list(APPEND DEVICE_INSTANCES device_grouped_conv3d_bwd_weight_instance)
# list(APPEND DEVICE_INSTANCES device_gemm_add_relu_instance)
# list(APPEND DEVICE_INSTANCES device_gemm_multi_abd_instance)
# if(DTYPES MATCHES "fp16" OR NOT DEFINED DTYPES)
@@ -253,9 +253,9 @@ endif()
if(DL_KERNELS)
#list(APPEND DEVICE_INSTANCES device_batched_gemm_multi_d_instance)
list(APPEND DEVICE_INSTANCES device_grouped_conv1d_bwd_weight_instance)
list(APPEND DEVICE_INSTANCES device_grouped_conv2d_bwd_weight_instance)
list(APPEND DEVICE_INSTANCES device_grouped_conv3d_bwd_weight_instance)
# list(APPEND DEVICE_INSTANCES device_grouped_conv1d_bwd_weight_instance)
# list(APPEND DEVICE_INSTANCES device_grouped_conv2d_bwd_weight_instance)
# list(APPEND DEVICE_INSTANCES device_grouped_conv3d_bwd_weight_instance)
endif()
# if(CK_ENABLE_INT8)

View File

@@ -89,9 +89,12 @@ int profile_grouped_conv_fwd(int argc, char* argv[])
const bool time_kernel = std::stoi(argv[7]);
const int num_dim_spatial = std::stoi(argv[8]);
// 9 for control, 1 for num_dim_spatial, 4 for G/N/K/C, and 6 * num_dim_spatial
if(argc != 8 + 1 + 4 + 6 * num_dim_spatial + 1)
// program name, 7 for control, 1 for num_dim_spatial, 4 for G/N/K/C, and 6 * num_dim_spatial
const int expected_num_args = 1 + 7 + 1 + 4 + 6 * num_dim_spatial;
if(argc != expected_num_args)
{
std::cout << "Received " << argc << " arguments." << std::endl;
std::cout << "Expected " << expected_num_args << " arguments." << std::endl;
print_helper_msg();
return 1;
}

View File

@@ -4,4 +4,4 @@ grouped_conv_fwd 1 1 0 1 0 1
INT8
op data_type layout verify init_method print time_kernel spat_dim G N K C Y X Hi Wi Sy Sx Dy Dx Left_pad_y Left_pad_x Right_pad_y Right_pad_x
grouped_conv_fwd 3 1 0 1 0 1 2 1 1 32 32 3 3 800 1280 1 1 1 1 1 1 1 1
grouped_conv_fwd 3 1 0 1 0 1 2 1 1 32 32 3 3 800 1280 1 1 1 1 1 1 1 1