mirror of
https://github.com/ROCm/composable_kernel.git
synced 2026-07-01 20:27:42 +00:00
Add more fp16 instances. Align ckProfiler and ckTileProfiler.
This commit is contained in:
@@ -85,6 +85,8 @@ int tile_profile_grouped_conv_fwd(int argc, char* argv[])
|
||||
return 1;
|
||||
}
|
||||
|
||||
std::cout << argv[0] << " " << argv[1] << " " << argv[2] << " " << argv[3] << " " << argv[4] << std::endl;
|
||||
|
||||
const auto data_type = static_cast<ConvDataType>(std::stoi(argv[2]));
|
||||
const auto layout = static_cast<ConvLayout>(std::stoi(argv[3]));
|
||||
const bool do_verification = std::stoi(argv[4]);
|
||||
@@ -93,8 +95,8 @@ int tile_profile_grouped_conv_fwd(int argc, char* argv[])
|
||||
const bool time_kernel = std::stoi(argv[7]);
|
||||
const int num_dim_spatial = std::stoi(argv[8]);
|
||||
|
||||
// 7 for control, 1 for num_dim_spatial, 4 for G/N/K/C, and 6 * num_dim_spatial
|
||||
const int expected_num_args = 7 + 1 + 4 + 6 * num_dim_spatial + 1;
|
||||
// program name, 7 for control, 1 for num_dim_spatial, 4 for G/N/K/C, and 6 * num_dim_spatial
|
||||
const int expected_num_args = 1 + 7 + 1 + 4 + 6 * num_dim_spatial;
|
||||
if(argc != expected_num_args)
|
||||
{
|
||||
std::cout << "Received " << argc << " args"<< std::endl;
|
||||
|
||||
@@ -32,9 +32,41 @@ using tile_grouped_conv_fwd_fp16_instances = std::tuple<
|
||||
//##############################| Num| InLayout| WeiLayout| OutLayout| InData| WeiData| OutData| In| Wei| Out| Conv| K-block| M-tile| N-tile | K-tile | M-warp| N-warp| K-warp| M-warp| N-warp| K-warp| Vector| Vector| Vector| Double| GEMM|
|
||||
//##############################| Dim| | | | Type| Type| Type| Elementwise| Elementwise| Elementwise| Spec| per| | | | | | | tile| tile| tile| size| size| size| smem| pipeline|
|
||||
//##############################| Spatial| | | | | | | Operation| Operation| Operation| | CU| | | | | | | size| size| size| A| B| C| buffer| version|
|
||||
|
||||
// These instances do not compile on Navi4x cards
|
||||
#if defined(__gfx9__)
|
||||
GroupedConvolutionForwardInvoker<NDimSpatial, ALayout, BLayout, ELayout, F16, F16, F16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter1x1Stride1Pad0, 1, 128, 128, 128, 2, 2, 1, 32, 32, 16, 16, 16, 16, true, CK_TILE_PIPELINE_COMPUTE_V4>,
|
||||
GroupedConvolutionForwardInvoker<NDimSpatial, ALayout, BLayout, ELayout, F16, F16, F16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter1x1Pad0, 1, 128, 128, 128, 2, 2, 1, 32, 32, 16, 16, 16, 16, true, CK_TILE_PIPELINE_COMPUTE_V4>,
|
||||
GroupedConvolutionForwardInvoker<NDimSpatial, ALayout, BLayout, ELayout, F16, F16, F16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter3x3, 1, 128, 128, 128, 2, 2, 1, 32, 32, 16, 16, 16, 16, true, CK_TILE_PIPELINE_COMPUTE_V4>,
|
||||
GroupedConvolutionForwardInvoker<NDimSpatial, ALayout, BLayout, ELayout, F16, F16, F16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Default, 1, 128, 128, 128, 2, 2, 1, 32, 32, 16, 16, 16, 16, true, CK_TILE_PIPELINE_COMPUTE_V4>,
|
||||
#endif
|
||||
|
||||
GroupedConvolutionForwardInvoker<NDimSpatial, ALayout, BLayout, ELayout, F16, F16, F16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter1x1Stride1Pad0, 1, 64, 64, 32, 1, 1, 1, 32, 32, 16, 1, 1, 1, false, CK_TILE_PIPELINE_MEMORY>,
|
||||
GroupedConvolutionForwardInvoker<NDimSpatial, ALayout, BLayout, ELayout, F16, F16, F16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Default, 1, 64, 64, 32, 1, 1, 1, 32, 32, 16, 1, 1, 1, false, CK_TILE_PIPELINE_MEMORY>
|
||||
// clang-format on
|
||||
GroupedConvolutionForwardInvoker<NDimSpatial, ALayout, BLayout, ELayout, F16, F16, F16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Default, 1, 64, 64, 32, 1, 1, 1, 32, 32, 16, 1, 1, 1, false, CK_TILE_PIPELINE_MEMORY>,
|
||||
GroupedConvolutionForwardInvoker<NDimSpatial, ALayout, BLayout, ELayout, F16, F16, F16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter1x1Stride1Pad0, 1, 64, 64, 32, 1, 1, 1, 32, 32, 16, 1, 1, 1, false, CK_TILE_PIPELINE_MEMORY>,
|
||||
GroupedConvolutionForwardInvoker<NDimSpatial, ALayout, BLayout, ELayout, F16, F16, F16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Default, 1, 64, 64, 32, 1, 1, 1, 32, 32, 16, 1, 1, 1, false, CK_TILE_PIPELINE_MEMORY>,
|
||||
GroupedConvolutionForwardInvoker<NDimSpatial, ALayout, BLayout, ELayout, F16, F16, F16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter1x1Stride1Pad0, 1, 128, 128, 128, 2, 2, 1, 32, 32, 16, 16, 16, 16, false, CK_TILE_PIPELINE_MEMORY>,
|
||||
GroupedConvolutionForwardInvoker<NDimSpatial, ALayout, BLayout, ELayout, F16, F16, F16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter1x1Stride1Pad0, 1, 128, 128, 128, 2, 2, 1, 32, 32, 16, 16, 16, 16, false, CK_TILE_PIPELINE_COMPUTE_V3>,
|
||||
GroupedConvolutionForwardInvoker<NDimSpatial, ALayout, BLayout, ELayout, F16, F16, F16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter1x1Pad0, 1, 128, 128, 128, 2, 2, 1, 32, 32, 16, 16, 16, 16, false, CK_TILE_PIPELINE_MEMORY>,
|
||||
GroupedConvolutionForwardInvoker<NDimSpatial, ALayout, BLayout, ELayout, F16, F16, F16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter1x1Pad0, 1, 128, 128, 128, 2, 2, 1, 32, 32, 16, 16, 16, 16, false, CK_TILE_PIPELINE_COMPUTE_V3>,
|
||||
GroupedConvolutionForwardInvoker<NDimSpatial, ALayout, BLayout, ELayout, F16, F16, F16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter3x3, 1, 128, 128, 128, 2, 2, 1, 32, 32, 16, 16, 16, 16, false, CK_TILE_PIPELINE_MEMORY>,
|
||||
GroupedConvolutionForwardInvoker<NDimSpatial, ALayout, BLayout, ELayout, F16, F16, F16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter3x3, 1, 128, 128, 128, 2, 2, 1, 32, 32, 16, 16, 16, 16, false, CK_TILE_PIPELINE_COMPUTE_V3>,
|
||||
GroupedConvolutionForwardInvoker<NDimSpatial, ALayout, BLayout, ELayout, F16, F16, F16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Default, 1, 128, 128, 128, 2, 2, 1, 32, 32, 16, 16, 16, 16, false, CK_TILE_PIPELINE_MEMORY>,
|
||||
GroupedConvolutionForwardInvoker<NDimSpatial, ALayout, BLayout, ELayout, F16, F16, F16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Default, 1, 128, 128, 128, 2, 2, 1, 32, 32, 16, 16, 16, 16, false, CK_TILE_PIPELINE_COMPUTE_V3>,
|
||||
GroupedConvolutionForwardInvoker<NDimSpatial, ALayout, BLayout, ELayout, F16, F16, F16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter1x1Stride1Pad0, 1, 128, 128, 32, 2, 2, 1, 32, 32, 16, 8, 8, 8, false, CK_TILE_PIPELINE_MEMORY>,
|
||||
GroupedConvolutionForwardInvoker<NDimSpatial, ALayout, BLayout, ELayout, F16, F16, F16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter1x1Stride1Pad0, 1, 128, 128, 32, 2, 2, 1, 32, 32, 16, 8, 8, 8, false, CK_TILE_PIPELINE_COMPUTE_V3>,
|
||||
GroupedConvolutionForwardInvoker<NDimSpatial, ALayout, BLayout, ELayout, F16, F16, F16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter1x1Stride1Pad0, 1, 128, 128, 32, 2, 2, 1, 32, 32, 16, 8, 8, 8, true, CK_TILE_PIPELINE_COMPUTE_V4>,
|
||||
GroupedConvolutionForwardInvoker<NDimSpatial, ALayout, BLayout, ELayout, F16, F16, F16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter1x1Pad0, 1, 128, 128, 32, 2, 2, 1, 32, 32, 16, 8, 8, 8, false, CK_TILE_PIPELINE_MEMORY>,
|
||||
GroupedConvolutionForwardInvoker<NDimSpatial, ALayout, BLayout, ELayout, F16, F16, F16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter1x1Pad0, 1, 128, 128, 32, 2, 2, 1, 32, 32, 16, 8, 8, 8, false, CK_TILE_PIPELINE_COMPUTE_V3>,
|
||||
GroupedConvolutionForwardInvoker<NDimSpatial, ALayout, BLayout, ELayout, F16, F16, F16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter1x1Pad0, 1, 128, 128, 32, 2, 2, 1, 32, 32, 16, 8, 8, 8, true, CK_TILE_PIPELINE_COMPUTE_V4>,
|
||||
GroupedConvolutionForwardInvoker<NDimSpatial, ALayout, BLayout, ELayout, F16, F16, F16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter3x3, 1, 128, 128, 32, 2, 2, 1, 32, 32, 16, 8, 8, 8, false, CK_TILE_PIPELINE_MEMORY>,
|
||||
GroupedConvolutionForwardInvoker<NDimSpatial, ALayout, BLayout, ELayout, F16, F16, F16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter3x3, 1, 128, 128, 32, 2, 2, 1, 32, 32, 16, 8, 8, 8, false, CK_TILE_PIPELINE_COMPUTE_V3>,
|
||||
GroupedConvolutionForwardInvoker<NDimSpatial, ALayout, BLayout, ELayout, F16, F16, F16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter3x3, 1, 128, 128, 32, 2, 2, 1, 32, 32, 16, 8, 8, 8, true, CK_TILE_PIPELINE_COMPUTE_V4>,
|
||||
GroupedConvolutionForwardInvoker<NDimSpatial, ALayout, BLayout, ELayout, F16, F16, F16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Default, 1, 128, 128, 32, 2, 2, 1, 32, 32, 16, 8, 8, 8, false, CK_TILE_PIPELINE_MEMORY>,
|
||||
GroupedConvolutionForwardInvoker<NDimSpatial, ALayout, BLayout, ELayout, F16, F16, F16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Default, 1, 128, 128, 32, 2, 2, 1, 32, 32, 16, 8, 8, 8, false, CK_TILE_PIPELINE_COMPUTE_V3>,
|
||||
GroupedConvolutionForwardInvoker<NDimSpatial, ALayout, BLayout, ELayout, F16, F16, F16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Default, 1, 128, 128, 32, 2, 2, 1, 32, 32, 16, 8, 8, 8, true, CK_TILE_PIPELINE_COMPUTE_V4>,
|
||||
GroupedConvolutionForwardInvoker<NDimSpatial, ALayout, BLayout, ELayout, F16, F16, F16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter1x1Stride1Pad0, 1, 256, 256, 32, 4, 4, 1, 32, 32, 16, 8, 8, 8, false, CK_TILE_PIPELINE_MEMORY>
|
||||
|
||||
// clang-format on
|
||||
>;
|
||||
|
||||
|
||||
@@ -34,38 +34,39 @@ using tile_grouped_conv_fwd_int8_instances = std::tuple<
|
||||
//##############################| Num| InLayout| WeiLayout| OutLayout| InData| WeiData| OutData| In| Wei| Out| Conv| K-block| M-tile| N-tile | K-tile | M-warp| N-warp| K-warp| M-warp| N-warp| K-warp| Vector| Vector| Vector| Double| GEMM|
|
||||
//##############################| Dim| | | | Type| Type| Type| Elementwise| Elementwise| Elementwise| Spec| per| | | | | | | tile| tile| tile| size| size| size| smem| pipeline|
|
||||
//##############################| Spatial| | | | | | | Operation| Operation| Operation| | CU| | | | | | | size| size| size| A| B| C| buffer| version|
|
||||
GroupedConvolutionForwardInvoker<NDimSpatial, ALayout, BLayout, ELayout, INT8, INT8, INT8, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter1x1Stride1Pad0, 1, 64, 64, 32, 1, 1, 1, 32, 32, 16, 1, 1, 1, false, CK_TILE_PIPELINE_MEMORY>,
|
||||
GroupedConvolutionForwardInvoker<NDimSpatial, ALayout, BLayout, ELayout, INT8, INT8, INT8, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Default, 1, 64, 64, 32, 1, 1, 1, 32, 32, 16, 1, 1, 1, false, CK_TILE_PIPELINE_MEMORY>
|
||||
|
||||
// Converted from device_grouped_conv_fwd_xdl_int8_comp_instances_2x
|
||||
// GroupedConvolutionForwardInvoker<NDimSpatial, ALayout, BLayout, ELayout, INT8, INT8, INT8, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter1x1Stride1Pad0, 1, 128, 128, 128, 2, 2, 1, 32, 32, 32, 16, 16, 16, false, CK_TILE_PIPELINE_MEMORY>,
|
||||
// GroupedConvolutionForwardInvoker<NDimSpatial, ALayout, BLayout, ELayout, INT8, INT8, INT8, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter1x1Stride1Pad0, 1, 128, 128, 128, 2, 2, 1, 32, 32, 32, 16, 16, 16, false, CK_TILE_PIPELINE_COMPUTE_V3>,
|
||||
// GroupedConvolutionForwardInvoker<NDimSpatial, ALayout, BLayout, ELayout, INT8, INT8, INT8, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter1x1Stride1Pad0, 1, 128, 128, 128, 2, 2, 1, 32, 32, 32, 16, 16, 16, true, CK_TILE_PIPELINE_COMPUTE_V4>,
|
||||
// GroupedConvolutionForwardInvoker<NDimSpatial, ALayout, BLayout, ELayout, INT8, INT8, INT8, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter1x1Pad0, 1, 128, 128, 128, 2, 2, 1, 32, 32, 32, 16, 16, 16, false, CK_TILE_PIPELINE_MEMORY>,
|
||||
// GroupedConvolutionForwardInvoker<NDimSpatial, ALayout, BLayout, ELayout, INT8, INT8, INT8, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter1x1Pad0, 1, 128, 128, 128, 2, 2, 1, 32, 32, 32, 16, 16, 16, false, CK_TILE_PIPELINE_COMPUTE_V3>,
|
||||
// GroupedConvolutionForwardInvoker<NDimSpatial, ALayout, BLayout, ELayout, INT8, INT8, INT8, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter1x1Pad0, 1, 128, 128, 128, 2, 2, 1, 32, 32, 32, 16, 16, 16, true, CK_TILE_PIPELINE_COMPUTE_V4>,
|
||||
// GroupedConvolutionForwardInvoker<NDimSpatial, ALayout, BLayout, ELayout, INT8, INT8, INT8, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter3x3, 1, 128, 128, 128, 2, 2, 1, 32, 32, 32, 16, 16, 16, false, CK_TILE_PIPELINE_MEMORY>,
|
||||
// GroupedConvolutionForwardInvoker<NDimSpatial, ALayout, BLayout, ELayout, INT8, INT8, INT8, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter3x3, 1, 128, 128, 128, 2, 2, 1, 32, 32, 32, 16, 16, 16, false, CK_TILE_PIPELINE_COMPUTE_V3>,
|
||||
// GroupedConvolutionForwardInvoker<NDimSpatial, ALayout, BLayout, ELayout, INT8, INT8, INT8, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter3x3, 1, 128, 128, 128, 2, 2, 1, 32, 32, 32, 16, 16, 16, true, CK_TILE_PIPELINE_COMPUTE_V4>,
|
||||
// GroupedConvolutionForwardInvoker<NDimSpatial, ALayout, BLayout, ELayout, INT8, INT8, INT8, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Default, 1, 128, 128, 128, 2, 2, 1, 32, 32, 32, 16, 16, 16, false, CK_TILE_PIPELINE_MEMORY>,
|
||||
// GroupedConvolutionForwardInvoker<NDimSpatial, ALayout, BLayout, ELayout, INT8, INT8, INT8, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Default, 1, 128, 128, 128, 2, 2, 1, 32, 32, 32, 16, 16, 16, false, CK_TILE_PIPELINE_COMPUTE_V3>,
|
||||
// GroupedConvolutionForwardInvoker<NDimSpatial, ALayout, BLayout, ELayout, INT8, INT8, INT8, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Default, 1, 128, 128, 128, 2, 2, 1, 32, 32, 32, 16, 16, 16, true, CK_TILE_PIPELINE_COMPUTE_V4>,
|
||||
// Converted from device_grouped_conv_fwd_xdl_int8_comp_instances
|
||||
// GroupedConvolutionForwardInvoker<NDimSpatial, ALayout, BLayout, ELayout, INT8, INT8, INT8, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter1x1Stride1Pad0, 1, 128, 128, 32, 2, 2, 1, 32, 32, 32, 8, 8, 8, false, CK_TILE_PIPELINE_MEMORY>,
|
||||
// GroupedConvolutionForwardInvoker<NDimSpatial, ALayout, BLayout, ELayout, INT8, INT8, INT8, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter1x1Stride1Pad0, 1, 128, 128, 32, 2, 2, 1, 32, 32, 32, 8, 8, 8, false, CK_TILE_PIPELINE_COMPUTE_V3>,
|
||||
// GroupedConvolutionForwardInvoker<NDimSpatial, ALayout, BLayout, ELayout, INT8, INT8, INT8, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter1x1Stride1Pad0, 1, 128, 128, 32, 2, 2, 1, 32, 32, 32, 8, 8, 8, true, CK_TILE_PIPELINE_COMPUTE_V4>,
|
||||
// GroupedConvolutionForwardInvoker<NDimSpatial, ALayout, BLayout, ELayout, INT8, INT8, INT8, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter1x1Pad0, 1, 128, 128, 32, 2, 2, 1, 32, 32, 32, 8, 8, 8, false, CK_TILE_PIPELINE_MEMORY>,
|
||||
// GroupedConvolutionForwardInvoker<NDimSpatial, ALayout, BLayout, ELayout, INT8, INT8, INT8, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter1x1Pad0, 1, 128, 128, 32, 2, 2, 1, 32, 32, 32, 8, 8, 8, false, CK_TILE_PIPELINE_COMPUTE_V3>,
|
||||
// GroupedConvolutionForwardInvoker<NDimSpatial, ALayout, BLayout, ELayout, INT8, INT8, INT8, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter1x1Pad0, 1, 128, 128, 32, 2, 2, 1, 32, 32, 32, 8, 8, 8, true, CK_TILE_PIPELINE_COMPUTE_V4>,
|
||||
// GroupedConvolutionForwardInvoker<NDimSpatial, ALayout, BLayout, ELayout, INT8, INT8, INT8, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter3x3, 1, 128, 128, 32, 2, 2, 1, 32, 32, 32, 8, 8, 8, false, CK_TILE_PIPELINE_MEMORY>,
|
||||
// GroupedConvolutionForwardInvoker<NDimSpatial, ALayout, BLayout, ELayout, INT8, INT8, INT8, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter3x3, 1, 128, 128, 32, 2, 2, 1, 32, 32, 32, 8, 8, 8, false, CK_TILE_PIPELINE_COMPUTE_V3>,
|
||||
// GroupedConvolutionForwardInvoker<NDimSpatial, ALayout, BLayout, ELayout, INT8, INT8, INT8, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter3x3, 1, 128, 128, 32, 2, 2, 1, 32, 32, 32, 8, 8, 8, true, CK_TILE_PIPELINE_COMPUTE_V4>,
|
||||
// GroupedConvolutionForwardInvoker<NDimSpatial, ALayout, BLayout, ELayout, INT8, INT8, INT8, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Default, 1, 128, 128, 32, 2, 2, 1, 32, 32, 32, 8, 8, 8, false, CK_TILE_PIPELINE_MEMORY>,
|
||||
// GroupedConvolutionForwardInvoker<NDimSpatial, ALayout, BLayout, ELayout, INT8, INT8, INT8, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Default, 1, 128, 128, 32, 2, 2, 1, 32, 32, 32, 8, 8, 8, false, CK_TILE_PIPELINE_COMPUTE_V3>,
|
||||
// GroupedConvolutionForwardInvoker<NDimSpatial, ALayout, BLayout, ELayout, INT8, INT8, INT8, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Default, 1, 128, 128, 32, 2, 2, 1, 32, 32, 32, 8, 8, 8, true, CK_TILE_PIPELINE_COMPUTE_V4>,
|
||||
// Converted from device_grouped_conv_fwd_xdl_int8_comp_instances_part2 - Instance 1
|
||||
// GroupedConvolutionForwardInvoker<NDimSpatial, ALayout, BLayout, ELayout, INT8, INT8, INT8, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter1x1Stride1Pad0, 1, 256, 256, 32, 4, 4, 1, 32, 32, 32, 8, 8, 8, false, CK_TILE_PIPELINE_MEMORY>,
|
||||
// GroupedConvolutionForwardInvoker<NDimSpatial, ALayout, BLayout, ELayout, INT8, INT8, INT8, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter1x1Stride1Pad0, 1, 256, 256, 32, 4, 4, 1, 32, 32, 32, 8, 8, 8, false, CK_TILE_PIPELINE_COMPUTE_V3>,
|
||||
// These instances do not compile on Navi cards for FP16 due to high memory usage
|
||||
GroupedConvolutionForwardInvoker<NDimSpatial, ALayout, BLayout, ELayout, INT8, INT8, INT8, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter1x1Stride1Pad0, 1, 128, 128, 128, 2, 2, 1, 32, 32, 16, 16, 16, 16, true, CK_TILE_PIPELINE_COMPUTE_V4>,
|
||||
GroupedConvolutionForwardInvoker<NDimSpatial, ALayout, BLayout, ELayout, INT8, INT8, INT8, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter1x1Pad0, 1, 128, 128, 128, 2, 2, 1, 32, 32, 16, 16, 16, 16, true, CK_TILE_PIPELINE_COMPUTE_V4>,
|
||||
GroupedConvolutionForwardInvoker<NDimSpatial, ALayout, BLayout, ELayout, INT8, INT8, INT8, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter3x3, 1, 128, 128, 128, 2, 2, 1, 32, 32, 16, 16, 16, 16, true, CK_TILE_PIPELINE_COMPUTE_V4>,
|
||||
GroupedConvolutionForwardInvoker<NDimSpatial, ALayout, BLayout, ELayout, INT8, INT8, INT8, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Default, 1, 128, 128, 128, 2, 2, 1, 32, 32, 16, 16, 16, 16, true, CK_TILE_PIPELINE_COMPUTE_V4>,
|
||||
|
||||
GroupedConvolutionForwardInvoker<NDimSpatial, ALayout, BLayout, ELayout, INT8, INT8, INT8, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter1x1Stride1Pad0, 1, 64, 64, 32, 1, 1, 1, 32, 32, 16, 1, 1, 1, false, CK_TILE_PIPELINE_MEMORY>,
|
||||
GroupedConvolutionForwardInvoker<NDimSpatial, ALayout, BLayout, ELayout, INT8, INT8, INT8, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Default, 1, 64, 64, 32, 1, 1, 1, 32, 32, 16, 1, 1, 1, false, CK_TILE_PIPELINE_MEMORY>,
|
||||
GroupedConvolutionForwardInvoker<NDimSpatial, ALayout, BLayout, ELayout, INT8, INT8, INT8, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter1x1Stride1Pad0, 1, 128, 128, 128, 2, 2, 1, 32, 32, 16, 16, 16, 16, false, CK_TILE_PIPELINE_MEMORY>,
|
||||
GroupedConvolutionForwardInvoker<NDimSpatial, ALayout, BLayout, ELayout, INT8, INT8, INT8, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter1x1Stride1Pad0, 1, 128, 128, 128, 2, 2, 1, 32, 32, 16, 16, 16, 16, false, CK_TILE_PIPELINE_COMPUTE_V3>,
|
||||
GroupedConvolutionForwardInvoker<NDimSpatial, ALayout, BLayout, ELayout, INT8, INT8, INT8, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter1x1Pad0, 1, 128, 128, 128, 2, 2, 1, 32, 32, 16, 16, 16, 16, false, CK_TILE_PIPELINE_MEMORY>,
|
||||
GroupedConvolutionForwardInvoker<NDimSpatial, ALayout, BLayout, ELayout, INT8, INT8, INT8, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter1x1Pad0, 1, 128, 128, 128, 2, 2, 1, 32, 32, 16, 16, 16, 16, false, CK_TILE_PIPELINE_COMPUTE_V3>,
|
||||
GroupedConvolutionForwardInvoker<NDimSpatial, ALayout, BLayout, ELayout, INT8, INT8, INT8, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter3x3, 1, 128, 128, 128, 2, 2, 1, 32, 32, 16, 16, 16, 16, false, CK_TILE_PIPELINE_MEMORY>,
|
||||
GroupedConvolutionForwardInvoker<NDimSpatial, ALayout, BLayout, ELayout, INT8, INT8, INT8, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter3x3, 1, 128, 128, 128, 2, 2, 1, 32, 32, 16, 16, 16, 16, false, CK_TILE_PIPELINE_COMPUTE_V3>,
|
||||
GroupedConvolutionForwardInvoker<NDimSpatial, ALayout, BLayout, ELayout, INT8, INT8, INT8, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Default, 1, 128, 128, 128, 2, 2, 1, 32, 32, 16, 16, 16, 16, false, CK_TILE_PIPELINE_MEMORY>,
|
||||
GroupedConvolutionForwardInvoker<NDimSpatial, ALayout, BLayout, ELayout, INT8, INT8, INT8, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Default, 1, 128, 128, 128, 2, 2, 1, 32, 32, 16, 16, 16, 16, false, CK_TILE_PIPELINE_COMPUTE_V3>,
|
||||
GroupedConvolutionForwardInvoker<NDimSpatial, ALayout, BLayout, ELayout, INT8, INT8, INT8, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter1x1Stride1Pad0, 1, 128, 128, 32, 2, 2, 1, 32, 32, 16, 8, 8, 8, false, CK_TILE_PIPELINE_MEMORY>,
|
||||
GroupedConvolutionForwardInvoker<NDimSpatial, ALayout, BLayout, ELayout, INT8, INT8, INT8, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter1x1Stride1Pad0, 1, 128, 128, 32, 2, 2, 1, 32, 32, 16, 8, 8, 8, false, CK_TILE_PIPELINE_COMPUTE_V3>,
|
||||
GroupedConvolutionForwardInvoker<NDimSpatial, ALayout, BLayout, ELayout, INT8, INT8, INT8, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter1x1Stride1Pad0, 1, 128, 128, 32, 2, 2, 1, 32, 32, 16, 8, 8, 8, true, CK_TILE_PIPELINE_COMPUTE_V4>,
|
||||
GroupedConvolutionForwardInvoker<NDimSpatial, ALayout, BLayout, ELayout, INT8, INT8, INT8, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter1x1Pad0, 1, 128, 128, 32, 2, 2, 1, 32, 32, 16, 8, 8, 8, false, CK_TILE_PIPELINE_MEMORY>,
|
||||
GroupedConvolutionForwardInvoker<NDimSpatial, ALayout, BLayout, ELayout, INT8, INT8, INT8, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter1x1Pad0, 1, 128, 128, 32, 2, 2, 1, 32, 32, 16, 8, 8, 8, false, CK_TILE_PIPELINE_COMPUTE_V3>,
|
||||
GroupedConvolutionForwardInvoker<NDimSpatial, ALayout, BLayout, ELayout, INT8, INT8, INT8, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter1x1Pad0, 1, 128, 128, 32, 2, 2, 1, 32, 32, 16, 8, 8, 8, true, CK_TILE_PIPELINE_COMPUTE_V4>,
|
||||
GroupedConvolutionForwardInvoker<NDimSpatial, ALayout, BLayout, ELayout, INT8, INT8, INT8, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter3x3, 1, 128, 128, 32, 2, 2, 1, 32, 32, 16, 8, 8, 8, false, CK_TILE_PIPELINE_MEMORY>,
|
||||
GroupedConvolutionForwardInvoker<NDimSpatial, ALayout, BLayout, ELayout, INT8, INT8, INT8, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter3x3, 1, 128, 128, 32, 2, 2, 1, 32, 32, 16, 8, 8, 8, false, CK_TILE_PIPELINE_COMPUTE_V3>,
|
||||
GroupedConvolutionForwardInvoker<NDimSpatial, ALayout, BLayout, ELayout, INT8, INT8, INT8, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter3x3, 1, 128, 128, 32, 2, 2, 1, 32, 32, 16, 8, 8, 8, true, CK_TILE_PIPELINE_COMPUTE_V4>,
|
||||
GroupedConvolutionForwardInvoker<NDimSpatial, ALayout, BLayout, ELayout, INT8, INT8, INT8, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Default, 1, 128, 128, 32, 2, 2, 1, 32, 32, 16, 8, 8, 8, false, CK_TILE_PIPELINE_MEMORY>,
|
||||
GroupedConvolutionForwardInvoker<NDimSpatial, ALayout, BLayout, ELayout, INT8, INT8, INT8, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Default, 1, 128, 128, 32, 2, 2, 1, 32, 32, 16, 8, 8, 8, false, CK_TILE_PIPELINE_COMPUTE_V3>,
|
||||
GroupedConvolutionForwardInvoker<NDimSpatial, ALayout, BLayout, ELayout, INT8, INT8, INT8, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Default, 1, 128, 128, 32, 2, 2, 1, 32, 32, 16, 8, 8, 8, true, CK_TILE_PIPELINE_COMPUTE_V4>,
|
||||
GroupedConvolutionForwardInvoker<NDimSpatial, ALayout, BLayout, ELayout, INT8, INT8, INT8, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter1x1Stride1Pad0, 1, 256, 256, 32, 4, 4, 1, 32, 32, 16, 8, 8, 8, false, CK_TILE_PIPELINE_MEMORY>
|
||||
|
||||
|
||||
// GroupedConvolutionForwardInvoker<NDimSpatial, ALayout, BLayout, ELayout, INT8, INT8, INT8, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter1x1Stride1Pad0, 1, 256, 256, 32, 4, 4, 1, 32, 32, 16, 8, 8, 8, false, CK_TILE_PIPELINE_COMPUTE_V3>,
|
||||
// GroupedConvolutionForwardInvoker<NDimSpatial, ALayout, BLayout, ELayout, INT8, INT8, INT8, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter1x1Stride1Pad0, 1, 256, 256, 32, 4, 4, 1, 32, 32, 32, 8, 8, 8, true, CK_TILE_PIPELINE_COMPUTE_V4>,
|
||||
// GroupedConvolutionForwardInvoker<NDimSpatial, ALayout, BLayout, ELayout, INT8, INT8, INT8, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter1x1Pad0, 1, 256, 256, 32, 4, 4, 1, 32, 32, 32, 8, 8, 8, false, CK_TILE_PIPELINE_MEMORY>,
|
||||
// GroupedConvolutionForwardInvoker<NDimSpatial, ALayout, BLayout, ELayout, INT8, INT8, INT8, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter1x1Pad0, 1, 256, 256, 32, 4, 4, 1, 32, 32, 32, 8, 8, 8, false, CK_TILE_PIPELINE_COMPUTE_V3>,
|
||||
|
||||
@@ -97,8 +97,8 @@ if(SUPPORTED_GPU_TARGETS MATCHES "gfx9" OR SUPPORTED_GPU_TARGETS MATCHES "gfx1[1
|
||||
list(APPEND PROFILER_OPS profile_grouped_conv_fwd.cpp)
|
||||
# list(APPEND PROFILER_OPS profile_grouped_conv_fwd_bias_clamp.cpp)
|
||||
# list(APPEND PROFILER_OPS profile_grouped_conv_fwd_clamp.cpp)
|
||||
list(APPEND PROFILER_OPS profile_grouped_conv_bwd_data.cpp)
|
||||
list(APPEND PROFILER_OPS profile_grouped_conv_bwd_weight.cpp)
|
||||
# list(APPEND PROFILER_OPS profile_grouped_conv_bwd_data.cpp)
|
||||
# list(APPEND PROFILER_OPS profile_grouped_conv_bwd_weight.cpp)
|
||||
# list(APPEND PROFILER_OPS profile_gemm_multi_abd.cpp)
|
||||
# if(DTYPES MATCHES "fp16" OR NOT DEFINED DTYPES)
|
||||
# list(APPEND PROFILER_OPS profile_gemm_add_multiply.cpp)
|
||||
@@ -114,7 +114,7 @@ endif()
|
||||
|
||||
if(DL_KERNELS)
|
||||
#list(APPEND PROFILER_OPS profile_batched_gemm_multi_d.cpp)
|
||||
list(APPEND PROFILER_OPS profile_grouped_conv_bwd_weight.cpp)
|
||||
#list(APPEND PROFILER_OPS profile_grouped_conv_bwd_weight.cpp)
|
||||
endif()
|
||||
|
||||
# if(CK_ENABLE_INT8)
|
||||
@@ -208,9 +208,9 @@ if(SUPPORTED_GPU_TARGETS MATCHES "gfx9" OR SUPPORTED_GPU_TARGETS MATCHES "gfx1[1
|
||||
# list(APPEND DEVICE_INSTANCES device_conv1d_bwd_data_instance)
|
||||
# list(APPEND DEVICE_INSTANCES device_conv3d_bwd_data_instance)
|
||||
# list(APPEND DEVICE_INSTANCES device_conv2d_bwd_data_instance)
|
||||
list(APPEND DEVICE_INSTANCES device_grouped_conv1d_bwd_weight_instance)
|
||||
list(APPEND DEVICE_INSTANCES device_grouped_conv2d_bwd_weight_instance)
|
||||
list(APPEND DEVICE_INSTANCES device_grouped_convnd_bwd_weight_instance)
|
||||
# list(APPEND DEVICE_INSTANCES device_grouped_conv1d_bwd_weight_instance)
|
||||
# list(APPEND DEVICE_INSTANCES device_grouped_conv2d_bwd_weight_instance)
|
||||
# list(APPEND DEVICE_INSTANCES device_grouped_convnd_bwd_weight_instance)
|
||||
# list(APPEND DEVICE_INSTANCES device_grouped_conv3d_fwd_convscale_instance)
|
||||
# list(APPEND DEVICE_INSTANCES device_grouped_conv3d_fwd_convinvscale_instance)
|
||||
# list(APPEND DEVICE_INSTANCES device_grouped_conv2d_fwd_clamp_instance)
|
||||
@@ -233,10 +233,10 @@ if(SUPPORTED_GPU_TARGETS MATCHES "gfx9" OR SUPPORTED_GPU_TARGETS MATCHES "gfx1[1
|
||||
# list(APPEND DEVICE_INSTANCES device_gemm_b_scale_instance)
|
||||
# list(APPEND DEVICE_INSTANCES device_gemm_universal_reduce_instance)
|
||||
list(APPEND DEVICE_INSTANCES device_grouped_conv3d_fwd_instance)
|
||||
list(APPEND DEVICE_INSTANCES device_grouped_conv2d_bwd_data_instance)
|
||||
list(APPEND DEVICE_INSTANCES device_grouped_conv3d_bwd_data_instance)
|
||||
# list(APPEND DEVICE_INSTANCES device_grouped_conv2d_bwd_data_instance)
|
||||
# list(APPEND DEVICE_INSTANCES device_grouped_conv3d_bwd_data_instance)
|
||||
list(APPEND DEVICE_INSTANCES device_grouped_conv2d_fwd_instance)
|
||||
list(APPEND DEVICE_INSTANCES device_grouped_conv3d_bwd_weight_instance)
|
||||
# list(APPEND DEVICE_INSTANCES device_grouped_conv3d_bwd_weight_instance)
|
||||
# list(APPEND DEVICE_INSTANCES device_gemm_add_relu_instance)
|
||||
# list(APPEND DEVICE_INSTANCES device_gemm_multi_abd_instance)
|
||||
# if(DTYPES MATCHES "fp16" OR NOT DEFINED DTYPES)
|
||||
@@ -253,9 +253,9 @@ endif()
|
||||
|
||||
if(DL_KERNELS)
|
||||
#list(APPEND DEVICE_INSTANCES device_batched_gemm_multi_d_instance)
|
||||
list(APPEND DEVICE_INSTANCES device_grouped_conv1d_bwd_weight_instance)
|
||||
list(APPEND DEVICE_INSTANCES device_grouped_conv2d_bwd_weight_instance)
|
||||
list(APPEND DEVICE_INSTANCES device_grouped_conv3d_bwd_weight_instance)
|
||||
# list(APPEND DEVICE_INSTANCES device_grouped_conv1d_bwd_weight_instance)
|
||||
# list(APPEND DEVICE_INSTANCES device_grouped_conv2d_bwd_weight_instance)
|
||||
# list(APPEND DEVICE_INSTANCES device_grouped_conv3d_bwd_weight_instance)
|
||||
endif()
|
||||
|
||||
# if(CK_ENABLE_INT8)
|
||||
|
||||
@@ -89,9 +89,12 @@ int profile_grouped_conv_fwd(int argc, char* argv[])
|
||||
const bool time_kernel = std::stoi(argv[7]);
|
||||
const int num_dim_spatial = std::stoi(argv[8]);
|
||||
|
||||
// 9 for control, 1 for num_dim_spatial, 4 for G/N/K/C, and 6 * num_dim_spatial
|
||||
if(argc != 8 + 1 + 4 + 6 * num_dim_spatial + 1)
|
||||
// program name, 7 for control, 1 for num_dim_spatial, 4 for G/N/K/C, and 6 * num_dim_spatial
|
||||
const int expected_num_args = 1 + 7 + 1 + 4 + 6 * num_dim_spatial;
|
||||
if(argc != expected_num_args)
|
||||
{
|
||||
std::cout << "Received " << argc << " arguments." << std::endl;
|
||||
std::cout << "Expected " << expected_num_args << " arguments." << std::endl;
|
||||
print_helper_msg();
|
||||
return 1;
|
||||
}
|
||||
|
||||
@@ -4,4 +4,4 @@ grouped_conv_fwd 1 1 0 1 0 1
|
||||
|
||||
INT8
|
||||
op data_type layout verify init_method print time_kernel spat_dim G N K C Y X Hi Wi Sy Sx Dy Dx Left_pad_y Left_pad_x Right_pad_y Right_pad_x
|
||||
grouped_conv_fwd 3 1 0 1 0 1 2 1 1 32 32 3 3 800 1280 1 1 1 1 1 1 1 1
|
||||
grouped_conv_fwd 3 1 0 1 0 1 2 1 1 32 32 3 3 800 1280 1 1 1 1 1 1 1 1
|
||||
|
||||
Reference in New Issue
Block a user