From b6540cb96a7f7c2b320c3b74d224cfb2e0f2bf77 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ville=20Pietil=C3=A4?= <> Date: Thu, 18 Dec 2025 06:31:22 -0500 Subject: [PATCH] List bwd instances. --- ...grouped_conv_bwd_algorithms_instances.json | 1257 +++++++++++++++++ .../grouped_conv_bwd_algorithms_summary.md | 460 ++++++ .../grouped_conv_bwd_data_instances.json | 473 +++++++ .../grouped_conv_bwd_weight_instances.json | 648 +++++++++ 4 files changed, 2838 insertions(+) create mode 100644 experimental/builder/instances/grouped_conv_bwd_algorithms_instances.json create mode 100644 experimental/builder/instances/grouped_conv_bwd_algorithms_summary.md create mode 100644 experimental/builder/instances/grouped_conv_bwd_data_instances.json create mode 100644 experimental/builder/instances/grouped_conv_bwd_weight_instances.json diff --git a/experimental/builder/instances/grouped_conv_bwd_algorithms_instances.json b/experimental/builder/instances/grouped_conv_bwd_algorithms_instances.json new file mode 100644 index 0000000000..fa09d6761c --- /dev/null +++ b/experimental/builder/instances/grouped_conv_bwd_algorithms_instances.json @@ -0,0 +1,1257 @@ +{ + "description": "Comprehensive listing of grouped convolution backward algorithms and their instances in the Composable Kernel library", + "version": "Complete analysis including all algorithms in static library", + "note": "All listed algorithms are part of the static library with pre-compiled instances", + + "algorithms": { + "backward_data": { + "DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1": { + "implementation_file": "include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_data_multiple_d_xdl_cshuffle_v1.hpp", + "description": "XDL (Matrix Core) based backward data convolution with CShuffle optimization", + "instruction_set": "XDL/MFMA (AMD Matrix Cores)", + "status": "active", + "features": [ + "CShuffle optimization", + "Multiple spatial dimensions (1D, 2D, 3D)", + "Prefetch pipelining", + "Supports 
multiple D tensors for fused operations" + ], + "data_types": ["FP16", "BF16", "FP32", "TF32", "FP8", "BF8"], + "specializations": ["ConvBwdDataDefault", "ConvBwdDataFilter1x1Stride1Pad0"], + "instance_header_files": [ + "library/include/ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_data/device_grouped_conv_bwd_data_xdl_instance.hpp", + "library/include/ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_data/device_grouped_conv_bwd_data_xdl_bilinear_instance.hpp", + "library/include/ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_data/device_grouped_conv_bwd_data_xdl_scale_instance.hpp", + "library/include/ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_data/device_grouped_conv_bwd_data_transpose_xdl_instance.hpp" + ], + "instantiation_cpp_files": { + "2D": [ + "library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_data/xdl/device_grouped_conv2d_bwd_data_xdl_gnhwc_gkyxc_gnhwk_f16_instance.cpp", + "library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_data/xdl/device_grouped_conv2d_bwd_data_xdl_gnhwc_gkyxc_gnhwk_bf16_instance.cpp", + "library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_data/xdl/device_grouped_conv2d_bwd_data_xdl_gnhwc_gkyxc_gnhwk_f32_instance.cpp", + "library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_data/xdl/device_grouped_conv2d_bwd_data_xdl_nhwgc_gkyxc_nhwgk_f16_instance.cpp", + "library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_data/xdl/device_grouped_conv2d_bwd_data_xdl_nhwgc_gkyxc_nhwgk_bf16_instance.cpp", + "library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_data/xdl/device_grouped_conv2d_bwd_data_xdl_nhwgc_gkyxc_nhwgk_f32_instance.cpp", + "library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_data/xdl/device_grouped_conv2d_bwd_data_xdl_nhwgc_gkyxc_nhwgk_f32_tf32_instance.cpp", + "... 
(and many more variants)" + ], + "3D": [ + "library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_data/xdl/*.cpp" + ] + }, + "supported_layouts": { + "2D": ["GNHWK/GKYXC/GNHWC", "NHWGK/GKYXC/NHWGC", "NGKHW/GKCYX/NGCHW"], + "3D": ["GNDHWK/GKZYXC/GNDHWC", "NDHWGK/GKZYXC/NDHWGC"] + } + }, + "DeviceGroupedConvBwdDataMultipleD_Wmma_CShuffle": { + "implementation_file": "include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_data_multiple_d_wmma_cshuffle.hpp", + "description": "WMMA (Wave Matrix Multiply Accumulate) based backward data convolution", + "instruction_set": "WMMA (16x16 matrix operations)", + "status": "active", + "features": [ + "CShuffle optimization", + "Flexible block sizes", + "Optimized for specific GPU architectures" + ], + "data_types": ["FP16", "INT8"], + "specializations": ["ConvBwdDataDefault", "ConvBwdDataFilter1x1Stride1Pad0"], + "instance_header_files": [ + "library/include/ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_data/device_grouped_conv_bwd_data_wmma_f16_instance.hpp", + "library/include/ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_data/device_grouped_conv_bwd_data_wmma_i8_instance.hpp" + ], + "instantiation_cpp_files": { + "2D": [ + "library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_data/wmma/device_grouped_conv2d_bwd_data_wmma_gnhwc_gkyxc_gnhwk_f16_instance.cpp", + "library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_data/wmma/device_grouped_conv2d_bwd_data_wmma_gnhwc_gkyxc_gnhwk_f16_1x1s1p0_instance.cpp", + "library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_data/wmma/device_grouped_conv2d_bwd_data_wmma_gnhwc_gkyxc_gnhwk_i8_instance.cpp", + "library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_data/wmma/device_grouped_conv2d_bwd_data_wmma_gnhwc_gkyxc_gnhwk_i8_1x1s1p0_instance.cpp", + "library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_data/wmma/device_grouped_conv2d_bwd_data_wmma_nhwgc_gkyxc_nhwgk_f16_instance.cpp", + 
"library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_data/wmma/device_grouped_conv2d_bwd_data_wmma_nhwgc_gkyxc_nhwgk_i8_instance.cpp" + ], + "3D": [ + "library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_data/wmma/*.cpp" + ] + }, + "wmma_configuration": { + "wmma_size": "16x16", + "K1_FP16": 8, + "K1_INT8": 16 + } + } + }, + + "backward_weight": { + "DeviceGroupedConvBwdWeight_Xdl_CShuffleV3": { + "implementation_file": "include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_weight_xdl_cshuffle_v3.hpp", + "description": "Latest XDL-based backward weight convolution with advanced optimizations (version 3)", + "instruction_set": "XDL/MFMA (AMD Matrix Cores)", + "version": "v3", + "status": "active_recommended", + "features": [ + "Latest XDL implementation with CShuffle", + "Split-K support with auto-deduction", + "Multiple pipeline versions (v1, v2, v3, v4)", + "Pipeline schedulers (Intrawave, Interwave)", + "Dual LDS buffer support (v4 pipeline)", + "Support for non-power-of-2 block sizes" + ], + "data_types": ["FP16", "BF16", "FP32", "TF32"], + "specializations": ["ConvBwdWeightDefault", "ConvBwdWeightFilter1x1Stride1Pad0"], + "pipeline_versions": ["v1", "v2", "v3", "v4"], + "pipeline_schedulers": ["Intrawave", "Interwave"], + "instance_header_files": [ + "library/include/ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_weight/device_grouped_conv_bwd_weight_v3_xdl_instance.hpp" + ], + "instantiation_cpp_files": { + "2D": [ + "library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/gnhwc_gkyxc_gnhwk/device_grouped_conv2d_bwd_weight_v3_xdl_gnhwc_gkyxc_gnhwk_f16_*.cpp", + "library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/nhwgc_gkyxc_nhwgk/device_grouped_conv2d_bwd_weight_v3_xdl_nhwgc_gkyxc_nhwgk_f16_*.cpp" + ], + "3D": [ + "library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/device_grouped_conv3d_bwd_weight_v3_xdl_*.cpp" + ] + }, + "sample_instances": { + "FP16": [ + { + 
"BlockSize": 64, + "MPerBlock": 32, + "NPerBlock": 32, + "K0PerBlock": 32, + "K1": 8, + "MPerXDL": 32, + "NPerXDL": 32, + "MXdlPerWave": 1, + "NXdlPerWave": 1 + }, + { + "BlockSize": 64, + "MPerBlock": 64, + "NPerBlock": 80, + "K0PerBlock": 32, + "K1": 8, + "MPerXDL": 16, + "NPerXDL": 16, + "MXdlPerWave": 4, + "NXdlPerWave": 5, + "note": "Irregular NPerBlock=80" + } + ] + } + }, + + "DeviceGroupedConvBwdWeight_Wmma_CShuffleV3": { + "implementation_file": "include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_weight_wmma_cshuffle_v3.hpp", + "description": "Latest WMMA-based backward weight convolution (version 3)", + "instruction_set": "WMMA (16x16 matrix operations)", + "version": "v3", + "status": "active_recommended", + "features": [ + "WMMA 16x16 matrix operations", + "CShuffle optimization", + "Pipeline versions and schedulers", + "Support for non-power-of-2 block sizes" + ], + "data_types": ["FP16", "BF16"], + "specializations": ["ConvBwdWeightDefault", "ConvBwdWeightFilter1x1Stride1Pad0"], + "instance_header_files": [ + "library/include/ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_weight/device_grouped_conv_bwd_weight_v3_wmma_instance.hpp" + ], + "instantiation_cpp_files": { + "2D": [ + "library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/wmma/nhwgc_gkyxc_nhwgk/device_grouped_conv2d_bwd_weight_wmma_nhwgc_gkyxc_nhwgk_f16_instance.cpp", + "library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/wmma/nhwgc_gkyxc_nhwgk/device_grouped_conv2d_bwd_weight_wmma_nhwgc_gkyxc_nhwgk_bf16_instance.cpp" + ], + "3D": [ + "library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/wmma/*.cpp" + ] + }, + "sample_instances": { + "FP16": [ + { + "BlockSize": 64, + "MPerBlock": 32, + "NPerBlock": 32, + "K0PerBlock": 32, + "K1": 8, + "MPerWmma": 16, + "NPerWmma": 16, + "MRepeat": 2, + "NRepeat": 1 + }, + { + "BlockSize": 128, + "MPerBlock": 96, + "NPerBlock": 128, + "K0PerBlock": 64, + "K1": 8, + "MRepeat": 6, + 
"NRepeat": 2, + "note": "Non-power-of-2 MPerBlock=96" + } + ] + } + }, + + "DeviceGroupedConvBwdWeight_Xdl_CShuffle": { + "implementation_file": "include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_weight_xdl_cshuffle.hpp", + "description": "Original XDL-based backward weight convolution (version 1)", + "instruction_set": "XDL/MFMA", + "version": "v1", + "status": "active_legacy", + "features": [ + "XDL/MFMA matrix core operations", + "CShuffle optimization", + "Supports transpose operations for NCHW layouts", + "Broader layout support than V3" + ], + "data_types": ["FP16", "BF16", "FP32", "TF32", "FP8", "BF8"], + "specializations": ["ConvBwdWeightDefault", "ConvBwdWeightFilter1x1Stride1Pad0"], + "instance_header_files": [ + "library/include/ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_weight/device_grouped_conv_bwd_weight_xdl_instance.hpp" + ], + "instantiation_cpp_files": { + "1D": [ + "library/src/tensor_operation_instance/gpu/grouped_conv1d_bwd_weight/xdl/device_grouped_conv1d_bwd_weight_xdl_gnwc_gkxc_gnwk_*.cpp" + ], + "2D": [ + "library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/gnhwc_gkyxc_gnhwk/device_grouped_conv2d_bwd_weight_xdl_gnhwc_gkyxc_gnhwk_*.cpp", + "library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/nhwgc_gkyxc_nhwgk/device_grouped_conv2d_bwd_weight_xdl_nhwgc_gkyxc_nhwgk_*.cpp", + "library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/ngchw_gkcyx_ngkhw/device_grouped_conv2d_bwd_weight_xdl_ngchw_gkcyx_ngkhw_*.cpp", + "library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/ngchw_gkyxc_ngkhw/device_grouped_conv2d_bwd_weight_xdl_ngchw_gkyxc_ngkhw_*.cpp" + ], + "3D": [ + "library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/*.cpp" + ] + }, + "supported_layouts": { + "2D": ["GNHWC/GKYXC/GNHWK", "NHWGC/GKYXC/NHWGK", "NGCHW/GKCYX/NGKHW", "NGCHW/GKYXC/NGKHW"], + "3D": ["GNDHWC/GKZYXC/GNDHWK", "NDHWGC/GKZYXC/NDHWGK"] + } + }, + + 
"DeviceGroupedConvBwdWeight_Wmma_CShuffle": { + "implementation_file": "include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_weight_wmma_cshuffle.hpp", + "description": "Original WMMA-based backward weight for 3D convolutions only", + "instruction_set": "WMMA (16x16 matrix operations)", + "version": "v1", + "status": "active_3d_only", + "features": [ + "WMMA 16x16 matrix operations", + "CShuffle optimization", + "Specialized for 3D convolutions ONLY", + "Different template structure than other WMMA variants" + ], + "data_types": ["FP16", "BF16", "FP32"], + "specializations": ["ConvBwdWeightDefault", "ConvBwdWeightFilter1x1Stride1Pad0"], + "spatial_dimensions": ["3D only"], + "supported_layouts": { + "3D_only": ["NDHWGC/GKZYXC/NDHWGK", "GNDHWC/GKZYXC/GNDHWK"] + }, + "note": "This is NOT a general-purpose WMMA algorithm. It only supports 3D convolutions." + }, + + "DeviceGroupedConvBwdWeight_TwoStage_Xdl_CShuffle": { + "implementation_file": "include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_weight_two_stage_xdl_cshuffle.hpp", + "description": "Two-stage XDL-based backward weight for large convolutions", + "instruction_set": "XDL/MFMA", + "version": "two_stage", + "status": "active", + "features": [ + "Two-stage computation with intermediate workspace", + "Better memory efficiency for large problems", + "XDL matrix core operations", + "Multiple pipeline versions (v1, v2, v5)", + "Group merging optimization (NumGroupsToMerge)", + "Supports irregular block sizes (e.g., MPerBlock=48, NPerBlock=80, NPerBlock=112)" + ], + "data_types": ["FP16", "BF16"], + "specializations": ["ConvBwdWeightDefault", "ConvBwdWeightFilter1x1Stride1Pad0"], + "pipeline_versions": ["v1", "v2", "v5"], + "instance_header_files": [ + "library/include/ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_weight/device_grouped_conv_bwd_weight_two_stage_xdl_instance.hpp" + ], + "instantiation_cpp_files": { + "2D": [ + 
"library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/nhwgc_gkyxc_nhwgk/device_grouped_conv2d_bwd_weight_two_stage_xdl_nhwgc_gkyxc_nhwgk_f16_pipev1_instance.cpp", + "library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/nhwgc_gkyxc_nhwgk/device_grouped_conv2d_bwd_weight_two_stage_xdl_nhwgc_gkyxc_nhwgk_f16_pipev2_instance.cpp", + "library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/nhwgc_gkyxc_nhwgk/device_grouped_conv2d_bwd_weight_two_stage_xdl_nhwgc_gkyxc_nhwgk_f16_pipev5_instance.cpp", + "library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/nhwgc_gkyxc_nhwgk/device_grouped_conv2d_bwd_weight_two_stage_xdl_nhwgc_gkyxc_nhwgk_bf16_pipev*.cpp", + "library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/ngchw_gkcyx_ngkhw/device_grouped_conv2d_bwd_weight_two_stage_xdl_ngchw_gkcyx_ngkhw_*.cpp", + "library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/ngchw_gkyxc_ngkhw/device_grouped_conv2d_bwd_weight_two_stage_xdl_ngchw_gkyxc_ngkhw_*.cpp" + ], + "3D": [ + "library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/*.cpp" + ] + }, + "special_features": { + "num_groups_to_merge": "Enables group merging for better performance (1, 2, 4, 8)", + "irregular_blocks": "Supports MPerBlock/NPerBlock values like 48, 80, 112, 208" + } + }, + + "DeviceGroupedConvBwdWeight_TwoStage_Wmma_CShuffleV3": { + "implementation_file": "include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_weight_two_stage_wmma_cshuffle_v3.hpp", + "description": "Two-stage WMMA-based backward weight convolution", + "instruction_set": "WMMA (16x16 matrix operations)", + "version": "two_stage_v3", + "status": "active", + "features": [ + "Two-stage computation", + "WMMA 16x16 matrix operations", + "Pipeline versions", + "Group merging optimization" + ], + "data_types": ["FP16", "BF16"], + "specializations": ["ConvBwdWeightDefault", "ConvBwdWeightFilter1x1Stride1Pad0"], + 
"instance_header_files": [ + "library/include/ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_weight/device_grouped_conv_bwd_weight_two_stage_wmma_instance.hpp" + ], + "instantiation_cpp_files": { + "2D": [ + "library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/wmma/nhwgc_gkyxc_nhwgk/device_grouped_conv2d_bwd_weight_two_stage_wmma_nhwgc_gkyxc_nhwgk_f16_pipev1_instance.cpp", + "library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/wmma/nhwgc_gkyxc_nhwgk/device_grouped_conv2d_bwd_weight_two_stage_wmma_nhwgc_gkyxc_nhwgk_bf16_pipev1_instance.cpp" + ], + "3D": [ + "library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/wmma/*.cpp" + ] + } + }, + + "DeviceGroupedConvBwdWeight_DL": { + "implementation_file": "include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_weight_dl.hpp", + "description": "Direct Load variant using different memory access pattern", + "instruction_set": "Direct Load (no shared memory for A/B matrices)", + "version": "dl", + "status": "active", + "features": [ + "Direct load memory access pattern", + "No shared memory for A/B matrices", + "Suitable for specific problem sizes", + "Lower shared memory pressure" + ], + "data_types": ["FP16", "BF16/FP32 (mixed)", "FP32"], + "specializations": ["ConvBwdWeightDefault", "ConvBwdWeightFilter1x1Stride1Pad0"], + "instance_header_files": [ + "library/include/ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_weight/device_grouped_conv_bwd_weight_dl_instance.hpp" + ], + "instantiation_cpp_files": { + "1D": [ + "library/src/tensor_operation_instance/gpu/grouped_conv1d_bwd_weight/dl/device_grouped_conv1d_bwd_weight_dl_gnwc_gkxc_gnwk_f16_instance.cpp", + "library/src/tensor_operation_instance/gpu/grouped_conv1d_bwd_weight/dl/device_grouped_conv1d_bwd_weight_dl_gnwc_gkxc_gnwk_f32_instance.cpp", + "library/src/tensor_operation_instance/gpu/grouped_conv1d_bwd_weight/dl/device_grouped_conv1d_bwd_weight_dl_gnwc_gkxc_gnwk_bf16_f32_bf16_instance.cpp" 
+ ], + "2D": [ + "library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/dl/device_grouped_conv2d_bwd_weight_dl_gnhwc_gkyxc_gnhwk_f16_instance.cpp", + "library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/dl/device_grouped_conv2d_bwd_weight_dl_gnhwc_gkyxc_gnhwk_f32_instance.cpp", + "library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/dl/device_grouped_conv2d_bwd_weight_dl_gnhwc_gkyxc_gnhwk_bf16_f32_bf16_instance.cpp", + "library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/dl/device_grouped_conv2d_bwd_weight_dl_nhwgc_gkyxc_nhwgk_*.cpp" + ], + "3D": [ + "library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/dl/*.cpp" + ] + }, + "sample_instances": { + "FP16": { + "BlockSize": 256, + "MPerBlock": 128, + "NPerBlock": 128, + "K0PerBlock": 16, + "K1": 1 + } + } + }, + + "DeviceGroupedConvBwdWeightMultipleD_Xdl_CShuffle": { + "implementation_file": "include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_weight_multiple_d_xdl_cshuffle.hpp", + "description": "XDL-based backward weight with Multiple D tensor support for fused operations", + "instruction_set": "XDL/MFMA", + "version": "multiple_d", + "status": "active", + "features": [ + "Supports additional input tensors (D tensors)", + "Fused element-wise operations (Bilinear, Scale)", + "XDL matrix core operations", + "CShuffle optimization" + ], + "data_types": ["FP16", "BF16", "FP32", "TF32", "FP8", "BF8"], + "specializations": ["ConvBwdWeightDefault", "ConvBwdWeightFilter1x1Stride1Pad0"], + "fused_operations": ["Bilinear", "Scale"], + "instance_header_files": [ + "library/include/ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_weight/device_grouped_conv_bwd_weight_xdl_bilinear_instance.hpp", + "library/include/ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_weight/device_grouped_conv_bwd_weight_xdl_scale_instance.hpp" + ], + "instantiation_cpp_files": { + "3D_bilinear": [ + 
"library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight_bilinear/xdl/*.cpp" + ], + "3D_scale": [ + "library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight_scale/xdl/*.cpp" + ] + }, + "sample_instances": { + "FP16_Bilinear": [ + { + "BlockSize": 64, + "MPerBlock": 64, + "NPerBlock": 64, + "K0PerBlock": 4, + "K1": 8, + "MPerXDL": 32, + "NPerXDL": 32, + "ElementwiseOp": "Bilinear" + } + ] + } + }, + + "DeviceGroupedConvBwdWeightMultipleD_Wmma_CShuffleV3": { + "implementation_file": "include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_weight_multiple_d_wmma_cshuffle_v3.hpp", + "description": "WMMA-based backward weight with Multiple D tensor support", + "instruction_set": "WMMA (16x16 matrix operations)", + "version": "multiple_d_v3", + "status": "active", + "features": [ + "WMMA 16x16 matrix operations", + "Supports fused Scale operation", + "CShuffle optimization" + ], + "data_types": ["FP16", "BF16"], + "specializations": ["ConvBwdWeightDefault", "ConvBwdWeightFilter1x1Stride1Pad0"], + "fused_operations": ["Scale"], + "instance_header_files": [ + "library/include/ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_weight/device_grouped_conv_bwd_weight_wmma_scale_instance.hpp" + ], + "instantiation_cpp_files": { + "3D_scale": [ + "library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight_scale/wmma/*.cpp" + ] + } + } + } + }, + + "template_parameters_detailed": { + "XDL_parameters": { + "NDimSpatial": { + "type": "index_t", + "description": "Number of spatial dimensions", + "values": [1, 2, 3] + }, + "InLayout": { + "type": "typename", + "description": "Input tensor layout", + "examples": ["GNHWC", "NHWGC", "NGCHW", "GNDHWC", "GNWC"] + }, + "WeiLayout": { + "type": "typename", + "description": "Weight tensor layout", + "examples": ["GKYXC", "GKCYX", "GKZYXC", "GKXC"] + }, + "OutLayout": { + "type": "typename", + "description": "Output tensor layout", + "examples": ["GNHWK", "NHWGK", "NGKHW", "GNDHWK", "GNWK"] 
+ }, + "InDataType": { + "type": "typename", + "description": "Input data type", + "values": ["F16", "BF16", "F32"] + }, + "WeiDataType": { + "type": "typename", + "description": "Weight data type", + "values": ["F16", "BF16", "F32"] + }, + "OutDataType": { + "type": "typename", + "description": "Output data type", + "values": ["F16", "BF16", "F32"] + }, + "AccDataType": { + "type": "typename", + "description": "Accumulator data type", + "values": ["F32", "I32"], + "note": "Typically F32 for floating point, I32 for INT8" + }, + "ConvolutionBackwardWeightSpecialization": { + "type": "enum", + "values": ["Default", "Filter1x1Stride1Pad0"] + }, + "BlockSize": { + "type": "index_t", + "description": "Total number of threads per block", + "typical_values": [32, 64, 96, 128, 256] + }, + "MPerBlock": { + "type": "index_t", + "description": "M dimension of GEMM tile per block", + "typical_values": [16, 32, 48, 64, 80, 96, 112, 128, 208, 256] + }, + "NPerBlock": { + "type": "index_t", + "description": "N dimension of GEMM tile per block", + "typical_values": [16, 32, 48, 64, 80, 96, 112, 128, 208, 256] + }, + "K0PerBlock": { + "type": "index_t", + "description": "K0 dimension blocking per block", + "typical_values": [4, 8, 16, 32, 64, 128] + }, + "K1": { + "type": "index_t", + "description": "Vector width in K dimension", + "FP16_BF16": 8, + "FP32": 4, + "INT8": 16 + }, + "MPerXDL": { + "type": "index_t", + "description": "M dimension per XDL instruction", + "values": [16, 32] + }, + "NPerXDL": { + "type": "index_t", + "description": "N dimension per XDL instruction", + "values": [16, 32] + }, + "MXdlPerWave": { + "type": "index_t", + "description": "Number of XDL tiles in M dimension per wave", + "typical_values": [1, 2, 3, 4, 5, 6, 7, 8, 13] + }, + "NXdlPerWave": { + "type": "index_t", + "description": "Number of XDL tiles in N dimension per wave", + "typical_values": [1, 2, 3, 4, 5, 7, 8, 13, 16] + }, + "ABlockTransferThreadClusterLengths_K0_M_K1": { + "type": 
"Sequence", + "description": "Thread cluster dimensions for A matrix block transfer", + "examples": [[4, 8, 1], [4, 16, 1], [4, 32, 1], [8, 8, 1]] + }, + "ABlockTransferThreadClusterArrangeOrder": { + "type": "Sequence", + "description": "Thread cluster arrange order for A", + "common_value": [2, 0, 1] + }, + "ABlockTransferSrcAccessOrder": { + "type": "Sequence", + "description": "Source access order for A block transfer", + "common_values": [[1, 0, 2], [2, 0, 1]] + }, + "ABlockTransferSrcVectorDim": { + "type": "index_t", + "description": "Vector dimension for A source", + "typical_value": 1 + }, + "ABlockTransferSrcScalarPerVector": { + "type": "index_t", + "description": "Vector load size for A matrix source", + "typical_values": [1, 2, 4, 8, 16] + }, + "ABlockTransferDstScalarPerVector_K1": { + "type": "index_t", + "description": "Vector store size for A matrix destination in K1 dimension", + "typical_values": [1, 2, 4, 8] + }, + "ABlockLdsAddExtraM": { + "type": "bool", + "description": "Add extra padding in M dimension for LDS to avoid bank conflicts", + "typical_values": [false, true] + }, + "BBlockTransferThreadClusterLengths_K0_N_K1": { + "type": "Sequence", + "description": "Thread cluster dimensions for B matrix block transfer", + "examples": [[4, 8, 1], [4, 16, 1], [4, 32, 1]] + }, + "BBlockTransferSrcScalarPerVector": { + "type": "index_t", + "description": "Vector load size for B matrix source", + "typical_values": [1, 2, 4, 8, 16] + }, + "BBlockTransferDstScalarPerVector_K1": { + "type": "index_t", + "description": "Vector store size for B matrix destination in K1 dimension", + "typical_values": [1, 2, 4, 8] + }, + "BBlockLdsAddExtraN": { + "type": "bool", + "description": "Add extra padding in N dimension for LDS to avoid bank conflicts" + }, + "CShuffleMXdlPerWavePerShuffle": { + "type": "index_t", + "description": "M XDL tiles per shuffle operation", + "typical_value": 1 + }, + "CShuffleNXdlPerWavePerShuffle": { + "type": "index_t", + 
"description": "N XDL tiles per shuffle operation", + "typical_value": 1 + }, + "CBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock": { + "type": "Sequence", + "description": "Thread cluster for C matrix output", + "examples": [[1, 8, 1, 8], [1, 16, 1, 16], [1, 32, 1, 8]] + }, + "CBlockTransferScalarPerVector_NWaveNPerXdl": { + "type": "index_t", + "description": "Vector store size for C matrix output", + "typical_values": [1, 2, 4, 8] + }, + "BlkGemmPipeSched": { + "type": "BlockGemmPipelineScheduler", + "description": "Block GEMM pipeline scheduler", + "values": ["Intrawave", "Interwave"] + }, + "BlkGemmPipelineVer": { + "type": "BlockGemmPipelineVersion", + "description": "Block GEMM pipeline version", + "values": ["v1", "v2", "v3", "v4", "v5"] + }, + "ComputeTypeA": { + "type": "typename", + "description": "Compute data type for A matrix", + "examples": ["F16", "BF16", "F32", "TF32", "BF8", "F8"], + "note": "Can differ from input type, e.g., TF32 for FP32 input, BF8/F8 for FP16" + }, + "ComputeTypeB": { + "type": "typename", + "description": "Compute data type for B matrix", + "examples": ["F16", "BF16", "F32", "TF32", "BF8", "F8"] + } + }, + + "WMMA_parameters": { + "BlockSize": { + "type": "index_t", + "description": "Total number of threads per block", + "typical_values": [32, 64, 96, 128, 256] + }, + "MPerBlock": { + "type": "index_t", + "description": "M dimension per block", + "typical_values": [16, 32, 48, 64, 96, 128, 256] + }, + "NPerBlock": { + "type": "index_t", + "description": "N dimension per block", + "typical_values": [16, 32, 64, 128, 256] + }, + "K0PerBlock": { + "type": "index_t", + "description": "K0 dimension per block", + "typical_values": [4, 8, 32, 64, 128] + }, + "K1": { + "type": "index_t", + "description": "Vector width", + "FP16_BF16": 8, + "INT8": 16 + }, + "MPerWmma": { + "type": "index_t", + "description": "M dimension per WMMA instruction", + "value": 16, + "note": "Fixed at 16 for WMMA" + }, + "NPerWmma": { + "type": 
"index_t", + "description": "N dimension per WMMA instruction", + "value": 16, + "note": "Fixed at 16 for WMMA" + }, + "MRepeat": { + "type": "index_t", + "description": "Number of WMMA operations in M dimension", + "typical_values": [1, 2, 3, 4, 6, 8] + }, + "NRepeat": { + "type": "index_t", + "description": "Number of WMMA operations in N dimension", + "typical_values": [1, 2, 4, 8] + } + }, + + "TwoStage_specific_parameters": { + "NumGroupsToMerge": { + "type": "index_t", + "description": "Number of groups to merge in two-stage algorithm for better performance", + "typical_values": [1, 2, 4, 8] + } + }, + + "DirectLoad_specific_parameters": { + "M1PerThread": { + "type": "index_t", + "description": "M1 dimension per thread in direct load pattern" + }, + "N1PerThread": { + "type": "index_t", + "description": "N1 dimension per thread in direct load pattern" + } + } + }, + + "file_references": { + "algorithm_implementation_headers": { + "path": "include/ck/tensor_operation/gpu/device/impl/", + "backward_data": [ + "device_grouped_conv_bwd_data_multiple_d_xdl_cshuffle_v1.hpp", + "device_grouped_conv_bwd_data_multiple_d_wmma_cshuffle.hpp" + ], + "backward_weight": [ + "device_grouped_conv_bwd_weight_xdl_cshuffle_v3.hpp", + "device_grouped_conv_bwd_weight_wmma_cshuffle_v3.hpp", + "device_grouped_conv_bwd_weight_xdl_cshuffle.hpp", + "device_grouped_conv_bwd_weight_wmma_cshuffle.hpp", + "device_grouped_conv_bwd_weight_two_stage_xdl_cshuffle.hpp", + "device_grouped_conv_bwd_weight_two_stage_wmma_cshuffle_v3.hpp", + "device_grouped_conv_bwd_weight_dl.hpp", + "device_grouped_conv_bwd_weight_multiple_d_xdl_cshuffle.hpp", + "device_grouped_conv_bwd_weight_multiple_d_wmma_cshuffle_v3.hpp" + ] + }, + + "instance_template_headers": { + "path": "library/include/ck/library/tensor_operation_instance/gpu/", + "backward_data": { + "directory": "grouped_conv_bwd_data/", + "files": [ + "device_grouped_conv_bwd_data_xdl_instance.hpp", + 
"device_grouped_conv_bwd_data_wmma_f16_instance.hpp", + "device_grouped_conv_bwd_data_wmma_i8_instance.hpp", + "device_grouped_conv_bwd_data_xdl_bilinear_instance.hpp", + "device_grouped_conv_bwd_data_xdl_scale_instance.hpp", + "device_grouped_conv_bwd_data_transpose_xdl_instance.hpp" + ] + }, + "backward_weight": { + "directory": "grouped_conv_bwd_weight/", + "files": [ + "device_grouped_conv_bwd_weight_v3_xdl_instance.hpp", + "device_grouped_conv_bwd_weight_v3_wmma_instance.hpp", + "device_grouped_conv_bwd_weight_xdl_instance.hpp", + "device_grouped_conv_bwd_weight_two_stage_xdl_instance.hpp", + "device_grouped_conv_bwd_weight_two_stage_wmma_instance.hpp", + "device_grouped_conv_bwd_weight_dl_instance.hpp", + "device_grouped_conv_bwd_weight_xdl_bilinear_instance.hpp", + "device_grouped_conv_bwd_weight_xdl_scale_instance.hpp", + "device_grouped_conv_bwd_weight_wmma_scale_instance.hpp", + "device_grouped_conv_bwd_weight_wmma_bilinear_instance.hpp" + ] + } + }, + + "instantiation_cpp_files": { + "path": "library/src/tensor_operation_instance/gpu/", + "backward_data": { + "1D": "grouped_conv1d_bwd_data/", + "2D": "grouped_conv2d_bwd_data/{wmma,xdl}/", + "3D": "grouped_conv3d_bwd_data/{wmma,xdl}/" + }, + "backward_weight": { + "1D": "grouped_conv1d_bwd_weight/{dl,xdl}/", + "2D": "grouped_conv2d_bwd_weight/{dl,wmma,xdl}/", + "3D": [ + "grouped_conv3d_bwd_weight/{dl,wmma,xdl}/", + "grouped_conv3d_bwd_weight_bilinear/", + "grouped_conv3d_bwd_weight_scale/" + ] + } + } + }, + + "layout_configurations": { + "backward_data_2D": { + "supported_combinations": [ + { + "output_layout": "GNHWK", + "weight_layout": "GKYXC", + "input_layout": "GNHWC", + "description": "Group-first NHWC layout" + }, + { + "output_layout": "NHWGK", + "weight_layout": "GKYXC", + "input_layout": "NHWGC", + "description": "NHWC with group dimension at end" + }, + { + "output_layout": "NGKHW", + "weight_layout": "GKCYX", + "input_layout": "NGCHW", + "description": "NCHW variant with grouped C" + }, + { 
+ "output_layout": "NGKHW", + "weight_layout": "GKYXC", + "input_layout": "NGCHW", + "description": "Mixed layout NCHW/filter-C-last" + } + ] + }, + "backward_weight_2D": { + "supported_combinations": [ + { + "input_layout": "NHWGC", + "weight_layout": "GKYXC", + "output_layout": "NHWGK", + "algorithms": ["All except Wmma_CShuffle (3D only)"] + }, + { + "input_layout": "GNHWC", + "weight_layout": "GKYXC", + "output_layout": "GNHWK", + "algorithms": ["XDL variants", "DL"] + }, + { + "input_layout": "NGCHW", + "weight_layout": "GKCYX", + "output_layout": "NGKHW", + "algorithms": ["XDL variants (with transpose support)"] + }, + { + "input_layout": "NGCHW", + "weight_layout": "GKYXC", + "output_layout": "NGKHW", + "algorithms": ["XDL variants (with transpose support)"] + } + ] + }, + "backward_weight_3D": { + "supported_combinations": [ + { + "input_layout": "GNDHWC", + "weight_layout": "GKZYXC", + "output_layout": "GNDHWK", + "algorithms": ["All"] + }, + { + "input_layout": "NDHWGC", + "weight_layout": "GKZYXC", + "output_layout": "NDHWGK", + "algorithms": ["All including Wmma_CShuffle"] + } + ] + } + }, + + "statistics": { + "total_algorithm_implementations": { + "backward_data": 2, + "backward_weight": 9, + "total": 11 + }, + "estimated_instance_counts": { + "backward_data": { + "XDL_variants": "~200+ instances (FP16, BF16, FP32, TF32, with various optimizations)", + "WMMA_variants": "~30 instances (FP16: 14, INT8: 13)" + }, + "backward_weight": { + "Xdl_CShuffleV3": "~20 instances (FP16, BF16, FP32 with pipeline variants)", + "Wmma_CShuffleV3": "~20 instances (FP16: 9, BF16: 10)", + "Xdl_CShuffle_v1": "~50+ instances (FP16, BF16, FP32, TF32, FP8/BF8)", + "Wmma_CShuffle_v1": "Limited (3D only)", + "TwoStage_Xdl": "~50+ instances (FP16, BF16 with pipev1/v2/v5, regular and irregular)", + "TwoStage_Wmma": "~2 instances (FP16, BF16 base configs)", + "DL": "~6 instances (FP16, BF16/FP32, FP32 for 1D/2D/3D)", + "MultipleD_Xdl": "~30 instances (Bilinear and Scale 
variants)", + "MultipleD_Wmma": "~10 instances (Scale variant)" + }, + "total_estimated": "400-500+ pre-compiled instances" + }, + + "data_type_support_matrix": { + "FP16": { + "backward_data": { + "XDL": true, + "WMMA": true + }, + "backward_weight": { + "Xdl_V3": true, + "Wmma_V3": true, + "Xdl_V1": true, + "Wmma_V1_3D": true, + "TwoStage_XDL": true, + "TwoStage_WMMA": true, + "DL": true, + "MultipleD_XDL": true, + "MultipleD_WMMA": true + } + }, + "BF16": { + "backward_data": { + "XDL": true, + "WMMA": false + }, + "backward_weight": { + "Xdl_V3": true, + "Wmma_V3": true, + "Xdl_V1": true, + "Wmma_V1_3D": true, + "TwoStage_XDL": true, + "TwoStage_WMMA": true, + "DL": "mixed (BF16/FP32)", + "MultipleD_XDL": "mixed (BF16/FP32)", + "MultipleD_WMMA": "mixed (BF16/FP32)" + } + }, + "FP32": { + "backward_data": { + "XDL": true, + "WMMA": false + }, + "backward_weight": { + "Xdl_V3": true, + "Wmma_V3": false, + "Xdl_V1": true, + "Wmma_V1_3D": true, + "TwoStage_XDL": false, + "TwoStage_WMMA": false, + "DL": true, + "MultipleD_XDL": true, + "MultipleD_WMMA": false + } + }, + "TF32": { + "backward_data": { + "XDL": "compute_type", + "note": "TF32 is compute type, input/output is FP32" + }, + "backward_weight": { + "Xdl_V1": "compute_type", + "MultipleD_XDL": "compute_type" + } + }, + "INT8": { + "backward_data": { + "WMMA": true + }, + "backward_weight": { + "note": "No INT8 backward weight support" + } + }, + "FP8_BF8": { + "backward_data": { + "XDL": "compute_type (newer GPUs)" + }, + "backward_weight": { + "Xdl_V1": "compute_type", + "MultipleD_XDL": "compute_type" + } + } + } + }, + + "instance_examples": { + "backward_data_XDL_FP16": { + "algorithm": "DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1", + "header_file": "library/include/ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_data/device_grouped_conv_bwd_data_xdl_instance.hpp", + "cpp_file": 
"library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_data/xdl/device_grouped_conv2d_bwd_data_xdl_gnhwc_gkyxc_gnhwk_f16_instance.cpp", + "instances": [ + { + "BlockSize": 64, + "MPerBlock": 64, + "NPerBlock": 64, + "K0PerBlock": 32, + "K1": 8, + "MPerXDL": 32, + "NPerXDL": 32, + "MXdlPerWave": 2, + "NXdlPerWave": 2, + "specialization": "ConvBwdDataDefault" + }, + { + "BlockSize": 256, + "MPerBlock": 128, + "NPerBlock": 256, + "K0PerBlock": 32, + "K1": 8, + "MPerXDL": 32, + "NPerXDL": 32, + "MXdlPerWave": 2, + "NXdlPerWave": 4 + } + ] + }, + + "backward_data_WMMA_FP16": { + "algorithm": "DeviceGroupedConvBwdDataMultipleD_Wmma_CShuffle", + "header_file": "library/include/ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_data/device_grouped_conv_bwd_data_wmma_f16_instance.hpp", + "cpp_file": "library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_data/wmma/device_grouped_conv2d_bwd_data_wmma_gnhwc_gkyxc_gnhwk_f16_instance.cpp", + "instances": [ + { + "BlockSize": 128, + "MPerBlock": 64, + "NPerBlock": 64, + "K0PerBlock": 4, + "K1": 8, + "MPerWmma": 16, + "NPerWmma": 16, + "MRepeat": 2, + "NRepeat": 2 + }, + { + "BlockSize": 256, + "MPerBlock": 128, + "NPerBlock": 256, + "K0PerBlock": 8, + "K1": 8, + "MRepeat": 4, + "NRepeat": 4 + } + ] + }, + + "backward_weight_XDL_V3_FP16": { + "algorithm": "DeviceGroupedConvBwdWeight_Xdl_CShuffleV3", + "header_file": "library/include/ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_weight/device_grouped_conv_bwd_weight_v3_xdl_instance.hpp", + "cpp_file": "library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/nhwgc_gkyxc_nhwgk/device_grouped_conv2d_bwd_weight_v3_xdl_nhwgc_gkyxc_nhwgk_f16_*.cpp", + "instances": [ + { + "BlockSize": 64, + "MPerBlock": 32, + "NPerBlock": 32, + "K0PerBlock": 32, + "K1": 8, + "MPerXDL": 32, + "NPerXDL": 32, + "MXdlPerWave": 1, + "NXdlPerWave": 1, + "pipeline_scheduler": "Intrawave", + "pipeline_version": "v1" + }, + { + "BlockSize": 64, + "MPerBlock": 64, + 
"NPerBlock": 80, + "K0PerBlock": 32, + "K1": 8, + "MPerXDL": 16, + "NPerXDL": 16, + "MXdlPerWave": 4, + "NXdlPerWave": 5, + "note": "Irregular NPerBlock=80" + } + ] + }, + + "backward_weight_WMMA_V3_FP16": { + "algorithm": "DeviceGroupedConvBwdWeight_Wmma_CShuffleV3", + "header_file": "library/include/ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_weight/device_grouped_conv_bwd_weight_v3_wmma_instance.hpp", + "cpp_file": "library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/wmma/nhwgc_gkyxc_nhwgk/device_grouped_conv2d_bwd_weight_wmma_nhwgc_gkyxc_nhwgk_f16_instance.cpp", + "instances": [ + { + "BlockSize": 64, + "MPerBlock": 32, + "NPerBlock": 32, + "K0PerBlock": 32, + "K1": 8, + "MPerWmma": 16, + "NPerWmma": 16, + "MRepeat": 2, + "NRepeat": 1 + }, + { + "BlockSize": 128, + "MPerBlock": 96, + "NPerBlock": 128, + "K0PerBlock": 64, + "K1": 8, + "MRepeat": 6, + "NRepeat": 2, + "note": "Irregular MPerBlock=96" + } + ] + }, + + "backward_weight_XDL_V1_FP16": { + "algorithm": "DeviceGroupedConvBwdWeight_Xdl_CShuffle", + "header_file": "library/include/ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_weight/device_grouped_conv_bwd_weight_xdl_instance.hpp", + "cpp_files": [ + "library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/nhwgc_gkyxc_nhwgk/device_grouped_conv2d_bwd_weight_xdl_nhwgc_gkyxc_nhwgk_f16_*.cpp", + "library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/ngchw_gkcyx_ngkhw/device_grouped_conv2d_bwd_weight_xdl_ngchw_gkcyx_ngkhw_f16_instance.cpp" + ], + "instances": [ + { + "BlockSize": 64, + "MPerBlock": 64, + "NPerBlock": 64, + "K0PerBlock": 4, + "K1": 8, + "MPerXDL": 32, + "NPerXDL": 32, + "MXdlPerWave": 2, + "NXdlPerWave": 2 + } + ] + }, + + "backward_weight_TwoStage_XDL_FP16": { + "algorithm": "DeviceGroupedConvBwdWeight_TwoStage_Xdl_CShuffle", + "header_file": 
"library/include/ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_weight/device_grouped_conv_bwd_weight_two_stage_xdl_instance.hpp", + "cpp_files": [ + "library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/nhwgc_gkyxc_nhwgk/device_grouped_conv2d_bwd_weight_two_stage_xdl_nhwgc_gkyxc_nhwgk_f16_pipev1_instance.cpp", + "library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/nhwgc_gkyxc_nhwgk/device_grouped_conv2d_bwd_weight_two_stage_xdl_nhwgc_gkyxc_nhwgk_f16_pipev2_instance.cpp", + "library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/nhwgc_gkyxc_nhwgk/device_grouped_conv2d_bwd_weight_two_stage_xdl_nhwgc_gkyxc_nhwgk_f16_pipev5_instance.cpp" + ], + "instances": [ + { + "BlockSize": 64, + "MPerBlock": 16, + "NPerBlock": 16, + "K0PerBlock": 32, + "K1": 8, + "MPerXDL": 16, + "NPerXDL": 16, + "MXdlPerWave": 1, + "NXdlPerWave": 1, + "NumGroupsToMerge": 1, + "pipeline_version": "v1" + }, + { + "BlockSize": 64, + "MPerBlock": 64, + "NPerBlock": 80, + "K0PerBlock": 32, + "K1": 8, + "MPerXDL": 16, + "NPerXDL": 16, + "MXdlPerWave": 4, + "NXdlPerWave": 5, + "NumGroupsToMerge": 1, + "note": "Irregular NPerBlock=80" + } + ] + }, + + "backward_weight_DL_FP16": { + "algorithm": "DeviceGroupedConvBwdWeight_DL", + "header_file": "library/include/ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_weight/device_grouped_conv_bwd_weight_dl_instance.hpp", + "cpp_files": [ + "library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/dl/device_grouped_conv2d_bwd_weight_dl_nhwgc_gkyxc_nhwgk_f16_instance.cpp", + "library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/dl/device_grouped_conv2d_bwd_weight_dl_gnhwc_gkyxc_gnhwk_f16_instance.cpp" + ], + "instances": [ + { + "BlockSize": 256, + "MPerBlock": 128, + "NPerBlock": 128, + "K0PerBlock": 16, + "K1": 1, + "M1PerThread": 4, + "N1PerThread": 4 + } + ] + }, + + "backward_weight_MultipleD_XDL_Bilinear_FP16": { + "algorithm": 
"DeviceGroupedConvBwdWeightMultipleD_Xdl_CShuffle", + "header_file": "library/include/ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_weight/device_grouped_conv_bwd_weight_xdl_bilinear_instance.hpp", + "cpp_files": [ + "library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight_bilinear/xdl/*.cpp" + ], + "instances": [ + { + "BlockSize": 64, + "MPerBlock": 64, + "NPerBlock": 64, + "K0PerBlock": 4, + "K1": 8, + "MPerXDL": 32, + "NPerXDL": 32, + "MXdlPerWave": 2, + "NXdlPerWave": 2, + "ElementwiseOp": "Bilinear", + "DsTensors": "Tuple" + } + ] + } + }, + + "key_optimizations": { + "CShuffle": { + "description": "Cross-lane shuffle for efficient data redistribution", + "benefit": "Reduces shared memory bank conflicts and improves data reuse" + }, + "Split-K": { + "description": "Parallelizes reduction dimension", + "algorithms": ["Xdl_CShuffleV3"], + "auto_deduction": true, + "benefit": "Better occupancy for large K dimensions" + }, + "Pipeline_Versions": { + "v1": "Basic pipeline", + "v2": "Enhanced prefetching with tail number support (supports tails 1-7)", + "v3": "Further optimizations", + "v4": "Dual LDS buffer support for improved throughput", + "v5": "Advanced prefetching for two-stage algorithms" + }, + "Pipeline_Schedulers": { + "Intrawave": "Schedule within wave for lower latency", + "Interwave": "Schedule across waves for better occupancy" + }, + "LDS_Padding": { + "description": "Extra padding to avoid bank conflicts", + "parameters": ["ABlockLdsAddExtraM", "BBlockLdsAddExtraN"] + }, + "Optimized_Loads": { + "description": "Specialized instances with optimized memory access patterns", + "variants": ["optimized_loads", "16_16 (16x16 XDL)", "vec_transpose"] + }, + "Two_Stage": { + "description": "Splits computation into two stages with intermediate workspace", + "benefit": "Reduces memory footprint for very large convolutions", + "workspace_requirement": true + }, + "Group_Merging": { + "description": "Merges multiple groups for better 
performance", + "parameter": "NumGroupsToMerge", + "algorithms": ["TwoStage variants"] + } + }, + + "usage_guidelines": { + "recommended_algorithms": { + "backward_data": { + "modern_GPUs": "DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1 with FP16/BF16", + "varied_sizes": "DeviceGroupedConvBwdDataMultipleD_Wmma_CShuffle", + "int8_quantization": "DeviceGroupedConvBwdDataMultipleD_Wmma_CShuffle" + }, + "backward_weight": { + "new_code_XDL": "DeviceGroupedConvBwdWeight_Xdl_CShuffleV3 (latest optimizations)", + "new_code_WMMA": "DeviceGroupedConvBwdWeight_Wmma_CShuffleV3", + "large_convolutions": "DeviceGroupedConvBwdWeight_TwoStage_Xdl_CShuffle", + "NCHW_layouts": "DeviceGroupedConvBwdWeight_Xdl_CShuffle (v1, has transpose support)", + "fused_operations": "DeviceGroupedConvBwdWeightMultipleD_Xdl_CShuffle (Bilinear/Scale)", + "3D_WMMA_only": "DeviceGroupedConvBwdWeight_Wmma_CShuffle (3D specific)", + "low_memory": "DeviceGroupedConvBwdWeight_DL" + } + }, + + "instance_selection": { + "guidelines": [ + "Match block sizes to problem dimensions for best occupancy", + "Larger blocks (256) generally better for large convolutions", + "Smaller blocks (64, 128) for small convolutions or limited resources", + "Consider occupancy vs register pressure tradeoffs", + "Use specialized 1x1s1p0 instances when applicable", + "Test multiple instances to find optimal configuration", + "V3 variants use latest optimizations and are recommended", + "Two-stage variants use intermediate workspace but save memory" + ] + }, + + "performance_tips": [ + "Enable Split-K auto-deduction for V3 XDL (set split_k=-1)", + "Use FP16/BF16 on modern AMD GPUs for best performance", + "TF32 compute can accelerate FP32 convolutions on supported hardware", + "For very large convolutions, consider two-stage algorithms", + "For fused operations (e.g., gradient scaling), use MultipleD variants", + "NCHW layouts require transpose support (use v1 XDL variants)", + "INT8 backward data is only available with WMMA" 
+ ] + } +} diff --git a/experimental/builder/instances/grouped_conv_bwd_algorithms_summary.md b/experimental/builder/instances/grouped_conv_bwd_algorithms_summary.md new file mode 100644 index 0000000000..8ada5248ec --- /dev/null +++ b/experimental/builder/instances/grouped_conv_bwd_algorithms_summary.md @@ -0,0 +1,460 @@ +# Grouped Convolution Backward Algorithms Summary + +This document provides a comprehensive overview of the backward convolution algorithms exposed by the Composable Kernel library for **grouped convolutions**. + +## Overview + +The library provides optimized GPU kernels for two types of backward convolution operations: +1. **Backward Data** (gradient with respect to input) +2. **Backward Weight** (gradient with respect to weights) + +All algorithms are part of the static library and have pre-compiled instances. + +## 1. Backward Data Convolution Algorithms + +### 1.1 DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1 + +**Implementation File:** `include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_data_multiple_d_xdl_cshuffle_v1.hpp` + +**Description:** XDL (Matrix Core) based backward data convolution with CShuffle optimization. 
+ +**Key Features:** +- Uses AMD Matrix Core Instructions (XDL/MFMA) +- CShuffle for efficient data movement +- Supports multiple spatial dimensions (1D, 2D, 3D) +- Multiple data types: FP16, BF16, FP32, TF32, FP8, BF8 +- Two specializations: + - `ConvBwdDataDefault`: General convolution + - `ConvBwdDataFilter1x1Stride1Pad0`: Optimized for 1x1 filters with stride 1 and no padding + +**Instance Files:** +- `device_grouped_conv_bwd_data_xdl_instance.hpp` - Main XDL instances (FP16, BF16, FP32, TF32) +- `device_grouped_conv_bwd_data_xdl_bilinear_instance.hpp` - Bilinear variants +- `device_grouped_conv_bwd_data_xdl_scale_instance.hpp` - Scale variants +- `device_grouped_conv_bwd_data_transpose_xdl_instance.hpp` - Transpose variants + +**Instantiation Sources:** +- `grouped_conv2d_bwd_data/xdl/*.cpp` - 2D convolution instances +- `grouped_conv3d_bwd_data/xdl/*.cpp` - 3D convolution instances + +### 1.2 DeviceGroupedConvBwdDataMultipleD_Wmma_CShuffle + +**Implementation File:** `include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_data_multiple_d_wmma_cshuffle.hpp` + +**Description:** WMMA (Wave Matrix Multiply Accumulate) based backward data convolution. + +**Key Features:** +- Uses WMMA instructions (16x16 matrix operations) +- More flexible for different block sizes +- Supports FP16 and INT8 data types +- Optimized for specific GPU architectures + +**Instance Files:** +- `device_grouped_conv_bwd_data_wmma_f16_instance.hpp` - FP16 WMMA instances +- `device_grouped_conv_bwd_data_wmma_i8_instance.hpp` - INT8 WMMA instances + +**Instantiation Sources:** +- `grouped_conv2d_bwd_data/wmma/*.cpp` - 2D convolution instances +- `grouped_conv3d_bwd_data/wmma/*.cpp` - 3D convolution instances + +## 2. 
Backward Weight Convolution Algorithms + +### 2.1 DeviceGroupedConvBwdWeight_Xdl_CShuffleV3 + +**Implementation File:** `include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_weight_xdl_cshuffle_v3.hpp` + +**Description:** Latest XDL-based backward weight convolution, version 3 with advanced optimizations. + +**Status:** ✅ **Active - Part of static library** (Recommended for new code) + +**Key Features:** +- Latest XDL implementation with CShuffle +- Support for split-K optimization with auto-deduction +- Multiple pipeline versions (v1, v2, v3, v4) +- Block GEMM pipeline schedulers (Intrawave, Interwave) +- Dual LDS buffer support (v4 pipeline) +- Data types: FP16, BF16, FP32, TF32 +- Two specializations: + - `ConvBwdWeightDefault`: General convolution + - `ConvBwdWeightFilter1x1Stride1Pad0`: Optimized for 1x1 filters + +**Instance Files:** +- `device_grouped_conv_bwd_weight_v3_xdl_instance.hpp` + +**Instantiation Sources (2D):** +- `grouped_conv2d_bwd_weight/xdl/gnhwc_gkyxc_gnhwk/device_grouped_conv2d_bwd_weight_v3_xdl_gnhwc_gkyxc_gnhwk_*.cpp` +- `grouped_conv2d_bwd_weight/xdl/nhwgc_gkyxc_nhwgk/device_grouped_conv2d_bwd_weight_v3_xdl_nhwgc_gkyxc_nhwgk_*.cpp` + +### 2.2 DeviceGroupedConvBwdWeight_Wmma_CShuffleV3 + +**Implementation File:** `include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_weight_wmma_cshuffle_v3.hpp` + +**Description:** Latest WMMA-based backward weight convolution, version 3. 
+ +**Status:** ✅ **Active - Part of static library** (Recommended for WMMA) + +**Key Features:** +- WMMA 16x16 matrix operations +- CShuffle optimization +- Pipeline schedulers and versions +- Data types: FP16, BF16 + +**Instance Files:** +- `device_grouped_conv_bwd_weight_v3_wmma_instance.hpp` + +**Instantiation Sources (2D):** +- `grouped_conv2d_bwd_weight/wmma/nhwgc_gkyxc_nhwgk/device_grouped_conv2d_bwd_weight_wmma_nhwgc_gkyxc_nhwgk_*.cpp` + +### 2.3 DeviceGroupedConvBwdWeight_Xdl_CShuffle + +**Implementation File:** `include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_weight_xdl_cshuffle.hpp` + +**Description:** Original XDL-based backward weight convolution (version 1). + +**Status:** ✅ **Active - Part of static library** (Legacy, but still maintained) + +**Key Features:** +- XDL/MFMA matrix core operations +- CShuffle optimization +- Data types: FP16, BF16, FP32, TF32, FP8/BF8 +- Supports transpose operations for NCHW layouts +- Two specializations (Default, Filter1x1Stride1Pad0) + +**Instance Files:** +- `device_grouped_conv_bwd_weight_xdl_instance.hpp` + +**Instantiation Sources (2D):** +- `grouped_conv2d_bwd_weight/xdl/gnhwc_gkyxc_gnhwk/device_grouped_conv2d_bwd_weight_xdl_gnhwc_gkyxc_gnhwk_*.cpp` +- `grouped_conv2d_bwd_weight/xdl/nhwgc_gkyxc_nhwgk/device_grouped_conv2d_bwd_weight_xdl_nhwgc_gkyxc_nhwgk_*.cpp` +- `grouped_conv2d_bwd_weight/xdl/ngchw_gkcyx_ngkhw/device_grouped_conv2d_bwd_weight_xdl_ngchw_gkcyx_ngkhw_*.cpp` +- `grouped_conv2d_bwd_weight/xdl/ngchw_gkyxc_ngkhw/device_grouped_conv2d_bwd_weight_xdl_ngchw_gkyxc_ngkhw_*.cpp` + +**Instantiation Sources (1D & 3D):** +- `grouped_conv1d_bwd_weight/xdl/*.cpp` +- `grouped_conv3d_bwd_weight/xdl/*.cpp` + +### 2.4 DeviceGroupedConvBwdWeight_Wmma_CShuffle + +**Implementation File:** `include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_weight_wmma_cshuffle.hpp` + +**Description:** Original WMMA-based backward weight convolution for 3D convolutions only. 
+ +**Status:** ✅ **Active - Part of static library** (3D-specific) + +**Key Features:** +- WMMA 16x16 matrix operations +- CShuffle optimization +- **Specialized for 3D convolutions only** +- Data types: FP16, BF16, FP32 +- Two specializations (Default, Filter1x1Stride1Pad0) + +**Supported Layouts (3D only):** +- NDHWGC/GKZYXC/NDHWGK +- GNDHWC/GKZYXC/GNDHWK + +**Note:** This algorithm is specific to 3D convolutions and uses different template parameter structure than other WMMA variants. + +### 2.5 DeviceGroupedConvBwdWeight_TwoStage_Xdl_CShuffle + +**Implementation File:** `include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_weight_two_stage_xdl_cshuffle.hpp` + +**Description:** Two-stage XDL-based backward weight for large convolutions with memory constraints. + +**Status:** ✅ **Active - Part of static library** + +**Key Features:** +- Two-stage computation with intermediate workspace +- Better memory efficiency for large problems +- XDL matrix core operations +- Multiple pipeline versions (v1, v2, v5) +- Group merging optimization (NumGroupsToMerge parameter) +- Data types: FP16, BF16 +- Supports irregular MPerBlock/NPerBlock configurations + +**Instance Files:** +- `device_grouped_conv_bwd_weight_two_stage_xdl_instance.hpp` + +**Instantiation Sources (2D):** +- `grouped_conv2d_bwd_weight/xdl/nhwgc_gkyxc_nhwgk/device_grouped_conv2d_bwd_weight_two_stage_xdl_nhwgc_gkyxc_nhwgk_*_pipev*.cpp` +- `grouped_conv2d_bwd_weight/xdl/ngchw_gkcyx_ngkhw/device_grouped_conv2d_bwd_weight_two_stage_xdl_ngchw_gkcyx_ngkhw_*_pipev*.cpp` +- `grouped_conv2d_bwd_weight/xdl/ngchw_gkyxc_ngkhw/device_grouped_conv2d_bwd_weight_two_stage_xdl_ngchw_gkyxc_ngkhw_*_pipev*.cpp` + +**Instantiation Sources (3D):** +- `grouped_conv3d_bwd_weight/xdl/*.cpp` + +### 2.6 DeviceGroupedConvBwdWeight_TwoStage_Wmma_CShuffleV3 + +**Implementation File:** `include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_weight_two_stage_wmma_cshuffle_v3.hpp` + +**Description:** Two-stage 
WMMA-based backward weight convolution. + +**Status:** ✅ **Active - Part of static library** + +**Key Features:** +- Two-stage computation +- WMMA 16x16 matrix operations +- Pipeline versions +- Group merging optimization +- Data types: FP16, BF16 + +**Instance Files:** +- `device_grouped_conv_bwd_weight_two_stage_wmma_instance.hpp` + +**Instantiation Sources (2D):** +- `grouped_conv2d_bwd_weight/wmma/nhwgc_gkyxc_nhwgk/device_grouped_conv2d_bwd_weight_two_stage_wmma_nhwgc_gkyxc_nhwgk_*_pipev*.cpp` + +**Instantiation Sources (3D):** +- `grouped_conv3d_bwd_weight/wmma/*.cpp` + +### 2.7 DeviceGroupedConvBwdWeight_DL + +**Implementation File:** `include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_weight_dl.hpp` + +**Description:** Direct Load variant using different memory access pattern. + +**Status:** ✅ **Active - Part of static library** + +**Key Features:** +- Direct load memory access pattern (no shared memory for A/B) +- Suitable for specific problem sizes +- Supports 1D, 2D, and 3D convolutions +- Data types: FP16, BF16/FP32 (mixed precision), FP32 +- Two specializations (Default, Filter1x1Stride1Pad0) + +**Instance Files:** +- `device_grouped_conv_bwd_weight_dl_instance.hpp` + +**Instantiation Sources:** +- `grouped_conv1d_bwd_weight/dl/device_grouped_conv1d_bwd_weight_dl_*.cpp` +- `grouped_conv2d_bwd_weight/dl/device_grouped_conv2d_bwd_weight_dl_*.cpp` +- `grouped_conv3d_bwd_weight/dl/device_grouped_conv3d_bwd_weight_dl_*.cpp` + +### 2.8 DeviceGroupedConvBwdWeightMultipleD_Xdl_CShuffle + +**Implementation File:** `include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_weight_multiple_d_xdl_cshuffle.hpp` + +**Description:** XDL-based backward weight with Multiple D tensor support for fused operations. 
+ +**Status:** ✅ **Active - Part of static library** + +**Key Features:** +- Supports additional input tensors (D tensors) for fused operations +- Fused element-wise operations (Bilinear, Scale) +- XDL matrix core operations +- Data types: FP16, BF16, FP32, TF32, FP8/BF8 +- Two specializations (Default, Filter1x1Stride1Pad0) + +**Instance Files:** +- `device_grouped_conv_bwd_weight_xdl_bilinear_instance.hpp` - Bilinear fusion +- `device_grouped_conv_bwd_weight_xdl_scale_instance.hpp` - Scale fusion + +**Instantiation Sources:** +- `grouped_conv3d_bwd_weight_bilinear/*.cpp` - 3D Bilinear variants +- `grouped_conv3d_bwd_weight_scale/*.cpp` - 3D Scale variants + +### 2.9 DeviceGroupedConvBwdWeightMultipleD_Wmma_CShuffleV3 + +**Implementation File:** `include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_weight_multiple_d_wmma_cshuffle_v3.hpp` + +**Description:** WMMA-based backward weight with Multiple D tensor support for fused operations. + +**Status:** ✅ **Active - Part of static library** + +**Key Features:** +- WMMA 16x16 matrix operations +- Supports fused Scale operation +- Data types: FP16, BF16 + +**Instance Files:** +- `device_grouped_conv_bwd_weight_wmma_scale_instance.hpp` + +**Instantiation Sources:** +- `grouped_conv3d_bwd_weight_scale/*.cpp` - 3D Scale variants + +## 3. 
Algorithm Comparison + +| Algorithm | Version | Instruction Set | Data Types | Fused Ops | Split-K | Dimensions | Status | +|-----------|---------|----------------|------------|-----------|---------|------------|--------| +| **Backward Data** | | | | | | | | +| DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1 | v1 | XDL/MFMA | FP16, BF16, FP32, TF32, FP8, BF8 | Yes | No | 1D, 2D, 3D | ✅ Active | +| DeviceGroupedConvBwdDataMultipleD_Wmma_CShuffle | - | WMMA | FP16, INT8 | No | No | 1D, 2D, 3D | ✅ Active | +| **Backward Weight** | | | | | | | | +| DeviceGroupedConvBwdWeight_Xdl_CShuffleV3 | v3 | XDL/MFMA | FP16, BF16, FP32, TF32 | No | Yes (auto) | 1D, 2D, 3D | ✅ Active (Recommended) | +| DeviceGroupedConvBwdWeight_Wmma_CShuffleV3 | v3 | WMMA | FP16, BF16 | No | No | 1D, 2D, 3D | ✅ Active (Recommended) | +| DeviceGroupedConvBwdWeight_Xdl_CShuffle | v1 | XDL/MFMA | FP16, BF16, FP32, TF32, FP8, BF8 | No | No | 1D, 2D, 3D | ✅ Active (Legacy) | +| DeviceGroupedConvBwdWeight_Wmma_CShuffle | v1 | WMMA | FP16, BF16, FP32 | No | No | 3D only | ✅ Active (3D-specific) | +| DeviceGroupedConvBwdWeight_TwoStage_Xdl_CShuffle | Two-stage | XDL/MFMA | FP16, BF16 | No | No | 1D, 2D, 3D | ✅ Active | +| DeviceGroupedConvBwdWeight_TwoStage_Wmma_CShuffleV3 | Two-stage | WMMA | FP16, BF16 | No | No | 1D, 2D, 3D | ✅ Active | +| DeviceGroupedConvBwdWeight_DL | - | Direct Load | FP16, BF16, FP32 | No | No | 1D, 2D, 3D | ✅ Active | +| DeviceGroupedConvBwdWeightMultipleD_Xdl_CShuffle | - | XDL/MFMA | FP16, BF16, FP32, TF32, FP8, BF8 | Yes | No | 1D, 2D, 3D | ✅ Active | +| DeviceGroupedConvBwdWeightMultipleD_Wmma_CShuffleV3 | v3 | WMMA | FP16, BF16 | Yes (Scale) | No | 3D | ✅ Active | + +## 4. 
Supported Configurations + +### Data Types +- **FP16** (half_t): 16-bit floating point +- **BF16** (bhalf_t): Brain float 16 +- **FP32** (float): 32-bit floating point +- **TF32** (tf32_t): TensorFloat-32 (compute type) +- **INT8** (int8_t): 8-bit integer (WMMA bwd data only) +- **FP8/BF8**: 8-bit floating point (compute type for newer GPUs) + +### Tensor Layouts + +**Backward Data:** +- Input: GNHWC, NHWGC, NGCHW (2D), GNDHWC, NDHWGC (3D), GNWC (1D) +- Weight: GKYXC, GKCYX (2D), GKZYXC (3D), GKXC (1D) +- Output: GNHWK, NHWGK, NGKHW (2D), GNDHWK, NDHWGK (3D), GNWK (1D) + +**Backward Weight:** +- Input: GNHWC, NHWGC, NGCHW (2D), GNDHWC, NDHWGC (3D), GNWC (1D) +- Weight: GKYXC, GKCYX (2D), GKZYXC (3D), GKXC (1D) +- Output: GNHWK, NHWGK, NGKHW (2D), GNDHWK, NDHWGK (3D), GNWK (1D) + +### Spatial Dimensions +- **1D Convolution**: NDimSpatial = 1 +- **2D Convolution**: NDimSpatial = 2 +- **3D Convolution**: NDimSpatial = 3 + +### Specializations +1. **Default**: General purpose convolution +2. **Filter1x1Stride1Pad0**: Optimized for 1x1 convolution with stride 1 and no padding + +## 5. Instance Organization + +Instances are organized by: +- **Spatial dimension** (1D, 2D, 3D) +- **Data type** (FP16, BF16, FP32, INT8, etc.) +- **Tensor layout** combination +- **Specialization** (Default, 1x1S1P0) +- **Element-wise operations** (PassThrough, Bilinear, Scale) + +Each instance specifies detailed template parameters including: +- Block sizes (BlockSize, MPerBlock, NPerBlock, K0PerBlock) +- Wave/Warp configurations (MXdlPerWave, NXdlPerWave or MRepeat, NRepeat) +- Thread cluster arrangements +- Vector access patterns +- LDS (Local Data Share) optimizations +- CShuffle parameters + +## 6. 
Key Template Parameters + +### Common Parameters (XDL variants) +- **BlockSize**: Total threads per block (64, 128, 256) +- **MPerBlock, NPerBlock**: GEMM tile sizes per block +- **K0PerBlock**: K dimension blocking +- **K1**: Vector width in K dimension (typically 8 for FP16, 4 for FP32) +- **MPerXDL, NPerXDL**: Matrix dimensions per XDL instruction (16x16 or 32x32) +- **MXdlPerWave, NXdlPerWave**: Number of XDL tiles per wave +- **Pipeline Version**: v1, v2, v3, v4, v5 (different prefetch strategies) +- **Pipeline Scheduler**: Intrawave vs Interwave + +### Common Parameters (WMMA variants) +- **BlockSize**: Total threads per block (32, 64, 96, 128, 256) +- **MPerBlock, NPerBlock**: GEMM tile sizes +- **K0PerBlock**: K dimension blocking +- **K1**: Vector width (typically 8 for FP16, 16 for INT8) +- **MPerWmma, NPerWmma**: WMMA tile size (16x16) +- **MRepeat, NRepeat**: Repetition factors + +### Two-Stage Specific Parameters +- **NumGroupsToMerge**: Number of groups to merge for better performance + +## 7. Performance Considerations + +### Choosing the Right Algorithm + +**For Backward Data:** +1. **XDL variant**: Best for modern AMD GPUs with Matrix Core support (MI100, MI200, MI300 series) +2. **WMMA variant**: Good for varied problem sizes and broader compatibility +3. **Use FP16/BF16** for best performance on modern hardware + +**For Backward Weight:** +1. **V3 variants (XDL or WMMA)**: Recommended for new code, latest optimizations +2. **Two-Stage variants**: Best for very large convolutions with memory constraints +3. **V1 XDL**: Good alternative with broader layout support (including NCHW) +4. **DL variant**: Specific use cases, no shared memory overhead +5. 
**MultipleD variants**: When you need fused operations (Bilinear, Scale) + +### Optimization Features +- **Split-K**: Parallelizes the reduction dimension for better occupancy (V3 XDL only, auto-deduced) +- **CShuffle**: Optimized cross-lane shuffle for data redistribution +- **Pipeline Versions**: Different prefetch strategies to hide memory latency + - v1: Basic pipeline + - v2: Enhanced prefetching with tail number support (1-7) + - v3: Further optimizations + - v4: Dual LDS buffer support + - v5: Advanced prefetching +- **LDS Padding**: Avoid bank conflicts in shared memory +- **Two-Stage**: Reduces memory footprint for large problems + +## 8. Library Structure + +``` +library/ +├── include/ck/library/tensor_operation_instance/gpu/ +│ ├── grouped_conv_bwd_data/ +│ │ ├── device_grouped_conv_bwd_data_xdl_instance.hpp +│ │ ├── device_grouped_conv_bwd_data_wmma_f16_instance.hpp +│ │ ├── device_grouped_conv_bwd_data_wmma_i8_instance.hpp +│ │ └── ... (other variants) +│ └── grouped_conv_bwd_weight/ +│ ├── device_grouped_conv_bwd_weight_v3_xdl_instance.hpp +│ ├── device_grouped_conv_bwd_weight_v3_wmma_instance.hpp +│ ├── device_grouped_conv_bwd_weight_xdl_instance.hpp +│ ├── device_grouped_conv_bwd_weight_two_stage_xdl_instance.hpp +│ ├── device_grouped_conv_bwd_weight_two_stage_wmma_instance.hpp +│ ├── device_grouped_conv_bwd_weight_dl_instance.hpp +│ ├── device_grouped_conv_bwd_weight_xdl_bilinear_instance.hpp +│ ├── device_grouped_conv_bwd_weight_xdl_scale_instance.hpp +│ ├── device_grouped_conv_bwd_weight_wmma_scale_instance.hpp +│ └── device_grouped_conv_bwd_weight_wmma_bilinear_instance.hpp +└── src/tensor_operation_instance/gpu/ + ├── grouped_conv1d_bwd_weight/ + ├── grouped_conv2d_bwd_data/ + │ ├── wmma/ (WMMA instances for 2D) + │ └── xdl/ (XDL instances for 2D) + ├── grouped_conv2d_bwd_weight/ + │ ├── dl/ (Direct load instances) + │ ├── wmma/ (WMMA instances for 2D) + │ └── xdl/ (XDL instances for 2D - multiple layout subdirs) + ├── grouped_conv3d_bwd_data/ 
+ │ ├── wmma/ (WMMA instances for 3D) + │ └── xdl/ (XDL instances for 3D) + ├── grouped_conv3d_bwd_weight/ + │ ├── dl/ (Direct load) + │ ├── wmma/ (WMMA) + │ └── xdl/ (XDL) + ├── grouped_conv3d_bwd_weight_bilinear/ + └── grouped_conv3d_bwd_weight_scale/ +``` + +## Summary + +The Composable Kernel library provides a comprehensive set of optimized grouped convolution backward kernels: + +### Backward Data Algorithms: 2 +- **XDL variant**: ~200+ instances across all data types and layouts +- **WMMA variant**: ~30 instances for FP16 and INT8 + +### Backward Weight Algorithms: 9 (all part of static library) +1. **DeviceGroupedConvBwdWeight_Xdl_CShuffleV3** - Latest XDL (recommended) +2. **DeviceGroupedConvBwdWeight_Wmma_CShuffleV3** - Latest WMMA (recommended) +3. **DeviceGroupedConvBwdWeight_Xdl_CShuffle** - Original XDL (legacy but maintained) +4. **DeviceGroupedConvBwdWeight_Wmma_CShuffle** - Original WMMA (3D only) +5. **DeviceGroupedConvBwdWeight_TwoStage_Xdl_CShuffle** - Two-stage XDL +6. **DeviceGroupedConvBwdWeight_TwoStage_Wmma_CShuffleV3** - Two-stage WMMA +7. **DeviceGroupedConvBwdWeight_DL** - Direct load variant +8. **DeviceGroupedConvBwdWeightMultipleD_Xdl_CShuffle** - XDL with fused ops +9. 
**DeviceGroupedConvBwdWeightMultipleD_Wmma_CShuffleV3** - WMMA with fused ops + +**Total estimated instances:** 400-500+ across all algorithms, data types, layouts, and specializations + +**Key differentiators:** +- V3 variants: Latest optimizations, recommended for new code +- Two-stage variants: Better for very large convolutions +- MultipleD variants: Support fused element-wise operations +- DL variant: No shared memory overhead +- Wide range of data types (FP16, BF16, FP32, INT8, FP8/BF8) +- Various tensor layout combinations +- Advanced optimizations (CShuffle, Split-K, Pipeline tuning) diff --git a/experimental/builder/instances/grouped_conv_bwd_data_instances.json b/experimental/builder/instances/grouped_conv_bwd_data_instances.json new file mode 100644 index 0000000000..366f6f5bb7 --- /dev/null +++ b/experimental/builder/instances/grouped_conv_bwd_data_instances.json @@ -0,0 +1,473 @@ +{ + "description": "Complete listing of all backward data convolution instances in the Composable Kernel library", + "note": "All instances are pre-compiled and part of the static library", + + "algorithms": { + "DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1": { + "implementation_file": "include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_data_multiple_d_xdl_cshuffle_v1.hpp", + "instruction_set": "XDL/MFMA", + "status": "active", + + "instances": { + "FP16_generic": { + "data_type": "FP16", + "accumulator": "FP32", + "cshuffle_type": "FP16", + "header_file": "library/include/ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_data/device_grouped_conv_bwd_data_xdl_instance.hpp", + "template_alias": "device_grouped_conv_bwd_data_xdl_f16_generic_instances", + "instance": { + "BlockSize": 64, + "MPerBlock": 64, + "NPerBlock": 64, + "K0PerBlock": 32, + "AK1": 8, + "BK1": 8, + "MPerXDL": 32, + "NPerXDL": 32, + "MXdlPerWave": 2, + "NXdlPerWave": 2, + "ABlockTransferThreadClusterLengths_K0_M_K1": [4, 16, 1], + "ABlockTransferThreadClusterArrangeOrder": [1, 
0, 2], + "ABlockTransferSrcAccessOrder": [1, 0, 2], + "ABlockTransferSrcVectorDim": 2, + "ABlockTransferSrcScalarPerVector": 1, + "ABlockTransferDstScalarPerVector_K1": 8, + "ABlockLdsAddExtraM": 1, + "BBlockTransferThreadClusterLengths_K0_N_K1": [4, 8, 1], + "BBlockTransferThreadClusterArrangeOrder": [0, 2, 1], + "BBlockTransferSrcAccessOrder": [0, 2, 1], + "BBlockTransferSrcVectorDim": 1, + "BBlockTransferSrcScalarPerVector": 1, + "BBlockTransferDstScalarPerVector_K1": 8, + "BBlockLdsAddExtraN": 1, + "CShuffleMXdlPerWavePerShuffle": 1, + "CShuffleNXdlPerWavePerShuffle": 1, + "CDEBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock": [1, 16, 1, 4], + "CDEBlockTransferScalarPerVector_NPerBlock": 1, + "specialization": "ConvBwdDataDefault or Filter1x1Stride1Pad0" + } + }, + + "FP16_16x16": { + "data_type": "FP16", + "xdl_size": "16x16", + "header_file": "library/include/ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_data/device_grouped_conv_bwd_data_xdl_instance.hpp", + "template_alias": "device_grouped_conv_bwd_data_xdl_f16_16_16_instances", + "instances": [ + { + "BlockSize": 64, + "MPerBlock": 16, + "NPerBlock": 64, + "K0PerBlock": 32, + "AK1": 8, + "BK1": 8, + "MPerXDL": 16, + "NPerXDL": 16, + "MXdlPerWave": 1, + "NXdlPerWave": 4, + "ABlockTransferSrcScalarPerVector": 8, + "BBlockTransferSrcScalarPerVector": 8, + "CDEBlockTransferScalarPerVector_NPerBlock": 4 + }, + { + "BlockSize": 64, + "MPerBlock": 16, + "NPerBlock": 64, + "K0PerBlock": 32, + "MPerXDL": 16, + "NPerXDL": 16, + "MXdlPerWave": 1, + "NXdlPerWave": 4, + "ABlockTransferSrcScalarPerVector": 8, + "BBlockTransferSrcScalarPerVector": 1, + "CDEBlockTransferScalarPerVector_NPerBlock": 1 + }, + { + "BlockSize": 64, + "MPerBlock": 16, + "NPerBlock": 64, + "K0PerBlock": 32, + "MXdlPerWave": 1, + "NXdlPerWave": 4, + "ABlockTransferSrcScalarPerVector": 1, + "BBlockTransferSrcScalarPerVector": 8, + "CDEBlockTransferScalarPerVector_NPerBlock": 4 + }, + { + "BlockSize": 64, + "MPerBlock": 64, 
+ "NPerBlock": 16, + "K0PerBlock": 32, + "MXdlPerWave": 4, + "NXdlPerWave": 1, + "ABlockTransferSrcScalarPerVector": 8, + "BBlockTransferSrcScalarPerVector": 8, + "CDEBlockTransferScalarPerVector_NPerBlock": 4 + }, + { + "BlockSize": 64, + "MPerBlock": 64, + "NPerBlock": 16, + "K0PerBlock": 32, + "MXdlPerWave": 4, + "NXdlPerWave": 1, + "ABlockTransferSrcScalarPerVector": 8, + "BBlockTransferSrcScalarPerVector": 1, + "CDEBlockTransferScalarPerVector_NPerBlock": 1 + }, + { + "BlockSize": 64, + "MPerBlock": 64, + "NPerBlock": 16, + "K0PerBlock": 32, + "MXdlPerWave": 4, + "NXdlPerWave": 1, + "ABlockTransferSrcScalarPerVector": 1, + "BBlockTransferSrcScalarPerVector": 8, + "CDEBlockTransferScalarPerVector_NPerBlock": 4 + } + ] + }, + + "FP16_32x32_optimized_loads": { + "data_type": "FP16", + "xdl_size": "32x32", + "variant": "optimized memory loads", + "header_file": "library/include/ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_data/device_grouped_conv_bwd_data_xdl_instance.hpp", + "template_alias": "device_grouped_conv_bwd_data_xdl_f16_optimized_loads_instances", + "instances": [ + { + "BlockSize": 256, + "MPerBlock": 128, + "NPerBlock": 32, + "K0PerBlock": 64, + "AK1": 16, + "BK1": 16, + "MPerXDL": 32, + "NPerXDL": 32, + "MXdlPerWave": 1, + "NXdlPerWave": 1, + "ABlockTransferSrcScalarPerVector": 16, + "ABlockTransferDstScalarPerVector_K1": 4, + "BBlockTransferSrcScalarPerVector": 8, + "BBlockTransferDstScalarPerVector_K1": 1, + "CDEBlockTransferScalarPerVector_NPerBlock": 8 + }, + { + "BlockSize": 256, + "MPerBlock": 128, + "NPerBlock": 32, + "K0PerBlock": 32, + "AK1": 8, + "BK1": 8, + "MXdlPerWave": 1, + "NXdlPerWave": 1, + "ABlockTransferSrcScalarPerVector": 8, + "ABlockTransferDstScalarPerVector_K1": 4, + "BBlockTransferSrcScalarPerVector": 8, + "BBlockTransferDstScalarPerVector_K1": 1, + "CDEBlockTransferScalarPerVector_NPerBlock": 8 + }, + { + "BlockSize": 256, + "MPerBlock": 128, + "NPerBlock": 32, + "K0PerBlock": 16, + "AK1": 4, + "BK1": 4, + 
"MXdlPerWave": 1, + "NXdlPerWave": 1, + "ABlockTransferSrcScalarPerVector": 4, + "ABlockTransferDstScalarPerVector_K1": 4, + "BBlockTransferSrcScalarPerVector": 8, + "BBlockTransferDstScalarPerVector_K1": 1, + "CDEBlockTransferScalarPerVector_NPerBlock": 8 + }, + { + "BlockSize": 256, + "MPerBlock": 128, + "NPerBlock": 32, + "K0PerBlock": 64, + "AK1": 16, + "BK1": 16, + "ABlockTransferSrcScalarPerVector": 16, + "ABlockTransferDstScalarPerVector_K1": 4, + "BBlockTransferSrcScalarPerVector": 2, + "BBlockTransferDstScalarPerVector_K1": 1, + "CDEBlockTransferScalarPerVector_NPerBlock": 2 + }, + { + "BlockSize": 256, + "MPerBlock": 128, + "NPerBlock": 32, + "K0PerBlock": 32, + "ABlockTransferSrcScalarPerVector": 8, + "ABlockTransferDstScalarPerVector_K1": 4, + "BBlockTransferSrcScalarPerVector": 2, + "BBlockTransferDstScalarPerVector_K1": 1, + "CDEBlockTransferScalarPerVector_NPerBlock": 2 + }, + { + "BlockSize": 256, + "MPerBlock": 128, + "NPerBlock": 32, + "K0PerBlock": 16, + "ABlockTransferSrcScalarPerVector": 4, + "ABlockTransferDstScalarPerVector_K1": 4, + "BBlockTransferSrcScalarPerVector": 2, + "BBlockTransferDstScalarPerVector_K1": 1, + "CDEBlockTransferScalarPerVector_NPerBlock": 2 + }, + { + "BlockSize": 256, + "MPerBlock": 128, + "NPerBlock": 32, + "K0PerBlock": 64, + "ABlockTransferSrcScalarPerVector": 16, + "ABlockTransferDstScalarPerVector_K1": 4, + "BBlockTransferSrcScalarPerVector": 1, + "BBlockTransferDstScalarPerVector_K1": 1, + "CDEBlockTransferScalarPerVector_NPerBlock": 1 + }, + { + "BlockSize": 256, + "MPerBlock": 128, + "NPerBlock": 32, + "K0PerBlock": 32, + "ABlockTransferSrcScalarPerVector": 8, + "ABlockTransferDstScalarPerVector_K1": 4, + "BBlockTransferSrcScalarPerVector": 1, + "BBlockTransferDstScalarPerVector_K1": 1, + "CDEBlockTransferScalarPerVector_NPerBlock": 1 + }, + { + "BlockSize": 256, + "MPerBlock": 128, + "NPerBlock": 32, + "K0PerBlock": 16, + "ABlockTransferSrcScalarPerVector": 4, + "ABlockTransferDstScalarPerVector_K1": 4, + 
"BBlockTransferSrcScalarPerVector": 1, + "BBlockTransferDstScalarPerVector_K1": 1, + "CDEBlockTransferScalarPerVector_NPerBlock": 1 + }, + { + "BlockSize": 256, + "MPerBlock": 64, + "NPerBlock": 16, + "K0PerBlock": 64, + "ABlockTransferSrcScalarPerVector": 16, + "ABlockTransferDstScalarPerVector_K1": 4, + "BBlockTransferSrcScalarPerVector": 8, + "BBlockTransferDstScalarPerVector_K1": 1, + "CDEBlockTransferScalarPerVector_NPerBlock": 4 + }, + { + "BlockSize": 256, + "MPerBlock": 64, + "NPerBlock": 16, + "K0PerBlock": 32, + "ABlockTransferSrcScalarPerVector": 8, + "ABlockTransferDstScalarPerVector_K1": 4, + "BBlockTransferSrcScalarPerVector": 8, + "BBlockTransferDstScalarPerVector_K1": 1, + "CDEBlockTransferScalarPerVector_NPerBlock": 4 + }, + { + "BlockSize": 256, + "MPerBlock": 64, + "NPerBlock": 16, + "K0PerBlock": 16, + "ABlockTransferSrcScalarPerVector": 4, + "ABlockTransferDstScalarPerVector_K1": 4, + "BBlockTransferSrcScalarPerVector": 8, + "BBlockTransferDstScalarPerVector_K1": 1, + "CDEBlockTransferScalarPerVector_NPerBlock": 4 + }, + { + "BlockSize": 256, + "MPerBlock": 64, + "NPerBlock": 16, + "K0PerBlock": 64, + "ABlockTransferSrcScalarPerVector": 16, + "ABlockTransferDstScalarPerVector_K1": 4, + "BBlockTransferSrcScalarPerVector": 2, + "BBlockTransferDstScalarPerVector_K1": 1, + "CDEBlockTransferScalarPerVector_NPerBlock": 2 + }, + { + "BlockSize": 256, + "MPerBlock": 64, + "NPerBlock": 16, + "K0PerBlock": 32, + "ABlockTransferSrcScalarPerVector": 8, + "ABlockTransferDstScalarPerVector_K1": 4, + "BBlockTransferSrcScalarPerVector": 2, + "BBlockTransferDstScalarPerVector_K1": 1, + "CDEBlockTransferScalarPerVector_NPerBlock": 2 + }, + { + "BlockSize": 256, + "MPerBlock": 64, + "NPerBlock": 16, + "K0PerBlock": 16, + "ABlockTransferSrcScalarPerVector": 4, + "ABlockTransferDstScalarPerVector_K1": 4, + "BBlockTransferSrcScalarPerVector": 2, + "BBlockTransferDstScalarPerVector_K1": 1, + "CDEBlockTransferScalarPerVector_NPerBlock": 2 + }, + { + "BlockSize": 256, 
+ "MPerBlock": 64, + "NPerBlock": 16, + "K0PerBlock": 64, + "ABlockTransferSrcScalarPerVector": 16, + "ABlockTransferDstScalarPerVector_K1": 4, + "BBlockTransferSrcScalarPerVector": 1, + "BBlockTransferDstScalarPerVector_K1": 1, + "CDEBlockTransferScalarPerVector_NPerBlock": 1 + }, + { + "BlockSize": 256, + "MPerBlock": 64, + "NPerBlock": 16, + "K0PerBlock": 32, + "ABlockTransferSrcScalarPerVector": 8, + "ABlockTransferDstScalarPerVector_K1": 4, + "BBlockTransferSrcScalarPerVector": 1, + "BBlockTransferDstScalarPerVector_K1": 1, + "CDEBlockTransferScalarPerVector_NPerBlock": 1 + }, + { + "BlockSize": 256, + "MPerBlock": 64, + "NPerBlock": 16, + "K0PerBlock": 16, + "ABlockTransferSrcScalarPerVector": 4, + "ABlockTransferDstScalarPerVector_K1": 4, + "BBlockTransferSrcScalarPerVector": 1, + "BBlockTransferDstScalarPerVector_K1": 1, + "CDEBlockTransferScalarPerVector_NPerBlock": 1 + } + ] + }, + + "FP16_32x32_standard": { + "data_type": "FP16", + "xdl_size": "32x32", + "header_file": "library/include/ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_data/device_grouped_conv_bwd_data_xdl_instance.hpp", + "template_alias": "device_grouped_conv_bwd_data_xdl_f16_instances", + "instances": [ + {"BlockSize": 64, "MPerBlock": 64, "NPerBlock": 64, "K0PerBlock": 32, "AK1": 8, "BK1": 8, "MPerXDL": 32, "NPerXDL": 32, "MXdlPerWave": 2, "NXdlPerWave": 2, "CDEBlockTransferScalarPerVector_NPerBlock": 1}, + {"BlockSize": 256, "MPerBlock": 64, "NPerBlock": 128, "K0PerBlock": 32, "MXdlPerWave": 1, "NXdlPerWave": 2, "CDEBlockTransferScalarPerVector_NPerBlock": 8}, + {"BlockSize": 128, "MPerBlock": 128, "NPerBlock": 32, "K0PerBlock": 32, "MXdlPerWave": 2, "NXdlPerWave": 1, "CDEBlockTransferScalarPerVector_NPerBlock": 1}, + {"BlockSize": 256, "MPerBlock": 128, "NPerBlock": 256, "K0PerBlock": 32, "MXdlPerWave": 2, "NXdlPerWave": 4, "CDEBlockTransferScalarPerVector_NPerBlock": 8}, + {"BlockSize": 256, "MPerBlock": 256, "NPerBlock": 128, "K0PerBlock": 32, "MXdlPerWave": 4, 
"NXdlPerWave": 2, "CDEBlockTransferScalarPerVector_NPerBlock": 8}, + {"BlockSize": 256, "MPerBlock": 128, "NPerBlock": 256, "K0PerBlock": 32, "MXdlPerWave": 2, "NXdlPerWave": 4, "CDEBlockTransferScalarPerVector_NPerBlock": 8}, + {"BlockSize": 128, "MPerBlock": 128, "NPerBlock": 128, "K0PerBlock": 32, "MXdlPerWave": 4, "NXdlPerWave": 2, "CDEBlockTransferScalarPerVector_NPerBlock": 8}, + {"BlockSize": 256, "MPerBlock": 128, "NPerBlock": 128, "K0PerBlock": 32, "MXdlPerWave": 2, "NXdlPerWave": 2, "CDEBlockTransferScalarPerVector_NPerBlock": 8}, + {"BlockSize": 128, "MPerBlock": 128, "NPerBlock": 64, "K0PerBlock": 32, "MXdlPerWave": 2, "NXdlPerWave": 2, "CDEBlockTransferScalarPerVector_NPerBlock": 8}, + {"BlockSize": 128, "MPerBlock": 64, "NPerBlock": 128, "K0PerBlock": 32, "MXdlPerWave": 2, "NXdlPerWave": 2, "CDEBlockTransferScalarPerVector_NPerBlock": 8}, + {"BlockSize": 64, "MPerBlock": 64, "NPerBlock": 64, "K0PerBlock": 32, "MXdlPerWave": 2, "NXdlPerWave": 2, "CDEBlockTransferScalarPerVector_NPerBlock": 8}, + {"BlockSize": 256, "MPerBlock": 128, "NPerBlock": 64, "K0PerBlock": 32, "MXdlPerWave": 2, "NXdlPerWave": 1, "CDEBlockTransferScalarPerVector_NPerBlock": 8}, + {"BlockSize": 256, "MPerBlock": 64, "NPerBlock": 128, "K0PerBlock": 32, "MXdlPerWave": 1, "NXdlPerWave": 2, "CDEBlockTransferScalarPerVector_NPerBlock": 8}, + {"BlockSize": 128, "MPerBlock": 128, "NPerBlock": 32, "K0PerBlock": 32, "MXdlPerWave": 2, "NXdlPerWave": 1, "CDEBlockTransferScalarPerVector_NPerBlock": 8}, + {"BlockSize": 128, "MPerBlock": 32, "NPerBlock": 128, "K0PerBlock": 32, "MXdlPerWave": 1, "NXdlPerWave": 2, "CDEBlockTransferScalarPerVector_NPerBlock": 8}, + {"BlockSize": 64, "MPerBlock": 64, "NPerBlock": 32, "K0PerBlock": 32, "MXdlPerWave": 2, "NXdlPerWave": 1, "CDEBlockTransferScalarPerVector_NPerBlock": 8}, + {"BlockSize": 64, "MPerBlock": 32, "NPerBlock": 64, "K0PerBlock": 32, "MXdlPerWave": 1, "NXdlPerWave": 2, "CDEBlockTransferScalarPerVector_NPerBlock": 8} + ] + } + } + }, + + 
"DeviceGroupedConvBwdDataMultipleD_Wmma_CShuffle": { + "implementation_file": "include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_data_multiple_d_wmma_cshuffle.hpp", + "instruction_set": "WMMA", + "status": "active", + + "instances": { + "FP16": { + "data_type": "FP16", + "accumulator": "FP32", + "cshuffle_type": "FP16", + "header_file": "library/include/ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_data/device_grouped_conv_bwd_data_wmma_f16_instance.hpp", + "template_alias": "device_grouped_conv_bwd_data_wmma_f16_instances", + "all_instances": [ + {"BlockSize": 128, "MPerBlock": 64, "NPerBlock": 64, "K0PerBlock": 4, "K1": 8, "MPerWmma": 16, "NPerWmma": 16, "MRepeat": 2, "NRepeat": 2, "CDEBlockTransferScalarPerVector_NPerBlock": 1}, + {"BlockSize": 256, "MPerBlock": 128, "NPerBlock": 256, "K0PerBlock": 8, "K1": 8, "MRepeat": 4, "NRepeat": 4, "CDEBlockTransferScalarPerVector_NPerBlock": 8}, + {"BlockSize": 256, "MPerBlock": 64, "NPerBlock": 256, "K0PerBlock": 8, "K1": 8, "MRepeat": 2, "NRepeat": 4, "CDEBlockTransferScalarPerVector_NPerBlock": 8}, + {"BlockSize": 256, "MPerBlock": 128, "NPerBlock": 256, "K0PerBlock": 8, "K1": 8, "MRepeat": 4, "NRepeat": 4, "CDEBlockTransferScalarPerVector_NPerBlock": 8}, + {"BlockSize": 256, "MPerBlock": 128, "NPerBlock": 64, "K0PerBlock": 8, "K1": 8, "MRepeat": 4, "NRepeat": 1, "CDEBlockTransferScalarPerVector_NPerBlock": 8}, + {"BlockSize": 128, "MPerBlock": 64, "NPerBlock": 128, "K0PerBlock": 8, "K1": 8, "MRepeat": 2, "NRepeat": 4, "CDEBlockTransferScalarPerVector_NPerBlock": 8}, + {"BlockSize": 128, "MPerBlock": 128, "NPerBlock": 64, "K0PerBlock": 8, "K1": 8, "MRepeat": 4, "NRepeat": 2, "CDEBlockTransferScalarPerVector_NPerBlock": 8}, + {"BlockSize": 128, "MPerBlock": 128, "NPerBlock": 128, "K0PerBlock": 8, "K1": 8, "MRepeat": 4, "NRepeat": 4, "CDEBlockTransferScalarPerVector_NPerBlock": 8}, + {"BlockSize": 128, "MPerBlock": 32, "NPerBlock": 256, "K0PerBlock": 8, "K1": 8, "MRepeat": 1, "NRepeat": 8, 
"CDEBlockTransferScalarPerVector_NPerBlock": 8}, + {"BlockSize": 64, "MPerBlock": 32, "NPerBlock": 64, "K0PerBlock": 8, "K1": 8, "MRepeat": 1, "NRepeat": 4, "CDEBlockTransferScalarPerVector_NPerBlock": 8}, + {"BlockSize": 64, "MPerBlock": 64, "NPerBlock": 64, "K0PerBlock": 8, "K1": 8, "MRepeat": 2, "NRepeat": 4, "CDEBlockTransferScalarPerVector_NPerBlock": 8}, + {"BlockSize": 64, "MPerBlock": 32, "NPerBlock": 64, "K0PerBlock": 8, "K1": 8, "MRepeat": 1, "NRepeat": 4, "CDEBlockTransferScalarPerVector_NPerBlock": 8}, + {"BlockSize": 64, "MPerBlock": 32, "NPerBlock": 128, "K0PerBlock": 8, "K1": 8, "MRepeat": 1, "NRepeat": 8, "CDEBlockTransferScalarPerVector_NPerBlock": 8}, + {"BlockSize": 32, "MPerBlock": 16, "NPerBlock": 64, "K0PerBlock": 8, "K1": 8, "MRepeat": 1, "NRepeat": 4, "CDEBlockTransferScalarPerVector_NPerBlock": 8}, + {"BlockSize": 32, "MPerBlock": 64, "NPerBlock": 32, "K0PerBlock": 8, "K1": 8, "MRepeat": 4, "NRepeat": 2, "CDEBlockTransferScalarPerVector_NPerBlock": 8}, + {"BlockSize": 32, "MPerBlock": 32, "NPerBlock": 32, "K0PerBlock": 8, "K1": 8, "MRepeat": 2, "NRepeat": 2, "CDEBlockTransferScalarPerVector_NPerBlock": 8}, + {"BlockSize": 32, "MPerBlock": 16, "NPerBlock": 32, "K0PerBlock": 8, "K1": 8, "MRepeat": 1, "NRepeat": 2, "CDEBlockTransferScalarPerVector_NPerBlock": 8} + ] + }, + + "INT8": { + "data_type": "INT8", + "accumulator": "INT32", + "cshuffle_type": "INT8", + "header_file": "library/include/ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_data/device_grouped_conv_bwd_data_wmma_i8_instance.hpp", + "template_alias": "device_grouped_conv_bwd_data_wmma_i8_instances", + "all_instances": [ + {"BlockSize": 128, "MPerBlock": 64, "NPerBlock": 64, "K0PerBlock": 4, "K1": 16, "MRepeat": 2, "NRepeat": 2, "CDEBlockTransferScalarPerVector_NPerBlock": 1}, + {"BlockSize": 256, "MPerBlock": 64, "NPerBlock": 256, "K0PerBlock": 8, "K1": 16, "MRepeat": 2, "NRepeat": 4, "CDEBlockTransferScalarPerVector_NPerBlock": 8}, + {"BlockSize": 128, "MPerBlock": 
64, "NPerBlock": 256, "K0PerBlock": 8, "K1": 16, "MRepeat": 2, "NRepeat": 8, "CDEBlockTransferScalarPerVector_NPerBlock": 8}, + {"BlockSize": 128, "MPerBlock": 64, "NPerBlock": 128, "K0PerBlock": 8, "K1": 16, "MRepeat": 2, "NRepeat": 4, "CDEBlockTransferScalarPerVector_NPerBlock": 8}, + {"BlockSize": 128, "MPerBlock": 128, "NPerBlock": 256, "K0PerBlock": 8, "K1": 16, "MRepeat": 4, "NRepeat": 8, "CDEBlockTransferScalarPerVector_NPerBlock": 8}, + {"BlockSize": 128, "MPerBlock": 32, "NPerBlock": 256, "K0PerBlock": 8, "K1": 16, "MRepeat": 1, "NRepeat": 8, "CDEBlockTransferScalarPerVector_NPerBlock": 8}, + {"BlockSize": 128, "MPerBlock": 256, "NPerBlock": 128, "K0PerBlock": 8, "K1": 16, "MRepeat": 8, "NRepeat": 4, "CDEBlockTransferScalarPerVector_NPerBlock": 8}, + {"BlockSize": 64, "MPerBlock": 32, "NPerBlock": 128, "K0PerBlock": 8, "K1": 16, "MRepeat": 1, "NRepeat": 8, "CDEBlockTransferScalarPerVector_NPerBlock": 8}, + {"BlockSize": 64, "MPerBlock": 64, "NPerBlock": 128, "K0PerBlock": 8, "K1": 16, "MRepeat": 2, "NRepeat": 8, "CDEBlockTransferScalarPerVector_NPerBlock": 8}, + {"BlockSize": 64, "MPerBlock": 32, "NPerBlock": 128, "K0PerBlock": 8, "K1": 16, "MRepeat": 1, "NRepeat": 8, "CDEBlockTransferScalarPerVector_NPerBlock": 8}, + {"BlockSize": 64, "MPerBlock": 32, "NPerBlock": 64, "K0PerBlock": 8, "K1": 16, "MRepeat": 1, "NRepeat": 4, "CDEBlockTransferScalarPerVector_NPerBlock": 8}, + {"BlockSize": 32, "MPerBlock": 16, "NPerBlock": 64, "K0PerBlock": 8, "K1": 16, "MRepeat": 1, "NRepeat": 4, "CDEBlockTransferScalarPerVector_NPerBlock": 8}, + {"BlockSize": 32, "MPerBlock": 64, "NPerBlock": 64, "K0PerBlock": 8, "K1": 16, "MRepeat": 4, "NRepeat": 4, "CDEBlockTransferScalarPerVector_NPerBlock": 8}, + {"BlockSize": 32, "MPerBlock": 32, "NPerBlock": 32, "K0PerBlock": 8, "K1": 16, "MRepeat": 2, "NRepeat": 2, "CDEBlockTransferScalarPerVector_NPerBlock": 8} + ] + } + } + } + }, + + "instance_count_summary": { + "DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1": { + "FP16": { + 
"generic": 1, + "16x16_variants": 6, + "32x32_standard": 17, + "32x32_optimized_loads": 18, + "32x32_nchw": 17, + "total_per_layout_specialization": "~50+" + }, + "BF16": "Similar to FP16 (~50+ instances)", + "FP32": "~40+ instances (K1=4 instead of 8)", + "FP32_TF32": "~30+ instances with TF32 compute", + "FP16_comp_BF8_F8": "~20 instances for newer GPUs", + "total_estimated": "~200+" + }, + "DeviceGroupedConvBwdDataMultipleD_Wmma_CShuffle": { + "FP16": 17, + "INT8": 14, + "total": 31 + } + }, + + "notes": { + "instance_variations": "Each instance template can be instantiated for multiple layouts and specializations", + "layout_multiplier": "Each instance set is typically instantiated for 2-4 layout combinations", + "specialization_multiplier": "Each layout may have 1-2 specializations (Default, Filter1x1Stride1Pad0)", + "total_instances": "The actual number of compiled instances is significantly higher when accounting for all layout and specialization combinations" + } +} diff --git a/experimental/builder/instances/grouped_conv_bwd_weight_instances.json b/experimental/builder/instances/grouped_conv_bwd_weight_instances.json new file mode 100644 index 0000000000..b4dacc5415 --- /dev/null +++ b/experimental/builder/instances/grouped_conv_bwd_weight_instances.json @@ -0,0 +1,648 @@ +{ + "description": "Complete listing of all backward weight convolution instances in the Composable Kernel library", + "note": "All instances are pre-compiled and part of the static library. 
All 9 algorithms are active.", + + "algorithms": { + "DeviceGroupedConvBwdWeight_Xdl_CShuffleV3": { + "implementation_file": "include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_weight_xdl_cshuffle_v3.hpp", + "instruction_set": "XDL/MFMA", + "version": "v3", + "status": "active_recommended", + + "instances": { + "FP32": { + "data_type": "FP32", + "accumulator": "FP32", + "header_file": "library/include/ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_weight/device_grouped_conv_bwd_weight_v3_xdl_instance.hpp", + "template_alias": "device_grouped_conv_bwd_weight_v3_xdl_c_shuffle_f32_instances", + "instance": { + "BlockSize": 64, + "MPerBlock": 16, + "NPerBlock": 16, + "K0PerBlock": 32, + "K1": 8, + "MPerXDL": 16, + "NPerXDL": 16, + "MXdlPerWave": 1, + "NXdlPerWave": 1, + "CBlockTransferScalarPerVector_NWaveNPerXdl": 2 + } + }, + + "FP32_TF32": { + "data_type": "FP32", + "accumulator": "FP32", + "compute_type": "TF32", + "header_file": "library/include/ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_weight/device_grouped_conv_bwd_weight_v3_xdl_instance.hpp", + "template_alias": "device_grouped_conv_bwd_weight_v3_xdl_c_shuffle_f32_tf32_instances", + "instance": { + "BlockSize": 64, + "MPerBlock": 16, + "NPerBlock": 16, + "K0PerBlock": 32, + "K1": 8, + "MPerXDL": 16, + "NPerXDL": 16, + "MXdlPerWave": 1, + "NXdlPerWave": 1, + "ComputeTypeA": "TF32", + "ComputeTypeB": "TF32" + } + }, + + "FP16": { + "data_type": "FP16", + "accumulator": "FP32", + "header_file": "library/include/ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_weight/device_grouped_conv_bwd_weight_v3_xdl_instance.hpp", + "template_alias": "device_grouped_conv_bwd_weight_v3_xdl_c_shuffle_f16_instances", + "all_instances": [ + {"BlockSize": 64, "MPerBlock": 32, "NPerBlock": 32, "K0PerBlock": 32, "K1": 8, "MPerXDL": 32, "NPerXDL": 32, "MXdlPerWave": 1, "NXdlPerWave": 1, "CBlockTransferScalarPerVector_NWaveNPerXdl": 2}, + {"BlockSize": 64, "MPerBlock": 32, 
"NPerBlock": 64, "K0PerBlock": 32, "K1": 8, "MPerXDL": 32, "NPerXDL": 32, "MXdlPerWave": 1, "NXdlPerWave": 2, "CBlockTransferScalarPerVector_NWaveNPerXdl": 2}, + {"BlockSize": 64, "MPerBlock": 32, "NPerBlock": 128, "K0PerBlock": 32, "K1": 8, "MPerXDL": 32, "NPerXDL": 32, "MXdlPerWave": 1, "NXdlPerWave": 4, "CBlockTransferScalarPerVector_NWaveNPerXdl": 2}, + {"BlockSize": 64, "MPerBlock": 64, "NPerBlock": 32, "K0PerBlock": 32, "K1": 8, "MPerXDL": 32, "NPerXDL": 32, "MXdlPerWave": 2, "NXdlPerWave": 1, "CBlockTransferScalarPerVector_NWaveNPerXdl": 2}, + {"BlockSize": 64, "MPerBlock": 128, "NPerBlock": 32, "K0PerBlock": 32, "K1": 8, "MPerXDL": 32, "NPerXDL": 32, "MXdlPerWave": 4, "NXdlPerWave": 1, "CBlockTransferScalarPerVector_NWaveNPerXdl": 2}, + {"BlockSize": 64, "MPerBlock": 64, "NPerBlock": 80, "K0PerBlock": 32, "K1": 8, "MPerXDL": 16, "NPerXDL": 16, "MXdlPerWave": 4, "NXdlPerWave": 5, "CBlockTransferScalarPerVector_NWaveNPerXdl": 2, "note": "Irregular NPerBlock"}, + {"BlockSize": 64, "MPerBlock": 64, "NPerBlock": 112, "K0PerBlock": 32, "K1": 8, "MPerXDL": 16, "NPerXDL": 16, "MXdlPerWave": 4, "NXdlPerWave": 7, "CBlockTransferScalarPerVector_NWaveNPerXdl": 2, "note": "Irregular NPerBlock"} + ] + }, + + "BF16": { + "data_type": "BF16", + "accumulator": "FP32", + "header_file": "library/include/ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_weight/device_grouped_conv_bwd_weight_v3_xdl_instance.hpp", + "template_alias": "device_grouped_conv_bwd_weight_v3_xdl_c_shuffle_bf16_instances", + "all_instances": [ + {"BlockSize": 64, "MPerBlock": 32, "NPerBlock": 32, "K0PerBlock": 32, "K1": 8, "MPerXDL": 32, "NPerXDL": 32, "MXdlPerWave": 1, "NXdlPerWave": 1, "CBlockTransferScalarPerVector_NWaveNPerXdl": 2}, + {"BlockSize": 64, "MPerBlock": 32, "NPerBlock": 64, "K0PerBlock": 32, "K1": 8, "MXdlPerWave": 1, "NXdlPerWave": 2, "CBlockTransferScalarPerVector_NWaveNPerXdl": 2}, + {"BlockSize": 64, "MPerBlock": 32, "NPerBlock": 128, "K0PerBlock": 32, "K1": 8, "MXdlPerWave": 
1, "NXdlPerWave": 4, "CBlockTransferScalarPerVector_NWaveNPerXdl": 2}, + {"BlockSize": 64, "MPerBlock": 64, "NPerBlock": 32, "K0PerBlock": 32, "K1": 8, "MXdlPerWave": 2, "NXdlPerWave": 1, "CBlockTransferScalarPerVector_NWaveNPerXdl": 2}, + {"BlockSize": 64, "MPerBlock": 128, "NPerBlock": 32, "K0PerBlock": 32, "K1": 8, "MXdlPerWave": 4, "NXdlPerWave": 1, "CBlockTransferScalarPerVector_NWaveNPerXdl": 2}, + {"BlockSize": 64, "MPerBlock": 64, "NPerBlock": 80, "K0PerBlock": 32, "K1": 8, "MPerXDL": 16, "NPerXDL": 16, "MXdlPerWave": 4, "NXdlPerWave": 5}, + {"BlockSize": 64, "MPerBlock": 64, "NPerBlock": 112, "K0PerBlock": 32, "K1": 8, "MPerXDL": 16, "NPerXDL": 16, "MXdlPerWave": 4, "NXdlPerWave": 7} + ] + } + } + }, + + "DeviceGroupedConvBwdWeight_Wmma_CShuffleV3": { + "implementation_file": "include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_weight_wmma_cshuffle_v3.hpp", + "instruction_set": "WMMA", + "version": "v3", + "status": "active_recommended", + + "instances": { + "FP16": { + "data_type": "FP16", + "accumulator": "FP32", + "header_file": "library/include/ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_weight/device_grouped_conv_bwd_weight_v3_wmma_instance.hpp", + "template_alias": "device_grouped_conv_bwd_weight_v3_wmma_c_shuffle_f16_instances", + "all_instances": [ + {"BlockSize": 64, "MPerBlock": 32, "NPerBlock": 32, "K0PerBlock": 32, "K1": 8, "MPerWmma": 16, "NPerWmma": 16, "MRepeat": 2, "NRepeat": 1, "CBlockTransferScalarPerVector_NPerBlock": 2}, + {"BlockSize": 128, "MPerBlock": 128, "NPerBlock": 128, "K0PerBlock": 32, "K1": 8, "MRepeat": 8, "NRepeat": 2, "CBlockTransferScalarPerVector_NPerBlock": 8}, + {"BlockSize": 128, "MPerBlock": 128, "NPerBlock": 128, "K0PerBlock": 32, "K1": 8, "MRepeat": 8, "NRepeat": 2, "CBlockTransferScalarPerVector_NPerBlock": 8, "note": "Extra LDS"}, + {"BlockSize": 64, "MPerBlock": 64, "NPerBlock": 64, "K0PerBlock": 64, "K1": 8, "MRepeat": 4, "NRepeat": 2, "CBlockTransferScalarPerVector_NPerBlock": 8}, 
+ {"BlockSize": 256, "MPerBlock": 128, "NPerBlock": 256, "K0PerBlock": 64, "K1": 8, "MRepeat": 8, "NRepeat": 2, "CBlockTransferScalarPerVector_NPerBlock": 8}, + {"BlockSize": 128, "MPerBlock": 48, "NPerBlock": 64, "K0PerBlock": 128, "K1": 8, "MRepeat": 3, "NRepeat": 1, "CBlockTransferScalarPerVector_NPerBlock": 8, "note": "Irregular MPerBlock=48"}, + {"BlockSize": 128, "MPerBlock": 96, "NPerBlock": 128, "K0PerBlock": 64, "K1": 8, "MRepeat": 6, "NRepeat": 2, "CBlockTransferScalarPerVector_NPerBlock": 8, "note": "Irregular MPerBlock=96"}, + {"BlockSize": 128, "MPerBlock": 64, "NPerBlock": 64, "K0PerBlock": 128, "K1": 8, "MRepeat": 4, "NRepeat": 1, "CBlockTransferScalarPerVector_NPerBlock": 8}, + {"BlockSize": 256, "MPerBlock": 96, "NPerBlock": 128, "K0PerBlock": 128, "K1": 8, "MRepeat": 6, "NRepeat": 1, "CBlockTransferScalarPerVector_NPerBlock": 8, "note": "Irregular MPerBlock=96"} + ] + }, + + "BF16": { + "data_type": "BF16", + "accumulator": "FP32", + "header_file": "library/include/ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_weight/device_grouped_conv_bwd_weight_v3_wmma_instance.hpp", + "template_alias": "device_grouped_conv_bwd_weight_v3_wmma_c_shuffle_bf16_instances", + "all_instances": [ + {"BlockSize": 64, "MPerBlock": 32, "NPerBlock": 32, "K0PerBlock": 32, "K1": 8, "MRepeat": 2, "NRepeat": 1, "CBlockTransferScalarPerVector_NPerBlock": 2}, + {"BlockSize": 128, "MPerBlock": 128, "NPerBlock": 128, "K0PerBlock": 32, "K1": 8, "MRepeat": 8, "NRepeat": 2, "CBlockTransferScalarPerVector_NPerBlock": 8}, + {"BlockSize": 128, "MPerBlock": 128, "NPerBlock": 128, "K0PerBlock": 32, "K1": 8, "MRepeat": 8, "NRepeat": 2, "CBlockTransferScalarPerVector_NPerBlock": 8, "note": "Extra LDS"}, + {"BlockSize": 64, "MPerBlock": 64, "NPerBlock": 64, "K0PerBlock": 64, "K1": 8, "MRepeat": 4, "NRepeat": 2, "CBlockTransferScalarPerVector_NPerBlock": 8}, + {"BlockSize": 256, "MPerBlock": 128, "NPerBlock": 256, "K0PerBlock": 64, "K1": 8, "MRepeat": 8, "NRepeat": 2, 
"CBlockTransferScalarPerVector_NPerBlock": 8}, + {"BlockSize": 128, "MPerBlock": 48, "NPerBlock": 64, "K0PerBlock": 128, "K1": 8, "MRepeat": 3, "NRepeat": 1, "CBlockTransferScalarPerVector_NPerBlock": 8}, + {"BlockSize": 128, "MPerBlock": 96, "NPerBlock": 128, "K0PerBlock": 64, "K1": 8, "MRepeat": 6, "NRepeat": 2, "CBlockTransferScalarPerVector_NPerBlock": 8}, + {"BlockSize": 128, "MPerBlock": 64, "NPerBlock": 64, "K0PerBlock": 128, "K1": 8, "MRepeat": 4, "NRepeat": 1, "CBlockTransferScalarPerVector_NPerBlock": 8}, + {"BlockSize": 256, "MPerBlock": 96, "NPerBlock": 128, "K0PerBlock": 128, "K1": 8, "MRepeat": 6, "NRepeat": 1, "CBlockTransferScalarPerVector_NPerBlock": 8}, + {"BlockSize": 96, "MPerBlock": 96, "NPerBlock": 96, "K0PerBlock": 48, "K1": 8, "MRepeat": 6, "NRepeat": 2, "CBlockTransferScalarPerVector_NPerBlock": 8} + ] + } + } + }, + + "DeviceGroupedConvBwdWeight_Xdl_CShuffle": { + "implementation_file": "include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_weight_xdl_cshuffle.hpp", + "instruction_set": "XDL/MFMA", + "version": "v1", + "status": "active_legacy", + + "instances": { + "FP32_generic": { + "data_type": "FP32", + "header_file": "library/include/ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_weight/device_grouped_conv_bwd_weight_xdl_instance.hpp", + "template_alias": "device_grouped_conv_bwd_weight_xdl_c_shuffle_f32_generic_instances", + "instance": { + "BlockSize": 64, + "MPerBlock": 64, + "NPerBlock": 64, + "K0PerBlock": 4, + "K1": 4, + "MPerXDL": 32, + "NPerXDL": 32, + "MXdlPerWave": 2, + "NXdlPerWave": 2, + "CBlockTransferScalarPerVector_NWaveNPerXdl": 1 + } + }, + + "FP32": { + "data_type": "FP32", + "header_file": "library/include/ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_weight/device_grouped_conv_bwd_weight_xdl_instance.hpp", + "template_alias": "device_grouped_conv_bwd_weight_xdl_c_shuffle_f32_instances", + "all_instances": [ + {"BlockSize": 64, "MPerBlock": 64, "NPerBlock": 64, "K0PerBlock": 4, 
"K1": 4, "MXdlPerWave": 2, "NXdlPerWave": 2, "CBlockTransferScalarPerVector_NWaveNPerXdl": 1}, + {"BlockSize": 128, "MPerBlock": 128, "NPerBlock": 32, "K0PerBlock": 4, "K1": 4, "MXdlPerWave": 2, "NXdlPerWave": 1, "CBlockTransferScalarPerVector_NWaveNPerXdl": 1}, + {"BlockSize": 64, "MPerBlock": 32, "NPerBlock": 64, "K0PerBlock": 4, "K1": 4, "MXdlPerWave": 1, "NXdlPerWave": 2, "CBlockTransferScalarPerVector_NWaveNPerXdl": 4}, + {"BlockSize": 256, "MPerBlock": 256, "NPerBlock": 128, "K0PerBlock": 4, "K1": 4, "MXdlPerWave": 4, "NXdlPerWave": 2, "CBlockTransferScalarPerVector_NWaveNPerXdl": 4}, + {"BlockSize": 256, "MPerBlock": 128, "NPerBlock": 256, "K0PerBlock": 4, "K1": 4, "MXdlPerWave": 2, "NXdlPerWave": 4, "CBlockTransferScalarPerVector_NWaveNPerXdl": 4}, + {"BlockSize": 128, "MPerBlock": 128, "NPerBlock": 128, "K0PerBlock": 4, "K1": 4, "MXdlPerWave": 4, "NXdlPerWave": 2, "CBlockTransferScalarPerVector_NWaveNPerXdl": 4}, + {"BlockSize": 256, "MPerBlock": 128, "NPerBlock": 128, "K0PerBlock": 4, "K1": 4, "MXdlPerWave": 2, "NXdlPerWave": 2, "CBlockTransferScalarPerVector_NWaveNPerXdl": 4}, + {"BlockSize": 128, "MPerBlock": 128, "NPerBlock": 64, "K0PerBlock": 4, "K1": 4, "MXdlPerWave": 2, "NXdlPerWave": 2, "CBlockTransferScalarPerVector_NWaveNPerXdl": 4}, + {"BlockSize": 128, "MPerBlock": 64, "NPerBlock": 128, "K0PerBlock": 4, "K1": 4, "MXdlPerWave": 2, "NXdlPerWave": 2, "CBlockTransferScalarPerVector_NWaveNPerXdl": 4}, + {"BlockSize": 64, "MPerBlock": 64, "NPerBlock": 64, "K0PerBlock": 4, "K1": 4, "MXdlPerWave": 2, "NXdlPerWave": 2, "CBlockTransferScalarPerVector_NWaveNPerXdl": 4}, + {"BlockSize": 256, "MPerBlock": 128, "NPerBlock": 64, "K0PerBlock": 4, "K1": 4, "MXdlPerWave": 2, "NXdlPerWave": 1, "CBlockTransferScalarPerVector_NWaveNPerXdl": 4}, + {"BlockSize": 256, "MPerBlock": 64, "NPerBlock": 128, "K0PerBlock": 4, "K1": 4, "MXdlPerWave": 1, "NXdlPerWave": 2, "CBlockTransferScalarPerVector_NWaveNPerXdl": 4}, + {"BlockSize": 128, "MPerBlock": 128, "NPerBlock": 32, 
"K0PerBlock": 4, "K1": 4, "MXdlPerWave": 2, "NXdlPerWave": 1, "CBlockTransferScalarPerVector_NWaveNPerXdl": 4}, + {"BlockSize": 128, "MPerBlock": 32, "NPerBlock": 128, "K0PerBlock": 4, "K1": 4, "MXdlPerWave": 1, "NXdlPerWave": 2, "CBlockTransferScalarPerVector_NWaveNPerXdl": 4}, + {"BlockSize": 64, "MPerBlock": 64, "NPerBlock": 32, "K0PerBlock": 4, "K1": 4, "MXdlPerWave": 2, "NXdlPerWave": 1, "CBlockTransferScalarPerVector_NWaveNPerXdl": 4}, + {"BlockSize": 64, "MPerBlock": 32, "NPerBlock": 64, "K0PerBlock": 4, "K1": 4, "MXdlPerWave": 1, "NXdlPerWave": 2, "CBlockTransferScalarPerVector_NWaveNPerXdl": 4}, + {"BlockSize": 256, "MPerBlock": 64, "NPerBlock": 64, "K0PerBlock": 8, "K1": 8, "MPerXDL": 32, "NPerXDL": 32, "MXdlPerWave": 1, "NXdlPerWave": 1, "CBlockTransferScalarPerVector_NWaveNPerXdl": 4}, + {"BlockSize": 256, "MPerBlock": 64, "NPerBlock": 64, "K0PerBlock": 8, "K1": 8, "MXdlPerWave": 1, "NXdlPerWave": 1, "CBlockTransferScalarPerVector_NWaveNPerXdl": 1} + ] + }, + + "FP16": { + "data_type": "FP16", + "accumulator": "FP32", + "header_file": "library/include/ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_weight/device_grouped_conv_bwd_weight_xdl_instance.hpp", + "template_alias": "device_grouped_conv_bwd_weight_xdl_c_shuffle_f16_instances", + "all_instances": [ + {"BlockSize": 64, "MPerBlock": 64, "NPerBlock": 64, "K0PerBlock": 4, "K1": 8, "MPerXDL": 32, "NPerXDL": 32, "MXdlPerWave": 2, "NXdlPerWave": 2, "CBlockTransferScalarPerVector_NWaveNPerXdl": 2}, + {"BlockSize": 128, "MPerBlock": 128, "NPerBlock": 32, "K0PerBlock": 4, "K1": 8, "MXdlPerWave": 2, "NXdlPerWave": 1, "CBlockTransferScalarPerVector_NWaveNPerXdl": 2}, + {"BlockSize": 64, "MPerBlock": 32, "NPerBlock": 64, "K0PerBlock": 4, "K1": 8, "MXdlPerWave": 1, "NXdlPerWave": 2, "CBlockTransferScalarPerVector_NWaveNPerXdl": 8}, + {"BlockSize": 256, "MPerBlock": 256, "NPerBlock": 128, "K0PerBlock": 4, "K1": 8, "MXdlPerWave": 4, "NXdlPerWave": 2, "CBlockTransferScalarPerVector_NWaveNPerXdl": 8}, + 
{"BlockSize": 256, "MPerBlock": 128, "NPerBlock": 256, "K0PerBlock": 4, "K1": 8, "MXdlPerWave": 2, "NXdlPerWave": 4, "CBlockTransferScalarPerVector_NWaveNPerXdl": 8}, + {"BlockSize": 128, "MPerBlock": 128, "NPerBlock": 128, "K0PerBlock": 4, "K1": 8, "MXdlPerWave": 4, "NXdlPerWave": 2, "CBlockTransferScalarPerVector_NWaveNPerXdl": 8}, + {"BlockSize": 256, "MPerBlock": 128, "NPerBlock": 128, "K0PerBlock": 4, "K1": 8, "MXdlPerWave": 2, "NXdlPerWave": 2, "CBlockTransferScalarPerVector_NWaveNPerXdl": 8}, + {"BlockSize": 128, "MPerBlock": 128, "NPerBlock": 64, "K0PerBlock": 4, "K1": 8, "MXdlPerWave": 2, "NXdlPerWave": 2, "CBlockTransferScalarPerVector_NWaveNPerXdl": 8}, + {"BlockSize": 128, "MPerBlock": 64, "NPerBlock": 128, "K0PerBlock": 4, "K1": 8, "MXdlPerWave": 2, "NXdlPerWave": 2, "CBlockTransferScalarPerVector_NWaveNPerXdl": 8}, + {"BlockSize": 64, "MPerBlock": 64, "NPerBlock": 64, "K0PerBlock": 4, "K1": 8, "MXdlPerWave": 2, "NXdlPerWave": 2, "CBlockTransferScalarPerVector_NWaveNPerXdl": 8}, + {"BlockSize": 256, "MPerBlock": 128, "NPerBlock": 64, "K0PerBlock": 4, "K1": 8, "MXdlPerWave": 2, "NXdlPerWave": 1, "CBlockTransferScalarPerVector_NWaveNPerXdl": 8}, + {"BlockSize": 256, "MPerBlock": 64, "NPerBlock": 128, "K0PerBlock": 4, "K1": 8, "MXdlPerWave": 1, "NXdlPerWave": 2, "CBlockTransferScalarPerVector_NWaveNPerXdl": 8}, + {"BlockSize": 128, "MPerBlock": 128, "NPerBlock": 32, "K0PerBlock": 4, "K1": 8, "MXdlPerWave": 2, "NXdlPerWave": 1, "CBlockTransferScalarPerVector_NWaveNPerXdl": 8}, + {"BlockSize": 128, "MPerBlock": 32, "NPerBlock": 128, "K0PerBlock": 4, "K1": 8, "MXdlPerWave": 1, "NXdlPerWave": 2, "CBlockTransferScalarPerVector_NWaveNPerXdl": 8}, + {"BlockSize": 64, "MPerBlock": 64, "NPerBlock": 32, "K0PerBlock": 4, "K1": 8, "MXdlPerWave": 2, "NXdlPerWave": 1, "CBlockTransferScalarPerVector_NWaveNPerXdl": 8}, + {"BlockSize": 64, "MPerBlock": 32, "NPerBlock": 64, "K0PerBlock": 4, "K1": 8, "MXdlPerWave": 1, "NXdlPerWave": 2, 
"CBlockTransferScalarPerVector_NWaveNPerXdl": 8} + ] + }, + + "BF16": { + "data_type": "BF16", + "accumulator": "FP32", + "note": "Similar instance pattern as FP16, ~15 instances" + }, + + "FP32_TF32": { + "data_type": "FP32", + "compute_type": "TF32", + "note": "Similar instance pattern, ~20 instances with TF32 compute" + } + } + }, + + "DeviceGroupedConvBwdWeight_TwoStage_Xdl_CShuffle": { + "implementation_file": "include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_weight_two_stage_xdl_cshuffle.hpp", + "instruction_set": "XDL/MFMA", + "version": "two_stage", + "status": "active", + + "instances": { + "FP16_NHWGC_generic": { + "data_type": "FP16", + "layout": "NHWGC/GKYXC/NHWGK", + "header_file": "library/include/ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_weight/device_grouped_conv_bwd_weight_two_stage_xdl_instance.hpp", + "template_alias": "device_grouped_conv_bwd_weight_two_stage_nhwgc_xdl_c_shuffle_f16_generic_instances", + "instance": { + "BlockSize": 64, + "MPerBlock": 16, + "NPerBlock": 16, + "K0PerBlock": 32, + "K1": 8, + "MPerXDL": 16, + "NPerXDL": 16, + "MXdlPerWave": 1, + "NXdlPerWave": 1, + "NumGroupsToMerge": 1 + } + }, + + "FP16_NHWGC": { + "data_type": "FP16", + "layout": "NHWGC/GKYXC/NHWGK", + "header_file": "library/include/ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_weight/device_grouped_conv_bwd_weight_two_stage_xdl_instance.hpp", + "template_alias": "device_grouped_conv_bwd_weight_two_stage_nhwgc_xdl_c_shuffle_f16_instances", + "all_instances": [ + {"BlockSize": 64, "MPerBlock": 16, "NPerBlock": 16, "K0PerBlock": 32, "K1": 8, "MPerXDL": 16, "NPerXDL": 16, "MXdlPerWave": 1, "NXdlPerWave": 1, "NumGroupsToMerge": 1}, + {"BlockSize": 64, "MPerBlock": 32, "NPerBlock": 64, "K0PerBlock": 32, "K1": 8, "MPerXDL": 32, "NPerXDL": 32, "MXdlPerWave": 1, "NXdlPerWave": 2, "NumGroupsToMerge": 2}, + {"BlockSize": 64, "MPerBlock": 32, "NPerBlock": 32, "K0PerBlock": 32, "K1": 8, "MXdlPerWave": 1, "NXdlPerWave": 1, 
"NumGroupsToMerge": 2}, + {"BlockSize": 64, "MPerBlock": 32, "NPerBlock": 64, "K0PerBlock": 32, "K1": 8, "MXdlPerWave": 1, "NXdlPerWave": 2, "NumGroupsToMerge": 4}, + {"BlockSize": 64, "MPerBlock": 32, "NPerBlock": 128, "K0PerBlock": 32, "K1": 8, "MXdlPerWave": 1, "NXdlPerWave": 4, "NumGroupsToMerge": 8}, + {"BlockSize": 64, "MPerBlock": 32, "NPerBlock": 32, "K0PerBlock": 32, "K1": 8, "MPerXDL": 32, "NPerXDL": 32, "MXdlPerWave": 1, "NXdlPerWave": 1, "NumGroupsToMerge": 2}, + {"BlockSize": 64, "MPerBlock": 64, "NPerBlock": 32, "K0PerBlock": 32, "K1": 8, "MXdlPerWave": 2, "NXdlPerWave": 1, "NumGroupsToMerge": 4}, + {"BlockSize": 64, "MPerBlock": 128, "NPerBlock": 32, "K0PerBlock": 32, "K1": 8, "MXdlPerWave": 4, "NXdlPerWave": 1, "NumGroupsToMerge": 8} + ] + }, + + "FP16_NHWGC_part2": { + "data_type": "FP16", + "layout": "NHWGC/GKYXC/NHWGK", + "template_alias": "device_grouped_conv_bwd_weight_two_stage_nhwgc_xdl_c_shuffle_f16_part2_instances", + "all_instances": [ + {"BlockSize": 256, "MPerBlock": 64, "NPerBlock": 64, "K0PerBlock": 64, "K1": 8, "MPerXDL": 32, "NPerXDL": 32, "MXdlPerWave": 1, "NXdlPerWave": 1, "NumGroupsToMerge": 1}, + {"BlockSize": 256, "MPerBlock": 64, "NPerBlock": 64, "K0PerBlock": 64, "K1": 8, "MXdlPerWave": 1, "NXdlPerWave": 1, "NumGroupsToMerge": 1}, + {"BlockSize": 256, "MPerBlock": 64, "NPerBlock": 64, "K0PerBlock": 64, "K1": 8, "MXdlPerWave": 1, "NXdlPerWave": 1, "NumGroupsToMerge": 1}, + {"BlockSize": 256, "MPerBlock": 64, "NPerBlock": 64, "K0PerBlock": 64, "K1": 8, "MXdlPerWave": 1, "NXdlPerWave": 1, "NumGroupsToMerge": 1}, + {"BlockSize": 256, "MPerBlock": 64, "NPerBlock": 64, "K0PerBlock": 64, "K1": 8, "MXdlPerWave": 1, "NXdlPerWave": 1, "NumGroupsToMerge": 1}, + {"BlockSize": 64, "MPerBlock": 16, "NPerBlock": 256, "K0PerBlock": 32, "K1": 8, "MPerXDL": 16, "NPerXDL": 16, "MXdlPerWave": 1, "NXdlPerWave": 16, "NumGroupsToMerge": 8}, + {"BlockSize": 64, "MPerBlock": 16, "NPerBlock": 128, "K0PerBlock": 32, "K1": 8, "MXdlPerWave": 1, 
"NXdlPerWave": 8, "NumGroupsToMerge": 4} + ] + }, + + "FP16_NHWGC_irregular": { + "data_type": "FP16", + "layout": "NHWGC/GKYXC/NHWGK", + "variant": "irregular block sizes", + "template_alias": "device_grouped_conv_bwd_weight_two_stage_nhwgc_xdl_c_shuffle_f16_irregular_instances", + "all_instances": [ + {"BlockSize": 64, "MPerBlock": 48, "NPerBlock": 64, "K0PerBlock": 32, "K1": 8, "MPerXDL": 16, "NPerXDL": 16, "MXdlPerWave": 3, "NXdlPerWave": 4, "NumGroupsToMerge": 1, "note": "Irregular MPerBlock=48"}, + {"BlockSize": 64, "MPerBlock": 64, "NPerBlock": 48, "K0PerBlock": 32, "K1": 8, "MXdlPerWave": 4, "NXdlPerWave": 3, "NumGroupsToMerge": 1, "note": "Irregular NPerBlock=48"}, + {"BlockSize": 64, "MPerBlock": 64, "NPerBlock": 80, "K0PerBlock": 32, "K1": 8, "MXdlPerWave": 4, "NXdlPerWave": 5, "NumGroupsToMerge": 1, "note": "Irregular NPerBlock=80"}, + {"BlockSize": 64, "MPerBlock": 64, "NPerBlock": 112, "K0PerBlock": 32, "K1": 8, "MXdlPerWave": 4, "NXdlPerWave": 7, "NumGroupsToMerge": 1, "note": "Irregular NPerBlock=112"}, + {"BlockSize": 64, "MPerBlock": 64, "NPerBlock": 208, "K0PerBlock": 32, "K1": 8, "MXdlPerWave": 4, "NXdlPerWave": 13, "NumGroupsToMerge": 1, "note": "Irregular NPerBlock=208"} + ] + }, + + "BF16_NHWGC": { + "data_type": "BF16", + "layout": "NHWGC/GKYXC/NHWGK", + "note": "Similar patterns as FP16, multiple pipeline versions (v1, v2, v5)" + } + } + }, + + "DeviceGroupedConvBwdWeight_TwoStage_Wmma_CShuffleV3": { + "implementation_file": "include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_weight_two_stage_wmma_cshuffle_v3.hpp", + "instruction_set": "WMMA", + "version": "two_stage_v3", + "status": "active", + + "instances": { + "FP16": { + "data_type": "FP16", + "accumulator": "FP32", + "header_file": "library/include/ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_weight/device_grouped_conv_bwd_weight_two_stage_wmma_instance.hpp", + "template_alias": 
"device_grouped_conv_bwd_weight_two_stage_nhwgc_wmma_c_shuffle_f16_instances", + "instance": { + "BlockSize": 32, + "MPerBlock": 16, + "NPerBlock": 16, + "K0PerBlock": 32, + "K1": 8, + "MPerWmma": 16, + "NPerWmma": 16, + "MRepeat": 1, + "NRepeat": 1, + "NumGroupsToMerge": 1, + "CBlockTransferScalarPerVector_NPerBlock": 1 + } + }, + + "BF16": { + "data_type": "BF16", + "accumulator": "FP32", + "header_file": "library/include/ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_weight/device_grouped_conv_bwd_weight_two_stage_wmma_instance.hpp", + "template_alias": "device_grouped_conv_bwd_weight_two_stage_nhwgc_wmma_c_shuffle_bf16_instances", + "instance": { + "BlockSize": 32, + "MPerBlock": 16, + "NPerBlock": 16, + "K0PerBlock": 32, + "K1": 8, + "MRepeat": 1, + "NRepeat": 1, + "NumGroupsToMerge": 1 + } + } + } + }, + + "DeviceGroupedConvBwdWeight_DL": { + "implementation_file": "include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_weight_dl.hpp", + "instruction_set": "Direct Load", + "version": "dl", + "status": "active", + + "instances": { + "FP32": { + "data_type": "FP32", + "header_file": "library/include/ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_weight/device_grouped_conv_bwd_weight_dl_instance.hpp", + "template_alias": "device_grouped_conv_bwd_weight_dl_f32_instances", + "instance": { + "BlockSize": 256, + "MPerBlock": 128, + "NPerBlock": 128, + "K0PerBlock": 16, + "K1": 1, + "M1PerThread": 4, + "N1PerThread": 4, + "KPerThread": 1 + } + }, + + "FP16": { + "data_type": "FP16", + "accumulator": "FP32", + "header_file": "library/include/ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_weight/device_grouped_conv_bwd_weight_dl_instance.hpp", + "template_alias": "device_grouped_conv_bwd_weight_dl_f16_instances", + "instance": { + "BlockSize": 256, + "MPerBlock": 128, + "NPerBlock": 128, + "K0PerBlock": 16, + "K1": 1, + "M1PerThread": 4, + "N1PerThread": 4, + "KPerThread": 1 + } + }, + + "BF16_mixed": { + "data_type": 
"BF16", + "weight_type": "FP32", + "output_type": "BF16", + "accumulator": "FP32", + "header_file": "library/include/ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_weight/device_grouped_conv_bwd_weight_dl_instance.hpp", + "template_alias": "device_grouped_conv_bwd_weight_dl_bf16_instances", + "instance": { + "BlockSize": 256, + "MPerBlock": 128, + "NPerBlock": 128, + "K0PerBlock": 16, + "K1": 1, + "note": "Mixed precision: BF16 input/output, FP32 weights" + } + } + } + }, + + "DeviceGroupedConvBwdWeightMultipleD_Xdl_CShuffle": { + "implementation_file": "include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_weight_multiple_d_xdl_cshuffle.hpp", + "instruction_set": "XDL/MFMA", + "version": "multiple_d", + "status": "active", + "fused_operations": ["Bilinear", "Scale"], + + "instances": { + "FP32_Bilinear": { + "data_type": "FP32", + "element_wise_op": "Bilinear", + "header_file": "library/include/ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_weight/device_grouped_conv_bwd_weight_xdl_bilinear_instance.hpp", + "template_alias": "device_grouped_conv_bwd_weight_xdl_c_shuffle_f32_bilinear_instances", + "all_instances": [ + {"BlockSize": 64, "MPerBlock": 64, "NPerBlock": 64, "K0PerBlock": 4, "K1": 4, "MPerXDL": 32, "NPerXDL": 32, "MXdlPerWave": 2, "NXdlPerWave": 2, "CBlockTransferScalarPerVector_NWaveNPerXdl": 1, "DsTensors": "Tuple"}, + {"BlockSize": 128, "MPerBlock": 128, "NPerBlock": 32, "K0PerBlock": 4, "K1": 4, "MXdlPerWave": 2, "NXdlPerWave": 1, "CBlockTransferScalarPerVector_NWaveNPerXdl": 1}, + {"BlockSize": 64, "MPerBlock": 32, "NPerBlock": 64, "K0PerBlock": 4, "K1": 4, "MXdlPerWave": 1, "NXdlPerWave": 2, "CBlockTransferScalarPerVector_NWaveNPerXdl": 4}, + {"BlockSize": 256, "MPerBlock": 256, "NPerBlock": 128, "K0PerBlock": 4, "K1": 4, "MXdlPerWave": 4, "NXdlPerWave": 2, "CBlockTransferScalarPerVector_NWaveNPerXdl": 4}, + {"BlockSize": 256, "MPerBlock": 128, "NPerBlock": 256, "K0PerBlock": 4, "K1": 4, "MXdlPerWave": 2, 
"NXdlPerWave": 4, "CBlockTransferScalarPerVector_NWaveNPerXdl": 4}, + {"BlockSize": 128, "MPerBlock": 128, "NPerBlock": 128, "K0PerBlock": 4, "K1": 4, "MXdlPerWave": 4, "NXdlPerWave": 2, "CBlockTransferScalarPerVector_NWaveNPerXdl": 4}, + {"BlockSize": 256, "MPerBlock": 128, "NPerBlock": 128, "K0PerBlock": 4, "K1": 4, "MXdlPerWave": 2, "NXdlPerWave": 2, "CBlockTransferScalarPerVector_NWaveNPerXdl": 4}, + {"BlockSize": 128, "MPerBlock": 128, "NPerBlock": 64, "K0PerBlock": 4, "K1": 4, "MXdlPerWave": 2, "NXdlPerWave": 2, "CBlockTransferScalarPerVector_NWaveNPerXdl": 4}, + {"BlockSize": 128, "MPerBlock": 64, "NPerBlock": 128, "K0PerBlock": 4, "K1": 4, "MXdlPerWave": 2, "NXdlPerWave": 2, "CBlockTransferScalarPerVector_NWaveNPerXdl": 4}, + {"BlockSize": 64, "MPerBlock": 64, "NPerBlock": 64, "K0PerBlock": 4, "K1": 4, "MXdlPerWave": 2, "NXdlPerWave": 2, "CBlockTransferScalarPerVector_NWaveNPerXdl": 4}, + {"BlockSize": 256, "MPerBlock": 128, "NPerBlock": 64, "K0PerBlock": 4, "K1": 4, "MXdlPerWave": 2, "NXdlPerWave": 1, "CBlockTransferScalarPerVector_NWaveNPerXdl": 4}, + {"BlockSize": 256, "MPerBlock": 64, "NPerBlock": 128, "K0PerBlock": 4, "K1": 4, "MXdlPerWave": 1, "NXdlPerWave": 2, "CBlockTransferScalarPerVector_NWaveNPerXdl": 4}, + {"BlockSize": 128, "MPerBlock": 128, "NPerBlock": 32, "K0PerBlock": 4, "K1": 4, "MXdlPerWave": 2, "NXdlPerWave": 1, "CBlockTransferScalarPerVector_NWaveNPerXdl": 4}, + {"BlockSize": 128, "MPerBlock": 32, "NPerBlock": 128, "K0PerBlock": 4, "K1": 4, "MXdlPerWave": 1, "NXdlPerWave": 2, "CBlockTransferScalarPerVector_NWaveNPerXdl": 4}, + {"BlockSize": 64, "MPerBlock": 64, "NPerBlock": 32, "K0PerBlock": 4, "K1": 4, "MXdlPerWave": 2, "NXdlPerWave": 1, "CBlockTransferScalarPerVector_NWaveNPerXdl": 4}, + {"BlockSize": 64, "MPerBlock": 32, "NPerBlock": 64, "K0PerBlock": 4, "K1": 4, "MXdlPerWave": 1, "NXdlPerWave": 2, "CBlockTransferScalarPerVector_NWaveNPerXdl": 4} + ] + }, + + "FP16_Bilinear": { + "data_type": "FP16", + "element_wise_op": "Bilinear", 
+ "header_file": "library/include/ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_weight/device_grouped_conv_bwd_weight_xdl_bilinear_instance.hpp", + "template_alias": "device_grouped_conv_bwd_weight_xdl_c_shuffle_f16_bilinear_instances", + "all_instances": [ + {"BlockSize": 64, "MPerBlock": 64, "NPerBlock": 64, "K0PerBlock": 4, "K1": 8, "MXdlPerWave": 2, "NXdlPerWave": 2, "CBlockTransferScalarPerVector_NWaveNPerXdl": 1, "DsTensors": "Tuple"}, + {"BlockSize": 64, "MPerBlock": 64, "NPerBlock": 64, "K0PerBlock": 4, "K1": 8, "MXdlPerWave": 2, "NXdlPerWave": 2, "CBlockTransferScalarPerVector_NWaveNPerXdl": 2}, + {"BlockSize": 128, "MPerBlock": 128, "NPerBlock": 32, "K0PerBlock": 4, "K1": 8, "MXdlPerWave": 2, "NXdlPerWave": 1, "CBlockTransferScalarPerVector_NWaveNPerXdl": 2}, + {"BlockSize": 64, "MPerBlock": 32, "NPerBlock": 64, "K0PerBlock": 4, "K1": 8, "MXdlPerWave": 1, "NXdlPerWave": 2, "CBlockTransferScalarPerVector_NWaveNPerXdl": 8}, + {"BlockSize": 256, "MPerBlock": 256, "NPerBlock": 128, "K0PerBlock": 4, "K1": 8, "MXdlPerWave": 4, "NXdlPerWave": 2, "CBlockTransferScalarPerVector_NWaveNPerXdl": 8}, + {"BlockSize": 256, "MPerBlock": 128, "NPerBlock": 256, "K0PerBlock": 4, "K1": 8, "MXdlPerWave": 2, "NXdlPerWave": 4, "CBlockTransferScalarPerVector_NWaveNPerXdl": 8}, + {"BlockSize": 128, "MPerBlock": 128, "NPerBlock": 128, "K0PerBlock": 4, "K1": 8, "MXdlPerWave": 4, "NXdlPerWave": 2, "CBlockTransferScalarPerVector_NWaveNPerXdl": 8}, + {"BlockSize": 256, "MPerBlock": 128, "NPerBlock": 128, "K0PerBlock": 4, "K1": 8, "MXdlPerWave": 2, "NXdlPerWave": 2, "CBlockTransferScalarPerVector_NWaveNPerXdl": 8}, + {"BlockSize": 128, "MPerBlock": 128, "NPerBlock": 64, "K0PerBlock": 4, "K1": 8, "MXdlPerWave": 2, "NXdlPerWave": 2, "CBlockTransferScalarPerVector_NWaveNPerXdl": 8}, + {"BlockSize": 128, "MPerBlock": 64, "NPerBlock": 128, "K0PerBlock": 4, "K1": 8, "MXdlPerWave": 2, "NXdlPerWave": 2, "CBlockTransferScalarPerVector_NWaveNPerXdl": 8}, + {"BlockSize": 64, 
"MPerBlock": 64, "NPerBlock": 64, "K0PerBlock": 4, "K1": 8, "MXdlPerWave": 2, "NXdlPerWave": 2, "CBlockTransferScalarPerVector_NWaveNPerXdl": 8}, + {"BlockSize": 256, "MPerBlock": 128, "NPerBlock": 64, "K0PerBlock": 4, "K1": 8, "MXdlPerWave": 2, "NXdlPerWave": 1, "CBlockTransferScalarPerVector_NWaveNPerXdl": 8}, + {"BlockSize": 256, "MPerBlock": 64, "NPerBlock": 128, "K0PerBlock": 4, "K1": 8, "MXdlPerWave": 1, "NXdlPerWave": 2, "CBlockTransferScalarPerVector_NWaveNPerXdl": 8}, + {"BlockSize": 128, "MPerBlock": 128, "NPerBlock": 32, "K0PerBlock": 4, "K1": 8, "MXdlPerWave": 2, "NXdlPerWave": 1, "CBlockTransferScalarPerVector_NWaveNPerXdl": 8}, + {"BlockSize": 128, "MPerBlock": 32, "NPerBlock": 128, "K0PerBlock": 4, "K1": 8, "MXdlPerWave": 1, "NXdlPerWave": 2, "CBlockTransferScalarPerVector_NWaveNPerXdl": 8}, + {"BlockSize": 64, "MPerBlock": 64, "NPerBlock": 32, "K0PerBlock": 4, "K1": 8, "MXdlPerWave": 2, "NXdlPerWave": 1, "CBlockTransferScalarPerVector_NWaveNPerXdl": 8}, + {"BlockSize": 64, "MPerBlock": 32, "NPerBlock": 64, "K0PerBlock": 4, "K1": 8, "MXdlPerWave": 1, "NXdlPerWave": 2, "CBlockTransferScalarPerVector_NWaveNPerXdl": 8} + ] + } + } + }, + + "DeviceGroupedConvBwdWeightMultipleD_Wmma_CShuffleV3": { + "implementation_file": "include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_weight_multiple_d_wmma_cshuffle_v3.hpp", + "instruction_set": "WMMA", + "version": "multiple_d_v3", + "status": "active", + "fused_operations": ["Scale"], + + "instances": { + "FP16_Scale": { + "data_type": "FP16", + "accumulator": "FP32", + "element_wise_op": "Scale", + "header_file": "library/include/ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_weight/device_grouped_conv_bwd_weight_wmma_scale_instance.hpp", + "template_alias": "device_grouped_conv_bwd_weight_wmma_c_shuffle_f16_scale_instances", + "all_instances": [ + {"BlockSize": 64, "MPerBlock": 64, "NPerBlock": 64, "K0PerBlock": 32, "K1": 8, "MRepeat": 4, "NRepeat": 2, 
"CBlockTransferScalarPerVector_NPerBlock": 2}, + {"BlockSize": 64, "MPerBlock": 32, "NPerBlock": 32, "K0PerBlock": 32, "K1": 8, "MRepeat": 2, "NRepeat": 1, "CBlockTransferScalarPerVector_NPerBlock": 2}, + {"BlockSize": 128, "MPerBlock": 128, "NPerBlock": 128, "K0PerBlock": 32, "K1": 8, "MRepeat": 8, "NRepeat": 2, "CBlockTransferScalarPerVector_NPerBlock": 8}, + {"BlockSize": 128, "MPerBlock": 128, "NPerBlock": 128, "K0PerBlock": 32, "K1": 8, "MRepeat": 8, "NRepeat": 2, "CBlockTransferScalarPerVector_NPerBlock": 8, "note": "With LDS extra"}, + {"BlockSize": 64, "MPerBlock": 64, "NPerBlock": 64, "K0PerBlock": 64, "K1": 8, "MRepeat": 4, "NRepeat": 2, "CBlockTransferScalarPerVector_NPerBlock": 8}, + {"BlockSize": 256, "MPerBlock": 128, "NPerBlock": 256, "K0PerBlock": 64, "K1": 8, "MRepeat": 8, "NRepeat": 2, "CBlockTransferScalarPerVector_NPerBlock": 8}, + {"BlockSize": 128, "MPerBlock": 48, "NPerBlock": 64, "K0PerBlock": 128, "K1": 8, "MRepeat": 3, "NRepeat": 1, "CBlockTransferScalarPerVector_NPerBlock": 8}, + {"BlockSize": 128, "MPerBlock": 96, "NPerBlock": 128, "K0PerBlock": 64, "K1": 8, "MRepeat": 6, "NRepeat": 2, "CBlockTransferScalarPerVector_NPerBlock": 8}, + {"BlockSize": 128, "MPerBlock": 64, "NPerBlock": 64, "K0PerBlock": 128, "K1": 8, "MRepeat": 4, "NRepeat": 1, "CBlockTransferScalarPerVector_NPerBlock": 8}, + {"BlockSize": 256, "MPerBlock": 96, "NPerBlock": 128, "K0PerBlock": 128, "K1": 8, "MRepeat": 6, "NRepeat": 1, "CBlockTransferScalarPerVector_NPerBlock": 8} + ] + }, + + "BF16_Scale": { + "data_type": "BF16", + "weight_type": "FP32", + "output_type": "BF16", + "accumulator": "FP32", + "element_wise_op": "Scale", + "note": "Similar instance pattern as FP16, ~13 instances with mixed precision" + } + } + } + }, + + "instance_count_summary": { + "DeviceGroupedConvBwdWeight_Xdl_CShuffleV3": { + "FP16": 7, + "BF16": 7, + "FP32": 1, + "FP32_TF32": 1, + "total": "~16 base instances, multiply by layouts and pipeline variants" + }, + 
"DeviceGroupedConvBwdWeight_Wmma_CShuffleV3": { + "FP16": 9, + "BF16": 10, + "total": 19 + }, + "DeviceGroupedConvBwdWeight_Xdl_CShuffle": { + "FP32": 17, + "FP32_TF32": 20, + "FP16": 16, + "BF16": 15, + "BF16_F32_BF16_mixed": 14, + "FP16_comp_BF8_F8": 15, + "total": "~100+" + }, + "DeviceGroupedConvBwdWeight_Wmma_CShuffle": { + "note": "3D only, limited instances" + }, + "DeviceGroupedConvBwdWeight_TwoStage_Xdl_CShuffle": { + "FP16_NHWGC": 15, + "FP16_NHWGC_irregular": 5, + "FP16_NGCHW": 15, + "BF16_NHWGC": 15, + "BF16_NGCHW": 15, + "total": "~65+ (across pipev1/v2/v5 and layouts)" + }, + "DeviceGroupedConvBwdWeight_TwoStage_Wmma_CShuffleV3": { + "FP16": 1, + "BF16": 1, + "total": 2 + }, + "DeviceGroupedConvBwdWeight_DL": { + "FP32": 1, + "FP16": 1, + "BF16_mixed": 1, + "total": "3 base (multiply by layouts: ~6-9)" + }, + "DeviceGroupedConvBwdWeightMultipleD_Xdl_CShuffle": { + "FP32_Bilinear": 16, + "FP32_TF32_Bilinear": 14, + "FP16_Bilinear": 17, + "BF16_Bilinear": 15, + "FP16_comp_BF8_F8_Bilinear": 15, + "note": "Similar counts for Scale variants", + "total": "~100+" + }, + "DeviceGroupedConvBwdWeightMultipleD_Wmma_CShuffleV3": { + "FP16_Scale": 10, + "BF16_Scale": 13, + "total": 23 + } + }, + + "file_references_by_algorithm": { + "DeviceGroupedConvBwdWeight_Xdl_CShuffleV3": { + "implementation": "include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_weight_xdl_cshuffle_v3.hpp", + "instances_header": "library/include/ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_weight/device_grouped_conv_bwd_weight_v3_xdl_instance.hpp", + "cpp_files": [ + "library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/gnhwc_gkyxc_gnhwk/device_grouped_conv2d_bwd_weight_v3_xdl_gnhwc_gkyxc_gnhwk_f16_default_pipev1_instance.cpp", + "library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/nhwgc_gkyxc_nhwgk/device_grouped_conv2d_bwd_weight_v3_xdl_nhwgc_gkyxc_nhwgk_f16_default_pipev1_instance.cpp" + ] + }, +
"DeviceGroupedConvBwdWeight_Wmma_CShuffleV3": { + "implementation": "include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_weight_wmma_cshuffle_v3.hpp", + "instances_header": "library/include/ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_weight/device_grouped_conv_bwd_weight_v3_wmma_instance.hpp", + "cpp_files": [ + "library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/wmma/nhwgc_gkyxc_nhwgk/device_grouped_conv2d_bwd_weight_wmma_nhwgc_gkyxc_nhwgk_f16_instance.cpp", + "library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/wmma/nhwgc_gkyxc_nhwgk/device_grouped_conv2d_bwd_weight_wmma_nhwgc_gkyxc_nhwgk_bf16_instance.cpp" + ] + }, + "DeviceGroupedConvBwdWeight_Xdl_CShuffle": { + "implementation": "include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_weight_xdl_cshuffle.hpp", + "instances_header": "library/include/ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_weight/device_grouped_conv_bwd_weight_xdl_instance.hpp", + "cpp_files": [ + "library/src/tensor_operation_instance/gpu/grouped_conv1d_bwd_weight/xdl/device_grouped_conv1d_bwd_weight_xdl_gnwc_gkxc_gnwk_*.cpp", + "library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/*/device_grouped_conv2d_bwd_weight_xdl_*_*.cpp", + "library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/*.cpp" + ] + }, + "DeviceGroupedConvBwdWeight_TwoStage_Xdl_CShuffle": { + "implementation": "include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_weight_two_stage_xdl_cshuffle.hpp", + "instances_header": "library/include/ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_weight/device_grouped_conv_bwd_weight_two_stage_xdl_instance.hpp", + "cpp_files": [ + "library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/nhwgc_gkyxc_nhwgk/device_grouped_conv2d_bwd_weight_two_stage_xdl_nhwgc_gkyxc_nhwgk_*_pipev*.cpp", + 
"library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/ngchw_gkcyx_ngkhw/device_grouped_conv2d_bwd_weight_two_stage_xdl_ngchw_gkcyx_ngkhw_*_pipev*.cpp", + "library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/ngchw_gkyxc_ngkhw/device_grouped_conv2d_bwd_weight_two_stage_xdl_ngchw_gkyxc_ngkhw_*_pipev*.cpp" + ] + }, + "DeviceGroupedConvBwdWeight_TwoStage_Wmma_CShuffleV3": { + "implementation": "include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_weight_two_stage_wmma_cshuffle_v3.hpp", + "instances_header": "library/include/ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_weight/device_grouped_conv_bwd_weight_two_stage_wmma_instance.hpp", + "cpp_files": [ + "library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/wmma/nhwgc_gkyxc_nhwgk/device_grouped_conv2d_bwd_weight_two_stage_wmma_nhwgc_gkyxc_nhwgk_*_pipev1_instance.cpp" + ] + }, + "DeviceGroupedConvBwdWeight_DL": { + "implementation": "include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_weight_dl.hpp", + "instances_header": "library/include/ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_weight/device_grouped_conv_bwd_weight_dl_instance.hpp", + "cpp_files": [ + "library/src/tensor_operation_instance/gpu/grouped_conv1d_bwd_weight/dl/device_grouped_conv1d_bwd_weight_dl_*.cpp", + "library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/dl/device_grouped_conv2d_bwd_weight_dl_*.cpp", + "library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/dl/device_grouped_conv3d_bwd_weight_dl_*.cpp" + ] + }, + "DeviceGroupedConvBwdWeightMultipleD_Xdl_CShuffle": { + "implementation": "include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_weight_multiple_d_xdl_cshuffle.hpp", + "instances_headers": [ + "library/include/ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_weight/device_grouped_conv_bwd_weight_xdl_bilinear_instance.hpp", + 
"library/include/ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_weight/device_grouped_conv_bwd_weight_xdl_scale_instance.hpp" + ], + "cpp_files": [ + "library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight_bilinear/xdl/*.cpp", + "library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight_scale/xdl/*.cpp" + ] + }, + "DeviceGroupedConvBwdWeightMultipleD_Wmma_CShuffleV3": { + "implementation": "include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_weight_multiple_d_wmma_cshuffle_v3.hpp", + "instances_header": "library/include/ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_weight/device_grouped_conv_bwd_weight_wmma_scale_instance.hpp", + "cpp_files": [ + "library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight_scale/wmma/*.cpp" + ] + } + }, + + "notes": { + "total_instances": "Estimated 300-400+ backward weight instances across all 9 algorithms", + "instance_multipliers": "Each instance template is instantiated for multiple layout combinations and specializations", + "pipeline_variants": "Two-stage and V3 algorithms have multiple pipeline versions, multiplying instance count", + "layout_support": "V1 XDL supports more layouts (including NCHW) than V3", + "irregular_blocks": "Two-stage and V3 variants support non-power-of-2 MPerBlock/NPerBlock tile sizes (48, 80, 96, 112, 208)", + "fused_operations": "MultipleD variants support Bilinear and Scale fused operations with additional D tensors" + } +}