mirror of
https://github.com/ROCm/composable_kernel.git
synced 2026-05-12 09:16:52 +00:00
[CK][CK TILE] Dispatcher kernel selection heuristic for grouped conv (#6327) ## Motivation The ML heuristic in the dispatcher does not support the grouped-conv operator yet. In this PR, support for fwd, bwd-data, and bwd-weight grouped-conv kernels has been added. A tile_engine utility has also been added to compile and run any selected kernel configuration through the dispatcher infrastructure. ## Technical Details 1. A tile_engine utility is added to benchmark each shape with all the possible kernel+tile_size combinations here - [https://github.com/ROCm/rocm-libraries/blob/users/yraparti/ck/dispatcher-grouped-conv-heuristics/projects/composablekernel/tile_engine/ops/grouped_conv/grouped_conv_full_benchmark.py](url) 2. New LGBM regressor models for grouped conv are added to the models directory. We have 3 separate models for fwd, bwd-data, and bwd-weight [https://github.com/ROCm/rocm-libraries/tree/users/yraparti/ck/dispatcher-grouped-conv-heuristics/projects/composablekernel/dispatcher/heuristics/models](url) 3. Implemented lazy GPU initialization (dispatcher/python) - **Issue**: ProcessPoolExecutor fork() + GPU context caused memory access faults - **Solution**: Mirror FMHA pattern - defer GPU initialization until first run() - **Changes**: - setup_multiple_grouped_conv_dispatchers() returns List[Path], not loaded libs - GpuGroupedConvRunner.__init__() no longer calls ctypes.CDLL - Added _ensure_initialized() method for lazy GPU loading - GPU context created only on first run() call - **Benefit**: Parallel compilation now works without GPU conflicts 4. 
Addressed a few miscellaneous issues, such as: - Fixed BF16->FP16 naming bug in the dispatcher wrapper - Added new tile sizes, and comp_v5 pipeline to the arch spec to expand the kernel selection - Added automatic padding support for unsupported shapes in dispatcher runner - Created a single source of truth between tile_engine and dispatcher about the architecture and tile_size details - Built validation scripts to compare oracle_best vs ml_heuristic results ## Test Plan 1. Validated fwd, bwd-data, and bwd-weight kernels with both known and unseen data sets with up to 300 problems. 2. Ensured that test cases are added in both dispatcher and tile_engine to validate the heuristic. ## Test Result Results on Unseen shapes validated on gfx950 #### Forward Pass Model - **Training Data**: 48,845 measurements across 1,372 unique problem shapes - **Validation Set**: 300 unseen problems from model crawler - **Validation Performance** (vs. oracle): - Mean Efficiency: **93.05%** - Median Efficiency: **96.8%** - P10 Efficiency: **79.9%** #### Backward Data Gradient (bwd_data) Model - **Training Data**: 18,773 measurements across 891 unique problem shapes - **Validation Set**: 300 unseen problems from model crawler - **Validation Performance** (vs. oracle): - Mean Efficiency: **93.8%** - Median Efficiency: **96.5%** - P10 Efficiency: **82.9%** #### Backward Weight Gradient (bwd_weight) Model - **Training Data**: 34,900 measurements across 1,508 unique problem shapes - **Validation Set**: 300 unseen problems from model crawler - **Validation Performance** (vs. oracle): - Mean Efficiency: **96.1%** - Median Efficiency: **99.2%** - P10 Efficiency: **89.4%** ## Submission Checklist - [x] Look over the contributing guidelines at https://github.com/ROCm/ROCm/blob/develop/CONTRIBUTING.md#pull-requests.
273 lines
9.3 KiB
JSON
273 lines
9.3 KiB
JSON
{
|
|
"_comment": "Single source of truth for GPU architecture specifications. Edit this file to add new GPU support.",
|
|
"_version": "1.2.0",
|
|
"_instructions": "See ADDING_NEW_GPU.md for instructions on adding new GPU support.",
|
|
"_supported_arch_note": "CK Tile supports: GFX9 (gfx908, gfx90a, gfx942, gfx950), GFX10.3 (gfx103x), GFX11 (gfx110x, gfx115x), GFX12 (gfx120x)",
|
|
|
|
"architectures": {
|
|
"gfx908": {
|
|
"family": "cdna1",
|
|
"target_family": "gfx9",
|
|
"architecture": "cdna",
|
|
"description": "AMD Instinct MI100",
|
|
"warp_size": 64,
|
|
"lds_capacity_kb": 64,
|
|
"warp_configs": [
|
|
[1, 4, 1],
|
|
[2, 2, 1],
|
|
[4, 1, 1]
|
|
],
|
|
"warp_tile_combos": {
|
|
"fp32_fp32_fp32": [[16, 16, 4], [16, 16, 16]],
|
|
"fp16_fp16_fp32": [[32, 32, 8], [16, 16, 16], [32, 32, 16], [16, 16, 32]],
|
|
"bf16_bf16_fp32": [[32, 32, 8], [16, 16, 16], [32, 32, 16], [16, 16, 32]],
|
|
"int8_int8_int32": [[32, 32, 16], [16, 16, 32]]
|
|
}
|
|
},
|
|
|
|
"gfx90a": {
|
|
"family": "cdna2",
|
|
"target_family": "gfx9",
|
|
"architecture": "cdna",
|
|
"description": "AMD Instinct MI200 series",
|
|
"warp_size": 64,
|
|
"lds_capacity_kb": 64,
|
|
"warp_configs": [
|
|
[1, 4, 1],
|
|
[2, 2, 1],
|
|
[4, 1, 1]
|
|
],
|
|
"warp_tile_combos": {
|
|
"fp32_fp32_fp32": [[16, 16, 4], [16, 16, 16]],
|
|
"fp16_fp16_fp32": [[32, 32, 8], [16, 16, 16], [32, 32, 16], [16, 16, 32], [4, 64, 16], [64, 4, 16]],
|
|
"bf16_bf16_fp32": [[32, 32, 8], [16, 16, 16], [32, 32, 16], [16, 16, 32], [4, 64, 16], [64, 4, 16]],
|
|
"fp8_fp8_fp32": [[32, 32, 16], [32, 32, 32]],
|
|
"bf8_bf8_fp32": [[32, 32, 16], [32, 32, 32]],
|
|
"int8_int8_int32": [[32, 32, 16], [16, 16, 32]]
|
|
}
|
|
},
|
|
|
|
"gfx942": {
|
|
"family": "cdna3",
|
|
"target_family": "gfx9",
|
|
"architecture": "cdna",
|
|
"description": "AMD Instinct MI300 series",
|
|
"warp_size": 64,
|
|
"lds_capacity_kb": 64,
|
|
"warp_configs": [
|
|
[1, 4, 1],
|
|
[2, 2, 1],
|
|
[4, 1, 1]
|
|
],
|
|
"warp_tile_combos": {
|
|
"fp32_fp32_fp32": [[16, 16, 4], [16, 16, 16]],
|
|
"fp16_fp16_fp32": [[32, 32, 8], [16, 16, 16], [32, 32, 16], [16, 16, 32], [4, 64, 16], [64, 4, 16]],
|
|
"bf16_bf16_fp32": [[32, 32, 8], [16, 16, 16], [32, 32, 16], [16, 16, 32], [4, 64, 16], [64, 4, 16]],
|
|
"fp8_fp8_fp32": [[32, 32, 16], [32, 32, 32], [16, 16, 32], [16, 16, 64]],
|
|
"fp8_bf8_fp32": [[32, 32, 16], [16, 16, 32], [32, 32, 32]],
|
|
"bf8_fp8_fp32": [[32, 32, 16]],
|
|
"bf8_bf8_fp32": [[32, 32, 16], [32, 32, 32], [16, 16, 32], [16, 16, 64]],
|
|
"int8_int8_int32": [[32, 32, 16], [16, 16, 32]]
|
|
}
|
|
},
|
|
|
|
"gfx950": {
|
|
"family": "cdna4",
|
|
"target_family": "gfx9",
|
|
"architecture": "cdna",
|
|
"description": "AMD Instinct MI350 series",
|
|
"warp_size": 64,
|
|
"lds_capacity_kb": 160,
|
|
"warp_configs": [
|
|
[1, 4, 1],
|
|
[2, 2, 1],
|
|
[4, 1, 1],
|
|
[8, 2, 1],
|
|
[4, 4, 1]
|
|
],
|
|
"warp_tile_combos": {
|
|
"fp32_fp32_fp32": [[16, 16, 4], [16, 16, 16]],
|
|
"fp16_fp16_fp32": [[32, 32, 8], [16, 16, 16], [32, 32, 16], [16, 16, 32], [4, 64, 16], [64, 4, 16]],
|
|
"bf16_bf16_fp32": [[32, 32, 8], [16, 16, 16], [32, 32, 16], [16, 16, 32], [4, 64, 16], [64, 4, 16]],
|
|
"fp8_fp8_fp32": [[32, 32, 16], [32, 32, 32], [16, 16, 32], [16, 16, 64], [16, 16, 128], [32, 32, 64]],
|
|
"fp8_bf8_fp32": [[32, 32, 16], [32, 32, 32], [16, 16, 32], [16, 16, 128], [32, 32, 64]],
|
|
"bf8_fp8_fp32": [[32, 32, 16], [16, 16, 128], [32, 32, 64]],
|
|
"bf8_bf8_fp32": [[32, 32, 16], [32, 32, 32], [16, 16, 32], [16, 16, 64], [16, 16, 128], [32, 32, 64]],
|
|
"int8_int8_int32": [[32, 32, 16], [16, 16, 32]],
|
|
"pk_fp4_pk_fp4_fp32": [[16, 16, 128]]
|
|
}
|
|
},
|
|
|
|
"gfx1100": {
|
|
"family": "rdna3",
|
|
"target_family": "gfx11",
|
|
"architecture": "rdna",
|
|
"description": "AMD Radeon RX 7900 series (RDNA3)",
|
|
"warp_size": 32,
|
|
"lds_capacity_kb": 64,
|
|
"warp_configs": [
|
|
[2, 4, 1],
|
|
[1, 8, 1],
|
|
[8, 1, 1],
|
|
[4, 2, 1]
|
|
],
|
|
"warp_tile_combos": {
|
|
"fp16_fp16_fp32": [[16, 16, 16]],
|
|
"bf16_bf16_fp32": [[16, 16, 16]],
|
|
"int8_int8_int32": [[16, 16, 16]]
|
|
}
|
|
},
|
|
|
|
"gfx1200": {
|
|
"family": "rdna4",
|
|
"target_family": "gfx12",
|
|
"architecture": "rdna",
|
|
"description": "AMD Radeon RX 9000 series (RDNA4)",
|
|
"warp_size": 32,
|
|
"lds_capacity_kb": 64,
|
|
"warp_configs": [
|
|
[2, 4, 1],
|
|
[1, 8, 1],
|
|
[8, 1, 1],
|
|
[4, 2, 1]
|
|
],
|
|
"warp_tile_combos": {
|
|
"fp16_fp16_fp32": [[16, 16, 16]],
|
|
"bf16_bf16_fp32": [[16, 16, 16]],
|
|
"fp8_fp8_fp32": [[16, 16, 16]],
|
|
"bf8_bf8_fp32": [[16, 16, 16]],
|
|
"fp8_bf8_fp32": [[16, 16, 16]],
|
|
"bf8_fp8_fp32": [[16, 16, 16]],
|
|
"int8_int8_int32": [[16, 16, 16]]
|
|
}
|
|
},
|
|
|
|
"gfx1201": {
|
|
"family": "rdna4",
|
|
"target_family": "gfx12",
|
|
"architecture": "rdna",
|
|
"description": "AMD Radeon RX 9000 series (RDNA4)",
|
|
"warp_size": 32,
|
|
"lds_capacity_kb": 64,
|
|
"warp_configs": [
|
|
[2, 4, 1],
|
|
[1, 8, 1],
|
|
[8, 1, 1],
|
|
[4, 2, 1]
|
|
],
|
|
"warp_tile_combos": {
|
|
"fp16_fp16_fp32": [[16, 16, 16]],
|
|
"bf16_bf16_fp32": [[16, 16, 16]],
|
|
"fp8_fp8_fp32": [[16, 16, 16]],
|
|
"bf8_bf8_fp32": [[16, 16, 16]],
|
|
"fp8_bf8_fp32": [[16, 16, 16]],
|
|
"bf8_fp8_fp32": [[16, 16, 16]],
|
|
"int8_int8_int32": [[16, 16, 16]]
|
|
}
|
|
}
|
|
},
|
|
|
|
"element_sizes": {
|
|
"fp16": 2,
|
|
"bf16": 2,
|
|
"fp32": 4,
|
|
"fp64": 8,
|
|
"fp8": 1,
|
|
"bf8": 1,
|
|
"int8": 1,
|
|
"int4": 0.5,
|
|
"pk_fp4": 0.5,
|
|
"int32": 4
|
|
},
|
|
|
|
"datatype_cpp_map": {
|
|
"_comment": "Maps dtype string to CK Tile C++ type for code generation",
|
|
"fp16": "ck_tile::half_t",
|
|
"bf16": "ck_tile::bf16_t",
|
|
"fp32": "float",
|
|
"fp64": "double",
|
|
"fp8": "ck_tile::fp8_t",
|
|
"bf8": "ck_tile::bf8_t",
|
|
"int8": "ck_tile::int8_t",
|
|
"int4": "ck_tile::pk_int4_t",
|
|
"pk_fp4": "ck_tile::pk_fp4_t",
|
|
"int32": "ck_tile::int32_t"
|
|
},
|
|
|
|
"dtype_combinations": {
|
|
"_comment": "All valid (A, B) -> Acc combinations for GEMM from warp_gemm_dispatcher.hpp",
|
|
"fp32_fp32": {"acc": "fp32", "notes": "Full precision"},
|
|
"fp16_fp16": {"acc": "fp32", "notes": "Standard half precision"},
|
|
"bf16_bf16": {"acc": "fp32", "notes": "Brain float 16"},
|
|
"fp8_fp8": {"acc": "fp32", "notes": "FP8 E4M3"},
|
|
"fp8_bf8": {"acc": "fp32", "notes": "Mixed FP8/BF8"},
|
|
"bf8_fp8": {"acc": "fp32", "notes": "Mixed BF8/FP8"},
|
|
"bf8_bf8": {"acc": "fp32", "notes": "BF8 E5M2"},
|
|
"int8_int8": {"acc": "int32", "notes": "Integer GEMM"},
|
|
"pk_fp4_pk_fp4": {"acc": "fp32", "notes": "Packed 4-bit float"}
|
|
},
|
|
|
|
"layout_cpp_map": {
|
|
"_comment": "Maps layout character to CK Tile C++ type",
|
|
"r": "ck_tile::tensor_layout::gemm::RowMajor",
|
|
"c": "ck_tile::tensor_layout::gemm::ColumnMajor"
|
|
},
|
|
|
|
"pipeline_lds_limits": {
|
|
"_comment": "LDS capacity limits in bytes for different pipeline types",
|
|
"mem": 65536,
|
|
"compv1": 65536,
|
|
"compv2": 65536,
|
|
"compv3": 65536,
|
|
"compv4": 32768,
|
|
"compv5": 65536,
|
|
"preshufflev1": 32768,
|
|
"preshufflev2": 32768,
|
|
"default": 65536
|
|
},
|
|
|
|
"unsupported_trait_combos": {
|
|
"_comment": "Only 'mem' pipeline supports interwave scheduler. All compute pipelines only support intrawave.",
|
|
"combinations": [
|
|
["compv3", "cshuffle", "interwave"],
|
|
["compv3", "default", "interwave"],
|
|
["compv4", "cshuffle", "interwave"],
|
|
["compv4", "default", "interwave"],
|
|
["compv5", "cshuffle", "interwave"],
|
|
["compv5", "default", "interwave"],
|
|
["compv6", "cshuffle", "interwave"],
|
|
["compv6", "default", "interwave"],
|
|
["comp_async", "cshuffle", "interwave"],
|
|
["comp_async", "default", "interwave"]
|
|
]
|
|
},
|
|
|
|
"preshuffle_warp_tile_combos": {
|
|
"_comment": "Preshuffle-specific warp tile combinations (subset of standard GEMM, no [4, 64, 16])",
|
|
"gfx90a": {
|
|
"fp16_fp16_fp32": [[32, 32, 8], [16, 16, 16], [32, 32, 16], [16, 16, 32], [64, 4, 16]],
|
|
"bf16_bf16_fp32": [[32, 32, 8], [16, 16, 16], [32, 32, 16], [16, 16, 32], [64, 4, 16]],
|
|
"fp8_fp8_fp32": [[32, 32, 16], [32, 32, 32]],
|
|
"bf8_bf8_fp32": [[32, 32, 16], [32, 32, 32]]
|
|
},
|
|
"gfx942": {
|
|
"fp16_fp16_fp32": [[32, 32, 8], [16, 16, 16], [32, 32, 16], [16, 16, 32], [64, 4, 16]],
|
|
"bf16_bf16_fp32": [[32, 32, 8], [16, 16, 16], [32, 32, 16], [16, 16, 32], [64, 4, 16]],
|
|
"fp8_fp8_fp32": [[32, 32, 16], [32, 32, 32], [16, 16, 32], [16, 16, 64]],
|
|
"bf8_bf8_fp32": [[32, 32, 16], [32, 32, 32], [16, 16, 64], [16, 16, 32]],
|
|
"int8_int8_int32": [[16, 16, 32], [32, 32, 16]]
|
|
},
|
|
"gfx950": {
|
|
"fp16_fp16_fp32": [[32, 32, 8], [16, 16, 16], [32, 32, 16], [16, 16, 32], [64, 4, 16], [32, 32, 32], [16, 16, 64]],
|
|
"bf16_bf16_fp32": [[32, 32, 8], [16, 16, 16], [32, 32, 16], [16, 16, 32], [64, 4, 16], [32, 32, 32], [16, 16, 64]],
|
|
"fp8_fp8_fp32": [[32, 32, 16], [32, 32, 32], [16, 16, 32], [16, 16, 64], [16, 16, 128], [32, 32, 64]],
|
|
"bf8_bf8_fp32": [[32, 32, 16], [32, 32, 32], [16, 16, 64], [16, 16, 32], [16, 16, 128], [32, 32, 64]]
|
|
}
|
|
},
|
|
|
|
"preshuffle_pipelines": {
|
|
"_comment": "Pipelines supported for preshuffle GEMM variant",
|
|
"supported": ["preshufflev2"]
|
|
}
|
|
}
|