mirror of
https://github.com/ROCm/composable_kernel.git
synced 2026-03-24 00:57:39 +00:00
* WIP POC of dispatcher * Dispatcher python workflow setup. * Dispatcher cleanup and updates. Further dispatcher cleanup and updates. Build fixes Improvements and python to CK example Improvements to readme * Fixes to python paths * Cleaning up code * Improving dispatcher support for different arch Fixing typos * Fix formatting errors * Cleaning up examples * Improving code generation * Improving and fixing C++ examples * Adding conv functionality (fwd,bwd,bwdw) and examples. * Fixes based on feedback. * Further fixes based on feedback. * Adding stress test for autogeneration and autocorrection, and fixing preshuffle bug. * Another round of improvements based on feedback. * Trimming out unnecessary code. * Fixing the multi-D implementation. * Using gpu verification for gemms and fixing convolutions tflops calculation. * Fix counter usage issue and arch filtering per ops. * Adding changelog and other fixes. * Improve examples and resolve critical bugs. * Reduce build time for python examples. * Fixing minor bug. * Fix compilation error. * Improve installation instructions for dispatcher. * Add docker based installation instructions for dispatcher. * Fixing arch-based filtering to match tile engine. * Remove dead code and fix arch filtering. * Minor bugfix. * Updates after rebase. * Trimming code. * Fix copyright headers. * Consolidate examples, cut down code. * Minor fixes. * Improving python examples. * Update readmes. * Remove conv functionality. * Cleanup following conv removal.
271 lines
9.3 KiB
JSON
271 lines
9.3 KiB
JSON
{
|
|
"_comment": "Single source of truth for GPU architecture specifications. Edit this file to add new GPU support.",
|
|
"_version": "1.2.0",
|
|
"_instructions": "See ADDING_NEW_GPU.md for instructions on adding new GPU support.",
|
|
"_supported_arch_note": "CK Tile supports: GFX9 (gfx908, gfx90a, gfx942, gfx950), GFX10.3 (gfx103x), GFX11 (gfx110x, gfx115x), GFX12 (gfx120x)",
|
|
|
|
"architectures": {
|
|
"gfx908": {
|
|
"family": "cdna1",
|
|
"target_family": "gfx9",
|
|
"architecture": "cdna",
|
|
"description": "AMD Instinct MI100",
|
|
"warp_size": 64,
|
|
"lds_capacity_kb": 64,
|
|
"warp_configs": [
|
|
[1, 4, 1],
|
|
[2, 2, 1],
|
|
[4, 1, 1]
|
|
],
|
|
"warp_tile_combos": {
|
|
"fp32_fp32_fp32": [[16, 16, 4], [16, 16, 16]],
|
|
"fp16_fp16_fp32": [[32, 32, 8], [16, 16, 16], [32, 32, 16], [16, 16, 32]],
|
|
"bf16_bf16_fp32": [[32, 32, 8], [16, 16, 16], [32, 32, 16], [16, 16, 32]],
|
|
"int8_int8_int32": [[32, 32, 16], [16, 16, 32]]
|
|
}
|
|
},
|
|
|
|
"gfx90a": {
|
|
"family": "cdna2",
|
|
"target_family": "gfx9",
|
|
"architecture": "cdna",
|
|
"description": "AMD Instinct MI200 series",
|
|
"warp_size": 64,
|
|
"lds_capacity_kb": 64,
|
|
"warp_configs": [
|
|
[1, 4, 1],
|
|
[2, 2, 1],
|
|
[4, 1, 1]
|
|
],
|
|
"warp_tile_combos": {
|
|
"fp32_fp32_fp32": [[16, 16, 4], [16, 16, 16]],
|
|
"fp16_fp16_fp32": [[32, 32, 8], [16, 16, 16], [32, 32, 16], [16, 16, 32], [4, 64, 16], [64, 4, 16]],
|
|
"bf16_bf16_fp32": [[32, 32, 8], [16, 16, 16], [32, 32, 16], [16, 16, 32], [4, 64, 16], [64, 4, 16]],
|
|
"fp8_fp8_fp32": [[32, 32, 16], [32, 32, 32]],
|
|
"bf8_bf8_fp32": [[32, 32, 16], [32, 32, 32]],
|
|
"int8_int8_int32": [[32, 32, 16], [16, 16, 32]]
|
|
}
|
|
},
|
|
|
|
"gfx942": {
|
|
"family": "cdna3",
|
|
"target_family": "gfx9",
|
|
"architecture": "cdna",
|
|
"description": "AMD Instinct MI300 series",
|
|
"warp_size": 64,
|
|
"lds_capacity_kb": 64,
|
|
"warp_configs": [
|
|
[1, 4, 1],
|
|
[2, 2, 1],
|
|
[4, 1, 1]
|
|
],
|
|
"warp_tile_combos": {
|
|
"fp32_fp32_fp32": [[16, 16, 4], [16, 16, 16]],
|
|
"fp16_fp16_fp32": [[32, 32, 8], [16, 16, 16], [32, 32, 16], [16, 16, 32], [4, 64, 16], [64, 4, 16]],
|
|
"bf16_bf16_fp32": [[32, 32, 8], [16, 16, 16], [32, 32, 16], [16, 16, 32], [4, 64, 16], [64, 4, 16]],
|
|
"fp8_fp8_fp32": [[32, 32, 16], [32, 32, 32], [16, 16, 32], [16, 16, 64]],
|
|
"fp8_bf8_fp32": [[32, 32, 16], [16, 16, 32], [32, 32, 32]],
|
|
"bf8_fp8_fp32": [[32, 32, 16]],
|
|
"bf8_bf8_fp32": [[32, 32, 16], [32, 32, 32], [16, 16, 32], [16, 16, 64]],
|
|
"int8_int8_int32": [[32, 32, 16], [16, 16, 32]]
|
|
}
|
|
},
|
|
|
|
"gfx950": {
|
|
"family": "cdna4",
|
|
"target_family": "gfx9",
|
|
"architecture": "cdna",
|
|
"description": "AMD Instinct MI350 series",
|
|
"warp_size": 64,
|
|
"lds_capacity_kb": 160,
|
|
"warp_configs": [
|
|
[1, 4, 1],
|
|
[2, 2, 1],
|
|
[4, 1, 1]
|
|
],
|
|
"warp_tile_combos": {
|
|
"fp32_fp32_fp32": [[16, 16, 4], [16, 16, 16]],
|
|
"fp16_fp16_fp32": [[32, 32, 8], [16, 16, 16], [32, 32, 16], [16, 16, 32], [4, 64, 16], [64, 4, 16]],
|
|
"bf16_bf16_fp32": [[32, 32, 8], [16, 16, 16], [32, 32, 16], [16, 16, 32], [4, 64, 16], [64, 4, 16]],
|
|
"fp8_fp8_fp32": [[32, 32, 16], [32, 32, 32], [16, 16, 32], [16, 16, 64], [16, 16, 128], [32, 32, 64]],
|
|
"fp8_bf8_fp32": [[32, 32, 16], [32, 32, 32], [16, 16, 32], [16, 16, 128], [32, 32, 64]],
|
|
"bf8_fp8_fp32": [[32, 32, 16], [16, 16, 128], [32, 32, 64]],
|
|
"bf8_bf8_fp32": [[32, 32, 16], [32, 32, 32], [16, 16, 32], [16, 16, 64], [16, 16, 128], [32, 32, 64]],
|
|
"int8_int8_int32": [[32, 32, 16], [16, 16, 32]],
|
|
"pk_fp4_pk_fp4_fp32": [[16, 16, 128]]
|
|
}
|
|
},
|
|
|
|
"gfx1100": {
|
|
"family": "rdna3",
|
|
"target_family": "gfx11",
|
|
"architecture": "rdna",
|
|
"description": "AMD Radeon RX 7900 series (RDNA3)",
|
|
"warp_size": 32,
|
|
"lds_capacity_kb": 64,
|
|
"warp_configs": [
|
|
[2, 4, 1],
|
|
[1, 8, 1],
|
|
[8, 1, 1],
|
|
[4, 2, 1]
|
|
],
|
|
"warp_tile_combos": {
|
|
"fp16_fp16_fp32": [[16, 16, 16]],
|
|
"bf16_bf16_fp32": [[16, 16, 16]],
|
|
"int8_int8_int32": [[16, 16, 16]]
|
|
}
|
|
},
|
|
|
|
"gfx1200": {
|
|
"family": "rdna4",
|
|
"target_family": "gfx12",
|
|
"architecture": "rdna",
|
|
"description": "AMD Radeon RX 9000 series (RDNA4)",
|
|
"warp_size": 32,
|
|
"lds_capacity_kb": 64,
|
|
"warp_configs": [
|
|
[2, 4, 1],
|
|
[1, 8, 1],
|
|
[8, 1, 1],
|
|
[4, 2, 1]
|
|
],
|
|
"warp_tile_combos": {
|
|
"fp16_fp16_fp32": [[16, 16, 16]],
|
|
"bf16_bf16_fp32": [[16, 16, 16]],
|
|
"fp8_fp8_fp32": [[16, 16, 16]],
|
|
"bf8_bf8_fp32": [[16, 16, 16]],
|
|
"fp8_bf8_fp32": [[16, 16, 16]],
|
|
"bf8_fp8_fp32": [[16, 16, 16]],
|
|
"int8_int8_int32": [[16, 16, 16]]
|
|
}
|
|
},
|
|
|
|
"gfx1201": {
|
|
"family": "rdna4",
|
|
"target_family": "gfx12",
|
|
"architecture": "rdna",
|
|
"description": "AMD Radeon RX 9000 series (RDNA4)",
|
|
"warp_size": 32,
|
|
"lds_capacity_kb": 64,
|
|
"warp_configs": [
|
|
[2, 4, 1],
|
|
[1, 8, 1],
|
|
[8, 1, 1],
|
|
[4, 2, 1]
|
|
],
|
|
"warp_tile_combos": {
|
|
"fp16_fp16_fp32": [[16, 16, 16]],
|
|
"bf16_bf16_fp32": [[16, 16, 16]],
|
|
"fp8_fp8_fp32": [[16, 16, 16]],
|
|
"bf8_bf8_fp32": [[16, 16, 16]],
|
|
"fp8_bf8_fp32": [[16, 16, 16]],
|
|
"bf8_fp8_fp32": [[16, 16, 16]],
|
|
"int8_int8_int32": [[16, 16, 16]]
|
|
}
|
|
}
|
|
},
|
|
|
|
"element_sizes": {
|
|
"fp16": 2,
|
|
"bf16": 2,
|
|
"fp32": 4,
|
|
"fp64": 8,
|
|
"fp8": 1,
|
|
"bf8": 1,
|
|
"int8": 1,
|
|
"int4": 0.5,
|
|
"pk_fp4": 0.5,
|
|
"int32": 4
|
|
},
|
|
|
|
"datatype_cpp_map": {
|
|
"_comment": "Maps dtype string to CK Tile C++ type for code generation",
|
|
"fp16": "ck_tile::half_t",
|
|
"bf16": "ck_tile::bf16_t",
|
|
"fp32": "float",
|
|
"fp64": "double",
|
|
"fp8": "ck_tile::fp8_t",
|
|
"bf8": "ck_tile::bf8_t",
|
|
"int8": "ck_tile::int8_t",
|
|
"int4": "ck_tile::pk_int4_t",
|
|
"pk_fp4": "ck_tile::pk_fp4_t",
|
|
"int32": "ck_tile::int32_t"
|
|
},
|
|
|
|
"dtype_combinations": {
|
|
"_comment": "All valid (A, B) -> Acc combinations for GEMM from warp_gemm_dispatcher.hpp",
|
|
"fp32_fp32": {"acc": "fp32", "notes": "Full precision"},
|
|
"fp16_fp16": {"acc": "fp32", "notes": "Standard half precision"},
|
|
"bf16_bf16": {"acc": "fp32", "notes": "Brain float 16"},
|
|
"fp8_fp8": {"acc": "fp32", "notes": "FP8 E4M3"},
|
|
"fp8_bf8": {"acc": "fp32", "notes": "Mixed FP8/BF8"},
|
|
"bf8_fp8": {"acc": "fp32", "notes": "Mixed BF8/FP8"},
|
|
"bf8_bf8": {"acc": "fp32", "notes": "BF8 E5M2"},
|
|
"int8_int8": {"acc": "int32", "notes": "Integer GEMM"},
|
|
"pk_fp4_pk_fp4": {"acc": "fp32", "notes": "Packed 4-bit float"}
|
|
},
|
|
|
|
"layout_cpp_map": {
|
|
"_comment": "Maps layout character to CK Tile C++ type",
|
|
"r": "ck_tile::tensor_layout::gemm::RowMajor",
|
|
"c": "ck_tile::tensor_layout::gemm::ColumnMajor"
|
|
},
|
|
|
|
"pipeline_lds_limits": {
|
|
"_comment": "LDS capacity limits in bytes for different pipeline types",
|
|
"mem": 65536,
|
|
"compv1": 65536,
|
|
"compv2": 65536,
|
|
"compv3": 65536,
|
|
"compv4": 32768,
|
|
"compv5": 65536,
|
|
"preshufflev1": 32768,
|
|
"preshufflev2": 32768,
|
|
"default": 65536
|
|
},
|
|
|
|
"unsupported_trait_combos": {
|
|
"_comment": "Only the 'mem' pipeline supports the interwave scheduler. All compute pipelines support only intrawave.",
|
|
"combinations": [
|
|
["compv3", "cshuffle", "interwave"],
|
|
["compv3", "default", "interwave"],
|
|
["compv4", "cshuffle", "interwave"],
|
|
["compv4", "default", "interwave"],
|
|
["compv5", "cshuffle", "interwave"],
|
|
["compv5", "default", "interwave"],
|
|
["compv6", "cshuffle", "interwave"],
|
|
["compv6", "default", "interwave"],
|
|
["comp_async", "cshuffle", "interwave"],
|
|
["comp_async", "default", "interwave"]
|
|
]
|
|
},
|
|
|
|
"preshuffle_warp_tile_combos": {
|
|
"_comment": "Preshuffle-specific warp tile combinations (subset of standard GEMM, no [4, 64, 16])",
|
|
"gfx90a": {
|
|
"fp16_fp16_fp32": [[32, 32, 8], [16, 16, 16], [32, 32, 16], [16, 16, 32], [64, 4, 16]],
|
|
"bf16_bf16_fp32": [[32, 32, 8], [16, 16, 16], [32, 32, 16], [16, 16, 32], [64, 4, 16]],
|
|
"fp8_fp8_fp32": [[32, 32, 16], [32, 32, 32]],
|
|
"bf8_bf8_fp32": [[32, 32, 16], [32, 32, 32]]
|
|
},
|
|
"gfx942": {
|
|
"fp16_fp16_fp32": [[32, 32, 8], [16, 16, 16], [32, 32, 16], [16, 16, 32], [64, 4, 16]],
|
|
"bf16_bf16_fp32": [[32, 32, 8], [16, 16, 16], [32, 32, 16], [16, 16, 32], [64, 4, 16]],
|
|
"fp8_fp8_fp32": [[32, 32, 16], [32, 32, 32], [16, 16, 32], [16, 16, 64]],
|
|
"bf8_bf8_fp32": [[32, 32, 16], [32, 32, 32], [16, 16, 64], [16, 16, 32]],
|
|
"int8_int8_int32": [[16, 16, 32], [32, 32, 16]]
|
|
},
|
|
"gfx950": {
|
|
"fp16_fp16_fp32": [[32, 32, 8], [16, 16, 16], [32, 32, 16], [16, 16, 32], [64, 4, 16]],
|
|
"bf16_bf16_fp32": [[32, 32, 8], [16, 16, 16], [32, 32, 16], [16, 16, 32], [64, 4, 16]],
|
|
"fp8_fp8_fp32": [[32, 32, 16], [32, 32, 32], [16, 16, 32], [16, 16, 64], [16, 16, 128], [32, 32, 64]],
|
|
"bf8_bf8_fp32": [[32, 32, 16], [32, 32, 32], [16, 16, 64], [16, 16, 32], [16, 16, 128], [32, 32, 64]]
|
|
}
|
|
},
|
|
|
|
"preshuffle_pipelines": {
|
|
"_comment": "Pipelines supported for preshuffle GEMM variant",
|
|
"supported": ["preshufflev2"]
|
|
}
|
|
}
|