Files
composable_kernel/dispatcher/codegen/arch_specs.json
Vidyasagar Ananthan 9e049a32a1 Adding dispatcher architecture (#3300)
* WIP POC of dispatcher

* Dispatcher python workflow setup.

* Dispatcher cleanup and updates.

Further dispatcher cleanup and updates.

Build fixes

Improvements and python to CK example

Improvements to readme

* Fixes to python paths

* Cleaning up code

* Improving dispatcher support for different arch

Fixing typos

* Fix formatting errors

* Cleaning up examples

* Improving code generation

* Improving and fixing C++ examples

* Adding conv functionality (fwd,bwd,bwdw) and examples.

* Fixes based on feedback.

* Further fixes based on feedback.

* Adding stress test for autogeneration and autocorrection, and fixing preshuffle bug.

* Another round of improvements based on feedback.

* Trimming out unnecessary code.

* Fixing the multi-D implementation.

* Using gpu verification for gemms and fixing convolutions tflops calculation.

* Fix counter usage issue and arch filtering per ops.

* Adding changelog and other fixes.

* Improve examples and resolve critical bugs.

* Reduce build time for python examples.

* Fixing minor bug.

* Fix compilation error.

* Improve installation instructions for dispatcher.

* Add Docker-based installation instructions for dispatcher.

* Fixing arch-based filtering to match tile engine.

* Remove dead code and fix arch filtering.

* Minor bugfix.

* Updates after rebase.

* Trimming code.

* Fix copyright headers.

* Consolidate examples, cut down code.

* Minor fixes.

* Improving python examples.

* Update readmes.

* Remove conv functionality.

* Cleanup following conv removal.
2026-01-22 09:34:33 -08:00

271 lines
9.3 KiB
JSON

{
"_comment": "Single source of truth for GPU architecture specifications. Edit this file to add new GPU support.",
"_version": "1.2.0",
"_instructions": "See ADDING_NEW_GPU.md for instructions on adding new GPU support.",
"_supported_arch_note": "CK Tile supports: GFX9 (gfx908, gfx90a, gfx942, gfx950), GFX10.3 (gfx103x), GFX11 (gfx110x, gfx115x), GFX12 (gfx120x)",
"architectures": {
"gfx908": {
"family": "cdna1",
"target_family": "gfx9",
"architecture": "cdna",
"description": "AMD Instinct MI100",
"warp_size": 64,
"lds_capacity_kb": 64,
"warp_configs": [
[1, 4, 1],
[2, 2, 1],
[4, 1, 1]
],
"warp_tile_combos": {
"fp32_fp32_fp32": [[16, 16, 4], [16, 16, 16]],
"fp16_fp16_fp32": [[32, 32, 8], [16, 16, 16], [32, 32, 16], [16, 16, 32]],
"bf16_bf16_fp32": [[32, 32, 8], [16, 16, 16], [32, 32, 16], [16, 16, 32]],
"int8_int8_int32": [[32, 32, 16], [16, 16, 32]]
}
},
"gfx90a": {
"family": "cdna2",
"target_family": "gfx9",
"architecture": "cdna",
"description": "AMD Instinct MI200 series",
"warp_size": 64,
"lds_capacity_kb": 64,
"warp_configs": [
[1, 4, 1],
[2, 2, 1],
[4, 1, 1]
],
"warp_tile_combos": {
"fp32_fp32_fp32": [[16, 16, 4], [16, 16, 16]],
"fp16_fp16_fp32": [[32, 32, 8], [16, 16, 16], [32, 32, 16], [16, 16, 32], [4, 64, 16], [64, 4, 16]],
"bf16_bf16_fp32": [[32, 32, 8], [16, 16, 16], [32, 32, 16], [16, 16, 32], [4, 64, 16], [64, 4, 16]],
"fp8_fp8_fp32": [[32, 32, 16], [32, 32, 32]],
"bf8_bf8_fp32": [[32, 32, 16], [32, 32, 32]],
"int8_int8_int32": [[32, 32, 16], [16, 16, 32]]
}
},
"gfx942": {
"family": "cdna3",
"target_family": "gfx9",
"architecture": "cdna",
"description": "AMD Instinct MI300 series",
"warp_size": 64,
"lds_capacity_kb": 64,
"warp_configs": [
[1, 4, 1],
[2, 2, 1],
[4, 1, 1]
],
"warp_tile_combos": {
"fp32_fp32_fp32": [[16, 16, 4], [16, 16, 16]],
"fp16_fp16_fp32": [[32, 32, 8], [16, 16, 16], [32, 32, 16], [16, 16, 32], [4, 64, 16], [64, 4, 16]],
"bf16_bf16_fp32": [[32, 32, 8], [16, 16, 16], [32, 32, 16], [16, 16, 32], [4, 64, 16], [64, 4, 16]],
"fp8_fp8_fp32": [[32, 32, 16], [32, 32, 32], [16, 16, 32], [16, 16, 64]],
"fp8_bf8_fp32": [[32, 32, 16], [16, 16, 32], [32, 32, 32]],
"bf8_fp8_fp32": [[32, 32, 16]],
"bf8_bf8_fp32": [[32, 32, 16], [32, 32, 32], [16, 16, 32], [16, 16, 64]],
"int8_int8_int32": [[32, 32, 16], [16, 16, 32]]
}
},
"gfx950": {
"family": "cdna4",
"target_family": "gfx9",
"architecture": "cdna",
"description": "AMD Instinct MI350 series",
"warp_size": 64,
"lds_capacity_kb": 160,
"warp_configs": [
[1, 4, 1],
[2, 2, 1],
[4, 1, 1]
],
"warp_tile_combos": {
"fp32_fp32_fp32": [[16, 16, 4], [16, 16, 16]],
"fp16_fp16_fp32": [[32, 32, 8], [16, 16, 16], [32, 32, 16], [16, 16, 32], [4, 64, 16], [64, 4, 16]],
"bf16_bf16_fp32": [[32, 32, 8], [16, 16, 16], [32, 32, 16], [16, 16, 32], [4, 64, 16], [64, 4, 16]],
"fp8_fp8_fp32": [[32, 32, 16], [32, 32, 32], [16, 16, 32], [16, 16, 64], [16, 16, 128], [32, 32, 64]],
"fp8_bf8_fp32": [[32, 32, 16], [32, 32, 32], [16, 16, 32], [16, 16, 128], [32, 32, 64]],
"bf8_fp8_fp32": [[32, 32, 16], [16, 16, 128], [32, 32, 64]],
"bf8_bf8_fp32": [[32, 32, 16], [32, 32, 32], [16, 16, 32], [16, 16, 64], [16, 16, 128], [32, 32, 64]],
"int8_int8_int32": [[32, 32, 16], [16, 16, 32]],
"pk_fp4_pk_fp4_fp32": [[16, 16, 128]]
}
},
"gfx1100": {
"family": "rdna3",
"target_family": "gfx11",
"architecture": "rdna",
"description": "AMD Radeon RX 7900 series (RDNA3)",
"warp_size": 32,
"lds_capacity_kb": 64,
"warp_configs": [
[2, 4, 1],
[1, 8, 1],
[8, 1, 1],
[4, 2, 1]
],
"warp_tile_combos": {
"fp16_fp16_fp32": [[16, 16, 16]],
"bf16_bf16_fp32": [[16, 16, 16]],
"int8_int8_int32": [[16, 16, 16]]
}
},
"gfx1200": {
"family": "rdna4",
"target_family": "gfx12",
"architecture": "rdna",
"description": "AMD Radeon RX 9000 series (RDNA4)",
"warp_size": 32,
"lds_capacity_kb": 64,
"warp_configs": [
[2, 4, 1],
[1, 8, 1],
[8, 1, 1],
[4, 2, 1]
],
"warp_tile_combos": {
"fp16_fp16_fp32": [[16, 16, 16]],
"bf16_bf16_fp32": [[16, 16, 16]],
"fp8_fp8_fp32": [[16, 16, 16]],
"bf8_bf8_fp32": [[16, 16, 16]],
"fp8_bf8_fp32": [[16, 16, 16]],
"bf8_fp8_fp32": [[16, 16, 16]],
"int8_int8_int32": [[16, 16, 16]]
}
},
"gfx1201": {
"family": "rdna4",
"target_family": "gfx12",
"architecture": "rdna",
"description": "AMD Radeon RX 9000 series (RDNA4)",
"warp_size": 32,
"lds_capacity_kb": 64,
"warp_configs": [
[2, 4, 1],
[1, 8, 1],
[8, 1, 1],
[4, 2, 1]
],
"warp_tile_combos": {
"fp16_fp16_fp32": [[16, 16, 16]],
"bf16_bf16_fp32": [[16, 16, 16]],
"fp8_fp8_fp32": [[16, 16, 16]],
"bf8_bf8_fp32": [[16, 16, 16]],
"fp8_bf8_fp32": [[16, 16, 16]],
"bf8_fp8_fp32": [[16, 16, 16]],
"int8_int8_int32": [[16, 16, 16]]
}
}
},
"element_sizes": {
"fp16": 2,
"bf16": 2,
"fp32": 4,
"fp64": 8,
"fp8": 1,
"bf8": 1,
"int8": 1,
"int4": 0.5,
"pk_fp4": 0.5,
"int32": 4
},
"datatype_cpp_map": {
"_comment": "Maps dtype string to CK Tile C++ type for code generation",
"fp16": "ck_tile::half_t",
"bf16": "ck_tile::bf16_t",
"fp32": "float",
"fp64": "double",
"fp8": "ck_tile::fp8_t",
"bf8": "ck_tile::bf8_t",
"int8": "ck_tile::int8_t",
"int4": "ck_tile::pk_int4_t",
"pk_fp4": "ck_tile::pk_fp4_t",
"int32": "ck_tile::int32_t"
},
"dtype_combinations": {
"_comment": "All valid (A, B) -> Acc combinations for GEMM from warp_gemm_dispatcher.hpp",
"fp32_fp32": {"acc": "fp32", "notes": "Full precision"},
"fp16_fp16": {"acc": "fp32", "notes": "Standard half precision"},
"bf16_bf16": {"acc": "fp32", "notes": "Brain float 16"},
"fp8_fp8": {"acc": "fp32", "notes": "FP8 E4M3"},
"fp8_bf8": {"acc": "fp32", "notes": "Mixed FP8/BF8"},
"bf8_fp8": {"acc": "fp32", "notes": "Mixed BF8/FP8"},
"bf8_bf8": {"acc": "fp32", "notes": "BF8 E5M2"},
"int8_int8": {"acc": "int32", "notes": "Integer GEMM"},
"pk_fp4_pk_fp4": {"acc": "fp32", "notes": "Packed 4-bit float"}
},
"layout_cpp_map": {
"_comment": "Maps layout character to CK Tile C++ type",
"r": "ck_tile::tensor_layout::gemm::RowMajor",
"c": "ck_tile::tensor_layout::gemm::ColumnMajor"
},
"pipeline_lds_limits": {
"_comment": "LDS capacity limits in bytes for different pipeline types",
"mem": 65536,
"compv1": 65536,
"compv2": 65536,
"compv3": 65536,
"compv4": 32768,
"compv5": 65536,
"preshufflev1": 32768,
"preshufflev2": 32768,
"default": 65536
},
"unsupported_trait_combos": {
"_comment": "Only the 'mem' pipeline supports the interwave scheduler. All compute pipelines support only the intrawave scheduler.",
"combinations": [
["compv3", "cshuffle", "interwave"],
["compv3", "default", "interwave"],
["compv4", "cshuffle", "interwave"],
["compv4", "default", "interwave"],
["compv5", "cshuffle", "interwave"],
["compv5", "default", "interwave"],
["compv6", "cshuffle", "interwave"],
["compv6", "default", "interwave"],
["comp_async", "cshuffle", "interwave"],
["comp_async", "default", "interwave"]
]
},
"preshuffle_warp_tile_combos": {
"_comment": "Preshuffle-specific warp tile combinations (subset of standard GEMM, no [4, 64, 16])",
"gfx90a": {
"fp16_fp16_fp32": [[32, 32, 8], [16, 16, 16], [32, 32, 16], [16, 16, 32], [64, 4, 16]],
"bf16_bf16_fp32": [[32, 32, 8], [16, 16, 16], [32, 32, 16], [16, 16, 32], [64, 4, 16]],
"fp8_fp8_fp32": [[32, 32, 16], [32, 32, 32]],
"bf8_bf8_fp32": [[32, 32, 16], [32, 32, 32]]
},
"gfx942": {
"fp16_fp16_fp32": [[32, 32, 8], [16, 16, 16], [32, 32, 16], [16, 16, 32], [64, 4, 16]],
"bf16_bf16_fp32": [[32, 32, 8], [16, 16, 16], [32, 32, 16], [16, 16, 32], [64, 4, 16]],
"fp8_fp8_fp32": [[32, 32, 16], [32, 32, 32], [16, 16, 32], [16, 16, 64]],
"bf8_bf8_fp32": [[32, 32, 16], [32, 32, 32], [16, 16, 64], [16, 16, 32]],
"int8_int8_int32": [[16, 16, 32], [32, 32, 16]]
},
"gfx950": {
"fp16_fp16_fp32": [[32, 32, 8], [16, 16, 16], [32, 32, 16], [16, 16, 32], [64, 4, 16]],
"bf16_bf16_fp32": [[32, 32, 8], [16, 16, 16], [32, 32, 16], [16, 16, 32], [64, 4, 16]],
"fp8_fp8_fp32": [[32, 32, 16], [32, 32, 32], [16, 16, 32], [16, 16, 64], [16, 16, 128], [32, 32, 64]],
"bf8_bf8_fp32": [[32, 32, 16], [32, 32, 32], [16, 16, 64], [16, 16, 32], [16, 16, 128], [32, 32, 64]]
}
},
"preshuffle_pipelines": {
"_comment": "Pipelines supported for preshuffle GEMM variant",
"supported": ["preshufflev2"]
}
}