mirror of
https://github.com/ROCm/composable_kernel.git
synced 2026-05-14 10:09:41 +00:00
* WIP POC of dispatcher
* Dispatcher python workflow setup.
* Dispatcher cleanup and updates.
Further dispatcher cleanup and updates.
Build fixes
Improvements and python to CK example
Improvements to readme
* Fixes to python paths
* Cleaning up code
* Improving dispatcher support for different arch
Fixing typos
* Fix formatting errors
* Cleaning up examples
* Improving code generation
* Improving and fixing C++ examples
* Adding conv functionality (fwd,bwd,bwdw) and examples.
* Fixes based on feedback.
* Further fixes based on feedback.
* Adding stress test for autogeneration and autocorrection, and fixing preshuffle bug.
* Another round of improvements based on feedback.
* Trimming out unnecessary code.
* Fixing the multi-D implementation.
* Using GPU verification for GEMMs and fixing the convolution TFLOPS calculation.
* Fix counter usage issue and arch filtering per ops.
* Adding changelog and other fixes.
* Improve examples and resolve critical bugs.
* Reduce build time for python examples.
* Fixing minor bug.
* Fix compilation error.
* Improve installation instructions for dispatcher.
* Add docker based installation instructions for dispatcher.
* Fixing arch-based filtering to match tile engine.
* Remove dead code and fix arch filtering.
* Minor bugfix.
* Updates after rebase.
* Trimming code.
* Fix copyright headers.
* Consolidate examples, cut down code.
* Minor fixes.
* Improving python examples.
* Update readmes.
* Remove conv functionality.
* Cleanup following conv removal.
[ROCm/composable_kernel commit: 9e049a32a1]
271 lines
9.3 KiB
JSON
271 lines
9.3 KiB
JSON
{
|
|
"_comment": "Single source of truth for GPU architecture specifications. Edit this file to add new GPU support.",
|
|
"_version": "1.2.0",
|
|
"_instructions": "See ADDING_NEW_GPU.md for instructions on adding new GPU support.",
|
|
"_supported_arch_note": "CK Tile supports: GFX9 (gfx908, gfx90a, gfx942, gfx950), GFX10.3 (gfx103x), GFX11 (gfx110x, gfx115x), GFX12 (gfx120x)",
|
|
|
|
"architectures": {
|
|
"gfx908": {
|
|
"family": "cdna1",
|
|
"target_family": "gfx9",
|
|
"architecture": "cdna",
|
|
"description": "AMD Instinct MI100",
|
|
"warp_size": 64,
|
|
"lds_capacity_kb": 64,
|
|
"warp_configs": [
|
|
[1, 4, 1],
|
|
[2, 2, 1],
|
|
[4, 1, 1]
|
|
],
|
|
"warp_tile_combos": {
|
|
"fp32_fp32_fp32": [[16, 16, 4], [16, 16, 16]],
|
|
"fp16_fp16_fp32": [[32, 32, 8], [16, 16, 16], [32, 32, 16], [16, 16, 32]],
|
|
"bf16_bf16_fp32": [[32, 32, 8], [16, 16, 16], [32, 32, 16], [16, 16, 32]],
|
|
"int8_int8_int32": [[32, 32, 16], [16, 16, 32]]
|
|
}
|
|
},
|
|
|
|
"gfx90a": {
|
|
"family": "cdna2",
|
|
"target_family": "gfx9",
|
|
"architecture": "cdna",
|
|
"description": "AMD Instinct MI200 series",
|
|
"warp_size": 64,
|
|
"lds_capacity_kb": 64,
|
|
"warp_configs": [
|
|
[1, 4, 1],
|
|
[2, 2, 1],
|
|
[4, 1, 1]
|
|
],
|
|
"warp_tile_combos": {
|
|
"fp32_fp32_fp32": [[16, 16, 4], [16, 16, 16]],
|
|
"fp16_fp16_fp32": [[32, 32, 8], [16, 16, 16], [32, 32, 16], [16, 16, 32], [4, 64, 16], [64, 4, 16]],
|
|
"bf16_bf16_fp32": [[32, 32, 8], [16, 16, 16], [32, 32, 16], [16, 16, 32], [4, 64, 16], [64, 4, 16]],
|
|
"fp8_fp8_fp32": [[32, 32, 16], [32, 32, 32]],
|
|
"bf8_bf8_fp32": [[32, 32, 16], [32, 32, 32]],
|
|
"int8_int8_int32": [[32, 32, 16], [16, 16, 32]]
|
|
}
|
|
},
|
|
|
|
"gfx942": {
|
|
"family": "cdna3",
|
|
"target_family": "gfx9",
|
|
"architecture": "cdna",
|
|
"description": "AMD Instinct MI300 series",
|
|
"warp_size": 64,
|
|
"lds_capacity_kb": 64,
|
|
"warp_configs": [
|
|
[1, 4, 1],
|
|
[2, 2, 1],
|
|
[4, 1, 1]
|
|
],
|
|
"warp_tile_combos": {
|
|
"fp32_fp32_fp32": [[16, 16, 4], [16, 16, 16]],
|
|
"fp16_fp16_fp32": [[32, 32, 8], [16, 16, 16], [32, 32, 16], [16, 16, 32], [4, 64, 16], [64, 4, 16]],
|
|
"bf16_bf16_fp32": [[32, 32, 8], [16, 16, 16], [32, 32, 16], [16, 16, 32], [4, 64, 16], [64, 4, 16]],
|
|
"fp8_fp8_fp32": [[32, 32, 16], [32, 32, 32], [16, 16, 32], [16, 16, 64]],
|
|
"fp8_bf8_fp32": [[32, 32, 16], [16, 16, 32], [32, 32, 32]],
|
|
"bf8_fp8_fp32": [[32, 32, 16]],
|
|
"bf8_bf8_fp32": [[32, 32, 16], [32, 32, 32], [16, 16, 32], [16, 16, 64]],
|
|
"int8_int8_int32": [[32, 32, 16], [16, 16, 32]]
|
|
}
|
|
},
|
|
|
|
"gfx950": {
|
|
"family": "cdna4",
|
|
"target_family": "gfx9",
|
|
"architecture": "cdna",
|
|
"description": "AMD Instinct MI350 series",
|
|
"warp_size": 64,
|
|
"lds_capacity_kb": 160,
|
|
"warp_configs": [
|
|
[1, 4, 1],
|
|
[2, 2, 1],
|
|
[4, 1, 1]
|
|
],
|
|
"warp_tile_combos": {
|
|
"fp32_fp32_fp32": [[16, 16, 4], [16, 16, 16]],
|
|
"fp16_fp16_fp32": [[32, 32, 8], [16, 16, 16], [32, 32, 16], [16, 16, 32], [4, 64, 16], [64, 4, 16]],
|
|
"bf16_bf16_fp32": [[32, 32, 8], [16, 16, 16], [32, 32, 16], [16, 16, 32], [4, 64, 16], [64, 4, 16]],
|
|
"fp8_fp8_fp32": [[32, 32, 16], [32, 32, 32], [16, 16, 32], [16, 16, 64], [16, 16, 128], [32, 32, 64]],
|
|
"fp8_bf8_fp32": [[32, 32, 16], [32, 32, 32], [16, 16, 32], [16, 16, 128], [32, 32, 64]],
|
|
"bf8_fp8_fp32": [[32, 32, 16], [16, 16, 128], [32, 32, 64]],
|
|
"bf8_bf8_fp32": [[32, 32, 16], [32, 32, 32], [16, 16, 32], [16, 16, 64], [16, 16, 128], [32, 32, 64]],
|
|
"int8_int8_int32": [[32, 32, 16], [16, 16, 32]],
|
|
"pk_fp4_pk_fp4_fp32": [[16, 16, 128]]
|
|
}
|
|
},
|
|
|
|
"gfx1100": {
|
|
"family": "rdna3",
|
|
"target_family": "gfx11",
|
|
"architecture": "rdna",
|
|
"description": "AMD Radeon RX 7900 series (RDNA3)",
|
|
"warp_size": 32,
|
|
"lds_capacity_kb": 64,
|
|
"warp_configs": [
|
|
[2, 4, 1],
|
|
[1, 8, 1],
|
|
[8, 1, 1],
|
|
[4, 2, 1]
|
|
],
|
|
"warp_tile_combos": {
|
|
"fp16_fp16_fp32": [[16, 16, 16]],
|
|
"bf16_bf16_fp32": [[16, 16, 16]],
|
|
"int8_int8_int32": [[16, 16, 16]]
|
|
}
|
|
},
|
|
|
|
"gfx1200": {
|
|
"family": "rdna4",
|
|
"target_family": "gfx12",
|
|
"architecture": "rdna",
|
|
"description": "AMD Radeon RX 9000 series (RDNA4)",
|
|
"warp_size": 32,
|
|
"lds_capacity_kb": 64,
|
|
"warp_configs": [
|
|
[2, 4, 1],
|
|
[1, 8, 1],
|
|
[8, 1, 1],
|
|
[4, 2, 1]
|
|
],
|
|
"warp_tile_combos": {
|
|
"fp16_fp16_fp32": [[16, 16, 16]],
|
|
"bf16_bf16_fp32": [[16, 16, 16]],
|
|
"fp8_fp8_fp32": [[16, 16, 16]],
|
|
"bf8_bf8_fp32": [[16, 16, 16]],
|
|
"fp8_bf8_fp32": [[16, 16, 16]],
|
|
"bf8_fp8_fp32": [[16, 16, 16]],
|
|
"int8_int8_int32": [[16, 16, 16]]
|
|
}
|
|
},
|
|
|
|
"gfx1201": {
|
|
"family": "rdna4",
|
|
"target_family": "gfx12",
|
|
"architecture": "rdna",
|
|
"description": "AMD Radeon RX 9000 series (RDNA4)",
|
|
"warp_size": 32,
|
|
"lds_capacity_kb": 64,
|
|
"warp_configs": [
|
|
[2, 4, 1],
|
|
[1, 8, 1],
|
|
[8, 1, 1],
|
|
[4, 2, 1]
|
|
],
|
|
"warp_tile_combos": {
|
|
"fp16_fp16_fp32": [[16, 16, 16]],
|
|
"bf16_bf16_fp32": [[16, 16, 16]],
|
|
"fp8_fp8_fp32": [[16, 16, 16]],
|
|
"bf8_bf8_fp32": [[16, 16, 16]],
|
|
"fp8_bf8_fp32": [[16, 16, 16]],
|
|
"bf8_fp8_fp32": [[16, 16, 16]],
|
|
"int8_int8_int32": [[16, 16, 16]]
|
|
}
|
|
}
|
|
},
|
|
|
|
"element_sizes": {
|
|
"fp16": 2,
|
|
"bf16": 2,
|
|
"fp32": 4,
|
|
"fp64": 8,
|
|
"fp8": 1,
|
|
"bf8": 1,
|
|
"int8": 1,
|
|
"int4": 0.5,
|
|
"pk_fp4": 0.5,
|
|
"int32": 4
|
|
},
|
|
|
|
"datatype_cpp_map": {
|
|
"_comment": "Maps dtype string to CK Tile C++ type for code generation",
|
|
"fp16": "ck_tile::half_t",
|
|
"bf16": "ck_tile::bf16_t",
|
|
"fp32": "float",
|
|
"fp64": "double",
|
|
"fp8": "ck_tile::fp8_t",
|
|
"bf8": "ck_tile::bf8_t",
|
|
"int8": "ck_tile::int8_t",
|
|
"int4": "ck_tile::pk_int4_t",
|
|
"pk_fp4": "ck_tile::pk_fp4_t",
|
|
"int32": "ck_tile::int32_t"
|
|
},
|
|
|
|
"dtype_combinations": {
|
|
"_comment": "All valid (A, B) -> Acc combinations for GEMM from warp_gemm_dispatcher.hpp",
|
|
"fp32_fp32": {"acc": "fp32", "notes": "Full precision"},
|
|
"fp16_fp16": {"acc": "fp32", "notes": "Standard half precision"},
|
|
"bf16_bf16": {"acc": "fp32", "notes": "Brain float 16"},
|
|
"fp8_fp8": {"acc": "fp32", "notes": "FP8 E4M3"},
|
|
"fp8_bf8": {"acc": "fp32", "notes": "Mixed FP8/BF8"},
|
|
"bf8_fp8": {"acc": "fp32", "notes": "Mixed BF8/FP8"},
|
|
"bf8_bf8": {"acc": "fp32", "notes": "BF8 E5M2"},
|
|
"int8_int8": {"acc": "int32", "notes": "Integer GEMM"},
|
|
"pk_fp4_pk_fp4": {"acc": "fp32", "notes": "Packed 4-bit float"}
|
|
},
|
|
|
|
"layout_cpp_map": {
|
|
"_comment": "Maps layout character to CK Tile C++ type",
|
|
"r": "ck_tile::tensor_layout::gemm::RowMajor",
|
|
"c": "ck_tile::tensor_layout::gemm::ColumnMajor"
|
|
},
|
|
|
|
"pipeline_lds_limits": {
|
|
"_comment": "LDS capacity limits in bytes for different pipeline types",
|
|
"mem": 65536,
|
|
"compv1": 65536,
|
|
"compv2": 65536,
|
|
"compv3": 65536,
|
|
"compv4": 32768,
|
|
"compv5": 65536,
|
|
"preshufflev1": 32768,
|
|
"preshufflev2": 32768,
|
|
"default": 65536
|
|
},
|
|
|
|
"unsupported_trait_combos": {
|
|
"_comment": "Only 'mem' pipeline supports interwave scheduler. All compute pipelines only support intrawave.",
|
|
"combinations": [
|
|
["compv3", "cshuffle", "interwave"],
|
|
["compv3", "default", "interwave"],
|
|
["compv4", "cshuffle", "interwave"],
|
|
["compv4", "default", "interwave"],
|
|
["compv5", "cshuffle", "interwave"],
|
|
["compv5", "default", "interwave"],
|
|
["compv6", "cshuffle", "interwave"],
|
|
["compv6", "default", "interwave"],
|
|
["comp_async", "cshuffle", "interwave"],
|
|
["comp_async", "default", "interwave"]
|
|
]
|
|
},
|
|
|
|
"preshuffle_warp_tile_combos": {
|
|
"_comment": "Preshuffle-specific warp tile combinations (subset of standard GEMM, no [4, 64, 16])",
|
|
"gfx90a": {
|
|
"fp16_fp16_fp32": [[32, 32, 8], [16, 16, 16], [32, 32, 16], [16, 16, 32], [64, 4, 16]],
|
|
"bf16_bf16_fp32": [[32, 32, 8], [16, 16, 16], [32, 32, 16], [16, 16, 32], [64, 4, 16]],
|
|
"fp8_fp8_fp32": [[32, 32, 16], [32, 32, 32]],
|
|
"bf8_bf8_fp32": [[32, 32, 16], [32, 32, 32]]
|
|
},
|
|
"gfx942": {
|
|
"fp16_fp16_fp32": [[32, 32, 8], [16, 16, 16], [32, 32, 16], [16, 16, 32], [64, 4, 16]],
|
|
"bf16_bf16_fp32": [[32, 32, 8], [16, 16, 16], [32, 32, 16], [16, 16, 32], [64, 4, 16]],
|
|
"fp8_fp8_fp32": [[32, 32, 16], [32, 32, 32], [16, 16, 32], [16, 16, 64]],
|
|
"bf8_bf8_fp32": [[32, 32, 16], [32, 32, 32], [16, 16, 64], [16, 16, 32]],
|
|
"int8_int8_int32": [[16, 16, 32], [32, 32, 16]]
|
|
},
|
|
"gfx950": {
|
|
"fp16_fp16_fp32": [[32, 32, 8], [16, 16, 16], [32, 32, 16], [16, 16, 32], [64, 4, 16]],
|
|
"bf16_bf16_fp32": [[32, 32, 8], [16, 16, 16], [32, 32, 16], [16, 16, 32], [64, 4, 16]],
|
|
"fp8_fp8_fp32": [[32, 32, 16], [32, 32, 32], [16, 16, 32], [16, 16, 64], [16, 16, 128], [32, 32, 64]],
|
|
"bf8_bf8_fp32": [[32, 32, 16], [32, 32, 32], [16, 16, 64], [16, 16, 32], [16, 16, 128], [32, 32, 64]]
|
|
}
|
|
},
|
|
|
|
"preshuffle_pipelines": {
|
|
"_comment": "Pipelines supported for preshuffle GEMM variant",
|
|
"supported": ["preshufflev2"]
|
|
}
|
|
}
|