{ "_comment": "Single source of truth for GPU architecture specifications. Edit this file to add new GPU support.", "_version": "1.2.0", "_instructions": "See ADDING_NEW_GPU.md for instructions on adding new GPU support.", "_supported_arch_note": "CK Tile supports: GFX9 (gfx908, gfx90a, gfx942, gfx950), GFX10.3 (gfx103x), GFX11 (gfx110x, gfx115x), GFX12 (gfx120x)", "architectures": { "gfx908": { "family": "cdna1", "target_family": "gfx9", "architecture": "cdna", "description": "AMD Instinct MI100", "warp_size": 64, "lds_capacity_kb": 64, "warp_configs": [ [1, 4, 1], [2, 2, 1], [4, 1, 1] ], "warp_tile_combos": { "fp32_fp32_fp32": [[16, 16, 4], [16, 16, 16]], "fp16_fp16_fp32": [[32, 32, 8], [16, 16, 16], [32, 32, 16], [16, 16, 32]], "bf16_bf16_fp32": [[32, 32, 8], [16, 16, 16], [32, 32, 16], [16, 16, 32]], "int8_int8_int32": [[32, 32, 16], [16, 16, 32]] } }, "gfx90a": { "family": "cdna2", "target_family": "gfx9", "architecture": "cdna", "description": "AMD Instinct MI200 series", "warp_size": 64, "lds_capacity_kb": 64, "warp_configs": [ [1, 4, 1], [2, 2, 1], [4, 1, 1] ], "warp_tile_combos": { "fp32_fp32_fp32": [[16, 16, 4], [16, 16, 16]], "fp16_fp16_fp32": [[32, 32, 8], [16, 16, 16], [32, 32, 16], [16, 16, 32], [4, 64, 16], [64, 4, 16]], "bf16_bf16_fp32": [[32, 32, 8], [16, 16, 16], [32, 32, 16], [16, 16, 32], [4, 64, 16], [64, 4, 16]], "fp8_fp8_fp32": [[32, 32, 16], [32, 32, 32]], "bf8_bf8_fp32": [[32, 32, 16], [32, 32, 32]], "int8_int8_int32": [[32, 32, 16], [16, 16, 32]] } }, "gfx942": { "family": "cdna3", "target_family": "gfx9", "architecture": "cdna", "description": "AMD Instinct MI300 series", "warp_size": 64, "lds_capacity_kb": 64, "warp_configs": [ [1, 4, 1], [2, 2, 1], [4, 1, 1] ], "warp_tile_combos": { "fp32_fp32_fp32": [[16, 16, 4], [16, 16, 16]], "fp16_fp16_fp32": [[32, 32, 8], [16, 16, 16], [32, 32, 16], [16, 16, 32], [4, 64, 16], [64, 4, 16]], "bf16_bf16_fp32": [[32, 32, 8], [16, 16, 16], [32, 32, 16], [16, 16, 32], [4, 64, 16], [64, 4, 16]], 
"fp8_fp8_fp32": [[32, 32, 16], [32, 32, 32], [16, 16, 32], [16, 16, 64]], "fp8_bf8_fp32": [[32, 32, 16], [16, 16, 32], [32, 32, 32]], "bf8_fp8_fp32": [[32, 32, 16]], "bf8_bf8_fp32": [[32, 32, 16], [32, 32, 32], [16, 16, 32], [16, 16, 64]], "int8_int8_int32": [[32, 32, 16], [16, 16, 32]] } }, "gfx950": { "family": "cdna4", "target_family": "gfx9", "architecture": "cdna", "description": "AMD Instinct MI350 series", "warp_size": 64, "lds_capacity_kb": 160, "warp_configs": [ [1, 4, 1], [2, 2, 1], [4, 1, 1] ], "warp_tile_combos": { "fp32_fp32_fp32": [[16, 16, 4], [16, 16, 16]], "fp16_fp16_fp32": [[32, 32, 8], [16, 16, 16], [32, 32, 16], [16, 16, 32], [4, 64, 16], [64, 4, 16]], "bf16_bf16_fp32": [[32, 32, 8], [16, 16, 16], [32, 32, 16], [16, 16, 32], [4, 64, 16], [64, 4, 16]], "fp8_fp8_fp32": [[32, 32, 16], [32, 32, 32], [16, 16, 32], [16, 16, 64], [16, 16, 128], [32, 32, 64]], "fp8_bf8_fp32": [[32, 32, 16], [32, 32, 32], [16, 16, 32], [16, 16, 128], [32, 32, 64]], "bf8_fp8_fp32": [[32, 32, 16], [16, 16, 128], [32, 32, 64]], "bf8_bf8_fp32": [[32, 32, 16], [32, 32, 32], [16, 16, 32], [16, 16, 64], [16, 16, 128], [32, 32, 64]], "int8_int8_int32": [[32, 32, 16], [16, 16, 32]], "pk_fp4_pk_fp4_fp32": [[16, 16, 128]] } }, "gfx1100": { "family": "rdna3", "target_family": "gfx11", "architecture": "rdna", "description": "AMD Radeon RX 7900 series (RDNA3)", "warp_size": 32, "lds_capacity_kb": 64, "warp_configs": [ [2, 4, 1], [1, 8, 1], [8, 1, 1], [4, 2, 1] ], "warp_tile_combos": { "fp16_fp16_fp32": [[16, 16, 16]], "bf16_bf16_fp32": [[16, 16, 16]], "int8_int8_int32": [[16, 16, 16]] } }, "gfx1200": { "family": "rdna4", "target_family": "gfx12", "architecture": "rdna", "description": "AMD Radeon RX 9000 series (RDNA4)", "warp_size": 32, "lds_capacity_kb": 64, "warp_configs": [ [2, 4, 1], [1, 8, 1], [8, 1, 1], [4, 2, 1] ], "warp_tile_combos": { "fp16_fp16_fp32": [[16, 16, 16]], "bf16_bf16_fp32": [[16, 16, 16]], "fp8_fp8_fp32": [[16, 16, 16]], "bf8_bf8_fp32": [[16, 16, 16]], 
"fp8_bf8_fp32": [[16, 16, 16]], "bf8_fp8_fp32": [[16, 16, 16]], "int8_int8_int32": [[16, 16, 16]] } }, "gfx1201": { "family": "rdna4", "target_family": "gfx12", "architecture": "rdna", "description": "AMD Radeon RX 9000 series (RDNA4)", "warp_size": 32, "lds_capacity_kb": 64, "warp_configs": [ [2, 4, 1], [1, 8, 1], [8, 1, 1], [4, 2, 1] ], "warp_tile_combos": { "fp16_fp16_fp32": [[16, 16, 16]], "bf16_bf16_fp32": [[16, 16, 16]], "fp8_fp8_fp32": [[16, 16, 16]], "bf8_bf8_fp32": [[16, 16, 16]], "fp8_bf8_fp32": [[16, 16, 16]], "bf8_fp8_fp32": [[16, 16, 16]], "int8_int8_int32": [[16, 16, 16]] } } }, "element_sizes": { "fp16": 2, "bf16": 2, "fp32": 4, "fp64": 8, "fp8": 1, "bf8": 1, "int8": 1, "int4": 0.5, "pk_fp4": 0.5, "int32": 4 }, "datatype_cpp_map": { "_comment": "Maps dtype string to CK Tile C++ type for code generation", "fp16": "ck_tile::half_t", "bf16": "ck_tile::bf16_t", "fp32": "float", "fp64": "double", "fp8": "ck_tile::fp8_t", "bf8": "ck_tile::bf8_t", "int8": "ck_tile::int8_t", "int4": "ck_tile::pk_int4_t", "pk_fp4": "ck_tile::pk_fp4_t", "int32": "ck_tile::int32_t" }, "dtype_combinations": { "_comment": "All valid (A, B) -> Acc combinations for GEMM from warp_gemm_dispatcher.hpp", "fp32_fp32": {"acc": "fp32", "notes": "Full precision"}, "fp16_fp16": {"acc": "fp32", "notes": "Standard half precision"}, "bf16_bf16": {"acc": "fp32", "notes": "Brain float 16"}, "fp8_fp8": {"acc": "fp32", "notes": "FP8 E4M3"}, "fp8_bf8": {"acc": "fp32", "notes": "Mixed FP8/BF8"}, "bf8_fp8": {"acc": "fp32", "notes": "Mixed BF8/FP8"}, "bf8_bf8": {"acc": "fp32", "notes": "BF8 E5M2"}, "int8_int8": {"acc": "int32", "notes": "Integer GEMM"}, "pk_fp4_pk_fp4": {"acc": "fp32", "notes": "Packed 4-bit float"} }, "layout_cpp_map": { "_comment": "Maps layout character to CK Tile C++ type", "r": "ck_tile::tensor_layout::gemm::RowMajor", "c": "ck_tile::tensor_layout::gemm::ColumnMajor" }, "pipeline_lds_limits": { "_comment": "LDS capacity limits in bytes for different pipeline types", "mem": 
65536, "compv1": 65536, "compv2": 65536, "compv3": 65536, "compv4": 32768, "compv5": 65536, "preshufflev1": 32768, "preshufflev2": 32768, "default": 65536 }, "unsupported_trait_combos": { "_comment": "Only the 'mem' pipeline supports the interwave scheduler. All compute pipelines support only the intrawave scheduler.", "combinations": [ ["compv3", "cshuffle", "interwave"], ["compv3", "default", "interwave"], ["compv4", "cshuffle", "interwave"], ["compv4", "default", "interwave"], ["compv5", "cshuffle", "interwave"], ["compv5", "default", "interwave"], ["compv6", "cshuffle", "interwave"], ["compv6", "default", "interwave"], ["comp_async", "cshuffle", "interwave"], ["comp_async", "default", "interwave"] ] }, "preshuffle_warp_tile_combos": { "_comment": "Preshuffle-specific warp tile combinations (subset of standard GEMM, no [4, 64, 16])", "gfx90a": { "fp16_fp16_fp32": [[32, 32, 8], [16, 16, 16], [32, 32, 16], [16, 16, 32], [64, 4, 16]], "bf16_bf16_fp32": [[32, 32, 8], [16, 16, 16], [32, 32, 16], [16, 16, 32], [64, 4, 16]], "fp8_fp8_fp32": [[32, 32, 16], [32, 32, 32]], "bf8_bf8_fp32": [[32, 32, 16], [32, 32, 32]] }, "gfx942": { "fp16_fp16_fp32": [[32, 32, 8], [16, 16, 16], [32, 32, 16], [16, 16, 32], [64, 4, 16]], "bf16_bf16_fp32": [[32, 32, 8], [16, 16, 16], [32, 32, 16], [16, 16, 32], [64, 4, 16]], "fp8_fp8_fp32": [[32, 32, 16], [32, 32, 32], [16, 16, 32], [16, 16, 64]], "bf8_bf8_fp32": [[32, 32, 16], [32, 32, 32], [16, 16, 64], [16, 16, 32]], "int8_int8_int32": [[16, 16, 32], [32, 32, 16]] }, "gfx950": { "fp16_fp16_fp32": [[32, 32, 8], [16, 16, 16], [32, 32, 16], [16, 16, 32], [64, 4, 16]], "bf16_bf16_fp32": [[32, 32, 8], [16, 16, 16], [32, 32, 16], [16, 16, 32], [64, 4, 16]], "fp8_fp8_fp32": [[32, 32, 16], [32, 32, 32], [16, 16, 32], [16, 16, 64], [16, 16, 128], [32, 32, 64]], "bf8_bf8_fp32": [[32, 32, 16], [32, 32, 32], [16, 16, 64], [16, 16, 32], [16, 16, 128], [32, 32, 64]] } }, "preshuffle_pipelines": { "_comment": "Pipelines supported for the preshuffle GEMM variant", "supported": 
["preshufflev2"] } }