Remove "basic" and universal GEMM tests, and incorporate their test cases into the GEMM pipeline tests (#3094)

* Add missing copyright statements

* Use ck_tile::host_tensor_descriptor instead of a custom lambda

* Refactor use of check_data_type in test classes

* Use TEST_SUITE_NAME with TYPED_TEST_SUITE

* Remove an unused namespace

* Make dim3 const

* Add BF8 x BF8 tests for CompV3 in test_gemm_pipeline_kernel_types.hpp

* Add F8 x BF8 tests for CompV3 in test_gemm_pipeline_kernel_types.hpp

* Add BF16 x I4 tests for CompV3 in test_gemm_pipeline_kernel_types.hpp

* Add BF16 x BF16 tests for CompV3 in test_gemm_pipeline_kernel_types.hpp

* Add BF8 x I4 tests for CompV3 in test_gemm_pipeline_kernel_types.hpp

* Add F8 x I4 tests for CompV3 in test_gemm_pipeline_kernel_types.hpp

* Add F16 x I4 tests for CompV3 in test_gemm_pipeline_kernel_types.hpp

* Skip failing tests of F16 x I4 for CompV3 with K == 2 * K_Tile

* Add missing precision type combinations to CompV4 from CompV3

* Move the INT8 tests around for consistency with KernelTypesCompV3Wmma

* Add missing precision type combinations to CompV3Wmma from CompV3

* Remove the basic and universal tests and their dependencies

* On __gfx950__, avoid using transposed loading of A with datatype pk_int4_t of B

* Use ADataType and BDataType instead of ComputeDataType for WarpGemm

* Explicitly set some return types to void

* Use more general typenames in InterleavedPKTypeLoader

* Add load_interleaved_pk_type.hpp to common.hpp

* Use std::is_same_v in load_int4_tile

* Add handling of LoadTranspose to load_int4_tile

* Factor out common code in several places using load_int4_tile

* Add support for pk_int4_t using load_int4_tile

* Fix formatting
This commit is contained in:
SamiAario-AMD
2025-11-13 21:01:27 +02:00
committed by GitHub
parent 7d57bc169f
commit f2cfc6b94e
38 changed files with 352 additions and 1888 deletions

View File

@@ -13,49 +13,6 @@ list(APPEND EXAMPLE_GEMM_COMPILE_COMPUTE_V4_OPTIONS
)
set(EXAMPLE_GEMM_COMPILE_COMPUTE_ASYNC_OPTIONS ${EXAMPLE_GEMM_COMPILE_COMPUTE_V4_OPTIONS})
if(GPU_TARGETS MATCHES "gfx94|gfx95|gfx11|gfx12")
add_gtest_executable(test_ck_tile_gemm_pipeline_universal_int8 test_gemm_pipeline_universal_int8.cpp)
target_compile_options(test_ck_tile_gemm_pipeline_universal_int8 PRIVATE ${EXAMPLE_GEMM_COMPILE_OPTIONS})
add_gtest_executable(test_ck_tile_gemm_pipeline_universal_pk_int4 test_gemm_pipeline_universal_pk_int4.cpp)
target_compile_options(test_ck_tile_gemm_pipeline_universal_pk_int4 PRIVATE ${EXAMPLE_GEMM_COMPILE_OPTIONS})
else()
message(DEBUG "Skipping ck_tile_gemm tests for current target")
endif()
if(GPU_TARGETS MATCHES "gfx94|gfx95|gfx12")
add_gtest_executable(test_gemm_pipeline_compiler test_gemm_pipeline_compiler.cpp)
target_compile_options(test_gemm_pipeline_compiler PRIVATE ${EXAMPLE_GEMM_COMPILE_OPTIONS})
endif()
if(GPU_TARGETS MATCHES "gfx94|gfx95|gfx12")
add_gtest_executable(test_ck_tile_gemm_pipeline_universal_fp8 test_gemm_pipeline_universal_fp8.cpp)
add_gtest_executable(test_ck_tile_gemm_pipeline_universal_bf8 test_gemm_pipeline_universal_bf8.cpp)
add_gtest_executable(test_ck_tile_gemm_pipeline_basic_fp8 test_gemm_pipeline_basic_fp8.cpp)
add_gtest_executable(test_ck_tile_gemm_pipeline_basic_bf8 test_gemm_pipeline_basic_bf8.cpp)
target_compile_options(test_ck_tile_gemm_pipeline_universal_fp8 PRIVATE ${EXAMPLE_GEMM_COMPILE_OPTIONS})
target_compile_options(test_ck_tile_gemm_pipeline_universal_bf8 PRIVATE ${EXAMPLE_GEMM_COMPILE_OPTIONS})
target_compile_options(test_ck_tile_gemm_pipeline_basic_fp8 PRIVATE ${EXAMPLE_GEMM_COMPILE_OPTIONS})
target_compile_options(test_ck_tile_gemm_pipeline_basic_bf8 PRIVATE ${EXAMPLE_GEMM_COMPILE_OPTIONS})
else()
message(DEBUG "Skipping ck_tile_gemm tests for current target")
endif()
if(GPU_TARGETS MATCHES "gfx94|gfx95|gfx90a|gfx11|gfx12")
add_gtest_executable(test_ck_tile_gemm_pipeline_universal_fp16 test_gemm_pipeline_universal_fp16.cpp)
target_compile_options(test_ck_tile_gemm_pipeline_universal_fp16 PRIVATE ${EXAMPLE_GEMM_COMPILE_OPTIONS})
target_compile_options(test_ck_tile_gemm_pipeline_universal_fp16 PRIVATE --save-temps -Wno-gnu-line-marker)
add_gtest_executable(test_ck_tile_gemm_pipeline_universal_bf16 test_gemm_pipeline_universal_bf16.cpp)
target_compile_options(test_ck_tile_gemm_pipeline_universal_bf16 PRIVATE ${EXAMPLE_GEMM_COMPILE_OPTIONS})
add_gtest_executable(test_ck_tile_gemm_pipeline_basic_fp16 test_gemm_pipeline_basic_fp16.cpp)
target_compile_options(test_ck_tile_gemm_pipeline_basic_fp16 PRIVATE ${EXAMPLE_GEMM_COMPILE_OPTIONS})
add_gtest_executable(test_ck_tile_gemm_pipeline_basic_bf16 test_gemm_pipeline_basic_bf16.cpp)
target_compile_options(test_ck_tile_gemm_pipeline_basic_bf16 PRIVATE ${EXAMPLE_GEMM_COMPILE_OPTIONS})
else()
message(DEBUG "Skipping ck_tile_gemm tests for current target ")
endif()
if(GPU_TARGETS MATCHES "gfx94|gfx95|gfx90a|gfx11|gfx12")
if(GPU_TARGETS MATCHES "gfx94|gfx95")
add_gtest_executable(test_ck_tile_gemm_pipeline_mem test_gemm_pipeline_mem.cpp)
@@ -77,7 +34,16 @@ if(GPU_TARGETS MATCHES "gfx94|gfx95|gfx90a|gfx11|gfx12")
endif()
if(GPU_TARGETS MATCHES "gfx11|gfx12")
# On Radeon devices, build the WMMA version instead
# On Radeon devices, build the WMMA version instead
# Define architecture macros for compile-time detection
if(GPU_TARGETS MATCHES "gfx12")
list(APPEND EXAMPLE_GEMM_COMPILE_OPTIONS -DARCH_GFX12)
list(APPEND EXAMPLE_GEMM_COMPILE_COMPUTE_V4_OPTIONS -DARCH_GFX12)
elseif(GPU_TARGETS MATCHES "gfx11")
list(APPEND EXAMPLE_GEMM_COMPILE_OPTIONS -DARCH_GFX11)
list(APPEND EXAMPLE_GEMM_COMPILE_COMPUTE_V4_OPTIONS -DARCH_GFX11)
endif()
add_gtest_executable(test_ck_tile_gemm_pipeline_mem_wmma test_gemm_pipeline_mem_wmma.cpp)
add_gtest_executable(test_ck_tile_gemm_pipeline_compv3_wmma test_gemm_pipeline_compv3_wmma.cpp)
add_gtest_executable(test_ck_tile_gemm_pipeline_compv4_wmma test_gemm_pipeline_compv4_wmma.cpp)