From 5366d3415bdffaa3a2e67f61e67dbf85501e9fcd Mon Sep 17 00:00:00 2001 From: aska-0096 Date: Tue, 22 Apr 2025 10:59:03 +0000 Subject: [PATCH 1/2] f8 mfma issue --- example/01_gemm/CMakeLists.txt | 6 ++++++ example/01_gemm/gemm_xdl_fp8_v3.cpp | 4 ++-- .../gpu/block/blockwise_gemm_pipeline_xdlops_v3.hpp | 2 +- include/ck/tensor_operation/gpu/warp/xdlops_gemm.hpp | 8 ++++++++ include/ck/utility/dtype_vector.hpp | 12 ++++++------ 5 files changed, 23 insertions(+), 9 deletions(-) diff --git a/example/01_gemm/CMakeLists.txt b/example/01_gemm/CMakeLists.txt index 96678d275a..02482d5b39 100755 --- a/example/01_gemm/CMakeLists.txt +++ b/example/01_gemm/CMakeLists.txt @@ -38,6 +38,12 @@ add_example_dependencies(example_gemm_xdl example_gemm_xdl_fp16_fp8_streamk_v3) add_example_executable(example_gemm_xdl_bf16_v3 gemm_xdl_bf16_v3.cpp) add_example_dependencies(example_gemm_xdl example_gemm_xdl_bf16_v3) +set(GEMM_OPTIONS) +# list(APPEND GEMM_OPTIONS -mllvm -greedy-reverse-local-assignment=1) +list(APPEND GEMM_OPTIONS -v --save-temps -Wno-gnu-line-marker) +target_compile_options(example_gemm_xdl_bf16_v3 PRIVATE ${GEMM_OPTIONS}) +target_compile_options(example_gemm_xdl_fp8_v3 PRIVATE ${GEMM_OPTIONS}) + list(APPEND gpu_list gfx942 gfx950) set(target 0) diff --git a/example/01_gemm/gemm_xdl_fp8_v3.cpp b/example/01_gemm/gemm_xdl_fp8_v3.cpp index da891267b2..0270ffe591 100644 --- a/example/01_gemm/gemm_xdl_fp8_v3.cpp +++ b/example/01_gemm/gemm_xdl_fp8_v3.cpp @@ -28,10 +28,10 @@ using DeviceGemmV2Instance = ADataType, BDataType, CDataType, AccDataType, CShuffleDataType, PassThrough, PassThrough, PassThrough, GemmDefault, 256, - 224, 256, + 128, 128, 128, 16, 16, 16, 16, - 7, 8, + 4, 4, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, diff --git a/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_v3.hpp b/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_v3.hpp index 171a232c0f..fe66d320e0 100644 --- a/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_v3.hpp +++ b/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_v3.hpp @@ -178,7 +178,7 @@ struct BlockwiseGemmXdlops_pipeline_v3 constexpr auto GetMfma() { +#if defined(__gfx950__) + return MfmaInstr::mfma_f32_32x32x64f8f6f4; +#else return MfmaInstr::mfma_f32_32x32x16f8f8; +#endif } template <> @@ -1132,7 +1136,11 @@ struct MfmaSelector template <> constexpr auto GetMfma() { +#if defined(__gfx950__) + return MfmaInstr::mfma_f32_16x16x128f8f6f4; +#else return MfmaInstr::mfma_f32_16x16x32f8f8; +#endif } template <> diff --git a/include/ck/utility/dtype_vector.hpp b/include/ck/utility/dtype_vector.hpp index 9c40d923d3..466116242f 100644 --- a/include/ck/utility/dtype_vector.hpp +++ b/include/ck/utility/dtype_vector.hpp @@ -954,11 +954,11 @@ struct vector_type()>> StaticallyIndexedArray d32x4_; StaticallyIndexedArray d64x2_; StaticallyIndexedArray d128x1_; - } data_; + } data_ = {d128_t{0}}; - __host__ __device__ constexpr vector_type() : data_{type{0}} {} + __attribute__((host)) __attribute__((device)) constexpr vector_type() {} - __host__ __device__ constexpr vector_type(type v) : data_{v} {} + __attribute__((host)) __attribute__((device)) constexpr vector_type(type v) { (void)v; } template __host__ __device__ constexpr const auto& AsType() const @@ -1082,11 +1082,11 @@ struct vector_type()>> StaticallyIndexedArray d64x4_; StaticallyIndexedArray d128x2_; StaticallyIndexedArray d256x1_; - } data_; + } data_ = {d256_t{0}}; - __host__ __device__ constexpr vector_type() : data_{type{0}} {} + __attribute__((host)) __attribute__((device)) constexpr vector_type() {} - __host__ __device__ constexpr vector_type(type v) : data_{v} {} + __attribute__((host)) __attribute__((device)) constexpr vector_type(type v) { (void)v; } template __host__ __device__ constexpr const auto& AsType() const From 25bb0d2fee306a6192d68ab3d3477affce125427 Mon Sep 17 00:00:00 2001 From: aska-0096 Date: Wed, 23 Apr 2025 02:08:44 +0000 Subject: [PATCH 2/2] add flags to avoid vectorizer problem --- example/01_gemm/CMakeLists.txt | 2 +- example/01_gemm/gemm_xdl_fp8_v3.cpp | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/example/01_gemm/CMakeLists.txt b/example/01_gemm/CMakeLists.txt index 02482d5b39..39212d2904 100755 --- a/example/01_gemm/CMakeLists.txt +++ b/example/01_gemm/CMakeLists.txt @@ -39,7 +39,7 @@ add_example_dependencies(example_gemm_xdl example_gemm_xdl_fp16_fp8_streamk_v3) add_example_executable(example_gemm_xdl_bf16_v3 gemm_xdl_bf16_v3.cpp) add_example_dependencies(example_gemm_xdl example_gemm_xdl_bf16_v3) set(GEMM_OPTIONS) -# list(APPEND GEMM_OPTIONS -mllvm -greedy-reverse-local-assignment=1) +list(APPEND GEMM_OPTIONS "SHELL: -mllvm -greedy-reverse-local-assignment=1 -mllvm --slp-threshold=-16") list(APPEND GEMM_OPTIONS -v --save-temps -Wno-gnu-line-marker) target_compile_options(example_gemm_xdl_bf16_v3 PRIVATE ${GEMM_OPTIONS}) target_compile_options(example_gemm_xdl_fp8_v3 PRIVATE ${GEMM_OPTIONS}) diff --git a/example/01_gemm/gemm_xdl_fp8_v3.cpp b/example/01_gemm/gemm_xdl_fp8_v3.cpp index 0270ffe591..55a6c60273 100644 --- a/example/01_gemm/gemm_xdl_fp8_v3.cpp +++ b/example/01_gemm/gemm_xdl_fp8_v3.cpp @@ -28,10 +28,10 @@ using DeviceGemmV2Instance = ADataType, BDataType, CDataType, AccDataType, CShuffleDataType, PassThrough, PassThrough, PassThrough, GemmDefault, 256, - 128, 128, + 256, 256, 128, 16, 16, 16, 16, - 4, 4, + 8, 8, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>,