From c8b6b64240e840a7decf76dfaa13c37da5294c4a Mon Sep 17 00:00:00 2001
From: Illia Silin <98187287+illsilin@users.noreply.github.com>
Date: Fri, 16 Aug 2024 15:07:52 -0700
Subject: [PATCH] Re-enable fp8 types for all architectures. (#1470)

* re-enable fp8 and bf8 for all targets

* restore the fp8 gemm instances

* re-enable conv_3d fp8 on all architectures

* diasble several fp8 gemm instances on all architectures except gfx94

* clang format fix
---
 CMakeLists.txt                                |  9 +++-----
 .../gpu/gemm/CMakeLists.txt                   | 23 +++++++++----------
 .../gpu/gemm_universal/CMakeLists.txt         |  5 ++--
 ...gemm_xdl_universal_f8_f8_bf16_mk_kn_mn.hpp |  6 +++--
 .../grouped_conv3d_bwd_data/CMakeLists.txt    |  2 +-
 .../grouped_conv3d_bwd_weight/CMakeLists.txt  |  2 +-
 .../CMakeLists.txt                            |  2 +-
 .../CMakeLists.txt                            |  2 +-
 .../gpu/grouped_conv3d_fwd/CMakeLists.txt     |  8 +++----
 9 files changed, 28 insertions(+), 31 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 2039948a12..8a08ddd19c 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -62,17 +62,14 @@ if (DTYPES)
     endif()
     message("DTYPES macro set to ${DTYPES}")
 else()
-    add_definitions(-DCK_ENABLE_INT8 -DCK_ENABLE_FP16 -DCK_ENABLE_FP32 -DCK_ENABLE_FP64 -DCK_ENABLE_BF16)
+    add_definitions(-DCK_ENABLE_INT8 -DCK_ENABLE_FP16 -DCK_ENABLE_FP32 -DCK_ENABLE_FP64 -DCK_ENABLE_BF16 -DCK_ENABLE_FP8 -DCK_ENABLE_BF8)
     set(CK_ENABLE_INT8 "ON")
     set(CK_ENABLE_FP16 "ON")
     set(CK_ENABLE_FP32 "ON")
     set(CK_ENABLE_FP64 "ON")
     set(CK_ENABLE_BF16 "ON")
-    if (GPU_TARGETS MATCHES "gfx94")
-        add_definitions(-DCK_ENABLE_FP8 -DCK_ENABLE_BF8)
-        set(CK_ENABLE_FP8 "ON")
-        set(CK_ENABLE_BF8 "ON")
-    endif()
+    set(CK_ENABLE_FP8 "ON")
+    set(CK_ENABLE_BF8 "ON")
 endif()
 
 #for f8/bf8_t type
diff --git a/library/src/tensor_operation_instance/gpu/gemm/CMakeLists.txt b/library/src/tensor_operation_instance/gpu/gemm/CMakeLists.txt
index 0cd54c7788..e4efae6173 100644
--- a/library/src/tensor_operation_instance/gpu/gemm/CMakeLists.txt
+++ b/library/src/tensor_operation_instance/gpu/gemm/CMakeLists.txt
@@ -100,18 +100,17 @@ list(APPEND GEMM_INSTANCES
     device_gemm_xdl_c_shuffle_bf16_bf16_bf16_km_kn_mn_instance.cpp
     device_gemm_xdl_c_shuffle_bf16_bf16_bf16_km_nk_mn_instance.cpp)
 
-if(GPU_TARGETS MATCHES "gfx94")
-  list(APPEND GEMM_INSTANCES
-      device_gemm_xdl_c_shuffle_fp8_fp8_fp8_mk_kn_mn_v1_default_instance.cpp
-      device_gemm_xdl_c_shuffle_fp8_fp8_fp8_mk_kn_mn_v1_interwave_default_instance.cpp
-      device_gemm_xdl_c_shuffle_fp8_fp8_fp8_mk_kn_mn_v2_default_instance.cpp
-      device_gemm_xdl_c_shuffle_fp8_fp8_fp8_mk_kn_mn_v1_padded_instance.cpp
-      device_gemm_xdl_c_shuffle_fp8_fp8_fp8_mk_kn_mn_v1_interwave_padded_instance.cpp
-      device_gemm_xdl_c_shuffle_fp8_fp8_fp8_mk_kn_mn_v2_padded_instance.cpp
-      device_gemm_xdl_c_shuffle_fp8_fp8_fp8_mk_nk_mn_instance.cpp
-      device_gemm_xdl_c_shuffle_fp8_fp8_fp8_km_kn_mn_instance.cpp
-      device_gemm_xdl_c_shuffle_fp8_fp8_fp8_km_nk_mn_instance.cpp)
-endif()
+list(APPEND GEMM_INSTANCES
+    device_gemm_xdl_c_shuffle_fp8_fp8_fp8_mk_kn_mn_v1_default_instance.cpp
+    device_gemm_xdl_c_shuffle_fp8_fp8_fp8_mk_kn_mn_v1_interwave_default_instance.cpp
+    device_gemm_xdl_c_shuffle_fp8_fp8_fp8_mk_kn_mn_v2_default_instance.cpp
+    device_gemm_xdl_c_shuffle_fp8_fp8_fp8_mk_kn_mn_v1_padded_instance.cpp
+    device_gemm_xdl_c_shuffle_fp8_fp8_fp8_mk_kn_mn_v1_interwave_padded_instance.cpp
+    device_gemm_xdl_c_shuffle_fp8_fp8_fp8_mk_kn_mn_v2_padded_instance.cpp
+    device_gemm_xdl_c_shuffle_fp8_fp8_fp8_mk_nk_mn_instance.cpp
+    device_gemm_xdl_c_shuffle_fp8_fp8_fp8_km_kn_mn_instance.cpp
+    device_gemm_xdl_c_shuffle_fp8_fp8_fp8_km_nk_mn_instance.cpp)
+
 
 list(APPEND GEMM_INSTANCES
     device_gemm_wmma_f16_f16_f16_mk_kn_mn_instance.cpp
diff --git a/library/src/tensor_operation_instance/gpu/gemm_universal/CMakeLists.txt b/library/src/tensor_operation_instance/gpu/gemm_universal/CMakeLists.txt
index cc4ce76606..5b6e985e59 100644
--- a/library/src/tensor_operation_instance/gpu/gemm_universal/CMakeLists.txt
+++ b/library/src/tensor_operation_instance/gpu/gemm_universal/CMakeLists.txt
@@ -51,8 +51,7 @@ set_source_files_properties(device_gemm_xdl_universal_bf16_bf16_bf16/device_gemm
 set_source_files_properties(device_gemm_xdl_universal_bf16_bf16_bf16/device_gemm_xdl_universal_bf16_bf16_bf16_mk_nk_mn_comp_default_instance.cpp PROPERTIES COMPILE_OPTIONS ";-mllvm;-greedy-reverse-local-assignment=1")
 set_source_files_properties(device_gemm_xdl_universal_bf16_bf16_bf16/device_gemm_xdl_universal_bf16_bf16_bf16_mk_nk_mn_comp_kpadding_instance.cpp PROPERTIES COMPILE_OPTIONS ";-mllvm;-greedy-reverse-local-assignment=1")
 
-if(GPU_TARGETS MATCHES "gfx94")
-  list(APPEND GEMM_UNIVERSAL_INSTANCES        
+list(APPEND GEMM_UNIVERSAL_INSTANCES
           device_gemm_xdl_universal_f16_f8_f16/device_gemm_xdl_universal_f16_f8_f16_mk_kn_mn_comp_default_instance.cpp
           device_gemm_xdl_universal_f16_f8_f16/device_gemm_xdl_universal_f16_f8_f16_mk_kn_mn_comp_kpadding_instance.cpp
           device_gemm_xdl_universal_f16_f8_f16/device_gemm_xdl_universal_f16_f8_f16_mk_kn_mn_comp_mnpadding_instance.cpp
@@ -123,6 +122,6 @@ if(GPU_TARGETS MATCHES "gfx94")
  set_source_files_properties(device_gemm_xdl_universal_f8_f8_bf16/device_gemm_xdl_universal_f8_f8_bf16_mk_kn_mn_comp_nkpadding_instance.cpp PROPERTIES COMPILE_OPTIONS ";-mllvm;-greedy-reverse-local-assignment=1")
  set_source_files_properties(device_gemm_xdl_universal_f8_f8_bf16/device_gemm_xdl_universal_f8_f8_bf16_mk_nk_mn_comp_default_instance.cpp PROPERTIES COMPILE_OPTIONS ";-mllvm;-greedy-reverse-local-assignment=1")
  set_source_files_properties(device_gemm_xdl_universal_f8_f8_bf16/device_gemm_xdl_universal_f8_f8_bf16_mk_nk_mn_comp_kpadding_instance.cpp PROPERTIES COMPILE_OPTIONS ";-mllvm;-greedy-reverse-local-assignment=1")
-endif()
+
 
 add_instance_library(device_gemm_universal_instance ${GEMM_UNIVERSAL_INSTANCES})
diff --git a/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_xdl_universal_f8_f8_bf16/device_gemm_xdl_universal_f8_f8_bf16_mk_kn_mn.hpp b/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_xdl_universal_f8_f8_bf16/device_gemm_xdl_universal_f8_f8_bf16_mk_kn_mn.hpp
index 12994aeecd..3b930e9894 100644
--- a/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_xdl_universal_f8_f8_bf16/device_gemm_xdl_universal_f8_f8_bf16_mk_kn_mn.hpp
+++ b/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_xdl_universal_f8_f8_bf16/device_gemm_xdl_universal_f8_f8_bf16_mk_kn_mn.hpp
@@ -35,12 +35,13 @@ static constexpr auto Interwave = BlockGemmPipelineScheduler::Interwave;
 
 template <GemmSpecialization GemmSpec>
 using device_gemm_xdl_universal_f8_f8_bf16_mk_kn_mn_comp_instances = std::tuple<
-    // clang-format off
+// clang-format off
         //#########################| ALayout| BLayout| CLayout|AData| BData| CData| AccData| Cshuffle|           A|           B|           C|          GEMM| Block|  MPer|  NPer|  KPer| AK1| BK1|MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle|     CBlockTransferClusterLengths|  CBlockTransfer|                         Block-wiseGemm|               Block-wiseGemm|
         //#########################|        |        |        | Type|  Type|  Type|    Type|     Type| Elementwise| Elementwise| Elementwise|Specialization|  Size| Block| Block| Block|    |    | XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MXdlPerWave_MWaveMPerXdl| ScalarPerVector|                               Pipeline|                     Pipeline|
         //#########################|        |        |        |     |      |      |        |         |   Operation|   Operation|   Operation|              |      |      |      |      |    |    |    |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle| _NBlock_NXdlPerWave_NWaveNPerXdl|   _NWaveNPerXdl|                              Scheduler|                     Verision|
         //#########################|        |        |        |     |      |      |        |         |            |            |            |              |      |      |      |      |    |    |    |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                                 |                |                                       |                             |
-        
+#ifdef __gfx94__
+        //Only enable these instances on gfx94x
         // Compute friendly
         DeviceGemm_Xdl_CShuffleV3<  Row,     Row,     Row,     F8,     F8,    BF16,   F32,     BF16,  PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   256,   256,    64,  16,   4,  32,   32,    4,    4,     S<4, 64, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<16, 16, 1>,    S<0, 2, 1>,    S<0, 2, 1>,               1,             16,              4,          0,          1,           1,                   S<1, 32, 1, 8>,               8,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v4, F8>,
         DeviceGemm_Xdl_CShuffleV3<  Row,     Row,     Row,     F8,     F8,    BF16,   F32,     BF16,  PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   128,   128,   128,  16,   4,  32,   32,    2,    2,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<32,  8, 1>,    S<0, 2, 1>,    S<0, 2, 1>,               1,             16,              4,          0,          1,           1,                   S<1, 32, 1, 8>,               8,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v4, F8>,
@@ -55,6 +56,7 @@ using device_gemm_xdl_universal_f8_f8_bf16_mk_kn_mn_comp_instances = std::tuple<
         DeviceGemm_Xdl_CShuffleV3<  Row,     Row,     Row,     F8,     F8,    BF16,   F32,     BF16,  PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   128,   128,   128,  16,   4,  32,   32,    2,    2,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<32,  8, 1>,    S<0, 2, 1>,    S<0, 2, 1>,               1,             16,              4,          0,          1,           1,                   S<1, 32, 1, 8>,               8,  BlockGemmPipelineScheduler::Interwave, BlockGemmPipelineVersion::v1, F8>,
         DeviceGemm_Xdl_CShuffleV3<  Row,     Row,     Row,     F8,     F8,    BF16,   F32,     BF16,  PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   128,    64,   128,  16,   4,  32,   32,    2,    1,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<32,  8, 1>,    S<0, 2, 1>,    S<0, 2, 1>,               1,              8,              4,          0,          1,           1,                   S<1, 32, 1, 8>,               8,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3, F8>,
         DeviceGemm_Xdl_CShuffleV3<  Row,     Row,     Row,     F8,     F8,    BF16,   F32,     BF16,  PassThrough, PassThrough, PassThrough,       GemmSpec,   256,    64,    64,   128,  16,   4,  32,   32,    1,    1,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<32,  8, 1>,    S<0, 2, 1>,    S<0, 2, 1>,               1,              8,              4,          0,          1,           1,                   S<1, 32, 1, 8>,               8,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3, F8>
+#endif
     // clang-format on
     >;
 
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_data/CMakeLists.txt b/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_data/CMakeLists.txt
index 7bb7e71c54..29fa8fa3c5 100644
--- a/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_data/CMakeLists.txt
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_data/CMakeLists.txt
@@ -15,7 +15,7 @@ set(GROUPED_CONV3D_BWD_DATA
    wmma/device_grouped_conv3d_bwd_data_wmma_gndhwc_gkzyxc_gndhwk_i8_1x1s1p0_instance.cpp
    wmma/device_grouped_conv3d_bwd_data_wmma_ndhwgc_gkzyxc_ndhwgk_i8_1x1s1p0_instance.cpp)
 
-if((DTYPES MATCHES "fp8" AND DTYPES MATCHES "bf8" AND DTYPES MATCHES "fp16") OR (NOT DEFINED DTYPES AND GPU_TARGETS MATCHES "gfx94"))
+if((DTYPES MATCHES "fp8" AND DTYPES MATCHES "bf8" AND DTYPES MATCHES "fp16") OR NOT DEFINED DTYPES)
    list(APPEND GROUPED_CONV3D_BWD_DATA
       xdl/device_grouped_conv3d_bwd_data_xdl_ndhwgc_gkzyxc_ndhwgk_input_f16_comp_bf8_f8_instance.cpp)
 endif()
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/CMakeLists.txt b/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/CMakeLists.txt
index 1a9c455220..8e939c15a9 100644
--- a/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/CMakeLists.txt
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/CMakeLists.txt
@@ -30,7 +30,7 @@ list(APPEND GROUPED_CONV3D_BWD_WEIGHT
     wmma/device_grouped_conv3d_bwd_weight_wmma_gndhwc_gkzyxc_gndhwk_i8_instance.cpp
     wmma/device_grouped_conv3d_bwd_weight_wmma_ndhwgc_gkzyxc_ndhwgk_i8_instance.cpp)
 
-if((DTYPES MATCHES "fp8" AND DTYPES MATCHES "bf8" AND DTYPES MATCHES "fp16") OR (NOT DEFINED DTYPES AND GPU_TARGETS MATCHES "gfx94"))
+if((DTYPES MATCHES "fp8" AND DTYPES MATCHES "bf8" AND DTYPES MATCHES "fp16") OR NOT DEFINED DTYPES)
     list(APPEND GROUPED_CONV3D_BWD_WEIGHT
       xdl/device_grouped_conv3d_bwd_weight_xdl_ndhwgc_gkzyxc_ndhwgk_f16_comp_bf8_fp8_instance.cpp)
 endif()
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight_bilinear/CMakeLists.txt b/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight_bilinear/CMakeLists.txt
index 5781f07080..329e8e4c7f 100644
--- a/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight_bilinear/CMakeLists.txt
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight_bilinear/CMakeLists.txt
@@ -4,7 +4,7 @@ set(GROUPED_CONV3D_BWD_WEIGHT_BILINEAR
     xdl/device_grouped_conv3d_bwd_weight_xdl_bilinear_ndhwgc_gkzyxc_ndhwgk_f32_instance.cpp
     xdl/device_grouped_conv3d_bwd_weight_xdl_bilinear_ndhwgc_gkzyxc_ndhwgk_bf16_instance.cpp)
 
-if((DTYPES MATCHES "fp8" AND DTYPES MATCHES "bf8" AND DTYPES MATCHES "fp16") OR (NOT DEFINED DTYPES AND GPU_TARGETS MATCHES "gfx94"))
+if((DTYPES MATCHES "fp8" AND DTYPES MATCHES "bf8" AND DTYPES MATCHES "fp16") OR NOT DEFINED DTYPES)
     list(APPEND GROUPED_CONV3D_BWD_WEIGHT_BILINEAR
       xdl/device_grouped_conv3d_bwd_weight_xdl_bilinear_ndhwgc_gkzyxc_ndhwgk_f16_comp_bf8_fp8_instance.cpp)
 endif()
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight_scale/CMakeLists.txt b/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight_scale/CMakeLists.txt
index be54eb4adf..9a42d1ec3a 100644
--- a/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight_scale/CMakeLists.txt
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight_scale/CMakeLists.txt
@@ -4,7 +4,7 @@ set(GROUPED_CONV3D_BWD_WEIGHT_SCALE
     xdl/device_grouped_conv3d_bwd_weight_xdl_scale_ndhwgc_gkzyxc_ndhwgk_f32_instance.cpp
     xdl/device_grouped_conv3d_bwd_weight_xdl_scale_ndhwgc_gkzyxc_ndhwgk_bf16_instance.cpp)
 
-if((DTYPES MATCHES "fp8" AND DTYPES MATCHES "bf8" AND DTYPES MATCHES "fp16") OR (NOT DEFINED DTYPES AND GPU_TARGETS MATCHES "gfx94"))
+if((DTYPES MATCHES "fp8" AND DTYPES MATCHES "bf8" AND DTYPES MATCHES "fp16") OR NOT DEFINED DTYPES)
     list(APPEND GROUPED_CONV3D_BWD_WEIGHT_SCALE
       xdl/device_grouped_conv3d_bwd_weight_xdl_scale_ndhwgc_gkzyxc_ndhwgk_f16_comp_bf8_fp8_instance.cpp)
 endif()
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/CMakeLists.txt b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/CMakeLists.txt
index 9bb6d807e6..5a346f1e94 100644
--- a/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/CMakeLists.txt
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/CMakeLists.txt
@@ -43,22 +43,22 @@ set(GROUPED_CONV3D_FWD
    wmma/device_grouped_conv3d_fwd_wmma_ndhwgc_gkzyxc_ndhwgk_i8_oddc_instance.cpp
 )
 
-if((DTYPES MATCHES "fp8" AND DTYPES MATCHES "fp16") OR (NOT DEFINED DTYPES AND GPU_TARGETS MATCHES "gfx94"))
+if((DTYPES MATCHES "fp8" AND DTYPES MATCHES "fp16") OR NOT DEFINED DTYPES)
     list(APPEND GROUPED_CONV3D_FWD
       xdl/device_grouped_conv3d_fwd_xdl_ndhwgc_gkzyxc_ndhwgk_f16_comp_fp8_instance.cpp)
 endif()
 
-if((DTYPES MATCHES "fp8") OR (NOT DEFINED DTYPES AND GPU_TARGETS MATCHES "gfx94"))
+if((DTYPES MATCHES "fp8") OR NOT DEFINED DTYPES)
     list(APPEND GROUPED_CONV3D_FWD
       xdl/device_grouped_conv3d_fwd_xdl_ndhwgc_gkzyxc_ndhwgk_fp8_instance.cpp)
 endif()
 
-if((DTYPES MATCHES "bf8") OR (NOT DEFINED DTYPES AND GPU_TARGETS MATCHES "gfx94"))
+if((DTYPES MATCHES "bf8") OR NOT DEFINED DTYPES)
     list(APPEND GROUPED_CONV3D_FWD
       xdl/device_grouped_conv3d_fwd_xdl_ndhwgc_gkzyxc_ndhwgk_bf8_instance.cpp)
 endif()
 
-if((DTYPES MATCHES "fp8" AND DTYPES MATCHES "bf8") OR (NOT DEFINED DTYPES AND GPU_TARGETS MATCHES "gfx94"))
+if((DTYPES MATCHES "fp8" AND DTYPES MATCHES "bf8") OR NOT DEFINED DTYPES)
     list(APPEND GROUPED_CONV3D_FWD
       xdl/device_grouped_conv3d_fwd_xdl_ndhwgc_gkzyxc_ndhwgk_fp8_bf8_instance.cpp)
     list(APPEND GROUPED_CONV3D_FWD