diff --git a/library/src/tensor_operation_instance/gpu/gemm/CMakeLists.txt b/library/src/tensor_operation_instance/gpu/gemm/CMakeLists.txt
index b6f63f39d7..266a5d4b50 100644
--- a/library/src/tensor_operation_instance/gpu/gemm/CMakeLists.txt
+++ b/library/src/tensor_operation_instance/gpu/gemm/CMakeLists.txt
@@ -95,36 +95,39 @@
 endif()
 
 add_instance_library(device_gemm_instance ${GEMM_INSTANCES})
-set(ENABLE_PIPELINE_V2_OPT OFF)
-if (ENABLE_PIPELINE_V2_OPT)
-set(MAX_ILP_OPTS
-    -mllvm
-    -amdgpu-enable-max-ilp-scheduling-strategy
-)
-set(WAVES_PER_EU_DEFS
-    CK_USE_WAVES_PER_EU=1
-    CK_MIN_WAVES_PER_EU=1
-    CK_MAX_WAVES_PER_EU=1
-)
-set(IGLP_OPT_DEFS
-    CK_EXPERIMENTAL_PIPELINE_V2_IGLP_OPT=1
-)
+if(DTYPES MATCHES "fp16" OR NOT DEFINED DTYPES)
+    set(ENABLE_PIPELINE_V2_OPT OFF)
 
-# layout=NT
-set_source_files_properties(device_gemm_xdl_f16_f16_f16/km_kn_mn_default_pipeline_v2_opt_instance.cpp PROPERTIES
-    COMPILE_OPTIONS ";;"
-    COMPILE_DEFINITIONS "${WAVES_PER_EU_DEFS};${IGLP_OPT_DEFS}")
-# layout=NN
-set_source_files_properties(device_gemm_xdl_f16_f16_f16/km_nk_mn_default_pipeline_v2_opt_instance.cpp PROPERTIES
-    COMPILE_OPTIONS "${MAX_ILP_OPTS}"
-    COMPILE_DEFINITIONS "${WAVES_PER_EU_DEFS};${IGLP_OPT_DEFS}")
-# layout=TT
-set_source_files_properties(device_gemm_xdl_f16_f16_f16/mk_kn_mn_default_pipeline_v2_opt_instance.cpp PROPERTIES
-    COMPILE_OPTIONS "${MAX_ILP_OPTS}"
-    COMPILE_DEFINITIONS "${WAVES_PER_EU_DEFS}")
-# layout=TN
-set_source_files_properties(device_gemm_xdl_f16_f16_f16/mk_nk_mn_default_pipeline_v2_opt_instance.cpp PROPERTIES
-    COMPILE_OPTIONS "${MAX_ILP_OPTS}"
-    COMPILE_DEFINITIONS "${WAVES_PER_EU_DEFS};${IGLP_OPT_DEFS}")
-endif(ENABLE_PIPELINE_V2_OPT)
+    if (ENABLE_PIPELINE_V2_OPT)
+        set(MAX_ILP_OPTS
+            -mllvm
+            -amdgpu-enable-max-ilp-scheduling-strategy
+        )
+        set(WAVES_PER_EU_DEFS
+            CK_USE_WAVES_PER_EU=1
+            CK_MIN_WAVES_PER_EU=1
+            CK_MAX_WAVES_PER_EU=1
+        )
+        set(IGLP_OPT_DEFS
+            CK_EXPERIMENTAL_PIPELINE_V2_IGLP_OPT=1
+        )
+
+        # layout=NT
+        set_source_files_properties(device_gemm_xdl_f16_f16_f16/km_kn_mn_default_pipeline_v2_opt_instance.cpp PROPERTIES
+            COMPILE_OPTIONS ";;"
+            COMPILE_DEFINITIONS "${WAVES_PER_EU_DEFS};${IGLP_OPT_DEFS}")
+        # layout=NN
+        set_source_files_properties(device_gemm_xdl_f16_f16_f16/km_nk_mn_default_pipeline_v2_opt_instance.cpp PROPERTIES
+            COMPILE_OPTIONS "${MAX_ILP_OPTS}"
+            COMPILE_DEFINITIONS "${WAVES_PER_EU_DEFS};${IGLP_OPT_DEFS}")
+        # layout=TT
+        set_source_files_properties(device_gemm_xdl_f16_f16_f16/mk_kn_mn_default_pipeline_v2_opt_instance.cpp PROPERTIES
+            COMPILE_OPTIONS ";;"
+            COMPILE_DEFINITIONS "${WAVES_PER_EU_DEFS};${IGLP_OPT_DEFS}")
+        # layout=TN
+        set_source_files_properties(device_gemm_xdl_f16_f16_f16/mk_nk_mn_default_pipeline_v2_opt_instance.cpp PROPERTIES
+            COMPILE_OPTIONS "${MAX_ILP_OPTS}"
+            COMPILE_DEFINITIONS "${WAVES_PER_EU_DEFS};${IGLP_OPT_DEFS}")
+    endif(ENABLE_PIPELINE_V2_OPT)
+endif(DTYPES MATCHES "fp16" OR NOT DEFINED DTYPES)
 
diff --git a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_f16_f16_f16/mk_kn_mn_default_pipeline_v2_opt_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_f16_f16_f16/mk_kn_mn_default_pipeline_v2_opt_instance.cpp
index 5314c7fc20..b931b8fdfd 100644
--- a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_f16_f16_f16/mk_kn_mn_default_pipeline_v2_opt_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_f16_f16_f16/mk_kn_mn_default_pipeline_v2_opt_instance.cpp
@@ -18,7 +18,7 @@ using Instances =
         //##########| Type| Type| Type| Type| | | | Elementwise| Elementwise| Elementwise|Specialization| Size| Block| Block| Block| | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| SrcDstVectorDim| DstScalar| | | |
         //##########| | | | | | | | Operation| Operation| Operation| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | | PerVector| | | |
         //##########| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | |
-        DeviceGemmXdl< F16, F16, F16, F32, Row, Row, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 256, 64, 128, 8, 8, 32, 32, 1, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, true, 7, 1, 1, LoopScheduler::Default, PipelineVersion::v2>
+        DeviceGemmXdl< F16, F16, F16, F32, Row, Row, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 256, 64, 128, 8, 8, 32, 32, 1, 2, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, true, 7, 1, 1, LoopScheduler::Default, PipelineVersion::v2>
 #endif
         // clang-format on
         >;
diff --git a/profiler/src/profile_gemm.cpp b/profiler/src/profile_gemm.cpp
index fbef123860..d1f5debc7d 100644
--- a/profiler/src/profile_gemm.cpp
+++ b/profiler/src/profile_gemm.cpp
@@ -121,7 +121,10 @@ int profile_gemm(int argc, char* argv[])
         return pass ? 0 : 1;
     };
 
-    if(data_type == GemmDataType::F32_F32_F32 && layout == GemmMatrixLayout::MK_KN_MN)
+    if(false)
+        ;
+#ifdef __fp32__
+    else if(data_type == GemmDataType::F32_F32_F32 && layout == GemmMatrixLayout::MK_KN_MN)
     {
         return profile(Row{}, Row{}, Row{}, F32{}, F32{}, F32{}, F32{});
     }
@@ -137,6 +140,8 @@ int profile_gemm(int argc, char* argv[])
     {
         return profile(Col{}, Col{}, Row{}, F32{}, F32{}, F32{}, F32{});
     }
+#endif
+#ifdef __fp16__
     else if(data_type == GemmDataType::F16_F16_F16 && layout == GemmMatrixLayout::MK_KN_MN)
     {
         return profile(Row{}, Row{}, Row{}, F16{}, F16{}, F32{}, F16{});
@@ -153,6 +158,7 @@ int profile_gemm(int argc, char* argv[])
     {
         return profile(Col{}, Col{}, Row{}, F16{}, F16{}, F32{}, F16{});
     }
+#endif
 #ifdef __bf16__
     else if(data_type == GemmDataType::BF16_BF16_BF16 && layout == GemmMatrixLayout::MK_KN_MN)
     {