mirror of
https://github.com/ROCm/composable_kernel.git
synced 2026-05-13 09:45:56 +00:00
Update tuning parameter & compilation options of DeviceGemmXdl<> instance (layout=TT) (#819)
* Enable pipeline v2 opt for layout=TT instance * Use better thread mapping for reading A tile * Conditionally enable pipeline v2 opt * Allow enabling only fp16 gemm instances in profiler * Fix formatting error * Fix compilation error if we enable fp32 in profiler
This commit is contained in:
@@ -95,36 +95,39 @@ endif()
|
||||
|
||||
add_instance_library(device_gemm_instance ${GEMM_INSTANCES})
|
||||
|
||||
set(ENABLE_PIPELINE_V2_OPT OFF)
|
||||
|
||||
if (ENABLE_PIPELINE_V2_OPT)
|
||||
set(MAX_ILP_OPTS
|
||||
-mllvm
|
||||
-amdgpu-enable-max-ilp-scheduling-strategy
|
||||
)
|
||||
set(WAVES_PER_EU_DEFS
|
||||
CK_USE_WAVES_PER_EU=1
|
||||
CK_MIN_WAVES_PER_EU=1
|
||||
CK_MAX_WAVES_PER_EU=1
|
||||
)
|
||||
set(IGLP_OPT_DEFS
|
||||
CK_EXPERIMENTAL_PIPELINE_V2_IGLP_OPT=1
|
||||
)
|
||||
if(DTYPES MATCHES "fp16" OR NOT DEFINED DTYPES)
|
||||
set(ENABLE_PIPELINE_V2_OPT OFF)
|
||||
|
||||
# layout=NT
|
||||
set_source_files_properties(device_gemm_xdl_f16_f16_f16/km_kn_mn_default_pipeline_v2_opt_instance.cpp PROPERTIES
|
||||
COMPILE_OPTIONS ";;"
|
||||
COMPILE_DEFINITIONS "${WAVES_PER_EU_DEFS};${IGLP_OPT_DEFS}")
|
||||
# layout=NN
|
||||
set_source_files_properties(device_gemm_xdl_f16_f16_f16/km_nk_mn_default_pipeline_v2_opt_instance.cpp PROPERTIES
|
||||
COMPILE_OPTIONS "${MAX_ILP_OPTS}"
|
||||
COMPILE_DEFINITIONS "${WAVES_PER_EU_DEFS};${IGLP_OPT_DEFS}")
|
||||
# layout=TT
|
||||
set_source_files_properties(device_gemm_xdl_f16_f16_f16/mk_kn_mn_default_pipeline_v2_opt_instance.cpp PROPERTIES
|
||||
COMPILE_OPTIONS "${MAX_ILP_OPTS}"
|
||||
COMPILE_DEFINITIONS "${WAVES_PER_EU_DEFS}")
|
||||
# layout=TN
|
||||
set_source_files_properties(device_gemm_xdl_f16_f16_f16/mk_nk_mn_default_pipeline_v2_opt_instance.cpp PROPERTIES
|
||||
COMPILE_OPTIONS "${MAX_ILP_OPTS}"
|
||||
COMPILE_DEFINITIONS "${WAVES_PER_EU_DEFS};${IGLP_OPT_DEFS}")
|
||||
endif(ENABLE_PIPELINE_V2_OPT)
|
||||
if (ENABLE_PIPELINE_V2_OPT)
|
||||
set(MAX_ILP_OPTS
|
||||
-mllvm
|
||||
-amdgpu-enable-max-ilp-scheduling-strategy
|
||||
)
|
||||
set(WAVES_PER_EU_DEFS
|
||||
CK_USE_WAVES_PER_EU=1
|
||||
CK_MIN_WAVES_PER_EU=1
|
||||
CK_MAX_WAVES_PER_EU=1
|
||||
)
|
||||
set(IGLP_OPT_DEFS
|
||||
CK_EXPERIMENTAL_PIPELINE_V2_IGLP_OPT=1
|
||||
)
|
||||
|
||||
# layout=NT
|
||||
set_source_files_properties(device_gemm_xdl_f16_f16_f16/km_kn_mn_default_pipeline_v2_opt_instance.cpp PROPERTIES
|
||||
COMPILE_OPTIONS ";;"
|
||||
COMPILE_DEFINITIONS "${WAVES_PER_EU_DEFS};${IGLP_OPT_DEFS}")
|
||||
# layout=NN
|
||||
set_source_files_properties(device_gemm_xdl_f16_f16_f16/km_nk_mn_default_pipeline_v2_opt_instance.cpp PROPERTIES
|
||||
COMPILE_OPTIONS "${MAX_ILP_OPTS}"
|
||||
COMPILE_DEFINITIONS "${WAVES_PER_EU_DEFS};${IGLP_OPT_DEFS}")
|
||||
# layout=TT
|
||||
set_source_files_properties(device_gemm_xdl_f16_f16_f16/mk_kn_mn_default_pipeline_v2_opt_instance.cpp PROPERTIES
|
||||
COMPILE_OPTIONS ";;"
|
||||
COMPILE_DEFINITIONS "${WAVES_PER_EU_DEFS};${IGLP_OPT_DEFS}")
|
||||
# layout=TN
|
||||
set_source_files_properties(device_gemm_xdl_f16_f16_f16/mk_nk_mn_default_pipeline_v2_opt_instance.cpp PROPERTIES
|
||||
COMPILE_OPTIONS "${MAX_ILP_OPTS}"
|
||||
COMPILE_DEFINITIONS "${WAVES_PER_EU_DEFS};${IGLP_OPT_DEFS}")
|
||||
endif(ENABLE_PIPELINE_V2_OPT)
|
||||
endif(DTYPES MATCHES "fp16" OR NOT DEFINED DTYPES)
|
||||
|
||||
@@ -18,7 +18,7 @@ using Instances =
|
||||
//##########| Type| Type| Type| Type| | | | Elementwise| Elementwise| Elementwise|Specialization| Size| Block| Block| Block| | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| SrcDstVectorDim| DstScalar| | | |
|
||||
//##########| | | | | | | | Operation| Operation| Operation| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | | PerVector| | | |
|
||||
//##########| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | |
|
||||
DeviceGemmXdl< F16, F16, F16, F32, Row, Row, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 256, 64, 128, 8, 8, 32, 32, 1, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, true, 7, 1, 1, LoopScheduler::Default, PipelineVersion::v2>
|
||||
DeviceGemmXdl< F16, F16, F16, F32, Row, Row, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 256, 64, 128, 8, 8, 32, 32, 1, 2, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, true, 7, 1, 1, LoopScheduler::Default, PipelineVersion::v2>
|
||||
#endif
|
||||
// clang-format on
|
||||
>;
|
||||
|
||||
Reference in New Issue
Block a user