diff --git a/library/src/tensor_operation_instance/gpu/CMakeLists.txt b/library/src/tensor_operation_instance/gpu/CMakeLists.txt index eab81f215d..0fff0dfa31 100644 --- a/library/src/tensor_operation_instance/gpu/CMakeLists.txt +++ b/library/src/tensor_operation_instance/gpu/CMakeLists.txt @@ -64,6 +64,13 @@ function(add_instance_library INSTANCE_NAME) list(REMOVE_ITEM ARGN "${source}") endif() endforeach() + # Do not build mha instances if gfx94 targets are not on the target list + foreach(source IN LISTS ARGN) + if(NOT INST_TARGETS MATCHES "gfx94" AND source MATCHES "mha") + message("removing mha instance ${source} ") + list(REMOVE_ITEM ARGN "${source}") + endif() + endforeach() #only continue if there are some source files left on the list if(ARGN) set(INST_OBJ) @@ -77,6 +84,8 @@ function(add_instance_library INSTANCE_NAME) list(REMOVE_ITEM INST_TARGETS gfx900 gfx906 gfx1030 gfx1100 gfx1101 gfx1102 gfx1103) elseif(ARGN MATCHES "_wmma") list(REMOVE_ITEM INST_TARGETS gfx900 gfx906 gfx908 gfx90a gfx940 gfx941 gfx942 gfx1030) + elseif(ARGN MATCHES "mha") + list(REMOVE_ITEM INST_TARGETS gfx900 gfx906 gfx908 gfx90a gfx1030 gfx1100 gfx1101 gfx1102 gfx1103) endif() set(offload_targets) foreach(target IN LISTS INST_TARGETS) @@ -86,6 +95,21 @@ function(add_instance_library INSTANCE_NAME) list(APPEND INST_OBJ ${source}) endforeach() add_library(${INSTANCE_NAME} OBJECT ${INST_OBJ}) + + # Allow comparing floating points directly in order to check sentinel values + if(${INSTANCE_NAME} STREQUAL "device_mha_instance") + if(NOT DEFINED FMHA_FWD_FAST_EXP2) + set(FMHA_FWD_FAST_EXP2 true) + endif() + if(FMHA_FWD_FAST_EXP2) + list(APPEND EXAMPLE_FMHA_FWD_COMPILE_OPTIONS -Wno-undefined-func-template -DCK_TILE_FMHA_FWD_FAST_EXP2=1 -fgpu-flush-denormals-to-zero) + else() + list(APPEND EXAMPLE_FMHA_FWD_COMPILE_OPTIONS -Wno-undefined-func-template -DCK_TILE_FMHA_FWD_FAST_EXP2=0) + endif() + list(APPEND EXAMPLE_FMHA_FWD_COMPILE_OPTIONS -Wno-float-equal) + target_compile_options(device_mha_instance PRIVATE ${EXAMPLE_FMHA_FWD_COMPILE_OPTIONS}) + endif() + target_compile_features(${INSTANCE_NAME} PUBLIC) # flags to compress the library @@ -293,20 +317,22 @@ if(CK_DEVICE_CONV_INSTANCES) ) endif() if(CK_DEVICE_MHA_INSTANCES) - add_library(device_mha_operations STATIC ${CK_DEVICE_MHA_INSTANCES}) - add_library(composablekernels::device_mha_operations ALIAS device_mha_operations) - target_compile_features(device_mha_operations PUBLIC) - set_target_properties(device_mha_operations PROPERTIES POSITION_INDEPENDENT_CODE ON) - target_include_directories(device_mha_operations PUBLIC - $ - ) - rocm_install(TARGETS device_mha_operations - EXPORT device_mha_operationsTargets) - rocm_install(EXPORT device_mha_operationsTargets - FILE composable_kerneldevice_mha_operationsTargets.cmake - NAMESPACE composable_kernel:: - DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/composable_kernel - ) + set(gpu_list ${INST_TARGETS}) + list(FILTER gpu_list INCLUDE REGEX "^gfx94") + if(gpu_list) + add_library(device_mha_operations STATIC ${CK_DEVICE_MHA_INSTANCES}) + add_library(composablekernels::device_mha_operations ALIAS device_mha_operations) + target_compile_features(device_mha_operations PUBLIC) + set_target_properties(device_mha_operations PROPERTIES POSITION_INDEPENDENT_CODE ON) + + rocm_install(TARGETS device_mha_operations + EXPORT device_mha_operationsTargets) + rocm_install(EXPORT device_mha_operationsTargets + FILE composable_kerneldevice_mha_operationsTargets.cmake + NAMESPACE composable_kernel:: + DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/composable_kernel + ) + endif() endif() if(CK_DEVICE_CONTRACTION_INSTANCES) add_library(device_contraction_operations STATIC ${CK_DEVICE_CONTRACTION_INSTANCES}) diff --git a/library/src/tensor_operation_instance/gpu/mha/CMakeLists.txt b/library/src/tensor_operation_instance/gpu/mha/CMakeLists.txt new file mode 100644 index 0000000000..59ae09b739 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/mha/CMakeLists.txt @@ -0,0 +1,55 @@ +set(FMHA_CPP_FOLDER ${CMAKE_CURRENT_BINARY_DIR}) +set(FMHA_SRC_FOLDER ${CMAKE_SOURCE_DIR}/example/ck_tile/01_fmha/) +set(CK_TILE_SRC_FOLDER ${CMAKE_SOURCE_DIR}/include/ck_tile/) +# python stuff +find_package(PythonInterp 3 REQUIRED) + +rocm_install(DIRECTORY ${CK_TILE_SRC_FOLDER} DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/ck_tile) + +rocm_install(FILES + "${FMHA_SRC_FOLDER}/fmha_fwd.hpp" + "${FMHA_SRC_FOLDER}/bias.hpp" + "${FMHA_SRC_FOLDER}/mask.hpp" + DESTINATION include/ck_tile/ops +) + +# header for building lib +file(COPY ${FMHA_SRC_FOLDER}/fmha_fwd.hpp DESTINATION ${FMHA_CPP_FOLDER}) +file(COPY ${FMHA_SRC_FOLDER}/bias.hpp DESTINATION ${FMHA_CPP_FOLDER}) +file(COPY ${FMHA_SRC_FOLDER}/mask.hpp DESTINATION ${FMHA_CPP_FOLDER}) + +# generate a list of kernels, but not actually emit files at config stage +execute_process( + COMMAND ${PYTHON_EXECUTABLE} ${CMAKE_SOURCE_DIR}/example/ck_tile/01_fmha/generate.py + --list_blobs ${FMHA_CPP_FOLDER}/blob_list.txt +) +file(STRINGS ${FMHA_CPP_FOLDER}/blob_list.txt FMHA_FWD_GEN_BLOBS) + +# actually generate the cpp files +add_custom_command( + OUTPUT ${FMHA_FWD_GEN_BLOBS} + COMMAND ${PYTHON_EXECUTABLE} ${CMAKE_SOURCE_DIR}/example/ck_tile/01_fmha/generate.py + --output_dir ${FMHA_CPP_FOLDER} + COMMENT "Generating mha kernel (cpp) files now ..." + VERBATIM +) + +# This is done to remove path info and just +# have filename. Since, it was cauing the cmake +# to throw "File name too long" +set(device_files) +foreach(filepath IN LISTS FMHA_FWD_GEN_BLOBS) + get_filename_component(filename ${filepath} NAME) + # Append the filename to the device_files list + list(APPEND device_files ${filename}) +endforeach() +add_custom_target(generate_cpp_files DEPENDS ${FMHA_FWD_GEN_BLOBS}) + +add_instance_library(device_mha_instance ${device_files}) + + + +if (TARGET device_mha_instance) + add_dependencies(device_mha_instance generate_cpp_files) +endif() +