From 308d3698c0f54db1a58bf48c7b371a21df09b0f0 Mon Sep 17 00:00:00 2001
From: BrianHarrisonAMD <169072757+BrianHarrisonAMD@users.noreply.github.com>
Date: Tue, 24 Sep 2024 10:15:30 -0600
Subject: [PATCH] Add additional instances to device_mha_instance (#1522)

* Add additional instances to device_mha_instance

* Add comment to describe what receipt 3 option filters

---------

Co-authored-by: Po Yen Chen

[ROCm/composable_kernel commit: 3528a523ffb7aadd48fb076cb8d4476954abf8c6]
---
NOTE(review): this patch was re-flowed from a copy whose newlines and
indentation had been collapsed. Line boundaries and blank context lines were
restored so that every hunk matches its declared line counts, but the leading
indentation inside the hunks is a best guess and is not guaranteed to be
byte-exact. Until it has been checked against the upstream commit, apply with
`git am --ignore-whitespace` or `git apply --ignore-whitespace`.

 .../gpu/CMakeLists.txt                        | 10 +++++---
 .../gpu/mha/CMakeLists.txt                    | 22 ++++++++++++++-----
 2 files changed, 22 insertions(+), 10 deletions(-)

diff --git a/library/src/tensor_operation_instance/gpu/CMakeLists.txt b/library/src/tensor_operation_instance/gpu/CMakeLists.txt
index 72026dc185..df3283b543 100644
--- a/library/src/tensor_operation_instance/gpu/CMakeLists.txt
+++ b/library/src/tensor_operation_instance/gpu/CMakeLists.txt
@@ -102,12 +102,14 @@ function(add_instance_library INSTANCE_NAME)
                 set(FMHA_FWD_FAST_EXP2 true)
             endif()
             if(FMHA_FWD_FAST_EXP2)
-                list(APPEND EXAMPLE_FMHA_FWD_COMPILE_OPTIONS -Wno-undefined-func-template -DCK_TILE_FMHA_FWD_FAST_EXP2=1 -fgpu-flush-denormals-to-zero)
+                list(APPEND FMHA_COMPILE_OPTIONS -Wno-undefined-func-template -DCK_TILE_FMHA_FWD_FAST_EXP2=1 -fgpu-flush-denormals-to-zero)
             else()
-                list(APPEND EXAMPLE_FMHA_FWD_COMPILE_OPTIONS -Wno-undefined-func-template -DCK_TILE_FMHA_FWD_FAST_EXP2=0)
+                list(APPEND FMHA_COMPILE_OPTIONS -Wno-undefined-func-template -DCK_TILE_FMHA_FWD_FAST_EXP2=0)
             endif()
-            list(APPEND EXAMPLE_FMHA_FWD_COMPILE_OPTIONS -Wno-float-equal)
-            target_compile_options(device_mha_instance PRIVATE ${EXAMPLE_FMHA_FWD_COMPILE_OPTIONS})
+            list(APPEND FMHA_COMPILE_OPTIONS -Wno-float-equal)
+            list(APPEND FMHA_COMPILE_OPTIONS -DCK_TILE_FMHA_FWD_SPLITKV_API=1)
+            list(APPEND FMHA_COMPILE_OPTIONS -DCK_TILE_FMHA_FWD_APPENDKV_API=1)
+            target_compile_options(device_mha_instance PRIVATE ${FMHA_COMPILE_OPTIONS})
         endif()
 
         target_compile_features(${INSTANCE_NAME} PUBLIC)
diff --git a/library/src/tensor_operation_instance/gpu/mha/CMakeLists.txt b/library/src/tensor_operation_instance/gpu/mha/CMakeLists.txt
index 89dee21fb1..6d638b1747 100644
--- a/library/src/tensor_operation_instance/gpu/mha/CMakeLists.txt
+++ b/library/src/tensor_operation_instance/gpu/mha/CMakeLists.txt
@@ -32,23 +32,33 @@ if(EXISTS ${FMHA_CPP_FOLDER}/blob_list.txt)
     file(REMOVE ${FMHA_CPP_FOLDER}/blob_list.txt)
 endif()
 
+set(FMHA_KNOWN_APIS "fwd,fwd_splitkv,fwd_appendkv,bwd")
+
 # generate a list of kernels, but not actually emit files at config stage
+# Note: The receipt 3 arg filters the generated backwards instances to reduce compilation time.
+# With receipt 3 set, we are generating instances for datatype == {fp16 || bfp16}, bias == {no || alibi}, deterministic == off, and dpad == dvpad.
 execute_process(
-  COMMAND ${PYTHON_EXECUTABLE} ${CMAKE_SOURCE_DIR}/example/ck_tile/01_fmha/generate.py
+  COMMAND ${PYTHON_EXECUTABLE} ${FMHA_SRC_FOLDER}/generate.py
   --list_blobs ${FMHA_CPP_FOLDER}/blob_list.txt
+  --api ${FMHA_KNOWN_APIS}
+  --receipt 3
   RESULT_VARIABLE ret
 )
 if(ret AND NOT ret EQUAL 0)
   message( FATAL_ERROR "CK Tile MHA FAILED to genrate a list of kernels via Python.")
 else()
-  file(STRINGS ${FMHA_CPP_FOLDER}/blob_list.txt FMHA_FWD_GEN_BLOBS)
+  file(STRINGS ${FMHA_CPP_FOLDER}/blob_list.txt FMHA_GEN_BLOBS)
 endif()
 
 # actually generate the kernel content now
+# Note: The receipt 3 arg filters the generated backwards instances to reduce compilation time.
+# With receipt 3 set, we are generating instances for datatype == {fp16 || bfp16}, bias == {no || alibi}, deterministic == off, and dpad == dvpad.
 add_custom_command(
-  OUTPUT ${FMHA_FWD_GEN_BLOBS}
-  COMMAND ${PYTHON_EXECUTABLE} ${CMAKE_SOURCE_DIR}/example/ck_tile/01_fmha/generate.py
+  OUTPUT ${FMHA_GEN_BLOBS}
+  COMMAND ${PYTHON_EXECUTABLE} ${FMHA_SRC_FOLDER}/generate.py
   --output_dir ${FMHA_CPP_FOLDER}
+  --api ${FMHA_KNOWN_APIS}
+  --receipt 3
   COMMENT "Generating mha kernel (cpp) files now ..."
   VERBATIM
 )
@@ -57,12 +67,12 @@ add_custom_command(
 # have filename. Since, it was cauing the cmake
 # to throw "File name too long"
 set(device_files)
-foreach(filepath IN LISTS FMHA_FWD_GEN_BLOBS)
+foreach(filepath IN LISTS FMHA_GEN_BLOBS)
     get_filename_component(filename ${filepath} NAME)
     # Append the filename to the device_files list
     list(APPEND device_files ${filename})
 endforeach()
-add_custom_target(generate_cpp_files DEPENDS ${FMHA_FWD_GEN_BLOBS})
+add_custom_target(generate_cpp_files DEPENDS ${FMHA_GEN_BLOBS})
 
 add_instance_library(device_mha_instance ${device_files})
 