mirror of
https://github.com/ROCm/composable_kernel.git
synced 2026-05-13 01:36:06 +00:00
[CK_TILE] Add CK Tile bwd weight profiler ## Motivation To compare old CK and CK Tile, we need to extend the current CK profiler so that it can also run CK Tile instances through the same API. To get the same instance coverage in CK Tile as in the old CK, I've added code generation from old CK configurations to CK Tile instances using the CK Builder. ## Technical Details - The codegen Python script for CK Tile fwd convs is extended to also support bwd weight and bwd data. - The generated instances are added to the CMake build (target `device_grouped_conv_bwd_weight_tile_instances`). - A new profiler op (`grouped_conv_bwd_weight_tile`) has been added to the CK Profiler.
310 lines
15 KiB
CMake
310 lines
15 KiB
CMake
# Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
|
|
# SPDX-License-Identifier: MIT
|
|
|
|
# ckProfiler
#
# Two user-facing cache entries control what ends up in the profiler:
#   CK_PROFILER_OP_FILTER        regex matched against operator names
#   CK_PROFILER_INSTANCE_FILTER  regex matched against kernel instance names
# Both are matched with if(... MATCHES ...) further down in this file.
set(CK_PROFILER_OP_FILTER
    ""
    CACHE STRING "Filter for the operators to be profiled. Default is to include all")
set(CK_PROFILER_INSTANCE_FILTER
    ""
    CACHE STRING "Filter for the kernels instances to be profiled. Default is to be the same as the operator filter")

# An empty operator filter means "everything": substitute a match-all regex.
if(CK_PROFILER_OP_FILTER STREQUAL "")
    set(CK_PROFILER_OP_FILTER ".+")
endif()

# An empty instance filter falls back to the (already defaulted) operator filter.
if(CK_PROFILER_INSTANCE_FILTER STREQUAL "")
    set(CK_PROFILER_INSTANCE_FILTER ${CK_PROFILER_OP_FILTER})
endif()

message(STATUS "CK_PROFILER_OP_FILTER: ${CK_PROFILER_OP_FILTER}")
message(STATUS "CK_PROFILER_INSTANCE_FILTER: ${CK_PROFILER_INSTANCE_FILTER}")
|
|
|
|
# Operator sources that are candidates for the profiler regardless of GPU
# target or data type. Each entry must be named profile_<op>.cpp; the <op>
# part is what CK_PROFILER_OP_FILTER is matched against later in this file.
#
# profile_gemm_quantization.cpp is intentionally NOT listed here: it is
# appended below only when CK_ENABLE_INT8 is set, which is also the only
# condition under which device_quantization_instance is linked.
set(PROFILER_OPS
    profile_gemm.cpp
    profile_reduce.cpp
    profile_groupnorm_bwd_data.cpp
    profile_groupnorm_fwd.cpp
    profile_layernorm_bwd_data.cpp
    profile_layernorm_bwd_gamma_beta.cpp
    profile_groupnorm_bwd_gamma_beta.cpp
    profile_layernorm_fwd.cpp
    profile_max_pool2d_fwd.cpp
    profile_pool3d_fwd.cpp
    profile_avg_pool3d_bwd.cpp
    profile_max_pool3d_bwd.cpp
    profile_avg_pool2d_bwd.cpp
    profile_max_pool2d_bwd.cpp
    profile_softmax.cpp
    profile_batchnorm_fwd.cpp
    profile_batchnorm_bwd.cpp
    profile_batchnorm_infer.cpp
    profile_conv_tensor_rearrange.cpp
    profile_transpose.cpp
    profile_permute_scale.cpp
)
|
|
|
|
# Operators that are only added when a gfx9 target is among SUPPORTED_GPU_TARGETS.
if(SUPPORTED_GPU_TARGETS MATCHES "gfx9")
    # Contraction profilers: added for fp32/fp64, or when DTYPES is not defined.
    if(NOT DEFINED DTYPES OR DTYPES MATCHES "fp32" OR DTYPES MATCHES "fp64")
        list(APPEND PROFILER_OPS
            profile_contraction_bilinear.cpp
            profile_contraction_scale.cpp)
    endif()

    # CK Tile convolution profilers, gated on the experimental builder option.
    if(CK_EXPERIMENTAL_BUILDER)
        list(APPEND PROFILER_OPS
            profile_grouped_conv_fwd_tile.cpp
            profile_grouped_conv_bwd_weight_tile.cpp)
    endif()
endif()
|
|
|
|
# Operators added when a gfx9, gfx11 or gfx12 target is enabled.
if(SUPPORTED_GPU_TARGETS MATCHES "gfx9|gfx1[12]")
    # Added for fp16, or when DTYPES is not defined.
    if(NOT DEFINED DTYPES OR DTYPES MATCHES "fp16")
        list(APPEND PROFILER_OPS
            profile_gemm_reduce.cpp
            profile_batched_gemm_add_relu_gemm_add.cpp
            profile_gemm_add.cpp
            profile_grouped_gemm.cpp
            profile_gemm_streamk.cpp
            profile_gemm_add_relu.cpp
            profile_gemm_add_relu_add_layernorm.cpp
            profile_grouped_gemm_fixed_nk.cpp
            profile_grouped_gemm_fastgelu.cpp
            profile_grouped_gemm_tile_loop.cpp
            profile_grouped_gemm_multiply_tile_loop.cpp)
    endif()

    # Narrower target sets nested inside the gfx9/gfx11/gfx12 branch.
    if(SUPPORTED_GPU_TARGETS MATCHES "gfx9[45]|gfx12")
        list(APPEND PROFILER_OPS
            profile_gemm_multiply_multiply_wp.cpp
            profile_gemm_ab_scale.cpp
            profile_gemm_blockscale_wp.cpp
            profile_gemm_universal_preshuffle.cpp)
    endif()
    if(SUPPORTED_GPU_TARGETS MATCHES "gfx95")
        list(APPEND PROFILER_OPS profile_gemm_mx.cpp)
    endif()

    # Added for every target matched by the enclosing condition.
    list(APPEND PROFILER_OPS
        profile_batched_gemm_reduce.cpp
        profile_gemm_add_multiply.cpp
        profile_gemm_add.cpp
        profile_gemm_bias_add_reduce.cpp
        profile_gemm_splitk.cpp
        profile_gemm_universal_batched.cpp
        profile_gemm_universal_streamk.cpp
        profile_conv_fwd_bias_relu.cpp
        profile_conv_fwd_bias_relu_add.cpp
        profile_conv_bwd_data.cpp
        profile_conv_fwd.cpp)
endif()
|
|
|
|
# gemm_bilinear: on gfx9 it additionally requires fp16 (or DTYPES not defined);
# on gfx11/gfx12 it is added without a data-type condition.
if((SUPPORTED_GPU_TARGETS MATCHES "gfx9" AND (NOT DEFINED DTYPES OR DTYPES MATCHES "fp16"))
   OR (SUPPORTED_GPU_TARGETS MATCHES "gfx1[12]"))
    list(APPEND PROFILER_OPS profile_gemm_bilinear.cpp)
endif()

# gemm_multiply_multiply: gfx94, gfx95, gfx11 and gfx12 only.
if(SUPPORTED_GPU_TARGETS MATCHES "gfx(9[45]|1[12])")
    list(APPEND PROFILER_OPS profile_gemm_multiply_multiply.cpp)
endif()
|
|
|
|
# Second group of operators added when a gfx9, gfx11 or gfx12 target is enabled.
if(SUPPORTED_GPU_TARGETS MATCHES "gfx9|gfx1[12]")
    list(APPEND PROFILER_OPS
        profile_gemm_universal.cpp
        profile_batched_gemm.cpp
        profile_batched_gemm_b_scale.cpp
        profile_gemm_b_scale.cpp
        profile_gemm_universal_reduce.cpp
        profile_grouped_conv_fwd.cpp
        profile_grouped_conv_fwd_bias_clamp.cpp
        profile_grouped_conv_fwd_bias_bnorm_clamp.cpp
        profile_grouped_conv_fwd_clamp.cpp
        profile_grouped_conv_fwd_convscale_add.cpp
        profile_grouped_conv_bwd_data.cpp
        profile_grouped_conv_fwd_dynamic_op.cpp
        profile_grouped_conv_fwd_bilinear.cpp
        profile_grouped_conv_bwd_weight.cpp
        profile_grouped_conv_fwd_outelementop.cpp
        profile_gemm_multi_abd.cpp
        profile_grouped_conv_fwd_scaleadd_scaleadd_relu.cpp)

    # Added for fp16, or when DTYPES is not defined.
    # profile_gemm_add.cpp used to be appended to PROFILER_SOURCES here. That
    # variable is re-initialised with set(PROFILER_SOURCES profiler.cpp) before
    # the executable is created, so the append was discarded and also bypassed
    # CK_PROFILER_OP_FILTER. It now goes through PROFILER_OPS like every other
    # operator.
    if(DTYPES MATCHES "fp16" OR NOT DEFINED DTYPES)
        list(APPEND PROFILER_OPS
            profile_gemm_add_multiply.cpp
            profile_gemm_multiply_add.cpp
            profile_gemm_add_silu.cpp
            profile_gemm_fastgelu.cpp
            profile_gemm_add_fastgelu.cpp
            profile_gemm_add_add_fastgelu.cpp
            profile_gemm_add.cpp)
    endif()

    list(APPEND PROFILER_OPS profile_batched_gemm_gemm.cpp)
endif()
|
|
|
|
# Operators gated on build options rather than on the GPU target list.
if(DL_KERNELS)
    list(APPEND PROFILER_OPS
        profile_batched_gemm_multi_d.cpp
        profile_grouped_conv_bwd_weight.cpp)
endif()

if(CK_ENABLE_INT8)
    list(APPEND PROFILER_OPS profile_gemm_quantization.cpp)
endif()
|
|
|
|
# Build the final source list for the profiler executable.
#
# profiler.cpp is always compiled. Every entry of PROFILER_OPS must be named
# profile_<op>.cpp; the <op> part is matched against CK_PROFILER_OP_FILTER and
# the source is kept only when it matches.
set(PROFILER_SOURCES profiler.cpp)

# The same operator can be appended by more than one condition above; keep the
# first occurrence only so each source is considered once.
list(REMOVE_DUPLICATES PROFILER_OPS)

foreach(SOURCE IN LISTS PROFILER_OPS)
    # Anchored pattern with an escaped dot. The previous "\." was collapsed to
    # "." by CMake's quoted-argument escaping, and string(REGEX REPLACE) returns
    # its input unchanged when nothing matches, so a malformed name was never
    # reported.
    if(NOT "${SOURCE}" MATCHES "^profile_(.+)\\.cpp$")
        message(FATAL_ERROR "Unexpected source file name: ${SOURCE}")
    endif()
    # Save the capture now: the next MATCHES overwrites CMAKE_MATCH_<n>.
    set(OP_NAME "${CMAKE_MATCH_1}")

    if("${OP_NAME}" MATCHES "${CK_PROFILER_OP_FILTER}")
        list(APPEND PROFILER_SOURCES ${SOURCE})
    endif()
endforeach()
message(VERBOSE "ckProfiler sources: ${PROFILER_SOURCES}")
|
|
|
|
# Create the profiler executable from the filtered source list.
set(PROFILER_EXECUTABLE ckProfiler)

add_executable(${PROFILER_EXECUTABLE} ${PROFILER_SOURCES})
target_compile_options(${PROFILER_EXECUTABLE} PRIVATE -Wno-global-constructors)

# flags to compress the library
# hip_VERSION_FLAT is referenced by name and guarded with DEFINED: expanding it
# unquoted turns the condition into a syntax error when the variable is unset
# or empty. When it is unset the flag is simply not added.
if(NOT WIN32 AND DEFINED hip_VERSION_FLAT AND hip_VERSION_FLAT GREATER 600241132)
    message(DEBUG "Adding --offload-compress flag for ${PROFILER_EXECUTABLE}")
    target_compile_options(${PROFILER_EXECUTABLE} PRIVATE --offload-compress)
endif()
|
|
|
|
|
|
# Kernel instance libraries that are candidates for linking regardless of GPU
# target or data type. Names follow the device_<name>_instance pattern that
# the link loop at the end of this file parses.
set(DEVICE_INSTANCES
    device_gemm_instance
    device_normalization_fwd_instance
    device_normalization_bwd_data_instance
    device_normalization_bwd_gamma_beta_instance
    device_softmax_instance
    device_reduce_instance
    device_batchnorm_instance
    device_pool2d_fwd_instance
    device_pool3d_fwd_instance
    device_avg_pool2d_bwd_instance
    device_avg_pool3d_bwd_instance
    device_max_pool_bwd_instance
    device_image_to_column_instance
    device_column_to_image_instance
    device_transpose_instance
    device_permute_scale_instance
)
|
|
|
|
# Instance libraries added when a gfx9, gfx11 or gfx12 target is enabled.
if(SUPPORTED_GPU_TARGETS MATCHES "gfx9|gfx1[12]")
    # Added for fp32/fp64, or when DTYPES is not defined.
    if(NOT DEFINED DTYPES OR DTYPES MATCHES "fp32" OR DTYPES MATCHES "fp64")
        list(APPEND DEVICE_INSTANCES
            device_contraction_bilinear_instance
            device_contraction_scale_instance)
    endif()

    # Added for fp16, or when DTYPES is not defined.
    if(NOT DEFINED DTYPES OR DTYPES MATCHES "fp16")
        list(APPEND DEVICE_INSTANCES
            device_gemm_add_instance
            device_batched_gemm_gemm_instance
            device_gemm_add_add_fastgelu_instance
            device_gemm_fastgelu_instance
            device_batched_gemm_add_relu_gemm_add_instance
            device_grouped_gemm_instance
            device_gemm_streamk_instance
            device_gemm_add_relu_instance
            device_gemm_add_relu_add_layernorm_instance
            device_grouped_gemm_fixed_nk_instance
            device_grouped_gemm_fastgelu_instance
            device_grouped_gemm_tile_loop_instance)
    endif()

    list(APPEND DEVICE_INSTANCES device_batched_gemm_reduce_instance)

    # Narrower target sets nested inside the gfx9/gfx11/gfx12 branch.
    if(SUPPORTED_GPU_TARGETS MATCHES "gfx9[45]|gfx12")
        list(APPEND DEVICE_INSTANCES
            device_gemm_multiply_multiply_wp_instance
            device_gemm_universal_preshuffle_instance)
    endif()
    if(SUPPORTED_GPU_TARGETS MATCHES "gfx9[45]|gfx1[12]")
        list(APPEND DEVICE_INSTANCES
            device_gemm_ab_scale_instance
            device_gemm_blockscale_wp_instance)
    endif()
    if(SUPPORTED_GPU_TARGETS MATCHES "gfx95")
        list(APPEND DEVICE_INSTANCES device_gemm_mx_instance)
    endif()

    # Added for every target matched by the enclosing condition.
    list(APPEND DEVICE_INSTANCES
        device_gemm_splitk_instance
        device_gemm_universal_batched_instance
        device_gemm_universal_streamk_instance
        device_gemm_add_multiply_instance
        device_gemm_add_instance
        device_gemm_reduce_instance
        device_gemm_bias_add_reduce_instance
        device_conv2d_fwd_instance
        device_conv2d_fwd_bias_relu_instance
        device_conv2d_fwd_bias_relu_add_instance
        device_conv1d_bwd_data_instance
        device_conv3d_bwd_data_instance
        device_conv2d_bwd_data_instance)
endif()
|
|
|
|
# gemm_bilinear instances: on gfx9 they additionally require fp16 (or DTYPES
# not defined); on gfx11/gfx12 there is no data-type condition.
if((SUPPORTED_GPU_TARGETS MATCHES "gfx9" AND (NOT DEFINED DTYPES OR DTYPES MATCHES "fp16"))
   OR (SUPPORTED_GPU_TARGETS MATCHES "gfx1[12]"))
    list(APPEND DEVICE_INSTANCES device_gemm_bilinear_instance)
endif()

# gemm_multiply_multiply instances: gfx94, gfx95, gfx11 and gfx12 only.
if(SUPPORTED_GPU_TARGETS MATCHES "gfx(9[45]|1[12])")
    list(APPEND DEVICE_INSTANCES device_gemm_multiply_multiply_instance)
endif()
|
|
|
|
# Second group of instance libraries added when a gfx9, gfx11 or gfx12 target
# is enabled.
if(SUPPORTED_GPU_TARGETS MATCHES "gfx9|gfx1[12]")
    list(APPEND DEVICE_INSTANCES
        device_gemm_universal_instance
        device_batched_gemm_instance
        device_gemm_b_scale_instance
        device_gemm_universal_reduce_instance
        device_batched_gemm_b_scale_instance
        device_grouped_conv2d_bwd_data_instance
        device_grouped_conv3d_bwd_data_instance
        device_grouped_conv1d_fwd_instance
        device_grouped_conv2d_fwd_instance
        device_grouped_conv3d_fwd_convinvscale_instance
        device_grouped_conv3d_fwd_convscale_add_instance
        device_grouped_conv3d_fwd_convscale_relu_instance
        device_grouped_conv3d_fwd_convscale_instance
        device_grouped_conv3d_fwd_instance
        device_grouped_conv2d_fwd_clamp_instance
        device_grouped_conv3d_fwd_clamp_instance
        device_grouped_conv3d_fwd_scale_instance
        device_grouped_conv3d_fwd_scaleadd_scaleadd_relu_instance
        device_grouped_conv2d_fwd_bias_clamp_instance
        device_grouped_conv3d_fwd_bias_clamp_instance
        device_grouped_conv2d_fwd_bias_bnorm_clamp_instance
        device_grouped_conv3d_fwd_bias_bnorm_clamp_instance
        device_grouped_conv3d_fwd_bilinear_instance
        device_grouped_conv2d_fwd_dynamic_op_instance
        device_grouped_conv3d_fwd_dynamic_op_instance
        device_gemm_add_relu_instance
        device_gemm_multi_abd_instance)

    # Added for fp16, or when DTYPES is not defined.
    if(NOT DEFINED DTYPES OR DTYPES MATCHES "fp16")
        list(APPEND DEVICE_INSTANCES
            device_gemm_add_multiply_instance
            device_gemm_multiply_add_instance
            device_gemm_add_instance
            device_gemm_add_silu_instance
            device_gemm_fastgelu_instance
            device_gemm_add_fastgelu_instance
            device_gemm_add_add_fastgelu_instance)
    endif()

    list(APPEND DEVICE_INSTANCES
        device_batched_gemm_gemm_instance
        device_grouped_conv1d_bwd_weight_instance
        device_grouped_conv2d_bwd_weight_instance
        device_grouped_convnd_bwd_weight_instance
        device_grouped_conv3d_bwd_weight_instance)
endif()
|
|
|
|
# CK Tile convolution instance libraries: gfx9 only, and only when the
# experimental builder option is on. Note the plural "_instances" suffix on
# these two target names.
if(SUPPORTED_GPU_TARGETS MATCHES "gfx9")
    if(CK_EXPERIMENTAL_BUILDER)
        list(APPEND DEVICE_INSTANCES
            device_grouped_conv_fwd_tile_instances
            device_grouped_conv_bwd_weight_tile_instances)
    endif()
endif()
|
|
|
|
# Instance libraries gated on build options rather than on the GPU target list.
if(DL_KERNELS)
    list(APPEND DEVICE_INSTANCES
        device_batched_gemm_multi_d_instance
        device_grouped_conv1d_bwd_weight_instance
        device_grouped_conv2d_bwd_weight_instance
        device_grouped_conv3d_bwd_weight_instance)
endif()

if(CK_ENABLE_INT8)
    list(APPEND DEVICE_INSTANCES device_quantization_instance)
endif()
|
|
|
|
# Select the libraries to link into the profiler.
#
# utility and getopt::getopt are always linked. Each entry of DEVICE_INSTANCES
# must be named device_<name>_instance or device_<name>_instances; <name> is
# matched against CK_PROFILER_INSTANCE_FILTER, and a matching library is linked
# only if a CMake target of that name exists.
set(PROFILER_LIBS utility getopt::getopt)
foreach(LIB IN LISTS DEVICE_INSTANCES)
    # Anchored pattern that also accepts the plural suffix used by the CK Tile
    # targets. Without the optional "s" the extracted name kept a stray
    # trailing "s" (e.g. "grouped_conv_fwd_tiles") and no longer matched the
    # corresponding operator name. string(REGEX REPLACE) also returned the
    # input unchanged on a non-match, so a malformed name was never reported.
    if(NOT "${LIB}" MATCHES "^device_(.+)_instances?$")
        message(FATAL_ERROR "Unexpected kernel instance name: ${LIB}")
    endif()
    # Save the capture now: the next MATCHES overwrites CMAKE_MATCH_<n>.
    set(INSTANCE_NAME "${CMAKE_MATCH_1}")

    if("${INSTANCE_NAME}" MATCHES "${CK_PROFILER_INSTANCE_FILTER}")
        # Only link if the target was actually created
        if(TARGET ${LIB})
            list(APPEND PROFILER_LIBS ${LIB})
        else()
            message(VERBOSE "Skipping ${LIB} - no instances built for current GPU targets")
        endif()
    endif()
endforeach()
message(VERBOSE "ckProfiler libs: ${PROFILER_LIBS}")
target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE ${PROFILER_LIBS})

rocm_install(TARGETS ${PROFILER_EXECUTABLE} COMPONENT profiler)