From c6a3bd9f9309fca64c7aaf1f3c00a0e517ec22e8 Mon Sep 17 00:00:00 2001 From: ozturkosu Date: Wed, 15 Jan 2025 19:32:16 -0600 Subject: [PATCH] Comment other instance in Profiler CMAKE --- .../gridwise_gemm_xdl_cshuffle_streamk_v3.hpp | 21 +- profiler/src/CMakeLists.txt | 316 +++++++++--------- 2 files changed, 168 insertions(+), 169 deletions(-) diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_streamk_v3.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_streamk_v3.hpp index c58ce4a630..5efc408844 100755 --- a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_streamk_v3.hpp +++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_streamk_v3.hpp @@ -946,7 +946,7 @@ struct GridwiseGemm_xdl_cshuffle_streamk_v3 if constexpr(!(GemmSpec == tensor_operation::device::GemmSpecialization::MPadding || GemmSpec == tensor_operation::device::GemmSpecialization::MNPadding || GemmSpec == tensor_operation::device::GemmSpecialization::MKPadding || - GemmSpec == tensor_operation::device::GemmSpecialization::MNKPadding)) && + GemmSpec == tensor_operation::device::GemmSpecialization::MNKPadding) && !(is_same::value)) { if(!(karg.M % MPerBlock == 0)) @@ -964,7 +964,7 @@ struct GridwiseGemm_xdl_cshuffle_streamk_v3 if constexpr(!(GemmSpec == tensor_operation::device::GemmSpecialization::NPadding || GemmSpec == tensor_operation::device::GemmSpecialization::MNPadding || GemmSpec == tensor_operation::device::GemmSpecialization::NKPadding || - GemmSpec == tensor_operation::device::GemmSpecialization::MNKPadding)) && + GemmSpec == tensor_operation::device::GemmSpecialization::MNKPadding) && (is_same::value)) { if(!(karg.N % NPerBlock == 0)) @@ -986,6 +986,7 @@ struct GridwiseGemm_xdl_cshuffle_streamk_v3 { auto K_t = KPerBlock; + // auto K_t = karg.KBatch * KPerBlock; if(!(karg.K % K_t == 0)) { if(ck::EnvIsEnabled(CK_ENV(CK_LOGGING))) @@ -1000,17 +1001,17 @@ struct GridwiseGemm_xdl_cshuffle_streamk_v3 else { - // if(karg.K <= 0) - // { - // return false; - // } - constexpr auto KReadVec = math::lcm(AK1Number, BK1Number); - auto K_t = karg.KBatch * KReadVec; - auto KReadPadSplited = math::integer_divide_ceil(karg.K, K_t) * KReadVec; - if((KReadPadSplited * (karg.KBatch - 1)) >= karg.K) + if(karg.K <= 0) { return false; } + // constexpr auto KReadVec = math::lcm(AK1Number, BK1Number); + // auto K_t = karg.KBatch * KReadVec; + // auto KReadPadSplited = math::integer_divide_ceil(karg.K, K_t) * KReadVec; + // if((KReadPadSplited * (karg.KBatch - 1)) >= karg.K) + // { + // return false; + // } } if constexpr(is_same::value) diff --git a/profiler/src/CMakeLists.txt b/profiler/src/CMakeLists.txt index 61017d4b34..e183c98ffe 100644 --- a/profiler/src/CMakeLists.txt +++ b/profiler/src/CMakeLists.txt @@ -1,88 +1,87 @@ # ckProfiler set(PROFILER_SOURCES profiler.cpp - profile_gemm.cpp - profile_reduce.cpp - profile_groupnorm_bwd_data.cpp - profile_groupnorm_fwd.cpp - profile_layernorm_bwd_data.cpp - profile_layernorm_bwd_gamma_beta.cpp - profile_groupnorm_bwd_gamma_beta.cpp - profile_layernorm_fwd.cpp - profile_max_pool2d_fwd.cpp - profile_pool3d_fwd.cpp - profile_avg_pool3d_bwd.cpp - profile_max_pool3d_bwd.cpp - profile_avg_pool2d_bwd.cpp - profile_max_pool2d_bwd.cpp - profile_softmax.cpp - profile_batchnorm_fwd.cpp - profile_batchnorm_bwd.cpp - profile_batchnorm_infer.cpp - profile_conv_tensor_rearrange.cpp - profile_transpose.cpp - profile_permute_scale.cpp + # profile_gemm.cpp + # profile_reduce.cpp + # profile_groupnorm_bwd_data.cpp + # profile_groupnorm_fwd.cpp + # profile_layernorm_bwd_data.cpp + # profile_layernorm_bwd_gamma_beta.cpp + # profile_groupnorm_bwd_gamma_beta.cpp + # profile_layernorm_fwd.cpp + # profile_max_pool2d_fwd.cpp + # profile_pool3d_fwd.cpp + # profile_avg_pool3d_bwd.cpp + # profile_max_pool3d_bwd.cpp + # profile_avg_pool2d_bwd.cpp + # profile_max_pool2d_bwd.cpp + # profile_softmax.cpp + # profile_batchnorm_fwd.cpp + # profile_batchnorm_bwd.cpp + # profile_batchnorm_infer.cpp + # profile_conv_tensor_rearrange.cpp + # profile_transpose.cpp + # profile_permute_scale.cpp ) -if(SUPPORTED_GPU_TARGETS MATCHES "gfx9") - if(DTYPES MATCHES "fp32" OR DTYPES MATCHES "fp64" OR NOT DEFINED DTYPES) - list(APPEND PROFILER_SOURCES profile_contraction_bilinear.cpp) - list(APPEND PROFILER_SOURCES profile_contraction_scale.cpp) - endif() - if(DTYPES MATCHES "fp16" OR NOT DEFINED DTYPES) - list(APPEND PROFILER_SOURCES profile_gemm_reduce.cpp) - list(APPEND PROFILER_SOURCES profile_batched_gemm_gemm.cpp) - list(APPEND PROFILER_SOURCES profile_batched_gemm_add_relu_gemm_add.cpp) - list(APPEND PROFILER_SOURCES profile_gemm_add.cpp) - list(APPEND PROFILER_SOURCES profile_gemm_add_add_fastgelu.cpp) - list(APPEND PROFILER_SOURCES profile_gemm_add_fastgelu.cpp) - list(APPEND PROFILER_SOURCES profile_grouped_gemm.cpp) - list(APPEND PROFILER_SOURCES profile_gemm_streamk.cpp) - list(APPEND PROFILER_SOURCES profile_gemm_fastgelu.cpp) - list(APPEND PROFILER_SOURCES profile_gemm_add_relu.cpp) - list(APPEND PROFILER_SOURCES profile_gemm_add_silu.cpp) - list(APPEND PROFILER_SOURCES profile_gemm_add_relu_add_layernorm.cpp) - list(APPEND PROFILER_SOURCES profile_grouped_gemm_fixed_nk.cpp) - list(APPEND PROFILER_SOURCES profile_grouped_gemm_fastgelu.cpp) - list(APPEND PROFILER_SOURCES profile_grouped_gemm_tile_loop.cpp) - list(APPEND PROFILER_SOURCES profile_grouped_gemm_multiply_tile_loop.cpp) - endif() - list(APPEND PROFILER_SOURCES profile_gemm_multiply_add.cpp) - if(SUPPORTED_GPU_TARGETS MATCHES "gfx94") - list(APPEND PROFILER_SOURCES profile_gemm_multiply_multiply.cpp) - list(APPEND PROFILER_SOURCES profile_gemm_ab_scale.cpp) - endif() - list(APPEND PROFILER_SOURCES profile_batched_gemm.cpp) - list(APPEND PROFILER_SOURCES profile_batched_gemm_reduce.cpp) - list(APPEND PROFILER_SOURCES profile_gemm_add_multiply.cpp) - list(APPEND PROFILER_SOURCES profile_gemm_bias_add_reduce.cpp) - list(APPEND PROFILER_SOURCES profile_gemm_splitk.cpp) - list(APPEND PROFILER_SOURCES profile_gemm_universal.cpp) - list(APPEND PROFILER_SOURCES profile_gemm_b_scale.cpp) - list(APPEND PROFILER_SOURCES profile_gemm_universal_batched.cpp) - list(APPEND PROFILER_SOURCES profile_gemm_universal_reduce.cpp) - list(APPEND PROFILER_SOURCES profile_gemm_universal_streamk.cpp) - list(APPEND PROFILER_SOURCES profile_conv_fwd_bias_relu.cpp) - list(APPEND PROFILER_SOURCES profile_conv_fwd_bias_relu_add.cpp) - list(APPEND PROFILER_SOURCES profile_conv_bwd_data.cpp) - list(APPEND PROFILER_SOURCES profile_conv_fwd.cpp) - list(APPEND PROFILER_SOURCES profile_grouped_conv_fwd_outelementop.cpp) +# if(SUPPORTED_GPU_TARGETS MATCHES "gfx9") +# if(DTYPES MATCHES "fp32" OR DTYPES MATCHES "fp64" OR NOT DEFINED DTYPES) +# list(APPEND PROFILER_SOURCES profile_contraction_bilinear.cpp) +# list(APPEND PROFILER_SOURCES profile_contraction_scale.cpp) +# endif() +# if(DTYPES MATCHES "fp16" OR NOT DEFINED DTYPES) +# list(APPEND PROFILER_SOURCES profile_gemm_reduce.cpp) +# list(APPEND PROFILER_SOURCES profile_batched_gemm_gemm.cpp) +# list(APPEND PROFILER_SOURCES profile_batched_gemm_add_relu_gemm_add.cpp) +# list(APPEND PROFILER_SOURCES profile_gemm_add.cpp) +# list(APPEND PROFILER_SOURCES profile_gemm_add_add_fastgelu.cpp) +# list(APPEND PROFILER_SOURCES profile_gemm_add_fastgelu.cpp) +# list(APPEND PROFILER_SOURCES profile_grouped_gemm.cpp) +# list(APPEND PROFILER_SOURCES profile_gemm_streamk.cpp) +# list(APPEND PROFILER_SOURCES profile_gemm_fastgelu.cpp) +# list(APPEND PROFILER_SOURCES profile_gemm_add_relu.cpp) +# list(APPEND PROFILER_SOURCES profile_gemm_add_silu.cpp) +# list(APPEND PROFILER_SOURCES profile_gemm_add_relu_add_layernorm.cpp) +# list(APPEND PROFILER_SOURCES profile_grouped_gemm_fixed_nk.cpp) +# list(APPEND PROFILER_SOURCES profile_grouped_gemm_fastgelu.cpp) +# list(APPEND PROFILER_SOURCES profile_grouped_gemm_tile_loop.cpp) +# list(APPEND PROFILER_SOURCES profile_grouped_gemm_multiply_tile_loop.cpp) +# endif() +# list(APPEND PROFILER_SOURCES profile_gemm_multiply_add.cpp) +# if(SUPPORTED_GPU_TARGETS MATCHES "gfx94") +# list(APPEND PROFILER_SOURCES profile_gemm_multiply_multiply.cpp) +# list(APPEND PROFILER_SOURCES profile_gemm_ab_scale.cpp) +# endif() +# list(APPEND PROFILER_SOURCES profile_batched_gemm.cpp) +# list(APPEND PROFILER_SOURCES profile_batched_gemm_reduce.cpp) +# list(APPEND PROFILER_SOURCES profile_gemm_add_multiply.cpp) +# list(APPEND PROFILER_SOURCES profile_gemm_bias_add_reduce.cpp) +# list(APPEND PROFILER_SOURCES profile_gemm_splitk.cpp) +list(APPEND PROFILER_SOURCES profile_gemm_universal.cpp) +# list(APPEND PROFILER_SOURCES profile_gemm_universal_batched.cpp) +# list(APPEND PROFILER_SOURCES profile_gemm_universal_reduce.cpp) +# list(APPEND PROFILER_SOURCES profile_gemm_universal_streamk.cpp) +# list(APPEND PROFILER_SOURCES profile_conv_fwd_bias_relu.cpp) +# list(APPEND PROFILER_SOURCES profile_conv_fwd_bias_relu_add.cpp) +# list(APPEND PROFILER_SOURCES profile_conv_bwd_data.cpp) +# list(APPEND PROFILER_SOURCES profile_conv_fwd.cpp) +# list(APPEND PROFILER_SOURCES profile_grouped_conv_fwd_outelementop.cpp) -endif() +# endif() -if(SUPPORTED_GPU_TARGETS MATCHES "gfx11" OR SUPPORTED_GPU_TARGETS MATCHES "gfx12" OR SUPPORTED_GPU_TARGETS MATCHES "gfx9") - if(DTYPES MATCHES "fp16" OR NOT DEFINED DTYPES) - list(APPEND PROFILER_SOURCES profile_gemm_bilinear.cpp) - endif() - list(APPEND PROFILER_SOURCES profile_grouped_conv_fwd.cpp) - list(APPEND PROFILER_SOURCES profile_grouped_conv_bwd_data.cpp) - list(APPEND PROFILER_SOURCES profile_grouped_conv_bwd_weight.cpp) -endif() +# if(SUPPORTED_GPU_TARGETS MATCHES "gfx11" OR SUPPORTED_GPU_TARGETS MATCHES "gfx12" OR SUPPORTED_GPU_TARGETS MATCHES "gfx9") +# if(DTYPES MATCHES "fp16" OR NOT DEFINED DTYPES) +# list(APPEND PROFILER_SOURCES profile_gemm_bilinear.cpp) +# endif() +# list(APPEND PROFILER_SOURCES profile_grouped_conv_fwd.cpp) +# list(APPEND PROFILER_SOURCES profile_grouped_conv_bwd_data.cpp) +# list(APPEND PROFILER_SOURCES profile_grouped_conv_bwd_weight.cpp) +# endif() -if(DL_KERNELS) - list(APPEND PROFILER_SOURCES profile_batched_gemm_multi_d.cpp) - list(APPEND PROFILER_SOURCES profile_grouped_conv_bwd_weight.cpp) -endif() +# if(DL_KERNELS) +# list(APPEND PROFILER_SOURCES profile_batched_gemm_multi_d.cpp) +# list(APPEND PROFILER_SOURCES profile_grouped_conv_bwd_weight.cpp) +# endif() set(PROFILER_EXECUTABLE ckProfiler) @@ -95,88 +94,87 @@ if(NOT WIN32 AND ${hip_VERSION_FLAT} GREATER 600241132) endif() target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE utility getopt::getopt) -target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_gemm_instance) -target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_normalization_fwd_instance) -target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_normalization_bwd_data_instance) -target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_normalization_bwd_gamma_beta_instance) -target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_softmax_instance) -target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_reduce_instance) -target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_batchnorm_instance) -target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_pool2d_fwd_instance) -target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_pool3d_fwd_instance) -target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_avg_pool2d_bwd_instance) -target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_avg_pool3d_bwd_instance) -target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_max_pool_bwd_instance) -target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_image_to_column_instance) -target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_column_to_image_instance) -target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_transpose_instance) -target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_permute_scale_instance) +# target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_gemm_instance) +# target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_normalization_fwd_instance) +# target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_normalization_bwd_data_instance) +# target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_normalization_bwd_gamma_beta_instance) +# target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_softmax_instance) +# target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_reduce_instance) +# target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_batchnorm_instance) +# target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_pool2d_fwd_instance) +# target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_pool3d_fwd_instance) +# target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_avg_pool2d_bwd_instance) +# target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_avg_pool3d_bwd_instance) +# target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_max_pool_bwd_instance) +# target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_image_to_column_instance) +# target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_column_to_image_instance) +# target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_transpose_instance) +# target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_permute_scale_instance) -if(SUPPORTED_GPU_TARGETS MATCHES "gfx9") - if(DTYPES MATCHES "fp32" OR DTYPES MATCHES "fp64" OR NOT DEFINED DTYPES) - target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_contraction_bilinear_instance) - target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_contraction_scale_instance) - endif() - if(DTYPES MATCHES "fp16" OR NOT DEFINED DTYPES) - target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_gemm_add_instance) - target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_gemm_add_add_fastgelu_instance) - target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_gemm_fastgelu_instance) - target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_batched_gemm_gemm_instance) - target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_batched_gemm_add_relu_gemm_add_instance) - target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_grouped_gemm_instance) - target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_gemm_streamk_instance) - target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_gemm_add_fastgelu_instance) - target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_gemm_add_relu_instance) - target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_gemm_add_silu_instance) - target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_gemm_add_relu_add_layernorm_instance) - target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_grouped_gemm_fixed_nk_instance) - target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_grouped_gemm_fastgelu_instance) - target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_grouped_gemm_tile_loop_instance) - endif() - target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_batched_gemm_instance) - target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_batched_gemm_reduce_instance) - target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_gemm_multiply_add_instance) - if(SUPPORTED_GPU_TARGETS MATCHES "gfx94") - target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_gemm_multiply_multiply_instance) - target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_gemm_ab_scale_instance) - endif() - target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_gemm_splitk_instance) - target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_gemm_universal_instance) - target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_gemm_b_scale_instance) - target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_gemm_universal_batched_instance) - target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_gemm_universal_reduce_instance) - target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_gemm_universal_streamk_instance) - target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_gemm_add_multiply_instance) - target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_gemm_reduce_instance) - target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_gemm_bias_add_reduce_instance) - target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_conv2d_fwd_instance) - target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_conv2d_fwd_bias_relu_instance) - target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_conv2d_fwd_bias_relu_add_instance) - target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_grouped_conv1d_fwd_instance) - target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_conv1d_bwd_data_instance) - target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_conv3d_bwd_data_instance) - target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_conv2d_bwd_data_instance) - target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_grouped_conv1d_bwd_weight_instance) - target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_grouped_conv2d_bwd_weight_instance) - target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_grouped_conv3d_fwd_convscale_instance) - target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_grouped_conv3d_fwd_convinvscale_instance) -endif() +# if(SUPPORTED_GPU_TARGETS MATCHES "gfx9") +# if(DTYPES MATCHES "fp32" OR DTYPES MATCHES "fp64" OR NOT DEFINED DTYPES) +# target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_contraction_bilinear_instance) +# target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_contraction_scale_instance) +# endif() +# if(DTYPES MATCHES "fp16" OR NOT DEFINED DTYPES) +# target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_gemm_add_instance) +# target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_gemm_add_add_fastgelu_instance) +# target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_gemm_fastgelu_instance) +# target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_batched_gemm_gemm_instance) +# target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_batched_gemm_add_relu_gemm_add_instance) +# target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_grouped_gemm_instance) +# target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_gemm_streamk_instance) +# target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_gemm_add_fastgelu_instance) +# target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_gemm_add_relu_instance) +# target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_gemm_add_silu_instance) +# target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_gemm_add_relu_add_layernorm_instance) +# target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_grouped_gemm_fixed_nk_instance) +# target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_grouped_gemm_fastgelu_instance) +# target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_grouped_gemm_tile_loop_instance) +# endif() +# target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_batched_gemm_instance) +# target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_batched_gemm_reduce_instance) +# target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_gemm_multiply_add_instance) +# if(SUPPORTED_GPU_TARGETS MATCHES "gfx94") +# target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_gemm_multiply_multiply_instance) +# target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_gemm_ab_scale_instance) +# endif() +# target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_gemm_splitk_instance) +target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_gemm_universal_instance) +# target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_gemm_universal_batched_instance) +# target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_gemm_universal_reduce_instance) +# target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_gemm_universal_streamk_instance) +# target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_gemm_add_multiply_instance) +# target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_gemm_reduce_instance) +# target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_gemm_bias_add_reduce_instance) +# target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_conv2d_fwd_instance) +# target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_conv2d_fwd_bias_relu_instance) +# target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_conv2d_fwd_bias_relu_add_instance) +# target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_grouped_conv1d_fwd_instance) +# target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_conv1d_bwd_data_instance) +# target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_conv3d_bwd_data_instance) +# target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_conv2d_bwd_data_instance) +# target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_grouped_conv1d_bwd_weight_instance) +# target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_grouped_conv2d_bwd_weight_instance) +# target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_grouped_conv3d_fwd_convscale_instance) +# target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_grouped_conv3d_fwd_convinvscale_instance) +# endif() -if(SUPPORTED_GPU_TARGETS MATCHES "gfx9" OR SUPPORTED_GPU_TARGETS MATCHES "gfx11" OR SUPPORTED_GPU_TARGETS MATCHES "gfx12") - if(DTYPES MATCHES "fp16" OR NOT DEFINED DTYPES) - target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_gemm_bilinear_instance) - endif() - target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_grouped_conv3d_fwd_instance) - target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_grouped_conv2d_bwd_data_instance) - target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_grouped_conv3d_bwd_data_instance) - target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_grouped_conv2d_fwd_instance) - target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_grouped_conv3d_bwd_weight_instance) -endif() +# if(SUPPORTED_GPU_TARGETS MATCHES "gfx9" OR SUPPORTED_GPU_TARGETS MATCHES "gfx11" OR SUPPORTED_GPU_TARGETS MATCHES "gfx12") +# if(DTYPES MATCHES "fp16" OR NOT DEFINED DTYPES) +# target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_gemm_bilinear_instance) +# endif() +# target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_grouped_conv3d_fwd_instance) +# target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_grouped_conv2d_bwd_data_instance) +# target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_grouped_conv3d_bwd_data_instance) +# target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_grouped_conv2d_fwd_instance) +# target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_grouped_conv3d_bwd_weight_instance) +# endif() -if(DL_KERNELS) - target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_batched_gemm_multi_d_instance) - target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_grouped_conv1d_bwd_weight_instance) - target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_grouped_conv2d_bwd_weight_instance) - target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_grouped_conv3d_bwd_weight_instance) -endif() -rocm_install(TARGETS ${PROFILER_EXECUTABLE} COMPONENT profiler) +# if(DL_KERNELS) +# target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_batched_gemm_multi_d_instance) +# target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_grouped_conv1d_bwd_weight_instance) +# target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_grouped_conv2d_bwd_weight_instance) +# target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_grouped_conv3d_bwd_weight_instance) +# endif() +rocm_install(TARGETS ${PROFILER_EXECUTABLE} COMPONENT profiler) \ No newline at end of file