From 26257585227f9182860b652ce6e2dbcdf4878cf0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ville=20Pietil=C3=A4?= <> Date: Mon, 19 Jan 2026 06:09:29 -0500 Subject: [PATCH] Build only CK conv profilers. --- profiler/src/CMakeLists.txt | 334 ++++++++++++++++++------------------ 1 file changed, 167 insertions(+), 167 deletions(-) diff --git a/profiler/src/CMakeLists.txt b/profiler/src/CMakeLists.txt index e484ff9ef7..6abbaa3dc4 100644 --- a/profiler/src/CMakeLists.txt +++ b/profiler/src/CMakeLists.txt @@ -13,30 +13,30 @@ endif() message(STATUS "CK_PROFILER_OP_FILTER: ${CK_PROFILER_OP_FILTER}") message(STATUS "CK_PROFILER_INSTANCE_FILTER: ${CK_PROFILER_INSTANCE_FILTER}") -set(PROFILER_OPS - profile_gemm.cpp - profile_reduce.cpp - profile_groupnorm_bwd_data.cpp - profile_groupnorm_fwd.cpp - profile_layernorm_bwd_data.cpp - profile_layernorm_bwd_gamma_beta.cpp - profile_groupnorm_bwd_gamma_beta.cpp - profile_layernorm_fwd.cpp - profile_max_pool2d_fwd.cpp - profile_pool3d_fwd.cpp - profile_avg_pool3d_bwd.cpp - profile_max_pool3d_bwd.cpp - profile_avg_pool2d_bwd.cpp - profile_max_pool2d_bwd.cpp - profile_softmax.cpp - profile_batchnorm_fwd.cpp - profile_batchnorm_bwd.cpp - profile_batchnorm_infer.cpp - profile_conv_tensor_rearrange.cpp - profile_transpose.cpp - profile_permute_scale.cpp - profile_gemm_quantization.cpp -) +# set(PROFILER_OPS +# profile_gemm.cpp +# profile_reduce.cpp +# profile_groupnorm_bwd_data.cpp +# profile_groupnorm_fwd.cpp +# profile_layernorm_bwd_data.cpp +# profile_layernorm_bwd_gamma_beta.cpp +# profile_groupnorm_bwd_gamma_beta.cpp +# profile_layernorm_fwd.cpp +# profile_max_pool2d_fwd.cpp +# profile_pool3d_fwd.cpp +# profile_avg_pool3d_bwd.cpp +# profile_max_pool3d_bwd.cpp +# profile_avg_pool2d_bwd.cpp +# profile_max_pool2d_bwd.cpp +# profile_softmax.cpp +# profile_batchnorm_fwd.cpp +# profile_batchnorm_bwd.cpp +# profile_batchnorm_infer.cpp +# profile_conv_tensor_rearrange.cpp +# profile_transpose.cpp +# profile_permute_scale.cpp +# profile_gemm_quantization.cpp +# ) if(SUPPORTED_GPU_TARGETS MATCHES "gfx9") if(DTYPES MATCHES "fp32" OR DTYPES MATCHES "fp64" OR NOT DEFINED DTYPES) @@ -45,56 +45,56 @@ if(SUPPORTED_GPU_TARGETS MATCHES "gfx9") endif() endif() -if(SUPPORTED_GPU_TARGETS MATCHES "gfx9|gfx1[12]") - if(DTYPES MATCHES "fp16" OR NOT DEFINED DTYPES) - list(APPEND PROFILER_OPS profile_gemm_reduce.cpp) - list(APPEND PROFILER_OPS profile_batched_gemm_add_relu_gemm_add.cpp) - list(APPEND PROFILER_OPS profile_gemm_add.cpp) - list(APPEND PROFILER_OPS profile_grouped_gemm.cpp) - list(APPEND PROFILER_OPS profile_gemm_streamk.cpp) - list(APPEND PROFILER_OPS profile_gemm_add_relu.cpp) - list(APPEND PROFILER_OPS profile_gemm_add_relu_add_layernorm.cpp) - list(APPEND PROFILER_OPS profile_grouped_gemm_fixed_nk.cpp) - list(APPEND PROFILER_OPS profile_grouped_gemm_fastgelu.cpp) - list(APPEND PROFILER_OPS profile_grouped_gemm_tile_loop.cpp) - list(APPEND PROFILER_OPS profile_grouped_gemm_multiply_tile_loop.cpp) - endif() - if(SUPPORTED_GPU_TARGETS MATCHES "gfx9[45]|gfx12") - list(APPEND PROFILER_OPS profile_gemm_multiply_multiply_wp.cpp) - list(APPEND PROFILER_OPS profile_gemm_ab_scale.cpp) - list(APPEND PROFILER_OPS profile_gemm_blockscale_wp.cpp) - list(APPEND PROFILER_OPS profile_gemm_universal_preshuffle.cpp) - endif() - if(SUPPORTED_GPU_TARGETS MATCHES "gfx95") - list(APPEND PROFILER_OPS profile_gemm_mx.cpp) - endif() - list(APPEND PROFILER_OPS profile_batched_gemm_reduce.cpp) - list(APPEND PROFILER_OPS profile_gemm_add_multiply.cpp) - list(APPEND PROFILER_OPS profile_gemm_add.cpp) - list(APPEND PROFILER_OPS profile_gemm_bias_add_reduce.cpp) - list(APPEND PROFILER_OPS profile_gemm_splitk.cpp) - list(APPEND PROFILER_OPS profile_gemm_universal_batched.cpp) - list(APPEND PROFILER_OPS profile_gemm_universal_streamk.cpp) - list(APPEND PROFILER_OPS profile_conv_fwd_bias_relu.cpp) - list(APPEND PROFILER_OPS profile_conv_fwd_bias_relu_add.cpp) - list(APPEND PROFILER_OPS profile_conv_bwd_data.cpp) - list(APPEND PROFILER_OPS profile_conv_fwd.cpp) -endif() +# if(SUPPORTED_GPU_TARGETS MATCHES "gfx9|gfx1[12]") +# if(DTYPES MATCHES "fp16" OR NOT DEFINED DTYPES) +# list(APPEND PROFILER_OPS profile_gemm_reduce.cpp) +# list(APPEND PROFILER_OPS profile_batched_gemm_add_relu_gemm_add.cpp) +# list(APPEND PROFILER_OPS profile_gemm_add.cpp) +# list(APPEND PROFILER_OPS profile_grouped_gemm.cpp) +# list(APPEND PROFILER_OPS profile_gemm_streamk.cpp) +# list(APPEND PROFILER_OPS profile_gemm_add_relu.cpp) +# list(APPEND PROFILER_OPS profile_gemm_add_relu_add_layernorm.cpp) +# list(APPEND PROFILER_OPS profile_grouped_gemm_fixed_nk.cpp) +# list(APPEND PROFILER_OPS profile_grouped_gemm_fastgelu.cpp) +# list(APPEND PROFILER_OPS profile_grouped_gemm_tile_loop.cpp) +# list(APPEND PROFILER_OPS profile_grouped_gemm_multiply_tile_loop.cpp) +# endif() +# if(SUPPORTED_GPU_TARGETS MATCHES "gfx9[45]|gfx12") +# list(APPEND PROFILER_OPS profile_gemm_multiply_multiply_wp.cpp) +# list(APPEND PROFILER_OPS profile_gemm_ab_scale.cpp) +# list(APPEND PROFILER_OPS profile_gemm_blockscale_wp.cpp) +# list(APPEND PROFILER_OPS profile_gemm_universal_preshuffle.cpp) +# endif() +# if(SUPPORTED_GPU_TARGETS MATCHES "gfx95") +# list(APPEND PROFILER_OPS profile_gemm_mx.cpp) +# endif() +# list(APPEND PROFILER_OPS profile_batched_gemm_reduce.cpp) +# list(APPEND PROFILER_OPS profile_gemm_add_multiply.cpp) +# list(APPEND PROFILER_OPS profile_gemm_add.cpp) +# list(APPEND PROFILER_OPS profile_gemm_bias_add_reduce.cpp) +# list(APPEND PROFILER_OPS profile_gemm_splitk.cpp) +# list(APPEND PROFILER_OPS profile_gemm_universal_batched.cpp) +# list(APPEND PROFILER_OPS profile_gemm_universal_streamk.cpp) +# list(APPEND PROFILER_OPS profile_conv_fwd_bias_relu.cpp) +# list(APPEND PROFILER_OPS profile_conv_fwd_bias_relu_add.cpp) +# list(APPEND PROFILER_OPS profile_conv_bwd_data.cpp) +# list(APPEND PROFILER_OPS profile_conv_fwd.cpp) +# endif() -if((SUPPORTED_GPU_TARGETS MATCHES "gfx9" AND (DTYPES MATCHES "fp16" OR NOT DEFINED DTYPES)) OR - (SUPPORTED_GPU_TARGETS MATCHES "gfx1[12]")) - list(APPEND PROFILER_OPS profile_gemm_bilinear.cpp) -endif() -if(SUPPORTED_GPU_TARGETS MATCHES "gfx(9[45]|1[12])") - list(APPEND PROFILER_OPS profile_gemm_multiply_multiply.cpp) -endif() +# if((SUPPORTED_GPU_TARGETS MATCHES "gfx9" AND (DTYPES MATCHES "fp16" OR NOT DEFINED DTYPES)) OR +# (SUPPORTED_GPU_TARGETS MATCHES "gfx1[12]")) +# list(APPEND PROFILER_OPS profile_gemm_bilinear.cpp) +# endif() +# if(SUPPORTED_GPU_TARGETS MATCHES "gfx(9[45]|1[12])") +# list(APPEND PROFILER_OPS profile_gemm_multiply_multiply.cpp) +# endif() if(SUPPORTED_GPU_TARGETS MATCHES "gfx9|gfx1[12]") - list(APPEND PROFILER_OPS profile_gemm_universal.cpp) - list(APPEND PROFILER_OPS profile_batched_gemm.cpp) - list(APPEND PROFILER_OPS profile_batched_gemm_b_scale.cpp) - list(APPEND PROFILER_OPS profile_gemm_b_scale.cpp) - list(APPEND PROFILER_OPS profile_gemm_universal_reduce.cpp) + # list(APPEND PROFILER_OPS profile_gemm_universal.cpp) + # list(APPEND PROFILER_OPS profile_batched_gemm.cpp) + # list(APPEND PROFILER_OPS profile_batched_gemm_b_scale.cpp) + # list(APPEND PROFILER_OPS profile_gemm_b_scale.cpp) + # list(APPEND PROFILER_OPS profile_gemm_universal_reduce.cpp) list(APPEND PROFILER_OPS profile_grouped_conv_fwd.cpp) list(APPEND PROFILER_OPS profile_grouped_conv_fwd_bias_clamp.cpp) list(APPEND PROFILER_OPS profile_grouped_conv_fwd_clamp.cpp) @@ -102,27 +102,27 @@ if(SUPPORTED_GPU_TARGETS MATCHES "gfx9|gfx1[12]") list(APPEND PROFILER_OPS profile_grouped_conv_fwd_bilinear.cpp) list(APPEND PROFILER_OPS profile_grouped_conv_bwd_weight.cpp) list(APPEND PROFILER_OPS profile_grouped_conv_fwd_outelementop.cpp) - list(APPEND PROFILER_OPS profile_gemm_multi_abd.cpp) - if(DTYPES MATCHES "fp16" OR NOT DEFINED DTYPES) - list(APPEND PROFILER_OPS profile_gemm_add_multiply.cpp) - list(APPEND PROFILER_OPS profile_gemm_multiply_add.cpp) - list(APPEND PROFILER_OPS profile_gemm_add_silu.cpp) - list(APPEND PROFILER_OPS profile_gemm_fastgelu.cpp) - list(APPEND PROFILER_OPS profile_gemm_add_fastgelu.cpp) - list(APPEND PROFILER_OPS profile_gemm_add_add_fastgelu.cpp) - list(APPEND PROFILER_SOURCES profile_gemm_add.cpp) - endif() - list(APPEND PROFILER_OPS profile_batched_gemm_gemm.cpp) + # list(APPEND PROFILER_OPS profile_gemm_multi_abd.cpp) + # if(DTYPES MATCHES "fp16" OR NOT DEFINED DTYPES) + # list(APPEND PROFILER_OPS profile_gemm_add_multiply.cpp) + # list(APPEND PROFILER_OPS profile_gemm_multiply_add.cpp) + # list(APPEND PROFILER_OPS profile_gemm_add_silu.cpp) + # list(APPEND PROFILER_OPS profile_gemm_fastgelu.cpp) + # list(APPEND PROFILER_OPS profile_gemm_add_fastgelu.cpp) + # list(APPEND PROFILER_OPS profile_gemm_add_add_fastgelu.cpp) + # list(APPEND PROFILER_SOURCES profile_gemm_add.cpp) + # endif() + # list(APPEND PROFILER_OPS profile_batched_gemm_gemm.cpp) endif() if(DL_KERNELS) - list(APPEND PROFILER_OPS profile_batched_gemm_multi_d.cpp) + # list(APPEND PROFILER_OPS profile_batched_gemm_multi_d.cpp) list(APPEND PROFILER_OPS profile_grouped_conv_bwd_weight.cpp) endif() -if(CK_ENABLE_INT8) - list(APPEND PROFILER_OPS profile_gemm_quantization.cpp) -endif() +# if(CK_ENABLE_INT8) +# list(APPEND PROFILER_OPS profile_gemm_quantization.cpp) +# endif() set(PROFILER_SOURCES profiler.cpp) foreach(SOURCE ${PROFILER_OPS}) @@ -148,61 +148,61 @@ endif() set(DEVICE_INSTANCES "") -list(APPEND DEVICE_INSTANCES device_gemm_instance) -list(APPEND DEVICE_INSTANCES device_normalization_fwd_instance) -list(APPEND DEVICE_INSTANCES device_normalization_bwd_data_instance) -list(APPEND DEVICE_INSTANCES device_normalization_bwd_gamma_beta_instance) -list(APPEND DEVICE_INSTANCES device_softmax_instance) -list(APPEND DEVICE_INSTANCES device_reduce_instance) -list(APPEND DEVICE_INSTANCES device_batchnorm_instance) -list(APPEND DEVICE_INSTANCES device_pool2d_fwd_instance) -list(APPEND DEVICE_INSTANCES device_pool3d_fwd_instance) -list(APPEND DEVICE_INSTANCES device_avg_pool2d_bwd_instance) -list(APPEND DEVICE_INSTANCES device_avg_pool3d_bwd_instance) -list(APPEND DEVICE_INSTANCES device_max_pool_bwd_instance) -list(APPEND DEVICE_INSTANCES device_image_to_column_instance) -list(APPEND DEVICE_INSTANCES device_column_to_image_instance) -list(APPEND DEVICE_INSTANCES device_transpose_instance) -list(APPEND DEVICE_INSTANCES device_permute_scale_instance) +# list(APPEND DEVICE_INSTANCES device_gemm_instance) +# list(APPEND DEVICE_INSTANCES device_normalization_fwd_instance) +# list(APPEND DEVICE_INSTANCES device_normalization_bwd_data_instance) +# list(APPEND DEVICE_INSTANCES device_normalization_bwd_gamma_beta_instance) +# list(APPEND DEVICE_INSTANCES device_softmax_instance) +# list(APPEND DEVICE_INSTANCES device_reduce_instance) +# list(APPEND DEVICE_INSTANCES device_batchnorm_instance) +# list(APPEND DEVICE_INSTANCES device_pool2d_fwd_instance) +# list(APPEND DEVICE_INSTANCES device_pool3d_fwd_instance) +# list(APPEND DEVICE_INSTANCES device_avg_pool2d_bwd_instance) +# list(APPEND DEVICE_INSTANCES device_avg_pool3d_bwd_instance) +# list(APPEND DEVICE_INSTANCES device_max_pool_bwd_instance) +# list(APPEND DEVICE_INSTANCES device_image_to_column_instance) +# list(APPEND DEVICE_INSTANCES device_column_to_image_instance) +# list(APPEND DEVICE_INSTANCES device_transpose_instance) +# list(APPEND DEVICE_INSTANCES device_permute_scale_instance) if(SUPPORTED_GPU_TARGETS MATCHES "gfx9|gfx1[12]") - if(DTYPES MATCHES "fp32" OR DTYPES MATCHES "fp64" OR NOT DEFINED DTYPES) - list(APPEND DEVICE_INSTANCES device_contraction_bilinear_instance) - list(APPEND DEVICE_INSTANCES device_contraction_scale_instance) - endif() - if(DTYPES MATCHES "fp16" OR NOT DEFINED DTYPES) - list(APPEND DEVICE_INSTANCES device_gemm_add_instance) - list(APPEND DEVICE_INSTANCES device_batched_gemm_gemm_instance) - list(APPEND DEVICE_INSTANCES device_gemm_add_add_fastgelu_instance) - list(APPEND DEVICE_INSTANCES device_gemm_fastgelu_instance) - list(APPEND DEVICE_INSTANCES device_batched_gemm_add_relu_gemm_add_instance) - list(APPEND DEVICE_INSTANCES device_grouped_gemm_instance) - list(APPEND DEVICE_INSTANCES device_gemm_streamk_instance) - list(APPEND DEVICE_INSTANCES device_gemm_add_relu_instance) - list(APPEND DEVICE_INSTANCES device_gemm_add_relu_add_layernorm_instance) - list(APPEND DEVICE_INSTANCES device_grouped_gemm_fixed_nk_instance) - list(APPEND DEVICE_INSTANCES device_grouped_gemm_fastgelu_instance) - list(APPEND DEVICE_INSTANCES device_grouped_gemm_tile_loop_instance) - endif() - list(APPEND DEVICE_INSTANCES device_batched_gemm_reduce_instance) - if(SUPPORTED_GPU_TARGETS MATCHES "gfx9[45]|gfx12") - list(APPEND DEVICE_INSTANCES device_gemm_multiply_multiply_wp_instance) - list(APPEND DEVICE_INSTANCES device_gemm_universal_preshuffle_instance) - endif() - if(SUPPORTED_GPU_TARGETS MATCHES "gfx9[45]|gfx1[12]") - list(APPEND DEVICE_INSTANCES device_gemm_ab_scale_instance) - list(APPEND DEVICE_INSTANCES device_gemm_blockscale_wp_instance) - endif() - if(SUPPORTED_GPU_TARGETS MATCHES "gfx95") - list(APPEND DEVICE_INSTANCES device_gemm_mx_instance) - endif() - list(APPEND DEVICE_INSTANCES device_gemm_splitk_instance) - list(APPEND DEVICE_INSTANCES device_gemm_universal_batched_instance) - list(APPEND DEVICE_INSTANCES device_gemm_universal_streamk_instance) - list(APPEND DEVICE_INSTANCES device_gemm_add_multiply_instance) - list(APPEND DEVICE_INSTANCES device_gemm_add_instance) - list(APPEND DEVICE_INSTANCES device_gemm_reduce_instance) - list(APPEND DEVICE_INSTANCES device_gemm_bias_add_reduce_instance) + # if(DTYPES MATCHES "fp32" OR DTYPES MATCHES "fp64" OR NOT DEFINED DTYPES) + # list(APPEND DEVICE_INSTANCES device_contraction_bilinear_instance) + # list(APPEND DEVICE_INSTANCES device_contraction_scale_instance) + # endif() + # if(DTYPES MATCHES "fp16" OR NOT DEFINED DTYPES) + # list(APPEND DEVICE_INSTANCES device_gemm_add_instance) + # list(APPEND DEVICE_INSTANCES device_batched_gemm_gemm_instance) + # list(APPEND DEVICE_INSTANCES device_gemm_add_add_fastgelu_instance) + # list(APPEND DEVICE_INSTANCES device_gemm_fastgelu_instance) + # list(APPEND DEVICE_INSTANCES device_batched_gemm_add_relu_gemm_add_instance) + # list(APPEND DEVICE_INSTANCES device_grouped_gemm_instance) + # list(APPEND DEVICE_INSTANCES device_gemm_streamk_instance) + # list(APPEND DEVICE_INSTANCES device_gemm_add_relu_instance) + # list(APPEND DEVICE_INSTANCES device_gemm_add_relu_add_layernorm_instance) + # list(APPEND DEVICE_INSTANCES device_grouped_gemm_fixed_nk_instance) + # list(APPEND DEVICE_INSTANCES device_grouped_gemm_fastgelu_instance) + # list(APPEND DEVICE_INSTANCES device_grouped_gemm_tile_loop_instance) + # endif() + # list(APPEND DEVICE_INSTANCES device_batched_gemm_reduce_instance) + # if(SUPPORTED_GPU_TARGETS MATCHES "gfx9[45]|gfx12") + # list(APPEND DEVICE_INSTANCES device_gemm_multiply_multiply_wp_instance) + # list(APPEND DEVICE_INSTANCES device_gemm_universal_preshuffle_instance) + # endif() + # if(SUPPORTED_GPU_TARGETS MATCHES "gfx9[45]|gfx1[12]") + # list(APPEND DEVICE_INSTANCES device_gemm_ab_scale_instance) + # list(APPEND DEVICE_INSTANCES device_gemm_blockscale_wp_instance) + # endif() + # if(SUPPORTED_GPU_TARGETS MATCHES "gfx95") + # list(APPEND DEVICE_INSTANCES device_gemm_mx_instance) + # endif() + # list(APPEND DEVICE_INSTANCES device_gemm_splitk_instance) + # list(APPEND DEVICE_INSTANCES device_gemm_universal_batched_instance) + # list(APPEND DEVICE_INSTANCES device_gemm_universal_streamk_instance) + # list(APPEND DEVICE_INSTANCES device_gemm_add_multiply_instance) + # list(APPEND DEVICE_INSTANCES device_gemm_add_instance) + # list(APPEND DEVICE_INSTANCES device_gemm_reduce_instance) + # list(APPEND DEVICE_INSTANCES device_gemm_bias_add_reduce_instance) list(APPEND DEVICE_INSTANCES device_conv2d_fwd_instance) list(APPEND DEVICE_INSTANCES device_conv2d_fwd_bias_relu_instance) list(APPEND DEVICE_INSTANCES device_conv2d_fwd_bias_relu_add_instance) @@ -213,20 +213,20 @@ if(SUPPORTED_GPU_TARGETS MATCHES "gfx9|gfx1[12]") list(APPEND DEVICE_INSTANCES device_grouped_conv3d_fwd_convinvscale_instance) endif() -if((SUPPORTED_GPU_TARGETS MATCHES "gfx9" AND (DTYPES MATCHES "fp16" OR NOT DEFINED DTYPES)) OR - (SUPPORTED_GPU_TARGETS MATCHES "gfx1[12]" )) - list(APPEND DEVICE_INSTANCES device_gemm_bilinear_instance) -endif() -if(SUPPORTED_GPU_TARGETS MATCHES "gfx(9[45]|1[12])") - list(APPEND DEVICE_INSTANCES device_gemm_multiply_multiply_instance) -endif() +# if((SUPPORTED_GPU_TARGETS MATCHES "gfx9" AND (DTYPES MATCHES "fp16" OR NOT DEFINED DTYPES)) OR +# (SUPPORTED_GPU_TARGETS MATCHES "gfx1[12]" )) +# list(APPEND DEVICE_INSTANCES device_gemm_bilinear_instance) +# endif() +# if(SUPPORTED_GPU_TARGETS MATCHES "gfx(9[45]|1[12])") +# list(APPEND DEVICE_INSTANCES device_gemm_multiply_multiply_instance) +# endif() if(SUPPORTED_GPU_TARGETS MATCHES "gfx9|gfx1[12]") - list(APPEND DEVICE_INSTANCES device_gemm_universal_instance) - list(APPEND DEVICE_INSTANCES device_batched_gemm_instance) - list(APPEND DEVICE_INSTANCES device_gemm_b_scale_instance) - list(APPEND DEVICE_INSTANCES device_gemm_universal_reduce_instance) - list(APPEND DEVICE_INSTANCES device_batched_gemm_b_scale_instance) + # list(APPEND DEVICE_INSTANCES device_gemm_universal_instance) + # list(APPEND DEVICE_INSTANCES device_batched_gemm_instance) + # list(APPEND DEVICE_INSTANCES device_gemm_b_scale_instance) + # list(APPEND DEVICE_INSTANCES device_gemm_universal_reduce_instance) + # list(APPEND DEVICE_INSTANCES device_batched_gemm_b_scale_instance) list(APPEND DEVICE_INSTANCES device_grouped_conv2d_bwd_data_instance) list(APPEND DEVICE_INSTANCES device_grouped_conv3d_bwd_data_instance) list(APPEND DEVICE_INSTANCES device_grouped_conv1d_fwd_instance) @@ -238,18 +238,18 @@ if(SUPPORTED_GPU_TARGETS MATCHES "gfx9|gfx1[12]") list(APPEND DEVICE_INSTANCES device_grouped_conv2d_fwd_bias_clamp_instance) list(APPEND DEVICE_INSTANCES device_grouped_conv3d_fwd_bias_clamp_instance) list(APPEND DEVICE_INSTANCES device_grouped_conv3d_fwd_bilinear_instance) - list(APPEND DEVICE_INSTANCES device_gemm_add_relu_instance) - list(APPEND DEVICE_INSTANCES device_gemm_multi_abd_instance) - if(DTYPES MATCHES "fp16" OR NOT DEFINED DTYPES) - list(APPEND DEVICE_INSTANCES device_gemm_add_multiply_instance) - list(APPEND DEVICE_INSTANCES device_gemm_multiply_add_instance) - list(APPEND DEVICE_INSTANCES device_gemm_add_instance) - list(APPEND DEVICE_INSTANCES device_gemm_add_silu_instance) - list(APPEND DEVICE_INSTANCES device_gemm_fastgelu_instance) - list(APPEND DEVICE_INSTANCES device_gemm_add_fastgelu_instance) - list(APPEND DEVICE_INSTANCES device_gemm_add_add_fastgelu_instance) - endif() - list(APPEND DEVICE_INSTANCES device_batched_gemm_gemm_instance) + # list(APPEND DEVICE_INSTANCES device_gemm_add_relu_instance) + # list(APPEND DEVICE_INSTANCES device_gemm_multi_abd_instance) + # if(DTYPES MATCHES "fp16" OR NOT DEFINED DTYPES) + # list(APPEND DEVICE_INSTANCES device_gemm_add_multiply_instance) + # list(APPEND DEVICE_INSTANCES device_gemm_multiply_add_instance) + # list(APPEND DEVICE_INSTANCES device_gemm_add_instance) + # list(APPEND DEVICE_INSTANCES device_gemm_add_silu_instance) + # list(APPEND DEVICE_INSTANCES device_gemm_fastgelu_instance) + # list(APPEND DEVICE_INSTANCES device_gemm_add_fastgelu_instance) + # list(APPEND DEVICE_INSTANCES device_gemm_add_add_fastgelu_instance) + # endif() + # list(APPEND DEVICE_INSTANCES device_batched_gemm_gemm_instance) list(APPEND DEVICE_INSTANCES device_grouped_conv1d_bwd_weight_instance) list(APPEND DEVICE_INSTANCES device_grouped_conv2d_bwd_weight_instance) list(APPEND DEVICE_INSTANCES device_grouped_convnd_bwd_weight_instance) @@ -257,15 +257,15 @@ if(SUPPORTED_GPU_TARGETS MATCHES "gfx9|gfx1[12]") endif() if(DL_KERNELS) - list(APPEND DEVICE_INSTANCES device_batched_gemm_multi_d_instance) + # list(APPEND DEVICE_INSTANCES device_batched_gemm_multi_d_instance) list(APPEND DEVICE_INSTANCES device_grouped_conv1d_bwd_weight_instance) list(APPEND DEVICE_INSTANCES device_grouped_conv2d_bwd_weight_instance) list(APPEND DEVICE_INSTANCES device_grouped_conv3d_bwd_weight_instance) endif() -if(CK_ENABLE_INT8) - list(APPEND DEVICE_INSTANCES device_quantization_instance) -endif() +# if(CK_ENABLE_INT8) +# list(APPEND DEVICE_INSTANCES device_quantization_instance) +# endif() set(PROFILER_LIBS utility getopt::getopt) foreach(LIB ${DEVICE_INSTANCES})