Build only the CK conv profilers (comment out the other profiler sources and device instance libraries).

2026-06-30 03:37:38 +00:00 · 2026-01-19 06:09:29 -05:00
parent 1a6d1b59ef
commit 2625758522
1 changed files with 167 additions and 167 deletions
--- a/profiler/src/CMakeLists.txt
+++ b/profiler/src/CMakeLists.txt
@@ -13,30 +13,30 @@ endif()
 message(STATUS "CK_PROFILER_OP_FILTER: ${CK_PROFILER_OP_FILTER}")
 message(STATUS "CK_PROFILER_INSTANCE_FILTER: ${CK_PROFILER_INSTANCE_FILTER}")

-set(PROFILER_OPS
-    profile_gemm.cpp
-    profile_reduce.cpp
-    profile_groupnorm_bwd_data.cpp
-    profile_groupnorm_fwd.cpp
-    profile_layernorm_bwd_data.cpp
-    profile_layernorm_bwd_gamma_beta.cpp
-    profile_groupnorm_bwd_gamma_beta.cpp
-    profile_layernorm_fwd.cpp
-    profile_max_pool2d_fwd.cpp
-    profile_pool3d_fwd.cpp
-    profile_avg_pool3d_bwd.cpp
-    profile_max_pool3d_bwd.cpp
-    profile_avg_pool2d_bwd.cpp
-    profile_max_pool2d_bwd.cpp
-    profile_softmax.cpp
-    profile_batchnorm_fwd.cpp
-    profile_batchnorm_bwd.cpp
-    profile_batchnorm_infer.cpp
-    profile_conv_tensor_rearrange.cpp
-    profile_transpose.cpp
-    profile_permute_scale.cpp
-    profile_gemm_quantization.cpp
-)
+# set(PROFILER_OPS
+#     profile_gemm.cpp
+#     profile_reduce.cpp
+#     profile_groupnorm_bwd_data.cpp
+#     profile_groupnorm_fwd.cpp
+#     profile_layernorm_bwd_data.cpp
+#     profile_layernorm_bwd_gamma_beta.cpp
+#     profile_groupnorm_bwd_gamma_beta.cpp
+#     profile_layernorm_fwd.cpp
+#     profile_max_pool2d_fwd.cpp
+#     profile_pool3d_fwd.cpp
+#     profile_avg_pool3d_bwd.cpp
+#     profile_max_pool3d_bwd.cpp
+#     profile_avg_pool2d_bwd.cpp
+#     profile_max_pool2d_bwd.cpp
+#     profile_softmax.cpp
+#     profile_batchnorm_fwd.cpp
+#     profile_batchnorm_bwd.cpp
+#     profile_batchnorm_infer.cpp
+#     profile_conv_tensor_rearrange.cpp
+#     profile_transpose.cpp
+#     profile_permute_scale.cpp
+#     profile_gemm_quantization.cpp
+# )

 if(SUPPORTED_GPU_TARGETS MATCHES "gfx9")
  if(DTYPES MATCHES "fp32" OR DTYPES MATCHES "fp64" OR NOT DEFINED DTYPES)
@@ -45,56 +45,56 @@ if(SUPPORTED_GPU_TARGETS MATCHES "gfx9")
  endif()
 endif()

-if(SUPPORTED_GPU_TARGETS MATCHES "gfx9|gfx1[12]")  
-  if(DTYPES MATCHES "fp16" OR NOT DEFINED DTYPES)
-    list(APPEND PROFILER_OPS profile_gemm_reduce.cpp)
-    list(APPEND PROFILER_OPS profile_batched_gemm_add_relu_gemm_add.cpp)
-    list(APPEND PROFILER_OPS profile_gemm_add.cpp)
-    list(APPEND PROFILER_OPS profile_grouped_gemm.cpp)
-    list(APPEND PROFILER_OPS profile_gemm_streamk.cpp)
-    list(APPEND PROFILER_OPS profile_gemm_add_relu.cpp)
-    list(APPEND PROFILER_OPS profile_gemm_add_relu_add_layernorm.cpp)
-    list(APPEND PROFILER_OPS profile_grouped_gemm_fixed_nk.cpp)
-    list(APPEND PROFILER_OPS profile_grouped_gemm_fastgelu.cpp)
-    list(APPEND PROFILER_OPS profile_grouped_gemm_tile_loop.cpp)
-    list(APPEND PROFILER_OPS profile_grouped_gemm_multiply_tile_loop.cpp)
-  endif()
-  if(SUPPORTED_GPU_TARGETS MATCHES "gfx9[45]|gfx12")
-    list(APPEND PROFILER_OPS profile_gemm_multiply_multiply_wp.cpp)
-    list(APPEND PROFILER_OPS profile_gemm_ab_scale.cpp)
-    list(APPEND PROFILER_OPS profile_gemm_blockscale_wp.cpp)
-    list(APPEND PROFILER_OPS profile_gemm_universal_preshuffle.cpp)
-  endif()
-  if(SUPPORTED_GPU_TARGETS MATCHES "gfx95")
-    list(APPEND PROFILER_OPS profile_gemm_mx.cpp)
-  endif()
-  list(APPEND PROFILER_OPS profile_batched_gemm_reduce.cpp)
-  list(APPEND PROFILER_OPS profile_gemm_add_multiply.cpp)
-  list(APPEND PROFILER_OPS profile_gemm_add.cpp)
-  list(APPEND PROFILER_OPS profile_gemm_bias_add_reduce.cpp)
-  list(APPEND PROFILER_OPS profile_gemm_splitk.cpp)
-  list(APPEND PROFILER_OPS profile_gemm_universal_batched.cpp)
-  list(APPEND PROFILER_OPS profile_gemm_universal_streamk.cpp)
-  list(APPEND PROFILER_OPS profile_conv_fwd_bias_relu.cpp)
-  list(APPEND PROFILER_OPS profile_conv_fwd_bias_relu_add.cpp)
-  list(APPEND PROFILER_OPS profile_conv_bwd_data.cpp)
-  list(APPEND PROFILER_OPS profile_conv_fwd.cpp)
-endif()
+# if(SUPPORTED_GPU_TARGETS MATCHES "gfx9|gfx1[12]")  
+#   if(DTYPES MATCHES "fp16" OR NOT DEFINED DTYPES)
+#     list(APPEND PROFILER_OPS profile_gemm_reduce.cpp)
+#     list(APPEND PROFILER_OPS profile_batched_gemm_add_relu_gemm_add.cpp)
+#     list(APPEND PROFILER_OPS profile_gemm_add.cpp)
+#     list(APPEND PROFILER_OPS profile_grouped_gemm.cpp)
+#     list(APPEND PROFILER_OPS profile_gemm_streamk.cpp)
+#     list(APPEND PROFILER_OPS profile_gemm_add_relu.cpp)
+#     list(APPEND PROFILER_OPS profile_gemm_add_relu_add_layernorm.cpp)
+#     list(APPEND PROFILER_OPS profile_grouped_gemm_fixed_nk.cpp)
+#     list(APPEND PROFILER_OPS profile_grouped_gemm_fastgelu.cpp)
+#     list(APPEND PROFILER_OPS profile_grouped_gemm_tile_loop.cpp)
+#     list(APPEND PROFILER_OPS profile_grouped_gemm_multiply_tile_loop.cpp)
+#   endif()
+#   if(SUPPORTED_GPU_TARGETS MATCHES "gfx9[45]|gfx12")
+#     list(APPEND PROFILER_OPS profile_gemm_multiply_multiply_wp.cpp)
+#     list(APPEND PROFILER_OPS profile_gemm_ab_scale.cpp)
+#     list(APPEND PROFILER_OPS profile_gemm_blockscale_wp.cpp)
+#     list(APPEND PROFILER_OPS profile_gemm_universal_preshuffle.cpp)
+#   endif()
+#   if(SUPPORTED_GPU_TARGETS MATCHES "gfx95")
+#     list(APPEND PROFILER_OPS profile_gemm_mx.cpp)
+#   endif()
+#   list(APPEND PROFILER_OPS profile_batched_gemm_reduce.cpp)
+#   list(APPEND PROFILER_OPS profile_gemm_add_multiply.cpp)
+#   list(APPEND PROFILER_OPS profile_gemm_add.cpp)
+#   list(APPEND PROFILER_OPS profile_gemm_bias_add_reduce.cpp)
+#   list(APPEND PROFILER_OPS profile_gemm_splitk.cpp)
+#   list(APPEND PROFILER_OPS profile_gemm_universal_batched.cpp)
+#   list(APPEND PROFILER_OPS profile_gemm_universal_streamk.cpp)
+#   list(APPEND PROFILER_OPS profile_conv_fwd_bias_relu.cpp)
+#   list(APPEND PROFILER_OPS profile_conv_fwd_bias_relu_add.cpp)
+#   list(APPEND PROFILER_OPS profile_conv_bwd_data.cpp)
+#   list(APPEND PROFILER_OPS profile_conv_fwd.cpp)
+# endif()

-if((SUPPORTED_GPU_TARGETS MATCHES "gfx9" AND (DTYPES MATCHES "fp16" OR NOT DEFINED DTYPES)) OR
-   (SUPPORTED_GPU_TARGETS MATCHES "gfx1[12]"))
-  list(APPEND PROFILER_OPS profile_gemm_bilinear.cpp)
-endif()
-if(SUPPORTED_GPU_TARGETS MATCHES "gfx(9[45]|1[12])")
-  list(APPEND PROFILER_OPS profile_gemm_multiply_multiply.cpp)
-endif()
+# if((SUPPORTED_GPU_TARGETS MATCHES "gfx9" AND (DTYPES MATCHES "fp16" OR NOT DEFINED DTYPES)) OR
+#    (SUPPORTED_GPU_TARGETS MATCHES "gfx1[12]"))
+#   list(APPEND PROFILER_OPS profile_gemm_bilinear.cpp)
+# endif()
+# if(SUPPORTED_GPU_TARGETS MATCHES "gfx(9[45]|1[12])")
+#   list(APPEND PROFILER_OPS profile_gemm_multiply_multiply.cpp)
+# endif()

 if(SUPPORTED_GPU_TARGETS MATCHES "gfx9|gfx1[12]")
-  list(APPEND PROFILER_OPS profile_gemm_universal.cpp)
-  list(APPEND PROFILER_OPS profile_batched_gemm.cpp)
-  list(APPEND PROFILER_OPS profile_batched_gemm_b_scale.cpp)
-  list(APPEND PROFILER_OPS profile_gemm_b_scale.cpp)
-  list(APPEND PROFILER_OPS profile_gemm_universal_reduce.cpp)
+  # list(APPEND PROFILER_OPS profile_gemm_universal.cpp)
+  # list(APPEND PROFILER_OPS profile_batched_gemm.cpp)
+  # list(APPEND PROFILER_OPS profile_batched_gemm_b_scale.cpp)
+  # list(APPEND PROFILER_OPS profile_gemm_b_scale.cpp)
+  # list(APPEND PROFILER_OPS profile_gemm_universal_reduce.cpp)
  list(APPEND PROFILER_OPS profile_grouped_conv_fwd.cpp)
  list(APPEND PROFILER_OPS profile_grouped_conv_fwd_bias_clamp.cpp)
  list(APPEND PROFILER_OPS profile_grouped_conv_fwd_clamp.cpp)
@@ -102,27 +102,27 @@ if(SUPPORTED_GPU_TARGETS MATCHES "gfx9|gfx1[12]")
  list(APPEND PROFILER_OPS profile_grouped_conv_fwd_bilinear.cpp)
  list(APPEND PROFILER_OPS profile_grouped_conv_bwd_weight.cpp)
  list(APPEND PROFILER_OPS profile_grouped_conv_fwd_outelementop.cpp)
-  list(APPEND PROFILER_OPS profile_gemm_multi_abd.cpp)
-  if(DTYPES MATCHES "fp16" OR NOT DEFINED DTYPES)
-    list(APPEND PROFILER_OPS profile_gemm_add_multiply.cpp)
-    list(APPEND PROFILER_OPS profile_gemm_multiply_add.cpp)
-    list(APPEND PROFILER_OPS profile_gemm_add_silu.cpp)
-    list(APPEND PROFILER_OPS profile_gemm_fastgelu.cpp)
-    list(APPEND PROFILER_OPS profile_gemm_add_fastgelu.cpp)
-    list(APPEND PROFILER_OPS profile_gemm_add_add_fastgelu.cpp)
-    list(APPEND PROFILER_SOURCES profile_gemm_add.cpp)
-  endif()
-  list(APPEND PROFILER_OPS profile_batched_gemm_gemm.cpp)
+  # list(APPEND PROFILER_OPS profile_gemm_multi_abd.cpp)
+  # if(DTYPES MATCHES "fp16" OR NOT DEFINED DTYPES)
+  #   list(APPEND PROFILER_OPS profile_gemm_add_multiply.cpp)
+  #   list(APPEND PROFILER_OPS profile_gemm_multiply_add.cpp)
+  #   list(APPEND PROFILER_OPS profile_gemm_add_silu.cpp)
+  #   list(APPEND PROFILER_OPS profile_gemm_fastgelu.cpp)
+  #   list(APPEND PROFILER_OPS profile_gemm_add_fastgelu.cpp)
+  #   list(APPEND PROFILER_OPS profile_gemm_add_add_fastgelu.cpp)
+  #   list(APPEND PROFILER_SOURCES profile_gemm_add.cpp)
+  # endif()
+  # list(APPEND PROFILER_OPS profile_batched_gemm_gemm.cpp)
 endif()

 if(DL_KERNELS)
-  list(APPEND PROFILER_OPS profile_batched_gemm_multi_d.cpp)
+  # list(APPEND PROFILER_OPS profile_batched_gemm_multi_d.cpp)
  list(APPEND PROFILER_OPS profile_grouped_conv_bwd_weight.cpp)
 endif()

-if(CK_ENABLE_INT8)
-  list(APPEND PROFILER_OPS profile_gemm_quantization.cpp)
-endif()
+# if(CK_ENABLE_INT8)
+#   list(APPEND PROFILER_OPS profile_gemm_quantization.cpp)
+# endif()

 set(PROFILER_SOURCES profiler.cpp)
 foreach(SOURCE ${PROFILER_OPS})
@@ -148,61 +148,61 @@ endif()


 set(DEVICE_INSTANCES "")
-list(APPEND DEVICE_INSTANCES device_gemm_instance)
-list(APPEND DEVICE_INSTANCES device_normalization_fwd_instance)
-list(APPEND DEVICE_INSTANCES device_normalization_bwd_data_instance)
-list(APPEND DEVICE_INSTANCES device_normalization_bwd_gamma_beta_instance)
-list(APPEND DEVICE_INSTANCES device_softmax_instance)
-list(APPEND DEVICE_INSTANCES device_reduce_instance)
-list(APPEND DEVICE_INSTANCES device_batchnorm_instance)
-list(APPEND DEVICE_INSTANCES device_pool2d_fwd_instance)
-list(APPEND DEVICE_INSTANCES device_pool3d_fwd_instance)
-list(APPEND DEVICE_INSTANCES device_avg_pool2d_bwd_instance)
-list(APPEND DEVICE_INSTANCES device_avg_pool3d_bwd_instance)
-list(APPEND DEVICE_INSTANCES device_max_pool_bwd_instance)
-list(APPEND DEVICE_INSTANCES device_image_to_column_instance)
-list(APPEND DEVICE_INSTANCES device_column_to_image_instance)
-list(APPEND DEVICE_INSTANCES device_transpose_instance)
-list(APPEND DEVICE_INSTANCES device_permute_scale_instance)
+# list(APPEND DEVICE_INSTANCES device_gemm_instance)
+# list(APPEND DEVICE_INSTANCES device_normalization_fwd_instance)
+# list(APPEND DEVICE_INSTANCES device_normalization_bwd_data_instance)
+# list(APPEND DEVICE_INSTANCES device_normalization_bwd_gamma_beta_instance)
+# list(APPEND DEVICE_INSTANCES device_softmax_instance)
+# list(APPEND DEVICE_INSTANCES device_reduce_instance)
+# list(APPEND DEVICE_INSTANCES device_batchnorm_instance)
+# list(APPEND DEVICE_INSTANCES device_pool2d_fwd_instance)
+# list(APPEND DEVICE_INSTANCES device_pool3d_fwd_instance)
+# list(APPEND DEVICE_INSTANCES device_avg_pool2d_bwd_instance)
+# list(APPEND DEVICE_INSTANCES device_avg_pool3d_bwd_instance)
+# list(APPEND DEVICE_INSTANCES device_max_pool_bwd_instance)
+# list(APPEND DEVICE_INSTANCES device_image_to_column_instance)
+# list(APPEND DEVICE_INSTANCES device_column_to_image_instance)
+# list(APPEND DEVICE_INSTANCES device_transpose_instance)
+# list(APPEND DEVICE_INSTANCES device_permute_scale_instance)

 if(SUPPORTED_GPU_TARGETS MATCHES "gfx9|gfx1[12]")
-  if(DTYPES MATCHES "fp32" OR DTYPES MATCHES "fp64" OR NOT DEFINED DTYPES)
-    list(APPEND DEVICE_INSTANCES device_contraction_bilinear_instance)
-    list(APPEND DEVICE_INSTANCES device_contraction_scale_instance)
-  endif()
-  if(DTYPES MATCHES "fp16" OR NOT DEFINED DTYPES)
-    list(APPEND DEVICE_INSTANCES device_gemm_add_instance)
-    list(APPEND DEVICE_INSTANCES device_batched_gemm_gemm_instance)
-    list(APPEND DEVICE_INSTANCES device_gemm_add_add_fastgelu_instance)
-    list(APPEND DEVICE_INSTANCES device_gemm_fastgelu_instance)
-    list(APPEND DEVICE_INSTANCES device_batched_gemm_add_relu_gemm_add_instance)
-    list(APPEND DEVICE_INSTANCES device_grouped_gemm_instance)
-    list(APPEND DEVICE_INSTANCES device_gemm_streamk_instance)
-    list(APPEND DEVICE_INSTANCES device_gemm_add_relu_instance)
-    list(APPEND DEVICE_INSTANCES device_gemm_add_relu_add_layernorm_instance)
-    list(APPEND DEVICE_INSTANCES device_grouped_gemm_fixed_nk_instance)
-    list(APPEND DEVICE_INSTANCES device_grouped_gemm_fastgelu_instance)
-    list(APPEND DEVICE_INSTANCES device_grouped_gemm_tile_loop_instance)
-  endif()
-  list(APPEND DEVICE_INSTANCES device_batched_gemm_reduce_instance)
-  if(SUPPORTED_GPU_TARGETS MATCHES "gfx9[45]|gfx12")
-    list(APPEND DEVICE_INSTANCES device_gemm_multiply_multiply_wp_instance)
-    list(APPEND DEVICE_INSTANCES device_gemm_universal_preshuffle_instance)
-  endif()
-    if(SUPPORTED_GPU_TARGETS MATCHES "gfx9[45]|gfx1[12]")
-    list(APPEND DEVICE_INSTANCES device_gemm_ab_scale_instance)
-    list(APPEND DEVICE_INSTANCES device_gemm_blockscale_wp_instance)
-  endif()
-  if(SUPPORTED_GPU_TARGETS MATCHES "gfx95")
-    list(APPEND DEVICE_INSTANCES device_gemm_mx_instance)
-  endif()
-  list(APPEND DEVICE_INSTANCES device_gemm_splitk_instance)
-  list(APPEND DEVICE_INSTANCES device_gemm_universal_batched_instance)
-  list(APPEND DEVICE_INSTANCES device_gemm_universal_streamk_instance)
-  list(APPEND DEVICE_INSTANCES device_gemm_add_multiply_instance)
-  list(APPEND DEVICE_INSTANCES device_gemm_add_instance)
-  list(APPEND DEVICE_INSTANCES device_gemm_reduce_instance)
-  list(APPEND DEVICE_INSTANCES device_gemm_bias_add_reduce_instance)
+  # if(DTYPES MATCHES "fp32" OR DTYPES MATCHES "fp64" OR NOT DEFINED DTYPES)
+  #   list(APPEND DEVICE_INSTANCES device_contraction_bilinear_instance)
+  #   list(APPEND DEVICE_INSTANCES device_contraction_scale_instance)
+  # endif()
+  # if(DTYPES MATCHES "fp16" OR NOT DEFINED DTYPES)
+  #   list(APPEND DEVICE_INSTANCES device_gemm_add_instance)
+  #   list(APPEND DEVICE_INSTANCES device_batched_gemm_gemm_instance)
+  #   list(APPEND DEVICE_INSTANCES device_gemm_add_add_fastgelu_instance)
+  #   list(APPEND DEVICE_INSTANCES device_gemm_fastgelu_instance)
+  #   list(APPEND DEVICE_INSTANCES device_batched_gemm_add_relu_gemm_add_instance)
+  #   list(APPEND DEVICE_INSTANCES device_grouped_gemm_instance)
+  #   list(APPEND DEVICE_INSTANCES device_gemm_streamk_instance)
+  #   list(APPEND DEVICE_INSTANCES device_gemm_add_relu_instance)
+  #   list(APPEND DEVICE_INSTANCES device_gemm_add_relu_add_layernorm_instance)
+  #   list(APPEND DEVICE_INSTANCES device_grouped_gemm_fixed_nk_instance)
+  #   list(APPEND DEVICE_INSTANCES device_grouped_gemm_fastgelu_instance)
+  #   list(APPEND DEVICE_INSTANCES device_grouped_gemm_tile_loop_instance)
+  # endif()
+  # list(APPEND DEVICE_INSTANCES device_batched_gemm_reduce_instance)
+  # if(SUPPORTED_GPU_TARGETS MATCHES "gfx9[45]|gfx12")
+  #   list(APPEND DEVICE_INSTANCES device_gemm_multiply_multiply_wp_instance)
+  #   list(APPEND DEVICE_INSTANCES device_gemm_universal_preshuffle_instance)
+  # endif()
+  #   if(SUPPORTED_GPU_TARGETS MATCHES "gfx9[45]|gfx1[12]")
+  #   list(APPEND DEVICE_INSTANCES device_gemm_ab_scale_instance)
+  #   list(APPEND DEVICE_INSTANCES device_gemm_blockscale_wp_instance)
+  # endif()
+  # if(SUPPORTED_GPU_TARGETS MATCHES "gfx95")
+  #   list(APPEND DEVICE_INSTANCES device_gemm_mx_instance)
+  # endif()
+  # list(APPEND DEVICE_INSTANCES device_gemm_splitk_instance)
+  # list(APPEND DEVICE_INSTANCES device_gemm_universal_batched_instance)
+  # list(APPEND DEVICE_INSTANCES device_gemm_universal_streamk_instance)
+  # list(APPEND DEVICE_INSTANCES device_gemm_add_multiply_instance)
+  # list(APPEND DEVICE_INSTANCES device_gemm_add_instance)
+  # list(APPEND DEVICE_INSTANCES device_gemm_reduce_instance)
+  # list(APPEND DEVICE_INSTANCES device_gemm_bias_add_reduce_instance)
  list(APPEND DEVICE_INSTANCES device_conv2d_fwd_instance)
  list(APPEND DEVICE_INSTANCES device_conv2d_fwd_bias_relu_instance)
  list(APPEND DEVICE_INSTANCES device_conv2d_fwd_bias_relu_add_instance)
@@ -213,20 +213,20 @@ if(SUPPORTED_GPU_TARGETS MATCHES "gfx9|gfx1[12]")
  list(APPEND DEVICE_INSTANCES device_grouped_conv3d_fwd_convinvscale_instance)
 endif()

-if((SUPPORTED_GPU_TARGETS MATCHES "gfx9" AND (DTYPES MATCHES "fp16" OR NOT DEFINED DTYPES)) OR
-   (SUPPORTED_GPU_TARGETS MATCHES "gfx1[12]" ))
-  list(APPEND DEVICE_INSTANCES device_gemm_bilinear_instance)
-endif()
-if(SUPPORTED_GPU_TARGETS MATCHES "gfx(9[45]|1[12])")
-  list(APPEND DEVICE_INSTANCES device_gemm_multiply_multiply_instance)
-endif()
+# if((SUPPORTED_GPU_TARGETS MATCHES "gfx9" AND (DTYPES MATCHES "fp16" OR NOT DEFINED DTYPES)) OR
+#    (SUPPORTED_GPU_TARGETS MATCHES "gfx1[12]" ))
+#   list(APPEND DEVICE_INSTANCES device_gemm_bilinear_instance)
+# endif()
+# if(SUPPORTED_GPU_TARGETS MATCHES "gfx(9[45]|1[12])")
+#   list(APPEND DEVICE_INSTANCES device_gemm_multiply_multiply_instance)
+# endif()

 if(SUPPORTED_GPU_TARGETS MATCHES "gfx9|gfx1[12]")
-  list(APPEND DEVICE_INSTANCES device_gemm_universal_instance)
-  list(APPEND DEVICE_INSTANCES device_batched_gemm_instance)
-  list(APPEND DEVICE_INSTANCES device_gemm_b_scale_instance)
-  list(APPEND DEVICE_INSTANCES device_gemm_universal_reduce_instance)
-  list(APPEND DEVICE_INSTANCES device_batched_gemm_b_scale_instance)
+  # list(APPEND DEVICE_INSTANCES device_gemm_universal_instance)
+  # list(APPEND DEVICE_INSTANCES device_batched_gemm_instance)
+  # list(APPEND DEVICE_INSTANCES device_gemm_b_scale_instance)
+  # list(APPEND DEVICE_INSTANCES device_gemm_universal_reduce_instance)
+  # list(APPEND DEVICE_INSTANCES device_batched_gemm_b_scale_instance)
  list(APPEND DEVICE_INSTANCES device_grouped_conv2d_bwd_data_instance)
  list(APPEND DEVICE_INSTANCES device_grouped_conv3d_bwd_data_instance)
  list(APPEND DEVICE_INSTANCES device_grouped_conv1d_fwd_instance)
@@ -238,18 +238,18 @@ if(SUPPORTED_GPU_TARGETS MATCHES "gfx9|gfx1[12]")
  list(APPEND DEVICE_INSTANCES device_grouped_conv2d_fwd_bias_clamp_instance)
  list(APPEND DEVICE_INSTANCES device_grouped_conv3d_fwd_bias_clamp_instance)
  list(APPEND DEVICE_INSTANCES device_grouped_conv3d_fwd_bilinear_instance)
-  list(APPEND DEVICE_INSTANCES device_gemm_add_relu_instance)
-  list(APPEND DEVICE_INSTANCES device_gemm_multi_abd_instance)
-  if(DTYPES MATCHES "fp16" OR NOT DEFINED DTYPES)
-    list(APPEND DEVICE_INSTANCES device_gemm_add_multiply_instance)
-    list(APPEND DEVICE_INSTANCES device_gemm_multiply_add_instance)
-    list(APPEND DEVICE_INSTANCES device_gemm_add_instance)
-    list(APPEND DEVICE_INSTANCES device_gemm_add_silu_instance)
-    list(APPEND DEVICE_INSTANCES device_gemm_fastgelu_instance)
-    list(APPEND DEVICE_INSTANCES device_gemm_add_fastgelu_instance)
-    list(APPEND DEVICE_INSTANCES device_gemm_add_add_fastgelu_instance)
-  endif()
-  list(APPEND DEVICE_INSTANCES device_batched_gemm_gemm_instance)
+  # list(APPEND DEVICE_INSTANCES device_gemm_add_relu_instance)
+  # list(APPEND DEVICE_INSTANCES device_gemm_multi_abd_instance)
+  # if(DTYPES MATCHES "fp16" OR NOT DEFINED DTYPES)
+  #   list(APPEND DEVICE_INSTANCES device_gemm_add_multiply_instance)
+  #   list(APPEND DEVICE_INSTANCES device_gemm_multiply_add_instance)
+  #   list(APPEND DEVICE_INSTANCES device_gemm_add_instance)
+  #   list(APPEND DEVICE_INSTANCES device_gemm_add_silu_instance)
+  #   list(APPEND DEVICE_INSTANCES device_gemm_fastgelu_instance)
+  #   list(APPEND DEVICE_INSTANCES device_gemm_add_fastgelu_instance)
+  #   list(APPEND DEVICE_INSTANCES device_gemm_add_add_fastgelu_instance)
+  # endif()
+  # list(APPEND DEVICE_INSTANCES device_batched_gemm_gemm_instance)
  list(APPEND DEVICE_INSTANCES device_grouped_conv1d_bwd_weight_instance)
  list(APPEND DEVICE_INSTANCES device_grouped_conv2d_bwd_weight_instance)
  list(APPEND DEVICE_INSTANCES device_grouped_convnd_bwd_weight_instance)
@@ -257,15 +257,15 @@ if(SUPPORTED_GPU_TARGETS MATCHES "gfx9|gfx1[12]")
 endif()

 if(DL_KERNELS)
-  list(APPEND DEVICE_INSTANCES device_batched_gemm_multi_d_instance)
+  # list(APPEND DEVICE_INSTANCES device_batched_gemm_multi_d_instance)
  list(APPEND DEVICE_INSTANCES device_grouped_conv1d_bwd_weight_instance)
  list(APPEND DEVICE_INSTANCES device_grouped_conv2d_bwd_weight_instance)
  list(APPEND DEVICE_INSTANCES device_grouped_conv3d_bwd_weight_instance)
 endif()

-if(CK_ENABLE_INT8)
-  list(APPEND DEVICE_INSTANCES device_quantization_instance)
-endif()
+# if(CK_ENABLE_INT8)
+#   list(APPEND DEVICE_INSTANCES device_quantization_instance)
+# endif()

 set(PROFILER_LIBS utility getopt::getopt)
 foreach(LIB ${DEVICE_INSTANCES})