From 02db748e747b9788dcb53e7c4a2f968d5b60149f Mon Sep 17 00:00:00 2001
From: Po Yen Chen <PoYen.Chen@amd.com>
Date: Fri, 2 Dec 2022 05:15:02 +0800
Subject: [PATCH] Modularize ckProfiler operations (#514)

* Re-structure ckProfiler source files

* Rename profiler.cpp to main.cpp

* Modularize ckProfiler operations

* Add description for profiler operations

* Use longer name to avoid name collision

* Use macro to delay expansion

* Use std::move() to avoid object copying

* Prohibit users from calling the destructor

* Use macro to eliminate redundant code

* Make friend function hidden

* Add missing include directive <iostream>

* Fix wrong include directives

* Remove int8 from batchnorm-forward instances, since int8 is not needed for forward training and could cause test failures

Co-authored-by: Qianfeng Zhang <Qianfeng.Zhang@amd.com>

[ROCm/composable_kernel commit: 8784a72e23538d594ea6b1bd527478fba2962d30]
---
 profiler/CMakeLists.txt                       |  63 +-------
 .../include/{ => profiler}/data_type_enum.hpp |   0
 .../{ => profiler}/data_type_enum_helper.hpp  |   2 +-
 ...le_batched_gemm_add_relu_gemm_add_impl.hpp |   0
 .../profile_batched_gemm_gemm_impl.hpp        |   0
 .../profile_batched_gemm_impl.hpp             |   0
 .../profile_batched_gemm_reduce_impl.hpp      |   0
 ...profile_batched_gemm_softmax_gemm_impl.hpp |   0
 ...batched_gemm_softmax_gemm_permute_impl.hpp |   0
 .../profile_batchnorm_backward_impl.hpp       |   0
 .../profile_batchnorm_forward_impl.hpp        |   0
 .../profile_conv_bwd_data_impl.hpp            |   0
 .../profile_conv_fwd_bias_relu_add_impl.hpp   |   0
 .../profile_conv_fwd_bias_relu_impl.hpp       |   0
 .../{ => profiler}/profile_conv_fwd_impl.hpp  |   0
 .../profile_convnd_bwd_data_impl.hpp          |   0
 .../profile_convnd_bwd_weight_impl.hpp        |   0
 .../profile_elementwise_layernorm_impl.hpp    |   0
 .../profile_gemm_add_add_fastgelu_impl.hpp    |   0
 .../profile_gemm_bias_add_reduce_impl.hpp     |   0
 .../profile_gemm_bilinear_impl.hpp            |   0
 .../{ => profiler}/profile_gemm_impl.hpp      |   0
 .../profile_gemm_reduce_impl.hpp              |   0
 .../profile_gemm_splitk_impl.hpp              |   0
 .../profile_grouped_conv_bwd_weight_impl.hpp  |   0
 .../profile_grouped_conv_fwd_impl.hpp         |   0
 .../profile_grouped_gemm_impl.hpp             |   0
 .../{ => profiler}/profile_groupnorm_impl.hpp |   0
 .../{ => profiler}/profile_layernorm_impl.hpp |   0
 .../{ => profiler}/profile_reduce_impl.hpp    |   0
 .../{ => profiler}/profile_softmax_impl.hpp   |   0
 profiler/src/CMakeLists.txt                   |  63 ++++++++
 profiler/src/profile_batched_gemm.cpp         |  10 +-
 ...profile_batched_gemm_add_relu_gemm_add.cpp |  11 +-
 profiler/src/profile_batched_gemm_gemm.cpp    |  10 +-
 profiler/src/profile_batched_gemm_reduce.cpp  |  10 +-
 profiler/src/profile_batchnorm_bwd.cpp        |   5 +-
 profiler/src/profile_batchnorm_fwd.cpp        |   5 +-
 profiler/src/profile_conv_bwd_data.cpp        |  10 +-
 profiler/src/profile_conv_fwd.cpp             |  10 +-
 profiler/src/profile_conv_fwd_bias_relu.cpp   |  10 +-
 .../src/profile_conv_fwd_bias_relu_add.cpp    |  11 +-
 profiler/src/profile_gemm.cpp                 |  10 +-
 .../src/profile_gemm_add_add_fastgelu.cpp     |  10 +-
 profiler/src/profile_gemm_bias_add_reduce.cpp |  10 +-
 profiler/src/profile_gemm_bilinear.cpp        |  10 +-
 profiler/src/profile_gemm_reduce.cpp          |  10 +-
 profiler/src/profile_gemm_splitk.cpp          |  10 +-
 .../src/profile_grouped_conv_bwd_weight.cpp   |  10 +-
 profiler/src/profile_grouped_conv_fwd.cpp     |  10 +-
 profiler/src/profile_grouped_gemm.cpp         |  10 +-
 profiler/src/profile_groupnorm.cpp            |  12 +-
 profiler/src/profile_layernorm.cpp            |   7 +-
 profiler/src/profile_reduce.cpp               |   7 +-
 profiler/src/profile_softmax.cpp              |   5 +-
 profiler/src/profiler.cpp                     | 150 ++----------------
 profiler/src/profiler_operation_registry.hpp  |  79 +++++++++
 test/CMakeLists.txt                           |   1 +
 test/batched_gemm/batched_gemm_bf16.cpp       |   2 +-
 test/batched_gemm/batched_gemm_fp16.cpp       |   2 +-
 test/batched_gemm/batched_gemm_fp32.cpp       |   2 +-
 test/batched_gemm/batched_gemm_int8.cpp       |   2 +-
 .../test_batched_gemm_gemm_util.hpp           |   2 +-
 .../batched_gemm_reduce_fp16.cpp              |   2 +-
 .../test_batched_gemm_softmax_gemm_util.hpp   |   2 +-
 ...batched_gemm_softmax_gemm_permute_util.hpp |   2 +-
 test/batchnorm/batchnorm_bwd_rank_4.cpp       |   2 +-
 test/batchnorm/batchnorm_fwd_rank_4.cpp       |   2 +-
 test/convnd_bwd_data/convnd_bwd_data.cpp      |   2 +-
 test/convnd_fwd/convnd_fwd.cpp                |   2 +-
 .../test_elementwise_layernorm_fp16.cpp       |   2 +-
 test/gemm_reduce/gemm_reduce_fp16.cpp         |   2 +-
 .../grouped_convnd_bwd_weight.cpp             |   2 +-
 .../grouped_convnd_fwd/grouped_convnd_fwd.cpp |   2 +-
 test/grouped_gemm/grouped_gemm_fp16.cpp       |   2 +-
 test/normalization/test_groupnorm_fp16.cpp    |   2 +-
 test/normalization/test_groupnorm_fp32.cpp    |   2 +-
 test/normalization/test_layernorm2d_fp16.cpp  |   2 +-
 test/normalization/test_layernorm2d_fp32.cpp  |   2 +-
 test/reduce/reduce_no_index.cpp               |   2 +-
 test/reduce/reduce_with_index.cpp             |   2 +-
 test/softmax/test_softmax_util.hpp            |   2 +-
 82 files changed, 346 insertions(+), 273 deletions(-)
 rename profiler/include/{ => profiler}/data_type_enum.hpp (100%)
 rename profiler/include/{ => profiler}/data_type_enum_helper.hpp (96%)
 rename profiler/include/{ => profiler}/profile_batched_gemm_add_relu_gemm_add_impl.hpp (100%)
 rename profiler/include/{ => profiler}/profile_batched_gemm_gemm_impl.hpp (100%)
 rename profiler/include/{ => profiler}/profile_batched_gemm_impl.hpp (100%)
 rename profiler/include/{ => profiler}/profile_batched_gemm_reduce_impl.hpp (100%)
 rename profiler/include/{ => profiler}/profile_batched_gemm_softmax_gemm_impl.hpp (100%)
 rename profiler/include/{ => profiler}/profile_batched_gemm_softmax_gemm_permute_impl.hpp (100%)
 rename profiler/include/{ => profiler}/profile_batchnorm_backward_impl.hpp (100%)
 rename profiler/include/{ => profiler}/profile_batchnorm_forward_impl.hpp (100%)
 rename profiler/include/{ => profiler}/profile_conv_bwd_data_impl.hpp (100%)
 rename profiler/include/{ => profiler}/profile_conv_fwd_bias_relu_add_impl.hpp (100%)
 rename profiler/include/{ => profiler}/profile_conv_fwd_bias_relu_impl.hpp (100%)
 rename profiler/include/{ => profiler}/profile_conv_fwd_impl.hpp (100%)
 rename profiler/include/{ => profiler}/profile_convnd_bwd_data_impl.hpp (100%)
 rename profiler/include/{ => profiler}/profile_convnd_bwd_weight_impl.hpp (100%)
 rename profiler/include/{ => profiler}/profile_elementwise_layernorm_impl.hpp (100%)
 rename profiler/include/{ => profiler}/profile_gemm_add_add_fastgelu_impl.hpp (100%)
 rename profiler/include/{ => profiler}/profile_gemm_bias_add_reduce_impl.hpp (100%)
 rename profiler/include/{ => profiler}/profile_gemm_bilinear_impl.hpp (100%)
 rename profiler/include/{ => profiler}/profile_gemm_impl.hpp (100%)
 rename profiler/include/{ => profiler}/profile_gemm_reduce_impl.hpp (100%)
 rename profiler/include/{ => profiler}/profile_gemm_splitk_impl.hpp (100%)
 rename profiler/include/{ => profiler}/profile_grouped_conv_bwd_weight_impl.hpp (100%)
 rename profiler/include/{ => profiler}/profile_grouped_conv_fwd_impl.hpp (100%)
 rename profiler/include/{ => profiler}/profile_grouped_gemm_impl.hpp (100%)
 rename profiler/include/{ => profiler}/profile_groupnorm_impl.hpp (100%)
 rename profiler/include/{ => profiler}/profile_layernorm_impl.hpp (100%)
 rename profiler/include/{ => profiler}/profile_reduce_impl.hpp (100%)
 rename profiler/include/{ => profiler}/profile_softmax_impl.hpp (100%)
 create mode 100644 profiler/src/CMakeLists.txt
 create mode 100644 profiler/src/profiler_operation_registry.hpp
diff --git a/profiler/CMakeLists.txt b/profiler/CMakeLists.txt
index 0dccfff476..bdd7125ac1 100644
--- a/profiler/CMakeLists.txt
+++ b/profiler/CMakeLists.txt
@@ -1,64 +1,5 @@
 include_directories(BEFORE
-    ${PROJECT_SOURCE_DIR}/
+    ${CMAKE_CURRENT_LIST_DIR}/include
 )
 
-# ck_profiler
-set(PROFILER_SOURCE
-    src/profiler.cpp
-    src/profile_gemm.cpp
-    src/profile_gemm_splitk.cpp
-    src/profile_gemm_bilinear.cpp
-    src/profile_gemm_bias_add_reduce.cpp
-    src/profile_gemm_add_add_fastgelu.cpp
-    src/profile_gemm_reduce.cpp
-    src/profile_batched_gemm.cpp
-    src/profile_batched_gemm_gemm.cpp
-    src/profile_batched_gemm_add_relu_gemm_add.cpp
-    src/profile_batched_gemm_reduce.cpp
-    src/profile_grouped_gemm.cpp
-    src/profile_conv_fwd.cpp
-    src/profile_conv_fwd_bias_relu.cpp
-    src/profile_conv_fwd_bias_relu_add.cpp
-    src/profile_conv_bwd_data.cpp
-    src/profile_grouped_conv_fwd.cpp
-    src/profile_grouped_conv_bwd_weight.cpp
-    src/profile_reduce.cpp
-    src/profile_groupnorm.cpp
-    src/profile_layernorm.cpp
-    src/profile_softmax.cpp
-    src/profile_batchnorm_fwd.cpp
-    src/profile_batchnorm_bwd.cpp
-)
-
-add_executable(ckProfiler ${PROFILER_SOURCE})
-
-target_link_libraries(ckProfiler PRIVATE utility)
-target_link_libraries(ckProfiler PRIVATE device_gemm_instance)
-target_link_libraries(ckProfiler PRIVATE device_gemm_splitk_instance)
-target_link_libraries(ckProfiler PRIVATE device_gemm_bilinear_instance)
-target_link_libraries(ckProfiler PRIVATE device_gemm_add_add_fastgelu_instance)
-target_link_libraries(ckProfiler PRIVATE device_gemm_reduce_instance)
-target_link_libraries(ckProfiler PRIVATE device_gemm_bias_add_reduce_instance)
-target_link_libraries(ckProfiler PRIVATE device_batched_gemm_instance)
-target_link_libraries(ckProfiler PRIVATE device_batched_gemm_gemm_instance)
-target_link_libraries(ckProfiler PRIVATE device_batched_gemm_add_relu_gemm_add_instance)
-target_link_libraries(ckProfiler PRIVATE device_batched_gemm_reduce_instance)
-target_link_libraries(ckProfiler PRIVATE device_grouped_gemm_instance)
-target_link_libraries(ckProfiler PRIVATE device_conv2d_fwd_instance)
-target_link_libraries(ckProfiler PRIVATE device_grouped_conv1d_fwd_instance)
-target_link_libraries(ckProfiler PRIVATE device_grouped_conv2d_fwd_instance)
-target_link_libraries(ckProfiler PRIVATE device_grouped_conv3d_fwd_instance)
-target_link_libraries(ckProfiler PRIVATE device_conv1d_bwd_data_instance)
-target_link_libraries(ckProfiler PRIVATE device_conv2d_bwd_data_instance)
-target_link_libraries(ckProfiler PRIVATE device_conv3d_bwd_data_instance)
-target_link_libraries(ckProfiler PRIVATE device_grouped_conv1d_bwd_weight_instance)
-target_link_libraries(ckProfiler PRIVATE device_grouped_conv2d_bwd_weight_instance)
-target_link_libraries(ckProfiler PRIVATE device_grouped_conv3d_bwd_weight_instance)
-target_link_libraries(ckProfiler PRIVATE device_conv2d_fwd_bias_relu_instance)
-target_link_libraries(ckProfiler PRIVATE device_conv2d_fwd_bias_relu_add_instance)
-target_link_libraries(ckProfiler PRIVATE device_normalization_instance)
-target_link_libraries(ckProfiler PRIVATE device_softmax_instance)
-target_link_libraries(ckProfiler PRIVATE device_reduce_instance)
-target_link_libraries(ckProfiler PRIVATE device_batchnorm_instance)
-
-rocm_install(TARGETS ckProfiler COMPONENT profiler)
+add_subdirectory(src)
diff --git a/profiler/include/data_type_enum.hpp b/profiler/include/profiler/data_type_enum.hpp
similarity index 100%
rename from profiler/include/data_type_enum.hpp
rename to profiler/include/profiler/data_type_enum.hpp
diff --git a/profiler/include/data_type_enum_helper.hpp b/profiler/include/profiler/data_type_enum_helper.hpp
similarity index 96%
rename from profiler/include/data_type_enum_helper.hpp
rename to profiler/include/profiler/data_type_enum_helper.hpp
index 6f8ef2b9f7..d9bd5e1a40 100644
--- a/profiler/include/data_type_enum_helper.hpp
+++ b/profiler/include/profiler/data_type_enum_helper.hpp
@@ -4,7 +4,7 @@
 #pragma
 
 #include "ck/utility/data_type.hpp"
-#include "profiler/include/data_type_enum.hpp"
+#include "profiler/data_type_enum.hpp"
 
 namespace ck {
 
diff --git a/profiler/include/profile_batched_gemm_add_relu_gemm_add_impl.hpp b/profiler/include/profiler/profile_batched_gemm_add_relu_gemm_add_impl.hpp
similarity index 100%
rename from profiler/include/profile_batched_gemm_add_relu_gemm_add_impl.hpp
rename to profiler/include/profiler/profile_batched_gemm_add_relu_gemm_add_impl.hpp
diff --git a/profiler/include/profile_batched_gemm_gemm_impl.hpp b/profiler/include/profiler/profile_batched_gemm_gemm_impl.hpp
similarity index 100%
rename from profiler/include/profile_batched_gemm_gemm_impl.hpp
rename to profiler/include/profiler/profile_batched_gemm_gemm_impl.hpp
diff --git a/profiler/include/profile_batched_gemm_impl.hpp b/profiler/include/profiler/profile_batched_gemm_impl.hpp
similarity index 100%
rename from profiler/include/profile_batched_gemm_impl.hpp
rename to profiler/include/profiler/profile_batched_gemm_impl.hpp
diff --git a/profiler/include/profile_batched_gemm_reduce_impl.hpp b/profiler/include/profiler/profile_batched_gemm_reduce_impl.hpp
similarity index 100%
rename from profiler/include/profile_batched_gemm_reduce_impl.hpp
rename to profiler/include/profiler/profile_batched_gemm_reduce_impl.hpp
diff --git a/profiler/include/profile_batched_gemm_softmax_gemm_impl.hpp b/profiler/include/profiler/profile_batched_gemm_softmax_gemm_impl.hpp
similarity index 100%
rename from profiler/include/profile_batched_gemm_softmax_gemm_impl.hpp
rename to profiler/include/profiler/profile_batched_gemm_softmax_gemm_impl.hpp
diff --git a/profiler/include/profile_batched_gemm_softmax_gemm_permute_impl.hpp b/profiler/include/profiler/profile_batched_gemm_softmax_gemm_permute_impl.hpp
similarity index 100%
rename from profiler/include/profile_batched_gemm_softmax_gemm_permute_impl.hpp
rename to profiler/include/profiler/profile_batched_gemm_softmax_gemm_permute_impl.hpp
diff --git a/profiler/include/profile_batchnorm_backward_impl.hpp b/profiler/include/profiler/profile_batchnorm_backward_impl.hpp
similarity index 100%
rename from profiler/include/profile_batchnorm_backward_impl.hpp
rename to profiler/include/profiler/profile_batchnorm_backward_impl.hpp
diff --git a/profiler/include/profile_batchnorm_forward_impl.hpp b/profiler/include/profiler/profile_batchnorm_forward_impl.hpp
similarity index 100%
rename from profiler/include/profile_batchnorm_forward_impl.hpp
rename to profiler/include/profiler/profile_batchnorm_forward_impl.hpp
diff --git a/profiler/include/profile_conv_bwd_data_impl.hpp b/profiler/include/profiler/profile_conv_bwd_data_impl.hpp
similarity index 100%
rename from profiler/include/profile_conv_bwd_data_impl.hpp
rename to profiler/include/profiler/profile_conv_bwd_data_impl.hpp
diff --git a/profiler/include/profile_conv_fwd_bias_relu_add_impl.hpp b/profiler/include/profiler/profile_conv_fwd_bias_relu_add_impl.hpp
similarity index 100%
rename from profiler/include/profile_conv_fwd_bias_relu_add_impl.hpp
rename to profiler/include/profiler/profile_conv_fwd_bias_relu_add_impl.hpp
diff --git a/profiler/include/profile_conv_fwd_bias_relu_impl.hpp b/profiler/include/profiler/profile_conv_fwd_bias_relu_impl.hpp
similarity index 100%
rename from profiler/include/profile_conv_fwd_bias_relu_impl.hpp
rename to profiler/include/profiler/profile_conv_fwd_bias_relu_impl.hpp
diff --git a/profiler/include/profile_conv_fwd_impl.hpp b/profiler/include/profiler/profile_conv_fwd_impl.hpp
similarity index 100%
rename from profiler/include/profile_conv_fwd_impl.hpp
rename to profiler/include/profiler/profile_conv_fwd_impl.hpp
diff --git a/profiler/include/profile_convnd_bwd_data_impl.hpp b/profiler/include/profiler/profile_convnd_bwd_data_impl.hpp
similarity index 100%
rename from profiler/include/profile_convnd_bwd_data_impl.hpp
rename to profiler/include/profiler/profile_convnd_bwd_data_impl.hpp
diff --git a/profiler/include/profile_convnd_bwd_weight_impl.hpp b/profiler/include/profiler/profile_convnd_bwd_weight_impl.hpp
similarity index 100%
rename from profiler/include/profile_convnd_bwd_weight_impl.hpp
rename to profiler/include/profiler/profile_convnd_bwd_weight_impl.hpp
diff --git a/profiler/include/profile_elementwise_layernorm_impl.hpp b/profiler/include/profiler/profile_elementwise_layernorm_impl.hpp
similarity index 100%
rename from profiler/include/profile_elementwise_layernorm_impl.hpp
rename to profiler/include/profiler/profile_elementwise_layernorm_impl.hpp
diff --git a/profiler/include/profile_gemm_add_add_fastgelu_impl.hpp b/profiler/include/profiler/profile_gemm_add_add_fastgelu_impl.hpp
similarity index 100%
rename from profiler/include/profile_gemm_add_add_fastgelu_impl.hpp
rename to profiler/include/profiler/profile_gemm_add_add_fastgelu_impl.hpp
diff --git a/profiler/include/profile_gemm_bias_add_reduce_impl.hpp b/profiler/include/profiler/profile_gemm_bias_add_reduce_impl.hpp
similarity index 100%
rename from profiler/include/profile_gemm_bias_add_reduce_impl.hpp
rename to profiler/include/profiler/profile_gemm_bias_add_reduce_impl.hpp
diff --git a/profiler/include/profile_gemm_bilinear_impl.hpp b/profiler/include/profiler/profile_gemm_bilinear_impl.hpp
similarity index 100%
rename from profiler/include/profile_gemm_bilinear_impl.hpp
rename to profiler/include/profiler/profile_gemm_bilinear_impl.hpp
diff --git a/profiler/include/profile_gemm_impl.hpp b/profiler/include/profiler/profile_gemm_impl.hpp
similarity index 100%
rename from profiler/include/profile_gemm_impl.hpp
rename to profiler/include/profiler/profile_gemm_impl.hpp
diff --git a/profiler/include/profile_gemm_reduce_impl.hpp b/profiler/include/profiler/profile_gemm_reduce_impl.hpp
similarity index 100%
rename from profiler/include/profile_gemm_reduce_impl.hpp
rename to profiler/include/profiler/profile_gemm_reduce_impl.hpp
diff --git a/profiler/include/profile_gemm_splitk_impl.hpp b/profiler/include/profiler/profile_gemm_splitk_impl.hpp
similarity index 100%
rename from profiler/include/profile_gemm_splitk_impl.hpp
rename to profiler/include/profiler/profile_gemm_splitk_impl.hpp
diff --git a/profiler/include/profile_grouped_conv_bwd_weight_impl.hpp b/profiler/include/profiler/profile_grouped_conv_bwd_weight_impl.hpp
similarity index 100%
rename from profiler/include/profile_grouped_conv_bwd_weight_impl.hpp
rename to profiler/include/profiler/profile_grouped_conv_bwd_weight_impl.hpp
diff --git a/profiler/include/profile_grouped_conv_fwd_impl.hpp b/profiler/include/profiler/profile_grouped_conv_fwd_impl.hpp
similarity index 100%
rename from profiler/include/profile_grouped_conv_fwd_impl.hpp
rename to profiler/include/profiler/profile_grouped_conv_fwd_impl.hpp
diff --git a/profiler/include/profile_grouped_gemm_impl.hpp b/profiler/include/profiler/profile_grouped_gemm_impl.hpp
similarity index 100%
rename from profiler/include/profile_grouped_gemm_impl.hpp
rename to profiler/include/profiler/profile_grouped_gemm_impl.hpp
diff --git a/profiler/include/profile_groupnorm_impl.hpp b/profiler/include/profiler/profile_groupnorm_impl.hpp
similarity index 100%
rename from profiler/include/profile_groupnorm_impl.hpp
rename to profiler/include/profiler/profile_groupnorm_impl.hpp
diff --git a/profiler/include/profile_layernorm_impl.hpp b/profiler/include/profiler/profile_layernorm_impl.hpp
similarity index 100%
rename from profiler/include/profile_layernorm_impl.hpp
rename to profiler/include/profiler/profile_layernorm_impl.hpp
diff --git a/profiler/include/profile_reduce_impl.hpp b/profiler/include/profiler/profile_reduce_impl.hpp
similarity index 100%
rename from profiler/include/profile_reduce_impl.hpp
rename to profiler/include/profiler/profile_reduce_impl.hpp
diff --git a/profiler/include/profile_softmax_impl.hpp b/profiler/include/profiler/profile_softmax_impl.hpp
similarity index 100%
rename from profiler/include/profile_softmax_impl.hpp
rename to profiler/include/profiler/profile_softmax_impl.hpp
diff --git a/profiler/src/CMakeLists.txt b/profiler/src/CMakeLists.txt
new file mode 100644
index 0000000000..51d039526f
--- /dev/null
+++ b/profiler/src/CMakeLists.txt
@@ -0,0 +1,63 @@
+# ckProfiler
+set(PROFILER_SOURCES
+    profiler.cpp
+    profile_gemm.cpp
+    profile_gemm_splitk.cpp
+    profile_gemm_bilinear.cpp
+    profile_gemm_bias_add_reduce.cpp
+    profile_gemm_add_add_fastgelu.cpp
+    profile_gemm_reduce.cpp
+    profile_batched_gemm.cpp
+    profile_batched_gemm_gemm.cpp
+    profile_batched_gemm_add_relu_gemm_add.cpp
+    profile_batched_gemm_reduce.cpp
+    profile_grouped_gemm.cpp
+    profile_conv_fwd.cpp
+    profile_conv_fwd_bias_relu.cpp
+    profile_conv_fwd_bias_relu_add.cpp
+    profile_conv_bwd_data.cpp
+    profile_grouped_conv_fwd.cpp
+    profile_grouped_conv_bwd_weight.cpp
+    profile_reduce.cpp
+    profile_groupnorm.cpp
+    profile_layernorm.cpp
+    profile_softmax.cpp
+    profile_batchnorm_fwd.cpp
+    profile_batchnorm_bwd.cpp
+)
+
+set(PROFILER_EXECUTABLE ckProfiler)
+
+add_executable(${PROFILER_EXECUTABLE} ${PROFILER_SOURCES})
+target_compile_options(${PROFILER_EXECUTABLE} PRIVATE -Wno-global-constructors)
+
+target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE utility)
+target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_gemm_instance)
+target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_gemm_splitk_instance)
+target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_gemm_bilinear_instance)
+target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_gemm_add_add_fastgelu_instance)
+target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_gemm_reduce_instance)
+target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_gemm_bias_add_reduce_instance)
+target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_batched_gemm_instance)
+target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_batched_gemm_gemm_instance)
+target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_batched_gemm_add_relu_gemm_add_instance)
+target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_batched_gemm_reduce_instance)
+target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_grouped_gemm_instance)
+target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_conv2d_fwd_instance)
+target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_grouped_conv1d_fwd_instance)
+target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_grouped_conv2d_fwd_instance)
+target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_grouped_conv3d_fwd_instance)
+target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_conv1d_bwd_data_instance)
+target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_conv2d_bwd_data_instance)
+target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_conv3d_bwd_data_instance)
+target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_grouped_conv1d_bwd_weight_instance)
+target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_grouped_conv2d_bwd_weight_instance)
+target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_grouped_conv3d_bwd_weight_instance)
+target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_conv2d_fwd_bias_relu_instance)
+target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_conv2d_fwd_bias_relu_add_instance)
+target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_normalization_instance)
+target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_softmax_instance)
+target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_reduce_instance)
+target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_batchnorm_instance)
+
+rocm_install(TARGETS ${PROFILER_EXECUTABLE} COMPONENT profiler)
diff --git a/profiler/src/profile_batched_gemm.cpp b/profiler/src/profile_batched_gemm.cpp
index 7c4e2f7b7d..907a373794 100644
--- a/profiler/src/profile_batched_gemm.cpp
+++ b/profiler/src/profile_batched_gemm.cpp
@@ -7,7 +7,8 @@
 #include <initializer_list>
 #include <cstdlib>
 
-#include "profiler/include/profile_batched_gemm_impl.hpp"
+#include "profiler/profile_batched_gemm_impl.hpp"
+#include "profiler_operation_registry.hpp"
 
 enum struct GemmMatrixLayout
 {
@@ -25,12 +26,15 @@ enum struct GemmDataType
     INT8_INT8_INT8, // 3
 };
 
+#define OP_NAME "batched_gemm"
+#define OP_DESC "Batched GEMM"
+
 int profile_batched_gemm(int argc, char* argv[])
 {
     if(argc != 18)
     {
         // clang-format off
-        printf("arg1: tensor operation (batched_gemm: Batched GEMM)\n");
+        printf("arg1: tensor operation (" OP_NAME ": " OP_DESC ")\n");
         printf("arg2: data type (0: fp32; 1: fp16, 2: bf16, 3: int8)\n");
         printf("arg3: matrix layout (0: A[g, m, k] * B[g, k, n] = C[g, m, n];\n");
         printf("                     1: A[g, m, k] * B[g, n, k] = C[g, m, n];\n");
@@ -195,3 +199,5 @@ int profile_batched_gemm(int argc, char* argv[])
         return 1;
     }
 }
+
+REGISTER_PROFILER_OPERATION(OP_NAME, OP_DESC, profile_batched_gemm);
diff --git a/profiler/src/profile_batched_gemm_add_relu_gemm_add.cpp b/profiler/src/profile_batched_gemm_add_relu_gemm_add.cpp
index 1aca388715..f440a3094e 100644
--- a/profiler/src/profile_batched_gemm_add_relu_gemm_add.cpp
+++ b/profiler/src/profile_batched_gemm_add_relu_gemm_add.cpp
@@ -6,7 +6,8 @@
 #include <initializer_list>
 #include <cstdlib>
 
-#include "profiler/include/profile_batched_gemm_add_relu_gemm_add_impl.hpp"
+#include "profiler/profile_batched_gemm_add_relu_gemm_add_impl.hpp"
+#include "profiler_operation_registry.hpp"
 
 using F16 = ck::half_t;
 using F32 = float;
@@ -14,6 +15,9 @@ using F32 = float;
 using Row = ck::tensor_layout::gemm::RowMajor;
 using Col = ck::tensor_layout::gemm::ColumnMajor;
 
+#define OP_NAME "batched_gemm_add_relu_gemm_add"
+#define OP_DESC "Batched GEMM+Add+Relu+GEMM+Add"
+
 int profile_batched_gemm_add_relu_gemm_add(int argc, char* argv[])
 {
     enum struct GemmMatrixLayout
@@ -109,8 +113,7 @@ int profile_batched_gemm_add_relu_gemm_add(int argc, char* argv[])
     }
     else
     {
-        printf("arg1: tensor operation (batched_gemm_add_relu_gemm_add: "
-               "Batched_GEMM+Add+Relu+Gemm+Add)\n");
+        printf("arg1: tensor operation (" OP_NAME ": " OP_DESC ")\n");
         printf("arg2: data type (1: fp16)\n");
         printf("arg3: matrix layout (0: Relu(A0[m, k] * B0[n, k] + D0[m, n]) * B1[n, o] + D1[m, o] "
                "= E1[m, o]; 1: Relu(A0[m, k] * B0[n, k] + D0[m, n]) * B1[o, n] + D1[m, o] = "
@@ -207,3 +210,5 @@ int profile_batched_gemm_add_relu_gemm_add(int argc, char* argv[])
 
     return 0;
 }
+
+REGISTER_PROFILER_OPERATION(OP_NAME, OP_DESC, profile_batched_gemm_add_relu_gemm_add);
diff --git a/profiler/src/profile_batched_gemm_gemm.cpp b/profiler/src/profile_batched_gemm_gemm.cpp
index a28c494a0e..6015c93be3 100644
--- a/profiler/src/profile_batched_gemm_gemm.cpp
+++ b/profiler/src/profile_batched_gemm_gemm.cpp
@@ -6,7 +6,8 @@
 #include <initializer_list>
 #include <cstdlib>
 
-#include "profiler/include/profile_batched_gemm_gemm_impl.hpp"
+#include "profiler/profile_batched_gemm_gemm_impl.hpp"
+#include "profiler_operation_registry.hpp"
 
 using F16 = ck::half_t;
 using F32 = float;
@@ -14,6 +15,9 @@ using F32 = float;
 using Row = ck::tensor_layout::gemm::RowMajor;
 using Col = ck::tensor_layout::gemm::ColumnMajor;
 
+#define OP_NAME "batched_gemm_gemm"
+#define OP_DESC "Batched GEMM+GEMM"
+
 int profile_batched_gemm_gemm(int argc, char* argv[])
 {
     enum struct GemmMatrixLayout
@@ -101,7 +105,7 @@ int profile_batched_gemm_gemm(int argc, char* argv[])
     }
     else
     {
-        printf("arg1: tensor operation (batched_gemm_gemm: Batched_GEMM+Gemm)\n");
+        printf("arg1: tensor operation (" OP_NAME ": " OP_DESC ")\n");
         printf("arg2: data type (1: fp16)\n");
         printf("arg3: matrix layout (0: Relu(A0[m, k] * B0[n, k] + D0[m, n]) * B1[n, o] + D1[m, o] "
                "= E1[m, o];  1: Relu(A0[m, k] * B0[n, k] + D0[m, n]) * B1[o, n] + D1[m, o] = E1[m, "
@@ -179,3 +183,5 @@ int profile_batched_gemm_gemm(int argc, char* argv[])
 
     return 0;
 }
+
+REGISTER_PROFILER_OPERATION(OP_NAME, OP_DESC, profile_batched_gemm_gemm);
diff --git a/profiler/src/profile_batched_gemm_reduce.cpp b/profiler/src/profile_batched_gemm_reduce.cpp
index d734b5d87b..6b1dfc0142 100644
--- a/profiler/src/profile_batched_gemm_reduce.cpp
+++ b/profiler/src/profile_batched_gemm_reduce.cpp
@@ -6,7 +6,11 @@
 #include <initializer_list>
 #include <cstdlib>
 
-#include "profiler/include/profile_batched_gemm_reduce_impl.hpp"
+#include "profiler/profile_batched_gemm_reduce_impl.hpp"
+#include "profiler_operation_registry.hpp"
+
+#define OP_NAME "batched_gemm_reduce"
+#define OP_DESC "Batched GEMM+Reduce"
 
 int profile_batched_gemm_reduce(int argc, char* argv[])
 {
@@ -26,7 +30,7 @@ int profile_batched_gemm_reduce(int argc, char* argv[])
 
     if(argc != 15)
     {
-        printf("arg1: tensor operation (batched_gemm_reduce: BatchedGEMM+Reduce)\n");
+        printf("arg1: tensor operation (" OP_NAME ": " OP_DESC ")\n");
         printf("arg2: data type (0: fp32; 1: fp16)\n");
         printf("arg3: matrix layout (0: A[m, k] * B[k, n] = C[m, n];\n");
         printf("                     1: A[m, k] * B[n, k] = C[m, n];\n");
@@ -151,3 +155,5 @@ int profile_batched_gemm_reduce(int argc, char* argv[])
 
     return 0;
 }
+
+REGISTER_PROFILER_OPERATION(OP_NAME, OP_DESC, profile_batched_gemm_reduce);
diff --git a/profiler/src/profile_batchnorm_bwd.cpp b/profiler/src/profile_batchnorm_bwd.cpp
index d5938a1e6b..44ce7350ff 100644
--- a/profiler/src/profile_batchnorm_bwd.cpp
+++ b/profiler/src/profile_batchnorm_bwd.cpp
@@ -6,7 +6,8 @@
 #include <getopt.h>
 
 #include "ck/library/utility/host_common_util.hpp"
-#include "profiler/include/profile_batchnorm_backward_impl.hpp"
+#include "profiler/profile_batchnorm_backward_impl.hpp"
+#include "profiler_operation_registry.hpp"
 
 using ck::index_t;
 
@@ -202,3 +203,5 @@ int profile_batchnorm_backward(int argc, char* argv[])
 
     return 0;
 }
+
+REGISTER_PROFILER_OPERATION("bnorm_bwd", "Batchnorm backward", profile_batchnorm_backward);
diff --git a/profiler/src/profile_batchnorm_fwd.cpp b/profiler/src/profile_batchnorm_fwd.cpp
index db443e5d7b..902a1fc423 100644
--- a/profiler/src/profile_batchnorm_fwd.cpp
+++ b/profiler/src/profile_batchnorm_fwd.cpp
@@ -6,7 +6,8 @@
 #include <getopt.h>
 
 #include "ck/library/utility/host_common_util.hpp"
-#include "profiler/include/profile_batchnorm_forward_impl.hpp"
+#include "profiler/profile_batchnorm_forward_impl.hpp"
+#include "profiler_operation_registry.hpp"
 
 using ck::index_t;
 
@@ -214,3 +215,5 @@ int profile_batchnorm_forward(int argc, char* argv[])
 
     return 0;
 }
+
+REGISTER_PROFILER_OPERATION("bnorm_fwd", "Batchnorm forward", profile_batchnorm_forward);
diff --git a/profiler/src/profile_conv_bwd_data.cpp b/profiler/src/profile_conv_bwd_data.cpp
index cf42afd2aa..9241ead738 100644
--- a/profiler/src/profile_conv_bwd_data.cpp
+++ b/profiler/src/profile_conv_bwd_data.cpp
@@ -6,7 +6,8 @@
 #include <initializer_list>
 #include <cstdlib>
 
-#include "profiler/include/profile_conv_bwd_data_impl.hpp"
+#include "profiler/profile_conv_bwd_data_impl.hpp"
+#include "profiler_operation_registry.hpp"
 
 namespace {
 
@@ -24,10 +25,13 @@ enum struct ConvDataType
     INT8_INT8_INT8, // 3
 };
 
+#define OP_NAME "conv_bwd_data"
+#define OP_DESC "Convolution Backward Data"
+
 static void print_helper_msg()
 {
     std::cout
-        << "arg1: tensor operation (conv_bwd_data: Convolution Backward Data)\n"
+        << "arg1: tensor operation (" OP_NAME ": " OP_DESC ")\n"
         << "arg2: data type (0: Input fp32, Weight fp32, Output fp32\n"
         << "                 1: Input fp16, Weight fp16, Output fp16\n"
         << "                 2: Input bf16, Weight bf16, Output bf16\n"
@@ -182,3 +186,5 @@ int profile_conv_bwd_data(int argc, char* argv[])
 
     return 1;
 }
+
+REGISTER_PROFILER_OPERATION(OP_NAME, OP_DESC, profile_conv_bwd_data);
diff --git a/profiler/src/profile_conv_fwd.cpp b/profiler/src/profile_conv_fwd.cpp
index 72b6a6b629..b57ee7fd94 100644
--- a/profiler/src/profile_conv_fwd.cpp
+++ b/profiler/src/profile_conv_fwd.cpp
@@ -6,7 +6,8 @@
 #include <initializer_list>
 #include <cstdlib>
 
-#include "profiler/include/profile_conv_fwd_impl.hpp"
+#include "profiler/profile_conv_fwd_impl.hpp"
+#include "profiler_operation_registry.hpp"
 
 namespace {
 
@@ -24,11 +25,14 @@ enum struct ConvDataType
     INT8_INT8_INT8, // 3
 };
 
+#define OP_NAME "conv_fwd"
+#define OP_DESC "Convolution Forward"
+
 static void print_helper_msg()
 {
     std::cout
         // clang-format-off
-        << "arg1: tensor operation (conv_fwd: Convolution Forward)\n"
+        << "arg1: tensor operation (" OP_NAME ": " OP_DESC ")\n"
         << "arg2: data type (0: Input fp32, Weight fp32, Output fp32\n"
         << "                 1: Input fp16, Weight fp16, Output fp16\n"
         << "                 2: Input bf16, Weight bf16, Output bf16\n"
@@ -184,3 +188,5 @@ int profile_conv_fwd(int argc, char* argv[])
 
     return 1;
 }
+
+REGISTER_PROFILER_OPERATION(OP_NAME, OP_DESC, profile_conv_fwd);
diff --git a/profiler/src/profile_conv_fwd_bias_relu.cpp b/profiler/src/profile_conv_fwd_bias_relu.cpp
index 91f4836a2b..b44007cde4 100644
--- a/profiler/src/profile_conv_fwd_bias_relu.cpp
+++ b/profiler/src/profile_conv_fwd_bias_relu.cpp
@@ -6,7 +6,8 @@
 #include <initializer_list>
 #include <cstdlib>
 
-#include "profiler/include/profile_conv_fwd_bias_relu_impl.hpp"
+#include "profiler/profile_conv_fwd_bias_relu_impl.hpp"
+#include "profiler_operation_registry.hpp"
 
 enum struct ConvDataType
 {
@@ -32,11 +33,14 @@ enum struct ConvOutputLayout
     NHWK, // 1
 };
 
+#define OP_NAME "conv_fwd_bias_relu"
+#define OP_DESC "Convolution Forward+Bias+ReLU"
+
 int profile_conv_fwd_bias_relu(int argc, char* argv[])
 {
     if(argc != 25)
     {
-        printf("arg1: tensor operation (conv_fwd_bias_relu: ForwardConvolution+Bias+ReLu)\n");
+        printf("arg1: tensor operation (" OP_NAME ": " OP_DESC ")\n");
         printf("arg2: data type (0: fp32; 1: fp16)\n");
         printf("arg3: input tensor layout (0: NCHW; 1: NHWC)\n");
         printf("arg4: weight tensor layout (0: KCYX; 1: KYXC)\n");
@@ -114,3 +118,5 @@ int profile_conv_fwd_bias_relu(int argc, char* argv[])
 
     return 0;
 }
+
+REGISTER_PROFILER_OPERATION(OP_NAME, OP_DESC, profile_conv_fwd_bias_relu);
diff --git a/profiler/src/profile_conv_fwd_bias_relu_add.cpp b/profiler/src/profile_conv_fwd_bias_relu_add.cpp
index 5cc6faba34..408dd02f78 100644
--- a/profiler/src/profile_conv_fwd_bias_relu_add.cpp
+++ b/profiler/src/profile_conv_fwd_bias_relu_add.cpp
@@ -6,7 +6,8 @@
 #include <initializer_list>
 #include <cstdlib>
 
-#include "profiler/include/profile_conv_fwd_bias_relu_add_impl.hpp"
+#include "profiler/profile_conv_fwd_bias_relu_add_impl.hpp"
+#include "profiler_operation_registry.hpp"
 
 enum struct ConvDataType
 {
@@ -32,12 +33,14 @@ enum struct ConvOutputLayout
     NHWK, // 1
 };
 
+#define OP_NAME "conv_fwd_bias_relu_add"
+#define OP_DESC "Convolution Forward+Bias+ReLU+Add"
+
 int profile_conv_fwd_bias_relu_add(int argc, char* argv[])
 {
     if(argc != 25)
     {
-        printf(
-            "arg1: tensor operation (conv_fwd_bias_relu_add: ForwardConvolution+Bias+ReLu+Add)\n");
+        printf("arg1: tensor operation (" OP_NAME ": " OP_DESC ")\n");
         printf("arg2: data type (0: fp32; 1: fp16)\n");
         printf("arg3: input tensor layout (0: NCHW; 1: NHWC)\n");
         printf("arg4: weight tensor layout (0: KCYX; 1: KYXC)\n");
@@ -115,3 +118,5 @@ int profile_conv_fwd_bias_relu_add(int argc, char* argv[])
 
     return 0;
 }
+
+REGISTER_PROFILER_OPERATION(OP_NAME, OP_DESC, profile_conv_fwd_bias_relu_add);
diff --git a/profiler/src/profile_gemm.cpp b/profiler/src/profile_gemm.cpp
index 70219c4c8c..61bae6ae70 100644
--- a/profiler/src/profile_gemm.cpp
+++ b/profiler/src/profile_gemm.cpp
@@ -6,7 +6,8 @@
 #include <initializer_list>
 #include <cstdlib>
 
-#include "profiler/include/profile_gemm_impl.hpp"
+#include "profiler/profile_gemm_impl.hpp"
+#include "profiler_operation_registry.hpp"
 
 enum struct GemmMatrixLayout
 {
@@ -24,9 +25,12 @@ enum struct GemmDataType
     INT8_INT8_INT8, // 3
 };
 
+#define OP_NAME "gemm"
+#define OP_DESC "GEMM"
+
 static void print_helper_msg()
 {
-    std::cout << "arg1: tensor operation (gemm: GEMM)\n"
+    std::cout << "arg1: tensor operation (" OP_NAME ": " OP_DESC ")\n"
               << "arg2: data type (0: fp32; 1: fp16; 2: bf16; 3: int8)\n"
               << "arg3: matrix layout (0: A[m, k] * B[k, n] = C[m, n];\n"
               << "                     1: A[m, k] * B[n, k] = C[m, n];\n"
@@ -184,3 +188,5 @@ int profile_gemm(int argc, char* argv[])
         return 1;
     }
 }
+
+REGISTER_PROFILER_OPERATION(OP_NAME, OP_DESC, profile_gemm);
diff --git a/profiler/src/profile_gemm_add_add_fastgelu.cpp b/profiler/src/profile_gemm_add_add_fastgelu.cpp
index 8d3d280d7b..c3c0fb7b67 100644
--- a/profiler/src/profile_gemm_add_add_fastgelu.cpp
+++ b/profiler/src/profile_gemm_add_add_fastgelu.cpp
@@ -6,7 +6,11 @@
 #include <initializer_list>
 #include <cstdlib>
 
-#include "profiler/include/profile_gemm_add_add_fastgelu_impl.hpp"
+#include "profiler/profile_gemm_add_add_fastgelu_impl.hpp"
+#include "profiler_operation_registry.hpp"
+
+#define OP_NAME "gemm_add_add_fastgelu"
+#define OP_DESC "GEMM+Add+Add+FastGeLU"
 
 int profile_gemm_add_add_fastgelu(int argc, char* argv[])
 {
@@ -29,7 +33,7 @@ int profile_gemm_add_add_fastgelu(int argc, char* argv[])
     if(argc != 16)
     {
         // clang-format off
-        printf("arg1: tensor operation (gemm_add_add_fastgelu: GEMM+Add+Add+FastGeLU)\n");
+        printf("arg1: tensor operation (" OP_NAME ": " OP_DESC ")\n");
         printf("arg2: data type (0: fp32; 1: fp16; 2: bf16; 3: int8)\n");
         printf("arg3: matrix layout (0: E[m, n] = FastGeLU(A[m, k] * B[k, n] + D0[m, n] + D1[m, n]);\n");
         printf("                     1: E[m, n] = FastGeLU(A[m, k] * B[n, k] + D0[m, n] + D1[m, n]);\n");
@@ -150,3 +154,5 @@ int profile_gemm_add_add_fastgelu(int argc, char* argv[])
         return 1;
     }
 }
+
+REGISTER_PROFILER_OPERATION(OP_NAME, OP_DESC, profile_gemm_add_add_fastgelu);
diff --git a/profiler/src/profile_gemm_bias_add_reduce.cpp b/profiler/src/profile_gemm_bias_add_reduce.cpp
index bc2675703f..6d86db0822 100644
--- a/profiler/src/profile_gemm_bias_add_reduce.cpp
+++ b/profiler/src/profile_gemm_bias_add_reduce.cpp
@@ -6,7 +6,11 @@
 #include <initializer_list>
 #include <cstdlib>
 
-#include "profiler/include/profile_gemm_bias_add_reduce_impl.hpp"
+#include "profiler/profile_gemm_bias_add_reduce_impl.hpp"
+#include "profiler_operation_registry.hpp"
+
+#define OP_NAME "gemm_bias_add_reduce"
+#define OP_DESC "GEMM+Bias+Add+Reduce"
 
 int profile_gemm_bias_add_reduce(int argc, char* argv[])
 {
@@ -26,7 +30,7 @@ int profile_gemm_bias_add_reduce(int argc, char* argv[])
 
     if(!(argc == 14 || argc == 15))
     {
-        printf("arg1: tensor operation (gemm: GEMM+bias+add+Reduce)\n");
+        printf("arg1: tensor operation (" OP_NAME ": " OP_DESC ")\n");
         printf("arg2: data type (0: fp32; 1: fp16)\n");
         printf("arg3: matrix layout (0: A[m, k] * B[k, n] = C[m, n];\n");
         printf("                     1: A[m, k] * B[n, k] = C[m, n];\n");
@@ -159,3 +163,5 @@ int profile_gemm_bias_add_reduce(int argc, char* argv[])
 
     return 0;
 }
+
+REGISTER_PROFILER_OPERATION(OP_NAME, OP_DESC, profile_gemm_bias_add_reduce);
diff --git a/profiler/src/profile_gemm_bilinear.cpp b/profiler/src/profile_gemm_bilinear.cpp
index 4f7e5a800d..3480014ba6 100644
--- a/profiler/src/profile_gemm_bilinear.cpp
+++ b/profiler/src/profile_gemm_bilinear.cpp
@@ -6,7 +6,11 @@
 #include <initializer_list>
 #include <cstdlib>
 
-#include "profiler/include/profile_gemm_bilinear_impl.hpp"
+#include "profiler/profile_gemm_bilinear_impl.hpp"
+#include "profiler_operation_registry.hpp"
+
+#define OP_NAME "gemm_bilinear"
+#define OP_DESC "GEMM+Bilinear"
 
 int profile_gemm_bilinear(int argc, char* argv[])
 {
@@ -29,7 +33,7 @@ int profile_gemm_bilinear(int argc, char* argv[])
     if(argc != 17)
     {
         // clang-format off
-        printf("arg1: tensor operation (gemm_bilinear: GEMM+Bilinear)\n");
+        printf("arg1: tensor operation (" OP_NAME ": " OP_DESC ")\n");
         printf("arg2: data type (0: fp32; 1: fp16; 2: bf16; 3: int8)\n");
         printf("arg3: matrix layout (0: E[m, n] = alpha * A[m, k] * B[k, n] + beta * D[m, n];\n");
         printf("                     1: E[m, n] = alpha * A[m, k] * B[n, k] + beta * D[m, n];\n");
@@ -144,3 +148,5 @@ int profile_gemm_bilinear(int argc, char* argv[])
         return 1;
     }
 }
+
+REGISTER_PROFILER_OPERATION(OP_NAME, OP_DESC, profile_gemm_bilinear);
diff --git a/profiler/src/profile_gemm_reduce.cpp b/profiler/src/profile_gemm_reduce.cpp
index 476943c8a7..395bf0627e 100644
--- a/profiler/src/profile_gemm_reduce.cpp
+++ b/profiler/src/profile_gemm_reduce.cpp
@@ -6,7 +6,11 @@
 #include <initializer_list>
 #include <cstdlib>
 
-#include "profiler/include/profile_gemm_reduce_impl.hpp"
+#include "profiler/profile_gemm_reduce_impl.hpp"
+#include "profiler_operation_registry.hpp"
+
+#define OP_NAME "gemm_reduce"
+#define OP_DESC "GEMM+Reduce"
 
 int profile_gemm_reduce(int argc, char* argv[])
 {
@@ -26,7 +30,7 @@ int profile_gemm_reduce(int argc, char* argv[])
 
     if(!(argc == 14 || argc == 15))
     {
-        printf("arg1: tensor operation (gemm: GEMM+Reduce)\n");
+        printf("arg1: tensor operation (" OP_NAME ": " OP_DESC ")\n");
         printf("arg2: data type (0: fp32; 1: fp16)\n");
         printf("arg3: matrix layout (0: A[m, k] * B[k, n] = C[m, n];\n");
         printf("                     1: A[m, k] * B[n, k] = C[m, n];\n");
@@ -146,3 +150,5 @@ int profile_gemm_reduce(int argc, char* argv[])
 
     return 0;
 }
+
+REGISTER_PROFILER_OPERATION(OP_NAME, OP_DESC, profile_gemm_reduce);
diff --git a/profiler/src/profile_gemm_splitk.cpp b/profiler/src/profile_gemm_splitk.cpp
index fff023c8e0..f636ce718c 100644
--- a/profiler/src/profile_gemm_splitk.cpp
+++ b/profiler/src/profile_gemm_splitk.cpp
@@ -6,7 +6,8 @@
 #include <initializer_list>
 #include <cstdlib>
 
-#include "profiler/include/profile_gemm_splitk_impl.hpp"
+#include "profiler/profile_gemm_splitk_impl.hpp"
+#include "profiler_operation_registry.hpp"
 
 enum struct GemmMatrixLayout
 {
@@ -24,11 +25,14 @@ enum struct GemmDataType
     INT8_INT8_INT8, // 3
 };
 
+#define OP_NAME "gemm_splitk"
+#define OP_DESC "Split-K GEMM"
+
 int profile_gemm_splitk(int argc, char* argv[])
 {
     if(argc != 15)
     {
-        printf("arg1: tensor operation (gemm_splitk: Split-K GEMM)\n");
+        printf("arg1: tensor operation (" OP_NAME ": " OP_DESC ")\n");
         printf("arg2: data type (0: fp32; 1: fp16; 2: bf16; 3: int8)\n");
         printf("arg3: matrix layout (0: A[m, k] * B[k, n] = C[m, n];\n");
         printf("                     1: A[m, k] * B[n, k] = C[m, n];\n");
@@ -146,3 +150,5 @@ int profile_gemm_splitk(int argc, char* argv[])
         return 1;
     }
 }
+
+REGISTER_PROFILER_OPERATION(OP_NAME, OP_DESC, profile_gemm_splitk);
diff --git a/profiler/src/profile_grouped_conv_bwd_weight.cpp b/profiler/src/profile_grouped_conv_bwd_weight.cpp
index deb5741cef..dfd8a099f5 100644
--- a/profiler/src/profile_grouped_conv_bwd_weight.cpp
+++ b/profiler/src/profile_grouped_conv_bwd_weight.cpp
@@ -6,7 +6,8 @@
 #include <iostream>
 #include <numeric>
 
-#include "profiler/include/profile_grouped_conv_bwd_weight_impl.hpp"
+#include "profiler/profile_grouped_conv_bwd_weight_impl.hpp"
+#include "profiler_operation_registry.hpp"
 
 namespace {
 
@@ -23,9 +24,12 @@ enum struct ConvDataType
     BF16_F32_BF16, // 2
 };
 
+#define OP_NAME "grouped_conv_bwd_weight"
+#define OP_DESC "Grouped Convolution Backward Weight"
+
 static void print_helper_msg()
 {
-    std::cout << "arg1: tensor operation (conv_bwd_weight: Convolution Backward Weight\n"
+    std::cout << "arg1: tensor operation (" OP_NAME ": " OP_DESC ")\n"
               << "arg2: data type (0: Input fp32, Weight fp32, Output fp32\n"
               << "                 1: Input fp16, Weight fp16, Output fp16\n"
               << "                 2: Input bf16, Weight fp32, Output bf16)\n"
@@ -174,3 +178,5 @@ int profile_grouped_conv_bwd_weight(int argc, char* argv[])
 
     return 1;
 }
+
+REGISTER_PROFILER_OPERATION(OP_NAME, OP_DESC, profile_grouped_conv_bwd_weight);
diff --git a/profiler/src/profile_grouped_conv_fwd.cpp b/profiler/src/profile_grouped_conv_fwd.cpp
index cb7c69b473..9ff3c15af0 100644
--- a/profiler/src/profile_grouped_conv_fwd.cpp
+++ b/profiler/src/profile_grouped_conv_fwd.cpp
@@ -6,7 +6,8 @@
 #include <initializer_list>
 #include <cstdlib>
 
-#include "profiler/include/profile_grouped_conv_fwd_impl.hpp"
+#include "profiler/profile_grouped_conv_fwd_impl.hpp"
+#include "profiler_operation_registry.hpp"
 
 namespace {
 
@@ -24,11 +25,14 @@ enum struct ConvDataType
     INT8_INT8_INT8, // 3
 };
 
+#define OP_NAME "grouped_conv_fwd"
+#define OP_DESC "Grouped Convolution Forward"
+
 static void print_helper_msg()
 {
     std::cout
         // clang-format off
-        << "arg1: tensor operation (grouped_conv_fwd: Grouped Convolution Forward)\n"
+        << "arg1: tensor operation (" OP_NAME ": " OP_DESC ")\n"
         << "arg2: data type (0: Input fp32, Weight fp32, Output fp32\n"
         << "                 1: Input fp16, Weight fp16, Output fp16\n"
         << "                 2: Input bf16, Weight bf16, Output bf16\n"
@@ -252,3 +256,5 @@ int profile_grouped_conv_fwd(int argc, char* argv[])
 
     return 1;
 }
+
+REGISTER_PROFILER_OPERATION(OP_NAME, OP_DESC, profile_grouped_conv_fwd);
diff --git a/profiler/src/profile_grouped_gemm.cpp b/profiler/src/profile_grouped_gemm.cpp
index 1e24c6091b..65e24bd9cc 100644
--- a/profiler/src/profile_grouped_gemm.cpp
+++ b/profiler/src/profile_grouped_gemm.cpp
@@ -6,7 +6,8 @@
 #include <initializer_list>
 #include <cstdlib>
 
-#include "profiler/include/profile_grouped_gemm_impl.hpp"
+#include "profiler/profile_grouped_gemm_impl.hpp"
+#include "profiler_operation_registry.hpp"
 
 enum struct GemmMatrixLayout
 {
@@ -44,11 +45,14 @@ std::vector<int> argToIntArray(char* input)
     return out;
 }
 
+#define OP_NAME "grouped_gemm"
+#define OP_DESC "Grouped GEMM"
+
 int profile_grouped_gemm(int argc, char* argv[])
 {
     if(!(argc == 14))
     {
-        printf("arg1: tensor operation (grouped_gemm: Grouped GEMM)\n");
+        printf("arg1: tensor operation (" OP_NAME ": " OP_DESC ")\n");
         printf("arg2: data type (0: fp32; 1: fp16; 2: bf16; 3: int8)\n");
         printf("arg3: matrix layout (0: A[m, k] * B[k, n] = C[m, n];\n");
         printf("                     1: A[m, k] * B[n, k] = C[m, n];\n");
@@ -161,3 +165,5 @@ int profile_grouped_gemm(int argc, char* argv[])
 
     return 0;
 }
+
+REGISTER_PROFILER_OPERATION(OP_NAME, OP_DESC, profile_grouped_gemm);
diff --git a/profiler/src/profile_groupnorm.cpp b/profiler/src/profile_groupnorm.cpp
index 7eeaca7d45..2741f52717 100644
--- a/profiler/src/profile_groupnorm.cpp
+++ b/profiler/src/profile_groupnorm.cpp
@@ -5,8 +5,9 @@
 #include <vector>
 #include <unordered_map>
 
-#include "profiler/include/data_type_enum.hpp"
-#include "profiler/include/profile_groupnorm_impl.hpp"
+#include "profiler/data_type_enum.hpp"
+#include "profiler/profile_groupnorm_impl.hpp"
+#include "profiler_operation_registry.hpp"
 
 using ck::index_t;
 
@@ -43,9 +44,12 @@ struct GroupnormArgParser
     }
 };
 
+#define OP_NAME "groupnorm"
+#define OP_DESC "Group Normalization"
+
 void print_help_groupnorm()
 {
-    std::cout << "arg1: tensor operation (groupnorm: Group normalization)\n"
+    std::cout << "arg1: tensor operation (" OP_NAME ": " OP_DESC ")\n"
               << "arg2: data type (0: fp16; 1: fp32)\n"
               << "arg3: verification (0: no; 1: yes)\n"
               << "arg4: initialization (0: no init; 1: integer value; 2: decimal value)\n"
@@ -104,3 +108,5 @@ int profile_groupnorm(int argc, char* argv[])
 
     return 0;
 }
+
+REGISTER_PROFILER_OPERATION(OP_NAME, OP_DESC, profile_groupnorm);
diff --git a/profiler/src/profile_layernorm.cpp b/profiler/src/profile_layernorm.cpp
index b090a4e1c8..e93fc2dbd2 100644
--- a/profiler/src/profile_layernorm.cpp
+++ b/profiler/src/profile_layernorm.cpp
@@ -5,8 +5,9 @@
 #include <vector>
 #include <unordered_map>
 
-#include "profiler/include/data_type_enum.hpp"
-#include "profiler/include/profile_layernorm_impl.hpp"
+#include "profiler/data_type_enum.hpp"
+#include "profiler/profile_layernorm_impl.hpp"
+#include "profiler_operation_registry.hpp"
 
 using ck::index_t;
 
@@ -96,3 +97,5 @@ int profile_layernorm(int argc, char* argv[])
 
     return 0;
 }
+
+REGISTER_PROFILER_OPERATION("layernorm", "Layer Normalization", profile_layernorm);
diff --git a/profiler/src/profile_reduce.cpp b/profiler/src/profile_reduce.cpp
index 1ec2a6d6e6..6925371858 100644
--- a/profiler/src/profile_reduce.cpp
+++ b/profiler/src/profile_reduce.cpp
@@ -13,8 +13,9 @@
 
 #include "ck/library/utility/host_common_util.hpp"
 
-#include "profiler/include/profile_reduce_impl.hpp"
-#include "profiler/include/data_type_enum.hpp"
+#include "profiler/profile_reduce_impl.hpp"
+#include "profiler/data_type_enum.hpp"
+#include "profiler_operation_registry.hpp"
 
 using namespace std;
 
@@ -429,3 +430,5 @@ int profile_reduce(int argc, char* argv[])
 
     return (0);
 };
+
+REGISTER_PROFILER_OPERATION("reduce", "Reduce", profile_reduce);
diff --git a/profiler/src/profile_softmax.cpp b/profiler/src/profile_softmax.cpp
index 0cf4e2b5d5..30f627dd29 100644
--- a/profiler/src/profile_softmax.cpp
+++ b/profiler/src/profile_softmax.cpp
@@ -5,7 +5,8 @@
 #include <vector>
 #include <unordered_map>
 
-#include "profiler/include/profile_softmax_impl.hpp"
+#include "profiler/profile_softmax_impl.hpp"
+#include "profiler_operation_registry.hpp"
 
 using ck::index_t;
 using ck::profiler::SoftmaxDataType;
@@ -164,3 +165,5 @@ int profile_softmax(int argc, char* argv[])
 //     profile_normalization(argc, argv);
 //     return 0;
 // }
+
+REGISTER_PROFILER_OPERATION("softmax", "Softmax", profile_softmax);
diff --git a/profiler/src/profiler.cpp b/profiler/src/profiler.cpp
index 34d0f5409f..080117e390 100644
--- a/profiler/src/profiler.cpp
+++ b/profiler/src/profiler.cpp
@@ -1,56 +1,14 @@
 // SPDX-License-Identifier: MIT
 // Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
 
-#include <cstring>
+#include <cstdlib>
+#include <iostream>
 
-int profile_gemm(int, char*[]);
-int profile_gemm_splitk(int, char*[]);
-int profile_gemm_bilinear(int, char*[]);
-int profile_gemm_add_add_fastgelu(int, char*[]);
-int profile_gemm_reduce(int, char*[]);
-int profile_gemm_bias_add_reduce(int, char*[]);
-int profile_batched_gemm(int, char*[]);
-int profile_batched_gemm_gemm(int, char*[]);
-int profile_batched_gemm_add_relu_gemm_add(int, char*[]);
-int profile_batched_gemm_reduce(int, char*[]);
-int profile_grouped_gemm(int, char*[]);
-int profile_conv_fwd(int, char*[]);
-int profile_conv_fwd_bias_relu(int, char*[]);
-int profile_conv_fwd_bias_relu_add(int, char*[]);
-int profile_conv_bwd_data(int, char*[]);
-int profile_grouped_conv_fwd(int, char*[]);
-int profile_grouped_conv_bwd_weight(int, char*[]);
-int profile_softmax(int, char*[]);
-int profile_layernorm(int, char*[]);
-int profile_groupnorm(int, char*[]);
-int profile_reduce(int, char*[]);
-int profile_batchnorm_forward(int, char*[]);
-int profile_batchnorm_backward(int, char*[]);
+#include "profiler_operation_registry.hpp"
 
 static void print_helper_message()
 {
-    // clang-format off
-    printf("arg1: tensor operation (gemm: GEMM\n"
-           "                        gemm_splitk: Split-K GEMM\n"
-           "                        gemm_bilinear: GEMM+Bilinear\n"
-           "                        gemm_add_add_fastgelu: GEMM+Add+Add+FastGeLU\n"
-           "                        gemm_reduce: GEMM+Reduce\n"
-           "                        gemm_bias_add_reduce: GEMM+Bias+Add+Reduce\n"
-           "                        batched_gemm: Batched GEMM\n"
-           "                        batched_gemm_gemm: Batched+GEMM+GEMM\n"
-           "                        batched_gemm_add_relu_gemm_add: Batched+GEMM+bias+gelu+GEMM+bias\n"
-           "                        batched_gemm_reduce: Batched GEMM+Reduce\n"
-           "                        grouped_gemm: Grouped GEMM\n"
-           "                        conv_fwd: Convolution Forward\n"
-           "                        conv_fwd_bias_relu: ForwardConvolution+Bias+ReLU\n"
-           "                        conv_fwd_bias_relu_add: ForwardConvolution+Bias+ReLU+Add\n"
-           "                        conv_bwd_data: Convolution Backward Data\n"
-           "                        grouped_conv_fwd: Grouped Convolution Forward\n"
-           "                        grouped_conv_bwd_weight: Grouped Convolution Backward Weight\n"
-           "                        softmax: Softmax\n"
-           "                        reduce: Reduce\n"
-	   "                        bnorm_fwd: Batchnorm forward\n");
-    // clang-format on
+    std::cout << "arg1: tensor operation " << ProfilerOperationRegistry::GetInstance() << std::endl;
 }
 
 int main(int argc, char* argv[])
@@ -58,105 +16,15 @@ int main(int argc, char* argv[])
     if(argc == 1)
     {
         print_helper_message();
-
-        return 0;
     }
-    else if(strcmp(argv[1], "gemm") == 0)
+    else if(const auto operation = ProfilerOperationRegistry::GetInstance().Get(argv[1]);
+            operation.has_value())
     {
-        return profile_gemm(argc, argv);
-    }
-    else if(strcmp(argv[1], "gemm_splitk") == 0)
-    {
-        return profile_gemm_splitk(argc, argv);
-    }
-    else if(strcmp(argv[1], "gemm_bilinear") == 0)
-    {
-        return profile_gemm_bilinear(argc, argv);
-    }
-    else if(strcmp(argv[1], "gemm_add_add_fastgelu") == 0)
-    {
-        return profile_gemm_add_add_fastgelu(argc, argv);
-    }
-    else if(strcmp(argv[1], "gemm_reduce") == 0)
-    {
-        return profile_gemm_reduce(argc, argv);
-    }
-    else if(strcmp(argv[1], "gemm_bias_add_reduce") == 0)
-    {
-        return profile_gemm_bias_add_reduce(argc, argv);
-    }
-    else if(strcmp(argv[1], "batched_gemm") == 0)
-    {
-        return profile_batched_gemm(argc, argv);
-    }
-    else if(strcmp(argv[1], "batched_gemm_gemm") == 0)
-    {
-        return profile_batched_gemm_gemm(argc, argv);
-    }
-    else if(strcmp(argv[1], "batched_gemm_add_relu_gemm_add") == 0)
-    {
-        return profile_batched_gemm_add_relu_gemm_add(argc, argv);
-    }
-    else if(strcmp(argv[1], "batched_gemm_reduce") == 0)
-    {
-        return profile_batched_gemm_reduce(argc, argv);
-    }
-    else if(strcmp(argv[1], "grouped_gemm") == 0)
-    {
-        return profile_grouped_gemm(argc, argv);
-    }
-    else if(strcmp(argv[1], "conv_fwd") == 0)
-    {
-        return profile_conv_fwd(argc, argv);
-    }
-    else if(strcmp(argv[1], "conv_fwd_bias_relu") == 0)
-    {
-        return profile_conv_fwd_bias_relu(argc, argv);
-    }
-    else if(strcmp(argv[1], "conv_fwd_bias_relu_add") == 0)
-    {
-        return profile_conv_fwd_bias_relu_add(argc, argv);
-    }
-    else if(strcmp(argv[1], "conv_bwd_data") == 0)
-    {
-        return profile_conv_bwd_data(argc, argv);
-    }
-    else if(strcmp(argv[1], "grouped_conv_fwd") == 0)
-    {
-        return profile_grouped_conv_fwd(argc, argv);
-    }
-    else if(strcmp(argv[1], "conv_bwd_weight") == 0)
-    {
-        return profile_grouped_conv_bwd_weight(argc, argv);
-    }
-    else if(strcmp(argv[1], "reduce") == 0)
-    {
-        return profile_reduce(argc, argv);
-    }
-    else if(strcmp(argv[1], "softmax") == 0)
-    {
-        return profile_softmax(argc, argv);
-    }
-    else if(strcmp(argv[1], "layernorm") == 0)
-    {
-        return profile_layernorm(argc, argv);
-    }
-    else if(strcmp(argv[1], "groupnorm") == 0)
-    {
-        return profile_groupnorm(argc, argv);
-    }
-    else if(strcmp(argv[1], "bnorm_fwd") == 0)
-    {
-        return profile_batchnorm_forward(argc, argv);
-    }
-    else if(strcmp(argv[1], "bnorm_bwd") == 0)
-    {
-        return profile_batchnorm_backward(argc, argv);
+        return (*operation)(argc, argv);
     }
     else
     {
-        print_helper_message();
-
-        return 0;
+        std::cerr << "cannot find operation: " << argv[1] << std::endl;
+        return EXIT_FAILURE;
     }
 }
diff --git a/profiler/src/profiler_operation_registry.hpp b/profiler/src/profiler_operation_registry.hpp
new file mode 100644
index 0000000000..91ff291233
--- /dev/null
+++ b/profiler/src/profiler_operation_registry.hpp
@@ -0,0 +1,103 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+
+#pragma once
+
+#include <functional>
+#include <iostream>
+#include <iterator>
+#include <map>
+#include <optional>
+#include <string_view>
+#include <utility>
+
+/// Process-wide table that maps a profiler operation name to its description and
+/// entry point.
+///
+/// Operations are added through REGISTER_PROFILER_OPERATION and looked up by name
+/// with Get().
+///
+/// Names and descriptions are held as std::string_view: the registry does not copy the
+/// characters, so callers must pass strings that outlive it (e.g. string literals).
+class ProfilerOperationRegistry final
+{
+    // Private so that the only instance is the one created inside GetInstance().
+    ProfilerOperationRegistry()  = default;
+    ~ProfilerOperationRegistry() = default;
+
+    public:
+    /// Callable invoked with an (argc, argv) pair; returns an int status.
+    using Operation = std::function<int(int, char*[])>;
+
+    // Singleton: a copy would be a second, independent table.
+    ProfilerOperationRegistry(const ProfilerOperationRegistry&) = delete;
+    ProfilerOperationRegistry& operator=(const ProfilerOperationRegistry&) = delete;
+
+    private:
+    /// What is stored for each registered name.
+    struct Entry final
+    {
+        explicit Entry(std::string_view description, Operation operation)
+            : description_(description), operation_(std::move(operation))
+        {
+        }
+
+        std::string_view description_;
+        Operation operation_;
+    };
+
+    // Ordered map, so operator<< lists the operations sorted by name.
+    std::map<std::string_view, Entry> entries_;
+
+    /// Writes "{", one "\t<name>: <description>" line per operation, then "}".
+    friend std::ostream& operator<<(std::ostream& stream, const ProfilerOperationRegistry& registry)
+    {
+        stream << "{\n";
+        for(const auto& [name, entry] : registry.entries_)
+        {
+            stream << "\t" << name << ": " << entry.description_ << "\n";
+        }
+        stream << "}";
+
+        return stream;
+    }
+
+    public:
+    /// Returns the single registry instance, creating it on first use.
+    static ProfilerOperationRegistry& GetInstance()
+    {
+        static ProfilerOperationRegistry registry;
+        return registry;
+    }
+
+    /// Returns a copy of the operation registered under `name`, or std::nullopt if
+    /// no such operation exists.
+    [[nodiscard]] std::optional<Operation> Get(std::string_view name) const
+    {
+        if(const auto found = entries_.find(name); found != entries_.end())
+        {
+            return found->second.operation_;
+        }
+
+        return std::nullopt;
+    }
+
+    /// Registers `operation` under `name`.
+    ///
+    /// @return true if the entry was inserted; false if `name` was already registered,
+    ///         in which case the existing entry is left untouched.
+    bool Add(std::string_view name, std::string_view description, Operation operation)
+    {
+        return entries_.try_emplace(name, description, std::move(operation)).second;
+    }
+};
+
+#define PP_CONCAT(x, y) PP_CONCAT_IMPL(x, y)
+#define PP_CONCAT_IMPL(x, y) x##y
+
+// Registers `operation` during static initialization of the including translation unit.
+// The generated variable only exists to trigger the Add() call; __COUNTER__ keeps its
+// name unique when the macro is used more than once in a file.
+#define REGISTER_PROFILER_OPERATION(name, description, operation)                               \
+    [[maybe_unused]] static const bool PP_CONCAT(operation_registration_result_, __COUNTER__) = \
+        ::ProfilerOperationRegistry::GetInstance().Add(name, description, operation)
diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt
index a3d2bcdc82..a8347d9e38 100644
--- a/test/CMakeLists.txt
+++ b/test/CMakeLists.txt
@@ -1,5 +1,6 @@
 include_directories(BEFORE
     ${PROJECT_SOURCE_DIR}/
+    ${PROJECT_SOURCE_DIR}/profiler/include
 )
 
 include(googletest)
diff --git a/test/batched_gemm/batched_gemm_bf16.cpp b/test/batched_gemm/batched_gemm_bf16.cpp
index 698e9faada..78be540627 100644
--- a/test/batched_gemm/batched_gemm_bf16.cpp
+++ b/test/batched_gemm/batched_gemm_bf16.cpp
@@ -3,7 +3,7 @@
 
 #include <iostream>
 
-#include "profiler/include/profile_batched_gemm_impl.hpp"
+#include "profiler/profile_batched_gemm_impl.hpp"
 
 namespace {
 using ADataType = ck::bhalf_t;
diff --git a/test/batched_gemm/batched_gemm_fp16.cpp b/test/batched_gemm/batched_gemm_fp16.cpp
index 7fc1f24f5f..6cbbedf677 100644
--- a/test/batched_gemm/batched_gemm_fp16.cpp
+++ b/test/batched_gemm/batched_gemm_fp16.cpp
@@ -3,7 +3,7 @@
 
 #include <iostream>
 
-#include "profiler/include/profile_batched_gemm_impl.hpp"
+#include "profiler/profile_batched_gemm_impl.hpp"
 
 namespace {
 using ADataType = ck::half_t;
diff --git a/test/batched_gemm/batched_gemm_fp32.cpp b/test/batched_gemm/batched_gemm_fp32.cpp
index 59072acc50..c9e565e264 100644
--- a/test/batched_gemm/batched_gemm_fp32.cpp
+++ b/test/batched_gemm/batched_gemm_fp32.cpp
@@ -3,7 +3,7 @@
 
 #include <iostream>
 
-#include "profiler/include/profile_batched_gemm_impl.hpp"
+#include "profiler/profile_batched_gemm_impl.hpp"
 
 namespace {
 using ADataType = float;
diff --git a/test/batched_gemm/batched_gemm_int8.cpp b/test/batched_gemm/batched_gemm_int8.cpp
index b68649ddf7..4da941a576 100644
--- a/test/batched_gemm/batched_gemm_int8.cpp
+++ b/test/batched_gemm/batched_gemm_int8.cpp
@@ -3,7 +3,7 @@
 
 #include <iostream>
 
-#include "profiler/include/profile_batched_gemm_impl.hpp"
+#include "profiler/profile_batched_gemm_impl.hpp"
 
 namespace {
 using ADataType = int8_t;
diff --git a/test/batched_gemm_gemm/test_batched_gemm_gemm_util.hpp b/test/batched_gemm_gemm/test_batched_gemm_gemm_util.hpp
index d7fbc37f01..53c4d37c44 100644
--- a/test/batched_gemm_gemm/test_batched_gemm_gemm_util.hpp
+++ b/test/batched_gemm_gemm/test_batched_gemm_gemm_util.hpp
@@ -6,7 +6,7 @@
 #include <vector>
 #include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
 #include "ck/tensor_operation/gpu/device/impl/device_batched_gemm_gemm_xdl_cshuffle.hpp"
-#include "profiler/include/profile_batched_gemm_gemm_impl.hpp"
+#include "profiler/profile_batched_gemm_gemm_impl.hpp"
 
 using ck::tensor_operation::device::GemmSpecialization;
 
diff --git a/test/batched_gemm_reduce/batched_gemm_reduce_fp16.cpp b/test/batched_gemm_reduce/batched_gemm_reduce_fp16.cpp
index 456d21142f..b150ce50d1 100644
--- a/test/batched_gemm_reduce/batched_gemm_reduce_fp16.cpp
+++ b/test/batched_gemm_reduce/batched_gemm_reduce_fp16.cpp
@@ -3,7 +3,7 @@
 
 #include <iostream>
 
-#include "profiler/include/profile_batched_gemm_reduce_impl.hpp"
+#include "profiler/profile_batched_gemm_reduce_impl.hpp"
 
 int main()
 {
diff --git a/test/batched_gemm_softmax_gemm/test_batched_gemm_softmax_gemm_util.hpp b/test/batched_gemm_softmax_gemm/test_batched_gemm_softmax_gemm_util.hpp
index e9fd514cce..98debe19c3 100644
--- a/test/batched_gemm_softmax_gemm/test_batched_gemm_softmax_gemm_util.hpp
+++ b/test/batched_gemm_softmax_gemm/test_batched_gemm_softmax_gemm_util.hpp
@@ -6,7 +6,7 @@
 #include <vector>
 #include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
 #include "ck/tensor_operation/gpu/device/impl/device_batched_gemm_softmax_gemm_xdl_cshuffle.hpp"
-#include "profiler/include/profile_batched_gemm_softmax_gemm_impl.hpp"
+#include "profiler/profile_batched_gemm_softmax_gemm_impl.hpp"
 using ck::tensor_operation::device::GemmSpecialization;
 
 template <ck::index_t N>
diff --git a/test/batched_gemm_softmax_gemm_permute/test_batched_gemm_softmax_gemm_permute_util.hpp b/test/batched_gemm_softmax_gemm_permute/test_batched_gemm_softmax_gemm_permute_util.hpp
index 138b9f8ffc..912bbc91ed 100644
--- a/test/batched_gemm_softmax_gemm_permute/test_batched_gemm_softmax_gemm_permute_util.hpp
+++ b/test/batched_gemm_softmax_gemm_permute/test_batched_gemm_softmax_gemm_permute_util.hpp
@@ -7,7 +7,7 @@
 #include "ck/ck.hpp"
 #include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
 #include "ck/tensor_operation/gpu/device/impl/device_batched_gemm_softmax_gemm_permute_xdl_cshuffle.hpp"
-#include "profiler/include/profile_batched_gemm_softmax_gemm_permute_impl.hpp"
+#include "profiler/profile_batched_gemm_softmax_gemm_permute_impl.hpp"
 
 using ck::tensor_operation::device::GemmSpecialization;
 using ck::tensor_operation::device::MaskingSpecialization;
diff --git a/test/batchnorm/batchnorm_bwd_rank_4.cpp b/test/batchnorm/batchnorm_bwd_rank_4.cpp
index 77590626dc..caa7331ea2 100644
--- a/test/batchnorm/batchnorm_bwd_rank_4.cpp
+++ b/test/batchnorm/batchnorm_bwd_rank_4.cpp
@@ -8,7 +8,7 @@
 #include <tuple>
 #include <gtest/gtest.h>
 
-#include "profiler/include/profile_batchnorm_backward_impl.hpp"
+#include "profiler/profile_batchnorm_backward_impl.hpp"
 
 using F16  = ck::half_t;
 using F32  = float;
diff --git a/test/batchnorm/batchnorm_fwd_rank_4.cpp b/test/batchnorm/batchnorm_fwd_rank_4.cpp
index bc820be462..13aef7d6bf 100644
--- a/test/batchnorm/batchnorm_fwd_rank_4.cpp
+++ b/test/batchnorm/batchnorm_fwd_rank_4.cpp
@@ -8,7 +8,7 @@
 #include <tuple>
 #include <gtest/gtest.h>
 
-#include "profiler/include/profile_batchnorm_forward_impl.hpp"
+#include "profiler/profile_batchnorm_forward_impl.hpp"
 
 using F16  = ck::half_t;
 using F32  = float;
diff --git a/test/convnd_bwd_data/convnd_bwd_data.cpp b/test/convnd_bwd_data/convnd_bwd_data.cpp
index c31e399ef6..70231d42ae 100644
--- a/test/convnd_bwd_data/convnd_bwd_data.cpp
+++ b/test/convnd_bwd_data/convnd_bwd_data.cpp
@@ -8,7 +8,7 @@
 #include <tuple>
 #include <gtest/gtest.h>
 
-#include "profiler/include/profile_conv_bwd_data_impl.hpp"
+#include "profiler/profile_conv_bwd_data_impl.hpp"
 
 template <typename Tuple>
 class TestConvndBwdData : public ::testing::Test
diff --git a/test/convnd_fwd/convnd_fwd.cpp b/test/convnd_fwd/convnd_fwd.cpp
index 7a9782ebc0..a1921a9bfb 100644
--- a/test/convnd_fwd/convnd_fwd.cpp
+++ b/test/convnd_fwd/convnd_fwd.cpp
@@ -8,7 +8,7 @@
 #include <tuple>
 #include <gtest/gtest.h>
 
-#include "profiler/include/profile_conv_fwd_impl.hpp"
+#include "profiler/profile_conv_fwd_impl.hpp"
 
 template <typename Tuple>
 class TestConvndFwd : public ::testing::Test
diff --git a/test/elementwise_normalization/test_elementwise_layernorm_fp16.cpp b/test/elementwise_normalization/test_elementwise_layernorm_fp16.cpp
index f01e963bdb..403881b3cc 100644
--- a/test/elementwise_normalization/test_elementwise_layernorm_fp16.cpp
+++ b/test/elementwise_normalization/test_elementwise_layernorm_fp16.cpp
@@ -2,7 +2,7 @@
 // Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "gtest/gtest.h"
-#include "profiler/include/profile_elementwise_layernorm_impl.hpp"
+#include "profiler/profile_elementwise_layernorm_impl.hpp"
 
 using F16 = ck::half_t;
 using F32 = float;
diff --git a/test/gemm_reduce/gemm_reduce_fp16.cpp b/test/gemm_reduce/gemm_reduce_fp16.cpp
index 16f787e07e..029165ece1 100644
--- a/test/gemm_reduce/gemm_reduce_fp16.cpp
+++ b/test/gemm_reduce/gemm_reduce_fp16.cpp
@@ -3,7 +3,7 @@
 
 #include <iostream>
 
-#include "profiler/include/profile_gemm_reduce_impl.hpp"
+#include "profiler/profile_gemm_reduce_impl.hpp"
 
 int main()
 {
diff --git a/test/grouped_convnd_bwd_weight/grouped_convnd_bwd_weight.cpp b/test/grouped_convnd_bwd_weight/grouped_convnd_bwd_weight.cpp
index 1fc9c50d1e..e14173cb41 100644
--- a/test/grouped_convnd_bwd_weight/grouped_convnd_bwd_weight.cpp
+++ b/test/grouped_convnd_bwd_weight/grouped_convnd_bwd_weight.cpp
@@ -9,7 +9,7 @@
 
 #include <gtest/gtest.h>
 
-#include "profiler/include/profile_grouped_conv_bwd_weight_impl.hpp"
+#include "profiler/profile_grouped_conv_bwd_weight_impl.hpp"
 
 template <typename Tuple>
 class TestGroupedConvndBwdWeight : public ::testing::Test
diff --git a/test/grouped_convnd_fwd/grouped_convnd_fwd.cpp b/test/grouped_convnd_fwd/grouped_convnd_fwd.cpp
index fbd6e9972f..6df7f9969c 100644
--- a/test/grouped_convnd_fwd/grouped_convnd_fwd.cpp
+++ b/test/grouped_convnd_fwd/grouped_convnd_fwd.cpp
@@ -7,7 +7,7 @@
 #include <vector>
 #include <gtest/gtest.h>
 
-#include "profiler/include/profile_grouped_conv_fwd_impl.hpp"
+#include "profiler/profile_grouped_conv_fwd_impl.hpp"
 
 class TestGroupedConvNdFwd : public ::testing::Test
 {
diff --git a/test/grouped_gemm/grouped_gemm_fp16.cpp b/test/grouped_gemm/grouped_gemm_fp16.cpp
index f81875ab73..b3f7cca418 100644
--- a/test/grouped_gemm/grouped_gemm_fp16.cpp
+++ b/test/grouped_gemm/grouped_gemm_fp16.cpp
@@ -3,7 +3,7 @@
 
 #include <iostream>
 
-#include "profiler/include/profile_grouped_gemm_impl.hpp"
+#include "profiler/profile_grouped_gemm_impl.hpp"
 
 namespace {
 
diff --git a/test/normalization/test_groupnorm_fp16.cpp b/test/normalization/test_groupnorm_fp16.cpp
index 8f7438247c..636e522dce 100644
--- a/test/normalization/test_groupnorm_fp16.cpp
+++ b/test/normalization/test_groupnorm_fp16.cpp
@@ -2,7 +2,7 @@
 // Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "gtest/gtest.h"
-#include "profiler/include/profile_groupnorm_impl.hpp"
+#include "profiler/profile_groupnorm_impl.hpp"
 
 using F16 = ck::half_t;
 using F32 = float;
diff --git a/test/normalization/test_groupnorm_fp32.cpp b/test/normalization/test_groupnorm_fp32.cpp
index 8dadbb60f8..ef492664bf 100644
--- a/test/normalization/test_groupnorm_fp32.cpp
+++ b/test/normalization/test_groupnorm_fp32.cpp
@@ -2,7 +2,7 @@
 // Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "gtest/gtest.h"
-#include "profiler/include/profile_groupnorm_impl.hpp"
+#include "profiler/profile_groupnorm_impl.hpp"
 
 using F16 = ck::half_t;
 using F32 = float;
diff --git a/test/normalization/test_layernorm2d_fp16.cpp b/test/normalization/test_layernorm2d_fp16.cpp
index 7e3af7135e..eeb8ec150a 100644
--- a/test/normalization/test_layernorm2d_fp16.cpp
+++ b/test/normalization/test_layernorm2d_fp16.cpp
@@ -2,7 +2,7 @@
 // Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "gtest/gtest.h"
-#include "profiler/include/profile_layernorm_impl.hpp"
+#include "profiler/profile_layernorm_impl.hpp"
 
 using F16 = ck::half_t;
 using F32 = float;
diff --git a/test/normalization/test_layernorm2d_fp32.cpp b/test/normalization/test_layernorm2d_fp32.cpp
index a7c4380d59..f555b42592 100644
--- a/test/normalization/test_layernorm2d_fp32.cpp
+++ b/test/normalization/test_layernorm2d_fp32.cpp
@@ -2,7 +2,7 @@
 // Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "gtest/gtest.h"
-#include "profiler/include/profile_layernorm_impl.hpp"
+#include "profiler/profile_layernorm_impl.hpp"
 
 using F16 = ck::half_t;
 using F32 = float;
diff --git a/test/reduce/reduce_no_index.cpp b/test/reduce/reduce_no_index.cpp
index 475ebfd080..3f4d0676b4 100644
--- a/test/reduce/reduce_no_index.cpp
+++ b/test/reduce/reduce_no_index.cpp
@@ -4,7 +4,7 @@
 #include <getopt.h>
 
 #include "ck/library/utility/host_common_util.hpp"
-#include "profiler/include/profile_reduce_impl.hpp"
+#include "profiler/profile_reduce_impl.hpp"
 
 using namespace ck;
 
diff --git a/test/reduce/reduce_with_index.cpp b/test/reduce/reduce_with_index.cpp
index c319dca69c..c616a68e74 100644
--- a/test/reduce/reduce_with_index.cpp
+++ b/test/reduce/reduce_with_index.cpp
@@ -4,7 +4,7 @@
 #include <getopt.h>
 
 #include "ck/library/utility/host_common_util.hpp"
-#include "profiler/include/profile_reduce_impl.hpp"
+#include "profiler/profile_reduce_impl.hpp"
 
 using namespace ck;
 
diff --git a/test/softmax/test_softmax_util.hpp b/test/softmax/test_softmax_util.hpp
index 23ac3d20e2..40b300cf99 100644
--- a/test/softmax/test_softmax_util.hpp
+++ b/test/softmax/test_softmax_util.hpp
@@ -13,7 +13,7 @@
 #include "ck/tensor_operation/gpu/device/impl/device_softmax_impl.hpp"
 #include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
 #include "include/ck/utility/data_type.hpp"
-#include "profiler/include/profile_softmax_impl.hpp"
+#include "profiler/profile_softmax_impl.hpp"
 
 namespace ck {