Modularize ckProfiler operations (#514)

* Re-structure ckProfiler source files * Rename profiler.cpp to main.cpp * Modularize ckProfiler operations * Add description for profiler operations * Use longer name to avoid name collision * Use macro to delay expansion * Use std::move() to avoid object copying * Prohibit users from calling dtor * Use macro to eliminate redundant code * Make friend function hidden * Add missing include directive <iostream> * Fix wrong include directives * Remove int8 from batchnorm-forward instances since it is not needed for forward training and could fail test Co-authored-by: Qianfeng Zhang <Qianfeng.Zhang@amd.com> [ROCm/composable_kernel commit: 8784a72e23]
2026-05-17 11:30:02 +00:00 · 2022-12-02 05:15:02 +08:00
parent 8e868bf880
commit 02db748e74
82 changed files with 346 additions and 273 deletions
--- a/test/CMakeLists.txt
+++ b/test/CMakeLists.txt
@@ -1,5 +1,6 @@
 include_directories(BEFORE
    ${PROJECT_SOURCE_DIR}/
+    ${PROJECT_SOURCE_DIR}/profiler/include
 )

 include(googletest)
--- a/test/batched_gemm/batched_gemm_bf16.cpp
+++ b/test/batched_gemm/batched_gemm_bf16.cpp
@@ -3,7 +3,7 @@

 #include <iostream>

-#include "profiler/include/profile_batched_gemm_impl.hpp"
+#include "profiler/profile_batched_gemm_impl.hpp"

 namespace {
 using ADataType = ck::bhalf_t;
--- a/test/batched_gemm/batched_gemm_fp16.cpp
+++ b/test/batched_gemm/batched_gemm_fp16.cpp
@@ -3,7 +3,7 @@

 #include <iostream>

-#include "profiler/include/profile_batched_gemm_impl.hpp"
+#include "profiler/profile_batched_gemm_impl.hpp"

 namespace {
 using ADataType = ck::half_t;
--- a/test/batched_gemm/batched_gemm_fp32.cpp
+++ b/test/batched_gemm/batched_gemm_fp32.cpp
@@ -3,7 +3,7 @@

 #include <iostream>

-#include "profiler/include/profile_batched_gemm_impl.hpp"
+#include "profiler/profile_batched_gemm_impl.hpp"

 namespace {
 using ADataType = float;
--- a/test/batched_gemm/batched_gemm_int8.cpp
+++ b/test/batched_gemm/batched_gemm_int8.cpp
@@ -3,7 +3,7 @@

 #include <iostream>

-#include "profiler/include/profile_batched_gemm_impl.hpp"
+#include "profiler/profile_batched_gemm_impl.hpp"

 namespace {
 using ADataType = int8_t;
--- a/test/batched_gemm_gemm/test_batched_gemm_gemm_util.hpp
+++ b/test/batched_gemm_gemm/test_batched_gemm_gemm_util.hpp
@@ -6,7 +6,7 @@
 #include <vector>
 #include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
 #include "ck/tensor_operation/gpu/device/impl/device_batched_gemm_gemm_xdl_cshuffle.hpp"
-#include "profiler/include/profile_batched_gemm_gemm_impl.hpp"
+#include "profiler/profile_batched_gemm_gemm_impl.hpp"

 using ck::tensor_operation::device::GemmSpecialization;

--- a/test/batched_gemm_reduce/batched_gemm_reduce_fp16.cpp
+++ b/test/batched_gemm_reduce/batched_gemm_reduce_fp16.cpp
@@ -3,7 +3,7 @@

 #include <iostream>

-#include "profiler/include/profile_batched_gemm_reduce_impl.hpp"
+#include "profiler/profile_batched_gemm_reduce_impl.hpp"

 int main()
 {
--- a/test/batched_gemm_softmax_gemm/test_batched_gemm_softmax_gemm_util.hpp
+++ b/test/batched_gemm_softmax_gemm/test_batched_gemm_softmax_gemm_util.hpp
@@ -6,7 +6,7 @@
 #include <vector>
 #include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
 #include "ck/tensor_operation/gpu/device/impl/device_batched_gemm_softmax_gemm_xdl_cshuffle.hpp"
-#include "profiler/include/profile_batched_gemm_softmax_gemm_impl.hpp"
+#include "profiler/profile_batched_gemm_softmax_gemm_impl.hpp"
 using ck::tensor_operation::device::GemmSpecialization;

 template <ck::index_t N>
--- a/test/batched_gemm_softmax_gemm_permute/test_batched_gemm_softmax_gemm_permute_util.hpp
+++ b/test/batched_gemm_softmax_gemm_permute/test_batched_gemm_softmax_gemm_permute_util.hpp
@@ -7,7 +7,7 @@
 #include "ck/ck.hpp"
 #include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
 #include "ck/tensor_operation/gpu/device/impl/device_batched_gemm_softmax_gemm_permute_xdl_cshuffle.hpp"
-#include "profiler/include/profile_batched_gemm_softmax_gemm_permute_impl.hpp"
+#include "profiler/profile_batched_gemm_softmax_gemm_permute_impl.hpp"

 using ck::tensor_operation::device::GemmSpecialization;
 using ck::tensor_operation::device::MaskingSpecialization;
--- a/test/batchnorm/batchnorm_bwd_rank_4.cpp
+++ b/test/batchnorm/batchnorm_bwd_rank_4.cpp
@@ -8,7 +8,7 @@
 #include <tuple>
 #include <gtest/gtest.h>

-#include "profiler/include/profile_batchnorm_backward_impl.hpp"
+#include "profiler/profile_batchnorm_backward_impl.hpp"

 using F16  = ck::half_t;
 using F32  = float;
--- a/test/batchnorm/batchnorm_fwd_rank_4.cpp
+++ b/test/batchnorm/batchnorm_fwd_rank_4.cpp
@@ -8,7 +8,7 @@
 #include <tuple>
 #include <gtest/gtest.h>

-#include "profiler/include/profile_batchnorm_forward_impl.hpp"
+#include "profiler/profile_batchnorm_forward_impl.hpp"

 using F16  = ck::half_t;
 using F32  = float;
--- a/test/convnd_bwd_data/convnd_bwd_data.cpp
+++ b/test/convnd_bwd_data/convnd_bwd_data.cpp
@@ -8,7 +8,7 @@
 #include <tuple>
 #include <gtest/gtest.h>

-#include "profiler/include/profile_conv_bwd_data_impl.hpp"
+#include "profiler/profile_conv_bwd_data_impl.hpp"

 template <typename Tuple>
 class TestConvndBwdData : public ::testing::Test
--- a/test/convnd_fwd/convnd_fwd.cpp
+++ b/test/convnd_fwd/convnd_fwd.cpp
@@ -8,7 +8,7 @@
 #include <tuple>
 #include <gtest/gtest.h>

-#include "profiler/include/profile_conv_fwd_impl.hpp"
+#include "profiler/profile_conv_fwd_impl.hpp"

 template <typename Tuple>
 class TestConvndFwd : public ::testing::Test
--- a/test/elementwise_normalization/test_elementwise_layernorm_fp16.cpp
+++ b/test/elementwise_normalization/test_elementwise_layernorm_fp16.cpp
@@ -2,7 +2,7 @@
 // Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.

 #include "gtest/gtest.h"
-#include "profiler/include/profile_elementwise_layernorm_impl.hpp"
+#include "profiler/profile_elementwise_layernorm_impl.hpp"

 using F16 = ck::half_t;
 using F32 = float;
--- a/test/gemm_reduce/gemm_reduce_fp16.cpp
+++ b/test/gemm_reduce/gemm_reduce_fp16.cpp
@@ -3,7 +3,7 @@

 #include <iostream>

-#include "profiler/include/profile_gemm_reduce_impl.hpp"
+#include "profiler/profile_gemm_reduce_impl.hpp"

 int main()
 {
--- a/test/grouped_convnd_bwd_weight/grouped_convnd_bwd_weight.cpp
+++ b/test/grouped_convnd_bwd_weight/grouped_convnd_bwd_weight.cpp
@@ -9,7 +9,7 @@

 #include <gtest/gtest.h>

-#include "profiler/include/profile_grouped_conv_bwd_weight_impl.hpp"
+#include "profiler/profile_grouped_conv_bwd_weight_impl.hpp"

 template <typename Tuple>
 class TestGroupedConvndBwdWeight : public ::testing::Test
--- a/test/grouped_convnd_fwd/grouped_convnd_fwd.cpp
+++ b/test/grouped_convnd_fwd/grouped_convnd_fwd.cpp
@@ -7,7 +7,7 @@
 #include <vector>
 #include <gtest/gtest.h>

-#include "profiler/include/profile_grouped_conv_fwd_impl.hpp"
+#include "profiler/profile_grouped_conv_fwd_impl.hpp"

 class TestGroupedConvNdFwd : public ::testing::Test
 {
--- a/test/grouped_gemm/grouped_gemm_fp16.cpp
+++ b/test/grouped_gemm/grouped_gemm_fp16.cpp
@@ -3,7 +3,7 @@

 #include <iostream>

-#include "profiler/include/profile_grouped_gemm_impl.hpp"
+#include "profiler/profile_grouped_gemm_impl.hpp"

 namespace {

--- a/test/normalization/test_groupnorm_fp16.cpp
+++ b/test/normalization/test_groupnorm_fp16.cpp
@@ -2,7 +2,7 @@
 // Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.

 #include "gtest/gtest.h"
-#include "profiler/include/profile_groupnorm_impl.hpp"
+#include "profiler/profile_groupnorm_impl.hpp"

 using F16 = ck::half_t;
 using F32 = float;
--- a/test/normalization/test_groupnorm_fp32.cpp
+++ b/test/normalization/test_groupnorm_fp32.cpp
@@ -2,7 +2,7 @@
 // Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.

 #include "gtest/gtest.h"
-#include "profiler/include/profile_groupnorm_impl.hpp"
+#include "profiler/profile_groupnorm_impl.hpp"

 using F16 = ck::half_t;
 using F32 = float;
--- a/test/normalization/test_layernorm2d_fp16.cpp
+++ b/test/normalization/test_layernorm2d_fp16.cpp
@@ -2,7 +2,7 @@
 // Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.

 #include "gtest/gtest.h"
-#include "profiler/include/profile_layernorm_impl.hpp"
+#include "profiler/profile_layernorm_impl.hpp"

 using F16 = ck::half_t;
 using F32 = float;
--- a/test/normalization/test_layernorm2d_fp32.cpp
+++ b/test/normalization/test_layernorm2d_fp32.cpp
@@ -2,7 +2,7 @@
 // Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.

 #include "gtest/gtest.h"
-#include "profiler/include/profile_layernorm_impl.hpp"
+#include "profiler/profile_layernorm_impl.hpp"

 using F16 = ck::half_t;
 using F32 = float;
--- a/test/reduce/reduce_no_index.cpp
+++ b/test/reduce/reduce_no_index.cpp
@@ -4,7 +4,7 @@
 #include <getopt.h>

 #include "ck/library/utility/host_common_util.hpp"
-#include "profiler/include/profile_reduce_impl.hpp"
+#include "profiler/profile_reduce_impl.hpp"

 using namespace ck;

--- a/test/reduce/reduce_with_index.cpp
+++ b/test/reduce/reduce_with_index.cpp
@@ -4,7 +4,7 @@
 #include <getopt.h>

 #include "ck/library/utility/host_common_util.hpp"
-#include "profiler/include/profile_reduce_impl.hpp"
+#include "profiler/profile_reduce_impl.hpp"

 using namespace ck;

--- a/test/softmax/test_softmax_util.hpp
+++ b/test/softmax/test_softmax_util.hpp
@@ -13,7 +13,7 @@
 #include "ck/tensor_operation/gpu/device/impl/device_softmax_impl.hpp"
 #include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
 #include "include/ck/utility/data_type.hpp"
-#include "profiler/include/profile_softmax_impl.hpp"
+#include "profiler/profile_softmax_impl.hpp"

 namespace ck {