mirror of
https://github.com/ROCm/composable_kernel.git
synced 2026-04-19 22:39:03 +00:00
Modularize ckProfiler operations (#514)
* Re-structure ckProfiler source files
* Rename profiler.cpp to main.cpp
* Modularize ckProfiler operations
* Add description for profiler operations
* Use longer name to avoid name collision
* Use macro to delay expansion
* Use std::move() to avoid object copying
* Prohibit users from calling dtor
* Use macro to eliminate redundant code
* Make friend function hidden
* Add missing include directive <iostream>
* Fix wrong include directives
* Remove int8 from batchnorm-forward instances since it is not needed for forward training and could fail test

Co-authored-by: Qianfeng Zhang <Qianfeng.Zhang@amd.com>
This commit is contained in:
@@ -1,64 +1,5 @@
|
||||
include_directories(BEFORE
|
||||
${PROJECT_SOURCE_DIR}/
|
||||
${CMAKE_CURRENT_LIST_DIR}/include
|
||||
)
|
||||
|
||||
# ck_profiler
|
||||
set(PROFILER_SOURCE
|
||||
src/profiler.cpp
|
||||
src/profile_gemm.cpp
|
||||
src/profile_gemm_splitk.cpp
|
||||
src/profile_gemm_bilinear.cpp
|
||||
src/profile_gemm_bias_add_reduce.cpp
|
||||
src/profile_gemm_add_add_fastgelu.cpp
|
||||
src/profile_gemm_reduce.cpp
|
||||
src/profile_batched_gemm.cpp
|
||||
src/profile_batched_gemm_gemm.cpp
|
||||
src/profile_batched_gemm_add_relu_gemm_add.cpp
|
||||
src/profile_batched_gemm_reduce.cpp
|
||||
src/profile_grouped_gemm.cpp
|
||||
src/profile_conv_fwd.cpp
|
||||
src/profile_conv_fwd_bias_relu.cpp
|
||||
src/profile_conv_fwd_bias_relu_add.cpp
|
||||
src/profile_conv_bwd_data.cpp
|
||||
src/profile_grouped_conv_fwd.cpp
|
||||
src/profile_grouped_conv_bwd_weight.cpp
|
||||
src/profile_reduce.cpp
|
||||
src/profile_groupnorm.cpp
|
||||
src/profile_layernorm.cpp
|
||||
src/profile_softmax.cpp
|
||||
src/profile_batchnorm_fwd.cpp
|
||||
src/profile_batchnorm_bwd.cpp
|
||||
)
|
||||
|
||||
add_executable(ckProfiler ${PROFILER_SOURCE})
|
||||
|
||||
target_link_libraries(ckProfiler PRIVATE utility)
|
||||
target_link_libraries(ckProfiler PRIVATE device_gemm_instance)
|
||||
target_link_libraries(ckProfiler PRIVATE device_gemm_splitk_instance)
|
||||
target_link_libraries(ckProfiler PRIVATE device_gemm_bilinear_instance)
|
||||
target_link_libraries(ckProfiler PRIVATE device_gemm_add_add_fastgelu_instance)
|
||||
target_link_libraries(ckProfiler PRIVATE device_gemm_reduce_instance)
|
||||
target_link_libraries(ckProfiler PRIVATE device_gemm_bias_add_reduce_instance)
|
||||
target_link_libraries(ckProfiler PRIVATE device_batched_gemm_instance)
|
||||
target_link_libraries(ckProfiler PRIVATE device_batched_gemm_gemm_instance)
|
||||
target_link_libraries(ckProfiler PRIVATE device_batched_gemm_add_relu_gemm_add_instance)
|
||||
target_link_libraries(ckProfiler PRIVATE device_batched_gemm_reduce_instance)
|
||||
target_link_libraries(ckProfiler PRIVATE device_grouped_gemm_instance)
|
||||
target_link_libraries(ckProfiler PRIVATE device_conv2d_fwd_instance)
|
||||
target_link_libraries(ckProfiler PRIVATE device_grouped_conv1d_fwd_instance)
|
||||
target_link_libraries(ckProfiler PRIVATE device_grouped_conv2d_fwd_instance)
|
||||
target_link_libraries(ckProfiler PRIVATE device_grouped_conv3d_fwd_instance)
|
||||
target_link_libraries(ckProfiler PRIVATE device_conv1d_bwd_data_instance)
|
||||
target_link_libraries(ckProfiler PRIVATE device_conv2d_bwd_data_instance)
|
||||
target_link_libraries(ckProfiler PRIVATE device_conv3d_bwd_data_instance)
|
||||
target_link_libraries(ckProfiler PRIVATE device_grouped_conv1d_bwd_weight_instance)
|
||||
target_link_libraries(ckProfiler PRIVATE device_grouped_conv2d_bwd_weight_instance)
|
||||
target_link_libraries(ckProfiler PRIVATE device_grouped_conv3d_bwd_weight_instance)
|
||||
target_link_libraries(ckProfiler PRIVATE device_conv2d_fwd_bias_relu_instance)
|
||||
target_link_libraries(ckProfiler PRIVATE device_conv2d_fwd_bias_relu_add_instance)
|
||||
target_link_libraries(ckProfiler PRIVATE device_normalization_instance)
|
||||
target_link_libraries(ckProfiler PRIVATE device_softmax_instance)
|
||||
target_link_libraries(ckProfiler PRIVATE device_reduce_instance)
|
||||
target_link_libraries(ckProfiler PRIVATE device_batchnorm_instance)
|
||||
|
||||
rocm_install(TARGETS ckProfiler COMPONENT profiler)
|
||||
add_subdirectory(src)
|
||||
|
||||
@@ -4,7 +4,7 @@
|
||||
#pragma
|
||||
|
||||
#include "ck/utility/data_type.hpp"
|
||||
#include "profiler/include/data_type_enum.hpp"
|
||||
#include "profiler/data_type_enum.hpp"
|
||||
|
||||
namespace ck {
|
||||
|
||||
63
profiler/src/CMakeLists.txt
Normal file
63
profiler/src/CMakeLists.txt
Normal file
@@ -0,0 +1,63 @@
|
||||
# ckProfiler: build and install the profiler executable.

# Name of the executable target; every command below refers to it through
# this variable.
set(PROFILER_EXECUTABLE ckProfiler)

# Source files compiled into the executable, listed explicitly (paths are
# relative to this directory).
set(PROFILER_SOURCES
    profiler.cpp
    profile_gemm.cpp
    profile_gemm_splitk.cpp
    profile_gemm_bilinear.cpp
    profile_gemm_bias_add_reduce.cpp
    profile_gemm_add_add_fastgelu.cpp
    profile_gemm_reduce.cpp
    profile_batched_gemm.cpp
    profile_batched_gemm_gemm.cpp
    profile_batched_gemm_add_relu_gemm_add.cpp
    profile_batched_gemm_reduce.cpp
    profile_grouped_gemm.cpp
    profile_conv_fwd.cpp
    profile_conv_fwd_bias_relu.cpp
    profile_conv_fwd_bias_relu_add.cpp
    profile_conv_bwd_data.cpp
    profile_grouped_conv_fwd.cpp
    profile_grouped_conv_bwd_weight.cpp
    profile_reduce.cpp
    profile_groupnorm.cpp
    profile_layernorm.cpp
    profile_softmax.cpp
    profile_batchnorm_fwd.cpp
    profile_batchnorm_bwd.cpp
)

add_executable(${PROFILER_EXECUTABLE} ${PROFILER_SOURCES})

# Disable the global-constructors warning for this target only.
# NOTE(review): presumably needed because the operation-registration macro
# creates static objects - confirm against profiler_operation_registry.hpp.
target_compile_options(${PROFILER_EXECUTABLE} PRIVATE -Wno-global-constructors)

# Libraries linked privately into the executable. The order matches the
# original sequence of individual target_link_libraries() calls.
target_link_libraries(${PROFILER_EXECUTABLE}
    PRIVATE
        utility
        device_gemm_instance
        device_gemm_splitk_instance
        device_gemm_bilinear_instance
        device_gemm_add_add_fastgelu_instance
        device_gemm_reduce_instance
        device_gemm_bias_add_reduce_instance
        device_batched_gemm_instance
        device_batched_gemm_gemm_instance
        device_batched_gemm_add_relu_gemm_add_instance
        device_batched_gemm_reduce_instance
        device_grouped_gemm_instance
        device_conv2d_fwd_instance
        device_grouped_conv1d_fwd_instance
        device_grouped_conv2d_fwd_instance
        device_grouped_conv3d_fwd_instance
        device_conv1d_bwd_data_instance
        device_conv2d_bwd_data_instance
        device_conv3d_bwd_data_instance
        device_grouped_conv1d_bwd_weight_instance
        device_grouped_conv2d_bwd_weight_instance
        device_grouped_conv3d_bwd_weight_instance
        device_conv2d_fwd_bias_relu_instance
        device_conv2d_fwd_bias_relu_add_instance
        device_normalization_instance
        device_softmax_instance
        device_reduce_instance
        device_batchnorm_instance
)

# Install the executable as part of the "profiler" component.
rocm_install(TARGETS ${PROFILER_EXECUTABLE} COMPONENT profiler)
|
||||
@@ -7,7 +7,8 @@
|
||||
#include <initializer_list>
|
||||
#include <cstdlib>
|
||||
|
||||
#include "profiler/include/profile_batched_gemm_impl.hpp"
|
||||
#include "profiler/profile_batched_gemm_impl.hpp"
|
||||
#include "profiler_operation_registry.hpp"
|
||||
|
||||
enum struct GemmMatrixLayout
|
||||
{
|
||||
@@ -25,12 +26,15 @@ enum struct GemmDataType
|
||||
INT8_INT8_INT8, // 3
|
||||
};
|
||||
|
||||
#define OP_NAME "batched_gemm"
|
||||
#define OP_DESC "Batched GEMM"
|
||||
|
||||
int profile_batched_gemm(int argc, char* argv[])
|
||||
{
|
||||
if(argc != 18)
|
||||
{
|
||||
// clang-format off
|
||||
printf("arg1: tensor operation (batched_gemm: Batched GEMM)\n");
|
||||
printf("arg1: tensor operation (" OP_NAME ": " OP_DESC ")\n");
|
||||
printf("arg2: data type (0: fp32; 1: fp16, 2: bf16, 3: int8)\n");
|
||||
printf("arg3: matrix layout (0: A[g, m, k] * B[g, k, n] = C[g, m, n];\n");
|
||||
printf(" 1: A[g, m, k] * B[g, n, k] = C[g, m, n];\n");
|
||||
@@ -195,3 +199,5 @@ int profile_batched_gemm(int argc, char* argv[])
|
||||
return 1;
|
||||
}
|
||||
}
|
||||
|
||||
REGISTER_PROFILER_OPERATION(OP_NAME, OP_DESC, profile_batched_gemm);
|
||||
|
||||
@@ -6,7 +6,8 @@
|
||||
#include <initializer_list>
|
||||
#include <cstdlib>
|
||||
|
||||
#include "profiler/include/profile_batched_gemm_add_relu_gemm_add_impl.hpp"
|
||||
#include "profiler/profile_batched_gemm_add_relu_gemm_add_impl.hpp"
|
||||
#include "profiler_operation_registry.hpp"
|
||||
|
||||
using F16 = ck::half_t;
|
||||
using F32 = float;
|
||||
@@ -14,6 +15,9 @@ using F32 = float;
|
||||
using Row = ck::tensor_layout::gemm::RowMajor;
|
||||
using Col = ck::tensor_layout::gemm::ColumnMajor;
|
||||
|
||||
#define OP_NAME "batched_gemm_add_relu_gemm_add"
|
||||
#define OP_DESC "Batched GEMM+Add+Relu+GEMM+Add"
|
||||
|
||||
int profile_batched_gemm_add_relu_gemm_add(int argc, char* argv[])
|
||||
{
|
||||
enum struct GemmMatrixLayout
|
||||
@@ -109,8 +113,7 @@ int profile_batched_gemm_add_relu_gemm_add(int argc, char* argv[])
|
||||
}
|
||||
else
|
||||
{
|
||||
printf("arg1: tensor operation (batched_gemm_add_relu_gemm_add: "
|
||||
"Batched_GEMM+Add+Relu+Gemm+Add)\n");
|
||||
printf("arg1: tensor operation (" OP_NAME ": " OP_DESC ")\n");
|
||||
printf("arg2: data type (1: fp16)\n");
|
||||
printf("arg3: matrix layout (0: Relu(A0[m, k] * B0[n, k] + D0[m, n]) * B1[n, o] + D1[m, o] "
|
||||
"= E1[m, o]; 1: Relu(A0[m, k] * B0[n, k] + D0[m, n]) * B1[o, n] + D1[m, o] = "
|
||||
@@ -207,3 +210,5 @@ int profile_batched_gemm_add_relu_gemm_add(int argc, char* argv[])
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
REGISTER_PROFILER_OPERATION(OP_NAME, OP_DESC, profile_batched_gemm_add_relu_gemm_add);
|
||||
|
||||
@@ -6,7 +6,8 @@
|
||||
#include <initializer_list>
|
||||
#include <cstdlib>
|
||||
|
||||
#include "profiler/include/profile_batched_gemm_gemm_impl.hpp"
|
||||
#include "profiler/profile_batched_gemm_gemm_impl.hpp"
|
||||
#include "profiler_operation_registry.hpp"
|
||||
|
||||
using F16 = ck::half_t;
|
||||
using F32 = float;
|
||||
@@ -14,6 +15,9 @@ using F32 = float;
|
||||
using Row = ck::tensor_layout::gemm::RowMajor;
|
||||
using Col = ck::tensor_layout::gemm::ColumnMajor;
|
||||
|
||||
#define OP_NAME "batched_gemm_gemm"
|
||||
#define OP_DESC "Batched GEMM+GEMM"
|
||||
|
||||
int profile_batched_gemm_gemm(int argc, char* argv[])
|
||||
{
|
||||
enum struct GemmMatrixLayout
|
||||
@@ -101,7 +105,7 @@ int profile_batched_gemm_gemm(int argc, char* argv[])
|
||||
}
|
||||
else
|
||||
{
|
||||
printf("arg1: tensor operation (batched_gemm_gemm: Batched_GEMM+Gemm)\n");
|
||||
printf("arg1: tensor operation (" OP_NAME ": " OP_DESC ")\n");
|
||||
printf("arg2: data type (1: fp16)\n");
|
||||
printf("arg3: matrix layout (0: Relu(A0[m, k] * B0[n, k] + D0[m, n]) * B1[n, o] + D1[m, o] "
|
||||
"= E1[m, o]; 1: Relu(A0[m, k] * B0[n, k] + D0[m, n]) * B1[o, n] + D1[m, o] = E1[m, "
|
||||
@@ -179,3 +183,5 @@ int profile_batched_gemm_gemm(int argc, char* argv[])
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
REGISTER_PROFILER_OPERATION(OP_NAME, OP_DESC, profile_batched_gemm_gemm);
|
||||
|
||||
@@ -6,7 +6,11 @@
|
||||
#include <initializer_list>
|
||||
#include <cstdlib>
|
||||
|
||||
#include "profiler/include/profile_batched_gemm_reduce_impl.hpp"
|
||||
#include "profiler/profile_batched_gemm_reduce_impl.hpp"
|
||||
#include "profiler_operation_registry.hpp"
|
||||
|
||||
#define OP_NAME "batched_gemm_reduce"
|
||||
#define OP_DESC "Batched GEMM+Reduce"
|
||||
|
||||
int profile_batched_gemm_reduce(int argc, char* argv[])
|
||||
{
|
||||
@@ -26,7 +30,7 @@ int profile_batched_gemm_reduce(int argc, char* argv[])
|
||||
|
||||
if(argc != 15)
|
||||
{
|
||||
printf("arg1: tensor operation (batched_gemm_reduce: BatchedGEMM+Reduce)\n");
|
||||
printf("arg1: tensor operation (" OP_NAME ": " OP_DESC ")\n");
|
||||
printf("arg2: data type (0: fp32; 1: fp16)\n");
|
||||
printf("arg3: matrix layout (0: A[m, k] * B[k, n] = C[m, n];\n");
|
||||
printf(" 1: A[m, k] * B[n, k] = C[m, n];\n");
|
||||
@@ -151,3 +155,5 @@ int profile_batched_gemm_reduce(int argc, char* argv[])
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
REGISTER_PROFILER_OPERATION(OP_NAME, OP_DESC, profile_batched_gemm_reduce);
|
||||
|
||||
@@ -6,7 +6,8 @@
|
||||
#include <getopt.h>
|
||||
|
||||
#include "ck/library/utility/host_common_util.hpp"
|
||||
#include "profiler/include/profile_batchnorm_backward_impl.hpp"
|
||||
#include "profiler/profile_batchnorm_backward_impl.hpp"
|
||||
#include "profiler_operation_registry.hpp"
|
||||
|
||||
using ck::index_t;
|
||||
|
||||
@@ -202,3 +203,5 @@ int profile_batchnorm_backward(int argc, char* argv[])
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
REGISTER_PROFILER_OPERATION("bnorm_bwd", "Batchnorm backward", profile_batchnorm_backward);
|
||||
|
||||
@@ -6,7 +6,8 @@
|
||||
#include <getopt.h>
|
||||
|
||||
#include "ck/library/utility/host_common_util.hpp"
|
||||
#include "profiler/include/profile_batchnorm_forward_impl.hpp"
|
||||
#include "profiler/profile_batchnorm_forward_impl.hpp"
|
||||
#include "profiler_operation_registry.hpp"
|
||||
|
||||
using ck::index_t;
|
||||
|
||||
@@ -214,3 +215,5 @@ int profile_batchnorm_forward(int argc, char* argv[])
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
REGISTER_PROFILER_OPERATION("bnorm_fwd", "Batchnorm forward", profile_batchnorm_forward);
|
||||
|
||||
@@ -6,7 +6,8 @@
|
||||
#include <initializer_list>
|
||||
#include <cstdlib>
|
||||
|
||||
#include "profiler/include/profile_conv_bwd_data_impl.hpp"
|
||||
#include "profiler/profile_conv_bwd_data_impl.hpp"
|
||||
#include "profiler_operation_registry.hpp"
|
||||
|
||||
namespace {
|
||||
|
||||
@@ -24,10 +25,13 @@ enum struct ConvDataType
|
||||
INT8_INT8_INT8, // 3
|
||||
};
|
||||
|
||||
#define OP_NAME "conv_bwd_data"
|
||||
#define OP_DESC "Convolution Backward Data"
|
||||
|
||||
static void print_helper_msg()
|
||||
{
|
||||
std::cout
|
||||
<< "arg1: tensor operation (conv_bwd_data: Convolution Backward Data)\n"
|
||||
<< "arg1: tensor operation (" OP_NAME ": " OP_DESC ")\n"
|
||||
<< "arg2: data type (0: Input fp32, Weight fp32, Output fp32\n"
|
||||
<< " 1: Input fp16, Weight fp16, Output fp16\n"
|
||||
<< " 2: Input bf16, Weight bf16, Output bf16\n"
|
||||
@@ -182,3 +186,5 @@ int profile_conv_bwd_data(int argc, char* argv[])
|
||||
|
||||
return 1;
|
||||
}
|
||||
|
||||
REGISTER_PROFILER_OPERATION(OP_NAME, OP_DESC, profile_conv_bwd_data);
|
||||
|
||||
@@ -6,7 +6,8 @@
|
||||
#include <initializer_list>
|
||||
#include <cstdlib>
|
||||
|
||||
#include "profiler/include/profile_conv_fwd_impl.hpp"
|
||||
#include "profiler/profile_conv_fwd_impl.hpp"
|
||||
#include "profiler_operation_registry.hpp"
|
||||
|
||||
namespace {
|
||||
|
||||
@@ -24,11 +25,14 @@ enum struct ConvDataType
|
||||
INT8_INT8_INT8, // 3
|
||||
};
|
||||
|
||||
#define OP_NAME "conv_fwd"
|
||||
#define OP_DESC "Convolution Forward"
|
||||
|
||||
static void print_helper_msg()
|
||||
{
|
||||
std::cout
|
||||
// clang-format-off
|
||||
<< "arg1: tensor operation (conv_fwd: Convolution Forward)\n"
|
||||
<< "arg1: tensor operation (" OP_NAME ": " OP_DESC ")\n"
|
||||
<< "arg2: data type (0: Input fp32, Weight fp32, Output fp32\n"
|
||||
<< " 1: Input fp16, Weight fp16, Output fp16\n"
|
||||
<< " 2: Input bf16, Weight bf16, Output bf16\n"
|
||||
@@ -184,3 +188,5 @@ int profile_conv_fwd(int argc, char* argv[])
|
||||
|
||||
return 1;
|
||||
}
|
||||
|
||||
REGISTER_PROFILER_OPERATION(OP_NAME, OP_DESC, profile_conv_fwd);
|
||||
|
||||
@@ -6,7 +6,8 @@
|
||||
#include <initializer_list>
|
||||
#include <cstdlib>
|
||||
|
||||
#include "profiler/include/profile_conv_fwd_bias_relu_impl.hpp"
|
||||
#include "profiler/profile_conv_fwd_bias_relu_impl.hpp"
|
||||
#include "profiler_operation_registry.hpp"
|
||||
|
||||
enum struct ConvDataType
|
||||
{
|
||||
@@ -32,11 +33,14 @@ enum struct ConvOutputLayout
|
||||
NHWK, // 1
|
||||
};
|
||||
|
||||
#define OP_NAME "conv_fwd_bias_relu"
|
||||
#define OP_DESC "Convolution Forward+Bias+ReLU"
|
||||
|
||||
int profile_conv_fwd_bias_relu(int argc, char* argv[])
|
||||
{
|
||||
if(argc != 25)
|
||||
{
|
||||
printf("arg1: tensor operation (conv_fwd_bias_relu: ForwardConvolution+Bias+ReLu)\n");
|
||||
printf("arg1: tensor operation (" OP_NAME ": " OP_DESC ")\n");
|
||||
printf("arg2: data type (0: fp32; 1: fp16)\n");
|
||||
printf("arg3: input tensor layout (0: NCHW; 1: NHWC)\n");
|
||||
printf("arg4: weight tensor layout (0: KCYX; 1: KYXC)\n");
|
||||
@@ -114,3 +118,5 @@ int profile_conv_fwd_bias_relu(int argc, char* argv[])
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
REGISTER_PROFILER_OPERATION(OP_NAME, OP_DESC, profile_conv_fwd_bias_relu);
|
||||
|
||||
@@ -6,7 +6,8 @@
|
||||
#include <initializer_list>
|
||||
#include <cstdlib>
|
||||
|
||||
#include "profiler/include/profile_conv_fwd_bias_relu_add_impl.hpp"
|
||||
#include "profiler/profile_conv_fwd_bias_relu_add_impl.hpp"
|
||||
#include "profiler_operation_registry.hpp"
|
||||
|
||||
enum struct ConvDataType
|
||||
{
|
||||
@@ -32,12 +33,14 @@ enum struct ConvOutputLayout
|
||||
NHWK, // 1
|
||||
};
|
||||
|
||||
#define OP_NAME "conv_fwd_bias_relu_add"
|
||||
#define OP_DESC "Convolution Forward+Bias+ReLU+Add"
|
||||
|
||||
int profile_conv_fwd_bias_relu_add(int argc, char* argv[])
|
||||
{
|
||||
if(argc != 25)
|
||||
{
|
||||
printf(
|
||||
"arg1: tensor operation (conv_fwd_bias_relu_add: ForwardConvolution+Bias+ReLu+Add)\n");
|
||||
printf("arg1: tensor operation (" OP_NAME ": " OP_DESC ")\n");
|
||||
printf("arg2: data type (0: fp32; 1: fp16)\n");
|
||||
printf("arg3: input tensor layout (0: NCHW; 1: NHWC)\n");
|
||||
printf("arg4: weight tensor layout (0: KCYX; 1: KYXC)\n");
|
||||
@@ -115,3 +118,5 @@ int profile_conv_fwd_bias_relu_add(int argc, char* argv[])
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
REGISTER_PROFILER_OPERATION(OP_NAME, OP_DESC, profile_conv_fwd_bias_relu_add);
|
||||
|
||||
@@ -6,7 +6,8 @@
|
||||
#include <initializer_list>
|
||||
#include <cstdlib>
|
||||
|
||||
#include "profiler/include/profile_gemm_impl.hpp"
|
||||
#include "profiler/profile_gemm_impl.hpp"
|
||||
#include "profiler_operation_registry.hpp"
|
||||
|
||||
enum struct GemmMatrixLayout
|
||||
{
|
||||
@@ -24,9 +25,12 @@ enum struct GemmDataType
|
||||
INT8_INT8_INT8, // 3
|
||||
};
|
||||
|
||||
#define OP_NAME "gemm"
|
||||
#define OP_DESC "GEMM"
|
||||
|
||||
static void print_helper_msg()
|
||||
{
|
||||
std::cout << "arg1: tensor operation (gemm: GEMM)\n"
|
||||
std::cout << "arg1: tensor operation (" OP_NAME ": " OP_DESC ")\n"
|
||||
<< "arg2: data type (0: fp32; 1: fp16; 2: bf16; 3: int8)\n"
|
||||
<< "arg3: matrix layout (0: A[m, k] * B[k, n] = C[m, n];\n"
|
||||
<< " 1: A[m, k] * B[n, k] = C[m, n];\n"
|
||||
@@ -184,3 +188,5 @@ int profile_gemm(int argc, char* argv[])
|
||||
return 1;
|
||||
}
|
||||
}
|
||||
|
||||
REGISTER_PROFILER_OPERATION(OP_NAME, OP_DESC, profile_gemm);
|
||||
|
||||
@@ -6,7 +6,11 @@
|
||||
#include <initializer_list>
|
||||
#include <cstdlib>
|
||||
|
||||
#include "profiler/include/profile_gemm_add_add_fastgelu_impl.hpp"
|
||||
#include "profiler/profile_gemm_add_add_fastgelu_impl.hpp"
|
||||
#include "profiler_operation_registry.hpp"
|
||||
|
||||
#define OP_NAME "gemm_add_add_fastgelu"
|
||||
#define OP_DESC "GEMM+Add+Add+FastGeLU"
|
||||
|
||||
int profile_gemm_add_add_fastgelu(int argc, char* argv[])
|
||||
{
|
||||
@@ -29,7 +33,7 @@ int profile_gemm_add_add_fastgelu(int argc, char* argv[])
|
||||
if(argc != 16)
|
||||
{
|
||||
// clang-format off
|
||||
printf("arg1: tensor operation (gemm_add_add_fastgelu: GEMM+Add+Add+FastGeLU)\n");
|
||||
printf("arg1: tensor operation (" OP_NAME ": " OP_DESC ")\n");
|
||||
printf("arg2: data type (0: fp32; 1: fp16; 2: bf16; 3: int8)\n");
|
||||
printf("arg3: matrix layout (0: E[m, n] = FastGeLU(A[m, k] * B[k, n] + D0[m, n] + D1[m, n]);\n");
|
||||
printf(" 1: E[m, n] = FastGeLU(A[m, k] * B[n, k] + D0[m, n] + D1[m, n]);\n");
|
||||
@@ -150,3 +154,5 @@ int profile_gemm_add_add_fastgelu(int argc, char* argv[])
|
||||
return 1;
|
||||
}
|
||||
}
|
||||
|
||||
REGISTER_PROFILER_OPERATION(OP_NAME, OP_DESC, profile_gemm_add_add_fastgelu);
|
||||
|
||||
@@ -6,7 +6,11 @@
|
||||
#include <initializer_list>
|
||||
#include <cstdlib>
|
||||
|
||||
#include "profiler/include/profile_gemm_bias_add_reduce_impl.hpp"
|
||||
#include "profiler/profile_gemm_bias_add_reduce_impl.hpp"
|
||||
#include "profiler_operation_registry.hpp"
|
||||
|
||||
#define OP_NAME "gemm_bias_add_reduce"
|
||||
#define OP_DESC "GEMM+Bias+Add+Reduce"
|
||||
|
||||
int profile_gemm_bias_add_reduce(int argc, char* argv[])
|
||||
{
|
||||
@@ -26,7 +30,7 @@ int profile_gemm_bias_add_reduce(int argc, char* argv[])
|
||||
|
||||
if(!(argc == 14 || argc == 15))
|
||||
{
|
||||
printf("arg1: tensor operation (gemm: GEMM+bias+add+Reduce)\n");
|
||||
printf("arg1: tensor operation (" OP_NAME ": " OP_DESC ")\n");
|
||||
printf("arg2: data type (0: fp32; 1: fp16)\n");
|
||||
printf("arg3: matrix layout (0: A[m, k] * B[k, n] = C[m, n];\n");
|
||||
printf(" 1: A[m, k] * B[n, k] = C[m, n];\n");
|
||||
@@ -159,3 +163,5 @@ int profile_gemm_bias_add_reduce(int argc, char* argv[])
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
REGISTER_PROFILER_OPERATION(OP_NAME, OP_DESC, profile_gemm_bias_add_reduce);
|
||||
|
||||
@@ -6,7 +6,11 @@
|
||||
#include <initializer_list>
|
||||
#include <cstdlib>
|
||||
|
||||
#include "profiler/include/profile_gemm_bilinear_impl.hpp"
|
||||
#include "profiler/profile_gemm_bilinear_impl.hpp"
|
||||
#include "profiler_operation_registry.hpp"
|
||||
|
||||
#define OP_NAME "gemm_bilinear"
|
||||
#define OP_DESC "GEMM+Bilinear"
|
||||
|
||||
int profile_gemm_bilinear(int argc, char* argv[])
|
||||
{
|
||||
@@ -29,7 +33,7 @@ int profile_gemm_bilinear(int argc, char* argv[])
|
||||
if(argc != 17)
|
||||
{
|
||||
// clang-format off
|
||||
printf("arg1: tensor operation (gemm_bilinear: GEMM+Bilinear)\n");
|
||||
printf("arg1: tensor operation (" OP_NAME ": " OP_DESC ")\n");
|
||||
printf("arg2: data type (0: fp32; 1: fp16; 2: bf16; 3: int8)\n");
|
||||
printf("arg3: matrix layout (0: E[m, n] = alpha * A[m, k] * B[k, n] + beta * D[m, n];\n");
|
||||
printf(" 1: E[m, n] = alpha * A[m, k] * B[n, k] + beta * D[m, n];\n");
|
||||
@@ -144,3 +148,5 @@ int profile_gemm_bilinear(int argc, char* argv[])
|
||||
return 1;
|
||||
}
|
||||
}
|
||||
|
||||
REGISTER_PROFILER_OPERATION(OP_NAME, OP_DESC, profile_gemm_bilinear);
|
||||
|
||||
@@ -6,7 +6,11 @@
|
||||
#include <initializer_list>
|
||||
#include <cstdlib>
|
||||
|
||||
#include "profiler/include/profile_gemm_reduce_impl.hpp"
|
||||
#include "profiler/profile_gemm_reduce_impl.hpp"
|
||||
#include "profiler_operation_registry.hpp"
|
||||
|
||||
#define OP_NAME "gemm_reduce"
|
||||
#define OP_DESC "GEMM+Reduce"
|
||||
|
||||
int profile_gemm_reduce(int argc, char* argv[])
|
||||
{
|
||||
@@ -26,7 +30,7 @@ int profile_gemm_reduce(int argc, char* argv[])
|
||||
|
||||
if(!(argc == 14 || argc == 15))
|
||||
{
|
||||
printf("arg1: tensor operation (gemm: GEMM+Reduce)\n");
|
||||
printf("arg1: tensor operation (" OP_NAME ": " OP_DESC ")\n");
|
||||
printf("arg2: data type (0: fp32; 1: fp16)\n");
|
||||
printf("arg3: matrix layout (0: A[m, k] * B[k, n] = C[m, n];\n");
|
||||
printf(" 1: A[m, k] * B[n, k] = C[m, n];\n");
|
||||
@@ -146,3 +150,5 @@ int profile_gemm_reduce(int argc, char* argv[])
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
REGISTER_PROFILER_OPERATION(OP_NAME, OP_DESC, profile_gemm_reduce);
|
||||
|
||||
@@ -6,7 +6,8 @@
|
||||
#include <initializer_list>
|
||||
#include <cstdlib>
|
||||
|
||||
#include "profiler/include/profile_gemm_splitk_impl.hpp"
|
||||
#include "profiler/profile_gemm_splitk_impl.hpp"
|
||||
#include "profiler_operation_registry.hpp"
|
||||
|
||||
enum struct GemmMatrixLayout
|
||||
{
|
||||
@@ -24,11 +25,14 @@ enum struct GemmDataType
|
||||
INT8_INT8_INT8, // 3
|
||||
};
|
||||
|
||||
#define OP_NAME "gemm_splitk"
|
||||
#define OP_DESC "Split-K GEMM"
|
||||
|
||||
int profile_gemm_splitk(int argc, char* argv[])
|
||||
{
|
||||
if(argc != 15)
|
||||
{
|
||||
printf("arg1: tensor operation (gemm_splitk: Split-K GEMM)\n");
|
||||
printf("arg1: tensor operation (" OP_NAME ": " OP_DESC ")\n");
|
||||
printf("arg2: data type (0: fp32; 1: fp16; 2: bf16; 3: int8)\n");
|
||||
printf("arg3: matrix layout (0: A[m, k] * B[k, n] = C[m, n];\n");
|
||||
printf(" 1: A[m, k] * B[n, k] = C[m, n];\n");
|
||||
@@ -146,3 +150,5 @@ int profile_gemm_splitk(int argc, char* argv[])
|
||||
return 1;
|
||||
}
|
||||
}
|
||||
|
||||
REGISTER_PROFILER_OPERATION(OP_NAME, OP_DESC, profile_gemm_splitk);
|
||||
|
||||
@@ -6,7 +6,8 @@
|
||||
#include <iostream>
|
||||
#include <numeric>
|
||||
|
||||
#include "profiler/include/profile_grouped_conv_bwd_weight_impl.hpp"
|
||||
#include "profiler/profile_grouped_conv_bwd_weight_impl.hpp"
|
||||
#include "profiler_operation_registry.hpp"
|
||||
|
||||
namespace {
|
||||
|
||||
@@ -23,9 +24,12 @@ enum struct ConvDataType
|
||||
BF16_F32_BF16, // 2
|
||||
};
|
||||
|
||||
#define OP_NAME "grouped_conv_bwd_weight"
|
||||
#define OP_DESC "Grouped Convolution Backward Weight"
|
||||
|
||||
static void print_helper_msg()
|
||||
{
|
||||
std::cout << "arg1: tensor operation (conv_bwd_weight: Convolution Backward Weight\n"
|
||||
std::cout << "arg1: tensor operation (" OP_NAME ": " OP_DESC ")\n"
|
||||
<< "arg2: data type (0: Input fp32, Weight fp32, Output fp32\n"
|
||||
<< " 1: Input fp16, Weight fp16, Output fp16\n"
|
||||
<< " 2: Input bf16, Weight fp32, Output bf16)\n"
|
||||
@@ -174,3 +178,5 @@ int profile_grouped_conv_bwd_weight(int argc, char* argv[])
|
||||
|
||||
return 1;
|
||||
}
|
||||
|
||||
REGISTER_PROFILER_OPERATION(OP_NAME, OP_DESC, profile_grouped_conv_bwd_weight);
|
||||
|
||||
@@ -6,7 +6,8 @@
|
||||
#include <initializer_list>
|
||||
#include <cstdlib>
|
||||
|
||||
#include "profiler/include/profile_grouped_conv_fwd_impl.hpp"
|
||||
#include "profiler/profile_grouped_conv_fwd_impl.hpp"
|
||||
#include "profiler_operation_registry.hpp"
|
||||
|
||||
namespace {
|
||||
|
||||
@@ -24,11 +25,14 @@ enum struct ConvDataType
|
||||
INT8_INT8_INT8, // 3
|
||||
};
|
||||
|
||||
#define OP_NAME "grouped_conv_fwd"
|
||||
#define OP_DESC "Grouped Convolution Forward"
|
||||
|
||||
static void print_helper_msg()
|
||||
{
|
||||
std::cout
|
||||
// clang-format off
|
||||
<< "arg1: tensor operation (grouped_conv_fwd: Grouped Convolution Forward)\n"
|
||||
<< "arg1: tensor operation (" OP_NAME ": " OP_DESC ")\n"
|
||||
<< "arg2: data type (0: Input fp32, Weight fp32, Output fp32\n"
|
||||
<< " 1: Input fp16, Weight fp16, Output fp16\n"
|
||||
<< " 2: Input bf16, Weight bf16, Output bf16\n"
|
||||
@@ -252,3 +256,5 @@ int profile_grouped_conv_fwd(int argc, char* argv[])
|
||||
|
||||
return 1;
|
||||
}
|
||||
|
||||
REGISTER_PROFILER_OPERATION(OP_NAME, OP_DESC, profile_grouped_conv_fwd);
|
||||
|
||||
@@ -6,7 +6,8 @@
|
||||
#include <initializer_list>
|
||||
#include <cstdlib>
|
||||
|
||||
#include "profiler/include/profile_grouped_gemm_impl.hpp"
|
||||
#include "profiler/profile_grouped_gemm_impl.hpp"
|
||||
#include "profiler_operation_registry.hpp"
|
||||
|
||||
enum struct GemmMatrixLayout
|
||||
{
|
||||
@@ -44,11 +45,14 @@ std::vector<int> argToIntArray(char* input)
|
||||
return out;
|
||||
}
|
||||
|
||||
#define OP_NAME "grouped_gemm"
|
||||
#define OP_DESC "Grouped GEMM"
|
||||
|
||||
int profile_grouped_gemm(int argc, char* argv[])
|
||||
{
|
||||
if(!(argc == 14))
|
||||
{
|
||||
printf("arg1: tensor operation (grouped_gemm: Grouped GEMM)\n");
|
||||
printf("arg1: tensor operation (" OP_NAME ": " OP_DESC ")\n");
|
||||
printf("arg2: data type (0: fp32; 1: fp16; 2: bf16; 3: int8)\n");
|
||||
printf("arg3: matrix layout (0: A[m, k] * B[k, n] = C[m, n];\n");
|
||||
printf(" 1: A[m, k] * B[n, k] = C[m, n];\n");
|
||||
@@ -161,3 +165,5 @@ int profile_grouped_gemm(int argc, char* argv[])
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
REGISTER_PROFILER_OPERATION(OP_NAME, OP_DESC, profile_grouped_gemm);
|
||||
|
||||
@@ -5,8 +5,9 @@
|
||||
#include <vector>
|
||||
#include <unordered_map>
|
||||
|
||||
#include "profiler/include/data_type_enum.hpp"
|
||||
#include "profiler/include/profile_groupnorm_impl.hpp"
|
||||
#include "profiler/data_type_enum.hpp"
|
||||
#include "profiler/profile_groupnorm_impl.hpp"
|
||||
#include "profiler_operation_registry.hpp"
|
||||
|
||||
using ck::index_t;
|
||||
|
||||
@@ -43,9 +44,12 @@ struct GroupnormArgParser
|
||||
}
|
||||
};
|
||||
|
||||
#define OP_NAME "groupnorm"
|
||||
#define OP_DESC "Group Normalization"
|
||||
|
||||
void print_help_groupnorm()
|
||||
{
|
||||
std::cout << "arg1: tensor operation (groupnorm: Group normalization)\n"
|
||||
std::cout << "arg1: tensor operation (" OP_NAME ": " OP_DESC ")\n"
|
||||
<< "arg2: data type (0: fp16; 1: fp32)\n"
|
||||
<< "arg3: verification (0: no; 1: yes)\n"
|
||||
<< "arg4: initialization (0: no init; 1: integer value; 2: decimal value)\n"
|
||||
@@ -104,3 +108,5 @@ int profile_groupnorm(int argc, char* argv[])
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
REGISTER_PROFILER_OPERATION(OP_NAME, OP_DESC, profile_groupnorm);
|
||||
|
||||
@@ -5,8 +5,9 @@
|
||||
#include <vector>
|
||||
#include <unordered_map>
|
||||
|
||||
#include "profiler/include/data_type_enum.hpp"
|
||||
#include "profiler/include/profile_layernorm_impl.hpp"
|
||||
#include "profiler/data_type_enum.hpp"
|
||||
#include "profiler/profile_layernorm_impl.hpp"
|
||||
#include "profiler_operation_registry.hpp"
|
||||
|
||||
using ck::index_t;
|
||||
|
||||
@@ -96,3 +97,5 @@ int profile_layernorm(int argc, char* argv[])
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
REGISTER_PROFILER_OPERATION("layernorm", "Layer Normalization", profile_layernorm);
|
||||
|
||||
@@ -13,8 +13,9 @@
|
||||
|
||||
#include "ck/library/utility/host_common_util.hpp"
|
||||
|
||||
#include "profiler/include/profile_reduce_impl.hpp"
|
||||
#include "profiler/include/data_type_enum.hpp"
|
||||
#include "profiler/profile_reduce_impl.hpp"
|
||||
#include "profiler/data_type_enum.hpp"
|
||||
#include "profiler_operation_registry.hpp"
|
||||
|
||||
using namespace std;
|
||||
|
||||
@@ -429,3 +430,5 @@ int profile_reduce(int argc, char* argv[])
|
||||
|
||||
return (0);
|
||||
};
|
||||
|
||||
REGISTER_PROFILER_OPERATION("reduce", "Reduce", profile_reduce);
|
||||
|
||||
@@ -5,7 +5,8 @@
|
||||
#include <vector>
|
||||
#include <unordered_map>
|
||||
|
||||
#include "profiler/include/profile_softmax_impl.hpp"
|
||||
#include "profiler/profile_softmax_impl.hpp"
|
||||
#include "profiler_operation_registry.hpp"
|
||||
|
||||
using ck::index_t;
|
||||
using ck::profiler::SoftmaxDataType;
|
||||
@@ -164,3 +165,5 @@ int profile_softmax(int argc, char* argv[])
|
||||
// profile_normalization(argc, argv);
|
||||
// return 0;
|
||||
// }
|
||||
|
||||
REGISTER_PROFILER_OPERATION("softmax", "Softmax", profile_softmax);
|
||||
|
||||
@@ -1,56 +1,14 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
#include <cstring>
|
||||
#include <cstdlib>
|
||||
#include <iostream>
|
||||
|
||||
int profile_gemm(int, char*[]);
|
||||
int profile_gemm_splitk(int, char*[]);
|
||||
int profile_gemm_bilinear(int, char*[]);
|
||||
int profile_gemm_add_add_fastgelu(int, char*[]);
|
||||
int profile_gemm_reduce(int, char*[]);
|
||||
int profile_gemm_bias_add_reduce(int, char*[]);
|
||||
int profile_batched_gemm(int, char*[]);
|
||||
int profile_batched_gemm_gemm(int, char*[]);
|
||||
int profile_batched_gemm_add_relu_gemm_add(int, char*[]);
|
||||
int profile_batched_gemm_reduce(int, char*[]);
|
||||
int profile_grouped_gemm(int, char*[]);
|
||||
int profile_conv_fwd(int, char*[]);
|
||||
int profile_conv_fwd_bias_relu(int, char*[]);
|
||||
int profile_conv_fwd_bias_relu_add(int, char*[]);
|
||||
int profile_conv_bwd_data(int, char*[]);
|
||||
int profile_grouped_conv_fwd(int, char*[]);
|
||||
int profile_grouped_conv_bwd_weight(int, char*[]);
|
||||
int profile_softmax(int, char*[]);
|
||||
int profile_layernorm(int, char*[]);
|
||||
int profile_groupnorm(int, char*[]);
|
||||
int profile_reduce(int, char*[]);
|
||||
int profile_batchnorm_forward(int, char*[]);
|
||||
int profile_batchnorm_backward(int, char*[]);
|
||||
#include "profiler_operation_registry.hpp"
|
||||
|
||||
static void print_helper_message()
|
||||
{
|
||||
// clang-format off
|
||||
printf("arg1: tensor operation (gemm: GEMM\n"
|
||||
" gemm_splitk: Split-K GEMM\n"
|
||||
" gemm_bilinear: GEMM+Bilinear\n"
|
||||
" gemm_add_add_fastgelu: GEMM+Add+Add+FastGeLU\n"
|
||||
" gemm_reduce: GEMM+Reduce\n"
|
||||
" gemm_bias_add_reduce: GEMM+Bias+Add+Reduce\n"
|
||||
" batched_gemm: Batched GEMM\n"
|
||||
" batched_gemm_gemm: Batched+GEMM+GEMM\n"
|
||||
" batched_gemm_add_relu_gemm_add: Batched+GEMM+bias+gelu+GEMM+bias\n"
|
||||
" batched_gemm_reduce: Batched GEMM+Reduce\n"
|
||||
" grouped_gemm: Grouped GEMM\n"
|
||||
" conv_fwd: Convolution Forward\n"
|
||||
" conv_fwd_bias_relu: ForwardConvolution+Bias+ReLU\n"
|
||||
" conv_fwd_bias_relu_add: ForwardConvolution+Bias+ReLU+Add\n"
|
||||
" conv_bwd_data: Convolution Backward Data\n"
|
||||
" grouped_conv_fwd: Grouped Convolution Forward\n"
|
||||
" grouped_conv_bwd_weight: Grouped Convolution Backward Weight\n"
|
||||
" softmax: Softmax\n"
|
||||
" reduce: Reduce\n"
|
||||
" bnorm_fwd: Batchnorm forward\n");
|
||||
// clang-format on
|
||||
std::cout << "arg1: tensor operation " << ProfilerOperationRegistry::GetInstance() << std::endl;
|
||||
}
|
||||
|
||||
int main(int argc, char* argv[])
|
||||
@@ -58,105 +16,15 @@ int main(int argc, char* argv[])
|
||||
if(argc == 1)
|
||||
{
|
||||
print_helper_message();
|
||||
|
||||
return 0;
|
||||
}
|
||||
else if(strcmp(argv[1], "gemm") == 0)
|
||||
else if(const auto operation = ProfilerOperationRegistry::GetInstance().Get(argv[1]);
|
||||
operation.has_value())
|
||||
{
|
||||
return profile_gemm(argc, argv);
|
||||
}
|
||||
else if(strcmp(argv[1], "gemm_splitk") == 0)
|
||||
{
|
||||
return profile_gemm_splitk(argc, argv);
|
||||
}
|
||||
else if(strcmp(argv[1], "gemm_bilinear") == 0)
|
||||
{
|
||||
return profile_gemm_bilinear(argc, argv);
|
||||
}
|
||||
else if(strcmp(argv[1], "gemm_add_add_fastgelu") == 0)
|
||||
{
|
||||
return profile_gemm_add_add_fastgelu(argc, argv);
|
||||
}
|
||||
else if(strcmp(argv[1], "gemm_reduce") == 0)
|
||||
{
|
||||
return profile_gemm_reduce(argc, argv);
|
||||
}
|
||||
else if(strcmp(argv[1], "gemm_bias_add_reduce") == 0)
|
||||
{
|
||||
return profile_gemm_bias_add_reduce(argc, argv);
|
||||
}
|
||||
else if(strcmp(argv[1], "batched_gemm") == 0)
|
||||
{
|
||||
return profile_batched_gemm(argc, argv);
|
||||
}
|
||||
else if(strcmp(argv[1], "batched_gemm_gemm") == 0)
|
||||
{
|
||||
return profile_batched_gemm_gemm(argc, argv);
|
||||
}
|
||||
else if(strcmp(argv[1], "batched_gemm_add_relu_gemm_add") == 0)
|
||||
{
|
||||
return profile_batched_gemm_add_relu_gemm_add(argc, argv);
|
||||
}
|
||||
else if(strcmp(argv[1], "batched_gemm_reduce") == 0)
|
||||
{
|
||||
return profile_batched_gemm_reduce(argc, argv);
|
||||
}
|
||||
else if(strcmp(argv[1], "grouped_gemm") == 0)
|
||||
{
|
||||
return profile_grouped_gemm(argc, argv);
|
||||
}
|
||||
else if(strcmp(argv[1], "conv_fwd") == 0)
|
||||
{
|
||||
return profile_conv_fwd(argc, argv);
|
||||
}
|
||||
else if(strcmp(argv[1], "conv_fwd_bias_relu") == 0)
|
||||
{
|
||||
return profile_conv_fwd_bias_relu(argc, argv);
|
||||
}
|
||||
else if(strcmp(argv[1], "conv_fwd_bias_relu_add") == 0)
|
||||
{
|
||||
return profile_conv_fwd_bias_relu_add(argc, argv);
|
||||
}
|
||||
else if(strcmp(argv[1], "conv_bwd_data") == 0)
|
||||
{
|
||||
return profile_conv_bwd_data(argc, argv);
|
||||
}
|
||||
else if(strcmp(argv[1], "grouped_conv_fwd") == 0)
|
||||
{
|
||||
return profile_grouped_conv_fwd(argc, argv);
|
||||
}
|
||||
else if(strcmp(argv[1], "conv_bwd_weight") == 0)
|
||||
{
|
||||
return profile_grouped_conv_bwd_weight(argc, argv);
|
||||
}
|
||||
else if(strcmp(argv[1], "reduce") == 0)
|
||||
{
|
||||
return profile_reduce(argc, argv);
|
||||
}
|
||||
else if(strcmp(argv[1], "softmax") == 0)
|
||||
{
|
||||
return profile_softmax(argc, argv);
|
||||
}
|
||||
else if(strcmp(argv[1], "layernorm") == 0)
|
||||
{
|
||||
return profile_layernorm(argc, argv);
|
||||
}
|
||||
else if(strcmp(argv[1], "groupnorm") == 0)
|
||||
{
|
||||
return profile_groupnorm(argc, argv);
|
||||
}
|
||||
else if(strcmp(argv[1], "bnorm_fwd") == 0)
|
||||
{
|
||||
return profile_batchnorm_forward(argc, argv);
|
||||
}
|
||||
else if(strcmp(argv[1], "bnorm_bwd") == 0)
|
||||
{
|
||||
return profile_batchnorm_backward(argc, argv);
|
||||
return (*operation)(argc, argv);
|
||||
}
|
||||
else
|
||||
{
|
||||
print_helper_message();
|
||||
|
||||
return 0;
|
||||
std::cerr << "cannot find operation: " << argv[1] << std::endl;
|
||||
return EXIT_FAILURE;
|
||||
}
|
||||
}
|
||||
|
||||
79
profiler/src/profiler_operation_registry.hpp
Normal file
79
profiler/src/profiler_operation_registry.hpp
Normal file
@@ -0,0 +1,79 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
#include <functional>
|
||||
#include <iostream>
|
||||
#include <iterator>
|
||||
#include <map>
|
||||
#include <optional>
|
||||
#include <string_view>
|
||||
#include <utility>
|
||||
|
||||
// Process-wide registry that maps an operation name (the first command-line argument of
// ckProfiler) to a short description and to the function implementing the operation.
//
// The registry stores names and descriptions as std::string_view, i.e. it does NOT copy the
// characters. Callers must keep the underlying strings alive for as long as the registry is
// used; string literals satisfy this requirement.
class ProfilerOperationRegistry final
{
    // Construction and destruction are private: the only instance is the one handed out by
    // GetInstance().
    ProfilerOperationRegistry()  = default;
    ~ProfilerOperationRegistry() = default;

    public:
    // Signature shared by every profiler operation: receives main()'s argc/argv and returns
    // the process exit code.
    using Operation = std::function<int(int, char*[])>;

    // The registry is a singleton; copying it would create a second, detached registry.
    ProfilerOperationRegistry(const ProfilerOperationRegistry&)            = delete;
    ProfilerOperationRegistry& operator=(const ProfilerOperationRegistry&) = delete;

    private:
    // Value stored for each registered operation name.
    struct Entry final
    {
        explicit Entry(std::string_view description, Operation operation) noexcept
            : description_(description), operation_(std::move(operation))
        {
        }

        std::string_view description_; // human readable text shown in the help message
        Operation operation_;          // callable invoked when the operation is selected
    };

    // Ordered by name, so the help listing is printed in lexicographic order.
    std::map<std::string_view, Entry> entries_;

    // Prints all registered operations as
    //   {
    //   <TAB>name: description
    //   ...
    //   }
    // Defined inside the class so it is only reachable through argument-dependent lookup.
    friend std::ostream& operator<<(std::ostream& stream, const ProfilerOperationRegistry& registry)
    {
        stream << "{\n";
        for(const auto& [name, entry] : registry.entries_)
        {
            stream << "\t" << name << ": " << entry.description_ << "\n";
        }
        stream << "}";

        return stream;
    }

    public:
    // Returns the single registry instance (a function-local static, created on first use).
    static ProfilerOperationRegistry& GetInstance()
    {
        static ProfilerOperationRegistry registry;
        return registry;
    }

    // Looks up the operation registered under `name`.
    // Returns a copy of the callable, or std::nullopt when no such operation exists.
    std::optional<Operation> Get(std::string_view name) const
    {
        if(const auto found = entries_.find(name); found != entries_.end())
        {
            return found->second.operation_;
        }

        return std::nullopt;
    }

    // Registers `operation` under `name` together with its `description`.
    // Returns true when the entry was inserted. Returns false, leaving the registry
    // untouched, when `name` is already taken or when `operation` is empty (an empty
    // std::function would throw std::bad_function_call once it is dispatched).
    bool Add(std::string_view name, std::string_view description, Operation operation)
    {
        if(!operation)
        {
            return false;
        }

        // try_emplace constructs the Entry in place and does nothing if the key exists.
        return entries_.try_emplace(name, description, std::move(operation)).second;
    }
};
|
||||
|
||||
// Token pasting helpers. The extra level of indirection lets macro arguments such as
// __COUNTER__ expand before `##` glues the two tokens together.
#define PP_CONCAT(x, y) PP_CONCAT_IMPL(x, y)
#define PP_CONCAT_IMPL(x, y) x##y

// Registers `operation` under `name` with `description` in the ProfilerOperationRegistry
// during static initialization of the translation unit that uses this macro.
// The result of Add() is stored in a uniquely named (via __COUNTER__) file-local flag whose
// only purpose is to trigger the call; it is never read, hence [[maybe_unused]].
// Usage: REGISTER_PROFILER_OPERATION("softmax", "Softmax", profile_softmax);
#define REGISTER_PROFILER_OPERATION(name, description, operation)                               \
    [[maybe_unused]] static const bool PP_CONCAT(operation_registration_result_, __COUNTER__) = \
        ::ProfilerOperationRegistry::GetInstance().Add(name, description, operation)
|
||||
@@ -1,5 +1,6 @@
|
||||
include_directories(BEFORE
|
||||
${PROJECT_SOURCE_DIR}/
|
||||
${PROJECT_SOURCE_DIR}/profiler/include
|
||||
)
|
||||
|
||||
include(googletest)
|
||||
|
||||
@@ -3,7 +3,7 @@
|
||||
|
||||
#include <iostream>
|
||||
|
||||
#include "profiler/include/profile_batched_gemm_impl.hpp"
|
||||
#include "profiler/profile_batched_gemm_impl.hpp"
|
||||
|
||||
namespace {
|
||||
using ADataType = ck::bhalf_t;
|
||||
|
||||
@@ -3,7 +3,7 @@
|
||||
|
||||
#include <iostream>
|
||||
|
||||
#include "profiler/include/profile_batched_gemm_impl.hpp"
|
||||
#include "profiler/profile_batched_gemm_impl.hpp"
|
||||
|
||||
namespace {
|
||||
using ADataType = ck::half_t;
|
||||
|
||||
@@ -3,7 +3,7 @@
|
||||
|
||||
#include <iostream>
|
||||
|
||||
#include "profiler/include/profile_batched_gemm_impl.hpp"
|
||||
#include "profiler/profile_batched_gemm_impl.hpp"
|
||||
|
||||
namespace {
|
||||
using ADataType = float;
|
||||
|
||||
@@ -3,7 +3,7 @@
|
||||
|
||||
#include <iostream>
|
||||
|
||||
#include "profiler/include/profile_batched_gemm_impl.hpp"
|
||||
#include "profiler/profile_batched_gemm_impl.hpp"
|
||||
|
||||
namespace {
|
||||
using ADataType = int8_t;
|
||||
|
||||
@@ -6,7 +6,7 @@
|
||||
#include <vector>
|
||||
#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/impl/device_batched_gemm_gemm_xdl_cshuffle.hpp"
|
||||
#include "profiler/include/profile_batched_gemm_gemm_impl.hpp"
|
||||
#include "profiler/profile_batched_gemm_gemm_impl.hpp"
|
||||
|
||||
using ck::tensor_operation::device::GemmSpecialization;
|
||||
|
||||
|
||||
@@ -3,7 +3,7 @@
|
||||
|
||||
#include <iostream>
|
||||
|
||||
#include "profiler/include/profile_batched_gemm_reduce_impl.hpp"
|
||||
#include "profiler/profile_batched_gemm_reduce_impl.hpp"
|
||||
|
||||
int main()
|
||||
{
|
||||
|
||||
@@ -6,7 +6,7 @@
|
||||
#include <vector>
|
||||
#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/impl/device_batched_gemm_softmax_gemm_xdl_cshuffle.hpp"
|
||||
#include "profiler/include/profile_batched_gemm_softmax_gemm_impl.hpp"
|
||||
#include "profiler/profile_batched_gemm_softmax_gemm_impl.hpp"
|
||||
using ck::tensor_operation::device::GemmSpecialization;
|
||||
|
||||
template <ck::index_t N>
|
||||
|
||||
@@ -7,7 +7,7 @@
|
||||
#include "ck/ck.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/impl/device_batched_gemm_softmax_gemm_permute_xdl_cshuffle.hpp"
|
||||
#include "profiler/include/profile_batched_gemm_softmax_gemm_permute_impl.hpp"
|
||||
#include "profiler/profile_batched_gemm_softmax_gemm_permute_impl.hpp"
|
||||
|
||||
using ck::tensor_operation::device::GemmSpecialization;
|
||||
using ck::tensor_operation::device::MaskingSpecialization;
|
||||
|
||||
@@ -8,7 +8,7 @@
|
||||
#include <tuple>
|
||||
#include <gtest/gtest.h>
|
||||
|
||||
#include "profiler/include/profile_batchnorm_backward_impl.hpp"
|
||||
#include "profiler/profile_batchnorm_backward_impl.hpp"
|
||||
|
||||
using F16 = ck::half_t;
|
||||
using F32 = float;
|
||||
|
||||
@@ -8,7 +8,7 @@
|
||||
#include <tuple>
|
||||
#include <gtest/gtest.h>
|
||||
|
||||
#include "profiler/include/profile_batchnorm_forward_impl.hpp"
|
||||
#include "profiler/profile_batchnorm_forward_impl.hpp"
|
||||
|
||||
using F16 = ck::half_t;
|
||||
using F32 = float;
|
||||
|
||||
@@ -8,7 +8,7 @@
|
||||
#include <tuple>
|
||||
#include <gtest/gtest.h>
|
||||
|
||||
#include "profiler/include/profile_conv_bwd_data_impl.hpp"
|
||||
#include "profiler/profile_conv_bwd_data_impl.hpp"
|
||||
|
||||
template <typename Tuple>
|
||||
class TestConvndBwdData : public ::testing::Test
|
||||
|
||||
@@ -8,7 +8,7 @@
|
||||
#include <tuple>
|
||||
#include <gtest/gtest.h>
|
||||
|
||||
#include "profiler/include/profile_conv_fwd_impl.hpp"
|
||||
#include "profiler/profile_conv_fwd_impl.hpp"
|
||||
|
||||
template <typename Tuple>
|
||||
class TestConvndFwd : public ::testing::Test
|
||||
|
||||
@@ -2,7 +2,7 @@
|
||||
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
#include "gtest/gtest.h"
|
||||
#include "profiler/include/profile_elementwise_layernorm_impl.hpp"
|
||||
#include "profiler/profile_elementwise_layernorm_impl.hpp"
|
||||
|
||||
using F16 = ck::half_t;
|
||||
using F32 = float;
|
||||
|
||||
@@ -3,7 +3,7 @@
|
||||
|
||||
#include <iostream>
|
||||
|
||||
#include "profiler/include/profile_gemm_reduce_impl.hpp"
|
||||
#include "profiler/profile_gemm_reduce_impl.hpp"
|
||||
|
||||
int main()
|
||||
{
|
||||
|
||||
@@ -9,7 +9,7 @@
|
||||
|
||||
#include <gtest/gtest.h>
|
||||
|
||||
#include "profiler/include/profile_grouped_conv_bwd_weight_impl.hpp"
|
||||
#include "profiler/profile_grouped_conv_bwd_weight_impl.hpp"
|
||||
|
||||
template <typename Tuple>
|
||||
class TestGroupedConvndBwdWeight : public ::testing::Test
|
||||
|
||||
@@ -7,7 +7,7 @@
|
||||
#include <vector>
|
||||
#include <gtest/gtest.h>
|
||||
|
||||
#include "profiler/include/profile_grouped_conv_fwd_impl.hpp"
|
||||
#include "profiler/profile_grouped_conv_fwd_impl.hpp"
|
||||
|
||||
class TestGroupedConvNdFwd : public ::testing::Test
|
||||
{
|
||||
|
||||
@@ -3,7 +3,7 @@
|
||||
|
||||
#include <iostream>
|
||||
|
||||
#include "profiler/include/profile_grouped_gemm_impl.hpp"
|
||||
#include "profiler/profile_grouped_gemm_impl.hpp"
|
||||
|
||||
namespace {
|
||||
|
||||
|
||||
@@ -2,7 +2,7 @@
|
||||
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
#include "gtest/gtest.h"
|
||||
#include "profiler/include/profile_groupnorm_impl.hpp"
|
||||
#include "profiler/profile_groupnorm_impl.hpp"
|
||||
|
||||
using F16 = ck::half_t;
|
||||
using F32 = float;
|
||||
|
||||
@@ -2,7 +2,7 @@
|
||||
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
#include "gtest/gtest.h"
|
||||
#include "profiler/include/profile_groupnorm_impl.hpp"
|
||||
#include "profiler/profile_groupnorm_impl.hpp"
|
||||
|
||||
using F16 = ck::half_t;
|
||||
using F32 = float;
|
||||
|
||||
@@ -2,7 +2,7 @@
|
||||
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
#include "gtest/gtest.h"
|
||||
#include "profiler/include/profile_layernorm_impl.hpp"
|
||||
#include "profiler/profile_layernorm_impl.hpp"
|
||||
|
||||
using F16 = ck::half_t;
|
||||
using F32 = float;
|
||||
|
||||
@@ -2,7 +2,7 @@
|
||||
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
#include "gtest/gtest.h"
|
||||
#include "profiler/include/profile_layernorm_impl.hpp"
|
||||
#include "profiler/profile_layernorm_impl.hpp"
|
||||
|
||||
using F16 = ck::half_t;
|
||||
using F32 = float;
|
||||
|
||||
@@ -4,7 +4,7 @@
|
||||
#include <getopt.h>
|
||||
|
||||
#include "ck/library/utility/host_common_util.hpp"
|
||||
#include "profiler/include/profile_reduce_impl.hpp"
|
||||
#include "profiler/profile_reduce_impl.hpp"
|
||||
|
||||
using namespace ck;
|
||||
|
||||
|
||||
@@ -4,7 +4,7 @@
|
||||
#include <getopt.h>
|
||||
|
||||
#include "ck/library/utility/host_common_util.hpp"
|
||||
#include "profiler/include/profile_reduce_impl.hpp"
|
||||
#include "profiler/profile_reduce_impl.hpp"
|
||||
|
||||
using namespace ck;
|
||||
|
||||
|
||||
@@ -13,7 +13,7 @@
|
||||
#include "ck/tensor_operation/gpu/device/impl/device_softmax_impl.hpp"
|
||||
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
|
||||
#include "include/ck/utility/data_type.hpp"
|
||||
#include "profiler/include/profile_softmax_impl.hpp"
|
||||
#include "profiler/profile_softmax_impl.hpp"
|
||||
|
||||
namespace ck {
|
||||
|
||||
|
||||
Reference in New Issue
Block a user