From 2071869077769563b90bd374ee02124c56c0b63a Mon Sep 17 00:00:00 2001 From: Chao Liu Date: Thu, 14 Apr 2022 00:01:29 +0000 Subject: [PATCH] adding gemm pipeline --- example/01_gemm/gemm_xdl_fp16.cpp | 43 ++++-- include/ck/config.hpp | 2 +- .../gpu/grid/gridwise_gemm_pipeline_v2.hpp | 79 ++++++----- .../grid/gridwise_gemm_xdl_cshuffle_v2.hpp | 17 +-- profiler/CMakeLists.txt | 58 ++++---- profiler/src/profiler.cpp | 128 +++++++++--------- 6 files changed, 172 insertions(+), 155 deletions(-) diff --git a/example/01_gemm/gemm_xdl_fp16.cpp b/example/01_gemm/gemm_xdl_fp16.cpp index 2d5a95e400..78505f8b9e 100644 --- a/example/01_gemm/gemm_xdl_fp16.cpp +++ b/example/01_gemm/gemm_xdl_fp16.cpp @@ -11,9 +11,10 @@ #include "host_tensor.hpp" #include "host_tensor_generator.hpp" #include "device_tensor.hpp" -#include "device_gemm_xdl.hpp" -#include "device_gemm_xdl_c_shuffle.hpp" -#include "device_gemm_xdl_cshuffle.hpp" +//#include "device_gemm_xdl.hpp" +//#include "device_gemm_xdl_c_shuffle.hpp" +//#include "device_gemm_xdl_cshuffle.hpp" +#include "device_gemm_xdl_cshuffle_v2.hpp" #include "element_wise_operation.hpp" #include "reference_gemm.hpp" #include "gemm_specialization.hpp" @@ -42,15 +43,39 @@ using AElementOp = ck::tensor_operation::element_wise::PassThrough; using BElementOp = ck::tensor_operation::element_wise::PassThrough; using CElementOp = ck::tensor_operation::element_wise::PassThrough; -static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization::Default; +static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization::Default; +static constexpr auto GemmMNPadding = ck::tensor_operation::device::GemmSpecialization::MNPadding; // clang-format off +#if 0 using DeviceGemmInstance = ck::tensor_operation::device::DeviceGemm_Xdl_CShuffle -//######| ALayout| BLayout| CLayout|AData| BData| CData| GemmAcc| CShuffle| A| B| C| GEMM| NumGemmK| Block| MPer| NPer| KPer| AK1| BK1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer| -//######| | | | Type| Type| Type| DataType| DataType| Elementwise| Elementwise| Elementwise| Spacialization| Prefetch| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| ExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| ExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MPerBlock| ScalarPerVector| -//######| | | | | | | | | Operation| Operation| Operation| | Stage| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NPerBlock| _NPerBlock| -//######| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | - < Row, Col, Row, F16, F16, F16, F32, F32, AElementOp, BElementOp, CElementOp, GemmDefault, 1, 256, 256, 128, 32, 8, 8, 32, 32, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8>; +//######| ALayout| BLayout| CLayout| AData| BData| CData| AccData| CShuffle| A| B| C| GEMM| NumGemmK| Block| MPer| NPer| KPer| AK1| BK1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer| +//######| | | | Type| Type| Type| Type| DataType| Elementwise| Elementwise| Elementwise| Spacialization| Prefetch| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MWaveMPerXdl| ScalarPerVector| +//######| | | | | | | | | Operation| Operation| Operation| | Stage| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| _NWaveNPerXdl| +//######| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | + //< Row, Col, Row, F16, F16, F16, F32, F32, AElementOp, BElementOp, CElementOp, GemmDefault, 1, 256, 256, 128, 32, 8, 8, 32, 32, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8>; +// // 1-stage prefetch + < Row, Col, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmDefault, 1, 256, 128, 128, 32, 8, 8, 32, 32, 2, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8>; +// // 2-stage prefetch +// < Row, Col, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmDefault, 2, 256, 128, 128, 32, 8, 8, 32, 32, 2, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8>; +#elif 1 +using DeviceGemmInstance = ck::tensor_operation::device::DeviceGemm_Xdl_CShuffle_v2 +//######| ALayout| BLayout| CLayout| AData| BData| CData| AccData| CShuffle| A| B| C| GEMM| NumGemmK| ABBlockTransfer| BlockGemm| MPer| NPer| KPer| AK1| BK1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer| +//######| | | | Type| Type| Type| Type| DataType| Elementwise| Elementwise| Elementwise| Spacialization| Prefetch| ThreadGroupSize| ThreadGroupSize| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MWaveMPerXdl| ScalarPerVector| +//######| | | | | | | | | Operation| Operation| Operation| | Stage| | | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| _NWaveNPerXdl| +//######| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | + < Row, Col, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmDefault, 1, 256, 256, 128, 128, 32, 8, 8, 32, 32, 2, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 64, 1, 8>, 8>; +// < Row, Col, Row, F16, F16, F16, F32, F16, AElementOp, BElementOp, CElementOp, GemmDefault, 1, 256, 256, 256, 128, 32, 8, 8, 32, 32, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 64, 1, 8>, 8>; +#elif 1 +using DeviceGemmInstance = ck::tensor_operation::device::DeviceGemmXdl +//######| AData| BData| CData| AccData| ALayout| BLayout| CLayout| A| B| C| GEMM| Block| MPer| NPer| K0Per| K1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CThreadTransfer| CThreadTransfer| +//######| Type| Type| Type| Type| | | | Elementwise| Elementwise| Elementwise|Spacialization| Size| Block| Block| Block| | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| SrcDstVectorDim| DstScalar| +//######| | | | | | | | Operation| Operation| Operation| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | | PerVector| +//######| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | + < F16, F16, F16, F32, Row, Col, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 256, 128, 128, 4, 8, 32, 32, 2, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 7, 1>; +// < F16, F16, F16, F32, Row, Col, Row, PassThrough, PassThrough, PassThrough, GemmMNPadding, 256, 128, 144, 8, 8, 16, 16, 2, 9, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<8, 8, 4>, S<1, 0, 2>, S<1, 0, 2>, 2, 2, 2, true, 7, 1>; +// < F16, F16, F16, F32, Row, Col, Row, PassThrough, PassThrough, PassThrough, GemmMNPadding, 256, 128, 144, 4, 8, 16, 16, 2, 9, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 16, 4>, S<1, 0, 2>, S<1, 0, 2>, 2, 2, 2, true, 7, 1>; +#endif // clang-format on using ReferenceGemmInstance = ck::tensor_operation::host:: diff --git a/include/ck/config.hpp b/include/ck/config.hpp index 13a0edeee9..22e2ac943e 100644 --- a/include/ck/config.hpp +++ b/include/ck/config.hpp @@ -14,7 +14,7 @@ #define CK_USE_LAUNCH_BOUNDS 1 #ifdef CK_USE_LAUNCH_BOUNDS -#define CK_MAX_THREAD_PER_BLOCK 256 +#define CK_MAX_THREAD_PER_BLOCK 512 #define CK_MIN_BLOCK_PER_CU 1 #endif diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_pipeline_v2.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_pipeline_v2.hpp index ca3bf166fe..8507084fb1 100644 --- a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_pipeline_v2.hpp +++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_pipeline_v2.hpp @@ -71,35 +71,35 @@ struct GridwiseGemmPipeline_v2 2 @@ -116,18 +116,18 @@ struct GridwiseGemmPipeline_v2; -#if 1 - using ABBlockTransferThreadGroup = ThisThreadBlock; - using BlockGemmThreadGroup = ThisThreadBlock; - using CShuffleBlockTransferThreadGroup = ThisThreadBlock; -#else struct ABBlockTransferThreadGroup { __device__ static constexpr index_t GetNumOfThread() @@ -157,7 +151,6 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_xdl_cshuffle_v2 }; using CShuffleBlockTransferThreadGroup = ThisThreadBlock; -#endif __host__ __device__ static constexpr auto GetABlockDescriptor_AK0PerBlock_MPerBlock_AK1() { @@ -494,7 +487,7 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_xdl_cshuffle_v2 (a_grid_desc_ak0_m_ak1.GetLength(I0) * a_grid_desc_ak0_m_ak1.GetLength(I2)) / KPerBlock); -#if 1 +#if 0 // gridwise GEMM pipeline const auto gridwise_gemm_pipeline = GridwiseGemmPipeline_v1, @@ -667,9 +660,9 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_xdl_cshuffle_v2 // shuffle: blockwise copy C from LDS to global auto c_shuffle_block_copy_lds_to_global = ThreadGroupTensorSliceTransfer_v6r1< - ThisThreadBlock, // ThreadGroup - CElementwiseOperation, // ElementwiseOperation, - CGlobalMemoryDataOperation, // DstInMemOp, + CShuffleBlockTransferThreadGroup, // ThreadGroup + CElementwiseOperation, // ElementwiseOperation, + CGlobalMemoryDataOperation, // DstInMemOp, Sequence<1, CShuffleMXdlPerWavePerShuffle * MWave * MPerXdl, 1, diff --git a/profiler/CMakeLists.txt b/profiler/CMakeLists.txt index a2cf6eeb62..861dcaecea 100644 --- a/profiler/CMakeLists.txt +++ b/profiler/CMakeLists.txt @@ -24,38 +24,38 @@ include_directories(BEFORE set(PROFILER_SOURCE src/profiler.cpp src/profile_gemm.cpp - src/profile_gemm_bias_2d.cpp - src/profile_gemm_bias_relu.cpp - src/profile_gemm_bias_relu_add.cpp - src/profile_gemm_reduce.cpp - src/profile_batched_gemm.cpp - src/profile_conv_fwd.cpp - src/profile_conv_fwd_bias_relu.cpp - src/profile_conv_fwd_bias_relu_add.cpp - src/profile_conv_fwd_bias_relu_atomic_add.cpp - src/profile_convnd_bwd_data.cpp - src/profile_reduce.cpp - src/profile_grouped_gemm.cpp - src/profile_conv_bwd_weight.cpp - src/profile_batched_gemm_reduce.cpp +# src/profile_gemm_bias_2d.cpp +# src/profile_gemm_bias_relu.cpp +# src/profile_gemm_bias_relu_add.cpp +# src/profile_gemm_reduce.cpp +# src/profile_batched_gemm.cpp +# src/profile_conv_fwd.cpp +# src/profile_conv_fwd_bias_relu.cpp +# src/profile_conv_fwd_bias_relu_add.cpp +# src/profile_conv_fwd_bias_relu_atomic_add.cpp +# src/profile_convnd_bwd_data.cpp +# src/profile_reduce.cpp +# src/profile_grouped_gemm.cpp +# src/profile_conv_bwd_weight.cpp +# src/profile_batched_gemm_reduce.cpp ) add_executable(ckProfiler ${PROFILER_SOURCE}) target_link_libraries(ckProfiler PRIVATE host_tensor) -target_link_libraries(ckProfiler PRIVATE device_gemm_reduce_instance) +#target_link_libraries(ckProfiler PRIVATE device_gemm_reduce_instance) target_link_libraries(ckProfiler PRIVATE device_gemm_instance) -target_link_libraries(ckProfiler PRIVATE device_gemm_bias2d_instance) -target_link_libraries(ckProfiler PRIVATE device_gemm_bias_relu_instance) -target_link_libraries(ckProfiler PRIVATE device_gemm_bias_relu_add_instance) -target_link_libraries(ckProfiler PRIVATE device_batched_gemm_instance) -target_link_libraries(ckProfiler PRIVATE device_conv2d_fwd_instance) -target_link_libraries(ckProfiler PRIVATE device_conv2d_fwd_bias_relu_instance) -target_link_libraries(ckProfiler PRIVATE device_conv2d_fwd_bias_relu_add_instance) -target_link_libraries(ckProfiler PRIVATE device_conv2d_fwd_bias_relu_atomic_add_instance) -target_link_libraries(ckProfiler PRIVATE device_convnd_bwd_data_instance) -target_link_libraries(ckProfiler PRIVATE device_reduce_instance) -target_link_libraries(ckProfiler PRIVATE device_reduce_instance) -target_link_libraries(ckProfiler PRIVATE device_grouped_gemm_instance) -target_link_libraries(ckProfiler PRIVATE device_conv2d_bwd_weight_instance) -target_link_libraries(ckProfiler PRIVATE device_batched_gemm_reduce_instance) +#target_link_libraries(ckProfiler PRIVATE device_gemm_bias2d_instance) +#target_link_libraries(ckProfiler PRIVATE device_gemm_bias_relu_instance) +#target_link_libraries(ckProfiler PRIVATE device_gemm_bias_relu_add_instance) +#target_link_libraries(ckProfiler PRIVATE device_batched_gemm_instance) +#target_link_libraries(ckProfiler PRIVATE device_conv2d_fwd_instance) +#target_link_libraries(ckProfiler PRIVATE device_conv2d_fwd_bias_relu_instance) +#target_link_libraries(ckProfiler PRIVATE device_conv2d_fwd_bias_relu_add_instance) +#target_link_libraries(ckProfiler PRIVATE device_conv2d_fwd_bias_relu_atomic_add_instance) +#target_link_libraries(ckProfiler PRIVATE device_convnd_bwd_data_instance) +#target_link_libraries(ckProfiler PRIVATE device_reduce_instance) +#target_link_libraries(ckProfiler PRIVATE device_reduce_instance) +#target_link_libraries(ckProfiler PRIVATE device_grouped_gemm_instance) +#target_link_libraries(ckProfiler PRIVATE device_conv2d_bwd_weight_instance) +#target_link_libraries(ckProfiler PRIVATE device_batched_gemm_reduce_instance) diff --git a/profiler/src/profiler.cpp b/profiler/src/profiler.cpp index 3cd454e351..64344e8f8d 100644 --- a/profiler/src/profiler.cpp +++ b/profiler/src/profiler.cpp @@ -26,70 +26,70 @@ int main(int argc, char* argv[]) { return profile_gemm(argc, argv); } - else if(strcmp(argv[1], "gemm_bias_2d") == 0) - { - return profile_gemm_bias_2d(argc, argv); - } - else if(strcmp(argv[1], "gemm_bias_relu") == 0) - { - return profile_gemm_bias_relu(argc, argv); - } - else if(strcmp(argv[1], "gemm_bias_relu_add") == 0) - { - return profile_gemm_bias_relu_add(argc, argv); - } - else if(strcmp(argv[1], "gemm_reduce") == 0) - { - return profile_gemm_reduce(argc, argv); - } - else if(strcmp(argv[1], "batched_gemm") == 0) - { - return profile_batched_gemm(argc, argv); - } - else if(strcmp(argv[1], "batched_gemm_reduce") == 0) - { - return profile_batched_gemm_reduce(argc, argv); - } - else if(strcmp(argv[1], "grouped_gemm") == 0) - { - profile_grouped_gemm(argc, argv); - } - else if(strcmp(argv[1], "conv_fwd") == 0) - { - return profile_conv_fwd(argc, argv); - } - else if(strcmp(argv[1], "conv_fwd_bias_relu") == 0) - { - return profile_conv_fwd_bias_relu(argc, argv); - } - else if(strcmp(argv[1], "conv_fwd_bias_relu_add") == 0) - { - return profile_conv_fwd_bias_relu_add(argc, argv); - } - else if(strcmp(argv[1], "conv_fwd_bias_relu_atomic_add") == 0) - { - return profile_conv_fwd_bias_relu_atomic_add(argc, argv); - } - else if(strcmp(argv[1], "conv1d_bwd_data") == 0) - { - return profile_convnd_bwd_data(argc, argv, 1); - } - else if(strcmp(argv[1], "conv2d_bwd_data") == 0) - { - return profile_convnd_bwd_data(argc, argv, 2); - } - else if(strcmp(argv[1], "conv3d_bwd_data") == 0) - { - return profile_convnd_bwd_data(argc, argv, 3); - } - else if(strcmp(argv[1], "reduce") == 0) - { - return profile_reduce(argc, argv); - } - else if(strcmp(argv[1], "conv2d_bwd_weight") == 0) - { - return profile_conv_bwd_weight(argc, argv); - } + // else if(strcmp(argv[1], "gemm_bias_2d") == 0) + // { + // return profile_gemm_bias_2d(argc, argv); + // } + // else if(strcmp(argv[1], "gemm_bias_relu") == 0) + // { + // return profile_gemm_bias_relu(argc, argv); + // } + // else if(strcmp(argv[1], "gemm_bias_relu_add") == 0) + // { + // return profile_gemm_bias_relu_add(argc, argv); + // } + // else if(strcmp(argv[1], "gemm_reduce") == 0) + // { + // return profile_gemm_reduce(argc, argv); + // } + // else if(strcmp(argv[1], "batched_gemm") == 0) + // { + // return profile_batched_gemm(argc, argv); + // } + // else if(strcmp(argv[1], "batched_gemm_reduce") == 0) + // { + // return profile_batched_gemm_reduce(argc, argv); + // } + // else if(strcmp(argv[1], "grouped_gemm") == 0) + // { + // profile_grouped_gemm(argc, argv); + // } + // else if(strcmp(argv[1], "conv_fwd") == 0) + // { + // return profile_conv_fwd(argc, argv); + // } + // else if(strcmp(argv[1], "conv_fwd_bias_relu") == 0) + // { + // return profile_conv_fwd_bias_relu(argc, argv); + // } + // else if(strcmp(argv[1], "conv_fwd_bias_relu_add") == 0) + // { + // return profile_conv_fwd_bias_relu_add(argc, argv); + // } + // else if(strcmp(argv[1], "conv_fwd_bias_relu_atomic_add") == 0) + // { + // return profile_conv_fwd_bias_relu_atomic_add(argc, argv); + // } + // else if(strcmp(argv[1], "conv1d_bwd_data") == 0) + // { + // return profile_convnd_bwd_data(argc, argv, 1); + // } + // else if(strcmp(argv[1], "conv2d_bwd_data") == 0) + // { + // return profile_convnd_bwd_data(argc, argv, 2); + // } + // else if(strcmp(argv[1], "conv3d_bwd_data") == 0) + // { + // return profile_convnd_bwd_data(argc, argv, 3); + // } + // else if(strcmp(argv[1], "reduce") == 0) + // { + // return profile_reduce(argc, argv); + // } + // else if(strcmp(argv[1], "conv2d_bwd_weight") == 0) + // { + // return profile_conv_bwd_weight(argc, argv); + // } else { // clang-format off