From 69d5f78b1682fc11c2f19a2e271e663490a2eed3 Mon Sep 17 00:00:00 2001 From: JD Date: Thu, 12 May 2022 09:21:01 -0500 Subject: [PATCH] Add host API (#220) * Add host API * manually rebase on develop * clean * manually rebase on develop * exclude tests from all target * address review comments * update client app name * fix missing lib name * clang-format update * refactor * refactor * refactor * refactor * refactor * fix test issue * refactor * refactor * refactor * upate cmake and readme Co-authored-by: Chao Liu [ROCm/composable_kernel commit: cec69bc3bc200de7e09396579fe33cb153f8afeb] --- CMakeLists.txt | 30 ++- Config.cmake.in | 11 + Dockerfile | 15 +- Jenkinsfile | 19 +- README.md | 10 + cmake/googletest.cmake | 3 +- example/01_gemm/gemm_xdl_bf16.cpp | 14 +- example/01_gemm/gemm_xdl_fp16.cpp | 14 +- example/01_gemm/gemm_xdl_int8.cpp | 14 +- .../gemm_xdl_alpha_beta.cpp | 16 +- .../03_gemm_bias_relu/gemm_xdl_bias_relu.cpp | 14 +- .../gemm_xdl_bias_relu_add.cpp | 14 +- .../conv2d_fwd_xdl_bias_relu.cpp | 12 +- .../conv2d_fwd_xdl_bias_relu_add.cpp | 12 +- example/09_convnd_fwd/convnd_fwd_xdl.cpp | 12 +- example/09_convnd_fwd/convnd_fwd_xdl_fp16.cpp | 12 +- example/09_convnd_fwd/convnd_fwd_xdl_int8.cpp | 12 +- .../conv2d_bwd_data_xdl.cpp | 14 +- .../conv2d_bwd_weight_xdl.cpp | 14 +- example/12_reduce/reduce_blockwise.cpp | 13 +- example/13_pool2d_fwd/pool2d_fwd.cpp | 14 +- .../gemm_xdl_requant_relu_requant_int8.cpp | 14 +- .../15_grouped_gemm/grouped_gemm_xdl_fp16.cpp | 12 +- .../16_gemm_reduce/gemm_reduce_xdl_fp16.cpp | 39 +--- .../convnd_bwd_data_xdl.cpp | 14 +- .../batched_gemm_reduce_xdl_fp16.cpp | 39 +--- include/ck/hip_version.hpp.in | 28 --- include/ck/options.hpp.in | 3 + include/ck/stream_config.hpp | 10 + .../gpu/device/device_base.hpp | 15 +- ...evice_batched_gemm_reduce_xdl_cshuffle.hpp | 94 ++++---- .../gpu/device/device_batched_gemm_xdl.hpp | 15 +- ...rd_weight_xdl_c_shuffle_nhwc_kyxc_nhwk.hpp | 70 ++---- ...ice_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk.hpp | 11 +- ...fle_bias_activation_add_nhwc_kyxc_nhwk.hpp | 11 +- ...shuffle_bias_activation_nhwc_kyxc_nhwk.hpp | 11 +- ...onv2d_fwd_xdl_c_shuffle_nhwc_kyxc_nhwk.hpp | 11 +- .../device_conv2d_fwd_xdl_nhwc_kyxc_nhwk.hpp | 15 +- ...ice_conv3d_fwd_naive_ndhwc_kzyxc_ndhwk.hpp | 11 +- ...evice_conv3d_fwd_xdl_ndhwc_kzyxc_ndhwk.hpp | 15 +- ..._convnd_bwd_data_xdl_ndhwc_kzyxc_ndhwk.hpp | 11 +- .../device_convnd_fwd_xdl_nhwc_kyxc_nhwk.hpp | 15 +- .../device_gemm_reduce_xdl_cshuffle.hpp | 86 +++---- .../gpu/device/device_gemm_xdl.hpp | 15 +- .../device_gemm_xdl_c_shuffle_bias_2d.hpp | 11 +- ...ice_gemm_xdl_c_shuffle_bias_activation.hpp | 11 +- ...gemm_xdl_c_shuffle_bias_activation_add.hpp | 11 +- .../gpu/device/device_gemm_xdl_cshuffle.hpp | 112 +++------ .../gpu/device/device_gemm_xdl_splitk.hpp | 74 +++--- .../device_gemm_xdl_splitk_c_shuffle.hpp | 74 +++--- .../gpu/device/device_grouped_gemm_xdl.hpp | 15 +- .../device/device_pool2d_fwd_nhwc_nhwc.hpp | 11 +- .../gpu/device/device_reduce_blockwise.hpp | 11 +- .../device_reduce_blockwise_second_call.hpp | 13 +- .../device_reduce_multiblock_atomic_add.hpp | 72 +++--- ...evice_reduce_multiblock_partial_reduce.hpp | 13 +- .../gpu/device/device_reduce_threadwise.hpp | 13 +- .../ck/library/host/host_interface.hpp | 54 +++++ .../include/ck/library/host_tensor/device.hpp | 109 +++++---- .../cpu/reference_batched_gemm.hpp | 3 +- .../cpu/reference_conv_backward_weight.hpp | 3 +- .../cpu/reference_conv_bwd_data.hpp | 3 +- .../cpu/reference_conv_fwd.hpp | 9 +- .../reference_conv_fwd_bias_activation.hpp | 3 +- ...reference_conv_fwd_bias_activation_add.hpp | 3 +- .../cpu/reference_gemm.hpp | 3 +- .../cpu/reference_gemm_bias_2d.hpp | 3 +- .../cpu/reference_gemm_bias_activation.hpp | 3 +- .../reference_gemm_bias_activation_add.hpp | 3 +- .../ck/library/utility/op_instance_engine.hpp | 4 +- library/src/host_tensor/CMakeLists.txt | 25 +- library/src/host_tensor/device.cpp | 29 ++- .../gpu/CMakeLists.txt | 73 +++++- .../gpu/batched_gemm/CMakeLists.txt | 6 +- .../gpu/batched_gemm_reduce/CMakeLists.txt | 5 +- .../gpu/conv1d_fwd/CMakeLists.txt | 6 +- .../gpu/conv2d_bwd_data/CMakeLists.txt | 4 +- .../gpu/conv2d_bwd_weight/CMakeLists.txt | 2 +- .../gpu/conv2d_fwd/CMakeLists.txt | 4 +- .../gpu/conv2d_fwd_bias_relu/CMakeLists.txt | 4 +- .../conv2d_fwd_bias_relu_add/CMakeLists.txt | 4 +- .../CMakeLists.txt | 4 +- .../gpu/conv3d_fwd/CMakeLists.txt | 3 +- .../gpu/convnd_bwd_data/CMakeLists.txt | 2 +- .../gpu/device_conv2d.cpp | 201 ++++++++++++++++ .../gpu/gemm/CMakeLists.txt | 3 +- .../gpu/gemm_bias2d/CMakeLists.txt | 4 +- .../gpu/gemm_bias_relu/CMakeLists.txt | 4 +- .../gpu/gemm_bias_relu_add/CMakeLists.txt | 4 +- .../gpu/grouped_gemm/CMakeLists.txt | 2 +- .../gpu/reduce/CMakeLists.txt | 4 +- .../include/profile_batched_gemm_impl.hpp | 7 +- .../profile_batched_gemm_reduce_impl.hpp | 30 +-- .../include/profile_conv_bwd_data_impl.hpp | 5 +- .../include/profile_conv_bwd_weight_impl.hpp | 10 +- .../profile_conv_fwd_bias_relu_add_impl.hpp | 5 +- ...ile_conv_fwd_bias_relu_atomic_add_impl.hpp | 5 +- .../profile_conv_fwd_bias_relu_impl.hpp | 5 +- .../include/profile_convnd_bwd_data_impl.hpp | 5 +- .../include/profile_gemm_bias_2d_impl.hpp | 5 +- .../profile_gemm_bias_relu_add_impl.hpp | 5 +- .../include/profile_gemm_bias_relu_impl.hpp | 5 +- profiler/include/profile_gemm_impl.hpp | 5 +- profiler/include/profile_gemm_reduce_impl.hpp | 32 +-- .../include/profile_grouped_gemm_impl.hpp | 5 +- profiler/include/profile_reduce_impl.hpp | 15 +- profiler/src/profile_batched_gemm.cpp | 38 ++-- profiler/src/profile_batched_gemm_reduce.cpp | 14 +- profiler/src/profile_conv_bwd_data.cpp | 12 +- profiler/src/profile_conv_bwd_weight.cpp | 6 +- profiler/src/profile_conv_fwd_bias_relu.cpp | 6 +- .../src/profile_conv_fwd_bias_relu_add.cpp | 6 +- .../profile_conv_fwd_bias_relu_atomic_add.cpp | 6 +- profiler/src/profile_convnd_bwd_data.cpp | 10 +- profiler/src/profile_convnd_fwd.cpp | 32 +-- profiler/src/profile_gemm.cpp | 38 ++-- profiler/src/profile_gemm_bias_2d.cpp | 22 +- profiler/src/profile_gemm_bias_relu.cpp | 14 +- profiler/src/profile_gemm_bias_relu_add.cpp | 14 +- profiler/src/profile_gemm_reduce.cpp | 14 +- profiler/src/profile_grouped_gemm.cpp | 14 +- profiler/src/profile_reduce.cpp | 20 +- test/CMakeLists.txt | 5 +- .../batched_gemm_reduce_fp16.cpp | 8 +- test/client_app/CMakeLists.txt | 11 + test/client_app/client_app.cpp | 77 +++++++ test/client_app/client_app_impl.hpp | 214 ++++++++++++++++++ test/conv2d_bwd_weight/conv2d_bwd_weight.cpp | 32 +-- test/convnd_bwd_data/convnd_bwd_data.cpp | 96 ++++---- test/gemm_reduce/gemm_reduce_fp16.cpp | 8 +- test/gemm_split_k/gemm_split_k.cpp | 3 +- 131 files changed, 1666 insertions(+), 1089 deletions(-) create mode 100644 Config.cmake.in delete mode 100644 include/ck/hip_version.hpp.in create mode 100644 include/ck/options.hpp.in create mode 100644 include/ck/stream_config.hpp create mode 100644 library/include/ck/library/host/host_interface.hpp create mode 100644 library/src/tensor_operation_instance/gpu/device_conv2d.cpp create mode 100644 test/client_app/CMakeLists.txt create mode 100644 test/client_app/client_app.cpp create mode 100644 test/client_app/client_app_impl.hpp diff --git a/CMakeLists.txt b/CMakeLists.txt index 2b798e38f3..f18c85c683 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -27,6 +27,8 @@ set(CMAKE_CXX_STANDARD_REQUIRED ON) set(CMAKE_CXX_EXTENSIONS OFF) message("CMAKE_CXX_COMPILER_ID: ${CMAKE_CXX_COMPILER_ID}") +option(CK_TIME_KERNEL "Turning off will disable kernel timing globally" ON) + ## OpenMP if(CMAKE_CXX_COMPILER_ID MATCHES "Clang") # workaround issue hipcc in rocm3.5 cannot find openmp @@ -72,8 +74,9 @@ message(STATUS "Build with HIP ${HIP_VERSION}") rocm_create_package( - NAME CK-${CK_BACKEND} + NAME composablekernel DESCRIPTION "High Performance Composable Kernel for AMD GPUs" + MAINTAINER "MIOpen Kernels Dev Team " LDCONFIG ) @@ -226,7 +229,7 @@ set(CMAKE_LIBRARY_OUTPUT_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/lib) set(CMAKE_ARCHIVE_OUTPUT_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/lib) set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/bin) -configure_file("${PROJECT_SOURCE_DIR}/include/ck/hip_version.hpp.in" "${PROJECT_BINARY_DIR}/include/ck/hip_version.hpp") +configure_file("${PROJECT_SOURCE_DIR}/include/ck/options.hpp.in" "${PROJECT_BINARY_DIR}/include/ck/options.hpp") include_directories(BEFORE ${PROJECT_SOURCE_DIR}/include @@ -234,7 +237,6 @@ include_directories(BEFORE ${PROJECT_SOURCE_DIR}/library/include ) -include(googletest) SET(BUILD_DEV ON CACHE BOOL "BUILD_DEV") if(BUILD_DEV) @@ -247,3 +249,25 @@ add_subdirectory(library) add_subdirectory(example) add_subdirectory(test) add_subdirectory(profiler) + +#Create an interface target for the include only files and call it "composablekernels" +include(CMakePackageConfigHelpers) + +set(version 1.0.0) +write_basic_package_version_file( + "${CMAKE_CURRENT_BINARY_DIR}/composable_kernelConfigVersion.cmake" + VERSION "${version}" + COMPATIBILITY AnyNewerVersion +) + +configure_package_config_file(${CMAKE_CURRENT_SOURCE_DIR}/Config.cmake.in + "${CMAKE_CURRENT_BINARY_DIR}/composable_kernelConfig.cmake" + INSTALL_DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/composable_kernel + NO_CHECK_REQUIRED_COMPONENTS_MACRO +) + +install(FILES + "${CMAKE_CURRENT_BINARY_DIR}/composable_kernelConfig.cmake" + "${CMAKE_CURRENT_BINARY_DIR}/composable_kernelConfigVersion.cmake" + DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/composable_kernel +) diff --git a/Config.cmake.in b/Config.cmake.in new file mode 100644 index 0000000000..12b5c331ae --- /dev/null +++ b/Config.cmake.in @@ -0,0 +1,11 @@ +@PACKAGE_INIT@ + +set(_composable_kernel_supported_components device_operations host_tensor) + +foreach(_comp ${composable_kernel_FIND_COMPONENTS}) + if(NOT _comp IN_LIST _composable_kernel_supported_components) + set(composable_kernel_FOUND False) + set(composable_kernel_NOT_FOUND_MESSAGE "Unsupported component: ${_comp}") + endif() + include("${CMAKE_CURRENT_LIST_DIR}/composable_kernel${_comp}Targets.cmake") +endforeach() diff --git a/Dockerfile b/Dockerfile index c4cf0fac57..9a443e01de 100644 --- a/Dockerfile +++ b/Dockerfile @@ -11,13 +11,7 @@ ARG DEB_ROCM_REPO=http://repo.radeon.com/rocm/apt/.apt_$ROCMVERSION/ RUN apt-get update RUN apt-get install -y wget gnupg RUN wget -qO - http://repo.radeon.com/rocm/rocm.gpg.key | apt-key add - -RUN if ! [ -z $OSDB_BKC_VERSION ]; then \ - echo "Using BKC VERISION: $OSDB_BKC_VERSION";\ - sh -c "echo deb [arch=amd64 trusted=yes] http://compute-artifactory.amd.com/artifactory/list/rocm-osdb-deb/ compute-rocm-dkms-no-npi-hipclang ${OSDB_BKC_VERSION} > /etc/apt/sources.list.d/rocm.list" ;\ - cat /etc/apt/sources.list.d/rocm.list;\ - else \ - sh -c "echo deb [arch=amd64] $DEB_ROCM_REPO ubuntu main > /etc/apt/sources.list.d/rocm.list" ;\ - fi +RUN sh -c "echo deb [arch=amd64] $DEB_ROCM_REPO ubuntu main > /etc/apt/sources.list.d/rocm.list" RUN wget --no-check-certificate -qO - https://apt.kitware.com/keys/kitware-archive-latest.asc 2>/dev/null | apt-key add - RUN sh -c "echo deb https://apt.kitware.com/ubuntu/ bionic main | tee -a /etc/apt/sources.list" @@ -25,18 +19,15 @@ RUN sh -c "echo deb https://apt.kitware.com/ubuntu/ bionic main | tee -a /etc/ap # Install dependencies RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --allow-unauthenticated \ apt-utils \ - sshpass \ build-essential \ cmake-data=3.15.1-0kitware1 \ cmake=3.15.1-0kitware1 \ curl \ - doxygen \ g++ \ gdb \ git \ hip-rocclr \ jq \ - lcov \ libelf-dev \ libncurses5-dev \ libnuma-dev \ @@ -62,8 +53,6 @@ RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --allow- apt-get clean && \ rm -rf /var/lib/apt/lists/* -# RUN pip3 install --default-timeout=100000 -r requirements.txt - # Setup ubsan environment to printstacktrace RUN ln -s /usr/bin/llvm-symbolizer-3.8 /usr/local/bin/llvm-symbolizer ENV UBSAN_OPTIONS=print_stacktrace=1 @@ -92,5 +81,3 @@ ADD rbuild.ini /rbuild.ini ADD dev-requirements.txt dev-requirements.txt RUN rbuild prepare -s develop -d $PREFIX RUN groupadd -f render -# RUN cget install -f min-requirements.txt -# RUN CXXFLAGS='-isystem $PREFIX/include' cget install -f ./mlir-requirements.txt diff --git a/Jenkinsfile b/Jenkinsfile index f065d4ecc5..77f4d9d8be 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -320,7 +320,7 @@ pipeline { { agent{ label rocmnode("gfx908")} environment{ - setup_args = """ -D CMAKE_CXX_FLAGS="--offload-arch=gfx908 -O3 " -DBUILD_DEV=On """ + setup_args = """ -D CMAKE_CXX_FLAGS=" --offload-arch=gfx900 --offload-arch=gfx906 --offload-arch=gfx908 --offload-arch=gfx90a -O3 " -DBUILD_DEV=On """ } steps{ buildHipClangJobAndReboot(setup_args:setup_args, config_targets: "check", no_reboot:true, build_type: 'Release') @@ -341,6 +341,23 @@ pipeline { } } + stage("Client App") + { + parallel + { + stage("Run Client App") + { + agent{ label rocmnode("gfx908")} + environment{ + setup_args = """ -D -DBUILD_DEV=Off -DCMAKE_INSTALL_PREFIX=../install CMAKE_CXX_FLAGS="--offload-arch=gfx908 -O3 " """ + execute_args = """ cd ../test/client_app && rm -rf build && mkdir build && cd build && cmake -DCMAKE_PREFIX_PATH="${env.WORKSPACE}/install;/opt/rocm" .. && make """ + } + steps{ + buildHipClangJobAndReboot(setup_args: setup_args, config_targets: "install", no_reboot:true, build_type: 'Release', execute_cmd: execute_args, prefixpath: '/usr/local') + } + } + } + } stage("Performance Tests") { parallel diff --git a/README.md b/README.md index f5341b5736..9d7b578046 100644 --- a/README.md +++ b/README.md @@ -43,3 +43,13 @@ Instructions for running each individual examples are under ```example/``` make -j ckProfiler ``` Instructions for running ckProfiler are under ```profiler/``` + + +## Caveat +### Kernel Timing and Verification +CK's own kernel timer will warn up kernel once, and then run it multiple times +to get average kernel time. For some kernels that use atomic add, this will cause +output buffer to be accumulated multiple times, causing verfication failure. +To work around it, do not use CK's own timer and do verification at the same time. +CK's own timer and verification in each example and ckProfiler can be enabled or +disabled from command line. diff --git a/cmake/googletest.cmake b/cmake/googletest.cmake index f869ba483e..959bc4f4b0 100644 --- a/cmake/googletest.cmake +++ b/cmake/googletest.cmake @@ -19,6 +19,7 @@ list(APPEND GTEST_CMAKE_CXX_FLAGS -Wno-zero-as-null-pointer-constant -Wno-unused-member-function -Wno-comma + -Wno-old-style-cast ) message(STATUS "Suppressing googltest warnings with flags: ${GTEST_CMAKE_CXX_FLAGS}") @@ -35,4 +36,4 @@ FetchContent_MakeAvailable(googletest) target_compile_options(gtest PRIVATE ${GTEST_CMAKE_CXX_FLAGS}) target_compile_options(gtest_main PRIVATE ${GTEST_CMAKE_CXX_FLAGS}) target_compile_options(gmock PRIVATE ${GTEST_CMAKE_CXX_FLAGS}) - +target_compile_options(gmock_main PRIVATE ${GTEST_CMAKE_CXX_FLAGS}) diff --git a/example/01_gemm/gemm_xdl_bf16.cpp b/example/01_gemm/gemm_xdl_bf16.cpp index a4567dcd6e..4077a4f8d8 100644 --- a/example/01_gemm/gemm_xdl_bf16.cpp +++ b/example/01_gemm/gemm_xdl_bf16.cpp @@ -88,9 +88,9 @@ using ReferenceGemmInstance = ck::tensor_operation::host:: int main(int argc, char* argv[]) { - bool do_verification = 0; - int init_method = 0; - int nrepeat = 5; + bool do_verification = true; + int init_method = 1; + bool time_kernel = false; // GEMM shape ck::index_t M = 3840; @@ -105,13 +105,13 @@ int main(int argc, char* argv[]) { do_verification = std::stoi(argv[1]); init_method = std::stoi(argv[2]); - nrepeat = std::stoi(argv[3]); + time_kernel = std::stoi(argv[3]); } else if(argc == 10) { do_verification = std::stoi(argv[1]); init_method = std::stoi(argv[2]); - nrepeat = std::stoi(argv[3]); + time_kernel = std::stoi(argv[3]); M = std::stoi(argv[4]); N = std::stoi(argv[5]); @@ -125,7 +125,7 @@ int main(int argc, char* argv[]) { printf("arg1: verification (0=no, 1=yes)\n"); printf("arg2: initialization (0=no init, 1=integer value, 2=decimal value)\n"); - printf("arg3: run kernel # of times (>1)\n"); + printf("arg3: time kernel (0=n0, 1=yes)\n"); printf("arg4 to 9: M (256x), N(128x), K(32x), StrideA, StrideB, StrideC\n"); exit(0); } @@ -198,7 +198,7 @@ int main(int argc, char* argv[]) "not support this GEMM problem"); } - float ave_time = invoker.Run(argument, nrepeat); + float ave_time = invoker.Run(argument, StreamConfig{nullptr, time_kernel}); std::size_t flop = std::size_t(2) * M * N * K; std::size_t num_btype = diff --git a/example/01_gemm/gemm_xdl_fp16.cpp b/example/01_gemm/gemm_xdl_fp16.cpp index fc04a13ca5..4f0228eafe 100644 --- a/example/01_gemm/gemm_xdl_fp16.cpp +++ b/example/01_gemm/gemm_xdl_fp16.cpp @@ -56,9 +56,9 @@ using ReferenceGemmInstance = ck::tensor_operation::host:: int main(int argc, char* argv[]) { - bool do_verification = 0; - int init_method = 0; - int nrepeat = 5; + bool do_verification = true; + int init_method = 1; + bool time_kernel = false; // GEMM shape ck::index_t M = 3840; @@ -73,13 +73,13 @@ int main(int argc, char* argv[]) { do_verification = std::stoi(argv[1]); init_method = std::stoi(argv[2]); - nrepeat = std::stoi(argv[3]); + time_kernel = std::stoi(argv[3]); } else if(argc == 10) { do_verification = std::stoi(argv[1]); init_method = std::stoi(argv[2]); - nrepeat = std::stoi(argv[3]); + time_kernel = std::stoi(argv[3]); M = std::stoi(argv[4]); N = std::stoi(argv[5]); @@ -93,7 +93,7 @@ int main(int argc, char* argv[]) { printf("arg1: verification (0=no, 1=yes)\n"); printf("arg2: initialization (0=no init, 1=integer value, 2=decimal value)\n"); - printf("arg3: run kernel # of times (>1)\n"); + printf("arg3: time kernel (0=n0, 1=yes)\n"); printf("arg4 to 9: M (256x), N(128x), K(32x), StrideA, StrideB, StrideC\n"); exit(0); } @@ -171,7 +171,7 @@ int main(int argc, char* argv[]) "not support this GEMM problem"); } - float ave_time = invoker.Run(argument, nrepeat); + float ave_time = invoker.Run(argument, StreamConfig{nullptr, time_kernel}); std::size_t flop = std::size_t(2) * M * N * K; std::size_t num_btype = diff --git a/example/01_gemm/gemm_xdl_int8.cpp b/example/01_gemm/gemm_xdl_int8.cpp index ab5869db61..d5bf4a8bde 100644 --- a/example/01_gemm/gemm_xdl_int8.cpp +++ b/example/01_gemm/gemm_xdl_int8.cpp @@ -83,9 +83,9 @@ using ReferenceGemmInstance = ck::tensor_operation::host:: int main(int argc, char* argv[]) { - bool do_verification = 0; - int init_method = 0; - int nrepeat = 5; + bool do_verification = true; + int init_method = 1; + bool time_kernel = false; // GEMM shape ck::index_t M = 3840; @@ -100,13 +100,13 @@ int main(int argc, char* argv[]) { do_verification = std::stoi(argv[1]); init_method = std::stoi(argv[2]); - nrepeat = std::stoi(argv[3]); + time_kernel = std::stoi(argv[3]); } else if(argc == 10) { do_verification = std::stoi(argv[1]); init_method = std::stoi(argv[2]); - nrepeat = std::stoi(argv[3]); + time_kernel = std::stoi(argv[3]); M = std::stoi(argv[4]); N = std::stoi(argv[5]); @@ -120,7 +120,7 @@ int main(int argc, char* argv[]) { printf("arg1: verification (0=no, 1=yes)\n"); printf("arg2: initialization (0=no init, 1=integer value, 2=decimal value)\n"); - printf("arg3: run kernel # of times (>1)\n"); + printf("arg3: time kernel (0=n0, 1=yes)\n"); printf("arg4 to 9: M (256x), N(128x), K(32x), StrideA, StrideB, StrideC\n"); exit(0); } @@ -194,7 +194,7 @@ int main(int argc, char* argv[]) "not support this GEMM problem"); } - float ave_time = invoker.Run(argument, nrepeat); + float ave_time = invoker.Run(argument, StreamConfig{nullptr, time_kernel}); std::size_t flop = std::size_t(2) * M * N * K; std::size_t num_btype = diff --git a/example/02_gemm_alpha_beta/gemm_xdl_alpha_beta.cpp b/example/02_gemm_alpha_beta/gemm_xdl_alpha_beta.cpp index 2abebbbac4..451200e798 100644 --- a/example/02_gemm_alpha_beta/gemm_xdl_alpha_beta.cpp +++ b/example/02_gemm_alpha_beta/gemm_xdl_alpha_beta.cpp @@ -86,9 +86,9 @@ using ReferenceGemmInstance = ck::tensor_operation::host::ReferenceGemmBias2D1)\n"); + printf("arg3: time kernel (0=n0, 1=yes)\n"); printf("arg4 to 9: M (256x), N(128x), K(32x), StrideA, StrideB, StrideC, alpha, beta\n"); exit(0); } @@ -216,7 +216,7 @@ int main(int argc, char* argv[]) "not support this GEMM problem"); } - float ave_time = invoker.Run(argument, nrepeat); + float ave_time = invoker.Run(argument, StreamConfig{nullptr, time_kernel}); std::size_t flop = std::size_t(2) * M * N * K; std::size_t num_btype = diff --git a/example/03_gemm_bias_relu/gemm_xdl_bias_relu.cpp b/example/03_gemm_bias_relu/gemm_xdl_bias_relu.cpp index f3ed2bad37..308d423ce7 100644 --- a/example/03_gemm_bias_relu/gemm_xdl_bias_relu.cpp +++ b/example/03_gemm_bias_relu/gemm_xdl_bias_relu.cpp @@ -83,9 +83,9 @@ using ReferenceGemmInstance = ck::tensor_operation::host::ReferenceGemmBiasActiv int main(int argc, char* argv[]) { - bool do_verification = 0; - int init_method = 0; - int nrepeat = 5; + bool do_verification = true; + int init_method = 1; + bool time_kernel = false; // GEMM shape ck::index_t M = 3840; @@ -100,13 +100,13 @@ int main(int argc, char* argv[]) { do_verification = std::stoi(argv[1]); init_method = std::stoi(argv[2]); - nrepeat = std::stoi(argv[3]); + time_kernel = std::stoi(argv[3]); } else if(argc == 10) { do_verification = std::stoi(argv[1]); init_method = std::stoi(argv[2]); - nrepeat = std::stoi(argv[3]); + time_kernel = std::stoi(argv[3]); M = std::stoi(argv[4]); N = std::stoi(argv[5]); @@ -120,7 +120,7 @@ int main(int argc, char* argv[]) { printf("arg1: verification (0=no, 1=yes)\n"); printf("arg2: initialization (0=no init, 1=integer value, 2=decimal value)\n"); - printf("arg3: run kernel # of times (>1)\n"); + printf("arg3: time kernel (0=n0, 1=yes)\n"); printf("arg4 to 9: M (256x), N(128x), K(32x), StrideA, StrideB, StrideC\n"); exit(0); } @@ -206,7 +206,7 @@ int main(int argc, char* argv[]) "not support this GEMM problem"); } - float ave_time = invoker.Run(argument, nrepeat); + float ave_time = invoker.Run(argument, StreamConfig{nullptr, time_kernel}); std::size_t flop = std::size_t(2) * M * N * K; diff --git a/example/04_gemm_bias_relu_add/gemm_xdl_bias_relu_add.cpp b/example/04_gemm_bias_relu_add/gemm_xdl_bias_relu_add.cpp index 9405c36881..012fd21341 100644 --- a/example/04_gemm_bias_relu_add/gemm_xdl_bias_relu_add.cpp +++ b/example/04_gemm_bias_relu_add/gemm_xdl_bias_relu_add.cpp @@ -83,9 +83,9 @@ using ReferenceGemmInstance = CElementOp>; int main(int argc, char* argv[]) { - bool do_verification = 0; - int init_method = 0; - int nrepeat = 5; + bool do_verification = true; + int init_method = 1; + bool time_kernel = false; // GEMM shape ck::index_t M = 3840; @@ -101,13 +101,13 @@ int main(int argc, char* argv[]) { do_verification = std::stoi(argv[1]); init_method = std::stoi(argv[2]); - nrepeat = std::stoi(argv[3]); + time_kernel = std::stoi(argv[3]); } else if(argc == 11) { do_verification = std::stoi(argv[1]); init_method = std::stoi(argv[2]); - nrepeat = std::stoi(argv[3]); + time_kernel = std::stoi(argv[3]); M = std::stoi(argv[4]); N = std::stoi(argv[5]); @@ -122,7 +122,7 @@ int main(int argc, char* argv[]) { printf("arg1: verification (0=no, 1=yes)\n"); printf("arg2: initialization (0=no init, 1=integer value, 2=decimal value)\n"); - printf("arg3: run kernel # of times (>1)\n"); + printf("arg3: time kernel (0=n0, 1=yes)\n"); printf("arg4 to 10: M (256x), N(128x), K(32x), StrideA, StrideB, StrideC, StrideC1\n"); exit(0); } @@ -218,7 +218,7 @@ int main(int argc, char* argv[]) "not support this GEMM problem"); } - float ave_time = invoker.Run(argument, nrepeat); + float ave_time = invoker.Run(argument, StreamConfig{nullptr, time_kernel}); std::size_t flop = std::size_t(2) * M * N * K; std::size_t num_btype = sizeof(ADataType) * M * K + sizeof(BDataType) * K * M + diff --git a/example/06_conv2d_fwd_bias_relu/conv2d_fwd_xdl_bias_relu.cpp b/example/06_conv2d_fwd_bias_relu/conv2d_fwd_xdl_bias_relu.cpp index 53095bde0d..342de268e3 100644 --- a/example/06_conv2d_fwd_bias_relu/conv2d_fwd_xdl_bias_relu.cpp +++ b/example/06_conv2d_fwd_bias_relu/conv2d_fwd_xdl_bias_relu.cpp @@ -93,7 +93,7 @@ void PrintUseMsg() { std::cout << "arg1: verification (0=no, 1=yes)\n" << "arg2: initialization (0=no init, 1=integer value, 2=decimal value)\n" - << "arg3: run kernel # of times (>1)\n" + << "arg3: time kernel (0=n0, 1=yes)\n" << "Following arguments:\n" << " N, K, C, \n" << " , (ie Y, X for 2D)\n" @@ -165,9 +165,9 @@ int main(int argc, char* argv[]) { using namespace ck::utils::conv; - bool do_verification = 0; - int init_method = 0; - int nrepeat = 5; + bool do_verification = true; + int init_method = 1; + bool time_kernel = false; const int num_dim_spatial = 2; ck::utils::conv::ConvParams params; @@ -176,7 +176,7 @@ int main(int argc, char* argv[]) { do_verification = std::stoi(argv[1]); init_method = std::stoi(argv[2]); - nrepeat = std::stoi(argv[3]); + time_kernel = std::stoi(argv[3]); } if(argc >= 5) @@ -269,7 +269,7 @@ int main(int argc, char* argv[]) "not support this problem"); } - float ave_time = invoker.Run(argument, nrepeat); + float ave_time = invoker.Run(argument, StreamConfig{nullptr, time_kernel}); std::size_t flop = get_flops( params.N_, params.C_, params.K_, params.filter_spatial_lengths_, output_spatial_lengths); diff --git a/example/07_conv2d_fwd_bias_relu_add/conv2d_fwd_xdl_bias_relu_add.cpp b/example/07_conv2d_fwd_bias_relu_add/conv2d_fwd_xdl_bias_relu_add.cpp index c2b4ca0b5d..ff4fc66cb8 100644 --- a/example/07_conv2d_fwd_bias_relu_add/conv2d_fwd_xdl_bias_relu_add.cpp +++ b/example/07_conv2d_fwd_bias_relu_add/conv2d_fwd_xdl_bias_relu_add.cpp @@ -90,7 +90,7 @@ void PrintUseMsg() { std::cout << "arg1: verification (0=no, 1=yes)\n" << "arg2: initialization (0=no init, 1=integer value, 2=decimal value)\n" - << "arg3: run kernel # of times (>1)\n" + << "arg3: time kernel (0=n0, 1=yes)\n" << "Following arguments:\n" << " N, K, C, \n" << " , (ie Y, X for 2D)\n" @@ -162,9 +162,9 @@ int main(int argc, char* argv[]) { using namespace ck::utils::conv; - bool do_verification = 0; - int init_method = 0; - int nrepeat = 5; + bool do_verification = true; + int init_method = 1; + bool time_kernel = false; const int num_dim_spatial = 2; ck::utils::conv::ConvParams params; @@ -173,7 +173,7 @@ int main(int argc, char* argv[]) { do_verification = std::stoi(argv[1]); init_method = std::stoi(argv[2]); - nrepeat = std::stoi(argv[3]); + time_kernel = std::stoi(argv[3]); } if(argc >= 5) @@ -280,7 +280,7 @@ int main(int argc, char* argv[]) "not support this problem"); } - float ave_time = invoker.Run(argument, nrepeat); + float ave_time = invoker.Run(argument, StreamConfig{nullptr, time_kernel}); std::size_t flop = get_flops( params.N_, params.C_, params.K_, params.filter_spatial_lengths_, output_spatial_lengths); diff --git a/example/09_convnd_fwd/convnd_fwd_xdl.cpp b/example/09_convnd_fwd/convnd_fwd_xdl.cpp index 71f49b5e71..112d606f56 100644 --- a/example/09_convnd_fwd/convnd_fwd_xdl.cpp +++ b/example/09_convnd_fwd/convnd_fwd_xdl.cpp @@ -107,7 +107,7 @@ void print_use_msg() { std::cout << "arg1: verification (0=no, 1=yes)\n" << "arg2: initialization (0=no init, 1=integer value, 2=decimal value)\n" - << "arg3: run kernel # of times (>1)\n" + << "arg3: time kernel (0=n0, 1=yes)\n" << "arg4: N spatial dimensions (default 2)\n" << "Following arguments (depending on number of spatial dims):\n" << " N, K, C, \n" @@ -179,9 +179,9 @@ int main(int argc, char* argv[]) { using namespace ck::utils::conv; - bool do_verification = 0; - int init_method = 0; - int nrepeat = 5; + bool do_verification = true; + int init_method = 1; + bool time_kernel = false; int num_dim_spatial = 2; ck::utils::conv::ConvParams params; @@ -190,7 +190,7 @@ int main(int argc, char* argv[]) { do_verification = std::stoi(argv[1]); init_method = std::stoi(argv[2]); - nrepeat = std::stoi(argv[3]); + time_kernel = std::stoi(argv[3]); num_dim_spatial = std::stoi(argv[4]); } @@ -276,7 +276,7 @@ int main(int argc, char* argv[]) "not support this Conv problem"); } - float ave_time = invoker->Run(argument.get(), nrepeat); + float ave_time = invoker->Run(argument.get(), StreamConfig{nullptr, time_kernel}); std::size_t flop = get_flops( params.N_, params.C_, params.K_, params.filter_spatial_lengths_, output_spatial_lengths); diff --git a/example/09_convnd_fwd/convnd_fwd_xdl_fp16.cpp b/example/09_convnd_fwd/convnd_fwd_xdl_fp16.cpp index c1361a8db3..8b658e7790 100644 --- a/example/09_convnd_fwd/convnd_fwd_xdl_fp16.cpp +++ b/example/09_convnd_fwd/convnd_fwd_xdl_fp16.cpp @@ -110,7 +110,7 @@ void print_use_msg() { std::cout << "arg1: verification (0=no, 1=yes)\n" << "arg2: initialization (0=no init, 1=integer value, 2=decimal value)\n" - << "arg3: run kernel # of times (>1)\n" + << "arg3: time kernel (0=n0, 1=yes)\n" << "arg4: N spatial dimensions (default 2)\n" << "Following arguments (depending on number of spatial dims):\n" << " N, K, C, \n" @@ -182,9 +182,9 @@ int main(int argc, char* argv[]) { using namespace ck::utils::conv; - bool do_verification = 0; - int init_method = 0; - int nrepeat = 5; + bool do_verification = true; + int init_method = 1; + bool time_kernel = false; int num_dim_spatial = 2; ck::utils::conv::ConvParams params; @@ -193,7 +193,7 @@ int main(int argc, char* argv[]) { do_verification = std::stoi(argv[1]); init_method = std::stoi(argv[2]); - nrepeat = std::stoi(argv[3]); + time_kernel = std::stoi(argv[3]); num_dim_spatial = std::stoi(argv[4]); } @@ -277,7 +277,7 @@ int main(int argc, char* argv[]) "not support this Conv problem"); } - float ave_time = invoker->Run(argument.get(), nrepeat); + float ave_time = invoker->Run(argument.get(), StreamConfig{nullptr, time_kernel}); std::size_t flop = get_flops( params.N_, params.C_, params.K_, params.filter_spatial_lengths_, output_spatial_lengths); diff --git a/example/09_convnd_fwd/convnd_fwd_xdl_int8.cpp b/example/09_convnd_fwd/convnd_fwd_xdl_int8.cpp index 3d3e34dfd9..e7988d8683 100644 --- a/example/09_convnd_fwd/convnd_fwd_xdl_int8.cpp +++ b/example/09_convnd_fwd/convnd_fwd_xdl_int8.cpp @@ -112,7 +112,7 @@ void print_use_msg() { std::cout << "arg1: verification (0=no, 1=yes)\n" << "arg2: initialization (0=no init, 1=integer value, 2=decimal value)\n" - << "arg3: run kernel # of times (>1)\n" + << "arg3: time kernel (0=n0, 1=yes)\n" << "arg4: N spatial dimensions (default 2)\n" << "Following arguments (depending on number of spatial dims):\n" << " N, K, C, \n" @@ -184,9 +184,9 @@ int main(int argc, char* argv[]) { using namespace ck::utils::conv; - bool do_verification = 0; - int init_method = 0; - int nrepeat = 5; + bool do_verification = true; + int init_method = 1; + bool time_kernel = false; int num_dim_spatial = 2; ck::utils::conv::ConvParams params; @@ -195,7 +195,7 @@ int main(int argc, char* argv[]) { do_verification = std::stoi(argv[1]); init_method = std::stoi(argv[2]); - nrepeat = std::stoi(argv[3]); + time_kernel = std::stoi(argv[3]); num_dim_spatial = std::stoi(argv[4]); } @@ -279,7 +279,7 @@ int main(int argc, char* argv[]) "not support this Conv problem"); } - float ave_time = invoker->Run(argument.get(), nrepeat); + float ave_time = invoker->Run(argument.get(), StreamConfig{nullptr, time_kernel}); std::size_t flop = get_flops( params.N_, params.C_, params.K_, params.filter_spatial_lengths_, output_spatial_lengths); diff --git a/example/10_conv2d_bwd_data/conv2d_bwd_data_xdl.cpp b/example/10_conv2d_bwd_data/conv2d_bwd_data_xdl.cpp index f3f9b497f5..73210fa543 100644 --- a/example/10_conv2d_bwd_data/conv2d_bwd_data_xdl.cpp +++ b/example/10_conv2d_bwd_data/conv2d_bwd_data_xdl.cpp @@ -77,9 +77,9 @@ using ReferenceConvBwdInstance = ck::tensor_operation::host::ReferenceConvBwdDat int main(int argc, char* argv[]) { - bool do_verification = 0; - int init_method = 0; - int nrepeat = 5; + bool do_verification = true; + int init_method = 1; + bool time_kernel = false; // Conv shape ck::index_t N = 128; @@ -102,13 +102,13 @@ int main(int argc, char* argv[]) { do_verification = std::stoi(argv[1]); init_method = std::stoi(argv[2]); - nrepeat = std::stoi(argv[3]); + time_kernel = std::stoi(argv[3]); } else if(argc == 19) { do_verification = std::stoi(argv[1]); init_method = std::stoi(argv[2]); - nrepeat = std::stoi(argv[3]); + time_kernel = std::stoi(argv[3]); N = std::stoi(argv[4]); K = std::stoi(argv[5]); @@ -130,7 +130,7 @@ int main(int argc, char* argv[]) { printf("arg1: verification (0=no, 1=yes)\n"); printf("arg2: initialization (0=no init, 1=integer value, 2=decimal value)\n"); - printf("arg3: run kernel # of times (>1)\n"); + printf("arg3: time kernel (0=n0, 1=yes)\n"); printf("arg4 to 18: N, K, C, Y, X, Hi, Wi, Sy, Sx, Dy, Dx, LeftPy, LeftPx, RightPy, " "RightPx\n"); exit(0); @@ -214,7 +214,7 @@ int main(int argc, char* argv[]) "not support this Conv problem"); } - float ave_time = invoker.Run(argument, nrepeat); + float ave_time = invoker.Run(argument, StreamConfig{nullptr, time_kernel}); std::size_t flop = std::size_t(2) * N * K * Ho * Wo * C * Y * X; diff --git a/example/11_conv2d_bwd_weight/conv2d_bwd_weight_xdl.cpp b/example/11_conv2d_bwd_weight/conv2d_bwd_weight_xdl.cpp index bf78cc87e0..0c996dc21b 100644 --- a/example/11_conv2d_bwd_weight/conv2d_bwd_weight_xdl.cpp +++ b/example/11_conv2d_bwd_weight/conv2d_bwd_weight_xdl.cpp @@ -82,9 +82,9 @@ using ReferenceConvBwdWeightInstance = int main(int argc, char* argv[]) { - bool do_verification = 0; - int init_method = 0; - int nrepeat = 5; + bool do_verification = true; + int init_method = 1; + bool time_kernel = false; int do_log = 0; int split_k = 4; @@ -109,7 +109,7 @@ int main(int argc, char* argv[]) { do_verification = std::stoi(argv[1]); init_method = std::stoi(argv[2]); - nrepeat = std::stoi(argv[3]); + time_kernel = std::stoi(argv[3]); do_log = std::stoi(argv[4]); split_k = std::stoi(argv[5]); } @@ -117,7 +117,7 @@ int main(int argc, char* argv[]) { do_verification = std::stoi(argv[1]); init_method = std::stoi(argv[2]); - nrepeat = std::stoi(argv[3]); + time_kernel = std::stoi(argv[3]); do_log = std::stoi(argv[4]); split_k = std::stoi(argv[5]); @@ -141,7 +141,7 @@ int main(int argc, char* argv[]) { printf("arg1: verification (0=no, 1=yes)\n"); printf("arg2: initialization (0=no init, 1=integer value, 2=decimal value)\n"); - printf("arg3: run kernel # of times (>1)\n"); + printf("arg3: time kernel (0=n0, 1=yes)\n"); printf("arg4: is show log (0=no, 1=yes)\n"); printf("arg5: split-k \n"); printf("arg6 to 19: N, K, C, Y, X, Hi, Wi, Sy, Sx, Dy, Dx, LeftPy, LeftPx, RightPy, " @@ -246,7 +246,7 @@ int main(int argc, char* argv[]) return 1; } - float ave_time = invoker.Run(argument, nrepeat); + float ave_time = invoker.Run(argument, StreamConfig{nullptr, time_kernel}); std::size_t flop = std::size_t(2) * N * K * Ho * Wo * C * Y * X; diff --git a/example/12_reduce/reduce_blockwise.cpp b/example/12_reduce/reduce_blockwise.cpp index 7ca9823ff5..caa93c9df2 100644 --- a/example/12_reduce/reduce_blockwise.cpp +++ b/example/12_reduce/reduce_blockwise.cpp @@ -116,10 +116,9 @@ class SimpleAppArgs std::vector inLengths; std::vector scales; - bool do_verification = false; - - int init_method = 1; - int nrepeat = 5; + bool do_verification = true; + int init_method = 1; + bool time_kernel = false; public: void show_usage(const char* cmd) @@ -135,7 +134,7 @@ class SimpleAppArgs std::cout << "Arg1 -- init method (0=no init, 1=single integer value, 2=scope integer " "value, 3=decimal value)" << std::endl; - std::cout << "Arg2 -- number of repeats to run the kernel" << std::endl; + std::cout << "Arg2 -- time kernel (0=n0, 1=yes)" << std::endl; }; int processArgs(int argc, char* argv[]) @@ -182,7 +181,7 @@ class SimpleAppArgs throw std::runtime_error("Invalid cmd-line arguments, more argumetns are needed!"); init_method = std::atoi(argv[optind++]); - nrepeat = std::atoi(argv[optind]); + time_kernel = std::atoi(argv[optind]); if(scales.empty()) { @@ -352,7 +351,7 @@ int main(int argc, char* argv[]) auto invoker_ptr = reduce.MakeInvokerPointer(); - float avg_time = invoker_ptr->Run(argument_ptr.get(), args.nrepeat); + float avg_time = invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, args.time_kernel}); std::size_t num_bytes = invariant_total_length * reduce_total_length * sizeof(InDataType) + invariant_total_length * sizeof(OutDataType); diff --git a/example/13_pool2d_fwd/pool2d_fwd.cpp b/example/13_pool2d_fwd/pool2d_fwd.cpp index a18761095c..f4eb9d79f6 100644 --- a/example/13_pool2d_fwd/pool2d_fwd.cpp +++ b/example/13_pool2d_fwd/pool2d_fwd.cpp @@ -149,9 +149,9 @@ int main(int argc, char* argv[]) { using namespace ck::host_reduce; - bool do_verification = 0; - int init_method = 0; - int nrepeat = 5; + bool do_verification = true; + int init_method = 1; + bool time_kernel = false; // Pool shape ck::index_t N = 128; @@ -171,13 +171,13 @@ int main(int argc, char* argv[]) { do_verification = std::stoi(argv[1]); init_method = std::stoi(argv[2]); - nrepeat = std::stoi(argv[3]); + time_kernel = std::stoi(argv[3]); } else if(argc == 16) { do_verification = std::stoi(argv[1]); init_method = std::stoi(argv[2]); - nrepeat = std::stoi(argv[3]); + time_kernel = std::stoi(argv[3]); N = std::stoi(argv[4]); C = std::stoi(argv[5]); @@ -196,7 +196,7 @@ int main(int argc, char* argv[]) { printf("arg1: verification (0=no, 1=yes)\n"); printf("arg2: initialization (0=no init, 1=integer value, 2=decimal value)\n"); - printf("arg3: run kernel # of times (>1)\n"); + printf("arg3: time kernel (0=n0, 1=yes)\n"); printf("arg4 to 15: N, C, Y, X, Hi, Wi, Sy, Sx, LeftPy, LeftPx, RightPy, " "RightPx\n"); exit(0); @@ -271,7 +271,7 @@ int main(int argc, char* argv[]) "not support this problem"); } - float ave_time = invoker_ptr->Run(argument_ptr.get(), nrepeat); + float ave_time = invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, time_kernel}); std::size_t flop = std::size_t(2) * N * C * Ho * Wo * Y * X; diff --git a/example/14_gemm_xdl_requant_relu_requant/gemm_xdl_requant_relu_requant_int8.cpp b/example/14_gemm_xdl_requant_relu_requant/gemm_xdl_requant_relu_requant_int8.cpp index 324dc35d3f..9fc63308b7 100644 --- a/example/14_gemm_xdl_requant_relu_requant/gemm_xdl_requant_relu_requant_int8.cpp +++ b/example/14_gemm_xdl_requant_relu_requant/gemm_xdl_requant_relu_requant_int8.cpp @@ -105,9 +105,9 @@ using ReferenceGemmInstance = ck::tensor_operation::host:: int main(int argc, char* argv[]) { - bool do_verification = 0; - int init_method = 0; - int nrepeat = 5; + bool do_verification = true; + int init_method = 1; + bool time_kernel = false; // GEMM shape ck::index_t M = 3840; @@ -125,13 +125,13 @@ int main(int argc, char* argv[]) { do_verification = std::stoi(argv[1]); init_method = std::stoi(argv[2]); - nrepeat = std::stoi(argv[3]); + time_kernel = std::stoi(argv[3]); } else if(argc == 10) { do_verification = std::stoi(argv[1]); init_method = std::stoi(argv[2]); - nrepeat = std::stoi(argv[3]); + time_kernel = std::stoi(argv[3]); M = std::stoi(argv[4]); N = std::stoi(argv[5]); @@ -145,7 +145,7 @@ int main(int argc, char* argv[]) { printf("arg1: verification (0=no, 1=yes)\n"); printf("arg2: initialization (0=no init, 1=integer value, 2=decimal value)\n"); - printf("arg3: run kernel # of times (>1)\n"); + printf("arg3: time kernel (0=n0, 1=yes)\n"); printf("arg4 to 9: M (256x), N(128x), K(32x), StrideA, StrideB, StrideC\n"); exit(0); } @@ -219,7 +219,7 @@ int main(int argc, char* argv[]) "not support this GEMM problem"); } - float ave_time = invoker.Run(argument, nrepeat); + float ave_time = invoker.Run(argument, StreamConfig{nullptr, time_kernel}); std::size_t flop = std::size_t(2) * M * N * K; std::size_t num_btype = diff --git a/example/15_grouped_gemm/grouped_gemm_xdl_fp16.cpp b/example/15_grouped_gemm/grouped_gemm_xdl_fp16.cpp index 29ef01f2ef..f55db1d45c 100644 --- a/example/15_grouped_gemm/grouped_gemm_xdl_fp16.cpp +++ b/example/15_grouped_gemm/grouped_gemm_xdl_fp16.cpp @@ -60,21 +60,21 @@ using ReferenceGemmInstance = ck::tensor_operation::host:: int main(int argc, char* argv[]) { - bool do_verification = 0; - int init_method = 0; - int nrepeat = 5; + bool do_verification = true; + int init_method = 1; + bool time_kernel = false; if(argc == 4) { do_verification = std::stoi(argv[1]); init_method = std::stoi(argv[2]); - nrepeat = std::stoi(argv[3]); + time_kernel = std::stoi(argv[3]); } else { printf("arg1: verification (0=no, 1=yes)\n"); printf("arg2: initialization (0=no init, 1=integer value, 2=decimal value)\n"); - printf("arg3: run kernel # of times (>1)\n"); + printf("arg3: time kernel (0=n0, 1=yes)\n"); exit(0); } @@ -202,7 +202,7 @@ int main(int argc, char* argv[]) "not support this GEMM problem"); } - float ave_time = invoker.Run(argument, nrepeat); + float ave_time = invoker.Run(argument, StreamConfig{nullptr, time_kernel}); float tflops = static_cast(flop) / 1.E9 / ave_time; diff --git a/example/16_gemm_reduce/gemm_reduce_xdl_fp16.cpp b/example/16_gemm_reduce/gemm_reduce_xdl_fp16.cpp index 90064ae584..8fea54f635 100644 --- a/example/16_gemm_reduce/gemm_reduce_xdl_fp16.cpp +++ b/example/16_gemm_reduce/gemm_reduce_xdl_fp16.cpp @@ -58,9 +58,9 @@ using ReferenceGemmInstance = ck::tensor_operation::host:: int main(int argc, char* argv[]) { - bool do_verification = 1; + bool do_verification = true; int init_method = 1; - int nrepeat = 5; + bool time_kernel = false; // GEMM shape ck::index_t M = 3840; @@ -79,13 +79,13 @@ int main(int argc, char* argv[]) { do_verification = std::stoi(argv[1]); init_method = std::stoi(argv[2]); - nrepeat = std::stoi(argv[3]); + time_kernel = std::stoi(argv[3]); } else if(argc == 10) { do_verification = std::stoi(argv[1]); init_method = std::stoi(argv[2]); - nrepeat = std::stoi(argv[3]); + time_kernel = std::stoi(argv[3]); M = std::stoi(argv[4]); N = std::stoi(argv[5]); @@ -99,7 +99,7 @@ int main(int argc, char* argv[]) { printf("arg1: verification (0=no, 1=yes)\n"); printf("arg2: initialization (0=no init, 1=integer value, 2=decimal value)\n"); - printf("arg3: run kernel # of times (>1)\n"); + printf("arg3: time kernel (0=n0, 1=yes)\n"); printf("arg4 to 9: M (256x), N(128x), K(32x), StrideA, StrideB, StrideC\n"); exit(0); } @@ -192,30 +192,13 @@ int main(int argc, char* argv[]) "not support this GEMM problem"); } - // warm up - invoker.Run(argument); + // init DO, D1 to 0 + d0_device_buf.SetZero(); + d1_device_buf.SetZero(); - // timing - float total_time = 0; - - for(int i = 0; i < nrepeat; ++i) - { - // init DO, D1 to 0 - d0_device_buf.SetZero(); - d1_device_buf.SetZero(); - - KernelTimer timer; - - timer.Start(); - - invoker.Run(argument); - - timer.End(); - - total_time += timer.GetElapsedTime(); - } - - float ave_time = total_time / nrepeat; + // if time_kernel == true, kernel will run multiple times. This kernel use atomic-add so result + // will not be correct. need to set time_kernel = false for correctness test + float ave_time = invoker.Run(argument, StreamConfig{nullptr, time_kernel}); std::size_t flop = std::size_t(2) * M * N * K; std::size_t num_btype = diff --git a/example/17_convnd_bwd_data_xdl/convnd_bwd_data_xdl.cpp b/example/17_convnd_bwd_data_xdl/convnd_bwd_data_xdl.cpp index 1b375ea339..a013f39827 100644 --- a/example/17_convnd_bwd_data_xdl/convnd_bwd_data_xdl.cpp +++ b/example/17_convnd_bwd_data_xdl/convnd_bwd_data_xdl.cpp @@ -87,7 +87,7 @@ void print_use_msg() { std::cout << "arg1: verification (0=no, 1=yes)\n" << "arg2: initialization (0=no init, 1=random value, 2= init to 1 )\n" - << "arg3: run kernel # of times (>1)\n" + << "arg3: time kernel (0=n0, 1=yes)\n" << "arg4: N spatial dimensions (default 2)\n" << "Following arguments (depending on number of spatial dims):\n" << " N, K, C, \n" @@ -165,9 +165,9 @@ DeviceConvBwdDataBasePtr get_conv_instance(int num_dim_spatial) int main(int argc, char* argv[]) { - bool do_verification = 0; - int init_method = 0; - int nrepeat = 5; + bool do_verification = true; + int init_method = 1; + bool time_kernel = false; int num_dim_spatial = 2; ck::utils::conv::ConvParams params; @@ -177,13 +177,13 @@ int main(int argc, char* argv[]) { do_verification = std::stoi(argv[1]); init_method = std::stoi(argv[2]); - nrepeat = std::stoi(argv[3]); + time_kernel = std::stoi(argv[3]); } else if(argc > 4) { do_verification = std::stoi(argv[1]); init_method = std::stoi(argv[2]); - nrepeat = std::stoi(argv[3]); + time_kernel = std::stoi(argv[3]); num_dim_spatial = std::stoi(argv[4]); // check args number int conv_args = 3 + num_dim_spatial * 6; @@ -284,7 +284,7 @@ int main(int argc, char* argv[]) "not support this Conv problem"); } - float ave_time = invoker->Run(argument.get(), nrepeat); + float ave_time = invoker->Run(argument.get(), StreamConfig{nullptr, time_kernel}); std::size_t flop = ck::utils::conv::get_flops( params.N_, params.C_, params.K_, params.filter_spatial_lengths_, output_spatial_lengths); diff --git a/example/18_batched_gemm_reduce/batched_gemm_reduce_xdl_fp16.cpp b/example/18_batched_gemm_reduce/batched_gemm_reduce_xdl_fp16.cpp index eb18655d1b..f620ee1b20 100644 --- a/example/18_batched_gemm_reduce/batched_gemm_reduce_xdl_fp16.cpp +++ b/example/18_batched_gemm_reduce/batched_gemm_reduce_xdl_fp16.cpp @@ -57,9 +57,9 @@ using ReferenceBatchedGemmInstance = ck::tensor_operation::host:: int main(int argc, char* argv[]) { - bool do_verification = 1; + bool do_verification = true; int init_method = 1; - int nrepeat = 5; + bool time_kernel = false; // GEMM shape ck::index_t M = 3840; @@ -80,13 +80,13 @@ int main(int argc, char* argv[]) { do_verification = std::stoi(argv[1]); init_method = std::stoi(argv[2]); - nrepeat = std::stoi(argv[3]); + time_kernel = std::stoi(argv[3]); } else if(argc == 11) { do_verification = std::stoi(argv[1]); init_method = std::stoi(argv[2]); - nrepeat = std::stoi(argv[3]); + time_kernel = std::stoi(argv[3]); M = std::stoi(argv[4]); N = std::stoi(argv[5]); @@ -102,7 +102,7 @@ int main(int argc, char* argv[]) { printf("arg1: verification (0=no, 1=yes)\n"); printf("arg2: initialization (0=no init, 1=integer value, 2=decimal value)\n"); - printf("arg3: run kernel # of times (>1)\n"); + printf("arg3: time kernel (0=n0, 1=yes)\n"); printf("arg4 to 10: M (256x), N(128x), K(32x), StrideA, StrideB, StrideC, BatchCount\n"); exit(0); } @@ -204,30 +204,13 @@ int main(int argc, char* argv[]) "not support this GEMM problem"); } - // warm up - invoker.Run(argument); + // init DO, D1 to 0 + d0_device_buf.SetZero(); + d1_device_buf.SetZero(); - // timing - float total_time = 0; - - for(int i = 0; i < nrepeat; ++i) - { - // init DO, D1 to 0 - d0_device_buf.SetZero(); - d1_device_buf.SetZero(); - - KernelTimer timer; - - timer.Start(); - - invoker.Run(argument); - - timer.End(); - - total_time += timer.GetElapsedTime(); - } - - float ave_time = total_time / nrepeat; + // if time_kernel == true, kernel will run multiple times. This kernel use atomic-add so result + // will not be correct. need to set time_kernel = false for correctness test + float ave_time = invoker.Run(argument, StreamConfig{nullptr, time_kernel}); std::size_t flop = std::size_t(2) * BatchCount * M * N * K; std::size_t num_btype = sizeof(ADataType) * BatchCount * M * K + diff --git a/include/ck/hip_version.hpp.in b/include/ck/hip_version.hpp.in deleted file mode 100644 index 4290ef7e0d..0000000000 --- a/include/ck/hip_version.hpp.in +++ /dev/null @@ -1,28 +0,0 @@ -#pragma once - -// "_PACKAGE_" to avoid name contentions: the macros like -// HIP_VERSION_MAJOR are defined in HIP_VERSION.h. -// clang-format off -#define CK_HIP_PACKAGE_VERSION_MAJOR @CK_HIP_VERSION_MAJOR@ -#define CK_HIP_PACKAGE_VERSION_MINOR @CK_HIP_VERSION_MINOR@ -#define CK_HIP_PACKAGE_VERSION_PATCH @CK_HIP_VERSION_PATCH@ -// clang-format on - -#ifndef CK_HIP_PACKAGE_VERSION_MAJOR -#define CK_HIP_PACKAGE_VERSION_MAJOR 0 -#endif -#ifndef CK_HIP_PACKAGE_VERSION_MINOR -#define CK_HIP_PACKAGE_VERSION_MINOR 0 -#endif -#ifndef CK_HIP_PACKAGE_VERSION_PATCH -#define CK_HIP_PACKAGE_VERSION_PATCH 0 -#endif -// 3 decimal digits for major and minor, 6 digits for patch number. -// Max number is 999,999,999999 == 0xE8,D4A5,0FFF that fits into 64-bit math. -#if CK_HIP_PACKAGE_VERSION_MAJOR > 999 || CK_HIP_PACKAGE_VERSION_MAJOR > 999 || \ - CK_HIP_PACKAGE_VERSION_PATCH > 999999 -#error "Too big HIP version number(s)" -#endif -#define CK_HIP_PACKAGE_VERSION_FLAT \ - ((CK_HIP_PACKAGE_VERSION_MAJOR * 1000ULL + CK_HIP_PACKAGE_VERSION_MINOR) * 1000000 + \ - CK_HIP_PACKAGE_VERSION_PATCH) diff --git a/include/ck/options.hpp.in b/include/ck/options.hpp.in new file mode 100644 index 0000000000..87ed6026a4 --- /dev/null +++ b/include/ck/options.hpp.in @@ -0,0 +1,3 @@ +#pragma once + +#cmakedefine01 CK_TIME_KERNEL diff --git a/include/ck/stream_config.hpp b/include/ck/stream_config.hpp new file mode 100644 index 0000000000..3e80b4c892 --- /dev/null +++ b/include/ck/stream_config.hpp @@ -0,0 +1,10 @@ +#pragma once + +#include +#include + +struct StreamConfig +{ + hipStream_t stream_id_ = nullptr; + bool time_kernel_ = false; +}; diff --git a/include/ck/tensor_operation/gpu/device/device_base.hpp b/include/ck/tensor_operation/gpu/device/device_base.hpp index cf48695ad0..950cfc1d61 100644 --- a/include/ck/tensor_operation/gpu/device/device_base.hpp +++ b/include/ck/tensor_operation/gpu/device/device_base.hpp @@ -1,8 +1,9 @@ -#ifndef DEVICE_BASE_HPP -#define DEVICE_BASE_HPP +#pragma once #include +#include "stream_config.hpp" + namespace ck { namespace tensor_operation { namespace device { @@ -22,7 +23,10 @@ struct BaseInvoker BaseInvoker(const BaseInvoker&) = default; BaseInvoker& operator=(const BaseInvoker&) = default; - virtual float Run(const BaseArgument*, int = 1) = 0; + virtual float Run(const BaseArgument*, const StreamConfig& = StreamConfig{}) + { + return float{0}; + } virtual ~BaseInvoker() {} }; @@ -33,8 +37,8 @@ struct BaseOperator BaseOperator(const BaseOperator&) = default; BaseOperator& operator=(const BaseOperator&) = default; - virtual bool IsSupportedArgument(const BaseArgument*) = 0; - virtual std::string GetTypeString() const = 0; + virtual bool IsSupportedArgument(const BaseArgument*) { return false; } + virtual std::string GetTypeString() const { return ""; } virtual ~BaseOperator() {} }; @@ -42,4 +46,3 @@ struct BaseOperator } // namespace device } // namespace tensor_operation } // namespace ck -#endif diff --git a/include/ck/tensor_operation/gpu/device/device_batched_gemm_reduce_xdl_cshuffle.hpp b/include/ck/tensor_operation/gpu/device/device_batched_gemm_reduce_xdl_cshuffle.hpp index e1d354b344..a6408007ed 100644 --- a/include/ck/tensor_operation/gpu/device/device_batched_gemm_reduce_xdl_cshuffle.hpp +++ b/include/ck/tensor_operation/gpu/device/device_batched_gemm_reduce_xdl_cshuffle.hpp @@ -693,7 +693,7 @@ struct DeviceBatchedGemmReduce_Xdl_CShuffle : public DeviceGemmReduce, true>; - launch_kernel(kernel, - dim3(grid_size), - dim3(BlockSize), - 0, - arg.p_a_grid_, - arg.p_b_grid_, - arg.p_c_grid_, - arg.p_d0_grid_, - arg.p_d1_grid_, - arg.BatchCount_, - arg.a_element_op_, - arg.b_element_op_, - arg.c_element_op_, - arg.d1_element_op_, - arg.a_grid_desc_ak0_m_ak1_, - arg.b_grid_desc_bk0_n_bk1_, - arg.c_grid_desc_mblock_mperblock_nblock_nperblock_, - arg.d_grid_desc_mblock_mperblock_, - arg.compute_base_ptr_of_batch_, - arg.block_2_ctile_map_); + elapsed_time = + launch_and_time_kernel(stream_config, + kernel, + dim3(grid_size), + dim3(BlockSize), + 0, + arg.p_a_grid_, + arg.p_b_grid_, + arg.p_c_grid_, + arg.p_d0_grid_, + arg.p_d1_grid_, + arg.BatchCount_, + arg.a_element_op_, + arg.b_element_op_, + arg.c_element_op_, + arg.d1_element_op_, + arg.a_grid_desc_ak0_m_ak1_, + arg.b_grid_desc_bk0_n_bk1_, + arg.c_grid_desc_mblock_mperblock_nblock_nperblock_, + arg.d_grid_desc_mblock_mperblock_, + arg.compute_base_ptr_of_batch_, + arg.block_2_ctile_map_); } else { @@ -788,35 +791,38 @@ struct DeviceBatchedGemmReduce_Xdl_CShuffle : public DeviceGemmReduce, false>; - launch_kernel(kernel, - dim3(grid_size), - dim3(BlockSize), - 0, - arg.p_a_grid_, - arg.p_b_grid_, - arg.p_c_grid_, - arg.p_d0_grid_, - arg.p_d1_grid_, - arg.BatchCount_, - arg.a_element_op_, - arg.b_element_op_, - arg.c_element_op_, - arg.d1_element_op_, - arg.a_grid_desc_ak0_m_ak1_, - arg.b_grid_desc_bk0_n_bk1_, - arg.c_grid_desc_mblock_mperblock_nblock_nperblock_, - arg.d_grid_desc_mblock_mperblock_, - arg.compute_base_ptr_of_batch_, - arg.block_2_ctile_map_); + elapsed_time = + launch_and_time_kernel(stream_config, + kernel, + dim3(grid_size), + dim3(BlockSize), + 0, + arg.p_a_grid_, + arg.p_b_grid_, + arg.p_c_grid_, + arg.p_d0_grid_, + arg.p_d1_grid_, + arg.BatchCount_, + arg.a_element_op_, + arg.b_element_op_, + arg.c_element_op_, + arg.d1_element_op_, + arg.a_grid_desc_ak0_m_ak1_, + arg.b_grid_desc_bk0_n_bk1_, + arg.c_grid_desc_mblock_mperblock_nblock_nperblock_, + arg.d_grid_desc_mblock_mperblock_, + arg.compute_base_ptr_of_batch_, + arg.block_2_ctile_map_); } - return 0; + return elapsed_time; } // polymorphic - float Run(const BaseArgument* p_arg, int nrepeat = 1) override + float Run(const BaseArgument* p_arg, + const StreamConfig& stream_config = StreamConfig{}) override { - return Run(*dynamic_cast(p_arg), nrepeat); + return Run(*dynamic_cast(p_arg), stream_config); } }; diff --git a/include/ck/tensor_operation/gpu/device/device_batched_gemm_xdl.hpp b/include/ck/tensor_operation/gpu/device/device_batched_gemm_xdl.hpp index 88974a5221..ea7704951e 100644 --- a/include/ck/tensor_operation/gpu/device/device_batched_gemm_xdl.hpp +++ b/include/ck/tensor_operation/gpu/device/device_batched_gemm_xdl.hpp @@ -428,7 +428,7 @@ struct DeviceBatchedGemmXdl { using Argument = DeviceBatchedGemmXdl::Argument; - float Run(const Argument& arg, int nrepeat = 1) + float Run(const Argument& arg, const StreamConfig& stream_config = StreamConfig{}) { { std::cout << "arg.a_grid_desc_k0_m_k1_{" << arg.a_grid_desc_k0_m_k1_.GetLength(I0) @@ -477,8 +477,8 @@ struct DeviceBatchedGemmXdl remove_reference_t, true>; - ave_time = launch_and_time_kernel(kernel, - nrepeat, + ave_time = launch_and_time_kernel(stream_config, + kernel, dim3(grid_size), dim3(BlockSize), 0, @@ -511,8 +511,8 @@ struct DeviceBatchedGemmXdl remove_reference_t, false>; - ave_time = launch_and_time_kernel(kernel, - nrepeat, + ave_time = launch_and_time_kernel(stream_config, + kernel, dim3(grid_size), dim3(BlockSize), 0, @@ -534,9 +534,10 @@ struct DeviceBatchedGemmXdl } // polymorphic - float Run(const BaseArgument* p_arg, int nrepeat = 1) override + float Run(const BaseArgument* p_arg, + const StreamConfig& stream_config = StreamConfig{}) override { - return Run(*dynamic_cast(p_arg), nrepeat); + return Run(*dynamic_cast(p_arg), stream_config); } }; diff --git a/include/ck/tensor_operation/gpu/device/device_conv2d_backward_weight_xdl_c_shuffle_nhwc_kyxc_nhwk.hpp b/include/ck/tensor_operation/gpu/device/device_conv2d_backward_weight_xdl_c_shuffle_nhwc_kyxc_nhwk.hpp index 466e6ad89f..c36227083c 100644 --- a/include/ck/tensor_operation/gpu/device/device_conv2d_backward_weight_xdl_c_shuffle_nhwc_kyxc_nhwk.hpp +++ b/include/ck/tensor_operation/gpu/device/device_conv2d_backward_weight_xdl_c_shuffle_nhwc_kyxc_nhwk.hpp @@ -415,9 +415,10 @@ struct DeviceConv2dBwdWeightXdl_C_Shuffle_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_ << arg.c_grid_desc_m_n_.GetLength(I1) << "}" << std::endl; } - float Run(const Argument& arg, int nrepeat = 1) + float Run(const Argument& arg, const StreamConfig& stream_config = StreamConfig{}) { ShowInfo(arg); + if(!GridwiseGemm::CheckValidity(arg.a_grid_desc_kbatch_k0_m_k1_, arg.b_grid_desc_kbatch_k0_n_k1_, arg.c_grid_desc_m_n_, @@ -437,49 +438,27 @@ struct DeviceConv2dBwdWeightXdl_C_Shuffle_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_ float ave_time = 0; const auto Run = [&](const auto& kernel) { - if(nrepeat > 0) - { - ave_time = - launch_and_time_kernel(kernel, - nrepeat, - dim3(grid_size), - dim3(BlockSize), - 0, - arg.p_a_grid_, - arg.p_b_grid_, - arg.p_c_grid_, - arg.a_grid_desc_kbatch_k0_m_k1_, - arg.b_grid_desc_kbatch_k0_n_k1_, - arg.c_grid_desc_mblock_mperblock_nblock_nperblock_, - arg.a_element_op_, - arg.b_element_op_, - arg.c_element_op_, - arg.block_2_ctile_map_); - } + hipGetErrorString(hipMemset( + arg.p_c_grid_, + 0, + arg.c_grid_desc_mblock_mperblock_nblock_nperblock_.GetElementSpaceSize() * + sizeof(CDataType))); - if(kbatch > 1 || nrepeat <= 0) - { - hipGetErrorString(hipMemset( - arg.p_c_grid_, - 0, - arg.c_grid_desc_mblock_mperblock_nblock_nperblock_.GetElementSpaceSize() * - sizeof(CDataType))); - - launch_kernel(kernel, - dim3(grid_size), - dim3(BlockSize), - 0, - arg.p_a_grid_, - arg.p_b_grid_, - arg.p_c_grid_, - arg.a_grid_desc_kbatch_k0_m_k1_, - arg.b_grid_desc_kbatch_k0_n_k1_, - arg.c_grid_desc_mblock_mperblock_nblock_nperblock_, - arg.a_element_op_, - arg.b_element_op_, - arg.c_element_op_, - arg.block_2_ctile_map_); - } + launch_and_time_kernel(stream_config, + kernel, + dim3(grid_size), + dim3(BlockSize), + 0, + arg.p_a_grid_, + arg.p_b_grid_, + arg.p_c_grid_, + arg.a_grid_desc_kbatch_k0_m_k1_, + arg.b_grid_desc_kbatch_k0_n_k1_, + arg.c_grid_desc_mblock_mperblock_nblock_nperblock_, + arg.a_element_op_, + arg.b_element_op_, + arg.c_element_op_, + arg.block_2_ctile_map_); }; if(has_main_k0_block_loop) @@ -560,9 +539,10 @@ struct DeviceConv2dBwdWeightXdl_C_Shuffle_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_ return ave_time; } - float Run(const BaseArgument* p_arg, int nrepeat = 1) override + float Run(const BaseArgument* p_arg, + const StreamConfig& stream_config = StreamConfig{}) override { - return Run(*dynamic_cast(p_arg), nrepeat); + return Run(*dynamic_cast(p_arg), stream_config); } }; diff --git a/include/ck/tensor_operation/gpu/device/device_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk.hpp b/include/ck/tensor_operation/gpu/device/device_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk.hpp index fad4ec1ffa..def6af74ac 100644 --- a/include/ck/tensor_operation/gpu/device/device_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk.hpp +++ b/include/ck/tensor_operation/gpu/device/device_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk.hpp @@ -531,7 +531,7 @@ struct DeviceConv2dBwdDataXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K { using Argument = DeviceOp::Argument; - float Run(const Argument& arg, int nrepeat = 1) + float Run(const Argument& arg, const StreamConfig& stream_config = StreamConfig{}) { float ave_time = 0; for(size_t i = 0; i < arg.a_grid_desc_k0_m_k1_container_.size(); i++) @@ -602,8 +602,8 @@ struct DeviceConv2dBwdDataXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K true>; ave_time += launch_and_time_kernel( + stream_config, kernel, - nrepeat, dim3(grid_size), dim3(BlockSize), 0, @@ -635,8 +635,8 @@ struct DeviceConv2dBwdDataXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K false>; ave_time += launch_and_time_kernel( + stream_config, kernel, - nrepeat, dim3(grid_size), dim3(BlockSize), 0, @@ -655,9 +655,10 @@ struct DeviceConv2dBwdDataXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K return ave_time; } - float Run(const BaseArgument* p_arg, int nrepeat = 1) override + float Run(const BaseArgument* p_arg, + const StreamConfig& stream_config = StreamConfig{}) override { - return Run(*dynamic_cast(p_arg), nrepeat); + return Run(*dynamic_cast(p_arg), stream_config); } }; diff --git a/include/ck/tensor_operation/gpu/device/device_conv2d_fwd_xdl_c_shuffle_bias_activation_add_nhwc_kyxc_nhwk.hpp b/include/ck/tensor_operation/gpu/device/device_conv2d_fwd_xdl_c_shuffle_bias_activation_add_nhwc_kyxc_nhwk.hpp index 6648929cd5..fd95c184ca 100644 --- a/include/ck/tensor_operation/gpu/device/device_conv2d_fwd_xdl_c_shuffle_bias_activation_add_nhwc_kyxc_nhwk.hpp +++ b/include/ck/tensor_operation/gpu/device/device_conv2d_fwd_xdl_c_shuffle_bias_activation_add_nhwc_kyxc_nhwk.hpp @@ -642,7 +642,7 @@ struct { using Argument = DeviceOp::Argument; - float Run(const Argument& arg, int nrepeat = 1) + float Run(const Argument& arg, const StreamConfig& stream_config = StreamConfig{}) { #if 0 { @@ -727,8 +727,8 @@ struct true>; ave_time = launch_and_time_kernel( + stream_config, kernel, - nrepeat, dim3(grid_size), dim3(BlockSize), 0, @@ -771,8 +771,8 @@ struct false>; ave_time = launch_and_time_kernel( + stream_config, kernel, - nrepeat, dim3(grid_size), dim3(BlockSize), 0, @@ -795,9 +795,10 @@ struct return ave_time; } - float Run(const BaseArgument* p_arg, int nrepeat = 1) override + float Run(const BaseArgument* p_arg, + const StreamConfig& stream_config = StreamConfig{}) override { - return Run(*dynamic_cast(p_arg), nrepeat); + return Run(*dynamic_cast(p_arg), stream_config); } }; diff --git a/include/ck/tensor_operation/gpu/device/device_conv2d_fwd_xdl_c_shuffle_bias_activation_nhwc_kyxc_nhwk.hpp b/include/ck/tensor_operation/gpu/device/device_conv2d_fwd_xdl_c_shuffle_bias_activation_nhwc_kyxc_nhwk.hpp index fd0941420c..61c91c0b76 100644 --- a/include/ck/tensor_operation/gpu/device/device_conv2d_fwd_xdl_c_shuffle_bias_activation_nhwc_kyxc_nhwk.hpp +++ b/include/ck/tensor_operation/gpu/device/device_conv2d_fwd_xdl_c_shuffle_bias_activation_nhwc_kyxc_nhwk.hpp @@ -605,7 +605,7 @@ struct DeviceConv2dFwdXdl_C_Shuffle_Bias_Activation_Input_N_Hi_Wi_C_Weight_K_Y_X { using Argument = DeviceOp::Argument; - float Run(const Argument& arg, int nrepeat = 1) + float Run(const Argument& arg, const StreamConfig& stream_config = StreamConfig{}) { #if 0 { @@ -684,8 +684,8 @@ struct DeviceConv2dFwdXdl_C_Shuffle_Bias_Activation_Input_N_Hi_Wi_C_Weight_K_Y_X true>; ave_time = launch_and_time_kernel( + stream_config, kernel, - nrepeat, dim3(grid_size), dim3(BlockSize), 0, @@ -723,8 +723,8 @@ struct DeviceConv2dFwdXdl_C_Shuffle_Bias_Activation_Input_N_Hi_Wi_C_Weight_K_Y_X false>; ave_time = launch_and_time_kernel( + stream_config, kernel, - nrepeat, dim3(grid_size), dim3(BlockSize), 0, @@ -745,9 +745,10 @@ struct DeviceConv2dFwdXdl_C_Shuffle_Bias_Activation_Input_N_Hi_Wi_C_Weight_K_Y_X return ave_time; } - float Run(const BaseArgument* p_arg, int nrepeat = 1) override + float Run(const BaseArgument* p_arg, + const StreamConfig& stream_config = StreamConfig{}) override { - return Run(*dynamic_cast(p_arg), nrepeat); + return Run(*dynamic_cast(p_arg), stream_config); } }; diff --git a/include/ck/tensor_operation/gpu/device/device_conv2d_fwd_xdl_c_shuffle_nhwc_kyxc_nhwk.hpp b/include/ck/tensor_operation/gpu/device/device_conv2d_fwd_xdl_c_shuffle_nhwc_kyxc_nhwk.hpp index b508606a75..f4cddc1946 100644 --- a/include/ck/tensor_operation/gpu/device/device_conv2d_fwd_xdl_c_shuffle_nhwc_kyxc_nhwk.hpp +++ b/include/ck/tensor_operation/gpu/device/device_conv2d_fwd_xdl_c_shuffle_nhwc_kyxc_nhwk.hpp @@ -568,7 +568,7 @@ struct DeviceConv2dFwdXdl_C_Shuffle_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_W { using Argument = DeviceOp::Argument; - float Run(const Argument& arg, int nrepeat = 1) + float Run(const Argument& arg, const StreamConfig& stream_config = StreamConfig{}) { #if 0 { @@ -663,8 +663,8 @@ struct DeviceConv2dFwdXdl_C_Shuffle_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_W true>; ave_time = launch_and_time_kernel( + stream_config, kernel, - nrepeat, dim3(grid_size), dim3(BlockSize), 0, @@ -697,8 +697,8 @@ struct DeviceConv2dFwdXdl_C_Shuffle_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_W false>; ave_time = launch_and_time_kernel( + stream_config, kernel, - nrepeat, dim3(grid_size), dim3(BlockSize), 0, @@ -717,9 +717,10 @@ struct DeviceConv2dFwdXdl_C_Shuffle_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_W return ave_time; } - float Run(const BaseArgument* p_arg, int nrepeat = 1) override + float Run(const BaseArgument* p_arg, + const StreamConfig& stream_config = StreamConfig{}) override { - return Run(*dynamic_cast(p_arg), nrepeat); + return Run(*dynamic_cast(p_arg), stream_config); } }; diff --git a/include/ck/tensor_operation/gpu/device/device_conv2d_fwd_xdl_nhwc_kyxc_nhwk.hpp b/include/ck/tensor_operation/gpu/device/device_conv2d_fwd_xdl_nhwc_kyxc_nhwk.hpp index 3574f7667e..aa9229f7cb 100644 --- a/include/ck/tensor_operation/gpu/device/device_conv2d_fwd_xdl_nhwc_kyxc_nhwk.hpp +++ b/include/ck/tensor_operation/gpu/device/device_conv2d_fwd_xdl_nhwc_kyxc_nhwk.hpp @@ -450,7 +450,7 @@ struct DeviceConv2dFwdXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K { using Argument = DeviceOp::Argument; - float Run(const Argument& arg, int nrepeat = 1) + float Run(const Argument& arg, const StreamConfig& stream_config = StreamConfig{}) { #if 0 { @@ -498,8 +498,8 @@ struct DeviceConv2dFwdXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K remove_reference_t, true>; - ave_time = launch_and_time_kernel(kernel, - nrepeat, + ave_time = launch_and_time_kernel(stream_config, + kernel, dim3(grid_size), dim3(BlockSize), 0, @@ -529,8 +529,8 @@ struct DeviceConv2dFwdXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K remove_reference_t, false>; - ave_time = launch_and_time_kernel(kernel, - nrepeat, + ave_time = launch_and_time_kernel(stream_config, + kernel, dim3(grid_size), dim3(BlockSize), 0, @@ -549,9 +549,10 @@ struct DeviceConv2dFwdXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K return ave_time; } - float Run(const BaseArgument* p_arg, int nrepeat = 1) override + float Run(const BaseArgument* p_arg, + const StreamConfig& stream_config = StreamConfig{}) override { - return Run(*dynamic_cast(p_arg), nrepeat); + return Run(*dynamic_cast(p_arg), stream_config); } }; diff --git a/include/ck/tensor_operation/gpu/device/device_conv3d_fwd_naive_ndhwc_kzyxc_ndhwk.hpp b/include/ck/tensor_operation/gpu/device/device_conv3d_fwd_naive_ndhwc_kzyxc_ndhwk.hpp index 1bfe0bb256..b1eea0b33f 100644 --- a/include/ck/tensor_operation/gpu/device/device_conv3d_fwd_naive_ndhwc_kzyxc_ndhwk.hpp +++ b/include/ck/tensor_operation/gpu/device/device_conv3d_fwd_naive_ndhwc_kzyxc_ndhwk.hpp @@ -92,7 +92,7 @@ struct DeviceConv3dFwdNaive_Input_N_Di_Hi_Wi_C_Weight_K_Z_Y_X_C_Output_N_Do_Ho_W { using Argument = DeviceOp::Argument; - float Run(const Argument& arg, int nrepeat = 1) + float Run(const Argument& arg, const StreamConfig& stream_config = StreamConfig{}) { const auto naive_conv3d_fwd = ref::naive_conv_fwd_ndhwc_kzyxc_ndhwk; - float ave_time = launch_and_time_kernel(naive_conv3d_fwd, - nrepeat, + float ave_time = launch_and_time_kernel(stream_config, + naive_conv3d_fwd, dim3(256), dim3(256), 0, @@ -137,9 +137,10 @@ struct DeviceConv3dFwdNaive_Input_N_Di_Hi_Wi_C_Weight_K_Z_Y_X_C_Output_N_Do_Ho_W } // polymorphic - float Run(const BaseArgument* p_arg, int nrepeat = 1) override + float Run(const BaseArgument* p_arg, + const StreamConfig& stream_config = StreamConfig{}) override { - return Run(*dynamic_cast(p_arg), nrepeat); + return Run(*dynamic_cast(p_arg), stream_config); } }; diff --git a/include/ck/tensor_operation/gpu/device/device_conv3d_fwd_xdl_ndhwc_kzyxc_ndhwk.hpp b/include/ck/tensor_operation/gpu/device/device_conv3d_fwd_xdl_ndhwc_kzyxc_ndhwk.hpp index ff30a6880d..0f98ba054d 100644 --- a/include/ck/tensor_operation/gpu/device/device_conv3d_fwd_xdl_ndhwc_kzyxc_ndhwk.hpp +++ b/include/ck/tensor_operation/gpu/device/device_conv3d_fwd_xdl_ndhwc_kzyxc_ndhwk.hpp @@ -438,7 +438,7 @@ struct DeviceConv3dFwdXdl_Input_N_Di_Hi_Wi_C_Weight_K_Z_Y_X_C_Output_N_Do_Ho_Wo_ { using Argument = DeviceOp::Argument; - float Run(const Argument& arg, int nrepeat = 1) + float Run(const Argument& arg, const StreamConfig& stream_config = StreamConfig{}) { { std::cout << "num_batches_of_GEMM = " << arg.num_subbatches_ << std::endl; @@ -487,8 +487,8 @@ struct DeviceConv3dFwdXdl_Input_N_Di_Hi_Wi_C_Weight_K_Z_Y_X_C_Output_N_Do_Ho_Wo_ OutElementwiseOperation, remove_reference_t, true>; - ave_time = launch_and_time_kernel(kernel, - nrepeat, + ave_time = launch_and_time_kernel(stream_config, + kernel, dim3(grid_size), dim3(BlockSize), 0, @@ -522,8 +522,8 @@ struct DeviceConv3dFwdXdl_Input_N_Di_Hi_Wi_C_Weight_K_Z_Y_X_C_Output_N_Do_Ho_Wo_ remove_reference_t, false>; - ave_time = launch_and_time_kernel(kernel, - nrepeat, + ave_time = launch_and_time_kernel(stream_config, + kernel, dim3(grid_size), dim3(BlockSize), 0, @@ -547,9 +547,10 @@ struct DeviceConv3dFwdXdl_Input_N_Di_Hi_Wi_C_Weight_K_Z_Y_X_C_Output_N_Do_Ho_Wo_ } // polymorphic - float Run(const BaseArgument* p_arg, int nrepeat = 1) override + float Run(const BaseArgument* p_arg, + const StreamConfig& stream_config = StreamConfig{}) override { - return Run(*dynamic_cast(p_arg), nrepeat); + return Run(*dynamic_cast(p_arg), stream_config); } }; diff --git a/include/ck/tensor_operation/gpu/device/device_convnd_bwd_data_xdl_ndhwc_kzyxc_ndhwk.hpp b/include/ck/tensor_operation/gpu/device/device_convnd_bwd_data_xdl_ndhwc_kzyxc_ndhwk.hpp index 5dca8f9629..209b3c866e 100644 --- a/include/ck/tensor_operation/gpu/device/device_convnd_bwd_data_xdl_ndhwc_kzyxc_ndhwk.hpp +++ b/include/ck/tensor_operation/gpu/device/device_convnd_bwd_data_xdl_ndhwc_kzyxc_ndhwk.hpp @@ -1241,7 +1241,7 @@ struct DeviceConvndBwdDataXdl_Input_N_Di_Hi_Wi_C_Weight_K_Z_Y_X_C_Output_N_Do_Ho { using Argument = DeviceOp::Argument; - float Run(const Argument& arg, int nrepeat = 1) + float Run(const Argument& arg, const StreamConfig& stream_config = StreamConfig{}) { float ave_time = 0; for(size_t i = 0; i < arg.a_grid_desc_k0_m_k1_container_.size(); i++) @@ -1316,8 +1316,8 @@ struct DeviceConvndBwdDataXdl_Input_N_Di_Hi_Wi_C_Weight_K_Z_Y_X_C_Output_N_Do_Ho true>; ave_time += launch_and_time_kernel( + stream_config, kernel, - nrepeat, dim3(grid_size), dim3(BlockSize), 0, @@ -1349,8 +1349,8 @@ struct DeviceConvndBwdDataXdl_Input_N_Di_Hi_Wi_C_Weight_K_Z_Y_X_C_Output_N_Do_Ho false>; ave_time += launch_and_time_kernel( + stream_config, kernel, - nrepeat, dim3(grid_size), dim3(BlockSize), 0, @@ -1369,9 +1369,10 @@ struct DeviceConvndBwdDataXdl_Input_N_Di_Hi_Wi_C_Weight_K_Z_Y_X_C_Output_N_Do_Ho return ave_time; } - float Run(const BaseArgument* p_arg, int nrepeat = 1) override + float Run(const BaseArgument* p_arg, + const StreamConfig& stream_config = StreamConfig{}) override { - return Run(*dynamic_cast(p_arg), nrepeat); + return Run(*dynamic_cast(p_arg), stream_config); } }; diff --git a/include/ck/tensor_operation/gpu/device/device_convnd_fwd_xdl_nhwc_kyxc_nhwk.hpp b/include/ck/tensor_operation/gpu/device/device_convnd_fwd_xdl_nhwc_kyxc_nhwk.hpp index 7365f9a3e2..4251052a99 100644 --- a/include/ck/tensor_operation/gpu/device/device_convnd_fwd_xdl_nhwc_kyxc_nhwk.hpp +++ b/include/ck/tensor_operation/gpu/device/device_convnd_fwd_xdl_nhwc_kyxc_nhwk.hpp @@ -747,7 +747,7 @@ struct DeviceConvNDFwdXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K { using Argument = DeviceOp::Argument; - float Run(const Argument& arg, int nrepeat = 1) + float Run(const Argument& arg, const StreamConfig& stream_config = StreamConfig{}) { #if 0 { @@ -795,8 +795,8 @@ struct DeviceConvNDFwdXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K remove_reference_t, true>; - ave_time = launch_and_time_kernel(kernel, - nrepeat, + ave_time = launch_and_time_kernel(stream_config, + kernel, dim3(grid_size), dim3(BlockSize), 0, @@ -826,8 +826,8 @@ struct DeviceConvNDFwdXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K remove_reference_t, false>; - ave_time = launch_and_time_kernel(kernel, - nrepeat, + ave_time = launch_and_time_kernel(stream_config, + kernel, dim3(grid_size), dim3(BlockSize), 0, @@ -846,9 +846,10 @@ struct DeviceConvNDFwdXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K return ave_time; } - float Run(const BaseArgument* p_arg, int nrepeat = 1) override + float Run(const BaseArgument* p_arg, + const StreamConfig& stream_config = StreamConfig{}) override { - return Run(*dynamic_cast(p_arg), nrepeat); + return Run(*dynamic_cast(p_arg), stream_config); } }; diff --git a/include/ck/tensor_operation/gpu/device/device_gemm_reduce_xdl_cshuffle.hpp b/include/ck/tensor_operation/gpu/device/device_gemm_reduce_xdl_cshuffle.hpp index daa309888f..69c29b72d3 100644 --- a/include/ck/tensor_operation/gpu/device/device_gemm_reduce_xdl_cshuffle.hpp +++ b/include/ck/tensor_operation/gpu/device/device_gemm_reduce_xdl_cshuffle.hpp @@ -503,7 +503,7 @@ struct DeviceGemmReduce_Xdl_CShuffle : public DeviceGemmReduce; - launch_kernel(kernel, - dim3(grid_size), - dim3(BlockSize), - 0, - arg.p_a_grid_, - arg.p_b_grid_, - arg.p_c_grid_, - arg.p_d0_grid_, - arg.p_d1_grid_, - arg.a_element_op_, - arg.b_element_op_, - arg.c_element_op_, - arg.d1_element_op_, - arg.a_grid_desc_ak0_m_ak1_, - arg.b_grid_desc_bk0_n_bk1_, - arg.c_grid_desc_mblock_mperblock_nblock_nperblock_, - arg.d_grid_desc_mblock_mperblock_, - arg.block_2_ctile_map_); + elapsed_time = + launch_and_time_kernel(stream_config, + kernel, + dim3(grid_size), + dim3(BlockSize), + 0, + arg.p_a_grid_, + arg.p_b_grid_, + arg.p_c_grid_, + arg.p_d0_grid_, + arg.p_d1_grid_, + arg.a_element_op_, + arg.b_element_op_, + arg.c_element_op_, + arg.d1_element_op_, + arg.a_grid_desc_ak0_m_ak1_, + arg.b_grid_desc_bk0_n_bk1_, + arg.c_grid_desc_mblock_mperblock_nblock_nperblock_, + arg.d_grid_desc_mblock_mperblock_, + arg.block_2_ctile_map_); } else { @@ -591,33 +594,36 @@ struct DeviceGemmReduce_Xdl_CShuffle : public DeviceGemmReduce; - launch_kernel(kernel, - dim3(grid_size), - dim3(BlockSize), - 0, - arg.p_a_grid_, - arg.p_b_grid_, - arg.p_c_grid_, - arg.p_d0_grid_, - arg.p_d1_grid_, - arg.a_element_op_, - arg.b_element_op_, - arg.c_element_op_, - arg.d1_element_op_, - arg.a_grid_desc_ak0_m_ak1_, - arg.b_grid_desc_bk0_n_bk1_, - arg.c_grid_desc_mblock_mperblock_nblock_nperblock_, - arg.d_grid_desc_mblock_mperblock_, - arg.block_2_ctile_map_); + elapsed_time = + launch_and_time_kernel(stream_config, + kernel, + dim3(grid_size), + dim3(BlockSize), + 0, + arg.p_a_grid_, + arg.p_b_grid_, + arg.p_c_grid_, + arg.p_d0_grid_, + arg.p_d1_grid_, + arg.a_element_op_, + arg.b_element_op_, + arg.c_element_op_, + arg.d1_element_op_, + arg.a_grid_desc_ak0_m_ak1_, + arg.b_grid_desc_bk0_n_bk1_, + arg.c_grid_desc_mblock_mperblock_nblock_nperblock_, + arg.d_grid_desc_mblock_mperblock_, + arg.block_2_ctile_map_); } - return 0; + return elapsed_time; } // polymorphic - float Run(const BaseArgument* p_arg, int nrepeat = 1) override + float Run(const BaseArgument* p_arg, + const StreamConfig& stream_config = StreamConfig{}) override { - return Run(*dynamic_cast(p_arg), nrepeat); + return Run(*dynamic_cast(p_arg), stream_config); } }; diff --git a/include/ck/tensor_operation/gpu/device/device_gemm_xdl.hpp b/include/ck/tensor_operation/gpu/device/device_gemm_xdl.hpp index 47997cd802..2bb7f6e78a 100644 --- a/include/ck/tensor_operation/gpu/device/device_gemm_xdl.hpp +++ b/include/ck/tensor_operation/gpu/device/device_gemm_xdl.hpp @@ -290,7 +290,7 @@ struct DeviceGemmXdl { using Argument = DeviceGemmXdl::Argument; - float Run(const Argument& arg, int nrepeat = 1) + float Run(const Argument& arg, const StreamConfig& stream_config = StreamConfig{}) { #if 0 { @@ -339,8 +339,8 @@ struct DeviceGemmXdl remove_reference_t, true>; - ave_time = launch_and_time_kernel(kernel, - nrepeat, + ave_time = launch_and_time_kernel(stream_config, + kernel, dim3(grid_size), dim3(BlockSize), 0, @@ -370,8 +370,8 @@ struct DeviceGemmXdl remove_reference_t, false>; - ave_time = launch_and_time_kernel(kernel, - nrepeat, + ave_time = launch_and_time_kernel(stream_config, + kernel, dim3(grid_size), dim3(BlockSize), 0, @@ -391,9 +391,10 @@ struct DeviceGemmXdl } // polymorphic - float Run(const BaseArgument* p_arg, int nrepeat = 1) override + float Run(const BaseArgument* p_arg, + const StreamConfig& stream_config = StreamConfig{}) override { - return Run(*dynamic_cast(p_arg), nrepeat); + return Run(*dynamic_cast(p_arg), stream_config); } }; diff --git a/include/ck/tensor_operation/gpu/device/device_gemm_xdl_c_shuffle_bias_2d.hpp b/include/ck/tensor_operation/gpu/device/device_gemm_xdl_c_shuffle_bias_2d.hpp index 4010965312..315f39d9bf 100644 --- a/include/ck/tensor_operation/gpu/device/device_gemm_xdl_c_shuffle_bias_2d.hpp +++ b/include/ck/tensor_operation/gpu/device/device_gemm_xdl_c_shuffle_bias_2d.hpp @@ -264,7 +264,7 @@ struct DeviceGemmXdl_C_Shuffle_Bias_2d { using Argument = DeviceGemmXdl_C_Shuffle_Bias_2d::Argument; - float Run(const Argument& arg, int nrepeat = 1) + float Run(const Argument& arg, const StreamConfig& stream_config = StreamConfig{}) { { std::cout << "arg.a_grid_desc_k0_m_k1_{" << arg.a_grid_desc_k0_m_k1_.GetLength(I0) @@ -320,8 +320,8 @@ struct DeviceGemmXdl_C_Shuffle_Bias_2d true>; ave_time = launch_and_time_kernel( + stream_config, kernel, - nrepeat, dim3(grid_size), dim3(BlockSize), 0, @@ -359,8 +359,8 @@ struct DeviceGemmXdl_C_Shuffle_Bias_2d false>; ave_time = launch_and_time_kernel( + stream_config, kernel, - nrepeat, dim3(grid_size), dim3(BlockSize), 0, @@ -382,9 +382,10 @@ struct DeviceGemmXdl_C_Shuffle_Bias_2d } // polymorphic - float Run(const BaseArgument* p_arg, int nrepeat = 1) override + float Run(const BaseArgument* p_arg, + const StreamConfig& stream_config = StreamConfig{}) override { - return Run(*dynamic_cast(p_arg), nrepeat); + return Run(*dynamic_cast(p_arg), stream_config); } }; diff --git a/include/ck/tensor_operation/gpu/device/device_gemm_xdl_c_shuffle_bias_activation.hpp b/include/ck/tensor_operation/gpu/device/device_gemm_xdl_c_shuffle_bias_activation.hpp index c65ff6022a..f1f9f41724 100644 --- a/include/ck/tensor_operation/gpu/device/device_gemm_xdl_c_shuffle_bias_activation.hpp +++ b/include/ck/tensor_operation/gpu/device/device_gemm_xdl_c_shuffle_bias_activation.hpp @@ -273,7 +273,7 @@ struct DeviceGemmXdl_C_Shuffle_Bias_Activation { using Argument = DeviceOp::Argument; - float Run(const Argument& arg, int nrepeat = 1) + float Run(const Argument& arg, const StreamConfig& stream_config = StreamConfig{}) { { std::cout << "arg.a_grid_desc_k0_m_k1_{" << arg.a_grid_desc_k0_m_k1_.GetLength(I0) @@ -329,8 +329,8 @@ struct DeviceGemmXdl_C_Shuffle_Bias_Activation true>; ave_time = launch_and_time_kernel( + stream_config, kernel, - nrepeat, dim3(grid_size), dim3(BlockSize), 0, @@ -368,8 +368,8 @@ struct DeviceGemmXdl_C_Shuffle_Bias_Activation false>; ave_time = launch_and_time_kernel( + stream_config, kernel, - nrepeat, dim3(grid_size), dim3(BlockSize), 0, @@ -391,9 +391,10 @@ struct DeviceGemmXdl_C_Shuffle_Bias_Activation } // polymorphic - float Run(const BaseArgument* p_arg, int nrepeat = 1) override + float Run(const BaseArgument* p_arg, + const StreamConfig& stream_config = StreamConfig{}) override { - return Run(*dynamic_cast(p_arg), nrepeat); + return Run(*dynamic_cast(p_arg), stream_config); } }; diff --git a/include/ck/tensor_operation/gpu/device/device_gemm_xdl_c_shuffle_bias_activation_add.hpp b/include/ck/tensor_operation/gpu/device/device_gemm_xdl_c_shuffle_bias_activation_add.hpp index 4a478c995d..e3d0986aba 100644 --- a/include/ck/tensor_operation/gpu/device/device_gemm_xdl_c_shuffle_bias_activation_add.hpp +++ b/include/ck/tensor_operation/gpu/device/device_gemm_xdl_c_shuffle_bias_activation_add.hpp @@ -312,7 +312,7 @@ struct DeviceGemmXdl_C_Shuffle_Bias_Activation_Add { using Argument = DeviceOp::Argument; - float Run(const Argument& arg, int nrepeat = 1) + float Run(const Argument& arg, const StreamConfig& stream_config = StreamConfig{}) { { std::cout << "arg.a_grid_desc_k0_m_k1_{" << arg.a_grid_desc_k0_m_k1_.GetLength(I0) @@ -374,8 +374,8 @@ struct DeviceGemmXdl_C_Shuffle_Bias_Activation_Add true>; ave_time = launch_and_time_kernel( + stream_config, kernel, - nrepeat, dim3(grid_size), dim3(BlockSize), 0, @@ -418,8 +418,8 @@ struct DeviceGemmXdl_C_Shuffle_Bias_Activation_Add false>; ave_time = launch_and_time_kernel( + stream_config, kernel, - nrepeat, dim3(grid_size), dim3(BlockSize), 0, @@ -443,9 +443,10 @@ struct DeviceGemmXdl_C_Shuffle_Bias_Activation_Add } // polymorphic - float Run(const BaseArgument* p_arg, int nrepeat = 1) override + float Run(const BaseArgument* p_arg, + const StreamConfig& stream_config = StreamConfig{}) override { - return Run(*dynamic_cast(p_arg), nrepeat); + return Run(*dynamic_cast(p_arg), stream_config); } }; diff --git a/include/ck/tensor_operation/gpu/device/device_gemm_xdl_cshuffle.hpp b/include/ck/tensor_operation/gpu/device/device_gemm_xdl_cshuffle.hpp index fde27acdb1..952630120a 100644 --- a/include/ck/tensor_operation/gpu/device/device_gemm_xdl_cshuffle.hpp +++ b/include/ck/tensor_operation/gpu/device/device_gemm_xdl_cshuffle.hpp @@ -440,7 +440,7 @@ struct DeviceGemm_Xdl_CShuffle { using Argument = DeviceOp::Argument; - float Run(const Argument& arg, int nrepeat = 1) + float Run(const Argument& arg, const StreamConfig& stream_config = StreamConfig{}) { #if 0 { @@ -487,42 +487,22 @@ struct DeviceGemm_Xdl_CShuffle typename GridwiseGemm::DefaultBlock2CTileMap, true>; - if(nrepeat == 0) - { - launch_kernel(kernel, - dim3(grid_size), - dim3(BlockSize), - 0, - arg.p_a_grid_, - arg.p_b_grid_, - arg.p_c_grid_, - arg.a_element_op_, - arg.b_element_op_, - arg.c_element_op_, - arg.a_grid_desc_ak0_m_ak1_, - arg.b_grid_desc_bk0_n_bk1_, - arg.c_grid_desc_mblock_mperblock_nblock_nperblock_, - arg.block_2_ctile_map_); - } - else - { - ave_time = - launch_and_time_kernel(kernel, - nrepeat, - dim3(grid_size), - dim3(BlockSize), - 0, - arg.p_a_grid_, - arg.p_b_grid_, - arg.p_c_grid_, - arg.a_element_op_, - arg.b_element_op_, - arg.c_element_op_, - arg.a_grid_desc_ak0_m_ak1_, - arg.b_grid_desc_bk0_n_bk1_, - arg.c_grid_desc_mblock_mperblock_nblock_nperblock_, - arg.block_2_ctile_map_); - } + ave_time = + launch_and_time_kernel(stream_config, + kernel, + dim3(grid_size), + dim3(BlockSize), + 0, + arg.p_a_grid_, + arg.p_b_grid_, + arg.p_c_grid_, + arg.a_element_op_, + arg.b_element_op_, + arg.c_element_op_, + arg.a_grid_desc_ak0_m_ak1_, + arg.b_grid_desc_bk0_n_bk1_, + arg.c_grid_desc_mblock_mperblock_nblock_nperblock_, + arg.block_2_ctile_map_); } else { @@ -538,52 +518,32 @@ struct DeviceGemm_Xdl_CShuffle typename GridwiseGemm::CGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock, typename GridwiseGemm::DefaultBlock2CTileMap, false>; - - if(nrepeat == 0) - { - launch_kernel(kernel, - dim3(grid_size), - dim3(BlockSize), - 0, - arg.p_a_grid_, - arg.p_b_grid_, - arg.p_c_grid_, - arg.a_element_op_, - arg.b_element_op_, - arg.c_element_op_, - arg.a_grid_desc_ak0_m_ak1_, - arg.b_grid_desc_bk0_n_bk1_, - arg.c_grid_desc_mblock_mperblock_nblock_nperblock_, - arg.block_2_ctile_map_); - } - else - { - ave_time = - launch_and_time_kernel(kernel, - nrepeat, - dim3(grid_size), - dim3(BlockSize), - 0, - arg.p_a_grid_, - arg.p_b_grid_, - arg.p_c_grid_, - arg.a_element_op_, - arg.b_element_op_, - arg.c_element_op_, - arg.a_grid_desc_ak0_m_ak1_, - arg.b_grid_desc_bk0_n_bk1_, - arg.c_grid_desc_mblock_mperblock_nblock_nperblock_, - arg.block_2_ctile_map_); - } + ave_time = + launch_and_time_kernel(stream_config, + kernel, + dim3(grid_size), + dim3(BlockSize), + 0, + arg.p_a_grid_, + arg.p_b_grid_, + arg.p_c_grid_, + arg.a_element_op_, + arg.b_element_op_, + arg.c_element_op_, + arg.a_grid_desc_ak0_m_ak1_, + arg.b_grid_desc_bk0_n_bk1_, + arg.c_grid_desc_mblock_mperblock_nblock_nperblock_, + arg.block_2_ctile_map_); } return ave_time; } // polymorphic - float Run(const BaseArgument* p_arg, int nrepeat = 1) override + float Run(const BaseArgument* p_arg, + const StreamConfig& stream_config = StreamConfig{}) override { - return Run(*dynamic_cast(p_arg), nrepeat); + return Run(*dynamic_cast(p_arg), stream_config); } }; diff --git a/include/ck/tensor_operation/gpu/device/device_gemm_xdl_splitk.hpp b/include/ck/tensor_operation/gpu/device/device_gemm_xdl_splitk.hpp index db6c884739..e603af1fba 100644 --- a/include/ck/tensor_operation/gpu/device/device_gemm_xdl_splitk.hpp +++ b/include/ck/tensor_operation/gpu/device/device_gemm_xdl_splitk.hpp @@ -385,8 +385,11 @@ struct DeviceGemmXdlSplitK std::cout << "arg.c_grid_desc_m_n_{ " << arg.c_grid_desc_m_n_.GetLength(I0) << ", " << arg.c_grid_desc_m_n_.GetLength(I1) << "}" << std::endl; } - float Run(const Argument& arg, int nrepeat = 1) + + float Run(const Argument& arg, const StreamConfig& stream_config = StreamConfig{}) { + ShowInfo(arg); + const auto kbatch = arg.a_grid_desc_kbatch_k0_m_k1_.GetLength(I0); if(!GridwiseGemm::CheckValidity(arg.a_grid_desc_kbatch_k0_m_k1_, @@ -408,50 +411,30 @@ struct DeviceGemmXdlSplitK float ave_time = 0; const auto Run = [&](const auto& kernel) { - if(nrepeat > 0) - { - ShowInfo(arg); - ave_time = launch_and_time_kernel(kernel, - nrepeat, - dim3(grid_size), - dim3(BlockSize), - 0, - arg.p_a_grid_, - arg.p_b_grid_, - arg.p_c_grid_, - arg.a_grid_desc_kbatch_k0_m_k1_, - arg.b_grid_desc_kbatch_k0_n_k1_, - arg.c_grid_desc_m0_n0_m1_n1_m2_m3_m4_n2_, - arg.a_element_op_, - arg.b_element_op_, - arg.c_element_op_, - arg.block_2_ctile_map_); - } + // FIXME: this should be moved outside of DeviceOp + hipGetErrorString( + hipMemset(arg.p_c_grid_, + 0, + arg.c_grid_desc_m0_n0_m1_n1_m2_m3_m4_n2_.GetElementSpaceSize() * + sizeof(CDataType))); - if(kbatch > 1 || nrepeat <= 0) - { - hipGetErrorString( - hipMemset(arg.p_c_grid_, - 0, - arg.c_grid_desc_m0_n0_m1_n1_m2_m3_m4_n2_.GetElementSpaceSize() * - sizeof(CDataType))); - - launch_kernel(kernel, - dim3(grid_size), - dim3(BlockSize), - 0, - arg.p_a_grid_, - arg.p_b_grid_, - arg.p_c_grid_, - arg.a_grid_desc_kbatch_k0_m_k1_, - arg.b_grid_desc_kbatch_k0_n_k1_, - arg.c_grid_desc_m0_n0_m1_n1_m2_m3_m4_n2_, - arg.a_element_op_, - arg.b_element_op_, - arg.c_element_op_, - arg.block_2_ctile_map_); - } + ave_time = launch_and_time_kernel(stream_config, + kernel, + dim3(grid_size), + dim3(BlockSize), + 0, + arg.p_a_grid_, + arg.p_b_grid_, + arg.p_c_grid_, + arg.a_grid_desc_kbatch_k0_m_k1_, + arg.b_grid_desc_kbatch_k0_n_k1_, + arg.c_grid_desc_m0_n0_m1_n1_m2_m3_m4_n2_, + arg.a_element_op_, + arg.b_element_op_, + arg.c_element_op_, + arg.block_2_ctile_map_); }; + if(has_main_k0_block_loop) { if(kbatch == 1) @@ -531,9 +514,10 @@ struct DeviceGemmXdlSplitK } // polymorphic - float Run(const BaseArgument* p_arg, int nrepeat = 1) override + float Run(const BaseArgument* p_arg, + const StreamConfig& stream_config = StreamConfig{}) override { - return Run(*dynamic_cast(p_arg), nrepeat); + return Run(*dynamic_cast(p_arg), stream_config); } }; diff --git a/include/ck/tensor_operation/gpu/device/device_gemm_xdl_splitk_c_shuffle.hpp b/include/ck/tensor_operation/gpu/device/device_gemm_xdl_splitk_c_shuffle.hpp index 9de5361ab6..7d00224429 100644 --- a/include/ck/tensor_operation/gpu/device/device_gemm_xdl_splitk_c_shuffle.hpp +++ b/include/ck/tensor_operation/gpu/device/device_gemm_xdl_splitk_c_shuffle.hpp @@ -391,8 +391,11 @@ struct DeviceGemmXdlSplitKCShuffle std::cout << "arg.c_grid_desc_m_n_{ " << arg.c_grid_desc_m_n_.GetLength(I0) << ", " << arg.c_grid_desc_m_n_.GetLength(I1) << "}" << std::endl; } - float Run(const Argument& arg, int nrepeat = 1) + + float Run(const Argument& arg, const StreamConfig& stream_config = StreamConfig{}) { + ShowInfo(arg); + const auto kbatch = arg.a_grid_desc_kbatch_k0_m_k1_.GetLength(I0); if(!GridwiseGemm::CheckValidity(arg.a_grid_desc_kbatch_k0_m_k1_, @@ -414,51 +417,29 @@ struct DeviceGemmXdlSplitKCShuffle float ave_time = 0; const auto Run = [&](const auto& kernel) { - if(nrepeat > 0) - { - ShowInfo(arg); - ave_time = - launch_and_time_kernel(kernel, - nrepeat, - dim3(grid_size), - dim3(BlockSize), - 0, - arg.p_a_grid_, - arg.p_b_grid_, - arg.p_c_grid_, - arg.a_grid_desc_kbatch_k0_m_k1_, - arg.b_grid_desc_kbatch_k0_n_k1_, - arg.c_grid_desc_mblock_mperblock_nblock_nperblock_, - arg.a_element_op_, - arg.b_element_op_, - arg.c_element_op_, - arg.block_2_ctile_map_); - } + hipGetErrorString(hipMemset( + arg.p_c_grid_, + 0, + arg.c_grid_desc_mblock_mperblock_nblock_nperblock_.GetElementSpaceSize() * + sizeof(CDataType))); - if(kbatch > 1 || nrepeat <= 0) - { - hipGetErrorString(hipMemset( - arg.p_c_grid_, - 0, - arg.c_grid_desc_mblock_mperblock_nblock_nperblock_.GetElementSpaceSize() * - sizeof(CDataType))); - - launch_kernel(kernel, - dim3(grid_size), - dim3(BlockSize), - 0, - arg.p_a_grid_, - arg.p_b_grid_, - arg.p_c_grid_, - arg.a_grid_desc_kbatch_k0_m_k1_, - arg.b_grid_desc_kbatch_k0_n_k1_, - arg.c_grid_desc_mblock_mperblock_nblock_nperblock_, - arg.a_element_op_, - arg.b_element_op_, - arg.c_element_op_, - arg.block_2_ctile_map_); - } + launch_and_time_kernel(stream_config, + kernel, + dim3(grid_size), + dim3(BlockSize), + 0, + arg.p_a_grid_, + arg.p_b_grid_, + arg.p_c_grid_, + arg.a_grid_desc_kbatch_k0_m_k1_, + arg.b_grid_desc_kbatch_k0_n_k1_, + arg.c_grid_desc_mblock_mperblock_nblock_nperblock_, + arg.a_element_op_, + arg.b_element_op_, + arg.c_element_op_, + arg.block_2_ctile_map_); }; + if(has_main_k0_block_loop) { if(kbatch == 1) @@ -542,9 +523,10 @@ struct DeviceGemmXdlSplitKCShuffle } // polymorphic - float Run(const BaseArgument* p_arg, int nrepeat = 1) override + float Run(const BaseArgument* p_arg, + const StreamConfig& stream_config = StreamConfig{}) override { - return Run(*dynamic_cast(p_arg), nrepeat); + return Run(*dynamic_cast(p_arg), stream_config); } }; diff --git a/include/ck/tensor_operation/gpu/device/device_grouped_gemm_xdl.hpp b/include/ck/tensor_operation/gpu/device/device_grouped_gemm_xdl.hpp index dfc1ce2715..730b2d787e 100644 --- a/include/ck/tensor_operation/gpu/device/device_grouped_gemm_xdl.hpp +++ b/include/ck/tensor_operation/gpu/device/device_grouped_gemm_xdl.hpp @@ -449,7 +449,7 @@ struct DeviceGroupedGemmXdl { using Argument = DeviceGroupedGemmXdl::Argument; - float Run(const Argument& arg, int nrepeat = 1) + float Run(const Argument& arg, const StreamConfig& stream_config = StreamConfig{}) { StaticallyIndexedArray gemm_desc_kernel_args; @@ -510,8 +510,8 @@ struct DeviceGroupedGemmXdl true, MaxGroupCount>; - ave_time = launch_and_time_kernel(kernel, - nrepeat, + ave_time = launch_and_time_kernel(stream_config, + kernel, dim3(arg.grid_size_), dim3(BlockSize), 0, @@ -534,8 +534,8 @@ struct DeviceGroupedGemmXdl false, MaxGroupCount>; - ave_time = launch_and_time_kernel(kernel, - nrepeat, + ave_time = launch_and_time_kernel(stream_config, + kernel, dim3(arg.grid_size_), dim3(BlockSize), 0, @@ -550,9 +550,10 @@ struct DeviceGroupedGemmXdl } // polymorphic - float Run(const BaseArgument* p_arg, int nrepeat = 1) override + float Run(const BaseArgument* p_arg, + const StreamConfig& stream_config = StreamConfig{}) override { - return Run(*dynamic_cast(p_arg), nrepeat); + return Run(*dynamic_cast(p_arg), stream_config); } }; diff --git a/include/ck/tensor_operation/gpu/device/device_pool2d_fwd_nhwc_nhwc.hpp b/include/ck/tensor_operation/gpu/device/device_pool2d_fwd_nhwc_nhwc.hpp index 651d31ae2f..f665378e08 100644 --- a/include/ck/tensor_operation/gpu/device/device_pool2d_fwd_nhwc_nhwc.hpp +++ b/include/ck/tensor_operation/gpu/device/device_pool2d_fwd_nhwc_nhwc.hpp @@ -204,7 +204,7 @@ struct DevicePool2dFwd_Input_N_Hi_Wi_C_Output_N_Ho_Wo_C : public DevicePool2dFwd struct Invoker : public BaseInvoker { - float Run(const Argument& arg, int nrepeat = 1) + float Run(const Argument& arg, const StreamConfig& stream_config = StreamConfig{}) { using gridwise_reduce = GridwiseReduction_mk_to_m_threadwise(p_arg), nrepeat); + return Run(*dynamic_cast(p_arg), stream_config); } }; diff --git a/include/ck/tensor_operation/gpu/device/device_reduce_blockwise.hpp b/include/ck/tensor_operation/gpu/device/device_reduce_blockwise.hpp index 4f17989b53..860f53d8c5 100644 --- a/include/ck/tensor_operation/gpu/device/device_reduce_blockwise.hpp +++ b/include/ck/tensor_operation/gpu/device/device_reduce_blockwise.hpp @@ -211,7 +211,7 @@ struct DeviceReduceBlockWise : public DeviceReduce; - avg_time = launch_and_time_kernel(kernel, - nrepeat, + avg_time = launch_and_time_kernel(stream_config, + kernel, dim3(arg.gridSize), dim3(BlockSize), 0, @@ -272,9 +272,10 @@ struct DeviceReduceBlockWise : public DeviceReduce(p_arg), nrepeat); + return Run(*dynamic_cast(p_arg), stream_config); }; }; diff --git a/include/ck/tensor_operation/gpu/device/device_reduce_blockwise_second_call.hpp b/include/ck/tensor_operation/gpu/device/device_reduce_blockwise_second_call.hpp index d3b1b4b5c3..43ac48cecc 100644 --- a/include/ck/tensor_operation/gpu/device/device_reduce_blockwise_second_call.hpp +++ b/include/ck/tensor_operation/gpu/device/device_reduce_blockwise_second_call.hpp @@ -182,7 +182,7 @@ struct DeviceReduceBlockWiseSecondCall struct Invoker : public BaseInvoker { - float Run(const Argument& arg, int nrepeat = 1) + float Run(const Argument& arg, const StreamConfig& stream_config = StreamConfig{}) { const auto in_grid_desc_m_k = DeviceReduceBlockWiseSecondCall::MakeSrc2dDescriptor( arg.inLengths_, arg.inStrides_); @@ -224,8 +224,8 @@ struct DeviceReduceBlockWiseSecondCall InElementwiseOperation, AccElementwiseOperation>; - avg_time = launch_and_time_kernel(kernel, - nrepeat, + avg_time = launch_and_time_kernel(stream_config, + kernel, dim3(arg.gridSize), dim3(BlockSize), 0, @@ -243,10 +243,11 @@ struct DeviceReduceBlockWiseSecondCall return (avg_time); }; - float Run(const BaseArgument* p_arg, int nrepeat = 1) override + float Run(const BaseArgument* p_arg, + const StreamConfig& stream_config = StreamConfig{}) override { - return Run(*dynamic_cast(p_arg), nrepeat); - }; + return Run(*dynamic_cast(p_arg), stream_config); + } }; bool IsSupportedArgument(const BaseArgument* p_arg) override diff --git a/include/ck/tensor_operation/gpu/device/device_reduce_multiblock_atomic_add.hpp b/include/ck/tensor_operation/gpu/device/device_reduce_multiblock_atomic_add.hpp index 889c366875..f93c65fe18 100644 --- a/include/ck/tensor_operation/gpu/device/device_reduce_multiblock_atomic_add.hpp +++ b/include/ck/tensor_operation/gpu/device/device_reduce_multiblock_atomic_add.hpp @@ -245,7 +245,7 @@ struct DeviceReduceMultiBlockAtomicAdd struct Invoker : public BaseInvoker { - float Run(const Argument& arg, int nrepeat = 1) + float Run(const Argument& arg, const StreamConfig& stream_config = StreamConfig{}) { const auto in_grid_desc_m_k = DeviceReduceMultiBlockAtomicAdd::MakeSrc2dDescriptor( arg.inLengths_, arg.inStrides_, arg.blkGroupSize, arg.kBlockTileIterations); @@ -275,8 +275,6 @@ struct DeviceReduceMultiBlockAtomicAdd float avg_time = 0; - KernelTimer timer; - const auto kernel_pre = kernel_buffer_set_value; const auto kernel_main = kernel_reduce_multiblock_atocmi_add; - printf("launch_and_time_kernel: grid_dim {%ld, 1, 1}, block_dim {%d, 1, 1} \n", - arg.gridSize, - BlockSize); - printf("Warm up\n"); + avg_time += launch_and_time_kernel(stream_config, + kernel_pre, + dim3(arg.gridSize_pre), + dim3(BlockSize), + 0, + out_grid_desc_m, + arg.out_dev_, + static_cast(0.0f)); - for(int i = 0; i < nrepeat + 1; i++) - { - if(i == 1) - timer.Start(); + avg_time += launch_and_time_kernel(stream_config, + kernel_main, + dim3(arg.gridSize), + dim3(BlockSize), + 0, + in_grid_desc_m_k, + out_grid_desc_m, + arg.in_elementwise_op_, + arg.acc_elementwise_op_, + arg.blkGroupSize, + arg.kBlockTileIterations, + arg.alpha_, + arg.in_dev_, + arg.out_dev_); - launch_kernel(kernel_pre, - dim3(arg.gridSize_pre), - dim3(BlockSize), - 0, - out_grid_desc_m, - arg.out_dev_, - static_cast(0.0f)); + return avg_time; + } - launch_kernel(kernel_main, - dim3(arg.gridSize), - dim3(BlockSize), - 0, - in_grid_desc_m_k, - out_grid_desc_m, - arg.in_elementwise_op_, - arg.acc_elementwise_op_, - arg.blkGroupSize, - arg.kBlockTileIterations, - arg.alpha_, - arg.in_dev_, - arg.out_dev_); - }; - - timer.End(); - - avg_time = timer.GetElapsedTime() / nrepeat; - - return (avg_time); - }; - - float Run(const BaseArgument* p_arg, int nrepeat = 1) override + float Run(const BaseArgument* p_arg, + const StreamConfig& stream_config = StreamConfig{}) override { - return Run(*dynamic_cast(p_arg), nrepeat); - }; + return Run(*dynamic_cast(p_arg), stream_config); + } }; bool IsSupportedArgument(const BaseArgument* p_arg) override diff --git a/include/ck/tensor_operation/gpu/device/device_reduce_multiblock_partial_reduce.hpp b/include/ck/tensor_operation/gpu/device/device_reduce_multiblock_partial_reduce.hpp index d583f7f1b8..b4eb8116c2 100644 --- a/include/ck/tensor_operation/gpu/device/device_reduce_multiblock_partial_reduce.hpp +++ b/include/ck/tensor_operation/gpu/device/device_reduce_multiblock_partial_reduce.hpp @@ -273,7 +273,7 @@ struct DeviceReduceMultiBlockPartialReduce struct Invoker : public BaseInvoker { - float Run(const Argument& arg, int nrepeat = 1) + float Run(const Argument& arg, const StreamConfig& stream_config = StreamConfig{}) { const auto in_grid_desc_m_k = DeviceReduceMultiBlockPartialReduce::MakeSrc2dDescriptor( arg.inLengths_, arg.inStrides_, arg.blkGroupSize, arg.kBlockTileIterations); @@ -313,8 +313,8 @@ struct DeviceReduceMultiBlockPartialReduce InElementwiseOperation, AccElementwiseOperation>; - avg_time = launch_and_time_kernel(kernel, - nrepeat, + avg_time = launch_and_time_kernel(stream_config, + kernel, dim3(arg.gridSize), dim3(BlockSize), 0, @@ -331,10 +331,11 @@ struct DeviceReduceMultiBlockPartialReduce return (avg_time); }; - float Run(const BaseArgument* p_arg, int nrepeat = 1) override + float Run(const BaseArgument* p_arg, + const StreamConfig& stream_config = StreamConfig{}) override { - return Run(*dynamic_cast(p_arg), nrepeat); - }; + return Run(*dynamic_cast(p_arg), stream_config); + } }; bool IsSupportedArgument(const BaseArgument* p_arg) override diff --git a/include/ck/tensor_operation/gpu/device/device_reduce_threadwise.hpp b/include/ck/tensor_operation/gpu/device/device_reduce_threadwise.hpp index bf4088a96b..dacb175043 100644 --- a/include/ck/tensor_operation/gpu/device/device_reduce_threadwise.hpp +++ b/include/ck/tensor_operation/gpu/device/device_reduce_threadwise.hpp @@ -212,7 +212,7 @@ struct DeviceReduceThreadWise : public DeviceReduce; - avg_time = launch_and_time_kernel(kernel, - nrepeat, + avg_time = launch_and_time_kernel(stream_config, + kernel, dim3(arg.gridSize), dim3(BlockSize), 0, @@ -272,10 +272,11 @@ struct DeviceReduceThreadWise : public DeviceReduce(p_arg), nrepeat); - }; + return Run(*dynamic_cast(p_arg), stream_config); + } }; bool IsSupportedArgument(const BaseArgument* p_arg) override diff --git a/library/include/ck/library/host/host_interface.hpp b/library/include/ck/library/host/host_interface.hpp new file mode 100644 index 0000000000..955da0f4be --- /dev/null +++ b/library/include/ck/library/host/host_interface.hpp @@ -0,0 +1,54 @@ +#pragma once + +#include +#include + +#include "stream_config.hpp" +#include "config.hpp" +#include "device_base.hpp" + +struct DeviceConvFwdPtr_t +{ + using BaseArgument = ck::tensor_operation::device::BaseArgument; + using BaseInvoker = ck::tensor_operation::device::BaseInvoker; + + struct DeviceConvFwdPtrImpl; + std::unique_ptr pImpl; + DeviceConvFwdPtr_t(); + ~DeviceConvFwdPtr_t(); + DeviceConvFwdPtr_t(DeviceConvFwdPtr_t&&); + DeviceConvFwdPtr_t(DeviceConvFwdPtrImpl&); + DeviceConvFwdPtr_t& operator=(DeviceConvFwdPtr_t&) = delete; + DeviceConvFwdPtr_t& operator=(const DeviceConvFwdPtr_t&) = delete; + std::unique_ptr + MakeArgumentPointer(void* in_ptr, + void* wei_ptr, + void* out_ptr, + size_t N, + size_t K, + size_t C, + std::vector input_spatial_lengths, + std::vector filter_spatial_lengths, + std::vector output_spatial_lengths, + std::vector conv_filter_strides, + std::vector conv_filter_dilations, + std::vector input_left_pads, + std::vector input_right_pads) + const; // in,wei and out element ops are ignored for now since even if we change them, they + // cant be linked + std::unique_ptr + MakeInvokerPointer() const; // requires including BaseInvoker headers + std::string GetTypeString(); + bool IsSupportedArgument(const BaseArgument* arg_ptr); +}; + +void add_device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_f32_instances_t( + std::vector& instances); +void add_device_conv2d_fwd_xdl_c_shuffle_nhwc_kyxc_nhwk_f16_instances_t( + std::vector& instances); +void add_device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_bf16_instances_t( + std::vector& instances); +void add_device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_f16_instances_t( + std::vector& instances); +void add_device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_int8_instances_t( + std::vector& instances); diff --git a/library/include/ck/library/host_tensor/device.hpp b/library/include/ck/library/host_tensor/device.hpp index f33b8d4f40..d549b14c8c 100644 --- a/library/include/ck/library/host_tensor/device.hpp +++ b/library/include/ck/library/host_tensor/device.hpp @@ -1,12 +1,25 @@ -#ifndef DEVICE_HPP -#define DEVICE_HPP +#pragma once #include #include #include #include -#include "hip/hip_runtime.h" -#include "hip/hip_fp16.h" +#include +#include + +#include "stream_config.hpp" +#include "ck/options.hpp" + +inline void hip_check_error(hipError_t x) +{ + if(x != hipSuccess) + { + std::ostringstream ss; + ss << "HIP runtime error: " << hipGetErrorString(x) << ". " << __FILE__ << ": " << __LINE__ + << "in function: " << __func__; + throw std::runtime_error(ss.str()); + } +} struct DeviceMem { @@ -36,49 +49,59 @@ struct KernelTimer std::unique_ptr impl; }; -using device_stream_t = hipStream_t; - template -void launch_kernel(F kernel, dim3 grid_dim, dim3 block_dim, std::size_t lds_byte, Args... args) +float launch_and_time_kernel(const StreamConfig& stream_config, + F kernel, + dim3 grid_dim, + dim3 block_dim, + std::size_t lds_byte, + Args... args) { - hipStream_t stream_id = nullptr; - - hipLaunchKernelGGL(kernel, grid_dim, block_dim, lds_byte, stream_id, args...); -} - -template -float launch_and_time_kernel( - F kernel, int nrepeat, dim3 grid_dim, dim3 block_dim, std::size_t lds_byte, Args... args) -{ - KernelTimer timer; - - printf("%s: grid_dim {%d, %d, %d}, block_dim {%d, %d, %d} \n", - __func__, - grid_dim.x, - grid_dim.y, - grid_dim.z, - block_dim.x, - block_dim.y, - block_dim.z); - - printf("Warm up\n"); - - hipStream_t stream_id = nullptr; - - // warm up - hipLaunchKernelGGL(kernel, grid_dim, block_dim, lds_byte, stream_id, args...); - - printf("Start running %d times...\n", nrepeat); - - timer.Start(); - - for(int i = 0; i < nrepeat; ++i) +#if CK_TIME_KERNEL + if(stream_config.time_kernel_) { - hipLaunchKernelGGL(kernel, grid_dim, block_dim, lds_byte, stream_id, args...); + printf("%s: grid_dim {%d, %d, %d}, block_dim {%d, %d, %d} \n", + __func__, + grid_dim.x, + grid_dim.y, + grid_dim.z, + block_dim.x, + block_dim.y, + block_dim.z); + + const int nrepeat = 10; + + printf("Warm up 1 time\n"); + + // warm up + hipLaunchKernelGGL( + kernel, grid_dim, block_dim, lds_byte, stream_config.stream_id_, args...); + + printf("Start running %d times...\n", nrepeat); + + KernelTimer timer; + timer.Start(); + + for(int i = 0; i < nrepeat; ++i) + { + hipLaunchKernelGGL( + kernel, grid_dim, block_dim, lds_byte, stream_config.stream_id_, args...); + } + + timer.End(); + + return timer.GetElapsedTime() / nrepeat; } + else + { + hipLaunchKernelGGL( + kernel, grid_dim, block_dim, lds_byte, stream_config.stream_id_, args...); - timer.End(); + return 0; + } +#else + hipLaunchKernelGGL(kernel, grid_dim, block_dim, lds_byte, stream_config.stream_id_, args...); - return timer.GetElapsedTime() / nrepeat; -} + return 0; #endif +} diff --git a/library/include/ck/library/reference_tensor_operation/cpu/reference_batched_gemm.hpp b/library/include/ck/library/reference_tensor_operation/cpu/reference_batched_gemm.hpp index 3a706dac0b..f4944a28d2 100644 --- a/library/include/ck/library/reference_tensor_operation/cpu/reference_batched_gemm.hpp +++ b/library/include/ck/library/reference_tensor_operation/cpu/reference_batched_gemm.hpp @@ -84,7 +84,8 @@ struct ReferenceBatchedGemm : public device::BaseOperator return 0; } - float Run(const device::BaseArgument* p_arg, int) override + float Run(const device::BaseArgument* p_arg, + const StreamConfig& /* stream_config */ = StreamConfig{}) override { return Run(*dynamic_cast(p_arg)); } diff --git a/library/include/ck/library/reference_tensor_operation/cpu/reference_conv_backward_weight.hpp b/library/include/ck/library/reference_tensor_operation/cpu/reference_conv_backward_weight.hpp index c5f3cbad69..10619ae6d9 100644 --- a/library/include/ck/library/reference_tensor_operation/cpu/reference_conv_backward_weight.hpp +++ b/library/include/ck/library/reference_tensor_operation/cpu/reference_conv_backward_weight.hpp @@ -121,7 +121,8 @@ struct ReferenceConvBwdWeight : public device::BaseOperator return 0; } - float Run(const device::BaseArgument* p_arg, int) override + float Run(const device::BaseArgument* p_arg, + const StreamConfig& /*stream_config*/ = StreamConfig{}) override { return Run(*dynamic_cast(p_arg)); } diff --git a/library/include/ck/library/reference_tensor_operation/cpu/reference_conv_bwd_data.hpp b/library/include/ck/library/reference_tensor_operation/cpu/reference_conv_bwd_data.hpp index 9e91f06e7f..45fc8b8503 100644 --- a/library/include/ck/library/reference_tensor_operation/cpu/reference_conv_bwd_data.hpp +++ b/library/include/ck/library/reference_tensor_operation/cpu/reference_conv_bwd_data.hpp @@ -291,7 +291,8 @@ struct ReferenceConvBwdData : public device::BaseOperator } } - float Run(const device::BaseArgument* p_arg, int) override + float Run(const device::BaseArgument* p_arg, + const StreamConfig& /* stream_config */ = StreamConfig{}) override { return Run(*dynamic_cast(p_arg)); } diff --git a/library/include/ck/library/reference_tensor_operation/cpu/reference_conv_fwd.hpp b/library/include/ck/library/reference_tensor_operation/cpu/reference_conv_fwd.hpp index 65e59db2f8..d1afa898e4 100644 --- a/library/include/ck/library/reference_tensor_operation/cpu/reference_conv_fwd.hpp +++ b/library/include/ck/library/reference_tensor_operation/cpu/reference_conv_fwd.hpp @@ -1,9 +1,10 @@ -#ifndef REFERENCE_CONV_FWD_HPP -#define REFERENCE_CONV_FWD_HPP +#pragma once #include #include #include + +#include "stream_config.hpp" #include "device_base.hpp" #include "host_tensor.hpp" @@ -251,7 +252,8 @@ struct ReferenceConvFwd : public device::BaseOperator } } - float Run(const device::BaseArgument* p_arg, int) override + float Run(const device::BaseArgument* p_arg, + const StreamConfig& /*stream_config*/ = StreamConfig{}) override { return Run(*dynamic_cast(p_arg)); } @@ -311,4 +313,3 @@ struct ReferenceConvFwd : public device::BaseOperator } // namespace host } // namespace tensor_operation } // namespace ck -#endif diff --git a/library/include/ck/library/reference_tensor_operation/cpu/reference_conv_fwd_bias_activation.hpp b/library/include/ck/library/reference_tensor_operation/cpu/reference_conv_fwd_bias_activation.hpp index ee95cd410a..4be6169c15 100644 --- a/library/include/ck/library/reference_tensor_operation/cpu/reference_conv_fwd_bias_activation.hpp +++ b/library/include/ck/library/reference_tensor_operation/cpu/reference_conv_fwd_bias_activation.hpp @@ -124,7 +124,8 @@ struct ReferenceConvFwd_Bias_Activation : public device::BaseOperator return 0; } - float Run(const device::BaseArgument* p_arg, int) override + float Run(const device::BaseArgument* p_arg, + const StreamConfig& /* stream_config */ = StreamConfig{}) override { return Run(*dynamic_cast(p_arg)); } diff --git a/library/include/ck/library/reference_tensor_operation/cpu/reference_conv_fwd_bias_activation_add.hpp b/library/include/ck/library/reference_tensor_operation/cpu/reference_conv_fwd_bias_activation_add.hpp index 11232cc98f..466537c686 100644 --- a/library/include/ck/library/reference_tensor_operation/cpu/reference_conv_fwd_bias_activation_add.hpp +++ b/library/include/ck/library/reference_tensor_operation/cpu/reference_conv_fwd_bias_activation_add.hpp @@ -130,7 +130,8 @@ struct ReferenceConvFwd_Bias_Activation_Add : public device::BaseOperator return 0; } - float Run(const device::BaseArgument* p_arg, int) override + float Run(const device::BaseArgument* p_arg, + const StreamConfig& /*stream_config*/ = StreamConfig{}) override { return Run(*dynamic_cast(p_arg)); } diff --git a/library/include/ck/library/reference_tensor_operation/cpu/reference_gemm.hpp b/library/include/ck/library/reference_tensor_operation/cpu/reference_gemm.hpp index 1b49ca5740..d89c8f5e05 100644 --- a/library/include/ck/library/reference_tensor_operation/cpu/reference_gemm.hpp +++ b/library/include/ck/library/reference_tensor_operation/cpu/reference_gemm.hpp @@ -80,7 +80,8 @@ struct ReferenceGemm : public device::BaseOperator return 0; } - float Run(const device::BaseArgument* p_arg, int) override + float Run(const device::BaseArgument* p_arg, + const StreamConfig& /* stream_config */ = StreamConfig{}) override { return Run(*dynamic_cast(p_arg)); } diff --git a/library/include/ck/library/reference_tensor_operation/cpu/reference_gemm_bias_2d.hpp b/library/include/ck/library/reference_tensor_operation/cpu/reference_gemm_bias_2d.hpp index 7dd6fc9199..3e7f220e03 100644 --- a/library/include/ck/library/reference_tensor_operation/cpu/reference_gemm_bias_2d.hpp +++ b/library/include/ck/library/reference_tensor_operation/cpu/reference_gemm_bias_2d.hpp @@ -82,7 +82,8 @@ struct ReferenceGemmBias2D : public device::BaseOperator return 0; } - float Run(const device::BaseArgument* p_arg, int) override + float Run(const device::BaseArgument* p_arg, + const StreamConfig& /* stream_config */ = StreamConfig{}) override { return Run(*dynamic_cast(p_arg)); } diff --git a/library/include/ck/library/reference_tensor_operation/cpu/reference_gemm_bias_activation.hpp b/library/include/ck/library/reference_tensor_operation/cpu/reference_gemm_bias_activation.hpp index 7c9df272c2..60f72e9e51 100644 --- a/library/include/ck/library/reference_tensor_operation/cpu/reference_gemm_bias_activation.hpp +++ b/library/include/ck/library/reference_tensor_operation/cpu/reference_gemm_bias_activation.hpp @@ -85,7 +85,8 @@ struct ReferenceGemmBiasActivation : public device::BaseOperator return 0; } - float Run(const device::BaseArgument* p_arg, int) override + float Run(const device::BaseArgument* p_arg, + const StreamConfig& /* stream_config */ = StreamConfig{}) override { return Run(*dynamic_cast(p_arg)); } diff --git a/library/include/ck/library/reference_tensor_operation/cpu/reference_gemm_bias_activation_add.hpp b/library/include/ck/library/reference_tensor_operation/cpu/reference_gemm_bias_activation_add.hpp index 4d3c5effae..5e0ec75e5e 100644 --- a/library/include/ck/library/reference_tensor_operation/cpu/reference_gemm_bias_activation_add.hpp +++ b/library/include/ck/library/reference_tensor_operation/cpu/reference_gemm_bias_activation_add.hpp @@ -91,7 +91,8 @@ struct ReferenceGemmBiasActivationAdd : public device::BaseOperator return 0; } - float Run(const device::BaseArgument* p_arg, int) override + float Run(const device::BaseArgument* p_arg, + const StreamConfig& /* stream_config */ = StreamConfig{}) override { return Run(*dynamic_cast(p_arg)); } diff --git a/library/include/ck/library/utility/op_instance_engine.hpp b/library/include/ck/library/utility/op_instance_engine.hpp index ec88b4e1b9..5429f66d3e 100644 --- a/library/include/ck/library/utility/op_instance_engine.hpp +++ b/library/include/ck/library/utility/op_instance_engine.hpp @@ -128,7 +128,7 @@ class OpInstanceRunEngine template ProfileBestConfig Profile(const std::vector& op_ptrs, - int nrepeat = 100, + bool time_kernel = false, bool do_verification = false, bool do_log = false) { @@ -143,7 +143,7 @@ class OpInstanceRunEngine if(op_ptr->IsSupportedArgument(argument.get())) { std::string op_name = op_ptr->GetTypeString(); - float avg_time = invoker->Run(argument.get(), nrepeat); + float avg_time = invoker->Run(argument.get(), StreamConfig{nullptr, time_kernel}); std::size_t flops = op_instance_.GetFlops(); std::size_t num_btype = op_instance_.GetBtype(); diff --git a/library/src/host_tensor/CMakeLists.txt b/library/src/host_tensor/CMakeLists.txt index fd100e477f..2a020b763d 100644 --- a/library/src/host_tensor/CMakeLists.txt +++ b/library/src/host_tensor/CMakeLists.txt @@ -10,10 +10,31 @@ set(HOST_TENSOR_SOURCE host_tensor.cpp ) -add_library(host_tensor SHARED ${HOST_TENSOR_SOURCE}) +add_library(host_tensor STATIC ${HOST_TENSOR_SOURCE}) +add_library(composable_kernel::host_tensor ALIAS host_tensor) + target_compile_features(host_tensor PUBLIC) set_target_properties(host_tensor PROPERTIES POSITION_INDEPENDENT_CODE ON) target_include_directories(host_tensor SYSTEM PUBLIC $) -install(TARGETS host_tensor LIBRARY DESTINATION lib) + +target_include_directories(host_tensor PUBLIC + "$" + "$" + "$" +) + +install(TARGETS host_tensor + EXPORT host_tensorTargets + LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR} + ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR} + RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR} + INCLUDES DESTINATION ${CMAKE_INSTALL_INCLUDEDIR} +) + +install(EXPORT host_tensorTargets + FILE composable_kernelhost_tensorTargets.cmake + NAMESPACE composable_kernel:: + DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/composable_kernel +) clang_tidy_check(host_tensor) diff --git a/library/src/host_tensor/device.cpp b/library/src/host_tensor/device.cpp index 3e80df80fb..9f0d982dbc 100644 --- a/library/src/host_tensor/device.cpp +++ b/library/src/host_tensor/device.cpp @@ -2,7 +2,7 @@ DeviceMem::DeviceMem(std::size_t mem_size) : mMemSize(mem_size) { - hipGetErrorString(hipMalloc(static_cast(&mpDeviceBuf), mMemSize)); + hip_check_error(hipMalloc(static_cast(&mpDeviceBuf), mMemSize)); } void* DeviceMem::GetDeviceBuffer() { return mpDeviceBuf; } @@ -11,49 +11,48 @@ std::size_t DeviceMem::GetBufferSize() { return mMemSize; } void DeviceMem::ToDevice(const void* p) { - hipGetErrorString( - hipMemcpy(mpDeviceBuf, const_cast(p), mMemSize, hipMemcpyHostToDevice)); + hip_check_error(hipMemcpy(mpDeviceBuf, const_cast(p), mMemSize, hipMemcpyHostToDevice)); } void DeviceMem::FromDevice(void* p) { - hipGetErrorString(hipMemcpy(p, mpDeviceBuf, mMemSize, hipMemcpyDeviceToHost)); + hip_check_error(hipMemcpy(p, mpDeviceBuf, mMemSize, hipMemcpyDeviceToHost)); } -void DeviceMem::SetZero() { hipGetErrorString(hipMemset(mpDeviceBuf, 0, mMemSize)); } +void DeviceMem::SetZero() { hip_check_error(hipMemset(mpDeviceBuf, 0, mMemSize)); } -DeviceMem::~DeviceMem() { hipGetErrorString(hipFree(mpDeviceBuf)); } +DeviceMem::~DeviceMem() { hip_check_error(hipFree(mpDeviceBuf)); } struct KernelTimerImpl { KernelTimerImpl() { - hipGetErrorString(hipEventCreate(&mStart)); - hipGetErrorString(hipEventCreate(&mEnd)); + hip_check_error(hipEventCreate(&mStart)); + hip_check_error(hipEventCreate(&mEnd)); } ~KernelTimerImpl() { - hipGetErrorString(hipEventDestroy(mStart)); - hipGetErrorString(hipEventDestroy(mEnd)); + hip_check_error(hipEventDestroy(mStart)); + hip_check_error(hipEventDestroy(mEnd)); } void Start() { - hipGetErrorString(hipDeviceSynchronize()); - hipGetErrorString(hipEventRecord(mStart, nullptr)); + hip_check_error(hipDeviceSynchronize()); + hip_check_error(hipEventRecord(mStart, nullptr)); } void End() { - hipGetErrorString(hipEventRecord(mEnd, nullptr)); - hipGetErrorString(hipEventSynchronize(mEnd)); + hip_check_error(hipEventRecord(mEnd, nullptr)); + hip_check_error(hipEventSynchronize(mEnd)); } float GetElapsedTime() const { float time; - hipGetErrorString(hipEventElapsedTime(&time, mStart, mEnd)); + hip_check_error(hipEventElapsedTime(&time, mStart, mEnd)); return time; } diff --git a/library/src/tensor_operation_instance/gpu/CMakeLists.txt b/library/src/tensor_operation_instance/gpu/CMakeLists.txt index 7b361b48bd..5abfb0c074 100644 --- a/library/src/tensor_operation_instance/gpu/CMakeLists.txt +++ b/library/src/tensor_operation_instance/gpu/CMakeLists.txt @@ -11,6 +11,7 @@ include_directories(BEFORE ${PROJECT_SOURCE_DIR}/include/ck/tensor_operation/gpu/thread ${PROJECT_SOURCE_DIR}/include/ck/tensor_operation/gpu/element ${PROJECT_SOURCE_DIR}/library/include/ck/library/host_tensor + ${PROJECT_SOURCE_DIR}/library/include/ck/library/host ${PROJECT_SOURCE_DIR}/library/include/ck/library/tensor_operation_instance ${PROJECT_SOURCE_DIR}/library/include/ck/library/tensor_operation_instance/gpu/reduce ${PROJECT_SOURCE_DIR}/external/include/half @@ -18,7 +19,7 @@ include_directories(BEFORE function(add_instance_library INSTANCE_NAME) message("adding instance ${INSTANCE_NAME}") - add_library(${INSTANCE_NAME} SHARED ${ARGN}) + add_library(${INSTANCE_NAME} OBJECT ${ARGN}) target_compile_features(${INSTANCE_NAME} PUBLIC) set_target_properties(${INSTANCE_NAME} PROPERTIES POSITION_INDEPENDENT_CODE ON) endfunction(add_instance_library INSTANCE_NAME) @@ -41,3 +42,73 @@ add_subdirectory(convnd_bwd_data) add_subdirectory(grouped_gemm) add_subdirectory(conv2d_bwd_weight) add_subdirectory(batched_gemm_reduce) + +add_library(device_operations STATIC + $ + $ + $ + $ + $ + $ + $ + $ + $ + $ + $ + $ + $ + $ + $ + $ + $ + device_conv2d.cpp +) +add_library(composablekernels::device_operations ALIAS device_operations) + + +set(DEV_OPS_INC_DIRS + ${PROJECT_SOURCE_DIR}/include/ck/ + ${PROJECT_SOURCE_DIR}/library/include/ck/ + ${PROJECT_SOURCE_DIR}/external/include/ +) +target_compile_features(device_operations PUBLIC) +set_target_properties(device_operations PROPERTIES POSITION_INDEPENDENT_CODE ON) +target_include_directories(device_operations PUBLIC + $ + $ + $ + $ + $ + $ + $ + $ + $ + $ + $ + $ + $ + $ + $ + $ +) + +#once new arches are enabled make this an option on the main cmake file +# and pass down here to be exported + +target_compile_options(device_operations +PRIVATE --offload-arch=gfx908 +) +# install(TARGETS device_operations LIBRARY DESTINATION lib) +install(TARGETS device_operations + EXPORT device_operationsTargets + LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR} + ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR} + RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR} + INCLUDES DESTINATION ${CMAKE_INSTALL_INCLUDEDIR} +) +install(DIRECTORY ${DEV_OPS_INC_DIRS} DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/ck) +install(EXPORT device_operationsTargets + FILE composable_kerneldevice_operationsTargets.cmake + NAMESPACE composable_kernel:: + DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/composable_kernel +) diff --git a/library/src/tensor_operation_instance/gpu/batched_gemm/CMakeLists.txt b/library/src/tensor_operation_instance/gpu/batched_gemm/CMakeLists.txt index 35e24462b5..016c85f673 100644 --- a/library/src/tensor_operation_instance/gpu/batched_gemm/CMakeLists.txt +++ b/library/src/tensor_operation_instance/gpu/batched_gemm/CMakeLists.txt @@ -18,9 +18,9 @@ set(DEVICE_BATCHED_GEMM_INSTANCE_SOURCE device_batched_gemm_xdl_int8_int8_int8_gkm_gnk_gmn_instance.cpp; ) -add_library(device_batched_gemm_instance SHARED ${DEVICE_BATCHED_GEMM_INSTANCE_SOURCE}) -target_compile_features(device_batched_gemm_instance PUBLIC) +add_library(device_batched_gemm_instance OBJECT ${DEVICE_BATCHED_GEMM_INSTANCE_SOURCE}) +# target_compile_features(device_batched_gemm_instance PUBLIC) set_target_properties(device_batched_gemm_instance PROPERTIES POSITION_INDEPENDENT_CODE ON) -install(TARGETS device_batched_gemm_instance LIBRARY DESTINATION lib) +# install(TARGETS device_batched_gemm_instance LIBRARY DESTINATION lib) clang_tidy_check(device_batched_gemm_instance) diff --git a/library/src/tensor_operation_instance/gpu/batched_gemm_reduce/CMakeLists.txt b/library/src/tensor_operation_instance/gpu/batched_gemm_reduce/CMakeLists.txt index 59eb6cb1cc..67a3c15d00 100644 --- a/library/src/tensor_operation_instance/gpu/batched_gemm_reduce/CMakeLists.txt +++ b/library/src/tensor_operation_instance/gpu/batched_gemm_reduce/CMakeLists.txt @@ -5,7 +5,8 @@ set(DEVICE_BATCHED_GEMM_REDUCE_INSTANCE_SOURCE device_batched_gemm_reduce_xdl_cshuffle_f16_f16_f16_f32_f32_gkm_gnk_gmn_instance.cpp ) -add_instance_library(device_batched_gemm_reduce_instance ${DEVICE_BATCHED_GEMM_REDUCE_INSTANCE_SOURCE}) -install(TARGETS device_batched_gemm_reduce_instance LIBRARY DESTINATION lib) +add_instance_library(device_batched_gemm_reduce_instance OBJECT ${DEVICE_BATCHED_GEMM_REDUCE_INSTANCE_SOURCE}) +target_compile_features(device_batched_gemm_reduce_instance PUBLIC) +set_target_properties(device_batched_gemm_reduce_instance PROPERTIES POSITION_INDEPENDENT_CODE ON) clang_tidy_check(device_batched_gemm_reduce_instance) diff --git a/library/src/tensor_operation_instance/gpu/conv1d_fwd/CMakeLists.txt b/library/src/tensor_operation_instance/gpu/conv1d_fwd/CMakeLists.txt index 6c7c3e4f78..77aa6198f5 100644 --- a/library/src/tensor_operation_instance/gpu/conv1d_fwd/CMakeLists.txt +++ b/library/src/tensor_operation_instance/gpu/conv1d_fwd/CMakeLists.txt @@ -6,9 +6,9 @@ set(DEVICE_CONV1D_FWD_INSTANCE_SOURCE device_conv1d_fwd_xdl_nwc_kxc_nwk_int8_instance.cpp; ) -add_library(device_conv1d_fwd_instance SHARED ${DEVICE_CONV1D_FWD_INSTANCE_SOURCE}) -target_compile_features(device_conv1d_fwd_instance PUBLIC) +add_library(device_conv1d_fwd_instance OBJECT ${DEVICE_CONV1D_FWD_INSTANCE_SOURCE}) +# target_compile_features(device_conv1d_fwd_instance PUBLIC) set_target_properties(device_conv1d_fwd_instance PROPERTIES POSITION_INDEPENDENT_CODE ON) -install(TARGETS device_conv1d_fwd_instance LIBRARY DESTINATION lib) +# install(TARGETS device_conv1d_fwd_instance LIBRARY DESTINATION lib) clang_tidy_check(device_conv1d_fwd_instance) diff --git a/library/src/tensor_operation_instance/gpu/conv2d_bwd_data/CMakeLists.txt b/library/src/tensor_operation_instance/gpu/conv2d_bwd_data/CMakeLists.txt index d619ef4bf1..d7882a7d8b 100644 --- a/library/src/tensor_operation_instance/gpu/conv2d_bwd_data/CMakeLists.txt +++ b/library/src/tensor_operation_instance/gpu/conv2d_bwd_data/CMakeLists.txt @@ -6,9 +6,7 @@ set(DEVICE_CONV2D_BWD_DATA_INSTANCE_SOURCE device_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk_int8_instance.cpp; ) -add_library(device_conv2d_bwd_data_instance SHARED ${DEVICE_CONV2D_BWD_DATA_INSTANCE_SOURCE}) -target_compile_features(device_conv2d_bwd_data_instance PUBLIC) +add_library(device_conv2d_bwd_data_instance OBJECT ${DEVICE_CONV2D_BWD_DATA_INSTANCE_SOURCE}) set_target_properties(device_conv2d_bwd_data_instance PROPERTIES POSITION_INDEPENDENT_CODE ON) -install(TARGETS device_conv2d_bwd_data_instance LIBRARY DESTINATION lib) clang_tidy_check(device_conv2d_bwd_data_instance) diff --git a/library/src/tensor_operation_instance/gpu/conv2d_bwd_weight/CMakeLists.txt b/library/src/tensor_operation_instance/gpu/conv2d_bwd_weight/CMakeLists.txt index 6183e70b9b..7c384a882b 100644 --- a/library/src/tensor_operation_instance/gpu/conv2d_bwd_weight/CMakeLists.txt +++ b/library/src/tensor_operation_instance/gpu/conv2d_bwd_weight/CMakeLists.txt @@ -3,7 +3,7 @@ set(DEVICE_CONV2D_BWD_WEIGHT_INSTANCE_SOURCE device_conv2d_bwd_weight_xdl_nhwc_kyxc_nhwk_f16_instance.cpp; device_conv2d_bwd_weight_xdl_nhwc_kyxc_nhwk_f32_instance.cpp; ) -add_library(device_conv2d_bwd_weight_instance SHARED ${DEVICE_CONV2D_BWD_WEIGHT_INSTANCE_SOURCE}) +add_library(device_conv2d_bwd_weight_instance OBJECT ${DEVICE_CONV2D_BWD_WEIGHT_INSTANCE_SOURCE}) target_compile_features(device_conv2d_bwd_weight_instance PUBLIC) set_target_properties(device_conv2d_bwd_weight_instance PROPERTIES POSITION_INDEPENDENT_CODE ON) install(TARGETS device_conv2d_bwd_weight_instance LIBRARY DESTINATION lib) diff --git a/library/src/tensor_operation_instance/gpu/conv2d_fwd/CMakeLists.txt b/library/src/tensor_operation_instance/gpu/conv2d_fwd/CMakeLists.txt index 7483861524..857e36d6f5 100644 --- a/library/src/tensor_operation_instance/gpu/conv2d_fwd/CMakeLists.txt +++ b/library/src/tensor_operation_instance/gpu/conv2d_fwd/CMakeLists.txt @@ -6,9 +6,7 @@ set(DEVICE_CONV2D_FWD_INSTANCE_SOURCE device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_int8_instance.cpp; device_conv2d_fwd_xdl_c_shuffle_nhwc_kyxc_nhwk_f16_instance.cpp; ) -add_library(device_conv2d_fwd_instance SHARED ${DEVICE_CONV2D_FWD_INSTANCE_SOURCE}) -target_compile_features(device_conv2d_fwd_instance PUBLIC) +add_library(device_conv2d_fwd_instance OBJECT ${DEVICE_CONV2D_FWD_INSTANCE_SOURCE}) set_target_properties(device_conv2d_fwd_instance PROPERTIES POSITION_INDEPENDENT_CODE ON) -install(TARGETS device_conv2d_fwd_instance LIBRARY DESTINATION lib) clang_tidy_check(device_conv2d_fwd_instance) diff --git a/library/src/tensor_operation_instance/gpu/conv2d_fwd_bias_relu/CMakeLists.txt b/library/src/tensor_operation_instance/gpu/conv2d_fwd_bias_relu/CMakeLists.txt index 27a9736a3f..ad66c73bf8 100644 --- a/library/src/tensor_operation_instance/gpu/conv2d_fwd_bias_relu/CMakeLists.txt +++ b/library/src/tensor_operation_instance/gpu/conv2d_fwd_bias_relu/CMakeLists.txt @@ -2,9 +2,7 @@ set(DEVICE_CONV2D_FWD_BIAS_RELU_INSTANCE_SOURCE device_conv2d_fwd_xdl_c_shuffle_bias_relu_nhwc_kyxc_nhwk_f16_instance.cpp; ) -add_library(device_conv2d_fwd_bias_relu_instance SHARED ${DEVICE_CONV2D_FWD_BIAS_RELU_INSTANCE_SOURCE}) -target_compile_features(device_conv2d_fwd_bias_relu_instance PUBLIC) +add_library(device_conv2d_fwd_bias_relu_instance OBJECT ${DEVICE_CONV2D_FWD_BIAS_RELU_INSTANCE_SOURCE}) set_target_properties(device_conv2d_fwd_bias_relu_instance PROPERTIES POSITION_INDEPENDENT_CODE ON) -install(TARGETS device_conv2d_fwd_bias_relu_instance LIBRARY DESTINATION lib) clang_tidy_check(device_conv2d_fwd_bias_relu_instance) diff --git a/library/src/tensor_operation_instance/gpu/conv2d_fwd_bias_relu_add/CMakeLists.txt b/library/src/tensor_operation_instance/gpu/conv2d_fwd_bias_relu_add/CMakeLists.txt index d7bec82174..36b1f6c153 100644 --- a/library/src/tensor_operation_instance/gpu/conv2d_fwd_bias_relu_add/CMakeLists.txt +++ b/library/src/tensor_operation_instance/gpu/conv2d_fwd_bias_relu_add/CMakeLists.txt @@ -2,9 +2,7 @@ set(DEVICE_CONV2D_FWD_BIAS_RELU_ADD_INSTANCE_SOURCE device_conv2d_fwd_xdl_c_shuffle_bias_relu_add_nhwc_kyxc_nhwk_f16_instance.cpp; ) -add_library(device_conv2d_fwd_bias_relu_add_instance SHARED ${DEVICE_CONV2D_FWD_BIAS_RELU_ADD_INSTANCE_SOURCE}) -target_compile_features(device_conv2d_fwd_bias_relu_add_instance PUBLIC) +add_library(device_conv2d_fwd_bias_relu_add_instance OBJECT ${DEVICE_CONV2D_FWD_BIAS_RELU_ADD_INSTANCE_SOURCE}) set_target_properties(device_conv2d_fwd_bias_relu_add_instance PROPERTIES POSITION_INDEPENDENT_CODE ON) -install(TARGETS device_conv2d_fwd_bias_relu_add_instance LIBRARY DESTINATION lib) clang_tidy_check(device_conv2d_fwd_bias_relu_add_instance) diff --git a/library/src/tensor_operation_instance/gpu/conv2d_fwd_bias_relu_atomic_add/CMakeLists.txt b/library/src/tensor_operation_instance/gpu/conv2d_fwd_bias_relu_atomic_add/CMakeLists.txt index c0942d5485..5906c7c5ac 100644 --- a/library/src/tensor_operation_instance/gpu/conv2d_fwd_bias_relu_atomic_add/CMakeLists.txt +++ b/library/src/tensor_operation_instance/gpu/conv2d_fwd_bias_relu_atomic_add/CMakeLists.txt @@ -3,9 +3,7 @@ set(DEVICE_CONV2D_FWD_BIAS_RELU_ATOMIC_ADD_INSTANCE_SOURCE device_conv2d_fwd_xdl_c_shuffle_bias_relu_atomic_add_nhwc_kyxc_nhwk_f16_instance.cpp; ) -add_library(device_conv2d_fwd_bias_relu_atomic_add_instance SHARED ${DEVICE_CONV2D_FWD_BIAS_RELU_ATOMIC_ADD_INSTANCE_SOURCE}) -target_compile_features(device_conv2d_fwd_bias_relu_atomic_add_instance PUBLIC) +add_library(device_conv2d_fwd_bias_relu_atomic_add_instance OBJECT ${DEVICE_CONV2D_FWD_BIAS_RELU_ATOMIC_ADD_INSTANCE_SOURCE}) set_target_properties(device_conv2d_fwd_bias_relu_atomic_add_instance PROPERTIES POSITION_INDEPENDENT_CODE ON) -install(TARGETS device_conv2d_fwd_bias_relu_atomic_add_instance LIBRARY DESTINATION lib) clang_tidy_check(device_conv2d_fwd_bias_relu_atomic_add_instance) diff --git a/library/src/tensor_operation_instance/gpu/conv3d_fwd/CMakeLists.txt b/library/src/tensor_operation_instance/gpu/conv3d_fwd/CMakeLists.txt index f6849a7bb2..91a299c742 100644 --- a/library/src/tensor_operation_instance/gpu/conv3d_fwd/CMakeLists.txt +++ b/library/src/tensor_operation_instance/gpu/conv3d_fwd/CMakeLists.txt @@ -5,9 +5,8 @@ set(DEVICE_CONV3D_FWD_INSTANCE_SOURCE device_conv3d_fwd_xdl_ndhwc_kzyxc_ndhwk_bf16_instance.cpp; device_conv3d_fwd_xdl_ndhwc_kzyxc_ndhwk_int8_instance.cpp; ) -add_library(device_conv3d_fwd_instance SHARED ${DEVICE_CONV3D_FWD_INSTANCE_SOURCE}) +add_library(device_conv3d_fwd_instance OBJECT ${DEVICE_CONV3D_FWD_INSTANCE_SOURCE}) target_compile_features(device_conv3d_fwd_instance PUBLIC) set_target_properties(device_conv3d_fwd_instance PROPERTIES POSITION_INDEPENDENT_CODE ON) -install(TARGETS device_conv3d_fwd_instance LIBRARY DESTINATION lib) clang_tidy_check(device_conv3d_fwd_instance) diff --git a/library/src/tensor_operation_instance/gpu/convnd_bwd_data/CMakeLists.txt b/library/src/tensor_operation_instance/gpu/convnd_bwd_data/CMakeLists.txt index 9ee961ad74..037f860808 100644 --- a/library/src/tensor_operation_instance/gpu/convnd_bwd_data/CMakeLists.txt +++ b/library/src/tensor_operation_instance/gpu/convnd_bwd_data/CMakeLists.txt @@ -14,7 +14,7 @@ set(DEVICE_CONVND_BWD_DATA_INSTANCE_SOURCE device_conv3d_bwd_data_xdl_ndhwc_kzyxc_ndhwk_int8_instance.cpp; ) -add_library(device_convnd_bwd_data_instance SHARED ${DEVICE_CONVND_BWD_DATA_INSTANCE_SOURCE}) +add_library(device_convnd_bwd_data_instance OBJECT ${DEVICE_CONVND_BWD_DATA_INSTANCE_SOURCE}) target_compile_features(device_convnd_bwd_data_instance PUBLIC) set_target_properties(device_convnd_bwd_data_instance PROPERTIES POSITION_INDEPENDENT_CODE ON) install(TARGETS device_convnd_bwd_data_instance LIBRARY DESTINATION lib) diff --git a/library/src/tensor_operation_instance/gpu/device_conv2d.cpp b/library/src/tensor_operation_instance/gpu/device_conv2d.cpp new file mode 100644 index 0000000000..6b99433ffa --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/device_conv2d.cpp @@ -0,0 +1,201 @@ +#include +#include "config.hpp" +#include "device_conv2d_fwd_xdl_c_shuffle_nhwc_kyxc_nhwk.hpp" +#include "element_wise_operation.hpp" +#include "device_operation_instance.hpp" +#include "host_interface.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace device_conv2d_fwd_instance { +using PassThrough = ck::tensor_operation::element_wise::PassThrough; +void add_device_conv2d_fwd_xdl_c_shuffle_nhwc_kyxc_nhwk_f16_instances( + std::vector>& instances); +void add_device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_f32_instances( + std::vector>& instances); +void add_device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_bf16_instances( + std::vector>& instances); +void add_device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_f16_instances( + std::vector>& instances); +void add_device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_int8_instances( + std::vector>& instances); + +} // namespace device_conv2d_fwd_instance +} // namespace device +} // namespace tensor_operation +} // namespace ck + +using PassThrough = ck::tensor_operation::element_wise::PassThrough; +struct DeviceConvFwdPtr_t::DeviceConvFwdPtrImpl +{ + std::unique_ptr + MakeArgumentPointer(void* in_ptr, + void* wei_ptr, + void* out_ptr, + size_t N, + size_t K, + size_t C, + std::vector input_spatial_lengths, + std::vector filter_spatial_lengths, + std::vector output_spatial_lengths, + std::vector conv_filter_strides, + std::vector conv_filter_dilations, + std::vector input_left_pads, + std::vector input_right_pads) const + { + return el->MakeArgumentPointer(in_ptr, + wei_ptr, + out_ptr, + N, + K, + C, + input_spatial_lengths, + filter_spatial_lengths, + output_spatial_lengths, + conv_filter_strides, + conv_filter_dilations, + input_left_pads, + input_right_pads, + PassThrough{}, + PassThrough{}, + PassThrough{}); + } + std::unique_ptr MakeInvokerPointer() const + { + return el->MakeInvokerPointer(); + } + + std::string GetTypeString() { return el->GetTypeString(); } + bool IsSupportedArgument(const DeviceConvFwdPtr_t::BaseArgument* arg) + { + return el->IsSupportedArgument(arg); + } + + ck::tensor_operation::device::DeviceConvFwdPtr el; +}; + +DeviceConvFwdPtr_t::DeviceConvFwdPtr_t() : pImpl(nullptr) {} +DeviceConvFwdPtr_t::~DeviceConvFwdPtr_t() = default; +DeviceConvFwdPtr_t::DeviceConvFwdPtr_t(DeviceConvFwdPtr_t&&) = default; +DeviceConvFwdPtr_t::DeviceConvFwdPtr_t(DeviceConvFwdPtr_t::DeviceConvFwdPtrImpl& other) + : pImpl(std::make_unique(std::move(other))) +{ +} + +std::unique_ptr +DeviceConvFwdPtr_t::MakeArgumentPointer(void* in_ptr, + void* wei_ptr, + void* out_ptr, + size_t N, + size_t K, + size_t C, + std::vector input_spatial_lengths, + std::vector filter_spatial_lengths, + std::vector output_spatial_lengths, + std::vector conv_filter_strides, + std::vector conv_filter_dilations, + std::vector input_left_pads, + std::vector input_right_pads) const +{ + return pImpl->MakeArgumentPointer(in_ptr, + wei_ptr, + out_ptr, + N, + K, + C, + input_spatial_lengths, + filter_spatial_lengths, + output_spatial_lengths, + conv_filter_strides, + conv_filter_dilations, + input_left_pads, + input_right_pads); +} + +std::unique_ptr DeviceConvFwdPtr_t::MakeInvokerPointer() const +{ + return pImpl->MakeInvokerPointer(); +} + +std::string DeviceConvFwdPtr_t::GetTypeString() { return pImpl->GetTypeString(); } +bool DeviceConvFwdPtr_t::IsSupportedArgument(const DeviceConvFwdPtr_t::BaseArgument* arg_ptr) +{ + return pImpl->IsSupportedArgument(arg_ptr); +} + +using namespace ck::tensor_operation::device::device_conv2d_fwd_instance; +void add_device_conv2d_fwd_xdl_c_shuffle_nhwc_kyxc_nhwk_f16_instances_t( + std::vector& instances) +{ + std::vector< + ck::tensor_operation::device::DeviceConvFwdPtr> + local_instances; + add_device_conv2d_fwd_xdl_c_shuffle_nhwc_kyxc_nhwk_f16_instances(local_instances); + for(auto& kinder : local_instances) + { + DeviceConvFwdPtr_t::DeviceConvFwdPtrImpl tmp{std::move(kinder)}; + instances.emplace_back(tmp); + } + return; +} + +void add_device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_f32_instances_t( + std::vector& instances) +{ + std::vector< + ck::tensor_operation::device::DeviceConvFwdPtr> + local_instances; + add_device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_f32_instances(local_instances); + for(auto& kinder : local_instances) + { + DeviceConvFwdPtr_t::DeviceConvFwdPtrImpl tmp{std::move(kinder)}; + instances.emplace_back(tmp); // Perhaps we can do better + } + return; +} + +void add_device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_bf16_instances_t( + std::vector& instances) +{ + std::vector< + ck::tensor_operation::device::DeviceConvFwdPtr> + local_instances; + add_device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_bf16_instances(local_instances); + for(auto& kinder : local_instances) + { + DeviceConvFwdPtr_t::DeviceConvFwdPtrImpl tmp{std::move(kinder)}; + instances.emplace_back(tmp); // Perhaps we can do better + } + return; +} + +void add_device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_f16_instances_t( + std::vector& instances) +{ + std::vector< + ck::tensor_operation::device::DeviceConvFwdPtr> + local_instances; + add_device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_f16_instances(local_instances); + for(auto& kinder : local_instances) + { + DeviceConvFwdPtr_t::DeviceConvFwdPtrImpl tmp{std::move(kinder)}; + instances.emplace_back(tmp); // Perhaps we can do better + } + return; +} + +void add_device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_int8_instances_t( + std::vector& instances) +{ + std::vector< + ck::tensor_operation::device::DeviceConvFwdPtr> + local_instances; + add_device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_int8_instances(local_instances); + for(auto& kinder : local_instances) + { + DeviceConvFwdPtr_t::DeviceConvFwdPtrImpl tmp{std::move(kinder)}; + instances.emplace_back(tmp); + } + return; +} diff --git a/library/src/tensor_operation_instance/gpu/gemm/CMakeLists.txt b/library/src/tensor_operation_instance/gpu/gemm/CMakeLists.txt index 5f057adcc5..556b06d7e1 100644 --- a/library/src/tensor_operation_instance/gpu/gemm/CMakeLists.txt +++ b/library/src/tensor_operation_instance/gpu/gemm/CMakeLists.txt @@ -35,10 +35,9 @@ set(DEVICE_GEMM_INSTANCE_SOURCE device_gemm_xdl_splitk_f16_f16_f16_km_nk_mn_instance.cpp; ) -add_library(device_gemm_instance SHARED ${DEVICE_GEMM_INSTANCE_SOURCE}) +add_library(device_gemm_instance OBJECT ${DEVICE_GEMM_INSTANCE_SOURCE}) target_compile_features(device_gemm_instance PUBLIC) set_target_properties(device_gemm_instance PROPERTIES POSITION_INDEPENDENT_CODE ON) -install(TARGETS device_gemm_instance LIBRARY DESTINATION lib) clang_tidy_check(device_gemm_instance) diff --git a/library/src/tensor_operation_instance/gpu/gemm_bias2d/CMakeLists.txt b/library/src/tensor_operation_instance/gpu/gemm_bias2d/CMakeLists.txt index a0e5ba61a1..e2b0abb1d1 100644 --- a/library/src/tensor_operation_instance/gpu/gemm_bias2d/CMakeLists.txt +++ b/library/src/tensor_operation_instance/gpu/gemm_bias2d/CMakeLists.txt @@ -10,9 +10,7 @@ set(DEVICE_GEMM_BIAS2D_INSTANCE_SOURCE device_gemm_xdl_c_shuffle_bias_2d_f16_f16_f16_mk_nk_mn_instance.cpp; ) -add_library(device_gemm_bias2d_instance SHARED ${DEVICE_GEMM_BIAS2D_INSTANCE_SOURCE}) -target_compile_features(device_gemm_bias2d_instance PUBLIC) +add_library(device_gemm_bias2d_instance OBJECT ${DEVICE_GEMM_BIAS2D_INSTANCE_SOURCE}) set_target_properties(device_gemm_bias2d_instance PROPERTIES POSITION_INDEPENDENT_CODE ON) -install(TARGETS device_gemm_bias2d_instance LIBRARY DESTINATION lib) clang_tidy_check(device_gemm_bias2d_instance) diff --git a/library/src/tensor_operation_instance/gpu/gemm_bias_relu/CMakeLists.txt b/library/src/tensor_operation_instance/gpu/gemm_bias_relu/CMakeLists.txt index 69e05673d6..e2e7d4badd 100644 --- a/library/src/tensor_operation_instance/gpu/gemm_bias_relu/CMakeLists.txt +++ b/library/src/tensor_operation_instance/gpu/gemm_bias_relu/CMakeLists.txt @@ -6,9 +6,7 @@ set(DEVICE_GEMM_BIAS_RELU_INSTANCE_SOURCE device_gemm_xdl_c_shuffle_bias_relu_f16_f16_f16_km_nk_mn_instance.cpp; ) -add_library(device_gemm_bias_relu_instance SHARED ${DEVICE_GEMM_BIAS_RELU_INSTANCE_SOURCE}) -target_compile_features(device_gemm_bias_relu_instance PUBLIC) +add_library(device_gemm_bias_relu_instance OBJECT ${DEVICE_GEMM_BIAS_RELU_INSTANCE_SOURCE}) set_target_properties(device_gemm_bias_relu_instance PROPERTIES POSITION_INDEPENDENT_CODE ON) -install(TARGETS device_gemm_bias_relu_instance LIBRARY DESTINATION lib) clang_tidy_check(device_gemm_bias_relu_instance) diff --git a/library/src/tensor_operation_instance/gpu/gemm_bias_relu_add/CMakeLists.txt b/library/src/tensor_operation_instance/gpu/gemm_bias_relu_add/CMakeLists.txt index 016bc4be2d..a10dbb555d 100644 --- a/library/src/tensor_operation_instance/gpu/gemm_bias_relu_add/CMakeLists.txt +++ b/library/src/tensor_operation_instance/gpu/gemm_bias_relu_add/CMakeLists.txt @@ -6,9 +6,7 @@ set(DEVICE_GEMM_BIAS_RELU_ADD_INSTANCE_SOURCE device_gemm_xdl_c_shuffle_bias_relu_add_f16_f16_f16_km_nk_mn_instance.cpp; ) -add_library(device_gemm_bias_relu_add_instance SHARED ${DEVICE_GEMM_BIAS_RELU_ADD_INSTANCE_SOURCE}) -target_compile_features(device_gemm_bias_relu_add_instance PUBLIC) +add_library(device_gemm_bias_relu_add_instance OBJECT ${DEVICE_GEMM_BIAS_RELU_ADD_INSTANCE_SOURCE}) set_target_properties(device_gemm_bias_relu_add_instance PROPERTIES POSITION_INDEPENDENT_CODE ON) -install(TARGETS device_gemm_bias_relu_add_instance LIBRARY DESTINATION lib) clang_tidy_check(device_gemm_bias_relu_add_instance) diff --git a/library/src/tensor_operation_instance/gpu/grouped_gemm/CMakeLists.txt b/library/src/tensor_operation_instance/gpu/grouped_gemm/CMakeLists.txt index 8f591d8c49..6c5e31fddd 100644 --- a/library/src/tensor_operation_instance/gpu/grouped_gemm/CMakeLists.txt +++ b/library/src/tensor_operation_instance/gpu/grouped_gemm/CMakeLists.txt @@ -6,7 +6,7 @@ set(DEVICE_GROUPED_GEMM_INSTANCE_SOURCE device_grouped_gemm_xdl_f16_f16_f16_km_nk_mn_instance.cpp; ) -add_library(device_grouped_gemm_instance SHARED ${DEVICE_GROUPED_GEMM_INSTANCE_SOURCE}) +add_library(device_grouped_gemm_instance OBJECT ${DEVICE_GROUPED_GEMM_INSTANCE_SOURCE}) target_compile_features(device_grouped_gemm_instance PUBLIC) set_target_properties(device_grouped_gemm_instance PROPERTIES POSITION_INDEPENDENT_CODE ON) diff --git a/library/src/tensor_operation_instance/gpu/reduce/CMakeLists.txt b/library/src/tensor_operation_instance/gpu/reduce/CMakeLists.txt index cced3a4b76..81987ac0d4 100644 --- a/library/src/tensor_operation_instance/gpu/reduce/CMakeLists.txt +++ b/library/src/tensor_operation_instance/gpu/reduce/CMakeLists.txt @@ -38,9 +38,7 @@ set(DEVICE_REDUCE_INSTANCE_SOURCE device_reduce_instance_multiblock_partial_reduce_b16_f32_b16.cpp; ) -add_library(device_reduce_instance SHARED ${DEVICE_REDUCE_INSTANCE_SOURCE}) -target_compile_features(device_reduce_instance PUBLIC) +add_library(device_reduce_instance OBJECT ${DEVICE_REDUCE_INSTANCE_SOURCE}) set_target_properties(device_reduce_instance PROPERTIES POSITION_INDEPENDENT_CODE ON) -install(TARGETS device_reduce_instance LIBRARY DESTINATION lib) clang_tidy_check(device_reduce_instance) diff --git a/profiler/include/profile_batched_gemm_impl.hpp b/profiler/include/profile_batched_gemm_impl.hpp index 7abbf7a042..3393110c33 100644 --- a/profiler/include/profile_batched_gemm_impl.hpp +++ b/profiler/include/profile_batched_gemm_impl.hpp @@ -63,7 +63,7 @@ template GetTypeString(); - float ave_time = invoker_ptr->Run(argument_ptr.get(), nrepeat); + float ave_time = + invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, time_kernel}); std::size_t flop = std::size_t(2) * BatchCount * M * N * K; - std::size_t num_btype = (sizeof(ADataType) * M * K + sizeof(BDataType) * K * M + + std::size_t num_btype = (sizeof(ADataType) * M * K + sizeof(BDataType) * K * N + sizeof(CDataType) * M * N) * BatchCount; diff --git a/profiler/include/profile_batched_gemm_reduce_impl.hpp b/profiler/include/profile_batched_gemm_reduce_impl.hpp index a6399c20d8..bd74dbf459 100644 --- a/profiler/include/profile_batched_gemm_reduce_impl.hpp +++ b/profiler/include/profile_batched_gemm_reduce_impl.hpp @@ -53,7 +53,7 @@ template IsSupportedArgument(argument_ptr.get())) { - // warm up - invoker_ptr->Run(argument_ptr.get()); + // init DO, D1 to 0 + d0_device_buf.SetZero(); + d1_device_buf.SetZero(); - // timing - float total_time = 0; - - for(int i = 0; i < nrepeat; ++i) - { - // init DO, D1 to 0 - d0_device_buf.SetZero(); - d1_device_buf.SetZero(); - - KernelTimer timer; - - timer.Start(); - - invoker_ptr->Run(argument_ptr.get()); - - timer.End(); - - total_time += timer.GetElapsedTime(); - } - - float ave_time = total_time / nrepeat; + float ave_time = + invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, time_kernel}); std::string gemm_name = gemm_ptr->GetTypeString(); diff --git a/profiler/include/profile_conv_bwd_data_impl.hpp b/profiler/include/profile_conv_bwd_data_impl.hpp index bec97e40f5..dfec033737 100644 --- a/profiler/include/profile_conv_bwd_data_impl.hpp +++ b/profiler/include/profile_conv_bwd_data_impl.hpp @@ -51,7 +51,7 @@ template GetTypeString(); - float ave_time = invoker_ptr->Run(argument_ptr.get(), nrepeat); + float ave_time = + invoker_ptr->Run(argument_ptr.get(), StreamControl{nullptr, time_kernel}); std::size_t flop = std::size_t(2) * N * K * Ho * Wo * C * Y * X; diff --git a/profiler/include/profile_conv_bwd_weight_impl.hpp b/profiler/include/profile_conv_bwd_weight_impl.hpp index 20fe0ef549..8e3a4074b0 100644 --- a/profiler/include/profile_conv_bwd_weight_impl.hpp +++ b/profiler/include/profile_conv_bwd_weight_impl.hpp @@ -1,4 +1,6 @@ #pragma once + +#include "stream_config.hpp" #include "config.hpp" #include "device.hpp" #include "host_tensor.hpp" @@ -43,7 +45,7 @@ template MakeArgumentPointer( static_cast(in_device_buf.GetDeviceBuffer()), static_cast(wei_device_buf.GetDeviceBuffer()), @@ -214,7 +218,8 @@ bool profile_conv_bwd_weight_impl(int do_verification, { std::string conv_name = conv_ptr->GetTypeString(); - float ave_time = invoker_ptr->Run(argument_ptr.get(), nrepeat); + float ave_time = + invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, time_kernel}); std::size_t flop = std::size_t(2) * N * K * Ho * Wo * C * Y * X; @@ -242,6 +247,7 @@ bool profile_conv_bwd_weight_impl(int do_verification, wei_device_buf.FromDevice(wei_k_c_y_x_device_result.mData.data()); float max_error = check_error(wei_k_c_y_x_host_result, wei_k_c_y_x_device_result); + if(max_error > 8) { pass = false; diff --git a/profiler/include/profile_conv_fwd_bias_relu_add_impl.hpp b/profiler/include/profile_conv_fwd_bias_relu_add_impl.hpp index d0de7307d2..5ea35cd72f 100644 --- a/profiler/include/profile_conv_fwd_bias_relu_add_impl.hpp +++ b/profiler/include/profile_conv_fwd_bias_relu_add_impl.hpp @@ -42,7 +42,7 @@ template GetTypeString(); - float ave_time = invoker_ptr->Run(argument_ptr.get(), nrepeat); + float ave_time = + invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, time_kernel}); std::size_t flop = std::size_t(2) * N * K * Ho * Wo * C * Y * X; diff --git a/profiler/include/profile_conv_fwd_bias_relu_atomic_add_impl.hpp b/profiler/include/profile_conv_fwd_bias_relu_atomic_add_impl.hpp index 9bdfa61283..f1c2fd300a 100644 --- a/profiler/include/profile_conv_fwd_bias_relu_atomic_add_impl.hpp +++ b/profiler/include/profile_conv_fwd_bias_relu_atomic_add_impl.hpp @@ -119,7 +119,7 @@ template GetTypeString(); - float ave_time = invoker_ptr->Run(argument_ptr.get(), nrepeat); + float ave_time = + invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, time_kernel}); std::size_t flop = std::size_t(2) * N * K * Ho * Wo * C * Y * X; diff --git a/profiler/include/profile_conv_fwd_bias_relu_impl.hpp b/profiler/include/profile_conv_fwd_bias_relu_impl.hpp index f34e52048e..eeb2b93e4e 100644 --- a/profiler/include/profile_conv_fwd_bias_relu_impl.hpp +++ b/profiler/include/profile_conv_fwd_bias_relu_impl.hpp @@ -41,7 +41,7 @@ template GetTypeString(); - float ave_time = invoker_ptr->Run(argument_ptr.get(), nrepeat); + float ave_time = + invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, time_kernel}); std::size_t flop = std::size_t(2) * N * K * Ho * Wo * C * Y * X; diff --git a/profiler/include/profile_convnd_bwd_data_impl.hpp b/profiler/include/profile_convnd_bwd_data_impl.hpp index 5b1ba71163..291bf2abc0 100644 --- a/profiler/include/profile_convnd_bwd_data_impl.hpp +++ b/profiler/include/profile_convnd_bwd_data_impl.hpp @@ -269,7 +269,7 @@ template GetTypeString(); - float ave_time = invoker_ptr->Run(argument_ptr.get(), nrepeat); + float ave_time = + invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, time_kernel}); std::size_t flop = ck::utils::conv::get_flops(N, C, K, filter_spatial_lengths, output_spatial_lengths); diff --git a/profiler/include/profile_gemm_bias_2d_impl.hpp b/profiler/include/profile_gemm_bias_2d_impl.hpp index 98e4ad76c9..8565f9637c 100644 --- a/profiler/include/profile_gemm_bias_2d_impl.hpp +++ b/profiler/include/profile_gemm_bias_2d_impl.hpp @@ -65,7 +65,7 @@ template GetTypeString(); - float ave_time = invoker_ptr->Run(argument_ptr.get(), nrepeat); + float ave_time = + invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, time_kernel}); std::size_t flop = std::size_t(2) * M * N * K; diff --git a/profiler/include/profile_gemm_bias_relu_add_impl.hpp b/profiler/include/profile_gemm_bias_relu_add_impl.hpp index 75ed78075b..6fec17c199 100644 --- a/profiler/include/profile_gemm_bias_relu_add_impl.hpp +++ b/profiler/include/profile_gemm_bias_relu_add_impl.hpp @@ -48,7 +48,7 @@ template GetTypeString(); - float ave_time = invoker_ptr->Run(argument_ptr.get(), nrepeat); + float ave_time = + invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, time_kernel}); std::size_t flop = std::size_t(2) * M * N * K; diff --git a/profiler/include/profile_gemm_bias_relu_impl.hpp b/profiler/include/profile_gemm_bias_relu_impl.hpp index 0735f3c31b..69010becc5 100644 --- a/profiler/include/profile_gemm_bias_relu_impl.hpp +++ b/profiler/include/profile_gemm_bias_relu_impl.hpp @@ -48,7 +48,7 @@ template GetTypeString(); - float ave_time = invoker_ptr->Run(argument_ptr.get(), nrepeat); + float ave_time = + invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, time_kernel}); std::size_t flop = std::size_t(2) * M * N * K; diff --git a/profiler/include/profile_gemm_impl.hpp b/profiler/include/profile_gemm_impl.hpp index 93262fe802..45e6174260 100644 --- a/profiler/include/profile_gemm_impl.hpp +++ b/profiler/include/profile_gemm_impl.hpp @@ -91,7 +91,7 @@ template GetTypeString(); - float ave_time = invoker_ptr->Run(argument_ptr.get(), nrepeat); + float ave_time = + invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, time_kernel}); std::size_t flop = std::size_t(2) * M * N * K; diff --git a/profiler/include/profile_gemm_reduce_impl.hpp b/profiler/include/profile_gemm_reduce_impl.hpp index 6ef3e010b1..d034c9f750 100644 --- a/profiler/include/profile_gemm_reduce_impl.hpp +++ b/profiler/include/profile_gemm_reduce_impl.hpp @@ -52,7 +52,7 @@ template IsSupportedArgument(argument_ptr.get())) { - // warm up - invoker_ptr->Run(argument_ptr.get()); + // init DO, D1 to 0 + d0_device_buf.SetZero(); + d1_device_buf.SetZero(); - // timing - float total_time = 0; - - for(int i = 0; i < nrepeat; ++i) - { - // init DO, D1 to 0 - d0_device_buf.SetZero(); - d1_device_buf.SetZero(); - - KernelTimer timer; - - timer.Start(); - - invoker_ptr->Run(argument_ptr.get()); - - timer.End(); - - total_time += timer.GetElapsedTime(); - } - - float ave_time = total_time / nrepeat; + float ave_time = + invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, time_kernel}); std::string gemm_name = gemm_ptr->GetTypeString(); std::size_t flop = std::size_t(2) * M * N * K; - std::size_t num_btype = sizeof(ADataType) * M * K + sizeof(BDataType) * K * M + + std::size_t num_btype = sizeof(ADataType) * M * K + sizeof(BDataType) * K * N + sizeof(CDataType) * M * N + sizeof(CDataType) * N; float tflops = static_cast(flop) / 1.E9 / ave_time; diff --git a/profiler/include/profile_grouped_gemm_impl.hpp b/profiler/include/profile_grouped_gemm_impl.hpp index ae70f551f1..96d34c7e42 100644 --- a/profiler/include/profile_grouped_gemm_impl.hpp +++ b/profiler/include/profile_grouped_gemm_impl.hpp @@ -49,7 +49,7 @@ template & Ms, const std::vector& Ns, const std::vector& Ks, @@ -231,7 +231,8 @@ void profile_grouped_gemm_impl(int do_verification, { std::string gemm_name = gemm_ptr->GetTypeString(); - float ave_time = invoker_ptr->Run(argument_ptr.get(), nrepeat); + float ave_time = + invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, time_kernel}); std::size_t flop = 0, num_btype = 0; for(std::size_t i = 0; i < gemm_shapes.size(); i++) diff --git a/profiler/include/profile_reduce_impl.hpp b/profiler/include/profile_reduce_impl.hpp index 678134f60b..33c7929ddd 100644 --- a/profiler/include/profile_reduce_impl.hpp +++ b/profiler/include/profile_reduce_impl.hpp @@ -157,7 +157,7 @@ void profile_reduce_impl_impl(bool do_verification, int init_method, bool do_log, bool do_dumpout, - int nrepeat, + bool time_kernel, const std::vector& inLengths, const std::vector& reduceDims, float alpha, @@ -430,7 +430,8 @@ void profile_reduce_impl_impl(bool do_verification, auto invoker_ptr = reduce_ptr->MakeInvokerPointer(); - float avg_time = invoker_ptr->Run(argument_ptr.get(), nrepeat); + float avg_time = + invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, time_kernel}); std::size_t num_bytes = invariant_total_length * reduce_total_length * sizeof(InDataType) + @@ -516,7 +517,8 @@ void profile_reduce_impl_impl(bool do_verification, auto invoker_ptr = reduce_ptr->MakeInvokerPointer(); - float avg_time = invoker_ptr->Run(argument_ptr.get(), nrepeat); + float avg_time = + invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, time_kernel}); std::size_t num_bytes = invariant_total_length * reduce_total_length * sizeof(InDataType) + @@ -554,7 +556,8 @@ void profile_reduce_impl_impl(bool do_verification, auto invoker2_ptr = reduce2_ptr->MakeInvokerPointer(); - float avg_time_2 = invoker2_ptr->Run(argument2_ptr.get(), nrepeat); + float avg_time_2 = + invoker2_ptr->Run(argument2_ptr.get(), StreamConfig{nullptr, time_kernel}); std::size_t num_bytes_2 = static_cast(inLengths2[0]) * inLengths2[1] * sizeof(AccDataType); @@ -625,7 +628,7 @@ void profile_reduce_impl(bool do_verification, int init_method, bool do_log, bool do_dumpout, - int nrepeat, + bool time_kernel, const std::vector& inLengths, const std::vector& reduceDims, ReduceTensorOp ReduceOpId, @@ -663,7 +666,7 @@ void profile_reduce_impl(bool do_verification, init_method, do_log, do_dumpout, - nrepeat, + time_kernel, inLengths, reduceDims, alpha, diff --git a/profiler/src/profile_batched_gemm.cpp b/profiler/src/profile_batched_gemm.cpp index 2a806b0818..db5486e0ac 100644 --- a/profiler/src/profile_batched_gemm.cpp +++ b/profiler/src/profile_batched_gemm.cpp @@ -48,8 +48,8 @@ int profile_batched_gemm(int argc, char* argv[]) printf(" 3: A[g, k, m] * B[g, n, k] = C[g, m, n])\n"); printf("arg4: verification (0: no; 1: yes)\n"); printf("arg5: initialization (0: no init; 1: integer value; 2: decimal value)\n"); - printf("arg8: print tensor value (0: no; 1: yes)\n"); - printf("arg7: run kernel # of times (>1)\n"); + printf("arg6: print tensor value (0: no; 1: yes)\n"); + printf("arg7: time kernel (0=n0, 1=yes)\n"); printf("arg8 to 14: M, N, K, StrideA, StrideB, StrideC, BatchCount\n"); exit(1); } @@ -59,7 +59,7 @@ int profile_batched_gemm(int argc, char* argv[]) const bool do_verification = std::stoi(argv[4]); const int init_method = std::stoi(argv[5]); const bool do_log = std::stoi(argv[6]); - const int nrepeat = std::stoi(argv[7]); + const bool time_kernel = std::stoi(argv[7]); const int M = std::stoi(argv[8]); const int N = std::stoi(argv[9]); @@ -82,7 +82,7 @@ int profile_batched_gemm(int argc, char* argv[]) do_verification, init_method, do_log, - nrepeat, + time_kernel, M, N, K, @@ -102,7 +102,7 @@ int profile_batched_gemm(int argc, char* argv[]) do_verification, init_method, do_log, - nrepeat, + time_kernel, M, N, K, @@ -122,7 +122,7 @@ int profile_batched_gemm(int argc, char* argv[]) do_verification, init_method, do_log, - nrepeat, + time_kernel, M, N, K, @@ -142,7 +142,7 @@ int profile_batched_gemm(int argc, char* argv[]) do_verification, init_method, do_log, - nrepeat, + time_kernel, M, N, K, @@ -162,7 +162,7 @@ int profile_batched_gemm(int argc, char* argv[]) do_verification, init_method, do_log, - nrepeat, + time_kernel, M, N, K, @@ -182,7 +182,7 @@ int profile_batched_gemm(int argc, char* argv[]) do_verification, init_method, do_log, - nrepeat, + time_kernel, M, N, K, @@ -202,7 +202,7 @@ int profile_batched_gemm(int argc, char* argv[]) do_verification, init_method, do_log, - nrepeat, + time_kernel, M, N, K, @@ -222,7 +222,7 @@ int profile_batched_gemm(int argc, char* argv[]) do_verification, init_method, do_log, - nrepeat, + time_kernel, M, N, K, @@ -242,7 +242,7 @@ int profile_batched_gemm(int argc, char* argv[]) do_verification, init_method, do_log, - nrepeat, + time_kernel, M, N, K, @@ -262,7 +262,7 @@ int profile_batched_gemm(int argc, char* argv[]) do_verification, init_method, do_log, - nrepeat, + time_kernel, M, N, K, @@ -282,7 +282,7 @@ int profile_batched_gemm(int argc, char* argv[]) do_verification, init_method, do_log, - nrepeat, + time_kernel, M, N, K, @@ -302,7 +302,7 @@ int profile_batched_gemm(int argc, char* argv[]) do_verification, init_method, do_log, - nrepeat, + time_kernel, M, N, K, @@ -322,7 +322,7 @@ int profile_batched_gemm(int argc, char* argv[]) do_verification, init_method, do_log, - nrepeat, + time_kernel, M, N, K, @@ -342,7 +342,7 @@ int profile_batched_gemm(int argc, char* argv[]) do_verification, init_method, do_log, - nrepeat, + time_kernel, M, N, K, @@ -362,7 +362,7 @@ int profile_batched_gemm(int argc, char* argv[]) do_verification, init_method, do_log, - nrepeat, + time_kernel, M, N, K, @@ -382,7 +382,7 @@ int profile_batched_gemm(int argc, char* argv[]) do_verification, init_method, do_log, - nrepeat, + time_kernel, M, N, K, diff --git a/profiler/src/profile_batched_gemm_reduce.cpp b/profiler/src/profile_batched_gemm_reduce.cpp index 38c3f52193..f67e561865 100644 --- a/profiler/src/profile_batched_gemm_reduce.cpp +++ b/profiler/src/profile_batched_gemm_reduce.cpp @@ -33,8 +33,8 @@ int profile_batched_gemm_reduce(int argc, char* argv[]) printf(" 3: A[k, m] * B[n, k] = C[m, n])\n"); printf("arg4: verification (0: no; 1: yes)\n"); printf("arg5: initialization (0: no init; 1: integer value; 2: decimal value)\n"); - printf("arg8: print tensor value (0: no; 1: yes)\n"); - printf("arg7: run kernel # of times (>1)\n"); + printf("arg6: print tensor value (0: no; 1: yes)\n"); + printf("arg7: time kernel (0=n0, 1=yes)\n"); printf("arg8 to 14: M, N, K, StrideA, StrideB, StrideC, BatchCount\n"); printf("arg15: split k into mulitiple batch\n"); exit(1); @@ -45,7 +45,7 @@ int profile_batched_gemm_reduce(int argc, char* argv[]) const bool do_verification = std::stoi(argv[4]); const int init_method = std::stoi(argv[5]); const bool do_log = std::stoi(argv[6]); - const int nrepeat = std::stoi(argv[7]); + const bool time_kernel = std::stoi(argv[7]); const int M = std::stoi(argv[8]); const int N = std::stoi(argv[9]); @@ -69,7 +69,7 @@ int profile_batched_gemm_reduce(int argc, char* argv[]) do_verification, init_method, do_log, - nrepeat, + time_kernel, M, N, K, @@ -91,7 +91,7 @@ int profile_batched_gemm_reduce(int argc, char* argv[]) do_verification, init_method, do_log, - nrepeat, + time_kernel, M, N, K, @@ -113,7 +113,7 @@ int profile_batched_gemm_reduce(int argc, char* argv[]) do_verification, init_method, do_log, - nrepeat, + time_kernel, M, N, K, @@ -135,7 +135,7 @@ int profile_batched_gemm_reduce(int argc, char* argv[]) do_verification, init_method, do_log, - nrepeat, + time_kernel, M, N, K, diff --git a/profiler/src/profile_conv_bwd_data.cpp b/profiler/src/profile_conv_bwd_data.cpp index 2861af3d10..206d486ea0 100644 --- a/profiler/src/profile_conv_bwd_data.cpp +++ b/profiler/src/profile_conv_bwd_data.cpp @@ -44,7 +44,7 @@ int profile_conv_bwd_data(int argc, char* argv[]) printf("arg6: verification (0: no; 1: yes)\n"); printf("arg7: initialization (0: no init; 1: integer value; 2: decimal value)\n"); printf("arg8: print tensor value (0: no; 1: yes)\n"); - printf("arg9: run kernel # of times (>1)\n"); + printf("arg9: time kernel (0=n0, 1=yes)\n"); printf("arg10 to 24: N, K, C, Y, X, Hi, Wi, Sy, Sx, Dy, Dx, LeftPy, LeftPx, RightPy, " "RightPx\n"); exit(1); @@ -57,7 +57,7 @@ int profile_conv_bwd_data(int argc, char* argv[]) const bool do_verification = std::stoi(argv[6]); const int init_method = std::stoi(argv[7]); const bool do_log = std::stoi(argv[8]); - const int nrepeat = std::stoi(argv[9]); + const bool time_kernel = std::stoi(argv[9]); const ck::index_t N = std::stoi(argv[10]); const ck::index_t K = std::stoi(argv[11]); @@ -96,7 +96,7 @@ int profile_conv_bwd_data(int argc, char* argv[]) do_verification, init_method, do_log, - nrepeat, + StreamControl{nullptr, time_kernel}, N, K, C, @@ -122,7 +122,7 @@ int profile_conv_bwd_data(int argc, char* argv[]) do_verification, init_method, do_log, - nrepeat, + StreamControl{nullptr, time_kernel}, N, K, C, @@ -148,7 +148,7 @@ int profile_conv_bwd_data(int argc, char* argv[]) do_verification, init_method, do_log, - nrepeat, + StreamControl{nullptr, time_kernel}, N, K, C, @@ -174,7 +174,7 @@ int profile_conv_bwd_data(int argc, char* argv[]) do_verification, init_method, do_log, - nrepeat, + StreamControl{nullptr, time_kernel}, N, K, C, diff --git a/profiler/src/profile_conv_bwd_weight.cpp b/profiler/src/profile_conv_bwd_weight.cpp index 309cc8ea2c..c022d19ee0 100644 --- a/profiler/src/profile_conv_bwd_weight.cpp +++ b/profiler/src/profile_conv_bwd_weight.cpp @@ -58,7 +58,7 @@ int profile_conv_bwd_weight(int argc, char* argv[]) const bool do_verification = std::stoi(argv[6]); const int init_method = std::stoi(argv[7]); const bool do_log = std::stoi(argv[8]); - const int nrepeat = std::stoi(argv[9]); + const bool time_kernel = std::stoi(argv[9]); const ck::index_t N = std::stoi(argv[10]); const ck::index_t K = std::stoi(argv[11]); @@ -98,7 +98,7 @@ int profile_conv_bwd_weight(int argc, char* argv[]) do_verification, init_method, do_log, - nrepeat, + time_kernel, N, K, C, @@ -124,7 +124,7 @@ int profile_conv_bwd_weight(int argc, char* argv[]) do_verification, init_method, do_log, - nrepeat, + time_kernel, N, K, C, diff --git a/profiler/src/profile_conv_fwd_bias_relu.cpp b/profiler/src/profile_conv_fwd_bias_relu.cpp index 1c447b483e..28aa49687f 100644 --- a/profiler/src/profile_conv_fwd_bias_relu.cpp +++ b/profiler/src/profile_conv_fwd_bias_relu.cpp @@ -42,7 +42,7 @@ int profile_conv_fwd_bias_relu(int argc, char* argv[]) printf("arg6: verification (0: no; 1: yes)\n"); printf("arg7: initialization (0: no init; 1: integer value; 2: decimal value)\n"); printf("arg8: print tensor value (0: no; 1: yes)\n"); - printf("arg9: run kernel # of times (>1)\n"); + printf("arg9: time kernel (0=n0, 1=yes)\n"); printf("arg10 to 24: N, K, C, Y, X, Hi, Wi, Sy, Sx, Dy, Dx, LeftPy, LeftPx, RightPy, " "RightPx\n"); exit(1); @@ -55,7 +55,7 @@ int profile_conv_fwd_bias_relu(int argc, char* argv[]) const bool do_verification = std::stoi(argv[6]); const int init_method = std::stoi(argv[7]); const bool do_log = std::stoi(argv[8]); - const int nrepeat = std::stoi(argv[9]); + const bool time_kernel = std::stoi(argv[9]); const ck::index_t N = std::stoi(argv[10]); const ck::index_t K = std::stoi(argv[11]); @@ -93,7 +93,7 @@ int profile_conv_fwd_bias_relu(int argc, char* argv[]) do_verification, init_method, do_log, - nrepeat, + time_kernel, N, K, C, diff --git a/profiler/src/profile_conv_fwd_bias_relu_add.cpp b/profiler/src/profile_conv_fwd_bias_relu_add.cpp index 522487c77b..7e033a51e2 100644 --- a/profiler/src/profile_conv_fwd_bias_relu_add.cpp +++ b/profiler/src/profile_conv_fwd_bias_relu_add.cpp @@ -43,7 +43,7 @@ int profile_conv_fwd_bias_relu_add(int argc, char* argv[]) printf("arg6: verification (0: no; 1: yes)\n"); printf("arg7: initialization (0: no init; 1: integer value; 2: decimal value)\n"); printf("arg8: print tensor value (0: no; 1: yes)\n"); - printf("arg9: run kernel # of times (>1)\n"); + printf("arg9: time kernel (0=n0, 1=yes)\n"); printf("arg10 to 24: N, K, C, Y, X, Hi, Wi, Sy, Sx, Dy, Dx, LeftPy, LeftPx, RightPy, " "RightPx\n"); exit(1); @@ -56,7 +56,7 @@ int profile_conv_fwd_bias_relu_add(int argc, char* argv[]) const bool do_verification = std::stoi(argv[6]); const int init_method = std::stoi(argv[7]); const bool do_log = std::stoi(argv[8]); - const int nrepeat = std::stoi(argv[9]); + const bool time_kernel = std::stoi(argv[9]); const ck::index_t N = std::stoi(argv[10]); const ck::index_t K = std::stoi(argv[11]); @@ -94,7 +94,7 @@ int profile_conv_fwd_bias_relu_add(int argc, char* argv[]) do_verification, init_method, do_log, - nrepeat, + time_kernel, N, K, C, diff --git a/profiler/src/profile_conv_fwd_bias_relu_atomic_add.cpp b/profiler/src/profile_conv_fwd_bias_relu_atomic_add.cpp index 833f2851db..095536f701 100644 --- a/profiler/src/profile_conv_fwd_bias_relu_atomic_add.cpp +++ b/profiler/src/profile_conv_fwd_bias_relu_atomic_add.cpp @@ -43,7 +43,7 @@ int profile_conv_fwd_bias_relu_atomic_add(int argc, char* argv[]) printf("arg6: verification (0: no; 1: yes)\n"); printf("arg7: initialization (0: no init; 1: integer value; 2: decimal value)\n"); printf("arg8: print tensor value (0: no; 1: yes)\n"); - printf("arg9: run kernel # of times (>1)\n"); + printf("arg9: time kernel (0=n0, 1=yes)\n"); printf("arg10 to 24: N, K, C, Y, X, Hi, Wi, Sy, Sx, Dy, Dx, LeftPy, LeftPx, RightPy, " "RightPx\n"); exit(1); @@ -56,7 +56,7 @@ int profile_conv_fwd_bias_relu_atomic_add(int argc, char* argv[]) const bool do_verification = std::stoi(argv[6]); const int init_method = std::stoi(argv[7]); const bool do_log = std::stoi(argv[8]); - const int nrepeat = std::stoi(argv[9]); + const bool time_kernel = std::stoi(argv[9]); const ck::index_t N = std::stoi(argv[10]); const ck::index_t K = std::stoi(argv[11]); @@ -95,7 +95,7 @@ int profile_conv_fwd_bias_relu_atomic_add(int argc, char* argv[]) do_verification, init_method, do_log, - nrepeat, + time_kernel, N, K, C, diff --git a/profiler/src/profile_convnd_bwd_data.cpp b/profiler/src/profile_convnd_bwd_data.cpp index 4d6b9a7b37..5d0e6a34c7 100644 --- a/profiler/src/profile_convnd_bwd_data.cpp +++ b/profiler/src/profile_convnd_bwd_data.cpp @@ -95,7 +95,7 @@ int profile_convnd_bwd_data(int argc, char* argv[], int num_dim_spatial) printf("arg6: verification (0: no; 1: yes)\n"); printf("arg7: initialization (0: no init; 1: integer value; 2: decimal value)\n"); printf("arg8: print tensor value (0: no; 1: yes)\n"); - printf("arg9: run kernel # of times (>1)\n"); + printf("arg9: time kernel (0=n0, 1=yes)\n"); printf("arg10 to 24: N, K, C, Y, X, Hi, Wi, Sy, Sx, Dy, Dx, LeftPy, LeftPx, RightPy, " "RightPx\n"); return 1; @@ -108,7 +108,7 @@ int profile_convnd_bwd_data(int argc, char* argv[], int num_dim_spatial) const bool do_verification = std::stoi(argv[6]); const int init_method = std::stoi(argv[7]); const bool do_log = std::stoi(argv[8]); - const int nrepeat = std::stoi(argv[9]); + const bool time_kernel = std::stoi(argv[9]); ck::utils::conv::ConvParams params = parse_conv_params(num_dim_spatial, argv, preParams); @@ -132,7 +132,7 @@ int profile_convnd_bwd_data(int argc, char* argv[], int num_dim_spatial) do_verification, init_method, do_log, - nrepeat, + time_kernel, params.N_, params.K_, params.C_, @@ -157,7 +157,7 @@ int profile_convnd_bwd_data(int argc, char* argv[], int num_dim_spatial) do_verification, init_method, do_log, - nrepeat, + time_kernel, params.N_, params.K_, params.C_, @@ -182,7 +182,7 @@ int profile_convnd_bwd_data(int argc, char* argv[], int num_dim_spatial) do_verification, init_method, do_log, - nrepeat, + time_kernel, params.N_, params.K_, params.C_, diff --git a/profiler/src/profile_convnd_fwd.cpp b/profiler/src/profile_convnd_fwd.cpp index 7902cdb002..722e86c2ea 100644 --- a/profiler/src/profile_convnd_fwd.cpp +++ b/profiler/src/profile_convnd_fwd.cpp @@ -119,7 +119,7 @@ template ::template Get(), - nrepeat, + time_kernel, do_verification, do_log); @@ -201,7 +201,7 @@ void profile_convnd_instances(ConvDataType data_type, const ck::utils::conv::ConvParams& params, bool do_verification, bool do_log, - int nrepeat, + bool time_kernel, int init_method) { switch(data_layout) @@ -214,7 +214,7 @@ void profile_convnd_instances(ConvDataType data_type, params, do_verification, do_log, - nrepeat, + time_kernel, init_method, ConvolutionLayouts{}); break; @@ -223,7 +223,7 @@ void profile_convnd_instances(ConvDataType data_type, params, do_verification, do_log, - nrepeat, + time_kernel, init_method, ConvolutionLayouts{}); break; @@ -232,7 +232,7 @@ void profile_convnd_instances(ConvDataType data_type, params, do_verification, do_log, - nrepeat, + time_kernel, init_method, ConvolutionLayouts{}); break; @@ -241,7 +241,7 @@ void profile_convnd_instances(ConvDataType data_type, params, do_verification, do_log, - nrepeat, + time_kernel, init_method, ConvolutionLayouts{}); break; @@ -256,7 +256,7 @@ void profile_convnd_instances(ConvDataType data_type, params, do_verification, do_log, - nrepeat, + time_kernel, init_method, ConvolutionLayouts{}); break; @@ -265,7 +265,7 @@ void profile_convnd_instances(ConvDataType data_type, params, do_verification, do_log, - nrepeat, + time_kernel, init_method, ConvolutionLayouts{}); break; @@ -274,7 +274,7 @@ void profile_convnd_instances(ConvDataType data_type, params, do_verification, do_log, - nrepeat, + time_kernel, init_method, ConvolutionLayouts{}); break; @@ -283,7 +283,7 @@ void profile_convnd_instances(ConvDataType data_type, params, do_verification, do_log, - nrepeat, + time_kernel, init_method, ConvolutionLayouts{}); break; @@ -304,7 +304,7 @@ int ck::profiler::profile_convnd_fwd(int argc, char* argv[]) bool do_verification{true}; int init_method{2}; bool do_log{false}; - int nrepeat{100}; + bool time_kernel{false}; int num_dim_spatial{2}; ConvParams params; @@ -318,7 +318,7 @@ int ck::profiler::profile_convnd_fwd(int argc, char* argv[]) do_verification = std::stoi(argv[4]); init_method = std::stoi(argv[5]); do_log = std::stoi(argv[6]); - nrepeat = std::stoi(argv[7]); + time_kernel = std::stoi(argv[7]); num_dim_spatial = std::stoi(argv[8]); } if(argc >= 10) @@ -332,15 +332,15 @@ int ck::profiler::profile_convnd_fwd(int argc, char* argv[]) { case 1: profile_convnd_instances<1>( - data_type, data_layout, params, do_verification, do_log, nrepeat, init_method); + data_type, data_layout, params, do_verification, do_log, time_kernel, init_method); break; case 2: profile_convnd_instances<2>( - data_type, data_layout, params, do_verification, do_log, nrepeat, init_method); + data_type, data_layout, params, do_verification, do_log, time_kernel, init_method); break; case 3: profile_convnd_instances<3>( - data_type, data_layout, params, do_verification, do_log, nrepeat, init_method); + data_type, data_layout, params, do_verification, do_log, time_kernel, init_method); break; default: throw std::runtime_error("profile_conv_fwd: unsupported num_dim_spatial value: " + diff --git a/profiler/src/profile_gemm.cpp b/profiler/src/profile_gemm.cpp index 7a72be2d8e..4c6a3b0487 100644 --- a/profiler/src/profile_gemm.cpp +++ b/profiler/src/profile_gemm.cpp @@ -38,8 +38,8 @@ int profile_gemm(int argc, char* argv[]) printf(" 3: A[k, m] * B[n, k] = C[m, n])\n"); printf("arg4: verification (0: no; 1: yes)\n"); printf("arg5: initialization (0: no init; 1: integer value; 2: decimal value)\n"); - printf("arg8: print tensor value (0: no; 1: yes)\n"); - printf("arg7: run kernel # of times (>1)\n"); + printf("arg6: print tensor value (0: no; 1: yes)\n"); + printf("arg7: time kernel (0=n0, 1=yes)\n"); printf("arg8 to 13: M, N, K, StrideA, StrideB, StrideC\n"); printf("arg14: split k into mulitiple batch\n"); exit(1); @@ -50,7 +50,7 @@ int profile_gemm(int argc, char* argv[]) const bool do_verification = std::stoi(argv[4]); const int init_method = std::stoi(argv[5]); const bool do_log = std::stoi(argv[6]); - const int nrepeat = std::stoi(argv[7]); + const bool time_kernel = std::stoi(argv[7]); const int M = std::stoi(argv[8]); const int N = std::stoi(argv[9]); @@ -74,7 +74,7 @@ int profile_gemm(int argc, char* argv[]) do_verification, init_method, do_log, - nrepeat, + time_kernel, M, N, K, @@ -94,7 +94,7 @@ int profile_gemm(int argc, char* argv[]) do_verification, init_method, do_log, - nrepeat, + time_kernel, M, N, K, @@ -114,7 +114,7 @@ int profile_gemm(int argc, char* argv[]) do_verification, init_method, do_log, - nrepeat, + time_kernel, M, N, K, @@ -134,7 +134,7 @@ int profile_gemm(int argc, char* argv[]) do_verification, init_method, do_log, - nrepeat, + time_kernel, M, N, K, @@ -154,7 +154,7 @@ int profile_gemm(int argc, char* argv[]) do_verification, init_method, do_log, - nrepeat, + time_kernel, M, N, K, @@ -174,7 +174,7 @@ int profile_gemm(int argc, char* argv[]) do_verification, init_method, do_log, - nrepeat, + time_kernel, M, N, K, @@ -194,7 +194,7 @@ int profile_gemm(int argc, char* argv[]) do_verification, init_method, do_log, - nrepeat, + time_kernel, M, N, K, @@ -214,7 +214,7 @@ int profile_gemm(int argc, char* argv[]) do_verification, init_method, do_log, - nrepeat, + time_kernel, M, N, K, @@ -234,7 +234,7 @@ int profile_gemm(int argc, char* argv[]) do_verification, init_method, do_log, - nrepeat, + time_kernel, M, N, K, @@ -254,7 +254,7 @@ int profile_gemm(int argc, char* argv[]) do_verification, init_method, do_log, - nrepeat, + time_kernel, M, N, K, @@ -274,7 +274,7 @@ int profile_gemm(int argc, char* argv[]) do_verification, init_method, do_log, - nrepeat, + time_kernel, M, N, K, @@ -294,7 +294,7 @@ int profile_gemm(int argc, char* argv[]) do_verification, init_method, do_log, - nrepeat, + time_kernel, M, N, K, @@ -314,7 +314,7 @@ int profile_gemm(int argc, char* argv[]) do_verification, init_method, do_log, - nrepeat, + time_kernel, M, N, K, @@ -334,7 +334,7 @@ int profile_gemm(int argc, char* argv[]) do_verification, init_method, do_log, - nrepeat, + time_kernel, M, N, K, @@ -354,7 +354,7 @@ int profile_gemm(int argc, char* argv[]) do_verification, init_method, do_log, - nrepeat, + time_kernel, M, N, K, @@ -374,7 +374,7 @@ int profile_gemm(int argc, char* argv[]) do_verification, init_method, do_log, - nrepeat, + time_kernel, M, N, K, diff --git a/profiler/src/profile_gemm_bias_2d.cpp b/profiler/src/profile_gemm_bias_2d.cpp index dd7e418087..46d4f90c17 100644 --- a/profiler/src/profile_gemm_bias_2d.cpp +++ b/profiler/src/profile_gemm_bias_2d.cpp @@ -36,8 +36,8 @@ int profile_gemm_bias_2d(int argc, char* argv[]) printf(" 3: A[k, m] * B[n, k] = C[m, n])\n"); printf("arg4: verification (0: no; 1: yes)\n"); printf("arg5: initialization (0: no init; 1: integer value; 2: decimal value)\n"); - printf("arg8: print tensor value (0: no; 1: yes)\n"); - printf("arg7: run kernel # of times (>1)\n"); + printf("arg6: print tensor value (0: no; 1: yes)\n"); + printf("arg7: time kernel (0=n0, 1=yes)\n"); printf("arg8 to 13: M, N, K, StrideA, StrideB, StrideC\n"); printf("arg14: alpha\n"); printf("arg15: beta\n"); @@ -50,7 +50,7 @@ int profile_gemm_bias_2d(int argc, char* argv[]) const bool do_verification = std::stoi(argv[4]); const int init_method = std::stoi(argv[5]); const bool do_log = std::stoi(argv[6]); - const int nrepeat = std::stoi(argv[7]); + const bool time_kernel = std::stoi(argv[7]); const int M = std::stoi(argv[8]); const int N = std::stoi(argv[9]); @@ -76,7 +76,7 @@ int profile_gemm_bias_2d(int argc, char* argv[]) do_verification, init_method, do_log, - nrepeat, + time_kernel, M, N, K, @@ -99,7 +99,7 @@ int profile_gemm_bias_2d(int argc, char* argv[]) do_verification, init_method, do_log, - nrepeat, + time_kernel, M, N, K, @@ -122,7 +122,7 @@ int profile_gemm_bias_2d(int argc, char* argv[]) do_verification, init_method, do_log, - nrepeat, + time_kernel, M, N, K, @@ -145,7 +145,7 @@ int profile_gemm_bias_2d(int argc, char* argv[]) do_verification, init_method, do_log, - nrepeat, + time_kernel, M, N, K, @@ -168,7 +168,7 @@ int profile_gemm_bias_2d(int argc, char* argv[]) do_verification, init_method, do_log, - nrepeat, + time_kernel, M, N, K, @@ -191,7 +191,7 @@ int profile_gemm_bias_2d(int argc, char* argv[]) do_verification, init_method, do_log, - nrepeat, + time_kernel, M, N, K, @@ -214,7 +214,7 @@ int profile_gemm_bias_2d(int argc, char* argv[]) do_verification, init_method, do_log, - nrepeat, + time_kernel, M, N, K, @@ -237,7 +237,7 @@ int profile_gemm_bias_2d(int argc, char* argv[]) do_verification, init_method, do_log, - nrepeat, + time_kernel, M, N, K, diff --git a/profiler/src/profile_gemm_bias_relu.cpp b/profiler/src/profile_gemm_bias_relu.cpp index 67a47cf9ec..4346650c9f 100644 --- a/profiler/src/profile_gemm_bias_relu.cpp +++ b/profiler/src/profile_gemm_bias_relu.cpp @@ -36,8 +36,8 @@ int profile_gemm_bias_relu(int argc, char* argv[]) printf(" 3: A[k, m] * B[n, k] = C[m, n])\n"); printf("arg4: verification (0: no; 1: yes)\n"); printf("arg5: initialization (0: no init; 1: integer value; 2: decimal value)\n"); - printf("arg8: print tensor value (0: no; 1: yes)\n"); - printf("arg7: run kernel # of times (>1)\n"); + printf("arg6: print tensor value (0: no; 1: yes)\n"); + printf("arg7: time kernel (0=n0, 1=yes)\n"); printf("arg8 to 13: M, N, K, StrideA, StrideB, StrideC\n"); printf("arg14: split k into mulitiple batch\n"); exit(1); @@ -48,7 +48,7 @@ int profile_gemm_bias_relu(int argc, char* argv[]) const bool do_verification = std::stoi(argv[4]); const int init_method = std::stoi(argv[5]); const bool do_log = std::stoi(argv[6]); - const int nrepeat = std::stoi(argv[7]); + const bool time_kernel = std::stoi(argv[7]); const int M = std::stoi(argv[8]); const int N = std::stoi(argv[9]); @@ -69,7 +69,7 @@ int profile_gemm_bias_relu(int argc, char* argv[]) do_verification, init_method, do_log, - nrepeat, + time_kernel, M, N, K, @@ -88,7 +88,7 @@ int profile_gemm_bias_relu(int argc, char* argv[]) do_verification, init_method, do_log, - nrepeat, + time_kernel, M, N, K, @@ -107,7 +107,7 @@ int profile_gemm_bias_relu(int argc, char* argv[]) do_verification, init_method, do_log, - nrepeat, + time_kernel, M, N, K, @@ -126,7 +126,7 @@ int profile_gemm_bias_relu(int argc, char* argv[]) do_verification, init_method, do_log, - nrepeat, + time_kernel, M, N, K, diff --git a/profiler/src/profile_gemm_bias_relu_add.cpp b/profiler/src/profile_gemm_bias_relu_add.cpp index 52406e93d6..186f32cf6f 100644 --- a/profiler/src/profile_gemm_bias_relu_add.cpp +++ b/profiler/src/profile_gemm_bias_relu_add.cpp @@ -36,8 +36,8 @@ int profile_gemm_bias_relu_add(int argc, char* argv[]) printf(" 3: A[k, m] * B[n, k] = C[m, n])\n"); printf("arg4: verification (0: no; 1: yes)\n"); printf("arg5: initialization (0: no init; 1: integer value; 2: decimal value)\n"); - printf("arg8: print tensor value (0: no; 1: yes)\n"); - printf("arg7: run kernel # of times (>1)\n"); + printf("arg6: print tensor value (0: no; 1: yes)\n"); + printf("arg7: time kernel (0=n0, 1=yes)\n"); printf("arg8 to 14: M, N, K, StrideA, StrideB, StrideC, StrideC1\n"); printf("arg15: split k into mulitiple batch\n"); exit(1); @@ -48,7 +48,7 @@ int profile_gemm_bias_relu_add(int argc, char* argv[]) const bool do_verification = std::stoi(argv[4]); const int init_method = std::stoi(argv[5]); const bool do_log = std::stoi(argv[6]); - const int nrepeat = std::stoi(argv[7]); + const bool time_kernel = std::stoi(argv[7]); const int M = std::stoi(argv[8]); const int N = std::stoi(argv[9]); @@ -70,7 +70,7 @@ int profile_gemm_bias_relu_add(int argc, char* argv[]) do_verification, init_method, do_log, - nrepeat, + time_kernel, M, N, K, @@ -90,7 +90,7 @@ int profile_gemm_bias_relu_add(int argc, char* argv[]) do_verification, init_method, do_log, - nrepeat, + time_kernel, M, N, K, @@ -110,7 +110,7 @@ int profile_gemm_bias_relu_add(int argc, char* argv[]) do_verification, init_method, do_log, - nrepeat, + time_kernel, M, N, K, @@ -130,7 +130,7 @@ int profile_gemm_bias_relu_add(int argc, char* argv[]) do_verification, init_method, do_log, - nrepeat, + time_kernel, M, N, K, diff --git a/profiler/src/profile_gemm_reduce.cpp b/profiler/src/profile_gemm_reduce.cpp index a83d4ce9a1..986acaf010 100644 --- a/profiler/src/profile_gemm_reduce.cpp +++ b/profiler/src/profile_gemm_reduce.cpp @@ -32,8 +32,8 @@ int profile_gemm_reduce(int argc, char* argv[]) printf(" 3: A[k, m] * B[n, k] = C[m, n])\n"); printf("arg4: verification (0: no; 1: yes)\n"); printf("arg5: initialization (0: no init; 1: integer value; 2: decimal value)\n"); - printf("arg8: print tensor value (0: no; 1: yes)\n"); - printf("arg7: run kernel # of times (>1)\n"); + printf("arg6: print tensor value (0: no; 1: yes)\n"); + printf("arg7: time kernel (0=n0, 1=yes)\n"); printf("arg8 to 13: M, N, K, StrideA, StrideB, StrideC\n"); printf("arg14: split k into mulitiple batch\n"); exit(1); @@ -44,7 +44,7 @@ int profile_gemm_reduce(int argc, char* argv[]) const bool do_verification = std::stoi(argv[4]); const int init_method = std::stoi(argv[5]); const bool do_log = std::stoi(argv[6]); - const int nrepeat = std::stoi(argv[7]); + const bool time_kernel = std::stoi(argv[7]); const int M = std::stoi(argv[8]); const int N = std::stoi(argv[9]); @@ -66,7 +66,7 @@ int profile_gemm_reduce(int argc, char* argv[]) do_verification, init_method, do_log, - nrepeat, + time_kernel, M, N, K, @@ -87,7 +87,7 @@ int profile_gemm_reduce(int argc, char* argv[]) do_verification, init_method, do_log, - nrepeat, + time_kernel, M, N, K, @@ -108,7 +108,7 @@ int profile_gemm_reduce(int argc, char* argv[]) do_verification, init_method, do_log, - nrepeat, + time_kernel, M, N, K, @@ -129,7 +129,7 @@ int profile_gemm_reduce(int argc, char* argv[]) do_verification, init_method, do_log, - nrepeat, + time_kernel, M, N, K, diff --git a/profiler/src/profile_grouped_gemm.cpp b/profiler/src/profile_grouped_gemm.cpp index 88a2a8f855..d35484cfae 100644 --- a/profiler/src/profile_grouped_gemm.cpp +++ b/profiler/src/profile_grouped_gemm.cpp @@ -54,8 +54,8 @@ int profile_grouped_gemm(int argc, char* argv[]) printf(" 3: A[k, m] * B[n, k] = C[m, n])\n"); printf("arg4: verification (0: no; 1: yes)\n"); printf("arg5: initialization (0: no init; 1: integer value; 2: decimal value)\n"); - printf("arg8: print tensor value (0: no; 1: yes)\n"); - printf("arg7: run kernel # of times (>1)\n"); + printf("arg6: print tensor value (0: no; 1: yes)\n"); + printf("arg7: time kernel (0=n0, 1=yes)\n"); printf("arg8 to 13: Ms, Ns, Ks, StrideAs, StrideBs, StrideCs (e.g., 256,256 128,128 64,64 " "64,64 64,64 128,128)\n"); exit(1); @@ -66,7 +66,7 @@ int profile_grouped_gemm(int argc, char* argv[]) const bool do_verification = std::stoi(argv[4]); const int init_method = std::stoi(argv[5]); const bool do_log = std::stoi(argv[6]); - const int nrepeat = std::stoi(argv[7]); + const bool time_kernel = std::stoi(argv[7]); const auto Ms = argToIntArray(argv[8]); const auto Ns = argToIntArray(argv[9]); @@ -86,7 +86,7 @@ int profile_grouped_gemm(int argc, char* argv[]) ck::tensor_layout::gemm::RowMajor>(do_verification, init_method, do_log, - nrepeat, + time_kernel, Ms, Ns, Ks, @@ -104,7 +104,7 @@ int profile_grouped_gemm(int argc, char* argv[]) ck::tensor_layout::gemm::RowMajor>(do_verification, init_method, do_log, - nrepeat, + time_kernel, Ms, Ns, Ks, @@ -122,7 +122,7 @@ int profile_grouped_gemm(int argc, char* argv[]) ck::tensor_layout::gemm::RowMajor>(do_verification, init_method, do_log, - nrepeat, + time_kernel, Ms, Ns, Ks, @@ -140,7 +140,7 @@ int profile_grouped_gemm(int argc, char* argv[]) ck::tensor_layout::gemm::RowMajor>(do_verification, init_method, do_log, - nrepeat, + time_kernel, Ms, Ns, Ks, diff --git a/profiler/src/profile_reduce.cpp b/profiler/src/profile_reduce.cpp index 96fa78964a..5e91a1d2d1 100644 --- a/profiler/src/profile_reduce.cpp +++ b/profiler/src/profile_reduce.cpp @@ -144,7 +144,7 @@ class AppArgs bool do_dumpout = false; int init_method; - int nrepeat; + bool time_kernel; bool need_indices = false; @@ -295,7 +295,7 @@ class AppArgs throw std::runtime_error("Invalid cmd-line arguments, more argumetns are needed!"); init_method = std::atoi(argv[optind++]); - nrepeat = std::atoi(argv[optind]); + time_kernel = std::atoi(argv[optind]); if(scales.empty()) { @@ -354,7 +354,7 @@ int profile_reduce(int argc, char* argv[]) args.init_method, args.do_log, args.do_dumpout, - args.nrepeat, + args.time_kernel, args.inLengths, args.reduceDims, args.reduceOp, @@ -369,7 +369,7 @@ int profile_reduce(int argc, char* argv[]) args.init_method, args.do_log, args.do_dumpout, - args.nrepeat, + args.time_kernel, args.inLengths, args.reduceDims, args.reduceOp, @@ -387,7 +387,7 @@ int profile_reduce(int argc, char* argv[]) args.init_method, args.do_log, args.do_dumpout, - args.nrepeat, + args.time_kernel, args.inLengths, args.reduceDims, args.reduceOp, @@ -414,7 +414,7 @@ int profile_reduce(int argc, char* argv[]) args.init_method, args.do_log, args.do_dumpout, - args.nrepeat, + args.time_kernel, args.inLengths, args.reduceDims, args.reduceOp, @@ -429,7 +429,7 @@ int profile_reduce(int argc, char* argv[]) args.init_method, args.do_log, args.do_dumpout, - args.nrepeat, + args.time_kernel, args.inLengths, args.reduceDims, args.reduceOp, @@ -454,7 +454,7 @@ int profile_reduce(int argc, char* argv[]) args.init_method, args.do_log, args.do_dumpout, - args.nrepeat, + args.time_kernel, args.inLengths, args.reduceDims, args.reduceOp, @@ -471,7 +471,7 @@ int profile_reduce(int argc, char* argv[]) args.init_method, args.do_log, args.do_dumpout, - args.nrepeat, + args.time_kernel, args.inLengths, args.reduceDims, args.reduceOp, @@ -486,7 +486,7 @@ int profile_reduce(int argc, char* argv[]) args.init_method, args.do_log, args.do_dumpout, - args.nrepeat, + args.time_kernel, args.inLengths, args.reduceDims, args.reduceOp, diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index 8a9db2adbd..c696069393 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -22,6 +22,8 @@ include_directories(BEFORE ${PROJECT_SOURCE_DIR}/external/include/half ) +include(googletest) + add_custom_target(check COMMAND ${CMAKE_CTEST_COMMAND} --output-on-failure -C ${CMAKE_CFG_INTDIR}) add_custom_target(tests) @@ -61,4 +63,5 @@ add_subdirectory(grouped_gemm) add_subdirectory(convnd_fwd) add_subdirectory(reduce) add_subdirectory(conv2d_bwd_weight) -add_subdirectory(convnd_bwd_data) \ No newline at end of file +add_subdirectory(convnd_bwd_data) +# DONOT add client_app, that is tested via CI independently \ No newline at end of file diff --git a/test/batched_gemm_reduce/batched_gemm_reduce_fp16.cpp b/test/batched_gemm_reduce/batched_gemm_reduce_fp16.cpp index ce061c644b..7b311cff17 100644 --- a/test/batched_gemm_reduce/batched_gemm_reduce_fp16.cpp +++ b/test/batched_gemm_reduce/batched_gemm_reduce_fp16.cpp @@ -22,7 +22,7 @@ int main() Row, Row, Row>( - true, 1, false, 1, M, N, K, K, N, N, BatchCount); + true, 1, false, false, M, N, K, K, N, N, BatchCount); pass = pass && ck::profiler::profile_batched_gemm_reduce_impl( - true, 1, false, 1, M, N, K, K, K, N, BatchCount); + true, 1, false, false, M, N, K, K, K, N, BatchCount); pass = pass && ck::profiler::profile_batched_gemm_reduce_impl( - true, 1, false, 1, M, N, K, M, N, N, BatchCount); + true, 1, false, false, M, N, K, M, N, N, BatchCount); pass = pass && ck::profiler::profile_batched_gemm_reduce_impl( - true, 1, false, 1, M, N, K, M, K, N, BatchCount); + true, 1, false, false, M, N, K, M, K, N, BatchCount); if(pass) { diff --git a/test/client_app/CMakeLists.txt b/test/client_app/CMakeLists.txt new file mode 100644 index 0000000000..f8dd8c4e0a --- /dev/null +++ b/test/client_app/CMakeLists.txt @@ -0,0 +1,11 @@ +cmake_minimum_required(VERSION 3.15) +project(ck_app) +add_compile_options(-std=c++14) + +find_package(composable_kernel 1.0.0 COMPONENTS device_operations host_tensor) +find_package(hip REQUIRED PATHS /opt/rocm) +message(STATUS "Build with HIP ${hip_VERSION}") + +add_executable(test_client_app client_app.cpp) + +target_link_libraries(test_client_app PRIVATE composable_kernel::device_operations composable_kernel::host_tensor hip::host) diff --git a/test/client_app/client_app.cpp b/test/client_app/client_app.cpp new file mode 100644 index 0000000000..665a103f70 --- /dev/null +++ b/test/client_app/client_app.cpp @@ -0,0 +1,77 @@ +#include +#include +#include +#include +#include +#include +#include + +#include "client_app_impl.hpp" + +int main(int argc, char* argv[]) +{ + if(argc != 25) + { + printf("arg1: tensor operation (conv_fwd: ForwardConvolution)\n"); + printf("arg2: data type (0: fp32; 1: fp16)\n"); + printf("arg3: input tensor layout (0: NCHW; 1: NHWC)\n"); + printf("arg4: weight tensor layout (0: KCYX; 1: KYXC)\n"); + printf("arg5: output tensor layout (0: NKHW; 1: NHWK)\n"); + printf("arg6: verification (0: no; 1: yes)\n"); + printf("arg7: initialization (0: no init; 1: integer value; 2: decimal value)\n"); + printf("arg8: print tensor value (0: no; 1: yes)\n"); + printf("arg9: time kernel (0=n0, 1=yes)\n"); + printf("arg10 to 24: N, K, C, Y, X, Hi, Wi, Sy, Sx, Dy, Dx, LeftPy, LeftPx, RightPy, " + "RightPx\n"); + exit(1); + } + + const ConvDataType data_type = static_cast(std::stoi(argv[2])); + const int in_layout = static_cast(std::stoi(argv[3])); + const int wei_layout = static_cast(std::stoi(argv[4])); + const int out_layout = static_cast(std::stoi(argv[5])); + const bool do_verification = std::stoi(argv[6]); + const int init_method = std::stoi(argv[7]); + const bool do_log = std::stoi(argv[8]); + const bool time_kernel = std::stoi(argv[9]); + + const ck::index_t N = std::stoi(argv[10]); + const ck::index_t K = std::stoi(argv[11]); + const ck::index_t C = std::stoi(argv[12]); + const ck::index_t Y = std::stoi(argv[13]); + const ck::index_t X = std::stoi(argv[14]); + const ck::index_t Hi = std::stoi(argv[15]); + const ck::index_t Wi = std::stoi(argv[16]); + + const ck::index_t conv_stride_h = std::stoi(argv[17]); + const ck::index_t conv_stride_w = std::stoi(argv[18]); + const ck::index_t conv_dilation_h = std::stoi(argv[19]); + const ck::index_t conv_dilation_w = std::stoi(argv[20]); + const ck::index_t in_left_pad_h = std::stoi(argv[21]); + const ck::index_t in_left_pad_w = std::stoi(argv[22]); + const ck::index_t in_right_pad_h = std::stoi(argv[23]); + const ck::index_t in_right_pad_w = std::stoi(argv[24]); + + const ck::index_t YEff = (Y - 1) * conv_dilation_h + 1; + const ck::index_t XEff = (X - 1) * conv_dilation_w + 1; + + const ck::index_t Ho = (Hi + in_left_pad_h + in_right_pad_h - YEff) / conv_stride_h + 1; + const ck::index_t Wo = (Wi + in_left_pad_w + in_right_pad_w - XEff) / conv_stride_w + 1; + + ck::app::profile_conv_fwd_impl(do_verification, + init_method, + do_log, + time_kernel, + data_type, + N, + K, + C, + std::vector{Hi, Wi}, + std::vector{Y, X}, + std::vector{Ho, Wo}, + std::vector{conv_stride_h, conv_stride_w}, + std::vector{conv_dilation_h, conv_dilation_w}, + std::vector{in_left_pad_h, in_left_pad_w}, + std::vector{in_right_pad_h, in_right_pad_w}); + return 1; +} diff --git a/test/client_app/client_app_impl.hpp b/test/client_app/client_app_impl.hpp new file mode 100644 index 0000000000..f9e4145ba0 --- /dev/null +++ b/test/client_app/client_app_impl.hpp @@ -0,0 +1,214 @@ +#pragma once + +#include "host_interface.hpp" + +enum ConvDataType +{ + F32_F32_F32, // 0 + F16_F16_F16, // 1 + BF16_BF16_BF16, // 2 + INT8_INT8_INT8, // 3 +}; + +enum ConvInputLayout +{ + NCHW, // 0 + NHWC, // 1 +}; + +enum ConvWeightLayout +{ + KCYX, // 0 + KYXC, // 1 +}; + +enum ConvOutputLayout +{ + NKHW, // 0 + NHWK, // 1 +}; + +void check_hip_error(void) +{ + hipError_t err = hipGetLastError(); + if(err != hipSuccess) + { + std::cerr << "Error: " << hipGetErrorString(err) << std::endl; + exit(err); + } +} +std::string getDeviceName(int device) +{ + struct hipDeviceProp_t prop; + hipGetDeviceProperties(&prop, device); + check_hip_error(); + return std::string(prop.name); +} + +int getDriver(void) +{ + int driver; + hipDriverGetVersion(&driver); + check_hip_error(); + return driver; +} + +namespace ck { +namespace app { +struct DeviceMem +{ + DeviceMem() = delete; + DeviceMem(std::size_t mem_size); + void* GetDeviceBuffer(); + void ToDevice(const void* p); + void FromDevice(void* p); + ~DeviceMem(); + + void* mpDeviceBuf; + std::size_t mMemSize; +}; + +DeviceMem::DeviceMem(std::size_t mem_size) : mMemSize(mem_size) +{ + hipGetErrorString(hipMalloc(static_cast(&mpDeviceBuf), mMemSize)); +} + +void* DeviceMem::GetDeviceBuffer() { return mpDeviceBuf; } + +void DeviceMem::ToDevice(const void* p) +{ + hipGetErrorString( + hipMemcpy(mpDeviceBuf, const_cast(p), mMemSize, hipMemcpyHostToDevice)); +} + +void DeviceMem::FromDevice(void* p) +{ + hipGetErrorString(hipMemcpy(p, mpDeviceBuf, mMemSize, hipMemcpyDeviceToHost)); +} + +DeviceMem::~DeviceMem() { hipGetErrorString(hipFree(mpDeviceBuf)); } + +void profile_conv_fwd_impl(int do_verification, + int init_method, + bool do_log, + bool time_kernel, + ConvDataType data_type, + ck::index_t N, + ck::index_t K, + ck::index_t C, + std::vector input_spatial_lengths, + std::vector filter_spatial_lengths, + std::vector output_spatial_lengths, + std::vector conv_filter_strides, + std::vector conv_filter_dilations, + std::vector input_left_pads, + std::vector input_right_pads) +{ + const ck::index_t Y = filter_spatial_lengths[0]; + const ck::index_t X = filter_spatial_lengths[1]; + + const ck::index_t Hi = input_spatial_lengths[0]; + const ck::index_t Wi = input_spatial_lengths[1]; + + const ck::index_t Ho = output_spatial_lengths[0]; + const ck::index_t Wo = output_spatial_lengths[1]; + + const auto in_sz = N * C * Hi * Wi; + const auto wei_sz = K * C * Y * X; + const auto out_sz = N * K * Ho * Wo; + + using WeiDataType = float; + using InDataType = float; + using OutDataType = float; + + app::DeviceMem in_device_buf(sizeof(InDataType) * in_sz); + app::DeviceMem wei_device_buf(sizeof(WeiDataType) * wei_sz); + app::DeviceMem out_device_buf(sizeof(OutDataType) * out_sz); + // data is already on device! + + // add device Conv instances + std::vector conv_ptrs; + if(data_type == F16_F16_F16) + { + add_device_conv2d_fwd_xdl_c_shuffle_nhwc_kyxc_nhwk_f16_instances_t(conv_ptrs); + add_device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_f16_instances_t(conv_ptrs); + } + else if(data_type == BF16_BF16_BF16) + add_device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_bf16_instances_t(conv_ptrs); + else if(data_type == F32_F32_F32) + add_device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_f32_instances_t(conv_ptrs); + else if(data_type == INT8_INT8_INT8) + add_device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_int8_instances_t(conv_ptrs); + else + throw std::runtime_error("wrong! Invalid data type"); + if(conv_ptrs.empty()) + { + throw std::runtime_error("wrong! no device Conv instance found"); + } + + std::string best_conv_name; + float best_ave_time = 0; + float best_tflops = 0; + float best_gb_per_sec = 0; + int deviceIndex = 0; + hipSetDevice(deviceIndex); + check_hip_error(); + + StreamConfig stream_config{nullptr, time_kernel}; + hipStreamCreate(&stream_config.stream_id_); + check_hip_error(); + + // profile device Conv instances + for(auto& conv_ptr : conv_ptrs) + { + auto argument_ptr = + conv_ptr.MakeArgumentPointer(static_cast(in_device_buf.GetDeviceBuffer()), + static_cast(wei_device_buf.GetDeviceBuffer()), + static_cast(out_device_buf.GetDeviceBuffer()), + N, + K, + C, + input_spatial_lengths, + filter_spatial_lengths, + output_spatial_lengths, + conv_filter_strides, + conv_filter_dilations, + input_left_pads, + input_right_pads); + + auto invoker_ptr = conv_ptr.MakeInvokerPointer(); + + if(conv_ptr.IsSupportedArgument(argument_ptr.get())) + { + std::string conv_name = conv_ptr.GetTypeString(); + float ave_time = invoker_ptr->Run(argument_ptr.get(), stream_config); + + std::size_t flop = std::size_t(2) * N * K * Ho * Wo * C * Y * X; + + std::size_t num_btype = sizeof(InDataType) * (N * C * Hi * Wi) + + sizeof(WeiDataType) * (K * C * Y * X) + + sizeof(OutDataType) * (N * K * Ho * Wo); + + float tflops = static_cast(flop) / 1.E9 / ave_time; + + float gb_per_sec = num_btype / 1.E6 / ave_time; + + std::cout << "Perf: " << ave_time << " ms, " << tflops << " TFlops, " << gb_per_sec + << " GB/s, " << conv_name << std::endl; + + if(tflops > best_tflops) + { + best_conv_name = conv_name; + best_tflops = tflops; + best_ave_time = ave_time; + best_gb_per_sec = gb_per_sec; + } + } + } + + std::cout << "Best Perf: " << best_ave_time << " ms, " << best_tflops << " TFlops, " + << best_gb_per_sec << " GB/s, " << best_conv_name << std::endl; +} + +} // namespace app +} // namespace ck diff --git a/test/conv2d_bwd_weight/conv2d_bwd_weight.cpp b/test/conv2d_bwd_weight/conv2d_bwd_weight.cpp index 085473f695..671980f49e 100644 --- a/test/conv2d_bwd_weight/conv2d_bwd_weight.cpp +++ b/test/conv2d_bwd_weight/conv2d_bwd_weight.cpp @@ -28,10 +28,10 @@ int test_self() ck::tensor_layout::convolution::NHWC, ck::tensor_layout::convolution::KYXC, ck::tensor_layout::convolution::NHWK>( - 1, // do_verification, - 1, // init_method, - 0, // do_log, - 1, // nrepeat, + true, // do_verification + 1, // init_method + false, // do_log + false, // time_kernel param.N_, param.K_, param.C_, @@ -52,10 +52,10 @@ int test_self() ck::tensor_layout::convolution::NHWC, ck::tensor_layout::convolution::KYXC, ck::tensor_layout::convolution::NHWK>( - 1, // do_verification, - 1, // init_method, - 0, // do_log, - 1, // nrepeat, + true, // do_verification + 1, // init_method + false, // do_log + false, // time_kernel param.N_, param.K_, param.C_, @@ -72,8 +72,8 @@ int test_self() } int main(int argc, char* argv[]) { - int data_type = 0; - int init_method = 0; + int data_type = 1; + int init_method = 1; // Conv shape ck::index_t N = 128; @@ -155,10 +155,10 @@ int main(int argc, char* argv[]) ck::tensor_layout::convolution::NHWC, ck::tensor_layout::convolution::KYXC, ck::tensor_layout::convolution::NHWK>( - 1, + true, // do_verification init_method, - 0, - 1, + false, // do_log + false, // time_kernel param.N_, param.K_, param.C_, @@ -180,10 +180,10 @@ int main(int argc, char* argv[]) ck::tensor_layout::convolution::NHWC, ck::tensor_layout::convolution::KYXC, ck::tensor_layout::convolution::NHWK>( - 1, + true, // do_verification init_method, - 0, - 1, + false, // do_log + false, // time_kernel param.N_, param.K_, param.C_, diff --git a/test/convnd_bwd_data/convnd_bwd_data.cpp b/test/convnd_bwd_data/convnd_bwd_data.cpp index 0b6ddb1405..7284680e0e 100644 --- a/test/convnd_bwd_data/convnd_bwd_data.cpp +++ b/test/convnd_bwd_data/convnd_bwd_data.cpp @@ -27,10 +27,10 @@ int main() ck::tensor_layout::convolution::NWC, ck::tensor_layout::convolution::KXC, ck::tensor_layout::convolution::NWK>( - 1, // do_verification, - 1, // init_method, - 0, // do_log, - 1, // nrepeat, + true, // do_verification + 1, // init_method + false, // do_log + false, // time_kernel param.N_, param.K_, param.C_, @@ -50,10 +50,10 @@ int main() ck::tensor_layout::convolution::NWC, ck::tensor_layout::convolution::KXC, ck::tensor_layout::convolution::NWK>( - 1, // do_verification, - 1, // init_method, - 0, // do_log, - 1, // nrepeat, + true, // do_verification + 1, // init_method + false, // do_log + false, // time_kernel param.N_, param.K_, param.C_, @@ -73,10 +73,10 @@ int main() ck::tensor_layout::convolution::NWC, ck::tensor_layout::convolution::KXC, ck::tensor_layout::convolution::NWK>( - 1, // do_verification, - 1, // init_method, - 0, // do_log, - 1, // nrepeat, + true, // do_verification + 1, // init_method + false, // do_log + false, // time_kernel param.N_, param.K_, param.C_, @@ -96,10 +96,10 @@ int main() ck::tensor_layout::convolution::NWC, ck::tensor_layout::convolution::KXC, ck::tensor_layout::convolution::NWK>( - 1, // do_verification, - 1, // init_method, - 0, // do_log, - 1, // nrepeat, + true, // do_verification + 1, // init_method + false, // do_log + false, // time_kernel param.N_, param.K_, param.C_, @@ -128,10 +128,10 @@ int main() ck::tensor_layout::convolution::NHWC, ck::tensor_layout::convolution::KYXC, ck::tensor_layout::convolution::NHWK>( - 1, // do_verification, - 1, // init_method, - 0, // do_log, - 1, // nrepeat, + true, // do_verification + 1, // init_method + false, // do_log + false, // time_kernel param.N_, param.K_, param.C_, @@ -151,10 +151,10 @@ int main() ck::tensor_layout::convolution::NHWC, ck::tensor_layout::convolution::KYXC, ck::tensor_layout::convolution::NHWK>( - 1, // do_verification, - 1, // init_method, - 0, // do_log, - 1, // nrepeat, + true, // do_verification + 1, // init_method + false, // do_log + false, // time_kernel param.N_, param.K_, param.C_, @@ -174,10 +174,10 @@ int main() ck::tensor_layout::convolution::NHWC, ck::tensor_layout::convolution::KYXC, ck::tensor_layout::convolution::NHWK>( - 1, // do_verification, - 1, // init_method, - 0, // do_log, - 1, // nrepeat, + true, // do_verification + 1, // init_method + false, // do_log + false, // time_kernel param.N_, param.K_, param.C_, @@ -197,10 +197,10 @@ int main() ck::tensor_layout::convolution::NHWC, ck::tensor_layout::convolution::KYXC, ck::tensor_layout::convolution::NHWK>( - 1, // do_verification, - 1, // init_method, - 0, // do_log, - 1, // nrepeat, + true, // do_verification + 1, // init_method + false, // do_log + false, // time_kernel param.N_, param.K_, param.C_, @@ -232,10 +232,10 @@ int main() ck::tensor_layout::convolution::NDHWC, ck::tensor_layout::convolution::KZYXC, ck::tensor_layout::convolution::NDHWK>( - 1, // do_verification, - 1, // init_method, - 0, // do_log, - 1, // nrepeat, + true, // do_verification + 1, // init_method + false, // do_log + false, // time_kernel param.N_, param.K_, param.C_, @@ -255,10 +255,10 @@ int main() ck::tensor_layout::convolution::NDHWC, ck::tensor_layout::convolution::KZYXC, ck::tensor_layout::convolution::NDHWK>( - 1, // do_verification, - 1, // init_method, - 0, // do_log, - 1, // nrepeat, + true, // do_verification + 1, // init_method + false, // do_log + false, // time_kernel param.N_, param.K_, param.C_, @@ -278,10 +278,10 @@ int main() ck::tensor_layout::convolution::NDHWC, ck::tensor_layout::convolution::KZYXC, ck::tensor_layout::convolution::NDHWK>( - 1, // do_verification, - 1, // init_method, - 0, // do_log, - 1, // nrepeat, + true, // do_verification + 1, // init_method + false, // do_log + false, // time_kernel param.N_, param.K_, param.C_, @@ -301,10 +301,10 @@ int main() ck::tensor_layout::convolution::NDHWC, ck::tensor_layout::convolution::KZYXC, ck::tensor_layout::convolution::NDHWK>( - 1, // do_verification, - 1, // init_method, - 0, // do_log, - 1, // nrepeat, + true, // do_verification + 1, // init_method + false, // do_log + false, // time_kernel param.N_, param.K_, param.C_, diff --git a/test/gemm_reduce/gemm_reduce_fp16.cpp b/test/gemm_reduce/gemm_reduce_fp16.cpp index 8deb66b2b0..6c7bb9658f 100644 --- a/test/gemm_reduce/gemm_reduce_fp16.cpp +++ b/test/gemm_reduce/gemm_reduce_fp16.cpp @@ -16,22 +16,22 @@ int main() pass = pass && ck::profiler:: profile_gemm_reduce_impl( - true, 1, false, 1, M, N, K, K, N, N); + true, 1, false, false, M, N, K, K, N, N); pass = pass && ck::profiler:: profile_gemm_reduce_impl( - true, 1, false, 1, M, N, K, K, K, N); + true, 1, false, false, M, N, K, K, K, N); pass = pass && ck::profiler:: profile_gemm_reduce_impl( - true, 1, false, 1, M, N, K, M, N, N); + true, 1, false, false, M, N, K, M, N, N); pass = pass && ck::profiler:: profile_gemm_reduce_impl( - true, 1, false, 1, M, N, K, M, K, N); + true, 1, false, false, M, N, K, M, K, N); if(pass) { diff --git a/test/gemm_split_k/gemm_split_k.cpp b/test/gemm_split_k/gemm_split_k.cpp index c788b66aa3..b63361aa1b 100644 --- a/test/gemm_split_k/gemm_split_k.cpp +++ b/test/gemm_split_k/gemm_split_k.cpp @@ -187,9 +187,10 @@ int test_gemm(const gemmArgs& args) if(gemm_ptr->IsSupportedArgument(argument_ptr.get())) { - invoker_ptr->Run(argument_ptr.get(), 0); + invoker_ptr->Run(argument_ptr.get()); c_device_buf.FromDevice(c_m_n_device_result.mData.data()); + if(!check_out(c_m_n_host_result, c_m_n_device_result)) { success = false;