diff --git a/CMakeLists.txt b/CMakeLists.txt index 2b798e38f3..a3ec91e3bc 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -27,6 +27,8 @@ set(CMAKE_CXX_STANDARD_REQUIRED ON) set(CMAKE_CXX_EXTENSIONS OFF) message("CMAKE_CXX_COMPILER_ID: ${CMAKE_CXX_COMPILER_ID}") +option(CK_TIME_KERNEL "Turning off will disable kernel timing globally" ON) + ## OpenMP if(CMAKE_CXX_COMPILER_ID MATCHES "Clang") # workaround issue hipcc in rocm3.5 cannot find openmp @@ -72,8 +74,9 @@ message(STATUS "Build with HIP ${HIP_VERSION}") rocm_create_package( - NAME CK-${CK_BACKEND} + NAME composablekernel DESCRIPTION "High Performance Composable Kernel for AMD GPUs" + MAINTAINER "MIOpen Kernels Dev Team " LDCONFIG ) @@ -226,7 +229,7 @@ set(CMAKE_LIBRARY_OUTPUT_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/lib) set(CMAKE_ARCHIVE_OUTPUT_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/lib) set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/bin) -configure_file("${PROJECT_SOURCE_DIR}/include/ck/hip_version.hpp.in" "${PROJECT_BINARY_DIR}/include/ck/hip_version.hpp") +configure_file("${PROJECT_SOURCE_DIR}/include/ck/options.hpp.in" "${PROJECT_BINARY_DIR}/include/ck/options.hpp") include_directories(BEFORE ${PROJECT_SOURCE_DIR}/include @@ -234,7 +237,6 @@ include_directories(BEFORE ${PROJECT_SOURCE_DIR}/library/include ) -include(googletest) SET(BUILD_DEV ON CACHE BOOL "BUILD_DEV") if(BUILD_DEV) @@ -243,7 +245,31 @@ if(BUILD_DEV) endif() message("CMAKE_CXX_FLAGS: ${CMAKE_CXX_FLAGS}") +add_custom_target(check COMMAND ${CMAKE_CTEST_COMMAND} --output-on-failure -C ${CMAKE_CFG_INTDIR}) + add_subdirectory(library) add_subdirectory(example) add_subdirectory(test) add_subdirectory(profiler) + +#Create an interface target for the include only files and call it "composablekernels" +include(CMakePackageConfigHelpers) + +set(version 1.0.0) +write_basic_package_version_file( + "${CMAKE_CURRENT_BINARY_DIR}/composable_kernelConfigVersion.cmake" + VERSION "${version}" + COMPATIBILITY AnyNewerVersion +) + +configure_package_config_file(${CMAKE_CURRENT_SOURCE_DIR}/Config.cmake.in + "${CMAKE_CURRENT_BINARY_DIR}/composable_kernelConfig.cmake" + INSTALL_DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/composable_kernel + NO_CHECK_REQUIRED_COMPONENTS_MACRO +) + +install(FILES + "${CMAKE_CURRENT_BINARY_DIR}/composable_kernelConfig.cmake" + "${CMAKE_CURRENT_BINARY_DIR}/composable_kernelConfigVersion.cmake" + DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/composable_kernel +) diff --git a/Config.cmake.in b/Config.cmake.in new file mode 100644 index 0000000000..12b5c331ae --- /dev/null +++ b/Config.cmake.in @@ -0,0 +1,11 @@ +@PACKAGE_INIT@ + +set(_composable_kernel_supported_components device_operations host_tensor) + +foreach(_comp ${composable_kernel_FIND_COMPONENTS}) + if(NOT _comp IN_LIST _composable_kernel_supported_components) + set(composable_kernel_FOUND False) + set(composable_kernel_NOT_FOUND_MESSAGE "Unsupported component: ${_comp}") + endif() + include("${CMAKE_CURRENT_LIST_DIR}/composable_kernel${_comp}Targets.cmake") +endforeach() diff --git a/Dockerfile b/Dockerfile index c4cf0fac57..9a443e01de 100644 --- a/Dockerfile +++ b/Dockerfile @@ -11,13 +11,7 @@ ARG DEB_ROCM_REPO=http://repo.radeon.com/rocm/apt/.apt_$ROCMVERSION/ RUN apt-get update RUN apt-get install -y wget gnupg RUN wget -qO - http://repo.radeon.com/rocm/rocm.gpg.key | apt-key add - -RUN if ! [ -z $OSDB_BKC_VERSION ]; then \ - echo "Using BKC VERISION: $OSDB_BKC_VERSION";\ - sh -c "echo deb [arch=amd64 trusted=yes] http://compute-artifactory.amd.com/artifactory/list/rocm-osdb-deb/ compute-rocm-dkms-no-npi-hipclang ${OSDB_BKC_VERSION} > /etc/apt/sources.list.d/rocm.list" ;\ - cat /etc/apt/sources.list.d/rocm.list;\ - else \ - sh -c "echo deb [arch=amd64] $DEB_ROCM_REPO ubuntu main > /etc/apt/sources.list.d/rocm.list" ;\ - fi +RUN sh -c "echo deb [arch=amd64] $DEB_ROCM_REPO ubuntu main > /etc/apt/sources.list.d/rocm.list" RUN wget --no-check-certificate -qO - https://apt.kitware.com/keys/kitware-archive-latest.asc 2>/dev/null | apt-key add - RUN sh -c "echo deb https://apt.kitware.com/ubuntu/ bionic main | tee -a /etc/apt/sources.list" @@ -25,18 +19,15 @@ RUN sh -c "echo deb https://apt.kitware.com/ubuntu/ bionic main | tee -a /etc/ap # Install dependencies RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --allow-unauthenticated \ apt-utils \ - sshpass \ build-essential \ cmake-data=3.15.1-0kitware1 \ cmake=3.15.1-0kitware1 \ curl \ - doxygen \ g++ \ gdb \ git \ hip-rocclr \ jq \ - lcov \ libelf-dev \ libncurses5-dev \ libnuma-dev \ @@ -62,8 +53,6 @@ RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --allow- apt-get clean && \ rm -rf /var/lib/apt/lists/* -# RUN pip3 install --default-timeout=100000 -r requirements.txt - # Setup ubsan environment to printstacktrace RUN ln -s /usr/bin/llvm-symbolizer-3.8 /usr/local/bin/llvm-symbolizer ENV UBSAN_OPTIONS=print_stacktrace=1 @@ -92,5 +81,3 @@ ADD rbuild.ini /rbuild.ini ADD dev-requirements.txt dev-requirements.txt RUN rbuild prepare -s develop -d $PREFIX RUN groupadd -f render -# RUN cget install -f min-requirements.txt -# RUN CXXFLAGS='-isystem $PREFIX/include' cget install -f ./mlir-requirements.txt diff --git a/Jenkinsfile b/Jenkinsfile index f065d4ecc5..77f4d9d8be 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -320,7 +320,7 @@ pipeline { { agent{ label rocmnode("gfx908")} environment{ - setup_args = """ -D CMAKE_CXX_FLAGS="--offload-arch=gfx908 -O3 " -DBUILD_DEV=On """ + setup_args = """ -D CMAKE_CXX_FLAGS=" --offload-arch=gfx900 --offload-arch=gfx906 --offload-arch=gfx908 --offload-arch=gfx90a -O3 " -DBUILD_DEV=On """ } steps{ buildHipClangJobAndReboot(setup_args:setup_args, config_targets: "check", no_reboot:true, build_type: 'Release') @@ -341,6 +341,23 @@ pipeline { } } + stage("Client App") + { + parallel + { + stage("Run Client App") + { + agent{ label rocmnode("gfx908")} + environment{ + setup_args = """ -D -DBUILD_DEV=Off -DCMAKE_INSTALL_PREFIX=../install CMAKE_CXX_FLAGS="--offload-arch=gfx908 -O3 " """ + execute_args = """ cd ../test/client_app && rm -rf build && mkdir build && cd build && cmake -DCMAKE_PREFIX_PATH="${env.WORKSPACE}/install;/opt/rocm" .. && make """ + } + steps{ + buildHipClangJobAndReboot(setup_args: setup_args, config_targets: "install", no_reboot:true, build_type: 'Release', execute_cmd: execute_args, prefixpath: '/usr/local') + } + } + } + } stage("Performance Tests") { parallel diff --git a/README.md b/README.md index f5341b5736..9d7b578046 100644 --- a/README.md +++ b/README.md @@ -43,3 +43,13 @@ Instructions for running each individual examples are under ```example/``` make -j ckProfiler ``` Instructions for running ckProfiler are under ```profiler/``` + + +## Caveat +### Kernel Timing and Verification +CK's own kernel timer will warn up kernel once, and then run it multiple times +to get average kernel time. For some kernels that use atomic add, this will cause +output buffer to be accumulated multiple times, causing verfication failure. +To work around it, do not use CK's own timer and do verification at the same time. +CK's own timer and verification in each example and ckProfiler can be enabled or +disabled from command line. diff --git a/cmake/googletest.cmake b/cmake/googletest.cmake index c7e70cc8a9..959bc4f4b0 100644 --- a/cmake/googletest.cmake +++ b/cmake/googletest.cmake @@ -18,6 +18,8 @@ list(APPEND GTEST_CMAKE_CXX_FLAGS -Wno-switch-enum -Wno-zero-as-null-pointer-constant -Wno-unused-member-function + -Wno-comma + -Wno-old-style-cast ) message(STATUS "Suppressing googltest warnings with flags: ${GTEST_CMAKE_CXX_FLAGS}") @@ -33,4 +35,5 @@ FetchContent_MakeAvailable(googletest) target_compile_options(gtest PRIVATE ${GTEST_CMAKE_CXX_FLAGS}) target_compile_options(gtest_main PRIVATE ${GTEST_CMAKE_CXX_FLAGS}) - +target_compile_options(gmock PRIVATE ${GTEST_CMAKE_CXX_FLAGS}) +target_compile_options(gmock_main PRIVATE ${GTEST_CMAKE_CXX_FLAGS}) diff --git a/example/01_gemm/gemm_xdl_bf16.cpp b/example/01_gemm/gemm_xdl_bf16.cpp index a4567dcd6e..060750e676 100644 --- a/example/01_gemm/gemm_xdl_bf16.cpp +++ b/example/01_gemm/gemm_xdl_bf16.cpp @@ -88,9 +88,9 @@ using ReferenceGemmInstance = ck::tensor_operation::host:: int main(int argc, char* argv[]) { - bool do_verification = 0; - int init_method = 0; - int nrepeat = 5; + bool do_verification = true; + int init_method = 1; + bool time_kernel = false; // GEMM shape ck::index_t M = 3840; @@ -105,13 +105,13 @@ int main(int argc, char* argv[]) { do_verification = std::stoi(argv[1]); init_method = std::stoi(argv[2]); - nrepeat = std::stoi(argv[3]); + time_kernel = std::stoi(argv[3]); } else if(argc == 10) { do_verification = std::stoi(argv[1]); init_method = std::stoi(argv[2]); - nrepeat = std::stoi(argv[3]); + time_kernel = std::stoi(argv[3]); M = std::stoi(argv[4]); N = std::stoi(argv[5]); @@ -125,7 +125,7 @@ int main(int argc, char* argv[]) { printf("arg1: verification (0=no, 1=yes)\n"); printf("arg2: initialization (0=no init, 1=integer value, 2=decimal value)\n"); - printf("arg3: run kernel # of times (>1)\n"); + printf("arg3: time kernel (0=n0, 1=yes)\n"); printf("arg4 to 9: M (256x), N(128x), K(32x), StrideA, StrideB, StrideC\n"); exit(0); } @@ -198,7 +198,7 @@ int main(int argc, char* argv[]) "not support this GEMM problem"); } - float ave_time = invoker.Run(argument, nrepeat); + float ave_time = invoker.Run(argument, StreamConfig{nullptr, time_kernel}); std::size_t flop = std::size_t(2) * M * N * K; std::size_t num_btype = @@ -232,7 +232,7 @@ int main(int argc, char* argv[]) ref_invoker.Run(ref_argument); - ck::utils::check_err(c_m_n_device_f32_result.mData, c_m_n_host_result.mData); + return ck::utils::check_err(c_m_n_device_f32_result.mData, c_m_n_host_result.mData) ? 0 : 1; } return 0; diff --git a/example/01_gemm/gemm_xdl_fp16.cpp b/example/01_gemm/gemm_xdl_fp16.cpp index fc04a13ca5..06523037f9 100644 --- a/example/01_gemm/gemm_xdl_fp16.cpp +++ b/example/01_gemm/gemm_xdl_fp16.cpp @@ -56,9 +56,9 @@ using ReferenceGemmInstance = ck::tensor_operation::host:: int main(int argc, char* argv[]) { - bool do_verification = 0; - int init_method = 0; - int nrepeat = 5; + bool do_verification = true; + int init_method = 1; + bool time_kernel = false; // GEMM shape ck::index_t M = 3840; @@ -73,13 +73,13 @@ int main(int argc, char* argv[]) { do_verification = std::stoi(argv[1]); init_method = std::stoi(argv[2]); - nrepeat = std::stoi(argv[3]); + time_kernel = std::stoi(argv[3]); } else if(argc == 10) { do_verification = std::stoi(argv[1]); init_method = std::stoi(argv[2]); - nrepeat = std::stoi(argv[3]); + time_kernel = std::stoi(argv[3]); M = std::stoi(argv[4]); N = std::stoi(argv[5]); @@ -93,7 +93,7 @@ int main(int argc, char* argv[]) { printf("arg1: verification (0=no, 1=yes)\n"); printf("arg2: initialization (0=no init, 1=integer value, 2=decimal value)\n"); - printf("arg3: run kernel # of times (>1)\n"); + printf("arg3: time kernel (0=n0, 1=yes)\n"); printf("arg4 to 9: M (256x), N(128x), K(32x), StrideA, StrideB, StrideC\n"); exit(0); } @@ -171,7 +171,7 @@ int main(int argc, char* argv[]) "not support this GEMM problem"); } - float ave_time = invoker.Run(argument, nrepeat); + float ave_time = invoker.Run(argument, StreamConfig{nullptr, time_kernel}); std::size_t flop = std::size_t(2) * M * N * K; std::size_t num_btype = @@ -196,7 +196,7 @@ int main(int argc, char* argv[]) ref_invoker.Run(ref_argument); - ck::utils::check_err(c_m_n_device_result.mData, c_m_n_host_result.mData); + return ck::utils::check_err(c_m_n_device_result.mData, c_m_n_host_result.mData) ? 0 : 1; } return 0; diff --git a/example/01_gemm/gemm_xdl_int8.cpp b/example/01_gemm/gemm_xdl_int8.cpp index ab5869db61..a22c21e40e 100644 --- a/example/01_gemm/gemm_xdl_int8.cpp +++ b/example/01_gemm/gemm_xdl_int8.cpp @@ -83,9 +83,9 @@ using ReferenceGemmInstance = ck::tensor_operation::host:: int main(int argc, char* argv[]) { - bool do_verification = 0; - int init_method = 0; - int nrepeat = 5; + bool do_verification = true; + int init_method = 1; + bool time_kernel = false; // GEMM shape ck::index_t M = 3840; @@ -100,13 +100,13 @@ int main(int argc, char* argv[]) { do_verification = std::stoi(argv[1]); init_method = std::stoi(argv[2]); - nrepeat = std::stoi(argv[3]); + time_kernel = std::stoi(argv[3]); } else if(argc == 10) { do_verification = std::stoi(argv[1]); init_method = std::stoi(argv[2]); - nrepeat = std::stoi(argv[3]); + time_kernel = std::stoi(argv[3]); M = std::stoi(argv[4]); N = std::stoi(argv[5]); @@ -120,7 +120,7 @@ int main(int argc, char* argv[]) { printf("arg1: verification (0=no, 1=yes)\n"); printf("arg2: initialization (0=no init, 1=integer value, 2=decimal value)\n"); - printf("arg3: run kernel # of times (>1)\n"); + printf("arg3: time kernel (0=n0, 1=yes)\n"); printf("arg4 to 9: M (256x), N(128x), K(32x), StrideA, StrideB, StrideC\n"); exit(0); } @@ -194,7 +194,7 @@ int main(int argc, char* argv[]) "not support this GEMM problem"); } - float ave_time = invoker.Run(argument, nrepeat); + float ave_time = invoker.Run(argument, StreamConfig{nullptr, time_kernel}); std::size_t flop = std::size_t(2) * M * N * K; std::size_t num_btype = @@ -219,7 +219,7 @@ int main(int argc, char* argv[]) ref_invoker.Run(ref_argument); - ck::utils::check_err(c_m_n_device_result.mData, c_m_n_host_result.mData); + return ck::utils::check_err(c_m_n_device_result.mData, c_m_n_host_result.mData) ? 0 : 1; } return 0; diff --git a/example/02_gemm_alpha_beta/gemm_xdl_alpha_beta.cpp b/example/02_gemm_alpha_beta/gemm_xdl_alpha_beta.cpp index 2abebbbac4..1a6e1de4dc 100644 --- a/example/02_gemm_alpha_beta/gemm_xdl_alpha_beta.cpp +++ b/example/02_gemm_alpha_beta/gemm_xdl_alpha_beta.cpp @@ -86,9 +86,9 @@ using ReferenceGemmInstance = ck::tensor_operation::host::ReferenceGemmBias2D1)\n"); + printf("arg3: time kernel (0=n0, 1=yes)\n"); printf("arg4 to 9: M (256x), N(128x), K(32x), StrideA, StrideB, StrideC, alpha, beta\n"); exit(0); } @@ -216,7 +216,7 @@ int main(int argc, char* argv[]) "not support this GEMM problem"); } - float ave_time = invoker.Run(argument, nrepeat); + float ave_time = invoker.Run(argument, StreamConfig{nullptr, time_kernel}); std::size_t flop = std::size_t(2) * M * N * K; std::size_t num_btype = @@ -246,6 +246,8 @@ int main(int argc, char* argv[]) ref_invoker.Run(ref_argument); - ck::utils::check_err(c_m_n_device_result.mData, c_m_n_host_result.mData); + return ck::utils::check_err(c_m_n_device_result.mData, c_m_n_host_result.mData) ? 0 : 1; } + + return 0; } diff --git a/example/03_gemm_bias_relu/gemm_xdl_bias_relu.cpp b/example/03_gemm_bias_relu/gemm_xdl_bias_relu.cpp index f3ed2bad37..3bf3003c14 100644 --- a/example/03_gemm_bias_relu/gemm_xdl_bias_relu.cpp +++ b/example/03_gemm_bias_relu/gemm_xdl_bias_relu.cpp @@ -83,9 +83,9 @@ using ReferenceGemmInstance = ck::tensor_operation::host::ReferenceGemmBiasActiv int main(int argc, char* argv[]) { - bool do_verification = 0; - int init_method = 0; - int nrepeat = 5; + bool do_verification = true; + int init_method = 1; + bool time_kernel = false; // GEMM shape ck::index_t M = 3840; @@ -100,13 +100,13 @@ int main(int argc, char* argv[]) { do_verification = std::stoi(argv[1]); init_method = std::stoi(argv[2]); - nrepeat = std::stoi(argv[3]); + time_kernel = std::stoi(argv[3]); } else if(argc == 10) { do_verification = std::stoi(argv[1]); init_method = std::stoi(argv[2]); - nrepeat = std::stoi(argv[3]); + time_kernel = std::stoi(argv[3]); M = std::stoi(argv[4]); N = std::stoi(argv[5]); @@ -120,7 +120,7 @@ int main(int argc, char* argv[]) { printf("arg1: verification (0=no, 1=yes)\n"); printf("arg2: initialization (0=no init, 1=integer value, 2=decimal value)\n"); - printf("arg3: run kernel # of times (>1)\n"); + printf("arg3: time kernel (0=n0, 1=yes)\n"); printf("arg4 to 9: M (256x), N(128x), K(32x), StrideA, StrideB, StrideC\n"); exit(0); } @@ -206,7 +206,7 @@ int main(int argc, char* argv[]) "not support this GEMM problem"); } - float ave_time = invoker.Run(argument, nrepeat); + float ave_time = invoker.Run(argument, StreamConfig{nullptr, time_kernel}); std::size_t flop = std::size_t(2) * M * N * K; @@ -232,6 +232,8 @@ int main(int argc, char* argv[]) ref_invoker.Run(ref_argument); - ck::utils::check_err(c_m_n_device_result.mData, c_m_n_host_result.mData); + return ck::utils::check_err(c_m_n_device_result.mData, c_m_n_host_result.mData) ? 0 : 1; } + + return 0; } diff --git a/example/04_gemm_bias_relu_add/gemm_xdl_bias_relu_add.cpp b/example/04_gemm_bias_relu_add/gemm_xdl_bias_relu_add.cpp index 9405c36881..73e92f9d11 100644 --- a/example/04_gemm_bias_relu_add/gemm_xdl_bias_relu_add.cpp +++ b/example/04_gemm_bias_relu_add/gemm_xdl_bias_relu_add.cpp @@ -83,9 +83,9 @@ using ReferenceGemmInstance = CElementOp>; int main(int argc, char* argv[]) { - bool do_verification = 0; - int init_method = 0; - int nrepeat = 5; + bool do_verification = true; + int init_method = 1; + bool time_kernel = false; // GEMM shape ck::index_t M = 3840; @@ -101,13 +101,13 @@ int main(int argc, char* argv[]) { do_verification = std::stoi(argv[1]); init_method = std::stoi(argv[2]); - nrepeat = std::stoi(argv[3]); + time_kernel = std::stoi(argv[3]); } else if(argc == 11) { do_verification = std::stoi(argv[1]); init_method = std::stoi(argv[2]); - nrepeat = std::stoi(argv[3]); + time_kernel = std::stoi(argv[3]); M = std::stoi(argv[4]); N = std::stoi(argv[5]); @@ -122,7 +122,7 @@ int main(int argc, char* argv[]) { printf("arg1: verification (0=no, 1=yes)\n"); printf("arg2: initialization (0=no init, 1=integer value, 2=decimal value)\n"); - printf("arg3: run kernel # of times (>1)\n"); + printf("arg3: time kernel (0=n0, 1=yes)\n"); printf("arg4 to 10: M (256x), N(128x), K(32x), StrideA, StrideB, StrideC, StrideC1\n"); exit(0); } @@ -218,7 +218,7 @@ int main(int argc, char* argv[]) "not support this GEMM problem"); } - float ave_time = invoker.Run(argument, nrepeat); + float ave_time = invoker.Run(argument, StreamConfig{nullptr, time_kernel}); std::size_t flop = std::size_t(2) * M * N * K; std::size_t num_btype = sizeof(ADataType) * M * K + sizeof(BDataType) * K * M + @@ -250,6 +250,8 @@ int main(int argc, char* argv[]) ref_invoker.Run(ref_argument); - ck::utils::check_err(c_m_n_device_result.mData, c_m_n_host_result.mData); + return ck::utils::check_err(c_m_n_device_result.mData, c_m_n_host_result.mData) ? 0 : 1; } + + return 0; } diff --git a/example/06_conv2d_fwd_bias_relu/CMakeLists.txt b/example/06_conv2d_fwd_bias_relu/CMakeLists.txt index df8f70606c..4e1dd1f3e6 100644 --- a/example/06_conv2d_fwd_bias_relu/CMakeLists.txt +++ b/example/06_conv2d_fwd_bias_relu/CMakeLists.txt @@ -1,2 +1,2 @@ add_example_executable(example_conv2d_fwd_xdl_bias_relu conv2d_fwd_xdl_bias_relu.cpp) -target_link_libraries(example_conv2d_fwd_xdl_bias_relu PRIVATE conv_fwd_util) +target_link_libraries(example_conv2d_fwd_xdl_bias_relu PRIVATE conv_util) diff --git a/example/06_conv2d_fwd_bias_relu/conv2d_fwd_xdl_bias_relu.cpp b/example/06_conv2d_fwd_bias_relu/conv2d_fwd_xdl_bias_relu.cpp index 751ce16b90..d50afb6854 100644 --- a/example/06_conv2d_fwd_bias_relu/conv2d_fwd_xdl_bias_relu.cpp +++ b/example/06_conv2d_fwd_bias_relu/conv2d_fwd_xdl_bias_relu.cpp @@ -7,7 +7,7 @@ #include "check_err.hpp" #include "config.hpp" -#include "conv_fwd_util.hpp" +#include "conv_util.hpp" #include "device.hpp" #include "device_conv2d_fwd_xdl_c_shuffle_bias_activation_nhwc_kyxc_nhwk.hpp" #include "device_tensor.hpp" @@ -93,7 +93,7 @@ void PrintUseMsg() { std::cout << "arg1: verification (0=no, 1=yes)\n" << "arg2: initialization (0=no init, 1=integer value, 2=decimal value)\n" - << "arg3: run kernel # of times (>1)\n" + << "arg3: time kernel (0=n0, 1=yes)\n" << "Following arguments:\n" << " N, K, C, \n" << " , (ie Y, X for 2D)\n" @@ -120,40 +120,40 @@ ck::utils::conv::ConvParams ParseConvParams(int argc, char* argv[]) ck::utils::conv::ConvParams params; int arg_idx = 4; - params.num_dim_spatial = num_dim_spatial; - params.N = std::stoi(argv[arg_idx++]); - params.K = std::stoi(argv[arg_idx++]); - params.C = std::stoi(argv[arg_idx++]); + params.num_dim_spatial_ = num_dim_spatial; + params.N_ = std::stoi(argv[arg_idx++]); + params.K_ = std::stoi(argv[arg_idx++]); + params.C_ = std::stoi(argv[arg_idx++]); - params.filter_spatial_lengths.resize(num_dim_spatial); + params.filter_spatial_lengths_.resize(num_dim_spatial); for(int i = 0; i < num_dim_spatial; ++i) { - params.filter_spatial_lengths[i] = std::stoi(argv[arg_idx++]); + params.filter_spatial_lengths_[i] = std::stoi(argv[arg_idx++]); } - params.input_spatial_lengths.resize(num_dim_spatial); + params.input_spatial_lengths_.resize(num_dim_spatial); for(int i = 0; i < num_dim_spatial; ++i) { - params.input_spatial_lengths[i] = std::stoi(argv[arg_idx++]); + params.input_spatial_lengths_[i] = std::stoi(argv[arg_idx++]); } - params.conv_filter_strides.resize(num_dim_spatial); + params.conv_filter_strides_.resize(num_dim_spatial); for(int i = 0; i < num_dim_spatial; ++i) { - params.conv_filter_strides[i] = std::stoi(argv[arg_idx++]); + params.conv_filter_strides_[i] = std::stoi(argv[arg_idx++]); } - params.conv_filter_dilations.resize(num_dim_spatial); + params.conv_filter_dilations_.resize(num_dim_spatial); for(int i = 0; i < num_dim_spatial; ++i) { - params.conv_filter_dilations[i] = std::stoi(argv[arg_idx++]); + params.conv_filter_dilations_[i] = std::stoi(argv[arg_idx++]); } - params.input_left_pads.resize(num_dim_spatial); + params.input_left_pads_.resize(num_dim_spatial); for(int i = 0; i < num_dim_spatial; ++i) { - params.input_left_pads[i] = std::stoi(argv[arg_idx++]); + params.input_left_pads_[i] = std::stoi(argv[arg_idx++]); } - params.input_right_pads.resize(num_dim_spatial); + params.input_right_pads_.resize(num_dim_spatial); for(int i = 0; i < num_dim_spatial; ++i) { - params.input_right_pads[i] = std::stoi(argv[arg_idx++]); + params.input_right_pads_[i] = std::stoi(argv[arg_idx++]); } return params; @@ -165,9 +165,9 @@ int main(int argc, char* argv[]) { using namespace ck::utils::conv; - bool do_verification = 0; - int init_method = 0; - int nrepeat = 5; + bool do_verification = true; + int init_method = 1; + bool time_kernel = false; const int num_dim_spatial = 2; ck::utils::conv::ConvParams params; @@ -176,7 +176,7 @@ int main(int argc, char* argv[]) { do_verification = std::stoi(argv[1]); init_method = std::stoi(argv[2]); - nrepeat = std::stoi(argv[3]); + time_kernel = std::stoi(argv[3]); } if(argc >= 5) @@ -184,21 +184,21 @@ int main(int argc, char* argv[]) params = ParseConvParams(argc, argv); } - std::vector input_dims{static_cast(params.N), - static_cast(params.C)}; + std::vector input_dims{static_cast(params.N_), + static_cast(params.C_)}; input_dims.insert(std::end(input_dims), - std::begin(params.input_spatial_lengths), - std::end(params.input_spatial_lengths)); + std::begin(params.input_spatial_lengths_), + std::end(params.input_spatial_lengths_)); - std::vector filter_dims{static_cast(params.K), - static_cast(params.C)}; + std::vector filter_dims{static_cast(params.K_), + static_cast(params.C_)}; filter_dims.insert(std::end(filter_dims), - std::begin(params.filter_spatial_lengths), - std::end(params.filter_spatial_lengths)); + std::begin(params.filter_spatial_lengths_), + std::end(params.filter_spatial_lengths_)); const std::vector& output_spatial_lengths = params.GetOutputSpatialLengths(); - std::vector output_dims{static_cast(params.N), - static_cast(params.K)}; + std::vector output_dims{static_cast(params.N_), + static_cast(params.K_)}; output_dims.insert(std::end(output_dims), std::begin(output_spatial_lengths), std::end(output_spatial_lengths)); @@ -211,7 +211,7 @@ int main(int argc, char* argv[]) get_output_host_tensor_descriptor(output_dims, num_dim_spatial)); // bias: assume contiguous 1d vector Tensor bias( - HostTensorDescriptor(std::vector({static_cast(params.K)}))); + HostTensorDescriptor(std::vector({static_cast(params.K_)}))); std::cout << "input: " << input.mDesc << std::endl; std::cout << "weights: " << weights.mDesc << std::endl; @@ -248,16 +248,16 @@ int main(int argc, char* argv[]) static_cast(wei_device_buf.GetDeviceBuffer()), static_cast(out_device_buf.GetDeviceBuffer()), static_cast(bias_device_buf.GetDeviceBuffer()), - params.N, - params.K, - params.C, - params.input_spatial_lengths, - params.filter_spatial_lengths, + params.N_, + params.K_, + params.C_, + params.input_spatial_lengths_, + params.filter_spatial_lengths_, output_spatial_lengths, - params.conv_filter_strides, - params.conv_filter_dilations, - params.input_left_pads, - params.input_right_pads, + params.conv_filter_strides_, + params.conv_filter_dilations_, + params.input_left_pads_, + params.input_right_pads_, InElementOp{}, WeiElementOp{}, OutElementOp{}); @@ -269,18 +269,18 @@ int main(int argc, char* argv[]) "not support this problem"); } - float ave_time = invoker.Run(argument, nrepeat); + float ave_time = invoker.Run(argument, StreamConfig{nullptr, time_kernel}); std::size_t flop = get_flops( - params.N, params.C, params.K, params.filter_spatial_lengths, output_spatial_lengths); + params.N_, params.C_, params.K_, params.filter_spatial_lengths_, output_spatial_lengths); std::size_t num_btype = - get_btype(params.N, - params.C, - params.K, - params.input_spatial_lengths, - params.filter_spatial_lengths, + get_btype(params.N_, + params.C_, + params.K_, + params.input_spatial_lengths_, + params.filter_spatial_lengths_, output_spatial_lengths) + - sizeof(OutDataType) * (params.K); + sizeof(OutDataType) * (params.K_); float tflops = static_cast(flop) / 1.E9 / ave_time; float gb_per_sec = num_btype / 1.E6 / ave_time; @@ -296,16 +296,17 @@ int main(int argc, char* argv[]) weights, host_output, bias, - params.conv_filter_strides, - params.conv_filter_dilations, - params.input_left_pads, - params.input_right_pads, + params.conv_filter_strides_, + params.conv_filter_dilations_, + params.input_left_pads_, + params.input_right_pads_, InElementOp{}, WeiElementOp{}, OutElementOp{}); ref_invoker.Run(ref_argument); out_device_buf.FromDevice(device_output.mData.data()); - ck::utils::check_err( - host_output.mData, device_output.mData, "Error: incorrect results!", 1e-5f, 1e-4f); + return ck::utils::check_err(device_output.mData, host_output.mData) ? 0 : 1; } + + return 0; } diff --git a/example/07_conv2d_fwd_bias_relu_add/CMakeLists.txt b/example/07_conv2d_fwd_bias_relu_add/CMakeLists.txt index 8bc5980025..b4dd39d83a 100644 --- a/example/07_conv2d_fwd_bias_relu_add/CMakeLists.txt +++ b/example/07_conv2d_fwd_bias_relu_add/CMakeLists.txt @@ -1,2 +1,3 @@ -add_example_executable(example_conv2d_fwd_xdl_bias_relu_add conv2d_fwd_xdl_bias_relu_add.cpp) -target_link_libraries(example_conv2d_fwd_xdl_bias_relu_add PRIVATE conv_fwd_util) +# FIXME: should fix validation failure +add_example_executable_no_testing(example_conv2d_fwd_xdl_bias_relu_add conv2d_fwd_xdl_bias_relu_add.cpp) +target_link_libraries(example_conv2d_fwd_xdl_bias_relu_add PRIVATE conv_util) diff --git a/example/07_conv2d_fwd_bias_relu_add/conv2d_fwd_xdl_bias_relu_add.cpp b/example/07_conv2d_fwd_bias_relu_add/conv2d_fwd_xdl_bias_relu_add.cpp index e6339fcd23..53d882778a 100644 --- a/example/07_conv2d_fwd_bias_relu_add/conv2d_fwd_xdl_bias_relu_add.cpp +++ b/example/07_conv2d_fwd_bias_relu_add/conv2d_fwd_xdl_bias_relu_add.cpp @@ -7,7 +7,7 @@ #include "check_err.hpp" #include "config.hpp" -#include "conv_fwd_util.hpp" +#include "conv_util.hpp" #include "device.hpp" #include "device_conv2d_fwd_xdl_c_shuffle_bias_activation_add_nhwc_kyxc_nhwk.hpp" #include "device_tensor.hpp" @@ -90,7 +90,7 @@ void PrintUseMsg() { std::cout << "arg1: verification (0=no, 1=yes)\n" << "arg2: initialization (0=no init, 1=integer value, 2=decimal value)\n" - << "arg3: run kernel # of times (>1)\n" + << "arg3: time kernel (0=n0, 1=yes)\n" << "Following arguments:\n" << " N, K, C, \n" << " , (ie Y, X for 2D)\n" @@ -117,40 +117,40 @@ ck::utils::conv::ConvParams ParseConvParams(int argc, char* argv[]) ck::utils::conv::ConvParams params; int arg_idx = 4; - params.num_dim_spatial = num_dim_spatial; - params.N = std::stoi(argv[arg_idx++]); - params.K = std::stoi(argv[arg_idx++]); - params.C = std::stoi(argv[arg_idx++]); + params.num_dim_spatial_ = num_dim_spatial; + params.N_ = std::stoi(argv[arg_idx++]); + params.K_ = std::stoi(argv[arg_idx++]); + params.C_ = std::stoi(argv[arg_idx++]); - params.filter_spatial_lengths.resize(num_dim_spatial); + params.filter_spatial_lengths_.resize(num_dim_spatial); for(int i = 0; i < num_dim_spatial; ++i) { - params.filter_spatial_lengths[i] = std::stoi(argv[arg_idx++]); + params.filter_spatial_lengths_[i] = std::stoi(argv[arg_idx++]); } - params.input_spatial_lengths.resize(num_dim_spatial); + params.input_spatial_lengths_.resize(num_dim_spatial); for(int i = 0; i < num_dim_spatial; ++i) { - params.input_spatial_lengths[i] = std::stoi(argv[arg_idx++]); + params.input_spatial_lengths_[i] = std::stoi(argv[arg_idx++]); } - params.conv_filter_strides.resize(num_dim_spatial); + params.conv_filter_strides_.resize(num_dim_spatial); for(int i = 0; i < num_dim_spatial; ++i) { - params.conv_filter_strides[i] = std::stoi(argv[arg_idx++]); + params.conv_filter_strides_[i] = std::stoi(argv[arg_idx++]); } - params.conv_filter_dilations.resize(num_dim_spatial); + params.conv_filter_dilations_.resize(num_dim_spatial); for(int i = 0; i < num_dim_spatial; ++i) { - params.conv_filter_dilations[i] = std::stoi(argv[arg_idx++]); + params.conv_filter_dilations_[i] = std::stoi(argv[arg_idx++]); } - params.input_left_pads.resize(num_dim_spatial); + params.input_left_pads_.resize(num_dim_spatial); for(int i = 0; i < num_dim_spatial; ++i) { - params.input_left_pads[i] = std::stoi(argv[arg_idx++]); + params.input_left_pads_[i] = std::stoi(argv[arg_idx++]); } - params.input_right_pads.resize(num_dim_spatial); + params.input_right_pads_.resize(num_dim_spatial); for(int i = 0; i < num_dim_spatial; ++i) { - params.input_right_pads[i] = std::stoi(argv[arg_idx++]); + params.input_right_pads_[i] = std::stoi(argv[arg_idx++]); } return params; @@ -162,9 +162,9 @@ int main(int argc, char* argv[]) { using namespace ck::utils::conv; - bool do_verification = 0; - int init_method = 0; - int nrepeat = 5; + bool do_verification = true; + int init_method = 1; + bool time_kernel = false; const int num_dim_spatial = 2; ck::utils::conv::ConvParams params; @@ -173,7 +173,7 @@ int main(int argc, char* argv[]) { do_verification = std::stoi(argv[1]); init_method = std::stoi(argv[2]); - nrepeat = std::stoi(argv[3]); + time_kernel = std::stoi(argv[3]); } if(argc >= 5) @@ -181,21 +181,21 @@ int main(int argc, char* argv[]) params = ParseConvParams(argc, argv); } - std::vector input_dims{static_cast(params.N), - static_cast(params.C)}; + std::vector input_dims{static_cast(params.N_), + static_cast(params.C_)}; input_dims.insert(std::end(input_dims), - std::begin(params.input_spatial_lengths), - std::end(params.input_spatial_lengths)); + std::begin(params.input_spatial_lengths_), + std::end(params.input_spatial_lengths_)); - std::vector filter_dims{static_cast(params.K), - static_cast(params.C)}; + std::vector filter_dims{static_cast(params.K_), + static_cast(params.C_)}; filter_dims.insert(std::end(filter_dims), - std::begin(params.filter_spatial_lengths), - std::end(params.filter_spatial_lengths)); + std::begin(params.filter_spatial_lengths_), + std::end(params.filter_spatial_lengths_)); const std::vector& output_spatial_lengths = params.GetOutputSpatialLengths(); - std::vector output_dims{static_cast(params.N), - static_cast(params.K)}; + std::vector output_dims{static_cast(params.N_), + static_cast(params.K_)}; output_dims.insert(std::end(output_dims), std::begin(output_spatial_lengths), std::end(output_spatial_lengths)); @@ -209,7 +209,7 @@ int main(int argc, char* argv[]) // bias: assume contiguous 1d vector Tensor bias( - HostTensorDescriptor(std::vector({static_cast(params.K)}))); + HostTensorDescriptor(std::vector({static_cast(params.K_)}))); // residual: assume same layout as output tensor Tensor residual(get_output_host_tensor_descriptor(output_dims, num_dim_spatial)); @@ -259,16 +259,16 @@ int main(int argc, char* argv[]) static_cast(out_device_buf.GetDeviceBuffer()), static_cast(bias_device_buf.GetDeviceBuffer()), static_cast(resi_device_buf.GetDeviceBuffer()), - params.N, - params.K, - params.C, - params.input_spatial_lengths, - params.filter_spatial_lengths, + params.N_, + params.K_, + params.C_, + params.input_spatial_lengths_, + params.filter_spatial_lengths_, output_spatial_lengths, - params.conv_filter_strides, - params.conv_filter_dilations, - params.input_left_pads, - params.input_right_pads, + params.conv_filter_strides_, + params.conv_filter_dilations_, + params.input_left_pads_, + params.input_right_pads_, in_element_op, wei_element_op, out_element_op); @@ -280,20 +280,20 @@ int main(int argc, char* argv[]) "not support this problem"); } - float ave_time = invoker.Run(argument, nrepeat); + float ave_time = invoker.Run(argument, StreamConfig{nullptr, time_kernel}); std::size_t flop = get_flops( - params.N, params.C, params.K, params.filter_spatial_lengths, output_spatial_lengths); + params.N_, params.C_, params.K_, params.filter_spatial_lengths_, output_spatial_lengths); std::size_t num_btype = - get_btype(params.N, - params.C, - params.K, - params.input_spatial_lengths, - params.filter_spatial_lengths, + get_btype(params.N_, + params.C_, + params.K_, + params.input_spatial_lengths_, + params.filter_spatial_lengths_, output_spatial_lengths) + - sizeof(OutDataType) * (params.K) + + sizeof(OutDataType) * (params.K_) + sizeof(OutDataType) * - (params.N * params.K * output_spatial_lengths[0] * output_spatial_lengths[1]); + (params.N_ * params.K_ * output_spatial_lengths[0] * output_spatial_lengths[1]); float tflops = static_cast(flop) / 1.E9 / ave_time; float gb_per_sec = num_btype / 1.E6 / ave_time; @@ -310,17 +310,18 @@ int main(int argc, char* argv[]) host_output, bias, residual, - params.conv_filter_strides, - params.conv_filter_dilations, - params.input_left_pads, - params.input_right_pads, + params.conv_filter_strides_, + params.conv_filter_dilations_, + params.input_left_pads_, + params.input_right_pads_, in_element_op, wei_element_op, out_element_op); ref_invoker.Run(ref_argument); out_device_buf.FromDevice(device_output.mData.data()); - ck::utils::check_err( - host_output.mData, device_output.mData, "Error: incorrect results!", 1e-5f, 1e-4f); + return ck::utils::check_err(device_output.mData, host_output.mData) ? 0 : 1; } + + return 0; } diff --git a/example/09_convnd_fwd/CMakeLists.txt b/example/09_convnd_fwd/CMakeLists.txt index f602862a04..ceceb4aedc 100644 --- a/example/09_convnd_fwd/CMakeLists.txt +++ b/example/09_convnd_fwd/CMakeLists.txt @@ -1,6 +1,6 @@ -add_example_executable(example_convnd_fwd_xdl convnd_fwd_xdl.cpp) -target_link_libraries(example_convnd_fwd_xdl PRIVATE conv_fwd_util) +add_example_executable(example_convnd_fwd_xdl_fp32 convnd_fwd_xdl_fp32.cpp) add_example_executable(example_convnd_fwd_xdl_int8 convnd_fwd_xdl_int8.cpp) -target_link_libraries(example_convnd_fwd_xdl_int8 PRIVATE conv_fwd_util) add_example_executable(example_convnd_fwd_xdl_fp16 convnd_fwd_xdl_fp16.cpp) -target_link_libraries(example_convnd_fwd_xdl_fp16 PRIVATE conv_fwd_util) +target_link_libraries(example_convnd_fwd_xdl_fp32 PRIVATE conv_util) +target_link_libraries(example_convnd_fwd_xdl_int8 PRIVATE conv_util) +target_link_libraries(example_convnd_fwd_xdl_fp16 PRIVATE conv_util) diff --git a/example/09_convnd_fwd/convnd_fwd_xdl_fp16.cpp b/example/09_convnd_fwd/convnd_fwd_xdl_fp16.cpp index eaa5683978..7ad83d5ad6 100644 --- a/example/09_convnd_fwd/convnd_fwd_xdl_fp16.cpp +++ b/example/09_convnd_fwd/convnd_fwd_xdl_fp16.cpp @@ -5,7 +5,7 @@ #include "check_err.hpp" #include "config.hpp" -#include "conv_fwd_util.hpp" +#include "conv_util.hpp" #include "device.hpp" #include "device_tensor.hpp" #include "device_convnd_fwd_xdl_nhwc_kyxc_nhwk.hpp" @@ -43,10 +43,10 @@ template using DeviceConvNDFwdInstance = ck::tensor_operation::device:: DeviceConvNDFwdXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K< // clang-format off - InDataType, // + InDataType, // WeiDataType, // OutDataType, // - AccDataType, // + AccDataType, // InElementOp, // Input Elementwise Operation WeiElementOp, // Weights Elementwise Operation OutElementOp, // Output Elementwise Operation @@ -110,7 +110,7 @@ void print_use_msg() { std::cout << "arg1: verification (0=no, 1=yes)\n" << "arg2: initialization (0=no init, 1=integer value, 2=decimal value)\n" - << "arg3: run kernel # of times (>1)\n" + << "arg3: time kernel (0=n0, 1=yes)\n" << "arg4: N spatial dimensions (default 2)\n" << "Following arguments (depending on number of spatial dims):\n" << " N, K, C, \n" @@ -137,40 +137,40 @@ ck::utils::conv::ConvParams parse_conv_params(int num_dim_spatial, int argc, cha ck::utils::conv::ConvParams params; int arg_idx = 5; - params.num_dim_spatial = num_dim_spatial; - params.N = std::stoi(argv[arg_idx++]); - params.K = std::stoi(argv[arg_idx++]); - params.C = std::stoi(argv[arg_idx++]); + params.num_dim_spatial_ = num_dim_spatial; + params.N_ = std::stoi(argv[arg_idx++]); + params.K_ = std::stoi(argv[arg_idx++]); + params.C_ = std::stoi(argv[arg_idx++]); - params.filter_spatial_lengths.resize(num_dim_spatial); + params.filter_spatial_lengths_.resize(num_dim_spatial); for(int i = 0; i < num_dim_spatial; ++i) { - params.filter_spatial_lengths[i] = std::stoi(argv[arg_idx++]); + params.filter_spatial_lengths_[i] = std::stoi(argv[arg_idx++]); } - params.input_spatial_lengths.resize(num_dim_spatial); + params.input_spatial_lengths_.resize(num_dim_spatial); for(int i = 0; i < num_dim_spatial; ++i) { - params.input_spatial_lengths[i] = std::stoi(argv[arg_idx++]); + params.input_spatial_lengths_[i] = std::stoi(argv[arg_idx++]); } - params.conv_filter_strides.resize(num_dim_spatial); + params.conv_filter_strides_.resize(num_dim_spatial); for(int i = 0; i < num_dim_spatial; ++i) { - params.conv_filter_strides[i] = std::stoi(argv[arg_idx++]); + params.conv_filter_strides_[i] = std::stoi(argv[arg_idx++]); } - params.conv_filter_dilations.resize(num_dim_spatial); + params.conv_filter_dilations_.resize(num_dim_spatial); for(int i = 0; i < num_dim_spatial; ++i) { - params.conv_filter_dilations[i] = std::stoi(argv[arg_idx++]); + params.conv_filter_dilations_[i] = std::stoi(argv[arg_idx++]); } - params.input_left_pads.resize(num_dim_spatial); + params.input_left_pads_.resize(num_dim_spatial); for(int i = 0; i < num_dim_spatial; ++i) { - params.input_left_pads[i] = std::stoi(argv[arg_idx++]); + params.input_left_pads_[i] = std::stoi(argv[arg_idx++]); } - params.input_right_pads.resize(num_dim_spatial); + params.input_right_pads_.resize(num_dim_spatial); for(int i = 0; i < num_dim_spatial; ++i) { - params.input_right_pads[i] = std::stoi(argv[arg_idx++]); + params.input_right_pads_[i] = std::stoi(argv[arg_idx++]); } return params; @@ -182,9 +182,9 @@ int main(int argc, char* argv[]) { using namespace ck::utils::conv; - bool do_verification = 0; - int init_method = 0; - int nrepeat = 5; + bool do_verification = true; + int init_method = 1; + bool time_kernel = false; int num_dim_spatial = 2; ck::utils::conv::ConvParams params; @@ -193,7 +193,7 @@ int main(int argc, char* argv[]) { do_verification = std::stoi(argv[1]); init_method = std::stoi(argv[2]); - nrepeat = std::stoi(argv[3]); + time_kernel = std::stoi(argv[3]); num_dim_spatial = std::stoi(argv[4]); } @@ -202,21 +202,21 @@ int main(int argc, char* argv[]) params = parse_conv_params(num_dim_spatial, argc, argv); } - std::vector input_dims{static_cast(params.N), - static_cast(params.C)}; + std::vector input_dims{static_cast(params.N_), + static_cast(params.C_)}; input_dims.insert(std::end(input_dims), - std::begin(params.input_spatial_lengths), - std::end(params.input_spatial_lengths)); + std::begin(params.input_spatial_lengths_), + std::end(params.input_spatial_lengths_)); - std::vector filter_dims{static_cast(params.K), - static_cast(params.C)}; + std::vector filter_dims{static_cast(params.K_), + static_cast(params.C_)}; filter_dims.insert(std::end(filter_dims), - std::begin(params.filter_spatial_lengths), - std::end(params.filter_spatial_lengths)); + std::begin(params.filter_spatial_lengths_), + std::end(params.filter_spatial_lengths_)); const std::vector& output_spatial_lengths = params.GetOutputSpatialLengths(); - std::vector output_dims{static_cast(params.N), - static_cast(params.K)}; + std::vector output_dims{static_cast(params.N_), + static_cast(params.K_)}; output_dims.insert(std::end(output_dims), std::begin(output_spatial_lengths), std::end(output_spatial_lengths)); @@ -256,16 +256,16 @@ int main(int argc, char* argv[]) conv->MakeArgumentPointer(static_cast(in_device_buf.GetDeviceBuffer()), static_cast(wei_device_buf.GetDeviceBuffer()), static_cast(out_device_buf.GetDeviceBuffer()), - params.N, - params.K, - params.C, - params.input_spatial_lengths, - params.filter_spatial_lengths, + params.N_, + params.K_, + params.C_, + params.input_spatial_lengths_, + params.filter_spatial_lengths_, output_spatial_lengths, - params.conv_filter_strides, - params.conv_filter_dilations, - params.input_left_pads, - params.input_right_pads, + params.conv_filter_strides_, + params.conv_filter_dilations_, + params.input_left_pads_, + params.input_right_pads_, InElementOp{}, WeiElementOp{}, OutElementOp{}); @@ -277,16 +277,16 @@ int main(int argc, char* argv[]) "not support this Conv problem"); } - float ave_time = invoker->Run(argument.get(), nrepeat); + float ave_time = invoker->Run(argument.get(), StreamConfig{nullptr, time_kernel}); std::size_t flop = get_flops( - params.N, params.C, params.K, params.filter_spatial_lengths, output_spatial_lengths); + params.N_, params.C_, params.K_, params.filter_spatial_lengths_, output_spatial_lengths); std::size_t num_btype = get_btype( - params.N, - params.C, - params.K, - params.input_spatial_lengths, - params.filter_spatial_lengths, + params.N_, + params.C_, + params.K_, + params.input_spatial_lengths_, + params.filter_spatial_lengths_, output_spatial_lengths); float tflops = static_cast(flop) / 1.E9 / ave_time; @@ -302,18 +302,18 @@ int main(int argc, char* argv[]) auto ref_argument = ref_conv.MakeArgument(input, weights, host_output, - params.conv_filter_strides, - params.conv_filter_dilations, - params.input_left_pads, - params.input_right_pads, + params.conv_filter_strides_, + params.conv_filter_dilations_, + params.input_left_pads_, + params.input_right_pads_, InElementOp{}, WeiElementOp{}, OutElementOp{}); ref_invoker.Run(ref_argument); out_device_buf.FromDevice(device_output.mData.data()); - ck::utils::check_err( - host_output.mData, device_output.mData, "Error: incorrect results!", 1e-5f, 1e-4f); + return ck::utils::check_err( + host_output.mData, device_output.mData, "Error: incorrect results!", 1e-5f, 1e-4f) ? 0 : 1; }; switch(num_dim_spatial) @@ -338,4 +338,5 @@ int main(int argc, char* argv[]) } } } + return 0; } diff --git a/example/09_convnd_fwd/convnd_fwd_xdl.cpp b/example/09_convnd_fwd/convnd_fwd_xdl_fp32.cpp similarity index 80% rename from example/09_convnd_fwd/convnd_fwd_xdl.cpp rename to example/09_convnd_fwd/convnd_fwd_xdl_fp32.cpp index e8895b8639..8a9633d84a 100644 --- a/example/09_convnd_fwd/convnd_fwd_xdl.cpp +++ b/example/09_convnd_fwd/convnd_fwd_xdl_fp32.cpp @@ -5,7 +5,7 @@ #include "check_err.hpp" #include "config.hpp" -#include "conv_fwd_util.hpp" +#include "conv_util.hpp" #include "device.hpp" #include "device_tensor.hpp" #include "device_convnd_fwd_xdl_nhwc_kyxc_nhwk.hpp" @@ -39,10 +39,10 @@ template using DeviceConvNDFwdInstance = ck::tensor_operation::device:: DeviceConvNDFwdXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K< // clang-format off - InDataType, // + InDataType, // WeiDataType, // OutDataType, // - AccDataType, // + AccDataType, // InElementOp, // Input Elementwise Operation WeiElementOp, // Weights Elementwise Operation OutElementOp, // Output Elementwise Operation @@ -107,7 +107,7 @@ void print_use_msg() { std::cout << "arg1: verification (0=no, 1=yes)\n" << "arg2: initialization (0=no init, 1=integer value, 2=decimal value)\n" - << "arg3: run kernel # of times (>1)\n" + << "arg3: time kernel (0=n0, 1=yes)\n" << "arg4: N spatial dimensions (default 2)\n" << "Following arguments (depending on number of spatial dims):\n" << " N, K, C, \n" @@ -134,40 +134,40 @@ ck::utils::conv::ConvParams parse_conv_params(int num_dim_spatial, int argc, cha ck::utils::conv::ConvParams params; int arg_idx = 5; - params.num_dim_spatial = num_dim_spatial; - params.N = std::stoi(argv[arg_idx++]); - params.K = std::stoi(argv[arg_idx++]); - params.C = std::stoi(argv[arg_idx++]); + params.num_dim_spatial_ = num_dim_spatial; + params.N_ = std::stoi(argv[arg_idx++]); + params.K_ = std::stoi(argv[arg_idx++]); + params.C_ = std::stoi(argv[arg_idx++]); - params.filter_spatial_lengths.resize(num_dim_spatial); + params.filter_spatial_lengths_.resize(num_dim_spatial); for(int i = 0; i < num_dim_spatial; ++i) { - params.filter_spatial_lengths[i] = std::stoi(argv[arg_idx++]); + params.filter_spatial_lengths_[i] = std::stoi(argv[arg_idx++]); } - params.input_spatial_lengths.resize(num_dim_spatial); + params.input_spatial_lengths_.resize(num_dim_spatial); for(int i = 0; i < num_dim_spatial; ++i) { - params.input_spatial_lengths[i] = std::stoi(argv[arg_idx++]); + params.input_spatial_lengths_[i] = std::stoi(argv[arg_idx++]); } - params.conv_filter_strides.resize(num_dim_spatial); + params.conv_filter_strides_.resize(num_dim_spatial); for(int i = 0; i < num_dim_spatial; ++i) { - params.conv_filter_strides[i] = std::stoi(argv[arg_idx++]); + params.conv_filter_strides_[i] = std::stoi(argv[arg_idx++]); } - params.conv_filter_dilations.resize(num_dim_spatial); + params.conv_filter_dilations_.resize(num_dim_spatial); for(int i = 0; i < num_dim_spatial; ++i) { - params.conv_filter_dilations[i] = std::stoi(argv[arg_idx++]); + params.conv_filter_dilations_[i] = std::stoi(argv[arg_idx++]); } - params.input_left_pads.resize(num_dim_spatial); + params.input_left_pads_.resize(num_dim_spatial); for(int i = 0; i < num_dim_spatial; ++i) { - params.input_left_pads[i] = std::stoi(argv[arg_idx++]); + params.input_left_pads_[i] = std::stoi(argv[arg_idx++]); } - params.input_right_pads.resize(num_dim_spatial); + params.input_right_pads_.resize(num_dim_spatial); for(int i = 0; i < num_dim_spatial; ++i) { - params.input_right_pads[i] = std::stoi(argv[arg_idx++]); + params.input_right_pads_[i] = std::stoi(argv[arg_idx++]); } return params; @@ -179,9 +179,9 @@ int main(int argc, char* argv[]) { using namespace ck::utils::conv; - bool do_verification = 0; - int init_method = 0; - int nrepeat = 5; + bool do_verification = true; + int init_method = 1; + bool time_kernel = false; int num_dim_spatial = 2; ck::utils::conv::ConvParams params; @@ -190,7 +190,7 @@ int main(int argc, char* argv[]) { do_verification = std::stoi(argv[1]); init_method = std::stoi(argv[2]); - nrepeat = std::stoi(argv[3]); + time_kernel = std::stoi(argv[3]); num_dim_spatial = std::stoi(argv[4]); } @@ -199,21 +199,21 @@ int main(int argc, char* argv[]) params = parse_conv_params(num_dim_spatial, argc, argv); } - std::vector input_dims{static_cast(params.N), - static_cast(params.C)}; + std::vector input_dims{static_cast(params.N_), + static_cast(params.C_)}; input_dims.insert(std::end(input_dims), - std::begin(params.input_spatial_lengths), - std::end(params.input_spatial_lengths)); + std::begin(params.input_spatial_lengths_), + std::end(params.input_spatial_lengths_)); - std::vector filter_dims{static_cast(params.K), - static_cast(params.C)}; + std::vector filter_dims{static_cast(params.K_), + static_cast(params.C_)}; filter_dims.insert(std::end(filter_dims), - std::begin(params.filter_spatial_lengths), - std::end(params.filter_spatial_lengths)); + std::begin(params.filter_spatial_lengths_), + std::end(params.filter_spatial_lengths_)); const std::vector& output_spatial_lengths = params.GetOutputSpatialLengths(); - std::vector output_dims{static_cast(params.N), - static_cast(params.K)}; + std::vector output_dims{static_cast(params.N_), + static_cast(params.K_)}; output_dims.insert(std::end(output_dims), std::begin(output_spatial_lengths), std::end(output_spatial_lengths)); @@ -255,16 +255,16 @@ int main(int argc, char* argv[]) conv->MakeArgumentPointer(static_cast(in_device_buf.GetDeviceBuffer()), static_cast(wei_device_buf.GetDeviceBuffer()), static_cast(out_device_buf.GetDeviceBuffer()), - params.N, - params.K, - params.C, - params.input_spatial_lengths, - params.filter_spatial_lengths, + params.N_, + params.K_, + params.C_, + params.input_spatial_lengths_, + params.filter_spatial_lengths_, output_spatial_lengths, - params.conv_filter_strides, - params.conv_filter_dilations, - params.input_left_pads, - params.input_right_pads, + params.conv_filter_strides_, + params.conv_filter_dilations_, + params.input_left_pads_, + params.input_right_pads_, InElementOp{}, WeiElementOp{}, OutElementOp{}); @@ -276,16 +276,16 @@ int main(int argc, char* argv[]) "not support this Conv problem"); } - float ave_time = invoker->Run(argument.get(), nrepeat); + float ave_time = invoker->Run(argument.get(), StreamConfig{nullptr, time_kernel}); std::size_t flop = get_flops( - params.N, params.C, params.K, params.filter_spatial_lengths, output_spatial_lengths); + params.N_, params.C_, params.K_, params.filter_spatial_lengths_, output_spatial_lengths); std::size_t num_btype = - get_btype(params.N, - params.C, - params.K, - params.input_spatial_lengths, - params.filter_spatial_lengths, + get_btype(params.N_, + params.C_, + params.K_, + params.input_spatial_lengths_, + params.filter_spatial_lengths_, output_spatial_lengths); float tflops = static_cast(flop) / 1.E9 / ave_time; @@ -301,18 +301,23 @@ int main(int argc, char* argv[]) auto ref_argument = ref_conv.MakeArgument(input, weights, host_output, - params.conv_filter_strides, - params.conv_filter_dilations, - params.input_left_pads, - params.input_right_pads, + params.conv_filter_strides_, + params.conv_filter_dilations_, + params.input_left_pads_, + params.input_right_pads_, InElementOp{}, WeiElementOp{}, OutElementOp{}); ref_invoker.Run(ref_argument); out_device_buf.FromDevice(device_output.mData.data()); - ck::utils::check_err( - host_output.mData, device_output.mData, "Error: incorrect results!", 1e-5f, 1e-4f); + return ck::utils::check_err(device_output.mData, + host_output.mData, + "Error: incorrect results!", + 1e-5f, + 1e-4f) + ? 0 + : 1; }; switch(num_dim_spatial) @@ -337,4 +342,5 @@ int main(int argc, char* argv[]) } } } + return 0; } diff --git a/example/09_convnd_fwd/convnd_fwd_xdl_int8.cpp b/example/09_convnd_fwd/convnd_fwd_xdl_int8.cpp index 34b4645770..f196d27182 100644 --- a/example/09_convnd_fwd/convnd_fwd_xdl_int8.cpp +++ b/example/09_convnd_fwd/convnd_fwd_xdl_int8.cpp @@ -5,7 +5,7 @@ #include "check_err.hpp" #include "config.hpp" -#include "conv_fwd_util.hpp" +#include "conv_util.hpp" #include "device.hpp" #include "device_tensor.hpp" #include "device_convnd_fwd_xdl_nhwc_kyxc_nhwk.hpp" @@ -45,10 +45,10 @@ template using DeviceConvNDFwdInstance = ck::tensor_operation::device:: DeviceConvNDFwdXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K< // clang-format off - InDataType, // + InDataType, // WeiDataType, // OutDataType, // - AccDataType, // + AccDataType, // InElementOp, // Input Elementwise Operation WeiElementOp, // Weights Elementwise Operation OutElementOp, // Output Elementwise Operation @@ -112,7 +112,7 @@ void print_use_msg() { std::cout << "arg1: verification (0=no, 1=yes)\n" << "arg2: initialization (0=no init, 1=integer value, 2=decimal value)\n" - << "arg3: run kernel # of times (>1)\n" + << "arg3: time kernel (0=n0, 1=yes)\n" << "arg4: N spatial dimensions (default 2)\n" << "Following arguments (depending on number of spatial dims):\n" << " N, K, C, \n" @@ -139,40 +139,40 @@ ck::utils::conv::ConvParams parse_conv_params(int num_dim_spatial, int argc, cha ck::utils::conv::ConvParams params; int arg_idx = 5; - params.num_dim_spatial = num_dim_spatial; - params.N = std::stoi(argv[arg_idx++]); - params.K = std::stoi(argv[arg_idx++]); - params.C = std::stoi(argv[arg_idx++]); + params.num_dim_spatial_ = num_dim_spatial; + params.N_ = std::stoi(argv[arg_idx++]); + params.K_ = std::stoi(argv[arg_idx++]); + params.C_ = std::stoi(argv[arg_idx++]); - params.filter_spatial_lengths.resize(num_dim_spatial); + params.filter_spatial_lengths_.resize(num_dim_spatial); for(int i = 0; i < num_dim_spatial; ++i) { - params.filter_spatial_lengths[i] = std::stoi(argv[arg_idx++]); + params.filter_spatial_lengths_[i] = std::stoi(argv[arg_idx++]); } - params.input_spatial_lengths.resize(num_dim_spatial); + params.input_spatial_lengths_.resize(num_dim_spatial); for(int i = 0; i < num_dim_spatial; ++i) { - params.input_spatial_lengths[i] = std::stoi(argv[arg_idx++]); + params.input_spatial_lengths_[i] = std::stoi(argv[arg_idx++]); } - params.conv_filter_strides.resize(num_dim_spatial); + params.conv_filter_strides_.resize(num_dim_spatial); for(int i = 0; i < num_dim_spatial; ++i) { - params.conv_filter_strides[i] = std::stoi(argv[arg_idx++]); + params.conv_filter_strides_[i] = std::stoi(argv[arg_idx++]); } - params.conv_filter_dilations.resize(num_dim_spatial); + params.conv_filter_dilations_.resize(num_dim_spatial); for(int i = 0; i < num_dim_spatial; ++i) { - params.conv_filter_dilations[i] = std::stoi(argv[arg_idx++]); + params.conv_filter_dilations_[i] = std::stoi(argv[arg_idx++]); } - params.input_left_pads.resize(num_dim_spatial); + params.input_left_pads_.resize(num_dim_spatial); for(int i = 0; i < num_dim_spatial; ++i) { - params.input_left_pads[i] = std::stoi(argv[arg_idx++]); + params.input_left_pads_[i] = std::stoi(argv[arg_idx++]); } - params.input_right_pads.resize(num_dim_spatial); + params.input_right_pads_.resize(num_dim_spatial); for(int i = 0; i < num_dim_spatial; ++i) { - params.input_right_pads[i] = std::stoi(argv[arg_idx++]); + params.input_right_pads_[i] = std::stoi(argv[arg_idx++]); } return params; @@ -184,9 +184,9 @@ int main(int argc, char* argv[]) { using namespace ck::utils::conv; - bool do_verification = 0; - int init_method = 0; - int nrepeat = 5; + bool do_verification = true; + int init_method = 1; + bool time_kernel = false; int num_dim_spatial = 2; ck::utils::conv::ConvParams params; @@ -195,7 +195,7 @@ int main(int argc, char* argv[]) { do_verification = std::stoi(argv[1]); init_method = std::stoi(argv[2]); - nrepeat = std::stoi(argv[3]); + time_kernel = std::stoi(argv[3]); num_dim_spatial = std::stoi(argv[4]); } @@ -204,21 +204,21 @@ int main(int argc, char* argv[]) params = parse_conv_params(num_dim_spatial, argc, argv); } - std::vector input_dims{static_cast(params.N), - static_cast(params.C)}; + std::vector input_dims{static_cast(params.N_), + static_cast(params.C_)}; input_dims.insert(std::end(input_dims), - std::begin(params.input_spatial_lengths), - std::end(params.input_spatial_lengths)); + std::begin(params.input_spatial_lengths_), + std::end(params.input_spatial_lengths_)); - std::vector filter_dims{static_cast(params.K), - static_cast(params.C)}; + std::vector filter_dims{static_cast(params.K_), + static_cast(params.C_)}; filter_dims.insert(std::end(filter_dims), - std::begin(params.filter_spatial_lengths), - std::end(params.filter_spatial_lengths)); + std::begin(params.filter_spatial_lengths_), + std::end(params.filter_spatial_lengths_)); const std::vector& output_spatial_lengths = params.GetOutputSpatialLengths(); - std::vector output_dims{static_cast(params.N), - static_cast(params.K)}; + std::vector output_dims{static_cast(params.N_), + static_cast(params.K_)}; output_dims.insert(std::end(output_dims), std::begin(output_spatial_lengths), std::end(output_spatial_lengths)); @@ -258,16 +258,16 @@ int main(int argc, char* argv[]) conv->MakeArgumentPointer(static_cast(in_device_buf.GetDeviceBuffer()), static_cast(wei_device_buf.GetDeviceBuffer()), static_cast(out_device_buf.GetDeviceBuffer()), - params.N, - params.K, - params.C, - params.input_spatial_lengths, - params.filter_spatial_lengths, + params.N_, + params.K_, + params.C_, + params.input_spatial_lengths_, + params.filter_spatial_lengths_, output_spatial_lengths, - params.conv_filter_strides, - params.conv_filter_dilations, - params.input_left_pads, - params.input_right_pads, + params.conv_filter_strides_, + params.conv_filter_dilations_, + params.input_left_pads_, + params.input_right_pads_, InElementOp{}, WeiElementOp{}, OutElementOp{}); @@ -279,16 +279,16 @@ int main(int argc, char* argv[]) "not support this Conv problem"); } - float ave_time = invoker->Run(argument.get(), nrepeat); + float ave_time = invoker->Run(argument.get(), StreamConfig{nullptr, time_kernel}); std::size_t flop = get_flops( - params.N, params.C, params.K, params.filter_spatial_lengths, output_spatial_lengths); + params.N_, params.C_, params.K_, params.filter_spatial_lengths_, output_spatial_lengths); std::size_t num_btype = get_btype( - params.N, - params.C, - params.K, - params.input_spatial_lengths, - params.filter_spatial_lengths, + params.N_, + params.C_, + params.K_, + params.input_spatial_lengths_, + params.filter_spatial_lengths_, output_spatial_lengths); float tflops = static_cast(flop) / 1.E9 / ave_time; @@ -304,18 +304,18 @@ int main(int argc, char* argv[]) auto ref_argument = ref_conv.MakeArgument(input, weights, host_output, - params.conv_filter_strides, - params.conv_filter_dilations, - params.input_left_pads, - params.input_right_pads, + params.conv_filter_strides_, + params.conv_filter_dilations_, + params.input_left_pads_, + params.input_right_pads_, InElementOp{}, WeiElementOp{}, OutElementOp{}); ref_invoker.Run(ref_argument); out_device_buf.FromDevice(device_output.mData.data()); - ck::utils::check_err( - host_output.mData, device_output.mData, "Error: incorrect results!", 1e-5f, 1e-4f); + return ck::utils::check_err( + host_output.mData, device_output.mData, "Error: incorrect results!", 1e-5f, 1e-4f) ? 0 : 1; }; switch(num_dim_spatial) @@ -340,4 +340,5 @@ int main(int argc, char* argv[]) } } } + return 0; } diff --git a/example/10_conv2d_bwd_data/CMakeLists.txt b/example/10_conv2d_bwd_data/CMakeLists.txt index f300bc9645..17aca1481b 100644 --- a/example/10_conv2d_bwd_data/CMakeLists.txt +++ b/example/10_conv2d_bwd_data/CMakeLists.txt @@ -1,2 +1,2 @@ add_example_executable(example_conv2d_bwd_data_xdl conv2d_bwd_data_xdl.cpp) -target_link_libraries(example_conv2d_bwd_data_xdl PRIVATE conv_fwd_util) +target_link_libraries(example_conv2d_bwd_data_xdl PRIVATE conv_util) diff --git a/example/10_conv2d_bwd_data/conv2d_bwd_data_xdl.cpp b/example/10_conv2d_bwd_data/conv2d_bwd_data_xdl.cpp index f3f9b497f5..2d25f5ac2f 100644 --- a/example/10_conv2d_bwd_data/conv2d_bwd_data_xdl.cpp +++ b/example/10_conv2d_bwd_data/conv2d_bwd_data_xdl.cpp @@ -77,9 +77,9 @@ using ReferenceConvBwdInstance = ck::tensor_operation::host::ReferenceConvBwdDat int main(int argc, char* argv[]) { - bool do_verification = 0; - int init_method = 0; - int nrepeat = 5; + bool do_verification = true; + int init_method = 1; + bool time_kernel = false; // Conv shape ck::index_t N = 128; @@ -102,13 +102,13 @@ int main(int argc, char* argv[]) { do_verification = std::stoi(argv[1]); init_method = std::stoi(argv[2]); - nrepeat = std::stoi(argv[3]); + time_kernel = std::stoi(argv[3]); } else if(argc == 19) { do_verification = std::stoi(argv[1]); init_method = std::stoi(argv[2]); - nrepeat = std::stoi(argv[3]); + time_kernel = std::stoi(argv[3]); N = std::stoi(argv[4]); K = std::stoi(argv[5]); @@ -130,7 +130,7 @@ int main(int argc, char* argv[]) { printf("arg1: verification (0=no, 1=yes)\n"); printf("arg2: initialization (0=no init, 1=integer value, 2=decimal value)\n"); - printf("arg3: run kernel # of times (>1)\n"); + printf("arg3: time kernel (0=n0, 1=yes)\n"); printf("arg4 to 18: N, K, C, Y, X, Hi, Wi, Sy, Sx, Dy, Dx, LeftPy, LeftPx, RightPy, " "RightPx\n"); exit(0); @@ -214,7 +214,7 @@ int main(int argc, char* argv[]) "not support this Conv problem"); } - float ave_time = invoker.Run(argument, nrepeat); + float ave_time = invoker.Run(argument, StreamConfig{nullptr, time_kernel}); std::size_t flop = std::size_t(2) * N * K * Ho * Wo * C * Y * X; @@ -249,6 +249,10 @@ int main(int argc, char* argv[]) in_device_buf.FromDevice(in_n_c_hi_wi_device_result.mData.data()); - ck::utils::check_err(in_n_c_hi_wi_device_result.mData, in_n_c_hi_wi_host_result.mData); + return ck::utils::check_err(in_n_c_hi_wi_device_result.mData, + in_n_c_hi_wi_host_result.mData) + ? 0 + : 1; } + return 0; } diff --git a/example/11_conv2d_bwd_weight/CMakeLists.txt b/example/11_conv2d_bwd_weight/CMakeLists.txt index ff001eab72..3d771b5569 100644 --- a/example/11_conv2d_bwd_weight/CMakeLists.txt +++ b/example/11_conv2d_bwd_weight/CMakeLists.txt @@ -1,2 +1,2 @@ add_example_executable(example_conv2d_bwd_weight_xdl conv2d_bwd_weight_xdl.cpp) -target_link_libraries(example_conv2d_bwd_weight_xdl PRIVATE conv_fwd_util) +target_link_libraries(example_conv2d_bwd_weight_xdl PRIVATE conv_util) diff --git a/example/11_conv2d_bwd_weight/conv2d_bwd_weight_xdl.cpp b/example/11_conv2d_bwd_weight/conv2d_bwd_weight_xdl.cpp index bf78cc87e0..1578161116 100644 --- a/example/11_conv2d_bwd_weight/conv2d_bwd_weight_xdl.cpp +++ b/example/11_conv2d_bwd_weight/conv2d_bwd_weight_xdl.cpp @@ -82,9 +82,9 @@ using ReferenceConvBwdWeightInstance = int main(int argc, char* argv[]) { - bool do_verification = 0; - int init_method = 0; - int nrepeat = 5; + bool do_verification = true; + int init_method = 1; + bool time_kernel = false; int do_log = 0; int split_k = 4; @@ -109,7 +109,7 @@ int main(int argc, char* argv[]) { do_verification = std::stoi(argv[1]); init_method = std::stoi(argv[2]); - nrepeat = std::stoi(argv[3]); + time_kernel = std::stoi(argv[3]); do_log = std::stoi(argv[4]); split_k = std::stoi(argv[5]); } @@ -117,7 +117,7 @@ int main(int argc, char* argv[]) { do_verification = std::stoi(argv[1]); init_method = std::stoi(argv[2]); - nrepeat = std::stoi(argv[3]); + time_kernel = std::stoi(argv[3]); do_log = std::stoi(argv[4]); split_k = std::stoi(argv[5]); @@ -141,7 +141,7 @@ int main(int argc, char* argv[]) { printf("arg1: verification (0=no, 1=yes)\n"); printf("arg2: initialization (0=no init, 1=integer value, 2=decimal value)\n"); - printf("arg3: run kernel # of times (>1)\n"); + printf("arg3: time kernel (0=n0, 1=yes)\n"); printf("arg4: is show log (0=no, 1=yes)\n"); printf("arg5: split-k \n"); printf("arg6 to 19: N, K, C, Y, X, Hi, Wi, Sy, Sx, Dy, Dx, LeftPy, LeftPx, RightPy, " @@ -246,7 +246,7 @@ int main(int argc, char* argv[]) return 1; } - float ave_time = invoker.Run(argument, nrepeat); + float ave_time = invoker.Run(argument, StreamConfig{nullptr, time_kernel}); std::size_t flop = std::size_t(2) * N * K * Ho * Wo * C * Y * X; @@ -291,6 +291,9 @@ int main(int argc, char* argv[]) LogRangeAsType(std::cout << "wei_host : ", wei_k_c_y_x_host_result.mData, ",") << std::endl; } - ck::utils::check_err(wei_k_c_y_x_device_result.mData, wei_k_c_y_x_host_result.mData); + return ck::utils::check_err(wei_k_c_y_x_device_result.mData, wei_k_c_y_x_host_result.mData) + ? 0 + : 1; } + return 0; } diff --git a/example/12_reduce/CMakeLists.txt b/example/12_reduce/CMakeLists.txt index 734c1955d6..d6866abeb8 100644 --- a/example/12_reduce/CMakeLists.txt +++ b/example/12_reduce/CMakeLists.txt @@ -1 +1 @@ -add_example_executable(example_reduce_blockwise reduce_blockwise.cpp) +add_example_executable(example_reduce_blockwise reduce_blockwise.cpp -D 16,64,32,960 -v 1 1 10) diff --git a/example/12_reduce/reduce_blockwise.cpp b/example/12_reduce/reduce_blockwise.cpp index 7ca9823ff5..b2d312ae8c 100644 --- a/example/12_reduce/reduce_blockwise.cpp +++ b/example/12_reduce/reduce_blockwise.cpp @@ -116,10 +116,9 @@ class SimpleAppArgs std::vector inLengths; std::vector scales; - bool do_verification = false; - - int init_method = 1; - int nrepeat = 5; + bool do_verification = true; + int init_method = 1; + bool time_kernel = false; public: void show_usage(const char* cmd) @@ -135,7 +134,7 @@ class SimpleAppArgs std::cout << "Arg1 -- init method (0=no init, 1=single integer value, 2=scope integer " "value, 3=decimal value)" << std::endl; - std::cout << "Arg2 -- number of repeats to run the kernel" << std::endl; + std::cout << "Arg2 -- time kernel (0=n0, 1=yes)" << std::endl; }; int processArgs(int argc, char* argv[]) @@ -182,7 +181,7 @@ class SimpleAppArgs throw std::runtime_error("Invalid cmd-line arguments, more argumetns are needed!"); init_method = std::atoi(argv[optind++]); - nrepeat = std::atoi(argv[optind]); + time_kernel = std::atoi(argv[optind]); if(scales.empty()) { @@ -352,7 +351,7 @@ int main(int argc, char* argv[]) auto invoker_ptr = reduce.MakeInvokerPointer(); - float avg_time = invoker_ptr->Run(argument_ptr.get(), args.nrepeat); + float avg_time = invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, args.time_kernel}); std::size_t num_bytes = invariant_total_length * reduce_total_length * sizeof(InDataType) + invariant_total_length * sizeof(OutDataType); @@ -362,16 +361,17 @@ int main(int argc, char* argv[]) std::cout << "Perf: " << avg_time << " ms, " << gb_per_sec << " GB/s, " << reduce_name << std::endl; + bool pass = true; if(args.do_verification) { out_dev.FromDevice(out.mData.data()); - ck::utils::check_err(out.mData, out_ref.mData); + pass &= ck::utils::check_err(out.mData, out_ref.mData); if(NeedIndices) { out_indices_dev.FromDevice(out_indices.mData.data()); - ck::utils::check_err(out_indices.mData, out_indices_ref.mData); - ; + pass &= ck::utils::check_err(out_indices.mData, out_indices_ref.mData); }; }; + return pass ? 0 : 1; } diff --git a/example/13_pool2d_fwd/pool2d_fwd.cpp b/example/13_pool2d_fwd/pool2d_fwd.cpp index a18761095c..e6749bf8d7 100644 --- a/example/13_pool2d_fwd/pool2d_fwd.cpp +++ b/example/13_pool2d_fwd/pool2d_fwd.cpp @@ -149,9 +149,9 @@ int main(int argc, char* argv[]) { using namespace ck::host_reduce; - bool do_verification = 0; - int init_method = 0; - int nrepeat = 5; + bool do_verification = true; + int init_method = 1; + bool time_kernel = false; // Pool shape ck::index_t N = 128; @@ -171,13 +171,13 @@ int main(int argc, char* argv[]) { do_verification = std::stoi(argv[1]); init_method = std::stoi(argv[2]); - nrepeat = std::stoi(argv[3]); + time_kernel = std::stoi(argv[3]); } else if(argc == 16) { do_verification = std::stoi(argv[1]); init_method = std::stoi(argv[2]); - nrepeat = std::stoi(argv[3]); + time_kernel = std::stoi(argv[3]); N = std::stoi(argv[4]); C = std::stoi(argv[5]); @@ -196,7 +196,7 @@ int main(int argc, char* argv[]) { printf("arg1: verification (0=no, 1=yes)\n"); printf("arg2: initialization (0=no init, 1=integer value, 2=decimal value)\n"); - printf("arg3: run kernel # of times (>1)\n"); + printf("arg3: time kernel (0=n0, 1=yes)\n"); printf("arg4 to 15: N, C, Y, X, Hi, Wi, Sy, Sx, LeftPy, LeftPx, RightPy, " "RightPx\n"); exit(0); @@ -271,7 +271,7 @@ int main(int argc, char* argv[]) "not support this problem"); } - float ave_time = invoker_ptr->Run(argument_ptr.get(), nrepeat); + float ave_time = invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, time_kernel}); std::size_t flop = std::size_t(2) * N * C * Ho * Wo * Y * X; @@ -285,6 +285,7 @@ int main(int argc, char* argv[]) std::cout << "Perf: " << ave_time << " ms, " << tflops << " TFlops, " << gb_per_sec << " GB/s" << std::endl; + bool pass = true; if(do_verification) { pool_host_verify1)\n"); + printf("arg3: time kernel (0=n0, 1=yes)\n"); printf("arg4 to 9: M (256x), N(128x), K(32x), StrideA, StrideB, StrideC\n"); exit(0); } @@ -219,7 +219,7 @@ int main(int argc, char* argv[]) "not support this GEMM problem"); } - float ave_time = invoker.Run(argument, nrepeat); + float ave_time = invoker.Run(argument, StreamConfig{nullptr, time_kernel}); std::size_t flop = std::size_t(2) * M * N * K; std::size_t num_btype = @@ -244,7 +244,7 @@ int main(int argc, char* argv[]) ref_invoker.Run(ref_argument); - ck::utils::check_err(c_m_n_device_result.mData, c_m_n_host_result.mData); + return ck::utils::check_err(c_m_n_device_result.mData, c_m_n_host_result.mData) ? 0 : 1; } return 0; diff --git a/example/15_grouped_gemm/grouped_gemm_xdl_fp16.cpp b/example/15_grouped_gemm/grouped_gemm_xdl_fp16.cpp index 29ef01f2ef..8c3491c8c9 100644 --- a/example/15_grouped_gemm/grouped_gemm_xdl_fp16.cpp +++ b/example/15_grouped_gemm/grouped_gemm_xdl_fp16.cpp @@ -60,21 +60,21 @@ using ReferenceGemmInstance = ck::tensor_operation::host:: int main(int argc, char* argv[]) { - bool do_verification = 0; - int init_method = 0; - int nrepeat = 5; + bool do_verification = true; + int init_method = 1; + bool time_kernel = false; if(argc == 4) { do_verification = std::stoi(argv[1]); init_method = std::stoi(argv[2]); - nrepeat = std::stoi(argv[3]); + time_kernel = std::stoi(argv[3]); } else { printf("arg1: verification (0=no, 1=yes)\n"); printf("arg2: initialization (0=no init, 1=integer value, 2=decimal value)\n"); - printf("arg3: run kernel # of times (>1)\n"); + printf("arg3: time kernel (0=n0, 1=yes)\n"); exit(0); } @@ -202,7 +202,7 @@ int main(int argc, char* argv[]) "not support this GEMM problem"); } - float ave_time = invoker.Run(argument, nrepeat); + float ave_time = invoker.Run(argument, StreamConfig{nullptr, time_kernel}); float tflops = static_cast(flop) / 1.E9 / ave_time; @@ -211,6 +211,7 @@ int main(int argc, char* argv[]) std::cout << "Perf: " << ave_time << " ms, " << tflops << " TFlops, " << gb_per_sec << " GB/s, " << gemm.GetTypeString() << std::endl; + bool pass = true; if(do_verification) { for(std::size_t i = 0; i < gemm_shapes.size(); i++) @@ -227,9 +228,9 @@ int main(int argc, char* argv[]) c_element_op); ref_invoker.Run(ref_argument); - ck::utils::check_err(c_device_tensors[i].mData, c_host_tensors[i].mData); + pass &= ck::utils::check_err(c_device_tensors[i].mData, c_host_tensors[i].mData); } } - return 0; + return pass ? 0 : 1; } diff --git a/example/16_gemm_reduce/gemm_reduce_xdl_fp16.cpp b/example/16_gemm_reduce/gemm_reduce_xdl_fp16.cpp index 90064ae584..860d9eea2a 100644 --- a/example/16_gemm_reduce/gemm_reduce_xdl_fp16.cpp +++ b/example/16_gemm_reduce/gemm_reduce_xdl_fp16.cpp @@ -4,6 +4,7 @@ #include #include #include +#include "check_err.hpp" #include "config.hpp" #include "device.hpp" #include "host_tensor.hpp" @@ -58,9 +59,9 @@ using ReferenceGemmInstance = ck::tensor_operation::host:: int main(int argc, char* argv[]) { - bool do_verification = 1; + bool do_verification = true; int init_method = 1; - int nrepeat = 5; + bool time_kernel = false; // GEMM shape ck::index_t M = 3840; @@ -79,13 +80,13 @@ int main(int argc, char* argv[]) { do_verification = std::stoi(argv[1]); init_method = std::stoi(argv[2]); - nrepeat = std::stoi(argv[3]); + time_kernel = std::stoi(argv[3]); } else if(argc == 10) { do_verification = std::stoi(argv[1]); init_method = std::stoi(argv[2]); - nrepeat = std::stoi(argv[3]); + time_kernel = std::stoi(argv[3]); M = std::stoi(argv[4]); N = std::stoi(argv[5]); @@ -99,7 +100,7 @@ int main(int argc, char* argv[]) { printf("arg1: verification (0=no, 1=yes)\n"); printf("arg2: initialization (0=no init, 1=integer value, 2=decimal value)\n"); - printf("arg3: run kernel # of times (>1)\n"); + printf("arg3: time kernel (0=n0, 1=yes)\n"); printf("arg4 to 9: M (256x), N(128x), K(32x), StrideA, StrideB, StrideC\n"); exit(0); } @@ -192,30 +193,13 @@ int main(int argc, char* argv[]) "not support this GEMM problem"); } - // warm up - invoker.Run(argument); + // init DO, D1 to 0 + d0_device_buf.SetZero(); + d1_device_buf.SetZero(); - // timing - float total_time = 0; - - for(int i = 0; i < nrepeat; ++i) - { - // init DO, D1 to 0 - d0_device_buf.SetZero(); - d1_device_buf.SetZero(); - - KernelTimer timer; - - timer.Start(); - - invoker.Run(argument); - - timer.End(); - - total_time += timer.GetElapsedTime(); - } - - float ave_time = total_time / nrepeat; + // if time_kernel == true, kernel will run multiple times. This kernel use atomic-add so result + // will not be correct. need to set time_kernel = false for correctness test + float ave_time = invoker.Run(argument, StreamConfig{nullptr, time_kernel}); std::size_t flop = std::size_t(2) * M * N * K; std::size_t num_btype = @@ -228,6 +212,7 @@ int main(int argc, char* argv[]) std::cout << "Perf: " << ave_time << " ms, " << tflops << " TFlops, " << gb_per_sec << " GB/s, " << gemm.GetTypeString() << std::endl; + bool pass = true; if(do_verification) { c_device_buf.FromDevice(c_m_n_device_result.mData.data()); @@ -264,10 +249,19 @@ int main(int argc, char* argv[]) d1_m_host_result(m) = ck::type_convert(d1_acc); } - check_error(c_m_n_host_result, c_m_n_device_result); - check_error(d0_m_host_result, d0_m_device_result); - check_error(d1_m_host_result, d1_m_device_result); + pass &= ck::utils::check_err( + c_m_n_device_result.mData, c_m_n_host_result.mData, "Error: Incorrect results c"); + pass &= ck::utils::check_err(d0_m_device_result.mData, + d0_m_host_result.mData, + "Error: Incorrect results d0", + 1e-3, + 1e-3); + pass &= ck::utils::check_err(d1_m_device_result.mData, + d1_m_host_result.mData, + "Error: Incorrect results d1", + 1e-3, + 1e-3); } - return 0; + return pass ? 0 : 1; } diff --git a/example/17_convnd_bwd_data_xdl/CMakeLists.txt b/example/17_convnd_bwd_data_xdl/CMakeLists.txt index 0ed906f8f7..963f311703 100644 --- a/example/17_convnd_bwd_data_xdl/CMakeLists.txt +++ b/example/17_convnd_bwd_data_xdl/CMakeLists.txt @@ -1,2 +1,2 @@ add_example_executable(example_convnd_bwd_data_xdl convnd_bwd_data_xdl.cpp) -target_link_libraries(example_convnd_bwd_data_xdl PRIVATE conv_fwd_util) +target_link_libraries(example_convnd_bwd_data_xdl PRIVATE conv_util) diff --git a/example/17_convnd_bwd_data_xdl/convnd_bwd_data_xdl.cpp b/example/17_convnd_bwd_data_xdl/convnd_bwd_data_xdl.cpp index 962627ce90..ff2cfac1fa 100644 --- a/example/17_convnd_bwd_data_xdl/convnd_bwd_data_xdl.cpp +++ b/example/17_convnd_bwd_data_xdl/convnd_bwd_data_xdl.cpp @@ -6,7 +6,7 @@ #include #include "config.hpp" -#include "conv_fwd_util.hpp" +#include "conv_util.hpp" #include "print.hpp" #include "device.hpp" #include "host_tensor.hpp" @@ -87,7 +87,7 @@ void print_use_msg() { std::cout << "arg1: verification (0=no, 1=yes)\n" << "arg2: initialization (0=no init, 1=random value, 2= init to 1 )\n" - << "arg3: run kernel # of times (>1)\n" + << "arg3: time kernel (0=n0, 1=yes)\n" << "arg4: N spatial dimensions (default 2)\n" << "Following arguments (depending on number of spatial dims):\n" << " N, K, C, \n" @@ -105,40 +105,40 @@ ck::utils::conv::ConvParams parse_conv_params(int num_dim_spatial, char* argv[]) ck::utils::conv::ConvParams params; int arg_idx = 5; - params.num_dim_spatial = num_dim_spatial; - params.N = std::stoi(argv[arg_idx++]); - params.K = std::stoi(argv[arg_idx++]); - params.C = std::stoi(argv[arg_idx++]); + params.num_dim_spatial_ = num_dim_spatial; + params.N_ = std::stoi(argv[arg_idx++]); + params.K_ = std::stoi(argv[arg_idx++]); + params.C_ = std::stoi(argv[arg_idx++]); - params.filter_spatial_lengths.resize(num_dim_spatial); + params.filter_spatial_lengths_.resize(num_dim_spatial); for(int i = 0; i < num_dim_spatial; ++i) { - params.filter_spatial_lengths[i] = std::stoi(argv[arg_idx++]); + params.filter_spatial_lengths_[i] = std::stoi(argv[arg_idx++]); } - params.input_spatial_lengths.resize(num_dim_spatial); + params.input_spatial_lengths_.resize(num_dim_spatial); for(int i = 0; i < num_dim_spatial; ++i) { - params.input_spatial_lengths[i] = std::stoi(argv[arg_idx++]); + params.input_spatial_lengths_[i] = std::stoi(argv[arg_idx++]); } - params.conv_filter_strides.resize(num_dim_spatial); + params.conv_filter_strides_.resize(num_dim_spatial); for(int i = 0; i < num_dim_spatial; ++i) { - params.conv_filter_strides[i] = std::stoi(argv[arg_idx++]); + params.conv_filter_strides_[i] = std::stoi(argv[arg_idx++]); } - params.conv_filter_dilations.resize(num_dim_spatial); + params.conv_filter_dilations_.resize(num_dim_spatial); for(int i = 0; i < num_dim_spatial; ++i) { - params.conv_filter_dilations[i] = std::stoi(argv[arg_idx++]); + params.conv_filter_dilations_[i] = std::stoi(argv[arg_idx++]); } - params.input_left_pads.resize(num_dim_spatial); + params.input_left_pads_.resize(num_dim_spatial); for(int i = 0; i < num_dim_spatial; ++i) { - params.input_left_pads[i] = std::stoi(argv[arg_idx++]); + params.input_left_pads_[i] = std::stoi(argv[arg_idx++]); } - params.input_right_pads.resize(num_dim_spatial); + params.input_right_pads_.resize(num_dim_spatial); for(int i = 0; i < num_dim_spatial; ++i) { - params.input_right_pads[i] = std::stoi(argv[arg_idx++]); + params.input_right_pads_[i] = std::stoi(argv[arg_idx++]); } return params; @@ -165,25 +165,25 @@ DeviceConvBwdDataBasePtr get_conv_instance(int num_dim_spatial) int main(int argc, char* argv[]) { - bool do_verification = 0; - int init_method = 0; - int nrepeat = 5; + bool do_verification = true; + int init_method = 1; + bool time_kernel = false; int num_dim_spatial = 2; ck::utils::conv::ConvParams params; - params.C = 128; + params.C_ = 128; if(argc == 4) { do_verification = std::stoi(argv[1]); init_method = std::stoi(argv[2]); - nrepeat = std::stoi(argv[3]); + time_kernel = std::stoi(argv[3]); } else if(argc > 4) { do_verification = std::stoi(argv[1]); init_method = std::stoi(argv[2]); - nrepeat = std::stoi(argv[3]); + time_kernel = std::stoi(argv[3]); num_dim_spatial = std::stoi(argv[4]); // check args number int conv_args = 3 + num_dim_spatial * 6; @@ -202,21 +202,21 @@ int main(int argc, char* argv[]) exit(1); } - std::vector input_dims{static_cast(params.N), - static_cast(params.C)}; + std::vector input_dims{static_cast(params.N_), + static_cast(params.C_)}; input_dims.insert(std::end(input_dims), - std::begin(params.input_spatial_lengths), - std::end(params.input_spatial_lengths)); + std::begin(params.input_spatial_lengths_), + std::end(params.input_spatial_lengths_)); - std::vector filter_dims{static_cast(params.K), - static_cast(params.C)}; + std::vector filter_dims{static_cast(params.K_), + static_cast(params.C_)}; filter_dims.insert(std::end(filter_dims), - std::begin(params.filter_spatial_lengths), - std::end(params.filter_spatial_lengths)); + std::begin(params.filter_spatial_lengths_), + std::end(params.filter_spatial_lengths_)); const std::vector& output_spatial_lengths = params.GetOutputSpatialLengths(); - std::vector output_dims{static_cast(params.N), - static_cast(params.K)}; + std::vector output_dims{static_cast(params.N_), + static_cast(params.K_)}; output_dims.insert(std::end(output_dims), std::begin(output_spatial_lengths), std::end(output_spatial_lengths)); @@ -263,16 +263,16 @@ int main(int argc, char* argv[]) conv->MakeArgumentPointer(static_cast(in_device_buf.GetDeviceBuffer()), static_cast(wei_device_buf.GetDeviceBuffer()), static_cast(out_device_buf.GetDeviceBuffer()), - params.N, - params.K, - params.C, - params.input_spatial_lengths, - params.filter_spatial_lengths, + params.N_, + params.K_, + params.C_, + params.input_spatial_lengths_, + params.filter_spatial_lengths_, output_spatial_lengths, - params.conv_filter_strides, - params.conv_filter_dilations, - params.input_left_pads, - params.input_right_pads, + params.conv_filter_strides_, + params.conv_filter_dilations_, + params.input_left_pads_, + params.input_right_pads_, InElementOp{}, WeiElementOp{}, OutElementOp{}); @@ -284,16 +284,16 @@ int main(int argc, char* argv[]) "not support this Conv problem"); } - float ave_time = invoker->Run(argument.get(), nrepeat); + float ave_time = invoker->Run(argument.get(), StreamConfig{nullptr, time_kernel}); std::size_t flop = ck::utils::conv::get_flops( - params.N, params.C, params.K, params.filter_spatial_lengths, output_spatial_lengths); + params.N_, params.C_, params.K_, params.filter_spatial_lengths_, output_spatial_lengths); std::size_t num_btype = ck::utils::conv::get_btype( - params.N, - params.C, - params.K, - params.input_spatial_lengths, - params.filter_spatial_lengths, + params.N_, + params.C_, + params.K_, + params.input_spatial_lengths_, + params.filter_spatial_lengths_, output_spatial_lengths); float tflops = static_cast(flop) / 1.E9 / ave_time; @@ -310,10 +310,10 @@ int main(int argc, char* argv[]) auto ref_argument = ref_conv.MakeArgument(in_n_c_hi_wi_host_result, wei_k_c_y_x, out_n_k_ho_wo, - params.conv_filter_strides, - params.conv_filter_dilations, - params.input_left_pads, - params.input_right_pads, + params.conv_filter_strides_, + params.conv_filter_dilations_, + params.input_left_pads_, + params.input_right_pads_, InElementOp{}, WeiElementOp{}, OutElementOp{}); @@ -322,7 +322,10 @@ int main(int argc, char* argv[]) in_device_buf.FromDevice(in_n_c_hi_wi_device_result.mData.data()); - check_error(in_n_c_hi_wi_host_result, in_n_c_hi_wi_device_result); + return ck::utils::check_err(in_n_c_hi_wi_device_result.mData, + in_n_c_hi_wi_host_result.mData) + ? 0 + : 1; }; switch(num_dim_spatial) @@ -347,4 +350,5 @@ int main(int argc, char* argv[]) } } } + return 0; } diff --git a/example/18_batched_gemm_reduce/batched_gemm_reduce_xdl_fp16.cpp b/example/18_batched_gemm_reduce/batched_gemm_reduce_xdl_fp16.cpp index eb18655d1b..d993c8e8d1 100644 --- a/example/18_batched_gemm_reduce/batched_gemm_reduce_xdl_fp16.cpp +++ b/example/18_batched_gemm_reduce/batched_gemm_reduce_xdl_fp16.cpp @@ -4,6 +4,7 @@ #include #include #include +#include "check_err.hpp" #include "config.hpp" #include "device.hpp" #include "host_tensor.hpp" @@ -57,18 +58,18 @@ using ReferenceBatchedGemmInstance = ck::tensor_operation::host:: int main(int argc, char* argv[]) { - bool do_verification = 1; + bool do_verification = true; int init_method = 1; - int nrepeat = 5; + bool time_kernel = false; // GEMM shape - ck::index_t M = 3840; - ck::index_t N = 4096; - ck::index_t K = 4096; + ck::index_t M = 2048; + ck::index_t N = 1920; + ck::index_t K = 2048; - ck::index_t StrideA = 4096; - ck::index_t StrideB = 4096; - ck::index_t StrideC = 4096; + ck::index_t StrideA = 2048; + ck::index_t StrideB = 2048; + ck::index_t StrideC = 1920; ck::index_t BatchCount = 4; @@ -80,13 +81,13 @@ int main(int argc, char* argv[]) { do_verification = std::stoi(argv[1]); init_method = std::stoi(argv[2]); - nrepeat = std::stoi(argv[3]); + time_kernel = std::stoi(argv[3]); } else if(argc == 11) { do_verification = std::stoi(argv[1]); init_method = std::stoi(argv[2]); - nrepeat = std::stoi(argv[3]); + time_kernel = std::stoi(argv[3]); M = std::stoi(argv[4]); N = std::stoi(argv[5]); @@ -96,13 +97,13 @@ int main(int argc, char* argv[]) StrideB = std::stoi(argv[8]); StrideC = std::stoi(argv[9]); - BatchCount = std::stoi(argv[9]); + BatchCount = std::stoi(argv[10]); } else { printf("arg1: verification (0=no, 1=yes)\n"); printf("arg2: initialization (0=no init, 1=integer value, 2=decimal value)\n"); - printf("arg3: run kernel # of times (>1)\n"); + printf("arg3: time kernel (0=n0, 1=yes)\n"); printf("arg4 to 10: M (256x), N(128x), K(32x), StrideA, StrideB, StrideC, BatchCount\n"); exit(0); } @@ -204,30 +205,13 @@ int main(int argc, char* argv[]) "not support this GEMM problem"); } - // warm up - invoker.Run(argument); + // init DO, D1 to 0 + d0_device_buf.SetZero(); + d1_device_buf.SetZero(); - // timing - float total_time = 0; - - for(int i = 0; i < nrepeat; ++i) - { - // init DO, D1 to 0 - d0_device_buf.SetZero(); - d1_device_buf.SetZero(); - - KernelTimer timer; - - timer.Start(); - - invoker.Run(argument); - - timer.End(); - - total_time += timer.GetElapsedTime(); - } - - float ave_time = total_time / nrepeat; + // if time_kernel == true, kernel will run multiple times. This kernel use atomic-add so result + // will not be correct. need to set time_kernel = false for correctness test + float ave_time = invoker.Run(argument, StreamConfig{nullptr, time_kernel}); std::size_t flop = std::size_t(2) * BatchCount * M * N * K; std::size_t num_btype = sizeof(ADataType) * BatchCount * M * K + @@ -241,6 +225,7 @@ int main(int argc, char* argv[]) std::cout << "Perf: " << ave_time << " ms, " << tflops << " TFlops, " << gb_per_sec << " GB/s, " << batched_gemm.GetTypeString() << std::endl; + bool pass = true; if(do_verification) { c_device_buf.FromDevice(c_g_m_n_device_result.mData.data()); @@ -264,7 +249,7 @@ int main(int argc, char* argv[]) for(int n = 0; n < N; ++n) { - float d0_val = ck::type_convert(c_g_m_n_host_result(m, n)); + float d0_val = ck::type_convert(c_g_m_n_host_result(batch, m, n)); float d1_val; d1_element_op(d1_val, d0_val); @@ -277,10 +262,18 @@ int main(int argc, char* argv[]) } } - check_error(c_g_m_n_host_result, c_g_m_n_device_result); - check_error(d0_g_m_host_result, d0_g_m_device_result); - check_error(d1_g_m_host_result, d1_g_m_device_result); + pass &= ck::utils::check_err(c_g_m_n_host_result.mData, c_g_m_n_device_result.mData); + pass &= ck::utils::check_err(d0_g_m_device_result.mData, + d0_g_m_host_result.mData, + "Error: Incorrect results! D0", + 1e-3, + 1e-3); + pass &= ck::utils::check_err(d1_g_m_device_result.mData, + d1_g_m_host_result.mData, + "Error: Incorrect results! D1", + 1e-3, + 1e-3); } - return 0; + return pass ? 0 : 1; } diff --git a/example/19_cgemm/cgemm_xdl_bf16.cpp b/example/19_cgemm/cgemm_xdl_bf16.cpp index 309fa6ac86..836a3c13dc 100644 --- a/example/19_cgemm/cgemm_xdl_bf16.cpp +++ b/example/19_cgemm/cgemm_xdl_bf16.cpp @@ -88,9 +88,9 @@ using ReferenceCGemmInstance = ck::tensor_operation::host:: int main(int argc, char* argv[]) { - bool do_verification = 0; - int init_method = 0; - int nrepeat = 5; + bool do_verification = true; + int init_method = 1; + bool time_kernel = false; // CGEMM shape ck::index_t M = 3840; @@ -105,13 +105,13 @@ int main(int argc, char* argv[]) { do_verification = std::stoi(argv[1]); init_method = std::stoi(argv[2]); - nrepeat = std::stoi(argv[3]); + time_kernel = std::stoi(argv[3]); } else if(argc == 10) { do_verification = std::stoi(argv[1]); init_method = std::stoi(argv[2]); - nrepeat = std::stoi(argv[3]); + time_kernel = std::stoi(argv[3]); M = std::stoi(argv[4]); N = std::stoi(argv[5]); @@ -223,7 +223,7 @@ int main(int argc, char* argv[]) "not support this CGEMM problem"); } - float ave_time = invoker.Run(argument, nrepeat); + float ave_time = invoker.Run(argument, StreamConfig{nullptr, time_kernel}); std::size_t flop = std::size_t(8) * M * N * K; std::size_t num_btype = std::size_t(2) * sizeof(ADataType) * M * K + sizeof(BDataType) * K * N + diff --git a/example/CMakeLists.txt b/example/CMakeLists.txt index 5ea3889844..051242ce2a 100644 --- a/example/CMakeLists.txt +++ b/example/CMakeLists.txt @@ -19,9 +19,18 @@ include_directories(BEFORE add_custom_target(examples) -function(add_example_executable EXAMPLE_NAME) +function(add_example_executable EXAMPLE_NAME FILE_NAME) message("adding example ${EXAMPLE_NAME}") - add_executable(${EXAMPLE_NAME} ${ARGN}) + add_executable(${EXAMPLE_NAME} ${FILE_NAME}) + target_link_libraries(${EXAMPLE_NAME} PRIVATE host_tensor) + add_test(NAME ${EXAMPLE_NAME} COMMAND $ ${ARGN}) + add_dependencies(examples ${EXAMPLE_NAME}) + add_dependencies(check ${EXAMPLE_NAME}) +endfunction(add_example_executable EXAMPLE_NAME) + +function(add_example_executable_no_testing EXAMPLE_NAME FILE_NAME) + message("adding example ${EXAMPLE_NAME}") + add_executable(${EXAMPLE_NAME} ${FILE_NAME}) target_link_libraries(${EXAMPLE_NAME} PRIVATE host_tensor) add_dependencies(examples ${EXAMPLE_NAME}) endfunction(add_example_executable EXAMPLE_NAME) diff --git a/include/ck/config.hpp b/include/ck/config.hpp index e6deefcbe3..710cd552d7 100644 --- a/include/ck/config.hpp +++ b/include/ck/config.hpp @@ -109,6 +109,10 @@ // experimental feature: use __builtin_memcpy instead of union to do bit_cast #define CK_EXPERIMENTAL_USE_MEMCPY_FOR_BIT_CAST 1 +// experimental feature: optimize for inter-wave scheduling policy +#define CK_EXPERIMENTAL_INTER_WAVE_SCHEDULING 0 +#define CK_EXPERIMENTAL_INTER_WAVE_SCHEDULING_MAC_CLUSTERS 1 + // hack: have underlying assumption that need to be satsified, otherwise it's a bug // hack for forcing register to keep idx_diff_low_const in SGPR. idx_diff_low_const must be // thread-invariant, otherwise it's a bug diff --git a/include/ck/hip_version.hpp.in b/include/ck/hip_version.hpp.in deleted file mode 100644 index 4290ef7e0d..0000000000 --- a/include/ck/hip_version.hpp.in +++ /dev/null @@ -1,28 +0,0 @@ -#pragma once - -// "_PACKAGE_" to avoid name contentions: the macros like -// HIP_VERSION_MAJOR are defined in HIP_VERSION.h. -// clang-format off -#define CK_HIP_PACKAGE_VERSION_MAJOR @CK_HIP_VERSION_MAJOR@ -#define CK_HIP_PACKAGE_VERSION_MINOR @CK_HIP_VERSION_MINOR@ -#define CK_HIP_PACKAGE_VERSION_PATCH @CK_HIP_VERSION_PATCH@ -// clang-format on - -#ifndef CK_HIP_PACKAGE_VERSION_MAJOR -#define CK_HIP_PACKAGE_VERSION_MAJOR 0 -#endif -#ifndef CK_HIP_PACKAGE_VERSION_MINOR -#define CK_HIP_PACKAGE_VERSION_MINOR 0 -#endif -#ifndef CK_HIP_PACKAGE_VERSION_PATCH -#define CK_HIP_PACKAGE_VERSION_PATCH 0 -#endif -// 3 decimal digits for major and minor, 6 digits for patch number. -// Max number is 999,999,999999 == 0xE8,D4A5,0FFF that fits into 64-bit math. -#if CK_HIP_PACKAGE_VERSION_MAJOR > 999 || CK_HIP_PACKAGE_VERSION_MAJOR > 999 || \ - CK_HIP_PACKAGE_VERSION_PATCH > 999999 -#error "Too big HIP version number(s)" -#endif -#define CK_HIP_PACKAGE_VERSION_FLAT \ - ((CK_HIP_PACKAGE_VERSION_MAJOR * 1000ULL + CK_HIP_PACKAGE_VERSION_MINOR) * 1000000 + \ - CK_HIP_PACKAGE_VERSION_PATCH) diff --git a/include/ck/options.hpp.in b/include/ck/options.hpp.in new file mode 100644 index 0000000000..87ed6026a4 --- /dev/null +++ b/include/ck/options.hpp.in @@ -0,0 +1,3 @@ +#pragma once + +#cmakedefine01 CK_TIME_KERNEL diff --git a/include/ck/stream_config.hpp b/include/ck/stream_config.hpp new file mode 100644 index 0000000000..3e80b4c892 --- /dev/null +++ b/include/ck/stream_config.hpp @@ -0,0 +1,10 @@ +#pragma once + +#include +#include + +struct StreamConfig +{ + hipStream_t stream_id_ = nullptr; + bool time_kernel_ = false; +}; diff --git a/include/ck/tensor_operation/gpu/block/blockwise_gemm_xdlops.hpp b/include/ck/tensor_operation/gpu/block/blockwise_gemm_xdlops.hpp index f1670d9c89..a989cb5297 100644 --- a/include/ck/tensor_operation/gpu/block/blockwise_gemm_xdlops.hpp +++ b/include/ck/tensor_operation/gpu/block/blockwise_gemm_xdlops.hpp @@ -7,6 +7,21 @@ namespace ck { +enum struct LoopScheduler +{ + Default, + Interwave, +}; + +constexpr LoopScheduler make_default_loop_scheduler() +{ +#if CK_EXPERIMENTAL_INTER_WAVE_SCHEDULING + return LoopScheduler::Interwave; +#else + return LoopScheduler::Default; +#endif // if CK_EXPERIMENTAL_INTER_WAVE_SCHEDULING +} + template {})); @@ -339,4 +354,232 @@ struct BlockwiseGemmXdlops_k0mk1_k0nk1_m0n0m1n1m2m3m4n2_v1 BThreadCopy b_thread_copy_{CalculateBThreadOriginDataIndex()}; }; +// Note: To facilitate the inter-wave loop scheduler, we need to explicitly set the macro +// CK_EXPERIMENTAL_INTER_WAVE_SCHEDULING=1 as a few intrinsics are not yet available in +// the latest ROCm release. For unsupported compilers, inter-wave loop scheduler falls back to the +// default loop scheduler which is given by the macro CK_EXPERIMENTAL_INTER_WAVE_SCHEDULING=0 +template +struct BlockwiseGemmXdlopsInterwave_k0mk1_k0nk1_m0n0m1n1m2m3m4n2_v1 + : public BlockwiseGemmXdlops_k0mk1_k0nk1_m0n0m1n1m2m3m4n2_v1 +{ + using Base = BlockwiseGemmXdlops_k0mk1_k0nk1_m0n0m1n1m2m3m4n2_v1; + +#if CK_EXPERIMENTAL_INTER_WAVE_SCHEDULING + using Base::a_block_desc_m0_m1_m2_k; + using Base::A_K1; + using Base::b_block_desc_n0_n1_n2_k; + using Base::B_K1; + using Base::c_thread_buf_; + using Base::c_thread_desc_; + using Base::CalculateAThreadOriginDataIndex; + using Base::CalculateBThreadOriginDataIndex; + using Base::I0; + using Base::I1; + using Base::KPerThread; + using Base::xdlops_gemm; + + static constexpr index_t KPerInnerLoop = math::max(KPerThread / NumMacClusters, KPack); + + // 2-wave optimized blockwise gemm + template + __device__ void Run(const ABlockBuffer& a_block_buf, + const BBlockBuffer& b_block_buf, + CThreadBuffer& c_thread_buf) const + { + auto a_thread_buf = make_static_buffer( + a_thread_desc_.GetElementSpaceSize()); + auto b_thread_buf = make_static_buffer( + b_thread_desc_.GetElementSpaceSize()); + + static_for<0, KPerThread, KPerInnerLoop>{}([&](auto k) { + static_for<0, MRepeat, 1>{}([&](auto m0) { + // read A + a_thread_copy_.Run(a_block_desc_m0_m1_m2_k, + make_tuple(m0, I0, I0, k), + a_block_buf, + a_thread_desc_, + make_tuple(m0, I0, I0, I0), + a_thread_buf); + }); + static_for<0, NRepeat, 1>{}([&](auto n0) { + // read B + b_thread_copy_.Run(b_block_desc_n0_n1_n2_k, + make_tuple(n0, I0, I0, k), + b_block_buf, + b_thread_desc_, + make_tuple(n0, I0, I0, I0), + b_thread_buf); + }); + __builtin_amdgcn_sched_barrier(); + // NOTE: Synchronize threads in a workgroup at the start of each MAC cluster, but except + // the first, as we can shorten non-MAC cluster a bit and there's no observable negative + // impact. The desired effect is waves in a workgroup executing MAC in sync. This avoids + // some out-of-sync waves hijacking MAC resource from other workgroups and reducing the + // chance of latency hiding by waiting for the rest of the workgroup at the eventual + // sync point. + if constexpr(k.value != 0 || KPerInnerLoop == KPerThread) + { + asm volatile("s_barrier" ::); + __builtin_amdgcn_sched_barrier(); + } + static_for<0, KPerInnerLoop, KPack>{}([&](auto k_) { + static_for<0, MRepeat, 1>{}([&](auto m0) { + static_for<0, NRepeat, 1>{}([&](auto n0) { + vector_type a_thread_vec; + vector_type b_thread_vec; + + static_for<0, KPack, 1>{}([&](auto i) { + a_thread_vec.template AsType()(i) = + a_thread_buf[Number{}]; + b_thread_vec.template AsType()(i) = + b_thread_buf[Number{}]; + }); + + using mfma_input_type = + typename vector_type::type; + + constexpr index_t c_offset = + c_thread_desc_.CalculateOffset(make_tuple(m0, n0, 0)); + + // The block_sync_lds() here performs double duty: + // A) safeguard against data hazard because barrier from blockwise_gemm is + // moved here B) reduce VMEM FIFO congestion by applying small delays to + // different wavefronts It is performed near the end of MAC cluster to + // minimize lgkmcnt penalty + if constexpr(k.value == KPerThread - KPerInnerLoop && + k_.value == KPerInnerLoop - KPack && m0.value == MRepeat - 1 && + n0.value == NRepeat - 1) + { + __builtin_amdgcn_sched_barrier(); + block_sync_lds(); + __builtin_amdgcn_sched_barrier(); + } + + // TODO: insert setprio in more precise manner since we + // could have more than >1 MFMA instructions in single call + xdlops_gemm.template Run( + a_thread_vec.template AsType(), + b_thread_vec.template AsType(), + c_thread_buf.GetVectorTypeReference(Number{})); + if constexpr(k_.value == 0 && m0.value == 0 && n0.value == 0) + { + __builtin_amdgcn_sched_barrier(); + __builtin_amdgcn_s_setprio(1); + __builtin_amdgcn_sched_barrier(); + } + }); + }); + }); + __builtin_amdgcn_sched_barrier(); + __builtin_amdgcn_s_setprio(0); + __builtin_amdgcn_sched_barrier(); + }); + } + + protected: + // A[M0, M1, M2, KPerInnerLoop] + static constexpr auto a_thread_desc_ = make_naive_tensor_descriptor_packed( + make_tuple(Number{}, I1, I1, Number{})); + + // B[N0, N1, N2, KPerInnerLoop] + static constexpr auto b_thread_desc_ = make_naive_tensor_descriptor_packed( + make_tuple(Number{}, I1, I1, Number{})); + + using AThreadCopy = ThreadwiseTensorSliceTransfer_v4, + Sequence<0, 1, 2, 3>, + 3, + A_K1, + A_K1>; + + using BThreadCopy = ThreadwiseTensorSliceTransfer_v4, + Sequence<0, 1, 2, 3>, + 3, + B_K1, + B_K1>; + + AThreadCopy a_thread_copy_{CalculateAThreadOriginDataIndex()}; + BThreadCopy b_thread_copy_{CalculateBThreadOriginDataIndex()}; + +#endif // #if CK_EXPERIMENTAL_INTER_WAVE_SCHEDULING +}; + +template +constexpr auto BlockwiseGemmXdlops_k0mk1_k0nk1_m0n0m1n1m2m3m4n2_Selector() +{ + if constexpr(LoopSched == LoopScheduler::Default) + { + return BlockwiseGemmXdlops_k0mk1_k0nk1_m0n0m1n1m2m3m4n2_v1{}; + } + else if constexpr(LoopSched == LoopScheduler::Interwave) + { + return BlockwiseGemmXdlopsInterwave_k0mk1_k0nk1_m0n0m1n1m2m3m4n2_v1{}; + } +}; + } // namespace ck diff --git a/include/ck/tensor_operation/gpu/device/device_base.hpp b/include/ck/tensor_operation/gpu/device/device_base.hpp index cf48695ad0..950cfc1d61 100644 --- a/include/ck/tensor_operation/gpu/device/device_base.hpp +++ b/include/ck/tensor_operation/gpu/device/device_base.hpp @@ -1,8 +1,9 @@ -#ifndef DEVICE_BASE_HPP -#define DEVICE_BASE_HPP +#pragma once #include +#include "stream_config.hpp" + namespace ck { namespace tensor_operation { namespace device { @@ -22,7 +23,10 @@ struct BaseInvoker BaseInvoker(const BaseInvoker&) = default; BaseInvoker& operator=(const BaseInvoker&) = default; - virtual float Run(const BaseArgument*, int = 1) = 0; + virtual float Run(const BaseArgument*, const StreamConfig& = StreamConfig{}) + { + return float{0}; + } virtual ~BaseInvoker() {} }; @@ -33,8 +37,8 @@ struct BaseOperator BaseOperator(const BaseOperator&) = default; BaseOperator& operator=(const BaseOperator&) = default; - virtual bool IsSupportedArgument(const BaseArgument*) = 0; - virtual std::string GetTypeString() const = 0; + virtual bool IsSupportedArgument(const BaseArgument*) { return false; } + virtual std::string GetTypeString() const { return ""; } virtual ~BaseOperator() {} }; @@ -42,4 +46,3 @@ struct BaseOperator } // namespace device } // namespace tensor_operation } // namespace ck -#endif diff --git a/include/ck/tensor_operation/gpu/device/device_batched_gemm_reduce_xdl_cshuffle.hpp b/include/ck/tensor_operation/gpu/device/device_batched_gemm_reduce_xdl_cshuffle.hpp index 92655b2755..a6408007ed 100644 --- a/include/ck/tensor_operation/gpu/device/device_batched_gemm_reduce_xdl_cshuffle.hpp +++ b/include/ck/tensor_operation/gpu/device/device_batched_gemm_reduce_xdl_cshuffle.hpp @@ -106,6 +106,9 @@ __global__ void #endif // end of if defined (defined(__gfx908__) || defined(__gfx90a__)) } +// Note: inter-wave loop scheduler is rolled out to c-shuffle version first. Becuase non c-shuffle +// version currently has compiler issues with register spill which further causes validation +// failures. template + index_t CReduceThreadVgpr2GlobalCopySrcDstScalarPerVector_MPerBlock, + LoopScheduler LoopSched = make_default_loop_scheduler()> struct DeviceBatchedGemmReduce_Xdl_CShuffle : public DeviceGemmReduce; + CReduceThreadVgpr2GlobalCopySrcDstScalarPerVector_MPerBlock, + LoopSched>; using Block2CTileMap = decltype(MakeBlock2CTileMap(1, CGridDesc_M_N{}, 1, 1)); @@ -688,7 +693,7 @@ struct DeviceBatchedGemmReduce_Xdl_CShuffle : public DeviceGemmReduce, true>; - launch_kernel(kernel, - dim3(grid_size), - dim3(BlockSize), - 0, - arg.p_a_grid_, - arg.p_b_grid_, - arg.p_c_grid_, - arg.p_d0_grid_, - arg.p_d1_grid_, - arg.BatchCount_, - arg.a_element_op_, - arg.b_element_op_, - arg.c_element_op_, - arg.d1_element_op_, - arg.a_grid_desc_ak0_m_ak1_, - arg.b_grid_desc_bk0_n_bk1_, - arg.c_grid_desc_mblock_mperblock_nblock_nperblock_, - arg.d_grid_desc_mblock_mperblock_, - arg.compute_base_ptr_of_batch_, - arg.block_2_ctile_map_); + elapsed_time = + launch_and_time_kernel(stream_config, + kernel, + dim3(grid_size), + dim3(BlockSize), + 0, + arg.p_a_grid_, + arg.p_b_grid_, + arg.p_c_grid_, + arg.p_d0_grid_, + arg.p_d1_grid_, + arg.BatchCount_, + arg.a_element_op_, + arg.b_element_op_, + arg.c_element_op_, + arg.d1_element_op_, + arg.a_grid_desc_ak0_m_ak1_, + arg.b_grid_desc_bk0_n_bk1_, + arg.c_grid_desc_mblock_mperblock_nblock_nperblock_, + arg.d_grid_desc_mblock_mperblock_, + arg.compute_base_ptr_of_batch_, + arg.block_2_ctile_map_); } else { @@ -783,35 +791,38 @@ struct DeviceBatchedGemmReduce_Xdl_CShuffle : public DeviceGemmReduce, false>; - launch_kernel(kernel, - dim3(grid_size), - dim3(BlockSize), - 0, - arg.p_a_grid_, - arg.p_b_grid_, - arg.p_c_grid_, - arg.p_d0_grid_, - arg.p_d1_grid_, - arg.BatchCount_, - arg.a_element_op_, - arg.b_element_op_, - arg.c_element_op_, - arg.d1_element_op_, - arg.a_grid_desc_ak0_m_ak1_, - arg.b_grid_desc_bk0_n_bk1_, - arg.c_grid_desc_mblock_mperblock_nblock_nperblock_, - arg.d_grid_desc_mblock_mperblock_, - arg.compute_base_ptr_of_batch_, - arg.block_2_ctile_map_); + elapsed_time = + launch_and_time_kernel(stream_config, + kernel, + dim3(grid_size), + dim3(BlockSize), + 0, + arg.p_a_grid_, + arg.p_b_grid_, + arg.p_c_grid_, + arg.p_d0_grid_, + arg.p_d1_grid_, + arg.BatchCount_, + arg.a_element_op_, + arg.b_element_op_, + arg.c_element_op_, + arg.d1_element_op_, + arg.a_grid_desc_ak0_m_ak1_, + arg.b_grid_desc_bk0_n_bk1_, + arg.c_grid_desc_mblock_mperblock_nblock_nperblock_, + arg.d_grid_desc_mblock_mperblock_, + arg.compute_base_ptr_of_batch_, + arg.block_2_ctile_map_); } - return 0; + return elapsed_time; } // polymorphic - float Run(const BaseArgument* p_arg, int nrepeat = 1) override + float Run(const BaseArgument* p_arg, + const StreamConfig& stream_config = StreamConfig{}) override { - return Run(*dynamic_cast(p_arg), nrepeat); + return Run(*dynamic_cast(p_arg), stream_config); } }; diff --git a/include/ck/tensor_operation/gpu/device/device_batched_gemm_xdl.hpp b/include/ck/tensor_operation/gpu/device/device_batched_gemm_xdl.hpp index 88974a5221..ea7704951e 100644 --- a/include/ck/tensor_operation/gpu/device/device_batched_gemm_xdl.hpp +++ b/include/ck/tensor_operation/gpu/device/device_batched_gemm_xdl.hpp @@ -428,7 +428,7 @@ struct DeviceBatchedGemmXdl { using Argument = DeviceBatchedGemmXdl::Argument; - float Run(const Argument& arg, int nrepeat = 1) + float Run(const Argument& arg, const StreamConfig& stream_config = StreamConfig{}) { { std::cout << "arg.a_grid_desc_k0_m_k1_{" << arg.a_grid_desc_k0_m_k1_.GetLength(I0) @@ -477,8 +477,8 @@ struct DeviceBatchedGemmXdl remove_reference_t, true>; - ave_time = launch_and_time_kernel(kernel, - nrepeat, + ave_time = launch_and_time_kernel(stream_config, + kernel, dim3(grid_size), dim3(BlockSize), 0, @@ -511,8 +511,8 @@ struct DeviceBatchedGemmXdl remove_reference_t, false>; - ave_time = launch_and_time_kernel(kernel, - nrepeat, + ave_time = launch_and_time_kernel(stream_config, + kernel, dim3(grid_size), dim3(BlockSize), 0, @@ -534,9 +534,10 @@ struct DeviceBatchedGemmXdl } // polymorphic - float Run(const BaseArgument* p_arg, int nrepeat = 1) override + float Run(const BaseArgument* p_arg, + const StreamConfig& stream_config = StreamConfig{}) override { - return Run(*dynamic_cast(p_arg), nrepeat); + return Run(*dynamic_cast(p_arg), stream_config); } }; diff --git a/include/ck/tensor_operation/gpu/device/device_cgemm_4gemm_xdl_cshuffle.hpp b/include/ck/tensor_operation/gpu/device/device_cgemm_4gemm_xdl_cshuffle.hpp index 2643e46ff2..1f6ebc7042 100644 --- a/include/ck/tensor_operation/gpu/device/device_cgemm_4gemm_xdl_cshuffle.hpp +++ b/include/ck/tensor_operation/gpu/device/device_cgemm_4gemm_xdl_cshuffle.hpp @@ -55,7 +55,8 @@ template + index_t CShuffleBlockTransferScalarPerVector_NPerBlock, + LoopScheduler LoopSched = make_default_loop_scheduler()> struct DeviceCGemm_4Gemm_Xdl_CShuffle : public DeviceCGemm { @@ -376,7 +377,8 @@ struct DeviceCGemm_4Gemm_Xdl_CShuffle CShuffleMXdlPerWavePerShuffle, CShuffleNXdlPerWavePerShuffle, CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, - CShuffleBlockTransferScalarPerVector_NPerBlock>; + CShuffleBlockTransferScalarPerVector_NPerBlock, + LoopSched>; // Argument struct Argument : public BaseArgument @@ -448,7 +450,7 @@ struct DeviceCGemm_4Gemm_Xdl_CShuffle { using Argument = DeviceOp::Argument; - float Run(const Argument& arg, int nrepeat = 1) + float Run(const Argument& arg, const StreamConfig& stream_config = StreamConfig{}) { if(!GridwiseGemm::CheckValidity( arg.a_grid_desc_ak0_m_ak1_, arg.b_grid_desc_bk0_n_bk1_, arg.c_grid_desc_m_n_)) @@ -478,146 +480,77 @@ struct DeviceCGemm_4Gemm_Xdl_CShuffle typename GridwiseGemm::DefaultBlock2CTileMap, true>; - if(nrepeat == 0) - { - launch_kernel(kernel, - dim3(grid_size), - dim3(BlockSize), - 0, - arg.p_a_grid_real_, - arg.p_b_grid_real_, - arg.p_c_grid_real_, - arg.a_element_op_, - arg.b_element_op_, - arg.c_element_op_, - arg.a_grid_desc_ak0_m_ak1_, - arg.b_grid_desc_bk0_n_bk1_, - arg.c_grid_desc_mblock_mperblock_nblock_nperblock_, - arg.block_2_ctile_map_); + ave_time += + launch_and_time_kernel(stream_config, + kernel, + dim3(grid_size), + dim3(BlockSize), + 0, + arg.p_a_grid_real_, + arg.p_b_grid_real_, + arg.p_c_grid_real_, + arg.a_element_op_, + arg.b_element_op_, + arg.c_element_op_, + arg.a_grid_desc_ak0_m_ak1_, + arg.b_grid_desc_bk0_n_bk1_, + arg.c_grid_desc_mblock_mperblock_nblock_nperblock_, + arg.block_2_ctile_map_); - launch_kernel(kernel, - dim3(grid_size), - dim3(BlockSize), - 0, - arg.p_a_grid_imag_, - arg.p_b_grid_imag_, - arg.p_aux_grid_, - arg.a_element_op_, - arg.b_element_op_, - arg.c_element_op_, - arg.a_grid_desc_ak0_m_ak1_, - arg.b_grid_desc_bk0_n_bk1_, - arg.c_grid_desc_mblock_mperblock_nblock_nperblock_, - arg.block_2_ctile_map_); + ave_time += + launch_and_time_kernel(stream_config, + kernel, + dim3(grid_size), + dim3(BlockSize), + 0, + arg.p_a_grid_imag_, + arg.p_b_grid_imag_, + arg.p_aux_grid_, + arg.a_element_op_, + arg.b_element_op_, + arg.c_element_op_, + arg.a_grid_desc_ak0_m_ak1_, + arg.b_grid_desc_bk0_n_bk1_, + arg.c_grid_desc_mblock_mperblock_nblock_nperblock_, + arg.block_2_ctile_map_); - // c_real = c_real - aux needed here!!! + // c_real = c_real - aux needed here!!! - launch_kernel(kernel, - dim3(grid_size), - dim3(BlockSize), - 0, - arg.p_a_grid_real_, - arg.p_b_grid_imag_, - arg.p_c_grid_imag_, - arg.a_element_op_, - arg.b_element_op_, - arg.c_element_op_, - arg.a_grid_desc_ak0_m_ak1_, - arg.b_grid_desc_bk0_n_bk1_, - arg.c_grid_desc_mblock_mperblock_nblock_nperblock_, - arg.block_2_ctile_map_); + ave_time += + launch_and_time_kernel(stream_config, + kernel, + dim3(grid_size), + dim3(BlockSize), + 0, + arg.p_a_grid_real_, + arg.p_b_grid_imag_, + arg.p_c_grid_imag_, + arg.a_element_op_, + arg.b_element_op_, + arg.c_element_op_, + arg.a_grid_desc_ak0_m_ak1_, + arg.b_grid_desc_bk0_n_bk1_, + arg.c_grid_desc_mblock_mperblock_nblock_nperblock_, + arg.block_2_ctile_map_); - launch_kernel(kernel, - dim3(grid_size), - dim3(BlockSize), - 0, - arg.p_a_grid_imag_, - arg.p_b_grid_real_, - arg.p_aux_grid_, - arg.a_element_op_, - arg.b_element_op_, - arg.c_element_op_, - arg.a_grid_desc_ak0_m_ak1_, - arg.b_grid_desc_bk0_n_bk1_, - arg.c_grid_desc_mblock_mperblock_nblock_nperblock_, - arg.block_2_ctile_map_); + ave_time += + launch_and_time_kernel(stream_config, + kernel, + dim3(grid_size), + dim3(BlockSize), + 0, + arg.p_a_grid_imag_, + arg.p_b_grid_real_, + arg.p_aux_grid_, + arg.a_element_op_, + arg.b_element_op_, + arg.c_element_op_, + arg.a_grid_desc_ak0_m_ak1_, + arg.b_grid_desc_bk0_n_bk1_, + arg.c_grid_desc_mblock_mperblock_nblock_nperblock_, + arg.block_2_ctile_map_); - // c_imag = c_imag + aux needed here!!! - } - else - { - ave_time += - launch_and_time_kernel(kernel, - nrepeat, - dim3(grid_size), - dim3(BlockSize), - 0, - arg.p_a_grid_real_, - arg.p_b_grid_real_, - arg.p_c_grid_real_, - arg.a_element_op_, - arg.b_element_op_, - arg.c_element_op_, - arg.a_grid_desc_ak0_m_ak1_, - arg.b_grid_desc_bk0_n_bk1_, - arg.c_grid_desc_mblock_mperblock_nblock_nperblock_, - arg.block_2_ctile_map_); - - ave_time += - launch_and_time_kernel(kernel, - nrepeat, - dim3(grid_size), - dim3(BlockSize), - 0, - arg.p_a_grid_imag_, - arg.p_b_grid_imag_, - arg.p_aux_grid_, - arg.a_element_op_, - arg.b_element_op_, - arg.c_element_op_, - arg.a_grid_desc_ak0_m_ak1_, - arg.b_grid_desc_bk0_n_bk1_, - arg.c_grid_desc_mblock_mperblock_nblock_nperblock_, - arg.block_2_ctile_map_); - - // // c_real = c_real - aux needed here!!! - - ave_time += - launch_and_time_kernel(kernel, - nrepeat, - dim3(grid_size), - dim3(BlockSize), - 0, - arg.p_a_grid_real_, - arg.p_b_grid_imag_, - arg.p_c_grid_imag_, - arg.a_element_op_, - arg.b_element_op_, - arg.c_element_op_, - arg.a_grid_desc_ak0_m_ak1_, - arg.b_grid_desc_bk0_n_bk1_, - arg.c_grid_desc_mblock_mperblock_nblock_nperblock_, - arg.block_2_ctile_map_); - - ave_time += - launch_and_time_kernel(kernel, - nrepeat, - dim3(grid_size), - dim3(BlockSize), - 0, - arg.p_a_grid_imag_, - arg.p_b_grid_real_, - arg.p_aux_grid_, - arg.a_element_op_, - arg.b_element_op_, - arg.c_element_op_, - arg.a_grid_desc_ak0_m_ak1_, - arg.b_grid_desc_bk0_n_bk1_, - arg.c_grid_desc_mblock_mperblock_nblock_nperblock_, - arg.block_2_ctile_map_); - - // c_imag = c_imag + aux needed here!!! - } + // c_imag = c_imag + aux needed here!!! } else { @@ -634,155 +567,87 @@ struct DeviceCGemm_4Gemm_Xdl_CShuffle typename GridwiseGemm::DefaultBlock2CTileMap, false>; - if(nrepeat == 0) - { - launch_kernel(kernel, - dim3(grid_size), - dim3(BlockSize), - 0, - arg.p_a_grid_real_, - arg.p_b_grid_real_, - arg.p_c_grid_real_, - arg.a_element_op_, - arg.b_element_op_, - arg.c_element_op_, - arg.a_grid_desc_ak0_m_ak1_, - arg.b_grid_desc_bk0_n_bk1_, - arg.c_grid_desc_mblock_mperblock_nblock_nperblock_, - arg.block_2_ctile_map_); + ave_time += + launch_and_time_kernel(stream_config, + kernel, + dim3(grid_size), + dim3(BlockSize), + 0, + arg.p_a_grid_real_, + arg.p_b_grid_real_, + arg.p_c_grid_real_, + arg.a_element_op_, + arg.b_element_op_, + arg.c_element_op_, + arg.a_grid_desc_ak0_m_ak1_, + arg.b_grid_desc_bk0_n_bk1_, + arg.c_grid_desc_mblock_mperblock_nblock_nperblock_, + arg.block_2_ctile_map_); - launch_kernel(kernel, - dim3(grid_size), - dim3(BlockSize), - 0, - arg.p_a_grid_imag_, - arg.p_b_grid_imag_, - arg.p_aux_grid_, - arg.a_element_op_, - arg.b_element_op_, - arg.c_element_op_, - arg.a_grid_desc_ak0_m_ak1_, - arg.b_grid_desc_bk0_n_bk1_, - arg.c_grid_desc_mblock_mperblock_nblock_nperblock_, - arg.block_2_ctile_map_); + ave_time += + launch_and_time_kernel(stream_config, + kernel, + dim3(grid_size), + dim3(BlockSize), + 0, + arg.p_a_grid_imag_, + arg.p_b_grid_imag_, + arg.p_aux_grid_, + arg.a_element_op_, + arg.b_element_op_, + arg.c_element_op_, + arg.a_grid_desc_ak0_m_ak1_, + arg.b_grid_desc_bk0_n_bk1_, + arg.c_grid_desc_mblock_mperblock_nblock_nperblock_, + arg.block_2_ctile_map_); - // // c_real = c_real - aux needed here!!! + // // c_real = c_real - aux needed here!!! - launch_kernel(kernel, - dim3(grid_size), - dim3(BlockSize), - 0, - arg.p_a_grid_real_, - arg.p_b_grid_imag_, - arg.p_c_grid_imag_, - arg.a_element_op_, - arg.b_element_op_, - arg.c_element_op_, - arg.a_grid_desc_ak0_m_ak1_, - arg.b_grid_desc_bk0_n_bk1_, - arg.c_grid_desc_mblock_mperblock_nblock_nperblock_, - arg.block_2_ctile_map_); + ave_time += + launch_and_time_kernel(stream_config, + kernel, + dim3(grid_size), + dim3(BlockSize), + 0, + arg.p_a_grid_real_, + arg.p_b_grid_imag_, + arg.p_c_grid_imag_, + arg.a_element_op_, + arg.b_element_op_, + arg.c_element_op_, + arg.a_grid_desc_ak0_m_ak1_, + arg.b_grid_desc_bk0_n_bk1_, + arg.c_grid_desc_mblock_mperblock_nblock_nperblock_, + arg.block_2_ctile_map_); - launch_kernel(kernel, - dim3(grid_size), - dim3(BlockSize), - 0, - arg.p_a_grid_imag_, - arg.p_b_grid_real_, - arg.p_aux_grid_, - arg.a_element_op_, - arg.b_element_op_, - arg.c_element_op_, - arg.a_grid_desc_ak0_m_ak1_, - arg.b_grid_desc_bk0_n_bk1_, - arg.c_grid_desc_mblock_mperblock_nblock_nperblock_, - arg.block_2_ctile_map_); + ave_time += + launch_and_time_kernel(stream_config, + kernel, + dim3(grid_size), + dim3(BlockSize), + 0, + arg.p_a_grid_imag_, + arg.p_b_grid_real_, + arg.p_aux_grid_, + arg.a_element_op_, + arg.b_element_op_, + arg.c_element_op_, + arg.a_grid_desc_ak0_m_ak1_, + arg.b_grid_desc_bk0_n_bk1_, + arg.c_grid_desc_mblock_mperblock_nblock_nperblock_, + arg.block_2_ctile_map_); - // c_imag = c_imag + aux needed here!!! - } - else - { - ave_time += - launch_and_time_kernel(kernel, - nrepeat, - dim3(grid_size), - dim3(BlockSize), - 0, - arg.p_a_grid_real_, - arg.p_b_grid_real_, - arg.p_c_grid_real_, - arg.a_element_op_, - arg.b_element_op_, - arg.c_element_op_, - arg.a_grid_desc_ak0_m_ak1_, - arg.b_grid_desc_bk0_n_bk1_, - arg.c_grid_desc_mblock_mperblock_nblock_nperblock_, - arg.block_2_ctile_map_); - - ave_time += - launch_and_time_kernel(kernel, - nrepeat, - dim3(grid_size), - dim3(BlockSize), - 0, - arg.p_a_grid_imag_, - arg.p_b_grid_imag_, - arg.p_aux_grid_, - arg.a_element_op_, - arg.b_element_op_, - arg.c_element_op_, - arg.a_grid_desc_ak0_m_ak1_, - arg.b_grid_desc_bk0_n_bk1_, - arg.c_grid_desc_mblock_mperblock_nblock_nperblock_, - arg.block_2_ctile_map_); - - // c_real = c_real - aux needed here!!! - - ave_time += - launch_and_time_kernel(kernel, - nrepeat, - dim3(grid_size), - dim3(BlockSize), - 0, - arg.p_a_grid_real_, - arg.p_b_grid_imag_, - arg.p_c_grid_imag_, - arg.a_element_op_, - arg.b_element_op_, - arg.c_element_op_, - arg.a_grid_desc_ak0_m_ak1_, - arg.b_grid_desc_bk0_n_bk1_, - arg.c_grid_desc_mblock_mperblock_nblock_nperblock_, - arg.block_2_ctile_map_); - - ave_time += - launch_and_time_kernel(kernel, - nrepeat, - dim3(grid_size), - dim3(BlockSize), - 0, - arg.p_a_grid_imag_, - arg.p_b_grid_real_, - arg.p_aux_grid_, - arg.a_element_op_, - arg.b_element_op_, - arg.c_element_op_, - arg.a_grid_desc_ak0_m_ak1_, - arg.b_grid_desc_bk0_n_bk1_, - arg.c_grid_desc_mblock_mperblock_nblock_nperblock_, - arg.block_2_ctile_map_); - - // c_imag = c_imag + aux needed here!!! - } + // c_imag = c_imag + aux needed here!!! } return ave_time; } // polymorphic - float Run(const BaseArgument* p_arg, int nrepeat = 1) override + float Run(const BaseArgument* p_arg, + const StreamConfig& stream_config = StreamConfig{}) override { - return Run(*dynamic_cast(p_arg), nrepeat); + return Run(*dynamic_cast(p_arg), stream_config); } }; diff --git a/include/ck/tensor_operation/gpu/device/device_conv2d_backward_weight_xdl_c_shuffle_nhwc_kyxc_nhwk.hpp b/include/ck/tensor_operation/gpu/device/device_conv2d_backward_weight_xdl_c_shuffle_nhwc_kyxc_nhwk.hpp index 466e6ad89f..c36227083c 100644 --- a/include/ck/tensor_operation/gpu/device/device_conv2d_backward_weight_xdl_c_shuffle_nhwc_kyxc_nhwk.hpp +++ b/include/ck/tensor_operation/gpu/device/device_conv2d_backward_weight_xdl_c_shuffle_nhwc_kyxc_nhwk.hpp @@ -415,9 +415,10 @@ struct DeviceConv2dBwdWeightXdl_C_Shuffle_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_ << arg.c_grid_desc_m_n_.GetLength(I1) << "}" << std::endl; } - float Run(const Argument& arg, int nrepeat = 1) + float Run(const Argument& arg, const StreamConfig& stream_config = StreamConfig{}) { ShowInfo(arg); + if(!GridwiseGemm::CheckValidity(arg.a_grid_desc_kbatch_k0_m_k1_, arg.b_grid_desc_kbatch_k0_n_k1_, arg.c_grid_desc_m_n_, @@ -437,49 +438,27 @@ struct DeviceConv2dBwdWeightXdl_C_Shuffle_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_ float ave_time = 0; const auto Run = [&](const auto& kernel) { - if(nrepeat > 0) - { - ave_time = - launch_and_time_kernel(kernel, - nrepeat, - dim3(grid_size), - dim3(BlockSize), - 0, - arg.p_a_grid_, - arg.p_b_grid_, - arg.p_c_grid_, - arg.a_grid_desc_kbatch_k0_m_k1_, - arg.b_grid_desc_kbatch_k0_n_k1_, - arg.c_grid_desc_mblock_mperblock_nblock_nperblock_, - arg.a_element_op_, - arg.b_element_op_, - arg.c_element_op_, - arg.block_2_ctile_map_); - } + hipGetErrorString(hipMemset( + arg.p_c_grid_, + 0, + arg.c_grid_desc_mblock_mperblock_nblock_nperblock_.GetElementSpaceSize() * + sizeof(CDataType))); - if(kbatch > 1 || nrepeat <= 0) - { - hipGetErrorString(hipMemset( - arg.p_c_grid_, - 0, - arg.c_grid_desc_mblock_mperblock_nblock_nperblock_.GetElementSpaceSize() * - sizeof(CDataType))); - - launch_kernel(kernel, - dim3(grid_size), - dim3(BlockSize), - 0, - arg.p_a_grid_, - arg.p_b_grid_, - arg.p_c_grid_, - arg.a_grid_desc_kbatch_k0_m_k1_, - arg.b_grid_desc_kbatch_k0_n_k1_, - arg.c_grid_desc_mblock_mperblock_nblock_nperblock_, - arg.a_element_op_, - arg.b_element_op_, - arg.c_element_op_, - arg.block_2_ctile_map_); - } + launch_and_time_kernel(stream_config, + kernel, + dim3(grid_size), + dim3(BlockSize), + 0, + arg.p_a_grid_, + arg.p_b_grid_, + arg.p_c_grid_, + arg.a_grid_desc_kbatch_k0_m_k1_, + arg.b_grid_desc_kbatch_k0_n_k1_, + arg.c_grid_desc_mblock_mperblock_nblock_nperblock_, + arg.a_element_op_, + arg.b_element_op_, + arg.c_element_op_, + arg.block_2_ctile_map_); }; if(has_main_k0_block_loop) @@ -560,9 +539,10 @@ struct DeviceConv2dBwdWeightXdl_C_Shuffle_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_ return ave_time; } - float Run(const BaseArgument* p_arg, int nrepeat = 1) override + float Run(const BaseArgument* p_arg, + const StreamConfig& stream_config = StreamConfig{}) override { - return Run(*dynamic_cast(p_arg), nrepeat); + return Run(*dynamic_cast(p_arg), stream_config); } }; diff --git a/include/ck/tensor_operation/gpu/device/device_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk.hpp b/include/ck/tensor_operation/gpu/device/device_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk.hpp index fad4ec1ffa..def6af74ac 100644 --- a/include/ck/tensor_operation/gpu/device/device_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk.hpp +++ b/include/ck/tensor_operation/gpu/device/device_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk.hpp @@ -531,7 +531,7 @@ struct DeviceConv2dBwdDataXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K { using Argument = DeviceOp::Argument; - float Run(const Argument& arg, int nrepeat = 1) + float Run(const Argument& arg, const StreamConfig& stream_config = StreamConfig{}) { float ave_time = 0; for(size_t i = 0; i < arg.a_grid_desc_k0_m_k1_container_.size(); i++) @@ -602,8 +602,8 @@ struct DeviceConv2dBwdDataXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K true>; ave_time += launch_and_time_kernel( + stream_config, kernel, - nrepeat, dim3(grid_size), dim3(BlockSize), 0, @@ -635,8 +635,8 @@ struct DeviceConv2dBwdDataXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K false>; ave_time += launch_and_time_kernel( + stream_config, kernel, - nrepeat, dim3(grid_size), dim3(BlockSize), 0, @@ -655,9 +655,10 @@ struct DeviceConv2dBwdDataXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K return ave_time; } - float Run(const BaseArgument* p_arg, int nrepeat = 1) override + float Run(const BaseArgument* p_arg, + const StreamConfig& stream_config = StreamConfig{}) override { - return Run(*dynamic_cast(p_arg), nrepeat); + return Run(*dynamic_cast(p_arg), stream_config); } }; diff --git a/include/ck/tensor_operation/gpu/device/device_conv2d_fwd_xdl_c_shuffle_bias_activation_add_nhwc_kyxc_nhwk.hpp b/include/ck/tensor_operation/gpu/device/device_conv2d_fwd_xdl_c_shuffle_bias_activation_add_nhwc_kyxc_nhwk.hpp index 6648929cd5..fd95c184ca 100644 --- a/include/ck/tensor_operation/gpu/device/device_conv2d_fwd_xdl_c_shuffle_bias_activation_add_nhwc_kyxc_nhwk.hpp +++ b/include/ck/tensor_operation/gpu/device/device_conv2d_fwd_xdl_c_shuffle_bias_activation_add_nhwc_kyxc_nhwk.hpp @@ -642,7 +642,7 @@ struct { using Argument = DeviceOp::Argument; - float Run(const Argument& arg, int nrepeat = 1) + float Run(const Argument& arg, const StreamConfig& stream_config = StreamConfig{}) { #if 0 { @@ -727,8 +727,8 @@ struct true>; ave_time = launch_and_time_kernel( + stream_config, kernel, - nrepeat, dim3(grid_size), dim3(BlockSize), 0, @@ -771,8 +771,8 @@ struct false>; ave_time = launch_and_time_kernel( + stream_config, kernel, - nrepeat, dim3(grid_size), dim3(BlockSize), 0, @@ -795,9 +795,10 @@ struct return ave_time; } - float Run(const BaseArgument* p_arg, int nrepeat = 1) override + float Run(const BaseArgument* p_arg, + const StreamConfig& stream_config = StreamConfig{}) override { - return Run(*dynamic_cast(p_arg), nrepeat); + return Run(*dynamic_cast(p_arg), stream_config); } }; diff --git a/include/ck/tensor_operation/gpu/device/device_conv2d_fwd_xdl_c_shuffle_bias_activation_nhwc_kyxc_nhwk.hpp b/include/ck/tensor_operation/gpu/device/device_conv2d_fwd_xdl_c_shuffle_bias_activation_nhwc_kyxc_nhwk.hpp index fd0941420c..61c91c0b76 100644 --- a/include/ck/tensor_operation/gpu/device/device_conv2d_fwd_xdl_c_shuffle_bias_activation_nhwc_kyxc_nhwk.hpp +++ b/include/ck/tensor_operation/gpu/device/device_conv2d_fwd_xdl_c_shuffle_bias_activation_nhwc_kyxc_nhwk.hpp @@ -605,7 +605,7 @@ struct DeviceConv2dFwdXdl_C_Shuffle_Bias_Activation_Input_N_Hi_Wi_C_Weight_K_Y_X { using Argument = DeviceOp::Argument; - float Run(const Argument& arg, int nrepeat = 1) + float Run(const Argument& arg, const StreamConfig& stream_config = StreamConfig{}) { #if 0 { @@ -684,8 +684,8 @@ struct DeviceConv2dFwdXdl_C_Shuffle_Bias_Activation_Input_N_Hi_Wi_C_Weight_K_Y_X true>; ave_time = launch_and_time_kernel( + stream_config, kernel, - nrepeat, dim3(grid_size), dim3(BlockSize), 0, @@ -723,8 +723,8 @@ struct DeviceConv2dFwdXdl_C_Shuffle_Bias_Activation_Input_N_Hi_Wi_C_Weight_K_Y_X false>; ave_time = launch_and_time_kernel( + stream_config, kernel, - nrepeat, dim3(grid_size), dim3(BlockSize), 0, @@ -745,9 +745,10 @@ struct DeviceConv2dFwdXdl_C_Shuffle_Bias_Activation_Input_N_Hi_Wi_C_Weight_K_Y_X return ave_time; } - float Run(const BaseArgument* p_arg, int nrepeat = 1) override + float Run(const BaseArgument* p_arg, + const StreamConfig& stream_config = StreamConfig{}) override { - return Run(*dynamic_cast(p_arg), nrepeat); + return Run(*dynamic_cast(p_arg), stream_config); } }; diff --git a/include/ck/tensor_operation/gpu/device/device_conv2d_fwd_xdl_c_shuffle_nhwc_kyxc_nhwk.hpp b/include/ck/tensor_operation/gpu/device/device_conv2d_fwd_xdl_c_shuffle_nhwc_kyxc_nhwk.hpp index b508606a75..f4cddc1946 100644 --- a/include/ck/tensor_operation/gpu/device/device_conv2d_fwd_xdl_c_shuffle_nhwc_kyxc_nhwk.hpp +++ b/include/ck/tensor_operation/gpu/device/device_conv2d_fwd_xdl_c_shuffle_nhwc_kyxc_nhwk.hpp @@ -568,7 +568,7 @@ struct DeviceConv2dFwdXdl_C_Shuffle_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_W { using Argument = DeviceOp::Argument; - float Run(const Argument& arg, int nrepeat = 1) + float Run(const Argument& arg, const StreamConfig& stream_config = StreamConfig{}) { #if 0 { @@ -663,8 +663,8 @@ struct DeviceConv2dFwdXdl_C_Shuffle_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_W true>; ave_time = launch_and_time_kernel( + stream_config, kernel, - nrepeat, dim3(grid_size), dim3(BlockSize), 0, @@ -697,8 +697,8 @@ struct DeviceConv2dFwdXdl_C_Shuffle_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_W false>; ave_time = launch_and_time_kernel( + stream_config, kernel, - nrepeat, dim3(grid_size), dim3(BlockSize), 0, @@ -717,9 +717,10 @@ struct DeviceConv2dFwdXdl_C_Shuffle_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_W return ave_time; } - float Run(const BaseArgument* p_arg, int nrepeat = 1) override + float Run(const BaseArgument* p_arg, + const StreamConfig& stream_config = StreamConfig{}) override { - return Run(*dynamic_cast(p_arg), nrepeat); + return Run(*dynamic_cast(p_arg), stream_config); } }; diff --git a/include/ck/tensor_operation/gpu/device/device_conv2d_fwd_xdl_nhwc_kyxc_nhwk.hpp b/include/ck/tensor_operation/gpu/device/device_conv2d_fwd_xdl_nhwc_kyxc_nhwk.hpp index 3574f7667e..aa9229f7cb 100644 --- a/include/ck/tensor_operation/gpu/device/device_conv2d_fwd_xdl_nhwc_kyxc_nhwk.hpp +++ b/include/ck/tensor_operation/gpu/device/device_conv2d_fwd_xdl_nhwc_kyxc_nhwk.hpp @@ -450,7 +450,7 @@ struct DeviceConv2dFwdXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K { using Argument = DeviceOp::Argument; - float Run(const Argument& arg, int nrepeat = 1) + float Run(const Argument& arg, const StreamConfig& stream_config = StreamConfig{}) { #if 0 { @@ -498,8 +498,8 @@ struct DeviceConv2dFwdXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K remove_reference_t, true>; - ave_time = launch_and_time_kernel(kernel, - nrepeat, + ave_time = launch_and_time_kernel(stream_config, + kernel, dim3(grid_size), dim3(BlockSize), 0, @@ -529,8 +529,8 @@ struct DeviceConv2dFwdXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K remove_reference_t, false>; - ave_time = launch_and_time_kernel(kernel, - nrepeat, + ave_time = launch_and_time_kernel(stream_config, + kernel, dim3(grid_size), dim3(BlockSize), 0, @@ -549,9 +549,10 @@ struct DeviceConv2dFwdXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K return ave_time; } - float Run(const BaseArgument* p_arg, int nrepeat = 1) override + float Run(const BaseArgument* p_arg, + const StreamConfig& stream_config = StreamConfig{}) override { - return Run(*dynamic_cast(p_arg), nrepeat); + return Run(*dynamic_cast(p_arg), stream_config); } }; diff --git a/include/ck/tensor_operation/gpu/device/device_conv3d_fwd_naive_ndhwc_kzyxc_ndhwk.hpp b/include/ck/tensor_operation/gpu/device/device_conv3d_fwd_naive_ndhwc_kzyxc_ndhwk.hpp index c3ebe58865..b1eea0b33f 100644 --- a/include/ck/tensor_operation/gpu/device/device_conv3d_fwd_naive_ndhwc_kzyxc_ndhwk.hpp +++ b/include/ck/tensor_operation/gpu/device/device_conv3d_fwd_naive_ndhwc_kzyxc_ndhwk.hpp @@ -4,7 +4,7 @@ #include #include #include -#include "conv_fwd_util.hpp" +#include "conv_util.hpp" #include "device.hpp" #include "device_conv_fwd.hpp" #include "common_header.hpp" @@ -92,7 +92,7 @@ struct DeviceConv3dFwdNaive_Input_N_Di_Hi_Wi_C_Weight_K_Z_Y_X_C_Output_N_Do_Ho_W { using Argument = DeviceOp::Argument; - float Run(const Argument& arg, int nrepeat = 1) + float Run(const Argument& arg, const StreamConfig& stream_config = StreamConfig{}) { const auto naive_conv3d_fwd = ref::naive_conv_fwd_ndhwc_kzyxc_ndhwk; - float ave_time = launch_and_time_kernel(naive_conv3d_fwd, - nrepeat, + float ave_time = launch_and_time_kernel(stream_config, + naive_conv3d_fwd, dim3(256), dim3(256), 0, @@ -137,9 +137,10 @@ struct DeviceConv3dFwdNaive_Input_N_Di_Hi_Wi_C_Weight_K_Z_Y_X_C_Output_N_Do_Ho_W } // polymorphic - float Run(const BaseArgument* p_arg, int nrepeat = 1) override + float Run(const BaseArgument* p_arg, + const StreamConfig& stream_config = StreamConfig{}) override { - return Run(*dynamic_cast(p_arg), nrepeat); + return Run(*dynamic_cast(p_arg), stream_config); } }; diff --git a/include/ck/tensor_operation/gpu/device/device_conv3d_fwd_xdl_ndhwc_kzyxc_ndhwk.hpp b/include/ck/tensor_operation/gpu/device/device_conv3d_fwd_xdl_ndhwc_kzyxc_ndhwk.hpp index ff30a6880d..0f98ba054d 100644 --- a/include/ck/tensor_operation/gpu/device/device_conv3d_fwd_xdl_ndhwc_kzyxc_ndhwk.hpp +++ b/include/ck/tensor_operation/gpu/device/device_conv3d_fwd_xdl_ndhwc_kzyxc_ndhwk.hpp @@ -438,7 +438,7 @@ struct DeviceConv3dFwdXdl_Input_N_Di_Hi_Wi_C_Weight_K_Z_Y_X_C_Output_N_Do_Ho_Wo_ { using Argument = DeviceOp::Argument; - float Run(const Argument& arg, int nrepeat = 1) + float Run(const Argument& arg, const StreamConfig& stream_config = StreamConfig{}) { { std::cout << "num_batches_of_GEMM = " << arg.num_subbatches_ << std::endl; @@ -487,8 +487,8 @@ struct DeviceConv3dFwdXdl_Input_N_Di_Hi_Wi_C_Weight_K_Z_Y_X_C_Output_N_Do_Ho_Wo_ OutElementwiseOperation, remove_reference_t, true>; - ave_time = launch_and_time_kernel(kernel, - nrepeat, + ave_time = launch_and_time_kernel(stream_config, + kernel, dim3(grid_size), dim3(BlockSize), 0, @@ -522,8 +522,8 @@ struct DeviceConv3dFwdXdl_Input_N_Di_Hi_Wi_C_Weight_K_Z_Y_X_C_Output_N_Do_Ho_Wo_ remove_reference_t, false>; - ave_time = launch_and_time_kernel(kernel, - nrepeat, + ave_time = launch_and_time_kernel(stream_config, + kernel, dim3(grid_size), dim3(BlockSize), 0, @@ -547,9 +547,10 @@ struct DeviceConv3dFwdXdl_Input_N_Di_Hi_Wi_C_Weight_K_Z_Y_X_C_Output_N_Do_Ho_Wo_ } // polymorphic - float Run(const BaseArgument* p_arg, int nrepeat = 1) override + float Run(const BaseArgument* p_arg, + const StreamConfig& stream_config = StreamConfig{}) override { - return Run(*dynamic_cast(p_arg), nrepeat); + return Run(*dynamic_cast(p_arg), stream_config); } }; diff --git a/include/ck/tensor_operation/gpu/device/device_convnd_bwd_data_xdl_ndhwc_kzyxc_ndhwk.hpp b/include/ck/tensor_operation/gpu/device/device_convnd_bwd_data_xdl_ndhwc_kzyxc_ndhwk.hpp index 5dca8f9629..209b3c866e 100644 --- a/include/ck/tensor_operation/gpu/device/device_convnd_bwd_data_xdl_ndhwc_kzyxc_ndhwk.hpp +++ b/include/ck/tensor_operation/gpu/device/device_convnd_bwd_data_xdl_ndhwc_kzyxc_ndhwk.hpp @@ -1241,7 +1241,7 @@ struct DeviceConvndBwdDataXdl_Input_N_Di_Hi_Wi_C_Weight_K_Z_Y_X_C_Output_N_Do_Ho { using Argument = DeviceOp::Argument; - float Run(const Argument& arg, int nrepeat = 1) + float Run(const Argument& arg, const StreamConfig& stream_config = StreamConfig{}) { float ave_time = 0; for(size_t i = 0; i < arg.a_grid_desc_k0_m_k1_container_.size(); i++) @@ -1316,8 +1316,8 @@ struct DeviceConvndBwdDataXdl_Input_N_Di_Hi_Wi_C_Weight_K_Z_Y_X_C_Output_N_Do_Ho true>; ave_time += launch_and_time_kernel( + stream_config, kernel, - nrepeat, dim3(grid_size), dim3(BlockSize), 0, @@ -1349,8 +1349,8 @@ struct DeviceConvndBwdDataXdl_Input_N_Di_Hi_Wi_C_Weight_K_Z_Y_X_C_Output_N_Do_Ho false>; ave_time += launch_and_time_kernel( + stream_config, kernel, - nrepeat, dim3(grid_size), dim3(BlockSize), 0, @@ -1369,9 +1369,10 @@ struct DeviceConvndBwdDataXdl_Input_N_Di_Hi_Wi_C_Weight_K_Z_Y_X_C_Output_N_Do_Ho return ave_time; } - float Run(const BaseArgument* p_arg, int nrepeat = 1) override + float Run(const BaseArgument* p_arg, + const StreamConfig& stream_config = StreamConfig{}) override { - return Run(*dynamic_cast(p_arg), nrepeat); + return Run(*dynamic_cast(p_arg), stream_config); } }; diff --git a/include/ck/tensor_operation/gpu/device/device_convnd_fwd_xdl_nhwc_kyxc_nhwk.hpp b/include/ck/tensor_operation/gpu/device/device_convnd_fwd_xdl_nhwc_kyxc_nhwk.hpp index 7365f9a3e2..4251052a99 100644 --- a/include/ck/tensor_operation/gpu/device/device_convnd_fwd_xdl_nhwc_kyxc_nhwk.hpp +++ b/include/ck/tensor_operation/gpu/device/device_convnd_fwd_xdl_nhwc_kyxc_nhwk.hpp @@ -747,7 +747,7 @@ struct DeviceConvNDFwdXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K { using Argument = DeviceOp::Argument; - float Run(const Argument& arg, int nrepeat = 1) + float Run(const Argument& arg, const StreamConfig& stream_config = StreamConfig{}) { #if 0 { @@ -795,8 +795,8 @@ struct DeviceConvNDFwdXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K remove_reference_t, true>; - ave_time = launch_and_time_kernel(kernel, - nrepeat, + ave_time = launch_and_time_kernel(stream_config, + kernel, dim3(grid_size), dim3(BlockSize), 0, @@ -826,8 +826,8 @@ struct DeviceConvNDFwdXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K remove_reference_t, false>; - ave_time = launch_and_time_kernel(kernel, - nrepeat, + ave_time = launch_and_time_kernel(stream_config, + kernel, dim3(grid_size), dim3(BlockSize), 0, @@ -846,9 +846,10 @@ struct DeviceConvNDFwdXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K return ave_time; } - float Run(const BaseArgument* p_arg, int nrepeat = 1) override + float Run(const BaseArgument* p_arg, + const StreamConfig& stream_config = StreamConfig{}) override { - return Run(*dynamic_cast(p_arg), nrepeat); + return Run(*dynamic_cast(p_arg), stream_config); } }; diff --git a/include/ck/tensor_operation/gpu/device/device_gemm_reduce_xdl_cshuffle.hpp b/include/ck/tensor_operation/gpu/device/device_gemm_reduce_xdl_cshuffle.hpp index 1a3fbdf956..69c29b72d3 100644 --- a/include/ck/tensor_operation/gpu/device/device_gemm_reduce_xdl_cshuffle.hpp +++ b/include/ck/tensor_operation/gpu/device/device_gemm_reduce_xdl_cshuffle.hpp @@ -14,6 +14,9 @@ namespace ck { namespace tensor_operation { namespace device { +// Note: inter-wave loop scheduler is rolled out to c-shuffle version first. Becuase non c-shuffle +// version currently has compiler issues with register spill which further causes validation +// failures. template + index_t CReduceThreadVgpr2GlobalCopySrcDstScalarPerVector_MPerBlock, + LoopScheduler LoopSched = make_default_loop_scheduler()> struct DeviceGemmReduce_Xdl_CShuffle : public DeviceGemmReduce; + CReduceThreadVgpr2GlobalCopySrcDstScalarPerVector_MPerBlock, + LoopSched>; // Argument struct Argument : public BaseArgument @@ -498,7 +503,7 @@ struct DeviceGemmReduce_Xdl_CShuffle : public DeviceGemmReduce; - launch_kernel(kernel, - dim3(grid_size), - dim3(BlockSize), - 0, - arg.p_a_grid_, - arg.p_b_grid_, - arg.p_c_grid_, - arg.p_d0_grid_, - arg.p_d1_grid_, - arg.a_element_op_, - arg.b_element_op_, - arg.c_element_op_, - arg.d1_element_op_, - arg.a_grid_desc_ak0_m_ak1_, - arg.b_grid_desc_bk0_n_bk1_, - arg.c_grid_desc_mblock_mperblock_nblock_nperblock_, - arg.d_grid_desc_mblock_mperblock_, - arg.block_2_ctile_map_); + elapsed_time = + launch_and_time_kernel(stream_config, + kernel, + dim3(grid_size), + dim3(BlockSize), + 0, + arg.p_a_grid_, + arg.p_b_grid_, + arg.p_c_grid_, + arg.p_d0_grid_, + arg.p_d1_grid_, + arg.a_element_op_, + arg.b_element_op_, + arg.c_element_op_, + arg.d1_element_op_, + arg.a_grid_desc_ak0_m_ak1_, + arg.b_grid_desc_bk0_n_bk1_, + arg.c_grid_desc_mblock_mperblock_nblock_nperblock_, + arg.d_grid_desc_mblock_mperblock_, + arg.block_2_ctile_map_); } else { @@ -586,33 +594,36 @@ struct DeviceGemmReduce_Xdl_CShuffle : public DeviceGemmReduce; - launch_kernel(kernel, - dim3(grid_size), - dim3(BlockSize), - 0, - arg.p_a_grid_, - arg.p_b_grid_, - arg.p_c_grid_, - arg.p_d0_grid_, - arg.p_d1_grid_, - arg.a_element_op_, - arg.b_element_op_, - arg.c_element_op_, - arg.d1_element_op_, - arg.a_grid_desc_ak0_m_ak1_, - arg.b_grid_desc_bk0_n_bk1_, - arg.c_grid_desc_mblock_mperblock_nblock_nperblock_, - arg.d_grid_desc_mblock_mperblock_, - arg.block_2_ctile_map_); + elapsed_time = + launch_and_time_kernel(stream_config, + kernel, + dim3(grid_size), + dim3(BlockSize), + 0, + arg.p_a_grid_, + arg.p_b_grid_, + arg.p_c_grid_, + arg.p_d0_grid_, + arg.p_d1_grid_, + arg.a_element_op_, + arg.b_element_op_, + arg.c_element_op_, + arg.d1_element_op_, + arg.a_grid_desc_ak0_m_ak1_, + arg.b_grid_desc_bk0_n_bk1_, + arg.c_grid_desc_mblock_mperblock_nblock_nperblock_, + arg.d_grid_desc_mblock_mperblock_, + arg.block_2_ctile_map_); } - return 0; + return elapsed_time; } // polymorphic - float Run(const BaseArgument* p_arg, int nrepeat = 1) override + float Run(const BaseArgument* p_arg, + const StreamConfig& stream_config = StreamConfig{}) override { - return Run(*dynamic_cast(p_arg), nrepeat); + return Run(*dynamic_cast(p_arg), stream_config); } }; diff --git a/include/ck/tensor_operation/gpu/device/device_gemm_xdl.hpp b/include/ck/tensor_operation/gpu/device/device_gemm_xdl.hpp index 47997cd802..2bb7f6e78a 100644 --- a/include/ck/tensor_operation/gpu/device/device_gemm_xdl.hpp +++ b/include/ck/tensor_operation/gpu/device/device_gemm_xdl.hpp @@ -290,7 +290,7 @@ struct DeviceGemmXdl { using Argument = DeviceGemmXdl::Argument; - float Run(const Argument& arg, int nrepeat = 1) + float Run(const Argument& arg, const StreamConfig& stream_config = StreamConfig{}) { #if 0 { @@ -339,8 +339,8 @@ struct DeviceGemmXdl remove_reference_t, true>; - ave_time = launch_and_time_kernel(kernel, - nrepeat, + ave_time = launch_and_time_kernel(stream_config, + kernel, dim3(grid_size), dim3(BlockSize), 0, @@ -370,8 +370,8 @@ struct DeviceGemmXdl remove_reference_t, false>; - ave_time = launch_and_time_kernel(kernel, - nrepeat, + ave_time = launch_and_time_kernel(stream_config, + kernel, dim3(grid_size), dim3(BlockSize), 0, @@ -391,9 +391,10 @@ struct DeviceGemmXdl } // polymorphic - float Run(const BaseArgument* p_arg, int nrepeat = 1) override + float Run(const BaseArgument* p_arg, + const StreamConfig& stream_config = StreamConfig{}) override { - return Run(*dynamic_cast(p_arg), nrepeat); + return Run(*dynamic_cast(p_arg), stream_config); } }; diff --git a/include/ck/tensor_operation/gpu/device/device_gemm_xdl_c_shuffle_bias_2d.hpp b/include/ck/tensor_operation/gpu/device/device_gemm_xdl_c_shuffle_bias_2d.hpp index 4010965312..315f39d9bf 100644 --- a/include/ck/tensor_operation/gpu/device/device_gemm_xdl_c_shuffle_bias_2d.hpp +++ b/include/ck/tensor_operation/gpu/device/device_gemm_xdl_c_shuffle_bias_2d.hpp @@ -264,7 +264,7 @@ struct DeviceGemmXdl_C_Shuffle_Bias_2d { using Argument = DeviceGemmXdl_C_Shuffle_Bias_2d::Argument; - float Run(const Argument& arg, int nrepeat = 1) + float Run(const Argument& arg, const StreamConfig& stream_config = StreamConfig{}) { { std::cout << "arg.a_grid_desc_k0_m_k1_{" << arg.a_grid_desc_k0_m_k1_.GetLength(I0) @@ -320,8 +320,8 @@ struct DeviceGemmXdl_C_Shuffle_Bias_2d true>; ave_time = launch_and_time_kernel( + stream_config, kernel, - nrepeat, dim3(grid_size), dim3(BlockSize), 0, @@ -359,8 +359,8 @@ struct DeviceGemmXdl_C_Shuffle_Bias_2d false>; ave_time = launch_and_time_kernel( + stream_config, kernel, - nrepeat, dim3(grid_size), dim3(BlockSize), 0, @@ -382,9 +382,10 @@ struct DeviceGemmXdl_C_Shuffle_Bias_2d } // polymorphic - float Run(const BaseArgument* p_arg, int nrepeat = 1) override + float Run(const BaseArgument* p_arg, + const StreamConfig& stream_config = StreamConfig{}) override { - return Run(*dynamic_cast(p_arg), nrepeat); + return Run(*dynamic_cast(p_arg), stream_config); } }; diff --git a/include/ck/tensor_operation/gpu/device/device_gemm_xdl_c_shuffle_bias_activation.hpp b/include/ck/tensor_operation/gpu/device/device_gemm_xdl_c_shuffle_bias_activation.hpp index c65ff6022a..f1f9f41724 100644 --- a/include/ck/tensor_operation/gpu/device/device_gemm_xdl_c_shuffle_bias_activation.hpp +++ b/include/ck/tensor_operation/gpu/device/device_gemm_xdl_c_shuffle_bias_activation.hpp @@ -273,7 +273,7 @@ struct DeviceGemmXdl_C_Shuffle_Bias_Activation { using Argument = DeviceOp::Argument; - float Run(const Argument& arg, int nrepeat = 1) + float Run(const Argument& arg, const StreamConfig& stream_config = StreamConfig{}) { { std::cout << "arg.a_grid_desc_k0_m_k1_{" << arg.a_grid_desc_k0_m_k1_.GetLength(I0) @@ -329,8 +329,8 @@ struct DeviceGemmXdl_C_Shuffle_Bias_Activation true>; ave_time = launch_and_time_kernel( + stream_config, kernel, - nrepeat, dim3(grid_size), dim3(BlockSize), 0, @@ -368,8 +368,8 @@ struct DeviceGemmXdl_C_Shuffle_Bias_Activation false>; ave_time = launch_and_time_kernel( + stream_config, kernel, - nrepeat, dim3(grid_size), dim3(BlockSize), 0, @@ -391,9 +391,10 @@ struct DeviceGemmXdl_C_Shuffle_Bias_Activation } // polymorphic - float Run(const BaseArgument* p_arg, int nrepeat = 1) override + float Run(const BaseArgument* p_arg, + const StreamConfig& stream_config = StreamConfig{}) override { - return Run(*dynamic_cast(p_arg), nrepeat); + return Run(*dynamic_cast(p_arg), stream_config); } }; diff --git a/include/ck/tensor_operation/gpu/device/device_gemm_xdl_c_shuffle_bias_activation_add.hpp b/include/ck/tensor_operation/gpu/device/device_gemm_xdl_c_shuffle_bias_activation_add.hpp index 4a478c995d..e3d0986aba 100644 --- a/include/ck/tensor_operation/gpu/device/device_gemm_xdl_c_shuffle_bias_activation_add.hpp +++ b/include/ck/tensor_operation/gpu/device/device_gemm_xdl_c_shuffle_bias_activation_add.hpp @@ -312,7 +312,7 @@ struct DeviceGemmXdl_C_Shuffle_Bias_Activation_Add { using Argument = DeviceOp::Argument; - float Run(const Argument& arg, int nrepeat = 1) + float Run(const Argument& arg, const StreamConfig& stream_config = StreamConfig{}) { { std::cout << "arg.a_grid_desc_k0_m_k1_{" << arg.a_grid_desc_k0_m_k1_.GetLength(I0) @@ -374,8 +374,8 @@ struct DeviceGemmXdl_C_Shuffle_Bias_Activation_Add true>; ave_time = launch_and_time_kernel( + stream_config, kernel, - nrepeat, dim3(grid_size), dim3(BlockSize), 0, @@ -418,8 +418,8 @@ struct DeviceGemmXdl_C_Shuffle_Bias_Activation_Add false>; ave_time = launch_and_time_kernel( + stream_config, kernel, - nrepeat, dim3(grid_size), dim3(BlockSize), 0, @@ -443,9 +443,10 @@ struct DeviceGemmXdl_C_Shuffle_Bias_Activation_Add } // polymorphic - float Run(const BaseArgument* p_arg, int nrepeat = 1) override + float Run(const BaseArgument* p_arg, + const StreamConfig& stream_config = StreamConfig{}) override { - return Run(*dynamic_cast(p_arg), nrepeat); + return Run(*dynamic_cast(p_arg), stream_config); } }; diff --git a/include/ck/tensor_operation/gpu/device/device_gemm_xdl_cshuffle.hpp b/include/ck/tensor_operation/gpu/device/device_gemm_xdl_cshuffle.hpp index 440519537e..952630120a 100644 --- a/include/ck/tensor_operation/gpu/device/device_gemm_xdl_cshuffle.hpp +++ b/include/ck/tensor_operation/gpu/device/device_gemm_xdl_cshuffle.hpp @@ -14,6 +14,9 @@ namespace ck { namespace tensor_operation { namespace device { +// Note: inter-wave loop scheduler is rolled out to c-shuffle version first. Becuase non c-shuffle +// version currently has compiler issues with register spill which further causes validation +// failures. template + index_t CShuffleBlockTransferScalarPerVector_NPerBlock, + LoopScheduler LoopSched = make_default_loop_scheduler()> struct DeviceGemm_Xdl_CShuffle : public DeviceGemm { @@ -375,7 +379,8 @@ struct DeviceGemm_Xdl_CShuffle CShuffleMXdlPerWavePerShuffle, CShuffleNXdlPerWavePerShuffle, CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, - CShuffleBlockTransferScalarPerVector_NPerBlock>; + CShuffleBlockTransferScalarPerVector_NPerBlock, + LoopSched>; // Argument struct Argument : public BaseArgument @@ -435,7 +440,7 @@ struct DeviceGemm_Xdl_CShuffle { using Argument = DeviceOp::Argument; - float Run(const Argument& arg, int nrepeat = 1) + float Run(const Argument& arg, const StreamConfig& stream_config = StreamConfig{}) { #if 0 { @@ -482,42 +487,22 @@ struct DeviceGemm_Xdl_CShuffle typename GridwiseGemm::DefaultBlock2CTileMap, true>; - if(nrepeat == 0) - { - launch_kernel(kernel, - dim3(grid_size), - dim3(BlockSize), - 0, - arg.p_a_grid_, - arg.p_b_grid_, - arg.p_c_grid_, - arg.a_element_op_, - arg.b_element_op_, - arg.c_element_op_, - arg.a_grid_desc_ak0_m_ak1_, - arg.b_grid_desc_bk0_n_bk1_, - arg.c_grid_desc_mblock_mperblock_nblock_nperblock_, - arg.block_2_ctile_map_); - } - else - { - ave_time = - launch_and_time_kernel(kernel, - nrepeat, - dim3(grid_size), - dim3(BlockSize), - 0, - arg.p_a_grid_, - arg.p_b_grid_, - arg.p_c_grid_, - arg.a_element_op_, - arg.b_element_op_, - arg.c_element_op_, - arg.a_grid_desc_ak0_m_ak1_, - arg.b_grid_desc_bk0_n_bk1_, - arg.c_grid_desc_mblock_mperblock_nblock_nperblock_, - arg.block_2_ctile_map_); - } + ave_time = + launch_and_time_kernel(stream_config, + kernel, + dim3(grid_size), + dim3(BlockSize), + 0, + arg.p_a_grid_, + arg.p_b_grid_, + arg.p_c_grid_, + arg.a_element_op_, + arg.b_element_op_, + arg.c_element_op_, + arg.a_grid_desc_ak0_m_ak1_, + arg.b_grid_desc_bk0_n_bk1_, + arg.c_grid_desc_mblock_mperblock_nblock_nperblock_, + arg.block_2_ctile_map_); } else { @@ -533,52 +518,32 @@ struct DeviceGemm_Xdl_CShuffle typename GridwiseGemm::CGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock, typename GridwiseGemm::DefaultBlock2CTileMap, false>; - - if(nrepeat == 0) - { - launch_kernel(kernel, - dim3(grid_size), - dim3(BlockSize), - 0, - arg.p_a_grid_, - arg.p_b_grid_, - arg.p_c_grid_, - arg.a_element_op_, - arg.b_element_op_, - arg.c_element_op_, - arg.a_grid_desc_ak0_m_ak1_, - arg.b_grid_desc_bk0_n_bk1_, - arg.c_grid_desc_mblock_mperblock_nblock_nperblock_, - arg.block_2_ctile_map_); - } - else - { - ave_time = - launch_and_time_kernel(kernel, - nrepeat, - dim3(grid_size), - dim3(BlockSize), - 0, - arg.p_a_grid_, - arg.p_b_grid_, - arg.p_c_grid_, - arg.a_element_op_, - arg.b_element_op_, - arg.c_element_op_, - arg.a_grid_desc_ak0_m_ak1_, - arg.b_grid_desc_bk0_n_bk1_, - arg.c_grid_desc_mblock_mperblock_nblock_nperblock_, - arg.block_2_ctile_map_); - } + ave_time = + launch_and_time_kernel(stream_config, + kernel, + dim3(grid_size), + dim3(BlockSize), + 0, + arg.p_a_grid_, + arg.p_b_grid_, + arg.p_c_grid_, + arg.a_element_op_, + arg.b_element_op_, + arg.c_element_op_, + arg.a_grid_desc_ak0_m_ak1_, + arg.b_grid_desc_bk0_n_bk1_, + arg.c_grid_desc_mblock_mperblock_nblock_nperblock_, + arg.block_2_ctile_map_); } return ave_time; } // polymorphic - float Run(const BaseArgument* p_arg, int nrepeat = 1) override + float Run(const BaseArgument* p_arg, + const StreamConfig& stream_config = StreamConfig{}) override { - return Run(*dynamic_cast(p_arg), nrepeat); + return Run(*dynamic_cast(p_arg), stream_config); } }; diff --git a/include/ck/tensor_operation/gpu/device/device_gemm_xdl_splitk.hpp b/include/ck/tensor_operation/gpu/device/device_gemm_xdl_splitk.hpp index db6c884739..e603af1fba 100644 --- a/include/ck/tensor_operation/gpu/device/device_gemm_xdl_splitk.hpp +++ b/include/ck/tensor_operation/gpu/device/device_gemm_xdl_splitk.hpp @@ -385,8 +385,11 @@ struct DeviceGemmXdlSplitK std::cout << "arg.c_grid_desc_m_n_{ " << arg.c_grid_desc_m_n_.GetLength(I0) << ", " << arg.c_grid_desc_m_n_.GetLength(I1) << "}" << std::endl; } - float Run(const Argument& arg, int nrepeat = 1) + + float Run(const Argument& arg, const StreamConfig& stream_config = StreamConfig{}) { + ShowInfo(arg); + const auto kbatch = arg.a_grid_desc_kbatch_k0_m_k1_.GetLength(I0); if(!GridwiseGemm::CheckValidity(arg.a_grid_desc_kbatch_k0_m_k1_, @@ -408,50 +411,30 @@ struct DeviceGemmXdlSplitK float ave_time = 0; const auto Run = [&](const auto& kernel) { - if(nrepeat > 0) - { - ShowInfo(arg); - ave_time = launch_and_time_kernel(kernel, - nrepeat, - dim3(grid_size), - dim3(BlockSize), - 0, - arg.p_a_grid_, - arg.p_b_grid_, - arg.p_c_grid_, - arg.a_grid_desc_kbatch_k0_m_k1_, - arg.b_grid_desc_kbatch_k0_n_k1_, - arg.c_grid_desc_m0_n0_m1_n1_m2_m3_m4_n2_, - arg.a_element_op_, - arg.b_element_op_, - arg.c_element_op_, - arg.block_2_ctile_map_); - } + // FIXME: this should be moved outside of DeviceOp + hipGetErrorString( + hipMemset(arg.p_c_grid_, + 0, + arg.c_grid_desc_m0_n0_m1_n1_m2_m3_m4_n2_.GetElementSpaceSize() * + sizeof(CDataType))); - if(kbatch > 1 || nrepeat <= 0) - { - hipGetErrorString( - hipMemset(arg.p_c_grid_, - 0, - arg.c_grid_desc_m0_n0_m1_n1_m2_m3_m4_n2_.GetElementSpaceSize() * - sizeof(CDataType))); - - launch_kernel(kernel, - dim3(grid_size), - dim3(BlockSize), - 0, - arg.p_a_grid_, - arg.p_b_grid_, - arg.p_c_grid_, - arg.a_grid_desc_kbatch_k0_m_k1_, - arg.b_grid_desc_kbatch_k0_n_k1_, - arg.c_grid_desc_m0_n0_m1_n1_m2_m3_m4_n2_, - arg.a_element_op_, - arg.b_element_op_, - arg.c_element_op_, - arg.block_2_ctile_map_); - } + ave_time = launch_and_time_kernel(stream_config, + kernel, + dim3(grid_size), + dim3(BlockSize), + 0, + arg.p_a_grid_, + arg.p_b_grid_, + arg.p_c_grid_, + arg.a_grid_desc_kbatch_k0_m_k1_, + arg.b_grid_desc_kbatch_k0_n_k1_, + arg.c_grid_desc_m0_n0_m1_n1_m2_m3_m4_n2_, + arg.a_element_op_, + arg.b_element_op_, + arg.c_element_op_, + arg.block_2_ctile_map_); }; + if(has_main_k0_block_loop) { if(kbatch == 1) @@ -531,9 +514,10 @@ struct DeviceGemmXdlSplitK } // polymorphic - float Run(const BaseArgument* p_arg, int nrepeat = 1) override + float Run(const BaseArgument* p_arg, + const StreamConfig& stream_config = StreamConfig{}) override { - return Run(*dynamic_cast(p_arg), nrepeat); + return Run(*dynamic_cast(p_arg), stream_config); } }; diff --git a/include/ck/tensor_operation/gpu/device/device_gemm_xdl_splitk_c_shuffle.hpp b/include/ck/tensor_operation/gpu/device/device_gemm_xdl_splitk_c_shuffle.hpp index 9de5361ab6..7d00224429 100644 --- a/include/ck/tensor_operation/gpu/device/device_gemm_xdl_splitk_c_shuffle.hpp +++ b/include/ck/tensor_operation/gpu/device/device_gemm_xdl_splitk_c_shuffle.hpp @@ -391,8 +391,11 @@ struct DeviceGemmXdlSplitKCShuffle std::cout << "arg.c_grid_desc_m_n_{ " << arg.c_grid_desc_m_n_.GetLength(I0) << ", " << arg.c_grid_desc_m_n_.GetLength(I1) << "}" << std::endl; } - float Run(const Argument& arg, int nrepeat = 1) + + float Run(const Argument& arg, const StreamConfig& stream_config = StreamConfig{}) { + ShowInfo(arg); + const auto kbatch = arg.a_grid_desc_kbatch_k0_m_k1_.GetLength(I0); if(!GridwiseGemm::CheckValidity(arg.a_grid_desc_kbatch_k0_m_k1_, @@ -414,51 +417,29 @@ struct DeviceGemmXdlSplitKCShuffle float ave_time = 0; const auto Run = [&](const auto& kernel) { - if(nrepeat > 0) - { - ShowInfo(arg); - ave_time = - launch_and_time_kernel(kernel, - nrepeat, - dim3(grid_size), - dim3(BlockSize), - 0, - arg.p_a_grid_, - arg.p_b_grid_, - arg.p_c_grid_, - arg.a_grid_desc_kbatch_k0_m_k1_, - arg.b_grid_desc_kbatch_k0_n_k1_, - arg.c_grid_desc_mblock_mperblock_nblock_nperblock_, - arg.a_element_op_, - arg.b_element_op_, - arg.c_element_op_, - arg.block_2_ctile_map_); - } + hipGetErrorString(hipMemset( + arg.p_c_grid_, + 0, + arg.c_grid_desc_mblock_mperblock_nblock_nperblock_.GetElementSpaceSize() * + sizeof(CDataType))); - if(kbatch > 1 || nrepeat <= 0) - { - hipGetErrorString(hipMemset( - arg.p_c_grid_, - 0, - arg.c_grid_desc_mblock_mperblock_nblock_nperblock_.GetElementSpaceSize() * - sizeof(CDataType))); - - launch_kernel(kernel, - dim3(grid_size), - dim3(BlockSize), - 0, - arg.p_a_grid_, - arg.p_b_grid_, - arg.p_c_grid_, - arg.a_grid_desc_kbatch_k0_m_k1_, - arg.b_grid_desc_kbatch_k0_n_k1_, - arg.c_grid_desc_mblock_mperblock_nblock_nperblock_, - arg.a_element_op_, - arg.b_element_op_, - arg.c_element_op_, - arg.block_2_ctile_map_); - } + launch_and_time_kernel(stream_config, + kernel, + dim3(grid_size), + dim3(BlockSize), + 0, + arg.p_a_grid_, + arg.p_b_grid_, + arg.p_c_grid_, + arg.a_grid_desc_kbatch_k0_m_k1_, + arg.b_grid_desc_kbatch_k0_n_k1_, + arg.c_grid_desc_mblock_mperblock_nblock_nperblock_, + arg.a_element_op_, + arg.b_element_op_, + arg.c_element_op_, + arg.block_2_ctile_map_); }; + if(has_main_k0_block_loop) { if(kbatch == 1) @@ -542,9 +523,10 @@ struct DeviceGemmXdlSplitKCShuffle } // polymorphic - float Run(const BaseArgument* p_arg, int nrepeat = 1) override + float Run(const BaseArgument* p_arg, + const StreamConfig& stream_config = StreamConfig{}) override { - return Run(*dynamic_cast(p_arg), nrepeat); + return Run(*dynamic_cast(p_arg), stream_config); } }; diff --git a/include/ck/tensor_operation/gpu/device/device_grouped_gemm_xdl.hpp b/include/ck/tensor_operation/gpu/device/device_grouped_gemm_xdl.hpp index dfc1ce2715..730b2d787e 100644 --- a/include/ck/tensor_operation/gpu/device/device_grouped_gemm_xdl.hpp +++ b/include/ck/tensor_operation/gpu/device/device_grouped_gemm_xdl.hpp @@ -449,7 +449,7 @@ struct DeviceGroupedGemmXdl { using Argument = DeviceGroupedGemmXdl::Argument; - float Run(const Argument& arg, int nrepeat = 1) + float Run(const Argument& arg, const StreamConfig& stream_config = StreamConfig{}) { StaticallyIndexedArray gemm_desc_kernel_args; @@ -510,8 +510,8 @@ struct DeviceGroupedGemmXdl true, MaxGroupCount>; - ave_time = launch_and_time_kernel(kernel, - nrepeat, + ave_time = launch_and_time_kernel(stream_config, + kernel, dim3(arg.grid_size_), dim3(BlockSize), 0, @@ -534,8 +534,8 @@ struct DeviceGroupedGemmXdl false, MaxGroupCount>; - ave_time = launch_and_time_kernel(kernel, - nrepeat, + ave_time = launch_and_time_kernel(stream_config, + kernel, dim3(arg.grid_size_), dim3(BlockSize), 0, @@ -550,9 +550,10 @@ struct DeviceGroupedGemmXdl } // polymorphic - float Run(const BaseArgument* p_arg, int nrepeat = 1) override + float Run(const BaseArgument* p_arg, + const StreamConfig& stream_config = StreamConfig{}) override { - return Run(*dynamic_cast(p_arg), nrepeat); + return Run(*dynamic_cast(p_arg), stream_config); } }; diff --git a/include/ck/tensor_operation/gpu/device/device_pool2d_fwd_nhwc_nhwc.hpp b/include/ck/tensor_operation/gpu/device/device_pool2d_fwd_nhwc_nhwc.hpp index 651d31ae2f..f665378e08 100644 --- a/include/ck/tensor_operation/gpu/device/device_pool2d_fwd_nhwc_nhwc.hpp +++ b/include/ck/tensor_operation/gpu/device/device_pool2d_fwd_nhwc_nhwc.hpp @@ -204,7 +204,7 @@ struct DevicePool2dFwd_Input_N_Hi_Wi_C_Output_N_Ho_Wo_C : public DevicePool2dFwd struct Invoker : public BaseInvoker { - float Run(const Argument& arg, int nrepeat = 1) + float Run(const Argument& arg, const StreamConfig& stream_config = StreamConfig{}) { using gridwise_reduce = GridwiseReduction_mk_to_m_threadwise(p_arg), nrepeat); + return Run(*dynamic_cast(p_arg), stream_config); } }; diff --git a/include/ck/tensor_operation/gpu/device/device_reduce_blockwise.hpp b/include/ck/tensor_operation/gpu/device/device_reduce_blockwise.hpp index 4f17989b53..860f53d8c5 100644 --- a/include/ck/tensor_operation/gpu/device/device_reduce_blockwise.hpp +++ b/include/ck/tensor_operation/gpu/device/device_reduce_blockwise.hpp @@ -211,7 +211,7 @@ struct DeviceReduceBlockWise : public DeviceReduce; - avg_time = launch_and_time_kernel(kernel, - nrepeat, + avg_time = launch_and_time_kernel(stream_config, + kernel, dim3(arg.gridSize), dim3(BlockSize), 0, @@ -272,9 +272,10 @@ struct DeviceReduceBlockWise : public DeviceReduce(p_arg), nrepeat); + return Run(*dynamic_cast(p_arg), stream_config); }; }; diff --git a/include/ck/tensor_operation/gpu/device/device_reduce_blockwise_second_call.hpp b/include/ck/tensor_operation/gpu/device/device_reduce_blockwise_second_call.hpp index d3b1b4b5c3..43ac48cecc 100644 --- a/include/ck/tensor_operation/gpu/device/device_reduce_blockwise_second_call.hpp +++ b/include/ck/tensor_operation/gpu/device/device_reduce_blockwise_second_call.hpp @@ -182,7 +182,7 @@ struct DeviceReduceBlockWiseSecondCall struct Invoker : public BaseInvoker { - float Run(const Argument& arg, int nrepeat = 1) + float Run(const Argument& arg, const StreamConfig& stream_config = StreamConfig{}) { const auto in_grid_desc_m_k = DeviceReduceBlockWiseSecondCall::MakeSrc2dDescriptor( arg.inLengths_, arg.inStrides_); @@ -224,8 +224,8 @@ struct DeviceReduceBlockWiseSecondCall InElementwiseOperation, AccElementwiseOperation>; - avg_time = launch_and_time_kernel(kernel, - nrepeat, + avg_time = launch_and_time_kernel(stream_config, + kernel, dim3(arg.gridSize), dim3(BlockSize), 0, @@ -243,10 +243,11 @@ struct DeviceReduceBlockWiseSecondCall return (avg_time); }; - float Run(const BaseArgument* p_arg, int nrepeat = 1) override + float Run(const BaseArgument* p_arg, + const StreamConfig& stream_config = StreamConfig{}) override { - return Run(*dynamic_cast(p_arg), nrepeat); - }; + return Run(*dynamic_cast(p_arg), stream_config); + } }; bool IsSupportedArgument(const BaseArgument* p_arg) override diff --git a/include/ck/tensor_operation/gpu/device/device_reduce_multiblock_atomic_add.hpp b/include/ck/tensor_operation/gpu/device/device_reduce_multiblock_atomic_add.hpp index 889c366875..f93c65fe18 100644 --- a/include/ck/tensor_operation/gpu/device/device_reduce_multiblock_atomic_add.hpp +++ b/include/ck/tensor_operation/gpu/device/device_reduce_multiblock_atomic_add.hpp @@ -245,7 +245,7 @@ struct DeviceReduceMultiBlockAtomicAdd struct Invoker : public BaseInvoker { - float Run(const Argument& arg, int nrepeat = 1) + float Run(const Argument& arg, const StreamConfig& stream_config = StreamConfig{}) { const auto in_grid_desc_m_k = DeviceReduceMultiBlockAtomicAdd::MakeSrc2dDescriptor( arg.inLengths_, arg.inStrides_, arg.blkGroupSize, arg.kBlockTileIterations); @@ -275,8 +275,6 @@ struct DeviceReduceMultiBlockAtomicAdd float avg_time = 0; - KernelTimer timer; - const auto kernel_pre = kernel_buffer_set_value; const auto kernel_main = kernel_reduce_multiblock_atocmi_add; - printf("launch_and_time_kernel: grid_dim {%ld, 1, 1}, block_dim {%d, 1, 1} \n", - arg.gridSize, - BlockSize); - printf("Warm up\n"); + avg_time += launch_and_time_kernel(stream_config, + kernel_pre, + dim3(arg.gridSize_pre), + dim3(BlockSize), + 0, + out_grid_desc_m, + arg.out_dev_, + static_cast(0.0f)); - for(int i = 0; i < nrepeat + 1; i++) - { - if(i == 1) - timer.Start(); + avg_time += launch_and_time_kernel(stream_config, + kernel_main, + dim3(arg.gridSize), + dim3(BlockSize), + 0, + in_grid_desc_m_k, + out_grid_desc_m, + arg.in_elementwise_op_, + arg.acc_elementwise_op_, + arg.blkGroupSize, + arg.kBlockTileIterations, + arg.alpha_, + arg.in_dev_, + arg.out_dev_); - launch_kernel(kernel_pre, - dim3(arg.gridSize_pre), - dim3(BlockSize), - 0, - out_grid_desc_m, - arg.out_dev_, - static_cast(0.0f)); + return avg_time; + } - launch_kernel(kernel_main, - dim3(arg.gridSize), - dim3(BlockSize), - 0, - in_grid_desc_m_k, - out_grid_desc_m, - arg.in_elementwise_op_, - arg.acc_elementwise_op_, - arg.blkGroupSize, - arg.kBlockTileIterations, - arg.alpha_, - arg.in_dev_, - arg.out_dev_); - }; - - timer.End(); - - avg_time = timer.GetElapsedTime() / nrepeat; - - return (avg_time); - }; - - float Run(const BaseArgument* p_arg, int nrepeat = 1) override + float Run(const BaseArgument* p_arg, + const StreamConfig& stream_config = StreamConfig{}) override { - return Run(*dynamic_cast(p_arg), nrepeat); - }; + return Run(*dynamic_cast(p_arg), stream_config); + } }; bool IsSupportedArgument(const BaseArgument* p_arg) override diff --git a/include/ck/tensor_operation/gpu/device/device_reduce_multiblock_partial_reduce.hpp b/include/ck/tensor_operation/gpu/device/device_reduce_multiblock_partial_reduce.hpp index d583f7f1b8..b4eb8116c2 100644 --- a/include/ck/tensor_operation/gpu/device/device_reduce_multiblock_partial_reduce.hpp +++ b/include/ck/tensor_operation/gpu/device/device_reduce_multiblock_partial_reduce.hpp @@ -273,7 +273,7 @@ struct DeviceReduceMultiBlockPartialReduce struct Invoker : public BaseInvoker { - float Run(const Argument& arg, int nrepeat = 1) + float Run(const Argument& arg, const StreamConfig& stream_config = StreamConfig{}) { const auto in_grid_desc_m_k = DeviceReduceMultiBlockPartialReduce::MakeSrc2dDescriptor( arg.inLengths_, arg.inStrides_, arg.blkGroupSize, arg.kBlockTileIterations); @@ -313,8 +313,8 @@ struct DeviceReduceMultiBlockPartialReduce InElementwiseOperation, AccElementwiseOperation>; - avg_time = launch_and_time_kernel(kernel, - nrepeat, + avg_time = launch_and_time_kernel(stream_config, + kernel, dim3(arg.gridSize), dim3(BlockSize), 0, @@ -331,10 +331,11 @@ struct DeviceReduceMultiBlockPartialReduce return (avg_time); }; - float Run(const BaseArgument* p_arg, int nrepeat = 1) override + float Run(const BaseArgument* p_arg, + const StreamConfig& stream_config = StreamConfig{}) override { - return Run(*dynamic_cast(p_arg), nrepeat); - }; + return Run(*dynamic_cast(p_arg), stream_config); + } }; bool IsSupportedArgument(const BaseArgument* p_arg) override diff --git a/include/ck/tensor_operation/gpu/device/device_reduce_threadwise.hpp b/include/ck/tensor_operation/gpu/device/device_reduce_threadwise.hpp index bf4088a96b..dacb175043 100644 --- a/include/ck/tensor_operation/gpu/device/device_reduce_threadwise.hpp +++ b/include/ck/tensor_operation/gpu/device/device_reduce_threadwise.hpp @@ -212,7 +212,7 @@ struct DeviceReduceThreadWise : public DeviceReduce; - avg_time = launch_and_time_kernel(kernel, - nrepeat, + avg_time = launch_and_time_kernel(stream_config, + kernel, dim3(arg.gridSize), dim3(BlockSize), 0, @@ -272,10 +272,11 @@ struct DeviceReduceThreadWise : public DeviceReduce(p_arg), nrepeat); - }; + return Run(*dynamic_cast(p_arg), stream_config); + } }; bool IsSupportedArgument(const BaseArgument* p_arg) override diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_pipeline_v1.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_pipeline_v1.hpp index 6a1b6eef31..20c3a0b618 100644 --- a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_pipeline_v1.hpp +++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_pipeline_v1.hpp @@ -1,5 +1,6 @@ #pragma once #include "common_header.hpp" +#include "tensor_operation/gpu/block/blockwise_gemm_xdlops.hpp" namespace ck { @@ -248,4 +249,116 @@ struct GridwiseGemmPipeline_v1<2> } }; +template +struct GridwiseGemmPipelineInterwave_v1; + +template <> +struct GridwiseGemmPipelineInterwave_v1<1> +{ + __host__ __device__ static constexpr bool IsSupported(index_t /* num_loop */) { return true; } + + __host__ __device__ static constexpr bool CalculateHasMainLoop(index_t num_loop) + { + return num_loop > 1; + } + + template + static __device__ void Run(const AGridDesc& a_grid_desc, + const ABlockDesc& a_block_desc, + ABlockTransfer& a_blockwise_copy, + const AGridBuffer& a_grid_buf, + ABlockBuffer& a_block_buf, + const ABlockTransferStep& a_block_copy_step, + const BGridDesc& b_grid_desc, + const BBlockDesc& b_block_desc, + BBlockTransfer& b_blockwise_copy, + const BGridBuffer& b_grid_buf, + BBlockBuffer& b_block_buf, + const BBlockTransferStep& b_block_copy_step, + const BlockwiseGemm& blockwise_gemm, + CThreadBuffer& c_thread_buf, + index_t num_loop) + { + // preload data into LDS + a_blockwise_copy.RunRead(a_grid_desc, a_grid_buf); + b_blockwise_copy.RunRead(b_grid_desc, b_grid_buf); + + a_blockwise_copy.MoveSrcSliceWindow(a_grid_desc, a_block_copy_step); + b_blockwise_copy.MoveSrcSliceWindow(b_grid_desc, b_block_copy_step); + + // Initialize C + c_thread_buf.Clear(); + + a_blockwise_copy.RunWrite(a_block_desc, a_block_buf); + b_blockwise_copy.RunWrite(b_block_desc, b_block_buf); + + // main body + if constexpr(HasMainLoop) + { + index_t i = 0; + + do + { + a_blockwise_copy.RunRead(a_grid_desc, a_grid_buf); + + block_sync_lds(); + + b_blockwise_copy.RunRead(b_grid_desc, b_grid_buf); + + blockwise_gemm.Run(a_block_buf, b_block_buf, c_thread_buf); + + // block_sync_lds(); // moved into blockwise_gemm + + a_blockwise_copy.MoveSrcSliceWindow(a_grid_desc, a_block_copy_step); + b_blockwise_copy.MoveSrcSliceWindow(b_grid_desc, b_block_copy_step); + + a_blockwise_copy.RunWrite(a_block_desc, a_block_buf); + b_blockwise_copy.RunWrite(b_block_desc, b_block_buf); + + ++i; + } while(i < (num_loop - 1)); + } + + // tail + { + block_sync_lds(); + + blockwise_gemm.Run(a_block_buf, b_block_buf, c_thread_buf); + } + } +}; + +// Note: 2 stage prefetch not optimized for inter-wave loop scheduler +template <> +struct GridwiseGemmPipelineInterwave_v1<2> : public GridwiseGemmPipeline_v1<2> +{ +}; + +template +constexpr auto GridwiseGemmPipeline_v1_Selector() +{ + if constexpr(LoopSched == LoopScheduler::Default) + { + return GridwiseGemmPipeline_v1{}; + } + else if constexpr(LoopSched == LoopScheduler::Interwave) + { + return GridwiseGemmPipelineInterwave_v1{}; + } +} + } // namespace ck diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_reduce_xdl_cshuffle_v1.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_reduce_xdl_cshuffle_v1.hpp index 4e2e279ef3..cf98ea8043 100644 --- a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_reduce_xdl_cshuffle_v1.hpp +++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_reduce_xdl_cshuffle_v1.hpp @@ -134,7 +134,8 @@ template + index_t CReduceThreadVgpr2GlobalCopySrcDstScalarPerVector_MPerBlock, + LoopScheduler LoopSched> struct GridwiseGemmReduce_k0mk1_k0nk1_mn_xdl_cshuffle_v1 { static constexpr auto I0 = Number<0>{}; @@ -473,17 +474,18 @@ struct GridwiseGemmReduce_k0mk1_k0nk1_mn_xdl_cshuffle_v1 constexpr index_t KPack = math::max( math::lcm(AK1, BK1), MfmaSelector::selected_mfma.k_per_blk); - auto blockwise_gemm = - BlockwiseGemmXdlops_k0mk1_k0nk1_m0n0m1n1m2m3m4n2_v1{}; + auto blockwise_gemm = BlockwiseGemmXdlops_k0mk1_k0nk1_m0n0m1n1m2m3m4n2_Selector< + BlockSize, + FloatAB, + FloatGemmAcc, + decltype(a_block_desc_ak0_m_ak1), + decltype(b_block_desc_bk0_n_bk1), + MPerXdl, + NPerXdl, + MXdlPerWave, + NXdlPerWave, + KPack, + LoopSched>(); auto c_thread_buf = blockwise_gemm.GetCThreadBuffer(); @@ -502,25 +504,28 @@ struct GridwiseGemmReduce_k0mk1_k0nk1_mn_xdl_cshuffle_v1 constexpr auto b_block_slice_copy_step = make_multi_index(KPerBlock / BK1, 0, 0); // gridwise GEMM pipeline + const auto gridwise_gemm_pipeline = + GridwiseGemmPipeline_v1_Selector(); + const index_t num_k_block_main_loop = __builtin_amdgcn_readfirstlane( (a_grid_desc_ak0_m_ak1.GetLength(I0) * a_grid_desc_ak0_m_ak1.GetLength(I2)) / KPerBlock); - GridwiseGemmPipe::template Run(a_grid_desc_ak0_m_ak1, - a_block_desc_ak0_m_ak1, - a_blockwise_copy, - a_grid_buf, - a_block_buf, - a_block_slice_copy_step, - b_grid_desc_bk0_n_bk1, - b_block_desc_bk0_n_bk1, - b_blockwise_copy, - b_grid_buf, - b_block_buf, - b_block_slice_copy_step, - blockwise_gemm, - c_thread_buf, - num_k_block_main_loop); + gridwise_gemm_pipeline.template Run(a_grid_desc_ak0_m_ak1, + a_block_desc_ak0_m_ak1, + a_blockwise_copy, + a_grid_buf, + a_block_buf, + a_block_slice_copy_step, + b_grid_desc_bk0_n_bk1, + b_block_desc_bk0_n_bk1, + b_blockwise_copy, + b_grid_buf, + b_block_buf, + b_block_slice_copy_step, + blockwise_gemm, + c_thread_buf, + num_k_block_main_loop); // shuffle C and write out { diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v1.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v1.hpp index b28907b43e..f0eabf9de6 100644 --- a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v1.hpp +++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v1.hpp @@ -107,7 +107,8 @@ template + index_t CShuffleBlockTransferScalarPerVector_NPerBlock, + LoopScheduler LoopSched> struct GridwiseGemm_k0mk1_k0nk1_mn_xdl_cshuffle_v1 { static constexpr auto I0 = Number<0>{}; @@ -416,17 +417,18 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_xdl_cshuffle_v1 constexpr index_t KPack = math::max( math::lcm(AK1, BK1), MfmaSelector::selected_mfma.k_per_blk); - auto blockwise_gemm = - BlockwiseGemmXdlops_k0mk1_k0nk1_m0n0m1n1m2m3m4n2_v1{}; + auto blockwise_gemm = BlockwiseGemmXdlops_k0mk1_k0nk1_m0n0m1n1m2m3m4n2_Selector< + BlockSize, + FloatAB, + FloatGemmAcc, + decltype(a_block_desc_ak0_m_ak1), + decltype(b_block_desc_bk0_n_bk1), + MPerXdl, + NPerXdl, + MXdlPerWave, + NXdlPerWave, + KPack, + LoopSched>(); auto c_thread_buf = blockwise_gemm.GetCThreadBuffer(); @@ -445,25 +447,28 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_xdl_cshuffle_v1 constexpr auto b_block_slice_copy_step = make_multi_index(KPerBlock / BK1, 0, 0); // gridwise GEMM pipeline + const auto gridwise_gemm_pipeline = + GridwiseGemmPipeline_v1_Selector(); + const index_t num_k_block_main_loop = __builtin_amdgcn_readfirstlane( (a_grid_desc_ak0_m_ak1.GetLength(I0) * a_grid_desc_ak0_m_ak1.GetLength(I2)) / KPerBlock); - GridwiseGemmPipe::template Run(a_grid_desc_ak0_m_ak1, - a_block_desc_ak0_m_ak1, - a_blockwise_copy, - a_grid_buf, - a_block_buf, - a_block_slice_copy_step, - b_grid_desc_bk0_n_bk1, - b_block_desc_bk0_n_bk1, - b_blockwise_copy, - b_grid_buf, - b_block_buf, - b_block_slice_copy_step, - blockwise_gemm, - c_thread_buf, - num_k_block_main_loop); + gridwise_gemm_pipeline.template Run(a_grid_desc_ak0_m_ak1, + a_block_desc_ak0_m_ak1, + a_blockwise_copy, + a_grid_buf, + a_block_buf, + a_block_slice_copy_step, + b_grid_desc_bk0_n_bk1, + b_block_desc_bk0_n_bk1, + b_blockwise_copy, + b_grid_buf, + b_block_buf, + b_block_slice_copy_step, + blockwise_gemm, + c_thread_buf, + num_k_block_main_loop); // shuffle C and write out { diff --git a/library/include/ck/library/host/host_interface.hpp b/library/include/ck/library/host/host_interface.hpp new file mode 100644 index 0000000000..955da0f4be --- /dev/null +++ b/library/include/ck/library/host/host_interface.hpp @@ -0,0 +1,54 @@ +#pragma once + +#include +#include + +#include "stream_config.hpp" +#include "config.hpp" +#include "device_base.hpp" + +struct DeviceConvFwdPtr_t +{ + using BaseArgument = ck::tensor_operation::device::BaseArgument; + using BaseInvoker = ck::tensor_operation::device::BaseInvoker; + + struct DeviceConvFwdPtrImpl; + std::unique_ptr pImpl; + DeviceConvFwdPtr_t(); + ~DeviceConvFwdPtr_t(); + DeviceConvFwdPtr_t(DeviceConvFwdPtr_t&&); + DeviceConvFwdPtr_t(DeviceConvFwdPtrImpl&); + DeviceConvFwdPtr_t& operator=(DeviceConvFwdPtr_t&) = delete; + DeviceConvFwdPtr_t& operator=(const DeviceConvFwdPtr_t&) = delete; + std::unique_ptr + MakeArgumentPointer(void* in_ptr, + void* wei_ptr, + void* out_ptr, + size_t N, + size_t K, + size_t C, + std::vector input_spatial_lengths, + std::vector filter_spatial_lengths, + std::vector output_spatial_lengths, + std::vector conv_filter_strides, + std::vector conv_filter_dilations, + std::vector input_left_pads, + std::vector input_right_pads) + const; // in,wei and out element ops are ignored for now since even if we change them, they + // cant be linked + std::unique_ptr + MakeInvokerPointer() const; // requires including BaseInvoker headers + std::string GetTypeString(); + bool IsSupportedArgument(const BaseArgument* arg_ptr); +}; + +void add_device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_f32_instances_t( + std::vector& instances); +void add_device_conv2d_fwd_xdl_c_shuffle_nhwc_kyxc_nhwk_f16_instances_t( + std::vector& instances); +void add_device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_bf16_instances_t( + std::vector& instances); +void add_device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_f16_instances_t( + std::vector& instances); +void add_device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_int8_instances_t( + std::vector& instances); diff --git a/library/include/ck/library/host_tensor/device.hpp b/library/include/ck/library/host_tensor/device.hpp index f33b8d4f40..d549b14c8c 100644 --- a/library/include/ck/library/host_tensor/device.hpp +++ b/library/include/ck/library/host_tensor/device.hpp @@ -1,12 +1,25 @@ -#ifndef DEVICE_HPP -#define DEVICE_HPP +#pragma once #include #include #include #include -#include "hip/hip_runtime.h" -#include "hip/hip_fp16.h" +#include +#include + +#include "stream_config.hpp" +#include "ck/options.hpp" + +inline void hip_check_error(hipError_t x) +{ + if(x != hipSuccess) + { + std::ostringstream ss; + ss << "HIP runtime error: " << hipGetErrorString(x) << ". " << __FILE__ << ": " << __LINE__ + << "in function: " << __func__; + throw std::runtime_error(ss.str()); + } +} struct DeviceMem { @@ -36,49 +49,59 @@ struct KernelTimer std::unique_ptr impl; }; -using device_stream_t = hipStream_t; - template -void launch_kernel(F kernel, dim3 grid_dim, dim3 block_dim, std::size_t lds_byte, Args... args) +float launch_and_time_kernel(const StreamConfig& stream_config, + F kernel, + dim3 grid_dim, + dim3 block_dim, + std::size_t lds_byte, + Args... args) { - hipStream_t stream_id = nullptr; - - hipLaunchKernelGGL(kernel, grid_dim, block_dim, lds_byte, stream_id, args...); -} - -template -float launch_and_time_kernel( - F kernel, int nrepeat, dim3 grid_dim, dim3 block_dim, std::size_t lds_byte, Args... args) -{ - KernelTimer timer; - - printf("%s: grid_dim {%d, %d, %d}, block_dim {%d, %d, %d} \n", - __func__, - grid_dim.x, - grid_dim.y, - grid_dim.z, - block_dim.x, - block_dim.y, - block_dim.z); - - printf("Warm up\n"); - - hipStream_t stream_id = nullptr; - - // warm up - hipLaunchKernelGGL(kernel, grid_dim, block_dim, lds_byte, stream_id, args...); - - printf("Start running %d times...\n", nrepeat); - - timer.Start(); - - for(int i = 0; i < nrepeat; ++i) +#if CK_TIME_KERNEL + if(stream_config.time_kernel_) { - hipLaunchKernelGGL(kernel, grid_dim, block_dim, lds_byte, stream_id, args...); + printf("%s: grid_dim {%d, %d, %d}, block_dim {%d, %d, %d} \n", + __func__, + grid_dim.x, + grid_dim.y, + grid_dim.z, + block_dim.x, + block_dim.y, + block_dim.z); + + const int nrepeat = 10; + + printf("Warm up 1 time\n"); + + // warm up + hipLaunchKernelGGL( + kernel, grid_dim, block_dim, lds_byte, stream_config.stream_id_, args...); + + printf("Start running %d times...\n", nrepeat); + + KernelTimer timer; + timer.Start(); + + for(int i = 0; i < nrepeat; ++i) + { + hipLaunchKernelGGL( + kernel, grid_dim, block_dim, lds_byte, stream_config.stream_id_, args...); + } + + timer.End(); + + return timer.GetElapsedTime() / nrepeat; } + else + { + hipLaunchKernelGGL( + kernel, grid_dim, block_dim, lds_byte, stream_config.stream_id_, args...); - timer.End(); + return 0; + } +#else + hipLaunchKernelGGL(kernel, grid_dim, block_dim, lds_byte, stream_config.stream_id_, args...); - return timer.GetElapsedTime() / nrepeat; -} + return 0; #endif +} diff --git a/library/include/ck/library/reference_tensor_operation/cpu/reference_batched_gemm.hpp b/library/include/ck/library/reference_tensor_operation/cpu/reference_batched_gemm.hpp index 3a706dac0b..f4944a28d2 100644 --- a/library/include/ck/library/reference_tensor_operation/cpu/reference_batched_gemm.hpp +++ b/library/include/ck/library/reference_tensor_operation/cpu/reference_batched_gemm.hpp @@ -84,7 +84,8 @@ struct ReferenceBatchedGemm : public device::BaseOperator return 0; } - float Run(const device::BaseArgument* p_arg, int) override + float Run(const device::BaseArgument* p_arg, + const StreamConfig& /* stream_config */ = StreamConfig{}) override { return Run(*dynamic_cast(p_arg)); } diff --git a/library/include/ck/library/reference_tensor_operation/cpu/reference_cgemm.hpp b/library/include/ck/library/reference_tensor_operation/cpu/reference_cgemm.hpp index c55b86aea7..79c0468c82 100644 --- a/library/include/ck/library/reference_tensor_operation/cpu/reference_cgemm.hpp +++ b/library/include/ck/library/reference_tensor_operation/cpu/reference_cgemm.hpp @@ -135,7 +135,8 @@ struct ReferenceCGemm : public device::BaseOperator return 0; } - float Run(const device::BaseArgument* p_arg, int) override + float Run(const device::BaseArgument* p_arg, + const StreamConfig& /* stream_config */ = StreamConfig{}) override { return Run(*dynamic_cast(p_arg)); } diff --git a/library/include/ck/library/reference_tensor_operation/cpu/reference_conv_backward_weight.hpp b/library/include/ck/library/reference_tensor_operation/cpu/reference_conv_backward_weight.hpp index c5f3cbad69..10619ae6d9 100644 --- a/library/include/ck/library/reference_tensor_operation/cpu/reference_conv_backward_weight.hpp +++ b/library/include/ck/library/reference_tensor_operation/cpu/reference_conv_backward_weight.hpp @@ -121,7 +121,8 @@ struct ReferenceConvBwdWeight : public device::BaseOperator return 0; } - float Run(const device::BaseArgument* p_arg, int) override + float Run(const device::BaseArgument* p_arg, + const StreamConfig& /*stream_config*/ = StreamConfig{}) override { return Run(*dynamic_cast(p_arg)); } diff --git a/library/include/ck/library/reference_tensor_operation/cpu/reference_conv_bwd_data.hpp b/library/include/ck/library/reference_tensor_operation/cpu/reference_conv_bwd_data.hpp index 9e91f06e7f..45fc8b8503 100644 --- a/library/include/ck/library/reference_tensor_operation/cpu/reference_conv_bwd_data.hpp +++ b/library/include/ck/library/reference_tensor_operation/cpu/reference_conv_bwd_data.hpp @@ -291,7 +291,8 @@ struct ReferenceConvBwdData : public device::BaseOperator } } - float Run(const device::BaseArgument* p_arg, int) override + float Run(const device::BaseArgument* p_arg, + const StreamConfig& /* stream_config */ = StreamConfig{}) override { return Run(*dynamic_cast(p_arg)); } diff --git a/library/include/ck/library/reference_tensor_operation/cpu/reference_conv_fwd.hpp b/library/include/ck/library/reference_tensor_operation/cpu/reference_conv_fwd.hpp index 65e59db2f8..d1afa898e4 100644 --- a/library/include/ck/library/reference_tensor_operation/cpu/reference_conv_fwd.hpp +++ b/library/include/ck/library/reference_tensor_operation/cpu/reference_conv_fwd.hpp @@ -1,9 +1,10 @@ -#ifndef REFERENCE_CONV_FWD_HPP -#define REFERENCE_CONV_FWD_HPP +#pragma once #include #include #include + +#include "stream_config.hpp" #include "device_base.hpp" #include "host_tensor.hpp" @@ -251,7 +252,8 @@ struct ReferenceConvFwd : public device::BaseOperator } } - float Run(const device::BaseArgument* p_arg, int) override + float Run(const device::BaseArgument* p_arg, + const StreamConfig& /*stream_config*/ = StreamConfig{}) override { return Run(*dynamic_cast(p_arg)); } @@ -311,4 +313,3 @@ struct ReferenceConvFwd : public device::BaseOperator } // namespace host } // namespace tensor_operation } // namespace ck -#endif diff --git a/library/include/ck/library/reference_tensor_operation/cpu/reference_conv_fwd_bias_activation.hpp b/library/include/ck/library/reference_tensor_operation/cpu/reference_conv_fwd_bias_activation.hpp index ee95cd410a..4be6169c15 100644 --- a/library/include/ck/library/reference_tensor_operation/cpu/reference_conv_fwd_bias_activation.hpp +++ b/library/include/ck/library/reference_tensor_operation/cpu/reference_conv_fwd_bias_activation.hpp @@ -124,7 +124,8 @@ struct ReferenceConvFwd_Bias_Activation : public device::BaseOperator return 0; } - float Run(const device::BaseArgument* p_arg, int) override + float Run(const device::BaseArgument* p_arg, + const StreamConfig& /* stream_config */ = StreamConfig{}) override { return Run(*dynamic_cast(p_arg)); } diff --git a/library/include/ck/library/reference_tensor_operation/cpu/reference_conv_fwd_bias_activation_add.hpp b/library/include/ck/library/reference_tensor_operation/cpu/reference_conv_fwd_bias_activation_add.hpp index 11232cc98f..466537c686 100644 --- a/library/include/ck/library/reference_tensor_operation/cpu/reference_conv_fwd_bias_activation_add.hpp +++ b/library/include/ck/library/reference_tensor_operation/cpu/reference_conv_fwd_bias_activation_add.hpp @@ -130,7 +130,8 @@ struct ReferenceConvFwd_Bias_Activation_Add : public device::BaseOperator return 0; } - float Run(const device::BaseArgument* p_arg, int) override + float Run(const device::BaseArgument* p_arg, + const StreamConfig& /*stream_config*/ = StreamConfig{}) override { return Run(*dynamic_cast(p_arg)); } diff --git a/library/include/ck/library/reference_tensor_operation/cpu/reference_gemm.hpp b/library/include/ck/library/reference_tensor_operation/cpu/reference_gemm.hpp index 1b49ca5740..d89c8f5e05 100644 --- a/library/include/ck/library/reference_tensor_operation/cpu/reference_gemm.hpp +++ b/library/include/ck/library/reference_tensor_operation/cpu/reference_gemm.hpp @@ -80,7 +80,8 @@ struct ReferenceGemm : public device::BaseOperator return 0; } - float Run(const device::BaseArgument* p_arg, int) override + float Run(const device::BaseArgument* p_arg, + const StreamConfig& /* stream_config */ = StreamConfig{}) override { return Run(*dynamic_cast(p_arg)); } diff --git a/library/include/ck/library/reference_tensor_operation/cpu/reference_gemm_bias_2d.hpp b/library/include/ck/library/reference_tensor_operation/cpu/reference_gemm_bias_2d.hpp index 7dd6fc9199..3e7f220e03 100644 --- a/library/include/ck/library/reference_tensor_operation/cpu/reference_gemm_bias_2d.hpp +++ b/library/include/ck/library/reference_tensor_operation/cpu/reference_gemm_bias_2d.hpp @@ -82,7 +82,8 @@ struct ReferenceGemmBias2D : public device::BaseOperator return 0; } - float Run(const device::BaseArgument* p_arg, int) override + float Run(const device::BaseArgument* p_arg, + const StreamConfig& /* stream_config */ = StreamConfig{}) override { return Run(*dynamic_cast(p_arg)); } diff --git a/library/include/ck/library/reference_tensor_operation/cpu/reference_gemm_bias_activation.hpp b/library/include/ck/library/reference_tensor_operation/cpu/reference_gemm_bias_activation.hpp index 7c9df272c2..60f72e9e51 100644 --- a/library/include/ck/library/reference_tensor_operation/cpu/reference_gemm_bias_activation.hpp +++ b/library/include/ck/library/reference_tensor_operation/cpu/reference_gemm_bias_activation.hpp @@ -85,7 +85,8 @@ struct ReferenceGemmBiasActivation : public device::BaseOperator return 0; } - float Run(const device::BaseArgument* p_arg, int) override + float Run(const device::BaseArgument* p_arg, + const StreamConfig& /* stream_config */ = StreamConfig{}) override { return Run(*dynamic_cast(p_arg)); } diff --git a/library/include/ck/library/reference_tensor_operation/cpu/reference_gemm_bias_activation_add.hpp b/library/include/ck/library/reference_tensor_operation/cpu/reference_gemm_bias_activation_add.hpp index 4d3c5effae..5e0ec75e5e 100644 --- a/library/include/ck/library/reference_tensor_operation/cpu/reference_gemm_bias_activation_add.hpp +++ b/library/include/ck/library/reference_tensor_operation/cpu/reference_gemm_bias_activation_add.hpp @@ -91,7 +91,8 @@ struct ReferenceGemmBiasActivationAdd : public device::BaseOperator return 0; } - float Run(const device::BaseArgument* p_arg, int) override + float Run(const device::BaseArgument* p_arg, + const StreamConfig& /* stream_config */ = StreamConfig{}) override { return Run(*dynamic_cast(p_arg)); } diff --git a/library/include/ck/library/utility/conv_fwd_util.hpp b/library/include/ck/library/utility/conv_util.hpp similarity index 95% rename from library/include/ck/library/utility/conv_fwd_util.hpp rename to library/include/ck/library/utility/conv_util.hpp index a29eb814fd..c881b89705 100644 --- a/library/include/ck/library/utility/conv_fwd_util.hpp +++ b/library/include/ck/library/utility/conv_util.hpp @@ -146,19 +146,19 @@ struct ConvParams const std::vector& left_pads, const std::vector& right_pads); - ck::index_t num_dim_spatial; - ck::index_t N; - ck::index_t K; - ck::index_t C; + ck::index_t num_dim_spatial_; + ck::index_t N_; + ck::index_t K_; + ck::index_t C_; - std::vector filter_spatial_lengths; - std::vector input_spatial_lengths; + std::vector filter_spatial_lengths_; + std::vector input_spatial_lengths_; - std::vector conv_filter_strides; - std::vector conv_filter_dilations; + std::vector conv_filter_strides_; + std::vector conv_filter_dilations_; - std::vector input_left_pads; - std::vector input_right_pads; + std::vector input_left_pads_; + std::vector input_right_pads_; std::vector GetOutputSpatialLengths() const; }; @@ -268,10 +268,10 @@ void run_reference_convolution_forward(const ConvParams& params, auto ref_argument = ref_conv.MakeArgument(input, weights, output, - params.conv_filter_strides, - params.conv_filter_dilations, - params.input_left_pads, - params.input_right_pads, + params.conv_filter_strides_, + params.conv_filter_dilations_, + params.input_left_pads_, + params.input_right_pads_, PassThrough{}, PassThrough{}, PassThrough{}); @@ -437,17 +437,17 @@ class ConvFwdOpInstance : public ck::utils::OpInstance input_dims{static_cast(params_.N), - static_cast(params_.C)}; + std::vector input_dims{static_cast(params_.N_), + static_cast(params_.C_)}; input_dims.insert(std::end(input_dims), - std::begin(params_.input_spatial_lengths), - std::end(params_.input_spatial_lengths)); + std::begin(params_.input_spatial_lengths_), + std::end(params_.input_spatial_lengths_)); - std::vector filter_dims{static_cast(params_.K), - static_cast(params_.C)}; + std::vector filter_dims{static_cast(params_.K_), + static_cast(params_.C_)}; filter_dims.insert(std::end(filter_dims), - std::begin(params_.filter_spatial_lengths), - std::end(params_.filter_spatial_lengths)); + std::begin(params_.filter_spatial_lengths_), + std::end(params_.filter_spatial_lengths_)); auto input = std::make_unique>( get_host_tensor_descriptor(input_dims, InLayout{})); @@ -465,8 +465,8 @@ class ConvFwdOpInstance : public ck::utils::OpInstance GetOutputTensor() const override { - std::vector output_dims{static_cast(params_.N), - static_cast(params_.K)}; + std::vector output_dims{static_cast(params_.N_), + static_cast(params_.K_)}; output_dims.insert(std::end(output_dims), std::begin(output_spatial_lengths_), std::end(output_spatial_lengths_)); @@ -522,16 +522,16 @@ class ConvFwdOpInstance : public ck::utils::OpInstance(in_device_buffers[0]->GetDeviceBuffer()), static_cast(in_device_buffers[1]->GetDeviceBuffer()), static_cast(out_device_buffer->GetDeviceBuffer()), - params_.N, - params_.K, - params_.C, - params_.input_spatial_lengths, - params_.filter_spatial_lengths, + params_.N_, + params_.K_, + params_.C_, + params_.input_spatial_lengths_, + params_.filter_spatial_lengths_, output_spatial_lengths_, - params_.conv_filter_strides, - params_.conv_filter_dilations, - params_.input_left_pads, - params_.input_right_pads, + params_.conv_filter_strides_, + params_.conv_filter_dilations_, + params_.input_left_pads_, + params_.input_right_pads_, InElementwiseOp{}, WeiElementwiseOp{}, OutElementwiseOp{}); @@ -539,20 +539,20 @@ class ConvFwdOpInstance : public ck::utils::OpInstance(params_.N, - params_.C, - params_.K, - params_.input_spatial_lengths, - params_.filter_spatial_lengths, + return get_btype(params_.N_, + params_.C_, + params_.K_, + params_.input_spatial_lengths_, + params_.filter_spatial_lengths_, output_spatial_lengths_); } diff --git a/library/include/ck/library/utility/op_instance_engine.hpp b/library/include/ck/library/utility/op_instance_engine.hpp index ec88b4e1b9..5429f66d3e 100644 --- a/library/include/ck/library/utility/op_instance_engine.hpp +++ b/library/include/ck/library/utility/op_instance_engine.hpp @@ -128,7 +128,7 @@ class OpInstanceRunEngine template ProfileBestConfig Profile(const std::vector& op_ptrs, - int nrepeat = 100, + bool time_kernel = false, bool do_verification = false, bool do_log = false) { @@ -143,7 +143,7 @@ class OpInstanceRunEngine if(op_ptr->IsSupportedArgument(argument.get())) { std::string op_name = op_ptr->GetTypeString(); - float avg_time = invoker->Run(argument.get(), nrepeat); + float avg_time = invoker->Run(argument.get(), StreamConfig{nullptr, time_kernel}); std::size_t flops = op_instance_.GetFlops(); std::size_t num_btype = op_instance_.GetBtype(); diff --git a/library/src/host_tensor/CMakeLists.txt b/library/src/host_tensor/CMakeLists.txt index fd100e477f..2a020b763d 100644 --- a/library/src/host_tensor/CMakeLists.txt +++ b/library/src/host_tensor/CMakeLists.txt @@ -10,10 +10,31 @@ set(HOST_TENSOR_SOURCE host_tensor.cpp ) -add_library(host_tensor SHARED ${HOST_TENSOR_SOURCE}) +add_library(host_tensor STATIC ${HOST_TENSOR_SOURCE}) +add_library(composable_kernel::host_tensor ALIAS host_tensor) + target_compile_features(host_tensor PUBLIC) set_target_properties(host_tensor PROPERTIES POSITION_INDEPENDENT_CODE ON) target_include_directories(host_tensor SYSTEM PUBLIC $) -install(TARGETS host_tensor LIBRARY DESTINATION lib) + +target_include_directories(host_tensor PUBLIC + "$" + "$" + "$" +) + +install(TARGETS host_tensor + EXPORT host_tensorTargets + LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR} + ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR} + RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR} + INCLUDES DESTINATION ${CMAKE_INSTALL_INCLUDEDIR} +) + +install(EXPORT host_tensorTargets + FILE composable_kernelhost_tensorTargets.cmake + NAMESPACE composable_kernel:: + DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/composable_kernel +) clang_tidy_check(host_tensor) diff --git a/library/src/host_tensor/device.cpp b/library/src/host_tensor/device.cpp index 3e80df80fb..9f0d982dbc 100644 --- a/library/src/host_tensor/device.cpp +++ b/library/src/host_tensor/device.cpp @@ -2,7 +2,7 @@ DeviceMem::DeviceMem(std::size_t mem_size) : mMemSize(mem_size) { - hipGetErrorString(hipMalloc(static_cast(&mpDeviceBuf), mMemSize)); + hip_check_error(hipMalloc(static_cast(&mpDeviceBuf), mMemSize)); } void* DeviceMem::GetDeviceBuffer() { return mpDeviceBuf; } @@ -11,49 +11,48 @@ std::size_t DeviceMem::GetBufferSize() { return mMemSize; } void DeviceMem::ToDevice(const void* p) { - hipGetErrorString( - hipMemcpy(mpDeviceBuf, const_cast(p), mMemSize, hipMemcpyHostToDevice)); + hip_check_error(hipMemcpy(mpDeviceBuf, const_cast(p), mMemSize, hipMemcpyHostToDevice)); } void DeviceMem::FromDevice(void* p) { - hipGetErrorString(hipMemcpy(p, mpDeviceBuf, mMemSize, hipMemcpyDeviceToHost)); + hip_check_error(hipMemcpy(p, mpDeviceBuf, mMemSize, hipMemcpyDeviceToHost)); } -void DeviceMem::SetZero() { hipGetErrorString(hipMemset(mpDeviceBuf, 0, mMemSize)); } +void DeviceMem::SetZero() { hip_check_error(hipMemset(mpDeviceBuf, 0, mMemSize)); } -DeviceMem::~DeviceMem() { hipGetErrorString(hipFree(mpDeviceBuf)); } +DeviceMem::~DeviceMem() { hip_check_error(hipFree(mpDeviceBuf)); } struct KernelTimerImpl { KernelTimerImpl() { - hipGetErrorString(hipEventCreate(&mStart)); - hipGetErrorString(hipEventCreate(&mEnd)); + hip_check_error(hipEventCreate(&mStart)); + hip_check_error(hipEventCreate(&mEnd)); } ~KernelTimerImpl() { - hipGetErrorString(hipEventDestroy(mStart)); - hipGetErrorString(hipEventDestroy(mEnd)); + hip_check_error(hipEventDestroy(mStart)); + hip_check_error(hipEventDestroy(mEnd)); } void Start() { - hipGetErrorString(hipDeviceSynchronize()); - hipGetErrorString(hipEventRecord(mStart, nullptr)); + hip_check_error(hipDeviceSynchronize()); + hip_check_error(hipEventRecord(mStart, nullptr)); } void End() { - hipGetErrorString(hipEventRecord(mEnd, nullptr)); - hipGetErrorString(hipEventSynchronize(mEnd)); + hip_check_error(hipEventRecord(mEnd, nullptr)); + hip_check_error(hipEventSynchronize(mEnd)); } float GetElapsedTime() const { float time; - hipGetErrorString(hipEventElapsedTime(&time, mStart, mEnd)); + hip_check_error(hipEventElapsedTime(&time, mStart, mEnd)); return time; } diff --git a/library/src/tensor_operation_instance/gpu/CMakeLists.txt b/library/src/tensor_operation_instance/gpu/CMakeLists.txt index 9d4c19db90..66dfa7c605 100644 --- a/library/src/tensor_operation_instance/gpu/CMakeLists.txt +++ b/library/src/tensor_operation_instance/gpu/CMakeLists.txt @@ -11,6 +11,7 @@ include_directories(BEFORE ${PROJECT_SOURCE_DIR}/include/ck/tensor_operation/gpu/thread ${PROJECT_SOURCE_DIR}/include/ck/tensor_operation/gpu/element ${PROJECT_SOURCE_DIR}/library/include/ck/library/host_tensor + ${PROJECT_SOURCE_DIR}/library/include/ck/library/host ${PROJECT_SOURCE_DIR}/library/include/ck/library/tensor_operation_instance ${PROJECT_SOURCE_DIR}/library/include/ck/library/tensor_operation_instance/gpu/reduce ${PROJECT_SOURCE_DIR}/external/include/half @@ -18,7 +19,7 @@ include_directories(BEFORE function(add_instance_library INSTANCE_NAME) message("adding instance ${INSTANCE_NAME}") - add_library(${INSTANCE_NAME} SHARED ${ARGN}) + add_library(${INSTANCE_NAME} OBJECT ${ARGN}) target_compile_features(${INSTANCE_NAME} PUBLIC) set_target_properties(${INSTANCE_NAME} PROPERTIES POSITION_INDEPENDENT_CODE ON) endfunction(add_instance_library INSTANCE_NAME) @@ -42,3 +43,74 @@ add_subdirectory(grouped_gemm) add_subdirectory(conv2d_bwd_weight) add_subdirectory(batched_gemm_reduce) add_subdirectory(cgemm) + +add_library(device_operations STATIC + $ + $ + $ + $ + $ + $ + $ + $ + $ + $ + $ + $ + $ + $ + $ + $ + $ + $ + device_conv2d.cpp +) +add_library(composablekernels::device_operations ALIAS device_operations) + + +set(DEV_OPS_INC_DIRS + ${PROJECT_SOURCE_DIR}/include/ck/ + ${PROJECT_SOURCE_DIR}/library/include/ck/ + ${PROJECT_SOURCE_DIR}/external/include/ +) +target_compile_features(device_operations PUBLIC) +set_target_properties(device_operations PROPERTIES POSITION_INDEPENDENT_CODE ON) +target_include_directories(device_operations PUBLIC + $ + $ + $ + $ + $ + $ + $ + $ + $ + $ + $ + $ + $ + $ + $ + $ +) + +#once new arches are enabled make this an option on the main cmake file +# and pass down here to be exported + +target_compile_options(device_operations +PRIVATE --offload-arch=gfx908 +) +# install(TARGETS device_operations LIBRARY DESTINATION lib) +install(TARGETS device_operations + EXPORT device_operationsTargets + LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR} + ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR} + RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR} + INCLUDES DESTINATION ${CMAKE_INSTALL_INCLUDEDIR} +) +install(DIRECTORY ${DEV_OPS_INC_DIRS} DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/ck) +install(EXPORT device_operationsTargets + FILE composable_kerneldevice_operationsTargets.cmake + NAMESPACE composable_kernel:: + DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/composable_kernel +) diff --git a/library/src/tensor_operation_instance/gpu/batched_gemm/CMakeLists.txt b/library/src/tensor_operation_instance/gpu/batched_gemm/CMakeLists.txt index 35e24462b5..016c85f673 100644 --- a/library/src/tensor_operation_instance/gpu/batched_gemm/CMakeLists.txt +++ b/library/src/tensor_operation_instance/gpu/batched_gemm/CMakeLists.txt @@ -18,9 +18,9 @@ set(DEVICE_BATCHED_GEMM_INSTANCE_SOURCE device_batched_gemm_xdl_int8_int8_int8_gkm_gnk_gmn_instance.cpp; ) -add_library(device_batched_gemm_instance SHARED ${DEVICE_BATCHED_GEMM_INSTANCE_SOURCE}) -target_compile_features(device_batched_gemm_instance PUBLIC) +add_library(device_batched_gemm_instance OBJECT ${DEVICE_BATCHED_GEMM_INSTANCE_SOURCE}) +# target_compile_features(device_batched_gemm_instance PUBLIC) set_target_properties(device_batched_gemm_instance PROPERTIES POSITION_INDEPENDENT_CODE ON) -install(TARGETS device_batched_gemm_instance LIBRARY DESTINATION lib) +# install(TARGETS device_batched_gemm_instance LIBRARY DESTINATION lib) clang_tidy_check(device_batched_gemm_instance) diff --git a/library/src/tensor_operation_instance/gpu/batched_gemm_reduce/CMakeLists.txt b/library/src/tensor_operation_instance/gpu/batched_gemm_reduce/CMakeLists.txt index 59eb6cb1cc..67a3c15d00 100644 --- a/library/src/tensor_operation_instance/gpu/batched_gemm_reduce/CMakeLists.txt +++ b/library/src/tensor_operation_instance/gpu/batched_gemm_reduce/CMakeLists.txt @@ -5,7 +5,8 @@ set(DEVICE_BATCHED_GEMM_REDUCE_INSTANCE_SOURCE device_batched_gemm_reduce_xdl_cshuffle_f16_f16_f16_f32_f32_gkm_gnk_gmn_instance.cpp ) -add_instance_library(device_batched_gemm_reduce_instance ${DEVICE_BATCHED_GEMM_REDUCE_INSTANCE_SOURCE}) -install(TARGETS device_batched_gemm_reduce_instance LIBRARY DESTINATION lib) +add_instance_library(device_batched_gemm_reduce_instance OBJECT ${DEVICE_BATCHED_GEMM_REDUCE_INSTANCE_SOURCE}) +target_compile_features(device_batched_gemm_reduce_instance PUBLIC) +set_target_properties(device_batched_gemm_reduce_instance PROPERTIES POSITION_INDEPENDENT_CODE ON) clang_tidy_check(device_batched_gemm_reduce_instance) diff --git a/library/src/tensor_operation_instance/gpu/conv1d_fwd/CMakeLists.txt b/library/src/tensor_operation_instance/gpu/conv1d_fwd/CMakeLists.txt index 6c7c3e4f78..77aa6198f5 100644 --- a/library/src/tensor_operation_instance/gpu/conv1d_fwd/CMakeLists.txt +++ b/library/src/tensor_operation_instance/gpu/conv1d_fwd/CMakeLists.txt @@ -6,9 +6,9 @@ set(DEVICE_CONV1D_FWD_INSTANCE_SOURCE device_conv1d_fwd_xdl_nwc_kxc_nwk_int8_instance.cpp; ) -add_library(device_conv1d_fwd_instance SHARED ${DEVICE_CONV1D_FWD_INSTANCE_SOURCE}) -target_compile_features(device_conv1d_fwd_instance PUBLIC) +add_library(device_conv1d_fwd_instance OBJECT ${DEVICE_CONV1D_FWD_INSTANCE_SOURCE}) +# target_compile_features(device_conv1d_fwd_instance PUBLIC) set_target_properties(device_conv1d_fwd_instance PROPERTIES POSITION_INDEPENDENT_CODE ON) -install(TARGETS device_conv1d_fwd_instance LIBRARY DESTINATION lib) +# install(TARGETS device_conv1d_fwd_instance LIBRARY DESTINATION lib) clang_tidy_check(device_conv1d_fwd_instance) diff --git a/library/src/tensor_operation_instance/gpu/conv2d_bwd_data/CMakeLists.txt b/library/src/tensor_operation_instance/gpu/conv2d_bwd_data/CMakeLists.txt index d619ef4bf1..d7882a7d8b 100644 --- a/library/src/tensor_operation_instance/gpu/conv2d_bwd_data/CMakeLists.txt +++ b/library/src/tensor_operation_instance/gpu/conv2d_bwd_data/CMakeLists.txt @@ -6,9 +6,7 @@ set(DEVICE_CONV2D_BWD_DATA_INSTANCE_SOURCE device_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk_int8_instance.cpp; ) -add_library(device_conv2d_bwd_data_instance SHARED ${DEVICE_CONV2D_BWD_DATA_INSTANCE_SOURCE}) -target_compile_features(device_conv2d_bwd_data_instance PUBLIC) +add_library(device_conv2d_bwd_data_instance OBJECT ${DEVICE_CONV2D_BWD_DATA_INSTANCE_SOURCE}) set_target_properties(device_conv2d_bwd_data_instance PROPERTIES POSITION_INDEPENDENT_CODE ON) -install(TARGETS device_conv2d_bwd_data_instance LIBRARY DESTINATION lib) clang_tidy_check(device_conv2d_bwd_data_instance) diff --git a/library/src/tensor_operation_instance/gpu/conv2d_bwd_weight/CMakeLists.txt b/library/src/tensor_operation_instance/gpu/conv2d_bwd_weight/CMakeLists.txt index 6183e70b9b..7c384a882b 100644 --- a/library/src/tensor_operation_instance/gpu/conv2d_bwd_weight/CMakeLists.txt +++ b/library/src/tensor_operation_instance/gpu/conv2d_bwd_weight/CMakeLists.txt @@ -3,7 +3,7 @@ set(DEVICE_CONV2D_BWD_WEIGHT_INSTANCE_SOURCE device_conv2d_bwd_weight_xdl_nhwc_kyxc_nhwk_f16_instance.cpp; device_conv2d_bwd_weight_xdl_nhwc_kyxc_nhwk_f32_instance.cpp; ) -add_library(device_conv2d_bwd_weight_instance SHARED ${DEVICE_CONV2D_BWD_WEIGHT_INSTANCE_SOURCE}) +add_library(device_conv2d_bwd_weight_instance OBJECT ${DEVICE_CONV2D_BWD_WEIGHT_INSTANCE_SOURCE}) target_compile_features(device_conv2d_bwd_weight_instance PUBLIC) set_target_properties(device_conv2d_bwd_weight_instance PROPERTIES POSITION_INDEPENDENT_CODE ON) install(TARGETS device_conv2d_bwd_weight_instance LIBRARY DESTINATION lib) diff --git a/library/src/tensor_operation_instance/gpu/conv2d_fwd/CMakeLists.txt b/library/src/tensor_operation_instance/gpu/conv2d_fwd/CMakeLists.txt index 7483861524..857e36d6f5 100644 --- a/library/src/tensor_operation_instance/gpu/conv2d_fwd/CMakeLists.txt +++ b/library/src/tensor_operation_instance/gpu/conv2d_fwd/CMakeLists.txt @@ -6,9 +6,7 @@ set(DEVICE_CONV2D_FWD_INSTANCE_SOURCE device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_int8_instance.cpp; device_conv2d_fwd_xdl_c_shuffle_nhwc_kyxc_nhwk_f16_instance.cpp; ) -add_library(device_conv2d_fwd_instance SHARED ${DEVICE_CONV2D_FWD_INSTANCE_SOURCE}) -target_compile_features(device_conv2d_fwd_instance PUBLIC) +add_library(device_conv2d_fwd_instance OBJECT ${DEVICE_CONV2D_FWD_INSTANCE_SOURCE}) set_target_properties(device_conv2d_fwd_instance PROPERTIES POSITION_INDEPENDENT_CODE ON) -install(TARGETS device_conv2d_fwd_instance LIBRARY DESTINATION lib) clang_tidy_check(device_conv2d_fwd_instance) diff --git a/library/src/tensor_operation_instance/gpu/conv2d_fwd_bias_relu/CMakeLists.txt b/library/src/tensor_operation_instance/gpu/conv2d_fwd_bias_relu/CMakeLists.txt index 27a9736a3f..ad66c73bf8 100644 --- a/library/src/tensor_operation_instance/gpu/conv2d_fwd_bias_relu/CMakeLists.txt +++ b/library/src/tensor_operation_instance/gpu/conv2d_fwd_bias_relu/CMakeLists.txt @@ -2,9 +2,7 @@ set(DEVICE_CONV2D_FWD_BIAS_RELU_INSTANCE_SOURCE device_conv2d_fwd_xdl_c_shuffle_bias_relu_nhwc_kyxc_nhwk_f16_instance.cpp; ) -add_library(device_conv2d_fwd_bias_relu_instance SHARED ${DEVICE_CONV2D_FWD_BIAS_RELU_INSTANCE_SOURCE}) -target_compile_features(device_conv2d_fwd_bias_relu_instance PUBLIC) +add_library(device_conv2d_fwd_bias_relu_instance OBJECT ${DEVICE_CONV2D_FWD_BIAS_RELU_INSTANCE_SOURCE}) set_target_properties(device_conv2d_fwd_bias_relu_instance PROPERTIES POSITION_INDEPENDENT_CODE ON) -install(TARGETS device_conv2d_fwd_bias_relu_instance LIBRARY DESTINATION lib) clang_tidy_check(device_conv2d_fwd_bias_relu_instance) diff --git a/library/src/tensor_operation_instance/gpu/conv2d_fwd_bias_relu_add/CMakeLists.txt b/library/src/tensor_operation_instance/gpu/conv2d_fwd_bias_relu_add/CMakeLists.txt index d7bec82174..36b1f6c153 100644 --- a/library/src/tensor_operation_instance/gpu/conv2d_fwd_bias_relu_add/CMakeLists.txt +++ b/library/src/tensor_operation_instance/gpu/conv2d_fwd_bias_relu_add/CMakeLists.txt @@ -2,9 +2,7 @@ set(DEVICE_CONV2D_FWD_BIAS_RELU_ADD_INSTANCE_SOURCE device_conv2d_fwd_xdl_c_shuffle_bias_relu_add_nhwc_kyxc_nhwk_f16_instance.cpp; ) -add_library(device_conv2d_fwd_bias_relu_add_instance SHARED ${DEVICE_CONV2D_FWD_BIAS_RELU_ADD_INSTANCE_SOURCE}) -target_compile_features(device_conv2d_fwd_bias_relu_add_instance PUBLIC) +add_library(device_conv2d_fwd_bias_relu_add_instance OBJECT ${DEVICE_CONV2D_FWD_BIAS_RELU_ADD_INSTANCE_SOURCE}) set_target_properties(device_conv2d_fwd_bias_relu_add_instance PROPERTIES POSITION_INDEPENDENT_CODE ON) -install(TARGETS device_conv2d_fwd_bias_relu_add_instance LIBRARY DESTINATION lib) clang_tidy_check(device_conv2d_fwd_bias_relu_add_instance) diff --git a/library/src/tensor_operation_instance/gpu/conv2d_fwd_bias_relu_atomic_add/CMakeLists.txt b/library/src/tensor_operation_instance/gpu/conv2d_fwd_bias_relu_atomic_add/CMakeLists.txt index c0942d5485..5906c7c5ac 100644 --- a/library/src/tensor_operation_instance/gpu/conv2d_fwd_bias_relu_atomic_add/CMakeLists.txt +++ b/library/src/tensor_operation_instance/gpu/conv2d_fwd_bias_relu_atomic_add/CMakeLists.txt @@ -3,9 +3,7 @@ set(DEVICE_CONV2D_FWD_BIAS_RELU_ATOMIC_ADD_INSTANCE_SOURCE device_conv2d_fwd_xdl_c_shuffle_bias_relu_atomic_add_nhwc_kyxc_nhwk_f16_instance.cpp; ) -add_library(device_conv2d_fwd_bias_relu_atomic_add_instance SHARED ${DEVICE_CONV2D_FWD_BIAS_RELU_ATOMIC_ADD_INSTANCE_SOURCE}) -target_compile_features(device_conv2d_fwd_bias_relu_atomic_add_instance PUBLIC) +add_library(device_conv2d_fwd_bias_relu_atomic_add_instance OBJECT ${DEVICE_CONV2D_FWD_BIAS_RELU_ATOMIC_ADD_INSTANCE_SOURCE}) set_target_properties(device_conv2d_fwd_bias_relu_atomic_add_instance PROPERTIES POSITION_INDEPENDENT_CODE ON) -install(TARGETS device_conv2d_fwd_bias_relu_atomic_add_instance LIBRARY DESTINATION lib) clang_tidy_check(device_conv2d_fwd_bias_relu_atomic_add_instance) diff --git a/library/src/tensor_operation_instance/gpu/conv3d_fwd/CMakeLists.txt b/library/src/tensor_operation_instance/gpu/conv3d_fwd/CMakeLists.txt index f6849a7bb2..91a299c742 100644 --- a/library/src/tensor_operation_instance/gpu/conv3d_fwd/CMakeLists.txt +++ b/library/src/tensor_operation_instance/gpu/conv3d_fwd/CMakeLists.txt @@ -5,9 +5,8 @@ set(DEVICE_CONV3D_FWD_INSTANCE_SOURCE device_conv3d_fwd_xdl_ndhwc_kzyxc_ndhwk_bf16_instance.cpp; device_conv3d_fwd_xdl_ndhwc_kzyxc_ndhwk_int8_instance.cpp; ) -add_library(device_conv3d_fwd_instance SHARED ${DEVICE_CONV3D_FWD_INSTANCE_SOURCE}) +add_library(device_conv3d_fwd_instance OBJECT ${DEVICE_CONV3D_FWD_INSTANCE_SOURCE}) target_compile_features(device_conv3d_fwd_instance PUBLIC) set_target_properties(device_conv3d_fwd_instance PROPERTIES POSITION_INDEPENDENT_CODE ON) -install(TARGETS device_conv3d_fwd_instance LIBRARY DESTINATION lib) clang_tidy_check(device_conv3d_fwd_instance) diff --git a/library/src/tensor_operation_instance/gpu/convnd_bwd_data/CMakeLists.txt b/library/src/tensor_operation_instance/gpu/convnd_bwd_data/CMakeLists.txt index 9ee961ad74..037f860808 100644 --- a/library/src/tensor_operation_instance/gpu/convnd_bwd_data/CMakeLists.txt +++ b/library/src/tensor_operation_instance/gpu/convnd_bwd_data/CMakeLists.txt @@ -14,7 +14,7 @@ set(DEVICE_CONVND_BWD_DATA_INSTANCE_SOURCE device_conv3d_bwd_data_xdl_ndhwc_kzyxc_ndhwk_int8_instance.cpp; ) -add_library(device_convnd_bwd_data_instance SHARED ${DEVICE_CONVND_BWD_DATA_INSTANCE_SOURCE}) +add_library(device_convnd_bwd_data_instance OBJECT ${DEVICE_CONVND_BWD_DATA_INSTANCE_SOURCE}) target_compile_features(device_convnd_bwd_data_instance PUBLIC) set_target_properties(device_convnd_bwd_data_instance PROPERTIES POSITION_INDEPENDENT_CODE ON) install(TARGETS device_convnd_bwd_data_instance LIBRARY DESTINATION lib) diff --git a/library/src/tensor_operation_instance/gpu/device_conv2d.cpp b/library/src/tensor_operation_instance/gpu/device_conv2d.cpp new file mode 100644 index 0000000000..6b99433ffa --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/device_conv2d.cpp @@ -0,0 +1,201 @@ +#include +#include "config.hpp" +#include "device_conv2d_fwd_xdl_c_shuffle_nhwc_kyxc_nhwk.hpp" +#include "element_wise_operation.hpp" +#include "device_operation_instance.hpp" +#include "host_interface.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace device_conv2d_fwd_instance { +using PassThrough = ck::tensor_operation::element_wise::PassThrough; +void add_device_conv2d_fwd_xdl_c_shuffle_nhwc_kyxc_nhwk_f16_instances( + std::vector>& instances); +void add_device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_f32_instances( + std::vector>& instances); +void add_device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_bf16_instances( + std::vector>& instances); +void add_device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_f16_instances( + std::vector>& instances); +void add_device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_int8_instances( + std::vector>& instances); + +} // namespace device_conv2d_fwd_instance +} // namespace device +} // namespace tensor_operation +} // namespace ck + +using PassThrough = ck::tensor_operation::element_wise::PassThrough; +struct DeviceConvFwdPtr_t::DeviceConvFwdPtrImpl +{ + std::unique_ptr + MakeArgumentPointer(void* in_ptr, + void* wei_ptr, + void* out_ptr, + size_t N, + size_t K, + size_t C, + std::vector input_spatial_lengths, + std::vector filter_spatial_lengths, + std::vector output_spatial_lengths, + std::vector conv_filter_strides, + std::vector conv_filter_dilations, + std::vector input_left_pads, + std::vector input_right_pads) const + { + return el->MakeArgumentPointer(in_ptr, + wei_ptr, + out_ptr, + N, + K, + C, + input_spatial_lengths, + filter_spatial_lengths, + output_spatial_lengths, + conv_filter_strides, + conv_filter_dilations, + input_left_pads, + input_right_pads, + PassThrough{}, + PassThrough{}, + PassThrough{}); + } + std::unique_ptr MakeInvokerPointer() const + { + return el->MakeInvokerPointer(); + } + + std::string GetTypeString() { return el->GetTypeString(); } + bool IsSupportedArgument(const DeviceConvFwdPtr_t::BaseArgument* arg) + { + return el->IsSupportedArgument(arg); + } + + ck::tensor_operation::device::DeviceConvFwdPtr el; +}; + +DeviceConvFwdPtr_t::DeviceConvFwdPtr_t() : pImpl(nullptr) {} +DeviceConvFwdPtr_t::~DeviceConvFwdPtr_t() = default; +DeviceConvFwdPtr_t::DeviceConvFwdPtr_t(DeviceConvFwdPtr_t&&) = default; +DeviceConvFwdPtr_t::DeviceConvFwdPtr_t(DeviceConvFwdPtr_t::DeviceConvFwdPtrImpl& other) + : pImpl(std::make_unique(std::move(other))) +{ +} + +std::unique_ptr +DeviceConvFwdPtr_t::MakeArgumentPointer(void* in_ptr, + void* wei_ptr, + void* out_ptr, + size_t N, + size_t K, + size_t C, + std::vector input_spatial_lengths, + std::vector filter_spatial_lengths, + std::vector output_spatial_lengths, + std::vector conv_filter_strides, + std::vector conv_filter_dilations, + std::vector input_left_pads, + std::vector input_right_pads) const +{ + return pImpl->MakeArgumentPointer(in_ptr, + wei_ptr, + out_ptr, + N, + K, + C, + input_spatial_lengths, + filter_spatial_lengths, + output_spatial_lengths, + conv_filter_strides, + conv_filter_dilations, + input_left_pads, + input_right_pads); +} + +std::unique_ptr DeviceConvFwdPtr_t::MakeInvokerPointer() const +{ + return pImpl->MakeInvokerPointer(); +} + +std::string DeviceConvFwdPtr_t::GetTypeString() { return pImpl->GetTypeString(); } +bool DeviceConvFwdPtr_t::IsSupportedArgument(const DeviceConvFwdPtr_t::BaseArgument* arg_ptr) +{ + return pImpl->IsSupportedArgument(arg_ptr); +} + +using namespace ck::tensor_operation::device::device_conv2d_fwd_instance; +void add_device_conv2d_fwd_xdl_c_shuffle_nhwc_kyxc_nhwk_f16_instances_t( + std::vector& instances) +{ + std::vector< + ck::tensor_operation::device::DeviceConvFwdPtr> + local_instances; + add_device_conv2d_fwd_xdl_c_shuffle_nhwc_kyxc_nhwk_f16_instances(local_instances); + for(auto& kinder : local_instances) + { + DeviceConvFwdPtr_t::DeviceConvFwdPtrImpl tmp{std::move(kinder)}; + instances.emplace_back(tmp); + } + return; +} + +void add_device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_f32_instances_t( + std::vector& instances) +{ + std::vector< + ck::tensor_operation::device::DeviceConvFwdPtr> + local_instances; + add_device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_f32_instances(local_instances); + for(auto& kinder : local_instances) + { + DeviceConvFwdPtr_t::DeviceConvFwdPtrImpl tmp{std::move(kinder)}; + instances.emplace_back(tmp); // Perhaps we can do better + } + return; +} + +void add_device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_bf16_instances_t( + std::vector& instances) +{ + std::vector< + ck::tensor_operation::device::DeviceConvFwdPtr> + local_instances; + add_device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_bf16_instances(local_instances); + for(auto& kinder : local_instances) + { + DeviceConvFwdPtr_t::DeviceConvFwdPtrImpl tmp{std::move(kinder)}; + instances.emplace_back(tmp); // Perhaps we can do better + } + return; +} + +void add_device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_f16_instances_t( + std::vector& instances) +{ + std::vector< + ck::tensor_operation::device::DeviceConvFwdPtr> + local_instances; + add_device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_f16_instances(local_instances); + for(auto& kinder : local_instances) + { + DeviceConvFwdPtr_t::DeviceConvFwdPtrImpl tmp{std::move(kinder)}; + instances.emplace_back(tmp); // Perhaps we can do better + } + return; +} + +void add_device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_int8_instances_t( + std::vector& instances) +{ + std::vector< + ck::tensor_operation::device::DeviceConvFwdPtr> + local_instances; + add_device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_int8_instances(local_instances); + for(auto& kinder : local_instances) + { + DeviceConvFwdPtr_t::DeviceConvFwdPtrImpl tmp{std::move(kinder)}; + instances.emplace_back(tmp); + } + return; +} diff --git a/library/src/tensor_operation_instance/gpu/gemm/CMakeLists.txt b/library/src/tensor_operation_instance/gpu/gemm/CMakeLists.txt index 5f057adcc5..556b06d7e1 100644 --- a/library/src/tensor_operation_instance/gpu/gemm/CMakeLists.txt +++ b/library/src/tensor_operation_instance/gpu/gemm/CMakeLists.txt @@ -35,10 +35,9 @@ set(DEVICE_GEMM_INSTANCE_SOURCE device_gemm_xdl_splitk_f16_f16_f16_km_nk_mn_instance.cpp; ) -add_library(device_gemm_instance SHARED ${DEVICE_GEMM_INSTANCE_SOURCE}) +add_library(device_gemm_instance OBJECT ${DEVICE_GEMM_INSTANCE_SOURCE}) target_compile_features(device_gemm_instance PUBLIC) set_target_properties(device_gemm_instance PROPERTIES POSITION_INDEPENDENT_CODE ON) -install(TARGETS device_gemm_instance LIBRARY DESTINATION lib) clang_tidy_check(device_gemm_instance) diff --git a/library/src/tensor_operation_instance/gpu/gemm_bias2d/CMakeLists.txt b/library/src/tensor_operation_instance/gpu/gemm_bias2d/CMakeLists.txt index a0e5ba61a1..e2b0abb1d1 100644 --- a/library/src/tensor_operation_instance/gpu/gemm_bias2d/CMakeLists.txt +++ b/library/src/tensor_operation_instance/gpu/gemm_bias2d/CMakeLists.txt @@ -10,9 +10,7 @@ set(DEVICE_GEMM_BIAS2D_INSTANCE_SOURCE device_gemm_xdl_c_shuffle_bias_2d_f16_f16_f16_mk_nk_mn_instance.cpp; ) -add_library(device_gemm_bias2d_instance SHARED ${DEVICE_GEMM_BIAS2D_INSTANCE_SOURCE}) -target_compile_features(device_gemm_bias2d_instance PUBLIC) +add_library(device_gemm_bias2d_instance OBJECT ${DEVICE_GEMM_BIAS2D_INSTANCE_SOURCE}) set_target_properties(device_gemm_bias2d_instance PROPERTIES POSITION_INDEPENDENT_CODE ON) -install(TARGETS device_gemm_bias2d_instance LIBRARY DESTINATION lib) clang_tidy_check(device_gemm_bias2d_instance) diff --git a/library/src/tensor_operation_instance/gpu/gemm_bias_relu/CMakeLists.txt b/library/src/tensor_operation_instance/gpu/gemm_bias_relu/CMakeLists.txt index 69e05673d6..e2e7d4badd 100644 --- a/library/src/tensor_operation_instance/gpu/gemm_bias_relu/CMakeLists.txt +++ b/library/src/tensor_operation_instance/gpu/gemm_bias_relu/CMakeLists.txt @@ -6,9 +6,7 @@ set(DEVICE_GEMM_BIAS_RELU_INSTANCE_SOURCE device_gemm_xdl_c_shuffle_bias_relu_f16_f16_f16_km_nk_mn_instance.cpp; ) -add_library(device_gemm_bias_relu_instance SHARED ${DEVICE_GEMM_BIAS_RELU_INSTANCE_SOURCE}) -target_compile_features(device_gemm_bias_relu_instance PUBLIC) +add_library(device_gemm_bias_relu_instance OBJECT ${DEVICE_GEMM_BIAS_RELU_INSTANCE_SOURCE}) set_target_properties(device_gemm_bias_relu_instance PROPERTIES POSITION_INDEPENDENT_CODE ON) -install(TARGETS device_gemm_bias_relu_instance LIBRARY DESTINATION lib) clang_tidy_check(device_gemm_bias_relu_instance) diff --git a/library/src/tensor_operation_instance/gpu/gemm_bias_relu_add/CMakeLists.txt b/library/src/tensor_operation_instance/gpu/gemm_bias_relu_add/CMakeLists.txt index 016bc4be2d..a10dbb555d 100644 --- a/library/src/tensor_operation_instance/gpu/gemm_bias_relu_add/CMakeLists.txt +++ b/library/src/tensor_operation_instance/gpu/gemm_bias_relu_add/CMakeLists.txt @@ -6,9 +6,7 @@ set(DEVICE_GEMM_BIAS_RELU_ADD_INSTANCE_SOURCE device_gemm_xdl_c_shuffle_bias_relu_add_f16_f16_f16_km_nk_mn_instance.cpp; ) -add_library(device_gemm_bias_relu_add_instance SHARED ${DEVICE_GEMM_BIAS_RELU_ADD_INSTANCE_SOURCE}) -target_compile_features(device_gemm_bias_relu_add_instance PUBLIC) +add_library(device_gemm_bias_relu_add_instance OBJECT ${DEVICE_GEMM_BIAS_RELU_ADD_INSTANCE_SOURCE}) set_target_properties(device_gemm_bias_relu_add_instance PROPERTIES POSITION_INDEPENDENT_CODE ON) -install(TARGETS device_gemm_bias_relu_add_instance LIBRARY DESTINATION lib) clang_tidy_check(device_gemm_bias_relu_add_instance) diff --git a/library/src/tensor_operation_instance/gpu/grouped_gemm/CMakeLists.txt b/library/src/tensor_operation_instance/gpu/grouped_gemm/CMakeLists.txt index 8f591d8c49..6c5e31fddd 100644 --- a/library/src/tensor_operation_instance/gpu/grouped_gemm/CMakeLists.txt +++ b/library/src/tensor_operation_instance/gpu/grouped_gemm/CMakeLists.txt @@ -6,7 +6,7 @@ set(DEVICE_GROUPED_GEMM_INSTANCE_SOURCE device_grouped_gemm_xdl_f16_f16_f16_km_nk_mn_instance.cpp; ) -add_library(device_grouped_gemm_instance SHARED ${DEVICE_GROUPED_GEMM_INSTANCE_SOURCE}) +add_library(device_grouped_gemm_instance OBJECT ${DEVICE_GROUPED_GEMM_INSTANCE_SOURCE}) target_compile_features(device_grouped_gemm_instance PUBLIC) set_target_properties(device_grouped_gemm_instance PROPERTIES POSITION_INDEPENDENT_CODE ON) diff --git a/library/src/tensor_operation_instance/gpu/reduce/CMakeLists.txt b/library/src/tensor_operation_instance/gpu/reduce/CMakeLists.txt index cced3a4b76..81987ac0d4 100644 --- a/library/src/tensor_operation_instance/gpu/reduce/CMakeLists.txt +++ b/library/src/tensor_operation_instance/gpu/reduce/CMakeLists.txt @@ -38,9 +38,7 @@ set(DEVICE_REDUCE_INSTANCE_SOURCE device_reduce_instance_multiblock_partial_reduce_b16_f32_b16.cpp; ) -add_library(device_reduce_instance SHARED ${DEVICE_REDUCE_INSTANCE_SOURCE}) -target_compile_features(device_reduce_instance PUBLIC) +add_library(device_reduce_instance OBJECT ${DEVICE_REDUCE_INSTANCE_SOURCE}) set_target_properties(device_reduce_instance PROPERTIES POSITION_INDEPENDENT_CODE ON) -install(TARGETS device_reduce_instance LIBRARY DESTINATION lib) clang_tidy_check(device_reduce_instance) diff --git a/library/src/utility/CMakeLists.txt b/library/src/utility/CMakeLists.txt index 3580ba1a8f..0914855d59 100644 --- a/library/src/utility/CMakeLists.txt +++ b/library/src/utility/CMakeLists.txt @@ -8,14 +8,14 @@ include_directories(BEFORE ${PROJECT_SOURCE_DIR}/library/include/ck/library/utility ) -set(CONV_FWD_UTIL_SOURCE - conv_fwd_util.cpp +set(CONV_UTIL_SOURCE + conv_util.cpp ) -add_library(conv_fwd_util SHARED ${CONV_FWD_UTIL_SOURCE}) -target_link_libraries(conv_fwd_util PRIVATE host_tensor) -target_compile_features(conv_fwd_util PUBLIC) -set_target_properties(conv_fwd_util PROPERTIES POSITION_INDEPENDENT_CODE ON) -target_include_directories(conv_fwd_util SYSTEM PUBLIC $) +add_library(conv_util SHARED ${CONV_UTIL_SOURCE}) +target_link_libraries(conv_util PRIVATE host_tensor) +target_compile_features(conv_util PUBLIC) +set_target_properties(conv_util PROPERTIES POSITION_INDEPENDENT_CODE ON) +target_include_directories(conv_util SYSTEM PUBLIC $) -clang_tidy_check(conv_fwd_util) +clang_tidy_check(conv_util) diff --git a/library/src/utility/conv_fwd_util.cpp b/library/src/utility/conv_util.cpp similarity index 62% rename from library/src/utility/conv_fwd_util.cpp rename to library/src/utility/conv_util.cpp index 01bfeda16d..a60d1a3495 100644 --- a/library/src/utility/conv_fwd_util.cpp +++ b/library/src/utility/conv_util.cpp @@ -1,5 +1,5 @@ -#include "conv_fwd_util.hpp" +#include "conv_util.hpp" namespace ck { namespace utils { @@ -37,16 +37,16 @@ std::size_t get_flops(ck::index_t N, } ConvParams::ConvParams() - : num_dim_spatial(2), - N(128), - K(256), - C(192), - filter_spatial_lengths(2, 3), - input_spatial_lengths(2, 71), - conv_filter_strides(2, 2), - conv_filter_dilations(2, 1), - input_left_pads(2, 1), - input_right_pads(2, 1) + : num_dim_spatial_(2), + N_(128), + K_(256), + C_(192), + filter_spatial_lengths_(2, 3), + input_spatial_lengths_(2, 71), + conv_filter_strides_(2, 2), + conv_filter_dilations_(2, 1), + input_left_pads_(2, 1), + input_right_pads_(2, 1) { } @@ -60,23 +60,23 @@ ConvParams::ConvParams(ck::index_t n_dim, const std::vector& dilations, const std::vector& left_pads, const std::vector& right_pads) - : num_dim_spatial(n_dim), - N(n_batch), - K(n_out_channels), - C(n_in_channels), - filter_spatial_lengths(filters_len), - input_spatial_lengths(input_len), - conv_filter_strides(strides), - conv_filter_dilations(dilations), - input_left_pads(left_pads), - input_right_pads(right_pads) + : num_dim_spatial_(n_dim), + N_(n_batch), + K_(n_out_channels), + C_(n_in_channels), + filter_spatial_lengths_(filters_len), + input_spatial_lengths_(input_len), + conv_filter_strides_(strides), + conv_filter_dilations_(dilations), + input_left_pads_(left_pads), + input_right_pads_(right_pads) { - if(ck::type_convert(filter_spatial_lengths.size()) != num_dim_spatial || - ck::type_convert(input_spatial_lengths.size()) != num_dim_spatial || - ck::type_convert(conv_filter_strides.size()) != num_dim_spatial || - ck::type_convert(conv_filter_dilations.size()) != num_dim_spatial || - ck::type_convert(input_left_pads.size()) != num_dim_spatial || - ck::type_convert(input_right_pads.size()) != num_dim_spatial) + if(ck::type_convert(filter_spatial_lengths_.size()) != num_dim_spatial_ || + ck::type_convert(input_spatial_lengths_.size()) != num_dim_spatial_ || + ck::type_convert(conv_filter_strides_.size()) != num_dim_spatial_ || + ck::type_convert(conv_filter_dilations_.size()) != num_dim_spatial_ || + ck::type_convert(input_left_pads_.size()) != num_dim_spatial_ || + ck::type_convert(input_right_pads_.size()) != num_dim_spatial_) { throw( std::runtime_error("ConvParams::GetOutputSpatialLengths: " @@ -86,27 +86,28 @@ ConvParams::ConvParams(ck::index_t n_dim, std::vector ConvParams::GetOutputSpatialLengths() const { - if(ck::type_convert(filter_spatial_lengths.size()) != num_dim_spatial || - ck::type_convert(input_spatial_lengths.size()) != num_dim_spatial || - ck::type_convert(conv_filter_strides.size()) != num_dim_spatial || - ck::type_convert(conv_filter_dilations.size()) != num_dim_spatial || - ck::type_convert(input_left_pads.size()) != num_dim_spatial || - ck::type_convert(input_right_pads.size()) != num_dim_spatial) + if(ck::type_convert(filter_spatial_lengths_.size()) != num_dim_spatial_ || + ck::type_convert(input_spatial_lengths_.size()) != num_dim_spatial_ || + ck::type_convert(conv_filter_strides_.size()) != num_dim_spatial_ || + ck::type_convert(conv_filter_dilations_.size()) != num_dim_spatial_ || + ck::type_convert(input_left_pads_.size()) != num_dim_spatial_ || + ck::type_convert(input_right_pads_.size()) != num_dim_spatial_) { throw( std::runtime_error("ConvParams::GetOutputSpatialLengths: " "parameter size is different from number of declared dimensions!")); } - std::vector out_spatial_len(num_dim_spatial, 0); - for(ck::index_t i = 0; i < num_dim_spatial; ++i) + std::vector out_spatial_len(num_dim_spatial_, 0); + for(ck::index_t i = 0; i < num_dim_spatial_; ++i) { // XEff = (X - 1) * conv_dilation_w + 1; // Wo = (Wi + in_left_pad_w + in_right_pad_w - XEff) / conv_stride_w + 1; - const ck::index_t idx_eff = (filter_spatial_lengths[i] - 1) * conv_filter_dilations[i] + 1; + const ck::index_t idx_eff = + (filter_spatial_lengths_[i] - 1) * conv_filter_dilations_[i] + 1; out_spatial_len[i] = - (input_spatial_lengths[i] + input_left_pads[i] + input_right_pads[i] - idx_eff) / - conv_filter_strides[i] + + (input_spatial_lengths_[i] + input_left_pads_[i] + input_right_pads_[i] - idx_eff) / + conv_filter_strides_[i] + 1; } return out_spatial_len; @@ -116,40 +117,40 @@ ConvParams parse_conv_params(int num_dim_spatial, int arg_idx, char* const argv[ { ck::utils::conv::ConvParams params; - params.num_dim_spatial = num_dim_spatial; - params.N = std::stoi(argv[arg_idx++]); - params.K = std::stoi(argv[arg_idx++]); - params.C = std::stoi(argv[arg_idx++]); + params.num_dim_spatial_ = num_dim_spatial; + params.N_ = std::stoi(argv[arg_idx++]); + params.K_ = std::stoi(argv[arg_idx++]); + params.C_ = std::stoi(argv[arg_idx++]); - params.filter_spatial_lengths.resize(num_dim_spatial); + params.filter_spatial_lengths_.resize(num_dim_spatial); for(int i = 0; i < num_dim_spatial; ++i) { - params.filter_spatial_lengths[i] = std::stoi(argv[arg_idx++]); + params.filter_spatial_lengths_[i] = std::stoi(argv[arg_idx++]); } - params.input_spatial_lengths.resize(num_dim_spatial); + params.input_spatial_lengths_.resize(num_dim_spatial); for(int i = 0; i < num_dim_spatial; ++i) { - params.input_spatial_lengths[i] = std::stoi(argv[arg_idx++]); + params.input_spatial_lengths_[i] = std::stoi(argv[arg_idx++]); } - params.conv_filter_strides.resize(num_dim_spatial); + params.conv_filter_strides_.resize(num_dim_spatial); for(int i = 0; i < num_dim_spatial; ++i) { - params.conv_filter_strides[i] = std::stoi(argv[arg_idx++]); + params.conv_filter_strides_[i] = std::stoi(argv[arg_idx++]); } - params.conv_filter_dilations.resize(num_dim_spatial); + params.conv_filter_dilations_.resize(num_dim_spatial); for(int i = 0; i < num_dim_spatial; ++i) { - params.conv_filter_dilations[i] = std::stoi(argv[arg_idx++]); + params.conv_filter_dilations_[i] = std::stoi(argv[arg_idx++]); } - params.input_left_pads.resize(num_dim_spatial); + params.input_left_pads_.resize(num_dim_spatial); for(int i = 0; i < num_dim_spatial; ++i) { - params.input_left_pads[i] = std::stoi(argv[arg_idx++]); + params.input_left_pads_[i] = std::stoi(argv[arg_idx++]); } - params.input_right_pads.resize(num_dim_spatial); + params.input_right_pads_.resize(num_dim_spatial); for(int i = 0; i < num_dim_spatial; ++i) { - params.input_right_pads[i] = std::stoi(argv[arg_idx++]); + params.input_right_pads_[i] = std::stoi(argv[arg_idx++]); } return params; @@ -228,12 +229,12 @@ HostTensorDescriptor get_input_host_tensor_descriptor(const std::vectorGetTypeString(); - float ave_time = invoker_ptr->Run(argument_ptr.get(), nrepeat); + float ave_time = + invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, time_kernel}); std::size_t flop = std::size_t(2) * BatchCount * M * N * K; - std::size_t num_btype = (sizeof(ADataType) * M * K + sizeof(BDataType) * K * M + + std::size_t num_btype = (sizeof(ADataType) * M * K + sizeof(BDataType) * K * N + sizeof(CDataType) * M * N) * BatchCount; diff --git a/profiler/include/profile_batched_gemm_reduce_impl.hpp b/profiler/include/profile_batched_gemm_reduce_impl.hpp index a6399c20d8..bd74dbf459 100644 --- a/profiler/include/profile_batched_gemm_reduce_impl.hpp +++ b/profiler/include/profile_batched_gemm_reduce_impl.hpp @@ -53,7 +53,7 @@ template IsSupportedArgument(argument_ptr.get())) { - // warm up - invoker_ptr->Run(argument_ptr.get()); + // init DO, D1 to 0 + d0_device_buf.SetZero(); + d1_device_buf.SetZero(); - // timing - float total_time = 0; - - for(int i = 0; i < nrepeat; ++i) - { - // init DO, D1 to 0 - d0_device_buf.SetZero(); - d1_device_buf.SetZero(); - - KernelTimer timer; - - timer.Start(); - - invoker_ptr->Run(argument_ptr.get()); - - timer.End(); - - total_time += timer.GetElapsedTime(); - } - - float ave_time = total_time / nrepeat; + float ave_time = + invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, time_kernel}); std::string gemm_name = gemm_ptr->GetTypeString(); diff --git a/profiler/include/profile_conv_bwd_data_impl.hpp b/profiler/include/profile_conv_bwd_data_impl.hpp index bec97e40f5..dfec033737 100644 --- a/profiler/include/profile_conv_bwd_data_impl.hpp +++ b/profiler/include/profile_conv_bwd_data_impl.hpp @@ -51,7 +51,7 @@ template GetTypeString(); - float ave_time = invoker_ptr->Run(argument_ptr.get(), nrepeat); + float ave_time = + invoker_ptr->Run(argument_ptr.get(), StreamControl{nullptr, time_kernel}); std::size_t flop = std::size_t(2) * N * K * Ho * Wo * C * Y * X; diff --git a/profiler/include/profile_conv_bwd_weight_impl.hpp b/profiler/include/profile_conv_bwd_weight_impl.hpp index 20fe0ef549..8e3a4074b0 100644 --- a/profiler/include/profile_conv_bwd_weight_impl.hpp +++ b/profiler/include/profile_conv_bwd_weight_impl.hpp @@ -1,4 +1,6 @@ #pragma once + +#include "stream_config.hpp" #include "config.hpp" #include "device.hpp" #include "host_tensor.hpp" @@ -43,7 +45,7 @@ template MakeArgumentPointer( static_cast(in_device_buf.GetDeviceBuffer()), static_cast(wei_device_buf.GetDeviceBuffer()), @@ -214,7 +218,8 @@ bool profile_conv_bwd_weight_impl(int do_verification, { std::string conv_name = conv_ptr->GetTypeString(); - float ave_time = invoker_ptr->Run(argument_ptr.get(), nrepeat); + float ave_time = + invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, time_kernel}); std::size_t flop = std::size_t(2) * N * K * Ho * Wo * C * Y * X; @@ -242,6 +247,7 @@ bool profile_conv_bwd_weight_impl(int do_verification, wei_device_buf.FromDevice(wei_k_c_y_x_device_result.mData.data()); float max_error = check_error(wei_k_c_y_x_host_result, wei_k_c_y_x_device_result); + if(max_error > 8) { pass = false; diff --git a/profiler/include/profile_conv_fwd_bias_relu_add_impl.hpp b/profiler/include/profile_conv_fwd_bias_relu_add_impl.hpp index d0de7307d2..5ea35cd72f 100644 --- a/profiler/include/profile_conv_fwd_bias_relu_add_impl.hpp +++ b/profiler/include/profile_conv_fwd_bias_relu_add_impl.hpp @@ -42,7 +42,7 @@ template GetTypeString(); - float ave_time = invoker_ptr->Run(argument_ptr.get(), nrepeat); + float ave_time = + invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, time_kernel}); std::size_t flop = std::size_t(2) * N * K * Ho * Wo * C * Y * X; diff --git a/profiler/include/profile_conv_fwd_bias_relu_atomic_add_impl.hpp b/profiler/include/profile_conv_fwd_bias_relu_atomic_add_impl.hpp index 9bdfa61283..f1c2fd300a 100644 --- a/profiler/include/profile_conv_fwd_bias_relu_atomic_add_impl.hpp +++ b/profiler/include/profile_conv_fwd_bias_relu_atomic_add_impl.hpp @@ -119,7 +119,7 @@ template GetTypeString(); - float ave_time = invoker_ptr->Run(argument_ptr.get(), nrepeat); + float ave_time = + invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, time_kernel}); std::size_t flop = std::size_t(2) * N * K * Ho * Wo * C * Y * X; diff --git a/profiler/include/profile_conv_fwd_bias_relu_impl.hpp b/profiler/include/profile_conv_fwd_bias_relu_impl.hpp index f34e52048e..eeb2b93e4e 100644 --- a/profiler/include/profile_conv_fwd_bias_relu_impl.hpp +++ b/profiler/include/profile_conv_fwd_bias_relu_impl.hpp @@ -41,7 +41,7 @@ template GetTypeString(); - float ave_time = invoker_ptr->Run(argument_ptr.get(), nrepeat); + float ave_time = + invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, time_kernel}); std::size_t flop = std::size_t(2) * N * K * Ho * Wo * C * Y * X; diff --git a/profiler/include/profile_convnd_bwd_data_impl.hpp b/profiler/include/profile_convnd_bwd_data_impl.hpp index c9051f006f..291bf2abc0 100644 --- a/profiler/include/profile_convnd_bwd_data_impl.hpp +++ b/profiler/include/profile_convnd_bwd_data_impl.hpp @@ -1,7 +1,7 @@ #pragma once #include "config.hpp" #include "device.hpp" -#include "conv_fwd_util.hpp" +#include "conv_util.hpp" #include "host_tensor.hpp" #include "host_tensor_generator.hpp" #include "tensor_layout.hpp" @@ -269,7 +269,7 @@ template GetTypeString(); - float ave_time = invoker_ptr->Run(argument_ptr.get(), nrepeat); + float ave_time = + invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, time_kernel}); std::size_t flop = ck::utils::conv::get_flops(N, C, K, filter_spatial_lengths, output_spatial_lengths); diff --git a/profiler/include/profile_gemm_bias_2d_impl.hpp b/profiler/include/profile_gemm_bias_2d_impl.hpp index 98e4ad76c9..8565f9637c 100644 --- a/profiler/include/profile_gemm_bias_2d_impl.hpp +++ b/profiler/include/profile_gemm_bias_2d_impl.hpp @@ -65,7 +65,7 @@ template GetTypeString(); - float ave_time = invoker_ptr->Run(argument_ptr.get(), nrepeat); + float ave_time = + invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, time_kernel}); std::size_t flop = std::size_t(2) * M * N * K; diff --git a/profiler/include/profile_gemm_bias_relu_add_impl.hpp b/profiler/include/profile_gemm_bias_relu_add_impl.hpp index 75ed78075b..6fec17c199 100644 --- a/profiler/include/profile_gemm_bias_relu_add_impl.hpp +++ b/profiler/include/profile_gemm_bias_relu_add_impl.hpp @@ -48,7 +48,7 @@ template GetTypeString(); - float ave_time = invoker_ptr->Run(argument_ptr.get(), nrepeat); + float ave_time = + invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, time_kernel}); std::size_t flop = std::size_t(2) * M * N * K; diff --git a/profiler/include/profile_gemm_bias_relu_impl.hpp b/profiler/include/profile_gemm_bias_relu_impl.hpp index 0735f3c31b..69010becc5 100644 --- a/profiler/include/profile_gemm_bias_relu_impl.hpp +++ b/profiler/include/profile_gemm_bias_relu_impl.hpp @@ -48,7 +48,7 @@ template GetTypeString(); - float ave_time = invoker_ptr->Run(argument_ptr.get(), nrepeat); + float ave_time = + invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, time_kernel}); std::size_t flop = std::size_t(2) * M * N * K; diff --git a/profiler/include/profile_gemm_impl.hpp b/profiler/include/profile_gemm_impl.hpp index 93262fe802..45e6174260 100644 --- a/profiler/include/profile_gemm_impl.hpp +++ b/profiler/include/profile_gemm_impl.hpp @@ -91,7 +91,7 @@ template GetTypeString(); - float ave_time = invoker_ptr->Run(argument_ptr.get(), nrepeat); + float ave_time = + invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, time_kernel}); std::size_t flop = std::size_t(2) * M * N * K; diff --git a/profiler/include/profile_gemm_reduce_impl.hpp b/profiler/include/profile_gemm_reduce_impl.hpp index 6ef3e010b1..d034c9f750 100644 --- a/profiler/include/profile_gemm_reduce_impl.hpp +++ b/profiler/include/profile_gemm_reduce_impl.hpp @@ -52,7 +52,7 @@ template IsSupportedArgument(argument_ptr.get())) { - // warm up - invoker_ptr->Run(argument_ptr.get()); + // init DO, D1 to 0 + d0_device_buf.SetZero(); + d1_device_buf.SetZero(); - // timing - float total_time = 0; - - for(int i = 0; i < nrepeat; ++i) - { - // init DO, D1 to 0 - d0_device_buf.SetZero(); - d1_device_buf.SetZero(); - - KernelTimer timer; - - timer.Start(); - - invoker_ptr->Run(argument_ptr.get()); - - timer.End(); - - total_time += timer.GetElapsedTime(); - } - - float ave_time = total_time / nrepeat; + float ave_time = + invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, time_kernel}); std::string gemm_name = gemm_ptr->GetTypeString(); std::size_t flop = std::size_t(2) * M * N * K; - std::size_t num_btype = sizeof(ADataType) * M * K + sizeof(BDataType) * K * M + + std::size_t num_btype = sizeof(ADataType) * M * K + sizeof(BDataType) * K * N + sizeof(CDataType) * M * N + sizeof(CDataType) * N; float tflops = static_cast(flop) / 1.E9 / ave_time; diff --git a/profiler/include/profile_grouped_gemm_impl.hpp b/profiler/include/profile_grouped_gemm_impl.hpp index ae70f551f1..96d34c7e42 100644 --- a/profiler/include/profile_grouped_gemm_impl.hpp +++ b/profiler/include/profile_grouped_gemm_impl.hpp @@ -49,7 +49,7 @@ template & Ms, const std::vector& Ns, const std::vector& Ks, @@ -231,7 +231,8 @@ void profile_grouped_gemm_impl(int do_verification, { std::string gemm_name = gemm_ptr->GetTypeString(); - float ave_time = invoker_ptr->Run(argument_ptr.get(), nrepeat); + float ave_time = + invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, time_kernel}); std::size_t flop = 0, num_btype = 0; for(std::size_t i = 0; i < gemm_shapes.size(); i++) diff --git a/profiler/include/profile_reduce_impl.hpp b/profiler/include/profile_reduce_impl.hpp index 678134f60b..33c7929ddd 100644 --- a/profiler/include/profile_reduce_impl.hpp +++ b/profiler/include/profile_reduce_impl.hpp @@ -157,7 +157,7 @@ void profile_reduce_impl_impl(bool do_verification, int init_method, bool do_log, bool do_dumpout, - int nrepeat, + bool time_kernel, const std::vector& inLengths, const std::vector& reduceDims, float alpha, @@ -430,7 +430,8 @@ void profile_reduce_impl_impl(bool do_verification, auto invoker_ptr = reduce_ptr->MakeInvokerPointer(); - float avg_time = invoker_ptr->Run(argument_ptr.get(), nrepeat); + float avg_time = + invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, time_kernel}); std::size_t num_bytes = invariant_total_length * reduce_total_length * sizeof(InDataType) + @@ -516,7 +517,8 @@ void profile_reduce_impl_impl(bool do_verification, auto invoker_ptr = reduce_ptr->MakeInvokerPointer(); - float avg_time = invoker_ptr->Run(argument_ptr.get(), nrepeat); + float avg_time = + invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, time_kernel}); std::size_t num_bytes = invariant_total_length * reduce_total_length * sizeof(InDataType) + @@ -554,7 +556,8 @@ void profile_reduce_impl_impl(bool do_verification, auto invoker2_ptr = reduce2_ptr->MakeInvokerPointer(); - float avg_time_2 = invoker2_ptr->Run(argument2_ptr.get(), nrepeat); + float avg_time_2 = + invoker2_ptr->Run(argument2_ptr.get(), StreamConfig{nullptr, time_kernel}); std::size_t num_bytes_2 = static_cast(inLengths2[0]) * inLengths2[1] * sizeof(AccDataType); @@ -625,7 +628,7 @@ void profile_reduce_impl(bool do_verification, int init_method, bool do_log, bool do_dumpout, - int nrepeat, + bool time_kernel, const std::vector& inLengths, const std::vector& reduceDims, ReduceTensorOp ReduceOpId, @@ -663,7 +666,7 @@ void profile_reduce_impl(bool do_verification, init_method, do_log, do_dumpout, - nrepeat, + time_kernel, inLengths, reduceDims, alpha, diff --git a/profiler/src/profile_batched_gemm.cpp b/profiler/src/profile_batched_gemm.cpp index 2a806b0818..db5486e0ac 100644 --- a/profiler/src/profile_batched_gemm.cpp +++ b/profiler/src/profile_batched_gemm.cpp @@ -48,8 +48,8 @@ int profile_batched_gemm(int argc, char* argv[]) printf(" 3: A[g, k, m] * B[g, n, k] = C[g, m, n])\n"); printf("arg4: verification (0: no; 1: yes)\n"); printf("arg5: initialization (0: no init; 1: integer value; 2: decimal value)\n"); - printf("arg8: print tensor value (0: no; 1: yes)\n"); - printf("arg7: run kernel # of times (>1)\n"); + printf("arg6: print tensor value (0: no; 1: yes)\n"); + printf("arg7: time kernel (0=n0, 1=yes)\n"); printf("arg8 to 14: M, N, K, StrideA, StrideB, StrideC, BatchCount\n"); exit(1); } @@ -59,7 +59,7 @@ int profile_batched_gemm(int argc, char* argv[]) const bool do_verification = std::stoi(argv[4]); const int init_method = std::stoi(argv[5]); const bool do_log = std::stoi(argv[6]); - const int nrepeat = std::stoi(argv[7]); + const bool time_kernel = std::stoi(argv[7]); const int M = std::stoi(argv[8]); const int N = std::stoi(argv[9]); @@ -82,7 +82,7 @@ int profile_batched_gemm(int argc, char* argv[]) do_verification, init_method, do_log, - nrepeat, + time_kernel, M, N, K, @@ -102,7 +102,7 @@ int profile_batched_gemm(int argc, char* argv[]) do_verification, init_method, do_log, - nrepeat, + time_kernel, M, N, K, @@ -122,7 +122,7 @@ int profile_batched_gemm(int argc, char* argv[]) do_verification, init_method, do_log, - nrepeat, + time_kernel, M, N, K, @@ -142,7 +142,7 @@ int profile_batched_gemm(int argc, char* argv[]) do_verification, init_method, do_log, - nrepeat, + time_kernel, M, N, K, @@ -162,7 +162,7 @@ int profile_batched_gemm(int argc, char* argv[]) do_verification, init_method, do_log, - nrepeat, + time_kernel, M, N, K, @@ -182,7 +182,7 @@ int profile_batched_gemm(int argc, char* argv[]) do_verification, init_method, do_log, - nrepeat, + time_kernel, M, N, K, @@ -202,7 +202,7 @@ int profile_batched_gemm(int argc, char* argv[]) do_verification, init_method, do_log, - nrepeat, + time_kernel, M, N, K, @@ -222,7 +222,7 @@ int profile_batched_gemm(int argc, char* argv[]) do_verification, init_method, do_log, - nrepeat, + time_kernel, M, N, K, @@ -242,7 +242,7 @@ int profile_batched_gemm(int argc, char* argv[]) do_verification, init_method, do_log, - nrepeat, + time_kernel, M, N, K, @@ -262,7 +262,7 @@ int profile_batched_gemm(int argc, char* argv[]) do_verification, init_method, do_log, - nrepeat, + time_kernel, M, N, K, @@ -282,7 +282,7 @@ int profile_batched_gemm(int argc, char* argv[]) do_verification, init_method, do_log, - nrepeat, + time_kernel, M, N, K, @@ -302,7 +302,7 @@ int profile_batched_gemm(int argc, char* argv[]) do_verification, init_method, do_log, - nrepeat, + time_kernel, M, N, K, @@ -322,7 +322,7 @@ int profile_batched_gemm(int argc, char* argv[]) do_verification, init_method, do_log, - nrepeat, + time_kernel, M, N, K, @@ -342,7 +342,7 @@ int profile_batched_gemm(int argc, char* argv[]) do_verification, init_method, do_log, - nrepeat, + time_kernel, M, N, K, @@ -362,7 +362,7 @@ int profile_batched_gemm(int argc, char* argv[]) do_verification, init_method, do_log, - nrepeat, + time_kernel, M, N, K, @@ -382,7 +382,7 @@ int profile_batched_gemm(int argc, char* argv[]) do_verification, init_method, do_log, - nrepeat, + time_kernel, M, N, K, diff --git a/profiler/src/profile_batched_gemm_reduce.cpp b/profiler/src/profile_batched_gemm_reduce.cpp index 38c3f52193..f67e561865 100644 --- a/profiler/src/profile_batched_gemm_reduce.cpp +++ b/profiler/src/profile_batched_gemm_reduce.cpp @@ -33,8 +33,8 @@ int profile_batched_gemm_reduce(int argc, char* argv[]) printf(" 3: A[k, m] * B[n, k] = C[m, n])\n"); printf("arg4: verification (0: no; 1: yes)\n"); printf("arg5: initialization (0: no init; 1: integer value; 2: decimal value)\n"); - printf("arg8: print tensor value (0: no; 1: yes)\n"); - printf("arg7: run kernel # of times (>1)\n"); + printf("arg6: print tensor value (0: no; 1: yes)\n"); + printf("arg7: time kernel (0=n0, 1=yes)\n"); printf("arg8 to 14: M, N, K, StrideA, StrideB, StrideC, BatchCount\n"); printf("arg15: split k into mulitiple batch\n"); exit(1); @@ -45,7 +45,7 @@ int profile_batched_gemm_reduce(int argc, char* argv[]) const bool do_verification = std::stoi(argv[4]); const int init_method = std::stoi(argv[5]); const bool do_log = std::stoi(argv[6]); - const int nrepeat = std::stoi(argv[7]); + const bool time_kernel = std::stoi(argv[7]); const int M = std::stoi(argv[8]); const int N = std::stoi(argv[9]); @@ -69,7 +69,7 @@ int profile_batched_gemm_reduce(int argc, char* argv[]) do_verification, init_method, do_log, - nrepeat, + time_kernel, M, N, K, @@ -91,7 +91,7 @@ int profile_batched_gemm_reduce(int argc, char* argv[]) do_verification, init_method, do_log, - nrepeat, + time_kernel, M, N, K, @@ -113,7 +113,7 @@ int profile_batched_gemm_reduce(int argc, char* argv[]) do_verification, init_method, do_log, - nrepeat, + time_kernel, M, N, K, @@ -135,7 +135,7 @@ int profile_batched_gemm_reduce(int argc, char* argv[]) do_verification, init_method, do_log, - nrepeat, + time_kernel, M, N, K, diff --git a/profiler/src/profile_conv_bwd_data.cpp b/profiler/src/profile_conv_bwd_data.cpp index 2861af3d10..206d486ea0 100644 --- a/profiler/src/profile_conv_bwd_data.cpp +++ b/profiler/src/profile_conv_bwd_data.cpp @@ -44,7 +44,7 @@ int profile_conv_bwd_data(int argc, char* argv[]) printf("arg6: verification (0: no; 1: yes)\n"); printf("arg7: initialization (0: no init; 1: integer value; 2: decimal value)\n"); printf("arg8: print tensor value (0: no; 1: yes)\n"); - printf("arg9: run kernel # of times (>1)\n"); + printf("arg9: time kernel (0=n0, 1=yes)\n"); printf("arg10 to 24: N, K, C, Y, X, Hi, Wi, Sy, Sx, Dy, Dx, LeftPy, LeftPx, RightPy, " "RightPx\n"); exit(1); @@ -57,7 +57,7 @@ int profile_conv_bwd_data(int argc, char* argv[]) const bool do_verification = std::stoi(argv[6]); const int init_method = std::stoi(argv[7]); const bool do_log = std::stoi(argv[8]); - const int nrepeat = std::stoi(argv[9]); + const bool time_kernel = std::stoi(argv[9]); const ck::index_t N = std::stoi(argv[10]); const ck::index_t K = std::stoi(argv[11]); @@ -96,7 +96,7 @@ int profile_conv_bwd_data(int argc, char* argv[]) do_verification, init_method, do_log, - nrepeat, + StreamControl{nullptr, time_kernel}, N, K, C, @@ -122,7 +122,7 @@ int profile_conv_bwd_data(int argc, char* argv[]) do_verification, init_method, do_log, - nrepeat, + StreamControl{nullptr, time_kernel}, N, K, C, @@ -148,7 +148,7 @@ int profile_conv_bwd_data(int argc, char* argv[]) do_verification, init_method, do_log, - nrepeat, + StreamControl{nullptr, time_kernel}, N, K, C, @@ -174,7 +174,7 @@ int profile_conv_bwd_data(int argc, char* argv[]) do_verification, init_method, do_log, - nrepeat, + StreamControl{nullptr, time_kernel}, N, K, C, diff --git a/profiler/src/profile_conv_bwd_weight.cpp b/profiler/src/profile_conv_bwd_weight.cpp index 309cc8ea2c..c022d19ee0 100644 --- a/profiler/src/profile_conv_bwd_weight.cpp +++ b/profiler/src/profile_conv_bwd_weight.cpp @@ -58,7 +58,7 @@ int profile_conv_bwd_weight(int argc, char* argv[]) const bool do_verification = std::stoi(argv[6]); const int init_method = std::stoi(argv[7]); const bool do_log = std::stoi(argv[8]); - const int nrepeat = std::stoi(argv[9]); + const bool time_kernel = std::stoi(argv[9]); const ck::index_t N = std::stoi(argv[10]); const ck::index_t K = std::stoi(argv[11]); @@ -98,7 +98,7 @@ int profile_conv_bwd_weight(int argc, char* argv[]) do_verification, init_method, do_log, - nrepeat, + time_kernel, N, K, C, @@ -124,7 +124,7 @@ int profile_conv_bwd_weight(int argc, char* argv[]) do_verification, init_method, do_log, - nrepeat, + time_kernel, N, K, C, diff --git a/profiler/src/profile_conv_fwd_bias_relu.cpp b/profiler/src/profile_conv_fwd_bias_relu.cpp index 1c447b483e..28aa49687f 100644 --- a/profiler/src/profile_conv_fwd_bias_relu.cpp +++ b/profiler/src/profile_conv_fwd_bias_relu.cpp @@ -42,7 +42,7 @@ int profile_conv_fwd_bias_relu(int argc, char* argv[]) printf("arg6: verification (0: no; 1: yes)\n"); printf("arg7: initialization (0: no init; 1: integer value; 2: decimal value)\n"); printf("arg8: print tensor value (0: no; 1: yes)\n"); - printf("arg9: run kernel # of times (>1)\n"); + printf("arg9: time kernel (0=n0, 1=yes)\n"); printf("arg10 to 24: N, K, C, Y, X, Hi, Wi, Sy, Sx, Dy, Dx, LeftPy, LeftPx, RightPy, " "RightPx\n"); exit(1); @@ -55,7 +55,7 @@ int profile_conv_fwd_bias_relu(int argc, char* argv[]) const bool do_verification = std::stoi(argv[6]); const int init_method = std::stoi(argv[7]); const bool do_log = std::stoi(argv[8]); - const int nrepeat = std::stoi(argv[9]); + const bool time_kernel = std::stoi(argv[9]); const ck::index_t N = std::stoi(argv[10]); const ck::index_t K = std::stoi(argv[11]); @@ -93,7 +93,7 @@ int profile_conv_fwd_bias_relu(int argc, char* argv[]) do_verification, init_method, do_log, - nrepeat, + time_kernel, N, K, C, diff --git a/profiler/src/profile_conv_fwd_bias_relu_add.cpp b/profiler/src/profile_conv_fwd_bias_relu_add.cpp index 522487c77b..7e033a51e2 100644 --- a/profiler/src/profile_conv_fwd_bias_relu_add.cpp +++ b/profiler/src/profile_conv_fwd_bias_relu_add.cpp @@ -43,7 +43,7 @@ int profile_conv_fwd_bias_relu_add(int argc, char* argv[]) printf("arg6: verification (0: no; 1: yes)\n"); printf("arg7: initialization (0: no init; 1: integer value; 2: decimal value)\n"); printf("arg8: print tensor value (0: no; 1: yes)\n"); - printf("arg9: run kernel # of times (>1)\n"); + printf("arg9: time kernel (0=n0, 1=yes)\n"); printf("arg10 to 24: N, K, C, Y, X, Hi, Wi, Sy, Sx, Dy, Dx, LeftPy, LeftPx, RightPy, " "RightPx\n"); exit(1); @@ -56,7 +56,7 @@ int profile_conv_fwd_bias_relu_add(int argc, char* argv[]) const bool do_verification = std::stoi(argv[6]); const int init_method = std::stoi(argv[7]); const bool do_log = std::stoi(argv[8]); - const int nrepeat = std::stoi(argv[9]); + const bool time_kernel = std::stoi(argv[9]); const ck::index_t N = std::stoi(argv[10]); const ck::index_t K = std::stoi(argv[11]); @@ -94,7 +94,7 @@ int profile_conv_fwd_bias_relu_add(int argc, char* argv[]) do_verification, init_method, do_log, - nrepeat, + time_kernel, N, K, C, diff --git a/profiler/src/profile_conv_fwd_bias_relu_atomic_add.cpp b/profiler/src/profile_conv_fwd_bias_relu_atomic_add.cpp index 833f2851db..095536f701 100644 --- a/profiler/src/profile_conv_fwd_bias_relu_atomic_add.cpp +++ b/profiler/src/profile_conv_fwd_bias_relu_atomic_add.cpp @@ -43,7 +43,7 @@ int profile_conv_fwd_bias_relu_atomic_add(int argc, char* argv[]) printf("arg6: verification (0: no; 1: yes)\n"); printf("arg7: initialization (0: no init; 1: integer value; 2: decimal value)\n"); printf("arg8: print tensor value (0: no; 1: yes)\n"); - printf("arg9: run kernel # of times (>1)\n"); + printf("arg9: time kernel (0=n0, 1=yes)\n"); printf("arg10 to 24: N, K, C, Y, X, Hi, Wi, Sy, Sx, Dy, Dx, LeftPy, LeftPx, RightPy, " "RightPx\n"); exit(1); @@ -56,7 +56,7 @@ int profile_conv_fwd_bias_relu_atomic_add(int argc, char* argv[]) const bool do_verification = std::stoi(argv[6]); const int init_method = std::stoi(argv[7]); const bool do_log = std::stoi(argv[8]); - const int nrepeat = std::stoi(argv[9]); + const bool time_kernel = std::stoi(argv[9]); const ck::index_t N = std::stoi(argv[10]); const ck::index_t K = std::stoi(argv[11]); @@ -95,7 +95,7 @@ int profile_conv_fwd_bias_relu_atomic_add(int argc, char* argv[]) do_verification, init_method, do_log, - nrepeat, + time_kernel, N, K, C, diff --git a/profiler/src/profile_convnd_bwd_data.cpp b/profiler/src/profile_convnd_bwd_data.cpp index 893fb8c791..5d0e6a34c7 100644 --- a/profiler/src/profile_convnd_bwd_data.cpp +++ b/profiler/src/profile_convnd_bwd_data.cpp @@ -39,40 +39,40 @@ ck::utils::conv::ConvParams parse_conv_params(int num_dim_spatial, char* argv[], // (N, K, C) + num_dim_spatial * 6 (filter, input, strides, dilations, pad left, pad right) ck::utils::conv::ConvParams params; - params.num_dim_spatial = num_dim_spatial; - params.N = std::stoi(argv[arg_idx++]); - params.K = std::stoi(argv[arg_idx++]); - params.C = std::stoi(argv[arg_idx++]); + params.num_dim_spatial_ = num_dim_spatial; + params.N_ = std::stoi(argv[arg_idx++]); + params.K_ = std::stoi(argv[arg_idx++]); + params.C_ = std::stoi(argv[arg_idx++]); - params.filter_spatial_lengths.resize(num_dim_spatial); + params.filter_spatial_lengths_.resize(num_dim_spatial); for(int i = 0; i < num_dim_spatial; ++i) { - params.filter_spatial_lengths[i] = std::stoi(argv[arg_idx++]); + params.filter_spatial_lengths_[i] = std::stoi(argv[arg_idx++]); } - params.input_spatial_lengths.resize(num_dim_spatial); + params.input_spatial_lengths_.resize(num_dim_spatial); for(int i = 0; i < num_dim_spatial; ++i) { - params.input_spatial_lengths[i] = std::stoi(argv[arg_idx++]); + params.input_spatial_lengths_[i] = std::stoi(argv[arg_idx++]); } - params.conv_filter_strides.resize(num_dim_spatial); + params.conv_filter_strides_.resize(num_dim_spatial); for(int i = 0; i < num_dim_spatial; ++i) { - params.conv_filter_strides[i] = std::stoi(argv[arg_idx++]); + params.conv_filter_strides_[i] = std::stoi(argv[arg_idx++]); } - params.conv_filter_dilations.resize(num_dim_spatial); + params.conv_filter_dilations_.resize(num_dim_spatial); for(int i = 0; i < num_dim_spatial; ++i) { - params.conv_filter_dilations[i] = std::stoi(argv[arg_idx++]); + params.conv_filter_dilations_[i] = std::stoi(argv[arg_idx++]); } - params.input_left_pads.resize(num_dim_spatial); + params.input_left_pads_.resize(num_dim_spatial); for(int i = 0; i < num_dim_spatial; ++i) { - params.input_left_pads[i] = std::stoi(argv[arg_idx++]); + params.input_left_pads_[i] = std::stoi(argv[arg_idx++]); } - params.input_right_pads.resize(num_dim_spatial); + params.input_right_pads_.resize(num_dim_spatial); for(int i = 0; i < num_dim_spatial; ++i) { - params.input_right_pads[i] = std::stoi(argv[arg_idx++]); + params.input_right_pads_[i] = std::stoi(argv[arg_idx++]); } return params; @@ -95,7 +95,7 @@ int profile_convnd_bwd_data(int argc, char* argv[], int num_dim_spatial) printf("arg6: verification (0: no; 1: yes)\n"); printf("arg7: initialization (0: no init; 1: integer value; 2: decimal value)\n"); printf("arg8: print tensor value (0: no; 1: yes)\n"); - printf("arg9: run kernel # of times (>1)\n"); + printf("arg9: time kernel (0=n0, 1=yes)\n"); printf("arg10 to 24: N, K, C, Y, X, Hi, Wi, Sy, Sx, Dy, Dx, LeftPy, LeftPx, RightPy, " "RightPx\n"); return 1; @@ -108,7 +108,7 @@ int profile_convnd_bwd_data(int argc, char* argv[], int num_dim_spatial) const bool do_verification = std::stoi(argv[6]); const int init_method = std::stoi(argv[7]); const bool do_log = std::stoi(argv[8]); - const int nrepeat = std::stoi(argv[9]); + const bool time_kernel = std::stoi(argv[9]); ck::utils::conv::ConvParams params = parse_conv_params(num_dim_spatial, argv, preParams); @@ -132,17 +132,17 @@ int profile_convnd_bwd_data(int argc, char* argv[], int num_dim_spatial) do_verification, init_method, do_log, - nrepeat, - params.N, - params.K, - params.C, - params.input_spatial_lengths, - params.filter_spatial_lengths, + time_kernel, + params.N_, + params.K_, + params.C_, + params.input_spatial_lengths_, + params.filter_spatial_lengths_, params.GetOutputSpatialLengths(), - params.conv_filter_strides, - params.conv_filter_dilations, - params.input_left_pads, - params.input_right_pads); + params.conv_filter_strides_, + params.conv_filter_dilations_, + params.input_left_pads_, + params.input_right_pads_); break; case 2: @@ -157,17 +157,17 @@ int profile_convnd_bwd_data(int argc, char* argv[], int num_dim_spatial) do_verification, init_method, do_log, - nrepeat, - params.N, - params.K, - params.C, - params.input_spatial_lengths, - params.filter_spatial_lengths, + time_kernel, + params.N_, + params.K_, + params.C_, + params.input_spatial_lengths_, + params.filter_spatial_lengths_, params.GetOutputSpatialLengths(), - params.conv_filter_strides, - params.conv_filter_dilations, - params.input_left_pads, - params.input_right_pads); + params.conv_filter_strides_, + params.conv_filter_dilations_, + params.input_left_pads_, + params.input_right_pads_); break; case 3: @@ -182,17 +182,17 @@ int profile_convnd_bwd_data(int argc, char* argv[], int num_dim_spatial) do_verification, init_method, do_log, - nrepeat, - params.N, - params.K, - params.C, - params.input_spatial_lengths, - params.filter_spatial_lengths, + time_kernel, + params.N_, + params.K_, + params.C_, + params.input_spatial_lengths_, + params.filter_spatial_lengths_, params.GetOutputSpatialLengths(), - params.conv_filter_strides, - params.conv_filter_dilations, - params.input_left_pads, - params.input_right_pads); + params.conv_filter_strides_, + params.conv_filter_dilations_, + params.input_left_pads_, + params.input_right_pads_); break; default: break; diff --git a/profiler/src/profile_convnd_fwd.cpp b/profiler/src/profile_convnd_fwd.cpp index 1abd73c729..722e86c2ea 100644 --- a/profiler/src/profile_convnd_fwd.cpp +++ b/profiler/src/profile_convnd_fwd.cpp @@ -5,7 +5,7 @@ #include #include -#include "conv_fwd_util.hpp" +#include "conv_util.hpp" #include "element_wise_operation.hpp" #include "fill.hpp" #include "profile_convnd_fwd.hpp" @@ -119,7 +119,7 @@ template ::template Get(), - nrepeat, + time_kernel, do_verification, do_log); @@ -201,7 +201,7 @@ void profile_convnd_instances(ConvDataType data_type, const ck::utils::conv::ConvParams& params, bool do_verification, bool do_log, - int nrepeat, + bool time_kernel, int init_method) { switch(data_layout) @@ -214,7 +214,7 @@ void profile_convnd_instances(ConvDataType data_type, params, do_verification, do_log, - nrepeat, + time_kernel, init_method, ConvolutionLayouts{}); break; @@ -223,7 +223,7 @@ void profile_convnd_instances(ConvDataType data_type, params, do_verification, do_log, - nrepeat, + time_kernel, init_method, ConvolutionLayouts{}); break; @@ -232,7 +232,7 @@ void profile_convnd_instances(ConvDataType data_type, params, do_verification, do_log, - nrepeat, + time_kernel, init_method, ConvolutionLayouts{}); break; @@ -241,7 +241,7 @@ void profile_convnd_instances(ConvDataType data_type, params, do_verification, do_log, - nrepeat, + time_kernel, init_method, ConvolutionLayouts{}); break; @@ -256,7 +256,7 @@ void profile_convnd_instances(ConvDataType data_type, params, do_verification, do_log, - nrepeat, + time_kernel, init_method, ConvolutionLayouts{}); break; @@ -265,7 +265,7 @@ void profile_convnd_instances(ConvDataType data_type, params, do_verification, do_log, - nrepeat, + time_kernel, init_method, ConvolutionLayouts{}); break; @@ -274,7 +274,7 @@ void profile_convnd_instances(ConvDataType data_type, params, do_verification, do_log, - nrepeat, + time_kernel, init_method, ConvolutionLayouts{}); break; @@ -283,7 +283,7 @@ void profile_convnd_instances(ConvDataType data_type, params, do_verification, do_log, - nrepeat, + time_kernel, init_method, ConvolutionLayouts{}); break; @@ -304,7 +304,7 @@ int ck::profiler::profile_convnd_fwd(int argc, char* argv[]) bool do_verification{true}; int init_method{2}; bool do_log{false}; - int nrepeat{100}; + bool time_kernel{false}; int num_dim_spatial{2}; ConvParams params; @@ -318,7 +318,7 @@ int ck::profiler::profile_convnd_fwd(int argc, char* argv[]) do_verification = std::stoi(argv[4]); init_method = std::stoi(argv[5]); do_log = std::stoi(argv[6]); - nrepeat = std::stoi(argv[7]); + time_kernel = std::stoi(argv[7]); num_dim_spatial = std::stoi(argv[8]); } if(argc >= 10) @@ -332,15 +332,15 @@ int ck::profiler::profile_convnd_fwd(int argc, char* argv[]) { case 1: profile_convnd_instances<1>( - data_type, data_layout, params, do_verification, do_log, nrepeat, init_method); + data_type, data_layout, params, do_verification, do_log, time_kernel, init_method); break; case 2: profile_convnd_instances<2>( - data_type, data_layout, params, do_verification, do_log, nrepeat, init_method); + data_type, data_layout, params, do_verification, do_log, time_kernel, init_method); break; case 3: profile_convnd_instances<3>( - data_type, data_layout, params, do_verification, do_log, nrepeat, init_method); + data_type, data_layout, params, do_verification, do_log, time_kernel, init_method); break; default: throw std::runtime_error("profile_conv_fwd: unsupported num_dim_spatial value: " + diff --git a/profiler/src/profile_gemm.cpp b/profiler/src/profile_gemm.cpp index 7a72be2d8e..4c6a3b0487 100644 --- a/profiler/src/profile_gemm.cpp +++ b/profiler/src/profile_gemm.cpp @@ -38,8 +38,8 @@ int profile_gemm(int argc, char* argv[]) printf(" 3: A[k, m] * B[n, k] = C[m, n])\n"); printf("arg4: verification (0: no; 1: yes)\n"); printf("arg5: initialization (0: no init; 1: integer value; 2: decimal value)\n"); - printf("arg8: print tensor value (0: no; 1: yes)\n"); - printf("arg7: run kernel # of times (>1)\n"); + printf("arg6: print tensor value (0: no; 1: yes)\n"); + printf("arg7: time kernel (0=n0, 1=yes)\n"); printf("arg8 to 13: M, N, K, StrideA, StrideB, StrideC\n"); printf("arg14: split k into mulitiple batch\n"); exit(1); @@ -50,7 +50,7 @@ int profile_gemm(int argc, char* argv[]) const bool do_verification = std::stoi(argv[4]); const int init_method = std::stoi(argv[5]); const bool do_log = std::stoi(argv[6]); - const int nrepeat = std::stoi(argv[7]); + const bool time_kernel = std::stoi(argv[7]); const int M = std::stoi(argv[8]); const int N = std::stoi(argv[9]); @@ -74,7 +74,7 @@ int profile_gemm(int argc, char* argv[]) do_verification, init_method, do_log, - nrepeat, + time_kernel, M, N, K, @@ -94,7 +94,7 @@ int profile_gemm(int argc, char* argv[]) do_verification, init_method, do_log, - nrepeat, + time_kernel, M, N, K, @@ -114,7 +114,7 @@ int profile_gemm(int argc, char* argv[]) do_verification, init_method, do_log, - nrepeat, + time_kernel, M, N, K, @@ -134,7 +134,7 @@ int profile_gemm(int argc, char* argv[]) do_verification, init_method, do_log, - nrepeat, + time_kernel, M, N, K, @@ -154,7 +154,7 @@ int profile_gemm(int argc, char* argv[]) do_verification, init_method, do_log, - nrepeat, + time_kernel, M, N, K, @@ -174,7 +174,7 @@ int profile_gemm(int argc, char* argv[]) do_verification, init_method, do_log, - nrepeat, + time_kernel, M, N, K, @@ -194,7 +194,7 @@ int profile_gemm(int argc, char* argv[]) do_verification, init_method, do_log, - nrepeat, + time_kernel, M, N, K, @@ -214,7 +214,7 @@ int profile_gemm(int argc, char* argv[]) do_verification, init_method, do_log, - nrepeat, + time_kernel, M, N, K, @@ -234,7 +234,7 @@ int profile_gemm(int argc, char* argv[]) do_verification, init_method, do_log, - nrepeat, + time_kernel, M, N, K, @@ -254,7 +254,7 @@ int profile_gemm(int argc, char* argv[]) do_verification, init_method, do_log, - nrepeat, + time_kernel, M, N, K, @@ -274,7 +274,7 @@ int profile_gemm(int argc, char* argv[]) do_verification, init_method, do_log, - nrepeat, + time_kernel, M, N, K, @@ -294,7 +294,7 @@ int profile_gemm(int argc, char* argv[]) do_verification, init_method, do_log, - nrepeat, + time_kernel, M, N, K, @@ -314,7 +314,7 @@ int profile_gemm(int argc, char* argv[]) do_verification, init_method, do_log, - nrepeat, + time_kernel, M, N, K, @@ -334,7 +334,7 @@ int profile_gemm(int argc, char* argv[]) do_verification, init_method, do_log, - nrepeat, + time_kernel, M, N, K, @@ -354,7 +354,7 @@ int profile_gemm(int argc, char* argv[]) do_verification, init_method, do_log, - nrepeat, + time_kernel, M, N, K, @@ -374,7 +374,7 @@ int profile_gemm(int argc, char* argv[]) do_verification, init_method, do_log, - nrepeat, + time_kernel, M, N, K, diff --git a/profiler/src/profile_gemm_bias_2d.cpp b/profiler/src/profile_gemm_bias_2d.cpp index dd7e418087..46d4f90c17 100644 --- a/profiler/src/profile_gemm_bias_2d.cpp +++ b/profiler/src/profile_gemm_bias_2d.cpp @@ -36,8 +36,8 @@ int profile_gemm_bias_2d(int argc, char* argv[]) printf(" 3: A[k, m] * B[n, k] = C[m, n])\n"); printf("arg4: verification (0: no; 1: yes)\n"); printf("arg5: initialization (0: no init; 1: integer value; 2: decimal value)\n"); - printf("arg8: print tensor value (0: no; 1: yes)\n"); - printf("arg7: run kernel # of times (>1)\n"); + printf("arg6: print tensor value (0: no; 1: yes)\n"); + printf("arg7: time kernel (0=n0, 1=yes)\n"); printf("arg8 to 13: M, N, K, StrideA, StrideB, StrideC\n"); printf("arg14: alpha\n"); printf("arg15: beta\n"); @@ -50,7 +50,7 @@ int profile_gemm_bias_2d(int argc, char* argv[]) const bool do_verification = std::stoi(argv[4]); const int init_method = std::stoi(argv[5]); const bool do_log = std::stoi(argv[6]); - const int nrepeat = std::stoi(argv[7]); + const bool time_kernel = std::stoi(argv[7]); const int M = std::stoi(argv[8]); const int N = std::stoi(argv[9]); @@ -76,7 +76,7 @@ int profile_gemm_bias_2d(int argc, char* argv[]) do_verification, init_method, do_log, - nrepeat, + time_kernel, M, N, K, @@ -99,7 +99,7 @@ int profile_gemm_bias_2d(int argc, char* argv[]) do_verification, init_method, do_log, - nrepeat, + time_kernel, M, N, K, @@ -122,7 +122,7 @@ int profile_gemm_bias_2d(int argc, char* argv[]) do_verification, init_method, do_log, - nrepeat, + time_kernel, M, N, K, @@ -145,7 +145,7 @@ int profile_gemm_bias_2d(int argc, char* argv[]) do_verification, init_method, do_log, - nrepeat, + time_kernel, M, N, K, @@ -168,7 +168,7 @@ int profile_gemm_bias_2d(int argc, char* argv[]) do_verification, init_method, do_log, - nrepeat, + time_kernel, M, N, K, @@ -191,7 +191,7 @@ int profile_gemm_bias_2d(int argc, char* argv[]) do_verification, init_method, do_log, - nrepeat, + time_kernel, M, N, K, @@ -214,7 +214,7 @@ int profile_gemm_bias_2d(int argc, char* argv[]) do_verification, init_method, do_log, - nrepeat, + time_kernel, M, N, K, @@ -237,7 +237,7 @@ int profile_gemm_bias_2d(int argc, char* argv[]) do_verification, init_method, do_log, - nrepeat, + time_kernel, M, N, K, diff --git a/profiler/src/profile_gemm_bias_relu.cpp b/profiler/src/profile_gemm_bias_relu.cpp index 67a47cf9ec..4346650c9f 100644 --- a/profiler/src/profile_gemm_bias_relu.cpp +++ b/profiler/src/profile_gemm_bias_relu.cpp @@ -36,8 +36,8 @@ int profile_gemm_bias_relu(int argc, char* argv[]) printf(" 3: A[k, m] * B[n, k] = C[m, n])\n"); printf("arg4: verification (0: no; 1: yes)\n"); printf("arg5: initialization (0: no init; 1: integer value; 2: decimal value)\n"); - printf("arg8: print tensor value (0: no; 1: yes)\n"); - printf("arg7: run kernel # of times (>1)\n"); + printf("arg6: print tensor value (0: no; 1: yes)\n"); + printf("arg7: time kernel (0=n0, 1=yes)\n"); printf("arg8 to 13: M, N, K, StrideA, StrideB, StrideC\n"); printf("arg14: split k into mulitiple batch\n"); exit(1); @@ -48,7 +48,7 @@ int profile_gemm_bias_relu(int argc, char* argv[]) const bool do_verification = std::stoi(argv[4]); const int init_method = std::stoi(argv[5]); const bool do_log = std::stoi(argv[6]); - const int nrepeat = std::stoi(argv[7]); + const bool time_kernel = std::stoi(argv[7]); const int M = std::stoi(argv[8]); const int N = std::stoi(argv[9]); @@ -69,7 +69,7 @@ int profile_gemm_bias_relu(int argc, char* argv[]) do_verification, init_method, do_log, - nrepeat, + time_kernel, M, N, K, @@ -88,7 +88,7 @@ int profile_gemm_bias_relu(int argc, char* argv[]) do_verification, init_method, do_log, - nrepeat, + time_kernel, M, N, K, @@ -107,7 +107,7 @@ int profile_gemm_bias_relu(int argc, char* argv[]) do_verification, init_method, do_log, - nrepeat, + time_kernel, M, N, K, @@ -126,7 +126,7 @@ int profile_gemm_bias_relu(int argc, char* argv[]) do_verification, init_method, do_log, - nrepeat, + time_kernel, M, N, K, diff --git a/profiler/src/profile_gemm_bias_relu_add.cpp b/profiler/src/profile_gemm_bias_relu_add.cpp index 52406e93d6..186f32cf6f 100644 --- a/profiler/src/profile_gemm_bias_relu_add.cpp +++ b/profiler/src/profile_gemm_bias_relu_add.cpp @@ -36,8 +36,8 @@ int profile_gemm_bias_relu_add(int argc, char* argv[]) printf(" 3: A[k, m] * B[n, k] = C[m, n])\n"); printf("arg4: verification (0: no; 1: yes)\n"); printf("arg5: initialization (0: no init; 1: integer value; 2: decimal value)\n"); - printf("arg8: print tensor value (0: no; 1: yes)\n"); - printf("arg7: run kernel # of times (>1)\n"); + printf("arg6: print tensor value (0: no; 1: yes)\n"); + printf("arg7: time kernel (0=n0, 1=yes)\n"); printf("arg8 to 14: M, N, K, StrideA, StrideB, StrideC, StrideC1\n"); printf("arg15: split k into mulitiple batch\n"); exit(1); @@ -48,7 +48,7 @@ int profile_gemm_bias_relu_add(int argc, char* argv[]) const bool do_verification = std::stoi(argv[4]); const int init_method = std::stoi(argv[5]); const bool do_log = std::stoi(argv[6]); - const int nrepeat = std::stoi(argv[7]); + const bool time_kernel = std::stoi(argv[7]); const int M = std::stoi(argv[8]); const int N = std::stoi(argv[9]); @@ -70,7 +70,7 @@ int profile_gemm_bias_relu_add(int argc, char* argv[]) do_verification, init_method, do_log, - nrepeat, + time_kernel, M, N, K, @@ -90,7 +90,7 @@ int profile_gemm_bias_relu_add(int argc, char* argv[]) do_verification, init_method, do_log, - nrepeat, + time_kernel, M, N, K, @@ -110,7 +110,7 @@ int profile_gemm_bias_relu_add(int argc, char* argv[]) do_verification, init_method, do_log, - nrepeat, + time_kernel, M, N, K, @@ -130,7 +130,7 @@ int profile_gemm_bias_relu_add(int argc, char* argv[]) do_verification, init_method, do_log, - nrepeat, + time_kernel, M, N, K, diff --git a/profiler/src/profile_gemm_reduce.cpp b/profiler/src/profile_gemm_reduce.cpp index a83d4ce9a1..986acaf010 100644 --- a/profiler/src/profile_gemm_reduce.cpp +++ b/profiler/src/profile_gemm_reduce.cpp @@ -32,8 +32,8 @@ int profile_gemm_reduce(int argc, char* argv[]) printf(" 3: A[k, m] * B[n, k] = C[m, n])\n"); printf("arg4: verification (0: no; 1: yes)\n"); printf("arg5: initialization (0: no init; 1: integer value; 2: decimal value)\n"); - printf("arg8: print tensor value (0: no; 1: yes)\n"); - printf("arg7: run kernel # of times (>1)\n"); + printf("arg6: print tensor value (0: no; 1: yes)\n"); + printf("arg7: time kernel (0=n0, 1=yes)\n"); printf("arg8 to 13: M, N, K, StrideA, StrideB, StrideC\n"); printf("arg14: split k into mulitiple batch\n"); exit(1); @@ -44,7 +44,7 @@ int profile_gemm_reduce(int argc, char* argv[]) const bool do_verification = std::stoi(argv[4]); const int init_method = std::stoi(argv[5]); const bool do_log = std::stoi(argv[6]); - const int nrepeat = std::stoi(argv[7]); + const bool time_kernel = std::stoi(argv[7]); const int M = std::stoi(argv[8]); const int N = std::stoi(argv[9]); @@ -66,7 +66,7 @@ int profile_gemm_reduce(int argc, char* argv[]) do_verification, init_method, do_log, - nrepeat, + time_kernel, M, N, K, @@ -87,7 +87,7 @@ int profile_gemm_reduce(int argc, char* argv[]) do_verification, init_method, do_log, - nrepeat, + time_kernel, M, N, K, @@ -108,7 +108,7 @@ int profile_gemm_reduce(int argc, char* argv[]) do_verification, init_method, do_log, - nrepeat, + time_kernel, M, N, K, @@ -129,7 +129,7 @@ int profile_gemm_reduce(int argc, char* argv[]) do_verification, init_method, do_log, - nrepeat, + time_kernel, M, N, K, diff --git a/profiler/src/profile_grouped_gemm.cpp b/profiler/src/profile_grouped_gemm.cpp index 88a2a8f855..d35484cfae 100644 --- a/profiler/src/profile_grouped_gemm.cpp +++ b/profiler/src/profile_grouped_gemm.cpp @@ -54,8 +54,8 @@ int profile_grouped_gemm(int argc, char* argv[]) printf(" 3: A[k, m] * B[n, k] = C[m, n])\n"); printf("arg4: verification (0: no; 1: yes)\n"); printf("arg5: initialization (0: no init; 1: integer value; 2: decimal value)\n"); - printf("arg8: print tensor value (0: no; 1: yes)\n"); - printf("arg7: run kernel # of times (>1)\n"); + printf("arg6: print tensor value (0: no; 1: yes)\n"); + printf("arg7: time kernel (0=n0, 1=yes)\n"); printf("arg8 to 13: Ms, Ns, Ks, StrideAs, StrideBs, StrideCs (e.g., 256,256 128,128 64,64 " "64,64 64,64 128,128)\n"); exit(1); @@ -66,7 +66,7 @@ int profile_grouped_gemm(int argc, char* argv[]) const bool do_verification = std::stoi(argv[4]); const int init_method = std::stoi(argv[5]); const bool do_log = std::stoi(argv[6]); - const int nrepeat = std::stoi(argv[7]); + const bool time_kernel = std::stoi(argv[7]); const auto Ms = argToIntArray(argv[8]); const auto Ns = argToIntArray(argv[9]); @@ -86,7 +86,7 @@ int profile_grouped_gemm(int argc, char* argv[]) ck::tensor_layout::gemm::RowMajor>(do_verification, init_method, do_log, - nrepeat, + time_kernel, Ms, Ns, Ks, @@ -104,7 +104,7 @@ int profile_grouped_gemm(int argc, char* argv[]) ck::tensor_layout::gemm::RowMajor>(do_verification, init_method, do_log, - nrepeat, + time_kernel, Ms, Ns, Ks, @@ -122,7 +122,7 @@ int profile_grouped_gemm(int argc, char* argv[]) ck::tensor_layout::gemm::RowMajor>(do_verification, init_method, do_log, - nrepeat, + time_kernel, Ms, Ns, Ks, @@ -140,7 +140,7 @@ int profile_grouped_gemm(int argc, char* argv[]) ck::tensor_layout::gemm::RowMajor>(do_verification, init_method, do_log, - nrepeat, + time_kernel, Ms, Ns, Ks, diff --git a/profiler/src/profile_reduce.cpp b/profiler/src/profile_reduce.cpp index 96fa78964a..5e91a1d2d1 100644 --- a/profiler/src/profile_reduce.cpp +++ b/profiler/src/profile_reduce.cpp @@ -144,7 +144,7 @@ class AppArgs bool do_dumpout = false; int init_method; - int nrepeat; + bool time_kernel; bool need_indices = false; @@ -295,7 +295,7 @@ class AppArgs throw std::runtime_error("Invalid cmd-line arguments, more argumetns are needed!"); init_method = std::atoi(argv[optind++]); - nrepeat = std::atoi(argv[optind]); + time_kernel = std::atoi(argv[optind]); if(scales.empty()) { @@ -354,7 +354,7 @@ int profile_reduce(int argc, char* argv[]) args.init_method, args.do_log, args.do_dumpout, - args.nrepeat, + args.time_kernel, args.inLengths, args.reduceDims, args.reduceOp, @@ -369,7 +369,7 @@ int profile_reduce(int argc, char* argv[]) args.init_method, args.do_log, args.do_dumpout, - args.nrepeat, + args.time_kernel, args.inLengths, args.reduceDims, args.reduceOp, @@ -387,7 +387,7 @@ int profile_reduce(int argc, char* argv[]) args.init_method, args.do_log, args.do_dumpout, - args.nrepeat, + args.time_kernel, args.inLengths, args.reduceDims, args.reduceOp, @@ -414,7 +414,7 @@ int profile_reduce(int argc, char* argv[]) args.init_method, args.do_log, args.do_dumpout, - args.nrepeat, + args.time_kernel, args.inLengths, args.reduceDims, args.reduceOp, @@ -429,7 +429,7 @@ int profile_reduce(int argc, char* argv[]) args.init_method, args.do_log, args.do_dumpout, - args.nrepeat, + args.time_kernel, args.inLengths, args.reduceDims, args.reduceOp, @@ -454,7 +454,7 @@ int profile_reduce(int argc, char* argv[]) args.init_method, args.do_log, args.do_dumpout, - args.nrepeat, + args.time_kernel, args.inLengths, args.reduceDims, args.reduceOp, @@ -471,7 +471,7 @@ int profile_reduce(int argc, char* argv[]) args.init_method, args.do_log, args.do_dumpout, - args.nrepeat, + args.time_kernel, args.inLengths, args.reduceDims, args.reduceOp, @@ -486,7 +486,7 @@ int profile_reduce(int argc, char* argv[]) args.init_method, args.do_log, args.do_dumpout, - args.nrepeat, + args.time_kernel, args.inLengths, args.reduceDims, args.reduceOp, diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index 304ce070ff..2ad13da7b4 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -1,4 +1,5 @@ include_directories(BEFORE + ${PROJECT_SOURCE_DIR}/ ${PROJECT_SOURCE_DIR}/include/ck ${PROJECT_SOURCE_DIR}/include/ck/utility ${PROJECT_SOURCE_DIR}/include/ck/tensor_description @@ -21,7 +22,8 @@ include_directories(BEFORE ${PROJECT_SOURCE_DIR}/external/include/half ) -add_custom_target(check COMMAND ${CMAKE_CTEST_COMMAND} --output-on-failure -C ${CMAKE_CFG_INTDIR}) +include(googletest) + add_custom_target(tests) @@ -41,7 +43,7 @@ function(add_gtest_executable TEST_NAME) add_dependencies(tests ${TEST_NAME}) add_dependencies(check ${TEST_NAME}) # suppress gtest warnings - target_compile_options(${TEST_NAME} PRIVATE -Wno-global-constructors) + target_compile_options(${TEST_NAME} PRIVATE -Wno-global-constructors -Wno-undef) target_link_libraries(${TEST_NAME} PRIVATE gtest_main) gtest_discover_tests(${TEST_NAME}) endfunction(add_gtest_executable TEST_NAME) @@ -60,4 +62,6 @@ add_subdirectory(grouped_gemm) add_subdirectory(convnd_fwd) add_subdirectory(reduce) add_subdirectory(conv2d_bwd_weight) +add_subdirectory(convnd_bwd_data) add_subdirectory(cgemm) +# DONOT add client_app, that is tested via CI independently diff --git a/test/batched_gemm_reduce/batched_gemm_reduce_fp16.cpp b/test/batched_gemm_reduce/batched_gemm_reduce_fp16.cpp index ce061c644b..7b311cff17 100644 --- a/test/batched_gemm_reduce/batched_gemm_reduce_fp16.cpp +++ b/test/batched_gemm_reduce/batched_gemm_reduce_fp16.cpp @@ -22,7 +22,7 @@ int main() Row, Row, Row>( - true, 1, false, 1, M, N, K, K, N, N, BatchCount); + true, 1, false, false, M, N, K, K, N, N, BatchCount); pass = pass && ck::profiler::profile_batched_gemm_reduce_impl( - true, 1, false, 1, M, N, K, K, K, N, BatchCount); + true, 1, false, false, M, N, K, K, K, N, BatchCount); pass = pass && ck::profiler::profile_batched_gemm_reduce_impl( - true, 1, false, 1, M, N, K, M, N, N, BatchCount); + true, 1, false, false, M, N, K, M, N, N, BatchCount); pass = pass && ck::profiler::profile_batched_gemm_reduce_impl( - true, 1, false, 1, M, N, K, M, K, N, BatchCount); + true, 1, false, false, M, N, K, M, K, N, BatchCount); if(pass) { diff --git a/test/client_app/CMakeLists.txt b/test/client_app/CMakeLists.txt new file mode 100644 index 0000000000..f8dd8c4e0a --- /dev/null +++ b/test/client_app/CMakeLists.txt @@ -0,0 +1,11 @@ +cmake_minimum_required(VERSION 3.15) +project(ck_app) +add_compile_options(-std=c++14) + +find_package(composable_kernel 1.0.0 COMPONENTS device_operations host_tensor) +find_package(hip REQUIRED PATHS /opt/rocm) +message(STATUS "Build with HIP ${hip_VERSION}") + +add_executable(test_client_app client_app.cpp) + +target_link_libraries(test_client_app PRIVATE composable_kernel::device_operations composable_kernel::host_tensor hip::host) diff --git a/test/client_app/client_app.cpp b/test/client_app/client_app.cpp new file mode 100644 index 0000000000..665a103f70 --- /dev/null +++ b/test/client_app/client_app.cpp @@ -0,0 +1,77 @@ +#include +#include +#include +#include +#include +#include +#include + +#include "client_app_impl.hpp" + +int main(int argc, char* argv[]) +{ + if(argc != 25) + { + printf("arg1: tensor operation (conv_fwd: ForwardConvolution)\n"); + printf("arg2: data type (0: fp32; 1: fp16)\n"); + printf("arg3: input tensor layout (0: NCHW; 1: NHWC)\n"); + printf("arg4: weight tensor layout (0: KCYX; 1: KYXC)\n"); + printf("arg5: output tensor layout (0: NKHW; 1: NHWK)\n"); + printf("arg6: verification (0: no; 1: yes)\n"); + printf("arg7: initialization (0: no init; 1: integer value; 2: decimal value)\n"); + printf("arg8: print tensor value (0: no; 1: yes)\n"); + printf("arg9: time kernel (0=n0, 1=yes)\n"); + printf("arg10 to 24: N, K, C, Y, X, Hi, Wi, Sy, Sx, Dy, Dx, LeftPy, LeftPx, RightPy, " + "RightPx\n"); + exit(1); + } + + const ConvDataType data_type = static_cast(std::stoi(argv[2])); + const int in_layout = static_cast(std::stoi(argv[3])); + const int wei_layout = static_cast(std::stoi(argv[4])); + const int out_layout = static_cast(std::stoi(argv[5])); + const bool do_verification = std::stoi(argv[6]); + const int init_method = std::stoi(argv[7]); + const bool do_log = std::stoi(argv[8]); + const bool time_kernel = std::stoi(argv[9]); + + const ck::index_t N = std::stoi(argv[10]); + const ck::index_t K = std::stoi(argv[11]); + const ck::index_t C = std::stoi(argv[12]); + const ck::index_t Y = std::stoi(argv[13]); + const ck::index_t X = std::stoi(argv[14]); + const ck::index_t Hi = std::stoi(argv[15]); + const ck::index_t Wi = std::stoi(argv[16]); + + const ck::index_t conv_stride_h = std::stoi(argv[17]); + const ck::index_t conv_stride_w = std::stoi(argv[18]); + const ck::index_t conv_dilation_h = std::stoi(argv[19]); + const ck::index_t conv_dilation_w = std::stoi(argv[20]); + const ck::index_t in_left_pad_h = std::stoi(argv[21]); + const ck::index_t in_left_pad_w = std::stoi(argv[22]); + const ck::index_t in_right_pad_h = std::stoi(argv[23]); + const ck::index_t in_right_pad_w = std::stoi(argv[24]); + + const ck::index_t YEff = (Y - 1) * conv_dilation_h + 1; + const ck::index_t XEff = (X - 1) * conv_dilation_w + 1; + + const ck::index_t Ho = (Hi + in_left_pad_h + in_right_pad_h - YEff) / conv_stride_h + 1; + const ck::index_t Wo = (Wi + in_left_pad_w + in_right_pad_w - XEff) / conv_stride_w + 1; + + ck::app::profile_conv_fwd_impl(do_verification, + init_method, + do_log, + time_kernel, + data_type, + N, + K, + C, + std::vector{Hi, Wi}, + std::vector{Y, X}, + std::vector{Ho, Wo}, + std::vector{conv_stride_h, conv_stride_w}, + std::vector{conv_dilation_h, conv_dilation_w}, + std::vector{in_left_pad_h, in_left_pad_w}, + std::vector{in_right_pad_h, in_right_pad_w}); + return 1; +} diff --git a/test/client_app/client_app_impl.hpp b/test/client_app/client_app_impl.hpp new file mode 100644 index 0000000000..f9e4145ba0 --- /dev/null +++ b/test/client_app/client_app_impl.hpp @@ -0,0 +1,214 @@ +#pragma once + +#include "host_interface.hpp" + +enum ConvDataType +{ + F32_F32_F32, // 0 + F16_F16_F16, // 1 + BF16_BF16_BF16, // 2 + INT8_INT8_INT8, // 3 +}; + +enum ConvInputLayout +{ + NCHW, // 0 + NHWC, // 1 +}; + +enum ConvWeightLayout +{ + KCYX, // 0 + KYXC, // 1 +}; + +enum ConvOutputLayout +{ + NKHW, // 0 + NHWK, // 1 +}; + +void check_hip_error(void) +{ + hipError_t err = hipGetLastError(); + if(err != hipSuccess) + { + std::cerr << "Error: " << hipGetErrorString(err) << std::endl; + exit(err); + } +} +std::string getDeviceName(int device) +{ + struct hipDeviceProp_t prop; + hipGetDeviceProperties(&prop, device); + check_hip_error(); + return std::string(prop.name); +} + +int getDriver(void) +{ + int driver; + hipDriverGetVersion(&driver); + check_hip_error(); + return driver; +} + +namespace ck { +namespace app { +struct DeviceMem +{ + DeviceMem() = delete; + DeviceMem(std::size_t mem_size); + void* GetDeviceBuffer(); + void ToDevice(const void* p); + void FromDevice(void* p); + ~DeviceMem(); + + void* mpDeviceBuf; + std::size_t mMemSize; +}; + +DeviceMem::DeviceMem(std::size_t mem_size) : mMemSize(mem_size) +{ + hipGetErrorString(hipMalloc(static_cast(&mpDeviceBuf), mMemSize)); +} + +void* DeviceMem::GetDeviceBuffer() { return mpDeviceBuf; } + +void DeviceMem::ToDevice(const void* p) +{ + hipGetErrorString( + hipMemcpy(mpDeviceBuf, const_cast(p), mMemSize, hipMemcpyHostToDevice)); +} + +void DeviceMem::FromDevice(void* p) +{ + hipGetErrorString(hipMemcpy(p, mpDeviceBuf, mMemSize, hipMemcpyDeviceToHost)); +} + +DeviceMem::~DeviceMem() { hipGetErrorString(hipFree(mpDeviceBuf)); } + +void profile_conv_fwd_impl(int do_verification, + int init_method, + bool do_log, + bool time_kernel, + ConvDataType data_type, + ck::index_t N, + ck::index_t K, + ck::index_t C, + std::vector input_spatial_lengths, + std::vector filter_spatial_lengths, + std::vector output_spatial_lengths, + std::vector conv_filter_strides, + std::vector conv_filter_dilations, + std::vector input_left_pads, + std::vector input_right_pads) +{ + const ck::index_t Y = filter_spatial_lengths[0]; + const ck::index_t X = filter_spatial_lengths[1]; + + const ck::index_t Hi = input_spatial_lengths[0]; + const ck::index_t Wi = input_spatial_lengths[1]; + + const ck::index_t Ho = output_spatial_lengths[0]; + const ck::index_t Wo = output_spatial_lengths[1]; + + const auto in_sz = N * C * Hi * Wi; + const auto wei_sz = K * C * Y * X; + const auto out_sz = N * K * Ho * Wo; + + using WeiDataType = float; + using InDataType = float; + using OutDataType = float; + + app::DeviceMem in_device_buf(sizeof(InDataType) * in_sz); + app::DeviceMem wei_device_buf(sizeof(WeiDataType) * wei_sz); + app::DeviceMem out_device_buf(sizeof(OutDataType) * out_sz); + // data is already on device! + + // add device Conv instances + std::vector conv_ptrs; + if(data_type == F16_F16_F16) + { + add_device_conv2d_fwd_xdl_c_shuffle_nhwc_kyxc_nhwk_f16_instances_t(conv_ptrs); + add_device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_f16_instances_t(conv_ptrs); + } + else if(data_type == BF16_BF16_BF16) + add_device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_bf16_instances_t(conv_ptrs); + else if(data_type == F32_F32_F32) + add_device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_f32_instances_t(conv_ptrs); + else if(data_type == INT8_INT8_INT8) + add_device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_int8_instances_t(conv_ptrs); + else + throw std::runtime_error("wrong! Invalid data type"); + if(conv_ptrs.empty()) + { + throw std::runtime_error("wrong! no device Conv instance found"); + } + + std::string best_conv_name; + float best_ave_time = 0; + float best_tflops = 0; + float best_gb_per_sec = 0; + int deviceIndex = 0; + hipSetDevice(deviceIndex); + check_hip_error(); + + StreamConfig stream_config{nullptr, time_kernel}; + hipStreamCreate(&stream_config.stream_id_); + check_hip_error(); + + // profile device Conv instances + for(auto& conv_ptr : conv_ptrs) + { + auto argument_ptr = + conv_ptr.MakeArgumentPointer(static_cast(in_device_buf.GetDeviceBuffer()), + static_cast(wei_device_buf.GetDeviceBuffer()), + static_cast(out_device_buf.GetDeviceBuffer()), + N, + K, + C, + input_spatial_lengths, + filter_spatial_lengths, + output_spatial_lengths, + conv_filter_strides, + conv_filter_dilations, + input_left_pads, + input_right_pads); + + auto invoker_ptr = conv_ptr.MakeInvokerPointer(); + + if(conv_ptr.IsSupportedArgument(argument_ptr.get())) + { + std::string conv_name = conv_ptr.GetTypeString(); + float ave_time = invoker_ptr->Run(argument_ptr.get(), stream_config); + + std::size_t flop = std::size_t(2) * N * K * Ho * Wo * C * Y * X; + + std::size_t num_btype = sizeof(InDataType) * (N * C * Hi * Wi) + + sizeof(WeiDataType) * (K * C * Y * X) + + sizeof(OutDataType) * (N * K * Ho * Wo); + + float tflops = static_cast(flop) / 1.E9 / ave_time; + + float gb_per_sec = num_btype / 1.E6 / ave_time; + + std::cout << "Perf: " << ave_time << " ms, " << tflops << " TFlops, " << gb_per_sec + << " GB/s, " << conv_name << std::endl; + + if(tflops > best_tflops) + { + best_conv_name = conv_name; + best_tflops = tflops; + best_ave_time = ave_time; + best_gb_per_sec = gb_per_sec; + } + } + } + + std::cout << "Best Perf: " << best_ave_time << " ms, " << best_tflops << " TFlops, " + << best_gb_per_sec << " GB/s, " << best_conv_name << std::endl; +} + +} // namespace app +} // namespace ck diff --git a/test/conv2d_bwd_weight/CMakeLists.txt b/test/conv2d_bwd_weight/CMakeLists.txt index 7b515b6b8e..ecd5336c1f 100644 --- a/test/conv2d_bwd_weight/CMakeLists.txt +++ b/test/conv2d_bwd_weight/CMakeLists.txt @@ -4,4 +4,4 @@ include_directories(BEFORE ) add_test_executable(test_conv2d_bwd_weight conv2d_bwd_weight.cpp) -target_link_libraries(test_conv2d_bwd_weight PRIVATE host_tensor device_conv2d_bwd_weight_instance conv_fwd_util) +target_link_libraries(test_conv2d_bwd_weight PRIVATE host_tensor device_conv2d_bwd_weight_instance conv_util) diff --git a/test/conv2d_bwd_weight/conv2d_bwd_weight.cpp b/test/conv2d_bwd_weight/conv2d_bwd_weight.cpp index bb3ed985e3..671980f49e 100644 --- a/test/conv2d_bwd_weight/conv2d_bwd_weight.cpp +++ b/test/conv2d_bwd_weight/conv2d_bwd_weight.cpp @@ -6,7 +6,7 @@ #include #include -#include "conv_fwd_util.hpp" +#include "conv_util.hpp" #include "profile_conv_bwd_weight_impl.hpp" int test_self() @@ -28,20 +28,20 @@ int test_self() ck::tensor_layout::convolution::NHWC, ck::tensor_layout::convolution::KYXC, ck::tensor_layout::convolution::NHWK>( - 1, // do_verification, - 1, // init_method, - 0, // do_log, - 1, // nrepeat, - param.N, - param.K, - param.C, - param.input_spatial_lengths, - param.filter_spatial_lengths, + true, // do_verification + 1, // init_method + false, // do_log + false, // time_kernel + param.N_, + param.K_, + param.C_, + param.input_spatial_lengths_, + param.filter_spatial_lengths_, param.GetOutputSpatialLengths(), - param.conv_filter_strides, - param.conv_filter_dilations, - param.input_left_pads, - param.input_right_pads, + param.conv_filter_strides_, + param.conv_filter_dilations_, + param.input_left_pads_, + param.input_right_pads_, 2); // fp16 @@ -52,28 +52,28 @@ int test_self() ck::tensor_layout::convolution::NHWC, ck::tensor_layout::convolution::KYXC, ck::tensor_layout::convolution::NHWK>( - 1, // do_verification, - 1, // init_method, - 0, // do_log, - 1, // nrepeat, - param.N, - param.K, - param.C, - param.input_spatial_lengths, - param.filter_spatial_lengths, + true, // do_verification + 1, // init_method + false, // do_log + false, // time_kernel + param.N_, + param.K_, + param.C_, + param.input_spatial_lengths_, + param.filter_spatial_lengths_, param.GetOutputSpatialLengths(), - param.conv_filter_strides, - param.conv_filter_dilations, - param.input_left_pads, - param.input_right_pads, + param.conv_filter_strides_, + param.conv_filter_dilations_, + param.input_left_pads_, + param.input_right_pads_, 2); } return pass; } int main(int argc, char* argv[]) { - int data_type = 0; - int init_method = 0; + int data_type = 1; + int init_method = 1; // Conv shape ck::index_t N = 128; @@ -155,20 +155,20 @@ int main(int argc, char* argv[]) ck::tensor_layout::convolution::NHWC, ck::tensor_layout::convolution::KYXC, ck::tensor_layout::convolution::NHWK>( - 1, + true, // do_verification init_method, - 0, - 1, - param.N, - param.K, - param.C, - param.input_spatial_lengths, - param.filter_spatial_lengths, + false, // do_log + false, // time_kernel + param.N_, + param.K_, + param.C_, + param.input_spatial_lengths_, + param.filter_spatial_lengths_, param.GetOutputSpatialLengths(), - param.conv_filter_strides, - param.conv_filter_dilations, - param.input_left_pads, - param.input_right_pads, + param.conv_filter_strides_, + param.conv_filter_dilations_, + param.input_left_pads_, + param.input_right_pads_, split_k); } else if(data_type == 1) @@ -180,20 +180,20 @@ int main(int argc, char* argv[]) ck::tensor_layout::convolution::NHWC, ck::tensor_layout::convolution::KYXC, ck::tensor_layout::convolution::NHWK>( - 1, + true, // do_verification init_method, - 0, - 1, - param.N, - param.K, - param.C, - param.input_spatial_lengths, - param.filter_spatial_lengths, + false, // do_log + false, // time_kernel + param.N_, + param.K_, + param.C_, + param.input_spatial_lengths_, + param.filter_spatial_lengths_, param.GetOutputSpatialLengths(), - param.conv_filter_strides, - param.conv_filter_dilations, - param.input_left_pads, - param.input_right_pads, + param.conv_filter_strides_, + param.conv_filter_dilations_, + param.input_left_pads_, + param.input_right_pads_, split_k); } else diff --git a/test/conv_util/CMakeLists.txt b/test/conv_util/CMakeLists.txt index 70b3e851be..795c9ec0ac 100644 --- a/test/conv_util/CMakeLists.txt +++ b/test/conv_util/CMakeLists.txt @@ -1,2 +1,2 @@ add_gtest_executable(test_conv_util conv_util.cpp) -target_link_libraries(test_conv_util PRIVATE host_tensor conv_fwd_util) +target_link_libraries(test_conv_util PRIVATE host_tensor conv_util) diff --git a/test/conv_util/conv_util.cpp b/test/conv_util/conv_util.cpp index 453225e800..98f55b872e 100644 --- a/test/conv_util/conv_util.cpp +++ b/test/conv_util/conv_util.cpp @@ -1,10 +1,10 @@ #include #include #include -#include "gtest/gtest.h" +#include #include "config.hpp" -#include "conv_fwd_util.hpp" +#include "conv_util.hpp" #include "tensor_layout.hpp" #include "check_err.hpp" @@ -15,13 +15,13 @@ class TestConvUtil : public ::testing::Test public: void SetNDParams(std::size_t ndims) { - conv_params.num_dim_spatial = ndims; - conv_params.filter_spatial_lengths = std::vector(ndims, 3); - conv_params.input_spatial_lengths = std::vector(ndims, 71); - conv_params.conv_filter_strides = std::vector(ndims, 2); - conv_params.conv_filter_dilations = std::vector(ndims, 1); - conv_params.input_left_pads = std::vector(ndims, 1); - conv_params.input_right_pads = std::vector(ndims, 1); + conv_params.num_dim_spatial_ = ndims; + conv_params.filter_spatial_lengths_ = std::vector(ndims, 3); + conv_params.input_spatial_lengths_ = std::vector(ndims, 71); + conv_params.conv_filter_strides_ = std::vector(ndims, 2); + conv_params.conv_filter_dilations_ = std::vector(ndims, 1); + conv_params.input_left_pads_ = std::vector(ndims, 1); + conv_params.input_right_pads_ = std::vector(ndims, 1); } protected: @@ -44,29 +44,29 @@ TEST_F(TestConvUtil, ConvParamsGetOutputSpatialLengths2D) std::vector{36, 36}, "Error: ConvParams 2D default constructor.")); - conv_params.conv_filter_strides = std::vector{1, 1}; - out_spatial_len = conv_params.GetOutputSpatialLengths(); + conv_params.conv_filter_strides_ = std::vector{1, 1}; + out_spatial_len = conv_params.GetOutputSpatialLengths(); EXPECT_TRUE(ck::utils::check_err( out_spatial_len, std::vector{71, 71}, "Error: ConvParams 2D stride {1,1}.")); - conv_params.conv_filter_strides = std::vector{2, 2}; - conv_params.input_left_pads = std::vector{2, 2}; - conv_params.input_right_pads = std::vector{2, 2}; - out_spatial_len = conv_params.GetOutputSpatialLengths(); + conv_params.conv_filter_strides_ = std::vector{2, 2}; + conv_params.input_left_pads_ = std::vector{2, 2}; + conv_params.input_right_pads_ = std::vector{2, 2}; + out_spatial_len = conv_params.GetOutputSpatialLengths(); EXPECT_TRUE(ck::utils::check_err(out_spatial_len, std::vector{37, 37}, "Error: ConvParams 2D padding left/right {2,2}.")); - conv_params.conv_filter_dilations = std::vector{2, 2}; - out_spatial_len = conv_params.GetOutputSpatialLengths(); + conv_params.conv_filter_dilations_ = std::vector{2, 2}; + out_spatial_len = conv_params.GetOutputSpatialLengths(); EXPECT_TRUE(ck::utils::check_err( out_spatial_len, std::vector{36, 36}, "Error: ConvParams 2D dilation {2,2}.")); - conv_params.conv_filter_strides = std::vector{3, 3}; - conv_params.input_left_pads = std::vector{1, 1}; - conv_params.input_right_pads = std::vector{1, 1}; - conv_params.conv_filter_dilations = std::vector{2, 2}; - out_spatial_len = conv_params.GetOutputSpatialLengths(); + conv_params.conv_filter_strides_ = std::vector{3, 3}; + conv_params.input_left_pads_ = std::vector{1, 1}; + conv_params.input_right_pads_ = std::vector{1, 1}; + conv_params.conv_filter_dilations_ = std::vector{2, 2}; + out_spatial_len = conv_params.GetOutputSpatialLengths(); EXPECT_TRUE( ck::utils::check_err(out_spatial_len, std::vector{23, 23}, @@ -81,29 +81,29 @@ TEST_F(TestConvUtil, ConvParamsGetOutputSpatialLengths1D) EXPECT_TRUE(ck::utils::check_err( out_spatial_len, std::vector{36}, "Error: ConvParams 1D.")); - conv_params.conv_filter_strides = std::vector{1}; - out_spatial_len = conv_params.GetOutputSpatialLengths(); + conv_params.conv_filter_strides_ = std::vector{1}; + out_spatial_len = conv_params.GetOutputSpatialLengths(); EXPECT_TRUE(ck::utils::check_err( out_spatial_len, std::vector{71}, "Error: ConvParams 1D stride {1}.")); - conv_params.conv_filter_strides = std::vector{2}; - conv_params.input_left_pads = std::vector{2}; - conv_params.input_right_pads = std::vector{2}; - out_spatial_len = conv_params.GetOutputSpatialLengths(); + conv_params.conv_filter_strides_ = std::vector{2}; + conv_params.input_left_pads_ = std::vector{2}; + conv_params.input_right_pads_ = std::vector{2}; + out_spatial_len = conv_params.GetOutputSpatialLengths(); EXPECT_TRUE(ck::utils::check_err(out_spatial_len, std::vector{37}, "Error: ConvParams 1D padding left/right {2}.")); - conv_params.conv_filter_dilations = std::vector{2}; - out_spatial_len = conv_params.GetOutputSpatialLengths(); + conv_params.conv_filter_dilations_ = std::vector{2}; + out_spatial_len = conv_params.GetOutputSpatialLengths(); EXPECT_TRUE(ck::utils::check_err( out_spatial_len, std::vector{36}, "Error: ConvParams 1D dilation {2}.")); - conv_params.conv_filter_strides = std::vector{3}; - conv_params.input_left_pads = std::vector{1}; - conv_params.input_right_pads = std::vector{1}; - conv_params.conv_filter_dilations = std::vector{2}; - out_spatial_len = conv_params.GetOutputSpatialLengths(); + conv_params.conv_filter_strides_ = std::vector{3}; + conv_params.input_left_pads_ = std::vector{1}; + conv_params.input_right_pads_ = std::vector{1}; + conv_params.conv_filter_dilations_ = std::vector{2}; + out_spatial_len = conv_params.GetOutputSpatialLengths(); EXPECT_TRUE( ck::utils::check_err(out_spatial_len, std::vector{23}, @@ -118,31 +118,31 @@ TEST_F(TestConvUtil, ConvParamsGetOutputSpatialLengths3D) EXPECT_TRUE(ck::utils::check_err( out_spatial_len, std::vector{36, 36, 36}, "Error: ConvParams 3D.")); - conv_params.conv_filter_strides = std::vector{1, 1, 1}; - out_spatial_len = conv_params.GetOutputSpatialLengths(); + conv_params.conv_filter_strides_ = std::vector{1, 1, 1}; + out_spatial_len = conv_params.GetOutputSpatialLengths(); EXPECT_TRUE(ck::utils::check_err(out_spatial_len, std::vector{71, 71, 71}, "Error: ConvParams 3D stride {1, 1, 1}.")); - conv_params.conv_filter_strides = std::vector{2, 2, 2}; - conv_params.input_left_pads = std::vector{2, 2, 2}; - conv_params.input_right_pads = std::vector{2, 2, 2}; - out_spatial_len = conv_params.GetOutputSpatialLengths(); + conv_params.conv_filter_strides_ = std::vector{2, 2, 2}; + conv_params.input_left_pads_ = std::vector{2, 2, 2}; + conv_params.input_right_pads_ = std::vector{2, 2, 2}; + out_spatial_len = conv_params.GetOutputSpatialLengths(); EXPECT_TRUE(ck::utils::check_err(out_spatial_len, std::vector{37, 37, 37}, "Error: ConvParams 3D padding left/right {2, 2, 2}.")); - conv_params.conv_filter_dilations = std::vector{2, 2, 2}; - out_spatial_len = conv_params.GetOutputSpatialLengths(); + conv_params.conv_filter_dilations_ = std::vector{2, 2, 2}; + out_spatial_len = conv_params.GetOutputSpatialLengths(); EXPECT_TRUE(ck::utils::check_err(out_spatial_len, std::vector{36, 36, 36}, "Error: ConvParams 3D dilation {2, 2, 2}.")); - conv_params.conv_filter_strides = std::vector{3, 3, 3}; - conv_params.input_left_pads = std::vector{1, 1, 1}; - conv_params.input_right_pads = std::vector{1, 1, 1}; - conv_params.conv_filter_dilations = std::vector{2, 2, 2}; - out_spatial_len = conv_params.GetOutputSpatialLengths(); + conv_params.conv_filter_strides_ = std::vector{3, 3, 3}; + conv_params.input_left_pads_ = std::vector{1, 1, 1}; + conv_params.input_right_pads_ = std::vector{1, 1, 1}; + conv_params.conv_filter_dilations_ = std::vector{2, 2, 2}; + out_spatial_len = conv_params.GetOutputSpatialLengths(); EXPECT_TRUE(ck::utils::check_err( out_spatial_len, std::vector{23, 23, 23}, diff --git a/test/convnd_bwd_data/CMakeLists.txt b/test/convnd_bwd_data/CMakeLists.txt index 58e6e7d3d0..55d71a41d3 100644 --- a/test/convnd_bwd_data/CMakeLists.txt +++ b/test/convnd_bwd_data/CMakeLists.txt @@ -4,4 +4,4 @@ include_directories(BEFORE ) add_test_executable(test_convnd_bwd_data convnd_bwd_data.cpp) -target_link_libraries(test_convnd_bwd_data PRIVATE host_tensor device_convnd_bwd_data_instance conv_fwd_util) +target_link_libraries(test_convnd_bwd_data PRIVATE host_tensor device_convnd_bwd_data_instance conv_util) diff --git a/test/convnd_bwd_data/convnd_bwd_data.cpp b/test/convnd_bwd_data/convnd_bwd_data.cpp index cbc215033b..7284680e0e 100644 --- a/test/convnd_bwd_data/convnd_bwd_data.cpp +++ b/test/convnd_bwd_data/convnd_bwd_data.cpp @@ -27,20 +27,20 @@ int main() ck::tensor_layout::convolution::NWC, ck::tensor_layout::convolution::KXC, ck::tensor_layout::convolution::NWK>( - 1, // do_verification, - 1, // init_method, - 0, // do_log, - 1, // nrepeat, - param.N, - param.K, - param.C, - param.input_spatial_lengths, - param.filter_spatial_lengths, + true, // do_verification + 1, // init_method + false, // do_log + false, // time_kernel + param.N_, + param.K_, + param.C_, + param.input_spatial_lengths_, + param.filter_spatial_lengths_, param.GetOutputSpatialLengths(), - param.conv_filter_strides, - param.conv_filter_dilations, - param.input_left_pads, - param.input_right_pads); + param.conv_filter_strides_, + param.conv_filter_dilations_, + param.input_left_pads_, + param.input_right_pads_); pass &= ck::profiler::profile_convnd_bwd_data_impl<1, ck::half_t, @@ -50,20 +50,20 @@ int main() ck::tensor_layout::convolution::NWC, ck::tensor_layout::convolution::KXC, ck::tensor_layout::convolution::NWK>( - 1, // do_verification, - 1, // init_method, - 0, // do_log, - 1, // nrepeat, - param.N, - param.K, - param.C, - param.input_spatial_lengths, - param.filter_spatial_lengths, + true, // do_verification + 1, // init_method + false, // do_log + false, // time_kernel + param.N_, + param.K_, + param.C_, + param.input_spatial_lengths_, + param.filter_spatial_lengths_, param.GetOutputSpatialLengths(), - param.conv_filter_strides, - param.conv_filter_dilations, - param.input_left_pads, - param.input_right_pads); + param.conv_filter_strides_, + param.conv_filter_dilations_, + param.input_left_pads_, + param.input_right_pads_); pass &= ck::profiler::profile_convnd_bwd_data_impl<1, ck::bhalf_t, @@ -73,20 +73,20 @@ int main() ck::tensor_layout::convolution::NWC, ck::tensor_layout::convolution::KXC, ck::tensor_layout::convolution::NWK>( - 1, // do_verification, - 1, // init_method, - 0, // do_log, - 1, // nrepeat, - param.N, - param.K, - param.C, - param.input_spatial_lengths, - param.filter_spatial_lengths, + true, // do_verification + 1, // init_method + false, // do_log + false, // time_kernel + param.N_, + param.K_, + param.C_, + param.input_spatial_lengths_, + param.filter_spatial_lengths_, param.GetOutputSpatialLengths(), - param.conv_filter_strides, - param.conv_filter_dilations, - param.input_left_pads, - param.input_right_pads); + param.conv_filter_strides_, + param.conv_filter_dilations_, + param.input_left_pads_, + param.input_right_pads_); pass &= ck::profiler::profile_convnd_bwd_data_impl<1, int8_t, @@ -96,20 +96,20 @@ int main() ck::tensor_layout::convolution::NWC, ck::tensor_layout::convolution::KXC, ck::tensor_layout::convolution::NWK>( - 1, // do_verification, - 1, // init_method, - 0, // do_log, - 1, // nrepeat, - param.N, - param.K, - param.C, - param.input_spatial_lengths, - param.filter_spatial_lengths, + true, // do_verification + 1, // init_method + false, // do_log + false, // time_kernel + param.N_, + param.K_, + param.C_, + param.input_spatial_lengths_, + param.filter_spatial_lengths_, param.GetOutputSpatialLengths(), - param.conv_filter_strides, - param.conv_filter_dilations, - param.input_left_pads, - param.input_right_pads); + param.conv_filter_strides_, + param.conv_filter_dilations_, + param.input_left_pads_, + param.input_right_pads_); } // check 2d @@ -128,20 +128,20 @@ int main() ck::tensor_layout::convolution::NHWC, ck::tensor_layout::convolution::KYXC, ck::tensor_layout::convolution::NHWK>( - 1, // do_verification, - 1, // init_method, - 0, // do_log, - 1, // nrepeat, - param.N, - param.K, - param.C, - param.input_spatial_lengths, - param.filter_spatial_lengths, + true, // do_verification + 1, // init_method + false, // do_log + false, // time_kernel + param.N_, + param.K_, + param.C_, + param.input_spatial_lengths_, + param.filter_spatial_lengths_, param.GetOutputSpatialLengths(), - param.conv_filter_strides, - param.conv_filter_dilations, - param.input_left_pads, - param.input_right_pads); + param.conv_filter_strides_, + param.conv_filter_dilations_, + param.input_left_pads_, + param.input_right_pads_); pass &= ck::profiler::profile_convnd_bwd_data_impl<2, ck::half_t, @@ -151,20 +151,20 @@ int main() ck::tensor_layout::convolution::NHWC, ck::tensor_layout::convolution::KYXC, ck::tensor_layout::convolution::NHWK>( - 1, // do_verification, - 1, // init_method, - 0, // do_log, - 1, // nrepeat, - param.N, - param.K, - param.C, - param.input_spatial_lengths, - param.filter_spatial_lengths, + true, // do_verification + 1, // init_method + false, // do_log + false, // time_kernel + param.N_, + param.K_, + param.C_, + param.input_spatial_lengths_, + param.filter_spatial_lengths_, param.GetOutputSpatialLengths(), - param.conv_filter_strides, - param.conv_filter_dilations, - param.input_left_pads, - param.input_right_pads); + param.conv_filter_strides_, + param.conv_filter_dilations_, + param.input_left_pads_, + param.input_right_pads_); pass &= ck::profiler::profile_convnd_bwd_data_impl<2, ck::bhalf_t, @@ -174,20 +174,20 @@ int main() ck::tensor_layout::convolution::NHWC, ck::tensor_layout::convolution::KYXC, ck::tensor_layout::convolution::NHWK>( - 1, // do_verification, - 1, // init_method, - 0, // do_log, - 1, // nrepeat, - param.N, - param.K, - param.C, - param.input_spatial_lengths, - param.filter_spatial_lengths, + true, // do_verification + 1, // init_method + false, // do_log + false, // time_kernel + param.N_, + param.K_, + param.C_, + param.input_spatial_lengths_, + param.filter_spatial_lengths_, param.GetOutputSpatialLengths(), - param.conv_filter_strides, - param.conv_filter_dilations, - param.input_left_pads, - param.input_right_pads); + param.conv_filter_strides_, + param.conv_filter_dilations_, + param.input_left_pads_, + param.input_right_pads_); pass &= ck::profiler::profile_convnd_bwd_data_impl<2, int8_t, @@ -197,20 +197,20 @@ int main() ck::tensor_layout::convolution::NHWC, ck::tensor_layout::convolution::KYXC, ck::tensor_layout::convolution::NHWK>( - 1, // do_verification, - 1, // init_method, - 0, // do_log, - 1, // nrepeat, - param.N, - param.K, - param.C, - param.input_spatial_lengths, - param.filter_spatial_lengths, + true, // do_verification + 1, // init_method + false, // do_log + false, // time_kernel + param.N_, + param.K_, + param.C_, + param.input_spatial_lengths_, + param.filter_spatial_lengths_, param.GetOutputSpatialLengths(), - param.conv_filter_strides, - param.conv_filter_dilations, - param.input_left_pads, - param.input_right_pads); + param.conv_filter_strides_, + param.conv_filter_dilations_, + param.input_left_pads_, + param.input_right_pads_); } // check 3d @@ -232,20 +232,20 @@ int main() ck::tensor_layout::convolution::NDHWC, ck::tensor_layout::convolution::KZYXC, ck::tensor_layout::convolution::NDHWK>( - 1, // do_verification, - 1, // init_method, - 0, // do_log, - 1, // nrepeat, - param.N, - param.K, - param.C, - param.input_spatial_lengths, - param.filter_spatial_lengths, + true, // do_verification + 1, // init_method + false, // do_log + false, // time_kernel + param.N_, + param.K_, + param.C_, + param.input_spatial_lengths_, + param.filter_spatial_lengths_, param.GetOutputSpatialLengths(), - param.conv_filter_strides, - param.conv_filter_dilations, - param.input_left_pads, - param.input_right_pads); + param.conv_filter_strides_, + param.conv_filter_dilations_, + param.input_left_pads_, + param.input_right_pads_); pass &= ck::profiler::profile_convnd_bwd_data_impl<3, ck::half_t, @@ -255,20 +255,20 @@ int main() ck::tensor_layout::convolution::NDHWC, ck::tensor_layout::convolution::KZYXC, ck::tensor_layout::convolution::NDHWK>( - 1, // do_verification, - 1, // init_method, - 0, // do_log, - 1, // nrepeat, - param.N, - param.K, - param.C, - param.input_spatial_lengths, - param.filter_spatial_lengths, + true, // do_verification + 1, // init_method + false, // do_log + false, // time_kernel + param.N_, + param.K_, + param.C_, + param.input_spatial_lengths_, + param.filter_spatial_lengths_, param.GetOutputSpatialLengths(), - param.conv_filter_strides, - param.conv_filter_dilations, - param.input_left_pads, - param.input_right_pads); + param.conv_filter_strides_, + param.conv_filter_dilations_, + param.input_left_pads_, + param.input_right_pads_); pass &= ck::profiler::profile_convnd_bwd_data_impl<3, ck::bhalf_t, @@ -278,20 +278,20 @@ int main() ck::tensor_layout::convolution::NDHWC, ck::tensor_layout::convolution::KZYXC, ck::tensor_layout::convolution::NDHWK>( - 1, // do_verification, - 1, // init_method, - 0, // do_log, - 1, // nrepeat, - param.N, - param.K, - param.C, - param.input_spatial_lengths, - param.filter_spatial_lengths, + true, // do_verification + 1, // init_method + false, // do_log + false, // time_kernel + param.N_, + param.K_, + param.C_, + param.input_spatial_lengths_, + param.filter_spatial_lengths_, param.GetOutputSpatialLengths(), - param.conv_filter_strides, - param.conv_filter_dilations, - param.input_left_pads, - param.input_right_pads); + param.conv_filter_strides_, + param.conv_filter_dilations_, + param.input_left_pads_, + param.input_right_pads_); pass &= ck::profiler::profile_convnd_bwd_data_impl<3, int8_t, @@ -301,20 +301,20 @@ int main() ck::tensor_layout::convolution::NDHWC, ck::tensor_layout::convolution::KZYXC, ck::tensor_layout::convolution::NDHWK>( - 1, // do_verification, - 1, // init_method, - 0, // do_log, - 1, // nrepeat, - param.N, - param.K, - param.C, - param.input_spatial_lengths, - param.filter_spatial_lengths, + true, // do_verification + 1, // init_method + false, // do_log + false, // time_kernel + param.N_, + param.K_, + param.C_, + param.input_spatial_lengths_, + param.filter_spatial_lengths_, param.GetOutputSpatialLengths(), - param.conv_filter_strides, - param.conv_filter_dilations, - param.input_left_pads, - param.input_right_pads); + param.conv_filter_strides_, + param.conv_filter_dilations_, + param.input_left_pads_, + param.input_right_pads_); } if(pass) diff --git a/test/convnd_fwd/CMakeLists.txt b/test/convnd_fwd/CMakeLists.txt index 1d2ae3e4e3..34e698681b 100644 --- a/test/convnd_fwd/CMakeLists.txt +++ b/test/convnd_fwd/CMakeLists.txt @@ -1,13 +1,13 @@ add_custom_target(test_convnd_fwd) add_gtest_executable(test_conv1d_fwd conv1d_fwd.cpp) -target_link_libraries(test_conv1d_fwd PRIVATE host_tensor device_conv1d_fwd_instance conv_fwd_util) +target_link_libraries(test_conv1d_fwd PRIVATE host_tensor device_conv1d_fwd_instance conv_util) add_dependencies(test_convnd_fwd test_conv1d_fwd) add_gtest_executable(test_conv2d_fwd conv2d_fwd.cpp) -target_link_libraries(test_conv2d_fwd PRIVATE host_tensor device_conv2d_fwd_instance conv_fwd_util) +target_link_libraries(test_conv2d_fwd PRIVATE host_tensor device_conv2d_fwd_instance conv_util) add_dependencies(test_convnd_fwd test_conv2d_fwd) add_gtest_executable(test_conv3d_fwd conv3d_fwd.cpp) -target_link_libraries(test_conv3d_fwd PRIVATE host_tensor device_conv3d_fwd_instance conv_fwd_util) +target_link_libraries(test_conv3d_fwd PRIVATE host_tensor device_conv3d_fwd_instance conv_util) add_dependencies(test_convnd_fwd test_conv3d_fwd) diff --git a/test/convnd_fwd/conv1d_fwd.cpp b/test/convnd_fwd/conv1d_fwd.cpp index c161b2795e..b6b6a89b2c 100644 --- a/test/convnd_fwd/conv1d_fwd.cpp +++ b/test/convnd_fwd/conv1d_fwd.cpp @@ -6,7 +6,7 @@ #include "data_type.hpp" #include "element_wise_operation.hpp" -#include "conv_fwd_util.hpp" +#include "library/include/ck/library/utility/conv_util.hpp" #include "conv_util.hpp" namespace { @@ -19,13 +19,13 @@ bool test_conv1d_nwc_instances(const std::vector{3}; - params.input_spatial_lengths = std::vector{71}; - params.conv_filter_strides = std::vector{2}; - params.conv_filter_dilations = std::vector{1}; - params.input_left_pads = std::vector{1}; - params.input_right_pads = std::vector{1}; + params.num_dim_spatial_ = 1; + params.filter_spatial_lengths_ = std::vector{3}; + params.input_spatial_lengths_ = std::vector{71}; + params.conv_filter_strides_ = std::vector{2}; + params.conv_filter_dilations_ = std::vector{1}; + params.input_left_pads_ = std::vector{1}; + params.input_right_pads_ = std::vector{1}; conv::ConvFwdOpInstance conv_instance(params); @@ -44,16 +44,16 @@ TEST(Conv1DFwdNWC, TestConv1D) namespace ctl = ck::tensor_layout::convolution; ck::utils::conv::ConvParams params; - params.num_dim_spatial = 1; - params.N = 2; - params.K = 16; - params.C = 4; - params.filter_spatial_lengths = std::vector{3}; - params.input_spatial_lengths = std::vector{16}; - params.conv_filter_strides = std::vector{1}; - params.conv_filter_dilations = std::vector{1}; - params.input_left_pads = std::vector{1}; - params.input_right_pads = std::vector{1}; + params.num_dim_spatial_ = 1; + params.N_ = 2; + params.K_ = 16; + params.C_ = 4; + params.filter_spatial_lengths_ = std::vector{3}; + params.input_spatial_lengths_ = std::vector{16}; + params.conv_filter_strides_ = std::vector{1}; + params.conv_filter_dilations_ = std::vector{1}; + params.input_left_pads_ = std::vector{1}; + params.input_right_pads_ = std::vector{1}; std::vector conv_ptrs; test::conv::get_test_convolution_fwd_instance<1>(conv_ptrs); diff --git a/test/convnd_fwd/conv2d_fwd.cpp b/test/convnd_fwd/conv2d_fwd.cpp index e3815f778a..05e46147be 100644 --- a/test/convnd_fwd/conv2d_fwd.cpp +++ b/test/convnd_fwd/conv2d_fwd.cpp @@ -6,7 +6,7 @@ #include "data_type.hpp" #include "element_wise_operation.hpp" -#include "conv_fwd_util.hpp" +#include "ck/library/utility/conv_util.hpp" #include "conv_util.hpp" namespace { @@ -18,13 +18,13 @@ bool test_conv2d_nhwc_instances(const std::vector{3, 3}; - params.input_spatial_lengths = std::vector{71, 71}; - params.conv_filter_strides = std::vector{2, 2}; - params.conv_filter_dilations = std::vector{1, 1}; - params.input_left_pads = std::vector{1, 1}; - params.input_right_pads = std::vector{1, 1}; + params.num_dim_spatial_ = 2; + params.filter_spatial_lengths_ = std::vector{3, 3}; + params.input_spatial_lengths_ = std::vector{71, 71}; + params.conv_filter_strides_ = std::vector{2, 2}; + params.conv_filter_dilations_ = std::vector{1, 1}; + params.input_left_pads_ = std::vector{1, 1}; + params.input_right_pads_ = std::vector{1, 1}; conv::ConvFwdOpInstance conv_instance(params); @@ -42,11 +42,11 @@ TEST(Conv2DFwdNHWC, TestConv2D) using namespace ck::utils; ck::utils::conv::ConvParams params; - params.N = 2; - params.K = 16; - params.C = 4; - params.input_spatial_lengths = std::vector{16, 16}; - params.conv_filter_strides = std::vector{1, 1}; + params.N_ = 2; + params.K_ = 16; + params.C_ = 4; + params.input_spatial_lengths_ = std::vector{16, 16}; + params.conv_filter_strides_ = std::vector{1, 1}; std::vector conv_ptrs; test::conv::get_test_convolution_fwd_instance<2>(conv_ptrs); diff --git a/test/convnd_fwd/conv3d_fwd.cpp b/test/convnd_fwd/conv3d_fwd.cpp index fc3da3e9c7..c6f0e7ec07 100644 --- a/test/convnd_fwd/conv3d_fwd.cpp +++ b/test/convnd_fwd/conv3d_fwd.cpp @@ -7,7 +7,7 @@ #include "data_type.hpp" #include "element_wise_operation.hpp" -#include "conv_fwd_util.hpp" +#include "library/include/ck/library/utility/conv_util.hpp" #include "conv_util.hpp" namespace { @@ -20,14 +20,14 @@ bool test_conv3d_ndhwc_instances(const std::vector{3, 3, 2}; - params.input_spatial_lengths = std::vector{32, 32, 2}; - params.conv_filter_strides = std::vector{2, 2, 2}; - params.conv_filter_dilations = std::vector{1, 1, 1}; - params.input_left_pads = std::vector{1, 1, 1}; - params.input_right_pads = std::vector{1, 1, 1}; + params.N_ = 64; + params.num_dim_spatial_ = 3; + params.filter_spatial_lengths_ = std::vector{3, 3, 2}; + params.input_spatial_lengths_ = std::vector{32, 32, 2}; + params.conv_filter_strides_ = std::vector{2, 2, 2}; + params.conv_filter_dilations_ = std::vector{1, 1, 1}; + params.input_left_pads_ = std::vector{1, 1, 1}; + params.input_right_pads_ = std::vector{1, 1, 1}; conv::ConvFwdOpInstance conv_instance(params); @@ -46,16 +46,16 @@ TEST(Conv3DFwdNDHWC, TestConv3D) namespace ctl = ck::tensor_layout::convolution; conv::ConvParams params; - params.num_dim_spatial = 3; - params.N = 2; - params.K = 16; - params.C = 4; - params.filter_spatial_lengths = std::vector{3, 3, 3}; - params.input_spatial_lengths = std::vector{16, 16, 16}; - params.conv_filter_strides = std::vector{1, 1, 1}; - params.conv_filter_dilations = std::vector{1, 1, 1}; - params.input_left_pads = std::vector{1, 1, 1}; - params.input_right_pads = std::vector{1, 1, 1}; + params.num_dim_spatial_ = 3; + params.N_ = 2; + params.K_ = 16; + params.C_ = 4; + params.filter_spatial_lengths_ = std::vector{3, 3, 3}; + params.input_spatial_lengths_ = std::vector{16, 16, 16}; + params.conv_filter_strides_ = std::vector{1, 1, 1}; + params.conv_filter_dilations_ = std::vector{1, 1, 1}; + params.input_left_pads_ = std::vector{1, 1, 1}; + params.input_right_pads_ = std::vector{1, 1, 1}; std::vector conv_ptrs; test::conv::get_test_convolution_fwd_instance<3>(conv_ptrs); @@ -77,16 +77,16 @@ TEST(Conv3DFwdNDHWC, InputOver2GB) // >2GB Input conv::ConvParams params; - params.num_dim_spatial = 3; - params.N = 2; - params.K = 16; - params.C = 32; - params.filter_spatial_lengths = std::vector{3, 3, 3}; - params.input_spatial_lengths = std::vector{32, 1000, 1000}; - params.conv_filter_strides = std::vector{1, 1, 1}; - params.conv_filter_dilations = std::vector{1, 1, 1}; - params.input_left_pads = std::vector{1, 1, 1}; - params.input_right_pads = std::vector{1, 1, 1}; + params.num_dim_spatial_ = 3; + params.N_ = 2; + params.K_ = 16; + params.C_ = 32; + params.filter_spatial_lengths_ = std::vector{3, 3, 3}; + params.input_spatial_lengths_ = std::vector{32, 1000, 1000}; + params.conv_filter_strides_ = std::vector{1, 1, 1}; + params.conv_filter_dilations_ = std::vector{1, 1, 1}; + params.input_left_pads_ = std::vector{1, 1, 1}; + params.input_right_pads_ = std::vector{1, 1, 1}; std::vector conv_ptrs; test::conv::get_test_convolution_fwd_instance<3>(conv_ptrs); @@ -94,16 +94,16 @@ TEST(Conv3DFwdNDHWC, InputOver2GB) auto arg = conv_ptrs.back()->MakeArgumentPointer(nullptr, nullptr, nullptr, - params.N, - params.K, - params.C, - params.input_spatial_lengths, - params.filter_spatial_lengths, + params.N_, + params.K_, + params.C_, + params.input_spatial_lengths_, + params.filter_spatial_lengths_, params.GetOutputSpatialLengths(), - params.conv_filter_strides, - params.conv_filter_dilations, - params.input_left_pads, - params.input_right_pads, + params.conv_filter_strides_, + params.conv_filter_dilations_, + params.input_left_pads_, + params.input_right_pads_, PassThrough{}, PassThrough{}, PassThrough{}); @@ -117,16 +117,16 @@ TEST(Conv3DFwdNDHWC, FiltersOver2GB) // >2GB Filters conv::ConvParams params; - params.num_dim_spatial = 3; - params.N = 2; - params.K = 16; - params.C = 32; - params.filter_spatial_lengths = std::vector{4, 1000, 1000}; - params.input_spatial_lengths = std::vector{16, 16, 16}; - params.conv_filter_strides = std::vector{1, 1, 1}; - params.conv_filter_dilations = std::vector{1, 1, 1}; - params.input_left_pads = std::vector{1, 1, 1}; - params.input_right_pads = std::vector{1, 1, 1}; + params.num_dim_spatial_ = 3; + params.N_ = 2; + params.K_ = 16; + params.C_ = 32; + params.filter_spatial_lengths_ = std::vector{4, 1000, 1000}; + params.input_spatial_lengths_ = std::vector{16, 16, 16}; + params.conv_filter_strides_ = std::vector{1, 1, 1}; + params.conv_filter_dilations_ = std::vector{1, 1, 1}; + params.input_left_pads_ = std::vector{1, 1, 1}; + params.input_right_pads_ = std::vector{1, 1, 1}; std::vector conv_ptrs; test::conv::get_test_convolution_fwd_instance<3>(conv_ptrs); @@ -134,16 +134,16 @@ TEST(Conv3DFwdNDHWC, FiltersOver2GB) auto arg = conv_ptrs.back()->MakeArgumentPointer(nullptr, nullptr, nullptr, - params.N, - params.K, - params.C, - params.input_spatial_lengths, - params.filter_spatial_lengths, + params.N_, + params.K_, + params.C_, + params.input_spatial_lengths_, + params.filter_spatial_lengths_, params.GetOutputSpatialLengths(), - params.conv_filter_strides, - params.conv_filter_dilations, - params.input_left_pads, - params.input_right_pads, + params.conv_filter_strides_, + params.conv_filter_dilations_, + params.input_left_pads_, + params.input_right_pads_, PassThrough{}, PassThrough{}, PassThrough{}); @@ -157,32 +157,32 @@ TEST(Conv3DFwdNDHWC, OutputOver2GB) // >2GB Output conv::ConvParams params; - params.num_dim_spatial = 3; - params.N = 2; - params.K = 16; - params.C = 2; - params.filter_spatial_lengths = std::vector{1, 1, 1}; - params.input_spatial_lengths = std::vector{1000, 1000, 30}; - params.conv_filter_strides = std::vector{1, 1, 1}; - params.conv_filter_dilations = std::vector{1, 1, 1}; - params.input_left_pads = std::vector{2, 2, 2}; - params.input_right_pads = std::vector{2, 2, 2}; + params.num_dim_spatial_ = 3; + params.N_ = 2; + params.K_ = 16; + params.C_ = 2; + params.filter_spatial_lengths_ = std::vector{1, 1, 1}; + params.input_spatial_lengths_ = std::vector{1000, 1000, 30}; + params.conv_filter_strides_ = std::vector{1, 1, 1}; + params.conv_filter_dilations_ = std::vector{1, 1, 1}; + params.input_left_pads_ = std::vector{2, 2, 2}; + params.input_right_pads_ = std::vector{2, 2, 2}; std::vector conv_ptrs; test::conv::get_test_convolution_fwd_instance<3>(conv_ptrs); auto arg = conv_ptrs.back()->MakeArgumentPointer(nullptr, nullptr, nullptr, - params.N, - params.K, - params.C, - params.input_spatial_lengths, - params.filter_spatial_lengths, + params.N_, + params.K_, + params.C_, + params.input_spatial_lengths_, + params.filter_spatial_lengths_, params.GetOutputSpatialLengths(), - params.conv_filter_strides, - params.conv_filter_dilations, - params.input_left_pads, - params.input_right_pads, + params.conv_filter_strides_, + params.conv_filter_dilations_, + params.input_left_pads_, + params.input_right_pads_, PassThrough{}, PassThrough{}, PassThrough{}); diff --git a/test/convnd_fwd/conv_util.hpp b/test/convnd_fwd/conv_util.hpp index 4f77101563..09f641b415 100644 --- a/test/convnd_fwd/conv_util.hpp +++ b/test/convnd_fwd/conv_util.hpp @@ -4,7 +4,6 @@ #include #include "config.hpp" -#include "conv_fwd_util.hpp" #include "device_convnd_fwd_xdl_nhwc_kyxc_nhwk.hpp" #include "element_wise_operation.hpp" #include "host_tensor.hpp" diff --git a/test/gemm_reduce/gemm_reduce_fp16.cpp b/test/gemm_reduce/gemm_reduce_fp16.cpp index 8deb66b2b0..6c7bb9658f 100644 --- a/test/gemm_reduce/gemm_reduce_fp16.cpp +++ b/test/gemm_reduce/gemm_reduce_fp16.cpp @@ -16,22 +16,22 @@ int main() pass = pass && ck::profiler:: profile_gemm_reduce_impl( - true, 1, false, 1, M, N, K, K, N, N); + true, 1, false, false, M, N, K, K, N, N); pass = pass && ck::profiler:: profile_gemm_reduce_impl( - true, 1, false, 1, M, N, K, K, K, N); + true, 1, false, false, M, N, K, K, K, N); pass = pass && ck::profiler:: profile_gemm_reduce_impl( - true, 1, false, 1, M, N, K, M, N, N); + true, 1, false, false, M, N, K, M, N, N); pass = pass && ck::profiler:: profile_gemm_reduce_impl( - true, 1, false, 1, M, N, K, M, K, N); + true, 1, false, false, M, N, K, M, K, N); if(pass) { diff --git a/test/gemm_split_k/gemm_split_k.cpp b/test/gemm_split_k/gemm_split_k.cpp index c788b66aa3..b63361aa1b 100644 --- a/test/gemm_split_k/gemm_split_k.cpp +++ b/test/gemm_split_k/gemm_split_k.cpp @@ -187,9 +187,10 @@ int test_gemm(const gemmArgs& args) if(gemm_ptr->IsSupportedArgument(argument_ptr.get())) { - invoker_ptr->Run(argument_ptr.get(), 0); + invoker_ptr->Run(argument_ptr.get()); c_device_buf.FromDevice(c_m_n_device_result.mData.data()); + if(!check_out(c_m_n_host_result, c_m_n_device_result)) { success = false; diff --git a/test/reference_conv_fwd/CMakeLists.txt b/test/reference_conv_fwd/CMakeLists.txt index e5a7b31aff..04b720b169 100644 --- a/test/reference_conv_fwd/CMakeLists.txt +++ b/test/reference_conv_fwd/CMakeLists.txt @@ -1,2 +1,2 @@ add_gtest_executable(test_reference_conv_fwd reference_conv_fwd.cpp) -target_link_libraries(test_reference_conv_fwd PRIVATE host_tensor conv_fwd_util) +target_link_libraries(test_reference_conv_fwd PRIVATE host_tensor conv_util) diff --git a/test/reference_conv_fwd/reference_conv_fwd.cpp b/test/reference_conv_fwd/reference_conv_fwd.cpp index f660559e62..69b223989f 100644 --- a/test/reference_conv_fwd/reference_conv_fwd.cpp +++ b/test/reference_conv_fwd/reference_conv_fwd.cpp @@ -8,7 +8,7 @@ #include "check_err.hpp" #include "config.hpp" -#include "conv_fwd_util.hpp" +#include "conv_util.hpp" #include "element_wise_operation.hpp" #include "fill.hpp" #include "host_tensor.hpp" @@ -34,21 +34,21 @@ run_reference_convolution_forward(const ck::utils::conv::ConvParams& params, const FillInputOp& fill_input_op = FillInputOp{}, const FillWeightsOp& fill_weights_op = FillWeightsOp{0.5f}) { - std::vector input_dims{static_cast(params.N), - static_cast(params.C)}; + std::vector input_dims{static_cast(params.N_), + static_cast(params.C_)}; input_dims.insert(std::end(input_dims), - std::begin(params.input_spatial_lengths), - std::end(params.input_spatial_lengths)); + std::begin(params.input_spatial_lengths_), + std::end(params.input_spatial_lengths_)); - std::vector filter_dims{static_cast(params.K), - static_cast(params.C)}; + std::vector filter_dims{static_cast(params.K_), + static_cast(params.C_)}; filter_dims.insert(std::end(filter_dims), - std::begin(params.filter_spatial_lengths), - std::end(params.filter_spatial_lengths)); + std::begin(params.filter_spatial_lengths_), + std::end(params.filter_spatial_lengths_)); const std::vector& output_spatial_lengths = params.GetOutputSpatialLengths(); - std::vector output_dims{static_cast(params.N), - static_cast(params.K)}; + std::vector output_dims{static_cast(params.N_), + static_cast(params.K_)}; output_dims.insert(std::end(output_dims), std::begin(output_spatial_lengths), std::end(output_spatial_lengths)); @@ -74,10 +74,10 @@ run_reference_convolution_forward(const ck::utils::conv::ConvParams& params, auto ref_argument = ref_conv.MakeArgument(input, weights, host_output, - params.conv_filter_strides, - params.conv_filter_dilations, - params.input_left_pads, - params.input_right_pads, + params.conv_filter_strides_, + params.conv_filter_dilations_, + params.input_left_pads_, + params.input_right_pads_, InElementOp{}, WeiElementOp{}, OutElementOp{}); @@ -91,15 +91,15 @@ run_reference_convolution_forward(const ck::utils::conv::ConvParams& params, TEST(ReferenceConvolutionFWD, Conv2DNHWC) { ck::utils::conv::ConvParams params; - params.N = 1; - params.K = 1; - params.C = 2; - params.filter_spatial_lengths = std::vector{3, 3}; - params.input_spatial_lengths = std::vector{6, 6}; - params.conv_filter_strides = std::vector{1, 1}; - params.conv_filter_dilations = std::vector{1, 1}; - params.input_left_pads = std::vector{0, 0}; - params.input_right_pads = std::vector{0, 0}; + params.N_ = 1; + params.K_ = 1; + params.C_ = 2; + params.filter_spatial_lengths_ = std::vector{3, 3}; + params.input_spatial_lengths_ = std::vector{6, 6}; + params.conv_filter_strides_ = std::vector{1, 1}; + params.conv_filter_dilations_ = std::vector{1, 1}; + params.input_left_pads_ = std::vector{0, 0}; + params.input_right_pads_ = std::vector{0, 0}; auto out_tensor = run_reference_convolution_forward<2>(params); std::vector ref_dims{1, 1, 4, 4}; @@ -127,15 +127,15 @@ TEST(ReferenceConvolutionFWD, Conv2DNHWC) TEST(ReferenceConvolutionFWD, Conv2DNHWCStridesDilationsPadding) { ck::utils::conv::ConvParams params; - params.N = 1; - params.K = 2; - params.C = 2; - params.filter_spatial_lengths = std::vector{3, 3}; - params.input_spatial_lengths = std::vector{12, 12}; - params.conv_filter_strides = std::vector{2, 2}; - params.conv_filter_dilations = std::vector{2, 2}; - params.input_left_pads = std::vector{1, 1}; - params.input_right_pads = std::vector{1, 1}; + params.N_ = 1; + params.K_ = 2; + params.C_ = 2; + params.filter_spatial_lengths_ = std::vector{3, 3}; + params.input_spatial_lengths_ = std::vector{12, 12}; + params.conv_filter_strides_ = std::vector{2, 2}; + params.conv_filter_dilations_ = std::vector{2, 2}; + params.input_left_pads_ = std::vector{1, 1}; + params.input_right_pads_ = std::vector{1, 1}; auto out_tensor = run_reference_convolution_forward<2>(params); std::vector ref_dims = std::vector{1, 2, 5, 5}; @@ -153,16 +153,16 @@ TEST(ReferenceConvolutionFWD, Conv2DNHWCStridesDilationsPadding) TEST(ReferenceConvolutionFWD, Conv1DNWC) { ck::utils::conv::ConvParams params; - params.num_dim_spatial = 1; - params.N = 1; - params.K = 1; - params.C = 2; - params.filter_spatial_lengths = std::vector{3}; - params.input_spatial_lengths = std::vector{6}; - params.conv_filter_strides = std::vector{1}; - params.conv_filter_dilations = std::vector{1}; - params.input_left_pads = std::vector{0}; - params.input_right_pads = std::vector{0}; + params.num_dim_spatial_ = 1; + params.N_ = 1; + params.K_ = 1; + params.C_ = 2; + params.filter_spatial_lengths_ = std::vector{3}; + params.input_spatial_lengths_ = std::vector{6}; + params.conv_filter_strides_ = std::vector{1}; + params.conv_filter_dilations_ = std::vector{1}; + params.input_left_pads_ = std::vector{0}; + params.input_right_pads_ = std::vector{0}; auto out_tensor = run_reference_convolution_forward<1, @@ -182,16 +182,16 @@ TEST(ReferenceConvolutionFWD, Conv1DNWC) TEST(ReferenceConvolutionFWD, Conv1DNWCStridesDilationsPadding) { ck::utils::conv::ConvParams params; - params.num_dim_spatial = 1; - params.N = 1; - params.K = 2; - params.C = 2; - params.filter_spatial_lengths = std::vector{3}; - params.input_spatial_lengths = std::vector{12}; - params.conv_filter_strides = std::vector{2}; - params.conv_filter_dilations = std::vector{2}; - params.input_left_pads = std::vector{1}; - params.input_right_pads = std::vector{1}; + params.num_dim_spatial_ = 1; + params.N_ = 1; + params.K_ = 2; + params.C_ = 2; + params.filter_spatial_lengths_ = std::vector{3}; + params.input_spatial_lengths_ = std::vector{12}; + params.conv_filter_strides_ = std::vector{2}; + params.conv_filter_dilations_ = std::vector{2}; + params.input_left_pads_ = std::vector{1}; + params.input_right_pads_ = std::vector{1}; auto out_tensor = run_reference_convolution_forward<1, @@ -211,16 +211,16 @@ TEST(ReferenceConvolutionFWD, Conv1DNWCStridesDilationsPadding) TEST(ReferenceConvolutionFWD, Conv1DNWCSameOutputSize) { ck::utils::conv::ConvParams params; - params.num_dim_spatial = 1; - params.N = 2; - params.K = 16; - params.C = 4; - params.filter_spatial_lengths = std::vector{3}; - params.input_spatial_lengths = std::vector{16}; - params.conv_filter_strides = std::vector{1}; - params.conv_filter_dilations = std::vector{1}; - params.input_left_pads = std::vector{1}; - params.input_right_pads = std::vector{1}; + params.num_dim_spatial_ = 1; + params.N_ = 2; + params.K_ = 16; + params.C_ = 4; + params.filter_spatial_lengths_ = std::vector{3}; + params.input_spatial_lengths_ = std::vector{16}; + params.conv_filter_strides_ = std::vector{1}; + params.conv_filter_dilations_ = std::vector{1}; + params.input_left_pads_ = std::vector{1}; + params.input_right_pads_ = std::vector{1}; auto out_tensor2 = run_reference_convolution_forward<1, float, @@ -305,16 +305,16 @@ TEST(ReferenceConvolutionFWD, Conv1DNWCSameOutputSize) TEST(ReferenceConvolutionFWD, Conv3DNCDHW) { ck::utils::conv::ConvParams params; - params.num_dim_spatial = 3; - params.N = 1; - params.K = 1; - params.C = 2; - params.filter_spatial_lengths = std::vector{3, 3, 3}; - params.input_spatial_lengths = std::vector{6, 6, 6}; - params.conv_filter_strides = std::vector{1, 1, 1}; - params.conv_filter_dilations = std::vector{1, 1, 1}; - params.input_left_pads = std::vector{0, 0, 0}; - params.input_right_pads = std::vector{0, 0, 0}; + params.num_dim_spatial_ = 3; + params.N_ = 1; + params.K_ = 1; + params.C_ = 2; + params.filter_spatial_lengths_ = std::vector{3, 3, 3}; + params.input_spatial_lengths_ = std::vector{6, 6, 6}; + params.conv_filter_strides_ = std::vector{1, 1, 1}; + params.conv_filter_dilations_ = std::vector{1, 1, 1}; + params.input_left_pads_ = std::vector{0, 0, 0}; + params.input_right_pads_ = std::vector{0, 0, 0}; auto out_tensor = run_reference_convolution_forward<3, float, @@ -344,16 +344,16 @@ TEST(ReferenceConvolutionFWD, Conv3DNCDHW) TEST(ReferenceConvolutionFWD, Conv3DNCDHWStridesDilations) { ck::utils::conv::ConvParams params; - params.num_dim_spatial = 3; - params.N = 1; - params.K = 2; - params.C = 2; - params.filter_spatial_lengths = std::vector{3, 3, 3}; - params.input_spatial_lengths = std::vector{12, 12, 12}; - params.conv_filter_strides = std::vector{3, 3, 3}; - params.conv_filter_dilations = std::vector{1, 1, 1}; - params.input_left_pads = std::vector{0, 0, 0}; - params.input_right_pads = std::vector{0, 0, 0}; + params.num_dim_spatial_ = 3; + params.N_ = 1; + params.K_ = 2; + params.C_ = 2; + params.filter_spatial_lengths_ = std::vector{3, 3, 3}; + params.input_spatial_lengths_ = std::vector{12, 12, 12}; + params.conv_filter_strides_ = std::vector{3, 3, 3}; + params.conv_filter_dilations_ = std::vector{1, 1, 1}; + params.input_left_pads_ = std::vector{0, 0, 0}; + params.input_right_pads_ = std::vector{0, 0, 0}; auto out_tensor = run_reference_convolution_forward<3, float,