diff --git a/CMakeLists.txt b/CMakeLists.txt
index 2b798e38f3..a3ec91e3bc 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -27,6 +27,8 @@ set(CMAKE_CXX_STANDARD_REQUIRED ON)
set(CMAKE_CXX_EXTENSIONS OFF)
message("CMAKE_CXX_COMPILER_ID: ${CMAKE_CXX_COMPILER_ID}")
+option(CK_TIME_KERNEL "Turning off will disable kernel timing globally" ON)
+
## OpenMP
if(CMAKE_CXX_COMPILER_ID MATCHES "Clang")
# workaround issue hipcc in rocm3.5 cannot find openmp
@@ -72,8 +74,9 @@ message(STATUS "Build with HIP ${HIP_VERSION}")
rocm_create_package(
- NAME CK-${CK_BACKEND}
+ NAME composablekernel
DESCRIPTION "High Performance Composable Kernel for AMD GPUs"
+ MAINTAINER "MIOpen Kernels Dev Team
"
LDCONFIG
)
@@ -226,7 +229,7 @@ set(CMAKE_LIBRARY_OUTPUT_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/lib)
set(CMAKE_ARCHIVE_OUTPUT_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/lib)
set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/bin)
-configure_file("${PROJECT_SOURCE_DIR}/include/ck/hip_version.hpp.in" "${PROJECT_BINARY_DIR}/include/ck/hip_version.hpp")
+configure_file("${PROJECT_SOURCE_DIR}/include/ck/options.hpp.in" "${PROJECT_BINARY_DIR}/include/ck/options.hpp")
include_directories(BEFORE
${PROJECT_SOURCE_DIR}/include
@@ -234,7 +237,6 @@ include_directories(BEFORE
${PROJECT_SOURCE_DIR}/library/include
)
-include(googletest)
SET(BUILD_DEV ON CACHE BOOL "BUILD_DEV")
if(BUILD_DEV)
@@ -243,7 +245,31 @@ if(BUILD_DEV)
endif()
message("CMAKE_CXX_FLAGS: ${CMAKE_CXX_FLAGS}")
+add_custom_target(check COMMAND ${CMAKE_CTEST_COMMAND} --output-on-failure -C ${CMAKE_CFG_INTDIR})
+
add_subdirectory(library)
add_subdirectory(example)
add_subdirectory(test)
add_subdirectory(profiler)
+
+#Create an interface target for the include only files and call it "composablekernels"
+include(CMakePackageConfigHelpers)
+
+set(version 1.0.0)
+write_basic_package_version_file(
+ "${CMAKE_CURRENT_BINARY_DIR}/composable_kernelConfigVersion.cmake"
+ VERSION "${version}"
+ COMPATIBILITY AnyNewerVersion
+)
+
+configure_package_config_file(${CMAKE_CURRENT_SOURCE_DIR}/Config.cmake.in
+ "${CMAKE_CURRENT_BINARY_DIR}/composable_kernelConfig.cmake"
+ INSTALL_DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/composable_kernel
+ NO_CHECK_REQUIRED_COMPONENTS_MACRO
+)
+
+install(FILES
+ "${CMAKE_CURRENT_BINARY_DIR}/composable_kernelConfig.cmake"
+ "${CMAKE_CURRENT_BINARY_DIR}/composable_kernelConfigVersion.cmake"
+ DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/composable_kernel
+)
diff --git a/Config.cmake.in b/Config.cmake.in
new file mode 100644
index 0000000000..12b5c331ae
--- /dev/null
+++ b/Config.cmake.in
@@ -0,0 +1,11 @@
+@PACKAGE_INIT@
+
+set(_composable_kernel_supported_components device_operations host_tensor)
+
+foreach(_comp ${composable_kernel_FIND_COMPONENTS})
+ if(NOT _comp IN_LIST _composable_kernel_supported_components)
+ set(composable_kernel_FOUND False)
+ set(composable_kernel_NOT_FOUND_MESSAGE "Unsupported component: ${_comp}")
+ endif()
+ include("${CMAKE_CURRENT_LIST_DIR}/composable_kernel${_comp}Targets.cmake")
+endforeach()
diff --git a/Dockerfile b/Dockerfile
index c4cf0fac57..9a443e01de 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -11,13 +11,7 @@ ARG DEB_ROCM_REPO=http://repo.radeon.com/rocm/apt/.apt_$ROCMVERSION/
RUN apt-get update
RUN apt-get install -y wget gnupg
RUN wget -qO - http://repo.radeon.com/rocm/rocm.gpg.key | apt-key add -
-RUN if ! [ -z $OSDB_BKC_VERSION ]; then \
- echo "Using BKC VERISION: $OSDB_BKC_VERSION";\
- sh -c "echo deb [arch=amd64 trusted=yes] http://compute-artifactory.amd.com/artifactory/list/rocm-osdb-deb/ compute-rocm-dkms-no-npi-hipclang ${OSDB_BKC_VERSION} > /etc/apt/sources.list.d/rocm.list" ;\
- cat /etc/apt/sources.list.d/rocm.list;\
- else \
- sh -c "echo deb [arch=amd64] $DEB_ROCM_REPO ubuntu main > /etc/apt/sources.list.d/rocm.list" ;\
- fi
+RUN sh -c "echo deb [arch=amd64] $DEB_ROCM_REPO ubuntu main > /etc/apt/sources.list.d/rocm.list"
RUN wget --no-check-certificate -qO - https://apt.kitware.com/keys/kitware-archive-latest.asc 2>/dev/null | apt-key add -
RUN sh -c "echo deb https://apt.kitware.com/ubuntu/ bionic main | tee -a /etc/apt/sources.list"
@@ -25,18 +19,15 @@ RUN sh -c "echo deb https://apt.kitware.com/ubuntu/ bionic main | tee -a /etc/ap
# Install dependencies
RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --allow-unauthenticated \
apt-utils \
- sshpass \
build-essential \
cmake-data=3.15.1-0kitware1 \
cmake=3.15.1-0kitware1 \
curl \
- doxygen \
g++ \
gdb \
git \
hip-rocclr \
jq \
- lcov \
libelf-dev \
libncurses5-dev \
libnuma-dev \
@@ -62,8 +53,6 @@ RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --allow-
apt-get clean && \
rm -rf /var/lib/apt/lists/*
-# RUN pip3 install --default-timeout=100000 -r requirements.txt
-
# Setup ubsan environment to printstacktrace
RUN ln -s /usr/bin/llvm-symbolizer-3.8 /usr/local/bin/llvm-symbolizer
ENV UBSAN_OPTIONS=print_stacktrace=1
@@ -92,5 +81,3 @@ ADD rbuild.ini /rbuild.ini
ADD dev-requirements.txt dev-requirements.txt
RUN rbuild prepare -s develop -d $PREFIX
RUN groupadd -f render
-# RUN cget install -f min-requirements.txt
-# RUN CXXFLAGS='-isystem $PREFIX/include' cget install -f ./mlir-requirements.txt
diff --git a/Jenkinsfile b/Jenkinsfile
index f065d4ecc5..77f4d9d8be 100644
--- a/Jenkinsfile
+++ b/Jenkinsfile
@@ -320,7 +320,7 @@ pipeline {
{
agent{ label rocmnode("gfx908")}
environment{
- setup_args = """ -D CMAKE_CXX_FLAGS="--offload-arch=gfx908 -O3 " -DBUILD_DEV=On """
+ setup_args = """ -D CMAKE_CXX_FLAGS=" --offload-arch=gfx900 --offload-arch=gfx906 --offload-arch=gfx908 --offload-arch=gfx90a -O3 " -DBUILD_DEV=On """
}
steps{
buildHipClangJobAndReboot(setup_args:setup_args, config_targets: "check", no_reboot:true, build_type: 'Release')
@@ -341,6 +341,23 @@ pipeline {
}
}
+ stage("Client App")
+ {
+ parallel
+ {
+ stage("Run Client App")
+ {
+ agent{ label rocmnode("gfx908")}
+ environment{
+ setup_args = """ -D -DBUILD_DEV=Off -DCMAKE_INSTALL_PREFIX=../install CMAKE_CXX_FLAGS="--offload-arch=gfx908 -O3 " """
+ execute_args = """ cd ../test/client_app && rm -rf build && mkdir build && cd build && cmake -DCMAKE_PREFIX_PATH="${env.WORKSPACE}/install;/opt/rocm" .. && make """
+ }
+ steps{
+ buildHipClangJobAndReboot(setup_args: setup_args, config_targets: "install", no_reboot:true, build_type: 'Release', execute_cmd: execute_args, prefixpath: '/usr/local')
+ }
+ }
+ }
+ }
stage("Performance Tests")
{
parallel
diff --git a/README.md b/README.md
index f5341b5736..9d7b578046 100644
--- a/README.md
+++ b/README.md
@@ -43,3 +43,13 @@ Instructions for running each individual examples are under ```example/```
make -j ckProfiler
```
Instructions for running ckProfiler are under ```profiler/```
+
+
+## Caveat
+### Kernel Timing and Verification
+CK's own kernel timer will warn up kernel once, and then run it multiple times
+to get average kernel time. For some kernels that use atomic add, this will cause
+output buffer to be accumulated multiple times, causing verfication failure.
+To work around it, do not use CK's own timer and do verification at the same time.
+CK's own timer and verification in each example and ckProfiler can be enabled or
+disabled from command line.
diff --git a/cmake/googletest.cmake b/cmake/googletest.cmake
index c7e70cc8a9..959bc4f4b0 100644
--- a/cmake/googletest.cmake
+++ b/cmake/googletest.cmake
@@ -18,6 +18,8 @@ list(APPEND GTEST_CMAKE_CXX_FLAGS
-Wno-switch-enum
-Wno-zero-as-null-pointer-constant
-Wno-unused-member-function
+ -Wno-comma
+ -Wno-old-style-cast
)
message(STATUS "Suppressing googltest warnings with flags: ${GTEST_CMAKE_CXX_FLAGS}")
@@ -33,4 +35,5 @@ FetchContent_MakeAvailable(googletest)
target_compile_options(gtest PRIVATE ${GTEST_CMAKE_CXX_FLAGS})
target_compile_options(gtest_main PRIVATE ${GTEST_CMAKE_CXX_FLAGS})
-
+target_compile_options(gmock PRIVATE ${GTEST_CMAKE_CXX_FLAGS})
+target_compile_options(gmock_main PRIVATE ${GTEST_CMAKE_CXX_FLAGS})
diff --git a/example/01_gemm/gemm_xdl_bf16.cpp b/example/01_gemm/gemm_xdl_bf16.cpp
index a4567dcd6e..060750e676 100644
--- a/example/01_gemm/gemm_xdl_bf16.cpp
+++ b/example/01_gemm/gemm_xdl_bf16.cpp
@@ -88,9 +88,9 @@ using ReferenceGemmInstance = ck::tensor_operation::host::
int main(int argc, char* argv[])
{
- bool do_verification = 0;
- int init_method = 0;
- int nrepeat = 5;
+ bool do_verification = true;
+ int init_method = 1;
+ bool time_kernel = false;
// GEMM shape
ck::index_t M = 3840;
@@ -105,13 +105,13 @@ int main(int argc, char* argv[])
{
do_verification = std::stoi(argv[1]);
init_method = std::stoi(argv[2]);
- nrepeat = std::stoi(argv[3]);
+ time_kernel = std::stoi(argv[3]);
}
else if(argc == 10)
{
do_verification = std::stoi(argv[1]);
init_method = std::stoi(argv[2]);
- nrepeat = std::stoi(argv[3]);
+ time_kernel = std::stoi(argv[3]);
M = std::stoi(argv[4]);
N = std::stoi(argv[5]);
@@ -125,7 +125,7 @@ int main(int argc, char* argv[])
{
printf("arg1: verification (0=no, 1=yes)\n");
printf("arg2: initialization (0=no init, 1=integer value, 2=decimal value)\n");
- printf("arg3: run kernel # of times (>1)\n");
+ printf("arg3: time kernel (0=n0, 1=yes)\n");
printf("arg4 to 9: M (256x), N(128x), K(32x), StrideA, StrideB, StrideC\n");
exit(0);
}
@@ -198,7 +198,7 @@ int main(int argc, char* argv[])
"not support this GEMM problem");
}
- float ave_time = invoker.Run(argument, nrepeat);
+ float ave_time = invoker.Run(argument, StreamConfig{nullptr, time_kernel});
std::size_t flop = std::size_t(2) * M * N * K;
std::size_t num_btype =
@@ -232,7 +232,7 @@ int main(int argc, char* argv[])
ref_invoker.Run(ref_argument);
- ck::utils::check_err(c_m_n_device_f32_result.mData, c_m_n_host_result.mData);
+ return ck::utils::check_err(c_m_n_device_f32_result.mData, c_m_n_host_result.mData) ? 0 : 1;
}
return 0;
diff --git a/example/01_gemm/gemm_xdl_fp16.cpp b/example/01_gemm/gemm_xdl_fp16.cpp
index fc04a13ca5..06523037f9 100644
--- a/example/01_gemm/gemm_xdl_fp16.cpp
+++ b/example/01_gemm/gemm_xdl_fp16.cpp
@@ -56,9 +56,9 @@ using ReferenceGemmInstance = ck::tensor_operation::host::
int main(int argc, char* argv[])
{
- bool do_verification = 0;
- int init_method = 0;
- int nrepeat = 5;
+ bool do_verification = true;
+ int init_method = 1;
+ bool time_kernel = false;
// GEMM shape
ck::index_t M = 3840;
@@ -73,13 +73,13 @@ int main(int argc, char* argv[])
{
do_verification = std::stoi(argv[1]);
init_method = std::stoi(argv[2]);
- nrepeat = std::stoi(argv[3]);
+ time_kernel = std::stoi(argv[3]);
}
else if(argc == 10)
{
do_verification = std::stoi(argv[1]);
init_method = std::stoi(argv[2]);
- nrepeat = std::stoi(argv[3]);
+ time_kernel = std::stoi(argv[3]);
M = std::stoi(argv[4]);
N = std::stoi(argv[5]);
@@ -93,7 +93,7 @@ int main(int argc, char* argv[])
{
printf("arg1: verification (0=no, 1=yes)\n");
printf("arg2: initialization (0=no init, 1=integer value, 2=decimal value)\n");
- printf("arg3: run kernel # of times (>1)\n");
+ printf("arg3: time kernel (0=n0, 1=yes)\n");
printf("arg4 to 9: M (256x), N(128x), K(32x), StrideA, StrideB, StrideC\n");
exit(0);
}
@@ -171,7 +171,7 @@ int main(int argc, char* argv[])
"not support this GEMM problem");
}
- float ave_time = invoker.Run(argument, nrepeat);
+ float ave_time = invoker.Run(argument, StreamConfig{nullptr, time_kernel});
std::size_t flop = std::size_t(2) * M * N * K;
std::size_t num_btype =
@@ -196,7 +196,7 @@ int main(int argc, char* argv[])
ref_invoker.Run(ref_argument);
- ck::utils::check_err(c_m_n_device_result.mData, c_m_n_host_result.mData);
+ return ck::utils::check_err(c_m_n_device_result.mData, c_m_n_host_result.mData) ? 0 : 1;
}
return 0;
diff --git a/example/01_gemm/gemm_xdl_int8.cpp b/example/01_gemm/gemm_xdl_int8.cpp
index ab5869db61..a22c21e40e 100644
--- a/example/01_gemm/gemm_xdl_int8.cpp
+++ b/example/01_gemm/gemm_xdl_int8.cpp
@@ -83,9 +83,9 @@ using ReferenceGemmInstance = ck::tensor_operation::host::
int main(int argc, char* argv[])
{
- bool do_verification = 0;
- int init_method = 0;
- int nrepeat = 5;
+ bool do_verification = true;
+ int init_method = 1;
+ bool time_kernel = false;
// GEMM shape
ck::index_t M = 3840;
@@ -100,13 +100,13 @@ int main(int argc, char* argv[])
{
do_verification = std::stoi(argv[1]);
init_method = std::stoi(argv[2]);
- nrepeat = std::stoi(argv[3]);
+ time_kernel = std::stoi(argv[3]);
}
else if(argc == 10)
{
do_verification = std::stoi(argv[1]);
init_method = std::stoi(argv[2]);
- nrepeat = std::stoi(argv[3]);
+ time_kernel = std::stoi(argv[3]);
M = std::stoi(argv[4]);
N = std::stoi(argv[5]);
@@ -120,7 +120,7 @@ int main(int argc, char* argv[])
{
printf("arg1: verification (0=no, 1=yes)\n");
printf("arg2: initialization (0=no init, 1=integer value, 2=decimal value)\n");
- printf("arg3: run kernel # of times (>1)\n");
+ printf("arg3: time kernel (0=n0, 1=yes)\n");
printf("arg4 to 9: M (256x), N(128x), K(32x), StrideA, StrideB, StrideC\n");
exit(0);
}
@@ -194,7 +194,7 @@ int main(int argc, char* argv[])
"not support this GEMM problem");
}
- float ave_time = invoker.Run(argument, nrepeat);
+ float ave_time = invoker.Run(argument, StreamConfig{nullptr, time_kernel});
std::size_t flop = std::size_t(2) * M * N * K;
std::size_t num_btype =
@@ -219,7 +219,7 @@ int main(int argc, char* argv[])
ref_invoker.Run(ref_argument);
- ck::utils::check_err(c_m_n_device_result.mData, c_m_n_host_result.mData);
+ return ck::utils::check_err(c_m_n_device_result.mData, c_m_n_host_result.mData) ? 0 : 1;
}
return 0;
diff --git a/example/02_gemm_alpha_beta/gemm_xdl_alpha_beta.cpp b/example/02_gemm_alpha_beta/gemm_xdl_alpha_beta.cpp
index 2abebbbac4..1a6e1de4dc 100644
--- a/example/02_gemm_alpha_beta/gemm_xdl_alpha_beta.cpp
+++ b/example/02_gemm_alpha_beta/gemm_xdl_alpha_beta.cpp
@@ -86,9 +86,9 @@ using ReferenceGemmInstance = ck::tensor_operation::host::ReferenceGemmBias2D1)\n");
+ printf("arg3: time kernel (0=n0, 1=yes)\n");
printf("arg4 to 9: M (256x), N(128x), K(32x), StrideA, StrideB, StrideC, alpha, beta\n");
exit(0);
}
@@ -216,7 +216,7 @@ int main(int argc, char* argv[])
"not support this GEMM problem");
}
- float ave_time = invoker.Run(argument, nrepeat);
+ float ave_time = invoker.Run(argument, StreamConfig{nullptr, time_kernel});
std::size_t flop = std::size_t(2) * M * N * K;
std::size_t num_btype =
@@ -246,6 +246,8 @@ int main(int argc, char* argv[])
ref_invoker.Run(ref_argument);
- ck::utils::check_err(c_m_n_device_result.mData, c_m_n_host_result.mData);
+ return ck::utils::check_err(c_m_n_device_result.mData, c_m_n_host_result.mData) ? 0 : 1;
}
+
+ return 0;
}
diff --git a/example/03_gemm_bias_relu/gemm_xdl_bias_relu.cpp b/example/03_gemm_bias_relu/gemm_xdl_bias_relu.cpp
index f3ed2bad37..3bf3003c14 100644
--- a/example/03_gemm_bias_relu/gemm_xdl_bias_relu.cpp
+++ b/example/03_gemm_bias_relu/gemm_xdl_bias_relu.cpp
@@ -83,9 +83,9 @@ using ReferenceGemmInstance = ck::tensor_operation::host::ReferenceGemmBiasActiv
int main(int argc, char* argv[])
{
- bool do_verification = 0;
- int init_method = 0;
- int nrepeat = 5;
+ bool do_verification = true;
+ int init_method = 1;
+ bool time_kernel = false;
// GEMM shape
ck::index_t M = 3840;
@@ -100,13 +100,13 @@ int main(int argc, char* argv[])
{
do_verification = std::stoi(argv[1]);
init_method = std::stoi(argv[2]);
- nrepeat = std::stoi(argv[3]);
+ time_kernel = std::stoi(argv[3]);
}
else if(argc == 10)
{
do_verification = std::stoi(argv[1]);
init_method = std::stoi(argv[2]);
- nrepeat = std::stoi(argv[3]);
+ time_kernel = std::stoi(argv[3]);
M = std::stoi(argv[4]);
N = std::stoi(argv[5]);
@@ -120,7 +120,7 @@ int main(int argc, char* argv[])
{
printf("arg1: verification (0=no, 1=yes)\n");
printf("arg2: initialization (0=no init, 1=integer value, 2=decimal value)\n");
- printf("arg3: run kernel # of times (>1)\n");
+ printf("arg3: time kernel (0=n0, 1=yes)\n");
printf("arg4 to 9: M (256x), N(128x), K(32x), StrideA, StrideB, StrideC\n");
exit(0);
}
@@ -206,7 +206,7 @@ int main(int argc, char* argv[])
"not support this GEMM problem");
}
- float ave_time = invoker.Run(argument, nrepeat);
+ float ave_time = invoker.Run(argument, StreamConfig{nullptr, time_kernel});
std::size_t flop = std::size_t(2) * M * N * K;
@@ -232,6 +232,8 @@ int main(int argc, char* argv[])
ref_invoker.Run(ref_argument);
- ck::utils::check_err(c_m_n_device_result.mData, c_m_n_host_result.mData);
+ return ck::utils::check_err(c_m_n_device_result.mData, c_m_n_host_result.mData) ? 0 : 1;
}
+
+ return 0;
}
diff --git a/example/04_gemm_bias_relu_add/gemm_xdl_bias_relu_add.cpp b/example/04_gemm_bias_relu_add/gemm_xdl_bias_relu_add.cpp
index 9405c36881..73e92f9d11 100644
--- a/example/04_gemm_bias_relu_add/gemm_xdl_bias_relu_add.cpp
+++ b/example/04_gemm_bias_relu_add/gemm_xdl_bias_relu_add.cpp
@@ -83,9 +83,9 @@ using ReferenceGemmInstance =
CElementOp>;
int main(int argc, char* argv[])
{
- bool do_verification = 0;
- int init_method = 0;
- int nrepeat = 5;
+ bool do_verification = true;
+ int init_method = 1;
+ bool time_kernel = false;
// GEMM shape
ck::index_t M = 3840;
@@ -101,13 +101,13 @@ int main(int argc, char* argv[])
{
do_verification = std::stoi(argv[1]);
init_method = std::stoi(argv[2]);
- nrepeat = std::stoi(argv[3]);
+ time_kernel = std::stoi(argv[3]);
}
else if(argc == 11)
{
do_verification = std::stoi(argv[1]);
init_method = std::stoi(argv[2]);
- nrepeat = std::stoi(argv[3]);
+ time_kernel = std::stoi(argv[3]);
M = std::stoi(argv[4]);
N = std::stoi(argv[5]);
@@ -122,7 +122,7 @@ int main(int argc, char* argv[])
{
printf("arg1: verification (0=no, 1=yes)\n");
printf("arg2: initialization (0=no init, 1=integer value, 2=decimal value)\n");
- printf("arg3: run kernel # of times (>1)\n");
+ printf("arg3: time kernel (0=n0, 1=yes)\n");
printf("arg4 to 10: M (256x), N(128x), K(32x), StrideA, StrideB, StrideC, StrideC1\n");
exit(0);
}
@@ -218,7 +218,7 @@ int main(int argc, char* argv[])
"not support this GEMM problem");
}
- float ave_time = invoker.Run(argument, nrepeat);
+ float ave_time = invoker.Run(argument, StreamConfig{nullptr, time_kernel});
std::size_t flop = std::size_t(2) * M * N * K;
std::size_t num_btype = sizeof(ADataType) * M * K + sizeof(BDataType) * K * M +
@@ -250,6 +250,8 @@ int main(int argc, char* argv[])
ref_invoker.Run(ref_argument);
- ck::utils::check_err(c_m_n_device_result.mData, c_m_n_host_result.mData);
+ return ck::utils::check_err(c_m_n_device_result.mData, c_m_n_host_result.mData) ? 0 : 1;
}
+
+ return 0;
}
diff --git a/example/06_conv2d_fwd_bias_relu/CMakeLists.txt b/example/06_conv2d_fwd_bias_relu/CMakeLists.txt
index df8f70606c..4e1dd1f3e6 100644
--- a/example/06_conv2d_fwd_bias_relu/CMakeLists.txt
+++ b/example/06_conv2d_fwd_bias_relu/CMakeLists.txt
@@ -1,2 +1,2 @@
add_example_executable(example_conv2d_fwd_xdl_bias_relu conv2d_fwd_xdl_bias_relu.cpp)
-target_link_libraries(example_conv2d_fwd_xdl_bias_relu PRIVATE conv_fwd_util)
+target_link_libraries(example_conv2d_fwd_xdl_bias_relu PRIVATE conv_util)
diff --git a/example/06_conv2d_fwd_bias_relu/conv2d_fwd_xdl_bias_relu.cpp b/example/06_conv2d_fwd_bias_relu/conv2d_fwd_xdl_bias_relu.cpp
index 751ce16b90..d50afb6854 100644
--- a/example/06_conv2d_fwd_bias_relu/conv2d_fwd_xdl_bias_relu.cpp
+++ b/example/06_conv2d_fwd_bias_relu/conv2d_fwd_xdl_bias_relu.cpp
@@ -7,7 +7,7 @@
#include "check_err.hpp"
#include "config.hpp"
-#include "conv_fwd_util.hpp"
+#include "conv_util.hpp"
#include "device.hpp"
#include "device_conv2d_fwd_xdl_c_shuffle_bias_activation_nhwc_kyxc_nhwk.hpp"
#include "device_tensor.hpp"
@@ -93,7 +93,7 @@ void PrintUseMsg()
{
std::cout << "arg1: verification (0=no, 1=yes)\n"
<< "arg2: initialization (0=no init, 1=integer value, 2=decimal value)\n"
- << "arg3: run kernel # of times (>1)\n"
+ << "arg3: time kernel (0=n0, 1=yes)\n"
<< "Following arguments:\n"
<< " N, K, C, \n"
<< " , (ie Y, X for 2D)\n"
@@ -120,40 +120,40 @@ ck::utils::conv::ConvParams ParseConvParams(int argc, char* argv[])
ck::utils::conv::ConvParams params;
int arg_idx = 4;
- params.num_dim_spatial = num_dim_spatial;
- params.N = std::stoi(argv[arg_idx++]);
- params.K = std::stoi(argv[arg_idx++]);
- params.C = std::stoi(argv[arg_idx++]);
+ params.num_dim_spatial_ = num_dim_spatial;
+ params.N_ = std::stoi(argv[arg_idx++]);
+ params.K_ = std::stoi(argv[arg_idx++]);
+ params.C_ = std::stoi(argv[arg_idx++]);
- params.filter_spatial_lengths.resize(num_dim_spatial);
+ params.filter_spatial_lengths_.resize(num_dim_spatial);
for(int i = 0; i < num_dim_spatial; ++i)
{
- params.filter_spatial_lengths[i] = std::stoi(argv[arg_idx++]);
+ params.filter_spatial_lengths_[i] = std::stoi(argv[arg_idx++]);
}
- params.input_spatial_lengths.resize(num_dim_spatial);
+ params.input_spatial_lengths_.resize(num_dim_spatial);
for(int i = 0; i < num_dim_spatial; ++i)
{
- params.input_spatial_lengths[i] = std::stoi(argv[arg_idx++]);
+ params.input_spatial_lengths_[i] = std::stoi(argv[arg_idx++]);
}
- params.conv_filter_strides.resize(num_dim_spatial);
+ params.conv_filter_strides_.resize(num_dim_spatial);
for(int i = 0; i < num_dim_spatial; ++i)
{
- params.conv_filter_strides[i] = std::stoi(argv[arg_idx++]);
+ params.conv_filter_strides_[i] = std::stoi(argv[arg_idx++]);
}
- params.conv_filter_dilations.resize(num_dim_spatial);
+ params.conv_filter_dilations_.resize(num_dim_spatial);
for(int i = 0; i < num_dim_spatial; ++i)
{
- params.conv_filter_dilations[i] = std::stoi(argv[arg_idx++]);
+ params.conv_filter_dilations_[i] = std::stoi(argv[arg_idx++]);
}
- params.input_left_pads.resize(num_dim_spatial);
+ params.input_left_pads_.resize(num_dim_spatial);
for(int i = 0; i < num_dim_spatial; ++i)
{
- params.input_left_pads[i] = std::stoi(argv[arg_idx++]);
+ params.input_left_pads_[i] = std::stoi(argv[arg_idx++]);
}
- params.input_right_pads.resize(num_dim_spatial);
+ params.input_right_pads_.resize(num_dim_spatial);
for(int i = 0; i < num_dim_spatial; ++i)
{
- params.input_right_pads[i] = std::stoi(argv[arg_idx++]);
+ params.input_right_pads_[i] = std::stoi(argv[arg_idx++]);
}
return params;
@@ -165,9 +165,9 @@ int main(int argc, char* argv[])
{
using namespace ck::utils::conv;
- bool do_verification = 0;
- int init_method = 0;
- int nrepeat = 5;
+ bool do_verification = true;
+ int init_method = 1;
+ bool time_kernel = false;
const int num_dim_spatial = 2;
ck::utils::conv::ConvParams params;
@@ -176,7 +176,7 @@ int main(int argc, char* argv[])
{
do_verification = std::stoi(argv[1]);
init_method = std::stoi(argv[2]);
- nrepeat = std::stoi(argv[3]);
+ time_kernel = std::stoi(argv[3]);
}
if(argc >= 5)
@@ -184,21 +184,21 @@ int main(int argc, char* argv[])
params = ParseConvParams(argc, argv);
}
- std::vector input_dims{static_cast(params.N),
- static_cast(params.C)};
+ std::vector input_dims{static_cast(params.N_),
+ static_cast(params.C_)};
input_dims.insert(std::end(input_dims),
- std::begin(params.input_spatial_lengths),
- std::end(params.input_spatial_lengths));
+ std::begin(params.input_spatial_lengths_),
+ std::end(params.input_spatial_lengths_));
- std::vector filter_dims{static_cast(params.K),
- static_cast(params.C)};
+ std::vector filter_dims{static_cast(params.K_),
+ static_cast(params.C_)};
filter_dims.insert(std::end(filter_dims),
- std::begin(params.filter_spatial_lengths),
- std::end(params.filter_spatial_lengths));
+ std::begin(params.filter_spatial_lengths_),
+ std::end(params.filter_spatial_lengths_));
const std::vector& output_spatial_lengths = params.GetOutputSpatialLengths();
- std::vector output_dims{static_cast(params.N),
- static_cast(params.K)};
+ std::vector output_dims{static_cast(params.N_),
+ static_cast(params.K_)};
output_dims.insert(std::end(output_dims),
std::begin(output_spatial_lengths),
std::end(output_spatial_lengths));
@@ -211,7 +211,7 @@ int main(int argc, char* argv[])
get_output_host_tensor_descriptor(output_dims, num_dim_spatial));
// bias: assume contiguous 1d vector
Tensor bias(
- HostTensorDescriptor(std::vector({static_cast(params.K)})));
+ HostTensorDescriptor(std::vector({static_cast(params.K_)})));
std::cout << "input: " << input.mDesc << std::endl;
std::cout << "weights: " << weights.mDesc << std::endl;
@@ -248,16 +248,16 @@ int main(int argc, char* argv[])
static_cast(wei_device_buf.GetDeviceBuffer()),
static_cast(out_device_buf.GetDeviceBuffer()),
static_cast(bias_device_buf.GetDeviceBuffer()),
- params.N,
- params.K,
- params.C,
- params.input_spatial_lengths,
- params.filter_spatial_lengths,
+ params.N_,
+ params.K_,
+ params.C_,
+ params.input_spatial_lengths_,
+ params.filter_spatial_lengths_,
output_spatial_lengths,
- params.conv_filter_strides,
- params.conv_filter_dilations,
- params.input_left_pads,
- params.input_right_pads,
+ params.conv_filter_strides_,
+ params.conv_filter_dilations_,
+ params.input_left_pads_,
+ params.input_right_pads_,
InElementOp{},
WeiElementOp{},
OutElementOp{});
@@ -269,18 +269,18 @@ int main(int argc, char* argv[])
"not support this problem");
}
- float ave_time = invoker.Run(argument, nrepeat);
+ float ave_time = invoker.Run(argument, StreamConfig{nullptr, time_kernel});
std::size_t flop = get_flops(
- params.N, params.C, params.K, params.filter_spatial_lengths, output_spatial_lengths);
+ params.N_, params.C_, params.K_, params.filter_spatial_lengths_, output_spatial_lengths);
std::size_t num_btype =
- get_btype(params.N,
- params.C,
- params.K,
- params.input_spatial_lengths,
- params.filter_spatial_lengths,
+ get_btype(params.N_,
+ params.C_,
+ params.K_,
+ params.input_spatial_lengths_,
+ params.filter_spatial_lengths_,
output_spatial_lengths) +
- sizeof(OutDataType) * (params.K);
+ sizeof(OutDataType) * (params.K_);
float tflops = static_cast(flop) / 1.E9 / ave_time;
float gb_per_sec = num_btype / 1.E6 / ave_time;
@@ -296,16 +296,17 @@ int main(int argc, char* argv[])
weights,
host_output,
bias,
- params.conv_filter_strides,
- params.conv_filter_dilations,
- params.input_left_pads,
- params.input_right_pads,
+ params.conv_filter_strides_,
+ params.conv_filter_dilations_,
+ params.input_left_pads_,
+ params.input_right_pads_,
InElementOp{},
WeiElementOp{},
OutElementOp{});
ref_invoker.Run(ref_argument);
out_device_buf.FromDevice(device_output.mData.data());
- ck::utils::check_err(
- host_output.mData, device_output.mData, "Error: incorrect results!", 1e-5f, 1e-4f);
+ return ck::utils::check_err(device_output.mData, host_output.mData) ? 0 : 1;
}
+
+ return 0;
}
diff --git a/example/07_conv2d_fwd_bias_relu_add/CMakeLists.txt b/example/07_conv2d_fwd_bias_relu_add/CMakeLists.txt
index 8bc5980025..b4dd39d83a 100644
--- a/example/07_conv2d_fwd_bias_relu_add/CMakeLists.txt
+++ b/example/07_conv2d_fwd_bias_relu_add/CMakeLists.txt
@@ -1,2 +1,3 @@
-add_example_executable(example_conv2d_fwd_xdl_bias_relu_add conv2d_fwd_xdl_bias_relu_add.cpp)
-target_link_libraries(example_conv2d_fwd_xdl_bias_relu_add PRIVATE conv_fwd_util)
+# FIXME: should fix validation failure
+add_example_executable_no_testing(example_conv2d_fwd_xdl_bias_relu_add conv2d_fwd_xdl_bias_relu_add.cpp)
+target_link_libraries(example_conv2d_fwd_xdl_bias_relu_add PRIVATE conv_util)
diff --git a/example/07_conv2d_fwd_bias_relu_add/conv2d_fwd_xdl_bias_relu_add.cpp b/example/07_conv2d_fwd_bias_relu_add/conv2d_fwd_xdl_bias_relu_add.cpp
index e6339fcd23..53d882778a 100644
--- a/example/07_conv2d_fwd_bias_relu_add/conv2d_fwd_xdl_bias_relu_add.cpp
+++ b/example/07_conv2d_fwd_bias_relu_add/conv2d_fwd_xdl_bias_relu_add.cpp
@@ -7,7 +7,7 @@
#include "check_err.hpp"
#include "config.hpp"
-#include "conv_fwd_util.hpp"
+#include "conv_util.hpp"
#include "device.hpp"
#include "device_conv2d_fwd_xdl_c_shuffle_bias_activation_add_nhwc_kyxc_nhwk.hpp"
#include "device_tensor.hpp"
@@ -90,7 +90,7 @@ void PrintUseMsg()
{
std::cout << "arg1: verification (0=no, 1=yes)\n"
<< "arg2: initialization (0=no init, 1=integer value, 2=decimal value)\n"
- << "arg3: run kernel # of times (>1)\n"
+ << "arg3: time kernel (0=n0, 1=yes)\n"
<< "Following arguments:\n"
<< " N, K, C, \n"
<< " , (ie Y, X for 2D)\n"
@@ -117,40 +117,40 @@ ck::utils::conv::ConvParams ParseConvParams(int argc, char* argv[])
ck::utils::conv::ConvParams params;
int arg_idx = 4;
- params.num_dim_spatial = num_dim_spatial;
- params.N = std::stoi(argv[arg_idx++]);
- params.K = std::stoi(argv[arg_idx++]);
- params.C = std::stoi(argv[arg_idx++]);
+ params.num_dim_spatial_ = num_dim_spatial;
+ params.N_ = std::stoi(argv[arg_idx++]);
+ params.K_ = std::stoi(argv[arg_idx++]);
+ params.C_ = std::stoi(argv[arg_idx++]);
- params.filter_spatial_lengths.resize(num_dim_spatial);
+ params.filter_spatial_lengths_.resize(num_dim_spatial);
for(int i = 0; i < num_dim_spatial; ++i)
{
- params.filter_spatial_lengths[i] = std::stoi(argv[arg_idx++]);
+ params.filter_spatial_lengths_[i] = std::stoi(argv[arg_idx++]);
}
- params.input_spatial_lengths.resize(num_dim_spatial);
+ params.input_spatial_lengths_.resize(num_dim_spatial);
for(int i = 0; i < num_dim_spatial; ++i)
{
- params.input_spatial_lengths[i] = std::stoi(argv[arg_idx++]);
+ params.input_spatial_lengths_[i] = std::stoi(argv[arg_idx++]);
}
- params.conv_filter_strides.resize(num_dim_spatial);
+ params.conv_filter_strides_.resize(num_dim_spatial);
for(int i = 0; i < num_dim_spatial; ++i)
{
- params.conv_filter_strides[i] = std::stoi(argv[arg_idx++]);
+ params.conv_filter_strides_[i] = std::stoi(argv[arg_idx++]);
}
- params.conv_filter_dilations.resize(num_dim_spatial);
+ params.conv_filter_dilations_.resize(num_dim_spatial);
for(int i = 0; i < num_dim_spatial; ++i)
{
- params.conv_filter_dilations[i] = std::stoi(argv[arg_idx++]);
+ params.conv_filter_dilations_[i] = std::stoi(argv[arg_idx++]);
}
- params.input_left_pads.resize(num_dim_spatial);
+ params.input_left_pads_.resize(num_dim_spatial);
for(int i = 0; i < num_dim_spatial; ++i)
{
- params.input_left_pads[i] = std::stoi(argv[arg_idx++]);
+ params.input_left_pads_[i] = std::stoi(argv[arg_idx++]);
}
- params.input_right_pads.resize(num_dim_spatial);
+ params.input_right_pads_.resize(num_dim_spatial);
for(int i = 0; i < num_dim_spatial; ++i)
{
- params.input_right_pads[i] = std::stoi(argv[arg_idx++]);
+ params.input_right_pads_[i] = std::stoi(argv[arg_idx++]);
}
return params;
@@ -162,9 +162,9 @@ int main(int argc, char* argv[])
{
using namespace ck::utils::conv;
- bool do_verification = 0;
- int init_method = 0;
- int nrepeat = 5;
+ bool do_verification = true;
+ int init_method = 1;
+ bool time_kernel = false;
const int num_dim_spatial = 2;
ck::utils::conv::ConvParams params;
@@ -173,7 +173,7 @@ int main(int argc, char* argv[])
{
do_verification = std::stoi(argv[1]);
init_method = std::stoi(argv[2]);
- nrepeat = std::stoi(argv[3]);
+ time_kernel = std::stoi(argv[3]);
}
if(argc >= 5)
@@ -181,21 +181,21 @@ int main(int argc, char* argv[])
params = ParseConvParams(argc, argv);
}
- std::vector input_dims{static_cast(params.N),
- static_cast(params.C)};
+ std::vector input_dims{static_cast(params.N_),
+ static_cast(params.C_)};
input_dims.insert(std::end(input_dims),
- std::begin(params.input_spatial_lengths),
- std::end(params.input_spatial_lengths));
+ std::begin(params.input_spatial_lengths_),
+ std::end(params.input_spatial_lengths_));
- std::vector filter_dims{static_cast(params.K),
- static_cast(params.C)};
+ std::vector filter_dims{static_cast(params.K_),
+ static_cast(params.C_)};
filter_dims.insert(std::end(filter_dims),
- std::begin(params.filter_spatial_lengths),
- std::end(params.filter_spatial_lengths));
+ std::begin(params.filter_spatial_lengths_),
+ std::end(params.filter_spatial_lengths_));
const std::vector& output_spatial_lengths = params.GetOutputSpatialLengths();
- std::vector output_dims{static_cast(params.N),
- static_cast(params.K)};
+ std::vector output_dims{static_cast(params.N_),
+ static_cast(params.K_)};
output_dims.insert(std::end(output_dims),
std::begin(output_spatial_lengths),
std::end(output_spatial_lengths));
@@ -209,7 +209,7 @@ int main(int argc, char* argv[])
// bias: assume contiguous 1d vector
Tensor bias(
- HostTensorDescriptor(std::vector({static_cast(params.K)})));
+ HostTensorDescriptor(std::vector({static_cast(params.K_)})));
// residual: assume same layout as output tensor
Tensor residual(get_output_host_tensor_descriptor(output_dims, num_dim_spatial));
@@ -259,16 +259,16 @@ int main(int argc, char* argv[])
static_cast(out_device_buf.GetDeviceBuffer()),
static_cast(bias_device_buf.GetDeviceBuffer()),
static_cast(resi_device_buf.GetDeviceBuffer()),
- params.N,
- params.K,
- params.C,
- params.input_spatial_lengths,
- params.filter_spatial_lengths,
+ params.N_,
+ params.K_,
+ params.C_,
+ params.input_spatial_lengths_,
+ params.filter_spatial_lengths_,
output_spatial_lengths,
- params.conv_filter_strides,
- params.conv_filter_dilations,
- params.input_left_pads,
- params.input_right_pads,
+ params.conv_filter_strides_,
+ params.conv_filter_dilations_,
+ params.input_left_pads_,
+ params.input_right_pads_,
in_element_op,
wei_element_op,
out_element_op);
@@ -280,20 +280,20 @@ int main(int argc, char* argv[])
"not support this problem");
}
- float ave_time = invoker.Run(argument, nrepeat);
+ float ave_time = invoker.Run(argument, StreamConfig{nullptr, time_kernel});
std::size_t flop = get_flops(
- params.N, params.C, params.K, params.filter_spatial_lengths, output_spatial_lengths);
+ params.N_, params.C_, params.K_, params.filter_spatial_lengths_, output_spatial_lengths);
std::size_t num_btype =
- get_btype(params.N,
- params.C,
- params.K,
- params.input_spatial_lengths,
- params.filter_spatial_lengths,
+ get_btype(params.N_,
+ params.C_,
+ params.K_,
+ params.input_spatial_lengths_,
+ params.filter_spatial_lengths_,
output_spatial_lengths) +
- sizeof(OutDataType) * (params.K) +
+ sizeof(OutDataType) * (params.K_) +
sizeof(OutDataType) *
- (params.N * params.K * output_spatial_lengths[0] * output_spatial_lengths[1]);
+ (params.N_ * params.K_ * output_spatial_lengths[0] * output_spatial_lengths[1]);
float tflops = static_cast(flop) / 1.E9 / ave_time;
float gb_per_sec = num_btype / 1.E6 / ave_time;
@@ -310,17 +310,18 @@ int main(int argc, char* argv[])
host_output,
bias,
residual,
- params.conv_filter_strides,
- params.conv_filter_dilations,
- params.input_left_pads,
- params.input_right_pads,
+ params.conv_filter_strides_,
+ params.conv_filter_dilations_,
+ params.input_left_pads_,
+ params.input_right_pads_,
in_element_op,
wei_element_op,
out_element_op);
ref_invoker.Run(ref_argument);
out_device_buf.FromDevice(device_output.mData.data());
- ck::utils::check_err(
- host_output.mData, device_output.mData, "Error: incorrect results!", 1e-5f, 1e-4f);
+ return ck::utils::check_err(device_output.mData, host_output.mData) ? 0 : 1;
}
+
+ return 0;
}
diff --git a/example/09_convnd_fwd/CMakeLists.txt b/example/09_convnd_fwd/CMakeLists.txt
index f602862a04..ceceb4aedc 100644
--- a/example/09_convnd_fwd/CMakeLists.txt
+++ b/example/09_convnd_fwd/CMakeLists.txt
@@ -1,6 +1,6 @@
-add_example_executable(example_convnd_fwd_xdl convnd_fwd_xdl.cpp)
-target_link_libraries(example_convnd_fwd_xdl PRIVATE conv_fwd_util)
+add_example_executable(example_convnd_fwd_xdl_fp32 convnd_fwd_xdl_fp32.cpp)
add_example_executable(example_convnd_fwd_xdl_int8 convnd_fwd_xdl_int8.cpp)
-target_link_libraries(example_convnd_fwd_xdl_int8 PRIVATE conv_fwd_util)
add_example_executable(example_convnd_fwd_xdl_fp16 convnd_fwd_xdl_fp16.cpp)
-target_link_libraries(example_convnd_fwd_xdl_fp16 PRIVATE conv_fwd_util)
+target_link_libraries(example_convnd_fwd_xdl_fp32 PRIVATE conv_util)
+target_link_libraries(example_convnd_fwd_xdl_int8 PRIVATE conv_util)
+target_link_libraries(example_convnd_fwd_xdl_fp16 PRIVATE conv_util)
diff --git a/example/09_convnd_fwd/convnd_fwd_xdl_fp16.cpp b/example/09_convnd_fwd/convnd_fwd_xdl_fp16.cpp
index eaa5683978..7ad83d5ad6 100644
--- a/example/09_convnd_fwd/convnd_fwd_xdl_fp16.cpp
+++ b/example/09_convnd_fwd/convnd_fwd_xdl_fp16.cpp
@@ -5,7 +5,7 @@
#include "check_err.hpp"
#include "config.hpp"
-#include "conv_fwd_util.hpp"
+#include "conv_util.hpp"
#include "device.hpp"
#include "device_tensor.hpp"
#include "device_convnd_fwd_xdl_nhwc_kyxc_nhwk.hpp"
@@ -43,10 +43,10 @@ template
using DeviceConvNDFwdInstance = ck::tensor_operation::device::
DeviceConvNDFwdXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K<
// clang-format off
- InDataType, //
+ InDataType, //
WeiDataType, //
OutDataType, //
- AccDataType, //
+ AccDataType, //
InElementOp, // Input Elementwise Operation
WeiElementOp, // Weights Elementwise Operation
OutElementOp, // Output Elementwise Operation
@@ -110,7 +110,7 @@ void print_use_msg()
{
std::cout << "arg1: verification (0=no, 1=yes)\n"
<< "arg2: initialization (0=no init, 1=integer value, 2=decimal value)\n"
- << "arg3: run kernel # of times (>1)\n"
+ << "arg3: time kernel (0=n0, 1=yes)\n"
<< "arg4: N spatial dimensions (default 2)\n"
<< "Following arguments (depending on number of spatial dims):\n"
<< " N, K, C, \n"
@@ -137,40 +137,40 @@ ck::utils::conv::ConvParams parse_conv_params(int num_dim_spatial, int argc, cha
ck::utils::conv::ConvParams params;
int arg_idx = 5;
- params.num_dim_spatial = num_dim_spatial;
- params.N = std::stoi(argv[arg_idx++]);
- params.K = std::stoi(argv[arg_idx++]);
- params.C = std::stoi(argv[arg_idx++]);
+ params.num_dim_spatial_ = num_dim_spatial;
+ params.N_ = std::stoi(argv[arg_idx++]);
+ params.K_ = std::stoi(argv[arg_idx++]);
+ params.C_ = std::stoi(argv[arg_idx++]);
- params.filter_spatial_lengths.resize(num_dim_spatial);
+ params.filter_spatial_lengths_.resize(num_dim_spatial);
for(int i = 0; i < num_dim_spatial; ++i)
{
- params.filter_spatial_lengths[i] = std::stoi(argv[arg_idx++]);
+ params.filter_spatial_lengths_[i] = std::stoi(argv[arg_idx++]);
}
- params.input_spatial_lengths.resize(num_dim_spatial);
+ params.input_spatial_lengths_.resize(num_dim_spatial);
for(int i = 0; i < num_dim_spatial; ++i)
{
- params.input_spatial_lengths[i] = std::stoi(argv[arg_idx++]);
+ params.input_spatial_lengths_[i] = std::stoi(argv[arg_idx++]);
}
- params.conv_filter_strides.resize(num_dim_spatial);
+ params.conv_filter_strides_.resize(num_dim_spatial);
for(int i = 0; i < num_dim_spatial; ++i)
{
- params.conv_filter_strides[i] = std::stoi(argv[arg_idx++]);
+ params.conv_filter_strides_[i] = std::stoi(argv[arg_idx++]);
}
- params.conv_filter_dilations.resize(num_dim_spatial);
+ params.conv_filter_dilations_.resize(num_dim_spatial);
for(int i = 0; i < num_dim_spatial; ++i)
{
- params.conv_filter_dilations[i] = std::stoi(argv[arg_idx++]);
+ params.conv_filter_dilations_[i] = std::stoi(argv[arg_idx++]);
}
- params.input_left_pads.resize(num_dim_spatial);
+ params.input_left_pads_.resize(num_dim_spatial);
for(int i = 0; i < num_dim_spatial; ++i)
{
- params.input_left_pads[i] = std::stoi(argv[arg_idx++]);
+ params.input_left_pads_[i] = std::stoi(argv[arg_idx++]);
}
- params.input_right_pads.resize(num_dim_spatial);
+ params.input_right_pads_.resize(num_dim_spatial);
for(int i = 0; i < num_dim_spatial; ++i)
{
- params.input_right_pads[i] = std::stoi(argv[arg_idx++]);
+ params.input_right_pads_[i] = std::stoi(argv[arg_idx++]);
}
return params;
@@ -182,9 +182,9 @@ int main(int argc, char* argv[])
{
using namespace ck::utils::conv;
- bool do_verification = 0;
- int init_method = 0;
- int nrepeat = 5;
+ bool do_verification = true;
+ int init_method = 1;
+ bool time_kernel = false;
int num_dim_spatial = 2;
ck::utils::conv::ConvParams params;
@@ -193,7 +193,7 @@ int main(int argc, char* argv[])
{
do_verification = std::stoi(argv[1]);
init_method = std::stoi(argv[2]);
- nrepeat = std::stoi(argv[3]);
+ time_kernel = std::stoi(argv[3]);
num_dim_spatial = std::stoi(argv[4]);
}
@@ -202,21 +202,21 @@ int main(int argc, char* argv[])
params = parse_conv_params(num_dim_spatial, argc, argv);
}
- std::vector input_dims{static_cast(params.N),
- static_cast(params.C)};
+ std::vector input_dims{static_cast(params.N_),
+ static_cast(params.C_)};
input_dims.insert(std::end(input_dims),
- std::begin(params.input_spatial_lengths),
- std::end(params.input_spatial_lengths));
+ std::begin(params.input_spatial_lengths_),
+ std::end(params.input_spatial_lengths_));
- std::vector filter_dims{static_cast(params.K),
- static_cast(params.C)};
+ std::vector filter_dims{static_cast(params.K_),
+ static_cast(params.C_)};
filter_dims.insert(std::end(filter_dims),
- std::begin(params.filter_spatial_lengths),
- std::end(params.filter_spatial_lengths));
+ std::begin(params.filter_spatial_lengths_),
+ std::end(params.filter_spatial_lengths_));
const std::vector& output_spatial_lengths = params.GetOutputSpatialLengths();
- std::vector output_dims{static_cast(params.N),
- static_cast(params.K)};
+ std::vector output_dims{static_cast(params.N_),
+ static_cast(params.K_)};
output_dims.insert(std::end(output_dims),
std::begin(output_spatial_lengths),
std::end(output_spatial_lengths));
@@ -256,16 +256,16 @@ int main(int argc, char* argv[])
conv->MakeArgumentPointer(static_cast(in_device_buf.GetDeviceBuffer()),
static_cast(wei_device_buf.GetDeviceBuffer()),
static_cast(out_device_buf.GetDeviceBuffer()),
- params.N,
- params.K,
- params.C,
- params.input_spatial_lengths,
- params.filter_spatial_lengths,
+ params.N_,
+ params.K_,
+ params.C_,
+ params.input_spatial_lengths_,
+ params.filter_spatial_lengths_,
output_spatial_lengths,
- params.conv_filter_strides,
- params.conv_filter_dilations,
- params.input_left_pads,
- params.input_right_pads,
+ params.conv_filter_strides_,
+ params.conv_filter_dilations_,
+ params.input_left_pads_,
+ params.input_right_pads_,
InElementOp{},
WeiElementOp{},
OutElementOp{});
@@ -277,16 +277,16 @@ int main(int argc, char* argv[])
"not support this Conv problem");
}
- float ave_time = invoker->Run(argument.get(), nrepeat);
+ float ave_time = invoker->Run(argument.get(), StreamConfig{nullptr, time_kernel});
std::size_t flop = get_flops(
- params.N, params.C, params.K, params.filter_spatial_lengths, output_spatial_lengths);
+ params.N_, params.C_, params.K_, params.filter_spatial_lengths_, output_spatial_lengths);
std::size_t num_btype = get_btype(
- params.N,
- params.C,
- params.K,
- params.input_spatial_lengths,
- params.filter_spatial_lengths,
+ params.N_,
+ params.C_,
+ params.K_,
+ params.input_spatial_lengths_,
+ params.filter_spatial_lengths_,
output_spatial_lengths);
float tflops = static_cast(flop) / 1.E9 / ave_time;
@@ -302,18 +302,18 @@ int main(int argc, char* argv[])
auto ref_argument = ref_conv.MakeArgument(input,
weights,
host_output,
- params.conv_filter_strides,
- params.conv_filter_dilations,
- params.input_left_pads,
- params.input_right_pads,
+ params.conv_filter_strides_,
+ params.conv_filter_dilations_,
+ params.input_left_pads_,
+ params.input_right_pads_,
InElementOp{},
WeiElementOp{},
OutElementOp{});
ref_invoker.Run(ref_argument);
out_device_buf.FromDevice(device_output.mData.data());
- ck::utils::check_err(
- host_output.mData, device_output.mData, "Error: incorrect results!", 1e-5f, 1e-4f);
+ return ck::utils::check_err(
+ host_output.mData, device_output.mData, "Error: incorrect results!", 1e-5f, 1e-4f) ? 0 : 1;
};
switch(num_dim_spatial)
@@ -338,4 +338,5 @@ int main(int argc, char* argv[])
}
}
}
+ return 0;
}
diff --git a/example/09_convnd_fwd/convnd_fwd_xdl.cpp b/example/09_convnd_fwd/convnd_fwd_xdl_fp32.cpp
similarity index 80%
rename from example/09_convnd_fwd/convnd_fwd_xdl.cpp
rename to example/09_convnd_fwd/convnd_fwd_xdl_fp32.cpp
index e8895b8639..8a9633d84a 100644
--- a/example/09_convnd_fwd/convnd_fwd_xdl.cpp
+++ b/example/09_convnd_fwd/convnd_fwd_xdl_fp32.cpp
@@ -5,7 +5,7 @@
#include "check_err.hpp"
#include "config.hpp"
-#include "conv_fwd_util.hpp"
+#include "conv_util.hpp"
#include "device.hpp"
#include "device_tensor.hpp"
#include "device_convnd_fwd_xdl_nhwc_kyxc_nhwk.hpp"
@@ -39,10 +39,10 @@ template
using DeviceConvNDFwdInstance = ck::tensor_operation::device::
DeviceConvNDFwdXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K<
// clang-format off
- InDataType, //
+ InDataType, //
WeiDataType, //
OutDataType, //
- AccDataType, //
+ AccDataType, //
InElementOp, // Input Elementwise Operation
WeiElementOp, // Weights Elementwise Operation
OutElementOp, // Output Elementwise Operation
@@ -107,7 +107,7 @@ void print_use_msg()
{
std::cout << "arg1: verification (0=no, 1=yes)\n"
<< "arg2: initialization (0=no init, 1=integer value, 2=decimal value)\n"
- << "arg3: run kernel # of times (>1)\n"
+ << "arg3: time kernel (0=n0, 1=yes)\n"
<< "arg4: N spatial dimensions (default 2)\n"
<< "Following arguments (depending on number of spatial dims):\n"
<< " N, K, C, \n"
@@ -134,40 +134,40 @@ ck::utils::conv::ConvParams parse_conv_params(int num_dim_spatial, int argc, cha
ck::utils::conv::ConvParams params;
int arg_idx = 5;
- params.num_dim_spatial = num_dim_spatial;
- params.N = std::stoi(argv[arg_idx++]);
- params.K = std::stoi(argv[arg_idx++]);
- params.C = std::stoi(argv[arg_idx++]);
+ params.num_dim_spatial_ = num_dim_spatial;
+ params.N_ = std::stoi(argv[arg_idx++]);
+ params.K_ = std::stoi(argv[arg_idx++]);
+ params.C_ = std::stoi(argv[arg_idx++]);
- params.filter_spatial_lengths.resize(num_dim_spatial);
+ params.filter_spatial_lengths_.resize(num_dim_spatial);
for(int i = 0; i < num_dim_spatial; ++i)
{
- params.filter_spatial_lengths[i] = std::stoi(argv[arg_idx++]);
+ params.filter_spatial_lengths_[i] = std::stoi(argv[arg_idx++]);
}
- params.input_spatial_lengths.resize(num_dim_spatial);
+ params.input_spatial_lengths_.resize(num_dim_spatial);
for(int i = 0; i < num_dim_spatial; ++i)
{
- params.input_spatial_lengths[i] = std::stoi(argv[arg_idx++]);
+ params.input_spatial_lengths_[i] = std::stoi(argv[arg_idx++]);
}
- params.conv_filter_strides.resize(num_dim_spatial);
+ params.conv_filter_strides_.resize(num_dim_spatial);
for(int i = 0; i < num_dim_spatial; ++i)
{
- params.conv_filter_strides[i] = std::stoi(argv[arg_idx++]);
+ params.conv_filter_strides_[i] = std::stoi(argv[arg_idx++]);
}
- params.conv_filter_dilations.resize(num_dim_spatial);
+ params.conv_filter_dilations_.resize(num_dim_spatial);
for(int i = 0; i < num_dim_spatial; ++i)
{
- params.conv_filter_dilations[i] = std::stoi(argv[arg_idx++]);
+ params.conv_filter_dilations_[i] = std::stoi(argv[arg_idx++]);
}
- params.input_left_pads.resize(num_dim_spatial);
+ params.input_left_pads_.resize(num_dim_spatial);
for(int i = 0; i < num_dim_spatial; ++i)
{
- params.input_left_pads[i] = std::stoi(argv[arg_idx++]);
+ params.input_left_pads_[i] = std::stoi(argv[arg_idx++]);
}
- params.input_right_pads.resize(num_dim_spatial);
+ params.input_right_pads_.resize(num_dim_spatial);
for(int i = 0; i < num_dim_spatial; ++i)
{
- params.input_right_pads[i] = std::stoi(argv[arg_idx++]);
+ params.input_right_pads_[i] = std::stoi(argv[arg_idx++]);
}
return params;
@@ -179,9 +179,9 @@ int main(int argc, char* argv[])
{
using namespace ck::utils::conv;
- bool do_verification = 0;
- int init_method = 0;
- int nrepeat = 5;
+ bool do_verification = true;
+ int init_method = 1;
+ bool time_kernel = false;
int num_dim_spatial = 2;
ck::utils::conv::ConvParams params;
@@ -190,7 +190,7 @@ int main(int argc, char* argv[])
{
do_verification = std::stoi(argv[1]);
init_method = std::stoi(argv[2]);
- nrepeat = std::stoi(argv[3]);
+ time_kernel = std::stoi(argv[3]);
num_dim_spatial = std::stoi(argv[4]);
}
@@ -199,21 +199,21 @@ int main(int argc, char* argv[])
params = parse_conv_params(num_dim_spatial, argc, argv);
}
- std::vector input_dims{static_cast(params.N),
- static_cast(params.C)};
+ std::vector input_dims{static_cast(params.N_),
+ static_cast(params.C_)};
input_dims.insert(std::end(input_dims),
- std::begin(params.input_spatial_lengths),
- std::end(params.input_spatial_lengths));
+ std::begin(params.input_spatial_lengths_),
+ std::end(params.input_spatial_lengths_));
- std::vector filter_dims{static_cast(params.K),
- static_cast(params.C)};
+ std::vector filter_dims{static_cast(params.K_),
+ static_cast(params.C_)};
filter_dims.insert(std::end(filter_dims),
- std::begin(params.filter_spatial_lengths),
- std::end(params.filter_spatial_lengths));
+ std::begin(params.filter_spatial_lengths_),
+ std::end(params.filter_spatial_lengths_));
const std::vector& output_spatial_lengths = params.GetOutputSpatialLengths();
- std::vector output_dims{static_cast(params.N),
- static_cast(params.K)};
+ std::vector output_dims{static_cast(params.N_),
+ static_cast(params.K_)};
output_dims.insert(std::end(output_dims),
std::begin(output_spatial_lengths),
std::end(output_spatial_lengths));
@@ -255,16 +255,16 @@ int main(int argc, char* argv[])
conv->MakeArgumentPointer(static_cast(in_device_buf.GetDeviceBuffer()),
static_cast(wei_device_buf.GetDeviceBuffer()),
static_cast(out_device_buf.GetDeviceBuffer()),
- params.N,
- params.K,
- params.C,
- params.input_spatial_lengths,
- params.filter_spatial_lengths,
+ params.N_,
+ params.K_,
+ params.C_,
+ params.input_spatial_lengths_,
+ params.filter_spatial_lengths_,
output_spatial_lengths,
- params.conv_filter_strides,
- params.conv_filter_dilations,
- params.input_left_pads,
- params.input_right_pads,
+ params.conv_filter_strides_,
+ params.conv_filter_dilations_,
+ params.input_left_pads_,
+ params.input_right_pads_,
InElementOp{},
WeiElementOp{},
OutElementOp{});
@@ -276,16 +276,16 @@ int main(int argc, char* argv[])
"not support this Conv problem");
}
- float ave_time = invoker->Run(argument.get(), nrepeat);
+ float ave_time = invoker->Run(argument.get(), StreamConfig{nullptr, time_kernel});
std::size_t flop = get_flops(
- params.N, params.C, params.K, params.filter_spatial_lengths, output_spatial_lengths);
+ params.N_, params.C_, params.K_, params.filter_spatial_lengths_, output_spatial_lengths);
std::size_t num_btype =
- get_btype(params.N,
- params.C,
- params.K,
- params.input_spatial_lengths,
- params.filter_spatial_lengths,
+ get_btype(params.N_,
+ params.C_,
+ params.K_,
+ params.input_spatial_lengths_,
+ params.filter_spatial_lengths_,
output_spatial_lengths);
float tflops = static_cast(flop) / 1.E9 / ave_time;
@@ -301,18 +301,23 @@ int main(int argc, char* argv[])
auto ref_argument = ref_conv.MakeArgument(input,
weights,
host_output,
- params.conv_filter_strides,
- params.conv_filter_dilations,
- params.input_left_pads,
- params.input_right_pads,
+ params.conv_filter_strides_,
+ params.conv_filter_dilations_,
+ params.input_left_pads_,
+ params.input_right_pads_,
InElementOp{},
WeiElementOp{},
OutElementOp{});
ref_invoker.Run(ref_argument);
out_device_buf.FromDevice(device_output.mData.data());
- ck::utils::check_err(
- host_output.mData, device_output.mData, "Error: incorrect results!", 1e-5f, 1e-4f);
+ return ck::utils::check_err(device_output.mData,
+ host_output.mData,
+ "Error: incorrect results!",
+ 1e-5f,
+ 1e-4f)
+ ? 0
+ : 1;
};
switch(num_dim_spatial)
@@ -337,4 +342,5 @@ int main(int argc, char* argv[])
}
}
}
+ return 0;
}
diff --git a/example/09_convnd_fwd/convnd_fwd_xdl_int8.cpp b/example/09_convnd_fwd/convnd_fwd_xdl_int8.cpp
index 34b4645770..f196d27182 100644
--- a/example/09_convnd_fwd/convnd_fwd_xdl_int8.cpp
+++ b/example/09_convnd_fwd/convnd_fwd_xdl_int8.cpp
@@ -5,7 +5,7 @@
#include "check_err.hpp"
#include "config.hpp"
-#include "conv_fwd_util.hpp"
+#include "conv_util.hpp"
#include "device.hpp"
#include "device_tensor.hpp"
#include "device_convnd_fwd_xdl_nhwc_kyxc_nhwk.hpp"
@@ -45,10 +45,10 @@ template
using DeviceConvNDFwdInstance = ck::tensor_operation::device::
DeviceConvNDFwdXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K<
// clang-format off
- InDataType, //
+ InDataType, //
WeiDataType, //
OutDataType, //
- AccDataType, //
+ AccDataType, //
InElementOp, // Input Elementwise Operation
WeiElementOp, // Weights Elementwise Operation
OutElementOp, // Output Elementwise Operation
@@ -112,7 +112,7 @@ void print_use_msg()
{
std::cout << "arg1: verification (0=no, 1=yes)\n"
<< "arg2: initialization (0=no init, 1=integer value, 2=decimal value)\n"
- << "arg3: run kernel # of times (>1)\n"
+ << "arg3: time kernel (0=n0, 1=yes)\n"
<< "arg4: N spatial dimensions (default 2)\n"
<< "Following arguments (depending on number of spatial dims):\n"
<< " N, K, C, \n"
@@ -139,40 +139,40 @@ ck::utils::conv::ConvParams parse_conv_params(int num_dim_spatial, int argc, cha
ck::utils::conv::ConvParams params;
int arg_idx = 5;
- params.num_dim_spatial = num_dim_spatial;
- params.N = std::stoi(argv[arg_idx++]);
- params.K = std::stoi(argv[arg_idx++]);
- params.C = std::stoi(argv[arg_idx++]);
+ params.num_dim_spatial_ = num_dim_spatial;
+ params.N_ = std::stoi(argv[arg_idx++]);
+ params.K_ = std::stoi(argv[arg_idx++]);
+ params.C_ = std::stoi(argv[arg_idx++]);
- params.filter_spatial_lengths.resize(num_dim_spatial);
+ params.filter_spatial_lengths_.resize(num_dim_spatial);
for(int i = 0; i < num_dim_spatial; ++i)
{
- params.filter_spatial_lengths[i] = std::stoi(argv[arg_idx++]);
+ params.filter_spatial_lengths_[i] = std::stoi(argv[arg_idx++]);
}
- params.input_spatial_lengths.resize(num_dim_spatial);
+ params.input_spatial_lengths_.resize(num_dim_spatial);
for(int i = 0; i < num_dim_spatial; ++i)
{
- params.input_spatial_lengths[i] = std::stoi(argv[arg_idx++]);
+ params.input_spatial_lengths_[i] = std::stoi(argv[arg_idx++]);
}
- params.conv_filter_strides.resize(num_dim_spatial);
+ params.conv_filter_strides_.resize(num_dim_spatial);
for(int i = 0; i < num_dim_spatial; ++i)
{
- params.conv_filter_strides[i] = std::stoi(argv[arg_idx++]);
+ params.conv_filter_strides_[i] = std::stoi(argv[arg_idx++]);
}
- params.conv_filter_dilations.resize(num_dim_spatial);
+ params.conv_filter_dilations_.resize(num_dim_spatial);
for(int i = 0; i < num_dim_spatial; ++i)
{
- params.conv_filter_dilations[i] = std::stoi(argv[arg_idx++]);
+ params.conv_filter_dilations_[i] = std::stoi(argv[arg_idx++]);
}
- params.input_left_pads.resize(num_dim_spatial);
+ params.input_left_pads_.resize(num_dim_spatial);
for(int i = 0; i < num_dim_spatial; ++i)
{
- params.input_left_pads[i] = std::stoi(argv[arg_idx++]);
+ params.input_left_pads_[i] = std::stoi(argv[arg_idx++]);
}
- params.input_right_pads.resize(num_dim_spatial);
+ params.input_right_pads_.resize(num_dim_spatial);
for(int i = 0; i < num_dim_spatial; ++i)
{
- params.input_right_pads[i] = std::stoi(argv[arg_idx++]);
+ params.input_right_pads_[i] = std::stoi(argv[arg_idx++]);
}
return params;
@@ -184,9 +184,9 @@ int main(int argc, char* argv[])
{
using namespace ck::utils::conv;
- bool do_verification = 0;
- int init_method = 0;
- int nrepeat = 5;
+ bool do_verification = true;
+ int init_method = 1;
+ bool time_kernel = false;
int num_dim_spatial = 2;
ck::utils::conv::ConvParams params;
@@ -195,7 +195,7 @@ int main(int argc, char* argv[])
{
do_verification = std::stoi(argv[1]);
init_method = std::stoi(argv[2]);
- nrepeat = std::stoi(argv[3]);
+ time_kernel = std::stoi(argv[3]);
num_dim_spatial = std::stoi(argv[4]);
}
@@ -204,21 +204,21 @@ int main(int argc, char* argv[])
params = parse_conv_params(num_dim_spatial, argc, argv);
}
- std::vector input_dims{static_cast(params.N),
- static_cast(params.C)};
+ std::vector input_dims{static_cast(params.N_),
+ static_cast(params.C_)};
input_dims.insert(std::end(input_dims),
- std::begin(params.input_spatial_lengths),
- std::end(params.input_spatial_lengths));
+ std::begin(params.input_spatial_lengths_),
+ std::end(params.input_spatial_lengths_));
- std::vector filter_dims{static_cast(params.K),
- static_cast(params.C)};
+ std::vector filter_dims{static_cast(params.K_),
+ static_cast(params.C_)};
filter_dims.insert(std::end(filter_dims),
- std::begin(params.filter_spatial_lengths),
- std::end(params.filter_spatial_lengths));
+ std::begin(params.filter_spatial_lengths_),
+ std::end(params.filter_spatial_lengths_));
const std::vector& output_spatial_lengths = params.GetOutputSpatialLengths();
- std::vector output_dims{static_cast(params.N),
- static_cast(params.K)};
+ std::vector output_dims{static_cast(params.N_),
+ static_cast(params.K_)};
output_dims.insert(std::end(output_dims),
std::begin(output_spatial_lengths),
std::end(output_spatial_lengths));
@@ -258,16 +258,16 @@ int main(int argc, char* argv[])
conv->MakeArgumentPointer(static_cast(in_device_buf.GetDeviceBuffer()),
static_cast(wei_device_buf.GetDeviceBuffer()),
static_cast(out_device_buf.GetDeviceBuffer()),
- params.N,
- params.K,
- params.C,
- params.input_spatial_lengths,
- params.filter_spatial_lengths,
+ params.N_,
+ params.K_,
+ params.C_,
+ params.input_spatial_lengths_,
+ params.filter_spatial_lengths_,
output_spatial_lengths,
- params.conv_filter_strides,
- params.conv_filter_dilations,
- params.input_left_pads,
- params.input_right_pads,
+ params.conv_filter_strides_,
+ params.conv_filter_dilations_,
+ params.input_left_pads_,
+ params.input_right_pads_,
InElementOp{},
WeiElementOp{},
OutElementOp{});
@@ -279,16 +279,16 @@ int main(int argc, char* argv[])
"not support this Conv problem");
}
- float ave_time = invoker->Run(argument.get(), nrepeat);
+ float ave_time = invoker->Run(argument.get(), StreamConfig{nullptr, time_kernel});
std::size_t flop = get_flops(
- params.N, params.C, params.K, params.filter_spatial_lengths, output_spatial_lengths);
+ params.N_, params.C_, params.K_, params.filter_spatial_lengths_, output_spatial_lengths);
std::size_t num_btype = get_btype(
- params.N,
- params.C,
- params.K,
- params.input_spatial_lengths,
- params.filter_spatial_lengths,
+ params.N_,
+ params.C_,
+ params.K_,
+ params.input_spatial_lengths_,
+ params.filter_spatial_lengths_,
output_spatial_lengths);
float tflops = static_cast(flop) / 1.E9 / ave_time;
@@ -304,18 +304,18 @@ int main(int argc, char* argv[])
auto ref_argument = ref_conv.MakeArgument(input,
weights,
host_output,
- params.conv_filter_strides,
- params.conv_filter_dilations,
- params.input_left_pads,
- params.input_right_pads,
+ params.conv_filter_strides_,
+ params.conv_filter_dilations_,
+ params.input_left_pads_,
+ params.input_right_pads_,
InElementOp{},
WeiElementOp{},
OutElementOp{});
ref_invoker.Run(ref_argument);
out_device_buf.FromDevice(device_output.mData.data());
- ck::utils::check_err(
- host_output.mData, device_output.mData, "Error: incorrect results!", 1e-5f, 1e-4f);
+ return ck::utils::check_err(
+ host_output.mData, device_output.mData, "Error: incorrect results!", 1e-5f, 1e-4f) ? 0 : 1;
};
switch(num_dim_spatial)
@@ -340,4 +340,5 @@ int main(int argc, char* argv[])
}
}
}
+ return 0;
}
diff --git a/example/10_conv2d_bwd_data/CMakeLists.txt b/example/10_conv2d_bwd_data/CMakeLists.txt
index f300bc9645..17aca1481b 100644
--- a/example/10_conv2d_bwd_data/CMakeLists.txt
+++ b/example/10_conv2d_bwd_data/CMakeLists.txt
@@ -1,2 +1,2 @@
add_example_executable(example_conv2d_bwd_data_xdl conv2d_bwd_data_xdl.cpp)
-target_link_libraries(example_conv2d_bwd_data_xdl PRIVATE conv_fwd_util)
+target_link_libraries(example_conv2d_bwd_data_xdl PRIVATE conv_util)
diff --git a/example/10_conv2d_bwd_data/conv2d_bwd_data_xdl.cpp b/example/10_conv2d_bwd_data/conv2d_bwd_data_xdl.cpp
index f3f9b497f5..2d25f5ac2f 100644
--- a/example/10_conv2d_bwd_data/conv2d_bwd_data_xdl.cpp
+++ b/example/10_conv2d_bwd_data/conv2d_bwd_data_xdl.cpp
@@ -77,9 +77,9 @@ using ReferenceConvBwdInstance = ck::tensor_operation::host::ReferenceConvBwdDat
int main(int argc, char* argv[])
{
- bool do_verification = 0;
- int init_method = 0;
- int nrepeat = 5;
+ bool do_verification = true;
+ int init_method = 1;
+ bool time_kernel = false;
// Conv shape
ck::index_t N = 128;
@@ -102,13 +102,13 @@ int main(int argc, char* argv[])
{
do_verification = std::stoi(argv[1]);
init_method = std::stoi(argv[2]);
- nrepeat = std::stoi(argv[3]);
+ time_kernel = std::stoi(argv[3]);
}
else if(argc == 19)
{
do_verification = std::stoi(argv[1]);
init_method = std::stoi(argv[2]);
- nrepeat = std::stoi(argv[3]);
+ time_kernel = std::stoi(argv[3]);
N = std::stoi(argv[4]);
K = std::stoi(argv[5]);
@@ -130,7 +130,7 @@ int main(int argc, char* argv[])
{
printf("arg1: verification (0=no, 1=yes)\n");
printf("arg2: initialization (0=no init, 1=integer value, 2=decimal value)\n");
- printf("arg3: run kernel # of times (>1)\n");
+ printf("arg3: time kernel (0=n0, 1=yes)\n");
printf("arg4 to 18: N, K, C, Y, X, Hi, Wi, Sy, Sx, Dy, Dx, LeftPy, LeftPx, RightPy, "
"RightPx\n");
exit(0);
@@ -214,7 +214,7 @@ int main(int argc, char* argv[])
"not support this Conv problem");
}
- float ave_time = invoker.Run(argument, nrepeat);
+ float ave_time = invoker.Run(argument, StreamConfig{nullptr, time_kernel});
std::size_t flop = std::size_t(2) * N * K * Ho * Wo * C * Y * X;
@@ -249,6 +249,10 @@ int main(int argc, char* argv[])
in_device_buf.FromDevice(in_n_c_hi_wi_device_result.mData.data());
- ck::utils::check_err(in_n_c_hi_wi_device_result.mData, in_n_c_hi_wi_host_result.mData);
+ return ck::utils::check_err(in_n_c_hi_wi_device_result.mData,
+ in_n_c_hi_wi_host_result.mData)
+ ? 0
+ : 1;
}
+ return 0;
}
diff --git a/example/11_conv2d_bwd_weight/CMakeLists.txt b/example/11_conv2d_bwd_weight/CMakeLists.txt
index ff001eab72..3d771b5569 100644
--- a/example/11_conv2d_bwd_weight/CMakeLists.txt
+++ b/example/11_conv2d_bwd_weight/CMakeLists.txt
@@ -1,2 +1,2 @@
add_example_executable(example_conv2d_bwd_weight_xdl conv2d_bwd_weight_xdl.cpp)
-target_link_libraries(example_conv2d_bwd_weight_xdl PRIVATE conv_fwd_util)
+target_link_libraries(example_conv2d_bwd_weight_xdl PRIVATE conv_util)
diff --git a/example/11_conv2d_bwd_weight/conv2d_bwd_weight_xdl.cpp b/example/11_conv2d_bwd_weight/conv2d_bwd_weight_xdl.cpp
index bf78cc87e0..1578161116 100644
--- a/example/11_conv2d_bwd_weight/conv2d_bwd_weight_xdl.cpp
+++ b/example/11_conv2d_bwd_weight/conv2d_bwd_weight_xdl.cpp
@@ -82,9 +82,9 @@ using ReferenceConvBwdWeightInstance =
int main(int argc, char* argv[])
{
- bool do_verification = 0;
- int init_method = 0;
- int nrepeat = 5;
+ bool do_verification = true;
+ int init_method = 1;
+ bool time_kernel = false;
int do_log = 0;
int split_k = 4;
@@ -109,7 +109,7 @@ int main(int argc, char* argv[])
{
do_verification = std::stoi(argv[1]);
init_method = std::stoi(argv[2]);
- nrepeat = std::stoi(argv[3]);
+ time_kernel = std::stoi(argv[3]);
do_log = std::stoi(argv[4]);
split_k = std::stoi(argv[5]);
}
@@ -117,7 +117,7 @@ int main(int argc, char* argv[])
{
do_verification = std::stoi(argv[1]);
init_method = std::stoi(argv[2]);
- nrepeat = std::stoi(argv[3]);
+ time_kernel = std::stoi(argv[3]);
do_log = std::stoi(argv[4]);
split_k = std::stoi(argv[5]);
@@ -141,7 +141,7 @@ int main(int argc, char* argv[])
{
printf("arg1: verification (0=no, 1=yes)\n");
printf("arg2: initialization (0=no init, 1=integer value, 2=decimal value)\n");
- printf("arg3: run kernel # of times (>1)\n");
+ printf("arg3: time kernel (0=n0, 1=yes)\n");
printf("arg4: is show log (0=no, 1=yes)\n");
printf("arg5: split-k \n");
printf("arg6 to 19: N, K, C, Y, X, Hi, Wi, Sy, Sx, Dy, Dx, LeftPy, LeftPx, RightPy, "
@@ -246,7 +246,7 @@ int main(int argc, char* argv[])
return 1;
}
- float ave_time = invoker.Run(argument, nrepeat);
+ float ave_time = invoker.Run(argument, StreamConfig{nullptr, time_kernel});
std::size_t flop = std::size_t(2) * N * K * Ho * Wo * C * Y * X;
@@ -291,6 +291,9 @@ int main(int argc, char* argv[])
LogRangeAsType(std::cout << "wei_host : ", wei_k_c_y_x_host_result.mData, ",")
<< std::endl;
}
- ck::utils::check_err(wei_k_c_y_x_device_result.mData, wei_k_c_y_x_host_result.mData);
+ return ck::utils::check_err(wei_k_c_y_x_device_result.mData, wei_k_c_y_x_host_result.mData)
+ ? 0
+ : 1;
}
+ return 0;
}
diff --git a/example/12_reduce/CMakeLists.txt b/example/12_reduce/CMakeLists.txt
index 734c1955d6..d6866abeb8 100644
--- a/example/12_reduce/CMakeLists.txt
+++ b/example/12_reduce/CMakeLists.txt
@@ -1 +1 @@
-add_example_executable(example_reduce_blockwise reduce_blockwise.cpp)
+add_example_executable(example_reduce_blockwise reduce_blockwise.cpp -D 16,64,32,960 -v 1 1 10)
diff --git a/example/12_reduce/reduce_blockwise.cpp b/example/12_reduce/reduce_blockwise.cpp
index 7ca9823ff5..b2d312ae8c 100644
--- a/example/12_reduce/reduce_blockwise.cpp
+++ b/example/12_reduce/reduce_blockwise.cpp
@@ -116,10 +116,9 @@ class SimpleAppArgs
std::vector inLengths;
std::vector scales;
- bool do_verification = false;
-
- int init_method = 1;
- int nrepeat = 5;
+ bool do_verification = true;
+ int init_method = 1;
+ bool time_kernel = false;
public:
void show_usage(const char* cmd)
@@ -135,7 +134,7 @@ class SimpleAppArgs
std::cout << "Arg1 -- init method (0=no init, 1=single integer value, 2=scope integer "
"value, 3=decimal value)"
<< std::endl;
- std::cout << "Arg2 -- number of repeats to run the kernel" << std::endl;
+ std::cout << "Arg2 -- time kernel (0=n0, 1=yes)" << std::endl;
};
int processArgs(int argc, char* argv[])
@@ -182,7 +181,7 @@ class SimpleAppArgs
throw std::runtime_error("Invalid cmd-line arguments, more argumetns are needed!");
init_method = std::atoi(argv[optind++]);
- nrepeat = std::atoi(argv[optind]);
+ time_kernel = std::atoi(argv[optind]);
if(scales.empty())
{
@@ -352,7 +351,7 @@ int main(int argc, char* argv[])
auto invoker_ptr = reduce.MakeInvokerPointer();
- float avg_time = invoker_ptr->Run(argument_ptr.get(), args.nrepeat);
+ float avg_time = invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, args.time_kernel});
std::size_t num_bytes = invariant_total_length * reduce_total_length * sizeof(InDataType) +
invariant_total_length * sizeof(OutDataType);
@@ -362,16 +361,17 @@ int main(int argc, char* argv[])
std::cout << "Perf: " << avg_time << " ms, " << gb_per_sec << " GB/s, " << reduce_name
<< std::endl;
+ bool pass = true;
if(args.do_verification)
{
out_dev.FromDevice(out.mData.data());
- ck::utils::check_err(out.mData, out_ref.mData);
+ pass &= ck::utils::check_err(out.mData, out_ref.mData);
if(NeedIndices)
{
out_indices_dev.FromDevice(out_indices.mData.data());
- ck::utils::check_err(out_indices.mData, out_indices_ref.mData);
- ;
+ pass &= ck::utils::check_err(out_indices.mData, out_indices_ref.mData);
};
};
+ return pass ? 0 : 1;
}
diff --git a/example/13_pool2d_fwd/pool2d_fwd.cpp b/example/13_pool2d_fwd/pool2d_fwd.cpp
index a18761095c..e6749bf8d7 100644
--- a/example/13_pool2d_fwd/pool2d_fwd.cpp
+++ b/example/13_pool2d_fwd/pool2d_fwd.cpp
@@ -149,9 +149,9 @@ int main(int argc, char* argv[])
{
using namespace ck::host_reduce;
- bool do_verification = 0;
- int init_method = 0;
- int nrepeat = 5;
+ bool do_verification = true;
+ int init_method = 1;
+ bool time_kernel = false;
// Pool shape
ck::index_t N = 128;
@@ -171,13 +171,13 @@ int main(int argc, char* argv[])
{
do_verification = std::stoi(argv[1]);
init_method = std::stoi(argv[2]);
- nrepeat = std::stoi(argv[3]);
+ time_kernel = std::stoi(argv[3]);
}
else if(argc == 16)
{
do_verification = std::stoi(argv[1]);
init_method = std::stoi(argv[2]);
- nrepeat = std::stoi(argv[3]);
+ time_kernel = std::stoi(argv[3]);
N = std::stoi(argv[4]);
C = std::stoi(argv[5]);
@@ -196,7 +196,7 @@ int main(int argc, char* argv[])
{
printf("arg1: verification (0=no, 1=yes)\n");
printf("arg2: initialization (0=no init, 1=integer value, 2=decimal value)\n");
- printf("arg3: run kernel # of times (>1)\n");
+ printf("arg3: time kernel (0=n0, 1=yes)\n");
printf("arg4 to 15: N, C, Y, X, Hi, Wi, Sy, Sx, LeftPy, LeftPx, RightPy, "
"RightPx\n");
exit(0);
@@ -271,7 +271,7 @@ int main(int argc, char* argv[])
"not support this problem");
}
- float ave_time = invoker_ptr->Run(argument_ptr.get(), nrepeat);
+ float ave_time = invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, time_kernel});
std::size_t flop = std::size_t(2) * N * C * Ho * Wo * Y * X;
@@ -285,6 +285,7 @@ int main(int argc, char* argv[])
std::cout << "Perf: " << ave_time << " ms, " << tflops << " TFlops, " << gb_per_sec << " GB/s"
<< std::endl;
+ bool pass = true;
if(do_verification)
{
pool_host_verify1)\n");
+ printf("arg3: time kernel (0=n0, 1=yes)\n");
printf("arg4 to 9: M (256x), N(128x), K(32x), StrideA, StrideB, StrideC\n");
exit(0);
}
@@ -219,7 +219,7 @@ int main(int argc, char* argv[])
"not support this GEMM problem");
}
- float ave_time = invoker.Run(argument, nrepeat);
+ float ave_time = invoker.Run(argument, StreamConfig{nullptr, time_kernel});
std::size_t flop = std::size_t(2) * M * N * K;
std::size_t num_btype =
@@ -244,7 +244,7 @@ int main(int argc, char* argv[])
ref_invoker.Run(ref_argument);
- ck::utils::check_err(c_m_n_device_result.mData, c_m_n_host_result.mData);
+ return ck::utils::check_err(c_m_n_device_result.mData, c_m_n_host_result.mData) ? 0 : 1;
}
return 0;
diff --git a/example/15_grouped_gemm/grouped_gemm_xdl_fp16.cpp b/example/15_grouped_gemm/grouped_gemm_xdl_fp16.cpp
index 29ef01f2ef..8c3491c8c9 100644
--- a/example/15_grouped_gemm/grouped_gemm_xdl_fp16.cpp
+++ b/example/15_grouped_gemm/grouped_gemm_xdl_fp16.cpp
@@ -60,21 +60,21 @@ using ReferenceGemmInstance = ck::tensor_operation::host::
int main(int argc, char* argv[])
{
- bool do_verification = 0;
- int init_method = 0;
- int nrepeat = 5;
+ bool do_verification = true;
+ int init_method = 1;
+ bool time_kernel = false;
if(argc == 4)
{
do_verification = std::stoi(argv[1]);
init_method = std::stoi(argv[2]);
- nrepeat = std::stoi(argv[3]);
+ time_kernel = std::stoi(argv[3]);
}
else
{
printf("arg1: verification (0=no, 1=yes)\n");
printf("arg2: initialization (0=no init, 1=integer value, 2=decimal value)\n");
- printf("arg3: run kernel # of times (>1)\n");
+ printf("arg3: time kernel (0=n0, 1=yes)\n");
exit(0);
}
@@ -202,7 +202,7 @@ int main(int argc, char* argv[])
"not support this GEMM problem");
}
- float ave_time = invoker.Run(argument, nrepeat);
+ float ave_time = invoker.Run(argument, StreamConfig{nullptr, time_kernel});
float tflops = static_cast(flop) / 1.E9 / ave_time;
@@ -211,6 +211,7 @@ int main(int argc, char* argv[])
std::cout << "Perf: " << ave_time << " ms, " << tflops << " TFlops, " << gb_per_sec << " GB/s, "
<< gemm.GetTypeString() << std::endl;
+ bool pass = true;
if(do_verification)
{
for(std::size_t i = 0; i < gemm_shapes.size(); i++)
@@ -227,9 +228,9 @@ int main(int argc, char* argv[])
c_element_op);
ref_invoker.Run(ref_argument);
- ck::utils::check_err(c_device_tensors[i].mData, c_host_tensors[i].mData);
+ pass &= ck::utils::check_err(c_device_tensors[i].mData, c_host_tensors[i].mData);
}
}
- return 0;
+ return pass ? 0 : 1;
}
diff --git a/example/16_gemm_reduce/gemm_reduce_xdl_fp16.cpp b/example/16_gemm_reduce/gemm_reduce_xdl_fp16.cpp
index 90064ae584..860d9eea2a 100644
--- a/example/16_gemm_reduce/gemm_reduce_xdl_fp16.cpp
+++ b/example/16_gemm_reduce/gemm_reduce_xdl_fp16.cpp
@@ -4,6 +4,7 @@
#include
#include
#include
+#include "check_err.hpp"
#include "config.hpp"
#include "device.hpp"
#include "host_tensor.hpp"
@@ -58,9 +59,9 @@ using ReferenceGemmInstance = ck::tensor_operation::host::
int main(int argc, char* argv[])
{
- bool do_verification = 1;
+ bool do_verification = true;
int init_method = 1;
- int nrepeat = 5;
+ bool time_kernel = false;
// GEMM shape
ck::index_t M = 3840;
@@ -79,13 +80,13 @@ int main(int argc, char* argv[])
{
do_verification = std::stoi(argv[1]);
init_method = std::stoi(argv[2]);
- nrepeat = std::stoi(argv[3]);
+ time_kernel = std::stoi(argv[3]);
}
else if(argc == 10)
{
do_verification = std::stoi(argv[1]);
init_method = std::stoi(argv[2]);
- nrepeat = std::stoi(argv[3]);
+ time_kernel = std::stoi(argv[3]);
M = std::stoi(argv[4]);
N = std::stoi(argv[5]);
@@ -99,7 +100,7 @@ int main(int argc, char* argv[])
{
printf("arg1: verification (0=no, 1=yes)\n");
printf("arg2: initialization (0=no init, 1=integer value, 2=decimal value)\n");
- printf("arg3: run kernel # of times (>1)\n");
+ printf("arg3: time kernel (0=n0, 1=yes)\n");
printf("arg4 to 9: M (256x), N(128x), K(32x), StrideA, StrideB, StrideC\n");
exit(0);
}
@@ -192,30 +193,13 @@ int main(int argc, char* argv[])
"not support this GEMM problem");
}
- // warm up
- invoker.Run(argument);
+ // init DO, D1 to 0
+ d0_device_buf.SetZero();
+ d1_device_buf.SetZero();
- // timing
- float total_time = 0;
-
- for(int i = 0; i < nrepeat; ++i)
- {
- // init DO, D1 to 0
- d0_device_buf.SetZero();
- d1_device_buf.SetZero();
-
- KernelTimer timer;
-
- timer.Start();
-
- invoker.Run(argument);
-
- timer.End();
-
- total_time += timer.GetElapsedTime();
- }
-
- float ave_time = total_time / nrepeat;
+ // if time_kernel == true, kernel will run multiple times. This kernel use atomic-add so result
+ // will not be correct. need to set time_kernel = false for correctness test
+ float ave_time = invoker.Run(argument, StreamConfig{nullptr, time_kernel});
std::size_t flop = std::size_t(2) * M * N * K;
std::size_t num_btype =
@@ -228,6 +212,7 @@ int main(int argc, char* argv[])
std::cout << "Perf: " << ave_time << " ms, " << tflops << " TFlops, " << gb_per_sec << " GB/s, "
<< gemm.GetTypeString() << std::endl;
+ bool pass = true;
if(do_verification)
{
c_device_buf.FromDevice(c_m_n_device_result.mData.data());
@@ -264,10 +249,19 @@ int main(int argc, char* argv[])
d1_m_host_result(m) = ck::type_convert(d1_acc);
}
- check_error(c_m_n_host_result, c_m_n_device_result);
- check_error(d0_m_host_result, d0_m_device_result);
- check_error(d1_m_host_result, d1_m_device_result);
+ pass &= ck::utils::check_err(
+ c_m_n_device_result.mData, c_m_n_host_result.mData, "Error: Incorrect results c");
+ pass &= ck::utils::check_err(d0_m_device_result.mData,
+ d0_m_host_result.mData,
+ "Error: Incorrect results d0",
+ 1e-3,
+ 1e-3);
+ pass &= ck::utils::check_err(d1_m_device_result.mData,
+ d1_m_host_result.mData,
+ "Error: Incorrect results d1",
+ 1e-3,
+ 1e-3);
}
- return 0;
+ return pass ? 0 : 1;
}
diff --git a/example/17_convnd_bwd_data_xdl/CMakeLists.txt b/example/17_convnd_bwd_data_xdl/CMakeLists.txt
index 0ed906f8f7..963f311703 100644
--- a/example/17_convnd_bwd_data_xdl/CMakeLists.txt
+++ b/example/17_convnd_bwd_data_xdl/CMakeLists.txt
@@ -1,2 +1,2 @@
add_example_executable(example_convnd_bwd_data_xdl convnd_bwd_data_xdl.cpp)
-target_link_libraries(example_convnd_bwd_data_xdl PRIVATE conv_fwd_util)
+target_link_libraries(example_convnd_bwd_data_xdl PRIVATE conv_util)
diff --git a/example/17_convnd_bwd_data_xdl/convnd_bwd_data_xdl.cpp b/example/17_convnd_bwd_data_xdl/convnd_bwd_data_xdl.cpp
index 962627ce90..ff2cfac1fa 100644
--- a/example/17_convnd_bwd_data_xdl/convnd_bwd_data_xdl.cpp
+++ b/example/17_convnd_bwd_data_xdl/convnd_bwd_data_xdl.cpp
@@ -6,7 +6,7 @@
#include
#include "config.hpp"
-#include "conv_fwd_util.hpp"
+#include "conv_util.hpp"
#include "print.hpp"
#include "device.hpp"
#include "host_tensor.hpp"
@@ -87,7 +87,7 @@ void print_use_msg()
{
std::cout << "arg1: verification (0=no, 1=yes)\n"
<< "arg2: initialization (0=no init, 1=random value, 2= init to 1 )\n"
- << "arg3: run kernel # of times (>1)\n"
+ << "arg3: time kernel (0=n0, 1=yes)\n"
<< "arg4: N spatial dimensions (default 2)\n"
<< "Following arguments (depending on number of spatial dims):\n"
<< " N, K, C, \n"
@@ -105,40 +105,40 @@ ck::utils::conv::ConvParams parse_conv_params(int num_dim_spatial, char* argv[])
ck::utils::conv::ConvParams params;
int arg_idx = 5;
- params.num_dim_spatial = num_dim_spatial;
- params.N = std::stoi(argv[arg_idx++]);
- params.K = std::stoi(argv[arg_idx++]);
- params.C = std::stoi(argv[arg_idx++]);
+ params.num_dim_spatial_ = num_dim_spatial;
+ params.N_ = std::stoi(argv[arg_idx++]);
+ params.K_ = std::stoi(argv[arg_idx++]);
+ params.C_ = std::stoi(argv[arg_idx++]);
- params.filter_spatial_lengths.resize(num_dim_spatial);
+ params.filter_spatial_lengths_.resize(num_dim_spatial);
for(int i = 0; i < num_dim_spatial; ++i)
{
- params.filter_spatial_lengths[i] = std::stoi(argv[arg_idx++]);
+ params.filter_spatial_lengths_[i] = std::stoi(argv[arg_idx++]);
}
- params.input_spatial_lengths.resize(num_dim_spatial);
+ params.input_spatial_lengths_.resize(num_dim_spatial);
for(int i = 0; i < num_dim_spatial; ++i)
{
- params.input_spatial_lengths[i] = std::stoi(argv[arg_idx++]);
+ params.input_spatial_lengths_[i] = std::stoi(argv[arg_idx++]);
}
- params.conv_filter_strides.resize(num_dim_spatial);
+ params.conv_filter_strides_.resize(num_dim_spatial);
for(int i = 0; i < num_dim_spatial; ++i)
{
- params.conv_filter_strides[i] = std::stoi(argv[arg_idx++]);
+ params.conv_filter_strides_[i] = std::stoi(argv[arg_idx++]);
}
- params.conv_filter_dilations.resize(num_dim_spatial);
+ params.conv_filter_dilations_.resize(num_dim_spatial);
for(int i = 0; i < num_dim_spatial; ++i)
{
- params.conv_filter_dilations[i] = std::stoi(argv[arg_idx++]);
+ params.conv_filter_dilations_[i] = std::stoi(argv[arg_idx++]);
}
- params.input_left_pads.resize(num_dim_spatial);
+ params.input_left_pads_.resize(num_dim_spatial);
for(int i = 0; i < num_dim_spatial; ++i)
{
- params.input_left_pads[i] = std::stoi(argv[arg_idx++]);
+ params.input_left_pads_[i] = std::stoi(argv[arg_idx++]);
}
- params.input_right_pads.resize(num_dim_spatial);
+ params.input_right_pads_.resize(num_dim_spatial);
for(int i = 0; i < num_dim_spatial; ++i)
{
- params.input_right_pads[i] = std::stoi(argv[arg_idx++]);
+ params.input_right_pads_[i] = std::stoi(argv[arg_idx++]);
}
return params;
@@ -165,25 +165,25 @@ DeviceConvBwdDataBasePtr get_conv_instance(int num_dim_spatial)
int main(int argc, char* argv[])
{
- bool do_verification = 0;
- int init_method = 0;
- int nrepeat = 5;
+ bool do_verification = true;
+ int init_method = 1;
+ bool time_kernel = false;
int num_dim_spatial = 2;
ck::utils::conv::ConvParams params;
- params.C = 128;
+ params.C_ = 128;
if(argc == 4)
{
do_verification = std::stoi(argv[1]);
init_method = std::stoi(argv[2]);
- nrepeat = std::stoi(argv[3]);
+ time_kernel = std::stoi(argv[3]);
}
else if(argc > 4)
{
do_verification = std::stoi(argv[1]);
init_method = std::stoi(argv[2]);
- nrepeat = std::stoi(argv[3]);
+ time_kernel = std::stoi(argv[3]);
num_dim_spatial = std::stoi(argv[4]);
// check args number
int conv_args = 3 + num_dim_spatial * 6;
@@ -202,21 +202,21 @@ int main(int argc, char* argv[])
exit(1);
}
- std::vector input_dims{static_cast(params.N),
- static_cast(params.C)};
+ std::vector input_dims{static_cast(params.N_),
+ static_cast(params.C_)};
input_dims.insert(std::end(input_dims),
- std::begin(params.input_spatial_lengths),
- std::end(params.input_spatial_lengths));
+ std::begin(params.input_spatial_lengths_),
+ std::end(params.input_spatial_lengths_));
- std::vector filter_dims{static_cast(params.K),
- static_cast(params.C)};
+ std::vector filter_dims{static_cast(params.K_),
+ static_cast(params.C_)};
filter_dims.insert(std::end(filter_dims),
- std::begin(params.filter_spatial_lengths),
- std::end(params.filter_spatial_lengths));
+ std::begin(params.filter_spatial_lengths_),
+ std::end(params.filter_spatial_lengths_));
const std::vector& output_spatial_lengths = params.GetOutputSpatialLengths();
- std::vector output_dims{static_cast(params.N),
- static_cast(params.K)};
+ std::vector output_dims{static_cast(params.N_),
+ static_cast(params.K_)};
output_dims.insert(std::end(output_dims),
std::begin(output_spatial_lengths),
std::end(output_spatial_lengths));
@@ -263,16 +263,16 @@ int main(int argc, char* argv[])
conv->MakeArgumentPointer(static_cast(in_device_buf.GetDeviceBuffer()),
static_cast(wei_device_buf.GetDeviceBuffer()),
static_cast(out_device_buf.GetDeviceBuffer()),
- params.N,
- params.K,
- params.C,
- params.input_spatial_lengths,
- params.filter_spatial_lengths,
+ params.N_,
+ params.K_,
+ params.C_,
+ params.input_spatial_lengths_,
+ params.filter_spatial_lengths_,
output_spatial_lengths,
- params.conv_filter_strides,
- params.conv_filter_dilations,
- params.input_left_pads,
- params.input_right_pads,
+ params.conv_filter_strides_,
+ params.conv_filter_dilations_,
+ params.input_left_pads_,
+ params.input_right_pads_,
InElementOp{},
WeiElementOp{},
OutElementOp{});
@@ -284,16 +284,16 @@ int main(int argc, char* argv[])
"not support this Conv problem");
}
- float ave_time = invoker->Run(argument.get(), nrepeat);
+ float ave_time = invoker->Run(argument.get(), StreamConfig{nullptr, time_kernel});
std::size_t flop = ck::utils::conv::get_flops(
- params.N, params.C, params.K, params.filter_spatial_lengths, output_spatial_lengths);
+ params.N_, params.C_, params.K_, params.filter_spatial_lengths_, output_spatial_lengths);
std::size_t num_btype = ck::utils::conv::get_btype(
- params.N,
- params.C,
- params.K,
- params.input_spatial_lengths,
- params.filter_spatial_lengths,
+ params.N_,
+ params.C_,
+ params.K_,
+ params.input_spatial_lengths_,
+ params.filter_spatial_lengths_,
output_spatial_lengths);
float tflops = static_cast(flop) / 1.E9 / ave_time;
@@ -310,10 +310,10 @@ int main(int argc, char* argv[])
auto ref_argument = ref_conv.MakeArgument(in_n_c_hi_wi_host_result,
wei_k_c_y_x,
out_n_k_ho_wo,
- params.conv_filter_strides,
- params.conv_filter_dilations,
- params.input_left_pads,
- params.input_right_pads,
+ params.conv_filter_strides_,
+ params.conv_filter_dilations_,
+ params.input_left_pads_,
+ params.input_right_pads_,
InElementOp{},
WeiElementOp{},
OutElementOp{});
@@ -322,7 +322,10 @@ int main(int argc, char* argv[])
in_device_buf.FromDevice(in_n_c_hi_wi_device_result.mData.data());
- check_error(in_n_c_hi_wi_host_result, in_n_c_hi_wi_device_result);
+ return ck::utils::check_err(in_n_c_hi_wi_device_result.mData,
+ in_n_c_hi_wi_host_result.mData)
+ ? 0
+ : 1;
};
switch(num_dim_spatial)
@@ -347,4 +350,5 @@ int main(int argc, char* argv[])
}
}
}
+ return 0;
}
diff --git a/example/18_batched_gemm_reduce/batched_gemm_reduce_xdl_fp16.cpp b/example/18_batched_gemm_reduce/batched_gemm_reduce_xdl_fp16.cpp
index eb18655d1b..d993c8e8d1 100644
--- a/example/18_batched_gemm_reduce/batched_gemm_reduce_xdl_fp16.cpp
+++ b/example/18_batched_gemm_reduce/batched_gemm_reduce_xdl_fp16.cpp
@@ -4,6 +4,7 @@
#include
#include
#include
+#include "check_err.hpp"
#include "config.hpp"
#include "device.hpp"
#include "host_tensor.hpp"
@@ -57,18 +58,18 @@ using ReferenceBatchedGemmInstance = ck::tensor_operation::host::
int main(int argc, char* argv[])
{
- bool do_verification = 1;
+ bool do_verification = true;
int init_method = 1;
- int nrepeat = 5;
+ bool time_kernel = false;
// GEMM shape
- ck::index_t M = 3840;
- ck::index_t N = 4096;
- ck::index_t K = 4096;
+ ck::index_t M = 2048;
+ ck::index_t N = 1920;
+ ck::index_t K = 2048;
- ck::index_t StrideA = 4096;
- ck::index_t StrideB = 4096;
- ck::index_t StrideC = 4096;
+ ck::index_t StrideA = 2048;
+ ck::index_t StrideB = 2048;
+ ck::index_t StrideC = 1920;
ck::index_t BatchCount = 4;
@@ -80,13 +81,13 @@ int main(int argc, char* argv[])
{
do_verification = std::stoi(argv[1]);
init_method = std::stoi(argv[2]);
- nrepeat = std::stoi(argv[3]);
+ time_kernel = std::stoi(argv[3]);
}
else if(argc == 11)
{
do_verification = std::stoi(argv[1]);
init_method = std::stoi(argv[2]);
- nrepeat = std::stoi(argv[3]);
+ time_kernel = std::stoi(argv[3]);
M = std::stoi(argv[4]);
N = std::stoi(argv[5]);
@@ -96,13 +97,13 @@ int main(int argc, char* argv[])
StrideB = std::stoi(argv[8]);
StrideC = std::stoi(argv[9]);
- BatchCount = std::stoi(argv[9]);
+ BatchCount = std::stoi(argv[10]);
}
else
{
printf("arg1: verification (0=no, 1=yes)\n");
printf("arg2: initialization (0=no init, 1=integer value, 2=decimal value)\n");
- printf("arg3: run kernel # of times (>1)\n");
+ printf("arg3: time kernel (0=n0, 1=yes)\n");
printf("arg4 to 10: M (256x), N(128x), K(32x), StrideA, StrideB, StrideC, BatchCount\n");
exit(0);
}
@@ -204,30 +205,13 @@ int main(int argc, char* argv[])
"not support this GEMM problem");
}
- // warm up
- invoker.Run(argument);
+ // init DO, D1 to 0
+ d0_device_buf.SetZero();
+ d1_device_buf.SetZero();
- // timing
- float total_time = 0;
-
- for(int i = 0; i < nrepeat; ++i)
- {
- // init DO, D1 to 0
- d0_device_buf.SetZero();
- d1_device_buf.SetZero();
-
- KernelTimer timer;
-
- timer.Start();
-
- invoker.Run(argument);
-
- timer.End();
-
- total_time += timer.GetElapsedTime();
- }
-
- float ave_time = total_time / nrepeat;
+ // if time_kernel == true, kernel will run multiple times. This kernel use atomic-add so result
+ // will not be correct. need to set time_kernel = false for correctness test
+ float ave_time = invoker.Run(argument, StreamConfig{nullptr, time_kernel});
std::size_t flop = std::size_t(2) * BatchCount * M * N * K;
std::size_t num_btype = sizeof(ADataType) * BatchCount * M * K +
@@ -241,6 +225,7 @@ int main(int argc, char* argv[])
std::cout << "Perf: " << ave_time << " ms, " << tflops << " TFlops, " << gb_per_sec << " GB/s, "
<< batched_gemm.GetTypeString() << std::endl;
+ bool pass = true;
if(do_verification)
{
c_device_buf.FromDevice(c_g_m_n_device_result.mData.data());
@@ -264,7 +249,7 @@ int main(int argc, char* argv[])
for(int n = 0; n < N; ++n)
{
- float d0_val = ck::type_convert(c_g_m_n_host_result(m, n));
+ float d0_val = ck::type_convert(c_g_m_n_host_result(batch, m, n));
float d1_val;
d1_element_op(d1_val, d0_val);
@@ -277,10 +262,18 @@ int main(int argc, char* argv[])
}
}
- check_error(c_g_m_n_host_result, c_g_m_n_device_result);
- check_error(d0_g_m_host_result, d0_g_m_device_result);
- check_error(d1_g_m_host_result, d1_g_m_device_result);
+ pass &= ck::utils::check_err(c_g_m_n_host_result.mData, c_g_m_n_device_result.mData);
+ pass &= ck::utils::check_err(d0_g_m_device_result.mData,
+ d0_g_m_host_result.mData,
+ "Error: Incorrect results! D0",
+ 1e-3,
+ 1e-3);
+ pass &= ck::utils::check_err(d1_g_m_device_result.mData,
+ d1_g_m_host_result.mData,
+ "Error: Incorrect results! D1",
+ 1e-3,
+ 1e-3);
}
- return 0;
+ return pass ? 0 : 1;
}
diff --git a/example/19_cgemm/cgemm_xdl_bf16.cpp b/example/19_cgemm/cgemm_xdl_bf16.cpp
index 309fa6ac86..836a3c13dc 100644
--- a/example/19_cgemm/cgemm_xdl_bf16.cpp
+++ b/example/19_cgemm/cgemm_xdl_bf16.cpp
@@ -88,9 +88,9 @@ using ReferenceCGemmInstance = ck::tensor_operation::host::
int main(int argc, char* argv[])
{
- bool do_verification = 0;
- int init_method = 0;
- int nrepeat = 5;
+ bool do_verification = true;
+ int init_method = 1;
+ bool time_kernel = false;
// CGEMM shape
ck::index_t M = 3840;
@@ -105,13 +105,13 @@ int main(int argc, char* argv[])
{
do_verification = std::stoi(argv[1]);
init_method = std::stoi(argv[2]);
- nrepeat = std::stoi(argv[3]);
+ time_kernel = std::stoi(argv[3]);
}
else if(argc == 10)
{
do_verification = std::stoi(argv[1]);
init_method = std::stoi(argv[2]);
- nrepeat = std::stoi(argv[3]);
+ time_kernel = std::stoi(argv[3]);
M = std::stoi(argv[4]);
N = std::stoi(argv[5]);
@@ -223,7 +223,7 @@ int main(int argc, char* argv[])
"not support this CGEMM problem");
}
- float ave_time = invoker.Run(argument, nrepeat);
+ float ave_time = invoker.Run(argument, StreamConfig{nullptr, time_kernel});
std::size_t flop = std::size_t(8) * M * N * K;
std::size_t num_btype = std::size_t(2) * sizeof(ADataType) * M * K + sizeof(BDataType) * K * N +
diff --git a/example/CMakeLists.txt b/example/CMakeLists.txt
index 5ea3889844..051242ce2a 100644
--- a/example/CMakeLists.txt
+++ b/example/CMakeLists.txt
@@ -19,9 +19,18 @@ include_directories(BEFORE
add_custom_target(examples)
-function(add_example_executable EXAMPLE_NAME)
+function(add_example_executable EXAMPLE_NAME FILE_NAME)
message("adding example ${EXAMPLE_NAME}")
- add_executable(${EXAMPLE_NAME} ${ARGN})
+ add_executable(${EXAMPLE_NAME} ${FILE_NAME})
+ target_link_libraries(${EXAMPLE_NAME} PRIVATE host_tensor)
+ add_test(NAME ${EXAMPLE_NAME} COMMAND $ ${ARGN})
+ add_dependencies(examples ${EXAMPLE_NAME})
+ add_dependencies(check ${EXAMPLE_NAME})
+endfunction(add_example_executable EXAMPLE_NAME)
+
+function(add_example_executable_no_testing EXAMPLE_NAME FILE_NAME)
+ message("adding example ${EXAMPLE_NAME}")
+ add_executable(${EXAMPLE_NAME} ${FILE_NAME})
target_link_libraries(${EXAMPLE_NAME} PRIVATE host_tensor)
add_dependencies(examples ${EXAMPLE_NAME})
endfunction(add_example_executable EXAMPLE_NAME)
diff --git a/include/ck/config.hpp b/include/ck/config.hpp
index e6deefcbe3..710cd552d7 100644
--- a/include/ck/config.hpp
+++ b/include/ck/config.hpp
@@ -109,6 +109,10 @@
// experimental feature: use __builtin_memcpy instead of union to do bit_cast
#define CK_EXPERIMENTAL_USE_MEMCPY_FOR_BIT_CAST 1
+// experimental feature: optimize for inter-wave scheduling policy
+#define CK_EXPERIMENTAL_INTER_WAVE_SCHEDULING 0
+#define CK_EXPERIMENTAL_INTER_WAVE_SCHEDULING_MAC_CLUSTERS 1
+
// hack: have underlying assumption that need to be satsified, otherwise it's a bug
// hack for forcing register to keep idx_diff_low_const in SGPR. idx_diff_low_const must be
// thread-invariant, otherwise it's a bug
diff --git a/include/ck/hip_version.hpp.in b/include/ck/hip_version.hpp.in
deleted file mode 100644
index 4290ef7e0d..0000000000
--- a/include/ck/hip_version.hpp.in
+++ /dev/null
@@ -1,28 +0,0 @@
-#pragma once
-
-// "_PACKAGE_" to avoid name contentions: the macros like
-// HIP_VERSION_MAJOR are defined in HIP_VERSION.h.
-// clang-format off
-#define CK_HIP_PACKAGE_VERSION_MAJOR @CK_HIP_VERSION_MAJOR@
-#define CK_HIP_PACKAGE_VERSION_MINOR @CK_HIP_VERSION_MINOR@
-#define CK_HIP_PACKAGE_VERSION_PATCH @CK_HIP_VERSION_PATCH@
-// clang-format on
-
-#ifndef CK_HIP_PACKAGE_VERSION_MAJOR
-#define CK_HIP_PACKAGE_VERSION_MAJOR 0
-#endif
-#ifndef CK_HIP_PACKAGE_VERSION_MINOR
-#define CK_HIP_PACKAGE_VERSION_MINOR 0
-#endif
-#ifndef CK_HIP_PACKAGE_VERSION_PATCH
-#define CK_HIP_PACKAGE_VERSION_PATCH 0
-#endif
-// 3 decimal digits for major and minor, 6 digits for patch number.
-// Max number is 999,999,999999 == 0xE8,D4A5,0FFF that fits into 64-bit math.
-#if CK_HIP_PACKAGE_VERSION_MAJOR > 999 || CK_HIP_PACKAGE_VERSION_MAJOR > 999 || \
- CK_HIP_PACKAGE_VERSION_PATCH > 999999
-#error "Too big HIP version number(s)"
-#endif
-#define CK_HIP_PACKAGE_VERSION_FLAT \
- ((CK_HIP_PACKAGE_VERSION_MAJOR * 1000ULL + CK_HIP_PACKAGE_VERSION_MINOR) * 1000000 + \
- CK_HIP_PACKAGE_VERSION_PATCH)
diff --git a/include/ck/options.hpp.in b/include/ck/options.hpp.in
new file mode 100644
index 0000000000..87ed6026a4
--- /dev/null
+++ b/include/ck/options.hpp.in
@@ -0,0 +1,3 @@
+#pragma once
+
+#cmakedefine01 CK_TIME_KERNEL
diff --git a/include/ck/stream_config.hpp b/include/ck/stream_config.hpp
new file mode 100644
index 0000000000..3e80b4c892
--- /dev/null
+++ b/include/ck/stream_config.hpp
@@ -0,0 +1,10 @@
+#pragma once
+
+#include
+#include
+
+struct StreamConfig
+{
+ hipStream_t stream_id_ = nullptr;
+ bool time_kernel_ = false;
+};
diff --git a/include/ck/tensor_operation/gpu/block/blockwise_gemm_xdlops.hpp b/include/ck/tensor_operation/gpu/block/blockwise_gemm_xdlops.hpp
index f1670d9c89..a989cb5297 100644
--- a/include/ck/tensor_operation/gpu/block/blockwise_gemm_xdlops.hpp
+++ b/include/ck/tensor_operation/gpu/block/blockwise_gemm_xdlops.hpp
@@ -7,6 +7,21 @@
namespace ck {
+enum struct LoopScheduler
+{
+ Default,
+ Interwave,
+};
+
+constexpr LoopScheduler make_default_loop_scheduler()
+{
+#if CK_EXPERIMENTAL_INTER_WAVE_SCHEDULING
+ return LoopScheduler::Interwave;
+#else
+ return LoopScheduler::Default;
+#endif // if CK_EXPERIMENTAL_INTER_WAVE_SCHEDULING
+}
+
template {}));
@@ -339,4 +354,232 @@ struct BlockwiseGemmXdlops_k0mk1_k0nk1_m0n0m1n1m2m3m4n2_v1
BThreadCopy b_thread_copy_{CalculateBThreadOriginDataIndex()};
};
+// Note: To facilitate the inter-wave loop scheduler, we need to explicitly set the macro
+// CK_EXPERIMENTAL_INTER_WAVE_SCHEDULING=1 as a few intrinsics are not yet available in
+// the latest ROCm release. For unsupported compilers, inter-wave loop scheduler falls back to the
+// default loop scheduler which is given by the macro CK_EXPERIMENTAL_INTER_WAVE_SCHEDULING=0
+template
+struct BlockwiseGemmXdlopsInterwave_k0mk1_k0nk1_m0n0m1n1m2m3m4n2_v1
+ : public BlockwiseGemmXdlops_k0mk1_k0nk1_m0n0m1n1m2m3m4n2_v1
+{
+ using Base = BlockwiseGemmXdlops_k0mk1_k0nk1_m0n0m1n1m2m3m4n2_v1;
+
+#if CK_EXPERIMENTAL_INTER_WAVE_SCHEDULING
+ using Base::a_block_desc_m0_m1_m2_k;
+ using Base::A_K1;
+ using Base::b_block_desc_n0_n1_n2_k;
+ using Base::B_K1;
+ using Base::c_thread_buf_;
+ using Base::c_thread_desc_;
+ using Base::CalculateAThreadOriginDataIndex;
+ using Base::CalculateBThreadOriginDataIndex;
+ using Base::I0;
+ using Base::I1;
+ using Base::KPerThread;
+ using Base::xdlops_gemm;
+
+ static constexpr index_t KPerInnerLoop = math::max(KPerThread / NumMacClusters, KPack);
+
+ // 2-wave optimized blockwise gemm
+ template
+ __device__ void Run(const ABlockBuffer& a_block_buf,
+ const BBlockBuffer& b_block_buf,
+ CThreadBuffer& c_thread_buf) const
+ {
+ auto a_thread_buf = make_static_buffer(
+ a_thread_desc_.GetElementSpaceSize());
+ auto b_thread_buf = make_static_buffer(
+ b_thread_desc_.GetElementSpaceSize());
+
+ static_for<0, KPerThread, KPerInnerLoop>{}([&](auto k) {
+ static_for<0, MRepeat, 1>{}([&](auto m0) {
+ // read A
+ a_thread_copy_.Run(a_block_desc_m0_m1_m2_k,
+ make_tuple(m0, I0, I0, k),
+ a_block_buf,
+ a_thread_desc_,
+ make_tuple(m0, I0, I0, I0),
+ a_thread_buf);
+ });
+ static_for<0, NRepeat, 1>{}([&](auto n0) {
+ // read B
+ b_thread_copy_.Run(b_block_desc_n0_n1_n2_k,
+ make_tuple(n0, I0, I0, k),
+ b_block_buf,
+ b_thread_desc_,
+ make_tuple(n0, I0, I0, I0),
+ b_thread_buf);
+ });
+ __builtin_amdgcn_sched_barrier();
+ // NOTE: Synchronize threads in a workgroup at the start of each MAC cluster, but except
+ // the first, as we can shorten non-MAC cluster a bit and there's no observable negative
+ // impact. The desired effect is waves in a workgroup executing MAC in sync. This avoids
+ // some out-of-sync waves hijacking MAC resource from other workgroups and reducing the
+ // chance of latency hiding by waiting for the rest of the workgroup at the eventual
+ // sync point.
+ if constexpr(k.value != 0 || KPerInnerLoop == KPerThread)
+ {
+ asm volatile("s_barrier" ::);
+ __builtin_amdgcn_sched_barrier();
+ }
+ static_for<0, KPerInnerLoop, KPack>{}([&](auto k_) {
+ static_for<0, MRepeat, 1>{}([&](auto m0) {
+ static_for<0, NRepeat, 1>{}([&](auto n0) {
+ vector_type a_thread_vec;
+ vector_type b_thread_vec;
+
+ static_for<0, KPack, 1>{}([&](auto i) {
+ a_thread_vec.template AsType()(i) =
+ a_thread_buf[Number{}];
+ b_thread_vec.template AsType()(i) =
+ b_thread_buf[Number{}];
+ });
+
+ using mfma_input_type =
+ typename vector_type::type;
+
+ constexpr index_t c_offset =
+ c_thread_desc_.CalculateOffset(make_tuple(m0, n0, 0));
+
+ // The block_sync_lds() here performs double duty:
+ // A) safeguard against data hazard because barrier from blockwise_gemm is
+ // moved here B) reduce VMEM FIFO congestion by applying small delays to
+ // different wavefronts It is performed near the end of MAC cluster to
+ // minimize lgkmcnt penalty
+ if constexpr(k.value == KPerThread - KPerInnerLoop &&
+ k_.value == KPerInnerLoop - KPack && m0.value == MRepeat - 1 &&
+ n0.value == NRepeat - 1)
+ {
+ __builtin_amdgcn_sched_barrier();
+ block_sync_lds();
+ __builtin_amdgcn_sched_barrier();
+ }
+
+ // TODO: insert setprio in more precise manner since we
+ // could have more than >1 MFMA instructions in single call
+ xdlops_gemm.template Run(
+ a_thread_vec.template AsType(),
+ b_thread_vec.template AsType(),
+ c_thread_buf.GetVectorTypeReference(Number{}));
+ if constexpr(k_.value == 0 && m0.value == 0 && n0.value == 0)
+ {
+ __builtin_amdgcn_sched_barrier();
+ __builtin_amdgcn_s_setprio(1);
+ __builtin_amdgcn_sched_barrier();
+ }
+ });
+ });
+ });
+ __builtin_amdgcn_sched_barrier();
+ __builtin_amdgcn_s_setprio(0);
+ __builtin_amdgcn_sched_barrier();
+ });
+ }
+
+ protected:
+ // A[M0, M1, M2, KPerInnerLoop]
+ static constexpr auto a_thread_desc_ = make_naive_tensor_descriptor_packed(
+ make_tuple(Number{}, I1, I1, Number{}));
+
+ // B[N0, N1, N2, KPerInnerLoop]
+ static constexpr auto b_thread_desc_ = make_naive_tensor_descriptor_packed(
+ make_tuple(Number{}, I1, I1, Number{}));
+
+ using AThreadCopy = ThreadwiseTensorSliceTransfer_v4,
+ Sequence<0, 1, 2, 3>,
+ 3,
+ A_K1,
+ A_K1>;
+
+ using BThreadCopy = ThreadwiseTensorSliceTransfer_v4,
+ Sequence<0, 1, 2, 3>,
+ 3,
+ B_K1,
+ B_K1>;
+
+ AThreadCopy a_thread_copy_{CalculateAThreadOriginDataIndex()};
+ BThreadCopy b_thread_copy_{CalculateBThreadOriginDataIndex()};
+
+#endif // #if CK_EXPERIMENTAL_INTER_WAVE_SCHEDULING
+};
+
+template
+constexpr auto BlockwiseGemmXdlops_k0mk1_k0nk1_m0n0m1n1m2m3m4n2_Selector()
+{
+ if constexpr(LoopSched == LoopScheduler::Default)
+ {
+ return BlockwiseGemmXdlops_k0mk1_k0nk1_m0n0m1n1m2m3m4n2_v1{};
+ }
+ else if constexpr(LoopSched == LoopScheduler::Interwave)
+ {
+ return BlockwiseGemmXdlopsInterwave_k0mk1_k0nk1_m0n0m1n1m2m3m4n2_v1{};
+ }
+};
+
} // namespace ck
diff --git a/include/ck/tensor_operation/gpu/device/device_base.hpp b/include/ck/tensor_operation/gpu/device/device_base.hpp
index cf48695ad0..950cfc1d61 100644
--- a/include/ck/tensor_operation/gpu/device/device_base.hpp
+++ b/include/ck/tensor_operation/gpu/device/device_base.hpp
@@ -1,8 +1,9 @@
-#ifndef DEVICE_BASE_HPP
-#define DEVICE_BASE_HPP
+#pragma once
#include
+#include "stream_config.hpp"
+
namespace ck {
namespace tensor_operation {
namespace device {
@@ -22,7 +23,10 @@ struct BaseInvoker
BaseInvoker(const BaseInvoker&) = default;
BaseInvoker& operator=(const BaseInvoker&) = default;
- virtual float Run(const BaseArgument*, int = 1) = 0;
+ virtual float Run(const BaseArgument*, const StreamConfig& = StreamConfig{})
+ {
+ return float{0};
+ }
virtual ~BaseInvoker() {}
};
@@ -33,8 +37,8 @@ struct BaseOperator
BaseOperator(const BaseOperator&) = default;
BaseOperator& operator=(const BaseOperator&) = default;
- virtual bool IsSupportedArgument(const BaseArgument*) = 0;
- virtual std::string GetTypeString() const = 0;
+ virtual bool IsSupportedArgument(const BaseArgument*) { return false; }
+ virtual std::string GetTypeString() const { return ""; }
virtual ~BaseOperator() {}
};
@@ -42,4 +46,3 @@ struct BaseOperator
} // namespace device
} // namespace tensor_operation
} // namespace ck
-#endif
diff --git a/include/ck/tensor_operation/gpu/device/device_batched_gemm_reduce_xdl_cshuffle.hpp b/include/ck/tensor_operation/gpu/device/device_batched_gemm_reduce_xdl_cshuffle.hpp
index 92655b2755..a6408007ed 100644
--- a/include/ck/tensor_operation/gpu/device/device_batched_gemm_reduce_xdl_cshuffle.hpp
+++ b/include/ck/tensor_operation/gpu/device/device_batched_gemm_reduce_xdl_cshuffle.hpp
@@ -106,6 +106,9 @@ __global__ void
#endif // end of if defined (defined(__gfx908__) || defined(__gfx90a__))
}
+// Note: inter-wave loop scheduler is rolled out to c-shuffle version first. Becuase non c-shuffle
+// version currently has compiler issues with register spill which further causes validation
+// failures.
template
+ index_t CReduceThreadVgpr2GlobalCopySrcDstScalarPerVector_MPerBlock,
+ LoopScheduler LoopSched = make_default_loop_scheduler()>
struct DeviceBatchedGemmReduce_Xdl_CShuffle : public DeviceGemmReduce;
+ CReduceThreadVgpr2GlobalCopySrcDstScalarPerVector_MPerBlock,
+ LoopSched>;
using Block2CTileMap = decltype(MakeBlock2CTileMap(1, CGridDesc_M_N{}, 1, 1));
@@ -688,7 +693,7 @@ struct DeviceBatchedGemmReduce_Xdl_CShuffle : public DeviceGemmReduce,
true>;
- launch_kernel(kernel,
- dim3(grid_size),
- dim3(BlockSize),
- 0,
- arg.p_a_grid_,
- arg.p_b_grid_,
- arg.p_c_grid_,
- arg.p_d0_grid_,
- arg.p_d1_grid_,
- arg.BatchCount_,
- arg.a_element_op_,
- arg.b_element_op_,
- arg.c_element_op_,
- arg.d1_element_op_,
- arg.a_grid_desc_ak0_m_ak1_,
- arg.b_grid_desc_bk0_n_bk1_,
- arg.c_grid_desc_mblock_mperblock_nblock_nperblock_,
- arg.d_grid_desc_mblock_mperblock_,
- arg.compute_base_ptr_of_batch_,
- arg.block_2_ctile_map_);
+ elapsed_time =
+ launch_and_time_kernel(stream_config,
+ kernel,
+ dim3(grid_size),
+ dim3(BlockSize),
+ 0,
+ arg.p_a_grid_,
+ arg.p_b_grid_,
+ arg.p_c_grid_,
+ arg.p_d0_grid_,
+ arg.p_d1_grid_,
+ arg.BatchCount_,
+ arg.a_element_op_,
+ arg.b_element_op_,
+ arg.c_element_op_,
+ arg.d1_element_op_,
+ arg.a_grid_desc_ak0_m_ak1_,
+ arg.b_grid_desc_bk0_n_bk1_,
+ arg.c_grid_desc_mblock_mperblock_nblock_nperblock_,
+ arg.d_grid_desc_mblock_mperblock_,
+ arg.compute_base_ptr_of_batch_,
+ arg.block_2_ctile_map_);
}
else
{
@@ -783,35 +791,38 @@ struct DeviceBatchedGemmReduce_Xdl_CShuffle : public DeviceGemmReduce,
false>;
- launch_kernel(kernel,
- dim3(grid_size),
- dim3(BlockSize),
- 0,
- arg.p_a_grid_,
- arg.p_b_grid_,
- arg.p_c_grid_,
- arg.p_d0_grid_,
- arg.p_d1_grid_,
- arg.BatchCount_,
- arg.a_element_op_,
- arg.b_element_op_,
- arg.c_element_op_,
- arg.d1_element_op_,
- arg.a_grid_desc_ak0_m_ak1_,
- arg.b_grid_desc_bk0_n_bk1_,
- arg.c_grid_desc_mblock_mperblock_nblock_nperblock_,
- arg.d_grid_desc_mblock_mperblock_,
- arg.compute_base_ptr_of_batch_,
- arg.block_2_ctile_map_);
+ elapsed_time =
+ launch_and_time_kernel(stream_config,
+ kernel,
+ dim3(grid_size),
+ dim3(BlockSize),
+ 0,
+ arg.p_a_grid_,
+ arg.p_b_grid_,
+ arg.p_c_grid_,
+ arg.p_d0_grid_,
+ arg.p_d1_grid_,
+ arg.BatchCount_,
+ arg.a_element_op_,
+ arg.b_element_op_,
+ arg.c_element_op_,
+ arg.d1_element_op_,
+ arg.a_grid_desc_ak0_m_ak1_,
+ arg.b_grid_desc_bk0_n_bk1_,
+ arg.c_grid_desc_mblock_mperblock_nblock_nperblock_,
+ arg.d_grid_desc_mblock_mperblock_,
+ arg.compute_base_ptr_of_batch_,
+ arg.block_2_ctile_map_);
}
- return 0;
+ return elapsed_time;
}
// polymorphic
- float Run(const BaseArgument* p_arg, int nrepeat = 1) override
+ float Run(const BaseArgument* p_arg,
+ const StreamConfig& stream_config = StreamConfig{}) override
{
- return Run(*dynamic_cast(p_arg), nrepeat);
+ return Run(*dynamic_cast(p_arg), stream_config);
}
};
diff --git a/include/ck/tensor_operation/gpu/device/device_batched_gemm_xdl.hpp b/include/ck/tensor_operation/gpu/device/device_batched_gemm_xdl.hpp
index 88974a5221..ea7704951e 100644
--- a/include/ck/tensor_operation/gpu/device/device_batched_gemm_xdl.hpp
+++ b/include/ck/tensor_operation/gpu/device/device_batched_gemm_xdl.hpp
@@ -428,7 +428,7 @@ struct DeviceBatchedGemmXdl
{
using Argument = DeviceBatchedGemmXdl::Argument;
- float Run(const Argument& arg, int nrepeat = 1)
+ float Run(const Argument& arg, const StreamConfig& stream_config = StreamConfig{})
{
{
std::cout << "arg.a_grid_desc_k0_m_k1_{" << arg.a_grid_desc_k0_m_k1_.GetLength(I0)
@@ -477,8 +477,8 @@ struct DeviceBatchedGemmXdl
remove_reference_t,
true>;
- ave_time = launch_and_time_kernel(kernel,
- nrepeat,
+ ave_time = launch_and_time_kernel(stream_config,
+ kernel,
dim3(grid_size),
dim3(BlockSize),
0,
@@ -511,8 +511,8 @@ struct DeviceBatchedGemmXdl
remove_reference_t,
false>;
- ave_time = launch_and_time_kernel(kernel,
- nrepeat,
+ ave_time = launch_and_time_kernel(stream_config,
+ kernel,
dim3(grid_size),
dim3(BlockSize),
0,
@@ -534,9 +534,10 @@ struct DeviceBatchedGemmXdl
}
// polymorphic
- float Run(const BaseArgument* p_arg, int nrepeat = 1) override
+ float Run(const BaseArgument* p_arg,
+ const StreamConfig& stream_config = StreamConfig{}) override
{
- return Run(*dynamic_cast(p_arg), nrepeat);
+ return Run(*dynamic_cast(p_arg), stream_config);
}
};
diff --git a/include/ck/tensor_operation/gpu/device/device_cgemm_4gemm_xdl_cshuffle.hpp b/include/ck/tensor_operation/gpu/device/device_cgemm_4gemm_xdl_cshuffle.hpp
index 2643e46ff2..1f6ebc7042 100644
--- a/include/ck/tensor_operation/gpu/device/device_cgemm_4gemm_xdl_cshuffle.hpp
+++ b/include/ck/tensor_operation/gpu/device/device_cgemm_4gemm_xdl_cshuffle.hpp
@@ -55,7 +55,8 @@ template
+ index_t CShuffleBlockTransferScalarPerVector_NPerBlock,
+ LoopScheduler LoopSched = make_default_loop_scheduler()>
struct DeviceCGemm_4Gemm_Xdl_CShuffle
: public DeviceCGemm
{
@@ -376,7 +377,8 @@ struct DeviceCGemm_4Gemm_Xdl_CShuffle
CShuffleMXdlPerWavePerShuffle,
CShuffleNXdlPerWavePerShuffle,
CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock,
- CShuffleBlockTransferScalarPerVector_NPerBlock>;
+ CShuffleBlockTransferScalarPerVector_NPerBlock,
+ LoopSched>;
// Argument
struct Argument : public BaseArgument
@@ -448,7 +450,7 @@ struct DeviceCGemm_4Gemm_Xdl_CShuffle
{
using Argument = DeviceOp::Argument;
- float Run(const Argument& arg, int nrepeat = 1)
+ float Run(const Argument& arg, const StreamConfig& stream_config = StreamConfig{})
{
if(!GridwiseGemm::CheckValidity(
arg.a_grid_desc_ak0_m_ak1_, arg.b_grid_desc_bk0_n_bk1_, arg.c_grid_desc_m_n_))
@@ -478,146 +480,77 @@ struct DeviceCGemm_4Gemm_Xdl_CShuffle
typename GridwiseGemm::DefaultBlock2CTileMap,
true>;
- if(nrepeat == 0)
- {
- launch_kernel(kernel,
- dim3(grid_size),
- dim3(BlockSize),
- 0,
- arg.p_a_grid_real_,
- arg.p_b_grid_real_,
- arg.p_c_grid_real_,
- arg.a_element_op_,
- arg.b_element_op_,
- arg.c_element_op_,
- arg.a_grid_desc_ak0_m_ak1_,
- arg.b_grid_desc_bk0_n_bk1_,
- arg.c_grid_desc_mblock_mperblock_nblock_nperblock_,
- arg.block_2_ctile_map_);
+ ave_time +=
+ launch_and_time_kernel(stream_config,
+ kernel,
+ dim3(grid_size),
+ dim3(BlockSize),
+ 0,
+ arg.p_a_grid_real_,
+ arg.p_b_grid_real_,
+ arg.p_c_grid_real_,
+ arg.a_element_op_,
+ arg.b_element_op_,
+ arg.c_element_op_,
+ arg.a_grid_desc_ak0_m_ak1_,
+ arg.b_grid_desc_bk0_n_bk1_,
+ arg.c_grid_desc_mblock_mperblock_nblock_nperblock_,
+ arg.block_2_ctile_map_);
- launch_kernel(kernel,
- dim3(grid_size),
- dim3(BlockSize),
- 0,
- arg.p_a_grid_imag_,
- arg.p_b_grid_imag_,
- arg.p_aux_grid_,
- arg.a_element_op_,
- arg.b_element_op_,
- arg.c_element_op_,
- arg.a_grid_desc_ak0_m_ak1_,
- arg.b_grid_desc_bk0_n_bk1_,
- arg.c_grid_desc_mblock_mperblock_nblock_nperblock_,
- arg.block_2_ctile_map_);
+ ave_time +=
+ launch_and_time_kernel(stream_config,
+ kernel,
+ dim3(grid_size),
+ dim3(BlockSize),
+ 0,
+ arg.p_a_grid_imag_,
+ arg.p_b_grid_imag_,
+ arg.p_aux_grid_,
+ arg.a_element_op_,
+ arg.b_element_op_,
+ arg.c_element_op_,
+ arg.a_grid_desc_ak0_m_ak1_,
+ arg.b_grid_desc_bk0_n_bk1_,
+ arg.c_grid_desc_mblock_mperblock_nblock_nperblock_,
+ arg.block_2_ctile_map_);
- // c_real = c_real - aux needed here!!!
+ // c_real = c_real - aux needed here!!!
- launch_kernel(kernel,
- dim3(grid_size),
- dim3(BlockSize),
- 0,
- arg.p_a_grid_real_,
- arg.p_b_grid_imag_,
- arg.p_c_grid_imag_,
- arg.a_element_op_,
- arg.b_element_op_,
- arg.c_element_op_,
- arg.a_grid_desc_ak0_m_ak1_,
- arg.b_grid_desc_bk0_n_bk1_,
- arg.c_grid_desc_mblock_mperblock_nblock_nperblock_,
- arg.block_2_ctile_map_);
+ ave_time +=
+ launch_and_time_kernel(stream_config,
+ kernel,
+ dim3(grid_size),
+ dim3(BlockSize),
+ 0,
+ arg.p_a_grid_real_,
+ arg.p_b_grid_imag_,
+ arg.p_c_grid_imag_,
+ arg.a_element_op_,
+ arg.b_element_op_,
+ arg.c_element_op_,
+ arg.a_grid_desc_ak0_m_ak1_,
+ arg.b_grid_desc_bk0_n_bk1_,
+ arg.c_grid_desc_mblock_mperblock_nblock_nperblock_,
+ arg.block_2_ctile_map_);
- launch_kernel(kernel,
- dim3(grid_size),
- dim3(BlockSize),
- 0,
- arg.p_a_grid_imag_,
- arg.p_b_grid_real_,
- arg.p_aux_grid_,
- arg.a_element_op_,
- arg.b_element_op_,
- arg.c_element_op_,
- arg.a_grid_desc_ak0_m_ak1_,
- arg.b_grid_desc_bk0_n_bk1_,
- arg.c_grid_desc_mblock_mperblock_nblock_nperblock_,
- arg.block_2_ctile_map_);
+ ave_time +=
+ launch_and_time_kernel(stream_config,
+ kernel,
+ dim3(grid_size),
+ dim3(BlockSize),
+ 0,
+ arg.p_a_grid_imag_,
+ arg.p_b_grid_real_,
+ arg.p_aux_grid_,
+ arg.a_element_op_,
+ arg.b_element_op_,
+ arg.c_element_op_,
+ arg.a_grid_desc_ak0_m_ak1_,
+ arg.b_grid_desc_bk0_n_bk1_,
+ arg.c_grid_desc_mblock_mperblock_nblock_nperblock_,
+ arg.block_2_ctile_map_);
- // c_imag = c_imag + aux needed here!!!
- }
- else
- {
- ave_time +=
- launch_and_time_kernel(kernel,
- nrepeat,
- dim3(grid_size),
- dim3(BlockSize),
- 0,
- arg.p_a_grid_real_,
- arg.p_b_grid_real_,
- arg.p_c_grid_real_,
- arg.a_element_op_,
- arg.b_element_op_,
- arg.c_element_op_,
- arg.a_grid_desc_ak0_m_ak1_,
- arg.b_grid_desc_bk0_n_bk1_,
- arg.c_grid_desc_mblock_mperblock_nblock_nperblock_,
- arg.block_2_ctile_map_);
-
- ave_time +=
- launch_and_time_kernel(kernel,
- nrepeat,
- dim3(grid_size),
- dim3(BlockSize),
- 0,
- arg.p_a_grid_imag_,
- arg.p_b_grid_imag_,
- arg.p_aux_grid_,
- arg.a_element_op_,
- arg.b_element_op_,
- arg.c_element_op_,
- arg.a_grid_desc_ak0_m_ak1_,
- arg.b_grid_desc_bk0_n_bk1_,
- arg.c_grid_desc_mblock_mperblock_nblock_nperblock_,
- arg.block_2_ctile_map_);
-
- // // c_real = c_real - aux needed here!!!
-
- ave_time +=
- launch_and_time_kernel(kernel,
- nrepeat,
- dim3(grid_size),
- dim3(BlockSize),
- 0,
- arg.p_a_grid_real_,
- arg.p_b_grid_imag_,
- arg.p_c_grid_imag_,
- arg.a_element_op_,
- arg.b_element_op_,
- arg.c_element_op_,
- arg.a_grid_desc_ak0_m_ak1_,
- arg.b_grid_desc_bk0_n_bk1_,
- arg.c_grid_desc_mblock_mperblock_nblock_nperblock_,
- arg.block_2_ctile_map_);
-
- ave_time +=
- launch_and_time_kernel(kernel,
- nrepeat,
- dim3(grid_size),
- dim3(BlockSize),
- 0,
- arg.p_a_grid_imag_,
- arg.p_b_grid_real_,
- arg.p_aux_grid_,
- arg.a_element_op_,
- arg.b_element_op_,
- arg.c_element_op_,
- arg.a_grid_desc_ak0_m_ak1_,
- arg.b_grid_desc_bk0_n_bk1_,
- arg.c_grid_desc_mblock_mperblock_nblock_nperblock_,
- arg.block_2_ctile_map_);
-
- // c_imag = c_imag + aux needed here!!!
- }
+ // c_imag = c_imag + aux needed here!!!
}
else
{
@@ -634,155 +567,87 @@ struct DeviceCGemm_4Gemm_Xdl_CShuffle
typename GridwiseGemm::DefaultBlock2CTileMap,
false>;
- if(nrepeat == 0)
- {
- launch_kernel(kernel,
- dim3(grid_size),
- dim3(BlockSize),
- 0,
- arg.p_a_grid_real_,
- arg.p_b_grid_real_,
- arg.p_c_grid_real_,
- arg.a_element_op_,
- arg.b_element_op_,
- arg.c_element_op_,
- arg.a_grid_desc_ak0_m_ak1_,
- arg.b_grid_desc_bk0_n_bk1_,
- arg.c_grid_desc_mblock_mperblock_nblock_nperblock_,
- arg.block_2_ctile_map_);
+ ave_time +=
+ launch_and_time_kernel(stream_config,
+ kernel,
+ dim3(grid_size),
+ dim3(BlockSize),
+ 0,
+ arg.p_a_grid_real_,
+ arg.p_b_grid_real_,
+ arg.p_c_grid_real_,
+ arg.a_element_op_,
+ arg.b_element_op_,
+ arg.c_element_op_,
+ arg.a_grid_desc_ak0_m_ak1_,
+ arg.b_grid_desc_bk0_n_bk1_,
+ arg.c_grid_desc_mblock_mperblock_nblock_nperblock_,
+ arg.block_2_ctile_map_);
- launch_kernel(kernel,
- dim3(grid_size),
- dim3(BlockSize),
- 0,
- arg.p_a_grid_imag_,
- arg.p_b_grid_imag_,
- arg.p_aux_grid_,
- arg.a_element_op_,
- arg.b_element_op_,
- arg.c_element_op_,
- arg.a_grid_desc_ak0_m_ak1_,
- arg.b_grid_desc_bk0_n_bk1_,
- arg.c_grid_desc_mblock_mperblock_nblock_nperblock_,
- arg.block_2_ctile_map_);
+ ave_time +=
+ launch_and_time_kernel(stream_config,
+ kernel,
+ dim3(grid_size),
+ dim3(BlockSize),
+ 0,
+ arg.p_a_grid_imag_,
+ arg.p_b_grid_imag_,
+ arg.p_aux_grid_,
+ arg.a_element_op_,
+ arg.b_element_op_,
+ arg.c_element_op_,
+ arg.a_grid_desc_ak0_m_ak1_,
+ arg.b_grid_desc_bk0_n_bk1_,
+ arg.c_grid_desc_mblock_mperblock_nblock_nperblock_,
+ arg.block_2_ctile_map_);
- // // c_real = c_real - aux needed here!!!
+ // // c_real = c_real - aux needed here!!!
- launch_kernel(kernel,
- dim3(grid_size),
- dim3(BlockSize),
- 0,
- arg.p_a_grid_real_,
- arg.p_b_grid_imag_,
- arg.p_c_grid_imag_,
- arg.a_element_op_,
- arg.b_element_op_,
- arg.c_element_op_,
- arg.a_grid_desc_ak0_m_ak1_,
- arg.b_grid_desc_bk0_n_bk1_,
- arg.c_grid_desc_mblock_mperblock_nblock_nperblock_,
- arg.block_2_ctile_map_);
+ ave_time +=
+ launch_and_time_kernel(stream_config,
+ kernel,
+ dim3(grid_size),
+ dim3(BlockSize),
+ 0,
+ arg.p_a_grid_real_,
+ arg.p_b_grid_imag_,
+ arg.p_c_grid_imag_,
+ arg.a_element_op_,
+ arg.b_element_op_,
+ arg.c_element_op_,
+ arg.a_grid_desc_ak0_m_ak1_,
+ arg.b_grid_desc_bk0_n_bk1_,
+ arg.c_grid_desc_mblock_mperblock_nblock_nperblock_,
+ arg.block_2_ctile_map_);
- launch_kernel(kernel,
- dim3(grid_size),
- dim3(BlockSize),
- 0,
- arg.p_a_grid_imag_,
- arg.p_b_grid_real_,
- arg.p_aux_grid_,
- arg.a_element_op_,
- arg.b_element_op_,
- arg.c_element_op_,
- arg.a_grid_desc_ak0_m_ak1_,
- arg.b_grid_desc_bk0_n_bk1_,
- arg.c_grid_desc_mblock_mperblock_nblock_nperblock_,
- arg.block_2_ctile_map_);
+ ave_time +=
+ launch_and_time_kernel(stream_config,
+ kernel,
+ dim3(grid_size),
+ dim3(BlockSize),
+ 0,
+ arg.p_a_grid_imag_,
+ arg.p_b_grid_real_,
+ arg.p_aux_grid_,
+ arg.a_element_op_,
+ arg.b_element_op_,
+ arg.c_element_op_,
+ arg.a_grid_desc_ak0_m_ak1_,
+ arg.b_grid_desc_bk0_n_bk1_,
+ arg.c_grid_desc_mblock_mperblock_nblock_nperblock_,
+ arg.block_2_ctile_map_);
- // c_imag = c_imag + aux needed here!!!
- }
- else
- {
- ave_time +=
- launch_and_time_kernel(kernel,
- nrepeat,
- dim3(grid_size),
- dim3(BlockSize),
- 0,
- arg.p_a_grid_real_,
- arg.p_b_grid_real_,
- arg.p_c_grid_real_,
- arg.a_element_op_,
- arg.b_element_op_,
- arg.c_element_op_,
- arg.a_grid_desc_ak0_m_ak1_,
- arg.b_grid_desc_bk0_n_bk1_,
- arg.c_grid_desc_mblock_mperblock_nblock_nperblock_,
- arg.block_2_ctile_map_);
-
- ave_time +=
- launch_and_time_kernel(kernel,
- nrepeat,
- dim3(grid_size),
- dim3(BlockSize),
- 0,
- arg.p_a_grid_imag_,
- arg.p_b_grid_imag_,
- arg.p_aux_grid_,
- arg.a_element_op_,
- arg.b_element_op_,
- arg.c_element_op_,
- arg.a_grid_desc_ak0_m_ak1_,
- arg.b_grid_desc_bk0_n_bk1_,
- arg.c_grid_desc_mblock_mperblock_nblock_nperblock_,
- arg.block_2_ctile_map_);
-
- // c_real = c_real - aux needed here!!!
-
- ave_time +=
- launch_and_time_kernel(kernel,
- nrepeat,
- dim3(grid_size),
- dim3(BlockSize),
- 0,
- arg.p_a_grid_real_,
- arg.p_b_grid_imag_,
- arg.p_c_grid_imag_,
- arg.a_element_op_,
- arg.b_element_op_,
- arg.c_element_op_,
- arg.a_grid_desc_ak0_m_ak1_,
- arg.b_grid_desc_bk0_n_bk1_,
- arg.c_grid_desc_mblock_mperblock_nblock_nperblock_,
- arg.block_2_ctile_map_);
-
- ave_time +=
- launch_and_time_kernel(kernel,
- nrepeat,
- dim3(grid_size),
- dim3(BlockSize),
- 0,
- arg.p_a_grid_imag_,
- arg.p_b_grid_real_,
- arg.p_aux_grid_,
- arg.a_element_op_,
- arg.b_element_op_,
- arg.c_element_op_,
- arg.a_grid_desc_ak0_m_ak1_,
- arg.b_grid_desc_bk0_n_bk1_,
- arg.c_grid_desc_mblock_mperblock_nblock_nperblock_,
- arg.block_2_ctile_map_);
-
- // c_imag = c_imag + aux needed here!!!
- }
+ // c_imag = c_imag + aux needed here!!!
}
return ave_time;
}
// polymorphic
- float Run(const BaseArgument* p_arg, int nrepeat = 1) override
+ float Run(const BaseArgument* p_arg,
+ const StreamConfig& stream_config = StreamConfig{}) override
{
- return Run(*dynamic_cast(p_arg), nrepeat);
+ return Run(*dynamic_cast(p_arg), stream_config);
}
};
diff --git a/include/ck/tensor_operation/gpu/device/device_conv2d_backward_weight_xdl_c_shuffle_nhwc_kyxc_nhwk.hpp b/include/ck/tensor_operation/gpu/device/device_conv2d_backward_weight_xdl_c_shuffle_nhwc_kyxc_nhwk.hpp
index 466e6ad89f..c36227083c 100644
--- a/include/ck/tensor_operation/gpu/device/device_conv2d_backward_weight_xdl_c_shuffle_nhwc_kyxc_nhwk.hpp
+++ b/include/ck/tensor_operation/gpu/device/device_conv2d_backward_weight_xdl_c_shuffle_nhwc_kyxc_nhwk.hpp
@@ -415,9 +415,10 @@ struct DeviceConv2dBwdWeightXdl_C_Shuffle_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_
<< arg.c_grid_desc_m_n_.GetLength(I1) << "}" << std::endl;
}
- float Run(const Argument& arg, int nrepeat = 1)
+ float Run(const Argument& arg, const StreamConfig& stream_config = StreamConfig{})
{
ShowInfo(arg);
+
if(!GridwiseGemm::CheckValidity(arg.a_grid_desc_kbatch_k0_m_k1_,
arg.b_grid_desc_kbatch_k0_n_k1_,
arg.c_grid_desc_m_n_,
@@ -437,49 +438,27 @@ struct DeviceConv2dBwdWeightXdl_C_Shuffle_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_
float ave_time = 0;
const auto Run = [&](const auto& kernel) {
- if(nrepeat > 0)
- {
- ave_time =
- launch_and_time_kernel(kernel,
- nrepeat,
- dim3(grid_size),
- dim3(BlockSize),
- 0,
- arg.p_a_grid_,
- arg.p_b_grid_,
- arg.p_c_grid_,
- arg.a_grid_desc_kbatch_k0_m_k1_,
- arg.b_grid_desc_kbatch_k0_n_k1_,
- arg.c_grid_desc_mblock_mperblock_nblock_nperblock_,
- arg.a_element_op_,
- arg.b_element_op_,
- arg.c_element_op_,
- arg.block_2_ctile_map_);
- }
+ hipGetErrorString(hipMemset(
+ arg.p_c_grid_,
+ 0,
+ arg.c_grid_desc_mblock_mperblock_nblock_nperblock_.GetElementSpaceSize() *
+ sizeof(CDataType)));
- if(kbatch > 1 || nrepeat <= 0)
- {
- hipGetErrorString(hipMemset(
- arg.p_c_grid_,
- 0,
- arg.c_grid_desc_mblock_mperblock_nblock_nperblock_.GetElementSpaceSize() *
- sizeof(CDataType)));
-
- launch_kernel(kernel,
- dim3(grid_size),
- dim3(BlockSize),
- 0,
- arg.p_a_grid_,
- arg.p_b_grid_,
- arg.p_c_grid_,
- arg.a_grid_desc_kbatch_k0_m_k1_,
- arg.b_grid_desc_kbatch_k0_n_k1_,
- arg.c_grid_desc_mblock_mperblock_nblock_nperblock_,
- arg.a_element_op_,
- arg.b_element_op_,
- arg.c_element_op_,
- arg.block_2_ctile_map_);
- }
+ launch_and_time_kernel(stream_config,
+ kernel,
+ dim3(grid_size),
+ dim3(BlockSize),
+ 0,
+ arg.p_a_grid_,
+ arg.p_b_grid_,
+ arg.p_c_grid_,
+ arg.a_grid_desc_kbatch_k0_m_k1_,
+ arg.b_grid_desc_kbatch_k0_n_k1_,
+ arg.c_grid_desc_mblock_mperblock_nblock_nperblock_,
+ arg.a_element_op_,
+ arg.b_element_op_,
+ arg.c_element_op_,
+ arg.block_2_ctile_map_);
};
if(has_main_k0_block_loop)
@@ -560,9 +539,10 @@ struct DeviceConv2dBwdWeightXdl_C_Shuffle_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_
return ave_time;
}
- float Run(const BaseArgument* p_arg, int nrepeat = 1) override
+ float Run(const BaseArgument* p_arg,
+ const StreamConfig& stream_config = StreamConfig{}) override
{
- return Run(*dynamic_cast(p_arg), nrepeat);
+ return Run(*dynamic_cast(p_arg), stream_config);
}
};
diff --git a/include/ck/tensor_operation/gpu/device/device_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk.hpp b/include/ck/tensor_operation/gpu/device/device_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk.hpp
index fad4ec1ffa..def6af74ac 100644
--- a/include/ck/tensor_operation/gpu/device/device_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk.hpp
+++ b/include/ck/tensor_operation/gpu/device/device_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk.hpp
@@ -531,7 +531,7 @@ struct DeviceConv2dBwdDataXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K
{
using Argument = DeviceOp::Argument;
- float Run(const Argument& arg, int nrepeat = 1)
+ float Run(const Argument& arg, const StreamConfig& stream_config = StreamConfig{})
{
float ave_time = 0;
for(size_t i = 0; i < arg.a_grid_desc_k0_m_k1_container_.size(); i++)
@@ -602,8 +602,8 @@ struct DeviceConv2dBwdDataXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K
true>;
ave_time += launch_and_time_kernel(
+ stream_config,
kernel,
- nrepeat,
dim3(grid_size),
dim3(BlockSize),
0,
@@ -635,8 +635,8 @@ struct DeviceConv2dBwdDataXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K
false>;
ave_time += launch_and_time_kernel(
+ stream_config,
kernel,
- nrepeat,
dim3(grid_size),
dim3(BlockSize),
0,
@@ -655,9 +655,10 @@ struct DeviceConv2dBwdDataXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K
return ave_time;
}
- float Run(const BaseArgument* p_arg, int nrepeat = 1) override
+ float Run(const BaseArgument* p_arg,
+ const StreamConfig& stream_config = StreamConfig{}) override
{
- return Run(*dynamic_cast(p_arg), nrepeat);
+ return Run(*dynamic_cast(p_arg), stream_config);
}
};
diff --git a/include/ck/tensor_operation/gpu/device/device_conv2d_fwd_xdl_c_shuffle_bias_activation_add_nhwc_kyxc_nhwk.hpp b/include/ck/tensor_operation/gpu/device/device_conv2d_fwd_xdl_c_shuffle_bias_activation_add_nhwc_kyxc_nhwk.hpp
index 6648929cd5..fd95c184ca 100644
--- a/include/ck/tensor_operation/gpu/device/device_conv2d_fwd_xdl_c_shuffle_bias_activation_add_nhwc_kyxc_nhwk.hpp
+++ b/include/ck/tensor_operation/gpu/device/device_conv2d_fwd_xdl_c_shuffle_bias_activation_add_nhwc_kyxc_nhwk.hpp
@@ -642,7 +642,7 @@ struct
{
using Argument = DeviceOp::Argument;
- float Run(const Argument& arg, int nrepeat = 1)
+ float Run(const Argument& arg, const StreamConfig& stream_config = StreamConfig{})
{
#if 0
{
@@ -727,8 +727,8 @@ struct
true>;
ave_time = launch_and_time_kernel(
+ stream_config,
kernel,
- nrepeat,
dim3(grid_size),
dim3(BlockSize),
0,
@@ -771,8 +771,8 @@ struct
false>;
ave_time = launch_and_time_kernel(
+ stream_config,
kernel,
- nrepeat,
dim3(grid_size),
dim3(BlockSize),
0,
@@ -795,9 +795,10 @@ struct
return ave_time;
}
- float Run(const BaseArgument* p_arg, int nrepeat = 1) override
+ float Run(const BaseArgument* p_arg,
+ const StreamConfig& stream_config = StreamConfig{}) override
{
- return Run(*dynamic_cast(p_arg), nrepeat);
+ return Run(*dynamic_cast(p_arg), stream_config);
}
};
diff --git a/include/ck/tensor_operation/gpu/device/device_conv2d_fwd_xdl_c_shuffle_bias_activation_nhwc_kyxc_nhwk.hpp b/include/ck/tensor_operation/gpu/device/device_conv2d_fwd_xdl_c_shuffle_bias_activation_nhwc_kyxc_nhwk.hpp
index fd0941420c..61c91c0b76 100644
--- a/include/ck/tensor_operation/gpu/device/device_conv2d_fwd_xdl_c_shuffle_bias_activation_nhwc_kyxc_nhwk.hpp
+++ b/include/ck/tensor_operation/gpu/device/device_conv2d_fwd_xdl_c_shuffle_bias_activation_nhwc_kyxc_nhwk.hpp
@@ -605,7 +605,7 @@ struct DeviceConv2dFwdXdl_C_Shuffle_Bias_Activation_Input_N_Hi_Wi_C_Weight_K_Y_X
{
using Argument = DeviceOp::Argument;
- float Run(const Argument& arg, int nrepeat = 1)
+ float Run(const Argument& arg, const StreamConfig& stream_config = StreamConfig{})
{
#if 0
{
@@ -684,8 +684,8 @@ struct DeviceConv2dFwdXdl_C_Shuffle_Bias_Activation_Input_N_Hi_Wi_C_Weight_K_Y_X
true>;
ave_time = launch_and_time_kernel(
+ stream_config,
kernel,
- nrepeat,
dim3(grid_size),
dim3(BlockSize),
0,
@@ -723,8 +723,8 @@ struct DeviceConv2dFwdXdl_C_Shuffle_Bias_Activation_Input_N_Hi_Wi_C_Weight_K_Y_X
false>;
ave_time = launch_and_time_kernel(
+ stream_config,
kernel,
- nrepeat,
dim3(grid_size),
dim3(BlockSize),
0,
@@ -745,9 +745,10 @@ struct DeviceConv2dFwdXdl_C_Shuffle_Bias_Activation_Input_N_Hi_Wi_C_Weight_K_Y_X
return ave_time;
}
- float Run(const BaseArgument* p_arg, int nrepeat = 1) override
+ float Run(const BaseArgument* p_arg,
+ const StreamConfig& stream_config = StreamConfig{}) override
{
- return Run(*dynamic_cast(p_arg), nrepeat);
+ return Run(*dynamic_cast(p_arg), stream_config);
}
};
diff --git a/include/ck/tensor_operation/gpu/device/device_conv2d_fwd_xdl_c_shuffle_nhwc_kyxc_nhwk.hpp b/include/ck/tensor_operation/gpu/device/device_conv2d_fwd_xdl_c_shuffle_nhwc_kyxc_nhwk.hpp
index b508606a75..f4cddc1946 100644
--- a/include/ck/tensor_operation/gpu/device/device_conv2d_fwd_xdl_c_shuffle_nhwc_kyxc_nhwk.hpp
+++ b/include/ck/tensor_operation/gpu/device/device_conv2d_fwd_xdl_c_shuffle_nhwc_kyxc_nhwk.hpp
@@ -568,7 +568,7 @@ struct DeviceConv2dFwdXdl_C_Shuffle_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_W
{
using Argument = DeviceOp::Argument;
- float Run(const Argument& arg, int nrepeat = 1)
+ float Run(const Argument& arg, const StreamConfig& stream_config = StreamConfig{})
{
#if 0
{
@@ -663,8 +663,8 @@ struct DeviceConv2dFwdXdl_C_Shuffle_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_W
true>;
ave_time = launch_and_time_kernel(
+ stream_config,
kernel,
- nrepeat,
dim3(grid_size),
dim3(BlockSize),
0,
@@ -697,8 +697,8 @@ struct DeviceConv2dFwdXdl_C_Shuffle_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_W
false>;
ave_time = launch_and_time_kernel(
+ stream_config,
kernel,
- nrepeat,
dim3(grid_size),
dim3(BlockSize),
0,
@@ -717,9 +717,10 @@ struct DeviceConv2dFwdXdl_C_Shuffle_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_W
return ave_time;
}
- float Run(const BaseArgument* p_arg, int nrepeat = 1) override
+ float Run(const BaseArgument* p_arg,
+ const StreamConfig& stream_config = StreamConfig{}) override
{
- return Run(*dynamic_cast(p_arg), nrepeat);
+ return Run(*dynamic_cast(p_arg), stream_config);
}
};
diff --git a/include/ck/tensor_operation/gpu/device/device_conv2d_fwd_xdl_nhwc_kyxc_nhwk.hpp b/include/ck/tensor_operation/gpu/device/device_conv2d_fwd_xdl_nhwc_kyxc_nhwk.hpp
index 3574f7667e..aa9229f7cb 100644
--- a/include/ck/tensor_operation/gpu/device/device_conv2d_fwd_xdl_nhwc_kyxc_nhwk.hpp
+++ b/include/ck/tensor_operation/gpu/device/device_conv2d_fwd_xdl_nhwc_kyxc_nhwk.hpp
@@ -450,7 +450,7 @@ struct DeviceConv2dFwdXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K
{
using Argument = DeviceOp::Argument;
- float Run(const Argument& arg, int nrepeat = 1)
+ float Run(const Argument& arg, const StreamConfig& stream_config = StreamConfig{})
{
#if 0
{
@@ -498,8 +498,8 @@ struct DeviceConv2dFwdXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K
remove_reference_t,
true>;
- ave_time = launch_and_time_kernel(kernel,
- nrepeat,
+ ave_time = launch_and_time_kernel(stream_config,
+ kernel,
dim3(grid_size),
dim3(BlockSize),
0,
@@ -529,8 +529,8 @@ struct DeviceConv2dFwdXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K
remove_reference_t,
false>;
- ave_time = launch_and_time_kernel(kernel,
- nrepeat,
+ ave_time = launch_and_time_kernel(stream_config,
+ kernel,
dim3(grid_size),
dim3(BlockSize),
0,
@@ -549,9 +549,10 @@ struct DeviceConv2dFwdXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K
return ave_time;
}
- float Run(const BaseArgument* p_arg, int nrepeat = 1) override
+ float Run(const BaseArgument* p_arg,
+ const StreamConfig& stream_config = StreamConfig{}) override
{
- return Run(*dynamic_cast(p_arg), nrepeat);
+ return Run(*dynamic_cast(p_arg), stream_config);
}
};
diff --git a/include/ck/tensor_operation/gpu/device/device_conv3d_fwd_naive_ndhwc_kzyxc_ndhwk.hpp b/include/ck/tensor_operation/gpu/device/device_conv3d_fwd_naive_ndhwc_kzyxc_ndhwk.hpp
index c3ebe58865..b1eea0b33f 100644
--- a/include/ck/tensor_operation/gpu/device/device_conv3d_fwd_naive_ndhwc_kzyxc_ndhwk.hpp
+++ b/include/ck/tensor_operation/gpu/device/device_conv3d_fwd_naive_ndhwc_kzyxc_ndhwk.hpp
@@ -4,7 +4,7 @@
#include
#include
#include
-#include "conv_fwd_util.hpp"
+#include "conv_util.hpp"
#include "device.hpp"
#include "device_conv_fwd.hpp"
#include "common_header.hpp"
@@ -92,7 +92,7 @@ struct DeviceConv3dFwdNaive_Input_N_Di_Hi_Wi_C_Weight_K_Z_Y_X_C_Output_N_Do_Ho_W
{
using Argument = DeviceOp::Argument;
- float Run(const Argument& arg, int nrepeat = 1)
+ float Run(const Argument& arg, const StreamConfig& stream_config = StreamConfig{})
{
const auto naive_conv3d_fwd =
ref::naive_conv_fwd_ndhwc_kzyxc_ndhwk;
- float ave_time = launch_and_time_kernel(naive_conv3d_fwd,
- nrepeat,
+ float ave_time = launch_and_time_kernel(stream_config,
+ naive_conv3d_fwd,
dim3(256),
dim3(256),
0,
@@ -137,9 +137,10 @@ struct DeviceConv3dFwdNaive_Input_N_Di_Hi_Wi_C_Weight_K_Z_Y_X_C_Output_N_Do_Ho_W
}
// polymorphic
- float Run(const BaseArgument* p_arg, int nrepeat = 1) override
+ float Run(const BaseArgument* p_arg,
+ const StreamConfig& stream_config = StreamConfig{}) override
{
- return Run(*dynamic_cast(p_arg), nrepeat);
+ return Run(*dynamic_cast(p_arg), stream_config);
}
};
diff --git a/include/ck/tensor_operation/gpu/device/device_conv3d_fwd_xdl_ndhwc_kzyxc_ndhwk.hpp b/include/ck/tensor_operation/gpu/device/device_conv3d_fwd_xdl_ndhwc_kzyxc_ndhwk.hpp
index ff30a6880d..0f98ba054d 100644
--- a/include/ck/tensor_operation/gpu/device/device_conv3d_fwd_xdl_ndhwc_kzyxc_ndhwk.hpp
+++ b/include/ck/tensor_operation/gpu/device/device_conv3d_fwd_xdl_ndhwc_kzyxc_ndhwk.hpp
@@ -438,7 +438,7 @@ struct DeviceConv3dFwdXdl_Input_N_Di_Hi_Wi_C_Weight_K_Z_Y_X_C_Output_N_Do_Ho_Wo_
{
using Argument = DeviceOp::Argument;
- float Run(const Argument& arg, int nrepeat = 1)
+ float Run(const Argument& arg, const StreamConfig& stream_config = StreamConfig{})
{
{
std::cout << "num_batches_of_GEMM = " << arg.num_subbatches_ << std::endl;
@@ -487,8 +487,8 @@ struct DeviceConv3dFwdXdl_Input_N_Di_Hi_Wi_C_Weight_K_Z_Y_X_C_Output_N_Do_Ho_Wo_
OutElementwiseOperation,
remove_reference_t,
true>;
- ave_time = launch_and_time_kernel(kernel,
- nrepeat,
+ ave_time = launch_and_time_kernel(stream_config,
+ kernel,
dim3(grid_size),
dim3(BlockSize),
0,
@@ -522,8 +522,8 @@ struct DeviceConv3dFwdXdl_Input_N_Di_Hi_Wi_C_Weight_K_Z_Y_X_C_Output_N_Do_Ho_Wo_
remove_reference_t,
false>;
- ave_time = launch_and_time_kernel(kernel,
- nrepeat,
+ ave_time = launch_and_time_kernel(stream_config,
+ kernel,
dim3(grid_size),
dim3(BlockSize),
0,
@@ -547,9 +547,10 @@ struct DeviceConv3dFwdXdl_Input_N_Di_Hi_Wi_C_Weight_K_Z_Y_X_C_Output_N_Do_Ho_Wo_
}
// polymorphic
- float Run(const BaseArgument* p_arg, int nrepeat = 1) override
+ float Run(const BaseArgument* p_arg,
+ const StreamConfig& stream_config = StreamConfig{}) override
{
- return Run(*dynamic_cast(p_arg), nrepeat);
+ return Run(*dynamic_cast(p_arg), stream_config);
}
};
diff --git a/include/ck/tensor_operation/gpu/device/device_convnd_bwd_data_xdl_ndhwc_kzyxc_ndhwk.hpp b/include/ck/tensor_operation/gpu/device/device_convnd_bwd_data_xdl_ndhwc_kzyxc_ndhwk.hpp
index 5dca8f9629..209b3c866e 100644
--- a/include/ck/tensor_operation/gpu/device/device_convnd_bwd_data_xdl_ndhwc_kzyxc_ndhwk.hpp
+++ b/include/ck/tensor_operation/gpu/device/device_convnd_bwd_data_xdl_ndhwc_kzyxc_ndhwk.hpp
@@ -1241,7 +1241,7 @@ struct DeviceConvndBwdDataXdl_Input_N_Di_Hi_Wi_C_Weight_K_Z_Y_X_C_Output_N_Do_Ho
{
using Argument = DeviceOp::Argument;
- float Run(const Argument& arg, int nrepeat = 1)
+ float Run(const Argument& arg, const StreamConfig& stream_config = StreamConfig{})
{
float ave_time = 0;
for(size_t i = 0; i < arg.a_grid_desc_k0_m_k1_container_.size(); i++)
@@ -1316,8 +1316,8 @@ struct DeviceConvndBwdDataXdl_Input_N_Di_Hi_Wi_C_Weight_K_Z_Y_X_C_Output_N_Do_Ho
true>;
ave_time += launch_and_time_kernel(
+ stream_config,
kernel,
- nrepeat,
dim3(grid_size),
dim3(BlockSize),
0,
@@ -1349,8 +1349,8 @@ struct DeviceConvndBwdDataXdl_Input_N_Di_Hi_Wi_C_Weight_K_Z_Y_X_C_Output_N_Do_Ho
false>;
ave_time += launch_and_time_kernel(
+ stream_config,
kernel,
- nrepeat,
dim3(grid_size),
dim3(BlockSize),
0,
@@ -1369,9 +1369,10 @@ struct DeviceConvndBwdDataXdl_Input_N_Di_Hi_Wi_C_Weight_K_Z_Y_X_C_Output_N_Do_Ho
return ave_time;
}
- float Run(const BaseArgument* p_arg, int nrepeat = 1) override
+ float Run(const BaseArgument* p_arg,
+ const StreamConfig& stream_config = StreamConfig{}) override
{
- return Run(*dynamic_cast(p_arg), nrepeat);
+ return Run(*dynamic_cast(p_arg), stream_config);
}
};
diff --git a/include/ck/tensor_operation/gpu/device/device_convnd_fwd_xdl_nhwc_kyxc_nhwk.hpp b/include/ck/tensor_operation/gpu/device/device_convnd_fwd_xdl_nhwc_kyxc_nhwk.hpp
index 7365f9a3e2..4251052a99 100644
--- a/include/ck/tensor_operation/gpu/device/device_convnd_fwd_xdl_nhwc_kyxc_nhwk.hpp
+++ b/include/ck/tensor_operation/gpu/device/device_convnd_fwd_xdl_nhwc_kyxc_nhwk.hpp
@@ -747,7 +747,7 @@ struct DeviceConvNDFwdXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K
{
using Argument = DeviceOp::Argument;
- float Run(const Argument& arg, int nrepeat = 1)
+ float Run(const Argument& arg, const StreamConfig& stream_config = StreamConfig{})
{
#if 0
{
@@ -795,8 +795,8 @@ struct DeviceConvNDFwdXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K
remove_reference_t,
true>;
- ave_time = launch_and_time_kernel(kernel,
- nrepeat,
+ ave_time = launch_and_time_kernel(stream_config,
+ kernel,
dim3(grid_size),
dim3(BlockSize),
0,
@@ -826,8 +826,8 @@ struct DeviceConvNDFwdXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K
remove_reference_t,
false>;
- ave_time = launch_and_time_kernel(kernel,
- nrepeat,
+ ave_time = launch_and_time_kernel(stream_config,
+ kernel,
dim3(grid_size),
dim3(BlockSize),
0,
@@ -846,9 +846,10 @@ struct DeviceConvNDFwdXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K
return ave_time;
}
- float Run(const BaseArgument* p_arg, int nrepeat = 1) override
+ float Run(const BaseArgument* p_arg,
+ const StreamConfig& stream_config = StreamConfig{}) override
{
- return Run(*dynamic_cast(p_arg), nrepeat);
+ return Run(*dynamic_cast