diff --git a/CMakeLists.txt b/CMakeLists.txt
index 2b798e38f3..f18c85c683 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -27,6 +27,8 @@ set(CMAKE_CXX_STANDARD_REQUIRED ON)
set(CMAKE_CXX_EXTENSIONS OFF)
message("CMAKE_CXX_COMPILER_ID: ${CMAKE_CXX_COMPILER_ID}")
+option(CK_TIME_KERNEL "Turning off will disable kernel timing globally" ON)
+
## OpenMP
if(CMAKE_CXX_COMPILER_ID MATCHES "Clang")
# workaround issue hipcc in rocm3.5 cannot find openmp
@@ -72,8 +74,9 @@ message(STATUS "Build with HIP ${HIP_VERSION}")
rocm_create_package(
- NAME CK-${CK_BACKEND}
+ NAME composablekernel
DESCRIPTION "High Performance Composable Kernel for AMD GPUs"
+ MAINTAINER "MIOpen Kernels Dev Team
"
LDCONFIG
)
@@ -226,7 +229,7 @@ set(CMAKE_LIBRARY_OUTPUT_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/lib)
set(CMAKE_ARCHIVE_OUTPUT_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/lib)
set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/bin)
-configure_file("${PROJECT_SOURCE_DIR}/include/ck/hip_version.hpp.in" "${PROJECT_BINARY_DIR}/include/ck/hip_version.hpp")
+configure_file("${PROJECT_SOURCE_DIR}/include/ck/options.hpp.in" "${PROJECT_BINARY_DIR}/include/ck/options.hpp")
include_directories(BEFORE
${PROJECT_SOURCE_DIR}/include
@@ -234,7 +237,6 @@ include_directories(BEFORE
${PROJECT_SOURCE_DIR}/library/include
)
-include(googletest)
SET(BUILD_DEV ON CACHE BOOL "BUILD_DEV")
if(BUILD_DEV)
@@ -247,3 +249,25 @@ add_subdirectory(library)
add_subdirectory(example)
add_subdirectory(test)
add_subdirectory(profiler)
+
+#Create an interface target for the include only files and call it "composablekernels"
+include(CMakePackageConfigHelpers)
+
+set(version 1.0.0)
+write_basic_package_version_file(
+ "${CMAKE_CURRENT_BINARY_DIR}/composable_kernelConfigVersion.cmake"
+ VERSION "${version}"
+ COMPATIBILITY AnyNewerVersion
+)
+
+configure_package_config_file(${CMAKE_CURRENT_SOURCE_DIR}/Config.cmake.in
+ "${CMAKE_CURRENT_BINARY_DIR}/composable_kernelConfig.cmake"
+ INSTALL_DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/composable_kernel
+ NO_CHECK_REQUIRED_COMPONENTS_MACRO
+)
+
+install(FILES
+ "${CMAKE_CURRENT_BINARY_DIR}/composable_kernelConfig.cmake"
+ "${CMAKE_CURRENT_BINARY_DIR}/composable_kernelConfigVersion.cmake"
+ DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/composable_kernel
+)
diff --git a/Config.cmake.in b/Config.cmake.in
new file mode 100644
index 0000000000..12b5c331ae
--- /dev/null
+++ b/Config.cmake.in
@@ -0,0 +1,11 @@
+@PACKAGE_INIT@
+
+set(_composable_kernel_supported_components device_operations host_tensor)
+
+foreach(_comp ${composable_kernel_FIND_COMPONENTS})
+ if(NOT _comp IN_LIST _composable_kernel_supported_components)
+ set(composable_kernel_FOUND False)
+ set(composable_kernel_NOT_FOUND_MESSAGE "Unsupported component: ${_comp}")
+ endif()
+ include("${CMAKE_CURRENT_LIST_DIR}/composable_kernel${_comp}Targets.cmake")
+endforeach()
diff --git a/Dockerfile b/Dockerfile
index c4cf0fac57..9a443e01de 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -11,13 +11,7 @@ ARG DEB_ROCM_REPO=http://repo.radeon.com/rocm/apt/.apt_$ROCMVERSION/
RUN apt-get update
RUN apt-get install -y wget gnupg
RUN wget -qO - http://repo.radeon.com/rocm/rocm.gpg.key | apt-key add -
-RUN if ! [ -z $OSDB_BKC_VERSION ]; then \
- echo "Using BKC VERISION: $OSDB_BKC_VERSION";\
- sh -c "echo deb [arch=amd64 trusted=yes] http://compute-artifactory.amd.com/artifactory/list/rocm-osdb-deb/ compute-rocm-dkms-no-npi-hipclang ${OSDB_BKC_VERSION} > /etc/apt/sources.list.d/rocm.list" ;\
- cat /etc/apt/sources.list.d/rocm.list;\
- else \
- sh -c "echo deb [arch=amd64] $DEB_ROCM_REPO ubuntu main > /etc/apt/sources.list.d/rocm.list" ;\
- fi
+RUN sh -c "echo deb [arch=amd64] $DEB_ROCM_REPO ubuntu main > /etc/apt/sources.list.d/rocm.list"
RUN wget --no-check-certificate -qO - https://apt.kitware.com/keys/kitware-archive-latest.asc 2>/dev/null | apt-key add -
RUN sh -c "echo deb https://apt.kitware.com/ubuntu/ bionic main | tee -a /etc/apt/sources.list"
@@ -25,18 +19,15 @@ RUN sh -c "echo deb https://apt.kitware.com/ubuntu/ bionic main | tee -a /etc/ap
# Install dependencies
RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --allow-unauthenticated \
apt-utils \
- sshpass \
build-essential \
cmake-data=3.15.1-0kitware1 \
cmake=3.15.1-0kitware1 \
curl \
- doxygen \
g++ \
gdb \
git \
hip-rocclr \
jq \
- lcov \
libelf-dev \
libncurses5-dev \
libnuma-dev \
@@ -62,8 +53,6 @@ RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --allow-
apt-get clean && \
rm -rf /var/lib/apt/lists/*
-# RUN pip3 install --default-timeout=100000 -r requirements.txt
-
# Setup ubsan environment to printstacktrace
RUN ln -s /usr/bin/llvm-symbolizer-3.8 /usr/local/bin/llvm-symbolizer
ENV UBSAN_OPTIONS=print_stacktrace=1
@@ -92,5 +81,3 @@ ADD rbuild.ini /rbuild.ini
ADD dev-requirements.txt dev-requirements.txt
RUN rbuild prepare -s develop -d $PREFIX
RUN groupadd -f render
-# RUN cget install -f min-requirements.txt
-# RUN CXXFLAGS='-isystem $PREFIX/include' cget install -f ./mlir-requirements.txt
diff --git a/Jenkinsfile b/Jenkinsfile
index f065d4ecc5..77f4d9d8be 100644
--- a/Jenkinsfile
+++ b/Jenkinsfile
@@ -320,7 +320,7 @@ pipeline {
{
agent{ label rocmnode("gfx908")}
environment{
- setup_args = """ -D CMAKE_CXX_FLAGS="--offload-arch=gfx908 -O3 " -DBUILD_DEV=On """
+ setup_args = """ -D CMAKE_CXX_FLAGS=" --offload-arch=gfx900 --offload-arch=gfx906 --offload-arch=gfx908 --offload-arch=gfx90a -O3 " -DBUILD_DEV=On """
}
steps{
buildHipClangJobAndReboot(setup_args:setup_args, config_targets: "check", no_reboot:true, build_type: 'Release')
@@ -341,6 +341,23 @@ pipeline {
}
}
+ stage("Client App")
+ {
+ parallel
+ {
+ stage("Run Client App")
+ {
+ agent{ label rocmnode("gfx908")}
+ environment{
+ setup_args = """ -D -DBUILD_DEV=Off -DCMAKE_INSTALL_PREFIX=../install CMAKE_CXX_FLAGS="--offload-arch=gfx908 -O3 " """
+ execute_args = """ cd ../test/client_app && rm -rf build && mkdir build && cd build && cmake -DCMAKE_PREFIX_PATH="${env.WORKSPACE}/install;/opt/rocm" .. && make """
+ }
+ steps{
+ buildHipClangJobAndReboot(setup_args: setup_args, config_targets: "install", no_reboot:true, build_type: 'Release', execute_cmd: execute_args, prefixpath: '/usr/local')
+ }
+ }
+ }
+ }
stage("Performance Tests")
{
parallel
diff --git a/README.md b/README.md
index f5341b5736..9d7b578046 100644
--- a/README.md
+++ b/README.md
@@ -43,3 +43,13 @@ Instructions for running each individual examples are under ```example/```
make -j ckProfiler
```
Instructions for running ckProfiler are under ```profiler/```
+
+
+## Caveat
+### Kernel Timing and Verification
+CK's own kernel timer will warn up kernel once, and then run it multiple times
+to get average kernel time. For some kernels that use atomic add, this will cause
+output buffer to be accumulated multiple times, causing verfication failure.
+To work around it, do not use CK's own timer and do verification at the same time.
+CK's own timer and verification in each example and ckProfiler can be enabled or
+disabled from command line.
diff --git a/cmake/googletest.cmake b/cmake/googletest.cmake
index f869ba483e..959bc4f4b0 100644
--- a/cmake/googletest.cmake
+++ b/cmake/googletest.cmake
@@ -19,6 +19,7 @@ list(APPEND GTEST_CMAKE_CXX_FLAGS
-Wno-zero-as-null-pointer-constant
-Wno-unused-member-function
-Wno-comma
+ -Wno-old-style-cast
)
message(STATUS "Suppressing googltest warnings with flags: ${GTEST_CMAKE_CXX_FLAGS}")
@@ -35,4 +36,4 @@ FetchContent_MakeAvailable(googletest)
target_compile_options(gtest PRIVATE ${GTEST_CMAKE_CXX_FLAGS})
target_compile_options(gtest_main PRIVATE ${GTEST_CMAKE_CXX_FLAGS})
target_compile_options(gmock PRIVATE ${GTEST_CMAKE_CXX_FLAGS})
-
+target_compile_options(gmock_main PRIVATE ${GTEST_CMAKE_CXX_FLAGS})
diff --git a/example/01_gemm/gemm_xdl_bf16.cpp b/example/01_gemm/gemm_xdl_bf16.cpp
index a4567dcd6e..4077a4f8d8 100644
--- a/example/01_gemm/gemm_xdl_bf16.cpp
+++ b/example/01_gemm/gemm_xdl_bf16.cpp
@@ -88,9 +88,9 @@ using ReferenceGemmInstance = ck::tensor_operation::host::
int main(int argc, char* argv[])
{
- bool do_verification = 0;
- int init_method = 0;
- int nrepeat = 5;
+ bool do_verification = true;
+ int init_method = 1;
+ bool time_kernel = false;
// GEMM shape
ck::index_t M = 3840;
@@ -105,13 +105,13 @@ int main(int argc, char* argv[])
{
do_verification = std::stoi(argv[1]);
init_method = std::stoi(argv[2]);
- nrepeat = std::stoi(argv[3]);
+ time_kernel = std::stoi(argv[3]);
}
else if(argc == 10)
{
do_verification = std::stoi(argv[1]);
init_method = std::stoi(argv[2]);
- nrepeat = std::stoi(argv[3]);
+ time_kernel = std::stoi(argv[3]);
M = std::stoi(argv[4]);
N = std::stoi(argv[5]);
@@ -125,7 +125,7 @@ int main(int argc, char* argv[])
{
printf("arg1: verification (0=no, 1=yes)\n");
printf("arg2: initialization (0=no init, 1=integer value, 2=decimal value)\n");
- printf("arg3: run kernel # of times (>1)\n");
+ printf("arg3: time kernel (0=n0, 1=yes)\n");
printf("arg4 to 9: M (256x), N(128x), K(32x), StrideA, StrideB, StrideC\n");
exit(0);
}
@@ -198,7 +198,7 @@ int main(int argc, char* argv[])
"not support this GEMM problem");
}
- float ave_time = invoker.Run(argument, nrepeat);
+ float ave_time = invoker.Run(argument, StreamConfig{nullptr, time_kernel});
std::size_t flop = std::size_t(2) * M * N * K;
std::size_t num_btype =
diff --git a/example/01_gemm/gemm_xdl_fp16.cpp b/example/01_gemm/gemm_xdl_fp16.cpp
index fc04a13ca5..4f0228eafe 100644
--- a/example/01_gemm/gemm_xdl_fp16.cpp
+++ b/example/01_gemm/gemm_xdl_fp16.cpp
@@ -56,9 +56,9 @@ using ReferenceGemmInstance = ck::tensor_operation::host::
int main(int argc, char* argv[])
{
- bool do_verification = 0;
- int init_method = 0;
- int nrepeat = 5;
+ bool do_verification = true;
+ int init_method = 1;
+ bool time_kernel = false;
// GEMM shape
ck::index_t M = 3840;
@@ -73,13 +73,13 @@ int main(int argc, char* argv[])
{
do_verification = std::stoi(argv[1]);
init_method = std::stoi(argv[2]);
- nrepeat = std::stoi(argv[3]);
+ time_kernel = std::stoi(argv[3]);
}
else if(argc == 10)
{
do_verification = std::stoi(argv[1]);
init_method = std::stoi(argv[2]);
- nrepeat = std::stoi(argv[3]);
+ time_kernel = std::stoi(argv[3]);
M = std::stoi(argv[4]);
N = std::stoi(argv[5]);
@@ -93,7 +93,7 @@ int main(int argc, char* argv[])
{
printf("arg1: verification (0=no, 1=yes)\n");
printf("arg2: initialization (0=no init, 1=integer value, 2=decimal value)\n");
- printf("arg3: run kernel # of times (>1)\n");
+ printf("arg3: time kernel (0=n0, 1=yes)\n");
printf("arg4 to 9: M (256x), N(128x), K(32x), StrideA, StrideB, StrideC\n");
exit(0);
}
@@ -171,7 +171,7 @@ int main(int argc, char* argv[])
"not support this GEMM problem");
}
- float ave_time = invoker.Run(argument, nrepeat);
+ float ave_time = invoker.Run(argument, StreamConfig{nullptr, time_kernel});
std::size_t flop = std::size_t(2) * M * N * K;
std::size_t num_btype =
diff --git a/example/01_gemm/gemm_xdl_int8.cpp b/example/01_gemm/gemm_xdl_int8.cpp
index ab5869db61..d5bf4a8bde 100644
--- a/example/01_gemm/gemm_xdl_int8.cpp
+++ b/example/01_gemm/gemm_xdl_int8.cpp
@@ -83,9 +83,9 @@ using ReferenceGemmInstance = ck::tensor_operation::host::
int main(int argc, char* argv[])
{
- bool do_verification = 0;
- int init_method = 0;
- int nrepeat = 5;
+ bool do_verification = true;
+ int init_method = 1;
+ bool time_kernel = false;
// GEMM shape
ck::index_t M = 3840;
@@ -100,13 +100,13 @@ int main(int argc, char* argv[])
{
do_verification = std::stoi(argv[1]);
init_method = std::stoi(argv[2]);
- nrepeat = std::stoi(argv[3]);
+ time_kernel = std::stoi(argv[3]);
}
else if(argc == 10)
{
do_verification = std::stoi(argv[1]);
init_method = std::stoi(argv[2]);
- nrepeat = std::stoi(argv[3]);
+ time_kernel = std::stoi(argv[3]);
M = std::stoi(argv[4]);
N = std::stoi(argv[5]);
@@ -120,7 +120,7 @@ int main(int argc, char* argv[])
{
printf("arg1: verification (0=no, 1=yes)\n");
printf("arg2: initialization (0=no init, 1=integer value, 2=decimal value)\n");
- printf("arg3: run kernel # of times (>1)\n");
+ printf("arg3: time kernel (0=n0, 1=yes)\n");
printf("arg4 to 9: M (256x), N(128x), K(32x), StrideA, StrideB, StrideC\n");
exit(0);
}
@@ -194,7 +194,7 @@ int main(int argc, char* argv[])
"not support this GEMM problem");
}
- float ave_time = invoker.Run(argument, nrepeat);
+ float ave_time = invoker.Run(argument, StreamConfig{nullptr, time_kernel});
std::size_t flop = std::size_t(2) * M * N * K;
std::size_t num_btype =
diff --git a/example/02_gemm_alpha_beta/gemm_xdl_alpha_beta.cpp b/example/02_gemm_alpha_beta/gemm_xdl_alpha_beta.cpp
index 2abebbbac4..451200e798 100644
--- a/example/02_gemm_alpha_beta/gemm_xdl_alpha_beta.cpp
+++ b/example/02_gemm_alpha_beta/gemm_xdl_alpha_beta.cpp
@@ -86,9 +86,9 @@ using ReferenceGemmInstance = ck::tensor_operation::host::ReferenceGemmBias2D1)\n");
+ printf("arg3: time kernel (0=n0, 1=yes)\n");
printf("arg4 to 9: M (256x), N(128x), K(32x), StrideA, StrideB, StrideC, alpha, beta\n");
exit(0);
}
@@ -216,7 +216,7 @@ int main(int argc, char* argv[])
"not support this GEMM problem");
}
- float ave_time = invoker.Run(argument, nrepeat);
+ float ave_time = invoker.Run(argument, StreamConfig{nullptr, time_kernel});
std::size_t flop = std::size_t(2) * M * N * K;
std::size_t num_btype =
diff --git a/example/03_gemm_bias_relu/gemm_xdl_bias_relu.cpp b/example/03_gemm_bias_relu/gemm_xdl_bias_relu.cpp
index f3ed2bad37..308d423ce7 100644
--- a/example/03_gemm_bias_relu/gemm_xdl_bias_relu.cpp
+++ b/example/03_gemm_bias_relu/gemm_xdl_bias_relu.cpp
@@ -83,9 +83,9 @@ using ReferenceGemmInstance = ck::tensor_operation::host::ReferenceGemmBiasActiv
int main(int argc, char* argv[])
{
- bool do_verification = 0;
- int init_method = 0;
- int nrepeat = 5;
+ bool do_verification = true;
+ int init_method = 1;
+ bool time_kernel = false;
// GEMM shape
ck::index_t M = 3840;
@@ -100,13 +100,13 @@ int main(int argc, char* argv[])
{
do_verification = std::stoi(argv[1]);
init_method = std::stoi(argv[2]);
- nrepeat = std::stoi(argv[3]);
+ time_kernel = std::stoi(argv[3]);
}
else if(argc == 10)
{
do_verification = std::stoi(argv[1]);
init_method = std::stoi(argv[2]);
- nrepeat = std::stoi(argv[3]);
+ time_kernel = std::stoi(argv[3]);
M = std::stoi(argv[4]);
N = std::stoi(argv[5]);
@@ -120,7 +120,7 @@ int main(int argc, char* argv[])
{
printf("arg1: verification (0=no, 1=yes)\n");
printf("arg2: initialization (0=no init, 1=integer value, 2=decimal value)\n");
- printf("arg3: run kernel # of times (>1)\n");
+ printf("arg3: time kernel (0=n0, 1=yes)\n");
printf("arg4 to 9: M (256x), N(128x), K(32x), StrideA, StrideB, StrideC\n");
exit(0);
}
@@ -206,7 +206,7 @@ int main(int argc, char* argv[])
"not support this GEMM problem");
}
- float ave_time = invoker.Run(argument, nrepeat);
+ float ave_time = invoker.Run(argument, StreamConfig{nullptr, time_kernel});
std::size_t flop = std::size_t(2) * M * N * K;
diff --git a/example/04_gemm_bias_relu_add/gemm_xdl_bias_relu_add.cpp b/example/04_gemm_bias_relu_add/gemm_xdl_bias_relu_add.cpp
index 9405c36881..012fd21341 100644
--- a/example/04_gemm_bias_relu_add/gemm_xdl_bias_relu_add.cpp
+++ b/example/04_gemm_bias_relu_add/gemm_xdl_bias_relu_add.cpp
@@ -83,9 +83,9 @@ using ReferenceGemmInstance =
CElementOp>;
int main(int argc, char* argv[])
{
- bool do_verification = 0;
- int init_method = 0;
- int nrepeat = 5;
+ bool do_verification = true;
+ int init_method = 1;
+ bool time_kernel = false;
// GEMM shape
ck::index_t M = 3840;
@@ -101,13 +101,13 @@ int main(int argc, char* argv[])
{
do_verification = std::stoi(argv[1]);
init_method = std::stoi(argv[2]);
- nrepeat = std::stoi(argv[3]);
+ time_kernel = std::stoi(argv[3]);
}
else if(argc == 11)
{
do_verification = std::stoi(argv[1]);
init_method = std::stoi(argv[2]);
- nrepeat = std::stoi(argv[3]);
+ time_kernel = std::stoi(argv[3]);
M = std::stoi(argv[4]);
N = std::stoi(argv[5]);
@@ -122,7 +122,7 @@ int main(int argc, char* argv[])
{
printf("arg1: verification (0=no, 1=yes)\n");
printf("arg2: initialization (0=no init, 1=integer value, 2=decimal value)\n");
- printf("arg3: run kernel # of times (>1)\n");
+ printf("arg3: time kernel (0=n0, 1=yes)\n");
printf("arg4 to 10: M (256x), N(128x), K(32x), StrideA, StrideB, StrideC, StrideC1\n");
exit(0);
}
@@ -218,7 +218,7 @@ int main(int argc, char* argv[])
"not support this GEMM problem");
}
- float ave_time = invoker.Run(argument, nrepeat);
+ float ave_time = invoker.Run(argument, StreamConfig{nullptr, time_kernel});
std::size_t flop = std::size_t(2) * M * N * K;
std::size_t num_btype = sizeof(ADataType) * M * K + sizeof(BDataType) * K * M +
diff --git a/example/06_conv2d_fwd_bias_relu/conv2d_fwd_xdl_bias_relu.cpp b/example/06_conv2d_fwd_bias_relu/conv2d_fwd_xdl_bias_relu.cpp
index 53095bde0d..342de268e3 100644
--- a/example/06_conv2d_fwd_bias_relu/conv2d_fwd_xdl_bias_relu.cpp
+++ b/example/06_conv2d_fwd_bias_relu/conv2d_fwd_xdl_bias_relu.cpp
@@ -93,7 +93,7 @@ void PrintUseMsg()
{
std::cout << "arg1: verification (0=no, 1=yes)\n"
<< "arg2: initialization (0=no init, 1=integer value, 2=decimal value)\n"
- << "arg3: run kernel # of times (>1)\n"
+ << "arg3: time kernel (0=n0, 1=yes)\n"
<< "Following arguments:\n"
<< " N, K, C, \n"
<< " , (ie Y, X for 2D)\n"
@@ -165,9 +165,9 @@ int main(int argc, char* argv[])
{
using namespace ck::utils::conv;
- bool do_verification = 0;
- int init_method = 0;
- int nrepeat = 5;
+ bool do_verification = true;
+ int init_method = 1;
+ bool time_kernel = false;
const int num_dim_spatial = 2;
ck::utils::conv::ConvParams params;
@@ -176,7 +176,7 @@ int main(int argc, char* argv[])
{
do_verification = std::stoi(argv[1]);
init_method = std::stoi(argv[2]);
- nrepeat = std::stoi(argv[3]);
+ time_kernel = std::stoi(argv[3]);
}
if(argc >= 5)
@@ -269,7 +269,7 @@ int main(int argc, char* argv[])
"not support this problem");
}
- float ave_time = invoker.Run(argument, nrepeat);
+ float ave_time = invoker.Run(argument, StreamConfig{nullptr, time_kernel});
std::size_t flop = get_flops(
params.N_, params.C_, params.K_, params.filter_spatial_lengths_, output_spatial_lengths);
diff --git a/example/07_conv2d_fwd_bias_relu_add/conv2d_fwd_xdl_bias_relu_add.cpp b/example/07_conv2d_fwd_bias_relu_add/conv2d_fwd_xdl_bias_relu_add.cpp
index c2b4ca0b5d..ff4fc66cb8 100644
--- a/example/07_conv2d_fwd_bias_relu_add/conv2d_fwd_xdl_bias_relu_add.cpp
+++ b/example/07_conv2d_fwd_bias_relu_add/conv2d_fwd_xdl_bias_relu_add.cpp
@@ -90,7 +90,7 @@ void PrintUseMsg()
{
std::cout << "arg1: verification (0=no, 1=yes)\n"
<< "arg2: initialization (0=no init, 1=integer value, 2=decimal value)\n"
- << "arg3: run kernel # of times (>1)\n"
+ << "arg3: time kernel (0=n0, 1=yes)\n"
<< "Following arguments:\n"
<< " N, K, C, \n"
<< " , (ie Y, X for 2D)\n"
@@ -162,9 +162,9 @@ int main(int argc, char* argv[])
{
using namespace ck::utils::conv;
- bool do_verification = 0;
- int init_method = 0;
- int nrepeat = 5;
+ bool do_verification = true;
+ int init_method = 1;
+ bool time_kernel = false;
const int num_dim_spatial = 2;
ck::utils::conv::ConvParams params;
@@ -173,7 +173,7 @@ int main(int argc, char* argv[])
{
do_verification = std::stoi(argv[1]);
init_method = std::stoi(argv[2]);
- nrepeat = std::stoi(argv[3]);
+ time_kernel = std::stoi(argv[3]);
}
if(argc >= 5)
@@ -280,7 +280,7 @@ int main(int argc, char* argv[])
"not support this problem");
}
- float ave_time = invoker.Run(argument, nrepeat);
+ float ave_time = invoker.Run(argument, StreamConfig{nullptr, time_kernel});
std::size_t flop = get_flops(
params.N_, params.C_, params.K_, params.filter_spatial_lengths_, output_spatial_lengths);
diff --git a/example/09_convnd_fwd/convnd_fwd_xdl.cpp b/example/09_convnd_fwd/convnd_fwd_xdl.cpp
index 71f49b5e71..112d606f56 100644
--- a/example/09_convnd_fwd/convnd_fwd_xdl.cpp
+++ b/example/09_convnd_fwd/convnd_fwd_xdl.cpp
@@ -107,7 +107,7 @@ void print_use_msg()
{
std::cout << "arg1: verification (0=no, 1=yes)\n"
<< "arg2: initialization (0=no init, 1=integer value, 2=decimal value)\n"
- << "arg3: run kernel # of times (>1)\n"
+ << "arg3: time kernel (0=n0, 1=yes)\n"
<< "arg4: N spatial dimensions (default 2)\n"
<< "Following arguments (depending on number of spatial dims):\n"
<< " N, K, C, \n"
@@ -179,9 +179,9 @@ int main(int argc, char* argv[])
{
using namespace ck::utils::conv;
- bool do_verification = 0;
- int init_method = 0;
- int nrepeat = 5;
+ bool do_verification = true;
+ int init_method = 1;
+ bool time_kernel = false;
int num_dim_spatial = 2;
ck::utils::conv::ConvParams params;
@@ -190,7 +190,7 @@ int main(int argc, char* argv[])
{
do_verification = std::stoi(argv[1]);
init_method = std::stoi(argv[2]);
- nrepeat = std::stoi(argv[3]);
+ time_kernel = std::stoi(argv[3]);
num_dim_spatial = std::stoi(argv[4]);
}
@@ -276,7 +276,7 @@ int main(int argc, char* argv[])
"not support this Conv problem");
}
- float ave_time = invoker->Run(argument.get(), nrepeat);
+ float ave_time = invoker->Run(argument.get(), StreamConfig{nullptr, time_kernel});
std::size_t flop = get_flops(
params.N_, params.C_, params.K_, params.filter_spatial_lengths_, output_spatial_lengths);
diff --git a/example/09_convnd_fwd/convnd_fwd_xdl_fp16.cpp b/example/09_convnd_fwd/convnd_fwd_xdl_fp16.cpp
index c1361a8db3..8b658e7790 100644
--- a/example/09_convnd_fwd/convnd_fwd_xdl_fp16.cpp
+++ b/example/09_convnd_fwd/convnd_fwd_xdl_fp16.cpp
@@ -110,7 +110,7 @@ void print_use_msg()
{
std::cout << "arg1: verification (0=no, 1=yes)\n"
<< "arg2: initialization (0=no init, 1=integer value, 2=decimal value)\n"
- << "arg3: run kernel # of times (>1)\n"
+ << "arg3: time kernel (0=n0, 1=yes)\n"
<< "arg4: N spatial dimensions (default 2)\n"
<< "Following arguments (depending on number of spatial dims):\n"
<< " N, K, C, \n"
@@ -182,9 +182,9 @@ int main(int argc, char* argv[])
{
using namespace ck::utils::conv;
- bool do_verification = 0;
- int init_method = 0;
- int nrepeat = 5;
+ bool do_verification = true;
+ int init_method = 1;
+ bool time_kernel = false;
int num_dim_spatial = 2;
ck::utils::conv::ConvParams params;
@@ -193,7 +193,7 @@ int main(int argc, char* argv[])
{
do_verification = std::stoi(argv[1]);
init_method = std::stoi(argv[2]);
- nrepeat = std::stoi(argv[3]);
+ time_kernel = std::stoi(argv[3]);
num_dim_spatial = std::stoi(argv[4]);
}
@@ -277,7 +277,7 @@ int main(int argc, char* argv[])
"not support this Conv problem");
}
- float ave_time = invoker->Run(argument.get(), nrepeat);
+ float ave_time = invoker->Run(argument.get(), StreamConfig{nullptr, time_kernel});
std::size_t flop = get_flops(
params.N_, params.C_, params.K_, params.filter_spatial_lengths_, output_spatial_lengths);
diff --git a/example/09_convnd_fwd/convnd_fwd_xdl_int8.cpp b/example/09_convnd_fwd/convnd_fwd_xdl_int8.cpp
index 3d3e34dfd9..e7988d8683 100644
--- a/example/09_convnd_fwd/convnd_fwd_xdl_int8.cpp
+++ b/example/09_convnd_fwd/convnd_fwd_xdl_int8.cpp
@@ -112,7 +112,7 @@ void print_use_msg()
{
std::cout << "arg1: verification (0=no, 1=yes)\n"
<< "arg2: initialization (0=no init, 1=integer value, 2=decimal value)\n"
- << "arg3: run kernel # of times (>1)\n"
+ << "arg3: time kernel (0=n0, 1=yes)\n"
<< "arg4: N spatial dimensions (default 2)\n"
<< "Following arguments (depending on number of spatial dims):\n"
<< " N, K, C, \n"
@@ -184,9 +184,9 @@ int main(int argc, char* argv[])
{
using namespace ck::utils::conv;
- bool do_verification = 0;
- int init_method = 0;
- int nrepeat = 5;
+ bool do_verification = true;
+ int init_method = 1;
+ bool time_kernel = false;
int num_dim_spatial = 2;
ck::utils::conv::ConvParams params;
@@ -195,7 +195,7 @@ int main(int argc, char* argv[])
{
do_verification = std::stoi(argv[1]);
init_method = std::stoi(argv[2]);
- nrepeat = std::stoi(argv[3]);
+ time_kernel = std::stoi(argv[3]);
num_dim_spatial = std::stoi(argv[4]);
}
@@ -279,7 +279,7 @@ int main(int argc, char* argv[])
"not support this Conv problem");
}
- float ave_time = invoker->Run(argument.get(), nrepeat);
+ float ave_time = invoker->Run(argument.get(), StreamConfig{nullptr, time_kernel});
std::size_t flop = get_flops(
params.N_, params.C_, params.K_, params.filter_spatial_lengths_, output_spatial_lengths);
diff --git a/example/10_conv2d_bwd_data/conv2d_bwd_data_xdl.cpp b/example/10_conv2d_bwd_data/conv2d_bwd_data_xdl.cpp
index f3f9b497f5..73210fa543 100644
--- a/example/10_conv2d_bwd_data/conv2d_bwd_data_xdl.cpp
+++ b/example/10_conv2d_bwd_data/conv2d_bwd_data_xdl.cpp
@@ -77,9 +77,9 @@ using ReferenceConvBwdInstance = ck::tensor_operation::host::ReferenceConvBwdDat
int main(int argc, char* argv[])
{
- bool do_verification = 0;
- int init_method = 0;
- int nrepeat = 5;
+ bool do_verification = true;
+ int init_method = 1;
+ bool time_kernel = false;
// Conv shape
ck::index_t N = 128;
@@ -102,13 +102,13 @@ int main(int argc, char* argv[])
{
do_verification = std::stoi(argv[1]);
init_method = std::stoi(argv[2]);
- nrepeat = std::stoi(argv[3]);
+ time_kernel = std::stoi(argv[3]);
}
else if(argc == 19)
{
do_verification = std::stoi(argv[1]);
init_method = std::stoi(argv[2]);
- nrepeat = std::stoi(argv[3]);
+ time_kernel = std::stoi(argv[3]);
N = std::stoi(argv[4]);
K = std::stoi(argv[5]);
@@ -130,7 +130,7 @@ int main(int argc, char* argv[])
{
printf("arg1: verification (0=no, 1=yes)\n");
printf("arg2: initialization (0=no init, 1=integer value, 2=decimal value)\n");
- printf("arg3: run kernel # of times (>1)\n");
+ printf("arg3: time kernel (0=n0, 1=yes)\n");
printf("arg4 to 18: N, K, C, Y, X, Hi, Wi, Sy, Sx, Dy, Dx, LeftPy, LeftPx, RightPy, "
"RightPx\n");
exit(0);
@@ -214,7 +214,7 @@ int main(int argc, char* argv[])
"not support this Conv problem");
}
- float ave_time = invoker.Run(argument, nrepeat);
+ float ave_time = invoker.Run(argument, StreamConfig{nullptr, time_kernel});
std::size_t flop = std::size_t(2) * N * K * Ho * Wo * C * Y * X;
diff --git a/example/11_conv2d_bwd_weight/conv2d_bwd_weight_xdl.cpp b/example/11_conv2d_bwd_weight/conv2d_bwd_weight_xdl.cpp
index bf78cc87e0..0c996dc21b 100644
--- a/example/11_conv2d_bwd_weight/conv2d_bwd_weight_xdl.cpp
+++ b/example/11_conv2d_bwd_weight/conv2d_bwd_weight_xdl.cpp
@@ -82,9 +82,9 @@ using ReferenceConvBwdWeightInstance =
int main(int argc, char* argv[])
{
- bool do_verification = 0;
- int init_method = 0;
- int nrepeat = 5;
+ bool do_verification = true;
+ int init_method = 1;
+ bool time_kernel = false;
int do_log = 0;
int split_k = 4;
@@ -109,7 +109,7 @@ int main(int argc, char* argv[])
{
do_verification = std::stoi(argv[1]);
init_method = std::stoi(argv[2]);
- nrepeat = std::stoi(argv[3]);
+ time_kernel = std::stoi(argv[3]);
do_log = std::stoi(argv[4]);
split_k = std::stoi(argv[5]);
}
@@ -117,7 +117,7 @@ int main(int argc, char* argv[])
{
do_verification = std::stoi(argv[1]);
init_method = std::stoi(argv[2]);
- nrepeat = std::stoi(argv[3]);
+ time_kernel = std::stoi(argv[3]);
do_log = std::stoi(argv[4]);
split_k = std::stoi(argv[5]);
@@ -141,7 +141,7 @@ int main(int argc, char* argv[])
{
printf("arg1: verification (0=no, 1=yes)\n");
printf("arg2: initialization (0=no init, 1=integer value, 2=decimal value)\n");
- printf("arg3: run kernel # of times (>1)\n");
+ printf("arg3: time kernel (0=n0, 1=yes)\n");
printf("arg4: is show log (0=no, 1=yes)\n");
printf("arg5: split-k \n");
printf("arg6 to 19: N, K, C, Y, X, Hi, Wi, Sy, Sx, Dy, Dx, LeftPy, LeftPx, RightPy, "
@@ -246,7 +246,7 @@ int main(int argc, char* argv[])
return 1;
}
- float ave_time = invoker.Run(argument, nrepeat);
+ float ave_time = invoker.Run(argument, StreamConfig{nullptr, time_kernel});
std::size_t flop = std::size_t(2) * N * K * Ho * Wo * C * Y * X;
diff --git a/example/12_reduce/reduce_blockwise.cpp b/example/12_reduce/reduce_blockwise.cpp
index 7ca9823ff5..caa93c9df2 100644
--- a/example/12_reduce/reduce_blockwise.cpp
+++ b/example/12_reduce/reduce_blockwise.cpp
@@ -116,10 +116,9 @@ class SimpleAppArgs
std::vector inLengths;
std::vector scales;
- bool do_verification = false;
-
- int init_method = 1;
- int nrepeat = 5;
+ bool do_verification = true;
+ int init_method = 1;
+ bool time_kernel = false;
public:
void show_usage(const char* cmd)
@@ -135,7 +134,7 @@ class SimpleAppArgs
std::cout << "Arg1 -- init method (0=no init, 1=single integer value, 2=scope integer "
"value, 3=decimal value)"
<< std::endl;
- std::cout << "Arg2 -- number of repeats to run the kernel" << std::endl;
+ std::cout << "Arg2 -- time kernel (0=n0, 1=yes)" << std::endl;
};
int processArgs(int argc, char* argv[])
@@ -182,7 +181,7 @@ class SimpleAppArgs
throw std::runtime_error("Invalid cmd-line arguments, more argumetns are needed!");
init_method = std::atoi(argv[optind++]);
- nrepeat = std::atoi(argv[optind]);
+ time_kernel = std::atoi(argv[optind]);
if(scales.empty())
{
@@ -352,7 +351,7 @@ int main(int argc, char* argv[])
auto invoker_ptr = reduce.MakeInvokerPointer();
- float avg_time = invoker_ptr->Run(argument_ptr.get(), args.nrepeat);
+ float avg_time = invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, args.time_kernel});
std::size_t num_bytes = invariant_total_length * reduce_total_length * sizeof(InDataType) +
invariant_total_length * sizeof(OutDataType);
diff --git a/example/13_pool2d_fwd/pool2d_fwd.cpp b/example/13_pool2d_fwd/pool2d_fwd.cpp
index a18761095c..f4eb9d79f6 100644
--- a/example/13_pool2d_fwd/pool2d_fwd.cpp
+++ b/example/13_pool2d_fwd/pool2d_fwd.cpp
@@ -149,9 +149,9 @@ int main(int argc, char* argv[])
{
using namespace ck::host_reduce;
- bool do_verification = 0;
- int init_method = 0;
- int nrepeat = 5;
+ bool do_verification = true;
+ int init_method = 1;
+ bool time_kernel = false;
// Pool shape
ck::index_t N = 128;
@@ -171,13 +171,13 @@ int main(int argc, char* argv[])
{
do_verification = std::stoi(argv[1]);
init_method = std::stoi(argv[2]);
- nrepeat = std::stoi(argv[3]);
+ time_kernel = std::stoi(argv[3]);
}
else if(argc == 16)
{
do_verification = std::stoi(argv[1]);
init_method = std::stoi(argv[2]);
- nrepeat = std::stoi(argv[3]);
+ time_kernel = std::stoi(argv[3]);
N = std::stoi(argv[4]);
C = std::stoi(argv[5]);
@@ -196,7 +196,7 @@ int main(int argc, char* argv[])
{
printf("arg1: verification (0=no, 1=yes)\n");
printf("arg2: initialization (0=no init, 1=integer value, 2=decimal value)\n");
- printf("arg3: run kernel # of times (>1)\n");
+ printf("arg3: time kernel (0=n0, 1=yes)\n");
printf("arg4 to 15: N, C, Y, X, Hi, Wi, Sy, Sx, LeftPy, LeftPx, RightPy, "
"RightPx\n");
exit(0);
@@ -271,7 +271,7 @@ int main(int argc, char* argv[])
"not support this problem");
}
- float ave_time = invoker_ptr->Run(argument_ptr.get(), nrepeat);
+ float ave_time = invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, time_kernel});
std::size_t flop = std::size_t(2) * N * C * Ho * Wo * Y * X;
diff --git a/example/14_gemm_xdl_requant_relu_requant/gemm_xdl_requant_relu_requant_int8.cpp b/example/14_gemm_xdl_requant_relu_requant/gemm_xdl_requant_relu_requant_int8.cpp
index 324dc35d3f..9fc63308b7 100644
--- a/example/14_gemm_xdl_requant_relu_requant/gemm_xdl_requant_relu_requant_int8.cpp
+++ b/example/14_gemm_xdl_requant_relu_requant/gemm_xdl_requant_relu_requant_int8.cpp
@@ -105,9 +105,9 @@ using ReferenceGemmInstance = ck::tensor_operation::host::
int main(int argc, char* argv[])
{
- bool do_verification = 0;
- int init_method = 0;
- int nrepeat = 5;
+ bool do_verification = true;
+ int init_method = 1;
+ bool time_kernel = false;
// GEMM shape
ck::index_t M = 3840;
@@ -125,13 +125,13 @@ int main(int argc, char* argv[])
{
do_verification = std::stoi(argv[1]);
init_method = std::stoi(argv[2]);
- nrepeat = std::stoi(argv[3]);
+ time_kernel = std::stoi(argv[3]);
}
else if(argc == 10)
{
do_verification = std::stoi(argv[1]);
init_method = std::stoi(argv[2]);
- nrepeat = std::stoi(argv[3]);
+ time_kernel = std::stoi(argv[3]);
M = std::stoi(argv[4]);
N = std::stoi(argv[5]);
@@ -145,7 +145,7 @@ int main(int argc, char* argv[])
{
printf("arg1: verification (0=no, 1=yes)\n");
printf("arg2: initialization (0=no init, 1=integer value, 2=decimal value)\n");
- printf("arg3: run kernel # of times (>1)\n");
+ printf("arg3: time kernel (0=n0, 1=yes)\n");
printf("arg4 to 9: M (256x), N(128x), K(32x), StrideA, StrideB, StrideC\n");
exit(0);
}
@@ -219,7 +219,7 @@ int main(int argc, char* argv[])
"not support this GEMM problem");
}
- float ave_time = invoker.Run(argument, nrepeat);
+ float ave_time = invoker.Run(argument, StreamConfig{nullptr, time_kernel});
std::size_t flop = std::size_t(2) * M * N * K;
std::size_t num_btype =
diff --git a/example/15_grouped_gemm/grouped_gemm_xdl_fp16.cpp b/example/15_grouped_gemm/grouped_gemm_xdl_fp16.cpp
index 29ef01f2ef..f55db1d45c 100644
--- a/example/15_grouped_gemm/grouped_gemm_xdl_fp16.cpp
+++ b/example/15_grouped_gemm/grouped_gemm_xdl_fp16.cpp
@@ -60,21 +60,21 @@ using ReferenceGemmInstance = ck::tensor_operation::host::
int main(int argc, char* argv[])
{
- bool do_verification = 0;
- int init_method = 0;
- int nrepeat = 5;
+ bool do_verification = true;
+ int init_method = 1;
+ bool time_kernel = false;
if(argc == 4)
{
do_verification = std::stoi(argv[1]);
init_method = std::stoi(argv[2]);
- nrepeat = std::stoi(argv[3]);
+ time_kernel = std::stoi(argv[3]);
}
else
{
printf("arg1: verification (0=no, 1=yes)\n");
printf("arg2: initialization (0=no init, 1=integer value, 2=decimal value)\n");
- printf("arg3: run kernel # of times (>1)\n");
+ printf("arg3: time kernel (0=n0, 1=yes)\n");
exit(0);
}
@@ -202,7 +202,7 @@ int main(int argc, char* argv[])
"not support this GEMM problem");
}
- float ave_time = invoker.Run(argument, nrepeat);
+ float ave_time = invoker.Run(argument, StreamConfig{nullptr, time_kernel});
float tflops = static_cast(flop) / 1.E9 / ave_time;
diff --git a/example/16_gemm_reduce/gemm_reduce_xdl_fp16.cpp b/example/16_gemm_reduce/gemm_reduce_xdl_fp16.cpp
index 90064ae584..8fea54f635 100644
--- a/example/16_gemm_reduce/gemm_reduce_xdl_fp16.cpp
+++ b/example/16_gemm_reduce/gemm_reduce_xdl_fp16.cpp
@@ -58,9 +58,9 @@ using ReferenceGemmInstance = ck::tensor_operation::host::
int main(int argc, char* argv[])
{
- bool do_verification = 1;
+ bool do_verification = true;
int init_method = 1;
- int nrepeat = 5;
+ bool time_kernel = false;
// GEMM shape
ck::index_t M = 3840;
@@ -79,13 +79,13 @@ int main(int argc, char* argv[])
{
do_verification = std::stoi(argv[1]);
init_method = std::stoi(argv[2]);
- nrepeat = std::stoi(argv[3]);
+ time_kernel = std::stoi(argv[3]);
}
else if(argc == 10)
{
do_verification = std::stoi(argv[1]);
init_method = std::stoi(argv[2]);
- nrepeat = std::stoi(argv[3]);
+ time_kernel = std::stoi(argv[3]);
M = std::stoi(argv[4]);
N = std::stoi(argv[5]);
@@ -99,7 +99,7 @@ int main(int argc, char* argv[])
{
printf("arg1: verification (0=no, 1=yes)\n");
printf("arg2: initialization (0=no init, 1=integer value, 2=decimal value)\n");
- printf("arg3: run kernel # of times (>1)\n");
+ printf("arg3: time kernel (0=n0, 1=yes)\n");
printf("arg4 to 9: M (256x), N(128x), K(32x), StrideA, StrideB, StrideC\n");
exit(0);
}
@@ -192,30 +192,13 @@ int main(int argc, char* argv[])
"not support this GEMM problem");
}
- // warm up
- invoker.Run(argument);
+ // init DO, D1 to 0
+ d0_device_buf.SetZero();
+ d1_device_buf.SetZero();
- // timing
- float total_time = 0;
-
- for(int i = 0; i < nrepeat; ++i)
- {
- // init DO, D1 to 0
- d0_device_buf.SetZero();
- d1_device_buf.SetZero();
-
- KernelTimer timer;
-
- timer.Start();
-
- invoker.Run(argument);
-
- timer.End();
-
- total_time += timer.GetElapsedTime();
- }
-
- float ave_time = total_time / nrepeat;
+ // if time_kernel == true, kernel will run multiple times. This kernel use atomic-add so result
+ // will not be correct. need to set time_kernel = false for correctness test
+ float ave_time = invoker.Run(argument, StreamConfig{nullptr, time_kernel});
std::size_t flop = std::size_t(2) * M * N * K;
std::size_t num_btype =
diff --git a/example/17_convnd_bwd_data_xdl/convnd_bwd_data_xdl.cpp b/example/17_convnd_bwd_data_xdl/convnd_bwd_data_xdl.cpp
index 1b375ea339..a013f39827 100644
--- a/example/17_convnd_bwd_data_xdl/convnd_bwd_data_xdl.cpp
+++ b/example/17_convnd_bwd_data_xdl/convnd_bwd_data_xdl.cpp
@@ -87,7 +87,7 @@ void print_use_msg()
{
std::cout << "arg1: verification (0=no, 1=yes)\n"
<< "arg2: initialization (0=no init, 1=random value, 2= init to 1 )\n"
- << "arg3: run kernel # of times (>1)\n"
+ << "arg3: time kernel (0=n0, 1=yes)\n"
<< "arg4: N spatial dimensions (default 2)\n"
<< "Following arguments (depending on number of spatial dims):\n"
<< " N, K, C, \n"
@@ -165,9 +165,9 @@ DeviceConvBwdDataBasePtr get_conv_instance(int num_dim_spatial)
int main(int argc, char* argv[])
{
- bool do_verification = 0;
- int init_method = 0;
- int nrepeat = 5;
+ bool do_verification = true;
+ int init_method = 1;
+ bool time_kernel = false;
int num_dim_spatial = 2;
ck::utils::conv::ConvParams params;
@@ -177,13 +177,13 @@ int main(int argc, char* argv[])
{
do_verification = std::stoi(argv[1]);
init_method = std::stoi(argv[2]);
- nrepeat = std::stoi(argv[3]);
+ time_kernel = std::stoi(argv[3]);
}
else if(argc > 4)
{
do_verification = std::stoi(argv[1]);
init_method = std::stoi(argv[2]);
- nrepeat = std::stoi(argv[3]);
+ time_kernel = std::stoi(argv[3]);
num_dim_spatial = std::stoi(argv[4]);
// check args number
int conv_args = 3 + num_dim_spatial * 6;
@@ -284,7 +284,7 @@ int main(int argc, char* argv[])
"not support this Conv problem");
}
- float ave_time = invoker->Run(argument.get(), nrepeat);
+ float ave_time = invoker->Run(argument.get(), StreamConfig{nullptr, time_kernel});
std::size_t flop = ck::utils::conv::get_flops(
params.N_, params.C_, params.K_, params.filter_spatial_lengths_, output_spatial_lengths);
diff --git a/example/18_batched_gemm_reduce/batched_gemm_reduce_xdl_fp16.cpp b/example/18_batched_gemm_reduce/batched_gemm_reduce_xdl_fp16.cpp
index eb18655d1b..f620ee1b20 100644
--- a/example/18_batched_gemm_reduce/batched_gemm_reduce_xdl_fp16.cpp
+++ b/example/18_batched_gemm_reduce/batched_gemm_reduce_xdl_fp16.cpp
@@ -57,9 +57,9 @@ using ReferenceBatchedGemmInstance = ck::tensor_operation::host::
int main(int argc, char* argv[])
{
- bool do_verification = 1;
+ bool do_verification = true;
int init_method = 1;
- int nrepeat = 5;
+ bool time_kernel = false;
// GEMM shape
ck::index_t M = 3840;
@@ -80,13 +80,13 @@ int main(int argc, char* argv[])
{
do_verification = std::stoi(argv[1]);
init_method = std::stoi(argv[2]);
- nrepeat = std::stoi(argv[3]);
+ time_kernel = std::stoi(argv[3]);
}
else if(argc == 11)
{
do_verification = std::stoi(argv[1]);
init_method = std::stoi(argv[2]);
- nrepeat = std::stoi(argv[3]);
+ time_kernel = std::stoi(argv[3]);
M = std::stoi(argv[4]);
N = std::stoi(argv[5]);
@@ -102,7 +102,7 @@ int main(int argc, char* argv[])
{
printf("arg1: verification (0=no, 1=yes)\n");
printf("arg2: initialization (0=no init, 1=integer value, 2=decimal value)\n");
- printf("arg3: run kernel # of times (>1)\n");
+ printf("arg3: time kernel (0=n0, 1=yes)\n");
printf("arg4 to 10: M (256x), N(128x), K(32x), StrideA, StrideB, StrideC, BatchCount\n");
exit(0);
}
@@ -204,30 +204,13 @@ int main(int argc, char* argv[])
"not support this GEMM problem");
}
- // warm up
- invoker.Run(argument);
+ // init DO, D1 to 0
+ d0_device_buf.SetZero();
+ d1_device_buf.SetZero();
- // timing
- float total_time = 0;
-
- for(int i = 0; i < nrepeat; ++i)
- {
- // init DO, D1 to 0
- d0_device_buf.SetZero();
- d1_device_buf.SetZero();
-
- KernelTimer timer;
-
- timer.Start();
-
- invoker.Run(argument);
-
- timer.End();
-
- total_time += timer.GetElapsedTime();
- }
-
- float ave_time = total_time / nrepeat;
+ // if time_kernel == true, kernel will run multiple times. This kernel use atomic-add so result
+ // will not be correct. need to set time_kernel = false for correctness test
+ float ave_time = invoker.Run(argument, StreamConfig{nullptr, time_kernel});
std::size_t flop = std::size_t(2) * BatchCount * M * N * K;
std::size_t num_btype = sizeof(ADataType) * BatchCount * M * K +
diff --git a/include/ck/hip_version.hpp.in b/include/ck/hip_version.hpp.in
deleted file mode 100644
index 4290ef7e0d..0000000000
--- a/include/ck/hip_version.hpp.in
+++ /dev/null
@@ -1,28 +0,0 @@
-#pragma once
-
-// "_PACKAGE_" to avoid name contentions: the macros like
-// HIP_VERSION_MAJOR are defined in HIP_VERSION.h.
-// clang-format off
-#define CK_HIP_PACKAGE_VERSION_MAJOR @CK_HIP_VERSION_MAJOR@
-#define CK_HIP_PACKAGE_VERSION_MINOR @CK_HIP_VERSION_MINOR@
-#define CK_HIP_PACKAGE_VERSION_PATCH @CK_HIP_VERSION_PATCH@
-// clang-format on
-
-#ifndef CK_HIP_PACKAGE_VERSION_MAJOR
-#define CK_HIP_PACKAGE_VERSION_MAJOR 0
-#endif
-#ifndef CK_HIP_PACKAGE_VERSION_MINOR
-#define CK_HIP_PACKAGE_VERSION_MINOR 0
-#endif
-#ifndef CK_HIP_PACKAGE_VERSION_PATCH
-#define CK_HIP_PACKAGE_VERSION_PATCH 0
-#endif
-// 3 decimal digits for major and minor, 6 digits for patch number.
-// Max number is 999,999,999999 == 0xE8,D4A5,0FFF that fits into 64-bit math.
-#if CK_HIP_PACKAGE_VERSION_MAJOR > 999 || CK_HIP_PACKAGE_VERSION_MAJOR > 999 || \
- CK_HIP_PACKAGE_VERSION_PATCH > 999999
-#error "Too big HIP version number(s)"
-#endif
-#define CK_HIP_PACKAGE_VERSION_FLAT \
- ((CK_HIP_PACKAGE_VERSION_MAJOR * 1000ULL + CK_HIP_PACKAGE_VERSION_MINOR) * 1000000 + \
- CK_HIP_PACKAGE_VERSION_PATCH)
diff --git a/include/ck/options.hpp.in b/include/ck/options.hpp.in
new file mode 100644
index 0000000000..87ed6026a4
--- /dev/null
+++ b/include/ck/options.hpp.in
@@ -0,0 +1,3 @@
+#pragma once
+
+#cmakedefine01 CK_TIME_KERNEL
diff --git a/include/ck/stream_config.hpp b/include/ck/stream_config.hpp
new file mode 100644
index 0000000000..3e80b4c892
--- /dev/null
+++ b/include/ck/stream_config.hpp
@@ -0,0 +1,10 @@
+#pragma once
+
+#include
+#include
+
+struct StreamConfig
+{
+ hipStream_t stream_id_ = nullptr;
+ bool time_kernel_ = false;
+};
diff --git a/include/ck/tensor_operation/gpu/device/device_base.hpp b/include/ck/tensor_operation/gpu/device/device_base.hpp
index cf48695ad0..950cfc1d61 100644
--- a/include/ck/tensor_operation/gpu/device/device_base.hpp
+++ b/include/ck/tensor_operation/gpu/device/device_base.hpp
@@ -1,8 +1,9 @@
-#ifndef DEVICE_BASE_HPP
-#define DEVICE_BASE_HPP
+#pragma once
#include
+#include "stream_config.hpp"
+
namespace ck {
namespace tensor_operation {
namespace device {
@@ -22,7 +23,10 @@ struct BaseInvoker
BaseInvoker(const BaseInvoker&) = default;
BaseInvoker& operator=(const BaseInvoker&) = default;
- virtual float Run(const BaseArgument*, int = 1) = 0;
+ virtual float Run(const BaseArgument*, const StreamConfig& = StreamConfig{})
+ {
+ return float{0};
+ }
virtual ~BaseInvoker() {}
};
@@ -33,8 +37,8 @@ struct BaseOperator
BaseOperator(const BaseOperator&) = default;
BaseOperator& operator=(const BaseOperator&) = default;
- virtual bool IsSupportedArgument(const BaseArgument*) = 0;
- virtual std::string GetTypeString() const = 0;
+ virtual bool IsSupportedArgument(const BaseArgument*) { return false; }
+ virtual std::string GetTypeString() const { return ""; }
virtual ~BaseOperator() {}
};
@@ -42,4 +46,3 @@ struct BaseOperator
} // namespace device
} // namespace tensor_operation
} // namespace ck
-#endif
diff --git a/include/ck/tensor_operation/gpu/device/device_batched_gemm_reduce_xdl_cshuffle.hpp b/include/ck/tensor_operation/gpu/device/device_batched_gemm_reduce_xdl_cshuffle.hpp
index e1d354b344..a6408007ed 100644
--- a/include/ck/tensor_operation/gpu/device/device_batched_gemm_reduce_xdl_cshuffle.hpp
+++ b/include/ck/tensor_operation/gpu/device/device_batched_gemm_reduce_xdl_cshuffle.hpp
@@ -693,7 +693,7 @@ struct DeviceBatchedGemmReduce_Xdl_CShuffle : public DeviceGemmReduce,
true>;
- launch_kernel(kernel,
- dim3(grid_size),
- dim3(BlockSize),
- 0,
- arg.p_a_grid_,
- arg.p_b_grid_,
- arg.p_c_grid_,
- arg.p_d0_grid_,
- arg.p_d1_grid_,
- arg.BatchCount_,
- arg.a_element_op_,
- arg.b_element_op_,
- arg.c_element_op_,
- arg.d1_element_op_,
- arg.a_grid_desc_ak0_m_ak1_,
- arg.b_grid_desc_bk0_n_bk1_,
- arg.c_grid_desc_mblock_mperblock_nblock_nperblock_,
- arg.d_grid_desc_mblock_mperblock_,
- arg.compute_base_ptr_of_batch_,
- arg.block_2_ctile_map_);
+ elapsed_time =
+ launch_and_time_kernel(stream_config,
+ kernel,
+ dim3(grid_size),
+ dim3(BlockSize),
+ 0,
+ arg.p_a_grid_,
+ arg.p_b_grid_,
+ arg.p_c_grid_,
+ arg.p_d0_grid_,
+ arg.p_d1_grid_,
+ arg.BatchCount_,
+ arg.a_element_op_,
+ arg.b_element_op_,
+ arg.c_element_op_,
+ arg.d1_element_op_,
+ arg.a_grid_desc_ak0_m_ak1_,
+ arg.b_grid_desc_bk0_n_bk1_,
+ arg.c_grid_desc_mblock_mperblock_nblock_nperblock_,
+ arg.d_grid_desc_mblock_mperblock_,
+ arg.compute_base_ptr_of_batch_,
+ arg.block_2_ctile_map_);
}
else
{
@@ -788,35 +791,38 @@ struct DeviceBatchedGemmReduce_Xdl_CShuffle : public DeviceGemmReduce,
false>;
- launch_kernel(kernel,
- dim3(grid_size),
- dim3(BlockSize),
- 0,
- arg.p_a_grid_,
- arg.p_b_grid_,
- arg.p_c_grid_,
- arg.p_d0_grid_,
- arg.p_d1_grid_,
- arg.BatchCount_,
- arg.a_element_op_,
- arg.b_element_op_,
- arg.c_element_op_,
- arg.d1_element_op_,
- arg.a_grid_desc_ak0_m_ak1_,
- arg.b_grid_desc_bk0_n_bk1_,
- arg.c_grid_desc_mblock_mperblock_nblock_nperblock_,
- arg.d_grid_desc_mblock_mperblock_,
- arg.compute_base_ptr_of_batch_,
- arg.block_2_ctile_map_);
+ elapsed_time =
+ launch_and_time_kernel(stream_config,
+ kernel,
+ dim3(grid_size),
+ dim3(BlockSize),
+ 0,
+ arg.p_a_grid_,
+ arg.p_b_grid_,
+ arg.p_c_grid_,
+ arg.p_d0_grid_,
+ arg.p_d1_grid_,
+ arg.BatchCount_,
+ arg.a_element_op_,
+ arg.b_element_op_,
+ arg.c_element_op_,
+ arg.d1_element_op_,
+ arg.a_grid_desc_ak0_m_ak1_,
+ arg.b_grid_desc_bk0_n_bk1_,
+ arg.c_grid_desc_mblock_mperblock_nblock_nperblock_,
+ arg.d_grid_desc_mblock_mperblock_,
+ arg.compute_base_ptr_of_batch_,
+ arg.block_2_ctile_map_);
}
- return 0;
+ return elapsed_time;
}
// polymorphic
- float Run(const BaseArgument* p_arg, int nrepeat = 1) override
+ float Run(const BaseArgument* p_arg,
+ const StreamConfig& stream_config = StreamConfig{}) override
{
- return Run(*dynamic_cast(p_arg), nrepeat);
+ return Run(*dynamic_cast(p_arg), stream_config);
}
};
diff --git a/include/ck/tensor_operation/gpu/device/device_batched_gemm_xdl.hpp b/include/ck/tensor_operation/gpu/device/device_batched_gemm_xdl.hpp
index 88974a5221..ea7704951e 100644
--- a/include/ck/tensor_operation/gpu/device/device_batched_gemm_xdl.hpp
+++ b/include/ck/tensor_operation/gpu/device/device_batched_gemm_xdl.hpp
@@ -428,7 +428,7 @@ struct DeviceBatchedGemmXdl
{
using Argument = DeviceBatchedGemmXdl::Argument;
- float Run(const Argument& arg, int nrepeat = 1)
+ float Run(const Argument& arg, const StreamConfig& stream_config = StreamConfig{})
{
{
std::cout << "arg.a_grid_desc_k0_m_k1_{" << arg.a_grid_desc_k0_m_k1_.GetLength(I0)
@@ -477,8 +477,8 @@ struct DeviceBatchedGemmXdl
remove_reference_t,
true>;
- ave_time = launch_and_time_kernel(kernel,
- nrepeat,
+ ave_time = launch_and_time_kernel(stream_config,
+ kernel,
dim3(grid_size),
dim3(BlockSize),
0,
@@ -511,8 +511,8 @@ struct DeviceBatchedGemmXdl
remove_reference_t,
false>;
- ave_time = launch_and_time_kernel(kernel,
- nrepeat,
+ ave_time = launch_and_time_kernel(stream_config,
+ kernel,
dim3(grid_size),
dim3(BlockSize),
0,
@@ -534,9 +534,10 @@ struct DeviceBatchedGemmXdl
}
// polymorphic
- float Run(const BaseArgument* p_arg, int nrepeat = 1) override
+ float Run(const BaseArgument* p_arg,
+ const StreamConfig& stream_config = StreamConfig{}) override
{
- return Run(*dynamic_cast(p_arg), nrepeat);
+ return Run(*dynamic_cast(p_arg), stream_config);
}
};
diff --git a/include/ck/tensor_operation/gpu/device/device_conv2d_backward_weight_xdl_c_shuffle_nhwc_kyxc_nhwk.hpp b/include/ck/tensor_operation/gpu/device/device_conv2d_backward_weight_xdl_c_shuffle_nhwc_kyxc_nhwk.hpp
index 466e6ad89f..c36227083c 100644
--- a/include/ck/tensor_operation/gpu/device/device_conv2d_backward_weight_xdl_c_shuffle_nhwc_kyxc_nhwk.hpp
+++ b/include/ck/tensor_operation/gpu/device/device_conv2d_backward_weight_xdl_c_shuffle_nhwc_kyxc_nhwk.hpp
@@ -415,9 +415,10 @@ struct DeviceConv2dBwdWeightXdl_C_Shuffle_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_
<< arg.c_grid_desc_m_n_.GetLength(I1) << "}" << std::endl;
}
- float Run(const Argument& arg, int nrepeat = 1)
+ float Run(const Argument& arg, const StreamConfig& stream_config = StreamConfig{})
{
ShowInfo(arg);
+
if(!GridwiseGemm::CheckValidity(arg.a_grid_desc_kbatch_k0_m_k1_,
arg.b_grid_desc_kbatch_k0_n_k1_,
arg.c_grid_desc_m_n_,
@@ -437,49 +438,27 @@ struct DeviceConv2dBwdWeightXdl_C_Shuffle_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_
float ave_time = 0;
const auto Run = [&](const auto& kernel) {
- if(nrepeat > 0)
- {
- ave_time =
- launch_and_time_kernel(kernel,
- nrepeat,
- dim3(grid_size),
- dim3(BlockSize),
- 0,
- arg.p_a_grid_,
- arg.p_b_grid_,
- arg.p_c_grid_,
- arg.a_grid_desc_kbatch_k0_m_k1_,
- arg.b_grid_desc_kbatch_k0_n_k1_,
- arg.c_grid_desc_mblock_mperblock_nblock_nperblock_,
- arg.a_element_op_,
- arg.b_element_op_,
- arg.c_element_op_,
- arg.block_2_ctile_map_);
- }
+ hipGetErrorString(hipMemset(
+ arg.p_c_grid_,
+ 0,
+ arg.c_grid_desc_mblock_mperblock_nblock_nperblock_.GetElementSpaceSize() *
+ sizeof(CDataType)));
- if(kbatch > 1 || nrepeat <= 0)
- {
- hipGetErrorString(hipMemset(
- arg.p_c_grid_,
- 0,
- arg.c_grid_desc_mblock_mperblock_nblock_nperblock_.GetElementSpaceSize() *
- sizeof(CDataType)));
-
- launch_kernel(kernel,
- dim3(grid_size),
- dim3(BlockSize),
- 0,
- arg.p_a_grid_,
- arg.p_b_grid_,
- arg.p_c_grid_,
- arg.a_grid_desc_kbatch_k0_m_k1_,
- arg.b_grid_desc_kbatch_k0_n_k1_,
- arg.c_grid_desc_mblock_mperblock_nblock_nperblock_,
- arg.a_element_op_,
- arg.b_element_op_,
- arg.c_element_op_,
- arg.block_2_ctile_map_);
- }
+ launch_and_time_kernel(stream_config,
+ kernel,
+ dim3(grid_size),
+ dim3(BlockSize),
+ 0,
+ arg.p_a_grid_,
+ arg.p_b_grid_,
+ arg.p_c_grid_,
+ arg.a_grid_desc_kbatch_k0_m_k1_,
+ arg.b_grid_desc_kbatch_k0_n_k1_,
+ arg.c_grid_desc_mblock_mperblock_nblock_nperblock_,
+ arg.a_element_op_,
+ arg.b_element_op_,
+ arg.c_element_op_,
+ arg.block_2_ctile_map_);
};
if(has_main_k0_block_loop)
@@ -560,9 +539,10 @@ struct DeviceConv2dBwdWeightXdl_C_Shuffle_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_
return ave_time;
}
- float Run(const BaseArgument* p_arg, int nrepeat = 1) override
+ float Run(const BaseArgument* p_arg,
+ const StreamConfig& stream_config = StreamConfig{}) override
{
- return Run(*dynamic_cast(p_arg), nrepeat);
+ return Run(*dynamic_cast(p_arg), stream_config);
}
};
diff --git a/include/ck/tensor_operation/gpu/device/device_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk.hpp b/include/ck/tensor_operation/gpu/device/device_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk.hpp
index fad4ec1ffa..def6af74ac 100644
--- a/include/ck/tensor_operation/gpu/device/device_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk.hpp
+++ b/include/ck/tensor_operation/gpu/device/device_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk.hpp
@@ -531,7 +531,7 @@ struct DeviceConv2dBwdDataXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K
{
using Argument = DeviceOp::Argument;
- float Run(const Argument& arg, int nrepeat = 1)
+ float Run(const Argument& arg, const StreamConfig& stream_config = StreamConfig{})
{
float ave_time = 0;
for(size_t i = 0; i < arg.a_grid_desc_k0_m_k1_container_.size(); i++)
@@ -602,8 +602,8 @@ struct DeviceConv2dBwdDataXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K
true>;
ave_time += launch_and_time_kernel(
+ stream_config,
kernel,
- nrepeat,
dim3(grid_size),
dim3(BlockSize),
0,
@@ -635,8 +635,8 @@ struct DeviceConv2dBwdDataXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K
false>;
ave_time += launch_and_time_kernel(
+ stream_config,
kernel,
- nrepeat,
dim3(grid_size),
dim3(BlockSize),
0,
@@ -655,9 +655,10 @@ struct DeviceConv2dBwdDataXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K
return ave_time;
}
- float Run(const BaseArgument* p_arg, int nrepeat = 1) override
+ float Run(const BaseArgument* p_arg,
+ const StreamConfig& stream_config = StreamConfig{}) override
{
- return Run(*dynamic_cast(p_arg), nrepeat);
+ return Run(*dynamic_cast(p_arg), stream_config);
}
};
diff --git a/include/ck/tensor_operation/gpu/device/device_conv2d_fwd_xdl_c_shuffle_bias_activation_add_nhwc_kyxc_nhwk.hpp b/include/ck/tensor_operation/gpu/device/device_conv2d_fwd_xdl_c_shuffle_bias_activation_add_nhwc_kyxc_nhwk.hpp
index 6648929cd5..fd95c184ca 100644
--- a/include/ck/tensor_operation/gpu/device/device_conv2d_fwd_xdl_c_shuffle_bias_activation_add_nhwc_kyxc_nhwk.hpp
+++ b/include/ck/tensor_operation/gpu/device/device_conv2d_fwd_xdl_c_shuffle_bias_activation_add_nhwc_kyxc_nhwk.hpp
@@ -642,7 +642,7 @@ struct
{
using Argument = DeviceOp::Argument;
- float Run(const Argument& arg, int nrepeat = 1)
+ float Run(const Argument& arg, const StreamConfig& stream_config = StreamConfig{})
{
#if 0
{
@@ -727,8 +727,8 @@ struct
true>;
ave_time = launch_and_time_kernel(
+ stream_config,
kernel,
- nrepeat,
dim3(grid_size),
dim3(BlockSize),
0,
@@ -771,8 +771,8 @@ struct
false>;
ave_time = launch_and_time_kernel(
+ stream_config,
kernel,
- nrepeat,
dim3(grid_size),
dim3(BlockSize),
0,
@@ -795,9 +795,10 @@ struct
return ave_time;
}
- float Run(const BaseArgument* p_arg, int nrepeat = 1) override
+ float Run(const BaseArgument* p_arg,
+ const StreamConfig& stream_config = StreamConfig{}) override
{
- return Run(*dynamic_cast(p_arg), nrepeat);
+ return Run(*dynamic_cast(p_arg), stream_config);
}
};
diff --git a/include/ck/tensor_operation/gpu/device/device_conv2d_fwd_xdl_c_shuffle_bias_activation_nhwc_kyxc_nhwk.hpp b/include/ck/tensor_operation/gpu/device/device_conv2d_fwd_xdl_c_shuffle_bias_activation_nhwc_kyxc_nhwk.hpp
index fd0941420c..61c91c0b76 100644
--- a/include/ck/tensor_operation/gpu/device/device_conv2d_fwd_xdl_c_shuffle_bias_activation_nhwc_kyxc_nhwk.hpp
+++ b/include/ck/tensor_operation/gpu/device/device_conv2d_fwd_xdl_c_shuffle_bias_activation_nhwc_kyxc_nhwk.hpp
@@ -605,7 +605,7 @@ struct DeviceConv2dFwdXdl_C_Shuffle_Bias_Activation_Input_N_Hi_Wi_C_Weight_K_Y_X
{
using Argument = DeviceOp::Argument;
- float Run(const Argument& arg, int nrepeat = 1)
+ float Run(const Argument& arg, const StreamConfig& stream_config = StreamConfig{})
{
#if 0
{
@@ -684,8 +684,8 @@ struct DeviceConv2dFwdXdl_C_Shuffle_Bias_Activation_Input_N_Hi_Wi_C_Weight_K_Y_X
true>;
ave_time = launch_and_time_kernel(
+ stream_config,
kernel,
- nrepeat,
dim3(grid_size),
dim3(BlockSize),
0,
@@ -723,8 +723,8 @@ struct DeviceConv2dFwdXdl_C_Shuffle_Bias_Activation_Input_N_Hi_Wi_C_Weight_K_Y_X
false>;
ave_time = launch_and_time_kernel(
+ stream_config,
kernel,
- nrepeat,
dim3(grid_size),
dim3(BlockSize),
0,
@@ -745,9 +745,10 @@ struct DeviceConv2dFwdXdl_C_Shuffle_Bias_Activation_Input_N_Hi_Wi_C_Weight_K_Y_X
return ave_time;
}
- float Run(const BaseArgument* p_arg, int nrepeat = 1) override
+ float Run(const BaseArgument* p_arg,
+ const StreamConfig& stream_config = StreamConfig{}) override
{
- return Run(*dynamic_cast(p_arg), nrepeat);
+ return Run(*dynamic_cast(p_arg), stream_config);
}
};
diff --git a/include/ck/tensor_operation/gpu/device/device_conv2d_fwd_xdl_c_shuffle_nhwc_kyxc_nhwk.hpp b/include/ck/tensor_operation/gpu/device/device_conv2d_fwd_xdl_c_shuffle_nhwc_kyxc_nhwk.hpp
index b508606a75..f4cddc1946 100644
--- a/include/ck/tensor_operation/gpu/device/device_conv2d_fwd_xdl_c_shuffle_nhwc_kyxc_nhwk.hpp
+++ b/include/ck/tensor_operation/gpu/device/device_conv2d_fwd_xdl_c_shuffle_nhwc_kyxc_nhwk.hpp
@@ -568,7 +568,7 @@ struct DeviceConv2dFwdXdl_C_Shuffle_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_W
{
using Argument = DeviceOp::Argument;
- float Run(const Argument& arg, int nrepeat = 1)
+ float Run(const Argument& arg, const StreamConfig& stream_config = StreamConfig{})
{
#if 0
{
@@ -663,8 +663,8 @@ struct DeviceConv2dFwdXdl_C_Shuffle_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_W
true>;
ave_time = launch_and_time_kernel(
+ stream_config,
kernel,
- nrepeat,
dim3(grid_size),
dim3(BlockSize),
0,
@@ -697,8 +697,8 @@ struct DeviceConv2dFwdXdl_C_Shuffle_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_W
false>;
ave_time = launch_and_time_kernel(
+ stream_config,
kernel,
- nrepeat,
dim3(grid_size),
dim3(BlockSize),
0,
@@ -717,9 +717,10 @@ struct DeviceConv2dFwdXdl_C_Shuffle_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_W
return ave_time;
}
- float Run(const BaseArgument* p_arg, int nrepeat = 1) override
+ float Run(const BaseArgument* p_arg,
+ const StreamConfig& stream_config = StreamConfig{}) override
{
- return Run(*dynamic_cast(p_arg), nrepeat);
+ return Run(*dynamic_cast(p_arg), stream_config);
}
};
diff --git a/include/ck/tensor_operation/gpu/device/device_conv2d_fwd_xdl_nhwc_kyxc_nhwk.hpp b/include/ck/tensor_operation/gpu/device/device_conv2d_fwd_xdl_nhwc_kyxc_nhwk.hpp
index 3574f7667e..aa9229f7cb 100644
--- a/include/ck/tensor_operation/gpu/device/device_conv2d_fwd_xdl_nhwc_kyxc_nhwk.hpp
+++ b/include/ck/tensor_operation/gpu/device/device_conv2d_fwd_xdl_nhwc_kyxc_nhwk.hpp
@@ -450,7 +450,7 @@ struct DeviceConv2dFwdXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K
{
using Argument = DeviceOp::Argument;
- float Run(const Argument& arg, int nrepeat = 1)
+ float Run(const Argument& arg, const StreamConfig& stream_config = StreamConfig{})
{
#if 0
{
@@ -498,8 +498,8 @@ struct DeviceConv2dFwdXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K
remove_reference_t,
true>;
- ave_time = launch_and_time_kernel(kernel,
- nrepeat,
+ ave_time = launch_and_time_kernel(stream_config,
+ kernel,
dim3(grid_size),
dim3(BlockSize),
0,
@@ -529,8 +529,8 @@ struct DeviceConv2dFwdXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K
remove_reference_t,
false>;
- ave_time = launch_and_time_kernel(kernel,
- nrepeat,
+ ave_time = launch_and_time_kernel(stream_config,
+ kernel,
dim3(grid_size),
dim3(BlockSize),
0,
@@ -549,9 +549,10 @@ struct DeviceConv2dFwdXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K
return ave_time;
}
- float Run(const BaseArgument* p_arg, int nrepeat = 1) override
+ float Run(const BaseArgument* p_arg,
+ const StreamConfig& stream_config = StreamConfig{}) override
{
- return Run(*dynamic_cast(p_arg), nrepeat);
+ return Run(*dynamic_cast(p_arg), stream_config);
}
};
diff --git a/include/ck/tensor_operation/gpu/device/device_conv3d_fwd_naive_ndhwc_kzyxc_ndhwk.hpp b/include/ck/tensor_operation/gpu/device/device_conv3d_fwd_naive_ndhwc_kzyxc_ndhwk.hpp
index 1bfe0bb256..b1eea0b33f 100644
--- a/include/ck/tensor_operation/gpu/device/device_conv3d_fwd_naive_ndhwc_kzyxc_ndhwk.hpp
+++ b/include/ck/tensor_operation/gpu/device/device_conv3d_fwd_naive_ndhwc_kzyxc_ndhwk.hpp
@@ -92,7 +92,7 @@ struct DeviceConv3dFwdNaive_Input_N_Di_Hi_Wi_C_Weight_K_Z_Y_X_C_Output_N_Do_Ho_W
{
using Argument = DeviceOp::Argument;
- float Run(const Argument& arg, int nrepeat = 1)
+ float Run(const Argument& arg, const StreamConfig& stream_config = StreamConfig{})
{
const auto naive_conv3d_fwd =
ref::naive_conv_fwd_ndhwc_kzyxc_ndhwk;
- float ave_time = launch_and_time_kernel(naive_conv3d_fwd,
- nrepeat,
+ float ave_time = launch_and_time_kernel(stream_config,
+ naive_conv3d_fwd,
dim3(256),
dim3(256),
0,
@@ -137,9 +137,10 @@ struct DeviceConv3dFwdNaive_Input_N_Di_Hi_Wi_C_Weight_K_Z_Y_X_C_Output_N_Do_Ho_W
}
// polymorphic
- float Run(const BaseArgument* p_arg, int nrepeat = 1) override
+ float Run(const BaseArgument* p_arg,
+ const StreamConfig& stream_config = StreamConfig{}) override
{
- return Run(*dynamic_cast(p_arg), nrepeat);
+ return Run(*dynamic_cast(p_arg), stream_config);
}
};
diff --git a/include/ck/tensor_operation/gpu/device/device_conv3d_fwd_xdl_ndhwc_kzyxc_ndhwk.hpp b/include/ck/tensor_operation/gpu/device/device_conv3d_fwd_xdl_ndhwc_kzyxc_ndhwk.hpp
index ff30a6880d..0f98ba054d 100644
--- a/include/ck/tensor_operation/gpu/device/device_conv3d_fwd_xdl_ndhwc_kzyxc_ndhwk.hpp
+++ b/include/ck/tensor_operation/gpu/device/device_conv3d_fwd_xdl_ndhwc_kzyxc_ndhwk.hpp
@@ -438,7 +438,7 @@ struct DeviceConv3dFwdXdl_Input_N_Di_Hi_Wi_C_Weight_K_Z_Y_X_C_Output_N_Do_Ho_Wo_
{
using Argument = DeviceOp::Argument;
- float Run(const Argument& arg, int nrepeat = 1)
+ float Run(const Argument& arg, const StreamConfig& stream_config = StreamConfig{})
{
{
std::cout << "num_batches_of_GEMM = " << arg.num_subbatches_ << std::endl;
@@ -487,8 +487,8 @@ struct DeviceConv3dFwdXdl_Input_N_Di_Hi_Wi_C_Weight_K_Z_Y_X_C_Output_N_Do_Ho_Wo_
OutElementwiseOperation,
remove_reference_t,
true>;
- ave_time = launch_and_time_kernel(kernel,
- nrepeat,
+ ave_time = launch_and_time_kernel(stream_config,
+ kernel,
dim3(grid_size),
dim3(BlockSize),
0,
@@ -522,8 +522,8 @@ struct DeviceConv3dFwdXdl_Input_N_Di_Hi_Wi_C_Weight_K_Z_Y_X_C_Output_N_Do_Ho_Wo_
remove_reference_t,
false>;
- ave_time = launch_and_time_kernel(kernel,
- nrepeat,
+ ave_time = launch_and_time_kernel(stream_config,
+ kernel,
dim3(grid_size),
dim3(BlockSize),
0,
@@ -547,9 +547,10 @@ struct DeviceConv3dFwdXdl_Input_N_Di_Hi_Wi_C_Weight_K_Z_Y_X_C_Output_N_Do_Ho_Wo_
}
// polymorphic
- float Run(const BaseArgument* p_arg, int nrepeat = 1) override
+ float Run(const BaseArgument* p_arg,
+ const StreamConfig& stream_config = StreamConfig{}) override
{
- return Run(*dynamic_cast(p_arg), nrepeat);
+ return Run(*dynamic_cast(p_arg), stream_config);
}
};
diff --git a/include/ck/tensor_operation/gpu/device/device_convnd_bwd_data_xdl_ndhwc_kzyxc_ndhwk.hpp b/include/ck/tensor_operation/gpu/device/device_convnd_bwd_data_xdl_ndhwc_kzyxc_ndhwk.hpp
index 5dca8f9629..209b3c866e 100644
--- a/include/ck/tensor_operation/gpu/device/device_convnd_bwd_data_xdl_ndhwc_kzyxc_ndhwk.hpp
+++ b/include/ck/tensor_operation/gpu/device/device_convnd_bwd_data_xdl_ndhwc_kzyxc_ndhwk.hpp
@@ -1241,7 +1241,7 @@ struct DeviceConvndBwdDataXdl_Input_N_Di_Hi_Wi_C_Weight_K_Z_Y_X_C_Output_N_Do_Ho
{
using Argument = DeviceOp::Argument;
- float Run(const Argument& arg, int nrepeat = 1)
+ float Run(const Argument& arg, const StreamConfig& stream_config = StreamConfig{})
{
float ave_time = 0;
for(size_t i = 0; i < arg.a_grid_desc_k0_m_k1_container_.size(); i++)
@@ -1316,8 +1316,8 @@ struct DeviceConvndBwdDataXdl_Input_N_Di_Hi_Wi_C_Weight_K_Z_Y_X_C_Output_N_Do_Ho
true>;
ave_time += launch_and_time_kernel(
+ stream_config,
kernel,
- nrepeat,
dim3(grid_size),
dim3(BlockSize),
0,
@@ -1349,8 +1349,8 @@ struct DeviceConvndBwdDataXdl_Input_N_Di_Hi_Wi_C_Weight_K_Z_Y_X_C_Output_N_Do_Ho
false>;
ave_time += launch_and_time_kernel(
+ stream_config,
kernel,
- nrepeat,
dim3(grid_size),
dim3(BlockSize),
0,
@@ -1369,9 +1369,10 @@ struct DeviceConvndBwdDataXdl_Input_N_Di_Hi_Wi_C_Weight_K_Z_Y_X_C_Output_N_Do_Ho
return ave_time;
}
- float Run(const BaseArgument* p_arg, int nrepeat = 1) override
+ float Run(const BaseArgument* p_arg,
+ const StreamConfig& stream_config = StreamConfig{}) override
{
- return Run(*dynamic_cast(p_arg), nrepeat);
+ return Run(*dynamic_cast(p_arg), stream_config);
}
};
diff --git a/include/ck/tensor_operation/gpu/device/device_convnd_fwd_xdl_nhwc_kyxc_nhwk.hpp b/include/ck/tensor_operation/gpu/device/device_convnd_fwd_xdl_nhwc_kyxc_nhwk.hpp
index 7365f9a3e2..4251052a99 100644
--- a/include/ck/tensor_operation/gpu/device/device_convnd_fwd_xdl_nhwc_kyxc_nhwk.hpp
+++ b/include/ck/tensor_operation/gpu/device/device_convnd_fwd_xdl_nhwc_kyxc_nhwk.hpp
@@ -747,7 +747,7 @@ struct DeviceConvNDFwdXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K
{
using Argument = DeviceOp::Argument;
- float Run(const Argument& arg, int nrepeat = 1)
+ float Run(const Argument& arg, const StreamConfig& stream_config = StreamConfig{})
{
#if 0
{
@@ -795,8 +795,8 @@ struct DeviceConvNDFwdXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K
remove_reference_t,
true>;
- ave_time = launch_and_time_kernel(kernel,
- nrepeat,
+ ave_time = launch_and_time_kernel(stream_config,
+ kernel,
dim3(grid_size),
dim3(BlockSize),
0,
@@ -826,8 +826,8 @@ struct DeviceConvNDFwdXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K
remove_reference_t,
false>;
- ave_time = launch_and_time_kernel(kernel,
- nrepeat,
+ ave_time = launch_and_time_kernel(stream_config,
+ kernel,
dim3(grid_size),
dim3(BlockSize),
0,
@@ -846,9 +846,10 @@ struct DeviceConvNDFwdXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K
return ave_time;
}
- float Run(const BaseArgument* p_arg, int nrepeat = 1) override
+ float Run(const BaseArgument* p_arg,
+ const StreamConfig& stream_config = StreamConfig{}) override
{
- return Run(*dynamic_cast(p_arg), nrepeat);
+ return Run(*dynamic_cast(p_arg), stream_config);
}
};
diff --git a/include/ck/tensor_operation/gpu/device/device_gemm_reduce_xdl_cshuffle.hpp b/include/ck/tensor_operation/gpu/device/device_gemm_reduce_xdl_cshuffle.hpp
index daa309888f..69c29b72d3 100644
--- a/include/ck/tensor_operation/gpu/device/device_gemm_reduce_xdl_cshuffle.hpp
+++ b/include/ck/tensor_operation/gpu/device/device_gemm_reduce_xdl_cshuffle.hpp
@@ -503,7 +503,7 @@ struct DeviceGemmReduce_Xdl_CShuffle : public DeviceGemmReduce;
- launch_kernel(kernel,
- dim3(grid_size),
- dim3(BlockSize),
- 0,
- arg.p_a_grid_,
- arg.p_b_grid_,
- arg.p_c_grid_,
- arg.p_d0_grid_,
- arg.p_d1_grid_,
- arg.a_element_op_,
- arg.b_element_op_,
- arg.c_element_op_,
- arg.d1_element_op_,
- arg.a_grid_desc_ak0_m_ak1_,
- arg.b_grid_desc_bk0_n_bk1_,
- arg.c_grid_desc_mblock_mperblock_nblock_nperblock_,
- arg.d_grid_desc_mblock_mperblock_,
- arg.block_2_ctile_map_);
+ elapsed_time =
+ launch_and_time_kernel(stream_config,
+ kernel,
+ dim3(grid_size),
+ dim3(BlockSize),
+ 0,
+ arg.p_a_grid_,
+ arg.p_b_grid_,
+ arg.p_c_grid_,
+ arg.p_d0_grid_,
+ arg.p_d1_grid_,
+ arg.a_element_op_,
+ arg.b_element_op_,
+ arg.c_element_op_,
+ arg.d1_element_op_,
+ arg.a_grid_desc_ak0_m_ak1_,
+ arg.b_grid_desc_bk0_n_bk1_,
+ arg.c_grid_desc_mblock_mperblock_nblock_nperblock_,
+ arg.d_grid_desc_mblock_mperblock_,
+ arg.block_2_ctile_map_);
}
else
{
@@ -591,33 +594,36 @@ struct DeviceGemmReduce_Xdl_CShuffle : public DeviceGemmReduce;
- launch_kernel(kernel,
- dim3(grid_size),
- dim3(BlockSize),
- 0,
- arg.p_a_grid_,
- arg.p_b_grid_,
- arg.p_c_grid_,
- arg.p_d0_grid_,
- arg.p_d1_grid_,
- arg.a_element_op_,
- arg.b_element_op_,
- arg.c_element_op_,
- arg.d1_element_op_,
- arg.a_grid_desc_ak0_m_ak1_,
- arg.b_grid_desc_bk0_n_bk1_,
- arg.c_grid_desc_mblock_mperblock_nblock_nperblock_,
- arg.d_grid_desc_mblock_mperblock_,
- arg.block_2_ctile_map_);
+ elapsed_time =
+ launch_and_time_kernel(stream_config,
+ kernel,
+ dim3(grid_size),
+ dim3(BlockSize),
+ 0,
+ arg.p_a_grid_,
+ arg.p_b_grid_,
+ arg.p_c_grid_,
+ arg.p_d0_grid_,
+ arg.p_d1_grid_,
+ arg.a_element_op_,
+ arg.b_element_op_,
+ arg.c_element_op_,
+ arg.d1_element_op_,
+ arg.a_grid_desc_ak0_m_ak1_,
+ arg.b_grid_desc_bk0_n_bk1_,
+ arg.c_grid_desc_mblock_mperblock_nblock_nperblock_,
+ arg.d_grid_desc_mblock_mperblock_,
+ arg.block_2_ctile_map_);
}
- return 0;
+ return elapsed_time;
}
// polymorphic
- float Run(const BaseArgument* p_arg, int nrepeat = 1) override
+ float Run(const BaseArgument* p_arg,
+ const StreamConfig& stream_config = StreamConfig{}) override
{
- return Run(*dynamic_cast(p_arg), nrepeat);
+ return Run(*dynamic_cast(p_arg), stream_config);
}
};
diff --git a/include/ck/tensor_operation/gpu/device/device_gemm_xdl.hpp b/include/ck/tensor_operation/gpu/device/device_gemm_xdl.hpp
index 47997cd802..2bb7f6e78a 100644
--- a/include/ck/tensor_operation/gpu/device/device_gemm_xdl.hpp
+++ b/include/ck/tensor_operation/gpu/device/device_gemm_xdl.hpp
@@ -290,7 +290,7 @@ struct DeviceGemmXdl
{
using Argument = DeviceGemmXdl::Argument;
- float Run(const Argument& arg, int nrepeat = 1)
+ float Run(const Argument& arg, const StreamConfig& stream_config = StreamConfig{})
{
#if 0
{
@@ -339,8 +339,8 @@ struct DeviceGemmXdl
remove_reference_t,
true>;
- ave_time = launch_and_time_kernel(kernel,
- nrepeat,
+ ave_time = launch_and_time_kernel(stream_config,
+ kernel,
dim3(grid_size),
dim3(BlockSize),
0,
@@ -370,8 +370,8 @@ struct DeviceGemmXdl
remove_reference_t,
false>;
- ave_time = launch_and_time_kernel(kernel,
- nrepeat,
+ ave_time = launch_and_time_kernel(stream_config,
+ kernel,
dim3(grid_size),
dim3(BlockSize),
0,
@@ -391,9 +391,10 @@ struct DeviceGemmXdl
}
// polymorphic
- float Run(const BaseArgument* p_arg, int nrepeat = 1) override
+ float Run(const BaseArgument* p_arg,
+ const StreamConfig& stream_config = StreamConfig{}) override
{
- return Run(*dynamic_cast(p_arg), nrepeat);
+ return Run(*dynamic_cast(p_arg), stream_config);
}
};
diff --git a/include/ck/tensor_operation/gpu/device/device_gemm_xdl_c_shuffle_bias_2d.hpp b/include/ck/tensor_operation/gpu/device/device_gemm_xdl_c_shuffle_bias_2d.hpp
index 4010965312..315f39d9bf 100644
--- a/include/ck/tensor_operation/gpu/device/device_gemm_xdl_c_shuffle_bias_2d.hpp
+++ b/include/ck/tensor_operation/gpu/device/device_gemm_xdl_c_shuffle_bias_2d.hpp
@@ -264,7 +264,7 @@ struct DeviceGemmXdl_C_Shuffle_Bias_2d
{
using Argument = DeviceGemmXdl_C_Shuffle_Bias_2d::Argument;
- float Run(const Argument& arg, int nrepeat = 1)
+ float Run(const Argument& arg, const StreamConfig& stream_config = StreamConfig{})
{
{
std::cout << "arg.a_grid_desc_k0_m_k1_{" << arg.a_grid_desc_k0_m_k1_.GetLength(I0)
@@ -320,8 +320,8 @@ struct DeviceGemmXdl_C_Shuffle_Bias_2d
true>;
ave_time = launch_and_time_kernel(
+ stream_config,
kernel,
- nrepeat,
dim3(grid_size),
dim3(BlockSize),
0,
@@ -359,8 +359,8 @@ struct DeviceGemmXdl_C_Shuffle_Bias_2d
false>;
ave_time = launch_and_time_kernel(
+ stream_config,
kernel,
- nrepeat,
dim3(grid_size),
dim3(BlockSize),
0,
@@ -382,9 +382,10 @@ struct DeviceGemmXdl_C_Shuffle_Bias_2d
}
// polymorphic
- float Run(const BaseArgument* p_arg, int nrepeat = 1) override
+ float Run(const BaseArgument* p_arg,
+ const StreamConfig& stream_config = StreamConfig{}) override
{
- return Run(*dynamic_cast(p_arg), nrepeat);
+ return Run(*dynamic_cast(p_arg), stream_config);
}
};
diff --git a/include/ck/tensor_operation/gpu/device/device_gemm_xdl_c_shuffle_bias_activation.hpp b/include/ck/tensor_operation/gpu/device/device_gemm_xdl_c_shuffle_bias_activation.hpp
index c65ff6022a..f1f9f41724 100644
--- a/include/ck/tensor_operation/gpu/device/device_gemm_xdl_c_shuffle_bias_activation.hpp
+++ b/include/ck/tensor_operation/gpu/device/device_gemm_xdl_c_shuffle_bias_activation.hpp
@@ -273,7 +273,7 @@ struct DeviceGemmXdl_C_Shuffle_Bias_Activation
{
using Argument = DeviceOp::Argument;
- float Run(const Argument& arg, int nrepeat = 1)
+ float Run(const Argument& arg, const StreamConfig& stream_config = StreamConfig{})
{
{
std::cout << "arg.a_grid_desc_k0_m_k1_{" << arg.a_grid_desc_k0_m_k1_.GetLength(I0)
@@ -329,8 +329,8 @@ struct DeviceGemmXdl_C_Shuffle_Bias_Activation
true>;
ave_time = launch_and_time_kernel(
+ stream_config,
kernel,
- nrepeat,
dim3(grid_size),
dim3(BlockSize),
0,
@@ -368,8 +368,8 @@ struct DeviceGemmXdl_C_Shuffle_Bias_Activation
false>;
ave_time = launch_and_time_kernel(
+ stream_config,
kernel,
- nrepeat,
dim3(grid_size),
dim3(BlockSize),
0,
@@ -391,9 +391,10 @@ struct DeviceGemmXdl_C_Shuffle_Bias_Activation
}
// polymorphic
- float Run(const BaseArgument* p_arg, int nrepeat = 1) override
+ float Run(const BaseArgument* p_arg,
+ const StreamConfig& stream_config = StreamConfig{}) override
{
- return Run(*dynamic_cast(p_arg), nrepeat);
+ return Run(*dynamic_cast(p_arg), stream_config);
}
};
diff --git a/include/ck/tensor_operation/gpu/device/device_gemm_xdl_c_shuffle_bias_activation_add.hpp b/include/ck/tensor_operation/gpu/device/device_gemm_xdl_c_shuffle_bias_activation_add.hpp
index 4a478c995d..e3d0986aba 100644
--- a/include/ck/tensor_operation/gpu/device/device_gemm_xdl_c_shuffle_bias_activation_add.hpp
+++ b/include/ck/tensor_operation/gpu/device/device_gemm_xdl_c_shuffle_bias_activation_add.hpp
@@ -312,7 +312,7 @@ struct DeviceGemmXdl_C_Shuffle_Bias_Activation_Add
{
using Argument = DeviceOp::Argument;
- float Run(const Argument& arg, int nrepeat = 1)
+ float Run(const Argument& arg, const StreamConfig& stream_config = StreamConfig{})
{
{
std::cout << "arg.a_grid_desc_k0_m_k1_{" << arg.a_grid_desc_k0_m_k1_.GetLength(I0)
@@ -374,8 +374,8 @@ struct DeviceGemmXdl_C_Shuffle_Bias_Activation_Add
true>;
ave_time = launch_and_time_kernel(
+ stream_config,
kernel,
- nrepeat,
dim3(grid_size),
dim3(BlockSize),
0,
@@ -418,8 +418,8 @@ struct DeviceGemmXdl_C_Shuffle_Bias_Activation_Add
false>;
ave_time = launch_and_time_kernel(
+ stream_config,
kernel,
- nrepeat,
dim3(grid_size),
dim3(BlockSize),
0,
@@ -443,9 +443,10 @@ struct DeviceGemmXdl_C_Shuffle_Bias_Activation_Add
}
// polymorphic
- float Run(const BaseArgument* p_arg, int nrepeat = 1) override
+ float Run(const BaseArgument* p_arg,
+ const StreamConfig& stream_config = StreamConfig{}) override
{
- return Run(*dynamic_cast(p_arg), nrepeat);
+ return Run(*dynamic_cast(p_arg), stream_config);
}
};
diff --git a/include/ck/tensor_operation/gpu/device/device_gemm_xdl_cshuffle.hpp b/include/ck/tensor_operation/gpu/device/device_gemm_xdl_cshuffle.hpp
index fde27acdb1..952630120a 100644
--- a/include/ck/tensor_operation/gpu/device/device_gemm_xdl_cshuffle.hpp
+++ b/include/ck/tensor_operation/gpu/device/device_gemm_xdl_cshuffle.hpp
@@ -440,7 +440,7 @@ struct DeviceGemm_Xdl_CShuffle
{
using Argument = DeviceOp::Argument;
- float Run(const Argument& arg, int nrepeat = 1)
+ float Run(const Argument& arg, const StreamConfig& stream_config = StreamConfig{})
{
#if 0
{
@@ -487,42 +487,22 @@ struct DeviceGemm_Xdl_CShuffle
typename GridwiseGemm::DefaultBlock2CTileMap,
true>;
- if(nrepeat == 0)
- {
- launch_kernel(kernel,
- dim3(grid_size),
- dim3(BlockSize),
- 0,
- arg.p_a_grid_,
- arg.p_b_grid_,
- arg.p_c_grid_,
- arg.a_element_op_,
- arg.b_element_op_,
- arg.c_element_op_,
- arg.a_grid_desc_ak0_m_ak1_,
- arg.b_grid_desc_bk0_n_bk1_,
- arg.c_grid_desc_mblock_mperblock_nblock_nperblock_,
- arg.block_2_ctile_map_);
- }
- else
- {
- ave_time =
- launch_and_time_kernel(kernel,
- nrepeat,
- dim3(grid_size),
- dim3(BlockSize),
- 0,
- arg.p_a_grid_,
- arg.p_b_grid_,
- arg.p_c_grid_,
- arg.a_element_op_,
- arg.b_element_op_,
- arg.c_element_op_,
- arg.a_grid_desc_ak0_m_ak1_,
- arg.b_grid_desc_bk0_n_bk1_,
- arg.c_grid_desc_mblock_mperblock_nblock_nperblock_,
- arg.block_2_ctile_map_);
- }
+ ave_time =
+ launch_and_time_kernel(stream_config,
+ kernel,
+ dim3(grid_size),
+ dim3(BlockSize),
+ 0,
+ arg.p_a_grid_,
+ arg.p_b_grid_,
+ arg.p_c_grid_,
+ arg.a_element_op_,
+ arg.b_element_op_,
+ arg.c_element_op_,
+ arg.a_grid_desc_ak0_m_ak1_,
+ arg.b_grid_desc_bk0_n_bk1_,
+ arg.c_grid_desc_mblock_mperblock_nblock_nperblock_,
+ arg.block_2_ctile_map_);
}
else
{
@@ -538,52 +518,32 @@ struct DeviceGemm_Xdl_CShuffle
typename GridwiseGemm::CGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock,
typename GridwiseGemm::DefaultBlock2CTileMap,
false>;
-
- if(nrepeat == 0)
- {
- launch_kernel(kernel,
- dim3(grid_size),
- dim3(BlockSize),
- 0,
- arg.p_a_grid_,
- arg.p_b_grid_,
- arg.p_c_grid_,
- arg.a_element_op_,
- arg.b_element_op_,
- arg.c_element_op_,
- arg.a_grid_desc_ak0_m_ak1_,
- arg.b_grid_desc_bk0_n_bk1_,
- arg.c_grid_desc_mblock_mperblock_nblock_nperblock_,
- arg.block_2_ctile_map_);
- }
- else
- {
- ave_time =
- launch_and_time_kernel(kernel,
- nrepeat,
- dim3(grid_size),
- dim3(BlockSize),
- 0,
- arg.p_a_grid_,
- arg.p_b_grid_,
- arg.p_c_grid_,
- arg.a_element_op_,
- arg.b_element_op_,
- arg.c_element_op_,
- arg.a_grid_desc_ak0_m_ak1_,
- arg.b_grid_desc_bk0_n_bk1_,
- arg.c_grid_desc_mblock_mperblock_nblock_nperblock_,
- arg.block_2_ctile_map_);
- }
+ ave_time =
+ launch_and_time_kernel(stream_config,
+ kernel,
+ dim3(grid_size),
+ dim3(BlockSize),
+ 0,
+ arg.p_a_grid_,
+ arg.p_b_grid_,
+ arg.p_c_grid_,
+ arg.a_element_op_,
+ arg.b_element_op_,
+ arg.c_element_op_,
+ arg.a_grid_desc_ak0_m_ak1_,
+ arg.b_grid_desc_bk0_n_bk1_,
+ arg.c_grid_desc_mblock_mperblock_nblock_nperblock_,
+ arg.block_2_ctile_map_);
}
return ave_time;
}
// polymorphic
- float Run(const BaseArgument* p_arg, int nrepeat = 1) override
+ float Run(const BaseArgument* p_arg,
+ const StreamConfig& stream_config = StreamConfig{}) override
{
- return Run(*dynamic_cast(p_arg), nrepeat);
+ return Run(*dynamic_cast(p_arg), stream_config);
}
};
diff --git a/include/ck/tensor_operation/gpu/device/device_gemm_xdl_splitk.hpp b/include/ck/tensor_operation/gpu/device/device_gemm_xdl_splitk.hpp
index db6c884739..e603af1fba 100644
--- a/include/ck/tensor_operation/gpu/device/device_gemm_xdl_splitk.hpp
+++ b/include/ck/tensor_operation/gpu/device/device_gemm_xdl_splitk.hpp
@@ -385,8 +385,11 @@ struct DeviceGemmXdlSplitK
std::cout << "arg.c_grid_desc_m_n_{ " << arg.c_grid_desc_m_n_.GetLength(I0) << ", "
<< arg.c_grid_desc_m_n_.GetLength(I1) << "}" << std::endl;
}
- float Run(const Argument& arg, int nrepeat = 1)
+
+ float Run(const Argument& arg, const StreamConfig& stream_config = StreamConfig{})
{
+ ShowInfo(arg);
+
const auto kbatch = arg.a_grid_desc_kbatch_k0_m_k1_.GetLength(I0);
if(!GridwiseGemm::CheckValidity(arg.a_grid_desc_kbatch_k0_m_k1_,
@@ -408,50 +411,30 @@ struct DeviceGemmXdlSplitK
float ave_time = 0;
const auto Run = [&](const auto& kernel) {
- if(nrepeat > 0)
- {
- ShowInfo(arg);
- ave_time = launch_and_time_kernel(kernel,
- nrepeat,
- dim3(grid_size),
- dim3(BlockSize),
- 0,
- arg.p_a_grid_,
- arg.p_b_grid_,
- arg.p_c_grid_,
- arg.a_grid_desc_kbatch_k0_m_k1_,
- arg.b_grid_desc_kbatch_k0_n_k1_,
- arg.c_grid_desc_m0_n0_m1_n1_m2_m3_m4_n2_,
- arg.a_element_op_,
- arg.b_element_op_,
- arg.c_element_op_,
- arg.block_2_ctile_map_);
- }
+ // FIXME: this should be moved outside of DeviceOp
+ hipGetErrorString(
+ hipMemset(arg.p_c_grid_,
+ 0,
+ arg.c_grid_desc_m0_n0_m1_n1_m2_m3_m4_n2_.GetElementSpaceSize() *
+ sizeof(CDataType)));
- if(kbatch > 1 || nrepeat <= 0)
- {
- hipGetErrorString(
- hipMemset(arg.p_c_grid_,
- 0,
- arg.c_grid_desc_m0_n0_m1_n1_m2_m3_m4_n2_.GetElementSpaceSize() *
- sizeof(CDataType)));
-
- launch_kernel(kernel,
- dim3(grid_size),
- dim3(BlockSize),
- 0,
- arg.p_a_grid_,
- arg.p_b_grid_,
- arg.p_c_grid_,
- arg.a_grid_desc_kbatch_k0_m_k1_,
- arg.b_grid_desc_kbatch_k0_n_k1_,
- arg.c_grid_desc_m0_n0_m1_n1_m2_m3_m4_n2_,
- arg.a_element_op_,
- arg.b_element_op_,
- arg.c_element_op_,
- arg.block_2_ctile_map_);
- }
+ ave_time = launch_and_time_kernel(stream_config,
+ kernel,
+ dim3(grid_size),
+ dim3(BlockSize),
+ 0,
+ arg.p_a_grid_,
+ arg.p_b_grid_,
+ arg.p_c_grid_,
+ arg.a_grid_desc_kbatch_k0_m_k1_,
+ arg.b_grid_desc_kbatch_k0_n_k1_,
+ arg.c_grid_desc_m0_n0_m1_n1_m2_m3_m4_n2_,
+ arg.a_element_op_,
+ arg.b_element_op_,
+ arg.c_element_op_,
+ arg.block_2_ctile_map_);
};
+
if(has_main_k0_block_loop)
{
if(kbatch == 1)
@@ -531,9 +514,10 @@ struct DeviceGemmXdlSplitK
}
// polymorphic
- float Run(const BaseArgument* p_arg, int nrepeat = 1) override
+ float Run(const BaseArgument* p_arg,
+ const StreamConfig& stream_config = StreamConfig{}) override
{
- return Run(*dynamic_cast(p_arg), nrepeat);
+ return Run(*dynamic_cast(p_arg), stream_config);
}
};
diff --git a/include/ck/tensor_operation/gpu/device/device_gemm_xdl_splitk_c_shuffle.hpp b/include/ck/tensor_operation/gpu/device/device_gemm_xdl_splitk_c_shuffle.hpp
index 9de5361ab6..7d00224429 100644
--- a/include/ck/tensor_operation/gpu/device/device_gemm_xdl_splitk_c_shuffle.hpp
+++ b/include/ck/tensor_operation/gpu/device/device_gemm_xdl_splitk_c_shuffle.hpp
@@ -391,8 +391,11 @@ struct DeviceGemmXdlSplitKCShuffle
std::cout << "arg.c_grid_desc_m_n_{ " << arg.c_grid_desc_m_n_.GetLength(I0) << ", "
<< arg.c_grid_desc_m_n_.GetLength(I1) << "}" << std::endl;
}
- float Run(const Argument& arg, int nrepeat = 1)
+
+ float Run(const Argument& arg, const StreamConfig& stream_config = StreamConfig{})
{
+ ShowInfo(arg);
+
const auto kbatch = arg.a_grid_desc_kbatch_k0_m_k1_.GetLength(I0);
if(!GridwiseGemm::CheckValidity(arg.a_grid_desc_kbatch_k0_m_k1_,
@@ -414,51 +417,29 @@ struct DeviceGemmXdlSplitKCShuffle
float ave_time = 0;
const auto Run = [&](const auto& kernel) {
- if(nrepeat > 0)
- {
- ShowInfo(arg);
- ave_time =
- launch_and_time_kernel(kernel,
- nrepeat,
- dim3(grid_size),
- dim3(BlockSize),
- 0,
- arg.p_a_grid_,
- arg.p_b_grid_,
- arg.p_c_grid_,
- arg.a_grid_desc_kbatch_k0_m_k1_,
- arg.b_grid_desc_kbatch_k0_n_k1_,
- arg.c_grid_desc_mblock_mperblock_nblock_nperblock_,
- arg.a_element_op_,
- arg.b_element_op_,
- arg.c_element_op_,
- arg.block_2_ctile_map_);
- }
+ hipGetErrorString(hipMemset(
+ arg.p_c_grid_,
+ 0,
+ arg.c_grid_desc_mblock_mperblock_nblock_nperblock_.GetElementSpaceSize() *
+ sizeof(CDataType)));
- if(kbatch > 1 || nrepeat <= 0)
- {
- hipGetErrorString(hipMemset(
- arg.p_c_grid_,
- 0,
- arg.c_grid_desc_mblock_mperblock_nblock_nperblock_.GetElementSpaceSize() *
- sizeof(CDataType)));
-
- launch_kernel(kernel,
- dim3(grid_size),
- dim3(BlockSize),
- 0,
- arg.p_a_grid_,
- arg.p_b_grid_,
- arg.p_c_grid_,
- arg.a_grid_desc_kbatch_k0_m_k1_,
- arg.b_grid_desc_kbatch_k0_n_k1_,
- arg.c_grid_desc_mblock_mperblock_nblock_nperblock_,
- arg.a_element_op_,
- arg.b_element_op_,
- arg.c_element_op_,
- arg.block_2_ctile_map_);
- }
+ launch_and_time_kernel(stream_config,
+ kernel,
+ dim3(grid_size),
+ dim3(BlockSize),
+ 0,
+ arg.p_a_grid_,
+ arg.p_b_grid_,
+ arg.p_c_grid_,
+ arg.a_grid_desc_kbatch_k0_m_k1_,
+ arg.b_grid_desc_kbatch_k0_n_k1_,
+ arg.c_grid_desc_mblock_mperblock_nblock_nperblock_,
+ arg.a_element_op_,
+ arg.b_element_op_,
+ arg.c_element_op_,
+ arg.block_2_ctile_map_);
};
+
if(has_main_k0_block_loop)
{
if(kbatch == 1)
@@ -542,9 +523,10 @@ struct DeviceGemmXdlSplitKCShuffle
}
// polymorphic
- float Run(const BaseArgument* p_arg, int nrepeat = 1) override
+ float Run(const BaseArgument* p_arg,
+ const StreamConfig& stream_config = StreamConfig{}) override
{
- return Run(*dynamic_cast(p_arg), nrepeat);
+ return Run(*dynamic_cast(p_arg), stream_config);
}
};
diff --git a/include/ck/tensor_operation/gpu/device/device_grouped_gemm_xdl.hpp b/include/ck/tensor_operation/gpu/device/device_grouped_gemm_xdl.hpp
index dfc1ce2715..730b2d787e 100644
--- a/include/ck/tensor_operation/gpu/device/device_grouped_gemm_xdl.hpp
+++ b/include/ck/tensor_operation/gpu/device/device_grouped_gemm_xdl.hpp
@@ -449,7 +449,7 @@ struct DeviceGroupedGemmXdl
{
using Argument = DeviceGroupedGemmXdl::Argument;
- float Run(const Argument& arg, int nrepeat = 1)
+ float Run(const Argument& arg, const StreamConfig& stream_config = StreamConfig{})
{
StaticallyIndexedArray gemm_desc_kernel_args;
@@ -510,8 +510,8 @@ struct DeviceGroupedGemmXdl
true,
MaxGroupCount>;
- ave_time = launch_and_time_kernel(kernel,
- nrepeat,
+ ave_time = launch_and_time_kernel(stream_config,
+ kernel,
dim3(arg.grid_size_),
dim3(BlockSize),
0,
@@ -534,8 +534,8 @@ struct DeviceGroupedGemmXdl
false,
MaxGroupCount>;
- ave_time = launch_and_time_kernel(kernel,
- nrepeat,
+ ave_time = launch_and_time_kernel(stream_config,
+ kernel,
dim3(arg.grid_size_),
dim3(BlockSize),
0,
@@ -550,9 +550,10 @@ struct DeviceGroupedGemmXdl
}
// polymorphic
- float Run(const BaseArgument* p_arg, int nrepeat = 1) override
+ float Run(const BaseArgument* p_arg,
+ const StreamConfig& stream_config = StreamConfig{}) override
{
- return Run(*dynamic_cast(p_arg), nrepeat);
+ return Run(*dynamic_cast(p_arg), stream_config);
}
};
diff --git a/include/ck/tensor_operation/gpu/device/device_pool2d_fwd_nhwc_nhwc.hpp b/include/ck/tensor_operation/gpu/device/device_pool2d_fwd_nhwc_nhwc.hpp
index 651d31ae2f..f665378e08 100644
--- a/include/ck/tensor_operation/gpu/device/device_pool2d_fwd_nhwc_nhwc.hpp
+++ b/include/ck/tensor_operation/gpu/device/device_pool2d_fwd_nhwc_nhwc.hpp
@@ -204,7 +204,7 @@ struct DevicePool2dFwd_Input_N_Hi_Wi_C_Output_N_Ho_Wo_C : public DevicePool2dFwd
struct Invoker : public BaseInvoker
{
- float Run(const Argument& arg, int nrepeat = 1)
+ float Run(const Argument& arg, const StreamConfig& stream_config = StreamConfig{})
{
using gridwise_reduce = GridwiseReduction_mk_to_m_threadwise(p_arg), nrepeat);
+ return Run(*dynamic_cast(p_arg), stream_config);
}
};
diff --git a/include/ck/tensor_operation/gpu/device/device_reduce_blockwise.hpp b/include/ck/tensor_operation/gpu/device/device_reduce_blockwise.hpp
index 4f17989b53..860f53d8c5 100644
--- a/include/ck/tensor_operation/gpu/device/device_reduce_blockwise.hpp
+++ b/include/ck/tensor_operation/gpu/device/device_reduce_blockwise.hpp
@@ -211,7 +211,7 @@ struct DeviceReduceBlockWise : public DeviceReduce;
- avg_time = launch_and_time_kernel(kernel,
- nrepeat,
+ avg_time = launch_and_time_kernel(stream_config,
+ kernel,
dim3(arg.gridSize),
dim3(BlockSize),
0,
@@ -272,9 +272,10 @@ struct DeviceReduceBlockWise : public DeviceReduce(p_arg), nrepeat);
+ return Run(*dynamic_cast(p_arg), stream_config);
};
};
diff --git a/include/ck/tensor_operation/gpu/device/device_reduce_blockwise_second_call.hpp b/include/ck/tensor_operation/gpu/device/device_reduce_blockwise_second_call.hpp
index d3b1b4b5c3..43ac48cecc 100644
--- a/include/ck/tensor_operation/gpu/device/device_reduce_blockwise_second_call.hpp
+++ b/include/ck/tensor_operation/gpu/device/device_reduce_blockwise_second_call.hpp
@@ -182,7 +182,7 @@ struct DeviceReduceBlockWiseSecondCall
struct Invoker : public BaseInvoker
{
- float Run(const Argument& arg, int nrepeat = 1)
+ float Run(const Argument& arg, const StreamConfig& stream_config = StreamConfig{})
{
const auto in_grid_desc_m_k = DeviceReduceBlockWiseSecondCall::MakeSrc2dDescriptor(
arg.inLengths_, arg.inStrides_);
@@ -224,8 +224,8 @@ struct DeviceReduceBlockWiseSecondCall
InElementwiseOperation,
AccElementwiseOperation>;
- avg_time = launch_and_time_kernel(kernel,
- nrepeat,
+ avg_time = launch_and_time_kernel(stream_config,
+ kernel,
dim3(arg.gridSize),
dim3(BlockSize),
0,
@@ -243,10 +243,11 @@ struct DeviceReduceBlockWiseSecondCall
return (avg_time);
};
- float Run(const BaseArgument* p_arg, int nrepeat = 1) override
+ float Run(const BaseArgument* p_arg,
+ const StreamConfig& stream_config = StreamConfig{}) override
{
- return Run(*dynamic_cast(p_arg), nrepeat);
- };
+ return Run(*dynamic_cast(p_arg), stream_config);
+ }
};
bool IsSupportedArgument(const BaseArgument* p_arg) override
diff --git a/include/ck/tensor_operation/gpu/device/device_reduce_multiblock_atomic_add.hpp b/include/ck/tensor_operation/gpu/device/device_reduce_multiblock_atomic_add.hpp
index 889c366875..f93c65fe18 100644
--- a/include/ck/tensor_operation/gpu/device/device_reduce_multiblock_atomic_add.hpp
+++ b/include/ck/tensor_operation/gpu/device/device_reduce_multiblock_atomic_add.hpp
@@ -245,7 +245,7 @@ struct DeviceReduceMultiBlockAtomicAdd
struct Invoker : public BaseInvoker
{
- float Run(const Argument& arg, int nrepeat = 1)
+ float Run(const Argument& arg, const StreamConfig& stream_config = StreamConfig{})
{
const auto in_grid_desc_m_k = DeviceReduceMultiBlockAtomicAdd::MakeSrc2dDescriptor(
arg.inLengths_, arg.inStrides_, arg.blkGroupSize, arg.kBlockTileIterations);
@@ -275,8 +275,6 @@ struct DeviceReduceMultiBlockAtomicAdd
float avg_time = 0;
- KernelTimer timer;
-
const auto kernel_pre = kernel_buffer_set_value;
const auto kernel_main = kernel_reduce_multiblock_atocmi_add;
- printf("launch_and_time_kernel: grid_dim {%ld, 1, 1}, block_dim {%d, 1, 1} \n",
- arg.gridSize,
- BlockSize);
- printf("Warm up\n");
+ avg_time += launch_and_time_kernel(stream_config,
+ kernel_pre,
+ dim3(arg.gridSize_pre),
+ dim3(BlockSize),
+ 0,
+ out_grid_desc_m,
+ arg.out_dev_,
+ static_cast(0.0f));
- for(int i = 0; i < nrepeat + 1; i++)
- {
- if(i == 1)
- timer.Start();
+ avg_time += launch_and_time_kernel(stream_config,
+ kernel_main,
+ dim3(arg.gridSize),
+ dim3(BlockSize),
+ 0,
+ in_grid_desc_m_k,
+ out_grid_desc_m,
+ arg.in_elementwise_op_,
+ arg.acc_elementwise_op_,
+ arg.blkGroupSize,
+ arg.kBlockTileIterations,
+ arg.alpha_,
+ arg.in_dev_,
+ arg.out_dev_);
- launch_kernel(kernel_pre,
- dim3(arg.gridSize_pre),
- dim3(BlockSize),
- 0,
- out_grid_desc_m,
- arg.out_dev_,
- static_cast(0.0f));
+ return avg_time;
+ }
- launch_kernel(kernel_main,
- dim3(arg.gridSize),
- dim3(BlockSize),
- 0,
- in_grid_desc_m_k,
- out_grid_desc_m,
- arg.in_elementwise_op_,
- arg.acc_elementwise_op_,
- arg.blkGroupSize,
- arg.kBlockTileIterations,
- arg.alpha_,
- arg.in_dev_,
- arg.out_dev_);
- };
-
- timer.End();
-
- avg_time = timer.GetElapsedTime() / nrepeat;
-
- return (avg_time);
- };
-
- float Run(const BaseArgument* p_arg, int nrepeat = 1) override
+ float Run(const BaseArgument* p_arg,
+ const StreamConfig& stream_config = StreamConfig{}) override
{
- return Run(*dynamic_cast(p_arg), nrepeat);
- };
+ return Run(*dynamic_cast(p_arg), stream_config);
+ }
};
bool IsSupportedArgument(const BaseArgument* p_arg) override
diff --git a/include/ck/tensor_operation/gpu/device/device_reduce_multiblock_partial_reduce.hpp b/include/ck/tensor_operation/gpu/device/device_reduce_multiblock_partial_reduce.hpp
index d583f7f1b8..b4eb8116c2 100644
--- a/include/ck/tensor_operation/gpu/device/device_reduce_multiblock_partial_reduce.hpp
+++ b/include/ck/tensor_operation/gpu/device/device_reduce_multiblock_partial_reduce.hpp
@@ -273,7 +273,7 @@ struct DeviceReduceMultiBlockPartialReduce
struct Invoker : public BaseInvoker
{
- float Run(const Argument& arg, int nrepeat = 1)
+ float Run(const Argument& arg, const StreamConfig& stream_config = StreamConfig{})
{
const auto in_grid_desc_m_k = DeviceReduceMultiBlockPartialReduce::MakeSrc2dDescriptor(
arg.inLengths_, arg.inStrides_, arg.blkGroupSize, arg.kBlockTileIterations);
@@ -313,8 +313,8 @@ struct DeviceReduceMultiBlockPartialReduce
InElementwiseOperation,
AccElementwiseOperation>;
- avg_time = launch_and_time_kernel(kernel,
- nrepeat,
+ avg_time = launch_and_time_kernel(stream_config,
+ kernel,
dim3(arg.gridSize),
dim3(BlockSize),
0,
@@ -331,10 +331,11 @@ struct DeviceReduceMultiBlockPartialReduce
return (avg_time);
};
- float Run(const BaseArgument* p_arg, int nrepeat = 1) override
+ float Run(const BaseArgument* p_arg,
+ const StreamConfig& stream_config = StreamConfig{}) override
{
- return Run(*dynamic_cast(p_arg), nrepeat);
- };
+ return Run(*dynamic_cast(p_arg), stream_config);
+ }
};
bool IsSupportedArgument(const BaseArgument* p_arg) override
diff --git a/include/ck/tensor_operation/gpu/device/device_reduce_threadwise.hpp b/include/ck/tensor_operation/gpu/device/device_reduce_threadwise.hpp
index bf4088a96b..dacb175043 100644
--- a/include/ck/tensor_operation/gpu/device/device_reduce_threadwise.hpp
+++ b/include/ck/tensor_operation/gpu/device/device_reduce_threadwise.hpp
@@ -212,7 +212,7 @@ struct DeviceReduceThreadWise : public DeviceReduce;
- avg_time = launch_and_time_kernel(kernel,
- nrepeat,
+ avg_time = launch_and_time_kernel(stream_config,
+ kernel,
dim3(arg.gridSize),
dim3(BlockSize),
0,
@@ -272,10 +272,11 @@ struct DeviceReduceThreadWise : public DeviceReduce(p_arg), nrepeat);
- };
+ return Run(*dynamic_cast(p_arg), stream_config);
+ }
};
bool IsSupportedArgument(const BaseArgument* p_arg) override
diff --git a/library/include/ck/library/host/host_interface.hpp b/library/include/ck/library/host/host_interface.hpp
new file mode 100644
index 0000000000..955da0f4be
--- /dev/null
+++ b/library/include/ck/library/host/host_interface.hpp
@@ -0,0 +1,54 @@
+#pragma once
+
+#include
+#include
+
+#include "stream_config.hpp"
+#include "config.hpp"
+#include "device_base.hpp"
+
+struct DeviceConvFwdPtr_t
+{
+ using BaseArgument = ck::tensor_operation::device::BaseArgument;
+ using BaseInvoker = ck::tensor_operation::device::BaseInvoker;
+
+ struct DeviceConvFwdPtrImpl;
+ std::unique_ptr pImpl;
+ DeviceConvFwdPtr_t();
+ ~DeviceConvFwdPtr_t();
+ DeviceConvFwdPtr_t(DeviceConvFwdPtr_t&&);
+ DeviceConvFwdPtr_t(DeviceConvFwdPtrImpl&);
+ DeviceConvFwdPtr_t& operator=(DeviceConvFwdPtr_t&) = delete;
+ DeviceConvFwdPtr_t& operator=(const DeviceConvFwdPtr_t&) = delete;
+ std::unique_ptr
+ MakeArgumentPointer(void* in_ptr,
+ void* wei_ptr,
+ void* out_ptr,
+ size_t N,
+ size_t K,
+ size_t C,
+ std::vector input_spatial_lengths,
+ std::vector filter_spatial_lengths,
+ std::vector output_spatial_lengths,
+ std::vector conv_filter_strides,
+ std::vector conv_filter_dilations,
+ std::vector input_left_pads,
+ std::vector input_right_pads)
+ const; // in,wei and out element ops are ignored for now since even if we change them, they
+ // cant be linked
+ std::unique_ptr
+ MakeInvokerPointer() const; // requires including BaseInvoker headers
+ std::string GetTypeString();
+ bool IsSupportedArgument(const BaseArgument* arg_ptr);
+};
+
+void add_device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_f32_instances_t(
+ std::vector& instances);
+void add_device_conv2d_fwd_xdl_c_shuffle_nhwc_kyxc_nhwk_f16_instances_t(
+ std::vector& instances);
+void add_device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_bf16_instances_t(
+ std::vector& instances);
+void add_device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_f16_instances_t(
+ std::vector& instances);
+void add_device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_int8_instances_t(
+ std::vector& instances);
diff --git a/library/include/ck/library/host_tensor/device.hpp b/library/include/ck/library/host_tensor/device.hpp
index f33b8d4f40..d549b14c8c 100644
--- a/library/include/ck/library/host_tensor/device.hpp
+++ b/library/include/ck/library/host_tensor/device.hpp
@@ -1,12 +1,25 @@
-#ifndef DEVICE_HPP
-#define DEVICE_HPP
+#pragma once
#include
#include
#include
#include
-#include "hip/hip_runtime.h"
-#include "hip/hip_fp16.h"
+#include
+#include
+
+#include "stream_config.hpp"
+#include "ck/options.hpp"
+
+inline void hip_check_error(hipError_t x)
+{
+ if(x != hipSuccess)
+ {
+ std::ostringstream ss;
+ ss << "HIP runtime error: " << hipGetErrorString(x) << ". " << __FILE__ << ": " << __LINE__
+ << "in function: " << __func__;
+ throw std::runtime_error(ss.str());
+ }
+}
struct DeviceMem
{
@@ -36,49 +49,59 @@ struct KernelTimer
std::unique_ptr impl;
};
-using device_stream_t = hipStream_t;
-
template
-void launch_kernel(F kernel, dim3 grid_dim, dim3 block_dim, std::size_t lds_byte, Args... args)
+float launch_and_time_kernel(const StreamConfig& stream_config,
+ F kernel,
+ dim3 grid_dim,
+ dim3 block_dim,
+ std::size_t lds_byte,
+ Args... args)
{
- hipStream_t stream_id = nullptr;
-
- hipLaunchKernelGGL(kernel, grid_dim, block_dim, lds_byte, stream_id, args...);
-}
-
-template
-float launch_and_time_kernel(
- F kernel, int nrepeat, dim3 grid_dim, dim3 block_dim, std::size_t lds_byte, Args... args)
-{
- KernelTimer timer;
-
- printf("%s: grid_dim {%d, %d, %d}, block_dim {%d, %d, %d} \n",
- __func__,
- grid_dim.x,
- grid_dim.y,
- grid_dim.z,
- block_dim.x,
- block_dim.y,
- block_dim.z);
-
- printf("Warm up\n");
-
- hipStream_t stream_id = nullptr;
-
- // warm up
- hipLaunchKernelGGL(kernel, grid_dim, block_dim, lds_byte, stream_id, args...);
-
- printf("Start running %d times...\n", nrepeat);
-
- timer.Start();
-
- for(int i = 0; i < nrepeat; ++i)
+#if CK_TIME_KERNEL
+ if(stream_config.time_kernel_)
{
- hipLaunchKernelGGL(kernel, grid_dim, block_dim, lds_byte, stream_id, args...);
+ printf("%s: grid_dim {%d, %d, %d}, block_dim {%d, %d, %d} \n",
+ __func__,
+ grid_dim.x,
+ grid_dim.y,
+ grid_dim.z,
+ block_dim.x,
+ block_dim.y,
+ block_dim.z);
+
+ const int nrepeat = 10;
+
+ printf("Warm up 1 time\n");
+
+ // warm up
+ hipLaunchKernelGGL(
+ kernel, grid_dim, block_dim, lds_byte, stream_config.stream_id_, args...);
+
+ printf("Start running %d times...\n", nrepeat);
+
+ KernelTimer timer;
+ timer.Start();
+
+ for(int i = 0; i < nrepeat; ++i)
+ {
+ hipLaunchKernelGGL(
+ kernel, grid_dim, block_dim, lds_byte, stream_config.stream_id_, args...);
+ }
+
+ timer.End();
+
+ return timer.GetElapsedTime() / nrepeat;
}
+ else
+ {
+ hipLaunchKernelGGL(
+ kernel, grid_dim, block_dim, lds_byte, stream_config.stream_id_, args...);
- timer.End();
+ return 0;
+ }
+#else
+ hipLaunchKernelGGL(kernel, grid_dim, block_dim, lds_byte, stream_config.stream_id_, args...);
- return timer.GetElapsedTime() / nrepeat;
-}
+ return 0;
#endif
+}
diff --git a/library/include/ck/library/reference_tensor_operation/cpu/reference_batched_gemm.hpp b/library/include/ck/library/reference_tensor_operation/cpu/reference_batched_gemm.hpp
index 3a706dac0b..f4944a28d2 100644
--- a/library/include/ck/library/reference_tensor_operation/cpu/reference_batched_gemm.hpp
+++ b/library/include/ck/library/reference_tensor_operation/cpu/reference_batched_gemm.hpp
@@ -84,7 +84,8 @@ struct ReferenceBatchedGemm : public device::BaseOperator
return 0;
}
- float Run(const device::BaseArgument* p_arg, int) override
+ float Run(const device::BaseArgument* p_arg,
+ const StreamConfig& /* stream_config */ = StreamConfig{}) override
{
return Run(*dynamic_cast(p_arg));
}
diff --git a/library/include/ck/library/reference_tensor_operation/cpu/reference_conv_backward_weight.hpp b/library/include/ck/library/reference_tensor_operation/cpu/reference_conv_backward_weight.hpp
index c5f3cbad69..10619ae6d9 100644
--- a/library/include/ck/library/reference_tensor_operation/cpu/reference_conv_backward_weight.hpp
+++ b/library/include/ck/library/reference_tensor_operation/cpu/reference_conv_backward_weight.hpp
@@ -121,7 +121,8 @@ struct ReferenceConvBwdWeight : public device::BaseOperator
return 0;
}
- float Run(const device::BaseArgument* p_arg, int) override
+ float Run(const device::BaseArgument* p_arg,
+ const StreamConfig& /*stream_config*/ = StreamConfig{}) override
{
return Run(*dynamic_cast(p_arg));
}
diff --git a/library/include/ck/library/reference_tensor_operation/cpu/reference_conv_bwd_data.hpp b/library/include/ck/library/reference_tensor_operation/cpu/reference_conv_bwd_data.hpp
index 9e91f06e7f..45fc8b8503 100644
--- a/library/include/ck/library/reference_tensor_operation/cpu/reference_conv_bwd_data.hpp
+++ b/library/include/ck/library/reference_tensor_operation/cpu/reference_conv_bwd_data.hpp
@@ -291,7 +291,8 @@ struct ReferenceConvBwdData : public device::BaseOperator
}
}
- float Run(const device::BaseArgument* p_arg, int) override
+ float Run(const device::BaseArgument* p_arg,
+ const StreamConfig& /* stream_config */ = StreamConfig{}) override
{
return Run(*dynamic_cast(p_arg));
}
diff --git a/library/include/ck/library/reference_tensor_operation/cpu/reference_conv_fwd.hpp b/library/include/ck/library/reference_tensor_operation/cpu/reference_conv_fwd.hpp
index 65e59db2f8..d1afa898e4 100644
--- a/library/include/ck/library/reference_tensor_operation/cpu/reference_conv_fwd.hpp
+++ b/library/include/ck/library/reference_tensor_operation/cpu/reference_conv_fwd.hpp
@@ -1,9 +1,10 @@
-#ifndef REFERENCE_CONV_FWD_HPP
-#define REFERENCE_CONV_FWD_HPP
+#pragma once
#include
#include
#include
+
+#include "stream_config.hpp"
#include "device_base.hpp"
#include "host_tensor.hpp"
@@ -251,7 +252,8 @@ struct ReferenceConvFwd : public device::BaseOperator
}
}
- float Run(const device::BaseArgument* p_arg, int) override
+ float Run(const device::BaseArgument* p_arg,
+ const StreamConfig& /*stream_config*/ = StreamConfig{}) override
{
return Run(*dynamic_cast(p_arg));
}
@@ -311,4 +313,3 @@ struct ReferenceConvFwd : public device::BaseOperator
} // namespace host
} // namespace tensor_operation
} // namespace ck
-#endif
diff --git a/library/include/ck/library/reference_tensor_operation/cpu/reference_conv_fwd_bias_activation.hpp b/library/include/ck/library/reference_tensor_operation/cpu/reference_conv_fwd_bias_activation.hpp
index ee95cd410a..4be6169c15 100644
--- a/library/include/ck/library/reference_tensor_operation/cpu/reference_conv_fwd_bias_activation.hpp
+++ b/library/include/ck/library/reference_tensor_operation/cpu/reference_conv_fwd_bias_activation.hpp
@@ -124,7 +124,8 @@ struct ReferenceConvFwd_Bias_Activation : public device::BaseOperator
return 0;
}
- float Run(const device::BaseArgument* p_arg, int) override
+ float Run(const device::BaseArgument* p_arg,
+ const StreamConfig& /* stream_config */ = StreamConfig{}) override
{
return Run(*dynamic_cast(p_arg));
}
diff --git a/library/include/ck/library/reference_tensor_operation/cpu/reference_conv_fwd_bias_activation_add.hpp b/library/include/ck/library/reference_tensor_operation/cpu/reference_conv_fwd_bias_activation_add.hpp
index 11232cc98f..466537c686 100644
--- a/library/include/ck/library/reference_tensor_operation/cpu/reference_conv_fwd_bias_activation_add.hpp
+++ b/library/include/ck/library/reference_tensor_operation/cpu/reference_conv_fwd_bias_activation_add.hpp
@@ -130,7 +130,8 @@ struct ReferenceConvFwd_Bias_Activation_Add : public device::BaseOperator
return 0;
}
- float Run(const device::BaseArgument* p_arg, int) override
+ float Run(const device::BaseArgument* p_arg,
+ const StreamConfig& /*stream_config*/ = StreamConfig{}) override
{
return Run(*dynamic_cast(p_arg));
}
diff --git a/library/include/ck/library/reference_tensor_operation/cpu/reference_gemm.hpp b/library/include/ck/library/reference_tensor_operation/cpu/reference_gemm.hpp
index 1b49ca5740..d89c8f5e05 100644
--- a/library/include/ck/library/reference_tensor_operation/cpu/reference_gemm.hpp
+++ b/library/include/ck/library/reference_tensor_operation/cpu/reference_gemm.hpp
@@ -80,7 +80,8 @@ struct ReferenceGemm : public device::BaseOperator
return 0;
}
- float Run(const device::BaseArgument* p_arg, int) override
+ float Run(const device::BaseArgument* p_arg,
+ const StreamConfig& /* stream_config */ = StreamConfig{}) override
{
return Run(*dynamic_cast(p_arg));
}
diff --git a/library/include/ck/library/reference_tensor_operation/cpu/reference_gemm_bias_2d.hpp b/library/include/ck/library/reference_tensor_operation/cpu/reference_gemm_bias_2d.hpp
index 7dd6fc9199..3e7f220e03 100644
--- a/library/include/ck/library/reference_tensor_operation/cpu/reference_gemm_bias_2d.hpp
+++ b/library/include/ck/library/reference_tensor_operation/cpu/reference_gemm_bias_2d.hpp
@@ -82,7 +82,8 @@ struct ReferenceGemmBias2D : public device::BaseOperator
return 0;
}
- float Run(const device::BaseArgument* p_arg, int) override
+ float Run(const device::BaseArgument* p_arg,
+ const StreamConfig& /* stream_config */ = StreamConfig{}) override
{
return Run(*dynamic_cast(p_arg));
}
diff --git a/library/include/ck/library/reference_tensor_operation/cpu/reference_gemm_bias_activation.hpp b/library/include/ck/library/reference_tensor_operation/cpu/reference_gemm_bias_activation.hpp
index 7c9df272c2..60f72e9e51 100644
--- a/library/include/ck/library/reference_tensor_operation/cpu/reference_gemm_bias_activation.hpp
+++ b/library/include/ck/library/reference_tensor_operation/cpu/reference_gemm_bias_activation.hpp
@@ -85,7 +85,8 @@ struct ReferenceGemmBiasActivation : public device::BaseOperator
return 0;
}
- float Run(const device::BaseArgument* p_arg, int) override
+ float Run(const device::BaseArgument* p_arg,
+ const StreamConfig& /* stream_config */ = StreamConfig{}) override
{
return Run(*dynamic_cast(p_arg));
}
diff --git a/library/include/ck/library/reference_tensor_operation/cpu/reference_gemm_bias_activation_add.hpp b/library/include/ck/library/reference_tensor_operation/cpu/reference_gemm_bias_activation_add.hpp
index 4d3c5effae..5e0ec75e5e 100644
--- a/library/include/ck/library/reference_tensor_operation/cpu/reference_gemm_bias_activation_add.hpp
+++ b/library/include/ck/library/reference_tensor_operation/cpu/reference_gemm_bias_activation_add.hpp
@@ -91,7 +91,8 @@ struct ReferenceGemmBiasActivationAdd : public device::BaseOperator
return 0;
}
- float Run(const device::BaseArgument* p_arg, int) override
+ float Run(const device::BaseArgument* p_arg,
+ const StreamConfig& /* stream_config */ = StreamConfig{}) override
{
return Run(*dynamic_cast(p_arg));
}
diff --git a/library/include/ck/library/utility/op_instance_engine.hpp b/library/include/ck/library/utility/op_instance_engine.hpp
index ec88b4e1b9..5429f66d3e 100644
--- a/library/include/ck/library/utility/op_instance_engine.hpp
+++ b/library/include/ck/library/utility/op_instance_engine.hpp
@@ -128,7 +128,7 @@ class OpInstanceRunEngine
template
ProfileBestConfig Profile(const std::vector& op_ptrs,
- int nrepeat = 100,
+ bool time_kernel = false,
bool do_verification = false,
bool do_log = false)
{
@@ -143,7 +143,7 @@ class OpInstanceRunEngine
if(op_ptr->IsSupportedArgument(argument.get()))
{
std::string op_name = op_ptr->GetTypeString();
- float avg_time = invoker->Run(argument.get(), nrepeat);
+ float avg_time = invoker->Run(argument.get(), StreamConfig{nullptr, time_kernel});
std::size_t flops = op_instance_.GetFlops();
std::size_t num_btype = op_instance_.GetBtype();
diff --git a/library/src/host_tensor/CMakeLists.txt b/library/src/host_tensor/CMakeLists.txt
index fd100e477f..2a020b763d 100644
--- a/library/src/host_tensor/CMakeLists.txt
+++ b/library/src/host_tensor/CMakeLists.txt
@@ -10,10 +10,31 @@ set(HOST_TENSOR_SOURCE
host_tensor.cpp
)
-add_library(host_tensor SHARED ${HOST_TENSOR_SOURCE})
+add_library(host_tensor STATIC ${HOST_TENSOR_SOURCE})
+add_library(composable_kernel::host_tensor ALIAS host_tensor)
+
target_compile_features(host_tensor PUBLIC)
set_target_properties(host_tensor PROPERTIES POSITION_INDEPENDENT_CODE ON)
target_include_directories(host_tensor SYSTEM PUBLIC $)
-install(TARGETS host_tensor LIBRARY DESTINATION lib)
+
+target_include_directories(host_tensor PUBLIC
+ "$"
+ "$"
+ "$"
+)
+
+install(TARGETS host_tensor
+ EXPORT host_tensorTargets
+ LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR}
+ ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR}
+ RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR}
+ INCLUDES DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}
+)
+
+install(EXPORT host_tensorTargets
+ FILE composable_kernelhost_tensorTargets.cmake
+ NAMESPACE composable_kernel::
+ DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/composable_kernel
+)
clang_tidy_check(host_tensor)
diff --git a/library/src/host_tensor/device.cpp b/library/src/host_tensor/device.cpp
index 3e80df80fb..9f0d982dbc 100644
--- a/library/src/host_tensor/device.cpp
+++ b/library/src/host_tensor/device.cpp
@@ -2,7 +2,7 @@
DeviceMem::DeviceMem(std::size_t mem_size) : mMemSize(mem_size)
{
- hipGetErrorString(hipMalloc(static_cast(&mpDeviceBuf), mMemSize));
+ hip_check_error(hipMalloc(static_cast(&mpDeviceBuf), mMemSize));
}
void* DeviceMem::GetDeviceBuffer() { return mpDeviceBuf; }
@@ -11,49 +11,48 @@ std::size_t DeviceMem::GetBufferSize() { return mMemSize; }
void DeviceMem::ToDevice(const void* p)
{
- hipGetErrorString(
- hipMemcpy(mpDeviceBuf, const_cast(p), mMemSize, hipMemcpyHostToDevice));
+ hip_check_error(hipMemcpy(mpDeviceBuf, const_cast(p), mMemSize, hipMemcpyHostToDevice));
}
void DeviceMem::FromDevice(void* p)
{
- hipGetErrorString(hipMemcpy(p, mpDeviceBuf, mMemSize, hipMemcpyDeviceToHost));
+ hip_check_error(hipMemcpy(p, mpDeviceBuf, mMemSize, hipMemcpyDeviceToHost));
}
-void DeviceMem::SetZero() { hipGetErrorString(hipMemset(mpDeviceBuf, 0, mMemSize)); }
+void DeviceMem::SetZero() { hip_check_error(hipMemset(mpDeviceBuf, 0, mMemSize)); }
-DeviceMem::~DeviceMem() { hipGetErrorString(hipFree(mpDeviceBuf)); }
+DeviceMem::~DeviceMem() { hip_check_error(hipFree(mpDeviceBuf)); }
struct KernelTimerImpl
{
KernelTimerImpl()
{
- hipGetErrorString(hipEventCreate(&mStart));
- hipGetErrorString(hipEventCreate(&mEnd));
+ hip_check_error(hipEventCreate(&mStart));
+ hip_check_error(hipEventCreate(&mEnd));
}
~KernelTimerImpl()
{
- hipGetErrorString(hipEventDestroy(mStart));
- hipGetErrorString(hipEventDestroy(mEnd));
+ hip_check_error(hipEventDestroy(mStart));
+ hip_check_error(hipEventDestroy(mEnd));
}
void Start()
{
- hipGetErrorString(hipDeviceSynchronize());
- hipGetErrorString(hipEventRecord(mStart, nullptr));
+ hip_check_error(hipDeviceSynchronize());
+ hip_check_error(hipEventRecord(mStart, nullptr));
}
void End()
{
- hipGetErrorString(hipEventRecord(mEnd, nullptr));
- hipGetErrorString(hipEventSynchronize(mEnd));
+ hip_check_error(hipEventRecord(mEnd, nullptr));
+ hip_check_error(hipEventSynchronize(mEnd));
}
float GetElapsedTime() const
{
float time;
- hipGetErrorString(hipEventElapsedTime(&time, mStart, mEnd));
+ hip_check_error(hipEventElapsedTime(&time, mStart, mEnd));
return time;
}
diff --git a/library/src/tensor_operation_instance/gpu/CMakeLists.txt b/library/src/tensor_operation_instance/gpu/CMakeLists.txt
index 7b361b48bd..5abfb0c074 100644
--- a/library/src/tensor_operation_instance/gpu/CMakeLists.txt
+++ b/library/src/tensor_operation_instance/gpu/CMakeLists.txt
@@ -11,6 +11,7 @@ include_directories(BEFORE
${PROJECT_SOURCE_DIR}/include/ck/tensor_operation/gpu/thread
${PROJECT_SOURCE_DIR}/include/ck/tensor_operation/gpu/element
${PROJECT_SOURCE_DIR}/library/include/ck/library/host_tensor
+ ${PROJECT_SOURCE_DIR}/library/include/ck/library/host
${PROJECT_SOURCE_DIR}/library/include/ck/library/tensor_operation_instance
${PROJECT_SOURCE_DIR}/library/include/ck/library/tensor_operation_instance/gpu/reduce
${PROJECT_SOURCE_DIR}/external/include/half
@@ -18,7 +19,7 @@ include_directories(BEFORE
function(add_instance_library INSTANCE_NAME)
message("adding instance ${INSTANCE_NAME}")
- add_library(${INSTANCE_NAME} SHARED ${ARGN})
+ add_library(${INSTANCE_NAME} OBJECT ${ARGN})
target_compile_features(${INSTANCE_NAME} PUBLIC)
set_target_properties(${INSTANCE_NAME} PROPERTIES POSITION_INDEPENDENT_CODE ON)
endfunction(add_instance_library INSTANCE_NAME)
@@ -41,3 +42,73 @@ add_subdirectory(convnd_bwd_data)
add_subdirectory(grouped_gemm)
add_subdirectory(conv2d_bwd_weight)
add_subdirectory(batched_gemm_reduce)
+
+add_library(device_operations STATIC
+ $
+ $
+ $
+ $
+ $
+ $
+ $
+ $
+ $
+ $
+ $
+ $
+ $
+ $
+ $
+ $
+ $
+ device_conv2d.cpp
+)
+add_library(composablekernels::device_operations ALIAS device_operations)
+
+
+set(DEV_OPS_INC_DIRS
+ ${PROJECT_SOURCE_DIR}/include/ck/
+ ${PROJECT_SOURCE_DIR}/library/include/ck/
+ ${PROJECT_SOURCE_DIR}/external/include/
+)
+target_compile_features(device_operations PUBLIC)
+set_target_properties(device_operations PROPERTIES POSITION_INDEPENDENT_CODE ON)
+target_include_directories(device_operations PUBLIC
+ $
+ $
+ $
+ $
+ $
+ $
+ $
+ $
+ $
+ $
+ $
+ $
+ $
+ $
+ $
+ $
+)
+
+#once new arches are enabled make this an option on the main cmake file
+# and pass down here to be exported
+
+target_compile_options(device_operations
+PRIVATE --offload-arch=gfx908
+)
+# install(TARGETS device_operations LIBRARY DESTINATION lib)
+install(TARGETS device_operations
+ EXPORT device_operationsTargets
+ LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR}
+ ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR}
+ RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR}
+ INCLUDES DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}
+)
+install(DIRECTORY ${DEV_OPS_INC_DIRS} DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/ck)
+install(EXPORT device_operationsTargets
+ FILE composable_kerneldevice_operationsTargets.cmake
+ NAMESPACE composable_kernel::
+ DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/composable_kernel
+)
diff --git a/library/src/tensor_operation_instance/gpu/batched_gemm/CMakeLists.txt b/library/src/tensor_operation_instance/gpu/batched_gemm/CMakeLists.txt
index 35e24462b5..016c85f673 100644
--- a/library/src/tensor_operation_instance/gpu/batched_gemm/CMakeLists.txt
+++ b/library/src/tensor_operation_instance/gpu/batched_gemm/CMakeLists.txt
@@ -18,9 +18,9 @@ set(DEVICE_BATCHED_GEMM_INSTANCE_SOURCE
device_batched_gemm_xdl_int8_int8_int8_gkm_gnk_gmn_instance.cpp;
)
-add_library(device_batched_gemm_instance SHARED ${DEVICE_BATCHED_GEMM_INSTANCE_SOURCE})
-target_compile_features(device_batched_gemm_instance PUBLIC)
+add_library(device_batched_gemm_instance OBJECT ${DEVICE_BATCHED_GEMM_INSTANCE_SOURCE})
+# target_compile_features(device_batched_gemm_instance PUBLIC)
set_target_properties(device_batched_gemm_instance PROPERTIES POSITION_INDEPENDENT_CODE ON)
-install(TARGETS device_batched_gemm_instance LIBRARY DESTINATION lib)
+# install(TARGETS device_batched_gemm_instance LIBRARY DESTINATION lib)
clang_tidy_check(device_batched_gemm_instance)
diff --git a/library/src/tensor_operation_instance/gpu/batched_gemm_reduce/CMakeLists.txt b/library/src/tensor_operation_instance/gpu/batched_gemm_reduce/CMakeLists.txt
index 59eb6cb1cc..67a3c15d00 100644
--- a/library/src/tensor_operation_instance/gpu/batched_gemm_reduce/CMakeLists.txt
+++ b/library/src/tensor_operation_instance/gpu/batched_gemm_reduce/CMakeLists.txt
@@ -5,7 +5,8 @@ set(DEVICE_BATCHED_GEMM_REDUCE_INSTANCE_SOURCE
device_batched_gemm_reduce_xdl_cshuffle_f16_f16_f16_f32_f32_gkm_gnk_gmn_instance.cpp
)
-add_instance_library(device_batched_gemm_reduce_instance ${DEVICE_BATCHED_GEMM_REDUCE_INSTANCE_SOURCE})
-install(TARGETS device_batched_gemm_reduce_instance LIBRARY DESTINATION lib)
+add_instance_library(device_batched_gemm_reduce_instance OBJECT ${DEVICE_BATCHED_GEMM_REDUCE_INSTANCE_SOURCE})
+target_compile_features(device_batched_gemm_reduce_instance PUBLIC)
+set_target_properties(device_batched_gemm_reduce_instance PROPERTIES POSITION_INDEPENDENT_CODE ON)
clang_tidy_check(device_batched_gemm_reduce_instance)
diff --git a/library/src/tensor_operation_instance/gpu/conv1d_fwd/CMakeLists.txt b/library/src/tensor_operation_instance/gpu/conv1d_fwd/CMakeLists.txt
index 6c7c3e4f78..77aa6198f5 100644
--- a/library/src/tensor_operation_instance/gpu/conv1d_fwd/CMakeLists.txt
+++ b/library/src/tensor_operation_instance/gpu/conv1d_fwd/CMakeLists.txt
@@ -6,9 +6,9 @@ set(DEVICE_CONV1D_FWD_INSTANCE_SOURCE
device_conv1d_fwd_xdl_nwc_kxc_nwk_int8_instance.cpp;
)
-add_library(device_conv1d_fwd_instance SHARED ${DEVICE_CONV1D_FWD_INSTANCE_SOURCE})
-target_compile_features(device_conv1d_fwd_instance PUBLIC)
+add_library(device_conv1d_fwd_instance OBJECT ${DEVICE_CONV1D_FWD_INSTANCE_SOURCE})
+# target_compile_features(device_conv1d_fwd_instance PUBLIC)
set_target_properties(device_conv1d_fwd_instance PROPERTIES POSITION_INDEPENDENT_CODE ON)
-install(TARGETS device_conv1d_fwd_instance LIBRARY DESTINATION lib)
+# install(TARGETS device_conv1d_fwd_instance LIBRARY DESTINATION lib)
clang_tidy_check(device_conv1d_fwd_instance)
diff --git a/library/src/tensor_operation_instance/gpu/conv2d_bwd_data/CMakeLists.txt b/library/src/tensor_operation_instance/gpu/conv2d_bwd_data/CMakeLists.txt
index d619ef4bf1..d7882a7d8b 100644
--- a/library/src/tensor_operation_instance/gpu/conv2d_bwd_data/CMakeLists.txt
+++ b/library/src/tensor_operation_instance/gpu/conv2d_bwd_data/CMakeLists.txt
@@ -6,9 +6,7 @@ set(DEVICE_CONV2D_BWD_DATA_INSTANCE_SOURCE
device_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk_int8_instance.cpp;
)
-add_library(device_conv2d_bwd_data_instance SHARED ${DEVICE_CONV2D_BWD_DATA_INSTANCE_SOURCE})
-target_compile_features(device_conv2d_bwd_data_instance PUBLIC)
+add_library(device_conv2d_bwd_data_instance OBJECT ${DEVICE_CONV2D_BWD_DATA_INSTANCE_SOURCE})
set_target_properties(device_conv2d_bwd_data_instance PROPERTIES POSITION_INDEPENDENT_CODE ON)
-install(TARGETS device_conv2d_bwd_data_instance LIBRARY DESTINATION lib)
clang_tidy_check(device_conv2d_bwd_data_instance)
diff --git a/library/src/tensor_operation_instance/gpu/conv2d_bwd_weight/CMakeLists.txt b/library/src/tensor_operation_instance/gpu/conv2d_bwd_weight/CMakeLists.txt
index 6183e70b9b..7c384a882b 100644
--- a/library/src/tensor_operation_instance/gpu/conv2d_bwd_weight/CMakeLists.txt
+++ b/library/src/tensor_operation_instance/gpu/conv2d_bwd_weight/CMakeLists.txt
@@ -3,7 +3,7 @@ set(DEVICE_CONV2D_BWD_WEIGHT_INSTANCE_SOURCE
device_conv2d_bwd_weight_xdl_nhwc_kyxc_nhwk_f16_instance.cpp;
device_conv2d_bwd_weight_xdl_nhwc_kyxc_nhwk_f32_instance.cpp;
)
-add_library(device_conv2d_bwd_weight_instance SHARED ${DEVICE_CONV2D_BWD_WEIGHT_INSTANCE_SOURCE})
+add_library(device_conv2d_bwd_weight_instance OBJECT ${DEVICE_CONV2D_BWD_WEIGHT_INSTANCE_SOURCE})
target_compile_features(device_conv2d_bwd_weight_instance PUBLIC)
set_target_properties(device_conv2d_bwd_weight_instance PROPERTIES POSITION_INDEPENDENT_CODE ON)
install(TARGETS device_conv2d_bwd_weight_instance LIBRARY DESTINATION lib)
diff --git a/library/src/tensor_operation_instance/gpu/conv2d_fwd/CMakeLists.txt b/library/src/tensor_operation_instance/gpu/conv2d_fwd/CMakeLists.txt
index 7483861524..857e36d6f5 100644
--- a/library/src/tensor_operation_instance/gpu/conv2d_fwd/CMakeLists.txt
+++ b/library/src/tensor_operation_instance/gpu/conv2d_fwd/CMakeLists.txt
@@ -6,9 +6,7 @@ set(DEVICE_CONV2D_FWD_INSTANCE_SOURCE
device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_int8_instance.cpp;
device_conv2d_fwd_xdl_c_shuffle_nhwc_kyxc_nhwk_f16_instance.cpp;
)
-add_library(device_conv2d_fwd_instance SHARED ${DEVICE_CONV2D_FWD_INSTANCE_SOURCE})
-target_compile_features(device_conv2d_fwd_instance PUBLIC)
+add_library(device_conv2d_fwd_instance OBJECT ${DEVICE_CONV2D_FWD_INSTANCE_SOURCE})
set_target_properties(device_conv2d_fwd_instance PROPERTIES POSITION_INDEPENDENT_CODE ON)
-install(TARGETS device_conv2d_fwd_instance LIBRARY DESTINATION lib)
clang_tidy_check(device_conv2d_fwd_instance)
diff --git a/library/src/tensor_operation_instance/gpu/conv2d_fwd_bias_relu/CMakeLists.txt b/library/src/tensor_operation_instance/gpu/conv2d_fwd_bias_relu/CMakeLists.txt
index 27a9736a3f..ad66c73bf8 100644
--- a/library/src/tensor_operation_instance/gpu/conv2d_fwd_bias_relu/CMakeLists.txt
+++ b/library/src/tensor_operation_instance/gpu/conv2d_fwd_bias_relu/CMakeLists.txt
@@ -2,9 +2,7 @@
set(DEVICE_CONV2D_FWD_BIAS_RELU_INSTANCE_SOURCE
device_conv2d_fwd_xdl_c_shuffle_bias_relu_nhwc_kyxc_nhwk_f16_instance.cpp;
)
-add_library(device_conv2d_fwd_bias_relu_instance SHARED ${DEVICE_CONV2D_FWD_BIAS_RELU_INSTANCE_SOURCE})
-target_compile_features(device_conv2d_fwd_bias_relu_instance PUBLIC)
+add_library(device_conv2d_fwd_bias_relu_instance OBJECT ${DEVICE_CONV2D_FWD_BIAS_RELU_INSTANCE_SOURCE})
set_target_properties(device_conv2d_fwd_bias_relu_instance PROPERTIES POSITION_INDEPENDENT_CODE ON)
-install(TARGETS device_conv2d_fwd_bias_relu_instance LIBRARY DESTINATION lib)
clang_tidy_check(device_conv2d_fwd_bias_relu_instance)
diff --git a/library/src/tensor_operation_instance/gpu/conv2d_fwd_bias_relu_add/CMakeLists.txt b/library/src/tensor_operation_instance/gpu/conv2d_fwd_bias_relu_add/CMakeLists.txt
index d7bec82174..36b1f6c153 100644
--- a/library/src/tensor_operation_instance/gpu/conv2d_fwd_bias_relu_add/CMakeLists.txt
+++ b/library/src/tensor_operation_instance/gpu/conv2d_fwd_bias_relu_add/CMakeLists.txt
@@ -2,9 +2,7 @@
set(DEVICE_CONV2D_FWD_BIAS_RELU_ADD_INSTANCE_SOURCE
device_conv2d_fwd_xdl_c_shuffle_bias_relu_add_nhwc_kyxc_nhwk_f16_instance.cpp;
)
-add_library(device_conv2d_fwd_bias_relu_add_instance SHARED ${DEVICE_CONV2D_FWD_BIAS_RELU_ADD_INSTANCE_SOURCE})
-target_compile_features(device_conv2d_fwd_bias_relu_add_instance PUBLIC)
+add_library(device_conv2d_fwd_bias_relu_add_instance OBJECT ${DEVICE_CONV2D_FWD_BIAS_RELU_ADD_INSTANCE_SOURCE})
set_target_properties(device_conv2d_fwd_bias_relu_add_instance PROPERTIES POSITION_INDEPENDENT_CODE ON)
-install(TARGETS device_conv2d_fwd_bias_relu_add_instance LIBRARY DESTINATION lib)
clang_tidy_check(device_conv2d_fwd_bias_relu_add_instance)
diff --git a/library/src/tensor_operation_instance/gpu/conv2d_fwd_bias_relu_atomic_add/CMakeLists.txt b/library/src/tensor_operation_instance/gpu/conv2d_fwd_bias_relu_atomic_add/CMakeLists.txt
index c0942d5485..5906c7c5ac 100644
--- a/library/src/tensor_operation_instance/gpu/conv2d_fwd_bias_relu_atomic_add/CMakeLists.txt
+++ b/library/src/tensor_operation_instance/gpu/conv2d_fwd_bias_relu_atomic_add/CMakeLists.txt
@@ -3,9 +3,7 @@ set(DEVICE_CONV2D_FWD_BIAS_RELU_ATOMIC_ADD_INSTANCE_SOURCE
device_conv2d_fwd_xdl_c_shuffle_bias_relu_atomic_add_nhwc_kyxc_nhwk_f16_instance.cpp;
)
-add_library(device_conv2d_fwd_bias_relu_atomic_add_instance SHARED ${DEVICE_CONV2D_FWD_BIAS_RELU_ATOMIC_ADD_INSTANCE_SOURCE})
-target_compile_features(device_conv2d_fwd_bias_relu_atomic_add_instance PUBLIC)
+add_library(device_conv2d_fwd_bias_relu_atomic_add_instance OBJECT ${DEVICE_CONV2D_FWD_BIAS_RELU_ATOMIC_ADD_INSTANCE_SOURCE})
set_target_properties(device_conv2d_fwd_bias_relu_atomic_add_instance PROPERTIES POSITION_INDEPENDENT_CODE ON)
-install(TARGETS device_conv2d_fwd_bias_relu_atomic_add_instance LIBRARY DESTINATION lib)
clang_tidy_check(device_conv2d_fwd_bias_relu_atomic_add_instance)
diff --git a/library/src/tensor_operation_instance/gpu/conv3d_fwd/CMakeLists.txt b/library/src/tensor_operation_instance/gpu/conv3d_fwd/CMakeLists.txt
index f6849a7bb2..91a299c742 100644
--- a/library/src/tensor_operation_instance/gpu/conv3d_fwd/CMakeLists.txt
+++ b/library/src/tensor_operation_instance/gpu/conv3d_fwd/CMakeLists.txt
@@ -5,9 +5,8 @@ set(DEVICE_CONV3D_FWD_INSTANCE_SOURCE
device_conv3d_fwd_xdl_ndhwc_kzyxc_ndhwk_bf16_instance.cpp;
device_conv3d_fwd_xdl_ndhwc_kzyxc_ndhwk_int8_instance.cpp;
)
-add_library(device_conv3d_fwd_instance SHARED ${DEVICE_CONV3D_FWD_INSTANCE_SOURCE})
+add_library(device_conv3d_fwd_instance OBJECT ${DEVICE_CONV3D_FWD_INSTANCE_SOURCE})
target_compile_features(device_conv3d_fwd_instance PUBLIC)
set_target_properties(device_conv3d_fwd_instance PROPERTIES POSITION_INDEPENDENT_CODE ON)
-install(TARGETS device_conv3d_fwd_instance LIBRARY DESTINATION lib)
clang_tidy_check(device_conv3d_fwd_instance)
diff --git a/library/src/tensor_operation_instance/gpu/convnd_bwd_data/CMakeLists.txt b/library/src/tensor_operation_instance/gpu/convnd_bwd_data/CMakeLists.txt
index 9ee961ad74..037f860808 100644
--- a/library/src/tensor_operation_instance/gpu/convnd_bwd_data/CMakeLists.txt
+++ b/library/src/tensor_operation_instance/gpu/convnd_bwd_data/CMakeLists.txt
@@ -14,7 +14,7 @@ set(DEVICE_CONVND_BWD_DATA_INSTANCE_SOURCE
device_conv3d_bwd_data_xdl_ndhwc_kzyxc_ndhwk_int8_instance.cpp;
)
-add_library(device_convnd_bwd_data_instance SHARED ${DEVICE_CONVND_BWD_DATA_INSTANCE_SOURCE})
+add_library(device_convnd_bwd_data_instance OBJECT ${DEVICE_CONVND_BWD_DATA_INSTANCE_SOURCE})
target_compile_features(device_convnd_bwd_data_instance PUBLIC)
set_target_properties(device_convnd_bwd_data_instance PROPERTIES POSITION_INDEPENDENT_CODE ON)
install(TARGETS device_convnd_bwd_data_instance LIBRARY DESTINATION lib)
diff --git a/library/src/tensor_operation_instance/gpu/device_conv2d.cpp b/library/src/tensor_operation_instance/gpu/device_conv2d.cpp
new file mode 100644
index 0000000000..6b99433ffa
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/device_conv2d.cpp
@@ -0,0 +1,201 @@
+#include
+#include "config.hpp"
+#include "device_conv2d_fwd_xdl_c_shuffle_nhwc_kyxc_nhwk.hpp"
+#include "element_wise_operation.hpp"
+#include "device_operation_instance.hpp"
+#include "host_interface.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace device_conv2d_fwd_instance {
+using PassThrough = ck::tensor_operation::element_wise::PassThrough;
+void add_device_conv2d_fwd_xdl_c_shuffle_nhwc_kyxc_nhwk_f16_instances(
+ std::vector>& instances);
+void add_device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_f32_instances(
+ std::vector>& instances);
+void add_device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_bf16_instances(
+ std::vector>& instances);
+void add_device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_f16_instances(
+ std::vector>& instances);
+void add_device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_int8_instances(
+ std::vector>& instances);
+
+} // namespace device_conv2d_fwd_instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
+
+using PassThrough = ck::tensor_operation::element_wise::PassThrough;
+struct DeviceConvFwdPtr_t::DeviceConvFwdPtrImpl
+{
+ std::unique_ptr
+ MakeArgumentPointer(void* in_ptr,
+ void* wei_ptr,
+ void* out_ptr,
+ size_t N,
+ size_t K,
+ size_t C,
+ std::vector input_spatial_lengths,
+ std::vector filter_spatial_lengths,
+ std::vector output_spatial_lengths,
+ std::vector conv_filter_strides,
+ std::vector conv_filter_dilations,
+ std::vector input_left_pads,
+ std::vector input_right_pads) const
+ {
+ return el->MakeArgumentPointer(in_ptr,
+ wei_ptr,
+ out_ptr,
+ N,
+ K,
+ C,
+ input_spatial_lengths,
+ filter_spatial_lengths,
+ output_spatial_lengths,
+ conv_filter_strides,
+ conv_filter_dilations,
+ input_left_pads,
+ input_right_pads,
+ PassThrough{},
+ PassThrough{},
+ PassThrough{});
+ }
+ std::unique_ptr MakeInvokerPointer() const
+ {
+ return el->MakeInvokerPointer();
+ }
+
+ std::string GetTypeString() { return el->GetTypeString(); }
+ bool IsSupportedArgument(const DeviceConvFwdPtr_t::BaseArgument* arg)
+ {
+ return el->IsSupportedArgument(arg);
+ }
+
+ ck::tensor_operation::device::DeviceConvFwdPtr el;
+};
+
+DeviceConvFwdPtr_t::DeviceConvFwdPtr_t() : pImpl(nullptr) {}
+DeviceConvFwdPtr_t::~DeviceConvFwdPtr_t() = default;
+DeviceConvFwdPtr_t::DeviceConvFwdPtr_t(DeviceConvFwdPtr_t&&) = default;
+DeviceConvFwdPtr_t::DeviceConvFwdPtr_t(DeviceConvFwdPtr_t::DeviceConvFwdPtrImpl& other)
+ : pImpl(std::make_unique(std::move(other)))
+{
+}
+
+std::unique_ptr
+DeviceConvFwdPtr_t::MakeArgumentPointer(void* in_ptr,
+ void* wei_ptr,
+ void* out_ptr,
+ size_t N,
+ size_t K,
+ size_t C,
+ std::vector input_spatial_lengths,
+ std::vector filter_spatial_lengths,
+ std::vector output_spatial_lengths,
+ std::vector conv_filter_strides,
+ std::vector conv_filter_dilations,
+ std::vector input_left_pads,
+ std::vector input_right_pads) const
+{
+ return pImpl->MakeArgumentPointer(in_ptr,
+ wei_ptr,
+ out_ptr,
+ N,
+ K,
+ C,
+ input_spatial_lengths,
+ filter_spatial_lengths,
+ output_spatial_lengths,
+ conv_filter_strides,
+ conv_filter_dilations,
+ input_left_pads,
+ input_right_pads);
+}
+
+std::unique_ptr DeviceConvFwdPtr_t::MakeInvokerPointer() const
+{
+ return pImpl->MakeInvokerPointer();
+}
+
+std::string DeviceConvFwdPtr_t::GetTypeString() { return pImpl->GetTypeString(); }
+bool DeviceConvFwdPtr_t::IsSupportedArgument(const DeviceConvFwdPtr_t::BaseArgument* arg_ptr)
+{
+ return pImpl->IsSupportedArgument(arg_ptr);
+}
+
+using namespace ck::tensor_operation::device::device_conv2d_fwd_instance;
+void add_device_conv2d_fwd_xdl_c_shuffle_nhwc_kyxc_nhwk_f16_instances_t(
+ std::vector& instances)
+{
+ std::vector<
+ ck::tensor_operation::device::DeviceConvFwdPtr>
+ local_instances;
+ add_device_conv2d_fwd_xdl_c_shuffle_nhwc_kyxc_nhwk_f16_instances(local_instances);
+ for(auto& kinder : local_instances)
+ {
+ DeviceConvFwdPtr_t::DeviceConvFwdPtrImpl tmp{std::move(kinder)};
+ instances.emplace_back(tmp);
+ }
+ return;
+}
+
+void add_device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_f32_instances_t(
+ std::vector& instances)
+{
+ std::vector<
+ ck::tensor_operation::device::DeviceConvFwdPtr>
+ local_instances;
+ add_device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_f32_instances(local_instances);
+ for(auto& kinder : local_instances)
+ {
+ DeviceConvFwdPtr_t::DeviceConvFwdPtrImpl tmp{std::move(kinder)};
+ instances.emplace_back(tmp); // Perhaps we can do better
+ }
+ return;
+}
+
+void add_device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_bf16_instances_t(
+ std::vector& instances)
+{
+ std::vector<
+ ck::tensor_operation::device::DeviceConvFwdPtr>
+ local_instances;
+ add_device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_bf16_instances(local_instances);
+ for(auto& kinder : local_instances)
+ {
+ DeviceConvFwdPtr_t::DeviceConvFwdPtrImpl tmp{std::move(kinder)};
+ instances.emplace_back(tmp); // Perhaps we can do better
+ }
+ return;
+}
+
+void add_device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_f16_instances_t(
+ std::vector& instances)
+{
+ std::vector<
+ ck::tensor_operation::device::DeviceConvFwdPtr>
+ local_instances;
+ add_device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_f16_instances(local_instances);
+ for(auto& kinder : local_instances)
+ {
+ DeviceConvFwdPtr_t::DeviceConvFwdPtrImpl tmp{std::move(kinder)};
+ instances.emplace_back(tmp); // Perhaps we can do better
+ }
+ return;
+}
+
+void add_device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_int8_instances_t(
+ std::vector& instances)
+{
+ std::vector<
+ ck::tensor_operation::device::DeviceConvFwdPtr>
+ local_instances;
+ add_device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_int8_instances(local_instances);
+ for(auto& kinder : local_instances)
+ {
+ DeviceConvFwdPtr_t::DeviceConvFwdPtrImpl tmp{std::move(kinder)};
+ instances.emplace_back(tmp);
+ }
+ return;
+}
diff --git a/library/src/tensor_operation_instance/gpu/gemm/CMakeLists.txt b/library/src/tensor_operation_instance/gpu/gemm/CMakeLists.txt
index 5f057adcc5..556b06d7e1 100644
--- a/library/src/tensor_operation_instance/gpu/gemm/CMakeLists.txt
+++ b/library/src/tensor_operation_instance/gpu/gemm/CMakeLists.txt
@@ -35,10 +35,9 @@ set(DEVICE_GEMM_INSTANCE_SOURCE
device_gemm_xdl_splitk_f16_f16_f16_km_nk_mn_instance.cpp;
)
-add_library(device_gemm_instance SHARED ${DEVICE_GEMM_INSTANCE_SOURCE})
+add_library(device_gemm_instance OBJECT ${DEVICE_GEMM_INSTANCE_SOURCE})
target_compile_features(device_gemm_instance PUBLIC)
set_target_properties(device_gemm_instance PROPERTIES POSITION_INDEPENDENT_CODE ON)
-install(TARGETS device_gemm_instance LIBRARY DESTINATION lib)
clang_tidy_check(device_gemm_instance)
diff --git a/library/src/tensor_operation_instance/gpu/gemm_bias2d/CMakeLists.txt b/library/src/tensor_operation_instance/gpu/gemm_bias2d/CMakeLists.txt
index a0e5ba61a1..e2b0abb1d1 100644
--- a/library/src/tensor_operation_instance/gpu/gemm_bias2d/CMakeLists.txt
+++ b/library/src/tensor_operation_instance/gpu/gemm_bias2d/CMakeLists.txt
@@ -10,9 +10,7 @@ set(DEVICE_GEMM_BIAS2D_INSTANCE_SOURCE
device_gemm_xdl_c_shuffle_bias_2d_f16_f16_f16_mk_nk_mn_instance.cpp;
)
-add_library(device_gemm_bias2d_instance SHARED ${DEVICE_GEMM_BIAS2D_INSTANCE_SOURCE})
-target_compile_features(device_gemm_bias2d_instance PUBLIC)
+add_library(device_gemm_bias2d_instance OBJECT ${DEVICE_GEMM_BIAS2D_INSTANCE_SOURCE})
set_target_properties(device_gemm_bias2d_instance PROPERTIES POSITION_INDEPENDENT_CODE ON)
-install(TARGETS device_gemm_bias2d_instance LIBRARY DESTINATION lib)
clang_tidy_check(device_gemm_bias2d_instance)
diff --git a/library/src/tensor_operation_instance/gpu/gemm_bias_relu/CMakeLists.txt b/library/src/tensor_operation_instance/gpu/gemm_bias_relu/CMakeLists.txt
index 69e05673d6..e2e7d4badd 100644
--- a/library/src/tensor_operation_instance/gpu/gemm_bias_relu/CMakeLists.txt
+++ b/library/src/tensor_operation_instance/gpu/gemm_bias_relu/CMakeLists.txt
@@ -6,9 +6,7 @@ set(DEVICE_GEMM_BIAS_RELU_INSTANCE_SOURCE
device_gemm_xdl_c_shuffle_bias_relu_f16_f16_f16_km_nk_mn_instance.cpp;
)
-add_library(device_gemm_bias_relu_instance SHARED ${DEVICE_GEMM_BIAS_RELU_INSTANCE_SOURCE})
-target_compile_features(device_gemm_bias_relu_instance PUBLIC)
+add_library(device_gemm_bias_relu_instance OBJECT ${DEVICE_GEMM_BIAS_RELU_INSTANCE_SOURCE})
set_target_properties(device_gemm_bias_relu_instance PROPERTIES POSITION_INDEPENDENT_CODE ON)
-install(TARGETS device_gemm_bias_relu_instance LIBRARY DESTINATION lib)
clang_tidy_check(device_gemm_bias_relu_instance)
diff --git a/library/src/tensor_operation_instance/gpu/gemm_bias_relu_add/CMakeLists.txt b/library/src/tensor_operation_instance/gpu/gemm_bias_relu_add/CMakeLists.txt
index 016bc4be2d..a10dbb555d 100644
--- a/library/src/tensor_operation_instance/gpu/gemm_bias_relu_add/CMakeLists.txt
+++ b/library/src/tensor_operation_instance/gpu/gemm_bias_relu_add/CMakeLists.txt
@@ -6,9 +6,7 @@ set(DEVICE_GEMM_BIAS_RELU_ADD_INSTANCE_SOURCE
device_gemm_xdl_c_shuffle_bias_relu_add_f16_f16_f16_km_nk_mn_instance.cpp;
)
-add_library(device_gemm_bias_relu_add_instance SHARED ${DEVICE_GEMM_BIAS_RELU_ADD_INSTANCE_SOURCE})
-target_compile_features(device_gemm_bias_relu_add_instance PUBLIC)
+add_library(device_gemm_bias_relu_add_instance OBJECT ${DEVICE_GEMM_BIAS_RELU_ADD_INSTANCE_SOURCE})
set_target_properties(device_gemm_bias_relu_add_instance PROPERTIES POSITION_INDEPENDENT_CODE ON)
-install(TARGETS device_gemm_bias_relu_add_instance LIBRARY DESTINATION lib)
clang_tidy_check(device_gemm_bias_relu_add_instance)
diff --git a/library/src/tensor_operation_instance/gpu/grouped_gemm/CMakeLists.txt b/library/src/tensor_operation_instance/gpu/grouped_gemm/CMakeLists.txt
index 8f591d8c49..6c5e31fddd 100644
--- a/library/src/tensor_operation_instance/gpu/grouped_gemm/CMakeLists.txt
+++ b/library/src/tensor_operation_instance/gpu/grouped_gemm/CMakeLists.txt
@@ -6,7 +6,7 @@ set(DEVICE_GROUPED_GEMM_INSTANCE_SOURCE
device_grouped_gemm_xdl_f16_f16_f16_km_nk_mn_instance.cpp;
)
-add_library(device_grouped_gemm_instance SHARED ${DEVICE_GROUPED_GEMM_INSTANCE_SOURCE})
+add_library(device_grouped_gemm_instance OBJECT ${DEVICE_GROUPED_GEMM_INSTANCE_SOURCE})
target_compile_features(device_grouped_gemm_instance PUBLIC)
set_target_properties(device_grouped_gemm_instance PROPERTIES POSITION_INDEPENDENT_CODE ON)
diff --git a/library/src/tensor_operation_instance/gpu/reduce/CMakeLists.txt b/library/src/tensor_operation_instance/gpu/reduce/CMakeLists.txt
index cced3a4b76..81987ac0d4 100644
--- a/library/src/tensor_operation_instance/gpu/reduce/CMakeLists.txt
+++ b/library/src/tensor_operation_instance/gpu/reduce/CMakeLists.txt
@@ -38,9 +38,7 @@ set(DEVICE_REDUCE_INSTANCE_SOURCE
device_reduce_instance_multiblock_partial_reduce_b16_f32_b16.cpp;
)
-add_library(device_reduce_instance SHARED ${DEVICE_REDUCE_INSTANCE_SOURCE})
-target_compile_features(device_reduce_instance PUBLIC)
+add_library(device_reduce_instance OBJECT ${DEVICE_REDUCE_INSTANCE_SOURCE})
set_target_properties(device_reduce_instance PROPERTIES POSITION_INDEPENDENT_CODE ON)
-install(TARGETS device_reduce_instance LIBRARY DESTINATION lib)
clang_tidy_check(device_reduce_instance)
diff --git a/profiler/include/profile_batched_gemm_impl.hpp b/profiler/include/profile_batched_gemm_impl.hpp
index 7abbf7a042..3393110c33 100644
--- a/profiler/include/profile_batched_gemm_impl.hpp
+++ b/profiler/include/profile_batched_gemm_impl.hpp
@@ -63,7 +63,7 @@ template GetTypeString();
- float ave_time = invoker_ptr->Run(argument_ptr.get(), nrepeat);
+ float ave_time =
+ invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, time_kernel});
std::size_t flop = std::size_t(2) * BatchCount * M * N * K;
- std::size_t num_btype = (sizeof(ADataType) * M * K + sizeof(BDataType) * K * M +
+ std::size_t num_btype = (sizeof(ADataType) * M * K + sizeof(BDataType) * K * N +
sizeof(CDataType) * M * N) *
BatchCount;
diff --git a/profiler/include/profile_batched_gemm_reduce_impl.hpp b/profiler/include/profile_batched_gemm_reduce_impl.hpp
index a6399c20d8..bd74dbf459 100644
--- a/profiler/include/profile_batched_gemm_reduce_impl.hpp
+++ b/profiler/include/profile_batched_gemm_reduce_impl.hpp
@@ -53,7 +53,7 @@ template IsSupportedArgument(argument_ptr.get()))
{
- // warm up
- invoker_ptr->Run(argument_ptr.get());
+ // init DO, D1 to 0
+ d0_device_buf.SetZero();
+ d1_device_buf.SetZero();
- // timing
- float total_time = 0;
-
- for(int i = 0; i < nrepeat; ++i)
- {
- // init DO, D1 to 0
- d0_device_buf.SetZero();
- d1_device_buf.SetZero();
-
- KernelTimer timer;
-
- timer.Start();
-
- invoker_ptr->Run(argument_ptr.get());
-
- timer.End();
-
- total_time += timer.GetElapsedTime();
- }
-
- float ave_time = total_time / nrepeat;
+ float ave_time =
+ invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, time_kernel});
std::string gemm_name = gemm_ptr->GetTypeString();
diff --git a/profiler/include/profile_conv_bwd_data_impl.hpp b/profiler/include/profile_conv_bwd_data_impl.hpp
index bec97e40f5..dfec033737 100644
--- a/profiler/include/profile_conv_bwd_data_impl.hpp
+++ b/profiler/include/profile_conv_bwd_data_impl.hpp
@@ -51,7 +51,7 @@ template GetTypeString();
- float ave_time = invoker_ptr->Run(argument_ptr.get(), nrepeat);
+ float ave_time =
+ invoker_ptr->Run(argument_ptr.get(), StreamControl{nullptr, time_kernel});
std::size_t flop = std::size_t(2) * N * K * Ho * Wo * C * Y * X;
diff --git a/profiler/include/profile_conv_bwd_weight_impl.hpp b/profiler/include/profile_conv_bwd_weight_impl.hpp
index 20fe0ef549..8e3a4074b0 100644
--- a/profiler/include/profile_conv_bwd_weight_impl.hpp
+++ b/profiler/include/profile_conv_bwd_weight_impl.hpp
@@ -1,4 +1,6 @@
#pragma once
+
+#include "stream_config.hpp"
#include "config.hpp"
#include "device.hpp"
#include "host_tensor.hpp"
@@ -43,7 +45,7 @@ template MakeArgumentPointer(
static_cast(in_device_buf.GetDeviceBuffer()),
static_cast(wei_device_buf.GetDeviceBuffer()),
@@ -214,7 +218,8 @@ bool profile_conv_bwd_weight_impl(int do_verification,
{
std::string conv_name = conv_ptr->GetTypeString();
- float ave_time = invoker_ptr->Run(argument_ptr.get(), nrepeat);
+ float ave_time =
+ invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, time_kernel});
std::size_t flop = std::size_t(2) * N * K * Ho * Wo * C * Y * X;
@@ -242,6 +247,7 @@ bool profile_conv_bwd_weight_impl(int do_verification,
wei_device_buf.FromDevice(wei_k_c_y_x_device_result.mData.data());
float max_error = check_error(wei_k_c_y_x_host_result, wei_k_c_y_x_device_result);
+
if(max_error > 8)
{
pass = false;
diff --git a/profiler/include/profile_conv_fwd_bias_relu_add_impl.hpp b/profiler/include/profile_conv_fwd_bias_relu_add_impl.hpp
index d0de7307d2..5ea35cd72f 100644
--- a/profiler/include/profile_conv_fwd_bias_relu_add_impl.hpp
+++ b/profiler/include/profile_conv_fwd_bias_relu_add_impl.hpp
@@ -42,7 +42,7 @@ template GetTypeString();
- float ave_time = invoker_ptr->Run(argument_ptr.get(), nrepeat);
+ float ave_time =
+ invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, time_kernel});
std::size_t flop = std::size_t(2) * N * K * Ho * Wo * C * Y * X;
diff --git a/profiler/include/profile_conv_fwd_bias_relu_atomic_add_impl.hpp b/profiler/include/profile_conv_fwd_bias_relu_atomic_add_impl.hpp
index 9bdfa61283..f1c2fd300a 100644
--- a/profiler/include/profile_conv_fwd_bias_relu_atomic_add_impl.hpp
+++ b/profiler/include/profile_conv_fwd_bias_relu_atomic_add_impl.hpp
@@ -119,7 +119,7 @@ template GetTypeString();
- float ave_time = invoker_ptr->Run(argument_ptr.get(), nrepeat);
+ float ave_time =
+ invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, time_kernel});
std::size_t flop = std::size_t(2) * N * K * Ho * Wo * C * Y * X;
diff --git a/profiler/include/profile_conv_fwd_bias_relu_impl.hpp b/profiler/include/profile_conv_fwd_bias_relu_impl.hpp
index f34e52048e..eeb2b93e4e 100644
--- a/profiler/include/profile_conv_fwd_bias_relu_impl.hpp
+++ b/profiler/include/profile_conv_fwd_bias_relu_impl.hpp
@@ -41,7 +41,7 @@ template GetTypeString();
- float ave_time = invoker_ptr->Run(argument_ptr.get(), nrepeat);
+ float ave_time =
+ invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, time_kernel});
std::size_t flop = std::size_t(2) * N * K * Ho * Wo * C * Y * X;
diff --git a/profiler/include/profile_convnd_bwd_data_impl.hpp b/profiler/include/profile_convnd_bwd_data_impl.hpp
index 5b1ba71163..291bf2abc0 100644
--- a/profiler/include/profile_convnd_bwd_data_impl.hpp
+++ b/profiler/include/profile_convnd_bwd_data_impl.hpp
@@ -269,7 +269,7 @@ template GetTypeString();
- float ave_time = invoker_ptr->Run(argument_ptr.get(), nrepeat);
+ float ave_time =
+ invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, time_kernel});
std::size_t flop =
ck::utils::conv::get_flops(N, C, K, filter_spatial_lengths, output_spatial_lengths);
diff --git a/profiler/include/profile_gemm_bias_2d_impl.hpp b/profiler/include/profile_gemm_bias_2d_impl.hpp
index 98e4ad76c9..8565f9637c 100644
--- a/profiler/include/profile_gemm_bias_2d_impl.hpp
+++ b/profiler/include/profile_gemm_bias_2d_impl.hpp
@@ -65,7 +65,7 @@ template GetTypeString();
- float ave_time = invoker_ptr->Run(argument_ptr.get(), nrepeat);
+ float ave_time =
+ invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, time_kernel});
std::size_t flop = std::size_t(2) * M * N * K;
diff --git a/profiler/include/profile_gemm_bias_relu_add_impl.hpp b/profiler/include/profile_gemm_bias_relu_add_impl.hpp
index 75ed78075b..6fec17c199 100644
--- a/profiler/include/profile_gemm_bias_relu_add_impl.hpp
+++ b/profiler/include/profile_gemm_bias_relu_add_impl.hpp
@@ -48,7 +48,7 @@ template GetTypeString();
- float ave_time = invoker_ptr->Run(argument_ptr.get(), nrepeat);
+ float ave_time =
+ invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, time_kernel});
std::size_t flop = std::size_t(2) * M * N * K;
diff --git a/profiler/include/profile_gemm_bias_relu_impl.hpp b/profiler/include/profile_gemm_bias_relu_impl.hpp
index 0735f3c31b..69010becc5 100644
--- a/profiler/include/profile_gemm_bias_relu_impl.hpp
+++ b/profiler/include/profile_gemm_bias_relu_impl.hpp
@@ -48,7 +48,7 @@ template GetTypeString();
- float ave_time = invoker_ptr->Run(argument_ptr.get(), nrepeat);
+ float ave_time =
+ invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, time_kernel});
std::size_t flop = std::size_t(2) * M * N * K;
diff --git a/profiler/include/profile_gemm_impl.hpp b/profiler/include/profile_gemm_impl.hpp
index 93262fe802..45e6174260 100644
--- a/profiler/include/profile_gemm_impl.hpp
+++ b/profiler/include/profile_gemm_impl.hpp
@@ -91,7 +91,7 @@ template GetTypeString();
- float ave_time = invoker_ptr->Run(argument_ptr.get(), nrepeat);
+ float ave_time =
+ invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, time_kernel});
std::size_t flop = std::size_t(2) * M * N * K;
diff --git a/profiler/include/profile_gemm_reduce_impl.hpp b/profiler/include/profile_gemm_reduce_impl.hpp
index 6ef3e010b1..d034c9f750 100644
--- a/profiler/include/profile_gemm_reduce_impl.hpp
+++ b/profiler/include/profile_gemm_reduce_impl.hpp
@@ -52,7 +52,7 @@ template IsSupportedArgument(argument_ptr.get()))
{
- // warm up
- invoker_ptr->Run(argument_ptr.get());
+ // init DO, D1 to 0
+ d0_device_buf.SetZero();
+ d1_device_buf.SetZero();
- // timing
- float total_time = 0;
-
- for(int i = 0; i < nrepeat; ++i)
- {
- // init DO, D1 to 0
- d0_device_buf.SetZero();
- d1_device_buf.SetZero();
-
- KernelTimer timer;
-
- timer.Start();
-
- invoker_ptr->Run(argument_ptr.get());
-
- timer.End();
-
- total_time += timer.GetElapsedTime();
- }
-
- float ave_time = total_time / nrepeat;
+ float ave_time =
+ invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, time_kernel});
std::string gemm_name = gemm_ptr->GetTypeString();
std::size_t flop = std::size_t(2) * M * N * K;
- std::size_t num_btype = sizeof(ADataType) * M * K + sizeof(BDataType) * K * M +
+ std::size_t num_btype = sizeof(ADataType) * M * K + sizeof(BDataType) * K * N +
sizeof(CDataType) * M * N + sizeof(CDataType) * N;
float tflops = static_cast(flop) / 1.E9 / ave_time;
diff --git a/profiler/include/profile_grouped_gemm_impl.hpp b/profiler/include/profile_grouped_gemm_impl.hpp
index ae70f551f1..96d34c7e42 100644
--- a/profiler/include/profile_grouped_gemm_impl.hpp
+++ b/profiler/include/profile_grouped_gemm_impl.hpp
@@ -49,7 +49,7 @@ template & Ms,
const std::vector& Ns,
const std::vector& Ks,
@@ -231,7 +231,8 @@ void profile_grouped_gemm_impl(int do_verification,
{
std::string gemm_name = gemm_ptr->GetTypeString();
- float ave_time = invoker_ptr->Run(argument_ptr.get(), nrepeat);
+ float ave_time =
+ invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, time_kernel});
std::size_t flop = 0, num_btype = 0;
for(std::size_t i = 0; i < gemm_shapes.size(); i++)
diff --git a/profiler/include/profile_reduce_impl.hpp b/profiler/include/profile_reduce_impl.hpp
index 678134f60b..33c7929ddd 100644
--- a/profiler/include/profile_reduce_impl.hpp
+++ b/profiler/include/profile_reduce_impl.hpp
@@ -157,7 +157,7 @@ void profile_reduce_impl_impl(bool do_verification,
int init_method,
bool do_log,
bool do_dumpout,
- int nrepeat,
+ bool time_kernel,
const std::vector& inLengths,
const std::vector& reduceDims,
float alpha,
@@ -430,7 +430,8 @@ void profile_reduce_impl_impl(bool do_verification,
auto invoker_ptr = reduce_ptr->MakeInvokerPointer();
- float avg_time = invoker_ptr->Run(argument_ptr.get(), nrepeat);
+ float avg_time =
+ invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, time_kernel});
std::size_t num_bytes =
invariant_total_length * reduce_total_length * sizeof(InDataType) +
@@ -516,7 +517,8 @@ void profile_reduce_impl_impl(bool do_verification,
auto invoker_ptr = reduce_ptr->MakeInvokerPointer();
- float avg_time = invoker_ptr->Run(argument_ptr.get(), nrepeat);
+ float avg_time =
+ invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, time_kernel});
std::size_t num_bytes =
invariant_total_length * reduce_total_length * sizeof(InDataType) +
@@ -554,7 +556,8 @@ void profile_reduce_impl_impl(bool do_verification,
auto invoker2_ptr = reduce2_ptr->MakeInvokerPointer();
- float avg_time_2 = invoker2_ptr->Run(argument2_ptr.get(), nrepeat);
+ float avg_time_2 =
+ invoker2_ptr->Run(argument2_ptr.get(), StreamConfig{nullptr, time_kernel});
std::size_t num_bytes_2 =
static_cast(inLengths2[0]) * inLengths2[1] * sizeof(AccDataType);
@@ -625,7 +628,7 @@ void profile_reduce_impl(bool do_verification,
int init_method,
bool do_log,
bool do_dumpout,
- int nrepeat,
+ bool time_kernel,
const std::vector& inLengths,
const std::vector& reduceDims,
ReduceTensorOp ReduceOpId,
@@ -663,7 +666,7 @@ void profile_reduce_impl(bool do_verification,
init_method,
do_log,
do_dumpout,
- nrepeat,
+ time_kernel,
inLengths,
reduceDims,
alpha,
diff --git a/profiler/src/profile_batched_gemm.cpp b/profiler/src/profile_batched_gemm.cpp
index 2a806b0818..db5486e0ac 100644
--- a/profiler/src/profile_batched_gemm.cpp
+++ b/profiler/src/profile_batched_gemm.cpp
@@ -48,8 +48,8 @@ int profile_batched_gemm(int argc, char* argv[])
printf(" 3: A[g, k, m] * B[g, n, k] = C[g, m, n])\n");
printf("arg4: verification (0: no; 1: yes)\n");
printf("arg5: initialization (0: no init; 1: integer value; 2: decimal value)\n");
- printf("arg8: print tensor value (0: no; 1: yes)\n");
- printf("arg7: run kernel # of times (>1)\n");
+ printf("arg6: print tensor value (0: no; 1: yes)\n");
+ printf("arg7: time kernel (0=n0, 1=yes)\n");
printf("arg8 to 14: M, N, K, StrideA, StrideB, StrideC, BatchCount\n");
exit(1);
}
@@ -59,7 +59,7 @@ int profile_batched_gemm(int argc, char* argv[])
const bool do_verification = std::stoi(argv[4]);
const int init_method = std::stoi(argv[5]);
const bool do_log = std::stoi(argv[6]);
- const int nrepeat = std::stoi(argv[7]);
+ const bool time_kernel = std::stoi(argv[7]);
const int M = std::stoi(argv[8]);
const int N = std::stoi(argv[9]);
@@ -82,7 +82,7 @@ int profile_batched_gemm(int argc, char* argv[])
do_verification,
init_method,
do_log,
- nrepeat,
+ time_kernel,
M,
N,
K,
@@ -102,7 +102,7 @@ int profile_batched_gemm(int argc, char* argv[])
do_verification,
init_method,
do_log,
- nrepeat,
+ time_kernel,
M,
N,
K,
@@ -122,7 +122,7 @@ int profile_batched_gemm(int argc, char* argv[])
do_verification,
init_method,
do_log,
- nrepeat,
+ time_kernel,
M,
N,
K,
@@ -142,7 +142,7 @@ int profile_batched_gemm(int argc, char* argv[])
do_verification,
init_method,
do_log,
- nrepeat,
+ time_kernel,
M,
N,
K,
@@ -162,7 +162,7 @@ int profile_batched_gemm(int argc, char* argv[])
do_verification,
init_method,
do_log,
- nrepeat,
+ time_kernel,
M,
N,
K,
@@ -182,7 +182,7 @@ int profile_batched_gemm(int argc, char* argv[])
do_verification,
init_method,
do_log,
- nrepeat,
+ time_kernel,
M,
N,
K,
@@ -202,7 +202,7 @@ int profile_batched_gemm(int argc, char* argv[])
do_verification,
init_method,
do_log,
- nrepeat,
+ time_kernel,
M,
N,
K,
@@ -222,7 +222,7 @@ int profile_batched_gemm(int argc, char* argv[])
do_verification,
init_method,
do_log,
- nrepeat,
+ time_kernel,
M,
N,
K,
@@ -242,7 +242,7 @@ int profile_batched_gemm(int argc, char* argv[])
do_verification,
init_method,
do_log,
- nrepeat,
+ time_kernel,
M,
N,
K,
@@ -262,7 +262,7 @@ int profile_batched_gemm(int argc, char* argv[])
do_verification,
init_method,
do_log,
- nrepeat,
+ time_kernel,
M,
N,
K,
@@ -282,7 +282,7 @@ int profile_batched_gemm(int argc, char* argv[])
do_verification,
init_method,
do_log,
- nrepeat,
+ time_kernel,
M,
N,
K,
@@ -302,7 +302,7 @@ int profile_batched_gemm(int argc, char* argv[])
do_verification,
init_method,
do_log,
- nrepeat,
+ time_kernel,
M,
N,
K,
@@ -322,7 +322,7 @@ int profile_batched_gemm(int argc, char* argv[])
do_verification,
init_method,
do_log,
- nrepeat,
+ time_kernel,
M,
N,
K,
@@ -342,7 +342,7 @@ int profile_batched_gemm(int argc, char* argv[])
do_verification,
init_method,
do_log,
- nrepeat,
+ time_kernel,
M,
N,
K,
@@ -362,7 +362,7 @@ int profile_batched_gemm(int argc, char* argv[])
do_verification,
init_method,
do_log,
- nrepeat,
+ time_kernel,
M,
N,
K,
@@ -382,7 +382,7 @@ int profile_batched_gemm(int argc, char* argv[])
do_verification,
init_method,
do_log,
- nrepeat,
+ time_kernel,
M,
N,
K,
diff --git a/profiler/src/profile_batched_gemm_reduce.cpp b/profiler/src/profile_batched_gemm_reduce.cpp
index 38c3f52193..f67e561865 100644
--- a/profiler/src/profile_batched_gemm_reduce.cpp
+++ b/profiler/src/profile_batched_gemm_reduce.cpp
@@ -33,8 +33,8 @@ int profile_batched_gemm_reduce(int argc, char* argv[])
printf(" 3: A[k, m] * B[n, k] = C[m, n])\n");
printf("arg4: verification (0: no; 1: yes)\n");
printf("arg5: initialization (0: no init; 1: integer value; 2: decimal value)\n");
- printf("arg8: print tensor value (0: no; 1: yes)\n");
- printf("arg7: run kernel # of times (>1)\n");
+ printf("arg6: print tensor value (0: no; 1: yes)\n");
+ printf("arg7: time kernel (0=n0, 1=yes)\n");
printf("arg8 to 14: M, N, K, StrideA, StrideB, StrideC, BatchCount\n");
printf("arg15: split k into mulitiple batch\n");
exit(1);
@@ -45,7 +45,7 @@ int profile_batched_gemm_reduce(int argc, char* argv[])
const bool do_verification = std::stoi(argv[4]);
const int init_method = std::stoi(argv[5]);
const bool do_log = std::stoi(argv[6]);
- const int nrepeat = std::stoi(argv[7]);
+ const bool time_kernel = std::stoi(argv[7]);
const int M = std::stoi(argv[8]);
const int N = std::stoi(argv[9]);
@@ -69,7 +69,7 @@ int profile_batched_gemm_reduce(int argc, char* argv[])
do_verification,
init_method,
do_log,
- nrepeat,
+ time_kernel,
M,
N,
K,
@@ -91,7 +91,7 @@ int profile_batched_gemm_reduce(int argc, char* argv[])
do_verification,
init_method,
do_log,
- nrepeat,
+ time_kernel,
M,
N,
K,
@@ -113,7 +113,7 @@ int profile_batched_gemm_reduce(int argc, char* argv[])
do_verification,
init_method,
do_log,
- nrepeat,
+ time_kernel,
M,
N,
K,
@@ -135,7 +135,7 @@ int profile_batched_gemm_reduce(int argc, char* argv[])
do_verification,
init_method,
do_log,
- nrepeat,
+ time_kernel,
M,
N,
K,
diff --git a/profiler/src/profile_conv_bwd_data.cpp b/profiler/src/profile_conv_bwd_data.cpp
index 2861af3d10..206d486ea0 100644
--- a/profiler/src/profile_conv_bwd_data.cpp
+++ b/profiler/src/profile_conv_bwd_data.cpp
@@ -44,7 +44,7 @@ int profile_conv_bwd_data(int argc, char* argv[])
printf("arg6: verification (0: no; 1: yes)\n");
printf("arg7: initialization (0: no init; 1: integer value; 2: decimal value)\n");
printf("arg8: print tensor value (0: no; 1: yes)\n");
- printf("arg9: run kernel # of times (>1)\n");
+ printf("arg9: time kernel (0=n0, 1=yes)\n");
printf("arg10 to 24: N, K, C, Y, X, Hi, Wi, Sy, Sx, Dy, Dx, LeftPy, LeftPx, RightPy, "
"RightPx\n");
exit(1);
@@ -57,7 +57,7 @@ int profile_conv_bwd_data(int argc, char* argv[])
const bool do_verification = std::stoi(argv[6]);
const int init_method = std::stoi(argv[7]);
const bool do_log = std::stoi(argv[8]);
- const int nrepeat = std::stoi(argv[9]);
+ const bool time_kernel = std::stoi(argv[9]);
const ck::index_t N = std::stoi(argv[10]);
const ck::index_t K = std::stoi(argv[11]);
@@ -96,7 +96,7 @@ int profile_conv_bwd_data(int argc, char* argv[])
do_verification,
init_method,
do_log,
- nrepeat,
+ StreamControl{nullptr, time_kernel},
N,
K,
C,
@@ -122,7 +122,7 @@ int profile_conv_bwd_data(int argc, char* argv[])
do_verification,
init_method,
do_log,
- nrepeat,
+ StreamControl{nullptr, time_kernel},
N,
K,
C,
@@ -148,7 +148,7 @@ int profile_conv_bwd_data(int argc, char* argv[])
do_verification,
init_method,
do_log,
- nrepeat,
+ StreamControl{nullptr, time_kernel},
N,
K,
C,
@@ -174,7 +174,7 @@ int profile_conv_bwd_data(int argc, char* argv[])
do_verification,
init_method,
do_log,
- nrepeat,
+ StreamControl{nullptr, time_kernel},
N,
K,
C,
diff --git a/profiler/src/profile_conv_bwd_weight.cpp b/profiler/src/profile_conv_bwd_weight.cpp
index 309cc8ea2c..c022d19ee0 100644
--- a/profiler/src/profile_conv_bwd_weight.cpp
+++ b/profiler/src/profile_conv_bwd_weight.cpp
@@ -58,7 +58,7 @@ int profile_conv_bwd_weight(int argc, char* argv[])
const bool do_verification = std::stoi(argv[6]);
const int init_method = std::stoi(argv[7]);
const bool do_log = std::stoi(argv[8]);
- const int nrepeat = std::stoi(argv[9]);
+ const bool time_kernel = std::stoi(argv[9]);
const ck::index_t N = std::stoi(argv[10]);
const ck::index_t K = std::stoi(argv[11]);
@@ -98,7 +98,7 @@ int profile_conv_bwd_weight(int argc, char* argv[])
do_verification,
init_method,
do_log,
- nrepeat,
+ time_kernel,
N,
K,
C,
@@ -124,7 +124,7 @@ int profile_conv_bwd_weight(int argc, char* argv[])
do_verification,
init_method,
do_log,
- nrepeat,
+ time_kernel,
N,
K,
C,
diff --git a/profiler/src/profile_conv_fwd_bias_relu.cpp b/profiler/src/profile_conv_fwd_bias_relu.cpp
index 1c447b483e..28aa49687f 100644
--- a/profiler/src/profile_conv_fwd_bias_relu.cpp
+++ b/profiler/src/profile_conv_fwd_bias_relu.cpp
@@ -42,7 +42,7 @@ int profile_conv_fwd_bias_relu(int argc, char* argv[])
printf("arg6: verification (0: no; 1: yes)\n");
printf("arg7: initialization (0: no init; 1: integer value; 2: decimal value)\n");
printf("arg8: print tensor value (0: no; 1: yes)\n");
- printf("arg9: run kernel # of times (>1)\n");
+ printf("arg9: time kernel (0=n0, 1=yes)\n");
printf("arg10 to 24: N, K, C, Y, X, Hi, Wi, Sy, Sx, Dy, Dx, LeftPy, LeftPx, RightPy, "
"RightPx\n");
exit(1);
@@ -55,7 +55,7 @@ int profile_conv_fwd_bias_relu(int argc, char* argv[])
const bool do_verification = std::stoi(argv[6]);
const int init_method = std::stoi(argv[7]);
const bool do_log = std::stoi(argv[8]);
- const int nrepeat = std::stoi(argv[9]);
+ const bool time_kernel = std::stoi(argv[9]);
const ck::index_t N = std::stoi(argv[10]);
const ck::index_t K = std::stoi(argv[11]);
@@ -93,7 +93,7 @@ int profile_conv_fwd_bias_relu(int argc, char* argv[])
do_verification,
init_method,
do_log,
- nrepeat,
+ time_kernel,
N,
K,
C,
diff --git a/profiler/src/profile_conv_fwd_bias_relu_add.cpp b/profiler/src/profile_conv_fwd_bias_relu_add.cpp
index 522487c77b..7e033a51e2 100644
--- a/profiler/src/profile_conv_fwd_bias_relu_add.cpp
+++ b/profiler/src/profile_conv_fwd_bias_relu_add.cpp
@@ -43,7 +43,7 @@ int profile_conv_fwd_bias_relu_add(int argc, char* argv[])
printf("arg6: verification (0: no; 1: yes)\n");
printf("arg7: initialization (0: no init; 1: integer value; 2: decimal value)\n");
printf("arg8: print tensor value (0: no; 1: yes)\n");
- printf("arg9: run kernel # of times (>1)\n");
+ printf("arg9: time kernel (0=n0, 1=yes)\n");
printf("arg10 to 24: N, K, C, Y, X, Hi, Wi, Sy, Sx, Dy, Dx, LeftPy, LeftPx, RightPy, "
"RightPx\n");
exit(1);
@@ -56,7 +56,7 @@ int profile_conv_fwd_bias_relu_add(int argc, char* argv[])
const bool do_verification = std::stoi(argv[6]);
const int init_method = std::stoi(argv[7]);
const bool do_log = std::stoi(argv[8]);
- const int nrepeat = std::stoi(argv[9]);
+ const bool time_kernel = std::stoi(argv[9]);
const ck::index_t N = std::stoi(argv[10]);
const ck::index_t K = std::stoi(argv[11]);
@@ -94,7 +94,7 @@ int profile_conv_fwd_bias_relu_add(int argc, char* argv[])
do_verification,
init_method,
do_log,
- nrepeat,
+ time_kernel,
N,
K,
C,
diff --git a/profiler/src/profile_conv_fwd_bias_relu_atomic_add.cpp b/profiler/src/profile_conv_fwd_bias_relu_atomic_add.cpp
index 833f2851db..095536f701 100644
--- a/profiler/src/profile_conv_fwd_bias_relu_atomic_add.cpp
+++ b/profiler/src/profile_conv_fwd_bias_relu_atomic_add.cpp
@@ -43,7 +43,7 @@ int profile_conv_fwd_bias_relu_atomic_add(int argc, char* argv[])
printf("arg6: verification (0: no; 1: yes)\n");
printf("arg7: initialization (0: no init; 1: integer value; 2: decimal value)\n");
printf("arg8: print tensor value (0: no; 1: yes)\n");
- printf("arg9: run kernel # of times (>1)\n");
+ printf("arg9: time kernel (0=n0, 1=yes)\n");
printf("arg10 to 24: N, K, C, Y, X, Hi, Wi, Sy, Sx, Dy, Dx, LeftPy, LeftPx, RightPy, "
"RightPx\n");
exit(1);
@@ -56,7 +56,7 @@ int profile_conv_fwd_bias_relu_atomic_add(int argc, char* argv[])
const bool do_verification = std::stoi(argv[6]);
const int init_method = std::stoi(argv[7]);
const bool do_log = std::stoi(argv[8]);
- const int nrepeat = std::stoi(argv[9]);
+ const bool time_kernel = std::stoi(argv[9]);
const ck::index_t N = std::stoi(argv[10]);
const ck::index_t K = std::stoi(argv[11]);
@@ -95,7 +95,7 @@ int profile_conv_fwd_bias_relu_atomic_add(int argc, char* argv[])
do_verification,
init_method,
do_log,
- nrepeat,
+ time_kernel,
N,
K,
C,
diff --git a/profiler/src/profile_convnd_bwd_data.cpp b/profiler/src/profile_convnd_bwd_data.cpp
index 4d6b9a7b37..5d0e6a34c7 100644
--- a/profiler/src/profile_convnd_bwd_data.cpp
+++ b/profiler/src/profile_convnd_bwd_data.cpp
@@ -95,7 +95,7 @@ int profile_convnd_bwd_data(int argc, char* argv[], int num_dim_spatial)
printf("arg6: verification (0: no; 1: yes)\n");
printf("arg7: initialization (0: no init; 1: integer value; 2: decimal value)\n");
printf("arg8: print tensor value (0: no; 1: yes)\n");
- printf("arg9: run kernel # of times (>1)\n");
+ printf("arg9: time kernel (0=n0, 1=yes)\n");
printf("arg10 to 24: N, K, C, Y, X, Hi, Wi, Sy, Sx, Dy, Dx, LeftPy, LeftPx, RightPy, "
"RightPx\n");
return 1;
@@ -108,7 +108,7 @@ int profile_convnd_bwd_data(int argc, char* argv[], int num_dim_spatial)
const bool do_verification = std::stoi(argv[6]);
const int init_method = std::stoi(argv[7]);
const bool do_log = std::stoi(argv[8]);
- const int nrepeat = std::stoi(argv[9]);
+ const bool time_kernel = std::stoi(argv[9]);
ck::utils::conv::ConvParams params = parse_conv_params(num_dim_spatial, argv, preParams);
@@ -132,7 +132,7 @@ int profile_convnd_bwd_data(int argc, char* argv[], int num_dim_spatial)
do_verification,
init_method,
do_log,
- nrepeat,
+ time_kernel,
params.N_,
params.K_,
params.C_,
@@ -157,7 +157,7 @@ int profile_convnd_bwd_data(int argc, char* argv[], int num_dim_spatial)
do_verification,
init_method,
do_log,
- nrepeat,
+ time_kernel,
params.N_,
params.K_,
params.C_,
@@ -182,7 +182,7 @@ int profile_convnd_bwd_data(int argc, char* argv[], int num_dim_spatial)
do_verification,
init_method,
do_log,
- nrepeat,
+ time_kernel,
params.N_,
params.K_,
params.C_,
diff --git a/profiler/src/profile_convnd_fwd.cpp b/profiler/src/profile_convnd_fwd.cpp
index 7902cdb002..722e86c2ea 100644
--- a/profiler/src/profile_convnd_fwd.cpp
+++ b/profiler/src/profile_convnd_fwd.cpp
@@ -119,7 +119,7 @@ template ::template Get(),
- nrepeat,
+ time_kernel,
do_verification,
do_log);
@@ -201,7 +201,7 @@ void profile_convnd_instances(ConvDataType data_type,
const ck::utils::conv::ConvParams& params,
bool do_verification,
bool do_log,
- int nrepeat,
+ bool time_kernel,
int init_method)
{
switch(data_layout)
@@ -214,7 +214,7 @@ void profile_convnd_instances(ConvDataType data_type,
params,
do_verification,
do_log,
- nrepeat,
+ time_kernel,
init_method,
ConvolutionLayouts{});
break;
@@ -223,7 +223,7 @@ void profile_convnd_instances(ConvDataType data_type,
params,
do_verification,
do_log,
- nrepeat,
+ time_kernel,
init_method,
ConvolutionLayouts{});
break;
@@ -232,7 +232,7 @@ void profile_convnd_instances(ConvDataType data_type,
params,
do_verification,
do_log,
- nrepeat,
+ time_kernel,
init_method,
ConvolutionLayouts{});
break;
@@ -241,7 +241,7 @@ void profile_convnd_instances(ConvDataType data_type,
params,
do_verification,
do_log,
- nrepeat,
+ time_kernel,
init_method,
ConvolutionLayouts{});
break;
@@ -256,7 +256,7 @@ void profile_convnd_instances(ConvDataType data_type,
params,
do_verification,
do_log,
- nrepeat,
+ time_kernel,
init_method,
ConvolutionLayouts{});
break;
@@ -265,7 +265,7 @@ void profile_convnd_instances(ConvDataType data_type,
params,
do_verification,
do_log,
- nrepeat,
+ time_kernel,
init_method,
ConvolutionLayouts{});
break;
@@ -274,7 +274,7 @@ void profile_convnd_instances(ConvDataType data_type,
params,
do_verification,
do_log,
- nrepeat,
+ time_kernel,
init_method,
ConvolutionLayouts{});
break;
@@ -283,7 +283,7 @@ void profile_convnd_instances(ConvDataType data_type,
params,
do_verification,
do_log,
- nrepeat,
+ time_kernel,
init_method,
ConvolutionLayouts{});
break;
@@ -304,7 +304,7 @@ int ck::profiler::profile_convnd_fwd(int argc, char* argv[])
bool do_verification{true};
int init_method{2};
bool do_log{false};
- int nrepeat{100};
+ bool time_kernel{false};
int num_dim_spatial{2};
ConvParams params;
@@ -318,7 +318,7 @@ int ck::profiler::profile_convnd_fwd(int argc, char* argv[])
do_verification = std::stoi(argv[4]);
init_method = std::stoi(argv[5]);
do_log = std::stoi(argv[6]);
- nrepeat = std::stoi(argv[7]);
+ time_kernel = std::stoi(argv[7]);
num_dim_spatial = std::stoi(argv[8]);
}
if(argc >= 10)
@@ -332,15 +332,15 @@ int ck::profiler::profile_convnd_fwd(int argc, char* argv[])
{
case 1:
profile_convnd_instances<1>(
- data_type, data_layout, params, do_verification, do_log, nrepeat, init_method);
+ data_type, data_layout, params, do_verification, do_log, time_kernel, init_method);
break;
case 2:
profile_convnd_instances<2>(
- data_type, data_layout, params, do_verification, do_log, nrepeat, init_method);
+ data_type, data_layout, params, do_verification, do_log, time_kernel, init_method);
break;
case 3:
profile_convnd_instances<3>(
- data_type, data_layout, params, do_verification, do_log, nrepeat, init_method);
+ data_type, data_layout, params, do_verification, do_log, time_kernel, init_method);
break;
default:
throw std::runtime_error("profile_conv_fwd: unsupported num_dim_spatial value: " +
diff --git a/profiler/src/profile_gemm.cpp b/profiler/src/profile_gemm.cpp
index 7a72be2d8e..4c6a3b0487 100644
--- a/profiler/src/profile_gemm.cpp
+++ b/profiler/src/profile_gemm.cpp
@@ -38,8 +38,8 @@ int profile_gemm(int argc, char* argv[])
printf(" 3: A[k, m] * B[n, k] = C[m, n])\n");
printf("arg4: verification (0: no; 1: yes)\n");
printf("arg5: initialization (0: no init; 1: integer value; 2: decimal value)\n");
- printf("arg8: print tensor value (0: no; 1: yes)\n");
- printf("arg7: run kernel # of times (>1)\n");
+ printf("arg6: print tensor value (0: no; 1: yes)\n");
+ printf("arg7: time kernel (0=n0, 1=yes)\n");
printf("arg8 to 13: M, N, K, StrideA, StrideB, StrideC\n");
printf("arg14: split k into mulitiple batch\n");
exit(1);
@@ -50,7 +50,7 @@ int profile_gemm(int argc, char* argv[])
const bool do_verification = std::stoi(argv[4]);
const int init_method = std::stoi(argv[5]);
const bool do_log = std::stoi(argv[6]);
- const int nrepeat = std::stoi(argv[7]);
+ const bool time_kernel = std::stoi(argv[7]);
const int M = std::stoi(argv[8]);
const int N = std::stoi(argv[9]);
@@ -74,7 +74,7 @@ int profile_gemm(int argc, char* argv[])
do_verification,
init_method,
do_log,
- nrepeat,
+ time_kernel,
M,
N,
K,
@@ -94,7 +94,7 @@ int profile_gemm(int argc, char* argv[])
do_verification,
init_method,
do_log,
- nrepeat,
+ time_kernel,
M,
N,
K,
@@ -114,7 +114,7 @@ int profile_gemm(int argc, char* argv[])
do_verification,
init_method,
do_log,
- nrepeat,
+ time_kernel,
M,
N,
K,
@@ -134,7 +134,7 @@ int profile_gemm(int argc, char* argv[])
do_verification,
init_method,
do_log,
- nrepeat,
+ time_kernel,
M,
N,
K,
@@ -154,7 +154,7 @@ int profile_gemm(int argc, char* argv[])
do_verification,
init_method,
do_log,
- nrepeat,
+ time_kernel,
M,
N,
K,
@@ -174,7 +174,7 @@ int profile_gemm(int argc, char* argv[])
do_verification,
init_method,
do_log,
- nrepeat,
+ time_kernel,
M,
N,
K,
@@ -194,7 +194,7 @@ int profile_gemm(int argc, char* argv[])
do_verification,
init_method,
do_log,
- nrepeat,
+ time_kernel,
M,
N,
K,
@@ -214,7 +214,7 @@ int profile_gemm(int argc, char* argv[])
do_verification,
init_method,
do_log,
- nrepeat,
+ time_kernel,
M,
N,
K,
@@ -234,7 +234,7 @@ int profile_gemm(int argc, char* argv[])
do_verification,
init_method,
do_log,
- nrepeat,
+ time_kernel,
M,
N,
K,
@@ -254,7 +254,7 @@ int profile_gemm(int argc, char* argv[])
do_verification,
init_method,
do_log,
- nrepeat,
+ time_kernel,
M,
N,
K,
@@ -274,7 +274,7 @@ int profile_gemm(int argc, char* argv[])
do_verification,
init_method,
do_log,
- nrepeat,
+ time_kernel,
M,
N,
K,
@@ -294,7 +294,7 @@ int profile_gemm(int argc, char* argv[])
do_verification,
init_method,
do_log,
- nrepeat,
+ time_kernel,
M,
N,
K,
@@ -314,7 +314,7 @@ int profile_gemm(int argc, char* argv[])
do_verification,
init_method,
do_log,
- nrepeat,
+ time_kernel,
M,
N,
K,
@@ -334,7 +334,7 @@ int profile_gemm(int argc, char* argv[])
do_verification,
init_method,
do_log,
- nrepeat,
+ time_kernel,
M,
N,
K,
@@ -354,7 +354,7 @@ int profile_gemm(int argc, char* argv[])
do_verification,
init_method,
do_log,
- nrepeat,
+ time_kernel,
M,
N,
K,
@@ -374,7 +374,7 @@ int profile_gemm(int argc, char* argv[])
do_verification,
init_method,
do_log,
- nrepeat,
+ time_kernel,
M,
N,
K,
diff --git a/profiler/src/profile_gemm_bias_2d.cpp b/profiler/src/profile_gemm_bias_2d.cpp
index dd7e418087..46d4f90c17 100644
--- a/profiler/src/profile_gemm_bias_2d.cpp
+++ b/profiler/src/profile_gemm_bias_2d.cpp
@@ -36,8 +36,8 @@ int profile_gemm_bias_2d(int argc, char* argv[])
printf(" 3: A[k, m] * B[n, k] = C[m, n])\n");
printf("arg4: verification (0: no; 1: yes)\n");
printf("arg5: initialization (0: no init; 1: integer value; 2: decimal value)\n");
- printf("arg8: print tensor value (0: no; 1: yes)\n");
- printf("arg7: run kernel # of times (>1)\n");
+ printf("arg6: print tensor value (0: no; 1: yes)\n");
+ printf("arg7: time kernel (0=n0, 1=yes)\n");
printf("arg8 to 13: M, N, K, StrideA, StrideB, StrideC\n");
printf("arg14: alpha\n");
printf("arg15: beta\n");
@@ -50,7 +50,7 @@ int profile_gemm_bias_2d(int argc, char* argv[])
const bool do_verification = std::stoi(argv[4]);
const int init_method = std::stoi(argv[5]);
const bool do_log = std::stoi(argv[6]);
- const int nrepeat = std::stoi(argv[7]);
+ const bool time_kernel = std::stoi(argv[7]);
const int M = std::stoi(argv[8]);
const int N = std::stoi(argv[9]);
@@ -76,7 +76,7 @@ int profile_gemm_bias_2d(int argc, char* argv[])
do_verification,
init_method,
do_log,
- nrepeat,
+ time_kernel,
M,
N,
K,
@@ -99,7 +99,7 @@ int profile_gemm_bias_2d(int argc, char* argv[])
do_verification,
init_method,
do_log,
- nrepeat,
+ time_kernel,
M,
N,
K,
@@ -122,7 +122,7 @@ int profile_gemm_bias_2d(int argc, char* argv[])
do_verification,
init_method,
do_log,
- nrepeat,
+ time_kernel,
M,
N,
K,
@@ -145,7 +145,7 @@ int profile_gemm_bias_2d(int argc, char* argv[])
do_verification,
init_method,
do_log,
- nrepeat,
+ time_kernel,
M,
N,
K,
@@ -168,7 +168,7 @@ int profile_gemm_bias_2d(int argc, char* argv[])
do_verification,
init_method,
do_log,
- nrepeat,
+ time_kernel,
M,
N,
K,
@@ -191,7 +191,7 @@ int profile_gemm_bias_2d(int argc, char* argv[])
do_verification,
init_method,
do_log,
- nrepeat,
+ time_kernel,
M,
N,
K,
@@ -214,7 +214,7 @@ int profile_gemm_bias_2d(int argc, char* argv[])
do_verification,
init_method,
do_log,
- nrepeat,
+ time_kernel,
M,
N,
K,
@@ -237,7 +237,7 @@ int profile_gemm_bias_2d(int argc, char* argv[])
do_verification,
init_method,
do_log,
- nrepeat,
+ time_kernel,
M,
N,
K,
diff --git a/profiler/src/profile_gemm_bias_relu.cpp b/profiler/src/profile_gemm_bias_relu.cpp
index 67a47cf9ec..4346650c9f 100644
--- a/profiler/src/profile_gemm_bias_relu.cpp
+++ b/profiler/src/profile_gemm_bias_relu.cpp
@@ -36,8 +36,8 @@ int profile_gemm_bias_relu(int argc, char* argv[])
printf(" 3: A[k, m] * B[n, k] = C[m, n])\n");
printf("arg4: verification (0: no; 1: yes)\n");
printf("arg5: initialization (0: no init; 1: integer value; 2: decimal value)\n");
- printf("arg8: print tensor value (0: no; 1: yes)\n");
- printf("arg7: run kernel # of times (>1)\n");
+ printf("arg6: print tensor value (0: no; 1: yes)\n");
+ printf("arg7: time kernel (0=n0, 1=yes)\n");
printf("arg8 to 13: M, N, K, StrideA, StrideB, StrideC\n");
printf("arg14: split k into mulitiple batch\n");
exit(1);
@@ -48,7 +48,7 @@ int profile_gemm_bias_relu(int argc, char* argv[])
const bool do_verification = std::stoi(argv[4]);
const int init_method = std::stoi(argv[5]);
const bool do_log = std::stoi(argv[6]);
- const int nrepeat = std::stoi(argv[7]);
+ const bool time_kernel = std::stoi(argv[7]);
const int M = std::stoi(argv[8]);
const int N = std::stoi(argv[9]);
@@ -69,7 +69,7 @@ int profile_gemm_bias_relu(int argc, char* argv[])
do_verification,
init_method,
do_log,
- nrepeat,
+ time_kernel,
M,
N,
K,
@@ -88,7 +88,7 @@ int profile_gemm_bias_relu(int argc, char* argv[])
do_verification,
init_method,
do_log,
- nrepeat,
+ time_kernel,
M,
N,
K,
@@ -107,7 +107,7 @@ int profile_gemm_bias_relu(int argc, char* argv[])
do_verification,
init_method,
do_log,
- nrepeat,
+ time_kernel,
M,
N,
K,
@@ -126,7 +126,7 @@ int profile_gemm_bias_relu(int argc, char* argv[])
do_verification,
init_method,
do_log,
- nrepeat,
+ time_kernel,
M,
N,
K,
diff --git a/profiler/src/profile_gemm_bias_relu_add.cpp b/profiler/src/profile_gemm_bias_relu_add.cpp
index 52406e93d6..186f32cf6f 100644
--- a/profiler/src/profile_gemm_bias_relu_add.cpp
+++ b/profiler/src/profile_gemm_bias_relu_add.cpp
@@ -36,8 +36,8 @@ int profile_gemm_bias_relu_add(int argc, char* argv[])
printf(" 3: A[k, m] * B[n, k] = C[m, n])\n");
printf("arg4: verification (0: no; 1: yes)\n");
printf("arg5: initialization (0: no init; 1: integer value; 2: decimal value)\n");
- printf("arg8: print tensor value (0: no; 1: yes)\n");
- printf("arg7: run kernel # of times (>1)\n");
+ printf("arg6: print tensor value (0: no; 1: yes)\n");
+ printf("arg7: time kernel (0=n0, 1=yes)\n");
printf("arg8 to 14: M, N, K, StrideA, StrideB, StrideC, StrideC1\n");
printf("arg15: split k into mulitiple batch\n");
exit(1);
@@ -48,7 +48,7 @@ int profile_gemm_bias_relu_add(int argc, char* argv[])
const bool do_verification = std::stoi(argv[4]);
const int init_method = std::stoi(argv[5]);
const bool do_log = std::stoi(argv[6]);
- const int nrepeat = std::stoi(argv[7]);
+ const bool time_kernel = std::stoi(argv[7]);
const int M = std::stoi(argv[8]);
const int N = std::stoi(argv[9]);
@@ -70,7 +70,7 @@ int profile_gemm_bias_relu_add(int argc, char* argv[])
do_verification,
init_method,
do_log,
- nrepeat,
+ time_kernel,
M,
N,
K,
@@ -90,7 +90,7 @@ int profile_gemm_bias_relu_add(int argc, char* argv[])
do_verification,
init_method,
do_log,
- nrepeat,
+ time_kernel,
M,
N,
K,
@@ -110,7 +110,7 @@ int profile_gemm_bias_relu_add(int argc, char* argv[])
do_verification,
init_method,
do_log,
- nrepeat,
+ time_kernel,
M,
N,
K,
@@ -130,7 +130,7 @@ int profile_gemm_bias_relu_add(int argc, char* argv[])
do_verification,
init_method,
do_log,
- nrepeat,
+ time_kernel,
M,
N,
K,
diff --git a/profiler/src/profile_gemm_reduce.cpp b/profiler/src/profile_gemm_reduce.cpp
index a83d4ce9a1..986acaf010 100644
--- a/profiler/src/profile_gemm_reduce.cpp
+++ b/profiler/src/profile_gemm_reduce.cpp
@@ -32,8 +32,8 @@ int profile_gemm_reduce(int argc, char* argv[])
printf(" 3: A[k, m] * B[n, k] = C[m, n])\n");
printf("arg4: verification (0: no; 1: yes)\n");
printf("arg5: initialization (0: no init; 1: integer value; 2: decimal value)\n");
- printf("arg8: print tensor value (0: no; 1: yes)\n");
- printf("arg7: run kernel # of times (>1)\n");
+ printf("arg6: print tensor value (0: no; 1: yes)\n");
+ printf("arg7: time kernel (0=n0, 1=yes)\n");
printf("arg8 to 13: M, N, K, StrideA, StrideB, StrideC\n");
printf("arg14: split k into mulitiple batch\n");
exit(1);
@@ -44,7 +44,7 @@ int profile_gemm_reduce(int argc, char* argv[])
const bool do_verification = std::stoi(argv[4]);
const int init_method = std::stoi(argv[5]);
const bool do_log = std::stoi(argv[6]);
- const int nrepeat = std::stoi(argv[7]);
+ const bool time_kernel = std::stoi(argv[7]);
const int M = std::stoi(argv[8]);
const int N = std::stoi(argv[9]);
@@ -66,7 +66,7 @@ int profile_gemm_reduce(int argc, char* argv[])
do_verification,
init_method,
do_log,
- nrepeat,
+ time_kernel,
M,
N,
K,
@@ -87,7 +87,7 @@ int profile_gemm_reduce(int argc, char* argv[])
do_verification,
init_method,
do_log,
- nrepeat,
+ time_kernel,
M,
N,
K,
@@ -108,7 +108,7 @@ int profile_gemm_reduce(int argc, char* argv[])
do_verification,
init_method,
do_log,
- nrepeat,
+ time_kernel,
M,
N,
K,
@@ -129,7 +129,7 @@ int profile_gemm_reduce(int argc, char* argv[])
do_verification,
init_method,
do_log,
- nrepeat,
+ time_kernel,
M,
N,
K,
diff --git a/profiler/src/profile_grouped_gemm.cpp b/profiler/src/profile_grouped_gemm.cpp
index 88a2a8f855..d35484cfae 100644
--- a/profiler/src/profile_grouped_gemm.cpp
+++ b/profiler/src/profile_grouped_gemm.cpp
@@ -54,8 +54,8 @@ int profile_grouped_gemm(int argc, char* argv[])
printf(" 3: A[k, m] * B[n, k] = C[m, n])\n");
printf("arg4: verification (0: no; 1: yes)\n");
printf("arg5: initialization (0: no init; 1: integer value; 2: decimal value)\n");
- printf("arg8: print tensor value (0: no; 1: yes)\n");
- printf("arg7: run kernel # of times (>1)\n");
+ printf("arg6: print tensor value (0: no; 1: yes)\n");
+ printf("arg7: time kernel (0=n0, 1=yes)\n");
printf("arg8 to 13: Ms, Ns, Ks, StrideAs, StrideBs, StrideCs (e.g., 256,256 128,128 64,64 "
"64,64 64,64 128,128)\n");
exit(1);
@@ -66,7 +66,7 @@ int profile_grouped_gemm(int argc, char* argv[])
const bool do_verification = std::stoi(argv[4]);
const int init_method = std::stoi(argv[5]);
const bool do_log = std::stoi(argv[6]);
- const int nrepeat = std::stoi(argv[7]);
+ const bool time_kernel = std::stoi(argv[7]);
const auto Ms = argToIntArray(argv[8]);
const auto Ns = argToIntArray(argv[9]);
@@ -86,7 +86,7 @@ int profile_grouped_gemm(int argc, char* argv[])
ck::tensor_layout::gemm::RowMajor>(do_verification,
init_method,
do_log,
- nrepeat,
+ time_kernel,
Ms,
Ns,
Ks,
@@ -104,7 +104,7 @@ int profile_grouped_gemm(int argc, char* argv[])
ck::tensor_layout::gemm::RowMajor>(do_verification,
init_method,
do_log,
- nrepeat,
+ time_kernel,
Ms,
Ns,
Ks,
@@ -122,7 +122,7 @@ int profile_grouped_gemm(int argc, char* argv[])
ck::tensor_layout::gemm::RowMajor>(do_verification,
init_method,
do_log,
- nrepeat,
+ time_kernel,
Ms,
Ns,
Ks,
@@ -140,7 +140,7 @@ int profile_grouped_gemm(int argc, char* argv[])
ck::tensor_layout::gemm::RowMajor>(do_verification,
init_method,
do_log,
- nrepeat,
+ time_kernel,
Ms,
Ns,
Ks,
diff --git a/profiler/src/profile_reduce.cpp b/profiler/src/profile_reduce.cpp
index 96fa78964a..5e91a1d2d1 100644
--- a/profiler/src/profile_reduce.cpp
+++ b/profiler/src/profile_reduce.cpp
@@ -144,7 +144,7 @@ class AppArgs
bool do_dumpout = false;
int init_method;
- int nrepeat;
+ bool time_kernel;
bool need_indices = false;
@@ -295,7 +295,7 @@ class AppArgs
throw std::runtime_error("Invalid cmd-line arguments, more argumetns are needed!");
init_method = std::atoi(argv[optind++]);
- nrepeat = std::atoi(argv[optind]);
+ time_kernel = std::atoi(argv[optind]);
if(scales.empty())
{
@@ -354,7 +354,7 @@ int profile_reduce(int argc, char* argv[])
args.init_method,
args.do_log,
args.do_dumpout,
- args.nrepeat,
+ args.time_kernel,
args.inLengths,
args.reduceDims,
args.reduceOp,
@@ -369,7 +369,7 @@ int profile_reduce(int argc, char* argv[])
args.init_method,
args.do_log,
args.do_dumpout,
- args.nrepeat,
+ args.time_kernel,
args.inLengths,
args.reduceDims,
args.reduceOp,
@@ -387,7 +387,7 @@ int profile_reduce(int argc, char* argv[])
args.init_method,
args.do_log,
args.do_dumpout,
- args.nrepeat,
+ args.time_kernel,
args.inLengths,
args.reduceDims,
args.reduceOp,
@@ -414,7 +414,7 @@ int profile_reduce(int argc, char* argv[])
args.init_method,
args.do_log,
args.do_dumpout,
- args.nrepeat,
+ args.time_kernel,
args.inLengths,
args.reduceDims,
args.reduceOp,
@@ -429,7 +429,7 @@ int profile_reduce(int argc, char* argv[])
args.init_method,
args.do_log,
args.do_dumpout,
- args.nrepeat,
+ args.time_kernel,
args.inLengths,
args.reduceDims,
args.reduceOp,
@@ -454,7 +454,7 @@ int profile_reduce(int argc, char* argv[])
args.init_method,
args.do_log,
args.do_dumpout,
- args.nrepeat,
+ args.time_kernel,
args.inLengths,
args.reduceDims,
args.reduceOp,
@@ -471,7 +471,7 @@ int profile_reduce(int argc, char* argv[])
args.init_method,
args.do_log,
args.do_dumpout,
- args.nrepeat,
+ args.time_kernel,
args.inLengths,
args.reduceDims,
args.reduceOp,
@@ -486,7 +486,7 @@ int profile_reduce(int argc, char* argv[])
args.init_method,
args.do_log,
args.do_dumpout,
- args.nrepeat,
+ args.time_kernel,
args.inLengths,
args.reduceDims,
args.reduceOp,
diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt
index 8a9db2adbd..c696069393 100644
--- a/test/CMakeLists.txt
+++ b/test/CMakeLists.txt
@@ -22,6 +22,8 @@ include_directories(BEFORE
${PROJECT_SOURCE_DIR}/external/include/half
)
+include(googletest)
+
add_custom_target(check COMMAND ${CMAKE_CTEST_COMMAND} --output-on-failure -C ${CMAKE_CFG_INTDIR})
add_custom_target(tests)
@@ -61,4 +63,5 @@ add_subdirectory(grouped_gemm)
add_subdirectory(convnd_fwd)
add_subdirectory(reduce)
add_subdirectory(conv2d_bwd_weight)
-add_subdirectory(convnd_bwd_data)
\ No newline at end of file
+add_subdirectory(convnd_bwd_data)
+# DONOT add client_app, that is tested via CI independently
\ No newline at end of file
diff --git a/test/batched_gemm_reduce/batched_gemm_reduce_fp16.cpp b/test/batched_gemm_reduce/batched_gemm_reduce_fp16.cpp
index ce061c644b..7b311cff17 100644
--- a/test/batched_gemm_reduce/batched_gemm_reduce_fp16.cpp
+++ b/test/batched_gemm_reduce/batched_gemm_reduce_fp16.cpp
@@ -22,7 +22,7 @@ int main()
Row,
Row,
Row>(
- true, 1, false, 1, M, N, K, K, N, N, BatchCount);
+ true, 1, false, false, M, N, K, K, N, N, BatchCount);
pass = pass && ck::profiler::profile_batched_gemm_reduce_impl(
- true, 1, false, 1, M, N, K, K, K, N, BatchCount);
+ true, 1, false, false, M, N, K, K, K, N, BatchCount);
pass = pass && ck::profiler::profile_batched_gemm_reduce_impl(
- true, 1, false, 1, M, N, K, M, N, N, BatchCount);
+ true, 1, false, false, M, N, K, M, N, N, BatchCount);
pass = pass && ck::profiler::profile_batched_gemm_reduce_impl(
- true, 1, false, 1, M, N, K, M, K, N, BatchCount);
+ true, 1, false, false, M, N, K, M, K, N, BatchCount);
if(pass)
{
diff --git a/test/client_app/CMakeLists.txt b/test/client_app/CMakeLists.txt
new file mode 100644
index 0000000000..f8dd8c4e0a
--- /dev/null
+++ b/test/client_app/CMakeLists.txt
@@ -0,0 +1,11 @@
+cmake_minimum_required(VERSION 3.15)
+project(ck_app)
+add_compile_options(-std=c++14)
+
+find_package(composable_kernel 1.0.0 COMPONENTS device_operations host_tensor)
+find_package(hip REQUIRED PATHS /opt/rocm)
+message(STATUS "Build with HIP ${hip_VERSION}")
+
+add_executable(test_client_app client_app.cpp)
+
+target_link_libraries(test_client_app PRIVATE composable_kernel::device_operations composable_kernel::host_tensor hip::host)
diff --git a/test/client_app/client_app.cpp b/test/client_app/client_app.cpp
new file mode 100644
index 0000000000..665a103f70
--- /dev/null
+++ b/test/client_app/client_app.cpp
@@ -0,0 +1,77 @@
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include "client_app_impl.hpp"
+
+int main(int argc, char* argv[])
+{
+ if(argc != 25)
+ {
+ printf("arg1: tensor operation (conv_fwd: ForwardConvolution)\n");
+ printf("arg2: data type (0: fp32; 1: fp16)\n");
+ printf("arg3: input tensor layout (0: NCHW; 1: NHWC)\n");
+ printf("arg4: weight tensor layout (0: KCYX; 1: KYXC)\n");
+ printf("arg5: output tensor layout (0: NKHW; 1: NHWK)\n");
+ printf("arg6: verification (0: no; 1: yes)\n");
+ printf("arg7: initialization (0: no init; 1: integer value; 2: decimal value)\n");
+ printf("arg8: print tensor value (0: no; 1: yes)\n");
+ printf("arg9: time kernel (0=n0, 1=yes)\n");
+ printf("arg10 to 24: N, K, C, Y, X, Hi, Wi, Sy, Sx, Dy, Dx, LeftPy, LeftPx, RightPy, "
+ "RightPx\n");
+ exit(1);
+ }
+
+ const ConvDataType data_type = static_cast(std::stoi(argv[2]));
+ const int in_layout = static_cast(std::stoi(argv[3]));
+ const int wei_layout = static_cast(std::stoi(argv[4]));
+ const int out_layout = static_cast(std::stoi(argv[5]));
+ const bool do_verification = std::stoi(argv[6]);
+ const int init_method = std::stoi(argv[7]);
+ const bool do_log = std::stoi(argv[8]);
+ const bool time_kernel = std::stoi(argv[9]);
+
+ const ck::index_t N = std::stoi(argv[10]);
+ const ck::index_t K = std::stoi(argv[11]);
+ const ck::index_t C = std::stoi(argv[12]);
+ const ck::index_t Y = std::stoi(argv[13]);
+ const ck::index_t X = std::stoi(argv[14]);
+ const ck::index_t Hi = std::stoi(argv[15]);
+ const ck::index_t Wi = std::stoi(argv[16]);
+
+ const ck::index_t conv_stride_h = std::stoi(argv[17]);
+ const ck::index_t conv_stride_w = std::stoi(argv[18]);
+ const ck::index_t conv_dilation_h = std::stoi(argv[19]);
+ const ck::index_t conv_dilation_w = std::stoi(argv[20]);
+ const ck::index_t in_left_pad_h = std::stoi(argv[21]);
+ const ck::index_t in_left_pad_w = std::stoi(argv[22]);
+ const ck::index_t in_right_pad_h = std::stoi(argv[23]);
+ const ck::index_t in_right_pad_w = std::stoi(argv[24]);
+
+ const ck::index_t YEff = (Y - 1) * conv_dilation_h + 1;
+ const ck::index_t XEff = (X - 1) * conv_dilation_w + 1;
+
+ const ck::index_t Ho = (Hi + in_left_pad_h + in_right_pad_h - YEff) / conv_stride_h + 1;
+ const ck::index_t Wo = (Wi + in_left_pad_w + in_right_pad_w - XEff) / conv_stride_w + 1;
+
+ ck::app::profile_conv_fwd_impl(do_verification,
+ init_method,
+ do_log,
+ time_kernel,
+ data_type,
+ N,
+ K,
+ C,
+ std::vector{Hi, Wi},
+ std::vector